diff --git a/BEVfusion_fix/fix_weight.py b/BEVfusion_fix/fix_weight.py
new file mode 100644
index 0000000000000000000000000000000000000000..3006a51e9fb25e6535a17e4c074df13e312261bd
--- /dev/null
+++ b/BEVfusion_fix/fix_weight.py
@@ -0,0 +1,18 @@
+import torch
+
+# 指向你刚刚发给我的纯 LiDAR 官方权重
+ckpt_path = 'pth/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933.pth'
+ckpt = torch.load(ckpt_path, map_location='cpu')
+state_dict = ckpt['state_dict']
+
+fixed_count = 0
+for key in list(state_dict.keys()):
+    # 修复 3D 稀疏卷积维度 (16,3,3,3,16) -> (3,3,3,16,16)
+    if 'pts_middle_encoder' in key and state_dict[key].dim() == 5:
+        state_dict[key] = state_dict[key].permute(1, 2, 3, 4, 0).contiguous()
+        fixed_count += 1
+
+ckpt['state_dict'] = state_dict
+fixed_path = ckpt_path.replace('.pth', '_fixed.pth')
+torch.save(ckpt, fixed_path)
+print(f'✅ 纯 LiDAR 权重修复完成！已保存至 {fixed_path}，共处理 {fixed_count} 个层。')
\ No newline at end of file
diff --git a/BEVfusion_fix/mmcv-2.2.0+das.opt1.dtk2604.torch251-cp310-cp310-manylinux_2_28_x86_64.whl b/BEVfusion_fix/mmcv-2.2.0+das.opt1.dtk2604.torch251-cp310-cp310-manylinux_2_28_x86_64.whl
new file mode 100644
index 0000000000000000000000000000000000000000..fa0f01f02e20d2887c7389033c74e6b2bf508318
Binary files /dev/null and b/BEVfusion_fix/mmcv-2.2.0+das.opt1.dtk2604.torch251-cp310-cp310-manylinux_2_28_x86_64.whl differ
diff --git "a/BEVfusion_fix/pip_\350\277\207\347\250\213.txt" "b/BEVfusion_fix/pip_\350\277\207\347\250\213.txt"
new file mode 100644
index 0000000000000000000000000000000000000000..a19447e586bf6a6592f819d0821aa366ef13db14
--- /dev/null
+++ "b/BEVfusion_fix/pip_\350\277\207\347\250\213.txt"
@@ -0,0 +1,17 @@
+pip  uninstall  megatron-core
+pip uninstall  vllm
+pip uninstall  mmcv-full
+pip install OpenMPI 
+pip install mpi4py   Pillow  mmengine
+pip install mmdet
+pip install nuscenes-devkit
+
+pip install opencv-python==4.9.0.80
+pip install mmcv-2.1.0+das.opt1.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl
+
+pip install -v -e . --no-build-isolation
+pip install -e . --no-build-isolation
+
+  Created wheel for mmdet3d: filename=mmdet3d-0.0.0-0.editable-cp310-cp310-linux_x86_64.whl size=6984 sha256=97be2853385856a1d76c007755805580a7380dbf324a1d24acb5b81e453c8732
+
+  python3 setup.py build_ext -v --inplace
\ No newline at end of file
diff --git a/BEVfusion_fix/setup.cfg b/BEVfusion_fix/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..1ad205d5e44f8ab724f97ed1966930d2a8fcd19f
--- /dev/null
+++ b/BEVfusion_fix/setup.cfg
@@ -0,0 +1,19 @@
+[yapf]
+BASED_ON_STYLE = pep8
+BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
+SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
+
+[isort]
+line_length = 79
+multi_line_output = 0
+extra_standard_library = setuptools
+known_first_party = mmdet3d
+known_third_party = cv2,imageio,indoor3d_util,load_scannet_data,lyft_dataset_sdk,m2r,matplotlib,mmcv,mmdet,mmengine,nuimages,numba,numpy,nuscenes,pandas,plyfile,pycocotools,pyquaternion,pytest,pytorch_sphinx_theme,recommonmark,requests,scannet_utils,scipy,seaborn,shapely,skimage,sphinx,tensorflow,terminaltables,torch,trimesh,ts,waymo_open_dataset
+no_lines_before = STDLIB,LOCALFOLDER
+default_section = THIRDPARTY
+
+[codespell]
+ignore-words-list = ans,refridgerator,crate,hist,formating,dout,wan,nd,fo,avod,AVOD,warmup
+
+[flake8]
+per-file-ignores = mmdet3d/configs/*:F401,F403,F405
diff --git a/BEVfusion_fix/setup.py b/BEVfusion_fix/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..86ad8a266d05288fda0e97cd3ed9c22580f0fba5
--- /dev/null
+++ b/BEVfusion_fix/setup.py
@@ -0,0 +1,227 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import platform
+import shutil
+import sys
+import warnings
+from os import path as osp
+from setuptools import find_packages, setup
+
+import torch
+from torch.utils.cpp_extension import (BuildExtension, CppExtension,
+                                       CUDAExtension)
+
+
+def readme():
+    with open('README.md', encoding='utf-8') as f:
+        content = f.read()
+    return content
+
+
+version_file = 'mmdet3d/version.py'
+
+
+def get_version():
+    with open(version_file, 'r') as f:
+        exec(compile(f.read(), version_file, 'exec'))
+    return locals()['__version__']
+
+
+def make_cuda_ext(name,
+                  module,
+                  sources,
+                  sources_cuda=[],
+                  extra_args=[],
+                  extra_include_path=[]):
+
+    define_macros = []
+    extra_compile_args = {'cxx': [] + extra_args}
+
+    if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
+        define_macros += [('WITH_CUDA', None)]
+        extension = CUDAExtension
+        extra_compile_args['nvcc'] = extra_args + [
+            '-D__CUDA_NO_HALF_OPERATORS__',
+            '-D__CUDA_NO_HALF_CONVERSIONS__',
+            '-D__CUDA_NO_HALF2_OPERATORS__',
+        ]
+        sources += sources_cuda
+    else:
+        print('Compiling {} without CUDA'.format(name))
+        extension = CppExtension
+        # raise EnvironmentError('CUDA is required to compile MMDetection!')
+
+    return extension(
+        name='{}.{}'.format(module, name),
+        sources=[os.path.join(*module.split('.'), p) for p in sources],
+        include_dirs=extra_include_path,
+        define_macros=define_macros,
+        extra_compile_args=extra_compile_args)
+
+
+def parse_requirements(fname='requirements.txt', with_version=True):
+    """Parse the package dependencies listed in a requirements file but strips
+    specific versioning information.
+
+    Args:
+        fname (str): path to requirements file
+        with_version (bool, default=False): if True include version specs
+
+    Returns:
+        list[str]: list of requirements items
+
+    CommandLine:
+        python -c "import setup; print(setup.parse_requirements())"
+    """
+    import re
+    import sys
+    from os.path import exists
+    require_fpath = fname
+
+    def parse_line(line):
+        """Parse information from a line in a requirements text file."""
+        if line.startswith('-r '):
+            # Allow specifying requirements in other files
+            target = line.split(' ')[1]
+            for info in parse_require_file(target):
+                yield info
+        else:
+            info = {'line': line}
+            if line.startswith('-e '):
+                info['package'] = line.split('#egg=')[1]
+            else:
+                # Remove versioning from the package
+                pat = '(' + '|'.join(['>=', '==', '>']) + ')'
+                parts = re.split(pat, line, maxsplit=1)
+                parts = [p.strip() for p in parts]
+
+                info['package'] = parts[0]
+                if len(parts) > 1:
+                    op, rest = parts[1:]
+                    if ';' in rest:
+                        # Handle platform specific dependencies
+                        # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies
+                        version, platform_deps = map(str.strip,
+                                                     rest.split(';'))
+                        info['platform_deps'] = platform_deps
+                    else:
+                        version = rest  # NOQA
+                    info['version'] = (op, version)
+            yield info
+
+    def parse_require_file(fpath):
+        with open(fpath, 'r') as f:
+            for line in f.readlines():
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    for info in parse_line(line):
+                        yield info
+
+    def gen_packages_items():
+        if exists(require_fpath):
+            for info in parse_require_file(require_fpath):
+                parts = [info['package']]
+                if with_version and 'version' in info:
+                    parts.extend(info['version'])
+                if not sys.version.startswith('3.4'):
+                    # apparently package_deps are broken in 3.4
+                    platform_deps = info.get('platform_deps')
+                    if platform_deps is not None:
+                        parts.append(';' + platform_deps)
+                item = ''.join(parts)
+                yield item
+
+    packages = list(gen_packages_items())
+    return packages
+
+
+def add_mim_extention():
+    """Add extra files that are required to support MIM into the package.
+
+    These files will be added by creating a symlink to the originals if the
+    package is installed in `editable` mode (e.g. pip install -e .), or by
+    copying from the originals otherwise.
+    """
+
+    # parse installment mode
+    if 'develop' in sys.argv:
+        # installed by `pip install -e .`
+        if platform.system() == 'Windows':
+            # set `copy` mode here since symlink fails on Windows.
+            mode = 'copy'
+        else:
+            mode = 'symlink'
+    elif 'sdist' in sys.argv or 'bdist_wheel' in sys.argv:
+        # installed by `pip install .`
+        # or create source distribution by `python setup.py sdist`
+        mode = 'copy'
+    else:
+        return
+
+    filenames = [
+        'tools', 'configs', 'demo', 'model-index.yml', 'dataset-index.yml'
+    ]
+    repo_path = osp.dirname(__file__)
+    mim_path = osp.join(repo_path, 'mmdet3d', '.mim')
+    os.makedirs(mim_path, exist_ok=True)
+
+    for filename in filenames:
+        if osp.exists(filename):
+            src_path = osp.join(repo_path, filename)
+            tar_path = osp.join(mim_path, filename)
+
+            if osp.isfile(tar_path) or osp.islink(tar_path):
+                os.remove(tar_path)
+            elif osp.isdir(tar_path):
+                shutil.rmtree(tar_path)
+
+            if mode == 'symlink':
+                src_relpath = osp.relpath(src_path, osp.dirname(tar_path))
+                os.symlink(src_relpath, tar_path)
+            elif mode == 'copy':
+                if osp.isfile(src_path):
+                    shutil.copyfile(src_path, tar_path)
+                elif osp.isdir(src_path):
+                    shutil.copytree(src_path, tar_path)
+                else:
+                    warnings.warn(f'Cannot copy file {src_path}.')
+            else:
+                raise ValueError(f'Invalid mode {mode}')
+
+
+if __name__ == '__main__':
+    add_mim_extention()
+    setup(
+        name='mmdet3d',
+        version=get_version(),
+        description=("OpenMMLab's next-generation platform"
+                     'for general 3D object detection.'),
+        long_description=readme(),
+        long_description_content_type='text/markdown',
+        author='MMDetection3D Contributors',
+        author_email='zwwdev@gmail.com',
+        keywords='computer vision, 3D object detection',
+        url='https://github.com/open-mmlab/mmdetection3d',
+        packages=find_packages(exclude=('configs', 'tools', 'demo')),
+        include_package_data=True,
+        classifiers=[
+            'Development Status :: 5 - Production/Stable',
+            'License :: OSI Approved :: Apache Software License',
+            'Operating System :: OS Independent',
+            'Programming Language :: Python :: 3',
+            'Programming Language :: Python :: 3.7',
+            'Programming Language :: Python :: 3.8',
+            'Programming Language :: Python :: 3.9',
+        ],
+        license='Apache License 2.0',
+        install_requires=parse_requirements('requirements/runtime.txt'),
+        extras_require={
+            'all': parse_requirements('requirements.txt'),
+            'tests': parse_requirements('requirements/tests.txt'),
+            'build': parse_requirements('requirements/build.txt'),
+            'optional': parse_requirements('requirements/optional.txt'),
+            'mim': parse_requirements('requirements/mminstall.txt'),
+        },
+        ext_modules=[],
+        cmdclass={'build_ext': BuildExtension},
+        zip_safe=False)
diff --git a/BEVfusion_fix/testmodel.py b/BEVfusion_fix/testmodel.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f297392942fad84ea62e1a960851c29d1a4367
--- /dev/null
+++ b/BEVfusion_fix/testmodel.py
@@ -0,0 +1,32 @@
+import torch
+from mmengine.config import Config
+from mmdet3d.registry import MODELS
+
+# 1. 唤醒 MMDetection3D 的全家桶注册表 (修复报错的这行极其关键！)
+from mmdet3d.utils import register_all_modules
+register_all_modules(init_default_scope=True)
+
+# 2. 显式导入 BEVFusion 项目，触发自定义算子和模块的注册！
+import projects.BEVFusion.bevfusion
+
+print("🔍 正在解析 BEVFusion 配置文件...")
+# 使用官方提供的默认配置文件
+config_file = 'projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py'
+cfg = Config.fromfile(config_file)
+
+print("🧱 正在海光 DCU 上构建 BEVFusion 模型架构...")
+try:
+    # 实例化模型
+    model = MODELS.build(cfg.model)
+    
+    # 推入海光 GPU (DCU) 显存
+    model.cuda()
+    
+    # 打印一下网络参数量，确认实体存在
+    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"\n✅ 帅！模型在显存中构建成功！")
+    print(f"📊 模型总可训练参数量: {num_params / 1e6:.2f} M (百万)")
+    print("🚀 恭喜！高层 API 与配置文件解析完美通关！")
+    
+except Exception as e:
+    print(f"\n❌ 模型构建失败，报错信息如下:\n{e}")
\ No newline at end of file
diff --git "a/BEVfusion_fix/\345\221\275\344\273\244" "b/BEVfusion_fix/\345\221\275\344\273\244"
new file mode 100644
index 0000000000000000000000000000000000000000..4c791d6de1d577bdf394e104f884248c5dd19123
--- /dev/null
+++ "b/BEVfusion_fix/\345\221\275\344\273\244"
@@ -0,0 +1,11 @@
+python3 tools/test.py \
+    projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py \
+    pth/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-5239b1af_fixed.pth \
+    --cfg-options \
+    test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl test_dataloader.batch_size=4
+
+python3 tools/test.py \
+    projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py \
+    pth/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933_fixed.pth \
+    --cfg-options \
+    test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl
\ No newline at end of file
diff --git a/mmde.tar.gz b/mmde.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..d5e3a66c768fb4cad9469c888390a3a2cbd9bbe8
Binary files /dev/null and b/mmde.tar.gz differ
diff --git a/mmde/.circleci/config.yml b/mmde/.circleci/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..af5086c72c6c603085af6d697236693690039f7a
--- /dev/null
+++ b/mmde/.circleci/config.yml
@@ -0,0 +1,35 @@
+version: 2.1
+
+# this allows you to use CircleCI's dynamic configuration feature
+setup: true
+
+# the path-filtering orb is required to continue a pipeline based on
+# the path of an updated fileset
+orbs:
+  path-filtering: circleci/path-filtering@0.1.2
+
+workflows:
+  # the always-run workflow is always triggered, regardless of the pipeline parameters.
+  always-run:
+    jobs:
+      # the path-filtering/filter job determines which pipeline
+      # parameters to update.
+      - path-filtering/filter:
+          name: check-updated-files
+          # 3-column, whitespace-delimited mapping. One mapping per
+          # line:
+          # <regex path-to-test> <parameter-to-set> <value-of-pipeline-parameter>
+          mapping: |
+            mmdet3d/.* lint_only false
+            requirements/.* lint_only false
+            tests/.* lint_only false
+            tools/.* lint_only false
+            configs/.* lint_only false
+            .circleci/.* lint_only false
+            projects/.* lint_only false
+          base-revision: dev-1.x
+          # this is the path of the configuration we should trigger once
+          # path filtering and pipeline parameter value updates are
+          # complete. In this case, we are using the parent dynamic
+          # configuration itself.
+          config-path: .circleci/test.yml
diff --git a/mmde/.circleci/docker/Dockerfile b/mmde/.circleci/docker/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..cc4c705b4acd727c39aa4f0cf4a058124327590d
--- /dev/null
+++ b/mmde/.circleci/docker/Dockerfile
@@ -0,0 +1,13 @@
+ARG PYTORCH="1.8.1"
+ARG CUDA="10.2"
+ARG CUDNN="7"
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+# To fix GPG key error when running apt-get update
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx
diff --git a/mmde/.circleci/test.yml b/mmde/.circleci/test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..19a25a8610468507883161ddccd31f5789f75c1d
--- /dev/null
+++ b/mmde/.circleci/test.yml
@@ -0,0 +1,199 @@
+version: 2.1
+
+# the default pipeline parameters, which will be updated according to
+# the results of the path-filtering orb
+parameters:
+  lint_only:
+    type: boolean
+    default: true
+
+jobs:
+  lint:
+    docker:
+      - image: cimg/python:3.7.4
+    steps:
+      - checkout
+      - run:
+          name: Install pre-commit hook
+          command: |
+            pip install pre-commit
+            pre-commit install
+      - run:
+          name: Linting
+          command: pre-commit run --all-files
+      - run:
+          name: Check docstring coverage
+          command: |
+            pip install interrogate
+            interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-magic --ignore-regex "__repr__" --fail-under 90 mmdet3d
+
+  build_cpu:
+    parameters:
+      # The python version must match available image tags in
+      # https://circleci.com/developer/images/image/cimg/python
+      python:
+        type: string
+      torch:
+        type: string
+      torchvision:
+        type: string
+    docker:
+      - image: cimg/python:<< parameters.python >>
+    resource_class: large
+    steps:
+      - checkout
+      - run:
+          name: Install Libraries
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5
+      - run:
+          name: Configure Python & pip
+          command: |
+            pip install --upgrade pip
+            pip install wheel
+      - run:
+          name: Install PyTorch
+          command: pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html
+      - when:
+          condition:
+            equal: ["3.9.0", << parameters.python >>]
+          steps:
+            - run: pip install "protobuf <= 3.20.1" && sudo apt-get update && sudo apt-get -y install libprotobuf-dev protobuf-compiler cmake
+      - run:
+          name: Install mmdet3d dependencies
+          command: |
+            pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main
+            pip install -U openmim
+            mim install 'mmcv >= 2.0.0rc4'
+            pip install git+ssh://git@github.com/open-mmlab/mmdetection.git@dev-3.x
+            pip install -r requirements/tests.txt
+      - run:
+          name: Build and install
+          command: |
+            pip install -e .
+      - run:
+          name: Run unittests
+          command: |
+            coverage run --branch --source mmdet3d -m pytest tests/
+            coverage xml
+            coverage report -m
+
+  build_cuda:
+    parameters:
+      torch:
+        type: string
+      cuda:
+        type: enum
+        enum: ["10.2", "11.7"]
+      cudnn:
+        type: integer
+        default: 8
+    machine:
+      image: linux-cuda-11:default
+      # docker_layer_caching: true
+    resource_class: gpu.nvidia.small.multi
+    steps:
+      - checkout
+      - run:
+          name: Install nvidia-container-toolkit and Restart Docker
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y nvidia-container-toolkit
+            sudo systemctl restart docker
+      - run:
+          # Cloning repos in VM since Docker doesn't have access to the private key
+          name: Clone Repos
+          command: |
+            git clone -b main --depth 1 ssh://git@github.com/open-mmlab/mmengine.git /home/circleci/mmengine
+            git clone -b dev-3.x --depth 1 ssh://git@github.com/open-mmlab/mmdetection.git /home/circleci/mmdetection
+      - run:
+          name: Build Docker image
+          command: |
+            docker build .circleci/docker -t mmdet3d:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >>
+            docker run --gpus all -t -d -v /home/circleci/project:/mmdetection3d -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmdetection:/mmdetection -w /mmdetection3d --name mmdet3d mmdet3d:gpu
+            docker exec mmdet3d apt-get install -y git
+      - run:
+          name: Install mmdet3d dependencies
+          command: |
+            docker exec mmdet3d pip install -e /mmengine
+            docker exec mmdet3d pip install -U openmim
+            docker exec mmdet3d mim install 'mmcv >= 2.0.0rc4'
+            docker exec mmdet3d pip install -e /mmdetection
+            docker exec mmdet3d pip install -r requirements/tests.txt
+      - run:
+          name: Build and install
+          command: |
+            docker exec mmdet3d pip install -e .
+      - run:
+          name: Run unittests
+          command: |
+            docker exec mmdet3d pytest tests/
+
+workflows:
+  pr_stage_lint:
+    when: << pipeline.parameters.lint_only >>
+    jobs:
+      - lint:
+          name: lint
+          filters:
+            branches:
+              ignore:
+                - dev-1.x
+  pr_stage_test:
+    when:
+      not: << pipeline.parameters.lint_only >>
+    jobs:
+      - lint:
+          name: lint
+          filters:
+            branches:
+              ignore:
+                - dev-1.x
+      - build_cpu:
+          name: minimum_version_cpu
+          torch: 1.8.1
+          torchvision: 0.9.1
+          python: 3.7.4 # The lowest python 3.7.x version available on CircleCI images
+          requires:
+            - lint
+      - build_cpu:
+          name: maximum_version_cpu
+          torch: 2.0.0
+          torchvision: 0.15.1
+          python: 3.9.0
+          requires:
+            - minimum_version_cpu
+      - hold:
+          type: approval
+          requires:
+            - maximum_version_cpu
+      - build_cuda:
+          name: mainstream_version_gpu
+          torch: 1.8.1
+          # Use double quotation mark to explicitly specify its type
+          # as string instead of number
+          cuda: "10.2"
+          cudnn: 7
+          requires:
+            - hold
+      - build_cuda:
+          name: maximum_version_gpu
+          torch: 2.0.0
+          cuda: "11.7"
+          cudnn: 8
+          requires:
+            - hold
+  merge_stage_test:
+    when:
+      not: << pipeline.parameters.lint_only >>
+    jobs:
+      - build_cuda:
+          name: minimum_version_gpu
+          torch: 1.8.1
+          cuda: "10.2"
+          cudnn: 7
+          filters:
+            branches:
+              only:
+                - dev-1.x
diff --git a/mmde/.dev_scripts/benchmark_full_models.txt b/mmde/.dev_scripts/benchmark_full_models.txt
new file mode 100644
index 0000000000000000000000000000000000000000..80b7e2a9cf184d3d9c1e2904dfe45eabe40f2073
--- /dev/null
+++ b/mmde/.dev_scripts/benchmark_full_models.txt
@@ -0,0 +1,26 @@
+configs/3dssd/3dssd_4xb4_kitti-3d-car.py
+configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py
+configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py
+configs/fcaf3d/fcaf3d_2xb8_s3dis-3d-5class.py
+configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py
+configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
+configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
+configs/h3dnet/h3dnet_8xb3_scannet-seg.py
+configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
+configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
+configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py
+configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py
+configs/paconv/paconv_ssg_8xb8-cosine-150e_s3dis-seg.py
+configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py
+configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py
+configs/point_rcnn/point-rcnn_8xb2_kitti-3d-3class.py
+configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py
+configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py
+configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py
+configs/pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class.py
+configs/regnet/pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d.py
+configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py
+configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class.py
+configs/smoke/smoke_dla34_dlaneck_gn-all_4xb8-6x_kitti-mono3d.py
+configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_nus-3d.py
+configs/votenet/votenet_8xb8_scannet-3d.py
diff --git a/mmde/.dev_scripts/benchmark_options.py b/mmde/.dev_scripts/benchmark_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8cc338b06d9c987e9d3db4dfe207181511fa9ce
--- /dev/null
+++ b/mmde/.dev_scripts/benchmark_options.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+third_part_libs = [
+    'conda install openblas-devel -c anaconda',
+    "pip install -U git+https://github.com/NVIDIA/MinkowskiEngine -v --no-deps --install-option='--blas_include_dirs=/opt/conda/include' --install-option='--blas=openblas'"  # noqa
+]
+default_floating_range = 0.5
+model_floating_ranges = {
+    'configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py': # noqa
+    0.7
+}
diff --git a/mmde/.dev_scripts/benchmark_train_models.txt b/mmde/.dev_scripts/benchmark_train_models.txt
new file mode 100644
index 0000000000000000000000000000000000000000..89c30fd353811817c5a496c43baa67eaf2265c0c
--- /dev/null
+++ b/mmde/.dev_scripts/benchmark_train_models.txt
@@ -0,0 +1,13 @@
+configs/3dssd/3dssd_4xb4_kitti-3d-car.py
+configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py
+configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py
+configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py
+configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py
+configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py
+configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py
+configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py
+configs/pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class.py
+configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py
+configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class.py
+configs/smoke/smoke_dla34_dlaneck_gn-all_4xb8-6x_kitti-mono3d.py
+configs/votenet/votenet_8xb8_scannet-3d.py
diff --git a/mmde/.dev_scripts/covignore.cfg b/mmde/.dev_scripts/covignore.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..64e01e9d23fffde42f400e2adaf6060cbcc9b7a1
--- /dev/null
+++ b/mmde/.dev_scripts/covignore.cfg
@@ -0,0 +1,6 @@
+# Each line should be the relative path to the root directory
+# of this repo. Support regular expression as well.
+# For example:
+# .*/utils.py
+
+.*/__init__.py
diff --git a/mmde/.dev_scripts/diff_coverage_test.sh b/mmde/.dev_scripts/diff_coverage_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..52f4dd9ae071325163e3aa60b566be6a10dd4bd3
--- /dev/null
+++ b/mmde/.dev_scripts/diff_coverage_test.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+readarray -t IGNORED_FILES < $( dirname "$0" )/covignore.cfg
+
+
+REUSE_COVERAGE_REPORT=${REUSE_COVERAGE_REPORT:-0}
+REPO=${1:-"origin"}
+BRANCH=${2:-"refactor_dev"}
+
+git fetch $REPO $BRANCH
+
+PY_FILES=""
+for FILE_NAME in $(git diff --name-only ${REPO}/${BRANCH}); do
+    # Only test python files in mmdet3d/ existing in current branch, and not ignored in covignore.cfg
+    if [ ${FILE_NAME: -3} == ".py" ] && [ ${FILE_NAME:0:8} == "mmdet3d/" ] && [ -f "$FILE_NAME" ]; then
+        IGNORED=false
+        for IGNORED_FILE_NAME in "${IGNORED_FILES[@]}"; do
+            # Skip blank lines
+            if [ -z "$IGNORED_FILE_NAME" ]; then
+                continue
+            fi
+            if [ "${IGNORED_FILE_NAME::1}" != "#" ] && [[ "$FILE_NAME" =~ $IGNORED_FILE_NAME ]]; then
+                echo "Ignoring $FILE_NAME"
+                IGNORED=true
+                break
+            fi
+        done
+        if [ "$IGNORED" = false ]; then
+            PY_FILES="$PY_FILES $FILE_NAME"
+        fi
+    fi
+done
+
+# Only test the coverage when PY_FILES are not empty, otherwise they will test the entire project
+if [ ! -z "${PY_FILES}" ]
+then
+    if [ "$REUSE_COVERAGE_REPORT" == "0" ]; then
+        coverage run --branch --source mmdet3d -m pytest tests/
+    fi
+    coverage report --fail-under 80 -m $PY_FILES
+    interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-magic --ignore-regex "__repr__" --fail-under 95 $PY_FILES
+fi
diff --git a/mmde/.dev_scripts/gather_models.py b/mmde/.dev_scripts/gather_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..768a86c0cb91ec47123c3a953bae27068a98fffe
--- /dev/null
+++ b/mmde/.dev_scripts/gather_models.py
@@ -0,0 +1,229 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Script to gather benchmarked models and prepare them for upload.
+
+Usage:
+python gather_models.py ${root_path} ${out_dir}
+
+Example:
+python gather_models.py \
+work_dirs/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d \
+work_dirs/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d
+
+Note that before running the above command, rename the directory with the
+config name if you did not use the default directory name, create
+a corresponding directory 'pgd' under the above path and put the used config
+into it.
+"""
+
+import argparse
+import glob
+import json
+import shutil
+import subprocess
+from os import path as osp
+
+import mmengine
+import torch
+
+# build schedule look-up table to automatically find the final model
+SCHEDULES_LUT = {
+    '_1x_': 12,
+    '_2x_': 24,
+    '_20e_': 20,
+    '_3x_': 36,
+    '_4x_': 48,
+    '_24e_': 24,
+    '_6x_': 73,
+    '_50e_': 50,
+    '_80e_': 80,
+    '_100e_': 100,
+    '_150e_': 150,
+    '_200e_': 200,
+    '_250e_': 250,
+    '_400e_': 400
+}
+
+# TODO: add support for lyft dataset
+RESULTS_LUT = {
+    'coco': ['bbox_mAP', 'segm_mAP'],
+    'nus': ['pts_bbox_NuScenes/NDS', 'NDS'],
+    'kitti-3d-3class': ['KITTI/Overall_3D_moderate', 'Overall_3D_moderate'],
+    'kitti-3d-car': ['KITTI/Car_3D_moderate_strict', 'Car_3D_moderate_strict'],
+    'lyft': ['score'],
+    'scannet_seg': ['miou'],
+    's3dis_seg': ['miou'],
+    'scannet': ['mAP_0.50'],
+    'sunrgbd': ['mAP_0.50'],
+    'kitti-mono3d': [
+        'img_bbox/KITTI/Car_3D_AP40_moderate_strict',
+        'Car_3D_AP40_moderate_strict'
+    ],
+    'nus-mono3d': ['img_bbox_NuScenes/NDS', 'NDS']
+}
+
+
+def get_model_dataset(log_json_path):
+    for key in RESULTS_LUT:
+        if log_json_path.find(key) != -1:
+            return key
+
+
+def process_checkpoint(in_file, out_file):
+    checkpoint = torch.load(in_file, map_location='cpu')
+    # remove optimizer for smaller file size
+    if 'optimizer' in checkpoint:
+        del checkpoint['optimizer']
+    # if it is necessary to remove some sensitive data in checkpoint['meta'],
+    # add the code here.
+    torch.save(checkpoint, out_file)
+    sha = subprocess.check_output(['sha256sum', out_file]).decode()
+    final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', out_file, final_file])
+    return final_file
+
+
+def get_final_epoch(config):
+    if config.find('grid_rcnn') != -1 and config.find('2x') != -1:
+        # grid_rcnn 2x trains 25 epochs
+        return 25
+
+    for schedule_name, epoch_num in SCHEDULES_LUT.items():
+        if config.find(schedule_name) != -1:
+            return epoch_num
+
+
+def get_best_results(log_json_path):
+    dataset = get_model_dataset(log_json_path)
+    max_dict = dict()
+    max_memory = 0
+    with open(log_json_path, 'r') as f:
+        for line in f.readlines():
+            log_line = json.loads(line)
+            if 'mode' not in log_line.keys():
+                continue
+
+            # record memory and find best results & epochs
+            if log_line['mode'] == 'train' \
+                    and max_memory <= log_line['memory']:
+                max_memory = log_line['memory']
+
+            elif log_line['mode'] == 'val':
+                result_dict = {
+                    key: log_line[key]
+                    for key in RESULTS_LUT[dataset] if key in log_line
+                }
+                if len(max_dict) == 0:
+                    max_dict = result_dict
+                    max_dict['epoch'] = log_line['epoch']
+                elif all(
+                    [max_dict[key] <= result_dict[key]
+                     for key in result_dict]):
+                    max_dict.update(result_dict)
+                    max_dict['epoch'] = log_line['epoch']
+
+        max_dict['memory'] = max_memory
+        return max_dict
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Gather benchmarked models')
+    parser.add_argument(
+        'root',
+        type=str,
+        help='root path of benchmarked models to be gathered')
+    parser.add_argument(
+        'out', type=str, help='output path of gathered models to be stored')
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    models_root = args.root
+    models_out = args.out
+    mmengine.mkdir_or_exist(models_out)
+
+    # find all models in the root directory to be gathered
+    raw_configs = list(mmengine.scandir('./configs', '.py', recursive=True))
+
+    # filter configs that is not trained in the experiments dir
+    used_configs = []
+    for raw_config in raw_configs:
+        if osp.exists(osp.join(models_root, raw_config)):
+            used_configs.append(raw_config)
+    print(f'Find {len(used_configs)} models to be gathered')
+
+    # find final_ckpt and log file for trained each config
+    # and parse the best performance
+    model_infos = []
+    for used_config in used_configs:
+        # get logs
+        log_json_path = glob.glob(osp.join(models_root, '*.log.json'))[0]
+        log_txt_path = glob.glob(osp.join(models_root, '*.log'))[0]
+        model_performance = get_best_results(log_json_path)
+        final_epoch = model_performance['epoch']
+        final_model = 'epoch_{}.pth'.format(final_epoch)
+        model_path = osp.join(models_root, final_model)
+
+        # skip if the model is still training
+        if not osp.exists(model_path):
+            print(f'Expected {model_path} does not exist!')
+            continue
+
+        if model_performance is None:
+            print(f'Obtained no performance for model {used_config}')
+            continue
+
+        model_time = osp.split(log_txt_path)[-1].split('.')[0]
+        model_infos.append(
+            dict(
+                config=used_config,
+                results=model_performance,
+                epochs=final_epoch,
+                model_time=model_time,
+                log_json_path=osp.split(log_json_path)[-1]))
+
+    # publish model for each checkpoint
+    publish_model_infos = []
+    for model in model_infos:
+        model_publish_dir = osp.join(models_out, model['config'].rstrip('.py'))
+        mmengine.mkdir_or_exist(model_publish_dir)
+
+        model_name = model['config'].split('/')[-1].rstrip(
+            '.py') + '_' + model['model_time']
+        publish_model_path = osp.join(model_publish_dir, model_name)
+        trained_model_path = osp.join(models_root,
+                                      'epoch_{}.pth'.format(model['epochs']))
+
+        # convert model
+        final_model_path = process_checkpoint(trained_model_path,
+                                              publish_model_path)
+
+        # copy log
+        shutil.copy(
+            osp.join(models_root, model['log_json_path']),
+            osp.join(model_publish_dir, f'{model_name}.log.json'))
+        shutil.copy(
+            osp.join(models_root, model['log_json_path'].rstrip('.json')),
+            osp.join(model_publish_dir, f'{model_name}.log'))
+
+        # copy config to guarantee reproducibility
+        config_path = model['config']
+        config_path = osp.join(
+            'configs',
+            config_path) if 'configs' not in config_path else config_path
+        target_cconfig_path = osp.split(config_path)[-1]
+        shutil.copy(config_path,
+                    osp.join(model_publish_dir, target_cconfig_path))
+
+        model['model_path'] = final_model_path
+        publish_model_infos.append(model)
+
+    models = dict(models=publish_model_infos)
+    print(f'Totally gathered {len(publish_model_infos)} models')
+    mmengine.dump(models, osp.join(models_out, 'model_info.json'))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/.dev_scripts/gen_benchmark_script.py b/mmde/.dev_scripts/gen_benchmark_script.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ae1128029754e17288ae97208c357b0dfa9cd9b
--- /dev/null
+++ b/mmde/.dev_scripts/gen_benchmark_script.py
@@ -0,0 +1,193 @@
+import argparse
+import re
+from os import path as osp
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Generate benchmark training/testing scripts')
+    parser.add_argument(
+        '--input_file',
+        required=False,
+        type=str,
+        help='Input file containing the paths '
+        'of configs to be trained/tested.')
+    parser.add_argument(
+        '--output_file',
+        required=True,
+        type=str,
+        help='Output file containing the '
+        'commands to train/test selected models.')
+    parser.add_argument(
+        '--gpus_per_node',
+        type=int,
+        default=8,
+        help='GPUs per node config for slurm, '
+        'should be set according to your slurm environment')
+    parser.add_argument(
+        '--cpus_per_task',
+        type=int,
+        default=5,
+        help='CPUs per task config for slurm, '
+        'should be set according to your slurm environment')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=8,
+        help='Totally used num of GPUs config for slurm (in testing), '
+        'should be set according to your slurm environment')
+    parser.add_argument(
+        '--mode', type=str, default='train', help='Train or test')
+    parser.add_argument(
+        '--long_work_dir',
+        action='store_true',
+        help='Whether use full relative path of config as work dir')
+    parser.add_argument(
+        '--max_keep_ckpts',
+        type=int,
+        default=1,
+        help='The max number of checkpoints saved in training')
+    parser.add_argument(
+        '--full_log',
+        action='store_true',
+        help='Whether save full log in a file')
+
+    args = parser.parse_args()
+    return args
+
+
+args = parse_args()
+assert args.mode in ['train', 'test'], 'Currently we only support ' \
+    'automatically generating training or testing scripts.'
+
+config_paths = []
+
+if args.input_file is not None:
+    with open(args.input_file, 'r') as fi:
+        config_paths = fi.read().strip().split('\n')
+else:
+    while True:
+        print('Please type a config path and '
+              'press enter (press enter directly to exit):')
+        config_path = input()
+        if config_path != '':
+            config_paths.append(config_path)
+        else:
+            break
+
+script = '''PARTITION=$1
+CHECKPOINT_DIR=$2
+
+'''
+
+if args.mode == 'train':
+    for i, config_path in enumerate(config_paths):
+        root_dir = osp.dirname(osp.dirname(osp.abspath(__file__)))
+        if not osp.exists(osp.join(root_dir, config_path)):
+            print(f'Invalid config path (does not exist):\n{config_path}')
+            continue
+
+        config_name = config_path.split('/')[-1][:-3]
+        match_obj = re.match(r'^.*_[0-9]+x([0-9]+)_.*$', config_name)
+        if match_obj is None:
+            print(f'Invalid config path (no GPU num in '
+                  f'config name):\n{config_path}')
+            continue
+
+        gpu_num = int(match_obj.group(1))
+        work_dir_name = config_path if args.long_work_dir else config_name
+
+        script += f"echo '{config_path}' &\n"
+        if args.full_log:
+            script += f'mkdir -p $CHECKPOINT_DIR/{work_dir_name}\n'
+
+        # training commands
+        script += f'GPUS={gpu_num} GPUS_PER_NODE={args.gpus_per_node} ' \
+                  f'CPUS_PER_TASK={args.cpus_per_task} ' \
+                  f'./tools/slurm_train.sh $PARTITION {config_name} ' \
+                  f'{config_path} \\\n'
+        script += f'$CHECKPOINT_DIR/{work_dir_name} --cfg-options ' \
+                  f'checkpoint_config.max_keep_ckpts=' \
+                  f'{args.max_keep_ckpts} \\\n' \
+
+        # if output full log, redirect stdout and stderr to
+        # another log file in work dir
+        if args.full_log:
+            script += f'2>&1|tee $CHECKPOINT_DIR/{work_dir_name}' \
+                      f'/FULL_LOG.txt &\n'
+        else:
+            script += '>/dev/null &\n'
+
+        if i != len(config_paths) - 1:
+            script += '\n'
+
+        print(f'Successfully generated script for {config_name}')
+
+    with open(args.output_file, 'w') as fo:
+        fo.write(script)
+
+elif args.mode == 'test':
+    for i, config_path in enumerate(config_paths):
+        root_dir = osp.dirname(osp.dirname(osp.abspath(__file__)))
+        if not osp.exists(osp.join(root_dir, config_path)):
+            print(f'Invalid config path (does not exist):\n{config_path}')
+            continue
+
+        config_name = config_path.split('/')[-1][:-3]
+
+        tasks = {
+            'scannet_seg', 'scannet', 's3dis_seg', 'sunrgbd', 'kitti', 'nus',
+            'lyft', 'waymo'
+        }
+        eval_option = None
+        for task in tasks:
+            if task in config_name:
+                eval_option = task
+                break
+        if eval_option is None:
+            print(f'Invalid config path (invalid task):\n{config_path}')
+            continue
+
+        work_dir_name = config_path if args.long_work_dir else config_name
+
+        script += f"echo '{config_path}' &\n"
+        if args.full_log:
+            script += f'mkdir -p $CHECKPOINT_DIR/{work_dir_name}\n'
+
+        # training commands
+        script += f'GPUS={args.gpus} GPUS_PER_NODE={args.gpus_per_node} ' \
+                  f'CPUS_PER_TASK={args.cpus_per_task} ' \
+                  f'./tools/slurm_test.sh $PARTITION {config_name} ' \
+                  f'{config_path} \\\n'
+        script += f'$CHECKPOINT_DIR/{work_dir_name}/latest.pth ' \
+
+        if eval_option in ['scannet_seg', 's3dis_seg']:
+            script += '--eval mIoU \\\n'
+        elif eval_option in ['scannet', 'sunrgbd', 'kitti', 'nus']:
+            script += '--eval map \\\n'
+        elif eval_option in ['lyft']:
+            script += f'--format-only --eval-options jsonfile_prefix=' \
+                      f'$CHECKPOINT_DIR/{work_dir_name}/results_challenge ' \
+                      f'csv_savepath=$CHECKPOINT_DIR/{work_dir_name}/' \
+                      f'results_challenge.csv \\\n'
+        elif eval_option in ['waymo']:
+            script += f'--eval waymo --eval-options pklfile_prefix=' \
+                      f'$CHECKPOINT_DIR/{work_dir_name}/kitti_results ' \
+                      f'submission_prefix=$CHECKPOINT_DIR/{work_dir_name}/' \
+                      f'kitti_results \\\n'
+
+        # if output full log, redirect stdout and stderr to
+        # another log file in work dir
+        if args.full_log:
+            script += f'2>&1|tee $CHECKPOINT_DIR/{work_dir_name}' \
+                      f'/FULL_LOG.txt &\n'
+        else:
+            script += '>/dev/null &\n'
+
+        if i != len(config_paths) - 1:
+            script += '\n'
+
+        print(f'Successfully generated script for {config_name}')
+
+    with open(args.output_file, 'w') as fo:
+        fo.write(script)
diff --git a/mmde/.dev_scripts/linter.sh b/mmde/.dev_scripts/linter.sh
new file mode 100644
index 0000000000000000000000000000000000000000..64161ca654c06b4400ee8da51ede24b506e97c26
--- /dev/null
+++ b/mmde/.dev_scripts/linter.sh
@@ -0,0 +1,3 @@
+yapf -r -i mmdet3d/ configs/ tests/ tools/
+isort mmdet3d/ configs/ tests/ tools/
+flake8 .
diff --git a/mmde/.dev_scripts/test_benchmark.sh b/mmde/.dev_scripts/test_benchmark.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d95a66c4558ba0882765f04a06e7e86451f18899
--- /dev/null
+++ b/mmde/.dev_scripts/test_benchmark.sh
@@ -0,0 +1,128 @@
+PARTITION=$1
+CHECKPOINT_DIR=$2
+
+echo 'configs/3dssd/3dssd_4xb4_kitti-3d-car.py' &
+mkdir -p $CHECKPOINT_DIR/configs/3dssd/3dssd_4xb4_kitti-3d-car.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION 3dssd_4x4_kitti-3d-car configs/3dssd/3dssd_4xb4_kitti-3d-car.py \
+$CHECKPOINT_DIR/configs/3dssd/3dssd_4xb4_kitti-3d-car.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/3dssd/3dssd_4xb4_kitti-3d-car.py/FULL_LOG.txt &
+
+echo 'configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION centerpoint_02pillar_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py \
+$CHECKPOINT_DIR/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py/FULL_LOG.txt &
+
+echo 'configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101-caffe-fpn-head-gn-dcn_8xb2-1x_nus-mono3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d configs/fcos3d/fcos3d_r101-caffe-fpn-head-gn-dcn_8xb2-1x_nus-mono3d.py \
+$CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101-caffe-fpn-head-gn-dcn_8xb2-1x_nus-mono3d.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101-caffe-fpn-head-gn-dcn_8xb2-1x_nus-mono3d.py/FULL_LOG.txt &
+
+echo 'configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py \
+$CHECKPOINT_DIR/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py/FULL_LOG.txt &
+
+echo 'configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py' &
+mkdir -p $CHECKPOINT_DIR/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION groupfree3d_8x4_scannet-3d-18class-L6-O256 configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py \
+$CHECKPOINT_DIR/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py/FULL_LOG.txt &
+
+echo 'configs/h3dnet/h3dnet_8xb3_scannet-seg.py' &
+mkdir -p $CHECKPOINT_DIR/configs/h3dnet/h3dnet_8xb3_scannet-seg.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION h3dnet_3x8_scannet-3d-18class configs/h3dnet/h3dnet_8xb3_scannet-seg.py \
+$CHECKPOINT_DIR/configs/h3dnet/h3dnet_8xb3_scannet-seg.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/h3dnet/h3dnet_8xb3_scannet-seg.py/FULL_LOG.txt &
+
+echo 'configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py \
+$CHECKPOINT_DIR/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py/FULL_LOG.txt &
+
+echo 'configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION imvotenet_stage2_16x8_sunrgbd-3d-10class configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py \
+$CHECKPOINT_DIR/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py/FULL_LOG.txt &
+
+echo 'configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py' &
+mkdir -p $CHECKPOINT_DIR/configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION imvoxelnet_4x8_kitti-3d-car configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py \
+$CHECKPOINT_DIR/configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py/FULL_LOG.txt &
+
+echo 'configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py' &
+mkdir -p $CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION pointnet2_msg_16x2_cosine_80e_s3dis_seg-3d-13class configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py \
+$CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py/latest.pth --eval mIoU \
+2>&1|tee $CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py/FULL_LOG.txt &
+
+echo 'configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py' &
+mkdir -p $CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION pointnet2_msg_16x2_cosine_250e_scannet_seg-3d-20class configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py \
+$CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py/FULL_LOG.txt &
+
+echo 'configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py \
+$CHECKPOINT_DIR/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py/latest.pth --format-only --eval-options jsonfile_prefix=$CHECKPOINT_DIR/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py/results_challenge csv_savepath=$CHECKPOINT_DIR/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py/results_challenge.csv \
+2>&1|tee $CHECKPOINT_DIR/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py/FULL_LOG.txt &
+
+echo 'configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py \
+$CHECKPOINT_DIR/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py/latest.pth --eval waymo --eval-options pklfile_prefix=$CHECKPOINT_DIR/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py/kitti_results submission_prefix=$CHECKPOINT_DIR/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py/kitti_results \
+2>&1|tee $CHECKPOINT_DIR/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py \
+$CHECKPOINT_DIR/configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py/FULL_LOG.txt &
+
+echo 'configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_second_secfpn_6x8_80e_kitti-3d-3class configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py \
+$CHECKPOINT_DIR/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py/latest.pth --format-only --eval-options jsonfile_prefix=$CHECKPOINT_DIR/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py/results_challenge csv_savepath=$CHECKPOINT_DIR/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py/results_challenge.csv \
+2>&1|tee $CHECKPOINT_DIR/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py/FULL_LOG.txt &
+
+echo 'configs/votenet/votenet_8xb8_scannet-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/votenet/votenet_8xb8_scannet-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_test.sh $PARTITION votenet_8x8_scannet-3d-18class configs/votenet/votenet_8xb8_scannet-3d.py \
+$CHECKPOINT_DIR/configs/votenet/votenet_8xb8_scannet-3d.py/latest.pth --eval map \
+2>&1|tee $CHECKPOINT_DIR/configs/votenet/votenet_8xb8_scannet-3d.py/FULL_LOG.txt &
diff --git a/mmde/.dev_scripts/train_benchmark.sh b/mmde/.dev_scripts/train_benchmark.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9efa3208454374212e3fa1975cf474494c16271b
--- /dev/null
+++ b/mmde/.dev_scripts/train_benchmark.sh
@@ -0,0 +1,128 @@
+PARTITION=$1
+CHECKPOINT_DIR=$2
+
+echo 'configs/3dssd/3dssd_4xb4_kitti-3d-car.py' &
+mkdir -p $CHECKPOINT_DIR/configs/3dssd/3dssd_4xb4_kitti-3d-car.py
+GPUS=4 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION 3dssd_4x4_kitti-3d-car configs/3dssd/3dssd_4xb4_kitti-3d-car.py \
+$CHECKPOINT_DIR/configs/3dssd/3dssd_4xb4_kitti-3d-car.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/3dssd/3dssd_4xb4_kitti-3d-car.py/FULL_LOG.txt &
+
+echo 'configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION centerpoint_02pillar_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py \
+$CHECKPOINT_DIR/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py/FULL_LOG.txt &
+
+echo 'configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101-caffe-fpn-head-gn-dcn_8xb2-1x_nus-mono3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d configs/fcos3d/fcos3d_r101-caffe-fpn-head-gn-dcn_8xb2-1x_nus-mono3d.py \
+$CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101-caffe-fpn-head-gn-dcn_8xb2-1x_nus-mono3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/fcos3d/fcos3d_r101-caffe-fpn-head-gn-dcn_8xb2-1x_nus-mono3d.py/FULL_LOG.txt &
+
+echo 'configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/second/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py \
+$CHECKPOINT_DIR/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py/FULL_LOG.txt &
+
+echo 'configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py' &
+mkdir -p $CHECKPOINT_DIR/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
+GPUS=4 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION groupfree3d_8x4_scannet-3d-18class-L6-O256 configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py \
+$CHECKPOINT_DIR/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py/FULL_LOG.txt &
+
+echo 'configs/h3dnet/h3dnet_8xb3_scannet-seg.py' &
+mkdir -p $CHECKPOINT_DIR/configs/h3dnet/h3dnet_8xb3_scannet-seg.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION h3dnet_3x8_scannet-3d-18class configs/h3dnet/h3dnet_8xb3_scannet-seg.py \
+$CHECKPOINT_DIR/configs/h3dnet/h3dnet_8xb3_scannet-seg.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/h3dnet/h3dnet_8xb3_scannet-seg.py/FULL_LOG.txt &
+
+echo 'configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
+GPUS=4 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py \
+$CHECKPOINT_DIR/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py/FULL_LOG.txt &
+
+echo 'configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION imvotenet_stage2_16x8_sunrgbd-3d-10class configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py \
+$CHECKPOINT_DIR/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py/FULL_LOG.txt &
+
+echo 'configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py' &
+mkdir -p $CHECKPOINT_DIR/configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION imvoxelnet_4x8_kitti-3d-car configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py \
+$CHECKPOINT_DIR/configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py/FULL_LOG.txt &
+
+echo 'configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py' &
+mkdir -p $CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py
+GPUS=2 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION pointnet2_msg_16x2_cosine_80e_s3dis_seg-3d-13class configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py \
+$CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py/FULL_LOG.txt &
+
+echo 'configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py' &
+mkdir -p $CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py
+GPUS=2 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION pointnet2_msg_16x2_cosine_250e_scannet_seg-3d-20class configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py \
+$CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py/FULL_LOG.txt &
+
+echo 'configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py \
+$CHECKPOINT_DIR/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py/FULL_LOG.txt &
+
+echo 'configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py
+GPUS=16 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py \
+$CHECKPOINT_DIR/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py \
+$CHECKPOINT_DIR/configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py/FULL_LOG.txt &
+
+echo 'configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py' &
+mkdir -p $CHECKPOINT_DIR/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_second_secfpn_6x8_80e_kitti-3d-3class configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py \
+$CHECKPOINT_DIR/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py/FULL_LOG.txt &
+
+echo 'configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py
+GPUS=16 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py \
+$CHECKPOINT_DIR/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py/FULL_LOG.txt &
+
+echo 'configs/votenet/votenet_8xb8_scannet-3d.py' &
+mkdir -p $CHECKPOINT_DIR/configs/votenet/votenet_8xb8_scannet-3d.py
+GPUS=8 GPUS_PER_NODE=8 CPUS_PER_TASK=5 ./tools/slurm_train.sh $PARTITION votenet_8x8_scannet-3d-18class configs/votenet/votenet_8xb8_scannet-3d.py \
+$CHECKPOINT_DIR/configs/votenet/votenet_8xb8_scannet-3d.py --cfg-options checkpoint_config.max_keep_ckpts=1 \
+2>&1|tee $CHECKPOINT_DIR/configs/votenet/votenet_8xb8_scannet-3d.py/FULL_LOG.txt &
diff --git a/mmde/.github/CODE_OF_CONDUCT.md b/mmde/.github/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..92afad1c5ab5d5781115dee45c131d3751d3cd31
--- /dev/null
+++ b/mmde/.github/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+- Using welcoming and inclusive language
+- Being respectful of differing viewpoints and experiences
+- Gracefully accepting constructive criticism
+- Focusing on what is best for the community
+- Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+- The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+- Trolling, insulting/derogatory comments, and personal or political attacks
+- Public or private harassment
+- Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+- Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at chenkaidev@gmail.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
+
+[homepage]: https://www.contributor-covenant.org
diff --git a/mmde/.github/CONTRIBUTING.md b/mmde/.github/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..9b015021e5e9c2f9bbe9b4937560cbd6bacd3711
--- /dev/null
+++ b/mmde/.github/CONTRIBUTING.md
@@ -0,0 +1 @@
+We appreciate all contributions to improve MMDetection3D. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) in MMCV for more details about the contributing guideline.
diff --git a/mmde/.github/ISSUE_TEMPLATE/1-bug-report.yml b/mmde/.github/ISSUE_TEMPLATE/1-bug-report.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6c76fedcf921cc265d28d2bf97d6400e34d81b6b
--- /dev/null
+++ b/mmde/.github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -0,0 +1,119 @@
+name: "🐞 Bug report"
+description: "Create a report to help us reproduce and fix the bug"
+labels: kind/bug
+title: "[Bug] "
+
+body:
+  - type: markdown
+    attributes:
+      value: |
+        ## Note
+        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmdetection3d/discussions).
+        If this issue is about installing MMCV, please file an issue at [MMCV](https://github.com/open-mmlab/mmcv/issues/new/choose).
+        If it's anything about model deployment, please raise it to [MMDeploy](https://github.com/open-mmlab/mmdeploy).
+
+        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**
+
+  - type: checkboxes
+    attributes:
+      label: Prerequisite
+      description: Please check the following items before creating a new issue.
+      options:
+      - label: I have searched [Issues](https://github.com/open-mmlab/mmdetection3d/issues) and [Discussions](https://github.com/open-mmlab/mmdetection3d/discussions) but cannot get the expected help.
+        required: true
+      - label: I have read the [FAQ documentation](https://mmdetection3d.readthedocs.io/en/latest/notes/faq.html) but cannot get the expected help.
+        required: true
+      - label: The bug has not been fixed in the [latest version (dev-1.x)](https://github.com/open-mmlab/mmdetection3d/tree/dev-1.x) or [latest version (dev-1.0)](https://github.com/open-mmlab/mmdetection3d/tree/dev-1.0).
+        required: true
+
+  - type: dropdown
+    id: task
+    attributes:
+      label: Task
+      description: The problem arises when
+      options:
+        - I'm using the official example scripts/configs for the officially supported tasks/models/datasets.
+        - I have modified the scripts/configs, or I'm working on my own tasks/models/datasets.
+    validations:
+      required: true
+
+  - type: dropdown
+    id: branch
+    attributes:
+      label: Branch
+      description: The problem arises when I'm working on
+      options:
+        - main branch https://github.com/open-mmlab/mmdetection3d
+        - 1.x branch https://github.com/open-mmlab/mmdetection3d/tree/dev-1.x
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: Environment
+      description: |
+        Please run `python mmdet3d/utils/collect_env.py` to collect necessary environment information and copy-paste it here.
+        You may add additional information that may be helpful for locating the problem, such as
+          - How you installed PyTorch \[e.g., pip, conda, source\]
+          - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: Reproduces the problem - code sample
+      description: |
+        Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
+      placeholder: |
+        ```python
+        # Sample code to reproduce the problem
+        ```
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: Reproduces the problem - command or script
+      description: |
+        What command or script did you run?
+      placeholder: |
+        ```shell
+        The command or script you run.
+        ```
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: Reproduces the problem - error message
+      description: |
+        Please provide the error message or logs you got, with the full traceback.
+
+        Tip: You can attach images or log files by dragging them into the text area..
+      placeholder: |
+        ```
+        The error message or logs you got, with the full traceback.
+        ```
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: Additional information
+      description: |
+        Tell us anything else you think we should know.
+
+        Tip: You can attach images or log files by dragging them into the text area.
+      placeholder: |
+        1. What's your expected result?
+        2. What dataset did you use?
+        3. What do you think might be the reason?
+
+  - type: markdown
+    attributes:
+      value: |
+        ## Acknowledgement
+        Thanks for taking the time to fill out this report.
+
+        If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**Here**](https://github.com/open-mmlab/mmdetection3d/pulls)!
+        Please refer to [**Contribution Guide**](https://mmdetection3d.readthedocs.io/en/latest/notes/contribution_guides.html) for contributing.
diff --git a/mmde/.github/ISSUE_TEMPLATE/2-feature_request.yml b/mmde/.github/ISSUE_TEMPLATE/2-feature_request.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0dca109bab10afd1047f21287842b61e012f9027
--- /dev/null
+++ b/mmde/.github/ISSUE_TEMPLATE/2-feature_request.yml
@@ -0,0 +1,37 @@
+name: 🚀 Feature request
+description: Suggest an idea for this project
+labels: [feature-request]
+title: "[Feature] "
+
+body:
+  - type: markdown
+    attributes:
+      value: |
+        ## Note
+        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmdetection3d/discussions).
+
+        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**
+
+  - type: textarea
+    attributes:
+      label: What is the feature?
+      description: Tell us more about the feature and how this feature can help.
+      placeholder: |
+        E.g., It is inconvenient when \[....\].
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: Any other context?
+      description: |
+        Have you considered any alternative solutions or features? If so, what are they? Also, feel free to add any other context or screenshots about the feature request here.
+
+  - type: markdown
+    attributes:
+      value: |
+        ## Acknowledgement
+        Thanks for taking the time to fill out this report.
+
+        We strongly appreciate you creating a new PR to implement it [**Here**](https://github.com/open-mmlab/mmdetection3d/pulls)!
+        Please refer to [**Contribution Guide**](https://mmdetection3d.readthedocs.io/en/latest/notes/contribution_guides.html) for contributing.
diff --git a/mmde/.github/ISSUE_TEMPLATE/3-new-model.yml b/mmde/.github/ISSUE_TEMPLATE/3-new-model.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c4771018d4a5af03eb1b6ab42b5fd446818a669f
--- /dev/null
+++ b/mmde/.github/ISSUE_TEMPLATE/3-new-model.yml
@@ -0,0 +1,49 @@
+name: "\U0001F31F New model/dataset/scheduler addition"
+description: Submit a proposal/request to implement a new model / dataset / scheduler
+labels: [ "feature-request" ]
+title: "[New Models] "
+
+
+body:
+  - type: markdown
+    attributes:
+      value: |
+        ## Note
+        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmdetection3d/discussions).
+
+        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**
+
+  - type: textarea
+    id: description-request
+    validations:
+      required: true
+    attributes:
+      label: Model/Dataset/Scheduler description
+      description: |
+        Put any and all important information relative to the model/dataset/scheduler
+
+  - type: checkboxes
+    attributes:
+      label: Open source status
+      description: |
+          Please provide the open-source status, which would be very helpful
+      options:
+        - label: "The model implementation is available"
+        - label: "The model weights are available."
+
+  - type: textarea
+    id: additional-info
+    attributes:
+      label: Provide useful links for the implementation
+      description: |
+        Please provide information regarding the implementation, the weights, and the authors.
+        Please mention the authors by @gh-username if you're aware of their usernames.
+
+  - type: markdown
+    attributes:
+      value: |
+        ## Acknowledgement
+        Thanks for taking the time to fill out this report.
+
+        We strongly appreciate you creating a new PR to implement it [**Here**](https://github.com/open-mmlab/mmdetection3d/pulls)!
+        Please refer to [**Contribution Guide**](https://mmdetection3d.readthedocs.io/en/latest/notes/contribution_guides.html) for contributing.
diff --git a/mmde/.github/ISSUE_TEMPLATE/4-documentation.yml b/mmde/.github/ISSUE_TEMPLATE/4-documentation.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e362f69a74aa4ce9b09f2157228ddf29cd258b04
--- /dev/null
+++ b/mmde/.github/ISSUE_TEMPLATE/4-documentation.yml
@@ -0,0 +1,47 @@
+name: 📚 Documentation
+description: Report an issue related to the documentation.
+labels: "docs"
+title: "[Docs] "
+
+body:
+  - type: markdown
+    attributes:
+      value: |
+        ## Note
+        For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmdetection3d/discussions).
+
+        Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.**
+
+  - type: dropdown
+    id: branch
+    attributes:
+      label: Branch
+      description: This issue is related to the
+      options:
+        - main branch  https://mmdetection3d.readthedocs.io/en/latest/
+        - dev-1.x branch https://mmdetection3d.readthedocs.io/en/dev-1.x/
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: 📚 The doc issue
+      description: >
+        A clear and concise description the issue.
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: Suggest a potential alternative/fix
+      description: >
+        Tell us how we could improve the documentation in this regard.
+
+  - type: markdown
+    attributes:
+      value: |
+        ## Acknowledgement
+        Thanks for taking the time to fill out this report.
+
+        If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**here**](https://github.com/open-mmlab/mmdetection3d/pulls)!
+        Please refer to [**Contribution Guide**](https://mmdetection3d.readthedocs.io/en/latest/notes/contribution_guides.html) for contributing.
diff --git a/mmde/.github/ISSUE_TEMPLATE/config.yml b/mmde/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0bd05bc557b48571cb2ca2903cc551d331ee58b1
--- /dev/null
+++ b/mmde/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,9 @@
+blank_issues_enabled: false
+
+contact_links:
+  - name: Common Issues
+    url: https://mmdetection3d.readthedocs.io/en/latest/notes/faq.html
+    about: Check if your issue already has solutions
+  - name: MMDet3D Documentation
+    url: https://mmdetection3d.readthedocs.io/en/latest/
+    about: Check if your question is answered in docs
diff --git a/mmde/.github/pull_request_template.md b/mmde/.github/pull_request_template.md
new file mode 100644
index 0000000000000000000000000000000000000000..3668d833ba68ab2dc534bac2c2eb87ba0de9ff1d
--- /dev/null
+++ b/mmde/.github/pull_request_template.md
@@ -0,0 +1,25 @@
+Thanks for your contribution and we appreciate it a lot. The following instructions would make your pull request more healthy and more easily get feedback. If you do not understand some items, don't worry, just make the pull request and seek help from maintainers.
+
+## Motivation
+
+Please describe the motivation of this PR and the goal you want to achieve through this PR.
+
+## Modification
+
+Please briefly describe what modification is made in this PR.
+
+## BC-breaking (Optional)
+
+Does the modification introduce changes that break the back-compatibility of the downstream repos?
+If so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR.
+
+## Use cases (Optional)
+
+If this PR introduces a new feature, it is better to list some use cases here, and update the documentation.
+
+## Checklist
+
+1. Pre-commit or other linting tools are used to fix the potential lint issues.
+2. The modification is covered by complete unit tests. If not, please add more unit test to ensure the correctness.
+3. If the modification has potential influence on downstream projects, this PR should be tested with downstream projects.
+4. The documentation has been modified accordingly, like docstring or example tutorials.
diff --git a/mmde/.github/workflows/deploy.yml b/mmde/.github/workflows/deploy.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a5f8dbec2b326ee3a3a7857b11557dc2cf3c02bd
--- /dev/null
+++ b/mmde/.github/workflows/deploy.yml
@@ -0,0 +1,28 @@
+name: deploy
+
+on: push
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-n-publish:
+    runs-on: ubuntu-latest
+    if: startsWith(github.event.ref, 'refs/tags')
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - name: Install torch
+        run: pip install torch
+      - name: Install wheel
+        run: pip install wheel
+      - name: Build MMDet3D
+        run: python setup.py sdist bdist_wheel
+      - name: Publish distribution to PyPI
+        run: |
+          pip install twine
+          twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }}
diff --git a/mmde/.github/workflows/lint.yml b/mmde/.github/workflows/lint.yml
new file mode 100644
index 0000000000000000000000000000000000000000..62a6ac1103fbb70425f4d3784e20ab18efbb4e06
--- /dev/null
+++ b/mmde/.github/workflows/lint.yml
@@ -0,0 +1,27 @@
+name: lint
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - name: Install pre-commit hook
+        run: |
+          pip install pre-commit
+          pre-commit install
+      - name: Linting
+        run: pre-commit run --all-files
+      - name: Check docstring coverage
+        run: |
+          pip install interrogate
+          interrogate -v --ignore-init-method --ignore-magic --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 90 mmdet3d
diff --git a/mmde/.github/workflows/merge_stage_test.yml b/mmde/.github/workflows/merge_stage_test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0c1cf119c2ecd1ef436da324fc21613c65f6041d
--- /dev/null
+++ b/mmde/.github/workflows/merge_stage_test.yml
@@ -0,0 +1,263 @@
+name: merge_stage_test
+
+on:
+  push:
+    paths-ignore:
+      - 'README.md'
+      - 'README_zh-CN.md'
+      - 'docs/**'
+      - 'demo/**'
+      - '.dev_scripts/**'
+      - '.circleci/**'
+    branches:
+      - dev-1.x
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build_cpu_py:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        python-version: [3.8, 3.9]
+        torch: [1.8.1]
+        include:
+          - torch: 1.8.1
+            torchvision: 0.9.1
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: pip install pip --upgrade && pip install wheel
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+      - name: Install MMEngine
+        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
+      - name: Install MMCV
+        run: |
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+      - name: Install MMDet
+        run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+      - name: Install other dependencies
+        run: pip install -r requirements/tests.txt
+      - name: Build and install
+        run: rm -rf .eggs && pip install -e .
+      - name: Run unittests and generate coverage report
+        run: |
+          coverage run --branch --source mmdet3d -m pytest tests/
+          coverage xml
+          coverage report -m
+
+  build_cpu_pt:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        python-version: [3.7]
+        torch: [1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.0, 1.13.0]
+        include:
+          - torch: 1.8.1
+            torchvision: 0.9.1
+          - torch: 1.9.1
+            torchvision: 0.10.1
+          - torch: 1.10.1
+            torchvision: 0.11.2
+          - torch: 1.11.0
+            torchvision: 0.12.0
+          - torch: 1.12.0
+            torchvision: 0.13.0
+          - torch: 1.13.0
+            torchvision: 0.14.0
+          - python-version: 3.8
+            torch: 2.0.0
+            torchvision: 0.15.1
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: pip install pip --upgrade && pip install wheel
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+      - name: Install MMEngine
+        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
+      - name: Install MMCV
+        run: |
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+      - name: Install MMDet
+        run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+      - name: Install other dependencies
+        run: pip install -r requirements/tests.txt
+      - name: Build and install
+        run: rm -rf .eggs && pip install -e .
+      - name: Run unittests and generate coverage report
+        run: |
+          coverage run --branch --source mmdet3d -m pytest tests/
+          coverage xml
+          coverage report -m
+      # Only upload coverage report for python3.7 && pytorch1.8.1 cpu
+      - name: Upload coverage to Codecov
+        if: ${{matrix.torch == '1.8.1' && matrix.python-version == '3.7'}}
+        uses: codecov/codecov-action@v1.0.14
+        with:
+          file: ./coverage.xml
+          flags: unittests
+          env_vars: OS,PYTHON
+          name: codecov-umbrella
+          fail_ci_if_error: false
+
+  build_cu102:
+    runs-on: ubuntu-22.04
+    container:
+      image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel
+    strategy:
+      matrix:
+        python-version: [3.7]
+        include:
+          - torch: 1.8.1
+            cuda: 10.2
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: pip install pip --upgrade && pip install wheel
+      - name: Fetch GPG keys
+        run: |
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+      - name: Install system dependencies
+        run: apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6
+      - name: Install mmdet3d dependencies
+        run: |
+          pip install git+https://github.com/open-mmlab/mmengine.git@main
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+          pip install -r requirements/tests.txt
+      - name: Build and install
+        run: pip install -e .
+      - name: Run unittests and generate coverage report
+        run: |
+          coverage run --branch --source mmdet3d -m pytest tests/
+          coverage xml
+          coverage report -m
+
+  build_cu116:
+    runs-on: ubuntu-22.04
+    container:
+      image: pytorch/pytorch:1.13.0-cuda11.6-cudnn8-devel
+    strategy:
+      matrix:
+        python-version: [3.7]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: pip install pip --upgrade && pip install wheel
+      - name: Fetch GPG keys
+        run: |
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+      - name: Install system dependencies
+        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg
+      - name: Install mmdet3d dependencies
+        run: |
+          pip install git+https://github.com/open-mmlab/mmengine.git@main
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+          pip install -r requirements/tests.txt
+      - name: Build and install
+        run: pip install -e .
+      - name: Run unittests and generate coverage report
+        run: |
+          coverage run --branch --source mmcv -m pytest tests
+          coverage xml
+          coverage report -m
+
+  build_cu117:
+    runs-on: ubuntu-22.04
+    container:
+      image: pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
+    strategy:
+      matrix:
+        python-version: [3.9]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: pip install pip --upgrade && pip install wheel
+      - name: Fetch GPG keys
+        run: |
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+      - name: Install system dependencies
+        run: apt-get update && apt-get install -y git ffmpeg libturbojpeg
+      - name: Install mmdet3d dependencies
+        run: |
+          pip install git+https://github.com/open-mmlab/mmengine.git@main
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+          pip install -r requirements/tests.txt
+      - name: Build and install
+        run: pip install -e .
+      - name: Run unittests and generate coverage report
+        run: |
+          coverage run --branch --source mmcv -m pytest tests
+          coverage xml
+          coverage report -m
+
+  build_windows:
+    runs-on: windows-2022
+    strategy:
+      matrix:
+        python-version: [3.7]
+        platform: [cpu, cu111]
+        torch: [1.8.1]
+        torchvision: [0.9.1]
+        include:
+          - python-version: 3.8
+            platform: cu117
+            torch: 2.0.0
+            torchvision: 0.15.1
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: python -m pip install pip --upgrade && pip install wheel
+      - name: Install lmdb
+        run: pip install lmdb
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+${{matrix.platform}} torchvision==${{matrix.torchvision}}+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
+      - name: Install mmdet3d dependencies
+        run: |
+          pip install git+https://github.com/open-mmlab/mmengine.git@main
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+          pip install -r requirements/tests.txt
+      - name: Build and install
+        run: pip install -e .
+      - name: Run unittests and generate coverage report
+        run: pytest tests/
diff --git a/mmde/.github/workflows/pr_stage_test.yml b/mmde/.github/workflows/pr_stage_test.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ca0ef629de60c9de1443c96ad4d0a203675c10c6
--- /dev/null
+++ b/mmde/.github/workflows/pr_stage_test.yml
@@ -0,0 +1,170 @@
+name: pr_stage_test
+
+on:
+  pull_request:
+    paths-ignore:
+      - 'README.md'
+      - 'README_zh-CN.md'
+      - 'docs/**'
+      - 'demo/**'
+      - '.dev_scripts/**'
+      - '.circleci/**'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build_cpu:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        python-version: [3.7]
+        include:
+          - torch: 1.8.1
+            torchvision: 0.9.1
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: python -m pip install pip --upgrade && pip install wheel
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+      - name: Install MMEngine
+        run: pip install git+https://github.com/open-mmlab/mmengine.git@main
+      - name: Install MMCV
+        run: |
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+      - name: Install MMDet
+        run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+      - name: Install other dependencies
+        run: pip install -r requirements/tests.txt
+      - name: Build and install
+        run: rm -rf .eggs && pip install -e .
+      - name: Run unittests and generate coverage report
+        run: |
+          coverage run --branch --source mmdet3d -m pytest tests/
+          coverage xml
+          coverage report -m
+      # Upload coverage report for python3.7 && pytorch1.8.1 cpu
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v1.0.14
+        with:
+          file: ./coverage.xml
+          flags: unittests
+          env_vars: OS,PYTHON
+          name: codecov-umbrella
+          fail_ci_if_error: false
+
+  build_cu102:
+    runs-on: ubuntu-22.04
+    container:
+      image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel
+    strategy:
+      matrix:
+        python-version: [3.7]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: pip install pip --upgrade && pip install wheel
+      - name: Fetch GPG keys
+        run: |
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+      - name: Install system dependencies
+        run: apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6
+      - name: Install mmdet3d dependencies
+        run: |
+          pip install git+https://github.com/open-mmlab/mmengine.git@main
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+          pip install -r requirements/tests.txt
+      - name: Build and install
+        run: pip install -e .
+      - name: Run unittests and generate coverage report
+        run: |
+          coverage run --branch --source mmdet3d -m pytest tests/
+          coverage xml
+          coverage report -m
+
+  build_cu117:
+    runs-on: ubuntu-22.04
+    container:
+      image: pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
+    strategy:
+      matrix:
+        python-version: [3.9]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: pip install pip --upgrade && pip install wheel
+      - name: Fetch GPG keys
+        run: |
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+      - name: Install system dependencies
+        run: apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6
+      - name: Install mmdet3d dependencies
+        run: |
+          pip install git+https://github.com/open-mmlab/mmengine.git@main
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+          pip install -r requirements/tests.txt
+      - name: Build and install
+        run: pip install -e .
+      - name: Run unittests and generate coverage report
+        run: |
+          coverage run --branch --source mmdet3d -m pytest tests/
+          coverage xml
+          coverage report -m
+
+  build_windows:
+    runs-on: windows-2022
+    strategy:
+      matrix:
+        python-version: [3.7]
+        platform: [cpu, cu111]
+        torch: [1.8.1]
+        torchvision: [0.9.1]
+        include:
+          - python-version: 3.8
+            platform: cu117
+            torch: 2.0.0
+            torchvision: 0.15.1
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: python -m pip install pip --upgrade && pip install wheel
+      - name: Install lmdb
+        run: pip install lmdb
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+${{matrix.platform}} torchvision==${{matrix.torchvision}}+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
+      - name: Install mmdet3d dependencies
+        run: |
+          pip install git+https://github.com/open-mmlab/mmengine.git@main
+          pip install -U openmim
+          mim install 'mmcv >= 2.0.0rc4'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
+          pip install -r requirements/tests.txt
+      - name: Build and install
+        run: pip install -e .
+      - name: Run unittests and generate coverage report
+        run: pytest tests/
diff --git a/mmde/.github/workflows/test_mim.yml b/mmde/.github/workflows/test_mim.yml
new file mode 100644
index 0000000000000000000000000000000000000000..148c51de0ccc4c0faa842c33e10772634881ce8b
--- /dev/null
+++ b/mmde/.github/workflows/test_mim.yml
@@ -0,0 +1,44 @@
+name: test-mim
+
+on:
+  push:
+    paths:
+      - 'model-index.yml'
+      - 'configs/**'
+
+  pull_request:
+    paths:
+      - 'model-index.yml'
+      - 'configs/**'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build_cpu:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        python-version: [3.7]
+        torch: [1.8.1]
+        include:
+          - torch: 1.8.1
+            torch_version: torch1.8
+            torchvision: 0.9.1
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: pip install pip --upgrade && pip install wheel
+      - name: Install PyTorch
+        run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+      - name: Install openmim
+        run: pip install openmim
+      - name: Build and install
+        run: rm -rf .eggs && mim install -e .
+      - name: test commands of mim
+        run: mim search mmdet3d
diff --git a/mmde/.gitignore b/mmde/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..2fefc6a9046c1fd59fbf7c25df8d99d25960b414
--- /dev/null
+++ b/mmde/.gitignore
@@ -0,0 +1,137 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+*.ipynb
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/en/_build/
+docs/zh_cn/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# cython generated cpp
+data
+.vscode
+.idea
+
+# custom
+*.pkl
+*.pkl.json
+*.log.json
+work_dirs/
+exps/
+*~
+mmdet3d/.mim
+
+# Pytorch
+*.pth
+
+# demo
+*.jpg
+*.png
+data/s3dis/Stanford3dDataset_v1.2_Aligned_Version/
+data/scannet/scans/
+data/sunrgbd/OFFICIAL_SUNRGBD/
+*.obj
+*.ply
+
+# Waymo evaluation
+mmdet3d/evaluation/functional/waymo_utils/compute_detection_metrics_main
+mmdet3d/evaluation/functional/waymo_utils/compute_detection_let_metrics_main
+mmdet3d/evaluation/functional/waymo_utils/compute_segmentation_metrics_main
diff --git a/mmde/.pre-commit-config-zh-cn.yaml b/mmde/.pre-commit-config-zh-cn.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1c78ad1d46af323622e6fa8217dc7ae7ca915e53
--- /dev/null
+++ b/mmde/.pre-commit-config-zh-cn.yaml
@@ -0,0 +1,50 @@
+repos:
+  - repo: https://gitee.com/openmmlab/mirrors-flake8
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+  - repo: https://gitee.com/openmmlab/mirrors-isort
+    rev: 5.11.5
+    hooks:
+      - id: isort
+  - repo: https://gitee.com/openmmlab/mirrors-yapf
+    rev: v0.32.0
+    hooks:
+      - id: yapf
+  - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: trailing-whitespace
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: requirements-txt-fixer
+      - id: double-quote-string-fixer
+      - id: check-merge-conflict
+      - id: fix-encoding-pragma
+        args: ["--remove"]
+      - id: mixed-line-ending
+        args: ["--fix=lf"]
+  - repo: https://gitee.com/openmmlab/mirrors-codespell
+    rev: v2.2.1
+    hooks:
+      - id: codespell
+  - repo: https://gitee.com/openmmlab/mirrors-mdformat
+    rev: 0.7.9
+    hooks:
+      - id: mdformat
+        args: ["--number"]
+        additional_dependencies:
+          - mdformat-openmmlab
+          - mdformat_frontmatter
+          - linkify-it-py
+  - repo: https://gitee.com/openmmlab/mirrors-docformatter
+    rev: v1.3.1
+    hooks:
+      - id: docformatter
+        args: ["--in-place", "--wrap-descriptions", "79"]
+  - repo: https://gitee.com/openmmlab/pre-commit-hooks
+    rev: v0.2.0
+    hooks:
+      - id: check-algo-readme
+      - id: check-copyright
+        args: ["mmdet3d"]
diff --git a/mmde/.pre-commit-config.yaml b/mmde/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b0d7231c65c1730639d548a7b24f775554da6069
--- /dev/null
+++ b/mmde/.pre-commit-config.yaml
@@ -0,0 +1,50 @@
+repos:
+  - repo: https://github.com/PyCQA/flake8
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.11.5
+    hooks:
+      - id: isort
+  - repo: https://github.com/pre-commit/mirrors-yapf
+    rev: v0.32.0
+    hooks:
+      - id: yapf
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: trailing-whitespace
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: requirements-txt-fixer
+      - id: double-quote-string-fixer
+      - id: check-merge-conflict
+      - id: fix-encoding-pragma
+        args: ["--remove"]
+      - id: mixed-line-ending
+        args: ["--fix=lf"]
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.1
+    hooks:
+      - id: codespell
+  - repo: https://github.com/executablebooks/mdformat
+    rev: 0.7.9
+    hooks:
+      - id: mdformat
+        args: [ "--number" ]
+        additional_dependencies:
+          - mdformat-openmmlab
+          - mdformat_frontmatter
+          - linkify-it-py
+  - repo: https://github.com/myint/docformatter
+    rev: v1.3.1
+    hooks:
+      - id: docformatter
+        args: ["--in-place", "--wrap-descriptions", "79"]
+  - repo: https://github.com/open-mmlab/pre-commit-hooks
+    rev: v0.2.0  # Use the ref you want to point at
+    hooks:
+      - id: check-algo-readme
+      - id: check-copyright
+        args: ["mmdet3d"]  # replace the dir_to_check with your expected directory to check
diff --git a/mmde/.readthedocs.yml b/mmde/.readthedocs.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9b5979785858cba4d3d3441c96f7ba026ec82b2f
--- /dev/null
+++ b/mmde/.readthedocs.yml
@@ -0,0 +1,14 @@
+version: 2
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.8"
+
+formats:
+  - epub
+
+python:
+  install:
+    - requirements: requirements/docs.txt
+    - requirements: requirements/readthedocs.txt
diff --git a/mmde/CITATION.cff b/mmde/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..958f6f35f12985162b10c3fd05e884f7dfe44c9d
--- /dev/null
+++ b/mmde/CITATION.cff
@@ -0,0 +1,8 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+  - name: "MMDetection3D Contributors"
+title: "OpenMMLab's Next-generation Platform for General 3D Object Detection"
+date-released: 2020-07-23
+url: "https://github.com/open-mmlab/mmdetection3d"
+license: Apache-2.0
diff --git a/mmde/LICENSE b/mmde/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..04adf5cbc620ad190547b092fa449e36df5f7bf4
--- /dev/null
+++ b/mmde/LICENSE
@@ -0,0 +1,203 @@
+Copyright 2018-2019 Open-MMLab. All rights reserved.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2018-2019 Open-MMLab.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/mmde/MANIFEST.in b/mmde/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..4d334909a51461efe52067ef6d07be85d46ab37c
--- /dev/null
+++ b/mmde/MANIFEST.in
@@ -0,0 +1,6 @@
+include mmdet3d/.mim/model-index.yml
+include mmdet3d/.mim/dataset-index.yml
+include requirements/*.txt
+recursive-include mmdet3d/.mim/ops *.cpp *.cu *.h *.cc
+recursive-include mmdet3d/.mim/configs *.py *.yml
+recursive-include mmdet3d/.mim/tools *.sh *.py
diff --git a/mmde/README.md b/mmde/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b02f58824a6d94acdbd64574726979b21c47771
--- /dev/null
+++ b/mmde/README.md
@@ -0,0 +1,410 @@
+<div align="center">
+  <img src="resources/mmdet3d-logo.png" width="600"/>
+  <div>&nbsp;</div>
+  <div align="center">
+    <b><font size="5">OpenMMLab website</font></b>
+    <sup>
+      <a href="https://openmmlab.com">
+        <i><font size="4">HOT</font></i>
+      </a>
+    </sup>
+    &nbsp;&nbsp;&nbsp;&nbsp;
+    <b><font size="5">OpenMMLab platform</font></b>
+    <sup>
+      <a href="https://platform.openmmlab.com">
+        <i><font size="4">TRY IT OUT</font></i>
+      </a>
+    </sup>
+  </div>
+  <div>&nbsp;</div>
+
+[![PyPI](https://img.shields.io/pypi/v/mmdet3d)](https://pypi.org/project/mmdet3d)
+[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmdetection3d.readthedocs.io/en/latest/)
+[![badge](https://github.com/open-mmlab/mmdetection3d/workflows/build/badge.svg)](https://github.com/open-mmlab/mmdetection3d/actions)
+[![codecov](https://codecov.io/gh/open-mmlab/mmdetection3d/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmdetection3d)
+[![license](https://img.shields.io/github/license/open-mmlab/mmdetection3d.svg)](https://github.com/open-mmlab/mmdetection3d/blob/main/LICENSE)
+[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmdetection3d.svg)](https://github.com/open-mmlab/mmdetection3d/issues)
+[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmdetection3d.svg)](https://github.com/open-mmlab/mmdetection3d/issues)
+
+[📘Documentation](https://mmdetection3d.readthedocs.io/en/latest/) |
+[🛠️Installation](https://mmdetection3d.readthedocs.io/en/latest/get_started.html) |
+[👀Model Zoo](https://mmdetection3d.readthedocs.io/en/latest/model_zoo.html) |
+[🆕Update News](https://mmdetection3d.readthedocs.io/en/latest/notes/changelog.html) |
+[🚀Ongoing Projects](https://github.com/open-mmlab/mmdetection3d/projects) |
+[🤔Reporting Issues](https://github.com/open-mmlab/mmdetection3d/issues/new/choose)
+
+</div>
+
+<div align="center">
+
+English | [简体中文](README_zh-CN.md)
+
+</div>
+
+<div align="center">
+  <a href="https://openmmlab.medium.com/" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/219255827-67c1a27f-f8c5-46a9-811d-5e57448c61d1.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://discord.com/channels/1037617289144569886/1046608014234370059" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/218347213-c080267f-cbb6-443e-8532-8e1ed9a58ea9.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://twitter.com/OpenMMLab" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/218346637-d30c8a0f-3eba-4699-8131-512fb06d46db.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://www.youtube.com/openmmlab" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/218346691-ceb2116a-465a-40af-8424-9f30d2348ca9.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://space.bilibili.com/1293512903" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/219026751-d7d14cce-a7c9-4e82-9942-8375fca65b99.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://www.zhihu.com/people/openmmlab" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/219026120-ba71e48b-6e94-4bd4-b4e9-b7d175b5e362.png" width="3%" alt="" /></a>
+</div>
+
+## Introduction
+
+MMDetection3D is an open source object detection toolbox based on PyTorch, towards the next-generation platform for general 3D detection. It is a part of the [OpenMMLab](https://openmmlab.com/) project.
+
+The main branch works with **PyTorch 1.8+**.
+
+![demo image](resources/mmdet3d_outdoor_demo.gif)
+
+<details open>
+<summary>Major features</summary>
+
+- **Support multi-modality/single-modality detectors out of box**
+
+  It directly supports multi-modality/single-modality detectors including MVXNet, VoteNet, PointPillars, etc.
+
+- **Support indoor/outdoor 3D detection out of box**
+
+  It directly supports popular indoor and outdoor 3D detection datasets, including ScanNet, SUNRGB-D, Waymo, nuScenes, Lyft, and KITTI. For nuScenes dataset, we also support [nuImages dataset](https://github.com/open-mmlab/mmdetection3d/tree/main/configs/nuimages).
+
+- **Natural integration with 2D detection**
+
+  All the about **300+ models, methods of 40+ papers**, and modules supported in [MMDetection](https://github.com/open-mmlab/mmdetection/blob/3.x/docs/en/model_zoo.md) can be trained or used in this codebase.
+
+- **High efficiency**
+
+  It trains faster than other codebases. The main results are as below. Details can be found in [benchmark.md](./docs/en/notes/benchmarks.md). We compare the number of samples trained per second (the higher, the better). The models that are not supported by other codebases are marked by `✗`.
+
+  |       Methods       | MMDetection3D | [OpenPCDet](https://github.com/open-mmlab/OpenPCDet) | [votenet](https://github.com/facebookresearch/votenet) | [Det3D](https://github.com/poodarchu/Det3D) |
+  | :-----------------: | :-----------: | :--------------------------------------------------: | :----------------------------------------------------: | :-----------------------------------------: |
+  |       VoteNet       |      358      |                          ✗                           |                           77                           |                      ✗                      |
+  |  PointPillars-car   |      141      |                          ✗                           |                           ✗                            |                     140                     |
+  | PointPillars-3class |      107      |                          44                          |                           ✗                            |                      ✗                      |
+  |       SECOND        |      40       |                          30                          |                           ✗                            |                      ✗                      |
+  |       Part-A2       |      17       |                          14                          |                           ✗                            |                      ✗                      |
+
+</details>
+
+Like [MMDetection](https://github.com/open-mmlab/mmdetection) and [MMCV](https://github.com/open-mmlab/mmcv), MMDetection3D can also be used as a library to support different projects on top of it.
+
+## What's New
+
+### Highlight
+
+In version 1.4, MMDetecion3D refactors the Waymo dataset and accelerates the preprocessing, training/testing setup, and evaluation of Waymo dataset. We also extends the support for camera-based, such as Monocular and BEV, 3D object detection models on Waymo. A detailed description of the Waymo data information is provided [here](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html).
+
+Besides, in version 1.4, MMDetection3D provides [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) to help community users get started with Waymo and use it for quick iterative development.
+
+**v1.4.0** was released in 8/1/2024：
+
+- Support the training of [DSVT](<(https://arxiv.org/abs/2301.06051)>) in `projects`
+- Support [Nerf-Det](https://arxiv.org/abs/2307.14620) in `projects`
+- Refactor Waymo dataset
+
+**v1.3.0** was released in 18/10/2023:
+
+- Support [CENet](https://arxiv.org/abs/2207.12691) in `projects`
+- Enhance demos with new 3D inferencers
+
+**v1.2.0** was released in 4/7/2023
+
+- Support [New Config Type](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) in `mmdet3d/configs`
+- Support the inference of [DSVT](<(https://arxiv.org/abs/2301.06051)>) in `projects`
+- Support downloading datasets from [OpenDataLab](https://opendatalab.com/) using `mim`
+
+**v1.1.1** was released in 30/5/2023:
+
+- Support [TPVFormer](https://arxiv.org/pdf/2302.07817.pdf) in `projects`
+- Support the training of BEVFusion in `projects`
+- Support lidar-based 3D semantic segmentation benchmark
+
+## Installation
+
+Please refer to [Installation](https://mmdetection3d.readthedocs.io/en/latest/get_started.html) for installation instructions.
+
+## Getting Started
+
+For detailed user guides and advanced guides, please refer to our [documentation](https://mmdetection3d.readthedocs.io/en/latest/):
+
+<details>
+<summary>User Guides</summary>
+
+- [Train & Test](https://mmdetection3d.readthedocs.io/en/latest/user_guides/index.html#train-test)
+  - [Learn about Configs](https://mmdetection3d.readthedocs.io/en/latest/user_guides/config.html)
+  - [Coordinate System](https://mmdetection3d.readthedocs.io/en/latest/user_guides/coord_sys_tutorial.html)
+  - [Dataset Preparation](https://mmdetection3d.readthedocs.io/en/latest/user_guides/dataset_prepare.html)
+  - [Customize Data Pipelines](https://mmdetection3d.readthedocs.io/en/latest/user_guides/data_pipeline.html)
+  - [Test and Train on Standard Datasets](https://mmdetection3d.readthedocs.io/en/latest/user_guides/train_test.html)
+  - [Inference](https://mmdetection3d.readthedocs.io/en/latest/user_guides/inference.html)
+  - [Train with Customized Datasets](https://mmdetection3d.readthedocs.io/en/latest/user_guides/new_data_model.html)
+- [Useful Tools](https://mmdetection3d.readthedocs.io/en/latest/user_guides/index.html#useful-tools)
+
+</details>
+
+<details>
+<summary>Advanced Guides</summary>
+
+- [Datasets](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/index.html#datasets)
+  - [KITTI Dataset](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/kitti.html)
+  - [NuScenes Dataset](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/nuscenes.html)
+  - [Lyft Dataset](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/lyft.html)
+  - [Waymo Dataset](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html)
+  - [SUN RGB-D Dataset](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/sunrgbd.html)
+  - [ScanNet Dataset](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/scannet.html)
+  - [S3DIS Dataset](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/s3dis.html)
+  - [SemanticKITTI Dataset](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/semantickitti.html)
+- [Supported Tasks](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/index.html#supported-tasks)
+  - [LiDAR-Based 3D Detection](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/supported_tasks/lidar_det3d.html)
+  - [Vision-Based 3D Detection](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/supported_tasks/vision_det3d.html)
+  - [LiDAR-Based 3D Semantic Segmentation](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/supported_tasks/lidar_sem_seg3d.html)
+- [Customization](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/index.html#customization)
+  - [Customize Datasets](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/customize_dataset.html)
+  - [Customize Models](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/customize_models.html)
+  - [Customize Runtime Settings](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/customize_runtime.html)
+
+</details>
+
+## Overview of Benchmark and Model Zoo
+
+Results and models are available in the [model zoo](docs/en/model_zoo.md).
+
+<div align="center">
+  <b>Components</b>
+</div>
+<table align="center">
+  <tbody>
+    <tr align="center" valign="bottom">
+      <td>
+        <b>Backbones</b>
+      </td>
+      <td>
+        <b>Heads</b>
+      </td>
+      <td>
+        <b>Features</b>
+      </td>
+    </tr>
+    <tr valign="top">
+      <td>
+      <ul>
+        <li><a href="configs/pointnet2">PointNet (CVPR'2017)</a></li>
+        <li><a href="configs/pointnet2">PointNet++ (NeurIPS'2017)</a></li>
+        <li><a href="configs/regnet">RegNet (CVPR'2020)</a></li>
+        <li><a href="configs/dgcnn">DGCNN (TOG'2019)</a></li>
+        <li>DLA (CVPR'2018)</li>
+        <li>MinkResNet (CVPR'2019)</li>
+        <li><a href="configs/minkunet">MinkUNet (CVPR'2019)</a></li>
+        <li><a href="configs/cylinder3d">Cylinder3D (CVPR'2021)</a></li>
+      </ul>
+      </td>
+      <td>
+      <ul>
+        <li><a href="configs/free_anchor">FreeAnchor (NeurIPS'2019)</a></li>
+      </ul>
+      </td>
+      <td>
+      <ul>
+        <li><a href="configs/dynamic_voxelization">Dynamic Voxelization (CoRL'2019)</a></li>
+      </ul>
+      </td>
+    </tr>
+</td>
+    </tr>
+  </tbody>
+</table>
+
+<div align="center">
+  <b>Architectures</b>
+</div>
+<table align="center">
+  <tbody>
+    <tr align="center" valign="middle">
+      <td>
+        <b>LiDAR-based 3D Object Detection</b>
+      </td>
+      <td>
+        <b>Camera-based 3D Object Detection</b>
+      </td>
+      <td>
+        <b>Multi-modal 3D Object Detection</b>
+      </td>
+      <td>
+        <b>3D Semantic Segmentation</b>
+      </td>
+    </tr>
+    <tr valign="top">
+      <td>
+        <li><b>Outdoor</b></li>
+        <ul>
+            <li><a href="configs/second">SECOND (Sensor'2018)</a></li>
+            <li><a href="configs/pointpillars">PointPillars (CVPR'2019)</a></li>
+            <li><a href="configs/ssn">SSN (ECCV'2020)</a></li>
+            <li><a href="configs/3dssd">3DSSD (CVPR'2020)</a></li>
+            <li><a href="configs/sassd">SA-SSD (CVPR'2020)</a></li>
+            <li><a href="configs/point_rcnn">PointRCNN (CVPR'2019)</a></li>
+            <li><a href="configs/parta2">Part-A2 (TPAMI'2020)</a></li>
+            <li><a href="configs/centerpoint">CenterPoint (CVPR'2021)</a></li>
+            <li><a href="configs/pv_rcnn">PV-RCNN (CVPR'2020)</a></li>
+            <li><a href="projects/CenterFormer">CenterFormer (ECCV'2022)</a></li>
+        </ul>
+        <li><b>Indoor</b></li>
+        <ul>
+            <li><a href="configs/votenet">VoteNet (ICCV'2019)</a></li>
+            <li><a href="configs/h3dnet">H3DNet (ECCV'2020)</a></li>
+            <li><a href="configs/groupfree3d">Group-Free-3D (ICCV'2021)</a></li>
+            <li><a href="configs/fcaf3d">FCAF3D (ECCV'2022)</a></li>
+            <li><a href="projects/TR3D">TR3D (ArXiv'2023)</a></li>
+      </ul>
+      </td>
+      <td>
+        <li><b>Outdoor</b></li>
+        <ul>
+          <li><a href="configs/imvoxelnet">ImVoxelNet (WACV'2022)</a></li>
+          <li><a href="configs/smoke">SMOKE (CVPRW'2020)</a></li>
+          <li><a href="configs/fcos3d">FCOS3D (ICCVW'2021)</a></li>
+          <li><a href="configs/pgd">PGD (CoRL'2021)</a></li>
+          <li><a href="configs/monoflex">MonoFlex (CVPR'2021)</a></li>
+          <li><a href="projects/DETR3D">DETR3D (CoRL'2021)</a></li>
+          <li><a href="projects/PETR">PETR (ECCV'2022)</a></li>
+        </ul>
+        <li><b>Indoor</b></li>
+        <ul>
+          <li><a href="configs/imvoxelnet">ImVoxelNet (WACV'2022)</a></li>
+        </ul>
+      </td>
+      <td>
+        <li><b>Outdoor</b></li>
+        <ul>
+          <li><a href="configs/mvxnet">MVXNet (ICRA'2019)</a></li>
+          <li><a href="projects/BEVFusion">BEVFusion (ICRA'2023)</a></li>
+        </ul>
+        <li><b>Indoor</b></li>
+        <ul>
+          <li><a href="configs/imvotenet">ImVoteNet (CVPR'2020)</a></li>
+        </ul>
+      </td>
+      <td>
+        <li><b>Outdoor</b></li>
+        <ul>
+          <li><a href="configs/minkunet">MinkUNet (CVPR'2019)</a></li>
+          <li><a href="configs/spvcnn">SPVCNN (ECCV'2020)</a></li>
+          <li><a href="configs/cylinder3d">Cylinder3D (CVPR'2021)</a></li>
+          <li><a href="projects/TPVFormer">TPVFormer (CVPR'2023)</a></li>
+        </ul>
+        <li><b>Indoor</b></li>
+        <ul>
+          <li><a href="configs/pointnet2">PointNet++ (NeurIPS'2017)</a></li>
+          <li><a href="configs/paconv">PAConv (CVPR'2021)</a></li>
+          <li><a href="configs/dgcnn">DGCNN (TOG'2019)</a></li>
+        </ul>
+      </ul>
+      </td>
+    </tr>
+</td>
+    </tr>
+  </tbody>
+</table>
+
+|               | ResNet | VoVNet | Swin-T | PointNet++ | SECOND | DGCNN | RegNetX | DLA | MinkResNet | Cylinder3D | MinkUNet |
+| :-----------: | :----: | :----: | :----: | :--------: | :----: | :---: | :-----: | :-: | :--------: | :--------: | :------: |
+|    SECOND     |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+| PointPillars  |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✓    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  FreeAnchor   |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✓    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    VoteNet    |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    H3DNet     |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     3DSSD     |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    Part-A2    |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    MVXNet     |   ✓    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  CenterPoint  |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|      SSN      |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✓    |  ✗  |     ✗      |     ✗      |    ✗     |
+|   ImVoteNet   |   ✓    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    FCOS3D     |   ✓    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  PointNet++   |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+| Group-Free-3D |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  ImVoxelNet   |   ✓    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    PAConv     |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     DGCNN     |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✓   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     SMOKE     |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✓  |     ✗      |     ✗      |    ✗     |
+|      PGD      |   ✓    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|   MonoFlex    |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✓  |     ✗      |     ✗      |    ✗     |
+|    SA-SSD     |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    FCAF3D     |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✓      |     ✗      |    ✗     |
+|    PV-RCNN    |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  Cylinder3D   |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✓      |    ✗     |
+|   MinkUNet    |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✓     |
+|    SPVCNN     |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✓     |
+|   BEVFusion   |   ✗    |   ✗    |   ✓    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+| CenterFormer  |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     TR3D      |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✓      |     ✗      |    ✗     |
+|    DETR3D     |   ✓    |   ✓    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     PETR      |   ✗    |   ✓    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|   TPVFormer   |   ✓    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+
+**Note:** All the about **500+ models, methods of 90+ papers** in 2D detection supported by [MMDetection](https://github.com/open-mmlab/mmdetection/blob/3.x/docs/en/model_zoo.md) can be trained or used in this codebase.
+
+## FAQ
+
+Please refer to [FAQ](docs/en/notes/faq.md) for frequently asked questions.
+
+## Contributing
+
+We appreciate all contributions to improve MMDetection3D. Please refer to [CONTRIBUTING.md](docs/en/notes/contribution_guides.md) for the contributing guideline.
+
+## Acknowledgement
+
+MMDetection3D is an open source project that is contributed by researchers and engineers from various colleges and companies. We appreciate all the contributors as well as users who give valuable feedbacks. We wish that the toolbox and benchmark could serve the growing research community by providing a flexible toolkit to reimplement existing methods and develop their own new 3D detectors.
+
+## Citation
+
+If you find this project useful in your research, please consider cite:
+
+```latex
+@misc{mmdet3d2020,
+    title={{MMDetection3D: OpenMMLab} next-generation platform for general {3D} object detection},
+    author={MMDetection3D Contributors},
+    howpublished = {\url{https://github.com/open-mmlab/mmdetection3d}},
+    year={2020}
+}
+```
+
+## License
+
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Projects in OpenMMLab
+
+- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.
+- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
+- [MMEval](https://github.com/open-mmlab/mmeval): A unified evaluation library for multiple machine learning libraries.
+- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
+- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark.
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
+- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox.
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
+- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.
diff --git a/mmde/README_zh-CN.md b/mmde/README_zh-CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..330373d019f3b06cb18b2867128082a387cc077f
--- /dev/null
+++ b/mmde/README_zh-CN.md
@@ -0,0 +1,427 @@
+<div align="center">
+  <img src="resources/mmdet3d-logo.png" width="600"/>
+  <div>&nbsp;</div>
+  <div align="center">
+    <b><font size="5">OpenMMLab 官网</font></b>
+    <sup>
+      <a href="https://openmmlab.com">
+        <i><font size="4">HOT</font></i>
+      </a>
+    </sup>
+    &nbsp;&nbsp;&nbsp;&nbsp;
+    <b><font size="5">OpenMMLab 开放平台</font></b>
+    <sup>
+      <a href="https://platform.openmmlab.com">
+        <i><font size="4">TRY IT OUT</font></i>
+      </a>
+    </sup>
+  </div>
+  <div>&nbsp;</div>
+
+[![PyPI](https://img.shields.io/pypi/v/mmdet3d)](https://pypi.org/project/mmdet3d)
+[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmdetection3d.readthedocs.io/zh_CN/latest/)
+[![badge](https://github.com/open-mmlab/mmdetection3d/workflows/build/badge.svg)](https://github.com/open-mmlab/mmdetection3d/actions)
+[![codecov](https://codecov.io/gh/open-mmlab/mmdetection3d/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmdetection3d)
+[![license](https://img.shields.io/github/license/open-mmlab/mmdetection3d.svg)](https://github.com/open-mmlab/mmdetection3d/blob/main/LICENSE)
+[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmdetection3d.svg)](https://github.com/open-mmlab/mmdetection3d/issues)
+[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmdetection3d.svg)](https://github.com/open-mmlab/mmdetection3d/issues)
+
+[📘使用文档](https://mmdetection3d.readthedocs.io/zh_CN/latest/) |
+[🛠️安装教程](https://mmdetection3d.readthedocs.io/zh_CN/latest/get_started.html) |
+[👀模型库](https://mmdetection3d.readthedocs.io/zh_CN/latest/model_zoo.html) |
+[🆕更新日志](https://mmdetection3d.readthedocs.io/en/latest/notes/changelog.html) |
+[🚀进行中的项目](https://github.com/open-mmlab/mmdetection3d/projects) |
+[🤔报告问题](https://github.com/open-mmlab/mmdetection3d/issues/new/choose)
+
+</div>
+
+<div align="center">
+
+[English](README.md) | 简体中文
+
+</div>
+
+<div align="center">
+  <a href="https://openmmlab.medium.com/" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/219255827-67c1a27f-f8c5-46a9-811d-5e57448c61d1.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://discord.com/channels/1037617289144569886/1046608014234370059" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/218347213-c080267f-cbb6-443e-8532-8e1ed9a58ea9.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://twitter.com/OpenMMLab" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/218346637-d30c8a0f-3eba-4699-8131-512fb06d46db.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://www.youtube.com/openmmlab" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/218346691-ceb2116a-465a-40af-8424-9f30d2348ca9.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://space.bilibili.com/1293512903" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/219026751-d7d14cce-a7c9-4e82-9942-8375fca65b99.png" width="3%" alt="" /></a>
+  <img src="https://user-images.githubusercontent.com/25839884/218346358-56cc8e2f-a2b8-487f-9088-32480cceabcf.png" width="3%" alt="" />
+  <a href="https://www.zhihu.com/people/openmmlab" style="text-decoration:none;">
+    <img src="https://user-images.githubusercontent.com/25839884/219026120-ba71e48b-6e94-4bd4-b4e9-b7d175b5e362.png" width="3%" alt="" /></a>
+</div>
+
+## 简介
+
+MMDetection3D 是一个基于 PyTorch 的目标检测开源工具箱，下一代面向 3D 检测的平台。它是 [OpenMMlab](https://openmmlab.com/) 项目的一部分。
+
+主分支代码目前支持 PyTorch 1.8 以上的版本。
+
+![demo image](resources/mmdet3d_outdoor_demo.gif)
+
+<details open>
+<summary>主要特性</summary>
+
+- **支持多模态/单模态的检测器**
+
+  支持多模态/单模态检测器，包括 MVXNet，VoteNet，PointPillars 等。
+
+- **支持户内/户外的数据集**
+
+  支持室内/室外的 3D 检测数据集，包括 ScanNet，SUNRGB-D，Waymo，nuScenes，Lyft，KITTI。对于 nuScenes 数据集，我们也支持 [nuImages 数据集](https://github.com/open-mmlab/mmdetection3d/tree/main/configs/nuimages)。
+
+- **与 2D 检测器的自然整合**
+
+  [MMDetection](https://github.com/open-mmlab/mmdetection/blob/3.x/docs/zh_cn/model_zoo.md) 支持的 **300+ 个模型，40+ 的论文算法**，和相关模块都可以在此代码库中训练或使用。
+
+- **性能高**
+
+  训练速度比其他代码库更快。下表可见主要的对比结果。更多的细节可见[基准测评文档](./docs/zh_cn/notes/benchmarks.md)。我们对比了每秒训练的样本数（值越高越好）。其他代码库不支持的模型被标记为 `✗`。
+
+  |       Methods       | MMDetection3D | [OpenPCDet](https://github.com/open-mmlab/OpenPCDet) | [votenet](https://github.com/facebookresearch/votenet) | [Det3D](https://github.com/poodarchu/Det3D) |
+  | :-----------------: | :-----------: | :--------------------------------------------------: | :----------------------------------------------------: | :-----------------------------------------: |
+  |       VoteNet       |      358      |                          ✗                           |                           77                           |                      ✗                      |
+  |  PointPillars-car   |      141      |                          ✗                           |                           ✗                            |                     140                     |
+  | PointPillars-3class |      107      |                          44                          |                           ✗                            |                      ✗                      |
+  |       SECOND        |      40       |                          30                          |                           ✗                            |                      ✗                      |
+  |       Part-A2       |      17       |                          14                          |                           ✗                            |                      ✗                      |
+
+</details>
+
+和 [MMDetection](https://github.com/open-mmlab/mmdetection)，[MMCV](https://github.com/open-mmlab/mmcv) 一样，MMDetection3D 也可以作为一个库去支持各式各样的项目。
+
+## 最新进展
+
+### 亮点
+
+在1.4版本中，MMDetecion3D 重构了 Waymo 数据集, 加速了 Waymo 数据集的预处理、训练/测试启动、验证的速度。并且在 Waymo 上拓展了对 单目/BEV 等基于相机的三维目标检测模型的支持。在[这里](https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html)提供了对 Waymo 数据信息的详细解读。
+
+此外，在1.4版本中，MMDetection3D 提供了 [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) 来帮助社区用户上手 Waymo 并用于快速迭代开发。
+
+**v1.4.0** 版本已经在 2024.1.8 发布：
+
+- 在 `projects` 中支持了 [DSVT](<(https://arxiv.org/abs/2301.06051)>) 的训练
+- 在 `projects` 中支持了 [Nerf-Det](https://arxiv.org/abs/2307.14620)
+- 重构了 Waymo 数据集
+
+**v1.3.0** 版本已经在 2023.10.18 发布：
+
+- 在 `projects` 中支持 [CENet](https://arxiv.org/abs/2207.12691)
+- 使用新的 3D inferencers 增强演示代码效果
+
+**v1.2.0** 版本已经在 2023.7.4 发布：
+
+- 在 `mmdet3d/configs`中支持 [新Config样式](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta)
+- 在 `projects` 中支持 [DSVT](<(https://arxiv.org/abs/2301.06051)>) 的推理
+- 支持通过 `mim` 从 [OpenDataLab](https://opendatalab.com/) 下载数据集
+
+**v1.1.1** 版本已经在 2023.5.30 发布：
+
+- 在 `projects` 中支持 [TPVFormer](https://arxiv.org/pdf/2302.07817.pdf)
+- 在 `projects` 中支持 BEVFusion 的训练
+- 支持基于激光雷达的 3D 语义分割基准
+
+## 安装
+
+请参考[快速入门文档](https://mmdetection3d.readthedocs.io/zh_CN/latest/get_started.html)进行安装。
+
+## 教程
+
+<details>
+<summary>用户指南</summary>
+
+- [训练 & 测试](https://mmdetection3d.readthedocs.io/zh_CN/latest/user_guides/index.html#train-test)
+  - [学习配置文件](https://mmdetection3d.readthedocs.io/zh_CN/latest/user_guides/config.html)
+  - [坐标系](https://mmdetection3d.readthedocs.io/zh_CN/latest/user_guides/coord_sys_tutorial.html)
+  - [数据预处理](https://mmdetection3d.readthedocs.io/zh_CN/latest/user_guides/dataset_prepare.html)
+  - [自定义数据预处理流程](https://mmdetection3d.readthedocs.io/zh_CN/latest/user_guides/data_pipeline.html)
+  - [在标注数据集上测试和训练](https://mmdetection3d.readthedocs.io/zh_CN/latest/user_guides/train_test.html)
+  - [推理](https://mmdetection3d.readthedocs.io/zh_CN/latest/user_guides/inference.html)
+  - [在自定义数据集上进行训练](https://mmdetection3d.readthedocs.io/zh_CN/latest/user_guides/new_data_model.html)
+- [实用工具](https://mmdetection3d.readthedocs.io/zh_CN/latest/user_guides/index.html#useful-tools)
+
+</details>
+
+<details>
+<summary>进阶教程</summary>
+
+- [数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/index.html#datasets)
+  - [KITTI 数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/datasets/kitti.html)
+  - [NuScenes 数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/datasets/nuscenes.html)
+  - [Lyft 数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/datasets/lyft.html)
+  - [Waymo 数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/datasets/waymo.html)
+  - [SUN RGB-D 数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/datasets/sunrgbd.html)
+  - [ScanNet 数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/datasets/scannet.html)
+  - [S3DIS 数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/datasets/s3dis.html)
+  - [SemanticKITTI 数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/datasets/semantickitti.html)
+- [支持的任务](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/index.html#supported-tasks)
+  - [基于激光雷达的 3D 检测](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/supported_tasks/lidar_det3d.html)
+  - [基于视觉的 3D 检测](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/supported_tasks/vision_det3d.html)
+  - [基于激光雷达的 3D 语义分割](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/supported_tasks/lidar_sem_seg3d.html)
+- [自定义项目](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/index.html#customization)
+  - [自定义数据集](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/customize_dataset.html)
+  - [自定义模型](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/customize_models.html)
+  - [自定义运行时配置](https://mmdetection3d.readthedocs.io/zh_CN/latest/advanced_guides/customize_runtime.html)
+
+</details>
+
+## 基准测试和模型库
+
+测试结果和模型可以在[模型库](docs/zh_cn/model_zoo.md)中找到。
+
+<div align="center">
+  <b>模块组件</b>
+</div>
+<table align="center">
+  <tbody>
+    <tr align="center" valign="bottom">
+      <td>
+        <b>主干网络</b>
+      </td>
+      <td>
+        <b>检测头</b>
+      </td>
+      <td>
+        <b>特性</b>
+      </td>
+    </tr>
+    <tr valign="top">
+      <td>
+      <ul>
+        <li><a href="configs/pointnet2">PointNet (CVPR'2017)</a></li>
+        <li><a href="configs/pointnet2">PointNet++ (NeurIPS'2017)</a></li>
+        <li><a href="configs/regnet">RegNet (CVPR'2020)</a></li>
+        <li><a href="configs/dgcnn">DGCNN (TOG'2019)</a></li>
+        <li>DLA (CVPR'2018)</li>
+        <li>MinkResNet (CVPR'2019)</li>
+        <li><a href="configs/minkunet">MinkUNet (CVPR'2019)</a></li>
+        <li><a href="configs/cylinder3d">Cylinder3D (CVPR'2021)</a></li>
+      </ul>
+      </td>
+      <td>
+      <ul>
+        <li><a href="configs/free_anchor">FreeAnchor (NeurIPS'2019)</a></li>
+      </ul>
+      </td>
+      <td>
+      <ul>
+        <li><a href="configs/dynamic_voxelization">Dynamic Voxelization (CoRL'2019)</a></li>
+      </ul>
+      </td>
+    </tr>
+</td>
+    </tr>
+  </tbody>
+</table>
+
+<div align="center">
+  <b>算法模型</b>
+</div>
+<table align="center">
+  <tbody>
+    <tr align="center" valign="middle">
+      <td>
+        <b>激光雷达 3D 目标检测</b>
+      </td>
+      <td>
+        <b>相机 3D 目标检测</b>
+      </td>
+      <td>
+        <b>多模态 3D 目标检测</b>
+      </td>
+      <td>
+        <b>3D 语义分割</b>
+      </td>
+    </tr>
+    <tr valign="top">
+      <td>
+        <li><b>室外</b></li>
+        <ul>
+            <li><a href="configs/second">SECOND (Sensor'2018)</a></li>
+            <li><a href="configs/pointpillars">PointPillars (CVPR'2019)</a></li>
+            <li><a href="configs/ssn">SSN (ECCV'2020)</a></li>
+            <li><a href="configs/3dssd">3DSSD (CVPR'2020)</a></li>
+            <li><a href="configs/sassd">SA-SSD (CVPR'2020)</a></li>
+            <li><a href="configs/point_rcnn">PointRCNN (CVPR'2019)</a></li>
+            <li><a href="configs/parta2">Part-A2 (TPAMI'2020)</a></li>
+            <li><a href="configs/centerpoint">CenterPoint (CVPR'2021)</a></li>
+            <li><a href="configs/pv_rcnn">PV-RCNN (CVPR'2020)</a></li>
+            <li><a href="projects/CenterFormer">CenterFormer (ECCV'2022)</a></li>
+        </ul>
+        <li><b>室内</b></li>
+        <ul>
+            <li><a href="configs/votenet">VoteNet (ICCV'2019)</a></li>
+            <li><a href="configs/h3dnet">H3DNet (ECCV'2020)</a></li>
+            <li><a href="configs/groupfree3d">Group-Free-3D (ICCV'2021)</a></li>
+            <li><a href="configs/fcaf3d">FCAF3D (ECCV'2022)</a></li>
+            <li><a href="projects/TR3D">TR3D (ArXiv'2023)</a></li>
+      </ul>
+      </td>
+      <td>
+        <li><b>室外</b></li>
+        <ul>
+          <li><a href="configs/imvoxelnet">ImVoxelNet (WACV'2022)</a></li>
+          <li><a href="configs/smoke">SMOKE (CVPRW'2020)</a></li>
+          <li><a href="configs/fcos3d">FCOS3D (ICCVW'2021)</a></li>
+          <li><a href="configs/pgd">PGD (CoRL'2021)</a></li>
+          <li><a href="configs/monoflex">MonoFlex (CVPR'2021)</a></li>
+          <li><a href="projects/DETR3D">DETR3D (CoRL'2021)</a></li>
+          <li><a href="projects/PETR">PETR (ECCV'2022)</a></li>
+        </ul>
+        <li><b>Indoor</b></li>
+        <ul>
+          <li><a href="configs/imvoxelnet">ImVoxelNet (WACV'2022)</a></li>
+        </ul>
+      </td>
+      <td>
+        <li><b>室外</b></li>
+        <ul>
+          <li><a href="configs/mvxnet">MVXNet (ICRA'2019)</a></li>
+          <li><a href="projects/BEVFusion">BEVFusion (ICRA'2023)</a></li>
+        </ul>
+        <li><b>室内</b></li>
+        <ul>
+          <li><a href="configs/imvotenet">ImVoteNet (CVPR'2020)</a></li>
+        </ul>
+      </td>
+      <td>
+        <li><b>室外</b></li>
+        <ul>
+          <li><a href="configs/minkunet">MinkUNet (CVPR'2019)</a></li>
+          <li><a href="configs/spvcnn">SPVCNN (ECCV'2020)</a></li>
+          <li><a href="configs/cylinder3d">Cylinder3D (CVPR'2021)</a></li>
+          <li><a href="projects/TPVFormer">TPVFormer (CVPR'2023)</a></li>
+        </ul>
+        <li><b>室内</b></li>
+        <ul>
+          <li><a href="configs/pointnet2">PointNet++ (NeurIPS'2017)</a></li>
+          <li><a href="configs/paconv">PAConv (CVPR'2021)</a></li>
+          <li><a href="configs/dgcnn">DGCNN (TOG'2019)</a></li>
+        </ul>
+      </ul>
+      </td>
+    </tr>
+</td>
+    </tr>
+  </tbody>
+</table>
+
+|               | ResNet | VoVNet | Swin-T | PointNet++ | SECOND | DGCNN | RegNetX | DLA | MinkResNet | Cylinder3D | MinkUNet |
+| :-----------: | :----: | :----: | :----: | :--------: | :----: | :---: | :-----: | :-: | :--------: | :--------: | :------: |
+|    SECOND     |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+| PointPillars  |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✓    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  FreeAnchor   |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✓    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    VoteNet    |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    H3DNet     |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     3DSSD     |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    Part-A2    |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    MVXNet     |   ✓    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  CenterPoint  |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|      SSN      |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✓    |  ✗  |     ✗      |     ✗      |    ✗     |
+|   ImVoteNet   |   ✓    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    FCOS3D     |   ✓    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  PointNet++   |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+| Group-Free-3D |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  ImVoxelNet   |   ✓    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    PAConv     |   ✗    |   ✗    |   ✗    |     ✓      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     DGCNN     |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✓   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     SMOKE     |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✓  |     ✗      |     ✗      |    ✗     |
+|      PGD      |   ✓    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|   MonoFlex    |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✓  |     ✗      |     ✗      |    ✗     |
+|    SA-SSD     |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|    FCAF3D     |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✓      |     ✗      |    ✗     |
+|    PV-RCNN    |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|  Cylinder3D   |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✓      |    ✗     |
+|   MinkUNet    |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✓     |
+|    SPVCNN     |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✓     |
+|   BEVFusion   |   ✗    |   ✗    |   ✓    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+| CenterFormer  |   ✗    |   ✗    |   ✗    |     ✗      |   ✓    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     TR3D      |   ✗    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✓      |     ✗      |    ✗     |
+|    DETR3D     |   ✓    |   ✓    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|     PETR      |   ✗    |   ✓    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+|   TPVFormer   |   ✓    |   ✗    |   ✗    |     ✗      |   ✗    |   ✗   |    ✗    |  ✗  |     ✗      |     ✗      |    ✗     |
+
+**注意：**[MMDetection](https://github.com/open-mmlab/mmdetection/blob/3.x/docs/zh_cn/model_zoo.md) 支持的基于 2D 检测的 **300+ 个模型，40+ 的论文算法**在 MMDetection3D 中都可以被训练或使用。
+
+## 常见问题
+
+请参考 [FAQ](docs/zh_cn/notes/faq.md) 了解其他用户的常见问题。
+
+## 贡献指南
+
+我们感谢所有的贡献者为改进和提升 MMDetection3D 所作出的努力。请参考[贡献指南](docs/en/notes/contribution_guides.md)来了解参与项目贡献的相关指引。
+
+## 致谢
+
+MMDetection3D 是一款由来自不同高校和企业的研发人员共同参与贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者，以及提供宝贵反馈的用户。我们希望这个工具箱和基准测试可以为社区提供灵活的代码工具，供用户复现已有算法并开发自己的新的 3D 检测模型。
+
+## 引用
+
+如果你觉得本项目对你的研究工作有所帮助，请参考如下 bibtex 引用 MMdetection3D：
+
+```latex
+@misc{mmdet3d2020,
+    title={{MMDetection3D: OpenMMLab} next-generation platform for general {3D} object detection},
+    author={MMDetection3D Contributors},
+    howpublished = {\url{https://github.com/open-mmlab/mmdetection3d}},
+    year={2020}
+}
+```
+
+## 开源许可证
+
+该项目采用 [Apache 2.0 开源许可证](LICENSE)。
+
+## OpenMMLab 的其他项目
+
+- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库
+- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库
+- [MMEval](https://github.com/open-mmlab/mmeval): 统一开放的跨框架算法评测库
+- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口
+- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab 深度学习预训练工具箱
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准
+- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab 新一代人工智能内容生成（AIGC）工具箱
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱
+- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架
+
+## 欢迎加入 OpenMMLab 社区
+
+扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab)，扫描下方微信二维码添加喵喵好友，进入 MMDetection3D 微信交流社群。【加好友申请格式：研究方向+地区+学校/公司+姓名】
+
+<div align="center">
+<img src="https://user-images.githubusercontent.com/58739961/187154320-f3312cdf-31f2-4316-9dbb-8d7b0e1b7e08.jpg" height="400" />  <img src="https://github.com/open-mmlab/mmdetection3d/assets/62195058/dfb3f6a9-25c6-47a5-936b-3f1d7347a42b" height="400" />
+</div>
+
+我们会在 OpenMMLab 社区为大家
+
+- 📢 分享 AI 框架的前沿核心技术
+- 💻 解读 PyTorch 常用模块源码
+- 📰 发布 OpenMMLab 的相关新闻
+- 🚀 介绍 OpenMMLab 开发的前沿算法
+- 🏃 获取更高效的问题答疑和意见反馈
+- 🔥 提供与各行各业开发者充分交流的平台
+
+干货满满 📘，等你来撩 💗，OpenMMLab 社区期待您的加入 👬
diff --git a/mmde/bench_bs1.py b/mmde/bench_bs1.py
new file mode 100644
index 0000000000000000000000000000000000000000..31b38209e51e37aa33fa52680505718f619312fd
--- /dev/null
+++ b/mmde/bench_bs1.py
@@ -0,0 +1,5 @@
+_base_ = ['projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+test_dataloader = dict(
+    batch_size=1,
+    dataset=dict(ann_file='nuscenes_infos_mini_val.pkl')
+)
diff --git a/mmde/bench_bs4.py b/mmde/bench_bs4.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce8d8f2764440706bd565755fb97aa0de3e7ba5d
--- /dev/null
+++ b/mmde/bench_bs4.py
@@ -0,0 +1,5 @@
+_base_ = ['projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+test_dataloader = dict(
+    batch_size=4,
+    dataset=dict(ann_file='nuscenes_infos_mini_val.pkl')
+)
diff --git a/mmde/configs/3dssd/3dssd_4xb4_kitti-3d-car.py b/mmde/configs/3dssd/3dssd_4xb4_kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fbdfeccfe931349dff1db5dc12517caef0d3ed8
--- /dev/null
+++ b/mmde/configs/3dssd/3dssd_4xb4_kitti-3d-car.py
@@ -0,0 +1,119 @@
+_base_ = [
+    '../_base_/models/3dssd.py', '../_base_/datasets/kitti-3d-car.py',
+    '../_base_/default_runtime.py'
+]
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+point_cloud_range = [0, -40, -5, 70, 40, 3]
+input_modality = dict(use_lidar=True, use_camera=False)
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-1.0471975511965976, 1.0471975511965976]),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.9, 1.1]),
+    # 3DSSD can get a higher performance without this transform
+    # dict(type='BackgroundPointsFilter', bbox_enlarge_range=(0.5, 2.0, 0.5)),
+    dict(type='PointSample', num_points=16384),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(type='PointSample', num_points=16384),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=4, dataset=dict(dataset=dict(pipeline=train_pipeline, )))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# model settings
+model = dict(
+    bbox_head=dict(
+        num_classes=1,
+        bbox_coder=dict(
+            type='AnchorFreeBBoxCoder', num_dir_bins=12, with_rot=True)))
+
+# optimizer
+lr = 0.002  # max learning rate
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.),
+    clip_grad=dict(max_norm=35, norm_type=2),
+)
+
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=2)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=80,
+        by_epoch=True,
+        milestones=[45, 60],
+        gamma=0.1)
+]
diff --git a/mmde/configs/3dssd/README.md b/mmde/configs/3dssd/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d39469627f4956ebcc2e588e66e27d5f1e0272a6
--- /dev/null
+++ b/mmde/configs/3dssd/README.md
@@ -0,0 +1,45 @@
+# 3DSSD: Point-based 3D Single Stage Object Detector
+
+> [3DSSD: Point-based 3D Single Stage Object Detector](https://arxiv.org/abs/2002.10187)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Currently, there have been many kinds of voxel-based 3D single stage detectors, while point-based single stage methods are still underexplored. In this paper, we first present a lightweight and effective point-based 3D single stage object detector, named 3DSSD, achieving a good balance between accuracy and efficiency. In this paradigm, all upsampling layers and refinement stage, which are indispensable in all existing point-based methods, are abandoned to reduce the large computation cost. We novelly propose a fusion sampling strategy in downsampling process to make detection on less representative points feasible. A delicate box prediction network including a candidate generation layer, an anchor-free regression head with a 3D center-ness assignment strategy is designed to meet with our demand of accuracy and speed. Our paradigm is an elegant single stage anchor-free framework, showing great superiority to other existing methods. We evaluate 3DSSD on widely used KITTI dataset and more challenging nuScenes dataset. Our method outperforms all state-of-the-art voxel-based single stage methods by a large margin, and has comparable performance to two stage point-based methods as well, with inference speed more than 25 FPS, 2x faster than former state-of-the-art point-based methods.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/30491025/143854187-54ed1257-a046-4764-81cd-d2c8404137d3.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement 3DSSD and provide the results and checkpoints on KITTI datasets.
+
+Some settings in our implementation are different from the [official implementation](https://github.com/Jia-Research-Lab/3DSSD), which bring marginal differences to the performance on KITTI datasets in our experiments. To simplify and unify the models of our implementation, we skip them in our models. These differences are listed as below:
+
+1. We keep the scenes without any object while the official code skips these scenes in training. In the official implementation, only 3229 and 3394 samples are used as training and validation sets, respectively. In our implementation, we keep using 3712 and 3769 samples as training and validation sets, respectively, as those used for all the other models in our implementation on KITTI datasets.
+2. We do not modify the decay of `batch normalization` during training.
+3. While using [`DataBaseSampler`](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/pipelines/dbsampler.py#L80) for data augmentation, the official code uses road planes as reference to place the sampled objects while we do not.
+4. We perform detection using LIDAR coordinates while the official code uses camera coordinates.
+
+## Results and models
+
+### KITTI
+
+|                    Backbone                    | Class | Lr schd | Mem (GB) | Inf time (fps) |           mAP            |                                                                                                                                                Download                                                                                                                                                |
+| :--------------------------------------------: | :---: | :-----: | :------: | :------------: | :----------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [PointNet2SAMSG](./3dssd_4xb4_kitti-3d-car.py) |  Car  |   72e   |   4.7    |                | 78.58(81.27)<sup>1</sup> | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/3dssd/3dssd_4x4_kitti-3d-car/3dssd_4x4_kitti-3d-car_20210818_203828-b89c8fc4.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/3dssd/3dssd_4x4_kitti-3d-car/3dssd_4x4_kitti-3d-car_20210818_203828.log.json) |
+
+\[1\]: We report two different 3D object detection performance here. 78.58mAP is evaluated by our evaluation code and 81.27mAP is evaluated by the official development kit （so as that used in the paper and official code of 3DSSD ）. We found that the commonly used Python implementation of [`rotate_iou`](https://github.com/traveller59/second.pytorch/blob/e42e4a0e17262ab7d180ee96a0a36427f2c20a44/second/core/non_max_suppression/nms_gpu.py#L605) which is used in our KITTI dataset evaluation, is different from the official implementation in [KITTI benchmark](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d).
+
+## Citation
+
+```latex
+@inproceedings{yang20203dssd,
+    author = {Zetong Yang and Yanan Sun and Shu Liu and Jiaya Jia},
+    title = {3DSSD: Point-based 3D Single Stage Object Detector},
+    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+    year = {2020}
+}
+```
diff --git a/mmde/configs/3dssd/metafile.yml b/mmde/configs/3dssd/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..bd2d146c7bd6111a41a364f9f699e27ceba85f7a
--- /dev/null
+++ b/mmde/configs/3dssd/metafile.yml
@@ -0,0 +1,29 @@
+Collections:
+  - Name: 3DSSD
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - AdamW
+      Training Resources: 4x TITAN X
+      Architecture:
+        - PointNet++
+    Paper:
+      URL: https://arxiv.org/abs/2002.10187
+      Title: '3DSSD: Point-based 3D Single Stage Object Detector'
+    README: configs/3dssd/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/ssd3dnet.py#L7
+      Version: v0.6.0
+
+Models:
+  - Name: 3dssd_4x4_kitti-3d-car
+    In Collection: 3DSSD
+    Config: configs/3dssd/3dssd_4xb4_kitti-3d-car.py
+    Metadata:
+      Training Memory (GB): 4.7
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 78.58
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/3dssd/3dssd_4x4_kitti-3d-car/3dssd_4x4_kitti-3d-car_20210818_203828-b89c8fc4.pth
diff --git a/mmde/configs/_base_/datasets/kitti-3d-3class.py b/mmde/configs/_base_/datasets/kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c405090577bfd670bc9b55a77a5ea39af6ecca1
--- /dev/null
+++ b/mmde/configs/_base_/datasets/kitti-3d-3class.py
@@ -0,0 +1,167 @@
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+metainfo = dict(classes=class_names)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/kitti/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+    classes=class_names,
+    sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,  # x, y, z, intensity
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(pts='training/velodyne_reduced'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/kitti-3d-car.py b/mmde/configs/_base_/datasets/kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..daea7207e01d33eddd328e2b1728209bf4b6fc16
--- /dev/null
+++ b/mmde/configs/_base_/datasets/kitti-3d-car.py
@@ -0,0 +1,165 @@
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+metainfo = dict(classes=class_names)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/kitti/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,  # x, y, z, intensity
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(pts='training/velodyne_reduced'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/kitti-mono3d.py b/mmde/configs/_base_/datasets/kitti-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5cd6117aaad6c2eba2dc8315da3c34b220c6201
--- /dev/null
+++ b/mmde/configs/_base_/datasets/kitti-mono3d.py
@@ -0,0 +1,100 @@
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+input_modality = dict(use_lidar=False, use_camera=True)
+metainfo = dict(classes=class_names)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/kitti/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='Resize', scale=(1242, 375), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(type='Resize', scale=(1242, 375), keep_ratio=True),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+eval_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='kitti_infos_train.pkl',
+        data_prefix=dict(img='training/image_2'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        load_type='fov_image_based',
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='Camera' in monocular 3d
+        # detection task
+        box_type_3d='Camera',
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(img='training/image_2'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        load_type='fov_image_based',
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Camera',
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/lyft-3d-range100.py b/mmde/configs/_base_/datasets/lyft-3d-range100.py
new file mode 100644
index 0000000000000000000000000000000000000000..58d63cdc495c278cc716b7c4eb65f01162350c89
--- /dev/null
+++ b/mmde/configs/_base_/datasets/lyft-3d-range100.py
@@ -0,0 +1,150 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-100, -100, -5, 100, 100, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+    'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+    'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+data_prefix = dict(pts='v1.01-train/lidar', img='', sweeps='v1.01-train/lidar')
+# Input modality for Lyft dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+    use_lidar=True,
+    use_camera=False,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/lyft/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='lyft_infos_train.pkl',
+        pipeline=train_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        data_prefix=data_prefix,
+        test_mode=False,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        test_mode=True,
+        data_prefix=data_prefix,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='LyftMetric',
+    data_root=data_root,
+    ann_file='lyft_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/lyft-3d.py b/mmde/configs/_base_/datasets/lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9e1c4c7eccc7d22b70e6d09abde8d1ec1d98e78
--- /dev/null
+++ b/mmde/configs/_base_/datasets/lyft-3d.py
@@ -0,0 +1,160 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-80, -80, -5, 80, 80, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+    'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+    'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+# Input modality for Lyft dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(use_lidar=True, use_camera=False)
+data_prefix = dict(pts='v1.01-train/lidar', img='', sweeps='v1.01-train/lidar')
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/lyft/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='lyft_infos_train.pkl',
+        pipeline=train_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        data_prefix=data_prefix,
+        test_mode=False,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        data_prefix=data_prefix,
+        test_mode=True,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        test_mode=True,
+        data_prefix=data_prefix,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type='LyftMetric',
+    data_root=data_root,
+    ann_file='lyft_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/nuim-instance.py b/mmde/configs/_base_/datasets/nuim-instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..913e50793bd73d1684b92e65a50dcb84d643441e
--- /dev/null
+++ b/mmde/configs/_base_/datasets/nuim-instance.py
@@ -0,0 +1,70 @@
+dataset_type = 'CocoDataset'
+data_root = 'data/nuimages/'
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/nuimages/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1280, 720), (1920, 1080)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PackDetInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1600, 900),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor')),
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/nuimages_v1.0-train.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/mmde/configs/_base_/datasets/nus-3d.py b/mmde/configs/_base_/datasets/nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..46fa854478826d03c0f657de28724c4e20a63c63
--- /dev/null
+++ b/mmde/configs/_base_/datasets/nus-3d.py
@@ -0,0 +1,169 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-50, -50.8, -5, 50, 49.2, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+metainfo = dict(classes=class_names)
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+# Input modality for nuScenes dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(use_lidar=True, use_camera=False)
+data_prefix = dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP')
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/nuscenes/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        test_mode=True,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        test_mode=True,
+        backend_args=backend_args),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        test_mode=False,
+        data_prefix=data_prefix,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='nuscenes_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        data_prefix=data_prefix,
+        test_mode=True,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='nuscenes_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        test_mode=True,
+        data_prefix=data_prefix,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type='NuScenesMetric',
+    data_root=data_root,
+    ann_file=data_root + 'nuscenes_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/nus-mono3d.py b/mmde/configs/_base_/datasets/nus-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a1d22650718ed287216c69a1156c31fcfb9897b
--- /dev/null
+++ b/mmde/configs/_base_/datasets/nus-mono3d.py
@@ -0,0 +1,119 @@
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+metainfo = dict(classes=class_names)
+# Input modality for nuScenes dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(use_lidar=False, use_camera=True)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/nuscenes/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='Resize', scale=(1600, 900), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels',
+            'gt_bboxes_3d', 'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='',
+            CAM_FRONT='samples/CAM_FRONT',
+            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+            CAM_BACK='samples/CAM_BACK',
+            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
+        ann_file='nuscenes_infos_train.pkl',
+        load_type='mv_image_based',
+        pipeline=train_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        test_mode=False,
+        # we use box_type_3d='Camera' in monocular 3d
+        # detection task
+        box_type_3d='Camera',
+        use_valid_flag=True,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='',
+            CAM_FRONT='samples/CAM_FRONT',
+            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+            CAM_BACK='samples/CAM_BACK',
+            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
+        ann_file='nuscenes_infos_val.pkl',
+        load_type='mv_image_based',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Camera',
+        use_valid_flag=True,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='NuScenesMetric',
+    data_root=data_root,
+    ann_file=data_root + 'nuscenes_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/s3dis-3d.py b/mmde/configs/_base_/datasets/s3dis-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..04280332e44fa194014a8baedbea6d3c113a2d33
--- /dev/null
+++ b/mmde/configs/_base_/datasets/s3dis-3d.py
@@ -0,0 +1,134 @@
+# dataset settings
+dataset_type = 'S3DISDataset'
+data_root = 'data/s3dis/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/s3dis/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+metainfo = dict(classes=('table', 'chair', 'sofa', 'bookcase', 'board'))
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='PointSample', num_points=100000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[0.9, 1.1],
+        translation_std=[.1, .1, .1],
+        shift_height=False),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=100000),
+            dict(type='NormalizePointsColor', color_mean=None),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=13,
+        dataset=dict(
+            type='ConcatDataset',
+            datasets=[
+                dict(
+                    type=dataset_type,
+                    data_root=data_root,
+                    ann_file=f's3dis_infos_Area_{i}.pkl',
+                    pipeline=train_pipeline,
+                    filter_empty_gt=True,
+                    metainfo=metainfo,
+                    box_type_3d='Depth',
+                    backend_args=backend_args) for i in train_area
+            ])))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=f's3dis_infos_Area_{test_area}.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=f's3dis_infos_Area_{test_area}.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/s3dis-seg.py b/mmde/configs/_base_/datasets/s3dis-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdebc94b4ae318656272c19429bc7cf11aa2a2b7
--- /dev/null
+++ b/mmde/configs/_base_/datasets/s3dis-seg.py
@@ -0,0 +1,169 @@
+# For S3DIS seg we usually do 13-class segmentation
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+metainfo = dict(classes=class_names)
+dataset_type = 'S3DISSegDataset'
+data_root = 'data/s3dis/'
+input_modality = dict(use_lidar=True, use_camera=False)
+data_prefix = dict(
+    pts='points',
+    pts_instance_mask='instance_mask',
+    pts_semantic_mask='semantic_mask')
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/s3dis/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+num_points = 4096
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.0,
+        ignore_index=len(class_names),
+        use_normalized_coord=True,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+tta_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[[
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.,
+                flip_ratio_bev_vertical=0.)
+        ], [dict(type='Pack3DDetInputs', keys=['points'])]])
+]
+
+# train on area 1, 2, 3, 4, 6
+# test on area 5
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=train_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ],
+        test_mode=False,
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy',
+        test_mode=True,
+        backend_args=backend_args))
+val_dataloader = test_dataloader
+
+val_evaluator = dict(type='SegMetric')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+tta_model = dict(type='Seg3DTTAModel')
diff --git a/mmde/configs/_base_/datasets/scannet-3d.py b/mmde/configs/_base_/datasets/scannet-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..67a39eff38288156276f6a1c284d5fddd11c66d0
--- /dev/null
+++ b/mmde/configs/_base_/datasets/scannet-3d.py
@@ -0,0 +1,141 @@
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = 'data/scannet/'
+
+metainfo = dict(
+    classes=('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+             'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+             'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+             'garbagebin'))
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/scannet/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=40000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0],
+        shift_height=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=40000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/scannet-seg.py b/mmde/configs/_base_/datasets/scannet-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..759e5e2a38f850d69cbbfb1c9acb12a02fe7269e
--- /dev/null
+++ b/mmde/configs/_base_/datasets/scannet-seg.py
@@ -0,0 +1,164 @@
+# For ScanNet seg we usually do 20-class segmentation
+class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
+               'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
+               'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink',
+               'bathtub', 'otherfurniture')
+metainfo = dict(classes=class_names)
+dataset_type = 'ScanNetSegDataset'
+data_root = 'data/scannet/'
+input_modality = dict(use_lidar=True, use_camera=False)
+data_prefix = dict(
+    pts='points',
+    pts_instance_mask='instance_mask',
+    pts_semantic_mask='semantic_mask')
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/scannet/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+num_points = 8192
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.5,
+        ignore_index=len(class_names),
+        use_normalized_coord=False,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+tta_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[[
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.,
+                flip_ratio_bev_vertical=0.)
+        ], [dict(type='Pack3DDetInputs', keys=['points'])]])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_train.pkl',
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=train_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy',
+        test_mode=False,
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        test_mode=True,
+        backend_args=backend_args))
+val_dataloader = test_dataloader
+
+val_evaluator = dict(type='SegMetric')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+tta_model = dict(type='Seg3DTTAModel')
diff --git a/mmde/configs/_base_/datasets/semantickitti.py b/mmde/configs/_base_/datasets/semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae464d8b60d6b1e7c887f65fbb333d0b2db77668
--- /dev/null
+++ b/mmde/configs/_base_/datasets/semantickitti.py
@@ -0,0 +1,224 @@
+# For SemanticKitti we usually do 19-class segmentation.
+# For labels_map we follow the uniform format of MMDetection & MMSegmentation
+# i.e. we consider the unlabeled class as the last one, which is different
+# from the original implementation of some methods e.g. Cylinder3D.
+dataset_type = 'SemanticKittiDataset'
+data_root = 'data/semantickitti/'
+class_names = [
+    'car', 'bicycle', 'motorcycle', 'truck', 'bus', 'person', 'bicyclist',
+    'motorcyclist', 'road', 'parking', 'sidewalk', 'other-ground', 'building',
+    'fence', 'vegetation', 'trunck', 'terrian', 'pole', 'traffic-sign'
+]
+labels_map = {
+    0: 19,  # "unlabeled"
+    1: 19,  # "outlier" mapped to "unlabeled" --------------mapped
+    10: 0,  # "car"
+    11: 1,  # "bicycle"
+    13: 4,  # "bus" mapped to "other-vehicle" --------------mapped
+    15: 2,  # "motorcycle"
+    16: 4,  # "on-rails" mapped to "other-vehicle" ---------mapped
+    18: 3,  # "truck"
+    20: 4,  # "other-vehicle"
+    30: 5,  # "person"
+    31: 6,  # "bicyclist"
+    32: 7,  # "motorcyclist"
+    40: 8,  # "road"
+    44: 9,  # "parking"
+    48: 10,  # "sidewalk"
+    49: 11,  # "other-ground"
+    50: 12,  # "building"
+    51: 13,  # "fence"
+    52: 19,  # "other-structure" mapped to "unlabeled" ------mapped
+    60: 8,  # "lane-marking" to "road" ---------------------mapped
+    70: 14,  # "vegetation"
+    71: 15,  # "trunk"
+    72: 16,  # "terrain"
+    80: 17,  # "pole"
+    81: 18,  # "traffic-sign"
+    99: 19,  # "other-object" to "unlabeled" ----------------mapped
+    252: 0,  # "moving-car" to "car" ------------------------mapped
+    253: 6,  # "moving-bicyclist" to "bicyclist" ------------mapped
+    254: 5,  # "moving-person" to "person" ------------------mapped
+    255: 7,  # "moving-motorcyclist" to "motorcyclist" ------mapped
+    256: 4,  # "moving-on-rails" mapped to "other-vehic------mapped
+    257: 4,  # "moving-bus" mapped to "other-vehicle" -------mapped
+    258: 3,  # "moving-truck" to "truck" --------------------mapped
+    259: 4  # "moving-other"-vehicle to "other-vehicle"-----mapped
+}
+
+metainfo = dict(
+    classes=class_names, seg_label_mapping=labels_map, max_label=259)
+
+input_modality = dict(use_lidar=True, use_camera=False)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/semantickitti/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.1, 0.1, 0.1],
+    ),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+tta_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='TestTimeAug',
+        transforms=[[
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.,
+                flip_ratio_bev_vertical=0.),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.,
+                flip_ratio_bev_vertical=1.),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=1.,
+                flip_ratio_bev_vertical=0.),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=1.,
+                flip_ratio_bev_vertical=1.)
+        ],
+                    [
+                        dict(
+                            type='GlobalRotScaleTrans',
+                            rot_range=[pcd_rotate_range, pcd_rotate_range],
+                            scale_ratio_range=[
+                                pcd_scale_factor, pcd_scale_factor
+                            ],
+                            translation_std=[0, 0, 0])
+                        for pcd_rotate_range in [-0.78539816, 0.0, 0.78539816]
+                        for pcd_scale_factor in [0.95, 1.0, 1.05]
+                    ], [dict(type='Pack3DDetInputs', keys=['points'])]])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='semantickitti_infos_train.pkl',
+        pipeline=train_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        ignore_index=19,
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='semantickitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        ignore_index=19,
+        test_mode=True,
+        backend_args=backend_args))
+
+val_dataloader = test_dataloader
+
+val_evaluator = dict(type='SegMetric')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+tta_model = dict(type='Seg3DTTAModel')
diff --git a/mmde/configs/_base_/datasets/sunrgbd-3d.py b/mmde/configs/_base_/datasets/sunrgbd-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..28578348d27ddc031173c543f7be12aa1b541c12
--- /dev/null
+++ b/mmde/configs/_base_/datasets/sunrgbd-3d.py
@@ -0,0 +1,126 @@
+dataset_type = 'SUNRGBDDataset'
+data_root = 'data/sunrgbd/'
+class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+               'night_stand', 'bookshelf', 'bathtub')
+
+metainfo = dict(classes=class_names)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/sunrgbd/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+    ),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        shift_height=True),
+    dict(type='PointSample', num_points=20000),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+            ),
+            dict(type='PointSample', num_points=20000)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='sunrgbd_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='sunrgbd_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='sunrgbd_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py b/mmde/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0142b9c940cce19cddd3ff1b4e6738917c317f4
--- /dev/null
+++ b/mmde/configs/_base_/datasets/waymoD3-fov-mono3d-3class.py
@@ -0,0 +1,184 @@
+# dataset settings
+# D3 in the config name means the whole dataset is divided into 3 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(classes=class_names)
+input_modality = dict(use_lidar=False, use_camera=True)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    # base shape (1248, 832), scale (0.95, 1.05)
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        ratio_range=(0.95, 1.05),
+        # ratio_range=(1., 1.),
+        interpolation='nearest',
+        keep_ratio=True,
+    ),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        interpolation='nearest',
+        keep_ratio=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img'],
+        meta_keys=[
+            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
+            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
+        ]),
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        interpolation='nearest',
+        keep_ratio=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img'],
+        meta_keys=[
+            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
+            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
+        ]),
+]
+
+train_dataloader = dict(
+    batch_size=3,
+    num_workers=3,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        cam_sync_instances=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='fov_image_based',
+        # load one frame every three frames
+        load_interval=3,
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        cam_sync_instances=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='fov_image_based',
+        load_eval_anns=False,
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        cam_sync_instances=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='fov_image_based',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type='WaymoMetric',
+    waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin',
+    metric='LET_mAP',
+    load_type='fov_image_based',
+    result_prefix='./pgd_fov_pred')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py b/mmde/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..af245effa796aaca7f23c9d9150058ef30172b25
--- /dev/null
+++ b/mmde/configs/_base_/datasets/waymoD3-mv-mono3d-3class.py
@@ -0,0 +1,191 @@
+# dataset settings
+# D3 in the config name means the whole dataset is divided into 3 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(classes=class_names)
+input_modality = dict(use_lidar=False, use_camera=True)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    # base shape (1248, 832), scale (0.95, 1.05)
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        # ratio_range=(1., 1.),
+        ratio_range=(0.95, 1.05),
+        interpolation='nearest',
+        keep_ratio=True,
+    ),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='Resize3D',
+        scale_factor=0.65,
+        interpolation='nearest',
+        keep_ratio=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img'],
+        meta_keys=[
+            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
+            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
+        ]),
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='Resize3D',
+        scale_factor=0.65,
+        interpolation='nearest',
+        keep_ratio=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img'],
+        meta_keys=[
+            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
+            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
+        ]),
+]
+
+train_dataloader = dict(
+    batch_size=3,
+    num_workers=3,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        cam_sync_instances=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='mv_image_based',
+        # load one frame every three frames
+        load_interval=3,
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=0,
+    persistent_workers=False,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        cam_sync_instances=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='mv_image_based',
+        # load_eval_anns=False,
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=0,
+    persistent_workers=False,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        cam_sync_instances=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='mv_image_based',
+        load_eval_anns=False,
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type='WaymoMetric',
+    waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
+    metric='LET_mAP',
+    load_type='mv_image_based',
+    result_prefix='./pgd_mv_pred',
+    nms_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_pre=500,
+        nms_thr=0.05,
+        score_thr=0.001,
+        min_bbox_size=0,
+        max_per_frame=100))
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/waymoD5-3d-3class.py b/mmde/configs/_base_/datasets/waymoD5-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8f14998d275e783635a8cad709197397c0e057e
--- /dev/null
+++ b/mmde/configs/_base_/datasets/waymoD5-3d-3class.py
@@ -0,0 +1,178 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+data_root = 'data/waymo/kitti_format/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+metainfo = dict(classes=class_names)
+
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    # dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='waymo_infos_train.pkl',
+            data_prefix=dict(
+                pts='training/velodyne', sweeps='training/velodyne'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            # load one frame every five frames
+            load_interval=5,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/waymoD5-3d-car.py b/mmde/configs/_base_/datasets/waymoD5-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..972e9289bee2baaf3d0bee03d48f0a1dd3c329b2
--- /dev/null
+++ b/mmde/configs/_base_/datasets/waymoD5-3d-car.py
@@ -0,0 +1,173 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+class_names = ['Car']
+metainfo = dict(classes=class_names)
+
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(type='Pack3DDetInputs', keys=['points']),
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='waymo_infos_train.pkl',
+            data_prefix=dict(
+                pts='training/velodyne', sweeps='training/velodyne'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            # load one frame every five frames
+            load_interval=5,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/_base_/datasets/waymoD5-fov-mono3d-3class.py b/mmde/configs/_base_/datasets/waymoD5-fov-mono3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..614b6a9efab202041b75f7966501c6bc174eed44
--- /dev/null
+++ b/mmde/configs/_base_/datasets/waymoD5-fov-mono3d-3class.py
@@ -0,0 +1,163 @@
+# dataset settings
+# D3 in the config name means the whole dataset is divided into 3 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+input_modality = dict(use_lidar=False, use_camera=True)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    # base shape (1248, 832), scale (0.95, 1.05)
+    dict(
+        type='RandomResize3D',
+        scale=(1284, 832),
+        ratio_range=(0.95, 1.05),
+        keep_ratio=True,
+    ),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True),
+    dict(type='Pack3DDetInputs', keys=['img']),
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True),
+    dict(type='Pack3DDetInputs', keys=['img']),
+]
+
+metainfo = dict(CLASSES=class_names)
+
+train_dataloader = dict(
+    batch_size=3,
+    num_workers=3,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='fov_image_based',
+        # load one frame every three frames
+        load_interval=5,
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='fov_image_based',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='fov_image_based',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type='WaymoMetric',
+    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
+    waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin',
+    data_root='./data/waymo/waymo_format',
+    metric='LET_mAP',
+    load_type='fov_image_based',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/configs/_base_/datasets/waymoD5-mv-mono3d-3class.py b/mmde/configs/_base_/datasets/waymoD5-mv-mono3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..0840d5eab16651b17a11a192980d6c26f2c81d36
--- /dev/null
+++ b/mmde/configs/_base_/datasets/waymoD5-mv-mono3d-3class.py
@@ -0,0 +1,163 @@
+# dataset settings
+# D3 in the config name means the whole dataset is divided into 3 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+input_modality = dict(use_lidar=False, use_camera=True)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    # base shape (1248, 832), scale (0.95, 1.05)
+    dict(
+        type='RandomResize3D',
+        scale=(1284, 832),
+        ratio_range=(0.95, 1.05),
+        keep_ratio=True,
+    ),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True),
+    dict(type='Pack3DDetInputs', keys=['img']),
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True),
+    dict(type='Pack3DDetInputs', keys=['img']),
+]
+
+metainfo = dict(classes=class_names)
+
+train_dataloader = dict(
+    batch_size=3,
+    num_workers=3,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='mv_image_based',
+        # load one frame every three frames
+        load_interval=5,
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='mv_image_based',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='mv_image_based',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type='WaymoMetric',
+    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
+    waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
+    data_root='./data/waymo/waymo_format',
+    metric='LET_mAP',
+    load_type='mv_image_based',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/configs/_base_/datasets/waymoD5-mv3d-3class.py b/mmde/configs/_base_/datasets/waymoD5-mv3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ea267899368a70bcd6542aaf1a5861d4c850b70
--- /dev/null
+++ b/mmde/configs/_base_/datasets/waymoD5-mv3d-3class.py
@@ -0,0 +1,178 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+input_modality = dict(use_lidar=False, use_camera=True)
+point_cloud_range = [-35.0, -75.0, -2, 75.0, 75.0, 4]
+
+train_transforms = [
+    dict(type='PhotoMetricDistortion3D'),
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        ratio_range=(0.95, 1.05),
+        keep_ratio=True),
+    dict(type='RandomCrop3D', crop_size=(1080, 720)),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5, flip_box3d=False),
+]
+
+train_pipeline = [
+    dict(
+        type='LoadMultiViewImageFromFiles',
+        to_float32=True,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='MultiViewWrapper', transforms=train_transforms),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(
+        type='Pack3DDetInputs', keys=[
+            'img',
+            'gt_bboxes_3d',
+            'gt_labels_3d',
+        ]),
+]
+test_transforms = [
+    dict(
+        type='RandomResize3D',
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True)
+]
+test_pipeline = [
+    dict(
+        type='LoadMultiViewImageFromFiles',
+        to_float32=True,
+        backend_args=backend_args),
+    dict(type='MultiViewWrapper', transforms=test_transforms),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img'],
+        meta_keys=[
+            'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor',
+            'sample_idx', 'context_name', 'timestamp', 'lidar2cam',
+            'num_ref_frames', 'num_views'
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadMultiViewImageFromFiles',
+        to_float32=True,
+        backend_args=backend_args),
+    dict(type='MultiViewWrapper', transforms=test_transforms),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img'],
+        meta_keys=[
+            'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor',
+            'sample_idx', 'context_name', 'timestamp', 'lidar2cam',
+            'num_ref_frames', 'num_views'
+        ])
+]
+metainfo = dict(classes=class_names)
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        cam_sync_instances=True,
+        metainfo=metainfo,
+        box_type_3d='Lidar',
+        load_interval=5,
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_val.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='Lidar',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_val.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='Lidar',
+        backend_args=backend_args))
+val_evaluator = dict(
+    type='WaymoMetric',
+    waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
+    metric='LET_mAP')
+
+test_evaluator = val_evaluator
diff --git a/mmde/configs/_base_/default_runtime.py b/mmde/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..9249ab9952f8e15223982fdb05ffb5f34fea5f3a
--- /dev/null
+++ b/mmde/configs/_base_/default_runtime.py
@@ -0,0 +1,23 @@
+default_scope = 'mmdet3d'
+
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=-1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='Det3DVisualizationHook'))
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+
+# TODO: support auto scaling lr
diff --git a/mmde/configs/_base_/models/3dssd.py b/mmde/configs/_base_/models/3dssd.py
new file mode 100644
index 0000000000000000000000000000000000000000..323286193a1a928ee95fc91243898f46cdb37cf8
--- /dev/null
+++ b/mmde/configs/_base_/models/3dssd.py
@@ -0,0 +1,76 @@
+model = dict(
+    type='SSD3DNet',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='PointNet2SAMSG',
+        in_channels=4,
+        num_points=(4096, 512, (256, 256)),
+        radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)),
+        num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)),
+        sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)),
+                     ((64, 64, 128), (64, 64, 128), (64, 96, 128)),
+                     ((128, 128, 256), (128, 192, 256), (128, 256, 256))),
+        aggregation_channels=(64, 128, 256),
+        fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')),
+        fps_sample_range_lists=((-1), (-1), (512, -1)),
+        norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
+        sa_cfg=dict(
+            type='PointSAModuleMSG',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False)),
+    bbox_head=dict(
+        type='SSD3DHead',
+        vote_module_cfg=dict(
+            in_channels=256,
+            num_points=256,
+            gt_per_seed=1,
+            conv_channels=(128, ),
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+            with_res_feat=False,
+            vote_xyz_range=(3.0, 3.0, 2.0)),
+        vote_aggregation_cfg=dict(
+            type='PointSAModuleMSG',
+            num_point=256,
+            radii=(4.8, 6.4),
+            sample_nums=(16, 32),
+            mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)),
+            norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
+            use_xyz=True,
+            normalize_xyz=False,
+            bias=True),
+        pred_layer_cfg=dict(
+            in_channels=1536,
+            shared_conv_channels=(512, 128),
+            cls_conv_channels=(128, ),
+            reg_conv_channels=(128, ),
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+            bias=True),
+        objectness_loss=dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=1.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=1.0),
+        corner_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=1.0),
+        vote_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        sample_mode='spec', pos_distance_thr=10.0, expand_dims_length=0.05),
+    test_cfg=dict(
+        nms_cfg=dict(type='nms', iou_thr=0.1),
+        sample_mode='spec',
+        score_thr=0.0,
+        per_class_proposal=True,
+        max_output_num=100))
diff --git a/mmde/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py b/mmde/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..147b5f03359cc18957799caab7dfdff954fbd3a7
--- /dev/null
+++ b/mmde/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py
@@ -0,0 +1,199 @@
+# model settings
+model = dict(
+    type='CascadeRCNN',
+    pretrained='torchvision://resnet50',
+    _scope_='mmdet',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='CascadeRoIHead',
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            nms_post=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=[
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            nms_post=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/mmde/configs/_base_/models/centerpoint_pillar02_second_secfpn_nus.py b/mmde/configs/_base_/models/centerpoint_pillar02_second_secfpn_nus.py
new file mode 100644
index 0000000000000000000000000000000000000000..233b9122f7a74cdbd0dded437d54de9f62090ba4
--- /dev/null
+++ b/mmde/configs/_base_/models/centerpoint_pillar02_second_secfpn_nus.py
@@ -0,0 +1,89 @@
+voxel_size = [0.2, 0.2, 8]
+model = dict(
+    type='CenterPoint',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=20,
+            voxel_size=voxel_size,
+            max_voxels=(30000, 40000))),
+    pts_voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=5,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=(0.2, 0.2, 8),
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+        legacy=False),
+    pts_middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        out_channels=[64, 128, 256],
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        conv_cfg=dict(type='Conv2d', bias=False)),
+    pts_neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        out_channels=[128, 128, 128],
+        upsample_strides=[0.5, 1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        upsample_cfg=dict(type='deconv', bias=False),
+        use_conv_for_no_stride=True),
+    pts_bbox_head=dict(
+        type='CenterHead',
+        in_channels=sum([128, 128, 128]),
+        tasks=[
+            dict(num_class=1, class_names=['car']),
+            dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+            dict(num_class=2, class_names=['bus', 'trailer']),
+            dict(num_class=1, class_names=['barrier']),
+            dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+            dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+        ],
+        common_heads=dict(
+            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+        share_conv_channel=64,
+        bbox_coder=dict(
+            type='CenterPointBBoxCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_num=500,
+            score_threshold=0.1,
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            code_size=9),
+        separate_head=dict(
+            type='SeparateHead', init_bias=-2.19, final_kernel=3),
+        loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(
+            type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
+        norm_bbox=True),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            grid_size=[512, 512, 1],
+            voxel_size=voxel_size,
+            out_size_factor=4,
+            dense_reg=1,
+            gaussian_overlap=0.1,
+            max_objs=500,
+            min_radius=2,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+    test_cfg=dict(
+        pts=dict(
+            post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_per_img=500,
+            max_pool_nms=False,
+            min_radius=[4, 12, 10, 1, 0.85, 0.175],
+            score_threshold=0.1,
+            pc_range=[-51.2, -51.2],
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            nms_type='rotate',
+            pre_max_size=1000,
+            post_max_size=83,
+            nms_thr=0.2)))
diff --git a/mmde/configs/_base_/models/centerpoint_voxel01_second_secfpn_nus.py b/mmde/configs/_base_/models/centerpoint_voxel01_second_secfpn_nus.py
new file mode 100644
index 0000000000000000000000000000000000000000..91dcd17f4ece520b1a6298a08a2b9bfa1eaef0f8
--- /dev/null
+++ b/mmde/configs/_base_/models/centerpoint_voxel01_second_secfpn_nus.py
@@ -0,0 +1,89 @@
+voxel_size = [0.1, 0.1, 0.2]
+model = dict(
+    type='CenterPoint',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=10,
+            voxel_size=voxel_size,
+            max_voxels=(90000, 120000))),
+    pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
+    pts_middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=5,
+        sparse_shape=[41, 1024, 1024],
+        output_channels=128,
+        order=('conv', 'norm', 'act'),
+        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+                                                                      128)),
+        encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
+        block_type='basicblock'),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        out_channels=[128, 256],
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        conv_cfg=dict(type='Conv2d', bias=False)),
+    pts_neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        out_channels=[256, 256],
+        upsample_strides=[1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        upsample_cfg=dict(type='deconv', bias=False),
+        use_conv_for_no_stride=True),
+    pts_bbox_head=dict(
+        type='CenterHead',
+        in_channels=sum([256, 256]),
+        tasks=[
+            dict(num_class=1, class_names=['car']),
+            dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+            dict(num_class=2, class_names=['bus', 'trailer']),
+            dict(num_class=1, class_names=['barrier']),
+            dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+            dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+        ],
+        common_heads=dict(
+            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+        share_conv_channel=64,
+        bbox_coder=dict(
+            type='CenterPointBBoxCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_num=500,
+            score_threshold=0.1,
+            out_size_factor=8,
+            voxel_size=voxel_size[:2],
+            code_size=9),
+        separate_head=dict(
+            type='SeparateHead', init_bias=-2.19, final_kernel=3),
+        loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(
+            type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
+        norm_bbox=True),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            grid_size=[1024, 1024, 40],
+            voxel_size=voxel_size,
+            out_size_factor=8,
+            dense_reg=1,
+            gaussian_overlap=0.1,
+            max_objs=500,
+            min_radius=2,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+    test_cfg=dict(
+        pts=dict(
+            post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_per_img=500,
+            max_pool_nms=False,
+            min_radius=[4, 12, 10, 1, 0.85, 0.175],
+            score_threshold=0.1,
+            out_size_factor=8,
+            voxel_size=voxel_size[:2],
+            nms_type='rotate',
+            pre_max_size=1000,
+            post_max_size=83,
+            nms_thr=0.2)))
diff --git a/mmde/configs/_base_/models/cylinder3d.py b/mmde/configs/_base_/models/cylinder3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..02e83233636d96b3d6b27b5a33e680f367a6db69
--- /dev/null
+++ b/mmde/configs/_base_/models/cylinder3d.py
@@ -0,0 +1,41 @@
+grid_shape = [480, 360, 32]
+model = dict(
+    type='Cylinder3D',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_type='cylindrical',
+        voxel_layer=dict(
+            grid_shape=grid_shape,
+            point_cloud_range=[0, -3.14159265359, -4, 50, 3.14159265359, 2],
+            max_num_points=-1,
+            max_voxels=-1,
+        ),
+    ),
+    voxel_encoder=dict(
+        type='SegVFE',
+        feat_channels=[64, 128, 256, 256],
+        in_channels=6,
+        with_voxel_center=True,
+        feat_compression=16,
+        return_point_feats=False),
+    backbone=dict(
+        type='Asymm3DSpconv',
+        grid_size=grid_shape,
+        input_channels=16,
+        base_channels=32,
+        norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.1)),
+    decode_head=dict(
+        type='Cylinder3DHead',
+        channels=128,
+        num_classes=20,
+        loss_ce=dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=False,
+            class_weight=None,
+            loss_weight=1.0),
+        loss_lovasz=dict(type='LovaszLoss', loss_weight=1.0, reduction='none'),
+    ),
+    train_cfg=None,
+    test_cfg=dict(mode='whole'),
+)
diff --git a/mmde/configs/_base_/models/dgcnn.py b/mmde/configs/_base_/models/dgcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdafa507933efc5eda7877718dc1ac61202ae0fe
--- /dev/null
+++ b/mmde/configs/_base_/models/dgcnn.py
@@ -0,0 +1,29 @@
+# model settings
+model = dict(
+    type='EncoderDecoder3D',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='DGCNNBackbone',
+        in_channels=9,  # [xyz, rgb, normal_xyz], modified with dataset
+        num_samples=(20, 20, 20),
+        knn_modes=('D-KNN', 'F-KNN', 'F-KNN'),
+        radius=(None, None, None),
+        gf_channels=((64, 64), (64, 64), (64, )),
+        fa_channels=(1024, ),
+        act_cfg=dict(type='LeakyReLU', negative_slope=0.2)),
+    decode_head=dict(
+        type='DGCNNHead',
+        fp_channels=(1216, 512),
+        channels=256,
+        dropout_ratio=0.5,
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
+        loss_decode=dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=False,
+            class_weight=None,  # modified with dataset
+            loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='slide'))
diff --git a/mmde/configs/_base_/models/fcaf3d.py b/mmde/configs/_base_/models/fcaf3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae47827aae2798000456069697ce04d9ee1f830e
--- /dev/null
+++ b/mmde/configs/_base_/models/fcaf3d.py
@@ -0,0 +1,20 @@
+model = dict(
+    type='MinkSingleStage3DDetector',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(type='MinkResNet', in_channels=3, depth=34),
+    bbox_head=dict(
+        type='FCAF3DHead',
+        in_channels=(64, 128, 256, 512),
+        out_channels=128,
+        voxel_size=.01,
+        pts_prune_threshold=100000,
+        pts_assign_threshold=27,
+        pts_center_threshold=18,
+        num_classes=18,
+        num_reg_outs=6,
+        center_loss=dict(type='mmdet.CrossEntropyLoss', use_sigmoid=True),
+        bbox_loss=dict(type='AxisAlignedIoULoss'),
+        cls_loss=dict(type='mmdet.FocalLoss'),
+    ),
+    train_cfg=dict(),
+    test_cfg=dict(nms_pre=1000, iou_thr=.5, score_thr=.01))
diff --git a/mmde/configs/_base_/models/fcos3d.py b/mmde/configs/_base_/models/fcos3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbb20efc4e9e821129da32a01b783a1f460b12f8
--- /dev/null
+++ b/mmde/configs/_base_/models/fcos3d.py
@@ -0,0 +1,86 @@
+# model settings
+model = dict(
+    type='FCOSMono3D',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')),
+    neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='FCOSMono3DHead',
+        num_classes=10,
+        in_channels=256,
+        stacked_convs=2,
+        feat_channels=256,
+        use_direction_classifier=True,
+        diff_rad_by_sin=True,
+        pred_attrs=True,
+        pred_velo=True,
+        dir_offset=0.7854,  # pi/4
+        dir_limit_offset=0,
+        strides=[8, 16, 32, 64, 128],
+        group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo
+        cls_branch=(256, ),
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            ()  # velo
+        ),
+        dir_branch=(256, ),
+        attr_branch=(256, ),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_attr=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_centerness=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9),
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        center_sampling=True,
+        conv_bias=True,
+        dcn_on_last_conv=True),
+    train_cfg=dict(
+        allowed_border=0,
+        code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05],
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_pre=1000,
+        nms_thr=0.8,
+        score_thr=0.05,
+        min_bbox_size=0,
+        max_per_img=200))
diff --git a/mmde/configs/_base_/models/groupfree3d.py b/mmde/configs/_base_/models/groupfree3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..96275758c56229c4a2d578041139608831c89a9c
--- /dev/null
+++ b/mmde/configs/_base_/models/groupfree3d.py
@@ -0,0 +1,75 @@
+model = dict(
+    type='GroupFree3DNet',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=3,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                     (128, 128, 256)),
+        fp_channels=((256, 256), (256, 288)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    bbox_head=dict(
+        type='GroupFree3DHead',
+        in_channels=288,
+        num_decoder_layers=6,
+        num_proposal=256,
+        transformerlayers=dict(
+            type='BaseTransformerLayer',
+            attn_cfgs=dict(
+                type='GroupFree3DMHA',
+                embed_dims=288,
+                num_heads=8,
+                attn_drop=0.1,
+                dropout_layer=dict(type='Dropout', drop_prob=0.1)),
+            ffn_cfgs=dict(
+                embed_dims=288,
+                feedforward_channels=2048,
+                ffn_drop=0.1,
+                act_cfg=dict(type='ReLU', inplace=True)),
+            operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
+                             'norm')),
+        pred_layer_cfg=dict(
+            in_channels=288, shared_conv_channels=(288, 288), bias=True),
+        sampling_objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0,
+            reduction='sum',
+            loss_weight=10.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(sample_mode='kps'),
+    test_cfg=dict(
+        sample_mode='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last'))
diff --git a/mmde/configs/_base_/models/h3dnet.py b/mmde/configs/_base_/models/h3dnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..559b06c90b8ff41c8bcf4f28946392f115b38890
--- /dev/null
+++ b/mmde/configs/_base_/models/h3dnet.py
@@ -0,0 +1,351 @@
+primitive_z_cfg = dict(
+    type='PrimitiveHead',
+    num_dims=2,
+    num_classes=18,
+    primitive_mode='z',
+    upper_thresh=100.0,
+    surface_thresh=0.5,
+    vote_module_cfg=dict(
+        in_channels=256,
+        vote_per_seed=1,
+        gt_per_seed=1,
+        conv_channels=(256, 256),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        norm_feats=True,
+        vote_loss=dict(
+            type='ChamferDistance',
+            mode='l1',
+            reduction='none',
+            loss_dst_weight=10.0)),
+    vote_aggregation_cfg=dict(
+        type='PointSAModule',
+        num_point=1024,
+        radius=0.3,
+        num_sample=16,
+        mlp_channels=[256, 128, 128, 128],
+        use_xyz=True,
+        normalize_xyz=True),
+    feat_channels=(128, 128),
+    conv_cfg=dict(type='Conv1d'),
+    norm_cfg=dict(type='BN1d'),
+    objectness_loss=dict(
+        type='mmdet.CrossEntropyLoss',
+        class_weight=[0.4, 0.6],
+        reduction='mean',
+        loss_weight=30.0),
+    center_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=0.5,
+        loss_dst_weight=0.5),
+    semantic_reg_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=0.5,
+        loss_dst_weight=0.5),
+    semantic_cls_loss=dict(
+        type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+    train_cfg=dict(
+        sample_mode='vote',
+        dist_thresh=0.2,
+        var_thresh=1e-2,
+        lower_thresh=1e-6,
+        num_point=100,
+        num_point_line=10,
+        line_thresh=0.2),
+    test_cfg=dict(sample_mode='seed'))
+
+primitive_xy_cfg = dict(
+    type='PrimitiveHead',
+    num_dims=1,
+    num_classes=18,
+    primitive_mode='xy',
+    upper_thresh=100.0,
+    surface_thresh=0.5,
+    vote_module_cfg=dict(
+        in_channels=256,
+        vote_per_seed=1,
+        gt_per_seed=1,
+        conv_channels=(256, 256),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        norm_feats=True,
+        vote_loss=dict(
+            type='ChamferDistance',
+            mode='l1',
+            reduction='none',
+            loss_dst_weight=10.0)),
+    vote_aggregation_cfg=dict(
+        type='PointSAModule',
+        num_point=1024,
+        radius=0.3,
+        num_sample=16,
+        mlp_channels=[256, 128, 128, 128],
+        use_xyz=True,
+        normalize_xyz=True),
+    feat_channels=(128, 128),
+    conv_cfg=dict(type='Conv1d'),
+    norm_cfg=dict(type='BN1d'),
+    objectness_loss=dict(
+        type='mmdet.CrossEntropyLoss',
+        class_weight=[0.4, 0.6],
+        reduction='mean',
+        loss_weight=30.0),
+    center_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=0.5,
+        loss_dst_weight=0.5),
+    semantic_reg_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=0.5,
+        loss_dst_weight=0.5),
+    semantic_cls_loss=dict(
+        type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+    train_cfg=dict(
+        sample_mode='vote',
+        dist_thresh=0.2,
+        var_thresh=1e-2,
+        lower_thresh=1e-6,
+        num_point=100,
+        num_point_line=10,
+        line_thresh=0.2),
+    test_cfg=dict(sample_mode='seed'))
+
+primitive_line_cfg = dict(
+    type='PrimitiveHead',
+    num_dims=0,
+    num_classes=18,
+    primitive_mode='line',
+    upper_thresh=100.0,
+    surface_thresh=0.5,
+    vote_module_cfg=dict(
+        in_channels=256,
+        vote_per_seed=1,
+        gt_per_seed=1,
+        conv_channels=(256, 256),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        norm_feats=True,
+        vote_loss=dict(
+            type='ChamferDistance',
+            mode='l1',
+            reduction='none',
+            loss_dst_weight=10.0)),
+    vote_aggregation_cfg=dict(
+        type='PointSAModule',
+        num_point=1024,
+        radius=0.3,
+        num_sample=16,
+        mlp_channels=[256, 128, 128, 128],
+        use_xyz=True,
+        normalize_xyz=True),
+    feat_channels=(128, 128),
+    conv_cfg=dict(type='Conv1d'),
+    norm_cfg=dict(type='BN1d'),
+    objectness_loss=dict(
+        type='mmdet.CrossEntropyLoss',
+        class_weight=[0.4, 0.6],
+        reduction='mean',
+        loss_weight=30.0),
+    center_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=1.0,
+        loss_dst_weight=1.0),
+    semantic_reg_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=1.0,
+        loss_dst_weight=1.0),
+    semantic_cls_loss=dict(
+        type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=2.0),
+    train_cfg=dict(
+        sample_mode='vote',
+        dist_thresh=0.2,
+        var_thresh=1e-2,
+        lower_thresh=1e-6,
+        num_point=100,
+        num_point_line=10,
+        line_thresh=0.2),
+    test_cfg=dict(sample_mode='seed'))
+
+model = dict(
+    type='H3DNet',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='MultiBackbone',
+        num_streams=4,
+        suffixes=['net0', 'net1', 'net2', 'net3'],
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01),
+        act_cfg=dict(type='ReLU'),
+        backbones=dict(
+            type='PointNet2SASSG',
+            in_channels=4,
+            num_points=(2048, 1024, 512, 256),
+            radius=(0.2, 0.4, 0.8, 1.2),
+            num_samples=(64, 32, 16, 16),
+            sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                         (128, 128, 256)),
+            fp_channels=((256, 256), (256, 256)),
+            norm_cfg=dict(type='BN2d'),
+            sa_cfg=dict(
+                type='PointSAModule',
+                pool_mod='max',
+                use_xyz=True,
+                normalize_xyz=True))),
+    rpn_head=dict(
+        type='VoteHead',
+        vote_module_cfg=dict(
+            in_channels=256,
+            vote_per_seed=1,
+            gt_per_seed=3,
+            conv_channels=(256, 256),
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d'),
+            norm_feats=True,
+            vote_loss=dict(
+                type='ChamferDistance',
+                mode='l1',
+                reduction='none',
+                loss_dst_weight=10.0)),
+        vote_aggregation_cfg=dict(
+            type='PointSAModule',
+            num_point=256,
+            radius=0.3,
+            num_sample=16,
+            mlp_channels=[256, 128, 128, 128],
+            use_xyz=True,
+            normalize_xyz=True),
+        pred_layer_cfg=dict(
+            in_channels=128, shared_conv_channels=(128, 128), bias=True),
+        objectness_loss=dict(
+            type='mmdet.CrossEntropyLoss',
+            class_weight=[0.2, 0.8],
+            reduction='sum',
+            loss_weight=5.0),
+        center_loss=dict(
+            type='ChamferDistance',
+            mode='l2',
+            reduction='sum',
+            loss_src_weight=10.0,
+            loss_dst_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    roi_head=dict(
+        type='H3DRoIHead',
+        primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg],
+        bbox_head=dict(
+            type='H3DBboxHead',
+            gt_per_seed=3,
+            num_proposal=256,
+            suface_matching_cfg=dict(
+                type='PointSAModule',
+                num_point=256 * 6,
+                radius=0.5,
+                num_sample=32,
+                mlp_channels=[128 + 6, 128, 64, 32],
+                use_xyz=True,
+                normalize_xyz=True),
+            line_matching_cfg=dict(
+                type='PointSAModule',
+                num_point=256 * 12,
+                radius=0.5,
+                num_sample=32,
+                mlp_channels=[128 + 12, 128, 64, 32],
+                use_xyz=True,
+                normalize_xyz=True),
+            primitive_refine_channels=[128, 128, 128],
+            upper_thresh=100.0,
+            surface_thresh=0.5,
+            line_thresh=0.5,
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d'),
+            objectness_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                class_weight=[0.2, 0.8],
+                reduction='sum',
+                loss_weight=5.0),
+            center_loss=dict(
+                type='ChamferDistance',
+                mode='l2',
+                reduction='sum',
+                loss_src_weight=10.0,
+                loss_dst_weight=10.0),
+            dir_class_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                reduction='sum',
+                loss_weight=0.1),
+            dir_res_loss=dict(
+                type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+            size_class_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                reduction='sum',
+                loss_weight=0.1),
+            size_res_loss=dict(
+                type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+            semantic_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                reduction='sum',
+                loss_weight=0.1),
+            cues_objectness_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                class_weight=[0.3, 0.7],
+                reduction='mean',
+                loss_weight=5.0),
+            cues_semantic_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                class_weight=[0.3, 0.7],
+                reduction='mean',
+                loss_weight=5.0),
+            proposal_objectness_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                class_weight=[0.2, 0.8],
+                reduction='none',
+                loss_weight=5.0),
+            primitive_center_loss=dict(
+                type='mmdet.MSELoss', reduction='none', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote'),
+        rpn_proposal=dict(use_nms=False),
+        rcnn=dict(
+            pos_distance_thr=0.3,
+            neg_distance_thr=0.6,
+            sample_mode='vote',
+            far_threshold=0.6,
+            near_threshold=0.3,
+            mask_surface_threshold=0.3,
+            label_surface_threshold=0.3,
+            mask_line_threshold=0.3,
+            label_line_threshold=0.3)),
+    test_cfg=dict(
+        rpn=dict(
+            sample_mode='seed',
+            nms_thr=0.25,
+            score_thr=0.05,
+            per_class_proposal=True,
+            use_nms=False),
+        rcnn=dict(
+            sample_mode='seed',
+            nms_thr=0.25,
+            score_thr=0.05,
+            per_class_proposal=True)))
diff --git a/mmde/configs/_base_/models/imvotenet.py b/mmde/configs/_base_/models/imvotenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..2946300288a2778af4503b2eb3ef0e3a88d9e85b
--- /dev/null
+++ b/mmde/configs/_base_/models/imvotenet.py
@@ -0,0 +1,118 @@
+model = dict(
+    type='ImVoteNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        # use caffe img_norm
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    img_backbone=dict(
+        type='mmdet.ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    img_neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    img_rpn_head=dict(
+        _scope_='mmdet',
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    img_roi_head=dict(
+        _scope_='mmdet',
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=10,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+
+    # model training and testing settings
+    train_cfg=dict(
+        _scope_='mmdet',
+        img_rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        img_rpn_proposal=dict(
+            nms_across_levels=False,
+            nms_pre=2000,
+            nms_post=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        img_rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        img_rpn=dict(
+            nms_across_levels=False,
+            nms_pre=1000,
+            nms_post=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        img_rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/mmde/configs/_base_/models/mask-rcnn_r50_fpn.py b/mmde/configs/_base_/models/mask-rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..881d4df66266ef89cc050c9df416dc41f4973245
--- /dev/null
+++ b/mmde/configs/_base_/models/mask-rcnn_r50_fpn.py
@@ -0,0 +1,125 @@
+# model settings
+model = dict(
+    type='MaskRCNN',
+    pretrained='torchvision://resnet50',
+    _scope_='mmdet',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_across_levels=False,
+            nms_pre=2000,
+            nms_post=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            mask_size=28,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_across_levels=False,
+            nms_pre=1000,
+            nms_post=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/mmde/configs/_base_/models/minkunet.py b/mmde/configs/_base_/models/minkunet.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd4386032b046a23c3155b1e8f550762875aade9
--- /dev/null
+++ b/mmde/configs/_base_/models/minkunet.py
@@ -0,0 +1,33 @@
+model = dict(
+    type='MinkUNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_type='minkunet',
+        batch_first=False,
+        max_voxels=80000,
+        voxel_layer=dict(
+            max_num_points=-1,
+            point_cloud_range=[-100, -100, -20, 100, 100, 20],
+            voxel_size=[0.05, 0.05, 0.05],
+            max_voxels=(-1, -1))),
+    backbone=dict(
+        type='MinkUNetBackbone',
+        in_channels=4,
+        num_stages=4,
+        base_channels=32,
+        encoder_channels=[32, 64, 128, 256],
+        encoder_blocks=[2, 2, 2, 2],
+        decoder_channels=[256, 128, 96, 96],
+        decoder_blocks=[2, 2, 2, 2],
+        block_type='basic',
+        sparseconv_backend='torchsparse'),
+    decode_head=dict(
+        type='MinkUNetHead',
+        channels=96,
+        num_classes=19,
+        dropout_ratio=0,
+        loss_decode=dict(type='mmdet.CrossEntropyLoss', avg_non_ignore=True),
+        ignore_index=19),
+    train_cfg=dict(),
+    test_cfg=dict())
diff --git a/mmde/configs/_base_/models/multiview_dfm.py b/mmde/configs/_base_/models/multiview_dfm.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fa5376d1e3313816d481cad4ba605513fcb8365
--- /dev/null
+++ b/mmde/configs/_base_/models/multiview_dfm.py
@@ -0,0 +1,104 @@
+model = dict(
+    type='MultiViewDfM',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True)),
+    neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=64,
+        num_outs=4),
+    neck_2d=None,
+    bbox_head_2d=None,
+    backbone_stereo=None,
+    depth_head=None,
+    backbone_3d=None,
+    neck_3d=dict(type='OutdoorImVoxelNeck', in_channels=64, out_channels=256),
+    valid_sample=True,
+    voxel_size=(0.5, 0.5, 0.5),  # n_voxels=[240, 300, 12]
+    anchor_generator=dict(
+        type='AlignedAnchor3DRangeGenerator',
+        ranges=[[-35.0, -75.0, -2, 75.0, 75.0, 4]],
+        rotations=[.0]),
+    bbox_head_3d=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=256,
+        feat_channels=256,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-35.0, -75.0, 0, 75.0, 75.0, 0],
+                    [-35.0, -75.0, -0.1188, 75.0, 75.0, -0.1188],
+                    [-35.0, -75.0, -0.0345, 75.0, 75.0, -0.0345]],
+            sizes=[
+                [0.91, 0.84, 1.74],  # pedestrian
+                [1.81, 0.84, 1.77],  # cyclist
+                [4.73, 2.08, 1.77],  # car
+            ],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        dir_offset=-0.7854,  # -pi / 4
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1)
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.05,
+        score_thr=0.001,
+        min_bbox_size=0,
+        nms_pre=4096,
+        max_num=500))
diff --git a/mmde/configs/_base_/models/paconv_ssg-cuda.py b/mmde/configs/_base_/models/paconv_ssg-cuda.py
new file mode 100644
index 0000000000000000000000000000000000000000..f513bd4a2f94964f70dba926ef03b427a795e417
--- /dev/null
+++ b/mmde/configs/_base_/models/paconv_ssg-cuda.py
@@ -0,0 +1,7 @@
+_base_ = './paconv_ssg.py'
+
+model = dict(
+    backbone=dict(
+        sa_cfg=dict(
+            type='PAConvCUDASAModule',
+            scorenet_cfg=dict(mlp_channels=[8, 16, 16]))))
diff --git a/mmde/configs/_base_/models/paconv_ssg.py b/mmde/configs/_base_/models/paconv_ssg.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f6991f750bf257f35b43b5e73aaa3ba17f15e84
--- /dev/null
+++ b/mmde/configs/_base_/models/paconv_ssg.py
@@ -0,0 +1,50 @@
+# model settings
+model = dict(
+    type='EncoderDecoder3D',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=9,  # [xyz, rgb, normalized_xyz]
+        num_points=(1024, 256, 64, 16),
+        radius=(None, None, None, None),  # use kNN instead of ball query
+        num_samples=(32, 32, 32, 32),
+        sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256,
+                                                                    512)),
+        fp_channels=(),
+        norm_cfg=dict(type='BN2d', momentum=0.1),
+        sa_cfg=dict(
+            type='PAConvSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False,
+            paconv_num_kernels=[16, 16, 16],
+            paconv_kernel_input='w_neighbor',
+            scorenet_input='w_neighbor_dist',
+            scorenet_cfg=dict(
+                mlp_channels=[16, 16, 16],
+                score_norm='softmax',
+                temp_factor=1.0,
+                last_bn=False))),
+    decode_head=dict(
+        type='PAConvHead',
+        # PAConv model's decoder takes skip connections from beckbone
+        # different from PointNet++, it also concats input features in the last
+        # level of decoder, leading to `128 + 6` as the channel number
+        fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+                     (128 + 6, 128, 128, 128)),
+        channels=128,
+        dropout_ratio=0.5,
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        act_cfg=dict(type='ReLU'),
+        loss_decode=dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=False,
+            class_weight=None,  # should be modified with dataset
+            loss_weight=1.0)),
+    # correlation loss to regularize PAConv's kernel weights
+    loss_regularization=dict(
+        type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='slide'))
diff --git a/mmde/configs/_base_/models/parta2.py b/mmde/configs/_base_/models/parta2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7db477a9187999679a04eedbe5c0e66792e16906
--- /dev/null
+++ b/mmde/configs/_base_/models/parta2.py
@@ -0,0 +1,207 @@
+# model settings
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+model = dict(
+    type='PartA2',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=5,  # max_points_per_voxel
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(type='HardSimpleVFE'),
+    middle_encoder=dict(
+        type='SparseUNet',
+        in_channels=4,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act')),
+    backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    rpn_head=dict(
+        type='PartA2RPNHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -1.78, 70.4, 40.0, -1.78]],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        assigner_per_size=True,
+        assign_per_class=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    roi_head=dict(
+        type='PartAggregationROIHead',
+        num_classes=3,
+        semantic_head=dict(
+            type='PointwiseSemanticHead',
+            in_channels=16,
+            extra_width=0.2,
+            seg_score_thr=0.3,
+            num_classes=3,
+            loss_seg=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_part=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                loss_weight=1.0)),
+        seg_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='max')),
+        bbox_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='avg')),
+        bbox_head=dict(
+            type='PartA2BboxHead',
+            num_classes=3,
+            seg_in_channels=16,
+            part_in_channels=4,
+            seg_conv_channels=[64, 64],
+            part_conv_channels=[64, 64],
+            merge_conv_channels=[128, 128],
+            down_conv_channels=[128, 256],
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            shared_fc_channels=[256, 512, 512, 512],
+            cls_channels=[256, 256],
+            reg_channels=[256, 256],
+            dropout_ratio=0.1,
+            roi_feat_size=14,
+            with_corner_loss=True,
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss',
+                beta=1.0 / 9.0,
+                reduction='sum',
+                loss_weight=1.0),
+            loss_cls=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1)
+            ],
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=9000,
+            nms_post=512,
+            max_num=512,
+            nms_thr=0.8,
+            score_thr=0,
+            use_rotate_nms=False),
+        rcnn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1)
+            ],
+            sampler=dict(
+                type='IoUNegPiecewiseSampler',
+                num=128,
+                pos_fraction=0.55,
+                neg_piece_fractions=[0.8, 0.2],
+                neg_iou_piece_thrs=[0.55, 0.1],
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False,
+                return_iou=True),
+            cls_pos_thr=0.75,
+            cls_neg_thr=0.25)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1024,
+            nms_post=100,
+            max_num=100,
+            nms_thr=0.7,
+            score_thr=0,
+            use_rotate_nms=True),
+        rcnn=dict(
+            use_rotate_nms=True,
+            use_raw_score=True,
+            nms_thr=0.01,
+            score_thr=0.1)))
diff --git a/mmde/configs/_base_/models/pgd.py b/mmde/configs/_base_/models/pgd.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7c098d270d4b42ea503ed77eef530bba2684e9c
--- /dev/null
+++ b/mmde/configs/_base_/models/pgd.py
@@ -0,0 +1,56 @@
+_base_ = './fcos3d.py'
+# model settings
+model = dict(
+    bbox_head=dict(
+        _delete_=True,
+        type='PGDHead',
+        num_classes=10,
+        in_channels=256,
+        stacked_convs=2,
+        feat_channels=256,
+        use_direction_classifier=True,
+        diff_rad_by_sin=True,
+        pred_attrs=True,
+        pred_velo=True,
+        pred_bbox2d=True,
+        pred_keypoints=False,
+        dir_offset=0.7854,  # pi/4
+        strides=[8, 16, 32, 64, 128],
+        group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo
+        cls_branch=(256, ),
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            ()  # velo
+        ),
+        dir_branch=(256, ),
+        attr_branch=(256, ),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_attr=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_centerness=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        center_sampling=True,
+        conv_bias=True,
+        dcn_on_last_conv=True,
+        use_depth_classifier=True,
+        depth_branch=(256, ),
+        depth_range=(0, 50),
+        depth_unit=10,
+        division='uniform',
+        depth_bins=6,
+        bbox_coder=dict(type='PGDBBoxCoder', code_size=9)),
+    test_cfg=dict(nms_pre=1000, nms_thr=0.8, score_thr=0.01, max_per_img=200))
diff --git a/mmde/configs/_base_/models/point_rcnn.py b/mmde/configs/_base_/models/point_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c23a78b55d530f919d401ce308ba7209717c6322
--- /dev/null
+++ b/mmde/configs/_base_/models/point_rcnn.py
@@ -0,0 +1,148 @@
+model = dict(
+    type='PointRCNN',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='PointNet2SAMSG',
+        in_channels=4,
+        num_points=(4096, 1024, 256, 64),
+        radii=((0.1, 0.5), (0.5, 1.0), (1.0, 2.0), (2.0, 4.0)),
+        num_samples=((16, 32), (16, 32), (16, 32), (16, 32)),
+        sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96,
+                                                                    128)),
+                     ((128, 196, 256), (128, 196, 256)), ((256, 256, 512),
+                                                          (256, 384, 512))),
+        fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')),
+        fps_sample_range_lists=((-1), (-1), (-1), (-1)),
+        aggregation_channels=(None, None, None, None),
+        dilated_group=(False, False, False, False),
+        out_indices=(0, 1, 2, 3),
+        norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
+        sa_cfg=dict(
+            type='PointSAModuleMSG',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False)),
+    neck=dict(
+        type='PointNetFPNeck',
+        fp_channels=((1536, 512, 512), (768, 512, 512), (608, 256, 256),
+                     (257, 128, 128))),
+    rpn_head=dict(
+        type='PointRPNHead',
+        num_classes=3,
+        enlarge_width=0.1,
+        pred_layer_cfg=dict(
+            in_channels=128,
+            cls_linear_channels=(256, 256),
+            reg_linear_channels=(256, 256)),
+        cls_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        bbox_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0 / 9.0,
+            reduction='sum',
+            loss_weight=1.0),
+        bbox_coder=dict(
+            type='PointXYZWHLRBBoxCoder',
+            code_size=8,
+            # code_size: (center residual (3), size regression (3),
+            #             torch.cos(yaw) (1), torch.sin(yaw) (1)
+            use_mean_size=True,
+            mean_size=[[3.9, 1.6, 1.56], [0.8, 0.6, 1.73], [1.76, 0.6,
+                                                            1.73]])),
+    roi_head=dict(
+        type='PointRCNNRoIHead',
+        bbox_roi_extractor=dict(
+            type='Single3DRoIPointExtractor',
+            roi_layer=dict(type='RoIPointPool3d', num_sampled_points=512)),
+        bbox_head=dict(
+            type='PointRCNNBboxHead',
+            num_classes=1,
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss',
+                beta=1.0 / 9.0,
+                reduction='sum',
+                loss_weight=1.0),
+            loss_cls=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                loss_weight=1.0),
+            pred_layer_cfg=dict(
+                in_channels=512,
+                cls_conv_channels=(256, 256),
+                reg_conv_channels=(256, 256),
+                bias=True),
+            in_channels=5,
+            # 5 = 3 (xyz) + scores + depth
+            mlp_channels=[128, 128],
+            num_points=(128, 32, -1),
+            radius=(0.2, 0.4, 100),
+            num_samples=(16, 16, 16),
+            sa_channels=((128, 128, 128), (128, 128, 256), (256, 256, 512)),
+            with_corner_loss=True),
+        depth_normalizer=70.0),
+    # model training and testing settings
+    train_cfg=dict(
+        pos_distance_thr=10.0,
+        rpn=dict(
+            rpn_proposal=dict(
+                use_rotate_nms=True,
+                score_thr=None,
+                iou_thr=0.8,
+                nms_pre=9000,
+                nms_post=512)),
+        rcnn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1,
+                    match_low_quality=False),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1,
+                    match_low_quality=False),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1,
+                    match_low_quality=False)
+            ],
+            sampler=dict(
+                type='IoUNegPiecewiseSampler',
+                num=128,
+                pos_fraction=0.5,
+                neg_piece_fractions=[0.8, 0.2],
+                neg_iou_piece_thrs=[0.55, 0.1],
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False,
+                return_iou=True),
+            cls_pos_thr=0.7,
+            cls_neg_thr=0.25)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_cfg=dict(
+                use_rotate_nms=True,
+                iou_thr=0.85,
+                nms_pre=9000,
+                nms_post=512,
+                score_thr=None)),
+        rcnn=dict(use_rotate_nms=True, nms_thr=0.1, score_thr=0.1)))
diff --git a/mmde/configs/_base_/models/pointnet2_msg.py b/mmde/configs/_base_/models/pointnet2_msg.py
new file mode 100644
index 0000000000000000000000000000000000000000..222ab885557984125eb52a934f443870e6c6918d
--- /dev/null
+++ b/mmde/configs/_base_/models/pointnet2_msg.py
@@ -0,0 +1,28 @@
+_base_ = './pointnet2_ssg.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='PointNet2SAMSG',
+        in_channels=6,  # [xyz, rgb], should be modified with dataset
+        num_points=(1024, 256, 64, 16),
+        radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)),
+        num_samples=((16, 32), (16, 32), (16, 32), (16, 32)),
+        sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96,
+                                                                    128)),
+                     ((128, 196, 256), (128, 196, 256)), ((256, 256, 512),
+                                                          (256, 384, 512))),
+        aggregation_channels=(None, None, None, None),
+        fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')),
+        fps_sample_range_lists=((-1), (-1), (-1), (-1)),
+        dilated_group=(False, False, False, False),
+        out_indices=(0, 1, 2, 3),
+        sa_cfg=dict(
+            type='PointSAModuleMSG',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False)),
+    decode_head=dict(
+        fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128),
+                     (128, 128, 128, 128))))
diff --git a/mmde/configs/_base_/models/pointnet2_ssg.py b/mmde/configs/_base_/models/pointnet2_ssg.py
new file mode 100644
index 0000000000000000000000000000000000000000..386fe82a5c4181c8232d3464cfddf2bc1d2aa390
--- /dev/null
+++ b/mmde/configs/_base_/models/pointnet2_ssg.py
@@ -0,0 +1,36 @@
+# model settings
+model = dict(
+    type='EncoderDecoder3D',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=6,  # [xyz, rgb], should be modified with dataset
+        num_points=(1024, 256, 64, 16),
+        radius=(0.1, 0.2, 0.4, 0.8),
+        num_samples=(32, 32, 32, 32),
+        sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256,
+                                                                    512)),
+        fp_channels=(),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False)),
+    decode_head=dict(
+        type='PointNet2Head',
+        fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+                     (128, 128, 128, 128)),
+        channels=128,
+        dropout_ratio=0.5,
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        act_cfg=dict(type='ReLU'),
+        loss_decode=dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=False,
+            class_weight=None,  # should be modified with dataset
+            loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='slide'))
diff --git a/mmde/configs/_base_/models/pointpillars_hv_fpn_lyft.py b/mmde/configs/_base_/models/pointpillars_hv_fpn_lyft.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a00b76f1b672e0c78c047cfd2b2b70be8072f0d
--- /dev/null
+++ b/mmde/configs/_base_/models/pointpillars_hv_fpn_lyft.py
@@ -0,0 +1,23 @@
+_base_ = './pointpillars_hv_fpn_nus.py'
+
+# model settings (based on nuScenes model settings)
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+model = dict(
+    data_preprocessor=dict(
+        voxel_layer=dict(
+            max_num_points=20,
+            point_cloud_range=[-80, -80, -5, 80, 80, 3],
+            max_voxels=(60000, 60000))),
+    pts_voxel_encoder=dict(
+        feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]),
+    pts_middle_encoder=dict(output_shape=[640, 640]),
+    pts_bbox_head=dict(
+        num_classes=9,
+        anchor_generator=dict(
+            ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]),
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
+    # model training settings (based on nuScenes model settings)
+    train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
diff --git a/mmde/configs/_base_/models/pointpillars_hv_fpn_nus.py b/mmde/configs/_base_/models/pointpillars_hv_fpn_nus.py
new file mode 100644
index 0000000000000000000000000000000000000000..694e69ea1334cc80cae953488a068cdc6de8c272
--- /dev/null
+++ b/mmde/configs/_base_/models/pointpillars_hv_fpn_nus.py
@@ -0,0 +1,100 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.25, 0.25, 8]
+model = dict(
+    type='MVXFasterRCNN',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=64,
+            point_cloud_range=[-50, -50, -5, 50, 50, 3],
+            voxel_size=voxel_size,
+            max_voxels=(30000, 40000))),
+    pts_voxel_encoder=dict(
+        type='HardVFE',
+        in_channels=4,
+        feat_channels=[64, 64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        with_cluster_center=True,
+        with_voxel_center=True,
+        point_cloud_range=[-50, -50, -5, 50, 50, 3],
+        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
+    pts_middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    pts_neck=dict(
+        type='mmdet.FPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        act_cfg=dict(type='ReLU'),
+        in_channels=[64, 128, 256],
+        out_channels=256,
+        start_level=0,
+        num_outs=3),
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=10,
+        in_channels=256,
+        feat_channels=256,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
+            scales=[1, 2, 4],
+            sizes=[
+                [2.5981, 0.8660, 1.],  # 1.5 / sqrt(3)
+                [1.7321, 0.5774, 1.],  # 1 / sqrt(3)
+                [1., 1., 1.],
+                [0.4, 0.4, 1],
+            ],
+            custom_values=[0, 0],
+            rotations=[0, 1.57],
+            reshape_out=True),
+        assigner_per_size=False,
+        diff_rad_by_sin=True,
+        dir_offset=-0.7854,  # -pi / 4
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            assigner=dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        pts=dict(
+            use_rotate_nms=True,
+            nms_across_levels=False,
+            nms_pre=1000,
+            nms_thr=0.2,
+            score_thr=0.05,
+            min_bbox_size=0,
+            max_num=500)))
diff --git a/mmde/configs/_base_/models/pointpillars_hv_fpn_range100_lyft.py b/mmde/configs/_base_/models/pointpillars_hv_fpn_range100_lyft.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e56144a7953b8b0234aaa08feb6856229db9000
--- /dev/null
+++ b/mmde/configs/_base_/models/pointpillars_hv_fpn_range100_lyft.py
@@ -0,0 +1,23 @@
+_base_ = './pointpillars_hv_fpn_nus.py'
+
+# model settings (based on nuScenes model settings)
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+model = dict(
+    data_preprocessor=dict(
+        voxel_layer=dict(
+            max_num_points=20,
+            point_cloud_range=[-100, -100, -5, 100, 100, 3],
+            max_voxels=(60000, 60000))),
+    pts_voxel_encoder=dict(
+        feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]),
+    pts_middle_encoder=dict(output_shape=[800, 800]),
+    pts_bbox_head=dict(
+        num_classes=9,
+        anchor_generator=dict(
+            ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]),
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
+    # model training settings (based on nuScenes model settings)
+    train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
diff --git a/mmde/configs/_base_/models/pointpillars_hv_secfpn_kitti.py b/mmde/configs/_base_/models/pointpillars_hv_secfpn_kitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..09933c346f3cbc797a6fea2f8368a3e14dc835b1
--- /dev/null
+++ b/mmde/configs/_base_/models/pointpillars_hv_secfpn_kitti.py
@@ -0,0 +1,98 @@
+voxel_size = [0.16, 0.16, 4]
+
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=32,  # max_points_per_voxel
+            point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1],
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]),
+    middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+    backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        assign_per_class=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[
+                [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                [0, -39.68, -1.78, 69.12, 39.68, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='mmdet3d.BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='mmdet3d.BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='mmdet3d.BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
diff --git a/mmde/configs/_base_/models/pointpillars_hv_secfpn_waymo.py b/mmde/configs/_base_/models/pointpillars_hv_secfpn_waymo.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e7fd55769b72dbdfe600f7923e851c619d6ae95
--- /dev/null
+++ b/mmde/configs/_base_/models/pointpillars_hv_secfpn_waymo.py
@@ -0,0 +1,112 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.32, 0.32, 6]
+model = dict(
+    type='MVXFasterRCNN',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=20,
+            point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
+            voxel_size=voxel_size,
+            max_voxels=(32000, 32000))),
+    pts_voxel_encoder=dict(
+        type='HardVFE',
+        in_channels=5,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        with_cluster_center=True,
+        with_voxel_center=True,
+        point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
+        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
+    pts_middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        layer_nums=[3, 5, 5],
+        layer_strides=[1, 2, 2],
+        out_channels=[64, 128, 256]),
+    pts_neck=dict(
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345],
+                    [-74.88, -74.88, 0, 74.88, 74.88, 0],
+                    [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188]],
+            sizes=[
+                [4.73, 2.08, 1.77],  # car
+                [0.91, 0.84, 1.74],  # pedestrian
+                [1.81, 0.84, 1.77]  # cyclist
+            ],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        dir_offset=-0.7854,  # -pi / 4
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            assigner=[
+                dict(  # car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+                dict(  # cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+            ],
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        pts=dict(
+            use_rotate_nms=True,
+            nms_across_levels=False,
+            nms_pre=4096,
+            nms_thr=0.25,
+            score_thr=0.1,
+            min_bbox_size=0,
+            max_num=500)))
diff --git a/mmde/configs/_base_/models/second_hv_secfpn_kitti.py b/mmde/configs/_base_/models/second_hv_secfpn_kitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e1a6b0fe61bb6b0665122442292dcbb6d682347
--- /dev/null
+++ b/mmde/configs/_base_/models/second_hv_secfpn_kitti.py
@@ -0,0 +1,94 @@
+voxel_size = [0.05, 0.05, 0.1]
+
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=5,
+            point_cloud_range=[0, -40, -3, 70.4, 40, 1],
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(type='HardSimpleVFE'),
+    middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=4,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act')),
+    backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.35,
+                neg_iou_thr=0.2,
+                min_pos_iou=0.2,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.35,
+                neg_iou_thr=0.2,
+                min_pos_iou=0.2,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
diff --git a/mmde/configs/_base_/models/second_hv_secfpn_waymo.py b/mmde/configs/_base_/models/second_hv_secfpn_waymo.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6411251db579365c409af0e7ce81c18d79f592b
--- /dev/null
+++ b/mmde/configs/_base_/models/second_hv_secfpn_waymo.py
@@ -0,0 +1,108 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.08, 0.08, 0.1]
+model = dict(
+    type='MVXFasterRCNN',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=20,
+            point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4],
+            voxel_size=voxel_size,
+            max_voxels=(80000, 90000))),
+    pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
+    pts_middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=5,
+        sparse_shape=[61, 1280, 1920],
+        order=('conv', 'norm', 'act')),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=384,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    pts_neck=dict(
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[
+                [-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345],
+                [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188],
+                [-76.8, -51.2, 0, 76.8, 51.2, 0],
+            ],
+            sizes=[
+                [4.73, 2.08, 1.77],  # car
+                [1.81, 0.84, 1.77],  # pedestrian
+                [0.91, 0.84, 1.74],  # cyclist
+            ],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        dir_offset=-0.7854,  # -pi / 4
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            assigner=[
+                dict(  # car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+                dict(  # cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+            ],
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        pts=dict(
+            use_rotate_nms=True,
+            nms_across_levels=False,
+            nms_pre=4096,
+            nms_thr=0.25,
+            score_thr=0.1,
+            min_bbox_size=0,
+            max_num=500)))
diff --git a/mmde/configs/_base_/models/smoke.py b/mmde/configs/_base_/models/smoke.py
new file mode 100644
index 0000000000000000000000000000000000000000..a36456c0b5e2d932163818b6e4a80bee9ec0cbe8
--- /dev/null
+++ b/mmde/configs/_base_/models/smoke.py
@@ -0,0 +1,61 @@
+# model settings
+model = dict(
+    type='SMOKEMono3D',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='DLANet',
+        depth=34,
+        in_channels=3,
+        norm_cfg=dict(type='GN', num_groups=32),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='http://dl.yf.io/dla/models/imagenet/dla34-ba72cf86.pth'
+        )),
+    neck=dict(
+        type='DLANeck',
+        in_channels=[16, 32, 64, 128, 256, 512],
+        start_level=2,
+        end_level=5,
+        norm_cfg=dict(type='GN', num_groups=32)),
+    bbox_head=dict(
+        type='SMOKEMono3DHead',
+        num_classes=3,
+        in_channels=64,
+        dim_channel=[3, 4, 5],
+        ori_channel=[6, 7],
+        stacked_convs=0,
+        feat_channels=64,
+        use_direction_classifier=False,
+        diff_rad_by_sin=False,
+        pred_attrs=False,
+        pred_velo=False,
+        dir_offset=0,
+        strides=None,
+        group_reg_dims=(8, ),
+        cls_branch=(256, ),
+        reg_branch=((256, ), ),
+        num_attrs=0,
+        bbox_code_size=7,
+        dir_branch=(),
+        attr_branch=(),
+        bbox_coder=dict(
+            type='SMOKECoder',
+            base_depth=(28.01, 16.32),
+            base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63,
+                                                                1.53)),
+            code_size=7),
+        loss_cls=dict(type='mmdet.GaussianFocalLoss', loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.L1Loss', reduction='sum', loss_weight=1 / 300),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_attr=None,
+        conv_bias=True,
+        dcn_on_last_conv=False),
+    train_cfg=None,
+    test_cfg=dict(topK=100, local_maximum_kernel=3, max_per_img=100))
diff --git a/mmde/configs/_base_/models/spvcnn.py b/mmde/configs/_base_/models/spvcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5ee6efa66fd71da2c3d40d7ff76bf1a8ade5c7a
--- /dev/null
+++ b/mmde/configs/_base_/models/spvcnn.py
@@ -0,0 +1,34 @@
+model = dict(
+    type='MinkUNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_type='minkunet',
+        batch_first=False,
+        max_voxels=80000,
+        voxel_layer=dict(
+            max_num_points=-1,
+            point_cloud_range=[-100, -100, -20, 100, 100, 20],
+            voxel_size=[0.05, 0.05, 0.05],
+            max_voxels=(-1, -1))),
+    backbone=dict(
+        type='SPVCNNBackbone',
+        in_channels=4,
+        num_stages=4,
+        base_channels=32,
+        encoder_channels=[32, 64, 128, 256],
+        encoder_blocks=[2, 2, 2, 2],
+        decoder_channels=[256, 128, 96, 96],
+        decoder_blocks=[2, 2, 2, 2],
+        block_type='basic',
+        sparseconv_backend='torchsparse',
+        drop_ratio=0.3),
+    decode_head=dict(
+        type='MinkUNetHead',
+        channels=96,
+        num_classes=19,
+        dropout_ratio=0,
+        loss_decode=dict(type='mmdet.CrossEntropyLoss', avg_non_ignore=True),
+        ignore_index=19),
+    train_cfg=dict(),
+    test_cfg=dict())
diff --git a/mmde/configs/_base_/models/votenet.py b/mmde/configs/_base_/models/votenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e72c12a31bb0dd4e23ff8bab5fbd00dc7ecf86d
--- /dev/null
+++ b/mmde/configs/_base_/models/votenet.py
@@ -0,0 +1,73 @@
+model = dict(
+    type='VoteNet',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=4,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                     (128, 128, 256)),
+        fp_channels=((256, 256), (256, 256)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    bbox_head=dict(
+        type='VoteHead',
+        vote_module_cfg=dict(
+            in_channels=256,
+            vote_per_seed=1,
+            gt_per_seed=3,
+            conv_channels=(256, 256),
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d'),
+            norm_feats=True,
+            vote_loss=dict(
+                type='ChamferDistance',
+                mode='l1',
+                reduction='none',
+                loss_dst_weight=10.0)),
+        vote_aggregation_cfg=dict(
+            type='PointSAModule',
+            num_point=256,
+            radius=0.3,
+            num_sample=16,
+            mlp_channels=[256, 128, 128, 128],
+            use_xyz=True,
+            normalize_xyz=True),
+        pred_layer_cfg=dict(
+            in_channels=128, shared_conv_channels=(128, 128), bias=True),
+        objectness_loss=dict(
+            type='mmdet.CrossEntropyLoss',
+            class_weight=[0.2, 0.8],
+            reduction='sum',
+            loss_weight=5.0),
+        center_loss=dict(
+            type='ChamferDistance',
+            mode='l2',
+            reduction='sum',
+            loss_src_weight=10.0,
+            loss_dst_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum',
+            loss_weight=10.0 / 3.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote'),
+    test_cfg=dict(
+        sample_mode='seed',
+        nms_thr=0.25,
+        score_thr=0.05,
+        per_class_proposal=True))
diff --git a/mmde/configs/_base_/schedules/cosine.py b/mmde/configs/_base_/schedules/cosine.py
new file mode 100644
index 0000000000000000000000000000000000000000..d800bf8339aa3968ed6236027613ac661e23283f
--- /dev/null
+++ b/mmde/configs/_base_/schedules/cosine.py
@@ -0,0 +1,30 @@
+# This schedule is mainly used by models with dynamic voxelization
+# optimizer
+lr = 0.003  # max learning rate
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW', lr=lr, weight_decay=0.001, betas=(0.95, 0.99)),
+    clip_grad=dict(max_norm=10, norm_type=2),
+)
+
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000),
+    dict(
+        type='CosineAnnealingLR',
+        begin=0,
+        T_max=40,
+        end=40,
+        by_epoch=True,
+        eta_min=1e-5)
+]
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=40, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/configs/_base_/schedules/cyclic-20e.py b/mmde/configs/_base_/schedules/cyclic-20e.py
new file mode 100644
index 0000000000000000000000000000000000000000..caff691b4e088da36d4153a98cef2b35707d7626
--- /dev/null
+++ b/mmde/configs/_base_/schedules/cyclic-20e.py
@@ -0,0 +1,65 @@
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 20. Please change the interval accordingly if you do not
+# use a default schedule.
+# optimizer
+lr = 1e-4
+# This schedule is mainly used by models on nuScenes dataset
+# max_norm=10 is better for SECOND
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01),
+    clip_grad=dict(max_norm=35, norm_type=2))
+# learning rate
+param_scheduler = [
+    # learning rate scheduler
+    # During the first 8 epochs, learning rate increases from 0 to lr * 10
+    # during the next 12 epochs, learning rate decreases from lr * 10 to
+    # lr * 1e-4
+    dict(
+        type='CosineAnnealingLR',
+        T_max=8,
+        eta_min=lr * 10,
+        begin=0,
+        end=8,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=12,
+        eta_min=lr * 1e-4,
+        begin=8,
+        end=20,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=8,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=8,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=12,
+        eta_min=1,
+        begin=8,
+        end=20,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=20)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/_base_/schedules/cyclic-40e.py b/mmde/configs/_base_/schedules/cyclic-40e.py
new file mode 100644
index 0000000000000000000000000000000000000000..58618f62fd6908518c3f49fa949eefd9616e3029
--- /dev/null
+++ b/mmde/configs/_base_/schedules/cyclic-40e.py
@@ -0,0 +1,67 @@
+# The schedule is usually used by models trained on KITTI dataset
+# The learning rate set in the cyclic schedule is the initial learning rate
+# rather than the max learning rate. Since the target_ratio is (10, 1e-4),
+# the learning rate will change from 0.0018 to 0.018, than go to 0.0018*1e-4
+lr = 0.0018
+# The optimizer follows the setting in SECOND.Pytorch, but here we use
+# the official AdamW optimizer implemented by PyTorch.
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01),
+    clip_grad=dict(max_norm=10, norm_type=2))
+# learning rate
+param_scheduler = [
+    # learning rate scheduler
+    # During the first 16 epochs, learning rate increases from 0 to lr * 10
+    # during the next 24 epochs, learning rate decreases from lr * 10 to
+    # lr * 1e-4
+    dict(
+        type='CosineAnnealingLR',
+        T_max=16,
+        eta_min=lr * 10,
+        begin=0,
+        end=16,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=24,
+        eta_min=lr * 1e-4,
+        begin=16,
+        end=40,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 16 epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next 24 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=16,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=16,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=24,
+        eta_min=1,
+        begin=16,
+        end=40,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
+
+# Runtime settings，training schedule for 40e
+# Although the max_epochs is 40, this schedule is usually used we
+# RepeatDataset with repeat ratio N, thus the actual max epoch
+# number could be Nx40
+train_cfg = dict(by_epoch=True, max_epochs=40, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (6 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=48)
diff --git a/mmde/configs/_base_/schedules/mmdet-schedule-1x.py b/mmde/configs/_base_/schedules/mmdet-schedule-1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..95f30be74ff37080ba0d227d55bbd587feeaa892
--- /dev/null
+++ b/mmde/configs/_base_/schedules/mmdet-schedule-1x.py
@@ -0,0 +1,28 @@
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/configs/_base_/schedules/schedule-2x.py b/mmde/configs/_base_/schedules/schedule-2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6f0a9660d4952bd46122d594e7d25b812b22118
--- /dev/null
+++ b/mmde/configs/_base_/schedules/schedule-2x.py
@@ -0,0 +1,36 @@
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+lr = 0.001
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01),
+    # max_norm=10 is better for SECOND
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# training schedule for 2x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[20, 23],
+        gamma=0.1)
+]
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/_base_/schedules/schedule-3x.py b/mmde/configs/_base_/schedules/schedule-3x.py
new file mode 100644
index 0000000000000000000000000000000000000000..21dee3e659e8641affdcc6ef284a63ca126847a0
--- /dev/null
+++ b/mmde/configs/_base_/schedules/schedule-3x.py
@@ -0,0 +1,31 @@
+# optimizer
+# This schedule is mainly used by models on indoor dataset,
+# e.g., VoteNet on SUNRGBD and ScanNet
+lr = 0.008  # max learning rate
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01),
+    clip_grad=dict(max_norm=10, norm_type=2),
+)
+
+# training schedule for 3x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=36, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[24, 32],
+        gamma=0.1)
+]
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (4 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/_base_/schedules/seg-cosine-100e.py b/mmde/configs/_base_/schedules/seg-cosine-100e.py
new file mode 100644
index 0000000000000000000000000000000000000000..efc0754b9c656c02489c2a539179bdd6bb413a9f
--- /dev/null
+++ b/mmde/configs/_base_/schedules/seg-cosine-100e.py
@@ -0,0 +1,27 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.001),
+    clip_grad=None)
+
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=100,
+        eta_min=1e-5,
+        by_epoch=True,
+        begin=0,
+        end=100)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (4 GPUs) x (32 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/mmde/configs/_base_/schedules/seg-cosine-150e.py b/mmde/configs/_base_/schedules/seg-cosine-150e.py
new file mode 100644
index 0000000000000000000000000000000000000000..91190173ad17d24cc8c102285de89680b7306c6f
--- /dev/null
+++ b/mmde/configs/_base_/schedules/seg-cosine-150e.py
@@ -0,0 +1,27 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0001),
+    clip_grad=None)
+
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=150,
+        eta_min=0.002,
+        by_epoch=True,
+        begin=0,
+        end=150)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=150, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=64)
diff --git a/mmde/configs/_base_/schedules/seg-cosine-200e.py b/mmde/configs/_base_/schedules/seg-cosine-200e.py
new file mode 100644
index 0000000000000000000000000000000000000000..a702168ba56b3b9114205f108db1befbecc02362
--- /dev/null
+++ b/mmde/configs/_base_/schedules/seg-cosine-200e.py
@@ -0,0 +1,27 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='Adam', lr=0.001, weight_decay=0.01),
+    clip_grad=None)
+
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=200,
+        eta_min=1e-5,
+        by_epoch=True,
+        begin=0,
+        end=200)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=200, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (2 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/_base_/schedules/seg-cosine-50e.py b/mmde/configs/_base_/schedules/seg-cosine-50e.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd31219f7a5af741db7d7a6c5ac19b2963461f65
--- /dev/null
+++ b/mmde/configs/_base_/schedules/seg-cosine-50e.py
@@ -0,0 +1,27 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='Adam', lr=0.001, weight_decay=0.001),
+    clip_grad=None)
+
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=50,
+        eta_min=1e-5,
+        by_epoch=True,
+        begin=0,
+        end=50)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=50, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (2 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py b/mmde/configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..baec55f945a72d3b8a822bbe004676a755b8c721
--- /dev/null
+++ b/mmde/configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py
@@ -0,0 +1,386 @@
+# model settings
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]  # velodyne coordinates, x, y, z
+
+model = dict(
+    type='PartA2',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=5,  # max_points_per_voxel
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(type='HardSimpleVFE'),
+    middle_encoder=dict(
+        type='SparseUNet',
+        in_channels=4,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act')),
+    backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    rpn_head=dict(
+        type='PartA2RPNHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -1.78, 70.4, 40.0, -1.78]],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        assigner_per_size=True,
+        assign_per_class=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    roi_head=dict(
+        type='PartAggregationROIHead',
+        num_classes=3,
+        semantic_head=dict(
+            type='PointwiseSemanticHead',
+            in_channels=16,
+            extra_width=0.2,
+            seg_score_thr=0.3,
+            num_classes=3,
+            loss_seg=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_part=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                loss_weight=1.0)),
+        seg_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='max')),
+        bbox_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='avg')),
+        bbox_head=dict(
+            type='PartA2BboxHead',
+            num_classes=3,
+            seg_in_channels=16,
+            part_in_channels=4,
+            seg_conv_channels=[64, 64],
+            part_conv_channels=[64, 64],
+            merge_conv_channels=[128, 128],
+            down_conv_channels=[128, 256],
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            shared_fc_channels=[256, 512, 512, 512],
+            cls_channels=[256, 256],
+            reg_channels=[256, 256],
+            dropout_ratio=0.1,
+            roi_feat_size=14,
+            with_corner_loss=True,
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss',
+                beta=1.0 / 9.0,
+                reduction='sum',
+                loss_weight=1.0),
+            loss_cls=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1)
+            ],
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=9000,
+            nms_post=512,
+            max_num=512,
+            nms_thr=0.8,
+            score_thr=0,
+            use_rotate_nms=False),
+        rcnn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1)
+            ],
+            sampler=dict(
+                type='IoUNegPiecewiseSampler',
+                num=128,
+                pos_fraction=0.55,
+                neg_piece_fractions=[0.8, 0.2],
+                neg_iou_piece_thrs=[0.55, 0.1],
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False,
+                return_iou=True),
+            cls_pos_thr=0.75,
+            cls_neg_thr=0.25)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1024,
+            nms_post=100,
+            max_num=100,
+            nms_thr=0.7,
+            score_thr=0,
+            use_rotate_nms=True),
+        rcnn=dict(
+            use_rotate_nms=True,
+            use_raw_score=True,
+            nms_thr=0.01,
+            score_thr=0.1)))
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(classes=class_names)
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    classes=class_names,
+    sample_groups=dict(Car=20, Pedestrian=15, Cyclist=15))
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+]
+test_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='kitti_infos_train.pkl',
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox')
+test_evaluator = val_evaluator
+
+# optimizer
+lr = 0.001  # max learning rate
+epoch_num = 80
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01),
+    clip_grad=dict(max_norm=10, norm_type=2))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.4,
+        eta_min=lr * 10,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.6,
+        eta_min=lr * 1e-4,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.4,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.6,
+        eta_min=1,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        convert_to_iter_based=True)
+]
+
+train_cfg = dict(by_epoch=True, max_epochs=epoch_num, val_interval=50)
+val_cfg = dict()
+test_cfg = dict()
+auto_scale_lr = dict(enable=False, base_batch_size=32)
+
+default_scope = 'mmdet3d'
+
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='Det3DVisualizationHook'))
+
+custom_hooks = [
+    dict(type='BenchmarkHook'),
+]
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+find_unused_parameters = True
+work_dir = './work_dirs/parta2_secfpn_80e'
diff --git a/mmde/configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py b/mmde/configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc59480e6e9a284eac19abeddc906a38c7e9be16
--- /dev/null
+++ b/mmde/configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py
@@ -0,0 +1,248 @@
+# model settings
+voxel_size = [0.16, 0.16, 4]
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=64,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(12000, 20000))),
+    voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range),
+    middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+    backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=1,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]],
+            sizes=[[3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=True),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='Max3DIoUAssigner',
+            iou_calculator=dict(type='BboxOverlapsNearest3D'),
+            pos_iou_thr=0.6,
+            neg_iou_thr=0.45,
+            min_pos_iou=0.45,
+            ignore_iof_thr=-1),
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+metainfo = dict(classes=class_names)
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    sample_groups=dict(Car=15),
+    classes=class_names)
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[0.25, 0.25, 0.25],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.15707963267, 0.15707963267]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+]
+test_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=3,
+    num_workers=3,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(pts='training/velodyne_reduced'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR')))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox')
+test_evaluator = val_evaluator
+
+# optimizer
+lr = 0.001  # max learning rate
+epoch_num = 50
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01),
+    clip_grad=dict(max_norm=10, norm_type=2))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.4,
+        eta_min=lr * 10,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.6,
+        eta_min=lr * 1e-4,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.4,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.6,
+        eta_min=1,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        convert_to_iter_based=True)
+]
+
+train_cfg = dict(by_epoch=True, max_epochs=epoch_num, val_interval=50)
+val_cfg = dict()
+test_cfg = dict()
+auto_scale_lr = dict(enable=False, base_batch_size=24)
+
+default_scope = 'mmdet3d'
+
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='Det3DVisualizationHook'))
+
+custom_hooks = [
+    dict(type='BenchmarkHook'),
+]
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+work_dir = './work_dirs/pp_secfpn_100e'
diff --git a/mmde/configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py b/mmde/configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..01dc8b5cca5f87ba1d01239bb92509f8ec3ba7b0
--- /dev/null
+++ b/mmde/configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py
@@ -0,0 +1,291 @@
+# model settings
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
+voxel_size = [0.16, 0.16, 4]
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=32,  # max_points_per_voxel
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+    ),
+    middle_encoder=dict(
+        type='PointPillarsScatter',
+        in_channels=64,
+        output_shape=[496, 432],
+    ),
+    backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256],
+    ),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128],
+    ),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2),
+    ),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(classes=class_names)
+
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(
+            Car=5,
+            Pedestrian=5,
+            Cyclist=5,
+        )),
+    classes=class_names,
+    sample_groups=dict(
+        Car=15,
+        Pedestrian=15,
+        Cyclist=15,
+    ))
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+]
+test_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='kitti_infos_train.pkl',
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox')
+test_evaluator = val_evaluator
+
+# optimizer
+lr = 0.0003  # max learning rate
+epoch_num = 80
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01),
+    clip_grad=dict(max_norm=10, norm_type=2))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.4,
+        eta_min=lr * 10,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.6,
+        eta_min=lr * 1e-4,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.4,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.6,
+        eta_min=1,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        convert_to_iter_based=True)
+]
+
+train_cfg = dict(by_epoch=True, max_epochs=epoch_num, val_interval=50)
+val_cfg = dict()
+test_cfg = dict()
+auto_scale_lr = dict(enable=False, base_batch_size=32)
+
+default_scope = 'mmdet3d'
+
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='Det3DVisualizationHook'))
+
+custom_hooks = [
+    dict(type='BenchmarkHook'),
+]
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+work_dir = './work_dirs/pp_secfpn_80e'
diff --git a/mmde/configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py b/mmde/configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8dd0d24709c0819c580e78a0b8880fbfd0cd2cd
--- /dev/null
+++ b/mmde/configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py
@@ -0,0 +1,281 @@
+# model settings
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=5,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(type='HardSimpleVFE'),
+    middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=4,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act')),
+    backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(classes=class_names)
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(
+            Car=5,
+            Pedestrian=5,
+            Cyclist=5,
+        )),
+    classes=class_names,
+    sample_groups=dict(
+        Car=20,
+        Pedestrian=15,
+        Cyclist=15,
+    ))
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='kitti_infos_train.pkl',
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox')
+test_evaluator = val_evaluator
+
+# optimizer
+lr = 0.0003  # max learning rate
+epoch_num = 80
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01),
+    clip_grad=dict(max_norm=10, norm_type=2))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.4,
+        eta_min=lr * 10,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.6,
+        eta_min=lr * 1e-4,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.4,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.6,
+        eta_min=1,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        convert_to_iter_based=True)
+]
+
+train_cfg = dict(by_epoch=True, max_epochs=epoch_num, val_interval=50)
+val_cfg = dict()
+test_cfg = dict()
+auto_scale_lr = dict(enable=False, base_batch_size=32)
+
+default_scope = 'mmdet3d'
+
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='Det3DVisualizationHook'))
+
+custom_hooks = [
+    dict(type='BenchmarkHook'),
+]
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+work_dir = './work_dirs/pp_secfpn_100e'
diff --git a/mmde/configs/centerpoint/README.md b/mmde/configs/centerpoint/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6fd7af75cad0886ac454be835725a91ce4477eac
--- /dev/null
+++ b/mmde/configs/centerpoint/README.md
@@ -0,0 +1,136 @@
+# Center-based 3D Object Detection and Tracking
+
+> [Center-based 3D Object Detection and Tracking](https://arxiv.org/abs/2006.11275)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Three-dimensional objects are commonly represented as 3D boxes in a point-cloud. This representation mimics the well-studied image-based 2D bounding-box detection but comes with additional challenges. Objects in a 3D world do not follow any particular orientation, and box-based detectors have difficulties enumerating all orientations or fitting an axis-aligned bounding box to rotated objects. In this paper, we instead propose to represent, detect, and track 3D objects as points. Our framework, CenterPoint, first detects centers of objects using a keypoint detector and regresses to other attributes, including 3D size, 3D orientation, and velocity. In a second stage, it refines these estimates using additional point features on the object. In CenterPoint, 3D object tracking simplifies to greedy closest-point matching. The resulting detection and tracking algorithm is simple, efficient, and effective. CenterPoint achieved state-of-the-art performance on the nuScenes benchmark for both 3D detection and tracking, with 65.5 NDS and 63.8 AMOTA for a single model. On the Waymo Open Dataset, CenterPoint outperforms all previous single model method by a large margin and ranks first among all Lidar-only submissions.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/30491025/143854976-11af75ae-e828-43ad-835d-ac1146f99925.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement CenterPoint and provide the result and checkpoints on nuScenes dataset.
+
+We follow the below style to name config files. Contributors are advised to follow the same style.
+`{xxx}` is required field and `[yyy]` is optional.
+
+`{model}`: model type like `centerpoint`.
+
+`{model setting}`: voxel size and voxel type like `01voxel`, `02pillar`.
+
+`{backbone}`: backbone type like `second`.
+
+`{neck}`: neck type like `secfpn`.
+
+`[dcn]`: Whether to use deformable convolution.
+
+`[circle]`: Whether to use circular nms.
+
+`[batch_per_gpu x gpu]`: GPUs and samples per GPU, 4x8 is used by default.
+
+`{schedule}`: training schedule, options are 1x, 2x, 20e, etc. 1x and 2x means 12 epochs and 24 epochs respectively. 20e is adopted in cascade models, which denotes 20 epochs. For 1x/2x, initial learning rate decays by a factor of 10 at the 8/16th and 11/22th epochs. For 20e, initial learning rate decays by a factor of 10 at the 16th and 19th epochs.
+
+`{dataset}`: dataset like nus-3d, kitti-3d, lyft-3d, scannet-3d, sunrgbd-3d. We also indicate the number of classes we are using if there exist multiple settings, e.g., kitti-3d-3class and kitti-3d-car means training on KITTI dataset with 3 classes and single class, respectively.
+
+## Usage
+
+### Test time augmentation
+
+We have supported double-flip and scale augmentation during test time. To use test time augmentation, users need to modify the
+`test_pipeline` and `test_cfg` in the config.
+For example, we change `centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py` to the following.
+
+```python
+_base_ = './centerpoint_0075voxel_second_secfpn_circlenms' \
+         '_4x8_cyclic_20e_nus.py'
+
+model = dict(
+    test_cfg=dict(
+        pts=dict(
+            use_rotate_nms=True,
+            max_num=83)))
+
+point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
+backend_args = None
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args,
+        pad_empty_sweeps=True,
+        remove_close=True),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=[0.95, 1.0, 1.05],
+        flip=True,
+        pcd_horizontal_flip=True,
+        pcd_vertical_flip=True,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D', sync_2d=False),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+data = dict(
+    val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline))
+
+```
+
+## Results and models
+
+### CenterPoint
+
+|                                           Backbone                                           | Voxel type (voxel size) | Dcn | Circular nms | Mem (GB) | Inf time (fps) |  mAP  |  NDS  |                                                                                                                                                                                                                                               Download                                                                                                                                                                                                                                                |
+| :------------------------------------------------------------------------------------------: | :---------------------: | :-: | :----------: | :------: | :------------: | :---: | :---: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    [SECFPN](./centerpoint_voxel01_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py)    |       voxel (0.1)       |  ✗  |      ✓       |   5.2    |                | 56.11 | 64.61 |             [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20220810_030004-9061688e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20220810_030004.log)             |
+|                                     above w/o circle nms                                     |       voxel (0.1)       |  ✗  |      ✗       |          |                |   x   |   x   |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|  [SECFPN](./centerpoint_voxel01_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py)  |       voxel (0.1)       |  ✓  |      ✓       |   5.5    |                | 56.10 | 64.69 |     [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20220810_052355-a6928835.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20220810_052355.log)     |
+|                                     above w/o circle nms                                     |       voxel (0.1)       |  ✓  |      ✗       |          |                |   x   |   x   |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|   [SECFPN](./centerpoint_voxel0075_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py)   |      voxel (0.075)      |  ✗  |      ✓       |   8.2    |                | 56.54 | 65.17 |         [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20220810_011659-04cb3a3b.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20220810_011659.log)         |
+|                                     above w/o circle nms                                     |      voxel (0.075)      |  ✗  |      ✗       |          |                | 57.63 | 65.39 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| [SECFPN](./centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py) |      voxel (0.075)      |  ✓  |      ✓       |   8.7    |                | 56.92 | 65.27 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20220810_025930-657f67e0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20220810_025930.log) |
+|                                     above w/o circle nms                                     |      voxel (0.075)      |  ✓  |      ✗       |          |                | 57.43 | 65.63 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|                                     above w/ double flip                                     |      voxel (0.075)      |  ✓  |      ✗       |          |                | 59.73 | 67.39 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|                                      above w/ scale tta                                      |      voxel (0.075)      |  ✓  |      ✗       |          |                | 60.43 | 67.65 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|                              above w/ circle nms w/o scale tta                               |      voxel (0.075)      |  ✓  |      ✗       |          |                | 59.52 | 67.24 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|   [SECFPN](./centerpoint_pillar02_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py)    |      pillar (0.2)       |  ✗  |      ✓       |   4.6    |                | 48.70 | 59.62 |           [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus_20220811_031844-191a3822.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus_20220811_031844.log)           |
+|                                     above w/o circle nms                                     |      pillar (0.2)       |  ✗  |      ✗       |          |                | 49.12 | 59.66 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|      [SECFPN](./centerpoint_pillar02_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py)       |      pillar (0.2)       |  ✓  |      ✗       |   4.9    |                | 48.38 | 59.79 |                       [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20220811_045458-808e69ad.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20220811_045458.log)                       |
+|                                     above w/ circle nms                                      |      pillar (0.2)       |  ✓  |      ✓       |          |                | 48.79 | 59.65 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+
+**Note:** The model performance after coordinate refactor is slightly different (+/- 0.5 - 1 mAP/NDS) from the performance before coordinate refactor in v0.x branch. We are exploring the reason behind.                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+
+## Citation
+
+```latex
+@article{yin2021center,
+  title={Center-based 3D Object Detection and Tracking},
+  author={Yin, Tianwei and Zhou, Xingyi and Kr{\"a}henb{\"u}hl, Philipp},
+  journal={CVPR},
+  year={2021},
+}
+```
diff --git a/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6854b22b9249b5afb1468439a15a6ca80c7a38e
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,159 @@
+_base_ = [
+    '../_base_/datasets/nus-3d.py',
+    '../_base_/models/centerpoint_pillar02_second_secfpn_nus.py',
+    '../_base_/schedules/cyclic-20e.py', '../_base_/default_runtime.py'
+]
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_prefix = dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP')
+model = dict(
+    data_preprocessor=dict(
+        voxel_layer=dict(point_cloud_range=point_cloud_range)),
+    pts_voxel_encoder=dict(point_cloud_range=point_cloud_range),
+    pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)),
+    test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2])))
+
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(
+            car=5,
+            truck=5,
+            bus=5,
+            trailer=5,
+            construction_vehicle=5,
+            traffic_cone=5,
+            barrier=5,
+            motorcycle=5,
+            bicycle=5,
+            pedestrian=5)),
+    classes=class_names,
+    sample_groups=dict(
+        car=2,
+        truck=3,
+        construction_vehicle=7,
+        bus=4,
+        trailer=6,
+        barrier=2,
+        motorcycle=6,
+        bicycle=6,
+        pedestrian=2,
+        traffic_cone=2),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D')
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    _delete_=True,
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='CBGSDataset',
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='nuscenes_infos_train.pkl',
+            pipeline=train_pipeline,
+            metainfo=dict(classes=class_names),
+            test_mode=False,
+            data_prefix=data_prefix,
+            use_valid_flag=True,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+test_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+
+train_cfg = dict(val_interval=20)
diff --git a/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f6b4b4b34481fa1a5891b0ae5e655874a1f125f
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,3 @@
+_base_ = ['./centerpoint_pillar02_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+model = dict(test_cfg=dict(pts=dict(nms_type='circle')))
diff --git a/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..974aa63407d5c2f63b5e916dd5eeb3a8782f4ea6
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,16 @@
+_base_ = ['./centerpoint_pillar02_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+model = dict(
+    pts_bbox_head=dict(
+        separate_head=dict(
+            type='DCNSeparateHead',
+            dcn_config=dict(
+                type='DCN',
+                in_channels=64,
+                out_channels=64,
+                kernel_size=3,
+                padding=1,
+                groups=4),
+            init_bias=-2.19,
+            final_kernel=3)),
+    test_cfg=dict(pts=dict(nms_type='circle')))
diff --git a/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d1466a0e9717c5026c79fd44d3f30762d5bc408
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,15 @@
+_base_ = ['./centerpoint_pillar02_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+model = dict(
+    pts_bbox_head=dict(
+        separate_head=dict(
+            type='DCNSeparateHead',
+            dcn_config=dict(
+                type='DCN',
+                in_channels=64,
+                out_channels=64,
+                kernel_size=3,
+                padding=1,
+                groups=4),
+            init_bias=-2.19,
+            final_kernel=3)))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7d675598e933d8f988e8fe6815cf218bb118d19
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,145 @@
+_base_ = ['./centerpoint_voxel01_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+voxel_size = [0.075, 0.075, 0.2]
+point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-54, -54.8, -5.0, 54, 53.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_prefix = dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP')
+model = dict(
+    data_preprocessor=dict(
+        voxel_layer=dict(
+            voxel_size=voxel_size, point_cloud_range=point_cloud_range)),
+    pts_middle_encoder=dict(sparse_shape=[41, 1440, 1440]),
+    pts_bbox_head=dict(
+        bbox_coder=dict(
+            voxel_size=voxel_size[:2], pc_range=point_cloud_range[:2])),
+    train_cfg=dict(
+        pts=dict(
+            grid_size=[1440, 1440, 40],
+            voxel_size=voxel_size,
+            point_cloud_range=point_cloud_range)),
+    test_cfg=dict(
+        pts=dict(voxel_size=voxel_size[:2], pc_range=point_cloud_range[:2])))
+
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(
+            car=5,
+            truck=5,
+            bus=5,
+            trailer=5,
+            construction_vehicle=5,
+            traffic_cone=5,
+            barrier=5,
+            motorcycle=5,
+            bicycle=5,
+            pedestrian=5)),
+    classes=class_names,
+    sample_groups=dict(
+        car=2,
+        truck=3,
+        construction_vehicle=7,
+        bus=4,
+        trailer=6,
+        barrier=2,
+        motorcycle=6,
+        bicycle=6,
+        pedestrian=2,
+        traffic_cone=2),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    dataset=dict(
+        dataset=dict(
+            pipeline=train_pipeline, metainfo=dict(classes=class_names))))
+test_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..46280c6e089c9e562f234bdc5063c92906807839
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,3 @@
+_base_ = ['./centerpoint_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+model = dict(test_cfg=dict(pts=dict(nms_type='circle')))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..035cfc24252579657c9ceef8ab14a6a293a86b28
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,16 @@
+_base_ = ['./centerpoint_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+model = dict(
+    pts_bbox_head=dict(
+        separate_head=dict(
+            type='DCNSeparateHead',
+            dcn_config=dict(
+                type='DCN',
+                in_channels=64,
+                out_channels=64,
+                kernel_size=3,
+                padding=1,
+                groups=4),
+            init_bias=-2.19,
+            final_kernel=3)),
+    test_cfg=dict(pts=dict(nms_type='circle')))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-flip-tta-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-flip-tta-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ee8deda062725ac9cb1315399b87ea3bc01032a
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-flip-tta-cyclic-20e_nus-3d.py
@@ -0,0 +1,49 @@
+_base_ = './centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py'  # noqa: E501
+
+point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-54, -54.8, -5.0, 54, 53.2, 3.0]
+backend_args = None
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        # Add double-flip augmentation
+        flip=True,
+        pcd_horizontal_flip=True,
+        pcd_vertical_flip=True,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D', sync_2d=False),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+data = dict(
+    val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..185676b6b91e7028f32bf83e4cd4d7c18a9d8be4
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,15 @@
+_base_ = ['./centerpoint_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+model = dict(
+    pts_bbox_head=dict(
+        separate_head=dict(
+            type='DCNSeparateHead',
+            dcn_config=dict(
+                type='DCN',
+                in_channels=64,
+                out_channels=64,
+                kernel_size=3,
+                padding=1,
+                groups=4),
+            init_bias=-2.19,
+            final_kernel=3)))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-flip-tta-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-flip-tta-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd49fb3f93149514b5adea18caf7512a57513a06
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-flip-tta-cyclic-20e_nus-3d.py
@@ -0,0 +1,50 @@
+_base_ = \
+    './centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py'
+
+point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-54, -54.8, -5.0, 54, 53.2, 3.0]
+backend_args = None
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        # Add double-flip augmentation
+        flip=True,
+        pcd_horizontal_flip=True,
+        pcd_vertical_flip=True,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D', sync_2d=False),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+data = dict(
+    val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-tta-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-tta-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..327bb0a990a6ba7dd2386041809cd3223e8fb37d
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-tta-cyclic-20e_nus-3d.py
@@ -0,0 +1,52 @@
+_base_ = \
+    './centerpoint_voxel0075_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py'
+
+model = dict(test_cfg=dict(pts=dict(use_rotate_nms=True, max_num=500)))
+
+point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-54, -54.8, -5.0, 54, 53.2, 3.0]
+backend_args = None
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=[0.95, 1.0, 1.05],
+        # Add double-flip augmentation
+        flip=True,
+        pcd_horizontal_flip=True,
+        pcd_vertical_flip=True,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D', sync_2d=False),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+data = dict(
+    val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a44c14d5104a2226f523a16568d03c668f423ce
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,160 @@
+_base_ = [
+    '../_base_/datasets/nus-3d.py',
+    '../_base_/models/centerpoint_voxel01_second_secfpn_nus.py',
+    '../_base_/schedules/cyclic-20e.py', '../_base_/default_runtime.py'
+]
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_prefix = dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP')
+model = dict(
+    data_preprocessor=dict(
+        voxel_layer=dict(point_cloud_range=point_cloud_range)),
+    pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)),
+    test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2])))
+
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(
+            car=5,
+            truck=5,
+            bus=5,
+            trailer=5,
+            construction_vehicle=5,
+            traffic_cone=5,
+            barrier=5,
+            motorcycle=5,
+            bicycle=5,
+            pedestrian=5)),
+    classes=class_names,
+    sample_groups=dict(
+        car=2,
+        truck=3,
+        construction_vehicle=7,
+        bus=4,
+        trailer=6,
+        barrier=2,
+        motorcycle=6,
+        bicycle=6,
+        pedestrian=2,
+        traffic_cone=2),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    _delete_=True,
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='CBGSDataset',
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='nuscenes_infos_train.pkl',
+            pipeline=train_pipeline,
+            metainfo=dict(classes=class_names),
+            test_mode=False,
+            data_prefix=data_prefix,
+            use_valid_flag=True,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+test_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+
+train_cfg = dict(val_interval=20)
diff --git a/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..47c552a5eb9e64a84fe09c4389918252da0cecec
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,3 @@
+_base_ = ['./centerpoint_voxel01_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+model = dict(test_cfg=dict(pts=dict(nms_type='circle')))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d684516e0393229c25d63715b236dc2b66e1a952
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,16 @@
+_base_ = ['./centerpoint_voxel01_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+model = dict(
+    pts_bbox_head=dict(
+        separate_head=dict(
+            type='DCNSeparateHead',
+            dcn_config=dict(
+                type='DCN',
+                in_channels=64,
+                out_channels=64,
+                kernel_size=3,
+                padding=1,
+                groups=4),
+            init_bias=-2.19,
+            final_kernel=3)),
+    test_cfg=dict(pts=dict(nms_type='circle')))
diff --git a/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py b/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd571badfcf0ad38b0ace72df48830f778b800bb
--- /dev/null
+++ b/mmde/configs/centerpoint/centerpoint_voxel01_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,15 @@
+_base_ = ['./centerpoint_voxel01_second_secfpn_8xb4-cyclic-20e_nus-3d.py']
+
+model = dict(
+    pts_bbox_head=dict(
+        separate_head=dict(
+            type='DCNSeparateHead',
+            dcn_config=dict(
+                type='DCN',
+                in_channels=64,
+                out_channels=64,
+                kernel_size=3,
+                padding=1,
+                groups=4),
+            init_bias=-2.19,
+            final_kernel=3)))
diff --git a/mmde/configs/centerpoint/metafile.yml b/mmde/configs/centerpoint/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..aca5f7c7bbd42f0114a249617db0f5bace3b028d
--- /dev/null
+++ b/mmde/configs/centerpoint/metafile.yml
@@ -0,0 +1,95 @@
+Collections:
+  - Name: CenterPoint
+    Metadata:
+      Training Data: nuScenes
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Hard Voxelization
+    Paper:
+      URL: https://arxiv.org/abs/2006.11275
+      Title: 'Center-based 3D Object Detection and Tracking'
+    README: configs/centerpoint/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/centerpoint.py#L10
+      Version: v0.6.0
+
+Models:
+  - Name: centerpoint_voxel01_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d
+    In Collection: CenterPoint
+    Config: configs/centerpoint/centerpoint_voxel01_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py
+    metadata:
+      Training Memory (GB): 5.2
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 56.11
+          NDS: 64.61
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20220810_030004-9061688e.pth
+
+  - Name: centerpoint_voxel01_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d
+    In Collection: CenterPoint
+    Config: configs/centerpoint/centerpoint_voxel01_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
+    Metadata:
+      Training Memory (GB): 5.5
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 56.10
+          NDS: 64.69
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20220810_052355-a6928835.pth
+
+  - Name: centerpoint_voxel0075_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d
+    In Collection: CenterPoint
+    Config: configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py
+    Metadata:
+      Training Memory (GB): 8.2
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 56.54
+          NDS: 65.17
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20220810_011659-04cb3a3b.pth
+
+  - Name: centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d
+    In Collection: CenterPoint
+    Config: configs/centerpoint/centerpoint_voxel0075_second_secfpn_head-dcn-circlenms_8xb4-cyclic-20e_nus-3d.py
+    Metadata:
+      Training Memory (GB): 8.7
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 56.92
+          NDS: 65.27
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20220810_025930-657f67e0.pth
+
+  - Name: centerpoint_pillar02_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d
+    In Collection: CenterPoint
+    Config: configs/centerpoint/centerpoint_pillar02_second_secfpn_head-circlenms_8xb4-cyclic-20e_nus-3d.py
+    Metadata:
+      Training Memory (GB): 4.6
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 48.70
+          NDS: 59.62
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus_20220811_031844-191a3822.pth
+
+  - Name: centerpoint_pillar02_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d
+    In Collection: CenterPoint
+    Config: configs/centerpoint/centerpoint_pillar02_second_secfpn_head-dcn_8xb4-cyclic-20e_nus-3d.py
+    Metadata:
+      Training Memory (GB): 4.9
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 48.38
+          NDS: 59.79
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20220811_045458-808e69ad.pth
diff --git a/mmde/configs/cylinder3d/README.md b/mmde/configs/cylinder3d/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..366cd4539d6840fcb6b2ceefe6a31caf3aba130d
--- /dev/null
+++ b/mmde/configs/cylinder3d/README.md
@@ -0,0 +1,38 @@
+# Cylindrical and Asymmetrical 3D Convolution Networks for LiDAR Segmentation
+
+> [Cylindrical and Asymmetrical 3D Convolution Networks for LiDAR Segmentation](https://arxiv.org/abs/2011.10033)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+State-of-the-art methods for large-scale driving-scene LiDAR segmentation often project the point clouds to 2D space and then process them via 2D convolution. Although this corporation shows the competitiveness in the point cloud, it inevitably alters and abandons the 3D topology and geometric relations. A natural remedy is to utilize the3D voxelization and 3D convolution network. However, we found that in the outdoor point cloud, the improvement obtained in this way is quite limited. An important reason is the property of the outdoor point cloud, namely sparsity and varying density. Motivated by this investigation, we propose a new framework for the outdoor LiDAR segmentation, where cylindrical partition and asymmetrical 3D convolution networks are designed to explore the 3D geometric pat-tern while maintaining these inherent properties. Moreover, a point-wise refinement module is introduced to alleviate the interference of lossy voxel-based label encoding. We evaluate the proposed model on two large-scale datasets, i.e., SemanticKITTI and nuScenes. Our method achieves the 1st place in the leaderboard of SemanticKITTI and outperforms existing methods on nuScenes with a noticeable margin, about 4%. Furthermore, the proposed 3D framework also generalizes well to LiDAR panoptic segmentation and LiDAR 3D detection.
+
+![overview](https://user-images.githubusercontent.com/45515569/228523861-2923082c-37d9-4d4f-aa59-746a8d9284c2.png)
+
+## Introduction
+
+We implement Cylinder3D and provide the result and checkpoints on Semantickitti datasets.
+
+## Results and models
+
+### SemanticKITTI
+
+|                               Method                                | Lr schd | Laser-Polar Mix | Mem (GB) |   mIoU   |                                                                                                                                                                       Download                                                                                                                                                                       |
+| :-----------------------------------------------------------------: | :-----: | :-------------: | :------: | :------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [Cylinder3D](./cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py) |   3x    |        ✗        |   10.2   | 63.1±0.5 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/cylinder3d/cylinder3d_4xb4_3x_semantickitti/cylinder3d_4xb4_3x_semantickitti_20230318_191107-822a8c31.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/cylinder3d/cylinder3d_4xb4_3x_semantickitti/cylinder3d_4xb4_3x_semantickitti_20230318_191107.json) |
+| [Cylinder3D](./cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py) |   3x    |        ✔        |   12.8   |   67.0   |              [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/cylinder3d/cylinder3d_8xb2-amp-laser-polar-mix-3x_semantickitti_20230425_144950-372cdf69.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/cylinder3d/cylinder3d_8xb2-amp-laser-polar-mix-3x_semantickitti_20230425_144950.log)               |
+
+Note: We reproduce the performance comparable with its [official repo](https://github.com/xinge008/Cylinder3D). It's slightly lower than the performance (65.9 mIOU) reported in the paper due to the lack of point-wise refinement and shorter training time.
+
+## Citation
+
+```latex
+@inproceedings{zhu2021cylindrical,
+  title={Cylindrical and asymmetrical 3d convolution networks for lidar segmentation},
+  author={Zhu, Xinge and Zhou, Hui and Wang, Tai and Hong, Fangzhou and Ma, Yuexin and Li, Wei and Li, Hongsheng and Lin, Dahua},
+  booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
+  pages={9939--9948},
+  year={2021}
+}
+```
diff --git a/mmde/configs/cylinder3d/cylinder3d_4xb4-3x_semantickitti.py b/mmde/configs/cylinder3d/cylinder3d_4xb4-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..c54118a6fbf702a14a808d4a33a4a4d1753aa4b7
--- /dev/null
+++ b/mmde/configs/cylinder3d/cylinder3d_4xb4-3x_semantickitti.py
@@ -0,0 +1,38 @@
+_base_ = [
+    '../_base_/datasets/semantickitti.py', '../_base_/models/cylinder3d.py',
+    '../_base_/default_runtime.py'
+]
+
+# optimizer
+lr = 0.001
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01))
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=36, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[30],
+        gamma=0.1)
+]
+
+train_dataloader = dict(batch_size=4, )
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
+# auto_scale_lr = dict(enable=False, base_batch_size=32)
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=5))
diff --git a/mmde/configs/cylinder3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py b/mmde/configs/cylinder3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..66e891bcd13bb0c15b95d451f0ec039640d5420f
--- /dev/null
+++ b/mmde/configs/cylinder3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py
@@ -0,0 +1,80 @@
+_base_ = [
+    '../_base_/datasets/semantickitti.py', '../_base_/models/cylinder3d.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti'),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='LaserMix',
+                    num_areas=[3, 4, 5, 6],
+                    pitch_angles=[-25, 3],
+                    pre_transform=[
+                        dict(
+                            type='LoadPointsFromFile',
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type='LoadAnnotations3D',
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type='PointSegClassMapping')
+                    ],
+                    prob=1)
+            ],
+            [
+                dict(
+                    type='PolarMix',
+                    instance_classes=[0, 1, 2, 3, 4, 5, 6, 7],
+                    swap_ratio=0.5,
+                    rotate_paste_ratio=1.0,
+                    pre_transform=[
+                        dict(
+                            type='LoadPointsFromFile',
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type='LoadAnnotations3D',
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type='PointSegClassMapping')
+                    ],
+                    prob=1)
+            ],
+        ],
+        prob=[0.5, 0.5]),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[0., 6.28318531],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0],
+    ),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
diff --git a/mmde/configs/cylinder3d/metafile.yml b/mmde/configs/cylinder3d/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e24e66b8006f0047f2c2c4a37da63a850af7f241
--- /dev/null
+++ b/mmde/configs/cylinder3d/metafile.yml
@@ -0,0 +1,42 @@
+Collections:
+  - Name: Cylinder3D
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Training Resources: 4x A100 GPUs
+      Architecture:
+        - Cylinder3D
+    Paper:
+      URL: https://arxiv.org/abs/2011.10033
+      Title: 'Cylindrical and Asymmetrical 3D Convolution Networks for LiDAR Segmentation'
+    README: configs/cylinder3d/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/segmentors/cylinder3d.py#L13
+      Version: v1.1.0
+
+Models:
+  - Name: cylinder3d_4xb4-3x_semantickitti
+    In Collection: Cylinder3D
+    Config: configs/cylinder3d/cylinder3d_4xb4_3x_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 10.2
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 63.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/cylinder3d/cylinder3d_4xb4_3x_semantickitti/cylinder3d_4xb4_3x_semantickitti_20230318_191107-822a8c31.pth
+
+  - Name: cylinder3d_8xb2-laser-polar-mix-3x_semantickitti
+    In Collection: Cylinder3D
+    Config: configs/cylinder3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 12.8
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 67.0
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/cylinder3d/cylinder3d_4xb4_3x_semantickitti/cylinder3d_4xb4_3x_semantickitti_20230318_191107-822a8c31.pth
diff --git a/mmde/configs/dgcnn/README.md b/mmde/configs/dgcnn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..46c243ad5659a7318d5b148333fe13ef1ec0ba25
--- /dev/null
+++ b/mmde/configs/dgcnn/README.md
@@ -0,0 +1,55 @@
+# Dynamic Graph CNN for Learning on Point Clouds
+
+> [Dynamic Graph CNN for Learning on Point Clouds](https://arxiv.org/abs/1801.07829)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Point clouds provide a flexible geometric representation suitable for countless applications in computer graphics; they also comprise the raw output of most 3D data acquisition devices. While hand-designed features on point clouds have long been proposed in graphics and vision, however, the recent overwhelming success of convolutional neural networks (CNNs) for image analysis suggests the value of adapting insight from CNN to the point cloud world. Point clouds inherently lack topological information so designing a model to recover topology can enrich the representation power of point clouds. To this end, we propose a new neural network module dubbed EdgeConv suitable for CNN-based high-level tasks on point clouds including classification and segmentation. EdgeConv acts on graphs dynamically computed in each layer of the network. It is differentiable and can be plugged into existing architectures. Compared to existing modules operating in extrinsic space or treating each point independently, EdgeConv has several appealing properties: It incorporates local neighborhood information; it can be stacked applied to learn global shape properties; and in multi-layer systems affinity in feature space captures semantic characteristics over potentially long distances in the original embedding. We show the performance of our model on standard benchmarks including ModelNet40, ShapeNetPart, and S3DIS.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/30491025/143855852-3d7888ed-2cfc-416c-9ec8-57621edeaa34.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement DGCNN and provide the results and checkpoints on S3DIS dataset.
+
+**Notice**: We follow the implementations in the original DGCNN paper and a PyTorch implementation of DGCNN [code](https://github.com/AnTao97/dgcnn.pytorch).
+
+## Results and models
+
+### S3DIS
+
+|                           Method                           | Split  |   Lr schd   | Mem (GB) | Inf time (fps) | mIoU (Val set) |                                                                                                                                                                                                 Download                                                                                                                                                                                                 |
+| :--------------------------------------------------------: | :----: | :---------: | :------: | :------------: | :------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [DGCNN](./dgcnn_4xb32-cosine-100e_s3dis-seg_test-area1.py) | Area_1 | cosine 100e |   13.1   |                |     68.33      | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area1/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_000734-39658f14.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area1/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_000734.log.json) |
+| [DGCNN](./dgcnn_4xb32-cosine-100e_s3dis-seg_test-area2.py) | Area_2 | cosine 100e |   13.1   |                |     40.68      | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area2/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_144648-aea9ecb6.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area2/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_144648.log.json) |
+| [DGCNN](./dgcnn_4xb32-cosine-100e_s3dis-seg_test-area3.py) | Area_3 | cosine 100e |   13.1   |                |     69.38      | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area3/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210801_154629-2ff50ee0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area3/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210801_154629.log.json) |
+| [DGCNN](./dgcnn_4xb32-cosine-100e_s3dis-seg_test-area4.py) | Area_4 | cosine 100e |   13.1   |                |     50.07      | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area4/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_073551-dffab9cd.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area4/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_073551.log.json) |
+| [DGCNN](./dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py) | Area_5 | cosine 100e |   13.1   |                |     50.59      | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area5/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210730_235824-f277e0c5.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area5/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210730_235824.log.json) |
+| [DGCNN](./dgcnn_4xb32-cosine-100e_s3dis-seg_test-area6.py) | Area_6 | cosine 100e |   13.1   |                |     77.94      | [model](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area6/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_154317-e3511b32.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area6/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_154317.log.json) |
+|                           DGCNN                            | 6-fold |             |          |                |     59.43      |                                                                                                                                                                                                                                                                                                                                                                                                          |
+
+**Notes:**
+
+- We use XYZ+Color+Normalized_XYZ as input in all the experiments on S3DIS datasets.
+- `Area_5` Split means training the model on Area_1, 2, 3, 4, 6 and testing on Area_5.
+- `6-fold` Split means the overall result of 6 different splits (Area_1, Area_2, Area_3, Area_4, Area_5 and Area_6 Splits).
+- Users need to modify `train_area` and `test_area` in the S3DIS dataset's [config](./configs/_base_/datasets/s3dis_seg-3d-13class.py) to set the training and testing areas, respectively.
+
+## Indeterminism
+
+Since DGCNN testing adopts sliding patch inference which involves random point sampling, and the test script uses fixed random seeds while the random seeds of validation in training are not fixed, the test results may be slightly different from the results reported above.
+
+## Citation
+
+```latex
+@article{dgcnn,
+  title={Dynamic Graph CNN for Learning on Point Clouds},
+  author={Wang, Yue and Sun, Yongbin and Liu, Ziwei and Sarma, Sanjay E. and Bronstein, Michael M. and Solomon, Justin M.},
+  journal={ACM Transactions on Graphics (TOG)},
+  year={2019}
+}
+```
diff --git a/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area1.py b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area1.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9b40af80fb289ef5f3ba5967dd32ad305e9e0a2
--- /dev/null
+++ b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area1.py
@@ -0,0 +1,17 @@
+_base_ = './dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py'
+
+# data settings
+train_area = [2, 3, 4, 5, 6]
+test_area = 1
+train_dataloader = dict(
+    batch_size=32,
+    dataset=dict(
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ]))
+test_dataloader = dict(
+    dataset=dict(
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'))
+val_dataloader = test_dataloader
diff --git a/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area2.py b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7a13665fe1068b895ea233def6a10c7f20a7432
--- /dev/null
+++ b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area2.py
@@ -0,0 +1,17 @@
+_base_ = './dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py'
+
+# data settings
+train_area = [1, 3, 4, 5, 6]
+test_area = 2
+train_dataloader = dict(
+    batch_size=32,
+    dataset=dict(
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ]))
+test_dataloader = dict(
+    dataset=dict(
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'))
+val_dataloader = test_dataloader
diff --git a/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area3.py b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area3.py
new file mode 100644
index 0000000000000000000000000000000000000000..56cbd98b700d2247b9fbd2be26a5f1804b43ef96
--- /dev/null
+++ b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area3.py
@@ -0,0 +1,17 @@
+_base_ = './dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py'
+
+# data settings
+train_area = [1, 2, 4, 5, 6]
+test_area = 3
+train_dataloader = dict(
+    batch_size=32,
+    dataset=dict(
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ]))
+test_dataloader = dict(
+    dataset=dict(
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'))
+val_dataloader = test_dataloader
diff --git a/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area4.py b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area4.py
new file mode 100644
index 0000000000000000000000000000000000000000..842f1e18f6a79e568d61762c3d50cb5a4ac9d753
--- /dev/null
+++ b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area4.py
@@ -0,0 +1,17 @@
+_base_ = './dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py'
+
+# data settings
+train_area = [1, 2, 3, 5, 6]
+test_area = 4
+train_dataloader = dict(
+    batch_size=32,
+    dataset=dict(
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ]))
+test_dataloader = dict(
+    dataset=dict(
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'))
+val_dataloader = test_dataloader
diff --git a/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba7a971bd4c4ce585f3c4969845dd1e3dca639d
--- /dev/null
+++ b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py
@@ -0,0 +1,21 @@
+_base_ = [
+    '../_base_/datasets/s3dis-seg.py', '../_base_/models/dgcnn.py',
+    '../_base_/schedules/seg-cosine-100e.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(in_channels=9),  # [xyz, rgb, normalized_xyz]
+    decode_head=dict(
+        num_classes=13, ignore_index=13,
+        loss_decode=dict(class_weight=None)),  # S3DIS doesn't use class_weight
+    test_cfg=dict(
+        num_points=4096,
+        block_size=1.0,
+        sample_rate=0.5,
+        use_normalized_coord=True,
+        batch_size=24))
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=2))
+train_dataloader = dict(batch_size=32)
+train_cfg = dict(val_interval=2)
diff --git a/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area6.py b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area6.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4f50cd96c6a9ab2d311f9c2745beee4b7b8759f
--- /dev/null
+++ b/mmde/configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area6.py
@@ -0,0 +1,17 @@
+_base_ = './dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py'
+
+# data settings
+train_area = [1, 2, 3, 4, 5]
+test_area = 6
+train_dataloader = dict(
+    batch_size=32,
+    dataset=dict(
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ]))
+test_dataloader = dict(
+    dataset=dict(
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'))
+val_dataloader = test_dataloader
diff --git a/mmde/configs/dgcnn/metafile.yml b/mmde/configs/dgcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c383576802d2b36d6748206e4faa3413c32b06a0
--- /dev/null
+++ b/mmde/configs/dgcnn/metafile.yml
@@ -0,0 +1,89 @@
+Collections:
+  - Name: DGCNN
+    Metadata:
+      Training Techniques:
+        - SGD
+      Training Resources: 4x Titan XP GPUs
+      Architecture:
+        - DGCNN
+    Paper: https://arxiv.org/abs/1801.07829
+    README: configs/dgcnn/README.md
+
+Models:
+  - Name: dgcnn_4xb32-cosine-100e_s3dis-seg_test-area1.py
+    In Collection: DGCNN
+    Config: configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area1.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 13.3
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS Area1
+        Metrics:
+          mIoU: 68.33
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area1/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_000734-39658f14.pth
+
+  - Name: dgcnn_4xb32-cosine-100e_s3dis-seg_test-area2.py
+    In Collection: DGCNN
+    Config: configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area2.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 13.3
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS Area2
+        Metrics:
+          mIoU: 40.68
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area2/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210731_144648-aea9ecb6.pth
+
+  - Name: dgcnn_4xb32-cosine-100e_s3dis-seg_test-area3.py
+    In Collection: DGCNN
+    Config: configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area3.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 13.3
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS Area3
+        Metrics:
+          mIoU: 69.38
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area3/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210801_154629-2ff50ee0.pth
+
+  - Name: dgcnn_4xb32-cosine-100e_s3dis-seg_test-area4.py
+    In Collection: DGCNN
+    Config: configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area4.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 13.3
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS Area4
+        Metrics:
+          mIoU: 50.07
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area4/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_073551-dffab9cd.pth
+
+  - Name: dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py
+    In Collection: DGCNN
+    Config: configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area5.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 13.3
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS Area5
+        Metrics:
+          mIoU: 50.59
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area5/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210730_235824-f277e0c5.pth
+
+  - Name: dgcnn_4xb32-cosine-100e_s3dis-seg_test-area6.py
+    In Collection: DGCNN
+    Config: configs/dgcnn/dgcnn_4xb32-cosine-100e_s3dis-seg_test-area6.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 13.3
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS Area6
+        Metrics:
+          mIoU: 77.94
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.17.0_models/dgcnn/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class/area6/dgcnn_32x4_cosine_100e_s3dis_seg-3d-13class_20210802_154317-e3511b32.pth
diff --git a/mmde/configs/dynamic_voxelization/README.md b/mmde/configs/dynamic_voxelization/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..835551ca7b61122fc02a48513cfa5dd8efd8eb49
--- /dev/null
+++ b/mmde/configs/dynamic_voxelization/README.md
@@ -0,0 +1,40 @@
+# Dynamic Voxelization
+
+> [End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds](https://arxiv.org/abs/1910.06528)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Recent work on 3D object detection advocates point cloud voxelization in birds-eye view, where objects preserve their physical dimensions and are naturally separable. When represented in this view, however, point clouds are sparse and have highly variable point density, which may cause detectors difficulties in detecting distant or small objects (pedestrians, traffic signs, etc.). On the other hand, perspective view provides dense observations, which could allow more favorable feature encoding for such cases. In this paper, we aim to synergize the birds-eye view and the perspective view and propose a novel end-to-end multi-view fusion (MVF) algorithm, which can effectively learn to utilize the complementary information from both. Specifically, we introduce dynamic voxelization, which has four merits compared to existing voxelization methods, i) removing the need of pre-allocating a tensor with fixed size; ii) overcoming the information loss due to stochastic point/voxel dropout; iii) yielding deterministic voxel embeddings and more stable detection outcomes; iv) establishing the bi-directional relationship between points and voxels, which potentially lays a natural foundation for cross-view feature fusion. By employing dynamic voxelization, the proposed feature fusion architecture enables each point to learn to fuse context information from different views. MVF operates on points and can be naturally extended to other approaches using LiDAR point clouds. We evaluate our MVF model extensively on the newly released Waymo Open Dataset and on the KITTI dataset and demonstrate that it significantly improves detection accuracy over the comparable single-view PointPillars baseline.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/30491025/143856017-98b77ecb-7c13-4164-9c1d-e3011a7645e6.png" width="600"/>
+</div>
+
+## Introduction
+
+We implement Dynamic Voxelization proposed in  and provide its results and models on KITTI dataset.
+
+## Results and models
+
+### KITTI
+
+|                               Model                                |  Class  |  Lr schd   | Mem (GB) | Inf time (fps) |  mAP  |                                                                                                                                                                                                                 Download                                                                                                                                                                                                                 |
+| :----------------------------------------------------------------: | :-----: | :--------: | :------: | :------------: | :---: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|       [SECOND](./second_dv_secfpn_8xb6-80e_kitti-3d-car.py)        |   Car   | cyclic 80e |   5.5    |                | 78.83 |                     [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228-ac2c1c0c.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228.log.json)                     |
+|  [SECOND](./second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py)   | 3 Class | cosine 80e |   5.5    |                | 65.27 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20210831_054106-e742d163.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20210831_054106.log.json) |
+| [PointPillars](./pointpillars_dv_secfpn_8xb6-160e_kitti-3d-car.py) |   Car   | cyclic 80e |   4.7    |                | 77.76 |       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844-ee7b75c9.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844.log.json)       |
+
+## Citation
+
+```latex
+@article{zhou2019endtoend,
+    title={End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds},
+    author={Yin Zhou and Pei Sun and Yu Zhang and Dragomir Anguelov and Jiyang Gao and Tom Ouyang and James Guo and Jiquan Ngiam and Vijay Vasudevan},
+    year={2019},
+    eprint={1910.06528},
+    archivePrefix={arXiv},
+    primaryClass={cs.CV}
+}
+```
diff --git a/mmde/configs/dynamic_voxelization/metafile.yml b/mmde/configs/dynamic_voxelization/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..171a8c3a55f8c5f9ba5da622707a37c473033330
--- /dev/null
+++ b/mmde/configs/dynamic_voxelization/metafile.yml
@@ -0,0 +1,53 @@
+Collections:
+  - Name: Dynamic Voxelization
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Dynamic Voxelization
+    Paper:
+      URL: https://arxiv.org/abs/1910.06528
+      Title: 'End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds'
+    README: configs/dynamic_voxelization/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/dynamic_voxelnet.py#L11
+      Version: v0.5.0
+
+Models:
+  - Name: dv_second_secfpn_6x8_80e_kitti-3d-car
+    In Collection: Dynamic Voxelization
+    Config: configs/dynamic_voxelization/second_dv_secfpn_8xb6-80e_kitti-3d-car.py
+    Metadata:
+      Training Memory (GB): 5.5
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 78.83
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228-ac2c1c0c.pth
+
+  - Name: dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class
+    In Collection: Dynamic Voxelization
+    Config: configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py
+    Metadata:
+      Training Memory (GB): 5.5
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 65.27
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20210831_054106-e742d163.pth
+
+  - Name: dv_pointpillars_secfpn_6x8_160e_kitti-3d-car
+    In Collection: Dynamic Voxelization
+    Config: configs/dynamic_voxelization/pointpillars_dv_secfpn_8xb6-160e_kitti-3d-car.py
+    Metadata:
+      Training Memory (GB): 4.7
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 77.76
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844-ee7b75c9.pth
diff --git a/mmde/configs/dynamic_voxelization/pointpillars_dv_secfpn_8xb6-160e_kitti-3d-car.py b/mmde/configs/dynamic_voxelization/pointpillars_dv_secfpn_8xb6-160e_kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..3999aa577d7b1fa1bfecbd4508759af3b7556ee8
--- /dev/null
+++ b/mmde/configs/dynamic_voxelization/pointpillars_dv_secfpn_8xb6-160e_kitti-3d-car.py
@@ -0,0 +1,21 @@
+_base_ = '../pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py'
+
+voxel_size = [0.16, 0.16, 4]
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
+
+model = dict(
+    type='DynamicVoxelNet',
+    data_preprocessor=dict(
+        voxel_type='dynamic',
+        voxel_layer=dict(
+            max_num_points=-1,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(-1, -1))),
+    voxel_encoder=dict(
+        type='DynamicPillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range))
diff --git a/mmde/configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py b/mmde/configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c0b92a5e8b0bc7621844db4872a9edc1b738308
--- /dev/null
+++ b/mmde/configs/dynamic_voxelization/second_dv_secfpn_8xb2-cosine-80e_kitti-3d-3class.py
@@ -0,0 +1,24 @@
+_base_ = [
+    '../_base_/models/second_hv_secfpn_kitti.py',
+    '../_base_/datasets/kitti-3d-3class.py', '../_base_/schedules/cosine.py',
+    '../_base_/default_runtime.py'
+]
+
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+voxel_size = [0.05, 0.05, 0.1]
+
+model = dict(
+    type='DynamicVoxelNet',
+    data_preprocessor=dict(
+        voxel_type='dynamic',
+        voxel_layer=dict(
+            _delete_=True,
+            max_num_points=-1,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(-1, -1))),
+    voxel_encoder=dict(
+        _delete_=True,
+        type='DynamicSimpleVFE',
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range))
diff --git a/mmde/configs/dynamic_voxelization/second_dv_secfpn_8xb6-80e_kitti-3d-car.py b/mmde/configs/dynamic_voxelization/second_dv_secfpn_8xb6-80e_kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..6907dcbe4f4c240ea0d95910a4fa9374429ad878
--- /dev/null
+++ b/mmde/configs/dynamic_voxelization/second_dv_secfpn_8xb6-80e_kitti-3d-car.py
@@ -0,0 +1,20 @@
+_base_ = '../second/second_hv_secfpn_8xb6-80e_kitti-3d-car.py'
+
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+voxel_size = [0.05, 0.05, 0.1]
+
+model = dict(
+    type='DynamicVoxelNet',
+    data_preprocessor=dict(
+        voxel_type='dynamic',
+        voxel_layer=dict(
+            _delete_=True,
+            max_num_points=-1,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(-1, -1))),
+    voxel_encoder=dict(
+        _delete_=True,
+        type='DynamicSimpleVFE',
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range))
diff --git a/mmde/configs/fcaf3d/README.md b/mmde/configs/fcaf3d/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a5612499fd41993813ed9727494409d5d3e06d8f
--- /dev/null
+++ b/mmde/configs/fcaf3d/README.md
@@ -0,0 +1,53 @@
+# FCAF3D: Fully Convolutional Anchor-Free 3D Object Detection
+
+> [FCAF3D: Fully Convolutional Anchor-Free 3D Object Detection](https://arxiv.org/abs/2112.00322)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Recently, promising applications in robotics and augmented reality have attracted considerable attention to 3D object detection from point clouds. In this paper, we present FCAF3D --- a first-in-class fully convolutional anchor-free indoor 3D object detection method. It is a simple yet effective method that uses a voxel representation of a point cloud and processes voxels with sparse convolutions. FCAF3D can handle large-scale scenes with minimal runtime through a single fully convolutional feed-forward pass. Existing 3D object detection methods make prior assumptions on the geometry of objects, and we argue that it limits their generalization ability. To eliminate prior assumptions, we propose a novel parametrization of oriented bounding boxes that allows obtaining better results in a purely data-driven way. The proposed method achieves state-of-the-art 3D object detection results in terms of mAP@0.5 on ScanNet V2 (+4.5), SUN RGB-D (+3.5), and S3DIS (+20.5) datasets.
+
+<div align="center">
+<img src="https://user-images.githubusercontent.com/6030962/182842796-98c10576-d39c-4c2b-a15a-a04c9870919c.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement FCAF3D and provide the result and checkpoints on the ScanNet and SUN RGB-D dataset.
+
+## Results and models
+
+### ScanNet
+
+|                      Backbone                      | Mem (GB) | Inf time (fps) |   AP@0.25    |    AP@0.5    |                                                                                                                                                          Download                                                                                                                                                           |
+| :------------------------------------------------: | :------: | :------------: | :----------: | :----------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [MinkResNet34](./fcaf3d_8x2_scannet-3d-18class.py) |   10.5   |      15.7      | 69.7(70.7\*) | 55.2(56.0\*) | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/fcaf3d/fcaf3d_8x2_scannet-3d-18class/fcaf3d_8x2_scannet-3d-18class_20220805_084956.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/fcaf3d/fcaf3d_8x2_scannet-3d-18class/fcaf3d_8x2_scannet-3d-18class_20220805_084956.log.json) |
+
+### SUN RGB-D
+
+|                      Backbone                      | Mem (GB) | Inf time (fps) |   AP@0.25    |    AP@0.5    |                                                                                                                                                          Download                                                                                                                                                           |
+| :------------------------------------------------: | :------: | :------------: | :----------: | :----------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [MinkResNet34](./fcaf3d_8x2_sunrgbd-3d-10class.py) |   6.3    |      17.9      | 63.8(63.8\*) | 47.3(48.2\*) | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/fcaf3d/fcaf3d_8x2_sunrgbd-3d-10class/fcaf3d_8x2_sunrgbd-3d-10class_20220805_165017.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/fcaf3d/fcaf3d_8x2_sunrgbd-3d-10class/fcaf3d_8x2_sunrgbd-3d-10class_20220805_165017.log.json) |
+
+### S3DIS
+
+|                     Backbone                     | Mem (GB) | Inf time (fps) |   AP@0.25    |    AP@0.5    |                                                                                                                                                    Download                                                                                                                                                     |
+| :----------------------------------------------: | :------: | :------------: | :----------: | :----------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [MinkResNet34](./fcaf3d_2xb8_s3dis-3d-5class.py) |   23.5   |      10.9      | 67.4(64.9\*) | 45.7(43.8\*) | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/fcaf3d/fcaf3d_8x2_s3dis-3d-5class/fcaf3d_8x2_s3dis-3d-5class_20220805_121957.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/fcaf3d/fcaf3d_8x2_s3dis-3d-5class/fcaf3d_8x2_s3dis-3d-5class_20220805_121957.log.json) |
+
+**Note**
+
+- We report the results across 5 train runs followed by 5 test runs. * means the results reported in the paper.
+- Inference time is given for a single NVidia RTX 4090 GPU. All models are trained on 2 GPUs.
+
+## Citation
+
+```latex
+@inproceedings{rukhovich2022fcaf3d,
+  title={FCAF3D: Fully Convolutional Anchor-Free 3D Object Detection},
+  author={Danila Rukhovich, Anna Vorontsova, Anton Konushin},
+  booktitle={European conference on computer vision},
+  year={2022}
+}
+```
diff --git a/mmde/configs/fcaf3d/fcaf3d_2xb8_s3dis-3d-5class.py b/mmde/configs/fcaf3d/fcaf3d_2xb8_s3dis-3d-5class.py
new file mode 100644
index 0000000000000000000000000000000000000000..9edd32fdc3e3601407b6c36f332522a7218ad0fe
--- /dev/null
+++ b/mmde/configs/fcaf3d/fcaf3d_2xb8_s3dis-3d-5class.py
@@ -0,0 +1,27 @@
+_base_ = [
+    '../_base_/models/fcaf3d.py', '../_base_/default_runtime.py',
+    '../_base_/datasets/s3dis-3d.py'
+]
+
+model = dict(bbox_head=dict(num_classes=5))
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=10, norm_type=2))
+
+# learning rate
+param_scheduler = dict(
+    type='MultiStepLR',
+    begin=0,
+    end=12,
+    by_epoch=True,
+    milestones=[8, 11],
+    gamma=0.1)
+
+custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
+
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=12)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/mmde/configs/fcaf3d/fcaf3d_2xb8_scannet-3d-18class.py b/mmde/configs/fcaf3d/fcaf3d_2xb8_scannet-3d-18class.py
new file mode 100644
index 0000000000000000000000000000000000000000..49a02970ac42c5c8d4c980ab8d355aa98a7f6644
--- /dev/null
+++ b/mmde/configs/fcaf3d/fcaf3d_2xb8_scannet-3d-18class.py
@@ -0,0 +1,94 @@
+_base_ = [
+    '../_base_/models/fcaf3d.py', '../_base_/default_runtime.py',
+    '../_base_/datasets/scannet-3d.py'
+]
+n_points = 100000
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D'),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSample', num_points=n_points),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[.9, 1.1],
+        translation_std=[.1, .1, .1],
+        shift_height=False),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=n_points),
+            dict(type='NormalizePointsColor', color_mean=None),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    dataset=dict(
+        type='RepeatDataset',
+        times=10,
+        dataset=dict(pipeline=train_pipeline, filter_empty_gt=True)))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=10, norm_type=2))
+
+# learning rate
+param_scheduler = dict(
+    type='MultiStepLR',
+    begin=0,
+    end=12,
+    by_epoch=True,
+    milestones=[8, 11],
+    gamma=0.1)
+
+custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
+
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=12)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/mmde/configs/fcaf3d/fcaf3d_2xb8_sunrgbd-3d-10class.py b/mmde/configs/fcaf3d/fcaf3d_2xb8_sunrgbd-3d-10class.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ebe7056b8ccedec1355599657d7c8bda72f0f71
--- /dev/null
+++ b/mmde/configs/fcaf3d/fcaf3d_2xb8_sunrgbd-3d-10class.py
@@ -0,0 +1,92 @@
+_base_ = [
+    '../_base_/models/fcaf3d.py', '../_base_/default_runtime.py',
+    '../_base_/datasets/sunrgbd-3d.py'
+]
+n_points = 100000
+backend_args = None
+
+model = dict(
+    bbox_head=dict(
+        num_classes=10,
+        num_reg_outs=8,
+        bbox_loss=dict(type='RotatedIoU3DLoss')))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D'),
+    dict(type='PointSample', num_points=n_points),
+    dict(type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        translation_std=[.1, .1, .1],
+        shift_height=False),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=n_points)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    dataset=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(pipeline=train_pipeline, filter_empty_gt=True)))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=10, norm_type=2))
+
+# learning rate
+param_scheduler = dict(
+    type='MultiStepLR',
+    begin=0,
+    end=12,
+    by_epoch=True,
+    milestones=[8, 11],
+    gamma=0.1)
+
+custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
+
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=12)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/mmde/configs/fcaf3d/metafile.yml b/mmde/configs/fcaf3d/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c3294bde789a827dce1d21072b40d94e5cd6f07d
--- /dev/null
+++ b/mmde/configs/fcaf3d/metafile.yml
@@ -0,0 +1,58 @@
+Collections:
+  - Name: FCAF3D
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Training Resources: 2x V100 GPUs
+      Architecture:
+        - MinkResNet
+    Paper:
+      URL: https://arxiv.org/abs/2112.00322
+      Title: 'FCAF3D: Fully Convolutional Anchor-Free 3D Object Detection'
+    README: configs/fcaf3d/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/mink_single_stage.py#L15
+      Version: v1.0.0rc4
+
+Models:
+  - Name: fcaf3d_2xb8_scannet-3d-18class
+    In Collection: FCAF3D
+    Config: configs/fcaf3d/fcaf3d_2xb8_scannet-3d-18class.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 10.7
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 69.7
+          AP@0.5: 55.2
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/fcaf3d/fcaf3d_8x2_scannet-3d-18class/fcaf3d_8x2_scannet-3d-18class_20220805_084956.pth
+
+  - Name: fcaf3d_2xb8_sunrgbd-3d-10class
+    In Collection: FCAF3D
+    Config: configs/fcaf3d/fcaf3d_2xb8_sunrgbd-3d-10class.py
+    Metadata:
+      Training Data: SUNRGBD
+      Training Memory (GB): 6.5
+    Results:
+      - Task: 3D Object Detection
+        Dataset: SUNRGBD
+        Metrics:
+          AP@0.25: 63.76
+          AP@0.5: 47.31
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/fcaf3d/fcaf3d_8x2_sunrgbd-3d-10class/fcaf3d_8x2_sunrgbd-3d-10class_20220805_165017.pth
+
+  - Name: fcaf3d_2xb8_s3dis-3d-5class
+    In Collection: FCAF3D
+    Config: configs/fcaf3d/fcaf3d_2xb8_s3dis-3d-5class.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 23.5
+    Results:
+      - Task: 3D Object Detection
+        Dataset: S3DIS
+        Metrics:
+          AP@0.25: 67.36
+          AP@0.5: 45.74
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/fcaf3d/fcaf3d_8x2_s3dis-3d-5class/fcaf3d_8x2_s3dis-3d-5class_20220805_121957.pth
diff --git a/mmde/configs/fcos3d/README.md b/mmde/configs/fcos3d/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3f27b5ea5f0e245ff46b2cb14a6f5fc56cc413fc
--- /dev/null
+++ b/mmde/configs/fcos3d/README.md
@@ -0,0 +1,75 @@
+# FCOS3D: Fully Convolutional One-Stage Monocular 3D Object Detection
+
+> [FCOS3D: Fully Convolutional One-Stage Monocular 3D Object Detection](https://arxiv.org/abs/2104.10956)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Monocular 3D object detection is an important task for autonomous driving considering its advantage of low cost. It is much more challenging than conventional 2D cases due to its inherent ill-posed property, which is mainly reflected in the lack of depth information. Recent progress on 2D detection offers opportunities to better solving this problem. However, it is non-trivial to make a general adapted 2D detector work in this 3D task. In this paper, we study this problem with a practice built on a fully convolutional single-stage detector and propose a general framework FCOS3D. Specifically, we first transform the commonly defined 7-DoF 3D targets to the image domain and decouple them as 2D and 3D attributes. Then the objects are distributed to different feature levels with consideration of their 2D scales and assigned only according to the projected 3D-center for the training procedure. Furthermore, the center-ness is redefined with a 2D Gaussian distribution based on the 3D-center to fit the 3D target formulation. All of these make this framework simple yet effective, getting rid of any 2D detection or 2D-3D correspondence priors. Our solution achieves 1st place out of all the vision-only methods in the nuScenes 3D detection challenge of NeurIPS 2020.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/30491025/143856739-93b7c4ff-e116-4824-8cc3-8cf1a433a84c.png" width="800"/>
+</div>
+
+## Introduction
+
+FCOS3D is a general anchor-free, one-stage monocular 3D object detector adapted from the original 2D version FCOS.
+It serves as a baseline built on top of mmdetection and mmdetection3d for 3D detection based on monocular vision.
+
+Currently we first support the benchmark on the large-scale nuScenes dataset, which achieved 1st place out of all the vision-only methods in the [nuScenes 3D detecton challenge](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Camera) of NeurIPS 2020.
+
+![demo image](../../resources/browse_dataset_mono.png)
+
+## Usage
+
+### Data Preparation
+
+After supporting FCOS3D and monocular 3D object detection in v0.13.0, the coco-style 2D json info files will include related annotations by default
+(see [here](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/dataset_converters/nuscenes_converter.py#L333) if you would like to change the parameter).
+So you can just follow the data preparation steps given in the documentation, then all the needed infos are ready together.
+
+### Training and Inference
+
+The way to training and inference a monocular 3D object detector is the same as others in mmdetection and mmdetection3d. You can basically follow the [documentation](https://mmdetection3d.readthedocs.io/en/latest/1_exist_data_model.html#train-predefined-models-on-standard-datasets) and change the `config`, `work_dirs`, etc. accordingly.
+
+### Test time augmentation
+
+We implement test time augmentation for the dense outputs of detection heads, which is more effective than merging predicted boxes at last.
+You can turn on it by setting `flip=True` in the `test_pipeline`.
+
+### Training with finetune
+
+Due to the scale and measurements of depth is different from those of other regression targets, we first train the model with depth weight equal to 0.2 for a more stable training procedure. For a stronger detector with better performance, please finetune the model with depth weight changed to 1.0 as shown in the [config](./fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py). Note that the path of `load_from` needs to be changed to yours accordingly.
+
+### Visualizing prediction results
+
+We also provide visualization functions to show the monocular 3D detection results. Simply follow the [documentation](https://mmdetection3d.readthedocs.io/en/latest/1_exist_data_model.html#test-existing-models-on-standard-datasets) and use the `single-gpu testing` command. You only need to add the `--show` flag and specify `--show-dir` to store the visualization results.
+
+## Results and models
+
+### NuScenes
+
+|                                        Backbone                                         | Lr schd | Mem (GB) | Inf time (fps) | mAP  | NDS  |                                                                                                                                                                                                                             Download                                                                                                                                                                                                                             |
+| :-------------------------------------------------------------------------------------: | :-----: | :------: | :------------: | :--: | :--: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|      [ResNet101 w/ DCN](./fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py)      |   1x    |   8.69   |                | 29.8 | 37.7 |                   [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_20210715_235813-4bed5239.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_20210715_235813.log.json)                   |
+| [above w/ finetune](./fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py) |   1x    |   8.69   |                | 32.1 | 39.5 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune_20210717_095645-8d806dc2.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune_20210717_095645.log.json) |
+|                                      above w/ tta                                       |   1x    |   8.69   |                | 33.1 | 40.3 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+
+## Citation
+
+```latex
+@inproceedings{wang2021fcos3d,
+	title={{FCOS3D: Fully} Convolutional One-Stage Monocular 3D Object Detection},
+	author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua},
+	booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
+	year={2021}
+}
+# For the original 2D version
+@inproceedings{tian2019fcos,
+  title     =  {{FCOS: Fully} Convolutional One-Stage Object Detection},
+  author    =  {Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong},
+  booktitle =  {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+  year      =  {2019}
+}
+```
diff --git a/mmde/configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py b/mmde/configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7ba665cec1217fbce708b751800e9760f5f7d7c
--- /dev/null
+++ b/mmde/configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py
@@ -0,0 +1,70 @@
+_base_ = [
+    '../_base_/datasets/nus-mono3d.py', '../_base_/models/fcos3d.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True)))
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels',
+            'gt_bboxes_3d', 'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(type='mmdet.Resize', scale_factor=1.0),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=2, num_workers=2, dataset=dict(pipeline=train_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.002),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
diff --git a/mmde/configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py b/mmde/configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8ea7a00f73c16f998c5b25bcd7e9aa3c4b58bf1
--- /dev/null
+++ b/mmde/configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py
@@ -0,0 +1,8 @@
+_base_ = './fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py'
+# model settings
+model = dict(
+    train_cfg=dict(
+        code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05]))
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.001))
+load_from = 'work_dirs/fcos3d_nus/latest.pth'
diff --git a/mmde/configs/fcos3d/metafile.yml b/mmde/configs/fcos3d/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..481789216cb5c9b9e50c065d7a0a6e1282d0f944
--- /dev/null
+++ b/mmde/configs/fcos3d/metafile.yml
@@ -0,0 +1,43 @@
+Collections:
+  - Name: FCOS3D
+    Metadata:
+      Training Data: NuScenes
+      Training Techniques:
+        - SGD
+      Training Resources: 8x GeForce RTX 2080 Ti
+      Architecture:
+        - FCOSMono3DHead
+    Paper:
+      URL: https://arxiv.org/abs/2104.10956
+      Title: 'FCOS3D: Fully Convolutional One-Stage Monocular 3D Object Detection'
+    README: configs/fcos3d/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/fcos_mono3d.py#L7
+      Version: v0.13.0
+
+Models:
+  - Name: fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune
+    In Collection: FCOS3D
+    Config: configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py
+    Metadata:
+      Training Memory (GB): 8.7
+    Results:
+      - Task: 3D Object Detection
+        Dataset: NuScenes
+        Metrics:
+          mAP: 29.9
+          NDS: 37.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_20210715_235813-4bed5239.pth
+
+  - Name: fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune
+    In Collection: FCOS3D
+    Config: configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py
+    Metadata:
+      Training Memory (GB): 8.7
+    Results:
+      - Task: 3D Object Detection
+        Dataset: NuScenes
+        Metrics:
+          mAP: 32.1
+          NDS: 39.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune_20210717_095645-8d806dc2.pth
diff --git a/mmde/configs/free_anchor/README.md b/mmde/configs/free_anchor/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..37a3e1f1c2eaa431c0761470c2f23d1a780b03ad
--- /dev/null
+++ b/mmde/configs/free_anchor/README.md
@@ -0,0 +1,105 @@
+# FreeAnchor for 3D Object Detection
+
+> [FreeAnchor: Learning to Match Anchors for Visual Object Detection](https://arxiv.org/abs/1909.02466)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Modern CNN-based object detectors assign anchors for ground-truth objects under the restriction of object-anchor Intersection-over-Unit (IoU). In this study, we propose a learning-to-match approach to break IoU restriction, allowing objects to match anchors in a flexible manner. Our approach, referred to as FreeAnchor, updates hand-crafted anchor assignment to “free" anchor matching by formulating detector training as a maximum likelihood estimation (MLE) procedure. FreeAnchor targets at learning features which best explain a class of objects in terms of both classification and localization. FreeAnchor is implemented by optimizing detection customized likelihood and can be fused with CNN-based detectors in a plug-and-play manner. Experiments on COCO demonstrate that FreeAnchor consistently outperforms the counterparts with significant margins.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/143866685-e3ac08bb-cd0c-4ada-ba8a-18e03cccdd0f.png" width="600"/>
+</div>
+
+## Introduction
+
+We implement FreeAnchor in 3D detection systems and provide their first results with PointPillars on nuScenes dataset.
+With the implemented `FreeAnchor3DHead`, a PointPillar detector with a big backbone (e.g., RegNet-3.2GF) achieves top performance
+on the nuScenes benchmark.
+
+## Usage
+
+### Modify config
+
+As in the [baseline config](pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py), we only need to replace the head of an existing one-stage detector to use FreeAnchor head.
+Since the config is inherit from a common detector head, `_delete_=True` is necessary to avoid conflicts.
+The hyperparameters are specifically tuned according to the original paper.
+
+```python
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_lyft.py',
+    '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py'
+]
+
+model = dict(
+    pts_bbox_head=dict(
+        _delete_=True,
+        type='FreeAnchor3DHead',
+        num_classes=10,
+        in_channels=256,
+        feat_channels=256,
+        use_direction_classifier=True,
+        pre_anchor_topk=25,
+        bbox_thr=0.5,
+        gamma=2.0,
+        alpha=0.5,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
+            scales=[1, 2, 4],
+            sizes=[
+                [2.5981, 0.8660, 1.],  # 1.5 / sqrt(3)
+                [1.7321, 0.5774, 1.],  # 1 / sqrt(3)
+                [1., 1., 1.],
+                [0.4, 0.4, 1],
+            ],
+            custom_values=[0, 0],
+            rotations=[0, 1.57],
+            reshape_out=True),
+        assigner_per_size=False,
+        diff_rad_by_sin=True,
+        dir_offset=-0.7854,  # -pi / 4
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.8),
+        loss_dir=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg = dict(
+        pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.25, 0.25])))
+```
+
+## Results and models
+
+### PointPillars
+
+|                                                    Backbone                                                     | FreeAnchor | Lr schd | Mem (GB) | Inf time (fps) |  mAP  |  NDS  |                                                                                                                                                                                                                                                                    Download                                                                                                                                                                                                                                                                    |
+| :-------------------------------------------------------------------------------------------------------------: | :--------: | :-----: | :------: | :------------: | :---: | :---: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|                      [FPN](../pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py)                       |     ✗      |   2x    |   17.1   |                | 40.0  | 53.3  |                                                                        [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json)                                                                        |
+|                     [FPN](./pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py)                     |     ✓      |   2x    |   16.3   |                | 43.82 | 54.86 |                                                 [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210816_163441-ae0897e7.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210816_163441.log.json)                                                 |
+|            [RegNetX-400MF-FPN](../regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py)            |     ✗      |   2x    |   17.3   |                | 44.8  | 56.4  |                                                    [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239-c694dce7.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239.log.json)                                                    |
+|       [RegNetX-400MF-FPN](./pointpillars_hv_regnet-400mf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py)        |     ✓      |   2x    |   17.6   |                | 48.3  | 58.65 |                       [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210827_213939-a2dd3fff.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210827_213939.log.json)                       |
+|       [RegNetX-1.6GF-FPN](./pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py)        |     ✓      |   2x    |   24.3   |                | 52.04 | 61.49 |                       [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210828_025608-bfbd506e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210828_025608.log.json)                       |
+| [RegNetX-1.6GF-FPN](./pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py)\* |     ✓      |   3x    |   24.4   |                | 52.69 | 62.45 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20210827_184909-14d2dbd1.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20210827_184909.log.json) |
+|       [RegNetX-3.2GF-FPN](./pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py)        |     ✓      |   2x    |   29.4   |                | 52.4  | 61.94 |                       [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210827_181237-e385c35a.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210827_181237.log.json)                       |
+| [RegNetX-3.2GF-FPN](./pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py)\* |     ✓      |   3x    |   29.2   |                | 54.23 | 63.41 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20210828_030816-06708918.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20210828_030816.log.json) |
+
+**Note**: Models noted by `*` means it is trained using stronger augmentation with vertical flip under bird-eye-view, global translation, and larger range of global rotation.
+
+## Citation
+
+```latex
+@inproceedings{zhang2019freeanchor,
+  title   =  {{FreeAnchor}: Learning to Match Anchors for Visual Object Detection},
+  author  =  {Zhang, Xiaosong and Wan, Fang and Liu, Chang and Ji, Rongrong and Ye, Qixiang},
+  booktitle =  {Neural Information Processing Systems},
+  year    =  {2019}
+}
+```
diff --git a/mmde/configs/free_anchor/metafile.yml b/mmde/configs/free_anchor/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..10d9970d472fa7d24fe9e8e48acbfe4a3efb5b21
--- /dev/null
+++ b/mmde/configs/free_anchor/metafile.yml
@@ -0,0 +1,122 @@
+Collections:
+  - Name: FreeAnchor
+    Metadata:
+      Training Data: nuScenes
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Hard Voxelization
+        - Free Anchor
+    Paper:
+      URL: https://arxiv.org/abs/1909.02466
+      Title: 'FreeAnchor: Learning to Match Anchors for Visual Object Detection'
+    README: configs/free_anchor/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/dense_heads/free_anchor3d_head.py#L13
+      Version: v0.5.0
+
+Models:
+  - Name: pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d
+    In Collection: FreeAnchor
+    Config: pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Memory (GB): 17.1
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 40.0
+          NDS: 53.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth
+
+  - Name: pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d
+    In Collection: FreeAnchor
+    Config: free_anchor/pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Memory (GB): 16.3
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 43.82
+          NDS: 54.86
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210816_163441-ae0897e7.pth
+
+  - Name: pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d
+    In Collection: FreeAnchor
+    Config: configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Memory (GB): 17.3
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 44.8
+          NDS: 56.4
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210816_163441-ae0897e7.pth
+
+  - Name: pointpillars_hv_regnet-400mf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/pointpillars_hv_regnet-400mf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Memory (GB): 17.6
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 48.3
+          NDS: 58.65
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210827_213939-a2dd3fff.pth
+
+  - Name: hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Memory (GB): 24.3
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 52.04
+          NDS: 61.49
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210828_025608-bfbd506e.pth
+
+  - Name: pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
+    Metadata:
+      Training Memory (GB): 24.4
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 52.69
+          NDS: 62.45
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20210827_184909-14d2dbd1.pth
+
+  - Name: pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Memory (GB): 29.4
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 52.4
+          NDS: 61.94
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20210827_181237-e385c35a.pth
+
+  - Name: pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
+    Metadata:
+      Training Memory (GB): 29.2
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 54.23
+          NDS: 63.41
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20210828_030816-06708918.pth
diff --git a/mmde/configs/free_anchor/pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py b/mmde/configs/free_anchor/pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9fa321b6294f57878abf8c497684090adf11414
--- /dev/null
+++ b/mmde/configs/free_anchor/pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
@@ -0,0 +1,49 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_nus.py',
+    '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py'
+]
+
+model = dict(
+    pts_bbox_head=dict(
+        _delete_=True,
+        type='FreeAnchor3DHead',
+        num_classes=10,
+        in_channels=256,
+        feat_channels=256,
+        use_direction_classifier=True,
+        pre_anchor_topk=25,
+        bbox_thr=0.5,
+        gamma=2.0,
+        alpha=0.5,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
+            scales=[1, 2, 4],
+            sizes=[
+                [2.5981, 0.8660, 1.],  # 1.5 / sqrt(3)
+                [1.7321, 0.5774, 1.],  # 1 / sqrt(3)
+                [1., 1., 1.],
+                [0.4, 0.4, 1],
+            ],
+            custom_values=[0, 0],
+            rotations=[0, 1.57],
+            reshape_out=True),
+        assigner_per_size=False,
+        diff_rad_by_sin=True,
+        dir_offset=-0.7854,  # -pi / 4
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.8),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.25, 0.25])))
diff --git a/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py b/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..8968b3998c891e2cf307fa02001b58d5e1577c51
--- /dev/null
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
@@ -0,0 +1,18 @@
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_1.6gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[168, 408, 912]))
diff --git a/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py b/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d5a48507cc4c95a53282f7756561a5526d8d711
--- /dev/null
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
@@ -0,0 +1,76 @@
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_1.6gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[168, 408, 912]))
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.7854, 0.7854],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.2, 0.2, 0.2]),
+    dict(
+        type='RandomFlip3D',
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+train_cfg = dict(max_epochs=36, val_interval=36)
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
diff --git a/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py b/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..079328ffcb67c2a95812672b8f14468d3acbff29
--- /dev/null
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
@@ -0,0 +1,18 @@
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_3.2gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[192, 432, 1008]))
diff --git a/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py b/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e4e02f5e6ab91182dc5f6899ff3d4de7ff091c3
--- /dev/null
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
@@ -0,0 +1,76 @@
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_3.2gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[192, 432, 1008]))
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.7854, 0.7854],
+        scale_ratio_range=[0.9, 1.1],
+        translation_std=[0.2, 0.2, 0.2]),
+    dict(
+        type='RandomFlip3D',
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+train_cfg = dict(max_epochs=36, val_interval=36)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
diff --git a/mmde/configs/free_anchor/pointpillars_hv_regnet-400mf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py b/mmde/configs/free_anchor/pointpillars_hv_regnet-400mf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8198edf0469e056d3e1071641c856df0296c364
--- /dev/null
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-400mf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
@@ -0,0 +1,18 @@
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_400mf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[64, 160, 384]))
diff --git a/mmde/configs/groupfree3d/README.md b/mmde/configs/groupfree3d/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3133420c2185a76810c536f2a84a49fa50cb748a
--- /dev/null
+++ b/mmde/configs/groupfree3d/README.md
@@ -0,0 +1,45 @@
+# Group-Free 3D Object Detection via Transformers
+
+> [Group-Free 3D Object Detection via Transformers](https://arxiv.org/abs/2104.00678)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Recently, directly detecting 3D objects from 3D point clouds has received increasing attention. To extract object representation from an irregular point cloud, existing methods usually take a point grouping step to assign the points to an object candidate so that a PointNet-like network could be used to derive object features from the grouped points. However, the inaccurate point assignments caused by the hand-crafted grouping scheme decrease the performance of 3D object detection. In this paper, we present a simple yet effective method for directly detecting 3D objects from the 3D point cloud. Instead of grouping local points to each object candidate, our method computes the feature of an object from all the points in the point cloud with the help of an attention mechanism in the Transformers, where the contribution of each point is automatically learned in the network training. With an improved attention stacking scheme, our method fuses object features in different stages and generates more accurate object detection results. With few bells and whistles, the proposed method achieves state-of-the-art 3D object detection performance on two widely used benchmarks, ScanNet V2 and SUN RGB-D.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/143868101-09787c2a-9e0b-4013-8800-b4e315d535f0.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement Group-Free-3D and provide the result and checkpoints on ScanNet datasets.
+
+## Results and models
+
+### ScanNet
+
+|                              Method                              |   Backbone    | Lr schd | Mem (GB) | Inf time (fps) |     AP@0.25     |     AP@0.5      |                                                                                                                                                                                                        Download                                                                                                                                                                                                        |
+| :--------------------------------------------------------------: | :-----------: | :-----: | :------: | :------------: | :-------------: | :-------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    [L6, O256](./groupfree3d_head-L6-O256_4xb8_scannet-seg.py)    |  PointNet++   |   3x    |   6.7    |                | 66.17 (65.67\*) | 48.47 (47.74\*) |           [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L6-O256/groupfree3d_8x4_scannet-3d-18class-L6-O256_20210702_145347-3499eb55.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L6-O256/groupfree3d_8x4_scannet-3d-18class-L6-O256_20210702_145347.log.json)           |
+|   [L12, O256](./groupfree3d_head-L12-O256_4xb8_scannet-seg.py)   |  PointNet++   |   3x    |   9.4    |                | 66.57 (66.22\*) | 48.21 (48.95\*) |         [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L12-O256/groupfree3d_8x4_scannet-3d-18class-L12-O256_20210702_150907-1c5551ad.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L12-O256/groupfree3d_8x4_scannet-3d-18class-L12-O256_20210702_150907.log.json)         |
+| [L12, O256](./groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py) | PointNet++w2x |   3x    |   13.3   |                | 68.20 (67.30\*) | 51.02 (50.44\*) | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256_20210702_200301-944f0ac0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256_20210702_200301.log.json) |
+| [L12, O512](./groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py) | PointNet++w2x |   3x    |   18.8   |                | 68.22 (68.20\*) | 52.61 (51.31\*) | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512_20210702_220204-187b71c7.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512_20210702_220204.log.json) |
+
+**Notes:**
+
+- We defined L6-O256 represent num_layers=6 and num_proposals=256. And w2x means that the model backbone weight is twice the original.
+- We report the best results (AP@0.50) on validation set during each training. * means the evaluation method in the paper: we train each setting 5 times and test each training trial 5 times, then the average performance of these 25 trials is reported to account for algorithm randomness.
+- We use 4 GPUs for training by default as the original code.
+
+## Citation
+
+```latex
+@article{liu2021,
+  title={Group-Free 3D Object Detection via Transformers},
+  author={Liu, Ze and Zhang, Zheng and Cao, Yue and Hu, Han and Tong, Xin},
+  journal={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+  year={2021}
+}
+```
diff --git a/mmde/configs/groupfree3d/groupfree3d_head-L12-O256_4xb8_scannet-seg.py b/mmde/configs/groupfree3d/groupfree3d_head-L12-O256_4xb8_scannet-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..f53b2bba9f2a8235e3097dfa7080a019c95861d9
--- /dev/null
+++ b/mmde/configs/groupfree3d/groupfree3d_head-L12-O256_4xb8_scannet-seg.py
@@ -0,0 +1,227 @@
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    bbox_head=dict(
+        num_classes=18,
+        num_decoder_layers=12,
+        size_cls_agnostic=False,
+        bbox_coder=dict(
+            type='GroupFree3DBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=1,
+            with_rot=False,
+            size_cls_agnostic=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]]),
+        sampling_objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.04,
+            reduction='sum',
+            loss_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0 / 9.0,
+            reduction='sum',
+            loss_weight=10.0 / 9.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    test_cfg=dict(
+        sample_mode='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last_three'))
+
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+
+metainfo = dict(classes=class_names)
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=50000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=50000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+
+# optimizer
+lr = 0.006
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_self_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_cross_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
+        }))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=80,
+        by_epoch=True,
+        milestones=[56, 68],
+        gamma=0.1)
+]
+
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
diff --git a/mmde/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py b/mmde/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..e447b7fc55588b551816a5850243b8a928c0877c
--- /dev/null
+++ b/mmde/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
@@ -0,0 +1,227 @@
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    bbox_head=dict(
+        num_classes=18,
+        size_cls_agnostic=False,
+        bbox_coder=dict(
+            type='GroupFree3DBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=1,
+            with_rot=False,
+            size_cls_agnostic=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]]),
+        sampling_objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.04,
+            reduction='sum',
+            loss_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0 / 9.0,
+            reduction='sum',
+            loss_weight=10.0 / 9.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    test_cfg=dict(
+        sample_mode='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last_three'))
+
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+
+metainfo = dict(classes=class_names)
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=50000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=50000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+
+# optimizer
+lr = 0.006
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_self_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_cross_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
+        }))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=80,
+        by_epoch=True,
+        milestones=[56, 68],
+        gamma=0.1)
+]
+
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
+randomness = dict(seed=4)
diff --git a/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py b/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..52fb7de73578a9f01fe4441c6ce1def6eb8663d8
--- /dev/null
+++ b/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
@@ -0,0 +1,242 @@
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=3,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
+                     (256, 256, 512)),
+        fp_channels=((512, 512), (512, 288)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    bbox_head=dict(
+        num_classes=18,
+        num_decoder_layers=12,
+        size_cls_agnostic=False,
+        bbox_coder=dict(
+            type='GroupFree3DBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=1,
+            with_rot=False,
+            size_cls_agnostic=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]]),
+        sampling_objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.04,
+            reduction='sum',
+            loss_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0 / 9.0,
+            reduction='sum',
+            loss_weight=10.0 / 9.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    test_cfg=dict(
+        sample_mode='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last_three'))
+
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+
+metainfo = dict(classes=class_names)
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=50000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=50000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+
+# optimizer
+lr = 0.006
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_self_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_cross_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
+        }))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=80,
+        by_epoch=True,
+        milestones=[56, 68],
+        gamma=0.1)
+]
+
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
diff --git a/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py b/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..c24aa1a9f3e25db405551605283ad14df16627c8
--- /dev/null
+++ b/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
@@ -0,0 +1,243 @@
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=3,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
+                     (256, 256, 512)),
+        fp_channels=((512, 512), (512, 288)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    bbox_head=dict(
+        num_classes=18,
+        num_decoder_layers=12,
+        num_proposal=512,
+        size_cls_agnostic=False,
+        bbox_coder=dict(
+            type='GroupFree3DBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=1,
+            with_rot=False,
+            size_cls_agnostic=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]]),
+        sampling_objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.04,
+            reduction='sum',
+            loss_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0 / 9.0,
+            reduction='sum',
+            loss_weight=10.0 / 9.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    test_cfg=dict(
+        sample_mode='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last_three'))
+
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+
+metainfo = dict(classes=class_names)
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=50000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=50000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+
+# optimizer
+lr = 0.006
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_self_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_cross_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
+        }))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=80,
+        by_epoch=True,
+        milestones=[56, 68],
+        gamma=0.1)
+]
+
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
diff --git a/mmde/configs/groupfree3d/metafile.yml b/mmde/configs/groupfree3d/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0b8a62eedd4314465cae9f084fb29c95514672f1
--- /dev/null
+++ b/mmde/configs/groupfree3d/metafile.yml
@@ -0,0 +1,72 @@
+Collections:
+  - Name: Group-Free-3D
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Training Resources: 4x V100 GPUs
+      Architecture:
+        - PointNet++
+    Paper:
+      URL: https://arxiv.org/abs/2104.00678
+      Title: 'Group-Free 3D Object Detection via Transformers'
+    README: configs/groupfree3d/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/groupfree3dnet.py#L10
+      Version: v0.15.0
+
+Models:
+  - Name: groupfree3d_head-L6-O256_4xb8_scannet-seg.py
+    In Collection: Group-Free-3D
+    Config: configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 6.7
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 66.17
+          AP@0.5: 48.47
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L6-O256/groupfree3d_8x4_scannet-3d-18class-L6-O256_20210702_145347-3499eb55.pth
+
+  - Name: groupfree3d_head-L12-O256_4xb8_scannet-seg.py
+    In Collection: Group-Free-3D
+    Config: configs/groupfree3d/groupfree3d_head-L12-O256_4xb8_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 9.4
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 66.57
+          AP@0.5: 48.21
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L12-O256/groupfree3d_8x4_scannet-3d-18class-L12-O256_20210702_150907-1c5551ad.pth
+
+  - Name: groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
+    In Collection: Group-Free-3D
+    Config: configs/groupfree3d/groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 13.3
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 68.20
+          AP@0.5: 51.02
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256_20210702_200301-944f0ac0.pth
+
+  - Name: groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
+    In Collection: Group-Free-3D
+    Config: configs/groupfree3d/groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 18.8
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 68.22
+          AP@0.5: 52.61
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512_20210702_220204-187b71c7.pth
diff --git a/mmde/configs/h3dnet/README.md b/mmde/configs/h3dnet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..53d91158def47a14a075298ea5fb004d7f898530
--- /dev/null
+++ b/mmde/configs/h3dnet/README.md
@@ -0,0 +1,44 @@
+# H3DNet: 3D Object Detection Using Hybrid Geometric Primitives
+
+> [H3DNet: 3D Object Detection Using Hybrid Geometric Primitives](https://arxiv.org/abs/2006.05682)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We introduce H3DNet, which takes a colorless 3D point cloud as input and outputs a collection of oriented object bounding boxes (or BB) and their semantic labels. The critical idea of H3DNet is to predict a hybrid set of geometric primitives, i.e., BB centers, BB face centers, and BB edge centers. We show how to convert the predicted geometric primitives into object proposals by defining a distance function between an object and the geometric primitives. This distance function enables continuous optimization of object proposals, and its local minimums provide high-fidelity object proposals. H3DNet then utilizes a matching and refinement module to classify object proposals into detected objects and fine-tune the geometric parameters of the detected objects. The hybrid set of geometric primitives not only provides more accurate signals for object detection than using a single type of geometric primitives, but it also provides an overcomplete set of constraints on the resulting 3D layout. Therefore, H3DNet can tolerate outliers in predicted geometric primitives. Our model achieves state-of-the-art 3D detection results on two large datasets with real 3D scans, ScanNet and SUN RGB-D.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/143868884-26f7fc63-93fd-48cb-a469-e2f55fda5550.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement H3DNet and provide the result and checkpoints on ScanNet datasets.
+
+## Results and models
+
+### ScanNet
+
+|                   Backbone                    | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 |                                                                                                                                                             Download                                                                                                                                                             |
+| :-------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [MultiBackbone](./h3dnet_8xb3_scannet-seg.py) |   3x    |   7.9    |                |  66.07  | 47.68  | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/h3dnet/h3dnet_3x8_scannet-3d-18class/h3dnet_3x8_scannet-3d-18class_20210824_003149-414bd304.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/h3dnet/h3dnet_scannet-3d-18class/h3dnet_3x8_scannet-3d-18class_20210824_003149.log.json) |
+
+**Notice**: If your current mmdetection3d version >= 0.6.0, and you are using the checkpoints downloaded from the above links or using checkpoints trained with mmdetection3d version \< 0.6.0, the checkpoints have to be first converted via [tools/model_converters/convert_h3dnet_checkpoints.py](../../tools/model_converters/convert_h3dnet_checkpoints.py):
+
+```
+python ./tools/model_converters/convert_h3dnet_checkpoints.py ${ORIGINAL_CHECKPOINT_PATH} --out=${NEW_CHECKPOINT_PATH}
+```
+
+Then you can use the converted checkpoints following [get_started.md](../../docs/en/get_started.md).
+
+## Citation
+
+```latex
+@inproceedings{zhang2020h3dnet,
+    author = {Zhang, Zaiwei and Sun, Bo and Yang, Haitao and Huang, Qixing},
+    title = {H3DNet: 3D Object Detection Using Hybrid Geometric Primitives},
+    booktitle = {Proceedings of the European Conference on Computer Vision},
+    year = {2020}
+}
+```
diff --git a/mmde/configs/h3dnet/h3dnet_8xb3_scannet-seg.py b/mmde/configs/h3dnet/h3dnet_8xb3_scannet-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9a6e719015f69fffa6ee6bbbdc9f98c15eb7d83
--- /dev/null
+++ b/mmde/configs/h3dnet/h3dnet_8xb3_scannet-seg.py
@@ -0,0 +1,74 @@
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/h3dnet.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    rpn_head=dict(
+        num_classes=18,
+        bbox_coder=dict(
+            type='PartialBinBasedBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=24,
+            with_rot=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]])),
+    roi_head=dict(
+        bbox_head=dict(
+            num_classes=18,
+            bbox_coder=dict(
+                type='PartialBinBasedBBoxCoder',
+                num_sizes=18,
+                num_dir_bins=24,
+                with_rot=False,
+                mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                            [1.876858, 1.8425595, 1.1931566],
+                            [0.61328, 0.6148609, 0.7182701],
+                            [1.3955007, 1.5121545, 0.83443564],
+                            [0.97949594, 1.0675149, 0.6329687],
+                            [0.531663, 0.5955577, 1.7500148],
+                            [0.9624706, 0.72462326, 1.1481868],
+                            [0.83221924, 1.0490936, 1.6875663],
+                            [0.21132214, 0.4206159, 0.5372846],
+                            [1.4440073, 1.8970833, 0.26985747],
+                            [1.0294262, 1.4040797, 0.87554324],
+                            [1.3766412, 0.65521795, 1.6813129],
+                            [0.6650819, 0.71111923, 1.298853],
+                            [0.41999173, 0.37906948, 1.7513971],
+                            [0.59359556, 0.5912492, 0.73919016],
+                            [0.50867593, 0.50656086, 0.30136237],
+                            [1.1511526, 1.0546296, 0.49706793],
+                            [0.47535285, 0.49249494, 0.5802117]]))))
+
+train_dataloader = dict(
+    batch_size=3,
+    num_workers=2,
+)
+
+# yapf:disable
+default_hooks = dict(
+    logger=dict(type='LoggerHook', interval=30)
+)
+# yapf:enable
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (3 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=24)
diff --git a/mmde/configs/h3dnet/metafile.yml b/mmde/configs/h3dnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..93558bc308b282e7c1758efea9aa76ce43a9893f
--- /dev/null
+++ b/mmde/configs/h3dnet/metafile.yml
@@ -0,0 +1,29 @@
+Collections:
+  - Name: H3DNet
+    Metadata:
+      Training Data: ScanNet
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x GeForce GTX 1080 Ti
+      Architecture:
+    Paper:
+      URL: https://arxiv.org/abs/2006.05682
+      Title: 'H3DNet: 3D Object Detection Using Hybrid Geometric Primitives'
+    README: configs/h3dnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/h3dnet.py#L10
+      Version: v0.6.0
+
+Models:
+  - Name: h3dnet_3x8_scannet-3d-18class
+    In Collection: H3DNet
+    Config: configs/h3dnet/h3dnet_8xb3_scannet-seg.py
+    Metadata:
+      Training Memory (GB): 7.9
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 66.07
+          AP@0.5: 47.68
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/h3dnet/h3dnet_3x8_scannet-3d-18class/h3dnet_3x8_scannet-3d-18class_20210824_003149-414bd304.pth
diff --git a/mmde/configs/imvotenet/README.md b/mmde/configs/imvotenet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5d30f02baadd50466b1e12d11a7056cab86a5bc4
--- /dev/null
+++ b/mmde/configs/imvotenet/README.md
@@ -0,0 +1,43 @@
+# ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes
+
+> [ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes](https://arxiv.org/abs/2001.10692)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+3D object detection has seen quick progress thanks to advances in deep learning on point clouds. A few recent works have even shown state-of-the-art performance with just point clouds input (e.g. VOTENET). However, point cloud data have inherent limitations. They are sparse, lack color information and often suffer from sensor noise. Images, on the other hand, have high resolution and rich texture. Thus they can complement the 3D geometry provided by point clouds. Yet how to effectively use image information to assist point cloud based detection is still an open question. In this work, we build on top of VOTENET and propose a 3D detection architecture called IMVOTENET specialized for RGB-D scenes. IMVOTENET is based on fusing 2D votes in images and 3D votes in point clouds. Compared to prior work on multi-modal detection, we explicitly extract both geometric and semantic features from the 2D images. We leverage camera parameters to lift these features to 3D. To improve the synergy of 2D-3D feature fusion, we also propose a multi-tower training scheme. We validate our model on the challenging SUN RGB-D dataset, advancing state-of-the-art results by 5.7 mAP. We also provide rich ablation studies to analyze the contribution of each design choice.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/143869878-a2ae7f43-55c3-4b95-af09-8f97dfd975f4.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement ImVoteNet and provide the result and checkpoints on SUNRGBD.
+
+## Results and models
+
+### SUNRGBD-2D (Stage 1, image branch pre-train)
+
+|                             Backbone                             | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 |                                                                                                                                                                                                              Download                                                                                                                                                                                                              |
+| :--------------------------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [PointNet++](./imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py) |         |   2.1    |                |         | 62.70  | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618-62eba6ce.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618.json) |
+
+### SUNRGBD-3D (Stage 2)
+
+|                       Backbone                       | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 |                                                                                                                                                                                        Download                                                                                                                                                                                        |
+| :--------------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [PointNet++](./imvotenet_stage2_8xb16_sunrgbd-3d.py) |   3x    |   9.4    |                |  64.48  |        | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851-1bcd1b97.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851.log.json) |
+
+## Citation
+
+```latex
+@inproceedings{qi2020imvotenet,
+  title={Imvotenet: Boosting 3D object detection in point clouds with image votes},
+  author={Qi, Charles R and Chen, Xinlei and Litany, Or and Guibas, Leonidas J},
+  booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
+  pages={4404--4413},
+  year={2020}
+}
+```
diff --git a/mmde/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py b/mmde/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5846ad163687fa93025f67617a9f69f88ae65fee
--- /dev/null
+++ b/mmde/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
@@ -0,0 +1,69 @@
+_base_ = [
+    '../_base_/datasets/sunrgbd-3d.py', '../_base_/default_runtime.py',
+    '../_base_/models/imvotenet.py'
+]
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_bbox_3d=False,
+        with_label_3d=False),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
+                (1333, 576), (1333, 600)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='Pack3DDetInputs', keys=['img', 'gt_bboxes', 'gt_bboxes_labels']),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 600), keep_ratio=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=(['img']),
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset', times=1, dataset=dict(pipeline=train_pipeline)))
+
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=8,
+        by_epoch=True,
+        milestones=[6],
+        gamma=0.1)
+]
+val_evaluator = dict(type='Indoor2DMetric')
+test_evaluator = val_evaluator
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'  # noqa
diff --git a/mmde/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py b/mmde/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f2218eac71a065f0c186a49234abc5183af01c1
--- /dev/null
+++ b/mmde/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
@@ -0,0 +1,228 @@
+_base_ = [
+    '../_base_/datasets/sunrgbd-3d.py', '../_base_/schedules/schedule-3x.py',
+    '../_base_/default_runtime.py', '../_base_/models/imvotenet.py'
+]
+
+class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+               'night_stand', 'bookshelf', 'bathtub')
+backend_args = None
+
+model = dict(
+    pts_backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=4,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                     (128, 128, 256)),
+        fp_channels=((256, 256), (256, 256)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    pts_bbox_heads=dict(
+        common=dict(
+            type='VoteHead',
+            num_classes=10,
+            bbox_coder=dict(
+                type='PartialBinBasedBBoxCoder',
+                num_sizes=10,
+                num_dir_bins=12,
+                with_rot=True,
+                mean_sizes=[[2.114256, 1.620300, 0.927272],
+                            [0.791118, 1.279516, 0.718182],
+                            [0.923508, 1.867419, 0.845495],
+                            [0.591958, 0.552978, 0.827272],
+                            [0.699104, 0.454178, 0.75625],
+                            [0.69519, 1.346299, 0.736364],
+                            [0.528526, 1.002642, 1.172878],
+                            [0.500618, 0.632163, 0.683424],
+                            [0.404671, 1.071108, 1.688889],
+                            [0.76584, 1.398258, 0.472728]]),
+            pred_layer_cfg=dict(
+                in_channels=128, shared_conv_channels=(128, 128), bias=True),
+            objectness_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                class_weight=[0.2, 0.8],
+                reduction='sum',
+                loss_weight=5.0),
+            center_loss=dict(
+                type='ChamferDistance',
+                mode='l2',
+                reduction='sum',
+                loss_src_weight=10.0,
+                loss_dst_weight=10.0),
+            dir_class_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                reduction='sum',
+                loss_weight=1.0),
+            dir_res_loss=dict(
+                type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+            size_class_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                reduction='sum',
+                loss_weight=1.0),
+            size_res_loss=dict(
+                type='mmdet.SmoothL1Loss',
+                reduction='sum',
+                loss_weight=10.0 / 3.0),
+            semantic_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                reduction='sum',
+                loss_weight=1.0)),
+        joint=dict(
+            vote_module_cfg=dict(
+                in_channels=512,
+                vote_per_seed=1,
+                gt_per_seed=3,
+                conv_channels=(512, 256),
+                conv_cfg=dict(type='Conv1d'),
+                norm_cfg=dict(type='BN1d'),
+                norm_feats=True,
+                vote_loss=dict(
+                    type='ChamferDistance',
+                    mode='l1',
+                    reduction='none',
+                    loss_dst_weight=10.0)),
+            vote_aggregation_cfg=dict(
+                type='PointSAModule',
+                num_point=256,
+                radius=0.3,
+                num_sample=16,
+                mlp_channels=[512, 128, 128, 128],
+                use_xyz=True,
+                normalize_xyz=True)),
+        pts=dict(
+            vote_module_cfg=dict(
+                in_channels=256,
+                vote_per_seed=1,
+                gt_per_seed=3,
+                conv_channels=(256, 256),
+                conv_cfg=dict(type='Conv1d'),
+                norm_cfg=dict(type='BN1d'),
+                norm_feats=True,
+                vote_loss=dict(
+                    type='ChamferDistance',
+                    mode='l1',
+                    reduction='none',
+                    loss_dst_weight=10.0)),
+            vote_aggregation_cfg=dict(
+                type='PointSAModule',
+                num_point=256,
+                radius=0.3,
+                num_sample=16,
+                mlp_channels=[256, 128, 128, 128],
+                use_xyz=True,
+                normalize_xyz=True)),
+        img=dict(
+            vote_module_cfg=dict(
+                in_channels=256,
+                vote_per_seed=1,
+                gt_per_seed=3,
+                conv_channels=(256, 256),
+                conv_cfg=dict(type='Conv1d'),
+                norm_cfg=dict(type='BN1d'),
+                norm_feats=True,
+                vote_loss=dict(
+                    type='ChamferDistance',
+                    mode='l1',
+                    reduction='none',
+                    loss_dst_weight=10.0)),
+            vote_aggregation_cfg=dict(
+                type='PointSAModule',
+                num_point=256,
+                radius=0.3,
+                num_sample=16,
+                mlp_channels=[256, 128, 128, 128],
+                use_xyz=True,
+                normalize_xyz=True)),
+        loss_weights=[0.4, 0.3, 0.3]),
+    img_mlp=dict(
+        in_channel=18,
+        conv_channels=(256, 256),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        act_cfg=dict(type='ReLU')),
+    fusion_layer=dict(
+        type='VoteFusion',
+        num_classes=len(class_names),
+        max_imvote_per_pixel=3),
+    num_sampled_seed=1024,
+    freeze_img_branch=True,
+
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote')),
+    test_cfg=dict(
+        img_rcnn=dict(score_thr=0.1),
+        pts=dict(
+            sample_mode='seed',
+            nms_thr=0.25,
+            score_thr=0.05,
+            per_class_proposal=True)))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True),
+    dict(type='Resize', scale=(1333, 600), keep_ratio=True),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+    ),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        shift_height=True),
+    dict(type='PointSample', num_points=20000),
+    dict(
+        type='Pack3DDetInputs',
+        keys=([
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'points', 'gt_bboxes_3d',
+            'gt_labels_3d'
+        ]))
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 600), keep_ratio=True),
+    dict(type='PointSample', num_points=20000),
+    dict(type='Pack3DDetInputs', keys=['img', 'points'])
+]
+
+train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
+
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# may also use your own pre-trained image branch
+load_from = 'https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618-62eba6ce.pth'  # noqa
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
+randomness = dict(seed=8)
diff --git a/mmde/configs/imvotenet/metafile.yml b/mmde/configs/imvotenet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..191dd13a0aae41bfe30a2d79a425f6352b4777e5
--- /dev/null
+++ b/mmde/configs/imvotenet/metafile.yml
@@ -0,0 +1,43 @@
+Collections:
+  - Name: ImVoteNet
+    Metadata:
+      Training Data: SUNRGBD
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x TITAN Xp
+      Architecture:
+        - Faster R-CNN
+        - VoteNet
+        - Feature Pyramid Network
+    Paper:
+      URL: https://arxiv.org/abs/2001.10692
+      Title: 'ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes'
+    README: configs/imvotenet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/imvotenet.py#L56
+      Version: v0.12.0
+
+Models:
+  - Name: imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class
+    In Collection: ImVoteNet
+    Config: configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
+    Metadata:
+      Training Memory (GB): 2.1
+    Results:
+      - Task: Object Detection
+        Dataset: SUNRGBD-2D
+        Metrics:
+          AP@0.5: 62.70
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618-62eba6ce.pth
+
+  - Name: imvotenet_stage2_16x8_sunrgbd-3d-10class
+    In Collection: ImVoteNet
+    Config: configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
+    Metadata:
+      Training Memory (GB): 9.4
+    Results:
+      - Task: 3D Object Detection
+        Dataset: SUNRGBD-3D
+        Metrics:
+          AP@0.25: 64.48
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851-1bcd1b97.pth
diff --git a/mmde/configs/imvoxelnet/README.md b/mmde/configs/imvoxelnet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..56c2b55ec28a0bf4f9580b1923123ff031486451
--- /dev/null
+++ b/mmde/configs/imvoxelnet/README.md
@@ -0,0 +1,44 @@
+# ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection
+
+> [ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection](https://arxiv.org/abs/2106.01178)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+In this paper, we introduce the task of multi-view RGB-based 3D object detection as an end-to-end optimization problem. To address this problem, we propose ImVoxelNet, a novel fully convolutional method of 3D object detection based on posed monocular or multi-view RGB images. The number of monocular images in each multiview input can variate during training and inference; actually, this number might be unique for each multi-view input. ImVoxelNet successfully handles both indoor and outdoor scenes, which makes it general-purpose. Specifically, it achieves state-of-the-art results in car detection on KITTI (monocular) and nuScenes (multi-view) benchmarks among all methods that accept RGB images. Moreover, it surpasses existing RGB-based 3D object detection methods on the SUN RGB-D dataset. On ScanNet, ImVoxelNet sets a new benchmark for multi-view 3D object detection.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/143871445-38a55168-b8cd-4520-8ed6-f5c8c8ea304a.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement a monocular 3D detector ImVoxelNet and provide its results and checkpoints on KITTI dataset.
+Results for SUN RGB-D, ScanNet and nuScenes are currently available in ImVoxelNet authors
+[repo](https://github.com/saic-vul/imvoxelnet) (based on mmdetection3d).
+
+## Results and models
+
+### KITTI
+
+|                    Backbone                    | Class | Lr schd | Mem (GB) | Inf time (fps) |  mAP  |                                                                                                                                                               Download                                                                                                                                                               |
+| :--------------------------------------------: | :---: | :-----: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet-50](./imvoxelnet_8xb4_kitti-3d-car.py) |  Car  |   3x    |          |                | 17.26 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014-3d0ffdf4.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014.log.json) |
+
+### SUN RGB-D
+
+|                      Backbone                       | Lr schd | Mem (GB) | Inf time (fps) | mAP@0.25 | mAP@0.5 |                                                                                                                                                                           Download                                                                                                                                                                           |
+| :-------------------------------------------------: | :-----: | :------: | :------------: | :------: | :-----: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet-50](./imvoxelnet_4x2_sunrgbd-3d-10class.py) |   2x    |   7.2    |      22.5      |  40.96   |  13.50  | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x2_sunrgbd-3d-10class/imvoxelnet_4x2_sunrgbd-3d-10class_20220809_184416-29ca7d2e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x2_sunrgbd-3d-10class/imvoxelnet_4x2_sunrgbd-3d-10class_20220809_184416.log.json) |
+
+## Citation
+
+```latex
+@article{rukhovich2021imvoxelnet,
+  title={ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection},
+  author={Danila Rukhovich, Anna Vorontsova, Anton Konushin},
+  journal={arXiv preprint arXiv:2106.01178},
+  year={2021}
+}
+```
diff --git a/mmde/configs/imvoxelnet/imvoxelnet_2xb4_sunrgbd-3d-10class.py b/mmde/configs/imvoxelnet/imvoxelnet_2xb4_sunrgbd-3d-10class.py
new file mode 100644
index 0000000000000000000000000000000000000000..2884f921825cd5d27670e61d4852b02a45c6cb79
--- /dev/null
+++ b/mmde/configs/imvoxelnet/imvoxelnet_2xb4_sunrgbd-3d-10class.py
@@ -0,0 +1,137 @@
+_base_ = [
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+prior_generator = dict(
+    type='AlignedAnchor3DRangeGenerator',
+    ranges=[[-3.2, -0.2, -2.28, 3.2, 6.2, 0.28]],
+    rotations=[.0])
+model = dict(
+    type='ImVoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=4),
+    neck_3d=dict(
+        type='IndoorImVoxelNeck',
+        in_channels=256,
+        out_channels=128,
+        n_blocks=[1, 1, 1]),
+    bbox_head=dict(
+        type='ImVoxelHead',
+        n_classes=10,
+        n_levels=3,
+        n_channels=128,
+        n_reg_outs=7,
+        pts_assign_threshold=27,
+        pts_center_threshold=18,
+        prior_generator=prior_generator),
+    prior_generator=prior_generator,
+    n_voxels=[40, 40, 16],
+    coord_type='DEPTH',
+    train_cfg=dict(),
+    test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
+
+dataset_type = 'SUNRGBDDataset'
+data_root = 'data/sunrgbd/'
+class_names = [
+    'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+    'night_stand', 'bookshelf', 'bathtub'
+]
+metainfo = dict(CLASSES=class_names)
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadAnnotations3D', backend_args=backend_args),
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='RandomResize', scale=[(512, 384), (768, 576)], keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(640, 480), keep_ratio=True),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='sunrgbd_infos_train.pkl',
+            pipeline=train_pipeline,
+            test_mode=False,
+            filter_empty_gt=True,
+            box_type_3d='Depth',
+            metainfo=metainfo,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='sunrgbd_infos_val.pkl',
+        pipeline=test_pipeline,
+        test_mode=True,
+        box_type_3d='Depth',
+        metainfo=metainfo,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='IndoorMetric',
+    ann_file=data_root + 'sunrgbd_infos_val.pkl',
+    metric='bbox')
+test_evaluator = val_evaluator
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
+    clip_grad=dict(max_norm=35., norm_type=2))
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# hooks
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', max_keep_ckpts=1))
+
+# runtime
+find_unused_parameters = True  # only 1 of 4 FPN outputs is used
diff --git a/mmde/configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py b/mmde/configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..df1e9d69ca12b2d0fa99a8e680807ed521a1e091
--- /dev/null
+++ b/mmde/configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py
@@ -0,0 +1,176 @@
+_base_ = [
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='ImVoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=64,
+        num_outs=4),
+    neck_3d=dict(type='OutdoorImVoxelNeck', in_channels=64, out_channels=256),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=1,
+        in_channels=256,
+        feat_channels=256,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-0.16, -39.68, -1.78, 68.96, 39.68, -1.78]],
+            sizes=[[3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=True),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    n_voxels=[216, 248, 12],
+    coord_type='LIDAR',
+    prior_generator=dict(
+        type='AlignedAnchor3DRangeGenerator',
+        ranges=[[-0.16, -39.68, -3.08, 68.96, 39.68, 0.76]],
+        rotations=[.0]),
+    train_cfg=dict(
+        assigner=dict(
+            type='Max3DIoUAssigner',
+            iou_calculator=dict(type='mmdet3d.BboxOverlapsNearest3D'),
+            pos_iou_thr=0.6,
+            neg_iou_thr=0.45,
+            min_pos_iou=0.45,
+            ignore_iof_thr=-1),
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
+
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+input_modality = dict(use_lidar=False, use_camera=True)
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
+metainfo = dict(classes=class_names)
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadAnnotations3D', backend_args=backend_args),
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='RandomResize', scale=[(1173, 352), (1387, 416)],
+        keep_ratio=True),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(type='Resize', scale=(1280, 384), keep_ratio=True),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(img='training/image_2'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='kitti_infos_val.pkl',
+        data_prefix=dict(img='training/image_2'),
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
+    clip_grad=dict(max_norm=35., norm_type=2))
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# hooks
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', max_keep_ckpts=1))
+
+# runtime
+find_unused_parameters = True  # only 1 of 4 FPN outputs is used
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/configs/imvoxelnet/metafile.yml b/mmde/configs/imvoxelnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a15c4cae1d14b5fd010ee143025e67646e169331
--- /dev/null
+++ b/mmde/configs/imvoxelnet/metafile.yml
@@ -0,0 +1,29 @@
+Collections:
+  - Name: ImVoxelNet
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x Tesla P40
+      Architecture:
+        - Anchor3DHead
+    Paper:
+      URL: https://arxiv.org/abs/2106.01178
+      Title: 'ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection'
+    README: configs/imvoxelnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/imvoxelnet.py#L11
+      Version: v0.15.0
+
+Models:
+  - Name: imvoxelnet_kitti-3d-car
+    In Collection: ImVoxelNet
+    Config: configs/imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py
+    Metadata:
+      Training Memory (GB): 15.0
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 17.26
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014-3d0ffdf4.pth
diff --git a/mmde/configs/minkunet/README.md b/mmde/configs/minkunet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6efe23387ef0050b917baf5c3a34f5fa2b22bbea
--- /dev/null
+++ b/mmde/configs/minkunet/README.md
@@ -0,0 +1,53 @@
+# 4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks
+
+> [4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks](https://arxiv.org/abs/1904.08755)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+In many robotics and VR/AR applications, 3D-videos are readily-available sources of input (a continuous sequence of depth images, or LIDAR scans). However, those 3D-videos are processed frame-by-frame either through 2D convnets or 3D perception algorithms. In this work, we propose 4-dimensional convolutional neural networks for spatio-temporal perception that can directly process such 3D-videos using high-dimensional convolutions. For this, we adopt sparse tensors and propose the generalized sparse convolution that encompasses all discrete convolutions. To implement the generalized sparse convolution, we create an open-source auto-differentiation library for sparse tensors that provides extensive functions for high-dimensional convolutional neural networks. We create 4D spatio-temporal convolutional neural networks using the library and validate them on various 3D semantic segmentation benchmarks and proposed 4D datasets for 3D-video perception. To overcome challenges in the 4D space, we propose the hybrid kernel, a special case of the generalized sparse convolution, and the trilateral-stationary conditional random field that enforces spatio-temporal consistency in the 7D space-time-chroma space. Experimentally, we show that convolutional neural networks with only generalized 3D sparse convolutions can outperform 2D or 2D-3D hybrid methods by a large margin. Also, we show that on 3D-videos, 4D spatio-temporal convolutional neural networks are robust to noise, outperform 3D convolutional neural networks and are faster than the 3D counterpart in some cases.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/72679458/225243534-cd0ed738-4224-4e7c-bcac-4f4c8d89f3a9.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement MinkUNet with [TorchSparse](https://github.com/mit-han-lab/torchsparse) / [Minkowski Engine](https://github.com/NVIDIA/MinkowskiEngine) / [Spconv](https://github.com/traveller59/spconv) backend and provide the result and checkpoints on SemanticKITTI datasets.
+
+## Results and models
+
+### SemanticKITTI
+
+|                                            Method                                             |     Backend      | Lr schd | Amp | Laser-Polar Mix | Mem (GB) | Training Time (hours) |  FPS   | mIoU |                                                                                                                                                                          Download                                                                                                                                                                           |
+| :-------------------------------------------------------------------------------------------: | :--------------: | :-----: | :-: | :-------------: | :------: | :-------------------: | :----: | :--: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|         [MinkUNet18-W16](./minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py)          |   torchsparse    |   15e   |  ✔  |        ✗        |   3.4    |           -           |   -    | 60.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w16_8xb2-15e_semantickitti/minkunet_w16_8xb2-15e_semantickitti_20230309_160737-0d8ec25b.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w16_8xb2-15e_semantickitti/minkunet_w16_8xb2-15e_semantickitti_20230309_160737.log) |
+|         [MinkUNet18-W20](./minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py)          |   torchsparse    |   15e   |  ✔  |        ✗        |   3.7    |           -           |   -    | 61.6 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w20_8xb2-15e_semantickitti/minkunet_w20_8xb2-15e_semantickitti_20230309_160718-c3b92e6e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w20_8xb2-15e_semantickitti/minkunet_w20_8xb2-15e_semantickitti_20230309_160718.log) |
+|         [MinkUNet18-W32](./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py)          |   torchsparse    |   15e   |  ✔  |        ✗        |   4.9    |           -           |   -    | 63.1 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w32_8xb2-15e_semantickitti/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w32_8xb2-15e_semantickitti/minkunet_w32_8xb2-15e_semantickitti_20230309_160710.log) |
+|     [MinkUNet34-W32](./minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti.py)     | minkowski engine |   3x    |  ✗  |        ✔        |   11.5   |          6.5          |  12.2  | 69.2 |          [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti_20230514_202236-839847a8.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti_20230514_202236.log)          |
+|    [MinkUNet34-W32](./minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti.py)     |      spconv      |   3x    |  ✔  |        ✔        |   6.7    |           2           | 14.6\* | 68.3 |         [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233152-e0698a0f.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233152.log)         |
+|      [MinkUNet34-W32](./minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti.py)       |      spconv      |   3x    |  ✗  |        ✔        |   10.5   |           6           |  14.5  | 69.3 |             [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti_20230512_233817-72b200d8.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti_20230512_233817.log)             |
+|  [MinkUNet34-W32](./minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py)  |   torchsparse    |   3x    |  ✔  |        ✔        |   6.6    |           3           |  12.8  | 69.3 |    [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233511-bef6cad0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233511.log)    |
+|    [MinkUNet34-W32](./minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py)    |   torchsparse    |   3x    |  ✗  |        ✔        |   11.8   |          5.5          |  15.9  | 68.7 |        [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti_20230512_233601-2b61b0ab.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti_20230512_233601.log)        |
+| [MinkUNet34v2-W32](minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py) |   torchsparse    |   3x    |  ✔  |        ✔        |   8.9    |           -           |   -    | 70.3 |  [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230510_221853-b14a68b3.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230510_221853.log)  |
+
+**Note:** We follow the implementation in SPVNAS original [repo](https://github.com/mit-han-lab/spvnas) and W16\\W20\\W32 indicates different number of channels.
+
+**Note:** Due to TorchSparse backend, the model performance is unstable with TorchSparse backend and may fluctuate by about 1.5 mIoU for different random seeds.
+
+**Note:** Referring to [PCSeg](https://github.com/PJLab-ADG/PCSeg), MinkUNet34v2 is modified based on MinkUNet34.
+
+**Note\*:** Training Time and FPS are measured on NVIDIA A100. The versions of Torchsparse, Minkowski Engine and Spconv are 0.5.4, 1.4.0 and 2.3.6 respectively. Since spconv 2.3.6 has a bug with fp16 on in the inference stage, the actual FPS measurement using fp32.
+
+## Citation
+
+```latex
+@inproceedings{choy20194d,
+  title={4d spatio-temporal convnets: Minkowski convolutional neural networks},
+  author={Choy, Christopher and Gwak, JunYoung and Savarese, Silvio},
+  booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
+  pages={3075--3084},
+  year={2019}
+}
+```
diff --git a/mmde/configs/minkunet/metafile.yml b/mmde/configs/minkunet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f9ae704cd45fdeb1a7b75e1b897922d82bc8eb69
--- /dev/null
+++ b/mmde/configs/minkunet/metafile.yml
@@ -0,0 +1,141 @@
+Collections:
+  - Name: MinkUNet
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Architecture:
+        - MinkUNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.08755
+      Title: '4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural Networks'
+    README: configs/minkunet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/1.1/mmdet3d/models/segmentors/minkunet.py#L13
+      Version: v1.1.0
+
+Models:
+  - Name: minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti
+    In Collection: MinkUNet
+    Config: configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 3.4
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 60.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w16_8xb2-15e_semantickitti/minkunet_w16_8xb2-15e_semantickitti_20230309_160737-0d8ec25b.pth
+
+  - Name: minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti
+    In Collection: MinkUNet
+    Config: configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 3.7
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 61.6
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w20_8xb2-15e_semantickitti/minkunet_w20_8xb2-15e_semantickitti_20230309_160718-c3b92e6e.pth
+
+  - Name: minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti
+    In Collection: MinkUNet
+    Config: configs/minkunet/minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 4.9
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 63.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet_w32_8xb2-15e_semantickitti/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth
+
+  - Name: minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti
+    In Collection: MinkUNet
+    Config: configs/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 11.5
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 69.2
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti_20230514_202236-839847a8.pth
+
+  - Name: minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti
+    In Collection: MinkUNet
+    Config: configs/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 6.7
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 68.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233152-e0698a0f.pth
+
+  - Name: minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti
+    In Collection: MinkUNet
+    Config: configs/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 10.5
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 69.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti_20230512_233817-72b200d8.pth
+
+  - Name: minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti
+    In Collection: MinkUNet
+    Config: configs/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 6.6
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 69.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230512_233511-bef6cad0.pth
+
+  - Name: minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti
+    In Collection: MinkUNet
+    Config: configs/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 11.8
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 68.7
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti_20230512_233601-2b61b0ab.pth
+
+  - Name: minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti
+    In Collection: MinkUNet
+    Config: configs/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 8.9
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIoU: 70.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti_20230510_221853-b14a68b3.pth
diff --git a/mmde/configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py b/mmde/configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..94fe3bf9a46d1ad90fb9edc5964984a0121e83b5
--- /dev/null
+++ b/mmde/configs/minkunet/minkunet18_w16_torchsparse_8xb2-amp-15e_semantickitti.py
@@ -0,0 +1,13 @@
+_base_ = ['./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py']
+
+model = dict(
+    backbone=dict(
+        base_channels=16,
+        encoder_channels=[16, 32, 64, 128],
+        decoder_channels=[128, 64, 48, 48]),
+    decode_head=dict(channels=48))
+
+# NOTE: Due to TorchSparse backend, the model performance is relatively
+# dependent on random seeds, and if random seeds are not specified the
+# model performance will be different (± 1.5 mIoU).
+randomness = dict(seed=1588147245)
diff --git a/mmde/configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py b/mmde/configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb2b3b3d37e396680aebd0109263a546bc13b304
--- /dev/null
+++ b/mmde/configs/minkunet/minkunet18_w20_torchsparse_8xb2-amp-15e_semantickitti.py
@@ -0,0 +1,8 @@
+_base_ = ['./minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py']
+
+model = dict(
+    backbone=dict(
+        base_channels=20,
+        encoder_channels=[20, 40, 81, 163],
+        decoder_channels=[163, 81, 61, 61]),
+    decode_head=dict(channels=61))
diff --git a/mmde/configs/minkunet/minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py b/mmde/configs/minkunet/minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..22c70cf55356a606f1d4243e8a083552c44d0229
--- /dev/null
+++ b/mmde/configs/minkunet/minkunet18_w32_torchsparse_8xb2-amp-15e_semantickitti.py
@@ -0,0 +1,54 @@
+_base_ = [
+    '../_base_/datasets/semantickitti.py', '../_base_/models/minkunet.py',
+    '../_base_/default_runtime.py'
+]
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti'),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[0., 6.28318531],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0],
+    ),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+
+train_dataloader = dict(
+    sampler=dict(seed=0), dataset=dict(pipeline=train_pipeline))
+
+lr = 0.24
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    loss_scale='dynamic',
+    optimizer=dict(
+        type='SGD', lr=lr, weight_decay=0.0001, momentum=0.9, nesterov=True))
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.008, by_epoch=False, begin=0, end=125),
+    dict(
+        type='CosineAnnealingLR',
+        begin=0,
+        T_max=15,
+        by_epoch=True,
+        eta_min=1e-5,
+        convert_to_iter_based=True)
+]
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=15, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
+randomness = dict(seed=0, deterministic=False, diff_rank_seed=True)
+env_cfg = dict(cudnn_benchmark=True)
diff --git a/mmde/configs/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti.py b/mmde/configs/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb6436dba57168c36c65f229e1dfb2e0a7611111
--- /dev/null
+++ b/mmde/configs/minkunet/minkunet34_w32_minkowski_8xb2-laser-polar-mix-3x_semantickitti.py
@@ -0,0 +1,7 @@
+_base_ = [
+    './minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
+]
+
+model = dict(
+    data_preprocessor=dict(batch_first=True),
+    backbone=dict(sparseconv_backend='minkowski'))
diff --git a/mmde/configs/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti.py b/mmde/configs/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..69cc82a8ff93c9c847966c62316c5617e777295a
--- /dev/null
+++ b/mmde/configs/minkunet/minkunet34_w32_spconv_8xb2-amp-laser-polar-mix-3x_semantickitti.py
@@ -0,0 +1,9 @@
+_base_ = [
+    './minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
+]
+
+model = dict(
+    data_preprocessor=dict(batch_first=True),
+    backbone=dict(sparseconv_backend='spconv'))
+
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
diff --git a/mmde/configs/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti.py b/mmde/configs/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a7eff5f7ca0f238846d146a88c91cce7f08bc65
--- /dev/null
+++ b/mmde/configs/minkunet/minkunet34_w32_spconv_8xb2-laser-polar-mix-3x_semantickitti.py
@@ -0,0 +1,7 @@
+_base_ = [
+    './minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
+]
+
+model = dict(
+    data_preprocessor=dict(batch_first=True),
+    backbone=dict(sparseconv_backend='spconv'))
diff --git a/mmde/configs/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py b/mmde/configs/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..1165d7bc385a23fffe2e92fb6a9d79924d91849a
--- /dev/null
+++ b/mmde/configs/minkunet/minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
@@ -0,0 +1,5 @@
+_base_ = [
+    './minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py'
+]
+
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
diff --git a/mmde/configs/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py b/mmde/configs/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..c34d7b649eb8a85fa42ac04e6ff57fda7ffd0bf8
--- /dev/null
+++ b/mmde/configs/minkunet/minkunet34_w32_torchsparse_8xb2-laser-polar-mix-3x_semantickitti.py
@@ -0,0 +1,84 @@
+_base_ = [
+    '../_base_/datasets/semantickitti.py', '../_base_/models/minkunet.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    data_preprocessor=dict(max_voxels=None),
+    backbone=dict(encoder_blocks=[2, 3, 4, 6]))
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti'),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='LaserMix',
+                    num_areas=[3, 4, 5, 6],
+                    pitch_angles=[-25, 3],
+                    pre_transform=[
+                        dict(
+                            type='LoadPointsFromFile',
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type='LoadAnnotations3D',
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type='PointSegClassMapping')
+                    ],
+                    prob=1)
+            ],
+            [
+                dict(
+                    type='PolarMix',
+                    instance_classes=[0, 1, 2, 3, 4, 5, 6, 7],
+                    swap_ratio=0.5,
+                    rotate_paste_ratio=1.0,
+                    pre_transform=[
+                        dict(
+                            type='LoadPointsFromFile',
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type='LoadAnnotations3D',
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type='PointSegClassMapping')
+                    ],
+                    prob=1)
+            ],
+        ],
+        prob=[0.5, 0.5]),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[0., 6.28318531],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0],
+    ),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
diff --git a/mmde/configs/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py b/mmde/configs/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b48e0454c845cb14f6ed15f2964bac3d0755112
--- /dev/null
+++ b/mmde/configs/minkunet/minkunet34v2_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py
@@ -0,0 +1,10 @@
+_base_ = [
+    './minkunet34_w32_torchsparse_8xb2-amp-laser-polar-mix-3x_semantickitti.py'
+]
+
+model = dict(
+    backbone=dict(type='MinkUNetBackboneV2'),
+    decode_head=dict(channels=256 + 128 + 96))
+
+randomness = dict(seed=None, deterministic=False, diff_rank_seed=True)
+env_cfg = dict(cudnn_benchmark=True)
diff --git a/mmde/configs/monoflex/README.md b/mmde/configs/monoflex/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0f402be24c8a02e4d13d3c41853ab96aa462865e
--- /dev/null
+++ b/mmde/configs/monoflex/README.md
@@ -0,0 +1,48 @@
+# Objects are Different: Flexible Monocular 3D Object Detection
+
+> [Objects are Different: Flexible Monocular 3D Object Detection](https://arxiv.org/abs/2104.02323)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+The precise localization of 3D objects from a single image without depth information is a highly challenging problem. Most existing methods adopt the same approach for all objects regardless of their diverse distributions, leading to limited performance for truncated objects. In this paper, we propose a flexible framework for monocular 3D object detection which explicitly decouples the truncated objects and adaptively combines multiple approaches for object depth estimation. Specifically, we decouple the edge of the feature map for predicting long-tail truncated objects so that the optimization of normal objects is not influenced. Furthermore, we formulate the object depth estimation as an uncertainty-guided ensemble of directly regressed object depth and solved depths from different groups of keypoints. Experiments demonstrate that our method outperforms the state-of-the-art method by relatively 27% for the moderate level and 30% for the hard level in the test set of KITTI benchmark while maintaining real-time efficiency.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/153138824-d54a7a47-773f-42f9-8a51-b0a71078593e.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement MonoFlex and provide the results and checkpoints on KITTI dataset.
+
+## Results and models
+
+### KITTI
+
+|                                Backbone                                 | Lr schd | Mem (GB) | Inf time (fps) |  mAP  |                                                                                                                                                               Download                                                                                                                                                               |
+| :---------------------------------------------------------------------: | :-----: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [DLA34](./monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d.py) |   6x    |   9.64   |                | 21.86 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d_20211228_027553-d46d9bb0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d_20211228_027553.log.json) |
+
+Note: mAP represents Car moderate 3D strict AP11 results.
+Detailed performance on KITTI 3D detection (3D/BEV) is as follows, evaluated by AP11 and AP40 metric:
+
+|            |     Easy      |   Moderate    |     Hard      |
+| ---------- | :-----------: | :-----------: | :-----------: |
+| Car (AP11) | 28.02 / 36.11 | 21.86 / 29.46 | 19.01 / 24.83 |
+| Car (AP40) | 23.22 / 32.74 | 17.18 / 24.02 | 15.13 / 20.67 |
+
+Note: mAP represents Car moderate 3D strict AP11 / AP40 results. Because of the limited data for pedestrians and cyclists, the detection performance for these two classes is usually unstable. Therefore, we only list car detection results here. In addition, the AP11 result may fluctuate in a larger range (~1 AP), so AP40 is a more recommended metric for reference due to its much better stability.
+
+## Citation
+
+```latex
+@InProceedings{MonoFlex,
+    author    = {Zhang, Yunpeng and Lu, Jiwen and Zhou, Jie},
+    title     = {Objects Are Different: Flexible Monocular 3D Object Detection},
+    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+    month     = {June},
+    year      = {2021},
+    pages     = {3289-3298}
+}
+```
diff --git a/mmde/configs/monoflex/metafile.yml b/mmde/configs/monoflex/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..36fe1f0af13f0ba431dddade7016b5b9a95c0338
--- /dev/null
+++ b/mmde/configs/monoflex/metafile.yml
@@ -0,0 +1,30 @@
+Collections:
+  - Name: MonoFlex
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - Adam
+      Training Resources: 2x V100 GPUS
+      Architecture:
+        - MonoFlexHead
+        - DLA
+    Paper:
+      URL: https://arxiv.org/abs/2104.02323
+      Title: 'Objects are Different: Flexible Monocular 3D Object Detection'
+    README: configs/monoflex/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/detectors/monoflex.py#L7
+      Version: v1.0.0
+
+Models:
+  - Name: monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d
+    In Collection: MonoFlex
+    Config: configs/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d.py
+    Metadata:
+      Training Memory (GB): 9.64
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 21.86
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/monoflex/monoflex_dla34_pytorch_dlaneck_gn-all_2x4_6x_kitti-mono3d_20211228_027553-d46d9bb0.pth
diff --git a/mmde/configs/mvfcos3d/README.md b/mmde/configs/mvfcos3d/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..934ec1ad6c9fb0a33bd8e14f058190dabaa0f07b
--- /dev/null
+++ b/mmde/configs/mvfcos3d/README.md
@@ -0,0 +1,62 @@
+# MV-FCOS3D++: Multi-View Camera-Only 4D Object Detection with Pretrained Monocular Backbones
+
+> [MV-FCOS3D++: Multi-View} Camera-Only 4D Object Detection with Pretrained Monocular Backbones](https://arxiv.org/abs/2207.12716)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+In this technical report, we present our solution, dubbed MV-FCOS3D++, for the Camera-Only 3D Detection track in Waymo Open Dataset Challenge 2022. For multi-view camera-only 3D detection, methods based on bird-eye-view or 3D geometric representations can leverage the stereo cues from overlapped regions between adjacent views and directly perform 3D detection without hand-crafted post-processing. However, it lacks direct semantic supervision for 2D backbones, which can be complemented by pretraining simple monocular-based detectors. Our solution is a multi-view framework for 4D detection following this paradigm. It is built upon a simple monocular detector FCOS3D++, pretrained only with object annotations of Waymo, and converts multi-view features to a 3D grid space to detect 3D objects thereon. A dual-path neck for single-frame understanding and temporal stereo matching is devised to incorporate multi-frame information. Our method finally achieves 49.75% mAPL with a single model and wins 2nd place in the WOD challenge, without any LiDAR-based depth supervision during training. The code will be released at [this https URL](https://github.com/Tai-Wang/Depth-from-Motion).
+
+<div align=center>
+<img src="https://github.com/open-mmlab/mmdetection3d/assets/72679458/9313eb3c-cc41-40be-9ead-549b3b5fef44" width="800"/>
+</div>
+
+## Introduction
+
+We implement multi-view FCOS3D++ and provide the results on Waymo dataset.
+
+## Usage
+
+### Training commands
+
+1. You should train PGD first:
+
+```bash
+bash tools/dist_train.py configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py 8
+```
+
+2. Given pre-trained PGD backbone, you could train multi-view FCOS3D++:
+
+```bash
+bash tools/dist_train.sh configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py --cfg-options load_from=${PRETRAINED_CHECKPOINT}
+```
+
+**Note**:
+the path of `load_from` needs to be changed to yours accordingly.
+
+## Results and models
+
+### Waymo
+
+|                                Backbone                                | Load Interval | mAPL | mAP  | mAPH |                                                                                             Download                                                                                             |
+| :--------------------------------------------------------------------: | :-----------: | :--: | :--: | :--: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet101+DCN](./multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py) |      5x       | 38.2 | 52.9 | 49.5 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class_20231127_122815.log) |
+|                              above @ Car                               |               | 56.5 | 73.3 | 72.3 |                                                                                                                                                                                                  |
+|                           above @ Pedestrian                           |               | 34.8 | 49.5 | 43.1 |                                                                                                                                                                                                  |
+|                            above @ Cyclist                             |               | 23.2 | 35.9 | 33.3 |                                                                                                                                                                                                  |
+
+**Note**:
+
+Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
+
+## Citation
+
+```latex
+@article{wang2022mvfcos3d++,
+  title={{MV-FCOS3D++: Multi-View} Camera-Only 4D Object Detection with Pretrained Monocular Backbones},
+  author={Wang, Tai and Lian, Qing and Zhu, Chenming and Zhu, Xinge and Zhang, Wenwei},
+  journal={arXiv preprint},
+  year={2022}
+}
+```
diff --git a/mmde/configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py b/mmde/configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..b75a6db5b3896a14821bc8744a22143f2a53cfbc
--- /dev/null
+++ b/mmde/configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py
@@ -0,0 +1,49 @@
+_base_ = [
+    '../_base_/datasets/waymoD5-mv3d-3class.py',
+    '../_base_/models/multiview_dfm.py'
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0005, weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
+    clip_grad=dict(max_norm=35., norm_type=2))
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+# hooks
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+)
+
+# training schedule for 2x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# runtime
+default_scope = 'mmdet3d'
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+find_unused_parameters = True  # only 1 of 4 FPN outputs is used
diff --git a/mmde/configs/mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py b/mmde/configs/mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c0d2b4d56092e1209bb3589010d1b8e7b203ef3
--- /dev/null
+++ b/mmde/configs/mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
@@ -0,0 +1,53 @@
+_base_ = ['./multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py']
+
+model = dict(
+    bbox_head=dict(
+        _delete_=True,
+        type='CenterHead',
+        in_channels=256,
+        tasks=[
+            dict(num_class=1, class_names=['Pedestrian']),
+            dict(num_class=1, class_names=['Cyclist']),
+            dict(num_class=1, class_names=['Car']),
+        ],
+        common_heads=dict(reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2)),
+        share_conv_channel=64,
+        bbox_coder=dict(
+            type='CenterPointBBoxCoder',
+            post_center_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
+            pc_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
+            max_num=2000,
+            score_threshold=0,
+            out_size_factor=1,
+            voxel_size=(.50, .50),
+            code_size=7),
+        separate_head=dict(
+            type='SeparateHead', init_bias=-2.19, final_kernel=3),
+        loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(
+            type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
+        norm_bbox=True),
+    train_cfg=dict(
+        _delete_=True,
+        grid_size=[220, 300, 1],
+        voxel_size=(0.5, 0.5, 6),
+        out_size_factor=1,
+        dense_reg=1,
+        gaussian_overlap=0.1,
+        max_objs=500,
+        min_radius=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        point_cloud_range=[-35.0, -75.0, -2, 75.0, 75.0, 4]),
+    test_cfg=dict(
+        _delete_=True,
+        post_center_limit_range=[-35.0, -75.0, -2, 75.0, 75.0, 4],
+        max_per_img=4096,
+        max_pool_nms=False,
+        min_radius=[0.5, 2, 6],
+        score_threshold=0,
+        out_size_factor=1,
+        voxel_size=(0.5, 0.5),
+        nms_type='circle',
+        pre_max_size=2000,
+        post_max_size=200,
+        nms_thr=0.2))
diff --git a/mmde/configs/mvxnet/README.md b/mmde/configs/mvxnet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b7cadc3e179b6ae91faeeaffcc49815ce8f463aa
--- /dev/null
+++ b/mmde/configs/mvxnet/README.md
@@ -0,0 +1,38 @@
+# MVX-Net: Multimodal VoxelNet for 3D Object Detection
+
+> [MVX-Net: Multimodal VoxelNet for 3D Object Detection](https://arxiv.org/abs/1904.01649)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Many recent works on 3D object detection have focused on designing neural network architectures that can consume point cloud data. While these approaches demonstrate encouraging performance, they are typically based on a single modality and are unable to leverage information from other modalities, such as a camera. Although a few approaches fuse data from different modalities, these methods either use a complicated pipeline to process the modalities sequentially, or perform late-fusion and are unable to learn interaction between different modalities at early stages. In this work, we present PointFusion and VoxelFusion: two simple yet effective early-fusion approaches to combine the RGB and point cloud modalities, by leveraging the recently introduced VoxelNet architecture. Evaluation on the KITTI dataset demonstrates significant improvements in performance over approaches which only use point cloud data. Furthermore, the proposed method provides results competitive with the state-of-the-art multimodal algorithms, achieving top-2 ranking in five of the six bird's eye view and 3D detection categories on the KITTI benchmark, by using a simple single stage network.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/143880819-560675ca-e7e3-4d77-8808-ea661ff8e6e6.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement MVX-Net and provide its results and models on KITTI dataset.
+
+## Results and models
+
+### KITTI
+
+|                              Backbone                               |  Class  |  Lr schd   | Mem (GB) | Inf time (fps) | mAP  |                                                                                                                                                                                                  Download                                                                                                                                                                                                   |
+| :-----------------------------------------------------------------: | :-----: | :--------: | :------: | :------------: | :--: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [SECFPN](./mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py) | 3 Class | cosine 80e |   6.7    |                | 63.5 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class-8963258a.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class-20230424_132228.log) |
+
+## Citation
+
+```latex
+@inproceedings{sindagi2019mvx,
+  title={MVX-Net: Multimodal voxelnet for 3D object detection},
+  author={Sindagi, Vishwanath A and Zhou, Yin and Tuzel, Oncel},
+  booktitle={2019 International Conference on Robotics and Automation (ICRA)},
+  pages={7276--7282},
+  year={2019},
+  organization={IEEE}
+}
+```
diff --git a/mmde/configs/mvxnet/metafile.yml b/mmde/configs/mvxnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..dc2b66283a9222394ce3f31827497c6ccf0f9890
--- /dev/null
+++ b/mmde/configs/mvxnet/metafile.yml
@@ -0,0 +1,31 @@
+Collections:
+  - Name: MVX-Net
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Feature Pyramid Network
+        - Dynamic Voxelization
+    Paper:
+      URL: https://arxiv.org/abs/1904.01649
+      Title: 'MVX-Net: Multimodal VoxelNet for 3D Object Detection'
+    README: configs/mvxnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/mvx_two_stage.py#L20
+      Version: v0.5.0
+
+Models:
+  - Name: dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class
+    Alias: mvxnet_kitti-3class
+    In Collection: MVX-Net
+    Config: configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py
+    Metadata:
+      Training Memory (GB): 6.7
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 63.5
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class-8963258a.pth
diff --git a/mmde/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py b/mmde/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6c750d9f77ecb29395d0bc5cccea9601f3d0f2a
--- /dev/null
+++ b/mmde/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py
@@ -0,0 +1,273 @@
+_base_ = ['../_base_/schedules/cosine.py', '../_base_/default_runtime.py']
+
+# model settings
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+model = dict(
+    type='DynamicMVXFasterRCNN',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_type='dynamic',
+        voxel_layer=dict(
+            max_num_points=-1,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(-1, -1)),
+        mean=[102.9801, 115.9465, 122.7717],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    img_backbone=dict(
+        type='mmdet.ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    img_neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        # make the image features more stable numerically to avoid loss nan
+        norm_cfg=dict(type='BN', requires_grad=False),
+        num_outs=5),
+    pts_voxel_encoder=dict(
+        type='DynamicVFE',
+        in_channels=4,
+        feat_channels=[64, 64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        with_cluster_center=True,
+        with_voxel_center=True,
+        point_cloud_range=point_cloud_range,
+        fusion_layer=dict(
+            type='PointFusion',
+            img_channels=256,
+            pts_channels=64,
+            mid_channels=128,
+            out_channels=128,
+            img_levels=[0, 1, 2, 3, 4],
+            align_corners=False,
+            activate_out=True,
+            fuse_out=False)),
+    pts_middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=128,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act')),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    pts_neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        assigner_per_size=True,
+        diff_rad_by_sin=True,
+        assign_per_class=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.35,
+                    neg_iou_thr=0.2,
+                    min_pos_iou=0.2,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.35,
+                    neg_iou_thr=0.2,
+                    min_pos_iou=0.2,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1),
+            ],
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        pts=dict(
+            use_rotate_nms=True,
+            nms_across_levels=False,
+            nms_thr=0.01,
+            score_thr=0.1,
+            min_bbox_size=0,
+            nms_pre=100,
+            max_num=50)))
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(classes=class_names)
+input_modality = dict(use_lidar=True, use_camera=True)
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='RandomResize', scale=[(640, 192), (2560, 768)], keep_ratio=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.2, 0.2, 0.2]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
+            'gt_labels'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1280, 384),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            # Temporary solution, fix this after refactor the augtest
+            dict(type='Resize', scale=0, keep_ratio=True),
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points', 'img'])
+]
+modality = dict(use_lidar=True, use_camera=True)
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            modality=modality,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(
+                pts='training/velodyne_reduced', img='training/image_2'),
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        modality=modality,
+        ann_file='kitti_infos_val.pkl',
+        data_prefix=dict(
+            pts='training/velodyne_reduced', img='training/image_2'),
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='kitti_infos_val.pkl',
+        modality=modality,
+        data_prefix=dict(
+            pts='training/velodyne_reduced', img='training/image_2'),
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+optim_wrapper = dict(
+    optimizer=dict(weight_decay=0.01),
+    clip_grad=dict(max_norm=35, norm_type=2),
+)
+val_evaluator = dict(
+    type='KittiMetric', ann_file='data/kitti/kitti_infos_val.pkl')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# You may need to download the model first is the network is unstable
+load_from = 'https://download.openmmlab.com/mmdetection3d/pretrain_models/mvx_faster_rcnn_detectron2-caffe_20e_coco-pretrain_gt-sample_kitti-3-class_moderate-79.3_20200207-a4a6a3c7.pth'  # noqa
diff --git a/mmde/configs/nuimages/README.md b/mmde/configs/nuimages/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd96105a9a7ccbb1e05b31e966a1d18aca83d0e4
--- /dev/null
+++ b/mmde/configs/nuimages/README.md
@@ -0,0 +1,69 @@
+# Mask R-CNN
+
+> [Mask R-CNN](https://arxiv.org/abs/1703.06870)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present a conceptually simple, flexible, and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without bells and whistles, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/40661020/143967081-c2552bed-9af2-46c4-ae44-5b3b74e5679f.png"/>
+</div>
+
+## Introduction
+
+We support and provide some baseline results on [nuImages dataset](https://www.nuscenes.org/nuimages).
+We follow the class mapping in nuScenes dataset, which maps the original categories into 10 foreground categories.
+The convert script can be found [here](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/dataset_converters/nuimage_converter.py).
+The baseline results include instance segmentation models, e.g., Mask R-CNN, Cascade Mask R-CNN, and HTC.
+We will support panoptic segmentation models in the future.
+
+![demo image](../../resources/nuimages_demo.gif)
+
+The dataset converted by the script of v0.6.0 only supports instance segmentation. Since v0.7.0, we also support to produce semantic segmentation mask of each image; thus, we can train HTC or semantic segmentation models using the dataset. To convert the nuImages dataset into COCO format, please use the command below:
+
+```shell
+python -u tools/dataset_converters/nuimage_converter.py --data-root ${DATA_ROOT} --version ${VERSIONS} \
+                                                    --out-dir ${OUT_DIR} --nproc ${NUM_WORKERS} --extra-tag ${TAG}
+```
+
+- `--data-root`: the root of the dataset, defaults to `./data/nuimages`.
+- `--version`: the version of the dataset, defaults to `v1.0-mini`. To get the full dataset, please use `--version v1.0-train v1.0-val v1.0-mini`
+- `--out-dir`: the output directory of annotations and semantic masks, defaults to `./data/nuimages/annotations/`.
+- `--nproc`: number of workers for data preparation, defaults to `4`. Larger number could reduce the preparation time as images are processed in parallel.
+- `--extra-tag`: extra tag of the annotations, defaults to `nuimages`. This can be used to separate different annotations processed in different time for study.
+
+## Results and models
+
+### Instance Segmentation
+
+We report Mask R-CNN and Cascade Mask R-CNN results on nuimages.
+
+|       Method       |                                       Backbone                                        | Pretraining | Lr schd | Mem (GB) |                                                                                   Box AP                                                                                   | Mask AP |                                                                                                                                                                                                                        Download                                                                                                                                                                                                                        |
+| :----------------: | :-----------------------------------------------------------------------------------: | :---------: | :-----: | :------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|     Mask R-CNN     |                        [R-50](./mask-rcnn_r50_fpn_1x_nuim.py)                         |     IN      |   1x    |   7.4    |                                                                                    47.8                                                                                    |  38.4   |                                                         [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_1x_nuim/mask_rcnn_r50_fpn_1x_nuim_20201008_195238-e99f5182.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_1x_nuim/mask_rcnn_r50_fpn_1x_nuim_20201008_195238.log.json)                                                         |
+|     Mask R-CNN     |                    [R-50](./mask-rcnn_r50_fpn_coco-2x_1x_nuim.py)                     | IN+COCO-2x  |   1x    |   7.4    |                                                                                    49.7                                                                                    |  40.5   |                                         [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_coco-2x_1x_nuim/mask_rcnn_r50_fpn_coco-2x_1x_nuim_20201008_195238-b1742a60.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_coco-2x_1x_nuim/mask_rcnn_r50_fpn_coco-2x_1x_nuim_20201008_195238.log.json)                                         |
+|     Mask R-CNN     |                  [R-50-CAFFE](./mask-rcnn_r50_caffe_fpn_1x_nuim.py)                   |     IN      |   1x    |   7.0    |                                                                                    47.7                                                                                    |  38.2   |                                                                                                       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_1x_nuim/) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_1x_nuim/)                                                                                                       |
+|     Mask R-CNN     |              [R-50-CAFFE](./mask-rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py)               | IN+COCO-3x  |   1x    |   7.0    |                                                                                    49.9                                                                                    |  40.8   |                             [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim_20201008_195305-661a992e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim_20201008_195305.log.json)                             |
+|     Mask R-CNN     |              [R-50-CAFFE](./mask-rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py)              | IN+COCO-3x  |   20e   |   7.0    |                                                                                    50.6                                                                                    |  41.3   |                           [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim_20201009_125002-5529442c.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim_20201009_125002.log.json)                           |
+|     Mask R-CNN     |                       [R-101](./mask-rcnn_r101_fpn_1x_nuim.py)                        |     IN      |   1x    |   10.9   |                                                                                    48.9                                                                                    |  39.1   |                                                       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r101_fpn_1x_nuim/mask_rcnn_r101_fpn_1x_nuim_20201024_134803-65c7623a.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r101_fpn_1x_nuim/mask_rcnn_r101_fpn_1x_nuim_20201024_134803.log.json)                                                       |
+|     Mask R-CNN     |                 [X-101_32x4d](./mask-rcnn_x101_32x4d_fpn_1x_nuim.py)                  |     IN      |   1x    |   13.3   |                                                                                    50.4                                                                                    |  40.5   |                                           [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_x101_32x4d_fpn_1x_nuim/mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135741-b699ab37.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_x101_32x4d_fpn_1x_nuim/mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135741.log.json)                                           |
+| Cascade Mask R-CNN |                    [R-50](./cascade-mask-rcnn_r50_fpn_1x_nuim.py)                     |     IN      |   1x    |   8.9    |                                                                                    50.8                                                                                    |  40.4   |                                         [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_1x_nuim/cascade_mask_rcnn_r50_fpn_1x_nuim_20201008_195342-1147c036.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_1x_nuim/cascade_mask_rcnn_r50_fpn_1x_nuim_20201008_195342.log.json)                                         |
+| Cascade Mask R-CNN |                [R-50](./cascade-mask-rcnn_r50_fpn_coco-20e_1x_nuim.py)                | IN+COCO-20e |   1x    |   8.9    |                                                                                    52.8                                                                                    |  42.2   |                       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim_20201009_124158-ad0540e3.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim_20201009_124158.log.json)                       |
+| Cascade Mask R-CNN |               [R-50](./cascade-mask-rcnn_r50_fpn_coco-20e_20e_nuim.py)                | IN+COCO-20e |   20e   |   8.9    |                                                                                    52.8                                                                                    |  42.2   |                     [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951.log.json)                     |
+| Cascade Mask R-CNN |                   [R-101](./cascade-mask-rcnn_r101_fpn_1x_nuim.py)                    |     IN      |   1x    |   12.5   |                                                                                    51.5                                                                                    |  40.7   |                                       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804.log.json)                                       |
+| Cascade Mask R-CNN |             [X-101_32x4d](./cascade-mask-rcnn_x101_32x4d_fpn_1x_nuim.py)              |     IN      |   1x    |   14.9   |                                                                                    52.8                                                                                    |  41.6   |                           [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135753-e0e49778.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135753.log.json)                           |
+|  HTC w/o semantic  |                   [R-50](./htc_without_semantic_r50_fpn_1x_nuim.py)                   |     IN      |   1x    |          |                                                                          [model](<>) \| [log](<>)                                                                          |         |                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+|        HTC         |                           [R-50](./htc_r50_fpn_1x_nuim.py)                            |     IN      |   1x    |          | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/) |         |                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+|        HTC         |                       [R-50](./htc_r50_fpn_coco-20e_1x_nuim.py)                       | IN+COCO-20e |   1x    |   11.6   |                                                                                    53.8                                                                                    |  43.8   |                                                   [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_1x_nuim/htc_r50_fpn_coco-20e_1x_nuim_20201010_070203-0b53a65e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_1x_nuim/htc_r50_fpn_coco-20e_1x_nuim_20201010_070203.log.json)                                                   |
+|        HTC         |                      [R-50](./htc_r50_fpn_coco-20e_20e_nuim.py)                       | IN+COCO-20e |   20e   |   11.6   |                                                                                    54.8                                                                                    |  44.4   |                                                 [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_20e_nuim/htc_r50_fpn_coco-20e_20e_nuim_20201008_211415-d6c60a2c.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_20e_nuim/htc_r50_fpn_coco-20e_20e_nuim_20201008_211415.log.json)                                                 |
+|        HTC         | [X-101_64x4d + DCN_c3-c5](./htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim.py) | IN+COCO-20e |   20e   |   13.3   |                                                                                    57.3                                                                                    |  46.4   | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim_20201008_211222-0b16ac4b.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim_20201008_211222.log.json) |
+
+**Note**:
+
+1. `IN` means only using ImageNet pre-trained backbone. `IN+COCO-Nx` and `IN+COCO-Ne` means the backbone is first pre-trained on ImageNet, and then the detector is pre-trained on COCO train2017 dataset by `Nx` and `N` epochs schedules, respectively.
+2. All the training hyper-parameters follow the standard schedules on COCO dataset except that the images are resized from
+   1280 x 720 to 1920 x 1080 (relative ratio 0.8 to 1.2) since the images are in size 1600 x 900.
+3. The class order in the detectors released in v0.6.0 is different from the order in the configs because the bug in the conversion script. This bug has been fixed since v0.7.0 and the models trained by the correct class order are also released. If you used nuImages since v0.6.0, please re-convert the data through the conversion script using the above-mentioned command.
diff --git a/mmde/configs/nuimages/cascade-mask-rcnn-r50-fpn_coco-20e_nuim.py b/mmde/configs/nuimages/cascade-mask-rcnn-r50-fpn_coco-20e_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..64ff2b3e11caf9d44556304976d0b8381d1e6f27
--- /dev/null
+++ b/mmde/configs/nuimages/cascade-mask-rcnn-r50-fpn_coco-20e_nuim.py
@@ -0,0 +1,7 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_1x_nuim.py'
+
+# learning policy
+lr_config = dict(step=[16, 19])
+runner = dict(max_epochs=20)
+
+load_from = 'http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth'  # noqa
diff --git a/mmde/configs/nuimages/cascade-mask-rcnn_r101_fpn_1x_nuim.py b/mmde/configs/nuimages/cascade-mask-rcnn_r101_fpn_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..488359f73810c84a29e0ab0ea3304da785d5104e
--- /dev/null
+++ b/mmde/configs/nuimages/cascade-mask-rcnn_r101_fpn_1x_nuim.py
@@ -0,0 +1,2 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_1x_nuim.py'
+model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101))
diff --git a/mmde/configs/nuimages/cascade-mask-rcnn_r50_fpn_1x_nuim.py b/mmde/configs/nuimages/cascade-mask-rcnn_r50_fpn_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..360d4dbc7f58b02418e102bdea5a3c912e7d621c
--- /dev/null
+++ b/mmde/configs/nuimages/cascade-mask-rcnn_r50_fpn_1x_nuim.py
@@ -0,0 +1,60 @@
+_base_ = [
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/nuim-instance.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=10,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=10,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=10,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_head=dict(num_classes=10)))
diff --git a/mmde/configs/nuimages/cascade-mask-rcnn_r50_fpn_coco-20e-1x_nuim.py b/mmde/configs/nuimages/cascade-mask-rcnn_r50_fpn_coco-20e-1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..3de330be827e69c675fe75f8a19c9edd18f5239d
--- /dev/null
+++ b/mmde/configs/nuimages/cascade-mask-rcnn_r50_fpn_coco-20e-1x_nuim.py
@@ -0,0 +1,3 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_1x_nuim.py'
+
+load_from = 'http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth'  # noqa
diff --git a/mmde/configs/nuimages/cascade-mask-rcnn_x101_32x4d_fpn_1x_nuim.py b/mmde/configs/nuimages/cascade-mask-rcnn_x101_32x4d_fpn_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae0bd486d9ad26816ac3041c19ff9656233ef1cd
--- /dev/null
+++ b/mmde/configs/nuimages/cascade-mask-rcnn_x101_32x4d_fpn_1x_nuim.py
@@ -0,0 +1,13 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_1x_nuim.py'
+model = dict(
+    pretrained='open-mmlab://resnext101_32x4d',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch'))
diff --git a/mmde/configs/nuimages/htc_r50_fpn_1x_nuim.py b/mmde/configs/nuimages/htc_r50_fpn_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c40f7ca2b5449b37ab4d3d2a0364af6fe6224e2
--- /dev/null
+++ b/mmde/configs/nuimages/htc_r50_fpn_1x_nuim.py
@@ -0,0 +1,38 @@
+_base_ = './htc_r50_fpn_head-without-semantic_1x_nuim.py'
+model = dict(
+    roi_head=dict(
+        semantic_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[8]),
+        semantic_head=dict(
+            type='FusedSemanticHead',
+            num_ins=5,
+            fusion_level=1,
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=32,
+            ignore_label=0,
+            loss_weight=0.2)))
+
+data_root = 'data/nuimages/'
+backend_args = None
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
+    dict(
+        type='Resize',
+        img_scale=[(1280, 720), (1920, 1080)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='SegRescale', scale_factor=1 / 8),
+    dict(type='PackDetInputs')
+]
+data = dict(
+    train=dict(
+        seg_prefix=data_root + 'annotations/semantic_masks/',
+        pipeline=train_pipeline))
diff --git a/mmde/configs/nuimages/htc_r50_fpn_coco-20e-1x_nuim.py b/mmde/configs/nuimages/htc_r50_fpn_coco-20e-1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5f60523db8fe4a1da0aeaa2059442c4cd259170
--- /dev/null
+++ b/mmde/configs/nuimages/htc_r50_fpn_coco-20e-1x_nuim.py
@@ -0,0 +1,3 @@
+_base_ = './htc_r50_fpn_1x_nuim.py'
+
+load_from = 'http://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth'  # noqa
diff --git a/mmde/configs/nuimages/htc_r50_fpn_coco-20e_nuim.py b/mmde/configs/nuimages/htc_r50_fpn_coco-20e_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..e08d939dd71cfa9a9a9256ca76c254d8c91fc0dd
--- /dev/null
+++ b/mmde/configs/nuimages/htc_r50_fpn_coco-20e_nuim.py
@@ -0,0 +1,4 @@
+_base_ = './htc_r50_fpn_coco-20e-1x_nuim.py'
+# learning policy
+lr_config = dict(step=[16, 19])
+runner = dict(max_epochs=20)
diff --git a/mmde/configs/nuimages/htc_r50_fpn_head-without-semantic_1x_nuim.py b/mmde/configs/nuimages/htc_r50_fpn_head-without-semantic_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a9ba88ebbb50d07c2b2b5d501b358f03aec7ee7
--- /dev/null
+++ b/mmde/configs/nuimages/htc_r50_fpn_head-without-semantic_1x_nuim.py
@@ -0,0 +1,222 @@
+_base_ = [
+    '../_base_/datasets/nuim-instance.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='HybridTaskCascade',
+    pretrained='torchvision://resnet50',
+    _scope_='mmdet',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='HybridTaskCascadeRoIHead',
+        interleaved=True,
+        mask_info_flow=True,
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=10,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=10,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=10,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=[
+            dict(
+                type='HTCMaskHead',
+                with_conv_res=False,
+                num_convs=4,
+                in_channels=256,
+                conv_out_channels=256,
+                num_classes=10,
+                loss_mask=dict(
+                    type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+            dict(
+                type='HTCMaskHead',
+                num_convs=4,
+                in_channels=256,
+                conv_out_channels=256,
+                num_classes=10,
+                loss_mask=dict(
+                    type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+            dict(
+                type='HTCMaskHead',
+                num_convs=4,
+                in_channels=256,
+                conv_out_channels=256,
+                num_classes=10,
+                loss_mask=dict(
+                    type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))
+        ]),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_across_levels=False,
+            nms_pre=2000,
+            nms_post=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=[
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_across_levels=False,
+            nms_pre=1000,
+            nms_post=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.001,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/mmde/configs/nuimages/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e-1xb16_nuim.py b/mmde/configs/nuimages/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e-1xb16_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ab095a881ad3f8ae0740916578291ce8aa8372c
--- /dev/null
+++ b/mmde/configs/nuimages/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e-1xb16_nuim.py
@@ -0,0 +1,23 @@
+_base_ = './htc_r50_fpn_1x_nuim.py'
+model = dict(
+    pretrained='open-mmlab://resnext101_64x4d',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
+
+data = dict(samples_per_gpu=1, workers_per_gpu=1)
+# learning policy
+lr_config = dict(step=[16, 19])
+runner = dict(max_epochs=20)
+
+load_from = 'http://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth'  # noqa
diff --git a/mmde/configs/nuimages/mask-rcnn_r101_fpn_1x_nuim.py b/mmde/configs/nuimages/mask-rcnn_r101_fpn_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce6f32974f5bb1b55963438ed799e38c6ff0afe
--- /dev/null
+++ b/mmde/configs/nuimages/mask-rcnn_r101_fpn_1x_nuim.py
@@ -0,0 +1,2 @@
+_base_ = './mask-rcnn_r50_fpn_1x_nuim.py'
+model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101))
diff --git a/mmde/configs/nuimages/mask-rcnn_r50_caffe_fpn_1x_nuim.py b/mmde/configs/nuimages/mask-rcnn_r50_caffe_fpn_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d67f14346f53e198dbca6474f1c643cc6ca00d6
--- /dev/null
+++ b/mmde/configs/nuimages/mask-rcnn_r50_caffe_fpn_1x_nuim.py
@@ -0,0 +1,41 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/nuim-instance.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    pretrained='open-mmlab://detectron2/resnet50_caffe',
+    backbone=dict(norm_cfg=dict(requires_grad=False), style='caffe'),
+    roi_head=dict(
+        bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
+backend_args = None
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1280, 720), (1920, 1080)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PackDetInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1600, 900),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor')),
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/mmde/configs/nuimages/mask-rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py b/mmde/configs/nuimages/mask-rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..1be657cf64ccb359b3938ec5aa59e9a3deeac706
--- /dev/null
+++ b/mmde/configs/nuimages/mask-rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py
@@ -0,0 +1,43 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/nuim-instance.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    pretrained='open-mmlab://detectron2/resnet50_caffe',
+    backbone=dict(norm_cfg=dict(requires_grad=False), style='caffe'),
+    roi_head=dict(
+        bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
+backend_args = None
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1280, 720), (1920, 1080)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PackDetInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1600, 900),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor')),
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'  # noqa
diff --git a/mmde/configs/nuimages/mask-rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py b/mmde/configs/nuimages/mask-rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..c41d6980a7187385927fb5a8544ad115c32d55c4
--- /dev/null
+++ b/mmde/configs/nuimages/mask-rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py
@@ -0,0 +1,47 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/nuim-instance.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    pretrained='open-mmlab://detectron2/resnet50_caffe',
+    backbone=dict(norm_cfg=dict(requires_grad=False), style='caffe'),
+    roi_head=dict(
+        bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
+backend_args = None
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1280, 720), (1920, 1080)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PackDetInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1600, 900),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor')),
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
+
+# learning policy
+lr_config = dict(step=[16, 19])
+runner = dict(max_epochs=20)
+
+load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'  # noqa
diff --git a/mmde/configs/nuimages/mask-rcnn_r50_fpn_1x_nuim.py b/mmde/configs/nuimages/mask-rcnn_r50_fpn_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fc8925f49a0bce83e78b93957cab18629b1d5e5
--- /dev/null
+++ b/mmde/configs/nuimages/mask-rcnn_r50_fpn_1x_nuim.py
@@ -0,0 +1,8 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/nuim-instance.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
diff --git a/mmde/configs/nuimages/mask-rcnn_r50_fpn_coco-2x_1x_nuim.py b/mmde/configs/nuimages/mask-rcnn_r50_fpn_coco-2x_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..701101ee93f777ce54ef4d4bc90e11be6c07d26e
--- /dev/null
+++ b/mmde/configs/nuimages/mask-rcnn_r50_fpn_coco-2x_1x_nuim.py
@@ -0,0 +1,9 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/nuim-instance.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth'  # noqa
diff --git a/mmde/configs/nuimages/mask-rcnn_r50_fpn_coco-2x_1x_nus-2d.py b/mmde/configs/nuimages/mask-rcnn_r50_fpn_coco-2x_1x_nus-2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9886cef3bced61c8d27b4845552e91e16dbe8b
--- /dev/null
+++ b/mmde/configs/nuimages/mask-rcnn_r50_fpn_coco-2x_1x_nus-2d.py
@@ -0,0 +1,32 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/nuim-instance.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
+
+backend_args = None
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1600, 900),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor')),
+]
+data_root = 'data/nuimages/'
+# data = dict(
+#     val=dict(
+#         ann_file=data_root + 'annotations/nuimages_v1.0-mini.json'),
+#     test=dict(
+#         ann_file=data_root + 'annotations/nuimages_v1.0-mini.json'))
diff --git a/mmde/configs/nuimages/mask-rcnn_x101_32x4d_fpn_1x_nuim.py b/mmde/configs/nuimages/mask-rcnn_x101_32x4d_fpn_1x_nuim.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fd833de9abe5bda2b6df172e5ce1c69ec600c27
--- /dev/null
+++ b/mmde/configs/nuimages/mask-rcnn_x101_32x4d_fpn_1x_nuim.py
@@ -0,0 +1,13 @@
+_base_ = './mask-rcnn_r50_fpn_1x_nuim.py'
+model = dict(
+    pretrained='open-mmlab://resnext101_32x4d',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch'))
diff --git a/mmde/configs/nuimages/metafile.yml b/mmde/configs/nuimages/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f817f142e203c3c4b05ef9717da1679a2069d108
--- /dev/null
+++ b/mmde/configs/nuimages/metafile.yml
@@ -0,0 +1,279 @@
+Collections:
+  - Name: Mask R-CNN
+    Metadata:
+      Training Data: nuImages
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x TITAN Xp
+      Architecture:
+        - Softmax
+        - RPN
+        - Convolution
+        - Dense Connections
+        - FPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/1703.06870v3
+      Title: "Mask R-CNN"
+    README: configs/nuimages/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/mask_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: mask-rcnn_r50_fpn_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/mask-rcnn_r50_fpn_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 47.8
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 38.4
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_1x_nuim/mask_rcnn_r50_fpn_1x_nuim_20201008_195238-e99f5182.pth
+
+  - Name: mask-rcnn_r50_fpn_coco-2x_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/mask-rcnn_r50_fpn_coco-2x_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 49.7
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_coco-2x_1x_nuim/mask_rcnn_r50_fpn_coco-2x_1x_nuim_20201008_195238-b1742a60.pth
+
+  - Name: mask-rcnn_r50_caffe_fpn_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/mask-rcnn_r50_caffe_fpn_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 7.0
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 47.7
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 38.2
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_1x_nuim/
+
+  - Name: mask-rcnn_r50_caffe_fpn_coco-3x_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/mask-rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 7.0
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 49.9
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim_20201008_195305-661a992e.pth
+
+  - Name: mask-rcnn_r50_caffe_fpn_coco-3x_20e_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/mask-rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py
+    Metadata:
+      Training Memory (GB): 7.0
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 50.6
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 41.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim_20201009_125002-5529442c.pth
+
+  - Name: mask-rcnn_r101_fpn_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/mask-rcnn_r101_fpn_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 10.9
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 48.9
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r101_fpn_1x_nuim/mask_rcnn_r101_fpn_1x_nuim_20201024_134803-65c7623a.pth
+
+  - Name: mask-rcnn_x101_32x4d_fpn_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/mask-rcnn_x101_32x4d_fpn_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 13.3
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 50.4
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_x101_32x4d_fpn_1x_nuim/mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135741-b699ab37.pth
+
+  - Name: cascade-mask-rcnn_r50_fpn_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/cascade-mask-rcnn_r50_fpn_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 8.9
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 50.8
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_1x_nuim/cascade_mask_rcnn_r50_fpn_1x_nuim_20201008_195342-1147c036.pth
+
+  - Name: cascade-mask-rcnn_r50_fpn_coco-20e_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/cascade-mask-rcnn_r50_fpn_coco-20e_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 8.9
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 52.8
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 42.2
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim_20201009_124158-ad0540e3.pth
+
+  - Name: cascade-mask-rcnn_r50_fpn_coco-20e_20e_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/cascade-mask-rcnn_r50_fpn_coco-20e_20e_nuim.py
+    Metadata:
+      Training Memory (GB): 8.9
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 52.8
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 42.2
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth
+
+  - Name: cascade-mask-rcnn_r101_fpn_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/cascade-mask-rcnn_r101_fpn_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 12.5
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 51.5
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth
+
+  - Name: cascade-mask-rcnn_x101_32x4d_fpn_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/cascade-mask-rcnn_x101_32x4d_fpn_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 14.9
+      Training Resources: 8x TITAN Xp
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 52.8
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135753-e0e49778.pth
+
+  - Name: htc_r50_fpn_coco-20e_1x_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/htc_r50_fpn_coco-20e_1x_nuim.py
+    Metadata:
+      Training Memory (GB): 11.6
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 53.8
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 43.8
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_1x_nuim/htc_r50_fpn_coco-20e_1x_nuim_20201010_070203-0b53a65e.pth
+
+  - Name: htc_r50_fpn_coco-20e_20e_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/htc_r50_fpn_coco-20e_20e_nuim.py
+    Metadata:
+      Training Memory (GB): 11.6
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 54.8
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 44.4
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_20e_nuim/htc_r50_fpn_coco-20e_20e_nuim_20201008_211415-d6c60a2c.pth
+
+  - Name: htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim
+    In Collection: Mask R-CNN
+    Config: configs/nuimages/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim.py
+    Metadata:
+      Training Memory (GB): 13.3
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: Object Detection
+        Dataset: nuImages
+        Metrics:
+          Box AP: 57.3
+      - Task: Instance Segmentation
+        Dataset: nuImages
+        Metrics:
+          Mask AP: 46.4
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim_20201008_211222-0b16ac4b.pth
diff --git a/mmde/configs/paconv/README.md b/mmde/configs/paconv/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..96063a9a384a7bf5a03bda4745226b8e7518ce3b
--- /dev/null
+++ b/mmde/configs/paconv/README.md
@@ -0,0 +1,51 @@
+# PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds
+
+> [PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds](https://arxiv.org/abs/2103.14635)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We introduce Position Adaptive Convolution (PAConv), a generic convolution operation for 3D point cloud processing. The key of PAConv is to construct the convolution kernel by dynamically assembling basic weight matrices stored in Weight Bank, where the coefficients of these weight matrices are self-adaptively learned from point positions through ScoreNet. In this way, the kernel is built in a data-driven manner, endowing PAConv with more flexibility than 2D convolutions to better handle the irregular and unordered point cloud data. Besides, the complexity of the learning process is reduced by combining weight matrices instead of brutally predicting kernels from point positions.
+Furthermore, different from the existing point convolution operators whose network architectures are often heavily engineered, we integrate our PAConv into classical MLP-based point cloud pipelines without changing network configurations. Even built on simple networks, our method still approaches or even surpasses the state-of-the-art models, and significantly improves baseline performance on both classification and segmentation tasks, yet with decent efficiency. Thorough ablation studies and visualizations are provided to understand PAConv.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/143881915-003d5f10-3999-474e-969a-c354cb738a11.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement PAConv and provide the result and checkpoints on S3DIS dataset.
+
+**Notice**: The original PAConv paper used step learning rate schedule. We discovered that cosine schedule achieves slightly better results and adopt it in our implementations.
+
+## Results and models
+
+### S3DIS
+
+|                              Method                               | Split  |   Lr schd   | Mem (GB) | Inf time (fps) | mIoU (Val set) |                                                                                                                                                                                                             Download                                                                                                                                                                                                             |
+| :---------------------------------------------------------------: | :----: | :---------: | :------: | :------------: | :------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    [PAConv (SSG)](./paconv_ssg_8xb8-cosine-150e_s3dis-seg.py)     | Area_5 | cosine 150e |   5.8    |                |     66.65      |           [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/paconv/paconv_ssg_8x8_cosine_150e_s3dis_seg-3d-13class/paconv_ssg_8x8_cosine_150e_s3dis_seg-3d-13class_20210729_200615-2147b2d1.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/paconv/paconv_ssg_8x8_cosine_150e_s3dis_seg-3d-13class/paconv_ssg_8x8_cosine_150e_s3dis_seg-3d-13class_20210729_200615.log.json)           |
+| [PAConv\* (SSG)](./paconv_ssg-cuda_8xb8-cosine-200e_s3dis-seg.py) | Area_5 | cosine 200e |   3.8    |                |     65.33      | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/paconv/paconv_cuda_ssg_8x8_cosine_200e_s3dis_seg-3d-13class/paconv_cuda_ssg_8x8_cosine_200e_s3dis_seg-3d-13class_20210802_171802-e5ea9bb9.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/paconv/paconv_cuda_ssg_8x8_cosine_200e_s3dis_seg-3d-13class/paconv_cuda_ssg_8x8_cosine_200e_s3dis_seg-3d-13class_20210802_171802.log.json) |
+
+**Notes:**
+
+- We use XYZ+Color+Normalized_XYZ as input in all the experiments on S3DIS datasets.
+- `Area_5` Split means training the model on Area_1, 2, 3, 4, 6 and testing on Area_5.
+- PAConv\* stands for the CUDA implementation of PAConv operations. See the [paper](https://arxiv.org/pdf/2103.14635.pdf) appendix section D for more details. In our experiments, the training of PAConv\* is found to be very unstable. We achieved slightly lower mIoU than the result in the paper, but is consistent with the result obtained by running their [official code](https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg). Besides, although the GPU memory consumption of PAConv\* is significantly lower than PAConv, its training and inference speed are actually slower (by ~10%).
+
+## Indeterminism
+
+Since PAConv testing adopts sliding patch inference which involves random point sampling, and the test script uses fixed random seeds while the random seeds of validation in training are not fixed, the test results may be slightly different from the results reported above.
+
+## Citation
+
+```latex
+@inproceedings{xu2021paconv,
+  title={PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds},
+  author={Xu, Mutian and Ding, Runyu and Zhao, Hengshuang and Qi, Xiaojuan},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={3173--3182},
+  year={2021}
+}
+```
diff --git a/mmde/configs/paconv/metafile.yml b/mmde/configs/paconv/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..558ab867a4b8f5b14b3fee0f3db09dcb7f926040
--- /dev/null
+++ b/mmde/configs/paconv/metafile.yml
@@ -0,0 +1,42 @@
+Collections:
+  - Name: PAConv
+    Metadata:
+      Training Techniques:
+        - SGD
+      Training Resources: 8x Titan XP GPUs
+      Architecture:
+        - PAConv
+    Paper:
+      URL: https://arxiv.org/abs/2103.14635
+      Title: 'PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds'
+    README: configs/paconv/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/ops/paconv/paconv.py#L106
+      Version: v0.16.0
+
+Models:
+  - Name: paconv_ssg_8xb8-cosine-150e_s3dis-seg.py
+    In Collection: PAConv
+    Config: configs/paconv/paconv_ssg_8xb8-cosine-150e_s3dis-seg.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 5.8
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS
+        Metrics:
+          mIoU: 66.65
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/paconv/paconv_ssg_8x8_cosine_150e_s3dis_seg-3d-13class/paconv_ssg_8x8_cosine_150e_s3dis_seg-3d-13class_20210729_200615-2147b2d1.pth
+
+  - Name: paconv_ssg-cuda_8xb8-cosine-200e_s3dis-seg
+    In Collection: PAConv
+    Config: configs/paconv/paconv_ssg-cuda_8xb8-cosine-200e_s3dis-seg.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 5.8
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS
+        Metrics:
+          mIoU: 66.65
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/paconv/paconv_cuda_ssg_8x8_cosine_200e_s3dis_seg-3d-13class/paconv_cuda_ssg_8x8_cosine_200e_s3dis_seg-3d-13class_20210802_171802-e5ea9bb9.pth
diff --git a/mmde/configs/paconv/paconv_ssg-cuda_8xb8-cosine-200e_s3dis-seg.py b/mmde/configs/paconv/paconv_ssg-cuda_8xb8-cosine-200e_s3dis-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9b8cdc1b39ddc1fbceaa473993e51123c9b8c44
--- /dev/null
+++ b/mmde/configs/paconv/paconv_ssg-cuda_8xb8-cosine-200e_s3dis-seg.py
@@ -0,0 +1,64 @@
+_base_ = [
+    '../_base_/datasets/s3dis-seg.py', '../_base_/models/paconv_ssg-cuda.py',
+    '../_base_/schedules/seg-cosine-150e.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    decode_head=dict(
+        num_classes=13, ignore_index=13,
+        loss_decode=dict(class_weight=None)),  # S3DIS doesn't use class_weight
+    test_cfg=dict(
+        num_points=4096,
+        block_size=1.0,
+        sample_rate=0.5,
+        use_normalized_coord=True,
+        batch_size=12))
+
+# data settings
+num_points = 4096
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.0,
+        use_normalized_coord=True,
+        num_try=10000,
+        enlarge_size=None,
+        min_unique_num=num_points // 4,
+        eps=0.0),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[0.0, 6.283185307179586],  # [0, 2 * pi]
+        scale_ratio_range=[0.8, 1.2],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomJitterPoints',
+        jitter_std=[0.01, 0.01, 0.01],
+        clip_range=[-0.05, 0.05]),
+    dict(type='RandomDropPointsColor', drop_ratio=0.2),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+
+train_dataloader = dict(batch_size=8, dataset=dict(pipeline=train_pipeline))
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=200, val_interval=1)
diff --git a/mmde/configs/paconv/paconv_ssg_8xb8-cosine-150e_s3dis-seg.py b/mmde/configs/paconv/paconv_ssg_8xb8-cosine-150e_s3dis-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..da29b89483278b127c40dd4c434ff7872b5120c9
--- /dev/null
+++ b/mmde/configs/paconv/paconv_ssg_8xb8-cosine-150e_s3dis-seg.py
@@ -0,0 +1,61 @@
+_base_ = [
+    '../_base_/datasets/s3dis-seg.py', '../_base_/models/paconv_ssg.py',
+    '../_base_/schedules/seg-cosine-150e.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    decode_head=dict(
+        num_classes=13, ignore_index=13,
+        loss_decode=dict(class_weight=None)),  # S3DIS doesn't use class_weight
+    test_cfg=dict(
+        num_points=4096,
+        block_size=1.0,
+        sample_rate=0.5,
+        use_normalized_coord=True,
+        batch_size=12))
+
+# data settings
+num_points = 4096
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.0,
+        use_normalized_coord=True,
+        num_try=10000,
+        enlarge_size=None,
+        min_unique_num=num_points // 4,
+        eps=0.0),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[0.0, 6.283185307179586],  # [0, 2 * pi]
+        scale_ratio_range=[0.8, 1.2],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomJitterPoints',
+        jitter_std=[0.01, 0.01, 0.01],
+        clip_range=[-0.05, 0.05]),
+    dict(type='RandomDropPointsColor', drop_ratio=0.2),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+
+train_dataloader = dict(batch_size=8, dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/configs/parta2/README.md b/mmde/configs/parta2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..716c73e8a72a294b710a61940d97a1ca6487cfe1
--- /dev/null
+++ b/mmde/configs/parta2/README.md
@@ -0,0 +1,38 @@
+# From Points to Parts: 3D Object Detection from Point Cloud with Part-aware and Part-aggregation Network
+
+> [From Points to Parts: 3D Object Detection from Point Cloud with Part-aware and Part-aggregation Network](https://arxiv.org/abs/1907.03670)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+3D object detection from LiDAR point cloud is a challenging problem in 3D scene understanding and has many practical applications. In this paper, we extend our preliminary work PointRCNN to a novel and strong point-cloud-based 3D object detection framework, the part-aware and aggregation neural network (Part-A2 net). The whole framework consists of the part-aware stage and the part-aggregation stage. Firstly, the part-aware stage for the first time fully utilizes free-of-charge part supervisions derived from 3D ground-truth boxes to simultaneously predict high quality 3D proposals and accurate intra-object part locations. The predicted intra-object part locations within the same proposal are grouped by our new-designed RoI-aware point cloud pooling module, which results in an effective representation to encode the geometry-specific features of each 3D proposal. Then the part-aggregation stage learns to re-score the box and refine the box location by exploring the spatial relationship of the pooled intra-object part locations. Extensive experiments are conducted to demonstrate the performance improvements from each component of our proposed framework. Our Part-A2 net outperforms all existing 3D detection methods and achieves new state-of-the-art on KITTI 3D object detection dataset by utilizing only the LiDAR point cloud data.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/143882774-6fc5f736-10d1-499a-8929-ca0768419049.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement Part-A^2 and provide its results and checkpoints on KITTI dataset.
+
+## Results and models
+
+### KITTI
+
+|                            Backbone                             |  Class  |  Lr schd   | Mem (GB) | Inf time (fps) |  mAP  |                                                                                                                                                                                                   Download                                                                                                                                                                                                   |
+| :-------------------------------------------------------------: | :-----: | :--------: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [SECFPN](./parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py) | 3 Class | cyclic 80e |   4.1    |                | 68.33 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20210831_022017-454a5344.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20210831_022017.log.json) |
+|  [SECFPN](./parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-car.py)   |   Car   | cyclic 80e |   4.0    |                | 79.08 |       [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20210831_022017-cb7ff621.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20210831_022017.log.json)       |
+
+## Citation
+
+```latex
+@article{shi2020points,
+  title={From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network},
+  author={Shi, Shaoshuai and Wang, Zhe and Shi, Jianping and Wang, Xiaogang and Li, Hongsheng},
+  journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
+  year={2020},
+  publisher={IEEE}
+}
+```
diff --git a/mmde/configs/parta2/metafile.yml b/mmde/configs/parta2/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ac68c62aa1559f20cfdb959d9da6542b404beaf5
--- /dev/null
+++ b/mmde/configs/parta2/metafile.yml
@@ -0,0 +1,41 @@
+Collections:
+  - Name: Part-A^2
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Sparse U-Net
+    Paper:
+      URL: https://arxiv.org/abs/1907.03670
+      Title: 'From Points to Parts: 3D Object Detection from Point Cloud with Part-aware and Part-aggregation Network'
+    README: configs/parta2/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/parta2.py#L12
+      Version: v0.5.0
+
+Models:
+  - Name: parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class
+    In Collection: Part-A^2
+    Config: configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py
+    Metadata:
+      Training Memory (GB): 4.1
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 68.33
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20210831_022017-454a5344.pth
+
+  - Name: parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-car
+    In Collection: Part-A^2
+    Config: configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-car.py
+    Metadata:
+      Training Memory (GB): 4.0
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 79.08
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20210831_022017-cb7ff621.pth
diff --git a/mmde/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py b/mmde/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..f22e133b7b2e02d4a0a8be747a2889379a468379
--- /dev/null
+++ b/mmde/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py
@@ -0,0 +1,160 @@
+_base_ = [
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py',
+    '../_base_/models/parta2.py'
+]
+
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+input_modality = dict(use_lidar=True, use_camera=False)
+backend_args = None
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+    classes=class_names,
+    sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(pts='training/velodyne_reduced'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            metainfo=dict(classes=class_names),
+            box_type_3d='LiDAR',
+            test_mode=False,
+            backend_args=backend_args)))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='kitti_infos_val.pkl',
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        pipeline=test_pipeline,
+        modality=input_modality,
+        metainfo=dict(classes=class_names),
+        box_type_3d='LiDAR',
+        test_mode=True,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='kitti_infos_val.pkl',
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        metainfo=dict(classes=class_names),
+        box_type_3d='LiDAR',
+        test_mode=True,
+        backend_args=backend_args))
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+# Part-A2 uses a different learning rate from what SECOND uses.
+optim_wrapper = dict(optimizer=dict(lr=0.001))
+find_unused_parameters = True
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-car.py b/mmde/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..239cd0c6db6f5667455e7d90a147bac8f16beea6
--- /dev/null
+++ b/mmde/configs/parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-car.py
@@ -0,0 +1,154 @@
+_base_ = './parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py'
+
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]  # velodyne coordinates, x, y, z
+
+model = dict(
+    rpn_head=dict(
+        type='PartA2RPNHead',
+        num_classes=1,
+        anchor_generator=dict(
+            _delete_=True,
+            type='Anchor3DRangeGenerator',
+            ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
+            sizes=[[3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False)),
+    roi_head=dict(
+        num_classes=1,
+        semantic_head=dict(num_classes=1),
+        bbox_head=dict(num_classes=1)),
+    # model training and testing settings
+    train_cfg=dict(
+        _delete_=True,
+        rpn=dict(
+            assigner=dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=9000,
+            nms_post=512,
+            max_num=512,
+            nms_thr=0.8,
+            score_thr=0,
+            use_rotate_nms=False),
+        rcnn=dict(
+            assigner=dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
+                pos_iou_thr=0.55,
+                neg_iou_thr=0.55,
+                min_pos_iou=0.55,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='IoUNegPiecewiseSampler',
+                num=128,
+                pos_fraction=0.55,
+                neg_piece_fractions=[0.8, 0.2],
+                neg_iou_piece_thrs=[0.55, 0.1],
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False,
+                return_iou=True),
+            cls_pos_thr=0.75,
+            cls_neg_thr=0.25)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1024,
+            nms_post=100,
+            max_num=100,
+            nms_thr=0.7,
+            score_thr=0,
+            use_rotate_nms=True),
+        rcnn=dict(
+            use_rotate_nms=True,
+            use_raw_score=True,
+            nms_thr=0.01,
+            score_thr=0.1)))
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+input_modality = dict(use_lidar=True, use_camera=False)
+backend_args = None
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        dataset=dict(
+            pipeline=train_pipeline, metainfo=dict(classes=class_names))))
+test_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+val_dataloader = dict(dataset=dict(metainfo=dict(classes=class_names)))
+find_unused_parameters = True
diff --git a/mmde/configs/pgd/README.md b/mmde/configs/pgd/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6c41522b3d3bb1dd9498f1b6d3b192518429c8cd
--- /dev/null
+++ b/mmde/configs/pgd/README.md
@@ -0,0 +1,86 @@
+# Probabilistic and Geometric Depth: Detecting Objects in Perspective
+
+> [Probabilistic and Geometric Depth: Detecting Objects in Perspective](https://arxiv.org/abs/2107.14160)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+3D object detection is an important capability needed in various practical applications such as driver assistance systems. Monocular 3D detection, as a representative general setting among image-based approaches, provides a more economical solution than conventional settings relying on LiDARs but still yields unsatisfactory results. This paper first presents a systematic study on this problem. We observe that the current monocular 3D detection can be simplified as an instance depth estimation problem: The inaccurate instance depth blocks all the other 3D attribute predictions from improving the overall detection performance. Moreover, recent methods directly estimate the depth based on isolated instances or pixels while ignoring the geometric relations across different objects. To this end, we construct geometric relation graphs across predicted objects and use the graph to facilitate depth estimation. As the preliminary depth estimation of each instance is usually inaccurate in this ill-posed setting, we incorporate a probabilistic representation to capture the uncertainty. It provides an important indicator to identify confident predictions and further guide the depth propagation. Despite the simplicity of the basic idea, our method, PGD, obtains significant improvements on KITTI and nuScenes benchmarks, achieving 1st place out of all monocular vision-only methods while still maintaining real-time efficiency. Code and models will be released at [this https URL](https://github.com/open-mmlab/mmdetection3d).
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/143884065-d1a19fdf-bcc0-4249-84cf-b7a85fa1eb2f.png" width="800"/>
+</div>
+
+## Introduction
+
+PGD, also can be regarded as FCOS3D++, is a simple yet effective monocular 3D detector. It enhances the FCOS3D baseline by involving local geometric constraints and improving instance depth estimation.
+
+We release the code and model for both KITTI and nuScenes benchmark, which is a good supplement for the original FCOS3D baseline (only supported on nuScenes).
+
+For clean implementation, our preliminary release supports base models with proposed local geometric constraints and the probabilistic depth representation. We will involve the geometric graph part in the future.
+
+A more extensive study based on FCOS3D and PGD is on-going. Please stay tuned.
+
+## Results and models
+
+### KITTI
+
+|                             Backbone                              | Lr schd | Mem (GB) | Inf time (fps) | mAP_11 / mAP_40 |                                                                                                                                                                                              Download                                                                                                                                                                                              |
+| :---------------------------------------------------------------: | :-----: | :------: | :------------: | :-------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet101](./pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py) |   4x    |   9.07   |                |  18.33 / 13.23  | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_20211022_102608-8a97533b.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_20211022_102608.log.json) |
+
+Detailed performance on KITTI 3D detection (3D/BEV) is as follows, evaluated by AP11 and AP40 metric:
+
+|            |     Easy      |   Moderate    |     Hard      |
+| ---------- | :-----------: | :-----------: | :-----------: |
+| Car (AP11) | 24.09 / 30.11 | 18.33 / 23.46 | 16.90 / 19.33 |
+| Car (AP40) | 19.27 / 26.60 | 13.23 / 18.23 | 10.65 / 15.00 |
+
+Note: mAP represents Car moderate 3D strict AP11 / AP40 results. Because of the limited data for pedestrians and cyclists, the detection performance for these two classes is usually unstable. Therefore, we only list car detection results here. In addition, AP40 is a more recommended metric for reference due to its much better stability.
+
+### NuScenes
+
+|                                     Backbone                                      | Lr schd | Mem (GB) | mAP  | NDS  |                                                                                                                                                                                                              Download                                                                                                                                                                                                              |
+| :-------------------------------------------------------------------------------: | :-----: | :------: | :--: | :--: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|      [ResNet101 w/ DCN](./pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d.py)      |   1x    |   9.20   | 31.7 | 39.3 |                   [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_20211116_195350-f4b5eec2.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_20211116_195350.log.json)                   |
+| [above w/ finetune](./pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d_finetune.py) |   1x    |   9.20   | 34.6 | 41.1 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune_20211118_093245-fd419681.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune_20211118_093245.log.json) |
+|                                   above w/ tta                                    |   1x    |   9.20   | 35.5 | 41.8 |                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+|      [ResNet101 w/ DCN](./pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d.py)      |   2x    |   9.20   | 33.6 | 40.9 |                   [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_20211112_125314-cb677266.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_20211112_125314.log.json)                   |
+| [above w/ finetune](./pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d_finetune.py) |   2x    |   9.20   | 35.8 | 42.5 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135-5ec7c1cd.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135.log.json) |
+|                                   above w/ tta                                    |   2x    |   9.20   | 36.8 | 43.1 |                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+
+### Waymo
+
+|                                   Backbone                                   | Load Interval |  Camera view  | mAPL | mAP  | mAPH  |                                                                                             Download                                                                                              |
+| :--------------------------------------------------------------------------: | :-----------: | :-----------: | :--: | :--: | :---: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet101 w/ DCN](./pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py) |      3x       | front-of-view | 15.8 | 22.7 | 21.51 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d_20231107_164117.log) |
+|                                 above @ Car                                  |               |               | 36.7 | 51.6 | 51.0  |                                                                                                                                                                                                   |
+|                              above @ Pedestrian                              |               |               | 9.0  | 14.1 | 11.4  |                                                                                                                                                                                                   |
+|                               above @ Cyclist                                |               |               | 1.6  | 2.5  |  2.2  |                                                                                                                                                                                                   |
+| [ResNet101 w/ DCN](./pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py)  |      3x       |  multi-view   | 20.8 | 29.3 | 27.7  |  [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d_20231120_202732.log)  |
+|                                 above @ Car                                  |               |               | 41.2 | 56.1 | 55.2  |                                                                                                                                                                                                   |
+|                              above @ Pedestrian                              |               |               | 20.0 | 29.6 | 25.8  |                                                                                                                                                                                                   |
+|                               above @ Cyclist                                |               |               | 1.4  | 2.2  |  2.0  |                                                                                                                                                                                                   |
+
+**Note**:
+
+Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
+
+## Citation
+
+```latex
+@inproceedings{wang2021pgd,
+    title={{Probabilistic and Geometric Depth: Detecting} Objects in Perspective},
+    author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua},
+    booktitle={Conference on Robot Learning (CoRL) 2021},
+    year={2021}
+}
+# For the baseline version
+@inproceedings{wang2021fcos3d,
+    title={{FCOS3D: Fully} Convolutional One-Stage Monocular 3D Object Detection},
+    author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua},
+    booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
+    year={2021}
+}
+```
diff --git a/mmde/configs/pgd/metafile.yml b/mmde/configs/pgd/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..676f58edb0c4b58db465341e88a6becbf6ace27e
--- /dev/null
+++ b/mmde/configs/pgd/metafile.yml
@@ -0,0 +1,83 @@
+Collections:
+  - Name: PGD
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - SGD
+      Training Resources: 4x TITAN XP
+      Architecture:
+        - PGDHead
+    Paper:
+      URL: https://arxiv.org/abs/2107.14160
+      Title: 'Probabilistic and Geometric Depth: Detecting Objects in Perspective'
+    README: configs/pgd/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/dense_heads/pgd_head.py#17
+      Version: v1.0.0
+
+Models:
+  - Name: pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d
+    Alias:
+       - pgd_kitti
+    In Collection: PGD
+    Config: configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py
+    Metadata:
+      Training Memory (GB): 9.1
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 18.33
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_20211022_102608-8a97533b.pth
+
+  - Name: pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d
+    In Collection: PGD
+    Config: configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d.py
+    Metadata:
+      Training Memory (GB): 9.2
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 31.7
+          NDS: 39.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_20211116_195350-f4b5eec2.pth
+
+  - Name: pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d_finetune
+    In Collection: PGD
+    Config: configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d_finetune.py
+    Metadata:
+      Training Memory (GB): 9.2
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 34.6
+          NDS: 41.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_1x_nus-mono3d_finetune_20211118_093245-fd419681.pth
+
+  - Name: pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d
+    In Collection: PGD
+    Config: configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d.py
+    Metadata:
+      Training Memory (GB): 9.2
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 33.6
+          NDS: 40.9
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_20211112_125314-cb677266.pth
+
+  - Name: pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d_finetune
+    In Collection: PGD
+    Config: configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d_finetune.py
+    Metadata:
+      Training Memory (GB): 9.2
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 35.8
+          NDS: 42.5
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135-5ec7c1cd.pth
diff --git a/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d.py b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7709d3b67e7653ac6e0258fbb13852cf8fb3343
--- /dev/null
+++ b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d.py
@@ -0,0 +1,104 @@
+_base_ = [
+    '../_base_/datasets/nus-mono3d.py', '../_base_/models/pgd.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True)),
+    bbox_head=dict(
+        pred_bbox2d=True,
+        group_reg_dims=(2, 1, 3, 1, 2,
+                        4),  # offset, depth, size, rot, velo, bbox2d
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            (),  # velo
+            (256, )  # bbox2d
+        ),
+        loss_depth=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        bbox_coder=dict(
+            type='PGDBBoxCoder',
+            base_depths=((31.99, 21.12), (37.15, 24.63), (39.69, 23.97),
+                         (40.91, 26.34), (34.16, 20.11), (22.35, 13.70),
+                         (24.28, 16.05), (27.26, 15.50), (20.61, 13.68),
+                         (22.74, 15.01)),
+            base_dims=((4.62, 1.73, 1.96), (6.93, 2.83, 2.51),
+                       (12.56, 3.89, 2.94), (11.22, 3.50, 2.95),
+                       (6.68, 3.21, 2.85), (6.68, 3.21, 2.85),
+                       (2.11, 1.46, 0.78), (0.73, 1.77, 0.67),
+                       (0.41, 1.08, 0.41), (0.50, 0.99, 2.52)),
+            code_size=9)),
+    # set weight 1.0 for base 7 dims (offset, depth, size, rot)
+    # 0.05 for 2-dim velocity and 0.2 for 4-dim 2D distance targets
+    train_cfg=dict(code_weight=[
+        1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05, 0.2, 0.2, 0.2, 0.2
+    ]),
+    test_cfg=dict(nms_pre=1000, nms_thr=0.8, score_thr=0.01, max_per_img=200))
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels',
+            'gt_bboxes_3d', 'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(type='mmdet.Resize', scale_factor=1.0),
+    dict(type='Pack3DDetInputs', keys=['img']),
+]
+train_dataloader = dict(
+    batch_size=2, num_workers=2, dataset=dict(pipeline=train_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.004),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=12, val_interval=4)
+auto_scale_lr = dict(base_batch_size=32)
diff --git a/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d_finetune.py b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d_finetune.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c84f820f91ed09f8971d170e544f034e7fb8ee1
--- /dev/null
+++ b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d_finetune.py
@@ -0,0 +1,9 @@
+_base_ = './pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d.py'
+# model settings
+model = dict(
+    train_cfg=dict(code_weight=[
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05, 0.2, 0.2, 0.2, 0.2
+    ]))
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.002))
+load_from = 'work_dirs/pgd_nus_benchmark_1x/latest.pth'
diff --git a/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d.py b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..b95b0ec3f046a6cb5bd6ffb4f6763d8d4360736b
--- /dev/null
+++ b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d.py
@@ -0,0 +1,20 @@
+_base_ = './pgd_r101-caffe_fpn_head-gn_16xb2-1x_nus-mono3d.py'
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=24)
diff --git a/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d_finetune.py b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d_finetune.py
new file mode 100644
index 0000000000000000000000000000000000000000..a733bc4a1e527c4b436adcf12134f7ea2bd5b832
--- /dev/null
+++ b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d_finetune.py
@@ -0,0 +1,9 @@
+_base_ = './pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d.py'
+# model settings
+model = dict(
+    train_cfg=dict(code_weight=[
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05, 0.2, 0.2, 0.2, 0.2
+    ]))
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.002))
+load_from = 'work_dirs/pgd_nus_benchmark_2x/latest.pth'
diff --git a/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f83134e874139a48d92e1c6108d3bf0f942c737
--- /dev/null
+++ b/mmde/configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py
@@ -0,0 +1,127 @@
+_base_ = [
+    '../_base_/datasets/kitti-mono3d.py', '../_base_/models/pgd.py',
+    '../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(frozen_stages=0),
+    neck=dict(start_level=0, num_outs=4),
+    bbox_head=dict(
+        num_classes=3,
+        bbox_code_size=7,
+        pred_attrs=False,
+        pred_velo=False,
+        pred_bbox2d=True,
+        use_onlyreg_proj=True,
+        strides=(4, 8, 16, 32),
+        regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 1e8)),
+        group_reg_dims=(2, 1, 3, 1, 16,
+                        4),  # offset, depth, size, rot, kpts, bbox2d
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            (256, ),  # kpts
+            (256, )  # bbox2d
+        ),
+        centerness_branch=(256, ),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_centerness=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        use_depth_classifier=True,
+        depth_branch=(256, ),
+        depth_range=(0, 70),
+        depth_unit=10,
+        division='uniform',
+        depth_bins=8,
+        pred_keypoints=True,
+        weight_dim=1,
+        loss_depth=dict(
+            type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
+            loss_weight=1.0),
+        bbox_coder=dict(
+            type='PGDBBoxCoder',
+            base_depths=((28.01, 16.32), ),
+            base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56, 1.6)),
+            code_size=7)),
+    # set weight 1.0 for base 7 dims (offset, depth, size, rot)
+    # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
+    train_cfg=dict(code_weight=[
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
+        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
+    ]),
+    test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='mmdet.Resize', scale=(1242, 375), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(type='mmdet.Resize', scale_factor=1.0),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=3, num_workers=3, dataset=dict(pipeline=train_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.001),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=48,
+        by_epoch=True,
+        milestones=[32, 44],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=48, val_interval=2)
+auto_scale_lr = dict(base_batch_size=12)
diff --git a/mmde/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py b/mmde/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa50e0e04f7cd65bb74cec1572945e9fa8c8e6a9
--- /dev/null
+++ b/mmde/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-fov-mono3d.py
@@ -0,0 +1,112 @@
+_base_ = [
+    '../_base_/datasets/waymoD5-fov-mono3d-3class.py',
+    '../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
+    '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True)),
+    neck=dict(num_outs=3),
+    bbox_head=dict(
+        num_classes=3,
+        bbox_code_size=7,
+        pred_attrs=False,
+        pred_velo=False,
+        pred_bbox2d=True,
+        use_onlyreg_proj=True,
+        strides=(8, 16, 32),
+        regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
+        group_reg_dims=(2, 1, 3, 1, 16,
+                        4),  # offset, depth, size, rot, kpts, bbox2d
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            (256, ),  # kpts
+            (256, )  # bbox2d
+        ),
+        centerness_branch=(256, ),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_centerness=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        use_depth_classifier=True,
+        depth_branch=(256, ),
+        depth_range=(0, 50),
+        depth_unit=10,
+        division='uniform',
+        depth_bins=6,
+        pred_keypoints=True,
+        weight_dim=1,
+        loss_depth=dict(
+            type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
+            loss_weight=1.0),
+        loss_bbox2d=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
+        loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
+        bbox_coder=dict(
+            type='PGDBBoxCoder',
+            base_depths=((41.01, 18.44), ),
+            base_dims=(
+                (4.73, 1.77, 2.08),  # Car
+                (0.91, 1.74, 0.84),  # Pedestrian
+                (1.81, 1.77, 0.84),  # Cyclist
+            ),
+            code_size=7)),
+    # set weight 1.0 for base 7 dims (offset, depth, size, rot)
+    # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
+    train_cfg=dict(code_weight=[
+        1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
+        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
+    ]),
+    test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(
+        type='SGD',
+        lr=0.008,
+    ),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+auto_scale_lr = dict(base_batch_size=48)
diff --git a/mmde/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-mv-mono3d.py b/mmde/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-mv-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..034f8660063cafaa9e050b0da6cb7d3a6ee3f9fc
--- /dev/null
+++ b/mmde/configs/pgd/pgd_r101_fpn-head_dcn_16xb3_waymoD5-mv-mono3d.py
@@ -0,0 +1,112 @@
+_base_ = [
+    '../_base_/datasets/waymoD5-mv-mono3d-3class.py',
+    '../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
+    '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True)),
+    neck=dict(num_outs=3),
+    bbox_head=dict(
+        num_classes=3,
+        bbox_code_size=7,
+        pred_attrs=False,
+        pred_velo=False,
+        pred_bbox2d=True,
+        use_onlyreg_proj=True,
+        strides=(8, 16, 32),
+        regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
+        group_reg_dims=(2, 1, 3, 1, 16,
+                        4),  # offset, depth, size, rot, kpts, bbox2d
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            (256, ),  # kpts
+            (256, )  # bbox2d
+        ),
+        centerness_branch=(256, ),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_centerness=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        use_depth_classifier=True,
+        depth_branch=(256, ),
+        depth_range=(0, 50),
+        depth_unit=10,
+        division='uniform',
+        depth_bins=6,
+        pred_keypoints=True,
+        weight_dim=1,
+        loss_depth=dict(
+            type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
+            loss_weight=1.0),
+        loss_bbox2d=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
+        loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
+        bbox_coder=dict(
+            type='PGDBBoxCoder',
+            base_depths=((41.01, 18.44), ),
+            base_dims=(
+                (4.73, 1.77, 2.08),
+                (0.91, 1.74, 0.84),
+                (1.81, 1.77, 0.84),
+            ),
+            code_size=7)),
+    # set weight 1.0 for base 7 dims (offset, depth, size, rot)
+    # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
+    train_cfg=dict(code_weight=[
+        1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
+        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
+    ]),
+    test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(
+        type='SGD',
+        lr=0.008,
+    ),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+auto_scale_lr = dict(base_batch_size=48)
diff --git a/mmde/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py b/mmde/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4c61937490e382636227913478de341c041261a
--- /dev/null
+++ b/mmde/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py
@@ -0,0 +1,112 @@
+_base_ = [
+    '../_base_/datasets/waymoD3-fov-mono3d-3class.py',
+    '../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
+    '../_base_/default_runtime.py'
+]
+# load_from = '../Depth-from-Motion/checkpoints/pgd_init.pth'
+# model settings
+model = dict(
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True)),
+    neck=dict(num_outs=3),
+    bbox_head=dict(
+        num_classes=3,
+        bbox_code_size=7,
+        pred_attrs=False,
+        pred_velo=False,
+        pred_bbox2d=True,
+        use_onlyreg_proj=True,
+        strides=(8, 16, 32),
+        regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
+        group_reg_dims=(2, 1, 3, 1, 16,
+                        4),  # offset, depth, size, rot, kpts, bbox2d
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            (256, ),  # kpts
+            (256, )  # bbox2d
+        ),
+        centerness_branch=(256, ),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_centerness=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        use_depth_classifier=True,
+        depth_branch=(256, ),
+        depth_range=(0, 50),
+        depth_unit=10,
+        division='uniform',
+        depth_bins=6,
+        pred_keypoints=True,
+        weight_dim=1,
+        loss_depth=dict(
+            type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
+            loss_weight=1.0),
+        loss_bbox2d=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
+        loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
+        bbox_coder=dict(
+            type='PGDBBoxCoder',
+            base_depths=((41.01, 18.44), ),
+            base_dims=(
+                (0.91, 1.74, 0.84),  # Pedestrian
+                (1.81, 1.77, 0.84),  # Cyclist
+                (4.73, 1.77, 2.08)),  # Car
+            code_size=7)),
+    # set weight 1.0 for base 7 dims (offset, depth, size, rot)
+    # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
+    train_cfg=dict(code_weight=[
+        1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
+        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
+    ]),
+    test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(
+        type='SGD',
+        lr=0.008,
+    ),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+auto_scale_lr = dict(enable=False, base_batch_size=48)
diff --git a/mmde/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py b/mmde/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..2247aa44d86b9b0362720613fa35d6d2f0ef14d1
--- /dev/null
+++ b/mmde/configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py
@@ -0,0 +1,111 @@
+_base_ = [
+    '../_base_/datasets/waymoD3-mv-mono3d-3class.py',
+    '../_base_/models/pgd.py', '../_base_/schedules/mmdet-schedule-1x.py',
+    '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True)),
+    neck=dict(num_outs=3),
+    bbox_head=dict(
+        num_classes=3,
+        bbox_code_size=7,
+        pred_attrs=False,
+        pred_velo=False,
+        pred_bbox2d=True,
+        use_onlyreg_proj=True,
+        strides=(8, 16, 32),
+        regress_ranges=((-1, 128), (128, 256), (256, 1e8)),
+        group_reg_dims=(2, 1, 3, 1, 16,
+                        4),  # offset, depth, size, rot, kpts, bbox2d
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            (256, ),  # kpts
+            (256, )  # bbox2d
+        ),
+        centerness_branch=(256, ),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_centerness=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        use_depth_classifier=True,
+        depth_branch=(256, ),
+        depth_range=(0, 50),
+        depth_unit=10,
+        division='uniform',
+        depth_bins=6,
+        pred_keypoints=True,
+        weight_dim=1,
+        loss_depth=dict(
+            type='UncertainSmoothL1Loss', alpha=1.0, beta=3.0,
+            loss_weight=1.0),
+        loss_bbox2d=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.0),
+        loss_consistency=dict(type='mmdet.GIoULoss', loss_weight=0.0),
+        bbox_coder=dict(
+            type='PGDBBoxCoder',
+            base_depths=((41.01, 18.44), ),
+            base_dims=(
+                (0.91, 1.74, 0.84),  # Pedestrian
+                (1.81, 1.77, 0.84),  # Cyclist
+                (4.73, 1.77, 2.08)),  # Car
+            code_size=7)),
+    # set weight 1.0 for base 7 dims (offset, depth, size, rot)
+    # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
+    train_cfg=dict(code_weight=[
+        1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
+        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0
+    ]),
+    test_cfg=dict(nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(
+        type='SGD',
+        lr=0.008,
+    ),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=24)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+auto_scale_lr = dict(enable=False, base_batch_size=48)
diff --git a/mmde/configs/point_rcnn/README.md b/mmde/configs/point_rcnn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..03be3caf1b477ff02b61422a9755aa8108e143d2
--- /dev/null
+++ b/mmde/configs/point_rcnn/README.md
@@ -0,0 +1,47 @@
+# PointRCNN: 3D Object Proposal Generation and Detection from Point Cloud
+
+> [PointRCNN: 3D Object Proposal Generation and Detection from Point Cloud](https://arxiv.org/abs/1812.04244)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+In this paper, we propose PointRCNN for 3D object detection from raw point cloud. The whole framework is composed of two stages: stage-1 for the bottom-up 3D proposal generation and stage-2 for refining proposals in the canonical coordinates to obtain the final detection results. Instead of generating proposals from RGB image or projecting point cloud to bird's view or voxels as previous methods do, our stage-1 sub-network directly generates a small number of high-quality 3D proposals from point cloud in a bottom-up manner via segmenting the point cloud of the whole scene into foreground points and background. The stage-2 sub-network transforms the pooled points of each proposal to canonical coordinates to learn better local spatial features, which is combined with global semantic features of each point learned in stage-1 for accurate box refinement and confidence prediction. Extensive experiments on the 3D detection benchmark of KITTI dataset show that our proposed architecture outperforms state-of-the-art methods with remarkable margins by using only point cloud as input.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/144959105-271038a2-4ae1-4cdb-b6a8-68c14daf83b0.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement PointRCNN and provide the result with checkpoints on KITTI dataset.
+
+## Results and models
+
+### KITTI
+
+|                      Backbone                      |  Class  |  Lr schd   | Mem (GB) | Inf time (fps) |  mAP  |                                                                                                                                   Download                                                                                                                                    |
+| :------------------------------------------------: | :-----: | :--------: | :------: | :------------: | :---: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [PointNet++](./point-rcnn_8xb2_kitti-3d-3class.py) | 3 Class | cyclic 40e |   4.6    |                | 70.83 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/point_rcnn/point_rcnn_2x8_kitti-3d-3classes_20211208_151344.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/point_rcnn/point_rcnn_2x8_kitti-3d-3classes_20211208_151344.log.json) |
+
+Note: mAP represents AP11 results on 3 Class under the moderate setting.
+
+Detailed performance on KITTI 3D detection (3D) is as follows, evaluated by AP11 metric:
+
+|            | Easy  | Moderate | Hard  |
+| ---------- | :---: | :------: | :---: |
+| Car        | 89.13 |  78.72   | 78.24 |
+| Pedestrian | 65.81 |  59.57   | 52.75 |
+| Cyclist    | 93.51 |  74.19   | 70.73 |
+
+## Citation
+
+```latex
+@inproceedings{Shi_2019_CVPR,
+    title = {PointRCNN: 3D Object Proposal Generation and Detection from Point Cloud},
+    author = {Shi, Shaoshuai and Wang, Xiaogang and Li, Hongsheng},
+    booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+    month = {June},
+    year = {2019}
+}
+```
diff --git a/mmde/configs/point_rcnn/metafile.yml b/mmde/configs/point_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2dcdc3a9d2126d5a99b6df6149330fb88d381bb3
--- /dev/null
+++ b/mmde/configs/point_rcnn/metafile.yml
@@ -0,0 +1,29 @@
+Collections:
+  - Name: PointRCNN
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x Titan XP GPUs
+      Architecture:
+        - PointNet++
+    Paper:
+      URL: https://arxiv.org/abs/1812.04244
+      Title: 'PointRCNN: 3D Object Proposal Generation and Detection from Point Cloud'
+    README: configs/point_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/detectors/point_rcnn.py#L8
+      Version: v1.0.0
+
+Models:
+  - Name: point-rcnn_8xb2_kitti-3d-3class
+    In Collection: PointRCNN
+    Config: configs/point_rcnn/point-rcnn_8xb2_kitti-3d-3class.py
+    Metadata:
+      Training Memory (GB): 4.6
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 70.83
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/point_rcnn/point_rcnn_2x8_kitti-3d-3classes_20211208_151344.pth
diff --git a/mmde/configs/point_rcnn/point-rcnn_8xb2_kitti-3d-3class.py b/mmde/configs/point_rcnn/point-rcnn_8xb2_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..1826198cd76ac524d15233e6588c3ee5d1bf7f7e
--- /dev/null
+++ b/mmde/configs/point_rcnn/point-rcnn_8xb2_kitti-3d-3class.py
@@ -0,0 +1,145 @@
+_base_ = [
+    '../_base_/datasets/kitti-3d-car.py', '../_base_/models/point_rcnn.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/cyclic-40e.py'
+]
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(classes=class_names)
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    sample_groups=dict(Car=20, Pedestrian=15, Cyclist=15),
+    classes=class_names,
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointSample', num_points=16384, sample_range=40.0),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(type='PointSample', num_points=16384, sample_range=40.0)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(pipeline=train_pipeline, metainfo=metainfo)))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
+
+lr = 0.001  # max learning rate
+optim_wrapper = dict(optimizer=dict(lr=lr, betas=(0.95, 0.85)))
+train_cfg = dict(by_epoch=True, max_epochs=80, val_interval=2)
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
+param_scheduler = [
+    # learning rate scheduler
+    # During the first 35 epochs, learning rate increases from 0 to lr * 10
+    # during the next 45 epochs, learning rate decreases from lr * 10 to
+    # lr * 1e-4
+    dict(
+        type='CosineAnnealingLR',
+        T_max=35,
+        eta_min=lr * 10,
+        begin=0,
+        end=35,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=45,
+        eta_min=lr * 1e-4,
+        begin=35,
+        end=80,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 35 epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next 45 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=35,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=35,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=45,
+        eta_min=1,
+        begin=35,
+        end=80,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
diff --git a/mmde/configs/pointnet2/README.md b/mmde/configs/pointnet2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..955f2fbc50fd688dcce51e3b47a594f90e065c6e
--- /dev/null
+++ b/mmde/configs/pointnet2/README.md
@@ -0,0 +1,72 @@
+# PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space
+
+> [PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space](https://arxiv.org/abs/1706.02413)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Few prior works study deep learning on point sets. PointNet by Qi et al. is a pioneer in this direction. However, by design PointNet does not capture local structures induced by the metric space points live in, limiting its ability to recognize fine-grained patterns and generalizability to complex scenes. In this work, we introduce a hierarchical neural network that applies PointNet recursively on a nested partitioning of the input point set. By exploiting metric space distances, our network is able to learn local features with increasing contextual scales. With further observation that point sets are usually sampled with varying densities, which results in greatly decreased performance for networks trained on uniform densities, we propose novel set learning layers to adaptively combine features from multiple scales. Experiments show that our network called PointNet++ is able to learn deep point set features efficiently and robustly. In particular, results significantly better than state-of-the-art have been obtained on challenging benchmarks of 3D point clouds.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/143885530-ae53ed38-8132-4bb7-85a7-d2577de7de3f.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement PointNet++ and provide the result and checkpoints on ScanNet and S3DIS datasets.
+
+**Notice**: The original PointNet++ paper used step learning rate schedule. We discovered that cosine schedule achieves much better results and adopt it in our implementations. We also use a larger `weight_decay` factor because we find it consistently improves the performance.
+
+## Results and models
+
+### ScanNet
+
+|                                    Method                                     |   Input   |   Lr schd   | Mem (GB) | Inf time (fps) | mIoU (Val set) | mIoU (Test set) | Download                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| :---------------------------------------------------------------------------: | :-------: | :---------: | :------: | :------------: | :------------: | :-------------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [PointNet++ (SSG)](./pointnet2_ssg_2xb16-cosine-200e_scannet-seg-xyz-only.py) |    XYZ    | cosine 200e |   1.9    |                |     53.91      |                 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_xyz-only_16x2_cosine_200e_scannet_seg-3d-20class/pointnet2_ssg_xyz-only_16x2_cosine_200e_scannet_seg-3d-20class_20210514_143628-4e341a48.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_xyz-only_16x2_cosine_200e_scannet_seg-3d-20class/pointnet2_ssg_xyz-only_16x2_cosine_200e_scannet_seg-3d-20class_20210514_143628.log.json) |
+|     [PointNet++ (SSG)](./pointnet2_ssg_2xb16-cosine-200e_scannet-seg.py)      | XYZ+Color | cosine 200e |   1.9    |                |     54.44      |                 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class_20210514_143644-ee73704a.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class_20210514_143644.log.json)                                     |
+| [PointNet++ (MSG)](./pointnet2_msg_2xb16-cosine-250e_scannet-seg-xyz-only.py) |    XYZ    | cosine 250e |   2.4    |                |     54.26      |                 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_msg_xyz-only_16x2_cosine_250e_scannet_seg-3d-20class/pointnet2_msg_xyz-only_16x2_cosine_250e_scannet_seg-3d-20class_20210514_143838-b4a3cf89.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_msg_xyz-only_16x2_cosine_250e_scannet_seg-3d-20class/pointnet2_msg_xyz-only_16x2_cosine_250e_scannet_seg-3d-20class_20210514_143838.log.json) |
+|     [PointNet++ (MSG)](./pointnet2_msg_2xb16-cosine-250e_scannet-seg.py)      | XYZ+Color | cosine 250e |   2.4    |                |     55.05      |                 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_msg_16x2_cosine_250e_scannet_seg-3d-20class/pointnet2_msg_16x2_cosine_250e_scannet_seg-3d-20class_20210514_144009-24477ab1.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_msg_16x2_cosine_250e_scannet_seg-3d-20class/pointnet2_msg_16x2_cosine_250e_scannet_seg-3d-20class_20210514_144009.log.json)                                     |
+
+**Notes:**
+
+- The original PointNet++ paper conducted experiments on the ScanNet V1 dataset, while later point cloud segmentor papers often used ScanNet V2. Following common practice, we report results on the ScanNet V2 dataset.
+
+- Since ScanNet dataset doesn't provide ground-truth labels for the test set, users can only evaluate test set performance by submitting to its online benchmark [website](http://kaldir.vc.in.tum.de/scannet_benchmark/). However, users are only allowed to submit once every two weeks. Therefore, we currently report val set mIoU. Test set performance may be added in the future.
+
+- To generate submission file for ScanNet online benchmark, you need to modify the ScanNet dataset's [config](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/_base_/datasets/scannet-seg.py#L126). Change `ann_file=data_root + 'scannet_infos_val.pkl'` to `ann_file=data_root + 'scannet_infos_test.pkl'`, and then simply run:
+
+  ```shell
+  python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --format-only --options 'txt_prefix=exps/pointnet2_scannet_results'
+  ```
+
+  This will save the prediction results as `txt` files in `exps/pointnet2_scannet_results/`. Then, go to this folder and zip all files into `pn2_scannet.zip`. Now you can submit it to the online benchmark and wait for the test set result. More instructions can be found at their official [website](http://kaldir.vc.in.tum.de/scannet_benchmark/documentation#submission-policy).
+
+### S3DIS
+
+|                              Method                               | Split  |  Lr schd   | Mem (GB) | Inf time (fps) | mIoU (Val set) |                                                                                                                                                                                                            Download                                                                                                                                                                                                            |
+| :---------------------------------------------------------------: | :----: | :--------: | :------: | :------------: | :------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [PointNet++ (SSG)](./pointnet2_ssg_2xb16-cosine-50e_s3dis-seg.py) | Area_5 | cosine 50e |   3.6    |                |     56.93      | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_16x2_cosine_50e_s3dis_seg-3d-13class/pointnet2_ssg_16x2_cosine_50e_s3dis_seg-3d-13class_20210514_144205-995d0119.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_16x2_cosine_50e_s3dis_seg-3d-13class/pointnet2_ssg_16x2_cosine_50e_s3dis_seg-3d-13class_20210514_144205.log.json) |
+| [PointNet++ (MSG)](./pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py) | Area_5 | cosine 80e |   3.6    |                |     58.04      | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_msg_16x2_cosine_80e_s3dis_seg-3d-13class/pointnet2_msg_16x2_cosine_80e_s3dis_seg-3d-13class_20210514_144307-b2059817.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_msg_16x2_cosine_80e_s3dis_seg-3d-13class/pointnet2_msg_16x2_cosine_80e_s3dis_seg-3d-13class_20210514_144307.log.json) |
+
+**Notes:**
+
+- We use XYZ+Color+Normalized_XYZ as input in all the experiments on S3DIS datasets.
+- `Area_5` Split means training the model on Area_1, 2, 3, 4, 6 and testing on Area_5.
+
+## Indeterminism
+
+Since PointNet++ testing adopts sliding patch inference which involves random point sampling, and the test script uses fixed random seeds while the random seeds of validation in training are not fixed, the test results may be slightly different from the results reported above.
+
+## Citation
+
+```latex
+@inproceedings{qi2017pointnet++,
+  title={PointNet++ deep hierarchical feature learning on point sets in a metric space},
+  author={Qi, Charles R and Yi, Li and Su, Hao and Guibas, Leonidas J},
+  booktitle={Proceedings of the 31st International Conference on Neural Information Processing Systems},
+  pages={5105--5114},
+  year={2017}
+}
+```
diff --git a/mmde/configs/pointnet2/metafile.yml b/mmde/configs/pointnet2/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..cdceb045ab4d35e83af6197c2b18a556ab9f541f
--- /dev/null
+++ b/mmde/configs/pointnet2/metafile.yml
@@ -0,0 +1,95 @@
+Collections:
+  - Name: PointNet++
+    Metadata:
+      Training Techniques:
+        - Adam
+      Training Resources: 2x Titan XP GPUs
+      Architecture:
+        - PointNet++
+    Paper:
+      URL: https://arxiv.org/abs/1706.02413
+      Title: 'PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space'
+    README: configs/pointnet2/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/backbones/pointnet2_sa_ssg.py#L12
+      Version: v0.14.0
+
+Models:
+  - Name: pointnet2_ssg_2xb16-cosine-200e_scannet-seg-xyz-only
+    In Collection: PointNet++
+    Config: configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg-xyz-only.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 1.9
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: ScanNet
+        Metrics:
+          mIoU: 53.91
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_xyz-only_16x2_cosine_200e_scannet_seg-3d-20class/pointnet2_ssg_xyz-only_16x2_cosine_200e_scannet_seg-3d-20class_20210514_143628-4e341a48.pth
+
+  - Name: pointnet2_ssg_2xb16-cosine-200e_scannet-seg
+    In Collection: PointNet++
+    Config: configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 1.9
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: ScanNet
+        Metrics:
+          mIoU: 54.44
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class_20210514_143644-ee73704a.pth
+
+  - Name: pointnet2_msg_2xb16-cosine-250e_scannet-seg-xyz-only
+    In Collection: PointNet++
+    Config: configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg-xyz-only.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 2.4
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: ScanNet
+        Metrics:
+          mIoU: 54.26
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_msg_xyz-only_16x2_cosine_250e_scannet_seg-3d-20class/pointnet2_msg_xyz-only_16x2_cosine_250e_scannet_seg-3d-20class_20210514_143838-b4a3cf89.pth
+
+  - Name: pointnet2_msg_2xb16-cosine-250e_scannet-seg
+    In Collection: PointNet++
+    Config: configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 2.4
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: ScanNet
+        Metrics:
+          mIoU: 55.05
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_msg_16x2_cosine_250e_scannet_seg-3d-20class/pointnet2_msg_16x2_cosine_250e_scannet_seg-3d-20class_20210514_144009-24477ab1.pth
+
+  - Name: pointnet2_ssg_2xb16-cosine-50e_s3dis-seg
+    Alias: pointnet2-ssg_s3dis-seg
+    In Collection: PointNet++
+    Config: configs/pointnet2/pointnet2_ssg_2xb16-cosine-50e_s3dis-seg.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 3.6
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS
+        Metrics:
+          mIoU: 56.93
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_16x2_cosine_50e_s3dis_seg-3d-13class/pointnet2_ssg_16x2_cosine_50e_s3dis_seg-3d-13class_20210514_144205-995d0119.pth
+
+  - Name: pointnet2_msg_2xb16-cosine-80e_s3dis-seg
+    In Collection: PointNet++
+    Config: configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py
+    Metadata:
+      Training Data: S3DIS
+      Training Memory (GB): 3.6
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: S3DIS
+        Metrics:
+          mIoU: 58.04
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_msg_16x2_cosine_80e_s3dis_seg-3d-13class/pointnet2_msg_16x2_cosine_80e_s3dis_seg-3d-13class_20210514_144307-b2059817.pth
diff --git a/mmde/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg-xyz-only.py b/mmde/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg-xyz-only.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0b793f3167d80171bb1815e195b9b9ea9d82215
--- /dev/null
+++ b/mmde/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg-xyz-only.py
@@ -0,0 +1,111 @@
+_base_ = [
+    '../_base_/datasets/scannet-seg.py', '../_base_/models/pointnet2_msg.py',
+    '../_base_/schedules/seg-cosine-200e.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(in_channels=3),  # only [xyz]
+    decode_head=dict(
+        num_classes=20,
+        ignore_index=20,
+        # `class_weight` is generated in data pre-processing, saved in
+        # `data/scannet/seg_info/train_label_weight.npy`
+        # you can copy paste the values here, or input the file path as
+        # `class_weight=data/scannet/seg_info/train_label_weight.npy`
+        loss_decode=dict(class_weight=[
+            2.389689, 2.7215734, 4.5944676, 4.8543367, 4.096086, 4.907941,
+            4.690836, 4.512031, 4.623311, 4.9242644, 5.358117, 5.360071,
+            5.019636, 4.967126, 5.3502126, 5.4023647, 5.4027233, 5.4169416,
+            5.3954206, 4.6971426
+        ])),
+    test_cfg=dict(
+        num_points=8192,
+        block_size=1.5,
+        sample_rate=0.5,
+        use_normalized_coord=False,
+        batch_size=24))
+
+# dataset settings
+# in this setting, we only use xyz as network input
+# so we need to re-write all the data pipeline
+class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
+               'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
+               'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink',
+               'bathtub', 'otherfurniture')
+num_points = 8192
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=False,
+        load_dim=6,
+        use_dim=[0, 1, 2],  # only load xyz coordinates
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.5,
+        ignore_index=len(class_names),
+        use_normalized_coord=False,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=False,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(
+        # a wrapper in order to successfully call test function
+        # actually we don't perform test-time-aug
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.0,
+                flip_ratio_bev_vertical=0.0),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(batch_size=16, dataset=dict(pipeline=train_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+val_dataloader = test_dataloader
+
+# runtime settings
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=5))
+
+# PointNet2-MSG needs longer training time than PointNet2-SSG
+train_cfg = dict(by_epoch=True, max_epochs=250, val_interval=5)
diff --git a/mmde/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py b/mmde/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..83c003b026905cb8a25ce8ccdf03a4349524eb5a
--- /dev/null
+++ b/mmde/configs/pointnet2/pointnet2_msg_2xb16-cosine-250e_scannet-seg.py
@@ -0,0 +1,35 @@
+_base_ = [
+    '../_base_/datasets/scannet-seg.py', '../_base_/models/pointnet2_msg.py',
+    '../_base_/schedules/seg-cosine-200e.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    decode_head=dict(
+        num_classes=20,
+        ignore_index=20,
+        # `class_weight` is generated in data pre-processing, saved in
+        # `data/scannet/seg_info/train_label_weight.npy`
+        # you can copy paste the values here, or input the file path as
+        # `class_weight=data/scannet/seg_info/train_label_weight.npy`
+        loss_decode=dict(class_weight=[
+            2.389689, 2.7215734, 4.5944676, 4.8543367, 4.096086, 4.907941,
+            4.690836, 4.512031, 4.623311, 4.9242644, 5.358117, 5.360071,
+            5.019636, 4.967126, 5.3502126, 5.4023647, 5.4027233, 5.4169416,
+            5.3954206, 4.6971426
+        ])),
+    test_cfg=dict(
+        num_points=8192,
+        block_size=1.5,
+        sample_rate=0.5,
+        use_normalized_coord=False,
+        batch_size=24))
+
+# data settings
+train_dataloader = dict(batch_size=16)
+
+# runtime settings
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=5))
+
+# PointNet2-MSG needs longer training time than PointNet2-SSG
+train_cfg = dict(by_epoch=True, max_epochs=250, val_interval=5)
diff --git a/mmde/configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py b/mmde/configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..0913df389ac6c2ac9d783d9134e92df5560bdcdf
--- /dev/null
+++ b/mmde/configs/pointnet2/pointnet2_msg_2xb16-cosine-80e_s3dis-seg.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../_base_/datasets/s3dis-seg.py', '../_base_/models/pointnet2_msg.py',
+    '../_base_/schedules/seg-cosine-50e.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(in_channels=9),  # [xyz, rgb, normalized_xyz]
+    decode_head=dict(
+        num_classes=13, ignore_index=13,
+        loss_decode=dict(class_weight=None)),  # S3DIS doesn't use class_weight
+    test_cfg=dict(
+        num_points=4096,
+        block_size=1.0,
+        sample_rate=0.5,
+        use_normalized_coord=True,
+        batch_size=24))
+
+# data settings
+train_dataloader = dict(batch_size=16)
+
+# runtime settings
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=2))
+
+# PointNet2-MSG needs longer training time than PointNet2-SSG
+train_cfg = dict(by_epoch=True, max_epochs=80, val_interval=2)
diff --git a/mmde/configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg-xyz-only.py b/mmde/configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg-xyz-only.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8d4421f187feea4cd52b370eb8ef315797d344d
--- /dev/null
+++ b/mmde/configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg-xyz-only.py
@@ -0,0 +1,109 @@
+_base_ = [
+    '../_base_/datasets/scannet-seg.py', '../_base_/models/pointnet2_ssg.py',
+    '../_base_/schedules/seg-cosine-200e.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(in_channels=3),  # only [xyz]
+    decode_head=dict(
+        num_classes=20,
+        ignore_index=20,
+        # `class_weight` is generated in data pre-processing, saved in
+        # `data/scannet/seg_info/train_label_weight.npy`
+        # you can copy paste the values here, or input the file path as
+        # `class_weight=data/scannet/seg_info/train_label_weight.npy`
+        loss_decode=dict(class_weight=[
+            2.389689, 2.7215734, 4.5944676, 4.8543367, 4.096086, 4.907941,
+            4.690836, 4.512031, 4.623311, 4.9242644, 5.358117, 5.360071,
+            5.019636, 4.967126, 5.3502126, 5.4023647, 5.4027233, 5.4169416,
+            5.3954206, 4.6971426
+        ])),
+    test_cfg=dict(
+        num_points=8192,
+        block_size=1.5,
+        sample_rate=0.5,
+        use_normalized_coord=False,
+        batch_size=24))
+
+# dataset settings
+# in this setting, we only use xyz as network input
+# so we need to re-write all the data pipeline
+class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
+               'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
+               'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink',
+               'bathtub', 'otherfurniture')
+num_points = 8192
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=False,
+        load_dim=6,
+        use_dim=[0, 1, 2],  # only load xyz coordinates
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.5,
+        ignore_index=len(class_names),
+        use_normalized_coord=False,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=False,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(
+        # a wrapper in order to successfully call test function
+        # actually we don't perform test-time-aug
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.0,
+                flip_ratio_bev_vertical=0.0),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(batch_size=16, dataset=dict(pipeline=train_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+val_dataloader = test_dataloader
+
+# runtime settings
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=5))
+train_cfg = dict(val_interval=5)
diff --git a/mmde/configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg.py b/mmde/configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..3900a87ac20b85034b4f029faeb79839f1daa6c4
--- /dev/null
+++ b/mmde/configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg.py
@@ -0,0 +1,33 @@
+_base_ = [
+    '../_base_/datasets/scannet-seg.py', '../_base_/models/pointnet2_ssg.py',
+    '../_base_/schedules/seg-cosine-200e.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    decode_head=dict(
+        num_classes=20,
+        ignore_index=20,
+        # `class_weight` is generated in data pre-processing, saved in
+        # `data/scannet/seg_info/train_label_weight.npy`
+        # you can copy paste the values here, or input the file path as
+        # `class_weight=data/scannet/seg_info/train_label_weight.npy`
+        loss_decode=dict(class_weight=[
+            2.389689, 2.7215734, 4.5944676, 4.8543367, 4.096086, 4.907941,
+            4.690836, 4.512031, 4.623311, 4.9242644, 5.358117, 5.360071,
+            5.019636, 4.967126, 5.3502126, 5.4023647, 5.4027233, 5.4169416,
+            5.3954206, 4.6971426
+        ])),
+    test_cfg=dict(
+        num_points=8192,
+        block_size=1.5,
+        sample_rate=0.5,
+        use_normalized_coord=False,
+        batch_size=24))
+
+# data settings
+train_dataloader = dict(batch_size=16)
+
+# runtime settings
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=5))
+train_cfg = dict(val_interval=5)
diff --git a/mmde/configs/pointnet2/pointnet2_ssg_2xb16-cosine-50e_s3dis-seg.py b/mmde/configs/pointnet2/pointnet2_ssg_2xb16-cosine-50e_s3dis-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d4490d651036d800adc06c722955627b626f429
--- /dev/null
+++ b/mmde/configs/pointnet2/pointnet2_ssg_2xb16-cosine-50e_s3dis-seg.py
@@ -0,0 +1,24 @@
+_base_ = [
+    '../_base_/datasets/s3dis-seg.py', '../_base_/models/pointnet2_ssg.py',
+    '../_base_/schedules/seg-cosine-50e.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    backbone=dict(in_channels=9),  # [xyz, rgb, normalized_xyz]
+    decode_head=dict(
+        num_classes=13, ignore_index=13,
+        loss_decode=dict(class_weight=None)),  # S3DIS doesn't use class_weight
+    test_cfg=dict(
+        num_points=4096,
+        block_size=1.0,
+        sample_rate=0.5,
+        use_normalized_coord=True,
+        batch_size=24))
+
+# data settings
+train_dataloader = dict(batch_size=16)
+
+# runtime settings
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=2))
+train_cfg = dict(val_interval=2)
diff --git a/mmde/configs/pointpillars/README.md b/mmde/configs/pointpillars/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a2de0d4db38fc601d534f5198a74b8873e8a3ee1
--- /dev/null
+++ b/mmde/configs/pointpillars/README.md
@@ -0,0 +1,78 @@
+# PointPillars: Fast Encoders for Object Detection from Point Clouds
+
+> [PointPillars: Fast Encoders for Object Detection from Point Clouds](https://arxiv.org/abs/1812.05784)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Object detection in point clouds is an important aspect of many robotics applications such as autonomous driving. In this paper we consider the problem of encoding a point cloud into a format appropriate for a downstream detection pipeline. Recent literature suggests two types of encoders; fixed encoders tend to be fast but sacrifice accuracy, while encoders that are learned from data are more accurate, but slower. In this work we propose PointPillars, a novel encoder which utilizes PointNets to learn a representation of point clouds organized in vertical columns (pillars). While the encoded features can be used with any standard 2D convolutional detection architecture, we further propose a lean downstream network. Extensive experimentation shows that PointPillars outperforms previous encoders with respect to both speed and accuracy by a large margin. Despite only using lidar, our full detection pipeline significantly outperforms the state of the art, even among fusion methods, with respect to both the 3D and bird's eye view KITTI benchmarks. This detection performance is achieved while running at 62 Hz: a 2 - 4 fold runtime improvement. A faster version of our method matches the state of the art at 105 Hz. These benchmarks suggest that PointPillars is an appropriate encoding for object detection in point clouds.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/143885905-aab6ffcf-7727-495e-90ca-edb8dd5e324b.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement PointPillars and provide the results and checkpoints on KITTI, nuScenes, Lyft and Waymo datasets.
+
+## Results and models
+
+### KITTI
+
+|                            Backbone                             |  Class  |   Lr schd   | Mem (GB) | Inf time (fps) |  AP   |                                                                                                                                                                                                         Download                                                                                                                                                                                                         |
+| :-------------------------------------------------------------: | :-----: | :---------: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  [SECFPN](./pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py)   |   Car   | cyclic 160e |   5.4    |                | 77.6  |       [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606.log.json)       |
+| [SECFPN](./pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py) | 3 Class | cyclic 160e |   5.5    |                | 64.07 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20220301_150306-37dc2420.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20220301_150306.log.json) |
+
+### nuScenes
+
+|                                Backbone                                 | Lr schd | Mem (GB) | Inf time (fps) |  mAP  |  NDS  |                                                                                                                                                                                                     Download                                                                                                                                                                                                     |
+| :---------------------------------------------------------------------: | :-----: | :------: | :------------: | :---: | :---: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|      [SECFPN](./pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py)       |   2x    |   16.4   |                | 34.33 | 49.1  |   [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20210826_225857-f19d00a3.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20210826_225857.log.json)   |
+| [SECFPN (FP16)](./pointpillars_hv_secfpn_sbn-all_8xb2-amp-2x_nus-3d.py) |   2x    |   8.37   |                | 35.19 | 50.27 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626.log.json) |
+|         [FPN](./pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py)          |   2x    |   16.3   |                | 39.7  | 53.2  |         [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20210826_104936-fca299c1.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20210826_104936.log.json)         |
+|    [FPN (FP16)](./pointpillars_hv_fpn_sbn-all_8xb2-amp-2x_nus-3d.py)    |   2x    |   8.40   |                | 39.26 | 53.26 |       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719.log.json)       |
+
+### Lyft
+
+|                           Backbone                            | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score |                                                                                                                                                                                                     Download                                                                                                                                                                                                     |
+| :-----------------------------------------------------------: | :-----: | :------: | :------------: | :-----------: | :----------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [SECFPN](./pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d.py) |   2x    |   12.2   |                |     13.8      |     14.1     | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d_20210829_100455-82b81c39.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d_20210829_100455.log.json) |
+|    [FPN](./pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py)    |   2x    |   9.2    |                |     14.8      |     15.0     |       [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210822_095429-0b3d6196.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210822_095429.log.json)       |
+
+### Waymo
+
+|                                 Backbone                                 | Load Interval |  Class  | Lr schd | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** |                                                                                                                                                                                                                   Download                                                                                                                                                                                                                   |
+| :----------------------------------------------------------------------: | :-----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----: | :---------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  [SECFPN](./pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-car.py)   |       5       |   Car   |   2x    |   7.76   |                |  70.2  |  69.6   |  62.6  |    62.1     |       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car_20200901_204315-302fc3e7.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car_20200901_204315.log.json)       |
+| [SECFPN](./pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py) |       5       | 3 Class |   2x    |   8.12   |                |  64.7  |  57.6   |  58.4  |    52.1     | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class_20200831_204144-d1a706b1.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class_20200831_204144.log.json) |
+|                               above @ Car                                |               |         |   2x    |   8.12   |                |  68.5  |  67.9   |  60.1  |    59.6     |                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+|                            above @ Pedestrian                            |               |         |   2x    |   8.12   |                |  67.8  |  50.6   |  59.6  |    44.3     |                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+|                             above @ Cyclist                              |               |         |   2x    |   8.12   |                |  57.7  |  54.4   |  55.5  |    52.4     |                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+|   [SECFPN](./pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py)    |       1       |   Car   |   2x    |   7.76   |                |  72.1  |  71.5   |  63.6  |    63.1     |                                                                                                                           [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.log.json)                                                                                                                            |
+|  [SECFPN](./pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-3class.py)  |       1       | 3 Class |   2x    |   8.12   |                |  68.8  |  63.3   |  62.6  |    57.6     |                                                                                                                        [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class.log.json)                                                                                                                         |
+|                               above @ Car                                |               |         |   2x    |   8.12   |                |  71.6  |  71.0   |  63.1  |    62.5     |                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+|                            above @ Pedestrian                            |               |         |   2x    |   8.12   |                |  70.6  |  56.7   |  62.9  |    50.2     |                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+|                             above @ Cyclist                              |               |         |   2x    |   8.12   |                |  64.4  |  62.3   |  61.9  |    59.9     |                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+
+#### Note:
+
+- **Metric**: For model trained with 3 classes, the average APH@L2 (mAPH@L2) of all the categories is reported and used to rank the model. For model trained with only 1 class, the APH@L2 is reported and used to rank the model.
+- **Data Split**: Here we provide several baselines for waymo dataset, among which D5 means that we divide the dataset into 5 folds and only use one fold for efficient experiments. Using the complete dataset can boost the performance a lot, especially for the detection of cyclist and pedestrian, where more than 5 mAP or mAPH improvement can be expected.
+- **Implementation Details**: We basically follow the implementation in the [paper](https://arxiv.org/pdf/1912.04838.pdf) in terms of the network architecture (having a
+  stride of 1 for the first convolutional block). Different settings of voxelization, data augmentation and hyper parameters make these baselines outperform those in the paper by about 7 mAP for car and 4 mAP for pedestrian with only a subset of the whole dataset. All of these results are achieved without bells-and-whistles, e.g. ensemble, multi-scale training and test augmentation.
+- **License Aggrement**: To comply the [license agreement of Waymo dataset](https://waymo.com/open/terms/), the pre-trained models on Waymo dataset are not released. We still release the training log as a reference to ease the future research.
+- `FP16` means Mixed Precision (FP16) is adopted in training. With mixed precision training, we can train PointPillars with nuScenes dataset on 8 Titan XP GPUS with batch size of 2. This will cause OOM error without mixed precision training. The loss scale for PointPillars on nuScenes dataset is specifically tuned to avoid the loss to be Nan. We find 32 is more stable than 512, though loss scale 32 still cause Nan sometimes.
+
+## Citation
+
+```latex
+@inproceedings{lang2019pointpillars,
+  title={Pointpillars: Fast encoders for object detection from point clouds},
+  author={Lang, Alex H and Vora, Sourabh and Caesar, Holger and Zhou, Lubing and Yang, Jiong and Beijbom, Oscar},
+  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+  pages={12697--12705},
+  year={2019}
+}
+```
diff --git a/mmde/configs/pointpillars/metafile.yml b/mmde/configs/pointpillars/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..77f5692151a5480ce4e98df9dea967749a3d906f
--- /dev/null
+++ b/mmde/configs/pointpillars/metafile.yml
@@ -0,0 +1,215 @@
+Collections:
+  - Name: PointPillars
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Architecture:
+        - Feature Pyramid Network
+    Paper:
+      URL: https://arxiv.org/abs/1812.05784
+      Title: 'PointPillars: Fast Encoders for Object Detection from Point Clouds'
+    README: configs/pointpillars/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/voxel_encoders/pillar_encoder.py#L13
+      Version: v0.6.0
+
+Models:
+  - Name: pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py
+    Metadata:
+      Training Data: KITTI
+      Training Memory (GB): 5.4
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          AP: 77.6
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth
+
+  - Name: pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class
+    Alias: pointpillars_kitti-3class
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py
+    Metadata:
+      Training Data: KITTI
+      Training Memory (GB): 5.5
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          AP: 64.07
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20220301_150306-37dc2420.pth
+
+  - Name: pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Data: nuScenes
+      Training Memory (GB): 16.4
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 34.33
+          NDS: 49.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20210826_225857-f19d00a3.pth
+
+  - Name: pointpillars_hv_secfpn_sbn-all_8xb4-amp-2x_nus-3d
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-amp-2x_nus-3d.py
+    Metadata:
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x TITAN Xp
+      Architecture:
+        - Hard Voxelization
+      Training Data: nuScenes
+      Training Memory (GB): 8.37
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 35.19
+          NDS: 50.27
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth
+    Code:
+      Version: v0.7.0
+
+  - Name: pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Data: nuScenes
+      Training Memory (GB): 16.3
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 39.71
+          NDS: 53.15
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20210826_104936-fca299c1.pth
+
+  - Name: pointpillars_hv_fpn_sbn-all_8xb4-amp-2x_nus-3d
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-amp-2x_nus-3d.py
+    Metadata:
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x TITAN Xp
+      Architecture:
+        - Hard Voxelization
+      Training Data: nuScenes
+      Training Memory (GB): 8.40
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 39.26
+          NDS: 53.26
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth
+    Code:
+      Version: v0.7.0
+
+  - Name: pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d.py
+    Metadata:
+      Training Data: Lyft
+      Training Memory (GB): 12.2
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Lyft
+        Metrics:
+          Private Score: 13.8
+          Public Score: 14.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d_20210829_100455-82b81c39.pth
+
+  - Name: pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py
+    Metadata:
+      Training Data: Lyft
+      Training Memory (GB): 9.2
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Lyft
+        Metrics:
+          Private Score: 14.0
+          Public Score: 15.0
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210822_095429-0b3d6196.pth
+
+  - Name: pointpillars_hv_secfpn_sbn_2x16_2x_waymoD5-3d-car
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_secfpn_sbn_2x16_2x_waymoD5-3d-car.py
+    Metadata:
+      Training Data: Waymo
+      Training Memory (GB): 7.76
+      Training Resources: 8x GeForce GTX 1080 Ti
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Waymo
+        Metrics:
+          mAP@L1: 70.2
+          mAPH@L1: 69.6
+          mAP@L2: 62.6
+          mAPH@L2: 62.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car_20200901_204315-302fc3e7.pth
+
+  - Name: pointpillars_hv_secfpn_sbn_2x16_2x_waymoD5-3d-3class
+    Alias: pointpillars_waymod5-3class
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py
+    Metadata:
+      Training Data: Waymo
+      Training Memory (GB): 8.12
+      Training Resources: 8x GeForce GTX 1080 Ti
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Waymo
+        Metrics:
+          mAP@L1: 64.7
+          mAPH@L1: 57.6
+          mAP@L2: 58.4
+          mAPH@L2: 52.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class_20200831_204144-d1a706b1.pth
+
+  - Name: pointpillars_hv_secfpn_sbn_2x16_2x_waymo-3d-car
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py
+    Metadata:
+      Training Data: Waymo
+      Training Memory (GB): 7.76
+      Training Resources: 8x GeForce GTX 1080 Ti
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Waymo
+        Metrics:
+          mAP@L1: 72.1
+          mAPH@L1: 71.5
+          mAP@L2: 63.6
+          mAPH@L2: 63.1
+
+  - Name: pointpillars_hv_secfpn_sbn_2x16_2x_waymo-3d-3class
+    In Collection: PointPillars
+    Config: configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-3class.py
+    Metadata:
+      Training Data: Waymo
+      Training Memory (GB): 8.12
+      Training Resources: 8x GeForce GTX 1080 Ti
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Waymo
+        Metrics:
+          mAP@L1: 68.8
+          mAPH@L1: 63.3
+          mAP@L2: 62.6
+          mAPH@L2: 57.6
diff --git a/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d-range100.py b/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d-range100.py
new file mode 100644
index 0000000000000000000000000000000000000000..d912bf5357d3e58ad6c11e548fd17fc3742aaa13
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d-range100.py
@@ -0,0 +1,10 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_range100_lyft.py',
+    '../_base_/datasets/lyft-3d-range100.py',
+    '../_base_/schedules/schedule-2x.py', '../_base_/default_runtime.py'
+]
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py b/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..8491dead5d236c0eec84dda06eb5ff2737101c89
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py
@@ -0,0 +1,10 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_lyft.py',
+    '../_base_/datasets/lyft-3d.py', '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py'
+]
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-amp-2x_nus-3d.py b/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-amp-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c2d8e05b80b7199ec9ec67a813654aef93c0f76
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-amp-2x_nus-3d.py
@@ -0,0 +1,4 @@
+_base_ = './pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py'
+train_dataloader = dict(batch_size=2, num_workers=2)
+# schedule settings
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=4096.)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py b/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..6000f88b59b06c504e2756f7b901c876d8f1e59b
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py
@@ -0,0 +1,11 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_nus.py',
+    '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py'
+]
+
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+train_cfg = dict(val_interval=24)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..65786abb5e7f2c861f3de96ddd11d8a238bda22a
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py
@@ -0,0 +1,130 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_kitti.py',
+    '../_base_/datasets/kitti-3d-3class.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
+
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
+# dataset settings
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(classes=class_names)
+backend_args = None
+
+# PointPillars adopted a different sampling strategies among classes
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=15, Cyclist=15),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+# PointPillars uses different augmentation hyper parameters
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler, use_ground_plane=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    dataset=dict(dataset=dict(pipeline=train_pipeline, metainfo=metainfo)))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
+# In practice PointPillars also uses a different schedule
+# optimizer
+lr = 0.001
+epoch_num = 80
+optim_wrapper = dict(
+    optimizer=dict(lr=lr), clip_grad=dict(max_norm=35, norm_type=2))
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.4,
+        eta_min=lr * 10,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=epoch_num * 0.6,
+        eta_min=lr * 1e-4,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.4,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=epoch_num * 0.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=epoch_num * 0.6,
+        eta_min=1,
+        begin=epoch_num * 0.4,
+        end=epoch_num * 1,
+        convert_to_iter_based=True)
+]
+# max_norm=35 is slightly better than 10 for PointPillars in the earlier
+# development of the codebase thus we keep the setting. But we does not
+# specifically tune this parameter.
+# PointPillars usually need longer schedule than second, we simply double
+# the training schedule. Do remind that since we use RepeatDataset and
+# repeat factor is 2, so we actually train 160 epochs.
+train_cfg = dict(by_epoch=True, max_epochs=epoch_num, val_interval=2)
+val_cfg = dict()
+test_cfg = dict()
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ffd46ce51cb2dbf4cda648d029f5c22ca9e7bc1
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py
@@ -0,0 +1,101 @@
+# model settings
+_base_ = './pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py'
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+metainfo = dict(classes=class_names)
+backend_args = None
+
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
+
+model = dict(
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=1,
+        anchor_generator=dict(
+            _delete_=True,
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]],
+            sizes=[[3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=True)),
+    # model training and testing settings
+    train_cfg=dict(
+        _delete_=True,
+        assigner=dict(
+            type='Max3DIoUAssigner',
+            iou_calculator=dict(type='BboxOverlapsNearest3D'),
+            pos_iou_thr=0.6,
+            neg_iou_thr=0.45,
+            min_pos_iou=0.45,
+            ignore_iof_thr=-1),
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False))
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler, use_ground_plane=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+train_dataloader = dict(
+    dataset=dict(dataset=dict(pipeline=train_pipeline, metainfo=metainfo)))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-3class.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..61f8fba228afadca56c2c936cc41cbcb1b985a1b
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-3class.py
@@ -0,0 +1,14 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_waymo.py',
+    '../_base_/datasets/waymoD5-3d-3class.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+
+# data settings
+train_dataloader = dict(dataset=dict(dataset=dict(load_interval=1)))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (16 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..38bd95b3deb80497da3ba2493c6e50c5b9b41898
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py
@@ -0,0 +1,42 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_waymo.py',
+    '../_base_/datasets/waymoD5-3d-car.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+
+# data settings
+train_dataloader = dict(dataset=dict(dataset=dict(load_interval=1)))
+
+# model settings
+model = dict(
+    type='MVXFasterRCNN',
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=1,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345]],
+            sizes=[[4.73, 2.08, 1.77]],
+            rotations=[0, 1.57],
+            reshape_out=True)),
+    # model training and testing settings
+    train_cfg=dict(
+        _delete_=True,
+        pts=dict(
+            assigner=dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.55,
+                neg_iou_thr=0.4,
+                min_pos_iou=0.4,
+                ignore_iof_thr=-1),
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            pos_weight=-1,
+            debug=False)))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (16 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..f39d9eab290cc3d05f9c99e510eee82874b828df
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py
@@ -0,0 +1,6 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_waymo.py',
+    '../_base_/datasets/waymoD5-3d-3class.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-car.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2e98334ae60576728b5cd3a6519ac599eb7be91
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-car.py
@@ -0,0 +1,39 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_waymo.py',
+    '../_base_/datasets/waymoD5-3d-car.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+
+# model settings
+model = dict(
+    type='MVXFasterRCNN',
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=1,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345]],
+            sizes=[[4.73, 2.08, 1.77]],
+            rotations=[0, 1.57],
+            reshape_out=True)),
+    # model training and testing settings
+    train_cfg=dict(
+        _delete_=True,
+        pts=dict(
+            assigner=dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.55,
+                neg_iou_thr=0.4,
+                min_pos_iou=0.4,
+                ignore_iof_thr=-1),
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            pos_weight=-1,
+            debug=False)))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (16 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d-range100.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d-range100.py
new file mode 100644
index 0000000000000000000000000000000000000000..90c20714bbf563665b93ab8366dce9c3306f1471
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d-range100.py
@@ -0,0 +1,47 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_range100_lyft.py',
+    '../_base_/datasets/lyft-3d-range100.py',
+    '../_base_/schedules/schedule-2x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    pts_neck=dict(
+        _delete_=True,
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        in_channels=384,
+        feat_channels=384,
+        anchor_generator=dict(
+            _delete_=True,
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-100, -100, -1.0715024, 100, 100, -1.0715024],
+                    [-100, -100, -0.3033737, 100, 100, -0.3033737],
+                    [-100, -100, -0.3519405, 100, 100, -0.3519405],
+                    [-100, -100, -0.8871424, 100, 100, -0.8871424],
+                    [-100, -100, -0.6276341, 100, 100, -0.6276341],
+                    [-100, -100, -1.3220503, 100, 100, -1.3220503],
+                    [-100, -100, -1.0709302, 100, 100, -1.0709302],
+                    [-100, -100, -0.9122268, 100, 100, -0.9122268],
+                    [-100, -100, -1.8012227, 100, 100, -1.8012227]],
+            sizes=[
+                [4.75, 1.92, 1.71],  # car
+                [10.24, 2.84, 3.44],  # truck
+                [12.70, 2.92, 3.42],  # bus
+                [6.52, 2.42, 2.34],  # emergency vehicle
+                [8.17, 2.75, 3.20],  # other vehicle
+                [2.35, 0.96, 1.59],  # motorcycle
+                [1.76, 0.63, 1.44],  # bicycle
+                [0.80, 0.76, 1.76],  # pedestrian
+                [0.73, 0.35, 0.50]  # animal
+            ],
+            rotations=[0, 1.57],
+            reshape_out=True)))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..61c5f70bfabfc702f84ba4a15bf1f3a54ebd4393
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d.py
@@ -0,0 +1,48 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_lyft.py',
+    '../_base_/datasets/lyft-3d.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+# model settings
+model = dict(
+    pts_neck=dict(
+        _delete_=True,
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        in_channels=384,
+        feat_channels=384,
+        anchor_generator=dict(
+            _delete_=True,
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-80, -80, -1.0715024, 80, 80, -1.0715024],
+                    [-80, -80, -0.3033737, 80, 80, -0.3033737],
+                    [-80, -80, -0.3519405, 80, 80, -0.3519405],
+                    [-80, -80, -0.8871424, 80, 80, -0.8871424],
+                    [-80, -80, -0.6276341, 80, 80, -0.6276341],
+                    [-80, -80, -1.3220503, 80, 80, -1.3220503],
+                    [-80, -80, -1.0709302, 80, 80, -1.0709302],
+                    [-80, -80, -0.9122268, 80, 80, -0.9122268],
+                    [-80, -80, -1.8012227, 80, 80, -1.8012227]],
+            sizes=[
+                [4.75, 1.92, 1.71],  # car
+                [10.24, 2.84, 3.44],  # truck
+                [12.70, 2.92, 3.42],  # bus
+                [6.52, 2.42, 2.34],  # emergency vehicle
+                [8.17, 2.75, 3.20],  # other vehicle
+                [2.35, 0.96, 1.59],  # motorcycle
+                [1.76, 0.63, 1.44],  # bicycle
+                [0.80, 0.76, 1.76],  # pedestrian
+                [0.73, 0.35, 0.50]  # animal
+            ],
+            rotations=[0, 1.57],
+            reshape_out=True)))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-amp-2x_nus-3d.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-amp-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..2724dc12a03daf745576ae3e4d563893b8d27802
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-amp-2x_nus-3d.py
@@ -0,0 +1,4 @@
+_base_ = './pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py'
+train_dataloader = dict(batch_size=2, num_workers=2)
+# schedule settings
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=4096.)
diff --git a/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..51b5ae290c4f8f556e20767fcc985f3f29e53a8f
--- /dev/null
+++ b/mmde/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py
@@ -0,0 +1,48 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_nus.py',
+    '../_base_/datasets/nus-3d.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+# model settings
+model = dict(
+    pts_neck=dict(
+        _delete_=True,
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        in_channels=384,
+        feat_channels=384,
+        anchor_generator=dict(
+            _delete_=True,
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[
+                [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
+                [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
+                [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
+                [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
+                [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
+                [-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
+                [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
+            ],
+            sizes=[
+                [4.60718145, 1.95017717, 1.72270761],  # car
+                [6.73778078, 2.4560939, 2.73004906],  # truck
+                [12.01320693, 2.87427237, 3.81509561],  # trailer
+                [1.68452161, 0.60058911, 1.27192197],  # bicycle
+                [0.7256437, 0.66344886, 1.75748069],  # pedestrian
+                [0.40359262, 0.39694519, 1.06232151],  # traffic_cone
+                [0.48578221, 2.49008838, 0.98297065],  # barrier
+            ],
+            custom_values=[0, 0],
+            rotations=[0, 1.57],
+            reshape_out=True)))
+
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+train_cfg = dict(val_interval=24)
diff --git a/mmde/configs/pv_rcnn/README.md b/mmde/configs/pv_rcnn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5af319024b01fb8e13aa9660eec532963c86292d
--- /dev/null
+++ b/mmde/configs/pv_rcnn/README.md
@@ -0,0 +1,42 @@
+# PV-RCNN: Point-Voxel Feature Set Abstraction for 3D Object Detection
+
+> [PV-RCNN: Point-Voxel Feature Set Abstraction for 3D Object Detection](https://arxiv.org/abs/1912.13192)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+3D object detection has been receiving increasing attention from both industry and academia thanks to its wide applications in various fields such as autonomous driving and robotics. LiDAR sensors are widely adopted in autonomous driving vehicles and robots for capturing 3D scene information as sparse and irregular point clouds, which provide vital cues for 3D scene perception and understanding. In this paper, we propose to achieve high performance 3D object detection by designing novel point-voxel integrated networks to learn better 3D features from irregular point clouds.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/88368822/202114244-ccf52f56-b8c9-4f1b-9cc2-80c7a9952c99.png" width="800"/>
+</div>
+
+## Results and models
+
+### KITTI
+
+|                    Backbone                     |  Class  |  Lr schd   | Mem (GB) | Inf time (fps) |  mAP  |                                                                                                                                                                    Download                                                                                                                                                                    |
+| :---------------------------------------------: | :-----: | :--------: | :------: | :------------: | :---: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [SECFPN](./pv_rcnn_8xb2-80e_kitti-3d-3class.py) | 3 Class | cyclic 80e |   5.4    |                | 72.28 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class/pv_rcnn_8xb2-80e_kitti-3d-3class_20221117_234428-b384d22f.pth) \\ [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class/pv_rcnn_8xb2-80e_kitti-3d-3class_20221117_234428.json) |
+
+Note: mAP represents AP11 results on 3 Class under the moderate setting.
+
+Detailed performance on KITTI 3D detection (3D) is as follows, evaluated by AP11 metric:
+
+|            | Easy  | Moderate | Hard  |
+| ---------- | :---: | :------: | :---: |
+| Car        | 89.20 |  83.72   | 78.79 |
+| Pedestrian | 66.64 |  59.84   | 55.33 |
+| Cyclist    | 87.25 |  73.27   | 69.61 |
+
+## Citation
+
+```latex
+@article{ShaoshuaiShi2020PVRCNNPF,
+  title={PV-RCNN: Point-Voxel Feature Set Abstraction for 3D Object Detection},
+  author={Shaoshuai Shi and Chaoxu Guo and Li Jiang and Zhe Wang and Jianping Shi and Xiaogang Wang and Hongsheng Li},
+  journal={computer vision and pattern recognition},
+  year={2020}
+}
+```
diff --git a/mmde/configs/pv_rcnn/metafile.yml b/mmde/configs/pv_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ddef74959ef4a2892163bad9c639141714b4bcdf
--- /dev/null
+++ b/mmde/configs/pv_rcnn/metafile.yml
@@ -0,0 +1,29 @@
+Collections:
+  - Name: PV-RCNN
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - Feature Pyramid Network
+    Paper:
+      URL: https://arxiv.org/abs/1912.13192
+      Title: 'PV-RCNN: Point-Voxel Feature Set Abstraction for 3D Object Detection'
+    README: configs/pv_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/detectors/pv_rcnn.py#L12
+      Version: v1.1.0rc2
+
+Models:
+  - Name: pv_rcnn_8xb2-80e_kitti-3d-3class
+    In Collection: PV-RCNN
+    Config: configs/pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class.py
+    Metadata:
+      Training Memory (GB): 5.4
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 72.28
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class/pv_rcnn_8xb2-80e_kitti-3d-3class_20221117_234428-b384d22f.pth
diff --git a/mmde/configs/pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class.py b/mmde/configs/pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..f894c9aaca56c19702c75f637b4707b23a9cc9d7
--- /dev/null
+++ b/mmde/configs/pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class.py
@@ -0,0 +1,369 @@
+_base_ = [
+    '../_base_/datasets/kitti-3d-3class.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
+
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(CLASSES=class_names)
+backend_args = None
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler, use_ground_plane=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+
+model = dict(
+    type='PointVoxelRCNN',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=5,  # max_points_per_voxel
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(type='HardSimpleVFE'),
+    middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=4,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act'),
+        encoder_paddings=((0, 0, 0), ((1, 1, 1), 0, 0), ((1, 1, 1), 0, 0),
+                          ((0, 1, 1), 0, 0)),
+        return_middle_feats=True),
+    points_encoder=dict(
+        type='VoxelSetAbstraction',
+        num_keypoints=2048,
+        fused_out_channel=128,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        voxel_sa_cfgs_list=[
+            dict(
+                type='StackedSAModuleMSG',
+                in_channels=16,
+                scale_factor=1,
+                radius=(0.4, 0.8),
+                sample_nums=(16, 16),
+                mlp_channels=((16, 16), (16, 16)),
+                use_xyz=True),
+            dict(
+                type='StackedSAModuleMSG',
+                in_channels=32,
+                scale_factor=2,
+                radius=(0.8, 1.2),
+                sample_nums=(16, 32),
+                mlp_channels=((32, 32), (32, 32)),
+                use_xyz=True),
+            dict(
+                type='StackedSAModuleMSG',
+                in_channels=64,
+                scale_factor=4,
+                radius=(1.2, 2.4),
+                sample_nums=(16, 32),
+                mlp_channels=((64, 64), (64, 64)),
+                use_xyz=True),
+            dict(
+                type='StackedSAModuleMSG',
+                in_channels=64,
+                scale_factor=8,
+                radius=(2.4, 4.8),
+                sample_nums=(16, 32),
+                mlp_channels=((64, 64), (64, 64)),
+                use_xyz=True)
+        ],
+        rawpoints_sa_cfgs=dict(
+            type='StackedSAModuleMSG',
+            in_channels=1,
+            radius=(0.4, 0.8),
+            sample_nums=(16, 16),
+            mlp_channels=((16, 16), (16, 16)),
+            use_xyz=True),
+        bev_feat_channel=256,
+        bev_scale_factor=8),
+    backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    rpn_head=dict(
+        type='PartA2RPNHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        dir_offset=0.78539,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -1.78, 70.4, 40.0, -1.78]],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        assigner_per_size=True,
+        assign_per_class=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    roi_head=dict(
+        type='PVRCNNRoiHead',
+        num_classes=3,
+        semantic_head=dict(
+            type='ForegroundSegmentationHead',
+            in_channels=640,
+            extra_width=0.1,
+            loss_seg=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                gamma=2.0,
+                alpha=0.25,
+                activated=True,
+                loss_weight=1.0)),
+        bbox_roi_extractor=dict(
+            type='Batch3DRoIGridExtractor',
+            grid_size=6,
+            roi_layer=dict(
+                type='StackedSAModuleMSG',
+                in_channels=128,
+                radius=(0.8, 1.6),
+                sample_nums=(16, 16),
+                mlp_channels=((64, 64), (64, 64)),
+                use_xyz=True,
+                pool_mod='max'),
+        ),
+        bbox_head=dict(
+            type='PVRCNNBBoxHead',
+            in_channels=128,
+            grid_size=6,
+            num_classes=3,
+            class_agnostic=True,
+            shared_fc_channels=(256, 256),
+            reg_channels=(256, 256),
+            cls_channels=(256, 256),
+            dropout_ratio=0.3,
+            with_corner_loss=True,
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss',
+                beta=1.0 / 9.0,
+                reduction='sum',
+                loss_weight=1.0),
+            loss_cls=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1)
+            ],
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=9000,
+            nms_post=512,
+            max_num=512,
+            nms_thr=0.8,
+            score_thr=0,
+            use_rotate_nms=True),
+        rcnn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1)
+            ],
+            sampler=dict(
+                type='IoUNegPiecewiseSampler',
+                num=128,
+                pos_fraction=0.5,
+                neg_piece_fractions=[0.8, 0.2],
+                neg_iou_piece_thrs=[0.55, 0.1],
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False,
+                return_iou=True),
+            cls_pos_thr=0.75,
+            cls_neg_thr=0.25)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1024,
+            nms_post=100,
+            max_num=100,
+            nms_thr=0.7,
+            score_thr=0,
+            use_rotate_nms=True),
+        rcnn=dict(
+            use_rotate_nms=True,
+            use_raw_score=True,
+            nms_thr=0.1,
+            score_thr=0.1)))
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    dataset=dict(dataset=dict(pipeline=train_pipeline, metainfo=metainfo)))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
+eval_dataloader = dict(dataset=dict(pipeline=test_pipeline, metainfo=metainfo))
+lr = 0.001
+optim_wrapper = dict(optimizer=dict(lr=lr))
+param_scheduler = [
+    # learning rate scheduler
+    # During the first 16 epochs, learning rate increases from 0 to lr * 10
+    # during the next 24 epochs, learning rate decreases from lr * 10 to
+    # lr * 1e-4
+    dict(
+        type='CosineAnnealingLR',
+        T_max=15,
+        eta_min=lr * 10,
+        begin=0,
+        end=15,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=25,
+        eta_min=lr * 1e-4,
+        begin=15,
+        end=40,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 16 epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next 24 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=15,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=15,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=25,
+        eta_min=1,
+        begin=15,
+        end=40,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
diff --git a/mmde/configs/regnet/README.md b/mmde/configs/regnet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4c992ede55dc4d4d1ece681c91ce2e5698ae202a
--- /dev/null
+++ b/mmde/configs/regnet/README.md
@@ -0,0 +1,82 @@
+# Designing Network Design Spaces
+
+> [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678)
+
+<!-- [BACKBONE] -->
+
+## Abstract
+
+In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/144025148-b73002cb-3c82-42e4-8da4-65df97aead9c.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement RegNetX models in 3D detection systems and provide their first results with PointPillars on nuScenes and Lyft dataset.
+
+The pre-trained modles are converted from [model zoo of pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md) and maintained in [mmcv](https://github.com/open-mmlab/mmcv).
+
+## Usage
+
+To use a regnet model, there are two steps to do:
+
+1. Convert the model to ResNet-style supported by MMDetection
+2. Modify backbone and neck in config accordingly
+
+### Convert model
+
+We already prepare models of FLOPs from 800M to 12G in our model zoo.
+
+For more general usage, we also provide script `regnet2mmdet.py` in the tools directory to convert the key of models pretrained by [pycls](https://github.com/facebookresearch/pycls/) to
+ResNet-style checkpoints used in MMDetection.
+
+```bash
+python -u tools/model_converters/regnet2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH}
+```
+
+This script convert model from `PRETRAIN_PATH` and store the converted model in `STORE_PATH`.
+
+### Modify config
+
+The users can modify the config's `depth` of backbone and corresponding keys in `arch` according to the configs in the [pycls model zoo](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md).
+The parameter `in_channels` in FPN can be found in the Figure 15 & 16 of the paper (`wi` in the legend).
+This directory already provides some configs with their performance, using RegNetX from 800MF to 12GF level.
+For other pre-trained models or self-implemented regnet models, the users are responsible to check these parameters by themselves.
+
+**Note**: Although Fig. 15 & 16 also provide `w0`, `wa`, `wm`, `group_w`, and `bot_mul` for `arch`, they are quantized thus inaccurate, using them sometimes produces different backbone that does not match the key in the pre-trained model.
+
+## Results and models
+
+### nuScenes
+
+|                                        Backbone                                         | Lr schd | Mem (GB) | Inf time (fps) |  mAP  | NDS  |                                                                                                                                                                                                                       Download                                                                                                                                                                                                                       |
+| :-------------------------------------------------------------------------------------: | :-----: | :------: | :------------: | :---: | :--: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|       [SECFPN](../pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py)        |   2x    |   16.4   |                | 35.17 | 49.7 |                     [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json)                     |
+| [RegNetX-400MF-SECFPN](./pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb4-2x_nus-3d.py) |   2x    |   16.4   |                | 41.2  | 55.2 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334-53044f32.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334.log.json) |
+|          [FPN](../pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py)           |   2x    |   17.1   |                | 40.0  | 53.3 |                           [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json)                           |
+|    [RegNetX-400MF-FPN](./pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py)    |   2x    |   17.3   |                | 44.8  | 56.4 |       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239-c694dce7.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239.log.json)       |
+|    [RegNetX-1.6gF-FPN](./pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d.py)    |   2x    |   24.0   |                | 48.2  | 59.3 |       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311-dcd4e090.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311.log.json)       |
+
+### Lyft
+
+|                                        Backbone                                         | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score |                                                                                                                                                                                                                         Download                                                                                                                                                                                                                         |
+| :-------------------------------------------------------------------------------------: | :-----: | :------: | :------------: | :-----------: | :----------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|       [SECFPN](../pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d.py)       |   2x    |   12.2   |                |     13.9      |     14.1     |                     [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d_20210517_204807-2518e3de.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d_20210517_204807.log.json)                     |
+| [RegNetX-400MF-SECFPN](./hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_lyft-3d.py) |   2x    |   15.9   |                |     14.9      |     15.1     | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d_20210524_092151-42513826.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d_20210524_092151.log.json) |
+|          [FPN](../pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py)          |   2x    |   9.2    |                |     14.9      |     15.1     |                           [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818-fc6904c3.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818.log.json)                           |
+|    [RegNetX-400MF-FPN](./hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_lyft-3d.py)    |   2x    |   13.0   |                |     16.0      |     16.1     |       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d_20210521_115618-823dcf18.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d_20210521_115618.log.json)       |
+
+## Citation
+
+```latex
+@article{radosavovic2020designing,
+    title={Designing Network Design Spaces},
+    author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár},
+    year={2020},
+    eprint={2003.13678},
+    archivePrefix={arXiv},
+    primaryClass={cs.CV}
+}
+```
diff --git a/mmde/configs/regnet/metafile.yml b/mmde/configs/regnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c2caa296abb8be24576dd7aa48da88836bfd2ab1
--- /dev/null
+++ b/mmde/configs/regnet/metafile.yml
@@ -0,0 +1,85 @@
+Models:
+  - Name: pointpillars_hv_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d
+    In Collection: PointPillars
+    Config: configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Data: nuScenes
+      Training Memory (GB): 16.4
+      Architecture:
+        - RegNetX
+        - Hard Voxelization
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 41.2
+          NDS: 55.2
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334-53044f32.pth
+
+  - Name: pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d
+    In Collection: PointPillars
+    Config: configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Data: nuScenes
+      Training Memory (GB): 17.3
+      Architecture:
+        - RegNetX
+        - Hard Voxelization
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 44.8
+          NDS: 56.4
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239-c694dce7.pth
+
+  - Name: pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d
+    In Collection: PointPillars
+    Config: configs/regnet/pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d.py
+    Metadata:
+      Training Data: nuScenes
+      Training Memory (GB): 24.0
+      Architecture:
+        - RegNetX
+        - Hard Voxelization
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 48.2
+          NDS: 59.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311-dcd4e090.pth
+
+  - Name: pointpillars_hv_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d
+    In Collection: PointPillars
+    Config: configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py
+    Metadata:
+      Training Data: Lyft
+      Training Memory (GB): 15.9
+      Architecture:
+        - RegNetX
+        - Hard Voxelization
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Lyft
+        Metrics:
+          Private Score: 14.9
+          Public Score: 15.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d_20210524_092151-42513826.pth
+
+  - Name: pointpillars_hv_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d
+    In Collection: PointPillars
+    Config: configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d.py
+    Metadata:
+      Training Data: Lyft
+      Training Memory (GB): 13.0
+      Architecture:
+        - RegNetX
+        - Hard Voxelization
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Lyft
+        Metrics:
+          Private Score: 16.0
+          Public Score: 16.1
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d_20210521_115618-823dcf18.pth
diff --git a/mmde/configs/regnet/pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d.py b/mmde/configs/regnet/pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..97fe8a3dfae58f264f2f16a7b876519b70029532
--- /dev/null
+++ b/mmde/configs/regnet/pointpillars_hv_regnet-1.6gf_fpn_sbn-all_8xb4-2x_nus-3d.py
@@ -0,0 +1,24 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_nus.py',
+    '../_base_/datasets/nus-3d.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+# model settings
+model = dict(
+    type='MVXFasterRCNN',
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_1.6gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[168, 408, 912]))
diff --git a/mmde/configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb2-2x_lyft-3d.py b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb2-2x_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..90df3f5da072dbc8592026f2a78b08cdbae517b6
--- /dev/null
+++ b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb2-2x_lyft-3d.py
@@ -0,0 +1,29 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_lyft.py',
+    '../_base_/datasets/lyft-3d.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+# model settings
+model = dict(
+    type='MVXFasterRCNN',
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[64, 160, 384]))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2118b58dc18c170c9b6ae98157803fdb8aceadc
--- /dev/null
+++ b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py
@@ -0,0 +1,24 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_nus.py',
+    '../_base_/datasets/nus-3d.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+# model settings
+model = dict(
+    type='MVXFasterRCNN',
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[64, 160, 384]))
diff --git a/mmde/configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_range100_8xb2-2x_lyft-3d.py b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_range100_8xb2-2x_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e97e24e3b60e96d3b4a8ad5d6b909ab02ad1ffac
--- /dev/null
+++ b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_fpn_sbn-all_range100_8xb2-2x_lyft-3d.py
@@ -0,0 +1,29 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_range100_lyft.py',
+    '../_base_/datasets/lyft-3d-range100.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+# model settings
+model = dict(
+    type='MVXFasterRCNN',
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[64, 160, 384]))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb2-2x_lyft-3d.py b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb2-2x_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..074e6d3b2f74230950342df2421f3470a5c24b53
--- /dev/null
+++ b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb2-2x_lyft-3d.py
@@ -0,0 +1,39 @@
+_base_ = './pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb2-2x_lyft-3d.py'
+# model settings
+model = dict(
+    pts_neck=dict(
+        type='SECONDFPN',
+        _delete_=True,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 160, 384],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        in_channels=384,
+        feat_channels=384,
+        anchor_generator=dict(
+            _delete_=True,
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-80, -80, -1.0715024, 80, 80, -1.0715024],
+                    [-80, -80, -0.3033737, 80, 80, -0.3033737],
+                    [-80, -80, -0.3519405, 80, 80, -0.3519405],
+                    [-80, -80, -0.8871424, 80, 80, -0.8871424],
+                    [-80, -80, -0.6276341, 80, 80, -0.6276341],
+                    [-80, -80, -1.3220503, 80, 80, -1.3220503],
+                    [-80, -80, -1.0709302, 80, 80, -1.0709302],
+                    [-80, -80, -0.9122268, 80, 80, -0.9122268],
+                    [-80, -80, -1.8012227, 80, 80, -1.8012227]],
+            sizes=[
+                [4.75, 1.92, 1.71],  # car
+                [10.24, 2.84, 3.44],  # truck
+                [12.70, 2.92, 3.42],  # bus
+                [6.52, 2.42, 2.34],  # emergency vehicle
+                [8.17, 2.75, 3.20],  # other vehicle
+                [2.35, 0.96, 1.59],  # motorcycle
+                [1.76, 0.63, 1.44],  # bicycle
+                [0.80, 0.76, 1.76],  # pedestrian
+                [0.73, 0.35, 0.50]  # animal
+            ],
+            rotations=[0, 1.57],
+            reshape_out=True)))
diff --git a/mmde/configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb4-2x_nus-3d.py b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb4-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..b48b709b343d6f8fd8eb91429e10eea00fb1e71a
--- /dev/null
+++ b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb4-2x_nus-3d.py
@@ -0,0 +1,38 @@
+_base_ = './pointpillars_hv_regnet-400mf_fpn_sbn-all_8xb4-2x_nus-3d.py'
+# model settings
+model = dict(
+    pts_neck=dict(
+        type='SECONDFPN',
+        _delete_=True,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 160, 384],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        in_channels=384,
+        feat_channels=384,
+        anchor_generator=dict(
+            _delete_=True,
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[
+                [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
+                [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
+                [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
+                [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
+                [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
+                [-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
+                [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
+            ],
+            sizes=[
+                [4.60718145, 1.95017717, 1.72270761],  # car
+                [6.73778078, 2.4560939, 2.73004906],  # truck
+                [12.01320693, 2.87427237, 3.81509561],  # trailer
+                [1.68452161, 0.60058911, 1.27192197],  # bicycle
+                [0.7256437, 0.66344886, 1.75748069],  # pedestrian
+                [0.40359262, 0.39694519, 1.06232151],  # traffic_cone
+                [0.48578221, 2.49008838, 0.98297065],  # barrier
+            ],
+            custom_values=[0, 0],
+            rotations=[0, 1.57],
+            reshape_out=True)))
diff --git a/mmde/configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_range100_8xb2-2x_lyft-3d.py b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_range100_8xb2-2x_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a484349e9947ec8502df89f3174873759ad3ff35
--- /dev/null
+++ b/mmde/configs/regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_range100_8xb2-2x_lyft-3d.py
@@ -0,0 +1,40 @@
+_base_ = \
+    './pointpillars_hv_regnet-400mf_fpn_sbn-all_range100_8xb2-2x_lyft-3d.py'
+# model settings
+model = dict(
+    pts_neck=dict(
+        type='SECONDFPN',
+        _delete_=True,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 160, 384],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        in_channels=384,
+        feat_channels=384,
+        anchor_generator=dict(
+            _delete_=True,
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-100, -100, -1.0715024, 100, 100, -1.0715024],
+                    [-100, -100, -0.3033737, 100, 100, -0.3033737],
+                    [-100, -100, -0.3519405, 100, 100, -0.3519405],
+                    [-100, -100, -0.8871424, 100, 100, -0.8871424],
+                    [-100, -100, -0.6276341, 100, 100, -0.6276341],
+                    [-100, -100, -1.3220503, 100, 100, -1.3220503],
+                    [-100, -100, -1.0709302, 100, 100, -1.0709302],
+                    [-100, -100, -0.9122268, 100, 100, -0.9122268],
+                    [-100, -100, -1.8012227, 100, 100, -1.8012227]],
+            sizes=[
+                [4.75, 1.92, 1.71],  # car
+                [10.24, 2.84, 3.44],  # truck
+                [12.70, 2.92, 3.42],  # bus
+                [6.52, 2.42, 2.34],  # emergency vehicle
+                [8.17, 2.75, 3.20],  # other vehicle
+                [2.35, 0.96, 1.59],  # motorcycle
+                [1.76, 0.63, 1.44],  # bicycle
+                [0.80, 0.76, 1.76],  # pedestrian
+                [0.73, 0.35, 0.50]  # animal
+            ],
+            rotations=[0, 1.57],
+            reshape_out=True)))
diff --git a/mmde/configs/sassd/README.md b/mmde/configs/sassd/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d1eb771b2c18047ea55e8f1e869abef89923bceb
--- /dev/null
+++ b/mmde/configs/sassd/README.md
@@ -0,0 +1,28 @@
+# Structure Aware Single-stage 3D Object Detection from Point Cloud
+
+> [Structure Aware Single-stage 3D Object Detection from Point Cloud](<%5Bhttps://arxiv.org/abs/2104.02323%5D(https://openaccess.thecvf.com/content_CVPR_2020/papers/He_Structure_Aware_Single-Stage_3D_Object_Detection_From_Point_Cloud_CVPR_2020_paper.pdf)>)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+3D object detection from point cloud data plays an essential role in autonomous driving. Current single-stage detectors are efficient by progressively downscaling the 3D point clouds in a fully convolutional manner. However, the downscaled features inevitably lose spatial information and cannot make full use of the structure information of 3D point cloud, degrading their localization precision. In this work, we propose to improve the localization precision of single-stage detectors by explicitly leveraging the structure information of 3D point cloud. Specifically, we design an auxiliary network which converts the convolutional features in the backbone network back to point-level representations. The auxiliary network is jointly optimized, by two point-level supervisions, to guide the convolutional features in the backbone network to be aware of the object structure. The auxiliary network can be detached after training and therefore introduces no extra computation in the inference stage. Besides, considering that single-stage detectors suffer from the discordance between the predicted bounding boxes and corresponding classification confidences, we develop an efficient part-sensitive warping operation to align the confidences to the predicted bounding boxes. Our proposed detector ranks at the top of KITTI 3D/BEV detection leaderboards and runs at 25 FPS for inference.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/30491025/172526367-c8b9bdf7-f901-4f2f-8855-bfd55c39f8d1.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement SA-SSD and provide the results and checkpoints on KITTI dataset.
+
+## Citation
+
+```latex
+@InProceedings{he2020sassd,
+    title={Structure Aware Single-stage 3D Object Detection from Point Cloud},
+    author={He, Chenhang and Zeng, Hui and Huang, Jianqiang and Hua, Xian-Sheng and Zhang, Lei},
+    booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+    year={2020}
+}
+```
diff --git a/mmde/configs/sassd/sassd_8xb6-80e_kitti-3d-3class.py b/mmde/configs/sassd/sassd_8xb6-80e_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..b145f49a5fe7ffb5e3bf823a5aa7530b809c6f71
--- /dev/null
+++ b/mmde/configs/sassd/sassd_8xb6-80e_kitti-3d-3class.py
@@ -0,0 +1,99 @@
+_base_ = [
+    '../_base_/datasets/kitti-3d-3class.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
+
+voxel_size = [0.05, 0.05, 0.1]
+
+model = dict(
+    type='SASSD',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=5,
+            point_cloud_range=[0, -40, -3, 70.4, 40, 1],
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(type='HardSimpleVFE'),
+    middle_encoder=dict(
+        type='SparseEncoderSASSD',
+        in_channels=4,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act')),
+    backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.35,
+                neg_iou_thr=0.2,
+                min_pos_iou=0.2,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.35,
+                neg_iou_thr=0.2,
+                min_pos_iou=0.2,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
diff --git a/mmde/configs/second/README.md b/mmde/configs/second/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e91c4bcab552daf7a588f4bb33769538a67f25d9
--- /dev/null
+++ b/mmde/configs/second/README.md
@@ -0,0 +1,54 @@
+# Second: Sparsely embedded convolutional detection
+
+> [SECOND: Sparsely Embedded Convolutional Detection](https://www.mdpi.com/1424-8220/18/10/3337)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+LiDAR-based or RGB-D-based object detection is used in numerous applications, ranging from autonomous driving to robot vision. Voxel-based 3D convolutional networks have been used for some time to enhance the retention of information when processing point cloud LiDAR data. However, problems remain, including a slow inference speed and low orientation estimation performance. We therefore investigate an improved sparse convolution method for such networks, which significantly increases the speed of both training and inference. We also introduce a new form of angle loss regression to improve the orientation estimation performance and a new data augmentation approach that can enhance the convergence speed and performance. The proposed network produces state-of-the-art results on the KITTI 3D object detection benchmarks while maintaining a fast inference speed.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/143889364-10be11c3-838e-4fc9-9613-184f0cd08907.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement SECOND and provide the results and checkpoints on KITTI dataset.
+
+## Results and models
+
+### KITTI
+
+|                              Backbone                               |  Class  |  Lr schd   | Mem (GB) | Inf time (fps) |  mAP  |                                                                                                                                                                                             Download                                                                                                                                                                                             |
+| :-----------------------------------------------------------------: | :-----: | :--------: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|        [SECFPN](./second_hv_secfpn_8xb6-80e_kitti-3d-car.py)        |   Car   | cyclic 80e |   5.4    |                | 78.2  |                       [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-car/second_hv_secfpn_8xb6-80e_kitti-3d-car-75d9305e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-car/second_hv_secfpn_8xb6-80e_kitti-3d-car-20230420_191750.log)                        |
+|  [SECFPN (FP16)](./second_hv_secfpn_8xb6-amp-80e_kitti-3d-car.py)   |   Car   | cyclic 80e |   2.9    |                | 78.72 |       [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth)\| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301.log.json)        |
+|      [SECFPN](./second_hv_secfpn_8xb6-80e_kitti-3d-3class.py)       | 3 Class | cyclic 80e |   5.4    |                | 65.3  |                 [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class/second_hv_secfpn_8xb6-80e_kitti-3d-3class-b086d0a3.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class/second_hv_secfpn_8xb6-80e_kitti-3d-3class-20230420_221130.log)                  |
+| [SECFPN (FP16)](./second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class.py) | 3 Class | cyclic 80e |   2.9    |                | 67.4  | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059.log.json) |
+
+### Waymo
+
+|                              Backbone                              | Load Interval |  Class  | Lr schd | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** |                                                                                           Download                                                                                            |
+| :----------------------------------------------------------------: | :-----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----: | :---------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [SECFPN](./second_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py) |       5       | 3 Class |   2x    |   8.12   |                |  65.3  |  61.7   |  58.9  |    55.7     | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_sbn_4x8_2x_waymoD5-3d-3class/hv_second_secfpn_sbn_4x8_2x_waymoD5-3d-3class_20201115_112448.log.json) |
+|                            above @ Car                             |               |         |   2x    |   8.12   |                |  67.1  |  66.6   |  58.7  |    58.2     |                                                                                                                                                                                               |
+|                         above @ Pedestrian                         |               |         |   2x    |   8.12   |                |  68.1  |  59.1   |  59.5  |    51.5     |                                                                                                                                                                                               |
+|                          above @ Cyclist                           |               |         |   2x    |   8.12   |                |  60.7  |  59.5   |  58.4  |    57.3     |                                                                                                                                                                                               |
+
+Note:
+
+- See more details about metrics and data split on Waymo [HERE](https://github.com/open-mmlab/mmdetection3d/tree/main/configs/pointpillars). For implementation details, we basically follow the original settings. All of these results are achieved without bells-and-whistles, e.g. ensemble, multi-scale training and test augmentation.
+- `FP16` means Mixed Precision (FP16) is adopted in training.
+
+## Citation
+
+```latex
+@article{yan2018second,
+  title={Second: Sparsely embedded convolutional detection},
+  author={Yan, Yan and Mao, Yuxing and Li, Bo},
+  journal={Sensors},
+  year={2018},
+  publisher={Multidisciplinary Digital Publishing Institute}
+}
+```
diff --git a/mmde/configs/second/metafile.yml b/mmde/configs/second/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..097fb73f299a84b7fcacc521d5a590e663bbe9e9
--- /dev/null
+++ b/mmde/configs/second/metafile.yml
@@ -0,0 +1,97 @@
+Collections:
+  - Name: SECOND
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Architecture:
+        - Hard Voxelization
+    Paper:
+      URL: https://www.mdpi.com/1424-8220/18/10/3337
+      Title: 'SECOND: Sparsely Embedded Convolutional Detection'
+    README: configs/second/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/backbones/second.py#L11
+      Version: v0.5.0
+
+Models:
+  - Name: second_hv_secfpn_8xb6-80e_kitti-3d-car
+    In Collection: SECOND
+    Config: configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-car.py
+    Metadata:
+      Training Data: KITTI
+      Training Memory (GB): 5.4
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 78.2
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-car/second_hv_secfpn_8xb6-80e_kitti-3d-car-75d9305e.pth
+
+  - Name: second_hv_secfpn_8xb6-80e_kitti-3d-3class
+    In Collection: SECOND
+    Config: configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py
+    Metadata:
+      Training Data: KITTI
+      Training Memory (GB): 5.4
+      Training Resources: 8x V100 GPUs
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 65.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class/second_hv_secfpn_8xb6-80e_kitti-3d-3class-b086d0a3.pth
+
+  - Name: second_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class
+    In Collection: SECOND
+    Config: configs/second/second_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py
+    Metadata:
+      Training Data: Waymo
+      Training Memory (GB): 8.12
+      Training Resources: 8x GeForce GTX 1080 Ti
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Waymo
+        Metrics:
+          mAP@L1: 65.3
+          mAPH@L1: 61.7
+          mAP@L2: 58.9
+          mAPH@L2: 55.7
+
+  - Name: second_hv_secfpn_8xb6-amp-80e_kitti-3d-car
+    In Collection: SECOND
+    Config: configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-car.py
+    Metadata:
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x TITAN Xp
+      Training Data: KITTI
+      Training Memory (GB): 2.9
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 78.72
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth
+    Code:
+      Version: v0.7.0
+
+  - Name: second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class
+    In Collection: SECOND
+    Config: configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class.py
+    Metadata:
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x TITAN Xp
+      Training Data: KITTI
+      Training Memory (GB): 2.9
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 67.4
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth
+    Code:
+      Version: v0.7.0
diff --git a/mmde/configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py b/mmde/configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecd04eef92435fd9ac7cb505c92f07d062354e2e
--- /dev/null
+++ b/mmde/configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/second_hv_secfpn_kitti.py',
+    '../_base_/datasets/kitti-3d-3class.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-car.py b/mmde/configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2c40834a533f3c479b62ae0569475799c4a35ba
--- /dev/null
+++ b/mmde/configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-car.py
@@ -0,0 +1,30 @@
+_base_ = [
+    '../_base_/models/second_hv_secfpn_kitti.py',
+    '../_base_/datasets/kitti-3d-car.py', '../_base_/schedules/cyclic-40e.py',
+    '../_base_/default_runtime.py'
+]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+model = dict(
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=1,
+        anchor_generator=dict(
+            _delete_=True,
+            type='Anchor3DRangeGenerator',
+            ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
+            sizes=[[3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=True)),
+    # model training and testing settings
+    train_cfg=dict(
+        _delete_=True,
+        assigner=dict(
+            type='Max3DIoUAssigner',
+            iou_calculator=dict(type='BboxOverlapsNearest3D'),
+            pos_iou_thr=0.6,
+            neg_iou_thr=0.45,
+            min_pos_iou=0.45,
+            ignore_iof_thr=-1),
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False))
diff --git a/mmde/configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class.py b/mmde/configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..38ba58e5f38ae6a1c317bb8f4c3a75333b3e7103
--- /dev/null
+++ b/mmde/configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-3class.py
@@ -0,0 +1,4 @@
+_base_ = 'second_hv_secfpn_8xb6-80e_kitti-3d-3class.py'
+
+# schedule settings
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=4096.)
diff --git a/mmde/configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-car.py b/mmde/configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..99de41d97620b130c5857f986600ff85a48eca67
--- /dev/null
+++ b/mmde/configs/second/second_hv_secfpn_8xb6-amp-80e_kitti-3d-car.py
@@ -0,0 +1,4 @@
+_base_ = 'second_hv_secfpn_8xb6-80e_kitti-3d-car.py'
+
+# schedule settings
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=4096.)
diff --git a/mmde/configs/second/second_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py b/mmde/configs/second/second_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f313021f713803295be8861bbb61f5026f5a369
--- /dev/null
+++ b/mmde/configs/second/second_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py
@@ -0,0 +1,145 @@
+_base_ = [
+    '../_base_/models/second_hv_secfpn_waymo.py',
+    '../_base_/datasets/waymoD5-3d-3class.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+metainfo = dict(classes=class_names)
+
+point_cloud_range = [-76.8, -51.2, -2, 76.8, 51.2, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    # dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(type='Pack3DDetInputs', keys=['points']),
+        ])
+]
+
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='waymo_infos_train.pkl',
+            data_prefix=dict(pts='training/velodyne'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            # load one frame every five frames
+            load_interval=5,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (16 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/smoke/README.md b/mmde/configs/smoke/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b03c9403463014f9fa938a9c09ff9acf37dc3e6f
--- /dev/null
+++ b/mmde/configs/smoke/README.md
@@ -0,0 +1,47 @@
+# SMOKE: Single-Stage Monocular 3D Object Detection via Keypoint Estimation
+
+> [SMOKE: Single-Stage Monocular 3D Object Detection via Keypoint Estimation](https://arxiv.org/abs/2002.10111)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Estimating 3D orientation and translation of objects is essential for infrastructure-less autonomous navigation and driving. In case of monocular vision, successful methods have been mainly based on two ingredients: (i) a network generating 2D region proposals, (ii) a R-CNN structure predicting 3D object pose by utilizing the acquired regions of interest. We argue that the 2D detection network is redundant and introduces non-negligible noise for 3D detection. Hence, we propose a novel 3D object detection method, named SMOKE, in this paper that predicts a 3D bounding box for each detected object by combining a single keypoint estimate with regressed 3D variables. As a second contribution, we propose a multi-step disentangling approach for constructing the 3D bounding box, which significantly improves both training convergence and detection accuracy. In contrast to previous 3D detection techniques, our method does not require complicated pre/post-processing, extra data, and a refinement stage. Despite of its structural simplicity, our proposed SMOKE network outperforms all existing monocular 3D detection methods on the KITTI dataset, giving the best state-of-the-art result on both 3D object detection and Bird's eye view evaluation.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/143886681-52cb72b9-6635-4624-a728-1c243b046517.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement SMOKE and provide the results and checkpoints on KITTI dataset.
+
+## Results and models
+
+### KITTI
+
+|                           Backbone                            | Lr schd | Mem (GB) | Inf time (fps) |  mAP  |                                                                                                                                                         Download                                                                                                                                                         |
+| :-----------------------------------------------------------: | :-----: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [DLA34](./smoke_dla34_dlaneck_gn-all_4xb8-6x_kitti-mono3d.py) |   6x    |   9.64   |                | 13.85 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d_20210929_015553-d46d9bb0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d_20210929_015553.log.json) |
+
+Note: mAP represents Car moderate 3D strict AP11 results.
+
+Detailed performance on KITTI 3D detection (3D/BEV) is as follows, evaluated by AP11 metric:
+
+|            |     Easy      |   Moderate    |     Hard      |
+| ---------- | :-----------: | :-----------: | :-----------: |
+| Car        | 16.92 / 22.97 | 13.85 / 18.32 | 11.90 / 15.88 |
+| Pedestrian | 11.13 / 12.61 | 11.10 / 11.32 | 10.67 / 11.14 |
+| Cyclist    | 0.99  / 1.47  |  0.54 / 0.65  |  0.55 / 0.67  |
+
+## Citation
+
+```latex
+@inproceedings{liu2020smoke,
+  title={Smoke: Single-stage monocular 3d object detection via keypoint estimation},
+  author={Liu, Zechen and Wu, Zizhang and T{\'o}th, Roland},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops},
+  pages={996--997},
+  year={2020}
+}
+```
diff --git a/mmde/configs/smoke/metafile.yml b/mmde/configs/smoke/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..df555d8d79f6f580c920293eebe29d087a34137e
--- /dev/null
+++ b/mmde/configs/smoke/metafile.yml
@@ -0,0 +1,30 @@
+Collections:
+  - Name: SMOKE
+    Metadata:
+      Training Data: KITTI
+      Training Techniques:
+        - Adam
+      Training Resources: 4x V100 GPUS
+      Architecture:
+        - SMOKEMono3DHead
+        - DLA
+    Paper:
+      URL: https://arxiv.org/abs/2002.10111
+      Title: 'SMOKE: Single-Stage Monocular 3D Object Detection via Keypoint Estimation'
+    README: configs/smoke/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/mmdet3d/models/detectors/smoke_mono3d.py#L7
+      Version: v1.0.0
+
+Models:
+  - Name: smoke_dla34_dlaneck_gn-all_4xb8-6x_kitti-mono3d
+    In Collection: SMOKE
+    Config: configs/smoke/smoke_dla34_dlaneck_gn-all_4xb8-6x_kitti-mono3d.py
+    Metadata:
+      Training Memory (GB): 9.6
+    Results:
+      - Task: 3D Object Detection
+        Dataset: KITTI
+        Metrics:
+          mAP: 13.8
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/smoke/smoke_dla34_pytorch_dlaneck_gn-all_8x4_6x_kitti-mono3d_20210929_015553-d46d9bb0.pth
diff --git a/mmde/configs/smoke/smoke_dla34_dlaneck_gn-all_4xb8-6x_kitti-mono3d.py b/mmde/configs/smoke/smoke_dla34_dlaneck_gn-all_4xb8-6x_kitti-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ca6b44394a38ab40862ac1d36889f2939fa626d
--- /dev/null
+++ b/mmde/configs/smoke/smoke_dla34_dlaneck_gn-all_4xb8-6x_kitti-mono3d.py
@@ -0,0 +1,63 @@
+_base_ = [
+    '../_base_/datasets/kitti-mono3d.py', '../_base_/models/smoke.py',
+    '../_base_/default_runtime.py'
+]
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='RandomShiftScale', shift_scale=(0.2, 0.4), aug_prob=0.3),
+    dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
+    dict(type='AffineResize', img_scale=(1280, 384), down_ratio=4),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# training schedule for 6x
+max_epochs = 72
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=5)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[50],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='Adam', lr=2.5e-4),
+    clip_grad=None)
+
+find_unused_parameters = True
diff --git a/mmde/configs/spvcnn/README.md b/mmde/configs/spvcnn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..870e98c2ec6ce732d60fbcd4edfc0e08816dd0d2
--- /dev/null
+++ b/mmde/configs/spvcnn/README.md
@@ -0,0 +1,45 @@
+# Searching Efficient 3D Architectures with Sparse Point-Voxel Convolution
+
+> [Searching Efficient 3D Architectures with Sparse Point-Voxel Convolution ](https://arxiv.org/abs/2007.16100)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Self-driving cars need to understand 3D scenes efficiently and accurately in order to drive safely. Given the limited hardware resources, existing 3D perception models are not able to recognize small instances (e.g., pedestrians, cyclists) very well due to the low-resolution voxelization and aggressive downsampling. To this end, we propose Sparse Point-Voxel Convolution (SPVConv), a lightweight 3D module that equips the vanilla Sparse Convolution with the high-resolution point-based branch. With negligible overhead, this point-based branch is able to preserve the fine details even from large outdoor scenes. To explore the spectrum of efficient 3D models, we first define a flexible architecture design space based on SPVConv, and we then present 3D Neural Architecture Search (3D-NAS) to search the optimal network architecture over this diverse design space efficiently and effectively. Experimental results validate that the resulting SPVNAS model is fast and accurate: it outperforms the state-of-the-art MinkowskiNet by 3.3%, ranking 1st on the competitive SemanticKITTI leaderboard. It also achieves 8x computation reduction and 3x measured speedup over MinkowskiNet with higher accuracy. Finally, we transfer our method to 3D object detection, and it achieves consistent improvements over the one-stage detection baseline on KITTI.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/72679458/226509154-80c27d8e-c138-426a-b92e-72846997b5b3.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement SPVCNN with [TorchSparse](https://github.com/mit-han-lab/torchsparse) backend and provide the result and checkpoints on SemanticKITTI datasets.
+
+## Results and models
+
+### SemanticKITTI
+
+|                                 Method                                  | Lr schd | Laser-Polar Mix | Mem (GB) | mIoU |                                                                                                                                                                    Download                                                                                                                                                                     |
+| :---------------------------------------------------------------------: | :-----: | :-------------: | :------: | :--: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|        [SPVCNN-W16](./spvcnn_w16_8xb2-amp-15e_semantickitti.py)         |   15e   |        ✗        |   3.9    | 61.8 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w16_8xb2-15e_semantickitti/spvcnn_w16_8xb2-15e_semantickitti_20230321_011645-a2734d85.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w16_8xb2-15e_semantickitti/spvcnn_w16_8xb2-15e_semantickitti_20230321_011645.log) |
+|        [SPVCNN-W20](./spvcnn_w20_8xb2-amp-15e_semantickitti.py)         |   15e   |        ✗        |   4.2    | 62.6 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w20_8xb2-15e_semantickitti/spvcnn_w20_8xb2-15e_semantickitti_20230321_011649-519e7eff.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w20_8xb2-15e_semantickitti/spvcnn_w20_8xb2-15e_semantickitti_20230321_011649.log) |
+|        [SPVCNN-W32](./spvcnn_w32_8xb2-amp-15e_semantickitti.py)         |   15e   |        ✗        |   5.4    | 64.3 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w32_8xb2-15e_semantickitti/spvcnn_w32_8xb2-15e_semantickitti_20230308_113324-f7c0c5b4.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w32_8xb2-15e_semantickitti/spvcnn_w32_8xb2-15e_semantickitti_20230308_113324.log) |
+| [SPVCNN-W32](./spvcnn_w32_8xb2-amp-laser-polar-mix-3x_semantickitti.py) |   3x    |        ✔        |   7.2    | 68.7 |                [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w32_8xb2-amp-laser-polar-mix-3x_semantickitti_20230425_125908-d68a68b7.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w32_8xb2-amp-laser-polar-mix-3x_semantickitti_20230425_125908.log)                |
+
+**Note:** We follow the implementation in SPVNAS original [repo](https://github.com/mit-han-lab/spvnas) and W16\\W20\\W32 indicates different number of channels.
+
+**Note:** Due to TorchSparse backend, the model performance is unstable with TorchSparse backend and may fluctuate by about 1.5 mIoU for different random seeds.
+
+## Citation
+
+```latex
+@inproceedings{tang2020searching,
+  title={Searching efficient 3d architectures with sparse point-voxel convolution},
+  author={Tang, Haotian and Liu, Zhijian and Zhao, Shengyu and Lin, Yujun and Lin, Ji and Wang, Hanrui and Han, Song},
+  booktitle={Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXVIII},
+  pages={685--702},
+  year={2020},
+  organization={Springer}
+}
+```
diff --git a/mmde/configs/spvcnn/metafile.yml b/mmde/configs/spvcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e3f1cfd891ad0ff2b406091635fc9160103a06cc
--- /dev/null
+++ b/mmde/configs/spvcnn/metafile.yml
@@ -0,0 +1,71 @@
+Collections:
+  - Name: SPVCNN
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Architecture:
+        - SPVCNN
+    Paper:
+      URL: https://arxiv.org/abs/2007.16100
+      Title: 'Searching Efficient 3D Architectures with Sparse Point-Voxel Convolution'
+    README: configs/spvcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/1.1/mmdet3d/models/backbones/spvcnn_backone.py#L22
+      Version: v1.1.0
+
+Models:
+  - Name: spvcnn_w16_8xb2-amp-15e_semantickitti
+    In Collection: SPVCNN
+    Config: configs/spvcnn/spvcnn_w16_8xb2-amp-15e_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 3.9
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIOU: 61.7
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w16_8xb2-15e_semantickitti/spvcnn_w16_8xb2-15e_semantickitti_20230321_011645-a2734d85.pth
+
+  - Name: spvcnn_w20_8xb2-amp-15e_semantickitti
+    In Collection: SPVCNN
+    Config: configs/spvcnn/spvcnn_w20_8xb2-amp-15e_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 4.2
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIOU: 62.9
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w20_8xb2-15e_semantickitti/spvcnn_w20_8xb2-15e_semantickitti_20230321_011649-519e7eff.pth
+
+  - Name: spvcnn_w32_8xb2-amp-15e_semantickitti
+    In Collection: SPVCNN
+    Config: configs/spvcnn/spvcnn_w32_8xb2-amp-15e_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 5.4
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIOU: 64.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w32_8xb2-15e_semantickitti/spvcnn_w32_8xb2-15e_semantickitti_20230308_113324-f7c0c5b4.pth
+
+  - Name: spvcnn_w32_8xb2-amp-laser-polar-mix-3x_semantickitti
+    In Collection: SPVCNN
+    Config: configs/spvcnn/spvcnn_w32_8xb2-amp-laser-polar-mix-3x_semantickitti.py
+    Metadata:
+      Training Data: SemanticKITTI
+      Training Memory (GB): 7.2
+      Training Resources: 8x A100 GPUs
+    Results:
+      - Task: 3D Semantic Segmentation
+        Dataset: SemanticKITTI
+        Metrics:
+          mIOU: 64.3
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.1.0_models/spvcnn/spvcnn_w32_8xb2-amp-laser-polar-mix-3x_semantickitti_20230425_125908-d68a68b7.pth
diff --git a/mmde/configs/spvcnn/spvcnn_w16_8xb2-amp-15e_semantickitti.py b/mmde/configs/spvcnn/spvcnn_w16_8xb2-amp-15e_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..51a14cb574870e02b3f5b8a224a4305f5b652917
--- /dev/null
+++ b/mmde/configs/spvcnn/spvcnn_w16_8xb2-amp-15e_semantickitti.py
@@ -0,0 +1,10 @@
+_base_ = ['./spvcnn_w32_8xb2-amp-15e_semantickitti.py']
+
+model = dict(
+    backbone=dict(
+        base_channels=16,
+        encoder_channels=[16, 32, 64, 128],
+        decoder_channels=[128, 64, 48, 48]),
+    decode_head=dict(channels=48))
+
+randomness = dict(seed=1588147245)
diff --git a/mmde/configs/spvcnn/spvcnn_w20_8xb2-amp-15e_semantickitti.py b/mmde/configs/spvcnn/spvcnn_w20_8xb2-amp-15e_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..75ecc1477d2e5edd2528e8c2ceea917449e61e20
--- /dev/null
+++ b/mmde/configs/spvcnn/spvcnn_w20_8xb2-amp-15e_semantickitti.py
@@ -0,0 +1,8 @@
+_base_ = ['./spvcnn_w32_8xb2-amp-15e_semantickitti.py']
+
+model = dict(
+    backbone=dict(
+        base_channels=20,
+        encoder_channels=[20, 40, 81, 163],
+        decoder_channels=[163, 81, 61, 61]),
+    decode_head=dict(channels=61))
diff --git a/mmde/configs/spvcnn/spvcnn_w32_8xb2-amp-15e_semantickitti.py b/mmde/configs/spvcnn/spvcnn_w32_8xb2-amp-15e_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..090576f432a2c9e90d2da04ecbfb2ce980fe3780
--- /dev/null
+++ b/mmde/configs/spvcnn/spvcnn_w32_8xb2-amp-15e_semantickitti.py
@@ -0,0 +1,54 @@
+_base_ = [
+    '../_base_/datasets/semantickitti.py', '../_base_/models/spvcnn.py',
+    '../_base_/default_runtime.py'
+]
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti'),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[0., 6.28318531],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0],
+    ),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+
+train_dataloader = dict(
+    sampler=dict(seed=0), dataset=dict(pipeline=train_pipeline))
+
+lr = 0.24
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    loss_scale='dynamic',
+    optimizer=dict(
+        type='SGD', lr=lr, weight_decay=0.0001, momentum=0.9, nesterov=True))
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.008, by_epoch=False, begin=0, end=125),
+    dict(
+        type='CosineAnnealingLR',
+        begin=0,
+        T_max=15,
+        by_epoch=True,
+        eta_min=1e-5,
+        convert_to_iter_based=True)
+]
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=15, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
+randomness = dict(seed=0, deterministic=False, diff_rank_seed=True)
+env_cfg = dict(cudnn_benchmark=True)
diff --git a/mmde/configs/spvcnn/spvcnn_w32_8xb2-amp-laser-polar-mix-3x_semantickitti.py b/mmde/configs/spvcnn/spvcnn_w32_8xb2-amp-laser-polar-mix-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..500ca58474ca13d92885415e81c8acce21de6624
--- /dev/null
+++ b/mmde/configs/spvcnn/spvcnn_w32_8xb2-amp-laser-polar-mix-3x_semantickitti.py
@@ -0,0 +1,86 @@
+_base_ = [
+    '../_base_/datasets/semantickitti.py', '../_base_/models/spvcnn.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(data_preprocessor=dict(max_voxels=None))
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti'),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='LaserMix',
+                    num_areas=[3, 4, 5, 6],
+                    pitch_angles=[-25, 3],
+                    pre_transform=[
+                        dict(
+                            type='LoadPointsFromFile',
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type='LoadAnnotations3D',
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type='PointSegClassMapping')
+                    ],
+                    prob=1)
+            ],
+            [
+                dict(
+                    type='PolarMix',
+                    instance_classes=[0, 1, 2, 3, 4, 5, 6, 7],
+                    swap_ratio=0.5,
+                    rotate_paste_ratio=1.0,
+                    pre_transform=[
+                        dict(
+                            type='LoadPointsFromFile',
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type='LoadAnnotations3D',
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type='PointSegClassMapping')
+                    ],
+                    prob=1)
+            ],
+        ],
+        prob=[0.5, 0.5]),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[0., 6.28318531],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0],
+    ),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
+randomness = dict(seed=0, deterministic=False, diff_rank_seed=True)
+env_cfg = dict(cudnn_benchmark=True)
diff --git a/mmde/configs/ssn/README.md b/mmde/configs/ssn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..89af3f4817ed3fb881a840249809bd4b2587962d
--- /dev/null
+++ b/mmde/configs/ssn/README.md
@@ -0,0 +1,53 @@
+# SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds
+
+> [SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds](https://arxiv.org/abs/2004.02774)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Multi-class 3D object detection aims to localize and classify objects of multiple categories from point clouds. Due to the nature of point clouds, i.e. unstructured, sparse and noisy, some features benefit-ting multi-class discrimination are underexploited, such as shape information. In this paper, we propose a novel 3D shape signature to explore the shape information from point clouds. By incorporating operations of symmetry, convex hull and chebyshev fitting, the proposed shape sig-nature is not only compact and effective but also robust to the noise, which serves as a soft constraint to improve the feature capability of multi-class discrimination. Based on the proposed shape signature, we develop the shape signature networks (SSN) for 3D object detection, which consist of pyramid feature encoding part, shape-aware grouping heads and explicit shape encoding objective. Experiments show that the proposed method performs remarkably better than existing methods on two large-scale datasets. Furthermore, our shape signature can act as a plug-and-play component and ablation study shows its effectiveness and good scalability.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/144024507-9c1f23c1-5e5a-49c8-b346-ff37e30adc3a.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement PointPillars with Shape-aware grouping heads used in the SSN and provide the results and checkpoints on the nuScenes and Lyft dataset.
+
+## Results and models
+
+### NuScenes
+
+|                                            Backbone                                             | Lr schd | Mem (GB) | Inf time (fps) |  mAP  |  NDS  |                                                                                                                                                                                                                       Download                                                                                                                                                                                                                       |
+| :---------------------------------------------------------------------------------------------: | :-----: | :------: | :------------: | :---: | :---: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|           [SECFPN](../pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py)            |   2x    |   16.4   |                | 35.17 | 49.76 |                     [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json)                     |
+|                        [SSN](./ssn_hv_secfpn_sbn-all_16xb2-2x_nus-3d.py)                        |   2x    |   3.6    |                | 40.91 | 54.44 |                                              [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d_20210830_101351-51915986.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d_20210830_101351.log.json)                                              |
+| [RegNetX-400MF-SECFPN](../regnet/pointpillars_hv_regnet-400mf_secfpn_sbn-all_8xb4-2x_nus-3d.py) |   2x    |   16.4   |                | 41.15 | 55.20 | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334-53044f32.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334.log.json) |
+|          [RegNetX-400MF-SSN](./ssn_hv_regnet-400mf_secfpn_sbn-all_16xb2-2x_nus-3d.py)           |   2x    |   5.1    |                | 46.65 | 58.24 |                    [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d_20210829_210615-361e5e04.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d_20210829_210615.log.json)                    |
+
+### Lyft
+
+|                                   Backbone                                    | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score |                                                                                                                                                                                                      Download                                                                                                                                                                                                      |
+| :---------------------------------------------------------------------------: | :-----: | :------: | :------------: | :-----------: | :----------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  [SECFPN](../pointpillars/pointpillars_hv_secfpn_sbn-all_8xb2-2x_lyft-3d.py)  |   2x    |   12.2   |                |     13.9      |     14.1     |  [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d_20210517_204807-2518e3de.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d_20210517_204807.log.json)  |
+|              [SSN](./ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py)               |   2x    |   8.5    |                |     17.5      |     17.5     |                           [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d_20210822_134731-46841b41.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d_20210822_134731.log.json)                           |
+| [RegNetX-400MF-SSN](./ssn_hv_regnet-400mf_secfpn_sbn-all_16xb1-2x_lyft-3d.py) |   2x    |   7.4    |                |     17.9      |      18      | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d_20210829_122825-d93475a1.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d_20210829_122825.log.json) |
+
+Note:
+
+The main difference of the shape-aware grouping heads with the original SECOND FPN heads is that the former groups objects with similar sizes and shapes together, and design shape-specific heads for each group. Heavier heads (with more convolutions and large strides) are designed for large objects while smaller heads for small objects. Note that there may appear different feature map sizes in the outputs, so an anchor generator tailored to these feature maps is also needed in the implementation.
+
+Users could try other settings in terms of the head design. Here we basically refer to the implementation [HERE](https://github.com/xinge008/SSN).
+
+## Citation
+
+```latex
+@inproceedings{zhu2020ssn,
+  title={SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds},
+  author={Zhu, Xinge and Ma, Yuexin and Wang, Tai and Xu, Yan and Shi, Jianping and Lin, Dahua},
+  booktitle={Proceedings of the European Conference on Computer Vision},
+  year={2020}
+}
+```
diff --git a/mmde/configs/ssn/metafile.yml b/mmde/configs/ssn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..00b15b946f9ae1931ec024e04abd1f0b88d20b0e
--- /dev/null
+++ b/mmde/configs/ssn/metafile.yml
@@ -0,0 +1,72 @@
+Collections:
+  - Name: SSN
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x GeForce GTX 1080 Ti
+      Architecture:
+        - Hard Voxelization
+    Paper:
+      URL: https://arxiv.org/abs/2004.02774
+      Title: 'SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds'
+    README: configs/ssn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/dense_heads/shape_aware_head.py#L166
+      Version: v0.7.0
+
+Models:
+  - Name: hv_ssn_secfpn_sbn-all_16xb2-2x_nus-3d
+    In Collection: SSN
+    Config: configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_nus-3d.py
+    Metadata:
+      Training Data: nuScenes
+      Training Memory (GB): 3.6
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 40.91
+          NDS: 54.44
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d_20210830_101351-51915986.pth
+
+  - Name: hv_ssn_regnet-400mf_secfpn_sbn-all_16xb2-2x_nus-3d
+    In Collection: SSN
+    Config: configs/ssn/ssn_hv_regnet-400mf_secfpn_sbn-all_16xb2-2x_nus-3d.py
+    Metadata:
+      Training Data: nuScenes
+      Training Memory (GB): 5.1
+    Results:
+      - Task: 3D Object Detection
+        Dataset: nuScenes
+        Metrics:
+          mAP: 46.65
+          NDS: 58.24
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d_20210829_210615-361e5e04.pth
+
+  - Name: hv_ssn_secfpn_sbn-all_16xb2-2x_lyft-3d
+    In Collection: SSN
+    Config: configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py
+    Metadata:
+      Training Data: Lyft
+      Training Memory (GB): 8.5
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Lyft
+        Metrics:
+          Private Score: 17.5
+          Public Score: 17.5
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d_20210822_134731-46841b41.pth
+
+  - Name: hv_ssn_regnet-400mf_secfpn_sbn-all_16xb1-2x_lyft-3d
+    In Collection: SSN
+    Config: configs/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_16xb1-2x_lyft-3d.py
+    Metadata:
+      Training Data: Lyft
+      Training Memory (GB): 7.4
+    Results:
+      - Task: 3D Object Detection
+        Dataset: Lyft
+        Metrics:
+          Private Score: 17.9
+          Public Score: 18.0
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d_20210829_122825-d93475a1.pth
diff --git a/mmde/configs/ssn/ssn_hv_regnet-400mf_secfpn_sbn-all_16xb1-2x_lyft-3d.py b/mmde/configs/ssn/ssn_hv_regnet-400mf_secfpn_sbn-all_16xb1-2x_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..355a645a98398468c1153bc3c5d42f2833fd6250
--- /dev/null
+++ b/mmde/configs/ssn/ssn_hv_regnet-400mf_secfpn_sbn-all_16xb1-2x_lyft-3d.py
@@ -0,0 +1,21 @@
+_base_ = './ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py'
+# model settings
+model = dict(
+    type='MVXFasterRCNN',
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[64, 160, 384]))
+# dataset settings
+train_dataloader = dict(batch_size=1, num_workers=2)
diff --git a/mmde/configs/ssn/ssn_hv_regnet-400mf_secfpn_sbn-all_16xb2-2x_nus-3d.py b/mmde/configs/ssn/ssn_hv_regnet-400mf_secfpn_sbn-all_16xb2-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd6056fa69c848cdf69caf4cd599e03c35a038ee
--- /dev/null
+++ b/mmde/configs/ssn/ssn_hv_regnet-400mf_secfpn_sbn-all_16xb2-2x_nus-3d.py
@@ -0,0 +1,20 @@
+_base_ = './ssn_hv_secfpn_sbn-all_16xb2-2x_nus-3d.py'
+# model settings
+model = dict(
+    type='MVXFasterRCNN',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[64, 160, 384]))
diff --git a/mmde/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py b/mmde/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb3a5d41b51bcfeabb7cf5ba4287a8619963cd5c
--- /dev/null
+++ b/mmde/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_lyft-3d.py
@@ -0,0 +1,244 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_lyft.py',
+    '../_base_/datasets/lyft-3d.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+point_cloud_range = [-100, -100, -5, 100, 100, 3]
+# Note that the order of class names should be consistent with
+# the following anchors' order
+class_names = [
+    'bicycle', 'motorcycle', 'pedestrian', 'animal', 'car',
+    'emergency_vehicle', 'bus', 'other_vehicle', 'truck'
+]
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=2, num_workers=4, dataset=dict(pipeline=train_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+
+# model settings
+model = dict(
+    data_preprocessor=dict(
+        voxel_layer=dict(point_cloud_range=[-100, -100, -5, 100, 100, 3])),
+    pts_voxel_encoder=dict(
+        feat_channels=[32, 64],
+        point_cloud_range=[-100, -100, -5, 100, 100, 3]),
+    pts_middle_encoder=dict(output_shape=[800, 800]),
+    pts_neck=dict(
+        _delete_=True,
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        _delete_=True,
+        type='ShapeAwareHead',
+        num_classes=9,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGeneratorPerCls',
+            ranges=[[-100, -100, -1.0709302, 100, 100, -1.0709302],
+                    [-100, -100, -1.3220503, 100, 100, -1.3220503],
+                    [-100, -100, -0.9122268, 100, 100, -0.9122268],
+                    [-100, -100, -1.8012227, 100, 100, -1.8012227],
+                    [-100, -100, -1.0715024, 100, 100, -1.0715024],
+                    [-100, -100, -0.8871424, 100, 100, -0.8871424],
+                    [-100, -100, -0.3519405, 100, 100, -0.3519405],
+                    [-100, -100, -0.6276341, 100, 100, -0.6276341],
+                    [-100, -100, -0.3033737, 100, 100, -0.3033737]],
+            sizes=[
+                [1.76, 0.63, 1.44],  # bicycle
+                [2.35, 0.96, 1.59],  # motorcycle
+                [0.80, 0.76, 1.76],  # pedestrian
+                [0.73, 0.35, 0.50],  # animal
+                [4.75, 1.92, 1.71],  # car
+                [6.52, 2.42, 2.34],  # emergency vehicle
+                [12.70, 2.92, 3.42],  # bus
+                [8.17, 2.75, 3.20],  # other vehicle
+                [10.24, 2.84, 3.44]  # truck
+            ],
+            custom_values=[],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        tasks=[
+            dict(
+                num_class=2,
+                class_names=['bicycle', 'motorcycle'],
+                shared_conv_channels=(64, 64),
+                shared_conv_strides=(1, 1),
+                norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
+            dict(
+                num_class=2,
+                class_names=['pedestrian', 'animal'],
+                shared_conv_channels=(64, 64),
+                shared_conv_strides=(1, 1),
+                norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
+            dict(
+                num_class=2,
+                class_names=['car', 'emergency_vehicle'],
+                shared_conv_channels=(64, 64, 64),
+                shared_conv_strides=(2, 1, 1),
+                norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
+            dict(
+                num_class=3,
+                class_names=['bus', 'other_vehicle', 'truck'],
+                shared_conv_channels=(64, 64, 64),
+                shared_conv_strides=(2, 1, 1),
+                norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01))
+        ],
+        assign_per_class=True,
+        diff_rad_by_sin=True,
+        dir_offset=-0.7854,  # -pi/4
+        dir_limit_offset=0,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        _delete_=True,
+        pts=dict(
+            assigner=[
+                dict(  # bicycle
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # motorcycle
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # animal
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1),
+                dict(  # emergency vehicle
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # bus
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1),
+                dict(  # other vehicle
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # truck
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1)
+            ],
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            pos_weight=-1,
+            debug=False)))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (16 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_nus-3d.py b/mmde/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d5e67fa4493cc2f4de34b6f6ee7240247e19c27
--- /dev/null
+++ b/mmde/configs/ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_nus-3d.py
@@ -0,0 +1,256 @@
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_nus.py',
+    '../_base_/datasets/nus-3d.py',
+    '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py',
+]
+# Note that the order of class names should be consistent with
+# the following anchors' order
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+class_names = [
+    'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier', 'car',
+    'truck', 'trailer', 'bus', 'construction_vehicle'
+]
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=4,
+    dataset=dict(pipeline=train_pipeline, metainfo=dict(classes=class_names)))
+test_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+
+# model settings
+model = dict(
+    data_preprocessor=dict(voxel_layer=dict(max_num_points=20)),
+    pts_voxel_encoder=dict(feat_channels=[64, 64]),
+    pts_neck=dict(
+        _delete_=True,
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        _delete_=True,
+        type='ShapeAwareHead',
+        num_classes=10,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGeneratorPerCls',
+            ranges=[[-50, -50, -1.67339111, 50, 50, -1.67339111],
+                    [-50, -50, -1.71396371, 50, 50, -1.71396371],
+                    [-50, -50, -1.61785072, 50, 50, -1.61785072],
+                    [-50, -50, -1.80984986, 50, 50, -1.80984986],
+                    [-50, -50, -1.76396500, 50, 50, -1.76396500],
+                    [-50, -50, -1.80032795, 50, 50, -1.80032795],
+                    [-50, -50, -1.74440365, 50, 50, -1.74440365],
+                    [-50, -50, -1.68526504, 50, 50, -1.68526504],
+                    [-50, -50, -1.80673031, 50, 50, -1.80673031],
+                    [-50, -50, -1.64824291, 50, 50, -1.64824291]],
+            sizes=[
+                [1.68452161, 0.60058911, 1.27192197],  # bicycle
+                [2.09973778, 0.76279481, 1.44403034],  # motorcycle
+                [0.72564370, 0.66344886, 1.75748069],  # pedestrian
+                [0.40359262, 0.39694519, 1.06232151],  # traffic cone
+                [0.48578221, 2.49008838, 0.98297065],  # barrier
+                [4.60718145, 1.95017717, 1.72270761],  # car
+                [6.73778078, 2.45609390, 2.73004906],  # truck
+                [12.01320693, 2.87427237, 3.81509561],  # trailer
+                [11.1885991, 2.94046906, 3.47030982],  # bus
+                [6.38352896, 2.73050468, 3.13312415]  # construction vehicle
+            ],
+            custom_values=[0, 0],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        tasks=[
+            dict(
+                num_class=2,
+                class_names=['bicycle', 'motorcycle'],
+                shared_conv_channels=(64, 64),
+                shared_conv_strides=(1, 1),
+                norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
+            dict(
+                num_class=1,
+                class_names=['pedestrian'],
+                shared_conv_channels=(64, 64),
+                shared_conv_strides=(1, 1),
+                norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
+            dict(
+                num_class=2,
+                class_names=['traffic_cone', 'barrier'],
+                shared_conv_channels=(64, 64),
+                shared_conv_strides=(1, 1),
+                norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
+            dict(
+                num_class=1,
+                class_names=['car'],
+                shared_conv_channels=(64, 64, 64),
+                shared_conv_strides=(2, 1, 1),
+                norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
+            dict(
+                num_class=4,
+                class_names=[
+                    'truck', 'trailer', 'bus', 'construction_vehicle'
+                ],
+                shared_conv_channels=(64, 64, 64),
+                shared_conv_strides=(2, 1, 1),
+                norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01))
+        ],
+        assign_per_class=True,
+        diff_rad_by_sin=True,
+        dir_offset=-0.7854,  # -pi/4
+        dir_limit_offset=0,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        _delete_=True,
+        pts=dict(
+            assigner=[
+                dict(  # bicycle
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # motorcycle
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+                dict(  # pedestrian
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # traffic cone
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # barrier
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # car
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1),
+                dict(  # truck
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # trailer
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # bus
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # construction vehicle
+                    type='Max3DIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1)
+            ],
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+            pos_weight=-1,
+            debug=False)))
diff --git a/mmde/configs/votenet/README.md b/mmde/configs/votenet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..21fe6dba3648ac9880c658c578da69d9e25eef68
--- /dev/null
+++ b/mmde/configs/votenet/README.md
@@ -0,0 +1,68 @@
+# Deep Hough Voting for 3D Object Detection in Point Clouds
+
+> [Deep Hough Voting for 3D Object Detection in Point Clouds](https://arxiv.org/abs/1904.09664)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Current 3D object detection methods are heavily influenced by 2D detectors. In order to leverage architectures in 2D detectors, they often convert 3D point clouds to regular grids (i.e., to voxel grids or to bird's eye view images), or rely on detection in 2D images to propose 3D boxes. Few works have attempted to directly detect objects in point clouds. In this work, we return to first principles to construct a 3D detection pipeline for point cloud data and as generic as possible. However, due to the sparse nature of the data -- samples from 2D manifolds in 3D space -- we face a major challenge when directly predicting bounding box parameters from scene points: a 3D object centroid can be far from any surface point thus hard to regress accurately in one step. To address the challenge, we propose VoteNet, an end-to-end 3D object detection network based on a synergy of deep point set networks and Hough voting. Our model achieves state-of-the-art 3D detection on two large datasets of real 3D scans, ScanNet and SUN RGB-D with a simple design, compact model size and high efficiency. Remarkably, VoteNet outperforms previous methods by using purely geometric information without relying on color images.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/79644370/143888295-af7435b4-9f75-4669-b5f8-a19ae24a051c.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement VoteNet and provide the result and checkpoints on ScanNet and SUNRGBD datasets.
+
+## Results and models
+
+### ScanNet
+
+|                  Backbone                  | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 |                                                                                                                                                                  Download                                                                                                                                                                  |
+| :----------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [PointNet++](./votenet_8xb8_scannet-3d.py) |   3x    |   4.1    |                |  62.34  | 40.82  | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/votenet/votenet_8x8_scannet-3d-18class/votenet_8x8_scannet-3d-18class_20210823_234503-cf8134fa.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/votenet/votenet_8x8_scannet-3d-18class/votenet_8x8_scannet-3d-18class_20210823_234503.log.json) |
+
+### SUNRGBD
+
+|                  Backbone                   | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 |                                                                                                                                                                    Download                                                                                                                                                                    |
+| :-----------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [PointNet++](./votenet_8xb16_sunrgbd-3d.py) |   3x    |   8.1    |                |  59.78  | 35.77  | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/votenet/votenet_16x8_sunrgbd-3d-10class/votenet_16x8_sunrgbd-3d-10class_20210820_162823-bf11f014.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/votenet/votenet_16x8_sunrgbd-3d-10class/votenet_16x8_sunrgbd-3d-10class_20210820_162823.log.json) |
+
+**Notice**: If your current mmdetection3d version >= 0.6.0, and you are using the checkpoints downloaded from the above links or using checkpoints trained with mmdetection3d version \< 0.6.0, the checkpoints have to be first converted via [tools/model_converters/convert_votenet_checkpoints.py](../../tools/model_converters/convert_votenet_checkpoints.py):
+
+```
+python ./tools/model_converters/convert_votenet_checkpoints.py ${ORIGINAL_CHECKPOINT_PATH} --out=${NEW_CHECKPOINT_PATH}
+```
+
+Then you can use the converted checkpoints following [get_started.md](../../docs/en/get_started.md).
+
+## Indeterminism
+
+Since test data preparation randomly downsamples the points, and the test script uses fixed random seeds while the random seeds of validation in training are not fixed, the test results may be slightly different from the results reported above.
+
+## IoU loss
+
+Adding IoU loss (simply = 1-IoU) boosts VoteNet's performance. To use IoU loss, add this loss term to the config file:
+
+```python
+iou_loss=dict(type='AxisAlignedIoULoss', reduction='sum', loss_weight=10.0 / 3.0)
+```
+
+|                        Backbone                         | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 | Download |
+| :-----------------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :------: |
+| [PointNet++](./votenet_head-iouloss_8xb8_scannet-3d.py) |   3x    |   4.1    |                |  63.81  | 44.21  |    /     |
+
+For now, we only support calculating IoU loss for axis-aligned bounding boxes since the CUDA op of general 3D IoU calculation does not implement the backward method. Therefore, IoU loss can only be used for ScanNet dataset for now.
+
+## Citation
+
+```latex
+@inproceedings{qi2019deep,
+    author = {Qi, Charles R and Litany, Or and He, Kaiming and Guibas, Leonidas J},
+    title = {Deep Hough Voting for 3D Object Detection in Point Clouds},
+    booktitle = {Proceedings of the IEEE International Conference on Computer Vision},
+    year = {2019}
+}
+```
diff --git a/mmde/configs/votenet/metafile.yml b/mmde/configs/votenet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7353a63961aede34ea8c6630dcb4ebabb2a2922f
--- /dev/null
+++ b/mmde/configs/votenet/metafile.yml
@@ -0,0 +1,59 @@
+Collections:
+  - Name: VoteNet
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - PointNet++
+    Paper:
+      URL: https://arxiv.org/abs/1904.09664
+      Title: 'Deep Hough Voting for 3D Object Detection in Point Clouds'
+    README: configs/votenet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/votenet.py#L10
+      Version: v0.5.0
+
+Models:
+  - Name: votenet_8xb16_sunrgbd-3d.py
+    In Collection: VoteNet
+    Config: configs/votenet/votenet_8xb16_sunrgbd-3d.py
+    Metadata:
+      Training Data: SUNRGBD
+      Training Memory (GB): 8.1
+    Results:
+      - Task: 3D Object Detection
+        Dataset: SUNRGBD
+        Metrics:
+          AP@0.25: 59.78
+          AP@0.5: 35.77
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/votenet/votenet_16x8_sunrgbd-3d-10class/votenet_16x8_sunrgbd-3d-10class_20210820_162823-bf11f014.pth
+
+  - Name: votenet_8xb8_scannet-3d.py
+    In Collection: VoteNet
+    Config: configs/votenet/votenet_8xb8_scannet-3d.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 4.1
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 62.34
+          AP@0.5: 40.82
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/votenet/votenet_8x8_scannet-3d-18class/votenet_8x8_scannet-3d-18class_20210823_234503-cf8134fa.pth
+
+  - Name: votenet_iouloss_8x8_scannet-3d-18class
+    In Collection: VoteNet
+    Config: configs/votenet/votenet_head-iouloss_8xb8_scannet-3d.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 4.1
+      Architecture:
+        - IoU Loss
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 63.81
+          AP@0.5: 44.21
diff --git a/mmde/configs/votenet/votenet_8xb16_sunrgbd-3d.py b/mmde/configs/votenet/votenet_8xb16_sunrgbd-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..947291068734f4c24d895b2387b05ade3e6c3a23
--- /dev/null
+++ b/mmde/configs/votenet/votenet_8xb16_sunrgbd-3d.py
@@ -0,0 +1,27 @@
+# TODO refactor the config of sunrgbd
+_base_ = [
+    '../_base_/datasets/sunrgbd-3d.py', '../_base_/models/votenet.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    bbox_head=dict(
+        num_classes=10,
+        bbox_coder=dict(
+            type='PartialBinBasedBBoxCoder',
+            num_sizes=10,
+            num_dir_bins=12,
+            with_rot=True,
+            mean_sizes=[
+                [2.114256, 1.620300, 0.927272], [0.791118, 1.279516, 0.718182],
+                [0.923508, 1.867419, 0.845495], [0.591958, 0.552978, 0.827272],
+                [0.699104, 0.454178, 0.75625], [0.69519, 1.346299, 0.736364],
+                [0.528526, 1.002642, 1.172878], [0.500618, 0.632163, 0.683424],
+                [0.404671, 1.071108, 1.688889], [0.76584, 1.398258, 0.472728]
+            ]),
+    ))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/mmde/configs/votenet/votenet_8xb8_scannet-3d.py b/mmde/configs/votenet/votenet_8xb8_scannet-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e298cbafe036365a579b69d6ec2b32a72a335982
--- /dev/null
+++ b/mmde/configs/votenet/votenet_8xb8_scannet-3d.py
@@ -0,0 +1,39 @@
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/votenet.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    bbox_head=dict(
+        num_classes=18,
+        bbox_coder=dict(
+            type='PartialBinBasedBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=1,
+            with_rot=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]])))
+
+default_hooks = dict(logger=dict(type='LoggerHook', interval=30))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=64)
diff --git a/mmde/configs/votenet/votenet_head-iouloss_8xb8_scannet-3d.py b/mmde/configs/votenet/votenet_head-iouloss_8xb8_scannet-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..58d49a52f242fa1f326a6e02b7e4261fd7b59fe2
--- /dev/null
+++ b/mmde/configs/votenet/votenet_head-iouloss_8xb8_scannet-3d.py
@@ -0,0 +1,8 @@
+_base_ = ['./votenet_8xb8_scannet-3d.py']
+
+# model settings, add iou loss
+model = dict(
+    bbox_head=dict(
+        iou_loss=dict(
+            type='AxisAlignedIoULoss', reduction='sum', loss_weight=10.0 /
+            3.0)))
diff --git a/mmde/dataset-index.yml b/mmde/dataset-index.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6fbbbdb27f5f457dfe3f4f6c05e0742524ccb833
--- /dev/null
+++ b/mmde/dataset-index.yml
@@ -0,0 +1,29 @@
+kitti:
+  # The name of dataset in OpenDataLab referring to
+  # https://opendatalab.com/KITTI_Object/cli. You can also download it
+  # by running `odl get ${dataset}` independently
+  dataset: KITTI_Object
+  download_root: data
+  data_root: data/kitti
+  # Scripts for unzipping datasets
+  script: tools/dataset_converters/kitti_unzip.sh
+
+nuscenes:
+  # The name of dataset in OpenDataLab referring to
+  # https://opendatalab.com/nuScenes/cli. You can also download it
+  # by running `odl get ${dataset}` independently
+  dataset: nuScenes
+  download_root: data
+  data_root: data/nuscenes
+  # Scripts for unzipping datasets
+  script: tools/dataset_converters/nuscenes_unzip.sh
+
+semantickitti:
+  # The name of dataset in OpenDataLab referring to
+  # https://opendatalab.com/SemanticKITTI/cli. You can also download it
+  # by running `odl get ${dataset}` independently
+  dataset: SemanticKITTI
+  download_root: data
+  data_root: data/semantickitti
+  # Scripts for unzipping datasets
+  script: tools/dataset_converters/semantickitti_unzip.sh
diff --git a/mmde/demo/mono_det_demo.py b/mmde/demo/mono_det_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..42416fd9f4b0c4768867fc20a54cc216553a30ef
--- /dev/null
+++ b/mmde/demo/mono_det_demo.py
@@ -0,0 +1,98 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os
+from argparse import ArgumentParser
+
+from mmengine.logging import print_log
+
+from mmdet3d.apis import MonoDet3DInferencer
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('img', help='Image file')
+    parser.add_argument('infos', help='Infos file with annotations')
+    parser.add_argument('model', help='Config file')
+    parser.add_argument('weights', help='Checkpoint file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--cam-type',
+        type=str,
+        default='CAM_BACK',
+        help='choose camera type to inference')
+    parser.add_argument(
+        '--pred-score-thr',
+        type=float,
+        default=0.3,
+        help='bbox score threshold')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='outputs',
+        help='Output directory of prediction and visualization results.')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='Show online visualization results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=-1,
+        help='The interval of show (s). Demo will be blocked in showing'
+        'results, if wait_time is -1. Defaults to -1.')
+    parser.add_argument(
+        '--no-save-vis',
+        action='store_true',
+        help='Do not save detection visualization results')
+    parser.add_argument(
+        '--no-save-pred',
+        action='store_true',
+        help='Do not save detection prediction results')
+    parser.add_argument(
+        '--print-result',
+        action='store_true',
+        help='Whether to print the results.')
+    call_args = vars(parser.parse_args())
+
+    call_args['inputs'] = dict(
+        img=call_args.pop('img'), infos=call_args.pop('infos'))
+    call_args.pop('cam_type')
+
+    if call_args['no_save_vis'] and call_args['no_save_pred']:
+        call_args['out_dir'] = ''
+
+    init_kws = ['model', 'weights', 'device']
+    init_args = {}
+    for init_kw in init_kws:
+        init_args[init_kw] = call_args.pop(init_kw)
+
+    # NOTE: If your operating environment does not have a display device,
+    # (e.g. a remote server), you can save the predictions and visualize
+    # them in local devices.
+    if os.environ.get('DISPLAY') is None and call_args['show']:
+        print_log(
+            'Display device not found. `--show` is forced to False',
+            logger='current',
+            level=logging.WARNING)
+        call_args['show'] = False
+
+    return init_args, call_args
+
+
+def main():
+    # TODO: Support inference of point cloud numpy file.
+    init_args, call_args = parse_args()
+
+    inferencer = MonoDet3DInferencer(**init_args)
+    inferencer(**call_args)
+
+    if call_args['out_dir'] != '' and not (call_args['no_save_vis']
+                                           and call_args['no_save_pred']):
+        print_log(
+            f'results have been saved at {call_args["out_dir"]}',
+            logger='current')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/demo/multi_modality_demo.py b/mmde/demo/multi_modality_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3c28cdc4882024c82f5fc8d0efdd0217ec8130f
--- /dev/null
+++ b/mmde/demo/multi_modality_demo.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os
+from argparse import ArgumentParser
+
+from mmengine.logging import print_log
+
+from mmdet3d.apis import MultiModalityDet3DInferencer
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('pcd', help='Point cloud file')
+    parser.add_argument('img', help='Image file')
+    parser.add_argument('infos', help='Infos file with annotations')
+    parser.add_argument('model', help='Config file')
+    parser.add_argument('weights', help='Checkpoint file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--cam-type',
+        type=str,
+        default='CAM_FRONT',
+        help='choose camera type to inference')
+    parser.add_argument(
+        '--pred-score-thr',
+        type=float,
+        default=0.3,
+        help='bbox score threshold')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='outputs',
+        help='Output directory of prediction and visualization results.')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='Show online visualization results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=-1,
+        help='The interval of show (s). Demo will be blocked in showing'
+        'results, if wait_time is -1. Defaults to -1.')
+    parser.add_argument(
+        '--no-save-vis',
+        action='store_true',
+        help='Do not save detection visualization results')
+    parser.add_argument(
+        '--no-save-pred',
+        action='store_true',
+        help='Do not save detection prediction results')
+    parser.add_argument(
+        '--print-result',
+        action='store_true',
+        help='Whether to print the results.')
+    call_args = vars(parser.parse_args())
+
+    call_args['inputs'] = dict(
+        points=call_args.pop('pcd'),
+        img=call_args.pop('img'),
+        infos=call_args.pop('infos'))
+    call_args.pop('cam_type')
+
+    if call_args['no_save_vis'] and call_args['no_save_pred']:
+        call_args['out_dir'] = ''
+
+    init_kws = ['model', 'weights', 'device']
+    init_args = {}
+    for init_kw in init_kws:
+        init_args[init_kw] = call_args.pop(init_kw)
+
+    # NOTE: If your operating environment does not have a display device,
+    # (e.g. a remote server), you can save the predictions and visualize
+    # them in local devices.
+    if os.environ.get('DISPLAY') is None and call_args['show']:
+        print_log(
+            'Display device not found. `--show` is forced to False',
+            logger='current',
+            level=logging.WARNING)
+        call_args['show'] = False
+
+    return init_args, call_args
+
+
+def main():
+    # TODO: Support inference of point cloud numpy file.
+    init_args, call_args = parse_args()
+
+    inferencer = MultiModalityDet3DInferencer(**init_args)
+    inferencer(**call_args)
+
+    if call_args['out_dir'] != '' and not (call_args['no_save_vis']
+                                           and call_args['no_save_pred']):
+        print_log(
+            f'results have been saved at {call_args["out_dir"]}',
+            logger='current')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/demo/pcd_demo.py b/mmde/demo/pcd_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ff1090f35b4df37f08d58ede1e70498a8ba9ef2
--- /dev/null
+++ b/mmde/demo/pcd_demo.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os
+from argparse import ArgumentParser
+
+from mmengine.logging import print_log
+
+from mmdet3d.apis import LidarDet3DInferencer
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('pcd', help='Point cloud file')
+    parser.add_argument('model', help='Config file')
+    parser.add_argument('weights', help='Checkpoint file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--pred-score-thr',
+        type=float,
+        default=0.3,
+        help='bbox score threshold')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='outputs',
+        help='Output directory of prediction and visualization results.')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='Show online visualization results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=-1,
+        help='The interval of show (s). Demo will be blocked in showing'
+        'results, if wait_time is -1. Defaults to -1.')
+    parser.add_argument(
+        '--no-save-vis',
+        action='store_true',
+        help='Do not save detection visualization results')
+    parser.add_argument(
+        '--no-save-pred',
+        action='store_true',
+        help='Do not save detection prediction results')
+    parser.add_argument(
+        '--print-result',
+        action='store_true',
+        help='Whether to print the results.')
+    call_args = vars(parser.parse_args())
+
+    call_args['inputs'] = dict(points=call_args.pop('pcd'))
+
+    if call_args['no_save_vis'] and call_args['no_save_pred']:
+        call_args['out_dir'] = ''
+
+    init_kws = ['model', 'weights', 'device']
+    init_args = {}
+    for init_kw in init_kws:
+        init_args[init_kw] = call_args.pop(init_kw)
+
+    # NOTE: If your operating environment does not have a display device,
+    # (e.g. a remote server), you can save the predictions and visualize
+    # them in local devices.
+    if os.environ.get('DISPLAY') is None and call_args['show']:
+        print_log(
+            'Display device not found. `--show` is forced to False',
+            logger='current',
+            level=logging.WARNING)
+        call_args['show'] = False
+
+    return init_args, call_args
+
+
+def main():
+    # TODO: Support inference of point cloud numpy file.
+    init_args, call_args = parse_args()
+
+    inferencer = LidarDet3DInferencer(**init_args)
+    inferencer(**call_args)
+
+    if call_args['out_dir'] != '' and not (call_args['no_save_vis']
+                                           and call_args['no_save_pred']):
+        print_log(
+            f'results have been saved at {call_args["out_dir"]}',
+            logger='current')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/demo/pcd_seg_demo.py b/mmde/demo/pcd_seg_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..574c21eb2d806a5bee976ad519fb1d1b08e08c8e
--- /dev/null
+++ b/mmde/demo/pcd_seg_demo.py
@@ -0,0 +1,85 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os
+from argparse import ArgumentParser
+
+from mmengine.logging import print_log
+
+from mmdet3d.apis import LidarSeg3DInferencer
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('pcd', help='Point cloud file')
+    parser.add_argument('model', help='Config file')
+    parser.add_argument('weights', help='Checkpoint file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='outputs',
+        help='Output directory of prediction and visualization results.')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='Show online visualization results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=-1,
+        help='The interval of show (s). Demo will be blocked in showing'
+        'results, if wait_time is -1. Defaults to -1.')
+    parser.add_argument(
+        '--no-save-vis',
+        action='store_true',
+        help='Do not save detection visualization results')
+    parser.add_argument(
+        '--no-save-pred',
+        action='store_true',
+        help='Do not save detection prediction results')
+    parser.add_argument(
+        '--print-result',
+        action='store_true',
+        help='Whether to print the results.')
+    call_args = vars(parser.parse_args())
+
+    call_args['inputs'] = dict(points=call_args.pop('pcd'))
+
+    if call_args['no_save_vis'] and call_args['no_save_pred']:
+        call_args['out_dir'] = ''
+
+    init_kws = ['model', 'weights', 'device']
+    init_args = {}
+    for init_kw in init_kws:
+        init_args[init_kw] = call_args.pop(init_kw)
+
+    # NOTE: If your operating environment does not have a display device,
+    # (e.g. a remote server), you can save the predictions and visualize
+    # them in local devices.
+    if os.environ.get('DISPLAY') is None and call_args['show']:
+        print_log(
+            'Display device not found. `--show` is forced to False',
+            logger='current',
+            level=logging.WARNING)
+        call_args['show'] = False
+
+    return init_args, call_args
+
+
+def main():
+    # TODO: Support inference of point cloud numpy file.
+    init_args, call_args = parse_args()
+
+    inferencer = LidarSeg3DInferencer(**init_args)
+    inferencer(**call_args)
+
+    if call_args['out_dir'] != '' and not (call_args['no_save_vis']
+                                           and call_args['no_save_pred']):
+        print_log(
+            f'results have been saved at {call_args["out_dir"]}',
+            logger='current')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/docker/Dockerfile b/mmde/docker/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..3f1f61ca8739f53880b8c578b283fd0a86ba2964
--- /dev/null
+++ b/mmde/docker/Dockerfile
@@ -0,0 +1,40 @@
+ARG PYTORCH="1.9.0"
+ARG CUDA="11.1"
+ARG CUDNN="8"
+
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \
+    TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
+    CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
+    FORCE_CUDA="1"
+
+# Avoid Public GPG key error
+# https://github.com/NVIDIA/nvidia-docker/issues/1631
+RUN rm /etc/apt/sources.list.d/cuda.list \
+    && rm /etc/apt/sources.list.d/nvidia-ml.list \
+    && apt-key del 7fa2af80 \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+# (Optional, use Mirror to speed up downloads)
+# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \
+#    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+
+# Install the required packages
+RUN apt-get update \
+    && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install MMEngine, MMCV and MMDetection
+RUN pip install openmim && \
+    mim install "mmengine" "mmcv>=2.0.0rc4" "mmdet>=3.0.0"
+
+# Install MMDetection3D
+RUN conda clean --all \
+    && git clone https://github.com/open-mmlab/mmdetection3d.git -b dev-1.x /mmdetection3d \
+    && cd /mmdetection3d \
+    && pip install --no-cache-dir -e .
+
+WORKDIR /mmdetection3d
diff --git a/mmde/docker/serve/Dockerfile b/mmde/docker/serve/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..31b87dd5211e24cc24d731ae09db462caf059ebb
--- /dev/null
+++ b/mmde/docker/serve/Dockerfile
@@ -0,0 +1,65 @@
+ARG PYTORCH="1.9.0"
+ARG CUDA="11.1"
+ARG CUDNN="8"
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+ARG MMCV="2.0.0rc4"
+ARG MMDET="3.3.0"
+ARG MMDET3D="1.4.0"
+
+ENV PYTHONUNBUFFERED TRUE
+
+# Avoid Public GPG key error
+# https://github.com/NVIDIA/nvidia-docker/issues/1631
+RUN rm /etc/apt/sources.list.d/cuda.list \
+    && rm /etc/apt/sources.list.d/nvidia-ml.list \
+    && apt-key del 7fa2af80 \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+# (Optional, use Mirror to speed up downloads)
+# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list
+
+# Install the required packages
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+    ca-certificates \
+    g++ \
+    openjdk-11-jre-headless \
+    # MMDet3D Requirements
+    ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV PATH="/opt/conda/bin:$PATH" \
+    FORCE_CUDA="1"
+
+# TORCHSEVER
+RUN pip install torchserve torch-model-archiver
+
+# MMLAB
+ARG PYTORCH
+ARG CUDA
+RUN pip install openmim
+RUN mim install mmengine
+RUN mim install mmcv==${MMCV}
+RUN mim install mmdet==${MMDET}
+RUN mim install mmdet3d==${MMDET3D}
+
+RUN useradd -m model-server \
+    && mkdir -p /home/model-server/tmp
+
+COPY entrypoint.sh /usr/local/bin/entrypoint.sh
+
+RUN chmod +x /usr/local/bin/entrypoint.sh \
+    && chown -R model-server /home/model-server
+
+COPY config.properties /home/model-server/config.properties
+RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store
+
+EXPOSE 8080 8081 8082
+
+USER model-server
+WORKDIR /home/model-server
+ENV TEMP=/home/model-server/tmp
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
+CMD ["serve"]
diff --git a/mmde/docker/serve/config.properties b/mmde/docker/serve/config.properties
new file mode 100644
index 0000000000000000000000000000000000000000..efb9c47e40ab550bac765611e6c6c6f2a7152f11
--- /dev/null
+++ b/mmde/docker/serve/config.properties
@@ -0,0 +1,5 @@
+inference_address=http://0.0.0.0:8080
+management_address=http://0.0.0.0:8081
+metrics_address=http://0.0.0.0:8082
+model_store=/home/model-server/model-store
+load_models=all
diff --git a/mmde/docker/serve/entrypoint.sh b/mmde/docker/serve/entrypoint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..41ba00b048aed84b45c5a8015a016ff148e97d86
--- /dev/null
+++ b/mmde/docker/serve/entrypoint.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+if [[ "$1" = "serve" ]]; then
+    shift 1
+    torchserve --start --ts-config /home/model-server/config.properties
+else
+    eval "$@"
+fi
+
+# prevent docker exit
+tail -f /dev/null
diff --git a/mmde/docs/en/Makefile b/mmde/docs/en/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..d4bb2cbb9eddb1bb1b4f366623044af8e4830919
--- /dev/null
+++ b/mmde/docs/en/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/mmde/docs/en/_static/css/readthedocs.css b/mmde/docs/en/_static/css/readthedocs.css
new file mode 100644
index 0000000000000000000000000000000000000000..cc61ab82abc3da66a46dc51f0ed0cab0073a7493
--- /dev/null
+++ b/mmde/docs/en/_static/css/readthedocs.css
@@ -0,0 +1,6 @@
+.header-logo {
+    background-image: url("../image/mmdet3d-logo.png");
+    background-size: 182.5px 40px;
+    height: 40px;
+    width: 182.5px;
+}
diff --git a/mmde/docs/en/advanced_guides/customize_dataset.md b/mmde/docs/en/advanced_guides/customize_dataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..2b57fa04f1c4eed5e996e3781d0778656417f3fd
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/customize_dataset.md
@@ -0,0 +1,503 @@
+# Customize Datasets
+
+In this note, you will know how to train and test predefined models with customized datasets.
+
+The basic steps are as below:
+
+1. Prepare data
+2. Prepare a config
+3. Train, test and inference models on the customized dataset
+
+## Data Preparation
+
+The ideal situation is that we can reorganize the customized raw data and convert the annotation format into KITTI style. However, considering some calibration files and 3D annotations in KITTI format are difficult to obtain for customized datasets, we introduce the basic data format in the doc.
+
+### Basic Data Format
+
+#### Point cloud Format
+
+Currently, we only support `.bin` format point cloud for training and inference. Before training on your own datasets, you need to convert your point cloud files with other formats to `.bin` files. The common point cloud data formats include `.pcd` and `.las`, we list some open-source tools for reference.
+
+1. Convert `.pcd` to `.bin`: https://github.com/DanielPollithy/pypcd
+
+- You can install `pypcd` with the following command:
+
+  ```bash
+  pip install git+https://github.com/DanielPollithy/pypcd.git
+  ```
+
+- You can use the following script to read the `.pcd` file and convert it to `.bin` format for saving:
+
+  ```python
+  import numpy as np
+  from pypcd import pypcd
+
+  pcd_data = pypcd.PointCloud.from_path('point_cloud_data.pcd')
+  points = np.zeros([pcd_data.width, 4], dtype=np.float32)
+  points[:, 0] = pcd_data.pc_data['x'].copy()
+  points[:, 1] = pcd_data.pc_data['y'].copy()
+  points[:, 2] = pcd_data.pc_data['z'].copy()
+  points[:, 3] = pcd_data.pc_data['intensity'].copy().astype(np.float32)
+  with open('point_cloud_data.bin', 'wb') as f:
+      f.write(points.tobytes())
+  ```
+
+2. Convert `.las` to `.bin`: The common conversion path is `.las -> .pcd -> .bin`, and the conversion path `.las -> .pcd` can be achieved through [this tool](https://github.com/Hitachi-Automotive-And-Industry-Lab/semantic-segmentation-editor).
+
+#### Label Format
+
+The most basic information: 3D bounding box and category label of each scene need to be contained in the `.txt` annotation file. Each line represents a 3D box in a certain scene as follow:
+
+```
+# format: [x, y, z, dx, dy, dz, yaw, category_name]
+1.23 1.42 0.23 3.96 1.65 1.55 1.56 Car
+3.51 2.15 0.42 1.05 0.87 1.86 1.23 Pedestrian
+...
+```
+
+**Note**: Currently we only support KITTI Metric evaluation for customized datasets evaluation.
+
+The 3D Box should be stored in unified 3D coordinates.
+
+#### Calibration Format
+
+For the point cloud data collected by each LiDAR, they are usually fused and converted to a certain LiDAR coordinate. So typically the calibration information file should contain the intrinsic matrix of each camera and the transformation extrinsic matrix from the LiDAR to each camera in `.txt` calibration file, while `Px` represents the intrinsic matrix of `camera_x` and `lidar2camx` represents the transformation extrinsic matrix from the `lidar` to `camera_x`.
+
+```
+P0
+P1
+P2
+P3
+P4
+...
+lidar2cam0
+lidar2cam1
+lidar2cam2
+lidar2cam3
+lidar2cam4
+...
+```
+
+### Raw Data Structure
+
+#### LiDAR-Based 3D Detection
+
+The raw data for LiDAR-based 3D object detection are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation set, `points` includes point cloud data which are supposed to be stored in `.bin` format and `labels` includes label files for 3D detection.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── points
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+│   │   ├── labels
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+```
+
+#### Vision-Based 3D Detection
+
+The raw data for vision-based 3D object detection are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation set, `images` contains the images from different cameras, for example, images from `camera_x` need to be placed in `images/images_x`, `calibs` contains calibration information files which store the camera intrinsic matrix of each camera, and `labels` includes label files for 3D detection.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── calibs
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+│   │   ├── images
+│   │   │   ├── images_0
+│   │   │   │   ├── 000000.png
+│   │   │   │   ├── 000001.png
+│   │   │   │   ├── ...
+│   │   │   ├── images_1
+│   │   │   ├── images_2
+│   │   │   ├── ...
+│   │   ├── labels
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+```
+
+#### Multi-Modality 3D Detection
+
+The raw data for multi-modality 3D object detection are typically organized as follows. Different from vision-based 3D object detection, calibration information files in `calibs` store the camera intrinsic matrix of each camera and extrinsic matrix.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── calibs
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+│   │   ├── points
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+│   │   ├── images
+│   │   │   ├── images_0
+│   │   │   │   ├── 000000.png
+│   │   │   │   ├── 000001.png
+│   │   │   │   ├── ...
+│   │   │   ├── images_1
+│   │   │   ├── images_2
+│   │   │   ├── ...
+│   │   ├── labels
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+```
+
+#### LiDAR-Based 3D Semantic Segmentation
+
+The raw data for LiDAR-based 3D semantic segmentation are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation set, `points` includes point cloud data, and `semantic_mask` includes point-level label.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── points
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+│   │   ├── semantic_mask
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+```
+
+### Data Converter
+
+Once you prepared the raw data following our instruction, you can directly use the following command to generate training/validation information files.
+
+```bash
+python tools/create_data.py custom --root-path ./data/custom --out-dir ./data/custom --extra-tag custom
+```
+
+## An example of customized dataset
+
+Once we finish data preparation, we can create a new dataset in `mmdet3d/datasets/my_dataset.py` to load the data.
+
+```python
+import mmengine
+
+from mmdet3d.registry import DATASETS
+from .det3d_dataset import Det3DDataset
+
+
+@DATASETS.register_module()
+class MyDataset(Det3DDataset):
+
+    # replace with all the classes in customized pkl info file
+    METAINFO = {
+        'classes': ('Pedestrian', 'Cyclist', 'Car')
+    }
+
+    def parse_ann_info(self, info):
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Data information of single data sample.
+
+        Returns:
+            dict: Annotation information consists of the following keys:
+
+                - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
+                  3D ground truth bboxes.
+                - gt_labels_3d (np.ndarray): Labels of ground truths.
+        """
+        ann_info = super().parse_ann_info(info)
+        if ann_info is None:
+            ann_info = dict()
+            # empty instance
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
+
+        # filter the gt classes not used in training
+        ann_info = self._remove_dontcare(ann_info)
+        gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d'])
+        ann_info['gt_bboxes_3d'] = gt_bboxes_3d
+        return ann_info
+```
+
+After the data pre-processing, there are two steps for users to train the customized new dataset:
+
+1. Modify the config file for using the customized dataset.
+2. Check the annotations of the customized dataset.
+
+Here we take training PointPillars on customized dataset as an example:
+
+### Prepare a config
+
+Here we demonstrate a config sample for pure point cloud training.
+
+#### Prepare dataset config
+
+In `configs/_base_/datasets/custom.py`:
+
+```python
+# dataset settings
+dataset_type = 'MyDataset'
+data_root = 'data/custom/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']  # replace with your dataset class
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]  # adjust according to your dataset
+input_modality = dict(use_lidar=True, use_camera=False)
+metainfo = dict(classes=class_names)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,  # replace with your point cloud data dimension
+        use_dim=4),  # replace with the actual dimension used in training and inference
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,  # replace with your point cloud data dimension
+        use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+eval_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points']),
+]
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='custom_infos_train.pkl',  # specify your training pkl info
+            data_prefix=dict(pts='points'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            box_type_3d='LiDAR')))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='points'),
+        ann_file='custom_infos_val.pkl',  # specify your validation pkl info
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'custom_infos_val.pkl',  # specify your validation pkl info
+    metric='bbox')
+```
+
+#### Prepare model config
+
+For voxel-based detectors such as SECOND, PointPillars and CenterPoint, the point cloud range and voxel size should be adjusted according to your dataset.
+Theoretically, `voxel_size` is linked to the setting of `point_cloud_range`. Setting a smaller `voxel_size` will increase the voxel num and the corresponding memory consumption. In addition, the following issues need to be noted:
+
+If the `point_cloud_range` and `voxel_size` are set to be `[0, -40, -3, 70.4, 40, 1]` and `[0.05, 0.05, 0.1]` respectively, then the shape of intermediate feature map should be `[(1-(-3))/0.1+1, (40-(-40))/0.05, (70.4-0)/0.05]=[41, 1600, 1408]`. When changing `point_cloud_range`, remember to change the shape of intermediate feature map in `middle_encoder` according to the `voxel_size`.
+
+Regarding the setting of `anchor_range`, it is generally adjusted according to dataset. Note that `z` value needs to be adjusted accordingly to the position of the point cloud, please refer to this [issue](https://github.com/open-mmlab/mmdetection3d/issues/986).
+
+Regarding the setting of `anchor_size`, it is usually necessary to count the average length, width and height of objects in the entire training dataset as `anchor_size` to obtain the best results.
+
+In `configs/_base_/models/pointpillars_hv_secfpn_custom.py`:
+
+```python
+voxel_size = [0.16, 0.16, 4]  # adjust according to your dataset
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]  # adjust according to your dataset
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=32,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range),
+    # the `output_shape` should be adjusted according to `point_cloud_range`
+    # and `voxel_size`
+    middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+    backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        assign_per_class=True,
+        # adjust the `ranges` and `sizes` according to your dataset
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[
+                [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                [0, -39.68, -1.78, 69.12, 39.68, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
+```
+
+#### Prepare overall config
+
+We combine all the configs above in `configs/pointpillars/pointpillars_hv_secfpn_8xb6_custom.py`:
+
+```python
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_custom.py',
+    '../_base_/datasets/custom.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
+```
+
+#### Visualize your dataset (optional)
+
+To validate whether your prepared data and config are correct, it's highly recommended to use `tools/misc/browse_dataset.py` script
+to visualize your dataset and annotations before training and validation. Please refer to [visualization doc](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/visualization.html) for more details.
+
+## Evaluation
+
+Once the data and config have been prepared, you can directly run the training/testing script following our doc.
+
+**Note**: We only provide an implementation for KITTI style evaluation for the customized dataset. It should be included in the dataset config:
+
+```python
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'custom_infos_val.pkl',  # specify your validation pkl info
+    metric='bbox')
+```
diff --git a/mmde/docs/en/advanced_guides/customize_models.md b/mmde/docs/en/advanced_guides/customize_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..4a53e244e4b15aa43da62c814dfd25d277b07c62
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/customize_models.md
@@ -0,0 +1,638 @@
+# Customize Models
+
+We basically categorize model components into 6 types:
+
+- encoder: Including voxel encoder and middle encoder used in voxel-based methods before backbone, e.g., `HardVFE` and `PointPillarsScatter`.
+- backbone: Usually an FCN network to extract feature maps, e.g., `ResNet`, `SECOND`.
+- neck: The component between backbones and heads, e.g., `FPN`, `SECONDFPN`.
+- head: The component for specific tasks, e.g., `bbox prediction` and `mask prediction`.
+- RoI extractor: The part for extracting RoI features from feature maps, e.g., `H3DRoIHead` and `PartAggregationROIHead`.
+- loss: The component in heads for calculating losses, e.g., `FocalLoss`, `L1Loss`, and `GHMLoss`.
+
+## Develop new components
+
+### Add a new encoder
+
+Here we show how to develop new components with an example of HardVFE.
+
+#### 1. Define a new voxel encoder (e.g. HardVFE: Voxel feature encoder used in HV-SECOND)
+
+Create a new file `mmdet3d/models/voxel_encoders/voxel_encoder.py`.
+
+```python
+import torch.nn as nn
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class HardVFE(nn.Module):
+
+    def __init__(self, arg1, arg2):
+        pass
+
+    def forward(self, x):  # should return a tuple
+        pass
+```
+
+#### 2. Import the module
+
+You can either add the following line to `mmdet3d/models/voxel_encoders/__init__.py`:
+
+```python
+from .voxel_encoder import HardVFE
+```
+
+or alternatively add
+
+```python
+custom_imports = dict(
+    imports=['mmdet3d.models.voxel_encoders.voxel_encoder'],
+    allow_failed_imports=False)
+```
+
+to the config file to avoid modifying the original code.
+
+#### 3. Use the voxel encoder in your config file
+
+```python
+model = dict(
+    ...
+    voxel_encoder=dict(
+        type='HardVFE',
+        arg1=xxx,
+        arg2=yyy),
+    ...
+)
+```
+
+### Add a new backbone
+
+Here we show how to develop new components with an example of [SECOND](https://www.mdpi.com/1424-8220/18/10/3337) (Sparsely Embedded Convolutional Detection).
+
+#### 1. Define a new backbone (e.g. SECOND)
+
+Create a new file `mmdet3d/models/backbones/second.py`.
+
+```python
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class SECOND(BaseModule):
+
+    def __init__(self, arg1, arg2):
+        pass
+
+    def forward(self, x):  # should return a tuple
+        pass
+```
+
+#### 2. Import the module
+
+You can either add the following line to `mmdet3d/models/backbones/__init__.py`:
+
+```python
+from .second import SECOND
+```
+
+or alternatively add
+
+```python
+custom_imports = dict(
+    imports=['mmdet3d.models.backbones.second'],
+    allow_failed_imports=False)
+```
+
+to the config file to avoid modifying the original code.
+
+#### 3. Use the backbone in your config file
+
+```python
+model = dict(
+    ...
+    backbone=dict(
+        type='SECOND',
+        arg1=xxx,
+        arg2=yyy),
+    ...
+)
+```
+
+### Add a new neck
+
+#### 1. Define a new neck (e.g. SECONDFPN)
+
+Create a new file `mmdet3d/models/necks/second_fpn.py`.
+
+```python
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class SECONDFPN(BaseModule):
+
+    def __init__(self,
+                 in_channels=[128, 128, 256],
+                 out_channels=[256, 256, 256],
+                 upsample_strides=[1, 2, 4],
+                 norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+                 upsample_cfg=dict(type='deconv', bias=False),
+                 conv_cfg=dict(type='Conv2d', bias=False),
+                 use_conv_for_no_stride=False,
+                 init_cfg=None):
+        pass
+
+    def forward(self, x):
+        # implementation is ignored
+        pass
+```
+
+#### 2. Import the module
+
+You can either add the following line to `mmdet3d/models/necks/__init__.py`:
+
+```python
+from .second_fpn import SECONDFPN
+```
+
+or alternatively add
+
+```python
+custom_imports = dict(
+    imports=['mmdet3d.models.necks.second_fpn'],
+    allow_failed_imports=False)
+```
+
+to the config file to avoid modifying the original code.
+
+#### 3. Use the neck in your config file
+
+```python
+model = dict(
+    ...
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    ...
+)
+```
+
+### Add a new head
+
+Here we show how to develop a new head with the example of [PartA2 Head](https://arxiv.org/abs/1907.03670) as the following.
+
+**Note**: Here the example of `PartA2 RoI Head` is used in the second stage. For one-stage heads, please refer to examples in `mmdet3d/models/dense_heads/`. They are more commonly used in 3D detection for autonomous driving due to its simplicity and high efficiency.
+
+First, add a new bbox head in `mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py`.
+`PartA2 RoI Head` implements a new bbox head for object detection.
+To implement a bbox head, basically we need to implement two functions of the new module as the following. Sometimes other related functions like `loss` and `get_targets` are also required.
+
+```python
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class PartA2BboxHead(BaseModule):
+    """PartA2 RoI head."""
+
+    def __init__(self,
+                 num_classes,
+                 seg_in_channels,
+                 part_in_channels,
+                 seg_conv_channels=None,
+                 part_conv_channels=None,
+                 merge_conv_channels=None,
+                 down_conv_channels=None,
+                 shared_fc_channels=None,
+                 cls_channels=None,
+                 reg_channels=None,
+                 dropout_ratio=0.1,
+                 roi_feat_size=14,
+                 with_corner_loss=True,
+                 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+                 conv_cfg=dict(type='Conv1d'),
+                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 loss_bbox=dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     reduction='none',
+                     loss_weight=1.0),
+                 init_cfg=None):
+        super(PartA2BboxHead, self).__init__(init_cfg=init_cfg)
+
+    def forward(self, seg_feats, part_feats):
+        pass
+```
+
+Second, implement a new RoI Head if it is necessary. We plan to inherit the new `PartAggregationROIHead` from `Base3DRoIHead`. We can find that a `Base3DRoIHead` already implements the following functions.
+
+```python
+from mmdet.models.roi_heads import BaseRoIHead
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+
+
+class Base3DRoIHead(BaseRoIHead):
+    """Base class for 3d RoIHeads."""
+
+    def __init__(self,
+                 bbox_head=None,
+                 bbox_roi_extractor=None,
+                 mask_head=None,
+                 mask_roi_extractor=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None):
+        super(Base3DRoIHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            mask_head=mask_head,
+            mask_roi_extractor=mask_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+
+    def init_bbox_head(self, bbox_roi_extractor: dict,
+                       bbox_head: dict) -> None:
+        """Initialize box head and box roi extractor.
+
+        Args:
+            bbox_roi_extractor (dict or ConfigDict): Config of box
+                roi extractor.
+            bbox_head (dict or ConfigDict): Config of box in box head.
+        """
+        self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor)
+        self.bbox_head = MODELS.build(bbox_head)
+
+    def init_assigner_sampler(self):
+        """Initialize assigner and sampler."""
+        self.bbox_assigner = None
+        self.bbox_sampler = None
+        if self.train_cfg:
+            if isinstance(self.train_cfg.assigner, dict):
+                self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
+            elif isinstance(self.train_cfg.assigner, list):
+                self.bbox_assigner = [
+                    TASK_UTILS.build(res) for res in self.train_cfg.assigner
+                ]
+            self.bbox_sampler = TASK_UTILS.build(self.train_cfg.sampler)
+
+    def init_mask_head(self):
+        """Initialize mask head, skip since ``PartAggregationROIHead`` does not
+        have one."""
+        pass
+```
+
+Double Head's modification is mainly in the bbox_forward logic, and it inherits other logics from the `Base3DRoIHead`.
+In the `mmdet3d/models/roi_heads/part_aggregation_roi_head.py`, we implement the new RoI Head as the following:
+
+```python
+from typing import Dict, List, Tuple
+
+from mmdet.models.task_modules import AssignResult, SamplingResult
+from mmengine import ConfigDict
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import bbox3d2roi
+from mmdet3d.utils import InstanceList
+from ...structures.det3d_data_sample import SampleList
+from .base_3droi_head import Base3DRoIHead
+
+
+@MODELS.register_module()
+class PartAggregationROIHead(Base3DRoIHead):
+    """Part aggregation roi head for PartA2.
+
+    Args:
+        semantic_head (ConfigDict): Config of semantic head.
+        num_classes (int): The number of classes.
+        seg_roi_extractor (ConfigDict): Config of seg_roi_extractor.
+        bbox_roi_extractor (ConfigDict): Config of part_roi_extractor.
+        bbox_head (ConfigDict): Config of bbox_head.
+        train_cfg (ConfigDict): Training config.
+        test_cfg (ConfigDict): Testing config.
+    """
+
+    def __init__(self,
+                 semantic_head: dict,
+                 num_classes: int = 3,
+                 seg_roi_extractor: dict = None,
+                 bbox_head: dict = None,
+                 bbox_roi_extractor: dict = None,
+                 train_cfg: dict = None,
+                 test_cfg: dict = None,
+                 init_cfg: dict = None) -> None:
+        super(PartAggregationROIHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        self.num_classes = num_classes
+        assert semantic_head is not None
+        self.init_seg_head(seg_roi_extractor, semantic_head)
+
+    def init_seg_head(self, seg_roi_extractor: dict,
+                      semantic_head: dict) -> None:
+        """Initialize semantic head and seg roi extractor.
+
+        Args:
+            seg_roi_extractor (dict): Config of seg
+                roi extractor.
+            semantic_head (dict): Config of semantic head.
+        """
+        self.semantic_head = MODELS.build(semantic_head)
+        self.seg_roi_extractor = MODELS.build(seg_roi_extractor)
+
+    @property
+    def with_semantic(self):
+        """bool: whether the head has semantic branch"""
+        return hasattr(self,
+                       'semantic_head') and self.semantic_head is not None
+
+    def predict(self,
+                feats_dict: Dict,
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList,
+                rescale: bool = False,
+                **kwargs) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented in PartA2.'
+        assert self.with_semantic, 'Semantic head must be implemented' \
+                                   ' in PartA2.'
+
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        voxels_dict = feats_dict.pop('voxels_dict')
+        # TODO: Split predict semantic and bbox
+        results_list = self.predict_bbox(feats_dict, voxels_dict,
+                                         batch_input_metas, rpn_results_list,
+                                         self.test_cfg)
+        return results_list
+
+    def predict_bbox(self, feats_dict: Dict, voxel_dict: Dict,
+                     batch_input_metas: List[dict],
+                     rpn_results_list: InstanceList,
+                     test_cfg: ConfigDict) -> InstanceList:
+        """Perform forward propagation of the bbox head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            voxel_dict (dict): Contains information of voxels.
+            batch_input_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            test_cfg (Config): Test config.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        ...
+
+    def loss(self, feats_dict: Dict, rpn_results_list: InstanceList,
+             batch_data_samples: SampleList, **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        losses = dict()
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        voxels_dict = feats_dict.pop('voxels_dict')
+        for data_sample in batch_data_samples:
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            if 'ignored_instances' in data_sample:
+                batch_gt_instances_ignore.append(data_sample.ignored_instances)
+            else:
+                batch_gt_instances_ignore.append(None)
+        if self.with_semantic:
+            semantic_results = self._semantic_forward_train(
+                feats_dict, voxels_dict, batch_gt_instances_3d)
+            losses.update(semantic_results.pop('loss_semantic'))
+
+        sample_results = self._assign_and_sample(rpn_results_list,
+                                                 batch_gt_instances_3d)
+        if self.with_bbox:
+            feats_dict.update(semantic_results)
+            bbox_results = self._bbox_forward_train(feats_dict, voxels_dict,
+                                                    sample_results)
+            losses.update(bbox_results['loss_bbox'])
+
+        return losses
+```
+
+Here we omit more details related to other functions. Please see the [code](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/roi_heads/part_aggregation_roi_head.py) for more details.
+
+Last, the users need to add the module in
+`mmdet3d/models/roi_heads/bbox_heads/__init__.py` and `mmdet3d/models/roi_heads/__init__.py` thus the corresponding registry could find and load them.
+
+Alternatively, the users can add
+
+```python
+custom_imports=dict(
+    imports=['mmdet3d.models.roi_heads.part_aggregation_roi_head', 'mmdet3d.models.roi_heads.bbox_heads.parta2_bbox_head'],
+    allow_failed_imports=False)
+```
+
+to the config file and achieve the same goal.
+
+The config file of `PartAggregationROIHead` is as the following:
+
+```python
+model = dict(
+    ...
+    roi_head=dict(
+        type='PartAggregationROIHead',
+        num_classes=3,
+        semantic_head=dict(
+            type='PointwiseSemanticHead',
+            in_channels=16,
+            extra_width=0.2,
+            seg_score_thr=0.3,
+            num_classes=3,
+            loss_seg=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_part=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                loss_weight=1.0)),
+        seg_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='max')),
+        bbox_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='avg')),
+        bbox_head=dict(
+            type='PartA2BboxHead',
+            num_classes=3,
+            seg_in_channels=16,
+            part_in_channels=4,
+            seg_conv_channels=[64, 64],
+            part_conv_channels=[64, 64],
+            merge_conv_channels=[128, 128],
+            down_conv_channels=[128, 256],
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            shared_fc_channels=[256, 512, 512, 512],
+            cls_channels=[256, 256],
+            reg_channels=[256, 256],
+            dropout_ratio=0.1,
+            roi_feat_size=14,
+            with_corner_loss=True,
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss',
+                beta=1.0 / 9.0,
+                reduction='sum',
+                loss_weight=1.0),
+            loss_cls=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                loss_weight=1.0))),
+    ...
+)
+```
+
+Since MMDetection 2.0, the config system supports to inherit configs such that the users can focus on the modification.
+The second stage of PartA2 Head mainly uses a new `PartAggregationROIHead` and a new
+`PartA2BboxHead`, the arguments are set according to the `__init__` function of each module.
+
+### Add a new loss
+
+Assume you want to add a new loss as `MyLoss` for bounding box regression.
+To add a new loss function, the users need to implement it in `mmdet3d/models/losses/my_loss.py`.
+The decorator `weighted_loss` enables the loss to be weighted for each element.
+
+```python
+import torch
+import torch.nn as nn
+from mmdet.models.losses.utils import weighted_loss
+
+from mmdet3d.registry import MODELS
+
+
+@weighted_loss
+def my_loss(pred, target):
+    assert pred.size() == target.size() and target.numel() > 0
+    loss = torch.abs(pred - target)
+    return loss
+
+
+@MODELS.register_module()
+class MyLoss(nn.Module):
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(MyLoss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * my_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
+```
+
+Then the users need to add it in the `mmdet3d/models/losses/__init__.py`.
+
+```python
+from .my_loss import MyLoss, my_loss
+```
+
+Alternatively, you can add
+
+```python
+custom_imports=dict(
+    imports=['mmdet3d.models.losses.my_loss'],
+    allow_failed_imports=False)
+```
+
+to the config file and achieve the same goal.
+
+To use it, users should modify the `loss_xxx` field.
+Since `MyLoss` is for regression, you need to modify the `loss_bbox` field in the head.
+
+```python
+loss_bbox=dict(type='MyLoss', loss_weight=1.0)
+```
diff --git a/mmde/docs/en/advanced_guides/customize_runtime.md b/mmde/docs/en/advanced_guides/customize_runtime.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ccacd381e6d0c6a42f8d6a54a9501d6e96b6f89
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/customize_runtime.md
@@ -0,0 +1,392 @@
+# Customize Runtime Settings
+
+## Customize optimization settings
+
+Optimization related configuration is now all managed by `optim_wrapper` which usually has three fields: `optimizer`, `paramwise_cfg`, `clip_grad`. Please refer to [OptimWrapper](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html) for more details. See the example below, where `AdamW` is used as an `optimizer`, the learning rate of the backbone is reduced by a factor of 10, and gradient clipping is added.
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    # optimizer
+    optimizer=dict(
+        type='AdamW',
+        lr=0.0001,
+        weight_decay=0.05,
+        eps=1e-8,
+        betas=(0.9, 0.999)),
+
+    # Parameter-level learning rate and weight decay settings
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+        },
+        norm_decay_mult=0.0),
+
+    # gradient clipping
+    clip_grad=dict(max_norm=0.01, norm_type=2))
+```
+
+### Customize optimizer supported by PyTorch
+
+We already support to use all the optimizers implemented by PyTorch, and the only modification is to change the `optimizer` field in `optim_wrapper` field of config files. For example, if you want to use `Adam` (note that the performance could drop a lot), the modification could be as the following:
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='Adam', lr=0.0003, weight_decay=0.0001))
+```
+
+To modify the learning rate of the model, the users only need to modify the `lr` in `optimizer`. The users can directly set arguments following the [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) of PyTorch.
+
+### Customize self-implemented optimizer
+
+#### 1. Define a new optimizer
+
+A customized optimizer could be defined as following:
+
+Assume you want to add a optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`.
+You need to create a new directory named `mmdet3d/engine/optimizers`, and then implement the new optimizer in a file, e.g., in `mmdet3d/engine/optimizers/my_optimizer.py`:
+
+```python
+from torch.optim import Optimizer
+
+from mmdet3d.registry import OPTIMIZERS
+
+
+@OPTIMIZERS.register_module()
+class MyOptimizer(Optimizer):
+
+    def __init__(self, a, b, c):
+        pass
+```
+
+#### 2. Add the optimizer to registry
+
+To find the above module defined above, this module should be imported into the main namespace at first. There are two options to achieve it.
+
+- Modify `mmdet3d/engine/optimizers/__init__.py` to import it.
+
+  The newly defined module should be imported in `mmdet3d/engine/optimizers/__init__.py` so that the registry will find the new module and add it:
+
+  ```python
+  from .my_optimizer import MyOptimizer
+  ```
+
+- Use `custom_imports` in the config to manually import it.
+
+  ```python
+  custom_imports = dict(imports=['mmdet3d.engine.optimizers.my_optimizer'], allow_failed_imports=False)
+  ```
+
+  The module `mmdet3d.engine.optimizers.my_optimizer` will be imported at the beginning of the program and the class `MyOptimizer` is then automatically registered.
+  Note that only the package containing the class `MyOptimizer` should be imported.
+  `mmdet3d.engine.optimizers.my_optimizer.MyOptimizer` **cannot** be imported directly.
+
+  Actually users can use a totally different file directory structure with this importing method, as long as the module root is located in `PYTHONPATH`.
+
+#### 3. Specify the optimizer in the config file
+
+Then you can use `MyOptimizer` in `optimizer` field in `optim_wrapper` field of config files. In the configs, the optimizers are defined by the field `optimizer` like the following:
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+```
+
+To use your own optimizer, the field can be changed to:
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value))
+```
+
+### Customize optimizer wrapper constructor
+
+Some models may have some parameter-specific settings for optimization, e.g. weight decay for BatchNorm layers.
+The users can do those fine-grained parameter tuning through customizing optimizer wrapper constructor.
+
+```python
+from mmengine.optim import DefaultOptimWrapperConstructor
+
+from mmdet3d.registry import OPTIM_WRAPPER_CONSTRUCTORS
+from .my_optimizer import MyOptimizer
+
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
+class MyOptimizerWrapperConstructor(DefaultOptimWrapperConstructor):
+
+    def __init__(self,
+                 optim_wrapper_cfg: dict,
+                 paramwise_cfg: Optional[dict] = None):
+        pass
+
+    def __call__(self, model: nn.Module) -> OptimWrapper:
+
+        return optim_wrapper
+```
+
+The default optimizer wrapper constructor is implemented [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L18), which could also serve as a template for the new optimizer wrapper constructor.
+
+### Additional settings
+
+Tricks not implemented by the optimizer should be implemented through optimizer wrapper constructor (e.g., set parameter-wise learning rates) or hooks. We list some common settings that could stabilize the training or accelerate the training. Feel free to create PR, issue for more settings.
+
+- __Use gradient clip to stabilize training__:
+  Some models need gradient clip to clip the gradients to stabilize the training process. An example is as below:
+
+  ```python
+  optim_wrapper = dict(
+      _delete_=True, clip_grad=dict(max_norm=35, norm_type=2))
+  ```
+
+  If your config inherits the base config which already sets the `optim_wrapper`, you might need `_delete_=True` to override the unnecessary settings. See the [config documentation](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/config.html) for more details.
+
+- __Use momentum schedule to accelerate model convergence__:
+  We support momentum scheduler to modify model's momentum according to learning rate, which could make the model converge in a faster way.
+  Momentum scheduler is usually used with LR scheduler, for example, the following config is used in [3D detection](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/schedules/cyclic-20e.py) to accelerate convergence.
+  For more details, please refer to the implementation of [CosineAnnealingLR](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L43) and [CosineAnnealingMomentum](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py#L71).
+
+  ```python
+  param_scheduler = [
+      # learning rate scheduler
+      # During the first 8 epochs, learning rate increases from 0 to lr * 10
+      # during the next 12 epochs, learning rate decreases from lr * 10 to lr * 1e-4
+      dict(
+          type='CosineAnnealingLR',
+          T_max=8,
+          eta_min=lr * 10,
+          begin=0,
+          end=8,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      dict(
+          type='CosineAnnealingLR',
+          T_max=12,
+          eta_min=lr * 1e-4,
+          begin=8,
+          end=20,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      # momentum scheduler
+      # During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95
+      # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
+      dict(
+          type='CosineAnnealingMomentum',
+          T_max=8,
+          eta_min=0.85 / 0.95,
+          begin=0,
+          end=8,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      dict(
+          type='CosineAnnealingMomentum',
+          T_max=12,
+          eta_min=1,
+          begin=8,
+          end=20,
+          by_epoch=True,
+          convert_to_iter_based=True)
+  ]
+  ```
+
+## Customize training schedules
+
+By default we use step learning rate with 1x schedule, this calls [`MultiStepLR`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L144) in MMEngine.
+We support many other learning rate schedule [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py), such as `CosineAnnealingLR` and `PolyLR` schedules. Here are some examples:
+
+- Poly schedule:
+
+  ```python
+  param_scheduler = [
+      dict(
+          type='PolyLR',
+          power=0.9,
+          eta_min=1e-4,
+          begin=0,
+          end=8,
+          by_epoch=True)]
+  ```
+
+- CosineAnnealing schedule:
+
+  ```python
+  param_scheduler = [
+      dict(
+          type='CosineAnnealingLR',
+          T_max=8,
+          eta_min=lr * 1e-5,
+          begin=0,
+          end=8,
+          by_epoch=True)]
+  ```
+
+## Customize train loop
+
+By default, `EpochBasedTrainLoop` is used in `train_cfg` and validation is done after every train epoch, as follows:
+
+```python
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=1, val_interval=1)
+```
+
+Actually, both [`IterBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L185) and [`EpochBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L18) support dynamic interval, see the following example:
+
+```python
+# Before 365001th iteration, we do evaluation every 5000 iterations.
+# After 365000th iteration, we do evaluation every 368750 iterations,
+# which means that we do evaluation at the end of training.
+
+interval = 5000
+max_iters = 368750
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+train_cfg = dict(
+    type='IterBasedTrainLoop',
+    max_iters=max_iters,
+    val_interval=interval,
+    dynamic_intervals=dynamic_intervals)
+```
+
+## Customize hooks
+
+### Customize self-implemented hooks
+
+#### 1. Implement a new hook
+
+MMEngine provides many useful [hooks](https://mmengine.readthedocs.io/en/latest/tutorials/hook.html), but there are some occasions when the users might need to implement a new hook. MMDetection3D supports customized hooks in training based on MMEngine after v1.1.0rc0. Thus the users could implement a hook directly in mmdet3d or their mmdet3d-based codebases and use the hook by only modifying the config in training.
+Here we give an example of creating a new hook in mmdet3d and using it in training.
+
+```python
+from mmengine.hooks import Hook
+
+from mmdet3d.registry import HOOKS
+
+
+@HOOKS.register_module()
+class MyHook(Hook):
+
+    def __init__(self, a, b):
+
+    def before_run(self, runner) -> None:
+
+    def after_run(self, runner) -> None:
+
+    def before_train(self, runner) -> None:
+
+    def after_train(self, runner) -> None:
+
+    def before_train_epoch(self, runner) -> None:
+
+    def after_train_epoch(self, runner) -> None:
+
+    def before_train_iter(self,
+                          runner,
+                          batch_idx: int,
+                          data_batch: DATA_BATCH = None) -> None:
+
+    def after_train_iter(self,
+                         runner,
+                         batch_idx: int,
+                         data_batch: DATA_BATCH = None,
+                         outputs: Optional[dict] = None) -> None:
+```
+
+Depending on the functionality of the hook, users need to specify what the hook will do at each stage of the training in `before_run`, `after_run`, `before_train`, `after_train`, `before_train_epoch`, `after_train_epoch`, `before_train_iter`, and `after_train_iter`. There are more points where hooks can be inserted, refer to [base hook class](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py#L9) for more details.
+
+#### 2. Register the new hook
+
+Then we need to make `MyHook` imported. Assuming the file is in `mmdet3d/engine/hooks/my_hook.py`, there are two ways to do that:
+
+- Modify `mmdet3d/engine/hooks/__init__.py` to import it.
+
+  The newly defined module should be imported in `mmdet3d/engine/hooks/__init__.py` so that the registry will find the new module and add it:
+
+  ```python
+  from .my_hook import MyHook
+  ```
+
+- Use `custom_imports` in the config to manually import it.
+
+  ```python
+  custom_imports = dict(imports=['mmdet3d.engine.hooks.my_hook'], allow_failed_imports=False)
+  ```
+
+#### 3. Modify the config
+
+```python
+custom_hooks = [
+    dict(type='MyHook', a=a_value, b=b_value)
+]
+```
+
+You can also set the priority of the hook by adding key `priority` to `'NORMAL'` or `'HIGHEST'` as below:
+
+```python
+custom_hooks = [
+    dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL')
+]
+```
+
+By default the hook's priority is set as `NORMAL` during registration.
+
+### Use hooks implemented in MMDetection3D
+
+If the hook is already implemented in MMDetection3D, you can directly modify the config to use the hook as below.
+
+#### Example: `DisableObjectSampleHook`
+
+We implement a customized hook named [DisableObjectSampleHook](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/engine/hooks/disable_object_sample_hook.py) to disable `ObjectSample` augmentation during training after specified epoch.
+
+We can set it in the config file if needed:
+
+```python
+custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)]
+```
+
+### Modify default runtime hooks
+
+There are some common hooks that are registered through `default_hooks`, they are
+
+- `IterTimerHook`: A hook that logs 'data_time' for loading data and 'time' for a model training step.
+- `LoggerHook`: A hook that collects logs from different components of `Runner` and writes them to terminal, json file, tensorboard and wandb etc.
+- `ParamSchedulerHook`: A hook that updates some hyper-parameters in optimizer, e.g., learning rate and momentum.
+- `CheckpointHook`: A hook that saves checkpoints periodically.
+- `DistSamplerSeedHook`: A hook that sets the seed for sampler and batch_sampler.
+- `Det3DVisualizationHook`: A hook used to visualize validation and testing process prediction results.
+
+`IterTimerHook`, `ParamSchedulerHook` and `DistSamplerSeedHook` are simple and no need to be modified usually, so here we reveal what we can do with `LoggerHook`, `CheckpointHook` and `Det3DVisualizationHook`.
+
+#### CheckpointHook
+
+Except saving checkpoints periodically, [`CheckpointHook`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L18) provides other options such as `max_keep_ckpts`, `save_optimizer` and etc. The users could set `max_keep_ckpts` to only save small number of checkpoints or decide whether to store state dict of optimizer by `save_optimizer`. More details of the arguments are [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L18).
+
+```python
+default_hooks = dict(
+    checkpoint=dict(
+        type='CheckpointHook',
+        interval=1,
+        max_keep_ckpts=3,
+        save_optimizer=True))
+```
+
+#### LoggerHook
+
+The `LoggerHook` enables setting intervals. Detailed instructions can be found in the [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py#L19).
+
+```python
+default_hooks = dict(logger=dict(type='LoggerHook', interval=50))
+```
+
+#### Det3DVisualizationHook
+
+`Det3DVisualizationHook` use `DetLocalVisualizer` to visualize prediction results, and `Det3DLocalVisualizer` current supports different backends, e.g., `TensorboardVisBackend` and `WandbVisBackend` (see [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py) for more details). The users could add multi backends to do visualization as follows.
+
+```python
+default_hooks = dict(
+    visualization=dict(type='Det3DVisualizationHook', draw=True))
+
+vis_backends = [dict(type='LocalVisBackend'),
+                dict(type='TensorboardVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+```
diff --git a/mmde/docs/en/advanced_guides/datasets/index.rst b/mmde/docs/en/advanced_guides/datasets/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dc416ef57967e143606f37884eda5496e47dd214
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/datasets/index.rst
@@ -0,0 +1,11 @@
+.. toctree::
+   :maxdepth: 3
+
+   kitti.md
+   nuscenes.md
+   lyft.md
+   waymo.md
+   sunrgbd.md
+   scannet.md
+   s3dis.md
+   semantickitti.md
diff --git a/mmde/docs/en/advanced_guides/datasets/kitti.md b/mmde/docs/en/advanced_guides/datasets/kitti.md
new file mode 100644
index 0000000000000000000000000000000000000000..c0f3108fa265a4b6a90af261fab1e01fcf4c78d6
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/datasets/kitti.md
@@ -0,0 +1,206 @@
+# KITTI Dataset
+
+This page provides specific tutorials about the usage of MMDetection3D for KITTI dataset.
+
+## Prepare dataset
+
+You can download KITTI 3D detection data [HERE](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d) and unzip all zip files. Besides, the road planes could be downloaded from [HERE](https://download.openmmlab.com/mmdetection3d/data/train_planes.zip), which are optional for data augmentation during training for better performance. The road planes are generated by [AVOD](https://github.com/kujason/avod), you can see more details [HERE](https://github.com/kujason/avod/issues/19).
+
+Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── kitti
+│   │   ├── ImageSets
+│   │   ├── testing
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── velodyne
+│   │   ├── training
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── label_2
+│   │   │   ├── velodyne
+│   │   │   ├── planes (optional)
+```
+
+### Create KITTI dataset
+
+To create KITTI point cloud data, we load the raw point cloud data and generate the relevant annotations including object labels and bounding boxes. We also generate all single training objects' point cloud in KITTI dataset and save them as `.bin` files in `data/kitti/kitti_gt_database`. Meanwhile, `.pkl` info files are also generated for training or validation. Subsequently, create KITTI data by running:
+
+```bash
+mkdir ./data/kitti/ && mkdir ./data/kitti/ImageSets
+
+# Download data split
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/test.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/test.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/train.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/train.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/val.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/val.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/trainval.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/trainval.txt
+
+python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti --with-plane
+```
+
+Note that if your local disk does not have enough space for saving converted data, you can change the `--out-dir` to anywhere else, and you need to remove the `--with-plane` flag if `planes` are not prepared.
+
+The folder structure after processing should be as below
+
+```
+kitti
+├── ImageSets
+│   ├── test.txt
+│   ├── train.txt
+│   ├── trainval.txt
+│   ├── val.txt
+├── testing
+│   ├── calib
+│   ├── image_2
+│   ├── velodyne
+│   ├── velodyne_reduced
+├── training
+│   ├── calib
+│   ├── image_2
+│   ├── label_2
+│   ├── velodyne
+│   ├── velodyne_reduced
+│   ├── planes (optional)
+├── kitti_gt_database
+│   ├── xxxxx.bin
+├── kitti_infos_train.pkl
+├── kitti_infos_val.pkl
+├── kitti_dbinfos_train.pkl
+├── kitti_infos_test.pkl
+├── kitti_infos_trainval.pkl
+```
+
+- `kitti_gt_database/xxxxx.bin`: point cloud data included in each 3D bounding box of the training dataset.
+- `kitti_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
+  `metainfo` contains the basic information for the dataset itself, such as `categories`, `dataset` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['images'\]: Information of images captured by multiple cameras. A dict contains five keys including: `CAM0`, `CAM1`, `CAM2`, `CAM3`, `R0_rect`.
+    - info\['images'\]\['R0_rect'\]: Rectifying rotation matrix with shape (4, 4).
+    - info\['images'\]\['CAM2'\]: Include some information about the `CAM2` camera sensor.
+      - info\['images'\]\['CAM2'\]\['img_path'\]: The filename of the image.
+      - info\['images'\]\['CAM2'\]\['height'\]: The height of the image.
+      - info\['images'\]\['CAM2'\]\['width'\]: The width of the image.
+      - info\['images'\]\['CAM2'\]\['cam2img'\]: Transformation matrix from camera to image with shape (4, 4).
+      - info\['images'\]\['CAM2'\]\['lidar2cam'\]: Transformation matrix from lidar to camera with shape (4, 4).
+      - info\['images'\]\['CAM2'\]\['lidar2img'\]: Transformation matrix from lidar to image with shape (4, 4).
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+    - info\['lidar_points'\]\['Tr_velo_to_cam'\]: Transformation from Velodyne coordinate to camera coordinate with shape (4, 4).
+    - info\['lidar_points'\]\['Tr_imu_to_velo'\]: Transformation from IMU coordinate to Velodyne coordinate with shape (4, 4).
+  - info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox'\]: List of 4 numbers representing the 2D bounding box of the instance, in (x1, y1, x2, y2) order.
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
+    - info\['instances'\]\[i\]\['bbox_label'\]: An int indicate the 2D label of instance and the -1 indicating ignore.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: An int indicate the 3D label of instance and the -1 indicating ignore.
+    - info\['instances'\]\[i\]\['depth'\]: Projected center depth of the 3D bounding box with respect to the image plane.
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: The number of LiDAR points in the 3D bounding box.
+    - info\['instances'\]\[i\]\['center_2d'\]: Projected 2D center of the 3D bounding box.
+    - info\['instances'\]\[i\]\['difficulty'\]: KITTI difficulty: 'Easy', 'Moderate', 'Hard'.
+    - info\['instances'\]\[i\]\['truncated'\]: Float from 0 (non-truncated) to 1 (truncated), where truncated refers to the object leaving image boundaries.
+    - info\['instances'\]\[i\]\['occluded'\]: Integer (0,1,2,3) indicating occlusion state: 0 = fully visible, 1 = partly occluded, 2 = largely occluded, 3 = unknown.
+    - info\['instances'\]\[i\]\['group_ids'\]: Used for multi-part object.
+  - info\['plane'\](optional): Road level information.
+
+Please refer to [kitti_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/kitti_converter.py) and [update_infos_to_v2.py ](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/update_infos_to_v2.py) for more details.
+
+## Train pipeline
+
+A typical train pipeline of 3D detection on KITTI is as below:
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4, # x, y, z, intensity
+        use_dim=4),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+- Data augmentation:
+  - `ObjectNoise`: apply noise to each GT objects in the scene.
+  - `RandomFlip3D`: randomly flip input point cloud horizontally or vertically.
+  - `GlobalRotScaleTrans`: rotate input point cloud.
+
+## Evaluation
+
+An example to evaluate PointPillars with 8 GPUs with kitti metrics is as follows:
+
+```shell
+bash tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class/latest.pth 8
+```
+
+## Metrics
+
+KITTI evaluates 3D object detection performance using mean Average Precision (mAP) and Average Orientation Similarity (AOS), Please refer to its [official website](http://www.cvlibs.net/datasets/kitti/eval_3dobject.php) and [original paper](http://www.cvlibs.net/publications/Geiger2012CVPR.pdf) for more details.
+
+We also adopt this approach for evaluation on KITTI. An example of printed evaluation results is as follows:
+
+```
+Car AP@0.70, 0.70, 0.70:
+bbox AP:97.9252, 89.6183, 88.1564
+bev  AP:90.4196, 87.9491, 85.1700
+3d   AP:88.3891, 77.1624, 74.4654
+aos  AP:97.70, 89.11, 87.38
+Car AP@0.70, 0.50, 0.50:
+bbox AP:97.9252, 89.6183, 88.1564
+bev  AP:98.3509, 90.2042, 89.6102
+3d   AP:98.2800, 90.1480, 89.4736
+aos  AP:97.70, 89.11, 87.38
+```
+
+## Testing and make a submission
+
+An example to test PointPillars on KITTI with 8 GPUs and generate a submission to the leaderboard is as follows:
+
+- First, you need to modify the `test_dataloader` and `test_evaluator` dict in your config file, just like:
+
+  ```python
+  data_root = 'data/kitti/'
+  test_dataloader = dict(
+      dataset=dict(
+          ann_file='kitti_infos_test.pkl',
+          load_eval_anns=False,
+          data_prefix=dict(pts='testing/velodyne_reduced')))
+  test_evaluator = dict(
+      ann_file=data_root + 'kitti_infos_test.pkl',
+      format_only=True,
+      pklfile_prefix='results/kitti-3class/kitti_results',
+      submission_prefix='results/kitti-3class/kitti_results')
+  ```
+
+- And then, you can run the test script.
+
+  ```shell
+  ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class/latest.pth 8
+  ```
+
+After generating `results/kitti-3class/kitti_results/xxxxx.txt` files, you can submit these files to KITTI benchmark. Please refer to the [KITTI official website](http://www.cvlibs.net/datasets/kitti/index.php) for more details.
diff --git a/mmde/docs/en/advanced_guides/datasets/lyft.md b/mmde/docs/en/advanced_guides/datasets/lyft.md
new file mode 100644
index 0000000000000000000000000000000000000000..2f711518e8211ca04f9d9976703da1c793956329
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/datasets/lyft.md
@@ -0,0 +1,207 @@
+# Lyft Dataset
+
+This page provides specific tutorials about the usage of MMDetection3D for Lyft dataset.
+
+## Before Preparation
+
+You can download Lyft 3D detection data [HERE](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data) and unzip all zip files.
+
+Like the general way to prepare a dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── lyft
+│   │   ├── v1.01-train
+│   │   │   ├── v1.01-train (train_data)
+│   │   │   ├── lidar (train_lidar)
+│   │   │   ├── images (train_images)
+│   │   │   ├── maps (train_maps)
+│   │   ├── v1.01-test
+│   │   │   ├── v1.01-test (test_data)
+│   │   │   ├── lidar (test_lidar)
+│   │   │   ├── images (test_images)
+│   │   │   ├── maps (test_maps)
+│   │   ├── train.txt
+│   │   ├── val.txt
+│   │   ├── test.txt
+│   │   ├── sample_submission.csv
+```
+
+Here `v1.01-train` and `v1.01-test` contain the metafiles which are similar to those of nuScenes. `.txt` files contain the data split information.
+Lyft does not have an official split for training and validation set, so we provide a split considering the number of objects from different categories in different scenes.
+`sample_submission.csv` is the base file for submission on the Kaggle evaluation server.
+Note that we follow the original folder names for clear organization. Please rename the raw folders as shown above.
+
+## Dataset Preparation
+
+The way to organize Lyft dataset is similar to nuScenes. We also generate the `.pkl` files which share almost the same structure.
+Next, we will mainly focus on the difference between these two datasets. For a more detailed explanation of the info structure, please refer to [nuScenes tutorial](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/en/advanced_guides/datasets/nuscenes_det.md).
+
+To prepare info files for Lyft, run the following commands:
+
+```bash
+python tools/create_data.py lyft --root-path ./data/lyft --out-dir ./data/lyft --extra-tag lyft --version v1.01
+python tools/dataset_converters/lyft_data_fixer.py --version v1.01 --root-folder ./data/lyft
+```
+
+Note that the second command serves the purpose of fixing a corrupted lidar data file. Please refer to the discussion [here](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000) for more details.
+
+The folder structure after processing should be as below.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── lyft
+│   │   ├── v1.01-train
+│   │   │   ├── v1.01-train (train_data)
+│   │   │   ├── lidar (train_lidar)
+│   │   │   ├── images (train_images)
+│   │   │   ├── maps (train_maps)
+│   │   ├── v1.01-test
+│   │   │   ├── v1.01-test (test_data)
+│   │   │   ├── lidar (test_lidar)
+│   │   │   ├── images (test_images)
+│   │   │   ├── maps (test_maps)
+│   │   ├── train.txt
+│   │   ├── val.txt
+│   │   ├── test.txt
+│   │   ├── sample_submission.csv
+│   │   ├── lyft_infos_train.pkl
+│   │   ├── lyft_infos_val.pkl
+│   │   ├── lyft_infos_test.pkl
+```
+
+- `lyft_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
+  `metainfo` contains the basic information for the dataset itself, such as `categories`, `dataset` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['token'\]: Sample data token.
+  - info\['timestamp'\]: Timestamp of the sample data.
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+    - info\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle. (4x4 list)
+    - info\['lidar_points'\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+  - info\['lidar_sweeps'\]: A list contains sweeps information (The intermediate lidar frames without annotations).
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['data_path'\]: The lidar data path of i-th sweep.
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle in i-th sweep timestamp
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['ego2global'\]: The transformation matrix from the ego vehicle in i-th sweep timestamp to global coordinates. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['lidar2sensor'\]: The transformation matrix from the keyframe lidar to the i-th frame lidar. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+    - info\['lidar_sweeps'\]\[i\]\['sample_data_token'\]: The sweep sample data token.
+  - info\['images'\]: A dict contains six keys corresponding to each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`. Each dict contains all data information related to  corresponding camera.
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points to each image plane. (3x3 list)
+    - info\['images'\]\['CAM_XXX'\]\['sample_data_token'\]: Sample data token of image.
+    - info\['images'\]\['CAM_XXX'\]\['timestamp'\]: Timestamp of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2ego'\]: The transformation matrix from this camera sensor to ego vehicle. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from lidar sensor to this camera. (4x4 list)
+  - info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box in lidar coordinate system of the instance, in (x, y, z, l, w, h, yaw) order.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: A int starting from 0 indicates the label of instance, while the -1 indicates ignore class.
+    - info\['instances'\]\[i\]\['bbox_3d_isvalid'\]: Whether each bounding box is valid. In general, we only take the 3D boxes that include at least one lidar or radar point as valid boxes.
+
+Next, we will elaborate on the difference compared to nuScenes in terms of the details recorded in these info files.
+
+- Without `lyft_database/xxxxx.bin`: This folder and `.bin` files are not extracted on the Lyft dataset due to the negligible effect of ground-truth sampling in the experiments.
+
+- `lyft_infos_train.pkl`:
+
+  - Without info\['instances'\]\[i\]\['velocity'\]: There is no velocity measurement on Lyft.
+  - Without info\['instances'\]\[i\]\['num_lidar_pts'\] and info\['instances'\]\['num_radar_pts'\]
+
+Here we only explain the data recorded in the training info files. The same applies to the validation set and test set (without instances).
+
+Please refer to [lyft_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/lyft_converter.py) for more details about the structure of `lyft_infos_xxx.pkl`.
+
+## Training pipeline
+
+### LiDAR-Based Methods
+
+A typical training pipeline of LiDAR-based 3D detection (including multi-modality methods) on Lyft is almost the same as nuScenes as below.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+Similar to nuScenes, models on Lyft also need the `'LoadPointsFromMultiSweeps'` pipeline to load point clouds from consecutive frames.
+In addition, considering the intensity of LiDAR points collected by Lyft is invalid, we also set the `use_dim` in `'LoadPointsFromMultiSweeps'` to `[0, 1, 2, 4]` by default,
+where the first 3 dimensions refer to point coordinates, and the last refers to timestamp differences.
+
+## Evaluation
+
+An example to evaluate PointPillars with 8 GPUs with Lyft metrics is as follows:
+
+```shell
+bash ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py checkpoints/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818-fc6904c3.pth 8
+```
+
+## Metrics
+
+Lyft proposes a more strict metric for evaluating the predicted 3D bounding boxes.
+The basic criteria to judge whether a predicted box is positive or not is the same as KITTI, i.e. the 3D Intersection over Union (IoU).
+However, it adopts a way similar to COCO to compute the mean average precision (mAP) -- compute the average precision under different thresholds of 3D IoU from 0.5-0.95.
+Actually, overlap more than 0.7 3D IoU is a quite strict criterion for 3D detection methods, so the overall performance seems a little low.
+The imbalance of annotations for different categories is another important reason for the finally lower results compared to other datasets.
+Please refer to its [official website](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/overview/evaluation) for more details about the definition of this metric.
+
+We employ this official method for evaluation on Lyft. An example of printed evaluation results is as follows:
+
+```
++mAPs@0.5:0.95------+--------------+
+| class             | mAP@0.5:0.95 |
++-------------------+--------------+
+| animal            | 0.0          |
+| bicycle           | 0.099        |
+| bus               | 0.177        |
+| car               | 0.422        |
+| emergency_vehicle | 0.0          |
+| motorcycle        | 0.049        |
+| other_vehicle     | 0.359        |
+| pedestrian        | 0.066        |
+| truck             | 0.176        |
+| Overall           | 0.15         |
++-------------------+--------------+
+```
+
+## Testing and make a submission
+
+An example to test PointPillars on Lyft with 8 GPUs and generate a submission to the leaderboard is as follows.
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py work_dirs/pp-lyft/latest.pth 8 --cfg-options test_evaluator.jsonfile_prefix=work_dirs/pp-lyft/results_challenge  test_evaluator.csv_savepath=results/pp-lyft/results_challenge.csv
+```
+
+After generating the `work_dirs/pp-lyft/results_challenge.csv`, you can submit it to the Kaggle evaluation server. Please refer to the [official website](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles) for more information.
+
+We can also visualize the prediction results with our developed visualization tools. Please refer to the [visualization doc](https://mmdetection3d.readthedocs.io/en/latest/useful_tools.html#visualization) for more details.
diff --git a/mmde/docs/en/advanced_guides/datasets/nuscenes.md b/mmde/docs/en/advanced_guides/datasets/nuscenes.md
new file mode 100644
index 0000000000000000000000000000000000000000..bdda34a4fa703629fb2d75aa5b98ef98993382b8
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/datasets/nuscenes.md
@@ -0,0 +1,313 @@
+# NuScenes Dataset
+
+This page provides specific tutorials about the usage of MMDetection3D for nuScenes dataset.
+
+## Before Preparation
+
+You can download nuScenes 3D detection `Full dataset (v1.0)` [HERE](https://www.nuscenes.org/download) and unzip all zip files.
+
+If you want to implement 3D semantic segmentation task, you need to additionally download the `nuScenes-lidarseg` data annotation and place the extracted files in the nuScenes corresponding folder.
+
+**Note**: `v1.0trainval(test)/categroy.json` in nuScenes-lidarseg will replace the original `v1.0trainval(test)/categroy.json` of the Full dataset (v1.0), but will not affect the 3D object detection task.
+
+Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── lidarseg (optional)
+│   │   ├── v1.0-test
+|   |   ├── v1.0-trainval
+```
+
+## Dataset Preparation
+
+We typically need to organize the useful data information with a `.pkl` file in a specific style.
+To prepare these files for nuScenes, run the following command:
+
+```bash
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
+```
+
+The folder structure after processing should be as below.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── lidarseg (optional)
+│   │   ├── v1.0-test
+|   |   ├── v1.0-trainval
+│   │   ├── nuscenes_database
+│   │   ├── nuscenes_infos_train.pkl
+│   │   ├── nuscenes_infos_val.pkl
+│   │   ├── nuscenes_infos_test.pkl
+│   │   ├── nuscenes_dbinfos_train.pkl
+```
+
+- `nuscenes_database/xxxxx.bin`: point cloud data included in each 3D bounding box of the training dataset
+- `nuscenes_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
+  `metainfo` contains the basic information for the dataset itself, such as `categories`, `dataset` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['token'\]: Sample data token.
+  - info\['timestamp'\]: Timestamp of the sample data.
+  - info\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+    - info\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle. (4x4 list)
+  - info\['lidar_sweeps'\]: A list contains sweeps information (The intermediate lidar frames without annotations)
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['data_path'\]: The lidar data path of i-th sweep.
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar2ego'\]: The transformation matrix from this lidar sensor to ego vehicle. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['lidar2sensor'\]: The transformation matrix from the main lidar sensor to the current sensor (for collecting the sweep data). (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+    - info\['lidar_sweeps'\]\[i\]\['sample_data_token'\]: The sweep sample data token.
+  - info\['images'\]: A dict contains six keys corresponding to each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`. Each dict contains all data information related to  corresponding camera.
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points to each image plane. (3x3 list)
+    - info\['images'\]\['CAM_XXX'\]\['sample_data_token'\]: Sample data token of image.
+    - info\['images'\]\['CAM_XXX'\]\['timestamp'\]: Timestamp of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2ego'\]: The transformation matrix from this camera sensor to ego vehicle. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from lidar sensor to this camera. (4x4 list)
+  - info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, w, h, yaw) order.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: A int indicate the label of instance and the -1 indicate ignore.
+    - info\['instances'\]\[i\]\['velocity'\]: Velocities of 3D bounding boxes (no vertical measurements due to inaccuracy), a list has shape (2.).
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: Number of lidar points included in each 3D bounding box.
+    - info\['instances'\]\[i\]\['num_radar_pts'\]: Number of radar points included in each 3D bounding box.
+    - info\['instances'\]\[i\]\['bbox_3d_isvalid'\]: Whether each bounding box is valid. In general, we only take the 3D boxes that include at least one lidar or radar point as valid boxes.
+  - info\['cam_instances'\]: It is a dict containing keys `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`. For vision-based 3D object detection task, we split 3D annotations of the whole scenes according to the camera they belong to. For the i-th instance:
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D bounding box annotation (exterior rectangle of the projected 3D box), a list arrange as \[x1, y1, x2, y2\].
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: Projected center location on the image, a list has shape (2,), .
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: The depth of projected center.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['velocity'\]: Velocities of 3D bounding boxes (no vertical measurements due to inaccuracy), a list has shape (2,).
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['attr_label'\]: The attr label of instance. We maintain a default attribute collection and mapping for attribute classification.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
+  - info\['pts_semantic_mask_path'\]：The filename of the lidar point cloud semantic segmentation annotation.
+
+Note:
+
+1. The differences between `bbox_3d` in `instances` and that in `cam_instances`.
+   Both `bbox_3d` have been converted to MMDet3D coordinate system, but `bboxes_3d` in `instances` is in LiDAR coordinate format and `bboxes_3d` in `cam_instances` is in Camera coordinate format. Mind the difference between them in 3D Box representation ('l, w, h' and 'l, h, w').
+
+2. Here we only explain the data recorded in the training info files. The same applies to validation and testing set (the `.pkl` file of test set does not contains `instances` and `cam_instances`).
+
+The core function to get `nuscenes_infos_xxx.pkl` is  [\_fill_trainval_infos](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/nuscenes_converter.py#L146).
+Please refer to [nuscenes_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/nuscenes_converter.py) for more details.
+
+## Training pipeline
+
+### LiDAR-Based Methods
+
+A typical training pipeline of LiDAR-based 3D detection (including multi-modality methods) on nuScenes is as below.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+Compared to general cases, nuScenes has a specific `'LoadPointsFromMultiSweeps'` pipeline to load point clouds from consecutive frames. This is a common practice used in this setting.
+Please refer to the nuScenes [original paper](https://arxiv.org/abs/1903.11027) for more details.
+The default `use_dim` in `'LoadPointsFromMultiSweeps'` is `[0, 1, 2, 4]`, where the first 3 dimensions refer to point coordinates and the last refers to timestamp differences.
+Intensity is not used by default due to its yielded noise when concatenating the points from different frames.
+
+### Vision-Based Methods
+
+#### Monocular-based
+
+In the NuScenes dataset, for multi-view images, this paradigm usually involves detecting and outputting 3D object detection results separately for each image, and then obtaining the final detection results through post-processing (such as NMS). Essentially, it directly extends monocular 3D detection to multi-view settings. A typical training pipeline of image-based monocular 3D detection on nuScenes is as below.
+
+```python
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D'),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+```
+
+It follows the general pipeline of 2D detection while differs in some details:
+
+- It uses monocular pipelines to load images, which includes additional required information like camera intrinsics.
+- It needs to load 3D annotations.
+- Some data augmentation techniques need to be adjusted, such as `RandomFlip3D`.
+  Currently we do not support more augmentation methods, because how to transfer and apply other techniques is still under explored.
+
+#### BEV-based
+
+BEV, Bird's-Eye-View, is another popular 3D detection paradigm. It directly takes multi-view images to perform 3D detection, for nuScenes, they are `CAM_FRONT`, `CAM_FRONT_LEFT`, `CAM_FRONT_RIGHT`, `CAM_BACK`, `CAM_BACK_LEFT` and `CAM_BACK_RIGHT`. A basic training pipeline of bev-based 3D detection on nuScenes is as below.
+
+```python
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+train_transforms = [
+    dict(type='PhotoMetricDistortion3D'),
+    dict(
+        type='RandomResize3D',
+        scale=(1600, 900),
+        ratio_range=(1., 1.),
+        keep_ratio=True)
+]
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles',
+         to_float32=True,
+         num_views=6, ),
+    dict(type='LoadAnnotations3D',
+         with_bbox_3d=True,
+         with_label_3d=True,
+         with_attr_label=False),
+    # optional, data augmentation
+    dict(type='MultiViewWrapper', transforms=train_transforms),
+    # optional, filter object within specific point cloud range
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    # optional, filter object of specific classes
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+To load multiple view of images, a little modification should be made to the dataset.
+
+```python
+data_prefix = dict(
+    CAM_FRONT='samples/CAM_FRONT',
+    CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+    CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+    CAM_BACK='samples/CAM_BACK',
+    CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+    CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
+)
+train_dataloader = dict(
+    batch_size=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type="NuScenesDataset",
+        data_root="./data/nuScenes",
+        ann_file="nuscenes_infos_train.pkl",
+        data_prefix=data_prefix,
+        modality=dict(use_camera=True, use_lidar=False, ),
+        pipeline=train_pipeline,
+        test_mode=False, )
+)
+```
+
+## Evaluation
+
+An example to evaluate PointPillars with 8 GPUs with nuScenes metrics is as follows.
+
+```shell
+bash ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py checkpoints/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth 8
+```
+
+## Metrics
+
+NuScenes proposes a comprehensive metric, namely nuScenes detection score (NDS), to evaluate different methods and set up the benchmark.
+It consists of mean Average Precision (mAP), Average Translation Error (ATE), Average Scale Error (ASE), Average Orientation Error (AOE), Average Velocity Error (AVE) and Average Attribute Error (AAE).
+Please refer to its [official website](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) for more details.
+
+We also adopt this approach for evaluation on nuScenes. An example of printed evaluation results is as follows:
+
+```
+mAP: 0.3197
+mATE: 0.7595
+mASE: 0.2700
+mAOE: 0.4918
+mAVE: 1.3307
+mAAE: 0.1724
+NDS: 0.3905
+Eval time: 170.8s
+
+Per-class results:
+Object Class    AP      ATE     ASE     AOE     AVE     AAE
+car     0.503   0.577   0.152   0.111   2.096   0.136
+truck   0.223   0.857   0.224   0.220   1.389   0.179
+bus     0.294   0.855   0.204   0.190   2.689   0.283
+trailer 0.081   1.094   0.243   0.553   0.742   0.167
+construction_vehicle    0.058   1.017   0.450   1.019   0.137   0.341
+pedestrian      0.392   0.687   0.284   0.694   0.876   0.158
+motorcycle      0.317   0.737   0.265   0.580   2.033   0.104
+bicycle 0.308   0.704   0.299   0.892   0.683   0.010
+traffic_cone    0.555   0.486   0.309   nan     nan     nan
+barrier 0.466   0.581   0.269   0.169   nan     nan
+```
+
+## Testing and make a submission
+
+An example to test PointPillars on nuScenes with 8 GPUs and generate a submission to the leaderboard is as follows.
+
+You should modify the `jsonfile_prefix` in the `test_evaluator` of corresponding configuration. For example, adding `test_evaluator = dict(type='NuScenesMetric', jsonfile_prefix='work_dirs/pp-nus/results_eval.json')` or using `--cfg-options "test_evaluator.jsonfile_prefix=work_dirs/pp-nus/results_eval.json)` after the test command.
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py work_dirs/pp-nus/latest.pth 8 --cfg-options 'test_evaluator.jsonfile_prefix=work_dirs/pp-nus/results_eval'
+```
+
+Note that the testing info should be changed to that for testing set instead of validation set [here](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/nus-3d.py#L132).
+
+After generating the `work_dirs/pp-nus/results_eval.json`, you can compress it and submit it to nuScenes benchmark. Please refer to the [nuScenes official website](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) for more information.
+
+We can also visualize the prediction results with our developed visualization tools. Please refer to the [visualization doc](https://mmdetection3d.readthedocs.io/en/latest/useful_tools.html#visualization) for more details.
+
+## Notes
+
+### Transformation between `NuScenesBox` and our `CameraInstanceBoxes`.
+
+In general, the main difference of `NuScenesBox` and our `CameraInstanceBoxes` is mainly reflected in the yaw definition. `NuScenesBox` defines the rotation with a quaternion or three Euler angles while ours only defines one yaw angle due to the practical scenario. It requires us to add some additional rotations manually in the pre-processing and post-processing, such as [here](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L673).
+
+In addition, please note that the definition of corners and locations are detached in the `NuScenesBox`. For example, in monocular 3D detection, the definition of the box location is in its camera coordinate (see its official [illustration](https://www.nuscenes.org/nuscenes#data-collection) for car setup), which is consistent with [ours](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/cam_box3d.py). In contrast, its corners are defined with the [convention](https://github.com/nutonomy/nuscenes-devkit/blob/02e9200218977193a1058dd7234f935834378319/python-sdk/nuscenes/utils/data_classes.py#L527) "x points forward, y to the left, z up". It results in different philosophy of dimension and rotation definitions from our `CameraInstanceBoxes`. An example to remove similar hacks is PR [#744](https://github.com/open-mmlab/mmdetection3d/pull/744). The same problem also exists in the LiDAR system. To deal with them, we typically add some transformation in the pre-processing and post-processing to guarantee the box will be in our coordinate system during the entire training and inference procedure.
diff --git a/mmde/docs/en/advanced_guides/datasets/s3dis.md b/mmde/docs/en/advanced_guides/datasets/s3dis.md
new file mode 100644
index 0000000000000000000000000000000000000000..0ca60ae20900955828e8611e63204447857dbcbd
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/datasets/s3dis.md
@@ -0,0 +1,262 @@
+# S3DIS Dataset
+
+## Dataset preparation
+
+For the overall process, please refer to the [README](https://github.com/open-mmlab/mmdetection3d/blob/master/data/s3dis/README.md/) page for S3DIS.
+
+### Export S3DIS data
+
+By exporting S3DIS data, we load the raw point cloud data and generate the relevant annotations including semantic labels and instance labels.
+
+The directory structure before exporting should be as below:
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── s3dis
+│   │   ├── meta_data
+│   │   ├── Stanford3dDataset_v1.2_Aligned_Version
+│   │   │   ├── Area_1
+│   │   │   │   ├── conferenceRoom_1
+│   │   │   │   ├── office_1
+│   │   │   │   ├── ...
+│   │   │   ├── Area_2
+│   │   │   ├── Area_3
+│   │   │   ├── Area_4
+│   │   │   ├── Area_5
+│   │   │   ├── Area_6
+│   │   ├── indoor3d_util.py
+│   │   ├── collect_indoor3d_data.py
+│   │   ├── README.md
+```
+
+Under folder `Stanford3dDataset_v1.2_Aligned_Version`, the rooms are spilted into 6 areas. We use 5 areas for training and 1 for evaluation (typically `Area_5`). Under the directory of each area, there are folders in which raw point cloud data and relevant annotations are saved. For instance, under folder `Area_1/office_1` the files are as below:
+
+- `office_1.txt`: A txt file storing coordinates and colors of each point in the raw point cloud data.
+
+- `Annotations/`: This folder contains txt files for different object instances. Each txt file represents one instance, e.g.
+
+  - `chair_1.txt`: A txt file storing raw point cloud data of one chair in this room.
+
+  If we concat all the txt files under `Annotations/`, we will get the same point cloud as denoted by `office_1.txt`.
+
+Export S3DIS data by running `python collect_indoor3d_data.py`. The main steps include:
+
+- Export original txt files to point cloud, instance label and semantic label.
+- Save point cloud data and relevant annotation files.
+
+And the core function `export` in `indoor3d_util.py` is as follows:
+
+```python
+def export(anno_path, out_filename):
+    """Convert original dataset files to points, instance mask and semantic
+    mask files. We aggregated all the points from each instance in the room.
+
+    Args:
+        anno_path (str): path to annotations. e.g. Area_1/office_2/Annotations/
+        out_filename (str): path to save collected points and labels.
+        file_format (str): txt or numpy, determines what file format to save.
+
+    Note:
+        the points are shifted before save, the most negative point is now
+            at origin.
+    """
+    points_list = []
+    ins_idx = 1  # instance ids should be indexed from 1, so 0 is unannotated
+
+    # an example of `anno_path`: Area_1/office_1/Annotations
+    # which contains all object instances in this room as txt files
+    for f in glob.glob(osp.join(anno_path, '*.txt')):
+        # get class name of this instance
+        one_class = osp.basename(f).split('_')[0]
+        if one_class not in class_names:  # some rooms have 'staris' class
+            one_class = 'clutter'
+        points = np.loadtxt(f)
+        labels = np.ones((points.shape[0], 1)) * class2label[one_class]
+        ins_labels = np.ones((points.shape[0], 1)) * ins_idx
+        ins_idx += 1
+        points_list.append(np.concatenate([points, labels, ins_labels], 1))
+
+    data_label = np.concatenate(points_list, 0)  # [N, 8], (pts, rgb, sem, ins)
+    # align point cloud to the origin
+    xyz_min = np.amin(data_label, axis=0)[0:3]
+    data_label[:, 0:3] -= xyz_min
+
+    np.save(f'{out_filename}_point.npy', data_label[:, :6].astype(np.float32))
+    np.save(f'{out_filename}_sem_label.npy', data_label[:, 6].astype(np.int64))
+    np.save(f'{out_filename}_ins_label.npy', data_label[:, 7].astype(np.int64))
+
+```
+
+where we load and concatenate all the point cloud instances under `Annotations/` to form raw point cloud and generate semantic/instance labels. After exporting each room, the point cloud data, semantic labels and instance labels should be saved in `.npy` files.
+
+### Create dataset
+
+```shell
+python tools/create_data.py s3dis --root-path ./data/s3dis \
+--out-dir ./data/s3dis --extra-tag s3dis
+```
+
+The above exported point cloud files, semantic label files and instance label files are further saved in `.bin` format. Meanwhile `.pkl` info files are also generated for each area.
+
+The directory structure after process should be as below:
+
+```
+s3dis
+├── meta_data
+├── indoor3d_util.py
+├── collect_indoor3d_data.py
+├── README.md
+├── Stanford3dDataset_v1.2_Aligned_Version
+├── s3dis_data
+├── points
+│   ├── xxxxx.bin
+├── instance_mask
+│   ├── xxxxx.bin
+├── semantic_mask
+│   ├── xxxxx.bin
+├── seg_info
+│   ├── Area_1_label_weight.npy
+│   ├── Area_1_resampled_scene_idxs.npy
+│   ├── Area_2_label_weight.npy
+│   ├── Area_2_resampled_scene_idxs.npy
+│   ├── Area_3_label_weight.npy
+│   ├── Area_3_resampled_scene_idxs.npy
+│   ├── Area_4_label_weight.npy
+│   ├── Area_4_resampled_scene_idxs.npy
+│   ├── Area_5_label_weight.npy
+│   ├── Area_5_resampled_scene_idxs.npy
+│   ├── Area_6_label_weight.npy
+│   ├── Area_6_resampled_scene_idxs.npy
+├── s3dis_infos_Area_1.pkl
+├── s3dis_infos_Area_2.pkl
+├── s3dis_infos_Area_3.pkl
+├── s3dis_infos_Area_4.pkl
+├── s3dis_infos_Area_5.pkl
+├── s3dis_infos_Area_6.pkl
+```
+
+- `points/xxxxx.bin`: The exported point cloud data.
+- `instance_mask/xxxxx.bin`: The instance label for each point, value range: \[0, ${NUM_INSTANCES}\], 0: unannotated.
+- `semantic_mask/xxxxx.bin`: The semantic label for each point, value range: \[0, 12\].
+- `s3dis_infos_Area_1.pkl`: Area 1 data infos, the detailed info of each room is as follows:
+  - info\['point_cloud'\]: {'num_features': 6, 'lidar_idx': sample_idx}.
+  - info\['pts_path'\]: The path of `points/xxxxx.bin`.
+  - info\['pts_instance_mask_path'\]: The path of `instance_mask/xxxxx.bin`.
+  - info\['pts_semantic_mask_path'\]: The path of `semantic_mask/xxxxx.bin`.
+- `seg_info`: The generated infos to support semantic segmentation model training.
+  - `Area_1_label_weight.npy`: Weighting factor for each semantic class. Since the number of points in different classes varies greatly, it's a common practice to use label re-weighting to get a better performance.
+  - `Area_1_resampled_scene_idxs.npy`: Re-sampling index for each scene. Different rooms will be sampled multiple times according to their number of points to balance training data.
+
+## Training pipeline
+
+A typical training pipeline of S3DIS for 3D semantic segmentation is as below.
+
+```python
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+num_points = 4096
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.0,
+        ignore_index=None,
+        use_normalized_coord=True,
+        enlarge_size=None,
+        min_unique_num=num_points // 4,
+        eps=0.0),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-3.141592653589793, 3.141592653589793],  # [-pi, pi]
+        scale_ratio_range=[0.8, 1.2],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomJitterPoints',
+        jitter_std=[0.01, 0.01, 0.01],
+        clip_range=[-0.05, 0.05]),
+    dict(type='RandomDropPointsColor', drop_ratio=0.2),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+```
+
+- `PointSegClassMapping`: Only the valid category ids will be mapped to class label ids like \[0, 13) during training. Other class ids will be converted to `ignore_index` which equals to `13`.
+- `IndoorPatchPointSample`: Crop a patch containing a fixed number of points from input point cloud. `block_size` indicates the size of the cropped block, typically `1.0` for S3DIS.
+- `NormalizePointsColor`: Normalize the RGB color values of input point cloud by dividing `255`.
+- Data augmentation:
+  - `GlobalRotScaleTrans`: randomly rotate and scale input point cloud.
+  - `RandomJitterPoints`: randomly jitter point cloud by adding different noise vector to each point.
+  - `RandomDropPointsColor`: set the colors of point cloud to all zeros by a probability `drop_ratio`.
+
+## Metrics
+
+Typically mean intersection over union (mIoU) is used for evaluation on S3DIS. In detail, we first compute IoU for multiple classes and then average them to get mIoU, please refer to [seg_eval.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py).
+
+As introduced in section `Export S3DIS data`, S3DIS trains on 5 areas and evaluates on the remaining 1 area. But there are also other area split schemes in different papers.
+To enable flexible combination of train-val splits, we use sub-dataset to represent one area, and concatenate them to form a larger training set. An example of training on area 1, 2, 3, 4, 6 and evaluating on area 5 is shown as below:
+
+```python
+dataset_type = 'S3DISSegDataset'
+data_root = './data/s3dis/'
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=train_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ],
+        test_mode=False))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy',
+        test_mode=True))
+val_dataloader = test_dataloader
+```
+
+where we specify the areas used for training/validation by setting `ann_files` and `scene_idxs` with lists that include corresponding paths. The train-val split can be simply modified via changing the `train_area` and `test_area` variables.
diff --git a/mmde/docs/en/advanced_guides/datasets/scannet.md b/mmde/docs/en/advanced_guides/datasets/scannet.md
new file mode 100644
index 0000000000000000000000000000000000000000..dc6a1b9bddd2273fe0d201b2929578a6ad339b40
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/datasets/scannet.md
@@ -0,0 +1,350 @@
+# ScanNet Dataset
+
+MMDetection3D supports LiDAR-based detection and segmentation on ScanNet dataset. This page provides specific tutorials about the usage.
+
+## Dataset preparation
+
+For the overall process, please refer to the [README](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/data/scannet/README.md) page for ScanNet.
+
+### Export ScanNet point cloud data
+
+By exporting ScanNet data, we load the raw point cloud data and generate the relevant annotations including semantic labels, instance labels and ground truth bounding boxes.
+
+```shell
+python batch_load_scannet_data.py
+```
+
+The directory structure before data preparation should be as below
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── scannet
+│   │   ├── meta_data
+│   │   ├── scans
+│   │   │   ├── scenexxxx_xx
+│   │   ├── batch_load_scannet_data.py
+│   │   ├── load_scannet_data.py
+│   │   ├── scannet_utils.py
+│   │   ├── README.md
+```
+
+Under folder `scans` there are overall 1201 train and 312 validation folders in which raw point cloud data and relevant annotations are saved. For instance, under folder `scene0001_01` the files are as below:
+
+- `scene0001_01_vh_clean_2.ply`: Mesh file storing coordinates and colors of each vertex. The mesh's vertices are taken as raw point cloud data.
+- `scene0001_01.aggregation.json`: Aggregation file including object ID, segments ID and label.
+- `scene0001_01_vh_clean_2.0.010000.segs.json`: Segmentation file including segments ID and vertex.
+- `scene0001_01.txt`: Meta file including axis-aligned matrix, etc.
+- `scene0001_01_vh_clean_2.labels.ply`: Annotation file containing the category of each vertex.
+
+The procedure of exporting ScanNet data by running `python batch_load_scannet_data.py` mainly includes the following 3 steps:
+
+- Export original files to point cloud, instance label, semantic label and bounding box file.
+- Downsample raw point cloud and filter invalid classes.
+- Save point cloud data and relevant annotation files.
+
+And the core function `export` in `load_scannet_data.py` is as follows:
+
+```python
+def export(mesh_file,
+           agg_file,
+           seg_file,
+           meta_file,
+           label_map_file,
+           output_file=None,
+           test_mode=False):
+
+    # label map file: ./data/scannet/meta_data/scannetv2-labels.combined.tsv
+    # the various label standards in the label map file, e.g. 'nyu40id'
+    label_map = scannet_utils.read_label_mapping(
+        label_map_file, label_from='raw_category', label_to='nyu40id')
+    # load raw point cloud data, 6-dims feature: XYZRGB
+    mesh_vertices = scannet_utils.read_mesh_vertices_rgb(mesh_file)
+
+    # Load scene axis alignment matrix: a 4x4 transformation matrix
+    # transform raw points in sensor coordinate system to a coordinate system
+    # which is axis-aligned with the length/width of the room
+    lines = open(meta_file).readlines()
+    # test set data doesn't have align_matrix
+    axis_align_matrix = np.eye(4)
+    for line in lines:
+        if 'axisAlignment' in line:
+            axis_align_matrix = [
+                float(x)
+                for x in line.rstrip().strip('axisAlignment = ').split(' ')
+            ]
+            break
+    axis_align_matrix = np.array(axis_align_matrix).reshape((4, 4))
+
+    # perform global alignment of mesh vertices
+    pts = np.ones((mesh_vertices.shape[0], 4))
+    # raw point cloud in homogeneous coordinates, each row: [x, y, z, 1]
+    pts[:, 0:3] = mesh_vertices[:, 0:3]
+    # transform raw mesh vertices to aligned mesh vertices
+    pts = np.dot(pts, axis_align_matrix.transpose())  # Nx4
+    aligned_mesh_vertices = np.concatenate([pts[:, 0:3], mesh_vertices[:, 3:]],
+                                           axis=1)
+
+    # Load semantic and instance labels
+    if not test_mode:
+        # each object has one semantic label and consists of several segments
+        object_id_to_segs, label_to_segs = read_aggregation(agg_file)
+        # many points may belong to the same segment
+        seg_to_verts, num_verts = read_segmentation(seg_file)
+        label_ids = np.zeros(shape=(num_verts), dtype=np.uint32)
+        object_id_to_label_id = {}
+        for label, segs in label_to_segs.items():
+            label_id = label_map[label]
+            for seg in segs:
+                verts = seg_to_verts[seg]
+                # each point has one semantic label
+                label_ids[verts] = label_id
+        instance_ids = np.zeros(
+            shape=(num_verts), dtype=np.uint32)  # 0: unannotated
+        for object_id, segs in object_id_to_segs.items():
+            for seg in segs:
+                verts = seg_to_verts[seg]
+                # object_id is 1-indexed, i.e. 1,2,3,.,,,.NUM_INSTANCES
+                # each point belongs to one object
+                instance_ids[verts] = object_id
+                if object_id not in object_id_to_label_id:
+                    object_id_to_label_id[object_id] = label_ids[verts][0]
+        # bbox format is [x, y, z, x_size, y_size, z_size, label_id]
+        # [x, y, z] is gravity center of bbox, [x_size, y_size, z_size] is axis-aligned
+        # [label_id] is semantic label id in 'nyu40id' standard
+        # Note: since 3D bbox is axis-aligned, the yaw is 0.
+        unaligned_bboxes = extract_bbox(mesh_vertices, object_id_to_segs,
+                                        object_id_to_label_id, instance_ids)
+        aligned_bboxes = extract_bbox(aligned_mesh_vertices, object_id_to_segs,
+                                      object_id_to_label_id, instance_ids)
+    ...
+
+    return mesh_vertices, label_ids, instance_ids, unaligned_bboxes, \
+        aligned_bboxes, object_id_to_label_id, axis_align_matrix
+
+```
+
+After exporting each scan, the raw point cloud could be downsampled, e.g. to 50000, if the number of points is too large (the raw point cloud won't be downsampled if it's also used in 3D semantic segmentation task). In addition, invalid semantic labels outside of `nyu40id` standard or optional `DONOT CARE` classes should be filtered. Finally, the point cloud data, semantic labels, instance labels and ground truth bounding boxes should be saved in `.npy` files.
+
+### Export ScanNet RGB data (optional)
+
+By exporting ScanNet RGB data, for each scene we load a set of RGB images with corresponding 4x4 pose matrices, and a single 4x4 camera intrinsic matrix. Note, that this step is optional and can be skipped if multi-view detection is not planned to use.
+
+```shell
+python extract_posed_images.py
+```
+
+Each of 1201 train, 312 validation and 100 test scenes contains a single `.sens` file. For instance, for scene `0001_01` we have `data/scannet/scans/scene0001_01/0001_01.sens`. For this scene all images and poses are extracted to `data/scannet/posed_images/scene0001_01`. Specifically, there will be 300 image files xxxxx.jpg, 300 camera pose files xxxxx.txt and a single `intrinsic.txt` file. Typically, single scene contains several thousand images. By default, we extract only 300 of them with resulting space occupation of \<100 Gb. To extract more images, use `--max-images-per-scene` parameter.
+
+### Create dataset
+
+```shell
+python tools/create_data.py scannet --root-path ./data/scannet \
+--out-dir ./data/scannet --extra-tag scannet
+```
+
+The above exported point cloud file, semantic label file and instance label file are further saved in `.bin` format. Meanwhile `.pkl` info files are also generated for train or validation. The core function `process_single_scene` of getting data infos is as follows.
+
+```python
+def process_single_scene(sample_idx):
+
+    # save point cloud, instance label and semantic label in .bin file respectively, get info['pts_path'], info['pts_instance_mask_path'] and info['pts_semantic_mask_path']
+    ...
+
+    # get annotations
+    if has_label:
+        annotations = {}
+        # box is of shape [k, 6 + class]
+        aligned_box_label = self.get_aligned_box_label(sample_idx)
+        unaligned_box_label = self.get_unaligned_box_label(sample_idx)
+        annotations['gt_num'] = aligned_box_label.shape[0]
+        if annotations['gt_num'] != 0:
+            aligned_box = aligned_box_label[:, :-1]  # k, 6
+            unaligned_box = unaligned_box_label[:, :-1]
+            classes = aligned_box_label[:, -1]  # k
+            annotations['name'] = np.array([
+                self.label2cat[self.cat_ids2class[classes[i]]]
+                for i in range(annotations['gt_num'])
+            ])
+            # default names are given to aligned bbox for compatibility
+            # we also save unaligned bbox info with marked names
+            annotations['location'] = aligned_box[:, :3]
+            annotations['dimensions'] = aligned_box[:, 3:6]
+            annotations['gt_boxes_upright_depth'] = aligned_box
+            annotations['unaligned_location'] = unaligned_box[:, :3]
+            annotations['unaligned_dimensions'] = unaligned_box[:, 3:6]
+            annotations[
+                'unaligned_gt_boxes_upright_depth'] = unaligned_box
+            annotations['index'] = np.arange(
+                annotations['gt_num'], dtype=np.int32)
+            annotations['class'] = np.array([
+                self.cat_ids2class[classes[i]]
+                for i in range(annotations['gt_num'])
+            ])
+        axis_align_matrix = self.get_axis_align_matrix(sample_idx)
+        annotations['axis_align_matrix'] = axis_align_matrix  # 4x4
+        info['annos'] = annotations
+    return info
+```
+
+The directory structure after process should be as below:
+
+```
+scannet
+├── meta_data
+├── batch_load_scannet_data.py
+├── load_scannet_data.py
+├── scannet_utils.py
+├── README.md
+├── scans
+├── scans_test
+├── scannet_instance_data
+├── points
+│   ├── xxxxx.bin
+├── instance_mask
+│   ├── xxxxx.bin
+├── semantic_mask
+│   ├── xxxxx.bin
+├── seg_info
+│   ├── train_label_weight.npy
+│   ├── train_resampled_scene_idxs.npy
+│   ├── val_label_weight.npy
+│   ├── val_resampled_scene_idxs.npy
+├── posed_images
+│   ├── scenexxxx_xx
+│   │   ├── xxxxxx.txt
+│   │   ├── xxxxxx.jpg
+│   │   ├── intrinsic.txt
+├── scannet_infos_train.pkl
+├── scannet_infos_val.pkl
+├── scannet_infos_test.pkl
+```
+
+- `points/xxxxx.bin`: The `axis-unaligned` point cloud data after downsample. Since ScanNet 3D detection task takes axis-aligned point clouds as input, while ScanNet 3D semantic segmentation task takes unaligned points, we choose to store unaligned points and their axis-align transform matrix. Note: the points would be axis-aligned in pre-processing pipeline [`GlobalAlignment`](https://github.com/open-mmlab/mmdetection3d/blob/9f0b01caf6aefed861ef4c3eb197c09362d26b32/mmdet3d/datasets/pipelines/transforms_3d.py#L423) of 3D detection task.
+- `instance_mask/xxxxx.bin`: The instance label for each point, value range: \[0, NUM_INSTANCES\], 0: unannotated.
+- `semantic_mask/xxxxx.bin`: The semantic label for each point, value range: \[1, 40\], i.e. `nyu40id` standard. Note: the `nyu40id` ID will be mapped to train ID in train pipeline `PointSegClassMapping`.
+- `seg_info`: The generated infos to support semantic segmentation model training.
+  - `train_label_weight.npy`: Weighting factor for each semantic class. Since the number of points in different classes varies greatly, it's a common practice to use label re-weighting to get a better performance.
+  - `train_resampled_scene_idxs.npy`: Re-sampling index for each scene. Different rooms will be sampled multiple times according to their number of points to balance training data.
+- `posed_images/scenexxxx_xx`: The set of `.jpg` images with `.txt` 4x4 poses and the single `.txt` file with camera intrinsic matrix.
+- `scannet_infos_train.pkl`: The train data infos, the detailed info of each scan is as follows:
+  - info\['lidar_points'\]: A dict containing all information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+    - info\['lidar_points'\]\['axis_align_matrix'\]: The transformation matrix to align the axis.
+  - info\['pts_semantic_mask_path'\]: The filename of the semantic mask annotation.
+  - info\['pts_instance_mask_path'\]: The filename of the instance mask annotation.
+  - info\['instances'\]: A list of dict contains all annotations, each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 6 numbers representing the axis-aligned 3D bounding box of the instance in depth coordinate system, in (x, y, z, l, w, h) order.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: The label of each 3d bounding boxes.
+- `scannet_infos_val.pkl`: The val data infos, which shares the same format as `scannet_infos_train.pkl`.
+- `scannet_infos_test.pkl`: The test data infos, which almost shares the same format as `scannet_infos_train.pkl` except for the lack of annotation.
+
+## Training pipeline
+
+A typical training pipeline of ScanNet for 3D detection is as follows.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=40000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0],
+        shift_height=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+```
+
+- `GlobalAlignment`: The previous point cloud would be axis-aligned using the axis-aligned matrix.
+- `PointSegClassMapping`: Only the valid category IDs will be mapped to class label IDs like \[0, 18) during training.
+- Data augmentation:
+  - `PointSample`: downsample the input point cloud.
+  - `RandomFlip3D`: randomly flip the input point cloud horizontally or vertically.
+  - `GlobalRotScaleTrans`: rotate the input point cloud, usually in the range of \[-5, 5\] (degrees) for ScanNet; then scale the input point cloud, usually by 1.0 for ScanNet (which means no scaling); finally translate the input point cloud, usually by 0 for ScanNet  (which means no translation).
+
+A typical training pipeline of ScanNet for 3D semantic segmentation is as below:
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.5,
+        ignore_index=len(class_names),
+        use_normalized_coord=False,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+```
+
+- `PointSegClassMapping`: Only the valid category ids will be mapped to class label ids like \[0, 20) during training. Other class ids will be converted to `ignore_index` which equals to `20`.
+- `IndoorPatchPointSample`: Crop a patch containing a fixed number of points from input point cloud. `block_size` indicates the size of the cropped block, typically `1.5` for ScanNet.
+- `NormalizePointsColor`: Normalize the RGB color values of input point cloud by dividing `255`.
+
+## Metrics
+
+- **Object Detection**: Typically mean Average Precision (mAP) is used for evaluation on ScanNet, e.g. `mAP@0.25` and `mAP@0.5`. In detail, a generic function to compute precision and recall for 3D object detection for multiple classes is called. Please refer to [indoor_eval](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/indoor_eval.py) for more details.
+
+  **Note**: As introduced in section `Export ScanNet data`, all ground truth 3D bounding box are axis-aligned, i.e. the yaw is zero. So the yaw target of network predicted 3D bounding box is also zero and axis-aligned 3D Non-Maximum Suppression (NMS), which is regardless of rotation, is adopted during post-processing .
+
+- **Semantic Segmentation**: Typically mean Intersection over Union (mIoU) is used for evaluation on ScanNet. In detail, we first compute IoU for multiple classes and then average them to get mIoU, please refer to [seg_eval](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py).
+
+## Testing and Making a Submission
+
+By default, our codebase evaluates semantic segmentation results on the validation set.
+If you would like to test the model performance on the online benchmark, add `--format-only` flag in the evaluation script and change `ann_file=data_root + 'scannet_infos_val.pkl'` to `ann_file=data_root + 'scannet_infos_test.pkl'` in the ScanNet dataset's [config](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/_base_/datasets/scannet-seg.py#L126). Remember to specify the `txt_prefix` as the directory to save the testing results.
+
+Taking PointNet++ (SSG) on ScanNet for example, the following command can be used to do inference on test set:
+
+```
+./tools/dist_test.sh configs/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet-seg.py \
+    work_dirs/pointnet2_ssg/latest.pth --format-only \
+    --eval-options txt_prefix=work_dirs/pointnet2_ssg/test_submission
+```
+
+After generating the results, you can basically compress the folder and upload to the [ScanNet evaluation server](http://kaldir.vc.in.tum.de/scannet_benchmark/semantic_label_3d).
diff --git a/mmde/docs/en/advanced_guides/datasets/semantickitti.md b/mmde/docs/en/advanced_guides/datasets/semantickitti.md
new file mode 100644
index 0000000000000000000000000000000000000000..b187e35f242512792f1c23c2cf711c685f297552
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/datasets/semantickitti.md
@@ -0,0 +1,127 @@
+# SemanticKITTI Dataset
+
+This page provides specific tutorials about the usage of MMDetection3D for SemanticKITTI dataset.
+
+## Prepare dataset
+
+You can download SemanticKITTI dataset [HERE](http://semantic-kitti.org/dataset.html#download) and unzip all zip files.
+
+Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── semantickitti
+│   │   ├── sequences
+│   │   │   ├── 00
+│   │   │   │   ├── labels
+│   │   │   │   ├── velodyne
+│   │   │   ├── 01
+│   │   │   ├── ..
+│   │   │   ├── 22
+```
+
+SemanticKITTI dataset contains 23 sequences, where \[0-7\], \[9-10\] are used as training set (about 19k training samples), sequence 8 as validation set (about 4k validation samples) and \[11-22\] as test set (about 20k test samples). Each sequence contains velodyne and labels folders for LIDAR point cloud data and segmentation annotations (where the high 16 bits store the instance segmentation annotations and the low 16 bits store the semantic segmentation annotations), respectively.
+
+### Create SemanticKITTI Dataset
+
+We support scripts that generate dataset information for training and testing. Create `.pkl` info by running:
+
+```bash
+python ./tools/create_data.py semantickitti --root-path ./data/semantickitti --out-dir ./data/semantickitti --extra-tag semantickitti
+```
+
+The folder structure after processing should be as below
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── semantickitti
+│   │   ├── sequences
+│   │   │   ├── 00
+│   │   │   │   ├── labels
+│   │   │   │   ├── velodyne
+│   │   │   ├── 01
+│   │   │   ├── ..
+│   │   │   ├── 22
+│   │   ├── semantickitti_infos_test.pkl
+│   │   ├── semantickitti_infos_train.pkl
+│   │   ├── semantickitti_infos_val.pkl
+```
+
+- `semantickitti_infos_train.pkl`: training dataset, a dict contains two keys: `metainfo` and `data_list`.
+  `metainfo` contains the basic information for the dataset itself, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
+  - info\['sample_id'\]: The index of this sample in the whole dataset.
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+  - info\['pts_semantic_mask_pth'\]: The path of 3D semantic segmentation annotation file.
+
+Please refer to [semantickitti_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/semantickitti_converter.py) and [update_infos_to_v2.py ](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/update_infos_to_v2.py) for more details.
+
+## Train pipeline
+
+A typical train pipeline of 3D segmentation on SemanticKITTI is as below:
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.1, 0.1, 0.1],
+    ),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+```
+
+- Data augmentation:
+  - `RandomFlip3D`: randomly flip input point cloud horizontally or vertically.
+  - `GlobalRotScaleTrans`: rotate/scale/transform input point cloud.
+
+## Evaluation
+
+An example to evaluate MinkUNet with 8 GPUs with semantickitti metrics is as follows:
+
+```shell
+bash tools/dist_test.sh configs/minkunet/minkunet_w32_8xb2-15e_semantickitti.py checkpoints/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth 8
+```
+
+## Metrics
+
+Typically mean intersection over union (mIoU) is used for evaluation on Semantickitti. In detail, we first compute IoU for multiple classes and then average them to get mIoU, please refer to [seg_eval.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py).
+
+An example of printed evaluation results is as follows:
+
+| classes | car    | bicycle | motorcycle | truck  | bus    | person | bicyclist | motorcyclist | road   | parking | sidewalk | other-ground | building | fence  | vegetation | trunck | terrian | pole   | traffic-sign | miou   | acc    | acc_cls |
+| ------- | ------ | ------- | ---------- | ------ | ------ | ------ | --------- | ------------ | ------ | ------- | -------- | ------------ | -------- | ------ | ---------- | ------ | ------- | ------ | ------------ | ------ | ------ | ------- |
+| results | 0.9687 | 0.1908  | 0.6313     | 0.8580 | 0.6359 | 0.6818 | 0.8444    | 0.0002       | 0.9353 | 0.4854  | 0.8106   | 0.0024       | 0.9050   | 0.6111 | 0.8822     | 0.6605 | 0.7493  | 0.6442 | 0.4837       | 0.6306 | 0.9202 | 0.6924  |
diff --git a/mmde/docs/en/advanced_guides/datasets/sunrgbd.md b/mmde/docs/en/advanced_guides/datasets/sunrgbd.md
new file mode 100644
index 0000000000000000000000000000000000000000..80dcd08d0b6b58eb51c480e819d98e7256599d24
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/datasets/sunrgbd.md
@@ -0,0 +1,250 @@
+# SUN RGB-D Dataset
+
+## Dataset preparation
+
+For the overall process, please refer to the [README](https://github.com/open-mmlab/mmdetection3d/blob/master/data/sunrgbd/README.md) page for SUN RGB-D.
+
+### Download SUN RGB-D data and toolbox
+
+Download SUNRGBD data [HERE](http://rgbd.cs.princeton.edu/data/). Then, move `SUNRGBD.zip`, `SUNRGBDMeta2DBB_v2.mat`, `SUNRGBDMeta3DBB_v2.mat` and `SUNRGBDtoolbox.zip` to the `OFFICIAL_SUNRGBD` folder, unzip the zip files.
+
+The directory structure before data preparation should be as below:
+
+```
+sunrgbd
+├── README.md
+├── matlab
+│   ├── extract_rgbd_data_v1.m
+│   ├── extract_rgbd_data_v2.m
+│   ├── extract_split.m
+├── OFFICIAL_SUNRGBD
+│   ├── SUNRGBD
+│   ├── SUNRGBDMeta2DBB_v2.mat
+│   ├── SUNRGBDMeta3DBB_v2.mat
+│   ├── SUNRGBDtoolbox
+```
+
+### Extract data and annotations for 3D detection from raw data
+
+Extract SUN RGB-D annotation data from raw annotation data by running (this requires MATLAB installed on your machine):
+
+```bash
+matlab -nosplash -nodesktop -r 'extract_split;quit;'
+matlab -nosplash -nodesktop -r 'extract_rgbd_data_v2;quit;'
+matlab -nosplash -nodesktop -r 'extract_rgbd_data_v1;quit;'
+```
+
+The main steps include:
+
+- Extract train and val split.
+- Extract data for 3D detection from raw data.
+- Extract and format detection annotation from raw data.
+
+The main component of `extract_rgbd_data_v2.m` which extracts point cloud data from depth map is as follows:
+
+```matlab
+data = SUNRGBDMeta(imageId);
+data.depthpath(1:16) = '';
+data.depthpath = strcat('../OFFICIAL_SUNRGBD', data.depthpath);
+data.rgbpath(1:16) = '';
+data.rgbpath = strcat('../OFFICIAL_SUNRGBD', data.rgbpath);
+
+% extract point cloud from depth map
+[rgb,points3d,depthInpaint,imsize]=read3dPoints(data);
+rgb(isnan(points3d(:,1)),:) = [];
+points3d(isnan(points3d(:,1)),:) = [];
+points3d_rgb = [points3d, rgb];
+
+% MAT files are 3x smaller than TXT files. In Python we can use
+% scipy.io.loadmat('xxx.mat')['points3d_rgb'] to load the data.
+mat_filename = strcat(num2str(imageId,'%06d'), '.mat');
+txt_filename = strcat(num2str(imageId,'%06d'), '.txt');
+% save point cloud data
+parsave(strcat(depth_folder, mat_filename), points3d_rgb);
+```
+
+The main component of `extract_rgbd_data_v1.m` which extracts annotation is as follows:
+
+```matlab
+% Write 2D and 3D box label
+data2d = data;
+fid = fopen(strcat(det_label_folder, txt_filename), 'w');
+for j = 1:length(data.groundtruth3DBB)
+    centroid = data.groundtruth3DBB(j).centroid;  % 3D bbox center
+    classname = data.groundtruth3DBB(j).classname;  % class name
+    orientation = data.groundtruth3DBB(j).orientation;  % 3D bbox orientation
+    coeffs = abs(data.groundtruth3DBB(j).coeffs);  % 3D bbox size
+    box2d = data2d.groundtruth2DBB(j).gtBb2D;  % 2D bbox
+    fprintf(fid, '%s %d %d %d %d %f %f %f %f %f %f %f %f\n', classname, box2d(1), box2d(2), box2d(3), box2d(4), centroid(1), centroid(2), centroid(3), coeffs(1), coeffs(2), coeffs(3), orientation(1), orientation(2));
+end
+fclose(fid);
+```
+
+The above two scripts call functions such as `read3dPoints` from the [toolbox](https://rgbd.cs.princeton.edu/data/SUNRGBDtoolbox.zip) provided by SUN RGB-D.
+
+The directory structure after extraction should be as follows.
+
+```
+sunrgbd
+├── README.md
+├── matlab
+│   ├── extract_rgbd_data_v1.m
+│   ├── extract_rgbd_data_v2.m
+│   ├── extract_split.m
+├── OFFICIAL_SUNRGBD
+│   ├── SUNRGBD
+│   ├── SUNRGBDMeta2DBB_v2.mat
+│   ├── SUNRGBDMeta3DBB_v2.mat
+│   ├── SUNRGBDtoolbox
+├── sunrgbd_trainval
+│   ├── calib
+│   ├── depth
+│   ├── image
+│   ├── label
+│   ├── label_v1
+│   ├── seg_label
+│   ├── train_data_idx.txt
+│   ├── val_data_idx.txt
+```
+
+Under each following folder there are overall 5285 train files and 5050 val files:
+
+- `calib`: Camera calibration information in `.txt`
+- `depth`: Point cloud saved in `.mat` (xyz+rgb)
+- `image`: Image data in `.jpg`
+- `label`: Detection annotation data in `.txt` (version 2)
+- `label_v1`: Detection annotation data in `.txt` (version 1)
+- `seg_label`: Segmentation annotation data in `.txt`
+
+Currently, we use v1 data for training and testing, so the version 2 labels are unused.
+
+### Create dataset
+
+Please run the command below to create the dataset.
+
+```shell
+python tools/create_data.py sunrgbd --root-path ./data/sunrgbd \
+--out-dir ./data/sunrgbd --extra-tag sunrgbd
+```
+
+or (if in a slurm environment)
+
+```
+bash tools/create_data.sh <job_name> sunrgbd
+```
+
+The above point cloud data are further saved in `.bin` format. Meanwhile `.pkl` info files are also generated for saving annotation and metadata.
+
+The directory structure after processing should be as follows.
+
+```
+sunrgbd
+├── README.md
+├── matlab
+│   ├── ...
+├── OFFICIAL_SUNRGBD
+│   ├── ...
+├── sunrgbd_trainval
+│   ├── ...
+├── points
+├── sunrgbd_infos_train.pkl
+├── sunrgbd_infos_val.pkl
+```
+
+- `points/xxxxxx.bin`: The point cloud data after downsample.
+- `sunrgbd_infos_train.pkl`: The train data infos, the detailed info of each scene is as follows:
+  - info\['lidar_points'\]: A dict containing all information related to the the lidar points.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+  - info\['images'\]: A dict containing all information relate to the image data.
+    - info\['images'\]\['CAM0'\]\['img_path'\]: The filename of the image.
+    - info\['images'\]\['CAM0'\]\['depth2img'\]: Transformation matrix from depth to image with shape (4, 4).
+    - info\['images'\]\['CAM0'\]\['height'\]: The height of image.
+    - info\['images'\]\['CAM0'\]\['width'\]: The width of image.
+  - info\['instances'\]: A list of dict contains all the annotations of this frame. Each dict corresponds to annotations of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box in depth coordinate system.
+    - info\['instances'\]\[i\]\['bbox'\]: List of 4 numbers representing the 2D bounding box of the instance, in (x1, y1, x2, y2) order.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: An int indicates the 3D label of instance and the -1 indicates ignore class.
+    - info\['instances'\]\[i\]\['bbox_label'\]: An int indicates the 2D label of instance and the -1 indicates ignore class.
+- `sunrgbd_infos_val.pkl`: The val data infos, which shares the same format as `sunrgbd_infos_train.pkl`.
+
+## Train pipeline
+
+A typical train pipeline of SUN RGB-D for point cloud only 3D detection is as follows.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+    ),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        shift_height=True),
+    dict(type='PointSample', num_points=20000),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+Data augmentation for point clouds:
+
+- `RandomFlip3D`: randomly flip the input point cloud horizontally or vertically.
+- `GlobalRotScaleTrans`: rotate the input point cloud, usually in the range of \[-30, 30\] (degrees) for SUN RGB-D; then scale the input point cloud, usually in the range of \[0.85, 1.15\] for SUN RGB-D; finally translate the input point cloud, usually by 0 for SUN RGB-D (which means no translation).
+- `PointSample`: downsample the input point cloud.
+
+A typical train pipeline of SUN RGB-D for multi-modality (point cloud and image) 3D detection is as follows.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations3D'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 600), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.0),
+    dict(type='Pad', size_divisor=32),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+    ),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        shift_height=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d','img', 'gt_bboxes', 'gt_bboxes_labels'])
+]
+```
+
+Data augmentation for images:
+
+- `Resize`: resize the input image, `keep_ratio=True` means the ratio of the image is kept unchanged.
+- `RandomFlip`: randomly flip the input image.
+
+The image augmentation functions are implemented in [MMDetection](https://github.com/open-mmlab/mmdetection/tree/dev-3.x/mmdet/datasets/transforms).
+
+## Metrics
+
+Same as ScanNet, typically mean Average Precision (mAP) is used for evaluation on SUN RGB-D, e.g. `mAP@0.25` and `mAP@0.5`. In detail, a generic function to compute precision and recall for 3D object detection for multiple classes is called. Please refer to [indoor_eval](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/indoor_eval.py) for more details.
+
+Since SUN RGB-D consists of image data, detection on image data is also feasible. For instance, in ImVoteNet, we first train an image detector, and we also use mAP for evaluation, e.g. `mAP@0.5`. We use the `eval_map` function from [MMDetection](https://github.com/open-mmlab/mmdetection) to calculate mAP.
diff --git a/mmde/docs/en/advanced_guides/datasets/waymo.md b/mmde/docs/en/advanced_guides/datasets/waymo.md
new file mode 100644
index 0000000000000000000000000000000000000000..f28ca253b825f44cec9a13ef9b5f4fb4d3ac5ea7
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/datasets/waymo.md
@@ -0,0 +1,202 @@
+# Waymo Dataset
+
+This page provides specific tutorials about the usage of MMDetection3D for Waymo dataset.
+
+## Prepare dataset
+
+Before preparing Waymo dataset, if you only installed requirements in `requirements/build.txt` and `requirements/runtime.txt` before, please install the official package for this dataset at first by running
+
+```
+pip install waymo-open-dataset-tf-2-6-0
+```
+
+or
+
+```
+pip install -r requirements/optional.txt
+```
+
+Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+Due to the original Waymo data format is based on `tfrecord`, we need to preprocess the raw data for convenient usage in the training and evaluation procedure. Our approach is to convert them into KITTI format.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── waymo
+│   │   ├── waymo_format
+│   │   │   ├── training
+│   │   │   ├── validation
+│   │   │   ├── testing
+│   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
+│   │   ├── kitti_format
+│   │   │   ├── ImageSets
+
+```
+
+You can download Waymo open dataset V1.4 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split txt files into `data/waymo/kitti_format/ImageSets`. Download ground truth bin files for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare Waymo data by running
+
+```bash
+# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow.
+# The number of `--workers` depends on the maximum number of cores in your CPU.
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4
+```
+
+Note that if your local disk does not have enough space for saving converted data, you can change the `--out-dir` to anywhere else. Just remember to create folders and prepare data there in advance and link them back to `data/waymo/kitti_format` after the data conversion.
+
+After the data conversion, the folder structure and info files should be organized as below.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── waymo
+│   │   ├── waymo_format
+│   │   │   ├── training
+│   │   │   ├── validation
+│   │   │   ├── testing
+│   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
+│   │   ├── kitti_format
+│   │   │   ├── ImageSets
+│   │   │   ├── training
+│   │   │   │   ├── image_0
+│   │   │   │   ├── image_1
+│   │   │   │   ├── image_2
+│   │   │   │   ├── image_3
+│   │   │   │   ├── image_4
+│   │   │   │   ├── velodyne
+│   │   │   ├── testing
+│   │   │   │   ├── (the same as training)
+│   │   │   ├── waymo_gt_database
+│   │   │   ├── waymo_infos_trainval.pkl
+│   │   │   ├── waymo_infos_train.pkl
+│   │   │   ├── waymo_infos_val.pkl
+│   │   │   ├── waymo_infos_test.pkl
+│   │   │   ├── waymo_dbinfos_train.pkl
+
+```
+
+- `kitti_format/training/image_{0-4}/{a}{bbb}{ccc}.jpg` Here because there are several cameras, we store the corresponding images. We use a coding way `{a}{bbb}{ccc}` to name the data for each frame, where `a` is the prefix for different split (`0` for training, `1` for validation and `2` for testing), `bbb` for segment index and `ccc` for frame index. You can easily locate the required frame according to this naming rule. We gather the data for training and validation together as KITTI and store the indices for different set in the `ImageSet` files.
+- `kitti_format/training/velodyne/{a}{bbb}{ccc}.bin` point cloud data for each frame.
+- `kitti_format/waymo_gt_database/xxx_{Car/Pedestrian/Cyclist}_x.bin`. point cloud data included in each 3D bounding box of the training dataset. These point clouds will be used in data augmentation e.g. `ObjectSample`. `xxx` is the index of training samples and `x` is the index of objects in this frame.
+- `kitti_format/waymo_infos_train.pkl`. training dataset information, a dict contains two keys: `metainfo` and `data_list`.`metainfo` contains the basic information for the dataset itself, such as `dataset`, `version` and `info_version`, while `data_list` is a list of dict, each dict (hereinafter referred to as `info`) contains all the detailed information of single sample as follows:
+  - info\['sample_idx'\]: The index of this sample in the whole dataset.
+  - info\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list).
+  - info\['timestamp'\]: Timestamp of the sample data.
+  - info\['context_name'\]: The context name of sample indices which `*.tfrecord` segment it extracted from.
+  - info\['lidar_points'\]: A dict containing all the information related to the lidar points.
+    - info\['lidar_points'\]\['lidar_path'\]: The filename of the lidar point cloud data.
+    - info\['lidar_points'\]\['num_pts_feats'\]: The feature dimension of point.
+  - info\['lidar_sweeps'\]: A list contains sweeps information of lidar
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar_path'\]: The lidar data path of i-th sweep.
+    - info\['lidar_sweeps'\]\[i\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+  - info\['images'\]: A dict contains five keys corresponding to each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. Each dict contains all data information related to  corresponding camera.
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: The filename of the image.
+    - info\['images'\]\['CAM_XXX'\]\['height'\]: The height of the image.
+    - info\['images'\]\['CAM_XXX'\]\['width'\]: The width of the image.
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: The transformation matrix recording the intrinsic parameters when projecting 3D points to each image plane. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: The transformation matrix from lidar sensor to this camera. (4x4 list)
+    - info\['images'\]\['CAM_XXX'\]\['lidar2img'\]: The transformation matrix from lidar sensor to each image plane. (4x4 list)
+  - info\['image_sweeps'\]: A list containing sweeps information of images.
+    - info\['image_sweeps'\]\[i\]\['images'\]\['CAM_XXX'\]\['img_path'\]: The image path of i-th sweep.
+    - info\['image_sweeps'\]\[i\]\['ego2global'\]: The transformation matrix from the ego vehicle to global coordinates. (4x4 list)
+    - info\['image_sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+  - info\['instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, w, h, yaw) order.
+    - info\['instances'\]\[i\]\['bbox'\]: List of 4 numbers representing the 2D bounding box of the instance, in (x1, y1, x2, y2) order. (some instances may not have a corresponding 2D bounding box)
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: A int indicating the label of instance and the -1 indicating ignore.
+    - info\['instances'\]\[i\]\['bbox_label'\]: A int indicating the label of instance and the -1 indicating ignore.
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: Number of lidar points included in each 3D bounding box.
+    - info\['instances'\]\[i\]\['camera_id'\]: The index of the most visible camera for this instance.
+    - info\['instances'\]\[i\]\['group_id'\]: The index of this instance in this sample.
+  - info\['cam_sync_instances'\]: It is a list of dict. Each dict contains all annotation information of single instance. Its format is same with \['instances'\]. However, \['cam_sync_instances'\] is only for multi-view camera-based 3D Object Detection task.
+  - info\['cam_instances'\]: It is a dict containing keys `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. For monocular camera-based 3D Object Detection task, we split 3D annotations of the whole scenes according to the camera they belong to. For the i-th instance:
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: List of 7 numbers representing the 3D bounding box of the instance, in (x, y, z, l, h, w, yaw) order.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D bounding box annotation (exterior rectangle of the projected 3D box), a list arrange as \[x1, y1, x2, y2\].
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: Label of instance.
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: Projected center location on the image, a list has shape (2,).
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: The depth of projected center.
+
+## Training
+
+Considering there are many similar frames in the original dataset, we can basically use a subset to train our model primarily. In our preliminary baselines, we load one frame every five frames, and thanks to our hyper parameters settings and data augmentation, we obtain a better result compared with the performance given in the original dataset [paper](https://arxiv.org/pdf/1912.04838.pdf). For more details about the configuration and performance, please refer to README.md in the `configs/pointpillars/`. A more complete benchmark based on other settings and methods is coming soon.
+
+## Evaluation
+
+For evaluation on Waymo, please follow the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/r1.3/docs/quick_start.md) to build the binary file `compute_detection_metrics_main` for metrics computation and put it into `mmdet3d/core/evaluation/waymo_utils/`.  Basically, you can follow the commands below to install `bazel` and build the file.
+
+```shell
+# download the code and enter the base directory
+git clone https://github.com/waymo-research/waymo-open-dataset.git waymo-od
+# git clone https://github.com/Abyssaledge/waymo-open-dataset-master waymo-od # if you want to use faster multi-thread version.
+cd waymo-od
+git checkout remotes/origin/master
+
+# use the Bazel build system
+sudo apt-get install --assume-yes pkg-config zip g++ zlib1g-dev unzip python3 python3-pip
+BAZEL_VERSION=3.1.0
+wget https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+sudo bash bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+sudo apt install build-essential
+
+# configure .bazelrc
+./configure.sh
+# delete previous bazel outputs and reset internal caches
+bazel clean
+
+bazel build waymo_open_dataset/metrics/tools/compute_detection_metrics_main
+cp bazel-bin/waymo_open_dataset/metrics/tools/compute_detection_metrics_main ../mmdetection3d/mmdet3d/evaluation/functional/waymo_utils/
+```
+
+Then you can evaluate your models on Waymo. An example to evaluate PointPillars on Waymo with 8 GPUs with Waymo metrics is as follows.
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py checkpoints/hv_pointpillars_secfpn_sbn-2x16_2x_waymo-3d-car_latest.pth
+```
+
+`pklfile_prefix` should be set in `test_evaluator` of configuration if the bin file is needed to be generated, so you can add `--cfg-options "test_evaluator.pklfile_prefix=xxxx"` in the end of command if you want do it.
+
+**Notice**:
+
+1. Sometimes when using `bazel` to build `compute_detection_metrics_main`, an error `'round' is not a member of 'std'` may appear. We just need to remove the `std::` before `round` in that file.
+
+2. Considering it takes a little long time to evaluate once, we recommend to evaluate only once at the end of model training.
+
+3. To use TensorFlow with CUDA 9, it is recommended to compile it from source. Apart from official tutorials, you can refer to this [link](https://github.com/SmileTM/Tensorflow2.X-GPU-CUDA9.0) for possibly suitable precompiled packages and useful information for compiling it from source.
+
+## Testing and make a submission
+
+An example to test PointPillars on Waymo with 8 GPUs, generate the bin files and make a submission to the leaderboard.
+
+`submission_prefix` should be set in `test_evaluator` of configuration before you run the test command if you want to generate the bin files and make a submission to the leaderboard..
+
+After generating the bin file, you can simply build the binary file `create_submission` and use them to create a submission file by following the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md/). Basically, here are some example commands.
+
+```shell
+cd ../waymo-od/
+bazel build waymo_open_dataset/metrics/tools/create_submission
+cp bazel-bin/waymo_open_dataset/metrics/tools/create_submission ../mmdetection3d/mmdet3d/core/evaluation/waymo_utils/
+vim waymo_open_dataset/metrics/tools/submission.txtpb  # set the metadata information
+cp waymo_open_dataset/metrics/tools/submission.txtpb ../mmdetection3d/mmdet3d/evaluation/functional/waymo_utils/
+
+cd ../mmdetection3d
+# suppose the result bin is in `results/waymo-car/submission`
+mmdet3d/core/evaluation/waymo_utils/create_submission  --input_filenames='results/waymo-car/kitti_results_test.bin' --output_filename='results/waymo-car/submission/model' --submission_filename='mmdet3d/evaluation/functional/waymo_utils/submission.txtpb'
+
+tar cvf results/waymo-car/submission/my_model.tar results/waymo-car/submission/my_model/
+gzip results/waymo-car/submission/my_model.tar
+```
+
+For evaluation on the validation set with the eval server, you can also use the same way to generate a submission. Make sure you change the fields in `submission.txtpb` before running the command above.
diff --git a/mmde/docs/en/advanced_guides/index.rst b/mmde/docs/en/advanced_guides/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c3485d8cd5dcd85a89f6476e5f26ad8d7a284257
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/index.rst
@@ -0,0 +1,27 @@
+Datasets
+**************
+
+.. toctree::
+   :maxdepth: 1
+
+   datasets/index.rst
+
+
+Supported Tasks
+**************
+
+.. toctree::
+   :maxdepth: 1
+
+   supported_tasks/index.rst
+
+
+Customization
+**************
+
+.. toctree::
+   :maxdepth: 1
+
+   customize_dataset.md
+   customize_models.md
+   customize_runtime.md
diff --git a/mmde/docs/en/advanced_guides/pure_point_cloud_dataset.md b/mmde/docs/en/advanced_guides/pure_point_cloud_dataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..dc385466f42e77461be9b5368feb2bf928af0f25
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/pure_point_cloud_dataset.md
@@ -0,0 +1,461 @@
+# Use Pure Point Cloud Dataset
+
+## Data Pre-Processing
+
+### Convert Point cloud format
+
+Currently, we only support bin format point cloud training and inference, before training on your own datasets, you need to transform your point cloud format to bin file. The common point cloud data formats include pcd and las, we provide some open-source tools for reference.
+
+1. Convert pcd to bin: https://github.com/leofansq/Tools_RosBag2KITTI
+2. Convert las to bin: The common conversion path is las -> pcd -> bin, and the conversion from las -> pcd can be achieved through [this tool](https://github.com/Hitachi-Automotive-And-Industry-Lab/semantic-segmentation-editor).
+
+### Point cloud annotation
+
+MMDetection3D does not support point cloud annotation. Some open-source annotation tools are offered for reference:
+
+- [SUSTechPOINTS](https://github.com/naurril/SUSTechPOINTS)
+- [LATTE](https://github.com/bernwang/latte)
+
+Besides, we improved [LATTE](https://github.com/bernwang/latte) for better usage. More details can be found [here](https://arxiv.org/abs/2011.10174).
+
+## Support new data format
+
+To support a new data format, you can either convert them to existing formats or directly convert them to the middle format. You could also choose to convert them offline (before training by a script) or online (implement a new dataset and do the conversion at training).
+
+### Reorganize new data formats to existing format
+
+Once your datasets only contain point cloud file and 3D Bounding box annotations, without calib file. We recommend converting it into the basic formats, the annotations files in basic format has the following necessary keys:
+
+```python
+
+[
+    {'sample_idx':
+     'lidar_points': {'lidar_path': velodyne_path,
+                      ....
+                     },
+     'annos': {'box_type_3d':  (str)  'LiDAR/Camera/Depth'
+               'gt_bboxes_3d':  <np.ndarray> (n, 7)
+               'gt_names':  [list]
+               ....
+            }
+     'calib': { .....}
+     'images': { .....}
+    }
+]
+
+```
+
+In MMDetection3D, for the data that is inconvenient to read directly online, we recommend converting it into into basic format as above and do the conversion offline, thus you only need to modify the config's data annotation paths and classes after the conversion.
+To use data that share a similar format as the existing datasets, e.g., Lyft has a similar format as the nuScenes dataset, we recommend directly implementing a new data converter and a dataset class to convert the data and load the data, respectively. In this procedure, the code can inherit from the existing dataset classes to reuse the code.
+
+### Reorganize new data format to middle format
+
+There is also a way if users do not want to convert the annotation format to existing formats.
+Actually, we convert all the supported datasets into pickle files, which summarize useful information for model training and inference.
+
+The annotation of a dataset is a list of dict, each dict corresponds to a frame.
+A basic example (used in KITTI) is as follows. A frame consists of several keys, like `image`, `point_cloud`, `calib` and `annos`.
+As long as we could directly read data according to these information, the organization of raw data could also be different from existing ones.
+With this design, we provide an alternative choice for customizing datasets.
+
+```python
+
+[
+    {'image': {'image_idx': 0, 'image_path': 'training/image_2/000000.png', 'image_shape': array([ 370, 1224], dtype=int32)},
+     'point_cloud': {'num_features': 4, 'velodyne_path': 'training/velodyne/000000.bin'},
+     'calib': {'P0': array([[707.0493,   0.    , 604.0814,   0.    ],
+       [  0.    , 707.0493, 180.5066,   0.    ],
+       [  0.    ,   0.    ,   1.    ,   0.    ],
+       [  0.    ,   0.    ,   0.    ,   1.    ]]),
+       'P1': array([[ 707.0493,    0.    ,  604.0814, -379.7842],
+       [   0.    ,  707.0493,  180.5066,    0.    ],
+       [   0.    ,    0.    ,    1.    ,    0.    ],
+       [   0.    ,    0.    ,    0.    ,    1.    ]]),
+       'P2': array([[ 7.070493e+02,  0.000000e+00,  6.040814e+02,  4.575831e+01],
+       [ 0.000000e+00,  7.070493e+02,  1.805066e+02, -3.454157e-01],
+       [ 0.000000e+00,  0.000000e+00,  1.000000e+00,  4.981016e-03],
+       [ 0.000000e+00,  0.000000e+00,  0.000000e+00,  1.000000e+00]]),
+       'P3': array([[ 7.070493e+02,  0.000000e+00,  6.040814e+02, -3.341081e+02],
+       [ 0.000000e+00,  7.070493e+02,  1.805066e+02,  2.330660e+00],
+       [ 0.000000e+00,  0.000000e+00,  1.000000e+00,  3.201153e-03],
+       [ 0.000000e+00,  0.000000e+00,  0.000000e+00,  1.000000e+00]]),
+       'R0_rect': array([[ 0.9999128 ,  0.01009263, -0.00851193,  0.        ],
+       [-0.01012729,  0.9999406 , -0.00403767,  0.        ],
+       [ 0.00847068,  0.00412352,  0.9999556 ,  0.        ],
+       [ 0.        ,  0.        ,  0.        ,  1.        ]]),
+       'Tr_velo_to_cam': array([[ 0.00692796, -0.9999722 , -0.00275783, -0.02457729],
+       [-0.00116298,  0.00274984, -0.9999955 , -0.06127237],
+       [ 0.9999753 ,  0.00693114, -0.0011439 , -0.3321029 ],
+       [ 0.        ,  0.        ,  0.        ,  1.        ]]),
+       'Tr_imu_to_velo': array([[ 9.999976e-01,  7.553071e-04, -2.035826e-03, -8.086759e-01],
+       [-7.854027e-04,  9.998898e-01, -1.482298e-02,  3.195559e-01],
+       [ 2.024406e-03,  1.482454e-02,  9.998881e-01, -7.997231e-01],
+       [ 0.000000e+00,  0.000000e+00,  0.000000e+00,  1.000000e+00]])},
+     'annos': {'name': array(['Pedestrian'], dtype='<U10'), 'truncated': array([0.]), 'occluded': array([0]), 'alpha': array([-0.2]), 'bbox': array([[712.4 , 143.  , 810.73, 307.92]]), 'dimensions': array([[1.2 , 1.89, 0.48]]), 'location': array([[1.84, 1.47, 8.41]]), 'rotation_y': array([0.01]), 'score': array([0.]), 'index': array([0], dtype=int32), 'group_ids': array([0], dtype=int32), 'difficulty': array([0], dtype=int32), 'num_points_in_gt': array([377], dtype=int32)}}
+    ...
+]
+```
+
+On top of this you can write a new Dataset class inherited from `Custom3DDataset`, and overwrite related methods,
+like [KittiDataset](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/kitti_dataset.py) and [ScanNetDataset](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/scannet_dataset.py).
+
+### An example of customized dataset
+
+Here we provide an example of customized dataset.
+
+Assume the annotation has been reorganized into a list of dict in pickle files like basic format.
+The bounding boxes annotations are stored in `annotation.pkl` as the following
+
+```
+{'sample_idx': 120,
+ 'lidar_points': {'lidar_path': 'training/000004.bin'},
+ 'annos': {'bbox_type_3d': 'LiDAR',
+           'gt_bboxes_3d': array([[1.48129511,  3.52074146,  1.85652947, 1.74445975, 0.23195696, 0.57235193, -0.25525],
+           [ 2.90395617, -3.48033905,  1.52682471,[0.66077662, 0.17072392, 0.67153597, 2.23145]]),
+           'gt_names': ['car', 'pedestrian']
+          }
+}
+```
+
+If the pkl only contains the necessary keys, you can directly use the `Custom3DDataset` for training:
+
+Then in the config, to use `Custom3DDataset` you can modify the config as the following
+
+```python
+dataset_A_train = dict(
+    type='Custom3DDataset',
+    ann_file = 'annotation.pkl',
+    pipeline=train_pipeline
+)
+```
+
+otherwise you need to create a new dataset in `mmdet3d/datasets/my_dataset.py` to load the data and rewrite the `get_ann_info` method.
+
+```python
+import numpy as np
+from os import path as osp
+
+from mmdet3d.core import show_result
+from mmdet3d.core.bbox import DepthInstance3DBoxes
+from mmdet.datasets import DATASETS
+from .custom_3d import Custom3DDataset
+
+
+@DATASETS.register_module()
+class MyDataset(Custom3DDataset):
+    classes = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+
+    def __init__(self,
+                 data_root,
+                 ann_file,
+                 pipeline=None,
+                 classes=None,
+                 modality=None,
+                 box_type_3d='Depth',
+                 filter_empty_gt=True,
+                 test_mode=False):
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            pipeline=pipeline,
+            classes=classes,
+            modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode)
+
+    def get_ann_info(self, index):
+        # Use index to get the annos, thus the evalhook could also use this api
+        info = self.data_infos[index]
+        if info['annos']['gt_num'] != 0:
+            gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype(
+                np.float32)  # k, 6
+            gt_labels_3d = info['annos']['class'].astype(np.int64)
+        else:
+            gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32)
+            gt_labels_3d = np.zeros((0, ), dtype=np.int64)
+
+        # to target box structure
+        gt_bboxes_3d = DepthInstance3DBoxes(
+            gt_bboxes_3d,
+            box_dim=gt_bboxes_3d.shape[-1],
+            with_yaw=False,
+            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+        pts_instance_mask_path = osp.join(self.data_root,
+                                          info['pts_instance_mask_path'])
+        pts_semantic_mask_path = osp.join(self.data_root,
+                                          info['pts_semantic_mask_path'])
+
+        anns_results = dict(
+            gt_bboxes_3d=gt_bboxes_3d,
+            gt_labels_3d=gt_labels_3d,
+            pts_instance_mask_path=pts_instance_mask_path,
+            pts_semantic_mask_path=pts_semantic_mask_path)
+        return anns_results
+
+```
+
+Then in the config, to use `MyDataset` you can modify the config as the following
+
+```python
+dataset_A_train = dict(
+    type='MyDataset',
+    ann_file = 'annotation.pkl',
+    pipeline=train_pipeline
+)
+```
+
+## Customize datasets by dataset wrappers
+
+MMDetection3D also supports many dataset wrappers to mix the dataset or modify the dataset distribution for training like MMDetection.
+Currently it supports to three dataset wrappers as below:
+
+- `RepeatDataset`: simply repeat the whole dataset.
+- `ClassBalancedDataset`: repeat dataset in a class balanced manner.
+- `ConcatDataset`: concat datasets.
+
+### Repeat dataset
+
+We use `RepeatDataset` as wrapper to repeat the dataset. For example, suppose the original dataset is `Dataset_A`, to repeat it, the config looks like the following
+
+```python
+dataset_A_train = dict(
+        type='RepeatDataset',
+        times=N,
+        dataset=dict(  # This is the original config of Dataset_A
+            type='Dataset_A',
+            ...
+            pipeline=train_pipeline
+        )
+    )
+```
+
+### Class balanced dataset
+
+We use `ClassBalancedDataset` as wrapper to repeat the dataset based on category
+frequency. The dataset to repeat needs to instantiate function `self.get_cat_ids(idx)`
+to support `ClassBalancedDataset`.
+For example, to repeat `Dataset_A` with `oversample_thr=1e-3`, the config looks like the following
+
+```python
+dataset_A_train = dict(
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(  # This is the original config of Dataset_A
+            type='Dataset_A',
+            ...
+            pipeline=train_pipeline
+        )
+    )
+```
+
+You may refer to [source code](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/dataset_wrappers.py) for details.
+
+### Concatenate dataset
+
+There are three ways to concatenate the dataset.
+
+1. If the datasets you want to concatenate are in the same type with different annotation files, you can concatenate the dataset configs like the following.
+
+   ```python
+   dataset_A_train = dict(
+       type='Dataset_A',
+       ann_file = ['anno_file_1', 'anno_file_2'],
+       pipeline=train_pipeline
+   )
+   ```
+
+   If the concatenated dataset is used for test or evaluation, this manner supports to evaluate each dataset separately. To test the concatenated datasets as a whole, you can set `separate_eval=False` as below.
+
+   ```python
+   dataset_A_train = dict(
+       type='Dataset_A',
+       ann_file = ['anno_file_1', 'anno_file_2'],
+       separate_eval=False,
+       pipeline=train_pipeline
+   )
+   ```
+
+2. In case the dataset you want to concatenate is different, you can concatenate the dataset configs like the following.
+
+   ```python
+   dataset_A_train = dict()
+   dataset_B_train = dict()
+
+   data = dict(
+       imgs_per_gpu=2,
+       workers_per_gpu=2,
+       train = [
+           dataset_A_train,
+           dataset_B_train
+       ],
+       val = dataset_A_val,
+       test = dataset_A_test
+       )
+   ```
+
+   If the concatenated dataset is used for test or evaluation, this manner also supports to evaluate each dataset separately.
+
+3. We also support to define `ConcatDataset` explicitly as the following.
+
+   ```python
+   dataset_A_val = dict()
+   dataset_B_val = dict()
+
+   data = dict(
+       imgs_per_gpu=2,
+       workers_per_gpu=2,
+       train=dataset_A_train,
+       val=dict(
+           type='ConcatDataset',
+           datasets=[dataset_A_val, dataset_B_val],
+           separate_eval=False))
+   ```
+
+   This manner allows users to evaluate all the datasets as a single one by setting `separate_eval=False`.
+
+**Note:**
+
+1. The option `separate_eval=False` assumes the datasets use `self.data_infos` during evaluation. Therefore, COCO datasets do not support this behavior since COCO datasets do not fully rely on `self.data_infos` for evaluation. Combining different types of datasets and evaluating them as a whole is not tested thus is not suggested.
+2. Evaluating `ClassBalancedDataset` and `RepeatDataset` is not supported thus evaluating concatenated datasets of these types is also not supported.
+
+A more complex example that repeats `Dataset_A` and `Dataset_B` by N and M times, respectively, and then concatenates the repeated datasets is as the following.
+
+```python
+dataset_A_train = dict(
+    type='RepeatDataset',
+    times=N,
+    dataset=dict(
+        type='Dataset_A',
+        ...
+        pipeline=train_pipeline
+    )
+)
+dataset_A_val = dict(
+    ...
+    pipeline=test_pipeline
+)
+dataset_A_test = dict(
+    ...
+    pipeline=test_pipeline
+)
+dataset_B_train = dict(
+    type='RepeatDataset',
+    times=M,
+    dataset=dict(
+        type='Dataset_B',
+        ...
+        pipeline=train_pipeline
+    )
+)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train = [
+        dataset_A_train,
+        dataset_B_train
+    ],
+    val = dataset_A_val,
+    test = dataset_A_test
+)
+
+```
+
+## Modify Dataset Classes
+
+With existing dataset types, we can modify the class names of them to train subset of the annotations.
+For example, if you want to train only three classes of the current dataset,
+you can modify the classes of dataset.
+The dataset will filter out the ground truth boxes of other classes automatically.
+
+```python
+classes = ('person', 'bicycle', 'car')
+data = dict(
+    train=dict(classes=classes),
+    val=dict(classes=classes),
+    test=dict(classes=classes))
+```
+
+MMDetection V2.0 also supports to read the classes from a file, which is common in real applications.
+For example, assume the `classes.txt` contains the name of classes as the following.
+
+```
+person
+bicycle
+car
+```
+
+Users can set the classes as a file path, the dataset will load it and convert it to a list automatically.
+
+```python
+classes = 'path/to/classes.txt'
+data = dict(
+    train=dict(classes=classes),
+    val=dict(classes=classes),
+    test=dict(classes=classes))
+```
+
+## Loading Point Clouds Adjustment
+
+Generally speaking, the most basic bin data contains (x, y, z) information, and some also include intensity, elongation (point cloud elongation), timestamp, and the point cloud dimension ranges from 3 to 6. In MMDetection3D, you need to adjust the some settings in config while customized dataset training:
+
+```python
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        # adjust accordingly according to the dimension
+        # of the point cloud of your own dataset
+        load_dim=3,
+        # actually used dimension，you can also specify the
+        # specific dimension in list format
+        use_dim=3),
+```
+
+## Training Setting Adjustment
+
+In order to avoid some problems in the training process and improve the performance of the model on the custom dataset, some training settings need to be adjusted according to the dataset.
+
+### Adjust Point Cloud Range and Annotations in Config
+
+For example, we can adjust `point_cloud_range` in config file to change training point cloud range. In KITTI dataset, the `point_cloud_range` is set to be `[0, -39.68, -3, 69.12, 39.68, 1]`.
+By setting point cloud range, the `PointsRangeFilter` is used to filter point cloud and its mask (semantic and instance), and `ObjectRangeFilter` is used to filter 3D bounding boxes.
+
+```python
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+```
+
+### Adjust Voxel Size in Config
+
+Here you can refer to the setting of the existing datasets. theoretically, `voxel_size` is linked to the setting of `point_cloud_range`. Setting a smaller `voxel_size` will increase the voxel num and the corresponding memory consumption. In addition, the following issues need to be noted:
+
+if the `point_cloud_range` and `voxel_size` are set to be `[0, -40, -3, 70.4, 40, 1]` and `[0.05, 0.05, 0.1]` respectively, then the shape of intermediate feature map should be `[(1-(-3))/0.1+1, (40-(-40))/0.05, (70.4-0)/0.05]=[41, 1600, 1408]`. More details refers to this [issue](https://github.com/open-mmlab/mmdetection3d/issues/382).
+
+### Adjust Anchor Range and Size in Config
+
+```python
+anchor_generator=dict(
+    type='Anchor3DRangeGenerator',
+    ranges=[
+        [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+        [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+        [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+    ],
+    sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+    rotations=[0, 1.57],
+    reshape_out=False),
+```
+
+Regarding the setting of `anchor_range`, it is generally adjusted according to dataset. Note that `z` value needs to be adjusted accordingly to the position of the point cloud, please refer to this [issue](https://github.com/open-mmlab/mmdetection3d/issues/986).
+
+Regarding the setting of `anchor_size`, it is usually necessary to count the average length, width and height of the entire training dataset as `anchor_size` to obtain the best results.
+
+**Note** (related to MMDetection):
+
+- Before MMDetection v2.5.0, the dataset will filter out the empty GT images automatically if the classes are set and there is no way to disable that through config. This is an undesirable behavior and introduces confusion because if the classes are not set, the dataset only filters the empty GT images when `filter_empty_gt=True` and `test_mode=False`. After MMDetection v2.5.0, we decouple the image filtering process and the classes modification, i.e., the dataset will only filter empty GT images when `filter_empty_gt=True` and `test_mode=False`, no matter whether the classes are set. Thus, setting the classes only influences the annotations of classes used for training and users could decide whether to filter empty GT images by themselves.
+- Since the middle format only has box labels and does not contain the class names, when using `CustomDataset`, users cannot filter out the empty GT images through configs but only do this offline.
+- The features for setting dataset classes and dataset filtering will be refactored to be more user-friendly in the future (depends on the progress).
diff --git a/mmde/docs/en/advanced_guides/supported_tasks/index.rst b/mmde/docs/en/advanced_guides/supported_tasks/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..53b8f4f94ed2392e555d54b05c991adfa247c3b6
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/supported_tasks/index.rst
@@ -0,0 +1,6 @@
+.. toctree::
+   :maxdepth: 1
+
+   lidar_det3d.md
+   vision_det3d.md
+   lidar_sem_seg3d.md
diff --git a/mmde/docs/en/advanced_guides/supported_tasks/lidar_det3d.md b/mmde/docs/en/advanced_guides/supported_tasks/lidar_det3d.md
new file mode 100644
index 0000000000000000000000000000000000000000..a6d31f0b22d09abecf08612e1d9ddd6a9c47b9a0
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/supported_tasks/lidar_det3d.md
@@ -0,0 +1,104 @@
+# LiDAR-Based 3D Detection
+
+LiDAR-based 3D detection is one of the most basic tasks supported in MMDetection3D.
+It expects the given model to take any number of points with features collected by LiDAR as input, and predict the 3D bounding boxes and category labels for each object of interest.
+Next, taking PointPillars on the KITTI dataset as an example, we will show how to prepare data, train and test a model on a standard 3D detection benchmark, and how to visualize and validate the results.
+
+## Data Preparation
+
+To begin with, we need to download the raw data and reorganize the data in a standard way presented in the [doc for data preparation](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/dataset_prepare.html).
+Note that for KITTI, we need extra `.txt` files for data splits.
+
+Due to different ways of organizing the raw data in different datasets, we typically need to collect the useful data information with a `.pkl` file.
+So after getting all the raw data ready, we need to run the scripts provided in the `create_data.py` for different datasets to generate data infos.
+For example, for KITTI we need to run:
+
+```shell
+python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti
+```
+
+Afterwards, the related folder structure should be as follows:
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── kitti
+│   │   ├── ImageSets
+│   │   ├── testing
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── velodyne
+│   │   │   ├── velodyne_reduced
+│   │   ├── training
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── label_2
+│   │   │   ├── velodyne
+│   │   │   ├── velodyne_reduced
+│   │   ├── kitti_gt_database
+│   │   ├── kitti_infos_train.pkl
+│   │   ├── kitti_infos_trainval.pkl
+│   │   ├── kitti_infos_val.pkl
+│   │   ├── kitti_infos_test.pkl
+│   │   ├── kitti_dbinfos_train.pkl
+```
+
+## Training
+
+Then let us train a model with provided configs for PointPillars.
+You can basically follow the examples provided in this [tutorial](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/train_test.html) when training with different GPU settings.
+Suppose we use 8 GPUs on a single machine with distributed training:
+
+```shell
+./tools/dist_train.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py 8
+```
+
+Note that `8xb6` in the config name refers to the training is completed with 8 GPUs and 6 samples on each GPU.
+If your customized setting is different from this, sometimes you need to adjust the learning rate accordingly.
+A basic rule can be referred to [here](https://arxiv.org/abs/1706.02677). We have supported `--auto-scale-lr` to
+enable automatically scaling LR.
+
+## Quantitative Evaluation
+
+During training, the model checkpoints will be evaluated regularly according to the setting of `train_cfg = dict(val_interval=xxx)` in the config.
+We support official evaluation protocols for different datasets.
+For KITTI, the model will be evaluated with mean average precision (mAP) with Intersection over Union (IoU) thresholds 0.5/0.7 for 3 categories respectively.
+The evaluation results will be printed in the command like:
+
+```
+Car AP@0.70, 0.70, 0.70:
+bbox AP:98.1839, 89.7606, 88.7837
+bev AP:89.6905, 87.4570, 85.4865
+3d AP:87.4561, 76.7569, 74.1302
+aos AP:97.70, 88.73, 87.34
+Car AP@0.70, 0.50, 0.50:
+bbox AP:98.1839, 89.7606, 88.7837
+bev AP:98.4400, 90.1218, 89.6270
+3d AP:98.3329, 90.0209, 89.4035
+aos AP:97.70, 88.73, 87.34
+```
+
+In addition, you can also evaluate a specific model checkpoint after training is finished. Simply run scripts like the following:
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars/latest.pth 8
+```
+
+## Testing and Making a Submission
+
+If you would like to only conduct inference or test the model performance on the online benchmark,
+you need to specify the `submission_prefix` for corresponding evaluator,
+e.g., add `test_evaluator = dict(type='KittiMetric', ann_file=data_root + 'kitti_infos_test.pkl', format_only=True, pklfile_prefix='results/kitti-3class/kitti_results', submission_prefix='results/kitti-3class/kitti_results')` in the configuration then you can get the results file.
+Please guarantee the `data_prefix` and `ann_file` in [info for testing](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/kitti-3d-3class.py#L117) in the config corresponds to the test set instead of validation set.
+After generating the results, you can basically compress the folder and upload to the KITTI evaluation server.
+
+## Qualitative Validation
+
+MMDetection3D also provides versatile tools for visualization such that we can have an intuitive feeling of the detection results predicted by our trained models.
+You can either set the `--show` option to visualize the detection results online during evaluation,
+or using `tools/misc/visualize_results.py` for offline visualization.
+Besides, we also provide scripts `tools/misc/browse_dataset.py` to visualize the dataset without inference.
+Please refer more details in the [doc for visualization](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/visualization.html).
diff --git a/mmde/docs/en/advanced_guides/supported_tasks/lidar_sem_seg3d.md b/mmde/docs/en/advanced_guides/supported_tasks/lidar_sem_seg3d.md
new file mode 100644
index 0000000000000000000000000000000000000000..521e3b20fea418a00b2e478c50b96f75dd9a8df1
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/supported_tasks/lidar_sem_seg3d.md
@@ -0,0 +1,93 @@
+# LiDAR-Based 3D Semantic Segmentation
+
+LiDAR-based 3D semantic segmentation is one of the most basic tasks supported in MMDetection3D.
+It expects the given model to take any number of points with features collected by LiDAR as input, and predict the semantic labels for each input point.
+Next, taking PointNet++ (SSG) on the ScanNet dataset as an example, we will show how to prepare data, train and test a model on a standard 3D semantic segmentation benchmark, and how to visualize and validate the results.
+
+## Data Preparation
+
+To begin with, we need to download the raw data from ScanNet's [official website](http://kaldir.vc.in.tum.de/scannet_benchmark/documentation).
+
+Due to different ways of organizing the raw data in different datasets, we typically need to collect the useful data information with a .pkl or .json file.
+
+So after getting all the raw data ready, we can follow the instructions presented in [ScanNet README doc](https://github.com/open-mmlab/mmdetection3d/blob/master/data/scannet/README.md/) to generate data infos.
+
+Afterwards, the related folder structure should be as follows:
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── scannet
+│   │   ├── scannet_utils.py
+│   │   ├── batch_load_scannet_data.py
+│   │   ├── load_scannet_data.py
+│   │   ├── scannet_utils.py
+│   │   ├── README.md
+│   │   ├── scans
+│   │   ├── scans_test
+│   │   ├── scannet_instance_data
+│   │   ├── points
+│   │   ├── instance_mask
+│   │   ├── semantic_mask
+│   │   ├── seg_info
+│   │   │   ├── train_label_weight.npy
+│   │   │   ├── train_resampled_scene_idxs.npy
+│   │   │   ├── val_label_weight.npy
+│   │   │   ├── val_resampled_scene_idxs.npy
+│   │   ├── scannet_infos_train.pkl
+│   │   ├── scannet_infos_val.pkl
+│   │   ├── scannet_infos_test.pkl
+```
+
+## Training
+
+Then let us train a model with provided configs for PointNet++ (SSG).
+You can basically follow this [tutorial](https://mmdetection3d.readthedocs.io/en/latest/1_exist_data_model.html#inference-with-existing-models) for sample scripts when training with different GPU settings.
+Suppose we use 2 GPUs on a single machine with distributed training:
+
+```
+./tools/dist_train.sh configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg.py 2
+```
+
+Note that `2xb16` in the config name refers to the training is completed with 2 GPUs and 16 samples on each GPU.
+If your customized setting is different from this, sometimes you need to adjust the learning rate accordingly.
+A basic rule can be referred to [here](https://arxiv.org/abs/1706.02677).
+
+## Quantitative Evaluation
+
+During training, the model checkpoints will be evaluated regularly according to the setting of `train_cfg = dict(val_interval=xxx)` in the config.
+We support official evaluation protocols for different datasets.
+For ScanNet, the model will be evaluated with mean Intersection over Union (mIoU) over all 20 categories.
+The evaluation results will be printed in the command like:
+
+```
++---------+--------+--------+---------+--------+--------+--------+--------+--------+--------+-----------+---------+---------+--------+---------+--------------+----------------+--------+--------+---------+----------------+--------+--------+---------+
+| classes | wall   | floor  | cabinet | bed    | chair  | sofa   | table  | door   | window | bookshelf | picture | counter | desk   | curtain | refrigerator | showercurtrain | toilet | sink   | bathtub | otherfurniture | miou   | acc    | acc_cls |
++---------+--------+--------+---------+--------+--------+--------+--------+--------+--------+-----------+---------+---------+--------+---------+--------------+----------------+--------+--------+---------+----------------+--------+--------+---------+
+| results | 0.7257 | 0.9373 | 0.4625  | 0.6613 | 0.7707 | 0.5562 | 0.5864 | 0.4010 | 0.4558 | 0.7011    | 0.2500  | 0.4645  | 0.4540 | 0.5399  | 0.2802       | 0.3488         | 0.7359 | 0.4971 | 0.6922  | 0.3681         | 0.5444 | 0.8118 | 0.6695  |
++---------+--------+--------+---------+--------+--------+--------+--------+--------+--------+-----------+---------+---------+--------+---------+--------------+----------------+--------+--------+---------+----------------+--------+--------+---------+
+```
+
+In addition, you can also evaluate a specific model checkpoint after training is finished. Simply run scripts like the following:
+
+```
+./tools/dist_test.sh configs/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet-seg.py work_dirs/pointnet2_ssg/latest.pth 8
+```
+
+## Testing and Making a Submission
+
+If you would like to only conduct inference or test the model performance on the online benchmark,
+you should change `ann_file='scannet_infos_val.pkl'` to `ann_file='scannet_infos_test.pkl'` in the
+ScanNet dataset's [config](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/scannet-seg.py#L129). Remember to
+specify the `submission_prefix` in the `test_evaluator`,
+e.g., adding `test_evaluator = dict(type='SegMetric', submission_prefix=work_dirs/pointnet2_ssg/test_submission`) or just add `--cfg-options test_evaluator.submission_prefix=work_dirs/pointnet2_ssg/test_submission` in the end of command.
+After generating the results, you can basically compress the folder and upload to the [ScanNet evaluation server](http://kaldir.vc.in.tum.de/scannet_benchmark/semantic_label_3d).
+
+## Qualitative Validation
+
+MMDetection3D also provides versatile tools for visualization such that you can use `tools/misc/visualize_results.py` with results pkl file for offline visualization of add `--show` in the end of test command to do the online visualization.
+Besides, we also provide scripts `tools/misc/browse_dataset.py` to visualize the dataset without inference.
+Please refer more details in the [doc for visualization](https://mmdetection3d.readthedocs.io/en/latest/useful_tools.html#visualization).
diff --git a/mmde/docs/en/advanced_guides/supported_tasks/vision_det3d.md b/mmde/docs/en/advanced_guides/supported_tasks/vision_det3d.md
new file mode 100644
index 0000000000000000000000000000000000000000..ca345ce00bcfc83f67e1c0e83005b2def9b9a3b4
--- /dev/null
+++ b/mmde/docs/en/advanced_guides/supported_tasks/vision_det3d.md
@@ -0,0 +1,125 @@
+# Vision-Based 3D Detection
+
+Vision-based 3D detection refers to the 3D detection solutions based on vision-only input, such as monocular, binocular, and multi-view image based 3D detection.
+Currently, we only support monocular and multi-view 3D detection methods. Other approaches should be also compatible with our framework and will be supported in the future.
+
+It expects the given model to take any number of images as input, and predict the 3D bounding boxes and category labels for each object of interest.
+Taking FCOS3D on the nuScenes dataset as an example, we will show how to prepare data, train and test a model on a standard 3D detection benchmark, and how to visualize and validate the results.
+
+## Data Preparation
+
+To begin with, we need to download the raw data and reorganize the data in a standard way presented in the [doc for data preparation](https://mmdetection3d.readthedocs.io/en/latest/data_preparation.html).
+
+Due to different ways of organizing the raw data in different datasets, we typically need to collect the useful data information with a .pkl or .json file.
+So after getting all the raw data ready, we need to run the scripts provided in the `create_data.py` for different datasets to generate data infos.
+For example, for nuScenes we need to run:
+
+```
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
+```
+
+Afterwards, the related folder structure should be as follows:
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── v1.0-test
+|   |   ├── v1.0-trainval
+│   │   ├── nuscenes_database
+│   │   ├── nuscenes_infos_train.pkl
+│   │   ├── nuscenes_infos_trainval.pkl
+│   │   ├── nuscenes_infos_val.pkl
+│   │   ├── nuscenes_infos_test.pkl
+│   │   ├── nuscenes_dbinfos_train.pkl
+```
+
+## Training
+
+Then let us train a model with provided configs for FCOS3D. The basic script is the same as other models.
+You can basically follow the examples provided in this [tutorial](https://mmdetection3d.readthedocs.io/en/latest/1_exist_data_model.html#inference-with-existing-models) when training with different GPU settings.
+Suppose we use 8 GPUs on a single machine with distributed training:
+
+```
+./tools/dist_train.sh configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py 8
+```
+
+Note that `8xb2` in the config name refers to the training is completed with 8 GPUs and 2 data samples on each GPU.
+If your customized setting is different from this, you should add `--auto-scale-lr` to enable automatically scaling learning rate. A basic rule can be referred to [here](https://arxiv.org/abs/1706.02677).
+
+We can also achieve better performance with finetuned FCOS3D by running:
+
+```
+./tools/dist_train.sh configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py 8
+```
+
+After training a baseline model with the previous script,
+please remember to modify the path [here](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py#L8) correspondingly.
+
+## Quantitative Evaluation
+
+During training, the model checkpoints will be evaluated regularly according to the setting of `train_cfg = dict(val_interval=xxx)` in the config.
+
+We support official evaluation protocols for different datasets.
+Due to the output format is the same as 3D detection based on other modalities, the evaluation methods are also the same.
+
+For nuScenes, the model will be evaluated with distance-based mean AP (mAP) and NuScenes Detection Score (NDS) for 10 categories respectively.
+The evaluation results will be printed in the command like:
+
+```
+mAP: 0.3197
+mATE: 0.7595
+mASE: 0.2700
+mAOE: 0.4918
+mAVE: 1.3307
+mAAE: 0.1724
+NDS: 0.3905
+Eval time: 170.8s
+
+Per-class results:
+Object Class    AP      ATE     ASE     AOE     AVE     AAE
+car     0.503   0.577   0.152   0.111   2.096   0.136
+truck   0.223   0.857   0.224   0.220   1.389   0.179
+bus     0.294   0.855   0.204   0.190   2.689   0.283
+trailer 0.081   1.094   0.243   0.553   0.742   0.167
+construction_vehicle    0.058   1.017   0.450   1.019   0.137   0.341
+pedestrian      0.392   0.687   0.284   0.694   0.876   0.158
+motorcycle      0.317   0.737   0.265   0.580   2.033   0.104
+bicycle 0.308   0.704   0.299   0.892   0.683   0.010
+traffic_cone    0.555   0.486   0.309   nan     nan     nan
+barrier 0.466   0.581   0.269   0.169   nan     nan
+```
+
+In addition, you can also evaluate a specific model checkpoint after training is finished. Simply run scripts like the following:
+
+```
+./tools/dist_test.sh configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py work_dirs/fcos3d/latest.pth 8
+```
+
+## Testing and Making a Submission
+
+If you would like to only conduct inference or test the model performance on the online benchmark,
+you just need to specify the `jsonfile_prefix` for corresponding evaluator,
+e.g., add `test_evaluator = dict(type='NuscenesMetric', jsonfile_prefix=work_dirs/fcos3d/test_submission)` in the configuration then you can get the results file.
+
+Please guarantee the `data_prefix` and `ann_file` in [info for testing](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/nus-mono3d.py#L93) in the config corresponds to the test set instead of validation set.
+
+After generating the results, you can basically compress the folder and upload to the evalAI evaluation server for nuScenes 3D detection challenge.
+
+## Qualitative Validation
+
+MMDetection3D also provides versatile tools for visualization such that we can have an intuitive feeling of the detection results predicted by our trained models.
+You can either set the `--eval-options 'show=True' 'out_dir=${SHOW_DIR}'` option to visualize the detection results online during evaluation,
+or using `tools/misc/visualize_results.py` for offline visualization.
+
+Besides, we also provide scripts `tools/misc/browse_dataset.py` to visualize the dataset without inference.
+Please refer more details in the [doc for visualization](https://mmdetection3d.readthedocs.io/en/latest/useful_tools.html#visualization).
+
+Note that currently we only support the visualization on images for vision-only methods.
+The visualization in the perspective view and bird-eye-view (BEV) will be integrated in the future.
diff --git a/mmde/docs/en/api.rst b/mmde/docs/en/api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..777e6f472da5ce023073c80abb54166fb90df2d3
--- /dev/null
+++ b/mmde/docs/en/api.rst
@@ -0,0 +1,154 @@
+mmdet3d.apis
+--------------
+.. automodule:: mmdet3d.apis
+    :members:
+
+mmdet3d.datasets
+--------------
+
+datasets
+^^^^^^^^^^
+.. automodule:: mmdet3d.datasets
+    :members:
+
+transforms
+^^^^^^^^^^^^
+.. automodule:: mmdet3d.datasets.transforms
+    :members:
+
+mmdet3d.engine
+--------------
+
+hooks
+^^^^^^^^^^
+.. automodule:: mmdet3d.engine.hooks
+    :members:
+
+mmdet3d.evaluation
+--------------------
+
+functional
+^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.evaluation.functional
+    :members:
+
+metrics
+^^^^^^^^^^
+.. automodule:: mmdet3d.evaluation.metrics
+    :members:
+
+mmdet3d.models
+--------------
+
+backbones
+^^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.backbones
+    :members:
+
+data_preprocessors
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.data_preprocessors
+    :members:
+
+decode_heads
+^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.decode_heads
+    :members:
+
+dense_heads
+^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.dense_heads
+    :members:
+
+detectors
+^^^^^^^^^^
+.. automodule:: mmdet3d.models.detectors
+    :members:
+
+layers
+^^^^^^^^^^
+.. automodule:: mmdet3d.models.layers
+    :members:
+
+losses
+^^^^^^^^^^
+.. automodule:: mmdet3d.models.losses
+    :members:
+
+middle_encoders
+^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.middle_encoders
+    :members:
+
+necks
+^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.necks
+    :members:
+
+roi_heads
+^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.roi_heads
+    :members:
+
+segmentors
+^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.segmentors
+    :members:
+
+task_modules
+^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.task_modules
+    :members:
+
+test_time_augs
+^^^^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.test_time_augs
+    :members:
+
+utils
+^^^^^^^^^^
+.. automodule:: mmdet3d.models.utils
+    :members:
+
+voxel_encoders
+^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.voxel_encoders
+    :members:
+
+mmdet3d.structures
+--------------------
+
+structures
+^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.structures
+    :members:
+
+bbox_3d
+^^^^^^^^^^
+.. automodule:: mmdet3d.structures.bbox_3d
+    :members:
+
+ops
+^^^^^^^^^^
+.. automodule:: mmdet3d.structures.ops
+    :members:
+
+points
+^^^^^^^^^^
+.. automodule:: mmdet3d.structures.points
+    :members:
+
+mmdet3d.testing
+----------------
+.. automodule:: mmdet3d.testing
+    :members:
+
+mmdet3d.visualization
+--------------------
+.. automodule:: mmdet3d.visualization
+    :members:
+
+mmdet3d.utils
+--------------
+.. automodule:: mmdet3d.utils
+    :members:
diff --git a/mmde/docs/en/conf.py b/mmde/docs/en/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..342871c0989882796e8069c86abcfb5d1329c2a0
--- /dev/null
+++ b/mmde/docs/en/conf.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+
+import os
+import subprocess
+import sys
+
+import pytorch_sphinx_theme
+
+sys.path.insert(0, os.path.abspath('../../'))
+
+# -- Project information -----------------------------------------------------
+
+project = 'MMDetection3D'
+copyright = '2020-2023, OpenMMLab'
+author = 'MMDetection3D Authors'
+
+# The full version, including alpha/beta/rc tags
+version_file = '../../mmdet3d/version.py'
+with open(version_file) as f:
+    exec(compile(f.read(), version_file, 'exec'))
+__version__ = locals()['__version__']
+release = __version__
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.viewcode',
+    'sphinx_markdown_tables',
+    'sphinx_copybutton',
+    'myst_parser',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.autodoc.typehints',
+    'sphinx.ext.autosummary',
+    'sphinx.ext.autosectionlabel',
+    'sphinx_tabs.tabs',
+]
+autodoc_typehints = 'description'
+autodoc_mock_imports = ['mmcv._ext']
+autosummary_generate = True  # Turn on sphinx.ext.autosummary
+
+# Ignore >>> when copying code
+copybutton_prompt_text = r'>>> |\.\.\. '
+copybutton_prompt_is_regexp = True
+
+myst_enable_extensions = ['colon_fence']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+source_suffix = {
+    '.rst': 'restructuredtext',
+    '.md': 'markdown',
+}
+
+# The master toctree document.
+master_doc = 'index'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+# html_theme = 'sphinx_rtd_theme'
+html_theme = 'pytorch_sphinx_theme'
+html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
+html_theme_options = {
+    'menu': [
+        {
+            'name': 'GitHub',
+            'url': 'https://github.com/open-mmlab/mmdetection3d'
+        },
+        {
+            'name':
+            'Upstream',
+            'children': [
+                {
+                    'name':
+                    'MMEngine',
+                    'url':
+                    'https://github.com/open-mmlab/mmengine',
+                    'description':
+                    'Foundational library for training deep learning models'
+                },
+                {
+                    'name': 'MMCV',
+                    'url': 'https://github.com/open-mmlab/mmcv',
+                    'description': 'Foundational library for computer vision'
+                },
+                {
+                    'name': 'MMDetection',
+                    'url': 'https://github.com/open-mmlab/mmdetection',
+                    'description': 'Object detection toolbox and benchmark'
+                },
+            ]
+        },
+    ],
+    # Specify the language of shared menu
+    'menu_lang':
+    'en'
+}
+
+language = 'en'
+
+master_doc = 'index'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+html_css_files = [
+    'https://cdn.datatables.net/1.13.2/css/dataTables.bootstrap5.min.css',
+    'css/readthedocs.css'
+]
+html_js_files = [
+    'https://cdn.datatables.net/1.13.2/js/jquery.dataTables.min.js',
+    'https://cdn.datatables.net/1.13.2/js/dataTables.bootstrap5.min.js',
+    'js/collapsed.js',
+    'js/table.js',
+]
+
+myst_heading_anchors = 4
+
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3', None),
+    'numpy': ('https://numpy.org/doc/stable', None),
+    'torch': ('https://pytorch.org/docs/stable/', None),
+    'mmcv': ('https://mmcv.readthedocs.io/en/latest/', None),
+    'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None),
+    'mmdetection': ('https://mmdetection.readthedocs.io/en/latest/', None),
+}
+
+
+def builder_inited_handler(app):
+    subprocess.run(['./stat.py'])
+
+
+def setup(app):
+    app.connect('builder-inited', builder_inited_handler)
diff --git a/mmde/docs/en/get_started.md b/mmde/docs/en/get_started.md
new file mode 100644
index 0000000000000000000000000000000000000000..00437debb72c61ab2e7bd710c0923b67519f43f7
--- /dev/null
+++ b/mmde/docs/en/get_started.md
@@ -0,0 +1,302 @@
+# Get Started
+
+## Prerequisites
+
+In this section, we demonstrate how to prepare an environment with PyTorch.
+
+MMDetection3D works on Linux, Windows (experimental support) and macOS. It requires Python 3.7+, CUDA 10.0+, and PyTorch 1.8+.
+
+```{note}
+If you are experienced with PyTorch and have already installed it, just skip this part and jump to the [next section](#installation). Otherwise, you can follow these steps for the preparation.
+```
+
+**Step 0.** Download and install Miniconda from the [official website](https://docs.conda.io/en/latest/miniconda.html).
+
+**Step 1.** Create a conda environment and activate it.
+
+```shell
+conda create --name openmmlab python=3.8 -y
+conda activate openmmlab
+```
+
+**Step 2.** Install PyTorch following [official instructions](https://pytorch.org/get-started/locally/), e.g.
+
+On GPU platforms:
+
+```shell
+conda install pytorch torchvision -c pytorch
+```
+
+On CPU platforms:
+
+```shell
+conda install pytorch torchvision cpuonly -c pytorch
+```
+
+## Installation
+
+We recommend that users follow our best practices to install MMDetection3D. However, the whole process is highly customizable. See [Customize Installation](#customize-installation) section for more information.
+
+### Best Practices
+
+**Step 0.** Install [MMEngine](https://github.com/open-mmlab/mmengine), [MMCV](https://github.com/open-mmlab/mmcv) and [MMDetection](https://github.com/open-mmlab/mmdetection) using [MIM](https://github.com/open-mmlab/mim).
+
+```shell
+pip install -U openmim
+mim install mmengine
+mim install 'mmcv>=2.0.0rc4'
+mim install 'mmdet>=3.0.0'
+```
+
+**Note**: In MMCV-v2.x, `mmcv-full` is renamed to `mmcv`, if you want to install `mmcv` without CUDA ops, you can use `mim install "mmcv-lite>=2.0.0rc4"` to install the lite version.
+
+**Step 1.** Install MMDetection3D.
+
+Case a: If you develop and run mmdet3d directly, install it from source:
+
+```shell
+git clone https://github.com/open-mmlab/mmdetection3d.git -b dev-1.x
+# "-b dev-1.x" means checkout to the `dev-1.x` branch.
+cd mmdetection3d
+pip install -v -e .
+# "-v" means verbose, or more output
+# "-e" means installing a project in edtiable mode,
+# thus any local modifications made to the code will take effect without reinstallation.
+```
+
+Case b: If you use mmdet3d as a dependency or third-party package, install it with MIM:
+
+```shell
+mim install "mmdet3d>=1.1.0"
+```
+
+Note:
+
+1. If you would like to use `opencv-python-headless` instead of `opencv-python`,
+   you can install it before installing MMCV.
+
+2. Some dependencies are optional. Simply running `pip install -v -e .` will only install the minimum runtime requirements. To use optional dependencies like `albumentations` and `imagecorruptions` either install them manually with `pip install -r requirements/optional.txt` or specify desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`). Valid keys for the extras field are: `all`, `tests`, `build`, and `optional`.
+
+   We have supported `spconv 2.0`. If the user has installed `spconv 2.0`, the code will use `spconv 2.0` first, which will take up less GPU memory than using the default `mmcv spconv`. Users can use the following commands to install `spconv 2.0`:
+
+   ```shell
+   pip install cumm-cuxxx
+   pip install spconv-cuxxx
+   ```
+
+   Where `xxx` is the CUDA version in the environment.
+
+   For example, using CUDA 10.2, the command will be `pip install cumm-cu102 && pip install spconv-cu102`.
+
+   Supported CUDA versions include 10.2, 11.1, 11.3, and 11.4. Users can also install it by building from the source. For more details please refer to [spconv v2.x](https://github.com/traveller59/spconv).
+
+   We also support `Minkowski Engine` as a sparse convolution backend. If necessary please follow original [installation guide](https://github.com/NVIDIA/MinkowskiEngine#installation) or use `pip` to install it:
+
+   ```shell
+   conda install openblas-devel -c anaconda
+   export CPLUS_INCLUDE_PATH=CPLUS_INCLUDE_PATH:${YOUR_CONDA_ENVS_DIR}/include
+   # replace ${YOUR_CONDA_ENVS_DIR} to your anaconda environment path e.g. `/home/username/anaconda3/envs/openmmlab`.
+   pip install -U git+https://github.com/NVIDIA/MinkowskiEngine -v --no-deps --install-option="--blas_include_dirs=/opt/conda/include" --install-option="--blas=openblas"
+   ```
+
+   We also support `Torchsparse` as a sparse convolution backend. If necessary please follow original [installation guide](https://github.com/mit-han-lab/torchsparse#installation) or use `pip` to install it:
+
+   ```shell
+   sudo apt-get install libsparsehash-dev
+   pip install --upgrade git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0
+   ```
+
+   or omit sudo install by following command:
+
+   ```shell
+   conda install -c bioconda sparsehash
+   export CPLUS_INCLUDE_PATH=CPLUS_INCLUDE_PATH:${YOUR_CONDA_ENVS_DIR}/include
+   # replace ${YOUR_CONDA_ENVS_DIR} to your anaconda environment path e.g. `/home/username/anaconda3/envs/openmmlab`.
+   pip install --upgrade git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0
+   ```
+
+3. The code can not be built for CPU only environment (where CUDA isn't available) for now.
+
+### Verify the Installation
+
+To verify whether MMDetection3D is installed correctly, we provide some sample codes to run an inference demo.
+
+**Step 1.** We need to download config and checkpoint files.
+
+```shell
+mim download mmdet3d --config pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car --dest .
+```
+
+The downloading will take several seconds or more, depending on your network environment. When it is done, you will find two files `pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py` and `hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth` in your current folder.
+
+**Step 2.** Verify the inference demo.
+
+Case a: If you install MMDetection3D from source, just run the following command.
+
+```shell
+python demo/pcd_demo.py demo/data/kitti/000008.bin pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth --show
+```
+
+You will see a visualizer interface with point cloud, where bounding boxes are plotted on cars.
+
+**Note**:
+
+If you install MMDetection3D on a remote server without display device, you can leave out the `--show` argument. Demo will still save the predictions to  `outputs/pred/000008.json` file.
+
+**Note**:
+
+If you want to input a `.ply` file, you can use the following function and convert it to `.bin` format. Then you can use the converted `.bin` file to run demo.
+Note that you need to install `pandas` and `plyfile` before using this script. This function can also be used for data preprocessing for training `ply data`.
+
+```python
+import numpy as np
+import pandas as pd
+from plyfile import PlyData
+
+def convert_ply(input_path, output_path):
+    plydata = PlyData.read(input_path)  # read file
+    data = plydata.elements[0].data  # read data
+    data_pd = pd.DataFrame(data)  # convert to DataFrame
+    data_np = np.zeros(data_pd.shape, dtype=np.float)  # initialize array to store data
+    property_names = data[0].dtype.names  # read names of properties
+    for i, name in enumerate(
+            property_names):  # read data by property
+        data_np[:, i] = data_pd[name]
+    data_np.astype(np.float32).tofile(output_path)
+```
+
+Examples:
+
+```python
+convert_ply('./test.ply', './test.bin')
+```
+
+If you have point clouds in other format (`.off`, `.obj`, etc.), you can use `trimesh` to convert them into `.ply`.
+
+```python
+import trimesh
+
+def to_ply(input_path, output_path, original_type):
+    mesh = trimesh.load(input_path, file_type=original_type)  # read file
+    mesh.export(output_path, file_type='ply')  # convert to ply
+```
+
+Examples:
+
+```python
+to_ply('./test.obj', './test.ply', 'obj')
+```
+
+Case b: If you install MMDetection3D with MIM, open your python interpreter and copy&paste the following codes.
+
+```python
+from mmdet3d.apis import init_model, inference_detector
+
+config_file = 'pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py'
+checkpoint_file = 'hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth'
+model = init_model(config_file, checkpoint_file)
+inference_detector(model, 'demo/data/kitti/000008.bin')
+```
+
+You will see a list of `Det3DDataSample`, and the predictions are in the `pred_instances_3d`, indicating the detected bounding boxes, labels, and scores.
+
+### Customize Installation
+
+#### CUDA Versions
+
+When installing PyTorch, you need to specify the version of CUDA. If you are not clear on which to choose, follow our recommendations:
+
+- For Ampere-based NVIDIA GPUs, such as GeForce 30 series and NVIDIA A100, CUDA 11 is a must.
+- For older NVIDIA GPUs, CUDA 11 is backward compatible, but CUDA 10.2 offers better compatibility and is more lightweight.
+
+Please make sure the GPU driver satisfies the minimum version requirements. See [this table](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions) for more information.
+
+```{note}
+Installing CUDA runtime libraries is enough if you follow our best practices, because no CUDA code will be compiled locally. However if you hope to compile MMCV from source or develop other CUDA operators, you need to install the complete CUDA toolkit from NVIDIA's [website](https://developer.nvidia.com/cuda-downloads), and its version should match the CUDA version of PyTorch. i.e., the specified version of cudatoolkit in `conda install` command.
+```
+
+#### Install MMEngine without MIM
+
+To install MMEngine with pip instead of MIM, please follow [MMEngine installation guides](https://mmengine.readthedocs.io/en/latest/get_started/installation.html).
+
+For example, you can install MMEngine by the following command:
+
+```shell
+pip install mmengine
+```
+
+#### Install MMCV without MIM
+
+MMCV contains C++ and CUDA extensions, thus depending on PyTorch in a complex way. MIM solves such dependencies automatically and makes the installation easier. However, it is not a must.
+
+To install MMCV with pip instead of MIM, please follow [MMCV installation guides](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html). This requires manually specifying a find-url based on PyTorch version and its CUDA version.
+
+For example, the following command install MMCV built for PyTorch 1.12.x and CUDA 11.6:
+
+```shell
+pip install "mmcv>=2.0.0rc4" -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.12.0/index.html
+```
+
+#### Install on Google Colab
+
+[Google Colab](https://colab.research.google.com/) usually has PyTorch installed, thus we only need to install MMEngine, MMCV, MMDetection, and MMDetection3D with the following commands.
+
+**Step 1.** Install [MMEngine](https://github.com/open-mmlab/mmengine), [MMCV](https://github.com/open-mmlab/mmcv) and [MMDetection](https://github.com/open-mmlab/mmdetection) using [MIM](https://github.com/open-mmlab/mim).
+
+```shell
+!pip3 install openmim
+!mim install mmengine
+!mim install "mmcv>=2.0.0rc4,<2.1.0"
+!mim install "mmdet>=3.0.0,<3.1.0"
+```
+
+**Step 2.** Install MMDetection3D from source.
+
+```shell
+!git clone https://github.com/open-mmlab/mmdetection3d.git -b dev-1.x
+%cd mmdetection3d
+!pip install -e .
+```
+
+**Step 3.** Verification.
+
+```python
+import mmdet3d
+print(mmdet3d.__version__)
+# Example output: 1.1.0, or an another version.
+```
+
+```{note}
+Within Jupyter, the exclamation mark `!` is used to call external executables and `%cd` is a [magic command](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-cd) to change the current working directory of Python.
+```
+
+#### Using MMDetection3D with Docker
+
+We provide a [Dockerfile](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docker/Dockerfile) to build an image. Ensure that your [docker version](https://docs.docker.com/engine/install/) >= 19.03.
+
+```shell
+# build an image with PyTorch 1.9, CUDA 11.1
+# If you prefer other versions, just modified the Dockerfile
+docker build -t mmdetection3d docker/
+```
+
+Run it with:
+
+```shell
+docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmdetection3d/data mmdetection3d
+```
+
+### Troubleshooting
+
+If you have some issues during the installation, please first view the [FAQ](notes/faq.md) page.
+You may [open an issue](https://github.com/open-mmlab/mmdetection3d/issues/new/choose) on GitHub if no solution is found.
+
+### Use Multiple Versions of MMDetection3D in Development
+
+Training and testing scripts have already been modified in `PYTHONPATH` in order to make sure the scripts are using their own versions of MMDetection3D.
+
+To install the default version of MMDetection3D in your environment, you can exclude the following code in the related scripts:
+
+```shell
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH
+```
diff --git a/mmde/docs/en/index.rst b/mmde/docs/en/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0032f6ca0c7c4a160176432e93b6b501ae92617e
--- /dev/null
+++ b/mmde/docs/en/index.rst
@@ -0,0 +1,55 @@
+Welcome to MMDetection3D's documentation!
+==========================================
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Get Started
+
+   get_started.md
+
+.. toctree::
+   :maxdepth: 2
+   :caption: User Guides
+
+   user_guides/index.rst
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Advanced Guides
+
+   advanced_guides/index.rst
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Migrating from MMDetection3D 1.0
+
+   migration.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: API Reference
+
+   api.rst
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Model Zoo
+
+   model_zoo.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Notes
+
+   notes/index.rst
+
+.. toctree::
+   :caption: Switch Language
+
+   switch_language.md
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
diff --git a/mmde/docs/en/make.bat b/mmde/docs/en/make.bat
new file mode 100644
index 0000000000000000000000000000000000000000..922152e96a04a242e6fc40f124261d74890617d8
--- /dev/null
+++ b/mmde/docs/en/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/mmde/docs/en/migration.md b/mmde/docs/en/migration.md
new file mode 100644
index 0000000000000000000000000000000000000000..66b1de8d85fdec5f42e9f78bef643a2b76a486e2
--- /dev/null
+++ b/mmde/docs/en/migration.md
@@ -0,0 +1,33 @@
+Along with the release of OpenMMLab 2.0, MMDetection3D (namely MMDet3D) 1.1 made many significant changes, resulting in less redundant, more efficient code and a more consistent overall design. These changes break backward compatibility. Therefore, we prepared this migration guide to make the transition as smooth as possible so that all users can enjoy the productivity benefits of the new MMDet3D and the entire OpenMMLab 2.0 ecosystem.
+
+## Environment
+
+MMDet3D 1.1 depends on the new foundational library [MMEngine](https://github.com/open-mmlab/mmengine) for training deep learning models, and therefore has an entirely different dependency chain compared with MMDet3D 1.0. Even if you have a well-rounded MMDet3D 1.0 / 0.x environment before, you still need to create a new Python environment for MMDet3D 1.1. We provide a detailed [installation guide](./get_started.md) for reference.
+
+The configuration files in our new version have a lot of modifications because of the differences between MMCV 1.x and MMEngine. The guides for migration from MMCV to MMEngine can be seen [here](https://github.com/open-mmlab/mmengine/tree/main/docs/en/migration).
+
+We have renamed the names of the remote branches in MMDet3D 1.1 (renaming 1.1 to main, master to 1.0, and dev to dev-1.0). If your local branches in the git system are not aligned with branches of the remote repo, you can use the following commands to resolve it:
+
+```
+git fetch origin
+git checkout main
+git branch main_backup  # backup your main branch
+git reset --hard origin/main
+```
+
+## Dataset
+
+You should update the annotation files generated in the 1.0 version since some key words and structures of annotation in MMDet3D 1.1 have changed. Taking KITTI as an example, the update script is as follows:
+
+```python
+python tools/dataset_converters/update_infos_to_v2.py
+        --dataset kitti
+        --pkl-path ./data/kitti/kitti_infos_train.pkl
+        --out-dir ./kitti_v2/
+```
+
+If your annotation files are generated in the 0.x version, you should first update them to 1.0 version using this [script](../../tools/update_data_coords.py). Alternatively, you can re-generate annotation files from scratch using this [script](../../tools/create_data.py).
+
+## Model
+
+MMDet3D 1.1 supports loading weights trained on the old version (1.0 version). For models that are important or frequently used, we have thoroughly verified their precisions in the 1.1 version. Especially for some models that may experience potential performance drop or training bugs in the old version, such as [centerpoint](https://github.com/open-mmlab/mmdetection3d/issues/2390), we have checked them and ensured the right precision in the new version. If you encounter any problem, please feel free to raise an [issue](https://github.com/open-mmlab/mmdetection3d/issues). Additionally, we have added some of the latest SOTA methods in our [package](../../configs/) and [projects](../../projects/), making MMDet3D 1.1 a highly recommended choice for implementing your project.
diff --git a/mmde/docs/en/model_zoo.md b/mmde/docs/en/model_zoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..562fd257430952b105708c98f4cecad821e81045
--- /dev/null
+++ b/mmde/docs/en/model_zoo.md
@@ -0,0 +1,141 @@
+# Model Zoo
+
+## Common settings
+
+- We use distributed training.
+- For fair comparison with other codebases, we report the GPU memory as the maximum value of `torch.cuda.max_memory_allocated()` for all 8 GPUs. Note that this value is usually less than what `nvidia-smi` shows.
+- We report the inference time as the total time of network forwarding and post-processing, excluding the data loading time. Results are obtained with the script [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/analysis_tools/benchmark.py) which computes the average time on 2000 images.
+
+## Baselines
+
+### SECOND
+
+Please refer to [SECOND](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/second) for details. We provide SECOND baselines on KITTI and Waymo datasets.
+
+### PointPillars
+
+Please refer to [PointPillars](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/pointpillars) for details. We provide pointpillars baselines on KITTI, nuScenes, Lyft, and Waymo datasets.
+
+### Part-A2
+
+Please refer to [Part-A2](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/parta2) for details.
+
+### VoteNet
+
+Please refer to [VoteNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/votenet) for details. We provide VoteNet baselines on ScanNet and SUNRGBD datasets.
+
+### Dynamic Voxelization
+
+Please refer to [Dynamic Voxelization](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/dynamic_voxelization) for details.
+
+### MVXNet
+
+Please refer to [MVXNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/mvxnet) for details.
+
+### RegNetX
+
+Please refer to [RegNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/regnet) for details. We provide pointpillars baselines with RegNetX backbones on nuScenes and Lyft datasets currently.
+
+### nuImages
+
+We also support baseline models on [nuImages dataset](https://www.nuscenes.org/nuimages). Please refer to [nuImages](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/nuimages) for details. We report Mask R-CNN, Cascade Mask R-CNN and HTC results currently.
+
+### H3DNet
+
+Please refer to [H3DNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/h3dnet) for details.
+
+### 3DSSD
+
+Please refer to [3DSSD](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/3dssd) for details.
+
+### CenterPoint
+
+Please refer to [CenterPoint](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/centerpoint) for details.
+
+### SSN
+
+Please refer to [SSN](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/ssn) for details. We provide pointpillars with shape-aware grouping heads used in SSN on the nuScenes and Lyft datasets currently.
+
+### ImVoteNet
+
+Please refer to [ImVoteNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/imvotenet) for details. We provide ImVoteNet baselines on SUNRGBD dataset.
+
+### FCOS3D
+
+Please refer to [FCOS3D](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/fcos3d) for details. We provide FCOS3D baselines on the nuScenes dataset.
+
+### PointNet++
+
+Please refer to [PointNet++](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/pointnet2) for details. We provide PointNet++ baselines on ScanNet and S3DIS datasets.
+
+### Group-Free-3D
+
+Please refer to [Group-Free-3D](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/groupfree3d) for details. We provide Group-Free-3D baselines on ScanNet dataset.
+
+### ImVoxelNet
+
+Please refer to [ImVoxelNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/imvoxelnet) for details. We provide ImVoxelNet baselines on KITTI dataset.
+
+### PAConv
+
+Please refer to [PAConv](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/paconv) for details. We provide PAConv baselines on S3DIS dataset.
+
+### DGCNN
+
+Please refer to [DGCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/dgcnn) for details. We provide DGCNN baselines on S3DIS dataset.
+
+### SMOKE
+
+Please refer to [SMOKE](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/smoke) for details. We provide SMOKE baselines on KITTI dataset.
+
+### PGD
+
+Please refer to [PGD](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/pgd) for details. We provide PGD baselines on KITTI and nuScenes dataset.
+
+### PointRCNN
+
+Please refer to [PointRCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/point_rcnn) for details. We provide PointRCNN baselines on KITTI dataset.
+
+### MonoFlex
+
+Please refer to [MonoFlex](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/monoflex) for details. We provide MonoFlex baselines on KITTI dataset.
+
+### SA-SSD
+
+Please refer to [SA-SSD](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/sassd) for details. We provide SA-SSD baselines on the KITTI dataset.
+
+### FCAF3D
+
+Please refer to [FCAF3D](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/fcaf3d) for details. We provide FCAF3D baselines on the ScanNet, S3DIS, and SUN RGB-D datasets.
+
+### PV-RCNN
+
+Please refer to [PV-RCNN](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/pv_rcnn) for details. We provide PV-RCNN baselines on the KITTI dataset.
+
+### BEVFusion
+
+Please refer to [BEVFusion](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/BEVFusion) for details. We provide BEVFusion baselines on the NuScenes dataset.
+
+### CenterFormer
+
+Please refer to [CenterFormer](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/CenterFormer) for details. We provide CenterFormer baselines on the Waymo dataset.
+
+### TR3D
+
+Please refer to [TR3D](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/TR3D) for details. We provide TR3D baselines on the ScanNet, SUN RGB-D and S3DIS dataset.
+
+### DETR3D
+
+Please refer to [DETR3D](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/DETR3D) for details. We provide DETR3D baselines on the nuScenes dataset.
+
+### PETR
+
+Please refer to [PETR](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/PETR) for details. We provide PETR baselines on the nuScenes dataset.
+
+### TPVFormer
+
+Please refer to [TPVFormer](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/TPVFormer) for details. We provide TPVFormer baselines on the nuScenes dataset.
+
+### Mixed Precision (FP16) Training
+
+Please refer to [Mixed Precision (FP16) Training on PointPillars](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/pointpillars/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py) for details.
diff --git a/mmde/docs/en/notes/benchmarks.md b/mmde/docs/en/notes/benchmarks.md
new file mode 100644
index 0000000000000000000000000000000000000000..c84ffc831775a4b8357a1f96e6e5f137eed18e88
--- /dev/null
+++ b/mmde/docs/en/notes/benchmarks.md
@@ -0,0 +1,286 @@
+# Benchmarks
+
+Here we benchmark the training and testing speed of models in MMDetection3D,
+with some other open source 3D detection codebases.
+
+## Settings
+
+- Hardwares: 8 NVIDIA Tesla V100 (32G) GPUs, Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+- Software: Python 3.7, CUDA 10.1, cuDNN 7.6.5, PyTorch 1.3, numba 0.48.0.
+- Model: Since all the other codebases implements different models, we compare the corresponding models including SECOND, PointPillars, Part-A2, and VoteNet with them separately.
+- Metrics: We use the average throughput in iterations of the entire training run and skip the first 50 iterations of each epoch to skip GPU warmup time.
+
+## Main Results
+
+We compare the training speed (samples/s) with other codebases if they implement the similar models. The results are as below, the greater the numbers in the table, the faster of the training process. The models that are not supported by other codebases are marked by `×`.
+
+|       Methods       | MMDetection3D | OpenPCDet | votenet | Det3D |
+| :-----------------: | :-----------: | :-------: | :-----: | :---: |
+|       VoteNet       |      358      |     ×     |   77    |   ×   |
+|  PointPillars-car   |      141      |     ×     |    ×    |  140  |
+| PointPillars-3class |      107      |    44     |    ×    |   ×   |
+|       SECOND        |      40       |    30     |    ×    |   ×   |
+|       Part-A2       |      17       |    14     |    ×    |   ×   |
+
+## Details of Comparison
+
+### Modification for Calculating Speed
+
+- __MMDetection3D__: We try to use as similar settings as those of other codebases as possible using [benchmark configs](https://github.com/open-mmlab/MMDetection3D/blob/main/configs/benchmark).
+
+- __Det3D__: For comparison with Det3D, we use the commit [519251e](https://github.com/poodarchu/Det3D/tree/519251e72a5c1fdd58972eabeac67808676b9bb7).
+
+- __OpenPCDet__: For comparison with OpenPCDet, we use the commit [b32fbddb](https://github.com/open-mmlab/OpenPCDet/tree/b32fbddbe06183507bad433ed99b407cbc2175c2).
+
+  For training speed, we add code to record the running time in the file `./tools/train_utils/train_utils.py`. We calculate the speed of each epoch, and report the average speed of all the epochs.
+
+  <details>
+    <summary>
+    (diff to make it use the same method for benchmarking speed - click to expand)
+    </summary>
+
+  ```diff
+  diff --git a/tools/train_utils/train_utils.py b/tools/train_utils/train_utils.py
+  index 91f21dd..021359d 100644
+  --- a/tools/train_utils/train_utils.py
+  +++ b/tools/train_utils/train_utils.py
+  @@ -2,6 +2,7 @@ import torch
+   import os
+   import glob
+   import tqdm
+  +import datetime
+   from torch.nn.utils import clip_grad_norm_
+
+
+  @@ -13,7 +14,10 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
+       if rank == 0:
+           pbar = tqdm.tqdm(total=total_it_each_epoch, leave=leave_pbar, desc='train', dynamic_ncols=True)
+
+  +    start_time = None
+       for cur_it in range(total_it_each_epoch):
+  +        if cur_it > 49 and start_time is None:
+  +            start_time = datetime.datetime.now()
+           try:
+               batch = next(dataloader_iter)
+           except StopIteration:
+  @@ -55,9 +59,11 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
+                   tb_log.add_scalar('learning_rate', cur_lr, accumulated_iter)
+                   for key, val in tb_dict.items():
+                       tb_log.add_scalar('train_' + key, val, accumulated_iter)
+  +    endtime = datetime.datetime.now()
+  +    speed = (endtime - start_time).seconds / (total_it_each_epoch - 50)
+       if rank == 0:
+           pbar.close()
+  -    return accumulated_iter
+  +    return accumulated_iter, speed
+
+
+   def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_cfg,
+  @@ -65,6 +71,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
+                   lr_warmup_scheduler=None, ckpt_save_interval=1, max_ckpt_save_num=50,
+                   merge_all_iters_to_one_epoch=False):
+       accumulated_iter = start_iter
+  +    speeds = []
+       with tqdm.trange(start_epoch, total_epochs, desc='epochs', dynamic_ncols=True, leave=(rank == 0)) as tbar:
+           total_it_each_epoch = len(train_loader)
+           if merge_all_iters_to_one_epoch:
+  @@ -82,7 +89,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
+                   cur_scheduler = lr_warmup_scheduler
+               else:
+                   cur_scheduler = lr_scheduler
+  -            accumulated_iter = train_one_epoch(
+  +            accumulated_iter, speed = train_one_epoch(
+                   model, optimizer, train_loader, model_func,
+                   lr_scheduler=cur_scheduler,
+                   accumulated_iter=accumulated_iter, optim_cfg=optim_cfg,
+  @@ -91,7 +98,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
+                   total_it_each_epoch=total_it_each_epoch,
+                   dataloader_iter=dataloader_iter
+               )
+  -
+  +            speeds.append(speed)
+               # save trained model
+               trained_epoch = cur_epoch + 1
+               if trained_epoch % ckpt_save_interval == 0 and rank == 0:
+  @@ -107,6 +114,8 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
+                   save_checkpoint(
+                       checkpoint_state(model, optimizer, trained_epoch, accumulated_iter), filename=ckpt_name,
+                   )
+  +            print(speed)
+  +    print(f'*******{sum(speeds) / len(speeds)}******')
+
+
+   def model_state_to_cpu(model_state):
+  ```
+
+  </details>
+
+### VoteNet
+
+- __MMDetection3D__: With release v0.1.0, run
+
+  ```bash
+  ./tools/dist_train.sh configs/votenet/votenet_8xb16_sunrgbd-3d.py 8 --no-validate
+  ```
+
+- __votenet__: At commit [2f6d6d3](https://github.com/facebookresearch/votenet/tree/2f6d6d36ff98d96901182e935afe48ccee82d566), run
+
+  ```bash
+  python train.py --dataset sunrgbd --batch_size 16
+  ```
+
+  Then benchmark the test speed by running
+
+  ```bash
+  python eval.py --dataset sunrgbd --checkpoint_path log_sunrgbd/checkpoint.tar --batch_size 1 --dump_dir eval_sunrgbd --cluster_sampling seed_fps --use_3d_nms --use_cls_nms --per_class_proposal
+  ```
+
+  Note that eval.py is modified to compute inference time.
+
+  <details>
+  <summary>
+  (diff to benchmark the similar models - click to expand)
+  </summary>
+
+  ```diff
+  diff --git a/eval.py b/eval.py
+    index c0b2886..04921e9 100644
+    --- a/eval.py
+    +++ b/eval.py
+    @@ -10,6 +10,7 @@ import os
+     import sys
+     import numpy as np
+     from datetime import datetime
+    +import time
+     import argparse
+     import importlib
+     import torch
+    @@ -28,7 +29,7 @@ parser.add_argument('--checkpoint_path', default=None, help='Model checkpoint pa
+     parser.add_argument('--dump_dir', default=None, help='Dump dir to save sample outputs [default: None]')
+     parser.add_argument('--num_point', type=int, default=20000, help='Point Number [default: 20000]')
+     parser.add_argument('--num_target', type=int, default=256, help='Point Number [default: 256]')
+    -parser.add_argument('--batch_size', type=int, default=8, help='Batch Size during training [default: 8]')
+    +parser.add_argument('--batch_size', type=int, default=1, help='Batch Size during training [default: 8]')
+     parser.add_argument('--vote_factor', type=int, default=1, help='Number of votes generated from each seed [default: 1]')
+     parser.add_argument('--cluster_sampling', default='vote_fps', help='Sampling strategy for vote clusters: vote_fps, seed_fps, random [default: vote_fps]')
+     parser.add_argument('--ap_iou_thresholds', default='0.25,0.5', help='A list of AP IoU thresholds [default: 0.25,0.5]')
+    @@ -132,6 +133,7 @@ CONFIG_DICT = {'remove_empty_box': (not FLAGS.faster_eval), 'use_3d_nms': FLAGS.
+     # ------------------------------------------------------------------------- GLOBAL CONFIG END
+
+     def evaluate_one_epoch():
+    +    time_list = list()
+         stat_dict = {}
+         ap_calculator_list = [APCalculator(iou_thresh, DATASET_CONFIG.class2type) \
+             for iou_thresh in AP_IOU_THRESHOLDS]
+    @@ -144,6 +146,8 @@ def evaluate_one_epoch():
+
+             # Forward pass
+             inputs = {'point_clouds': batch_data_label['point_clouds']}
+    +        torch.cuda.synchronize()
+    +        start_time = time.perf_counter()
+             with torch.no_grad():
+                 end_points = net(inputs)
+
+    @@ -161,6 +165,12 @@ def evaluate_one_epoch():
+
+             batch_pred_map_cls = parse_predictions(end_points, CONFIG_DICT)
+             batch_gt_map_cls = parse_groundtruths(end_points, CONFIG_DICT)
+    +        torch.cuda.synchronize()
+    +        elapsed = time.perf_counter() - start_time
+    +        time_list.append(elapsed)
+    +
+    +        if len(time_list==200):
+    +            print("average inference time: %4f"%(sum(time_list[5:])/len(time_list[5:])))
+             for ap_calculator in ap_calculator_list:
+                 ap_calculator.step(batch_pred_map_cls, batch_gt_map_cls)
+
+  ```
+
+### PointPillars-car
+
+- __MMDetection3D__: With release v0.1.0, run
+
+  ```bash
+  ./tools/dist_train.sh configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py 8 --no-validate
+  ```
+
+- __Det3D__: At commit [519251e](https://github.com/poodarchu/Det3D/tree/519251e72a5c1fdd58972eabeac67808676b9bb7), use `kitti_point_pillars_mghead_syncbn.py` and run
+
+  ```bash
+  ./tools/scripts/train.sh --launcher=slurm --gpus=8
+  ```
+
+  Note that the config in train.sh is modified to train point pillars.
+
+  <details>
+  <summary>
+  (diff to benchmark the similar models - click to expand)
+  </summary>
+
+  ```diff
+  diff --git a/tools/scripts/train.sh b/tools/scripts/train.sh
+  index 3a93f95..461e0ea 100755
+  --- a/tools/scripts/train.sh
+  +++ b/tools/scripts/train.sh
+  @@ -16,9 +16,9 @@ then
+   fi
+
+   # Voxelnet
+  -python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py examples/second/configs/  kitti_car_vfev3_spmiddlefhd_rpn1_mghead_syncbn.py --work_dir=$SECOND_WORK_DIR
+  +# python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py examples/second/configs/  kitti_car_vfev3_spmiddlefhd_rpn1_mghead_syncbn.py --work_dir=$SECOND_WORK_DIR
+   # python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py examples/cbgs/configs/  nusc_all_vfev3_spmiddleresnetfhd_rpn2_mghead_syncbn.py --work_dir=$NUSC_CBGS_WORK_DIR
+   # python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py examples/second/configs/  lyft_all_vfev3_spmiddleresnetfhd_rpn2_mghead_syncbn.py --work_dir=$LYFT_CBGS_WORK_DIR
+
+   # PointPillars
+  -# python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py ./examples/point_pillars/configs/  original_pp_mghead_syncbn_kitti.py --work_dir=$PP_WORK_DIR
+  +python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py ./examples/point_pillars/configs/  kitti_point_pillars_mghead_syncbn.py
+  ```
+
+  </details>
+
+### PointPillars-3class
+
+- __MMDetection3D__: With release v0.1.0, run
+
+  ```bash
+  ./tools/dist_train.sh configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py 8 --no-validate
+  ```
+
+- __OpenPCDet__: At commit [b32fbddb](https://github.com/open-mmlab/OpenPCDet/tree/b32fbddbe06183507bad433ed99b407cbc2175c2), run
+
+  ```bash
+  cd tools
+  sh scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} 8  --cfg_file ./cfgs/kitti_models/pointpillar.yaml --batch_size 32  --workers 32 --epochs 80
+  ```
+
+### SECOND
+
+For SECOND, we mean the [SECONDv1.5](https://github.com/traveller59/second.pytorch/blob/master/second/configs/all.fhd.config) that was first implemented in [second.Pytorch](https://github.com/traveller59/second.pytorch). Det3D's implementation of SECOND uses its self-implemented Multi-Group Head, so its speed is not compatible with other codebases.
+
+- __MMDetection3D__: With release v0.1.0, run
+
+  ```bash
+  ./tools/dist_train.sh configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py 8 --no-validate
+  ```
+
+- __OpenPCDet__: At commit [b32fbddb](https://github.com/open-mmlab/OpenPCDet/tree/b32fbddbe06183507bad433ed99b407cbc2175c2), run
+
+  ```bash
+  cd tools
+  sh ./scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} 8  --cfg_file ./cfgs/kitti_models/second.yaml --batch_size 32  --workers 32 --epochs 80
+  ```
+
+### Part-A2
+
+- __MMDetection3D__: With release v0.1.0, run
+
+  ```bash
+  ./tools/dist_train.sh configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py 8 --no-validate
+  ```
+
+- __OpenPCDet__: At commit [b32fbddb](https://github.com/open-mmlab/OpenPCDet/tree/b32fbddbe06183507bad433ed99b407cbc2175c2), train the model by running
+
+  ```bash
+  cd tools
+  sh ./scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} 8  --cfg_file ./cfgs/kitti_models/PartA2.yaml --batch_size 32 --workers 32 --epochs 80
+  ```
diff --git a/mmde/docs/en/notes/changelog.md b/mmde/docs/en/notes/changelog.md
new file mode 100644
index 0000000000000000000000000000000000000000..fd038b8b5f701c4dd2307ec1f1767b63af97c9b5
--- /dev/null
+++ b/mmde/docs/en/notes/changelog.md
@@ -0,0 +1,418 @@
+# Changelog of v1.1
+
+### v1.4.0 (8/1/2024)
+
+#### Highlights
+
+- Refactor Waymo dataset (#2836)
+- Support the training of [DSVT](<(https://arxiv.org/abs/2301.06051)>) in `projects` (#2738)
+- Support [Nerf-Det](https://arxiv.org/abs/2307.14620) in `projects` (#2732)
+
+#### New Features
+
+- Support the training of [DSVT](<(https://arxiv.org/abs/2301.06051)>) in `projects` (#2738)
+- Support [Nerf-Det](https://arxiv.org/abs/2307.14620) in `projects` (#2732)
+- Support [MV-FCOS3D++](https://arxiv.org/abs/2207.12716)
+- Refactor Waymo dataset (#2836)
+
+#### Improvements
+
+- Support [PGD](https://arxiv.org/abs/2107.14160)) (front-of-view / multi-view) on Waymo dataset (#2835)
+- Release new [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) for verify some methods or debug quickly (#2835)
+
+#### Bug Fixes
+
+- Fix MinkUNet and SPVCNN some wrong configs (#2854)
+- Fix incorrect number of arguments in PETR (#2800)
+- Delete unused files in `mmdet3d/configs` (#2773)
+
+#### Contributors
+
+A total of 5 developers contributed to this release.
+
+@sunjiahao1999, @WendellZ524, @Yanyirong, @JingweiZhang12, @Tai-Wang
+
+### v1.3.0 (18/10/2023)
+
+#### Highlights
+
+- Support [CENet](https://arxiv.org/abs/2207.12691) in `projects` (#2619)
+- Enhance demos with new 3D inferencers (#2763)
+
+#### New Features
+
+- Support [CENet](https://arxiv.org/abs/2207.12691) in `projects` (#2619)
+
+#### Improvements
+
+- Enhance demos with new 3D inferencers (#2763)
+- Add BEV-based detection pipeline in nuScenes dataset tutorial (#2672)
+- Add the new config type of Cylinder3D in `mmdet3d/configs` (#2681)
+- Update [New Config Type](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) (#2655)
+- Update the QR code in README.md (#2703)
+
+#### Bug Fixes
+
+- Fix the download script of nuScenes dataset (#2660)
+- Fix circleCI and GitHub workflow configuration (#2652)
+- Fix the version of Open3D in requirements (#2633)
+- Fix unused files in `mmdet3d/configs` (#2773)
+- Fix support devices in FreeAnchor3DHead (#2769)
+- Fix readthedocs building and link (#2739, #2650)
+- Fix the pitch angle bug in LaserMix (#2710)
+
+#### Contributors
+
+A total of 6 developers contributed to this release.
+
+@sunjiahao1999, @Xiangxu-0103, @ZhaoCake, @LRJKD, @crazysteeaam, @wep21, @zhiqwang
+
+### v1.2.0 (4/7/2023)
+
+#### Highlights
+
+- Support [New Config Type](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) in `mmdet3d/config`  (#2608)
+- Support the inference of [DSVT](<(https://arxiv.org/abs/2301.06051)>) in `projects`  (#2606)
+- Support downloading datasets from [OpenDataLab](https://opendatalab.com/) using `mim`  (#2593)
+
+#### New Features
+
+- Support [New Config Type](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) in `mmdet3d/config`  (#2608)
+- Support the inference of [DSVT](<(https://arxiv.org/abs/2301.06051)>) in `projects`  (#2606)
+- Support downloading datasets from [OpenDataLab](https://opendatalab.com/) using `mim`  (#2593)
+
+#### Improvements
+
+- Enhanced visualization in interactive form (#2611)
+- Update README.md and Model Zoo (#2599, #2600)
+- Speed up S3DIS data preparation (#2585)
+
+#### Bug Fixes
+
+- Remove PointRCNN in benchmark training (#2610)
+- Fix wrong indoor detection visualization (#2625)
+- Fix MinkUNet download link (#2590)
+- Fix the formula in the `readthedocs` (#2580)
+
+#### Contributors
+
+A total of 5 developers contributed to this release.
+
+@sunjiahao1999, @Xiangxu-0103, @JingweiZhang12, @col14m, @zhulf0804
+
+### v1.1.1 (30/5/2023)
+
+#### Highlights
+
+- Support [TPVFormer](https://arxiv.org/pdf/2302.07817.pdf) in `projects` (#2399, #2517, #2535)
+- Support the training of BEVFusion in `projects` (#2546)
+- Support lidar-based 3D semantic segmentation benchmark (#2530, #2559)
+
+#### New Features
+
+- Support [TPVFormer](https://arxiv.org/pdf/2302.07817.pdf) in `projects` (#2399, #2517, #2535)
+- Support the training of \[BEVFusion\] in `projects` (#2558)
+- Support lidar-based 3D Semantic Segmentation Benchmark (#2530, #2559)
+- Support test-time augmentation for Segmentor (#2382)
+- Support `Minkowski ConvModule` and `Residual` Block (#2528)
+- Support the visualization of multi-view images in multi-modal methods (#2453)
+
+#### Improvements
+
+- Upload checkpoints and training log of PETR (#2555)
+- Replace `np.float` by default `float` in segmentation evaluation (#2527)
+- Add docs of converting SemanticKITTI datasets (#2515)
+- Support different colors for different classes in visualization (#2500)
+- Support tensor-like operations for `BaseInstance3DBoxes` and `BasePoint`
+- Add information of LiDAR Segmentation in NuScenes annotation files
+- Provide annotation files of datasets generated offline (#2457)
+- Refactor document structure (#2429)
+- Complete typehints and docstring (#2396, #2457, #2468, #2464, #2485)
+
+#### Bug Fixes
+
+- Fix the bug of abnormal loss when training SECOND in Automatic mixed precision(AMP) mode (#2452)
+- Add a warning in function `post_process_coords` in mmdet3d/dataset/convert_utils.py (#2557)
+- Fix invalid configs (#2477, #2536)
+- Fix bugs of unit test (#2466)
+- Update `local-rank` argument in test.py for pytorch 2.0 (#2469)
+- Fix docker file (#2451)
+- Fix demo and visualization (#2453)
+- Fix SUN RGB-D data converter (#2440)
+- Fix readthedocs building (#2459, #2419, #2505, #2396)
+- Fix CI #(2445)
+- Fix the version error of `torch` in github merge stage test (#2424)
+- Loose the version restriction of `numba` (#2416)
+
+#### Contributors
+
+A total of 10 developers contributed to this release.
+
+@sunjiahao1999, @Xiangxu-0103, @JingweiZhang12, @chriscarving, @jaan1729, @pd-michaelstanley, @filaPro, @kabouzeid, @A-new-b, @lbin
+
+### v1.1.0 (6/4/2023)
+
+#### Highlights
+
+- Support [Cylinder3D](https://arxiv.org/pdf/2011.10033.pdf) (#2291, #2344, #2350)
+- Support [MinkUnet](https://arxiv.org/abs/1904.08755) (#2294, #2358)
+- Support [SPVCNN](https://arxiv.org/abs/2007.16100) (#2320，#2372)
+- Support [TR3D](https://arxiv.org/abs/2302.02858) detector in `projects` (#2274)
+- Support the inference of [BEVFusion](https://arxiv.org/abs/2205.13542) in `projects` (#2175)
+- Support [DETR3D](https://arxiv.org/abs/2110.06922) in `projects` (#2173)
+
+#### New Features
+
+- Support [Cylinder3D](https://arxiv.org/pdf/2011.10033.pdf) (#2291, #2344, #2350)
+- Support [MinkUnet](https://arxiv.org/abs/1904.08755) (#2294, #2358)
+- Support [SPVCNN](https://arxiv.org/abs/2007.16100) (#2320，#2372)
+- Support [TR3D](https://arxiv.org/abs/2302.02858) detector in `projects` (#2274)
+- Support the inference of [BEVFusion](https://arxiv.org/abs/2205.13542) in `projects` (#2175)
+- Support [DETR3D](https://arxiv.org/abs/2110.06922) in `projects` (#2173)
+- Support PolarMix and LaserMix augmentation (#2265, #2302)
+- Support loading annotation of panoptic segmentation (#2223)
+- Support panoptic segmentation metric (#2230)
+- Add inferencer for LiDAR-based, monocular and multi-modality 3D detection (#2208, #2190, #2342)
+- Add inferencer for LiDAR-based segmentation (#2304)
+
+#### Improvements
+
+- Support `lazy_init` for CBGSDataset (#2271)
+- Support generating annotation files for test set on Waymo  (#2180)
+- Enhance the support for SemanticKitti (#2253, #2323)
+- File I/O migration and reconstruction (#2319)
+- Support `format_only` option for Lyft, NuScenes and Waymo datasets (#2333, #2151)
+- Replace `np.transpose` with `torch.permute` to speed up (#2277)
+- Allow setting local-rank for pytorch 2.0 (#2387)
+
+#### Bug Fixes
+
+- Fix the problem of reversal of length and width when drawing heatmap in CenterFormer (#2362)
+- Deprecate old type alias due to the new version of numpy (#2339)
+- Lose `trimesh` version requirements to fix numpy random state (#2340)
+- Fix the device mismatch error in CenterPoint (#2308)
+- Fix bug of visualization when there are no bboxes (#2231)
+- Fix bug of counting ignore index in IOU in segmentation evaluation (#2229)
+
+#### Contributors
+
+A total of 14 developers contributed to this release.
+
+@ZLTJohn, @SekiroRong, @shufanwu, @vansin, @triple-Mu, @404Vector, @filaPro, @sunjiahao1999, @Ginray, @Xiangxu-0103, @JingweiZhang12, @DezeZhao, @ZCMax, @roger-lcc
+
+### v1.1.0rc3 (7/1/2023)
+
+#### Highlights
+
+- Support [CenterFormer](https://arxiv.org/abs/2209.05588) in `projects` (#2175)
+- Support [PETR](https://arxiv.org/abs/2203.05625) in `projects` (#2173)
+
+#### New Features
+
+- Support [CenterFormer](https://arxiv.org/abs/2209.05588) in `projects` (#2175)
+- Support [PETR](https://arxiv.org/abs/2203.05625) in `projects` (#2173)
+- Refactor ImVoxelNet on SUN RGB-D into mmdet3d v1.1 (#2141)
+
+#### Improvements
+
+- Remove legacy builder.py (#2061)
+- Update `customize_dataset` documentation (#2153)
+- Update tutorial of LiDAR-based detection (#2120)
+
+#### Bug Fixes
+
+- Fix the configs of FCOS3D and PGD (#2191)
+- Fix numpy's `ValueError` in update_infos_to_v2.py (#2162)
+- Fix parameter missing in Det3DVisualizationHook (#2118)
+- Fix memory overflow in the rotated box IoU calculation (#2134)
+- Fix lidar2cam error in update_infos_to_v2.py for nus and lyft dataset (#2110)
+- Fix error of data type in Waymo metrics (#2109)
+- Update `bbox_3d` information in `cam_instances` for mono3d detection task (#2046)
+- Fix label saving of Waymo dataset (#2096)
+
+#### Contributors
+
+A total of 10 developers contributed to this release.
+
+@SekiroRong, @ZLTJohn, @vansin, @shanmo, @VVsssssk, @ZCMax, @Xiangxu-0103, @JingweiZhang12, @Tai-Wang, @lianqing11
+
+### v1.1.0rc2 (2/12/2022)
+
+#### Highlights
+
+- Support [PV-RCNN](https://arxiv.org/abs/1912.13192)
+- Speed up evaluation on Waymo dataset
+
+#### New Features
+
+- Support [PV-RCNN](https://arxiv.org/abs/1912.13192) (#1597, #2045)
+- Speed up evaluation on Waymo dataset (#2008)
+- Refactor FCAF3D into the framework of mmdet3d v1.1 (#1945)
+- Refactor S3DIS dataset into the framework of mmdet3d v1.1 (#1984)
+- Add `Projects/` folder and the first example project (#2042)
+
+#### Improvements
+
+- Rename `CLASSES` and `PALETTE` to `classes` and `palette` respectively (#1932)
+- Update `metainfo` in pkl files and add `categories` into metainfo (#1934)
+- Show instance statistics before and after through the pipeline (#1863)
+- Add configs of DGCNN for different testing areas (#1967)
+- Remove testing utils from `tests/utils/` to `mmdet3d/testing/` (#2012)
+- Add typehint for code in `models/layers/` (#2014)
+- Refine documentation (#1891, #1994)
+- Refine voxelization for better speed (#2062)
+
+#### Bug Fixes
+
+- Fix loop visualization error about point cloud (#1914)
+- Fix image conversion of Waymo to avoid information loss (#1979)
+- Fix evaluation on KITTI testset (#2005)
+- Fix sampling bug in `IoUNegPiecewiseSampler` (#2017)
+- Fix point cloud range in CenterPoint (#1998)
+- Fix some loading bugs and support FOV-image-based mode on Waymo dataset (#1942)
+- Fix dataset conversion utils (#1923, #2040, #1971)
+- Update metafiles in all the configs (#2006)
+
+#### Contributors
+
+A total of 12 developers contributed to this release.
+
+@vavanade, @oyel, @thinkthinking, @PeterH0323， @274869388, @cxiang26, @lianqing11, @VVsssssk, @ZCMax, @Xiangxu-0103, @JingweiZhang12, @Tai-Wang
+
+### v1.1.0rc1 (11/10/2022)
+
+#### Highlights
+
+- Support a camera-only 3D detection baseline on Waymo, [MV-FCOS3D++](https://arxiv.org/abs/2207.12716)
+
+#### New Features
+
+- Support a camera-only 3D detection baseline on Waymo, [MV-FCOS3D++](https://arxiv.org/abs/2207.12716), with new evaluation metrics and transformations (#1716)
+- Refactor PointRCNN in the framework of mmdet3d v1.1 (#1819)
+
+#### Improvements
+
+- Add `auto_scale_lr` in config to support training with auto-scale learning rates (#1807)
+- Fix CI (#1813, #1865, #1877)
+- Update `browse_dataset.py` script (#1817)
+- Update SUN RGB-D and Lyft datasets documentation (#1833)
+- Rename `convert_to_datasample` to `add_pred_to_datasample` in detectors (#1843)
+- Update customized dataset documentation (#1845)
+- Update `Det3DLocalVisualization` and visualization documentation (#1857)
+- Add the code of generating `cam_sync_labels` for Waymo dataset (#1870)
+- Update dataset transforms typehints (#1875)
+
+#### Bug Fixes
+
+- Fix missing registration of models in [setup_env.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/utils/setup_env.py) (#1808)
+- Fix the data base sampler bugs when using the ground plane data (#1812)
+- Add output directory existing check during visualization (#1828)
+- Fix bugs of nuScenes dataset for monocular 3D detection (#1837)
+- Fix visualization hook to support the visualization of different data modalities (#1839)
+- Fix monocular 3D detection demo (#1864)
+- Fix the lack of `num_pts_feats` key in nuscenes dataset and complete docstring (#1882)
+
+#### Contributors
+
+A total of 10 developers contributed to this release.
+
+@ZwwWayne, @Tai-Wang, @lianqing11, @VVsssssk, @ZCMax, @Xiangxu-0103, @JingweiZhang12, @tpoisonooo, @ice-tong, @jshilong
+
+### v1.1.0rc0 (1/9/2022)
+
+We are excited to announce the release of MMDetection3D 1.1.0rc0.
+MMDet3D 1.1.0rc0 is the first version of MMDetection3D 1.1, a part of the OpenMMLab 2.0 projects.
+Built upon the new [training engine](https://github.com/open-mmlab/mmengine) and [MMDet 3.x](https://github.com/open-mmlab/mmdetection/tree/3.x),
+MMDet3D 1.1 unifies the interfaces of dataset, models, evaluation, and visualization with faster training and testing speed.
+It also provides a standard data protocol for different datasets, modalities, and tasks for 3D perception.
+We will support more strong baselines in the future release, with our latest exploration on camera-only 3D detection from videos.
+
+### Highlights
+
+1. **New engines**. MMDet3D 1.1 is based on [MMEngine](https://github.com/open-mmlab/mmengine) and [MMDet 3.x](https://github.com/open-mmlab/mmdetection/tree/3.x), which provides a universal and powerful runner that allows more flexible customizations and significantly simplifies the entry points of high-level interfaces.
+
+2. **Unified interfaces**. As a part of the OpenMMLab 2.0 projects, MMDet3D 1.1 unifies and refactors the interfaces and internal logics of train, testing, datasets, models, evaluation, and visualization. All the OpenMMLab 2.0 projects share the same design in those interfaces and logics to allow the emergence of multi-task/modality algorithms.
+
+3. **Standard data protocol for all the datasets, modalities, and tasks for 3D perception**. Based on the unified base datasets inherited from MMEngine, we also design a standard data protocol that defines and unifies the common keys across different datasets, tasks, and modalities. It significantly simplifies the usage of multiple datasets and data modalities for multi-task frameworks and eases dataset customization. Please refer to the [documentation of customized datasets](../advanced_guides/customize_dataset.md) for details.
+
+4. **Strong baselines**. We will release strong baselines of many popular models to enable fair comparisons among state-of-the-art models.
+
+5. **More documentation and tutorials**. We add a bunch of documentation and tutorials to help users get started more smoothly. Read it [here](https://mmdetection3d.readthedocs.io/en/1.1/).
+
+### Breaking Changes
+
+MMDet3D 1.1 has undergone significant changes to have better design, higher efficiency, more flexibility, and more unified interfaces.
+Besides the changes of API, we briefly list the major breaking changes in this section.
+We will update the [migration guide](../migration.md) to provide complete details and migration instructions.
+Users can also refer to the [compatibility documentation](./compatibility.md) and [API doc](https://mmdetection3d.readthedocs.io/en/1.1/) for more details.
+
+#### Dependencies
+
+- MMDet3D 1.1 runs on PyTorch>=1.6. We have deprecated the support of PyTorch 1.5 to embrace the mixed precision training and other new features since PyTorch 1.6. Some models can still run on PyTorch 1.5, but the full functionality of MMDet3D 1.1 is not guaranteed.
+- MMDet3D 1.1 relies on MMEngine to run. MMEngine is a new foundational library for training deep learning models of OpenMMLab and are widely depended by OpenMMLab 2.0 projects. The dependencies of file IO and training are migrated from MMCV 1.x to MMEngine.
+- MMDet3D 1.1 relies on MMCV>=2.0.0rc0. Although MMCV no longer maintains the training functionalities since 2.0.0rc0, MMDet3D 1.1 relies on the data transforms, CUDA operators, and image processing interfaces in MMCV. Note that the package `mmcv` is the version that provides pre-built CUDA operators and `mmcv-lite` does not since MMCV 2.0.0rc0, while `mmcv-full` has been deprecated since 2.0.0rc0.
+- MMDet3D 1.1 is based on MMDet 3.x, which is also a part of OpenMMLab 2.0 projects.
+
+#### Training and testing
+
+- MMDet3D 1.1 uses Runner in [MMEngine](https://github.com/open-mmlab/mmengine) rather than that in MMCV. The new Runner implements and unifies the building logic of dataset, model, evaluation, and visualizer. Therefore, MMDet3D 1.1 no longer relies on the building logics of those modules in `mmdet3d.train.apis` and `tools/train.py`. Those code have been migrated into [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py). Please refer to the [migration guide of Runner in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/runner.html) for more details.
+- The Runner in MMEngine also supports testing and validation. The testing scripts are also simplified, which has similar logic as that in training scripts to build the runner.
+- The execution points of hooks in the new Runner have been enriched to allow more flexible customization. Please refer to the [migration guide of Hook in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/hook.html) for more details.
+- Learning rate and momentum scheduling has been migrated from Hook to [Parameter Scheduler in MMEngine](https://mmengine.readthedocs.io/en/latest/tutorials/param_scheduler.html). Please refer to the [migration guide of Parameter Scheduler in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/param_scheduler.html) for more details.
+
+#### Configs
+
+- The [Runner in MMEngine](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py) uses a different config structure to ease the understanding of the components in runner. Users can read the [config example of MMDet3D 1.1](../user_guides/config.md) or refer to the [migration guide in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/runner.html) for migration details.
+- The file names of configs and models are also refactored to follow the new rules unified across OpenMMLab 2.0 projects. The names of checkpoints are not updated for now as there is no BC-breaking of model weights between MMDet3D 1.1 and 1.0.x. We will progressively replace all the model weights by those trained in MMDet3D 1.1. Please refer to the [user guides of config](../user_guides/config.md) for more details.
+
+#### Dataset
+
+The Dataset classes implemented in MMDet3D 1.1 all inherits from the `Det3DDataset` and `Seg3DDataset`, which inherits from the [BaseDataset in MMEngine](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html). In addition to the changes of interfaces, there are several changes of Dataset in MMDet3D 1.1.
+
+- All the datasets support to serialize the internal data list to reduce the memory when multiple workers are built for data loading.
+- The internal data structure in the dataset is changed to be self-contained (without losing information like class names in MMDet3D 1.0.x) while keeping simplicity.
+- Common keys across different datasets and data modalities are defined and all the info files are unified into a standard protocol.
+- The evaluation functionality of each dataset has been removed from dataset so that some specific evaluation metrics like KITTI AP can be used to evaluate the prediction on other datasets.
+
+#### Data Transforms
+
+The data transforms in MMDet3D 1.1 all inherits from `BaseTransform` in MMCV>=2.0.0rc0, which defines a new convention in OpenMMLab 2.0 projects.
+Besides the interface changes, there are several changes listed as below:
+
+- The functionality of some data transforms (e.g., `Resize`) are decomposed into several transforms to simplify and clarify the usages.
+- The format of data dict processed by each data transform is changed according to the new data structure of dataset.
+- Some inefficient data transforms (e.g., normalization and padding) are moved into data preprocessor of model to improve data loading and training speed.
+- The same data transforms in different OpenMMLab 2.0 libraries have the same augmentation implementation and the logic given the same arguments, i.e., `Resize` in MMDet 3.x and MMSeg 1.x will resize the image in the exact same manner given the same arguments.
+
+#### Model
+
+The models in MMDet3D 1.1 all inherits from `BaseModel` in MMEngine, which defines a new convention of models in OpenMMLeb 2.0 projects.
+Users can refer to [the tutorial of model in MMengine](https://mmengine.readthedocs.io/en/latest/tutorials/model.html) for more details.
+Accordingly, there are several changes as the following:
+
+- The model interfaces, including the input and output formats, are significantly simplified and unified following the new convention in MMDet3D 1.1.
+  Specifically, all the input data in training and testing are packed into `inputs` and `data_samples`, where `inputs` contains model inputs like a dict contain a list of image tensors and the point cloud data, and `data_samples` contains other information of the current data sample such as ground truths, region proposals, and model predictions. In this way, different tasks in MMDet3D 1.1 can share the same input arguments, which makes the models more general and suitable for multi-task learning and some flexible training paradigms like semi-supervised learning.
+- The model has a data preprocessor module, which are used to pre-process the input data of model. In MMDet3D 1.1, the data preprocessor usually does necessary steps to form the input images into a batch, such as padding. It can also serve as a place for some special data augmentations or more efficient data transformations like normalization.
+- The internal logic of model have been changed. In MMDet3D 1.1, model uses `forward_train`, `forward_test`, `simple_test`, and `aug_test` to deal with different model forward logics. In MMDet3D 1.1 and OpenMMLab 2.0, the forward function has three modes: 'loss', 'predict', and 'tensor' for training, inference, and tracing or other purposes, respectively.
+  The forward function calls `self.loss`, `self.predict`, and `self._forward` given the modes 'loss', 'predict', and 'tensor', respectively.
+
+#### Evaluation
+
+The evaluation in MMDet3D 1.0.x strictly binds with the dataset. In contrast, MMDet3D 1.1 decomposes the evaluation from dataset, so that all the detection dataset can evaluate with KITTI AP and other metrics implemented in MMDet3D 1.1.
+MMDet3D 1.1 mainly implements corresponding metrics for each dataset, which are manipulated by [Evaluator](https://mmengine.readthedocs.io/en/latest/design/evaluator.html) to complete the evaluation.
+Users can build evaluator in MMDet3D 1.1 to conduct offline evaluation, i.e., evaluate predictions that may not produced in MMDet3D 1.1 with the dataset as long as the dataset and the prediction follows the dataset conventions. More details can be find in the [tutorial in mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html).
+
+#### Visualization
+
+The functions of visualization in MMDet3D 1.1 are removed. Instead, in OpenMMLab 2.0 projects, we use [Visualizer](https://mmengine.readthedocs.io/en/latest/design/visualization.html) to visualize data. MMDet3D 1.1 implements `Det3DLocalVisualizer` to allow visualization of 2D and 3D data, ground truths, model predictions, and feature maps, etc., at any place. It also supports to send the visualization data to any external visualization backends such as Tensorboard.
+
+### Planned changes
+
+We list several planned changes of MMDet3D 1.1.0rc0 so that the community could more comprehensively know the progress of MMDet3D 1.1. Feel free to create a PR, issue, or discussion if you are interested, have any suggestions and feedbacks, or want to participate.
+
+1. Test-time augmentation: which is supported in MMDet3D 1.0.x, is not implemented in this version due to limited time slot. We will support it in the following releases with a new and simplified design.
+2. Inference interfaces: a unified inference interfaces will be supported in the future to ease the use of released models.
+3. Interfaces of useful tools that can be used in notebook: more useful tools that implemented in the `tools` directory will have their python interfaces so that they can be used through notebook and in downstream libraries.
+4. Documentation: we will add more design docs, tutorials, and migration guidance so that the community can deep dive into our new design, participate the future development, and smoothly migrate downstream libraries to MMDet3D 1.1.
+5. Wandb visualization: MMDet 2.x supports data visualization since v2.25.0, which has not been migrated to MMDet 3.x for now. Since Wandb provides strong visualization and experiment management capabilities, a `DetWandbVisualizer` and maybe a hook are planned to fully migrated those functionalities in MMDet 2.x and a `Det3DWandbVisualizer` will be supported in MMDet3D 1.1 accordingly.
+6. Will support recent new features added in MMDet3D 1.0.x and our recent exploration on camera-only 3D detection from videos: we will refactor these models and support them with benchmarks and models soon.
diff --git a/mmde/docs/en/notes/changelog_v1.0.x.md b/mmde/docs/en/notes/changelog_v1.0.x.md
new file mode 100644
index 0000000000000000000000000000000000000000..76d2ba059cce8d6591470887cb4769abd60bbe7d
--- /dev/null
+++ b/mmde/docs/en/notes/changelog_v1.0.x.md
@@ -0,0 +1,930 @@
+# Changelog of v1.0.x
+
+### v1.0.0 (6/4/2023)
+
+#### Improvements
+
+- Add BN in FPN to avoid loss Nan in MVXNet (#2282)
+- Update `s3dis_data_utils.py` (#2232)
+
+#### Bug Fixes
+
+- Fix precision error when using mixed precision on CenterPoint (#2341)
+- Replace `np.transpose` with `torch.permute` to speed up (@2273)
+- Update links of SECOND checkpoints (#2185)
+
+#### Contributors
+
+A total of 7 developers contributed to this release.
+@JingweiZhang12, @ZCMax, @Xiangxu-0103, @vansinhu, @cs1488, @sunjiahao1999, @Ginray
+
+### v1.0.0rc7 (7/1/2023)
+
+#### Improvements
+
+- Support training and testing on MLU (#2167)
+
+#### Contributors
+
+A total of 1 developers contributed to this release.
+@mengpenghui
+
+### v1.0.0rc6 (2/12/2022)
+
+#### New Features
+
+- Add `Projects/` folder and the first example project (#2082)
+
+#### Improvements
+
+- Update Waymo converter to save storage space (#1759)
+- Update model link and performance of CenterPoint (#1916)
+
+#### Bug Fixes
+
+- Fix GPU memory occupancy problem in PointRCNN (#1928)
+- Fix sampling bug in `IoUNegPiecewiseSampler` (#2018)
+
+#### Contributors
+
+A total of 6 developers contributed to this release.
+
+@oyel, @zzj403, @VVsssssk, @Tai-Wang, @tpoisonooo, @JingweiZhang12, @ZCMax
+
+### v1.0.0rc5 (11/10/2022)
+
+#### New Features
+
+- Support ImVoxelNet on SUN RGB-D (#1738)
+
+#### Improvements
+
+- Fix the cross-codebase reference problem in metafile README (#1644)
+- Update the Chinese documentation about getting started (#1715)
+- Fix docs link and add docs link checker (#1811)
+
+#### Bug Fixes
+
+- Fix a visualization bug that is potentially triggered by empty prediction labels (#1725)
+- Fix point cloud segmentation visualization bug due to wrong parameter passing (#1858)
+- Fix Nan loss bug during PointRCNN training (#1874)
+
+#### Contributors
+
+A total of 9 developers contributed to this release.
+
+@ZwwWayne, @Tai-Wang, @filaPro, @VVsssssk, @ZCMax, @Xiangxu-0103, @holtvogt, @tpoisonooo, @lianqing01
+
+### v1.0.0rc4 (8/8/2022)
+
+#### Highlights
+
+- Support [FCAF3D](https://arxiv.org/pdf/2112.00322.pdf)
+
+#### New Features
+
+- Support [FCAF3D](https://arxiv.org/pdf/2112.00322.pdf) (#1547)
+- Add the transformation to support multi-camera 3D object detection (#1580)
+- Support lift-splat-shoot view transformer (#1598)
+
+#### Improvements
+
+- Remove the limitation of the maximum number of points during SUN RGB-D preprocessing (#1555)
+- Support circle CI (#1647)
+- Add mim to extras_require in setup.py (#1560, #1574)
+- Update dockerfile package version (#1697)
+
+#### Bug Fixes
+
+- Flip yaw angle for DepthInstance3DBoxes.overlaps (#1548, #1556)
+- Fix DGCNN configs (#1587)
+- Fix bbox head not registered bug (#1625)
+- Fix missing objects in S3DIS preprocessing (#1665)
+- Fix spconv2.0 model loading bug (#1699)
+
+#### Contributors
+
+A total of 9 developers contributed to this release.
+
+@Tai-Wang, @ZwwWayne, @filaPro, @lianqing11, @ZCMax, @HuangJunJie2017, @Xiangxu-0103, @ChonghaoSima, @VVsssssk
+
+### v1.0.0rc3 (8/6/2022)
+
+#### Highlights
+
+- Support [SA-SSD](https://openaccess.thecvf.com/content_CVPR_2020/papers/He_Structure_Aware_Single-Stage_3D_Object_Detection_From_Point_Cloud_CVPR_2020_paper.pdf)
+
+#### New Features
+
+- Support [SA-SSD](https://openaccess.thecvf.com/content_CVPR_2020/papers/He_Structure_Aware_Single-Stage_3D_Object_Detection_From_Point_Cloud_CVPR_2020_paper.pdf) (#1337)
+
+#### Improvements
+
+- Add Chinese documentation for vision-only 3D detection (#1438)
+- Update CenterPoint pretrained models that are compatible with refactored coordinate systems (#1450)
+- Configure myst-parser to parse anchor tag in the documentation (#1488)
+- Replace markdownlint with mdformat for avoiding installing ruby (#1489)
+- Add missing `gt_names` when getting annotation info in Custom3DDataset (#1519)
+- Support S3DIS full ceph training (#1542)
+- Rewrite the installation and FAQ documentation (#1545)
+
+#### Bug Fixes
+
+- Fix the incorrect registry name when building RoI extractors (#1460)
+- Fix the potential problems caused by the registry scope update when composing pipelines (#1466) and using CocoDataset (#1536)
+- Fix the missing selection with `order` in the [box3d_nms](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/post_processing/box3d_nms.py) introduced by [#1403](https://github.com/open-mmlab/mmdetection3d/pull/1403) (#1479)
+- Update the [PointPillars config](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py) to make it consistent with the log (#1486)
+- Fix heading anchor in documentation (#1490)
+- Fix the compatibility of mmcv in the dockerfile (#1508)
+- Make overwrite_spconv packaged when building whl (#1516)
+- Fix the requirement of mmcv and mmdet (#1537)
+- Update configs of PartA2 and support its compatibility with spconv 2.0 (#1538)
+
+#### Contributors
+
+A total of 13 developers contributed to this release.
+
+@Xiangxu-0103, @ZCMax, @jshilong, @filaPro, @atinfinity, @Tai-Wang, @wenbo-yu, @yi-chen-isuzu, @ZwwWayne, @wchen61, @VVsssssk, @AlexPasqua, @lianqing11
+
+### v1.0.0rc2 (1/5/2022)
+
+#### Highlights
+
+- Support spconv 2.0
+- Support MinkowskiEngine with MinkResNet
+- Support training models on custom datasets with only point clouds
+- Update Registry to distinguish the scope of built functions
+- Replace mmcv.iou3d with a set of bird-eye-view (BEV) operators to unify the operations of rotated boxes
+
+#### New Features
+
+- Add loader arguments in the configuration files (#1388)
+- Support [spconv 2.0](https://github.com/traveller59/spconv) when the package is installed. Users can still use spconv 1.x in MMCV with CUDA 9.0 (only cost more memory) without losing the compatibility of model weights between two versions (#1421)
+- Support MinkowskiEngine with MinkResNet (#1422)
+
+#### Improvements
+
+- Add the documentation for model deployment (#1373, #1436)
+- Add Chinese documentation of
+  - Speed benchmark (#1379)
+  - LiDAR-based 3D detection (#1368)
+  - LiDAR 3D segmentation (#1420)
+  - Coordinate system refactoring (#1384)
+- Support training models on custom datasets with only point clouds (#1393)
+- Replace mmcv.iou3d with a set of bird-eye-view (BEV) operators to unify the operations of rotated boxes (#1403, #1418)
+- Update Registry to distinguish the scope of building functions (#1412, #1443)
+- Replace recommonmark with myst_parser for documentation rendering (#1414)
+
+#### Bug Fixes
+
+- Fix the show pipeline in the [browse_dataset.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/misc/browse_dataset.py) (#1376)
+- Fix missing __init__ files after coordinate system refactoring (#1383)
+- Fix the incorrect yaw in the visualization caused by coordinate system refactoring (#1407)
+- Fix `NaiveSyncBatchNorm1d` and `NaiveSyncBatchNorm2d` to support non-distributed cases and more general inputs (#1435)
+
+#### Contributors
+
+A total of 11 developers contributed to this release.
+
+@ZCMax, @ZwwWayne, @Tai-Wang, @VVsssssk, @HanaRo, @JoeyforJoy, @ansonlcy, @filaPro, @jshilong, @Xiangxu-0103, @deleomike
+
+### v1.0.0rc1 (1/4/2022)
+
+#### Compatibility
+
+- We migrate all the mmdet3d ops to mmcv and do not need to compile them when installing mmdet3d.
+- To fix the imprecise timestamp and optimize its saving method, we reformat the point cloud data during Waymo data conversion. The data conversion time is also optimized significantly by supporting parallel processing. Please re-generate KITTI format Waymo data if necessary. See more details in the [compatibility documentation](https://github.com/open-mmlab/mmdetection3d/blob/master/docs/en/compatibility.md).
+- We update some of the model checkpoints after the refactor of coordinate systems. Please stay tuned for the release of the remaining model checkpoints.
+
+|               | Fully Updated | Partially Updated | In Progress | No Influcence |
+| ------------- | :-----------: | :---------------: | :---------: | :-----------: |
+| SECOND        |               |         ✓         |             |               |
+| PointPillars  |               |         ✓         |             |               |
+| FreeAnchor    |       ✓       |                   |             |               |
+| VoteNet       |       ✓       |                   |             |               |
+| H3DNet        |       ✓       |                   |             |               |
+| 3DSSD         |               |         ✓         |             |               |
+| Part-A2       |       ✓       |                   |             |               |
+| MVXNet        |       ✓       |                   |             |               |
+| CenterPoint   |               |                   |      ✓      |               |
+| SSN           |       ✓       |                   |             |               |
+| ImVoteNet     |       ✓       |                   |             |               |
+| FCOS3D        |               |                   |             |       ✓       |
+| PointNet++    |               |                   |             |       ✓       |
+| Group-Free-3D |               |                   |             |       ✓       |
+| ImVoxelNet    |       ✓       |                   |             |               |
+| PAConv        |               |                   |             |       ✓       |
+| DGCNN         |               |                   |             |       ✓       |
+| SMOKE         |               |                   |             |       ✓       |
+| PGD           |               |                   |             |       ✓       |
+| MonoFlex      |               |                   |             |       ✓       |
+
+#### Highlights
+
+- Migrate all the mmdet3d ops to mmcv
+- Support parallel waymo data converter
+- Add ScanNet instance segmentation dataset with metrics
+- Better compatibility for windows with CI support, op migration and bug fixes
+- Support loading annotations from Ceph
+
+#### New Features
+
+- Add ScanNet instance segmentation dataset with metrics (#1230)
+- Support different random seeds for different ranks (#1321)
+- Support loading annotations from Ceph (#1325)
+- Support resuming from the latest checkpoint automatically (#1329)
+- Add windows CI (#1345)
+
+#### Improvements
+
+- Update the table format and OpenMMLab project orders in [README.md](https://github.com/open-mmlab/mmdetection3d/blob/master/README.md) (#1272, #1283)
+- Migrate all the mmdet3d ops to mmcv (#1240, #1286, #1290, #1333)
+- Add `with_plane` flag in the KITTI data conversion (#1278)
+- Update instructions and links in the documentation (#1300, 1309, #1319)
+- Support parallel Waymo dataset converter and ground truth database generator (#1327)
+- Add quick installation commands to [getting_started.md](https://github.com/open-mmlab/mmdetection3d/blob/master/docs/en/getting_started.md) (#1366)
+
+#### Bug Fixes
+
+- Update nuimages configs to use new nms config style (#1258)
+- Fix the usage of np.long for windows compatibility (#1270)
+- Fix the incorrect indexing in `BasePoints` (#1274)
+- Fix the incorrect indexing in the [pillar_scatter.forward_single](https://github.com/open-mmlab/mmdetection3d/blob/dev/mmdet3d/models/middle_encoders/pillar_scatter.py#L38) (#1280)
+- Fix unit tests that use GPUs (#1301)
+- Fix incorrect feature dimensions in `DynamicPillarFeatureNet` caused by previous upgrading of `PillarFeatureNet` (#1302)
+- Remove the `CameraPoints` constraint in `PointSample` (#1314)
+- Fix imprecise timestamps saving of Waymo dataset (#1327)
+
+#### Contributors
+
+A total of 9 developers contributed to this release.
+
+@ZCMax, @ZwwWayne, @wHao-Wu, @Tai-Wang, @wangruohui, @zjwzcx, @Xiangxu-0103, @EdAyers, @hongye-dev, @zhanggefan
+
+### v1.0.0rc0 (18/2/2022)
+
+#### Compatibility
+
+- We refactor our three coordinate systems to make their rotation directions and origins more consistent, and further remove unnecessary hacks in different datasets and models. Therefore, please re-generate data infos or convert the old version to the new one with our provided scripts. We will also provide updated checkpoints in the next version. Please refer to the [compatibility documentation](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0.dev0/docs/en/compatibility.md) for more details.
+- Unify the camera keys for consistent transformation between coordinate systems on different datasets. The modification changes the key names to `lidar2img`, `depth2img`, `cam2img`, etc., for easier understanding. Customized codes using legacy keys may be influenced.
+- The next release will begin to move files of CUDA ops to [MMCV](https://github.com/open-mmlab/mmcv). It will influence the way to import related functions. We will not break the compatibility but will raise a warning first and please prepare to migrate it.
+
+#### Highlights
+
+- Support new monocular 3D detectors: [PGD](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/pgd), [SMOKE](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/smoke), [MonoFlex](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/monoflex)
+- Support a new LiDAR-based detector: [PointRCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/point_rcnn)
+- Support a new backbone: [DGCNN](https://github.com/open-mmlab/mmdetection3d/tree/v1.0.0.dev0/configs/dgcnn)
+- Support 3D object detection on the S3DIS dataset
+- Support compilation on Windows
+- Full benchmark for PAConv on S3DIS
+- Further enhancement for documentation, especially on the Chinese documentation
+
+#### New Features
+
+- Support 3D object detection on the S3DIS dataset (#835)
+- Support PointRCNN (#842, #843, #856, #974, #1022, #1109, #1125)
+- Support DGCNN (#896)
+- Support PGD (#938, #940, #948, #950, #964, #1014, #1065, #1070, #1157)
+- Support SMOKE (#939, #955, #959, #975, #988, #999, #1029)
+- Support MonoFlex (#1026, #1044, #1114, #1115, #1183)
+- Support CPU Training (#1196)
+
+#### Improvements
+
+- Support point sampling based on distance metric (#667, #840)
+- Refactor coordinate systems (#677, #774, #803, #899, #906, #912, #968, #1001)
+- Unify camera keys in PointFusion and transformations between different systems (#791, #805)
+- Refine documentation (#792, #827, #829, #836, #849, #854, #859, #1111, #1113, #1116, #1121, #1132, #1135, #1185, #1193, #1226)
+- Add a script to support benchmark regression (#808)
+- Benchmark PAConvCUDA on S3DIS (#847)
+- Support to download pdf and epub documentation (#850)
+- Change the `repeat` setting in Group-Free-3D configs to reduce training epochs (#855)
+- Support KITTI AP40 evaluation metric (#927)
+- Add the mmdet3d2torchserve tool for SECOND (#977)
+- Add code-spell pre-commit hook and fix typos (#995)
+- Support the latest numba version (#1043)
+- Set a default seed to use when the random seed is not specified (#1072)
+- Distribute mix-precision models to each algorithm folder (#1074)
+- Add abstract and a representative figure for each algorithm (#1086)
+- Upgrade pre-commit hook (#1088, #1217)
+- Support augmented data and ground truth visualization (#1092)
+- Add local yaw property for `CameraInstance3DBoxes` (#1130)
+- Lock the required numba version to 0.53.0 (#1159)
+- Support the usage of plane information for KITTI dataset (#1162)
+- Deprecate the support for "python setup.py test" (#1164)
+- Reduce the number of multi-process threads to accelerate training (#1168)
+- Support 3D flip augmentation for semantic segmentation (#1181)
+- Update README format for each model (#1195)
+
+#### Bug Fixes
+
+- Fix compiling errors on Windows (#766)
+- Fix the deprecated nms setting in the ImVoteNet config (#828)
+- Use the latest `wrap_fp16_model` import from mmcv (#861)
+- Remove 2D annotations generation on Lyft (#867)
+- Update index files for the Chinese documentation to be consistent with the English version (#873)
+- Fix the nested list transpose in the CenterPoint head (#879)
+- Fix deprecated pretrained model loading for RegNet (#889)
+- Fix the incorrect dimension indices of rotations and testing config in the CenterPoint test time augmentation (#892)
+- Fix and improve visualization tools (#956, #1066, #1073)
+- Fix PointPillars FLOPs calculation error (#1075)
+- Fix missing dimension information in the SUN RGB-D data generation (#1120)
+- Fix incorrect anchor range settings in the PointPillars [config](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/_base_/models/hv_pointpillars_secfpn_kitti.py) for KITTI (#1163)
+- Fix incorrect model information in the RegNet metafile (#1184)
+- Fix bugs in non-distributed multi-gpu training and testing (#1197)
+- Fix a potential assertion error when generating corners from an empty box (#1212)
+- Upgrade bazel version according to the requirement of Waymo Devkit (#1223)
+
+#### Contributors
+
+A total of 12 developers contributed to this release.
+
+@THU17cyz, @wHao-Wu, @wangruohui, @Wuziyi616, @filaPro, @ZwwWayne, @Tai-Wang, @DCNSW, @xieenze, @robin-karlsson0, @ZCMax, @Otteri
+
+### v0.18.1 (1/2/2022)
+
+#### Improvements
+
+- Support Flip3D augmentation in semantic segmentation task (#1182)
+- Update regnet metafile (#1184)
+- Add point cloud annotation tools introduction in FAQ (#1185)
+- Add missing explanations of `cam_intrinsic` in the nuScenes dataset doc (#1193)
+
+#### Bug Fixes
+
+- Deprecate the support for "python setup.py test" (#1164)
+- Fix the rotation matrix while rotation axis=0 (#1182)
+- Fix the bug in non-distributed multi-gpu training/testing (#1197)
+- Fix a potential bug when generating corners for empty bounding boxes (#1212)
+
+#### Contributors
+
+A total of 4 developers contributed to this release.
+
+@ZwwWayne, @ZCMax, @Tai-Wang, @wHao-Wu
+
+### v0.18.0 (1/1/2022)
+
+#### Highlights
+
+- Update the required minimum version of mmdet and mmseg
+
+#### Improvements
+
+- Use the official markdownlint hook and add codespell hook for pre-committing (#1088)
+- Improve CI operation (#1095, #1102, #1103)
+- Use shared menu content from OpenMMLab's theme and remove duplicated contents from config (#1111)
+- Refactor the structure of documentation (#1113, #1121)
+- Update the required minimum version of mmdet and mmseg (#1147)
+
+#### Bug Fixes
+
+- Fix symlink failure on Windows (#1096)
+- Fix the upper bound of mmcv version in the mminstall requirements (#1104)
+- Fix API documentation compilation and mmcv build errors (#1116)
+- Fix figure links and pdf documentation compilation (#1132, #1135)
+
+#### Contributors
+
+A total of 4 developers contributed to this release.
+
+@ZwwWayne, @ZCMax, @Tai-Wang, @wHao-Wu
+
+### v0.17.3 (1/12/2021)
+
+#### Improvements
+
+- Change the default show value to `False` in show_result function to avoid unnecessary errors (#1034)
+- Improve the visualization of detection results with colorized points in [single_gpu_test](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/apis/test.py#L11) (#1050)
+- Clean unnecessary custom_imports in entrypoints (#1068)
+
+#### Bug Fixes
+
+- Update mmcv version in the Dockerfile (#1036)
+- Fix the memory-leak problem when loading checkpoints in [init_model](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/apis/inference.py#L36) (#1045)
+- Fix incorrect velocity indexing when formatting boxes on nuScenes (#1049)
+- Explicitly set cuda device ID in [init_model](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/apis/inference.py#L36) to avoid memory allocation on unexpected devices (#1056)
+- Fix PointPillars FLOPs calculation error (#1076)
+
+#### Contributors
+
+A total of 5 developers contributed to this release.
+
+@wHao-Wu, @Tai-Wang, @ZCMax, @MilkClouds, @aldakata
+
+### v0.17.2 (1/11/2021)
+
+#### Improvements
+
+- Update Group-Free-3D and FCOS3D bibtex (#985)
+- Update the solutions for incompatibility of pycocotools in the FAQ (#993)
+- Add Chinese documentation for the KITTI (#1003) and Lyft (#1010) dataset tutorial
+- Add the H3DNet checkpoint converter for incompatible keys (#1007)
+
+#### Bug Fixes
+
+- Update mmdetection and mmsegmentation version in the Dockerfile (#992)
+- Fix links in the Chinese documentation (#1015)
+
+#### Contributors
+
+A total of 4 developers contributed to this release.
+
+@Tai-Wang, @wHao-Wu, @ZwwWayne, @ZCMax
+
+### v0.17.1 (1/10/2021)
+
+#### Highlights
+
+- Support a faster but non-deterministic version of hard voxelization
+- Completion of dataset tutorials and the Chinese documentation
+- Improved the aesthetics of the documentation format
+
+#### Improvements
+
+- Add Chinese documentation for training on customized datasets and designing customized models (#729, #820)
+- Support a faster but non-deterministic version of hard voxelization (#904)
+- Update paper titles and code details for metafiles (#917)
+- Add a tutorial for KITTI dataset (#953)
+- Use Pytorch sphinx theme to improve the format of documentation (#958)
+- Use the docker to accelerate CI (#971)
+
+#### Bug Fixes
+
+- Fix the sphinx version used in the documentation (#902)
+- Fix a dynamic scatter bug that discards the first voxel by mistake when all input points are valid (#915)
+- Fix the inconsistent variable names used in the [unit test](https://github.com/open-mmlab/mmdetection3d/blob/master/tests/test_models/test_voxel_encoder/test_voxel_generator.py) for voxel generator (#919)
+- Upgrade to use `build_prior_generator` to replace the legacy `build_anchor_generator` (#941)
+- Fix a minor bug caused by a too small difference set in the FreeAnchor Head (#944)
+
+#### Contributors
+
+A total of 8 developers contributed to this release.
+
+@DCNSW, @zhanggefan, @mickeyouyou, @ZCMax, @wHao-Wu, @tojimahammatov, @xiliu8006, @Tai-Wang
+
+### v0.17.0 (1/9/2021)
+
+#### Compatibility
+
+- Unify the camera keys for consistent transformation between coordinate systems on different datasets. The modification change the key names to `lidar2img`, `depth2img`, `cam2img`, etc. for easier understanding. Customized codes using legacy keys may be influenced.
+- The next release will begin to move files of CUDA ops to [MMCV](https://github.com/open-mmlab/mmcv). It will influence the way to import related functions. We will not break the compatibility but will raise a warning first and please prepare to migrate it.
+
+#### Highlights
+
+- Support 3D object detection on the S3DIS dataset
+- Support compilation on Windows
+- Full benchmark for PAConv on S3DIS
+- Further enhancement for documentation, especially on the Chinese documentation
+
+#### New Features
+
+- Support 3D object detection on the S3DIS dataset (#835)
+
+#### Improvements
+
+- Support point sampling based on distance metric (#667, #840)
+- Update PointFusion to support unified camera keys (#791)
+- Add Chinese documentation for customized dataset (#792), data pipeline (#827), customized runtime (#829), 3D Detection on ScanNet (#836), nuScenes (#854) and Waymo (#859)
+- Unify camera keys used in transformation between different systems (#805)
+- Add a script to support benchmark regression (#808)
+- Benchmark PAConvCUDA on S3DIS (#847)
+- Add a tutorial for 3D detection on the Lyft dataset (#849)
+- Support to download pdf and epub documentation (#850)
+- Change the `repeat` setting in Group-Free-3D configs to reduce training epochs (#855)
+
+#### Bug Fixes
+
+- Fix compiling errors on Windows (#766)
+- Fix the deprecated nms setting in the ImVoteNet config (#828)
+- Use the latest `wrap_fp16_model` import from mmcv (#861)
+- Remove 2D annotations generation on Lyft (#867)
+- Update index files for the Chinese documentation to be consistent with the English version (#873)
+- Fix the nested list transpose in the CenterPoint head (#879)
+- Fix deprecated pretrained model loading for RegNet (#889)
+
+#### Contributors
+
+A total of 11 developers contributed to this release.
+
+@THU17cyz, @wHao-Wu, @wangruohui, @Wuziyi616, @filaPro, @ZwwWayne, @Tai-Wang, @DCNSW, @xieenze, @robin-karlsson0, @ZCMax
+
+### v0.16.0 (1/8/2021)
+
+#### Compatibility
+
+- Remove the rotation and dimension hack in the monocular 3D detection on nuScenes by applying corresponding transformation in the pre-processing and post-processing. The modification only influences nuScenes coco-style json files. Please re-run the data preparation scripts if necessary. See more details in the PR #744.
+- Add a new pre-processing module for the ScanNet dataset in order to support multi-view detectors. Please run the updated scripts to extract the RGB data and its annotations. See more details in the PR #696.
+
+#### Highlights
+
+- Support to use [MIM](https://github.com/open-mmlab/mim) with pip installation
+- Support PAConv [models and benchmarks](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/paconv) on S3DIS
+- Enhance the documentation especially on dataset tutorials
+
+#### New Features
+
+- Support RGB images on ScanNet for multi-view detectors (#696)
+- Support FLOPs and number of parameters calculation (#736)
+- Support to use [MIM](https://github.com/open-mmlab/mim) with pip installation (#782)
+- Support PAConv models and benchmarks on the S3DIS dataset (#783, #809)
+
+#### Improvements
+
+- Refactor Group-Free-3D to make it inherit BaseModule from MMCV (#704)
+- Modify the initialization methods of FCOS3D to be consistent with the refactored approach (#705)
+- Benchmark the Group-Free-3D [models](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/groupfree3d) on ScanNet (#710)
+- Add Chinese documentation for Getting Started (#725), FAQ (#730), Model Zoo (#735), Demo (#745), Quick Run (#746), Data Preparation (#787) and Configs (#788)
+- Add documentation for semantic segmentation on ScanNet and S3DIS (#743, #747, #806, #807)
+- Add a parameter `max_keep_ckpts` to limit the maximum number of saved Group-Free-3D checkpoints (#765)
+- Add documentation for 3D detection on SUN RGB-D and nuScenes (#770, #793)
+- Remove mmpycocotools in the Dockerfile (#785)
+
+#### Bug Fixes
+
+- Fix versions of OpenMMLab dependencies (#708)
+- Convert `rt_mat` to `torch.Tensor` in coordinate transformation for compatibility (#709)
+- Fix the `bev_range` initialization in `ObjectRangeFilter` according to the `gt_bboxes_3d` type (#717)
+- Fix Chinese documentation and incorrect doc format due to the incompatible Sphinx version (#718)
+- Fix a potential bug when setting `interval == 1` in [analyze_logs.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/analysis_tools/analyze_logs.py) (#720)
+- Update the structure of Chinese documentation (#722)
+- Fix FCOS3D FPN BC-Breaking caused by the code refactoring in MMDetection (#739)
+- Fix wrong `in_channels` when `with_distance=True` in the [Dynamic VFE Layers](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/voxel_encoders/voxel_encoder.py#L87) (#749)
+- Fix the dimension and yaw hack of FCOS3D on nuScenes (#744, #794, #795, #818)
+- Fix the missing default `bbox_mode` in the `show_multi_modality_result` (#825)
+
+#### Contributors
+
+A total of 12 developers contributed to this release.
+
+@yinchimaoliang, @gopi231091, @filaPro, @ZwwWayne, @ZCMax, @hjin2902, @wHao-Wu, @Wuziyi616, @xiliu8006, @THU17cyz, @DCNSW, @Tai-Wang
+
+### v0.15.0 (1/7/2021)
+
+#### Compatibility
+
+In order to fix the problem that the priority of EvalHook is too low, all hook priorities have been re-adjusted in 1.3.8, so MMDetection 2.14.0 needs to rely on the latest MMCV 1.3.8 version. For related information, please refer to [#1120](https://github.com/open-mmlab/mmcv/pull/1120), for related issues, please refer to [#5343](https://github.com/open-mmlab/mmdetection/issues/5343).
+
+#### Highlights
+
+- Support [PAConv](https://arxiv.org/abs/2103.14635)
+- Support monocular/multi-view 3D detector [ImVoxelNet](https://arxiv.org/abs/2106.01178) on KITTI
+- Support Transformer-based 3D detection method [Group-Free-3D](https://arxiv.org/abs/2104.00678) on ScanNet
+- Add documentation for tasks including LiDAR-based 3D detection, vision-only 3D detection and point-based 3D semantic segmentation
+- Add dataset documents like ScanNet
+
+#### New Features
+
+- Support Group-Free-3D on ScanNet (#539)
+- Support PAConv modules (#598, #599)
+- Support ImVoxelNet on KITTI (#627, #654)
+
+#### Improvements
+
+- Add unit tests for pipeline functions `LoadImageFromFileMono3D`, `ObjectNameFilter` and `ObjectRangeFilter` (#615)
+- Enhance [IndoorPatchPointSample](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/pipelines/transforms_3d.py) (#617)
+- Refactor model initialization methods based MMCV (#622)
+- Add Chinese docs (#629)
+- Add documentation for LiDAR-based 3D detection (#642)
+- Unify intrinsic and extrinsic matrices for all datasets (#653)
+- Add documentation for point-based 3D semantic segmentation (#663)
+- Add documentation of ScanNet for 3D detection (#664)
+- Refine docs for tutorials (#666)
+- Add documentation for vision-only 3D detection (#669)
+- Refine docs for Quick Run and Useful Tools (#686)
+
+#### Bug Fixes
+
+- Fix the bug of [BackgroundPointsFilter](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/pipelines/transforms_3d.py) using the bottom center of ground truth (#609)
+- Fix [LoadMultiViewImageFromFiles](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/pipelines/loading.py) to unravel stacked multi-view images to list to be consistent with DefaultFormatBundle (#611)
+- Fix the potential bug in [analyze_logs](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/analysis_tools/analyze_logs.py) when the training resumes from a checkpoint or is stopped before evaluation (#634)
+- Fix test commands in docs and make some refinements (#635)
+- Fix wrong config paths in unit tests (#641)
+
+### v0.14.0 (1/6/2021)
+
+#### Highlights
+
+- Support the point cloud segmentation method [PointNet++](https://arxiv.org/abs/1706.02413)
+
+#### New Features
+
+- Support PointNet++ (#479, #528, #532, #541)
+- Support RandomJitterPoints transform for point cloud segmentation (#584)
+- Support RandomDropPointsColor transform for point cloud segmentation (#585)
+
+#### Improvements
+
+- Move the point alignment of ScanNet from data pre-processing to pipeline (#439, #470)
+- Add compatibility document to provide detailed descriptions of BC-breaking changes (#504)
+- Add MMSegmentation installation requirement (#535)
+- Support points rotation even without bounding box in GlobalRotScaleTrans for point cloud segmentaiton (#540)
+- Support visualization of detection results and dataset browse for nuScenes Mono-3D dataset (#542, #582)
+- Support faster implementation of KNN (#586)
+- Support RegNetX models on Lyft dataset (#589)
+- Remove a useless parameter `label_weight` from segmentation datasets including `Custom3DSegDataset`, `ScanNetSegDataset` and `S3DISSegDataset` (#607)
+
+#### Bug Fixes
+
+- Fix a corrupted lidar data file in Lyft dataset in [data_preparation](https://github.com/open-mmlab/mmdetection3d/tree/master/docs/data_preparation.md) (#546)
+- Fix evaluation bugs in nuScenes and Lyft dataset (#549)
+- Fix converting points between coordinates with specific transformation matrix in the [coord_3d_mode.py](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/coord_3d_mode.py) (#556)
+- Support PointPillars models on Lyft dataset (#578)
+- Fix the bug of demo with pre-trained VoteNet model on ScanNet (#600)
+
+### v0.13.0 (1/5/2021)
+
+#### Highlights
+
+- Support a monocular 3D detection method [FCOS3D](https://arxiv.org/abs/2104.10956)
+- Support ScanNet and S3DIS semantic segmentation dataset
+- Enhancement of visualization tools for dataset browsing and demos, including support of visualization for multi-modality data and point cloud segmentation.
+
+#### New Features
+
+- Support ScanNet semantic segmentation dataset (#390)
+- Support monocular 3D detection on nuScenes (#392)
+- Support multi-modality visualization (#405)
+- Support nuimages visualization (#408)
+- Support monocular 3D detection on KITTI (#415)
+- Support online visualization of semantic segmentation results (#416)
+- Support ScanNet test results submission to online benchmark (#418)
+- Support S3DIS data pre-processing and dataset class (#433)
+- Support FCOS3D (#436, #442, #482, #484)
+- Support dataset browse for multiple types of datasets (#467)
+- Adding paper-with-code (PWC) metafile for each model in the model zoo (#485)
+
+#### Improvements
+
+- Support dataset browsing for SUNRGBD, ScanNet or KITTI points and detection results (#367)
+- Add the pipeline to load data using file client (#430)
+- Support to customize the type of runner (#437)
+- Make pipeline functions process points and masks simultaneously when sampling points (#444)
+- Add waymo unit tests (#455)
+- Split the visualization of projecting points onto image from that for only points (#480)
+- Efficient implementation of PointSegClassMapping (#489)
+- Use the new model registry from mmcv (#495)
+
+#### Bug Fixes
+
+- Fix Pytorch 1.8 Compilation issue in the [scatter_points_cuda.cu](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/ops/voxel/src/scatter_points_cuda.cu) (#404)
+- Fix [dynamic_scatter](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/ops/voxel/src/scatter_points_cuda.cu) errors triggered by empty point input (#417)
+- Fix the bug of missing points caused by using break incorrectly in the voxelization (#423)
+- Fix the missing `coord_type` in the waymo dataset [config](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/_base_/datasets/waymoD5-3d-3class.py) (#441)
+- Fix errors in four unittest functions of [configs](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py), [test_detectors.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tests/test_models/test_detectors.py), [test_heads.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tests/test_models/test_heads/test_heads.py) (#453)
+- Fix 3DSSD training errors and simplify configs (#462)
+- Clamp 3D votes projections to image boundaries in ImVoteNet (#463)
+- Update out-of-date names of pipelines in the [config](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py) of pointpillars benchmark (#474)
+- Fix the lack of a placeholder when unpacking RPN targets in the [h3d_bbox_head.py](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py) (#508)
+- Fix the incorrect value of `K` when creating pickle files for SUN RGB-D (#511)
+
+### v0.12.0 (1/4/2021)
+
+#### Highlights
+
+- Support a new multi-modality method [ImVoteNet](https://arxiv.org/abs/2001.10692).
+- Support PyTorch 1.7 and 1.8
+- Refactor the structure of tools and [train.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/train.py)/[test.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/test.py)
+
+#### New Features
+
+- Support LiDAR-based semantic segmentation metrics (#332)
+- Support [ImVoteNet](https://arxiv.org/abs/2001.10692) (#352, #384)
+- Support the KNN GPU operation (#360, #371)
+
+#### Improvements
+
+- Add FAQ for common problems in the documentation (#333)
+- Refactor the structure of tools (#339)
+- Refactor [train.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/train.py) and [test.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/test.py) (#343)
+- Support demo on nuScenes (#353)
+- Add 3DSSD checkpoints (#359)
+- Update the Bibtex of CenterPoint (#368)
+- Add citation format and reference to other OpenMMLab projects in the README (#374)
+- Upgrade the mmcv version requirements (#376)
+- Add numba and numpy version requirements in FAQ (#379)
+- Avoid unnecessary for-loop execution of vfe layer creation (#389)
+- Update SUNRGBD dataset documentation to stress the requirements for training ImVoteNet (#391)
+- Modify vote head to support 3DSSD (#396)
+
+#### Bug Fixes
+
+- Fix missing keys `coord_type` in database sampler config (#345)
+- Rename H3DNet configs (#349)
+- Fix CI by using ubuntu 18.04 in github workflow (#350)
+- Add assertions to avoid 4-dim points being input to [points_in_boxes](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py) (#357)
+- Fix the SECOND results on Waymo in the corresponding [README](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/second) (#363)
+- Fix the incorrect adopted pipeline when adding val to workflow (#370)
+- Fix a potential bug when indices used in the backwarding in ThreeNN (#377)
+- Fix a compilation error triggered by [scatter_points_cuda.cu](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/ops/voxel/src/scatter_points_cuda.cu) in PyTorch 1.7 (#393)
+
+### v0.11.0 (1/3/2021)
+
+#### Highlights
+
+- Support more friendly visualization interfaces based on open3d
+- Support a faster and more memory-efficient implementation of DynamicScatter
+- Refactor unit tests and details of configs
+
+#### New Features
+
+- Support new visualization methods based on open3d (#284, #323)
+
+#### Improvements
+
+- Refactor unit tests (#303)
+- Move the key `train_cfg` and `test_cfg` into the model configs (#307)
+- Update [README](https://github.com/open-mmlab/mmdetection3d/blob/master/README.md/) with [Chinese version](https://github.com/open-mmlab/mmdetection3d/blob/master/README_zh-CN.md/) and [instructions for getting started](https://github.com/open-mmlab/mmdetection3d/blob/master/docs/getting_started.md/). (#310, #316)
+- Support a faster and more memory-efficient implementation of DynamicScatter (#318, #326)
+
+#### Bug Fixes
+
+- Fix an unsupported bias setting in the unit test for centerpoint head (#304)
+- Fix errors due to typos in the centerpoint head (#308)
+- Fix a minor bug in [points_in_boxes.py](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/ops/roiaware_pool3d/points_in_boxes.py) when tensors are not in the same device. (#317)
+- Fix warning of deprecated usages of nonzero during training with PyTorch 1.6 (#330)
+
+### v0.10.0 (1/2/2021)
+
+#### Highlights
+
+- Preliminary release of API for SemanticKITTI dataset.
+- Documentation and demo enhancement for better user experience.
+- Fix a number of underlying minor bugs and add some corresponding important unit tests.
+
+#### New Features
+
+- Support SemanticKITTI dataset preliminarily (#287)
+
+#### Improvements
+
+- Add tag to README in configurations for specifying different uses (#262)
+- Update instructions for evaluation metrics in the documentation (#265)
+- Add nuImages entry in [README.md](https://github.com/open-mmlab/mmdetection3d/blob/master/README.md/) and gif demo (#266, #268)
+- Add unit test for voxelization (#275)
+
+#### Bug Fixes
+
+- Fixed the issue of unpacking size in [furthest_point_sample.py](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/ops/furthest_point_sample/furthest_point_sample.py) (#248)
+- Fix bugs for 3DSSD triggered by empty ground truths (#258)
+- Remove models without checkpoints in model zoo statistics of documentation (#259)
+- Fix some unclear installation instructions in [getting_started.md](https://github.com/open-mmlab/mmdetection3d/blob/master/docs/getting_started.md/) (#269)
+- Fix relative paths/links in the documentation (#271)
+- Fix a minor bug in [scatter_points_cuda.cu](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/ops/voxel/src/scatter_points_cuda.cu) when num_features != 4 (#275)
+- Fix the bug about missing text files when testing on KITTI (#278)
+- Fix issues caused by inplace modification of tensors in `BaseInstance3DBoxes` (#283)
+- Fix log analysis for evaluation and adjust the documentation accordingly (#285)
+
+### v0.9.0 (31/12/2020)
+
+#### Highlights
+
+- Documentation refactoring with better structure, especially about how to implement new models and customized datasets.
+- More compatible with refactored point structure by bug fixes in ground truth sampling.
+
+#### Improvements
+
+- Documentation refactoring (#242)
+
+#### Bug Fixes
+
+- Fix point structure related bugs in ground truth sampling (#211)
+- Fix loading points in ground truth sampling augmentation on nuScenes (#221)
+- Fix channel setting in the SeparateHead of CenterPoint (#228)
+- Fix evaluation for indoors 3D detection in case of less classes in prediction (#231)
+- Remove unreachable lines in nuScenes data converter (#235)
+- Minor adjustments of numpy implementation for perspective projection and prediction filtering criterion in KITTI evaluation (#241)
+
+### v0.8.0 (30/11/2020)
+
+#### Highlights
+
+- Refactor points structure with more constructive and clearer implementation.
+- Support axis-aligned IoU loss for VoteNet with better performance.
+- Update and enhance [SECOND](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/second) benchmark on Waymo.
+
+#### New Features
+
+- Support axis-aligned IoU loss for VoteNet. (#194)
+- Support points structure for consistent processing of all the point related representation. (#196, #204)
+
+#### Improvements
+
+- Enhance [SECOND](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/second) benchmark on Waymo with stronger baselines. (#205)
+- Add model zoo statistics and polish the documentation. (#201)
+
+### v0.7.0 (1/11/2020)
+
+#### Highlights
+
+- Support a new method [SSN](https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123700579.pdf) with benchmarks on nuScenes and Lyft datasets.
+- Update benchmarks for SECOND on Waymo, CenterPoint with TTA on nuScenes and models with mixed precision training on KITTI and nuScenes.
+- Support semantic segmentation on nuImages and provide [HTC](https://arxiv.org/abs/1901.07518) models with configurations and performance for reference.
+
+#### New Features
+
+- Modified primitive head which can support the setting on SUN-RGBD dataset (#136)
+- Support semantic segmentation and [HTC](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/nuimages) with models for reference on nuImages dataset (#155)
+- Support [SSN](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/ssn) on nuScenes and Lyft datasets (#147, #174, #166, #182)
+- Support double flip for test time augmentation of CenterPoint with updated benchmark (#143)
+
+#### Improvements
+
+- Update [SECOND](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/second) benchmark with configurations for reference on Waymo (#166)
+- Delete checkpoints on Waymo to comply its specific license agreement (#180)
+- Update models and instructions with [mixed precision training](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/fp16) on KITTI and nuScenes (#178)
+
+#### Bug Fixes
+
+- Fix incorrect code weights in anchor3d_head when introducing mixed precision training (#173)
+- Fix the incorrect label mapping on nuImages dataset (#155)
+
+### v0.6.1 (11/10/2020)
+
+#### Highlights
+
+- Support mixed precision training of voxel-based methods
+- Support docker with PyTorch 1.6.0
+- Update baseline configs and results ([CenterPoint](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/centerpoint) on nuScenes and [PointPillars](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/pointpillars) on Waymo with full dataset)
+- Switch model zoo to download.openmmlab.com
+
+#### New Features
+
+- Support dataset pipeline `VoxelBasedPointSampler` to sample multi-sweep points based on voxelization. (#125)
+- Support mixed precision training of voxel-based methods (#132)
+- Support docker with PyTorch 1.6.0 (#160)
+
+#### Improvements
+
+- Reduce requirements for the case exclusive of Waymo (#121)
+- Switch model zoo to download.openmmlab.com (#126)
+- Update docs related to Waymo (#128)
+- Add version assertion in the [init file](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/__init__.py) (#129)
+- Add evaluation interval setting for CenterPoint (#131)
+- Add unit test for CenterPoint (#133)
+- Update [PointPillars](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/pointpillars) baselines on Waymo with full dataset (#142)
+- Update [CenterPoint](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/centerpoint) results with models and logs (#154)
+
+#### Bug Fixes
+
+- Fix a bug of visualization in multi-batch case (#120)
+- Fix bugs in dcn unit test (#130)
+- Fix dcn bias bug in centerpoint (#137)
+- Fix dataset mapping in the evaluation of nuScenes mini dataset (#140)
+- Fix origin initialization in `CameraInstance3DBoxes` (#148, #150)
+- Correct documentation link in the getting_started.md (#159)
+- Fix model save path bug in gather_models.py (#153)
+- Fix image padding shape bug in `PointFusion` (#162)
+
+### v0.6.0 (20/9/2020)
+
+#### Highlights
+
+- Support new methods [H3DNet](https://arxiv.org/abs/2006.05682), [3DSSD](https://arxiv.org/abs/2002.10187), [CenterPoint](https://arxiv.org/abs/2006.11275).
+- Support new dataset [Waymo](https://waymo.com/open/) (with PointPillars baselines) and [nuImages](https://www.nuscenes.org/nuimages) (with Mask R-CNN and Cascade Mask R-CNN baselines).
+- Support Batch Inference
+- Support Pytorch 1.6
+- Start to publish `mmdet3d` package to PyPI since v0.5.0. You can use mmdet3d through `pip install mmdet3d`.
+
+#### Backwards Incompatible Changes
+
+- Support Batch Inference (#95, #103, #116): MMDetection3D v0.6.0 migrates to support batch inference based on MMDetection >= v2.4.0. This change influences all the test APIs in MMDetection3D and downstream codebases.
+- Start to use collect environment function from MMCV (#113): MMDetection3D v0.6.0 migrates to use `collect_env` function in MMCV.
+  `get_compiler_version` and `get_compiling_cuda_version` compiled in `mmdet3d.ops.utils` are removed. Please import these two functions from `mmcv.ops`.
+
+#### New Features
+
+- Support [nuImages](https://www.nuscenes.org/nuimages) dataset by converting them into coco format and release Mask R-CNN and Cascade Mask R-CNN baseline models (#91, #94)
+- Support to publish to PyPI in github-action (#17, #19, #25, #39, #40)
+- Support CBGSDataset and make it generally applicable to all the supported datasets (#75, #94)
+- Support [H3DNet](https://arxiv.org/abs/2006.05682) and release models on ScanNet dataset (#53, #58, #105)
+- Support Fusion Point Sampling used in [3DSSD](https://arxiv.org/abs/2002.10187) (#66)
+- Add `BackgroundPointsFilter` to filter background points in data pipeline (#84)
+- Support pointnet2 with multi-scale grouping in backbone and refactor pointnets (#82)
+- Support dilated ball query used in [3DSSD](https://arxiv.org/abs/2002.10187) (#96)
+- Support [3DSSD](https://arxiv.org/abs/2002.10187) and release models on KITTI dataset (#83, #100, #104)
+- Support [CenterPoint](https://arxiv.org/abs/2006.11275) and release models on nuScenes dataset (#49, #92)
+- Support [Waymo](https://waymo.com/open/) dataset and release PointPillars baseline models (#118)
+- Allow `LoadPointsFromMultiSweeps` to pad empty sweeps and select multiple sweeps randomly (#67)
+
+#### Improvements
+
+- Fix all warnings and bugs in PyTorch 1.6.0 (#70, #72)
+- Update issue templates (#43)
+- Update unit tests (#20, #24, #30)
+- Update documentation for using `ply` format point cloud data (#41)
+- Use points loader to load point cloud data in ground truth (GT) samplers (#87)
+- Unify version file of OpenMMLab projects by using `version.py` (#112)
+- Remove unnecessary data preprocessing commands of SUN RGB-D dataset (#110)
+
+#### Bug Fixes
+
+- Rename CosineAnealing to CosineAnnealing (#57)
+- Fix device inconsistent bug in 3D IoU computation (#69)
+- Fix a minor bug in json2csv of lyft dataset (#78)
+- Add missed test data for pointnet modules (#85)
+- Fix `use_valid_flag` bug in `CustomDataset` (#106)
+
+### v0.5.0 (9/7/2020)
+
+MMDetection3D is released.
diff --git a/mmde/docs/en/notes/compatibility.md b/mmde/docs/en/notes/compatibility.md
new file mode 100644
index 0000000000000000000000000000000000000000..9c233715e91c265e6ea616a1509f5a3546ef3324
--- /dev/null
+++ b/mmde/docs/en/notes/compatibility.md
@@ -0,0 +1,207 @@
+# Compatibility
+
+## v1.1.0rc0
+
+### OpenMMLab v2.0 Refactoring
+
+In this version, we make large refactoring based on MMEngine to achieve unified data elements, model interfaces, visualizers, evaluators and other runtime modules across different datasets, tasks and even codebases. A brief summary for this refactoring is as follows:
+
+- Data Element:
+  - We add [`Det3DDataSample`](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/structures/det3d_data_sample.py) as the common data element passing through datasets and models. It inherits from [`DetDataSample`](<%5Bhttps://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/structures/det3d_data_sample.py%5D(https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/structures/det_data_sample.py)>) in mmdetection and implement `InstanceData`, `PixelData`, and
+    `LabelData` inheriting from `BaseDataElement` in MMEngine to represent different types of ground truth labels or predictions.
+- Datasets:
+  - We add [`Det3DDataset`](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/datasets/det3d_dataset.py) and [`Seg3DDataset`](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/datasets/seg3d_dataset.py) as the base datasets to inherit from the unified `BaseDataset` in MMEngine. They implement most functions that are commonly used across different datasets and simplify the info loading/processing in the current datasets. Re-defined input arguments and functions can be most re-used in different datasets, which are important for the implementation of customized datasets.
+  - We define the common keys across different datasets and unify all the info files with a standard protocol. The same info is more clear for users because they share the same key across different dataset infos. Besides, for different settings, such as camera-only and LiDAR-only methods, we no longer need different info formats (like the previous pkl and json files). We can just revise the `parse_data_info` to read the necessary information from a complete info file.
+  - We add `train_dataloader`, `val_dataloader` and `test_dataloader` to replace the original `data` in the config. It simplify the levels of data-related fields.
+- Data Transforms
+  - Based on the basic transforms and wrappers re-implemented and simplified in the latest MMCV, we refactor data transforms to inherit from them.
+  - We also adjust the implementation of current data pipelines to make them compatible with our latest data protocol.
+  - Normalization, padding of images and voxelization operations are moved to the data-preprocessing.
+  - `DefaultFormatBundle3D` and `Collect3D` are replaced with `PackDet3DInputs` to pack the data into the element format as model input.
+- Models
+  - Unify the model interface as `inputs`, `data_samples`, `return_loss=False`
+  - The basic pre-processing before model forward includes: 1) convert input from CPU to GPU tensors; 2) padding images; 3) normalize images; 4) voxelization.
+  - Return `loss_dict` during training while return `list[data_sample]` during inference
+  - Simply function interfaces in the models
+  - Add `preprocess_cfg` in the model configs for pre-processing
+- Visualizer
+  - Design a unified visualizer, [`Det3DLocalVisualizer`](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/visualization/local_visualizer.py), based on MMEngine for different 3D tasks and settings
+  - Support browsing dataset and visualization hooks based on the [`Det3DLocalVisualizer`](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/visualization/local_visualizer.py)
+- Evaluator
+  - Decouple evaluators from datasets to make them more flexible: the evaluation codes of each dataset are implemented as a metric class exclusively.
+  - Add evaluator information to the current dataset configs
+- Registry
+  - Refactor all the registries to inherit from root registries in MMEngine
+  - When using modules from other codebases, it is necessary to specify the registry scope, such as `mmdet.ResNet`
+- Others: Refactor logging, hooks, scheduler, runner and other runtime configs based on MMEngine
+
+## v1.0.0rc1
+
+### Operators Migration
+
+We have adopted CUDA operators compiled from [mmcv](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/__init__.py) and removed all the CUDA operators in mmdet3d. We now do not need to compile the CUDA operators in mmdet3d anymore.
+
+### Waymo dataset converter refactoring
+
+In this version we did a major code refactoring that boosted the performance of waymo dataset conversion by multiprocessing.
+Meanwhile, we also fixed the imprecise timestamps saving issue in waymo dataset conversion. This change introduces following backward compatibility breaks:
+
+- The point cloud .bin files of waymo dataset need to be regenerated.
+  In the .bin files each point occupies 6 `float32` and the meaning of the last `float32` now changed from **imprecise timestamps** to **range frame offset**.
+  The **range frame offset** for each point is calculated as`ri * h * w + row * w + col` if the point is from the **TOP** lidar or `-1` otherwise.
+  The `h`, `w` denote the height and width of the TOP lidar's range frame.
+  The `ri`, `row`, `col` denote the return index, the row and the column of the range frame where each point locates.
+  Following tables show the difference across the change:
+
+Before
+
+| Element offset (float32) |  0  |  1  |  2  |     3     |     4      |            5            |
+| ------------------------ | :-: | :-: | :-: | :-------: | :--------: | :---------------------: |
+| Bytes offset             |  0  |  4  |  8  |    12     |     16     |           20            |
+| Meaning                  |  x  |  y  |  z  | intensity | elongation | **imprecise timestamp** |
+
+After
+
+| Element offset (float32) |  0  |  1  |  2  |     3     |     4      |           5            |
+| ------------------------ | :-: | :-: | :-: | :-------: | :--------: | :--------------------: |
+| Bytes offset             |  0  |  4  |  8  |    12     |     16     |           20           |
+| Meaning                  |  x  |  y  |  z  | intensity | elongation | **range frame offset** |
+
+- The objects' point cloud .bin files in the GT-database of waymo dataset need to be regenerated because we also dumped the range frame offset for each point into it.
+  Following tables show the difference across the change:
+
+Before
+
+| Element offset (float32) |  0  |  1  |  2  |     3     |     4      |
+| ------------------------ | :-: | :-: | :-: | :-------: | :--------: |
+| Bytes offset             |  0  |  4  |  8  |    12     |     16     |
+| Meaning                  |  x  |  y  |  z  | intensity | elongation |
+
+After
+
+| Element offset (float32) |  0  |  1  |  2  |     3     |     4      |           5            |
+| ------------------------ | :-: | :-: | :-: | :-------: | :--------: | :--------------------: |
+| Bytes offset             |  0  |  4  |  8  |    12     |     16     |           20           |
+| Meaning                  |  x  |  y  |  z  | intensity | elongation | **range frame offset** |
+
+- Any configuration that uses waymo dataset with GT Augmentation should change the `db_sampler.points_loader.load_dim` from `5` to `6`.
+
+## v1.0.0rc0
+
+### Coordinate system refactoring
+
+In this version, we did a major code refactoring which improved the consistency among the three coordinate systems (and corresponding box representation), LiDAR, Camera, and Depth. A brief summary for this refactoring is as follows:
+
+- The three coordinate systems are all right-handed now (which means the yaw angle increases in the counterclockwise direction).
+- The LiDAR system `(x_size, y_size, z_size)` corresponds to `(l, w, h)` instead of `(w, l, h)`. This is more natural since `l` is parallel with the direction where the yaw angle is zero, and we prefer using the positive direction of the `x` axis as that direction, which is exactly how we define yaw angle in Depth and Camera coordinate systems.
+- The APIs for box-related operations are improved and now are more user-friendly.
+
+#### ***NOTICE!!***
+
+Since definitions of box representation have changed, the annotation data of most datasets require updating:
+
+- SUN RGB-D: Yaw angles in the annotation should be reversed.
+- KITTI: For LiDAR boxes in GT databases, (x_size, y_size, z_size, yaw) out of (x, y, z, x_size, y_size, z_size) should be converted from the old LiDAR coordinate system to the new one. The training/validation data annotations should be left unchanged since they are under the Camera coordinate system, which is unmodified after the refactoring.
+- Waymo: Same as KITTI.
+- nuScenes: For LiDAR boxes in training/validation data and GT databases, (x_size, y_size, z_size, yaw) out of (x, y, z, x_size, y_size, z_size) should be converted.
+- Lyft: Same as nuScenes.
+
+Please regenerate the data annotation/GT database files or use [`update_data_coords.py`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/tools/update_data_coords.py) to update the data.
+
+To use boxes under Depth and LiDAR coordinate systems, or to convert boxes between different coordinate systems, users should be aware of the difference between the old and new definitions. For example, the rotation, flipping, and bev functions of [`DepthInstance3DBoxes`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/bbox/structures/depth_box3d.py) and [`LiDARInstance3DBoxes`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/bbox/structures/lidar_box3d.py) and box conversion [functions](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/bbox/structures/box_3d_mode.py) have all been reimplemented in the refactoring.
+
+Consequently, functions like [`output_to_lyft_box`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/datasets/lyft_dataset.py) undergo small modification to adapt to the new LiDAR/Depth box.
+
+Since the LiDAR system `(x_size, y_size, z_size)` now corresponds to `(l, w, h)` instead of `(w, l, h)`, the anchor sizes for LiDAR boxes are also changed, e.g., from `[1.6, 3.9, 1.56]` to `[3.9, 1.6, 1.56]`.
+
+Functions only involving points are generally unaffected except if they rely on some refactored utility functions such as `rotation_3d_in_axis`.
+
+#### Other BC-breaking or new features:
+
+- `array_converter`: Please refer to [array_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/utils/array_converter.py). Functions wrapped with `array_converter` can convert array-like input types of `torch.Tensor`, `np.ndarray`, and `list/tuple/float` to `torch.Tensor` to process in an unified PyTorch pipeline. The result may finally be converted back to the input type. Most functions in [utils.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/bbox/structures/utils.py) are wrapped with `array_converter`.
+- [`points_in_boxes`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/bbox/structures/base_box3d.py) and [`points_in_boxes_batch`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/bbox/structures/base_box3d.py) will be deprecated soon. They are renamed to `points_in_boxes_part` and `points_in_boxes_all` respectively, with more detailed docstrings. The major difference of the two functions is that if a point is enclosed by multiple boxes, `points_in_boxes_part` will only return the index of the first enclosing box while `points_in_boxes_all` will return all the indices of enclosing boxes.
+- `rotation_3d_in_axis`: Please refer to [utils.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/bbox/structures/utils.py). Now this function supports multiple input types and more options. The function with the same name in [box_np_ops.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/bbox/box_np_ops.py) is deleted since we do not need another function to tackle with NumPy data. `rotation_2d`, `points_cam2img`, and `limit_period` in box_np_ops.py are also deleted for the same reason.
+- `bev` method of [`CameraInstance3DBoxes`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/core/bbox/structures/cam_box3d.py): Changed it to be consistent with the definition of bev in Depth and LiDAR coordinate systems.
+- Data augmentation utils in [data_augment_utils.py](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/datasets/pipelines/data_augment_utils.py) now follow the rules of a right-handed system.
+- We do not need the yaw hacking in KITTI anymore after refining [`get_direction_target`](https://github.com/open-mmlab/mmdetection3d/blob/v1.0.0rc0/mmdet3d/models/dense_heads/train_mixins.py). Interested users may refer to PR [#677](https://github.com/open-mmlab/mmdetection3d/pull/677) .
+
+## 0.16.0
+
+### Returned values of `QueryAndGroup` operation
+
+We modified the returned `grouped_xyz` value of operation `QueryAndGroup` to support PAConv segmentor. Originally, the `grouped_xyz` is centered by subtracting the grouping centers, which represents the relative positions of grouped points. Now, we didn't perform such subtraction and the returned `grouped_xyz` stands for the absolute coordinates of these points.
+
+Note that, the other returned variables of `QueryAndGroup` such as `new_features`, `unique_cnt` and `grouped_idx` are not affected.
+
+### NuScenes coco-style data pre-processing
+
+We remove the rotation and dimension hack in the monocular 3D detection on nuScenes. Specifically, we transform the rotation and dimension of boxes defined by nuScenes devkit to the coordinate system of our `CameraInstance3DBoxes` in the pre-processing and transform them back in the post-processing. In this way, we can remove the corresponding [hack](https://github.com/open-mmlab/mmdetection3d/pull/744/files#diff-5bee5062bd84e6fa25a2fdd71353f6f283dfdc4a66a0316c3b1ca26078c978b6L165) used in the visualization tools. The modification also guarantees the correctness of all the operations based on our `CameraInstance3DBoxes` (such as NMS and flip augmentation) when training monocular 3D detectors.
+
+The modification only influences nuScenes coco-style json files. Please re-run the nuScenes data preparation script if necessary. See more details in the PR [#744](https://github.com/open-mmlab/mmdetection3d/pull/744).
+
+### ScanNet dataset for ImVoxelNet
+
+We adopt a new pre-processing procedure for the ScanNet dataset in order to support ImVoxelNet, which is a multi-view method requiring image data. In previous versions of MMDetection3D, ScanNet dataset was only used for point cloud based 3D detection and segmentation methods. We plan adding ImVoxelNet to our model zoo, thus updating ScanNet correspondingly by adding image-related pre-processing steps. Specifically, we made these changes:
+
+- Add [script](https://github.com/open-mmlab/mmdetection3d/blob/master/data/scannet/extract_posed_images.py) for extracting RGB data.
+- Update [script](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/dataset_converters/scannet_data_utils.py) for annotation creating.
+- Add instructions in the documents on preparing image data.
+
+Please refer to the ScanNet [README.md](https://github.com/open-mmlab/mmdetection3d/blob/master/data/scannet/README.md/) for more details.
+
+## 0.15.0
+
+### MMCV Version
+
+In order to fix the problem that the priority of EvalHook is too low, all hook priorities have been re-adjusted in 1.3.8, so MMDetection 2.14.0 needs to rely on the latest MMCV 1.3.8 version. For related information, please refer to [#1120](https://github.com/open-mmlab/mmcv/pull/1120), for related issues, please refer to [#5343](https://github.com/open-mmlab/mmdetection/issues/5343).
+
+### Unified parameter initialization
+
+To unify the parameter initialization in OpenMMLab projects, MMCV supports `BaseModule` that accepts `init_cfg` to allow the modules' parameters initialized in a flexible and unified manner. Now the users need to explicitly call `model.init_weights()` in the training script to initialize the model (as in [here](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/train.py#L183), previously this was handled by the detector. Please refer to PR [#622](https://github.com/open-mmlab/mmdetection3d/pull/622) for details.
+
+### BackgroundPointsFilter
+
+We modified the dataset augmentation function `BackgroundPointsFilter`([here](https://github.com/open-mmlab/mmdetection3d/blob/v0.15.0/mmdet3d/datasets/pipelines/transforms_3d.py#L1132)). In previous version of MMdetection3D, `BackgroundPointsFilter` changes the gt_bboxes_3d's bottom center to the gravity center. In MMDetection3D 0.15.0,
+`BackgroundPointsFilter` will not change it. Please refer to PR [#609](https://github.com/open-mmlab/mmdetection3d/pull/609) for details.
+
+### Enhance `IndoorPatchPointSample` transform
+
+We enhance the pipeline function `IndoorPatchPointSample` used in point cloud segmentation task by adding more choices for patch selection. Also, we plan to remove the unused parameter `sample_rate` in the future. Please modify the code as well as the config files accordingly if you use this transform.
+
+## 0.14.0
+
+### Dataset class for 3D segmentation task
+
+We remove a useless parameter `label_weight` from segmentation datasets including `Custom3DSegDataset`, `ScanNetSegDataset` and `S3DISSegDataset` since this weight is utilized in the loss function of model class. Please modify the code as well as the config files accordingly if you use or inherit from these codes.
+
+### ScanNet data pre-processing
+
+We adopt new pre-processing and conversion steps of ScanNet dataset. In previous versions of MMDetection3D, ScanNet dataset was only used for 3D detection task, where we trained on the training set and tested on the validation set. In MMDetection3D 0.14.0, we further support 3D segmentation task on ScanNet, which includes online benchmarking on test set. Since the alignment matrix is not provided for test set data, we abandon the alignment of points in data generation steps to support both tasks. Besides, as 3D segmentation requires per-point prediction, we also remove the down-sampling step in data generation.
+
+- In the new ScanNet processing scripts, we save the unaligned points for all the training, validation and test set. For train and val set with annotations, we also store the `axis_align_matrix` in data infos. For ground-truth bounding boxes, we store boxes in both aligned and unaligned coordinates with key `gt_boxes_upright_depth` and key `unaligned_gt_boxes_upright_depth` respectively in data infos.
+
+- In `ScanNetDataset`, we now load the `axis_align_matrix` as a part of data annotations. If it is not contained in old data infos, we will use identity matrix for compatibility. We also add a transform function `GlobalAlignment` in ScanNet detection data pipeline to align the points.
+
+- Since the aligned boxes share the same key as in old data infos, we do not need to modify the code related to it. But do remember that they are not in the same coordinate system as the saved points.
+
+- There is an `PointSample` pipeline in the data pipelines for ScanNet detection task which down-samples points. So removing down-sampling in data generation will not affect the code.
+
+We have trained a [VoteNet](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/votenet/votenet_8x8_scannet-3d-18class.py) model on the newly processed ScanNet dataset and get similar benchmark results. In order to prepare ScanNet data for both detection and segmentation tasks, please re-run the new pre-processing scripts following the ScanNet [README.md](https://github.com/open-mmlab/mmdetection3d/blob/master/data/scannet/README.md/).
+
+## 0.12.0
+
+### SUNRGBD dataset for ImVoteNet
+
+We adopt a new pre-processing procedure for the SUNRGBD dataset in order to support ImVoteNet, which is a multi-modality method requiring both image and point cloud data. In previous versions of MMDetection3D, SUNRGBD dataset was only used for point cloud based 3D detection methods. In MMDetection3D 0.12.0, we add ImVoteNet to our model zoo, thus updating SUNRGBD correspondingly by adding image-related pre-processing steps. Specifically, we made these changes:
+
+- Fix a bug in the image file path in meta data.
+- Convert calibration matrices from double to float to avoid type mismatch in further operations.
+- Add instructions in the documents on preparing image data.
+
+Please refer to the SUNRGBD [README.md](https://github.com/open-mmlab/mmdetection3d/blob/master/data/sunrgbd/README.md/) for more details.
+
+## 0.6.0
+
+### VoteNet and H3DNet model structure update
+
+In MMDetection 0.6.0, we updated the model structures of VoteNet and H3DNet, therefore model checkpoints generated by MMDetection \< 0.6.0 should be first converted to a format compatible with the latest structures via [convert_votenet_checkpoints.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/model_converters/convert_votenet_checkpoints.py) and [convert_h3dnet_checkpoints.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/model_converters/convert_h3dnet_checkpoints.py) . For more details, please refer to the VoteNet [README.md](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/votenet/README.md/) and H3DNet [README.md](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/h3dnet/README.md/).
diff --git a/mmde/docs/en/notes/contribution_guides.md b/mmde/docs/en/notes/contribution_guides.md
new file mode 100644
index 0000000000000000000000000000000000000000..e201b6c989610fd537d4fb5b43e58a0e93a8af45
--- /dev/null
+++ b/mmde/docs/en/notes/contribution_guides.md
@@ -0,0 +1,139 @@
+# Contribution Guide
+
+OpenMMLab welcomes everyone who is interested in contributing to our projects and accepts contribution in the form of PR.
+
+## What is PR
+
+`PR` is the abbreviation of `Pull Request`. Here's the definition of `PR` in the [official document](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) of Github.
+
+```
+Pull requests let you tell others about changes you have pushed to a branch in a repository on GitHub. Once a pull request is opened, you can discuss and review the potential changes with collaborators and add follow-up commits before your changes are merged into the base branch.
+```
+
+## Basic Workflow
+
+1. Get the most recent codebase
+
+2. Checkout a new branch from `dev-1.x` or `dev` branch, depending on the version of the codebase you want to contribute to. The main differences between `dev-1.x` and `dev` is that `dev-1.x` depends on MMEngine additionally and it's the main branch we maintains. We strongly recommend you pull request based on more advanced `dev-1.x` branch.
+
+3. Commit your changes ([Don't forget to use pre-commit hooks!](#3-commit-your-changes))
+
+4. Push your changes and create a PR
+
+5. Discuss and review your code
+
+6. Merge your branch to `dev-1.x` / `dev` branch
+
+## Procedures in detail
+
+### 1. Get the most recent codebase
+
+- When you work on your first PR
+
+  Fork the OpenMMLab repository: click the **fork** button at the top right corner of Github page
+  ![avatar](https://user-images.githubusercontent.com/34888372/224920532-dc11f696-1175-436a-8c0f-1966f5ca33d1.png)
+
+  Clone forked repository to local
+
+  ```bash
+  git clone git@github.com:XXX/mmdetection3d.git
+  ```
+
+  Add source repository to upstream
+
+  ```bash
+  git remote add upstream git@github.com:open-mmlab/mmdetection3d
+  ```
+
+- After your first PR
+
+  Checkout the latest branch of the local repository and pull the latest branch of the source repository. Here we assume that you are working on the `dev-1.x` branch.
+
+  ```bash
+  git checkout dev-1.x
+  git pull upstream dev-1.x
+  ```
+
+### 2. Checkout a new branch from the `dev-1.x` / `dev` branch
+
+```bash
+git checkout -b branchname
+```
+
+```{tip}
+To make commit history clear, we strongly recommend you checkout the `dev-1.x` branch before creating a new branch.
+```
+
+### 3. Commit your changes
+
+- If you are a first-time contributor, please install and initialize pre-commit hooks from the repository root directory first.
+
+  ```bash
+  pip install -U pre-commit
+  pre-commit install
+  ```
+
+- Commit your changes as usual. Pre-commit hooks will be triggered to stylize your code before each commit.
+
+  ```bash
+  # coding
+  git add [files]
+  git commit -m 'messages'
+  ```
+
+  ```{note}
+  Sometimes your code may be changed by pre-commit hooks. In this case, please remember to re-stage the modified files and commit again.
+  ```
+
+### 4. Push your changes to the forked repository and create a PR
+
+- Push the branch to your forked remote repository
+
+  ```bash
+  git push origin branchname
+  ```
+
+- Create a PR
+  ![avatar](https://user-images.githubusercontent.com/34888372/224922548-69455db9-68d1-4d92-a007-afcd2814b1c1.png)
+
+- Revise PR message template to describe your motivation and modifications made in this PR. You can also link the related issue to the PR manually in the PR message (For more information, checkout the [official guidance](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)).
+
+- Specifically, if you are contributing to `dev-1.x`, you will have to change the base branch of the PR to `dev-1.x` in the PR page, since the default base branch is `master`.
+
+  ![avatar](https://user-images.githubusercontent.com/34888372/224923009-1d611a30-0bfc-4fe5-93a2-96cc88a18886.png)
+
+- You can also ask a specific person to review the changes you've proposed.
+
+### 5. Discuss and review your code
+
+- Modify your codes according to reviewers' suggestions and then push your changes.
+
+### 6.  Merge your branch to the `dev-1.x` / `dev` branch and delete the branch
+
+- After the PR is merged by the maintainer, you can delete the branch you created in your forked repository.
+
+  ```bash
+  git branch -d branchname # delete local branch
+  git push origin --delete branchname # delete remote branch
+  ```
+
+## PR Specs
+
+1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style
+
+2. One short-time branch should be matched with only one PR
+
+3. Accomplish a detailed change in one PR. Avoid large PR
+
+   - Bad: Support Faster R-CNN
+   - Acceptable: Add a box head to Faster R-CNN
+   - Good: Add a parameter to box head to support custom conv-layer number
+
+4. Provide clear and significant commit message
+
+5. Provide clear and meaningful PR description
+
+   - Task name should be clarified in title. The general format is: \[Prefix\] Short description of the PR (Suffix)
+   - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily)
+   - Introduce main changes, results and influences on other modules in short description
+   - Associate related issues and pull requests with a milestone
diff --git a/mmde/docs/en/notes/faq.md b/mmde/docs/en/notes/faq.md
new file mode 100644
index 0000000000000000000000000000000000000000..0a28fd01506b93b149a763b1828b2510a2daf00e
--- /dev/null
+++ b/mmde/docs/en/notes/faq.md
@@ -0,0 +1,57 @@
+# FAQ
+
+We list some potential troubles encountered by users and developers, along with their corresponding solutions. Feel free to enrich the list if you find any frequent issues and contribute your solutions to solve them. If you have any trouble with environment configuration, model training, etc, please create an issue using the [provided templates](https://github.com/open-mmlab/mmdetection3d/blob/master/.github/ISSUE_TEMPLATE/error-report.md) and fill in all required information in the template.
+
+## MMEngine/MMCV/MMDet/MMDet3D Installation
+
+- Compatibility issue between MMEngine, MMCV, MMDetection and MMDetection3D; "ConvWS is already registered in conv layer"; "AssertionError: MMCV==xxx is used but incompatible. Please install mmcv>=xxx, \<=xxx."
+
+- The required versions of MMEngine, MMCV and MMDetection for different versions of MMDetection3D are as below. Please install the correct version of MMEngine, MMCV and MMDetection to avoid installation issues.
+
+  | MMDetection3D version |     MMEngine version     |      MMCV version       |   MMDetection version    |
+  | --------------------- | :----------------------: | :---------------------: | :----------------------: |
+  | main                  | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.4.0 |
+  | v1.4.0                | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.4.0 |
+  | v1.3.0                | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.3.0 |
+  | v1.2.0                | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 |  mmdet>=3.0.0, \<3.2.0   |
+  | v1.1.1                | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 |  mmdet>=3.0.0, \<3.1.0   |
+
+  **Note:** If you want to install mmdet3d-v1.0.0rcx, the compatible MMDetection, MMSegmentation and MMCV versions table can be found at [here](https://mmdetection3d.readthedocs.io/en/latest/faq.html#mmcv-mmdet-mmdet3d-installation). Please choose the correct version of MMCV, MMDetection and MMSegmentation to avoid installation issues.
+
+- If you faced the error shown below when importing open3d:
+
+  `OSError: /lib/x86_64-linux-gnu/libm.so.6: version 'GLIBC_2.27' not found`
+
+  please downgrade open3d to 0.9.0.0, because the latest open3d needs the support of file 'GLIBC_2.27', which only exists in Ubuntu 18.04, not in Ubuntu 16.04.
+
+- If you faced the error when importing pycocotools, this is because nuscenes-devkit installs pycocotools but mmdet relies on mmpycocotools. The current workaround is as below. We will migrate to use pycocotools in the future.
+
+  ```shell
+  pip uninstall pycocotools mmpycocotools
+  pip install mmpycocotools
+  ```
+
+  **NOTE**: We have migrated to use pycocotools in mmdet3d >= 0.13.0.
+
+- If you face the error shown below when importing pycocotools:
+
+  `ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject`
+
+  please downgrade pycocotools to 2.0.1 because of the incompatibility between the newest pycocotools and numpy \< 1.20.0. Or you can compile and install the latest pycocotools from source as below:
+
+  `pip install -e "git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI"`
+
+  or
+
+  `pip install -e "git+https://github.com/ppwwyyxx/cocoapi#egg=pycocotools&subdirectory=PythonAPI"`
+
+- If you face some errors about numba in cuda-9.0 environment, you should check the version of numba. In cuda-9.0 environment, the high version of numba is not supported and we suggest you could install numba==0.53.0.
+
+## How to annotate point cloud?
+
+MMDetection3D does not support point cloud annotation. Some open-source annotation tool are offered for reference:
+
+- [SUSTechPOINTS](https://github.com/naurril/SUSTechPOINTS)
+- [LATTE](https://github.com/bernwang/latte)
+
+Besides, we improved [LATTE](https://github.com/bernwang/latte) for better use. More details can be found [here](https://arxiv.org/abs/2011.10174).
diff --git a/mmde/docs/en/notes/index.rst b/mmde/docs/en/notes/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a82311fc33165e6d8373f616ba1d2dc19f28adcd
--- /dev/null
+++ b/mmde/docs/en/notes/index.rst
@@ -0,0 +1,9 @@
+.. toctree::
+   :maxdepth: 1
+
+   benchmarks.md
+   changelog_v1.0.x.md
+   changelog.md
+   compatibility.md
+   faq.md
+   contribution_guides.md
diff --git a/mmde/docs/en/stat.py b/mmde/docs/en/stat.py
new file mode 100644
index 0000000000000000000000000000000000000000..4937adb435355fc27aa6990821382cc9b5fc0712
--- /dev/null
+++ b/mmde/docs/en/stat.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+import functools as func
+import glob
+import re
+from os import path as osp
+
+import numpy as np
+
+url_prefix = 'https://github.com/open-mmlab/mmdetection3d/blob/main'
+
+files = sorted(glob.glob('../../configs/*/README.md'))
+
+stats = []
+titles = []
+num_ckpts = 0
+
+for f in files:
+    url = osp.dirname(f.replace('../../', url_prefix))
+
+    with open(f, 'r') as content_file:
+        content = content_file.read()
+
+    title = content.split('\n')[0].replace('# ', '').strip()
+    ckpts = set(x.lower().strip()
+                for x in re.findall(r'\[model\]\((https?.*)\)', content))
+
+    if len(ckpts) == 0:
+        continue
+
+    _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)]
+    assert len(_papertype) > 0
+    papertype = _papertype[0]
+
+    paper = set([(papertype, title)])
+
+    titles.append(title)
+    num_ckpts += len(ckpts)
+
+    statsmsg = f"""
+\t* [{papertype}] [{title}]({url}) ({len(ckpts)} ckpts)
+"""
+    stats.append((paper, ckpts, statsmsg))
+
+allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _ in stats])
+msglist = '\n'.join(x for _, _, x in stats)
+
+papertypes, papercounts = np.unique([t for t, _ in allpapers],
+                                    return_counts=True)
+countstr = '\n'.join(
+    [f'   - {t}: {c}' for t, c in zip(papertypes, papercounts)])
+
+modelzoo = f"""
+# Model Zoo Statistics
+
+* Number of papers: {len(set(titles))}
+{countstr}
+
+* Number of checkpoints: {num_ckpts}
+
+{msglist}
+"""
+
+with open('modelzoo_statistics.md', 'w') as f:
+    f.write(modelzoo)
diff --git a/mmde/docs/en/switch_language.md b/mmde/docs/en/switch_language.md
new file mode 100644
index 0000000000000000000000000000000000000000..d33d0803ef8f62e1410f12f991a8d78f2cf75df1
--- /dev/null
+++ b/mmde/docs/en/switch_language.md
@@ -0,0 +1,3 @@
+## <a href='https://mmdetection3d.readthedocs.io/en/latest/'>English</a>
+
+## <a href='https://mmdetection3d.readthedocs.io/zh_CN/latest/'>简体中文</a>
diff --git a/mmde/docs/en/user_guides/backends_support.md b/mmde/docs/en/user_guides/backends_support.md
new file mode 100644
index 0000000000000000000000000000000000000000..838fedefda55c47d62d9b307b05c747fa9315b60
--- /dev/null
+++ b/mmde/docs/en/user_guides/backends_support.md
@@ -0,0 +1,154 @@
+# Backends Support
+
+We support different file client backends: Disk, Ceph and LMDB, etc. Here is an example of how to modify configs for Ceph-based data loading and saving.
+
+## Load data and annotations from Ceph
+
+We support loading data and generated annotation info files (pkl and json) from Ceph:
+
+```python
+# set file client backends as Ceph
+backend_args = dict(
+    backend='petrel',
+    path_mapping=dict({
+        './data/nuscenes/':
+        's3://openmmlab/datasets/detection3d/nuscenes/', # replace the path with your data path on Ceph
+        'data/nuscenes/':
+        's3://openmmlab/datasets/detection3d/nuscenes/' # replace the path with your data path on Ceph
+    }))
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    sample_groups=dict(Car=15),
+    classes=class_names,
+    # set file client for points loader to load training data
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    # set file client for data base sampler to load db info file
+    backend_args=backend_args)
+
+train_pipeline = [
+    # set file client for loading training data
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args),
+    # set file client for loading training data annotations
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, backend_args=backend_args),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[0.25, 0.25, 0.25],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.15707963267, 0.15707963267]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    # set file client for loading validation/testing data
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+
+data = dict(
+    # set file client for loading training info files (.pkl)
+    train=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(pipeline=train_pipeline, classes=class_names, backend_args=backend_args)),
+    # set file client for loading validation info files (.pkl)
+    val=dict(pipeline=test_pipeline, classes=class_names,backend_args=backend_args),
+    # set file client for loading testing info files (.pkl)
+    test=dict(pipeline=test_pipeline, classes=class_names, backend_args=backend_args))
+```
+
+## Load pretrained model from Ceph
+
+```python
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_1.6gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='s3://openmmlab/checkpoints/mmdetection3d/regnetx_1.6gf'), # replace the path with your pretrained model path on Ceph
+        ...
+```
+
+## Load checkpoint from Ceph
+
+```python
+# replace the path with your checkpoint path on Ceph
+load_from = 's3://openmmlab/checkpoints/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230614-77663cd6.pth.pth'
+resume_from = None
+workflow = [('train', 1)]
+```
+
+## Save checkpoint into Ceph
+
+```python
+# checkpoint saving
+# replace the path with your checkpoint saving path on Ceph
+checkpoint_config = dict(interval=1, max_keep_ckpts=2, out_dir='s3://openmmlab/mmdetection3d')
+```
+
+## EvalHook saves the best checkpoint into Ceph
+
+```python
+# replace the path with your checkpoint saving path on Ceph
+evaluation = dict(interval=1, save_best='bbox', out_dir='s3://openmmlab/mmdetection3d')
+```
+
+## Save the training log into Ceph
+
+The training log will be backed up to the specified Ceph path after training.
+
+```python
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook', out_dir='s3://openmmlab/mmdetection3d'),
+    ])
+```
+
+You can also delete the local training log after backing up to the specified Ceph path by setting `keep_local = False`.
+
+```python
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook', out_dir='s3://openmmlab/mmdetection3d', keep_local=False),
+    ])
+```
diff --git a/mmde/docs/en/user_guides/config.md b/mmde/docs/en/user_guides/config.md
new file mode 100644
index 0000000000000000000000000000000000000000..2d4358e5231148bdf9f5fa9afbcc04c5ef0ac3d2
--- /dev/null
+++ b/mmde/docs/en/user_guides/config.md
@@ -0,0 +1,573 @@
+# Learn about Configs
+
+MMDetection3D and other OpenMMLab repositories use [MMEngine's config system](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html). It has a modular and inheritance design, which is convenient to conduct various experiments.
+
+## Config file content
+
+MMDetection3D uses a modular design, all modules with different functions can be configured through the config. Taking PointPillars as an example, we will introduce each field in the config according to different function modules.
+
+### Model config
+
+In MMDetection3D's config, we use `model` to setup detection algorithm components. In addition to neural network components such as `voxel_encoder`, `backbone` etc, it also requires `data_preprocessor`, `train_cfg`, and `test_cfg`. `data_preprocessor` is responsible for processing a batch of data output by dataloader. `train_cfg` and `test_cfg` in the model config are training and testing hyperparameters of the components.
+
+```python
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=32,
+            point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1],
+            voxel_size=[0.16, 0.16, 4],
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=[0.16, 0.16, 4],
+        point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]),
+    middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+    backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        assign_per_class=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                    [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                    [0, -39.68, -1.78, 69.12, 39.68, -1.78]],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.1111111111111111,
+            loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    train_cfg=dict(
+        assigner=[
+            dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1)
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
+```
+
+### Dataset and evaluator config
+
+[Dataloaders](https://pytorch.org/docs/stable/data.html?highlight=data%20loader#torch.utils.data.DataLoader) are required for the training, validation, and testing of the [runner](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). Dataset and data pipeline need to be set to build the dataloader. Due to the complexity of this part, we use intermediate variables to simplify the writing of dataloader configs.
+
+```python
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+metainfo = dict(classes=class_names)
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=15, Cyclist=15),
+    points_loader=dict(
+        type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4))
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler, use_ground_plane=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+]
+test_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+eval_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(pts='training/velodyne_reduced'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            box_type_3d='LiDAR')))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+```
+
+[Evaluators](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html) are used to compute the metrics of the trained model on the validation and testing datasets. The config of evaluators consists of one or a list of metric configs:
+
+```python
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox')
+test_evaluator = val_evaluator
+```
+
+Since the test dataset has no annotation files, the test_dataloader and test_evaluator config in MMDetection3D are generally equal to the val's. If you want to save the detection results on the test dataset, you can write the config like this:
+
+```python
+# inference on test dataset and
+# format the output results for submission.
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='testing/velodyne_reduced'),
+        ann_file='kitti_infos_test.pkl',
+        load_eval_anns=False,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+test_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_test.pkl',
+    metric='bbox',
+    format_only=True,
+    submission_prefix='results/kitti-3class/kitti_results')
+```
+
+### Training and testing config
+
+MMEngine's runner uses Loop to control the training, validation, and testing processes.
+Users can set the maximum training epochs and validation intervals with these fields:
+
+```python
+train_cfg = dict(
+    type='EpochBasedTrainLoop',
+    max_epochs=80,
+    val_interval=2)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+```
+
+### Optimization config
+
+`optim_wrapper` is the field to configure optimization-related settings. The optimizer wrapper not only provides the functions of the optimizer, but also supports functions such as gradient clipping, mixed precision training, etc. Find more in [optimizer wrapper tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html).
+
+```python
+optim_wrapper = dict(  # Optimizer wrapper config
+    type='OptimWrapper',  # Optimizer wrapper type, switch to AmpOptimWrapper to enable mixed precision training.
+    optimizer=dict(  # Optimizer config. Support all kinds of optimizers in PyTorch. Refer to https://pytorch.org/docs/stable/optim.html#algorithms
+        type='AdamW', lr=0.001, betas=(0.95, 0.99), weight_decay=0.01),
+    clip_grad=dict(max_norm=35, norm_type=2))  # Gradient clip option. Set None to disable gradient clip. Find usage in https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html
+```
+
+`param_scheduler` is a field that configures methods of adjusting optimization hyperparameters such as learning rate and momentum. Users can combine multiple schedulers to create a desired parameter adjustment strategy. Find more in [parameter scheduler tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/param_scheduler.html) and [parameter scheduler API documents](https://mmengine.readthedocs.io/en/latest/api/optim.html#scheduler).
+
+```python
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=32,
+        eta_min=0.01,
+        begin=0,
+        end=32,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=48,
+        eta_min=1.0000000000000001e-07,
+        begin=32,
+        end=80,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=32,
+        eta_min=0.8947368421052632,
+        begin=0,
+        end=32,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=48,
+        eta_min=1,
+        begin=32,
+        end=80,
+        by_epoch=True,
+        convert_to_iter_based=True),
+]
+```
+
+### Hook config
+
+Users can attach Hooks to training, validation, and testing loops to insert some operations during running. There are two different hook fields, one is `default_hooks` and the other is `custom_hooks`.
+
+`default_hooks` is a dict of hook configs, and they are the hooks must be required at the runtime. They have default priority which should not be modified. If not set, runner will use the default values. To disable a default hook, users can set its config to `None`.
+
+```python
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=-1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='Det3DVisualizationHook'))
+```
+
+`custom_hooks` is a list of all other hook configs. Users can develop their own hooks and insert them in this field.
+
+```python
+custom_hooks = []
+```
+
+### Runtime config
+
+```python
+default_scope = 'mmdet3d'  # The default registry scope to find modules. Refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html
+
+env_cfg = dict(
+    cudnn_benchmark=False,  # Whether to enable cudnn benchmark
+    mp_cfg=dict(  # Multi-processing config
+        mp_start_method='fork',  # Use fork to start multi-processing threads. 'fork' usually faster than 'spawn' but maybe unsafe. See discussion in https://github.com/pytorch/pytorch/issues/1355
+        opencv_num_threads=0),  # Disable opencv multi-threads to avoid system being overloaded
+    dist_cfg=dict(backend='nccl'))  # Distribution configs
+
+vis_backends = [dict(type='LocalVisBackend')]  # Visualization backends. Refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/visualization.html
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+log_processor = dict(
+    type='LogProcessor',  # Log processor to process runtime logs
+    window_size=50,  # Smooth interval of log values
+    by_epoch=True)  # Whether to format logs with epoch type. Should be consistent with the train loop's type.
+
+log_level = 'INFO'  # The level of logging.
+load_from = None  # Load model checkpoint as a pre-trained model from a given path. This will not resume training.
+resume = False  # Whether to resume from the checkpoint defined in `load_from`. If `load_from` is None, it will resume the latest checkpoint in the `work_dir`.
+```
+
+## Config file inheritance
+
+There are 4 basic component types under `configs/_base_`, dataset, model, schedule, default_runtime.
+Many methods could be easily constructed with one of these models like SECOND, PointPillars, PartA2, VoteNet.
+The configs that are composed of components from `_base_` are called _primitive_.
+
+For all configs under the same folder, it is recommended to have only **one** _primitive_ config. All other configs should inherit from the _primitive_ config. In this way, the maximum of inheritance level is 3.
+
+For easy understanding, we recommend contributors to inherit from existing methods.
+For example, if some modification is made based on PointPillars, users may first inherit the basic PointPillars structure by specifying `_base_ = '../pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py'`, then modify the necessary fields in the config files.
+
+If you are building an entirely new method that does not share the structure with any of the existing methods, you may create a folder `xxx_rcnn` under `configs`.
+
+Please refer to [MMEngine config tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html) for detailed documentation.
+
+By setting the `_base_` field, we can set which files the current configuration file inherits from.
+
+When `_base_` is a string of a file path, it means inheriting the contents from one config file.
+
+```python
+_base_ = './pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py'
+```
+
+When `_base_` is a list of multiple file paths, it means inheriting from multiple files.
+
+```python
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_kitti.py',
+    '../_base_/datasets/kitti-3d-3class.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
+```
+
+If you wish to inspect the config file, you may run `python tools/misc/print_config.py /PATH/TO/CONFIG` to see the complete config.
+
+### Ignore some fields in the base configs
+
+Sometimes, you may set `_delete_=True` to ignore some of the fields in base configs.
+You may refer to [MMEngine config tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html) for a simple illustration.
+
+In MMDetection3D, for example, to change the neck of PointPillars with the following config:
+
+```python
+model = dict(
+    type='MVXFasterRCNN',
+    data_preprocessor=dict(voxel_layer=dict(...)),
+    pts_voxel_encoder=dict(...),
+    pts_middle_encoder=dict(...),
+    pts_backbone=dict(...),
+    pts_neck=dict(
+        type='FPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        act_cfg=dict(type='ReLU'),
+        in_channels=[64, 128, 256],
+        out_channels=256,
+        start_level=0,
+        num_outs=3),
+    pts_bbox_head=dict(...))
+```
+
+`FPN` and `SECONDFPN` use different keywords to construct:
+
+```python
+_base_ = '../_base_/models/pointpillars_hv_fpn_nus.py'
+model = dict(
+    pts_neck=dict(
+        _delete_=True,
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(...))
+```
+
+The `_delete_=True` would replace all old keys in `pts_neck` field with new keys.
+
+### Use intermediate variables in configs
+
+Some intermediate variables are used in the configs files, like `train_pipeline`/`test_pipeline` in datasets.
+It's worth noting that when modifying intermediate variables in the children configs, user needs to pass the intermediate variables into corresponding fields again.
+For example, we would like to use a multi-scale strategy to train and test a PointPillars, `train_pipeline`/`test_pipeline` are intermediate variables we would like to modify.
+
+```python
+_base_ = './nus-3d.py'
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=[0.95, 1.0, 1.05],
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+```
+
+We first define the new `train_pipeline`/`test_pipeline` and pass them into dataloader fields.
+
+### Reuse variables in \_base\_ file
+
+If the users want to reuse the variables in the base file, they can get a copy of the corresponding variable by using `{{_base_.xxx}}`. E.g:
+
+```python
+_base_ = './pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py'
+
+a = {{_base_.model}}  # variable `a` is equal to the `model` defined in `_base_`
+```
+
+## Modify config through script arguments
+
+When submitting jobs using `tools/train.py` or `tools/test.py`, you may specify `--cfg-options` to in-place modify the config.
+
+- Update config keys of dict chains
+
+  The config options can be specified following the order of the dict keys in the original config.
+  For example, `--cfg-options model.backbone.norm_eval=False` changes the all BN modules in model backbones to `train` mode.
+
+- Update keys inside a list of configs
+
+  Some config dicts are composed as a list in your config. For example, the training pipeline `train_dataloader.dataset.pipeline` is normally a list
+  e.g. `[dict(type='LoadPointsFromFile'), ...]`. If you want to change `'LoadPointsFromFile'` to `'LoadPointsFromDict'` in the pipeline,
+  you may specify `--cfg-options data.train.pipeline.0.type=LoadPointsFromDict`.
+
+- Update values of list/tuple
+
+  If the value to be updated is a list or a tuple. For example, the config file normally sets `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`. If you want to
+  change the mean values, you may specify `--cfg-options model.data_preprocessor.mean="[127,127,127]"`. Note that the quotation mark `"` is necessary to
+  support list/tuple data types, and that **NO** white space is allowed inside the quotation marks in the specified value.
+
+## Config Name Style
+
+We follow the below style to name config files. Contributors are advised to follow the same style.
+
+```
+{algorithm name}_{model component names [component1]_[component2]_[...]}_{training settings}_{training dataset information}_{testing dataset information}.py
+```
+
+The file name is divided to five parts. All parts and components are connected with `_` and words of each part or component should be connected with `-`.
+
+- `{algorithm name}`: The name of the algorithm. It can be a detector name such as `pointpillars`, `fcos3d`, etc.
+- `{model component names}`: Names of the components used in the algorithm such as voxel_encoder, backbone, neck, etc. For example, `second_secfpn_head-dcn-circlenms` means using SECOND's SparseEncoder, SECONDFPN and a detection head with DCN and circle NMS.
+- `{training settings}`: Information of training settings such as batch size, augmentations, loss trick, scheduler, and epochs/iterations. For example: `8xb4-tta-cyclic-20e` means using 8-gpus x 4-samples-per-gpu, test time augmentation, cyclic annealing learning rate, and train 20 epochs.
+  Some abbreviations:
+  - `{gpu x batch_per_gpu}`: GPUs and samples per GPU. `bN` indicates N batch size per GPU. E.g. `4xb4` is the short term of 4-GPUs x 4-samples-per-GPU.
+  - `{schedule}`: training schedule, options are `schedule-2x`, `schedule-3x`, `cyclic-20e`, etc.
+    `schedule-2x` and `schedule-3x` mean 24 epochs and 36 epochs respectively.
+    `cyclic-20e` means 20 epochs respectively.
+- `{training dataset information}`: Training dataset names like `kitti-3d-3class`, `nus-3d`, `s3dis-seg`, `scannet-seg`, `waymoD5-3d-car`. Here `3d` means dataset used for 3D object detection, and `seg` means dataset used for point cloud segmentation.
+- `{testing dataset information}` (optional): Testing dataset name for models trained on one dataset but tested on another. If not mentioned, it means the model was trained and tested on the same dataset type.
diff --git a/mmde/docs/en/user_guides/coord_sys_tutorial.md b/mmde/docs/en/user_guides/coord_sys_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..7c104a17ac66b276f9ccc83b0c666b296ea05901
--- /dev/null
+++ b/mmde/docs/en/user_guides/coord_sys_tutorial.md
@@ -0,0 +1,245 @@
+# Coordinate System
+
+## Overview
+
+MMDetection3D uses three different coordinate systems. The existence of different coordinate systems in the society of 3D object detection is necessary, because for various 3D data collection devices, such as LiDAR, depth camera, etc., the coordinate systems are not consistent, and different 3D datasets also follow different data formats. Early works, such as SECOND, VoteNet, convert the raw data to another format, forming conventions that some later works also follow, making the conversion between coordinate systems even more complicated.
+
+Despite the variety of datasets and equipment, by summarizing the line of works on 3D object detection we can roughly categorize coordinate systems into three:
+
+- Camera coordinate system -- the coordinate system of most cameras, in which the positive direction of the y-axis points to the ground, the positive direction of the x-axis points to the right, and the positive direction of the z-axis points to the front.
+
+  ```
+             up  z front
+              |    ^
+              |   /
+              |  /
+              | /
+              |/
+  left ------ 0 ------> x right
+              |
+              |
+              |
+              |
+              v
+            y down
+  ```
+
+- LiDAR coordinate system -- the coordinate system of many LiDARs, in which the negative direction of the z-axis points to the ground, the positive direction of the x-axis points to the front, and the positive direction of the y-axis points to the left.
+
+  ```
+               z up  x front
+                 ^    ^
+                 |   /
+                 |  /
+                 | /
+                 |/
+  y left <------ 0 ------ right
+  ```
+
+- Depth coordinate system -- the coordinate system used by VoteNet, H3DNet, etc., in which the negative direction of the z-axis points to the ground, the positive direction of the x-axis points to the right, and the positive direction of the y-axis points to the front.
+
+  ```
+            z up  y front
+              ^    ^
+              |   /
+              |  /
+              | /
+              |/
+  left ------ 0 ------> x right
+  ```
+
+The definition of coordinate systems in this tutorial is actually **more than just defining the three axes**. For a box in the form of $(x, y, z, dx, dy, dz, r)$, our coordinate systems also define how to interpret the box dimensions $(dx, dy, dz)$ and the yaw angle $r$.
+
+The illustration of the three coordinate systems is shown below:
+
+![](https://raw.githubusercontent.com/open-mmlab/mmdetection3d/master/resources/coord_sys_all.png)
+
+The three figures above are the 3D coordinate systems while the three figures below are the bird's eye view.
+
+We will stick to the three coordinate systems defined in this tutorial in the future.
+
+## Definition of the yaw angle
+
+Please refer to [wikipedia](https://en.wikipedia.org/wiki/Euler_angles#Tait%E2%80%93Bryan_angles) for the standard definition of the yaw angle. In object detection, we choose an axis as the gravity axis, and a reference direction on the plane $\\Pi$ perpendicular to the gravity axis, then the reference direction has a yaw angle of 0, and other directions on $\\Pi$ have non-zero yaw angles depending on its angle with the reference direction.
+
+Currently, for all supported datasets, annotations do not include pitch angle and roll angle, which means we need only consider the yaw angle when predicting boxes and calculating overlap between boxes.
+
+In MMDetection3D, all three coordinate systems are right-handed coordinate systems, which means the ascending direction of the yaw angle is counter-clockwise if viewed from the negative direction of the gravity axis (the axis is pointing at one's eyes).
+
+The figure below shows that, in this right-handed coordinate system, if we set the positive direction of the x-axis as a reference direction, then the positive direction of the y-axis has a yaw angle of $\\frac{\\pi}{2}$.
+
+```
+                     z up  y front (yaw=0.5*pi)
+                      ^    ^
+                      |   /
+                      |  /
+                      | /
+                      |/
+left (yaw=pi)  ------ 0 ------> x right (yaw=0)
+```
+
+For a box, the value of its yaw angle equals its direction minus a reference direction. In all three coordinate systems in MMDetection3D, the reference direction is always the positive direction of the x-axis, while the direction of a box is defined to be parallel with the x-axis if its yaw angle is 0. The definition of the yaw angle of a box is illustrated in the figure below.
+
+```
+y front
+  ^      box direction (yaw=0.5*pi)
+ /|\        ^
+  |        /|\
+  |     ____|____
+  |    |    |    |
+  |    |    |    |
+__|____|____|____|______\ x right
+  |    |    |    |      /
+  |    |    |    |
+  |    |____|____|
+  |
+```
+
+## Definition of the box dimensions
+
+The definition of the box dimensions cannot be disentangled with the definition of the yaw angle. In the previous section, we said that the direction of a box is defined to be parallel with the x-axis if its yaw angle is 0. Then naturally, the dimension of a box which corresponds to the x-axis should be $dx$. However, this is not always the case in some datasets (we will address that later).
+
+The following figures show the meaning of the correspondence between the x-axis and $dx$, and between the y-axis and $dy$.
+
+```
+y front
+  ^      box direction (yaw=0.5*pi)
+ /|\        ^
+  |        /|\
+  |     ____|____
+  |    |    |    |
+  |    |    |    | dx
+__|____|____|____|______\ x right
+  |    |    |    |      /
+  |    |    |    |
+  |    |____|____|
+  |         dy
+```
+
+Note that the box direction is always parallel with the edge $dx$.
+
+```
+y front
+  ^     _________
+ /|\   |    |    |
+  |    |    |    |
+  |    |    |    | dy
+  |    |____|____|____\  box direction (yaw=0)
+  |    |    |    |    /
+__|____|____|____|_________\ x right
+  |    |    |    |         /
+  |    |____|____|
+  |         dx
+  |
+```
+
+## Relation with raw coordinate systems of supported datasets
+
+### KITTI
+
+The raw annotation of KITTI is under camera coordinate system, see [get_label_anno](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/dataset_converters/kitti_data_utils.py). In MMDetection3D, to train LiDAR-based models on KITTI, the data is first converted from camera coordinate system to LiDAR coordinate system, see [get_ann_info](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/kitti_dataset.py). For training vision-based models, the data is kept in the camera coordinate system.
+
+In SECOND, the LiDAR coordinate system for a box is defined as follows (a bird's eye view):
+
+![](https://raw.githubusercontent.com/traveller59/second.pytorch/master/images/kittibox.png)
+
+For each box, the dimensions are $(w, l, h)$, and the reference direction for the yaw angle is the positive direction of the y axis. For more details, refer to the [repo](https://github.com/traveller59/second.pytorch#concepts).
+
+Our LiDAR coordinate system has two changes:
+
+- The yaw angle is defined to be right-handed instead of left-handed for consistency;
+- The box dimensions are $(l, w, h)$ instead of $(w, l, h)$, since $w$ corresponds to $dy$ and $l$ corresponds to $dx$ in KITTI.
+
+### Waymo
+
+We use the KITTI-format data of Waymo dataset. Therefore, KITTI and Waymo also share the same coordinate system in our implementation.
+
+### NuScenes
+
+NuScenes provides a toolkit for evaluation, in which each box is wrapped into a `Box` instance. The coordinate system of `Box` is different from our LiDAR coordinate system in that the first two elements of the box dimension correspond to $(dy, dx)$, or $(w, l)$, respectively, instead of the reverse. For more details, please refer to the NuScenes [tutorial](https://github.com/open-mmlab/mmdetection3d/blob/master/docs/en/datasets/nuscenes_det.md#notes).
+
+Readers may refer to the [NuScenes development kit](https://github.com/nutonomy/nuscenes-devkit/tree/master/python-sdk/nuscenes/eval/detection) for the definition of a [NuScenes box](https://github.com/nutonomy/nuscenes-devkit/blob/2c6a752319f23910d5f55cc995abc547a9e54142/python-sdk/nuscenes/utils/data_classes.py#L457) and implementation of [NuScenes evaluation](https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/eval/detection/evaluate.py).
+
+### Lyft
+
+Lyft shares the same data format with NuScenes as far as coordinate system is involved.
+
+Please refer to the [official website](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data) for more information.
+
+### ScanNet
+
+The raw data of ScanNet is not point cloud but mesh. The sampled point cloud data is under our depth coordinate system. For ScanNet detection task, the box annotations are axis-aligned, and the yaw angle is always zero. Therefore the direction of the yaw angle in our depth coordinate system makes no difference regarding ScanNet.
+
+### SUN RGB-D
+
+The raw data of SUN RGB-D is not point cloud but RGB-D image. By back projection, we obtain the corresponding point cloud for each image, which is under our Depth coordinate system. However, the annotation is not under our system and thus needs conversion.
+
+For the conversion from raw annotation to annotation under our Depth coordinate system, please refer to [sunrgbd_data_utils.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/dataset_converters/sunrgbd_data_utils.py).
+
+### S3DIS
+
+S3DIS shares the same coordinate system as ScanNet in our implementation. However, S3DIS is a segmentation-task-only dataset, and thus no annotation is coordinate system sensitive.
+
+## Examples
+
+### Box conversion (between different coordinate systems)
+
+Take the conversion between our Camera coordinate system and LiDAR coordinate system as an example:
+
+First, for points and box centers, the coordinates before and after the conversion satisfy the following relationship:
+
+- $x\_{LiDAR}=z\_{camera}$
+- $y\_{LiDAR}=-x\_{camera}$
+- $z\_{LiDAR}=-y\_{camera}$
+
+Then, the box dimensions before and after the conversion satisfy the following relationship:
+
+- $dx\_{LiDAR}=dx\_{camera}$
+- $dy\_{LiDAR}=dz\_{camera}$
+- $dz\_{LiDAR}=dy\_{camera}$
+
+Finally, the yaw angle should also be converted:
+
+- $r\_{LiDAR}=-\\frac{\\pi}{2}-r\_{camera}$
+
+See the code [here](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/box_3d_mode.py) for more details.
+
+### Bird's Eye View
+
+The BEV of a camera coordinate system box is $(x, z, dx, dz, -r)$ if the 3D box is $(x, y, z, dx, dy, dz, r)$. The inversion of the sign of the yaw angle is because the positive direction of the gravity axis of the Camera coordinate system points to the ground.
+
+See the code [here](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/cam_box3d.py) for more details.
+
+### Rotation of boxes
+
+We set the rotation of all kinds of boxes to be counter-clockwise about the gravity axis. Therefore, to rotate a 3D box we first calculate the new box center, and then we add the rotation angle to the yaw angle.
+
+See the code [here](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/cam_box3d.py) for more details.
+
+## Common FAQ
+
+#### Q1: Are the box related ops universal to all coordinate system types?
+
+No. For example, [RoI-Aware Pooling ops](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/roiaware_pool3d.py) is applicable to boxes under Depth or LiDAR coordinate system only. The evaluation functions for KITTI dataset [here](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/evaluation/kitti_utils) are only applicable to boxes under Camera coordinate system since the rotation is clockwise if viewed from above.
+
+For each box related op, we have marked the type of boxes to which we can apply the op.
+
+#### Q2: In every coordinate system, do the three axes point exactly to the right, the front, and the ground, respectively?
+
+No. For example, in KITTI, we need a calibration matrix when converting from Camera coordinate system to LiDAR coordinate system.
+
+#### Q3: How does a phase difference of $2\\pi$ in the yaw angle of a box affect evaluation?
+
+For IoU calculation, a phase difference of $2\\pi$ in the yaw angle will result in the same box, thus not affecting evaluation.
+
+For angle prediction evaluation such as the NDS metric in NuScenes and the AOS metric in KITTI, the angle of predicted boxes will be first standardized, so the phase difference of $2\\pi$ will not change the result.
+
+#### Q4: How does a phase difference of $\\pi$ in the yaw angle of a box affect evaluation?
+
+For IoU calculation, a phase difference of $\\pi$ in the yaw angle will result in the same box, thus not affecting evaluation.
+
+However, for angle prediction evaluation, this will result in the exact opposite direction.
+
+Just think about a car. The yaw angle is the angle between the direction of the car front and the positive direction of the x-axis. If we add $\\pi$ to this angle, the car front will become the car rear.
+
+For categories such as barrier, the front and the rear have no difference, therefore a phase difference of $\\pi$ will not affect the angle prediction score.
diff --git a/mmde/docs/en/user_guides/data_pipeline.md b/mmde/docs/en/user_guides/data_pipeline.md
new file mode 100644
index 0000000000000000000000000000000000000000..01f5c61b99a75eff2cb1376a668dcb0e8891b98d
--- /dev/null
+++ b/mmde/docs/en/user_guides/data_pipeline.md
@@ -0,0 +1,199 @@
+# Customize Data Pipelines
+
+## Design of Data pipelines
+
+Following typical conventions, we use `Dataset` and `DataLoader` for data loading
+with multiple workers. `Dataset` returns a dict of data items corresponding
+the arguments of models' forward method.
+Since the data in object detection may not be the same size (point number, gt bbox size, etc.),
+we introduce a new `DataContainer` type in MMCV to help collect and distribute
+data of different size.
+See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details.
+
+The data preparation pipeline and the dataset is decomposed. Usually a dataset
+defines how to process the annotations and a data pipeline defines all the steps to prepare a data dict.
+A pipeline consists of a sequence of operations. Each operation takes a dict as input and also output a dict for the next transform.
+
+We present a classical pipeline in the following figure. The blue blocks are pipeline operations. With the pipeline going on, each operator can add new keys (marked as green) to the result dict or update the existing keys (marked as orange).
+
+![](../../../resources/data_pipeline.png)
+
+The operations are categorized into data loading, pre-processing, formatting and test-time augmentation.
+
+Here is an pipeline example for PointPillars.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1.0,
+        flip=False,
+        pcd_horizontal_flip=False,
+        pcd_vertical_flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+```
+
+For each operation, we list the related dict fields that are added/updated/removed.
+
+### Data loading
+
+`LoadPointsFromFile`
+
+- add: points
+
+`LoadPointsFromMultiSweeps`
+
+- update: points
+
+`LoadAnnotations3D`
+
+- add: gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, pts_instance_mask, pts_semantic_mask, bbox3d_fields, pts_mask_fields, pts_seg_fields
+
+### Pre-processing
+
+`GlobalRotScaleTrans`
+
+- add: pcd_trans, pcd_rotation, pcd_scale_factor
+- update: points, \*bbox3d_fields
+
+`RandomFlip3D`
+
+- add: flip, pcd_horizontal_flip, pcd_vertical_flip
+- update: points, \*bbox3d_fields
+
+`PointsRangeFilter`
+
+- update: points
+
+`ObjectRangeFilter`
+
+- update: gt_bboxes_3d, gt_labels_3d
+
+`ObjectNameFilter`
+
+- update: gt_bboxes_3d, gt_labels_3d
+
+`PointShuffle`
+
+- update: points
+
+`PointsRangeFilter`
+
+- update: points
+
+### Formatting
+
+`DefaultFormatBundle3D`
+
+- update: points, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels
+
+`Collect3D`
+
+- add: img_meta (the keys of img_meta is specified by `meta_keys`)
+- remove: all other keys except for those specified by `keys`
+
+### Test time augmentation
+
+`MultiScaleFlipAug`
+
+- update: scale, pcd_scale_factor, flip, flip_direction, pcd_horizontal_flip, pcd_vertical_flip with list of augmented data with these specific parameters
+
+## Extend and use custom pipelines
+
+1. Write a new pipeline in any file, e.g., `my_pipeline.py`. It takes a dict as input and return a dict.
+
+   ```python
+   from mmdet.datasets import PIPELINES
+
+   @PIPELINES.register_module()
+   class MyTransform:
+
+       def __call__(self, results):
+           results['dummy'] = True
+           return results
+   ```
+
+2. Import the new class.
+
+   ```python
+   from .my_pipeline import MyTransform
+   ```
+
+3. Use it in config files.
+
+   ```python
+   train_pipeline = [
+       dict(
+           type='LoadPointsFromFile',
+           load_dim=5,
+           use_dim=5,
+           backend_args=backend_args),
+       dict(
+           type='LoadPointsFromMultiSweeps',
+           sweeps_num=10,
+           backend_args=backend_args),
+       dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+       dict(
+           type='GlobalRotScaleTrans',
+           rot_range=[-0.3925, 0.3925],
+           scale_ratio_range=[0.95, 1.05],
+           translation_std=[0, 0, 0]),
+       dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+       dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+       dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+       dict(type='ObjectNameFilter', classes=class_names),
+       dict(type='MyTransform'),
+       dict(type='PointShuffle'),
+       dict(type='DefaultFormatBundle3D', class_names=class_names),
+       dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+   ]
+   ```
diff --git a/mmde/docs/en/user_guides/dataset_prepare.md b/mmde/docs/en/user_guides/dataset_prepare.md
new file mode 100644
index 0000000000000000000000000000000000000000..17f368a9d0ce133b802d756d929703c7e500f985
--- /dev/null
+++ b/mmde/docs/en/user_guides/dataset_prepare.md
@@ -0,0 +1,269 @@
+# Dataset Preparation
+
+## Before Preparation
+
+It is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+If your folder structure is different from the following, you may need to change the corresponding paths in config files.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── v1.0-test
+|   |   ├── v1.0-trainval
+│   ├── kitti
+│   │   ├── ImageSets
+│   │   ├── testing
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── velodyne
+│   │   ├── training
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── label_2
+│   │   │   ├── velodyne
+│   ├── waymo
+│   │   ├── waymo_format
+│   │   │   ├── training
+│   │   │   ├── validation
+│   │   │   ├── testing
+│   │   │   ├── gt.bin
+│   │   ├── kitti_format
+│   │   │   ├── ImageSets
+│   ├── lyft
+│   │   ├── v1.01-train
+│   │   │   ├── v1.01-train (train_data)
+│   │   │   ├── lidar (train_lidar)
+│   │   │   ├── images (train_images)
+│   │   │   ├── maps (train_maps)
+│   │   ├── v1.01-test
+│   │   │   ├── v1.01-test (test_data)
+│   │   │   ├── lidar (test_lidar)
+│   │   │   ├── images (test_images)
+│   │   │   ├── maps (test_maps)
+│   │   ├── train.txt
+│   │   ├── val.txt
+│   │   ├── test.txt
+│   │   ├── sample_submission.csv
+│   ├── s3dis
+│   │   ├── meta_data
+│   │   ├── Stanford3dDataset_v1.2_Aligned_Version
+│   │   ├── collect_indoor3d_data.py
+│   │   ├── indoor3d_util.py
+│   │   ├── README.md
+│   ├── scannet
+│   │   ├── meta_data
+│   │   ├── scans
+│   │   ├── scans_test
+│   │   ├── batch_load_scannet_data.py
+│   │   ├── load_scannet_data.py
+│   │   ├── scannet_utils.py
+│   │   ├── README.md
+│   ├── sunrgbd
+│   │   ├── OFFICIAL_SUNRGBD
+│   │   ├── matlab
+│   │   ├── sunrgbd_data.py
+│   │   ├── sunrgbd_utils.py
+│   │   ├── README.md
+│   ├── semantickitti
+│   │   ├── sequences
+│   │   │   ├── 00
+│   │   │   │   ├── labels
+│   │   │   │   ├── velodyne
+│   │   │   ├── 01
+│   │   │   ├── ..
+│   │   │   ├── 22
+
+```
+
+## Download and Data Preparation
+
+### KITTI
+
+1. Download KITTI 3D detection data [HERE](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). Alternatively, you
+   can download the dataset from [OpenDataLab](https://opendatalab.com/) using MIM. The command scripts are the following:
+
+```bash
+# install OpenDataLab CLI tools
+pip install -U opendatalab
+# log in OpenDataLab. Note that you should register an account on [OpenDataLab](https://opendatalab.com/) before.
+pip install odl
+odl login
+# download and preprocess by MIM
+mim download mmdet3d --dataset kitti
+```
+
+2. Prepare KITTI data splits by running:
+
+```bash
+mkdir ./data/kitti/ && mkdir ./data/kitti/ImageSets
+
+# Download data split
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/test.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/test.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/train.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/train.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/val.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/val.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/trainval.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/trainval.txt
+```
+
+3. Generate info files by running:
+
+```bash
+python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti
+```
+
+In an environment using slurm, users may run the following command instead:
+
+```bash
+sh tools/create_data.sh <partition> kitti
+```
+
+**Tips**:
+
+- **Ready-made Annotations**. We have also provided kitti data annotation files generated offline [here](#summary-of-annotation-files). You could download them and place them under `data/kitti/`. However, if you want to use `ObjectSample` Augmentation in LiDAR-based detection methods, you should additionally generate groundtruth database files and annotations.
+
+  ```bash
+  python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti --only-gt-database
+  ```
+
+### Waymo
+
+Download Waymo open dataset V1.4.1 [HERE](https://waymo.com/open/download/) and its data split [HERE](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing). Then put `.tfrecord` files into corresponding folders in `data/waymo/waymo_format/` and put the data split `.txt` files into `data/waymo/kitti_format/ImageSets`. Download ground truth `.bin` file for validation set [HERE](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects) and put it into `data/waymo/waymo_format/`. A tip is that you can use `gsutil` to download the large-scale dataset with commands. You can take this [tool](https://github.com/RalphMao/Waymo-Dataset-Tool) as an example for more details. Subsequently, prepare waymo data by running:
+
+```bash
+# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow.
+# The number of `--workers` depends on the maximum number of cores in your CPU.
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4
+```
+
+Note that:
+
+- In case the preprocessing of Waymo dataset is slow or blocked, consider reducing the value of `--workers`. If this doesn't resolve the issue, you could set `--workers` as 0 to avoid using multiprocess.
+
+- If your local disk does not have enough space for saving converted data, you can change the `--out-dir` to anywhere else. Just remember to create folders and prepare data there in advance and link them back to `data/waymo/kitti_format` after the data conversion.
+
+**Tips**:
+
+- **Ready-made Annotations**. We have provided the annotation files generated offline [here](#summary-of-annotation-files). However, the original Waymo data still needs to be converted to `kitti-format` data by yourself.
+
+- **Waymo-mini**. If you just want to use a part of Waymo Dataset to verify some methods or debug quickly, you could use our provided [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) which only contains two segments in train split and one segment in val split from the original dataset. All the images, point clouds and annotations in this compressed file have been processed offline so that you can directly download and unzip it to `data/waymo/`:
+
+  ```bash
+  tar -xzvf waymo_mini.tar.gz -C ./data/waymo_mini
+  ```
+
+### NuScenes
+
+1. Download nuScenes V1.0 full dataset data [HERE](https://www.nuscenes.org/download). Alternatively, you
+   can download the dataset from [OpenDataLab](https://opendatalab.com/) using MIM. The downloading and unzipping command scripts are the following:
+
+```bash
+# install OpenDataLab CLI tools
+pip install -U opendatalab
+# log in OpenDataLab. Note that you should register an account on [OpenDataLab](https://opendatalab.com/) before.
+pip install odl
+odl login
+# download and preprocess by MIM
+mim download mmdet3d --dataset nuscenes
+```
+
+2. Prepare nuscenes data by running:
+
+```bash
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
+```
+
+**Tips**:
+
+- **Ready-made Annotations**. We have also provided NuScenes data annotation files generated offline [here](#summary-of-annotation-files). You could download them and place them under `data/nuscenes/`. However, if you want to use `ObjectSample` Augmentation in LiDAR-based detection methods, you should additionally generate groundtruth database files and annotations.
+
+```bash
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes --only-gt-database
+```
+
+### Lyft
+
+Download Lyft 3D detection data [HERE](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data). Prepare Lyft data by running:
+
+```bash
+python tools/create_data.py lyft --root-path ./data/lyft --out-dir ./data/lyft --extra-tag lyft --version v1.01
+python tools/dataset_converters/lyft_data_fixer.py --version v1.01 --root-folder ./data/lyft
+```
+
+Note that we follow the original folder names for clear organization. Please rename the raw folders as shown above. Also note that the second command serves the purpose of fixing a corrupted lidar data file. Please refer to the [discussion](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000) for more details.
+
+### SemanticKITTI
+
+1. Download SemanticKITTI dataset [HERE](http://semantic-kitti.org/dataset.html#download) and unzip all zip files. Alternatively, you
+   can download the dataset from [OpenDataLab](https://opendatalab.com/) using MIM. The downloading and unzipping command scripts are the following:
+
+```bash
+# install OpenDataLab CLI tools
+pip install -U opendatalab
+# log in OpenDataLab. Note that you should register an account on [OpenDataLab](https://opendatalab.com/) before.
+pip install odl
+odl login
+# download and preprocess by MIM
+mim download mmdet3d --dataset semantickitti
+```
+
+2. Generate info files by running:
+
+```bash
+python ./tools/create_data.py semantickitti --root-path ./data/semantickitti --out-dir ./data/semantickitti --extra-tag semantickitti
+```
+
+**Tips**:
+
+- **Ready-made Annotations**. We have also provided SemanticKITTI data annotation files generated offline [here](#summary-of-annotation-files). You could download them and place them under `data/semantickitti/`.
+
+### S3DIS, ScanNet and SUN RGB-D
+
+To prepare S3DIS data, please see its [README](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/data/s3dis/README.md).
+
+To prepare ScanNet data, please see its [README](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/data/scannet/README.md).
+
+To prepare SUN RGB-D data, please see its [README](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/data/sunrgbd/README.md).
+
+**Tips**: For S3DIS, ScanNet and SUN RGB-D datasets, we have also provided data annotation files generated offline [here](#summary-of-annotation-files). You could download them and place them under `data/${DATASET}/`. However, you also need to generate point cloud files and semantic/instance masks files (if it has) by yourself.
+
+### Customized Datasets
+
+For using custom datasets, please refer to [Customize Datasets](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/en/advanced_guides/customize_dataset.md).
+
+### Update data infos
+
+If you have used v1.0.0rc1-v1.0.0rc4 mmdetection3d to create data infos before, and now you want to use the newest v1.1.0 mmdetection3d, you need to update the data infos file.
+
+```bash
+python tools/dataset_converters/update_infos_to_v2.py --dataset ${DATA_SET} --pkl-path ${PKL_PATH} --out-dir ${OUT_DIR}
+```
+
+- `--dataset` : Name of dataset.
+- `--pkl-path` : Specify the data infos pkl file path.
+- `--out-dir` : Output direction of the data infos pkl file.
+
+Example:
+
+```bash
+python tools/dataset_converters/update_infos_to_v2.py --dataset kitti --pkl-path ./data/kitti/kitti_infos_trainval.pkl --out-dir ./data/kitti
+```
+
+### Summary of annotation files
+
+We provide ready-made annotation files we generated offline for reference. You can directly use these files for convenice.
+
+|                                                  Dataset                                                  |                                                                                                           Train annotation file                                                                                                           |                                                                                                        Val annotation file                                                                                                         |                                                                                                              Test information file                                                                                                              |
+| :-------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|                                                   KITTI                                                   |                                                                  [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl)                                                                   |                                                                 [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl)                                                                 |                                                                        [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl)                                                                         |
+|                                                 NuScenes                                                  | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl)  [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) |                                                                                                                                                                                                                                                 |
+|                                                   Waymo                                                   |                                                         [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_train.pkl)                                                          |                                                        [waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_val.pkl)                                                        | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl)   [waymo_infos_test_cam_only.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_test_cam_only.pkl) |
+| [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) |                                                                                                                                                                                                                                           |                                                                                                                                                                                                                                    |                                                                                                                                                                                                                                                 |
+|                                                 SUN RGB-D                                                 |                                                               [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl)                                                                |                                                              [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl)                                                              |                                                                                                                                                                                                                                                 |
+|                                                  ScanNet                                                  |                                                               [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl)                                                                |                                                              [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl)                                                              |                                                                   [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl)                                                                    |
+|                                               SemanticKitti                                               |                                                      [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl)                                                       |                                                     [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl)                                                     |                                                          [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl)                                                           |
diff --git a/mmde/docs/en/user_guides/index.rst b/mmde/docs/en/user_guides/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a65227e0cae36fe66073ddfc5880f43705893a37
--- /dev/null
+++ b/mmde/docs/en/user_guides/index.rst
@@ -0,0 +1,22 @@
+Train & Test
+**************
+.. toctree::
+   :maxdepth: 1
+
+   config.md
+   coord_sys_tutorial.md
+   dataset_prepare.md
+   data_pipeline.md
+   train_test.md
+   inference.md
+   new_data_model.md
+
+Useful Tools
+************
+.. toctree::
+   :maxdepth: 1
+
+   useful_tools.md
+   visualization.md
+   backends_support.md
+   model_deployment.md
diff --git a/mmde/docs/en/user_guides/inference.md b/mmde/docs/en/user_guides/inference.md
new file mode 100644
index 0000000000000000000000000000000000000000..68570ef84db6cfaf5cf5840e329fb32ef52cacb6
--- /dev/null
+++ b/mmde/docs/en/user_guides/inference.md
@@ -0,0 +1,101 @@
+# Inference
+
+## Introduction
+
+We provide scripts for multi-modality/single-modality (LiDAR-based/vision-based), indoor/outdoor 3D detection and 3D semantic segmentation demos. The pre-trained models can be downloaded from [model zoo](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/en/model_zoo.md). We provide pre-processed sample data from KITTI, SUN RGB-D, nuScenes and ScanNet dataset. You can use any other data following our pre-processing steps.
+
+## Testing
+
+### 3D Detection
+
+#### Point cloud demo
+
+To test a 3D detector on point cloud data, simply run:
+
+```shell
+python demo/pcd_demo.py ${PCD_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--score-thr ${SCORE_THR}] [--out-dir ${OUT_DIR}] [--show]
+```
+
+The visualization results including a point cloud and predicted 3D bounding boxes will be saved in `${OUT_DIR}/PCD_NAME`, which you can open using [MeshLab](http://www.meshlab.net/). Note that if you set the flag `--show`, the prediction result will be displayed online using [Open3D](http://www.open3d.org/).
+
+Example on KITTI data using [PointPillars model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth):
+
+```shell
+python demo/pcd_demo.py demo/data/kitti/000008.bin configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py ${CHECKPOINT_FILE} --show
+```
+
+Example on SUN RGB-D data using [VoteNet model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/votenet/votenet_16x8_sunrgbd-3d-10class/votenet_16x8_sunrgbd-3d-10class_20210820_162823-bf11f014.pth):
+
+```shell
+python demo/pcd_demo.py demo/data/sunrgbd/sunrgbd_000017.bin configs/votenet/votenet_8xb16_sunrgbd-3d.py ${CHECKPOINT_FILE} --show
+```
+
+#### Monocular 3D demo
+
+To test a monocular 3D detector on image data, simply run:
+
+```shell
+python demo/mono_det_demo.py ${IMAGE_FILE} ${ANNOTATION_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--cam-type ${CAM_TYPE}] [--score-thr ${SCORE-THR}] [--out-dir ${OUT_DIR}] [--show]
+```
+
+where the `ANNOTATION_FILE` should provide the 3D to 2D projection matrix (camera intrinsic matrix), and `CAM_TYPE` should be specified according to dataset. For example, if you want to inference on the front camera image, the `CAM_TYPE` should be set as `CAM_2` for KITTI, and `CAM_FRONT` for nuScenes. By specifying `CAM_TYPE`, you can even infer on any camera images for datasets with multi-view cameras, such as nuScenes and Waymo. `SCORE-THR` is the 3D bbox threshold while visualization. The visualization results including an image and its predicted 3D bounding boxes projected on the image will be saved in `${OUT_DIR}/IMG_NAME`.
+
+Example on KITTI data using [PGD model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_20211022_102608-8a97533b.pth):
+
+```shell
+python demo/mono_det_demo.py demo/data/kitti/000008.png demo/data/kitti/000008.pkl  configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py ${CHECKPOINT_FILE}  --show --cam-type CAM2 --score-thr 8
+```
+
+**Note**: For PGD, the prediction score is not among (0, 1).
+
+Example on nuScenes data using [FCOS3D model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune_20210717_095645-8d806dc2.pth):
+
+```shell
+python demo/mono_det_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__CAM_BACK__1532402927637525.jpg demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl  configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py ${CHECKPOINT_FILE}  --show --cam-type CAM_BACK
+```
+
+**Note** that when visualizing results of monocular 3D detection for flipped images, the camera intrinsic matrix should also be modified accordingly. See more details and examples in PR [#744](https://github.com/open-mmlab/mmdetection3d/pull/744).
+
+#### Multi-modality demo
+
+To test a 3D detector on multi-modality data (typically point cloud and image), simply run:
+
+```shell
+python demo/multi_modality_demo.py ${PCD_FILE} ${IMAGE_FILE} ${ANNOTATION_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--score-thr ${SCORE_THR}] [--out-dir ${OUT_DIR}] [--show]
+```
+
+where the `ANNOTATION_FILE` should provide the 3D to 2D projection matrix. The visualization results including a point cloud, an image, predicted 3D bounding boxes and their projection on the image will be saved in `${OUT_DIR}/PCD_NAME`.
+
+Example on KITTI data using [MVX-Net model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class-8963258a.pth):
+
+```shell
+python demo/multi_modality_demo.py demo/data/kitti/000008.bin demo/data/kitti/000008.png demo/data/kitti/000008.pkl configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py ${CHECKPOINT_FILE} --cam-type CAM2 --show
+```
+
+Example on SUN RGB-D data using [ImVoteNet model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851-1bcd1b97.pth):
+
+```shell
+python demo/multi_modality_demo.py demo/data/sunrgbd/000017.bin demo/data/sunrgbd/000017.jpg demo/data/sunrgbd/sunrgbd_000017_infos.pkl configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py ${CHECKPOINT_FILE} --cam-type CAM0 --show --score-thr 0.6
+```
+
+Example on NuScenes data using [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link):
+
+```shell
+python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
+```
+
+### 3D Segmentation
+
+To test a 3D segmentor on point cloud data, simply run:
+
+```shell
+python demo/pcd_seg_demo.py ${PCD_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--out-dir ${OUT_DIR}] [--show]
+```
+
+The visualization results including a point cloud and its predicted 3D segmentation mask will be saved in `${OUT_DIR}/PCD_NAME`.
+
+Example on ScanNet data using [PointNet++ (SSG) model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class_20210514_143644-ee73704a.pth):
+
+```shell
+python demo/pcd_seg_demo.py demo/data/scannet/scene0000_00.bin configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg.py ${CHECKPOINT_FILE} --show
+```
diff --git a/mmde/docs/en/user_guides/model_deployment.md b/mmde/docs/en/user_guides/model_deployment.md
new file mode 100644
index 0000000000000000000000000000000000000000..6014b3a29a264ae4a898d2541327103cfb4f4330
--- /dev/null
+++ b/mmde/docs/en/user_guides/model_deployment.md
@@ -0,0 +1,4 @@
+# Model Deployment
+
+MMDet3D 1.1 fully relies on [MMDeploy](https://mmdeploy.readthedocs.io/) to deploy models.
+Please stay tuned and this document will be update soon.
diff --git a/mmde/docs/en/user_guides/new_data_model.md b/mmde/docs/en/user_guides/new_data_model.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d224118e65fb072057a6d282439d1b1d0b69ca7
--- /dev/null
+++ b/mmde/docs/en/user_guides/new_data_model.md
@@ -0,0 +1,105 @@
+# Train with Customized Datasets
+
+In this note, you will know how to train and test predefined models with customized datasets. We use the Waymo dataset as an example to describe the whole process.
+
+The basic steps are as below:
+
+1. Prepare the customized dataset
+2. Prepare a config
+3. Train, test, inference models on the customized dataset.
+
+## Prepare the customized dataset
+
+There are three ways to support a new dataset in MMDetection3D:
+
+1. reorganize the dataset into existing format.
+2. reorganize the dataset into a standard format.
+3. implement a new dataset.
+
+Usually we recommend to use the first two methods which are usually easier than the third.
+
+In this note, we give an example for converting the data into KITTI format, you can refer to this to reorganize your dataset into kitti format. About the standard format dataset, and you can refer to [customize_dataset.md](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/en/advanced_guides/customize_dataset.md).
+
+**Note**: We take Waymo as the example here considering its format is totally different from other existing formats. For other datasets using similar methods to organize data, like Lyft compared to nuScenes, it would be easier to directly implement the new data converter (for the second approach above) instead of converting it to another format (for the first approach above).
+
+### KITTI dataset format
+
+Firstly, the raw data for 3D object detection from KITTI are typically organized as follows, where `ImageSets` contains split files indicating which files belong to training/validation/testing set, `calib` contains calibration information files, `image_2` and `velodyne` include image data and point cloud data, and `label_2` includes label files for 3D detection.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── kitti
+│   │   ├── ImageSets
+│   │   ├── testing
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── velodyne
+│   │   ├── training
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── label_2
+│   │   │   ├── velodyne
+```
+
+Specific annotation format is described in the official object development [kit](https://s3.eu-central-1.amazonaws.com/avg-kitti/devkit_object.zip). For example, it consists of the following labels:
+
+```
+#Values    Name      Description
+----------------------------------------------------------------------------
+   1    type         Describes the type of object: 'Car', 'Van', 'Truck',
+                     'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram',
+                     'Misc' or 'DontCare'
+   1    truncated    Float from 0 (non-truncated) to 1 (truncated), where
+                     truncated refers to the object leaving image boundaries
+   1    occluded     Integer (0,1,2,3) indicating occlusion state:
+                     0 = fully visible, 1 = partly occluded
+                     2 = largely occluded, 3 = unknown
+   1    alpha        Observation angle of object, ranging [-pi..pi]
+   4    bbox         2D bounding box of object in the image (0-based index):
+                     contains left, top, right, bottom pixel coordinates
+   3    dimensions   3D object dimensions: height, width, length (in meters)
+   3    location     3D object location x,y,z in camera coordinates (in meters)
+   1    rotation_y   Rotation ry around Y-axis in camera coordinates [-pi..pi]
+   1    score        Only for results: Float, indicating confidence in
+                     detection, needed for p/r curves, higher is better.
+```
+
+Assume we use the Waymo dataset.
+
+After downloading the data, we need to implement a function to convert both the input data and annotation format into the KITTI style. Then we can implement `WaymoDataset` inherited from `KittiDataset` to load the data and perform training, and implement `WaymoMetric` inherited from `KittiMetric` for evaluation.
+
+Specifically, we implement a waymo [converter](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/waymo_converter.py) to convert Waymo data into KITTI format and a waymo dataset [class](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/datasets/waymo_dataset.py) to process it, in addition need to add a waymo [metric](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/metrics/waymo_metric.py) to evaluate results. Because we preprocess the raw data and reorganize it like KITTI, the dataset class could be implemented more easily by inheriting from KittiDataset. Regarding the dataset evaluation metric, because Waymo has its own evaluation approach, we need further implement a new Waymo metric; more about the metric could refer to [metric_and_evaluator.md](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/metric_and_evaluator.md). Afterward, users can successfully convert the data format and use `WaymoDataset` to train and evaluate the model by `WaymoMetric`.
+
+For more details about the intermediate results of preprocessing of Waymo dataset, please refer to its [waymo_det.md](https://mmdetection3d.readthedocs.io/en/latest/datasets/waymo_det.html).
+
+## Prepare a config
+
+The second step is to prepare configs such that the dataset could be successfully loaded. In addition, adjusting hyperparameters is usually necessary to obtain decent performance in 3D detection.
+
+Suppose we would like to train PointPillars on Waymo to achieve 3D detection for 3 classes, vehicle, cyclist and pedestrian, we need to prepare dataset config like [this](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/waymoD5-3d-3class.py), model config like [this](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/models/pointpillars_hv_secfpn_waymo.py) and combine them like [this](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py), compared to KITTI [dataset config](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/kitti-3d-3class.py), [model config](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/models/pointpillars_hv_secfpn_kitti.py) and [overall](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py).
+
+## Train a new model
+
+To train a model with the new config, you can simply run
+
+```shell
+python tools/train.py configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py
+```
+
+For more detailed usages, please refer to the [Case 1](https://mmdetection3d.readthedocs.io/en/latest/1_exist_data_model.html).
+
+## Test and inference
+
+To test the trained model, you can simply run
+
+```shell
+python tools/test.py configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py work_dirs/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class/latest.pth
+```
+
+**Note**: To use Waymo evaluation protocol, you need to follow the [tutorial](https://mmdetection3d.readthedocs.io/en/latest/datasets/waymo_det.html) and prepare files related to metrics computation as official instructions.
+
+For more detailed usages for test and inference, please refer to the [Case 1](https://mmdetection3d.readthedocs.io/en/latest/1_exist_data_model.html).
diff --git a/mmde/docs/en/user_guides/train_test.md b/mmde/docs/en/user_guides/train_test.md
new file mode 100644
index 0000000000000000000000000000000000000000..e3a85d3aee2834246d13641593640b45e9a1732c
--- /dev/null
+++ b/mmde/docs/en/user_guides/train_test.md
@@ -0,0 +1,258 @@
+# Test and Train on Standard Datasets
+
+### Test existing models on standard datasets
+
+- single GPU
+- CPU
+- single node multiple GPU
+- multiple node
+
+You can use the following commands to test a dataset.
+
+```shell
+# single-gpu testing
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--cfg-options test_evaluator.pklfile_prefix=${RESULT_FILE}]  [--show] [--show-dir ${SHOW_DIR}]
+
+# CPU: disable GPUs and run single-gpu testing script (experimental)
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--cfg-options test_evaluator.pklfile_prefix=${RESULT_FILE}]  [--show] [--show-dir ${SHOW_DIR}]
+
+# multi-gpu testing
+./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--cfg-options test_evaluator.pklfile_prefix=${RESULT_FILE}]  [--show] [--show-dir ${SHOW_DIR}]
+```
+
+**Note**:
+
+For now, CPU testing is only supported for SMOKE.
+
+Optional arguments:
+
+- `--show`: If specified, detection results will be plotted in the silient mode. It is only applicable to single GPU testing and used for debugging and visualization. This should be used with `--show-dir`.
+- `--show-dir`: If specified, detection results will be plotted on the `***_points.obj` and `***_pred.obj` files in the specified directory. It is only applicable to single GPU testing and used for debugging and visualization. You do NOT need a GUI available in your environment for using this option.
+
+All evaluation related arguments are set in the `test_evaluator` in corresponding dataset configuration. such as
+`test_evaluator = dict(type='KittiMetric', ann_file=data_root + 'kitti_infos_val.pkl', pklfile_prefix=None, submission_prefix=None)`
+
+The arguments:
+
+- `type`: The name of the corresponding metric, usually associated with the dataset.
+- `ann_file`: The path of annotation file.
+- `pklfile_prefix`: An optional argument. The filename of the output results in pickle format. If not specified, the results will not be saved to a file.
+- `submission_prefix`: An optional argument. The results will be saved to a file then you can upload it to do the official evaluation.
+
+Examples:
+
+Assume that you have already downloaded the checkpoints to the directory `checkpoints/`.
+
+1. Test VoteNet on ScanNet and save the points and prediction visualization results.
+
+   ```shell
+   python tools/test.py configs/votenet/votenet_8xb8_scannet-3d.py \
+       checkpoints/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth \
+       --show --show-dir ./data/scannet/show_results
+   ```
+
+2. Test VoteNet on ScanNet, save the points, prediction, groundtruth visualization results, and evaluate the mAP.
+
+   ```shell
+   python tools/test.py configs/votenet/votenet_8xb8_scannet-3d.py \
+       checkpoints/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth \
+       --show --show-dir ./data/scannet/show_results
+   ```
+
+3. Test VoteNet on ScanNet (without saving the test results) and evaluate the mAP.
+
+   ```shell
+   python tools/test.py configs/votenet/votenet_8xb8_scannet-3d.py \
+       checkpoints/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth
+   ```
+
+4. Test SECOND on KITTI with 8 GPUs, and evaluate the mAP.
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py \
+       checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238-9208083a.pth
+   ```
+
+5. Test PointPillars on nuScenes with 8 GPUs, and generate the json file to be submit to the official evaluation server.
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py \
+       checkpoints/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth \
+      --cfg-options 'test_evaluator.jsonfile_prefix=./pointpillars_nuscenes_results'
+   ```
+
+   The generated results be under `./pointpillars_nuscenes_results` directory.
+
+6. Test SECOND on KITTI with 8 GPUs, and generate the pkl files and submission data to be submit to the official evaluation server.
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py \
+       checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238-9208083a.pth \
+       --cfg-options 'test_evaluator.pklfile_prefix=./second_kitti_results' 'submission_prefix=./second_kitti_results'
+   ```
+
+   The generated results be under `./second_kitti_results` directory.
+
+7. Test PointPillars on Lyft with 8 GPUs, generate the pkl files and make a submission to the leaderboard.
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/pointpillars/hv_pointpillars_fpn_sbn-2x8_2x_lyft-3d.py \
+       checkpoints/hv_pointpillars_fpn_sbn-2x8_2x_lyft-3d_latest.pth \
+       --cfg-options 'test_evaluator.jsonfile_prefix=results/pp_lyft/results_challenge' \
+       'test_evaluator.csv_savepath=results/pp_lyft/results_challenge.csv' \
+       'test_evaluator.pklfile_prefix=results/pp_lyft/results_challenge.pkl'
+   ```
+
+   **Notice**: To generate submissions on Lyft, `csv_savepath` must be given in the `--cfg-options`. After generating the csv file, you can make a submission with kaggle commands given on the [website](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/submit).
+
+   Note that in the [config of Lyft dataset](../../configs/_base_/datasets/lyft-3d.py), the value of `ann_file` keyword in `test` is `'lyft_infos_test.pkl'`, which is the official test set of Lyft without annotation. To test on the validation set, please change this to `'lyft_infos_val.pkl'`.
+
+8. Test PointPillars on waymo with 8 GPUs, and evaluate the mAP with waymo metrics.
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py  \
+       checkpoints/hv_pointpillars_secfpn_sbn-2x16_2x_waymo-3d-car_latest.pth \
+       --cfg-options 'test_evaluator.pklfile_prefix=results/waymo-car/kitti_results' \
+       'test_evaluator.submission_prefix=results/waymo-car/kitti_results'
+   ```
+
+   **Notice**: For evaluation on waymo, please follow the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md/) to build the binary file `compute_detection_metrics_main` for metrics computation and put it into `mmdet3d/core/evaluation/waymo_utils/`.(Sometimes when using bazel to build `compute_detection_metrics_main`, an error `'round' is not a member of 'std'` may appear. We just need to remove the `std::` before `round` in that file.) `pklfile_prefix` should be given in the `--eval-options` for the bin file generation. For metrics, `waymo` is the recommended official evaluation prototype. Currently, evaluating with choice `kitti` is adapted from KITTI and the results for each difficulty are not exactly the same as the definition of KITTI. Instead, most of objects are marked with difficulty 0 currently, which will be fixed in the future. The reasons of its instability include the large computation for evaluation, the lack of occlusion and truncation in the converted data, different definition of difficulty and different methods of computing average precision.
+
+9. Test PointPillars on waymo with 8 GPUs, generate the bin files and make a submission to the leaderboard.
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py  \
+       checkpoints/hv_pointpillars_secfpn_sbn-2x16_2x_waymo-3d-car_latest.pth \
+       --cfg-options 'test_evaluator.pklfile_prefix=results/waymo-car/kitti_results' \
+       'test_evaluator.submission_prefix=results/waymo-car/kitti_results'
+   ```
+
+   **Notice**: After generating the bin file, you can simply build the binary file `create_submission` and use them to create a submission file by following the [instruction](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md/). For evaluation on the validation set with the eval server, you can also use the same way to generate a submission.
+
+## Train predefined models on standard datasets
+
+MMDetection3D implements distributed training and non-distributed training,
+which uses `MMDistributedDataParallel` and `MMDataParallel` respectively.
+
+All outputs (log files and checkpoints) will be saved to the working directory,
+which is specified by `work_dir` in the config file.
+
+By default we evaluate the model on the validation set after each epoch, you can change the evaluation interval by adding the interval argument in the training config.
+
+```python
+train_cfg = dict(type='EpochBasedTrainLoop', val_interval=1)  # This evaluate the model per 12 epoch.
+```
+
+**Important**: The default learning rate in config files is for 8 GPUs and the exact batch size is marked by the config's file name, e.g. '2xb8' means 2 samples per GPU using 8 GPUs.
+According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to set the learning rate proportional to the batch size if you use different GPUs or images per GPU, e.g., lr=0.01 for 4 GPUs * 2 img/gpu and lr=0.08 for 16 GPUs * 4 img/gpu. However, since most of the models in this repo use ADAM rather than SGD for optimization, the rule may not hold and users need to tune the learning rate by themselves.
+
+### Train with a single GPU
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+If you want to specify the working directory in the command, you can add an argument `--work-dir ${YOUR_WORK_DIR}`.
+
+### Training with CPU (experimental)
+
+The process of training on the CPU is consistent with single GPU training. We just need to disable GPUs before the training process.
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+And then run the script of train with a single GPU.
+
+**Note**:
+
+For now, most of the point cloud related algorithms rely on 3D CUDA op, which can not be trained on CPU. Some monocular 3D object detection algorithms, like FCOS3D and SMOKE can be trained on CPU. We do not recommend users to use CPU for training because it is too slow. We support this feature to allow users to debug certain models on machines without GPU for convenience.
+
+### Train with multiple GPUs
+
+```shell
+./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+Optional arguments are:
+
+- `--cfg-options 'Key=value'`: Override some settings in the used config.
+
+### Train with multiple machines
+
+If you run MMDetection3D on a cluster managed with [slurm](https://slurm.schedmd.com/), you can use the script `slurm_train.sh`. (This script also supports single machine training.)
+
+```shell
+[GPUS=${GPUS}] ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR}
+```
+
+Here is an example of using 16 GPUs to train Mask R-CNN on the dev partition.
+
+```shell
+GPUS=16 ./tools/slurm_train.sh dev pp_kitti_3class configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py /nfs/xxxx/pp_kitti_3class
+```
+
+You can check [slurm_train.sh](https://github.com/open-mmlab/mmdetection/blob/master/tools/slurm_train.sh) for full arguments and environment variables.
+
+If you launch with multiple machines simply connected with ethernet, you can simply run following commands:
+
+On the first machine:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR ./tools/dist_train.sh $CONFIG $GPUS
+```
+
+On the second machine:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR ./tools/dist_train.sh $CONFIG $GPUS
+```
+
+Usually it is slow if you do not have high speed networking like InfiniBand.
+
+### Launch multiple jobs on a single machine
+
+If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,
+you need to specify different ports (29500 by default) for each job to avoid communication conflict.
+
+If you use `dist_train.sh` to launch training jobs, you can set the port in commands.
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+If you use launch training jobs with Slurm, there are two ways to specify the ports.
+
+1. Set the port through `--cfg-options`. This is more recommended since it does not change the original configs.
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} --cfg-options 'env_cfg.dist_cfg.port=29500'
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} --cfg-options 'env_cfg.dist_cfg.port=29501'
+   ```
+
+2. Modify the config files (usually the 6th line from the bottom in config files) to set different communication ports.
+
+   In `config1.py`,
+
+   ```python
+   env_cfg = dict(
+       dist_cfg=dict(backend='nccl', port=29500)
+   )
+   ```
+
+   In `config2.py`,
+
+   ```python
+   env_cfg = dict(
+       dist_cfg=dict(backend='nccl', port=29501)
+   )
+   ```
+
+   Then you can launch two jobs with `config1.py` and `config2.py`.
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR}
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR}
+   ```
diff --git a/mmde/docs/en/user_guides/useful_tools.md b/mmde/docs/en/user_guides/useful_tools.md
new file mode 100644
index 0000000000000000000000000000000000000000..ab73ae37027ae5eeb058e92c6e2ecb47714229cf
--- /dev/null
+++ b/mmde/docs/en/user_guides/useful_tools.md
@@ -0,0 +1,218 @@
+We provide lots of useful tools under `tools/` directory.
+
+## Log Analysis
+
+You can plot loss/mAP curves given a training log file. Run `pip install seaborn` first to install the dependency.
+
+![loss curve image](../../../resources/loss_curve.png)
+
+```shell
+python tools/analysis_tools/analyze_logs.py plot_curve [--keys ${KEYS}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] [--mode ${MODE}] [--interval ${INTERVAL}]
+```
+
+**Notice**: If the metric you want to plot is calculated in the eval stage, you need to add the flag `--mode eval`. If you perform evaluation with an interval of `${INTERVAL}`, you need to add the args `--interval ${INTERVAL}`.
+
+Examples:
+
+- Plot the classification loss of some run.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls --legend loss_cls
+  ```
+
+- Plot the classification and regression loss of some run, and save the figure to a pdf.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls loss_bbox --out losses.pdf
+  ```
+
+- Compare the bbox mAP of two runs in the same figure.
+
+  ```shell
+  # evaluate PartA2 and second on KITTI according to Car_3D_moderate_strict
+  python tools/analysis_tools/analyze_logs.py plot_curve tools/logs/PartA2.log.json tools/logs/second.log.json --keys KITTI/Car_3D_moderate_strict --legend PartA2 second --mode eval --interval 1
+  # evaluate PointPillars for car and 3 classes on KITTI according to Car_3D_moderate_strict
+  python tools/analysis_tools/analyze_logs.py plot_curve tools/logs/pp-3class.log.json tools/logs/pp.log.json --keys KITTI/Car_3D_moderate_strict --legend pp-3class pp --mode eval --interval 2
+  ```
+
+You can also compute the average training speed.
+
+```shell
+python tools/analysis_tools/analyze_logs.py cal_train_time log.json [--include-outliers]
+```
+
+The output is expected to be like the following.
+
+```
+-----Analyze train time of work_dirs/some_exp/20190611_192040.log.json-----
+slowest epoch 11, average time is 1.2024
+fastest epoch 1, average time is 1.1909
+time std over epochs is 0.0028
+average iter time: 1.1959 s/iter
+```
+
+&#8195;
+
+## Model Serving
+
+**Note**: This tool is still experimental now, only SECOND is supported to be served with [`TorchServe`](https://pytorch.org/serve/). We'll support more models in the future.
+
+In order to serve an `MMDetection3D` model with [`TorchServe`](https://pytorch.org/serve/), you can follow the steps:
+
+### 1. Convert the model from MMDetection3D to TorchServe
+
+```shell
+python tools/deployment/mmdet3d2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \
+--output-folder ${MODEL_STORE} \
+--model-name ${MODEL_NAME}
+```
+
+**Note**: ${MODEL_STORE} needs to be an absolute path to a folder.
+
+### 2. Build `mmdet3d-serve` docker image
+
+```shell
+docker build -t mmdet3d-serve:latest docker/serve/
+```
+
+### 3. Run `mmdet3d-serve`
+
+Check the official docs for [running TorchServe with docker](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment).
+
+In order to run it on the GPU, you need to install [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). You can omit the `--gpus` argument in order to run on the CPU.
+
+Example:
+
+```shell
+docker run --rm \
+--cpus 8 \
+--gpus device=0 \
+-p8080:8080 -p8081:8081 -p8082:8082 \
+--mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \
+mmdet3d-serve:latest
+```
+
+[Read the docs](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md/) about the Inference (8080), Management (8081) and Metrics (8082) APis
+
+### 4. Test deployment
+
+You can use `test_torchserver.py` to compare result of torchserver and pytorch.
+
+```shell
+python tools/deployment/test_torchserver.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME}
+[--inference-addr ${INFERENCE_ADDR}] [--device ${DEVICE}] [--score-thr ${SCORE_THR}]
+```
+
+Example:
+
+```shell
+python tools/deployment/test_torchserver.py demo/data/kitti/kitti_000008.bin configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth second
+```
+
+&#8195;
+
+## Model Complexity
+
+You can use `tools/analysis_tools/get_flops.py` in MMDetection3D, a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch), to compute the FLOPs and params of a given model.
+
+```shell
+python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}]
+```
+
+You will get the results like this.
+
+```text
+==============================
+Input shape: (40000, 4)
+Flops: 5.78 GFLOPs
+Params: 953.83 k
+==============================
+```
+
+**Note**: This tool is still experimental and we do not guarantee that the
+number is absolutely correct. You may well use the result for simple
+comparisons, but double check it before you adopt it in technical reports or papers.
+
+1. FLOPs are related to the input shape while parameters are not. The default
+   input shape is (1, 40000, 4).
+2. Some operators are not counted into FLOPs like GN and custom operators. Refer to [`mmcv.cnn.get_model_complexity_info()`](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/flops_counter.py) for details.
+3. We currently only support FLOPs calculation of single-stage models with single-modality input (point cloud or image). We will support two-stage and multi-modality models in the future.
+
+&#8195;
+
+## Model Conversion
+
+### RegNet model to MMDetection
+
+`tools/model_converters/regnet2mmdet.py` convert keys in pycls pretrained RegNet models to
+MMDetection style.
+
+```shell
+python tools/model_converters/regnet2mmdet.py ${SRC} ${DST} [-h]
+```
+
+### Detectron ResNet to Pytorch
+
+`tools/detectron2pytorch.py` in MMDetection could convert keys in the original detectron pretrained
+ResNet models to PyTorch style.
+
+```shell
+python tools/detectron2pytorch.py ${SRC} ${DST} ${DEPTH} [-h]
+```
+
+### Prepare a model for publishing
+
+`tools/model_converters/publish_model.py` helps users to prepare their model for publishing.
+
+Before you upload a model to AWS, you may want to
+
+1. convert model weights to CPU tensors
+2. delete the optimizer states and
+3. compute the hash of the checkpoint file and append the hash id to the
+   filename.
+
+```shell
+python tools/model_converters/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME}
+```
+
+E.g.,
+
+```shell
+python tools/model_converters/publish_model.py work_dirs/faster_rcnn/latest.pth faster_rcnn_r50_fpn_1x_20190801.pth
+```
+
+The final output filename will be `faster_rcnn_r50_fpn_1x_20190801-{hash id}.pth`.
+
+&#8195;
+
+## Dataset Conversion
+
+`tools/dataset_converters/` contains tools for converting datasets to other formats. Most of them convert datasets to pickle based info files, like kitti, nuscenes and lyft. Waymo converter is used to reorganize waymo raw data like KITTI style. Users could refer to them for our approach to converting data format. It is also convenient to modify them to use as scripts like nuImages converter.
+
+To convert the nuImages dataset into COCO format, please use the command below:
+
+```shell
+python -u tools/dataset_converters/nuimage_converter.py --data-root ${DATA_ROOT} --version ${VERSIONS} \
+                                                    --out-dir ${OUT_DIR} --nproc ${NUM_WORKERS} --extra-tag ${TAG}
+```
+
+- `--data-root`: the root of the dataset, defaults to `./data/nuimages`.
+- `--version`: the version of the dataset, defaults to `v1.0-mini`. To get the full dataset, please use `--version v1.0-train v1.0-val v1.0-mini`
+- `--out-dir`: the output directory of annotations and semantic masks, defaults to `./data/nuimages/annotations/`.
+- `--nproc`: number of workers for data preparation, defaults to `4`. Larger number could reduce the preparation time as images are processed in parallel.
+- `--extra-tag`: extra tag of the annotations, defaults to `nuimages`. This can be used to separate different annotations processed in different time for study.
+
+More details could be referred to the [doc](https://mmdetection3d.readthedocs.io/en/latest/data_preparation.html) for dataset preparation and [README](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/nuimages/README.md/) for nuImages dataset.
+
+&#8195;
+
+## Miscellaneous
+
+### Print the entire config
+
+`tools/misc/print_config.py` prints the whole config verbatim, expanding all its
+imports.
+
+```shell
+python tools/misc/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}]
+```
diff --git a/mmde/docs/en/user_guides/visualization.md b/mmde/docs/en/user_guides/visualization.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d51e7735173c96e591aeee6a2f79f404714c51e
--- /dev/null
+++ b/mmde/docs/en/user_guides/visualization.md
@@ -0,0 +1,202 @@
+# Visualization
+
+MMDetection3D provides a `Det3DLocalVisualizer` to visualize and store the state of the model during training and testing, as well as results, with the following features.
+
+1. Support the basic drawing interface for multi-modality data and multi-task.
+2. Support multiple backends such as local, TensorBoard, to write training status such as `loss`, `lr`, or performance evaluation metrics and to a specified single or multiple backends.
+3. Support ground truth visualization on multimodal data, and cross-modal visualization of 3D detection results.
+
+## Basic Drawing Interface
+
+Inherited from `DetLocalVisualizer`, `Det3DLocalVisualizer` provides an interface for drawing common objects on 2D images, such as drawing detection boxes, points, text, lines, circles, polygons, and binary masks. More details about 2D drawing can refer to the [visualization documentation](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html) in MMDetection. Here we introduce the 3D drawing interface:
+
+### Drawing point cloud on the image
+
+We support drawing point cloud on the image by using `draw_points_on_image`.
+
+```python
+import mmcv
+import numpy as np
+from mmengine import load
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+
+info_file = load('demo/data/kitti/000008.pkl')
+points = np.fromfile('demo/data/kitti/000008.bin', dtype=np.float32)
+points = points.reshape(-1, 4)[:, :3]
+lidar2img = np.array(info_file['data_list'][0]['images']['CAM2']['lidar2img'], dtype=np.float32)
+
+visualizer = Det3DLocalVisualizer()
+img = mmcv.imread('demo/data/kitti/000008.png')
+img = mmcv.imconvert(img, 'bgr', 'rgb')
+visualizer.set_image(img)
+visualizer.draw_points_on_image(points, lidar2img)
+visualizer.show()
+```
+
+![points_on_image](../../../resources/points_on_image.png)
+
+### Drawing 3D Boxes on Point Cloud
+
+We support drawing 3D boxes on point cloud by using `draw_bboxes_3d`.
+
+```python
+import torch
+import numpy as np
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+from mmdet3d.structures import LiDARInstance3DBoxes
+
+points = np.fromfile('demo/data/kitti/000008.bin', dtype=np.float32)
+points = points.reshape(-1, 4)
+visualizer = Det3DLocalVisualizer()
+# set point cloud in visualizer
+visualizer.set_points(points)
+bboxes_3d = LiDARInstance3DBoxes(
+    torch.tensor([[8.7314, -1.8559, -1.5997, 4.2000, 3.4800, 1.8900,
+                   -1.5808]]))
+# Draw 3D bboxes
+visualizer.draw_bboxes_3d(bboxes_3d)
+visualizer.show()
+```
+
+![mono3d](../../../resources/pcd.png)
+
+### Drawing Projected 3D Boxes on Image
+
+We support drawing projected 3D boxes on image by using `draw_proj_bboxes_3d`.
+
+```python
+import mmcv
+import numpy as np
+from mmengine import load
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+from mmdet3d.structures import CameraInstance3DBoxes
+
+info_file = load('demo/data/kitti/000008.pkl')
+cam2img = np.array(info_file['data_list'][0]['images']['CAM2']['cam2img'], dtype=np.float32)
+bboxes_3d = []
+for instance in info_file['data_list'][0]['instances']:
+    bboxes_3d.append(instance['bbox_3d'])
+gt_bboxes_3d = np.array(bboxes_3d, dtype=np.float32)
+gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d)
+input_meta = {'cam2img': cam2img}
+
+visualizer = Det3DLocalVisualizer()
+
+img = mmcv.imread('demo/data/kitti/000008.png')
+img = mmcv.imconvert(img, 'bgr', 'rgb')
+visualizer.set_image(img)
+# project 3D bboxes to image
+visualizer.draw_proj_bboxes_3d(gt_bboxes_3d, input_meta)
+visualizer.show()
+```
+
+### Drawing BEV Boxes
+
+We support drawing BEV boxes by using `draw_bev_bboxes`.
+
+```python
+import numpy as np
+from mmengine import load
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+from mmdet3d.structures import CameraInstance3DBoxes
+
+info_file = load('demo/data/kitti/000008.pkl')
+bboxes_3d = []
+for instance in info_file['data_list'][0]['instances']:
+    bboxes_3d.append(instance['bbox_3d'])
+gt_bboxes_3d = np.array(bboxes_3d, dtype=np.float32)
+gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d)
+
+visualizer = Det3DLocalVisualizer()
+# set bev image in visualizer
+visualizer.set_bev_image()
+# draw bev bboxes
+visualizer.draw_bev_bboxes(gt_bboxes_3d, edge_colors='orange')
+visualizer.show()
+```
+
+### Drawing 3D Semantic Mask
+
+We support draw segmentation mask via per-point colorization by using `draw_seg_mask`.
+
+```python
+import numpy as np
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+
+points = np.fromfile('demo/data/sunrgbd/000017.bin', dtype=np.float32)
+points = points.reshape(-1, 3)
+visualizer = Det3DLocalVisualizer()
+mask = np.random.rand(points.shape[0], 3)
+points_with_mask = np.concatenate((points, mask), axis=-1)
+# Draw 3D points with mask
+visualizer.set_points(points, pcd_mode=2, vis_mode='add')
+visualizer.draw_seg_mask(points_with_mask)
+visualizer.show()
+```
+
+## Results
+
+To see the prediction results of trained models, you can run the following command:
+
+```bash
+python tools/test.py ${CONFIG_FILE} ${CKPT_PATH} --show --show-dir ${SHOW_DIR}
+```
+
+After running this command, plotted results including input data and the output of networks visualized on the input will be saved in `${SHOW_DIR}`.
+
+After running this command, you will obtain the input data, the output of networks and ground-truth labels visualized on the input (e.g. `***_gt.png` and `***_pred.png` in multi-modality detection task and vision-based detection task) in `${SHOW_DIR}`. When `show` is enabled, [Open3D](http://www.open3d.org/) will be used to visualize the results online. If you are running test in remote server without GUI, the online visualization is not supported. You can download the `results.pkl` from the remote server, and visualize the prediction results offline in your local machine.
+
+To visualize the results with `Open3D` backend offline, you can run the following command:
+
+```bash
+python tools/misc/visualize_results.py ${CONFIG_FILE} --result ${RESULTS_PATH} --show-dir ${SHOW_DIR}
+```
+
+![](../../../resources/open3d_visual.gif)
+
+This allows the inference and results generation to be done in remote server and the users can open them on their host with GUI.
+
+## Dataset
+
+We also provide scripts to visualize the dataset without inference. You can use `tools/misc/browse_dataset.py` to show loaded data and ground-truth online and save them on the disk. Currently we support single-modality 3D detection and 3D segmentation on all the datasets, multi-modality 3D detection on KITTI and SUN RGB-D, as well as monocular 3D detection on nuScenes. To browse the KITTI dataset, you can run the following command:
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task lidar_det --output-dir ${OUTPUT_DIR}
+```
+
+**Notice**: Once specifying `--output-dir`, the images of views specified by users will be saved when pressing `_ESC_` in open3d window. If you want to zoom out/in the point clouds to inspect more details, you could specify `--show-interval=0` in the command.
+
+To verify the data consistency and the effect of data augmentation, you can also add `--aug` flag to visualize the data after data augmentation using the command as below:
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task lidar_det --aug --output-dir ${OUTPUT_DIR}
+```
+
+If you also want to show 2D images with 3D bounding boxes projected onto them, you need to find a config that supports multi-modality data loading, and then change the `--task` args to `multi-modality_det`. An example is showed below:
+
+```shell
+python tools/misc/browse_dataset.py configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py --task multi-modality_det --output-dir ${OUTPUT_DIR}
+```
+
+![](../../../resources/browse_dataset_multi_modality.png)
+
+You can simply browse different datasets using different configs, e.g. visualizing the ScanNet dataset in 3D semantic segmentation task:
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/scannet-seg.py --task lidar_seg --output-dir ${OUTPUT_DIR}
+```
+
+![](../../../resources/browse_dataset_seg.png)
+
+And browsing the nuScenes dataset in monocular 3D detection task:
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task mono_det --output-dir ${OUTPUT_DIR}
+```
+
+![](../../../resources/browse_dataset_mono.png)
diff --git a/mmde/docs/zh_cn/Makefile b/mmde/docs/zh_cn/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..d4bb2cbb9eddb1bb1b4f366623044af8e4830919
--- /dev/null
+++ b/mmde/docs/zh_cn/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/mmde/docs/zh_cn/_static/css/readthedocs.css b/mmde/docs/zh_cn/_static/css/readthedocs.css
new file mode 100644
index 0000000000000000000000000000000000000000..cc61ab82abc3da66a46dc51f0ed0cab0073a7493
--- /dev/null
+++ b/mmde/docs/zh_cn/_static/css/readthedocs.css
@@ -0,0 +1,6 @@
+.header-logo {
+    background-image: url("../image/mmdet3d-logo.png");
+    background-size: 182.5px 40px;
+    height: 40px;
+    width: 182.5px;
+}
diff --git a/mmde/docs/zh_cn/advanced_guides/customize_dataset.md b/mmde/docs/zh_cn/advanced_guides/customize_dataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..a481fa261baba8fc9d8e5aa46078284353c787ac
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/customize_dataset.md
@@ -0,0 +1,500 @@
+# 自定义数据集
+
+在本节中，您将了解如何使用自定义数据集训练和测试预定义模型。
+
+基本步骤如下：
+
+1. 准备数据
+2. 准备配置文件
+3. 在自定义数据集上训练，测试和推理模型
+
+## 数据准备
+
+理想情况下我们可以重新组织自定义的原始数据并将标注格式转换成 KITTI 风格。但是，考虑到对于自定义数据集而言，KITTI 格式的校准文件和 3D 标注难以获得，因此我们在文档中介绍基本的数据格式。
+
+### 基本数据格式
+
+#### 点云格式
+
+目前，我们只支持 `.bin` 格式的点云用于训练和推理。在训练自己的数据集之前，需要将其它格式的点云文件转换成 `.bin` 文件。常见的点云数据格式包括 `.pcd` 和 `.las`，我们列举了一些开源工具作为参考。
+
+1. `.pcd` 转换成 `.bin`：https://github.com/DanielPollithy/pypcd
+
+- 您可以通过以下指令安装 `pypcd`：
+
+  ```bash
+  pip install git+https://github.com/DanielPollithy/pypcd.git
+  ```
+
+- 您可以使用以下脚本读取 `.pcd` 文件，并将其转换成 `.bin` 格式来保存：
+
+  ```python
+  import numpy as np
+  from pypcd import pypcd
+
+  pcd_data = pypcd.PointCloud.from_path('point_cloud_data.pcd')
+  points = np.zeros([pcd_data.width, 4], dtype=np.float32)
+  points[:, 0] = pcd_data.pc_data['x'].copy()
+  points[:, 1] = pcd_data.pc_data['y'].copy()
+  points[:, 2] = pcd_data.pc_data['z'].copy()
+  points[:, 3] = pcd_data.pc_data['intensity'].copy().astype(np.float32)
+  with open('point_cloud_data.bin', 'wb') as f:
+      f.write(points.tobytes())
+  ```
+
+2. `.las` 转换成 `.bin`：常见的转换流程为 `.las -> .pcd -> .bin`，`.las -> .pcd` 的转换可以用该[工具](https://github.com/Hitachi-Automotive-And-Industry-Lab/semantic-segmentation-editor)实现。
+
+#### 标签格式
+
+最基本的信息：每个场景的 3D 边界框和类别标签应该包含在 `.txt` 标注文件中。每一行代表特定场景的一个 3D 框，如下所示：
+
+```
+# 格式：[x, y, z, dx, dy, dz, yaw, category_name]
+1.23 1.42 0.23 3.96 1.65 1.55 1.56 Car
+3.51 2.15 0.42 1.05 0.87 1.86 1.23 Pedestrian
+...
+```
+
+**注意**：对于自定义数据集的评估我们目前只支持 KITTI 评估方法。
+
+3D 框应存储在统一的 3D 坐标系中。
+
+#### 校准格式
+
+对于每个激光雷达收集的点云数据，通常会进行融合并转换到特定的激光雷达坐标系。因此，校准信息文件中通常应该包含每个相机的内参矩阵和激光雷达到每个相机的外参转换矩阵，并保存在 `.txt` 校准文件中，其中 `Px` 表示 `camera_x` 的内参矩阵，`lidar2camx` 表示 `lidar` 到 `camera_x` 的外参转换矩阵。
+
+```
+P0
+P1
+P2
+P3
+P4
+...
+lidar2cam0
+lidar2cam1
+lidar2cam2
+lidar2cam3
+lidar2cam4
+...
+```
+
+### 原始数据结构
+
+#### 基于激光雷达的 3D 检测
+
+基于激光雷达的 3D 目标检测原始数据通常组织成如下格式，其中 `ImageSets` 包含划分文件，指明哪些文件属于训练/验证集，`points` 包含存储成 `.bin` 格式的点云数据，`labels` 包含 3D 检测的标签文件。
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── points
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+│   │   ├── labels
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+```
+
+#### 基于视觉的 3D 检测
+
+基于视觉的 3D 目标检测原始数据通常组织成如下格式，其中 `ImageSets` 包含划分文件，指明哪些文件属于训练/验证集，`images` 包含来自不同相机的图像，例如 `camera_x` 获得的图像应放在 `images/images_x` 下，`calibs` 包含校准信息文件，其中存储了每个相机的内参矩阵，`labels` 包含 3D 检测的标签文件。
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── calibs
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+│   │   ├── images
+│   │   │   ├── images_0
+│   │   │   │   ├── 000000.png
+│   │   │   │   ├── 000001.png
+│   │   │   │   ├── ...
+│   │   │   ├── images_1
+│   │   │   ├── images_2
+│   │   │   ├── ...
+│   │   ├── labels
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+```
+
+#### 多模态 3D 检测
+
+多模态 3D 目标检测原始数据通常组织成如下格式。不同于基于视觉的 3D 目标检测，`calibs` 里的校准信息文件存储了每个相机的内参矩阵和外参矩阵。
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── calibs
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+│   │   ├── points
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+│   │   ├── images
+│   │   │   ├── images_0
+│   │   │   │   ├── 000000.png
+│   │   │   │   ├── 000001.png
+│   │   │   │   ├── ...
+│   │   │   ├── images_1
+│   │   │   ├── images_2
+│   │   │   ├── ...
+│   │   ├── labels
+│   │   │   ├── 000000.txt
+│   │   │   ├── 000001.txt
+│   │   │   ├── ...
+```
+
+#### 基于激光雷达的 3D 语义分割
+
+基于激光雷达的 3D 语义分割原始数据通常组织成如下格式，其中 `ImageSets` 包含划分文件，指明哪些文件属于训练/验证集，`points` 包含点云数据，`semantic_mask` 包含逐点级标签。
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── custom
+│   │   ├── ImageSets
+│   │   │   ├── train.txt
+│   │   │   ├── val.txt
+│   │   ├── points
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+│   │   ├── semantic_mask
+│   │   │   ├── 000000.bin
+│   │   │   ├── 000001.bin
+│   │   │   ├── ...
+```
+
+### 数据转换
+
+按照我们的说明准备好原始数据后，您可以直接使用以下命令生成训练/验证信息文件。
+
+```bash
+python tools/create_data.py custom --root-path ./data/custom --out-dir ./data/custom --extra-tag custom
+```
+
+## 自定义数据集示例
+
+在完成数据准备后，我们可以在 `mmdet3d/datasets/my_dataset.py` 中创建一个新的数据集来加载数据。
+
+```python
+import mmengine
+
+from mmdet3d.registry import DATASETS
+from .det3d_dataset import Det3DDataset
+
+
+@DATASETS.register_module()
+class MyDataset(Det3DDataset):
+
+    # 替换成自定义 pkl 信息文件里的所有类别
+    METAINFO = {
+        'classes': ('Pedestrian', 'Cyclist', 'Car')
+    }
+
+    def parse_ann_info(self, info):
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Data information of single data sample.
+
+        Returns:
+            dict: Annotation information consists of the following keys:
+
+                - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
+                  3D ground truth bboxes.
+                - gt_labels_3d (np.ndarray): Labels of ground truths.
+        """
+        ann_info = super().parse_ann_info(info)
+        if ann_info is None:
+            ann_info = dict()
+            # 空实例
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
+
+        # 过滤掉没有在训练中使用的类别
+        ann_info = self._remove_dontcare(ann_info)
+        gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d'])
+        ann_info['gt_bboxes_3d'] = gt_bboxes_3d
+        return ann_info
+```
+
+数据预处理后，用户可以通过以下两个步骤来训练自定义数据集：
+
+1. 修改配置文件来使用自定义数据集。
+2. 验证自定义数据集标注的正确性。
+
+这里我们以在自定义数据集上训练 PointPillars 为例：
+
+### 准备配置
+
+这里我们演示一个纯点云训练的配置示例：
+
+#### 准备数据集配置
+
+在 `configs/_base_/datasets/custom.py` 中：
+
+```python
+# 数据集设置
+dataset_type = 'MyDataset'
+data_root = 'data/custom/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']  # 替换成您的数据集类别
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]  # 根据您的数据集进行调整
+input_modality = dict(use_lidar=True, use_camera=False)
+metainfo = dict(classes=class_names)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,  # 替换成您的点云数据维度
+        use_dim=4),  # 替换成在训练和推理时实际使用的维度
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,  # 替换成您的点云数据维度
+        use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+# 为可视化阶段的数据和 GT 加载构造流水线
+eval_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points']),
+]
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='custom_infos_train.pkl',  # 指定您的训练 pkl 信息
+            data_prefix=dict(pts='points'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            box_type_3d='LiDAR')))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='points'),
+        ann_file='custom_infos_val.pkl',  # 指定您的验证 pkl 信息
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'custom_infos_val.pkl',  # 指定您的验证 pkl 信息
+    metric='bbox')
+```
+
+#### 准备模型配置
+
+对于基于体素化的检测器如 SECOND，PointPillars 及 CenterPoint，点云范围（point cloud range）和体素大小（voxel size）应该根据您的数据集做调整。理论上，`voxel_size` 和 `point_cloud_range` 的设置是相关联的。设置较小的 `voxel_size` 将增加体素数以及相应的内存消耗。此外，需要注意以下问题：
+
+如果将 `point_cloud_range` 和 `voxel_size` 分别设置成 `[0, -40, -3, 70.4, 40, 1]` 和 `[0.05, 0.05, 0.1]`，那么中间特征图的形状应该为 `[(1-(-3))/0.1+1, (40-(-40))/0.05, (70.4-0)/0.05]=[41, 1600, 1408]`。更改 `point_cloud_range` 时，请记得依据 `voxel_size` 更改 `middle_encoder` 里中间特征图的形状。
+
+关于 `anchor_range` 的设置，一般需要根据数据集做调整。需要注意的是，`z` 值需要根据点云的位置做相应调整，具体请参考此 [issue](https://github.com/open-mmlab/mmdetection3d/issues/986)。
+
+关于 `anchor_size` 的设置，通常需要计算整个训练集中目标的长、宽、高的平均值作为 `anchor_size`，以获得最好的结果。
+
+在 `configs/_base_/models/pointpillars_hv_secfpn_custom.py` 中：
+
+```python
+voxel_size = [0.16, 0.16, 4]  # 根据您的数据集做调整
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]  # 根据您的数据集做调整
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=32,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range),
+    # `output_shape` 需要根据 `point_cloud_range` 和 `voxel_size` 做相应调整
+    middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+    backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        assign_per_class=True,
+        # 根据您的数据集调整 `ranges` 和 `sizes`
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[
+                [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                [0, -39.68, -1.78, 69.12, 39.68, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # 模型训练和测试设置
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
+```
+
+#### 准备整体配置
+
+我们将上述的所有配置组合在 `configs/pointpillars/pointpillars_hv_secfpn_8xb6_custom.py` 文件中：
+
+```python
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_custom.py',
+    '../_base_/datasets/custom.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
+```
+
+#### 可视化数据集（可选）
+
+为了验证准备的数据和配置是否正确，我们建议在训练和验证前使用 `tools/misc/browse_dataset.py` 脚本可视化数据集和标注。更多细节请参考[可视化文档](https://mmdetection3d.readthedocs.io/zh_CN/dev-1.x/user_guides/visualization.html)。
+
+## 评估
+
+准备好数据和配置之后，您可以遵循我们的文档直接运行训练/测试脚本。
+
+**注意**：我们为自定义数据集提供了 KITTI 风格的评估实现方法。在数据集配置中需要包含如下内容：
+
+```python
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'custom_infos_val.pkl',  # 指定您的验证 pkl 信息
+    metric='bbox')
+```
diff --git a/mmde/docs/zh_cn/advanced_guides/customize_models.md b/mmde/docs/zh_cn/advanced_guides/customize_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..677deca35b158e79eb9c2f07fe959ec93dfc0044
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/customize_models.md
@@ -0,0 +1,619 @@
+# 自定义模型
+
+我们通常把模型的各个组成成分分成 6 种类型：
+
+- 编码器（encoder）：包括 voxel encoder 和 middle encoder 等进入 backbone 前所使用的基于体素的方法，如 `HardVFE` 和 `PointPillarsScatter`。
+- 骨干网络（backbone）：通常采用 FCN 网络来提取特征图，如 `ResNet` 和 `SECOND`。
+- 颈部网络（neck）：位于 backbones 和 heads 之间的组成模块，如 `FPN` 和 `SECONDFPN`。
+- 检测头（head）：用于特定任务的组成模块，如`检测框的预测`和`掩码的预测`。
+- RoI 提取器（RoI extractor）：用于从特征图中提取 RoI 特征的组成模块，如 `H3DRoIHead` 和 `PartAggregationROIHead`。
+- 损失函数（loss）：heads 中用于计算损失函数的组成模块，如 `FocalLoss`、`L1Loss` 和 `GHMLoss`。
+
+## 开发新的组成模块
+
+### 添加新的编码器
+
+接下来我们以 HardVFE 为例展示如何开发新的组成模块。
+
+#### 1. 定义一个新的体素编码器（如 HardVFE：即 HV-SECOND 中使用的体素特征编码器）
+
+创建一个新文件 `mmdet3d/models/voxel_encoders/voxel_encoder.py`。
+
+```python
+import torch.nn as nn
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class HardVFE(nn.Module):
+
+    def __init__(self, arg1, arg2):
+        pass
+
+    def forward(self, x):  # 需要返回一个元组
+        pass
+```
+
+#### 2. 导入该模块
+
+您可以在 `mmdet3d/models/voxel_encoders/__init__.py` 中添加以下代码：
+
+```python
+from .voxel_encoder import HardVFE
+```
+
+或者在配置文件中添加以下代码，从而避免修改源码：
+
+```python
+custom_imports = dict(
+    imports=['mmdet3d.models.voxel_encoders.voxel_encoder'],
+    allow_failed_imports=False)
+```
+
+#### 3. 在配置文件中使用体素编码器
+
+```python
+model = dict(
+    ...
+    voxel_encoder=dict(
+        type='HardVFE',
+        arg1=xxx,
+        arg2=yyy),
+    ...
+)
+```
+
+### 添加新的骨干网络
+
+接下来我们以 [SECOND](https://www.mdpi.com/1424-8220/18/10/3337)（Sparsely Embedded Convolutional Detection）为例展示如何开发新的组成模块。
+
+#### 1. 定义一个新的骨干网络（如 SECOND）
+
+创建一个新文件 `mmdet3d/models/backbones/second.py`。
+
+```python
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class SECOND(BaseModule):
+
+    def __init__(self, arg1, arg2):
+        pass
+
+    def forward(self, x):  # 需要返回一个元组
+        pass
+```
+
+#### 2. 导入该模块
+
+您可以在 `mmdet3d/models/backbones/__init__.py` 中添加以下代码：
+
+```python
+from .second import SECOND
+```
+
+或者在配置文件中添加以下代码，从而避免修改源码：
+
+```python
+custom_imports = dict(
+    imports=['mmdet3d.models.backbones.second'],
+    allow_failed_imports=False)
+```
+
+#### 3. 在配置文件中使用骨干网络
+
+```python
+model = dict(
+    ...
+    backbone=dict(
+        type='SECOND',
+        arg1=xxx,
+        arg2=yyy),
+    ...
+)
+```
+
+### 添加新的颈部网络
+
+#### 1. 定义一个新的颈部网络（如 SECONDFPN）
+
+创建一个新文件 `mmdet3d/models/necks/second_fpn.py`。
+
+```python
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class SECONDFPN(BaseModule):
+
+    def __init__(self,
+                 in_channels=[128, 128, 256],
+                 out_channels=[256, 256, 256],
+                 upsample_strides=[1, 2, 4],
+                 norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+                 upsample_cfg=dict(type='deconv', bias=False),
+                 conv_cfg=dict(type='Conv2d', bias=False),
+                 use_conv_for_no_stride=False,
+                 init_cfg=None):
+        pass
+
+    def forward(self, x):
+        # 具体实现忽略
+        pass
+```
+
+#### 2. 导入该模块
+
+您可以在 `mmdet3d/models/necks/__init__.py` 中添加以下代码：
+
+```python
+from .second_fpn import SECONDFPN
+```
+
+或者在配置文件中添加以下代码，从而避免修改源码：
+
+```python
+custom_imports = dict(
+    imports=['mmdet3d.models.necks.second_fpn'],
+    allow_failed_imports=False)
+```
+
+#### 3. 在配置文件中使用颈部网络
+
+```python
+model = dict(
+    ...
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    ...
+)
+```
+
+### 添加新的检测头
+
+接下来我们以 [PartA2 Head](https://arxiv.org/abs/1907.03670) 为例展示如何开发新的检测头。
+
+**注意**：此处展示的 `PartA2 RoI Head` 将用于检测器的第二阶段。对于单阶段的检测头，请参考 `mmdet3d/models/dense_heads/` 中的例子。由于其简单高效，它们更常用于自动驾驶场景下的 3D 检测中。
+
+首先，在 `mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py` 中添加新的 bbox head。`PartA2 RoI Head` 为目标检测实现了一个新的 bbox head。为了实现一个 bbox head，我们通常需要在新模块中实现如下两个函数。有时还需要实现其他相关函数，如 `loss` 和 `get_targets`。
+
+```python
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class PartA2BboxHead(BaseModule):
+    """PartA2 RoI head."""
+
+    def __init__(self,
+                 num_classes,
+                 seg_in_channels,
+                 part_in_channels,
+                 seg_conv_channels=None,
+                 part_conv_channels=None,
+                 merge_conv_channels=None,
+                 down_conv_channels=None,
+                 shared_fc_channels=None,
+                 cls_channels=None,
+                 reg_channels=None,
+                 dropout_ratio=0.1,
+                 roi_feat_size=14,
+                 with_corner_loss=True,
+                 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+                 conv_cfg=dict(type='Conv1d'),
+                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 loss_bbox=dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     reduction='none',
+                     loss_weight=1.0),
+                 init_cfg=None):
+        super(PartA2BboxHead, self).__init__(init_cfg=init_cfg)
+
+    def forward(self, seg_feats, part_feats):
+        pass
+```
+
+其次，如果有必要的话需要实现一个新的 RoI Head。我们从 `Base3DRoIHead` 中继承得到新的 `PartAggregationROIHead`。我们可以发现 `Base3DRoIHead` 已经实现了如下函数。
+
+```python
+from mmdet.models.roi_heads import BaseRoIHead
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+
+
+class Base3DRoIHead(BaseRoIHead):
+    """Base class for 3d RoIHeads."""
+
+    def __init__(self,
+                 bbox_head=None,
+                 bbox_roi_extractor=None,
+                 mask_head=None,
+                 mask_roi_extractor=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None):
+        super(Base3DRoIHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            mask_head=mask_head,
+            mask_roi_extractor=mask_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+
+    def init_bbox_head(self, bbox_roi_extractor: dict,
+                       bbox_head: dict) -> None:
+        """Initialize box head and box roi extractor.
+
+        Args:
+            bbox_roi_extractor (dict or ConfigDict): Config of box
+                roi extractor.
+            bbox_head (dict or ConfigDict): Config of box in box head.
+        """
+        self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor)
+        self.bbox_head = MODELS.build(bbox_head)
+
+    def init_assigner_sampler(self):
+        """Initialize assigner and sampler."""
+        self.bbox_assigner = None
+        self.bbox_sampler = None
+        if self.train_cfg:
+            if isinstance(self.train_cfg.assigner, dict):
+                self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
+            elif isinstance(self.train_cfg.assigner, list):
+                self.bbox_assigner = [
+                    TASK_UTILS.build(res) for res in self.train_cfg.assigner
+                ]
+            self.bbox_sampler = TASK_UTILS.build(self.train_cfg.sampler)
+
+    def init_mask_head(self):
+        """Initialize mask head, skip since ``PartAggregationROIHead`` does not
+        have one."""
+        pass
+```
+
+接下来主要对 bbox_forward 的逻辑进行修改，同时其继承了来自 `Base3DRoIHead` 的其它逻辑。在 `mmdet3d/models/roi_heads/part_aggregation_roi_head.py` 中，我们实现了新的 RoI Head，如下所示：
+
+```python
+from typing import Dict, List, Tuple
+
+from mmdet.models.task_modules import AssignResult, SamplingResult
+from mmengine import ConfigDict
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import bbox3d2roi
+from mmdet3d.utils import InstanceList
+from ...structures.det3d_data_sample import SampleList
+from .base_3droi_head import Base3DRoIHead
+
+
+@MODELS.register_module()
+class PartAggregationROIHead(Base3DRoIHead):
+    """Part aggregation roi head for PartA2.
+
+    Args:
+        semantic_head (ConfigDict): Config of semantic head.
+        num_classes (int): The number of classes.
+        seg_roi_extractor (ConfigDict): Config of seg_roi_extractor.
+        bbox_roi_extractor (ConfigDict): Config of part_roi_extractor.
+        bbox_head (ConfigDict): Config of bbox_head.
+        train_cfg (ConfigDict): Training config.
+        test_cfg (ConfigDict): Testing config.
+    """
+
+    def __init__(self,
+                 semantic_head: dict,
+                 num_classes: int = 3,
+                 seg_roi_extractor: dict = None,
+                 bbox_head: dict = None,
+                 bbox_roi_extractor: dict = None,
+                 train_cfg: dict = None,
+                 test_cfg: dict = None,
+                 init_cfg: dict = None) -> None:
+        super(PartAggregationROIHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        self.num_classes = num_classes
+        assert semantic_head is not None
+        self.init_seg_head(seg_roi_extractor, semantic_head)
+
+    def init_seg_head(self, seg_roi_extractor: dict,
+                      semantic_head: dict) -> None:
+        """Initialize semantic head and seg roi extractor.
+
+        Args:
+            seg_roi_extractor (dict): Config of seg
+                roi extractor.
+            semantic_head (dict): Config of semantic head.
+        """
+        self.semantic_head = MODELS.build(semantic_head)
+        self.seg_roi_extractor = MODELS.build(seg_roi_extractor)
+
+    @property
+    def with_semantic(self):
+        """bool: whether the head has semantic branch"""
+        return hasattr(self,
+                       'semantic_head') and self.semantic_head is not None
+
+    def predict(self,
+                feats_dict: Dict,
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList,
+                rescale: bool = False,
+                **kwargs) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented in PartA2.'
+        assert self.with_semantic, 'Semantic head must be implemented' \
+                                   ' in PartA2.'
+
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        voxels_dict = feats_dict.pop('voxels_dict')
+        # TODO: Split predict semantic and bbox
+        results_list = self.predict_bbox(feats_dict, voxels_dict,
+                                         batch_input_metas, rpn_results_list,
+                                         self.test_cfg)
+        return results_list
+
+    def predict_bbox(self, feats_dict: Dict, voxel_dict: Dict,
+                     batch_input_metas: List[dict],
+                     rpn_results_list: InstanceList,
+                     test_cfg: ConfigDict) -> InstanceList:
+        """Perform forward propagation of the bbox head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            voxel_dict (dict): Contains information of voxels.
+            batch_input_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            test_cfg (Config): Test config.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        ...
+
+    def loss(self, feats_dict: Dict, rpn_results_list: InstanceList,
+             batch_data_samples: SampleList, **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        losses = dict()
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        voxels_dict = feats_dict.pop('voxels_dict')
+        for data_sample in batch_data_samples:
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            if 'ignored_instances' in data_sample:
+                batch_gt_instances_ignore.append(data_sample.ignored_instances)
+            else:
+                batch_gt_instances_ignore.append(None)
+        if self.with_semantic:
+            semantic_results = self._semantic_forward_train(
+                feats_dict, voxels_dict, batch_gt_instances_3d)
+            losses.update(semantic_results.pop('loss_semantic'))
+
+        sample_results = self._assign_and_sample(rpn_results_list,
+                                                 batch_gt_instances_3d)
+        if self.with_bbox:
+            feats_dict.update(semantic_results)
+            bbox_results = self._bbox_forward_train(feats_dict, voxels_dict,
+                                                    sample_results)
+            losses.update(bbox_results['loss_bbox'])
+
+        return losses
+```
+
+此处我们省略了相关函数的更多细节。更多细节请参考[代码](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/roi_heads/part_aggregation_roi_head.py)。
+
+最后，用户需要在 `mmdet3d/models/roi_heads/bbox_heads/__init__.py` 和 `mmdet3d/models/roi_heads/__init__.py` 添加模块，从而能被相应的注册器找到并加载。
+
+此外，用户也可以在配置文件中添加以下代码以达到相同的目的。
+
+```python
+custom_imports=dict(
+    imports=['mmdet3d.models.roi_heads.part_aggregation_roi_head', 'mmdet3d.models.roi_heads.bbox_heads.parta2_bbox_head'],
+    allow_failed_imports=False)
+```
+
+`PartAggregationROIHead` 的配置文件如下所示：
+
+```python
+model = dict(
+    ...
+    roi_head=dict(
+        type='PartAggregationROIHead',
+        num_classes=3,
+        semantic_head=dict(
+            type='PointwiseSemanticHead',
+            in_channels=16,
+            extra_width=0.2,
+            seg_score_thr=0.3,
+            num_classes=3,
+            loss_seg=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_part=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                loss_weight=1.0)),
+        seg_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='max')),
+        bbox_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='avg')),
+        bbox_head=dict(
+            type='PartA2BboxHead',
+            num_classes=3,
+            seg_in_channels=16,
+            part_in_channels=4,
+            seg_conv_channels=[64, 64],
+            part_conv_channels=[64, 64],
+            merge_conv_channels=[128, 128],
+            down_conv_channels=[128, 256],
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            shared_fc_channels=[256, 512, 512, 512],
+            cls_channels=[256, 256],
+            reg_channels=[256, 256],
+            dropout_ratio=0.1,
+            roi_feat_size=14,
+            with_corner_loss=True,
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss',
+                beta=1.0 / 9.0,
+                reduction='sum',
+                loss_weight=1.0),
+            loss_cls=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                loss_weight=1.0))),
+    ...
+)
+```
+
+MMDetection 2.0 开始支持配置文件之间的继承，因此用户可以关注配置文件的修改。PartA2 Head 的第二阶段主要使用了新的 `PartAggregationROIHead` 和 `PartA2BboxHead`，需要根据对应模块的 `__init__` 函数来设置参数。
+
+### 添加新的损失函数
+
+假设您想要为检测框的回归添加一个新的损失函数 `MyLoss`。为了添加一个新的损失函数，用户需要在 `mmdet3d/models/losses/my_loss.py` 中实现该函数。装饰器 `weighted_loss` 能够保证对每个元素的损失进行加权平均。
+
+```python
+import torch
+import torch.nn as nn
+from mmdet.models.losses.utils import weighted_loss
+
+from mmdet3d.registry import MODELS
+
+
+@weighted_loss
+def my_loss(pred, target):
+    assert pred.size() == target.size() and target.numel() > 0
+    loss = torch.abs(pred - target)
+    return loss
+
+
+@MODELS.register_module()
+class MyLoss(nn.Module):
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(MyLoss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * my_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
+```
+
+接下来，用户需要在 `mmdet3d/models/losses/__init__.py` 添加该函数。
+
+```python
+from .my_loss import MyLoss, my_loss
+```
+
+或者在配置文件中添加以下代码以达到相同的目的。
+
+```python
+custom_imports=dict(
+    imports=['mmdet3d.models.losses.my_loss'],
+    allow_failed_imports=False)
+```
+
+为了使用该函数，用户需要修改 `loss_xxx` 域。由于 `MyLoss` 是用于回归的，您需要修改 head 中的 `loss_bbox` 域。
+
+```python
+loss_bbox=dict(type='MyLoss', loss_weight=1.0)
+```
diff --git a/mmde/docs/zh_cn/advanced_guides/customize_runtime.md b/mmde/docs/zh_cn/advanced_guides/customize_runtime.md
new file mode 100644
index 0000000000000000000000000000000000000000..9fea0d105a88776443718d45e4e1e8bb1b6bdeac
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/customize_runtime.md
@@ -0,0 +1,382 @@
+# 自定义运行时配置
+
+## 自定义优化器设置
+
+优化器相关的配置是由 `optim_wrapper` 管理的，其通常有三个字段：`optimizer`，`paramwise_cfg`，`clip_grad`。更多细节请参考 [OptimWrapper](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/optim_wrapper.html)。如下所示，使用 `AdamW` 作为`优化器`，骨干网络的学习率降低 10 倍，并添加了梯度裁剪。
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    # 优化器
+    optimizer=dict(
+        type='AdamW',
+        lr=0.0001,
+        weight_decay=0.05,
+        eps=1e-8,
+        betas=(0.9, 0.999)),
+
+    # 参数级学习率及权重衰减系数设置
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+        },
+        norm_decay_mult=0.0),
+
+    # 梯度裁剪
+    clip_grad=dict(max_norm=0.01, norm_type=2))
+```
+
+### 自定义 PyTorch 支持的优化器
+
+我们已经支持使用所有 PyTorch 实现的优化器，且唯一需要修改的地方就是改变配置文件中的 `optim_wrapper` 字段中的 `optimizer` 字段。例如，如果您想使用 `Adam`（注意这样可能会使性能大幅下降），您可以这样修改：
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='Adam', lr=0.0003, weight_decay=0.0001))
+```
+
+为了修改模型的学习率，用户只需要修改 `optimizer` 中的 `lr` 字段。用户可以根据 PyTorch 的 [API 文档](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim)直接设置参数。
+
+### 自定义并实现优化器
+
+#### 1. 定义新的优化器
+
+一个自定义优化器可以按照如下过程定义：
+
+假设您想要添加一个叫 `MyOptimizer` 的，拥有参数 `a`，`b` 和 `c` 的优化器，您需要创建一个叫做 `mmdet3d/engine/optimizers` 的目录。接下来，应该在目录下某个文件中实现新的优化器，比如 `mmdet3d/engine/optimizers/my_optimizer.py`：
+
+```python
+from torch.optim import Optimizer
+
+from mmdet3d.registry import OPTIMIZERS
+
+
+@OPTIMIZERS.register_module()
+class MyOptimizer(Optimizer):
+
+    def __init__(self, a, b, c):
+        pass
+```
+
+#### 2. 将优化器添加到注册器
+
+为了找到上述定义的优化器模块，该模块首先需要被引入主命名空间。有两种实现方法：
+
+- 修改 `mmdet3d/engine/optimizers/__init__.py` 导入该模块。
+
+  新定义的模块应该在 `mmdet3d/engine/optimizers/__init__.py` 中被导入，从而被找到并且被添加到注册器中：
+
+  ```python
+  from .my_optimizer import MyOptimizer
+  ```
+
+- 在配置中使用 `custom_imports` 来人工导入新优化器。
+
+  ```python
+  custom_imports = dict(imports=['mmdet3d.engine.optimizers.my_optimizer'], allow_failed_imports=False)
+  ```
+
+  模块 `mmdet3d.engine.optimizers.my_optimizer` 会在程序开始被导入，且 `MyOptimizer` 类在那时会自动被注册。注意到应该只有包含 `MyOptimizer` 类的包被导入。`mmdet3d.engine.optimizers.my_optimizer.MyOptimizer`**不能**被直接导入。
+
+  事实上，用户可以在这种导入的方法中使用完全不同的文件目录结构，只要保证根目录能在 `PYTHONPATH` 中被定位。
+
+#### 3. 在配置文件中指定优化器
+
+接下来您可以在配置文件的 `optimizer` 字段中使用 `MyOptimizer`。在配置文件中，优化器在 `optimizer` 字段中以如下方式定义：
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+```
+
+为了使用您自己的优化器，该字段可以改为：
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value))
+```
+
+### 自定义优化器封装构造器
+
+部分模型可能会拥有一些参数专属的优化器设置，比如 BatchNorm 层的权重衰减 (weight decay)。用户可以通过自定义优化器封装构造器来对那些细粒度的参数进行调优。
+
+```python
+from mmengine.optim import DefaultOptimWrapperConstructor
+
+from mmdet3d.registry import OPTIM_WRAPPER_CONSTRUCTORS
+from .my_optimizer import MyOptimizer
+
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
+class MyOptimizerWrapperConstructor(DefaultOptimWrapperConstructor):
+
+    def __init__(self,
+                 optim_wrapper_cfg: dict,
+                 paramwise_cfg: Optional[dict] = None):
+        pass
+
+    def __call__(self, model: nn.Module) -> OptimWrapper:
+
+        return optim_wrapper
+```
+
+默认优化器封装构造器在[这里](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L18)实现。这部分代码也可以用作新优化器封装构造器的模板。
+
+### 额外的设置
+
+没有在优化器部分实现的技巧应该通过优化器封装构造器或者钩子来实现（比如逐参数的学习率设置）。我们列举了一些常用的可以稳定训练过程或者加速训练的设置。我们欢迎提供更多类似设置的 PR 和 issue。
+
+- __使用梯度裁剪 (gradient clip) 来稳定训练过程__：一些模型依赖梯度裁剪技术来裁剪训练中的梯度，以稳定训练过程。举例如下：
+
+  ```python
+  optim_wrapper = dict(
+      _delete_=True, clip_grad=dict(max_norm=35, norm_type=2))
+  ```
+
+  如果您的配置继承了一个已经设置了 `optim_wrapper` 的基础配置，那么您可能需要 `_delete_=True` 字段来覆盖基础配置中无用的设置。更多细节请参考[配置文档](https://mmdetection3d.readthedocs.io/zh_CN/dev-1.x/user_guides/config.html)。
+
+- __使用动量调度器 (momentum scheduler) 来加速模型收敛__：我们支持用动量调度器来根据学习率更改模型的动量，这样可以使模型更快地收敛。动量调度器通常和学习率调度器一起使用，例如，如下配置文件在 [3D 检测](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/schedules/cyclic-20e.py)中被用于加速模型收敛。更多细节请参考 [CosineAnnealingLR](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L43) 和 [CosineAnnealingMomentum](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py#L71) 的实现方法。
+
+  ```python
+  param_scheduler = [
+      # 学习率调度器
+      # 在前 8 个 epoch，学习率从 0 升到 lr * 10
+      # 在接下来 12 个 epoch，学习率从 lr * 10 降到 lr * 1e-4
+      dict(
+          type='CosineAnnealingLR',
+          T_max=8,
+          eta_min=lr * 10,
+          begin=0,
+          end=8,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      dict(
+          type='CosineAnnealingLR',
+          T_max=12,
+          eta_min=lr * 1e-4,
+          begin=8,
+          end=20,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      # 动量调度器
+      # 在前 8 个 epoch，动量从 0 升到 0.85 / 0.95
+      # 在接下来 12 个 epoch，动量从 0.85 / 0.95 升到 1
+      dict(
+          type='CosineAnnealingMomentum',
+          T_max=8,
+          eta_min=0.85 / 0.95,
+          begin=0,
+          end=8,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      dict(
+          type='CosineAnnealingMomentum',
+          T_max=12,
+          eta_min=1,
+          begin=8,
+          end=20,
+          by_epoch=True,
+          convert_to_iter_based=True)
+  ]
+  ```
+
+## 自定义训练调度
+
+默认情况下我们使用阶梯式学习率衰减的 1 倍训练调度，这会调用 MMEngine 中的 [`MultiStepLR`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L144)。我们在[这里](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py)支持了很多其他学习率调度，比如`余弦退火`和`多项式衰减`调度。下面是一些样例：
+
+- 多项式衰减调度：
+
+  ```python
+  param_scheduler = [
+      dict(
+          type='PolyLR',
+          power=0.9,
+          eta_min=1e-4,
+          begin=0,
+          end=8,
+          by_epoch=True)]
+  ```
+
+- 余弦退火调度：
+
+  ```python
+  param_scheduler = [
+      dict(
+          type='CosineAnnealingLR',
+          T_max=8,
+          eta_min=lr * 1e-5,
+          begin=0,
+          end=8,
+          by_epoch=True)]
+  ```
+
+## 自定义训练循环控制器
+
+默认情况下，我们在 `train_cfg` 中使用 `EpochBasedTrainLoop`，并在每一个训练 epoch 完成后进行一次验证，如下所示：
+
+```python
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=1, val_interval=1)
+```
+
+事实上，[`IterBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L185) 和 [`EpochBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L18) 都支持动态间隔验证，如下所示：
+
+```python
+# 在第 365001 次迭代之前，我们每隔 5000 次迭代验证一次。
+# 在第 365000 次迭代之后，我们每隔 368750 次迭代验证一次，
+# 这意味着我们在训练结束后进行验证。
+
+interval = 5000
+max_iters = 368750
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+train_cfg = dict(
+    type='IterBasedTrainLoop',
+    max_iters=max_iters,
+    val_interval=interval,
+    dynamic_intervals=dynamic_intervals)
+```
+
+## 自定义钩子
+
+### 自定义并实现钩子
+
+#### 1. 实现一个新钩子
+
+MMEngine 提供了一些实用的[钩子](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/hook.html)，但有些场合用户可能需要实现一个新的钩子。在 v1.1.0rc0 之后，MMDetection3D 在训练时支持基于 MMEngine 自定义钩子。因此用户可以直接在 mmdet3d 或者基于 mmdet3d 的代码库中实现钩子并通过更改训练配置来使用钩子。这里我们给出一个在 mmdet3d 中创建并使用新钩子的例子。
+
+```python
+from mmengine.hooks import Hook
+
+from mmdet3d.registry import HOOKS
+
+
+@HOOKS.register_module()
+class MyHook(Hook):
+
+    def __init__(self, a, b):
+
+    def before_run(self, runner) -> None:
+
+    def after_run(self, runner) -> None:
+
+    def before_train(self, runner) -> None:
+
+    def after_train(self, runner) -> None:
+
+    def before_train_epoch(self, runner) -> None:
+
+    def after_train_epoch(self, runner) -> None:
+
+    def before_train_iter(self,
+                          runner,
+                          batch_idx: int,
+                          data_batch: DATA_BATCH = None) -> None:
+
+    def after_train_iter(self,
+                         runner,
+                         batch_idx: int,
+                         data_batch: DATA_BATCH = None,
+                         outputs: Optional[dict] = None) -> None:
+```
+
+用户需要根据钩子的功能指定钩子在每个训练阶段时的行为，具体包括如下阶段：`before_run`，`after_run`，`before_train`，`after_train`，`before_train_epoch`，`after_train_epoch`，`before_train_iter`，和 `after_train_iter`。有更多的位点可以插入钩子，详情可参考 [base hook class](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py#L9)。
+
+#### 2. 注册新钩子
+
+接下来我们需要导入 `MyHook`。假设新钩子位于文件 `mmdet3d/engine/hooks/my_hook.py` 中，有两种实现方法：
+
+- 修改 `mmdet3d/engine/hooks/__init__.py` 导入该模块。
+
+  新定义的模块应该在 `mmdet3d/engine/hooks/__init__.py` 中被导入，从而被找到并且被添加到注册器中：
+
+  ```python
+  from .my_hook import MyHook
+  ```
+
+- 在配置中使用 `custom_imports` 来人为地导入新钩子。
+
+  ```python
+  custom_imports = dict(imports=['mmdet3d.engine.hooks.my_hook'], allow_failed_imports=False)
+  ```
+
+#### 3. 更改配置文件
+
+```python
+custom_hooks = [
+    dict(type='MyHook', a=a_value, b=b_value)
+]
+```
+
+您可以将字段 `priority` 设置为 `'NORMAL'` 或者 `'HIGHEST'` 来设置钩子的优先级，如下所示：
+
+```python
+custom_hooks = [
+    dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL')
+]
+```
+
+默认情况下，注册阶段钩子的优先级为 `'NORMAL'`。
+
+### 使用 MMDetection3D 中实现的钩子
+
+如果 MMDetection3D 中已经实现了该钩子，您可以直接通过更改配置文件来使用该钩子。
+
+#### 例子：`DisableObjectSampleHook`
+
+我们实现了一个名为 [DisableObjectSampleHook](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/engine/hooks/disable_object_sample_hook.py) 的自定义钩子在训练阶段达到指定 epoch 后禁用 `ObjectSample` 增强策略。
+
+如果有需要的话我们可以在配置文件中设置它：
+
+```python
+custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)]
+```
+
+### 更改默认的运行时钩子
+
+有一些常用的钩子通过 `default_hooks` 注册，它们是：
+
+- `IterTimerHook`：该钩子用来记录加载数据的时间 'data_time' 和模型训练一步的时间 'time'。
+- `LoggerHook`：该钩子用来从`执行器（Runner）`的不同组件收集日志并将其写入终端，json 文件，tensorboard 和 wandb 等。
+- `ParamSchedulerHook`：该钩子用来更新优化器中的一些超参数，例如学习率和动量。
+- `CheckpointHook`：该钩子用来定期地保存检查点。
+- `DistSamplerSeedHook`：该钩子用来设置采样和批采样的种子。
+- `Det3DVisualizationHook`：该钩子用来可视化验证和测试过程的预测结果。
+
+`IterTimerHook`，`ParamSchedulerHook` 和 `DistSamplerSeedHook` 都很简单，通常不需要修改，因此此处我们将介绍如何使用 `LoggerHook`，`CheckpointHook` 和 `Det3DVisualizationHook`。
+
+#### CheckpointHook
+
+除了定期地保存检查点，[`CheckpointHook`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L18) 提供了其它的可选项例如 `max_keep_ckpts`，`save_optimizer` 等。用户可以设置 `max_keep_ckpts` 只保存少量的检查点或者通过 `save_optimizer` 决定是否保存优化器的状态。参数的更多细节请参考[此处](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L18)。
+
+```python
+default_hooks = dict(
+    checkpoint=dict(
+        type='CheckpointHook',
+        interval=1,
+        max_keep_ckpts=3,
+        save_optimizer=True))
+```
+
+#### LoggerHook
+
+`LoggerHook` 允许设置日志记录间隔。详细介绍可参考[文档](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py#L19)。
+
+```python
+default_hooks = dict(logger=dict(type='LoggerHook', interval=50))
+```
+
+#### Det3DVisualizationHook
+
+`Det3DVisualizationHook` 使用 `DetLocalVisualizer` 来可视化预测结果，`Det3DLocalVisualizer` 支持不同的后端，例如 `TensorboardVisBackend` 和 `WandbVisBackend`（更多细节请参考[文档](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py)）。用户可以添加多个后端来进行可视化，如下所示。
+
+```python
+default_hooks = dict(
+    visualization=dict(type='Det3DVisualizationHook', draw=True))
+
+vis_backends = [dict(type='LocalVisBackend'),
+                dict(type='TensorboardVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+```
diff --git a/mmde/docs/zh_cn/advanced_guides/datasets/index.rst b/mmde/docs/zh_cn/advanced_guides/datasets/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dc416ef57967e143606f37884eda5496e47dd214
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/datasets/index.rst
@@ -0,0 +1,11 @@
+.. toctree::
+   :maxdepth: 3
+
+   kitti.md
+   nuscenes.md
+   lyft.md
+   waymo.md
+   sunrgbd.md
+   scannet.md
+   s3dis.md
+   semantickitti.md
diff --git a/mmde/docs/zh_cn/advanced_guides/datasets/kitti.md b/mmde/docs/zh_cn/advanced_guides/datasets/kitti.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b86215b5f56a80be2bca05e221e9e14f2aab598
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/datasets/kitti.md
@@ -0,0 +1,206 @@
+# KITTI 数据集
+
+本页提供了有关在 MMDetection3D 中使用 KITTI 数据集的具体教程。
+
+## 数据准备
+
+您可以在[这里](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d)下载 KITTI 3D 检测数据并解压缩所有 zip 文件。此外，您可以在[这里](https://download.openmmlab.com/mmdetection3d/data/train_planes.zip)下载道路平面信息，其在训练过程中作为一个可选项，用来提高模型的性能。道路平面信息由 [AVOD](https://github.com/kujason/avod) 生成，更多细节请参考[此处](https://github.com/kujason/avod/issues/19)。
+
+像准备数据集的一般方法一样，建议将数据集根目录链接到 `$MMDETECTION3D/data`。
+
+在我们处理之前，文件夹结构应按如下方式组织：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── kitti
+│   │   ├── ImageSets
+│   │   ├── testing
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── velodyne
+│   │   ├── training
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── label_2
+│   │   │   ├── velodyne
+│   │   │   ├── planes (optional)
+```
+
+### 创建 KITTI 数据集
+
+为了创建 KITTI 点云数据，首先需要加载原始的点云数据并生成相关的包含目标标签和标注框的数据标注文件，同时还需要为 KITTI 数据集生成每个单独的训练目标的点云数据，并将其存储在 `data/kitti/kitti_gt_database` 的 `.bin` 格式的文件中，此外，需要为训练数据或者验证数据生成 `.pkl` 格式的包含数据信息的文件。随后，通过运行下面的命令来创建最终的 KITTI 数据：
+
+```bash
+mkdir ./data/kitti/ && mkdir ./data/kitti/ImageSets
+
+# 下载数据划分
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/test.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/test.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/train.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/train.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/val.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/val.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/trainval.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/trainval.txt
+
+python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti --with-plane
+```
+
+需要注意的是，如果您的本地磁盘没有充足的存储空间来存储转换后的数据，您可以通过改变 `--out-dir` 来指定其他任意的存储路径。如果您没有准备 `planes` 数据，您需要移除 `--with-plane` 标志。
+
+处理后的文件夹结构应该如下：
+
+```
+kitti
+├── ImageSets
+│   ├── test.txt
+│   ├── train.txt
+│   ├── trainval.txt
+│   ├── val.txt
+├── testing
+│   ├── calib
+│   ├── image_2
+│   ├── velodyne
+│   ├── velodyne_reduced
+├── training
+│   ├── calib
+│   ├── image_2
+│   ├── label_2
+│   ├── velodyne
+│   ├── velodyne_reduced
+│   ├── planes (optional)
+├── kitti_gt_database
+│   ├── xxxxx.bin
+├── kitti_infos_train.pkl
+├── kitti_infos_val.pkl
+├── kitti_dbinfos_train.pkl
+├── kitti_infos_test.pkl
+├── kitti_infos_trainval.pkl
+```
+
+- `kitti_gt_database/xxxxx.bin`：训练数据集中包含在 3D 标注框中的点云数据。
+- `kitti_infos_train.pkl`：训练数据集，该字典包含了两个键值：`metainfo` 和 `data_list`。`metainfo` 包含数据集的基本信息，例如 `categories`, `dataset` 和 `info_version`。`data_list` 是由字典组成的列表，每个字典（以下简称 `info`）包含了单个样本的所有详细信息。
+  - info\['sample_idx'\]：该样本在整个数据集的索引。
+  - info\['images'\]：多个相机捕获的图像信息。是一个字典，包含 5 个键值：`CAM0`, `CAM1`, `CAM2`, `CAM3`, `R0_rect`。
+    - info\['images'\]\['R0_rect'\]：校准旋转矩阵，是一个 4x4 数组。
+    - info\['images'\]\['CAM2'\]：包含 `CAM2` 相机传感器的信息。
+      - info\['images'\]\['CAM2'\]\['img_path'\]：图像的文件名。
+      - info\['images'\]\['CAM2'\]\['height'\]：图像的高。
+      - info\['images'\]\['CAM2'\]\['width'\]：图像的宽。
+      - info\['images'\]\['CAM2'\]\['cam2img'\]：相机到图像的变换矩阵，是一个 4x4 数组。
+      - info\['images'\]\['CAM2'\]\['lidar2cam'\]：激光雷达到相机的变换矩阵，是一个 4x4 数组。
+      - info\['images'\]\['CAM2'\]\['lidar2img'\]：激光雷达到图像的变换矩阵，是一个 4x4 数组。
+    - info\['lidar_points'\]：是一个字典，包含了激光雷达点相关的信息。
+      - info\['lidar_points'\]\['lidar_path'\]：激光雷达点云数据的文件名。
+      - info\['lidar_points'\]\['num_pts_feats'\]：点的特征维度。
+      - info\['lidar_points'\]\['Tr_velo_to_cam'\]：Velodyne 坐标到相机坐标的变换矩阵，是一个 4x4 数组。
+      - info\['lidar_points'\]\['Tr_imu_to_velo'\]：IMU 坐标到 Velodyne 坐标的变换矩阵，是一个 4x4 数组。
+    - info\['instances'\]：是一个字典组成的列表。每个字典包含单个实例的所有标注信息。对于其中的第 i 个实例，我们有：
+      - info\['instances'\]\[i\]\['bbox'\]：长度为 4 的列表，以 (x1, y1, x2, y2) 的顺序表示实例的 2D 边界框。
+      - info\['instances'\]\[i\]\['bbox_3d'\]：长度为 7 的列表，以 (x, y, z, l, h, w, yaw) 的顺序表示实例的 3D 边界框。
+      - info\['instances'\]\[i\]\['bbox_label'\]：是一个整数，表示实例的 2D 标签，-1 代表忽略。
+      - info\['instances'\]\[i\]\['bbox_label_3d'\]：是一个整数，表示实例的 3D 标签，-1 代表忽略。
+      - info\['instances'\]\[i\]\['depth'\]：3D 边界框投影到相关图像平面的中心点的深度。
+      - info\['instances'\]\[i\]\['num_lidar_pts'\]：3D 边界框内的激光雷达点数。
+      - info\['instances'\]\[i\]\['center_2d'\]：3D 边界框投影的 2D 中心。
+      - info\['instances'\]\[i\]\['difficulty'\]：KITTI 官方定义的困难度，包括简单、适中、困难。
+      - info\['instances'\]\[i\]\['truncated'\]：从 0（非截断）到 1（截断）的浮点数，其中截断指的是离开检测图像边界的检测目标。
+      - info\['instances'\]\[i\]\['occluded'\]：整数 (0,1,2,3) 表示目标的遮挡状态：0 = 完全可见，1 = 部分遮挡，2 = 大面积遮挡，3 = 未知。
+      - info\['instances'\]\[i\]\['group_ids'\]：用于多部分的物体。
+    - info\['plane'\]（可选）：地平面信息。
+
+更多细节请参考 [kitti_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/kitti_converter.py) 和 [update_infos_to_v2.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/update_infos_to_v2.py)。
+
+## 训练流程
+
+下面展示了一个使用 KITTI 数据集进行 3D 目标检测的典型流程：
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4, # x, y, z, intensity
+        use_dim=4),
+    dict(
+        type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+- 数据增强：
+  - `ObjectNoise`：对场景中的每个真实标注框目标添加噪音。
+  - `RandomFlip3D`：对输入点云数据进行随机地水平翻转或者垂直翻转。
+  - `GlobalRotScaleTrans`：对输入点云数据进行旋转。
+
+## 评估
+
+使用 8 个 GPU 以及 KITTI 指标评估的 PointPillars 的示例如下：
+
+```shell
+bash tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class/latest.pth 8
+```
+
+## 度量指标
+
+KITTI 官方使用全类平均精度（mAP）和平均方向相似度（AOS）来评估 3D 目标检测的性能，更多细节请参考[官方网站](http://www.cvlibs.net/datasets/kitti/eval_3dobject.php)和[论文](http://www.cvlibs.net/publications/Geiger2012CVPR.pdf)。
+
+MMDetection3D 采用相同的方法在 KITTI 数据集上进行评估，下面展示了一个评估结果的例子：
+
+```
+Car AP@0.70, 0.70, 0.70:
+bbox AP:97.9252, 89.6183, 88.1564
+bev  AP:90.4196, 87.9491, 85.1700
+3d   AP:88.3891, 77.1624, 74.4654
+aos  AP:97.70, 89.11, 87.38
+Car AP@0.70, 0.50, 0.50:
+bbox AP:97.9252, 89.6183, 88.1564
+bev  AP:98.3509, 90.2042, 89.6102
+3d   AP:98.2800, 90.1480, 89.4736
+aos  AP:97.70, 89.11, 87.38
+```
+
+## 测试和提交
+
+使用 8 个 GPU 在 KITTI 上测试 PointPillars 并生成对排行榜的提交的示例如下：
+
+- 首先，你需要在你的配置文件中修改 `test_dataloader` 和 `test_evaluator` 字典，如下所示：
+
+  ```python
+  data_root = 'data/kitti/'
+  test_dataloader = dict(
+      dataset=dict(
+          ann_file='kitti_infos_test.pkl',
+          load_eval_anns=False,
+          data_prefix=dict(pts='testing/velodyne_reduced')))
+  test_evaluator = dict(
+      ann_file=data_root + 'kitti_infos_test.pkl',
+      format_only=True,
+      pklfile_prefix='results/kitti-3class/kitti_results',
+      submission_prefix='results/kitti-3class/kitti_results')
+  ```
+
+- 接下来，你可以运行如下测试脚本。
+
+  ```shell
+  ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class/latest.pth 8
+  ```
+
+在生成 `results/kitti-3class/kitti_results/xxxxx.txt` 后，您可以提交这些文件到 KITTI 官方网站进行基准测试，更多细节请参考 [KITTI 官方网站](http://www.cvlibs.net/datasets/kitti/index.php)。
diff --git a/mmde/docs/zh_cn/advanced_guides/datasets/lyft.md b/mmde/docs/zh_cn/advanced_guides/datasets/lyft.md
new file mode 100644
index 0000000000000000000000000000000000000000..eab8b5299a306b30cd61b78ee052507867cd6dcf
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/datasets/lyft.md
@@ -0,0 +1,195 @@
+# Lyft 数据集
+
+本页提供了有关在 MMDetection3D 中使用 Lyft 数据集的具体教程。
+
+## 准备之前
+
+您可以在[这里](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data)下载 Lyft 3D 检测数据并解压缩所有 zip 文件。
+
+像准备数据集的一般方法一样，建议将数据集根目录链接到 `$MMDETECTION3D/data`。
+
+在进行处理之前，文件夹结构应按如下方式组织：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── lyft
+│   │   ├── v1.01-train
+│   │   │   ├── v1.01-train (train_data)
+│   │   │   ├── lidar (train_lidar)
+│   │   │   ├── images (train_images)
+│   │   │   ├── maps (train_maps)
+│   │   ├── v1.01-test
+│   │   │   ├── v1.01-test (test_data)
+│   │   │   ├── lidar (test_lidar)
+│   │   │   ├── images (test_images)
+│   │   │   ├── maps (test_maps)
+│   │   ├── train.txt
+│   │   ├── val.txt
+│   │   ├── test.txt
+│   │   ├── sample_submission.csv
+```
+
+其中 `v1.01-train` 和 `v1.01-test` 包含与 nuScenes 数据集相同的元文件，`.txt` 文件包含数据划分的信息。Lyft 不提供训练集和验证集的官方划分方案，因此 MMDetection3D 对不同场景下的不同类别的目标数量进行分析，并提供了一个数据集划分方案。`sample_submission.csv` 是用于提交到 Kaggle 评估服务器的基本文件。需要注意的是，我们遵循了 Lyft 最初的文件夹命名以实现更清楚的文件组织。请将下载下来的原始文件夹按照上述组织结构重新命名。
+
+## 数据准备
+
+组织 Lyft 数据集的方式和组织 nuScenes 的方式相同，首先会生成几乎具有相同结构的 `.pkl` 文件，接着需要重点关注这两个数据集之间的不同点，更多关于数据集信息文件结构的说明请参考 [nuScenes 教程](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/zh_cn/advanced_guides/datasets/nuscenes_det.md)。
+
+请通过运行下面的命令来生成 Lyft 的数据集信息文件：
+
+```bash
+python tools/create_data.py lyft --root-path ./data/lyft --out-dir ./data/lyft --extra-tag lyft --version v1.01
+python tools/data_converter/lyft_data_fixer.py --version v1.01 --root-folder ./data/lyft
+```
+
+请注意，上面的第二行命令用于修复损坏的 lidar 数据文件，更多细节请参考此处[讨论](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000)。
+
+处理后的文件夹结构应该如下：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── lyft
+│   │   ├── v1.01-train
+│   │   │   ├── v1.01-train (train_data)
+│   │   │   ├── lidar (train_lidar)
+│   │   │   ├── images (train_images)
+│   │   │   ├── maps (train_maps)
+│   │   ├── v1.01-test
+│   │   │   ├── v1.01-test (test_data)
+│   │   │   ├── lidar (test_lidar)
+│   │   │   ├── images (test_images)
+│   │   │   ├── maps (test_maps)
+│   │   ├── train.txt
+│   │   ├── val.txt
+│   │   ├── test.txt
+│   │   ├── sample_submission.csv
+│   │   ├── lyft_infos_train.pkl
+│   │   ├── lyft_infos_val.pkl
+│   │   ├── lyft_infos_test.pkl
+```
+
+- `lyft_infos_train.pkl`：训练数据集信息，该字典包含两个关键字：`metainfo` 和 `data_list`。`metainfo` 包含数据集的基本信息，例如 `categories`, `dataset` 和 `info_version`。`data_list` 是由字典组成的列表，每个字典（以下简称 `info`）包含了单个样本的所有详细信息。
+  - info\['sample_idx'\]：样本在整个数据集的索引。
+  - info\['token'\]：样本数据标记。
+  - info\['timestamp'\]：样本数据时间戳。
+  - info\['lidar_points'\]：是一个字典，包含了所有与激光雷达点相关的信息。
+    - info\['lidar_points'\]\['lidar_path'\]：激光雷达点云数据的文件名。
+    - info\['lidar_points'\]\['num_pts_feats'\]：点的特征维度。
+    - info\['lidar_points'\]\['lidar2ego'\]：该激光雷达传感器到自车的变换矩阵。（4x4 列表）
+    - info\['lidar_points'\]\['ego2global'\]：自车到全局坐标的变换矩阵。（4x4 列表）
+  - info\['lidar_sweeps'\]：是一个列表，包含了扫描信息（没有标注的中间帧）。
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['data_path'\]：第 i 次扫描的激光雷达数据的文件路径。
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\[lidar2ego''\]：当前激光雷达传感器到自车在第 i 次扫描的变换矩阵。（4x4 列表）
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['ego2global'\]：自车在第 i 次扫描到全局坐标的变换矩阵。（4x4 列表）
+    - info\['lidar_sweeps'\]\[i\]\['lidar2sensor'\]：从当前帧主激光雷达到第 i 帧扫描激光雷达的变换矩阵。（4x4 列表）
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]：扫描数据的时间戳。
+    - info\['lidar_sweeps'\]\[i\]\['sample_data_token'\]：扫描样本数据标记。
+  - info\['images'\]：是一个字典，包含与每个相机对应的六个键值：`'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`。每个字典包含了对应相机的所有数据信息。
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]：图像的文件名。
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]：当 3D 点投影到图像平面时需要的内参信息相关的变换矩阵。（3x3 列表）
+    - info\['images'\]\['CAM_XXX'\]\['sample_data_token'\]：图像样本数据标记。
+    - info\['images'\]\['CAM_XXX'\]\['timestamp'\]：图像的时间戳。
+    - info\['images'\]\['CAM_XXX'\]\['cam2ego'\]：该相机传感器到自车的变换矩阵。（4x4 列表）
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]：激光雷达传感器到该相机的变换矩阵。（4x4 列表）
+  - info\['instances'\]：是一个字典组成的列表。每个字典包含单个实例的所有标注信息。对于其中的第 i 个实例，我们有：
+    - info\['instances'\]\[i\]\['bbox_3d'\]：长度为 7 的列表，以 (x, y, z, l, w, h, yaw) 的顺序表示实例在激光雷达坐标系下的 3D 边界框。
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]：整数从 0 开始表示实例的标签，其中 -1 代表忽略该类别。
+    - info\['instances'\]\[i\]\['bbox_3d_isvalid'\]：每个包围框是否有效。一般情况下，我们只将包含至少一个激光雷达或雷达点的 3D 框作为有效框。
+
+接下来将详细介绍 Lyft 数据集和 nuScenes 数据集之间的数据集信息文件中的不同点：
+
+- `lyft_database/xxxxx.bin` 文件不存在：由于真实标注框的采样对实验的影响可以忽略不计，在 Lyft 数据集中不会提取该目录和相关的 `.bin` 文件。
+
+- `lyft_infos_train.pkl`
+
+  - info\['instances'\]\[i\]\['velocity'\] 不存在：Lyft 数据集中不存在速度评估信息。
+  - info\['instances'\]\[i\]\['num_lidar_pts'\] 及 info\['instances'\]\[i\]\['num_radar_pts'\] 不存在。
+
+这里仅介绍存储在训练数据文件的数据记录信息。这同样适用于验证集和测试集（没有实例）。
+
+更多关于 `lyft_infos_xxx.pkl` 的结构信息请参考 [lyft_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/lyft_converter.py)。
+
+## 训练流程
+
+### 基于 LiDAR 的方法
+
+Lyft 上基于 LiDAR 的 3D 检测（包括多模态方法）的训练流程与 nuScenes 几乎相同，如下所示：
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+与 nuScenes 相似，在 Lyft 上进行训练的模型也需要 `LoadPointsFromMultiSweeps` 步骤来从连续帧中加载点云数据。另外，考虑到 Lyft 中所收集的激光雷达点的强度是无效的，因此将 `LoadPointsFromMultiSweeps` 中的 `use_dim` 默认值设置为 `[0, 1, 2, 4]`，其中前三个维度表示点的坐标，最后一个维度表示时间戳的差异。
+
+## 评估
+
+使用 8 个 GPU 以及 Lyft 指标评估的 PointPillars 的示例如下：
+
+```shell
+bash ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py checkpoints/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d_20210517_202818-fc6904c3.pth 8
+```
+
+## 度量指标
+
+Lyft 提出了一个更加严格的用以评估所预测的 3D 检测框的度量指标。判断一个预测框是否是正类的基本评判标准和 KITTI 一样，如基于 3D 交并比进行评估，然而，Lyft 采用与 COCO 相似的方式来计算平均精度 -- 计算 3D 交并比在 0.5-0.95 之间的不同阈值下的平均精度。实际上，重叠部分大于 0.7 的 3D 交并比是一项对于 3D 检测方法比较严格的标准，因此整体的性能似乎会偏低。相比于其他数据集，Lyft 上不同类别的标注不平衡是导致最终结果偏低的另一个重要原因。更多关于度量指标的定义请参考[官方网址](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/overview/evaluation)。
+
+这里将采用官方方法对 Lyft 进行评估，下面展示了一个评估结果的例子：
+
+```
++mAPs@0.5:0.95------+--------------+
+| class             | mAP@0.5:0.95 |
++-------------------+--------------+
+| animal            | 0.0          |
+| bicycle           | 0.099        |
+| bus               | 0.177        |
+| car               | 0.422        |
+| emergency_vehicle | 0.0          |
+| motorcycle        | 0.049        |
+| other_vehicle     | 0.359        |
+| pedestrian        | 0.066        |
+| truck             | 0.176        |
+| Overall           | 0.15         |
++-------------------+--------------+
+```
+
+## 测试和提交
+
+使用 8 个 GPU 在 Lyft 上测试 PointPillars 并生成对排行榜的提交的示例如下：
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-2x_lyft-3d.py work_dirs/pp-lyft/latest.pth 8 --cfg-options test_evaluator.jsonfile_prefix=work_dirs/pp-lyft/results_challenge  test_evaluator.csv_savepath=results/pp-lyft/results_challenge.csv
+```
+
+在生成 `work_dirs/pp-lyft/results_challenge.csv`，您可以将生成的文件提交到 Kaggle 评估服务器，请参考[官方网址](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles)获取更多细节。
+
+同时还可以使用可视化工具将预测结果进行可视化，更多细节请参考[可视化文档](https://mmdetection3d.readthedocs.io/zh_CN/latest/useful_tools.html#visualization)。
diff --git a/mmde/docs/zh_cn/advanced_guides/datasets/nuscenes.md b/mmde/docs/zh_cn/advanced_guides/datasets/nuscenes.md
new file mode 100644
index 0000000000000000000000000000000000000000..431af861598297cc98297f4cba37916fa0773ae7
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/datasets/nuscenes.md
@@ -0,0 +1,303 @@
+# NuScenes 数据集
+
+本页提供了有关在 MMDetection3D 中使用 nuScenes 数据集的具体教程。
+
+## 准备之前
+
+您可以在[这里](https://www.nuscenes.org/download)下载 nuScenes 3D 检测数据 Full dataset (v1.0) 并解压缩所有 zip 文件。
+
+如果您想进行 3D 语义分割任务，需要额外下载 nuScenes-lidarseg 数据标注，并将解压的文件放入 nuScenes 对应的文件夹下。
+
+**注意**：nuScenes-lidarseg 中的 v1.0trainval(test)/categroy.json 会替换原先 Full dataset (v1.0) 原先的 v1.0trainval(test)/categroy.json，但是不会对 3D 目标检测任务造成影响。
+
+像准备数据集的一般方法一样，建议将数据集根目录链接到 `$MMDETECTION3D/data`。
+
+在我们处理之前，文件夹结构应按如下方式组织。
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── lidarseg (optional)
+│   │   ├── v1.0-test
+|   |   ├── v1.0-trainval
+```
+
+## 数据准备
+
+我们通常需要通过特定样式来使用 `.pkl` 文件组织有用的数据信息。要为 nuScenes 准备这些文件，请运行以下命令：
+
+```bash
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
+```
+
+处理后的文件夹结构应该如下。
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── lidarseg (optional)
+│   │   ├── v1.0-test
+|   |   ├── v1.0-trainval
+│   │   ├── nuscenes_database
+│   │   ├── nuscenes_infos_train.pkl
+│   │   ├── nuscenes_infos_val.pkl
+│   │   ├── nuscenes_infos_test.pkl
+│   │   ├── nuscenes_dbinfos_train.pkl
+```
+
+- `nuscenes_database/xxxxx.bin`：训练数据集的每个 3D 包围框中包含的点云数据。
+- `nuscenes_infos_train.pkl`：训练数据集，该字典包含了两个键值：`metainfo` 和 `data_list`。`metainfo` 包含数据集的基本信息，例如 `categories`, `dataset` 和 `info_version`。`data_list` 是由字典组成的列表，每个字典（以下简称 `info`）包含了单个样本的所有详细信息。
+  - info\['sample_idx'\]：样本在整个数据集的索引。
+  - info\['token'\]：样本数据标记。
+  - info\['timestamp'\]：样本数据时间戳。
+  - info\['ego2global'\]：自车到全局坐标的变换矩阵。（4x4 列表）
+  - info\['lidar_points'\]：是一个字典，包含了所有与激光雷达点相关的信息。
+    - info\['lidar_points'\]\['lidar_path'\]：激光雷达点云数据的文件名。
+    - info\['lidar_points'\]\['num_pts_feats'\]：点的特征维度。
+    - info\['lidar_points'\]\['lidar2ego'\]：该激光雷达传感器到自车的变换矩阵。（4x4 列表）
+  - info\['lidar_sweeps'\]：是一个列表，包含了扫描信息（没有标注的中间帧）。
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['data_path'\]：第 i 次扫描的激光雷达数据的文件路径。
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\[lidar2ego''\]：当前激光雷达传感器到自车的变换矩阵。（4x4 列表）
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['ego2global'\]：自车到全局坐标的变换矩阵。（4x4 列表）
+    - info\['lidar_sweeps'\]\[i\]\['lidar2sensor'\]：从主激光雷达传感器到当前传感器（用于收集扫描数据）的变换矩阵。（4x4 列表）
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]：扫描数据的时间戳。
+    - info\['lidar_sweeps'\]\[i\]\['sample_data_token'\]：扫描样本数据标记。
+  - info\['images'\]：是一个字典，包含与每个相机对应的六个键值：`'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`。每个字典包含了对应相机的所有数据信息。
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]：图像的文件名。
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]：当 3D 点投影到图像平面时需要的内参信息相关的变换矩阵。（3x3 列表）
+    - info\['images'\]\['CAM_XXX'\]\['sample_data_token'\]：图像样本数据标记。
+    - info\['images'\]\['CAM_XXX'\]\['timestamp'\]：图像的时间戳。
+    - info\['images'\]\['CAM_XXX'\]\['cam2ego'\]：该相机传感器到自车的变换矩阵。（4x4 列表）
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]：激光雷达传感器到该相机的变换矩阵。（4x4 列表）
+  - info\['instances'\]：是一个字典组成的列表。每个字典包含单个实例的所有标注信息。对于其中的第 i 个实例，我们有：
+    - info\['instances'\]\[i\]\['bbox_3d'\]：长度为 7 的列表，以 (x, y, z, l, w, h, yaw) 的顺序表示实例的 3D 边界框。
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]：整数表示实例的标签，-1 代表忽略。
+    - info\['instances'\]\[i\]\['velocity'\]：3D 边界框的速度（由于不正确，没有垂直测量），大小为 (2, ) 的列表。
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]：每个 3D 边界框内包含的激光雷达点数。
+    - info\['instances'\]\[i\]\['num_radar_pts'\]：每个 3D 边界框内包含的雷达点数。
+    - info\['instances'\]\[i\]\['bbox_3d_isvalid'\]：每个包围框是否有效。一般情况下，我们只将包含至少一个激光雷达或雷达点的 3D 框作为有效框。
+  - info\['cam_instances'\]：是一个字典，包含以下键值：`'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`。对于基于视觉的 3D 目标检测任务，我们将整个场景的 3D 标注划分至它们所属于的相应相机中。对于其中的第 i 个实例，我们有：
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]：实例标签。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]：实例标签。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]：2D 边界框标注（3D 框投影的矩形框），顺序为 \[x1, y1, x2, y2\] 的列表。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]：3D 框投影到图像上的中心点，大小为 (2, ) 的列表。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]：3D 框投影中心的深度。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['velocity'\]：3D 边界框的速度（由于不正确，没有垂直测量），大小为 (2, ) 的列表。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['attr_label'\]：实例的属性标签。我们为属性分类维护了一个属性集合和映射。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]：长度为 7 的列表，以 (x, y, z, l, h, w, yaw) 的顺序表示实例的 3D 边界框。
+  - info\['pts_semantic_mask_path'\]：激光雷达语义分割标注的文件名。
+
+注意：
+
+1. `instances` 和 `cam_instances` 中 `bbox_3d` 的区别。`bbox_3d` 都被转换到 MMDet3D 定义的坐标系下，`instances` 中的 `bbox_3d` 是在激光雷达坐标系下，而 `cam_instances` 是在相机坐标系下。注意它们 3D 框中表示的不同（'l, w, h' 和 'l, h, w'）。
+
+2. 这里我们只解释训练信息文件中记录的数据。这同样适用于验证集和测试集（测试集的 `.pkl` 文件中不包含 `instances` 以及 `cam_instances`）。
+
+获取 `nuscenes_infos_xxx.pkl` 的核心函数为 [\_fill_trainval_infos](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/nuscenes_converter.py#L146)。更多细节请参考 [nuscenes_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/nuscenes_converter.py)。
+
+## 训练流程
+
+### 基于 LiDAR 的方法
+
+nuScenes 上基于 LiDAR 的 3D 检测（包括多模态方法）的典型训练流程如下。
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+与一般情况相比，nuScenes 有一个特定的 `'LoadPointsFromMultiSweeps'` 流水线来从连续帧加载点云。这是此设置中使用的常见做法。更多细节请参考 nuScenes [原始论文](https://arxiv.org/abs/1903.11027)。`'LoadPointsFromMultiSweeps'` 中的默认 `use_dim` 是 `[0, 1, 2, 4]`，其中前 3 个维度是指点坐标，最后一个是指时间戳差异。由于在拼接来自不同帧的点时使用点云的强度信息会产生噪声，因此默认情况下不使用点云的强度信息。
+
+### 基于视觉的方法
+
+#### 基于单目方法
+
+在NuScenes数据集中，对于多视角图像，单目检测范式通常由针对每张图像检测和输出 3D 检测结果以及通过后处理（例如 NMS ）得到最终检测结果两步组成。从本质上来说，这种范式直接将单目 3D 检测扩展到多视角任务。NuScenes 上基于图像的 3D 检测的典型训练流水线如下。
+
+```python
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D'),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='mmdet.Resize', scale=(1600, 900), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+```
+
+它遵循 2D 检测的一般流水线，但在一些细节上有所不同：
+
+- 它使用单目流水线加载图像，其中包括额外的必需信息，如相机内参矩阵。
+- 它需要加载 3D 标注。
+- 一些数据增强技术需要调整，例如`RandomFlip3D`。目前我们不支持更多的增强方法，因为如何迁移和应用其他技术仍在探索中。
+
+#### 基于BEV方法
+
+鸟瞰图，BEV（Bird's-Eye-View），是另一种常用的 3D 检测范式。它直接利用多个视角图像进行 3D 检测。对于 NuScenes 数据集而言，这些视角包括前方`CAM_FRONT`、左前方`CAM_FRONT_LEFT`、右前方`CAM_FRONT_RIGHT`、后方`CAM_BACK`、左后方`CAM_BACK_LEFT`、右后方`CAM_BACK_RIGHT`。一个基本的用于 BEV 方法的流水线如下。
+
+```python
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+train_transforms = [
+    dict(type='PhotoMetricDistortion3D'),
+    dict(
+        type='RandomResize3D',
+        scale=(1600, 900),
+        ratio_range=(1., 1.),
+        keep_ratio=True)
+]
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles',
+         to_float32=True,
+         num_views=6, ),
+    dict(type='LoadAnnotations3D',
+         with_bbox_3d=True,
+         with_label_3d=True,
+         with_attr_label=False),
+    # 可选，数据增强
+    dict(type='MultiViewWrapper', transforms=train_transforms),
+    # 可选, 筛选特定点云范围内物体
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    # 可选, 筛选特定类别物体
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+为了读取多个视角的图像，数据集也应进行相应微调。
+
+```python
+data_prefix = dict(
+    CAM_FRONT='samples/CAM_FRONT',
+    CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+    CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+    CAM_BACK='samples/CAM_BACK',
+    CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+    CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
+)
+train_dataloader = dict(
+    batch_size=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type="NuScenesDataset",
+        data_root="./data/nuScenes",
+        ann_file="nuscenes_infos_train.pkl",
+        data_prefix=data_prefix,
+        modality=dict(use_camera=True, use_lidar=False, ),
+        pipeline=train_pipeline,
+        test_mode=False, )
+)
+```
+
+## 评估
+
+使用 8 个 GPU 以及 nuScenes 指标评估的 PointPillars 的示例如下
+
+```shell
+bash ./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py checkpoints/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth 8
+```
+
+## 指标
+
+NuScenes 提出了一个综合指标，即 nuScenes 检测分数（NDS），以评估不同的方法并设置基准测试。它由平均精度（mAP）、平均平移误差（ATE）、平均尺度误差（ASE）、平均方向误差（AOE）、平均速度误差（AVE）和平均属性误差（AAE）组成。更多细节请参考其[官方网站](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any)。
+
+我们也采用这种方法对 nuScenes 进行评估。打印的评估结果示例如下：
+
+```
+mAP: 0.3197
+mATE: 0.7595
+mASE: 0.2700
+mAOE: 0.4918
+mAVE: 1.3307
+mAAE: 0.1724
+NDS: 0.3905
+Eval time: 170.8s
+
+Per-class results:
+Object Class    AP      ATE     ASE     AOE     AVE     AAE
+car     0.503   0.577   0.152   0.111   2.096   0.136
+truck   0.223   0.857   0.224   0.220   1.389   0.179
+bus     0.294   0.855   0.204   0.190   2.689   0.283
+trailer 0.081   1.094   0.243   0.553   0.742   0.167
+construction_vehicle    0.058   1.017   0.450   1.019   0.137   0.341
+pedestrian      0.392   0.687   0.284   0.694   0.876   0.158
+motorcycle      0.317   0.737   0.265   0.580   2.033   0.104
+bicycle 0.308   0.704   0.299   0.892   0.683   0.010
+traffic_cone    0.555   0.486   0.309   nan     nan     nan
+barrier 0.466   0.581   0.269   0.169   nan     nan
+```
+
+## 测试和提交
+
+使用 8 个 GPU 在 nuScenes 上测试 PointPillars 并生成对排行榜的提交的示例如下
+
+你需要在对应的配置文件中的 `test_evaluator` 里修改 `jsonfile_prefix`。举个例子，添加 `test_evaluator = dict(type='NuScenesMetric', jsonfile_prefix='work_dirs/pp-nus/results_eval.json')` 或在测试命令后使用 `--cfg-options "test_evaluator.jsonfile_prefix=work_dirs/pp-nus/results_eval.json)`。
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py work_dirs/pp-nus/latest.pth 8 --cfg-options 'test_evaluator.jsonfile_prefix=work_dirs/pp-nus/results_eval'
+```
+
+请注意，在[这里](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/nus-3d.py#L132)测试信息应更改为测试集而不是验证集。
+
+生成 `work_dirs/pp-nus/results_eval.json` 后，您可以压缩并提交给 nuScenes 基准测试。更多信息请参考 [nuScenes 官方网站](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any)。
+
+我们还可以使用我们开发的可视化工具将预测结果可视化。更多细节请参考[可视化文档](https://mmdetection3d.readthedocs.io/zh_CN/latest/useful_tools.html#id2)。
+
+## 注意
+
+### `NuScenesBox` 和我们的 `CameraInstanceBoxes` 之间的转换。
+
+总的来说，`NuScenesBox` 和我们的 `CameraInstanceBoxes` 的主要区别主要体现在转向角（yaw）定义上。 `NuScenesBox` 定义了一个四元数或三个欧拉角的旋转，而我们的由于实际情况只定义了一个转向角（yaw），它需要我们在预处理和后处理中手动添加一些额外的旋转，例如[这里](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L673)。
+
+另外，请注意，角点和位置的定义在 `NuScenesBox` 中是分离的。例如，在单目 3D 检测中，框位置的定义在其相机坐标中（有关汽车设置，请参阅其官方[插图](https://www.nuscenes.org/nuscenes#data-collection)），即与[我们的](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/cam_box3d.py)一致。相比之下，它的角点是通过[惯例](https://github.com/nutonomy/nuscenes-devkit/blob/02e9200218977193a1058dd7234f935834378319/python-sdk/nuscenes/utils/data_classes.py#L527) 定义的，“x 向前， y 向左， z 向上”。它导致了与我们的 `CameraInstanceBoxes` 不同的维度和旋转定义理念。一个移除相似冲突的例子是 PR [#744](https://github.com/open-mmlab/mmdetection3d/pull/744)。同样的问题也存在于 LiDAR 系统中。为了解决它们，我们通常会在预处理和后处理中添加一些转换，以保证在整个训练和推理过程中框都在我们的坐标系系统里。
diff --git a/mmde/docs/zh_cn/advanced_guides/datasets/s3dis.md b/mmde/docs/zh_cn/advanced_guides/datasets/s3dis.md
new file mode 100644
index 0000000000000000000000000000000000000000..7af223907517fd287ac83e66fbab398329d298d3
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/datasets/s3dis.md
@@ -0,0 +1,271 @@
+# S3DIS 数据集
+
+## 数据集的准备
+
+对于数据集准备的整体流程，请参考 S3DIS 的[指南](https://github.com/open-mmlab/mmdetection3d/blob/master/data/s3dis/README.md/)。
+
+### 提取 S3DIS 数据
+
+通过从原始数据中提取 S3DIS 数据，我们将点云数据读取并保存下相关的标注信息，例如语义分割标签和实例分割标签。
+
+数据提取前的目录结构应该如下所示：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── s3dis
+│   │   ├── meta_data
+│   │   ├── Stanford3dDataset_v1.2_Aligned_Version
+│   │   │   ├── Area_1
+│   │   │   │   ├── conferenceRoom_1
+│   │   │   │   ├── office_1
+│   │   │   │   ├── ...
+│   │   │   ├── Area_2
+│   │   │   ├── Area_3
+│   │   │   ├── Area_4
+│   │   │   ├── Area_5
+│   │   │   ├── Area_6
+│   │   ├── indoor3d_util.py
+│   │   ├── collect_indoor3d_data.py
+│   │   ├── README.md
+```
+
+在 `Stanford3dDataset_v1.2_Aligned_Version` 目录下，所有房间依据所属区域被分为 6 组。
+我们通常使用 5 个区域进行训练，然后在余下 1 个区域上进行测试 (被余下的 1 个区域通常为区域 5)。
+在每个区域的目录下包含有多个房间的文件夹，每个文件夹是一个房间的原始点云数据和相关的标注信息。
+例如，在 `Area_1/office_1` 目录下的文件如下所示：
+
+- `office_1.txt`：一个 txt 文件存储着原始点云数据每个点的坐标和颜色信息。
+
+- `Annotations/`：这个文件夹里包含有此房间中实例物体的信息 (以 txt 文件的形式存储)。每个 txt 文件表示一个实例，例如：
+
+  - `chair_1.txt`：存储有该房间中一把椅子的点云数据。
+
+  如果我们将 `Annotations/` 下的所有 txt 文件合并起来，得到的点云就和 `office_1.txt` 中的点云是一致的。
+
+你可以通过 `python collect_indoor3d_data.py` 指令进行 S3DIS 数据的提取。
+主要步骤包括：
+
+- 从原始 txt 文件中读取点云数据、语义分割标签和实例分割标签。
+- 将点云数据和相关标注文件存储下来。
+
+这其中的核心函数 `indoor3d_util.py` 中的 `export` 函数实现如下：
+
+```python
+def export(anno_path, out_filename):
+    """将原始数据集的文件转化为点云、语义分割标签和实例分割掩码文件。
+    我们将同一房间中所有实例的点进行聚合。
+
+    参数列表:
+        anno_path (str): 标注信息的路径，例如 Area_1/office_2/Annotations/
+        out_filename (str): 保存点云和标签的路径
+        file_format (str): txt 或 numpy，指定保存的文件格式
+
+    注意:
+        点云在处理过程中被整体移动了，保存下的点最小位于原点 (即没有负数坐标值)
+    """
+    points_list = []
+    ins_idx = 1  # 实例标签从 1 开始，因此最终实例标签为 0 的点就是无标注的点
+
+    # `anno_path` 的一个例子：Area_1/office_1/Annotations
+    # 其中以 txt 文件存储有该房间中所有实例物体的点云
+    for f in glob.glob(osp.join(anno_path, '*.txt')):
+        # get class name of this instance
+        one_class = osp.basename(f).split('_')[0]
+        if one_class not in class_names:  # 某些房间有 'staris' 类物体
+            one_class = 'clutter'
+        points = np.loadtxt(f)
+        labels = np.ones((points.shape[0], 1)) * class2label[one_class]
+        ins_labels = np.ones((points.shape[0], 1)) * ins_idx
+        ins_idx += 1
+        points_list.append(np.concatenate([points, labels, ins_labels], 1))
+
+    data_label = np.concatenate(points_list, 0)  # [N, 8], (pts, rgb, sem, ins)
+    # 将点云对齐到原点
+    xyz_min = np.amin(data_label, axis=0)[0:3]
+    data_label[:, 0:3] -= xyz_min
+
+    np.save(f'{out_filename}_point.npy', data_label[:, :6].astype(np.float32))
+    np.save(f'{out_filename}_sem_label.npy', data_label[:, 6].astype(np.int64))
+    np.save(f'{out_filename}_ins_label.npy', data_label[:, 7].astype(np.int64))
+
+```
+
+上述代码中，我们读取 `Annotations/` 下的所有点云实例，将其合并得到整体房屋的点云，同时生成语义/实例分割的标签。
+在提取完每个房间的数据后，点云、语义分割和实例分割的标签文件应以 `.npy` 的格式被保存下来。
+
+### 创建数据集
+
+```shell
+python tools/create_data.py s3dis --root-path ./data/s3dis \
+--out-dir ./data/s3dis --extra-tag s3dis
+```
+
+上述指令首先读取以 `.npy` 格式存储的点云、语义分割和实例分割标签文件，然后进一步将它们以 `.bin` 格式保存。
+同时，每个区域 `.pkl` 格式的信息文件也会被保存下来。
+
+数据预处理后的目录结构如下所示：
+
+```
+s3dis
+├── meta_data
+├── indoor3d_util.py
+├── collect_indoor3d_data.py
+├── README.md
+├── Stanford3dDataset_v1.2_Aligned_Version
+├── s3dis_data
+├── points
+│   ├── xxxxx.bin
+├── instance_mask
+│   ├── xxxxx.bin
+├── semantic_mask
+│   ├── xxxxx.bin
+├── seg_info
+│   ├── Area_1_label_weight.npy
+│   ├── Area_1_resampled_scene_idxs.npy
+│   ├── Area_2_label_weight.npy
+│   ├── Area_2_resampled_scene_idxs.npy
+│   ├── Area_3_label_weight.npy
+│   ├── Area_3_resampled_scene_idxs.npy
+│   ├── Area_4_label_weight.npy
+│   ├── Area_4_resampled_scene_idxs.npy
+│   ├── Area_5_label_weight.npy
+│   ├── Area_5_resampled_scene_idxs.npy
+│   ├── Area_6_label_weight.npy
+│   ├── Area_6_resampled_scene_idxs.npy
+├── s3dis_infos_Area_1.pkl
+├── s3dis_infos_Area_2.pkl
+├── s3dis_infos_Area_3.pkl
+├── s3dis_infos_Area_4.pkl
+├── s3dis_infos_Area_5.pkl
+├── s3dis_infos_Area_6.pkl
+```
+
+- `points/xxxxx.bin`：提取的点云数据。
+- `instance_mask/xxxxx.bin`：每个点云的实例标签，取值范围为 \[0, ${实例个数}\]，其中 0 代表未标注的点。
+- `semantic_mask/xxxxx.bin`：每个点云的语义标签，取值范围为 \[0, 12\]。
+- `s3dis_infos_Area_1.pkl`：区域 1 的数据信息，每个房间的详细信息如下：
+  - info\['point_cloud'\]: {'num_features': 6, 'lidar_idx': sample_idx}.
+  - info\['pts_path'\]: `points/xxxxx.bin` 点云的路径。
+  - info\['pts_instance_mask_path'\]: `instance_mask/xxxxx.bin` 实例标签的路径。
+  - info\['pts_semantic_mask_path'\]: `semantic_mask/xxxxx.bin` 语义标签的路径。
+- `seg_info`：为支持语义分割任务所生成的信息文件。
+  - `Area_1_label_weight.npy`：每一语义类别的权重系数。因为 S3DIS 中属于不同类的点的数量相差很大，一个常见的操作是在计算损失时对不同类别进行加权 (label re-weighting) 以得到更好的分割性能。
+  - `Area_1_resampled_scene_idxs.npy`：每一个场景 (房间) 的重采样标签。在训练过程中，我们依据每个场景的点的数量，会对其进行不同次数的重采样，以保证训练数据均衡。
+
+## 训练流程
+
+S3DIS 上 3D 语义分割的一种典型数据载入流程如下所示：
+
+```python
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+num_points = 4096
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.0,
+        ignore_index=None,
+        use_normalized_coord=True,
+        enlarge_size=None,
+        min_unique_num=num_points // 4,
+        eps=0.0),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-3.141592653589793, 3.141592653589793],  # [-pi, pi]
+        scale_ratio_range=[0.8, 1.2],
+        translation_std=[0, 0, 0]),
+    dict(
+        type='RandomJitterPoints',
+        jitter_std=[0.01, 0.01, 0.01],
+        clip_range=[-0.05, 0.05]),
+    dict(type='RandomDropPointsColor', drop_ratio=0.2),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+```
+
+- `PointSegClassMapping`：在训练过程中，只有被使用的类别的序号会被映射到类似 \[0, 13) 范围内的类别标签。其余的类别序号会被转换为 `ignore_index` 所制定的忽略标签，在本例中是 `13`。
+- `IndoorPatchPointSample`：从输入点云中裁剪一个含有固定数量点的小块 (patch)。`block_size` 指定了裁剪块的边长，在 S3DIS 上这个数值一般设置为 `1.0`。
+- `NormalizePointsColor`：将输入点的颜色信息归一化，通过将 RGB 值除以 `255` 来实现。
+- 数据增广：
+  - `GlobalRotScaleTrans`：对输入点云进行随机旋转和放缩变换。
+  - `RandomJitterPoints`：通过对每一个点施加不同的噪声向量以实现对点云的随机扰动。
+  - `RandomDropPointsColor`：以 `drop_ratio` 的概率随机将点云的颜色值全部置零。
+
+## 度量指标
+
+通常我们使用平均交并比 (mean Intersection over Union, mIoU) 作为 S3DIS 语义分割任务的度量指标。
+具体而言，我们先计算所有类别的 IoU，然后取平均值作为 mIoU。
+更多实现细节请参考 [seg_eval.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py)。
+
+正如在 `提取 S3DIS 数据` 一节中所提及的，S3DIS 通常在 5 个区域上进行训练，然后在余下的 1 个区域上进行测试。但是在其他论文中，也有不同的划分方式。
+为了便于灵活划分训练和测试的子集，我们首先定义子数据集 (sub-dataset) 来表示每一个区域，然后根据区域划分对其进行合并，以得到完整的训练集。
+以下是在区域 1、2、3、4、6 上训练并在区域 5 上测试的一个配置文件例子：
+
+```python
+dataset_type = 'S3DISSegDataset'
+data_root = './data/s3dis/'
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=train_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ],
+        test_mode=False))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy',
+        test_mode=True))
+val_dataloader = test_dataloader
+```
+
+可以看到，我们通过将多个相应路径构成的列表 (list) 输入 `ann_files` 和 `scene_idxs` 以实现训练测试集的划分。
+如果修改训练测试区域的划分，只需要简单修改 `train_area` 和 `test_area` 即可。
diff --git a/mmde/docs/zh_cn/advanced_guides/datasets/scannet.md b/mmde/docs/zh_cn/advanced_guides/datasets/scannet.md
new file mode 100644
index 0000000000000000000000000000000000000000..bbba255a2c3eb12f4733c41a97d70a4a05e4d2f6
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/datasets/scannet.md
@@ -0,0 +1,354 @@
+# ScanNet 数据集
+
+MMDetection3D 支持在 ScanNet 数据集上进行 3D 目标检测\\语义分割 任务。本页提供了有关在 MMDetection3D 中使用 ScanNet 数据集的具体教程。
+
+## 数据集准备
+
+请参考 ScanNet 的[指南](https://github.com/open-mmlab/mmdetection3d/blob/master/data/scannet/README.md)以查看总体流程。
+
+### 提取 ScanNet 点云数据
+
+通过提取 ScanNet 数据，我们加载原始点云文件，并生成包括语义标签、实例标签和真实物体包围框在内的相关标注。
+
+```shell
+python batch_load_scannet_data.py
+```
+
+数据处理之前的文件目录结构如下：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── scannet
+│   │   ├── meta_data
+│   │   ├── scans
+│   │   │   ├── scenexxxx_xx
+│   │   ├── batch_load_scannet_data.py
+│   │   ├── load_scannet_data.py
+│   │   ├── scannet_utils.py
+│   │   ├── README.md
+```
+
+在 `scans` 文件夹下总共有 1201 个训练样本文件夹和 312 个验证样本文件夹，其中存有未处理的点云数据和相关的标注。比如说，在文件夹 `scene0001_01` 下文件是这样组织的：
+
+- `scene0001_01_vh_clean_2.ply`：存有每个顶点坐标和颜色的网格文件。网格的顶点被直接用作未处理的点云数据。
+- `scene0001_01.aggregation.json`：包含物体 ID、分割部分 ID、标签的标注文件。
+- `scene0001_01_vh_clean_2.0.010000.segs.json`：包含分割部分 ID 和顶点的分割标注文件。
+- `scene0001_01.txt`：包括对齐矩阵等的元文件。
+- `scene0001_01_vh_clean_2.labels.ply`：包含每个顶点类别的标注文件。
+
+通过运行 `python batch_load_scannet_data.py` 来提取 ScanNet 数据的处理过程主要包含以下几步：
+
+- 从原始文件中提取出点云、实例标签、语义标签和包围框标签文件。
+- 下采样原始点云并过滤掉不合法的类别。
+- 保存处理后的点云数据和相关的标注文件。
+
+`load_scannet_data.py` 中的核心函数 `export` 如下：
+
+```python
+def export(mesh_file,
+           agg_file,
+           seg_file,
+           meta_file,
+           label_map_file,
+           output_file=None,
+           test_mode=False):
+
+    # 标签映射文件：./data/scannet/meta_data/scannetv2-labels.combined.tsv
+    # 该标签映射文件中有多种标签标准，比如 'nyu40id'
+    label_map = scannet_utils.read_label_mapping(
+        label_map_file, label_from='raw_category', label_to='nyu40id')
+    # 加载原始点云数据，特征包括6维：XYZRGB
+    mesh_vertices = scannet_utils.read_mesh_vertices_rgb(mesh_file)
+
+    # 加载场景坐标轴对齐矩阵：一个 4x4 的变换矩阵
+    # 将传感器坐标系下的原始点转化到另一个坐标系下
+    # 该坐标系与房屋的两边平行（也就是与坐标轴平行）
+    lines = open(meta_file).readlines()
+    # 测试集的数据没有对齐矩阵
+    axis_align_matrix = np.eye(4)
+    for line in lines:
+        if 'axisAlignment' in line:
+            axis_align_matrix = [
+                float(x)
+                for x in line.rstrip().strip('axisAlignment = ').split(' ')
+            ]
+            break
+    axis_align_matrix = np.array(axis_align_matrix).reshape((4, 4))
+
+    # 对网格顶点进行全局的对齐
+    pts = np.ones((mesh_vertices.shape[0], 4))
+    # 同种类坐标下的原始点云，每一行的数据是 [x, y, z, 1]
+    pts[:, 0:3] = mesh_vertices[:, 0:3]
+    # 将原始网格顶点转换为对齐后的顶点
+    pts = np.dot(pts, axis_align_matrix.transpose())  # Nx4
+    aligned_mesh_vertices = np.concatenate([pts[:, 0:3], mesh_vertices[:, 3:]],
+                                           axis=1)
+
+    # 加载语义与实例标签
+    if not test_mode:
+        # 每个物体都有一个语义标签，并且包含几个分割部分
+        object_id_to_segs, label_to_segs = read_aggregation(agg_file)
+        # 很多点属于同一分割部分
+        seg_to_verts, num_verts = read_segmentation(seg_file)
+        label_ids = np.zeros(shape=(num_verts), dtype=np.uint32)
+        object_id_to_label_id = {}
+        for label, segs in label_to_segs.items():
+            label_id = label_map[label]
+            for seg in segs:
+                verts = seg_to_verts[seg]
+                # 每个点都有一个语义标签
+                label_ids[verts] = label_id
+        instance_ids = np.zeros(
+            shape=(num_verts), dtype=np.uint32)  # 0：未标注的
+        for object_id, segs in object_id_to_segs.items():
+            for seg in segs:
+                verts = seg_to_verts[seg]
+                # object_id 从 1 开始计数，比如 1,2,3,.,,,.NUM_INSTANCES
+                # 每个点都属于一个物体
+                instance_ids[verts] = object_id
+                if object_id not in object_id_to_label_id:
+                    object_id_to_label_id[object_id] = label_ids[verts][0]
+        # 包围框格式为 [x, y, z, dx, dy, dz, label_id]
+        # [x, y, z] 是包围框的重力中心, [dx, dy, dz] 是与坐标轴平行的
+        # [label_id] 是 'nyu40id' 标准下的语义标签
+        # 注意：因为三维包围框是与坐标轴平行的，所以旋转角是 0
+        unaligned_bboxes = extract_bbox(mesh_vertices, object_id_to_segs,
+                                        object_id_to_label_id, instance_ids)
+        aligned_bboxes = extract_bbox(aligned_mesh_vertices, object_id_to_segs,
+                                      object_id_to_label_id, instance_ids)
+    ...
+
+    return mesh_vertices, label_ids, instance_ids, unaligned_bboxes, \
+        aligned_bboxes, object_id_to_label_id, axis_align_matrix
+
+```
+
+在从每个场景的扫描文件提取数据后，如果原始点云点数过多，可以将其下采样（比如到 50000 个点），但在三维语义分割任务中，点云不会被下采样。此外，在 `nyu40id` 标准之外的不合法语义标签或者可选的 `DONOT CARE` 类别标签应被过滤。最终，点云文件、语义标签、实例标签和真实物体的集合应被存储于 `.npy` 文件中。
+
+### 提取 ScanNet RGB 色彩数据（可选的）
+
+通过提取 ScanNet RGB 色彩数据，对于每个场景我们加载 RGB 图像与配套 4x4 位姿矩阵、单个 4x4 相机内参矩阵的集合。请注意，这一步是可选的，除非要运行多视图物体检测，否则可以略去这步。
+
+```shell
+python extract_posed_images.py
+```
+
+1201 个训练样本，312 个验证样本和 100 个测试样本中的每一个都包含一个单独的 `.sens` 文件。比如说，对于场景 `0001_01` 我们有 `data/scannet/scans/scene0001_01/0001_01.sens`。对于这个场景所有图像和位姿数据都被提取至 `data/scannet/posed_images/scene0001_01`。具体来说，该文件夹下会有 300 个 xxxxx.jpg 格式的图像数据，300 个 xxxxx.txt 格式的相机位姿数据和一个单独的 `intrinsic.txt` 内参文件。通常来说，一个场景包含数千张图像。默认情况下，我们只会提取其中的 300 张，从而只占用少于 100 Gb 的空间。要想提取更多图像，请使用 `--max-images-per-scene` 参数。
+
+### 创建数据集
+
+```shell
+python tools/create_data.py scannet --root-path ./data/scannet \
+--out-dir ./data/scannet --extra-tag scannet
+```
+
+上述提取的点云文件，语义类别标注文件，和物体实例标注文件被进一步以 `.bin` 格式保存。与此同时 `.pkl` 格式的文件被生成并用于训练和验证。获取数据信息的核心函数 `process_single_scene` 如下：
+
+```python
+def process_single_scene(sample_idx):
+
+    # 分别以 .bin 格式保存点云文件，语义类别标注文件和物体实例标注文件
+    # 获取 info['pts_path']，info['pts_instance_mask_path'] 和 info['pts_semantic_mask_path']
+    ...
+
+    # 获取标注
+    if has_label:
+        annotations = {}
+        # 包围框的形状为 [k, 6 + class]
+        aligned_box_label = self.get_aligned_box_label(sample_idx)
+        unaligned_box_label = self.get_unaligned_box_label(sample_idx)
+        annotations['gt_num'] = aligned_box_label.shape[0]
+        if annotations['gt_num'] != 0:
+            aligned_box = aligned_box_label[:, :-1]  # k, 6
+            unaligned_box = unaligned_box_label[:, :-1]
+            classes = aligned_box_label[:, -1]  # k
+            annotations['name'] = np.array([
+                self.label2cat[self.cat_ids2class[classes[i]]]
+                for i in range(annotations['gt_num'])
+            ])
+            # 为了向后兼容，默认的参数名赋予了与坐标轴平行的包围框
+            # 我们同时保存了对应的与坐标轴不平行的包围框的信息
+            annotations['location'] = aligned_box[:, :3]
+            annotations['dimensions'] = aligned_box[:, 3:6]
+            annotations['gt_boxes_upright_depth'] = aligned_box
+            annotations['unaligned_location'] = unaligned_box[:, :3]
+            annotations['unaligned_dimensions'] = unaligned_box[:, 3:6]
+            annotations[
+                'unaligned_gt_boxes_upright_depth'] = unaligned_box
+            annotations['index'] = np.arange(
+                annotations['gt_num'], dtype=np.int32)
+            annotations['class'] = np.array([
+                self.cat_ids2class[classes[i]]
+                for i in range(annotations['gt_num'])
+            ])
+        axis_align_matrix = self.get_axis_align_matrix(sample_idx)
+        annotations['axis_align_matrix'] = axis_align_matrix  # 4x4
+        info['annos'] = annotations
+    return info
+```
+
+如上数据处理后，文件目录结构应如下：
+
+```
+scannet
+├── meta_data
+├── batch_load_scannet_data.py
+├── load_scannet_data.py
+├── scannet_utils.py
+├── README.md
+├── scans
+├── scans_test
+├── scannet_instance_data
+├── points
+│   ├── xxxxx.bin
+├── instance_mask
+│   ├── xxxxx.bin
+├── semantic_mask
+│   ├── xxxxx.bin
+├── seg_info
+│   ├── train_label_weight.npy
+│   ├── train_resampled_scene_idxs.npy
+│   ├── val_label_weight.npy
+│   ├── val_resampled_scene_idxs.npy
+├── posed_images
+│   ├── scenexxxx_xx
+│   │   ├── xxxxxx.txt
+│   │   ├── xxxxxx.jpg
+│   │   ├── intrinsic.txt
+├── scannet_infos_train.pkl
+├── scannet_infos_val.pkl
+├── scannet_infos_test.pkl
+```
+
+- `points/xxxxx.bin`：下采样后，未与坐标轴平行（即没有对齐）的点云。因为 ScanNet 3D 检测任务将与坐标轴平行的点云作为输入，而 ScanNet 3D 语义分割任务将对齐前的点云作为输入，我们选择存储对齐前的点云和它们的对齐矩阵。请注意：在 3D 检测的预处理流程 [`GlobalAlignment`](https://github.com/open-mmlab/mmdetection3d/blob/9f0b01caf6aefed861ef4c3eb197c09362d26b32/mmdet3d/datasets/pipelines/transforms_3d.py#L423) 后，点云就都是与坐标轴平行的了。
+- `instance_mask/xxxxx.bin`：每个点的实例标签，值的范围为：\[0, NUM_INSTANCES\]，其中 0 表示没有标注。
+- `semantic_mask/xxxxx.bin`：每个点的语义标签，值的范围为：\[1, 40\], 也就是 `nyu40id` 的标准。请注意：在训练流程 `PointSegClassMapping` 中，`nyu40id` 的 ID 会被映射到训练 ID。
+- `seg_info`：为支持语义分割任务所生成的信息文件。
+  - `train_label_weight.npy`：每一语义类别的权重系数。因为 ScanNet 中属于不同类的点的数量相差很大，一个常见的操作是在计算损失时对不同类别进行加权 (label re-weighting) 以得到更好的分割性能。
+  - `train_resampled_scene_idxs.npy`：每一个场景 (房间) 的重采样标签。在训练过程中，我们依据每个场景的点的数量，会对其进行不同次数的重采样，以保证训练数据均衡。
+- `posed_images/scenexxxx_xx`：`.jpg` 图像的集合，还包含 `.txt` 格式的 4x4 相机姿态和单个 `.txt` 格式的相机内参矩阵文件。
+- `scannet_infos_train.pkl`：训练集的数据信息，每个场景的具体信息如下：
+  - info\['lidar_points'\]：字典包含与激光雷达点相关的信息。
+    - info\['lidar_points'\]\['lidar_path'\]：激光雷达点云数据的文件名。
+    - info\['lidar_points'\]\['num_pts_feats'\]：点的特征维度。
+    - info\['lidar_points'\]\['axis_align_matrix'\]：用于对齐坐标轴的变换矩阵。
+  - info\['pts_semantic_mask_path'\]：语义分割标注的文件名。
+  - info\['pts_instance_mask_path'\]：实例分割标注的文件名。
+  - info\['instances'\]：字典组成的列表，每个字典包含一个实例的所有标注信息。对于其中的第 i 个实例，我们有：
+    - info\['instances'\]\[i\]\['bbox_3d'\]：长度为 6 的列表，以 (x, y, z, l, w, h) 的顺序表示深度坐标系下与坐标轴平行的 3D 边界框。
+    - info\[instances\]\[i\]\['bbox_label_3d'\]：3D 边界框的标签。
+- `scannet_infos_val.pkl`：验证集上的数据信息，与 `scannet_infos_train.pkl` 格式完全一致。
+- `scannet_infos_test.pkl`：测试集上的数据信息，与 `scannet_infos_train.pkl` 格式几乎完全一致，除了缺少标注。
+
+## 训练流程
+
+ScanNet 进行 **3D 目标检测**的一种典型数据载入流程如下所示：
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=40000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0],
+        shift_height=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+```
+
+- `GlobalAlignment`：输入的点云在施加了坐标轴平行的矩阵后应被转换为与坐标轴平行的形式。
+- `PointSegClassMapping`：训练中，只有合法的类别 ID 才会被映射到类别标签，比如 \[0, 18)。
+- 数据增强:
+  - `PointSample`：下采样输入点云。
+  - `RandomFlip3D`：随机左右或前后翻转点云。
+  - `GlobalRotScaleTrans`：旋转输入点云，对于 ScanNet 角度通常落入 \[-5, 5\]（度）的范围；并放缩输入点云，对于 ScanNet 比例通常为 1.0（即不做缩放）；最后平移输入点云，对于 ScanNet 通常位移量为 0（即不做位移）。
+
+ScanNet 进行 **3D 语义分割**的一种典型数据载入流程如下所示：
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping'),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.5,
+        ignore_index=len(class_names),
+        use_normalized_coord=False,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+```
+
+- `PointSegClassMapping`：在训练过程中，只有被使用的类别的序号会被映射到类似 \[0, 20) 范围内的类别标签。其余的类别序号会被转换为 `ignore_index` 所制定的忽略标签，在本例中是 `20`。
+- `IndoorPatchPointSample`：从输入点云中裁剪一个含有固定数量点的小块 (patch)。`block_size` 指定了裁剪块的边长，在 ScanNet 上这个数值一般设置为 `1.5`。
+- `NormalizePointsColor`：将输入点的颜色信息归一化，通过将 RGB 值除以 `255` 来实现。
+
+## 评估指标
+
+- **目标检测**：通常使用全类平均精度（mAP）来评估 ScanNet 的 3D 检测任务的性能，比如 `mAP@0.25` 和 `mAP@0.5`。具体来说，评估时调用一个通用的计算 3D 物体检测多个类别的精度和召回率的函数。更多细节请参考 [indoor_eval](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/indoor_eval.py)。
+
+  **注意**：与在章节`提取 ScanNet 数据`中介绍的那样，所有真实物体的三维包围框是与坐标轴平行的，也就是说旋转角为 0。因此，预测包围框的网络接受的包围框旋转角监督也是 0，且在后处理阶段我们使用适用于与坐标轴平行的包围框的非极大值抑制（NMS），该过程不会考虑包围框的旋转。
+
+- **语义分割**：通常使用平均交并比 (mean Intersection over Union, mIoU) 来评估 ScanNet 的 3D 语义分割任务的性能。具体而言，我们先计算所有类别的 IoU，然后取平均值作为 mIoU。更多实现细节请参考 [seg_eval.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py)。
+
+## 在测试集上测试并提交结果
+
+默认情况下，MMDet3D 的代码是在训练集上进行模型训练，然后在验证集上进行模型测试。
+
+如果你也想在在线基准上测试模型的性能（仅支持语义分割），请在测试命令中加上 `--format-only` 的标记，同时也要将 ScanNet 数据集[配置文件](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/_base_/datasets/scannet-seg.py#L126)中的 `ann_file=data_root + 'scannet_infos_val.pkl'` 改成 `ann_file=data_root + 'scannet_infos_test.pkl'`。
+
+请记得通过 `txt_prefix` 来指定想要保存测试结果的文件夹名称。
+
+以 PointNet++ (SSG) 在 ScanNet 上的测试为例，你可以运行以下命令来完成测试结果的保存：
+
+```
+./tools/dist_test.sh configs/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet-seg.py \
+    work_dirs/pointnet2_ssg/latest.pth --format-only \
+    --eval-options txt_prefix=work_dirs/pointnet2_ssg/test_submission
+```
+
+在保存测试结果后，你可以将该文件夹压缩，然后提交到 [ScanNet 在线测试服务器](http://kaldir.vc.in.tum.de/scannet_benchmark/semantic_label_3d)上进行验证。
diff --git a/mmde/docs/zh_cn/advanced_guides/datasets/semantickitti.md b/mmde/docs/zh_cn/advanced_guides/datasets/semantickitti.md
new file mode 100644
index 0000000000000000000000000000000000000000..cd4f4dcb65cfbce6f81e5e2e9fe0d9210fd24c8d
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/datasets/semantickitti.md
@@ -0,0 +1,129 @@
+# SemanticKITTI 数据集
+
+本页提供了有关在 MMDetection3D 中使用 SemanticKITTI 数据集的具体教程。
+
+## 数据集准备
+
+您可以在[这里](http://semantic-kitti.org/dataset.html#download)下载 SemanticKITTI 数据集并解压缩所有 zip 文件。
+
+像准备数据集的一般方法一样，建议将数据集根目录链接到 `$MMDETECTION3D/data`。
+
+在我们处理之前，文件夹结构应按如下方式组织：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── semantickitti
+│   │   ├── sequences
+│   │   │   ├── 00
+│   │   │   │   ├── labels
+│   │   │   │   ├── velodyne
+│   │   │   ├── 01
+│   │   │   ├── ..
+│   │   │   ├── 22
+```
+
+SemanticKITTI 数据集包含 23 个序列，其中序列 \[0-7\] , \[9-10\] 作为训练集（约 19k 训练样本），序列 8 作为验证集（约 4k 验证样本），\[11-22\] 作为测试集 （约20k测试样本）。其中每个序列分别包含 velodyne 和 labels 两个文件夹分别存放激光雷达点云数据和分割标注 (其中高16位存放实例分割标注，低16位存放语义分割标注)。
+
+### 创建 SemanticKITTI 数据集
+
+我们提供了生成数据集信息的脚本，用于测试和训练。通过以下命令生成 `.pkl` 文件：
+
+```bash
+python ./tools/create_data.py semantickitti --root-path ./data/semantickitti --out-dir ./data/semantickitti --extra-tag semantickitti
+```
+
+处理后的文件夹结构应该如下：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── semantickitti
+│   │   ├── sequences
+│   │   │   ├── 00
+│   │   │   │   ├── labels
+│   │   │   │   ├── velodyne
+│   │   │   ├── 01
+│   │   │   ├── ..
+│   │   │   ├── 22
+│   │   ├── semantickitti_infos_test.pkl
+│   │   ├── semantickitti_infos_train.pkl
+│   │   ├── semantickitti_infos_val.pkl
+```
+
+- `semantickitti_infos_train.pkl`: 训练数据集, 该字典包含两个键值: `metainfo` 和 `data_list`.
+  `metainfo` 包含该数据集的基本信息。 `data_list` 是由字典组成的列表，每个字典（以下简称 `info`）包含了单个样本的所有详细信息。
+  - info\['sample_id'\]：该样本在整个数据集的索引。
+  - info\['lidar_points'\]：是一个字典，包含了激光雷达点相关的信息。
+    - info\['lidar_points'\]\['lidar_path'\]：激光雷达点云数据的文件名。
+    - info\['lidar_points'\]\['num_pts_feats'\]：点的特征维度
+  - info\['pts_semantic_mask_pth'\]：三维语义分割的标注文件的文件路径
+
+更多细节请参考 [semantickitti_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/semantickitti_converter.py) 和 [update_infos_to_v2.py ](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/update_infos_to_v2.py) 。
+
+## Train pipeline
+
+下面展示了一个使用 SemanticKITTI 数据集进行 3D 语义分割的典型流程：
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.1, 0.1, 0.1],
+    ),
+    dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+]
+```
+
+- 数据增强:
+  - `RandomFlip3D`：对输入点云数据进行随机地水平翻转或者垂直翻转。
+  - `GlobalRotScaleTrans`：对输入点云数据进行旋转、缩放、平移。
+
+## 评估
+
+使用 8 个 GPU 以及 SemanticKITTI 指标评估的 MinkUNet 的示例如下：
+
+```shell
+bash tools/dist_test.sh configs/minkunet/minkunet_w32_8xb2-15e_semantickitti.py checkpoints/minkunet_w32_8xb2-15e_semantickitti_20230309_160710-7fa0a6f1.pth 8
+```
+
+## 度量指标
+
+通常我们使用平均交并比 (mean Intersection over Union, mIoU) 作为 SemanticKITTI 语义分割任务的度量指标。
+具体而言，我们先计算所有类别的 IoU，然后取平均值作为 mIoU。
+更多实现细节请参考 [seg_eval.py](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/seg_eval.py)。
+
+以下是一个评估结果的样例:
+
+| classes | car    | bicycle | motorcycle | truck  | bus    | person | bicyclist | motorcyclist | road   | parking | sidewalk | other-ground | building | fence  | vegetation | trunck | terrian | pole   | traffic-sign | miou   | acc    | acc_cls |
+| ------- | ------ | ------- | ---------- | ------ | ------ | ------ | --------- | ------------ | ------ | ------- | -------- | ------------ | -------- | ------ | ---------- | ------ | ------- | ------ | ------------ | ------ | ------ | ------- |
+| results | 0.9687 | 0.1908  | 0.6313     | 0.8580 | 0.6359 | 0.6818 | 0.8444    | 0.0002       | 0.9353 | 0.4854  | 0.8106   | 0.0024       | 0.9050   | 0.6111 | 0.8822     | 0.6605 | 0.7493  | 0.6442 | 0.4837       | 0.6306 | 0.9202 | 0.6924  |
diff --git a/mmde/docs/zh_cn/advanced_guides/datasets/sunrgbd.md b/mmde/docs/zh_cn/advanced_guides/datasets/sunrgbd.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c3875e57fb5dfd3f95297135ebfb5c47866c2c1
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/datasets/sunrgbd.md
@@ -0,0 +1,250 @@
+# SUN RGB-D 数据集
+
+## 数据集的准备
+
+对于数据集准备的整体流程，请参考 SUN RGB-D 的[指南](https://github.com/open-mmlab/mmdetection3d/blob/master/data/sunrgbd/README.md)。
+
+### 下载 SUN RGB-D 数据与工具包
+
+在[这里](http://rgbd.cs.princeton.edu/data/)下载 SUN RGB-D 的数据。接下来，将 `SUNRGBD.zip`、`SUNRGBDMeta2DBB_v2.mat`、`SUNRGBDMeta3DBB_v2.mat` 和 `SUNRGBDtoolbox.zip` 移动到 `OFFICIAL_SUNRGBD` 文件夹，并解压文件。
+
+下载完成后，数据处理之前的文件目录结构如下：
+
+```
+sunrgbd
+├── README.md
+├── matlab
+│   ├── extract_rgbd_data_v1.m
+│   ├── extract_rgbd_data_v2.m
+│   ├── extract_split.m
+├── OFFICIAL_SUNRGBD
+│   ├── SUNRGBD
+│   ├── SUNRGBDMeta2DBB_v2.mat
+│   ├── SUNRGBDMeta3DBB_v2.mat
+│   ├── SUNRGBDtoolbox
+```
+
+### 从原始数据中提取 3D 检测所需数据与标注
+
+通过运行如下指令从原始文件中提取出 SUN RGB-D 的标注（这需要您的机器中安装了 MATLAB）：
+
+```bash
+matlab -nosplash -nodesktop -r 'extract_split;quit;'
+matlab -nosplash -nodesktop -r 'extract_rgbd_data_v2;quit;'
+matlab -nosplash -nodesktop -r 'extract_rgbd_data_v1;quit;'
+```
+
+主要的步骤包括：
+
+- 提取出训练集和验证集的索引文件；
+- 从原始数据中提取出 3D 检测所需要的数据；
+- 从原始的标注数据中提取并组织检测任务使用的标注数据。
+
+用于从深度图中提取点云数据的 `extract_rgbd_data_v2.m` 的主要部分如下：
+
+```matlab
+data = SUNRGBDMeta(imageId);
+data.depthpath(1:16) = '';
+data.depthpath = strcat('../OFFICIAL_SUNRGBD', data.depthpath);
+data.rgbpath(1:16) = '';
+data.rgbpath = strcat('../OFFICIAL_SUNRGBD', data.rgbpath);
+
+% 从深度图获取点云
+[rgb,points3d,depthInpaint,imsize]=read3dPoints(data);
+rgb(isnan(points3d(:,1)),:) = [];
+points3d(isnan(points3d(:,1)),:) = [];
+points3d_rgb = [points3d, rgb];
+
+% MAT 文件比 TXT 文件小三倍。在 Python 中我们可以使用
+% scipy.io.loadmat('xxx.mat')['points3d_rgb'] 来加载数据
+mat_filename = strcat(num2str(imageId,'%06d'), '.mat');
+txt_filename = strcat(num2str(imageId,'%06d'), '.txt');
+% 保存点云数据
+parsave(strcat(depth_folder, mat_filename), points3d_rgb);
+```
+
+用于提取并组织检测任务标注的 `extract_rgbd_data_v1.m` 的主要部分如下：
+
+```matlab
+% 输出 2D 和 3D 包围框
+data2d = data;
+fid = fopen(strcat(det_label_folder, txt_filename), 'w');
+for j = 1:length(data.groundtruth3DBB)
+    centroid = data.groundtruth3DBB(j).centroid;  % 3D 包围框中心
+    classname = data.groundtruth3DBB(j).classname;  % 类名
+    orientation = data.groundtruth3DBB(j).orientation;  % 3D 包围框方向
+    coeffs = abs(data.groundtruth3DBB(j).coeffs);  % 3D 包围框大小
+    box2d = data2d.groundtruth2DBB(j).gtBb2D;  % 2D 包围框
+    fprintf(fid, '%s %d %d %d %d %f %f %f %f %f %f %f %f\n', classname, box2d(1), box2d(2), box2d(3), box2d(4), centroid(1), centroid(2), centroid(3), coeffs(1), coeffs(2), coeffs(3), orientation(1), orientation(2));
+end
+fclose(fid);
+```
+
+上面的两个脚本调用了 SUN RGB-D 提供的[工具包](https://rgbd.cs.princeton.edu/data/SUNRGBDtoolbox.zip)中的一些函数，如 `read3dPoints`。
+
+使用上述脚本提取数据后，文件目录结构应如下：
+
+```
+sunrgbd
+├── README.md
+├── matlab
+│   ├── extract_rgbd_data_v1.m
+│   ├── extract_rgbd_data_v2.m
+│   ├── extract_split.m
+├── OFFICIAL_SUNRGBD
+│   ├── SUNRGBD
+│   ├── SUNRGBDMeta2DBB_v2.mat
+│   ├── SUNRGBDMeta3DBB_v2.mat
+│   ├── SUNRGBDtoolbox
+├── sunrgbd_trainval
+│   ├── calib
+│   ├── depth
+│   ├── image
+│   ├── label
+│   ├── label_v1
+│   ├── seg_label
+│   ├── train_data_idx.txt
+│   ├── val_data_idx.txt
+```
+
+在如下每个文件夹下，都有总计 5285 个训练集样本和 5050 个验证集样本：
+
+- `calib`：`.txt` 后缀的相机标定文件。
+- `depth`：`.mat` 后缀的点云文件，包含 xyz 坐标和 rgb 色彩值。
+- `image`：`.jpg` 后缀的二维图像文件。
+- `label`：`.txt` 后缀的用于检测任务的标注数据（版本二）。
+- `label_v1`：`.txt` 后缀的用于检测任务的标注数据（版本一）。
+- `seg_label`：`.txt` 后缀的用于分割任务的标注数据。
+
+目前，我们使用版本一的数据用于训练与测试，因此版本二的标注并未使用。
+
+### 创建数据集
+
+请运行如下指令创建数据集：
+
+```shell
+python tools/create_data.py sunrgbd --root-path ./data/sunrgbd \
+--out-dir ./data/sunrgbd --extra-tag sunrgbd
+```
+
+或者，如果使用 slurm，可以使用如下指令替代：
+
+```
+bash tools/create_data.sh <job_name> sunrgbd
+```
+
+之前提到的点云数据就会被处理并以 `.bin` 格式重新存储。与此同时，`.pkl` 文件也被生成，用于存储数据标注和元信息。
+
+如上数据处理后，文件目录结构应如下：
+
+```
+sunrgbd
+├── README.md
+├── matlab
+│   ├── ...
+├── OFFICIAL_SUNRGBD
+│   ├── ...
+├── sunrgbd_trainval
+│   ├── ...
+├── points
+├── sunrgbd_infos_train.pkl
+├── sunrgbd_infos_val.pkl
+```
+
+- `points/xxxxxx.bin`：降采样后的点云数据。
+- `sunrgbd_infos_train.pkl`：训练集数据信息（标注与元信息），每个场景所含数据信息具体如下：
+  - info\['lidar_points'\]：字典包含了与激光雷达点相关的信息。
+    - info\['lidar_points'\]\['num_pts_feats'\]：点的特征维度。
+    - info\['lidar_points'\]\['lidar_path'\]：激光雷达点云数据的文件名。
+  - info\['images'\]：字典包含了与图像数据相关的信息。
+    - info\['images'\]\['CAM0'\]\['img_path'\]：图像的文件名。
+    - info\['images'\]\['CAM0'\]\['depth2img'\]：深度到图像的变换矩阵，形状为 (4, 4)。
+    - info\['images'\]\['CAM0'\]\['height'\]：图像的高。
+    - info\['images'\]\['CAM0'\]\['width'\]：图像的宽。
+  - info\['instances'\]：由字典组成的列表，包含了该帧的所有标注信息。每个字典与单个实例的标注相关。对于其中的第 i 个实例，我们有：
+    - info\['instances'\]\[i\]\['bbox_3d'\]：长度为 7 的列表，表示深度坐标系下的 3D 边界框。
+    - info\['instances'\]\[i\]\['bbox'\]：长度为 4 的列表，以 (x1, y1, x2, y2) 的顺序表示实例的 2D 边界框。
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]：整数表示实例的 3D 标签，-1 表示忽略该类别。
+    - info\['instances'\]\[i\]\['bbox_label'\]：整数表示实例的 2D 标签，-1 表示忽略该类别。
+- `sunrgbd_infos_val.pkl`：验证集上的数据信息，与 `sunrgbd_infos_train.pkl` 格式完全一致。
+
+## 训练流程
+
+SUN RGB-D 上纯点云 3D 物体检测的典型流程如下：
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+    ),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        shift_height=True),
+    dict(type='PointSample', num_points=20000),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+点云上的数据增强
+
+- `RandomFlip3D`：随机左右或前后翻转输入点云。
+- `GlobalRotScaleTrans`：旋转输入点云，对于 SUN RGB-D 角度通常落入 \[-30, 30\]（度）的范围；并放缩输入点云，对于 SUN RGB-D 比例通常落入 \[0.85, 1.15\] 的范围；最后平移输入点云，对于 SUN RGB-D 通常位移量为 0（即不做位移）。
+- `PointSample`：降采样输入点云。
+
+SUN RGB-D 上多模态（点云和图像）3D 物体检测的典型流程如下：
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations3D'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 600), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.0),
+    dict(type='Pad', size_divisor=32),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+    ),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        shift_height=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d','img', 'gt_bboxes', 'gt_bboxes_labels'])
+]
+```
+
+图像上的数据增强
+
+- `Resize`：改变输入图像的大小，`keep_ratio=True` 意味着图像的比例不改变。
+- `RandomFlip`：随机地翻折图像。
+
+图像增强的实现取自 [MMDetection](https://github.com/open-mmlab/mmdetection/tree/dev-3.x/mmdet/datasets/transforms)。
+
+## 度量指标
+
+与 ScanNet 一样，通常使用 mAP（全类平均精度）来评估 SUN RGB-D 的检测任务的性能，比如 `mAP@0.25` 和 `mAP@0.5`。具体来说，评估时调用一个通用的计算 3D 物体检测多个类别的精度和召回率的函数。更多细节请参考 [`indoor_eval.py`](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/functional/indoor_eval.py)。
+
+因为 SUN RGB-D 包含有图像数据，所以图像上的物体检测也是可行的。举个例子，在 ImVoteNet 中，我们首先训练了一个图像检测器，并且也使用 mAP 指标，如 `mAP@0.5`，来评估其表现。我们使用 [MMDetection](https://github.com/open-mmlab/mmdetection) 库中的 `eval_map` 函数来计算 mAP。
diff --git a/mmde/docs/zh_cn/advanced_guides/datasets/waymo.md b/mmde/docs/zh_cn/advanced_guides/datasets/waymo.md
new file mode 100644
index 0000000000000000000000000000000000000000..8c0f0dfc0fe6c2eb13f2246d82f1338060fe7a9f
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/datasets/waymo.md
@@ -0,0 +1,202 @@
+# Waymo 数据集
+
+本文档页包含了关于 MMDetection3D 中 Waymo 数据集用法的教程。
+
+## 数据集准备
+
+在准备 Waymo 数据集之前，如果您之前只安装了 `requirements/build.txt` 和 `requirements/runtime.txt` 中的依赖，请通过运行如下指令额外安装 Waymo 数据集所依赖的官方包：
+
+```
+pip install waymo-open-dataset-tf-2-6-0
+```
+
+或者
+
+```
+pip install -r requirements/optional.txt
+```
+
+和准备数据集的通用方法一致，我们推荐将数据集根目录软链接至 `$MMDETECTION3D/data`。
+由于原始 Waymo 数据的格式基于 `tfrecord`，我们需要将原始数据进行预处理，以便于训练和测试时使用。我们的方法是将它们转换为 KITTI 格式。
+
+处理之前，文件目录结构组织如下：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── waymo
+│   │   ├── waymo_format
+│   │   │   ├── training
+│   │   │   ├── validation
+│   │   │   ├── testing
+│   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
+│   │   ├── kitti_format
+│   │   │   ├── ImageSets
+
+```
+
+您可以在[这里](https://waymo.com/open/download/)下载 1.2 版本的 Waymo 公开数据集，并在[这里](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing)下载其训练/验证/测试集拆分文件。接下来，请将 `tfrecord` 文件放入 `data/waymo/waymo_format/` 下的对应文件夹，并将 txt 格式的数据集拆分文件放入 `data/waymo/kitti_format/ImageSets`。在[这里](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects)下载验证集使用的 bin 格式真实标注 (Ground Truth) 文件并放入 `data/waymo/waymo_format/`。小窍门：您可以使用 `gsutil` 来在命令行下载大规模数据集。您可以将该[工具](https://github.com/RalphMao/Waymo-Dataset-Tool) 作为一个例子来查看更多细节。之后，通过运行如下指令准备 Waymo 数据：
+
+```bash
+# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow.
+# The number of `--workers` depends on the maximum number of cores in your CPU.
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4
+```
+
+请注意，如果您的本地磁盘没有足够空间保存转换后的数据，您可以将 `--out-dir` 改为其他目录；只要在创建文件夹、准备数据并转换格式后，将数据文件链接到 `data/waymo/kitti_format` 即可。
+
+在数据转换后，文件目录结构应组织如下：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── waymo
+│   │   ├── waymo_format
+│   │   │   ├── training
+│   │   │   ├── validation
+│   │   │   ├── testing
+│   │   │   ├── gt.bin
+│   │   │   ├── cam_gt.bin
+│   │   │   ├── fov_gt.bin
+│   │   ├── kitti_format
+│   │   │   ├── ImageSets
+│   │   │   ├── training
+│   │   │   │   ├── image_0
+│   │   │   │   ├── image_1
+│   │   │   │   ├── image_2
+│   │   │   │   ├── image_3
+│   │   │   │   ├── image_4
+│   │   │   │   ├── velodyne
+│   │   │   ├── testing
+│   │   │   │   ├── (the same as training)
+│   │   │   ├── waymo_gt_database
+│   │   │   ├── waymo_infos_trainval.pkl
+│   │   │   ├── waymo_infos_train.pkl
+│   │   │   ├── waymo_infos_val.pkl
+│   │   │   ├── waymo_infos_test.pkl
+│   │   │   ├── waymo_dbinfos_train.pkl
+
+```
+
+- `kitti_format/training/image_{0-4}/{a}{bbb}{ccc}.jpg` 因为 Waymo 数据的来源包含数个相机，这里我们将每个相机对应的图像和标签文件分别存储，并将相机位姿 (pose) 文件存储下来以供后续处理连续多帧的点云。我们使用 `{a}{bbb}{ccc}` 的名称编码方式为每帧数据命名，其中 `a` 是不同数据拆分的前缀（`0` 指代训练集，`1` 指代验证集，`2` 指代测试集），`bbb` 是分割部分 (segment) 的索引，而 `ccc` 是帧索引。您可以轻而易举地按照如上命名规则定位到所需的帧。我们将训练和验证所需数据按 KITTI 的方式集合在一起，然后将训练集/验证集/测试集的索引存储在 `ImageSet` 下的文件中。
+- `kitti_format/training/velodyne/{a}{bbb}{ccc}.bin` 当前样本的点云数据
+- `kitti_format/waymo_gt_database/xxx_{Car/Pedestrian/Cyclist}_x.bin`. 训练数据集的每个 3D 包围框中包含的点云数据。这些点云会在数据增强中被使用，例如. `ObjectSample`. `xxx` 表示训练样本的索引，`x` 表示实例在当前样本中的索引。
+- `kitti_format/waymo_infos_train.pkl`. 训练数据集，该字典包含了两个键值：`metainfo` 和 `data_list`。`metainfo` 包含数据集的基本信息，例如 `dataset`、`version` 和 `info_version`。`data_list` 是由字典组成的列表，每个字典（以下简称 `info`）包含了单个样本的所有详细信息。:
+  - info\['sample_idx'\]: 样本在整个数据集的索引。
+  - info\['ego2global'\]: 自车到全局坐标的变换矩阵。（4x4 列表）
+  - info\['timestamp'\]：样本数据时间戳。
+  - info\['context_name'\]: 语境名，表示样本从哪个 `*.tfrecord` 片段中提取的。
+  - info\['lidar_points'\]: 是一个字典，包含了所有与激光雷达点相关的信息。
+    - info\['lidar_points'\]\['lidar_path'\]: 激光雷达点云数据的文件名。
+    - info\['lidar_points'\]\['num_pts_feats'\]: 点的特征维度。
+  - info\['lidar_sweeps'\]: 是一个列表，包含了历史帧信息。
+    - info\['lidar_sweeps'\]\[i\]\['lidar_points'\]\['lidar_path'\]: 第 i 帧的激光雷达数据的文件路径。
+    - info\['lidar_sweeps'\]\[i\]\['ego2global'\]: 第 i 帧的激光雷达传感器到自车的变换矩阵。（4x4 列表）
+    - info\['lidar_sweeps'\]\[i\]\['timestamp'\]: 第 i 帧的样本数据时间戳。
+  - info\['images'\]: 是一个字典，包含与每个相机对应的六个键值：`'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`。每个字典包含了对应相机的所有数据信息。
+    - info\['images'\]\['CAM_XXX'\]\['img_path'\]: 图像的文件名。
+    - info\['images'\]\['CAM_XXX'\]\['height'\]: 图像的高
+    - info\['images'\]\['CAM_XXX'\]\['width'\]: 图像的宽
+    - info\['images'\]\['CAM_XXX'\]\['cam2img'\]: 当 3D 点投影到图像平面时需要的内参信息相关的变换矩阵。（3x3 列表）
+    - info\['images'\]\['CAM_XXX'\]\['lidar2cam'\]: 激光雷达传感器到该相机的变换矩阵。（4x4 列表）
+    - info\['images'\]\['CAM_XXX'\]\['lidar2img'\]: 激光雷达传感器到图像平面的变换矩阵。（4x4 列表）
+  - info\['image_sweeps'\]: 是一个列表，包含了历史帧信息。
+    - info\['image_sweeps'\]\[i\]\['images'\]\['CAM_XXX'\]\['img_path'\]: 第i帧的图像的文件名.
+    - info\['image_sweeps'\]\[i\]\['ego2global'\]: 第 i 帧的自车到全局坐标的变换矩阵。（4x4 列表）
+    - info\['image_sweeps'\]\[i\]\['timestamp'\]: 第 i 帧的样本数据时间戳。
+  - info\['instances'\]: 是一个字典组成的列表。每个字典包含单个实例的所有标注信息。对于其中的第 i 个实例，我们有：
+    - info\['instances'\]\[i\]\['bbox_3d'\]: 长度为 7 的列表，以 (x, y, z, l, w, h, yaw) 的顺序表示实例的 3D 边界框。
+    - info\['instances'\]\[i\]\['bbox'\]: 2D 边界框标注（，顺序为 \[x1, y1, x2, y2\] 的列表。有些实例可能没有对应的 2D 边界框标注。
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: 整数表示实例的标签，-1 代表忽略。
+    - info\['instances'\]\[i\]\['bbox_label'\]: 整数表示实例的标签，-1 代表忽略。
+    - info\['instances'\]\[i\]\['num_lidar_pts'\]: 每个 3D 边界框内包含的激光雷达点数。
+    - info\['instances'\]\[i\]\['camera_id'\]: 当前实例最可见相机的索引。
+    - info\['instances'\]\[i\]\['group_id'\]: 当前实例在当前样本中的索引。
+  - info\['cam_sync_instances'\]: 是一个字典组成的列表。每个字典包含单个实例的所有标注信息。它的形式与 \['instances'\]相同. 但是, \['cam_sync_instances'\] 专门用于基于多视角相机的三维目标检测任务。
+  - info\['cam_instances'\]: 是一个字典，包含以下键值： `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_SIDE_LEFT'`, `'CAM_SIDE_RIGHT'`. 对于基于视觉的 3D 目标检测任务，我们将整个场景的 3D 标注划分至它们所属于的相应相机中。对于其中的第 i 个实例，我们有：
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_3d'\]: 长度为 7 的列表，以 (x, y, z, l, h, w, yaw) 的顺序表示实例的 3D 边界框。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox'\]: 2D 边界框标注（3D 框投影的矩形框），顺序为 \[x1, y1, x2, y2\] 的列表。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label_3d'\]: 实例标签。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['bbox_label'\]: 实例标签。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['center_2d'\]: 3D 框投影到图像上的中心点，大小为 (2, ) 的列表。
+    - info\['cam_instances'\]\['CAM_XXX'\]\[i\]\['depth'\]: 3D 框投影中心的深度。
+
+## 训练
+
+考虑到原始数据集中的数据有很多相似的帧，我们基本上可以主要使用一个子集来训练我们的模型。在我们初步的基线中，我们在每五帧图片中加载一帧。得益于我们的超参数设置和数据增强方案，我们得到了比 Waymo [原论文](https://arxiv.org/pdf/1912.04838.pdf)中更好的性能。请移步 `configs/pointpillars/` 下的 README.md 以查看更多配置和性能相关的细节。我们会尽快发布一个更完整的 Waymo 基准榜单 (benchmark)。
+
+## 评估
+
+为了在 Waymo 数据集上进行检测性能评估，请按照[此处指示](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md)构建用于计算评估指标的二进制文件 `compute_detection_metrics_main`，并将它置于 `mmdet3d/core/evaluation/waymo_utils/` 下。您基本上可以按照下方命令安装 `bazel`，然后构建二进制文件：
+
+```shell
+# download the code and enter the base directory
+git clone https://github.com/waymo-research/waymo-open-dataset.git waymo-od
+# git clone https://github.com/Abyssaledge/waymo-open-dataset-master waymo-od # if you want to use faster multi-thread version.
+cd waymo-od
+git checkout remotes/origin/master
+
+# use the Bazel build system
+sudo apt-get install --assume-yes pkg-config zip g++ zlib1g-dev unzip python3 python3-pip
+BAZEL_VERSION=3.1.0
+wget https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+sudo bash bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh
+sudo apt install build-essential
+
+# configure .bazelrc
+./configure.sh
+# delete previous bazel outputs and reset internal caches
+bazel clean
+
+bazel build waymo_open_dataset/metrics/tools/compute_detection_metrics_main
+cp bazel-bin/waymo_open_dataset/metrics/tools/compute_detection_metrics_main ../mmdetection3d/mmdet3d/evaluation/functional/waymo_utils/
+```
+
+接下来，您就可以在 Waymo 上评估您的模型了。如下示例是使用 8 个图形处理器 (GPU) 在 Waymo 上用 Waymo 评价指标评估 PointPillars 模型的情景：
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py checkpoints/hv_pointpillars_secfpn_sbn-2x16_2x_waymo-3d-car_latest.pth
+```
+
+如果需要生成 bin 文件，需要在配置文件的 `test_evaluator` 中指定 `pklfile_prefix`，因此你可以在命令后添加 `--cfg-options "test_evaluator.pklfile_prefix=xxxx"`。
+
+**注意**:
+
+1. 有时用 `bazel` 构建 `compute_detection_metrics_main` 的过程中会出现如下错误：`'round' 不是 'std' 的成员` (`'round' is not a member of 'std'`)。我们只需要移除该文件中，`round` 前的 `std::`。
+
+2. 考虑到 Waymo 上评估一次耗时不短，我们建议只在模型训练结束时进行评估。
+
+3. 为了在 CUDA 9 环境使用 TensorFlow，我们建议通过编译 TensorFlow 源码的方式使用。除了官方教程之外，您还可以参考该[链接](https://github.com/SmileTM/Tensorflow2.X-GPU-CUDA9.0)以寻找可能合适的预编译包以及编译源码的实用攻略。
+
+## 测试并提交到官方服务器
+
+如下是一个使用 8 个图形处理器在 Waymo 上测试 PointPillars，生成 bin 文件并提交结果到官方榜单的例子：
+
+如果你想生成 bin 文件并提交到服务器中，在运行测试指令前你需要在配置文件的 `test_evaluator` 中指定 `submission_prefix`。
+
+在生成 bin 文件后，您可以简单地构建二进制文件 `create_submission`，并按照[指示](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md/)创建一个提交文件。下面是一些示例：
+
+```shell
+cd ../waymo-od/
+bazel build waymo_open_dataset/metrics/tools/create_submission
+cp bazel-bin/waymo_open_dataset/metrics/tools/create_submission ../mmdetection3d/mmdet3d/core/evaluation/waymo_utils/
+vim waymo_open_dataset/metrics/tools/submission.txtpb  # set the metadata information
+cp waymo_open_dataset/metrics/tools/submission.txtpb ../mmdetection3d/mmdet3d/evaluation/functional/waymo_utils/
+
+cd ../mmdetection3d
+# suppose the result bin is in `results/waymo-car/submission`
+mmdet3d/core/evaluation/waymo_utils/create_submission  --input_filenames='results/waymo-car/kitti_results_test.bin' --output_filename='results/waymo-car/submission/model' --submission_filename='mmdet3d/evaluation/functional/waymo_utils/submission.txtpb'
+
+tar cvf results/waymo-car/submission/my_model.tar results/waymo-car/submission/my_model/
+gzip results/waymo-car/submission/my_model.tar
+```
+
+如果想用官方评估服务器评估您在验证集上的结果，您可以使用同样的方法生成提交文件，只需确保您在运行如上指令前更改 `submission.txtpb` 中的字段值即可。
diff --git a/mmde/docs/zh_cn/advanced_guides/index.rst b/mmde/docs/zh_cn/advanced_guides/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a062496b56a9c3d2358ae18a6706d7a934e84c2f
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/index.rst
@@ -0,0 +1,27 @@
+数据集
+**************
+
+.. toctree::
+   :maxdepth: 1
+
+   datasets/index.rst
+
+
+支持的任务
+**************
+
+.. toctree::
+   :maxdepth: 1
+
+   supported_tasks/index.rst
+
+
+自定义项目
+**************
+
+.. toctree::
+   :maxdepth: 1
+
+   customize_dataset.md
+   customize_models.md
+   customize_runtime.md
diff --git a/mmde/docs/zh_cn/advanced_guides/supported_tasks/index.rst b/mmde/docs/zh_cn/advanced_guides/supported_tasks/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..53b8f4f94ed2392e555d54b05c991adfa247c3b6
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/supported_tasks/index.rst
@@ -0,0 +1,6 @@
+.. toctree::
+   :maxdepth: 1
+
+   lidar_det3d.md
+   vision_det3d.md
+   lidar_sem_seg3d.md
diff --git a/mmde/docs/zh_cn/advanced_guides/supported_tasks/lidar_det3d.md b/mmde/docs/zh_cn/advanced_guides/supported_tasks/lidar_det3d.md
new file mode 100644
index 0000000000000000000000000000000000000000..1277e310cb41af47100439e5668466dd18488afd
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/supported_tasks/lidar_det3d.md
@@ -0,0 +1,83 @@
+# 基于激光雷达的 3D 检测
+
+基于激光雷达的 3D 检测是 MMDetection3D 支持的最基础的任务之一。它期望给定的模型以激光雷达采集的任意数量的特征点为输入，并为每一个感兴趣的目标预测 3D 框及类别标签。接下来，我们以 KITTI 数据集上的 PointPillars 为例，展示如何准备数据，在标准的 3D 检测基准上训练并测试模型，以及可视化并验证结果。
+
+## 数据准备
+
+首先，我们需要下载原始数据并按照[数据准备文档](https://mmdetection3d.readthedocs.io/zh_CN/dev-1.x/user_guides/dataset_prepare.html)中提供的标准方式重新组织数据。
+
+由于不同数据集的原始数据有不同的组织方式，我们通常需要用 `.pkl` 文件收集有用的数据信息。因此，在准备好所有的原始数据之后，我们需要运行 `create_data.py` 中提供的脚本来为不同的数据集生成数据集信息。例如，对于 KITTI，我们需要运行如下命令：
+
+```shell
+python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti
+```
+
+随后，相关的目录结构将如下所示：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── kitti
+│   │   ├── ImageSets
+│   │   ├── testing
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── velodyne
+│   │   │   ├── velodyne_reduced
+│   │   ├── training
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── label_2
+│   │   │   ├── velodyne
+│   │   │   ├── velodyne_reduced
+│   │   ├── kitti_gt_database
+│   │   ├── kitti_infos_train.pkl
+│   │   ├── kitti_infos_trainval.pkl
+│   │   ├── kitti_infos_val.pkl
+│   │   ├── kitti_infos_test.pkl
+│   │   ├── kitti_dbinfos_train.pkl
+```
+
+## 训练
+
+接着，我们将使用提供的配置文件训练 PointPillars。当您使用不同的 GPU 设置进行训练时，您可以按照这个[教程](https://mmdetection3d.readthedocs.io/en/dev-1.x/user_guides/train_test.html)的示例。假设我们在一台具有 8 块 GPU 的机器上使用分布式训练：
+
+```shell
+./tools/dist_train.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py 8
+```
+
+注意，配置文件名中的 `8xb6` 是指训练用了 8 块 GPU，每块 GPU 上有 6 个数据样本。如果您的自定义设置不同于此，那么有时候您需要相应地调整学习率。基本规则可以参考[此处](https://arxiv.org/abs/1706.02677)。我们已经支持了使用 `--auto-scale-lr` 来自动缩放学习率。
+
+## 定量评估
+
+在训练期间，模型权重文件将会根据配置文件中的 `train_cfg = dict(val_interval=xxx)` 设置被周期性地评估。我们支持不同数据集的官方评估方案。对于 KITTI，将对 3 个类别使用交并比（IoU）阈值分别为 0.5/0.7 的平均精度（mAP）来评估模型。评估结果将会被打印到终端中，如下所示：
+
+```
+Car AP@0.70, 0.70, 0.70:
+bbox AP:98.1839, 89.7606, 88.7837
+bev AP:89.6905, 87.4570, 85.4865
+3d AP:87.4561, 76.7569, 74.1302
+aos AP:97.70, 88.73, 87.34
+Car AP@0.70, 0.50, 0.50:
+bbox AP:98.1839, 89.7606, 88.7837
+bev AP:98.4400, 90.1218, 89.6270
+3d AP:98.3329, 90.0209, 89.4035
+aos AP:97.70, 88.73, 87.34
+```
+
+此外，在训练完成后您也可以评估特定的模型权重文件。您可以简单地执行以下脚本：
+
+```shell
+./tools/dist_test.sh configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py work_dirs/pointpillars/latest.pth 8
+```
+
+## 测试与提交
+
+如果您只想在在线基准上进行推理或测试模型性能，您需要在相应的评估器中指定 `submission_prefix`，例如，在配置文件中添加 `test_evaluator = dict(type='KittiMetric', ann_file=data_root + 'kitti_infos_test.pkl', format_only=True, pklfile_prefix='results/kitti-3class/kitti_results', submission_prefix='results/kitti-3class/kitti_results')`，然后可以得到结果文件。请确保配置文件中的[测试信息](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/kitti-3d-3class.py#L117)的 `data_prefix` 和 `ann_file` 由验证集相应地改为测试集。在生成结果后，您可以压缩文件夹并上传至 KITTI 评估服务器上。
+
+## 定性评估
+
+MMDetection3D 还提供了通用的可视化工具，以便于我们可以对训练好的模型预测的检测结果有一个直观的感受。您也可以在评估阶段通过设置 `--show` 来在线可视化检测结果，或者使用 `tools/misc/visualize_results.py` 来离线地进行可视化。此外，我们还提供了脚本 `tools/misc/browse_dataset.py` 用于可视化数据集而不做推理。更多的细节请参考[可视化文档](https://mmdetection3d.readthedocs.io/zh_CN/dev-1.x/user_guides/visualization.html)。
diff --git a/mmde/docs/zh_cn/advanced_guides/supported_tasks/lidar_sem_seg3d.md b/mmde/docs/zh_cn/advanced_guides/supported_tasks/lidar_sem_seg3d.md
new file mode 100644
index 0000000000000000000000000000000000000000..d35636ad7aa830b87d4a53a576c950032030ce03
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/supported_tasks/lidar_sem_seg3d.md
@@ -0,0 +1,78 @@
+# 基于激光雷达的 3D 语义分割
+
+基于激光雷达的 3D 语义分割是 MMDetection3D 支持的最基础的任务之一。它期望给定的模型以激光雷达采集的任意数量的特征点为输入，并预测每个输入点的语义标签。接下来，我们以 ScanNet 数据集上的 PointNet++ (SSG) 为例，展示如何准备数据，在标准的 3D 语义分割基准上训练并测试模型，以及可视化并验证结果。
+
+## 数据准备
+
+首先，我们需要从 ScanNet [官方网站](http://kaldir.vc.in.tum.de/scannet_benchmark/documentation)下载原始数据。
+
+由于不同数据集的原始数据有不同的组织方式，我们通常需要用 pkl 或 json 文件收集有用的数据信息。
+
+因此，在准备好所有的原始数据之后，我们可以遵循 [ScanNet 文档](https://github.com/open-mmlab/mmdetection3d/blob/master/data/scannet/README.md/)中的说明生成数据信息。
+
+随后，相关的目录结构将如下所示：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── scannet
+│   │   ├── scannet_utils.py
+│   │   ├── batch_load_scannet_data.py
+│   │   ├── load_scannet_data.py
+│   │   ├── scannet_utils.py
+│   │   ├── README.md
+│   │   ├── scans
+│   │   ├── scans_test
+│   │   ├── scannet_instance_data
+│   │   ├── points
+│   │   ├── instance_mask
+│   │   ├── semantic_mask
+│   │   ├── seg_info
+│   │   │   ├── train_label_weight.npy
+│   │   │   ├── train_resampled_scene_idxs.npy
+│   │   │   ├── val_label_weight.npy
+│   │   │   ├── val_resampled_scene_idxs.npy
+│   │   ├── scannet_infos_train.pkl
+│   │   ├── scannet_infos_val.pkl
+│   │   ├── scannet_infos_test.pkl
+```
+
+## 训练
+
+接着，我们将使用提供的配置文件训练 PointNet++ (SSG) 模型。当你使用不同的 GPU 设置进行训练时，你基本上可以按照这个[教程](https://mmdetection3d.readthedocs.io/zh_CN/latest/1_exist_data_model.html#inference-with-existing-models)的示例脚本。假设我们在一台具有 2 块 GPU 的机器上使用分布式训练：
+
+```
+./tools/dist_train.sh configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg.py 2
+```
+
+注意，配置文件名中的 `16x2` 是指训练时用了 2 块 GPU，每块 GPU 上有 16 个样本。如果你的自定义设置不同于此，那么有时候你需要相应的调整学习率。基本规则可以参考[此处](https://arxiv.org/abs/1706.02677)。
+
+## 定量评估
+
+在训练期间，模型权重将会根据配置文件中的 `train_cfg = dict(val_interval=xxx)` 设置被周期性地评估。我们支持不同数据集的官方评估方案。对于 ScanNet，将使用 20 个类别的平均交并比 (mIoU) 对模型进行评估。评估结果将会被打印到终端中，如下所示：
+
+```
++---------+--------+--------+---------+--------+--------+--------+--------+--------+--------+-----------+---------+---------+--------+---------+--------------+----------------+--------+--------+---------+----------------+--------+--------+---------+
+| classes | wall   | floor  | cabinet | bed    | chair  | sofa   | table  | door   | window | bookshelf | picture | counter | desk   | curtain | refrigerator | showercurtrain | toilet | sink   | bathtub | otherfurniture | miou   | acc    | acc_cls |
++---------+--------+--------+---------+--------+--------+--------+--------+--------+--------+-----------+---------+---------+--------+---------+--------------+----------------+--------+--------+---------+----------------+--------+--------+---------+
+| results | 0.7257 | 0.9373 | 0.4625  | 0.6613 | 0.7707 | 0.5562 | 0.5864 | 0.4010 | 0.4558 | 0.7011    | 0.2500  | 0.4645  | 0.4540 | 0.5399  | 0.2802       | 0.3488         | 0.7359 | 0.4971 | 0.6922  | 0.3681         | 0.5444 | 0.8118 | 0.6695  |
++---------+--------+--------+---------+--------+--------+--------+--------+--------+--------+-----------+---------+---------+--------+---------+--------------+----------------+--------+--------+---------+----------------+--------+--------+---------+
+```
+
+此外，在训练完成后你也可以评估特定的模型权重文件。你可以简单地执行以下脚本：
+
+```
+./tools/dist_test.sh configs/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet-seg.py work_dirs/pointnet2_ssg/latest.pth 8
+```
+
+## 测试与提交
+
+如果你只想在在线基准上进行推理或测试模型性能，你需要在配置文件中的 `test_evalutor` 字段增加 `submission_prefix`， 例如配置文件增加 `test_evaluator = dict(type='SegMetric',submission_prefix=work_dirs/pointnet2_ssg/test_submission`)。
+并将 ScanNet 数据集[配置文件](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/_base_/datasets/scannet-seg.py#L129)中的 `ann_file=scannet_infos_val.pkl` 变成 `ann_file=scannet_infos_test.pkl`。在生成结果后，你可以压缩文件夹并上传至 [ScanNet 评估服务器](http://kaldir.vc.in.tum.de/scannet_benchmark/semantic_label_3d)上。
+
+## 定性评估
+
+MMDetection3D 还提供了通用的可视化工具，以便于我们可以对训练好的模型预测的分割结果有一个直观的感受。你也可以在评估阶段通过设置 `--eval-options 'show=True' 'out_dir=${SHOW_DIR}'` 来在线可视化分割结果，或者使用 `tools/misc/visualize_results.py` 来离线地进行可视化。此外，我们还提供了脚本 `tools/misc/browse_dataset.py` 用于可视化数据集而不做推理。更多的细节请参考[可视化文档](https://mmdetection3d.readthedocs.io/zh_CN/latest/useful_tools.html#visualization)。
diff --git a/mmde/docs/zh_cn/advanced_guides/supported_tasks/vision_det3d.md b/mmde/docs/zh_cn/advanced_guides/supported_tasks/vision_det3d.md
new file mode 100644
index 0000000000000000000000000000000000000000..ff5917cae810a19c4b21ca06de3b5eb5b586c2e3
--- /dev/null
+++ b/mmde/docs/zh_cn/advanced_guides/supported_tasks/vision_det3d.md
@@ -0,0 +1,114 @@
+# 基于视觉的 3D 检测
+
+基于视觉的 3D 检测是指基于纯视觉输入的 3D 检测方法，例如基于单目、双目和多视图图像的 3D 检测。目前，我们只支持单目和多视图的 3D 检测方法。其他方法也应该与我们的框架兼容，并在将来得到支持。
+
+它期望给定的模型以任意数量的图像作为输入，并为每一个感兴趣的目标预测 3D 框及类别标签。以 nuScenes 数据集 FCOS3D 为例，我们将展示如何准备数据，在标准的 3D 检测基准上训练并测试模型，以及可视化并验证结果。
+
+## 数据准备
+
+首先，我们需要下载原始数据并按照[数据准备文档](https://mmdetection3d.readthedocs.io/zh_CN/latest/data_preparation.html)中提供的标准方式重新组织数据。
+
+由于不同数据集的原始数据有不同的组织方式，我们通常需要用 pkl 或 json 文件收集有用的数据信息。因此，在准备好所有的原始数据之后，我们需要运行 `create_data.py` 中提供的脚本来为不同的数据集生成数据信息。例如，对于 nuScenes，我们需要运行如下命令：
+
+```
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
+```
+
+随后，相关的目录结构将如下所示：
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── v1.0-test
+|   |   ├── v1.0-trainval
+│   │   ├── nuscenes_database
+│   │   ├── nuscenes_infos_train.pkl
+│   │   ├── nuscenes_infos_trainval.pkl
+│   │   ├── nuscenes_infos_val.pkl
+│   │   ├── nuscenes_infos_test.pkl
+│   │   ├── nuscenes_dbinfos_train.pkl
+│   │   ├── nuscenes_infos_train_mono3d.coco.json
+│   │   ├── nuscenes_infos_trainval_mono3d.coco.json
+│   │   ├── nuscenes_infos_val_mono3d.coco.json
+│   │   ├── nuscenes_infos_test_mono3d.coco.json
+```
+
+注意，此处的 pkl 文件主要用于使用 LiDAR 数据的方法，json 文件用于 2D 检测/纯视觉的 3D 检测。在 v0.13.0 支持单目 3D 检测之前，json 文件只包含 2D 检测的信息，因此如果你需要最新的信息，请切换到 v0.13.0 之后的分支。
+
+## 训练
+
+接着，我们将使用提供的配置文件训练 FCOS3D。基本的脚本与其他模型一样。当你使用不同的 GPU 设置进行训练时，你基本上可以按照这个[教程](https://mmdetection3d.readthedocs.io/zh_CN/latest/1_exist_data_model.html#inference-with-existing-models)的示例。假设我们在一台具有 8 块 GPU 的机器上使用分布式训练：
+
+```
+./tools/dist_train.sh configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py 8
+```
+
+注意，配置文件名中的 `2x8` 是指训练时用了 8 块 GPU，每块 GPU 上有 2 个数据样本。如果你的自定义设置不同于此，那么有时候你需要相应的调整学习率。基本规则可以参考[此处](https://arxiv.org/abs/1706.02677)。
+
+我们也可以通过运行以下命令微调 FCOS3D，从而达到更好的性能：
+
+```
+./tools/dist_train.sh fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune.py 8
+```
+
+通过先前的脚本训练好一个基准模型后，请记得相应的修改[此处](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune.py#L8)的路径。
+
+## 定量评估
+
+在训练期间，模型权重文件将会根据配置文件中的 `evaluation = dict(interval=xxx)` 设置被周期性地评估。
+
+我们支持不同数据集的官方评估方案。由于输出格式与基于其他模态的 3D 检测相同，因此评估方法也是一样的。
+
+对于 nuScenes，将使用基于距离的平均精度（mAP）以及 nuScenes 检测分数（NDS）分别对 10 个类别进行评估。评估结果将会被打印到终端中，如下所示：
+
+```
+mAP: 0.3197
+mATE: 0.7595
+mASE: 0.2700
+mAOE: 0.4918
+mAVE: 1.3307
+mAAE: 0.1724
+NDS: 0.3905
+Eval time: 170.8s
+
+Per-class results:
+Object Class    AP      ATE     ASE     AOE     AVE     AAE
+car     0.503   0.577   0.152   0.111   2.096   0.136
+truck   0.223   0.857   0.224   0.220   1.389   0.179
+bus     0.294   0.855   0.204   0.190   2.689   0.283
+trailer 0.081   1.094   0.243   0.553   0.742   0.167
+construction_vehicle    0.058   1.017   0.450   1.019   0.137   0.341
+pedestrian      0.392   0.687   0.284   0.694   0.876   0.158
+motorcycle      0.317   0.737   0.265   0.580   2.033   0.104
+bicycle 0.308   0.704   0.299   0.892   0.683   0.010
+traffic_cone    0.555   0.486   0.309   nan     nan     nan
+barrier 0.466   0.581   0.269   0.169   nan     nan
+```
+
+此外，在训练完成后你也可以评估特定的模型权重文件。你可以简单地执行以下脚本：
+
+```
+./tools/dist_test.sh configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py \
+    work_dirs/fcos3d/latest.pth --eval mAP
+```
+
+## 测试与提交
+
+如果你只想在在线基准上进行推理或测试模型性能，你需要将之前评估脚本中的 `--eval mAP` 替换成 `--format-only`，并在需要的情况下指定 `jsonfile_prefix`，例如，添加选项 `--eval-options jsonfile_prefix=work_dirs/fcos3d/test_submission`。请确保配置文件中的[测试信息](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/_base_/datasets/nus-mono3d.py#L93)由验证集相应地改为测试集。
+
+在生成结果后，你可以压缩文件夹并上传至 nuScenes 3D 检测挑战的 evalAI 评估服务器上。
+
+## 定性评估
+
+MMDetection3D 还提供了通用的可视化工具，以便于我们可以对训练好的模型预测的检测结果有一个直观的感受。你也可以在评估阶段通过设置 `--eval-options 'show=True' 'out_dir=${SHOW_DIR}'` 来在线可视化检测结果，或者使用 `tools/misc/visualize_results.py` 来离线地进行可视化。
+
+此外，我们还提供了脚本 `tools/misc/browse_dataset.py` 用于可视化数据集而不做推理。更多的细节请参考[可视化文档](https://mmdetection3d.readthedocs.io/zh_CN/latest/useful_tools.html#visualization)。
+
+注意，目前我们仅支持纯视觉方法在图像上的可视化。将来我们将集成在前景图以及鸟瞰图（BEV）中的可视化。
diff --git a/mmde/docs/zh_cn/api.rst b/mmde/docs/zh_cn/api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..777e6f472da5ce023073c80abb54166fb90df2d3
--- /dev/null
+++ b/mmde/docs/zh_cn/api.rst
@@ -0,0 +1,154 @@
+mmdet3d.apis
+--------------
+.. automodule:: mmdet3d.apis
+    :members:
+
+mmdet3d.datasets
+--------------
+
+datasets
+^^^^^^^^^^
+.. automodule:: mmdet3d.datasets
+    :members:
+
+transforms
+^^^^^^^^^^^^
+.. automodule:: mmdet3d.datasets.transforms
+    :members:
+
+mmdet3d.engine
+--------------
+
+hooks
+^^^^^^^^^^
+.. automodule:: mmdet3d.engine.hooks
+    :members:
+
+mmdet3d.evaluation
+--------------------
+
+functional
+^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.evaluation.functional
+    :members:
+
+metrics
+^^^^^^^^^^
+.. automodule:: mmdet3d.evaluation.metrics
+    :members:
+
+mmdet3d.models
+--------------
+
+backbones
+^^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.backbones
+    :members:
+
+data_preprocessors
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.data_preprocessors
+    :members:
+
+decode_heads
+^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.decode_heads
+    :members:
+
+dense_heads
+^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.dense_heads
+    :members:
+
+detectors
+^^^^^^^^^^
+.. automodule:: mmdet3d.models.detectors
+    :members:
+
+layers
+^^^^^^^^^^
+.. automodule:: mmdet3d.models.layers
+    :members:
+
+losses
+^^^^^^^^^^
+.. automodule:: mmdet3d.models.losses
+    :members:
+
+middle_encoders
+^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.middle_encoders
+    :members:
+
+necks
+^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.necks
+    :members:
+
+roi_heads
+^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.roi_heads
+    :members:
+
+segmentors
+^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.segmentors
+    :members:
+
+task_modules
+^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.task_modules
+    :members:
+
+test_time_augs
+^^^^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.test_time_augs
+    :members:
+
+utils
+^^^^^^^^^^
+.. automodule:: mmdet3d.models.utils
+    :members:
+
+voxel_encoders
+^^^^^^^^^^^^^
+.. automodule:: mmdet3d.models.voxel_encoders
+    :members:
+
+mmdet3d.structures
+--------------------
+
+structures
+^^^^^^^^^^^^^^^^^
+.. automodule:: mmdet3d.structures
+    :members:
+
+bbox_3d
+^^^^^^^^^^
+.. automodule:: mmdet3d.structures.bbox_3d
+    :members:
+
+ops
+^^^^^^^^^^
+.. automodule:: mmdet3d.structures.ops
+    :members:
+
+points
+^^^^^^^^^^
+.. automodule:: mmdet3d.structures.points
+    :members:
+
+mmdet3d.testing
+----------------
+.. automodule:: mmdet3d.testing
+    :members:
+
+mmdet3d.visualization
+--------------------
+.. automodule:: mmdet3d.visualization
+    :members:
+
+mmdet3d.utils
+--------------
+.. automodule:: mmdet3d.utils
+    :members:
diff --git a/mmde/docs/zh_cn/conf.py b/mmde/docs/zh_cn/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d6aa456519fadc4616cd9b6070d66c70c3afa9b
--- /dev/null
+++ b/mmde/docs/zh_cn/conf.py
@@ -0,0 +1,160 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+
+import os
+import subprocess
+import sys
+
+import pytorch_sphinx_theme
+
+sys.path.insert(0, os.path.abspath('../../'))
+
+# -- Project information -----------------------------------------------------
+
+project = 'MMDetection3D'
+copyright = '2020-2023, OpenMMLab'
+author = 'MMDetection3D Authors'
+
+# The full version, including alpha/beta/rc tags
+version_file = '../../mmdet3d/version.py'
+with open(version_file) as f:
+    exec(compile(f.read(), version_file, 'exec'))
+__version__ = locals()['__version__']
+release = __version__
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.viewcode',
+    'sphinx_markdown_tables',
+    'sphinx_copybutton',
+    'myst_parser',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.autodoc.typehints',
+    'sphinx.ext.autosummary',
+    'sphinx.ext.autosectionlabel',
+    'sphinx_tabs.tabs',
+]
+autodoc_typehints = 'description'
+autodoc_mock_imports = ['mmcv._ext']
+autosummary_generate = True  # Turn on sphinx.ext.autosummary
+
+# Ignore >>> when copying code
+copybutton_prompt_text = r'>>> |\.\.\. '
+copybutton_prompt_is_regexp = True
+
+myst_enable_extensions = ['colon_fence']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+source_suffix = {
+    '.rst': 'restructuredtext',
+    '.md': 'markdown',
+}
+
+# The master toctree document.
+master_doc = 'index'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+# html_theme = 'sphinx_rtd_theme'
+html_theme = 'pytorch_sphinx_theme'
+html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
+html_theme_options = {
+    'menu': [
+        {
+            'name': 'GitHub',
+            'url': 'https://github.com/open-mmlab/mmdetection3d'
+        },
+        {
+            'name':
+            '上游库',
+            'children': [
+                {
+                    'name': 'MMEngine',
+                    'url': 'https://github.com/open-mmlab/mmengine',
+                    'description': '深度学习模型训练基础库'
+                },
+                {
+                    'name': 'MMCV',
+                    'url': 'https://github.com/open-mmlab/mmcv',
+                    'description': '基础视觉库'
+                },
+                {
+                    'name': 'MMDetection',
+                    'url': 'https://github.com/open-mmlab/mmdetection',
+                    'description': '目标检测工具箱'
+                },
+            ]
+        },
+    ],
+    # Specify the language of shared menu
+    'menu_lang':
+    'en'
+}
+
+language = 'en'
+
+master_doc = 'index'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+html_css_files = [
+    'https://cdn.datatables.net/1.13.2/css/dataTables.bootstrap5.min.css',
+    'css/readthedocs.css'
+]
+html_js_files = [
+    'https://cdn.datatables.net/1.13.2/js/jquery.dataTables.min.js',
+    'https://cdn.datatables.net/1.13.2/js/dataTables.bootstrap5.min.js',
+    'js/collapsed.js',
+    'js/table.js',
+]
+
+myst_heading_anchors = 4
+
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3', None),
+    'numpy': ('https://numpy.org/doc/stable', None),
+    'torch': ('https://pytorch.org/docs/stable/', None),
+    'mmcv': ('https://mmcv.readthedocs.io/zh_CN/latest/', None),
+    'mmengine': ('https://mmengine.readthedocs.io/zh_CN/latest/', None),
+    'mmdetection': ('https://mmdetection.readthedocs.io/zh_CN/latest/', None),
+}
+
+
+def builder_inited_handler(app):
+    subprocess.run(['./stat.py'])
+
+
+def setup(app):
+    app.connect('builder-inited', builder_inited_handler)
diff --git a/mmde/docs/zh_cn/get_started.md b/mmde/docs/zh_cn/get_started.md
new file mode 100644
index 0000000000000000000000000000000000000000..224a44c4abb0e4ad589254097884d1a65e3bbead
--- /dev/null
+++ b/mmde/docs/zh_cn/get_started.md
@@ -0,0 +1,298 @@
+# 开始你的第一步
+
+## 依赖
+
+在本节中，我们将展示如何使用 PyTorch 准备环境。
+
+MMDetection3D 支持在 Linux，Windows（实验性支持），MacOS 上运行，它需要 Python 3.7 以上，CUDA 9.2 以上和 PyTorch 1.6 以上。
+
+```{note}
+如果您对 PyTorch 有经验并且已经安装了它，您可以直接跳转到[下一小节](#安装流程)。否则，您可以按照下述步骤进行准备。
+```
+
+**步骤 0.** 从[官方网站](https://docs.conda.io/en/latest/miniconda.html)下载并安装 Miniconda。
+
+**步骤 1.** 创建并激活一个 conda 环境。
+
+```shell
+conda create --name openmmlab python=3.8 -y
+conda activate openmmlab
+```
+
+**步骤 2.** 基于 [PyTorch 官方说明](https://pytorch.org/get-started/locally/)安装 PyTorch，例如：
+
+在 GPU 平台上：
+
+```shell
+conda install pytorch torchvision -c pytorch
+```
+
+在 CPU 平台上：
+
+```shell
+conda install pytorch torchvision cpuonly -c pytorch
+```
+
+## 安装流程
+
+我们推荐用户参照我们的最佳实践安装 MMDetection3D。不过，整个过程也是可定制化的，更多信息请参考[自定义安装](#自定义安装)章节。
+
+### 最佳实践
+
+**步骤 0.** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMEngine](https://github.com/open-mmlab/mmengine)，[MMCV](https://github.com/open-mmlab/mmcv) 和 [MMDetection](https://github.com/open-mmlab/mmdetection)。
+
+```shell
+pip install -U openmim
+mim install mmengine
+mim install 'mmcv>=2.0.0rc4'
+mim install 'mmdet>=3.0.0'
+```
+
+**注意**：在 MMCV-v2.x 中，`mmcv-full` 改名为 `mmcv`，如果您想安装不包含 CUDA 算子的 `mmcv`，您可以使用 `mim install "mmcv-lite>=2.0.0rc4"` 安装精简版。
+
+**步骤 1.** 安装 MMDetection3D。
+
+方案 a：如果您开发并直接运行 mmdet3d，从源码安装它：
+
+```shell
+git clone https://github.com/open-mmlab/mmdetection3d.git -b dev-1.x
+# "-b dev-1.x" 表示切换到 `dev-1.x` 分支。
+cd mmdetection3d
+pip install -v -e .
+# "-v" 指详细说明，或更多的输出
+# "-e" 表示在可编辑模式下安装项目，因此对代码所做的任何本地修改都会生效，从而无需重新安装。
+```
+
+方案 b：如果您将 mmdet3d 作为依赖或第三方 Python 包使用，使用 MIM 安装：
+
+```shell
+mim install "mmdet3d>=1.1.0rc0"
+```
+
+注意：
+
+1. 如果您希望使用 `opencv-python-headless` 而不是 `opencv-python`，您可以在安装 MMCV 之前安装它。
+
+2. 一些安装依赖是可选的。简单地运行 `pip install -v -e .` 将会安装最低运行要求的版本。如果想要使用一些可选依赖项，例如 `albumentations` 和 `imagecorruptions`，可以使用 `pip install -r requirements/optional.txt` 进行手动安装，或者在使用 `pip` 时指定所需的附加功能（例如 `pip install -v -e .[optional]`），支持附加功能的有效键值包括 `all`、`tests`、`build` 以及 `optional`。
+
+   我们已经支持 `spconv 2.0`。如果用户已经安装 `spconv 2.0`，代码会默认使用 `spconv 2.0`，它会比原生 `mmcv spconv` 使用更少的 GPU 内存。用户可以使用下列的命令来安装 `spconv 2.0`：
+
+   ```shell
+   pip install cumm-cuxxx
+   pip install spconv-cuxxx
+   ```
+
+   `xxx` 表示环境中的 CUDA 版本。
+
+   例如，使用 CUDA 10.2，对应命令是 `pip install cumm-cu102 && pip install spconv-cu102`。
+
+   支持的 CUDA 版本包括 10.2，11.1，11.3 和 11.4。用户也可以通过源码编译来安装。更多细节请参考[spconv v2.x](https://github.com/traveller59/spconv)。
+
+   我们也支持 `Minkowski Engine` 作为稀疏卷积的后端。如果需要，请参考[安装指南](https://github.com/NVIDIA/MinkowskiEngine#installation) 或者使用 `pip` 来安装：
+
+   ```shell
+   conda install openblas-devel -c anaconda
+   export CPLUS_INCLUDE_PATH=CPLUS_INCLUDE_PATH:${YOUR_CONDA_ENVS_DIR}/include
+   # replace ${YOUR_CONDA_ENVS_DIR} to your anaconda environment path e.g. `/home/username/anaconda3/envs/openmmlab`.
+   pip install -U git+https://github.com/NVIDIA/MinkowskiEngine -v --no-deps --install-option="--blas_include_dirs=/opt/conda/include" --install-option="--blas=openblas"
+   ```
+
+   我们还支持 `Torchsparse` 作为稀疏卷积的后端。如果需要，请参考[安装指南](https://github.com/mit-han-lab/torchsparse#installation) 或者使用 `pip` 来安装：
+
+   ```shell
+   sudo apt install libsparsehash-dev
+   pip install --upgrade git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0
+   ```
+
+   或者通过以下安装绕过sudo权限
+
+   ```shell
+   conda install -c bioconda sparsehash
+   export CPLUS_INCLUDE_PATH=CPLUS_INCLUDE_PATH:${YOUR_CONDA_ENVS_DIR}/include
+    # replace ${YOUR_CONDA_ENVS_DIR} to your anaconda environment path e.g. `/home/username/anaconda3/envs/openmmlab`.
+   pip install --upgrade git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0
+   ```
+
+3. 我们的代码目前不能在只有 CPU 的环境（CUDA 不可用）下编译。
+
+### 验证安装
+
+为了验证 MMDetection3D 是否安装正确，我们提供了一些示例代码来执行模型推理。
+
+**步骤 1.** 我们需要下载配置文件和模型权重文件。
+
+```shell
+mim download mmdet3d --config pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car --dest .
+```
+
+下载将需要几秒钟或更长时间，这取决于您的网络环境。完成后，您会在当前文件夹中发现两个文件 `pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py` 和 `hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth`。
+
+**步骤 2.** 推理验证。
+
+方案 a：如果您从源码安装 MMDetection3D，那么直接运行以下命令进行验证：
+
+```shell
+python demo/pcd_demo.py demo/data/kitti/000008.bin pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth --show
+```
+
+您会看到一个带有点云的可视化界面，其中包含有在汽车上绘制的检测框。
+
+**注意**：
+
+如果你在没有显示设备的服务器上安装 MMDetection3D ，你可以忽略 `--show` 参数。Demo 仍会将预测结果保存到 `outputs/pred/000008.json` 文件中。
+
+**注意**：
+
+如果您想输入一个 `.ply` 文件，您可以使用如下函数将它转换成 `.bin` 格式。然后您可以使用转化的 `.bin` 文件来运行样例。请注意在使用此脚本之前，您需要安装 `pandas` 和 `plyfile`。这个函数也可以用于训练 `ply 数据`时作为数据预处理来使用。
+
+```python
+import numpy as np
+import pandas as pd
+from plyfile import PlyData
+
+def convert_ply(input_path, output_path):
+    plydata = PlyData.read(input_path)  # 读取文件
+    data = plydata.elements[0].data  # 读取数据
+    data_pd = pd.DataFrame(data)  # 转换成 DataFrame
+    data_np = np.zeros(data_pd.shape, dtype=np.float)  # 初始化数组来存储数据
+    property_names = data[0].dtype.names  # 读取属性名称
+    for i, name in enumerate(
+            property_names):  # 通过属性读取数据
+        data_np[:, i] = data_pd[name]
+    data_np.astype(np.float32).tofile(output_path)
+```
+
+例如：
+
+```python
+convert_ply('./test.ply', './test.bin')
+```
+
+如果您有其他格式的点云数据（`.off`，`.obj` 等），您可以使用 `trimesh` 将它们转化成 `.ply`。
+
+```python
+import trimesh
+
+def to_ply(input_path, output_path, original_type):
+    mesh = trimesh.load(input_path, file_type=original_type)  # 读取文件
+    mesh.export(output_path, file_type='ply')  # 转换成 ply
+```
+
+例如：
+
+```python
+to_ply('./test.obj', './test.ply', 'obj')
+```
+
+方案 b：如果您使用 MIM 安装 MMDetection3D，那么可以打开您的 Python 解析器，复制并粘贴以下代码：
+
+```python
+from mmdet3d.apis import init_model, inference_detector
+
+config_file = 'pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py'
+checkpoint_file = 'hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth'
+model = init_model(config_file, checkpoint_file)
+inference_detector(model, 'demo/data/kitti/000008.bin')
+```
+
+您将会看到一个包含 `Det3DDataSample` 的列表，预测结果在 `pred_instances_3d` 里面，包含有检测框，类别和得分。
+
+### 自定义安装
+
+#### CUDA 版本
+
+在安装 PyTorch 时，您需要指定 CUDA 的版本。如果您不清楚应该选择哪一个，请遵循我们的建议：
+
+- 对于 Ampere 架构的 NVIDIA GPU，例如 GeForce 30 系列以及 NVIDIA A100，CUDA 11 是必需的。
+- 对于更早的 NVIDIA GPU，CUDA 11 是向后兼容的，但 CUDA 10.2 提供更好的兼容性，并且更轻量。
+
+请确保 GPU 驱动版本满足最低的版本需求。更多信息请参考此[表格](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions)。
+
+```{note}
+如果您遵循我们的最佳实践，您只需要安装 CUDA 运行库，这是因为不需要在本地编译 CUDA 代码。但如果您希望从源码编译 MMCV，或者开发其他 CUDA 算子，那么您需要从 NVIDIA 的[官网](https://developer.nvidia.com/cuda-downloads)安装完整的 CUDA 工具链，并且该版本应该与 PyTorch 的 CUDA 版本相匹配，比如在 `conda install` 指令里指定 cudatoolkit 版本。
+```
+
+#### 不通过 MIM 安装 MMEngine
+
+如果想要使用 pip 而不是 MIM 安装 MMEngine，请参考 [MMEngine 安装指南](https://mmengine.readthedocs.io/zh_CN/latest/get_started/installation.html)。
+
+例如，您可以通过以下指令安装 MMEngine：
+
+```shell
+pip install mmengine
+```
+
+#### 不通过 MIM 安装 MMCV
+
+MMCV 包含 C++ 和 CUDA 拓展，因此其对 PyTorch 的依赖更复杂。MIM 会自动解决此类依赖关系并使安装更容易。但这不是必需的。
+
+如果想要使用 pip 而不是 MIM 安装 MMCV，请参考 [MMCV 安装指南](https://mmcv.readthedocs.io/zh_CN/2.x/get_started/installation.html)。这需要用指定 url 的形式手动指定对应的 PyTorch 和 CUDA 版本。
+
+例如，下述指令将会安装基于 PyTorch 1.12.x 和 CUDA 11.6 编译的 MMCV：
+
+```shell
+pip install "mmcv>=2.0.0rc4" -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.12.0/index.html
+```
+
+#### 在 Google Colab 中安装
+
+[Google Colab](https://colab.research.google.com/) 通常已经安装了 PyTorch，因此我们只需要用如下命令安装 MMEngine，MMCV，MMDetection 和 MMDetection3D 即可。
+
+**步骤 1.** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMEngine](https://github.com/open-mmlab/mmengine)，[MMCV](https://github.com/open-mmlab/mmcv) 和 [MMDetection](https://github.com/open-mmlab/mmdetection)。
+
+```shell
+!pip3 install openmim
+!mim install mmengine
+!mim install "mmcv>=2.0.0rc4,<2.1.0"
+!mim install "mmdet>=3.0.0,<3.1.0"
+```
+
+**步骤 2.** 从源码安装 MMDetection3D。
+
+```shell
+!git clone https://github.com/open-mmlab/mmdetection3d.git -b dev-1.x
+%cd mmdetection3d
+!pip install -e .
+```
+
+**步骤 3.** 验证安装是否成功。
+
+```python
+import mmdet3d
+print(mmdet3d.__version__)
+# 预期输出：1.1.0rc0 或其它版本号。
+```
+
+```{note}
+在 Jupyter Notebook 中，感叹号 `!` 用于执行外部命令，而 `%cd` 是一个[魔术命令](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-cd)，用于切换 Python 的工作路径。
+```
+
+#### 通过 Docker 使用 MMDetection3D
+
+我们提供了 [Dockerfile](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docker/Dockerfile) 来构建一个镜像。请确保您的 [docker 版本](https://docs.docker.com/engine/install/) >= 19.03。
+
+```shell
+# 基于 PyTorch 1.9，CUDA 11.1 构建镜像
+# 如果您想要其他版本，只需要修改 Dockerfile
+docker build -t mmdetection3d docker/
+```
+
+用以下命令运行 Docker 镜像：
+
+```shell
+docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmdetection3d/data mmdetection3d
+```
+
+### 故障排除
+
+如果您在安装过程中遇到一些问题，请先参考 [FAQ](notes/faq.md) 页面。如果没有找到对应的解决方案，您也可以在 GitHub [提一个问题](https://github.com/open-mmlab/mmdetection3d/issues/new/choose)。
+
+### 使用多个 MMDetection3D 版本进行开发
+
+训练和测试的脚本已经在 `PYTHONPATH` 中进行了修改，以确保脚本使用当前目录中的 MMDetection3D。
+
+要使环境中安装默认版本的 MMDetection3D 而不是当前正在使用的，可以删除出现在相关脚本中的代码：
+
+```shell
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH
+```
diff --git a/mmde/docs/zh_cn/index.rst b/mmde/docs/zh_cn/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2916fcc7619b9ac7ef5fb5da9878e0e5a16848d4
--- /dev/null
+++ b/mmde/docs/zh_cn/index.rst
@@ -0,0 +1,55 @@
+欢迎来到 MMDetection3D 文档！
+==========================================
+
+.. toctree::
+   :maxdepth: 1
+   :caption: 开始你的第一步
+
+   get_started.md
+
+.. toctree::
+   :maxdepth: 2
+   :caption: 使用指南
+
+   user_guides/index.rst
+
+.. toctree::
+   :maxdepth: 2
+   :caption: 进阶教程
+
+   advanced_guides/index.rst
+
+.. toctree::
+   :maxdepth: 1
+   :caption: 迁移版本
+
+   migration.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: 接口文档（英文）
+
+   api.rst
+
+.. toctree::
+   :maxdepth: 1
+   :caption: 模型仓库
+
+   model_zoo.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: 说明
+
+   notes/index.rst
+
+.. toctree::
+   :caption: 语言切换
+
+   switch_language.md
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
diff --git a/mmde/docs/zh_cn/make.bat b/mmde/docs/zh_cn/make.bat
new file mode 100644
index 0000000000000000000000000000000000000000..922152e96a04a242e6fc40f124261d74890617d8
--- /dev/null
+++ b/mmde/docs/zh_cn/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/mmde/docs/zh_cn/model_zoo.md b/mmde/docs/zh_cn/model_zoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..ec08bb25ea797803b5ac05dd9e7b1920fb1c30e1
--- /dev/null
+++ b/mmde/docs/zh_cn/model_zoo.md
@@ -0,0 +1,141 @@
+# 模型库
+
+## 通用设置
+
+- 使用分布式训练；
+- 为了和其他代码库做公平对比，本文展示的是使用 `torch.cuda.max_memory_allocated()` 在 8 个 GPUs 上得到的最大 GPU 显存占用值，需要注意的是，这些显存占用值通常小于 `nvidia-smi` 显示出来的显存占用值；
+- 在模型库中所展示的推理时间是包括网络前向传播和后处理所需的总时间，不包括数据加载所需的时间，模型库中所展示的结果均由 [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/analysis_tools/benchmark.py) 脚本文件在 2000 张图像上所计算的平均时间。
+
+## 基准结果
+
+### SECOND
+
+请参考 [SECOND](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/second) 获取更多的细节，我们在 KITTI 和 Waymo 数据集上都给出了相应的基准结果。
+
+### PointPillars
+
+请参考 [PointPillars](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/pointpillars) 获取更多细节，我们在 KITTI 、nuScenes 、Lyft 、Waymo 数据集上给出了相应的基准结果。
+
+### Part-A2
+
+请参考 [Part-A2](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/parta2) 获取更多细节。
+
+### VoteNet
+
+请参考 [VoteNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/votenet) 获取更多细节，我们在 ScanNet 和 SUNRGBD 数据集上给出了相应的基准结果。
+
+### Dynamic Voxelization
+
+请参考 [Dynamic Voxelization](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/dynamic_voxelization) 获取更多细节。
+
+### MVXNet
+
+请参考 [MVXNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/mvxnet) 获取更多细节。
+
+### RegNetX
+
+请参考 [RegNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/regnet) 获取更多细节，我们将 pointpillars 的主干网络替换成 RegNetX，并在 nuScenes 和 Lyft 数据集上给出了相应的基准结果。
+
+### nuImages
+
+我们在 [nuImages 数据集](https://www.nuscenes.org/nuimages) 上也提供基准模型，请参考 [nuImages](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/nuimages) 获取更多细节，我们在该数据集上提供 Mask R-CNN ， Cascade Mask R-CNN 和 HTC 的结果。
+
+### H3DNet
+
+请参考 [H3DNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/h3dnet) 获取更多细节。
+
+### 3DSSD
+
+请参考 [3DSSD](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/3dssd) 获取更多细节。
+
+### CenterPoint
+
+请参考 [CenterPoint](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/centerpoint) 获取更多细节。
+
+### SSN
+
+请参考 [SSN](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/ssn) 获取更多细节，我们将 pointpillars 中的检测头替换成 SSN 模型中所使用的 ‘shape-aware grouping heads’，并在 nuScenes 和 Lyft 数据集上给出了相应的基准结果。
+
+### ImVoteNet
+
+请参考 [ImVoteNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/imvotenet) 获取更多细节，我们在 SUNRGBD 数据集上给出了相应的结果。
+
+### FCOS3D
+
+请参考 [FCOS3D](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/fcos3d) 获取更多细节，我们在 nuScenes 数据集上给出了相应的结果。
+
+### PointNet++
+
+请参考 [PointNet++](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/pointnet2) 获取更多细节，我们在 ScanNet 和 S3DIS 数据集上给出了相应的结果。
+
+### Group-Free-3D
+
+请参考 [Group-Free-3D](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/groupfree3d) 获取更多细节，我们在 ScanNet 数据集上给出了相应的结果。
+
+### ImVoxelNet
+
+请参考 [ImVoxelNet](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/imvoxelnet) 获取更多细节，我们在 KITTI 数据集上给出了相应的结果。
+
+### PAConv
+
+请参考 [PAConv](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/paconv) 获取更多细节，我们在 S3DIS 数据集上给出了相应的结果。
+
+### DGCNN
+
+请参考 [DGCNN](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/dgcnn) 获取更多细节，我们在 S3DIS 数据集上给出了相应的结果。
+
+### SMOKE
+
+请参考 [SMOKE](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/smoke) 获取更多细节，我们在 KITTI 数据集上给出了相应的结果。
+
+### PGD
+
+请参考 [PGD](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/pgd) 获取更多细节，我们在 KITTI 和 nuScenes 数据集上给出了相应的结果。
+
+### PointRCNN
+
+请参考 [PointRCNN](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/point_rcnn) 获取更多细节，我们在 KITTI 数据集上给出了相应的结果。
+
+### MonoFlex
+
+请参考 [MonoFlex](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/monoflex) 获取更多细节，我们在 KITTI 数据集上给出了相应的结果。
+
+### SA-SSD
+
+请参考 [SA-SSD](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/sassd) 获取更多的细节，我们在 KITTI 数据集上给出了相应的基准结果。
+
+### FCAF3D
+
+请参考 [FCAF3D](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/fcaf3d) 获取更多的细节，我们在 ScanNet, S3DIS 和 SUN RGB-D 数据集上给出了相应的基准结果。
+
+### PV-RCNN
+
+请参考 [PV-RCNN](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/pv_rcnn) 获取更多的细节，我们在 KITTI 数据集上给出了相应的基准结果。
+
+### BEVFusion
+
+请参考 [BEVFusion](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/BEVFusion) 获取更多的细节, 我们在 NuScenes 数据集上给出了相应的基准结果。
+
+### CenterFormer
+
+请参考 [CenterFormer](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/CenterFormer) 获取更多的细节, 我们在 Waymo 数据集上给出了相应的基准结果。
+
+### TR3D
+
+请参考 [TR3D](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/TR3D) 获取更多的细节, 我们在 ScanNet, SUN RGB-D 和 S3DIS 数据集上给出了相应的基准结果。
+
+### DETR3D
+
+请参考 [DETR3D](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/DETR3D) 获取更多的细节, 我们在 NuScenes 数据集上给出了相应的基准结果。
+
+### PETR
+
+请参考 [PETR](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/PETR) 获取更多的细节, 我们在 NuScenes 数据集上给出了相应的基准结果。
+
+### TPVFormer
+
+请参考 [TPVFormer](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/projects/TPVFormer) 获取更多的细节, 我们在 NuScenes 数据集上给出了相应的基准结果。
+
+### Mixed Precision (FP16) Training
+
+细节请参考 [Mixed Precision (FP16) Training 在 PointPillars 训练的样例](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/pointpillars/pointpillars_hv_fpn_sbn-all_8xb2-amp-2x_nus-3d.py)。
diff --git a/mmde/docs/zh_cn/notes/benchmarks.md b/mmde/docs/zh_cn/notes/benchmarks.md
new file mode 100644
index 0000000000000000000000000000000000000000..52b922707629536823bd0a2ecda21bd6c49f6d64
--- /dev/null
+++ b/mmde/docs/zh_cn/notes/benchmarks.md
@@ -0,0 +1,285 @@
+# 基准测试
+
+这里我们对 MMDetection3D 和其他开源 3D 目标检测代码库中模型的训练速度和测试速度进行了基准测试。
+
+## 配置
+
+- 硬件：8 NVIDIA Tesla V100 (32G) GPUs, Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+- 软件：Python 3.7, CUDA 10.1, cuDNN 7.6.5, PyTorch 1.3, numba 0.48.0.
+- 模型：由于不同代码库所实现的模型种类有所不同，在基准测试中我们选择了 SECOND、PointPillars、Part-A2 和 VoteNet 几种模型，分别与其他代码库中的相应模型实现进行了对比。
+- 度量方法：我们使用整个训练过程中的平均吞吐量作为度量方法，并跳过每个 epoch 的前 50 次迭代以消除训练预热的影响。
+
+## 主要结果
+
+对于模型的训练速度（样本/秒），我们将 MMDetection3D 与其他实现了相同模型的代码库进行了对比。结果如下所示，表格内的数字越大，代表模型的训练速度越快。代码库中不支持的模型使用 `×` 进行标识。
+
+|        模型         | MMDetection3D | OpenPCDet | votenet | Det3D |
+| :-----------------: | :-----------: | :-------: | :-----: | :---: |
+|       VoteNet       |      358      |     ×     |   77    |   ×   |
+|  PointPillars-car   |      141      |     ×     |    ×    |  140  |
+| PointPillars-3class |      107      |    44     |    ×    |   ×   |
+|       SECOND        |      40       |    30     |    ×    |   ×   |
+|       Part-A2       |      17       |    14     |    ×    |   ×   |
+
+## 测试细节
+
+### 为了计算速度所做的修改
+
+- __MMDetection3D__：我们尝试使用与其他代码库中尽可能相同的配置，具体配置细节见 [基准测试配置](https://github.com/open-mmlab/MMDetection3D/blob/main/configs/benchmark)。
+
+- __Det3D__：为了与 Det3D 进行比较，我们使用了 commit [519251e](https://github.com/poodarchu/Det3D/tree/519251e72a5c1fdd58972eabeac67808676b9bb7) 所对应的代码版本。
+
+- __OpenPCDet__：为了与 OpenPCDet 进行比较，我们使用了 commit [b32fbddb](https://github.com/open-mmlab/OpenPCDet/tree/b32fbddbe06183507bad433ed99b407cbc2175c2) 所对应的代码版本。
+
+  为了计算训练速度，我们在 `./tools/train_utils/train_utils.py` 文件中添加了用于记录运行时间的代码。我们对每个 epoch 的训练速度进行计算，并报告所有 epoch 的平均速度。
+
+  <details>
+    <summary>
+    （为了使用相同方法进行测试所做的具体修改 - 点击展开）
+    </summary>
+
+  ```diff
+  diff --git a/tools/train_utils/train_utils.py b/tools/train_utils/train_utils.py
+  index 91f21dd..021359d 100644
+  --- a/tools/train_utils/train_utils.py
+  +++ b/tools/train_utils/train_utils.py
+  @@ -2,6 +2,7 @@ import torch
+   import os
+   import glob
+   import tqdm
+  +import datetime
+   from torch.nn.utils import clip_grad_norm_
+
+
+  @@ -13,7 +14,10 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
+       if rank == 0:
+           pbar = tqdm.tqdm(total=total_it_each_epoch, leave=leave_pbar, desc='train', dynamic_ncols=True)
+
+  +    start_time = None
+       for cur_it in range(total_it_each_epoch):
+  +        if cur_it > 49 and start_time is None:
+  +            start_time = datetime.datetime.now()
+           try:
+               batch = next(dataloader_iter)
+           except StopIteration:
+  @@ -55,9 +59,11 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
+                   tb_log.add_scalar('learning_rate', cur_lr, accumulated_iter)
+                   for key, val in tb_dict.items():
+                       tb_log.add_scalar('train_' + key, val, accumulated_iter)
+  +    endtime = datetime.datetime.now()
+  +    speed = (endtime - start_time).seconds / (total_it_each_epoch - 50)
+       if rank == 0:
+           pbar.close()
+  -    return accumulated_iter
+  +    return accumulated_iter, speed
+
+
+   def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_cfg,
+  @@ -65,6 +71,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
+                   lr_warmup_scheduler=None, ckpt_save_interval=1, max_ckpt_save_num=50,
+                   merge_all_iters_to_one_epoch=False):
+       accumulated_iter = start_iter
+  +    speeds = []
+       with tqdm.trange(start_epoch, total_epochs, desc='epochs', dynamic_ncols=True, leave=(rank == 0)) as tbar:
+           total_it_each_epoch = len(train_loader)
+           if merge_all_iters_to_one_epoch:
+  @@ -82,7 +89,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
+                   cur_scheduler = lr_warmup_scheduler
+               else:
+                   cur_scheduler = lr_scheduler
+  -            accumulated_iter = train_one_epoch(
+  +            accumulated_iter, speed = train_one_epoch(
+                   model, optimizer, train_loader, model_func,
+                   lr_scheduler=cur_scheduler,
+                   accumulated_iter=accumulated_iter, optim_cfg=optim_cfg,
+  @@ -91,7 +98,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
+                   total_it_each_epoch=total_it_each_epoch,
+                   dataloader_iter=dataloader_iter
+               )
+  -
+  +            speeds.append(speed)
+               # save trained model
+               trained_epoch = cur_epoch + 1
+               if trained_epoch % ckpt_save_interval == 0 and rank == 0:
+  @@ -107,6 +114,8 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
+                   save_checkpoint(
+                       checkpoint_state(model, optimizer, trained_epoch, accumulated_iter), filename=ckpt_name,
+                   )
+  +            print(speed)
+  +    print(f'*******{sum(speeds) / len(speeds)}******')
+
+
+   def model_state_to_cpu(model_state):
+  ```
+
+  </details>
+
+### VoteNet
+
+- __MMDetection3D__：在 v0.1.0 版本下, 执行如下命令：
+
+  ```bash
+  ./tools/dist_train.sh configs/votenet/votenet_8xb16_sunrgbd-3d.py 8 --no-validate
+  ```
+
+- __votenet__：在 commit [2f6d6d3](https://github.com/facebookresearch/votenet/tree/2f6d6d36ff98d96901182e935afe48ccee82d566) 版本下，执行如下命令：
+
+  ```bash
+  python train.py --dataset sunrgbd --batch_size 16
+  ```
+
+  然后执行如下命令，对测试速度进行评估：
+
+  ```bash
+  python eval.py --dataset sunrgbd --checkpoint_path log_sunrgbd/checkpoint.tar --batch_size 1 --dump_dir eval_sunrgbd --cluster_sampling seed_fps --use_3d_nms --use_cls_nms --per_class_proposal
+  ```
+
+  注意，为了计算推理速度，我们对 `eval.py` 进行了修改。
+
+  <details>
+  <summary>
+  （为了对相同模型进行测试所做的具体修改 - 点击展开）
+  </summary>
+
+  ```diff
+  diff --git a/eval.py b/eval.py
+    index c0b2886..04921e9 100644
+    --- a/eval.py
+    +++ b/eval.py
+    @@ -10,6 +10,7 @@ import os
+     import sys
+     import numpy as np
+     from datetime import datetime
+    +import time
+     import argparse
+     import importlib
+     import torch
+    @@ -28,7 +29,7 @@ parser.add_argument('--checkpoint_path', default=None, help='Model checkpoint pa
+     parser.add_argument('--dump_dir', default=None, help='Dump dir to save sample outputs [default: None]')
+     parser.add_argument('--num_point', type=int, default=20000, help='Point Number [default: 20000]')
+     parser.add_argument('--num_target', type=int, default=256, help='Point Number [default: 256]')
+    -parser.add_argument('--batch_size', type=int, default=8, help='Batch Size during training [default: 8]')
+    +parser.add_argument('--batch_size', type=int, default=1, help='Batch Size during training [default: 8]')
+     parser.add_argument('--vote_factor', type=int, default=1, help='Number of votes generated from each seed [default: 1]')
+     parser.add_argument('--cluster_sampling', default='vote_fps', help='Sampling strategy for vote clusters: vote_fps, seed_fps, random [default: vote_fps]')
+     parser.add_argument('--ap_iou_thresholds', default='0.25,0.5', help='A list of AP IoU thresholds [default: 0.25,0.5]')
+    @@ -132,6 +133,7 @@ CONFIG_DICT = {'remove_empty_box': (not FLAGS.faster_eval), 'use_3d_nms': FLAGS.
+     # ------------------------------------------------------------------------- GLOBAL CONFIG END
+
+     def evaluate_one_epoch():
+    +    time_list = list()
+         stat_dict = {}
+         ap_calculator_list = [APCalculator(iou_thresh, DATASET_CONFIG.class2type) \
+             for iou_thresh in AP_IOU_THRESHOLDS]
+    @@ -144,6 +146,8 @@ def evaluate_one_epoch():
+
+             # Forward pass
+             inputs = {'point_clouds': batch_data_label['point_clouds']}
+    +        torch.cuda.synchronize()
+    +        start_time = time.perf_counter()
+             with torch.no_grad():
+                 end_points = net(inputs)
+
+    @@ -161,6 +165,12 @@ def evaluate_one_epoch():
+
+             batch_pred_map_cls = parse_predictions(end_points, CONFIG_DICT)
+             batch_gt_map_cls = parse_groundtruths(end_points, CONFIG_DICT)
+    +        torch.cuda.synchronize()
+    +        elapsed = time.perf_counter() - start_time
+    +        time_list.append(elapsed)
+    +
+    +        if len(time_list==200):
+    +            print("average inference time: %4f"%(sum(time_list[5:])/len(time_list[5:])))
+             for ap_calculator in ap_calculator_list:
+                 ap_calculator.step(batch_pred_map_cls, batch_gt_map_cls)
+
+  ```
+
+### PointPillars-car
+
+- __MMDetection3D__：在 v0.1.0 版本下, 执行如下命令：
+
+  ```bash
+  ./tools/dist_train.sh configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py 8 --no-validate
+  ```
+
+- __Det3D__：在 commit [519251e](https://github.com/poodarchu/Det3D/tree/519251e72a5c1fdd58972eabeac67808676b9bb7) 版本下，使用 `kitti_point_pillars_mghead_syncbn.py` 并执行如下命令：
+
+  ```bash
+  ./tools/scripts/train.sh --launcher=slurm --gpus=8
+  ```
+
+  注意，为了训练 PointPillars，我们对 `train.sh` 进行了修改。
+
+  <details>
+  <summary>
+  （为了对相同模型进行测试所做的具体修改 - 点击展开）
+  </summary>
+
+  ```diff
+  diff --git a/tools/scripts/train.sh b/tools/scripts/train.sh
+  index 3a93f95..461e0ea 100755
+  --- a/tools/scripts/train.sh
+  +++ b/tools/scripts/train.sh
+  @@ -16,9 +16,9 @@ then
+   fi
+
+   # Voxelnet
+  -python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py examples/second/configs/  kitti_car_vfev3_spmiddlefhd_rpn1_mghead_syncbn.py --work_dir=$SECOND_WORK_DIR
+  +# python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py examples/second/configs/  kitti_car_vfev3_spmiddlefhd_rpn1_mghead_syncbn.py --work_dir=$SECOND_WORK_DIR
+   # python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py examples/cbgs/configs/  nusc_all_vfev3_spmiddleresnetfhd_rpn2_mghead_syncbn.py --work_dir=$NUSC_CBGS_WORK_DIR
+   # python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py examples/second/configs/  lyft_all_vfev3_spmiddleresnetfhd_rpn2_mghead_syncbn.py --work_dir=$LYFT_CBGS_WORK_DIR
+
+   # PointPillars
+  -# python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py ./examples/point_pillars/configs/  original_pp_mghead_syncbn_kitti.py --work_dir=$PP_WORK_DIR
+  +python -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py ./examples/point_pillars/configs/  kitti_point_pillars_mghead_syncbn.py
+  ```
+
+  </details>
+
+### PointPillars-3class
+
+- __MMDetection3D__：在 v0.1.0 版本下, 执行如下命令：
+
+  ```bash
+  ./tools/dist_train.sh configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py 8 --no-validate
+  ```
+
+- __OpenPCDet__：在 commit [b32fbddb](https://github.com/open-mmlab/OpenPCDet/tree/b32fbddbe06183507bad433ed99b407cbc2175c2) 版本下，执行如下命令：
+
+  ```bash
+  cd tools
+  sh scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} 8  --cfg_file ./cfgs/kitti_models/pointpillar.yaml --batch_size 32  --workers 32 --epochs 80
+  ```
+
+### SECOND
+
+基准测试中的 SECOND 指在 [second.Pytorch](https://github.com/traveller59/second.pytorch) 首次被实现的 [SECONDv1.5](https://github.com/traveller59/second.pytorch/blob/master/second/configs/all.fhd.config)。Det3D 实现的 SECOND 中，使用了自己实现的 Multi-Group Head，因此无法将它的速度与其他代码库进行对比。
+
+- __MMDetection3D__：在 v0.1.0 版本下, 执行如下命令：
+
+  ```bash
+  ./tools/dist_train.sh configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py 8 --no-validate
+  ```
+
+- __OpenPCDet__：在 commit [b32fbddb](https://github.com/open-mmlab/OpenPCDet/tree/b32fbddbe06183507bad433ed99b407cbc2175c2) 版本下，执行如下命令：
+
+  ```bash
+  cd tools
+  sh ./scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} 8  --cfg_file ./cfgs/kitti_models/second.yaml --batch_size 32  --workers 32 --epochs 80
+  ```
+
+### Part-A2
+
+- __MMDetection3D__：在 v0.1.0 版本下, 执行如下命令：
+
+  ```bash
+  ./tools/dist_train.sh configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py 8 --no-validate
+  ```
+
+- __OpenPCDet__：在 commit [b32fbddb](https://github.com/open-mmlab/OpenPCDet/tree/b32fbddbe06183507bad433ed99b407cbc2175c2) 版本下，执行如下命令以进行模型训练：
+
+  ```bash
+  cd tools
+  sh ./scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} 8  --cfg_file ./cfgs/kitti_models/PartA2.yaml --batch_size 32 --workers 32 --epochs 80
+  ```
diff --git a/mmde/docs/zh_cn/notes/changelog.md b/mmde/docs/zh_cn/notes/changelog.md
new file mode 100644
index 0000000000000000000000000000000000000000..258cba0c8b5c5784edbbc891db1754ee08275dad
--- /dev/null
+++ b/mmde/docs/zh_cn/notes/changelog.md
@@ -0,0 +1 @@
+# v1.1 变更日志
diff --git a/mmde/docs/zh_cn/notes/changelog_v1.0.x.md b/mmde/docs/zh_cn/notes/changelog_v1.0.x.md
new file mode 100644
index 0000000000000000000000000000000000000000..d7916ef6cba182de9667ba47f5e6a6ddcdcd3295
--- /dev/null
+++ b/mmde/docs/zh_cn/notes/changelog_v1.0.x.md
@@ -0,0 +1 @@
+# v1.0.x 变更日志
diff --git a/mmde/docs/zh_cn/notes/compatibility.md b/mmde/docs/zh_cn/notes/compatibility.md
new file mode 100644
index 0000000000000000000000000000000000000000..97144d1394bc192f2534953417e7dae2be5c306b
--- /dev/null
+++ b/mmde/docs/zh_cn/notes/compatibility.md
@@ -0,0 +1 @@
+# 兼容性
diff --git a/mmde/docs/zh_cn/notes/faq.md b/mmde/docs/zh_cn/notes/faq.md
new file mode 100644
index 0000000000000000000000000000000000000000..3e1199cd78b73a2b11c67640e39f5ef12c23d402
--- /dev/null
+++ b/mmde/docs/zh_cn/notes/faq.md
@@ -0,0 +1,57 @@
+# 常见问题解答
+
+我们列出了一些用户和开发者在开发过程中会遇到的常见问题以及对应的解决方案，如果您发现了任何频繁出现的问题，请随时扩充本列表，非常欢迎您提出的任何解决方案。如果您在环境配置、模型训练等工作中遇到任何的问题，请使用[问题模板](https://github.com/open-mmlab/mmdetection3d/blob/master/.github/ISSUE_TEMPLATE/error-report.md)来创建相应的 issue，并将所需的所有信息填入到问题模板中，我们会尽快解决您的问题。
+
+## MMEngine/MMCV/MMDet/MMDet3D 安装
+
+- 跟 MMEngine, MMCV, MMDetection 和 MMDetection3D 相关的编译问题; "ConvWS is already registered in conv layer"; "AssertionError: MMCV==xxx is used but incompatible. Please install mmcv>=xxx, \<=xxx."
+
+- MMDetection3D 需要的 MMEngine, MMCV 和 MMDetection 的版本列在了下面。请安装正确版本的 MMEngine、MMCV 和 MMDetection 以避免相关的安装问题。
+
+  | MMDetection3D 版本 |      MMEngine 版本       |        MMCV 版本        |     MMDetection 版本     |
+  | ------------------ | :----------------------: | :---------------------: | :----------------------: |
+  | main               | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.4.0 |
+  | v1.4.0             | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.4.0 |
+  | v1.3.0             | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.2.0 | mmdet>=3.0.0rc5, \<3.3.0 |
+  | v1.2.0             | mmengine>=0.8.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 |  mmdet>=3.0.0, \<3.2.0   |
+  | v1.1.1             | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 |  mmdet>=3.0.0, \<3.1.0   |
+
+  **注意**：如果你想安装 mmdet3d-v1.0.0rcx，可以在[此处](https://mmdetection3d.readthedocs.io/en/latest/faq.html#mmcv-mmdet-mmdet3d-installation)找到 MMDetection，MMSegmentation 和 MMCV 的兼容版本。请选择正确版本的 MMCV、MMDetection 和 MMSegmentation 以避免安装问题。
+
+- 如果您在 `import open3d` 时遇到下面的问题：
+
+  `OSError: /lib/x86_64-linux-gnu/libm.so.6: version 'GLIBC_2.27' not found`
+
+  请将 open3d 的版本降级至 0.9.0.0，因为最新版 open3d 需要 'GLIBC_2.27' 文件的支持， Ubuntu 16.04 系统中缺失该文件，且该文件仅存在于 Ubuntu 18.04 及之后的系统中。
+
+- 如果您在 `import pycocotools` 时遇到版本错误的问题，这是由于 nuscenes-devkit 需要安装 pycocotools，然而 mmdet 依赖于 mmpycocotools，当前的解决方案如下所示，我们将会在之后全面支持 pycocotools ：
+
+  ```shell
+  pip uninstall pycocotools mmpycocotools
+  pip install mmpycocotools
+  ```
+
+  **注意**： 我们已经在 0.13.0 及之后的版本中全面支持 pycocotools。
+
+- 如果您在导入 pycocotools 相关包时遇到下面的问题：
+
+  `ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject`
+
+  请将 pycocotools 的版本降级至 2.0.1，这是由于最新版本的 pycocotools 与 numpy \< 1.20.0 不兼容。或者通过下面的方式从源码进行编译来安装最新版本的 pycocotools ：
+
+  `pip install -e "git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI"`
+
+  或者
+
+  `pip install -e "git+https://github.com/ppwwyyxx/cocoapi#egg=pycocotools&subdirectory=PythonAPI"`
+
+- 如果您使用 cuda-9.0 的环境并遇到关于 numba 的错误， 您应该检查下 numba 的版本。在 cuda-9.0 环境中，高版本的 numba 是不支持的，我们建议安装 numba==0.53.0.
+
+## 如何标注点云？
+
+MMDetection3D 不支持点云标注。我们提供一些开源的标注工具供参考：
+
+- [SUSTechPOINTS](https://github.com/naurril/SUSTechPOINTS)
+- [LATTE](https://github.com/bernwang/latte)
+
+此外，我们改进了 [LATTE](https://github.com/bernwang/latte) 以便更方便的标注。更多的细节请参考[这里](https://arxiv.org/abs/2011.10174)。
diff --git a/mmde/docs/zh_cn/notes/index.rst b/mmde/docs/zh_cn/notes/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..609f0e0493f025e47e21503cb016d3d5d55f06e1
--- /dev/null
+++ b/mmde/docs/zh_cn/notes/index.rst
@@ -0,0 +1,8 @@
+.. toctree::
+   :maxdepth: 3
+
+   benchmarks.md
+   changelog_v1.0.x.md
+   changelog.md
+   compatibility.md
+   faq.md
diff --git a/mmde/docs/zh_cn/stat.py b/mmde/docs/zh_cn/stat.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9ec8247c5c0aaf31105b5359733acfbc9c5e7a3
--- /dev/null
+++ b/mmde/docs/zh_cn/stat.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+import functools as func
+import glob
+import re
+from os import path as osp
+
+import numpy as np
+
+url_prefix = 'https://github.com/open-mmlab/mmdetection3d/blob/main/'
+
+files = sorted(glob.glob('../../configs/*/README.md'))
+
+stats = []
+titles = []
+num_ckpts = 0
+
+for f in files:
+    url = osp.dirname(f.replace('../../', url_prefix))
+
+    with open(f, 'r') as content_file:
+        content = content_file.read()
+
+    title = content.split('\n')[0].replace('# ', '').strip()
+    ckpts = set(x.lower().strip()
+                for x in re.findall(r'\[model\]\((https?.*)\)', content))
+
+    if len(ckpts) == 0:
+        continue
+
+    _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)]
+    assert len(_papertype) > 0
+    papertype = _papertype[0]
+
+    paper = set([(papertype, title)])
+
+    titles.append(title)
+    num_ckpts += len(ckpts)
+
+    statsmsg = f"""
+\t* [{papertype}] [{title}]({url}) ({len(ckpts)} ckpts)
+"""
+    stats.append((paper, ckpts, statsmsg))
+
+allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _ in stats])
+msglist = '\n'.join(x for _, _, x in stats)
+
+papertypes, papercounts = np.unique([t for t, _ in allpapers],
+                                    return_counts=True)
+countstr = '\n'.join(
+    [f'   - {t}: {c}' for t, c in zip(papertypes, papercounts)])
+
+modelzoo = f"""
+# Model Zoo Statistics
+
+* Number of papers: {len(set(titles))}
+{countstr}
+
+* Number of checkpoints: {num_ckpts}
+
+{msglist}
+"""
+
+with open('modelzoo_statistics.md', 'w') as f:
+    f.write(modelzoo)
diff --git a/mmde/docs/zh_cn/switch_language.md b/mmde/docs/zh_cn/switch_language.md
new file mode 100644
index 0000000000000000000000000000000000000000..d33d0803ef8f62e1410f12f991a8d78f2cf75df1
--- /dev/null
+++ b/mmde/docs/zh_cn/switch_language.md
@@ -0,0 +1,3 @@
+## <a href='https://mmdetection3d.readthedocs.io/en/latest/'>English</a>
+
+## <a href='https://mmdetection3d.readthedocs.io/zh_CN/latest/'>简体中文</a>
diff --git a/mmde/docs/zh_cn/user_guides/backends_support.md b/mmde/docs/zh_cn/user_guides/backends_support.md
new file mode 100644
index 0000000000000000000000000000000000000000..80c535de5f73618439b2d7463da768c091aabd13
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/backends_support.md
@@ -0,0 +1,154 @@
+# 后端支持
+
+我们支持不同的文件客户端后端：磁盘、Ceph 和 LMDB 等。下面是修改配置使之从 Ceph 加载和保存数据的示例。
+
+## 从 Ceph 读取数据和标注文件
+
+我们支持从 Ceph 加载数据和生成的标注信息文件（pkl 和 json）：
+
+```python
+# set file client backends as Ceph
+backend_args = dict(
+    backend='petrel',
+    path_mapping=dict({
+        './data/nuscenes/':
+        's3://openmmlab/datasets/detection3d/nuscenes/', # replace the path with your data path on Ceph
+        'data/nuscenes/':
+        's3://openmmlab/datasets/detection3d/nuscenes/' # replace the path with your data path on Ceph
+    }))
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    sample_groups=dict(Car=15),
+    classes=class_names,
+    # set file client for points loader to load training data
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    # set file client for data base sampler to load db info file
+    backend_args=backend_args)
+
+train_pipeline = [
+    # set file client for loading training data
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args),
+    # set file client for loading training data annotations
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, backend_args=backend_args),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[0.25, 0.25, 0.25],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.15707963267, 0.15707963267]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    # set file client for loading validation/testing data
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+
+data = dict(
+    # set file client for loading training info files (.pkl)
+    train=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(pipeline=train_pipeline, classes=class_names, backend_args=backend_args)),
+    # set file client for loading validation info files (.pkl)
+    val=dict(pipeline=test_pipeline, classes=class_names,backend_args=backend_args),
+    # set file client for loading testing info files (.pkl)
+    test=dict(pipeline=test_pipeline, classes=class_names, backend_args=backend_args))
+```
+
+## 从 Ceph 读取预训练模型
+
+```python
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_1.6gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='s3://openmmlab/checkpoints/mmdetection3d/regnetx_1.6gf'), # replace the path with your pretrained model path on Ceph
+        ...
+```
+
+## 从 Ceph 读取模型权重文件
+
+```python
+# replace the path with your checkpoint path on Ceph
+load_from = 's3://openmmlab/checkpoints/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230614-77663cd6.pth'
+resume_from = None
+workflow = [('train', 1)]
+```
+
+## 保存模型权重文件至 Ceph
+
+```python
+# checkpoint saving
+# replace the path with your checkpoint saving path on Ceph
+checkpoint_config = dict(interval=1, max_keep_ckpts=2, out_dir='s3://openmmlab/mmdetection3d')
+```
+
+## EvalHook 保存最优模型权重文件至 Ceph
+
+```python
+# replace the path with your checkpoint saving path on Ceph
+evaluation = dict(interval=1, save_best='bbox', out_dir='s3://openmmlab/mmdetection3d')
+```
+
+## 训练日志保存至 Ceph
+
+训练后的训练日志会备份到指定的 Ceph 路径。
+
+```python
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook', out_dir='s3://openmmlab/mmdetection3d'),
+    ])
+```
+
+您还可以通过设置 `keep_local = False` 备份到指定的 Ceph 路径后删除本地训练日志。
+
+```python
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook', out_dir='s3://openmmlab/mmdetection3d', keep_local=False),
+    ])
+```
diff --git a/mmde/docs/zh_cn/user_guides/config.md b/mmde/docs/zh_cn/user_guides/config.md
new file mode 100644
index 0000000000000000000000000000000000000000..f971a108550bb720d0a8b869a16bd7c549f6e32d
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/config.md
@@ -0,0 +1,558 @@
+# 学习配置文件
+
+MMDetection3D 和其他 OpenMMLab 仓库使用 [MMEngine 的配置文件系统](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html)。它具有模块化和继承性设计，以便于进行各种实验。
+
+## 配置文件的内容
+
+MMDetection3D 采用模块化设计，所有功能的模块可以通过配置文件进行配置。以 PointPillars 为例，我们将根据不同的功能模块介绍配置文件的各个字段。
+
+### 模型配置
+
+在 MMDetection3D 的配置中，我们使用 `model` 字段来配置检测算法的组件。除了 `voxel_encoder`，`backbone` 等神经网络组件外，还需要 `data_preprocessor`，`train_cfg` 和 `test_cfg`。`data_preprocessor` 负责对数据加载器（dataloader）输出的每一批数据进行预处理。模型配置中的 `train_cfg` 和 `test_cfg` 用于设置训练和测试组件的超参数。
+
+```python
+model = dict(
+    type='VoxelNet',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=32,
+            point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1],
+            voxel_size=[0.16, 0.16, 4],
+            max_voxels=(16000, 40000))),
+    voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=[0.16, 0.16, 4],
+        point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]),
+    middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+    backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        assign_per_class=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                    [0, -39.68, -0.6, 69.12, 39.68, -0.6],
+                    [0, -39.68, -1.78, 69.12, 39.68, -1.78]],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.1111111111111111,
+            loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    train_cfg=dict(
+        assigner=[
+            dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(
+                type='Max3DIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1)
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
+```
+
+### 数据集和评测器配置
+
+在使用[执行器（Runner）](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html)进行训练、测试和验证时，我们需要配置[数据加载器](https://pytorch.org/docs/stable/data.html?highlight=data%20loader#torch.utils.data.DataLoader)。构建数据加载器需要设置数据集和数据处理流程。由于这部分的配置较为复杂，我们使用中间变量来简化数据加载器配置的编写。
+
+```python
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+metainfo = dict(classes=class_names)
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=15, Cyclist=15),
+    points_loader=dict(
+        type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4))
+
+train_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler, use_ground_plane=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+]
+test_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+eval_pipeline = [
+    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(pts='training/velodyne_reduced'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            box_type_3d='LiDAR')))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+```
+
+[评测器](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/evaluation.html)用于计算训练模型在验证和测试数据集上的指标。评测器的配置由一个或一组评价指标配置组成：
+
+```python
+val_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox')
+test_evaluator = val_evaluator
+```
+
+由于测试数据集没有标注文件，因此 MMDetection3D 中的 test_dataloader 和 test_evaluator 配置通常等于 val。如果您想要保存在测试数据集上的检测结果，则可以像这样编写配置：
+
+```python
+# 在测试集上推理，
+# 并将检测结果转换格式以用于提交结果
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='testing/velodyne_reduced'),
+        ann_file='kitti_infos_test.pkl',
+        load_eval_anns=False,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR'))
+test_evaluator = dict(
+    type='KittiMetric',
+    ann_file=data_root + 'kitti_infos_test.pkl',
+    metric='bbox',
+    format_only=True,
+    submission_prefix='results/kitti-3class/kitti_results')
+```
+
+### 训练和测试配置
+
+MMEngine 的执行器使用循环（Loop）来控制训练，验证和测试过程。用户可以使用这些字段设置最大训练轮次和验证间隔：
+
+```python
+train_cfg = dict(
+    type='EpochBasedTrainLoop',
+    max_epochs=80,
+    val_interval=2)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+```
+
+### 优化配置
+
+`optim_wrapper` 是配置优化相关设置的字段。优化器封装不仅提供了优化器的功能，还支持梯度裁剪、混合精度训练等功能。更多内容请看[优化器封装教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/optim_wrapper.html)。
+
+```python
+optim_wrapper = dict(  # 优化器封装配置
+    type='OptimWrapper',  # 优化器封装类型，切换到 AmpOptimWrapper 启动混合精度训练
+    optimizer=dict(  # 优化器配置。支持 PyTorch 的各种优化器，请参考 https://pytorch.org/docs/stable/optim.html#algorithms
+        type='AdamW', lr=0.001, betas=(0.95, 0.99), weight_decay=0.01),
+    clip_grad=dict(max_norm=35, norm_type=2))  # 梯度裁剪选项。设置为 None 禁用梯度裁剪。使用方法请见 https://mmengine.readthedocs.io/zh_CN/latest/tutorials/optim_wrapper.html
+```
+
+`param_scheduler` 是配置调整优化器超参数（例如学习率和动量）的字段。用户可以组合多个调度器来创建所需要的参数调整策略。更多信息请参考[参数调度器教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/param_scheduler.html)和[参数调度器 API 文档](https://mmengine.readthedocs.io/zh_CN/latest/api/optim.html#scheduler)。
+
+```python
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=32,
+        eta_min=0.01,
+        begin=0,
+        end=32,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=48,
+        eta_min=1.0000000000000001e-07,
+        begin=32,
+        end=80,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=32,
+        eta_min=0.8947368421052632,
+        begin=0,
+        end=32,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=48,
+        eta_min=1,
+        begin=32,
+        end=80,
+        by_epoch=True,
+        convert_to_iter_based=True),
+]
+```
+
+### 钩子配置
+
+用户可以在训练、验证和测试循环上添加钩子，从而在运行期间插入一些操作。有两种不同的钩子字段，一种是 `default_hooks`，另一种是 `custom_hooks`。
+
+`default_hooks` 是一个钩子配置字典，并且这些钩子是运行时所需要的。它们具有默认优先级，是不需要修改的。如果未设置，执行器将使用默认值。如果要禁用默认钩子，用户可以将其配置设置为 `None`。
+
+```python
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=-1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='Det3DVisualizationHook'))
+```
+
+`custom_hooks` 是一个由其他钩子配置组成的列表。用户可以开发自己的钩子并将其插入到该字段中。
+
+```python
+custom_hooks = []
+```
+
+### 运行配置
+
+```python
+default_scope = 'mmdet3d'  # 寻找模块的默认注册器域。请参考 https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/registry.html
+
+env_cfg = dict(
+    cudnn_benchmark=False,  # 是否启用 cudnn benchmark
+    mp_cfg=dict(  # 多进程配置
+        mp_start_method='fork',  # 使用 fork 来启动多进程。'fork' 通常比 'spawn' 更快，但可能不安全。请参考 https://github.com/pytorch/pytorch/issues/1355
+        opencv_num_threads=0),  # 关闭 opencv 的多进程以避免系统超负荷
+    dist_cfg=dict(backend='nccl'))  # 分布式配置
+
+vis_backends = [dict(type='LocalVisBackend')]  # 可视化后端。请参考 https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+log_processor = dict(
+    type='LogProcessor',  # 日志处理器用于处理运行时日志
+    window_size=50,  # 日志数值的平滑窗口
+    by_epoch=True)  # 是否使用 epoch 格式的日志。需要与训练循环的类型保持一致
+
+log_level = 'INFO'  # 日志等级
+load_from = None  # 从给定路径加载模型检查点作为预训练模型。这不会恢复训练。
+resume = False  # 是否从 `load_from` 中定义的检查点恢复。如果 `load_from` 为 None，它将恢复 `work_dir` 中的最近检查点。
+```
+
+## 配置文件继承
+
+在 `configs/_base_` 文件夹下有 4 个基本组件类型，分别是：数据集（dataset），模型（model），训练策略（schedule）和运行时的默认设置（default runtime）。许多方法，如 SECOND、PointPillars、PartA2、VoteNet 都能够很容易地构建出来。由 `_base_` 下的组件组成的配置，被我们称为 _原始配置（primitive）_。
+
+对于同一个文件夹下的所有配置，推荐**只有一个**对应的 _原始配置_ 文件。所有其他的配置文件都应该继承自这个 _原始配置_ 文件。这样就能保证配置文件的最大继承深度为 3。
+
+为了便于理解，我们建议贡献者继承现有方法。例如，如果在 PointPillars 的基础上做了一些修改，用户可以首先通过指定 `_base_ = '../pointpillars/pointpillars_hv_fpn_sbn-all_8xb4-2x_nus-3d.py'` 来继承基础的 PointPillars 结构，然后修改配置文件中的必要参数以完成继承。
+
+如果您在构建一个与任何现有方法都不共享的全新方法，那么可以在 `configs` 文件夹下创建一个新的例如 `xxx_rcnn` 文件夹。
+
+更多细节请参考 [MMEngine 配置文件教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html)。
+
+通过设置 `_base_` 字段，我们可以设置当前配置文件继承自哪些文件。
+
+当 `_base_` 为文件路径字符串时，表示继承一个配置文件的内容。
+
+```python
+_base_ = './pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py'
+```
+
+当 `_base_` 是多个文件路径组成的列表式，表示继承多个文件。
+
+```python
+_base_ = [
+    '../_base_/models/pointpillars_hv_secfpn_kitti.py',
+    '../_base_/datasets/kitti-3d-3class.py',
+    '../_base_/schedules/cyclic-40e.py', '../_base_/default_runtime.py'
+]
+```
+
+如果需要检测配置文件，可以通过运行 `python tools/misc/print_config.py /PATH/TO/CONFIG` 来查看完整的配置。
+
+### 忽略基础配置文件里的部分字段
+
+有时，您也许会设置 `_delete_=True` 去忽略基础配置文件里的一些字段。您可以参考 [MMEngine 配置文件教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html) 来获得一些简单的指导。
+
+在 MMDetection3D 里，例如，修改以下 PointPillars 配置中的颈部网络：
+
+```python
+model = dict(
+    type='MVXFasterRCNN',
+    data_preprocessor=dict(voxel_layer=dict(...)),
+    pts_voxel_encoder=dict(...),
+    pts_middle_encoder=dict(...),
+    pts_backbone=dict(...),
+    pts_neck=dict(
+        type='FPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        act_cfg=dict(type='ReLU'),
+        in_channels=[64, 128, 256],
+        out_channels=256,
+        start_level=0,
+        num_outs=3),
+    pts_bbox_head=dict(...))
+```
+
+`FPN` 和 `SECONDFPN` 使用不同的关键字来构建：
+
+```python
+_base_ = '../_base_/models/pointpillars_hv_fpn_nus.py'
+model = dict(
+    pts_neck=dict(
+        _delete_=True,
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(...))
+```
+
+`_delete_=True` 将使用新的键去替换 `pts_neck` 字段内所有旧的键。
+
+### 在配置文件里使用中间变量
+
+配置文件里会使用一些中间变量，例如数据集里的 `train_pipeline`/`test_pipeline`。需要注意的是，当修改子配置文件中的中间变量时，用户需要再次将中间变量传递到对应的字段中。例如，我们想使用多尺度策略训练并测试 PointPillars，`train_pipeline`/`test_pipeline` 是我们想要修改的中间变量。
+
+```python
+_base_ = './nus-3d.py'
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=[0.95, 1.0, 1.05],
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+```
+
+我们首先定义新的 `train_pipeline`/`test_pipeline`，然后传递到数据加载器字段中。
+
+### 复用 \_base\_ 文件中的变量
+
+如果用户希望复用 base 文件中的变量，则可以通过使用 `{{_base_.xxx}}` 获取对应变量的拷贝。例如：
+
+```python
+_base_ = './pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py'
+
+a = {{_base_.model}}  # 变量 `a` 等于 `_base_` 中定义的 `model`
+```
+
+## 通过脚本参数修改配置
+
+当使用 `tools/train.py` 或者 `tools/test.py` 提交工作时，您可以通过指定 `--cfg-options` 来修改配置文件。
+
+- 更新配置字典的键值
+
+  可以按照原始配置文件中字典的键值顺序指定配置选项。例如，使用 `--cfg-options model.backbone.norm_eval=False` 将模型主干网络中的所有 BN 模块都改为 `train` 模式。
+
+- 更新配置列表中的键值
+
+  在配置文件里，一些配置字典被包含在列表中，例如，训练流程 `train_dataloader.dataset.pipeline` 通常是一个列表，例如 `[dict(type='LoadPointsFromFile'), ...]`。如果您想要将训练流程中的 `'LoadPointsFromFile'` 改成 `'LoadPointsFromDict'`，您需要指定 `--cfg-options data.train.pipeline.0.type=LoadPointsFromDict`。
+
+- 更新列表/元组的值
+
+  如果要更新的值是列表或元组。例如，配置文件通常设置 `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`。如果您想要改变这个均值，您需要指定 `--cfg-options model.data_preprocessor.mean="[127,127,127]"`。注意，引号 `"` 是支持列表/元组数据类型所必需的，并且在指定值的引号内**不允许**有空格。
+
+## 配置文件名称风格
+
+我们遵循以下样式来命名配置文件。建议贡献者遵循相同的风格。
+
+```
+{algorithm name}_{model component names [component1]_[component2]_[...]}_{training settings}_{training dataset information}_{testing dataset information}.py
+```
+
+文件名分为五个部分。所有部分和组件用 `_` 连接，每个部分或组件内的单词应该用 `-` 连接。
+
+- `{algorithm name}`：算法的名称。它可以是检测器的名称，例如 `pointpillars`、`fcos3d` 等。
+- `{model component names}`：算法中使用的组件名称，如 voxel_encoder、backbone、neck 等。例如 `second_secfpn_head-dcn-circlenms` 表示使用 SECOND 的 SparseEncoder，SECONDFPN，以及带有 DCN 和 circle NMS 的检测头。
+- `{training settings}`：训练设置的信息，例如批量大小，数据增强，损失函数策略，调度器以及训练轮次/迭代。例如 `8xb4-tta-cyclic-20e` 表示使用 8 个 gpu，每个 gpu 有 4 个数据样本，测试增强，余弦退火学习率，训练 20 个 epoch。缩写介绍：
+  - `{gpu x batch_per_gpu}`：GPU 数和每个 GPU 的样本数。`bN` 表示每个 GPU 上的批量大小为 N。例如 `4xb4` 是 4 个 GPU，每个 GPU 有 4 个样本数的缩写。
+  - `{schedule}`：训练方案，可选项为 `schedule-2x`、`schedule-3x`、`cyclic-20e` 等。`schedule-2x` 和 `schedule-3x` 分别代表 24 epoch 和 36 epoch。`cyclic-20e` 表示 20 epoch。
+- `{training dataset information}`：训练数据集名，例如 `kitti-3d-3class`，`nus-3d`，`s3dis-seg`，`scannet-seg`，`waymoD5-3d-car`。这里 `3d` 表示数据集用于 3D 目标检测，`seg` 表示数据集用于点云分割。
+- `{testing dataset information}`（可选）：当模型在一个数据集上训练，在另一个数据集上测试时的测试数据集名。如果没有注明，则表示训练和测试的数据集类型相同。
diff --git a/mmde/docs/zh_cn/user_guides/coord_sys_tutorial.md b/mmde/docs/zh_cn/user_guides/coord_sys_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..d666ba75737e593be9a19c1f0553dadd30774371
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/coord_sys_tutorial.md
@@ -0,0 +1,245 @@
+# 坐标系
+
+## 概述
+
+MMDetection3D 使用 3 种不同的坐标系。3D 目标检测领域中不同坐标系的存在是非常有必要的，因为对于各种 3D 数据采集设备来说，如激光雷达、深度相机等，使用的坐标系是不一致的，不同的 3D 数据集也遵循不同的数据格式。早期的工作，比如 SECOND、VoteNet 将原始数据转换为另一种格式，形成了一些后续工作也遵循的约定，使得不同坐标系之间的转换变得更加复杂。
+
+尽管数据集和采集设备多种多样，但是通过总结 3D 目标检测的工作线，我们可以将坐标系大致分为三类：
+
+- 相机坐标系 -- 大多数相机的坐标系，在该坐标系中 y 轴正方向指向地面，x 轴正方向指向右侧，z 轴正方向指向前方。
+
+  ```
+              上  z 前
+              |    ^
+              |   /
+              |  /
+              | /
+              |/
+  左   ------ 0 ------> x 右
+              |
+              |
+              |
+              |
+              v
+            y 下
+  ```
+
+- 激光雷达坐标系 -- 众多激光雷达的坐标系，在该坐标系中 z 轴负方向指向地面，x 轴正方向指向前方，y 轴正方向指向左侧。
+
+  ```
+                z 上   x 前
+                 ^    ^
+                 |   /
+                 |  /
+                 | /
+                 |/
+  y 左   <------ 0 ------ 右
+  ```
+
+- 深度坐标系 -- VoteNet、H3DNet 等模型使用的坐标系，在该坐标系中 z 轴负方向指向地面，x 轴正方向指向右侧，y 轴正方向指向前方。
+
+  ```
+             z 上   y 前
+              ^    ^
+              |   /
+              |  /
+              | /
+              |/
+  左   ------ 0 ------> x 右
+  ```
+
+该教程中的坐标系定义实际上**不仅仅是定义三个轴**。对于形如 $(x, y, z, dx, dy, dz, r)$ 的框来说，我们的坐标系也定义了如何解释框的尺寸 $(dx, dy, dz)$ 和转向角 (yaw) 角度 $r$。
+
+三个坐标系的图示如下：
+
+![](https://raw.githubusercontent.com/open-mmlab/mmdetection3d/master/resources/coord_sys_all.png)
+
+上面三张图是 3D 坐标系，下面三张图是鸟瞰图。
+
+以后我们将坚持使用本教程中定义的三个坐标系。
+
+## 转向角 (yaw) 的定义
+
+请参考[维基百科](https://en.wikipedia.org/wiki/Euler_angles#Tait%E2%80%93Bryan_angles)了解转向角的标准定义。在目标检测中，我们选择一个轴作为重力轴，并在垂直于重力轴的平面 $\\Pi$ 上选取一个参考方向，那么参考方向的转向角为 0，在 $\\Pi$ 上的其他方向有非零的转向角，其角度取决于其与参考方向的角度。
+
+目前，对于所有支持的数据集，标注不包括俯仰角 (pitch) 和滚动角 (roll)，这意味着我们在预测框和计算框之间的重叠时只需考虑转向角 (yaw)。
+
+在 MMDetection3D 中，所有坐标系都是右手坐标系，这意味着如果从重力轴的负方向（轴的正方向指向人眼）看，转向角 (yaw) 沿着逆时针方向增加。
+
+下图显示，在右手坐标系中，如果我们设定 x 轴正方向为参考方向，那么 y 轴正方向的转向角 (yaw) 为 $\\frac{\\pi}{2}$。
+
+```
+                     z 上  y 前 (yaw=0.5*pi)
+                      ^    ^
+                      |   /
+                      |  /
+                      | /
+                      |/
+左 (yaw=pi)    ------ 0 ------> x 右 (yaw=0)
+```
+
+对于一个框来说，其转向角 (yaw) 的值等于其方向减去一个参考方向。在 MMDetection3D 的所有三个坐标系中，参考方向总是 x 轴的正方向，而如果一个框的转向角 (yaw) 为 0，则其方向被定义为与 x 轴平行。框的转向角 (yaw) 的定义如下图所示。
+
+```
+  y 前
+  ^      框的方向 (yaw=0.5*pi)
+ /|\        ^
+  |        /|\
+  |     ____|____
+  |    |    |    |
+  |    |    |    |
+__|____|____|____|______\ x 右
+  |    |    |    |      /
+  |    |    |    |
+  |    |____|____|
+  |
+```
+
+## 框尺寸的定义
+
+框尺寸的定义与转向角 (yaw) 的定义是分不开的。在上一节中，我们提到如果一个框的转向角 (yaw) 为 0，它的方向就被定义为与 x 轴平行。那么自然地，一个框对应于 x 轴的尺寸应该是 $dx$。但是，这在某些数据集中并非总是如此（我们稍后会解决这个问题）。
+
+下图展示了 x 轴和 $dx$，y 轴和 $dy$ 对应的含义。
+
+```
+y 前
+  ^      框的方向 (yaw=0.5*pi)
+ /|\        ^
+  |        /|\
+  |     ____|____
+  |    |    |    |
+  |    |    |    | dx
+__|____|____|____|______\ x 右
+  |    |    |    |      /
+  |    |    |    |
+  |    |____|____|
+  |         dy
+```
+
+注意框的方向总是和 $dx$ 边平行。
+
+```
+y 前
+  ^     _________
+ /|\   |    |    |
+  |    |    |    |
+  |    |    |    | dy
+  |    |____|____|____\  框的方向 (yaw=0)
+  |    |    |    |    /
+__|____|____|____|_________\ x 右
+  |    |    |    |         /
+  |    |____|____|
+  |         dx
+  |
+```
+
+## 与支持的数据集的原始坐标系的关系
+
+### KITTI
+
+KITTI 数据集的原始标注是在相机坐标系下的，详见 [get_label_anno](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/data_converter/kitti_data_utils.py)。在 MMDetection3D 中，为了在 KITTI 数据集上训练基于激光雷达的模型，首先将数据从相机坐标系转换到激光雷达坐标，详见 [get_ann_info](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/kitti_dataset.py)。对于训练基于视觉的模型，数据保持在相机坐标系不变。
+
+在 SECOND 中，框的激光雷达坐标系定义如下（鸟瞰图）：
+
+![](https://raw.githubusercontent.com/traveller59/second.pytorch/master/images/kittibox.png)
+
+对于每个框来说，尺寸为 $(w, l, h)$，转向角 (yaw) 的参考方向为 y 轴正方向。更多细节请参考[代码库](https://github.com/traveller59/second.pytorch#concepts)。
+
+我们的激光雷达坐标系有两处改变：
+
+- 转向角 (yaw) 被定义为右手而非左手，从而保持一致性；
+- 框的尺寸为 $(l, w, h)$ 而非 $(w, l, h)$，由于在 KITTI 数据集中 $w$ 对应 $dy$，$l$ 对应 $dx$。
+
+### Waymo
+
+我们使用 Waymo 数据集的 KITTI 格式数据。因此，在我们的实现中 KITTI 和 Waymo 也共用相同的坐标系。
+
+### NuScenes
+
+NuScenes 提供了一个评估工具包，其中每个框都被包装成一个 `Box` 实例。`Box` 的坐标系不同于我们的激光雷达坐标系，在 `Box` 坐标系中，前两个表示框尺寸的元素分别对应 $(dy, dx)$ 或者 $(w, l)$，和我们的表示方法相反。更多细节请参考 NuScenes [教程](https://github.com/open-mmlab/mmdetection3d/blob/master/docs/zh_cn/datasets/nuscenes_det.md#notes)。
+
+读者可以参考 [NuScenes 开发工具](https://github.com/nutonomy/nuscenes-devkit/tree/master/python-sdk/nuscenes/eval/detection)，了解 [NuScenes 框](https://github.com/nutonomy/nuscenes-devkit/blob/2c6a752319f23910d5f55cc995abc547a9e54142/python-sdk/nuscenes/utils/data_classes.py#L457) 的定义和 [NuScenes 评估](https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/eval/detection/evaluate.py)的过程。
+
+### Lyft
+
+就涉及坐标系而言，Lyft 和 NuScenes 共用相同的数据格式。
+
+请参考[官方网站](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data)获取更多信息。
+
+### ScanNet
+
+ScanNet 的原始数据不是点云而是网格，需要在我们的深度坐标系下进行采样得到点云数据。对于 ScanNet 检测任务，框的标注是轴对齐的，并且转向角 (yaw) 始终是 0。因此，我们的深度坐标系中转向角 (yaw) 的方向对 ScanNet 没有影响。
+
+### SUN RGB-D
+
+SUN RGB-D 的原始数据不是点云而是 RGB-D 图像。我们通过反投影，可以得到每张图像对应的点云，其在我们的深度坐标系下。但是，数据集的标注并不在我们的系统中，所以需要进行转换。
+
+将原始标注转换为我们的深度坐标系下的标注的转换过程请参考 [sunrgbd_data_utils.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/data_converter/sunrgbd_data_utils.py)。
+
+### S3DIS
+
+在我们的实现中，S3DIS 与 ScanNet 共用相同的坐标系。然而 S3DIS 是一个仅限于分割任务的数据集，因此没有标注是坐标系敏感的。
+
+## 例子
+
+### 框（在不同坐标系间）的转换
+
+以相机坐标系和激光雷达坐标系间的转换为例：
+
+首先，对于点和框的中心点，坐标转换前后满足下列关系：
+
+- $x\_{LiDAR}=z\_{camera}$
+- $y\_{LiDAR}=-x\_{camera}$
+- $z\_{LiDAR}=-y\_{camera}$
+
+然后，框的尺寸转换前后满足下列关系：
+
+- $dx\_{LiDAR}=dx\_{camera}$
+- $dy\_{LiDAR}=dz\_{camera}$
+- $dz\_{LiDAR}=dy\_{camera}$
+
+最后，转向角 (yaw) 也应该被转换：
+
+- $r\_{LiDAR}=-\\frac{\\pi}{2}-r\_{camera}$
+
+详见[此处](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/box_3d_mode.py)代码了解更多细节。
+
+### 鸟瞰图
+
+如果 3D 框是 $(x, y, z, dx, dy, dz, r)$，相机坐标系下框的鸟瞰图是 $(x, z, dx, dz, -r)$。转向角 (yaw) 符号取反是因为相机坐标系重力轴的正方向指向地面。
+
+详见[此处](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/cam_box3d.py)代码了解更多细节。
+
+### 框的旋转
+
+我们将各种框的旋转设定为绕着重力轴逆时针旋转。因此，为了旋转一个 3D 框，我们首先需要计算新的框的中心，然后将旋转角度添加到转向角 (yaw)。
+
+详见[此处](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/cam_box3d.py)代码了解更多细节。
+
+## 常见问题
+
+#### Q1: 与框相关的算子是否适用于所有坐标系类型？
+
+否。例如，[用于 RoI-Aware Pooling 的算子](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/roiaware_pool3d.py)只适用于深度坐标系和激光雷达坐标系下的框。由于如果从上方看，旋转是顺时针的，所以 KITTI 数据集[这里](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/evaluation/kitti_utils.py)的评估函数仅适用于相机坐标系下的框。
+
+对于每个和框相关的算子，我们注明了其所适用的框类型。
+
+#### Q2: 在每个坐标系中，三个轴是否分别准确地指向右侧、前方和地面？
+
+否。例如在 KITTI 中，从相机坐标系转换为激光雷达坐标系时，我们需要一个校准矩阵。
+
+#### Q3: 框中转向角 (yaw) $2\\pi$ 的相位差如何影响评估？
+
+对于交并比 (IoU) 计算，转向角 (yaw) 有 $2\\pi$ 的相位差的两个框是相同的，所以不会影响评估。
+
+对于角度预测评估，例如 NuScenes 中的 NDS 指标和 KITTI 中的 AOS 指标，会先对预测框的角度进行标准化，因此 $2\\pi$ 的相位差不会改变结果。
+
+#### Q4: 框中转向角 (yaw) $\\pi$ 的相位差如何影响评估？
+
+对于交并比 (IoU) 计算，转向角 (yaw) 有 $\\pi$ 的相位差的两个框是相同的，所以不会影响评估。
+
+然而，对于角度预测评估，这会导致完全相反的方向。
+
+考虑一辆汽车，转向角 (yaw) 是汽车前部方向与 x 轴正方向之间的夹角。如果我们将该角度增加 $\\pi$，车前部将变成车后部。
+
+对于某些类别，例如障碍物，前后没有区别，因此 $\\pi$ 的相位差不会对角度预测分数产生影响。
diff --git a/mmde/docs/zh_cn/user_guides/data_pipeline.md b/mmde/docs/zh_cn/user_guides/data_pipeline.md
new file mode 100644
index 0000000000000000000000000000000000000000..cf50d70ff64aaca818bd75aeabfb8b8da99257c3
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/data_pipeline.md
@@ -0,0 +1,191 @@
+# 自定义数据预处理流程
+
+## 数据预处理流程的设计
+
+遵循一般惯例，我们使用 `Dataset` 和 `DataLoader` 来调用多个进程进行数据的加载。`Dataset` 将会返回与模型前向传播的参数所对应的数据项构成的字典。因为目标检测中的数据的尺寸可能无法保持一致（如点云中点的数量、真实标注框的尺寸等），我们在 MMCV 中引入一个 `DataContainer` 类型，来帮助收集和分发不同尺寸的数据。请参考[此处](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py)获取更多细节。
+
+数据预处理流程和数据集之间是互相分离的两个部分，通常数据集定义了如何处理标注信息，而数据预处理流程定义了准备数据项字典的所有步骤。数据集预处理流程包含一系列的操作，每个操作将一个字典作为输入，并输出应用于下一个转换的一个新的字典。
+
+我们将在下图中展示一个最经典的数据集预处理流程，其中蓝色框表示预处理流程中的各项操作。随着预处理的进行，每一个操作都会添加新的键值（图中标记为绿色）到输出字典中，或者更新当前存在的键值（图中标记为橙色）。
+
+![](../../../resources/data_pipeline.png)
+
+预处理流程中的各项操作主要分为数据加载、预处理、格式化、测试时的数据增强。
+
+接下来将展示一个用于 PointPillars 模型的数据集预处理流程的例子。
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1.0,
+        flip=False,
+        pcd_horizontal_flip=False,
+        pcd_vertical_flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+```
+
+对于每项操作，我们将列出相关的被添加/更新/移除的字典项。
+
+### 数据加载
+
+`LoadPointsFromFile`
+
+- 添加：points
+
+`LoadPointsFromMultiSweeps`
+
+- 更新：points
+
+`LoadAnnotations3D`
+
+- 添加：gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, pts_instance_mask, pts_semantic_mask, bbox3d_fields, pts_mask_fields, pts_seg_fields
+
+### 预处理
+
+`GlobalRotScaleTrans`
+
+- 添加：pcd_trans, pcd_rotation, pcd_scale_factor
+- 更新：points, \*bbox3d_fields
+
+`RandomFlip3D`
+
+- 添加：flip, pcd_horizontal_flip, pcd_vertical_flip
+- 更新：points, \*bbox3d_fields
+
+`PointsRangeFilter`
+
+- 更新：points
+
+`ObjectRangeFilter`
+
+- 更新：gt_bboxes_3d, gt_labels_3d
+
+`ObjectNameFilter`
+
+- 更新：gt_bboxes_3d, gt_labels_3d
+
+`PointShuffle`
+
+- 更新：points
+
+`PointsRangeFilter`
+
+- 更新：points
+
+### 格式化
+
+`DefaultFormatBundle3D`
+
+- 更新：points, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels
+
+`Collect3D`
+
+- 添加：img_meta （由 `meta_keys` 指定的键值构成的 img_meta）
+- 移除：所有除 `keys` 指定的键值以外的其他键值
+
+### 测试时的数据增强
+
+`MultiScaleFlipAug`
+
+- 更新: scale, pcd_scale_factor, flip, flip_direction, pcd_horizontal_flip, pcd_vertical_flip （与这些指定的参数对应的增强后的数据列表）
+
+## 扩展并使用自定义数据集预处理方法
+
+1. 在任意文件中写入新的数据集预处理方法，如 `my_pipeline.py`，该预处理方法的输入和输出均为字典
+
+   ```python
+   from mmdet.datasets import PIPELINES
+
+   @PIPELINES.register_module()
+   class MyTransform:
+
+       def __call__(self, results):
+           results['dummy'] = True
+           return results
+   ```
+
+2. 导入新的预处理方法类
+
+   ```python
+   from .my_pipeline import MyTransform
+   ```
+
+3. 在配置文件中使用该数据集预处理方法
+
+   ```python
+   train_pipeline = [
+       dict(
+           type='LoadPointsFromFile',
+           load_dim=5,
+           use_dim=5,
+           backend_args=backend_args),
+       dict(
+           type='LoadPointsFromMultiSweeps',
+           sweeps_num=10,
+           backend_args=backend_args),
+       dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+       dict(
+           type='GlobalRotScaleTrans',
+           rot_range=[-0.3925, 0.3925],
+           scale_ratio_range=[0.95, 1.05],
+           translation_std=[0, 0, 0]),
+       dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+       dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+       dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+       dict(type='ObjectNameFilter', classes=class_names),
+       dict(type='MyTransform'),
+       dict(type='PointShuffle'),
+       dict(type='DefaultFormatBundle3D', class_names=class_names),
+       dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+   ]
+   ```
diff --git a/mmde/docs/zh_cn/user_guides/dataset_prepare.md b/mmde/docs/zh_cn/user_guides/dataset_prepare.md
new file mode 100644
index 0000000000000000000000000000000000000000..983e988afb4fb38f9041675ce6bf44d97f467306
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/dataset_prepare.md
@@ -0,0 +1,219 @@
+# 数据预处理
+
+## 在数据预处理前
+
+我们推荐用户将数据集的路径软链接到 `$MMDETECTION3D/data`。如果你的文件夹结构和以下所展示的结构不一致，你可能需要改变配置文件中相应的数据路径。
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── v1.0-test
+|   |   ├── v1.0-trainval
+│   ├── kitti
+│   │   ├── ImageSets
+│   │   ├── testing
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── velodyne
+│   │   ├── training
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── label_2
+│   │   │   ├── velodyne
+│   ├── waymo
+│   │   ├── waymo_format
+│   │   │   ├── training
+│   │   │   ├── validation
+│   │   │   ├── testing
+│   │   │   ├── gt.bin
+│   │   ├── kitti_format
+│   │   │   ├── ImageSets
+│   ├── lyft
+│   │   ├── v1.01-train
+│   │   │   ├── v1.01-train (训练数据)
+│   │   │   ├── lidar (训练激光雷达)
+│   │   │   ├── images (训练图片)
+│   │   │   ├── maps (训练地图)
+│   │   ├── v1.01-test
+│   │   │   ├── v1.01-test (测试数据)
+│   │   │   ├── lidar (测试激光雷达)
+│   │   │   ├── images (测试图片)
+│   │   │   ├── maps (测试地图)
+│   │   ├── train.txt
+│   │   ├── val.txt
+│   │   ├── test.txt
+│   │   ├── sample_submission.csv
+│   ├── s3dis
+│   │   ├── meta_data
+│   │   ├── Stanford3dDataset_v1.2_Aligned_Version
+│   │   ├── collect_indoor3d_data.py
+│   │   ├── indoor3d_util.py
+│   │   ├── README.md
+│   ├── scannet
+│   │   ├── meta_data
+│   │   ├── scans
+│   │   ├── scans_test
+│   │   ├── batch_load_scannet_data.py
+│   │   ├── load_scannet_data.py
+│   │   ├── scannet_utils.py
+│   │   ├── README.md
+│   ├── sunrgbd
+│   │   ├── OFFICIAL_SUNRGBD
+│   │   ├── matlab
+│   │   ├── sunrgbd_data.py
+│   │   ├── sunrgbd_utils.py
+│   │   ├── README.md
+
+```
+
+## 数据下载和预处理
+
+### KITTI
+
+在[这里](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d)下载 KITTI 的 3D 检测数据。通过运行以下指令对 KITTI 数据进行预处理：
+
+```bash
+mkdir ./data/kitti/ && mkdir ./data/kitti/ImageSets
+
+# 下载数据划分文件
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/test.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/test.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/train.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/train.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/val.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/val.txt
+wget -c  https://raw.githubusercontent.com/traveller59/second.pytorch/master/second/data/ImageSets/trainval.txt --no-check-certificate --content-disposition -O ./data/kitti/ImageSets/trainval.txt
+```
+
+然后通过运行以下指令生成信息文件：
+
+```bash
+python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti
+```
+
+在使用 slurm 的环境下，用户需要使用下面的指令：
+
+```bash
+sh tools/create_data.sh <partition> kitti
+```
+
+**小贴士**：
+
+- **现成的标注文件**：我们已经提供了离线处理好的 [KITTI 标注文件](#数据集标注文件列表)。您直接下载他们并放到 `data/kitti/` 目录下。然而，如果你想在点云检测方法中使用 `ObjectSample` 这一数据增强，你可以再额外使用以下命令来生成物体标注框数据库：
+
+```bash
+python tools/create_data.py kitti --root-path ./data/kitti --out-dir ./data/kitti --extra-tag kitti --only-gt-database
+```
+
+### Waymo
+
+在[这里](https://waymo.com/open/download/)下载 Waymo 公开数据集 1.4.1 版本，在[这里](https://drive.google.com/drive/folders/18BVuF_RYJF0NjZpt8SnfzANiakoRMf0o?usp=sharing)下载其数据划分文件。然后，将 `.tfrecord` 文件置于 `data/waymo/waymo_format/` 目录下的相应位置，并将数据划分的 `.txt` 文件置于 `data/waymo/kitti_format/ImageSets` 目录下。在[这里](https://console.cloud.google.com/storage/browser/waymo_open_dataset_v_1_2_0/validation/ground_truth_objects)下载验证集的真实标签（`.bin` 文件）并将其置于 `data/waymo/waymo_format/`。提示：你可以使用 `gsutil` 来用命令下载大规模的数据集。更多细节请参考此[工具](https://github.com/RalphMao/Waymo-Dataset-Tool)。完成以上各步后，可以通过运行以下指令对 Waymo 数据进行预处理：
+
+```bash
+# TF_CPP_MIN_LOG_LEVEL=3 will disable all logging output from TensorFlow.
+# The number of `--workers` depends on the maximum number of cores in your CPU.
+TF_CPP_MIN_LOG_LEVEL=3 python tools/create_data.py waymo --root-path ./data/waymo --out-dir ./data/waymo --workers 128 --extra-tag waymo --version v1.4
+```
+
+注意:
+
+- 如果你的硬盘空间大小不足以存储转换后的数据，你可以将 `--out-dir` 参数设定为别的路径。你只需要记得在那个路径下创建文件夹并下载数据，然后在数据预处理完成后将其链接回 `data/waymo/kitti_format` 即可。
+
+**小贴士**：
+
+- **现成的标注文件**: 我们已经提供了离线处理好的 [Waymo 标注文件](#数据集标注文件列表)。您直接下载他们并放到 `data/waymo/kitti_format/` 目录下。然而，您还是需要自己使用上面的脚本将 Waymo 的原始数据还需要转成 kitti 格式。
+
+- **Waymo-mini**： 如果你只是为了验证某些方法或者 debug, 你可以使用我们提供的 [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz)。它只包含原始数据集中训练集中的 2 个 segments 和 验证集中的 1 个 segment。您只需要下载并且解压到 `data/waymo_mini/`，即可使用它：
+
+  ```bash
+  tar -xzvf waymo_mini.tar.gz -C ./data/waymo_mini
+  ```
+
+### NuScenes
+
+在[这里](https://www.nuscenes.org/download)下载 nuScenes 数据集 1.0 版本的完整数据文件。通过运行以下指令对 nuScenes 数据进行预处理：
+
+```bash
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
+```
+
+**小贴士**：
+
+- **现成的标注文件**：我们已经提供了离线处理好的 [NuScenes 标注文件](#数据集标注文件列表)。您直接下载他们并放到 `data/nuscenes/` 目录下。然而，如果你想在点云检测方法中使用 `ObjectSample` 这一数据增强，你可以再额外使用以下命令来生成物体标注框数据库：
+
+```bash
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes --only-gt-database
+```
+
+### Lyft
+
+在[这里](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data)下载 Lyft 3D 检测数据。通过运行以下指令对 Lyft 数据进行预处理：
+
+```bash
+python tools/create_data.py lyft --root-path ./data/lyft --out-dir ./data/lyft --extra-tag lyft --version v1.01
+python tools/data_converter/lyft_data_fixer.py --version v1.01 --root-folder ./data/lyft
+```
+
+注意，为了文件结构的清晰性，我们遵从了 Lyft 数据原先的文件夹名称。请按照上面展示出的文件结构对原始文件夹进行重命名。同样值得注意的是，第二行命令的目的是为了修复一个损坏的激光雷达数据文件。更多细节请参考[该讨论](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000)。
+
+### SemanticKITTI
+
+在[这里](http://semantic-kitti.org/dataset.html#download)下载 SemanticKITTI 数据集并解压所有文件。通过运行以下指令对 SemanticKITTI 数据进行预处理：
+
+```bash
+python ./tools/create_data.py semantickitti --root-path ./data/semantickitti --out-dir ./data/semantickitti --extra-tag semantickitti
+```
+
+**小贴士**：
+
+- **现成的标注文件**. 我们已经提供了离线处理好的 [SemanticKITTI 标注文件](#数据集标注文件列表)。您直接下载他们并放到 `data/semantickitti` 目录下。
+
+### S3DIS、ScanNet 和 SUN RGB-D
+
+请参考 S3DIS [README](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/data/s3dis/README.md) 文件以对其进行数据预处理。
+
+请参考 ScanNet [README](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/data/scannet/README.md) 文件以对其进行数据预处理。
+
+请参考 SUN RGB-D [README](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/data/sunrgbd/README.md) 文件以对其进行数据预处理。
+
+**小贴士**：对于 S3DIS, ScanNet 和 SUN RGB-D 数据集，我们已经提供了离线处理好的 [标注文件](#数据集标注文件列表)。您可以直接下载他们并放到 `data/${DATASET}/` 目录下。然而，您还是需要自己利用我们的脚本来生成点云文件以及语义掩膜文件(如果该数据集有的话)。
+
+### 自定义数据集
+
+关于如何使用自定义数据集，请参考[自定义数据集](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/zh_cn/advanced_guides/customize_dataset.md)。
+
+### 更新数据信息
+
+如果你之前已经使用 v1.0.0rc1-v1.0.0rc4 版的 mmdetection3d 创建数据信息，现在你想使用最新的 v1.1.0 版 mmdetection3d，你需要更新数据信息文件。
+
+```bash
+python tools/dataset_converters/update_infos_to_v2.py --dataset ${DATA_SET} --pkl-path ${PKL_PATH} --out-dir ${OUT_DIR}
+```
+
+- `--dataset`：数据集名。
+- `--pkl-path`：指定数据信息 pkl 文件路径。
+- `--out-dir`：输出数据信息 pkl 文件目录。
+
+例如：
+
+```bash
+python tools/dataset_converters/update_infos_to_v2.py --dataset kitti --pkl-path ./data/kitti/kitti_infos_trainval.pkl --out-dir ./data/kitti
+```
+
+### 数据集标注文件列表
+
+我们提供了离线生成好的数据集标注文件以供参考。为了方便，您也可以直接使用他们。
+
+|                                                  数据集                                                   |                                                                                                              训练集标注文件                                                                                                               |                                                                                                           验证集标注文件                                                                                                           |                                                                                                                 测试集标注文件                                                                                                                  |
+| :-------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|                                                   KITTI                                                   |                                                                  [kitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_train.pkl)                                                                   |                                                                 [kitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_val.pkl)                                                                 |                                                                        [kitti_infos_test](https://download.openmmlab.com/mmdetection3d/data/kitti/kitti_infos_test.pkl)                                                                         |
+|                                                 NuScenes                                                  | [nuscenes_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_train.pkl) [nuscenes_mini_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_train.pkl) | [nuscenes_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_infos_val.pkl)  [nuscenes_mini_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/nuscenes/nuscenes_mini_infos_val.pkl) |                                                                                                                                                                                                                                                 |
+|                                                   Waymo                                                   |                                                         [waymo_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_train.pkl)                                                          |                                                        [waymo_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_val.pkl)                                                        | [waymo_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo/waymo_infos_test.pkl)   [waymo_infos_test_cam_only.pkl](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_infos_test_cam_only.pkl) |
+| [Waymo-mini](https://download.openmmlab.com/mmdetection3d/data/waymo_mmdet3d_after_1x4/waymo_mini.tar.gz) |                                                                                                                                                                                                                                           |                                                                                                                                                                                                                                    |                                                                                                                                                                                                                                                 |
+|                                                 SUN RGB-D                                                 |                                                               [sunrgbd_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_train.pkl)                                                                |                                                              [sunrgbd_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/sunrgbd/sunrgbd_infos_val.pkl)                                                              |                                                                                                                                                                                                                                                 |
+|                                                  ScanNet                                                  |                                                               [scannet_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_train.pkl)                                                                |                                                              [scannet_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_val.pkl)                                                              |                                                                   [scannet_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/scannet/scannet_infos_test.pkl)                                                                    |
+|                                               SemanticKitti                                               |                                                      [semantickitti_infos_train.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_train.pkl)                                                       |                                                     [semantickitti_infos_val.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_val.pkl)                                                     |                                                          [semantickitti_infos_test.pkl](https://download.openmmlab.com/mmdetection3d/data/semantickitti/semantickitti_infos_test.pkl)                                                           |
diff --git a/mmde/docs/zh_cn/user_guides/index.rst b/mmde/docs/zh_cn/user_guides/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cefdcf3cc435b33863b937693d8ba3a3f9caf9e0
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/index.rst
@@ -0,0 +1,22 @@
+训练和测试
+**************
+.. toctree::
+   :maxdepth: 1
+
+   config.md
+   coord_sys_tutorial.md
+   dataset_prepare.md
+   data_pipeline.md
+   train_test.md
+   inference.md
+   new_data_model.md
+
+实用工具
+************
+.. toctree::
+   :maxdepth: 1
+
+   useful_tools.md
+   visualization.md
+   backends_support.md
+   model_deployment.md
diff --git a/mmde/docs/zh_cn/user_guides/inference.md b/mmde/docs/zh_cn/user_guides/inference.md
new file mode 100644
index 0000000000000000000000000000000000000000..a3a62b53f4e61c06a4662f6df7cbb99550f6334f
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/inference.md
@@ -0,0 +1,101 @@
+# 推理
+
+## 介绍
+
+我们提供了多模态/单模态（基于激光雷达/图像）、室内/室外场景的 3D 检测和 3D 语义分割样例的脚本，预训练模型可以从 [Model Zoo](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/zh_cn/model_zoo.md) 下载。我们也提供了 KITTI、SUN RGB-D、nuScenes 和 ScanNet 数据集的预处理样本数据，你可以根据我们的预处理步骤使用任何其它数据。
+
+## 测试
+
+### 3D 检测
+
+#### 点云样例
+
+在点云数据上测试 3D 检测器，运行：
+
+```shell
+python demo/pcd_demo.py ${PCD_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--score-thr ${SCORE_THR}] [--out-dir ${OUT_DIR}] [--show]
+```
+
+点云和预测 3D 框的可视化结果会被保存在 `${OUT_DIR}/PCD_NAME`，它可以使用 [MeshLab](http://www.meshlab.net/) 打开。注意如果你设置了 `--show`，通过 [Open3D](http://www.open3d.org/) 可以在线显示预测结果。
+
+在 KITTI 数据上测试 [PointPillars 模型](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth)：
+
+```shell
+python demo/pcd_demo.py demo/data/kitti/000008.bin configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-car.py ${CHECKPOINT_FILE} --show
+```
+
+在 SUN RGB-D 数据上测试 [VoteNet 模型](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/votenet/votenet_16x8_sunrgbd-3d-10class/votenet_16x8_sunrgbd-3d-10class_20210820_162823-bf11f014.pth)：
+
+```shell
+python demo/pcd_demo.py demo/data/sunrgbd/sunrgbd_000017.bin configs/votenet/votenet_8xb16_sunrgbd-3d.py ${CHECKPOINT_FILE} --show
+```
+
+#### 单目 3D 样例
+
+在图像数据上测试单目 3D 检测器，运行：
+
+```shell
+python demo/mono_det_demo.py ${IMAGE_FILE} ${ANNOTATION_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--out-dir ${OUT_DIR}] [--show]
+```
+
+`ANNOTATION_FILE` 需要提供 3D 到 2D 的仿射矩阵（相机内参矩阵），可视化结果会被保存在 `${OUT_DIR}/PCD_NAME`，其中包括图像以及预测 3D 框在图像上的投影。
+
+在 KITTI 数据上测试 [PGD 模型](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d/pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_20211022_102608-8a97533b.pth)：
+
+```shell
+python demo/mono_det_demo.py demo/data/kitti/000008.png demo/data/kitti/000008.pkl  configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py ${CHECKPOINT_FILE}  --show --cam-type CAM2 --score-thr 8
+```
+
+**注意**： PGD 方法的预测框分数并不是在 (0, 1) 之间
+
+在 nuScenes 数据上测试 [FCOS3D 模型](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d_finetune_20210717_095645-8d806dc2.pth)：
+
+```shell
+python demo/mono_det_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__CAM_BACK__1532402927637525.jpg demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl  configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d_finetune.py ${CHECKPOINT_FILE}  --show --cam-type CAM_BACK
+```
+
+**注意**： 当对翻转图像可视化单目 3D 检测结果是，相机内参矩阵也应该相应修改。在 PR [#744](https://github.com/open-mmlab/mmdetection3d/pull/744) 中可以了解更多细节和示例。
+
+#### 多模态样例
+
+在多模态数据（通常是点云和图像）上测试 3D 检测器，运行：
+
+```shell
+python demo/multi_modality_demo.py ${PCD_FILE} ${IMAGE_FILE} ${ANNOTATION_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--score-thr ${SCORE_THR}] [--out-dir ${OUT_DIR}] [--show]
+```
+
+`ANNOTATION_FILE` 需要提供 3D 到 2D 的仿射矩阵，可视化结果会被保存在 `${OUT_DIR}/PCD_NAME`，其中包括点云、图像、预测的 3D 框以及它们在图像上的投影。
+
+在 KITTI 数据上测试 [MVX-Net 模型](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class-8963258a.pth)：
+
+```shell
+python demo/multi_modality_demo.py demo/data/kitti/000008.bin demo/data/kitti/000008.png demo/data/kitti/000008.pkl configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py ${CHECKPOINT_FILE} --cam-type CAM2 --show
+```
+
+在 SUN RGB-D 数据上测试 [ImVoteNet 模型](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851-1bcd1b97.pth)：
+
+```shell
+python demo/multi_modality_demo.py demo/data/sunrgbd/000017.bin demo/data/sunrgbd/000017.jpg demo/data/sunrgbd/sunrgbd_000017_infos.pkl configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py ${CHECKPOINT_FILE} --cam-type CAM0 --show --score-thr 0.6
+```
+
+在 NuScenes 数据上测试 [BEVFusion 模型](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link)
+
+```shell
+python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
+```
+
+### 3D 分割
+
+在点云数据上测试 3D 分割器，运行：
+
+```shell
+python demo/pc_seg_demo.py ${PCD_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--out-dir ${OUT_DIR}] [--show]
+```
+
+可视化结果会被保存在 `${OUT_DIR}/PCD_NAME`，其中包括点云以及预测的 3D 分割掩码。
+
+在 ScanNet 数据上测试 [PointNet++ (SSG) 模型](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class/pointnet2_ssg_16x2_cosine_200e_scannet_seg-3d-20class_20210514_143644-ee73704a.pth)：
+
+```shell
+python demo/pcd_seg_demo.py demo/data/scannet/scene0000_00.bin configs/pointnet2/pointnet2_ssg_2xb16-cosine-200e_scannet-seg.py ${CHECKPOINT_FILE} --show
+```
diff --git a/mmde/docs/zh_cn/user_guides/model_deployment.md b/mmde/docs/zh_cn/user_guides/model_deployment.md
new file mode 100644
index 0000000000000000000000000000000000000000..f66172c184c4d43375f6ce3fa9f0d19a75fc7b9d
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/model_deployment.md
@@ -0,0 +1,4 @@
+# 模型部署（待更新）
+
+MMDet3D 1.1 完全基于 [MMDeploy](https://mmdeploy.readthedocs.io/) 來部署模型。
+我们将在下一个版本完善这个文档。
diff --git a/mmde/docs/zh_cn/user_guides/new_data_model.md b/mmde/docs/zh_cn/user_guides/new_data_model.md
new file mode 100644
index 0000000000000000000000000000000000000000..add2fe5ec73d43aabcc7cf02f2042ffb22b968e2
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/new_data_model.md
@@ -0,0 +1,102 @@
+# 在自定义数据集上进行训练
+
+本文将主要介绍如何使用自定义数据集来进行模型的训练和测试，以 Waymo 数据集作为示例来说明整个流程。
+
+基本步骤如下所示：
+
+1. 准备自定义数据集；
+2. 准备配置文件；
+3. 在自定义数据集上进行模型的训练、测试和推理。
+
+## 准备自定义数据集
+
+在 MMDetection3D 中有三种方式来自定义一个新的数据集：
+
+1. 将新数据集的数据格式重新组织成已支持的数据集格式；
+2. 将新数据集的数据格式重新组织成已支持的一种中间格式；
+3. 从头开始创建一个新的数据集。
+
+由于前两种方式比第三种方式更加容易，我们更加建议采用前两种方式来自定义数据集。
+
+在本文中，我们给出示例将数据转换成 KITTI 数据集的数据格式，你可以参考此处将你的数据集重新组织成 KITTI 格式。关于标准格式的数据集，你可以参考[自定义数据集文档](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/docs/zh_cn/advanced_guides/customize_dataset.md)。
+
+**注意**：考虑到 Waymo 数据集的格式与现有的其他数据集的格式的差别较大，因此本文以该数据集为例来讲解如何自定义数据集，从而方便理解数据集自定义的过程。若需要创建的新数据集与现有的数据集的组织格式较为相似，如 Lyft 数据集和 nuScenes 数据集，采用对数据集的中间格式进行转换的方式（第二种方式）相比于采用对数据格式进行转换的方式（第一种方式）会更加简单易行。
+
+### KITTI 数据集格式
+
+应用于 3D 目标检测的 KITTI 原始数据集的组织方式通常如下所示，其中 `ImageSets` 包含数据集划分文件，用以划分训练集/验证集/测试集，`calib` 包含对于每个数据样本的标定信息，`image_2` 和 `velodyne` 分别包含图像数据和点云数据，`label_2` 包含与 3D 目标检测相关的标注文件。
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── kitti
+│   │   ├── ImageSets
+│   │   ├── testing
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── velodyne
+│   │   ├── training
+│   │   │   ├── calib
+│   │   │   ├── image_2
+│   │   │   ├── label_2
+│   │   │   ├── velodyne
+```
+
+KITTI 官方提供的目标检测开发[工具包](https://s3.eu-central-1.amazonaws.com/avg-kitti/devkit_object.zip)详细描述了 KITTI 数据集的标注格式，例如，KITTI 标注格式包含了以下的标注信息：
+
+```
+#  值    名称      描述
+----------------------------------------------------------------------------
+   1    类型      描述检测目标的类型：'Car'，'Van'，'Truck'，
+                  'Pedestrian'，'Person_sitting'，'Cyclist'，'Tram'，
+                  'Misc' 或 'DontCare'
+   1    截断程度　 从 0（非截断）到 1（截断）的浮点数，其中截断指的是离开检测图像边界的检测目标
+   1    遮挡程度　 用来表示遮挡状态的四种整数（0，1，2，3）:
+                  0 = 可见，1 = 部分遮挡
+                  2 = 大面积遮挡，3 = 未知
+   1    观测角    观测目标的角度，取值范围为 [-pi..pi]
+   4    标注框    检测目标在图像中的二维标注框（以0为初始下标）：包括每个检测目标的左上角和右下角的坐标
+   3    维度　    检测目标的三维维度：高度、宽度、长度（以米为单位）
+   3    位置　    相机坐标系下的三维位置 x，y，z（以米为单位）
+   1    y 旋转　  相机坐标系下检测目标绕着Y轴的旋转角，取值范围为 [-pi..pi]
+   1    得分　    仅在计算结果时使用，检测中表示置信度的浮点数，用于生成 p/r 曲线，在p/r 图中，越高的曲线表示结果越好。
+```
+
+假定我们使用 Waymo 数据集。
+
+在下载好数据集后，我们需要实现一个函数用来将输入数据和标注文件转换成 KITTI 风格。然后我们可以通过继承 `KittiDataset` 实现 `WaymoDataset`，用来加载数据以及训练模型，通过继承 `KittiMetric` 实现 `WaymoMetric` 来做模型的评估。
+
+具体来说，首先使用[数据转换器](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tools/dataset_converters/waymo_converter.py)将 Waymo 数据集转换成 KITTI 数据集的格式，并定义 [Waymo 类](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/datasets/waymo_dataset.py)对转换的数据进行处理。此外需要添加 waymo [评估类](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/evaluation/metrics/waymo_metric.py)来评估结果。因为我们将 Waymo 原始数据集进行预处理并重新组织成 KITTI 数据集的格式，因此可以比较容易通过继承 KittiDataset 类来实现 WaymoDataset 类。需要注意的是，由于 Waymo 数据集有相应的官方评估方法，我们需要进一步实现新的 Waymo 评估方法，更多关于评估方法参考[评估文档](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/metric_and_evaluator.md)。最后，用户可以成功地转换数据并使用 `WaymoDataset` 训练以及 `WaymoMetric` 评估模型。
+
+更多关于 Waymo 数据集预处理的中间结果的细节，请参照对应的[说明文档](https://mmdetection3d.readthedocs.io/zh_CN/latest/datasets/waymo_det.html)。
+
+## 准备配置文件
+
+第二步是准备配置文件来帮助数据集的读取和使用，另外，为了在 3D 检测中获得不错的性能，调整超参数通常是必要的。
+
+假设我们想要使用 PointPillars 模型在 Waymo 数据集上实现三类的 3D 目标检测：vehicle、cyclist、pedestrian，参照 KITTI 数据集[配置文件](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/kitti-3d-3class.py)、模型[配置文件](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/models/pointpillars_hv_secfpn_kitti.py)和[整体配置文件](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py)，我们需要准备[数据集配置文件](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/datasets/waymoD5-3d-3class.py)、[模型配置文件](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/models/pointpillars_hv_secfpn_waymo.py)，并将这两种文件进行结合得到[整体配置文件](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py)。
+
+## 训练一个新的模型
+
+为了使用一个新的配置文件来训练模型，可以通过下面的命令来实现：
+
+```shell
+python tools/train.py configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py
+```
+
+更多的使用细节，请参考[案例 1](https://mmdetection3d.readthedocs.io/zh_CN/latest/1_exist_data_model.html)。
+
+## 测试和推理
+
+为了测试已经训练好的模型的性能，可以通过下面的命令来实现：
+
+```shell
+python tools/test.py configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class.py work_dirs/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymoD5-3d-3class/latest.pth
+```
+
+**注意**：为了使用 Waymo 数据集的评估方法，需要参考[说明文档](https://mmdetection3d.readthedocs.io/zh_CN/latest/datasets/waymo_det.html)并按照官方指导来准备与评估相关联的文件。
+
+更多有关测试和推理的使用细节，请参考[案例 1](https://mmdetection3d.readthedocs.io/zh_CN/latest/1_exist_data_model.html) 。
diff --git a/mmde/docs/zh_cn/user_guides/train_test.md b/mmde/docs/zh_cn/user_guides/train_test.md
new file mode 100644
index 0000000000000000000000000000000000000000..3b475a444e9332daf91ab7795504d36ef189aa20
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/train_test.md
@@ -0,0 +1,254 @@
+# 在标注数据集上测试和训练
+
+### 在标准数据集上测试已有模型
+
+- 单显卡
+- CPU
+- 单节点多显卡
+- 多节点
+
+你可以通过以下命令来测试数据集：
+
+```shell
+# 单块显卡测试
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] [--show-dir ${SHOW_DIR}]
+
+# CPU：禁用显卡并运行单块 CPU 测试脚本（实验性）
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] [--show-dir ${SHOW_DIR}]
+
+# 多块显卡测试
+./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}]
+```
+
+**注意**:
+
+目前我们只支持 SMOKE 的 CPU 推理测试。
+
+可选参数：
+
+- `--show`：如果被指定，检测结果会在静默模式下被保存，用于调试和可视化，但只在单块 GPU 测试的情况下生效，和 `--show-dir` 搭配使用。
+- `--show-dir`：如果被指定，检测结果会被保存在指定文件夹下的 `***_points.obj` 和 `***_pred.obj` 文件中，用于调试和可视化，但只在单块 GPU 测试的情况下生效，对于这个选项，图形化界面在你的环境中不是必需的。
+
+所有和评估相关的参数在相应的数据集配置的 `test_evaluator` 中设置。例如 `test_evaluator = dict(type='KittiMetric', ann_file=data_root + 'kitti_infos_val.pkl', pklfile_prefix=None, submission_prefix=None)`
+
+参数：
+
+- `type`：相对应的评价指标名，通常和数据集相关联。
+- `ann_file`：标注文件路径。
+- `pklfile_prefix`：可选参数。输出结果保存成 pickle 格式的文件名。如果没有指定，结果将不会保存成文件。
+- `submission_prefix`：可选参数。结果将被保存到文件中，然后你可以将它上传到官方评估服务器中。
+
+示例：
+
+假定你已经把模型权重文件下载到 `checkpoints/` 文件夹下，
+
+1. 在 ScanNet 数据集上测试 VoteNet，保存模型，可视化预测结果
+
+   ```shell
+   python tools/test.py configs/votenet/votenet_8xb8_scannet-3d.py \
+       checkpoints/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth \
+       --show --show-dir ./data/scannet/show_results
+   ```
+
+2. 在 ScanNet 数据集上测试 VoteNet，保存模型，可视化预测结果，可视化真实标签，计算 mAP
+
+   ```shell
+   python tools/test.py configs/votenet/votenet_8xb8_scannet-3d.py \
+       checkpoints/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth \
+       --show --show-dir ./data/scannet/show_results
+   ```
+
+3. 在 ScanNet 数据集上测试 VoteNet（不保存测试结果），计算 mAP
+
+   ```shell
+   python tools/test.py configs/votenet/votenet_8xb8_scannet-3d.py \
+       checkpoints/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth
+   ```
+
+4. 使用 8 块显卡在 KITTI 数据集上测试 SECOND，计算 mAP
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py \
+       checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238-9208083a.pth
+   ```
+
+5. 使用 8 块显卡在 nuScenes 数据集上测试 PointPillars，生成提交给官方评测服务器的 json 文件
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/pointpillars/pointpillars_hv_secfpn_sbn-all_8xb4-2x_nus-3d.py \
+       checkpoints/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth \
+      --cfg-options 'test_evaluator.jsonfile_prefix=./pointpillars_nuscenes_results'
+   ```
+
+   生成的结果会保存在 `./pointpillars_nuscenes_results` 目录。
+
+6. 使用 8 块显卡在 KITTI 数据集上测试 SECOND，生成提交给官方评测服务器的 txt 文件
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/second/second_hv_secfpn_8xb6-80e_kitti-3d-3class.py \
+       checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238-9208083a.pth \
+       --cfg-options 'test_evaluator.pklfile_prefix=./second_kitti_results' 'submission_prefix=./second_kitti_results'
+   ```
+
+   生成的结果会保存在 `./second_kitti_results` 目录。
+
+7. 使用 8 块显卡在 Lyft 数据集上测试 PointPillars，生成提交给排行榜的 pkl 文件
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/pointpillars/hv_pointpillars_fpn_sbn-2x8_2x_lyft-3d.py \
+       checkpoints/hv_pointpillars_fpn_sbn-2x8_2x_lyft-3d_latest.pth \
+       --cfg-options 'test_evaluator.jsonfile_prefix=results/pp_lyft/results_challenge' \
+       'test_evaluator.csv_savepath=results/pp_lyft/results_challenge.csv' \
+       'test_evaluator.pklfile_prefix=results/pp_lyft/results_challenge.pkl'
+   ```
+
+   **注意**：为了生成 Lyft 数据集的提交结果，`--eval-options` 必须指定 `csv_savepath`。生成 csv 文件后，你可以使用[网站](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/submit)上给出的 kaggle 命令提交结果。
+
+   注意在 [Lyft 数据集的配置文件](../../configs/_base_/datasets/lyft-3d.py)，`test` 中的 `ann_file` 值为 `lyft_infos_test.pkl`，是没有标注的 Lyft 官方测试集。要在验证数据集上测试，请把它改为 `lyft_infos_val.pkl`。
+
+8. 使用 8 块显卡在 waymo 数据集上测试 PointPillars，使用 waymo 度量方法计算 mAP
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py  \
+       checkpoints/hv_pointpillars_secfpn_sbn-2x16_2x_waymo-3d-car_latest.pth \
+       --cfg-options 'test_evaluator.pklfile_prefix=results/waymo-car/kitti_results' \
+       'test_evaluator.submission_prefix=results/waymo-car/kitti_results'
+   ```
+
+   **注意**：对于 waymo 数据集上的评估，请根据[说明](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md/)构建二进制文件 `compute_detection_metrics_main` 来做度量计算，并把它放在 `mmdet3d/core/evaluation/waymo_utils/`。（在使用 bazel 构建  `compute_detection_metrics_main` 时，有时会出现 `'round' is not a member of 'std'` 的错误，我们只需要把那个文件中 `round` 前的 `std::` 去掉。）二进制文件生成时需要在 `--eval-options` 中给定 `pklfile_prefix`。对于度量方法，`waymo` 是推荐的官方评估策略，目前 `kitti` 评估是依照 KITTI 而来的，每个难度的结果和 KITTI 的定义并不完全一致。目前大多数物体都被标记为0难度，会在未来修复。它的不稳定原因包括评估的计算大、转换后的数据缺乏遮挡和截断、难度的定义不同以及平均精度的计算方法不同。
+
+9. 使用 8 块显卡在 waymo 数据集上测试 PointPillars，生成 bin 文件并提交到排行榜
+
+   ```shell
+   ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} configs/pointpillars/pointpillars_hv_secfpn_sbn-all_16xb2-2x_waymo-3d-car.py  \
+       checkpoints/hv_pointpillars_secfpn_sbn-2x16_2x_waymo-3d-car_latest.pth \
+       --cfg-options 'test_evaluator.pklfile_prefix=results/waymo-car/kitti_results' \
+       'test_evaluator.submission_prefix=results/waymo-car/kitti_results'
+   ```
+
+   **注意**：生成 bin 文件后，你可以简单地构建二进制文件  `create_submission`，并根据[说明](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md/)创建提交的文件。要在验证服务器上评测验证数据集，你也可以用同样的方式生成提交的文件。
+
+## 在标准数据集上训练预定义模型
+
+MMDetection3D 分别用 `MMDistributedDataParallel` and `MMDataParallel` 实现了分布式训练和非分布式训练。
+
+所有的输出（日志文件和模型权重文件）都会被保存到工作目录下，通过配置文件里的 `work_dir` 指定。
+
+默认我们每过一个周期都在验证数据集上评测模型，你可以通过在训练配置里添加间隔参数来改变评测的时间间隔：
+
+```python
+train_cfg = dict(type='EpochBasedTrainLoop', val_interval=1)  # 每12个周期评估一次模型
+```
+
+**重要**：配置文件中的默认学习率对应 8 块显卡，配置文件名里有具体的批量大小，比如 '2xb8' 表示一共 8 块显卡，每块显卡 2 个样本。
+根据 [Linear Scaling Rule](https://arxiv.org/abs/1706.02677)，当你使用不同数量的显卡或每块显卡有不同数量的图像时，需要依批量大小按比例调整学习率。如果用 4 块显卡、每块显卡 2 幅图像时学习率为 0.01，那么用 16 块显卡、每块显卡 4 幅图像时学习率应设为 0.08。然而，由于大多数模型使用 ADAM 而不是 SGD 进行优化，上述规则可能并不适用，用户需要自己调整学习率。
+
+### 使用单块显卡进行训练
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+如果你想在命令中指定工作目录，添加参数 `--work-dir ${YOUR_WORK_DIR}`。
+
+### 使用 CPU 进行训练 (实验性)
+
+在 CPU 上训练的过程与单 GPU 训练一致。 我们只需要在训练过程之前禁用显卡。
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+之后运行单显卡训练脚本即可。
+
+**注意**：
+
+目前，大多数点云相关算法都依赖于 3D CUDA 算子，无法在 CPU 上进行训练。 一些单目 3D 物体检测算法，例如 FCOS3D、SMOKE 可以在 CPU 上进行训练。我们不推荐用户使用 CPU 进行训练，这太过缓慢。我们支持这个功能是为了方便用户在没有显卡的机器上调试某些特定的方法。
+
+### 使用多块显卡进行训练
+
+```shell
+./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+可选参数：
+
+- `--cfg-options 'Key=value'`：覆盖使用的配置中的一些设定。
+
+### 使用多个机器进行训练
+
+如果要在 [slurm](https://slurm.schedmd.com/) 管理的集群上运行 MMDectection3D，你可以使用 `slurm_train.sh` 脚本（该脚本也支持单机训练）
+
+```shell
+[GPUS=${GPUS}] ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR}
+```
+
+下面是一个使用 16 块显卡在 dev 分区上训练 Mask R-CNN 的示例：
+
+```shell
+GPUS=16 ./tools/slurm_train.sh dev pp_kitti_3class configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py /nfs/xxxx/pp_kitti_3class
+```
+
+你可以查看 [slurm_train.sh](https://github.com/open-mmlab/mmdetection/blob/master/tools/slurm_train.sh) 来获取所有的参数和环境变量。
+
+如果您想使用由 ethernet 连接起来的多台机器， 您可以使用以下命令:
+
+在第一台机器上:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR ./tools/dist_train.sh $CONFIG $GPUS
+```
+
+在第二台机器上:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR ./tools/dist_train.sh $CONFIG $GPUS
+```
+
+但是，如果您不使用高速网路连接这几台机器的话，训练将会非常慢。
+
+### 在单个机器上启动多个任务
+
+如果你在单个机器上启动多个任务，比如，在具有8块显卡的机器上进行2个4块显卡训练的任务，你需要为每个任务指定不同的端口（默认为29500）以避免通信冲突。
+
+如果你使用 `dist_train.sh` 启动训练任务，可以在命令中设置端口：
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+如果你使用 Slurm 启动训练任务，有两种方式指定端口：
+
+1. 通过 `--cfg-options` 设置端口，这是更推荐的，因为它不改变原来的配置
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} --cfg-options 'env_cfg.dist_cfg.port=29500'
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} --cfg-options 'env_cfg.dist_cfg.port=29501'
+   ```
+
+2. 修改配置文件（通常在配置文件的倒数第6行）来设置不同的通信端口
+
+   在 `config1.py` 中，
+
+   ```python
+   env_cfg = dict(
+       dist_cfg=dict(backend='nccl', port=29500)
+   )
+   ```
+
+   在 `config2.py` 中，
+
+   ```python
+   env_cfg = dict(
+       dist_cfg=dict(backend='nccl', port=29501)
+   )
+   ```
+
+   然后，你可以使用 `config1.py` and `config2.py` 启动两个任务
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR}
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR}
+   ```
diff --git a/mmde/docs/zh_cn/user_guides/useful_tools.md b/mmde/docs/zh_cn/user_guides/useful_tools.md
new file mode 100644
index 0000000000000000000000000000000000000000..5cd256c73e99ebb189201a0963856d0c273b5691
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/useful_tools.md
@@ -0,0 +1,211 @@
+我们在 `tools/` 文件夹路径下提供了许多有用的工具。
+
+## 日志分析
+
+给定一个训练的日志文件，您可以绘制出 loss/mAP 曲线。首先需要运行 `pip install seaborn` 安装依赖包。
+
+![loss曲线图](../../../resources/loss_curve.png)
+
+```shell
+python tools/analysis_tools/analyze_logs.py plot_curve [--keys ${KEYS}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] [--mode ${MODE}] [--interval ${INTERVAL}]
+```
+
+**注意**: 如果您想绘制的指标是在验证阶段计算得到的，您需要添加一个标志 `--mode eval` ，如果您每经过一个 `${INTERVAL}` 的间隔进行评估，您需要增加一个参数 `--interval ${INTERVAL}`。
+
+示例：
+
+- 绘制出某次运行的分类 loss。
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls --legend loss_cls
+  ```
+
+- 绘制出某次运行的分类和回归 loss，并且保存图片为 pdf 格式。
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls loss_bbox --out losses.pdf
+  ```
+
+- 在同一张图片中比较两次运行的 bbox mAP。
+
+  ```shell
+  # 根据 Car_3D_moderate_strict 在 KITTI 上评估 PartA2 和 second。
+  python tools/analysis_tools/analyze_logs.py plot_curve tools/logs/PartA2.log.json tools/logs/second.log.json --keys KITTI/Car_3D_moderate_strict --legend PartA2 second --mode eval --interval 1
+  # 根据 Car_3D_moderate_strict 在 KITTI 上分别对车和 3 类评估 PointPillars。
+  python tools/analysis_tools/analyze_logs.py plot_curve tools/logs/pp-3class.log.json tools/logs/pp.log.json --keys KITTI/Car_3D_moderate_strict --legend pp-3class pp --mode eval --interval 2
+  ```
+
+您也能计算平均训练速度。
+
+```shell
+python tools/analysis_tools/analyze_logs.py cal_train_time log.json [--include-outliers]
+```
+
+预期输出应该如下所示。
+
+```
+-----Analyze train time of work_dirs/some_exp/20190611_192040.log.json-----
+slowest epoch 11, average time is 1.2024
+fastest epoch 1, average time is 1.1909
+time std over epochs is 0.0028
+average iter time: 1.1959 s/iter
+```
+
+&#8195;
+
+## 模型部署
+
+**注意**：此工具仍然处于试验阶段，目前只有 SECOND 支持用 [`TorchServe`](https://pytorch.org/serve/) 部署，我们将会在未来支持更多的模型。
+
+为了使用 [`TorchServe`](https://pytorch.org/serve/) 部署 `MMDetection3D` 模型，您可以遵循以下步骤：
+
+### 1. 将模型从 MMDetection3D 转换到 TorchServe
+
+```shell
+python tools/deployment/mmdet3d2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \
+--output-folder ${MODEL_STORE} \
+--model-name ${MODEL_NAME}
+```
+
+**Note**: ${MODEL_STORE} 需要为文件夹的绝对路径。
+
+### 2. 构建 `mmdet3d-serve` 镜像
+
+```shell
+docker build -t mmdet3d-serve:latest docker/serve/
+```
+
+### 3. 运行 `mmdet3d-serve`
+
+查看官网文档来 [使用 docker 运行 TorchServe](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment)。
+
+为了在 GPU 上运行，您需要安装 [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)。您可以忽略 `--gpus` 参数，从而在 CPU 上运行。
+
+例子：
+
+```shell
+docker run --rm \
+--cpus 8 \
+--gpus device=0 \
+-p8080:8080 -p8081:8081 -p8082:8082 \
+--mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \
+mmdet3d-serve:latest
+```
+
+[阅读文档](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md/) 关于 Inference (8080), Management (8081) and Metrics (8082) 接口。
+
+### 4. 测试部署
+
+您可以使用 `test_torchserver.py` 进行部署， 同时比较 torchserver 和 pytorch 的结果。
+
+```shell
+python tools/deployment/test_torchserver.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME}
+[--inference-addr ${INFERENCE_ADDR}] [--device ${DEVICE}] [--score-thr ${SCORE_THR}]
+```
+
+例子:
+
+```shell
+python tools/deployment/test_torchserver.py demo/data/kitti/kitti_000008.bin configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py checkpoints/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth second
+```
+
+&#8195;
+
+# 模型复杂度
+
+您可以使用 MMDetection 中的 `tools/analysis_tools/get_flops.py` 这个脚本文件，基于 [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) 计算一个给定模型的计算量 (FLOPS) 和参数量 (params)。
+
+```shell
+python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}]
+```
+
+您将会得到如下的结果：
+
+```text
+==============================
+Input shape: (4000, 4)
+Flops: 5.78 GFLOPs
+Params: 953.83 k
+==============================
+```
+
+**注意**：此工具仍然处于试验阶段，我们不能保证数值是绝对正确的。您可以将结果用于简单的比较，但在写技术文档报告或者论文之前您需要再次确认一下。
+
+1. 计算量 (FLOPs) 和输入形状有关，但是参数量 (params) 则和输入形状无关。默认的输入形状为 (1, 40000, 4)。
+2. 一些运算操作不计入计算量 (FLOPs)，比如说像GN和定制的运算操作，详细细节请参考 [`mmcv.cnn.get_model_complexity_info()`](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/flops_counter.py)。
+3. 我们现在仅仅支持单模态输入（点云或者图片）的单阶段模型的计算量 (FLOPs) 计算，我们将会在未来支持两阶段和多模态模型的计算。
+
+&#8195;
+
+## 模型转换
+
+### RegNet 模型转换到 MMDetection
+
+`tools/model_converters/regnet2mmdet.py` 将 pycls 预训练 RegNet 模型中的键转换为 MMDetection 风格。
+
+```shell
+python tools/model_converters/regnet2mmdet.py ${SRC} ${DST} [-h]
+```
+
+### Detectron ResNet 转换到 Pytorch
+
+MMDetection 中的 `tools/detectron2pytorch.py` 能够把原始的 detectron 中预训练的 ResNet 模型的键转换为 PyTorch 风格。
+
+```shell
+python tools/detectron2pytorch.py ${SRC} ${DST} ${DEPTH} [-h]
+```
+
+### 准备要发布的模型
+
+`tools/model_converters/publish_model.py` 帮助用户准备他们用于发布的模型。
+
+在您上传一个模型到云服务器 (AWS) 之前，您需要做以下几步：
+
+1. 将模型权重转换为 CPU 张量
+2. 删除记录优化器状态 (optimizer states) 的相关信息
+3. 计算检查点 (checkpoint) 文件的哈希编码 (hash id) 并且把哈希编码加到文件名里
+
+```shell
+python tools/model_converters/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME}
+```
+
+例如，
+
+```shell
+python tools/model_converters/publish_model.py work_dirs/faster_rcnn/latest.pth faster_rcnn_r50_fpn_1x_20190801.pth
+```
+
+最终的输出文件名将会是 `faster_rcnn_r50_fpn_1x_20190801-{hash id}.pth`。
+
+&#8195;
+
+# 数据集转换
+
+`tools/dataset_converters/` 包含转换数据集为其他格式的一些工具。其中大多数转换数据集为基于 pickle 的信息文件，比如 KITTI，nuscense 和 lyft。Waymo 转换器被用来重新组织 waymo 原始数据为 KITTI 风格。用户能够参考它们了解我们转换数据格式的方法。将它们修改为 nuImages 转换器等脚本也很方便。
+
+为了转换 nuImages 数据集为 COCO 格式，请使用下面的指令：
+
+```shell
+python -u tools/dataset_converters/nuimage_converter.py --data-root ${DATA_ROOT} --version ${VERSIONS} \
+                                                    --out-dir ${OUT_DIR} --nproc ${NUM_WORKERS} --extra-tag ${TAG}
+```
+
+- `--data-root`: 数据集的根目录，默认为 `./data/nuimages`。
+- `--version`: 数据集的版本，默认为 `v1.0-mini`。要获取完整数据集，请使用 `--version v1.0-train v1.0-val v1.0-mini`。
+- `--out-dir`: 注释和语义掩码的输出目录，默认为 `./data/nuimages/annotations/`。
+- `--nproc`: 数据准备的进程数，默认为 `4`。由于图片是并行处理的，更大的进程数目能够减少准备时间。
+- `--extra-tag`: 注释的额外标签，默认为 `nuimages`。这可用于将不同时间处理的不同注释分开以供研究。
+
+更多的数据准备细节参考 [doc](https://mmdetection3d.readthedocs.io/zh_CN/latest/data_preparation.html)，nuImages 数据集的细节参考 [README](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/nuimages/README.md/)。
+
+&#8195;
+
+# 其他内容
+
+## 打印完整的配置文件
+
+`tools/misc/print_config.py` 逐字打印整个配置文件，展开所有的导入。
+
+```shell
+python tools/misc/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}]
+```
diff --git a/mmde/docs/zh_cn/user_guides/visualization.md b/mmde/docs/zh_cn/user_guides/visualization.md
new file mode 100644
index 0000000000000000000000000000000000000000..d09c45714b9ced7e8a4db7962851f737400779e7
--- /dev/null
+++ b/mmde/docs/zh_cn/user_guides/visualization.md
@@ -0,0 +1,202 @@
+# 可视化
+
+MMDetection3D 提供了 `Det3DLocalVisualizer` 用来在训练及测试阶段可视化和存储模型的状态以及结果，其具有以下特性：
+
+1. 支持多模态数据和多任务的基本绘图界面。
+2. 支持多个后端（如 local，TensorBoard），将训练状态（如 `loss`，`lr`）或模型评估指标写入指定的一个或多个后端中。
+3. 支持多模态数据真实标签的可视化，3D 检测结果的跨模态可视化。
+
+## 基本绘制界面
+
+继承自 `DetLocalVisualizer`，`Det3DLocalVisualizer` 提供了在 2D 图像上绘制常见目标的界面，例如绘制检测框、点、文本、线、圆、多边形、二进制掩码等。关于 2D 绘制的更多细节，请参考 MMDetection 中的[可视化文档](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html)。这里我们介绍 3D 绘制界面。
+
+### 在图像上绘制点云
+
+通过使用 `draw_points_on_image`，我们支持在图像上绘制点云。
+
+```python
+import mmcv
+import numpy as np
+from mmengine import load
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+
+info_file = load('demo/data/kitti/000008.pkl')
+points = np.fromfile('demo/data/kitti/000008.bin', dtype=np.float32)
+points = points.reshape(-1, 4)[:, :3]
+lidar2img = np.array(info_file['data_list'][0]['images']['CAM2']['lidar2img'], dtype=np.float32)
+
+visualizer = Det3DLocalVisualizer()
+img = mmcv.imread('demo/data/kitti/000008.png')
+img = mmcv.imconvert(img, 'bgr', 'rgb')
+visualizer.set_image(img)
+visualizer.draw_points_on_image(points, lidar2img)
+visualizer.show()
+```
+
+![points_on_image](../../../resources/points_on_image.png)
+
+### 在点云上绘制 3D 框
+
+通过使用 `draw_bboxes_3d`，我们支持在点云上绘制 3D 框。
+
+```python
+import torch
+import numpy as np
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+from mmdet3d.structures import LiDARInstance3DBoxes
+
+points = np.fromfile('demo/data/kitti/000008.bin', dtype=np.float32)
+points = points.reshape(-1, 4)
+visualizer = Det3DLocalVisualizer()
+# set point cloud in visualizer
+visualizer.set_points(points)
+bboxes_3d = LiDARInstance3DBoxes(
+    torch.tensor([[8.7314, -1.8559, -1.5997, 4.2000, 3.4800, 1.8900,
+                   -1.5808]]))
+# Draw 3D bboxes
+visualizer.draw_bboxes_3d(bboxes_3d)
+visualizer.show()
+```
+
+![mono3d](../../../resources/pcd.png)
+
+### 在图像上绘制投影的 3D 框
+
+通过使用 `draw_proj_bboxes_3d`，我们支持在图像上绘制投影的 3D 框。
+
+```python
+import mmcv
+import numpy as np
+from mmengine import load
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+from mmdet3d.structures import CameraInstance3DBoxes
+
+info_file = load('demo/data/kitti/000008.pkl')
+cam2img = np.array(info_file['data_list'][0]['images']['CAM2']['cam2img'], dtype=np.float32)
+bboxes_3d = []
+for instance in info_file['data_list'][0]['instances']:
+    bboxes_3d.append(instance['bbox_3d'])
+gt_bboxes_3d = np.array(bboxes_3d, dtype=np.float32)
+gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d)
+input_meta = {'cam2img': cam2img}
+
+visualizer = Det3DLocalVisualizer()
+
+img = mmcv.imread('demo/data/kitti/000008.png')
+img = mmcv.imconvert(img, 'bgr', 'rgb')
+visualizer.set_image(img)
+# project 3D bboxes to image
+visualizer.draw_proj_bboxes_3d(gt_bboxes_3d, input_meta)
+visualizer.show()
+```
+
+### 绘制 BEV 视角的框
+
+通过使用 `draw_bev_bboxes`，我们支持绘制 BEV 视角下的框。
+
+```python
+import numpy as np
+from mmengine import load
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+from mmdet3d.structures import CameraInstance3DBoxes
+
+info_file = load('demo/data/kitti/000008.pkl')
+bboxes_3d = []
+for instance in info_file['data_list'][0]['instances']:
+    bboxes_3d.append(instance['bbox_3d'])
+gt_bboxes_3d = np.array(bboxes_3d, dtype=np.float32)
+gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d)
+
+visualizer = Det3DLocalVisualizer()
+# set bev image in visualizer
+visualizer.set_bev_image()
+# draw bev bboxes
+visualizer.draw_bev_bboxes(gt_bboxes_3d, edge_colors='orange')
+visualizer.show()
+```
+
+### 绘制 3D 分割掩码
+
+通过使用 `draw_seg_mask`，我们支持通过逐点着色来绘制分割掩码。
+
+```python
+import numpy as np
+
+from mmdet3d.visualization import Det3DLocalVisualizer
+
+points = np.fromfile('demo/data/sunrgbd/000017.bin', dtype=np.float32)
+points = points.reshape(-1, 3)
+visualizer = Det3DLocalVisualizer()
+mask = np.random.rand(points.shape[0], 3)
+points_with_mask = np.concatenate((points, mask), axis=-1)
+# Draw 3D points with mask
+visualizer.set_points(points, pcd_mode=2, vis_mode='add')
+visualizer.draw_seg_mask(points_with_mask)
+visualizer.show()
+```
+
+## 结果
+
+如果想要可视化训练模型的预测结果，你可以运行如下指令：
+
+```bash
+python tools/test.py ${CONFIG_FILE} ${CKPT_PATH} --show --show-dir ${SHOW_DIR}
+```
+
+运行该指令后，绘制的结果（包括输入数据和网络输出在输入上的可视化）将会被保存在 `${SHOW_DIR}` 中。
+
+运行该指令后，你将在 `${SHOW_DIR}` 中获得输入数据，网络输出和真是标签在输入上的可视化（如在多模态检测任务和基于视觉的检测任务中的 `***_gt.png` 和 `***_pred.png`）。当启用 `show` 时，[Open3D](http://www.open3d.org/) 将会用于在线可视化结果。如果你是在没有 GUI 的远程服务器上测试时，在线可视化是不被支持的。你可以从远程服务器中下载 `results.pkl`，并在本地机器上离线可视化预测结果。
+
+使用 `Open3D` 后端离线可视化结果，你可以运行如下指令：
+
+```bash
+python tools/misc/visualize_results.py ${CONFIG_FILE} --result ${RESULTS_PATH} --show-dir ${SHOW_DIR}
+```
+
+![](../../../resources/open3d_visual.gif)
+
+这需要在远程服务器中能够推理并生成结果，然后用户在主机中使用 GUI 打开。
+
+## 数据集
+
+我们也提供了脚本来可视化数据集而无需推理。你可以使用 `tools/misc/browse_dataset.py` 来在线可视化加载的数据的真实标签，并保存在硬盘中。目前我们支持所有数据集的单模态 3D 检测和 3D 分割，KITTI 和 SUN RGB-D 的多模态 3D 检测，以及 nuScenes 的单目 3D 检测。如果想要浏览 KITTI 数据集，你可以运行如下指令：
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task lidar_det --output-dir ${OUTPUT_DIR}
+```
+
+**注意**：一旦指定了 `--output-dir`，当在 open3d 窗口中按下 `_ESC_` 时，用户指定的视图图像将会被保存下来。如果你想要对点云进行缩放操作以观察更多细节， 你可以在命令中指定 `--show-interval=0`。
+
+为了验证数据的一致性和数据增强的效果，你可以加上 `--aug` 来可视化数据增强后的数据，指令如下所示：
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/kitti-3d-3class.py --task det --aug --output-dir ${OUTPUT_DIR}
+```
+
+如果你想显示带有投影的 3D 边界框的 2D 图像，你需要一个支持多模态数据加载的配置文件，并将 `--task` 参数改为 `multi-modality_det`。示例如下：
+
+```shell
+python tools/misc/browse_dataset.py configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py --task multi-modality_det --output-dir ${OUTPUT_DIR}
+```
+
+![](../../../resources/browse_dataset_multi_modality.png)
+
+你可以使用不同的配置浏览不同的数据集，例如在 3D 语义分割任务中可视化 ScanNet 数据集：
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/scannet-seg.py --task lidar_seg --output-dir ${OUTPUT_DIR}
+```
+
+![](../../../resources/browse_dataset_seg.png)
+
+在单目 3D 检测任务中浏览 nuScenes 数据集：
+
+```shell
+python tools/misc/browse_dataset.py configs/_base_/datasets/nus-mono3d.py --task mono_det --output-dir ${OUTPUT_DIR}
+```
+
+![](../../../resources/browse_dataset_mono.png)
diff --git a/mmde/fix_weight.py b/mmde/fix_weight.py
new file mode 100644
index 0000000000000000000000000000000000000000..3006a51e9fb25e6535a17e4c074df13e312261bd
--- /dev/null
+++ b/mmde/fix_weight.py
@@ -0,0 +1,18 @@
+import torch
+
+# 指向你刚刚发给我的纯 LiDAR 官方权重
+ckpt_path = 'pth/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933.pth'
+ckpt = torch.load(ckpt_path, map_location='cpu')
+state_dict = ckpt['state_dict']
+
+fixed_count = 0
+for key in list(state_dict.keys()):
+    # 修复 3D 稀疏卷积维度 (16,3,3,3,16) -> (3,3,3,16,16)
+    if 'pts_middle_encoder' in key and state_dict[key].dim() == 5:
+        state_dict[key] = state_dict[key].permute(1, 2, 3, 4, 0).contiguous()
+        fixed_count += 1
+
+ckpt['state_dict'] = state_dict
+fixed_path = ckpt_path.replace('.pth', '_fixed.pth')
+torch.save(ckpt, fixed_path)
+print(f'✅ 纯 LiDAR 权重修复完成！已保存至 {fixed_path}，共处理 {fixed_count} 个层。')
\ No newline at end of file
diff --git a/mmde/mmcv-2.2.0+das.opt1.dtk2604.torch251-cp310-cp310-manylinux_2_28_x86_64.whl b/mmde/mmcv-2.2.0+das.opt1.dtk2604.torch251-cp310-cp310-manylinux_2_28_x86_64.whl
new file mode 100644
index 0000000000000000000000000000000000000000..fa0f01f02e20d2887c7389033c74e6b2bf508318
Binary files /dev/null and b/mmde/mmcv-2.2.0+das.opt1.dtk2604.torch251-cp310-cp310-manylinux_2_28_x86_64.whl differ
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/ade20k_instance.py b/mmde/mmdet/.mim/configs/_base_/datasets/ade20k_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f657aa67f34830515f410425eccc96cb065af4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/ade20k_instance.py
@@ -0,0 +1,53 @@
+# dataset settings
+dataset_type = 'ADE20KInstanceDataset'
+data_root = 'data/ADEChallengeData2016/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/ADEChallengeData2016/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(2560, 640), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='ade20k_instance_val.json',
+        data_prefix=dict(img='images/validation'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'ade20k_instance_val.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/ade20k_panoptic.py b/mmde/mmdet/.mim/configs/_base_/datasets/ade20k_panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..7be5ddd7f0732193f4f92bc49e52493602928162
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/ade20k_panoptic.py
@@ -0,0 +1,38 @@
+# dataset settings
+dataset_type = 'ADE20KPanopticDataset'
+data_root = 'data/ADEChallengeData2016/'
+
+backend_args = None
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(2560, 640), keep_ratio=True),
+    dict(type='LoadPanopticAnnotations', backend_args=backend_args),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=0,
+    persistent_workers=False,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='ade20k_panoptic_val.json',
+        data_prefix=dict(img='images/validation/', seg='ade20k_panoptic_val/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoPanopticMetric',
+    ann_file=data_root + 'ade20k_panoptic_val.json',
+    seg_prefix=data_root + 'ade20k_panoptic_val/',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/ade20k_semantic.py b/mmde/mmdet/.mim/configs/_base_/datasets/ade20k_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..522a775704182ededaa36f318cd1eb185784918f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/ade20k_semantic.py
@@ -0,0 +1,48 @@
+dataset_type = 'ADE20KSegDataset'
+data_root = 'data/ADEChallengeData2016/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/ADEChallengeData2016/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(2048, 512), keep_ratio=True),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=False,
+        with_mask=False,
+        with_seg=True,
+        reduce_zero_label=True),
+    dict(
+        type='PackDetInputs', meta_keys=('img_path', 'ori_shape', 'img_shape'))
+]
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            img_path='images/validation',
+            seg_map_path='annotations/validation'),
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='SemSegMetric', iou_metrics=['mIoU'])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/cityscapes_detection.py b/mmde/mmdet/.mim/configs/_base_/datasets/cityscapes_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..caeba6bfcd26d8954fc9d499446e93323e372959
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/cityscapes_detection.py
@@ -0,0 +1,84 @@
+# dataset settings
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/segmentation/cityscapes/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/segmentation/',
+#          'data/': 's3://openmmlab/datasets/segmentation/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=[(2048, 800), (2048, 1024)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(2048, 1024), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='RepeatDataset',
+        times=8,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instancesonly_filtered_gtFine_train.json',
+            data_prefix=dict(img='leftImg8bit/train/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instancesonly_filtered_gtFine_val.json',
+        data_prefix=dict(img='leftImg8bit/val/'),
+        test_mode=True,
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instancesonly_filtered_gtFine_val.json',
+    metric='bbox',
+    backend_args=backend_args)
+
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/cityscapes_instance.py b/mmde/mmdet/.mim/configs/_base_/datasets/cityscapes_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..136403136c67a6726662832b66f56701ff5aba8a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/cityscapes_instance.py
@@ -0,0 +1,113 @@
+# dataset settings
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/segmentation/cityscapes/'
+
+# Method 2: Use backend_args, file_client_args in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/segmentation/',
+#          'data/': 's3://openmmlab/datasets/segmentation/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomResize',
+        scale=[(2048, 800), (2048, 1024)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(2048, 1024), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='RepeatDataset',
+        times=8,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instancesonly_filtered_gtFine_train.json',
+            data_prefix=dict(img='leftImg8bit/train/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instancesonly_filtered_gtFine_val.json',
+        data_prefix=dict(img='leftImg8bit/val/'),
+        test_mode=True,
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+
+test_dataloader = val_dataloader
+
+val_evaluator = [
+    dict(
+        type='CocoMetric',
+        ann_file=data_root +
+        'annotations/instancesonly_filtered_gtFine_val.json',
+        metric=['bbox', 'segm'],
+        backend_args=backend_args),
+    dict(
+        type='CityScapesMetric',
+        seg_prefix=data_root + 'gtFine/val',
+        outfile_prefix='./work_dirs/cityscapes_metric/instance',
+        backend_args=backend_args)
+]
+
+test_evaluator = val_evaluator
+
+# inference on test dataset and
+# format the output results for submission.
+# test_dataloader = dict(
+#     batch_size=1,
+#     num_workers=2,
+#     persistent_workers=True,
+#     drop_last=False,
+#     sampler=dict(type='DefaultSampler', shuffle=False),
+#     dataset=dict(
+#         type=dataset_type,
+#         data_root=data_root,
+#         ann_file='annotations/instancesonly_filtered_gtFine_test.json',
+#         data_prefix=dict(img='leftImg8bit/test/'),
+#         test_mode=True,
+#         filter_cfg=dict(filter_empty_gt=True, min_size=32),
+#         pipeline=test_pipeline))
+# test_evaluator = dict(
+#         type='CityScapesMetric',
+#         format_only=True,
+#         outfile_prefix='./work_dirs/cityscapes_metric/test')
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/coco_caption.py b/mmde/mmdet/.mim/configs/_base_/datasets/coco_caption.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1bd898313927e4fca336dfa10f05e78b9fb7162
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/coco_caption.py
@@ -0,0 +1,60 @@
+# data settings
+
+dataset_type = 'CocoCaptionDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        imdecode_backend='pillow',
+        backend_args=backend_args),
+    dict(
+        type='Resize',
+        scale=(224, 224),
+        interpolation='bicubic',
+        backend='pillow'),
+    dict(type='PackInputs', meta_keys=['image_id']),
+]
+
+# ann_file download from
+# train dataset: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json # noqa
+# val dataset: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json # noqa
+# test dataset: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json # noqa
+# val evaluator: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json # noqa
+# test evaluator: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json # noqa
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/coco_karpathy_val.json',
+        pipeline=test_pipeline,
+    ))
+
+val_evaluator = dict(
+    type='COCOCaptionMetric',
+    ann_file=data_root + 'annotations/coco_karpathy_val_gt.json',
+)
+
+# # If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/coco_detection.py b/mmde/mmdet/.mim/configs/_base_/datasets/coco_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdf8dfad9476b1d7b7a4e8c3e2832f115a1ea7f2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/coco_detection.py
@@ -0,0 +1,95 @@
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# inference on test dataset and
+# format the output results for submission.
+# test_dataloader = dict(
+#     batch_size=1,
+#     num_workers=2,
+#     persistent_workers=True,
+#     drop_last=False,
+#     sampler=dict(type='DefaultSampler', shuffle=False),
+#     dataset=dict(
+#         type=dataset_type,
+#         data_root=data_root,
+#         ann_file=data_root + 'annotations/image_info_test-dev2017.json',
+#         data_prefix=dict(img='test2017/'),
+#         test_mode=True,
+#         pipeline=test_pipeline))
+# test_evaluator = dict(
+#     type='CocoMetric',
+#     metric='bbox',
+#     format_only=True,
+#     ann_file=data_root + 'annotations/image_info_test-dev2017.json',
+#     outfile_prefix='./work_dirs/coco_detection/test')
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/coco_instance.py b/mmde/mmdet/.mim/configs/_base_/datasets/coco_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..e91cb354038db4df3b990b307a5da9d77f341a88
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/coco_instance.py
@@ -0,0 +1,95 @@
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# inference on test dataset and
+# format the output results for submission.
+# test_dataloader = dict(
+#     batch_size=1,
+#     num_workers=2,
+#     persistent_workers=True,
+#     drop_last=False,
+#     sampler=dict(type='DefaultSampler', shuffle=False),
+#     dataset=dict(
+#         type=dataset_type,
+#         data_root=data_root,
+#         ann_file=data_root + 'annotations/image_info_test-dev2017.json',
+#         data_prefix=dict(img='test2017/'),
+#         test_mode=True,
+#         pipeline=test_pipeline))
+# test_evaluator = dict(
+#     type='CocoMetric',
+#     metric=['bbox', 'segm'],
+#     format_only=True,
+#     ann_file=data_root + 'annotations/image_info_test-dev2017.json',
+#     outfile_prefix='./work_dirs/coco_instance/test')
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/coco_instance_semantic.py b/mmde/mmdet/.mim/configs/_base_/datasets/coco_instance_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc961863306690c056e564b542d518c0ebfbb7e2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/coco_instance_semantic.py
@@ -0,0 +1,78 @@
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(
+        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/', seg='stuffthingmaps/train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/coco_panoptic.py b/mmde/mmdet/.mim/configs/_base_/datasets/coco_panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b95b619e68ed531d361bbd11a2382852c13446e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/coco_panoptic.py
@@ -0,0 +1,94 @@
+# dataset settings
+dataset_type = 'CocoPanopticDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadPanopticAnnotations', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadPanopticAnnotations', backend_args=backend_args),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/panoptic_train2017.json',
+        data_prefix=dict(
+            img='train2017/', seg='annotations/panoptic_train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/panoptic_val2017.json',
+        data_prefix=dict(img='val2017/', seg='annotations/panoptic_val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoPanopticMetric',
+    ann_file=data_root + 'annotations/panoptic_val2017.json',
+    seg_prefix=data_root + 'annotations/panoptic_val2017/',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# inference on test dataset and
+# format the output results for submission.
+# test_dataloader = dict(
+#     batch_size=1,
+#     num_workers=1,
+#     persistent_workers=True,
+#     drop_last=False,
+#     sampler=dict(type='DefaultSampler', shuffle=False),
+#     dataset=dict(
+#         type=dataset_type,
+#         data_root=data_root,
+#         ann_file='annotations/panoptic_image_info_test-dev2017.json',
+#         data_prefix=dict(img='test2017/'),
+#         test_mode=True,
+#         pipeline=test_pipeline))
+# test_evaluator = dict(
+#     type='CocoPanopticMetric',
+#     format_only=True,
+#     ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json',
+#     outfile_prefix='./work_dirs/coco_panoptic/test')
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/coco_semantic.py b/mmde/mmdet/.mim/configs/_base_/datasets/coco_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..944bbbaeaeb6f10f0946bd1fc828bb01ea6c1fc3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/coco_semantic.py
@@ -0,0 +1,78 @@
+# dataset settings
+dataset_type = 'CocoSegDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=False,
+        with_label=False,
+        with_seg=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=False,
+        with_label=False,
+        with_seg=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'))
+]
+
+# For stuffthingmaps_semseg, please refer to
+# `docs/en/user_guides/dataset_prepare.md`
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            img_path='train2017/',
+            seg_map_path='stuffthingmaps_semseg/train2017/'),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(
+            img_path='val2017/',
+            seg_map_path='stuffthingmaps_semseg/val2017/'),
+        pipeline=test_pipeline))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='SemSegMetric', iou_metrics=['mIoU'])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/deepfashion.py b/mmde/mmdet/.mim/configs/_base_/datasets/deepfashion.py
new file mode 100644
index 0000000000000000000000000000000000000000..a93dc7152f7a2e28ab726c79f9398a1034b7b4a1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/deepfashion.py
@@ -0,0 +1,95 @@
+# dataset settings
+dataset_type = 'DeepFashionDataset'
+data_root = 'data/DeepFashion/In-shop/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', scale=(750, 1101), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(750, 1101), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='Anno/segmentation/DeepFashion_segmentation_train.json',
+            data_prefix=dict(img='Img/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='Anno/segmentation/DeepFashion_segmentation_query.json',
+        data_prefix=dict(img='Img/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='Anno/segmentation/DeepFashion_segmentation_gallery.json',
+        data_prefix=dict(img='Img/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root +
+    'Anno/segmentation/DeepFashion_segmentation_query.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root +
+    'Anno/segmentation/DeepFashion_segmentation_gallery.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args=backend_args)
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/dsdl.py b/mmde/mmdet/.mim/configs/_base_/datasets/dsdl.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f19e5e498b18a404f3c4e6419316b5f9981e811
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/dsdl.py
@@ -0,0 +1,62 @@
+dataset_type = 'DSDLDetDataset'
+data_root = 'path to dataset folder'
+train_ann = 'path to train yaml file'
+val_ann = 'path to val yaml file'
+
+backend_args = None
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': "s3://open_data/",
+#         'data/': "s3://open_data/"
+#     }))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'instances'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=train_ann,
+        filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=val_ann,
+        test_mode=True,
+        pipeline=test_pipeline))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='CocoMetric', metric='bbox')
+# val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/isaid_instance.py b/mmde/mmdet/.mim/configs/_base_/datasets/isaid_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..09ddcab02bdd52374d5093d446abb0e34751f7a3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/isaid_instance.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'iSAIDDataset'
+data_root = 'data/iSAID/'
+backend_args = None
+
+# Please see  `projects/iSAID/README.md` for data preparation
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', scale=(800, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(800, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='train/instancesonly_filtered_train.json',
+        data_prefix=dict(img='train/images/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='val/instancesonly_filtered_val.json',
+        data_prefix=dict(img='val/images/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'val/instancesonly_filtered_val.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/lvis_v0.5_instance.py b/mmde/mmdet/.mim/configs/_base_/datasets/lvis_v0.5_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0ca44efb6d31aae5f6426a1c8b89d2e9be2104f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/lvis_v0.5_instance.py
@@ -0,0 +1,79 @@
+# dataset settings
+dataset_type = 'LVISV05Dataset'
+data_root = 'data/lvis_v0.5/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/lvis_v0.5/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/lvis_v0.5_train.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/lvis_v0.5_val.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='LVISMetric',
+    ann_file=data_root + 'annotations/lvis_v0.5_val.json',
+    metric=['bbox', 'segm'],
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/lvis_v1_instance.py b/mmde/mmdet/.mim/configs/_base_/datasets/lvis_v1_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..0413f370a2b635362a60c20881769064bac9a603
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/lvis_v1_instance.py
@@ -0,0 +1,22 @@
+# dataset settings
+_base_ = 'lvis_v0.5_instance.py'
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/lvis_v1/'
+
+train_dataloader = dict(
+    dataset=dict(
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/lvis_v1_train.json',
+            data_prefix=dict(img=''))))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/lvis_v1_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(ann_file=data_root + 'annotations/lvis_v1_val.json')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/mot_challenge.py b/mmde/mmdet/.mim/configs/_base_/datasets/mot_challenge.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce2828ef70a34c123792d252bf992f423049d065
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/mot_challenge.py
@@ -0,0 +1,90 @@
+# dataset settings
+dataset_type = 'MOTChallengeDataset'
+data_root = 'data/MOT17/'
+img_scale = (1088, 1088)
+
+backend_args = None
+# data pipeline
+train_pipeline = [
+    dict(
+        type='UniformRefFrameSample',
+        num_ref_imgs=1,
+        frame_range=10,
+        filter_key_img=True),
+    dict(
+        type='TransformBroadcaster',
+        share_random_params=True,
+        transforms=[
+            dict(type='LoadImageFromFile', backend_args=backend_args),
+            dict(type='LoadTrackAnnotations'),
+            dict(
+                type='RandomResize',
+                scale=img_scale,
+                ratio_range=(0.8, 1.2),
+                keep_ratio=True,
+                clip_object_border=False),
+            dict(type='PhotoMetricDistortion')
+        ]),
+    dict(
+        type='TransformBroadcaster',
+        # different cropped positions for different frames
+        share_random_params=False,
+        transforms=[
+            dict(
+                type='RandomCrop', crop_size=img_scale, bbox_clip_border=False)
+        ]),
+    dict(
+        type='TransformBroadcaster',
+        share_random_params=True,
+        transforms=[
+            dict(type='RandomFlip', prob=0.5),
+        ]),
+    dict(type='PackTrackInputs')
+]
+
+test_pipeline = [
+    dict(
+        type='TransformBroadcaster',
+        transforms=[
+            dict(type='LoadImageFromFile', backend_args=backend_args),
+            dict(type='Resize', scale=img_scale, keep_ratio=True),
+            dict(type='LoadTrackAnnotations')
+        ]),
+    dict(type='PackTrackInputs')
+]
+
+# dataloader
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='TrackImgSampler'),  # image-based sampling
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        visibility_thr=-1,
+        ann_file='annotations/half-train_cocoformat.json',
+        data_prefix=dict(img_path='train'),
+        metainfo=dict(classes=('pedestrian', )),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    # Now we support two ways to test, image_based and video_based
+    # if you want to use video_based sampling, you can use as follows
+    # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    sampler=dict(type='TrackImgSampler'),  # image-based sampling
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/half-val_cocoformat.json',
+        data_prefix=dict(img_path='train'),
+        test_mode=True,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# evaluator
+val_evaluator = dict(
+    type='MOTChallengeMetric', metric=['HOTA', 'CLEAR', 'Identity'])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/mot_challenge_det.py b/mmde/mmdet/.mim/configs/_base_/datasets/mot_challenge_det.py
new file mode 100644
index 0000000000000000000000000000000000000000..a988572c3837eb2a8a6bf7b9eca06f3d82abdfda
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/mot_challenge_det.py
@@ -0,0 +1,66 @@
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/MOT17/'
+
+backend_args = None
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args, to_float32=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=(1088, 1088),
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True,
+        clip_object_border=False),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='RandomCrop', crop_size=(1088, 1088), bbox_clip_border=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1088, 1088), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/half-train_cocoformat.json',
+        data_prefix=dict(img='train/'),
+        metainfo=dict(classes=('pedestrian', )),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/half-val_cocoformat.json',
+        data_prefix=dict(img='train/'),
+        metainfo=dict(classes=('pedestrian', )),
+        test_mode=True,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/half-val_cocoformat.json',
+    metric='bbox',
+    format_only=False)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/mot_challenge_reid.py b/mmde/mmdet/.mim/configs/_base_/datasets/mot_challenge_reid.py
new file mode 100644
index 0000000000000000000000000000000000000000..57a95b531f3591e60daaabc5eea6f11c7424215b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/mot_challenge_reid.py
@@ -0,0 +1,61 @@
+# dataset settings
+dataset_type = 'ReIDDataset'
+data_root = 'data/MOT17/'
+
+backend_args = None
+# data pipeline
+train_pipeline = [
+    dict(
+        type='TransformBroadcaster',
+        share_random_params=False,
+        transforms=[
+            dict(
+                type='LoadImageFromFile',
+                backend_args=backend_args,
+                to_float32=True),
+            dict(
+                type='Resize',
+                scale=(128, 256),
+                keep_ratio=False,
+                clip_object_border=False),
+            dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+        ]),
+    dict(type='PackReIDInputs', meta_keys=('flip', 'flip_direction'))
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args, to_float32=True),
+    dict(type='Resize', scale=(128, 256), keep_ratio=False),
+    dict(type='PackReIDInputs')
+]
+
+# dataloader
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        triplet_sampler=dict(num_ids=8, ins_per_id=4),
+        data_prefix=dict(img_path='reid/imgs'),
+        ann_file='reid/meta/train_80.txt',
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        triplet_sampler=None,
+        data_prefix=dict(img_path='reid/imgs'),
+        ann_file='reid/meta/val_20.txt',
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# evaluator
+val_evaluator = dict(type='ReIDMetrics', metric=['mAP', 'CMC'])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/objects365v1_detection.py b/mmde/mmdet/.mim/configs/_base_/datasets/objects365v1_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee398698608543e13188452a816283e9a2563390
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/objects365v1_detection.py
@@ -0,0 +1,74 @@
+# dataset settings
+dataset_type = 'Objects365V1Dataset'
+data_root = 'data/Objects365/Obj365_v1/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/objects365_train.json',
+        data_prefix=dict(img='train/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/objects365_val.json',
+        data_prefix=dict(img='val/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/objects365_val.json',
+    metric='bbox',
+    sort_categories=True,
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/objects365v2_detection.py b/mmde/mmdet/.mim/configs/_base_/datasets/objects365v2_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..b25a7ba901befa8d61e3cdae8a7c68fb8a9c5aef
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/objects365v2_detection.py
@@ -0,0 +1,73 @@
+# dataset settings
+dataset_type = 'Objects365V2Dataset'
+data_root = 'data/Objects365/Obj365_v2/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/zhiyuan_objv2_train.json',
+        data_prefix=dict(img='train/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/zhiyuan_objv2_val.json',
+        data_prefix=dict(img='val/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/zhiyuan_objv2_val.json',
+    metric='bbox',
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/openimages_detection.py b/mmde/mmdet/.mim/configs/_base_/datasets/openimages_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..129661b405c70d3e2d0d2c4741e3a59333dd960c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/openimages_detection.py
@@ -0,0 +1,81 @@
+# dataset settings
+dataset_type = 'OpenImagesDataset'
+data_root = 'data/OpenImages/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1024, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1024, 800), keep_ratio=True),
+    # avoid bboxes being resized
+    dict(type='LoadAnnotations', with_bbox=True),
+    # TODO: find a better way to collect image_level_labels
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'instances', 'image_level_labels'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=0,  # workers_per_gpu > 0 may occur out of memory
+    persistent_workers=False,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/oidv6-train-annotations-bbox.csv',
+        data_prefix=dict(img='OpenImages/train/'),
+        label_file='annotations/class-descriptions-boxable.csv',
+        hierarchy_file='annotations/bbox_labels_600_hierarchy.json',
+        meta_file='annotations/train-image-metas.pkl',
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=0,
+    persistent_workers=False,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/validation-annotations-bbox.csv',
+        data_prefix=dict(img='OpenImages/validation/'),
+        label_file='annotations/class-descriptions-boxable.csv',
+        hierarchy_file='annotations/bbox_labels_600_hierarchy.json',
+        meta_file='annotations/validation-image-metas.pkl',
+        image_level_ann_file='annotations/validation-'
+        'annotations-human-imagelabels-boxable.csv',
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='OpenImagesMetric',
+    iou_thrs=0.5,
+    ioa_thrs=0.5,
+    use_group_of=True,
+    get_supercategory=True)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/refcoco+.py b/mmde/mmdet/.mim/configs/_base_/datasets/refcoco+.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae0278ddf6c30fda6e4fb42aed1cb1b9a55109ec
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/refcoco+.py
@@ -0,0 +1,55 @@
+# dataset settings
+dataset_type = 'RefCocoDataset'
+data_root = 'data/coco/'
+
+backend_args = None
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(
+        type='LoadAnnotations',
+        with_mask=True,
+        with_bbox=False,
+        with_seg=False,
+        with_label=False),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'gt_masks', 'text'))
+]
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(img_path='train2014/'),
+        ann_file='refcoco+/instances.json',
+        split_file='refcoco+/refs(unc).p',
+        split='val',
+        text_mode='select_first',
+        pipeline=test_pipeline))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(img_path='train2014/'),
+        ann_file='refcoco+/instances.json',
+        split_file='refcoco+/refs(unc).p',
+        split='testA',  # or 'testB'
+        text_mode='select_first',
+        pipeline=test_pipeline))
+
+val_evaluator = dict(type='RefSegMetric', metric=['cIoU', 'mIoU'])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/refcoco.py b/mmde/mmdet/.mim/configs/_base_/datasets/refcoco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b6caefa9a4bbfabdb49689588821f99d882a80f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/refcoco.py
@@ -0,0 +1,55 @@
+# dataset settings
+dataset_type = 'RefCocoDataset'
+data_root = 'data/coco/'
+
+backend_args = None
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(
+        type='LoadAnnotations',
+        with_mask=True,
+        with_bbox=False,
+        with_seg=False,
+        with_label=False),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'gt_masks', 'text'))
+]
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(img_path='train2014/'),
+        ann_file='refcoco/instances.json',
+        split_file='refcoco/refs(unc).p',
+        split='val',
+        text_mode='select_first',
+        pipeline=test_pipeline))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(img_path='train2014/'),
+        ann_file='refcoco/instances.json',
+        split_file='refcoco/refs(unc).p',
+        split='testA',  # or 'testB'
+        text_mode='select_first',
+        pipeline=test_pipeline))
+
+val_evaluator = dict(type='RefSegMetric', metric=['cIoU', 'mIoU'])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/refcocog.py b/mmde/mmdet/.mim/configs/_base_/datasets/refcocog.py
new file mode 100644
index 0000000000000000000000000000000000000000..19dbeef1cde79fcb2aa80bb9936a60cc30089963
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/refcocog.py
@@ -0,0 +1,55 @@
+# dataset settings
+dataset_type = 'RefCocoDataset'
+data_root = 'data/coco/'
+
+backend_args = None
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(
+        type='LoadAnnotations',
+        with_mask=True,
+        with_bbox=False,
+        with_seg=False,
+        with_label=False),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'gt_masks', 'text'))
+]
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(img_path='train2014/'),
+        ann_file='refcocog/instances.json',
+        split_file='refcocog/refs(umd).p',
+        split='val',
+        text_mode='select_first',
+        pipeline=test_pipeline))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(img_path='train2014/'),
+        ann_file='refcocog/instances.json',
+        split_file='refcocog/refs(umd).p',
+        split='test',
+        text_mode='select_first',
+        pipeline=test_pipeline))
+
+val_evaluator = dict(type='RefSegMetric', metric=['cIoU', 'mIoU'])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/semi_coco_detection.py b/mmde/mmdet/.mim/configs/_base_/datasets/semi_coco_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..694f25f841e06dbb59a699dfe13c18e34dbdce9f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/semi_coco_detection.py
@@ -0,0 +1,178 @@
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+color_space = [
+    [dict(type='ColorTransform')],
+    [dict(type='AutoContrast')],
+    [dict(type='Equalize')],
+    [dict(type='Sharpness')],
+    [dict(type='Posterize')],
+    [dict(type='Solarize')],
+    [dict(type='Color')],
+    [dict(type='Contrast')],
+    [dict(type='Brightness')],
+]
+
+geometric = [
+    [dict(type='Rotate')],
+    [dict(type='ShearX')],
+    [dict(type='ShearY')],
+    [dict(type='TranslateX')],
+    [dict(type='TranslateY')],
+]
+
+scale = [(1333, 400), (1333, 1200)]
+
+branch_field = ['sup', 'unsup_teacher', 'unsup_student']
+# pipeline used to augment labeled data,
+# which will be sent to student model for supervised training.
+sup_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomResize', scale=scale, keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='RandAugment', aug_space=color_space, aug_num=1),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='MultiBranch',
+        branch_field=branch_field,
+        sup=dict(type='PackDetInputs'))
+]
+
+# pipeline used to augment unlabeled data weakly,
+# which will be sent to teacher model for predicting pseudo instances.
+weak_pipeline = [
+    dict(type='RandomResize', scale=scale, keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction',
+                   'homography_matrix')),
+]
+
+# pipeline used to augment unlabeled data strongly,
+# which will be sent to student model for unsupervised training.
+strong_pipeline = [
+    dict(type='RandomResize', scale=scale, keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomOrder',
+        transforms=[
+            dict(type='RandAugment', aug_space=color_space, aug_num=1),
+            dict(type='RandAugment', aug_space=geometric, aug_num=1),
+        ]),
+    dict(type='RandomErasing', n_patches=(1, 5), ratio=(0, 0.2)),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction',
+                   'homography_matrix')),
+]
+
+# pipeline used to augment unlabeled data into different views
+unsup_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadEmptyAnnotations'),
+    dict(
+        type='MultiBranch',
+        branch_field=branch_field,
+        unsup_teacher=weak_pipeline,
+        unsup_student=strong_pipeline,
+    )
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+batch_size = 5
+num_workers = 5
+# There are two common semi-supervised learning settings on the coco dataset：
+# (1) Divide the train2017 into labeled and unlabeled datasets
+# by a fixed percentage, such as 1%, 2%, 5% and 10%.
+# The format of labeled_ann_file and unlabeled_ann_file are
+# instances_train2017.{fold}@{percent}.json, and
+# instances_train2017.{fold}@{percent}-unlabeled.json
+# `fold` is used for cross-validation, and `percent` represents
+# the proportion of labeled data in the train2017.
+# (2) Choose the train2017 as the labeled dataset
+# and unlabeled2017 as the unlabeled dataset.
+# The labeled_ann_file and unlabeled_ann_file are
+# instances_train2017.json and image_info_unlabeled2017.json
+# We use this configuration by default.
+labeled_dataset = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='annotations/instances_train2017.json',
+    data_prefix=dict(img='train2017/'),
+    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+    pipeline=sup_pipeline,
+    backend_args=backend_args)
+
+unlabeled_dataset = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='annotations/instances_unlabeled2017.json',
+    data_prefix=dict(img='unlabeled2017/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=unsup_pipeline,
+    backend_args=backend_args)
+
+train_dataloader = dict(
+    batch_size=batch_size,
+    num_workers=num_workers,
+    persistent_workers=True,
+    sampler=dict(
+        type='GroupMultiSourceSampler',
+        batch_size=batch_size,
+        source_ratio=[1, 4]),
+    dataset=dict(
+        type='ConcatDataset', datasets=[labeled_dataset, unlabeled_dataset]))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/v3det.py b/mmde/mmdet/.mim/configs/_base_/datasets/v3det.py
new file mode 100644
index 0000000000000000000000000000000000000000..38ccbf864b6248192dfbf4abaf4858b5f93d45e8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/v3det.py
@@ -0,0 +1,69 @@
+# dataset settings
+dataset_type = 'V3DetDataset'
+data_root = 'data/V3Det/'
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/v3det_2023_v1_train.json',
+            data_prefix=dict(img=''),
+            filter_cfg=dict(filter_empty_gt=True, min_size=4),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/v3det_2023_v1_val.json',
+        data_prefix=dict(img=''),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/v3det_2023_v1_val.json',
+    metric='bbox',
+    format_only=False,
+    backend_args=backend_args,
+    use_mp_eval=True,
+    proposal_nums=[300])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/voc0712.py b/mmde/mmdet/.mim/configs/_base_/datasets/voc0712.py
new file mode 100644
index 0000000000000000000000000000000000000000..47f5e6563b7f47dd6cfec02248d4c8decd32afe4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/voc0712.py
@@ -0,0 +1,92 @@
+# dataset settings
+dataset_type = 'VOCDataset'
+data_root = 'data/VOCdevkit/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically Infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/segmentation/VOCdevkit/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/segmentation/',
+#         'data/': 's3://openmmlab/datasets/segmentation/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1000, 600), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1000, 600), keep_ratio=True),
+    # avoid bboxes being resized
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type='ConcatDataset',
+            # VOCDataset will add different `dataset_type` in dataset.metainfo,
+            # which will get error if using ConcatDataset. Adding
+            # `ignore_keys` can avoid this error.
+            ignore_keys=['dataset_type'],
+            datasets=[
+                dict(
+                    type=dataset_type,
+                    data_root=data_root,
+                    ann_file='VOC2007/ImageSets/Main/trainval.txt',
+                    data_prefix=dict(sub_data_root='VOC2007/'),
+                    filter_cfg=dict(
+                        filter_empty_gt=True, min_size=32, bbox_min_size=32),
+                    pipeline=train_pipeline,
+                    backend_args=backend_args),
+                dict(
+                    type=dataset_type,
+                    data_root=data_root,
+                    ann_file='VOC2012/ImageSets/Main/trainval.txt',
+                    data_prefix=dict(sub_data_root='VOC2012/'),
+                    filter_cfg=dict(
+                        filter_empty_gt=True, min_size=32, bbox_min_size=32),
+                    pipeline=train_pipeline,
+                    backend_args=backend_args)
+            ])))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='VOC2007/ImageSets/Main/test.txt',
+        data_prefix=dict(sub_data_root='VOC2007/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+# Pascal VOC2007 uses `11points` as default evaluate mode, while PASCAL
+# VOC2012 defaults to use 'area'.
+val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/wider_face.py b/mmde/mmdet/.mim/configs/_base_/datasets/wider_face.py
new file mode 100644
index 0000000000000000000000000000000000000000..7042bc46e877ed899969730325143307e15adf64
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/wider_face.py
@@ -0,0 +1,73 @@
+# dataset settings
+dataset_type = 'WIDERFaceDataset'
+data_root = 'data/WIDERFace/'
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/cityscapes/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#          'data/': 's3://openmmlab/datasets/detection/'
+#      }))
+backend_args = None
+
+img_scale = (640, 640)  # VGA resolution
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=img_scale, keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=img_scale, keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='train.txt',
+        data_prefix=dict(img='WIDER_train'),
+        filter_cfg=dict(filter_empty_gt=True, bbox_min_size=17, min_size=32),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='val.txt',
+        data_prefix=dict(img='WIDER_val'),
+        test_mode=True,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    # TODO: support WiderFace-Evaluation for easy, medium, hard cases
+    type='VOCMetric',
+    metric='mAP',
+    eval_mode='11points')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/_base_/datasets/youtube_vis.py b/mmde/mmdet/.mim/configs/_base_/datasets/youtube_vis.py
new file mode 100644
index 0000000000000000000000000000000000000000..ece07cc3879e512082e302c2e3f76108c57a0234
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/datasets/youtube_vis.py
@@ -0,0 +1,66 @@
+dataset_type = 'YouTubeVISDataset'
+data_root = 'data/youtube_vis_2019/'
+dataset_version = data_root[-5:-1]  # 2019 or 2021
+
+backend_args = None
+
+# dataset settings
+train_pipeline = [
+    dict(
+        type='UniformRefFrameSample',
+        num_ref_imgs=1,
+        frame_range=100,
+        filter_key_img=True),
+    dict(
+        type='TransformBroadcaster',
+        share_random_params=True,
+        transforms=[
+            dict(type='LoadImageFromFile', backend_args=backend_args),
+            dict(type='LoadTrackAnnotations', with_mask=True),
+            dict(type='Resize', scale=(640, 360), keep_ratio=True),
+            dict(type='RandomFlip', prob=0.5),
+        ]),
+    dict(type='PackTrackInputs')
+]
+
+test_pipeline = [
+    dict(
+        type='TransformBroadcaster',
+        transforms=[
+            dict(type='LoadImageFromFile', backend_args=backend_args),
+            dict(type='Resize', scale=(640, 360), keep_ratio=True),
+            dict(type='LoadTrackAnnotations', with_mask=True),
+        ]),
+    dict(type='PackTrackInputs')
+]
+
+# dataloader
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    # sampler=dict(type='TrackImgSampler'),  # image-based sampling
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='TrackAspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2019_train.json',
+        data_prefix=dict(img_path='train/JPEGImages'),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2019_valid.json',
+        data_prefix=dict(img_path='valid/JPEGImages'),
+        test_mode=True,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/_base_/default_runtime.py b/mmde/mmdet/.mim/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..870e5614c86d7e1bbdad13d77a0db03a46ce717a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/default_runtime.py
@@ -0,0 +1,24 @@
+default_scope = 'mmdet'
+
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='DetVisualizationHook'))
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
diff --git a/mmde/mmdet/.mim/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py b/mmde/mmdet/.mim/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5167f7a02e66c80bd8ec8cc7572acb22eaadba5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py
@@ -0,0 +1,203 @@
+# model settings
+model = dict(
+    type='CascadeRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='CascadeRoIHead',
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=[
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/cascade-rcnn_r50_fpn.py b/mmde/mmdet/.mim/configs/_base_/models/cascade-rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..50c57f01ca3a6ea827f71801b0c233af268914f9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/cascade-rcnn_r50_fpn.py
@@ -0,0 +1,185 @@
+# model settings
+model = dict(
+    type='CascadeRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='CascadeRoIHead',
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ]),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=[
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/fast-rcnn_r50_fpn.py b/mmde/mmdet/.mim/configs/_base_/models/fast-rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bd45e9266b01df302b78e50258fa1572144cb21
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/fast-rcnn_r50_fpn.py
@@ -0,0 +1,68 @@
+# model settings
+model = dict(
+    type='FastRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/faster-rcnn_r50-caffe-c4.py b/mmde/mmdet/.mim/configs/_base_/models/faster-rcnn_r50-caffe-c4.py
new file mode 100644
index 0000000000000000000000000000000000000000..15d2db72e48790505c2a1e4e7d184c1803f7ab31
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/faster-rcnn_r50-caffe-c4.py
@@ -0,0 +1,123 @@
+# model settings
+norm_cfg = dict(type='BN', requires_grad=False)
+model = dict(
+    type='FasterRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=3,
+        strides=(1, 2, 2),
+        dilations=(1, 1, 1),
+        out_indices=(2, ),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=1024,
+        feat_channels=1024,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[2, 4, 8, 16, 32],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[16]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        shared_head=dict(
+            type='ResLayer',
+            depth=50,
+            stage=3,
+            stride=2,
+            dilation=1,
+            style='caffe',
+            norm_cfg=norm_cfg,
+            norm_eval=True,
+            init_cfg=dict(
+                type='Pretrained',
+                checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=1024,
+            featmap_strides=[16]),
+        bbox_head=dict(
+            type='BBoxHead',
+            with_avg_pool=True,
+            roi_feat_size=7,
+            in_channels=2048,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=12000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=6000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/faster-rcnn_r50-caffe-dc5.py b/mmde/mmdet/.mim/configs/_base_/models/faster-rcnn_r50-caffe-dc5.py
new file mode 100644
index 0000000000000000000000000000000000000000..189915e3d9ce7239493da6465931f91e2d9d664f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/faster-rcnn_r50-caffe-dc5.py
@@ -0,0 +1,111 @@
+# model settings
+norm_cfg = dict(type='BN', requires_grad=False)
+model = dict(
+    type='FasterRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        strides=(1, 2, 2, 1),
+        dilations=(1, 1, 1, 2),
+        out_indices=(3, ),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=2048,
+        feat_channels=2048,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[2, 4, 8, 16, 32],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[16]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=2048,
+            featmap_strides=[16]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=2048,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=12000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms=dict(type='nms', iou_threshold=0.7),
+            nms_pre=6000,
+            max_per_img=1000,
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/faster-rcnn_r50_fpn.py b/mmde/mmdet/.mim/configs/_base_/models/faster-rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..31aa1461799a988a11adb901306a063fd3f0b951
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/faster-rcnn_r50_fpn.py
@@ -0,0 +1,114 @@
+# model settings
+model = dict(
+    type='FasterRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)
+        # soft-nms is also supported for rcnn testing
+        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+    ))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/mask-rcnn_r50-caffe-c4.py b/mmde/mmdet/.mim/configs/_base_/models/mask-rcnn_r50-caffe-c4.py
new file mode 100644
index 0000000000000000000000000000000000000000..de1131b24893ae24bd99923895fd844837c9b46d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/mask-rcnn_r50-caffe-c4.py
@@ -0,0 +1,132 @@
+# model settings
+norm_cfg = dict(type='BN', requires_grad=False)
+model = dict(
+    type='MaskRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_mask=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=3,
+        strides=(1, 2, 2),
+        dilations=(1, 1, 1),
+        out_indices=(2, ),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=1024,
+        feat_channels=1024,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[2, 4, 8, 16, 32],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[16]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        shared_head=dict(
+            type='ResLayer',
+            depth=50,
+            stage=3,
+            stride=2,
+            dilation=1,
+            style='caffe',
+            norm_cfg=norm_cfg,
+            norm_eval=True),
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=1024,
+            featmap_strides=[16]),
+        bbox_head=dict(
+            type='BBoxHead',
+            with_avg_pool=True,
+            roi_feat_size=7,
+            in_channels=2048,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+        mask_roi_extractor=None,
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=0,
+            in_channels=2048,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=12000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            mask_size=14,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=6000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            max_per_img=1000,
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/mask-rcnn_r50_fpn.py b/mmde/mmdet/.mim/configs/_base_/models/mask-rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4ff7a49d0a2f0abd4823ef89ad957d9708085e7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/mask-rcnn_r50_fpn.py
@@ -0,0 +1,127 @@
+# model settings
+model = dict(
+    type='MaskRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            mask_size=28,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/retinanet_r50_fpn.py b/mmde/mmdet/.mim/configs/_base_/models/retinanet_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..53662c9f1390af22b15c5591e122b0aa0b2d6c92
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/retinanet_r50_fpn.py
@@ -0,0 +1,68 @@
+# model settings
+model = dict(
+    type='RetinaNet',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_input',
+        num_outs=5),
+    bbox_head=dict(
+        type='RetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        sampler=dict(
+            type='PseudoSampler'),  # Focal loss should use PseudoSampler
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.5),
+        max_per_img=100))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/rpn_r50-caffe-c4.py b/mmde/mmdet/.mim/configs/_base_/models/rpn_r50-caffe-c4.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed1dbe746d432d96d70e7dc9048c9e1b1727c938
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/rpn_r50-caffe-c4.py
@@ -0,0 +1,64 @@
+# model settings
+model = dict(
+    type='RPN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=3,
+        strides=(1, 2, 2),
+        dilations=(1, 1, 1),
+        out_indices=(2, ),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=None,
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=1024,
+        feat_channels=1024,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[2, 4, 8, 16, 32],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[16]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=12000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/rpn_r50_fpn.py b/mmde/mmdet/.mim/configs/_base_/models/rpn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc4790434a368d0728d74dcd7ba79e665aae276
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/rpn_r50_fpn.py
@@ -0,0 +1,64 @@
+# model settings
+model = dict(
+    type='RPN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/mmde/mmdet/.mim/configs/_base_/models/ssd300.py b/mmde/mmdet/.mim/configs/_base_/models/ssd300.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd113c7cbc41494eabb6a56061f8a90343ac9efd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/models/ssd300.py
@@ -0,0 +1,63 @@
+# model settings
+input_size = 300
+model = dict(
+    type='SingleStageDetector',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[1, 1, 1],
+        bgr_to_rgb=True,
+        pad_size_divisor=1),
+    backbone=dict(
+        type='SSDVGG',
+        depth=16,
+        with_last_pool=False,
+        ceil_mode=True,
+        out_indices=(3, 4),
+        out_feature_indices=(22, 34),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')),
+    neck=dict(
+        type='SSDNeck',
+        in_channels=(512, 1024),
+        out_channels=(512, 1024, 512, 256, 256, 256),
+        level_strides=(2, 2, 1, 1),
+        level_paddings=(1, 1, 0, 0),
+        l2_norm_scale=20),
+    bbox_head=dict(
+        type='SSDHead',
+        in_channels=(512, 1024, 512, 256, 256, 256),
+        num_classes=80,
+        anchor_generator=dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            input_size=input_size,
+            basesize_ratio_range=(0.15, 0.9),
+            strides=[8, 16, 32, 64, 100, 300],
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2])),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0.,
+            ignore_iof_thr=-1,
+            gt_max_assign_all=False),
+        sampler=dict(type='PseudoSampler'),
+        smoothl1_beta=1.,
+        allowed_border=-1,
+        pos_weight=-1,
+        neg_pos_ratio=3,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        nms=dict(type='nms', iou_threshold=0.45),
+        min_bbox_size=0,
+        score_thr=0.02,
+        max_per_img=200))
+cudnn_benchmark = True
diff --git a/mmde/mmdet/.mim/configs/_base_/schedules/schedule_1x.py b/mmde/mmdet/.mim/configs/_base_/schedules/schedule_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..95f30be74ff37080ba0d227d55bbd587feeaa892
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/schedules/schedule_1x.py
@@ -0,0 +1,28 @@
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/_base_/schedules/schedule_20e.py b/mmde/mmdet/.mim/configs/_base_/schedules/schedule_20e.py
new file mode 100644
index 0000000000000000000000000000000000000000..75f958b0ed11d77ae3aebff6b7a5d8cb80797d9f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/schedules/schedule_20e.py
@@ -0,0 +1,28 @@
+# training schedule for 20e
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=20, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=20,
+        by_epoch=True,
+        milestones=[16, 19],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/_base_/schedules/schedule_2x.py b/mmde/mmdet/.mim/configs/_base_/schedules/schedule_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b7b241de6f3285e0f127f3c0581c8c84de463e4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/_base_/schedules/schedule_2x.py
@@ -0,0 +1,28 @@
+# training schedule for 2x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/albu_example/mask-rcnn_r50_fpn_albu-1x_coco.py b/mmde/mmdet/.mim/configs/albu_example/mask-rcnn_r50_fpn_albu-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8a2780e99b88c78adbe74c024fcd2d693817030
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/albu_example/mask-rcnn_r50_fpn_albu-1x_coco.py
@@ -0,0 +1,66 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+
+albu_train_transforms = [
+    dict(
+        type='ShiftScaleRotate',
+        shift_limit=0.0625,
+        scale_limit=0.0,
+        rotate_limit=0,
+        interpolation=1,
+        p=0.5),
+    dict(
+        type='RandomBrightnessContrast',
+        brightness_limit=[0.1, 0.3],
+        contrast_limit=[0.1, 0.3],
+        p=0.2),
+    dict(
+        type='OneOf',
+        transforms=[
+            dict(
+                type='RGBShift',
+                r_shift_limit=10,
+                g_shift_limit=10,
+                b_shift_limit=10,
+                p=1.0),
+            dict(
+                type='HueSaturationValue',
+                hue_shift_limit=20,
+                sat_shift_limit=30,
+                val_shift_limit=20,
+                p=1.0)
+        ],
+        p=0.1),
+    dict(type='JpegCompression', quality_lower=85, quality_upper=95, p=0.2),
+    dict(type='ChannelShuffle', p=0.1),
+    dict(
+        type='OneOf',
+        transforms=[
+            dict(type='Blur', blur_limit=3, p=1.0),
+            dict(type='MedianBlur', blur_limit=3, p=1.0)
+        ],
+        p=0.1),
+]
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(
+        type='Albu',
+        transforms=albu_train_transforms,
+        bbox_params=dict(
+            type='BboxParams',
+            format='pascal_voc',
+            label_fields=['gt_bboxes_labels', 'gt_ignore_flags'],
+            min_visibility=0.0,
+            filter_lost_elements=True),
+        keymap={
+            'img': 'image',
+            'gt_masks': 'masks',
+            'gt_bboxes': 'bboxes'
+        },
+        skip_img_without_anno=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/albu_example/metafile.yml b/mmde/mmdet/.mim/configs/albu_example/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3b54bdf15688281e5896faac3f841433497c7eaf
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/albu_example/metafile.yml
@@ -0,0 +1,17 @@
+Models:
+  - Name: mask-rcnn_r50_fpn_albu-1x_coco
+    In Collection: Mask R-CNN
+    Config: mask-rcnn_r50_fpn_albu-1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208-ab203bcd.pth
diff --git a/mmde/mmdet/.mim/configs/atss/atss_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/atss/atss_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5225d2ab672738d4d427eba252e92bd554252476
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/atss/atss_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './atss_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/atss/atss_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/atss/atss_r101_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..69999ce45aee9c76dcc4af974e6e9baabbd5b44b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/atss/atss_r101_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './atss_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/atss/atss_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/atss/atss_r18_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d9f13263619333391befd6692c83622091ef4e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/atss/atss_r18_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './atss_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/atss/atss_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/atss/atss_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..306435d7d2fc645f1c2deae784c1875cc4ceaf98
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/atss/atss_r50_fpn_1x_coco.py
@@ -0,0 +1,71 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='ATSS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='ATSSHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/atss/atss_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/atss/atss_r50_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3b3c46f4b926b82fbab438d6d50eb6c079dabc3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/atss/atss_r50_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,81 @@
+_base_ = '../common/lsj-200e_coco-detection.py'
+
+image_size = (1024, 1024)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+model = dict(
+    type='ATSS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32,
+        batch_augments=batch_augments),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='ATSSHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+train_dataloader = dict(batch_size=8, num_workers=4)
+
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    optimizer=dict(
+        type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/atss/metafile.yml b/mmde/mmdet/.mim/configs/atss/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f4c567ef29ba9ea4fddd7bc00d63a4bca41b1cfa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/atss/metafile.yml
@@ -0,0 +1,60 @@
+Collections:
+  - Name: ATSS
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ATSS
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1912.02424
+      Title: 'Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection'
+    README: configs/atss/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/atss.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: atss_r50_fpn_1x_coco
+    In Collection: ATSS
+    Config: configs/atss/atss_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      inference time (ms/im):
+        - value: 50.76
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth
+
+  - Name: atss_r101_fpn_1x_coco
+    In Collection: ATSS
+    Config: configs/atss/atss_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      inference time (ms/im):
+        - value: 81.3
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.pth
diff --git a/mmde/mmdet/.mim/configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a361952d95b655451186ef1cb39df2f24ae305
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,69 @@
+# We follow the original implementation which
+# adopts the Caffe pre-trained backbone.
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='AutoAssign',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[102.9801, 115.9465, 122.7717],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5,
+        relu_before_extra_convs=True,
+        init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')),
+    bbox_head=dict(
+        type='AutoAssignHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        loss_bbox=dict(type='GIoULoss', loss_weight=5.0)),
+    train_cfg=None,
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01), paramwise_cfg=dict(norm_decay_mult=0.))
diff --git a/mmde/mmdet/.mim/configs/autoassign/metafile.yml b/mmde/mmdet/.mim/configs/autoassign/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ab7a4af3371d4be5325498db97af0e7dd8fdc28c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/autoassign/metafile.yml
@@ -0,0 +1,33 @@
+Collections:
+  - Name: AutoAssign
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - AutoAssign
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2007.03496
+      Title: 'AutoAssign: Differentiable Label Assignment for Dense Object Detection'
+    README: configs/autoassign/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/autoassign.py#L6
+      Version: v2.12.0
+
+Models:
+  - Name: autoassign_r50-caffe_fpn_1x_coco
+    In Collection: AutoAssign
+    Config: configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.08
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.pth
diff --git a/mmde/mmdet/.mim/configs/boxinst/boxinst_r101_fpn_ms-90k_coco.py b/mmde/mmdet/.mim/configs/boxinst/boxinst_r101_fpn_ms-90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab2b11628a79aee7f6f6403cecf8f7b1d0526d69
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/boxinst/boxinst_r101_fpn_ms-90k_coco.py
@@ -0,0 +1,8 @@
+_base_ = './boxinst_r50_fpn_ms-90k_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/boxinst/boxinst_r50_fpn_ms-90k_coco.py b/mmde/mmdet/.mim/configs/boxinst/boxinst_r50_fpn_ms-90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..371f252a153855e19f3a3bb25cd42c83a4bb77fd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/boxinst/boxinst_r50_fpn_ms-90k_coco.py
@@ -0,0 +1,93 @@
+_base_ = '../common/ms-90k_coco.py'
+
+# model settings
+model = dict(
+    type='BoxInst',
+    data_preprocessor=dict(
+        type='BoxInstDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32,
+        mask_stride=4,
+        pairwise_size=3,
+        pairwise_dilation=2,
+        pairwise_color_thresh=0.3,
+        bottom_pixels_removed=10),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',  # use P5
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='BoxInstBboxHead',
+        num_params=593,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        dcn_on_last_conv=False,
+        center_sampling=True,
+        conv_bias=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    mask_head=dict(
+        type='BoxInstMaskHead',
+        num_layers=3,
+        feat_channels=16,
+        size_of_interest=8,
+        mask_out_stride=4,
+        topk_masks_per_img=64,
+        mask_feature_head=dict(
+            in_channels=256,
+            feat_channels=128,
+            start_level=0,
+            end_level=2,
+            out_channels=16,
+            mask_stride=8,
+            num_stacked_convs=4,
+            norm_cfg=dict(type='BN', requires_grad=True)),
+        loss_mask=dict(
+            type='DiceLoss',
+            use_sigmoid=True,
+            activate=True,
+            eps=5e-6,
+            loss_weight=1.0)),
+    # model training and testing settings
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100,
+        mask_thr=0.5))
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.01))
+
+# evaluator
+val_evaluator = dict(metric=['bbox', 'segm'])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/boxinst/metafile.yml b/mmde/mmdet/.mim/configs/boxinst/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c97fcdcd636cd4d8d1a1437679f20b96d90fc74f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/boxinst/metafile.yml
@@ -0,0 +1,52 @@
+Collections:
+  - Name: BoxInst
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ResNet
+        - FPN
+        - CondInst
+    Paper:
+      URL: https://arxiv.org/abs/2012.02310
+      Title: 'BoxInst: High-Performance Instance Segmentation with Box Annotations'
+    README: configs/boxinst/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v3.0.0rc6/mmdet/models/detectors/boxinst.py#L8
+      Version: v3.0.0rc6
+
+Models:
+  - Name: boxinst_r50_fpn_ms-90k_coco
+    In Collection: BoxInst
+    Config: configs/boxinst/boxinst_r50_fpn_ms-90k_coco.py
+    Metadata:
+      Iterations: 90000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 30.8
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/boxinst/boxinst_r50_fpn_ms-90k_coco/boxinst_r50_fpn_ms-90k_coco_20221228_163052-6add751a.pth
+
+  - Name: boxinst_r101_fpn_ms-90k_coco
+    In Collection: BoxInst
+    Config: configs/boxinst/boxinst_r101_fpn_ms-90k_coco.py
+    Metadata:
+      Iterations: 90000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 32.7
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/boxinst/boxinst_r101_fpn_ms-90k_coco/boxinst_r101_fpn_ms-90k_coco_20221229_145106-facf375b.pth
diff --git a/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..24b3f7841947204f2ecea385dcfa8b97fa0c6e85
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,249 @@
+_base_ = ['../yolox/yolox_x_8xb8-300e_coco.py']
+
+dataset_type = 'MOTChallengeDataset'
+data_root = 'data/MOT17/'
+
+img_scale = (1440, 800)  # weight, height
+batch_size = 4
+
+detector = _base_.model
+detector.pop('data_preprocessor')
+detector.bbox_head.update(dict(num_classes=1))
+detector.test_cfg.nms.update(dict(iou_threshold=0.7))
+detector['init_cfg'] = dict(
+    type='Pretrained',
+    checkpoint=  # noqa: E251
+    'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth'  # noqa: E501
+)
+del _base_.model
+
+model = dict(
+    type='ByteTrack',
+    data_preprocessor=dict(
+        type='TrackDataPreprocessor',
+        pad_size_divisor=32,
+        # in bytetrack, we provide joint train detector and evaluate tracking
+        # performance, use_det_processor means use independent detector
+        # data_preprocessor. of course, you can train detector independently
+        # like strongsort
+        use_det_processor=True,
+        batch_augments=[
+            dict(
+                type='BatchSyncRandomResize',
+                random_size_range=(576, 1024),
+                size_divisor=32,
+                interval=10)
+        ]),
+    detector=detector,
+    tracker=dict(
+        type='ByteTracker',
+        motion=dict(type='KalmanFilter'),
+        obj_score_thrs=dict(high=0.6, low=0.1),
+        init_track_thr=0.7,
+        weight_iou_with_det_scores=True,
+        match_iou_thrs=dict(high=0.1, low=0.5, tentative=0.3),
+        num_frames_retain=30))
+
+train_pipeline = [
+    dict(
+        type='Mosaic',
+        img_scale=img_scale,
+        pad_val=114.0,
+        bbox_clip_border=False),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2),
+        bbox_clip_border=False),
+    dict(
+        type='MixUp',
+        img_scale=img_scale,
+        ratio_range=(0.8, 1.6),
+        pad_val=114.0,
+        bbox_clip_border=False),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='Resize',
+        scale=img_scale,
+        keep_ratio=True,
+        clip_object_border=False),
+    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(
+        type='TransformBroadcaster',
+        transforms=[
+            dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+            dict(type='Resize', scale=img_scale, keep_ratio=True),
+            dict(
+                type='Pad',
+                size_divisor=32,
+                pad_val=dict(img=(114.0, 114.0, 114.0))),
+            dict(type='LoadTrackAnnotations'),
+        ]),
+    dict(type='PackTrackInputs')
+]
+train_dataloader = dict(
+    _delete_=True,
+    batch_size=batch_size,
+    num_workers=4,
+    persistent_workers=True,
+    pin_memory=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='MultiImageMixDataset',
+        dataset=dict(
+            type='ConcatDataset',
+            datasets=[
+                dict(
+                    type='CocoDataset',
+                    data_root='data/MOT17',
+                    ann_file='annotations/half-train_cocoformat.json',
+                    data_prefix=dict(img='train'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+                dict(
+                    type='CocoDataset',
+                    data_root='data/crowdhuman',
+                    ann_file='annotations/crowdhuman_train.json',
+                    data_prefix=dict(img='train'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+                dict(
+                    type='CocoDataset',
+                    data_root='data/crowdhuman',
+                    ann_file='annotations/crowdhuman_val.json',
+                    data_prefix=dict(img='val'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+            ]),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    _delete_=True,
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    pin_memory=True,
+    drop_last=False,
+    # video_based
+    # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    sampler=dict(type='TrackImgSampler'),  # image_based
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/half-val_cocoformat.json',
+        data_prefix=dict(img_path='train'),
+        test_mode=True,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# optimizer
+# default 8 gpu
+base_lr = 0.001 / 8 * batch_size
+optim_wrapper = dict(optimizer=dict(lr=base_lr))
+
+# some hyper parameters
+# training settings
+max_epochs = 80
+num_last_epochs = 10
+interval = 5
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop',
+    max_epochs=max_epochs,
+    val_begin=70,
+    val_interval=1)
+
+# learning policy
+param_scheduler = [
+    dict(
+        # use quadratic formula to warm up 1 epochs
+        type='QuadraticWarmupLR',
+        by_epoch=True,
+        begin=0,
+        end=1,
+        convert_to_iter_based=True),
+    dict(
+        # use cosine lr from 1 to 70 epoch
+        type='CosineAnnealingLR',
+        eta_min=base_lr * 0.05,
+        begin=1,
+        T_max=max_epochs - num_last_epochs,
+        end=max_epochs - num_last_epochs,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        # use fixed lr during last 10 epochs
+        type='ConstantLR',
+        by_epoch=True,
+        factor=1,
+        begin=max_epochs - num_last_epochs,
+        end=max_epochs,
+    )
+]
+
+custom_hooks = [
+    dict(
+        type='YOLOXModeSwitchHook',
+        num_last_epochs=num_last_epochs,
+        priority=48),
+    dict(type='SyncNormHook', priority=48),
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0001,
+        update_buffers=True,
+        priority=49)
+]
+
+default_hooks = dict(
+    checkpoint=dict(
+        _delete_=True, type='CheckpointHook', interval=1, max_keep_ckpts=10),
+    visualization=dict(type='TrackVisualizationHook', draw=False))
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# evaluator
+val_evaluator = dict(
+    _delete_=True,
+    type='MOTChallengeMetric',
+    metric=['HOTA', 'CLEAR', 'Identity'],
+    postprocess_tracklet_cfg=[
+        dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20)
+    ])
+test_evaluator = val_evaluator
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (4 samples per GPU)
+auto_scale_lr = dict(base_batch_size=32)
+
+del detector
+del _base_.tta_model
+del _base_.tta_pipeline
+del _base_.train_dataset
diff --git a/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9202f5fbda29d2a1d4cc81322c99d638ebf475d6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py
@@ -0,0 +1,127 @@
+_base_ = [
+    './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_'
+    'test-mot17halfval.py'
+]
+
+dataset_type = 'MOTChallengeDataset'
+
+img_scale = (1600, 896)  # weight, height
+
+model = dict(
+    data_preprocessor=dict(
+        type='TrackDataPreprocessor',
+        use_det_processor=True,
+        pad_size_divisor=32,
+        batch_augments=[
+            dict(type='BatchSyncRandomResize', random_size_range=(640, 1152))
+        ]),
+    tracker=dict(
+        weight_iou_with_det_scores=False,
+        match_iou_thrs=dict(high=0.3),
+    ))
+
+train_pipeline = [
+    dict(
+        type='Mosaic',
+        img_scale=img_scale,
+        pad_val=114.0,
+        bbox_clip_border=True),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2),
+        bbox_clip_border=True),
+    dict(
+        type='MixUp',
+        img_scale=img_scale,
+        ratio_range=(0.8, 1.6),
+        pad_val=114.0,
+        bbox_clip_border=True),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='Resize',
+        scale=img_scale,
+        keep_ratio=True,
+        clip_object_border=True),
+    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(
+        type='TransformBroadcaster',
+        transforms=[
+            dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+            dict(type='Resize', scale=img_scale, keep_ratio=True),
+            dict(
+                type='Pad',
+                size_divisor=32,
+                pad_val=dict(img=(114.0, 114.0, 114.0))),
+            dict(type='LoadTrackAnnotations'),
+        ]),
+    dict(type='PackTrackInputs')
+]
+train_dataloader = dict(
+    dataset=dict(
+        type='MultiImageMixDataset',
+        dataset=dict(
+            type='ConcatDataset',
+            datasets=[
+                dict(
+                    type='CocoDataset',
+                    data_root='data/MOT20',
+                    ann_file='annotations/train_cocoformat.json',
+                    # TODO: mmdet use img as key, but img_path is needed
+                    data_prefix=dict(img='train'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+                dict(
+                    type='CocoDataset',
+                    data_root='data/crowdhuman',
+                    ann_file='annotations/crowdhuman_train.json',
+                    data_prefix=dict(img='train'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+                dict(
+                    type='CocoDataset',
+                    data_root='data/crowdhuman',
+                    ann_file='annotations/crowdhuman_val.json',
+                    data_prefix=dict(img='val'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+            ]),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    dataset=dict(ann_file='annotations/train_cocoformat.json'))
+
+test_dataloader = dict(
+    dataset=dict(
+        data_root='data/MOT20', ann_file='annotations/test_cocoformat.json'))
+
+test_evaluator = dict(
+    type='MOTChallengeMetrics',
+    postprocess_tracklet_cfg=[
+        dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20)
+    ],
+    format_only=True,
+    outfile_prefix='./mot_20_test_res')
diff --git a/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c2119203a46e76cd8b6cc8f755334f58ffb086d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,9 @@
+_base_ = [
+    './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_'
+    'test-mot17halfval.py'
+]
+
+# fp16 settings
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
+val_cfg = dict(type='ValLoop', fp16=True)
+test_cfg = dict(type='TestLoop', fp16=True)
diff --git a/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f4427c18bff66ab1fa2a9ba22517989722d0625
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py
@@ -0,0 +1,17 @@
+_base_ = [
+    './bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-'
+    'mot17halftrain_test-mot17halfval.py'
+]
+
+test_dataloader = dict(
+    dataset=dict(
+        data_root='data/MOT17/',
+        ann_file='annotations/test_cocoformat.json',
+        data_prefix=dict(img_path='test')))
+test_evaluator = dict(
+    type='MOTChallengeMetrics',
+    postprocess_tracklet_cfg=[
+        dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20)
+    ],
+    format_only=True,
+    outfile_prefix='./mot_17_test_res')
diff --git a/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1016999729263d72bbd75019be4968bc3960e368
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py
@@ -0,0 +1,8 @@
+_base_ = [
+    './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py'
+]
+
+# fp16 settings
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
+val_cfg = dict(type='ValLoop', fp16=True)
+test_cfg = dict(type='TestLoop', fp16=True)
diff --git a/mmde/mmdet/.mim/configs/bytetrack/metafile.yml b/mmde/mmdet/.mim/configs/bytetrack/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8ed638cf6dda0b0b3db264aa8847358d78ee0fbe
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/bytetrack/metafile.yml
@@ -0,0 +1,53 @@
+Collections:
+  - Name: ByteTrack
+    Metadata:
+      Training Techniques:
+        - SGD with Momentum
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - YOLOX
+    Paper:
+      URL: https://arxiv.org/abs/2110.06864
+      Title: ByteTrack Multi-Object Tracking by Associating Every Detection Box
+    README: configs/bytetrack/README.md
+
+Models:
+  - Name: bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval
+    In Collection: ByteTrack
+    Config: configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
+    Metadata:
+      Training Data: CrowdHuman + MOT17-half-train
+    Results:
+      - Task: Multiple Object Tracking
+        Dataset: MOT17-half-val
+        Metrics:
+          HOTA: 67.5
+          MOTA: 78.6
+          IDF1: 78.5
+    Weights: https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth
+
+  - Name: bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test
+    In Collection: ByteTrack
+    Config: configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py
+    Metadata:
+      Training Data: CrowdHuman + MOT17-half-train
+    Results:
+      - Task: Multiple Object Tracking
+        Dataset: MOT17-test
+        Metrics:
+          MOTA: 78.1
+          IDF1: 74.8
+    Weights: https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth
+
+  - Name: bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test
+    In Collection: ByteTrack
+    Config: configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py
+    Metadata:
+      Training Data: CrowdHuman + MOT20-train
+    Results:
+      - Task: Multiple Object Tracking
+        Dataset: MOT20-test
+        Metrics:
+          MOTA: 77.0
+          IDF1: 75.4
+    Weights: https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040-9ce38a60.pth
diff --git a/mmde/mmdet/.mim/configs/bytetrack/yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/bytetrack/yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fc3acd487211d04fb3d6e4504ded5235393e4a7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/bytetrack/yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,6 @@
+_base_ = [
+    '../strongsort/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py'  # noqa: E501
+]
+
+# fp16 settings
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
diff --git a/mmde/mmdet/.mim/configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py b/mmde/mmdet/.mim/configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..388305cceac2e81eb1b4df6eac36662df7b8bf0d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py
@@ -0,0 +1,20 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    data_preprocessor=dict(pad_size_divisor=64),
+    neck=dict(
+        type='FPN_CARAFE',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5,
+        start_level=0,
+        end_level=-1,
+        norm_cfg=None,
+        act_cfg=None,
+        order=('conv', 'norm', 'act'),
+        upsample_cfg=dict(
+            type='carafe',
+            up_kernel=5,
+            up_group=1,
+            encoder_kernel=3,
+            encoder_dilation=1,
+            compressed_channels=64)))
diff --git a/mmde/mmdet/.mim/configs/carafe/mask-rcnn_r50_fpn-carafe_1x_coco.py b/mmde/mmdet/.mim/configs/carafe/mask-rcnn_r50_fpn-carafe_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ce621de77aff60f39126136cb25ca9ca38a1c9f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/carafe/mask-rcnn_r50_fpn-carafe_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    data_preprocessor=dict(pad_size_divisor=64),
+    neck=dict(
+        type='FPN_CARAFE',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5,
+        start_level=0,
+        end_level=-1,
+        norm_cfg=None,
+        act_cfg=None,
+        order=('conv', 'norm', 'act'),
+        upsample_cfg=dict(
+            type='carafe',
+            up_kernel=5,
+            up_group=1,
+            encoder_kernel=3,
+            encoder_dilation=1,
+            compressed_channels=64)),
+    roi_head=dict(
+        mask_head=dict(
+            upsample_cfg=dict(
+                type='carafe',
+                scale_factor=2,
+                up_kernel=5,
+                up_group=1,
+                encoder_kernel=3,
+                encoder_dilation=1,
+                compressed_channels=64))))
diff --git a/mmde/mmdet/.mim/configs/carafe/metafile.yml b/mmde/mmdet/.mim/configs/carafe/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..863c0f49ae6322429e91cf068b06f713a29fcbdc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/carafe/metafile.yml
@@ -0,0 +1,55 @@
+Collections:
+  - Name: CARAFE
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RPN
+        - FPN_CARAFE
+        - ResNet
+        - RoIPool
+    Paper:
+      URL: https://arxiv.org/abs/1905.02188
+      Title: 'CARAFE: Content-Aware ReAssembly of FEatures'
+    README: configs/carafe/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/necks/fpn_carafe.py#L11
+      Version: v2.12.0
+
+Models:
+  - Name: faster-rcnn_r50_fpn_carafe_1x_coco
+    In Collection: CARAFE
+    Config: configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.26
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth
+
+  - Name: mask-rcnn_r50_fpn_carafe_1x_coco
+    In Collection: CARAFE
+    Config: configs/carafe/mask-rcnn_r50_fpn-carafe_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.31
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 35.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.393__segm_mAP-0.358_20200503_135957-8687f195.pth
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d85340e1cb92c60293c3710d05ef708d3726fdd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6855ee8c6fffd5e8d48f6cc2bb41e9dde9f6516
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3d962c229d2621e7364c13959e3c4c1137edef1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..497148f513edb79ca58f719f242be6274f923a65
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_20e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..183b5c50ff5563d987b2937d27d6d02bdd6cc2bd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_ms-3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..497f68c4ab458ec49ad1d0c89cabbb2c0eb444f3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = ['./cascade-mask-rcnn_r50_fpn_1x_coco.py']
+
+model = dict(
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6677a9fea501a7683475dc8b865659cef5485bbe
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../common/ms_3x_coco-instance.py',
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    # use caffe img_norm
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f59bb94eaaf3e850e971268383cd0275bcddf54d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..35c8aa6748d25e4c9c834478488ee21b44c8f2bd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_20e.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b15006f451f346216243dc61140e9907535f0b20
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_ms-3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = [
+    '../common/ms_3x_coco-instance.py',
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py'
+]
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..87a4cc325a10b01cbf5a91e336da2281bc19a728
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e8dcaa6891877c89acb024b9811a4fe7a87bc3b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a0f61b9aee2b0ab80c5c9b998a73826e5ff45a6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cf08306850bdaef776a0ce53b88b23b9013a1a0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py
@@ -0,0 +1,24 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_ms-3x_coco.py'
+
+model = dict(
+    # ResNeXt-101-32x8d model trained with Caffe2 at FB,
+    # so the mean and std need to be changed.
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[57.375, 57.120, 58.395],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb2e6b6b9507dcf38403d38499e1d57bd792a4da
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc20c171542b5d75634d99d9ed25eea3acf8df19
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4ecc42655903c271e7e181b719d09821118a204
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6eaee2db700b897255ed44a5fd30bc23929388f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cdf5108b7d2908e420c52c59f8a9805c7989702
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..84c285fc9e59d4191e79dd337ece2baff3d38b02
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101_fpn_20e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './cascade-rcnn_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fc52e9cb8e1e9c27d45e32200b0b72efa8c363d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa30a3d07f5644dfc6f79f0eafc374518149e777
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad90e259b2d8410309bfd877b74755524b94f788
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './cascade-rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    # use caffe img_norm
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a07c8b2302b9c2337d4da2d32c388142ca1f748
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/cascade-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..30f3ff106018ba51173f018c196cf62a88fdb172
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/cascade-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_20e.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd25f02608c3f51a59e35185a41080c6e8e3a1ea
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,23 @@
+_base_ = [
+    '../_base_/models/cascade-rcnn_r50_fpn.py',
+    '../common/lsj-200e_coco-detection.py'
+]
+image_size = (1024, 1024)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+# disable allowed_border to avoid potential errors.
+model = dict(
+    data_preprocessor=dict(batch_augments=batch_augments),
+    train_cfg=dict(rpn=dict(allowed_border=-1)))
+
+train_dataloader = dict(batch_size=8, num_workers=4)
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    optimizer=dict(
+        type='SGD', lr=0.02 * 4, momentum=0.9, weight_decay=0.00004))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..50e0b9544592d61b3c14ec7f64f3e6eaa2e96a57
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6120189205d883d98b2d323a160ec54ea26aab13
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_20e_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade-rcnn_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..29475e39273dccad13058e9114728770e77f71ef
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './cascade-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    type='CascadeRCNN',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101_64x4d_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101_64x4d_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2aa57eaaf43788fc3628f1463e94405279c7416
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/cascade-rcnn_x101_64x4d_fpn_20e_coco.py
@@ -0,0 +1,15 @@
+_base_ = './cascade-rcnn_r50_fpn_20e_coco.py'
+model = dict(
+    type='CascadeRCNN',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/cascade_rcnn/metafile.yml b/mmde/mmdet/.mim/configs/cascade_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7e0385daeed3f3310dc7f9a8b64c99b5cb8324b4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rcnn/metafile.yml
@@ -0,0 +1,545 @@
+Collections:
+  - Name: Cascade R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Cascade R-CNN
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: http://dx.doi.org/10.1109/tpami.2019.2956516
+      Title: 'Cascade R-CNN: Delving into High Quality Object Detection'
+    README: configs/cascade_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/cascade_rcnn.py#L6
+      Version: v2.0.0
+  - Name: Cascade Mask R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Cascade R-CNN
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: http://dx.doi.org/10.1109/tpami.2019.2956516
+      Title: 'Cascade R-CNN: Delving into High Quality Object Detection'
+    README: configs/cascade_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/cascade_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: cascade-rcnn_r50-caffe_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.404_20200504_174853-b857be87.pth
+
+  - Name: cascade-rcnn_r50_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 62.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth
+
+  - Name: cascade-rcnn_r50_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 62.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_bbox_mAP-0.41_20200504_175131-e9872a90.pth
+
+  - Name: cascade-rcnn_r101-caffe_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.423_20200504_175649-cab8dbd5.pth
+
+  - Name: cascade-rcnn_r101_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 74.07
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317-0b6a2fbf.pth
+
+  - Name: cascade-rcnn_r101_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_r101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 74.07
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_bbox_mAP-0.425_20200504_231812-5057dcc5.pth
+
+  - Name: cascade-rcnn_x101-32x4d_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 91.74
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316-95c2deb6.pth
+
+  - Name: cascade-rcnn_x101-32x4d_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608-9ae0a720.pth
+
+  - Name: cascade-rcnn_x101-64x4d_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth
+
+  - Name: cascade-rcnn_x101_64x4d_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-rcnn_x101_64x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth
+
+  - Name: cascade-mask-rcnn_r50-caffe_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.412__segm_mAP-0.36_20200504_174659-5004b251.pth
+
+  - Name: cascade-mask-rcnn_r50_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 89.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  35.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth
+
+  - Name: cascade-mask-rcnn_r50_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 89.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth
+
+  - Name: cascade-mask-rcnn_r101-caffe_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.432__segm_mAP-0.376_20200504_174813-5c1e9599.pth
+
+  - Name: cascade-mask-rcnn_r101_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      inference time (ms/im):
+        - value: 102.04
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203-befdf6ee.pth
+
+  - Name: cascade-mask-rcnn_r101_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      inference time (ms/im):
+        - value: 102.04
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_bbox_mAP-0.434__segm_mAP-0.378_20200504_174836-005947da.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      inference time (ms/im):
+        - value: 116.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201-0f411b1f.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      inference time (ms/im):
+        - value: 116.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917-ed1f4751.pth
+
+  - Name: cascade-mask-rcnn_x101-64x4d_fpn_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 12.2
+      inference time (ms/im):
+        - value: 149.25
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  39.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203-9a2db89d.pth
+
+  - Name: cascade-mask-rcnn_x101-64x4d_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 12.2
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033-bdb5126a.pth
+
+  - Name: cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.7
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651-6e29b3a6.pth
+
+  - Name: cascade-mask-rcnn_r50_fpn_mstrain_3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719-5bdc3824.pth
+
+  - Name: cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.7
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620-a5bd2389.pth
+
+  - Name: cascade-mask-rcnn_r101_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236-51a2d363.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234-40773067.pth
+
+  - Name: cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 12.1
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640-9ff7e76f.pth
+
+  - Name: cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 12.0
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311-d3e64ba0.pth
diff --git a/mmde/mmdet/.mim/configs/cascade_rpn/cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rpn/cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba23ce90652d2ab2e9362be9a6231742d1815a70
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rpn/cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,27 @@
+_base_ = '../fast_rcnn/fast-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            bbox_coder=dict(target_stds=[0.04, 0.04, 0.08, 0.08]),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.5),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            assigner=dict(
+                pos_iou_thr=0.65, neg_iou_thr=0.65, min_pos_iou=0.65),
+            sampler=dict(num=256))),
+    test_cfg=dict(rcnn=dict(score_thr=1e-3)))
+
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# train_dataloader = dict(dataset=dict(proposal_file='proposals/crpn_r50_caffe_fpn_1x_train2017.pkl'))  # noqa
+_base_.train_dataloader.dataset.proposal_file = 'proposals/crpn_r50_caffe_fpn_1x_train2017.pkl'  # noqa
+
+# val_dataloader = dict(dataset=dict(proposal_file='proposals/crpn_r50_caffe_fpn_1x_val2017.pkl'))  # noqa
+# test_dataloader = val_dataloader
+_base_.val_dataloader.dataset.proposal_file = 'proposals/crpn_r50_caffe_fpn_1x_val2017.pkl'  # noqa
+test_dataloader = _base_.val_dataloader
+
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f7eced00144fb8fff1f234210a2b3f3fe475c8f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,89 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py'
+rpn_weight = 0.7
+model = dict(
+    rpn_head=dict(
+        _delete_=True,
+        type='CascadeRPNHead',
+        num_stages=2,
+        stages=[
+            dict(
+                type='StageCascadeRPNHead',
+                in_channels=256,
+                feat_channels=256,
+                anchor_generator=dict(
+                    type='AnchorGenerator',
+                    scales=[8],
+                    ratios=[1.0],
+                    strides=[4, 8, 16, 32, 64]),
+                adapt_cfg=dict(type='dilation', dilation=3),
+                bridged_feature=True,
+                with_cls=False,
+                reg_decoded_bbox=True,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=(.0, .0, .0, .0),
+                    target_stds=(0.1, 0.1, 0.5, 0.5)),
+                loss_bbox=dict(
+                    type='IoULoss', linear=True,
+                    loss_weight=10.0 * rpn_weight)),
+            dict(
+                type='StageCascadeRPNHead',
+                in_channels=256,
+                feat_channels=256,
+                adapt_cfg=dict(type='offset'),
+                bridged_feature=False,
+                with_cls=True,
+                reg_decoded_bbox=True,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=(.0, .0, .0, .0),
+                    target_stds=(0.05, 0.05, 0.1, 0.1)),
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=True,
+                    loss_weight=1.0 * rpn_weight),
+                loss_bbox=dict(
+                    type='IoULoss', linear=True,
+                    loss_weight=10.0 * rpn_weight))
+        ]),
+    roi_head=dict(
+        bbox_head=dict(
+            bbox_coder=dict(target_stds=[0.04, 0.04, 0.08, 0.08]),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.5),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=[
+            dict(
+                assigner=dict(
+                    type='RegionAssigner', center_ratio=0.2, ignore_ratio=0.5),
+                allowed_border=-1,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=256,
+                    pos_fraction=0.5,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=False),
+                allowed_border=-1,
+                pos_weight=-1,
+                debug=False)
+        ],
+        rpn_proposal=dict(max_per_img=300, nms=dict(iou_threshold=0.8)),
+        rcnn=dict(
+            assigner=dict(
+                pos_iou_thr=0.65, neg_iou_thr=0.65, min_pos_iou=0.65),
+            sampler=dict(type='RandomSampler', num=256))),
+    test_cfg=dict(
+        rpn=dict(max_per_img=300, nms=dict(iou_threshold=0.8)),
+        rcnn=dict(score_thr=1e-3)))
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/cascade_rpn/cascade-rpn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/cascade_rpn/cascade-rpn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6eba24d11368ee0cdaae4fa316020ea3750be7f0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rpn/cascade-rpn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,76 @@
+_base_ = '../rpn/rpn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    rpn_head=dict(
+        _delete_=True,
+        type='CascadeRPNHead',
+        num_stages=2,
+        stages=[
+            dict(
+                type='StageCascadeRPNHead',
+                in_channels=256,
+                feat_channels=256,
+                anchor_generator=dict(
+                    type='AnchorGenerator',
+                    scales=[8],
+                    ratios=[1.0],
+                    strides=[4, 8, 16, 32, 64]),
+                adapt_cfg=dict(type='dilation', dilation=3),
+                bridged_feature=True,
+                sampling=False,
+                with_cls=False,
+                reg_decoded_bbox=True,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=(.0, .0, .0, .0),
+                    target_stds=(0.1, 0.1, 0.5, 0.5)),
+                loss_bbox=dict(type='IoULoss', linear=True, loss_weight=10.0)),
+            dict(
+                type='StageCascadeRPNHead',
+                in_channels=256,
+                feat_channels=256,
+                adapt_cfg=dict(type='offset'),
+                bridged_feature=False,
+                sampling=True,
+                with_cls=True,
+                reg_decoded_bbox=True,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=(.0, .0, .0, .0),
+                    target_stds=(0.05, 0.05, 0.1, 0.1)),
+                loss_cls=dict(
+                    type='CrossEntropyLoss', use_sigmoid=True,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='IoULoss', linear=True, loss_weight=10.0))
+        ]),
+    train_cfg=dict(rpn=[
+        dict(
+            assigner=dict(
+                type='RegionAssigner', center_ratio=0.2, ignore_ratio=0.5),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.7,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1,
+                iou_calculator=dict(type='BboxOverlaps2D')),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False)
+    ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.8),
+            min_bbox_size=0)))
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/cascade_rpn/metafile.yml b/mmde/mmdet/.mim/configs/cascade_rpn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..62a88c5d2185ffd3aa7884f7a8c7d68cc3d60c8f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cascade_rpn/metafile.yml
@@ -0,0 +1,44 @@
+Collections:
+  - Name: Cascade RPN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Cascade RPN
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1909.06720
+      Title: 'Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution'
+    README: configs/cascade_rpn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.8.0/mmdet/models/dense_heads/cascade_rpn_head.py#L538
+      Version: v2.8.0
+
+Models:
+  - Name: cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco
+    In Collection: Cascade RPN
+    Config: configs/cascade_rpn/cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco/crpn_fast_rcnn_r50_caffe_fpn_1x_coco-cb486e66.pth
+
+  - Name: cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco
+    In Collection: Cascade RPN
+    Config: configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco/crpn_faster_rcnn_r50_caffe_fpn_1x_coco-c8283cca.pth
diff --git a/mmde/mmdet/.mim/configs/centernet/centernet-update_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/centernet/centernet-update_r101_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fc65e0f8aeb1f02a0bea675146ced7a56800251
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centernet/centernet-update_r101_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/centernet/centernet-update_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/centernet/centernet-update_r18_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab3ae32ecd54cd08664e883a0888ef43040528d1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centernet/centernet-update_r18_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco.py b/mmde/mmdet/.mim/configs/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f6e2b3919d6d2197c0ae9e1d721dc4eab00cf9c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco.py
@@ -0,0 +1,105 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='CenterNet',
+    # use caffe img_norm
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5,
+        # There is a chance to get 40.3 after switching init_cfg,
+        # otherwise it is about 39.9~40.1
+        init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'),
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='CenterNetUpdateHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        hm_min_radius=4,
+        hm_min_overlap=0.8,
+        more_pos_thresh=0.2,
+        more_pos_topk=9,
+        soft_weight_on_reg=False,
+        loss_cls=dict(
+            type='GaussianFocalLoss',
+            pos_weight=0.25,
+            neg_weight=0.75,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+    ),
+    train_cfg=None,
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# single-scale training is about 39.3
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.00025,
+        by_epoch=False,
+        begin=0,
+        end=4000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01),
+    # Experiments show that there is no need to turn on clip_grad.
+    paramwise_cfg=dict(norm_decay_mult=0.))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/centernet/centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/centernet/centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..34e0c680d39486467464f0ea7d6e1e08bf0c5240
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centernet/centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,83 @@
+_base_ = '../common/lsj-200e_coco-detection.py'
+
+image_size = (1024, 1024)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+model = dict(
+    type='CenterNet',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32,
+        batch_augments=batch_augments),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5,
+        init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'),
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='CenterNetUpdateHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        loss_cls=dict(
+            type='GaussianFocalLoss',
+            pos_weight=0.25,
+            neg_weight=0.75,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+    ),
+    train_cfg=None,
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+train_dataloader = dict(batch_size=8, num_workers=4)
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    optimizer=dict(
+        type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004),
+    paramwise_cfg=dict(norm_decay_mult=0.))
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.00025,
+        by_epoch=False,
+        begin=0,
+        end=4000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=25,
+        by_epoch=True,
+        milestones=[22, 24],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py b/mmde/mmdet/.mim/configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..732a55d59ad7dee175d8b72f798f0be044f23326
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py
@@ -0,0 +1,136 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py',
+    './centernet_tta.py'
+]
+
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# model settings
+model = dict(
+    type='CenterNet',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='ResNet',
+        depth=18,
+        norm_eval=False,
+        norm_cfg=dict(type='BN'),
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(
+        type='CTResNetNeck',
+        in_channels=512,
+        num_deconv_filters=(256, 128, 64),
+        num_deconv_kernels=(4, 4, 4),
+        use_dcn=True),
+    bbox_head=dict(
+        type='CenterNetHead',
+        num_classes=80,
+        in_channels=64,
+        feat_channels=64,
+        loss_center_heatmap=dict(type='GaussianFocalLoss', loss_weight=1.0),
+        loss_wh=dict(type='L1Loss', loss_weight=0.1),
+        loss_offset=dict(type='L1Loss', loss_weight=1.0)),
+    train_cfg=None,
+    test_cfg=dict(topk=100, local_maximum_kernel=3, max_per_img=100))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='RandomCenterCropPad',
+        # The cropped images are padded into squares during training,
+        # but may be less than crop_size.
+        crop_size=(512, 512),
+        ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3),
+        mean=[0, 0, 0],
+        std=[1, 1, 1],
+        to_rgb=True,
+        test_pad_mode=None),
+    # Make sure the output is always crop_size.
+    dict(type='Resize', scale=(512, 512), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        backend_args={{_base_.backend_args}},
+        to_float32=True),
+    # don't need Resize
+    dict(
+        type='RandomCenterCropPad',
+        ratios=None,
+        border=None,
+        mean=[0, 0, 0],
+        std=[1, 1, 1],
+        to_rgb=True,
+        test_mode=True,
+        test_pad_mode=['logical_or', 31],
+        test_pad_add_pix=1),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'border'))
+]
+
+# Use RepeatDataset to speed up training
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args={{_base_.backend_args}},
+        )))
+
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# optimizer
+# Based on the default settings of modern detectors, the SGD effect is better
+# than the Adam in the source code, so we use SGD default settings and
+# if you use adam+lr5e-4, the map is 29.1.
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
+
+max_epochs = 28
+# learning policy
+# Based on the default settings of modern detectors, we added warmup settings.
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[18, 24],  # the real step is [18*5, 24*5]
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)  # the real epoch is 28*5=140
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (16 samples per GPU)
+auto_scale_lr = dict(base_batch_size=128)
diff --git a/mmde/mmdet/.mim/configs/centernet/centernet_r18_8xb16-crop512-140e_coco.py b/mmde/mmdet/.mim/configs/centernet/centernet_r18_8xb16-crop512-140e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6094b64221bd91eaafc9868e01c718d4421b418a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centernet/centernet_r18_8xb16-crop512-140e_coco.py
@@ -0,0 +1,3 @@
+_base_ = './centernet_r18-dcnv2_8xb16-crop512-140e_coco.py'
+
+model = dict(neck=dict(use_dcn=False))
diff --git a/mmde/mmdet/.mim/configs/centernet/centernet_tta.py b/mmde/mmdet/.mim/configs/centernet/centernet_tta.py
new file mode 100644
index 0000000000000000000000000000000000000000..edd7b03ecdeb272870919dcbd4842d6b8e32d8d4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centernet/centernet_tta.py
@@ -0,0 +1,39 @@
+# This is different from the TTA of official CenterNet.
+
+tta_model = dict(
+    type='DetTTAModel',
+    tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))
+
+tta_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True, backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                # ``RandomFlip`` must be placed before ``RandomCenterCropPad``,
+                # otherwise bounding box coordinates after flipping cannot be
+                # recovered correctly.
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='RandomCenterCropPad',
+                    ratios=None,
+                    border=None,
+                    mean=[0, 0, 0],
+                    std=[1, 1, 1],
+                    to_rgb=True,
+                    test_mode=True,
+                    test_pad_mode=['logical_or', 31],
+                    test_pad_add_pix=1),
+            ],
+            [dict(type='LoadAnnotations', with_bbox=True)],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'flip', 'flip_direction', 'border'))
+            ]
+        ])
+]
diff --git a/mmde/mmdet/.mim/configs/centernet/metafile.yml b/mmde/mmdet/.mim/configs/centernet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..496b8ea22df0ac1e757a40c2750893034e08a81c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centernet/metafile.yml
@@ -0,0 +1,60 @@
+Collections:
+  - Name: CenterNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x TITANXP GPUs
+      Architecture:
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.07850
+      Title: 'Objects as Points'
+    README: configs/centernet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.13.0/mmdet/models/detectors/centernet.py#L10
+      Version: v2.13.0
+
+Models:
+  - Name: centernet_r18-dcnv2_8xb16-crop512-140e_coco
+    In Collection: CenterNet
+    Config: configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py
+    Metadata:
+      Batch Size: 128
+      Training Memory (GB): 3.47
+      Epochs: 140
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 29.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131-c8cd631f.pth
+
+  - Name: centernet_r18_8xb16-crop512-140e_coco
+    In Collection: CenterNet
+    Config: configs/centernet/centernet_r18_8xb16-crop512-140e_coco.py
+    Metadata:
+      Batch Size: 128
+      Training Memory (GB): 3.45
+      Epochs: 140
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 25.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630-bb5b3bf7.pth
+
+  - Name: centernet-update_r50-caffe_fpn_ms-1x_coco
+    In Collection: CenterNet
+    Config: configs/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco.py
+    Metadata:
+      Batch Size: 16
+      Training Memory (GB): 3.3
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco/centernet-update_r50-caffe_fpn_ms-1x_coco_20230512_203845-8306baf2.pth
diff --git a/mmde/mmdet/.mim/configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py b/mmde/mmdet/.mim/configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b757ffd16dca2d2b51d27ad413fdba889252c87f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py
@@ -0,0 +1,181 @@
+_base_ = [
+    '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py'
+]
+
+data_preprocessor = dict(
+    type='DetDataPreprocessor',
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True)
+
+# model settings
+model = dict(
+    type='CornerNet',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='HourglassNet',
+        downsample_times=5,
+        num_stacks=2,
+        stage_channels=[256, 256, 384, 384, 384, 512],
+        stage_blocks=[2, 2, 2, 2, 2, 4],
+        norm_cfg=dict(type='BN', requires_grad=True)),
+    neck=None,
+    bbox_head=dict(
+        type='CentripetalHead',
+        num_classes=80,
+        in_channels=256,
+        num_feat_levels=2,
+        corner_emb_channels=0,
+        loss_heatmap=dict(
+            type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1),
+        loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1),
+        loss_guiding_shift=dict(
+            type='SmoothL1Loss', beta=1.0, loss_weight=0.05),
+        loss_centripetal_shift=dict(
+            type='SmoothL1Loss', beta=1.0, loss_weight=1)),
+    # training and testing settings
+    train_cfg=None,
+    test_cfg=dict(
+        corner_topk=100,
+        local_maximum_kernel=3,
+        distance_threshold=0.5,
+        score_thr=0.05,
+        max_per_img=100,
+        nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian')))
+
+# data settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        # The cropped images are padded into squares during training,
+        # but may be smaller than crop_size.
+        type='RandomCenterCropPad',
+        crop_size=(511, 511),
+        ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3),
+        test_mode=False,
+        test_pad_mode=None,
+        mean=data_preprocessor['mean'],
+        std=data_preprocessor['std'],
+        # Image data is not converted to rgb.
+        to_rgb=data_preprocessor['bgr_to_rgb']),
+    dict(type='Resize', scale=(511, 511), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs'),
+]
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        to_float32=True,
+        backend_args=_base_.backend_args),
+    # don't need Resize
+    dict(
+        type='RandomCenterCropPad',
+        crop_size=None,
+        ratios=None,
+        border=None,
+        test_mode=True,
+        test_pad_mode=['logical_or', 127],
+        mean=data_preprocessor['mean'],
+        std=data_preprocessor['std'],
+        # Image data is not converted to rgb.
+        to_rgb=data_preprocessor['bgr_to_rgb']),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'border'))
+]
+
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=3,
+    batch_sampler=None,
+    dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='Adam', lr=0.0005),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+max_epochs = 210
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[190],
+        gamma=0.1)
+]
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (6 samples per GPU)
+auto_scale_lr = dict(base_batch_size=96)
+
+tta_model = dict(
+    type='DetTTAModel',
+    tta_cfg=dict(
+        nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'),
+        max_per_img=100))
+
+tta_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        to_float32=True,
+        backend_args=_base_.backend_args),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                # ``RandomFlip`` must be placed before ``RandomCenterCropPad``,
+                # otherwise bounding box coordinates after flipping cannot be
+                # recovered correctly.
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='RandomCenterCropPad',
+                    crop_size=None,
+                    ratios=None,
+                    border=None,
+                    test_mode=True,
+                    test_pad_mode=['logical_or', 127],
+                    mean=data_preprocessor['mean'],
+                    std=data_preprocessor['std'],
+                    # Image data is not converted to rgb.
+                    to_rgb=data_preprocessor['bgr_to_rgb'])
+            ],
+            [dict(type='LoadAnnotations', with_bbox=True)],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'flip', 'flip_direction', 'border'))
+            ]
+        ])
+]
diff --git a/mmde/mmdet/.mim/configs/centripetalnet/metafile.yml b/mmde/mmdet/.mim/configs/centripetalnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..526572dfed0d158b55205c23031b5dfdbdfa9dc0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/centripetalnet/metafile.yml
@@ -0,0 +1,39 @@
+Collections:
+  - Name: CentripetalNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - Adam
+      Training Resources: 16x V100 GPUs
+      Architecture:
+        - Corner Pooling
+        - Stacked Hourglass Network
+    Paper:
+      URL: https://arxiv.org/abs/2003.09119
+      Title: 'CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection'
+    README: configs/centripetalnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.5.0/mmdet/models/detectors/cornernet.py#L9
+      Version: v2.5.0
+
+Models:
+  - Name: centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco
+    In Collection: CentripetalNet
+    Config: configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py
+    Metadata:
+      Batch Size: 96
+      Training Memory (GB): 16.7
+      inference time (ms/im):
+        - value: 270.27
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 210
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804-3ccc61e5.pth
diff --git a/mmde/mmdet/.mim/configs/cityscapes/faster-rcnn_r50_fpn_1x_cityscapes.py b/mmde/mmdet/.mim/configs/cityscapes/faster-rcnn_r50_fpn_1x_cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccd0de2aff1c1f3071e70e67dbf94b1c1cfe7e8b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cityscapes/faster-rcnn_r50_fpn_1x_cityscapes.py
@@ -0,0 +1,41 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/cityscapes_detection.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py'
+]
+model = dict(
+    backbone=dict(init_cfg=None),
+    roi_head=dict(
+        bbox_head=dict(
+            num_classes=8,
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))))
+
+# optimizer
+# lr is set for a batch size of 8
+optim_wrapper = dict(optimizer=dict(lr=0.01))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=8,
+        by_epoch=True,
+        # [7] yields higher performance than [6]
+        milestones=[7],
+        gamma=0.1)
+]
+
+# actual epoch = 8 * 8 = 64
+train_cfg = dict(max_epochs=8)
+
+# For better, more stable performance initialize from COCO
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'  # noqa
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 samples per GPU)
+# TODO: support auto scaling lr
+# auto_scale_lr = dict(base_batch_size=8)
diff --git a/mmde/mmdet/.mim/configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py b/mmde/mmdet/.mim/configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..772268b121e7b8858c4cfcf3b6820e6146634d0d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py
@@ -0,0 +1,43 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/cityscapes_instance.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py'
+]
+model = dict(
+    backbone=dict(init_cfg=None),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            num_classes=8,
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+        mask_head=dict(num_classes=8)))
+
+# optimizer
+# lr is set for a batch size of 8
+optim_wrapper = dict(optimizer=dict(lr=0.01))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=8,
+        by_epoch=True,
+        # [7] yields higher performance than [6]
+        milestones=[7],
+        gamma=0.1)
+]
+
+# actual epoch = 8 * 8 = 64
+train_cfg = dict(max_epochs=8)
+
+# For better, more stable performance initialize from COCO
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth'  # noqa
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 samples per GPU)
+# TODO: support auto scaling lr
+# auto_scale_lr = dict(base_batch_size=8)
diff --git a/mmde/mmdet/.mim/configs/common/lsj-100e_coco-detection.py b/mmde/mmdet/.mim/configs/common/lsj-100e_coco-detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb631e5d5c1253cc3a5d81a8cdc6cd86133d9b53
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/lsj-100e_coco-detection.py
@@ -0,0 +1,122 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+image_size = (1024, 1024)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+# Use RepeatDataset to speed up training
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=4,  # simply change this from 2 to 16 for 50e - 400e training.
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+max_epochs = 25
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=5)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# optimizer assumes bs=64
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[22, 24],
+        gamma=0.1)
+]
+
+# only keep latest 2 checkpoints
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=2))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/common/lsj-100e_coco-instance.py b/mmde/mmdet/.mim/configs/common/lsj-100e_coco-instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e62729d639c7659115a7f5f6449fa9021338be6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/lsj-100e_coco-instance.py
@@ -0,0 +1,122 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+image_size = (1024, 1024)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+# Use RepeatDataset to speed up training
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=4,  # simply change this from 2 to 16 for 50e - 400e training.
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+max_epochs = 25
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=5)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# optimizer assumes bs=64
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[22, 24],
+        gamma=0.1)
+]
+
+# only keep latest 2 checkpoints
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=2))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/common/lsj-200e_coco-detection.py b/mmde/mmdet/.mim/configs/common/lsj-200e_coco-detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..83d12947fed900f05d748b6f90ef29cc5fbc407a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/lsj-200e_coco-detection.py
@@ -0,0 +1,18 @@
+_base_ = './lsj-100e_coco-detection.py'
+
+# 8x25=200e
+train_dataloader = dict(dataset=dict(times=8))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.067, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=25,
+        by_epoch=True,
+        milestones=[22, 24],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/common/lsj-200e_coco-instance.py b/mmde/mmdet/.mim/configs/common/lsj-200e_coco-instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..af3e4bf160c01045c6e36d67bdee796e7bf96cd3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/lsj-200e_coco-instance.py
@@ -0,0 +1,18 @@
+_base_ = './lsj-100e_coco-instance.py'
+
+# 8x25=200e
+train_dataloader = dict(dataset=dict(times=8))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.067, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=25,
+        by_epoch=True,
+        milestones=[22, 24],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/common/ms-90k_coco.py b/mmde/mmdet/.mim/configs/common/ms-90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2d6c3dafb61d59bbbe9d0c6188a1bbff3b736b3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/ms-90k_coco.py
@@ -0,0 +1,122 @@
+_base_ = '../_base_/default_runtime.py'
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+# Align with Detectron2
+backend = 'pillow'
+train_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        backend_args=backend_args,
+        imdecode_backend=backend),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True,
+        backend=backend),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        backend_args=backend_args,
+        imdecode_backend=backend),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    pin_memory=True,
+    sampler=dict(type='InfiniteSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    pin_memory=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# training schedule for 90k
+max_iter = 90000
+train_cfg = dict(
+    type='IterBasedTrainLoop', max_iters=max_iter, val_interval=10000)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[60000, 80000],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
+
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000))
+log_processor = dict(by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/common/ms-poly-90k_coco-instance.py b/mmde/mmdet/.mim/configs/common/ms-poly-90k_coco-instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5566b3c3b8bfe0a49c8c062fb0fc972d5ae1f55
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/ms-poly-90k_coco-instance.py
@@ -0,0 +1,130 @@
+_base_ = '../_base_/default_runtime.py'
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+# Align with Detectron2
+backend = 'pillow'
+train_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        backend_args=backend_args,
+        imdecode_backend=backend),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True,
+        backend=backend),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        backend_args=backend_args,
+        imdecode_backend=backend),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    pin_memory=True,
+    sampler=dict(type='InfiniteSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    pin_memory=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# training schedule for 90k
+max_iter = 90000
+train_cfg = dict(
+    type='IterBasedTrainLoop', max_iters=max_iter, val_interval=10000)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[60000, 80000],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
+
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000))
+log_processor = dict(by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/common/ms-poly_3x_coco-instance.py b/mmde/mmdet/.mim/configs/common/ms-poly_3x_coco-instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..04072f9b84c06d546767649f7e17736444db7ce2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/ms-poly_3x_coco-instance.py
@@ -0,0 +1,118 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='RandomResize', scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric=['bbox', 'segm'],
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# training schedule for 3x with `RepeatDataset`
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+# Experiments show that using milestones=[9, 11] has higher performance
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[9, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/common/ms_3x_coco-instance.py b/mmde/mmdet/.mim/configs/common/ms_3x_coco-instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..f80cf88e9b1e770dce3157abc852aea996eec624
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/ms_3x_coco-instance.py
@@ -0,0 +1,108 @@
+_base_ = '../_base_/default_runtime.py'
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomResize', scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# training schedule for 3x with `RepeatDataset`
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+# Experiments show that using milestones=[9, 11] has higher performance
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[9, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/common/ms_3x_coco.py b/mmde/mmdet/.mim/configs/common/ms_3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..facbb34cf05088d8832502d3c9a38d812d328308
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/ms_3x_coco.py
@@ -0,0 +1,108 @@
+_base_ = '../_base_/default_runtime.py'
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# training schedule for 3x with `RepeatDataset`
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+# Experiments show that using milestones=[9, 11] has higher performance
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[9, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/common/ssj_270k_coco-instance.py b/mmde/mmdet/.mim/configs/common/ssj_270k_coco-instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..7407644fd59bb03d6e0afde83f8893a351ddc356
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/ssj_270k_coco-instance.py
@@ -0,0 +1,125 @@
+_base_ = '../_base_/default_runtime.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+image_size = (1024, 1024)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+# Standard Scale Jittering (SSJ) resizes and crops an image
+# with a resize range of 0.8 to 1.25 of the original image size.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.8, 1.25),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='InfiniteSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# The model is trained by 270k iterations with batch_size 64,
+# which is roughly equivalent to 144 epochs.
+
+max_iters = 270000
+train_cfg = dict(
+    type='IterBasedTrainLoop', max_iters=max_iters, val_interval=10000)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# optimizer assumes bs=64
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004))
+
+# learning rate policy
+# lr steps at [0.9, 0.95, 0.975] of the maximum iterations
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=270000,
+        by_epoch=False,
+        milestones=[243000, 256500, 263250],
+        gamma=0.1)
+]
+
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000))
+log_processor = dict(by_epoch=False)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/common/ssj_scp_270k_coco-instance.py b/mmde/mmdet/.mim/configs/common/ssj_scp_270k_coco-instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..06159dd40312ec935ac383701fa7b052b863e1bf
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/common/ssj_scp_270k_coco-instance.py
@@ -0,0 +1,60 @@
+_base_ = 'ssj_270k_coco-instance.py'
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+image_size = (1024, 1024)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+# Standard Scale Jittering (SSJ) resizes and crops an image
+# with a resize range of 0.8 to 1.25 of the original image size.
+load_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.8, 1.25),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=image_size),
+]
+train_pipeline = [
+    dict(type='CopyPaste', max_num_pasted=100),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='MultiImageMixDataset',
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=load_pipeline,
+            backend_args=backend_args),
+        pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance.py b/mmde/mmdet/.mim/configs/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..39639d874cbeb54b64a2789f251f1f6dad585ce3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance.py
@@ -0,0 +1,85 @@
+_base_ = '../common/ms-poly-90k_coco-instance.py'
+
+# model settings
+model = dict(
+    type='CondInst',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',  # use P5
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='CondInstBboxHead',
+        num_params=169,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        dcn_on_last_conv=False,
+        center_sampling=True,
+        conv_bias=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    mask_head=dict(
+        type='CondInstMaskHead',
+        num_layers=3,
+        feat_channels=8,
+        size_of_interest=8,
+        mask_out_stride=4,
+        max_masks_to_train=300,
+        mask_feature_head=dict(
+            in_channels=256,
+            feat_channels=128,
+            start_level=0,
+            end_level=2,
+            out_channels=8,
+            mask_stride=8,
+            num_stacked_convs=4,
+            norm_cfg=dict(type='BN', requires_grad=True)),
+        loss_mask=dict(
+            type='DiceLoss',
+            use_sigmoid=True,
+            activate=True,
+            eps=5e-6,
+            loss_weight=1.0)),
+    # model training and testing settings
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100,
+        mask_thr=0.5))
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/condinst/metafile.yml b/mmde/mmdet/.mim/configs/condinst/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1237b74d77a8b1f1e4b0ba74c6bdc5e5595d9816
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/condinst/metafile.yml
@@ -0,0 +1,32 @@
+Collections:
+  - Name: CondInst
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - FPN
+        - FCOS
+        - ResNet
+    Paper: https://arxiv.org/abs/2003.05664
+    README: configs/condinst/README.md
+
+Models:
+  - Name: condinst_r50_fpn_ms-poly-90k_coco_instance
+    In Collection: CondInst
+    Config: configs/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance.py
+    Metadata:
+      Training Memory (GB): 4.4
+      Iterations: 90000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.0
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance/condinst_r50_fpn_ms-poly-90k_coco_instance_20221129_125223-4c186406.pth
diff --git a/mmde/mmdet/.mim/configs/conditional_detr/conditional-detr_r50_8xb2-50e_coco.py b/mmde/mmdet/.mim/configs/conditional_detr/conditional-detr_r50_8xb2-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a21476448d0cbab6b6e4b94aa46d686e38667879
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/conditional_detr/conditional-detr_r50_8xb2-50e_coco.py
@@ -0,0 +1,42 @@
+_base_ = ['../detr/detr_r50_8xb2-150e_coco.py']
+model = dict(
+    type='ConditionalDETR',
+    num_queries=300,
+    decoder=dict(
+        num_layers=6,
+        layer_cfg=dict(
+            self_attn_cfg=dict(
+                _delete_=True,
+                embed_dims=256,
+                num_heads=8,
+                attn_drop=0.1,
+                cross_attn=False),
+            cross_attn_cfg=dict(
+                _delete_=True,
+                embed_dims=256,
+                num_heads=8,
+                attn_drop=0.1,
+                cross_attn=True))),
+    bbox_head=dict(
+        type='ConditionalDETRHead',
+        loss_cls=dict(
+            _delete_=True,
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='FocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])))
+
+# learning policy
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=50, val_interval=1)
+
+param_scheduler = [dict(type='MultiStepLR', end=50, milestones=[40])]
diff --git a/mmde/mmdet/.mim/configs/conditional_detr/metafile.yml b/mmde/mmdet/.mim/configs/conditional_detr/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..83f5532ce380c903d644b36055c4c2610455472a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/conditional_detr/metafile.yml
@@ -0,0 +1,32 @@
+Collections:
+  - Name: Conditional DETR
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ResNet
+        - Transformer
+    Paper:
+      URL: https://arxiv.org/abs/2108.06152
+      Title: 'Conditional DETR for Fast Training Convergence'
+    README: configs/conditional_detr/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/f4112c9e5611468ffbd57cfba548fd1289264b52/mmdet/models/detectors/conditional_detr.py#L14
+      Version: v3.0.0rc6
+
+Models:
+  - Name: conditional-detr_r50_8xb2-50e_coco
+    In Collection: Conditional DETR
+    Config: configs/conditional_detr/conditional-detr_r50_8xb2-50e_coco.py
+    Metadata:
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/conditional_detr/conditional-detr_r50_8xb2-50e_coco/conditional-detr_r50_8xb2-50e_coco_20221121_180202-c83a1dc0.pth
diff --git a/mmde/mmdet/.mim/configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py b/mmde/mmdet/.mim/configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a5fbedcaa78636f11a5718f1123d33e7e2ac273
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py
@@ -0,0 +1,26 @@
+_base_ = './cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py'  # noqa
+
+# please install mmpretrain
+# import mmpretrain.models to trigger register_module in mmpretrain
+custom_imports = dict(
+    imports=['mmpretrain.models'], allow_failed_imports=False)
+checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-small_3rdparty_32xb128-noema_in1k_20220301-303e75e3.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmpretrain.ConvNeXt',
+        arch='small',
+        out_indices=[0, 1, 2, 3],
+        drop_path_rate=0.6,
+        layer_scale_init_value=1.0,
+        gap_before_final_norm=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint=checkpoint_file,
+            prefix='backbone.')))
+
+optim_wrapper = dict(paramwise_cfg={
+    'decay_rate': 0.7,
+    'decay_type': 'layer_wise',
+    'num_layers': 12
+})
diff --git a/mmde/mmdet/.mim/configs/convnext/cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py b/mmde/mmdet/.mim/configs/convnext/cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c92f86838c31710dd550c36d9abc11d79bb6e2eb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/convnext/cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py
@@ -0,0 +1,154 @@
+_base_ = [
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmpretrain
+# import mmpretrain.models to trigger register_module in mmpretrain
+custom_imports = dict(
+    imports=['mmpretrain.models'], allow_failed_imports=False)
+checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmpretrain.ConvNeXt',
+        arch='tiny',
+        out_indices=[0, 1, 2, 3],
+        drop_path_rate=0.4,
+        layer_scale_init_value=1.0,
+        gap_before_final_norm=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint=checkpoint_file,
+            prefix='backbone.')),
+    neck=dict(in_channels=[96, 192, 384, 768]),
+    roi_head=dict(bbox_head=[
+        dict(
+            type='ConvFCBBoxHead',
+            num_shared_convs=4,
+            num_shared_fcs=1,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            reg_decoded_bbox=True,
+            norm_cfg=dict(type='SyncBN', requires_grad=True),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
+        dict(
+            type='ConvFCBBoxHead',
+            num_shared_convs=4,
+            num_shared_fcs=1,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.05, 0.05, 0.1, 0.1]),
+            reg_class_agnostic=False,
+            reg_decoded_bbox=True,
+            norm_cfg=dict(type='SyncBN', requires_grad=True),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='GIoULoss', loss_weight=10.0)),
+        dict(
+            type='ConvFCBBoxHead',
+            num_shared_convs=4,
+            num_shared_fcs=1,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.033, 0.033, 0.067, 0.067]),
+            reg_class_agnostic=False,
+            reg_decoded_bbox=True,
+            norm_cfg=dict(type='SyncBN', requires_grad=True),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='GIoULoss', loss_weight=10.0))
+    ]))
+
+# augmentation strategy originates from DETR / Sparse RCNN
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[[
+            dict(
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
+                keep_ratio=True)
+        ],
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+max_epochs = 36
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
+
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    constructor='LearningRateDecayOptimizerConstructor',
+    paramwise_cfg={
+        'decay_rate': 0.7,
+        'decay_type': 'layer_wise',
+        'num_layers': 6
+    },
+    optimizer=dict(
+        _delete_=True,
+        type='AdamW',
+        lr=0.0002,
+        betas=(0.9, 0.999),
+        weight_decay=0.05))
diff --git a/mmde/mmdet/.mim/configs/convnext/mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py b/mmde/mmdet/.mim/configs/convnext/mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5792b5b5c5a03c85a7d69040dd9a0b5381bc7995
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/convnext/mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py
@@ -0,0 +1,96 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmpretrain
+# import mmpretrain.models to trigger register_module in mmpretrain
+custom_imports = dict(
+    imports=['mmpretrain.models'], allow_failed_imports=False)
+checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmpretrain.ConvNeXt',
+        arch='tiny',
+        out_indices=[0, 1, 2, 3],
+        drop_path_rate=0.4,
+        layer_scale_init_value=1.0,
+        gap_before_final_norm=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint=checkpoint_file,
+            prefix='backbone.')),
+    neck=dict(in_channels=[96, 192, 384, 768]))
+
+# augmentation strategy originates from DETR / Sparse RCNN
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[[
+            dict(
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
+                keep_ratio=True)
+        ],
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+max_epochs = 36
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
+
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    constructor='LearningRateDecayOptimizerConstructor',
+    paramwise_cfg={
+        'decay_rate': 0.95,
+        'decay_type': 'layer_wise',
+        'num_layers': 6
+    },
+    optimizer=dict(
+        _delete_=True,
+        type='AdamW',
+        lr=0.0001,
+        betas=(0.9, 0.999),
+        weight_decay=0.05,
+    ))
diff --git a/mmde/mmdet/.mim/configs/convnext/metafile.yml b/mmde/mmdet/.mim/configs/convnext/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b9fd7506cf46896d6c5f2238b594d32558ed3195
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/convnext/metafile.yml
@@ -0,0 +1,93 @@
+Models:
+  - Name: mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/convnext/mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ConvNeXt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco_20220426_154953-050731f4.pth
+    Paper:
+      URL: https://arxiv.org/abs/2201.03545
+      Title: 'A ConvNet for the 2020s'
+    README: configs/convnext/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.16.0
+
+  - Name: cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/convnext/cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ConvNeXt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 43.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220509_204200-8f07c40b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2201.03545
+      Title: 'A ConvNet for the 2020s'
+    README: configs/convnext/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.25.0
+
+  - Name: cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco
+    In Collection: Cascade Mask R-CNN
+    Config: configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py
+    Metadata:
+      Training Memory (GB): 12.3
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Mixed Precision Training
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ConvNeXt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 51.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 44.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220510_201004-3d24f5a4.pth
+    Paper:
+      URL: https://arxiv.org/abs/2201.03545
+      Title: 'A ConvNet for the 2020s'
+    README: configs/convnext/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.25.0
diff --git a/mmde/mmdet/.mim/configs/cornernet/cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py b/mmde/mmdet/.mim/configs/cornernet/cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..76339163b618a5a9d41a542ec75192aedb409eea
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cornernet/cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py
@@ -0,0 +1,8 @@
+_base_ = './cornernet_hourglass104_8xb6-210e-mstest_coco.py'
+
+train_dataloader = dict(batch_size=5)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (10 GPUs) x (5 samples per GPU)
+auto_scale_lr = dict(base_batch_size=50)
diff --git a/mmde/mmdet/.mim/configs/cornernet/cornernet_hourglass104_32xb3-210e-mstest_coco.py b/mmde/mmdet/.mim/configs/cornernet/cornernet_hourglass104_32xb3-210e-mstest_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..51a4740318a1d85a62b6b4482c53808c98fb8a62
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cornernet/cornernet_hourglass104_32xb3-210e-mstest_coco.py
@@ -0,0 +1,8 @@
+_base_ = './cornernet_hourglass104_8xb6-210e-mstest_coco.py'
+
+train_dataloader = dict(batch_size=3)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (3 samples per GPU)
+auto_scale_lr = dict(base_batch_size=96)
diff --git a/mmde/mmdet/.mim/configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py b/mmde/mmdet/.mim/configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdb46fff164f796d9333c123deb701c341bdc1e3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py
@@ -0,0 +1,183 @@
+_base_ = [
+    '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py'
+]
+
+data_preprocessor = dict(
+    type='DetDataPreprocessor',
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True)
+
+# model settings
+model = dict(
+    type='CornerNet',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='HourglassNet',
+        downsample_times=5,
+        num_stacks=2,
+        stage_channels=[256, 256, 384, 384, 384, 512],
+        stage_blocks=[2, 2, 2, 2, 2, 4],
+        norm_cfg=dict(type='BN', requires_grad=True)),
+    neck=None,
+    bbox_head=dict(
+        type='CornerHead',
+        num_classes=80,
+        in_channels=256,
+        num_feat_levels=2,
+        corner_emb_channels=1,
+        loss_heatmap=dict(
+            type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1),
+        loss_embedding=dict(
+            type='AssociativeEmbeddingLoss',
+            pull_weight=0.10,
+            push_weight=0.10),
+        loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1)),
+    # training and testing settings
+    train_cfg=None,
+    test_cfg=dict(
+        corner_topk=100,
+        local_maximum_kernel=3,
+        distance_threshold=0.5,
+        score_thr=0.05,
+        max_per_img=100,
+        nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian')))
+
+# data settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        # The cropped images are padded into squares during training,
+        # but may be smaller than crop_size.
+        type='RandomCenterCropPad',
+        crop_size=(511, 511),
+        ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3),
+        test_mode=False,
+        test_pad_mode=None,
+        mean=data_preprocessor['mean'],
+        std=data_preprocessor['std'],
+        # Image data is not converted to rgb.
+        to_rgb=data_preprocessor['bgr_to_rgb']),
+    # Make sure the output is always crop_size.
+    dict(type='Resize', scale=(511, 511), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs'),
+]
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        to_float32=True,
+        backend_args=_base_.backend_args,
+    ),
+    # don't need Resize
+    dict(
+        type='RandomCenterCropPad',
+        crop_size=None,
+        ratios=None,
+        border=None,
+        test_mode=True,
+        test_pad_mode=['logical_or', 127],
+        mean=data_preprocessor['mean'],
+        std=data_preprocessor['std'],
+        # Image data is not converted to rgb.
+        to_rgb=data_preprocessor['bgr_to_rgb']),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'border'))
+]
+
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=3,
+    batch_sampler=None,
+    dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='Adam', lr=0.0005),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+max_epochs = 210
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[180],
+        gamma=0.1)
+]
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (6 samples per GPU)
+auto_scale_lr = dict(base_batch_size=48)
+
+tta_model = dict(
+    type='DetTTAModel',
+    tta_cfg=dict(
+        nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'),
+        max_per_img=100))
+
+tta_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        to_float32=True,
+        backend_args=_base_.backend_args),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                # ``RandomFlip`` must be placed before ``RandomCenterCropPad``,
+                # otherwise bounding box coordinates after flipping cannot be
+                # recovered correctly.
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='RandomCenterCropPad',
+                    crop_size=None,
+                    ratios=None,
+                    border=None,
+                    test_mode=True,
+                    test_pad_mode=['logical_or', 127],
+                    mean=data_preprocessor['mean'],
+                    std=data_preprocessor['std'],
+                    # Image data is not converted to rgb.
+                    to_rgb=data_preprocessor['bgr_to_rgb'])
+            ],
+            [dict(type='LoadAnnotations', with_bbox=True)],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'flip', 'flip_direction', 'border'))
+            ]
+        ])
+]
diff --git a/mmde/mmdet/.mim/configs/cornernet/metafile.yml b/mmde/mmdet/.mim/configs/cornernet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f915cf37e8e157405a66431dfb21595db319b8b6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/cornernet/metafile.yml
@@ -0,0 +1,83 @@
+Collections:
+  - Name: CornerNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - Adam
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Corner Pooling
+        - Stacked Hourglass Network
+    Paper:
+      URL: https://arxiv.org/abs/1808.01244
+      Title: 'CornerNet: Detecting Objects as Paired Keypoints'
+    README: configs/cornernet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.3.0/mmdet/models/detectors/cornernet.py#L9
+      Version: v2.3.0
+
+Models:
+  - Name: cornernet_hourglass104_10xb5-crop511-210e-mstest_coco
+    In Collection: CornerNet
+    Config: configs/cornernet/cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py
+    Metadata:
+      Training Resources: 10x V100 GPUs
+      Batch Size: 50
+      Training Memory (GB): 13.9
+      inference time (ms/im):
+        - value: 238.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 210
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720-5fefbf1c.pth
+
+  - Name: cornernet_hourglass104_8xb6-210e-mstest_coco
+    In Collection: CornerNet
+    Config: configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py
+    Metadata:
+      Batch Size: 48
+      Training Memory (GB): 15.9
+      inference time (ms/im):
+        - value: 238.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 210
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618-79b44c30.pth
+
+  - Name: cornernet_hourglass104_32xb3-210e-mstest_coco
+    In Collection: CornerNet
+    Config: configs/cornernet/cornernet_hourglass104_32xb3-210e-mstest_coco.py
+    Metadata:
+      Training Resources: 32x V100 GPUs
+      Batch Size: 96
+      Training Memory (GB): 9.5
+      inference time (ms/im):
+        - value: 256.41
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 210
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110-1efaea91.pth
diff --git a/mmde/mmdet/.mim/configs/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py b/mmde/mmdet/.mim/configs/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py
new file mode 100644
index 0000000000000000000000000000000000000000..8815be77d49cf77afff6f888ee225e928e43b402
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py
@@ -0,0 +1,227 @@
+_base_ = ['../_base_/default_runtime.py']
+
+model = dict(
+    type='CrowdDet',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False,
+        pad_size_divisor=64,
+        # This option is set according to https://github.com/Purkialo/CrowdDet/
+        # blob/master/lib/data/CrowdHuman.py The images in the entire batch are
+        # resize together.
+        batch_augments=[
+            dict(type='BatchResize', scale=(1400, 800), pad_size_divisor=64)
+        ]),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5,
+        upsample_cfg=dict(mode='bilinear', align_corners=False)),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[1.0, 2.0, 3.0],
+            strides=[4, 8, 16, 32, 64],
+            centers=[(8, 8), (8, 8), (8, 8), (8, 8), (8, 8)]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[0.0, 0.0, 0.0, 0.0],
+            target_stds=[1.0, 1.0, 1.0, 1.0],
+            clip_border=False),
+        loss_cls=dict(type='CrossEntropyLoss', loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='MultiInstanceRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=7,
+                sampling_ratio=-1,
+                aligned=True,
+                use_torchvision=True),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='MultiInstanceBBoxHead',
+            with_refine=False,
+            num_shared_fcs=2,
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=1,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss',
+                loss_weight=1.0,
+                use_sigmoid=False,
+                reduction='none'),
+            loss_bbox=dict(
+                type='SmoothL1Loss', loss_weight=1.0, reduction='none'))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=(0.3, 0.7),
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2400,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=2),
+        rcnn=dict(
+            assigner=dict(
+                type='MultiInstanceAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.3,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='MultiInsRandomSampler',
+                num=512,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1200,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=2),
+        rcnn=dict(
+            nms=dict(type='nms', iou_threshold=0.5),
+            score_thr=0.01,
+            max_per_img=500)))
+
+dataset_type = 'CrowdHumanDataset'
+data_root = 'data/CrowdHuman/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/tracking/CrowdHuman/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/tracking/',
+#         'data/': 's3://openmmlab/datasets/tracking/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
+                   'flip_direction'))
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1400, 800), keep_ratio=True),
+    # avoid bboxes being resized
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=None,  # The 'batch_sampler' may decrease the precision
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotation_train.odgt',
+        data_prefix=dict(img='Images/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotation_val.odgt',
+        data_prefix=dict(img='Images/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CrowdHumanMetric',
+    ann_file=data_root + 'annotation_val.odgt',
+    metric=['AP', 'MR', 'JI'],
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=30, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=800),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=30,
+        by_epoch=True,
+        milestones=[24, 27],
+        gamma=0.1)
+]
+
+# optimizer
+auto_scale_lr = dict(base_batch_size=16)
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.002, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py b/mmde/mmdet/.mim/configs/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py
new file mode 100644
index 0000000000000000000000000000000000000000..80277ce1c1436c37c4e2a4d13293d0ecb8ba4722
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py
@@ -0,0 +1,3 @@
+_base_ = './crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py'
+
+model = dict(roi_head=dict(bbox_head=dict(with_refine=True)))
diff --git a/mmde/mmdet/.mim/configs/crowddet/metafile.yml b/mmde/mmdet/.mim/configs/crowddet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4f191dea9cc599f64091434152000e67289f9180
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/crowddet/metafile.yml
@@ -0,0 +1,47 @@
+Collections:
+  - Name: CrowdDet
+    Metadata:
+      Training Data: CrowdHuman
+      Training Techniques:
+        - SGD
+        - EMD Loss
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - FPN
+        - RPN
+        - ResNet
+        - RoIPool
+    Paper:
+      URL: https://arxiv.org/abs/2003.09163
+      Title: 'Detection in Crowded Scenes: One Proposal, Multiple Predictions'
+    README: configs/crowddet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v3.0.0rc3/mmdet/models/detectors/crowddet.py
+      Version: v3.0.0rc3
+
+Models:
+  - Name: crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman
+    In Collection: CrowdDet
+    Config: configs/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py
+    Metadata:
+      Training Memory (GB): 4.8
+      Epochs: 30
+    Results:
+      - Task: Object Detection
+        Dataset: CrowdHuman
+        Metrics:
+          box AP: 90.32
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman_20221024_215917-45602806.pth
+
+  - Name: crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman
+    In Collection: CrowdDet
+    Config: configs/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py
+    Metadata:
+      Training Memory (GB): 4.4
+      Epochs: 30
+    Results:
+      - Task: Object Detection
+        Dataset: CrowdHuman
+        Metrics:
+          box AP: 90.0
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman_20221023_174954-dc319c2d.pth
diff --git a/mmde/mmdet/.mim/configs/dab_detr/dab-detr_r50_8xb2-50e_coco.py b/mmde/mmdet/.mim/configs/dab_detr/dab-detr_r50_8xb2-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..314ed97e2d80ae3c95119abf9166f95d416c010e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dab_detr/dab-detr_r50_8xb2-50e_coco.py
@@ -0,0 +1,159 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='DABDETR',
+    num_queries=300,
+    with_random_refpoints=False,
+    num_patterns=0,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=1),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3, ),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        norm_cfg=None,
+        num_outs=1),
+    encoder=dict(
+        num_layers=6,
+        layer_cfg=dict(
+            self_attn_cfg=dict(
+                embed_dims=256, num_heads=8, dropout=0., batch_first=True),
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,
+                num_fcs=2,
+                ffn_drop=0.,
+                act_cfg=dict(type='PReLU')))),
+    decoder=dict(
+        num_layers=6,
+        query_dim=4,
+        query_scale_type='cond_elewise',
+        with_modulated_hw_attn=True,
+        layer_cfg=dict(
+            self_attn_cfg=dict(
+                embed_dims=256,
+                num_heads=8,
+                attn_drop=0.,
+                proj_drop=0.,
+                cross_attn=False),
+            cross_attn_cfg=dict(
+                embed_dims=256,
+                num_heads=8,
+                attn_drop=0.,
+                proj_drop=0.,
+                cross_attn=True),
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,
+                num_fcs=2,
+                ffn_drop=0.,
+                act_cfg=dict(type='PReLU'))),
+        return_intermediate=True),
+    positional_encoding=dict(num_feats=128, temperature=20, normalize=True),
+    bbox_head=dict(
+        type='DABDETRHead',
+        num_classes=80,
+        embed_dims=256,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='FocalLossCost', weight=2., eps=1e-8),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=300))
+
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[[
+            dict(
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
+                keep_ratio=True)
+        ],
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}))
+
+# learning policy
+max_epochs = 50
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[40],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16, enable=False)
diff --git a/mmde/mmdet/.mim/configs/dab_detr/metafile.yml b/mmde/mmdet/.mim/configs/dab_detr/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..94383a0493b86a730181f78ab2f0e94a2ab2de73
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dab_detr/metafile.yml
@@ -0,0 +1,32 @@
+Collections:
+  - Name: DAB-DETR
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ResNet
+        - Transformer
+    Paper:
+      URL: https://arxiv.org/abs/2201.12329
+      Title: 'DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR'
+    README: configs/dab_detr/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/f4112c9e5611468ffbd57cfba548fd1289264b52/mmdet/models/detectors/dab_detr.py#L15
+      Version: v3.0.0rc6
+
+Models:
+  - Name: dab-detr_r50_8xb2-50e_coco
+    In Collection: DAB-DETR
+    Config: configs/dab_detr/dab-detr_r50_8xb2-50e_coco.py
+    Metadata:
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/dab_detr/dab-detr_r50_8xb2-50e_coco/dab-detr_r50_8xb2-50e_coco_20221122_120837-c1035c8c.pth
diff --git a/mmde/mmdet/.mim/configs/dcn/cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c0ff9890e82bd0c1ee4e445e37d2c7afa534161
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcn/cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfcc5e73cc508e11d77c5a3557f30632b545b803
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..48b25f62125da09368c446bcd6ccff9b0219a7cc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcn/cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a942da754119b8d913f807907322a3d96c83ff8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcn/cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6bf5b7998a972f41b52f90955ef52977adfd68c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcn/faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..db44e7e87b2d11555140ab2c8a19f32e1ce65770
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..95f20467af60167a4a61f253e4354dadd832ccc7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c65ce5fd0267dc892455da6495cd3be9f1f99fcf
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py
@@ -0,0 +1,12 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                _delete_=True,
+                type='DeformRoIPoolPack',
+                output_size=7,
+                output_channels=256),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32])))
diff --git a/mmde/mmdet/.mim/configs/dcn/faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ed832f5e7ff0d050be33e57d2fa611e9ae7e8e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/dcn/mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f36714a5301823ca401820ab9d926374428ee70
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b281d417b4f6a7320201da261e5fdf6950556a1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py b/mmde/mmdet/.mim/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d01594314aad74bc47d7331c42a39f2ca453071
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
+
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# optim_wrapper = dict(type='AmpOptimWrapper')
+_base_.optim_wrapper.type = 'AmpOptimWrapper'
diff --git a/mmde/mmdet/.mim/configs/dcn/metafile.yml b/mmde/mmdet/.mim/configs/dcn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4aa35b5d95f7f531cc2bdb8a03553ae197cfe727
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcn/metafile.yml
@@ -0,0 +1,272 @@
+Collections:
+  - Name: Deformable Convolutional Networks
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Deformable Convolution
+    Paper:
+      URL: https://arxiv.org/abs/1703.06211
+      Title: "Deformable Convolutional Networks"
+    README: configs/dcn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/dcn/deform_conv.py#L15
+      Version: v2.0.0
+
+Models:
+  - Name: faster-rcnn_r50_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 56.18
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-d68aed1e.pth
+
+  - Name: faster-rcnn_r50_fpn_dpool_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      inference time (ms/im):
+        - value: 58.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307-90d3c01d.pth
+
+  - Name: faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 80
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-1377f13d.pth
+
+  - Name: faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      inference time (ms/im):
+        - value: 100
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203-4f85c69c.pth
+
+  - Name: mask-rcnn_r50_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      inference time (ms/im):
+        - value: 64.94
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203-4d9ad43b.pth
+
+  - Name: mask-rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py
+    Metadata:
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      Training Memory (GB): 3.0
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247-c06429d2.pth
+
+  - Name: mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      inference time (ms/im):
+        - value: 85.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216-a71f5bce.pth
+
+  - Name: cascade-rcnn_r50_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      inference time (ms/im):
+        - value: 68.49
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-2f1fca44.pth
+
+  - Name: cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-3b2f0594.pth
+
+  - Name: cascade-mask-rcnn_r50_fpn_dconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 100
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202-42e767a2.pth
+
+  - Name: cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.0
+      inference time (ms/im):
+        - value: 116.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204-df0c5f10.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco
+    In Collection: Deformable Convolutional Networks
+    Config: configs/dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-e75f90c8.pth
diff --git a/mmde/mmdet/.mim/configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7f7e4eecaf74418690975d54d09eeb0e31f9a1f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcnv2/faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcnv2/faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c58dbed3782403a5fac3c6809598372e47cd72c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcnv2/faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=4, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py b/mmde/mmdet/.mim/configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6198d6d7d72f8d012c777330f1116b46b89290be
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py
@@ -0,0 +1,12 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                _delete_=True,
+                type='ModulatedDeformRoIPoolPack',
+                output_size=7,
+                output_channels=256),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32])))
diff --git a/mmde/mmdet/.mim/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7a90bbf31bea3663820caa4541de3ceafeb7366
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py b/mmde/mmdet/.mim/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b3894c2d61ee3208170235ba1aa98def79a7120
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
+
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# optim_wrapper = dict(type='AmpOptimWrapper')
+_base_.optim_wrapper.type = 'AmpOptimWrapper'
diff --git a/mmde/mmdet/.mim/configs/dcnv2/metafile.yml b/mmde/mmdet/.mim/configs/dcnv2/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..dea7bfa1b531410f3c81693d7012a835781a63ca
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dcnv2/metafile.yml
@@ -0,0 +1,123 @@
+Collections:
+  - Name: Deformable Convolutional Networks v2
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Deformable Convolution
+    Paper:
+      URL: https://arxiv.org/abs/1811.11168
+      Title: "Deformable ConvNets v2: More Deformable, Better Results"
+    README: configs/dcnv2/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/dcn/deform_conv.py#L15
+      Version: v2.0.0
+
+Models:
+  - Name: faster-rcnn_r50_fpn_mdconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      inference time (ms/im):
+        - value: 56.82
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130-d099253b.pth
+
+  - Name: faster-rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      inference time (ms/im):
+        - value: 57.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130-01262257.pth
+
+  - Name: faster-rcnn_r50_fpn_mdpool_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.8
+      inference time (ms/im):
+        - value: 60.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307-c0df27ff.pth
+
+  - Name: mask-rcnn_r50_fpn_mdconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      inference time (ms/im):
+        - value: 66.23
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203-ad97591f.pth
+
+  - Name: mask-rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco
+    In Collection: Deformable Convolutional Networks v2
+    Config: configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.1
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434-cf8fefa5.pth
diff --git a/mmde/mmdet/.mim/configs/ddod/ddod_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/ddod/ddod_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..fed1116b1f92e613517a57aa196839e4de3037dc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ddod/ddod_r50_fpn_1x_coco.py
@@ -0,0 +1,72 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='DDOD',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='DDODHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_iou=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    train_cfg=dict(
+        # assigner is mean cls_assigner
+        assigner=dict(type='ATSSAssigner', topk=9, alpha=0.8),
+        reg_assigner=dict(type='ATSSAssigner', topk=9, alpha=0.5),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/ddod/metafile.yml b/mmde/mmdet/.mim/configs/ddod/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c22395002bd614cd0e75d753320c3f9e7ce54bd1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ddod/metafile.yml
@@ -0,0 +1,33 @@
+Collections:
+  - Name: DDOD
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - DDOD
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/pdf/2107.02963.pdf
+      Title: 'Disentangle Your Dense Object Detector'
+    README: configs/ddod/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.25.0/mmdet/models/detectors/ddod.py#L6
+      Version: v2.25.0
+
+Models:
+  - Name: ddod_r50_fpn_1x_coco
+    In Collection: DDOD
+    Config: configs/ddod/ddod_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.4
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ddod/ddod_r50_fpn_1x_coco/ddod_r50_fpn_1x_coco_20220523_223737-29b2fc67.pth
diff --git a/mmde/mmdet/.mim/configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py b/mmde/mmdet/.mim/configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e64afc087e1ed68b8b5d1474127c832f893cb9b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py
@@ -0,0 +1,170 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='DDQDETR',
+    num_queries=900,  # num_matching_queries
+    # ratio of num_dense queries to num_queries
+    dense_topk_ratio=1.5,
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=1),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[512, 1024, 2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    # encoder class name: DeformableDetrTransformerEncoder
+    encoder=dict(
+        num_layers=6,
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4,
+                               dropout=0.0),  # 0.1 for DeformDETR
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,  # 1024 for DeformDETR
+                ffn_drop=0.0))),  # 0.1 for DeformDETR
+    # decoder class name: DDQTransformerDecoder
+    decoder=dict(
+        # `num_layers` >= 2, because attention masks of the last
+        #   `num_layers` - 1 layers are used for distinct query selection
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_heads=8,
+                               dropout=0.0),  # 0.1 for DeformDETR
+            cross_attn_cfg=dict(embed_dims=256, num_levels=4,
+                                dropout=0.0),  # 0.1 for DeformDETR
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,  # 1024 for DeformDETR
+                ffn_drop=0.0)),  # 0.1 for DeformDETR
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128,
+        normalize=True,
+        offset=0.0,  # -0.5 for DeformDETR
+        temperature=20),  # 10000 for DeformDETR
+    bbox_head=dict(
+        type='DDQDETRHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    dn_cfg=dict(
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,
+        group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)),
+    dqs_cfg=dict(type='nms', iou_threshold=0.8),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='FocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=300))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.05),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}))
+
+# learning policy
+max_epochs = 12
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.0001,
+        by_epoch=False,
+        begin=0,
+        end=2000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/ddq/ddq-detr-4scale_swinl_8xb2-30e_coco.py b/mmde/mmdet/.mim/configs/ddq/ddq-detr-4scale_swinl_8xb2-30e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d863649411e3157373961b3da339990df1e6f267
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ddq/ddq-detr-4scale_swinl_8xb2-30e_coco.py
@@ -0,0 +1,177 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa: E501
+model = dict(
+    type='DDQDETR',
+    num_queries=900,  # num_matching_queries
+    # ratio of num_dense queries to num_queries
+    dense_topk_ratio=1.5,
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=1),
+    backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=384,
+        embed_dims=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[384, 768, 1536],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    # encoder class name: DeformableDetrTransformerEncoder
+    encoder=dict(
+        num_layers=6,
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4,
+                               dropout=0.0),  # 0.1 for DeformDETR
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,  # 1024 for DeformDETR
+                ffn_drop=0.0))),  # 0.1 for DeformDETR
+    # decoder class name: DDQTransformerDecoder
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_heads=8,
+                               dropout=0.0),  # 0.1 for DeformDETR
+            cross_attn_cfg=dict(embed_dims=256, num_levels=4,
+                                dropout=0.0),  # 0.1 for DeformDETR
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,  # 1024 for DeformDETR
+                ffn_drop=0.0)),  # 0.1 for DeformDETR
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128,
+        normalize=True,
+        offset=0.0,  # -0.5 for DeformDETR
+        temperature=20),  # 10000 for DeformDETR
+    bbox_head=dict(
+        type='DDQDETRHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    dn_cfg=dict(
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,
+        group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)),
+    dqs_cfg=dict(type='nms', iou_threshold=0.8),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='FocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=300))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.05),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.05)}))
+
+# learning policy
+max_epochs = 30
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.0001,
+        by_epoch=False,
+        begin=0,
+        end=2000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[20, 26],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/ddq/ddq-detr-5scale_r50_8xb2-12e_coco.py b/mmde/mmdet/.mim/configs/ddq/ddq-detr-5scale_r50_8xb2-12e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c38f553bdd46bc4e0611bbd0fd4bab0c1929825
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ddq/ddq-detr-5scale_r50_8xb2-12e_coco.py
@@ -0,0 +1,171 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='DDQDETR',
+    num_queries=900,  # num_matching_queries
+    # ratio of num_dense queries to num_queries
+    dense_topk_ratio=1.5,
+    with_box_refine=True,
+    as_two_stage=True,
+    num_feature_levels=5,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=1),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[256, 512, 1024, 2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=5),
+    # encoder class name: DeformableDetrTransformerEncoder
+    encoder=dict(
+        num_layers=6,
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=5,
+                               dropout=0.0),  # 0.1 for DeformDETR
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,  # 1024 for DeformDETR
+                ffn_drop=0.0))),  # 0.1 for DeformDETR
+    # decoder class name: DDQTransformerDecoder
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_heads=8,
+                               dropout=0.0),  # 0.1 for DeformDETR
+            cross_attn_cfg=dict(embed_dims=256, num_levels=5,
+                                dropout=0.0),  # 0.1 for DeformDETR
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,  # 1024 for DeformDETR
+                ffn_drop=0.0)),  # 0.1 for DeformDETR
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128,
+        normalize=True,
+        offset=0.0,  # -0.5 for DeformDETR
+        temperature=20),  # 10000 for DeformDETR
+    bbox_head=dict(
+        type='DDQDETRHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    dn_cfg=dict(
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,
+        group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)),
+    dqs_cfg=dict(type='nms', iou_threshold=0.8),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='FocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=300))
+
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.05),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}))
+
+# learning policy
+max_epochs = 12
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.0001,
+        by_epoch=False,
+        begin=0,
+        end=2000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/ddq/metafile.yml b/mmde/mmdet/.mim/configs/ddq/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..bd33abe1a5122885913a1e8cbee60cb48014239f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ddq/metafile.yml
@@ -0,0 +1,56 @@
+Collections:
+  - Name: DDQ
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ResNet
+        - Transformer
+    Paper:
+      URL: https://arxiv.org/abs/2303.12776
+      Title: 'Dense Distinct Query for End-to-End Object Detection'
+    README: configs/ddq/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/detectors/ddq_detr.py#L21
+      Version: dev-3.x
+
+Models:
+  - Name: ddq-detr-4scale_r50_8xb2-12e_coco
+    In Collection: DDQ
+    Config: configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 51.4
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq-detr-4scale_r50_8xb2-12e_coco/ddq-detr-4scale_r50_8xb2-12e_coco_20230809_170711-42528127.pth
+
+  - Name: ddq-detr-5scale_r50_8xb2-12e_coco
+    In Collection: DDQ
+    Config: configs/dino/ddq-detr-5scale_r50_8xb2-12e_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 52.1
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq_detr_5scale_coco_1x.pth
+
+  - Name: ddq-detr-4scale_swinl_8xb2-30e_coco
+    In Collection: DDQ
+    Config: configs/dino/ddq-detr-4scale_swinl_8xb2-30e_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 58.7
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq_detr_swinl_30e.pth
diff --git a/mmde/mmdet/.mim/configs/deepfashion/mask-rcnn_r50_fpn_15e_deepfashion.py b/mmde/mmdet/.mim/configs/deepfashion/mask-rcnn_r50_fpn_15e_deepfashion.py
new file mode 100644
index 0000000000000000000000000000000000000000..403b18a4ca8ed61aedcb99218ecc79302826ff8c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/deepfashion/mask-rcnn_r50_fpn_15e_deepfashion.py
@@ -0,0 +1,23 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/deepfashion.py', '../_base_/schedules/schedule_1x.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=15), mask_head=dict(num_classes=15)))
+# runtime settings
+max_epochs = 15
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..70d3393829b422740bfba5d1746c7651e9c2d69c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,85 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/mot_challenge.py', '../_base_/default_runtime.py'
+]
+
+default_hooks = dict(
+    logger=dict(type='LoggerHook', interval=1),
+    visualization=dict(type='TrackVisualizationHook', draw=False))
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+# custom hooks
+custom_hooks = [
+    # Synchronize model buffers such as running_mean and running_var in BN
+    # at the end of each epoch
+    dict(type='SyncBuffersHook')
+]
+
+detector = _base_.model
+detector.pop('data_preprocessor')
+detector.rpn_head.bbox_coder.update(dict(clip_border=False))
+detector.roi_head.bbox_head.update(dict(num_classes=1))
+detector.roi_head.bbox_head.bbox_coder.update(dict(clip_border=False))
+detector['init_cfg'] = dict(
+    type='Pretrained',
+    checkpoint=  # noqa: E251
+    'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/'
+    'faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth')
+del _base_.model
+
+model = dict(
+    type='DeepSORT',
+    data_preprocessor=dict(
+        type='TrackDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    detector=detector,
+    reid=dict(
+        type='BaseReID',
+        data_preprocessor=dict(type='mmpretrain.ClsDataPreprocessor'),
+        backbone=dict(
+            type='mmpretrain.ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(3, ),
+            style='pytorch'),
+        neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),
+        head=dict(
+            type='LinearReIDHead',
+            num_fcs=1,
+            in_channels=2048,
+            fc_channels=1024,
+            out_channels=128,
+            num_classes=380,
+            loss_cls=dict(type='mmpretrain.CrossEntropyLoss', loss_weight=1.0),
+            loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0),
+            norm_cfg=dict(type='BN1d'),
+            act_cfg=dict(type='ReLU')),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=  # noqa: E251
+            'https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth'  # noqa: E501
+        )),
+    tracker=dict(
+        type='SORTTracker',
+        motion=dict(type='KalmanFilter', center_only=False),
+        obj_score_thr=0.5,
+        reid=dict(
+            num_samples=10,
+            img_scale=(256, 128),
+            img_norm_cfg=None,
+            match_score_thr=2.0),
+        match_iou_thr=0.5,
+        momentums=None,
+        num_tentatives=2,
+        num_frames_retain=100))
+
+train_dataloader = None
+
+train_cfg = None
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/mmde/mmdet/.mim/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py b/mmde/mmdet/.mim/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py
new file mode 100644
index 0000000000000000000000000000000000000000..687ce7adfcc1742bab75cca939a99df37b43689c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py
@@ -0,0 +1,15 @@
+_base_ = [
+    './deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain'
+    '_test-mot17halfval.py'
+]
+
+# dataloader
+val_dataloader = dict(
+    dataset=dict(ann_file='annotations/train_cocoformat.json'))
+test_dataloader = dict(
+    dataset=dict(
+        ann_file='annotations/test_cocoformat.json',
+        data_prefix=dict(img_path='test')))
+
+# evaluator
+test_evaluator = dict(format_only=True, outfile_prefix='./mot_17_test_res')
diff --git a/mmde/mmdet/.mim/configs/deepsort/metafile.yml b/mmde/mmdet/.mim/configs/deepsort/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2feb358e93d1590f0305e2ed08ae40e18bbd6cb9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/deepsort/metafile.yml
@@ -0,0 +1,37 @@
+Collections:
+  - Name: DeepSORT
+    Metadata:
+      Training Techniques:
+        - SGD with Momentum
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - FPN
+    Paper:
+      URL: https://arxiv.org/abs/1703.07402
+      Title: Simple Online and Realtime Tracking with a Deep Association Metric
+    README: configs/deepsort/README.md
+
+Models:
+  - Name: deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval
+    In Collection: DeepSORT
+    Config: configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
+    Metadata:
+      Training Data: MOT17-half-train
+      inference time (ms/im):
+        - value: 72.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (640, 1088)
+    Results:
+      - Task: Multiple Object Tracking
+        Dataset: MOT17-half-val
+        Metrics:
+          MOTA: 63.7
+          IDF1: 69.5
+          HOTA: 57.0
+    Weights:
+      - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth
+      - https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth
diff --git a/mmde/mmdet/.mim/configs/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py b/mmde/mmdet/.mim/configs/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeb67fc98486cfd929a8177b9af6be3cdab9aa4b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py
@@ -0,0 +1,2 @@
+_base_ = 'deformable-detr-refine_r50_16xb2-50e_coco.py'
+model = dict(as_two_stage=True)
diff --git a/mmde/mmdet/.mim/configs/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco.py b/mmde/mmdet/.mim/configs/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b968674f4a9fc450803cdba018b0c4e9e6ca422a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco.py
@@ -0,0 +1,2 @@
+_base_ = 'deformable-detr_r50_16xb2-50e_coco.py'
+model = dict(with_box_refine=True)
diff --git a/mmde/mmdet/.mim/configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py b/mmde/mmdet/.mim/configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0dee411c8e27ab440ccc874e40f4207b24a21e7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py
@@ -0,0 +1,156 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='DeformableDETR',
+    num_queries=300,
+    num_feature_levels=4,
+    with_box_refine=False,
+    as_two_stage=False,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=1),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[512, 1024, 2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(  # DeformableDetrTransformerEncoder
+        num_layers=6,
+        layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+            self_attn_cfg=dict(  # MultiScaleDeformableAttention
+                embed_dims=256,
+                batch_first=True),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=1024, ffn_drop=0.1))),
+    decoder=dict(  # DeformableDetrTransformerDecoder
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(  # DeformableDetrTransformerDecoderLayer
+            self_attn_cfg=dict(  # MultiheadAttention
+                embed_dims=256,
+                num_heads=8,
+                dropout=0.1,
+                batch_first=True),
+            cross_attn_cfg=dict(  # MultiScaleDeformableAttention
+                embed_dims=256,
+                batch_first=True),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=1024, ffn_drop=0.1)),
+        post_norm_cfg=None),
+    positional_encoding=dict(num_feats=128, normalize=True, offset=-0.5),
+    bbox_head=dict(
+        type='DeformableDETRHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='FocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=100))
+
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(
+    dataset=dict(
+        filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1),
+            'sampling_offsets': dict(lr_mult=0.1),
+            'reference_points': dict(lr_mult=0.1)
+        }))
+
+# learning policy
+max_epochs = 50
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[40],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=32)
diff --git a/mmde/mmdet/.mim/configs/deformable_detr/metafile.yml b/mmde/mmdet/.mim/configs/deformable_detr/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a30c97914baf6f1ec56cea8fd67b5ad1efb574fe
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/deformable_detr/metafile.yml
@@ -0,0 +1,56 @@
+Collections:
+  - Name: Deformable DETR
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - Transformer
+    Paper:
+      URL: https://openreview.net/forum?id=gZ9hCDWe6ke
+      Title: 'Deformable DETR: Deformable Transformers for End-to-End Object Detection'
+    README: configs/deformable_detr/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/deformable_detr.py#L6
+      Version: v2.12.0
+
+Models:
+  - Name: deformable-detr_r50_16xb2-50e_coco
+    In Collection: Deformable DETR
+    Config: configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py
+    Metadata:
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.3
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr_r50_16xb2-50e_coco/deformable-detr_r50_16xb2-50e_coco_20221029_210934-6bc7d21b.pth
+
+  - Name: deformable-detr-refine_r50_16xb2-50e_coco
+    In Collection: Deformable DETR
+    Config: configs/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco.py
+    Metadata:
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.2
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco/deformable-detr-refine_r50_16xb2-50e_coco_20221022_225303-844e0f93.pth
+
+  - Name: deformable-detr-refine-twostage_r50_16xb2-50e_coco
+    In Collection: Deformable DETR
+    Config: configs/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py
+    Metadata:
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.0
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco/deformable-detr-refine-twostage_r50_16xb2-50e_coco_20221021_184714-acc8a5ff.pth
diff --git a/mmde/mmdet/.mim/configs/detectors/cascade-rcnn_r50-rfp_1x_coco.py b/mmde/mmdet/.mim/configs/detectors/cascade-rcnn_r50-rfp_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c30c84d74cf68bc4369db16b6b2602626acb6fdf
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detectors/cascade-rcnn_r50-rfp_1x_coco.py
@@ -0,0 +1,28 @@
+_base_ = [
+    '../_base_/models/cascade-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            pretrained='torchvision://resnet50',
+            style='pytorch')))
diff --git a/mmde/mmdet/.mim/configs/detectors/cascade-rcnn_r50-sac_1x_coco.py b/mmde/mmdet/.mim/configs/detectors/cascade-rcnn_r50-sac_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..24d6cd3a95ecf262caac667cfcc32d6885fa5880
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detectors/cascade-rcnn_r50-sac_1x_coco.py
@@ -0,0 +1,12 @@
+_base_ = [
+    '../_base_/models/cascade-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/detectors/detectors_cascade-rcnn_r50_1x_coco.py b/mmde/mmdet/.mim/configs/detectors/detectors_cascade-rcnn_r50_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..19d13d9c8c38b666b7481a58a641918b5d20e0ad
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detectors/detectors_cascade-rcnn_r50_1x_coco.py
@@ -0,0 +1,32 @@
+_base_ = [
+    '../_base_/models/cascade-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            sac=dict(type='SAC', use_deform=True),
+            stage_with_sac=(False, True, True, True),
+            pretrained='torchvision://resnet50',
+            style='pytorch')))
diff --git a/mmde/mmdet/.mim/configs/detectors/detectors_htc-r101_20e_coco.py b/mmde/mmdet/.mim/configs/detectors/detectors_htc-r101_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..93d7d2b1adeb3fbdb7bac0107edf4433669e8015
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detectors/detectors_htc-r101_20e_coco.py
@@ -0,0 +1,28 @@
+_base_ = '../htc/htc_r101_fpn_20e_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=101,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            sac=dict(type='SAC', use_deform=True),
+            stage_with_sac=(False, True, True, True),
+            pretrained='torchvision://resnet101',
+            style='pytorch')))
diff --git a/mmde/mmdet/.mim/configs/detectors/detectors_htc-r50_1x_coco.py b/mmde/mmdet/.mim/configs/detectors/detectors_htc-r50_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d2fc4f77fcca715c1dfb613306d214b636aa0c0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detectors/detectors_htc-r50_1x_coco.py
@@ -0,0 +1,28 @@
+_base_ = '../htc/htc_r50_fpn_1x_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            sac=dict(type='SAC', use_deform=True),
+            stage_with_sac=(False, True, True, True),
+            pretrained='torchvision://resnet50',
+            style='pytorch')))
diff --git a/mmde/mmdet/.mim/configs/detectors/htc_r50-rfp_1x_coco.py b/mmde/mmdet/.mim/configs/detectors/htc_r50-rfp_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..496104e12550a1985f9c9e3748a343f69d7df6d8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detectors/htc_r50-rfp_1x_coco.py
@@ -0,0 +1,24 @@
+_base_ = '../htc/htc_r50_fpn_1x_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        output_img=True),
+    neck=dict(
+        type='RFP',
+        rfp_steps=2,
+        aspp_out_channels=64,
+        aspp_dilations=(1, 3, 6, 1),
+        rfp_backbone=dict(
+            rfp_inplanes=256,
+            type='DetectoRS_ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=1,
+            norm_cfg=dict(type='BN', requires_grad=True),
+            norm_eval=True,
+            conv_cfg=dict(type='ConvAWS'),
+            pretrained='torchvision://resnet50',
+            style='pytorch')))
diff --git a/mmde/mmdet/.mim/configs/detectors/htc_r50-sac_1x_coco.py b/mmde/mmdet/.mim/configs/detectors/htc_r50-sac_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..72d4db963ffd95851b945911b3db9941426583ab
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detectors/htc_r50-sac_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../htc/htc_r50_fpn_1x_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='DetectoRS_ResNet',
+        conv_cfg=dict(type='ConvAWS'),
+        sac=dict(type='SAC', use_deform=True),
+        stage_with_sac=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/detectors/metafile.yml b/mmde/mmdet/.mim/configs/detectors/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..196a1cef1751bc9d5812915c4d06de220f62baa1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detectors/metafile.yml
@@ -0,0 +1,114 @@
+Collections:
+  - Name: DetectoRS
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ASPP
+        - FPN
+        - RFP
+        - RPN
+        - ResNet
+        - RoIAlign
+        - SAC
+    Paper:
+      URL: https://arxiv.org/abs/2006.02334
+      Title: 'DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution'
+    README: configs/detectors/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/backbones/detectors_resnet.py#L205
+      Version: v2.2.0
+
+Models:
+  - Name: cascade-rcnn_r50-rfp_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/cascade-rcnn_r50-rfp_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco-8cf51bfd.pth
+
+  - Name: cascade-rcnn_r50-sac_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/cascade-rcnn_r50-sac_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco-24bfda62.pth
+
+  - Name: detectors_cascade-rcnn_r50_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/detectors_cascade-rcnn_r50_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco-32a10ba0.pth
+
+  - Name: htc_r50-rfp_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/htc_r50-rfp_1x_coco.py
+    Metadata:
+      Training Memory (GB): 11.2
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  40.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco-8ff87c51.pth
+
+  - Name: htc_r50-sac_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/htc_r50-sac_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.3
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  40.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco-bfa60c54.pth
+
+  - Name: detectors_htc-r50_1x_coco
+    In Collection: DetectoRS
+    Config: configs/detectors/detectors_htc-r50_1x_coco.py
+    Metadata:
+      Training Memory (GB): 13.6
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  42.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco-329b1453.pth
diff --git a/mmde/mmdet/.mim/configs/detr/detr_r101_8xb2-500e_coco.py b/mmde/mmdet/.mim/configs/detr/detr_r101_8xb2-500e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6661aacdc54e889aa38b2e759c40fd9797ae44ad
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detr/detr_r101_8xb2-500e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './detr_r50_8xb2-500e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/detr/detr_r18_8xb2-500e_coco.py b/mmde/mmdet/.mim/configs/detr/detr_r18_8xb2-500e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..305b9d6fee8d75273b588f32b2e21582473cb137
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detr/detr_r18_8xb2-500e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './detr_r50_8xb2-500e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[512]))
diff --git a/mmde/mmdet/.mim/configs/detr/detr_r50_8xb2-150e_coco.py b/mmde/mmdet/.mim/configs/detr/detr_r50_8xb2-150e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaa15410532e552cae387ef4eaa57227af1d855d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detr/detr_r50_8xb2-150e_coco.py
@@ -0,0 +1,155 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='DETR',
+    num_queries=100,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=1),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3, ),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        norm_cfg=None,
+        num_outs=1),
+    encoder=dict(  # DetrTransformerEncoder
+        num_layers=6,
+        layer_cfg=dict(  # DetrTransformerEncoderLayer
+            self_attn_cfg=dict(  # MultiheadAttention
+                embed_dims=256,
+                num_heads=8,
+                dropout=0.1,
+                batch_first=True),
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,
+                num_fcs=2,
+                ffn_drop=0.1,
+                act_cfg=dict(type='ReLU', inplace=True)))),
+    decoder=dict(  # DetrTransformerDecoder
+        num_layers=6,
+        layer_cfg=dict(  # DetrTransformerDecoderLayer
+            self_attn_cfg=dict(  # MultiheadAttention
+                embed_dims=256,
+                num_heads=8,
+                dropout=0.1,
+                batch_first=True),
+            cross_attn_cfg=dict(  # MultiheadAttention
+                embed_dims=256,
+                num_heads=8,
+                dropout=0.1,
+                batch_first=True),
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,
+                num_fcs=2,
+                ffn_drop=0.1,
+                act_cfg=dict(type='ReLU', inplace=True))),
+        return_intermediate=True),
+    positional_encoding=dict(num_feats=128, normalize=True),
+    bbox_head=dict(
+        type='DETRHead',
+        num_classes=80,
+        embed_dims=256,
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            bg_cls_weight=0.1,
+            use_sigmoid=False,
+            loss_weight=1.0,
+            class_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='ClassificationCost', weight=1.),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=100))
+
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[[
+            dict(
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
+                keep_ratio=True)
+        ],
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}))
+
+# learning policy
+max_epochs = 150
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[100],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/detr/detr_r50_8xb2-500e_coco.py b/mmde/mmdet/.mim/configs/detr/detr_r50_8xb2-500e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f07d5dce05b08c74aea2059989b45d5d275c53e0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detr/detr_r50_8xb2-500e_coco.py
@@ -0,0 +1,24 @@
+_base_ = './detr_r50_8xb2-150e_coco.py'
+
+# learning policy
+max_epochs = 500
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=10)
+
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[334],
+        gamma=0.1)
+]
+
+# only keep latest 2 checkpoints
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=2))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/detr/metafile.yml b/mmde/mmdet/.mim/configs/detr/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a9132dff0228e31c146ae46ed32445491f4225c1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/detr/metafile.yml
@@ -0,0 +1,33 @@
+Collections:
+  - Name: DETR
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - Transformer
+    Paper:
+      URL: https://arxiv.org/abs/2005.12872
+      Title: 'End-to-End Object Detection with Transformers'
+    README: configs/detr/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/detectors/detr.py#L7
+      Version: v2.7.0
+
+Models:
+  - Name: detr_r50_8xb2-150e_coco
+    In Collection: DETR
+    Config: configs/detr/detr_r50_8xb2-150e_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      Epochs: 150
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/detr/detr_r50_8xb2-150e_coco/detr_r50_8xb2-150e_coco_20221023_153551-436d03e8.pth
diff --git a/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_8xb2-12e_coco.py b/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_8xb2-12e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5831f898b4a706accb2b828b6194b2974e78d0fc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_8xb2-12e_coco.py
@@ -0,0 +1,163 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='DINO',
+    num_queries=900,  # num_matching_queries
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=1),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[512, 1024, 2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(
+        num_layers=6,
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4,
+                               dropout=0.0),  # 0.1 for DeformDETR
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,  # 1024 for DeformDETR
+                ffn_drop=0.0))),  # 0.1 for DeformDETR
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_heads=8,
+                               dropout=0.0),  # 0.1 for DeformDETR
+            cross_attn_cfg=dict(embed_dims=256, num_levels=4,
+                                dropout=0.0),  # 0.1 for DeformDETR
+            ffn_cfg=dict(
+                embed_dims=256,
+                feedforward_channels=2048,  # 1024 for DeformDETR
+                ffn_drop=0.0)),  # 0.1 for DeformDETR
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128,
+        normalize=True,
+        offset=0.0,  # -0.5 for DeformDETR
+        temperature=20),  # 10000 for DeformDETR
+    bbox_head=dict(
+        type='DINOHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),  # 2.0 in DeformDETR
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    dn_cfg=dict(  # TODO: Move to model.train_cfg ?
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,  # 0.4 for DN-DETR
+        group_cfg=dict(dynamic=True, num_groups=None,
+                       num_dn_queries=100)),  # TODO: half num_dn_queries
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='FocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=300))  # 100 for DeformDETR
+
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(
+    dataset=dict(
+        filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW',
+        lr=0.0001,  # 0.0002 for DeformDETR
+        weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)})
+)  # custom_keys contains sampling_offsets and reference_points in DeformDETR  # noqa
+
+# learning policy
+max_epochs = 12
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_8xb2-24e_coco.py b/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_8xb2-24e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8534ac6a7ccc7f3f8c081275b3567a0a0792b7a5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_8xb2-24e_coco.py
@@ -0,0 +1,13 @@
+_base_ = './dino-4scale_r50_8xb2-12e_coco.py'
+max_epochs = 24
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[20],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_8xb2-36e_coco.py b/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_8xb2-36e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c2cf4602d358dfed5b737f8a74843c89a54702d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_8xb2-36e_coco.py
@@ -0,0 +1,13 @@
+_base_ = './dino-4scale_r50_8xb2-12e_coco.py'
+max_epochs = 36
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[30],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_improved_8xb2-12e_coco.py b/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_improved_8xb2-12e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a4a82bacc1f1e990d4720db81cae0af5c012557
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dino/dino-4scale_r50_improved_8xb2-12e_coco.py
@@ -0,0 +1,18 @@
+_base_ = ['dino-4scale_r50_8xb2-12e_coco.py']
+
+# from deformable detr hyper
+model = dict(
+    backbone=dict(frozen_stages=-1),
+    bbox_head=dict(loss_cls=dict(loss_weight=2.0)),
+    positional_encoding=dict(offset=-0.5, temperature=10000),
+    dn_cfg=dict(group_cfg=dict(num_dn_queries=300)))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.0002),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1),
+            'sampling_offsets': dict(lr_mult=0.1),
+            'reference_points': dict(lr_mult=0.1)
+        }))
diff --git a/mmde/mmdet/.mim/configs/dino/dino-5scale_swin-l_8xb2-12e_coco.py b/mmde/mmdet/.mim/configs/dino/dino-5scale_swin-l_8xb2-12e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d39f22f50926a11137d143976fe4033ec3a8640
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dino/dino-5scale_swin-l_8xb2-12e_coco.py
@@ -0,0 +1,30 @@
+_base_ = './dino-4scale_r50_8xb2-12e_coco.py'
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa
+num_levels = 5
+model = dict(
+    num_feature_levels=num_levels,
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        pretrain_img_size=384,
+        embed_dims=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        # Please only add indices that would be used
+        # in FPN, otherwise some parameter will not be used
+        with_cp=True,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
+    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
+    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
diff --git a/mmde/mmdet/.mim/configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py b/mmde/mmdet/.mim/configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d55a38e61d411892c6de819cf46247ba4d41d427
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py
@@ -0,0 +1,13 @@
+_base_ = './dino-5scale_swin-l_8xb2-12e_coco.py'
+max_epochs = 36
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/dino/metafile.yml b/mmde/mmdet/.mim/configs/dino/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f276a04ef557b70443083ac70b6a16671e7fa6e1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dino/metafile.yml
@@ -0,0 +1,85 @@
+Collections:
+  - Name: DINO
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - ResNet
+        - Transformer
+    Paper:
+      URL: https://arxiv.org/abs/2203.03605
+      Title: 'DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection'
+    README: configs/dino/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/f4112c9e5611468ffbd57cfba548fd1289264b52/mmdet/models/detectors/dino.py#L17
+      Version: v3.0.0rc6
+
+Models:
+  - Name: dino-4scale_r50_8xb2-12e_coco
+    In Collection: DINO
+    Config: configs/dino/dino-4scale_r50_8xb2-12e_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.0
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/dino/dino-4scale_r50_8xb2-12e_coco/dino-4scale_r50_8xb2-12e_coco_20221202_182705-55b2bba2.pth
+
+  - Name: dino-4scale_r50_8xb2-24e_coco
+    In Collection: DINO
+    Config: configs/dino/dino-4scale_r50_8xb2-24e_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+
+  - Name:  dino-4scale_r50_8xb2-36e_coco
+    In Collection: DINO
+    Config: configs/dino/dino-4scale_r50_8xb2-36e_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+
+  - Name: dino-5scale_swin-l_8xb2-12e_coco
+    In Collection: DINO
+    Config: configs/dino/dino-5scale_swin-l_8xb2-12e_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 57.2
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/dino/dino-5scale_swin-l_8xb2-12e_coco/dino-5scale_swin-l_8xb2-12e_coco_20230228_072924-a654145f.pth
+
+  - Name: dino-5scale_swin-l_8xb2-36e_coco
+    In Collection: DINO
+    Config: configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 58.4
+    Weights: https://github.com/RistoranteRist/mmlab-weights/releases/download/dino-swinl/dino-5scale_swin-l_8xb2-36e_coco-5486e051.pth
+  - Name: dino-4scale_r50_improved_8xb2-12e_coco
+    In Collection: DINO
+    Config: configs/dino/dino-4scale_r50_improved_8xb2-12e_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.1
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/dino/dino-4scale_r50_improved_8xb2-12e_coco/dino-4scale_r50_improved_8xb2-12e_coco_20230818_162607-6f47a913.pth
diff --git a/mmde/mmdet/.mim/configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b9b6e69a12d978a55fbba049fc2b1c5229c1fc5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,23 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        type='DoubleHeadRoIHead',
+        reg_roi_scale_factor=1.3,
+        bbox_head=dict(
+            _delete_=True,
+            type='DoubleConvFCBBoxHead',
+            num_convs=4,
+            num_fcs=2,
+            in_channels=256,
+            conv_out_channels=1024,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=2.0))))
diff --git a/mmde/mmdet/.mim/configs/double_heads/metafile.yml b/mmde/mmdet/.mim/configs/double_heads/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..bb14e7968e259bb6dae1bbd6dad5e1c4e862f228
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/double_heads/metafile.yml
@@ -0,0 +1,41 @@
+Collections:
+  - Name: Rethinking Classification and Localization for Object Detection
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/pdf/1904.06493
+      Title: 'Rethinking Classification and Localization for Object Detection'
+    README: configs/double_heads/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/roi_heads/double_roi_head.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: dh-faster-rcnn_r50_fpn_1x_coco
+    In Collection: Rethinking Classification and Localization for Object Detection
+    Config: configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.8
+      inference time (ms/im):
+        - value: 105.26
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130-586b67df.pth
diff --git a/mmde/mmdet/.mim/configs/dsdl/coco.py b/mmde/mmdet/.mim/configs/dsdl/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c9e895e53c1588028cf6def2fe79d49fd98d6e1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dsdl/coco.py
@@ -0,0 +1,33 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py',
+    '../_base_/datasets/dsdl.py'
+]
+
+# dsdl dataset settings
+
+# please visit our platform [OpenDataLab](https://opendatalab.com/)
+# to downloaded dsdl dataset.
+data_root = 'data/COCO2017'
+img_prefix = 'original'
+train_ann = 'dsdl/set-train/train.yaml'
+val_ann = 'dsdl/set-val/val.yaml'
+specific_key_path = dict(ignore_flag='./annotations/*/iscrowd')
+
+train_dataloader = dict(
+    dataset=dict(
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=train_ann,
+        data_prefix=dict(img_path=img_prefix),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32),
+    ))
+
+val_dataloader = dict(
+    dataset=dict(
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=val_ann,
+        data_prefix=dict(img_path=img_prefix),
+    ))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/dsdl/coco_instance.py b/mmde/mmdet/.mim/configs/dsdl/coco_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..e34f93c97f55f5eeef55f9de73f1a8389f8980c6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dsdl/coco_instance.py
@@ -0,0 +1,62 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py',
+    '../_base_/datasets/dsdl.py'
+]
+
+# dsdl dataset settings.
+
+# please visit our platform [OpenDataLab](https://opendatalab.com/)
+# to downloaded dsdl dataset.
+data_root = 'data/COCO2017'
+img_prefix = 'original'
+train_ann = 'dsdl/set-train/train.yaml'
+val_ann = 'dsdl/set-val/val.yaml'
+specific_key_path = dict(ignore_flag='./annotations/*/iscrowd')
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'instances'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        with_polygon=True,
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=train_ann,
+        data_prefix=dict(img_path=img_prefix),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32),
+        pipeline=train_pipeline,
+    ))
+
+val_dataloader = dict(
+    dataset=dict(
+        with_polygon=True,
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=val_ann,
+        data_prefix=dict(img_path=img_prefix),
+        pipeline=test_pipeline,
+    ))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric', metric=['bbox', 'segm'], format_only=False)
+
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/dsdl/objects365v2.py b/mmde/mmdet/.mim/configs/dsdl/objects365v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d25a2323027c22eaf9777f6e62e4992880b29d2c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dsdl/objects365v2.py
@@ -0,0 +1,54 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py',
+    '../_base_/datasets/dsdl.py'
+]
+
+model = dict(roi_head=dict(bbox_head=dict(num_classes=365)))
+
+# dsdl dataset settings
+
+# please visit our platform [OpenDataLab](https://opendatalab.com/)
+# to downloaded dsdl dataset.
+data_root = 'data/Objects365'
+img_prefix = 'original'
+train_ann = 'dsdl/set-train/train.yaml'
+val_ann = 'dsdl/set-val/val.yaml'
+specific_key_path = dict(ignore_flag='./annotations/*/iscrowd')
+
+train_dataloader = dict(
+    dataset=dict(
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=train_ann,
+        data_prefix=dict(img_path=img_prefix),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32),
+    ))
+
+val_dataloader = dict(
+    dataset=dict(
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=val_ann,
+        data_prefix=dict(img_path=img_prefix),
+        test_mode=True,
+    ))
+test_dataloader = val_dataloader
+
+default_hooks = dict(logger=dict(type='LoggerHook', interval=1000), )
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=3, val_interval=1)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[1, 2],
+        gamma=0.1)
+]
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/dsdl/openimagesv6.py b/mmde/mmdet/.mim/configs/dsdl/openimagesv6.py
new file mode 100644
index 0000000000000000000000000000000000000000..a65f942a0d4f8cfdaa3cfb712276d6de34d62a84
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dsdl/openimagesv6.py
@@ -0,0 +1,94 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/schedules/schedule_1x.py',
+    '../_base_/default_runtime.py',
+]
+
+model = dict(roi_head=dict(bbox_head=dict(num_classes=601)))
+
+# dsdl dataset settings
+
+# please visit our platform [OpenDataLab](https://opendatalab.com/)
+# to downloaded dsdl dataset.
+dataset_type = 'DSDLDetDataset'
+data_root = 'data/OpenImages'
+train_ann = 'dsdl/set-train/train.yaml'
+val_ann = 'dsdl/set-val/val.yaml'
+specific_key_path = dict(
+    image_level_labels='./image_labels/*/label',
+    Label='./objects/*/label',
+    is_group_of='./objects/*/isgroupof',
+)
+
+backend_args = dict(
+    backend='petrel',
+    path_mapping=dict({'data/': 's3://open_dataset_original/'}))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1024, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1024, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'instances', 'image_level_labels'))
+]
+
+train_dataloader = dict(
+    sampler=dict(type='ClassAwareSampler', num_sample_class=1),
+    dataset=dict(
+        type=dataset_type,
+        with_imagelevel_label=True,
+        with_hierarchy=True,
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=train_ann,
+        filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        with_imagelevel_label=True,
+        with_hierarchy=True,
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=val_ann,
+        test_mode=True,
+        pipeline=test_pipeline))
+
+test_dataloader = val_dataloader
+
+default_hooks = dict(logger=dict(type='LoggerHook', interval=1000), )
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=3, val_interval=1)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[1, 2],
+        gamma=0.1)
+]
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+val_evaluator = dict(
+    type='OpenImagesMetric',
+    iou_thrs=0.5,
+    ioa_thrs=0.5,
+    use_group_of=True,
+    get_supercategory=True)
+
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/dsdl/voc07.py b/mmde/mmdet/.mim/configs/dsdl/voc07.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7b864714e4987ca9d31eda5fee746e741b7aa10
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dsdl/voc07.py
@@ -0,0 +1,94 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/default_runtime.py'
+]
+
+# model setting
+model = dict(roi_head=dict(bbox_head=dict(num_classes=20)))
+
+# dsdl dataset settings
+
+# please visit our platform [OpenDataLab](https://opendatalab.com/)
+# to downloaded dsdl dataset.
+dataset_type = 'DSDLDetDataset'
+data_root = 'data/VOC07-det'
+img_prefix = 'original'
+train_ann = 'dsdl/set-train/train.yaml'
+val_ann = 'dsdl/set-test/test.yaml'
+
+specific_key_path = dict(ignore_flag='./objects/*/difficult')
+
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1000, 600), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1000, 600), keep_ratio=True),
+    # avoid bboxes being resized
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'instances'))
+]
+train_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=train_ann,
+        data_prefix=dict(img_path=img_prefix),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        specific_key_path=specific_key_path,
+        data_root=data_root,
+        ann_file=val_ann,
+        data_prefix=dict(img_path=img_prefix),
+        test_mode=True,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# Pascal VOC2007 uses `11points` as default evaluate mode, while PASCAL
+# VOC2012 defaults to use 'area'.
+val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points')
+# val_evaluator = dict(type='CocoMetric', metric='bbox')
+test_evaluator = val_evaluator
+
+# training schedule, voc dataset is repeated 3 times, in
+# `_base_/datasets/voc0712.py`, so the actual epoch = 4 * 3 = 12
+max_epochs = 12
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=3)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[9],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/dsdl/voc0712.py b/mmde/mmdet/.mim/configs/dsdl/voc0712.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ec1bb8f98e56d0402c9a80934c3b77bd7919fa4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dsdl/voc0712.py
@@ -0,0 +1,132 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/schedules/schedule_1x.py',
+    '../_base_/default_runtime.py',
+    # '../_base_/datasets/dsdl.py'
+]
+
+# model setting
+model = dict(roi_head=dict(bbox_head=dict(num_classes=20)))
+
+# dsdl dataset settings
+
+# please visit our platform [OpenDataLab](https://opendatalab.com/)
+# to downloaded dsdl dataset.
+dataset_type = 'DSDLDetDataset'
+data_root_07 = 'data/VOC07-det'
+data_root_12 = 'data/VOC12-det'
+img_prefix = 'original'
+
+train_ann = 'dsdl/set-train/train.yaml'
+val_ann = 'dsdl/set-val/val.yaml'
+test_ann = 'dsdl/set-test/test.yaml'
+
+backend_args = None
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1000, 600), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1000, 600), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'instances'))
+]
+
+specific_key_path = dict(ignore_flag='./objects/*/difficult', )
+
+train_dataloader = dict(
+    dataset=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            type='ConcatDataset',
+            datasets=[
+                dict(
+                    type=dataset_type,
+                    specific_key_path=specific_key_path,
+                    data_root=data_root_07,
+                    ann_file=train_ann,
+                    data_prefix=dict(img_path=img_prefix),
+                    filter_cfg=dict(
+                        filter_empty_gt=True, min_size=32, bbox_min_size=32),
+                    pipeline=train_pipeline),
+                dict(
+                    type=dataset_type,
+                    specific_key_path=specific_key_path,
+                    data_root=data_root_07,
+                    ann_file=val_ann,
+                    data_prefix=dict(img_path=img_prefix),
+                    filter_cfg=dict(
+                        filter_empty_gt=True, min_size=32, bbox_min_size=32),
+                    pipeline=train_pipeline),
+                dict(
+                    type=dataset_type,
+                    specific_key_path=specific_key_path,
+                    data_root=data_root_12,
+                    ann_file=train_ann,
+                    data_prefix=dict(img_path=img_prefix),
+                    filter_cfg=dict(
+                        filter_empty_gt=True, min_size=32, bbox_min_size=32),
+                    pipeline=train_pipeline),
+                dict(
+                    type=dataset_type,
+                    specific_key_path=specific_key_path,
+                    data_root=data_root_12,
+                    ann_file=val_ann,
+                    data_prefix=dict(img_path=img_prefix),
+                    filter_cfg=dict(
+                        filter_empty_gt=True, min_size=32, bbox_min_size=32),
+                    pipeline=train_pipeline),
+            ])))
+
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        specific_key_path=specific_key_path,
+        data_root=data_root_07,
+        ann_file=test_ann,
+        test_mode=True,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='CocoMetric', metric='bbox')
+# val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points')
+test_evaluator = val_evaluator
+
+# training schedule, voc dataset is repeated 3 times, in
+# `_base_/datasets/voc0712.py`, so the actual epoch = 4 * 3 = 12
+max_epochs = 4
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[3],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/dyhead/atss_r50-caffe_fpn_dyhead_1x_coco.py b/mmde/mmdet/.mim/configs/dyhead/atss_r50-caffe_fpn_dyhead_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8716f1226cb0b37435d0318d62599a74e6126f19
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dyhead/atss_r50-caffe_fpn_dyhead_1x_coco.py
@@ -0,0 +1,103 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='ATSS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=128),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            start_level=1,
+            add_extra_convs='on_output',
+            num_outs=5),
+        dict(
+            type='DyHead',
+            in_channels=256,
+            out_channels=256,
+            num_blocks=6,
+            # disable zero_init_offset to follow official implementation
+            zero_init_offset=False)
+    ],
+    bbox_head=dict(
+        type='ATSSHead',
+        num_classes=80,
+        in_channels=256,
+        pred_kernel_size=1,  # follow DyHead official implementation
+        stacked_convs=0,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128],
+            center_offset=0.5),  # follow DyHead official implementation
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.01))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend='pillow'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py b/mmde/mmdet/.mim/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..89e89b98ca437bb13fe5d01acc05cfdcd04e8fa0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py
@@ -0,0 +1,72 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='ATSS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            start_level=1,
+            add_extra_convs='on_output',
+            num_outs=5),
+        dict(type='DyHead', in_channels=256, out_channels=256, num_blocks=6)
+    ],
+    bbox_head=dict(
+        type='ATSSHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=0,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py b/mmde/mmdet/.mim/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f537b9dc9b17aa50f0044b874585fe1e0ba15216
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py
@@ -0,0 +1,140 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa
+model = dict(
+    type='ATSS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=128),
+    backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=384,
+        embed_dims=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        # Please only add indices that would be used
+        # in FPN, otherwise some parameter will not be used
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[384, 768, 1536],
+            out_channels=256,
+            start_level=0,
+            add_extra_convs='on_output',
+            num_outs=5),
+        dict(
+            type='DyHead',
+            in_channels=256,
+            out_channels=256,
+            num_blocks=6,
+            # disable zero_init_offset to follow official implementation
+            zero_init_offset=False)
+    ],
+    bbox_head=dict(
+        type='ATSSHead',
+        num_classes=80,
+        in_channels=256,
+        pred_kernel_size=1,  # follow DyHead official implementation
+        stacked_convs=0,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128],
+            center_offset=0.5),  # follow DyHead official implementation
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=[(2000, 480), (2000, 1200)],
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(2000, 1200), keep_ratio=True, backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type={{_base_.dataset_type}},
+            data_root={{_base_.data_root}},
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args={{_base_.backend_args}})))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# optimizer
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW', lr=0.00005, betas=(0.9, 0.999), weight_decay=0.05),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'relative_position_bias_table': dict(decay_mult=0.),
+            'norm': dict(decay_mult=0.)
+        }),
+    clip_grad=None)
diff --git a/mmde/mmdet/.mim/configs/dyhead/metafile.yml b/mmde/mmdet/.mim/configs/dyhead/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..28b5a5821c81cea3213494c712910f904ae117f2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dyhead/metafile.yml
@@ -0,0 +1,76 @@
+Collections:
+  - Name: DyHead
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 4x T4 GPUs
+      Architecture:
+        - ATSS
+        - DyHead
+        - FPN
+        - ResNet
+        - Deformable Convolution
+        - Pyramid Convolution
+    Paper:
+      URL: https://arxiv.org/abs/2106.08322
+      Title: 'Dynamic Head: Unifying Object Detection Heads with Attentions'
+    README: configs/dyhead/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/necks/dyhead.py#L130
+      Version: v2.22.0
+
+Models:
+  - Name: atss_r50-caffe_fpn_dyhead_1x_coco
+    In Collection: DyHead
+    Config: configs/dyhead/atss_r50-caffe_fpn_dyhead_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.4
+      inference time (ms/im):
+        - value: 75.7
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939-162888e6.pth
+
+  - Name: atss_r50_fpn_dyhead_1x_coco
+    In Collection: DyHead
+    Config: configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.9
+      inference time (ms/im):
+        - value: 73.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth
+
+  - Name: atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco
+    In Collection: DyHead
+    Config: configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py
+    Metadata:
+      Training Memory (GB): 58.4
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 56.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco_20220509_100315-bc5b6516.pth
diff --git a/mmde/mmdet/.mim/configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f64dfa0b9102d5f7b32793b9d21e19c67afdfc2a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,28 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        type='DynamicRoIHead',
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(nms=dict(iou_threshold=0.85)),
+        rcnn=dict(
+            dynamic_rcnn=dict(
+                iou_topk=75,
+                beta_topk=10,
+                update_iter_interval=100,
+                initial_iou=0.4,
+                initial_beta=1.0))),
+    test_cfg=dict(rpn=dict(nms=dict(iou_threshold=0.85))))
diff --git a/mmde/mmdet/.mim/configs/dynamic_rcnn/metafile.yml b/mmde/mmdet/.mim/configs/dynamic_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..64ab3b0ce490a25e227b3bcd60442669608fda22
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/dynamic_rcnn/metafile.yml
@@ -0,0 +1,35 @@
+Collections:
+  - Name: Dynamic R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Dynamic R-CNN
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/pdf/2004.06002
+      Title: 'Dynamic R-CNN: Towards High Quality Object Detection via Dynamic Training'
+    README: configs/dynamic_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/roi_heads/dynamic_roi_head.py#L11
+      Version: v2.2.0
+
+Models:
+  - Name: dynamic-rcnn_r50_fpn_1x_coco
+    In Collection: Dynamic R-CNN
+    Config: configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x-62a3f276.pth
diff --git a/mmde/mmdet/.mim/configs/efficientnet/metafile.yml b/mmde/mmdet/.mim/configs/efficientnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6e220c8ad7cd0e25386d950c21616d4b92f8481e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/efficientnet/metafile.yml
@@ -0,0 +1,19 @@
+Models:
+  - Name: retinanet_effb3_fpn_8xb4-crop896-1x_coco
+    In Collection: RetinaNet
+    Config: configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806-615a0dda.pth
+    Paper:
+      URL: https://arxiv.org/abs/1905.11946v5
+      Title: 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks'
+    README: configs/efficientnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/backbones/efficientnet.py#L159
+      Version: v2.23.0
diff --git a/mmde/mmdet/.mim/configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py b/mmde/mmdet/.mim/configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d0d9cefd0b565b2cce42117eb872ac9373ea4b9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py
@@ -0,0 +1,94 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/schedules/schedule_1x.py',
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+
+image_size = (896, 896)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+norm_cfg = dict(type='BN', requires_grad=True)
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth'  # noqa
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32,
+        batch_augments=batch_augments),
+    backbone=dict(
+        _delete_=True,
+        type='EfficientNet',
+        arch='b3',
+        drop_path_rate=0.2,
+        out_indices=(3, 4, 5),
+        frozen_stages=0,
+        norm_cfg=dict(
+            type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone', checkpoint=checkpoint)),
+    neck=dict(
+        in_channels=[48, 136, 384],
+        start_level=0,
+        out_channels=256,
+        relu_before_extra_convs=True,
+        no_norm_on_lateral=True,
+        norm_cfg=norm_cfg),
+    bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg),
+    # training and testing settings
+    train_cfg=dict(assigner=dict(neg_iou_thr=0.5)))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=image_size),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=image_size, keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=4, num_workers=4, dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.04),
+    paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True))
+
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
+
+# cudnn_benchmark=True can accelerate fix-size training
+env_cfg = dict(cudnn_benchmark=True)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (4 samples per GPU)
+auto_scale_lr = dict(base_batch_size=32)
diff --git a/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1ae17a7ee4d3516e6aca90697fa165f592cf51e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        plugins=[
+            dict(
+                cfg=dict(
+                    type='GeneralizedAttention',
+                    spatial_range=-1,
+                    num_heads=8,
+                    attention_type='0010',
+                    kv_stride=2),
+                stages=(False, False, True, True),
+                position='after_conv2')
+        ],
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn0010_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn0010_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7336d292eafe8c92407f831e712946a23e231db0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn0010_fpn_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,
+                num_heads=8,
+                attention_type='0010',
+                kv_stride=2),
+            stages=(False, False, True, True),
+            position='after_conv2')
+    ]))
diff --git a/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..980e23d4509a19fe438d5c8494e2905d940705b1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        plugins=[
+            dict(
+                cfg=dict(
+                    type='GeneralizedAttention',
+                    spatial_range=-1,
+                    num_heads=8,
+                    attention_type='1111',
+                    kv_stride=2),
+                stages=(False, False, True, True),
+                position='after_conv2')
+        ],
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
diff --git a/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..426bc09fd64c16b43b33a5c797265aa9ec2c0c15
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(
+                type='GeneralizedAttention',
+                spatial_range=-1,
+                num_heads=8,
+                attention_type='1111',
+                kv_stride=2),
+            stages=(False, False, True, True),
+            position='after_conv2')
+    ]))
diff --git a/mmde/mmdet/.mim/configs/empirical_attention/metafile.yml b/mmde/mmdet/.mim/configs/empirical_attention/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b488da7d29fbd632da614895272cec2025b5eccc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/empirical_attention/metafile.yml
@@ -0,0 +1,103 @@
+Collections:
+  - Name: Empirical Attention
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Deformable Convolution
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+        - Spatial Attention
+    Paper:
+      URL: https://arxiv.org/pdf/1904.05873
+      Title: 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
+    README: configs/empirical_attention/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/generalized_attention.py#L10
+      Version: v2.0.0
+
+Models:
+  - Name: faster-rcnn_r50_fpn_attention_1111_1x_coco
+    In Collection: Empirical Attention
+    Config: configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.0
+      inference time (ms/im):
+        - value: 72.46
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130-403cccba.pth
+
+  - Name: faster-rcnn_r50_fpn_attention_0010_1x_coco
+    In Collection: Empirical Attention
+    Config: configs/empirical_attention/faster-rcnn_r50-attn0010_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      inference time (ms/im):
+        - value: 54.35
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130-7cb0c14d.pth
+
+  - Name: faster-rcnn_r50_fpn_attention_1111_dcn_1x_coco
+    In Collection: Empirical Attention
+    Config: configs/empirical_attention/faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.0
+      inference time (ms/im):
+        - value: 78.74
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130-8b2523a6.pth
+
+  - Name: faster-rcnn_r50_fpn_attention_0010_dcn_1x_coco
+    In Collection: Empirical Attention
+    Config: configs/empirical_attention/faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      inference time (ms/im):
+        - value: 58.48
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130-1a2e831d.pth
diff --git a/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..02c70296fca04d59b2b87801fa7834c0dc3d30f0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './fast-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5af6b223c5bf66928a1d79ffba904d86006a3741
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fast-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r101_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r101_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..73425cf1ac3be429c69f6cf6b482fee91a8e2782
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fast-rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3110f9fdf590ea665c9d7b7e28a56613cd79b786
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './fast-rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='caffe',
+        norm_eval=True,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..daefe2d2d287b865b925263a81c12a6e30c58c4d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,39 @@
+_base_ = [
+    '../_base_/models/fast-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadProposals', num_max_proposals=2000),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='ProposalBroadcaster',
+        transforms=[
+            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+            dict(type='RandomFlip', prob=0.5),
+        ]),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadProposals', num_max_proposals=None),
+    dict(
+        type='ProposalBroadcaster',
+        transforms=[
+            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    dataset=dict(
+        proposal_file='proposals/rpn_r50_fpn_1x_train2017.pkl',
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    dataset=dict(
+        proposal_file='proposals/rpn_r50_fpn_1x_val2017.pkl',
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r50_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r50_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d609a7c02d657e15316a4c5747983a4d9a10fc7c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fast_rcnn/fast-rcnn_r50_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './fast-rcnn_r50_fpn_1x_coco.py'
+
+train_cfg = dict(max_epochs=24)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a18f1ada31ed2a2d1023d16470a271ad49c3be2e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './faster-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cdb4d4973e364c4f37b80644388a4859f55772e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_ms-3x_coco.py
@@ -0,0 +1,11 @@
+_base_ = 'faster-rcnn_r50_fpn_ms-3x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d113ae6295fdc3f3058ef498eb9b675154a05c12
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b471fb3cbd8a79165e0cd19afc3ba98bbcfeb74e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster-rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a71d4afd3246d083bdf0f5a84be2fbf2340f621f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ef6d1f8ea6b45e9a4bfe438910da827d079479b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r101_fpn_ms-3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'faster-rcnn_r50_fpn_ms-3x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..65515c9ace8bf4445a77db2485fc8d3f95c263b9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-c4_ms-1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-c4_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e231e865270acf0383e03a64f151efdbf88c29e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-c4_ms-1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './faster-rcnn_r50-caffe_c4-1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+_base_.train_dataloader.dataset.pipeline = train_pipeline
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8952a5c9c6c2fe019711968fa2aa7ed2065b13f6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50-caffe-dc5.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..63a68859a85fe5556e927c04aae5cafbef1fc0b6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = 'faster-rcnn_r50-caffe-dc5_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+_base_.train_dataloader.dataset.pipeline = train_pipeline
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-3x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..27063468a70436a62a7cc54b8c8efc2de96ec33f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './faster-rcnn_r50-caffe-dc5_ms-1x_coco.py'
+
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# param_scheduler = [
+#     dict(
+#         type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), # noqa
+#     dict(
+#         type='MultiStepLR',
+#         begin=0,
+#         end=12,
+#         by_epoch=True,
+#         milestones=[28, 34],
+#         gamma=0.1)
+# ]
+_base_.param_scheduler[1].milestones = [28, 34]
+
+train_cfg = dict(max_epochs=36)
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_c4-1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_c4-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0888fc01790af82a4c7131280ca5f0247b28d9fd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_c4-1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50-caffe-c4.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9129a9583c52bf8ccab38a65f35c9f14bb128d07
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_90k_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..27f49355f3be8f6a53038894405c5f1b3d9b46fa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_90k_coco.py
@@ -0,0 +1,22 @@
+_base_ = 'faster-rcnn_r50-caffe_fpn_1x_coco.py'
+max_iter = 90000
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[60000, 80000],
+        gamma=0.1)
+]
+
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=max_iter,
+    val_interval=10000)
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000))
+log_processor = dict(by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person-bicycle-car.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person-bicycle-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..f36bb055f87aeadc43aa1233d1d3a7bdc33fbd80
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person-bicycle-car.py
@@ -0,0 +1,16 @@
+_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'
+model = dict(roi_head=dict(bbox_head=dict(num_classes=3)))
+metainfo = {
+    'classes': ('person', 'bicycle', 'car'),
+    'palette': [
+        (220, 20, 60),
+        (119, 11, 32),
+        (0, 0, 142),
+    ]
+}
+
+train_dataloader = dict(dataset=dict(metainfo=metainfo))
+val_dataloader = dict(dataset=dict(metainfo=metainfo))
+test_dataloader = dict(dataset=dict(metainfo=metainfo))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_bbox_mAP-0.398_20200504_163323-30042637.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py
new file mode 100644
index 0000000000000000000000000000000000000000..9528b63f4deabb3610a26af59c856cee62c489c2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py
@@ -0,0 +1,14 @@
+_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'
+model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))
+metainfo = {
+    'classes': ('person', ),
+    'palette': [
+        (220, 20, 60),
+    ]
+}
+
+train_dataloader = dict(dataset=dict(metainfo=metainfo))
+val_dataloader = dict(dataset=dict(metainfo=metainfo))
+test_dataloader = dict(dataset=dict(metainfo=metainfo))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_bbox_mAP-0.398_20200504_163323-30042637.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..59f1633c807f3eb904657cfaf97113c355df3fca
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py
@@ -0,0 +1,31 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+_base_.train_dataloader.dataset.pipeline = train_pipeline
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..44d320ea01ba53d591ab7db29742e7fffc7c81ce
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-2x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'
+
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# param_scheduler = [
+#     dict(
+#         type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), # noqa
+#     dict(
+#         type='MultiStepLR',
+#         begin=0,
+#         end=12,
+#         by_epoch=True,
+#         milestones=[16, 23],
+#         gamma=0.1)
+# ]
+_base_.param_scheduler[1].milestones = [16, 23]
+
+train_cfg = dict(max_epochs=24)
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..365f6439241c6374554af1fd58a114ef03448877
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-3x_coco.py
@@ -0,0 +1,15 @@
+_base_ = 'faster-rcnn_r50_fpn_ms-3x_coco.py'
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-90k_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b9b3eb0e79b1ffb71d15c21274692d3b85e16ac
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-90k_coco.py
@@ -0,0 +1,23 @@
+_base_ = 'faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'
+
+max_iter = 90000
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[60000, 80000],
+        gamma=0.1)
+]
+
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=max_iter,
+    val_interval=10000)
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000))
+log_processor = dict(by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-tnr-pre_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-tnr-pre_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3e5dedbe81b927492dd41b13f017bcc2bd4c92
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50-tnr-pre_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.pytorch.org/models/resnet50-11ad3fa6.pth'
+model = dict(
+    backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=checkpoint)))
+
+# `lr` and `weight_decay` have been searched to be optimal.
+optim_wrapper = dict(
+    optimizer=dict(_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.1),
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a45417fdd4566241114e20275990a5729486932
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2981c6fbe16eb7a8b6ca1202ebb6325e2324c040
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d366f3ba0e5ff098db3e409171a88860f1cf3af
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,20 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../common/lsj-200e_coco-detection.py'
+]
+image_size = (1024, 1024)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+model = dict(data_preprocessor=dict(batch_augments=batch_augments))
+
+train_dataloader = dict(batch_size=8, num_workers=4)
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    optimizer=dict(
+        type='SGD', lr=0.02 * 4, momentum=0.9, weight_decay=0.00004))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_amp-1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_amp-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f765deaef1db8a798c44d848c6f759755ccd4c45
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_amp-1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# optim_wrapper = dict(type='AmpOptimWrapper')
+_base_.optim_wrapper.type = 'AmpOptimWrapper'
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_bounded-iou_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_bounded-iou_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7758ca80b372e7895be267cad8c4603778d160b3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_bounded-iou_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            reg_decoded_bbox=True,
+            loss_bbox=dict(type='BoundedIoULoss', loss_weight=10.0))))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8d8a3042750e8f5f9478b5e8c3111d8b7a10528
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            reg_decoded_bbox=True,
+            loss_bbox=dict(type='CIoULoss', loss_weight=12.0))))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5a34d9f74a60388fa60afd8255d470c45f209f7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py
@@ -0,0 +1,48 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    # copied from configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py
+    neck=dict(
+        start_level=1,
+        add_extra_convs='on_output',  # use P5
+        relu_before_extra_convs=True),
+    rpn_head=dict(
+        _delete_=True,  # ignore the unused old settings
+        type='FCOSHead',
+        # num_classes = 1 for rpn,
+        # if num_classes > 1, it will be set to 1 in
+        # TwoStageDetector automatically
+        num_classes=1,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    roi_head=dict(  # update featmap_strides
+        bbox_roi_extractor=dict(featmap_strides=[8, 16, 32, 64, 128])))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),  # Slowly increase lr, otherwise loss becomes NAN
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_giou_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_giou_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..82b71d77bfc448eceadcd03a6c8cbc4c8f871109
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_giou_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            reg_decoded_bbox=True,
+            loss_bbox=dict(type='GIoULoss', loss_weight=10.0))))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_iou_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_iou_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e21c43640cb7004e8e4ef189ff8843ad39de3c6f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_iou_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            reg_decoded_bbox=True,
+            loss_bbox=dict(type='IoULoss', loss_weight=10.0))))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..75dcfeb7a2310938c05cc103fadec6c6e119b90b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_ms-3x_coco.py
@@ -0,0 +1 @@
+_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py']
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_ohem_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_ohem_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f804b9be283015d4ec349f0df664e9ca7326c96
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_ohem_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(train_cfg=dict(rcnn=dict(sampler=dict(type='OHEMSampler'))))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_soft-nms_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_soft-nms_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3775d8e447cb80c0fc28199be2abc4c23383eadd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_r50_fpn_soft-nms_1x_coco.py
@@ -0,0 +1,12 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='soft_nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..395c98cd65cd5f883c9fe206a7b9c99e59acb32e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6232d0edba51f433a930c46d03c49fc27954303f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './faster-rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..88cb40fd62a87a8af13e166df16a348c26e6d29e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py
@@ -0,0 +1,14 @@
+_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py']
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..28d6290be7a75b7cceef8957e872e221fd3e78f5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py
@@ -0,0 +1,23 @@
+_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py']
+model = dict(
+    # ResNeXt-101-32x8d model trained with Caffe2 at FB,
+    # so the mean and std need to be changed.
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[57.375, 57.120, 58.395],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f39d6322fc3a4729ea7bbfefc207a6975efb4bf4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..97a3c1338fe294f66109fa92de0d8a48686b8a09
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './faster-rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeaa218c9dc76123791d9e19b0ebae687cc296c9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py
@@ -0,0 +1,14 @@
+_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py']
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/faster_rcnn/metafile.yml b/mmde/mmdet/.mim/configs/faster_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6a201e177bad065235dd1346c1d36017c4359214
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/faster_rcnn/metafile.yml
@@ -0,0 +1,451 @@
+Collections:
+  - Name: Faster R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - RPN
+        - ResNet
+        - RoIPool
+    Paper:
+      URL: https://arxiv.org/abs/1506.01497
+      Title: "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"
+    README: configs/faster_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/faster_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: faster-rcnn_r50-caffe-c4_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50-caffe_c4-1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 35.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152-3f885b85.pth
+
+  - Name: faster-rcnn_r50-caffe-c4_mstrain_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50-caffe-c4_ms-1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 35.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527-db276fed.pth
+
+  - Name: faster-rcnn_r50-caffe-dc5_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909-531f0f43.pth
+
+  - Name: faster-rcnn_r50-caffe_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.378_20200504_180032-c5925ee5.pth
+
+  - Name: faster-rcnn_r50_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 46.73
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth
+
+  - Name: faster-rcnn_r50_fpn_fp16_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50_fpn_amp-1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.4
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      inference time (ms/im):
+        - value: 34.72
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP16
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204-d4dc1471.pth
+
+  - Name: faster-rcnn_r50_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 46.73
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth
+
+  - Name: faster-rcnn_r101-caffe_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.7
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.398_20200504_180057-b269e9dd.pth
+
+  - Name: faster-rcnn_r101_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 64.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130-f513f705.pth
+
+  - Name: faster-rcnn_r101_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 64.1
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth
+
+  - Name: faster-rcnn_x101-32x4d_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      inference time (ms/im):
+        - value: 72.46
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203-cff10310.pth
+
+  - Name: faster-rcnn_x101-32x4d_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      inference time (ms/im):
+        - value: 72.46
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.412_20200506_041400-64a12c0b.pth
+
+  - Name: faster-rcnn_x101-64x4d_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 106.38
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204-833ee192.pth
+
+  - Name: faster-rcnn_x101-64x4d_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 106.38
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033-5961fa95.pth
+
+  - Name: faster-rcnn_r50_fpn_iou_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50_fpn_iou_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco_20200506_095954-938e81f0.pth
+
+  - Name: faster-rcnn_r50_fpn_giou_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50_fpn_giou_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco-0eada910.pth
+
+  - Name: faster-rcnn_r50_fpn_bounded_iou_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50_fpn_bounded-iou_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco-98ad993b.pth
+
+  - Name: faster-rcnn_r50-caffe-dc5_mstrain_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851-b33d21b9.pth
+
+  - Name: faster-rcnn_r50-caffe-dc5_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107-34a53b2c.pth
+
+  - Name: faster-rcnn_r50-caffe_fpn_ms-2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_bbox_mAP-0.397_20200504_231813-10b2de58.pth
+
+  - Name: faster-rcnn_r50-caffe_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054-1f77628b.pth
+
+  - Name: faster-rcnn_r50_fpn_mstrain_3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822-e10bd31c.pth
+
+  - Name: faster-rcnn_r101-caffe_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742-a7ae426d.pth
+
+  - Name: faster-rcnn_r101_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r101_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.8
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822-4d4d2ca8.pth
+
+  - Name: faster-rcnn_x101-32x4d_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151-16b9b260.pth
+
+  - Name: faster-rcnn_x101-32x8d_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 10.1
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954-002e082a.pth
+
+  - Name: faster-rcnn_x101-64x4d_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 10.0
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528-26c63de6.pth
+
+  - Name: faster-rcnn_r50_fpn_tnr-pretrain_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/faster_rcnn/faster-rcnn_r50-tnr-pre_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 46.73
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.pth
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r101-caffe_fpn_gn-head-1x_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r101-caffe_fpn_gn-head-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5380e87483e494b4c0bc6d8846c6892811d581d3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r101-caffe_fpn_gn-head-1x_coco.py
@@ -0,0 +1,9 @@
+_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..286a07a2db2c6fc423f6cf039b2609ac81ede73d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py
@@ -0,0 +1,38 @@
+_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet101_caffe')))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# training schedule for 2x
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(type='ConstantLR', factor=1.0 / 3, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r101_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r101_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..77250e6917812d3494c8dabd52a3ed12f5f34483
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r101_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py'  # noqa
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r18_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r18_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f001024bb702c5ed0cb1103c5e10ae3cd7f599b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r18_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py'  # noqa
+
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a77641dd87142d5c6d508f2f4a4ba5b70db52c1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py
@@ -0,0 +1,43 @@
+_base_ = 'fcos_r50-caffe_fpn_gn-head_1x_coco.py'
+
+# model setting
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    bbox_head=dict(
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        dcn_on_last_conv=False,
+        center_sampling=True,
+        conv_bias=True,
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
+    # training and testing settings
+    test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6)))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3.0,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(clip_grad=None)
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head-center_1x_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head-center_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e4eb1d5981761fab8fe0bb876ff7ef243ac31f9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head-center_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py'
+
+# model settings
+model = dict(bbox_head=dict(center_sampling=True, center_sample_radius=1.5))
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..928a9b4c92d217822179c0ae00ae50f6f74289b1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py
@@ -0,0 +1,75 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='FCOS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[102.9801, 115.9465, 122.7717],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet50_caffe')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',  # use P5
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='FCOSHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # testing settings
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.5),
+        max_per_img=100))
+
+# learning rate
+param_scheduler = [
+    dict(type='ConstantLR', factor=1.0 / 3, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..32358cd3c69800874aa77ba5746ffc0d6f3a219d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py
@@ -0,0 +1,5 @@
+# TODO: Remove this config after benchmarking all related configs
+_base_ = 'fcos_r50-caffe_fpn_gn-head_1x_coco.py'
+
+# dataset settings
+train_dataloader = dict(batch_size=4, num_workers=4)
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d50b4ec6c4a10b07cbf73475e7af545b058605c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py
@@ -0,0 +1,30 @@
+_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py'
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# training schedule for 2x
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(type='ConstantLR', factor=1.0 / 3, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6a6c44f9b4213601b447bc02720e24dc86a53d9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = 'fcos_r50-caffe_fpn_gn-head_1x_coco.py'
+
+# model settings
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    bbox_head=dict(
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        dcn_on_last_conv=True,
+        center_sampling=True,
+        conv_bias=True,
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
+    # training and testing settings
+    test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6)))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3.0,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(clip_grad=None)
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b51556b8eb7f844866d7acff5c7b86c08cb2a054
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,75 @@
+_base_ = '../common/lsj-200e_coco-detection.py'
+
+image_size = (1024, 1024)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+# model settings
+model = dict(
+    type='FCOS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32,
+        batch_augments=batch_augments),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',  # use P5
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='FCOSHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        dcn_on_last_conv=False,
+        center_sampling=True,
+        conv_bias=True,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # testing settings
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+train_dataloader = dict(batch_size=8, num_workers=4)
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    optimizer=dict(
+        type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/fcos/fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py b/mmde/mmdet/.mim/configs/fcos/fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..503c0e1ce79bdbc9f2a32cc65f977b0f1e968927
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py
@@ -0,0 +1,52 @@
+_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py'
+
+# model settings
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# training schedule for 2x
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(type='ConstantLR', factor=1.0 / 3, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/fcos/metafile.yml b/mmde/mmdet/.mim/configs/fcos/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fb6527cf2d418762ae1a4a9298ade3da54ece5df
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fcos/metafile.yml
@@ -0,0 +1,146 @@
+Collections:
+  - Name: FCOS
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - Group Normalization
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.01355
+      Title: 'FCOS: Fully Convolutional One-Stage Object Detection'
+    README: configs/fcos/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/fcos.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: fcos_r50-caffe_fpn_gn-head_1x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.6
+      inference time (ms/im):
+        - value: 44.05
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth
+
+  - Name: fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth
+
+  - Name: fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco-ae4d8b3d.pth
+
+  - Name: fcos_r101-caffe_fpn_gn-head-1x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r101-caffe_fpn_gn-head-1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      inference time (ms/im):
+        - value: 57.8
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/fcos_r101_caffe_fpn_gn-head_1x_coco-0e37b982.pth
+
+  - Name: fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py
+    Metadata:
+      Training Memory (GB): 2.6
+      inference time (ms/im):
+        - value: 43.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco-d92ceeea.pth
+
+  - Name: fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      inference time (ms/im):
+        - value: 57.8
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco-511424d6.pth
+
+  - Name: fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco
+    In Collection: FCOS
+    Config: configs/fcos/fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.0
+      inference time (ms/im):
+        - value: 103.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco-ede514a8.pth
diff --git a/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_4xb4-1x_coco.py b/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_4xb4-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e8ccf910e6317bf576463fa26bfcb330b6ff385
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_4xb4-1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fovea_r50_fpn_4xb4-1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dc98515e62b2dba225e822850229f0a2f802d63
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_4xb4-2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fovea_r50_fpn_4xb4-2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..222671d49d1e3fbc31285e4f13487d86642ebbe3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py
@@ -0,0 +1,23 @@
+_base_ = './fovea_r50_fpn_4xb4-1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        with_deform=True,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+# learning policy
+max_epochs = 24
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1852d581fcbdd9a1459291fc7f65e51041aa4e6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/foveabox/fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py
@@ -0,0 +1,34 @@
+_base_ = './fovea_r50_fpn_4xb4-1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        with_deform=True,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+# learning policy
+max_epochs = 24
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_4xb4-1x_coco.py b/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_4xb4-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..13cf3ae92b0d2bfd1d84f032f7b202430f095a6a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_4xb4-1x_coco.py
@@ -0,0 +1,59 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='FOVEA',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        num_outs=5,
+        add_extra_convs='on_input'),
+    bbox_head=dict(
+        type='FoveaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        base_edge_list=[16, 32, 64, 128, 256],
+        scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)),
+        sigma=0.4,
+        with_deform=False,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=1.50,
+            alpha=0.4,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        nms_pre=1000,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.5),
+        max_per_img=100))
+train_dataloader = dict(batch_size=4, num_workers=4)
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9d06ef9f9ba89f202ef13176af39df7e89cb5e6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_4xb4-2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './fovea_r50_fpn_4xb4-1x_coco.py'
+# learning policy
+max_epochs = 24
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..877bb4fa4e1c03190a05da4e95558d8534e5e6e8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py
@@ -0,0 +1,20 @@
+_base_ = './fovea_r50_fpn_4xb4-1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        with_deform=True,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+# learning policy
+max_epochs = 24
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5690bcae08cd0e639afe3c832a46f78036324c08
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/foveabox/fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py
@@ -0,0 +1,30 @@
+_base_ = './fovea_r50_fpn_4xb4-1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        with_deform=True,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+# learning policy
+max_epochs = 24
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/foveabox/metafile.yml b/mmde/mmdet/.mim/configs/foveabox/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9ab2f5420323a9eb8c2ace386485c34277d53213
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/foveabox/metafile.yml
@@ -0,0 +1,172 @@
+Collections:
+  - Name: FoveaBox
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 4x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.03797
+      Title: 'FoveaBox: Beyond Anchor-based Object Detector'
+    README: configs/foveabox/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/fovea.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: fovea_r50_fpn_4xb4-1x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r50_fpn_4xb4-1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      inference time (ms/im):
+        - value: 41.49
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219-ee4d5303.pth
+
+  - Name: fovea_r50_fpn_4xb4-2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r50_fpn_4xb4-2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.6
+      inference time (ms/im):
+        - value: 41.49
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203-2df792b1.pth
+
+  - Name: fovea_r50_fpn_gn-head-align_4xb4-2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py
+    Metadata:
+      Training Memory (GB): 8.1
+      inference time (ms/im):
+        - value: 51.55
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203-8987880d.pth
+
+  - Name: fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py
+    Metadata:
+      Training Memory (GB): 8.1
+      inference time (ms/im):
+        - value: 54.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205-85ce26cb.pth
+
+  - Name: fovea_r101_fpn_4xb4-1x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r101_fpn_4xb4-1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      inference time (ms/im):
+        - value: 57.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219-05e38f1c.pth
+
+  - Name: fovea_r101_fpn_4xb4-2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r101_fpn_4xb4-2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.7
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208-02320ea4.pth
+
+  - Name: fovea_r101_fpn_gn-head-align_4xb4-2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.7
+      inference time (ms/im):
+        - value: 68.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208-c39a027a.pth
+
+  - Name: fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco
+    In Collection: FoveaBox
+    Config: configs/foveabox/fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.7
+      inference time (ms/im):
+        - value: 68.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208-649c5eb6.pth
diff --git a/mmde/mmdet/.mim/configs/fpg/faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py b/mmde/mmdet/.mim/configs/fpg/faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb9160f5cc7e118069d7172573018515aa406331
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fpg/faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py
@@ -0,0 +1,9 @@
+_base_ = 'faster-rcnn_r50_fpg_crop640-50e_coco.py'
+
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    neck=dict(out_channels=128, inter_channels=128),
+    rpn_head=dict(in_channels=128),
+    roi_head=dict(
+        bbox_roi_extractor=dict(out_channels=128),
+        bbox_head=dict(in_channels=128)))
diff --git a/mmde/mmdet/.mim/configs/fpg/faster-rcnn_r50_fpg_crop640-50e_coco.py b/mmde/mmdet/.mim/configs/fpg/faster-rcnn_r50_fpg_crop640-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0d366f1f30e5bcc6d52010c46d60183b56386ea
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fpg/faster-rcnn_r50_fpg_crop640-50e_coco.py
@@ -0,0 +1,48 @@
+_base_ = 'faster-rcnn_r50_fpn_crop640-50e_coco.py'
+
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    neck=dict(
+        type='FPG',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        inter_channels=256,
+        num_outs=5,
+        stack_times=9,
+        paths=['bu'] * 9,
+        same_down_trans=None,
+        same_up_trans=dict(
+            type='conv',
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_lateral_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_down_trans=dict(
+            type='interpolation_conv',
+            mode='nearest',
+            kernel_size=3,
+            norm_cfg=norm_cfg,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        across_up_trans=None,
+        across_skip_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        output_trans=dict(
+            type='last_conv',
+            kernel_size=3,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        norm_cfg=norm_cfg,
+        skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()]))
diff --git a/mmde/mmdet/.mim/configs/fpg/faster-rcnn_r50_fpn_crop640-50e_coco.py b/mmde/mmdet/.mim/configs/fpg/faster-rcnn_r50_fpn_crop640-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..46211de03f34e6a9709a9cfa8561b88a90f69581
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fpg/faster-rcnn_r50_fpn_crop640-50e_coco.py
@@ -0,0 +1,73 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+norm_cfg = dict(type='BN', requires_grad=True)
+image_size = (640, 640)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+model = dict(
+    data_preprocessor=dict(pad_size_divisor=64, batch_augments=batch_augments),
+    backbone=dict(norm_cfg=norm_cfg, norm_eval=False),
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(bbox_head=dict(norm_cfg=norm_cfg)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        allow_negative_crop=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=image_size, keep_ratio=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# learning policy
+max_epochs = 50
+train_cfg = dict(max_epochs=max_epochs, val_interval=2)
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[30, 40],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001),
+    paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True),
+    clip_grad=None)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/fpg/mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py b/mmde/mmdet/.mim/configs/fpg/mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..804393966c6711a1e5261ace00e9b8b84283fde5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fpg/mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py
@@ -0,0 +1,10 @@
+_base_ = 'mask-rcnn_r50_fpg_crop640-50e_coco.py'
+
+model = dict(
+    neck=dict(out_channels=128, inter_channels=128),
+    rpn_head=dict(in_channels=128),
+    roi_head=dict(
+        bbox_roi_extractor=dict(out_channels=128),
+        bbox_head=dict(in_channels=128),
+        mask_roi_extractor=dict(out_channels=128),
+        mask_head=dict(in_channels=128)))
diff --git a/mmde/mmdet/.mim/configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py b/mmde/mmdet/.mim/configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..135bb60bb340c40a47a9bd64e5a8afc57ede60db
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py
@@ -0,0 +1,48 @@
+_base_ = 'mask-rcnn_r50_fpn_crop640-50e_coco.py'
+
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    neck=dict(
+        type='FPG',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        inter_channels=256,
+        num_outs=5,
+        stack_times=9,
+        paths=['bu'] * 9,
+        same_down_trans=None,
+        same_up_trans=dict(
+            type='conv',
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_lateral_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_down_trans=dict(
+            type='interpolation_conv',
+            mode='nearest',
+            kernel_size=3,
+            norm_cfg=norm_cfg,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        across_up_trans=None,
+        across_skip_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        output_trans=dict(
+            type='last_conv',
+            kernel_size=3,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        norm_cfg=norm_cfg,
+        skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()]))
diff --git a/mmde/mmdet/.mim/configs/fpg/mask-rcnn_r50_fpn_crop640-50e_coco.py b/mmde/mmdet/.mim/configs/fpg/mask-rcnn_r50_fpn_crop640-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..08ca5b6ffd8b9d166857d3c27bb6f5bde91416cc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fpg/mask-rcnn_r50_fpn_crop640-50e_coco.py
@@ -0,0 +1,79 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+norm_cfg = dict(type='BN', requires_grad=True)
+image_size = (640, 640)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+model = dict(
+    data_preprocessor=dict(pad_size_divisor=64, batch_augments=batch_augments),
+    backbone=dict(norm_cfg=norm_cfg, norm_eval=False),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        norm_cfg=norm_cfg,
+        num_outs=5),
+    roi_head=dict(
+        bbox_head=dict(norm_cfg=norm_cfg), mask_head=dict(norm_cfg=norm_cfg)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        allow_negative_crop=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=image_size, keep_ratio=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# learning policy
+max_epochs = 50
+train_cfg = dict(max_epochs=max_epochs, val_interval=2)
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[30, 40],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001),
+    paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True),
+    clip_grad=None)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/fpg/metafile.yml b/mmde/mmdet/.mim/configs/fpg/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7d7634aec6161a283577059de96d5f995cf1e4bb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fpg/metafile.yml
@@ -0,0 +1,104 @@
+Collections:
+  - Name: Feature Pyramid Grids
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Feature Pyramid Grids
+    Paper:
+      URL: https://arxiv.org/abs/2004.03580
+      Title: 'Feature Pyramid Grids'
+    README: configs/fpg/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.10.0/mmdet/models/necks/fpg.py#L101
+      Version: v2.10.0
+
+Models:
+  - Name: faster-rcnn_r50_fpg_crop640-50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/faster-rcnn_r50_fpg_crop640-50e_coco.py
+    Metadata:
+      Training Memory (GB): 20.0
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856-74109f42.pth
+
+  - Name: faster-rcnn_r50_fpg-chn128_crop640-50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py
+    Metadata:
+      Training Memory (GB): 11.9
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857-9376aa9d.pth
+
+  - Name: mask-rcnn_r50_fpg_crop640-50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py
+    Metadata:
+      Training Memory (GB): 23.2
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth
+
+  - Name: mask-rcnn_r50_fpg-chn128_crop640-50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py
+    Metadata:
+      Training Memory (GB): 15.3
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859-043c9b4e.pth
+
+  - Name: retinanet_r50_fpg_crop640_50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 20.8
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809-b0bcf5f4.pth
+
+  - Name: retinanet_r50_fpg-chn128_crop640_50e_coco
+    In Collection: Feature Pyramid Grids
+    Config: configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py
+    Metadata:
+      Training Memory (GB): 19.9
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights:  https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829-ee99a686.pth
diff --git a/mmde/mmdet/.mim/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py b/mmde/mmdet/.mim/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a6cf7e56a4f23a42d3905560a9b8035d6d935ff
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py
@@ -0,0 +1,5 @@
+_base_ = 'retinanet_r50_fpg_crop640_50e_coco.py'
+
+model = dict(
+    neck=dict(out_channels=128, inter_channels=128),
+    bbox_head=dict(in_channels=128))
diff --git a/mmde/mmdet/.mim/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py b/mmde/mmdet/.mim/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2aac283992ea9e4595e7594233b21208bd672f5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py
@@ -0,0 +1,53 @@
+_base_ = '../nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py'
+
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    neck=dict(
+        _delete_=True,
+        type='FPG',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        inter_channels=256,
+        num_outs=5,
+        add_extra_convs=True,
+        start_level=1,
+        stack_times=9,
+        paths=['bu'] * 9,
+        same_down_trans=None,
+        same_up_trans=dict(
+            type='conv',
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_lateral_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        across_down_trans=dict(
+            type='interpolation_conv',
+            mode='nearest',
+            kernel_size=3,
+            norm_cfg=norm_cfg,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        across_up_trans=None,
+        across_skip_trans=dict(
+            type='conv',
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            inplace=False,
+            order=('act', 'conv', 'norm')),
+        output_trans=dict(
+            type='last_conv',
+            kernel_size=3,
+            order=('act', 'conv', 'norm'),
+            inplace=False),
+        norm_cfg=norm_cfg,
+        skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()]))
+
+train_cfg = dict(val_interval=2)
diff --git a/mmde/mmdet/.mim/configs/free_anchor/freeanchor_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/free_anchor/freeanchor_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc323d94f7aa20b38e2204a38ed8e234dd4eadd1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/free_anchor/freeanchor_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './freeanchor_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/free_anchor/freeanchor_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/free_anchor/freeanchor_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..13f64d14a1ead0431549b8569d031f72669a2e84
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/free_anchor/freeanchor_r50_fpn_1x_coco.py
@@ -0,0 +1,22 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        _delete_=True,
+        type='FreeAnchorRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.75)))
+
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/free_anchor/freeanchor_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/free_anchor/freeanchor_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e448bc1123115d37ef9f21a33c8a6b38cd821c3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/free_anchor/freeanchor_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './freeanchor_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/free_anchor/metafile.yml b/mmde/mmdet/.mim/configs/free_anchor/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..cff19db6c957c2cdc09c1f76ff230c3a611bfc01
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/free_anchor/metafile.yml
@@ -0,0 +1,79 @@
+Collections:
+  - Name: FreeAnchor
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FreeAnchor
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1909.02466
+      Title: 'FreeAnchor: Learning to Match Anchors for Visual Object Detection'
+    README: configs/free_anchor/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/dense_heads/free_anchor_retina_head.py#L10
+      Version: v2.0.0
+
+Models:
+  - Name: freeanchor_r50_fpn_1x_coco
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/freeanchor_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.9
+      inference time (ms/im):
+        - value: 54.35
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth
+
+  - Name: freeanchor_r101_fpn_1x_coco
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/freeanchor_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.8
+      inference time (ms/im):
+        - value: 67.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130-358324e6.pth
+
+  - Name: freeanchor_x101-32x4d_fpn_1x_coco
+    In Collection: FreeAnchor
+    Config: configs/free_anchor/freeanchor_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.1
+      inference time (ms/im):
+        - value: 90.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130-d4846968.pth
diff --git a/mmde/mmdet/.mim/configs/fsaf/fsaf_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/fsaf/fsaf_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..12b49fed5b6cd617aa9c05d76ed737d755992a34
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fsaf/fsaf_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './fsaf_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/fsaf/fsaf_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/fsaf/fsaf_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7165cd63c74ab27ff47f8255836f4c10158cf0e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fsaf/fsaf_r50_fpn_1x_coco.py
@@ -0,0 +1,47 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    type='FSAF',
+    bbox_head=dict(
+        type='FSAFHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        reg_decoded_bbox=True,
+        # Only anchor-free branch is implemented. The anchor generator only
+        #  generates 1 anchor at each feature point, as a substitute of the
+        #  grid of features.
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=1,
+            scales_per_octave=1,
+            ratios=[1.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(_delete_=True, type='TBLRBBoxCoder', normalizer=4.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0,
+            reduction='none'),
+        loss_bbox=dict(
+            _delete_=True,
+            type='IoULoss',
+            eps=1e-6,
+            loss_weight=1.0,
+            reduction='none')),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            _delete_=True,
+            type='CenterRegionAssigner',
+            pos_scale=0.2,
+            neg_scale=0.2,
+            min_pos_iof=0.01),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+
+optim_wrapper = dict(clip_grad=dict(max_norm=10, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/fsaf/fsaf_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/fsaf/fsaf_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..89c0c6344aba6e6eae5657eff60745645dd1e8dc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fsaf/fsaf_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './fsaf_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/fsaf/metafile.yml b/mmde/mmdet/.mim/configs/fsaf/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..daaad0d3a864b52df618a95a63c6caeaa1fd76ec
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/fsaf/metafile.yml
@@ -0,0 +1,80 @@
+Collections:
+  - Name: FSAF
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x Titan-XP GPUs
+      Architecture:
+        - FPN
+        - FSAF
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1903.00621
+      Title: 'Feature Selective Anchor-Free Module for Single-Shot Object Detection'
+    README: configs/fsaf/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/detectors/fsaf.py#L6
+      Version: v2.1.0
+
+Models:
+  - Name: fsaf_r50_fpn_1x_coco
+    In Collection: FSAF
+    Config: configs/fsaf/fsaf_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.15
+      inference time (ms/im):
+        - value: 76.92
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco-94ccc51f.pth
+
+  - Name: fsaf_r101_fpn_1x_coco
+    In Collection: FSAF
+    Config: configs/fsaf/fsaf_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.08
+      inference time (ms/im):
+        - value: 92.59
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco-9e71098f.pth
+
+  - Name: fsaf_x101-64x4d_fpn_1x_coco
+    In Collection: FSAF
+    Config: configs/fsaf/fsaf_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.38
+      inference time (ms/im):
+        - value: 178.57
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco-e3f6e6fd.pth
diff --git a/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf605b666e460aee48adc629b0604af4c64e306
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..95fc687b664b25b754d4ba890ae9c9e982db65fb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b77dc9315f52f9437eb1e39f6d518f1afaa41bb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f97972aa2b7d151d5824de40da9cedae9c57535
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8404cfdaf34e470d2bff57a707ca8183fe442131
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..87667dee779ee8068075be17638a6d10a9985c7e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..447e2c6d858738db0f0d2e46e57e1fccd2233af3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16),
+            stages=(False, True, True, True),
+            position='after_conv3')
+    ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c723a64b6f686b9dd0f8e7648c7b1b303205168
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 4),
+            stages=(False, True, True, True),
+            position='after_conv3')
+    ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f9d03d3f8d94116b4814825ad8377b534a912b1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d07cb0d488c0df76a137bad54123a7583c7da87b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-syncbn_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-syncbn_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..957bdf55470017d9ac9fa482b416c2206266af86
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r101-syncbn_fpn_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9ec5ac3baf7c46ea95d4c3fcf4f5da4ad7a3dce
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 16),
+            stages=(False, True, True, True),
+            position='after_conv3')
+    ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..42474d5196a8a130999db735989b423664486304
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(plugins=[
+        dict(
+            cfg=dict(type='ContextBlock', ratio=1. / 4),
+            stages=(False, True, True, True),
+            position='after_conv3')
+    ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac1928082405baebfe5ec483f37b9775da21d5ad
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae29f0cebe4f9fe16f2fea3de53874914186da9b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8ef27bad9743cba8f7134f1a77a091af1bca093
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a2e2c9f26b25c5aefba912997cd01db60854a5e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 16),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..65d3f9aadf5f79a4fb9fc9082dfabfdb3de08871
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = '../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        plugins=[
+            dict(
+                cfg=dict(type='ContextBlock', ratio=1. / 4),
+                stages=(False, True, True, True),
+                position='after_conv3')
+        ]))
diff --git a/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5343a6d4596eb82245ef078d36a5a6ce5137aeb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = '../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False))
diff --git a/mmde/mmdet/.mim/configs/gcnet/metafile.yml b/mmde/mmdet/.mim/configs/gcnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..075a94c8fbf4c5f629d9343cc841f94f18472195
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gcnet/metafile.yml
@@ -0,0 +1,440 @@
+Collections:
+  - Name: GCNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Global Context Block
+        - FPN
+        - RPN
+        - ResNet
+        - ResNeXt
+    Paper:
+      URL: https://arxiv.org/abs/1904.11492
+      Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
+    README: configs/gcnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/context_block.py#L13
+      Version: v2.0.0
+
+Models:
+  - Name: mask-rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  35.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915-187da160.pth
+
+  - Name: mask-rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      inference time (ms/im):
+        - value: 66.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204-17235656.pth
+
+  - Name: mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 87.72
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205-e58ae947.pth
+
+  - Name: mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      inference time (ms/im):
+        - value: 86.21
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206-af22dc9d.pth
+
+  - Name: mask-rcnn_r50_fpn_syncbn-backbone_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 60.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  34.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202-bb3eb55c.pth
+
+  - Name: mask-rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      inference time (ms/im):
+        - value: 64.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202-587b99aa.pth
+
+  - Name: mask-rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      inference time (ms/im):
+        - value: 66.23
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth
+
+  - Name: mask-rcnn_r101-syncbn_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r101-syncbn_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 75.19
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210-81658c8a.pth
+
+  - Name: mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 83.33
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207-945e77ca.pth
+
+  - Name: mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      inference time (ms/im):
+        - value: 84.75
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth
+
+  - Name: mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211-7584841c.pth
+
+  - Name: mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.8
+      inference time (ms/im):
+        - value: 102.04
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-cbed3d2c.pth
+
+  - Name: mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      inference time (ms/im):
+        - value: 103.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212-68164964.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.2
+      inference time (ms/im):
+        - value: 119.05
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310-d5ad2a5e.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 129.87
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-10bf2463.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.6
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:    40.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653-ed035291.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  40.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019-abbc39ea.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648-44aa598a.pth
+
+  - Name: cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco
+    In Collection: GCNet
+    Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:   41.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851-720338ec.pth
diff --git a/mmde/mmdet/.mim/configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f748935b62884fd501af7e6731ad3ef6ce0effb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './gfl_r50_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/gfl/gfl_r101_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/gfl/gfl_r101_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..10135f161b9e933612d961af12a8e30198cca484
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gfl/gfl_r101_fpn_ms-2x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './gfl_r50_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/gfl/gfl_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/gfl/gfl_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..902382552d58f124bbe2b8c2904ce74ec7b7a4d8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gfl/gfl_r50_fpn_1x_coco.py
@@ -0,0 +1,66 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='GFL',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='GFLHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            beta=2.0,
+            loss_weight=1.0),
+        loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25),
+        reg_max=16,
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/gfl/gfl_r50_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/gfl/gfl_r50_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..22770eb101920f9daae750a1b72f5410be395743
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gfl/gfl_r50_fpn_ms-2x_coco.py
@@ -0,0 +1,28 @@
+_base_ = './gfl_r50_fpn_1x_coco.py'
+max_epochs = 24
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
+
+# multi-scale training
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 480), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/gfl/gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/gfl/gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aa98eea2d0d25b4df1570aed97cce8475e9104d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gfl/gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './gfl_r50_fpn_ms-2x_coco.py'
+model = dict(
+    type='GFL',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec629b1f0d5d3317dcb20f1244bc713818518d8a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './gfl_r50_fpn_ms-2x_coco.py'
+model = dict(
+    type='GFL',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/gfl/metafile.yml b/mmde/mmdet/.mim/configs/gfl/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..183fc14bdee0492c7ea3fc18ccb7371682dc0066
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gfl/metafile.yml
@@ -0,0 +1,134 @@
+Collections:
+  - Name: Generalized Focal Loss
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Generalized Focal Loss
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2006.04388
+      Title: 'Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection'
+    README: configs/gfl/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/detectors/gfl.py#L6
+      Version: v2.2.0
+
+Models:
+  - Name: gfl_r50_fpn_1x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_r50_fpn_1x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 51.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth
+
+  - Name: gfl_r50_fpn_ms-2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_r50_fpn_ms-2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 51.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802-37bb1edc.pth
+
+  - Name: gfl_r101_fpn_ms-2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_r101_fpn_ms-2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 68.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth
+
+  - Name: gfl_r101-dconv-c3-c5_fpn_ms-2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth
+
+  - Name: gfl_x101-32x4d_fpn_ms-2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 82.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002-50c1ffdb.pth
+
+  - Name: gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco
+    In Collection: Generalized Focal Loss
+    Config: configs/gfl/gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 93.46
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002-14a2bf25.pth
diff --git a/mmde/mmdet/.mim/configs/ghm/metafile.yml b/mmde/mmdet/.mim/configs/ghm/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..63cb48ffe7323686c38fcb279dde9ee6387e9be7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ghm/metafile.yml
@@ -0,0 +1,101 @@
+Collections:
+  - Name: GHM
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - GHM-C
+        - GHM-R
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1811.05181
+      Title: 'Gradient Harmonized Single-stage Detector'
+    README: configs/ghm/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/losses/ghm_loss.py#L21
+      Version: v2.0.0
+
+Models:
+  - Name: retinanet_r50_fpn_ghm-1x_coco
+    In Collection: GHM
+    Config: configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 303.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130-a437fda3.pth
+
+  - Name: retinanet_r101_fpn_ghm-1x_coco
+    In Collection: GHM
+    Config: configs/ghm/retinanet_r101_fpn_ghm-1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 227.27
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130-c148ee8f.pth
+
+  - Name: retinanet_x101-32x4d_fpn_ghm-1x_coco
+    In Collection: GHM
+    Config: configs/ghm/retinanet_x101-32x4d_fpn_ghm-1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      inference time (ms/im):
+        - value: 196.08
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131-e4333bd0.pth
+
+  - Name: retinanet_x101-64x4d_fpn_ghm-1x_coco
+    In Collection: GHM
+    Config: configs/ghm/retinanet_x101-64x4d_fpn_ghm-1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 192.31
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131-dd381cef.pth
diff --git a/mmde/mmdet/.mim/configs/ghm/retinanet_r101_fpn_ghm-1x_coco.py b/mmde/mmdet/.mim/configs/ghm/retinanet_r101_fpn_ghm-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..090221e68f68a95cfcf092b15f2636cd28fc9d87
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ghm/retinanet_r101_fpn_ghm-1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './retinanet_r50_fpn_ghm-1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py b/mmde/mmdet/.mim/configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..42b9aa6d05dc64f3045685a7c23d632a6041249c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        loss_cls=dict(
+            _delete_=True,
+            type='GHMC',
+            bins=30,
+            momentum=0.75,
+            use_sigmoid=True,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            _delete_=True,
+            type='GHMR',
+            mu=0.02,
+            bins=10,
+            momentum=0.7,
+            loss_weight=10.0)))
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/ghm/retinanet_x101-32x4d_fpn_ghm-1x_coco.py b/mmde/mmdet/.mim/configs/ghm/retinanet_x101-32x4d_fpn_ghm-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1240545a624a70c7122829e85b426cafcc3f42d2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ghm/retinanet_x101-32x4d_fpn_ghm-1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_ghm-1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/ghm/retinanet_x101-64x4d_fpn_ghm-1x_coco.py b/mmde/mmdet/.mim/configs/ghm/retinanet_x101-64x4d_fpn_ghm-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..689d2edcdf1bdffa52ee3aa3a8a4dac7988f6fa5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ghm/retinanet_x101-64x4d_fpn_ghm-1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_ghm-1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/glip/flickr30k/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg_zeroshot_flickr30k.py b/mmde/mmdet/.mim/configs/glip/flickr30k/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg_zeroshot_flickr30k.py
new file mode 100644
index 0000000000000000000000000000000000000000..14d6e8aaa6372a5272467dd46d33e80979298efc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/flickr30k/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg_zeroshot_flickr30k.py
@@ -0,0 +1,61 @@
+_base_ = '../glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py'
+
+lang_model_name = 'bert-base-uncased'
+
+model = dict(bbox_head=dict(early_fuse=True))
+
+dataset_type = 'Flickr30kDataset'
+data_root = 'data/flickr30k_entities/'
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive', 'phrase_ids', 'phrases'))
+]
+
+dataset_Flickr30k_val = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='final_flickr_separateGT_val.json',
+    data_prefix=dict(img='flickr30k_images/'),
+    pipeline=test_pipeline,
+)
+
+dataset_Flickr30k_test = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='final_flickr_separateGT_test.json',
+    data_prefix=dict(img='flickr30k_images/'),
+    pipeline=test_pipeline,
+)
+
+val_evaluator_Flickr30k = dict(type='Flickr30kMetric', )
+
+test_evaluator_Flickr30k = dict(type='Flickr30kMetric', )
+
+# ----------Config---------- #
+dataset_prefixes = ['Flickr30kVal', 'Flickr30kTest']
+datasets = [dataset_Flickr30k_val, dataset_Flickr30k_test]
+metrics = [val_evaluator_Flickr30k, test_evaluator_Flickr30k]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..92a85a11d57b6d3d64bfed5f9a691bca739d7ce3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
@@ -0,0 +1,14 @@
+_base_ = './glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py'
+
+model = dict(
+    backbone=dict(
+        embed_dims=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        drop_path_rate=0.4,
+    ),
+    neck=dict(in_channels=[384, 768, 1536]),
+    bbox_head=dict(early_fuse=True, num_dyhead_blocks=8, use_checkpoint=True))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_l_mmdet-abfe026b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py
new file mode 100644
index 0000000000000000000000000000000000000000..546ecfe1d513b4161322f5ffa0e51d01b2775780
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py
@@ -0,0 +1,12 @@
+_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py'
+
+model = dict(
+    backbone=dict(
+        embed_dims=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        drop_path_rate=0.4,
+    ),
+    neck=dict(in_channels=[384, 768, 1536]),
+    bbox_head=dict(early_fuse=True, num_dyhead_blocks=8))
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b280657b315c77dd118ab84880d97dc882102a1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
@@ -0,0 +1,155 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_a_mmdet-b3654169.pth'  # noqa
+lang_model_name = 'bert-base-uncased'
+
+model = dict(
+    type='GLIP',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=False,
+        convert_weights=False),
+    neck=dict(
+        type='FPN_DropBlock',
+        in_channels=[192, 384, 768],
+        out_channels=256,
+        start_level=0,
+        relu_before_extra_convs=True,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='ATSSVLFusionHead',
+        lang_model_name=lang_model_name,
+        num_classes=80,
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128],
+            center_offset=0.5),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoderForGLIP',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    language_model=dict(type='BertModel', name=lang_model_name),
+    train_cfg=dict(
+        assigner=dict(
+            type='ATSSAssigner',
+            topk=9,
+            iou_calculator=dict(type='BboxOverlaps2D_GLIP')),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# dataset settings
+train_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        imdecode_backend='pillow',
+        backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='GTBoxSubOne_GLIP'),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 480), (1333, 560), (1333, 640), (1333, 720),
+                (1333, 800)],
+        keep_ratio=True,
+        resize_type='FixScaleResize',
+        backend='pillow'),
+    dict(type='RandomFlip_GLIP', prob=0.5),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        backend_args=_base_.backend_args,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=_base_.dataset_type,
+            data_root=_base_.data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            return_classes=True,
+            backend_args=_base_.backend_args)))
+
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, return_classes=True))
+test_dataloader = val_dataloader
+
+# We did not adopt the official 24e optimizer strategy
+# because the results indicate that the current strategy is superior.
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW', lr=0.00002, betas=(0.9, 0.999), weight_decay=0.05),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'relative_position_bias_table': dict(decay_mult=0.),
+            'norm': dict(decay_mult=0.)
+        }),
+    clip_grad=None)
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py
new file mode 100644
index 0000000000000000000000000000000000000000..34a818caefcbfcdd9e51ec304fb94906c20ceb9a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py
@@ -0,0 +1,90 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+lang_model_name = 'bert-base-uncased'
+
+model = dict(
+    type='GLIP',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=False,
+        convert_weights=False),
+    neck=dict(
+        type='FPN',
+        in_channels=[192, 384, 768],
+        out_channels=256,
+        start_level=0,
+        relu_before_extra_convs=True,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='ATSSVLFusionHead',
+        lang_model_name=lang_model_name,
+        num_classes=80,
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128],
+            center_offset=0.5),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoderForGLIP',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+    ),
+    language_model=dict(type='BertModel', name=lang_model_name),
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        backend_args=_base_.backend_args,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities'))
+]
+
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, return_classes=True))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3487de3f3a24077f475e8451722d1b4d252a0084
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
@@ -0,0 +1,9 @@
+_base_ = './glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py'
+
+model = dict(bbox_head=dict(early_fuse=True, use_checkpoint=True))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_b_mmdet-6dfbd102.pth'  # noqa
+
+optim_wrapper = dict(
+    optimizer=dict(lr=0.00001),
+    clip_grad=dict(_delete_=True, max_norm=1, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py
new file mode 100644
index 0000000000000000000000000000000000000000..6334e5e3b4043a81d154fc03a94594d93d74aed5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py
@@ -0,0 +1,3 @@
+_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py'
+
+model = dict(bbox_head=dict(early_fuse=True))
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c315e490e7a7e05a6334d4d38ce9be9b70851b3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
@@ -0,0 +1,3 @@
+_base_ = './glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py'
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_c_mmdet-2fc427dd.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py
new file mode 100644
index 0000000000000000000000000000000000000000..24898f4df532cc2e2728265800d2f6a030e8efe0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py
@@ -0,0 +1 @@
+_base_ = './glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py'
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3391272e608e8098773a6435550e578f462ed886
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
@@ -0,0 +1,3 @@
+_base_ = './glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py'
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_mmdet-c24ce662.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py
new file mode 100644
index 0000000000000000000000000000000000000000..24898f4df532cc2e2728265800d2f6a030e8efe0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py
@@ -0,0 +1 @@
+_base_ = './glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py'
diff --git a/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-l_fpn_dyhead_pretrain_zeroshot_lvis.py b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-l_fpn_dyhead_pretrain_zeroshot_lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f79e447d3f24e364739740be504bb234adc1e98
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-l_fpn_dyhead_pretrain_zeroshot_lvis.py
@@ -0,0 +1,12 @@
+_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_lvis.py'
+
+model = dict(
+    backbone=dict(
+        embed_dims=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        drop_path_rate=0.4,
+    ),
+    neck=dict(in_channels=[384, 768, 1536]),
+    bbox_head=dict(early_fuse=True, num_dyhead_blocks=8))
diff --git a/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-l_fpn_dyhead_pretrain_zeroshot_mini-lvis.py b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-l_fpn_dyhead_pretrain_zeroshot_mini-lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..13f1a69082b670632dfe3eb8dc50826549dcf59f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-l_fpn_dyhead_pretrain_zeroshot_mini-lvis.py
@@ -0,0 +1,12 @@
+_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_mini-lvis.py'
+
+model = dict(
+    backbone=dict(
+        embed_dims=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        drop_path_rate=0.4,
+    ),
+    neck=dict(in_channels=[384, 768, 1536]),
+    bbox_head=dict(early_fuse=True, num_dyhead_blocks=8))
diff --git a/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_lvis.py b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d526d59008b39996a147a2852a44d2e936113d2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_lvis.py
@@ -0,0 +1,24 @@
+_base_ = '../glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/coco/'
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type=dataset_type,
+        ann_file='annotations/lvis_od_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+# numpy < 1.24.0
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root + 'annotations/lvis_od_val.json')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_mini-lvis.py b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_mini-lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..70a57a3f581ca1c374dbae71059c7049a20d3a47
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_mini-lvis.py
@@ -0,0 +1,25 @@
+_base_ = '../glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/coco/'
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type=dataset_type,
+        ann_file='annotations/lvis_v1_minival_inserted_image_name.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+# numpy < 1.24.0
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root +
+    'annotations/lvis_v1_minival_inserted_image_name.json')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_bc_fpn_dyhead_pretrain_zeroshot_lvis.py b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_bc_fpn_dyhead_pretrain_zeroshot_lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dc712b3bcb4f8dd1018b175d3a4e7f59be3a990
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_bc_fpn_dyhead_pretrain_zeroshot_lvis.py
@@ -0,0 +1,3 @@
+_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_lvis.py'
+
+model = dict(bbox_head=dict(early_fuse=True))
diff --git a/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_bc_fpn_dyhead_pretrain_zeroshot_mini-lvis.py b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_bc_fpn_dyhead_pretrain_zeroshot_mini-lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..3babb91101a6dc283ada78911672c7c7433f67ac
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/lvis/glip_atss_swin-t_bc_fpn_dyhead_pretrain_zeroshot_mini-lvis.py
@@ -0,0 +1,3 @@
+_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_zeroshot_mini-lvis.py'
+
+model = dict(bbox_head=dict(early_fuse=True))
diff --git a/mmde/mmdet/.mim/configs/glip/metafile.yml b/mmde/mmdet/.mim/configs/glip/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fbbf718b9fff3061a4e02a7d39a6c95252beb603
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/metafile.yml
@@ -0,0 +1,111 @@
+Collections:
+  - Name: GLIP
+    Metadata:
+      Training Data: Objects365, GoldG, CC3M, SBU and COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: A100 GPUs
+      Architecture:
+        - Swin Transformer
+        - DYHead
+        - BERT
+    Paper:
+      URL: https://arxiv.org/abs/2112.03857
+      Title: 'GLIP: Grounded Language-Image Pre-training'
+    README: configs/glip/README.md
+    Code:
+      URL:
+      Version: v3.0.0
+
+Models:
+  - Name: glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_a_mmdet-b3654169.pth
+  - Name: glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_b_mmdet-6dfbd102.pth
+  - Name: glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.7
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_c_mmdet-2fc427dd.pth
+  - Name: glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_mmdet-c24ce662.pth
+  - Name: glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 51.3
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_l_mmdet-abfe026b.pth
+  - Name: glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 53.3
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_180419-e6addd96.pth
+  - Name: glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 54.1
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230916_163538-650323ba.pth
+  - Name: glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 55.2
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_182935-4ba3fc3b.pth
+  - Name: glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 55.4
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_224410-ba97be24.pth
+  - Name: glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco
+    In Collection: GLIP
+    Config: configs/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 59.4
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230910_100800-e9be4274.pth
diff --git a/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py b/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py
new file mode 100644
index 0000000000000000000000000000000000000000..d38effba8c1333a2403c6bc0f20b7fde21c4c47d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py
@@ -0,0 +1,338 @@
+_base_ = '../glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py'
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    test_mode=True,
+    pipeline=base_test_pipeline,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'penguin': {
+#         'suffix': ', which is black and white'
+#     },
+#     'puffin': {
+#         'suffix': ' with orange beaks'
+#     },
+#     'stingray': {
+#         'suffix': ' which is flat and round'
+#     },
+# }
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 CottontailRabbits---------------------#
+class_name = ('Cottontail-Rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+
+caption_prompt = None
+# caption_prompt = {'Cottontail-Rabbit': {'name': 'rabbit'}}
+
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 EgoHands---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+
+caption_prompt = None
+# caption_prompt = {'hand': {'suffix': ' of a person'}}
+
+dataset_EgoHands = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 NorthAmericaMushrooms---------------------#
+class_name = ('CoW', 'chanterelle')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+
+caption_prompt = None
+# caption_prompt = {
+#     'CoW': {
+#         'name': 'flat mushroom'
+#     },
+#     'chanterelle': {
+#         'name': 'yellow mushroom'
+#     }
+# }
+
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'package': {
+#         'prefix': 'there is a ',
+#         'suffix': ' on the porch'
+#     }
+# }
+
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'pothole': {
+#         'prefix': 'there are some ',
+#         'name': 'holes',
+#         'suffix': ' on the road'
+#     }
+# }
+
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+dataset_prefixes = [
+    'AerialMaritimeDrone', 'Aquarium', 'CottontailRabbits', 'EgoHands',
+    'NorthAmericaMushrooms', 'Packages', 'PascalVOC', 'pistols', 'pothole',
+    'Raccoon', 'ShellfishOpenImages', 'thermalDogsAndPeople',
+    'VehiclesOpenImages'
+]
+datasets = [
+    dataset_AerialMaritimeDrone, dataset_Aquarium, dataset_CottontailRabbits,
+    dataset_EgoHands, dataset_NorthAmericaMushrooms, dataset_Packages,
+    dataset_PascalVOC, dataset_pistols, dataset_pothole, dataset_Raccoon,
+    dataset_ShellfishOpenImages, dataset_thermalDogsAndPeople,
+    dataset_VehiclesOpenImages
+]
+metrics = [
+    val_evaluator_AerialMaritimeDrone, val_evaluator_Aquarium,
+    val_evaluator_CottontailRabbits, val_evaluator_EgoHands,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_Packages,
+    val_evaluator_PascalVOC, val_evaluator_pistols, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_ShellfishOpenImages,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_VehiclesOpenImages
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py b/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eaf09ed771978397b9d67048b371724418e50aa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py
@@ -0,0 +1,794 @@
+_base_ = '../glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py'
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone_large---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone_large = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_large = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 AerialMaritimeDrone_tiled---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/tiled/'
+dataset_AerialMaritimeDrone_tiled = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_tiled = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 AmericanSignLanguageLetters---------------------#
+class_name = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+              'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AmericanSignLanguageLetters/American Sign Language Letters.v1-v1.coco/'  # noqa
+dataset_AmericanSignLanguageLetters = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AmericanSignLanguageLetters = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 BCCD---------------------#
+class_name = ('Platelets', 'RBC', 'WBC')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'BCCD/BCCD.v3-raw.coco/'
+dataset_BCCD = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_BCCD = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 boggleBoards---------------------#
+class_name = ('Q', 'a', 'an', 'b', 'c', 'd', 'e', 'er', 'f', 'g', 'h', 'he',
+              'i', 'in', 'j', 'k', 'l', 'm', 'n', 'o', 'o ', 'p', 'q', 'qu',
+              'r', 's', 't', 't\\', 'th', 'u', 'v', 'w', 'wild', 'x', 'y', 'z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'boggleBoards/416x416AutoOrient/export/'
+dataset_boggleBoards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_boggleBoards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 brackishUnderwater---------------------#
+class_name = ('crab', 'fish', 'jellyfish', 'shrimp', 'small_fish', 'starfish')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'brackishUnderwater/960x540/'
+dataset_brackishUnderwater = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_brackishUnderwater = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 ChessPieces---------------------#
+class_name = ('  ', 'black bishop', 'black king', 'black knight', 'black pawn',
+              'black queen', 'black rook', 'white bishop', 'white king',
+              'white knight', 'white pawn', 'white queen', 'white rook')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+dataset_ChessPieces = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ChessPieces = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 CottontailRabbits---------------------#
+class_name = ('rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 dice---------------------#
+class_name = ('1', '2', '3', '4', '5', '6')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'dice/mediumColor/export/'
+dataset_dice = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_dice = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 DroneControl---------------------#
+class_name = ('follow', 'follow_hand', 'land', 'land_hand', 'null', 'object',
+              'takeoff', 'takeoff-hand')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'DroneControl/Drone Control.v3-raw.coco/'
+dataset_DroneControl = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_DroneControl = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 EgoHands_generic---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+caption_prompt = {'hand': {'suffix': ' of a person'}}
+dataset_EgoHands_generic = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_generic = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 EgoHands_specific---------------------#
+class_name = ('myleft', 'myright', 'yourleft', 'yourright')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/specific/'
+dataset_EgoHands_specific = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_specific = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------14 HardHatWorkers---------------------#
+class_name = ('head', 'helmet', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'HardHatWorkers/raw/'
+dataset_HardHatWorkers = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_HardHatWorkers = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------15 MaskWearing---------------------#
+class_name = ('mask', 'no-mask')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MaskWearing/raw/'
+dataset_MaskWearing = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MaskWearing = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------16 MountainDewCommercial---------------------#
+class_name = ('bottle', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MountainDewCommercial/'
+dataset_MountainDewCommercial = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MountainDewCommercial = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------17 NorthAmericaMushrooms---------------------#
+class_name = ('flat mushroom', 'yellow mushroom')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------18 openPoetryVision---------------------#
+class_name = ('American Typewriter', 'Andale Mono', 'Apple Chancery', 'Arial',
+              'Avenir', 'Baskerville', 'Big Caslon', 'Bradley Hand',
+              'Brush Script MT', 'Chalkboard', 'Comic Sans MS', 'Copperplate',
+              'Courier', 'Didot', 'Futura', 'Geneva', 'Georgia', 'Gill Sans',
+              'Helvetica', 'Herculanum', 'Impact', 'Kefa', 'Lucida Grande',
+              'Luminari', 'Marker Felt', 'Menlo', 'Monaco', 'Noteworthy',
+              'Optima', 'PT Sans', 'PT Serif', 'Palatino', 'Papyrus',
+              'Phosphate', 'Rockwell', 'SF Pro', 'SignPainter', 'Skia',
+              'Snell Roundhand', 'Tahoma', 'Times New Roman', 'Trebuchet MS',
+              'Verdana')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'openPoetryVision/512x512/'
+dataset_openPoetryVision = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_openPoetryVision = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------19 OxfordPets_by_breed---------------------#
+class_name = ('cat-Abyssinian', 'cat-Bengal', 'cat-Birman', 'cat-Bombay',
+              'cat-British_Shorthair', 'cat-Egyptian_Mau', 'cat-Maine_Coon',
+              'cat-Persian', 'cat-Ragdoll', 'cat-Russian_Blue', 'cat-Siamese',
+              'cat-Sphynx', 'dog-american_bulldog',
+              'dog-american_pit_bull_terrier', 'dog-basset_hound',
+              'dog-beagle', 'dog-boxer', 'dog-chihuahua',
+              'dog-english_cocker_spaniel', 'dog-english_setter',
+              'dog-german_shorthaired', 'dog-great_pyrenees', 'dog-havanese',
+              'dog-japanese_chin', 'dog-keeshond', 'dog-leonberger',
+              'dog-miniature_pinscher', 'dog-newfoundland', 'dog-pomeranian',
+              'dog-pug', 'dog-saint_bernard', 'dog-samoyed',
+              'dog-scottish_terrier', 'dog-shiba_inu',
+              'dog-staffordshire_bull_terrier', 'dog-wheaten_terrier',
+              'dog-yorkshire_terrier')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-breed/'  # noqa
+dataset_OxfordPets_by_breed = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_breed = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------20 OxfordPets_by_species---------------------#
+class_name = ('cat', 'dog')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-species/'  # noqa
+dataset_OxfordPets_by_species = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_species = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------21 PKLot---------------------#
+class_name = ('space-empty', 'space-occupied')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PKLot/640/'  # noqa
+dataset_PKLot = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PKLot = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------22 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+caption_prompt = {
+    'package': {
+        'prefix': 'there is a ',
+        'suffix': ' on the porch'
+    }
+}
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------23 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------24 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------25 plantdoc---------------------#
+class_name = ('Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf',
+              'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf',
+              'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight',
+              'Corn rust leaf', 'Peach leaf', 'Potato leaf',
+              'Potato leaf early blight', 'Potato leaf late blight',
+              'Raspberry leaf', 'Soyabean leaf', 'Soybean leaf',
+              'Squash Powdery mildew leaf', 'Strawberry leaf',
+              'Tomato Early blight leaf', 'Tomato Septoria leaf spot',
+              'Tomato leaf', 'Tomato leaf bacterial spot',
+              'Tomato leaf late blight', 'Tomato leaf mosaic virus',
+              'Tomato leaf yellow virus', 'Tomato mold leaf',
+              'Tomato two spotted spider mites leaf', 'grape leaf',
+              'grape leaf black rot')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'plantdoc/416x416/'
+dataset_plantdoc = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_plantdoc = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------26 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+caption_prompt = {
+    'pothole': {
+        'name': 'holes',
+        'prefix': 'there are some ',
+        'suffix': ' on the road'
+    }
+}
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    caption_prompt=caption_prompt,
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------27 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------28 selfdrivingCar---------------------#
+class_name = ('biker', 'car', 'pedestrian', 'trafficLight',
+              'trafficLight-Green', 'trafficLight-GreenLeft',
+              'trafficLight-Red', 'trafficLight-RedLeft',
+              'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'selfdrivingCar/fixedLarge/export/'
+dataset_selfdrivingCar = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_selfdrivingCar = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------29 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------30 ThermalCheetah---------------------#
+class_name = ('cheetah', 'human')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ThermalCheetah/'
+dataset_ThermalCheetah = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ThermalCheetah = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------31 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------32 UnoCards---------------------#
+class_name = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
+              '12', '13', '14')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'UnoCards/raw/'
+dataset_UnoCards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_UnoCards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------33 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------34 WildfireSmoke---------------------#
+class_name = ('smoke', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'WildfireSmoke/'
+dataset_WildfireSmoke = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_WildfireSmoke = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------35 websiteScreenshots---------------------#
+class_name = ('button', 'field', 'heading', 'iframe', 'image', 'label', 'link',
+              'text')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'websiteScreenshots/'
+dataset_websiteScreenshots = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_websiteScreenshots = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+
+dataset_prefixes = [
+    'AerialMaritimeDrone_large',
+    'AerialMaritimeDrone_tiled',
+    'AmericanSignLanguageLetters',
+    'Aquarium',
+    'BCCD',
+    'boggleBoards',
+    'brackishUnderwater',
+    'ChessPieces',
+    'CottontailRabbits',
+    'dice',
+    'DroneControl',
+    'EgoHands_generic',
+    'EgoHands_specific',
+    'HardHatWorkers',
+    'MaskWearing',
+    'MountainDewCommercial',
+    'NorthAmericaMushrooms',
+    'openPoetryVision',
+    'OxfordPets_by_breed',
+    'OxfordPets_by_species',
+    'PKLot',
+    'Packages',
+    'PascalVOC',
+    'pistols',
+    'plantdoc',
+    'pothole',
+    'Raccoons',
+    'selfdrivingCar',
+    'ShellfishOpenImages',
+    'ThermalCheetah',
+    'thermalDogsAndPeople',
+    'UnoCards',
+    'VehiclesOpenImages',
+    'WildfireSmoke',
+    'websiteScreenshots',
+]
+
+datasets = [
+    dataset_AerialMaritimeDrone_large, dataset_AerialMaritimeDrone_tiled,
+    dataset_AmericanSignLanguageLetters, dataset_Aquarium, dataset_BCCD,
+    dataset_boggleBoards, dataset_brackishUnderwater, dataset_ChessPieces,
+    dataset_CottontailRabbits, dataset_dice, dataset_DroneControl,
+    dataset_EgoHands_generic, dataset_EgoHands_specific,
+    dataset_HardHatWorkers, dataset_MaskWearing, dataset_MountainDewCommercial,
+    dataset_NorthAmericaMushrooms, dataset_openPoetryVision,
+    dataset_OxfordPets_by_breed, dataset_OxfordPets_by_species, dataset_PKLot,
+    dataset_Packages, dataset_PascalVOC, dataset_pistols, dataset_plantdoc,
+    dataset_pothole, dataset_Raccoon, dataset_selfdrivingCar,
+    dataset_ShellfishOpenImages, dataset_ThermalCheetah,
+    dataset_thermalDogsAndPeople, dataset_UnoCards, dataset_VehiclesOpenImages,
+    dataset_WildfireSmoke, dataset_websiteScreenshots
+]
+
+metrics = [
+    val_evaluator_AerialMaritimeDrone_large,
+    val_evaluator_AerialMaritimeDrone_tiled,
+    val_evaluator_AmericanSignLanguageLetters, val_evaluator_Aquarium,
+    val_evaluator_BCCD, val_evaluator_boggleBoards,
+    val_evaluator_brackishUnderwater, val_evaluator_ChessPieces,
+    val_evaluator_CottontailRabbits, val_evaluator_dice,
+    val_evaluator_DroneControl, val_evaluator_EgoHands_generic,
+    val_evaluator_EgoHands_specific, val_evaluator_HardHatWorkers,
+    val_evaluator_MaskWearing, val_evaluator_MountainDewCommercial,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_openPoetryVision,
+    val_evaluator_OxfordPets_by_breed, val_evaluator_OxfordPets_by_species,
+    val_evaluator_PKLot, val_evaluator_Packages, val_evaluator_PascalVOC,
+    val_evaluator_pistols, val_evaluator_plantdoc, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_selfdrivingCar,
+    val_evaluator_ShellfishOpenImages, val_evaluator_ThermalCheetah,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_UnoCards,
+    val_evaluator_VehiclesOpenImages, val_evaluator_WildfireSmoke,
+    val_evaluator_websiteScreenshots
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw13.py b/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw13.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3479b62b781fa38282b26ab69763d1766301dc7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw13.py
@@ -0,0 +1,3 @@
+_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py'
+
+model = dict(bbox_head=dict(early_fuse=True))
diff --git a/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw35.py b/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw35.py
new file mode 100644
index 0000000000000000000000000000000000000000..182afc66c93441da85d7e0116970e45a58c492d0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw35.py
@@ -0,0 +1,3 @@
+_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py'
+
+model = dict(bbox_head=dict(early_fuse=True))
diff --git a/mmde/mmdet/.mim/configs/glip/odinw/override_category.py b/mmde/mmdet/.mim/configs/glip/odinw/override_category.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ff05fc6e5e4d0989cf7fcf7af4dc902ee99f3a3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/glip/odinw/override_category.py
@@ -0,0 +1,109 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import mmengine
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Override Category')
+    parser.add_argument('data_root')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    ChessPieces = [{
+        'id': 1,
+        'name': '  ',
+        'supercategory': 'pieces'
+    }, {
+        'id': 2,
+        'name': 'black bishop',
+        'supercategory': 'pieces'
+    }, {
+        'id': 3,
+        'name': 'black king',
+        'supercategory': 'pieces'
+    }, {
+        'id': 4,
+        'name': 'black knight',
+        'supercategory': 'pieces'
+    }, {
+        'id': 5,
+        'name': 'black pawn',
+        'supercategory': 'pieces'
+    }, {
+        'id': 6,
+        'name': 'black queen',
+        'supercategory': 'pieces'
+    }, {
+        'id': 7,
+        'name': 'black rook',
+        'supercategory': 'pieces'
+    }, {
+        'id': 8,
+        'name': 'white bishop',
+        'supercategory': 'pieces'
+    }, {
+        'id': 9,
+        'name': 'white king',
+        'supercategory': 'pieces'
+    }, {
+        'id': 10,
+        'name': 'white knight',
+        'supercategory': 'pieces'
+    }, {
+        'id': 11,
+        'name': 'white pawn',
+        'supercategory': 'pieces'
+    }, {
+        'id': 12,
+        'name': 'white queen',
+        'supercategory': 'pieces'
+    }, {
+        'id': 13,
+        'name': 'white rook',
+        'supercategory': 'pieces'
+    }]
+
+    _data_root = args.data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+    json_data = mmengine.load(_data_root +
+                              'valid/annotations_without_background.json')
+    json_data['categories'] = ChessPieces
+    mmengine.dump(json_data,
+                  _data_root + 'valid/new_annotations_without_background.json')
+
+    CottontailRabbits = [{
+        'id': 1,
+        'name': 'rabbit',
+        'supercategory': 'Cottontail-Rabbit'
+    }]
+
+    _data_root = args.data_root + 'CottontailRabbits/'
+    json_data = mmengine.load(_data_root +
+                              'valid/annotations_without_background.json')
+    json_data['categories'] = CottontailRabbits
+    mmengine.dump(json_data,
+                  _data_root + 'valid/new_annotations_without_background.json')
+
+    NorthAmericaMushrooms = [{
+        'id': 1,
+        'name': 'flat mushroom',
+        'supercategory': 'mushroom'
+    }, {
+        'id': 2,
+        'name': 'yellow mushroom',
+        'supercategory': 'mushroom'
+    }]
+
+    _data_root = args.data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+    json_data = mmengine.load(_data_root +
+                              'valid/annotations_without_background.json')
+    json_data['categories'] = NorthAmericaMushrooms
+    mmengine.dump(json_data,
+                  _data_root + 'valid/new_annotations_without_background.json')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py b/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4cb8281ac6d4b43a615ba1a05903770d8ee2f69
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://jhu/resnet101_gn_ws')))
diff --git a/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py b/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a044c99a2e84de71822edb62543570891141b25
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://jhu/resnet50_gn_ws')),
+    neck=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg)))
diff --git a/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py b/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2a317d2ac830d95788084eaa8d374838b34a365
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py'
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://jhu/resnext101_32x4d_gn_ws')))
diff --git a/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py b/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd75a2c004b8cc04411d47d8b9db6ba0ec4ffcb0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py'
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=50,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://jhu/resnext50_32x4d_gn_ws')))
diff --git a/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1815e3f85b9fd5d7204b08cd60a13980a382fd51
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py'
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[20, 23],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5de37dee5e86e202c211464eaa08dd295dba44b2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://jhu/resnet101_gn_ws')))
diff --git a/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..287c652045d6230411043f2abab34be4f6106687
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py'
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[20, 23],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed8b1b73fe8695fc6bbb4054405192fca995cf81
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py
@@ -0,0 +1,33 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://jhu/resnet50_gn_ws')),
+    neck=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg),
+        mask_head=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg)))
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ce9193579b914f8dc0804cb73c3d8e41b153655
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py'
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[20, 23],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcfc371e774470ede7d171b4268db919385775ab
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py
@@ -0,0 +1,19 @@
+_base_ = './mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py'
+# model settings
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://jhu/resnext101_32x4d_gn_ws')))
diff --git a/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..af9ea5ab476b8ea3247062261726bef6b6bc1b0c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py'
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[20, 23],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab2b14042e9510ab14698e7a64c68d6ff60835e1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py
@@ -0,0 +1,19 @@
+_base_ = './mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py'
+# model settings
+conv_cfg = dict(type='ConvWS')
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=50,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        conv_cfg=conv_cfg,
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://jhu/resnext50_32x4d_gn_ws')))
diff --git a/mmde/mmdet/.mim/configs/gn+ws/metafile.yml b/mmde/mmdet/.mim/configs/gn+ws/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..89b91072924a31e53db1e95df30b47636a67b74b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn+ws/metafile.yml
@@ -0,0 +1,263 @@
+Collections:
+  - Name: Weight Standardization
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Group Normalization
+        - Weight Standardization
+    Paper:
+      URL: https://arxiv.org/abs/1903.10520
+      Title: 'Weight Standardization'
+    README: configs/gn+ws/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/gn%2Bws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py
+      Version: v2.0.0
+
+Models:
+  - Name: faster-rcnn_r50_fpn_gn_ws-all_1x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      inference time (ms/im):
+        - value: 85.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130-613d9fe2.pth
+
+  - Name: faster-rcnn_r101_fpn_gn-ws-all_1x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.9
+      inference time (ms/im):
+        - value: 111.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205-a93b0d75.pth
+
+  - Name: faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 97.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203-839c5d9d.pth
+
+  - Name: faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.8
+      inference time (ms/im):
+        - value: 131.58
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212-27da1bc2.pth
+
+  - Name: mask-rcnn_r50_fpn_gn_ws-all_2x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      inference time (ms/im):
+        - value: 95.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226-16acb762.pth
+
+  - Name: mask-rcnn_r101_fpn_gn-ws-all_2x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      inference time (ms/im):
+        - value: 116.28
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212-ea357cd9.pth
+
+  - Name: mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 8.4
+      inference time (ms/im):
+        - value: 107.53
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216-649fdb6f.pth
+
+  - Name: mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 12.2
+      inference time (ms/im):
+        - value: 140.85
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319-33fb95b5.pth
+
+  - Name: mask-rcnn_r50_fpn_gn_ws-all_20_23_24e_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213-487d1283.pth
+
+  - Name: mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213-57b5a50f.pth
+
+  - Name: mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py
+    Metadata:
+      Training Memory (GB): 8.4
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226-969bcb2c.pth
+
+  - Name: mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco
+    In Collection: Weight Standardization
+    Config: configs/gn%2Bws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py
+    Metadata:
+      Training Memory (GB): 12.2
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316-e6cd35ef.pth
diff --git a/mmde/mmdet/.mim/configs/gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..54f57d8d0855d07c696907d8c7c0758e4c13a573
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './mask-rcnn_r50_fpn_gn-all_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet101_gn')))
diff --git a/mmde/mmdet/.mim/configs/gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a94e063ecd2a5e2fd83eb78aa4d7ddd8f51e2b9e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './mask-rcnn_r101_fpn_gn-all_2x_coco.py'
+
+# learning policy
+max_epochs = 36
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5515ec14a47a0dfa58acf6c46bc40d77ce39ac3d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py
@@ -0,0 +1,31 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://contrib/resnet50_gn')),
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg),
+        mask_head=dict(norm_cfg=norm_cfg)))
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6f7a97e8e0482836b225e832be2e3de4ae99947
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py'
+
+# learning policy
+max_epochs = 36
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1313b22e4795239d5148fb8d665cdadb5fac8e4f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py
@@ -0,0 +1,36 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False),
+    backbone=dict(
+        norm_cfg=norm_cfg,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet50_gn')),
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg),
+        mask_head=dict(norm_cfg=norm_cfg)))
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e425de951bb0419d1d1596e45637be1d914a8034
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './mask-rcnn_r50_fpn_gn-all_2x_coco.py'
+
+# learning policy
+max_epochs = 36
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/gn/metafile.yml b/mmde/mmdet/.mim/configs/gn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9781dc9393f17b89a8e4228ef905a06dfdbc7eb5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/gn/metafile.yml
@@ -0,0 +1,162 @@
+Collections:
+  - Name: Group Normalization
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Group Normalization
+    Paper:
+      URL: https://arxiv.org/abs/1803.08494
+      Title: 'Group Normalization'
+    README: configs/gn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py
+      Version: v2.0.0
+
+Models:
+  - Name: mask-rcnn_r50_fpn_gn-all_2x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206-8eee02a6.pth
+
+  - Name: mask-rcnn_r50_fpn_gn-all_3x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214-8b23b1e5.pth
+
+  - Name: mask-rcnn_r101_fpn_gn-all_2x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      inference time (ms/im):
+        - value: 111.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205-d96b1b50.pth
+
+  - Name: mask-rcnn_r101_fpn_gn-all_3x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      inference time (ms/im):
+        - value: 111.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609-0df864f4.pth
+
+  - Name: mask-rcnn_r50_fpn_gn-all_contrib_2x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 91.74
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207-20d3e849.pth
+
+  - Name: mask-rcnn_r50_fpn_gn-all_contrib_3x_coco
+    In Collection: Group Normalization
+    Config: configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 91.74
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:  36.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225-542aefbc.pth
diff --git a/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_r101_fpn_gn-head_2x_coco.py b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_r101_fpn_gn-head_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..46d41ed4ed5d1d6345e98434221cc5b07c60767d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_r101_fpn_gn-head_2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './grid-rcnn_r50_fpn_gn-head_2x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_1x_coco.py b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..358280630fa96e40ac7834cbda6b1ad3dc689c55
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_1x_coco.py
@@ -0,0 +1,19 @@
+_base_ = './grid-rcnn_r50_fpn_gn-head_2x_coco.py'
+
+# training schedule
+max_epochs = 12
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.0001, by_epoch=False, begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..228fca2323ceec2052a3835089d987a2643c53c1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py
@@ -0,0 +1,160 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='GridRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='GridRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            with_reg=False,
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False),
+        grid_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        grid_head=dict(
+            type='GridHead',
+            grid_points=9,
+            num_convs=8,
+            in_channels=256,
+            point_feat_channels=64,
+            norm_cfg=dict(type='GN', num_groups=36),
+            loss_grid=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=15))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_radius=1,
+            pos_weight=-1,
+            max_num_grid=192,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.03,
+            nms=dict(type='nms', iou_threshold=0.3),
+            max_per_img=100)))
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# training schedule
+max_epochs = 25
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 80,
+        by_epoch=False,
+        begin=0,
+        end=3665),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[17, 23],
+        gamma=0.1)
+]
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..dddf157beb6667887d0cd920cb2803e340d43183
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './grid-rcnn_r50_fpn_gn-head_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ff50f546ae660cf398c2cb1c6f67ca20848c0f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grid_rcnn/grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/grid_rcnn/metafile.yml b/mmde/mmdet/.mim/configs/grid_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..cee91e3b88e7bafa27e705713f2bc45d0dc872d0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grid_rcnn/metafile.yml
@@ -0,0 +1,101 @@
+Collections:
+  - Name: Grid R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RPN
+        - Dilated Convolution
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/1906.05688
+      Title: 'Grid R-CNN'
+    README: configs/grid_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/grid_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: grid-rcnn_r50_fpn_gn-head_2x_coco
+    In Collection: Grid R-CNN
+    Config: configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      inference time (ms/im):
+        - value: 66.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130-6cca8223.pth
+
+  - Name: grid-rcnn_r101_fpn_gn-head_2x_coco
+    In Collection: Grid R-CNN
+    Config: configs/grid_rcnn/grid-rcnn_r101_fpn_gn-head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 79.37
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309-d6eca030.pth
+
+  - Name: grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco
+    In Collection: Grid R-CNN
+    Config: configs/grid_rcnn/grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 8.3
+      inference time (ms/im):
+        - value: 92.59
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130-d8f0e3ff.pth
+
+  - Name: grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco
+    In Collection: Grid R-CNN
+    Config: configs/grid_rcnn/grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.3
+      inference time (ms/im):
+        - value: 129.87
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204-ec76a754.pth
diff --git a/mmde/mmdet/.mim/configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py b/mmde/mmdet/.mim/configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fbe8a32c3a81e9b312a02f79f3495171387d9f0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py
@@ -0,0 +1,25 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/mmde/mmdet/.mim/configs/groie/grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py b/mmde/mmdet/.mim/configs/groie/grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..dadccb79c2288f16eb4a1fa33269e4a8f5a55c9b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/groie/grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = '../grid_rcnn/grid-rcnn_r50_fpn_gn-head_1x_coco.py'
+# model settings
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2)),
+        grid_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/mmde/mmdet/.mim/configs/groie/mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py b/mmde/mmdet/.mim/configs/groie/mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5699b4284a76fe633afd81acb0b047a81df6afd2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/groie/mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = '../gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py'
+# model settings
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2)),
+        mask_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/mmde/mmdet/.mim/configs/groie/mask-rcnn_r50_fpn_groie_1x_coco.py b/mmde/mmdet/.mim/configs/groie/mask-rcnn_r50_fpn_groie_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c9521e2f5730b74efc51f2051f861bfe5f8192d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/groie/mask-rcnn_r50_fpn_groie_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2)),
+        mask_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/mmde/mmdet/.mim/configs/groie/mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py b/mmde/mmdet/.mim/configs/groie/mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..22e97b6959a0bd13ae4432c806c61ca3d899f9ea
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/groie/mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = '../gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py'
+# model settings
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='sum',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2)),
+        mask_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32],
+            pre_cfg=dict(
+                type='ConvModule',
+                in_channels=256,
+                out_channels=256,
+                kernel_size=5,
+                padding=2,
+                inplace=False,
+            ),
+            post_cfg=dict(
+                type='GeneralizedAttention',
+                in_channels=256,
+                spatial_range=-1,
+                num_heads=6,
+                attention_type='0100',
+                kv_stride=2))))
diff --git a/mmde/mmdet/.mim/configs/groie/metafile.yml b/mmde/mmdet/.mim/configs/groie/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ce957004719cb542a51c48e7e07a3d94d6bdee18
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/groie/metafile.yml
@@ -0,0 +1,94 @@
+Collections:
+  - Name: GRoIE
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Generic RoI Extractor
+        - FPN
+        - RPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/2004.13665
+      Title: 'A novel Region of Interest Extraction Layer for Instance Segmentation'
+    README: configs/groie/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/roi_heads/roi_extractors/groie.py#L15
+      Version: v2.1.0
+
+Models:
+  - Name: faster-rcnn_r50_fpn_groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715-66ee9516.pth
+
+  - Name: grid-rcnn_r50_fpn_gn-head-groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco/grid_rcnn_r50_fpn_gn-head_groie_1x_coco_20200605_202059-4b75d86f.pth
+
+  - Name: mask-rcnn_r50_fpn_groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/mask-rcnn_r50_fpn_groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715-50d90c74.pth
+
+  - Name: mask-rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP:   37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715-42eb79e1.pth
+
+  - Name: mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco
+    In Collection: GRoIE
+    Config: configs/groie/mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507-8daae01c.pth
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py b/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac655b74aa664ef912b6b1f509e4eb9341ccd62a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py
@@ -0,0 +1,14 @@
+_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py'
+
+model = dict(
+    type='GroundingDINO',
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_parallel_dod.py b/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_parallel_dod.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a1c8f2ac740c6c64a01a1a6a8f7dd57622bedf6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_parallel_dod.py
@@ -0,0 +1,3 @@
+_base_ = 'grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py'
+
+model = dict(test_cfg=dict(chunked_size=1))
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py b/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb418011bf489c259f3696589aa56c5b8296256c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py
@@ -0,0 +1,78 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
+
+data_root = 'data/d3/'
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities', 'sent_ids'))
+]
+
+# -------------------------------------------------#
+val_dataset_full = dict(
+    type='DODDataset',
+    data_root=data_root,
+    ann_file='d3_json/d3_full_annotations.json',
+    data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+    pipeline=test_pipeline,
+    test_mode=True,
+    backend_args=None,
+    return_classes=True)
+
+val_evaluator_full = dict(
+    type='DODCocoMetric',
+    ann_file=data_root + 'd3_json/d3_full_annotations.json')
+
+# -------------------------------------------------#
+val_dataset_pres = dict(
+    type='DODDataset',
+    data_root=data_root,
+    ann_file='d3_json/d3_pres_annotations.json',
+    data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+    pipeline=test_pipeline,
+    test_mode=True,
+    backend_args=None,
+    return_classes=True)
+val_evaluator_pres = dict(
+    type='DODCocoMetric',
+    ann_file=data_root + 'd3_json/d3_pres_annotations.json')
+
+# -------------------------------------------------#
+val_dataset_abs = dict(
+    type='DODDataset',
+    data_root=data_root,
+    ann_file='d3_json/d3_abs_annotations.json',
+    data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+    pipeline=test_pipeline,
+    test_mode=True,
+    backend_args=None,
+    return_classes=True)
+val_evaluator_abs = dict(
+    type='DODCocoMetric',
+    ann_file=data_root + 'd3_json/d3_abs_annotations.json')
+
+# -------------------------------------------------#
+datasets = [val_dataset_full, val_dataset_pres, val_dataset_abs]
+dataset_prefixes = ['FULL', 'PRES', 'ABS']
+metrics = [val_evaluator_full, val_evaluator_pres, val_evaluator_abs]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py b/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d680091162e5ac96c15c76b58a18764e85d3233
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py
@@ -0,0 +1,3 @@
+_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py'
+
+model = dict(test_cfg=dict(chunked_size=1))
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py b/mmde/mmdet/.mim/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1996567588842f82c0af83e3a9ab84c81e7c25d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py
@@ -0,0 +1,57 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
+
+dataset_type = 'Flickr30kDataset'
+data_root = 'data/flickr30k_entities/'
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive', 'phrase_ids', 'phrases'))
+]
+
+dataset_Flickr30k_val = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='final_flickr_separateGT_val.json',
+    data_prefix=dict(img='flickr30k_images/'),
+    pipeline=test_pipeline,
+)
+
+dataset_Flickr30k_test = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='final_flickr_separateGT_test.json',
+    data_prefix=dict(img='flickr30k_images/'),
+    pipeline=test_pipeline,
+)
+
+val_evaluator_Flickr30k = dict(type='Flickr30kMetric')
+
+test_evaluator_Flickr30k = dict(type='Flickr30kMetric')
+
+# ----------Config---------- #
+dataset_prefixes = ['Flickr30kVal', 'Flickr30kTest']
+datasets = [dataset_Flickr30k_val, dataset_Flickr30k_test]
+metrics = [val_evaluator_Flickr30k, test_evaluator_Flickr30k]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..623a29b87adfd6734e980e814766e873b2b89d05
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py
@@ -0,0 +1,208 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+lang_model_name = 'bert-base-uncased'
+
+model = dict(
+    type='GroundingDINO',
+    num_queries=900,
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=False,
+    ),
+    language_model=dict(
+        type='BertModel',
+        name=lang_model_name,
+        pad_to_max=False,
+        use_sub_sentence_represent=True,
+        special_tokens_list=['[CLS]', '[SEP]', '.', '?'],
+        add_pooling_layer=False,
+    ),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[512, 1024, 2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        bias=True,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(
+        num_layers=6,
+        num_cp=6,
+        # visual layer config
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        # text layer config
+        text_layer_cfg=dict(
+            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)),
+        # fusion layer config
+        fusion_layer_cfg=dict(
+            v_dim=256,
+            l_dim=256,
+            embed_dim=1024,
+            num_heads=4,
+            init_values=1e-4),
+    ),
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            # query self attention layer
+            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to text
+            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to image
+            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128, normalize=True, offset=0.0, temperature=20),
+    bbox_head=dict(
+        type='GroundingDINOHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        contrastive_cfg=dict(max_text_len=256, log_scale='auto', bias=True),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),  # 2.0 in DeformDETR
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    dn_cfg=dict(  # TODO: Move to model.train_cfg ?
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,  # 0.4 for DN-DETR
+        group_cfg=dict(dynamic=True, num_groups=None,
+                       num_dn_queries=100)),  # TODO: half num_dn_queries
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='BinaryFocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=300))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='FixScaleResize', scale=(800, 1333), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        filter_cfg=dict(filter_empty_gt=False),
+        pipeline=train_pipeline,
+        return_classes=True))
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, return_classes=True))
+test_dataloader = val_dataloader
+
+# We did not adopt the official 24e optimizer strategy
+# because the results indicate that the current strategy is superior.
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW',
+        lr=0.0001,  # 0.0002 for DeformDETR
+        weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={
+        'absolute_pos_embed': dict(decay_mult=0.),
+        'backbone': dict(lr_mult=0.1)
+    }))
+# learning policy
+max_epochs = 12
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3554ee245ffe4312fc7f2cdd83755b1a0731aab9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = [
+    './grounding_dino_swin-t_finetune_16xb2_1x_coco.py',
+]
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'  # noqa
+model = dict(
+    type='GroundingDINO',
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py
new file mode 100644
index 0000000000000000000000000000000000000000..92f327fef8311f0f72d7f75149bfc163863e913c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py
@@ -0,0 +1,16 @@
+_base_ = [
+    './grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py',
+]
+
+model = dict(
+    type='GroundingDINO',
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c6403ee66d9e5782723117191176efbadec2a90
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py
@@ -0,0 +1,204 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth'  # noqa
+lang_model_name = 'bert-base-uncased'
+
+model = dict(
+    type='GroundingDINO',
+    num_queries=900,
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=False,
+    ),
+    language_model=dict(
+        type='BertModel',
+        name=lang_model_name,
+        pad_to_max=False,
+        use_sub_sentence_represent=True,
+        special_tokens_list=['[CLS]', '[SEP]', '.', '?'],
+        add_pooling_layer=False,
+    ),
+    backbone=dict(
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=True,
+        convert_weights=False),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[192, 384, 768],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        bias=True,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(
+        num_layers=6,
+        num_cp=6,
+        # visual layer config
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        # text layer config
+        text_layer_cfg=dict(
+            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)),
+        # fusion layer config
+        fusion_layer_cfg=dict(
+            v_dim=256,
+            l_dim=256,
+            embed_dim=1024,
+            num_heads=4,
+            init_values=1e-4),
+    ),
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            # query self attention layer
+            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to text
+            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to image
+            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128, normalize=True, offset=0.0, temperature=20),
+    bbox_head=dict(
+        type='GroundingDINOHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        contrastive_cfg=dict(max_text_len=256, log_scale=0.0, bias=False),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),  # 2.0 in DeformDETR
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    dn_cfg=dict(  # TODO: Move to model.train_cfg ?
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,  # 0.4 for DN-DETR
+        group_cfg=dict(dynamic=True, num_groups=None,
+                       num_dn_queries=100)),  # TODO: half num_dn_queries
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='BinaryFocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=300))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='FixScaleResize', scale=(800, 1333), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        filter_cfg=dict(filter_empty_gt=False),
+        pipeline=train_pipeline,
+        return_classes=True))
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, return_classes=True))
+test_dataloader = val_dataloader
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={
+        'absolute_pos_embed': dict(decay_mult=0.),
+        'backbone': dict(lr_mult=0.1)
+    }))
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=32)
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2265e86730f68ed69af246a5e0e87fa2cb5e570
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py
@@ -0,0 +1,56 @@
+_base_ = 'grounding_dino_swin-t_finetune_16xb2_1x_coco.py'
+
+data_root = 'data/cat/'
+class_name = ('cat', )
+num_classes = len(class_name)
+metainfo = dict(classes=class_name, palette=[(220, 20, 60)])
+
+model = dict(bbox_head=dict(num_classes=num_classes))
+
+train_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        metainfo=metainfo,
+        ann_file='annotations/trainval.json',
+        data_prefix=dict(img='images/')))
+
+val_dataloader = dict(
+    dataset=dict(
+        metainfo=metainfo,
+        data_root=data_root,
+        ann_file='annotations/test.json',
+        data_prefix=dict(img='images/')))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(ann_file=data_root + 'annotations/test.json')
+test_evaluator = val_evaluator
+
+max_epoch = 20
+
+default_hooks = dict(
+    checkpoint=dict(interval=1, max_keep_ckpts=1, save_best='auto'),
+    logger=dict(type='LoggerHook', interval=5))
+train_cfg = dict(max_epochs=max_epoch, val_interval=1)
+
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=30),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epoch,
+        by_epoch=True,
+        milestones=[15],
+        gamma=0.1)
+]
+
+optim_wrapper = dict(
+    optimizer=dict(lr=0.00005),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            'language_model': dict(lr_mult=0),
+        }))
+
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py
new file mode 100644
index 0000000000000000000000000000000000000000..7448764ef7ed4fb91bbca981e8006b412e74c414
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py
@@ -0,0 +1,128 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+lang_model_name = 'bert-base-uncased'
+
+model = dict(
+    type='GroundingDINO',
+    num_queries=900,
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=False,
+    ),
+    language_model=dict(
+        type='BertModel',
+        name=lang_model_name,
+        pad_to_max=False,
+        use_sub_sentence_represent=True,
+        special_tokens_list=['[CLS]', '[SEP]', '.', '?'],
+        add_pooling_layer=True,
+    ),
+    backbone=dict(
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=False,
+        convert_weights=False),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[192, 384, 768],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        bias=True,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(
+        num_layers=6,
+        # visual layer config
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        # text layer config
+        text_layer_cfg=dict(
+            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)),
+        # fusion layer config
+        fusion_layer_cfg=dict(
+            v_dim=256,
+            l_dim=256,
+            embed_dim=1024,
+            num_heads=4,
+            init_values=1e-4),
+    ),
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            # query self attention layer
+            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to text
+            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to image
+            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128, normalize=True, offset=0.0, temperature=20),
+    bbox_head=dict(
+        type='GroundingDINOHead',
+        num_classes=80,
+        sync_cls_avg_factor=True,
+        contrastive_cfg=dict(max_text_len=256),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),  # 2.0 in DeformDETR
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0)),
+    dn_cfg=dict(  # TODO: Move to model.train_cfg ?
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,  # 0.4 for DN-DETR
+        group_cfg=dict(dynamic=True, num_groups=None,
+                       num_dn_queries=100)),  # TODO: half num_dn_queries
+    # training and testing settings
+    train_cfg=None,
+    test_cfg=dict(max_per_img=300))
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive'))
+]
+
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, return_classes=True))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_lvis.py b/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..6084159044e8c0e8642a1226c6a9efd85c7d27d2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_lvis.py
@@ -0,0 +1,14 @@
+_base_ = './grounding_dino_swin-t_pretrain_zeroshot_lvis.py'
+
+model = dict(
+    type='GroundingDINO',
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_mini-lvis.py b/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_mini-lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..68467a7237ca893aa79eb5b0acc9d159f7082968
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_mini-lvis.py
@@ -0,0 +1,14 @@
+_base_ = './grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py'
+
+model = dict(
+    type='GroundingDINO',
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py b/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d05f0ce1c0cb095c0c9f9a65bd7666cba57afe7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py
@@ -0,0 +1,24 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/coco/'
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type=dataset_type,
+        ann_file='annotations/lvis_od_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+# numpy < 1.24.0
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root + 'annotations/lvis_od_val.json')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py b/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..0aac6cf33a92827c9c350175977bb1a595d2c0c8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py
@@ -0,0 +1,25 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/coco/'
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type=dataset_type,
+        ann_file='annotations/lvis_v1_minival_inserted_image_name.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+# numpy < 1.24.0
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root +
+    'annotations/lvis_v1_minival_inserted_image_name.json')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/metafile.yml b/mmde/mmdet/.mim/configs/grounding_dino/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..dcb5ebf82846d3cfbc2fa345cc89468ba269fd84
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/metafile.yml
@@ -0,0 +1,67 @@
+Collections:
+  - Name: Grounding DINO
+    Metadata:
+      Training Data: Objects365, GoldG, CC3M and COCO
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 3090 GPUs
+      Architecture:
+        - Swin Transformer
+        - BERT
+    Paper:
+      URL: https://arxiv.org/abs/2303.05499
+      Title: 'Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection
+'
+    README: configs/grounding_dino/README.md
+    Code:
+      URL:
+      Version: v3.0.0
+
+Models:
+  - Name: grounding_dino_swin-t_pretrain_obj365_goldg_cap4m
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.5
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth
+  - Name: grounding_dino_swin-b_pretrain_mixeddata
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 56.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth
+  - Name: grounding_dino_swin-t_finetune_16xb2_1x_coco
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 58.1
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco/grounding_dino_swin-t_finetune_16xb2_1x_coco_20230921_152544-5f234b20.pth
+  - Name: grounding_dino_swin-b_finetune_16xb2_1x_coco
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 59.7
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco/grounding_dino_swin-b_finetune_16xb2_1x_coco_20230921_153201-f219e0c0.pth
+  - Name: grounding_dino_r50_scratch_8xb2_1x_coco
+    In Collection: Grounding DINO
+    Config: configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco/grounding_dino_r50_scratch_1x_coco-fe0002f2.pth
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw13.py b/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw13.py
new file mode 100644
index 0000000000000000000000000000000000000000..65a6bc2a078a9ea5123c745aa72ba22466ea6e58
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw13.py
@@ -0,0 +1,338 @@
+_base_ = '../grounding_dino_swin-b_pretrain_mixeddata.py'
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    test_mode=True,
+    pipeline=base_test_pipeline,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'penguin': {
+#         'suffix': ', which is black and white'
+#     },
+#     'puffin': {
+#         'suffix': ' with orange beaks'
+#     },
+#     'stingray': {
+#         'suffix': ' which is flat and round'
+#     },
+# }
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 CottontailRabbits---------------------#
+class_name = ('Cottontail-Rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+
+caption_prompt = None
+# caption_prompt = {'Cottontail-Rabbit': {'name': 'rabbit'}}
+
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 EgoHands---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+
+caption_prompt = None
+# caption_prompt = {'hand': {'suffix': ' of a person'}}
+
+dataset_EgoHands = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 NorthAmericaMushrooms---------------------#
+class_name = ('CoW', 'chanterelle')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+
+caption_prompt = None
+# caption_prompt = {
+#     'CoW': {
+#         'name': 'flat mushroom'
+#     },
+#     'chanterelle': {
+#         'name': 'yellow mushroom'
+#     }
+# }
+
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'package': {
+#         'prefix': 'there is a ',
+#         'suffix': ' on the porch'
+#     }
+# }
+
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'pothole': {
+#         'prefix': 'there are some ',
+#         'name': 'holes',
+#         'suffix': ' on the road'
+#     }
+# }
+
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+dataset_prefixes = [
+    'AerialMaritimeDrone', 'Aquarium', 'CottontailRabbits', 'EgoHands',
+    'NorthAmericaMushrooms', 'Packages', 'PascalVOC', 'pistols', 'pothole',
+    'Raccoon', 'ShellfishOpenImages', 'thermalDogsAndPeople',
+    'VehiclesOpenImages'
+]
+datasets = [
+    dataset_AerialMaritimeDrone, dataset_Aquarium, dataset_CottontailRabbits,
+    dataset_EgoHands, dataset_NorthAmericaMushrooms, dataset_Packages,
+    dataset_PascalVOC, dataset_pistols, dataset_pothole, dataset_Raccoon,
+    dataset_ShellfishOpenImages, dataset_thermalDogsAndPeople,
+    dataset_VehiclesOpenImages
+]
+metrics = [
+    val_evaluator_AerialMaritimeDrone, val_evaluator_Aquarium,
+    val_evaluator_CottontailRabbits, val_evaluator_EgoHands,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_Packages,
+    val_evaluator_PascalVOC, val_evaluator_pistols, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_ShellfishOpenImages,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_VehiclesOpenImages
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw35.py b/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw35.py
new file mode 100644
index 0000000000000000000000000000000000000000..e73cd8e61ba20f4baff6f7c85477a8fae3735e44
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw35.py
@@ -0,0 +1,796 @@
+_base_ = '../grounding_dino_swin-b_pretrain_mixeddata.py'
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone_large---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone_large = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_large = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 AerialMaritimeDrone_tiled---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/tiled/'
+dataset_AerialMaritimeDrone_tiled = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_tiled = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 AmericanSignLanguageLetters---------------------#
+class_name = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+              'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AmericanSignLanguageLetters/American Sign Language Letters.v1-v1.coco/'  # noqa
+dataset_AmericanSignLanguageLetters = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AmericanSignLanguageLetters = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 BCCD---------------------#
+class_name = ('Platelets', 'RBC', 'WBC')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'BCCD/BCCD.v3-raw.coco/'
+dataset_BCCD = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_BCCD = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 boggleBoards---------------------#
+class_name = ('Q', 'a', 'an', 'b', 'c', 'd', 'e', 'er', 'f', 'g', 'h', 'he',
+              'i', 'in', 'j', 'k', 'l', 'm', 'n', 'o', 'o ', 'p', 'q', 'qu',
+              'r', 's', 't', 't\\', 'th', 'u', 'v', 'w', 'wild', 'x', 'y', 'z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'boggleBoards/416x416AutoOrient/export/'
+dataset_boggleBoards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_boggleBoards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 brackishUnderwater---------------------#
+class_name = ('crab', 'fish', 'jellyfish', 'shrimp', 'small_fish', 'starfish')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'brackishUnderwater/960x540/'
+dataset_brackishUnderwater = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_brackishUnderwater = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 ChessPieces---------------------#
+class_name = ('  ', 'black bishop', 'black king', 'black knight', 'black pawn',
+              'black queen', 'black rook', 'white bishop', 'white king',
+              'white knight', 'white pawn', 'white queen', 'white rook')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+dataset_ChessPieces = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ChessPieces = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 CottontailRabbits---------------------#
+class_name = ('rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 dice---------------------#
+class_name = ('1', '2', '3', '4', '5', '6')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'dice/mediumColor/export/'
+dataset_dice = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_dice = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 DroneControl---------------------#
+class_name = ('follow', 'follow_hand', 'land', 'land_hand', 'null', 'object',
+              'takeoff', 'takeoff-hand')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'DroneControl/Drone Control.v3-raw.coco/'
+dataset_DroneControl = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_DroneControl = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 EgoHands_generic---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+caption_prompt = {'hand': {'suffix': ' of a person'}}
+dataset_EgoHands_generic = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    # NOTE w. prompt 0.548; wo. prompt 0.764
+    # caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_generic = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 EgoHands_specific---------------------#
+class_name = ('myleft', 'myright', 'yourleft', 'yourright')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/specific/'
+dataset_EgoHands_specific = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_specific = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------14 HardHatWorkers---------------------#
+class_name = ('head', 'helmet', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'HardHatWorkers/raw/'
+dataset_HardHatWorkers = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_HardHatWorkers = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------15 MaskWearing---------------------#
+class_name = ('mask', 'no-mask')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MaskWearing/raw/'
+dataset_MaskWearing = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MaskWearing = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------16 MountainDewCommercial---------------------#
+class_name = ('bottle', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MountainDewCommercial/'
+dataset_MountainDewCommercial = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MountainDewCommercial = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------17 NorthAmericaMushrooms---------------------#
+class_name = ('flat mushroom', 'yellow mushroom')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------18 openPoetryVision---------------------#
+class_name = ('American Typewriter', 'Andale Mono', 'Apple Chancery', 'Arial',
+              'Avenir', 'Baskerville', 'Big Caslon', 'Bradley Hand',
+              'Brush Script MT', 'Chalkboard', 'Comic Sans MS', 'Copperplate',
+              'Courier', 'Didot', 'Futura', 'Geneva', 'Georgia', 'Gill Sans',
+              'Helvetica', 'Herculanum', 'Impact', 'Kefa', 'Lucida Grande',
+              'Luminari', 'Marker Felt', 'Menlo', 'Monaco', 'Noteworthy',
+              'Optima', 'PT Sans', 'PT Serif', 'Palatino', 'Papyrus',
+              'Phosphate', 'Rockwell', 'SF Pro', 'SignPainter', 'Skia',
+              'Snell Roundhand', 'Tahoma', 'Times New Roman', 'Trebuchet MS',
+              'Verdana')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'openPoetryVision/512x512/'
+dataset_openPoetryVision = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_openPoetryVision = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------19 OxfordPets_by_breed---------------------#
+class_name = ('cat-Abyssinian', 'cat-Bengal', 'cat-Birman', 'cat-Bombay',
+              'cat-British_Shorthair', 'cat-Egyptian_Mau', 'cat-Maine_Coon',
+              'cat-Persian', 'cat-Ragdoll', 'cat-Russian_Blue', 'cat-Siamese',
+              'cat-Sphynx', 'dog-american_bulldog',
+              'dog-american_pit_bull_terrier', 'dog-basset_hound',
+              'dog-beagle', 'dog-boxer', 'dog-chihuahua',
+              'dog-english_cocker_spaniel', 'dog-english_setter',
+              'dog-german_shorthaired', 'dog-great_pyrenees', 'dog-havanese',
+              'dog-japanese_chin', 'dog-keeshond', 'dog-leonberger',
+              'dog-miniature_pinscher', 'dog-newfoundland', 'dog-pomeranian',
+              'dog-pug', 'dog-saint_bernard', 'dog-samoyed',
+              'dog-scottish_terrier', 'dog-shiba_inu',
+              'dog-staffordshire_bull_terrier', 'dog-wheaten_terrier',
+              'dog-yorkshire_terrier')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-breed/'  # noqa
+dataset_OxfordPets_by_breed = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_breed = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------20 OxfordPets_by_species---------------------#
+class_name = ('cat', 'dog')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-species/'  # noqa
+dataset_OxfordPets_by_species = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_species = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------21 PKLot---------------------#
+class_name = ('space-empty', 'space-occupied')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PKLot/640/'  # noqa
+dataset_PKLot = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PKLot = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------22 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+caption_prompt = {
+    'package': {
+        'prefix': 'there is a ',
+        'suffix': ' on the porch'
+    }
+}
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,  # NOTE w. prompt 0.728; wo. prompt 0.670
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------23 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------24 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------25 plantdoc---------------------#
+class_name = ('Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf',
+              'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf',
+              'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight',
+              'Corn rust leaf', 'Peach leaf', 'Potato leaf',
+              'Potato leaf early blight', 'Potato leaf late blight',
+              'Raspberry leaf', 'Soyabean leaf', 'Soybean leaf',
+              'Squash Powdery mildew leaf', 'Strawberry leaf',
+              'Tomato Early blight leaf', 'Tomato Septoria leaf spot',
+              'Tomato leaf', 'Tomato leaf bacterial spot',
+              'Tomato leaf late blight', 'Tomato leaf mosaic virus',
+              'Tomato leaf yellow virus', 'Tomato mold leaf',
+              'Tomato two spotted spider mites leaf', 'grape leaf',
+              'grape leaf black rot')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'plantdoc/416x416/'
+dataset_plantdoc = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_plantdoc = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------26 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+caption_prompt = {
+    'pothole': {
+        'name': 'holes',
+        'prefix': 'there are some ',
+        'suffix': ' on the road'
+    }
+}
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    # NOTE w. prompt 0.221; wo. prompt 0.478
+    # caption_prompt=caption_prompt,
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------27 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------28 selfdrivingCar---------------------#
+class_name = ('biker', 'car', 'pedestrian', 'trafficLight',
+              'trafficLight-Green', 'trafficLight-GreenLeft',
+              'trafficLight-Red', 'trafficLight-RedLeft',
+              'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'selfdrivingCar/fixedLarge/export/'
+dataset_selfdrivingCar = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_selfdrivingCar = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------29 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------30 ThermalCheetah---------------------#
+class_name = ('cheetah', 'human')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ThermalCheetah/'
+dataset_ThermalCheetah = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ThermalCheetah = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------31 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------32 UnoCards---------------------#
+class_name = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
+              '12', '13', '14')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'UnoCards/raw/'
+dataset_UnoCards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_UnoCards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------33 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------34 WildfireSmoke---------------------#
+class_name = ('smoke', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'WildfireSmoke/'
+dataset_WildfireSmoke = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_WildfireSmoke = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------35 websiteScreenshots---------------------#
+class_name = ('button', 'field', 'heading', 'iframe', 'image', 'label', 'link',
+              'text')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'websiteScreenshots/'
+dataset_websiteScreenshots = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_websiteScreenshots = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+
+dataset_prefixes = [
+    'AerialMaritimeDrone_large',
+    'AerialMaritimeDrone_tiled',
+    'AmericanSignLanguageLetters',
+    'Aquarium',
+    'BCCD',
+    'boggleBoards',
+    'brackishUnderwater',
+    'ChessPieces',
+    'CottontailRabbits',
+    'dice',
+    'DroneControl',
+    'EgoHands_generic',
+    'EgoHands_specific',
+    'HardHatWorkers',
+    'MaskWearing',
+    'MountainDewCommercial',
+    'NorthAmericaMushrooms',
+    'openPoetryVision',
+    'OxfordPets_by_breed',
+    'OxfordPets_by_species',
+    'PKLot',
+    'Packages',
+    'PascalVOC',
+    'pistols',
+    'plantdoc',
+    'pothole',
+    'Raccoons',
+    'selfdrivingCar',
+    'ShellfishOpenImages',
+    'ThermalCheetah',
+    'thermalDogsAndPeople',
+    'UnoCards',
+    'VehiclesOpenImages',
+    'WildfireSmoke',
+    'websiteScreenshots',
+]
+
+datasets = [
+    dataset_AerialMaritimeDrone_large, dataset_AerialMaritimeDrone_tiled,
+    dataset_AmericanSignLanguageLetters, dataset_Aquarium, dataset_BCCD,
+    dataset_boggleBoards, dataset_brackishUnderwater, dataset_ChessPieces,
+    dataset_CottontailRabbits, dataset_dice, dataset_DroneControl,
+    dataset_EgoHands_generic, dataset_EgoHands_specific,
+    dataset_HardHatWorkers, dataset_MaskWearing, dataset_MountainDewCommercial,
+    dataset_NorthAmericaMushrooms, dataset_openPoetryVision,
+    dataset_OxfordPets_by_breed, dataset_OxfordPets_by_species, dataset_PKLot,
+    dataset_Packages, dataset_PascalVOC, dataset_pistols, dataset_plantdoc,
+    dataset_pothole, dataset_Raccoon, dataset_selfdrivingCar,
+    dataset_ShellfishOpenImages, dataset_ThermalCheetah,
+    dataset_thermalDogsAndPeople, dataset_UnoCards, dataset_VehiclesOpenImages,
+    dataset_WildfireSmoke, dataset_websiteScreenshots
+]
+
+metrics = [
+    val_evaluator_AerialMaritimeDrone_large,
+    val_evaluator_AerialMaritimeDrone_tiled,
+    val_evaluator_AmericanSignLanguageLetters, val_evaluator_Aquarium,
+    val_evaluator_BCCD, val_evaluator_boggleBoards,
+    val_evaluator_brackishUnderwater, val_evaluator_ChessPieces,
+    val_evaluator_CottontailRabbits, val_evaluator_dice,
+    val_evaluator_DroneControl, val_evaluator_EgoHands_generic,
+    val_evaluator_EgoHands_specific, val_evaluator_HardHatWorkers,
+    val_evaluator_MaskWearing, val_evaluator_MountainDewCommercial,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_openPoetryVision,
+    val_evaluator_OxfordPets_by_breed, val_evaluator_OxfordPets_by_species,
+    val_evaluator_PKLot, val_evaluator_Packages, val_evaluator_PascalVOC,
+    val_evaluator_pistols, val_evaluator_plantdoc, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_selfdrivingCar,
+    val_evaluator_ShellfishOpenImages, val_evaluator_ThermalCheetah,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_UnoCards,
+    val_evaluator_VehiclesOpenImages, val_evaluator_WildfireSmoke,
+    val_evaluator_websiteScreenshots
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py b/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py
new file mode 100644
index 0000000000000000000000000000000000000000..216b8059726b8fbe9dff3b2a43718bc563502aab
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py
@@ -0,0 +1,338 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'  # noqa
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    test_mode=True,
+    pipeline=base_test_pipeline,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'penguin': {
+#         'suffix': ', which is black and white'
+#     },
+#     'puffin': {
+#         'suffix': ' with orange beaks'
+#     },
+#     'stingray': {
+#         'suffix': ' which is flat and round'
+#     },
+# }
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 CottontailRabbits---------------------#
+class_name = ('Cottontail-Rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+
+caption_prompt = None
+# caption_prompt = {'Cottontail-Rabbit': {'name': 'rabbit'}}
+
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 EgoHands---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+
+caption_prompt = None
+# caption_prompt = {'hand': {'suffix': ' of a person'}}
+
+dataset_EgoHands = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 NorthAmericaMushrooms---------------------#
+class_name = ('CoW', 'chanterelle')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+
+caption_prompt = None
+# caption_prompt = {
+#     'CoW': {
+#         'name': 'flat mushroom'
+#     },
+#     'chanterelle': {
+#         'name': 'yellow mushroom'
+#     }
+# }
+
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'package': {
+#         'prefix': 'there is a ',
+#         'suffix': ' on the porch'
+#     }
+# }
+
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'pothole': {
+#         'prefix': 'there are some ',
+#         'name': 'holes',
+#         'suffix': ' on the road'
+#     }
+# }
+
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+dataset_prefixes = [
+    'AerialMaritimeDrone', 'Aquarium', 'CottontailRabbits', 'EgoHands',
+    'NorthAmericaMushrooms', 'Packages', 'PascalVOC', 'pistols', 'pothole',
+    'Raccoon', 'ShellfishOpenImages', 'thermalDogsAndPeople',
+    'VehiclesOpenImages'
+]
+datasets = [
+    dataset_AerialMaritimeDrone, dataset_Aquarium, dataset_CottontailRabbits,
+    dataset_EgoHands, dataset_NorthAmericaMushrooms, dataset_Packages,
+    dataset_PascalVOC, dataset_pistols, dataset_pothole, dataset_Raccoon,
+    dataset_ShellfishOpenImages, dataset_thermalDogsAndPeople,
+    dataset_VehiclesOpenImages
+]
+metrics = [
+    val_evaluator_AerialMaritimeDrone, val_evaluator_Aquarium,
+    val_evaluator_CottontailRabbits, val_evaluator_EgoHands,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_Packages,
+    val_evaluator_PascalVOC, val_evaluator_pistols, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_ShellfishOpenImages,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_VehiclesOpenImages
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py b/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py
new file mode 100644
index 0000000000000000000000000000000000000000..3df0394a204061684cbb9bb66adb08d92a784efb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py
@@ -0,0 +1,796 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'  # noqa
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone_large---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone_large = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_large = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 AerialMaritimeDrone_tiled---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/tiled/'
+dataset_AerialMaritimeDrone_tiled = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_tiled = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 AmericanSignLanguageLetters---------------------#
+class_name = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+              'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AmericanSignLanguageLetters/American Sign Language Letters.v1-v1.coco/'  # noqa
+dataset_AmericanSignLanguageLetters = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AmericanSignLanguageLetters = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 BCCD---------------------#
+class_name = ('Platelets', 'RBC', 'WBC')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'BCCD/BCCD.v3-raw.coco/'
+dataset_BCCD = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_BCCD = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 boggleBoards---------------------#
+class_name = ('Q', 'a', 'an', 'b', 'c', 'd', 'e', 'er', 'f', 'g', 'h', 'he',
+              'i', 'in', 'j', 'k', 'l', 'm', 'n', 'o', 'o ', 'p', 'q', 'qu',
+              'r', 's', 't', 't\\', 'th', 'u', 'v', 'w', 'wild', 'x', 'y', 'z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'boggleBoards/416x416AutoOrient/export/'
+dataset_boggleBoards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_boggleBoards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 brackishUnderwater---------------------#
+class_name = ('crab', 'fish', 'jellyfish', 'shrimp', 'small_fish', 'starfish')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'brackishUnderwater/960x540/'
+dataset_brackishUnderwater = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_brackishUnderwater = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 ChessPieces---------------------#
+class_name = ('  ', 'black bishop', 'black king', 'black knight', 'black pawn',
+              'black queen', 'black rook', 'white bishop', 'white king',
+              'white knight', 'white pawn', 'white queen', 'white rook')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+dataset_ChessPieces = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ChessPieces = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 CottontailRabbits---------------------#
+class_name = ('rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 dice---------------------#
+class_name = ('1', '2', '3', '4', '5', '6')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'dice/mediumColor/export/'
+dataset_dice = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_dice = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 DroneControl---------------------#
+class_name = ('follow', 'follow_hand', 'land', 'land_hand', 'null', 'object',
+              'takeoff', 'takeoff-hand')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'DroneControl/Drone Control.v3-raw.coco/'
+dataset_DroneControl = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_DroneControl = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 EgoHands_generic---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+caption_prompt = {'hand': {'suffix': ' of a person'}}
+dataset_EgoHands_generic = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    # NOTE w. prompt 0.526, wo. prompt 0.608
+    # caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_generic = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 EgoHands_specific---------------------#
+class_name = ('myleft', 'myright', 'yourleft', 'yourright')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/specific/'
+dataset_EgoHands_specific = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_specific = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------14 HardHatWorkers---------------------#
+class_name = ('head', 'helmet', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'HardHatWorkers/raw/'
+dataset_HardHatWorkers = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_HardHatWorkers = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------15 MaskWearing---------------------#
+class_name = ('mask', 'no-mask')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MaskWearing/raw/'
+dataset_MaskWearing = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MaskWearing = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------16 MountainDewCommercial---------------------#
+class_name = ('bottle', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MountainDewCommercial/'
+dataset_MountainDewCommercial = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MountainDewCommercial = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------17 NorthAmericaMushrooms---------------------#
+class_name = ('flat mushroom', 'yellow mushroom')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------18 openPoetryVision---------------------#
+class_name = ('American Typewriter', 'Andale Mono', 'Apple Chancery', 'Arial',
+              'Avenir', 'Baskerville', 'Big Caslon', 'Bradley Hand',
+              'Brush Script MT', 'Chalkboard', 'Comic Sans MS', 'Copperplate',
+              'Courier', 'Didot', 'Futura', 'Geneva', 'Georgia', 'Gill Sans',
+              'Helvetica', 'Herculanum', 'Impact', 'Kefa', 'Lucida Grande',
+              'Luminari', 'Marker Felt', 'Menlo', 'Monaco', 'Noteworthy',
+              'Optima', 'PT Sans', 'PT Serif', 'Palatino', 'Papyrus',
+              'Phosphate', 'Rockwell', 'SF Pro', 'SignPainter', 'Skia',
+              'Snell Roundhand', 'Tahoma', 'Times New Roman', 'Trebuchet MS',
+              'Verdana')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'openPoetryVision/512x512/'
+dataset_openPoetryVision = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_openPoetryVision = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------19 OxfordPets_by_breed---------------------#
+class_name = ('cat-Abyssinian', 'cat-Bengal', 'cat-Birman', 'cat-Bombay',
+              'cat-British_Shorthair', 'cat-Egyptian_Mau', 'cat-Maine_Coon',
+              'cat-Persian', 'cat-Ragdoll', 'cat-Russian_Blue', 'cat-Siamese',
+              'cat-Sphynx', 'dog-american_bulldog',
+              'dog-american_pit_bull_terrier', 'dog-basset_hound',
+              'dog-beagle', 'dog-boxer', 'dog-chihuahua',
+              'dog-english_cocker_spaniel', 'dog-english_setter',
+              'dog-german_shorthaired', 'dog-great_pyrenees', 'dog-havanese',
+              'dog-japanese_chin', 'dog-keeshond', 'dog-leonberger',
+              'dog-miniature_pinscher', 'dog-newfoundland', 'dog-pomeranian',
+              'dog-pug', 'dog-saint_bernard', 'dog-samoyed',
+              'dog-scottish_terrier', 'dog-shiba_inu',
+              'dog-staffordshire_bull_terrier', 'dog-wheaten_terrier',
+              'dog-yorkshire_terrier')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-breed/'  # noqa
+dataset_OxfordPets_by_breed = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_breed = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------20 OxfordPets_by_species---------------------#
+class_name = ('cat', 'dog')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-species/'  # noqa
+dataset_OxfordPets_by_species = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_species = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------21 PKLot---------------------#
+class_name = ('space-empty', 'space-occupied')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PKLot/640/'  # noqa
+dataset_PKLot = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PKLot = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------22 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+caption_prompt = {
+    'package': {
+        'prefix': 'there is a ',
+        'suffix': ' on the porch'
+    }
+}
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,  # NOTE w. prompt 0.695; wo. prompt 0.687
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------23 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------24 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------25 plantdoc---------------------#
+class_name = ('Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf',
+              'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf',
+              'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight',
+              'Corn rust leaf', 'Peach leaf', 'Potato leaf',
+              'Potato leaf early blight', 'Potato leaf late blight',
+              'Raspberry leaf', 'Soyabean leaf', 'Soybean leaf',
+              'Squash Powdery mildew leaf', 'Strawberry leaf',
+              'Tomato Early blight leaf', 'Tomato Septoria leaf spot',
+              'Tomato leaf', 'Tomato leaf bacterial spot',
+              'Tomato leaf late blight', 'Tomato leaf mosaic virus',
+              'Tomato leaf yellow virus', 'Tomato mold leaf',
+              'Tomato two spotted spider mites leaf', 'grape leaf',
+              'grape leaf black rot')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'plantdoc/416x416/'
+dataset_plantdoc = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_plantdoc = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------26 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+caption_prompt = {
+    'pothole': {
+        'name': 'holes',
+        'prefix': 'there are some ',
+        'suffix': ' on the road'
+    }
+}
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    # NOTE w. prompt 0.137; wo. prompt 0.215
+    # caption_prompt=caption_prompt,
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------27 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------28 selfdrivingCar---------------------#
+class_name = ('biker', 'car', 'pedestrian', 'trafficLight',
+              'trafficLight-Green', 'trafficLight-GreenLeft',
+              'trafficLight-Red', 'trafficLight-RedLeft',
+              'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'selfdrivingCar/fixedLarge/export/'
+dataset_selfdrivingCar = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_selfdrivingCar = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------29 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------30 ThermalCheetah---------------------#
+class_name = ('cheetah', 'human')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ThermalCheetah/'
+dataset_ThermalCheetah = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ThermalCheetah = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------31 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------32 UnoCards---------------------#
+class_name = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
+              '12', '13', '14')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'UnoCards/raw/'
+dataset_UnoCards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_UnoCards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------33 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------34 WildfireSmoke---------------------#
+class_name = ('smoke', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'WildfireSmoke/'
+dataset_WildfireSmoke = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_WildfireSmoke = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------35 websiteScreenshots---------------------#
+class_name = ('button', 'field', 'heading', 'iframe', 'image', 'label', 'link',
+              'text')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'websiteScreenshots/'
+dataset_websiteScreenshots = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_websiteScreenshots = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+
+dataset_prefixes = [
+    'AerialMaritimeDrone_large',
+    'AerialMaritimeDrone_tiled',
+    'AmericanSignLanguageLetters',
+    'Aquarium',
+    'BCCD',
+    'boggleBoards',
+    'brackishUnderwater',
+    'ChessPieces',
+    'CottontailRabbits',
+    'dice',
+    'DroneControl',
+    'EgoHands_generic',
+    'EgoHands_specific',
+    'HardHatWorkers',
+    'MaskWearing',
+    'MountainDewCommercial',
+    'NorthAmericaMushrooms',
+    'openPoetryVision',
+    'OxfordPets_by_breed',
+    'OxfordPets_by_species',
+    'PKLot',
+    'Packages',
+    'PascalVOC',
+    'pistols',
+    'plantdoc',
+    'pothole',
+    'Raccoons',
+    'selfdrivingCar',
+    'ShellfishOpenImages',
+    'ThermalCheetah',
+    'thermalDogsAndPeople',
+    'UnoCards',
+    'VehiclesOpenImages',
+    'WildfireSmoke',
+    'websiteScreenshots',
+]
+
+datasets = [
+    dataset_AerialMaritimeDrone_large, dataset_AerialMaritimeDrone_tiled,
+    dataset_AmericanSignLanguageLetters, dataset_Aquarium, dataset_BCCD,
+    dataset_boggleBoards, dataset_brackishUnderwater, dataset_ChessPieces,
+    dataset_CottontailRabbits, dataset_dice, dataset_DroneControl,
+    dataset_EgoHands_generic, dataset_EgoHands_specific,
+    dataset_HardHatWorkers, dataset_MaskWearing, dataset_MountainDewCommercial,
+    dataset_NorthAmericaMushrooms, dataset_openPoetryVision,
+    dataset_OxfordPets_by_breed, dataset_OxfordPets_by_species, dataset_PKLot,
+    dataset_Packages, dataset_PascalVOC, dataset_pistols, dataset_plantdoc,
+    dataset_pothole, dataset_Raccoon, dataset_selfdrivingCar,
+    dataset_ShellfishOpenImages, dataset_ThermalCheetah,
+    dataset_thermalDogsAndPeople, dataset_UnoCards, dataset_VehiclesOpenImages,
+    dataset_WildfireSmoke, dataset_websiteScreenshots
+]
+
+metrics = [
+    val_evaluator_AerialMaritimeDrone_large,
+    val_evaluator_AerialMaritimeDrone_tiled,
+    val_evaluator_AmericanSignLanguageLetters, val_evaluator_Aquarium,
+    val_evaluator_BCCD, val_evaluator_boggleBoards,
+    val_evaluator_brackishUnderwater, val_evaluator_ChessPieces,
+    val_evaluator_CottontailRabbits, val_evaluator_dice,
+    val_evaluator_DroneControl, val_evaluator_EgoHands_generic,
+    val_evaluator_EgoHands_specific, val_evaluator_HardHatWorkers,
+    val_evaluator_MaskWearing, val_evaluator_MountainDewCommercial,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_openPoetryVision,
+    val_evaluator_OxfordPets_by_breed, val_evaluator_OxfordPets_by_species,
+    val_evaluator_PKLot, val_evaluator_Packages, val_evaluator_PascalVOC,
+    val_evaluator_pistols, val_evaluator_plantdoc, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_selfdrivingCar,
+    val_evaluator_ShellfishOpenImages, val_evaluator_ThermalCheetah,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_UnoCards,
+    val_evaluator_VehiclesOpenImages, val_evaluator_WildfireSmoke,
+    val_evaluator_websiteScreenshots
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/odinw/override_category.py b/mmde/mmdet/.mim/configs/grounding_dino/odinw/override_category.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ff05fc6e5e4d0989cf7fcf7af4dc902ee99f3a3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/odinw/override_category.py
@@ -0,0 +1,109 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import mmengine
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Override Category')
+    parser.add_argument('data_root')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    ChessPieces = [{
+        'id': 1,
+        'name': '  ',
+        'supercategory': 'pieces'
+    }, {
+        'id': 2,
+        'name': 'black bishop',
+        'supercategory': 'pieces'
+    }, {
+        'id': 3,
+        'name': 'black king',
+        'supercategory': 'pieces'
+    }, {
+        'id': 4,
+        'name': 'black knight',
+        'supercategory': 'pieces'
+    }, {
+        'id': 5,
+        'name': 'black pawn',
+        'supercategory': 'pieces'
+    }, {
+        'id': 6,
+        'name': 'black queen',
+        'supercategory': 'pieces'
+    }, {
+        'id': 7,
+        'name': 'black rook',
+        'supercategory': 'pieces'
+    }, {
+        'id': 8,
+        'name': 'white bishop',
+        'supercategory': 'pieces'
+    }, {
+        'id': 9,
+        'name': 'white king',
+        'supercategory': 'pieces'
+    }, {
+        'id': 10,
+        'name': 'white knight',
+        'supercategory': 'pieces'
+    }, {
+        'id': 11,
+        'name': 'white pawn',
+        'supercategory': 'pieces'
+    }, {
+        'id': 12,
+        'name': 'white queen',
+        'supercategory': 'pieces'
+    }, {
+        'id': 13,
+        'name': 'white rook',
+        'supercategory': 'pieces'
+    }]
+
+    _data_root = args.data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+    json_data = mmengine.load(_data_root +
+                              'valid/annotations_without_background.json')
+    json_data['categories'] = ChessPieces
+    mmengine.dump(json_data,
+                  _data_root + 'valid/new_annotations_without_background.json')
+
+    CottontailRabbits = [{
+        'id': 1,
+        'name': 'rabbit',
+        'supercategory': 'Cottontail-Rabbit'
+    }]
+
+    _data_root = args.data_root + 'CottontailRabbits/'
+    json_data = mmengine.load(_data_root +
+                              'valid/annotations_without_background.json')
+    json_data['categories'] = CottontailRabbits
+    mmengine.dump(json_data,
+                  _data_root + 'valid/new_annotations_without_background.json')
+
+    NorthAmericaMushrooms = [{
+        'id': 1,
+        'name': 'flat mushroom',
+        'supercategory': 'mushroom'
+    }, {
+        'id': 2,
+        'name': 'yellow mushroom',
+        'supercategory': 'mushroom'
+    }]
+
+    _data_root = args.data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+    json_data = mmengine.load(_data_root +
+                              'valid/annotations_without_background.json')
+    json_data['categories'] = NorthAmericaMushrooms
+    mmengine.dump(json_data,
+                  _data_root + 'valid/new_annotations_without_background.json')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/refcoco/grounding_dino_swin-b_pretrain_zeroshot_refexp.py b/mmde/mmdet/.mim/configs/grounding_dino/refcoco/grounding_dino_swin-b_pretrain_zeroshot_refexp.py
new file mode 100644
index 0000000000000000000000000000000000000000..dea0bad08c0ebf6455211fadb268b07868ab4ded
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/refcoco/grounding_dino_swin-b_pretrain_zeroshot_refexp.py
@@ -0,0 +1,14 @@
+_base_ = './grounding_dino_swin-t_pretrain_zeroshot_refexp.py'
+
+model = dict(
+    type='GroundingDINO',
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        patch_norm=True),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
diff --git a/mmde/mmdet/.mim/configs/grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py b/mmde/mmdet/.mim/configs/grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b5c46574a30bbb2253fc69f79edbcf0cb016505
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py
@@ -0,0 +1,228 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
+
+# 30 is an empirical value, just set it to the maximum value
+# without affecting the evaluation result
+model = dict(test_cfg=dict(max_per_img=30))
+
+data_root = 'data/coco/'
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive'))
+]
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/final_refexp_val.json'
+val_dataset_all_val = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+val_evaluator_all_val = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco_testA.json'
+val_dataset_refcoco_testA = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testA = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco_testB.json'
+val_dataset_refcoco_testB = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testB = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco+_testA.json'
+val_dataset_refcoco_plus_testA = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_plus_testA = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco+_testB.json'
+val_dataset_refcoco_plus_testB = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_plus_testB = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcocog_test.json'
+val_dataset_refcocog_test = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcocog_test = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_grefcoco_val.json'
+val_dataset_grefcoco_val = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_grefcoco_val = dict(
+    type='gRefCOCOMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    thresh_score=0.7,
+    thresh_f1=1.0)
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_grefcoco_testA.json'
+val_dataset_grefcoco_testA = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_grefcoco_testA = dict(
+    type='gRefCOCOMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    thresh_score=0.7,
+    thresh_f1=1.0)
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_grefcoco_testB.json'
+val_dataset_grefcoco_testB = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_grefcoco_testB = dict(
+    type='gRefCOCOMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    thresh_score=0.7,
+    thresh_f1=1.0)
+
+# -------------------------------------------------#
+datasets = [
+    val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB,
+    val_dataset_refcoco_plus_testA, val_dataset_refcoco_plus_testB,
+    val_dataset_refcocog_test, val_dataset_grefcoco_val,
+    val_dataset_grefcoco_testA, val_dataset_grefcoco_testB
+]
+dataset_prefixes = [
+    'val', 'refcoco_testA', 'refcoco_testB', 'refcoco+_testA',
+    'refcoco+_testB', 'refcocog_test', 'grefcoco_val', 'grefcoco_testA',
+    'grefcoco_testB'
+]
+metrics = [
+    val_evaluator_all_val, val_evaluator_refcoco_testA,
+    val_evaluator_refcoco_testB, val_evaluator_refcoco_plus_testA,
+    val_evaluator_refcoco_plus_testB, val_evaluator_refcocog_test,
+    val_evaluator_grefcoco_val, val_evaluator_grefcoco_testA,
+    val_evaluator_grefcoco_testB
+]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-fast-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-fast-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d0579c53cb23d71d0bec57387f413cc39449e93
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-fast-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,66 @@
+_base_ = '../fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    roi_head=dict(
+        bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6),
+            sampler=dict(num=256))),
+    test_cfg=dict(rcnn=dict(score_thr=1e-3)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=300),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadProposals', num_max_proposals=None),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img', 'proposals']),
+        ])
+]
+# TODO: support loading proposals
+data = dict(
+    train=dict(
+        proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_train2017.pkl',
+        pipeline=train_pipeline),
+    val=dict(
+        proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl',
+        pipeline=test_pipeline),
+    test=dict(
+        proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl',
+        pipeline=test_pipeline))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f585dc355ac7dc10e75875f6b9f739fe669912bb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './ga-faster-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cd44de557bfb20b4298099bd0972e3327b410cb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,64 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    rpn_head=dict(
+        _delete_=True,
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=8,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[8],
+            strides=[4, 8, 16, 32, 64]),
+        anchor_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.14, 0.14]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.11, 0.11]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    roi_head=dict(
+        bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            ga_assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            ga_sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            center_ratio=0.2,
+            ignore_ratio=0.5),
+        rpn_proposal=dict(nms_post=1000, max_per_img=300),
+        rcnn=dict(
+            assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6),
+            sampler=dict(type='RandomSampler', num=256))),
+    test_cfg=dict(
+        rpn=dict(nms_post=1000, max_per_img=300), rcnn=dict(score_thr=1e-3)))
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3007fbec42016fa8c6b90ba5b0b4e772d0e865f7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,64 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    rpn_head=dict(
+        _delete_=True,
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=8,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[8],
+            strides=[4, 8, 16, 32, 64]),
+        anchor_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.14, 0.14]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.11, 0.11]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    roi_head=dict(
+        bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            ga_assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            ga_sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            center_ratio=0.2,
+            ignore_ratio=0.5),
+        rpn_proposal=dict(nms_post=1000, max_per_img=300),
+        rcnn=dict(
+            assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6),
+            sampler=dict(type='RandomSampler', num=256))),
+    test_cfg=dict(
+        rpn=dict(nms_post=1000, max_per_img=300), rcnn=dict(score_thr=1e-3)))
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a22a1ec01e66854c68968f65802dc117aa59953
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga-faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d6aaeaa7187deaa2c0da73a89bf14980a3405db
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga-faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9adbae55eea2311800ccbc8e01e3f41521c7040b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './ga-retinanet_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_ms-2x.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_ms-2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..012e89b8338c69c4ffdf4182827a185233945288
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_ms-2x.py
@@ -0,0 +1,34 @@
+_base_ = './ga-retinanet_r101-caffe_fpn_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 480), (1333, 960)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3.0,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b62aba62c64870977c7c8fe4021a361c8871b633
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,61 @@
+_base_ = '../retinanet/retinanet_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        _delete_=True,
+        type='GARetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        anchor_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        ga_assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.4,
+            ignore_iof_thr=-1),
+        ga_sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        assigner=dict(neg_iou_thr=0.5, min_pos_iou=0.0),
+        center_ratio=0.2,
+        ignore_ratio=0.5))
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..da39c7005b26d65cca0ae122bf078db2d8ad2786
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_r50_fpn_1x_coco.py
@@ -0,0 +1,61 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    bbox_head=dict(
+        _delete_=True,
+        type='GARetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        anchor_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        ga_assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.4,
+            ignore_iof_thr=-1),
+        ga_sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        assigner=dict(neg_iou_thr=0.5, min_pos_iou=0.0),
+        center_ratio=0.2,
+        ignore_ratio=0.5))
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..478a8e5e4a2192e23329564ac688ac40c93110dd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga-retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb7721d3a604277977b102d431076d6d58a7d457
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-retinanet_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga-retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b375c874ac8cabf5ad29aacc51e1065d14d83ee1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = './ga-rpn_r50-caffe_fpn_1x_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa58426effe8bedbe9ffb907153b98d51bef5ef2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,57 @@
+_base_ = '../rpn/rpn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    rpn_head=dict(
+        _delete_=True,
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=8,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[8],
+            strides=[4, 8, 16, 32, 64]),
+        anchor_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.14, 0.14]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.11, 0.11]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            ga_assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            ga_sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            center_ratio=0.2,
+            ignore_ratio=0.5)),
+    test_cfg=dict(rpn=dict(nms_post=1000)))
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2973f272b740c8deec74f6c24798a2d80d917946
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_r50_fpn_1x_coco.py
@@ -0,0 +1,57 @@
+_base_ = '../rpn/rpn_r50_fpn_1x_coco.py'
+model = dict(
+    rpn_head=dict(
+        _delete_=True,
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=8,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[8],
+            strides=[4, 8, 16, 32, 64]),
+        anchor_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.14, 0.14]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.07, 0.07, 0.11, 0.11]),
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            ga_assigner=dict(
+                type='ApproxMaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            ga_sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            center_ratio=0.2,
+            ignore_ratio=0.5)),
+    test_cfg=dict(rpn=dict(nms_post=1000)))
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..276d45d8c21fa1eba130e834671bdddd794fa1f5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga-rpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f29fe9aa20054f3152e290df5ca75363dff6a4ce
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/ga-rpn_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ga-rpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/guided_anchoring/metafile.yml b/mmde/mmdet/.mim/configs/guided_anchoring/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..516b3e93fc2b10fb563de1b377144da103ef4523
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/guided_anchoring/metafile.yml
@@ -0,0 +1,246 @@
+Collections:
+  - Name: Guided Anchoring
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - Guided Anchoring
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1901.03278
+      Title: 'Region Proposal by Guided Anchoring'
+    README: configs/guided_anchoring/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/dense_heads/ga_retina_head.py#L10
+      Version: v2.0.0
+
+Models:
+  - Name: ga-rpn_r50-caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-rpn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.3
+      inference time (ms/im):
+        - value: 63.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Region Proposal
+        Dataset: COCO
+        Metrics:
+          AR@1000: 68.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531-899008a6.pth
+
+  - Name: ga-rpn_r101-caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-rpn_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      inference time (ms/im):
+        - value: 76.92
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Region Proposal
+        Dataset: COCO
+        Metrics:
+          AR@1000: 69.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531-ca9ba8fb.pth
+
+  - Name: ga-rpn_x101-32x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-rpn_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.5
+      inference time (ms/im):
+        - value: 100
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Region Proposal
+        Dataset: COCO
+        Metrics:
+          AR@1000: 70.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220-c28d1b18.pth
+
+  - Name: ga-rpn_x101-64x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-rpn_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 133.33
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Region Proposal
+        Dataset: COCO
+        Metrics:
+          AR@1000: 70.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225-3c6e1aa2.pth
+
+  - Name: ga-faster-rcnn_r50-caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-faster-rcnn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718-a11ccfe6.pth
+
+  - Name: ga-faster-rcnn_r101-caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-faster-rcnn_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_bbox_mAP-0.415_20200505_115528-fb82e499.pth
+
+  - Name: ga-faster-rcnn_x101-32x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.7
+      inference time (ms/im):
+        - value: 103.09
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215-1ded9da3.pth
+
+  - Name: ga-faster-rcnn_x101-64x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 11.8
+      inference time (ms/im):
+        - value: 136.99
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215-0fa7bde7.pth
+
+  - Name: ga-retinanet_r50-caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.5
+      inference time (ms/im):
+        - value: 59.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020-39581c6f.pth
+
+  - Name: ga-retinanet_r101-caffe_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531-6266453c.pth
+
+  - Name: ga-retinanet_x101-32x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-retinanet_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.9
+      inference time (ms/im):
+        - value: 94.34
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219-40c56caa.pth
+
+  - Name: ga-retinanet_x101-64x4d_fpn_1x_coco
+    In Collection: Guided Anchoring
+    Config: configs/guided_anchoring/ga-retinanet_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      inference time (ms/im):
+        - value: 129.87
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226-ef9f7f1f.pth
diff --git a/mmde/mmdet/.mim/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py b/mmde/mmdet/.mim/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ca0ebfe43b00886b22ffc426c5ac89a50f4fda6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py
@@ -0,0 +1,11 @@
+_base_ = './cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        extra=dict(
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py b/mmde/mmdet/.mim/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ffedc3916748c3c6b333023110e56895de7e4bd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py
@@ -0,0 +1,51 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],
+        out_channels=256))
+# learning policy
+max_epochs = 20
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 19],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py b/mmde/mmdet/.mim/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a51a02412871905d947bcbb648b1a24e5033f56
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py
@@ -0,0 +1,12 @@
+_base_ = './cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        type='HRNet',
+        extra=dict(
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/cascade-rcnn_hrnetv2p-w18-20e_coco.py b/mmde/mmdet/.mim/configs/hrnet/cascade-rcnn_hrnetv2p-w18-20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8834c1d4ac7973a0e5ceb9f794786c0d706f343a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/cascade-rcnn_hrnetv2p-w18-20e_coco.py
@@ -0,0 +1,11 @@
+_base_ = './cascade-rcnn_hrnetv2p-w32-20e_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        extra=dict(
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/cascade-rcnn_hrnetv2p-w32-20e_coco.py b/mmde/mmdet/.mim/configs/hrnet/cascade-rcnn_hrnetv2p-w32-20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..afeb75dbe13c5a8425924e280b250208aaec872f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/cascade-rcnn_hrnetv2p-w32-20e_coco.py
@@ -0,0 +1,51 @@
+_base_ = '../cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],
+        out_channels=256))
+# learning policy
+max_epochs = 20
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 19],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/cascade-rcnn_hrnetv2p-w40-20e_coco.py b/mmde/mmdet/.mim/configs/hrnet/cascade-rcnn_hrnetv2p-w40-20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..66f8882a0030ae82f7a74f67963bbd1da3422a48
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/cascade-rcnn_hrnetv2p-w40-20e_coco.py
@@ -0,0 +1,12 @@
+_base_ = './cascade-rcnn_hrnetv2p-w32-20e_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        type='HRNet',
+        extra=dict(
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee9a698699a6674c90011b4037843560459462db
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = './faster-rcnn_hrnetv2p-w32-1x_coco.py'
+# model settings
+model = dict(
+    backbone=dict(
+        extra=dict(
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b72c68f8cbbc83d16313c6d3ab3faf0ac86926f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './faster-rcnn_hrnetv2p-w18-1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a27ad06c5c169c84c6368f767b79b0a817d99fa1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py
@@ -0,0 +1,37 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],
+        out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9568ce65c142f86ec6181236464454106d7de99
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './faster-rcnn_hrnetv2p-w32-1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b36200230b76269a9644cc7852cec6ce62eac5c3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = './faster-rcnn_hrnetv2p-w32-1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='HRNet',
+        extra=dict(
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b45355db1de7c649136438b91fec5199e08141
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './faster-rcnn_hrnetv2p-w40-1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c20ca7767364e14e552b5b8af68a8124f6a1253e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py
@@ -0,0 +1,10 @@
+_base_ = './fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py'
+model = dict(
+    backbone=dict(
+        extra=dict(
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5b67f6a12e294455829dddb89d05e281f2d7dc0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5332d65d129255117f459f45369d5e13ed6653c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py
@@ -0,0 +1,10 @@
+_base_ = './fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py'
+model = dict(
+    backbone=dict(
+        extra=dict(
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..159d96d712ae047efd7988bc53ae65006291478f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py
@@ -0,0 +1,43 @@
+_base_ = '../fcos/fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py'
+model = dict(
+    data_preprocessor=dict(
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],
+        out_channels=256,
+        stride=2,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..73fd80e979d88840a57c68ca2fad6cb2e82a26bd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c977bf31ed2fb0ef062108cea97c1cd235b89d3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py
@@ -0,0 +1,35 @@
+_base_ = './fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py'
+
+model = dict(
+    data_preprocessor=dict(
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb0ff6d6ce80e702f6e88b556a770345a23afca4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py
@@ -0,0 +1,11 @@
+_base_ = './fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='HRNet',
+        extra=dict(
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w18_20e_coco.py b/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w18_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..55255d52a3541c99660dcddfba96da27c99f841d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w18_20e_coco.py
@@ -0,0 +1,10 @@
+_base_ = './htc_hrnetv2p-w32_20e_coco.py'
+model = dict(
+    backbone=dict(
+        extra=dict(
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w32_20e_coco.py b/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w32_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..545cb83eaca50f9d5de1fa6b3f3e569faab7d5f2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w32_20e_coco.py
@@ -0,0 +1,37 @@
+_base_ = '../htc/htc_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],
+        out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w40_20e_coco.py b/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w40_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b09256a08ee16893bcc0dd6518714daece294e0d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w40_20e_coco.py
@@ -0,0 +1,11 @@
+_base_ = './htc_hrnetv2p-w32_20e_coco.py'
+model = dict(
+    backbone=dict(
+        type='HRNet',
+        extra=dict(
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w40_28e_coco.py b/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w40_28e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c13b58a1a0690d19239fef40915489ddaff408e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/htc_hrnetv2p-w40_28e_coco.py
@@ -0,0 +1,16 @@
+_base_ = './htc_hrnetv2p-w40_20e_coco.py'
+
+# learning policy
+max_epochs = 28
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[24, 27],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/htc_x101-64x4d_fpn_16xb1-28e_coco.py b/mmde/mmdet/.mim/configs/hrnet/htc_x101-64x4d_fpn_16xb1-28e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f1304e5f963351667c28cb264ca5434bc81f744
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/htc_x101-64x4d_fpn_16xb1-28e_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py'
+
+# learning policy
+max_epochs = 28
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[24, 27],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d5a463a66bed51d73a42eafffea654a18c111ce
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py
@@ -0,0 +1,10 @@
+_base_ = './mask-rcnn_hrnetv2p-w32-1x_coco.py'
+model = dict(
+    backbone=dict(
+        extra=dict(
+            stage2=dict(num_channels=(18, 36)),
+            stage3=dict(num_channels=(18, 36, 72)),
+            stage4=dict(num_channels=(18, 36, 72, 144))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')),
+    neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8abc55924a3eb8e06f9e1e5eeed503890542f6f6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './mask-rcnn_hrnetv2p-w18-1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..208b037807dfa9cab1d33ac58ac785ff72e400c1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py
@@ -0,0 +1,37 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')),
+    neck=dict(
+        _delete_=True,
+        type='HRFPN',
+        in_channels=[32, 64, 128, 256],
+        out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3741c820a6a0ca622ce6bbf80cb3e922107efb6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './mask-rcnn_hrnetv2p-w32-1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..360420c56d42814ed6f4d84775f1a19dfa96574a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './mask-rcnn_hrnetv2p-w40_1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..36e2305a520fd8305f9fd1358f5cbcb01027e40d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py
@@ -0,0 +1,11 @@
+_base_ = './mask-rcnn_hrnetv2p-w18-1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='HRNet',
+        extra=dict(
+            stage2=dict(num_channels=(40, 80)),
+            stage3=dict(num_channels=(40, 80, 160)),
+            stage4=dict(num_channels=(40, 80, 160, 320))),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')),
+    neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256))
diff --git a/mmde/mmdet/.mim/configs/hrnet/metafile.yml b/mmde/mmdet/.mim/configs/hrnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..54c624793291dc9a713c9a6fa6df50499136768c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/hrnet/metafile.yml
@@ -0,0 +1,971 @@
+Models:
+  - Name: faster-rcnn_hrnetv2p-w18-1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.6
+      inference time (ms/im):
+        - value: 74.63
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130-56651a6d.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster-rcnn_hrnetv2p-w18-2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.6
+      inference time (ms/im):
+        - value: 74.63
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731-a4ec0611.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster-rcnn_hrnetv2p-w32-1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      inference time (ms/im):
+        - value: 80.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130-6e286425.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster-rcnn_hrnetv2p-w32_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py
+    Metadata:
+      Training Memory (GB): 9.0
+      inference time (ms/im):
+        - value: 80.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927-976a9c15.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster-rcnn_hrnetv2p-w40-1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.4
+      inference time (ms/im):
+        - value: 95.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210-95c1f5ce.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: faster-rcnn_hrnetv2p-w40_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.4
+      inference time (ms/im):
+        - value: 95.24
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033-0f236ef4.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask-rcnn_hrnetv2p-w18-1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 85.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205-1c3d78ed.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask-rcnn_hrnetv2p-w18-2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 85.47
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212-b3c825b1.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask-rcnn_hrnetv2p-w32-1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.4
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207-b29f616e.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask-rcnn_hrnetv2p-w32-2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py
+    Metadata:
+      Training Memory (GB): 9.4
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213-45b75b4d.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask-rcnn_hrnetv2p-w40_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.9
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646-66738b35.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: mask-rcnn_hrnetv2p-w40-2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.9
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732-aed5e4ab.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade-rcnn_hrnetv2p-w18-20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade-rcnn_hrnetv2p-w18-20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210-434be9d7.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade-rcnn_hrnetv2p-w32-20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade-rcnn_hrnetv2p-w32-20e_coco.py
+    Metadata:
+      Training Memory (GB): 9.4
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208-928455a4.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade-rcnn_hrnetv2p-w40-20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade-rcnn_hrnetv2p-w40-20e_coco.py
+    Metadata:
+      Training Memory (GB): 10.8
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112-75e47b04.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade-mask-rcnn_hrnetv2p-w18_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py
+    Metadata:
+      Training Memory (GB): 8.5
+      inference time (ms/im):
+        - value: 117.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210-b543cd2b.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade-mask-rcnn_hrnetv2p-w32_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py
+    Metadata:
+      inference time (ms/im):
+        - value: 120.48
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043-39d9cf7b.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: cascade-mask-rcnn_hrnetv2p-w40-20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/hrnet/cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py
+    Metadata:
+      Training Memory (GB): 12.5
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922-969c4610.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: htc_hrnetv2p-w18_20e_coco
+    In Collection: HTC
+    Config: configs/hrnet/htc_hrnetv2p-w18_20e_coco.py
+    Metadata:
+      Training Memory (GB): 10.8
+      inference time (ms/im):
+        - value: 212.77
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210-b266988c.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: htc_hrnetv2p-w32_20e_coco
+    In Collection: HTC
+    Config: configs/hrnet/htc_hrnetv2p-w32_20e_coco.py
+    Metadata:
+      Training Memory (GB): 13.1
+      inference time (ms/im):
+        - value: 204.08
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207-7639fa12.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: htc_hrnetv2p-w40_20e_coco
+    In Collection: HTC
+    Config: configs/hrnet/htc_hrnetv2p-w40_20e_coco.py
+    Metadata:
+      Training Memory (GB): 14.6
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411-417c4d5b.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 13.0
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 35.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710-4ad151de.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 13.0
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110-5c575fa5.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 17.5
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730-cb8055c0.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 17.5
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133-77b6b9bb.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 13.0
+      inference time (ms/im):
+        - value: 77.52
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651-441e9d9f.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 17.5
+      inference time (ms/im):
+        - value: 80.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846-b6f2b49f.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
+
+  - Name: fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco
+    In Collection: FCOS
+    Config: configs/hrnet/fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py
+    Metadata:
+      Training Resources: 4x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 20.3
+      inference time (ms/im):
+        - value: 92.59
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Architecture:
+        - HRNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752-f22d2ce5.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.04514
+      Title: 'Deep High-Resolution Representation Learning for Visual Recognition'
+    README: configs/hrnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195
+      Version: v2.0.0
diff --git a/mmde/mmdet/.mim/configs/htc/htc-without-semantic_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/htc/htc-without-semantic_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..791f4eb25b53e122cd4876a71e84a4a9d2f67e26
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/htc/htc-without-semantic_r50_fpn_1x_coco.py
@@ -0,0 +1,223 @@
+_base_ = [
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='HybridTaskCascade',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='HybridTaskCascadeRoIHead',
+        interleaved=True,
+        mask_info_flow=True,
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=[
+            dict(
+                type='HTCMaskHead',
+                with_conv_res=False,
+                num_convs=4,
+                in_channels=256,
+                conv_out_channels=256,
+                num_classes=80,
+                loss_mask=dict(
+                    type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+            dict(
+                type='HTCMaskHead',
+                num_convs=4,
+                in_channels=256,
+                conv_out_channels=256,
+                num_classes=80,
+                loss_mask=dict(
+                    type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+            dict(
+                type='HTCMaskHead',
+                num_convs=4,
+                in_channels=256,
+                conv_out_channels=256,
+                num_classes=80,
+                loss_mask=dict(
+                    type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))
+        ]),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=[
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.001,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/mmde/mmdet/.mim/configs/htc/htc_r101_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/htc/htc_r101_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..28091aad31029109c29941404f2c3cc47f9c1092
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/htc/htc_r101_fpn_20e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './htc_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/htc/htc_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/htc/htc_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3573f1f698095585f4a1de692d0e45a21429822e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/htc/htc_r50_fpn_1x_coco.py
@@ -0,0 +1,33 @@
+_base_ = './htc-without-semantic_r50_fpn_1x_coco.py'
+model = dict(
+    data_preprocessor=dict(pad_seg=True),
+    roi_head=dict(
+        semantic_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[8]),
+        semantic_head=dict(
+            type='FusedSemanticHead',
+            num_ins=5,
+            fusion_level=1,
+            seg_scale_factor=1 / 8,
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=183,
+            loss_seg=dict(
+                type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2))))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(
+    dataset=dict(
+        data_prefix=dict(img='train2017/', seg='stuffthingmaps/train2017/'),
+        pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/htc/htc_r50_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/htc/htc_r50_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f510fa6eec210381707f4d1b01264e72e0d0f76
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/htc/htc_r50_fpn_20e_coco.py
@@ -0,0 +1,16 @@
+_base_ = './htc_r50_fpn_1x_coco.py'
+
+# learning policy
+max_epochs = 20
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 19],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py b/mmde/mmdet/.mim/configs/htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..396d3a0e2b72acc1d9601706ec4629720a46a738
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py
@@ -0,0 +1,32 @@
+_base_ = './htc_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
+
+train_dataloader = dict(batch_size=1, num_workers=1)
+
+# learning policy
+max_epochs = 20
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 19],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/htc/htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py b/mmde/mmdet/.mim/configs/htc/htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..26d68e7e2cda2a711e4d16899ae85b100afc60a0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/htc/htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py
@@ -0,0 +1,20 @@
+_base_ = './htc_x101-64x4d_fpn_16xb1-20e_coco.py'
+
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
+    dict(
+        type='RandomResize',
+        scale=[(1600, 400), (1600, 1400)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py b/mmde/mmdet/.mim/configs/htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a600ddb0ebd2287cdaa0d00a6008db636d79be76
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './htc_x101-32x4d_fpn_16xb1-20e_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        groups=64,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/htc/metafile.yml b/mmde/mmdet/.mim/configs/htc/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2f0f74d2d06a0f6053fa7f0b9bb73024f8dcaac5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/htc/metafile.yml
@@ -0,0 +1,165 @@
+Collections:
+  - Name: HTC
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - HTC
+        - RPN
+        - ResNet
+        - ResNeXt
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/1901.07518
+      Title: 'Hybrid Task Cascade for Instance Segmentation'
+    README: configs/htc/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/htc.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: htc_r50_fpn_1x_coco
+    In Collection: HTC
+    Config: configs/htc/htc_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.2
+      inference time (ms/im):
+        - value: 172.41
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317-7332cf16.pth
+
+  - Name: htc_r50_fpn_20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_r50_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 8.2
+      inference time (ms/im):
+        - value: 172.41
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth
+
+  - Name: htc_r101_fpn_20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_r101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 10.2
+      inference time (ms/im):
+        - value: 181.82
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317-9b41b48f.pth
+
+  - Name: htc_x101-32x4d_fpn_16xb1-20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py
+    Metadata:
+      Training Resources: 16x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 11.4
+      inference time (ms/im):
+        - value: 200
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318-de97ae01.pth
+
+  - Name: htc_x101-64x4d_fpn_16xb1-20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py
+    Metadata:
+      Training Resources: 16x V100 GPUs
+      Batch Size: 16
+      Training Memory (GB): 14.5
+      inference time (ms/im):
+        - value: 227.27
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318-b181fd7a.pth
+
+  - Name: htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco
+    In Collection: HTC
+    Config: configs/htc/htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py
+    Metadata:
+      Training Resources: 16x V100 GPUs
+      Batch Size: 16
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 43.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth
diff --git a/mmde/mmdet/.mim/configs/instaboost/cascade-mask-rcnn_r101_fpn_instaboost-4x_coco.py b/mmde/mmdet/.mim/configs/instaboost/cascade-mask-rcnn_r101_fpn_instaboost-4x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..53e33b890cad86fcc64e6ea6eefe39138241c8e7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/instaboost/cascade-mask-rcnn_r101_fpn_instaboost-4x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/instaboost/cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py b/mmde/mmdet/.mim/configs/instaboost/cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7736cf5756676944c543b7e8412997ac81c2745
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/instaboost/cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py
@@ -0,0 +1,40 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='InstaBoost',
+        action_candidate=('normal', 'horizontal', 'skip'),
+        action_prob=(1, 0, 0),
+        scale=(0.8, 1.2),
+        dx=15,
+        dy=15,
+        theta=(-1, 1),
+        color_prob=0.5,
+        hflag=False,
+        aug_ratio=0.5),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+max_epochs = 48
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[32, 44],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
+
+# only keep latest 3 checkpoints
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
diff --git a/mmde/mmdet/.mim/configs/instaboost/cascade-mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py b/mmde/mmdet/.mim/configs/instaboost/cascade-mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7938d9e00e3a9c030b788ca83b1a6ddee208aed
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/instaboost/cascade-mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/instaboost/mask-rcnn_r101_fpn_instaboost-4x_coco.py b/mmde/mmdet/.mim/configs/instaboost/mask-rcnn_r101_fpn_instaboost-4x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..55bfa9fefa4db9d6d69fb3c4a285d04592168398
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/instaboost/mask-rcnn_r101_fpn_instaboost-4x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_instaboost-4x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py b/mmde/mmdet/.mim/configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a8c9be81f03f98f97975aca47922575555e3844
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py
@@ -0,0 +1,40 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='InstaBoost',
+        action_candidate=('normal', 'horizontal', 'skip'),
+        action_prob=(1, 0, 0),
+        scale=(0.8, 1.2),
+        dx=15,
+        dy=15,
+        theta=(-1, 1),
+        color_prob=0.5,
+        hflag=False,
+        aug_ratio=0.5),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+max_epochs = 48
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[32, 44],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
+
+# only keep latest 3 checkpoints
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
diff --git a/mmde/mmdet/.mim/configs/instaboost/mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py b/mmde/mmdet/.mim/configs/instaboost/mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ba2ada6011dd77ea2dcac2133bef8d92e522381
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/instaboost/mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask-rcnn_r50_fpn_instaboost-4x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/instaboost/metafile.yml b/mmde/mmdet/.mim/configs/instaboost/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..228f31b7301e6a5f9d2206e10be07bc7ea3b70be
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/instaboost/metafile.yml
@@ -0,0 +1,99 @@
+Collections:
+  - Name: InstaBoost
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - InstaBoost
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+    Paper:
+      URL: https://arxiv.org/abs/1908.07801
+      Title: 'Instaboost: Boosting instance segmentation via probability map guided copy-pasting'
+    README: configs/instaboost/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/datasets/pipelines/instaboost.py#L7
+      Version: v2.0.0
+
+Models:
+  - Name: mask-rcnn_r50_fpn_instaboost_4x_coco
+    In Collection: InstaBoost
+    Config: configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 57.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 48
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-d025f83a.pth
+
+  - Name: mask-rcnn_r101_fpn_instaboost-4x_coco
+    In Collection: InstaBoost
+    Config: configs/instaboost/mask-rcnn_r101_fpn_instaboost-4x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      Epochs: 48
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738-f23f3a5f.pth
+
+  - Name: mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco
+    In Collection: InstaBoost
+    Config: configs/instaboost/mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      Epochs: 48
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947-8ed58c1b.pth
+
+  - Name: cascade-mask-rcnn_r50_fpn_instaboost_4x_coco
+    In Collection: InstaBoost
+    Config: configs/instaboost/cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      inference time (ms/im):
+        - value: 83.33
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 48
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-c19d98d9.pth
diff --git a/mmde/mmdet/.mim/configs/lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py b/mmde/mmdet/.mim/configs/lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..d61d08638a073f3dad71d7499221e3ef62ff90f3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py
@@ -0,0 +1,127 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth'  # noqa
+
+model = dict(
+    type='LAD',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    # student
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='LADHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # teacher
+    teacher_ckpt=teacher_ckpt,
+    teacher_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    teacher_neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    teacher_bbox_head=dict(
+        type='LADHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.1,
+            neg_iou_thr=0.1,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        score_voting=True,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+train_dataloader = dict(batch_size=8, num_workers=4)
+optim_wrapper = dict(type='AmpOptimWrapper', optimizer=dict(lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py b/mmde/mmdet/.mim/configs/lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7eaf2bfba1c41b42836e94ffe2714978dffd20a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py
@@ -0,0 +1,126 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+teacher_ckpt = 'http://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth'  # noqa
+
+model = dict(
+    type='LAD',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    # student
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='LADHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # teacher
+    teacher_ckpt=teacher_ckpt,
+    teacher_backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    teacher_neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    teacher_bbox_head=dict(
+        type='LADHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.1,
+            neg_iou_thr=0.1,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        score_voting=True,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+train_dataloader = dict(batch_size=8, num_workers=4)
+optim_wrapper = dict(type='AmpOptimWrapper', optimizer=dict(lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/lad/metafile.yml b/mmde/mmdet/.mim/configs/lad/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..230132e63c06c77e16902450c282cf9a25150751
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lad/metafile.yml
@@ -0,0 +1,45 @@
+Collections:
+  - Name: Label Assignment Distillation
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - Label Assignment Distillation
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 2x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2108.10520
+      Title: 'Improving Object Detection by Label Assignment Distillation'
+    README: configs/lad/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.19.0/mmdet/models/detectors/lad.py#L10
+      Version: v2.19.0
+
+Models:
+  - Name: lad_r101-paa-r50_fpn_2xb8_coco_1x
+    In Collection: Label Assignment Distillation
+    Config: configs/lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py
+    Metadata:
+      Training Memory (GB): 12.4
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 43.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r101_paa_r50_fpn_coco_1x/lad_r101_paa_r50_fpn_coco_1x_20220708_124357-9407ac54.pth
+  - Name: lad_r50-paa-r101_fpn_2xb8_coco_1x
+    In Collection: Label Assignment Distillation
+    Config: configs/lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py
+    Metadata:
+      Training Memory (GB): 8.9
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r50_paa_r101_fpn_coco_1x/lad_r50_paa_r101_fpn_coco_1x_20220708_124246-74c76ff0.pth
diff --git a/mmde/mmdet/.mim/configs/ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7e928bdc2325825d836bd939f163d71e972c238
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py
@@ -0,0 +1,49 @@
+_base_ = ['./ld_r18-gflv1-r101_fpn_1x_coco.py']
+teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth'  # noqa
+model = dict(
+    teacher_config='configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py',
+    teacher_ckpt=teacher_ckpt,
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5))
+
+max_epochs = 24
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
+
+# multi-scale training
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 480), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/ld/ld_r18-gflv1-r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/ld/ld_r18-gflv1-r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f18bb1d3620f3caecdc870ea8a3346424729225c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ld/ld_r18-gflv1-r101_fpn_1x_coco.py
@@ -0,0 +1,70 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth'  # noqa
+model = dict(
+    type='KnowledgeDistillationSingleStageDetector',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    teacher_config='configs/gfl/gfl_r101_fpn_ms-2x_coco.py',
+    teacher_ckpt=teacher_ckpt,
+    backbone=dict(
+        type='ResNet',
+        depth=18,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 256, 512],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='LDHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            beta=2.0,
+            loss_weight=1.0),
+        loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25),
+        loss_ld=dict(
+            type='KnowledgeDistillationKLDivLoss', loss_weight=0.25, T=10),
+        reg_max=16,
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/ld/ld_r34-gflv1-r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/ld/ld_r34-gflv1-r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2198adc82cfc98fca139e120ea0487989ac8bae7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ld/ld_r34-gflv1-r101_fpn_1x_coco.py
@@ -0,0 +1,19 @@
+_base_ = ['./ld_r18-gflv1-r101_fpn_1x_coco.py']
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=34,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet34')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 256, 512],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/ld/ld_r50-gflv1-r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/ld/ld_r50-gflv1-r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..89ab5796969b88080f96f3afcc24183b0c11c730
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ld/ld_r50-gflv1-r101_fpn_1x_coco.py
@@ -0,0 +1,19 @@
+_base_ = ['./ld_r18-gflv1-r101_fpn_1x_coco.py']
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/ld/metafile.yml b/mmde/mmdet/.mim/configs/ld/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a807d1b816e78734839cc1482c9c3d4afe59d6ac
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ld/metafile.yml
@@ -0,0 +1,69 @@
+Collections:
+  - Name: Localization Distillation
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - Localization Distillation
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2102.12252
+      Title: 'Localization Distillation for Dense Object Detection'
+    README: configs/ld/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.11.0/mmdet/models/dense_heads/ld_head.py#L11
+      Version: v2.11.0
+
+Models:
+  - Name: ld_r18-gflv1-r101_fpn_1x_coco
+    In Collection: Localization Distillation
+    Config: configs/ld/ld_r18-gflv1-r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 1.8
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r18_gflv1_r101_fpn_coco_1x/ld_r18_gflv1_r101_fpn_coco_1x_20220702_062206-330e6332.pth
+  - Name: ld_r34-gflv1-r101_fpn_1x_coco
+    In Collection: Localization Distillation
+    Config: configs/ld/ld_r34-gflv1-r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 2.2
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r34_gflv1_r101_fpn_coco_1x/ld_r34_gflv1_r101_fpn_coco_1x_20220630_134007-9bc69413.pth
+  - Name: ld_r50-gflv1-r101_fpn_1x_coco
+    In Collection: Localization Distillation
+    Config: configs/ld/ld_r50-gflv1-r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.6
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 41.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r50_gflv1_r101_fpn_coco_1x/ld_r50_gflv1_r101_fpn_coco_1x_20220629_145355-8dc5bad8.pth
+  - Name: ld_r101-gflv1-r101-dcn_fpn_2x_coco
+    In Collection: Localization Distillation
+    Config: configs/ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      Epochs: 24
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 45.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x/ld_r101_gflv1_r101dcn_fpn_coco_2x_20220629_185920-9e658426.pth
diff --git a/mmde/mmdet/.mim/configs/legacy_1.x/cascade-mask-rcnn_r50_fpn_1x_coco_v1.py b/mmde/mmdet/.mim/configs/legacy_1.x/cascade-mask-rcnn_r50_fpn_1x_coco_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..f948a7a9c10f618438e8ff54bdf3333335577e90
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/legacy_1.x/cascade-mask-rcnn_r50_fpn_1x_coco_v1.py
@@ -0,0 +1,78 @@
+_base_ = [
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='CascadeRCNN',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        anchor_generator=dict(type='LegacyAnchorGenerator', center_offset=0.5),
+        bbox_coder=dict(
+            type='LegacyDeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0])),
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=7,
+                sampling_ratio=2,
+                aligned=False)),
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                reg_class_agnostic=True,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='LegacyDeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2])),
+            dict(
+                type='Shared2FCBBoxHead',
+                reg_class_agnostic=True,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='LegacyDeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1])),
+            dict(
+                type='Shared2FCBBoxHead',
+                reg_class_agnostic=True,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='LegacyDeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067])),
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=14,
+                sampling_ratio=2,
+                aligned=False))))
diff --git a/mmde/mmdet/.mim/configs/legacy_1.x/faster-rcnn_r50_fpn_1x_coco_v1.py b/mmde/mmdet/.mim/configs/legacy_1.x/faster-rcnn_r50_fpn_1x_coco_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bf9713793c4a0a951273d037253f930fbb31a6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/legacy_1.x/faster-rcnn_r50_fpn_1x_coco_v1.py
@@ -0,0 +1,38 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='FasterRCNN',
+    backbone=dict(
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    rpn_head=dict(
+        type='RPNHead',
+        anchor_generator=dict(
+            type='LegacyAnchorGenerator',
+            center_offset=0.5,
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=7,
+                sampling_ratio=2,
+                aligned=False),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn_proposal=dict(max_per_img=2000),
+        rcnn=dict(assigner=dict(match_low_quality=True))))
diff --git a/mmde/mmdet/.mim/configs/legacy_1.x/mask-rcnn_r50_fpn_1x_coco_v1.py b/mmde/mmdet/.mim/configs/legacy_1.x/mask-rcnn_r50_fpn_1x_coco_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..690802598493e64821aaf98111161e36b169e475
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/legacy_1.x/mask-rcnn_r50_fpn_1x_coco_v1.py
@@ -0,0 +1,34 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    rpn_head=dict(
+        anchor_generator=dict(type='LegacyAnchorGenerator', center_offset=0.5),
+        bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=7,
+                sampling_ratio=2,
+                aligned=False)),
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=14,
+                sampling_ratio=2,
+                aligned=False)),
+        bbox_head=dict(
+            bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+
+    # model training and testing settings
+    train_cfg=dict(
+        rpn_proposal=dict(max_per_img=2000),
+        rcnn=dict(assigner=dict(match_low_quality=True))))
diff --git a/mmde/mmdet/.mim/configs/legacy_1.x/retinanet_r50-caffe_fpn_1x_coco_v1.py b/mmde/mmdet/.mim/configs/legacy_1.x/retinanet_r50-caffe_fpn_1x_coco_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..49abc31a002f56147cacf1b7707140a14b784a99
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/legacy_1.x/retinanet_r50-caffe_fpn_1x_coco_v1.py
@@ -0,0 +1,16 @@
+_base_ = './retinanet_r50_fpn_1x_coco_v1.py'
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        # use caffe img_norm
+        mean=[102.9801, 115.9465, 122.7717],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py b/mmde/mmdet/.mim/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..6198b9717957374ce734ca74de5f54dda44123b9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py
@@ -0,0 +1,17 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    bbox_head=dict(
+        type='RetinaHead',
+        anchor_generator=dict(
+            type='LegacyAnchorGenerator',
+            center_offset=0.5,
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)))
diff --git a/mmde/mmdet/.mim/configs/legacy_1.x/ssd300_coco_v1.py b/mmde/mmdet/.mim/configs/legacy_1.x/ssd300_coco_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5ffc633a9b4773d7116bed7cbf8bcab7fb3110d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/legacy_1.x/ssd300_coco_v1.py
@@ -0,0 +1,20 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# model settings
+input_size = 300
+model = dict(
+    bbox_head=dict(
+        type='SSDHead',
+        anchor_generator=dict(
+            type='LegacySSDAnchorGenerator',
+            scale_major=False,
+            input_size=input_size,
+            basesize_ratio_range=(0.15, 0.9),
+            strides=[8, 16, 32, 64, 100, 300],
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]),
+        bbox_coder=dict(
+            type='LegacyDeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2])))
diff --git a/mmde/mmdet/.mim/configs/libra_rcnn/libra-fast-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/libra_rcnn/libra-fast-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2efe440ce361d5bc5855c76001a5ff6b661a568a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/libra_rcnn/libra-fast-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,52 @@
+_base_ = '../fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            num_outs=5),
+        dict(
+            type='BFP',
+            in_channels=256,
+            num_levels=5,
+            refine_level=2,
+            refine_type='non_local')
+    ],
+    roi_head=dict(
+        bbox_head=dict(
+            loss_bbox=dict(
+                _delete_=True,
+                type='BalancedL1Loss',
+                alpha=0.5,
+                gamma=1.5,
+                beta=1.0,
+                loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            sampler=dict(
+                _delete_=True,
+                type='CombinedSampler',
+                num=512,
+                pos_fraction=0.25,
+                add_gt_as_proposals=True,
+                pos_sampler=dict(type='InstanceBalancedPosSampler'),
+                neg_sampler=dict(
+                    type='IoUBalancedNegSampler',
+                    floor_thr=-1,
+                    floor_fraction=0,
+                    num_bins=3)))))
+
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# _base_.train_dataloader.dataset.proposal_file = 'libra_proposals/rpn_r50_fpn_1x_train2017.pkl'  # noqa
+train_dataloader = dict(
+    dataset=dict(proposal_file='libra_proposals/rpn_r50_fpn_1x_train2017.pkl'))
+
+# _base_.val_dataloader.dataset.proposal_file = 'libra_proposals/rpn_r50_fpn_1x_val2017.pkl'  # noqa
+# test_dataloader = _base_.val_dataloader
+val_dataloader = dict(
+    dataset=dict(proposal_file='libra_proposals/rpn_r50_fpn_1x_val2017.pkl'))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/libra_rcnn/libra-faster-rcnn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/libra_rcnn/libra-faster-rcnn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..985df64cb437e233f76235ee9be4b788ec8f701c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/libra_rcnn/libra-faster-rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './libra-faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9ee507d26338b49eca004ee195fd2b1954c32d9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,41 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            num_outs=5),
+        dict(
+            type='BFP',
+            in_channels=256,
+            num_levels=5,
+            refine_level=2,
+            refine_type='non_local')
+    ],
+    roi_head=dict(
+        bbox_head=dict(
+            loss_bbox=dict(
+                _delete_=True,
+                type='BalancedL1Loss',
+                alpha=0.5,
+                gamma=1.5,
+                beta=1.0,
+                loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(sampler=dict(neg_pos_ub=5), allowed_border=-1),
+        rcnn=dict(
+            sampler=dict(
+                _delete_=True,
+                type='CombinedSampler',
+                num=512,
+                pos_fraction=0.25,
+                add_gt_as_proposals=True,
+                pos_sampler=dict(type='InstanceBalancedPosSampler'),
+                neg_sampler=dict(
+                    type='IoUBalancedNegSampler',
+                    floor_thr=-1,
+                    floor_fraction=0,
+                    num_bins=3)))))
diff --git a/mmde/mmdet/.mim/configs/libra_rcnn/libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/libra_rcnn/libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..158e238ed14d9c56b7d02d17f0061b08d4116282
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/libra_rcnn/libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './libra-faster-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/libra_rcnn/libra-retinanet_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/libra_rcnn/libra-retinanet_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..be2742098fb8f1e46bbb16c9d3e2e20c2e3083aa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/libra_rcnn/libra-retinanet_r50_fpn_1x_coco.py
@@ -0,0 +1,26 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    neck=[
+        dict(
+            type='FPN',
+            in_channels=[256, 512, 1024, 2048],
+            out_channels=256,
+            start_level=1,
+            add_extra_convs='on_input',
+            num_outs=5),
+        dict(
+            type='BFP',
+            in_channels=256,
+            num_levels=5,
+            refine_level=1,
+            refine_type='non_local')
+    ],
+    bbox_head=dict(
+        loss_bbox=dict(
+            _delete_=True,
+            type='BalancedL1Loss',
+            alpha=0.5,
+            gamma=1.5,
+            beta=0.11,
+            loss_weight=1.0)))
diff --git a/mmde/mmdet/.mim/configs/libra_rcnn/metafile.yml b/mmde/mmdet/.mim/configs/libra_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f01bd02bb7a55dd899bc64a56346357f2951f6d5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/libra_rcnn/metafile.yml
@@ -0,0 +1,99 @@
+Collections:
+  - Name: Libra R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - IoU-Balanced Sampling
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Balanced Feature Pyramid
+    Paper:
+      URL: https://arxiv.org/abs/1904.02701
+      Title: 'Libra R-CNN: Towards Balanced Learning for Object Detection'
+    README: configs/libra_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/bfp.py#L10
+      Version: v2.0.0
+
+Models:
+  - Name: libra-faster-rcnn_r50_fpn_1x_coco
+    In Collection: Libra R-CNN
+    Config: configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      inference time (ms/im):
+        - value: 52.63
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130-3afee3a9.pth
+
+  - Name: libra-faster-rcnn_r101_fpn_1x_coco
+    In Collection: Libra R-CNN
+    Config: configs/libra_rcnn/libra-faster-rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      inference time (ms/im):
+        - value: 69.44
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203-8dba6a5a.pth
+
+  - Name: libra-faster-rcnn_x101-64x4d_fpn_1x_coco
+    In Collection: Libra R-CNN
+    Config: configs/libra_rcnn/libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.8
+      inference time (ms/im):
+        - value: 117.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315-3a7d0488.pth
+
+  - Name: libra-retinanet_r50_fpn_1x_coco
+    In Collection: Libra R-CNN
+    Config: configs/libra_rcnn/libra-retinanet_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      inference time (ms/im):
+        - value: 56.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205-804d94ce.pth
diff --git a/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..3994d75a81aaa5368bd42c591fa770b05b665e25
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed8b3639a0046e14d5c11a98f9d7dc38eb4badec
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdd3683e3005dd09ada78827825da516bfd4c66e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py
@@ -0,0 +1,13 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/lvis_v1_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=1203), mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
diff --git a/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py
new file mode 100644
index 0000000000000000000000000000000000000000..b36b6c17fef7da3646654e494fa715302b1b050e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py
@@ -0,0 +1,13 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/lvis_v0.5_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=1230), mask_head=dict(num_classes=1230)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
diff --git a/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..9da3ab6db04ec6ee772202270a47179171a9d13c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py
@@ -0,0 +1,14 @@
+_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a097c94c7e2d7c7b583027ce6000aba8205d490
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py
@@ -0,0 +1,14 @@
+_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0819b3ec60d710205a643305edd2a27db977d9b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py
@@ -0,0 +1,14 @@
+_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d2720089181f066bcaa04b73903836b64b97bb9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py
@@ -0,0 +1,14 @@
+_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/lvis/metafile.yml b/mmde/mmdet/.mim/configs/lvis/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f8def96c7e5404bba0b40f4f00ce9efabfe0a891
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/lvis/metafile.yml
@@ -0,0 +1,128 @@
+Models:
+  - Name: mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5
+    In Collection: Mask R-CNN
+    Config: configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v0.5
+        Metrics:
+          box AP: 26.1
+      - Task: Instance Segmentation
+        Dataset: LVIS v0.5
+        Metrics:
+          mask AP: 25.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis-dbd06831.pth
+
+  - Name: mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5
+    In Collection: Mask R-CNN
+    Config: configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v0.5
+        Metrics:
+          box AP: 27.1
+      - Task: Instance Segmentation
+        Dataset: LVIS v0.5
+        Metrics:
+          mask AP: 27.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis-54582ee2.pth
+
+  - Name: mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5
+    In Collection: Mask R-CNN
+    Config: configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v0.5
+        Metrics:
+          box AP: 26.7
+      - Task: Instance Segmentation
+        Dataset: LVIS v0.5
+        Metrics:
+          mask AP: 26.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis-3cf55ea2.pth
+
+  - Name: mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5
+    In Collection: Mask R-CNN
+    Config: configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v0.5
+        Metrics:
+          box AP: 26.4
+      - Task: Instance Segmentation
+        Dataset: LVIS v0.5
+        Metrics:
+          mask AP: 26.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis-1c99a5ad.pth
+
+  - Name: mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1
+    In Collection: Mask R-CNN
+    Config: configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 22.5
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 21.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1-aa78ac3d.pth
+
+  - Name: mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1
+    In Collection: Mask R-CNN
+    Config: configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 24.6
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 23.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1-ec55ce32.pth
+
+  - Name: mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1
+    In Collection: Mask R-CNN
+    Config: configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 26.7
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 25.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-ebbc5c81.pth
+
+  - Name: mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1
+    In Collection: Mask R-CNN
+    Config: configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 27.2
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 25.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-43d9edfe.pth
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..66685a2fca9c0e165ba0024e242d5eabf5d565c9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic.py
@@ -0,0 +1,7 @@
+_base_ = './mask2former_r50_8xb2-lsj-50e_coco-panoptic.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4c29906d9fc6ce47ce928fb73dcb1bb6c6f7ba9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco.py
@@ -0,0 +1,7 @@
+_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco.py']
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..c53e981bf0d5081c3735676be922f64298a8fc80
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py
@@ -0,0 +1,251 @@
+_base_ = [
+    '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py'
+]
+image_size = (1024, 1024)
+batch_augments = [
+    dict(
+        type='BatchFixedSizePad',
+        size=image_size,
+        img_pad_value=0,
+        pad_mask=True,
+        mask_pad_value=0,
+        pad_seg=True,
+        seg_pad_value=255)
+]
+data_preprocessor = dict(
+    type='DetDataPreprocessor',
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True,
+    pad_size_divisor=32,
+    pad_mask=True,
+    mask_pad_value=0,
+    pad_seg=True,
+    seg_pad_value=255,
+    batch_augments=batch_augments)
+
+num_things_classes = 80
+num_stuff_classes = 53
+num_classes = num_things_classes + num_stuff_classes
+model = dict(
+    type='Mask2Former',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    panoptic_head=dict(
+        type='Mask2FormerHead',
+        in_channels=[256, 512, 1024, 2048],  # pass to pixel_decoder inside
+        strides=[4, 8, 16, 32],
+        feat_channels=256,
+        out_channels=256,
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        num_queries=100,
+        num_transformer_feat_level=3,
+        pixel_decoder=dict(
+            type='MSDeformAttnPixelDecoder',
+            num_outs=3,
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU'),
+            encoder=dict(  # DeformableDetrTransformerEncoder
+                num_layers=6,
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
+                        embed_dims=256,
+                        num_heads=8,
+                        num_levels=3,
+                        num_points=4,
+                        dropout=0.0,
+                        batch_first=True),
+                    ffn_cfg=dict(
+                        embed_dims=256,
+                        feedforward_channels=1024,
+                        num_fcs=2,
+                        ffn_drop=0.0,
+                        act_cfg=dict(type='ReLU', inplace=True)))),
+            positional_encoding=dict(num_feats=128, normalize=True)),
+        enforce_decoder_input_project=False,
+        positional_encoding=dict(num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
+            return_intermediate=True,
+            num_layers=9,
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    dropout=0.0,
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    dropout=0.0,
+                    batch_first=True),
+                ffn_cfg=dict(
+                    embed_dims=256,
+                    feedforward_channels=2048,
+                    num_fcs=2,
+                    ffn_drop=0.0,
+                    act_cfg=dict(type='ReLU', inplace=True))),
+            init_cfg=None),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            loss_weight=2.0,
+            reduction='mean',
+            class_weight=[1.0] * num_classes + [0.1]),
+        loss_mask=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='mean',
+            loss_weight=5.0),
+        loss_dice=dict(
+            type='DiceLoss',
+            use_sigmoid=True,
+            activate=True,
+            reduction='mean',
+            naive_dice=True,
+            eps=1.0,
+            loss_weight=5.0)),
+    panoptic_fusion_head=dict(
+        type='MaskFormerFusionHead',
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        loss_panoptic=None,
+        init_cfg=None),
+    train_cfg=dict(
+        num_points=12544,
+        oversample_ratio=3.0,
+        importance_sample_ratio=0.75,
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='ClassificationCost', weight=2.0),
+                dict(
+                    type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True),
+                dict(type='DiceCost', weight=5.0, pred_act=True, eps=1.0)
+            ]),
+        sampler=dict(type='MaskPseudoSampler')),
+    test_cfg=dict(
+        panoptic_on=True,
+        # For now, the dataset does not support
+        # evaluating semantic segmentation metric.
+        semantic_on=False,
+        instance_on=True,
+        # max_per_image is for instance segmentation.
+        max_per_image=100,
+        iou_thr=0.8,
+        # In Mask2Former's panoptic postprocessing,
+        # it will filter mask area where score is less than 0.5 .
+        filter_low_score=True),
+    init_cfg=None)
+
+# dataset settings
+data_root = 'data/coco/'
+train_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        to_float32=True,
+        backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadPanopticAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        with_seg=True,
+        backend_args={{_base_.backend_args}}),
+    dict(type='RandomFlip', prob=0.5),
+    # large scale jittering
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_size=image_size,
+        crop_type='absolute',
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+val_evaluator = [
+    dict(
+        type='CocoPanopticMetric',
+        ann_file=data_root + 'annotations/panoptic_val2017.json',
+        seg_prefix=data_root + 'annotations/panoptic_val2017/',
+        backend_args={{_base_.backend_args}}),
+    dict(
+        type='CocoMetric',
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        metric=['bbox', 'segm'],
+        backend_args={{_base_.backend_args}})
+]
+test_evaluator = val_evaluator
+
+# optimizer
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW',
+        lr=0.0001,
+        weight_decay=0.05,
+        eps=1e-8,
+        betas=(0.9, 0.999)),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+            'query_embed': embed_multi,
+            'query_feat': embed_multi,
+            'level_embed': embed_multi,
+        },
+        norm_decay_mult=0.0),
+    clip_grad=dict(max_norm=0.01, norm_type=2))
+
+# learning policy
+max_iters = 368750
+param_scheduler = dict(
+    type='MultiStepLR',
+    begin=0,
+    end=max_iters,
+    by_epoch=False,
+    milestones=[327778, 355092],
+    gamma=0.1)
+
+# Before 365001th iteration, we do evaluation every 5000 iterations.
+# After 365000th iteration, we do evaluation every 368750 iterations,
+# which means that we do evaluation at the end of training.
+interval = 5000
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+train_cfg = dict(
+    type='IterBasedTrainLoop',
+    max_iters=max_iters,
+    val_interval=interval,
+    dynamic_intervals=dynamic_intervals)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(
+    checkpoint=dict(
+        type='CheckpointHook',
+        by_epoch=False,
+        save_last=True,
+        max_keep_ckpts=3,
+        interval=interval))
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..24a17f58c54a2e8694a8bf960d10ebc918acdddc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py
@@ -0,0 +1,100 @@
+_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco-panoptic.py']
+
+num_things_classes = 80
+num_stuff_classes = 0
+num_classes = num_things_classes + num_stuff_classes
+image_size = (1024, 1024)
+batch_augments = [
+    dict(
+        type='BatchFixedSizePad',
+        size=image_size,
+        img_pad_value=0,
+        pad_mask=True,
+        mask_pad_value=0,
+        pad_seg=False)
+]
+data_preprocessor = dict(
+    type='DetDataPreprocessor',
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True,
+    pad_size_divisor=32,
+    pad_mask=True,
+    mask_pad_value=0,
+    pad_seg=False,
+    batch_augments=batch_augments)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    panoptic_head=dict(
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        loss_cls=dict(class_weight=[1.0] * num_classes + [0.1])),
+    panoptic_fusion_head=dict(
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes),
+    test_cfg=dict(panoptic_on=False))
+
+# dataset settings
+train_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        to_float32=True,
+        backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', prob=0.5),
+    # large scale jittering
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.1, 2.0),
+        resize_type='Resize',
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_size=image_size,
+        crop_type='absolute',
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        to_float32=True,
+        backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+train_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric=['bbox', 'segm'],
+    format_only=False,
+    backend_args={{_base_.backend_args}})
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..b275f23175e8d8294b8bb76e9708dd014ef7030b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py
@@ -0,0 +1,5 @@
+_base_ = ['./mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth'  # noqa
+
+model = dict(
+    backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained)))
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd59400b4aed1aac97795e474633d5581705b899
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py
@@ -0,0 +1,42 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=depths,
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(in_channels=[128, 256, 512, 1024]))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embeding,
+# query_embeding, level_embeding to decay_multi=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optim_wrapper = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..e203ffc96c40098e4cf0788fc47b4438ebffbb41
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py
@@ -0,0 +1,25 @@
+_base_ = ['./mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        embed_dims=192,
+        num_heads=[6, 12, 24, 48],
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(num_queries=200, in_channels=[192, 384, 768, 1536]))
+
+train_dataloader = dict(batch_size=1, num_workers=1)
+
+# learning policy
+max_iters = 737500
+param_scheduler = dict(end=max_iters, milestones=[655556, 710184])
+
+# Before 735001th iteration, we do evaluation every 5000 iterations.
+# After 735000th iteration, we do evaluation every 737500 iterations,
+# which means that we do evaluation at the end of training.'
+interval = 5000
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+train_cfg = dict(
+    max_iters=max_iters,
+    val_interval=interval,
+    dynamic_intervals=dynamic_intervals)
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9d081db58a74dd02b3b715c3777f077d42de7ca
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py
@@ -0,0 +1,37 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        depths=depths, init_cfg=dict(type='Pretrained',
+                                     checkpoint=pretrained)))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embeding,
+# query_embeding, level_embeding to decay_multi=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optim_wrapper = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..69d5e8c6f96434973e3e9f3498155e385af815be
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py
@@ -0,0 +1,37 @@
+_base_ = ['./mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth'  # noqa
+
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        depths=depths, init_cfg=dict(type='Pretrained',
+                                     checkpoint=pretrained)))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embeding,
+# query_embeding, level_embeding to decay_multi=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optim_wrapper = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c00d7a697f07ad618a0b4735432a0a74d4992a9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py
@@ -0,0 +1,58 @@
+_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco-panoptic.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+
+depths = [2, 2, 6, 2]
+model = dict(
+    type='Mask2Former',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=depths,
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        frozen_stages=-1,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(
+        type='Mask2FormerHead', in_channels=[96, 192, 384, 768]),
+    init_cfg=None)
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embeding,
+# query_embeding, level_embeding to decay_multi=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+
+# optimizer
+optim_wrapper = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bb9c21858ebe065691a8a963bf5dec85542fb57
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py
@@ -0,0 +1,56 @@
+_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco.py']
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+depths = [2, 2, 6, 2]
+model = dict(
+    type='Mask2Former',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=depths,
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        frozen_stages=-1,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(
+        type='Mask2FormerHead', in_channels=[96, 192, 384, 768]),
+    init_cfg=None)
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embeding,
+# query_embeding, level_embeding to decay_multi=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optim_wrapper = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/mmde/mmdet/.mim/configs/mask2former/metafile.yml b/mmde/mmdet/.mim/configs/mask2former/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3321239213f7345084b63b77cf02b0525a534585
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former/metafile.yml
@@ -0,0 +1,223 @@
+Collections:
+  - Name: Mask2Former
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - Mask2Former
+    Paper:
+      URL: https://arxiv.org/pdf/2112.01527
+      Title: 'Masked-attention Mask Transformer for Universal Image Segmentation'
+    README: configs/mask2former/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/detectors/mask2former.py#L7
+      Version: v2.23.0
+
+Models:
+- Name: mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 19.1
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 47.8
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 44.5
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 54.5
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic_20220329_225200-4a16ded7.pth
+- Name: mask2former_r101_8xb2-lsj-50e_coco
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco.py
+  Metadata:
+    Training Memory (GB): 15.5
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 46.7
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 44.0
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r101_8xb2-lsj-50e_coco/mask2former_r101_8xb2-lsj-50e_coco_20220426_100250-ecf181e2.pth
+- Name: mask2former_r101_8xb2-lsj-50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 16.1
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 45.3
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 42.4
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 52.4
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic/mask2former_r101_8xb2-lsj-50e_coco-panoptic_20220329_225104-c74d4d71.pth
+- Name: mask2former_r50_8xb2-lsj-50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 13.9
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 44.5
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 41.8
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 52.0
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic/mask2former_r50_8xb2-lsj-50e_coco-panoptic_20230118_125535-54df384a.pth
+- Name: mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 15.9
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 46.3
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 43.4
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 53.4
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic_20220326_224553-3ec9e0ae.pth
+- Name: mask2former_r50_8xb2-lsj-50e_coco
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py
+  Metadata:
+    Training Memory (GB): 13.7
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 45.7
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 42.9
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r50_8xb2-lsj-50e_coco/mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth
+- Name: mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 21.1
+    Iterations: 737500
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 52.2
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 48.5
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 57.6
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic_20220407_104949-82f8d28d.pth
+- Name: mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 25.8
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 50.0
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 46.3
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 56.3
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic_20220329_230021-05ec7315.pth
+- Name: mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py
+  Metadata:
+    Training Memory (GB): 26.0
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 48.2
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 44.9
+  - Task: Panoptic Segmentation
+    Dataset: COCO
+    Metrics:
+      PQ: 55.1
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic_20220331_002244-8a651d82.pth
+- Name: mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py
+  Metadata:
+    Training Memory (GB): 15.3
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 47.7
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 44.7
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco_20220508_091649-01b0f990.pth
+- Name: mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco
+  In Collection: Mask2Former
+  Config: configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py
+  Metadata:
+    Training Memory (GB): 18.8
+    Iterations: 368750
+  Results:
+  - Task: Object Detection
+    Dataset: COCO
+    Metrics:
+      box AP: 49.3
+  - Task: Instance Segmentation
+    Dataset: COCO
+    Metrics:
+      mask AP: 46.1
+  Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco_20220504_001756-c9d0c4f2.pth
diff --git a/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2019.py b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2019.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ba4aea8eac72f347940fb12ac964e9bf67c2e0e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2019.py
@@ -0,0 +1,12 @@
+_base_ = './mask2former_r50_8xb2-8e_youtubevis2019.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    init_cfg=dict(
+        type='Pretrained',
+        checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+        'mask2former/mask2former_r101_8xb2-lsj-50e_coco/'
+        'mask2former_r101_8xb2-lsj-50e_coco_20220426_100250-ecf181e2.pth'))
diff --git a/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021.py b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021.py
new file mode 100644
index 0000000000000000000000000000000000000000..95f9ceeb38833aeef342e12178703db6901fe5f6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021.py
@@ -0,0 +1,12 @@
+_base_ = './mask2former_r50_8xb2-8e_youtubevis2021.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    init_cfg=dict(
+        type='Pretrained',
+        checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+        'mask2former/mask2former_r101_8xb2-lsj-50e_coco/'
+        'mask2former_r101_8xb2-lsj-50e_coco_20220426_100250-ecf181e2.pth'))
diff --git a/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2019.py b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2019.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dc03bf97a2ed2b90e097bbd9637a42bf4d64c35
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2019.py
@@ -0,0 +1,174 @@
+_base_ = ['../_base_/datasets/youtube_vis.py', '../_base_/default_runtime.py']
+
+num_classes = 40
+num_frames = 2
+model = dict(
+    type='Mask2FormerVideo',
+    data_preprocessor=dict(
+        type='TrackDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    track_head=dict(
+        type='Mask2FormerTrackHead',
+        in_channels=[256, 512, 1024, 2048],  # pass to pixel_decoder inside
+        strides=[4, 8, 16, 32],
+        feat_channels=256,
+        out_channels=256,
+        num_classes=num_classes,
+        num_queries=100,
+        num_frames=num_frames,
+        num_transformer_feat_level=3,
+        pixel_decoder=dict(
+            type='MSDeformAttnPixelDecoder',
+            num_outs=3,
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU'),
+            encoder=dict(  # DeformableDetrTransformerEncoder
+                num_layers=6,
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
+                        embed_dims=256,
+                        num_heads=8,
+                        num_levels=3,
+                        num_points=4,
+                        im2col_step=128,
+                        dropout=0.0,
+                        batch_first=True),
+                    ffn_cfg=dict(
+                        embed_dims=256,
+                        feedforward_channels=1024,
+                        num_fcs=2,
+                        ffn_drop=0.0,
+                        act_cfg=dict(type='ReLU', inplace=True)))),
+            positional_encoding=dict(num_feats=128, normalize=True)),
+        enforce_decoder_input_project=False,
+        positional_encoding=dict(
+            type='SinePositionalEncoding3D', num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
+            return_intermediate=True,
+            num_layers=9,
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    dropout=0.0,
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    dropout=0.0,
+                    batch_first=True),
+                ffn_cfg=dict(
+                    embed_dims=256,
+                    feedforward_channels=2048,
+                    num_fcs=2,
+                    ffn_drop=0.0,
+                    act_cfg=dict(type='ReLU', inplace=True))),
+            init_cfg=None),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            loss_weight=2.0,
+            reduction='mean',
+            class_weight=[1.0] * num_classes + [0.1]),
+        loss_mask=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='mean',
+            loss_weight=5.0),
+        loss_dice=dict(
+            type='DiceLoss',
+            use_sigmoid=True,
+            activate=True,
+            reduction='mean',
+            naive_dice=True,
+            eps=1.0,
+            loss_weight=5.0),
+        train_cfg=dict(
+            num_points=12544,
+            oversample_ratio=3.0,
+            importance_sample_ratio=0.75,
+            assigner=dict(
+                type='HungarianAssigner',
+                match_costs=[
+                    dict(type='ClassificationCost', weight=2.0),
+                    dict(
+                        type='CrossEntropyLossCost',
+                        weight=5.0,
+                        use_sigmoid=True),
+                    dict(type='DiceCost', weight=5.0, pred_act=True, eps=1.0)
+                ]),
+            sampler=dict(type='MaskPseudoSampler'))),
+    init_cfg=dict(
+        type='Pretrained',
+        checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+        'mask2former/mask2former_r50_8xb2-lsj-50e_coco/'
+        'mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth'))
+
+# optimizer
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW',
+        lr=0.0001,
+        weight_decay=0.05,
+        eps=1e-8,
+        betas=(0.9, 0.999)),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+            'query_embed': embed_multi,
+            'query_feat': embed_multi,
+            'level_embed': embed_multi,
+        },
+        norm_decay_mult=0.0),
+    clip_grad=dict(max_norm=0.01, norm_type=2))
+
+# learning policy
+max_iters = 6000
+param_scheduler = dict(
+    type='MultiStepLR',
+    begin=0,
+    end=max_iters,
+    by_epoch=False,
+    milestones=[
+        4000,
+    ],
+    gamma=0.1)
+# runtime settings
+train_cfg = dict(
+    type='IterBasedTrainLoop', max_iters=max_iters, val_interval=6001)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+default_hooks = dict(
+    checkpoint=dict(
+        type='CheckpointHook', by_epoch=False, save_last=True, interval=2000),
+    visualization=dict(type='TrackVisualizationHook', draw=False))
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
+
+# evaluator
+val_evaluator = dict(
+    type='YouTubeVISMetric',
+    metric='youtube_vis_ap',
+    outfile_prefix='./youtube_vis_results',
+    format_only=True)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py
new file mode 100644
index 0000000000000000000000000000000000000000..158fe52d20fccf162cb66202fbc9069ba0f4cb68
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py
@@ -0,0 +1,37 @@
+_base_ = './mask2former_r50_8xb2-8e_youtubevis2019.py'
+
+dataset_type = 'YouTubeVISDataset'
+data_root = 'data/youtube_vis_2021/'
+dataset_version = data_root[-5:-1]  # 2019 or 2021
+
+train_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2021_train.json'))
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2021_valid.json'))
+test_dataloader = val_dataloader
+
+# learning policy
+max_iters = 8000
+param_scheduler = dict(
+    type='MultiStepLR',
+    begin=0,
+    end=max_iters,
+    by_epoch=False,
+    milestones=[
+        5500,
+    ],
+    gamma=0.1)
+# runtime settings
+train_cfg = dict(
+    type='IterBasedTrainLoop', max_iters=max_iters, val_interval=8001)
+
+default_hooks = dict(
+    checkpoint=dict(
+        type='CheckpointHook', by_epoch=False, save_last=True, interval=500))
diff --git a/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py
new file mode 100644
index 0000000000000000000000000000000000000000..94dcccf408dfb989ea264536a617a48ecc13171c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py
@@ -0,0 +1,64 @@
+_base_ = ['./mask2former_r50_8xb2-8e_youtubevis2021.py']
+depths = [2, 2, 18, 2]
+model = dict(
+    type='Mask2FormerVideo',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        pretrain_img_size=384,
+        embed_dims=192,
+        depths=depths,
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        frozen_stages=-1,
+        init_cfg=None),
+    track_head=dict(
+        type='Mask2FormerTrackHead',
+        in_channels=[192, 384, 768, 1536],
+        num_queries=200),
+    init_cfg=dict(
+        type='Pretrained',
+        checkpoint=  # noqa: E251
+        'https://download.openmmlab.com/mmdetection/v3.0/mask2former/'
+        'mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic/'
+        'mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic_'
+        '20220407_104949-82f8d28d.pth'))
+
+# set all layers in backbone to lr_mult=0.1
+# set all norm layers, position_embeding,
+# query_embeding, level_embeding to decay_multi=0.0
+backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'absolute_pos_embed': backbone_embed_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+    'query_feat': embed_multi,
+    'level_embed': embed_multi
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optim_wrapper = dict(
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
diff --git a/mmde/mmdet/.mim/configs/mask2former_vis/metafile.yml b/mmde/mmdet/.mim/configs/mask2former_vis/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f5f4bd7c5775820f283a7544bf5978fe0aa1abc5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask2former_vis/metafile.yml
@@ -0,0 +1,53 @@
+Collections:
+  - Name: Mask2Former
+    Metadata:
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - Mask2Former
+    Paper:
+      URL: https://arxiv.org/pdf/2112.10764.pdf
+      Title: Mask2Former for Video Instance Segmentation
+    README: configs/mask2former/README.md
+
+Models:
+  - Name: mask2former_r50_8xb2-8e_youtubevis2021
+    In Collection: Mask2Former
+    Config: configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py
+    Metadata:
+      Training Data: YouTube-VIS 2021
+      Training Memory (GB): 6.0
+    Results:
+      - Task: Video Instance Segmentation
+        Dataset: YouTube-VIS 2021
+        Metrics:
+          AP: 41.3
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021/mask2former_r50_8xb2-8e_youtubevis2021_20230426_131833-5d215283.pth
+
+  - Name: mask2former_r101_8xb2-8e_youtubevis2021
+    In Collection: Mask2Former
+    Config: configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021.py
+    Metadata:
+      Training Data: YouTube-VIS 2021
+      Training Memory (GB): 7.5
+    Results:
+      - Task: Video Instance Segmentation
+        Dataset: YouTube-VIS 2021
+        Metrics:
+          AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021/mask2former_r101_8xb2-8e_youtubevis2021_20220823_092747-8077d115.pth
+
+  - Name: mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py
+    In Collection: Mask2Former
+    Config: configs/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py
+    Metadata:
+      Training Data: YouTube-VIS 2021
+      Training Memory (GB): 18.5
+    Results:
+      - Task: Video Instance Segmentation
+        Dataset: YouTube-VIS 2021
+        Metrics:
+          AP: 52.3
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021_20220907_124752-48252603.pth
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..09808e4bcada43b1e935d5393894c7ba3401fc3d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './mask-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e723aea81ff82dfa842d7468e166f42ee9291669
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,19 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    # use caffe img_norm
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False),
+    backbone=dict(
+        depth=101,
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..af91ff0b8349b0e9e658b69cf4c5dd138b7b8a5a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5599e7c4942b523d6500e2c7c8ad4638cab45c6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..452351050238a4d4411b2bf6fc916e2d69804766
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..384f6dcd3ca33cd91755b48dd525d747a358ee02
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r101_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,10 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b9219c9c1da8ca68cf7ada0881419b371a26a87
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe-c4_1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe-c4_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9919f11c3fc7b68528bf6f690e39185d703aff43
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe-c4_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50-caffe-c4.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4124f138d874def6810cea6c884a02eaacdf5f71
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    # use caffe img_norm
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7702ae14a9cc54686df6a3eadec5bc8cfeb8e0a8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-1x_coco.py
@@ -0,0 +1,28 @@
+_base_ = './mask-rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    # use caffe img_norm
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs'),
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..94d94dd3613e0599f51f113ccf12e568a5b29f8f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py
@@ -0,0 +1,31 @@
+_base_ = './mask-rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    # use caffe img_norm
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbf87bb8346dd351c8f16700df7b9640bcfa984a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py'
+
+train_cfg = dict(max_epochs=24)
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..45260e2e39b53c0107e257ef2d05a14f5d5c0323
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py'
+
+train_cfg = dict(max_epochs=36)
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..3baf00140ecfa57ea54b68b85ac826e14490daa4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py
@@ -0,0 +1,31 @@
+_base_ = './mask-rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    # use caffe img_norm
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    rpn_head=dict(
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        bbox_roi_extractor=dict(
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=7,
+                sampling_ratio=2,
+                aligned=False)),
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+        mask_roi_extractor=dict(
+            roi_layer=dict(
+                type='RoIAlign',
+                output_size=14,
+                sampling_ratio=2,
+                aligned=False))))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_1x-wandb_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_1x-wandb_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..28b125ccb94869aff2bb283e6533fd693c79a76e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_1x-wandb_coco.py
@@ -0,0 +1,16 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]
+visualizer = dict(vis_backends=vis_backends)
+
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# default_hooks = dict(checkpoint=dict(interval=4))
+_base_.default_hooks.checkpoint.interval = 4
+
+# train_cfg = dict(val_interval=2)
+_base_.train_cfg.val_interval = 2
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fc6b91aa895e044b3fc62a3cdedbc12a052e91b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..87cb8b4bb7d2fbfcfe667e7bd6cfc08e01e28c1a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py
@@ -0,0 +1,5 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7371b3646fdda7bdc1fcfcd44cf8a20df27c40b5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,22 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../common/lsj-100e_coco-instance.py'
+]
+image_size = (1024, 1024)
+batch_augments = [
+    dict(type='BatchFixedSizePad', size=image_size, pad_mask=True)
+]
+
+model = dict(data_preprocessor=dict(batch_augments=batch_augments))
+
+train_dataloader = dict(batch_size=8, num_workers=4)
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    optimizer=dict(
+        type='SGD', lr=0.02 * 4, momentum=0.9, weight_decay=0.00004))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_amp-1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_amp-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a139c48b2091a3a40943ce7ec8301b06cea01d4f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_amp-1x_coco.py
@@ -0,0 +1,4 @@
+_base_ = './mask-rcnn_r50_fpn_1x_coco.py'
+
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(type='AmpOptimWrapper')
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..417adc3cebb3acbcc987b3f0453a78204dde1ea9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_poly-1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_poly-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..826180ce0a831a1ee6206bd52ffa516df766136c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_r50_fpn_poly-1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs'),
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..921ade81e30afb60a3a6f03d2f2aecef85767da8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask-rcnn_r101_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..db8157f80fac23f6216afbeefed6cb80398f7e0d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask-rcnn_r101_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..83e5451f38cb01d3d30712f22633fed6234d06c9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e9b1b6fe8fcb152d9ad22bc403da6e62e936f77
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_1x_coco.py
@@ -0,0 +1,22 @@
+_base_ = './mask-rcnn_r101_fpn_1x_coco.py'
+
+model = dict(
+    # ResNeXt-101-32x8d model trained with Caffe2 at FB,
+    # so the mean and std need to be changed.
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[57.375, 57.120, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ee204d90001edd3e8e08e4a59ba25dd1ec4195c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py
@@ -0,0 +1,40 @@
+_base_ = './mask-rcnn_r101_fpn_1x_coco.py'
+
+model = dict(
+    # ResNeXt-101-32x8d model trained with Caffe2 at FB,
+    # so the mean and std need to be changed.
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[57.375, 57.120, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs'),
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..999a30c39fc083f26fe0cd9e2ec13bb4f6063268
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,25 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    # ResNeXt-101-32x8d model trained with Caffe2 at FB,
+    # so the mean and std need to be changed.
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[57.375, 57.120, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=8,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnext101_32x8d')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cbb658c1b053d6674694c1a09101e965d5724ba
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask-rcnn_x101-32x4d_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f21a55b00db77a3cf2386a738a3b8fb39bf2fa44
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './mask-rcnn_x101-32x4d_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..09b49d47740b70c4a192d94a95b994d0a303f2d1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/mask_rcnn/metafile.yml b/mmde/mmdet/.mim/configs/mask_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ddf85c872bc8681a849c59c917a4b5ca0151d21a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mask_rcnn/metafile.yml
@@ -0,0 +1,443 @@
+Collections:
+  - Name: Mask R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Softmax
+        - RPN
+        - Convolution
+        - Dense Connections
+        - FPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/1703.06870v3
+      Title: "Mask R-CNN"
+    README: configs/mask_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/mask_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: mask-rcnn_r50-caffe_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.38__segm_mAP-0.344_20200504_231812-0ebd1859.pth
+
+  - Name: mask-rcnn_r50_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 62.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth
+
+  - Name: mask-rcnn_r50_fpn_fp16_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r50_fpn_amp-1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.6
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      inference time (ms/im):
+        - value: 41.49
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP16
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205-59faf7e4.pth
+
+  - Name: mask-rcnn_r50_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      inference time (ms/im):
+        - value: 62.11
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 35.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth
+
+  - Name: mask-rcnn_r101-caffe_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758-805e06c1.pth
+
+  - Name: mask-rcnn_r101_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 74.07
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth
+
+  - Name: mask-rcnn_r101_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      inference time (ms/im):
+        - value: 74.07
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_bbox_mAP-0.408__segm_mAP-0.366_20200505_071027-14b391c7.pth
+
+  - Name: mask-rcnn_x101-32x4d_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth
+
+  - Name: mask-rcnn_x101-32x4d_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      inference time (ms/im):
+        - value: 88.5
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.422__segm_mAP-0.378_20200506_004702-faef898c.pth
+
+  - Name: mask-rcnn_x101-64x4d_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      inference time (ms/im):
+        - value: 125
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth
+
+  - Name: mask-rcnn_x101-64x4d_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.7
+      inference time (ms/im):
+        - value: 125
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208-39d6f70c.pth
+
+  - Name: mask-rcnn_x101-32x8d_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.6
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 42.8
+    - Task: Instance Segmentation
+      Dataset: COCO
+      Metrics:
+        mask AP: 38.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco/mask_rcnn_x101_32x8d_fpn_1x_coco_20220630_173841-0aaf329e.pth
+
+  - Name: mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_bbox_mAP-0.403__segm_mAP-0.365_20200504_231822-a75c98ce.pth
+
+  - Name: mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth
+
+  - Name: mask-rcnn_r50_fpn_mstrain-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r50_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth
+
+  - Name: mask-rcnn_r101_fpn_ms-poly-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r101_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.1
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244-5675c317.pth
+
+  - Name: mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339-3c33ce02.pth
+
+  - Name: mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.3
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410-abcd7859.pth
+
+  - Name: mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.4
+      Epochs: 12
+    Results:
+    - Task: Object Detection
+      Dataset: COCO
+      Metrics:
+        box AP: 43.4
+    - Task: Instance Segmentation
+      Dataset: COCO
+      Metrics:
+        mask AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco_20220630_170346-b4637974.pth
+
+  - Name: mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 10.3
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042-8bd2c639.pth
+
+  - Name: mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py
+    Metadata:
+      Epochs: 36
+      Training Memory (GB): 10.4
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447-c376f129.pth
diff --git a/mmde/mmdet/.mim/configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py b/mmde/mmdet/.mim/configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..784ee7767bf1318e967444461028b49a38dc3dbc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py
@@ -0,0 +1,216 @@
+_base_ = [
+    '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py'
+]
+
+data_preprocessor = dict(
+    type='DetDataPreprocessor',
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True,
+    pad_size_divisor=1,
+    pad_mask=True,
+    mask_pad_value=0,
+    pad_seg=True,
+    seg_pad_value=255)
+
+num_things_classes = 80
+num_stuff_classes = 53
+num_classes = num_things_classes + num_stuff_classes
+model = dict(
+    type='MaskFormer',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    panoptic_head=dict(
+        type='MaskFormerHead',
+        in_channels=[256, 512, 1024, 2048],  # pass to pixel_decoder inside
+        feat_channels=256,
+        out_channels=256,
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        num_queries=100,
+        pixel_decoder=dict(
+            type='TransformerEncoderPixelDecoder',
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU'),
+            encoder=dict(  # DetrTransformerEncoder
+                num_layers=6,
+                layer_cfg=dict(  # DetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiheadAttention
+                        embed_dims=256,
+                        num_heads=8,
+                        dropout=0.1,
+                        batch_first=True),
+                    ffn_cfg=dict(
+                        embed_dims=256,
+                        feedforward_channels=2048,
+                        num_fcs=2,
+                        ffn_drop=0.1,
+                        act_cfg=dict(type='ReLU', inplace=True)))),
+            positional_encoding=dict(num_feats=128, normalize=True)),
+        enforce_decoder_input_project=False,
+        positional_encoding=dict(num_feats=128, normalize=True),
+        transformer_decoder=dict(  # DetrTransformerDecoder
+            num_layers=6,
+            layer_cfg=dict(  # DetrTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    dropout=0.1,
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    dropout=0.1,
+                    batch_first=True),
+                ffn_cfg=dict(
+                    embed_dims=256,
+                    feedforward_channels=2048,
+                    num_fcs=2,
+                    ffn_drop=0.1,
+                    act_cfg=dict(type='ReLU', inplace=True))),
+            return_intermediate=True),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            loss_weight=1.0,
+            reduction='mean',
+            class_weight=[1.0] * num_classes + [0.1]),
+        loss_mask=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            reduction='mean',
+            loss_weight=20.0),
+        loss_dice=dict(
+            type='DiceLoss',
+            use_sigmoid=True,
+            activate=True,
+            reduction='mean',
+            naive_dice=True,
+            eps=1.0,
+            loss_weight=1.0)),
+    panoptic_fusion_head=dict(
+        type='MaskFormerFusionHead',
+        num_things_classes=num_things_classes,
+        num_stuff_classes=num_stuff_classes,
+        loss_panoptic=None,
+        init_cfg=None),
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='ClassificationCost', weight=1.0),
+                dict(type='FocalLossCost', weight=20.0, binary_input=True),
+                dict(type='DiceCost', weight=1.0, pred_act=True, eps=1.0)
+            ]),
+        sampler=dict(type='MaskPseudoSampler')),
+    test_cfg=dict(
+        panoptic_on=True,
+        # For now, the dataset does not support
+        # evaluating semantic segmentation metric.
+        semantic_on=False,
+        instance_on=False,
+        # max_per_image is for instance segmentation.
+        max_per_image=100,
+        object_mask_thr=0.8,
+        iou_thr=0.8,
+        # In MaskFormer's panoptic postprocessing,
+        # it will not filter masks whose score is smaller than 0.5 .
+        filter_low_score=False),
+    init_cfg=None)
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadPanopticAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        with_seg=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[[
+            dict(
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
+                keep_ratio=True)
+        ],
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(
+    batch_size=1, num_workers=1, dataset=dict(pipeline=train_pipeline))
+
+val_dataloader = dict(batch_size=1, num_workers=1)
+
+test_dataloader = val_dataloader
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW',
+        lr=0.0001,
+        weight_decay=0.0001,
+        eps=1e-8,
+        betas=(0.9, 0.999)),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+            'query_embed': dict(lr_mult=1.0, decay_mult=0.0)
+        },
+        norm_decay_mult=0.0),
+    clip_grad=dict(max_norm=0.01, norm_type=2))
+
+max_epochs = 75
+
+# learning rate
+param_scheduler = dict(
+    type='MultiStepLR',
+    begin=0,
+    end=max_epochs,
+    by_epoch=True,
+    milestones=[50],
+    gamma=0.1)
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (16 GPUs) x (1 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py b/mmde/mmdet/.mim/configs/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e4897f26d47c049f8791169867c2df307b87f61
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py
@@ -0,0 +1,73 @@
+_base_ = './maskformer_r50_ms-16xb1-75e_coco.py'
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        pretrain_img_size=384,
+        embed_dims=192,
+        patch_size=4,
+        window_size=12,
+        mlp_ratio=4,
+        depths=depths,
+        num_heads=[6, 12, 24, 48],
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    panoptic_head=dict(
+        in_channels=[192, 384, 768, 1536],  # pass to pixel_decoder inside
+        pixel_decoder=dict(
+            _delete_=True,
+            type='PixelDecoder',
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU')),
+        enforce_decoder_input_project=True))
+
+# optimizer
+
+# weight_decay = 0.01
+# norm_weight_decay = 0.0
+# embed_weight_decay = 0.0
+embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+norm_multi = dict(lr_mult=1.0, decay_mult=0.0)
+custom_keys = {
+    'norm': norm_multi,
+    'absolute_pos_embed': embed_multi,
+    'relative_position_bias_table': embed_multi,
+    'query_embed': embed_multi
+}
+
+optim_wrapper = dict(
+    optimizer=dict(lr=6e-5, weight_decay=0.01),
+    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
+
+max_epochs = 300
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[250],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=max_epochs)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (64 GPUs) x (1 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/maskformer/metafile.yml b/mmde/mmdet/.mim/configs/maskformer/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..fa58269d51c3e936f6acfaa664766afb84e7e0b6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/maskformer/metafile.yml
@@ -0,0 +1,43 @@
+Collections:
+  - Name: MaskFormer
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 16x V100 GPUs
+      Architecture:
+        - MaskFormer
+    Paper:
+      URL: https://arxiv.org/pdf/2107.06278
+      Title: 'Per-Pixel Classification is Not All You Need for Semantic Segmentation'
+    README: configs/maskformer/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/detectors/maskformer.py#L7
+      Version: v2.22.0
+
+Models:
+  - Name: maskformer_r50_ms-16xb1-75e_coco
+    In Collection: MaskFormer
+    Config: configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py
+    Metadata:
+      Training Memory (GB): 16.2
+      Epochs: 75
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 46.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/maskformer/maskformer_r50_ms-16xb1-75e_coco/maskformer_r50_ms-16xb1-75e_coco_20230116_095226-baacd858.pth
+  - Name: maskformer_swin-l-p4-w12_64xb1-ms-300e_coco
+    In Collection: MaskFormer
+    Config: configs/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py
+    Metadata:
+      Training Memory (GB): 27.2
+      Epochs: 300
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 53.2
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco_20220326_221612-c63ab967.pth
diff --git a/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py
new file mode 100644
index 0000000000000000000000000000000000000000..4be492d5419b8598120faa29eed44eada0fb5ba2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py
@@ -0,0 +1,12 @@
+_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py']
+model = dict(
+    detector=dict(
+        backbone=dict(
+            depth=101,
+            init_cfg=dict(
+                type='Pretrained', checkpoint='torchvision://resnet101')),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=  # noqa: E251
+            'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth'  # noqa: E501
+        )))
diff --git a/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py
new file mode 100644
index 0000000000000000000000000000000000000000..81bae4af8d8945a024cd498a001e52059741f8a9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py
@@ -0,0 +1,28 @@
+_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py']
+model = dict(
+    detector=dict(
+        backbone=dict(
+            depth=101,
+            init_cfg=dict(
+                type='Pretrained', checkpoint='torchvision://resnet101')),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=  # noqa: E251
+            'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth'  # noqa: E501
+        )))
+
+data_root = 'data/youtube_vis_2021/'
+dataset_version = data_root[-5:-1]
+
+# dataloader
+train_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2021_train.json'))
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2021_valid.json'))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py
new file mode 100644
index 0000000000000000000000000000000000000000..db1be7b0ddf00a07ce6e06e4e179059e68c103a3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py
@@ -0,0 +1,130 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/youtube_vis.py', '../_base_/default_runtime.py'
+]
+
+detector = _base_.model
+detector.pop('data_preprocessor')
+detector.roi_head.bbox_head.update(dict(num_classes=40))
+detector.roi_head.mask_head.update(dict(num_classes=40))
+detector.train_cfg.rpn.sampler.update(dict(num=64))
+detector.train_cfg.rpn_proposal.update(dict(nms_pre=200, max_per_img=200))
+detector.train_cfg.rcnn.sampler.update(dict(num=128))
+detector.test_cfg.rpn.update(dict(nms_pre=200, max_per_img=200))
+detector.test_cfg.rcnn.update(dict(score_thr=0.01))
+detector['init_cfg'] = dict(
+    type='Pretrained',
+    checkpoint=  # noqa: E251
+    'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth'  # noqa: E501
+)
+del _base_.model
+
+model = dict(
+    type='MaskTrackRCNN',
+    data_preprocessor=dict(
+        type='TrackDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=True,
+        pad_size_divisor=32),
+    detector=detector,
+    track_head=dict(
+        type='RoITrackHead',
+        roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        embed_head=dict(
+            type='RoIEmbedHead',
+            num_fcs=2,
+            roi_feat_size=7,
+            in_channels=256,
+            fc_out_channels=1024),
+        train_cfg=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=128,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    tracker=dict(
+        type='MaskTrackRCNNTracker',
+        match_weights=dict(det_score=1.0, iou=2.0, det_label=10.0),
+        num_frames_retain=20))
+
+dataset_type = 'YouTubeVISDataset'
+data_root = 'data/youtube_vis_2019/'
+dataset_version = data_root[-5:-1]  # 2019 or 2021
+
+# train_dataloader
+train_dataloader = dict(
+    _delete_=True,
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='TrackImgSampler'),  # image-based sampling
+    batch_sampler=dict(type='TrackAspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2019_train.json',
+        data_prefix=dict(img_path='train/JPEGImages'),
+        pipeline=_base_.train_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.00125, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3.0,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# visualizer
+default_hooks = dict(
+    visualization=dict(type='TrackVisualizationHook', draw=False))
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# runtime settings
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=13)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# evaluator
+val_evaluator = dict(
+    type='YouTubeVISMetric',
+    metric='youtube_vis_ap',
+    outfile_prefix='./youtube_vis_results',
+    format_only=True)
+test_evaluator = val_evaluator
+
+del detector
diff --git a/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py
new file mode 100644
index 0000000000000000000000000000000000000000..47263d5091c3b5b76056373558ce9a0a97bb071b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py
@@ -0,0 +1,17 @@
+_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py']
+
+data_root = 'data/youtube_vis_2021/'
+dataset_version = data_root[-5:-1]
+
+# dataloader
+train_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2021_train.json'))
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2021_valid.json'))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7e3f11e13a3a20ba8e4311963db558a9e4fd247
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py
@@ -0,0 +1,16 @@
+_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py']
+model = dict(
+    detector=dict(
+        backbone=dict(
+            type='ResNeXt',
+            depth=101,
+            groups=64,
+            base_width=4,
+            init_cfg=dict(
+                type='Pretrained',
+                checkpoint='open-mmlab://resnext101_64x4d')),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=  # noqa: E251
+            'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth'  # noqa: E501
+        )))
diff --git a/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea4c8b92483292cc7de1b2f321d4d514427f3cb5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py
@@ -0,0 +1,32 @@
+_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py']
+model = dict(
+    detector=dict(
+        backbone=dict(
+            type='ResNeXt',
+            depth=101,
+            groups=64,
+            base_width=4,
+            init_cfg=dict(
+                type='Pretrained',
+                checkpoint='open-mmlab://resnext101_64x4d')),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=  # noqa: E251
+            'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth'  # noqa: E501
+        )))
+
+data_root = 'data/youtube_vis_2021/'
+dataset_version = data_root[-5:-1]
+
+# dataloader
+train_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2021_train.json'))
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        dataset_version=dataset_version,
+        ann_file='annotations/youtube_vis_2021_valid.json'))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/masktrack_rcnn/metafile.yml b/mmde/mmdet/.mim/configs/masktrack_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7a1d71d582dc31f3c05f721c6ea8a225d0e0ce33
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/masktrack_rcnn/metafile.yml
@@ -0,0 +1,91 @@
+Collections:
+  - Name: MaskTrack R-CNN
+    Metadata:
+      Training Techniques:
+        - SGD with Momentum
+      Training Resources: 8x TiTanXP GPUs
+      Architecture:
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/pdf/1905.04804.pdf
+      Title: Video Instance Segmentation
+    README: configs/masktrack_rcnn/README.md
+
+Models:
+  - Name: masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019
+    In Collection: MaskTrack R-CNN
+    Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py
+    Metadata:
+      Training Data: YouTube-VIS 2019
+      Training Memory (GB): 1.16
+    Results:
+      - Task: Video Instance Segmentation
+        Dataset: YouTube-VIS 2019
+        Metrics:
+          AP: 30.2
+    Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth
+
+  - Name: masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019
+    In Collection: MaskTrack R-CNN
+    Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py
+    Metadata:
+      Training Data: YouTube-VIS 2019
+      Training Memory (GB): 2.27
+    Results:
+      - Task: Video Instance Segmentation
+        Dataset: YouTube-VIS 2019
+        Metrics:
+          AP: 32.2
+    Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019/masktrack_rcnn_r101_fpn_12e_youtubevis2019_20211023_150038-454dc48b.pth
+
+  - Name: masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019
+    In Collection: MaskTrack R-CNN
+    Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py
+    Metadata:
+      Training Data: YouTube-VIS 2019
+      Training Memory (GB): 3.69
+    Results:
+      - Task: Video Instance Segmentation
+        Dataset: YouTube-VIS 2019
+        Metrics:
+          AP: 34.7
+    Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019/masktrack_rcnn_x101_fpn_12e_youtubevis2019_20211023_153205-fff7a102.pth
+
+  - Name: masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021
+    In Collection: MaskTrack R-CNN
+    Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py
+    Metadata:
+      Training Data: YouTube-VIS 2021
+      Training Memory (GB): 1.16
+    Results:
+      - Task: Video Instance Segmentation
+        Dataset: YouTube-VIS 2021
+        Metrics:
+          AP: 28.7
+    Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021/masktrack_rcnn_r50_fpn_12e_youtubevis2021_20211026_044948-10da90d9.pth
+
+  - Name: masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021
+    In Collection: MaskTrack R-CNN
+    Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py
+    Metadata:
+      Training Data: YouTube-VIS 2021
+      Training Memory (GB): 2.27
+    Results:
+      - Task: Video Instance Segmentation
+        Dataset: YouTube-VIS 2021
+        Metrics:
+          AP: 31.3
+    Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021/masktrack_rcnn_r101_fpn_12e_youtubevis2021_20211026_045509-3c49e4f3.pth
+
+  - Name: masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021
+    In Collection: MaskTrack R-CNN
+    Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py
+    Metadata:
+      Training Data: YouTube-VIS 2021
+      Training Memory (GB): 3.69
+    Results:
+      - Task: Video Instance Segmentation
+        Dataset: YouTube-VIS 2021
+        Metrics:
+          AP: 33.5
+    Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021/masktrack_rcnn_x101_fpn_12e_youtubevis2021_20211026_095943-90831df4.pth
diff --git a/mmde/mmdet/.mim/configs/misc/d2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py b/mmde/mmdet/.mim/configs/misc/d2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d93e1562606b3d6bd657454c99220d329c526f30
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/misc/d2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py
@@ -0,0 +1,75 @@
+_base_ = '../common/ms-90k_coco.py'
+
+# model settings
+model = dict(
+    type='Detectron2Wrapper',
+    bgr_to_rgb=False,
+    detector=dict(
+        # The settings in `d2_detector` will merged into default settings
+        # in detectron2. More details please refer to
+        # https://github.com/facebookresearch/detectron2/blob/main/detectron2/config/defaults.py    # noqa
+        meta_architecture='GeneralizedRCNN',
+        # If you want to finetune the detector, you can use the
+        # checkpoint released by detectron2, for example:
+        # weights='detectron2://COCO-Detection/faster_rcnn_R_50_FPN_1x/137257794/model_final_b275ba.pkl'     # noqa
+        weights='detectron2://ImageNetPretrained/MSRA/R-50.pkl',
+        mask_on=False,
+        pixel_mean=[103.530, 116.280, 123.675],
+        pixel_std=[1.0, 1.0, 1.0],
+        backbone=dict(name='build_resnet_fpn_backbone', freeze_at=2),
+        resnets=dict(
+            depth=50,
+            out_features=['res2', 'res3', 'res4', 'res5'],
+            num_groups=1,
+            norm='FrozenBN'),
+        fpn=dict(
+            in_features=['res2', 'res3', 'res4', 'res5'], out_channels=256),
+        anchor_generator=dict(
+            name='DefaultAnchorGenerator',
+            sizes=[[32], [64], [128], [256], [512]],
+            aspect_ratios=[[0.5, 1.0, 2.0]],
+            angles=[[-90, 0, 90]]),
+        proposal_generator=dict(name='RPN'),
+        rpn=dict(
+            head_name='StandardRPNHead',
+            in_features=['p2', 'p3', 'p4', 'p5', 'p6'],
+            iou_thresholds=[0.3, 0.7],
+            iou_labels=[0, -1, 1],
+            batch_size_per_image=256,
+            positive_fraction=0.5,
+            bbox_reg_loss_type='smooth_l1',
+            bbox_reg_loss_weight=1.0,
+            bbox_reg_weights=(1.0, 1.0, 1.0, 1.0),
+            smooth_l1_beta=0.0,
+            loss_weight=1.0,
+            boundary_thresh=-1,
+            pre_nms_topk_train=2000,
+            post_nms_topk_train=1000,
+            pre_nms_topk_test=1000,
+            post_nms_topk_test=1000,
+            nms_thresh=0.7,
+            conv_dims=[-1]),
+        roi_heads=dict(
+            name='StandardROIHeads',
+            num_classes=80,
+            in_features=['p2', 'p3', 'p4', 'p5'],
+            iou_thresholds=[0.5],
+            iou_labels=[0, 1],
+            batch_size_per_image=512,
+            positive_fraction=0.25,
+            score_thresh_test=0.05,
+            nms_thresh_test=0.5,
+            proposal_append_gt=True),
+        roi_box_head=dict(
+            name='FastRCNNConvFCHead',
+            num_fc=2,
+            fc_dim=1024,
+            conv_dim=256,
+            pooler_type='ROIAlignV2',
+            pooler_resolution=7,
+            pooler_sampling_ratio=0,
+            bbox_reg_loss_type='smooth_l1',
+            bbox_reg_loss_weight=1.0,
+            bbox_reg_weights=(10.0, 10.0, 5.0, 5.0),
+            smooth_l1_beta=0.0,
+            cls_agnostic_bbox_reg=False)))
diff --git a/mmde/mmdet/.mim/configs/misc/d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py b/mmde/mmdet/.mim/configs/misc/d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0919c4593f028445dc033e85314320f88409a54
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/misc/d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py
@@ -0,0 +1,83 @@
+_base_ = '../common/ms-poly-90k_coco-instance.py'
+
+# model settings
+model = dict(
+    type='Detectron2Wrapper',
+    bgr_to_rgb=False,
+    detector=dict(
+        # The settings in `d2_detector` will merged into default settings
+        # in detectron2. More details please refer to
+        # https://github.com/facebookresearch/detectron2/blob/main/detectron2/config/defaults.py    # noqa
+        meta_architecture='GeneralizedRCNN',
+        # If you want to finetune the detector, you can use the
+        # checkpoint released by detectron2, for example:
+        # weights='detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl'  # noqa
+        weights='detectron2://ImageNetPretrained/MSRA/R-50.pkl',
+        mask_on=True,
+        pixel_mean=[103.530, 116.280, 123.675],
+        pixel_std=[1.0, 1.0, 1.0],
+        backbone=dict(name='build_resnet_fpn_backbone', freeze_at=2),
+        resnets=dict(
+            depth=50,
+            out_features=['res2', 'res3', 'res4', 'res5'],
+            num_groups=1,
+            norm='FrozenBN'),
+        fpn=dict(
+            in_features=['res2', 'res3', 'res4', 'res5'], out_channels=256),
+        anchor_generator=dict(
+            name='DefaultAnchorGenerator',
+            sizes=[[32], [64], [128], [256], [512]],
+            aspect_ratios=[[0.5, 1.0, 2.0]],
+            angles=[[-90, 0, 90]]),
+        proposal_generator=dict(name='RPN'),
+        rpn=dict(
+            head_name='StandardRPNHead',
+            in_features=['p2', 'p3', 'p4', 'p5', 'p6'],
+            iou_thresholds=[0.3, 0.7],
+            iou_labels=[0, -1, 1],
+            batch_size_per_image=256,
+            positive_fraction=0.5,
+            bbox_reg_loss_type='smooth_l1',
+            bbox_reg_loss_weight=1.0,
+            bbox_reg_weights=(1.0, 1.0, 1.0, 1.0),
+            smooth_l1_beta=0.0,
+            loss_weight=1.0,
+            boundary_thresh=-1,
+            pre_nms_topk_train=2000,
+            post_nms_topk_train=1000,
+            pre_nms_topk_test=1000,
+            post_nms_topk_test=1000,
+            nms_thresh=0.7,
+            conv_dims=[-1]),
+        roi_heads=dict(
+            name='StandardROIHeads',
+            num_classes=80,
+            in_features=['p2', 'p3', 'p4', 'p5'],
+            iou_thresholds=[0.5],
+            iou_labels=[0, 1],
+            batch_size_per_image=512,
+            positive_fraction=0.25,
+            score_thresh_test=0.05,
+            nms_thresh_test=0.5,
+            proposal_append_gt=True),
+        roi_box_head=dict(
+            name='FastRCNNConvFCHead',
+            num_fc=2,
+            fc_dim=1024,
+            conv_dim=256,
+            pooler_type='ROIAlignV2',
+            pooler_resolution=7,
+            pooler_sampling_ratio=0,
+            bbox_reg_loss_type='smooth_l1',
+            bbox_reg_loss_weight=1.0,
+            bbox_reg_weights=(10.0, 10.0, 5.0, 5.0),
+            smooth_l1_beta=0.0,
+            cls_agnostic_bbox_reg=False),
+        roi_mask_head=dict(
+            name='MaskRCNNConvUpsampleHead',
+            conv_dim=256,
+            num_conv=4,
+            pooler_type='ROIAlignV2',
+            pooler_resolution=14,
+            pooler_sampling_ratio=0,
+            cls_agnostic_mask=False)))
diff --git a/mmde/mmdet/.mim/configs/misc/d2_retinanet_r50-caffe_fpn_ms-90k_coco.py b/mmde/mmdet/.mim/configs/misc/d2_retinanet_r50-caffe_fpn_ms-90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3f7587648bde1d15b5c3c1e1ace6c35bb7c20b0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/misc/d2_retinanet_r50-caffe_fpn_ms-90k_coco.py
@@ -0,0 +1,48 @@
+_base_ = '../common/ms-90k_coco.py'
+
+# model settings
+model = dict(
+    type='Detectron2Wrapper',
+    bgr_to_rgb=False,
+    detector=dict(
+        # The settings in `d2_detector` will merged into default settings
+        # in detectron2. More details please refer to
+        # https://github.com/facebookresearch/detectron2/blob/main/detectron2/config/defaults.py    # noqa
+        meta_architecture='RetinaNet',
+        # If you want to finetune the detector, you can use the
+        # checkpoint released by detectron2, for example:
+        # weights='detectron2://COCO-Detection/retinanet_R_50_FPN_1x/190397773/model_final_bfca0b.pkl'     # noqa
+        weights='detectron2://ImageNetPretrained/MSRA/R-50.pkl',
+        mask_on=False,
+        pixel_mean=[103.530, 116.280, 123.675],
+        pixel_std=[1.0, 1.0, 1.0],
+        backbone=dict(name='build_retinanet_resnet_fpn_backbone', freeze_at=2),
+        resnets=dict(
+            depth=50,
+            out_features=['res3', 'res4', 'res5'],
+            num_groups=1,
+            norm='FrozenBN'),
+        fpn=dict(in_features=['res3', 'res4', 'res5'], out_channels=256),
+        anchor_generator=dict(
+            name='DefaultAnchorGenerator',
+            sizes=[[x, x * 2**(1.0 / 3), x * 2**(2.0 / 3)]
+                   for x in [32, 64, 128, 256, 512]],
+            aspect_ratios=[[0.5, 1.0, 2.0]],
+            angles=[[-90, 0, 90]]),
+        retinanet=dict(
+            num_classes=80,
+            in_features=['p3', 'p4', 'p5', 'p6', 'p7'],
+            num_convs=4,
+            iou_thresholds=[0.4, 0.5],
+            iou_labels=[0, -1, 1],
+            bbox_reg_weights=(1.0, 1.0, 1.0, 1.0),
+            bbox_reg_loss_type='smooth_l1',
+            smooth_l1_loss_beta=0.0,
+            focal_loss_gamma=2.0,
+            focal_loss_alpha=0.25,
+            prior_prob=0.01,
+            score_thresh_test=0.05,
+            topk_candidates_test=1000,
+            nms_thresh_test=0.5)))
+
+optim_wrapper = dict(optimizer=dict(lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/brain_tumor/grounding_dino_swin-t_finetune_8xb4_50e_brain_tumor.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/brain_tumor/grounding_dino_swin-t_finetune_8xb4_50e_brain_tumor.py
new file mode 100644
index 0000000000000000000000000000000000000000..1172da5b64102413eec11f223f467ad4c03a7cdf
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/brain_tumor/grounding_dino_swin-t_finetune_8xb4_50e_brain_tumor.py
@@ -0,0 +1,112 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+# https://universe.roboflow.com/roboflow-100/brain-tumor-m2pbp/dataset/2
+data_root = 'data/brain_tumor_v2/'
+class_name = ('label0', 'label1', 'label2')
+label_name = '_annotations.coco.json'
+
+palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142)]
+
+metainfo = dict(classes=class_name, palette=palette)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+train_dataloader = dict(
+    sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=10,
+        dataset=dict(
+            type='CocoDataset',
+            data_root=data_root,
+            metainfo=metainfo,
+            filter_cfg=dict(filter_empty_gt=False, min_size=32),
+            pipeline=train_pipeline,
+            return_classes=True,
+            data_prefix=dict(img='train/'),
+            ann_file='train/' + label_name)))
+
+val_dataloader = dict(
+    dataset=dict(
+        metainfo=metainfo,
+        data_root=data_root,
+        return_classes=True,
+        ann_file='valid/' + label_name,
+        data_prefix=dict(img='valid/')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'valid/' + label_name,
+    metric='bbox',
+    format_only=False)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={
+        'absolute_pos_embed': dict(decay_mult=0.),
+        'backbone': dict(lr_mult=0.1)
+    }))
+
+# learning policy
+max_epochs = 5
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[4],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/cityscapes/grounding_dino_swin-t_finetune_8xb4_50e_cityscapes.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/cityscapes/grounding_dino_swin-t_finetune_8xb4_50e_cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4283413c4ba0c060144d7fb85f7d064a60577c7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/cityscapes/grounding_dino_swin-t_finetune_8xb4_50e_cityscapes.py
@@ -0,0 +1,110 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/cityscapes/'
+class_name = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
+              'bicycle')
+palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), (0, 60, 100),
+           (0, 80, 100), (0, 0, 230), (119, 11, 32)]
+
+metainfo = dict(classes=class_name, palette=palette)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+train_dataloader = dict(
+    sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=10,
+        dataset=dict(
+            type='CocoDataset',
+            data_root=data_root,
+            metainfo=metainfo,
+            filter_cfg=dict(filter_empty_gt=False, min_size=32),
+            pipeline=train_pipeline,
+            return_classes=True,
+            data_prefix=dict(img='leftImg8bit/train/'),
+            ann_file='annotations/instancesonly_filtered_gtFine_train.json')))
+
+val_dataloader = dict(
+    dataset=dict(
+        metainfo=metainfo,
+        data_root=data_root,
+        return_classes=True,
+        ann_file='annotations/instancesonly_filtered_gtFine_val.json',
+        data_prefix=dict(img='leftImg8bit/val/')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instancesonly_filtered_gtFine_val.json',
+    metric='bbox',
+    format_only=False)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={
+        'absolute_pos_embed': dict(decay_mult=0.),
+        'backbone': dict(lr_mult=0.1)
+    }))
+
+# learning policy
+max_epochs = 5
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[4],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..792297accd302d390f865bee294b1294863d6ac1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco.py
@@ -0,0 +1,85 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='CocoDataset',
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        return_classes=True,
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        pipeline=train_pipeline))
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            'language_model': dict(lr_mult=0.1),
+        }))
+
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco_48_17.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco_48_17.py
new file mode 100644
index 0000000000000000000000000000000000000000..e68afbb43286af24612321129042e7d0e0f34b29
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco_48_17.py
@@ -0,0 +1,157 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+base_classes = ('person', 'bicycle', 'car', 'motorcycle', 'train', 'truck',
+                'boat', 'bench', 'bird', 'horse', 'sheep', 'bear', 'zebra',
+                'giraffe', 'backpack', 'handbag', 'suitcase', 'frisbee',
+                'skis', 'kite', 'surfboard', 'bottle', 'fork', 'spoon', 'bowl',
+                'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+                'pizza', 'donut', 'chair', 'bed', 'toilet', 'tv', 'laptop',
+                'mouse', 'remote', 'microwave', 'oven', 'toaster',
+                'refrigerator', 'book', 'clock', 'vase', 'toothbrush')  # 48
+novel_classes = ('airplane', 'bus', 'cat', 'dog', 'cow', 'elephant',
+                 'umbrella', 'tie', 'snowboard', 'skateboard', 'cup', 'knife',
+                 'cake', 'couch', 'keyboard', 'sink', 'scissors')  # 17
+all_classes = (
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+    'truck', 'boat', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'kite', 'skateboard',
+    'surfboard', 'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
+    'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza', 'donut',
+    'cake', 'chair', 'couch', 'bed', 'toilet', 'tv', 'laptop', 'mouse',
+    'remote', 'keyboard', 'microwave', 'oven', 'toaster', 'sink',
+    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'toothbrush')  # 65
+
+train_metainfo = dict(classes=base_classes)
+test_metainfo = dict(
+    classes=all_classes,
+    base_classes=base_classes,
+    novel_classes=novel_classes)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='CocoDataset',
+        metainfo=train_metainfo,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017_seen_2.json',
+        data_prefix=dict(img='train2017/'),
+        return_classes=True,
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type='CocoDataset',
+        metainfo=test_metainfo,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017_all_2.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        return_classes=True,
+    ))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='OVCocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017_all_2.json',
+    metric='bbox',
+    format_only=False)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.00005, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            # 'language_model': dict(lr_mult=0),
+        }))
+
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+
+default_hooks = dict(
+    checkpoint=dict(
+        max_keep_ckpts=1, save_best='coco/novel_ap50', rule='greater'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_sft_coco.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_sft_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5505df58b8b103a93570519c20aaf0fcc144e91c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_sft_coco.py
@@ -0,0 +1,93 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=20,  # ======= important =====
+        label_map_file='data/coco/annotations/coco2017_label_map.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='ODVGDataset',
+        need_text=False,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017_od.json',
+        label_map_file='annotations/coco2017_label_map.json',
+        data_prefix=dict(img='train2017/'),
+        return_classes=True,
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        pipeline=train_pipeline))
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.00005, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            'language_model': dict(lr_mult=0.0),
+        }))
+
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py
new file mode 100644
index 0000000000000000000000000000000000000000..e59a0a52518aa125d556aab12f8076a95f39ec22
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py
@@ -0,0 +1,78 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/d3/'
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities', 'sent_ids'))
+]
+
+# -------------------------------------------------#
+val_dataset_full = dict(
+    type='DODDataset',
+    data_root=data_root,
+    ann_file='d3_json/d3_full_annotations.json',
+    data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+    pipeline=test_pipeline,
+    test_mode=True,
+    backend_args=None,
+    return_classes=True)
+
+val_evaluator_full = dict(
+    type='DODCocoMetric',
+    ann_file=data_root + 'd3_json/d3_full_annotations.json')
+
+# -------------------------------------------------#
+val_dataset_pres = dict(
+    type='DODDataset',
+    data_root=data_root,
+    ann_file='d3_json/d3_pres_annotations.json',
+    data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+    pipeline=test_pipeline,
+    test_mode=True,
+    backend_args=None,
+    return_classes=True)
+val_evaluator_pres = dict(
+    type='DODCocoMetric',
+    ann_file=data_root + 'd3_json/d3_pres_annotations.json')
+
+# -------------------------------------------------#
+val_dataset_abs = dict(
+    type='DODDataset',
+    data_root=data_root,
+    ann_file='d3_json/d3_abs_annotations.json',
+    data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+    pipeline=test_pipeline,
+    test_mode=True,
+    backend_args=None,
+    return_classes=True)
+val_evaluator_abs = dict(
+    type='DODCocoMetric',
+    ann_file=data_root + 'd3_json/d3_abs_annotations.json')
+
+# -------------------------------------------------#
+datasets = [val_dataset_full, val_dataset_pres, val_dataset_abs]
+dataset_prefixes = ['FULL', 'PRES', 'ABS']
+metrics = [val_evaluator_full, val_evaluator_pres, val_evaluator_abs]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d680091162e5ac96c15c76b58a18764e85d3233
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py
@@ -0,0 +1,3 @@
+_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py'
+
+model = dict(test_cfg=dict(chunked_size=1))
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_flickr30k.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_flickr30k.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9eb783da97a6d665002cc9192f740010282870e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_flickr30k.py
@@ -0,0 +1,57 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+dataset_type = 'Flickr30kDataset'
+data_root = 'data/flickr30k_entities/'
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive', 'phrase_ids', 'phrases'))
+]
+
+dataset_Flickr30k_val = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='final_flickr_separateGT_val.json',
+    data_prefix=dict(img='flickr30k_images/'),
+    pipeline=test_pipeline,
+)
+
+dataset_Flickr30k_test = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='final_flickr_separateGT_test.json',
+    data_prefix=dict(img='flickr30k_images/'),
+    pipeline=test_pipeline,
+)
+
+val_evaluator_Flickr30k = dict(type='Flickr30kMetric')
+
+test_evaluator_Flickr30k = dict(type='Flickr30kMetric')
+
+# ----------Config---------- #
+dataset_prefixes = ['Flickr30kVal', 'Flickr30kTest']
+datasets = [dataset_Flickr30k_val, dataset_Flickr30k_test]
+metrics = [val_evaluator_Flickr30k, test_evaluator_Flickr30k]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-l_pretrain_all.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-l_pretrain_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..46241e2e03b53263e5527abd050f55b8ff394298
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-l_pretrain_all.py
@@ -0,0 +1,542 @@
+_base_ = 'grounding_dino_swin-t_pretrain_obj365.py'
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'  # noqa
+num_levels = 5
+model = dict(
+    num_feature_levels=num_levels,
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        pretrain_img_size=384,
+        embed_dims=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        # Please only add indices that would be used
+        # in FPN, otherwise some parameter will not be used
+        with_cp=True,
+        convert_weights=True,
+        frozen_stages=-1,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
+    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
+    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
+
+# --------------------------- object365v2 od dataset---------------------------
+# objv2_backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/objects365v2/': 'yudong:s3://wangyudong/obj365_v2/',
+#         'data/objects365v2/': 'yudong:s3://wangyudong/obj365_v2/'
+#     }))
+objv2_backend_args = None
+
+objv2_train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=objv2_backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        # change this
+        label_map_file='data/objects365v2/annotations/o365v2_label_map.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+o365v2_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/objects365v2/',
+    ann_file='annotations/zhiyuan_objv2_train_od.json',
+    label_map_file='annotations/o365v2_label_map.json',
+    data_prefix=dict(img='train/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=objv2_train_pipeline,
+    return_classes=True,
+    need_text=False,
+    backend_args=None,
+)
+
+# --------------------------- openimagev6 od dataset---------------------------
+# oi_backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+oi_backend_args = None
+
+oi_train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=oi_backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        # change this
+        label_map_file='data/OpenImages/annotations/openimages_label_map.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+oiv6_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/OpenImages/',
+    ann_file='annotations/oidv6-train-annotations_od.json',
+    label_map_file='annotations/openimages_label_map.json',
+    data_prefix=dict(img='OpenImages/train/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    need_text=False,
+    pipeline=oi_train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+# --------------------------- v3det od dataset---------------------------
+v3d_train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        # change this
+        label_map_file='data/V3Det/annotations/v3det_2023_v1_label_map.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+v3det_dataset = dict(
+    type='RepeatDataset',
+    times=2,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root='data/V3Det/',
+        ann_file='annotations/v3det_2023_v1_train_od.json',
+        label_map_file='annotations/v3det_2023_v1_label_map.json',
+        data_prefix=dict(img=''),
+        filter_cfg=dict(filter_empty_gt=False),
+        need_text=False,
+        pipeline=v3d_train_pipeline,
+        return_classes=True,
+        backend_args=None))
+
+# --------------------------- lvis od dataset---------------------------
+lvis_train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        # change this
+        label_map_file='data/coco/annotations/lvis_v1_label_map.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+lvis_dataset = dict(
+    type='ClassBalancedDataset',
+    oversample_thr=1e-3,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root='data/coco/',
+        ann_file='annotations/lvis_v1_train_od.json',
+        label_map_file='annotations/lvis_v1_label_map.json',
+        data_prefix=dict(img=''),
+        filter_cfg=dict(filter_empty_gt=False),
+        need_text=False,  # change this
+        pipeline=lvis_train_pipeline,
+        return_classes=True,
+        backend_args=None))
+
+# --------------------------- coco2017 od dataset---------------------------
+coco2017_train_dataset = dict(
+    type='RepeatDataset',
+    times=2,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root='data/coco/',
+        ann_file='annotations/instance_train2017_norefval_od.json',
+        label_map_file='annotations/coco2017_label_map.json',
+        data_prefix=dict(img='train2017'),
+        filter_cfg=dict(filter_empty_gt=False),
+        pipeline=_base_.train_pipeline,
+        return_classes=True,
+        backend_args=None))
+
+# --------------------------- flickr30k vg dataset---------------------------
+flickr30k_dataset = dict(
+    type='RepeatDataset',
+    times=2,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root='data/flickr30k_entities/',
+        ann_file='final_flickr_separateGT_train_vg.json',
+        label_map_file=None,
+        data_prefix=dict(img='flickr30k_images/'),
+        filter_cfg=dict(filter_empty_gt=False),
+        pipeline=_base_.train_pipeline,
+        return_classes=True,
+        backend_args=None))
+
+# --------------------------- gqa vg dataset---------------------------
+gqa_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/gqa/',
+    ann_file='final_mixed_train_no_coco_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='images/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+# --------------------------- coco2014 vg dataset---------------------------
+coco2014_vg_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/coco/',
+    ann_file='mdetr_annotations/final_mixed_train_only_coco_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='train2014/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+# --------------------------- refcoco vg dataset---------------------------
+refcoco_dataset = dict(
+    type='RepeatDataset',
+    times=2,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root='data/coco/',
+        ann_file='mdetr_annotations/finetune_refcoco_train_vg.json',
+        label_map_file=None,
+        data_prefix=dict(img='train2014'),
+        filter_cfg=dict(filter_empty_gt=False),
+        pipeline=_base_.train_pipeline,
+        return_classes=True,
+        backend_args=None))
+
+# --------------------------- refcoco+ vg dataset---------------------------
+refcoco_plus_dataset = dict(
+    type='RepeatDataset',
+    times=2,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root='data/coco/',
+        ann_file='mdetr_annotations/finetune_refcoco+_train_vg.json',
+        label_map_file=None,
+        data_prefix=dict(img='train2014'),
+        filter_cfg=dict(filter_empty_gt=False),
+        pipeline=_base_.train_pipeline,
+        return_classes=True,
+        backend_args=None))
+
+# --------------------------- refcocog vg dataset---------------------------
+refcocog_dataset = dict(
+    type='RepeatDataset',
+    times=3,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root='data/coco/',
+        ann_file='mdetr_annotations/finetune_refcocog_train_vg.json',
+        label_map_file=None,
+        data_prefix=dict(img='train2014'),
+        filter_cfg=dict(filter_empty_gt=False),
+        pipeline=_base_.train_pipeline,
+        return_classes=True,
+        backend_args=None))
+
+# --------------------------- grefcoco vg dataset---------------------------
+grefcoco_dataset = dict(
+    type='RepeatDataset',
+    times=2,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root='data/coco/',
+        ann_file='mdetr_annotations/finetune_grefcoco_train_vg.json',
+        label_map_file=None,
+        data_prefix=dict(img='train2014'),
+        filter_cfg=dict(filter_empty_gt=False),
+        pipeline=_base_.train_pipeline,
+        return_classes=True,
+        backend_args=None))
+
+# --------------------------- grit vg dataset---------------------------
+# grit_backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/grit/': 'yichen:s3://chenyicheng/grit/',
+#         'data/grit/': 'yichen:s3://chenyicheng/grit/'
+#     }))
+grit_backend_args = None
+
+grit_train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=grit_backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+grit_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/grit/',
+    ann_file='grit20m_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img=''),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=grit_train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+# --------------------------- dataloader---------------------------
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    sampler=dict(
+        _delete_=True,
+        type='CustomSampleSizeSampler',
+        ratio_mode=True,
+        # OD ~ 1.74+1.67*0.5+0.18*2+0.12*2+0.1=3.2
+        # vg ~ 0.15*2+0.62*1+0.49*1+0.12*2+0.12*2+0.08*3+0.19*2+9*0.09=3.3
+        dataset_size=[-1, 0.5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0.09]),
+    dataset=dict(datasets=[
+        o365v2_dataset,  # 1.74M
+        oiv6_dataset,  # 1.67M
+        v3det_dataset,  # 0.18M
+        coco2017_train_dataset,  # 0.12M
+        lvis_dataset,  # 0.1M
+        flickr30k_dataset,  # 0.15M
+        gqa_dataset,  # 0.62M
+        coco2014_vg_dataset,  # 0.49M
+        refcoco_dataset,  # 0.12M
+        refcoco_plus_dataset,  # 0.12M
+        refcocog_dataset,  # 0.08M
+        grefcoco_dataset,  # 0.19M
+        grit_dataset  # 9M
+    ]))
+
+# bs=256
+optim_wrapper = dict(optimizer=dict(lr=0.0008))
+
+# one epoch = (3.2+3.3)M/256 = 25390 iter
+# 24e=609360 iter
+# 16e=406240 iter
+# 20e=507800 iter
+max_iter = 609360
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=max_iter,
+    val_interval=13000)
+
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[406240, 507800],
+        gamma=0.1)
+]
+
+default_hooks = dict(
+    checkpoint=dict(by_epoch=False, interval=13000, max_keep_ckpts=30))
+log_processor = dict(by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_finetune_8xb4_20e_cat.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_finetune_8xb4_20e_cat.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf3b35894eb5fcee6db9f02c2ab8a837cd6da20b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_finetune_8xb4_20e_cat.py
@@ -0,0 +1,102 @@
+_base_ = 'grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/cat/'
+class_name = ('cat', )
+num_classes = len(class_name)
+metainfo = dict(classes=class_name, palette=[(220, 20, 60)])
+
+model = dict(bbox_head=dict(num_classes=num_classes))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='CocoDataset',
+        data_root=data_root,
+        metainfo=metainfo,
+        return_classes=True,
+        pipeline=train_pipeline,
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        ann_file='annotations/trainval.json',
+        data_prefix=dict(img='images/')))
+
+val_dataloader = dict(
+    dataset=dict(
+        metainfo=metainfo,
+        data_root=data_root,
+        ann_file='annotations/test.json',
+        data_prefix=dict(img='images/')))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(ann_file=data_root + 'annotations/test.json')
+test_evaluator = val_evaluator
+
+max_epoch = 20
+
+default_hooks = dict(
+    checkpoint=dict(interval=1, max_keep_ckpts=1, save_best='auto'),
+    logger=dict(type='LoggerHook', interval=5))
+train_cfg = dict(max_epochs=max_epoch, val_interval=1)
+
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epoch,
+        by_epoch=True,
+        milestones=[15],
+        gamma=0.1)
+]
+
+optim_wrapper = dict(
+    optimizer=dict(lr=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.0),
+            'language_model': dict(lr_mult=0.0)
+        }))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365.py
new file mode 100644
index 0000000000000000000000000000000000000000..66060f45ea735ab5bbd8e1852c035ea20adcbd80
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365.py
@@ -0,0 +1,247 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+lang_model_name = 'bert-base-uncased'
+
+model = dict(
+    type='GroundingDINO',
+    num_queries=900,
+    with_box_refine=True,
+    as_two_stage=True,
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=False,
+    ),
+    language_model=dict(
+        type='BertModel',
+        name=lang_model_name,
+        max_tokens=256,
+        pad_to_max=False,
+        use_sub_sentence_represent=True,
+        special_tokens_list=['[CLS]', '[SEP]', '.', '?'],
+        add_pooling_layer=False,
+    ),
+    backbone=dict(
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=True,
+        convert_weights=True,
+        frozen_stages=-1,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(
+        type='ChannelMapper',
+        in_channels=[192, 384, 768],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        bias=True,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(
+        num_layers=6,
+        num_cp=6,
+        # visual layer config
+        layer_cfg=dict(
+            self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        # text layer config
+        text_layer_cfg=dict(
+            self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)),
+        # fusion layer config
+        fusion_layer_cfg=dict(
+            v_dim=256,
+            l_dim=256,
+            embed_dim=1024,
+            num_heads=4,
+            init_values=1e-4),
+    ),
+    decoder=dict(
+        num_layers=6,
+        return_intermediate=True,
+        layer_cfg=dict(
+            # query self attention layer
+            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to text
+            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            # cross attention layer query to image
+            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)),
+        post_norm_cfg=None),
+    positional_encoding=dict(
+        num_feats=128, normalize=True, offset=0.0, temperature=20),
+    bbox_head=dict(
+        type='GroundingDINOHead',
+        num_classes=256,
+        sync_cls_avg_factor=True,
+        contrastive_cfg=dict(max_text_len=256, log_scale='auto', bias=True),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),  # 2.0 in DeformDETR
+        loss_bbox=dict(type='L1Loss', loss_weight=5.0)),
+    dn_cfg=dict(  # TODO: Move to model.train_cfg ?
+        label_noise_scale=0.5,
+        box_noise_scale=1.0,  # 0.4 for DN-DETR
+        group_cfg=dict(dynamic=True, num_groups=None,
+                       num_dn_queries=100)),  # TODO: half num_dn_queries
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='HungarianAssigner',
+            match_costs=[
+                dict(type='BinaryFocalLossCost', weight=2.0),
+                dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                dict(type='IoUCost', iou_mode='giou', weight=2.0)
+            ])),
+    test_cfg=dict(max_per_img=300))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=lang_model_name,
+        num_sample_negative=85,
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive'))
+]
+
+dataset_type = 'ODVGDataset'
+data_root = 'data/objects365v1/'
+
+coco_od_dataset = dict(
+    type=dataset_type,
+    data_root=data_root,
+    ann_file='o365v1_train_odvg.json',
+    label_map_file='o365v1_label_map.json',
+    data_prefix=dict(img='train/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+train_dataloader = dict(
+    _delete_=True,
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(type='ConcatDataset', datasets=[coco_od_dataset]))
+
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, return_classes=True))
+test_dataloader = val_dataloader
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0004,
+                   weight_decay=0.0001),  # bs=16 0.0001
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            'language_model': dict(lr_mult=0.1),
+        }))
+
+# learning policy
+max_epochs = 30
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[19, 26],
+        gamma=0.1)
+]
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
+
+default_hooks = dict(visualization=dict(type='GroundingVisualizationHook'))
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7f388bdd4e8b61d1e7b6fd19445b3628164c4a0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg.py
@@ -0,0 +1,38 @@
+_base_ = 'grounding_dino_swin-t_pretrain_obj365.py'
+
+o365v1_od_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/objects365v1/',
+    ann_file='o365v1_train_odvg.json',
+    label_map_file='o365v1_label_map.json',
+    data_prefix=dict(img='train/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None,
+)
+
+flickr30k_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/flickr30k_entities/',
+    ann_file='final_flickr_separateGT_train_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='flickr30k_images/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+gqa_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/gqa/',
+    ann_file='final_mixed_train_no_coco_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='images/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+train_dataloader = dict(
+    dataset=dict(datasets=[o365v1_od_dataset, flickr30k_dataset, gqa_dataset]))
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e9f5ca4aaba7afb631f76b8a575101868fed2a4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m.py
@@ -0,0 +1,55 @@
+_base_ = 'grounding_dino_swin-t_pretrain_obj365.py'
+
+o365v1_od_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/objects365v1/',
+    ann_file='o365v1_train_odvg.json',
+    label_map_file='o365v1_label_map.json',
+    data_prefix=dict(img='train/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None,
+)
+
+flickr30k_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/flickr30k_entities/',
+    ann_file='final_flickr_separateGT_train_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='flickr30k_images/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+gqa_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/gqa/',
+    ann_file='final_mixed_train_no_coco_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='images/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+grit_dataset = dict(
+    type='ODVGDataset',
+    data_root='grit_processed/',
+    ann_file='grit20m_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img=''),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+train_dataloader = dict(
+    sampler=dict(
+        _delete_=True,
+        type='CustomSampleSizeSampler',
+        dataset_size=[-1, -1, -1, 500000]),
+    dataset=dict(datasets=[
+        o365v1_od_dataset, flickr30k_dataset, gqa_dataset, grit_dataset
+    ]))
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det.py
new file mode 100644
index 0000000000000000000000000000000000000000..56e500c86932a8e61dba88fde2bfc00c0ced5585
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det.py
@@ -0,0 +1,117 @@
+_base_ = 'grounding_dino_swin-t_pretrain_obj365.py'
+
+o365v1_od_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/objects365v1/',
+    ann_file='o365v1_train_odvg.json',
+    label_map_file='o365v1_label_map.json',
+    data_prefix=dict(img='train/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None,
+)
+
+flickr30k_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/flickr30k_entities/',
+    ann_file='final_flickr_separateGT_train_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='flickr30k_images/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+gqa_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/gqa/',
+    ann_file='final_mixed_train_no_coco_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='images/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+v3d_train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        # change this
+        label_map_file='data/V3Det/annotations/v3det_2023_v1_label_map.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+v3det_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/V3Det/',
+    ann_file='annotations/v3det_2023_v1_train_od.json',
+    label_map_file='annotations/v3det_2023_v1_label_map.json',
+    data_prefix=dict(img=''),
+    filter_cfg=dict(filter_empty_gt=False),
+    need_text=False,  # change this
+    pipeline=v3d_train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+grit_dataset = dict(
+    type='ODVGDataset',
+    data_root='grit_processed/',
+    ann_file='grit20m_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img=''),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+train_dataloader = dict(
+    sampler=dict(
+        _delete_=True,
+        type='CustomSampleSizeSampler',
+        dataset_size=[-1, -1, -1, -1, 500000]),
+    dataset=dict(datasets=[
+        o365v1_od_dataset, flickr30k_dataset, gqa_dataset, v3det_dataset,
+        grit_dataset
+    ]))
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_v3det.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_v3det.py
new file mode 100644
index 0000000000000000000000000000000000000000..c89014fbbe43a1e7787fa46d7d850d42a64ff8a9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_v3det.py
@@ -0,0 +1,101 @@
+_base_ = 'grounding_dino_swin-t_pretrain_obj365.py'
+
+o365v1_od_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/objects365v1/',
+    ann_file='o365v1_train_odvg.json',
+    label_map_file='o365v1_label_map.json',
+    data_prefix=dict(img='train/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None,
+)
+
+flickr30k_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/flickr30k_entities/',
+    ann_file='final_flickr_separateGT_train_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='flickr30k_images/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+gqa_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/gqa/',
+    ann_file='final_mixed_train_no_coco_vg.json',
+    label_map_file=None,
+    data_prefix=dict(img='images/'),
+    filter_cfg=dict(filter_empty_gt=False),
+    pipeline=_base_.train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+v3d_train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        # change this
+        label_map_file='data/V3Det/annotations/v3det_2023_v1_label_map.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+v3det_dataset = dict(
+    type='ODVGDataset',
+    data_root='data/V3Det/',
+    ann_file='annotations/v3det_2023_v1_train_od.json',
+    label_map_file='annotations/v3det_2023_v1_label_map.json',
+    data_prefix=dict(img=''),
+    filter_cfg=dict(filter_empty_gt=False),
+    need_text=False,  # change this
+    pipeline=v3d_train_pipeline,
+    return_classes=True,
+    backend_args=None)
+
+train_dataloader = dict(
+    dataset=dict(datasets=[
+        o365v1_od_dataset, flickr30k_dataset, gqa_dataset, v3det_dataset
+    ]))
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_pseudo-labeling_cat.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_pseudo-labeling_cat.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dc8dcd8df4b98a3fdb3aa26d73ce353b9251f50
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_pseudo-labeling_cat.py
@@ -0,0 +1,43 @@
+_base_ = 'grounding_dino_swin-t_pretrain_obj365.py'
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadTextAnnotations'),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive'))
+]
+
+data_root = 'data/cat/'
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=False,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root=data_root,
+        label_map_file='cat_label_map.json',
+        ann_file='cat_train_od.json',
+        data_prefix=dict(img='images/'),
+        pipeline=test_pipeline,
+        return_classes=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    outfile_path=data_root + 'cat_train_od_v1.json',
+    img_prefix=data_root + 'images/',
+    score_thr=0.7,
+    nms_thr=0.5,
+    type='DumpODVGResults')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_pseudo-labeling_flickr30k.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_pseudo-labeling_flickr30k.py
new file mode 100644
index 0000000000000000000000000000000000000000..78bf1c344bf7c795ace08283b745527dfc9b15f7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_pseudo-labeling_flickr30k.py
@@ -0,0 +1,42 @@
+_base_ = 'grounding_dino_swin-t_pretrain_obj365.py'
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadTextAnnotations'),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive'))
+]
+
+data_root = 'data/flickr30k_entities/'
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=False,
+    dataset=dict(
+        type='ODVGDataset',
+        data_root=data_root,
+        ann_file='flickr_simple_train_vg.json',
+        data_prefix=dict(img='flickr30k_images/'),
+        pipeline=test_pipeline,
+        return_classes=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    outfile_path=data_root + 'flickr_simple_train_vg_v1.json',
+    img_prefix=data_root + 'flickr30k_images/',
+    score_thr=0.4,
+    nms_thr=0.5,
+    type='DumpODVGResults')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ba12c9067511b00b616781ca0cf2e477e5e689e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis.py
@@ -0,0 +1,120 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        # change this
+        label_map_file='data/coco/annotations/lvis_v1_label_map.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(
+            type='ODVGDataset',
+            data_root=data_root,
+            need_text=False,
+            label_map_file='annotations/lvis_v1_label_map.json',
+            ann_file='annotations/lvis_v1_train_od.json',
+            data_prefix=dict(img=''),
+            filter_cfg=dict(filter_empty_gt=False, min_size=32),
+            return_classes=True,
+            pipeline=train_pipeline)))
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type='LVISV1Dataset',
+        ann_file='annotations/lvis_v1_minival_inserted_image_name.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root +
+    'annotations/lvis_v1_minival_inserted_image_name.json')
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            # 'language_model': dict(lr_mult=0),
+        }))
+
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=3)
+
+default_hooks = dict(
+    checkpoint=dict(
+        max_keep_ckpts=1, save_best='lvis_fixed_ap/AP', rule='greater'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis_866_337.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis_866_337.py
new file mode 100644
index 0000000000000000000000000000000000000000..28d0141d3e2c0feba26ae4ed924000960c311bf5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis_866_337.py
@@ -0,0 +1,120 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        # change this
+        label_map_file='data/coco/annotations/lvis_v1_label_map_norare.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(
+            type='ODVGDataset',
+            data_root=data_root,
+            need_text=False,
+            label_map_file='annotations/lvis_v1_label_map_norare.json',
+            ann_file='annotations/lvis_v1_train_od_norare.json',
+            data_prefix=dict(img=''),
+            filter_cfg=dict(filter_empty_gt=False, min_size=32),
+            return_classes=True,
+            pipeline=train_pipeline)))
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type='LVISV1Dataset',
+        ann_file='annotations/lvis_v1_minival_inserted_image_name.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root +
+    'annotations/lvis_v1_minival_inserted_image_name.json')
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.00005, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            # 'language_model': dict(lr_mult=0),
+        }))
+
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=3)
+
+default_hooks = dict(
+    checkpoint=dict(
+        max_keep_ckpts=3, save_best='lvis_fixed_ap/AP', rule='greater'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb4ed438e0b59ca4c991836310cf7103cc02f0f2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py
@@ -0,0 +1,24 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/coco/'
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type=dataset_type,
+        ann_file='annotations/lvis_od_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+# numpy < 1.24.0
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root + 'annotations/lvis_od_val.json')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..406a39a4264a0d6ea5d7950a205b0bac72e8f846
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py
@@ -0,0 +1,25 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+model = dict(test_cfg=dict(
+    max_per_img=300,
+    chunked_size=40,
+))
+
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/coco/'
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root,
+        type=dataset_type,
+        ann_file='annotations/lvis_v1_minival_inserted_image_name.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+# numpy < 1.24.0
+val_evaluator = dict(
+    _delete_=True,
+    type='LVISFixedAPMetric',
+    ann_file=data_root +
+    'annotations/lvis_v1_minival_inserted_image_name.json')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/metafile.yml b/mmde/mmdet/.mim/configs/mm_grounding_dino/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3071686e7ac9aeda34157f1fbf5d94e12530839e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/metafile.yml
@@ -0,0 +1,54 @@
+Collections:
+  - Name: MM Grounding DINO
+    Metadata:
+      Training Data: Objects365, GoldG, GRIT and V3Det
+      Training Techniques:
+        - AdamW
+        - Multi Scale Train
+        - Gradient Clip
+      Training Resources: 3090 GPUs
+      Architecture:
+        - Swin Transformer
+        - BERT
+    README: configs/mm_grounding_dino/README.md
+    Code:
+      URL:
+      Version: v3.0.0
+
+Models:
+  - Name: grounding_dino_swin-t_pretrain_obj365_goldg
+    In Collection: MM Grounding DINO
+    Config: configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.4
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg/grounding_dino_swin-t_pretrain_obj365_goldg_20231122_132602-4ea751ce.pth
+  - Name: grounding_dino_swin-t_pretrain_obj365_goldg_grit9m
+    In Collection: MM Grounding DINO
+    Config: configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.5
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_20231128_200818-169cc352.pth
+  - Name: grounding_dino_swin-t_pretrain_obj365_goldg_v3det
+    In Collection: MM Grounding DINO
+    Config: configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_v3det.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.6
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_v3det_20231218_095741-e316e297.pth
+  - Name: grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det
+    In Collection: MM Grounding DINO
+    Config: configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det.py
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.4
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py
new file mode 100644
index 0000000000000000000000000000000000000000..d87ca7ca1ea48a3cff83e15f3e2ad66927598d7f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py
@@ -0,0 +1,338 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'  # noqa
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    test_mode=True,
+    pipeline=base_test_pipeline,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'penguin': {
+#         'suffix': ', which is black and white'
+#     },
+#     'puffin': {
+#         'suffix': ' with orange beaks'
+#     },
+#     'stingray': {
+#         'suffix': ' which is flat and round'
+#     },
+# }
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 CottontailRabbits---------------------#
+class_name = ('Cottontail-Rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+
+# caption_prompt = None
+caption_prompt = {'Cottontail-Rabbit': {'name': 'rabbit'}}
+
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 EgoHands---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+
+# caption_prompt = None
+caption_prompt = {'hand': {'suffix': ' of a person'}}
+
+dataset_EgoHands = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 NorthAmericaMushrooms---------------------#
+class_name = ('CoW', 'chanterelle')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+
+# caption_prompt = None
+caption_prompt = {
+    'CoW': {
+        'name': 'flat mushroom'
+    },
+    'chanterelle': {
+        'name': 'yellow mushroom'
+    }
+}
+
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+
+# caption_prompt = None
+caption_prompt = {
+    'package': {
+        'prefix': 'there is a ',
+        'suffix': ' on the porch'
+    }
+}
+
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+
+# caption_prompt = None
+caption_prompt = {
+    'pothole': {
+        'prefix': 'there are some ',
+        'name': 'holes',
+        'suffix': ' on the road'
+    }
+}
+
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+dataset_prefixes = [
+    'AerialMaritimeDrone', 'Aquarium', 'CottontailRabbits', 'EgoHands',
+    'NorthAmericaMushrooms', 'Packages', 'PascalVOC', 'pistols', 'pothole',
+    'Raccoon', 'ShellfishOpenImages', 'thermalDogsAndPeople',
+    'VehiclesOpenImages'
+]
+datasets = [
+    dataset_AerialMaritimeDrone, dataset_Aquarium, dataset_CottontailRabbits,
+    dataset_EgoHands, dataset_NorthAmericaMushrooms, dataset_Packages,
+    dataset_PascalVOC, dataset_pistols, dataset_pothole, dataset_Raccoon,
+    dataset_ShellfishOpenImages, dataset_thermalDogsAndPeople,
+    dataset_VehiclesOpenImages
+]
+metrics = [
+    val_evaluator_AerialMaritimeDrone, val_evaluator_Aquarium,
+    val_evaluator_CottontailRabbits, val_evaluator_EgoHands,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_Packages,
+    val_evaluator_PascalVOC, val_evaluator_pistols, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_ShellfishOpenImages,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_VehiclesOpenImages
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6b8566aed486ef48653b6e54200cb8817910f2f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py
@@ -0,0 +1,794 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'  # noqa
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone_large---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone_large = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_large = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 AerialMaritimeDrone_tiled---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/tiled/'
+dataset_AerialMaritimeDrone_tiled = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_tiled = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 AmericanSignLanguageLetters---------------------#
+class_name = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+              'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AmericanSignLanguageLetters/American Sign Language Letters.v1-v1.coco/'  # noqa
+dataset_AmericanSignLanguageLetters = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AmericanSignLanguageLetters = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 BCCD---------------------#
+class_name = ('Platelets', 'RBC', 'WBC')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'BCCD/BCCD.v3-raw.coco/'
+dataset_BCCD = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_BCCD = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 boggleBoards---------------------#
+class_name = ('Q', 'a', 'an', 'b', 'c', 'd', 'e', 'er', 'f', 'g', 'h', 'he',
+              'i', 'in', 'j', 'k', 'l', 'm', 'n', 'o', 'o ', 'p', 'q', 'qu',
+              'r', 's', 't', 't\\', 'th', 'u', 'v', 'w', 'wild', 'x', 'y', 'z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'boggleBoards/416x416AutoOrient/export/'
+dataset_boggleBoards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_boggleBoards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 brackishUnderwater---------------------#
+class_name = ('crab', 'fish', 'jellyfish', 'shrimp', 'small_fish', 'starfish')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'brackishUnderwater/960x540/'
+dataset_brackishUnderwater = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_brackishUnderwater = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 ChessPieces---------------------#
+class_name = ('  ', 'black bishop', 'black king', 'black knight', 'black pawn',
+              'black queen', 'black rook', 'white bishop', 'white king',
+              'white knight', 'white pawn', 'white queen', 'white rook')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+dataset_ChessPieces = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ChessPieces = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 CottontailRabbits---------------------#
+class_name = ('rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 dice---------------------#
+class_name = ('1', '2', '3', '4', '5', '6')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'dice/mediumColor/export/'
+dataset_dice = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_dice = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 DroneControl---------------------#
+class_name = ('follow', 'follow_hand', 'land', 'land_hand', 'null', 'object',
+              'takeoff', 'takeoff-hand')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'DroneControl/Drone Control.v3-raw.coco/'
+dataset_DroneControl = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_DroneControl = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 EgoHands_generic---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+caption_prompt = {'hand': {'suffix': ' of a person'}}
+dataset_EgoHands_generic = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_generic = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 EgoHands_specific---------------------#
+class_name = ('myleft', 'myright', 'yourleft', 'yourright')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/specific/'
+dataset_EgoHands_specific = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_specific = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------14 HardHatWorkers---------------------#
+class_name = ('head', 'helmet', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'HardHatWorkers/raw/'
+dataset_HardHatWorkers = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_HardHatWorkers = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------15 MaskWearing---------------------#
+class_name = ('mask', 'no-mask')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MaskWearing/raw/'
+dataset_MaskWearing = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MaskWearing = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------16 MountainDewCommercial---------------------#
+class_name = ('bottle', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MountainDewCommercial/'
+dataset_MountainDewCommercial = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MountainDewCommercial = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------17 NorthAmericaMushrooms---------------------#
+class_name = ('flat mushroom', 'yellow mushroom')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------18 openPoetryVision---------------------#
+class_name = ('American Typewriter', 'Andale Mono', 'Apple Chancery', 'Arial',
+              'Avenir', 'Baskerville', 'Big Caslon', 'Bradley Hand',
+              'Brush Script MT', 'Chalkboard', 'Comic Sans MS', 'Copperplate',
+              'Courier', 'Didot', 'Futura', 'Geneva', 'Georgia', 'Gill Sans',
+              'Helvetica', 'Herculanum', 'Impact', 'Kefa', 'Lucida Grande',
+              'Luminari', 'Marker Felt', 'Menlo', 'Monaco', 'Noteworthy',
+              'Optima', 'PT Sans', 'PT Serif', 'Palatino', 'Papyrus',
+              'Phosphate', 'Rockwell', 'SF Pro', 'SignPainter', 'Skia',
+              'Snell Roundhand', 'Tahoma', 'Times New Roman', 'Trebuchet MS',
+              'Verdana')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'openPoetryVision/512x512/'
+dataset_openPoetryVision = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_openPoetryVision = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------19 OxfordPets_by_breed---------------------#
+class_name = ('cat-Abyssinian', 'cat-Bengal', 'cat-Birman', 'cat-Bombay',
+              'cat-British_Shorthair', 'cat-Egyptian_Mau', 'cat-Maine_Coon',
+              'cat-Persian', 'cat-Ragdoll', 'cat-Russian_Blue', 'cat-Siamese',
+              'cat-Sphynx', 'dog-american_bulldog',
+              'dog-american_pit_bull_terrier', 'dog-basset_hound',
+              'dog-beagle', 'dog-boxer', 'dog-chihuahua',
+              'dog-english_cocker_spaniel', 'dog-english_setter',
+              'dog-german_shorthaired', 'dog-great_pyrenees', 'dog-havanese',
+              'dog-japanese_chin', 'dog-keeshond', 'dog-leonberger',
+              'dog-miniature_pinscher', 'dog-newfoundland', 'dog-pomeranian',
+              'dog-pug', 'dog-saint_bernard', 'dog-samoyed',
+              'dog-scottish_terrier', 'dog-shiba_inu',
+              'dog-staffordshire_bull_terrier', 'dog-wheaten_terrier',
+              'dog-yorkshire_terrier')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-breed/'  # noqa
+dataset_OxfordPets_by_breed = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_breed = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------20 OxfordPets_by_species---------------------#
+class_name = ('cat', 'dog')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-species/'  # noqa
+dataset_OxfordPets_by_species = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_species = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------21 PKLot---------------------#
+class_name = ('space-empty', 'space-occupied')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PKLot/640/'  # noqa
+dataset_PKLot = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PKLot = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------22 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+caption_prompt = {
+    'package': {
+        'prefix': 'there is a ',
+        'suffix': ' on the porch'
+    }
+}
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------23 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------24 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------25 plantdoc---------------------#
+class_name = ('Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf',
+              'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf',
+              'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight',
+              'Corn rust leaf', 'Peach leaf', 'Potato leaf',
+              'Potato leaf early blight', 'Potato leaf late blight',
+              'Raspberry leaf', 'Soyabean leaf', 'Soybean leaf',
+              'Squash Powdery mildew leaf', 'Strawberry leaf',
+              'Tomato Early blight leaf', 'Tomato Septoria leaf spot',
+              'Tomato leaf', 'Tomato leaf bacterial spot',
+              'Tomato leaf late blight', 'Tomato leaf mosaic virus',
+              'Tomato leaf yellow virus', 'Tomato mold leaf',
+              'Tomato two spotted spider mites leaf', 'grape leaf',
+              'grape leaf black rot')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'plantdoc/416x416/'
+dataset_plantdoc = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_plantdoc = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------26 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+caption_prompt = {
+    'pothole': {
+        'name': 'holes',
+        'prefix': 'there are some ',
+        'suffix': ' on the road'
+    }
+}
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    caption_prompt=caption_prompt,
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------27 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------28 selfdrivingCar---------------------#
+class_name = ('biker', 'car', 'pedestrian', 'trafficLight',
+              'trafficLight-Green', 'trafficLight-GreenLeft',
+              'trafficLight-Red', 'trafficLight-RedLeft',
+              'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'selfdrivingCar/fixedLarge/export/'
+dataset_selfdrivingCar = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_selfdrivingCar = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------29 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------30 ThermalCheetah---------------------#
+class_name = ('cheetah', 'human')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ThermalCheetah/'
+dataset_ThermalCheetah = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ThermalCheetah = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------31 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------32 UnoCards---------------------#
+class_name = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
+              '12', '13', '14')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'UnoCards/raw/'
+dataset_UnoCards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_UnoCards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------33 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------34 WildfireSmoke---------------------#
+class_name = ('smoke', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'WildfireSmoke/'
+dataset_WildfireSmoke = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_WildfireSmoke = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------35 websiteScreenshots---------------------#
+class_name = ('button', 'field', 'heading', 'iframe', 'image', 'label', 'link',
+              'text')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'websiteScreenshots/'
+dataset_websiteScreenshots = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_websiteScreenshots = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+
+dataset_prefixes = [
+    'AerialMaritimeDrone_large',
+    'AerialMaritimeDrone_tiled',
+    'AmericanSignLanguageLetters',
+    'Aquarium',
+    'BCCD',
+    'boggleBoards',
+    'brackishUnderwater',
+    'ChessPieces',
+    'CottontailRabbits',
+    'dice',
+    'DroneControl',
+    'EgoHands_generic',
+    'EgoHands_specific',
+    'HardHatWorkers',
+    'MaskWearing',
+    'MountainDewCommercial',
+    'NorthAmericaMushrooms',
+    'openPoetryVision',
+    'OxfordPets_by_breed',
+    'OxfordPets_by_species',
+    'PKLot',
+    'Packages',
+    'PascalVOC',
+    'pistols',
+    'plantdoc',
+    'pothole',
+    'Raccoons',
+    'selfdrivingCar',
+    'ShellfishOpenImages',
+    'ThermalCheetah',
+    'thermalDogsAndPeople',
+    'UnoCards',
+    'VehiclesOpenImages',
+    'WildfireSmoke',
+    'websiteScreenshots',
+]
+
+datasets = [
+    dataset_AerialMaritimeDrone_large, dataset_AerialMaritimeDrone_tiled,
+    dataset_AmericanSignLanguageLetters, dataset_Aquarium, dataset_BCCD,
+    dataset_boggleBoards, dataset_brackishUnderwater, dataset_ChessPieces,
+    dataset_CottontailRabbits, dataset_dice, dataset_DroneControl,
+    dataset_EgoHands_generic, dataset_EgoHands_specific,
+    dataset_HardHatWorkers, dataset_MaskWearing, dataset_MountainDewCommercial,
+    dataset_NorthAmericaMushrooms, dataset_openPoetryVision,
+    dataset_OxfordPets_by_breed, dataset_OxfordPets_by_species, dataset_PKLot,
+    dataset_Packages, dataset_PascalVOC, dataset_pistols, dataset_plantdoc,
+    dataset_pothole, dataset_Raccoon, dataset_selfdrivingCar,
+    dataset_ShellfishOpenImages, dataset_ThermalCheetah,
+    dataset_thermalDogsAndPeople, dataset_UnoCards, dataset_VehiclesOpenImages,
+    dataset_WildfireSmoke, dataset_websiteScreenshots
+]
+
+metrics = [
+    val_evaluator_AerialMaritimeDrone_large,
+    val_evaluator_AerialMaritimeDrone_tiled,
+    val_evaluator_AmericanSignLanguageLetters, val_evaluator_Aquarium,
+    val_evaluator_BCCD, val_evaluator_boggleBoards,
+    val_evaluator_brackishUnderwater, val_evaluator_ChessPieces,
+    val_evaluator_CottontailRabbits, val_evaluator_dice,
+    val_evaluator_DroneControl, val_evaluator_EgoHands_generic,
+    val_evaluator_EgoHands_specific, val_evaluator_HardHatWorkers,
+    val_evaluator_MaskWearing, val_evaluator_MountainDewCommercial,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_openPoetryVision,
+    val_evaluator_OxfordPets_by_breed, val_evaluator_OxfordPets_by_species,
+    val_evaluator_PKLot, val_evaluator_Packages, val_evaluator_PascalVOC,
+    val_evaluator_pistols, val_evaluator_plantdoc, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_selfdrivingCar,
+    val_evaluator_ShellfishOpenImages, val_evaluator_ThermalCheetah,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_UnoCards,
+    val_evaluator_VehiclesOpenImages, val_evaluator_WildfireSmoke,
+    val_evaluator_websiteScreenshots
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/odinw/override_category.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/odinw/override_category.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ff05fc6e5e4d0989cf7fcf7af4dc902ee99f3a3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/odinw/override_category.py
@@ -0,0 +1,109 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import mmengine
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Override Category')
+    parser.add_argument('data_root')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    ChessPieces = [{
+        'id': 1,
+        'name': '  ',
+        'supercategory': 'pieces'
+    }, {
+        'id': 2,
+        'name': 'black bishop',
+        'supercategory': 'pieces'
+    }, {
+        'id': 3,
+        'name': 'black king',
+        'supercategory': 'pieces'
+    }, {
+        'id': 4,
+        'name': 'black knight',
+        'supercategory': 'pieces'
+    }, {
+        'id': 5,
+        'name': 'black pawn',
+        'supercategory': 'pieces'
+    }, {
+        'id': 6,
+        'name': 'black queen',
+        'supercategory': 'pieces'
+    }, {
+        'id': 7,
+        'name': 'black rook',
+        'supercategory': 'pieces'
+    }, {
+        'id': 8,
+        'name': 'white bishop',
+        'supercategory': 'pieces'
+    }, {
+        'id': 9,
+        'name': 'white king',
+        'supercategory': 'pieces'
+    }, {
+        'id': 10,
+        'name': 'white knight',
+        'supercategory': 'pieces'
+    }, {
+        'id': 11,
+        'name': 'white pawn',
+        'supercategory': 'pieces'
+    }, {
+        'id': 12,
+        'name': 'white queen',
+        'supercategory': 'pieces'
+    }, {
+        'id': 13,
+        'name': 'white rook',
+        'supercategory': 'pieces'
+    }]
+
+    _data_root = args.data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+    json_data = mmengine.load(_data_root +
+                              'valid/annotations_without_background.json')
+    json_data['categories'] = ChessPieces
+    mmengine.dump(json_data,
+                  _data_root + 'valid/new_annotations_without_background.json')
+
+    CottontailRabbits = [{
+        'id': 1,
+        'name': 'rabbit',
+        'supercategory': 'Cottontail-Rabbit'
+    }]
+
+    _data_root = args.data_root + 'CottontailRabbits/'
+    json_data = mmengine.load(_data_root +
+                              'valid/annotations_without_background.json')
+    json_data['categories'] = CottontailRabbits
+    mmengine.dump(json_data,
+                  _data_root + 'valid/new_annotations_without_background.json')
+
+    NorthAmericaMushrooms = [{
+        'id': 1,
+        'name': 'flat mushroom',
+        'supercategory': 'mushroom'
+    }, {
+        'id': 2,
+        'name': 'yellow mushroom',
+        'supercategory': 'mushroom'
+    }]
+
+    _data_root = args.data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+    json_data = mmengine.load(_data_root +
+                              'valid/annotations_without_background.json')
+    json_data['categories'] = NorthAmericaMushrooms
+    mmengine.dump(json_data,
+                  _data_root + 'valid/new_annotations_without_background.json')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/people_in_painting/grounding_dino_swin-t_finetune_8xb4_50e_people_in_painting.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/people_in_painting/grounding_dino_swin-t_finetune_8xb4_50e_people_in_painting.py
new file mode 100644
index 0000000000000000000000000000000000000000..449d8682f896c3857e6a50b16a13b43acc77ebc2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/people_in_painting/grounding_dino_swin-t_finetune_8xb4_50e_people_in_painting.py
@@ -0,0 +1,109 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+# https://universe.roboflow.com/roboflow-100/people-in-paintings/dataset/2
+data_root = 'data/people_in_painting_v2/'
+class_name = ('Human', )
+palette = [(220, 20, 60)]
+
+metainfo = dict(classes=class_name, palette=palette)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+train_dataloader = dict(
+    sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=10,
+        dataset=dict(
+            type='CocoDataset',
+            data_root=data_root,
+            metainfo=metainfo,
+            filter_cfg=dict(filter_empty_gt=False, min_size=32),
+            pipeline=train_pipeline,
+            return_classes=True,
+            data_prefix=dict(img='train/'),
+            ann_file='train/_annotations.coco.json')))
+
+val_dataloader = dict(
+    dataset=dict(
+        metainfo=metainfo,
+        data_root=data_root,
+        return_classes=True,
+        ann_file='valid/_annotations.coco.json',
+        data_prefix=dict(img='valid/')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'valid/_annotations.coco.json',
+    metric='bbox',
+    format_only=False)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={
+        'absolute_pos_embed': dict(decay_mult=0.),
+        'backbone': dict(lr_mult=0.1)
+    }))
+
+# learning policy
+max_epochs = 5
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[4],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_grefcoco.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_grefcoco.py
new file mode 100644
index 0000000000000000000000000000000000000000..983ffe5c6f3f6e59cf1616a0b22c17f065e08437
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_grefcoco.py
@@ -0,0 +1,170 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    # change this
+    dict(type='RandomFlip', prob=0.0),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='ODVGDataset',
+        data_root=data_root,
+        ann_file='mdetr_annotations/finetune_grefcoco_train_vg.json',
+        data_prefix=dict(img='train2014/'),
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        return_classes=True,
+        pipeline=train_pipeline))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_grefcoco_val.json'
+val_dataset_all_val = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+val_evaluator_all_val = dict(
+    type='gRefCOCOMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    thresh_score=0.7,
+    thresh_f1=1.0)
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_grefcoco_testA.json'
+val_dataset_refcoco_testA = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testA = dict(
+    type='gRefCOCOMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    thresh_score=0.7,
+    thresh_f1=1.0)
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_grefcoco_testB.json'
+val_dataset_refcoco_testB = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testB = dict(
+    type='gRefCOCOMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    thresh_score=0.7,
+    thresh_f1=1.0)
+
+# -------------------------------------------------#
+datasets = [
+    val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB
+]
+dataset_prefixes = ['grefcoco_val', 'grefcoco_testA', 'grefcoco_testB']
+metrics = [
+    val_evaluator_all_val, val_evaluator_refcoco_testA,
+    val_evaluator_refcoco_testB
+]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            # 'language_model': dict(lr_mult=0),
+        }))
+
+# learning policy
+max_epochs = 5
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[3],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_refcoco.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_refcoco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d91af473a239f2f48a09a272d926e00c52da987b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_refcoco.py
@@ -0,0 +1,167 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    # change this
+    dict(type='RandomFlip', prob=0.0),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='ODVGDataset',
+        data_root=data_root,
+        ann_file='mdetr_annotations/finetune_refcoco_train_vg.json',
+        data_prefix=dict(img='train2014/'),
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        return_classes=True,
+        pipeline=train_pipeline))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco_val.json'
+val_dataset_all_val = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+val_evaluator_all_val = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco_testA.json'
+val_dataset_refcoco_testA = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testA = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco_testB.json'
+val_dataset_refcoco_testB = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testB = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+datasets = [
+    val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB
+]
+dataset_prefixes = ['refcoco_val', 'refcoco_testA', 'refcoco_testB']
+metrics = [
+    val_evaluator_all_val, val_evaluator_refcoco_testA,
+    val_evaluator_refcoco_testB
+]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            # 'language_model': dict(lr_mult=0),
+        }))
+
+# learning policy
+max_epochs = 5
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[3],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_refcoco_plus.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_refcoco_plus.py
new file mode 100644
index 0000000000000000000000000000000000000000..871adc8efb48532fb5e0fbfa07e6019c37911712
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_refcoco_plus.py
@@ -0,0 +1,167 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    # change this
+    dict(type='RandomFlip', prob=0.0),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='ODVGDataset',
+        data_root=data_root,
+        ann_file='mdetr_annotations/finetune_refcoco+_train_vg.json',
+        data_prefix=dict(img='train2014/'),
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        return_classes=True,
+        pipeline=train_pipeline))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco+_val.json'
+val_dataset_all_val = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+val_evaluator_all_val = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco+_testA.json'
+val_dataset_refcoco_testA = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testA = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco+_testB.json'
+val_dataset_refcoco_testB = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testB = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+datasets = [
+    val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB
+]
+dataset_prefixes = ['refcoco+_val', 'refcoco+_testA', 'refcoco+_testB']
+metrics = [
+    val_evaluator_all_val, val_evaluator_refcoco_testA,
+    val_evaluator_refcoco_testB
+]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            # 'language_model': dict(lr_mult=0),
+        }))
+
+# learning policy
+max_epochs = 5
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[3],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_refcocog.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_refcocog.py
new file mode 100644
index 0000000000000000000000000000000000000000..a351d6f9d123fc8f2000990a5e6d02adbb3eb2fa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_8xb4_5e_refcocog.py
@@ -0,0 +1,145 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    # change this
+    dict(type='RandomFlip', prob=0.0),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=85,
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='ODVGDataset',
+        data_root=data_root,
+        ann_file='mdetr_annotations/finetune_refcocog_train_vg.json',
+        data_prefix=dict(img='train2014/'),
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        return_classes=True,
+        pipeline=train_pipeline))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcocog_val.json'
+val_dataset_all_val = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+val_evaluator_all_val = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcocog_test.json'
+val_dataset_refcoco_test = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=_base_.test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_test = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+datasets = [val_dataset_all_val, val_dataset_refcoco_test]
+dataset_prefixes = ['refcocog_val', 'refcocog_test']
+metrics = [val_evaluator_all_val, val_evaluator_refcoco_test]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            # 'language_model': dict(lr_mult=0),
+        }))
+
+# learning policy
+max_epochs = 5
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[3],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py
new file mode 100644
index 0000000000000000000000000000000000000000..437d71c6b357eda85d13b5efd4c81d4d32f91120
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py
@@ -0,0 +1,228 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+# 30 is an empirical value, just set it to the maximum value
+# without affecting the evaluation result
+model = dict(test_cfg=dict(max_per_img=30))
+
+data_root = 'data/coco/'
+
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile', backend_args=None,
+        imdecode_backend='pillow'),
+    dict(
+        type='FixScaleResize',
+        scale=(800, 1333),
+        keep_ratio=True,
+        backend='pillow'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'text', 'custom_entities',
+                   'tokens_positive'))
+]
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/final_refexp_val.json'
+val_dataset_all_val = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+val_evaluator_all_val = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco_testA.json'
+val_dataset_refcoco_testA = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testA = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco_testB.json'
+val_dataset_refcoco_testB = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_testB = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco+_testA.json'
+val_dataset_refcoco_plus_testA = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_plus_testA = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcoco+_testB.json'
+val_dataset_refcoco_plus_testB = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcoco_plus_testB = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_refcocog_test.json'
+val_dataset_refcocog_test = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_refcocog_test = dict(
+    type='RefExpMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    topk=(1, 5, 10))
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_grefcoco_val.json'
+val_dataset_grefcoco_val = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_grefcoco_val = dict(
+    type='gRefCOCOMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    thresh_score=0.7,
+    thresh_f1=1.0)
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_grefcoco_testA.json'
+val_dataset_grefcoco_testA = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_grefcoco_testA = dict(
+    type='gRefCOCOMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    thresh_score=0.7,
+    thresh_f1=1.0)
+
+# -------------------------------------------------#
+ann_file = 'mdetr_annotations/finetune_grefcoco_testB.json'
+val_dataset_grefcoco_testB = dict(
+    type='MDETRStyleRefCocoDataset',
+    data_root=data_root,
+    ann_file=ann_file,
+    data_prefix=dict(img='train2014/'),
+    test_mode=True,
+    return_classes=True,
+    pipeline=test_pipeline,
+    backend_args=None)
+
+val_evaluator_grefcoco_testB = dict(
+    type='gRefCOCOMetric',
+    ann_file=data_root + ann_file,
+    metric='bbox',
+    iou_thrs=0.5,
+    thresh_score=0.7,
+    thresh_f1=1.0)
+
+# -------------------------------------------------#
+datasets = [
+    val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB,
+    val_dataset_refcoco_plus_testA, val_dataset_refcoco_plus_testB,
+    val_dataset_refcocog_test, val_dataset_grefcoco_val,
+    val_dataset_grefcoco_testA, val_dataset_grefcoco_testB
+]
+dataset_prefixes = [
+    'val', 'refcoco_testA', 'refcoco_testB', 'refcoco+_testA',
+    'refcoco+_testB', 'refcocog_test', 'grefcoco_val', 'grefcoco_testA',
+    'grefcoco_testB'
+]
+metrics = [
+    val_evaluator_all_val, val_evaluator_refcoco_testA,
+    val_evaluator_refcoco_testB, val_evaluator_refcoco_plus_testA,
+    val_evaluator_refcoco_plus_testB, val_evaluator_refcocog_test,
+    val_evaluator_grefcoco_val, val_evaluator_grefcoco_testA,
+    val_evaluator_grefcoco_testB
+]
+
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/rtts/grounding_dino_swin-t_finetune_8xb4_1x_rtts.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/rtts/grounding_dino_swin-t_finetune_8xb4_1x_rtts.py
new file mode 100644
index 0000000000000000000000000000000000000000..95c2be058e2c407fc92de93f4b79ec8b36e25c18
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/rtts/grounding_dino_swin-t_finetune_8xb4_1x_rtts.py
@@ -0,0 +1,106 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/RTTS/'
+class_name = ('bicycle', 'bus', 'car', 'motorbike', 'person')
+palette = [(255, 97, 0), (0, 201, 87), (176, 23, 31), (138, 43, 226),
+           (30, 144, 255)]
+
+metainfo = dict(classes=class_name, palette=palette)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+train_dataloader = dict(
+    sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        _delete_=True,
+        type='CocoDataset',
+        data_root=data_root,
+        metainfo=metainfo,
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        pipeline=train_pipeline,
+        return_classes=True,
+        ann_file='annotations_json/rtts_train.json',
+        data_prefix=dict(img='')))
+
+val_dataloader = dict(
+    dataset=dict(
+        metainfo=metainfo,
+        data_root=data_root,
+        return_classes=True,
+        ann_file='annotations_json/rtts_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations_json/rtts_val.json',
+    metric='bbox',
+    format_only=False)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={
+        'absolute_pos_embed': dict(decay_mult=0.),
+        'backbone': dict(lr_mult=0.1)
+    }))
+
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/mm_grounding_dino/ruod/grounding_dino_swin-t_finetune_8xb4_1x_ruod.py b/mmde/mmdet/.mim/configs/mm_grounding_dino/ruod/grounding_dino_swin-t_finetune_8xb4_1x_ruod.py
new file mode 100644
index 0000000000000000000000000000000000000000..f57682b29d970fb6d46c2f459f773b03e803695d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/mm_grounding_dino/ruod/grounding_dino_swin-t_finetune_8xb4_1x_ruod.py
@@ -0,0 +1,108 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/RUOD/'
+class_name = ('holothurian', 'echinus', 'scallop', 'starfish', 'fish',
+              'corals', 'diver', 'cuttlefish', 'turtle', 'jellyfish')
+palette = [(235, 211, 70), (106, 90, 205), (160, 32, 240), (176, 23, 31),
+           (142, 0, 0), (230, 0, 0), (106, 0, 228), (60, 100, 0), (80, 100, 0),
+           (70, 0, 0)]
+
+metainfo = dict(classes=class_name, palette=palette)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities'))
+]
+
+train_dataloader = dict(
+    sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        _delete_=True,
+        type='CocoDataset',
+        data_root=data_root,
+        metainfo=metainfo,
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        pipeline=train_pipeline,
+        return_classes=True,
+        ann_file='RUOD_ANN/instances_train.json',
+        data_prefix=dict(img='RUOD_pic/train/')))
+
+val_dataloader = dict(
+    dataset=dict(
+        metainfo=metainfo,
+        data_root=data_root,
+        return_classes=True,
+        ann_file='RUOD_ANN/instances_test.json',
+        data_prefix=dict(img='RUOD_pic/test/')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'RUOD_ANN/instances_test.json',
+    metric='bbox',
+    format_only=False)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(custom_keys={
+        'absolute_pos_embed': dict(decay_mult=0.),
+        'backbone': dict(lr_mult=0.1)
+    }))
+
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[11],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/mmde/mmdet/.mim/configs/ms_rcnn/metafile.yml b/mmde/mmdet/.mim/configs/ms_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..290f05436949c68d226d8bc2f107e480acbd6b4c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ms_rcnn/metafile.yml
@@ -0,0 +1,159 @@
+Collections:
+  - Name: Mask Scoring R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RPN
+        - FPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/1903.00241
+      Title: 'Mask Scoring R-CNN'
+    README: configs/ms_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/mask_scoring_rcnn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: ms-rcnn_r50-caffe_fpn_1x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848-61c9355e.pth
+
+  - Name: ms-rcnn_r50-caffe_fpn_2x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_bbox_mAP-0.388__segm_mAP-0.363_20200506_004738-ee87b137.pth
+
+  - Name: ms-rcnn_r101-caffe_fpn_1x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.404__segm_mAP-0.376_20200506_004755-b9b12a37.pth
+
+  - Name: ms-rcnn_r101-caffe_fpn_2x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_bbox_mAP-0.411__segm_mAP-0.381_20200506_011134-5f3cc74f.pth
+
+  - Name: ms-rcnn_x101-32x4d_fpn_1x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      inference time (ms/im):
+        - value: 90.91
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206-81fd1740.pth
+
+  - Name: ms-rcnn_x101-64x4d_fpn_1x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 11.0
+      inference time (ms/im):
+        - value: 125
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206-86ba88d2.pth
+
+  - Name: ms-rcnn_x101-64x4d_fpn_2x_coco
+    In Collection: Mask Scoring R-CNN
+    Config: configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 11.0
+      inference time (ms/im):
+        - value: 125
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308-02a445e2.pth
diff --git a/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ff4f2d66ae6de88ba9d5d8fb5cf31abaa4cb3c5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './ms-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..54b29e4f7aea547e2b26782b71ada8053930d325
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './ms-rcnn_r101-caffe_fpn_1x_coco.py'
+# learning policy
+max_epochs = 24
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7fbc51f1ba431ca7c22ff3d2c74cfc9e1263ffb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    type='MaskScoringRCNN',
+    roi_head=dict(
+        type='MaskScoringRoIHead',
+        mask_iou_head=dict(
+            type='MaskIoUHead',
+            num_convs=4,
+            num_fcs=2,
+            roi_feat_size=14,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            num_classes=80)),
+    # model training and testing settings
+    train_cfg=dict(rcnn=dict(mask_thr_binary=0.5)))
diff --git a/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..033488229220e5b044c30c43f5e72f8468f68224
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './ms-rcnn_r50-caffe_fpn_1x_coco.py'
+# learning policy
+max_epochs = 24
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ae47d1c38daa4430de4b4264bbb2aef0eb7f7ea
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    type='MaskScoringRCNN',
+    roi_head=dict(
+        type='MaskScoringRoIHead',
+        mask_iou_head=dict(
+            type='MaskIoUHead',
+            num_convs=4,
+            num_fcs=2,
+            roi_feat_size=14,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            num_classes=80)),
+    # model training and testing settings
+    train_cfg=dict(rcnn=dict(mask_thr_binary=0.5)))
diff --git a/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a5d0d0f3188e8e661cc9ab7a731fc631dd950ac
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ms-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..16290076c07d7a97108b89e4a41b5ff51cbbcdc1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './ms-rcnn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aec1874394692a63dc8caeef2609cf01b7bfd7c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './ms-rcnn_x101-64x4d_fpn_1x_coco.py'
+# learning policy
+max_epochs = 24
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/nas_fcos/metafile.yml b/mmde/mmdet/.mim/configs/nas_fcos/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..02292a41516b6b2d5ab87e629f2bd2672e61e0fb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/nas_fcos/metafile.yml
@@ -0,0 +1,44 @@
+Collections:
+  - Name: NAS-FCOS
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 4x V100 GPUs
+      Architecture:
+        - FPN
+        - NAS-FCOS
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1906.04423
+      Title: 'NAS-FCOS: Fast Neural Architecture Search for Object Detection'
+    README: configs/nas_fcos/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/detectors/nasfcos.py#L6
+      Version: v2.1.0
+
+Models:
+  - Name: nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco
+    In Collection: NAS-FCOS
+    Config: configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520-1bdba3ce.pth
+
+  - Name: nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco
+    In Collection: NAS-FCOS
+    Config: configs/nas_fcos/nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521-7fdcbce0.pth
diff --git a/mmde/mmdet/.mim/configs/nas_fcos/nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py b/mmde/mmdet/.mim/configs/nas_fcos/nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba207c9fbdddc5cd30e4d4d86add2c98664e7ffb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/nas_fcos/nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py
@@ -0,0 +1,75 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='NASFCOS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False, eps=0),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=dict(
+        type='NASFCOS_FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5,
+        norm_cfg=dict(type='BN'),
+        conv_cfg=dict(type='DCNv2', deform_groups=2)),
+    bbox_head=dict(
+        type='FCOSHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        norm_cfg=dict(type='GN', num_groups=32),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# dataset settings
+train_dataloader = dict(batch_size=4, num_workers=2)
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
diff --git a/mmde/mmdet/.mim/configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py b/mmde/mmdet/.mim/configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..329f34c45ca0ea3f95e8da8505717df86b7c79c0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py
@@ -0,0 +1,74 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='NASFCOS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False, eps=0),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    neck=dict(
+        type='NASFCOS_FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5,
+        norm_cfg=dict(type='BN'),
+        conv_cfg=dict(type='DCNv2', deform_groups=2)),
+    bbox_head=dict(
+        type='NASFCOSHead',
+        num_classes=80,
+        in_channels=256,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        norm_cfg=dict(type='GN', num_groups=32),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# dataset settings
+train_dataloader = dict(batch_size=4, num_workers=2)
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.))
diff --git a/mmde/mmdet/.mim/configs/nas_fpn/metafile.yml b/mmde/mmdet/.mim/configs/nas_fpn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..aef0df6d7f38c71d691526004c0f1d19d66744b0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/nas_fpn/metafile.yml
@@ -0,0 +1,59 @@
+Collections:
+  - Name: NAS-FPN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - NAS-FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.07392
+      Title: 'NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection'
+    README: configs/nas_fpn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/nas_fpn.py#L67
+      Version: v2.0.0
+
+Models:
+  - Name: retinanet_r50_fpn_crop640-50e_coco
+    In Collection: NAS-FPN
+    Config: configs/nas_fpn/retinanet_r50_fpn_crop640-50e_coco.py
+    Metadata:
+      Training Memory (GB): 12.9
+      inference time (ms/im):
+        - value: 43.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco-9b953d76.pth
+
+  - Name: retinanet_r50_nasfpn_crop640-50e_coco
+    In Collection: NAS-FPN
+    Config: configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py
+    Metadata:
+      Training Memory (GB): 13.2
+      inference time (ms/im):
+        - value: 43.48
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 50
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco-0ad1f644.pth
diff --git a/mmde/mmdet/.mim/configs/nas_fpn/retinanet_r50_fpn_crop640-50e_coco.py b/mmde/mmdet/.mim/configs/nas_fpn/retinanet_r50_fpn_crop640-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..11c34f6758a4862571e3f840424341c3964115be
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/nas_fpn/retinanet_r50_fpn_crop640-50e_coco.py
@@ -0,0 +1,78 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=64,
+        batch_augments=[dict(type='BatchFixedSizePad', size=(640, 640))]),
+    backbone=dict(norm_eval=False),
+    neck=dict(
+        relu_before_extra_convs=True,
+        no_norm_on_lateral=True,
+        norm_cfg=norm_cfg),
+    bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg),
+    # training and testing settings
+    train_cfg=dict(assigner=dict(neg_iou_thr=0.5)))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=(640, 640),
+        ratio_range=(0.8, 1.2),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(640, 640), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# training schedule for 50e
+max_epochs = 50
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[30, 40],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001),
+    paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True))
+
+env_cfg = dict(cudnn_benchmark=True)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py b/mmde/mmdet/.mim/configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a851b745defb72aa05df289a3002c1534655d118
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py
@@ -0,0 +1,16 @@
+_base_ = './retinanet_r50_fpn_crop640-50e_coco.py'
+
+# model settings
+model = dict(
+    # `pad_size_divisor=128` ensures the feature maps sizes
+    # in `NAS_FPN` won't mismatch.
+    data_preprocessor=dict(pad_size_divisor=128),
+    neck=dict(
+        _delete_=True,
+        type='NASFPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5,
+        stack_times=7,
+        start_level=1,
+        norm_cfg=dict(type='BN', requires_grad=True)))
diff --git a/mmde/mmdet/.mim/configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py b/mmde/mmdet/.mim/configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff7d0a360b95b1a72f779a8f7ad22a7e03235720
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py
@@ -0,0 +1,49 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/objects365v2_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(norm_cfg=dict(type='SyncBN', requires_grad=True)),
+    roi_head=dict(bbox_head=dict(num_classes=365)))
+
+# training schedule for 1350K
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=1350000,  # 36 epochs
+    val_interval=150000)
+
+# Using 8 GPUS while training
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning rate policy
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=1350000,
+        by_epoch=False,
+        milestones=[900000, 1200000],
+        gamma=0.1)
+]
+
+train_dataloader = dict(sampler=dict(type='InfiniteSampler'))
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=150000))
+
+log_processor = dict(by_epoch=False)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py b/mmde/mmdet/.mim/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc0d96fa22920a34f9ab9437a0f15cc93f46d0fa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py
@@ -0,0 +1,39 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/objects365v1_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(roi_head=dict(bbox_head=dict(num_classes=365)))
+
+train_dataloader = dict(
+    batch_size=4,  # using 16 GPUS while training. total batch size is 16 x 4)
+)
+
+# Using 32 GPUS while training
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py b/mmde/mmdet/.mim/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..1090678f652444c82a627fbf8bdda39fe0077f1e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py
@@ -0,0 +1,39 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/objects365v2_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(roi_head=dict(bbox_head=dict(num_classes=365)))
+
+train_dataloader = dict(
+    batch_size=4,  # using 16 GPUS while training. total batch size is 16 x 4)
+)
+
+# Using 32 GPUS while training
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/objects365/metafile.yml b/mmde/mmdet/.mim/configs/objects365/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d43e8bde9d2aad9516f5383cd4152faf8f097660
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/objects365/metafile.yml
@@ -0,0 +1,101 @@
+- Name: retinanet_r50_fpn_1x_objects365v1
+  In Collection: RetinaNet
+  Config: configs/objects365/retinanet_r50_fpn_1x_objects365v1.py
+  Metadata:
+    Training Memory (GB): 7.4
+    Epochs: 12
+    Training Data: Objects365 v1
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v1
+    Metrics:
+      box AP: 14.8
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v1/retinanet_r50_fpn_1x_obj365v1_20221219_181859-ba3e3dd5.pth
+
+- Name: retinanet_r50-syncbn_fpn_1350k_objects365v1
+  In Collection: RetinaNet
+  Config: configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py
+  Metadata:
+    Training Memory (GB): 7.6
+    Iterations: 1350000
+    Training Data: Objects365 v1
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v1
+    Metrics:
+      box AP: 18.0
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1/retinanet_r50_fpn_syncbn_1350k_obj365v1_20220513_111237-7517c576.pth
+
+- Name: retinanet_r50_fpn_1x_objects365v2
+  In Collection: RetinaNet
+  Config: configs/objects365/retinanet_r50_fpn_1x_objects365v2.py
+  Metadata:
+    Training Memory (GB): 7.2
+    Epochs: 12
+    Training Data: Objects365 v2
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v2
+    Metrics:
+      box AP: 16.7
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v2/retinanet_r50_fpn_1x_obj365v2_20221223_122105-d9b191f1.pth
+
+- Name: faster-rcnn_r50_fpn_16xb4-1x_objects365v1
+  In Collection: Faster R-CNN
+  Config: configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py
+  Metadata:
+    Training Memory (GB): 11.4
+    Epochs: 12
+    Training Data: Objects365 v1
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v1
+    Metrics:
+      box AP: 19.6
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1/faster_rcnn_r50_fpn_16x4_1x_obj365v1_20221219_181226-9ff10f95.pth
+
+- Name: faster-rcnn_r50-syncbn_fpn_1350k_objects365v1
+  In Collection: Faster R-CNN
+  Config: configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py
+  Metadata:
+    Training Memory (GB): 8.6
+    Iterations: 1350000
+    Training Data: Objects365 v1
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v1
+    Metrics:
+      box AP: 22.3
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1_20220510_142457-337d8965.pth
+
+- Name: faster-rcnn_r50_fpn_16xb4-1x_objects365v2
+  In Collection: Faster R-CNN
+  Config: configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py
+  Metadata:
+    Training Memory (GB): 10.8
+    Epochs: 12
+    Training Data: Objects365 v1
+    Training Techniques:
+      - SGD with Momentum
+      - Weight Decay
+  Results:
+  - Task: Object Detection
+    Dataset: Objects365 v2
+    Metrics:
+      box AP: 19.8
+  Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2/faster_rcnn_r50_fpn_16x4_1x_obj365v2_20221220_175040-5910b015.pth
diff --git a/mmde/mmdet/.mim/configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py b/mmde/mmdet/.mim/configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..c41dfce8bc67e7f4d18434a2c10a33c66da403c1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py
@@ -0,0 +1,49 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/objects365v2_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(norm_cfg=dict(type='SyncBN', requires_grad=True)),
+    bbox_head=dict(num_classes=365))
+
+# training schedule for 1350K
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=1350000,  # 36 epochs
+    val_interval=150000)
+
+# Using 8 GPUS while training
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning rate policy
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=10000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=1350000,
+        by_epoch=False,
+        milestones=[900000, 1200000],
+        gamma=0.1)
+]
+
+train_dataloader = dict(sampler=dict(type='InfiniteSampler'))
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=150000))
+
+log_processor = dict(by_epoch=False)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/objects365/retinanet_r50_fpn_1x_objects365v1.py b/mmde/mmdet/.mim/configs/objects365/retinanet_r50_fpn_1x_objects365v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..72144192aaa36d757053a982ed7ad2a886916b75
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/objects365/retinanet_r50_fpn_1x_objects365v1.py
@@ -0,0 +1,35 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/objects365v1_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(bbox_head=dict(num_classes=365))
+
+# Using 8 GPUS while training
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=10000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/objects365/retinanet_r50_fpn_1x_objects365v2.py b/mmde/mmdet/.mim/configs/objects365/retinanet_r50_fpn_1x_objects365v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..219544126ab0ab6e93d50f1962ffaf40f25b14f0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/objects365/retinanet_r50_fpn_1x_objects365v2.py
@@ -0,0 +1,35 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/objects365v2_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(bbox_head=dict(num_classes=365))
+
+# Using 8 GPUS while training
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=10000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/ocsort/metafile.yml b/mmde/mmdet/.mim/configs/ocsort/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0a31ef108ea7c594d3566970763ff704234d4e0c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ocsort/metafile.yml
@@ -0,0 +1,27 @@
+Collections:
+  - Name: OCSORT
+    Metadata:
+      Training Techniques:
+        - SGD with Momentum
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - YOLOX
+    Paper:
+        URL: https://arxiv.org/abs/2203.14360
+        Title: Observation-Centric SORT Rethinking SORT for Robust Multi-Object Tracking
+    README: configs/ocsort/README.md
+
+Models:
+  - Name: ocsort_yolox_x_crowdhuman_mot17-private-half
+    In Collection: OCSORT
+    Config: configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
+    Metadata:
+      Training Data: CrowdHuman + MOT17-half-train
+    Results:
+      - Task: Multiple Object Tracking
+        Dataset: MOT17-half-val
+        Metrics:
+          HOTA: 67.5
+          MOTA: 77.5
+          IDF1: 78.2
+    Weights: https://download.openmmlab.com/mmtracking/mot/ocsort/mot_dataset/ocsort_yolox_x_crowdhuman_mot17-private-half_20220813_101618-fe150582.pth
diff --git a/mmde/mmdet/.mim/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea04923d6aec237c51b7e23d0348c487cb9d697b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py',  # noqa: E501
+]
+
+model = dict(
+    type='OCSORT',
+    tracker=dict(
+        _delete_=True,
+        type='OCSORTTracker',
+        motion=dict(type='KalmanFilter'),
+        obj_score_thr=0.3,
+        init_track_thr=0.7,
+        weight_iou_with_det_scores=True,
+        match_iou_thr=0.3,
+        num_tentatives=3,
+        vel_consist_weight=0.2,
+        vel_delta_t=3,
+        num_frames_retain=30))
diff --git a/mmde/mmdet/.mim/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py b/mmde/mmdet/.mim/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea04923d6aec237c51b7e23d0348c487cb9d697b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py',  # noqa: E501
+]
+
+model = dict(
+    type='OCSORT',
+    tracker=dict(
+        _delete_=True,
+        type='OCSORTTracker',
+        motion=dict(type='KalmanFilter'),
+        obj_score_thr=0.3,
+        init_track_thr=0.7,
+        weight_iou_with_det_scores=True,
+        match_iou_thr=0.3,
+        num_tentatives=3,
+        vel_consist_weight=0.2,
+        vel_delta_t=3,
+        num_frames_retain=30))
diff --git a/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py b/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py
new file mode 100644
index 0000000000000000000000000000000000000000..e79a92cccb2e432e5dd60bc080dab76781eb32bc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py
@@ -0,0 +1,39 @@
+_base_ = ['faster-rcnn_r50_fpn_32xb2-1x_openimages.py']
+
+model = dict(
+    roi_head=dict(bbox_head=dict(num_classes=500)),
+    test_cfg=dict(rcnn=dict(score_thr=0.01)))
+
+# dataset settings
+dataset_type = 'OpenImagesChallengeDataset'
+train_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='challenge2019/challenge-2019-train-detection-bbox.txt',
+        label_file='challenge2019/cls-label-description.csv',
+        hierarchy_file='challenge2019/class_label_tree.np',
+        meta_file='challenge2019/challenge-2019-train-metas.pkl'))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='challenge2019/challenge-2019-validation-detection-bbox.txt',
+        data_prefix=dict(img='OpenImages/'),
+        label_file='challenge2019/cls-label-description.csv',
+        hierarchy_file='challenge2019/class_label_tree.np',
+        meta_file='challenge2019/challenge-2019-validation-metas.pkl',
+        image_level_ann_file='challenge2019/challenge-2019-validation-'
+        'detection-human-imagelabels.csv'))
+test_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='challenge2019/challenge-2019-validation-detection-bbox.txt',
+        label_file='challenge2019/cls-label-description.csv',
+        hierarchy_file='challenge2019/class_label_tree.np',
+        meta_file='challenge2019/challenge-2019-validation-metas.pkl',
+        image_level_ann_file='challenge2019/challenge-2019-validation-'
+        'detection-human-imagelabels.csv'))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages.py b/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3f0aa0a0ff0ef16cd6e55543a72b5fe405ec5a8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages.py
@@ -0,0 +1,35 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/openimages_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(roi_head=dict(bbox_head=dict(num_classes=601)))
+
+# Using 32 GPUS while training
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 64,
+        by_epoch=False,
+        begin=0,
+        end=26000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py b/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e428725bcc39d2c009a2382c191fa53fe5ce284
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py
@@ -0,0 +1,5 @@
+_base_ = ['faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py']
+
+# Use ClassAwareSampler
+train_dataloader = dict(
+    sampler=dict(_delete_=True, type='ClassAwareSampler', num_sample_class=1))
diff --git a/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py b/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py
new file mode 100644
index 0000000000000000000000000000000000000000..803190abfee63ea87e70dfe1b0fddca02f3556b8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py
@@ -0,0 +1,5 @@
+_base_ = ['faster-rcnn_r50_fpn_32xb2-1x_openimages.py']
+
+# Use ClassAwareSampler
+train_dataloader = dict(
+    sampler=dict(_delete_=True, type='ClassAwareSampler', num_sample_class=1))
diff --git a/mmde/mmdet/.mim/configs/openimages/metafile.yml b/mmde/mmdet/.mim/configs/openimages/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..76c1209471921610f791a074ed7a6863cd0709c0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/openimages/metafile.yml
@@ -0,0 +1,102 @@
+Models:
+  - Name: faster-rcnn_r50_fpn_32x2_1x_openimages
+    In Collection: Faster R-CNN
+    Config: configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages.py
+    Metadata:
+      Training Memory (GB): 7.7
+      Epochs: 12
+      Training Data: Open Images v6
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images v6
+        Metrics:
+          box AP: 51.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159-e87ab7ce.pth
+
+  - Name: retinanet_r50_fpn_32xb2-1x_openimages
+    In Collection: RetinaNet
+    Config: configs/openimages/retinanet_r50_fpn_32xb2-1x_openimages.py
+    Metadata:
+      Training Memory (GB): 6.6
+      Epochs: 12
+      Training Data: Open Images v6
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images v6
+        Metrics:
+          box AP: 61.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954-d2ae5462.pth
+
+  - Name: ssd300_32xb8-36e_openimages
+    In Collection: SSD
+    Config: configs/openimages/ssd300_32xb8-36e_openimages.py
+    Metadata:
+      Training Memory (GB): 10.8
+      Epochs: 36
+      Training Data: Open Images v6
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images v6
+        Metrics:
+          box AP: 35.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232-dce93846.pth
+
+  - Name: faster-rcnn_r50_fpn_32x2_1x_openimages_challenge
+    In Collection: Faster R-CNN
+    Config: configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py
+    Metadata:
+      Training Memory (GB): 7.7
+      Epochs: 12
+      Training Data: Open Images Challenge 2019
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images Challenge 2019
+        Metrics:
+          box AP: 54.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100-0e79e5df.pth
+
+  - Name: faster-rcnn_r50_fpn_32x2_cas_1x_openimages
+    In Collection: Faster R-CNN
+    Config: configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py
+    Metadata:
+      Training Memory (GB): 7.7
+      Epochs: 12
+      Training Data: Open Images Challenge 2019
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images Challenge 2019
+        Metrics:
+          box AP: 60.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424-98c630e5.pth
+
+  - Name: faster-rcnn_r50_fpn_32x2_cas_1x_openimages_challenge
+    In Collection: Faster R-CNN
+    Config: configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py
+    Metadata:
+      Training Memory (GB): 7.1
+      Epochs: 12
+      Training Data: Open Images Challenge 2019
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+    Results:
+      - Task: Object Detection
+        Dataset: Open Images Challenge 2019
+        Metrics:
+          box AP: 65.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021-34c402d9.pth
diff --git a/mmde/mmdet/.mim/configs/openimages/retinanet_r50_fpn_32xb2-1x_openimages.py b/mmde/mmdet/.mim/configs/openimages/retinanet_r50_fpn_32xb2-1x_openimages.py
new file mode 100644
index 0000000000000000000000000000000000000000..97a0eb075c730ceeaa494190e0b8369706c7d7c3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/openimages/retinanet_r50_fpn_32xb2-1x_openimages.py
@@ -0,0 +1,35 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/openimages_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(bbox_head=dict(num_classes=601))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 64,
+        by_epoch=False,
+        begin=0,
+        end=26000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/openimages/ssd300_32xb8-36e_openimages.py b/mmde/mmdet/.mim/configs/openimages/ssd300_32xb8-36e_openimages.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cb51cae00a8707c0a901b99620851132e9eaccf
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/openimages/ssd300_32xb8-36e_openimages.py
@@ -0,0 +1,88 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/openimages_detection.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py'
+]
+model = dict(
+    bbox_head=dict(
+        num_classes=601,
+        anchor_generator=dict(basesize_ratio_range=(0.2, 0.9))))
+# dataset settings
+dataset_type = 'OpenImagesDataset'
+data_root = 'data/OpenImages/'
+input_size = 300
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='Expand',
+        mean={{_base_.model.data_preprocessor.mean}},
+        to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}},
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    # avoid bboxes being resized
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'instances'))
+]
+
+train_dataloader = dict(
+    batch_size=8,  # using 32 GPUS while training. total batch size is 32 x 8
+    batch_sampler=None,
+    dataset=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=3,  # repeat 3 times, total epochs are 12 x 3
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/oidv6-train-annotations-bbox.csv',
+            data_prefix=dict(img='OpenImages/train/'),
+            label_file='annotations/class-descriptions-boxable.csv',
+            hierarchy_file='annotations/bbox_labels_600_hierarchy.json',
+            meta_file='annotations/train-image-metas.pkl',
+            pipeline=train_pipeline)))
+val_dataloader = dict(batch_size=8, dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(batch_size=8, dataset=dict(pipeline=test_pipeline))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.04, momentum=0.9, weight_decay=5e-4))
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.001,
+        by_epoch=False,
+        begin=0,
+        end=20000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=256)
diff --git a/mmde/mmdet/.mim/configs/paa/metafile.yml b/mmde/mmdet/.mim/configs/paa/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..078b974971d3a3faf537cc52937278488923667e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/paa/metafile.yml
@@ -0,0 +1,111 @@
+Collections:
+  - Name: PAA
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - Probabilistic Anchor Assignment
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2007.08103
+      Title: 'Probabilistic Anchor Assignment with IoU Prediction for Object Detection'
+    README: configs/paa/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/detectors/paa.py#L6
+      Version: v2.4.0
+
+Models:
+  - Name: paa_r50_fpn_1x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth
+
+  - Name: paa_r50_fpn_1.5x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r50_fpn_1.5x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      Epochs: 18
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.pth
+
+  - Name: paa_r50_fpn_2x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r50_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.pth
+
+  - Name: paa_r50_fpn_mstrain_3x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r50_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.7
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722-06a6880b.pth
+
+  - Name: paa_r101_fpn_1x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth
+
+  - Name: paa_r101_fpn_2x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.pth
+
+  - Name: paa_r101_fpn_mstrain_3x_coco
+    In Collection: PAA
+    Config: configs/paa/paa_r101_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202-83250d22.pth
diff --git a/mmde/mmdet/.mim/configs/paa/paa_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/paa/paa_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..94f1c278dc16c1befbca510ca0ac5ba407969f6d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/paa/paa_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './paa_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/paa/paa_r101_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/paa/paa_r101_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6136f3bb404df6a6fc18536e6770116738af6c7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/paa/paa_r101_fpn_2x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './paa_r101_fpn_1x_coco.py'
+max_epochs = 24
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+# training schedule for 2x
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/paa/paa_r101_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/paa/paa_r101_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8529dcdb90adb2b02162f4d2268088f5f376fcb0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/paa/paa_r101_fpn_ms-3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './paa_r50_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_1.5x_coco.py b/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_1.5x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae993b5c4370c8fc3e450f84fb7058528b853727
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_1.5x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './paa_r50_fpn_1x_coco.py'
+max_epochs = 18
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[12, 16],
+        gamma=0.1)
+]
+
+# training schedule for 1.5x
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f806a3ea65ffb9ee8b898122fb678b94ef212637
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_1x_coco.py
@@ -0,0 +1,80 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='PAA',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='PAAHead',
+        reg_decoded_bbox=True,
+        score_voting=True,
+        topk=9,
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.3),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.1,
+            neg_iou_thr=0.1,
+            min_pos_iou=0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6908e4eb97fcfa92a20d486ceab9a7ddfaf480b7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_2x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './paa_r50_fpn_1x_coco.py'
+max_epochs = 24
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+# training schedule for 2x
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..fed8b90a0fde7a1d344160a6658be04d1f9c654e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/paa/paa_r50_fpn_ms-3x_coco.py
@@ -0,0 +1,29 @@
+_base_ = './paa_r50_fpn_1x_coco.py'
+max_epochs = 36
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
+
+# training schedule for 3x
+train_cfg = dict(max_epochs=max_epochs)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py b/mmde/mmdet/.mim/configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1452baeca7e680b11f9b2ec654abe689d3e53042
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    neck=dict(
+        type='PAFPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/pafpn/metafile.yml b/mmde/mmdet/.mim/configs/pafpn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7772d276ab6f0da685ed8ea5e58efd8fc5164529
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pafpn/metafile.yml
@@ -0,0 +1,38 @@
+Collections:
+  - Name: PAFPN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - PAFPN
+    Paper:
+      URL: https://arxiv.org/abs/1803.01534
+      Title: 'Path Aggregation Network for Instance Segmentation'
+    README: configs/pafpn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/pafpn.py#L11
+      Version: v2.0.0
+
+Models:
+  - Name: faster-rcnn_r50_pafpn_1x_coco
+    In Collection: PAFPN
+    Config: configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 58.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_bbox_mAP-0.375_20200503_105836-b7b4b9bd.pth
diff --git a/mmde/mmdet/.mim/configs/panoptic_fpn/metafile.yml b/mmde/mmdet/.mim/configs/panoptic_fpn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c99275ec3f37f47db756b96a4603c466d5fbd946
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/panoptic_fpn/metafile.yml
@@ -0,0 +1,70 @@
+Collections:
+  - Name: PanopticFPN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - PanopticFPN
+    Paper:
+      URL: https://arxiv.org/pdf/1901.02446
+      Title: 'Panoptic feature pyramid networks'
+    README: configs/panoptic_fpn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/detectors/panoptic_fpn.py#L7
+      Version: v2.16.0
+
+Models:
+  - Name: panoptic_fpn_r50_fpn_1x_coco
+    In Collection: PanopticFPN
+    Config: configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      Epochs: 12
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth
+
+  - Name: panoptic_fpn_r50_fpn_mstrain_3x_coco
+    In Collection: PanopticFPN
+    Config: configs/panoptic_fpn/panoptic-fpn_r50_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      Epochs: 36
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 42.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155-5650f98b.pth
+
+  - Name: panoptic_fpn_r101_fpn_1x_coco
+    In Collection: PanopticFPN
+    Config: configs/panoptic_fpn/panoptic-fpn_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      Epochs: 12
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 42.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth
+
+  - Name: panoptic_fpn_r101_fpn_mstrain_3x_coco
+    In Collection: PanopticFPN
+    Config: configs/panoptic_fpn/panoptic-fpn_r101_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.5
+      Epochs: 36
+    Results:
+    - Task: Panoptic Segmentation
+      Dataset: COCO
+      Metrics:
+        PQ: 44.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712-9c99acc4.pth
diff --git a/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b960254ef5ecfac1de790a66a5378535114e9ba3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './panoptic-fpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r101_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r101_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..268782ee2cca31796e43423300319176556cfef7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r101_fpn_ms-3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './panoptic-fpn_r50_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2c89ef520124a43c910b35a4808153e4c455d3a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py
@@ -0,0 +1,45 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_panoptic.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='PanopticFPN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32,
+        pad_mask=True,
+        mask_pad_value=0,
+        pad_seg=True,
+        seg_pad_value=255),
+    semantic_head=dict(
+        type='PanopticFPNHead',
+        num_things_classes=80,
+        num_stuff_classes=53,
+        in_channels=256,
+        inner_channels=128,
+        start_level=0,
+        end_level=4,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+        conv_cfg=None,
+        loss_seg=dict(
+            type='CrossEntropyLoss', ignore_index=255, loss_weight=0.5)),
+    panoptic_fusion_head=dict(
+        type='HeuristicFusionHead',
+        num_things_classes=80,
+        num_stuff_classes=53),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.6,
+            nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True),
+            max_per_img=100,
+            mask_thr_binary=0.5),
+        # used in HeuristicFusionHead
+        panoptic=dict(mask_overlap=0.5, stuff_area_limit=4096)))
+
+# Forced to remove NumClassCheckHook
+custom_hooks = []
diff --git a/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r50_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r50_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b18a8f8dd7eb6c49e277346ffe71c6e36c9d3b68
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/panoptic_fpn/panoptic-fpn_r50_fpn_ms-3x_coco.py
@@ -0,0 +1,35 @@
+_base_ = './panoptic-fpn_r50_fpn_1x_coco.py'
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadPanopticAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        with_seg=True),
+    dict(
+        type='RandomResize', scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# TODO: Use RepeatDataset to speed up training
+# training schedule for 3x
+train_cfg = dict(max_epochs=36, val_interval=3)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[24, 33],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/pascal_voc/faster-rcnn_r50-caffe-c4_ms-18k_voc0712.py b/mmde/mmdet/.mim/configs/pascal_voc/faster-rcnn_r50-caffe-c4_ms-18k_voc0712.py
new file mode 100644
index 0000000000000000000000000000000000000000..dddc0bbdf33948478e11bb701f844a8473ddf165
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pascal_voc/faster-rcnn_r50-caffe-c4_ms-18k_voc0712.py
@@ -0,0 +1,86 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50-caffe-c4.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/datasets/voc0712.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(roi_head=dict(bbox_head=dict(num_classes=20)))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 480), (1333, 512), (1333, 544), (1333, 576),
+                (1333, 608), (1333, 640), (1333, 672), (1333, 704),
+                (1333, 736), (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # avoid bboxes being resized
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    sampler=dict(type='InfiniteSampler', shuffle=True),
+    dataset=dict(
+        _delete_=True,
+        type='ConcatDataset',
+        datasets=[
+            dict(
+                type='VOCDataset',
+                data_root={{_base_.data_root}},
+                ann_file='VOC2007/ImageSets/Main/trainval.txt',
+                data_prefix=dict(sub_data_root='VOC2007/'),
+                filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                pipeline=train_pipeline,
+                backend_args={{_base_.backend_args}}),
+            dict(
+                type='VOCDataset',
+                data_root={{_base_.data_root}},
+                ann_file='VOC2012/ImageSets/Main/trainval.txt',
+                data_prefix=dict(sub_data_root='VOC2012/'),
+                filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                pipeline=train_pipeline,
+                backend_args={{_base_.backend_args}})
+        ]))
+
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# training schedule for 18k
+max_iter = 18000
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=max_iter,
+    val_interval=3000)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=100),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[12000, 16000],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=3000))
+log_processor = dict(by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712-cocofmt.py b/mmde/mmdet/.mim/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712-cocofmt.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b0aa41d67fc4edfde6d534e2e54a135f5de6e44
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712-cocofmt.py
@@ -0,0 +1,100 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/datasets/voc0712.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(roi_head=dict(bbox_head=dict(num_classes=20)))
+
+METAINFO = {
+    'classes':
+    ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
+     'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
+     'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'),
+    # palette is a list of color tuples, which is used for visualization.
+    'palette': [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192),
+                (197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255),
+                (153, 69, 1), (120, 166, 157), (0, 182, 199), (0, 226, 252),
+                (182, 182, 255), (0, 0, 230), (220, 20, 60), (163, 255, 0),
+                (0, 82, 0), (3, 95, 161), (0, 80, 100), (183, 130, 88)]
+}
+
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/VOCdevkit/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1000, 600), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1000, 600), keep_ratio=True),
+    # avoid bboxes being resized
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    dataset=dict(
+        type='RepeatDataset',
+        times=3,
+        dataset=dict(
+            _delete_=True,
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/voc0712_trainval.json',
+            data_prefix=dict(img=''),
+            metainfo=METAINFO,
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args={{_base_.backend_args}})))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='annotations/voc07_test.json',
+        data_prefix=dict(img=''),
+        metainfo=METAINFO,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/voc07_test.json',
+    metric='bbox',
+    format_only=False,
+    backend_args={{_base_.backend_args}})
+test_evaluator = val_evaluator
+
+# training schedule, the dataset is repeated 3 times, so the
+# actual epoch = 4 * 3 = 12
+max_epochs = 4
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[3],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712.py b/mmde/mmdet/.mim/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712.py
new file mode 100644
index 0000000000000000000000000000000000000000..07391667b35c9db9e352a03624411bb568f5396a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712.py
@@ -0,0 +1,35 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/datasets/voc0712.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(roi_head=dict(bbox_head=dict(num_classes=20)))
+
+# training schedule, voc dataset is repeated 3 times, in
+# `_base_/datasets/voc0712.py`, so the actual epoch = 4 * 3 = 12
+max_epochs = 4
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[3],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py b/mmde/mmdet/.mim/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py
new file mode 100644
index 0000000000000000000000000000000000000000..c86a6f199c9317804692189975f3abaff24f6aff
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py
@@ -0,0 +1,34 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py', '../_base_/datasets/voc0712.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(bbox_head=dict(num_classes=20))
+
+# training schedule, voc dataset is repeated 3 times, in
+# `_base_/datasets/voc0712.py`, so the actual epoch = 4 * 3 = 12
+max_epochs = 4
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[3],
+        gamma=0.1)
+]
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/pascal_voc/ssd300_voc0712.py b/mmde/mmdet/.mim/configs/pascal_voc/ssd300_voc0712.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff7a1368b76aa53700bd81a912b54e84ab58e53a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pascal_voc/ssd300_voc0712.py
@@ -0,0 +1,102 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/voc0712.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    bbox_head=dict(
+        num_classes=20, anchor_generator=dict(basesize_ratio_range=(0.2,
+                                                                    0.9))))
+# dataset settings
+dataset_type = 'VOCDataset'
+data_root = 'data/VOCdevkit/'
+input_size = 300
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean={{_base_.model.data_preprocessor.mean}},
+        to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}},
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    # avoid bboxes being resized
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=3,
+    dataset=dict(  # RepeatDataset
+        # the dataset is repeated 10 times, and the training schedule is 2x,
+        # so the actual epoch = 12 * 10 = 120.
+        times=10,
+        dataset=dict(  # ConcatDataset
+            # VOCDataset will add different `dataset_type` in dataset.metainfo,
+            # which will get error if using ConcatDataset. Adding
+            # `ignore_keys` can avoid this error.
+            ignore_keys=['dataset_type'],
+            datasets=[
+                dict(
+                    type=dataset_type,
+                    data_root=data_root,
+                    ann_file='VOC2007/ImageSets/Main/trainval.txt',
+                    data_prefix=dict(sub_data_root='VOC2007/'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    pipeline=train_pipeline),
+                dict(
+                    type=dataset_type,
+                    data_root=data_root,
+                    ann_file='VOC2012/ImageSets/Main/trainval.txt',
+                    data_prefix=dict(sub_data_root='VOC2012/'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    pipeline=train_pipeline)
+            ])))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 20],
+        gamma=0.1)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/pascal_voc/ssd512_voc0712.py b/mmde/mmdet/.mim/configs/pascal_voc/ssd512_voc0712.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c4dc8a3eec86ccced7d44120b254463d18c00f5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pascal_voc/ssd512_voc0712.py
@@ -0,0 +1,82 @@
+_base_ = 'ssd300_voc0712.py'
+
+input_size = 512
+model = dict(
+    neck=dict(
+        out_channels=(512, 1024, 512, 256, 256, 256, 256),
+        level_strides=(2, 2, 2, 2, 1),
+        level_paddings=(1, 1, 1, 1, 1),
+        last_kernel_size=4),
+    bbox_head=dict(
+        in_channels=(512, 1024, 512, 256, 256, 256, 256),
+        anchor_generator=dict(
+            input_size=input_size,
+            strides=[8, 16, 32, 64, 128, 256, 512],
+            basesize_ratio_range=(0.15, 0.9),
+            ratios=([2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]))))
+
+# dataset settings
+dataset_type = 'VOCDataset'
+data_root = 'data/VOCdevkit/'
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean={{_base_.model.data_preprocessor.mean}},
+        to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}},
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    # avoid bboxes being resized
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=3,
+    dataset=dict(  # RepeatDataset
+        # the dataset is repeated 10 times, and the training schedule is 2x,
+        # so the actual epoch = 12 * 10 = 120.
+        times=10,
+        dataset=dict(  # ConcatDataset
+            # VOCDataset will add different `dataset_type` in dataset.metainfo,
+            # which will get error if using ConcatDataset. Adding
+            # `ignore_keys` can avoid this error.
+            ignore_keys=['dataset_type'],
+            datasets=[
+                dict(
+                    type=dataset_type,
+                    data_root=data_root,
+                    ann_file='VOC2007/ImageSets/Main/trainval.txt',
+                    data_prefix=dict(sub_data_root='VOC2007/'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    pipeline=train_pipeline),
+                dict(
+                    type=dataset_type,
+                    data_root=data_root,
+                    ann_file='VOC2012/ImageSets/Main/trainval.txt',
+                    data_prefix=dict(sub_data_root='VOC2012/'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    pipeline=train_pipeline)
+            ])))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py b/mmde/mmdet/.mim/configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..237a3b13aa5e61f04579670af01df8f481d80dd1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    roi_head=dict(
+        type='PISARoIHead',
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            sampler=dict(
+                type='ScoreHLRSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True,
+                k=0.5,
+                bias=0.),
+            isr=dict(k=2, bias=0),
+            carl=dict(k=1, bias=0.2))),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/mmde/mmdet/.mim/configs/pisa/faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py b/mmde/mmdet/.mim/configs/pisa/faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b2c8d9a20ac7adf1965bb3d98e868c785cb23c3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pisa/faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = '../faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py'
+
+model = dict(
+    roi_head=dict(
+        type='PISARoIHead',
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            sampler=dict(
+                type='ScoreHLRSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True,
+                k=0.5,
+                bias=0.),
+            isr=dict(k=2, bias=0),
+            carl=dict(k=1, bias=0.2))),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/mmde/mmdet/.mim/configs/pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py b/mmde/mmdet/.mim/configs/pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6a6823591b1d7780c7f9d49029579afede239aa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+
+model = dict(
+    roi_head=dict(
+        type='PISARoIHead',
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            sampler=dict(
+                type='ScoreHLRSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True,
+                k=0.5,
+                bias=0.),
+            isr=dict(k=2, bias=0),
+            carl=dict(k=1, bias=0.2))),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/mmde/mmdet/.mim/configs/pisa/mask-rcnn_x101-32x4d_fpn_pisa_1x_coco.py b/mmde/mmdet/.mim/configs/pisa/mask-rcnn_x101-32x4d_fpn_pisa_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2ac19fe75ba8c5b2440772eced16397e2273735
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pisa/mask-rcnn_x101-32x4d_fpn_pisa_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = '../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py'
+
+model = dict(
+    roi_head=dict(
+        type='PISARoIHead',
+        bbox_head=dict(
+            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))),
+    train_cfg=dict(
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            sampler=dict(
+                type='ScoreHLRSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True,
+                k=0.5,
+                bias=0.),
+            isr=dict(k=2, bias=0),
+            carl=dict(k=1, bias=0.2))),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=2000,
+            max_per_img=2000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0)))
diff --git a/mmde/mmdet/.mim/configs/pisa/metafile.yml b/mmde/mmdet/.mim/configs/pisa/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3be5c3baf6d386d246b8fdc39035245d7dbbaad5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pisa/metafile.yml
@@ -0,0 +1,110 @@
+Collections:
+  - Name: PISA
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - PISA
+        - RPN
+        - ResNet
+        - RoIPool
+    Paper:
+      URL: https://arxiv.org/abs/1904.04821
+      Title: 'Prime Sample Attention in Object Detection'
+    README: configs/pisa/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/roi_heads/pisa_roi_head.py#L8
+      Version: v2.1.0
+
+Models:
+  - Name: pisa_faster_rcnn_r50_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco-dea93523.pth
+
+  - Name: pisa_faster_rcnn_x101_32x4d_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco-e4accec4.pth
+
+  - Name: pisa_mask_rcnn_r50_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 35.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco-dfcedba6.pth
+
+  - Name: pisa_retinanet_r50_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/retinanet-r50_fpn_pisa_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco-76409952.pth
+
+  - Name: pisa_retinanet_x101_32x4d_fpn_1x_coco
+    In Collection: PISA
+    Config: configs/pisa/retinanet_x101-32x4d_fpn_pisa_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco-a0c13c73.pth
+
+  - Name: pisa_ssd300_coco
+    In Collection: PISA
+    Config: configs/pisa/ssd300_pisa_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 27.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco-710e3ac9.pth
+
+  - Name: pisa_ssd512_coco
+    In Collection: PISA
+    Config: configs/pisa/ssd512_pisa_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 31.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco-247addee.pth
diff --git a/mmde/mmdet/.mim/configs/pisa/retinanet-r50_fpn_pisa_1x_coco.py b/mmde/mmdet/.mim/configs/pisa/retinanet-r50_fpn_pisa_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..70f89e227ec64b5c7224375aac0cf7ae3a10a29e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pisa/retinanet-r50_fpn_pisa_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py'
+
+model = dict(
+    bbox_head=dict(
+        type='PISARetinaHead',
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)),
+    train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2)))
diff --git a/mmde/mmdet/.mim/configs/pisa/retinanet_x101-32x4d_fpn_pisa_1x_coco.py b/mmde/mmdet/.mim/configs/pisa/retinanet_x101-32x4d_fpn_pisa_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9caad45d34a9cde84a3c29ad45e3080bb831bb76
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pisa/retinanet_x101-32x4d_fpn_pisa_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = '../retinanet/retinanet_x101-32x4d_fpn_1x_coco.py'
+
+model = dict(
+    bbox_head=dict(
+        type='PISARetinaHead',
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)),
+    train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2)))
diff --git a/mmde/mmdet/.mim/configs/pisa/ssd300_pisa_coco.py b/mmde/mmdet/.mim/configs/pisa/ssd300_pisa_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b10236baeb1925483c2fdb025d86c45d51ba0276
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pisa/ssd300_pisa_coco.py
@@ -0,0 +1,7 @@
+_base_ = '../ssd/ssd300_coco.py'
+
+model = dict(
+    bbox_head=dict(type='PISASSDHead'),
+    train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2)))
+
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/pisa/ssd512_pisa_coco.py b/mmde/mmdet/.mim/configs/pisa/ssd512_pisa_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..939c7f453d4d881324c3b0443b0696eb96b3df4f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pisa/ssd512_pisa_coco.py
@@ -0,0 +1,7 @@
+_base_ = '../ssd/ssd512_coco.py'
+
+model = dict(
+    bbox_head=dict(type='PISASSDHead'),
+    train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2)))
+
+optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/point_rend/metafile.yml b/mmde/mmdet/.mim/configs/point_rend/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f54f8a860b7951c1e99471b1f10e69c4685d998b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/point_rend/metafile.yml
@@ -0,0 +1,54 @@
+Collections:
+  - Name: PointRend
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - PointRend
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1912.08193
+      Title: 'PointRend: Image Segmentation as Rendering'
+    README: configs/point_rend/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/detectors/point_rend.py#L6
+      Version: v2.2.0
+
+Models:
+  - Name: point_rend_r50_caffe_fpn_mstrain_1x_coco
+    In Collection: PointRend
+    Config: configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco-1bcb5fb4.pth
+
+  - Name: point_rend_r50_caffe_fpn_mstrain_3x_coco
+    In Collection: PointRend
+    Config: configs/point_rend/point-rend_r50-caffe_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.6
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco-e0ebb6b7.pth
diff --git a/mmde/mmdet/.mim/configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py b/mmde/mmdet/.mim/configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b17f5a340bad54a8fe9b366ccc7d5574f687b17
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py
@@ -0,0 +1,44 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-1x_coco.py'
+# model settings
+model = dict(
+    type='PointRend',
+    roi_head=dict(
+        type='PointRendRoIHead',
+        mask_roi_extractor=dict(
+            type='GenericRoIExtractor',
+            aggregation='concat',
+            roi_layer=dict(
+                _delete_=True, type='SimpleRoIAlign', output_size=14),
+            out_channels=256,
+            featmap_strides=[4]),
+        mask_head=dict(
+            _delete_=True,
+            type='CoarseMaskHead',
+            num_fcs=2,
+            in_channels=256,
+            conv_out_channels=256,
+            fc_out_channels=1024,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+        point_head=dict(
+            type='MaskPointHead',
+            num_fcs=3,
+            in_channels=256,
+            fc_channels=256,
+            num_classes=80,
+            coarse_pred_each_layer=True,
+            loss_point=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rcnn=dict(
+            mask_size=7,
+            num_points=14 * 14,
+            oversample_ratio=3,
+            importance_sample_ratio=0.75)),
+    test_cfg=dict(
+        rcnn=dict(
+            subdivision_steps=5,
+            subdivision_num_points=28 * 28,
+            scale_factor=2)))
diff --git a/mmde/mmdet/.mim/configs/point_rend/point-rend_r50-caffe_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/point_rend/point-rend_r50-caffe_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b11faaa98ebc5b61f086a2297debda6769dc6270
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/point_rend/point-rend_r50-caffe_fpn_ms-3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './point-rend_r50-caffe_fpn_ms-1x_coco.py'
+
+max_epochs = 36
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/pvt/metafile.yml b/mmde/mmdet/.mim/configs/pvt/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..58843784955f3f4be7aeebf7caa9b50b7891f4c5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/metafile.yml
@@ -0,0 +1,243 @@
+Models:
+  - Name: retinanet_pvt-t_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvt-t_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110-17b566bd.pth
+    Paper:
+      URL: https://arxiv.org/abs/2102.12122
+      Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315
+      Version: 2.17.0
+
+  - Name: retinanet_pvt-s_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvt-s_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 14.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921-b6c94a5b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2102.12122
+      Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315
+      Version: 2.17.0
+
+  - Name: retinanet_pvt-m_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvt-m_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 20.9
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243-55effa1b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2102.12122
+      Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b0_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157-13e9aabe.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b1_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 9.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318-7e169a7d.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b2_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 16.2
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843-529f0b9a.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b3_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 23.0
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512-8357deff.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b4_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 17.0
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151-83795c86.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
+
+  - Name: retinanet_pvtv2-b5_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 18.7
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x NVIDIA V100 GPUs
+      Architecture:
+        - PyramidVisionTransformerV2
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800-3420eb57.pth
+    Paper:
+      URL: https://arxiv.org/abs/2106.13797
+      Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer"
+    README: configs/pvt/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543
+      Version: 2.17.0
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a6f604bdb367106bc75680808ce6fabc2740ed1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = 'retinanet_pvt-t_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        num_layers=[3, 8, 27, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_large.pth')))
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(type='AmpOptimWrapper')
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b888f788b6c7310491751774238451bb7107dccc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = 'retinanet_pvt-t_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        num_layers=[3, 4, 18, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_medium.pth')))
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..46603488bb3ceb4fc1052139da53340a3d595256
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = 'retinanet_pvt-t_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        num_layers=[3, 4, 6, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_small.pth')))
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f67c444f262613d615b8b7331991ca7e2f57935
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py
@@ -0,0 +1,18 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='RetinaNet',
+    backbone=dict(
+        _delete_=True,
+        type='PyramidVisionTransformer',
+        num_layers=[2, 2, 2, 2],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_tiny.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbebf90fb89d81bd2f4c0874dc2c82cf7c7393d0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py
@@ -0,0 +1,19 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='RetinaNet',
+    backbone=dict(
+        _delete_=True,
+        type='PyramidVisionTransformerV2',
+        embed_dims=32,
+        num_layers=[2, 2, 2, 2],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b0.pth')),
+    neck=dict(in_channels=[32, 64, 160, 256]))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5374c50925f5c7ed8a761eda40dc4bf374df3aeb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b1.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf9a18debbe5f8b9918e0d086ad6d54d203ef310
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        num_layers=[3, 4, 6, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b2.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a47f820324af7fecf773640d7d1829b0c115471
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        num_layers=[3, 4, 18, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b3.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5faf4c507ba89ffe614b2b9d34d452e4c106b0fe
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py
@@ -0,0 +1,20 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        num_layers=[3, 8, 27, 3],
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b4.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.0001 / 1.4, weight_decay=0.0001))
+
+# dataset settings
+train_dataloader = dict(batch_size=1, num_workers=1)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 samples per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..afff8719ece41dbfbbe23e2259b9973bb29871f6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py
@@ -0,0 +1,21 @@
+_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        embed_dims=64,
+        num_layers=[3, 6, 40, 3],
+        mlp_ratios=(4, 4, 4, 4),
+        init_cfg=dict(checkpoint='https://github.com/whai362/PVT/'
+                      'releases/download/v2/pvt_v2_b5.pth')),
+    neck=dict(in_channels=[64, 128, 320, 512]))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.0001 / 1.4, weight_decay=0.0001))
+
+# dataset settings
+train_dataloader = dict(batch_size=1, num_workers=1)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 samples per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/mmde/mmdet/.mim/configs/qdtrack/metafile.yml b/mmde/mmdet/.mim/configs/qdtrack/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e5c5504d1bd00e43bdba7f28efcbf9dd23555342
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/qdtrack/metafile.yml
@@ -0,0 +1,30 @@
+Collections:
+  - Name: QDTrack
+    Metadata:
+      Training Data: MOT17, crowdhuman
+      Training Techniques:
+        - SGD
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/pdf/2006.06664.pdf
+      Title: Quasi-Dense Similarity Learning for Multiple Object Tracking
+    README: configs/qdtrack/README.md
+
+Models:
+  - Name: qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval
+    In Collection: QDTrack
+    Config: configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
+    Metadata:
+      Training Data: MOT17
+      Training Memory (GB): 5.83
+      Epochs: 4
+    Results:
+      - Task: Multi-object Tracking
+        Dataset: MOT17
+        Metrics:
+          HOTA: 57.1
+          MOTA: 68.1
+          IDF1: 68.6
+    Weights: https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635-76f295ef.pth
diff --git a/mmde/mmdet/.mim/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py b/mmde/mmdet/.mim/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3c17c3eb97eedef88949c841364b858a3a1d6e9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py
@@ -0,0 +1,118 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/default_runtime.py'
+]
+
+detector = _base_.model
+detector.pop('data_preprocessor')
+
+detector['backbone'].update(
+    dict(
+        norm_cfg=dict(type='BN', requires_grad=False),
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
+detector.rpn_head.loss_bbox.update(
+    dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0))
+detector.rpn_head.bbox_coder.update(dict(clip_border=False))
+detector.roi_head.bbox_head.update(dict(num_classes=1))
+detector.roi_head.bbox_head.bbox_coder.update(dict(clip_border=False))
+detector['init_cfg'] = dict(
+    type='Pretrained',
+    checkpoint=  # noqa: E251
+    'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
+    'faster_rcnn_r50_fpn_1x_coco-person/'
+    'faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth'
+    # noqa: E501
+)
+del _base_.model
+
+model = dict(
+    type='QDTrack',
+    data_preprocessor=dict(
+        type='TrackDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    detector=detector,
+    track_head=dict(
+        type='QuasiDenseTrackHead',
+        roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        embed_head=dict(
+            type='QuasiDenseEmbedHead',
+            num_convs=4,
+            num_fcs=1,
+            embed_channels=256,
+            norm_cfg=dict(type='GN', num_groups=32),
+            loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25),
+            loss_track_aux=dict(
+                type='MarginL2Loss',
+                neg_pos_ub=3,
+                pos_margin=0,
+                neg_margin=0.1,
+                hard_mining=True,
+                loss_weight=1.0)),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0),
+        train_cfg=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='CombinedSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=3,
+                add_gt_as_proposals=True,
+                pos_sampler=dict(type='InstanceBalancedPosSampler'),
+                neg_sampler=dict(type='RandomSampler')))),
+    tracker=dict(
+        type='QuasiDenseTracker',
+        init_score_thr=0.9,
+        obj_score_thr=0.5,
+        match_score_thr=0.5,
+        memo_tracklet_frames=30,
+        memo_backdrop_frames=1,
+        memo_momentum=0.8,
+        nms_conf_thr=0.5,
+        nms_backdrop_iou_thr=0.3,
+        nms_class_iou_thr=0.7,
+        with_cats=True,
+        match_metric='bisoftmax'))
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=35, norm_type=2))
+# learning policy
+param_scheduler = [
+    dict(type='MultiStepLR', begin=0, end=4, by_epoch=True, milestones=[3])
+]
+
+# runtime settings
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=4, val_interval=4)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(
+    logger=dict(type='LoggerHook', interval=50),
+    visualization=dict(type='TrackVisualizationHook', draw=False))
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# custom hooks
+custom_hooks = [
+    # Synchronize model buffers such as running_mean and running_var in BN
+    # at the end of each epoch
+    dict(type='SyncBuffersHook')
+]
diff --git a/mmde/mmdet/.mim/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..d87604dad6bf39028a8111708307482186118b19
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,14 @@
+_base_ = [
+    './qdtrack_faster-rcnn_r50_fpn_4e_base.py',
+    '../_base_/datasets/mot_challenge.py',
+]
+
+# evaluator
+val_evaluator = [
+    dict(type='CocoVideoMetric', metric=['bbox'], classwise=True),
+    dict(type='MOTChallengeMetric', metric=['HOTA', 'CLEAR', 'Identity'])
+]
+
+test_evaluator = val_evaluator
+# The fluctuation of HOTA is about +-1.
+randomness = dict(seed=6)
diff --git a/mmde/mmdet/.mim/configs/queryinst/metafile.yml b/mmde/mmdet/.mim/configs/queryinst/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3ea3b00a945c8856b8c63f68a0ec6a48c70a933f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/queryinst/metafile.yml
@@ -0,0 +1,100 @@
+Collections:
+  - Name: QueryInst
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - QueryInst
+    Paper:
+      URL: https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Instances_As_Queries_ICCV_2021_paper.pdf
+      Title: 'Instances as Queries'
+    README: configs/queryinst/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/detectors/queryinst.py
+      Version: v2.18.0
+
+Models:
+  - Name: queryinst_r50_fpn_1x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916-5a8f1998.pth
+
+  - Name: queryinst_r50_fpn_ms-480-800-3x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r50_fpn_ms-480-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643-7837af86.pth
+
+  - Name: queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802-85cffbd8.pth
+
+  - Name: queryinst_r101_fpn_ms-480-800-3x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r101_fpn_ms-480-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048-91f9995b.pth
+
+  - Name: queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco
+    In Collection: QueryInst
+    Config: configs/queryinst/queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621-76cce59f.pth
diff --git a/mmde/mmdet/.mim/configs/queryinst/queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py b/mmde/mmdet/.mim/configs/queryinst/queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1692c134698a98da33612487a9fb703117fdb8b6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/queryinst/queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/queryinst/queryinst_r101_fpn_ms-480-800-3x_coco.py b/mmde/mmdet/.mim/configs/queryinst/queryinst_r101_fpn_ms-480-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd5b7f452e583eb362e0bb05f272a771d68b6e48
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/queryinst/queryinst_r101_fpn_ms-480-800-3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './queryinst_r50_fpn_ms-480-800-3x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/queryinst/queryinst_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/queryinst/queryinst_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..63d61d78872b452bdd8d2607fc03181b169ea845
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/queryinst/queryinst_r50_fpn_1x_coco.py
@@ -0,0 +1,155 @@
+_base_ = [
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+num_stages = 6
+num_proposals = 100
+model = dict(
+    type='QueryInst',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        add_extra_convs='on_input',
+        num_outs=4),
+    rpn_head=dict(
+        type='EmbeddingRPNHead',
+        num_proposals=num_proposals,
+        proposal_feature_channel=256),
+    roi_head=dict(
+        type='SparseRoIHead',
+        num_stages=num_stages,
+        stage_loss_weights=[1] * num_stages,
+        proposal_feature_channel=256,
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[
+            dict(
+                type='DIIHead',
+                num_classes=80,
+                num_ffn_fcs=2,
+                num_heads=8,
+                num_cls_fcs=1,
+                num_reg_fcs=3,
+                feedforward_channels=2048,
+                in_channels=256,
+                dropout=0.0,
+                ffn_act_cfg=dict(type='ReLU', inplace=True),
+                dynamic_conv_cfg=dict(
+                    type='DynamicConv',
+                    in_channels=256,
+                    feat_channels=64,
+                    out_channels=256,
+                    input_feat_shape=7,
+                    act_cfg=dict(type='ReLU', inplace=True),
+                    norm_cfg=dict(type='LN')),
+                loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                loss_cls=dict(
+                    type='FocalLoss',
+                    use_sigmoid=True,
+                    gamma=2.0,
+                    alpha=0.25,
+                    loss_weight=2.0),
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    clip_border=False,
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.5, 0.5, 1., 1.])) for _ in range(num_stages)
+        ],
+        mask_head=[
+            dict(
+                type='DynamicMaskHead',
+                dynamic_conv_cfg=dict(
+                    type='DynamicConv',
+                    in_channels=256,
+                    feat_channels=64,
+                    out_channels=256,
+                    input_feat_shape=14,
+                    with_proj=False,
+                    act_cfg=dict(type='ReLU', inplace=True),
+                    norm_cfg=dict(type='LN')),
+                num_convs=4,
+                num_classes=80,
+                roi_feat_size=14,
+                in_channels=256,
+                conv_kernel_size=3,
+                conv_out_channels=256,
+                class_agnostic=False,
+                norm_cfg=dict(type='BN'),
+                upsample_cfg=dict(type='deconv', scale_factor=2),
+                loss_mask=dict(
+                    type='DiceLoss',
+                    loss_weight=8.0,
+                    use_sigmoid=True,
+                    activate=False,
+                    eps=1e-5)) for _ in range(num_stages)
+        ]),
+    # training and testing settings
+    train_cfg=dict(
+        rpn=None,
+        rcnn=[
+            dict(
+                assigner=dict(
+                    type='HungarianAssigner',
+                    match_costs=[
+                        dict(type='FocalLossCost', weight=2.0),
+                        dict(type='BBoxL1Cost', weight=5.0, box_format='xyxy'),
+                        dict(type='IoUCost', iou_mode='giou', weight=2.0)
+                    ]),
+                sampler=dict(type='PseudoSampler'),
+                pos_weight=1,
+                mask_size=28,
+            ) for _ in range(num_stages)
+        ]),
+    test_cfg=dict(
+        rpn=None, rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5)))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
+    clip_grad=dict(max_norm=0.1, norm_type=2))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/queryinst/queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py b/mmde/mmdet/.mim/configs/queryinst/queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..33ab061267bc9753f490acc57ed8d4193f1250b4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/queryinst/queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py
@@ -0,0 +1,45 @@
+_base_ = './queryinst_r50_fpn_ms-480-800-3x_coco.py'
+num_proposals = 300
+model = dict(
+    rpn_head=dict(num_proposals=num_proposals),
+    test_cfg=dict(
+        _delete_=True,
+        rpn=None,
+        rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5)))
+
+# augmentation strategy originates from DETR.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[[
+            dict(
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
+                keep_ratio=True)
+        ],
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/queryinst/queryinst_r50_fpn_ms-480-800-3x_coco.py b/mmde/mmdet/.mim/configs/queryinst/queryinst_r50_fpn_ms-480-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b99374ef4364dc76a60c2dd74377f92c15780ed
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/queryinst/queryinst_r50_fpn_ms-480-800-3x_coco.py
@@ -0,0 +1,32 @@
+_base_ = './queryinst_r50_fpn_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                (736, 1333), (768, 1333), (800, 1333)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# learning policy
+max_epochs = 36
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=max_epochs)
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..74e6adaba5c262d45aaec876d1225b0061bb290b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_1.6gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[72, 168, 408, 912],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea219021260b6aa3a844eb6b4780e9669e50ed3b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
@@ -0,0 +1,28 @@
+_base_ = [
+    '../common/ms_3x_coco-instance.py',
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py'
+]
+model = dict(
+    data_preprocessor=dict(
+        # The mean and std are used in PyCls when training RegNets
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],
+        out_channels=256,
+        num_outs=5))
+
+optim_wrapper = dict(optimizer=dict(weight_decay=0.00005))
diff --git a/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fe47f837437163710ecd28f1bb217c643464965
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_400mf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[32, 64, 160, 384],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e22886a80f92ba4269477a307b2689c45468381c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_4.0gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 560, 1360],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..655bdc60c772875e0a1ed871bd6bf02aab8e39cc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_800mf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 288, 672],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9e8302bdd1537b825f36777e3211d27dec8fb0c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_1.6gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[72, 168, 408, 912],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..db49092e2fb7e1cf3dbcad2bb99aa08396ea35e7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    data_preprocessor=dict(
+        # The mean and std are used in PyCls when training RegNets
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],
+        out_channels=256,
+        num_outs=5))
+
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005))
diff --git a/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..be533603085a89b65556b47f5e333fdde734bbd1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3d5d5d689162d805c0cfb4d84f9a128faf90c25
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
@@ -0,0 +1,25 @@
+_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py']
+model = dict(
+    data_preprocessor=dict(
+        # The mean and std are used in PyCls when training RegNets
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],
+        out_channels=256,
+        num_outs=5))
+
+optim_wrapper = dict(optimizer=dict(weight_decay=0.00005))
diff --git a/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2edeff9c1f5a794ed14dc8723917986ac26e3d36
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_400mf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[32, 64, 160, 384],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..afcbb5d5d1a8aee47267d1f82fff8d40fa0d8e9b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_4.0gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 560, 1360],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f659ec9689068afd94aa3bc545d4fed91ffb5eb4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = 'faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_800mf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 288, 672],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..60874c66dbc37df824a9c44bb8c28a441f7f84e4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_1.6gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[72, 168, 408, 912],
+        out_channels=256,
+        num_outs=5))
+
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005),
+    clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-12GF_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-12GF_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e82cecea010fb32143f809add198a052285a6897
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-12GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_12gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_12gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[224, 448, 896, 2240],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7c1d1ac3a7bd87bd210b4cd2194dd7e430f8d96
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = 'mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c52bf13ff6df5cda353c21ac32a950602620dbde
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py
@@ -0,0 +1,30 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    data_preprocessor=dict(
+        # The mean and std are used in PyCls when training RegNets
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],
+        out_channels=256,
+        num_outs=5))
+
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..36482c939dc3e600171b98bc159440e5fb740ffa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
@@ -0,0 +1,60 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    data_preprocessor=dict(
+        # The mean and std are used in PyCls when training RegNets
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],
+        out_channels=256,
+        num_outs=5))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning policy
+max_epochs = 36
+train_cfg = dict(max_epochs=max_epochs)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b96e1921f0dae8ad6656a7785d9d4655f9f349b3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_400mf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[32, 64, 160, 384],
+        out_channels=256,
+        num_outs=5))
+
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005),
+    clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-4GF_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-4GF_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce9f8ef4ffbcce66ec0184b3ff06a92425231597
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-4GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_4.0gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 560, 1360],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f160ccf66700d98a6403ed736928e529368e800c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_4.0gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 560, 1360],
+        out_channels=256,
+        num_outs=5))
+
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005),
+    clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e17a3d7695fa7ba9e135d7a436118aae29be4747
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_6.4gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_6.4gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[168, 392, 784, 1624],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..93851fdbb99e5d8e3a58062c7ad83d2acad14ac6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py
@@ -0,0 +1,26 @@
+_base_ = [
+    '../common/ms-poly_3x_coco-instance.py',
+    '../_base_/models/mask-rcnn_r50_fpn.py'
+]
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_800mf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 288, 672],
+        out_channels=256,
+        num_outs=5))
+
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005),
+    clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-8GF_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-8GF_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..62a4c931512e6b46093b03fd4e80741a93151c6a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/mask-rcnn_regnetx-8GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_8.0gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_8.0gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[80, 240, 720, 1920],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/metafile.yml b/mmde/mmdet/.mim/configs/regnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..19fbba80f0396e1dad7a330ef769d98ad1a0c4d2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/metafile.yml
@@ -0,0 +1,797 @@
+Models:
+  - Name: mask-rcnn_regnetx-3.2GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141-2a9d1814.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-4GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-4GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217-32e9c92d.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-6.4GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.1
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439-3a7aae83.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-8GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-8GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515-09daa87e.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-12GF_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-12GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552-b538bd8b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726-75f40794.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster-rcnn_regnetx-3.2GF_fpn_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927-126fd9bf.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster-rcnn_regnetx-3.2GF_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955-e2081918.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: retinanet_regnetx-800MF_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 2.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 35.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403-f6f91d10.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: retinanet_regnetx-1.6GF_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.3
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403-37009a9d.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: retinanet_regnetx-3.2GF_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.2
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141-cb1509e8.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster-rcnn_regnetx-400MF_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.3
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112-e1967c37.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster-rcnn_regnetx-800MF_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.8
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118-a2c70b20.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.4
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325-94aa46cc.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.4
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152-e16a5227.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: faster-rcnn_regnetx-4GF_fpn_ms-3x_coco
+    In Collection: Faster R-CNN
+    Config: configs/regnet/faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.9
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201-65eaf841.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.5
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443-8aac57a4.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.9
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641-715d51f5.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 3.6
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.9
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641-6764cff5.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/regnet/mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621-00f0331c.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.3
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619-5142f449.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 4.8
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616-dcbd13f4.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.4
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616-75f29a61.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.4
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616-b9c2c58b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
+
+  - Name: cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/regnet/cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 6.9
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - RegNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034-cbb1be4c.pth
+    Paper:
+      URL: https://arxiv.org/abs/2003.13678
+      Title: 'Designing Network Design Spaces'
+    README: configs/regnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11
+      Version: v2.1.0
diff --git a/mmde/mmdet/.mim/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7395c1bfbfa16670294c721f9f3135da9b9e69ae
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './retinanet_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_1.6gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[72, 168, 408, 912],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b8a32cec195901e2f1326bf62f4fa4508e744d2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py
@@ -0,0 +1,31 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    data_preprocessor=dict(
+        # The mean and std are used in PyCls when training RegNets
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False),
+    backbone=dict(
+        _delete_=True,
+        type='RegNet',
+        arch='regnetx_3.2gf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[96, 192, 432, 1008],
+        out_channels=256,
+        num_outs=5))
+
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005),
+    clip_grad=dict(max_norm=35, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6f8989320d6ffbcd55148471f62a962c52f9131
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './retinanet_regnetx-3.2GF_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        arch='regnetx_800mf',
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')),
+    neck=dict(
+        type='FPN',
+        in_channels=[64, 128, 288, 672],
+        out_channels=256,
+        num_outs=5))
diff --git a/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py b/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e30b22964d0504771678dbd0a551bc16a0714ea
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py
@@ -0,0 +1,7 @@
+_base_ = ['./reid_r50_8xb32-6e_mot17train80_test-mot17val20.py']
+model = dict(head=dict(num_classes=368))
+# data
+data_root = 'data/MOT15/'
+train_dataloader = dict(dataset=dict(data_root=data_root))
+val_dataloader = dict(dataset=dict(data_root=data_root))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py b/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py
new file mode 100644
index 0000000000000000000000000000000000000000..468b9bfb2453f97c83282cc2f383c7592694269c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py
@@ -0,0 +1,7 @@
+_base_ = ['./reid_r50_8xb32-6e_mot17train80_test-mot17val20.py']
+model = dict(head=dict(num_classes=371))
+# data
+data_root = 'data/MOT16/'
+train_dataloader = dict(dataset=dict(data_root=data_root))
+val_dataloader = dict(dataset=dict(data_root=data_root))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py b/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py
new file mode 100644
index 0000000000000000000000000000000000000000..83669de7c170c5de0e2054808ef7a76878bc1f24
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py
@@ -0,0 +1,61 @@
+_base_ = [
+    '../_base_/datasets/mot_challenge_reid.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='BaseReID',
+    data_preprocessor=dict(
+        type='ReIDDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        to_rgb=True),
+    backbone=dict(
+        type='mmpretrain.ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3, ),
+        style='pytorch'),
+    neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),
+    head=dict(
+        type='LinearReIDHead',
+        num_fcs=1,
+        in_channels=2048,
+        fc_channels=1024,
+        out_channels=128,
+        num_classes=380,
+        loss_cls=dict(type='mmpretrain.CrossEntropyLoss', loss_weight=1.0),
+        loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0),
+        norm_cfg=dict(type='BN1d'),
+        act_cfg=dict(type='ReLU')),
+    init_cfg=dict(
+        type='Pretrained',
+        checkpoint=  # noqa: E251
+        'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth'  # noqa: E501
+    ))
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    clip_grad=None,
+    optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=6,
+        by_epoch=True,
+        milestones=[5],
+        gamma=0.1)
+]
+
+# train, val, test setting
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=6, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py b/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a807996186c35f91e23f6e0ec95a2191479c15b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py
@@ -0,0 +1,10 @@
+_base_ = ['./reid_r50_8xb32-6e_mot17train80_test-mot17val20.py']
+model = dict(head=dict(num_classes=1701))
+# data
+data_root = 'data/MOT20/'
+train_dataloader = dict(dataset=dict(data_root=data_root))
+val_dataloader = dict(dataset=dict(data_root=data_root))
+test_dataloader = val_dataloader
+
+# train, val, test setting
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=6, val_interval=7)
diff --git a/mmde/mmdet/.mim/configs/reppoints/metafile.yml b/mmde/mmdet/.mim/configs/reppoints/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..732d541fb548f6eed00d6ba0fb4ffe3854b4f9c5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/metafile.yml
@@ -0,0 +1,181 @@
+Collections:
+  - Name: RepPoints
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Group Normalization
+        - FPN
+        - RepPoints
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.11490
+      Title: 'RepPoints: Point Set Representation for Object Detection'
+    README: configs/reppoints/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/reppoints_detector.py#L9
+      Version: v2.0.0
+
+Models:
+  - Name: reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      inference time (ms/im):
+        - value: 62.89
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth
+
+  - Name: reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      inference time (ms/im):
+        - value: 64.94
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth
+
+  - Name: reppoints-moment_r50_fpn_1x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.3
+      inference time (ms/im):
+        - value: 54.05
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330-b73db8d1.pth
+
+  - Name: reppoints-moment_r50_fpn-gn_head-gn_1x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      inference time (ms/im):
+        - value: 57.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329_145952-3e51b550.pth
+
+  - Name: reppoints-moment_r50_fpn-gn_head-gn_2x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      inference time (ms/im):
+        - value: 57.14
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329-91babaa2.pth
+
+  - Name: reppoints-moment_r101_fpn-gn_head-gn_2x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.8
+      inference time (ms/im):
+        - value: 72.99
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329-4fbc7310.pth
+
+  - Name: reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.9
+      inference time (ms/im):
+        - value: 82.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-3309fbf2.pth
+
+  - Name: reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco
+    In Collection: RepPoints
+    Config: configs/reppoints/reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      inference time (ms/im):
+        - value: 107.53
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-f87da1ea.pth
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f116e53f6ded9468098733c1bab938831fee041d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py'
+model = dict(bbox_head=dict(transform_method='minmax', use_grid_points=True))
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..76be39b8de8f52d48c6cdd4626f23221e35164ab
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py'
+model = dict(
+    bbox_head=dict(transform_method='minmax', use_grid_points=True),
+    # training and testing settings
+    train_cfg=dict(
+        init=dict(
+            assigner=dict(
+                _delete_=True,
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.4,
+                min_pos_iou=0,
+                ignore_iof_thr=-1))))
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-minmax_r50_fpn-gn_head-gn_1x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-minmax_r50_fpn-gn_head-gn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e7dffe77a062268737205fd86ab23f22cd85479
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-minmax_r50_fpn-gn_head-gn_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py'
+model = dict(bbox_head=dict(transform_method='minmax'))
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c2bfab40020d7508ba90029ad29b24da8a7ad78
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py
@@ -0,0 +1,8 @@
+_base_ = './reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..02c447ada075ca6b076a5e7ff2ed74fb3b80c30d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cedf2226b5ecd2e5dd207041523ab4a2627a1734
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './reppoints-moment_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(neck=dict(norm_cfg=norm_cfg), bbox_head=dict(norm_cfg=norm_cfg))
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4490d4496af6d680fbed2eedcaf73e138afff0cc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py'
+
+max_epochs = 24
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..df7e72a80c66f42fe8554cfb344fee87ee5fe24a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py
@@ -0,0 +1,74 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='RepPointsDetector',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_input',
+        num_outs=5),
+    bbox_head=dict(
+        type='RepPointsHead',
+        num_classes=80,
+        in_channels=256,
+        feat_channels=256,
+        point_feat_channels=256,
+        stacked_convs=3,
+        num_points=9,
+        gradient_mul=0.1,
+        point_strides=[8, 16, 32, 64, 128],
+        point_base_scale=4,
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5),
+        loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0),
+        transform_method='moment'),
+    # training and testing settings
+    train_cfg=dict(
+        init=dict(
+            assigner=dict(type='PointAssigner', scale=4, pos_num=1),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        refine=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.4,
+                min_pos_iou=0,
+                ignore_iof_thr=-1),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.5),
+        max_per_img=100))
+
+optim_wrapper = dict(optimizer=dict(lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9909efe511da9423859de6ce096b1b1524a9b6f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/reppoints/reppoints-partial-minmax_r50_fpn-gn_head-gn_1x_coco.py b/mmde/mmdet/.mim/configs/reppoints/reppoints-partial-minmax_r50_fpn-gn_head-gn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..30f7844b8344110896c5d885bd0ca340322045e4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/reppoints/reppoints-partial-minmax_r50_fpn-gn_head-gn_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py'
+model = dict(bbox_head=dict(transform_method='partial_minmax'))
diff --git a/mmde/mmdet/.mim/configs/res2net/cascade-mask-rcnn_res2net-101_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/res2net/cascade-mask-rcnn_res2net-101_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..21b6d2ea1c0167b8dd643211b520ac89ddd63e10
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/res2net/cascade-mask-rcnn_res2net-101_fpn_20e_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/mmde/mmdet/.mim/configs/res2net/cascade-rcnn_res2net-101_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/res2net/cascade-rcnn_res2net-101_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..670a77454e060f8f639dbdc40064b71cd82520e9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/res2net/cascade-rcnn_res2net-101_fpn_20e_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/mmde/mmdet/.mim/configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..033cf574962f51a75c3fce1e74a22efb9c6320f2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/mmde/mmdet/.mim/configs/res2net/htc_res2net-101_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/res2net/htc_res2net-101_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5542fda4c8181a417f14817180296e84944b832
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/res2net/htc_res2net-101_fpn_20e_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../htc/htc_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/mmde/mmdet/.mim/configs/res2net/mask-rcnn_res2net-101_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/res2net/mask-rcnn_res2net-101_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a2d57304d07d9b3dbc58ee9a5d8f2355c6b4427
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/res2net/mask-rcnn_res2net-101_fpn_2x_coco.py
@@ -0,0 +1,10 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/mmde/mmdet/.mim/configs/res2net/metafile.yml b/mmde/mmdet/.mim/configs/res2net/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1d9f9ea023d895cd8a93b0f48b3bc4dee5a93e6b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/res2net/metafile.yml
@@ -0,0 +1,146 @@
+Models:
+  - Name: faster-rcnn_res2net-101_fpn_2x_coco
+    In Collection: Faster R-CNN
+    Config: configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
+
+  - Name: mask-rcnn_res2net-101_fpn_2x_coco
+    In Collection: Mask R-CNN
+    Config: configs/res2net/mask-rcnn_res2net-101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      Epochs: 24
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco-17f061e8.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
+
+  - Name: cascade-rcnn_res2net-101_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/res2net/cascade-rcnn_res2net-101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco-f4b7b7db.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
+
+  - Name: cascade-mask-rcnn_res2net-101_fpn_20e_coco
+    In Collection: Cascade R-CNN
+    Config: configs/res2net/cascade-mask-rcnn_res2net-101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 9.5
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco-8a7b41e1.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
+
+  - Name: htc_res2net-101_fpn_20e_coco
+    In Collection: HTC
+    Config: configs/res2net/htc_res2net-101_fpn_20e_coco.py
+    Metadata:
+      Epochs: 20
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Res2Net
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco-3a8d2112.pth
+    Paper:
+      URL: https://arxiv.org/abs/1904.01169
+      Title: 'Res2Net for object detection and instance segmentation'
+    README: configs/res2net/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239
+      Version: v2.1.0
diff --git a/mmde/mmdet/.mim/configs/resnest/cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py b/mmde/mmdet/.mim/configs/resnest/cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4f19925788acc357e9720513d4f388598927a70
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnest/cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py'
+model = dict(
+    backbone=dict(
+        stem_channels=128,
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='open-mmlab://resnest101')))
diff --git a/mmde/mmdet/.mim/configs/resnest/cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py b/mmde/mmdet/.mim/configs/resnest/cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6ef41c05cd97d19320c02fb065b0cde1dda54d7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnest/cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py
@@ -0,0 +1,101 @@
+_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+
+model = dict(
+    # use ResNeSt img_norm
+    data_preprocessor=dict(
+        mean=[123.68, 116.779, 103.939],
+        std=[58.393, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='ResNeSt',
+        stem_channels=64,
+        depth=50,
+        radix=2,
+        reduction_factor=4,
+        avg_down_stride=True,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')),
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_head=dict(norm_cfg=norm_cfg)))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/resnest/cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py b/mmde/mmdet/.mim/configs/resnest/cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dbf3fae5ffb9382b053852c35e263f109668020
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnest/cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py'
+model = dict(
+    backbone=dict(
+        stem_channels=128,
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='open-mmlab://resnest101')))
diff --git a/mmde/mmdet/.mim/configs/resnest/cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py b/mmde/mmdet/.mim/configs/resnest/cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ce7b56320a6511376237710c25061edd44b17dd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnest/cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py
@@ -0,0 +1,93 @@
+_base_ = '../cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    # use ResNeSt img_norm
+    data_preprocessor=dict(
+        mean=[123.68, 116.779, 103.939],
+        std=[58.393, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='ResNeSt',
+        stem_channels=64,
+        depth=50,
+        radix=2,
+        reduction_factor=4,
+        avg_down_stride=True,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')),
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared4Conv1FCBBoxHead',
+                in_channels=256,
+                conv_out_channels=256,
+                fc_out_channels=1024,
+                norm_cfg=norm_cfg,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ], ))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/resnest/faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py b/mmde/mmdet/.mim/configs/resnest/faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1e16321adff643d593268f868c09f5a318e7e93
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnest/faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py'
+model = dict(
+    backbone=dict(
+        stem_channels=128,
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='open-mmlab://resnest101')))
diff --git a/mmde/mmdet/.mim/configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py b/mmde/mmdet/.mim/configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f0ec6e07af1fcd250171cb769252eeb03f92da8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py
@@ -0,0 +1,39 @@
+_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    # use ResNeSt img_norm
+    data_preprocessor=dict(
+        mean=[123.68, 116.779, 103.939],
+        std=[58.393, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='ResNeSt',
+        stem_channels=64,
+        depth=50,
+        radix=2,
+        reduction_factor=4,
+        avg_down_stride=True,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg)))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/resnest/mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py b/mmde/mmdet/.mim/configs/resnest/mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3edf49f052f1f3c875cca2c061276cc1aca77604
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnest/mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py'
+model = dict(
+    backbone=dict(
+        stem_channels=128,
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='open-mmlab://resnest101')))
diff --git a/mmde/mmdet/.mim/configs/resnest/mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py b/mmde/mmdet/.mim/configs/resnest/mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6f27000862d74e23a665f3bf8caae0ec4a3d6f5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnest/mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py
@@ -0,0 +1,46 @@
+_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    # use ResNeSt img_norm
+    data_preprocessor=dict(
+        mean=[123.68, 116.779, 103.939],
+        std=[58.393, 57.12, 57.375],
+        bgr_to_rgb=True),
+    backbone=dict(
+        type='ResNeSt',
+        stem_channels=64,
+        depth=50,
+        radix=2,
+        reduction_factor=4,
+        avg_down_stride=True,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg),
+        mask_head=dict(norm_cfg=norm_cfg)))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/resnest/metafile.yml b/mmde/mmdet/.mim/configs/resnest/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..265c94094975858ff0cc0ceac3870c9b4f9b9a84
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnest/metafile.yml
@@ -0,0 +1,230 @@
+Models:
+  - Name: faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.8
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20200926_125502-20289c16.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/resnest/faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201006_021058-421517f1.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/resnest/mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.6
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20200926_125503-8a2c3d47.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/resnest/mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_215831-af60cdf9.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnest/cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py
+    Metadata:
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201122_213640-763cc7b5.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnest/cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.4
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201005_113242-b9459f8f.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnest/cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py
+    Metadata:
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201122_104428-99eca4c7.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
+
+  - Name: cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnest/cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNeSt
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_113243-42607475.pth
+    Paper:
+      URL: https://arxiv.org/abs/2004.08955
+      Title: 'ResNeSt: Split-Attention Networks'
+    README: configs/resnest/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273
+      Version: v2.7.0
diff --git a/mmde/mmdet/.mim/configs/resnet_strikes_back/cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/resnet_strikes_back/cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..de7b95b0863d1ea89382fd9fa5852eccf0f34150
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnet_strikes_back/cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py
@@ -0,0 +1,15 @@
+_base_ = [
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth'  # noqa
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)))
+
+optim_wrapper = dict(
+    optimizer=dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.05),
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/mmde/mmdet/.mim/configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c60f66a7ba8e5b6a7ee6af06e771b3c6ad71f6c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py
@@ -0,0 +1,15 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth'  # noqa
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)))
+
+optim_wrapper = dict(
+    optimizer=dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.05),
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/mmde/mmdet/.mim/configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..85e25d392359b1a7811fb0c933ede5edacbfb9c3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py
@@ -0,0 +1,15 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth'  # noqa
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)))
+
+optim_wrapper = dict(
+    optimizer=dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.05),
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/mmde/mmdet/.mim/configs/resnet_strikes_back/metafile.yml b/mmde/mmdet/.mim/configs/resnet_strikes_back/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..74b152107d7a6d96f671c52d5273c79751122bfa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnet_strikes_back/metafile.yml
@@ -0,0 +1,116 @@
+Models:
+  - Name: faster-rcnn_r50_fpn_rsb-pretrain_1x_coco
+    In Collection: Faster R-CNN
+    Config: configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.9
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229-32ae82a9.pth
+    Paper:
+      URL: https://arxiv.org/abs/2110.00476
+      Title: 'ResNet strikes back: An improved training procedure in timm'
+    README: configs/resnet_strikes_back/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md
+      Version: v2.22.0
+
+  - Name: cascade-mask-rcnn_r50_fpn_rsb-pretrain_1x_coco
+    In Collection: Cascade R-CNN
+    Config: configs/resnet_strikes_back/cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636-8b9ad50f.pth
+    Paper:
+      URL: https://arxiv.org/abs/2110.00476
+      Title: 'ResNet strikes back: An improved training procedure in timm'
+    README: configs/resnet_strikes_back/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md
+      Version: v2.22.0
+
+  - Name: retinanet_r50-rsb-pre_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/resnet_strikes_back/retinanet_r50-rsb-pre_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth
+    Paper:
+      URL: https://arxiv.org/abs/2110.00476
+      Title: 'ResNet strikes back: An improved training procedure in timm'
+    README: configs/resnet_strikes_back/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md
+      Version: v2.22.0
+
+  - Name: mask-rcnn_r50_fpn_rsb-pretrain_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.5
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth
+    Paper:
+      URL: https://arxiv.org/abs/2110.00476
+      Title: 'ResNet strikes back: An improved training procedure in timm'
+    README: configs/resnet_strikes_back/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md
+      Version: v2.22.0
diff --git a/mmde/mmdet/.mim/configs/resnet_strikes_back/retinanet_r50-rsb-pre_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/resnet_strikes_back/retinanet_r50-rsb-pre_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ce7bfd87d6b41a36acc4ff207695e38ef89700c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/resnet_strikes_back/retinanet_r50-rsb-pre_fpn_1x_coco.py
@@ -0,0 +1,15 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth'  # noqa
+model = dict(
+    backbone=dict(
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)))
+
+optim_wrapper = dict(
+    optimizer=dict(_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.05),
+    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
diff --git a/mmde/mmdet/.mim/configs/retinanet/metafile.yml b/mmde/mmdet/.mim/configs/retinanet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0551541c59100d3cc8fb361cc8895c2dbd4cf8f3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/metafile.yml
@@ -0,0 +1,312 @@
+Collections:
+  - Name: RetinaNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Focal Loss
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1708.02002
+      Title: "Focal Loss for Dense Object Detection"
+    README: configs/retinanet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/retinanet.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: retinanet_r18_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r18_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 1.7
+      Training Resources: 8x V100 GPUs
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 31.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055-614fd399.pth
+
+  - Name: retinanet_r18_fpn_1xb8-1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r18_fpn_1xb8-1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.0
+      Training Resources:  1x V100 GPUs
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 31.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255-4ea310d7.pth
+
+  - Name: retinanet_r50-caffe_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.5
+      inference time (ms/im):
+        - value: 53.76
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531-f11027c5.pth
+
+  - Name: retinanet_r50_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      inference time (ms/im):
+        - value: 52.63
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth
+
+  - Name: retinanet_r50_fpn_amp-1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50_fpn_amp-1x_coco.py
+    Metadata:
+      Training Memory (GB): 2.8
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - Mixed Precision Training
+      inference time (ms/im):
+        - value: 31.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP16
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 36.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702-0dbfb212.pth
+
+  - Name: retinanet_r50_fpn_2x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50_fpn_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131-fdb43119.pth
+
+  - Name: retinanet_r50_fpn_ms-640-800-3x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r50_fpn_ms-640-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.pth
+
+  - Name: retinanet_r101-caffe_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.5
+      inference time (ms/im):
+        - value: 68.03
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531-b428fa0f.pth
+
+  - Name: retinanet_r101-caffe_fpn_ms-3x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101-caffe_fpn_ms-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.pth
+
+  - Name: retinanet_r101_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.7
+      inference time (ms/im):
+        - value: 66.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130-7a93545f.pth
+
+  - Name: retinanet_r101_fpn_2x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 5.7
+      inference time (ms/im):
+        - value: 66.67
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131-5560aee8.pth
+
+  - Name: retinanet_r101_fpn_ms-640-800-3x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_r101_fpn_ms-640-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.pth
+
+  - Name: retinanet_x101-32x4d_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 82.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130-5c8b7ec4.pth
+
+  - Name: retinanet_x101-32x4d_fpn_2x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101-32x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 82.64
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131-237fc5e1.pth
+
+  - Name: retinanet_x101-64x4d_fpn_1x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.0
+      inference time (ms/im):
+        - value: 114.94
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130-366f5af1.pth
+
+  - Name: retinanet_x101-64x4d_fpn_2x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101-64x4d_fpn_2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.0
+      inference time (ms/im):
+        - value: 114.94
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131-bca068ab.pth
+
+  - Name: retinanet_x101-64x4d_fpn_ms-640-800-3x_coco
+    In Collection: RetinaNet
+    Config: configs/retinanet/retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.pth
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f3a4487103eea868eafe8539517b38455025bbe
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './retinanet_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r101-caffe_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101-caffe_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfe773459c2529079274b241f5f99ae66d8906ad
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101-caffe_fpn_ms-3x_coco.py
@@ -0,0 +1,8 @@
+_base_ = './retinanet_r50-caffe_fpn_ms-3x_coco.py'
+# learning policy
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7f06002413dcdf2716975655a582a3eefaf007a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..721112a221953bb86dc3259e3991d7f0f740b26c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './retinanet_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..be018eaac672a4c1c3a61eac9940c4d28ea4fb40
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_ms-640-800-3x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_ms-640-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..566397227f7861a268c4cc4e111279b95b620ab8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r101_fpn_ms-640-800-3x_coco.py
@@ -0,0 +1,9 @@
+_base_ = ['../_base_/models/retinanet_r50_fpn.py', '../common/ms_3x_coco.py']
+# optimizer
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r18_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r18_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..960211806756d38cf74eed998addcca3f8467a4d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r18_fpn_1x_coco.py
@@ -0,0 +1,20 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+# TODO: support auto scaling lr
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (2 samples per GPU)
+# auto_scale_lr = dict(base_batch_size=16)
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r18_fpn_1xb8-1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r18_fpn_1xb8-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2e88d68e3366671e402b1766d3b456593262a9b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r18_fpn_1xb8-1x_coco.py
@@ -0,0 +1,24 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# data
+train_dataloader = dict(batch_size=8)
+
+# model
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
+
+# Note: If the learning rate is set to 0.0025, the mAP will be 32.4.
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001))
+# TODO: support auto scaling lr
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (1 GPUs) x (8 samples per GPU)
+# auto_scale_lr = dict(base_batch_size=8)
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r18_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6833f3f4711ec28a25ae8a51687fc4ac13ffb89
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r18_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ba1cdddc4707b40f549189f768457312635669d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        # use caffe img_norm
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_ms-1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..93687d8c27b73ae2a172b45a733345e5fc036f03
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_ms-1x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './retinanet_r50-caffe_fpn_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d1604fb9efd5deb11ffc04f6f9685739f82aea9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_ms-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './retinanet_r50-caffe_fpn_ms-1x_coco.py'
+# training schedule for 2x
+train_cfg = dict(max_epochs=24)
+
+# learning rate policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a6d42a13c27d5fc0b8072e2c96ef5d15a0f248c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50-caffe_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './retinanet_r50-caffe_fpn_ms-1x_coco.py'
+
+# training schedule for 2x
+train_cfg = dict(max_epochs=36)
+
+# learning rate policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..00d2567b245dba2b2be815a92146ea1364e1e799
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_1x_coco.py
@@ -0,0 +1,10 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py',
+    './retinanet_tta.py'
+]
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..47511b78ed2edb43121de2fc27986f6bb81abcfa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_2x_coco.py
@@ -0,0 +1,25 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# training schedule for 2x
+train_cfg = dict(max_epochs=24)
+
+# learning rate policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f10db2f3c84d4b1970f13f54c563408487d04af
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py
@@ -0,0 +1,21 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../common/lsj-200e_coco-detection.py'
+]
+
+image_size = (1024, 1024)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+model = dict(data_preprocessor=dict(batch_augments=batch_augments))
+
+train_dataloader = dict(batch_size=8, num_workers=4)
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    optimizer=dict(
+        type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_90k_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e1b2fd950a0293220cc93ce3f3b377b4163f3aa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_90k_coco.py
@@ -0,0 +1,24 @@
+_base_ = 'retinanet_r50_fpn_1x_coco.py'
+
+# training schedule for 90k
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=90000,
+    val_interval=10000)
+# learning rate policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=90000,
+        by_epoch=False,
+        milestones=[60000, 80000],
+        gamma=0.1)
+]
+train_dataloader = dict(sampler=dict(type='InfiniteSampler'))
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000))
+
+log_processor = dict(by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_amp-1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_amp-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..acf5266337b8e73957a1cdf2b06076c1733b4d56
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_amp-1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+
+# MMEngine support the following two ways, users can choose
+# according to convenience
+# optim_wrapper = dict(type='AmpOptimWrapper')
+_base_.optim_wrapper.type = 'AmpOptimWrapper'
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_ms-640-800-3x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_ms-640-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d91cf8ce0df15968706631d7eac76e834cba93dc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_r50_fpn_ms-640-800-3x_coco.py
@@ -0,0 +1,4 @@
+_base_ = ['../_base_/models/retinanet_r50_fpn.py', '../common/ms_3x_coco.py']
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_tta.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_tta.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0f37e0ab25e2aff1ad55e76a7ee02777293d507
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_tta.py
@@ -0,0 +1,23 @@
+tta_model = dict(
+    type='DetTTAModel',
+    tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))
+
+img_scales = [(1333, 800), (666, 400), (2000, 1200)]
+tta_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[[
+            dict(type='Resize', scale=s, keep_ratio=True) for s in img_scales
+        ], [
+            dict(type='RandomFlip', prob=1.),
+            dict(type='RandomFlip', prob=0.)
+        ], [dict(type='LoadAnnotations', with_bbox=True)],
+                    [
+                        dict(
+                            type='PackDetInputs',
+                            meta_keys=('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'flip',
+                                       'flip_direction'))
+                    ]])
+]
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..765a4c2cc0f69bf13891bf371c94c17b6cd5f30c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-32x4d_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-32x4d_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..14de96faf70180d7828a670630a8f48a3cd1081d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-32x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..948cd18e4d995d18d947b345ba7229b5cad60eb1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-64x4d_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-64x4d_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad04b6eea793add40c81d1d7096481597357d5bd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-64x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './retinanet_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..853134160cd2128cac7954cca7e008444522fd2c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/retinanet/retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py
@@ -0,0 +1,11 @@
+_base_ = ['../_base_/models/retinanet_r50_fpn.py', '../common/ms_3x_coco.py']
+# optimizer
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
+optim_wrapper = dict(optimizer=dict(type='SGD', lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/rpn/metafile.yml b/mmde/mmdet/.mim/configs/rpn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9796ead6d2ed28f0e10e16165103e31c289dae26
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/metafile.yml
@@ -0,0 +1,127 @@
+Collections:
+  - Name: RPN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1506.01497
+      Title: "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"
+    README: configs/rpn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/rpn.py#L6
+      Version: v2.0.0
+
+Models:
+  - Name: rpn_r50-caffe_fpn_1x_coco
+    In Collection: RPN
+    Config: configs/rpn/rpn_r50-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.5
+      Training Resources: 8x V100 GPUs
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          AR@1000: 58.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_caffe_fpn_1x_coco/rpn_r50_caffe_fpn_1x_coco_20200531-5b903a37.pth
+
+  - Name: rpn_r50_fpn_1x_coco
+    In Collection: RPN
+    Config: configs/rpn/rpn_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      Training Resources: 8x V100 GPUs
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          AR@1000: 58.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_1x_coco/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth
+
+  - Name: rpn_r50_fpn_2x_coco
+    In Collection: RPN
+    Config: rpn_r50_fpn_2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          AR@1000: 58.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_2x_coco/rpn_r50_fpn_2x_coco_20200131-0728c9b3.pth
+
+  - Name: rpn_r101-caffe_fpn_1x_coco
+    In Collection: RPN
+    Config: configs/rpn/rpn_r101-caffe_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.4
+      Training Resources: 8x V100 GPUs
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          AR@1000: 60.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_caffe_fpn_1x_coco/rpn_r101_caffe_fpn_1x_coco_20200531-0629a2e2.pth
+
+  - Name: rpn_x101-32x4d_fpn_1x_coco
+    In Collection: RPN
+    Config: configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      Training Resources: 8x V100 GPUs
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          AR@1000: 60.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_1x_coco/rpn_x101_32x4d_fpn_1x_coco_20200219-b02646c6.pth
+
+  - Name: rpn_x101-32x4d_fpn_2x_coco
+    In Collection: RPN
+    Config: configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py
+    Metadata:
+      Training Resources: 8x V100 GPUs
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          AR@1000: 61.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_2x_coco/rpn_x101_32x4d_fpn_2x_coco_20200208-d22bd0bb.pth
+
+  - Name: rpn_x101-64x4d_fpn_1x_coco
+    In Collection: RPN
+    Config: configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 10.1
+      Training Resources: 8x V100 GPUs
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          AR@1000: 61.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_1x_coco/rpn_x101_64x4d_fpn_1x_coco_20200208-cde6f7dd.pth
+
+  - Name: rpn_x101-64x4d_fpn_2x_coco
+    In Collection: RPN
+    Config: configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py
+    Metadata:
+      Training Resources: 8x V100 GPUs
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          AR@1000: 61.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_2x_coco/rpn_x101_64x4d_fpn_2x_coco_20200208-c65f524f.pth
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_r101-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_r101-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..22977af8cb761f9415c55f8fa6d458937a00ba06
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_r101-caffe_fpn_1x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './rpn_r50-caffe_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')))
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..962728ff08abb4652c617a085649575b6cfdcbf8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_r101_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_r101_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac7671c1c2421c0caa7b42d012cc3a2edc068934
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_r101_fpn_2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './rpn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_r50-caffe-c4_1x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_r50-caffe-c4_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..76b878c874d6545e537ee8a9618e83bb095de281
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_r50-caffe-c4_1x_coco.py
@@ -0,0 +1,8 @@
+_base_ = [
+    '../_base_/models/rpn_r50-caffe-c4.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+val_evaluator = dict(metric='proposal_fast')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_r50-caffe_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_r50-caffe_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..530f365210572f9bf55ca2775bfdbeba98567076
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_r50-caffe_fpn_1x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'
+# use caffe img_norm
+model = dict(
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        norm_cfg=dict(requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')))
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fe88d395b8a32e7513ede3c0c724e29b3554da6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_r50_fpn_1x_coco.py
@@ -0,0 +1,36 @@
+_base_ = [
+    '../_base_/models/rpn_r50_fpn.py', '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+val_evaluator = dict(metric='proposal_fast')
+test_evaluator = val_evaluator
+
+# inference on val dataset and dump the proposals with evaluate metric
+# data_root = 'data/coco/'
+# test_evaluator = [
+#     dict(
+#         type='DumpProposals',
+#         output_dir=data_root + 'proposals/',
+#         proposals_file='rpn_r50_fpn_1x_val2017.pkl'),
+#     dict(
+#         type='CocoMetric',
+#         ann_file=data_root + 'annotations/instances_val2017.json',
+#         metric='proposal_fast',
+#         backend_args={{_base_.backend_args}},
+#         format_only=False)
+# ]
+
+# inference on training dataset and dump the proposals without evaluate metric
+# data_root = 'data/coco/'
+# test_dataloader = dict(
+#     dataset=dict(
+#         ann_file='annotations/instances_train2017.json',
+#         data_prefix=dict(img='train2017/')))
+#
+# test_evaluator = [
+#     dict(
+#         type='DumpProposals',
+#         output_dir=data_root + 'proposals/',
+#         proposals_file='rpn_r50_fpn_1x_train2017.pkl'),
+# ]
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_r50_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_r50_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ebccbcfaf394fcbb4fbdaea51abdd583f628cac
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_r50_fpn_2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'
+
+# learning policy
+max_epochs = 24
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0c73948ac56afa34b9d6c8d22d6158271306b8c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6880b762abc8f5d3bf12f278054d76958756fb2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './rpn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..96e691a912c424f09add038c75631a2e1fefeffc
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './rpn_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4182a39667c47d774a1df9d34a1bc2fe60b45538
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './rpn_r50_fpn_2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-l_8xb256-rsb-a1-600e_in1k.py b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-l_8xb256-rsb-a1-600e_in1k.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2e70539f05da69cca53f273d11e3296c87c4eda
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-l_8xb256-rsb-a1-600e_in1k.py
@@ -0,0 +1,5 @@
+_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py'
+
+model = dict(
+    backbone=dict(deepen_factor=1, widen_factor=1),
+    head=dict(in_channels=1024))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-m_8xb256-rsb-a1-600e_in1k.py b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-m_8xb256-rsb-a1-600e_in1k.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1b1352dd91a803eeafe80f587203f96a247c27f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-m_8xb256-rsb-a1-600e_in1k.py
@@ -0,0 +1,5 @@
+_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py'
+
+model = dict(
+    backbone=dict(deepen_factor=0.67, widen_factor=0.75),
+    head=dict(in_channels=768))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-s_8xb256-rsb-a1-600e_in1k.py b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-s_8xb256-rsb-a1-600e_in1k.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcfd2ea47d54408ef6d2fe225b57c5c9e540918a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-s_8xb256-rsb-a1-600e_in1k.py
@@ -0,0 +1,64 @@
+_base_ = [
+    'mmpretrain::_base_/datasets/imagenet_bs256_rsb_a12.py',
+    'mmpretrain::_base_/schedules/imagenet_bs2048_rsb.py',
+    'mmpretrain::_base_/default_runtime.py'
+]
+
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='mmdet.CSPNeXt',
+        arch='P5',
+        out_indices=(4, ),
+        expand_ratio=0.5,
+        deepen_factor=0.33,
+        widen_factor=0.5,
+        channel_attention=True,
+        norm_cfg=dict(type='BN'),
+        act_cfg=dict(type='mmdet.SiLU')),
+    neck=dict(type='GlobalAveragePooling'),
+    head=dict(
+        type='LinearClsHead',
+        num_classes=1000,
+        in_channels=512,
+        loss=dict(
+            type='LabelSmoothLoss',
+            label_smooth_val=0.1,
+            mode='original',
+            loss_weight=1.0),
+        topk=(1, 5)),
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.2),
+        dict(type='CutMix', alpha=1.0)
+    ]))
+
+# dataset settings
+train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True))
+
+# schedule settings
+optim_wrapper = dict(
+    optimizer=dict(weight_decay=0.01),
+    paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.),
+)
+
+param_scheduler = [
+    # warm up learning rate scheduler
+    dict(
+        type='LinearLR',
+        start_factor=0.0001,
+        by_epoch=True,
+        begin=0,
+        end=5,
+        # update by iter
+        convert_to_iter_based=True),
+    # main learning rate scheduler
+    dict(
+        type='CosineAnnealingLR',
+        T_max=595,
+        eta_min=1.0e-6,
+        by_epoch=True,
+        begin=5,
+        end=600)
+]
+
+train_cfg = dict(by_epoch=True, max_epochs=600)
diff --git a/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py
new file mode 100644
index 0000000000000000000000000000000000000000..af3170bdc51778c4601d4426aa88cc27c608f100
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py
@@ -0,0 +1,5 @@
+_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py'
+
+model = dict(
+    backbone=dict(deepen_factor=0.167, widen_factor=0.375),
+    head=dict(in_channels=384))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-x_8xb256-rsb-a1-600e_in1k.py b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-x_8xb256-rsb-a1-600e_in1k.py
new file mode 100644
index 0000000000000000000000000000000000000000..edec48d78dbefdb7783c5dd50e97873e29ea6497
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/classification/cspnext-x_8xb256-rsb-a1-600e_in1k.py
@@ -0,0 +1,5 @@
+_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py'
+
+model = dict(
+    backbone=dict(deepen_factor=1.33, widen_factor=1.25),
+    head=dict(in_channels=1280))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/metafile.yml b/mmde/mmdet/.mim/configs/rtmdet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a62abcb2faabb2e7d6c4a6c7d3b492392eba9775
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/metafile.yml
@@ -0,0 +1,242 @@
+Collections:
+  - Name: RTMDet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+        - Flat Cosine Annealing
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - CSPNeXt
+        - CSPNeXtPAFPN
+    README: configs/rtmdet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v3.0.0rc1/mmdet/models/detectors/rtmdet.py#L6
+      Version: v3.0.0rc1
+
+Models:
+  - Name: rtmdet_tiny_8xb32-300e_coco
+    Alias:
+      - rtmdet-t
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py
+    Metadata:
+      Training Memory (GB): 11.7
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth
+
+  - Name: rtmdet_s_8xb32-300e_coco
+    Alias:
+      - rtmdet-s
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet_s_8xb32-300e_coco.py
+    Metadata:
+      Training Memory (GB): 15.9
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_s_8xb32-300e_coco/rtmdet_s_8xb32-300e_coco_20220905_161602-387a891e.pth
+
+  - Name: rtmdet_m_8xb32-300e_coco
+    Alias:
+      - rtmdet-m
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet_m_8xb32-300e_coco.py
+    Metadata:
+      Training Memory (GB): 27.8
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.1
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_m_8xb32-300e_coco/rtmdet_m_8xb32-300e_coco_20220719_112220-229f527c.pth
+
+  - Name: rtmdet_l_8xb32-300e_coco
+    Alias:
+      - rtmdet-l
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet_l_8xb32-300e_coco.py
+    Metadata:
+      Training Memory (GB): 43.2
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 51.3
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_l_8xb32-300e_coco/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth
+
+  - Name: rtmdet_x_8xb32-300e_coco
+    Alias:
+      - rtmdet-x
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet_x_8xb32-300e_coco.py
+    Metadata:
+      Training Memory (GB): 61.1
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 52.6
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_x_8xb32-300e_coco/rtmdet_x_8xb32-300e_coco_20220715_230555-cc79b9ae.pth
+
+  - Name: rtmdet_x_p6_4xb8-300e_coco
+    Alias:
+      - rtmdet-x_p6
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet_x_p6_4xb8-300e_coco.py
+    Metadata:
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 54.9
+    Weights: https://github.com/orange0-jp/orange-weights/releases/download/v0.1.0rtmdet-p6/rtmdet_x_p6_4xb8-300e_coco-bf32be58.pth
+
+  - Name: rtmdet_l_convnext_b_4xb32-100e_coco
+    Alias:
+      - rtmdet-l_convnext_b
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet_l_convnext_b_4xb32-100e_coco.py
+    Metadata:
+      Epochs: 100
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 53.1
+    Weights: https://github.com/orange0-jp/orange-weights/releases/download/v0.1.0rtmdet-swin-convnext/rtmdet_l_convnext_b_4xb32-100e_coco-d4731b3d.pth
+
+  - Name: rtmdet_l_swin_b_4xb32-100e_coco
+    Alias:
+      - rtmdet-l_swin_b
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet_l_swin_b_4xb32-100e_coco.py
+    Metadata:
+      Epochs: 100
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 52.4
+    Weights: https://github.com/orange0-jp/orange-weights/releases/download/v0.1.0rtmdet-swin-convnext/rtmdet_l_swin_b_4xb32-100e_coco-0828ce5d.pth
+
+  - Name: rtmdet_l_swin_b_p6_4xb16-100e_coco
+    Alias:
+      - rtmdet-l_swin_b_p6
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet_l_swin_b_p6_4xb16-100e_coco.py
+    Metadata:
+      Epochs: 100
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 56.4
+    Weights: https://github.com/orange0-jp/orange-weights/releases/download/v0.1.0rtmdet-swin-convnext/rtmdet_l_swin_b_p6_4xb16-100e_coco-a1486b6f.pth
+
+  - Name: rtmdet-ins_tiny_8xb32-300e_coco
+    Alias:
+      - rtmdet-ins-t
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py
+    Metadata:
+      Training Memory (GB): 18.4
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 35.4
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco/rtmdet-ins_tiny_8xb32-300e_coco_20221130_151727-ec670f7e.pth
+
+  - Name: rtmdet-ins_s_8xb32-300e_coco
+    Alias:
+      - rtmdet-ins-s
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py
+    Metadata:
+      Training Memory (GB): 27.6
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 38.7
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_s_8xb32-300e_coco/rtmdet-ins_s_8xb32-300e_coco_20221121_212604-fdc5d7ec.pth
+
+  - Name: rtmdet-ins_m_8xb32-300e_coco
+    Alias:
+      - rtmdet-ins-m
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet-ins_m_8xb32-300e_coco.py
+    Metadata:
+      Training Memory (GB): 42.5
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 42.1
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_m_8xb32-300e_coco/rtmdet-ins_m_8xb32-300e_coco_20221123_001039-6eba602e.pth
+
+  - Name: rtmdet-ins_l_8xb32-300e_coco
+    Alias:
+      - rtmdet-ins-l
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py
+    Metadata:
+      Training Memory (GB): 59.8
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 51.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 43.7
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_l_8xb32-300e_coco/rtmdet-ins_l_8xb32-300e_coco_20221124_103237-78d1d652.pth
+
+  - Name: rtmdet-ins_x_8xb16-300e_coco
+    Alias:
+      - rtmdet-ins-x
+    In Collection: RTMDet
+    Config: configs/rtmdet/rtmdet-ins_x_8xb16-300e_coco.py
+    Metadata:
+      Training Memory (GB): 33.7
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 52.4
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 44.6
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_x_8xb16-300e_coco/rtmdet-ins_x_8xb16-300e_coco_20221124_111313-33d4595b.pth
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b4b9240a64d39d8a16352ef87de53af9e81ac96
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py
@@ -0,0 +1,104 @@
+_base_ = './rtmdet_l_8xb32-300e_coco.py'
+model = dict(
+    bbox_head=dict(
+        _delete_=True,
+        type='RTMDetInsSepBNHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=2,
+        share_conv=True,
+        pred_kernel_size=1,
+        feat_channels=256,
+        act_cfg=dict(type='SiLU', inplace=True),
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        anchor_generator=dict(
+            type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]),
+        bbox_coder=dict(type='DistancePointBBoxCoder'),
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            beta=2.0,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        loss_mask=dict(
+            type='DiceLoss', loss_weight=2.0, eps=5e-6, reduction='mean')),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100,
+        mask_thr_binary=0.5),
+)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
+    dict(
+        type='RandomResize',
+        scale=(1280, 1280),
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_size=(640, 640),
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(
+        type='CachedMixUp',
+        img_scale=(640, 640),
+        ratio_range=(1.0, 1.0),
+        max_cached_images=20,
+        pad_val=(114, 114, 114)),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(pin_memory=True, dataset=dict(pipeline=train_pipeline))
+
+train_pipeline_stage2 = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='RandomResize',
+        scale=(640, 640),
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_size=(640, 640),
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(type='PackDetInputs')
+]
+custom_hooks = [
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0002,
+        update_buffers=True,
+        priority=49),
+    dict(
+        type='PipelineSwitchHook',
+        switch_epoch=280,
+        switch_pipeline=train_pipeline_stage2)
+]
+
+val_evaluator = dict(metric=['bbox', 'segm'])
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_m_8xb32-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_m_8xb32-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..66da9148775b425c6b0052beb04f9c8ca17257d9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_m_8xb32-300e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './rtmdet-ins_l_8xb32-300e_coco.py'
+
+model = dict(
+    backbone=dict(deepen_factor=0.67, widen_factor=0.75),
+    neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2),
+    bbox_head=dict(in_channels=192, feat_channels=192))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..28bc21cc93bb36d2d2fc8601b06bb0f0c58d6d49
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py
@@ -0,0 +1,80 @@
+_base_ = './rtmdet-ins_l_8xb32-300e_coco.py'
+checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth'  # noqa
+model = dict(
+    backbone=dict(
+        deepen_factor=0.33,
+        widen_factor=0.5,
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)),
+    neck=dict(in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1),
+    bbox_head=dict(in_channels=128, feat_channels=128))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
+    dict(
+        type='RandomResize',
+        scale=(1280, 1280),
+        ratio_range=(0.5, 2.0),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_size=(640, 640),
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(
+        type='CachedMixUp',
+        img_scale=(640, 640),
+        ratio_range=(1.0, 1.0),
+        max_cached_images=20,
+        pad_val=(114, 114, 114)),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
+    dict(type='PackDetInputs')
+]
+
+train_pipeline_stage2 = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='RandomResize',
+        scale=(640, 640),
+        ratio_range=(0.5, 2.0),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_size=(640, 640),
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+custom_hooks = [
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0002,
+        update_buffers=True,
+        priority=49),
+    dict(
+        type='PipelineSwitchHook',
+        switch_epoch=280,
+        switch_pipeline=train_pipeline_stage2)
+]
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..954f911614e75eb9910effbf1bbc1d7b01120276
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py
@@ -0,0 +1,48 @@
+_base_ = './rtmdet-ins_s_8xb32-300e_coco.py'
+
+checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        deepen_factor=0.167,
+        widen_factor=0.375,
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)),
+    neck=dict(in_channels=[96, 192, 384], out_channels=96, num_csp_blocks=1),
+    bbox_head=dict(in_channels=96, feat_channels=96))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(
+        type='LoadAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(
+        type='CachedMosaic',
+        img_scale=(640, 640),
+        pad_val=114.0,
+        max_cached_images=20,
+        random_pop=False),
+    dict(
+        type='RandomResize',
+        scale=(1280, 1280),
+        ratio_range=(0.5, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(
+        type='CachedMixUp',
+        img_scale=(640, 640),
+        ratio_range=(1.0, 1.0),
+        max_cached_images=10,
+        random_pop=False,
+        pad_val=(114, 114, 114),
+        prob=0.5),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_x_8xb16-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_x_8xb16-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..daaa640edac6b2114caf13b650d99d7c7632629a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet-ins_x_8xb16-300e_coco.py
@@ -0,0 +1,31 @@
+_base_ = './rtmdet-ins_l_8xb32-300e_coco.py'
+
+model = dict(
+    backbone=dict(deepen_factor=1.33, widen_factor=1.25),
+    neck=dict(
+        in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4),
+    bbox_head=dict(in_channels=320, feat_channels=320))
+
+base_lr = 0.002
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=base_lr))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0e-5,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        # use cosine lr from 150 to 300 epoch
+        type='CosineAnnealingLR',
+        eta_min=base_lr * 0.05,
+        begin=_base_.max_epochs // 2,
+        end=_base_.max_epochs,
+        T_max=_base_.max_epochs // 2,
+        by_epoch=True,
+        convert_to_iter_based=True),
+]
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cce4d89c84a81d7aa22197cd6dd70fe08637a35
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py
@@ -0,0 +1,179 @@
+_base_ = [
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py',
+    '../_base_/datasets/coco_detection.py', './rtmdet_tta.py'
+]
+model = dict(
+    type='RTMDet',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.53, 116.28, 123.675],
+        std=[57.375, 57.12, 58.395],
+        bgr_to_rgb=False,
+        batch_augments=None),
+    backbone=dict(
+        type='CSPNeXt',
+        arch='P5',
+        expand_ratio=0.5,
+        deepen_factor=1,
+        widen_factor=1,
+        channel_attention=True,
+        norm_cfg=dict(type='SyncBN'),
+        act_cfg=dict(type='SiLU', inplace=True)),
+    neck=dict(
+        type='CSPNeXtPAFPN',
+        in_channels=[256, 512, 1024],
+        out_channels=256,
+        num_csp_blocks=3,
+        expand_ratio=0.5,
+        norm_cfg=dict(type='SyncBN'),
+        act_cfg=dict(type='SiLU', inplace=True)),
+    bbox_head=dict(
+        type='RTMDetSepBNHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=2,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]),
+        bbox_coder=dict(type='DistancePointBBoxCoder'),
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            beta=2.0,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        with_objectness=False,
+        exp_on_reg=True,
+        share_conv=True,
+        pred_kernel_size=1,
+        norm_cfg=dict(type='SyncBN'),
+        act_cfg=dict(type='SiLU', inplace=True)),
+    train_cfg=dict(
+        assigner=dict(type='DynamicSoftLabelAssigner', topk=13),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=30000,
+        min_bbox_size=0,
+        score_thr=0.001,
+        nms=dict(type='nms', iou_threshold=0.65),
+        max_per_img=300),
+)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
+    dict(
+        type='RandomResize',
+        scale=(1280, 1280),
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(
+        type='CachedMixUp',
+        img_scale=(640, 640),
+        ratio_range=(1.0, 1.0),
+        max_cached_images=20,
+        pad_val=(114, 114, 114)),
+    dict(type='PackDetInputs')
+]
+
+train_pipeline_stage2 = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=(640, 640),
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(640, 640), keep_ratio=True),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=32,
+    num_workers=10,
+    batch_sampler=None,
+    pin_memory=True,
+    dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=5, num_workers=10, dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+max_epochs = 300
+stage2_num_epochs = 20
+base_lr = 0.004
+interval = 10
+
+train_cfg = dict(
+    max_epochs=max_epochs,
+    val_interval=interval,
+    dynamic_intervals=[(max_epochs - stage2_num_epochs, 1)])
+
+val_evaluator = dict(proposal_nums=(100, 1, 10))
+test_evaluator = val_evaluator
+
+# optimizer
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
+    paramwise_cfg=dict(
+        norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0e-5,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        # use cosine lr from 150 to 300 epoch
+        type='CosineAnnealingLR',
+        eta_min=base_lr * 0.05,
+        begin=max_epochs // 2,
+        end=max_epochs,
+        T_max=max_epochs // 2,
+        by_epoch=True,
+        convert_to_iter_based=True),
+]
+
+# hooks
+default_hooks = dict(
+    checkpoint=dict(
+        interval=interval,
+        max_keep_ckpts=3  # only keep latest 3 checkpoints
+    ))
+custom_hooks = [
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0002,
+        update_buffers=True,
+        priority=49),
+    dict(
+        type='PipelineSwitchHook',
+        switch_epoch=max_epochs - stage2_num_epochs,
+        switch_pipeline=train_pipeline_stage2)
+]
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_convnext_b_4xb32-100e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_convnext_b_4xb32-100e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..85af292bcaba2e1853ed4f3a3f5818c0c0d5813e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_convnext_b_4xb32-100e_coco.py
@@ -0,0 +1,81 @@
+_base_ = './rtmdet_l_8xb32-300e_coco.py'
+
+custom_imports = dict(
+    imports=['mmpretrain.models'], allow_failed_imports=False)
+
+norm_cfg = dict(type='GN', num_groups=32)
+checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_in1k-384px_20221219-4570f792.pth'  # noqa
+model = dict(
+    type='RTMDet',
+    data_preprocessor=dict(
+        _delete_=True,
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        batch_augments=None),
+    backbone=dict(
+        _delete_=True,
+        type='mmpretrain.ConvNeXt',
+        arch='base',
+        out_indices=[1, 2, 3],
+        drop_path_rate=0.7,
+        layer_scale_init_value=1.0,
+        gap_before_final_norm=False,
+        with_cp=True,
+        init_cfg=dict(
+            type='Pretrained', checkpoint=checkpoint_file,
+            prefix='backbone.')),
+    neck=dict(in_channels=[256, 512, 1024], norm_cfg=norm_cfg),
+    bbox_head=dict(norm_cfg=norm_cfg))
+
+max_epochs = 100
+stage2_num_epochs = 10
+interval = 10
+base_lr = 0.001
+
+train_cfg = dict(
+    max_epochs=max_epochs,
+    val_interval=interval,
+    dynamic_intervals=[(max_epochs - stage2_num_epochs, 1)])
+
+optim_wrapper = dict(
+    constructor='LearningRateDecayOptimizerConstructor',
+    paramwise_cfg={
+        'decay_rate': 0.8,
+        'decay_type': 'layer_wise',
+        'num_layers': 12
+    },
+    optimizer=dict(lr=base_lr))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0e-5,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        # use cosine lr from 50 to 100 epoch
+        type='CosineAnnealingLR',
+        eta_min=base_lr * 0.05,
+        begin=max_epochs // 2,
+        end=max_epochs,
+        T_max=max_epochs // 2,
+        by_epoch=True,
+        convert_to_iter_based=True),
+]
+
+custom_hooks = [
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0002,
+        update_buffers=True,
+        priority=49),
+    dict(
+        type='PipelineSwitchHook',
+        switch_epoch=max_epochs - stage2_num_epochs,
+        switch_pipeline={{_base_.train_pipeline_stage2}})
+]
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_swin_b_4xb32-100e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_swin_b_4xb32-100e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..84b0e0fa7d18848a4c1e305985e33e69e3196790
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_swin_b_4xb32-100e_coco.py
@@ -0,0 +1,78 @@
+_base_ = './rtmdet_l_8xb32-300e_coco.py'
+
+norm_cfg = dict(type='GN', num_groups=32)
+checkpoint = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth'  # noqa
+model = dict(
+    type='RTMDet',
+    data_preprocessor=dict(
+        _delete_=True,
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        batch_augments=None),
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=True,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=checkpoint)),
+    neck=dict(in_channels=[256, 512, 1024], norm_cfg=norm_cfg),
+    bbox_head=dict(norm_cfg=norm_cfg))
+
+max_epochs = 100
+stage2_num_epochs = 10
+interval = 10
+base_lr = 0.001
+
+train_cfg = dict(
+    max_epochs=max_epochs,
+    val_interval=interval,
+    dynamic_intervals=[(max_epochs - stage2_num_epochs, 1)])
+
+optim_wrapper = dict(optimizer=dict(lr=base_lr))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0e-5,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        # use cosine lr from 50 to 100 epoch
+        type='CosineAnnealingLR',
+        eta_min=base_lr * 0.05,
+        begin=max_epochs // 2,
+        end=max_epochs,
+        T_max=max_epochs // 2,
+        by_epoch=True,
+        convert_to_iter_based=True),
+]
+
+custom_hooks = [
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0002,
+        update_buffers=True,
+        priority=49),
+    dict(
+        type='PipelineSwitchHook',
+        switch_epoch=max_epochs - stage2_num_epochs,
+        switch_pipeline={{_base_.train_pipeline_stage2}})
+]
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_swin_b_p6_4xb16-100e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_swin_b_p6_4xb16-100e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..37d4215c3f014ef20c7817875cbc1689186e0766
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_l_swin_b_p6_4xb16-100e_coco.py
@@ -0,0 +1,114 @@
+_base_ = './rtmdet_l_swin_b_4xb32-100e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depths=[2, 2, 18, 2, 1],
+        num_heads=[4, 8, 16, 32, 64],
+        strides=(4, 2, 2, 2, 2),
+        out_indices=(1, 2, 3, 4)),
+    neck=dict(in_channels=[256, 512, 1024, 2048]),
+    bbox_head=dict(
+        anchor_generator=dict(
+            type='MlvlPointGenerator', offset=0, strides=[8, 16, 32, 64])))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='CachedMosaic', img_scale=(1280, 1280), pad_val=114.0),
+    dict(
+        type='RandomResize',
+        scale=(2560, 2560),
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(1280, 1280)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(1280, 1280), pad_val=dict(img=(114, 114, 114))),
+    dict(
+        type='CachedMixUp',
+        img_scale=(1280, 1280),
+        ratio_range=(1.0, 1.0),
+        max_cached_images=20,
+        pad_val=(114, 114, 114)),
+    dict(type='PackDetInputs')
+]
+
+train_pipeline_stage2 = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=(1280, 1280),
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(1280, 1280)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(1280, 1280), pad_val=dict(img=(114, 114, 114))),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1280, 1280), keep_ratio=True),
+    dict(type='Pad', size=(1280, 1280), pad_val=dict(img=(114, 114, 114))),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=16, num_workers=20, dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(num_workers=20, dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+max_epochs = 100
+stage2_num_epochs = 10
+
+custom_hooks = [
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0002,
+        update_buffers=True,
+        priority=49),
+    dict(
+        type='PipelineSwitchHook',
+        switch_epoch=max_epochs - stage2_num_epochs,
+        switch_pipeline=train_pipeline_stage2)
+]
+
+img_scales = [(1280, 1280), (640, 640), (1920, 1920)]
+tta_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(type='Resize', scale=s, keep_ratio=True)
+                for s in img_scales
+            ],
+            [
+                # ``RandomFlip`` must be placed before ``Pad``, otherwise
+                # bounding box coordinates after flipping cannot be
+                # recovered correctly.
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='Pad',
+                    size=(1920, 1920),
+                    pad_val=dict(img=(114, 114, 114))),
+            ],
+            [dict(type='LoadAnnotations', with_bbox=True)],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'scale_factor', 'flip', 'flip_direction'))
+            ]
+        ])
+]
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_m_8xb32-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_m_8xb32-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c83f5a60bd7d9f85f46574ee4cd19027391b5e1e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_m_8xb32-300e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './rtmdet_l_8xb32-300e_coco.py'
+
+model = dict(
+    backbone=dict(deepen_factor=0.67, widen_factor=0.75),
+    neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2),
+    bbox_head=dict(in_channels=192, feat_channels=192))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbf76247b74e94735eea0dd70ce6ac9e57f4dadf
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py
@@ -0,0 +1,62 @@
+_base_ = './rtmdet_l_8xb32-300e_coco.py'
+checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth'  # noqa
+model = dict(
+    backbone=dict(
+        deepen_factor=0.33,
+        widen_factor=0.5,
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)),
+    neck=dict(in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1),
+    bbox_head=dict(in_channels=128, feat_channels=128, exp_on_reg=False))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0),
+    dict(
+        type='RandomResize',
+        scale=(1280, 1280),
+        ratio_range=(0.5, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(
+        type='CachedMixUp',
+        img_scale=(640, 640),
+        ratio_range=(1.0, 1.0),
+        max_cached_images=20,
+        pad_val=(114, 114, 114)),
+    dict(type='PackDetInputs')
+]
+
+train_pipeline_stage2 = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=(640, 640),
+        ratio_range=(0.5, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+custom_hooks = [
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0002,
+        update_buffers=True,
+        priority=49),
+    dict(
+        type='PipelineSwitchHook',
+        switch_epoch=280,
+        switch_pipeline=train_pipeline_stage2)
+]
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a686f4a7f0c4c3bed956c2a3fa504ea8863c669d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py
@@ -0,0 +1,43 @@
+_base_ = './rtmdet_s_8xb32-300e_coco.py'
+
+checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        deepen_factor=0.167,
+        widen_factor=0.375,
+        init_cfg=dict(
+            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)),
+    neck=dict(in_channels=[96, 192, 384], out_channels=96, num_csp_blocks=1),
+    bbox_head=dict(in_channels=96, feat_channels=96, exp_on_reg=False))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='CachedMosaic',
+        img_scale=(640, 640),
+        pad_val=114.0,
+        max_cached_images=20,
+        random_pop=False),
+    dict(
+        type='RandomResize',
+        scale=(1280, 1280),
+        ratio_range=(0.5, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(640, 640)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))),
+    dict(
+        type='CachedMixUp',
+        img_scale=(640, 640),
+        ratio_range=(1.0, 1.0),
+        max_cached_images=10,
+        random_pop=False,
+        pad_val=(114, 114, 114),
+        prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_tta.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_tta.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dde36de3ff06576944a351de9daf53746103f21
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_tta.py
@@ -0,0 +1,36 @@
+tta_model = dict(
+    type='DetTTAModel',
+    tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.6), max_per_img=100))
+
+img_scales = [(640, 640), (320, 320), (960, 960)]
+tta_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(type='Resize', scale=s, keep_ratio=True)
+                for s in img_scales
+            ],
+            [
+                # ``RandomFlip`` must be placed before ``Pad``, otherwise
+                # bounding box coordinates after flipping cannot be
+                # recovered correctly.
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='Pad',
+                    size=(960, 960),
+                    pad_val=dict(img=(114, 114, 114))),
+            ],
+            [dict(type='LoadAnnotations', with_bbox=True)],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'scale_factor', 'flip', 'flip_direction'))
+            ]
+        ])
+]
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_x_8xb32-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_x_8xb32-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..16a33632c00b19b270b237f5dcd8f603350ac0c9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_x_8xb32-300e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './rtmdet_l_8xb32-300e_coco.py'
+
+model = dict(
+    backbone=dict(deepen_factor=1.33, widen_factor=1.25),
+    neck=dict(
+        in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4),
+    bbox_head=dict(in_channels=320, feat_channels=320))
diff --git a/mmde/mmdet/.mim/configs/rtmdet/rtmdet_x_p6_4xb8-300e_coco.py b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_x_p6_4xb8-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1bb7fa6a78812e5a415acfb60eccedae9b884e2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/rtmdet/rtmdet_x_p6_4xb8-300e_coco.py
@@ -0,0 +1,132 @@
+_base_ = './rtmdet_x_8xb32-300e_coco.py'
+
+model = dict(
+    backbone=dict(arch='P6', out_indices=(2, 3, 4, 5)),
+    neck=dict(in_channels=[320, 640, 960, 1280]),
+    bbox_head=dict(
+        anchor_generator=dict(
+            type='MlvlPointGenerator', offset=0, strides=[8, 16, 32, 64])))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='CachedMosaic', img_scale=(1280, 1280), pad_val=114.0),
+    dict(
+        type='RandomResize',
+        scale=(2560, 2560),
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(1280, 1280)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(1280, 1280), pad_val=dict(img=(114, 114, 114))),
+    dict(
+        type='CachedMixUp',
+        img_scale=(1280, 1280),
+        ratio_range=(1.0, 1.0),
+        max_cached_images=20,
+        pad_val=(114, 114, 114)),
+    dict(type='PackDetInputs')
+]
+
+train_pipeline_stage2 = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize',
+        scale=(1280, 1280),
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=(1280, 1280)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='Pad', size=(1280, 1280), pad_val=dict(img=(114, 114, 114))),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1280, 1280), keep_ratio=True),
+    dict(type='Pad', size=(1280, 1280), pad_val=dict(img=(114, 114, 114))),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=8, num_workers=20, dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=5, num_workers=20, dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+max_epochs = 300
+stage2_num_epochs = 20
+
+base_lr = 0.004 * 32 / 256
+optim_wrapper = dict(optimizer=dict(lr=base_lr))
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0e-5,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        # use cosine lr from 150 to 300 epoch
+        type='CosineAnnealingLR',
+        eta_min=base_lr * 0.05,
+        begin=max_epochs // 2,
+        end=max_epochs,
+        T_max=max_epochs // 2,
+        by_epoch=True,
+        convert_to_iter_based=True),
+]
+
+custom_hooks = [
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0002,
+        update_buffers=True,
+        priority=49),
+    dict(
+        type='PipelineSwitchHook',
+        switch_epoch=max_epochs - stage2_num_epochs,
+        switch_pipeline=train_pipeline_stage2)
+]
+
+img_scales = [(1280, 1280), (640, 640), (1920, 1920)]
+tta_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(type='Resize', scale=s, keep_ratio=True)
+                for s in img_scales
+            ],
+            [
+                # ``RandomFlip`` must be placed before ``Pad``, otherwise
+                # bounding box coordinates after flipping cannot be
+                # recovered correctly.
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='Pad',
+                    size=(1920, 1920),
+                    pad_val=dict(img=(114, 114, 114))),
+            ],
+            [dict(type='LoadAnnotations', with_bbox=True)],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'scale_factor', 'flip', 'flip_direction'))
+            ]
+        ])
+]
diff --git a/mmde/mmdet/.mim/configs/sabl/metafile.yml b/mmde/mmdet/.mim/configs/sabl/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..632b869cc4bec559d442410b1d3a4f18d74556ed
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/metafile.yml
@@ -0,0 +1,140 @@
+Collections:
+  - Name: SABL
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - SABL
+    Paper:
+      URL: https://arxiv.org/abs/1912.04260
+      Title: 'Side-Aware Boundary Localization for More Precise Object Detection'
+    README: configs/sabl/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/roi_heads/bbox_heads/sabl_head.py#L14
+      Version: v2.4.0
+
+Models:
+  - Name: sabl-faster-rcnn_r50_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/sabl_faster_rcnn_r50_fpn_1x_coco-e867595b.pth
+
+  - Name: sabl-faster-rcnn_r101_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/sabl_faster_rcnn_r101_fpn_1x_coco-f804c6c1.pth
+
+  - Name: sabl-cascade-rcnn_r50_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/sabl_cascade_rcnn_r50_fpn_1x_coco-e1748e5e.pth
+
+  - Name: sabl-cascade-rcnn_r101_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-cascade-rcnn_r101_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/sabl_cascade_rcnn_r101_fpn_1x_coco-2b83e87c.pth
+
+  - Name: sabl-retinanet_r50_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth
+
+  - Name: sabl-retinanet_r50-gn_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 38.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/sabl_retinanet_r50_fpn_gn_1x_coco-e16dfcf1.pth
+
+  - Name: sabl-retinanet_r101_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-retinanet_r101_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 39.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/sabl_retinanet_r101_fpn_1x_coco-42026904.pth
+
+  - Name: sabl-retinanet_r101-gn_fpn_1x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-retinanet_r101-gn_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/sabl_retinanet_r101_fpn_gn_1x_coco-40a893e8.pth
+
+  - Name: sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco-1e63382c.pth
+
+  - Name: sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco
+    In Collection: SABL
+    Config: configs/sabl/sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco-5342f857.pth
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-cascade-rcnn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-cascade-rcnn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..404e7fcb2ac52773c9bc74f411e66584114f378e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-cascade-rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,90 @@
+_base_ = [
+    '../_base_/models/cascade-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    roi_head=dict(bbox_head=[
+        dict(
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0)),
+        dict(
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.5),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0)),
+        dict(
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.3),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0))
+    ]))
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..69c59ca20d6c16e458292a55b8e4258a3d9a06bb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,86 @@
+_base_ = [
+    '../_base_/models/cascade-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    roi_head=dict(bbox_head=[
+        dict(
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0)),
+        dict(
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.5),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0)),
+        dict(
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.3),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0))
+    ]))
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1bf8b9c8cf1ac62d351456e7b19f75259ec0625
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py
@@ -0,0 +1,38 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    roi_head=dict(
+        bbox_head=dict(
+            _delete_=True,
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0))))
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a727bd6d3da09c86908c3c584509c5313cf732b5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,34 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            _delete_=True,
+            type='SABLHead',
+            num_classes=80,
+            cls_in_channels=256,
+            reg_in_channels=256,
+            roi_feat_size=7,
+            reg_feat_up_ratio=2,
+            reg_pre_kernel=3,
+            reg_post_kernel=3,
+            reg_pre_num=2,
+            reg_post_num=1,
+            cls_out_channels=1024,
+            reg_offset_out_channels=256,
+            reg_cls_out_channels=256,
+            num_cls_fcs=1,
+            num_reg_fcs=0,
+            reg_class_agnostic=True,
+            norm_cfg=None,
+            bbox_coder=dict(
+                type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7),
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+            loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1,
+                               loss_weight=1.0))))
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101-gn_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101-gn_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f181ad6813e4c6e3729ff80b3b8d915d84b53bf2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101-gn_fpn_1x_coco.py
@@ -0,0 +1,57 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        _delete_=True,
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        norm_cfg=norm_cfg,
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc7209aebad3efcb88945460cf20b36e6ec4b419
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py
@@ -0,0 +1,68 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# model settings
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        _delete_=True,
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        norm_cfg=norm_cfg,
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 480), (1333, 960)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac5f6d9811dc8e45cfc036b3a3d4a04e7fa5ee60
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py
@@ -0,0 +1,68 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# model settings
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        _delete_=True,
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        norm_cfg=norm_cfg,
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 480), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..409695b5dbccfe20bb6e85ee16231211c2ebcdba
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r101_fpn_1x_coco.py
@@ -0,0 +1,55 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    bbox_head=dict(
+        _delete_=True,
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4facdb6aaab05fd04b95e8c3ba2f0460090b1d6c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py
@@ -0,0 +1,53 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    bbox_head=dict(
+        _delete_=True,
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        norm_cfg=norm_cfg,
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9073d6f002fcb49aecc280f318b8769b477d2d82
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py
@@ -0,0 +1,51 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    bbox_head=dict(
+        _delete_=True,
+        type='SABLRetinaHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        approx_anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg=dict(
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.0,
+            ignore_iof_thr=-1),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/scnet/metafile.yml b/mmde/mmdet/.mim/configs/scnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..936d38960a8f423198702194f64a9eb46c770979
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/scnet/metafile.yml
@@ -0,0 +1,116 @@
+Collections:
+  - Name: SCNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - SCNet
+    Paper:
+      URL: https://arxiv.org/abs/2012.10150
+      Title: 'SCNet: Training Inference Sample Consistency for Instance Segmentation'
+    README: configs/scnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.9.0/mmdet/models/detectors/scnet.py#L6
+      Version: v2.9.0
+
+Models:
+  - Name: scnet_r50_fpn_1x_coco
+    In Collection: SCNet
+    Config: configs/scnet/scnet_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 161.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco-c3f09857.pth
+
+  - Name: scnet_r50_fpn_20e_coco
+    In Collection: SCNet
+    Config: configs/scnet/scnet_r50_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 7.0
+      inference time (ms/im):
+        - value: 161.29
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco-a569f645.pth
+
+  - Name: scnet_r101_fpn_20e_coco
+    In Collection: SCNet
+    Config: configs/scnet/scnet_r101_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 8.9
+      inference time (ms/im):
+        - value: 172.41
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco-294e312c.pth
+
+  - Name: scnet_x101-64x4d_fpn_20e_coco
+    In Collection: SCNet
+    Config: configs/scnet/scnet_x101-64x4d_fpn_20e_coco.py
+    Metadata:
+      Training Memory (GB): 13.2
+      inference time (ms/im):
+        - value: 204.08
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (800, 1333)
+      Epochs: 20
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 42.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco-fb09dec9.pth
diff --git a/mmde/mmdet/.mim/configs/scnet/scnet_r101_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/scnet/scnet_r101_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebba52978b23c07a68e3563033c860a95dd515b6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/scnet/scnet_r101_fpn_20e_coco.py
@@ -0,0 +1,6 @@
+_base_ = './scnet_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/scnet/scnet_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/scnet/scnet_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0210fdb456c26b2c05d99a2435da14fc30f088d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/scnet/scnet_r50_fpn_1x_coco.py
@@ -0,0 +1,138 @@
+_base_ = '../htc/htc_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    type='SCNet',
+    roi_head=dict(
+        _delete_=True,
+        type='SCNetRoIHead',
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[
+            dict(
+                type='SCNetBBoxHead',
+                num_shared_fcs=2,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='SCNetBBoxHead',
+                num_shared_fcs=2,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='SCNetBBoxHead',
+                num_shared_fcs=2,
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='SCNetMaskHead',
+            num_convs=12,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            conv_to_res=True,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
+        semantic_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[8]),
+        semantic_head=dict(
+            type='SCNetSemanticHead',
+            num_ins=5,
+            fusion_level=1,
+            seg_scale_factor=1 / 8,
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=183,
+            loss_seg=dict(
+                type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2),
+            conv_to_res=True),
+        glbctx_head=dict(
+            type='GlobalContextHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_weight=3.0,
+            conv_to_res=True),
+        feat_relay_head=dict(
+            type='FeatureRelayHead',
+            in_channels=1024,
+            out_conv_channels=256,
+            roi_feat_size=7,
+            scale_factor=2)))
+
+# TODO
+# uncomment below code to enable test time augmentations
+# img_norm_cfg = dict(
+#     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# test_pipeline = [
+#     dict(type='LoadImageFromFile'),
+#     dict(
+#         type='MultiScaleFlipAug',
+#         img_scale=[(600, 900), (800, 1200), (1000, 1500), (1200, 1800),
+#                    (1400, 2100)],
+#         flip=True,
+#         transforms=[
+#             dict(type='Resize', keep_ratio=True),
+#             dict(type='RandomFlip', flip_ratio=0.5),
+#             dict(type='Normalize', **img_norm_cfg),
+#             dict(type='Pad', size_divisor=32),
+#             dict(type='ImageToTensor', keys=['img']),
+#             dict(type='Collect', keys=['img']),
+#         ])
+# ]
+# data = dict(
+#     val=dict(pipeline=test_pipeline),
+#     test=dict(pipeline=test_pipeline))
diff --git a/mmde/mmdet/.mim/configs/scnet/scnet_r50_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/scnet/scnet_r50_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..533e1b5f3253387788fbf1a9d6d7a38c7c5c5f30
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/scnet/scnet_r50_fpn_20e_coco.py
@@ -0,0 +1,15 @@
+_base_ = './scnet_r50_fpn_1x_coco.py'
+# learning policy
+max_epochs = 20
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 19],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/scnet/scnet_x101-64x4d_fpn_20e_coco.py b/mmde/mmdet/.mim/configs/scnet/scnet_x101-64x4d_fpn_20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e54b030fa68f76f22edf66e3594d66a13c2c672
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/scnet/scnet_x101-64x4d_fpn_20e_coco.py
@@ -0,0 +1,15 @@
+_base_ = './scnet_r50_fpn_20e_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/scnet/scnet_x101-64x4d_fpn_8xb1-20e_coco.py b/mmde/mmdet/.mim/configs/scnet/scnet_x101-64x4d_fpn_8xb1-20e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cdce7d54248e77e98639d68490cc30dfd625c87
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/scnet/scnet_x101-64x4d_fpn_8xb1-20e_coco.py
@@ -0,0 +1,8 @@
+_base_ = './scnet_x101-64x4d_fpn_20e_coco.py'
+train_dataloader = dict(batch_size=1, num_workers=1)
+
+optim_wrapper = dict(optimizer=dict(lr=0.01))
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (1 samples per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/mmde/mmdet/.mim/configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py b/mmde/mmdet/.mim/configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e632b9a150871a44b698dfdb0fdc3f07308ef81
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py
@@ -0,0 +1,39 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        frozen_stages=-1,
+        zero_init_residual=False,
+        norm_cfg=norm_cfg,
+        init_cfg=None),
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg)))
+
+optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
+
+max_epochs = 73
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[65, 71],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=max_epochs)
+
+# only keep latest 3 checkpoints
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
diff --git a/mmde/mmdet/.mim/configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py b/mmde/mmdet/.mim/configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9796f504b677a841919bb058ded414de25e74a50
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py
@@ -0,0 +1,40 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        frozen_stages=-1,
+        zero_init_residual=False,
+        norm_cfg=norm_cfg,
+        init_cfg=None),
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg),
+        mask_head=dict(norm_cfg=norm_cfg)))
+
+optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.))
+
+max_epochs = 73
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[65, 71],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=max_epochs)
+
+# only keep latest 3 checkpoints
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
diff --git a/mmde/mmdet/.mim/configs/scratch/metafile.yml b/mmde/mmdet/.mim/configs/scratch/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..977b8e5bfc2b6319793ae8abdeb71e5e04d7cb1b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/scratch/metafile.yml
@@ -0,0 +1,48 @@
+Collections:
+  - Name: Rethinking ImageNet Pre-training
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - RPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1811.08883
+      Title: 'Rethinking ImageNet Pre-training'
+    README: configs/scratch/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py
+      Version: v2.0.0
+
+Models:
+  - Name: faster-rcnn_r50_fpn_gn-all_scratch_6x_coco
+    In Collection: Rethinking ImageNet Pre-training
+    Config: configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py
+    Metadata:
+      Epochs: 72
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_bbox_mAP-0.407_20200201_193013-90813d01.pth
+
+  - Name: mask-rcnn_r50_fpn_gn-all_scratch_6x_coco
+    In Collection: Rethinking ImageNet Pre-training
+    Config: configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py
+    Metadata:
+      Epochs: 72
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_bbox_mAP-0.412__segm_mAP-0.374_20200201_193051-1e190a40.pth
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..2de87dcca59ccac7fc96c10c2a069fcf0464aeff
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py
@@ -0,0 +1,5 @@
+_base_ = './cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py'  # noqa: E501
+model = dict(
+    roi_head=dict(
+        mask_head=dict(
+            predictor_cfg=dict(type='NormedConv2d', tempearture=20))))
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d67ad7d4817a32b365bc2567937f69b68a9c97c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py
@@ -0,0 +1,5 @@
+_base_ = './cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py'  # noqa: E501
+model = dict(
+    roi_head=dict(
+        mask_head=dict(
+            predictor_cfg=dict(type='NormedConv2d', tempearture=20))))
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a1a87d4203a12a78a26fd873bd6017fafb49cdf
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py
@@ -0,0 +1,116 @@
+_base_ = [
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/lvis_v1/'
+train_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/lvis_v1_train.json',
+        data_prefix=dict(img=''),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/lvis_v1_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='LVISMetric',
+    ann_file=data_root + 'annotations/lvis_v1_val.json',
+    metric=['bbox', 'segm'])
+test_evaluator = val_evaluator
+
+train_cfg = dict(val_interval=24)
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e7b4df91368d23092a68f16ba4a35660ea23130
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py
@@ -0,0 +1,95 @@
+_base_ = [
+    '../_base_/models/cascade-mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/lvis_v1_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')),
+    roi_head=dict(
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=1203,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+                loss_cls=dict(
+                    type='SeesawLoss',
+                    p=0.8,
+                    q=2.0,
+                    num_classes=1203,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
+
+train_cfg = dict(val_interval=24)
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..b518c2135acb39a3d1119a8892c72816910ca496
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py'  # noqa: E501
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..008bbcae6eb8d189bdd0688b42d663eeba2a661e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py'  # noqa: E501
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a0b6755bf6f218c337d9ee16677e3e64886c019
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..6143231918e028523b6bb1792887ef7ce16dde02
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..06d2438cf7c351a2fb352f787bc434cc6afc3ebb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py
@@ -0,0 +1,5 @@
+_base_ = './mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py'
+model = dict(
+    roi_head=dict(
+        mask_head=dict(
+            predictor_cfg=dict(type='NormedConv2d', tempearture=20))))
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fc68d3df32015e0fc8d5dd2bc92df416a8fc5fd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py
@@ -0,0 +1,5 @@
+_base_ = './mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py'
+model = dict(
+    roi_head=dict(
+        mask_head=dict(
+            predictor_cfg=dict(type='NormedConv2d', tempearture=20))))
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..25c646c9c75c4468e71442049876a77382528e02
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py
@@ -0,0 +1,59 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            num_classes=1203,
+            cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+            loss_cls=dict(
+                type='SeesawLoss',
+                p=0.8,
+                q=2.0,
+                num_classes=1203,
+                loss_weight=1.0)),
+        mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+dataset_type = 'LVISV1Dataset'
+data_root = 'data/lvis_v1/'
+train_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/lvis_v1_train.json',
+        data_prefix=dict(img=''),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/lvis_v1_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='LVISMetric',
+    ann_file=data_root + 'annotations/lvis_v1_val.json',
+    metric=['bbox', 'segm'])
+test_evaluator = val_evaluator
+
+train_cfg = dict(val_interval=24)
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..d60320e0b78035d24adb86f3aa184433951481fe
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py
@@ -0,0 +1,38 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/lvis_v1_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            num_classes=1203,
+            cls_predictor_cfg=dict(type='NormedLinear', tempearture=20),
+            loss_cls=dict(
+                type='SeesawLoss',
+                p=0.8,
+                q=2.0,
+                num_classes=1203,
+                loss_weight=1.0)),
+        mask_head=dict(num_classes=1203)),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            # LVIS allows up to 300
+            max_per_img=300)))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
+
+train_cfg = dict(val_interval=24)
diff --git a/mmde/mmdet/.mim/configs/seesaw_loss/metafile.yml b/mmde/mmdet/.mim/configs/seesaw_loss/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..374b9cde64ab1ff3c5f23971467846804738b0aa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/seesaw_loss/metafile.yml
@@ -0,0 +1,203 @@
+Collections:
+  - Name: Seesaw Loss
+    Metadata:
+      Training Data: LVIS
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Softmax
+        - RPN
+        - Convolution
+        - Dense Connections
+        - FPN
+        - ResNet
+        - RoIAlign
+        - Seesaw Loss
+    Paper:
+      URL: https://arxiv.org/abs/2008.10032
+      Title: 'Seesaw Loss for Long-Tailed Instance Segmentation'
+    README: configs/seesaw_loss/README.md
+
+Models:
+  - Name: mask-rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 25.6
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 25.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-a698dd3d.pth
+  - Name: mask-rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 25.6
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 25.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a1c11314.pth
+  - Name: mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 27.4
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 26.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-8e6e6dd5.pth
+  - Name: mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 27.2
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 27.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a0b59c42.pth
+  - Name: mask-rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 27.6
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 26.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-392a804b.pth
+  - Name: mask-rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 27.6
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 26.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-cd0f6a12.pth
+  - Name: mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 28.9
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 27.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-e68eb464.pth
+  - Name: mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 28.9
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 28.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-1d817139.pth
+  - Name: cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 33.1
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 29.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-71e2215e.pth
+  - Name: cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 33.0
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 30.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-8b5a6745.pth
+  - Name: cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 30.0
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 29.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-5d8ca2a4.pth
+  - Name: cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1
+    In Collection: Seesaw Loss
+    Config: configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: LVIS v1
+        Metrics:
+          box AP: 32.8
+      - Task: Instance Segmentation
+        Dataset: LVIS v1
+        Metrics:
+          mask AP: 30.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-c8551505.pth
diff --git a/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..91d45add8aba54de4b25fba11ecf5e18bca0084f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        frozen_stages=0,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='./mocov2_r50_800ep_pretrain.pth')))
diff --git a/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddaebf5558a22680d556aa8b3fe79541d634d910
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py
@@ -0,0 +1,25 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        frozen_stages=0,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='./mocov2_r50_800ep_pretrain.pth')))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomResize', scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..785c80ec9d14c8e4b54b2e3359f9b4c680eaca17
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_1x_coco.py
@@ -0,0 +1,13 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        frozen_stages=0,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='./swav_800ep_pretrain.pth.tar')))
diff --git a/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c393e0b36047f731c91c3f0963ef90347a0910e9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py
@@ -0,0 +1,25 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    backbone=dict(
+        frozen_stages=0,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        init_cfg=dict(
+            type='Pretrained', checkpoint='./swav_800ep_pretrain.pth.tar')))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomResize', scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py b/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c6e081e860e1240f8d35efa8176563a8b5be845
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py
@@ -0,0 +1,31 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    # 270k iterations with batch_size 64 is roughly equivalent to 144 epochs
+    '../common/ssj_270k_coco-instance.py',
+]
+
+image_size = (1024, 1024)
+batch_augments = [
+    dict(type='BatchFixedSizePad', size=image_size, pad_mask=True)
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+# Use MMSyncBN that handles empty tensor in head. It can be changed to
+# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True)
+model = dict(
+    # the model is trained from scratch, so init_cfg is None
+    data_preprocessor=dict(
+        # pad_size_divisor=32 is unnecessary in training but necessary
+        # in testing.
+        pad_size_divisor=32,
+        batch_augments=batch_augments),
+    backbone=dict(
+        frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg, init_cfg=None),
+    neck=dict(norm_cfg=norm_cfg),
+    rpn_head=dict(num_convs=2),  # leads to 0.1+ mAP
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=head_norm_cfg),
+        mask_head=dict(norm_cfg=head_norm_cfg)))
diff --git a/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py b/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..abe8962ac69184241e30628242e5313c52f503f4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py
@@ -0,0 +1,18 @@
+_base_ = 'mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py'  # noqa
+
+# training schedule for 90k
+max_iters = 90000
+
+# learning rate policy
+# lr steps at [0.9, 0.95, 0.975] of the maximum iterations
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=90000,
+        by_epoch=False,
+        milestones=[81000, 85500, 87750],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py b/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0ea57d19728d7c563e56d139888059dd9c81317
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py
@@ -0,0 +1,31 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    # 270k iterations with batch_size 64 is roughly equivalent to 144 epochs
+    '../common/ssj_scp_270k_coco-instance.py'
+]
+
+image_size = (1024, 1024)
+batch_augments = [
+    dict(type='BatchFixedSizePad', size=image_size, pad_mask=True)
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+# Use MMSyncBN that handles empty tensor in head. It can be changed to
+# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True)
+model = dict(
+    # the model is trained from scratch, so init_cfg is None
+    data_preprocessor=dict(
+        # pad_size_divisor=32 is unnecessary in training but necessary
+        # in testing.
+        pad_size_divisor=32,
+        batch_augments=batch_augments),
+    backbone=dict(
+        frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg, init_cfg=None),
+    neck=dict(norm_cfg=norm_cfg),
+    rpn_head=dict(num_convs=2),  # leads to 0.1+ mAP
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=head_norm_cfg),
+        mask_head=dict(norm_cfg=head_norm_cfg)))
diff --git a/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py b/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e158b5c05aae3345ba9d4d1a55d1bbb82a789726
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py
@@ -0,0 +1,18 @@
+_base_ = 'mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py'  # noqa
+
+# training schedule for 90k
+max_iters = 90000
+
+# learning rate policy
+# lr steps at [0.9, 0.95, 0.975] of the maximum iterations
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=90000,
+        by_epoch=False,
+        milestones=[81000, 85500, 87750],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/simple_copy_paste/metafile.yml b/mmde/mmdet/.mim/configs/simple_copy_paste/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8a40b658feeefd870300e62934ea21315218bfba
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/simple_copy_paste/metafile.yml
@@ -0,0 +1,92 @@
+Collections:
+  - Name: SimpleCopyPaste
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 32x A100 GPUs
+      Architecture:
+        - Softmax
+        - RPN
+        - Convolution
+        - Dense Connections
+        - FPN
+        - ResNet
+        - RoIAlign
+    Paper:
+      URL: https://arxiv.org/abs/2012.07177
+      Title: "Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation"
+    README: configs/simple_copy_paste/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.25.0/mmdet/datasets/pipelines/transforms.py#L2762
+      Version: v2.25.0
+
+Models:
+  - Name: mask-rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco
+    In Collection: SimpleCopyPaste
+    Config: configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      Iterations: 270000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.5
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco_20220324_182940-33a100c5.pth
+
+  - Name: mask-rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco
+    In Collection: SimpleCopyPaste
+    Config: configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      Iterations: 90000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.3
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco_20220316_181409-f79c84c5.pth
+
+  - Name: mask-rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco
+    In Collection: SimpleCopyPaste
+    Config: configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      Iterations: 270000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco_20220324_201229-80ee90b7.pth
+
+  - Name: mask-rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco
+    In Collection: SimpleCopyPaste
+    Config: configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py
+    Metadata:
+      Training Memory (GB): 7.2
+      Iterations: 90000
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.8
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco_20220316_181307-6bc5726f.pth
diff --git a/mmde/mmdet/.mim/configs/soft_teacher/metafile.yml b/mmde/mmdet/.mim/configs/soft_teacher/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9622acec93ad3138daff09930ecfa2807dc7748a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/soft_teacher/metafile.yml
@@ -0,0 +1,67 @@
+Collections:
+  - Name: SoftTeacher
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2106.09018
+      Title: "End-to-End Semi-Supervised Object Detection with Soft Teacher"
+    README: configs/soft_teacher/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v3.0.0rc1/mmdet/models/detectors/soft_teacher.py#L20
+      Version: v3.0.0rc1
+
+Models:
+  - Name: soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py
+    In Collection: SoftTeacher
+    Config: configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py
+    Metadata:
+      Iterations: 180000
+    Results:
+      - Task: Semi-Supervised Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 19.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230330_233412-3c8f6d4a.pth
+
+  - Name: soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py
+    In Collection: SoftTeacher
+    Config: configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py
+    Metadata:
+      Iterations: 180000
+    Results:
+      - Task: Semi-Supervised Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 24.9
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230331_020244-c0d2c3aa.pth
+
+  - Name: soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py
+    In Collection: SoftTeacher
+    Config: configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py
+    Metadata:
+      Iterations: 180000
+    Results:
+      - Task: Semi-Supervised Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 30.4
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230331_070656-308798ad.pth
+
+  - Name: soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py
+    In Collection: SoftTeacher
+    Config: configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py
+    Metadata:
+      Iterations: 180000
+    Results:
+      - Task: Semi-Supervised Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 33.8
+    Weights: https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230330_232113-b46f78d0.pth
diff --git a/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py b/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bd09645598204482e9f88f6baf00d32eba9cab6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py
@@ -0,0 +1,9 @@
+_base_ = ['soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py']
+
+# 1% coco train2017 is set as labeled dataset
+labeled_dataset = _base_.labeled_dataset
+unlabeled_dataset = _base_.unlabeled_dataset
+labeled_dataset.ann_file = 'semi_anns/instances_train2017.1@1.json'
+unlabeled_dataset.ann_file = 'semi_anns/instances_train2017.1@1-unlabeled.json'
+train_dataloader = dict(
+    dataset=dict(datasets=[labeled_dataset, unlabeled_dataset]))
diff --git a/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py b/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ca38c931926cef33321f931b0c6d5c66824ff55
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py
@@ -0,0 +1,9 @@
+_base_ = ['soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py']
+
+# 2% coco train2017 is set as labeled dataset
+labeled_dataset = _base_.labeled_dataset
+unlabeled_dataset = _base_.unlabeled_dataset
+labeled_dataset.ann_file = 'semi_anns/instances_train2017.1@2.json'
+unlabeled_dataset.ann_file = 'semi_anns/instances_train2017.1@2-unlabeled.json'
+train_dataloader = dict(
+    dataset=dict(datasets=[labeled_dataset, unlabeled_dataset]))
diff --git a/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py b/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..750b7ed6df6c91bab8f68f58f339b2f3696fa693
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py
@@ -0,0 +1,9 @@
+_base_ = ['soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py']
+
+# 5% coco train2017 is set as labeled dataset
+labeled_dataset = _base_.labeled_dataset
+unlabeled_dataset = _base_.unlabeled_dataset
+labeled_dataset.ann_file = 'semi_anns/instances_train2017.1@5.json'
+unlabeled_dataset.ann_file = 'semi_anns/instances_train2017.1@5-unlabeled.json'
+train_dataloader = dict(
+    dataset=dict(datasets=[labeled_dataset, unlabeled_dataset]))
diff --git a/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py b/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3713aef442f4add55efafde08b2c98da1773bab0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py
@@ -0,0 +1,84 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/default_runtime.py',
+    '../_base_/datasets/semi_coco_detection.py'
+]
+
+detector = _base_.model
+detector.data_preprocessor = dict(
+    type='DetDataPreprocessor',
+    mean=[103.530, 116.280, 123.675],
+    std=[1.0, 1.0, 1.0],
+    bgr_to_rgb=False,
+    pad_size_divisor=32)
+detector.backbone = dict(
+    type='ResNet',
+    depth=50,
+    num_stages=4,
+    out_indices=(0, 1, 2, 3),
+    frozen_stages=1,
+    norm_cfg=dict(type='BN', requires_grad=False),
+    norm_eval=True,
+    style='caffe',
+    init_cfg=dict(
+        type='Pretrained',
+        checkpoint='open-mmlab://detectron2/resnet50_caffe'))
+
+model = dict(
+    _delete_=True,
+    type='SoftTeacher',
+    detector=detector,
+    data_preprocessor=dict(
+        type='MultiBranchDataPreprocessor',
+        data_preprocessor=detector.data_preprocessor),
+    semi_train_cfg=dict(
+        freeze_teacher=True,
+        sup_weight=1.0,
+        unsup_weight=4.0,
+        pseudo_label_initial_score_thr=0.5,
+        rpn_pseudo_thr=0.9,
+        cls_pseudo_thr=0.9,
+        reg_pseudo_thr=0.02,
+        jitter_times=10,
+        jitter_scale=0.06,
+        min_pseudo_bbox_wh=(1e-2, 1e-2)),
+    semi_test_cfg=dict(predict_on='teacher'))
+
+# 10% coco train2017 is set as labeled dataset
+labeled_dataset = _base_.labeled_dataset
+unlabeled_dataset = _base_.unlabeled_dataset
+labeled_dataset.ann_file = 'semi_anns/instances_train2017.1@10.json'
+unlabeled_dataset.ann_file = 'semi_anns/' \
+                             'instances_train2017.1@10-unlabeled.json'
+unlabeled_dataset.data_prefix = dict(img='train2017/')
+train_dataloader = dict(
+    dataset=dict(datasets=[labeled_dataset, unlabeled_dataset]))
+
+# training schedule for 180k
+train_cfg = dict(
+    type='IterBasedTrainLoop', max_iters=180000, val_interval=5000)
+val_cfg = dict(type='TeacherStudentValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=180000,
+        by_epoch=False,
+        milestones=[120000, 160000],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+default_hooks = dict(
+    checkpoint=dict(by_epoch=False, interval=10000, max_keep_ckpts=2))
+log_processor = dict(by_epoch=False)
+
+custom_hooks = [dict(type='MeanTeacherHook')]
diff --git a/mmde/mmdet/.mim/configs/solo/decoupled-solo-light_r50_fpn_3x_coco.py b/mmde/mmdet/.mim/configs/solo/decoupled-solo-light_r50_fpn_3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc35df3c3cbbd70532e066de27b06418549eb906
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solo/decoupled-solo-light_r50_fpn_3x_coco.py
@@ -0,0 +1,50 @@
+_base_ = './decoupled-solo_r50_fpn_3x_coco.py'
+
+# model settings
+model = dict(
+    mask_head=dict(
+        type='DecoupledSOLOLightHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=dict(
+            type='DiceLoss', use_sigmoid=True, activate=False,
+            loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(852, 512), (852, 480), (852, 448), (852, 416), (852, 384),
+                (852, 352)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(852, 512), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/solo/decoupled-solo_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/solo/decoupled-solo_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d7f4b90c19d9fdcc3c895deb4101cf7acd7bd8e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solo/decoupled-solo_r50_fpn_1x_coco.py
@@ -0,0 +1,24 @@
+_base_ = './solo_r50_fpn_1x_coco.py'
+# model settings
+model = dict(
+    mask_head=dict(
+        type='DecoupledSOLOHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=7,
+        feat_channels=256,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=dict(
+            type='DiceLoss', use_sigmoid=True, activate=False,
+            loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
diff --git a/mmde/mmdet/.mim/configs/solo/decoupled-solo_r50_fpn_3x_coco.py b/mmde/mmdet/.mim/configs/solo/decoupled-solo_r50_fpn_3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a8c19decb72a3d904a277faac06670999f6b322
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solo/decoupled-solo_r50_fpn_3x_coco.py
@@ -0,0 +1,25 @@
+_base_ = './solo_r50_fpn_3x_coco.py'
+
+# model settings
+model = dict(
+    mask_head=dict(
+        type='DecoupledSOLOHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=7,
+        feat_channels=256,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=dict(
+            type='DiceLoss', use_sigmoid=True, activate=False,
+            loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)))
diff --git a/mmde/mmdet/.mim/configs/solo/metafile.yml b/mmde/mmdet/.mim/configs/solo/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..aa38b8c07b3db7eb018bb769b6eca6e010a1d764
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solo/metafile.yml
@@ -0,0 +1,115 @@
+Collections:
+  - Name: SOLO
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - Convolution
+        - ResNet
+    Paper: https://arxiv.org/abs/1912.04488
+    README: configs/solo/README.md
+
+Models:
+  - Name: decoupled-solo_r50_fpn_1x_coco
+    In Collection: SOLO
+    Config: configs/solo/decoupled-solo_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 12
+    inference time (ms/im):
+      - value: 116.4
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (1333, 800)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 33.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348-6337c589.pth
+
+  - Name: decoupled-solo_r50_fpn_3x_coco
+    In Collection: SOLO
+    Config: configs/solo/decoupled-solo_r50_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.9
+      Epochs: 36
+    inference time (ms/im):
+      - value: 117.2
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (1333, 800)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 36.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504-7b3301ec.pth
+
+  - Name: decoupled-solo-light_r50_fpn_3x_coco
+    In Collection: SOLO
+    Config: configs/solo/decoupled-solo-light_r50_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 2.2
+      Epochs: 36
+    inference time (ms/im):
+      - value: 35.0
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (852, 512)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 32.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703-e70e226f.pth
+
+  - Name: solo_r50_fpn_3x_coco
+    In Collection: SOLO
+    Config: configs/solo/solo_r50_fpn_3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      Epochs: 36
+    inference time (ms/im):
+      - value: 94.2
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (1333, 800)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 35.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353-11d224d7.pth
+
+  - Name: solo_r50_fpn_1x_coco
+    In Collection: SOLO
+    Config: configs/solo/solo_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.0
+      Epochs: 12
+    inference time (ms/im):
+      - value: 95.1
+        hardware: V100
+        backend: PyTorch
+        batch size: 1
+        mode: FP32
+        resolution: (1333, 800)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 33.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055-2290a6b8.pth
diff --git a/mmde/mmdet/.mim/configs/solo/solo_r101_fpn_8xb8-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/solo/solo_r101_fpn_8xb8-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f49c5c1ce67973d15b3fad3ad8c966af8203af7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solo/solo_r101_fpn_8xb8-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './solo_r50_fpn_8xb8-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/solo/solo_r18_fpn_8xb8-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/solo/solo_r18_fpn_8xb8-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..977ae54dc28e56802289ac552ce20815b7d1d761
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solo/solo_r18_fpn_8xb8-lsj-200e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './solo_r50_fpn_8xb8-lsj-200e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=18,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/solo/solo_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/solo/solo_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..595e9ffe148be84dcc3d5c89e5315e8ef3a24477
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solo/solo_r50_fpn_1x_coco.py
@@ -0,0 +1,62 @@
+_base_ = [
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='SOLO',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        num_outs=5),
+    mask_head=dict(
+        type='SOLOHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=7,
+        feat_channels=256,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
+    # model training and testing settings
+    test_cfg=dict(
+        nms_pre=500,
+        score_thr=0.1,
+        mask_thr=0.5,
+        filter_thr=0.05,
+        kernel='gaussian',  # gaussian/linear
+        sigma=2.0,
+        max_per_img=100))
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.01))
+
+val_evaluator = dict(metric='segm')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/solo/solo_r50_fpn_3x_coco.py b/mmde/mmdet/.mim/configs/solo/solo_r50_fpn_3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d5abbd2f4d4e1fdc2e3cb92c8e0157188b0aa9a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solo/solo_r50_fpn_3x_coco.py
@@ -0,0 +1,35 @@
+_base_ = './solo_r50_fpn_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 800), (1333, 768), (1333, 736), (1333, 704),
+                (1333, 672), (1333, 640)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# training schedule for 3x
+max_epochs = 36
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/solo/solo_r50_fpn_8xb8-lsj-200e_coco.py b/mmde/mmdet/.mim/configs/solo/solo_r50_fpn_8xb8-lsj-200e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d46bf391c907707d222756e9450b661b6edd6985
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solo/solo_r50_fpn_8xb8-lsj-200e_coco.py
@@ -0,0 +1,71 @@
+_base_ = '../common/lsj-200e_coco-instance.py'
+
+image_size = (1024, 1024)
+batch_augments = [dict(type='BatchFixedSizePad', size=image_size)]
+
+# model settings
+model = dict(
+    type='SOLO',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32,
+        batch_augments=batch_augments),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        num_outs=5),
+    mask_head=dict(
+        type='SOLOHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=7,
+        feat_channels=256,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
+    # model training and testing settings
+    test_cfg=dict(
+        nms_pre=500,
+        score_thr=0.1,
+        mask_thr=0.5,
+        filter_thr=0.05,
+        kernel='gaussian',  # gaussian/linear
+        sigma=2.0,
+        max_per_img=100))
+
+train_dataloader = dict(batch_size=8, num_workers=4)
+
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    optimizer=dict(
+        type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/solov2/metafile.yml b/mmde/mmdet/.mim/configs/solov2/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d0156b2b40cf62537cdc62af4fa57d644a7978ad
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/metafile.yml
@@ -0,0 +1,93 @@
+Collections:
+  - Name: SOLOv2
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x A100 GPUs
+      Architecture:
+        - FPN
+        - Convolution
+        - ResNet
+    Paper: https://arxiv.org/abs/2003.10152
+    README: configs/solov2/README.md
+
+Models:
+  - Name: solov2_r50_fpn_1x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      Epochs: 12
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 34.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_1x_coco/solov2_r50_fpn_1x_coco_20220512_125858-a357fa23.pth
+
+  - Name: solov2_r50_fpn_ms-3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_r50_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 5.1
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_3x_coco/solov2_r50_fpn_3x_coco_20220512_125856-fed092d4.pth
+
+  - Name: solov2_r101-dcn_fpn_ms-3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_r101-dcn_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.1
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_dcn_fpn_3x_coco/solov2_r101_dcn_fpn_3x_coco_20220513_214734-16c966cb.pth
+
+  - Name: solov2_x101-dcn_fpn_ms-3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2_x101-dcn_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 11.3
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 42.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_x101_dcn_fpn_3x_coco/solov2_x101_dcn_fpn_3x_coco_20220513_214337-aef41095.pth
+
+  - Name: solov2-light_r18_fpn_ms-3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2-light_r18_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.1
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 29.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r18_fpn_3x_coco/solov2_light_r18_fpn_3x_coco_20220511_083717-75fa355b.pth
+
+  - Name: solov2-light_r50_fpn_ms-3x_coco
+    In Collection: SOLOv2
+    Config: configs/solov2/solov2-light_r50_fpn_ms-3x_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      Epochs: 36
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 33.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r50_fpn_3x_coco/solov2_light_r50_fpn_3x_coco_20220512_165256-c93a6074.pth
diff --git a/mmde/mmdet/.mim/configs/solov2/solov2-light_r18_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/solov2/solov2-light_r18_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8fc53e0aed9dd4479f9cd8dcc98ca61db2e50bf
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/solov2-light_r18_fpn_ms-3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './solov2-light_r50_fpn_ms-3x_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        depth=18, init_cfg=dict(checkpoint='torchvision://resnet18')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/solov2/solov2-light_r34_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/solov2/solov2-light_r34_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..149b336655349c70233e78d03f72d7ee3f1a75f3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/solov2-light_r34_fpn_ms-3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './solov2-light_r50_fpn_ms-3x_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        depth=34, init_cfg=dict(checkpoint='torchvision://resnet34')),
+    neck=dict(in_channels=[64, 128, 256, 512]))
diff --git a/mmde/mmdet/.mim/configs/solov2/solov2-light_r50-dcn_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/solov2/solov2-light_r50-dcn_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..05391944b683985ab975dc8f66be0c8a12f7d255
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/solov2-light_r50-dcn_fpn_ms-3x_coco.py
@@ -0,0 +1,14 @@
+_base_ = './solov2-light_r50_fpn_ms-3x_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)),
+    mask_head=dict(
+        feat_channels=256,
+        stacked_convs=3,
+        scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)),
+        mask_feature_head=dict(out_channels=128),
+        dcn_cfg=dict(type='DCNv2'),
+        dcn_apply_to_all_conv=False))  # light solov2 head
diff --git a/mmde/mmdet/.mim/configs/solov2/solov2-light_r50_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/solov2/solov2-light_r50_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf0a7f779c0f587d11c86a31aca19b2663f79a57
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/solov2-light_r50_fpn_ms-3x_coco.py
@@ -0,0 +1,56 @@
+_base_ = './solov2_r50_fpn_1x_coco.py'
+
+# model settings
+model = dict(
+    mask_head=dict(
+        stacked_convs=2,
+        feat_channels=256,
+        scale_ranges=((1, 56), (28, 112), (56, 224), (112, 448), (224, 896)),
+        mask_feature_head=dict(out_channels=128)))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(768, 512), (768, 480), (768, 448), (768, 416), (768, 384),
+                (768, 352)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(448, 768), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# training schedule for 3x
+max_epochs = 36
+train_cfg = dict(by_epoch=True, max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/solov2/solov2_r101-dcn_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/solov2/solov2_r101-dcn_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..370a4eb7db811b285cc55282e4b66360ca338a31
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/solov2_r101-dcn_fpn_ms-3x_coco.py
@@ -0,0 +1,13 @@
+_base_ = './solov2_r50_fpn_ms-3x_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(checkpoint='torchvision://resnet101'),
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)),
+    mask_head=dict(
+        mask_feature_head=dict(conv_cfg=dict(type='DCNv2')),
+        dcn_cfg=dict(type='DCNv2'),
+        dcn_apply_to_all_conv=True))
diff --git a/mmde/mmdet/.mim/configs/solov2/solov2_r101_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/solov2/solov2_r101_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..96aaac0a7c2689a125ac0a68edaff2a76dfc773d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/solov2_r101_fpn_ms-3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './solov2_r50_fpn_ms-3x_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        depth=101, init_cfg=dict(checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/solov2/solov2_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/solov2/solov2_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..138ca010b5f3f96a4f296ffbe66cb1be3add7ec2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/solov2_r50_fpn_1x_coco.py
@@ -0,0 +1,70 @@
+_base_ = [
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='SOLOv2',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        num_outs=5),
+    mask_head=dict(
+        type='SOLOV2Head',
+        num_classes=80,
+        in_channels=256,
+        feat_channels=512,
+        stacked_convs=4,
+        strides=[8, 8, 16, 32, 32],
+        scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)),
+        pos_scale=0.2,
+        num_grids=[40, 36, 24, 16, 12],
+        cls_down_index=0,
+        mask_feature_head=dict(
+            feat_channels=128,
+            start_level=0,
+            end_level=3,
+            out_channels=256,
+            mask_stride=4,
+            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
+        loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0)),
+    # model training and testing settings
+    test_cfg=dict(
+        nms_pre=500,
+        score_thr=0.1,
+        mask_thr=0.5,
+        filter_thr=0.05,
+        kernel='gaussian',  # gaussian/linear
+        sigma=2.0,
+        max_per_img=100))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01), clip_grad=dict(max_norm=35, norm_type=2))
+
+val_evaluator = dict(metric='segm')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/solov2/solov2_r50_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/solov2/solov2_r50_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6f09827efbe4e135a784b0808604dbc855ed47e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/solov2_r50_fpn_ms-3x_coco.py
@@ -0,0 +1,35 @@
+_base_ = './solov2_r50_fpn_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 800), (1333, 768), (1333, 736), (1333, 704),
+                (1333, 672), (1333, 640)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# training schedule for 3x
+max_epochs = 36
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/solov2/solov2_x101-dcn_fpn_ms-3x_coco.py b/mmde/mmdet/.mim/configs/solov2/solov2_x101-dcn_fpn_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..612c45eb437efc481948edb660ef1a3eebbcfebe
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/solov2/solov2_x101-dcn_fpn_ms-3x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './solov2_r50_fpn_ms-3x_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')),
+    mask_head=dict(
+        mask_feature_head=dict(conv_cfg=dict(type='DCNv2')),
+        dcn_cfg=dict(type='DCNv2'),
+        dcn_apply_to_all_conv=True))
diff --git a/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1d5b72ce3fff73504a0c032867d246bc4e30123
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,41 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/mot_challenge_det.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    rpn_head=dict(
+        bbox_coder=dict(clip_border=False),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        bbox_head=dict(
+            num_classes=1,
+            bbox_coder=dict(clip_border=False),
+            loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))),
+    init_cfg=dict(
+        type='Pretrained',
+        checkpoint=  # noqa: E251
+        'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'  # noqa: E501
+    ))
+
+# training schedule for 4e
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=4, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=4,
+        by_epoch=True,
+        milestones=[3],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py b/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py
new file mode 100644
index 0000000000000000000000000000000000000000..83647061c7f59dc8a6e8d033cdb8dc81de648df4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py
@@ -0,0 +1,11 @@
+_base_ = ['./faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval']
+# data
+data_root = 'data/MOT17/'
+train_dataloader = dict(
+    dataset=dict(ann_file='annotations/train_cocoformat.json'))
+val_dataloader = dict(
+    dataset=dict(ann_file='annotations/train_cocoformat.json'))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(ann_file=data_root + 'annotations/train_cocoformat.json')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py b/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6d14ad8be2a939bce168f4f09f08dde50f140c8
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py
@@ -0,0 +1,29 @@
+_base_ = ['./faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval']
+model = dict(
+    rpn_head=dict(bbox_coder=dict(clip_border=True)),
+    roi_head=dict(
+        bbox_head=dict(bbox_coder=dict(clip_border=True), num_classes=1)))
+# data
+data_root = 'data/MOT20/'
+train_dataloader = dict(dataset=dict(data_root=data_root))
+val_dataloader = dict(dataset=dict(data_root=data_root))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(ann_file=data_root +
+                     'annotations/half-val_cocoformat.json')
+test_evaluator = val_evaluator
+
+# training schedule for 8e
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)
+
+# learning rate
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=8,
+        by_epoch=True,
+        milestones=[6],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py b/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py
new file mode 100644
index 0000000000000000000000000000000000000000..85c859732cb3e4742d3003d555f72f4cc7ac2e05
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py
@@ -0,0 +1,32 @@
+_base_ = ['./faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval']
+model = dict(
+    rpn_head=dict(bbox_coder=dict(clip_border=True)),
+    roi_head=dict(
+        bbox_head=dict(bbox_coder=dict(clip_border=True), num_classes=1)))
+# data
+data_root = 'data/MOT20/'
+train_dataloader = dict(
+    dataset=dict(
+        data_root=data_root, ann_file='annotations/train_cocoformat.json'))
+val_dataloader = dict(
+    dataset=dict(
+        data_root=data_root, ann_file='annotations/train_cocoformat.json'))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(ann_file=data_root + 'annotations/train_cocoformat.json')
+test_evaluator = val_evaluator
+
+# training schedule for 8e
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)
+
+# learning rate
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=8,
+        by_epoch=True,
+        milestones=[6],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/sort/metafile.yml b/mmde/mmdet/.mim/configs/sort/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c582ce353df6344aaa2fe25e0f410bb458e50803
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sort/metafile.yml
@@ -0,0 +1,35 @@
+Collections:
+  - Name: SORT
+    Metadata:
+      Training Techniques:
+        - SGD with Momentum
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - FPN
+    Paper:
+      URL: https://arxiv.org/abs/1602.00763
+      Title: Simple Online and Realtime Tracking
+    README: configs/sort/README.md
+
+Models:
+  - Name: sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval
+    In Collection: SORT
+    Config: configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
+    Metadata:
+      Training Data: MOT17-half-train
+      inference time (ms/im):
+        - value: 53.8
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (640, 1088)
+    Results:
+      - Task: Multiple Object Tracking
+        Dataset: MOT17-half-val
+        Metrics:
+          MOTA: 62.0
+          IDF1: 57.8
+          HOTA: 52.0
+    Weights: https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth
diff --git a/mmde/mmdet/.mim/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..78acb774ec22b7555e633b541c21fe20beb75ce9
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,54 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/mot_challenge.py', '../_base_/default_runtime.py'
+]
+
+default_hooks = dict(
+    logger=dict(type='LoggerHook', interval=1),
+    visualization=dict(type='TrackVisualizationHook', draw=False))
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# custom hooks
+custom_hooks = [
+    # Synchronize model buffers such as running_mean and running_var in BN
+    # at the end of each epoch
+    dict(type='SyncBuffersHook')
+]
+
+detector = _base_.model
+detector.pop('data_preprocessor')
+detector.rpn_head.bbox_coder.update(dict(clip_border=False))
+detector.roi_head.bbox_head.update(dict(num_classes=1))
+detector.roi_head.bbox_head.bbox_coder.update(dict(clip_border=False))
+detector['init_cfg'] = dict(
+    type='Pretrained',
+    checkpoint=  # noqa: E251
+    'https://download.openmmlab.com/mmtracking/mot/'
+    'faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth')  # noqa: E501
+del _base_.model
+
+model = dict(
+    type='DeepSORT',
+    data_preprocessor=dict(
+        type='TrackDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        rgb_to_bgr=False,
+        pad_size_divisor=32),
+    detector=detector,
+    tracker=dict(
+        type='SORTTracker',
+        motion=dict(type='KalmanFilter', center_only=False),
+        obj_score_thr=0.5,
+        match_iou_thr=0.5,
+        reid=None))
+
+train_dataloader = None
+
+train_cfg = None
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/mmde/mmdet/.mim/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py b/mmde/mmdet/.mim/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py
new file mode 100644
index 0000000000000000000000000000000000000000..921652c4430ccf63cd5850884b2a064e8dc73251
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py
@@ -0,0 +1,15 @@
+_base_ = [
+    './sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain'
+    '_test-mot17halfval.py'
+]
+
+# dataloader
+val_dataloader = dict(
+    dataset=dict(ann_file='annotations/train_cocoformat.json'))
+test_dataloader = dict(
+    dataset=dict(
+        ann_file='annotations/test_cocoformat.json',
+        data_prefix=dict(img_path='test')))
+
+# evaluator
+test_evaluator = dict(format_only=True, outfile_prefix='./mot_17_test_res')
diff --git a/mmde/mmdet/.mim/configs/sparse_rcnn/metafile.yml b/mmde/mmdet/.mim/configs/sparse_rcnn/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8fe2531893b99662bd9e5dbbc1d6f9a6ced00325
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sparse_rcnn/metafile.yml
@@ -0,0 +1,80 @@
+Collections:
+  - Name: Sparse R-CNN
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - Sparse R-CNN
+    Paper:
+      URL: https://arxiv.org/abs/2011.12450
+      Title: 'Sparse R-CNN: End-to-End Object Detection with Learnable Proposals'
+    README: configs/sparse_rcnn/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.9.0/mmdet/models/detectors/sparse_rcnn.py#L6
+      Version: v2.9.0
+
+Models:
+  - Name: sparse-rcnn_r50_fpn_1x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth
+
+  - Name: sparse-rcnn_r50_fpn_ms-480-800-3x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.pth
+
+  - Name: sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.pth
+
+  - Name: sparse-rcnn_r101_fpn_ms-480-800-3x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.pth
+
+  - Name: sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco
+    In Collection: Sparse R-CNN
+    Config: configs/sparse_rcnn/sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.pth
diff --git a/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..09c11c6565ea2444fe8ffc930ca49fbffff3e8fa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a51f11ce5b6d55b2037461a93aa2bd18c8f2639d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..88354427b4138f4f5587f2a4a047bad654693780
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,101 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+num_stages = 6
+num_proposals = 100
+model = dict(
+    type='SparseRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=0,
+        add_extra_convs='on_input',
+        num_outs=4),
+    rpn_head=dict(
+        type='EmbeddingRPNHead',
+        num_proposals=num_proposals,
+        proposal_feature_channel=256),
+    roi_head=dict(
+        type='SparseRoIHead',
+        num_stages=num_stages,
+        stage_loss_weights=[1] * num_stages,
+        proposal_feature_channel=256,
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[
+            dict(
+                type='DIIHead',
+                num_classes=80,
+                num_ffn_fcs=2,
+                num_heads=8,
+                num_cls_fcs=1,
+                num_reg_fcs=3,
+                feedforward_channels=2048,
+                in_channels=256,
+                dropout=0.0,
+                ffn_act_cfg=dict(type='ReLU', inplace=True),
+                dynamic_conv_cfg=dict(
+                    type='DynamicConv',
+                    in_channels=256,
+                    feat_channels=64,
+                    out_channels=256,
+                    input_feat_shape=7,
+                    act_cfg=dict(type='ReLU', inplace=True),
+                    norm_cfg=dict(type='LN')),
+                loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                loss_cls=dict(
+                    type='FocalLoss',
+                    use_sigmoid=True,
+                    gamma=2.0,
+                    alpha=0.25,
+                    loss_weight=2.0),
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    clip_border=False,
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.5, 0.5, 1., 1.])) for _ in range(num_stages)
+        ]),
+    # training and testing settings
+    train_cfg=dict(
+        rpn=None,
+        rcnn=[
+            dict(
+                assigner=dict(
+                    type='HungarianAssigner',
+                    match_costs=[
+                        dict(type='FocalLossCost', weight=2.0),
+                        dict(type='BBoxL1Cost', weight=5.0, box_format='xyxy'),
+                        dict(type='IoUCost', iou_mode='giou', weight=2.0)
+                    ]),
+                sampler=dict(type='PseudoSampler'),
+                pos_weight=1) for _ in range(num_stages)
+        ]),
+    test_cfg=dict(rpn=None, rcnn=dict(max_per_img=num_proposals)))
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.000025, weight_decay=0.0001),
+    clip_grad=dict(max_norm=1, norm_type=2))
diff --git a/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..93edc0314b510c635f703f82e39c446ed056c6ea
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py
@@ -0,0 +1,43 @@
+_base_ = './sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py'
+num_proposals = 300
+model = dict(
+    rpn_head=dict(num_proposals=num_proposals),
+    test_cfg=dict(
+        _delete_=True, rpn=None, rcnn=dict(max_per_img=num_proposals)))
+
+# augmentation strategy originates from DETR.
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[[
+            dict(
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
+                keep_ratio=True)
+        ],
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..156028d7cdd22c32c00a765c6cf86b8f9e2df48b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/sparse_rcnn/sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py
@@ -0,0 +1,32 @@
+_base_ = './sparse-rcnn_r50_fpn_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                (736, 1333), (768, 1333), (800, 1333)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# learning policy
+max_epochs = 36
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=max_epochs)
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/ssd/metafile.yml b/mmde/mmdet/.mim/configs/ssd/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..190a207ccc9b62a002d026f917d66778e5cee8b7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ssd/metafile.yml
@@ -0,0 +1,78 @@
+Collections:
+  - Name: SSD
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - VGG
+    Paper:
+      URL: https://arxiv.org/abs/1512.02325
+      Title: 'SSD: Single Shot MultiBox Detector'
+    README: configs/ssd/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.14.0/mmdet/models/dense_heads/ssd_head.py#L16
+      Version: v2.14.0
+
+Models:
+  - Name: ssd300_coco
+    In Collection: SSD
+    Config: configs/ssd/ssd300_coco.py
+    Metadata:
+      Training Memory (GB): 9.9
+      inference time (ms/im):
+        - value: 22.88
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (300, 300)
+      Epochs: 120
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 25.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428-d231a06e.pth
+
+  - Name: ssd512_coco
+    In Collection: SSD
+    Config: configs/ssd/ssd512_coco.py
+    Metadata:
+      Training Memory (GB): 19.4
+      inference time (ms/im):
+        - value: 32.57
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (512, 512)
+      Epochs: 120
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 29.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849-0a47a1ca.pth
+
+  - Name: ssdlite_mobilenetv2-scratch_8xb24-600e_coco
+    In Collection: SSD
+    Config: configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py
+    Metadata:
+      Training Memory (GB): 4.0
+      inference time (ms/im):
+        - value: 14.3
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (320, 320)
+      Epochs: 600
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 21.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth
diff --git a/mmde/mmdet/.mim/configs/ssd/ssd300_coco.py b/mmde/mmdet/.mim/configs/ssd/ssd300_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..796d25c905350a8ed263b9cd1d2f8027b8c9a3ca
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ssd/ssd300_coco.py
@@ -0,0 +1,71 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+
+# dataset settings
+input_size = 300
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean={{_base_.model.data_preprocessor.mean}},
+        to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}},
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=2,
+    batch_sampler=None,
+    dataset=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type={{_base_.dataset_type}},
+            data_root={{_base_.data_root}},
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args={{_base_.backend_args}})))
+val_dataloader = dict(batch_size=8, dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4))
+
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/ssd/ssd512_coco.py b/mmde/mmdet/.mim/configs/ssd/ssd512_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7acd6144202e8fee232e3ed49a557d3cf7c53e15
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ssd/ssd512_coco.py
@@ -0,0 +1,60 @@
+_base_ = 'ssd300_coco.py'
+
+# model settings
+input_size = 512
+model = dict(
+    neck=dict(
+        out_channels=(512, 1024, 512, 256, 256, 256, 256),
+        level_strides=(2, 2, 2, 2, 1),
+        level_paddings=(1, 1, 1, 1, 1),
+        last_kernel_size=4),
+    bbox_head=dict(
+        in_channels=(512, 1024, 512, 256, 256, 256, 256),
+        anchor_generator=dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            input_size=input_size,
+            basesize_ratio_range=(0.1, 0.9),
+            strides=[8, 16, 32, 64, 128, 256, 512],
+            ratios=[[2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]])))
+
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean={{_base_.model.data_preprocessor.mean}},
+        to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}},
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py b/mmde/mmdet/.mim/configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e508f20ecf33e58ddfe6ff8ee94f516d3e03f79
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py
@@ -0,0 +1,158 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+data_preprocessor = dict(
+    type='DetDataPreprocessor',
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True,
+    pad_size_divisor=1)
+model = dict(
+    type='SingleStageDetector',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='MobileNetV2',
+        out_indices=(4, 7),
+        norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+        init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+    neck=dict(
+        type='SSDNeck',
+        in_channels=(96, 1280),
+        out_channels=(96, 1280, 512, 256, 256, 128),
+        level_strides=(2, 2, 2, 2),
+        level_paddings=(1, 1, 1, 1),
+        l2_norm_scale=None,
+        use_depthwise=True,
+        norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+        act_cfg=dict(type='ReLU6'),
+        init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)),
+    bbox_head=dict(
+        type='SSDHead',
+        in_channels=(96, 1280, 512, 256, 256, 128),
+        num_classes=80,
+        use_depthwise=True,
+        norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
+        act_cfg=dict(type='ReLU6'),
+        init_cfg=dict(type='Normal', layer='Conv2d', std=0.001),
+
+        # set anchor size manually instead of using the predefined
+        # SSD300 setting.
+        anchor_generator=dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            strides=[16, 32, 64, 107, 160, 320],
+            ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
+            min_sizes=[48, 100, 150, 202, 253, 304],
+            max_sizes=[100, 150, 202, 253, 304, 320]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2])),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0.,
+            ignore_iof_thr=-1,
+            gt_max_assign_all=False),
+        sampler=dict(type='PseudoSampler'),
+        smoothl1_beta=1.,
+        allowed_border=-1,
+        pos_weight=-1,
+        neg_pos_ratio=3,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        nms=dict(type='nms', iou_threshold=0.45),
+        min_bbox_size=0,
+        score_thr=0.02,
+        max_per_img=200))
+env_cfg = dict(cudnn_benchmark=True)
+
+# dataset settings
+input_size = 320
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=data_preprocessor['mean'],
+        to_rgb=data_preprocessor['bgr_to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=24,
+    num_workers=4,
+    batch_sampler=None,
+    dataset=dict(
+        _delete_=True,
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type={{_base_.dataset_type}},
+            data_root={{_base_.data_root}},
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline)))
+val_dataloader = dict(batch_size=8, dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# training schedule
+max_epochs = 120
+train_cfg = dict(max_epochs=max_epochs, val_interval=5)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='CosineAnnealingLR',
+        begin=0,
+        T_max=max_epochs,
+        end=max_epochs,
+        by_epoch=True,
+        eta_min=0)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.015, momentum=0.9, weight_decay=4.0e-5))
+
+custom_hooks = [
+    dict(type='NumClassCheckHook'),
+    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (24 samples per GPU)
+auto_scale_lr = dict(base_batch_size=192)
diff --git a/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b004d740a8f1e303bc4ad32593baad021ccae710
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py
@@ -0,0 +1,4 @@
+_base_ = 'mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py'  # noqa
+
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(type='AmpOptimWrapper')
diff --git a/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..70e92a82e0cd1f083fbb87035f61877da4c11022
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py
@@ -0,0 +1,68 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../common/lsj-100e_coco-instance.py'
+]
+image_size = (1024, 1024)
+batch_augments = [
+    dict(type='BatchFixedSizePad', size=image_size, pad_mask=True)
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+# Use MMSyncBN that handles empty tensor in head. It can be changed to
+# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True)
+model = dict(
+    # use caffe norm
+    data_preprocessor=dict(
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+
+        # pad_size_divisor=32 is unnecessary in training but necessary
+        # in testing.
+        pad_size_divisor=32,
+        batch_augments=batch_augments),
+    backbone=dict(
+        frozen_stages=-1,
+        norm_eval=False,
+        norm_cfg=norm_cfg,
+        init_cfg=None,
+        style='caffe'),
+    neck=dict(norm_cfg=norm_cfg),
+    rpn_head=dict(num_convs=2),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=head_norm_cfg),
+        mask_head=dict(norm_cfg=head_norm_cfg)))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='RandomResize',
+        scale=image_size,
+        ratio_range=(0.1, 2.0),
+        keep_ratio=True),
+    dict(
+        type='RandomCrop',
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+# Use RepeatDataset to speed up training
+train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
diff --git a/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-400e_coco.py b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-400e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb64c9b6865634412c8b9d951b588cf0fb8cd32b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-400e_coco.py
@@ -0,0 +1,20 @@
+_base_ = './mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py'  # noqa
+
+# Use RepeatDataset to speed up training
+# change repeat time from 4 (for 100 epochs) to 16 (for 400 epochs)
+train_dataloader = dict(dataset=dict(times=4 * 4))
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.067,
+        by_epoch=False,
+        begin=0,
+        end=500 * 4),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[22, 24],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fab2c72114cbe8a4d6cd3bdddb4e7c3b8dc2d0c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py
@@ -0,0 +1,4 @@
+_base_ = 'mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py'
+
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(type='AmpOptimWrapper')
diff --git a/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e06587fb03d42958142cac9ce7b15e7a19a9f6d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py
@@ -0,0 +1,30 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../common/lsj-100e_coco-instance.py'
+]
+
+image_size = (1024, 1024)
+batch_augments = [
+    dict(type='BatchFixedSizePad', size=image_size, pad_mask=True)
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+# Use MMSyncBN that handles empty tensor in head. It can be changed to
+# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed
+head_norm_cfg = dict(type='MMSyncBN', requires_grad=True)
+model = dict(
+    # the model is trained from scratch, so init_cfg is None
+    data_preprocessor=dict(
+        # pad_size_divisor=32 is unnecessary in training but necessary
+        # in testing.
+        pad_size_divisor=32,
+        batch_augments=batch_augments),
+    backbone=dict(
+        frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg, init_cfg=None),
+    neck=dict(norm_cfg=norm_cfg),
+    rpn_head=dict(num_convs=2),  # leads to 0.1+ mAP
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=head_norm_cfg),
+        mask_head=dict(norm_cfg=head_norm_cfg)))
diff --git a/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-50e_coco.py b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6621d28c0a80bd669fa857ce4eb7058a6f82296c
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-50e_coco.py
@@ -0,0 +1,5 @@
+_base_ = 'mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py'
+
+# Use RepeatDataset to speed up training
+# change repeat time from 4 (for 100 epochs) to 2 (for 50 epochs)
+train_dataloader = dict(dataset=dict(times=2))
diff --git a/mmde/mmdet/.mim/configs/strong_baselines/metafile.yml b/mmde/mmdet/.mim/configs/strong_baselines/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f72c07e64b6e72dc0c71ae114877ce5c8513be7b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strong_baselines/metafile.yml
@@ -0,0 +1,24 @@
+Models:
+  - Name: mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco
+    In Collection: Mask R-CNN
+    Config: configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py
+    Metadata:
+      Epochs: 100
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+        - LSJ
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - FPN
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          box AP: 40.4
diff --git a/mmde/mmdet/.mim/configs/strongsort/metafile.yml b/mmde/mmdet/.mim/configs/strongsort/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..08a564b77b866ebe55e2b634faa919817a1de09a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strongsort/metafile.yml
@@ -0,0 +1,48 @@
+Collections:
+  - Name: StrongSORT++
+    Metadata:
+      Training Techniques:
+        - SGD with Momentum
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - YOLOX
+    Paper:
+      URL: https://arxiv.org/abs/2202.13514
+      Title: "StrongSORT: Make DeepSORT Great Again"
+    README: configs/strongsort/README.md
+
+Models:
+  - Name: strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval
+    In Collection: StrongSORT++
+    Config: configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
+    Metadata:
+      Training Data: CrowdHuman + MOT17-half-train
+    Results:
+      - Task: Multiple Object Tracking
+        Dataset: MOT17-half-val
+        Metrics:
+          MOTA: 78.3
+          IDF1: 83.2
+          HOTA: 70.9
+    Weights:
+      - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot17-private-half_20220812_192036-b6c9ce9a.pth
+      - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth
+      - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth
+
+  - Name: strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test
+    In Collection: StrongSORT++
+    Config: configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py
+    Metadata:
+      Training Data: CrowdHuman + MOT20-train
+    Results:
+      - Task: Multiple Object Tracking
+        Dataset: MOT20-test
+        Metrics:
+          MOTA: 75.5
+          IDF1: 77.3
+          HOTA: 62.9
+    Weights:
+      - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot20-private_20220812_192123-77c014de.pth
+      - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth
+      - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth
diff --git a/mmde/mmdet/.mim/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..532e2aee718fb481bc81759a2853ac0fddf80e0e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,130 @@
+_base_ = [
+    './yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py',  # noqa: E501
+]
+
+dataset_type = 'MOTChallengeDataset'
+detector = _base_.model
+detector.pop('data_preprocessor')
+del _base_.model
+
+model = dict(
+    type='StrongSORT',
+    data_preprocessor=dict(
+        type='TrackDataPreprocessor',
+        pad_size_divisor=32,
+        batch_augments=[
+            dict(
+                type='BatchSyncRandomResize',
+                random_size_range=(576, 1024),
+                size_divisor=32,
+                interval=10)
+        ]),
+    detector=detector,
+    reid=dict(
+        type='BaseReID',
+        data_preprocessor=dict(type='mmpretrain.ClsDataPreprocessor'),
+        backbone=dict(
+            type='mmpretrain.ResNet',
+            depth=50,
+            num_stages=4,
+            out_indices=(3, ),
+            style='pytorch'),
+        neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1),
+        head=dict(
+            type='LinearReIDHead',
+            num_fcs=1,
+            in_channels=2048,
+            fc_channels=1024,
+            out_channels=128,
+            num_classes=380,
+            loss_cls=dict(type='mmpretrain.CrossEntropyLoss', loss_weight=1.0),
+            loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0),
+            norm_cfg=dict(type='BN1d'),
+            act_cfg=dict(type='ReLU'))),
+    cmc=dict(
+        type='CameraMotionCompensation',
+        warp_mode='cv2.MOTION_EUCLIDEAN',
+        num_iters=100,
+        stop_eps=0.00001),
+    tracker=dict(
+        type='StrongSORTTracker',
+        motion=dict(type='KalmanFilter', center_only=False, use_nsa=True),
+        obj_score_thr=0.6,
+        reid=dict(
+            num_samples=None,
+            img_scale=(256, 128),
+            img_norm_cfg=dict(
+                mean=[123.675, 116.28, 103.53],
+                std=[58.395, 57.12, 57.375],
+                to_rgb=True),
+            match_score_thr=0.3,
+            motion_weight=0.02,
+        ),
+        match_iou_thr=0.7,
+        momentums=dict(embeds=0.1, ),
+        num_tentatives=2,
+        num_frames_retain=100),
+    postprocess_model=dict(
+        type='AppearanceFreeLink',
+        checkpoint=  # noqa: E251
+        'https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth',  # noqa: E501
+        temporal_threshold=(0, 30),
+        spatial_threshold=50,
+        confidence_threshold=0.95,
+    ))
+
+train_pipeline = None
+test_pipeline = [
+    dict(
+        type='TransformBroadcaster',
+        transforms=[
+            dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+            dict(type='Resize', scale=_base_.img_scale, keep_ratio=True),
+            dict(
+                type='Pad',
+                size_divisor=32,
+                pad_val=dict(img=(114.0, 114.0, 114.0))),
+            dict(type='LoadTrackAnnotations'),
+        ]),
+    dict(type='PackTrackInputs')
+]
+
+train_dataloader = None
+val_dataloader = dict(
+    # Now StrongSORT only support video_based sampling
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
+    dataset=dict(
+        _delete_=True,
+        type=dataset_type,
+        data_root=_base_.data_root,
+        ann_file='annotations/half-val_cocoformat.json',
+        data_prefix=dict(img_path='train'),
+        # when you evaluate track performance, you need to remove metainfo
+        test_mode=True,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+train_cfg = None
+optim_wrapper = None
+
+# evaluator
+val_evaluator = dict(
+    _delete_=True,
+    type='MOTChallengeMetric',
+    metric=['HOTA', 'CLEAR', 'Identity'],
+    # use_postprocess to support AppearanceFreeLink in val_evaluator
+    use_postprocess=True,
+    postprocess_tracklet_cfg=[
+        dict(
+            type='InterpolateTracklets',
+            min_num_frames=5,
+            max_num_frames=20,
+            use_gsi=True,
+            smooth_tau=10)
+    ])
+test_evaluator = val_evaluator
+
+default_hooks = dict(logger=dict(type='LoggerHook', interval=1))
+
+del _base_.param_scheduler
+del _base_.custom_hooks
diff --git a/mmde/mmdet/.mim/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py b/mmde/mmdet/.mim/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py
new file mode 100644
index 0000000000000000000000000000000000000000..eab97063932528df7e17c7d65bf9f0d13f5dfa73
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py
@@ -0,0 +1,44 @@
+_base_ = [
+    './strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain'
+    '_test-mot17halfval.py'
+]
+
+img_scale = (1600, 896)  # width, height
+
+model = dict(
+    data_preprocessor=dict(
+        type='TrackDataPreprocessor',
+        pad_size_divisor=32,
+        batch_augments=[
+            dict(type='BatchSyncRandomResize', random_size_range=(640, 1152))
+        ]))
+
+test_pipeline = [
+    dict(
+        type='TransformBroadcaster',
+        transforms=[
+            dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+            dict(type='Resize', scale=img_scale, keep_ratio=True),
+            dict(
+                type='Pad',
+                size_divisor=32,
+                pad_val=dict(img=(114.0, 114.0, 114.0))),
+            dict(type='LoadTrackAnnotations'),
+        ]),
+    dict(type='PackTrackInputs')
+]
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root='data/MOT17',
+        ann_file='annotations/train_cocoformat.json',
+        data_prefix=dict(img_path='train'),
+        pipeline=test_pipeline))
+test_dataloader = dict(
+    dataset=dict(
+        data_root='data/MOT20',
+        ann_file='annotations/test_cocoformat.json',
+        data_prefix=dict(img_path='test'),
+        pipeline=test_pipeline))
+
+test_evaluator = dict(format_only=True, outfile_prefix='./mot_20_test_res')
diff --git a/mmde/mmdet/.mim/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmde/mmdet/.mim/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
new file mode 100644
index 0000000000000000000000000000000000000000..59a52e4394b5825d40a99e08793147fe836b4c19
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py
@@ -0,0 +1,188 @@
+_base_ = ['../yolox/yolox_x_8xb8-300e_coco.py']
+
+data_root = 'data/MOT17/'
+
+img_scale = (1440, 800)  # width, height
+batch_size = 4
+
+# model settings
+model = dict(
+    bbox_head=dict(num_classes=1),
+    test_cfg=dict(nms=dict(iou_threshold=0.7)),
+    init_cfg=dict(
+        type='Pretrained',
+        checkpoint=  # noqa: E251
+        'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth'  # noqa: E501
+    ))
+
+train_pipeline = [
+    dict(
+        type='Mosaic',
+        img_scale=img_scale,
+        pad_val=114.0,
+        bbox_clip_border=False),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2),
+        bbox_clip_border=False),
+    dict(
+        type='MixUp',
+        img_scale=img_scale,
+        ratio_range=(0.8, 1.6),
+        pad_val=114.0,
+        bbox_clip_border=False),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='Resize',
+        scale=img_scale,
+        keep_ratio=True,
+        clip_object_border=False),
+    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='Resize', scale=img_scale, keep_ratio=True),
+    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    _delete_=True,
+    batch_size=batch_size,
+    num_workers=4,
+    persistent_workers=True,
+    pin_memory=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='MultiImageMixDataset',
+        dataset=dict(
+            type='ConcatDataset',
+            datasets=[
+                dict(
+                    type='CocoDataset',
+                    data_root=data_root,
+                    ann_file='annotations/half-train_cocoformat.json',
+                    data_prefix=dict(img='train'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+                dict(
+                    type='CocoDataset',
+                    data_root='data/crowdhuman',
+                    ann_file='annotations/crowdhuman_train.json',
+                    data_prefix=dict(img='train'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+                dict(
+                    type='CocoDataset',
+                    data_root='data/crowdhuman',
+                    ann_file='annotations/crowdhuman_val.json',
+                    data_prefix=dict(img='val'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+            ]),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    dataset=dict(
+        data_root=data_root,
+        ann_file='annotations/half-val_cocoformat.json',
+        data_prefix=dict(img='train'),
+        metainfo=dict(classes=('pedestrian', )),
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# training settings
+max_epochs = 80
+num_last_epochs = 10
+interval = 5
+
+train_cfg = dict(max_epochs=max_epochs, val_begin=75, val_interval=1)
+
+# optimizer
+# default 8 gpu
+base_lr = 0.001 / 8 * batch_size
+optim_wrapper = dict(optimizer=dict(lr=base_lr))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='QuadraticWarmupLR',
+        by_epoch=True,
+        begin=0,
+        end=1,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        eta_min=base_lr * 0.05,
+        begin=1,
+        T_max=max_epochs - num_last_epochs,
+        end=max_epochs - num_last_epochs,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='ConstantLR',
+        by_epoch=True,
+        factor=1,
+        begin=max_epochs - num_last_epochs,
+        end=max_epochs,
+    )
+]
+
+default_hooks = dict(
+    checkpoint=dict(
+        interval=1,
+        max_keep_ckpts=5  # only keep latest 5 checkpoints
+    ))
+
+custom_hooks = [
+    dict(
+        type='YOLOXModeSwitchHook',
+        num_last_epochs=num_last_epochs,
+        priority=48),
+    dict(type='SyncNormHook', priority=48),
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0001,
+        update_buffers=True,
+        priority=49)
+]
+
+# evaluator
+val_evaluator = dict(
+    ann_file=data_root + 'annotations/half-val_cocoformat.json',
+    format_only=False)
+test_evaluator = val_evaluator
+
+del _base_.tta_model
+del _base_.tta_pipeline
+del _base_.train_dataset
diff --git a/mmde/mmdet/.mim/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py b/mmde/mmdet/.mim/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4eb3cb2c9804f0219ba91d0b5d460da342ab668
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py
@@ -0,0 +1,108 @@
+_base_ = ['./yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py']
+
+data_root = 'data/MOT20/'
+
+img_scale = (1600, 896)  # width, height
+
+# model settings
+model = dict(
+    data_preprocessor=dict(batch_augments=[
+        dict(type='BatchSyncRandomResize', random_size_range=(640, 1152))
+    ]))
+
+train_pipeline = [
+    dict(
+        type='Mosaic',
+        img_scale=img_scale,
+        pad_val=114.0,
+        bbox_clip_border=True),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        border=(-img_scale[0] // 2, -img_scale[1] // 2),
+        bbox_clip_border=True),
+    dict(
+        type='MixUp',
+        img_scale=img_scale,
+        ratio_range=(0.8, 1.6),
+        pad_val=114.0,
+        bbox_clip_border=True),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='Resize',
+        scale=img_scale,
+        keep_ratio=True,
+        clip_object_border=True),
+    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='Resize', scale=img_scale, keep_ratio=True),
+    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        type='MultiImageMixDataset',
+        dataset=dict(
+            type='ConcatDataset',
+            datasets=[
+                dict(
+                    type='CocoDataset',
+                    data_root=data_root,
+                    ann_file='annotations/train_cocoformat.json',
+                    data_prefix=dict(img='train'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+                dict(
+                    type='CocoDataset',
+                    data_root='data/crowdhuman',
+                    ann_file='annotations/crowdhuman_train.json',
+                    data_prefix=dict(img='train'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+                dict(
+                    type='CocoDataset',
+                    data_root='data/crowdhuman',
+                    ann_file='annotations/crowdhuman_val.json',
+                    data_prefix=dict(img='val'),
+                    filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                    metainfo=dict(classes=('pedestrian', )),
+                    pipeline=[
+                        dict(
+                            type='LoadImageFromFile',
+                            backend_args=_base_.backend_args),
+                        dict(type='LoadAnnotations', with_bbox=True),
+                    ]),
+            ]),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    dataset=dict(
+        data_root='data/MOT17', ann_file='annotations/train_cocoformat.json'))
+test_dataloader = val_dataloader
+
+# evaluator
+val_evaluator = dict(ann_file='data/MOT17/annotations/train_cocoformat.json')
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py b/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a3e8ad900553c38d11ddc7747cbc0f244f6b4c7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py'
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth'  # noqa
+model = dict(
+    backbone=dict(
+        depths=[2, 2, 18, 2],
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)))
diff --git a/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5471caa139c0b7670f995501347ddf80383e9268
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py
@@ -0,0 +1,60 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+model = dict(
+    type='MaskRCNN',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[96, 192, 384, 768]))
+
+max_epochs = 12
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'relative_position_bias_table': dict(decay_mult=0.),
+            'norm': dict(decay_mult=0.)
+        }),
+    optimizer=dict(
+        _delete_=True,
+        type='AdamW',
+        lr=0.0001,
+        betas=(0.9, 0.999),
+        weight_decay=0.05))
diff --git a/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py b/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..622087ba7164fda53a70eb927b9258572b7c8ef0
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py
@@ -0,0 +1,3 @@
+_base_ = './mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py'
+# Enable automatic-mixed-precision training with AmpOptimWrapper.
+optim_wrapper = dict(type='AmpOptimWrapper')
diff --git a/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py b/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7024b73249ca8c77da89ab9e4653757f36a1d1d2
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py
@@ -0,0 +1,99 @@
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+
+model = dict(
+    type='MaskRCNN',
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[96, 192, 384, 768]))
+
+# augmentation strategy originates from DETR / Sparse RCNN
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[[
+            dict(
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
+                keep_ratio=True)
+        ],
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+max_epochs = 36
+train_cfg = dict(max_epochs=max_epochs)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'relative_position_bias_table': dict(decay_mult=0.),
+            'norm': dict(decay_mult=0.)
+        }),
+    optimizer=dict(
+        _delete_=True,
+        type='AdamW',
+        lr=0.0001,
+        betas=(0.9, 0.999),
+        weight_decay=0.05))
diff --git a/mmde/mmdet/.mim/configs/swin/metafile.yml b/mmde/mmdet/.mim/configs/swin/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..763f9300d44bcc3f9348951f3640ada171c3ce05
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/swin/metafile.yml
@@ -0,0 +1,120 @@
+Models:
+  - Name: mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/swin/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py
+    Metadata:
+      Training Memory (GB): 11.9
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Swin Transformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.2
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 43.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808-b92c91f1.pth
+    Paper:
+      URL: https://arxiv.org/abs/2107.08430
+      Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows'
+    README: configs/swin/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.16.0
+
+  - Name: mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/swin/mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py
+    Metadata:
+      Training Memory (GB): 10.2
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Swin Transformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725-bacf6f7b.pth
+    Paper:
+      URL: https://arxiv.org/abs/2107.08430
+      Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows'
+    README: configs/swin/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.16.0
+
+  - Name: mask-rcnn_swin-t-p4-w7_fpn_1x_coco
+    In Collection: Mask R-CNN
+    Config: configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      Epochs: 12
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Swin Transformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.7
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 39.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth
+    Paper:
+      URL: https://arxiv.org/abs/2107.08430
+      Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows'
+    README: configs/swin/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.16.0
+
+  - Name: mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco
+    In Collection: Mask R-CNN
+    Config: configs/swin/mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py
+    Metadata:
+      Training Memory (GB): 7.8
+      Epochs: 36
+      Training Data: COCO
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Swin Transformer
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.0
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 41.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth
+    Paper:
+      URL: https://arxiv.org/abs/2107.08430
+      Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows'
+    README: configs/swin/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465
+      Version: v2.16.0
diff --git a/mmde/mmdet/.mim/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f40a87e8cf8593edd92f024d0bb0ed43a87b4fb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py
@@ -0,0 +1,31 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        # Please only add indices that would be used
+        # in FPN, otherwise some parameter will not be used
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[192, 384, 768], start_level=0, num_outs=5))
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b87dddf50f7179dc143b9ab9aecb07d09d4dea4b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py
@@ -0,0 +1,23 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmpretrain
+# import mmpretrain.models to trigger register_module in mmpretrain
+custom_imports = dict(
+    imports=['mmpretrain.models'], allow_failed_imports=False)
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmpretrain.TIMMBackbone',
+        model_name='efficientnet_b1',
+        features_only=True,
+        pretrained=True,
+        out_indices=(1, 2, 3, 4)),
+    neck=dict(in_channels=[24, 40, 112, 320]))
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/timm_example/retinanet_timm-tv-resnet50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/timm_example/retinanet_timm-tv-resnet50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..74e43506959574abbf08feb44848f4bfa8d65719
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/timm_example/retinanet_timm-tv-resnet50_fpn_1x_coco.py
@@ -0,0 +1,22 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# please install mmpretrain
+# import mmpretrain.models to trigger register_module in mmpretrain
+custom_imports = dict(
+    imports=['mmpretrain.models'], allow_failed_imports=False)
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='mmpretrain.TIMMBackbone',
+        model_name='tv_resnet50',  # ResNet-50 with torchvision weights
+        features_only=True,
+        pretrained=True,
+        out_indices=(1, 2, 3, 4)))
+
+# optimizer
+optim_wrapper = dict(optimizer=dict(lr=0.01))
diff --git a/mmde/mmdet/.mim/configs/tood/metafile.yml b/mmde/mmdet/.mim/configs/tood/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d2bc08073a10ef153b9c97f4d2742e5f85015aa5
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tood/metafile.yml
@@ -0,0 +1,95 @@
+Collections:
+  - Name: TOOD
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - TOOD
+    Paper:
+      URL: https://arxiv.org/abs/2108.07755
+      Title: 'TOOD: Task-aligned One-stage Object Detection'
+    README: configs/tood/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.20.0/mmdet/models/detectors/tood.py#L7
+      Version: v2.20.0
+
+Models:
+  - Name: tood_r101_fpn_ms-2x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r101_fpn_ms-2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.0
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.1
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232-a18f53c8.pth
+
+  - Name: tood_x101-64x4d_fpn_ms-2x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_x101-64x4d_fpn_ms-2x_coco.py
+    Metadata:
+      Training Memory (GB): 10.2
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 47.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519-a4f36113.pth
+
+  - Name: tood_r101-dconv-c3-c5_fpn_ms-2x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py
+    Metadata:
+      Training Memory (GB): 6.2
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728-4a824142.pth
+
+  - Name: tood_r50_fpn_anchor-based_1x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r50_fpn_anchor-based_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105-b776c134.pth
+
+  - Name: tood_r50_fpn_1x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r50_fpn_1x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 42.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth
+
+  - Name: tood_r50_fpn_ms-2x_coco
+    In Collection: TOOD
+    Config: configs/tood/tood_r50_fpn_ms-2x_coco.py
+    Metadata:
+      Training Memory (GB): 4.1
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231-3b23174c.pth
diff --git a/mmde/mmdet/.mim/configs/tood/tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/tood/tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..45030a6832db39a329d0901dde4a5320f34a9b6e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tood/tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './tood_r101_fpn_ms-2x_coco.py'
+
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)),
+    bbox_head=dict(num_dcn=2))
diff --git a/mmde/mmdet/.mim/configs/tood/tood_r101_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/tood/tood_r101_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc6ae5d942e05ac90162ca9ac67adb311d581e5b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tood/tood_r101_fpn_ms-2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './tood_r50_fpn_ms-2x_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/tood/tood_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/tood/tood_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4839d9d77e64d61b504ed8789bda225cc878da1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tood/tood_r50_fpn_1x_coco.py
@@ -0,0 +1,80 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='TOOD',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5),
+    bbox_head=dict(
+        type='TOODHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=6,
+        feat_channels=256,
+        anchor_type='anchor_free',
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            octave_base_scale=8,
+            scales_per_octave=1,
+            strides=[8, 16, 32, 64, 128]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        initial_loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            activated=True,  # use probability instead of logit as input
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            activated=True,  # use probability instead of logit as input
+            beta=2.0,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0)),
+    train_cfg=dict(
+        initial_epoch=4,
+        initial_assigner=dict(type='ATSSAssigner', topk=9),
+        assigner=dict(type='TaskAlignedAssigner', topk=13),
+        alpha=1,
+        beta=6,
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/tood/tood_r50_fpn_anchor-based_1x_coco.py b/mmde/mmdet/.mim/configs/tood/tood_r50_fpn_anchor-based_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7fbf6aff197b821de07f8d4a73f9c72e5f76288
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tood/tood_r50_fpn_anchor-based_1x_coco.py
@@ -0,0 +1,2 @@
+_base_ = './tood_r50_fpn_1x_coco.py'
+model = dict(bbox_head=dict(anchor_type='anchor_based'))
diff --git a/mmde/mmdet/.mim/configs/tood/tood_r50_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/tood/tood_r50_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffb296dccee30438977bac61b970f5844d647cfa
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tood/tood_r50_fpn_ms-2x_coco.py
@@ -0,0 +1,30 @@
+_base_ = './tood_r50_fpn_1x_coco.py'
+max_epochs = 24
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+# training schedule for 2x
+train_cfg = dict(max_epochs=max_epochs)
+
+# multi-scale training
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 480), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/tood/tood_x101-64x4d-dconv-c4-c5_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/tood/tood_x101-64x4d-dconv-c4-c5_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..43405196184715923bb22499958c74fe9bf4a2da
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tood/tood_x101-64x4d-dconv-c4-c5_fpn_ms-2x_coco.py
@@ -0,0 +1,7 @@
+_base_ = './tood_x101-64x4d_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True),
+    ),
+    bbox_head=dict(num_dcn=2))
diff --git a/mmde/mmdet/.mim/configs/tood/tood_x101-64x4d_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/tood/tood_x101-64x4d_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1651542c7562553f206ba763fb9a43838e042450
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tood/tood_x101-64x4d_fpn_ms-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './tood_r50_fpn_ms-2x_coco.py'
+
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/tridentnet/metafile.yml b/mmde/mmdet/.mim/configs/tridentnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c0081c5be02986efbfdad9f199aa8ccd4b599d0f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tridentnet/metafile.yml
@@ -0,0 +1,55 @@
+Collections:
+  - Name: TridentNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - ResNet
+        - TridentNet Block
+    Paper:
+      URL: https://arxiv.org/abs/1901.01892
+      Title: 'Scale-Aware Trident Networks for Object Detection'
+    README: configs/tridentnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.8.0/mmdet/models/detectors/trident_faster_rcnn.py#L6
+      Version: v2.8.0
+
+Models:
+  - Name: tridentnet_r50-caffe_1x_coco
+    In Collection: TridentNet
+    Config: configs/tridentnet/tridentnet_r50-caffe_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838-2ec0b530.pth
+
+  - Name: tridentnet_r50-caffe_ms-1x_coco
+    In Collection: TridentNet
+    Config: configs/tridentnet/tridentnet_r50-caffe_ms-1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839-6ce55ccb.pth
+
+  - Name: tridentnet_r50-caffe_ms-3x_coco
+    In Collection: TridentNet
+    Config: configs/tridentnet/tridentnet_r50-caffe_ms-3x_coco.py
+    Metadata:
+      Epochs: 36
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.3
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539-46d227ba.pth
diff --git a/mmde/mmdet/.mim/configs/tridentnet/tridentnet_r50-caffe_1x_coco.py b/mmde/mmdet/.mim/configs/tridentnet/tridentnet_r50-caffe_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..26a4c12316ee80c7dfae1624af3f4146dba0a414
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tridentnet/tridentnet_r50-caffe_1x_coco.py
@@ -0,0 +1,22 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50-caffe-c4.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+
+model = dict(
+    type='TridentFasterRCNN',
+    backbone=dict(
+        type='TridentResNet',
+        trident_dilations=(1, 2, 3),
+        num_branch=3,
+        test_branch_idx=1,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+    roi_head=dict(type='TridentRoIHead', num_branch=3, test_branch_idx=1),
+    train_cfg=dict(
+        rpn_proposal=dict(max_per_img=500),
+        rcnn=dict(
+            sampler=dict(num=128, pos_fraction=0.5,
+                         add_gt_as_proposals=False))))
diff --git a/mmde/mmdet/.mim/configs/tridentnet/tridentnet_r50-caffe_ms-1x_coco.py b/mmde/mmdet/.mim/configs/tridentnet/tridentnet_r50-caffe_ms-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..806d20b90c96be9357eccd9f9ca8c880b0716cae
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tridentnet/tridentnet_r50-caffe_ms-1x_coco.py
@@ -0,0 +1,15 @@
+_base_ = 'tridentnet_r50-caffe_1x_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
diff --git a/mmde/mmdet/.mim/configs/tridentnet/tridentnet_r50-caffe_ms-3x_coco.py b/mmde/mmdet/.mim/configs/tridentnet/tridentnet_r50-caffe_ms-3x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..4de249c60c234a9d301658594f7b072b0b48017b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/tridentnet/tridentnet_r50-caffe_ms-3x_coco.py
@@ -0,0 +1,18 @@
+_base_ = 'tridentnet_r50-caffe_ms-1x_coco.py'
+
+# learning rate
+max_epochs = 36
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
diff --git a/mmde/mmdet/.mim/configs/v3det/cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmde/mmdet/.mim/configs/v3det/cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..567c31bd0e986e071b50ff2aac9cb896d4daf6fd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
@@ -0,0 +1,171 @@
+_base_ = [
+    '../_base_/models/cascade-rcnn_r50_fpn.py', '../_base_/datasets/v3det.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    rpn_head=dict(
+        loss_bbox=dict(_delete_=True, type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(bbox_head=[
+        dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=13204,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=True,
+            cls_predictor_cfg=dict(
+                type='NormedLinear', tempearture=50, bias=True),
+            loss_cls=dict(
+                type='CrossEntropyCustomLoss',
+                num_classes=13204,
+                use_sigmoid=True,
+                loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+        dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=13204,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.05, 0.05, 0.1, 0.1]),
+            reg_class_agnostic=True,
+            cls_predictor_cfg=dict(
+                type='NormedLinear', tempearture=50, bias=True),
+            loss_cls=dict(
+                type='CrossEntropyCustomLoss',
+                num_classes=13204,
+                use_sigmoid=True,
+                loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+        dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=13204,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.033, 0.033, 0.067, 0.067]),
+            reg_class_agnostic=True,
+            cls_predictor_cfg=dict(
+                type='NormedLinear', tempearture=50, bias=True),
+            loss_cls=dict(
+                type='CrossEntropyCustomLoss',
+                num_classes=13204,
+                use_sigmoid=True,
+                loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))
+    ]),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn_proposal=dict(nms_pre=4000, max_per_img=2000),
+        rcnn=[
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1,
+                    perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1,
+                    perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1,
+                    perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            nms=dict(type='nms', iou_threshold=0.6),
+            max_per_img=300)))
+# dataset settings
+train_dataloader = dict(batch_size=4, num_workers=8)
+
+# training schedule for 1x
+max_iter = 68760 * 2
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=max_iter,
+    val_interval=max_iter)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 2048,
+        by_epoch=False,
+        begin=0,
+        end=5000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[45840 * 2, 63030 * 2],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(_delete_=True, type='AdamW', lr=1e-4 * 1, weight_decay=0.1),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=5730 * 2))
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/v3det/cascade_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmde/mmdet/.mim/configs/v3det/cascade_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6493323ba8d92d2628fb4784f5a12dd564460be
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/cascade_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
@@ -0,0 +1,27 @@
+_base_ = [
+    './cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py',
+]
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth'  # noqa
+
+# model settings
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[128, 256, 512, 1024]))
diff --git a/mmde/mmdet/.mim/configs/v3det/deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py b/mmde/mmdet/.mim/configs/v3det/deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py
new file mode 100644
index 0000000000000000000000000000000000000000..97544a27edfd75eef4ba25fd12a122f03b392c1f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py
@@ -0,0 +1,108 @@
+_base_ = '../deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py'  # noqa
+
+model = dict(
+    bbox_head=dict(num_classes=13204),
+    test_cfg=dict(max_per_img=300),
+)
+
+data_root = 'data/V3Det/'
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='PackDetInputs')
+]
+
+train_dataloader = dict(
+    _delete_=True,
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(
+            type='V3DetDataset',
+            data_root=data_root,
+            ann_file='annotations/v3det_2023_v1_train.json',
+            data_prefix=dict(img=''),
+            filter_cfg=dict(filter_empty_gt=False),
+            pipeline=train_pipeline,
+            backend_args=None)))
+val_dataloader = dict(
+    dataset=dict(
+        type='V3DetDataset',
+        data_root=data_root,
+        ann_file='annotations/v3det_2023_v1_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    ann_file=data_root + 'annotations/v3det_2023_v1_val.json',
+    use_mp_eval=True,
+    proposal_nums=[300])
+test_evaluator = val_evaluator
+
+# training schedule for 50e
+# when using RFS, bs32, each epoch ~ 5730 iter
+max_iter = 286500
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=max_iter,
+    val_interval=max_iter / 5)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[229200],  # 40e
+        gamma=0.1)
+]
+
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(
+        type='CheckpointHook', by_epoch=False, interval=5730,
+        max_keep_ckpts=3))
+
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/v3det/deformable-detr-refine-twostage_swin_16xb2_sample1e-3_v3det_50e.py b/mmde/mmdet/.mim/configs/v3det/deformable-detr-refine-twostage_swin_16xb2_sample1e-3_v3det_50e.py
new file mode 100644
index 0000000000000000000000000000000000000000..e640cd604a97813a70588d5ffe23701543ab0087
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/deformable-detr-refine-twostage_swin_16xb2_sample1e-3_v3det_50e.py
@@ -0,0 +1,27 @@
+_base_ = 'deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py'
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
+
+train_dataloader = dict(batch_size=2, num_workers=2)
diff --git a/mmde/mmdet/.mim/configs/v3det/dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py b/mmde/mmdet/.mim/configs/v3det/dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9e6e6be0715512b111171c4b60cca7433f8ca34
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py
@@ -0,0 +1,109 @@
+_base_ = '../dino/dino-4scale_r50_8xb2-36e_coco.py'
+
+model = dict(
+    bbox_head=dict(num_classes=13204),
+    test_cfg=dict(max_per_img=300),
+)
+
+data_root = 'data/V3Det/'
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The radio of all image in train dataset < 7
+                    # follow the original implement
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(type='PackDetInputs')
+]
+train_dataloader = dict(
+    _delete_=True,
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='ClassBalancedDataset',
+        oversample_thr=1e-3,
+        dataset=dict(
+            type='V3DetDataset',
+            data_root=data_root,
+            ann_file='annotations/v3det_2023_v1_train.json',
+            data_prefix=dict(img=''),
+            filter_cfg=dict(filter_empty_gt=False),
+            pipeline=train_pipeline,
+            backend_args=None)))
+val_dataloader = dict(
+    dataset=dict(
+        type='V3DetDataset',
+        data_root=data_root,
+        ann_file='annotations/v3det_2023_v1_val.json',
+        data_prefix=dict(img='')))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    ann_file=data_root + 'annotations/v3det_2023_v1_val.json',
+    use_mp_eval=True,
+    proposal_nums=[300])
+test_evaluator = val_evaluator
+
+# training schedule for 36e
+# when using RFS, bs16, each epoch ~ 11460 iter
+max_iter = 412560
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=max_iter,
+    val_interval=max_iter / 5)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[343800],  # 30e
+        gamma=0.1)
+]
+
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(
+        type='CheckpointHook',
+        by_epoch=False,
+        interval=11460,
+        max_keep_ckpts=3))
+
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/v3det/dino-4scale_swin_16xb1_sample1e-3_v3det_36e.py b/mmde/mmdet/.mim/configs/v3det/dino-4scale_swin_16xb1_sample1e-3_v3det_36e.py
new file mode 100644
index 0000000000000000000000000000000000000000..100c4ba4b8cb2c0ac3e44f5e9ddcfc37bbfe6b55
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/dino-4scale_swin_16xb1_sample1e-3_v3det_36e.py
@@ -0,0 +1,27 @@
+_base_ = 'dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py'
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth'  # noqa
+
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[256, 512, 1024]),
+)
+
+train_dataloader = dict(batch_size=1)
diff --git a/mmde/mmdet/.mim/configs/v3det/faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmde/mmdet/.mim/configs/v3det/faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d306fb094806d75ec614b52a43bf6614d13eed4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
@@ -0,0 +1,72 @@
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/datasets/v3det.py',
+    '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(
+            num_classes=13204,
+            reg_class_agnostic=True,
+            cls_predictor_cfg=dict(
+                type='NormedLinear', tempearture=50, bias=True),
+            loss_cls=dict(
+                type='CrossEntropyCustomLoss',
+                num_classes=13204,
+                use_sigmoid=True,
+                loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn_proposal=dict(nms_pre=4000, max_per_img=2000),
+        rcnn=dict(
+            assigner=dict(
+                perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)))),
+    test_cfg=dict(
+        rcnn=dict(
+            score_thr=0.0001,
+            nms=dict(type='nms', iou_threshold=0.6),
+            max_per_img=300)))
+# dataset settings
+train_dataloader = dict(batch_size=4, num_workers=8)
+
+# training schedule for 2x
+max_iter = 68760 * 2
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=max_iter,
+    val_interval=max_iter)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 2048,
+        by_epoch=False,
+        begin=0,
+        end=5000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[45840 * 2, 63030 * 2],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(_delete_=True, type='AdamW', lr=1e-4 * 1, weight_decay=0.1),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=5730 * 2))
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/v3det/faster_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmde/mmdet/.mim/configs/v3det/faster_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0b1110811230b4bda27da9fd2e58067c7326c52
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/faster_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
@@ -0,0 +1,27 @@
+_base_ = [
+    './faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py',
+]
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth'  # noqa
+
+# model settings
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[128, 256, 512, 1024]))
diff --git a/mmde/mmdet/.mim/configs/v3det/fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmde/mmdet/.mim/configs/v3det/fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78e38c93cb0fdedff3948f1ce7b5b7787efcaea
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
@@ -0,0 +1,116 @@
+_base_ = [
+    '../_base_/datasets/v3det.py', '../_base_/schedules/schedule_2x.py',
+    '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='FCOS',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',  # use P5
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='FCOSHead',
+        num_classes=13204,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        cls_predictor_cfg=dict(type='NormedLinear', tempearture=50, bias=True),
+        loss_cls=dict(
+            type='FocalCustomLoss',
+            use_sigmoid=True,
+            num_classes=13204,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='IoULoss', loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0,
+            ignore_iof_thr=-1,
+            perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.0001,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=300))
+# dataset settings
+
+backend_args = None
+
+train_dataloader = dict(batch_size=2, num_workers=8)
+
+# training schedule for 2x
+max_iter = 68760 * 2 * 2
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=max_iter,
+    val_interval=max_iter)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 2048,
+        by_epoch=False,
+        begin=0,
+        end=5000 * 2),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[45840 * 2 * 2, 63030 * 2 * 2],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=1e-4 * 0.25, weight_decay=0.1),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=5730 * 2))
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
+
+find_unused_parameters = True
diff --git a/mmde/mmdet/.mim/configs/v3det/fcos_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmde/mmdet/.mim/configs/v3det/fcos_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ca952a28fc08ae9b14ad30308eff823b1bba55e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/v3det/fcos_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py
@@ -0,0 +1,27 @@
+_base_ = [
+    './fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py',
+]
+
+pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth'  # noqa
+
+# model settings
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.3,
+        patch_norm=True,
+        out_indices=(0, 1, 2, 3),
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+    neck=dict(in_channels=[128, 256, 512, 1024], force_grad_on_level=True))
diff --git a/mmde/mmdet/.mim/configs/vfnet/metafile.yml b/mmde/mmdet/.mim/configs/vfnet/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..1b791d01d50ad8a28bff225fa1d3f5af8d348207
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/metafile.yml
@@ -0,0 +1,116 @@
+Collections:
+  - Name: VFNet
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+        - Varifocal Loss
+    Paper:
+      URL: https://arxiv.org/abs/2008.13367
+      Title: 'VarifocalNet: An IoU-aware Dense Object Detector'
+    README: configs/vfnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.6.0/mmdet/models/detectors/vfnet.py#L6
+      Version: v2.6.0
+
+Models:
+  - Name: vfnet_r50_fpn_1x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r50_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 41.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco_20201027-38db6f58.pth
+
+  - Name: vfnet_r50_fpn_ms-2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 44.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco_20201027-7cc75bd2.pth
+
+  - Name: vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 48.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-6879c318.pth
+
+  - Name: vfnet_r101_fpn_1x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r101_fpn_1x_coco.py
+    Metadata:
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 43.6
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco_20201027pth-c831ece7.pth
+
+  - Name: vfnet_r101_fpn_ms-2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r101_fpn_ms-2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 46.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco_20201027pth-4a5d53f1.pth
+
+  - Name: vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-7729adb5.pth
+
+  - Name: vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-d300a6fc.pth
+
+  - Name: vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco
+    In Collection: VFNet
+    Config: configs/vfnet/vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py
+    Metadata:
+      Epochs: 24
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-b5f6da5e.pth
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dd67a3bcce3bbb66531997133880d65af0c856a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_r101_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_r101_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b296a07959e43517d792f36f356404a232fb0dc3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_r101_fpn_1x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './vfnet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_r101_fpn_2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_r101_fpn_2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..37a7bacb5e409a75ae2cd71fc022837f09537aa7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_r101_fpn_2x_coco.py
@@ -0,0 +1,20 @@
+_base_ = './vfnet_r50_fpn_1x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
+# learning policy
+max_epochs = 24
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_r101_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_r101_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..62f064b7473f4e6fec3ac50962240ac1f828753f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_r101_fpn_ms-2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './vfnet_r50_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..08adf927599b7759dea0e2d14c37ce716482b301
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py
@@ -0,0 +1,6 @@
+_base_ = './vfnet_r50_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True)),
+    bbox_head=dict(dcn_on_last_conv=True))
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_r50_fpn_1x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..99bc3b5f4c78c7a7cda11e20f209ea40af7dfd80
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_r50_fpn_1x_coco.py
@@ -0,0 +1,104 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='VFNet',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',  # use P5
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='VFNetHead',
+        num_classes=80,
+        in_channels=256,
+        stacked_convs=3,
+        feat_channels=256,
+        strides=[8, 16, 32, 64, 128],
+        center_sampling=False,
+        dcn_on_last_conv=False,
+        use_atss=True,
+        use_vfl=True,
+        loss_cls=dict(
+            type='VarifocalLoss',
+            use_sigmoid=True,
+            alpha=0.75,
+            gamma=2.0,
+            iou_weighted=True,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.5),
+        loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(type='ATSSAssigner', topk=9),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+
+# data setting
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.01),
+    paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+    clip_grad=None)
+# learning rate
+max_epochs = 12
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f8eed298e81967582420ac45a241b2726c47f6a
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py
@@ -0,0 +1,36 @@
+_base_ = './vfnet_r50_fpn_1x_coco.py'
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='RandomResize', scale=[(1333, 480), (1333, 960)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+# learning policy
+max_epochs = 24
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+train_cfg = dict(max_epochs=max_epochs)
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_res2net-101_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_res2net-101_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..94288e8e80e5be2c6e8effd38e30e239cd1e3c5f
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_res2net-101_fpn_ms-2x_coco.py
@@ -0,0 +1,16 @@
+_base_ = './vfnet_r50_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_res2net101-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_res2net101-mdconv-c3-c5_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..269330d3d8c218e51c3e65b550e4afc3296f2ec4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_res2net101-mdconv-c3-c5_fpn_ms-2x_coco.py
@@ -0,0 +1,18 @@
+_base_ = './vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='Res2Net',
+        depth=101,
+        scales=4,
+        base_width=26,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://res2net101_v1d_26w_4s')))
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..465da0cbdf4c4ae34d648349f4f9fa2d3fb13fe6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-32x4d_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-32x4d_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..486bcfe5ebd85f8c4ac3b211694e7dd9d13aa302
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-32x4d_fpn_ms-2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './vfnet_r50_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')))
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..14a070e73ff54d6833aced096e2d94da4171ca42
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py
@@ -0,0 +1,17 @@
+_base_ = './vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, True, True, True),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-64x4d_fpn_ms-2x_coco.py b/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-64x4d_fpn_ms-2x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..92e3f71df6818a5653ec9c0475c277d89a1adb47
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/vfnet/vfnet_x101-64x4d_fpn_ms-2x_coco.py
@@ -0,0 +1,15 @@
+_base_ = './vfnet_r50_fpn_ms-2x_coco.py'
+model = dict(
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=64,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')))
diff --git a/mmde/mmdet/.mim/configs/wider_face/retinanet_r50_fpn_1x_widerface.py b/mmde/mmdet/.mim/configs/wider_face/retinanet_r50_fpn_1x_widerface.py
new file mode 100644
index 0000000000000000000000000000000000000000..78067255f8f69f9d193e8d3ae2fe8a685e4defe1
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/wider_face/retinanet_r50_fpn_1x_widerface.py
@@ -0,0 +1,10 @@
+_base_ = [
+    '../_base_/models/retinanet_r50_fpn.py',
+    '../_base_/datasets/wider_face.py', '../_base_/schedules/schedule_1x.py',
+    '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(bbox_head=dict(num_classes=1))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
diff --git a/mmde/mmdet/.mim/configs/wider_face/ssd300_8xb32-24e_widerface.py b/mmde/mmdet/.mim/configs/wider_face/ssd300_8xb32-24e_widerface.py
new file mode 100644
index 0000000000000000000000000000000000000000..02c3c927f78ff022b03bf180789ce91d6061ec9e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/wider_face/ssd300_8xb32-24e_widerface.py
@@ -0,0 +1,64 @@
+_base_ = [
+    '../_base_/models/ssd300.py', '../_base_/datasets/wider_face.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_2x.py'
+]
+model = dict(bbox_head=dict(num_classes=1))
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(
+        type='Expand',
+        mean={{_base_.model.data_preprocessor.mean}},
+        to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}},
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=(300, 300), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
+    dict(type='Resize', scale=(300, 300), keep_ratio=False),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+dataset_type = 'WIDERFaceDataset'
+data_root = 'data/WIDERFace/'
+train_dataloader = dict(
+    batch_size=32, num_workers=8, dataset=dict(pipeline=train_pipeline))
+
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(type='MultiStepLR', by_epoch=True, milestones=[16, 20], gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(lr=0.012, momentum=0.9, weight_decay=5e-4),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (32 samples per GPU)
+auto_scale_lr = dict(base_batch_size=256)
diff --git a/mmde/mmdet/.mim/configs/yolact/metafile.yml b/mmde/mmdet/.mim/configs/yolact/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9ca76b3d3910f497e97275d0f25b1b1c3062d12b
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolact/metafile.yml
@@ -0,0 +1,81 @@
+Collections:
+  - Name: YOLACT
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - FPN
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/1904.02689
+      Title: 'YOLACT: Real-time Instance Segmentation'
+    README: configs/yolact/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.5.0/mmdet/models/detectors/yolact.py#L9
+      Version: v2.5.0
+
+Models:
+  - Name: yolact_r50_1x8_coco
+    In Collection: YOLACT
+    Config: configs/yolact/yolact_r50_1xb8-55e_coco.py
+    Metadata:
+      Training Resources: 1x V100 GPU
+      Batch Size: 8
+      Epochs: 55
+      inference time (ms/im):
+        - value: 23.53
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (550, 550)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 29.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_1x8_coco/yolact_r50_1x8_coco_20200908-f38d58df.pth
+
+  - Name: yolact_r50_8x8_coco
+    In Collection: YOLACT
+    Config: configs/yolact/yolact_r50_8xb8-55e_coco.py
+    Metadata:
+      Batch Size: 64
+      Epochs: 55
+      inference time (ms/im):
+        - value: 23.53
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (550, 550)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 28.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_8x8_coco/yolact_r50_8x8_coco_20200908-ca34f5db.pth
+
+  - Name: yolact_r101_1x8_coco
+    In Collection: YOLACT
+    Config: configs/yolact/yolact_r101_1xb8-55e_coco.py
+    Metadata:
+      Training Resources: 1x V100 GPU
+      Batch Size: 8
+      Epochs: 55
+      inference time (ms/im):
+        - value: 29.85
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (550, 550)
+    Results:
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 30.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r101_1x8_coco/yolact_r101_1x8_coco_20200908-4cbe9101.pth
diff --git a/mmde/mmdet/.mim/configs/yolact/yolact_r101_1xb8-55e_coco.py b/mmde/mmdet/.mim/configs/yolact/yolact_r101_1xb8-55e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6ffe29627ff5bd24b8e53be8d7defaa9eb91df7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolact/yolact_r101_1xb8-55e_coco.py
@@ -0,0 +1,7 @@
+_base_ = './yolact_r50_1xb8-55e_coco.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/mmde/mmdet/.mim/configs/yolact/yolact_r50_1xb8-55e_coco.py b/mmde/mmdet/.mim/configs/yolact/yolact_r50_1xb8-55e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7dabf1548a733cbf18b8007ae2fa9033a340af6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolact/yolact_r50_1xb8-55e_coco.py
@@ -0,0 +1,170 @@
+_base_ = [
+    '../_base_/datasets/coco_instance.py', '../_base_/default_runtime.py'
+]
+img_norm_cfg = dict(
+    mean=[123.68, 116.78, 103.94], std=[58.40, 57.12, 57.38], to_rgb=True)
+# model settings
+input_size = 550
+model = dict(
+    type='YOLACT',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=img_norm_cfg['mean'],
+        std=img_norm_cfg['std'],
+        bgr_to_rgb=img_norm_cfg['to_rgb'],
+        pad_mask=True),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,  # do not freeze stem
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,  # update the statistics of bn
+        zero_init_residual=False,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_input',
+        num_outs=5,
+        upsample_cfg=dict(mode='bilinear')),
+    bbox_head=dict(
+        type='YOLACTHead',
+        num_classes=80,
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            octave_base_scale=3,
+            scales_per_octave=1,
+            base_sizes=[8, 16, 32, 64, 128],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[550.0 / x for x in [69, 35, 18, 9, 5]],
+            centers=[(550 * 0.5 / x, 550 * 0.5 / x)
+                     for x in [69, 35, 18, 9, 5]]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[0.1, 0.1, 0.2, 0.2]),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            reduction='none',
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.5),
+        num_head_convs=1,
+        num_protos=32,
+        use_ohem=True),
+    mask_head=dict(
+        type='YOLACTProtonet',
+        in_channels=256,
+        num_protos=32,
+        num_classes=80,
+        max_masks_to_train=100,
+        loss_mask_weight=6.125,
+        with_seg_branch=True,
+        loss_segm=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.4,
+            min_pos_iou=0.,
+            ignore_iof_thr=-1,
+            gt_max_assign_all=False),
+        sampler=dict(type='PseudoSampler'),  # YOLACT should use PseudoSampler
+        # smoothl1_beta=1.,
+        allowed_border=-1,
+        pos_weight=-1,
+        neg_pos_ratio=3,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        mask_thr=0.5,
+        iou_thr=0.5,
+        top_k=200,
+        max_per_img=100,
+        mask_thr_binary=0.5))
+# dataset settings
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(4.0, 4.0)),
+    dict(
+        type='Expand',
+        mean=img_norm_cfg['mean'],
+        to_rgb=img_norm_cfg['to_rgb'],
+        ratio_range=(1, 4)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='PhotoMetricDistortion',
+        brightness_delta=32,
+        contrast_range=(0.5, 1.5),
+        saturation_range=(0.5, 1.5),
+        hue_delta=18),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(input_size, input_size), keep_ratio=False),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    batch_sampler=None,
+    dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+max_epochs = 55
+# training schedule for 55e
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[20, 42, 49, 52],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4))
+
+custom_hooks = [
+    dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW')
+]
+
+env_cfg = dict(cudnn_benchmark=True)
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (1 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=8)
diff --git a/mmde/mmdet/.mim/configs/yolact/yolact_r50_8xb8-55e_coco.py b/mmde/mmdet/.mim/configs/yolact/yolact_r50_8xb8-55e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e39c285da10ef4821343ebf3c0d0d4c094a97198
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolact/yolact_r50_8xb8-55e_coco.py
@@ -0,0 +1,23 @@
+_base_ = 'yolact_r50_1xb8-55e_coco.py'
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(lr=8e-3),
+    clip_grad=dict(max_norm=35, norm_type=2))
+# learning rate
+max_epochs = 55
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[20, 42, 49, 52],
+        gamma=0.1)
+]
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/yolo/metafile.yml b/mmde/mmdet/.mim/configs/yolo/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..627e70c4d368728d3632f4fda6b68475c3a0fa66
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolo/metafile.yml
@@ -0,0 +1,124 @@
+Collections:
+  - Name: YOLOv3
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - DarkNet
+    Paper:
+      URL: https://arxiv.org/abs/1804.02767
+      Title: 'YOLOv3: An Incremental Improvement'
+    README: configs/yolo/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/detectors/yolo.py#L8
+      Version: v2.4.0
+
+Models:
+  - Name: yolov3_d53_320_273e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_d53_8xb8-320-273e_coco.py
+    Metadata:
+      Training Memory (GB): 2.7
+      inference time (ms/im):
+        - value: 15.65
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (320, 320)
+      Epochs: 273
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 27.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-421362b6.pth
+
+  - Name: yolov3_d53_mstrain-416_273e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_d53_8xb8-ms-416-273e_coco.py
+    Metadata:
+      Training Memory (GB): 3.8
+      inference time (ms/im):
+        - value: 16.34
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (416, 416)
+      Epochs: 273
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 30.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-2b60fcd9.pth
+
+  - Name: yolov3_d53_mstrain-608_273e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_d53_8xb8-ms-608-273e_coco.py
+    Metadata:
+      Training Memory (GB): 7.4
+      inference time (ms/im):
+        - value: 20.79
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP32
+          resolution: (608, 608)
+      Epochs: 273
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 33.7
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020-a2c3acb8.pth
+
+  - Name: yolov3_d53_fp16_mstrain-608_273e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_d53_8xb8-amp-ms-608-273e_coco.py
+    Metadata:
+      Training Memory (GB): 4.7
+      inference time (ms/im):
+        - value: 20.79
+          hardware: V100
+          backend: PyTorch
+          batch size: 1
+          mode: FP16
+          resolution: (608, 608)
+      Epochs: 273
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 33.8
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542-4bc34944.pth
+
+  - Name: yolov3_mobilenetv2_8xb24-320-300e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_mobilenetv2_8xb24-320-300e_coco.py
+    Metadata:
+      Training Memory (GB): 3.2
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 22.2
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth
+
+  - Name: yolov3_mobilenetv2_8xb24-ms-416-300e_coco
+    In Collection: YOLOv3
+    Config: configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py
+    Metadata:
+      Training Memory (GB): 5.3
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 23.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823-f68a07b3.pth
diff --git a/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-320-273e_coco.py b/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-320-273e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3d08dd7706e5ba5bec5fc9e8da6fab120ed813d
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-320-273e_coco.py
@@ -0,0 +1,29 @@
+_base_ = './yolov3_d53_8xb8-ms-608-273e_coco.py'
+
+input_size = (320, 320)
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    # `mean` and `to_rgb` should be the same with the `preprocess_cfg`
+    dict(type='Expand', mean=[0, 0, 0], to_rgb=True, ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=input_size, keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=input_size, keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-amp-ms-608-273e_coco.py b/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-amp-ms-608-273e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..173d8ee22227b3c3f4aa0488cb4e6f131d7dbee4
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-amp-ms-608-273e_coco.py
@@ -0,0 +1,3 @@
+_base_ = './yolov3_d53_8xb8-ms-608-273e_coco.py'
+# fp16 settings
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
diff --git a/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-ms-416-273e_coco.py b/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-ms-416-273e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca0127e83edaeb8d5851ed089f6bd6d7385a1f86
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-ms-416-273e_coco.py
@@ -0,0 +1,28 @@
+_base_ = './yolov3_d53_8xb8-ms-608-273e_coco.py'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    # `mean` and `to_rgb` should be the same with the `preprocess_cfg`
+    dict(type='Expand', mean=[0, 0, 0], to_rgb=True, ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='RandomResize', scale=[(320, 320), (416, 416)], keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(416, 416), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-ms-608-273e_coco.py b/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-ms-608-273e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4a36dfdaaf9b9e013882a6c28d42cca5942be20
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolo/yolov3_d53_8xb8-ms-608-273e_coco.py
@@ -0,0 +1,167 @@
+_base_ = ['../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py']
+# model settings
+data_preprocessor = dict(
+    type='DetDataPreprocessor',
+    mean=[0, 0, 0],
+    std=[255., 255., 255.],
+    bgr_to_rgb=True,
+    pad_size_divisor=32)
+model = dict(
+    type='YOLOV3',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='Darknet',
+        depth=53,
+        out_indices=(3, 4, 5),
+        init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://darknet53')),
+    neck=dict(
+        type='YOLOV3Neck',
+        num_scales=3,
+        in_channels=[1024, 512, 256],
+        out_channels=[512, 256, 128]),
+    bbox_head=dict(
+        type='YOLOV3Head',
+        num_classes=80,
+        in_channels=[512, 256, 128],
+        out_channels=[1024, 512, 256],
+        anchor_generator=dict(
+            type='YOLOAnchorGenerator',
+            base_sizes=[[(116, 90), (156, 198), (373, 326)],
+                        [(30, 61), (62, 45), (59, 119)],
+                        [(10, 13), (16, 30), (33, 23)]],
+            strides=[32, 16, 8]),
+        bbox_coder=dict(type='YOLOBBoxCoder'),
+        featmap_strides=[32, 16, 8],
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=1.0,
+            reduction='sum'),
+        loss_conf=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=1.0,
+            reduction='sum'),
+        loss_xy=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=2.0,
+            reduction='sum'),
+        loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='GridAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0)),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        conf_thr=0.005,
+        nms=dict(type='nms', iou_threshold=0.45),
+        max_per_img=100))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=data_preprocessor['mean'],
+        to_rgb=data_preprocessor['bgr_to_rgb'],
+        ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='RandomResize', scale=[(320, 320), (608, 608)], keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(608, 608), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+train_cfg = dict(max_epochs=273, val_interval=7)
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning policy
+param_scheduler = [
+    dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=2000),
+    dict(type='MultiStepLR', by_epoch=True, milestones=[218, 246], gamma=0.1)
+]
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=7))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/yolo/yolov3_mobilenetv2_8xb24-320-300e_coco.py b/mmde/mmdet/.mim/configs/yolo/yolov3_mobilenetv2_8xb24-320-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..07b393734329fd3ed5f4bd11fbc15b4abf7846bb
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolo/yolov3_mobilenetv2_8xb24-320-300e_coco.py
@@ -0,0 +1,42 @@
+_base_ = ['./yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py']
+
+# yapf:disable
+model = dict(
+    bbox_head=dict(
+        anchor_generator=dict(
+            base_sizes=[[(220, 125), (128, 222), (264, 266)],
+                        [(35, 87), (102, 96), (60, 170)],
+                        [(10, 15), (24, 36), (72, 42)]])))
+# yapf:enable
+
+input_size = (320, 320)
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    # `mean` and `to_rgb` should be the same with the `preprocess_cfg`
+    dict(
+        type='Expand',
+        mean=[123.675, 116.28, 103.53],
+        to_rgb=True,
+        ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='Resize', scale=input_size, keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=input_size, keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py b/mmde/mmdet/.mim/configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a161b66fe92666e904a9580ab5a1ff16d630ab7
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py
@@ -0,0 +1,176 @@
+_base_ = ['../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py']
+# model settings
+data_preprocessor = dict(
+    type='DetDataPreprocessor',
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True,
+    pad_size_divisor=32)
+model = dict(
+    type='YOLOV3',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='MobileNetV2',
+        out_indices=(2, 4, 6),
+        act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://mmdet/mobilenet_v2')),
+    neck=dict(
+        type='YOLOV3Neck',
+        num_scales=3,
+        in_channels=[320, 96, 32],
+        out_channels=[96, 96, 96]),
+    bbox_head=dict(
+        type='YOLOV3Head',
+        num_classes=80,
+        in_channels=[96, 96, 96],
+        out_channels=[96, 96, 96],
+        anchor_generator=dict(
+            type='YOLOAnchorGenerator',
+            base_sizes=[[(116, 90), (156, 198), (373, 326)],
+                        [(30, 61), (62, 45), (59, 119)],
+                        [(10, 13), (16, 30), (33, 23)]],
+            strides=[32, 16, 8]),
+        bbox_coder=dict(type='YOLOBBoxCoder'),
+        featmap_strides=[32, 16, 8],
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=1.0,
+            reduction='sum'),
+        loss_conf=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=1.0,
+            reduction='sum'),
+        loss_xy=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            loss_weight=2.0,
+            reduction='sum'),
+        loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='GridAssigner',
+            pos_iou_thr=0.5,
+            neg_iou_thr=0.5,
+            min_pos_iou=0)),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        conf_thr=0.005,
+        nms=dict(type='nms', iou_threshold=0.45),
+        max_per_img=100))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='Expand',
+        mean=data_preprocessor['mean'],
+        to_rgb=data_preprocessor['bgr_to_rgb'],
+        ratio_range=(1, 2)),
+    dict(
+        type='MinIoURandomCrop',
+        min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
+        min_crop_size=0.3),
+    dict(type='RandomResize', scale=[(320, 320), (416, 416)], keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(416, 416), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=24,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type='RepeatDataset',  # use RepeatDataset to speed up training
+        times=10,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=24,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+train_cfg = dict(max_epochs=30)
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.003, momentum=0.9, weight_decay=0.0005),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.0001,
+        by_epoch=False,
+        begin=0,
+        end=4000),
+    dict(type='MultiStepLR', by_epoch=True, milestones=[24, 28], gamma=0.1)
+]
+
+find_unused_parameters = True
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (24 samples per GPU)
+auto_scale_lr = dict(base_batch_size=192)
diff --git a/mmde/mmdet/.mim/configs/yolof/metafile.yml b/mmde/mmdet/.mim/configs/yolof/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b3b7b7f8d5d3d7faec0cd04984ede59a99d06f38
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolof/metafile.yml
@@ -0,0 +1,32 @@
+Collections:
+  - Name: YOLOF
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Momentum
+        - Weight Decay
+      Training Resources: 8x V100 GPUs
+      Architecture:
+        - Dilated Encoder
+        - ResNet
+    Paper:
+      URL: https://arxiv.org/abs/2103.09460
+      Title: 'You Only Look One-level Feature'
+    README: configs/yolof/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/yolof.py#L6
+      Version: v2.12.0
+
+Models:
+  - Name: yolof_r50_c5_8x8_1x_coco
+    In Collection: YOLOF
+    Config: configs/yolof/yolof_r50-c5_8xb8-1x_coco.py
+    Metadata:
+      Training Memory (GB): 8.3
+      Epochs: 12
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 37.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427-8e864411.pth
diff --git a/mmde/mmdet/.mim/configs/yolof/yolof_r50-c5_8xb8-1x_coco.py b/mmde/mmdet/.mim/configs/yolof/yolof_r50-c5_8xb8-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ea228e3e3270e07a4e5b171ab544c704fb172f3
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolof/yolof_r50-c5_8xb8-1x_coco.py
@@ -0,0 +1,116 @@
+_base_ = [
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    type='YOLOF',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3, ),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron/resnet50_caffe')),
+    neck=dict(
+        type='DilatedEncoder',
+        in_channels=2048,
+        out_channels=512,
+        block_mid_channels=128,
+        num_residual_blocks=4,
+        block_dilations=[2, 4, 6, 8]),
+    bbox_head=dict(
+        type='YOLOFHead',
+        num_classes=80,
+        in_channels=512,
+        reg_decoded_bbox=True,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[1, 2, 4, 8, 16],
+            strides=[32]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1., 1., 1., 1.],
+            add_ctr_clamp=True,
+            ctr_clamp=32),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(
+        assigner=dict(
+            type='UniformAssigner', pos_ignore_thr=0.15, neg_ignore_thr=0.7),
+        allowed_border=-1,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        nms_pre=1000,
+        min_bbox_size=0,
+        score_thr=0.05,
+        nms=dict(type='nms', iou_threshold=0.6),
+        max_per_img=100))
+# optimizer
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.12, momentum=0.9, weight_decay=0.0001),
+    paramwise_cfg=dict(
+        norm_decay_mult=0., custom_keys={'backbone': dict(lr_mult=1. / 3)}))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.00066667,
+        by_epoch=False,
+        begin=0,
+        end=1500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='RandomShift', prob=0.5, max_shift_px=32),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=8, num_workers=8, dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/yolof/yolof_r50-c5_8xb8-iter-1x_coco.py b/mmde/mmdet/.mim/configs/yolof/yolof_r50-c5_8xb8-iter-1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..466a820099e3ac1760371e8352a89f93fbeef5ee
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolof/yolof_r50-c5_8xb8-iter-1x_coco.py
@@ -0,0 +1,32 @@
+_base_ = './yolof_r50-c5_8xb8-1x_coco.py'
+
+# We implemented the iter-based config according to the source code.
+# COCO dataset has 117266 images after filtering. We use 8 gpu and
+# 8 batch size training, so 22500 is equivalent to
+# 22500/(117266/(8x8))=12.3 epoch, 15000 is equivalent to 8.2 epoch,
+# 20000 is equivalent to 10.9 epoch. Due to lr(0.12) is large,
+# the iter-based and epoch-based setting have about 0.2 difference on
+# the mAP evaluation value.
+
+train_cfg = dict(
+    _delete_=True,
+    type='IterBasedTrainLoop',
+    max_iters=22500,
+    val_interval=4500)
+
+# learning rate policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=22500,
+        by_epoch=False,
+        milestones=[15000, 20000],
+        gamma=0.1)
+]
+train_dataloader = dict(sampler=dict(type='InfiniteSampler'))
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=2500))
+
+log_processor = dict(by_epoch=False)
diff --git a/mmde/mmdet/.mim/configs/yolox/metafile.yml b/mmde/mmdet/.mim/configs/yolox/metafile.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2f64450e94cae436a05f46da67d3a1264235ffbd
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolox/metafile.yml
@@ -0,0 +1,70 @@
+Collections:
+  - Name: YOLOX
+    Metadata:
+      Training Data: COCO
+      Training Techniques:
+        - SGD with Nesterov
+        - Weight Decay
+        - Cosine Annealing Lr Updater
+      Training Resources: 8x TITANXp GPUs
+      Architecture:
+        - CSPDarkNet
+        - PAFPN
+    Paper:
+      URL: https://arxiv.org/abs/2107.08430
+      Title: 'YOLOX: Exceeding YOLO Series in 2021'
+    README: configs/yolox/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection/blob/v2.15.1/mmdet/models/detectors/yolox.py#L6
+      Version: v2.15.1
+
+
+Models:
+  - Name: yolox_s_8x8_300e_coco
+    In Collection: YOLOX
+    Config: configs/yolox/yolox_s_8xb8-300e_coco.py
+    Metadata:
+      Training Memory (GB): 7.6
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 40.5
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth
+  - Name: yolox_l_8x8_300e_coco
+    In Collection: YOLOX
+    Config: configs/yolox/yolox_l_8xb8-300e_coco.py
+    Metadata:
+      Training Memory (GB): 19.9
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 49.4
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth
+  - Name: yolox_x_8x8_300e_coco
+    In Collection: YOLOX
+    Config: configs/yolox/yolox_x_8xb8-300e_coco.py
+    Metadata:
+      Training Memory (GB): 28.1
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 50.9
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth
+  - Name: yolox_tiny_8x8_300e_coco
+    In Collection: YOLOX
+    Config: configs/yolox/yolox_tiny_8xb8-300e_coco.py
+    Metadata:
+      Training Memory (GB): 3.5
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 32.0
+    Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth
diff --git a/mmde/mmdet/.mim/configs/yolox/yolox_l_8xb8-300e_coco.py b/mmde/mmdet/.mim/configs/yolox/yolox_l_8xb8-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a4b287bad595db65df69b7d6f80163bd4a49e44
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolox/yolox_l_8xb8-300e_coco.py
@@ -0,0 +1,8 @@
+_base_ = './yolox_s_8xb8-300e_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(deepen_factor=1.0, widen_factor=1.0),
+    neck=dict(
+        in_channels=[256, 512, 1024], out_channels=256, num_csp_blocks=3),
+    bbox_head=dict(in_channels=256, feat_channels=256))
diff --git a/mmde/mmdet/.mim/configs/yolox/yolox_m_8xb8-300e_coco.py b/mmde/mmdet/.mim/configs/yolox/yolox_m_8xb8-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..d82f9e98f1fcd4a1c6089807adc3cca2b48d6b5e
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolox/yolox_m_8xb8-300e_coco.py
@@ -0,0 +1,8 @@
+_base_ = './yolox_s_8xb8-300e_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(deepen_factor=0.67, widen_factor=0.75),
+    neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2),
+    bbox_head=dict(in_channels=192, feat_channels=192),
+)
diff --git a/mmde/mmdet/.mim/configs/yolox/yolox_nano_8xb8-300e_coco.py b/mmde/mmdet/.mim/configs/yolox/yolox_nano_8xb8-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f7a1c5ab066439c78ffa005a2a60c9057223849
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolox/yolox_nano_8xb8-300e_coco.py
@@ -0,0 +1,11 @@
+_base_ = './yolox_tiny_8xb8-300e_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(deepen_factor=0.33, widen_factor=0.25, use_depthwise=True),
+    neck=dict(
+        in_channels=[64, 128, 256],
+        out_channels=64,
+        num_csp_blocks=1,
+        use_depthwise=True),
+    bbox_head=dict(in_channels=64, feat_channels=64, use_depthwise=True))
diff --git a/mmde/mmdet/.mim/configs/yolox/yolox_s_8xb8-300e_coco.py b/mmde/mmdet/.mim/configs/yolox/yolox_s_8xb8-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e324eb5b99202fd42c8d67847a1be1c165b4057
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolox/yolox_s_8xb8-300e_coco.py
@@ -0,0 +1,250 @@
+_base_ = [
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py',
+    './yolox_tta.py'
+]
+
+img_scale = (640, 640)  # width, height
+
+# model settings
+model = dict(
+    type='YOLOX',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        pad_size_divisor=32,
+        batch_augments=[
+            dict(
+                type='BatchSyncRandomResize',
+                random_size_range=(480, 800),
+                size_divisor=32,
+                interval=10)
+        ]),
+    backbone=dict(
+        type='CSPDarknet',
+        deepen_factor=0.33,
+        widen_factor=0.5,
+        out_indices=(2, 3, 4),
+        use_depthwise=False,
+        spp_kernal_sizes=(5, 9, 13),
+        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+        act_cfg=dict(type='Swish'),
+    ),
+    neck=dict(
+        type='YOLOXPAFPN',
+        in_channels=[128, 256, 512],
+        out_channels=128,
+        num_csp_blocks=1,
+        use_depthwise=False,
+        upsample_cfg=dict(scale_factor=2, mode='nearest'),
+        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+        act_cfg=dict(type='Swish')),
+    bbox_head=dict(
+        type='YOLOXHead',
+        num_classes=80,
+        in_channels=128,
+        feat_channels=128,
+        stacked_convs=2,
+        strides=(8, 16, 32),
+        use_depthwise=False,
+        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+        act_cfg=dict(type='Swish'),
+        loss_cls=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='IoULoss',
+            mode='square',
+            eps=1e-16,
+            reduction='sum',
+            loss_weight=5.0),
+        loss_obj=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            loss_weight=1.0),
+        loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)),
+    train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
+    # In order to align the source code, the threshold of the val phase is
+    # 0.01, and the threshold of the test phase is 0.001.
+    test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))
+
+# dataset settings
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+train_pipeline = [
+    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.1, 2),
+        # img_scale is (width, height)
+        border=(-img_scale[0] // 2, -img_scale[1] // 2)),
+    dict(
+        type='MixUp',
+        img_scale=img_scale,
+        ratio_range=(0.8, 1.6),
+        pad_val=114.0),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    # According to the official implementation, multi-scale
+    # training is not considered here but in the
+    # 'mmdet/models/detectors/yolox.py'.
+    # Resize and Pad are for the last 15 epochs when Mosaic,
+    # RandomAffine, and MixUp are closed by YOLOXModeSwitchHook.
+    dict(type='Resize', scale=img_scale, keep_ratio=True),
+    dict(
+        type='Pad',
+        pad_to_square=True,
+        # If the image is three-channel, the pad value needs
+        # to be set separately for each channel.
+        pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+    dict(type='PackDetInputs')
+]
+
+train_dataset = dict(
+    # use MultiImageMixDataset wrapper to support mosaic and mixup
+    type='MultiImageMixDataset',
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        pipeline=[
+            dict(type='LoadImageFromFile', backend_args=backend_args),
+            dict(type='LoadAnnotations', with_bbox=True)
+        ],
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        backend_args=backend_args),
+    pipeline=train_pipeline)
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=img_scale, keep_ratio=True),
+    dict(
+        type='Pad',
+        pad_to_square=True,
+        pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=train_dataset)
+val_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# training settings
+max_epochs = 300
+num_last_epochs = 15
+interval = 10
+
+train_cfg = dict(max_epochs=max_epochs, val_interval=interval)
+
+# optimizer
+# default 8 gpu
+base_lr = 0.01
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        type='SGD', lr=base_lr, momentum=0.9, weight_decay=5e-4,
+        nesterov=True),
+    paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.))
+
+# learning rate
+param_scheduler = [
+    dict(
+        # use quadratic formula to warm up 5 epochs
+        # and lr is updated by iteration
+        # TODO: fix default scope in get function
+        type='mmdet.QuadraticWarmupLR',
+        by_epoch=True,
+        begin=0,
+        end=5,
+        convert_to_iter_based=True),
+    dict(
+        # use cosine lr from 5 to 285 epoch
+        type='CosineAnnealingLR',
+        eta_min=base_lr * 0.05,
+        begin=5,
+        T_max=max_epochs - num_last_epochs,
+        end=max_epochs - num_last_epochs,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        # use fixed lr during last 15 epochs
+        type='ConstantLR',
+        by_epoch=True,
+        factor=1,
+        begin=max_epochs - num_last_epochs,
+        end=max_epochs,
+    )
+]
+
+default_hooks = dict(
+    checkpoint=dict(
+        interval=interval,
+        max_keep_ckpts=3  # only keep latest 3 checkpoints
+    ))
+
+custom_hooks = [
+    dict(
+        type='YOLOXModeSwitchHook',
+        num_last_epochs=num_last_epochs,
+        priority=48),
+    dict(type='SyncNormHook', priority=48),
+    dict(
+        type='EMAHook',
+        ema_type='ExpMomentumEMA',
+        momentum=0.0001,
+        update_buffers=True,
+        priority=49)
+]
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (8 GPUs) x (8 samples per GPU)
+auto_scale_lr = dict(base_batch_size=64)
diff --git a/mmde/mmdet/.mim/configs/yolox/yolox_tiny_8xb8-300e_coco.py b/mmde/mmdet/.mim/configs/yolox/yolox_tiny_8xb8-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..86f7e9a6191066ab9b672d548b93a29e64746f29
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolox/yolox_tiny_8xb8-300e_coco.py
@@ -0,0 +1,54 @@
+_base_ = './yolox_s_8xb8-300e_coco.py'
+
+# model settings
+model = dict(
+    data_preprocessor=dict(batch_augments=[
+        dict(
+            type='BatchSyncRandomResize',
+            random_size_range=(320, 640),
+            size_divisor=32,
+            interval=10)
+    ]),
+    backbone=dict(deepen_factor=0.33, widen_factor=0.375),
+    neck=dict(in_channels=[96, 192, 384], out_channels=96),
+    bbox_head=dict(in_channels=96, feat_channels=96))
+
+img_scale = (640, 640)  # width, height
+
+train_pipeline = [
+    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
+    dict(
+        type='RandomAffine',
+        scaling_ratio_range=(0.5, 1.5),
+        # img_scale is (width, height)
+        border=(-img_scale[0] // 2, -img_scale[1] // 2)),
+    dict(type='YOLOXHSVRandomAug'),
+    dict(type='RandomFlip', prob=0.5),
+    # Resize and Pad are for the last 15 epochs when Mosaic and
+    # RandomAffine are closed by YOLOXModeSwitchHook.
+    dict(type='Resize', scale=img_scale, keep_ratio=True),
+    dict(
+        type='Pad',
+        pad_to_square=True,
+        pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
+    dict(type='PackDetInputs')
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}),
+    dict(type='Resize', scale=(416, 416), keep_ratio=True),
+    dict(
+        type='Pad',
+        pad_to_square=True,
+        pad_val=dict(img=(114.0, 114.0, 114.0))),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/mmdet/.mim/configs/yolox/yolox_tta.py b/mmde/mmdet/.mim/configs/yolox/yolox_tta.py
new file mode 100644
index 0000000000000000000000000000000000000000..e65244be6e1bb70393d111ef4d25334d3b2ce8a6
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolox/yolox_tta.py
@@ -0,0 +1,36 @@
+tta_model = dict(
+    type='DetTTAModel',
+    tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), max_per_img=100))
+
+img_scales = [(640, 640), (320, 320), (960, 960)]
+tta_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(type='Resize', scale=s, keep_ratio=True)
+                for s in img_scales
+            ],
+            [
+                # ``RandomFlip`` must be placed before ``Pad``, otherwise
+                # bounding box coordinates after flipping cannot be
+                # recovered correctly.
+                dict(type='RandomFlip', prob=1.),
+                dict(type='RandomFlip', prob=0.)
+            ],
+            [
+                dict(
+                    type='Pad',
+                    pad_to_square=True,
+                    pad_val=dict(img=(114.0, 114.0, 114.0))),
+            ],
+            [dict(type='LoadAnnotations', with_bbox=True)],
+            [
+                dict(
+                    type='PackDetInputs',
+                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                               'scale_factor', 'flip', 'flip_direction'))
+            ]
+        ])
+]
diff --git a/mmde/mmdet/.mim/configs/yolox/yolox_x_8xb8-300e_coco.py b/mmde/mmdet/.mim/configs/yolox/yolox_x_8xb8-300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..34828e0363a2f282af59da74e805e59772dfeb69
--- /dev/null
+++ b/mmde/mmdet/.mim/configs/yolox/yolox_x_8xb8-300e_coco.py
@@ -0,0 +1,8 @@
+_base_ = './yolox_s_8xb8-300e_coco.py'
+
+# model settings
+model = dict(
+    backbone=dict(deepen_factor=1.33, widen_factor=1.25),
+    neck=dict(
+        in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4),
+    bbox_head=dict(in_channels=320, feat_channels=320))
diff --git a/mmde/mmdet/.mim/dataset-index.yml b/mmde/mmdet/.mim/dataset-index.yml
new file mode 100644
index 0000000000000000000000000000000000000000..116412e1ad678cadb5b9734df95e6fe096b33164
--- /dev/null
+++ b/mmde/mmdet/.mim/dataset-index.yml
@@ -0,0 +1,18 @@
+openxlab: true
+voc2007:
+  dataset: OpenDataLab/PASCAL_VOC2007
+  download_root: data
+  data_root: data
+  script: tools/dataset_converters/scripts/preprocess_voc2007.sh
+
+voc2012:
+  dataset: OpenDataLab/PASCAL_VOC2012
+  download_root: data
+  data_root: data
+  script: tools/dataset_converters/scripts/preprocess_voc2012.sh
+
+coco2017:
+  dataset: OpenDataLab/COCO_2017
+  download_root: data
+  data_root: data/coco
+  script: tools/dataset_converters/scripts/preprocess_coco2017.sh
diff --git a/mmde/mmdet/.mim/model-index.yml b/mmde/mmdet/.mim/model-index.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d4b4392b422042070139d009407e40f64c80a4f6
--- /dev/null
+++ b/mmde/mmdet/.mim/model-index.yml
@@ -0,0 +1,102 @@
+Import:
+  - configs/albu_example/metafile.yml
+  - configs/atss/metafile.yml
+  - configs/autoassign/metafile.yml
+  - configs/boxinst/metafile.yml
+  - configs/carafe/metafile.yml
+  - configs/cascade_rcnn/metafile.yml
+  - configs/cascade_rpn/metafile.yml
+  - configs/centernet/metafile.yml
+  - configs/centripetalnet/metafile.yml
+  - configs/condinst/metafile.yml
+  - configs/conditional_detr/metafile.yml
+  - configs/cornernet/metafile.yml
+  - configs/convnext/metafile.yml
+  - configs/crowddet/metafile.yml
+  - configs/dab_detr/metafile.yml
+  - configs/dcn/metafile.yml
+  - configs/dcnv2/metafile.yml
+  - configs/ddod/metafile.yml
+  - configs/deformable_detr/metafile.yml
+  - configs/detectors/metafile.yml
+  - configs/detr/metafile.yml
+  - configs/dino/metafile.yml
+  - configs/double_heads/metafile.yml
+  - configs/dyhead/metafile.yml
+  - configs/dynamic_rcnn/metafile.yml
+  - configs/efficientnet/metafile.yml
+  - configs/empirical_attention/metafile.yml
+  - configs/faster_rcnn/metafile.yml
+  - configs/fcos/metafile.yml
+  - configs/foveabox/metafile.yml
+  - configs/fpg/metafile.yml
+  - configs/free_anchor/metafile.yml
+  - configs/fsaf/metafile.yml
+  - configs/gcnet/metafile.yml
+  - configs/gfl/metafile.yml
+  - configs/ghm/metafile.yml
+  - configs/gn/metafile.yml
+  - configs/gn+ws/metafile.yml
+  - configs/grid_rcnn/metafile.yml
+  - configs/groie/metafile.yml
+  - configs/guided_anchoring/metafile.yml
+  - configs/hrnet/metafile.yml
+  - configs/htc/metafile.yml
+  - configs/instaboost/metafile.yml
+  - configs/lad/metafile.yml
+  - configs/ld/metafile.yml
+  - configs/libra_rcnn/metafile.yml
+  - configs/lvis/metafile.yml
+  - configs/mask2former/metafile.yml
+  - configs/mask_rcnn/metafile.yml
+  - configs/maskformer/metafile.yml
+  - configs/ms_rcnn/metafile.yml
+  - configs/nas_fcos/metafile.yml
+  - configs/nas_fpn/metafile.yml
+  - configs/openimages/metafile.yml
+  - configs/paa/metafile.yml
+  - configs/pafpn/metafile.yml
+  - configs/panoptic_fpn/metafile.yml
+  - configs/pvt/metafile.yml
+  - configs/pisa/metafile.yml
+  - configs/point_rend/metafile.yml
+  - configs/queryinst/metafile.yml
+  - configs/regnet/metafile.yml
+  - configs/reppoints/metafile.yml
+  - configs/res2net/metafile.yml
+  - configs/resnest/metafile.yml
+  - configs/resnet_strikes_back/metafile.yml
+  - configs/retinanet/metafile.yml
+  - configs/rpn/metafile.yml
+  - configs/rtmdet/metafile.yml
+  - configs/sabl/metafile.yml
+  - configs/scnet/metafile.yml
+  - configs/scratch/metafile.yml
+  - configs/seesaw_loss/metafile.yml
+  - configs/simple_copy_paste/metafile.yml
+  - configs/soft_teacher/metafile.yml
+  - configs/sparse_rcnn/metafile.yml
+  - configs/solo/metafile.yml
+  - configs/solov2/metafile.yml
+  - configs/ssd/metafile.yml
+  - configs/strong_baselines/metafile.yml
+  - configs/swin/metafile.yml
+  - configs/tridentnet/metafile.yml
+  - configs/tood/metafile.yml
+  - configs/vfnet/metafile.yml
+  - configs/yolact/metafile.yml
+  - configs/yolo/metafile.yml
+  - configs/yolof/metafile.yml
+  - configs/yolox/metafile.yml
+  - configs/bytetrack/metafile.yml
+  - configs/strongsort/metafile.yml
+  - configs/ocsort/metafile.yml
+  - configs/sort/metafile.yml
+  - configs/deepsort/metafile.yml
+  - configs/qdtrack/metafile.yml
+  - configs/mask2former_vis/metafile.yml
+  - configs/masktrack_rcnn/metafile.yml
+  - configs/glip/metafile.yml
+  - configs/ddq/metafile.yml
+  - configs/grounding_dino/metafile.yml
+  - configs/mm_grounding_dino/metafile.yml
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/analyze_logs.py b/mmde/mmdet/.mim/tools/analysis_tools/analyze_logs.py
new file mode 100644
index 0000000000000000000000000000000000000000..926412e27bad8817c0efb4c729f7dfedd9d10de1
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/analyze_logs.py
@@ -0,0 +1,211 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+
+
+def cal_train_time(log_dicts, args):
+    for i, log_dict in enumerate(log_dicts):
+        print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
+        all_times = []
+        for epoch in log_dict.keys():
+            if args.include_outliers:
+                all_times.append(log_dict[epoch]['time'])
+            else:
+                all_times.append(log_dict[epoch]['time'][1:])
+        if not all_times:
+            raise KeyError(
+                'Please reduce the log interval in the config so that'
+                'interval is less than iterations of one epoch.')
+        epoch_ave_time = np.array(list(map(lambda x: np.mean(x), all_times)))
+        slowest_epoch = epoch_ave_time.argmax()
+        fastest_epoch = epoch_ave_time.argmin()
+        std_over_epoch = epoch_ave_time.std()
+        print(f'slowest epoch {slowest_epoch + 1}, '
+              f'average time is {epoch_ave_time[slowest_epoch]:.4f} s/iter')
+        print(f'fastest epoch {fastest_epoch + 1}, '
+              f'average time is {epoch_ave_time[fastest_epoch]:.4f} s/iter')
+        print(f'time std over epochs is {std_over_epoch:.4f}')
+        print(f'average iter time: {np.mean(epoch_ave_time):.4f} s/iter\n')
+
+
+def plot_curve(log_dicts, args):
+    if args.backend is not None:
+        plt.switch_backend(args.backend)
+    sns.set_style(args.style)
+    # if legend is None, use {filename}_{key} as legend
+    legend = args.legend
+    if legend is None:
+        legend = []
+        for json_log in args.json_logs:
+            for metric in args.keys:
+                legend.append(f'{json_log}_{metric}')
+    assert len(legend) == (len(args.json_logs) * len(args.keys))
+    metrics = args.keys
+
+    # TODO: support dynamic eval interval(e.g. RTMDet) when plotting mAP.
+    num_metrics = len(metrics)
+    for i, log_dict in enumerate(log_dicts):
+        epochs = list(log_dict.keys())
+        for j, metric in enumerate(metrics):
+            print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
+            if metric not in log_dict[epochs[int(args.eval_interval) - 1]]:
+                if 'mAP' in metric:
+                    raise KeyError(
+                        f'{args.json_logs[i]} does not contain metric '
+                        f'{metric}. Please check if "--no-validate" is '
+                        'specified when you trained the model. Or check '
+                        f'if the eval_interval {args.eval_interval} in args '
+                        'is equal to the eval_interval during training.')
+                raise KeyError(
+                    f'{args.json_logs[i]} does not contain metric {metric}. '
+                    'Please reduce the log interval in the config so that '
+                    'interval is less than iterations of one epoch.')
+
+            if 'mAP' in metric:
+                xs = []
+                ys = []
+                for epoch in epochs:
+                    ys += log_dict[epoch][metric]
+                    if log_dict[epoch][metric]:
+                        xs += [epoch]
+                plt.xlabel('epoch')
+                plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
+            else:
+                xs = []
+                ys = []
+                for epoch in epochs:
+                    iters = log_dict[epoch]['step']
+                    xs.append(np.array(iters))
+                    ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
+                xs = np.concatenate(xs)
+                ys = np.concatenate(ys)
+                plt.xlabel('iter')
+                plt.plot(
+                    xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
+            plt.legend()
+        if args.title is not None:
+            plt.title(args.title)
+    if args.out is None:
+        plt.show()
+    else:
+        print(f'save curve to: {args.out}')
+        plt.savefig(args.out)
+        plt.cla()
+
+
+def add_plot_parser(subparsers):
+    parser_plt = subparsers.add_parser(
+        'plot_curve', help='parser for plotting curves')
+    parser_plt.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_plt.add_argument(
+        '--keys',
+        type=str,
+        nargs='+',
+        default=['bbox_mAP'],
+        help='the metric that you want to plot')
+    parser_plt.add_argument(
+        '--start-epoch',
+        type=str,
+        default='1',
+        help='the epoch that you want to start')
+    parser_plt.add_argument(
+        '--eval-interval',
+        type=str,
+        default='1',
+        help='the eval interval when training')
+    parser_plt.add_argument('--title', type=str, help='title of figure')
+    parser_plt.add_argument(
+        '--legend',
+        type=str,
+        nargs='+',
+        default=None,
+        help='legend of each plot')
+    parser_plt.add_argument(
+        '--backend', type=str, default=None, help='backend of plt')
+    parser_plt.add_argument(
+        '--style', type=str, default='dark', help='style of plt')
+    parser_plt.add_argument('--out', type=str, default=None)
+
+
+def add_time_parser(subparsers):
+    parser_time = subparsers.add_parser(
+        'cal_train_time',
+        help='parser for computing the average time per training iteration')
+    parser_time.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_time.add_argument(
+        '--include-outliers',
+        action='store_true',
+        help='include the first value of every epoch when computing '
+        'the average time')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Analyze Json Log')
+    # currently only support plot curve and calculate average train time
+    subparsers = parser.add_subparsers(dest='task', help='task parser')
+    add_plot_parser(subparsers)
+    add_time_parser(subparsers)
+    args = parser.parse_args()
+    return args
+
+
+def load_json_logs(json_logs):
+    # load and convert json_logs to log_dict, key is epoch, value is a sub dict
+    # keys of sub dict is different metrics, e.g. memory, bbox_mAP
+    # value of sub dict is a list of corresponding values of all iterations
+    log_dicts = [dict() for _ in json_logs]
+    for json_log, log_dict in zip(json_logs, log_dicts):
+        with open(json_log, 'r') as log_file:
+            epoch = 1
+            for i, line in enumerate(log_file):
+                log = json.loads(line.strip())
+                val_flag = False
+                # skip lines only contains one key
+                if not len(log) > 1:
+                    continue
+
+                if epoch not in log_dict:
+                    log_dict[epoch] = defaultdict(list)
+
+                for k, v in log.items():
+                    if '/' in k:
+                        log_dict[epoch][k.split('/')[-1]].append(v)
+                        val_flag = True
+                    elif val_flag:
+                        continue
+                    else:
+                        log_dict[epoch][k].append(v)
+
+                if 'epoch' in log.keys():
+                    epoch = log['epoch']
+
+    return log_dicts
+
+
+def main():
+    args = parse_args()
+
+    json_logs = args.json_logs
+    for json_log in json_logs:
+        assert json_log.endswith('.json')
+
+    log_dicts = load_json_logs(json_logs)
+
+    eval(args.task)(log_dicts, args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/analyze_results.py b/mmde/mmdet/.mim/tools/analysis_tools/analyze_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..0efba72198f6065fffc384fb6629fee26968ac36
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/analyze_results.py
@@ -0,0 +1,407 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+from multiprocessing import Pool
+
+import mmcv
+import numpy as np
+from mmengine.config import Config, DictAction
+from mmengine.fileio import load
+from mmengine.registry import init_default_scope
+from mmengine.runner import Runner
+from mmengine.structures import InstanceData, PixelData
+from mmengine.utils import ProgressBar, check_file_exist, mkdir_or_exist
+
+from mmdet.datasets import get_loading_pipeline
+from mmdet.evaluation import eval_map
+from mmdet.registry import DATASETS, RUNNERS
+from mmdet.structures import DetDataSample
+from mmdet.utils import replace_cfg_vals, update_data_root
+from mmdet.visualization import DetLocalVisualizer
+
+
+def bbox_map_eval(det_result, annotation, nproc=4):
+    """Evaluate mAP of single image det result.
+
+    Args:
+        det_result (list[list]): [[cls1_det, cls2_det, ...], ...].
+            The outer list indicates images, and the inner list indicates
+            per-class detected bboxes.
+        annotation (dict): Ground truth annotations where keys of
+             annotations are:
+
+            - bboxes: numpy array of shape (n, 4)
+            - labels: numpy array of shape (n, )
+            - bboxes_ignore (optional): numpy array of shape (k, 4)
+            - labels_ignore (optional): numpy array of shape (k, )
+
+        nproc (int): Processes used for computing mAP.
+            Default: 4.
+
+    Returns:
+        float: mAP
+    """
+
+    # use only bbox det result
+    if isinstance(det_result, tuple):
+        bbox_det_result = [det_result[0]]
+    else:
+        bbox_det_result = [det_result]
+    # mAP
+    iou_thrs = np.linspace(
+        .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+
+    processes = []
+    workers = Pool(processes=nproc)
+    for thr in iou_thrs:
+        p = workers.apply_async(eval_map, (bbox_det_result, [annotation]), {
+            'iou_thr': thr,
+            'logger': 'silent',
+            'nproc': 1
+        })
+        processes.append(p)
+
+    workers.close()
+    workers.join()
+
+    mean_aps = []
+    for p in processes:
+        mean_aps.append(p.get()[0])
+
+    return sum(mean_aps) / len(mean_aps)
+
+
+class ResultVisualizer:
+    """Display and save evaluation results.
+
+    Args:
+        show (bool): Whether to show the image. Default: True.
+        wait_time (float): Value of waitKey param. Default: 0.
+        score_thr (float): Minimum score of bboxes to be shown.
+           Default: 0.
+        runner (:obj:`Runner`): The runner of the visualization process.
+    """
+
+    def __init__(self, show=False, wait_time=0, score_thr=0, runner=None):
+        self.show = show
+        self.wait_time = wait_time
+        self.score_thr = score_thr
+        self.visualizer = DetLocalVisualizer()
+        self.runner = runner
+        self.evaluator = runner.test_evaluator
+
+    def _save_image_gts_results(self,
+                                dataset,
+                                results,
+                                performances,
+                                out_dir=None,
+                                task='det'):
+        """Display or save image with groung truths and predictions from a
+        model.
+
+        Args:
+            dataset (Dataset): A PyTorch dataset.
+            results (list): Object detection or panoptic segmentation
+                results from test results pkl file.
+            performances (dict): A dict contains samples's indices
+                in dataset and model's performance on them.
+            out_dir (str, optional): The filename to write the image.
+                Defaults: None.
+            task (str): The task to be performed. Defaults: 'det'
+        """
+        mkdir_or_exist(out_dir)
+
+        for performance_info in performances:
+            index, performance = performance_info
+            data_info = dataset[index]
+            data_info['gt_instances'] = data_info['instances']
+
+            # calc save file path
+            filename = data_info['img_path']
+            fname, name = osp.splitext(osp.basename(filename))
+            save_filename = fname + '_' + str(round(performance, 3)) + name
+            out_file = osp.join(out_dir, save_filename)
+
+            if task == 'det':
+                gt_instances = InstanceData()
+                gt_instances.bboxes = [
+                    d['bbox'] for d in data_info['gt_instances']
+                ]
+                gt_instances.labels = [
+                    d['bbox_label'] for d in data_info['gt_instances']
+                ]
+
+                pred_instances = InstanceData()
+                pred_instances.bboxes = results[index]['pred_instances'][
+                    'bboxes']
+                pred_instances.labels = results[index]['pred_instances'][
+                    'labels']
+                pred_instances.scores = results[index]['pred_instances'][
+                    'scores']
+
+                data_samples = DetDataSample()
+                data_samples.pred_instances = pred_instances
+                data_samples.gt_instances = gt_instances
+
+            elif task == 'seg':
+                gt_panoptic_seg = PixelData()
+                gt_panoptic_seg.sem_seg = [
+                    d['gt_seg_map'] for d in data_info['gt_instances']
+                ]
+
+                pred_panoptic_seg = PixelData()
+                pred_panoptic_seg.sem_seg = results[index][
+                    'pred_panoptic_seg']['sem_seg']
+
+                data_samples = DetDataSample()
+                data_samples.pred_panoptic_seg = pred_panoptic_seg
+                data_samples.gt_panoptic_seg = gt_panoptic_seg
+
+            img = mmcv.imread(filename, channel_order='rgb')
+            self.visualizer.add_datasample(
+                'image',
+                img,
+                data_samples,
+                show=self.show,
+                draw_gt=False,
+                pred_score_thr=self.score_thr,
+                out_file=out_file)
+
+    def evaluate_and_show(self,
+                          dataset,
+                          results,
+                          topk=20,
+                          show_dir='work_dir'):
+        """Evaluate and show results.
+
+        Args:
+            dataset (Dataset): A PyTorch dataset.
+            results (list): Object detection or panoptic segmentation
+                results from test results pkl file.
+            topk (int): Number of the highest topk and
+                lowest topk after evaluation index sorting. Default: 20.
+            show_dir (str, optional): The filename to write the image.
+                Default: 'work_dir'
+        """
+
+        self.visualizer.dataset_meta = dataset.metainfo
+
+        assert topk > 0
+        if (topk * 2) > len(dataset):
+            topk = len(dataset) // 2
+
+        good_dir = osp.abspath(osp.join(show_dir, 'good'))
+        bad_dir = osp.abspath(osp.join(show_dir, 'bad'))
+
+        if 'pred_panoptic_seg' in results[0].keys():
+            good_samples, bad_samples = self.panoptic_evaluate(
+                dataset, results, topk=topk)
+            self._save_image_gts_results(
+                dataset, results, good_samples, good_dir, task='seg')
+            self._save_image_gts_results(
+                dataset, results, bad_samples, bad_dir, task='seg')
+        elif 'pred_instances' in results[0].keys():
+            good_samples, bad_samples = self.detection_evaluate(
+                dataset, results, topk=topk)
+            self._save_image_gts_results(
+                dataset, results, good_samples, good_dir, task='det')
+            self._save_image_gts_results(
+                dataset, results, bad_samples, bad_dir, task='det')
+        else:
+            raise 'expect \'pred_panoptic_seg\' or \'pred_instances\' \
+                in dict result'
+
+    def detection_evaluate(self, dataset, results, topk=20, eval_fn=None):
+        """Evaluation for object detection.
+
+        Args:
+            dataset (Dataset): A PyTorch dataset.
+            results (list): Object detection results from test
+                results pkl file.
+            topk (int): Number of the highest topk and
+                lowest topk after evaluation index sorting. Default: 20.
+            eval_fn (callable, optional): Eval function, Default: None.
+
+        Returns:
+            tuple: A tuple contains good samples and bad samples.
+                good_mAPs (dict[int, float]): A dict contains good
+                    samples's indices in dataset and model's
+                    performance on them.
+                bad_mAPs (dict[int, float]): A dict contains bad
+                    samples's indices in dataset and model's
+                    performance on them.
+        """
+
+        if eval_fn is None:
+            eval_fn = bbox_map_eval
+        else:
+            assert callable(eval_fn)
+
+        prog_bar = ProgressBar(len(results))
+        _mAPs = {}
+        data_info = {}
+        for i, (result, ) in enumerate(zip(results)):
+
+            # self.dataset[i] should not call directly
+            # because there is a risk of mismatch
+            data_info = dataset.prepare_data(i)
+            data_info['bboxes'] = data_info['gt_bboxes'].tensor
+            data_info['labels'] = data_info['gt_bboxes_labels']
+
+            pred = result['pred_instances']
+            pred_bboxes = pred['bboxes'].cpu().numpy()
+            pred_scores = pred['scores'].cpu().numpy()
+            pred_labels = pred['labels'].cpu().numpy()
+
+            dets = []
+            for label in range(len(dataset.metainfo['classes'])):
+                index = np.where(pred_labels == label)[0]
+                pred_bbox_scores = np.hstack(
+                    [pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
+                dets.append(pred_bbox_scores)
+            mAP = eval_fn(dets, data_info)
+
+            _mAPs[i] = mAP
+            prog_bar.update()
+        # descending select topk image
+        _mAPs = list(sorted(_mAPs.items(), key=lambda kv: kv[1]))
+        good_mAPs = _mAPs[-topk:]
+        bad_mAPs = _mAPs[:topk]
+
+        return good_mAPs, bad_mAPs
+
+    def panoptic_evaluate(self, dataset, results, topk=20):
+        """Evaluation for panoptic segmentation.
+
+        Args:
+            dataset (Dataset): A PyTorch dataset.
+            results (list): Panoptic segmentation results from test
+                results pkl file.
+            topk (int): Number of the highest topk and
+                lowest topk after evaluation index sorting. Default: 20.
+
+        Returns:
+            tuple: A tuple contains good samples and bad samples.
+                good_pqs (dict[int, float]): A dict contains good
+                    samples's indices in dataset and model's
+                    performance on them.
+                bad_pqs (dict[int, float]): A dict contains bad
+                    samples's indices in dataset and model's
+                    performance on them.
+        """
+        pqs = {}
+        prog_bar = ProgressBar(len(results))
+
+        for i in range(len(results)):
+            data_sample = {}
+            for k in dataset[i].keys():
+                data_sample[k] = dataset[i][k]
+
+            for k in results[i].keys():
+                data_sample[k] = results[i][k]
+
+            self.evaluator.process([data_sample])
+            metrics = self.evaluator.evaluate(1)
+
+            pqs[i] = metrics['coco_panoptic/PQ']
+            prog_bar.update()
+
+        # descending select topk image
+        pqs = list(sorted(pqs.items(), key=lambda kv: kv[1]))
+        good_pqs = pqs[-topk:]
+        bad_pqs = pqs[:topk]
+
+        return good_pqs, bad_pqs
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet eval image prediction result for each')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument(
+        'prediction_path', help='prediction path where test pkl result')
+    parser.add_argument(
+        'show_dir', help='directory where painted images will be saved')
+    parser.add_argument('--show', action='store_true', help='show results')
+    parser.add_argument(
+        '--wait-time',
+        type=float,
+        default=0,
+        help='the interval of show (s), 0 is block')
+    parser.add_argument(
+        '--topk',
+        default=20,
+        type=int,
+        help='saved Number of the highest topk '
+        'and lowest topk after index sorting')
+    parser.add_argument(
+        '--show-score-thr',
+        type=float,
+        default=0,
+        help='score threshold (default: 0.)')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    check_file_exist(args.prediction_path)
+
+    cfg = Config.fromfile(args.config)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    cfg.test_dataloader.dataset.test_mode = True
+
+    cfg.test_dataloader.pop('batch_size', 0)
+    if cfg.train_dataloader.dataset.type in ('MultiImageMixDataset',
+                                             'ClassBalancedDataset',
+                                             'RepeatDataset'):
+        cfg.test_dataloader.dataset.pipeline = get_loading_pipeline(
+            cfg.train_dataloader.dataset.dataset.pipeline)
+    elif cfg.train_dataloader.dataset.type in ('ConcatDataset', ):
+        cfg.test_dataloader.dataset.pipeline = get_loading_pipeline(
+            cfg.train_dataloader.dataset.datasets[0].pipeline)
+    else:
+        cfg.test_dataloader.dataset.pipeline = get_loading_pipeline(
+            cfg.train_dataloader.dataset.pipeline)
+    dataset = DATASETS.build(cfg.test_dataloader.dataset)
+    outputs = load(args.prediction_path)
+
+    cfg.work_dir = args.show_dir
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    result_visualizer = ResultVisualizer(args.show, args.wait_time,
+                                         args.show_score_thr, runner)
+    result_visualizer.evaluate_and_show(
+        dataset, outputs, topk=args.topk, show_dir=args.show_dir)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/benchmark.py b/mmde/mmdet/.mim/tools/analysis_tools/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfc06e2a3ade9d254c637ded42a7760213473c09
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/benchmark.py
@@ -0,0 +1,133 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+
+from mmengine import MMLogger
+from mmengine.config import Config, DictAction
+from mmengine.dist import init_dist
+from mmengine.registry import init_default_scope
+from mmengine.utils import mkdir_or_exist
+
+from mmdet.utils.benchmark import (DataLoaderBenchmark, DatasetBenchmark,
+                                   InferenceBenchmark)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet benchmark')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('--checkpoint', help='checkpoint file')
+    parser.add_argument(
+        '--task',
+        choices=['inference', 'dataloader', 'dataset'],
+        default='dataloader',
+        help='Which task do you want to go to benchmark')
+    parser.add_argument(
+        '--repeat-num',
+        type=int,
+        default=1,
+        help='number of repeat times of measurement for averaging the results')
+    parser.add_argument(
+        '--max-iter', type=int, default=2000, help='num of max iter')
+    parser.add_argument(
+        '--log-interval', type=int, default=50, help='interval of logging')
+    parser.add_argument(
+        '--num-warmup', type=int, default=5, help='Number of warmup')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase'
+        'the inference speed')
+    parser.add_argument(
+        '--dataset-type',
+        choices=['train', 'val', 'test'],
+        default='test',
+        help='Benchmark dataset type. only supports train, val and test')
+    parser.add_argument(
+        '--work-dir',
+        help='the directory to save the file containing '
+        'benchmark metrics')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def inference_benchmark(args, cfg, distributed, logger):
+    benchmark = InferenceBenchmark(
+        cfg,
+        args.checkpoint,
+        distributed,
+        args.fuse_conv_bn,
+        args.max_iter,
+        args.log_interval,
+        args.num_warmup,
+        logger=logger)
+    return benchmark
+
+
+def dataloader_benchmark(args, cfg, distributed, logger):
+    benchmark = DataLoaderBenchmark(
+        cfg,
+        distributed,
+        args.dataset_type,
+        args.max_iter,
+        args.log_interval,
+        args.num_warmup,
+        logger=logger)
+    return benchmark
+
+
+def dataset_benchmark(args, cfg, distributed, logger):
+    benchmark = DatasetBenchmark(
+        cfg,
+        args.dataset_type,
+        args.max_iter,
+        args.log_interval,
+        args.num_warmup,
+        logger=logger)
+    return benchmark
+
+
+def main():
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    distributed = False
+    if args.launcher != 'none':
+        init_dist(args.launcher, **cfg.get('env_cfg', {}).get('dist_cfg', {}))
+        distributed = True
+
+    log_file = None
+    if args.work_dir:
+        log_file = os.path.join(args.work_dir, 'benchmark.log')
+        mkdir_or_exist(args.work_dir)
+
+    logger = MMLogger.get_instance(
+        'mmdet', log_file=log_file, log_level='INFO')
+
+    benchmark = eval(f'{args.task}_benchmark')(args, cfg, distributed, logger)
+    benchmark.run(args.repeat_num)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/browse_dataset.py b/mmde/mmdet/.mim/tools/analysis_tools/browse_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef2c484d650f6ceaa68886d9f9da8dd411bd0c2e
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/browse_dataset.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+
+from mmengine.config import Config, DictAction
+from mmengine.registry import init_default_scope
+from mmengine.utils import ProgressBar
+
+from mmdet.models.utils import mask2ndarray
+from mmdet.registry import DATASETS, VISUALIZERS
+from mmdet.structures.bbox import BaseBoxes
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Browse a dataset')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--output-dir',
+        default=None,
+        type=str,
+        help='If there is no display interface, you can save it')
+    parser.add_argument('--not-show', default=False, action='store_true')
+    parser.add_argument(
+        '--show-interval',
+        type=float,
+        default=2,
+        help='the interval of show (s)')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # register all modules in mmdet into the registries
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    dataset = DATASETS.build(cfg.train_dataloader.dataset)
+    visualizer = VISUALIZERS.build(cfg.visualizer)
+    visualizer.dataset_meta = dataset.metainfo
+
+    progress_bar = ProgressBar(len(dataset))
+    for item in dataset:
+        img = item['inputs'].permute(1, 2, 0).numpy()
+        data_sample = item['data_samples'].numpy()
+        gt_instances = data_sample.gt_instances
+        img_path = osp.basename(item['data_samples'].img_path)
+
+        out_file = osp.join(
+            args.output_dir,
+            osp.basename(img_path)) if args.output_dir is not None else None
+
+        img = img[..., [2, 1, 0]]  # bgr to rgb
+        gt_bboxes = gt_instances.get('bboxes', None)
+        if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes):
+            gt_instances.bboxes = gt_bboxes.tensor
+        gt_masks = gt_instances.get('masks', None)
+        if gt_masks is not None:
+            masks = mask2ndarray(gt_masks)
+            gt_instances.masks = masks.astype(bool)
+        data_sample.gt_instances = gt_instances
+
+        visualizer.add_datasample(
+            osp.basename(img_path),
+            img,
+            data_sample,
+            draw_pred=False,
+            show=not args.not_show,
+            wait_time=args.show_interval,
+            out_file=out_file)
+
+        progress_bar.update()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/browse_grounding_dataset.py b/mmde/mmdet/.mim/tools/analysis_tools/browse_grounding_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..43261956faa37e0d7d0fb8e6dec502c1260b5e04
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/browse_grounding_dataset.py
@@ -0,0 +1,200 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+
+import numpy as np
+from mmcv.image import imwrite
+from mmengine.config import Config, DictAction
+from mmengine.registry import init_default_scope
+from mmengine.utils import ProgressBar
+
+from mmdet.registry import DATASETS, VISUALIZERS
+from mmdet.structures.bbox import BaseBoxes
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Browse a dataset')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--output-dir',
+        '-o',
+        default=None,
+        type=str,
+        help='If there is no display interface, you can save it')
+    parser.add_argument('--not-show', default=False, action='store_true')
+    parser.add_argument('--show-num', '-n', type=int, default=30)
+    parser.add_argument('--shuffle', default=False, action='store_true')
+    parser.add_argument(
+        '--show-interval',
+        type=float,
+        default=0,
+        help='the interval of show (s)')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def draw_all_character(visualizer, characters, w):
+    start_index = 2
+    y_index = 5
+    for char in characters:
+        if isinstance(char, str):
+            visualizer.draw_texts(
+                str(char),
+                positions=np.array([start_index, y_index]),
+                colors=(0, 0, 0),
+                font_families='monospace')
+            start_index += len(char) * 8
+        else:
+            visualizer.draw_texts(
+                str(char[0]),
+                positions=np.array([start_index, y_index]),
+                colors=char[1],
+                font_families='monospace')
+            start_index += len(char[0]) * 8
+
+        if start_index > w - 10:
+            start_index = 2
+            y_index += 15
+
+    drawn_text = visualizer.get_image()
+    return drawn_text
+
+
+def main():
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    assert args.show_num > 0
+
+    # register all modules in mmdet into the registries
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    dataset = DATASETS.build(cfg.train_dataloader.dataset)
+    visualizer = VISUALIZERS.build(cfg.visualizer)
+    visualizer.dataset_meta = dataset.metainfo
+
+    dataset_index = list(range(len(dataset)))
+    if args.shuffle:
+        import random
+        random.shuffle(dataset_index)
+
+    progress_bar = ProgressBar(len(dataset))
+    for i in dataset_index[:args.show_num]:
+        item = dataset[i]
+        img = item['inputs'].permute(1, 2, 0).numpy()
+        data_sample = item['data_samples'].numpy()
+        gt_instances = data_sample.gt_instances
+        tokens_positive = data_sample.tokens_positive
+
+        gt_labels = gt_instances.labels
+
+        base_name = osp.basename(item['data_samples'].img_path)
+        name, extension = osp.splitext(base_name)
+
+        out_file = osp.join(args.output_dir, name + '_' + str(i) +
+                            extension) if args.output_dir is not None else None
+
+        img = img[..., [2, 1, 0]]  # bgr to rgb
+        gt_bboxes = gt_instances.get('bboxes', None)
+        if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes):
+            gt_instances.bboxes = gt_bboxes.tensor
+
+        print(data_sample.text)
+
+        dataset_mode = data_sample.dataset_mode
+        if dataset_mode == 'VG':
+            max_label = int(max(gt_labels) if len(gt_labels) > 0 else 0)
+            palette = np.random.randint(0, 256, size=(max_label + 1, 3))
+            bbox_palette = [tuple(c) for c in palette]
+            # bbox_palette = get_palette('random', max_label + 1)
+            colors = [bbox_palette[label] for label in gt_labels]
+
+            visualizer.set_image(img)
+
+            for label, bbox, color in zip(gt_labels, gt_bboxes, colors):
+                visualizer.draw_bboxes(
+                    bbox, edge_colors=color, face_colors=color, alpha=0.3)
+                visualizer.draw_bboxes(bbox, edge_colors=color, alpha=1)
+
+            drawn_img = visualizer.get_image()
+
+            new_image = np.ones((100, img.shape[1], 3), dtype=np.uint8) * 255
+            visualizer.set_image(new_image)
+
+            gt_tokens_positive = [
+                tokens_positive[label] for label in gt_labels
+            ]
+            split_by_character = [char for char in data_sample.text]
+            characters = []
+            start_index = 0
+            end_index = 0
+            for w in split_by_character:
+                end_index += len(w)
+                is_find = False
+                for i, positive in enumerate(gt_tokens_positive):
+                    for p in positive:
+                        if start_index >= p[0] and end_index <= p[1]:
+                            characters.append([w, colors[i]])
+                            is_find = True
+                            break
+                    if is_find:
+                        break
+                if not is_find:
+                    characters.append([w, (0, 0, 0)])
+                start_index = end_index
+
+            drawn_text = draw_all_character(visualizer, characters,
+                                            img.shape[1])
+            drawn_img = np.concatenate((drawn_img, drawn_text), axis=0)
+        else:
+            gt_labels = gt_instances.labels
+            text = data_sample.text
+            label_names = []
+            for label in gt_labels:
+                label_names.append(text[
+                    tokens_positive[label][0][0]:tokens_positive[label][0][1]])
+            gt_instances.label_names = label_names
+            data_sample.gt_instances = gt_instances
+
+            visualizer.add_datasample(
+                base_name,
+                img,
+                data_sample,
+                draw_pred=False,
+                show=False,
+                wait_time=0,
+                out_file=None)
+            drawn_img = visualizer.get_image()
+
+            new_image = np.ones((100, img.shape[1], 3), dtype=np.uint8) * 255
+            visualizer.set_image(new_image)
+
+            characters = [char for char in text]
+            drawn_text = draw_all_character(visualizer, characters,
+                                            img.shape[1])
+            drawn_img = np.concatenate((drawn_img, drawn_text), axis=0)
+
+        if not args.not_show:
+            visualizer.show(
+                drawn_img, win_name=base_name, wait_time=args.show_interval)
+
+        if out_file is not None:
+            imwrite(drawn_img[..., ::-1], out_file)
+
+        progress_bar.update()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/browse_grounding_raw.py b/mmde/mmdet/.mim/tools/analysis_tools/browse_grounding_raw.py
new file mode 100644
index 0000000000000000000000000000000000000000..16fa604cacd296d3f30e8dfc3f25857802bc2bf5
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/browse_grounding_raw.py
@@ -0,0 +1,284 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import os.path as osp
+
+import cv2
+import numpy as np
+from mmcv.image import imfrombytes, imwrite
+from mmengine.fileio import get
+from mmengine.structures import InstanceData
+from mmengine.utils import mkdir_or_exist
+
+from mmdet.structures import DetDataSample
+from mmdet.visualization import DetLocalVisualizer
+from mmdet.visualization.palette import _get_adaptive_scales
+
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Browse a dataset')
+    parser.add_argument('data_root')
+    parser.add_argument('ann_file')
+    parser.add_argument('img_prefix')
+    parser.add_argument('--label-map-file', '-m', default=None)
+    parser.add_argument(
+        '--output-dir',
+        '-o',
+        default=None,
+        type=str,
+        help='If there is no display interface, you can save it')
+    parser.add_argument('--not-show', default=False, action='store_true')
+    parser.add_argument('--show-num', '-n', type=int, default=30)
+    parser.add_argument('--shuffle', default=False, action='store_true')
+    parser.add_argument(
+        '--show-interval',
+        type=float,
+        default=0,
+        help='the interval of show (s)')
+    args = parser.parse_args()
+    return args
+
+
+def draw_all_character(visualizer, characters, w):
+    start_index = 2
+    y_index = 5
+    for char in characters:
+        if isinstance(char, str):
+            visualizer.draw_texts(
+                str(char),
+                positions=np.array([start_index, y_index]),
+                colors=(0, 0, 0),
+                font_families='monospace')
+            start_index += len(char) * 8
+        else:
+            visualizer.draw_texts(
+                str(char[0]),
+                positions=np.array([start_index, y_index]),
+                colors=char[1],
+                font_families='monospace')
+            start_index += len(char[0]) * 8
+
+        if start_index > w - 10:
+            start_index = 2
+            y_index += 15
+
+    drawn_text = visualizer.get_image()
+    return drawn_text
+
+
+def main():
+    args = parse_args()
+    assert args.show_num > 0
+
+    local_path = osp.join(args.data_root, args.ann_file)
+    with open(local_path, 'r') as f:
+        data_list = [json.loads(line) for line in f]
+
+    dataset_index = list(range(len(data_list)))
+    if args.shuffle:
+        import random
+        random.shuffle(dataset_index)
+
+    if args.label_map_file is not None:
+        label_map_file = osp.join(args.data_root, args.label_map_file)
+        with open(label_map_file, 'r') as file:
+            label_map = json.load(file)
+
+    visualizer = DetLocalVisualizer()
+
+    for i in dataset_index[:args.show_num]:
+        item = data_list[i]
+
+        img_path = osp.join(args.data_root, args.img_prefix, item['filename'])
+        if backend_args is not None:
+            img_bytes = get(img_path, backend_args)
+            img = imfrombytes(img_bytes, flag='color')
+        else:
+            img = cv2.imread(img_path)
+        img = img[..., [2, 1, 0]]  # bgr to rgb
+
+        base_name, extension = osp.splitext(item['filename'])
+
+        out_file = osp.join(args.output_dir, base_name + '_' + str(i) +
+                            extension) if args.output_dir is not None else None
+
+        if args.output_dir is not None:
+            mkdir_or_exist(args.output_dir)
+
+        if 'detection' in item:
+            anno = item['detection']
+
+            instances = [obj for obj in anno['instances']]
+            bboxes = [obj['bbox'] for obj in instances]
+            bbox_labels = [int(obj['label']) for obj in instances]
+            label_names = [label_map[str(label)] for label in bbox_labels]
+
+            data_sample = DetDataSample()
+            gt_instances = InstanceData()
+            if len(instances) > 0 and 'score' in instances[0]:
+                score = [obj['score'] for obj in instances]
+                gt_instances['scores'] = np.array(score)
+
+            gt_instances['bboxes'] = np.array(bboxes).reshape(-1, 4)
+            gt_instances['labels'] = np.array(bbox_labels)
+            gt_instances['label_names'] = label_names
+            data_sample.gt_instances = gt_instances
+
+            visualizer.add_datasample(
+                osp.basename(img_path),
+                img,
+                data_sample,
+                draw_pred=False,
+                show=not args.not_show,
+                wait_time=args.show_interval,
+                out_file=out_file)
+        elif 'grounding' in item:
+            anno = item['grounding']
+            text = anno['caption']
+            regions = anno['regions']
+
+            max_label = len(regions) if len(regions) > 0 else 0
+            palette = np.random.randint(0, 256, size=(max_label + 1, 3))
+            bbox_palette = [tuple(c) for c in palette]
+            # bbox_palette = get_palette('random', max_label + 1)
+            colors = [bbox_palette[label] for label in range(max_label)]
+
+            visualizer.set_image(img)
+
+            gt_tokens_positive = []
+            for i, region in enumerate(regions):
+                bbox = region['bbox']
+                bbox = np.array(bbox).reshape(-1, 4)
+                tokens_positive = region['tokens_positive']
+                gt_tokens_positive.append(tokens_positive)
+                visualizer.draw_bboxes(
+                    bbox,
+                    edge_colors=colors[i],
+                    face_colors=colors[i],
+                    alpha=0.3)
+                visualizer.draw_bboxes(bbox, edge_colors=colors[i], alpha=1)
+
+                if 'score' in region:
+                    areas = (bbox[:, 3] - bbox[:, 1]) * (
+                        bbox[:, 2] - bbox[:, 0])
+                    scales = _get_adaptive_scales(areas)
+                    score = region['score'][0]
+                    score = [str(s) for s in score]
+                    font_sizes = [
+                        int(13 * scales[i]) for i in range(len(scales))
+                    ]
+                    visualizer.draw_texts(
+                        score,
+                        bbox[:, :2].astype(np.int32),
+                        colors=(255, 255, 255),
+                        font_sizes=font_sizes,
+                        bboxes=[{
+                            'facecolor': 'black',
+                            'alpha': 0.8,
+                            'pad': 0.7,
+                            'edgecolor': 'none'
+                        }] * len(bbox))
+
+            drawn_img = visualizer.get_image()
+            new_image = np.ones((100, img.shape[1], 3), dtype=np.uint8) * 255
+            visualizer.set_image(new_image)
+
+            split_by_character = [char for char in text]
+            characters = []
+            start_index = 0
+            end_index = 0
+            for w in split_by_character:
+                end_index += len(w)
+                is_find = False
+                for i, positive in enumerate(gt_tokens_positive):
+                    for p in positive:
+                        if start_index >= p[0] and end_index <= p[1]:
+                            characters.append([w, colors[i]])
+                            is_find = True
+                            break
+                    if is_find:
+                        break
+                if not is_find:
+                    characters.append([w, (0, 0, 0)])
+                start_index = end_index
+
+            drawn_text = draw_all_character(visualizer, characters,
+                                            img.shape[1])
+            drawn_img = np.concatenate((drawn_img, drawn_text), axis=0)
+
+            if not args.not_show:
+                visualizer.show(
+                    drawn_img,
+                    win_name=base_name,
+                    wait_time=args.show_interval)
+
+            if out_file is not None:
+                imwrite(drawn_img[..., ::-1], out_file)
+
+        elif 'referring' in item:
+            referring = item['referring']
+
+            max_label = len(referring) if len(referring) > 0 else 0
+            palette = np.random.randint(0, 256, size=(max_label + 1, 3))
+            bbox_palette = [tuple(c) for c in palette]
+            # bbox_palette = get_palette('random', max_label + 1)
+            colors = [bbox_palette[label] for label in range(max_label)]
+
+            visualizer.set_image(img)
+            phrases = []
+            for i, ref in enumerate(referring):
+                bbox = ref['bbox']
+                phrase = ref['phrase']
+                phrases.append(' // '.join(phrase))
+                bbox = np.array(bbox).reshape(-1, 4)
+
+                visualizer.draw_bboxes(
+                    bbox,
+                    edge_colors=colors[i],
+                    face_colors=colors[i],
+                    alpha=0.3)
+                visualizer.draw_bboxes(bbox, edge_colors=colors[i], alpha=1)
+            drawn_img = visualizer.get_image()
+
+            new_image = np.ones((100, img.shape[1], 3), dtype=np.uint8) * 255
+            visualizer.set_image(new_image)
+
+            start_index = 2
+            y_index = 5
+
+            chunk_size = max(min(img.shape[1] - 400, 70), 50)
+            for i, p in enumerate(phrases):
+                chunk_p = [
+                    p[i:i + chunk_size] for i in range(0, len(p), chunk_size)
+                ]
+                for cp in chunk_p:
+                    visualizer.draw_texts(
+                        cp,
+                        positions=np.array([start_index, y_index]),
+                        colors=colors[i],
+                        font_families='monospace')
+                    y_index += 15
+
+            drawn_text = visualizer.get_image()
+            drawn_img = np.concatenate((drawn_img, drawn_text), axis=0)
+
+            if not args.not_show:
+                visualizer.show(
+                    drawn_img,
+                    win_name=base_name,
+                    wait_time=args.show_interval)
+
+            if out_file is not None:
+                imwrite(drawn_img[..., ::-1], out_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/coco_error_analysis.py b/mmde/mmdet/.mim/tools/analysis_tools/coco_error_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed270144d770e28a9b8f90c9c4991824af886fef
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/coco_error_analysis.py
@@ -0,0 +1,372 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os
+from argparse import ArgumentParser
+from multiprocessing import Pool
+
+import matplotlib.pyplot as plt
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+
+
+def makeplot(rs, ps, outDir, class_name, iou_type):
+    cs = np.vstack([
+        np.ones((2, 3)),
+        np.array([0.31, 0.51, 0.74]),
+        np.array([0.75, 0.31, 0.30]),
+        np.array([0.36, 0.90, 0.38]),
+        np.array([0.50, 0.39, 0.64]),
+        np.array([1, 0.6, 0]),
+    ])
+    areaNames = ['allarea', 'small', 'medium', 'large']
+    types = ['C75', 'C50', 'Loc', 'Sim', 'Oth', 'BG', 'FN']
+    for i in range(len(areaNames)):
+        area_ps = ps[..., i, 0]
+        figure_title = iou_type + '-' + class_name + '-' + areaNames[i]
+        aps = [ps_.mean() for ps_ in area_ps]
+        ps_curve = [
+            ps_.mean(axis=1) if ps_.ndim > 1 else ps_ for ps_ in area_ps
+        ]
+        ps_curve.insert(0, np.zeros(ps_curve[0].shape))
+        fig = plt.figure()
+        ax = plt.subplot(111)
+        for k in range(len(types)):
+            ax.plot(rs, ps_curve[k + 1], color=[0, 0, 0], linewidth=0.5)
+            ax.fill_between(
+                rs,
+                ps_curve[k],
+                ps_curve[k + 1],
+                color=cs[k],
+                label=str(f'[{aps[k]:.3f}]' + types[k]),
+            )
+        plt.xlabel('recall')
+        plt.ylabel('precision')
+        plt.xlim(0, 1.0)
+        plt.ylim(0, 1.0)
+        plt.title(figure_title)
+        plt.legend()
+        # plt.show()
+        fig.savefig(outDir + f'/{figure_title}.png')
+        plt.close(fig)
+
+
+def autolabel(ax, rects):
+    """Attach a text label above each bar in *rects*, displaying its height."""
+    for rect in rects:
+        height = rect.get_height()
+        if height > 0 and height <= 1:  # for percent values
+            text_label = '{:2.0f}'.format(height * 100)
+        else:
+            text_label = '{:2.0f}'.format(height)
+        ax.annotate(
+            text_label,
+            xy=(rect.get_x() + rect.get_width() / 2, height),
+            xytext=(0, 3),  # 3 points vertical offset
+            textcoords='offset points',
+            ha='center',
+            va='bottom',
+            fontsize='x-small',
+        )
+
+
+def makebarplot(rs, ps, outDir, class_name, iou_type):
+    areaNames = ['allarea', 'small', 'medium', 'large']
+    types = ['C75', 'C50', 'Loc', 'Sim', 'Oth', 'BG', 'FN']
+    fig, ax = plt.subplots()
+    x = np.arange(len(areaNames))  # the areaNames locations
+    width = 0.60  # the width of the bars
+    rects_list = []
+    figure_title = iou_type + '-' + class_name + '-' + 'ap bar plot'
+    for i in range(len(types) - 1):
+        type_ps = ps[i, ..., 0]
+        aps = [ps_.mean() for ps_ in type_ps.T]
+        rects_list.append(
+            ax.bar(
+                x - width / 2 + (i + 1) * width / len(types),
+                aps,
+                width / len(types),
+                label=types[i],
+            ))
+
+    # Add some text for labels, title and custom x-axis tick labels, etc.
+    ax.set_ylabel('Mean Average Precision (mAP)')
+    ax.set_title(figure_title)
+    ax.set_xticks(x)
+    ax.set_xticklabels(areaNames)
+    ax.legend()
+
+    # Add score texts over bars
+    for rects in rects_list:
+        autolabel(ax, rects)
+
+    # Save plot
+    fig.savefig(outDir + f'/{figure_title}.png')
+    plt.close(fig)
+
+
+def get_gt_area_group_numbers(cocoEval):
+    areaRng = cocoEval.params.areaRng
+    areaRngStr = [str(aRng) for aRng in areaRng]
+    areaRngLbl = cocoEval.params.areaRngLbl
+    areaRngStr2areaRngLbl = dict(zip(areaRngStr, areaRngLbl))
+    areaRngLbl2Number = dict.fromkeys(areaRngLbl, 0)
+    for evalImg in cocoEval.evalImgs:
+        if evalImg:
+            for gtIgnore in evalImg['gtIgnore']:
+                if not gtIgnore:
+                    aRngLbl = areaRngStr2areaRngLbl[str(evalImg['aRng'])]
+                    areaRngLbl2Number[aRngLbl] += 1
+    return areaRngLbl2Number
+
+
+def make_gt_area_group_numbers_plot(cocoEval, outDir, verbose=True):
+    areaRngLbl2Number = get_gt_area_group_numbers(cocoEval)
+    areaRngLbl = areaRngLbl2Number.keys()
+    if verbose:
+        print('number of annotations per area group:', areaRngLbl2Number)
+
+    # Init figure
+    fig, ax = plt.subplots()
+    x = np.arange(len(areaRngLbl))  # the areaNames locations
+    width = 0.60  # the width of the bars
+    figure_title = 'number of annotations per area group'
+
+    rects = ax.bar(x, areaRngLbl2Number.values(), width)
+
+    # Add some text for labels, title and custom x-axis tick labels, etc.
+    ax.set_ylabel('Number of annotations')
+    ax.set_title(figure_title)
+    ax.set_xticks(x)
+    ax.set_xticklabels(areaRngLbl)
+
+    # Add score texts over bars
+    autolabel(ax, rects)
+
+    # Save plot
+    fig.tight_layout()
+    fig.savefig(outDir + f'/{figure_title}.png')
+    plt.close(fig)
+
+
+def make_gt_area_histogram_plot(cocoEval, outDir):
+    n_bins = 100
+    areas = [ann['area'] for ann in cocoEval.cocoGt.anns.values()]
+
+    # init figure
+    figure_title = 'gt annotation areas histogram plot'
+    fig, ax = plt.subplots()
+
+    # Set the number of bins
+    ax.hist(np.sqrt(areas), bins=n_bins)
+
+    # Add some text for labels, title and custom x-axis tick labels, etc.
+    ax.set_xlabel('Squareroot Area')
+    ax.set_ylabel('Number of annotations')
+    ax.set_title(figure_title)
+
+    # Save plot
+    fig.tight_layout()
+    fig.savefig(outDir + f'/{figure_title}.png')
+    plt.close(fig)
+
+
+def analyze_individual_category(k,
+                                cocoDt,
+                                cocoGt,
+                                catId,
+                                iou_type,
+                                areas=None):
+    nm = cocoGt.loadCats(catId)[0]
+    print(f'--------------analyzing {k + 1}-{nm["name"]}---------------')
+    ps_ = {}
+    dt = copy.deepcopy(cocoDt)
+    nm = cocoGt.loadCats(catId)[0]
+    imgIds = cocoGt.getImgIds()
+    dt_anns = dt.dataset['annotations']
+    select_dt_anns = []
+    for ann in dt_anns:
+        if ann['category_id'] == catId:
+            select_dt_anns.append(ann)
+    dt.dataset['annotations'] = select_dt_anns
+    dt.createIndex()
+    # compute precision but ignore superclass confusion
+    gt = copy.deepcopy(cocoGt)
+    child_catIds = gt.getCatIds(supNms=[nm['supercategory']])
+    for idx, ann in enumerate(gt.dataset['annotations']):
+        if ann['category_id'] in child_catIds and ann['category_id'] != catId:
+            gt.dataset['annotations'][idx]['ignore'] = 1
+            gt.dataset['annotations'][idx]['iscrowd'] = 1
+            gt.dataset['annotations'][idx]['category_id'] = catId
+    cocoEval = COCOeval(gt, copy.deepcopy(dt), iou_type)
+    cocoEval.params.imgIds = imgIds
+    cocoEval.params.maxDets = [100]
+    cocoEval.params.iouThrs = [0.1]
+    cocoEval.params.useCats = 1
+    if areas:
+        cocoEval.params.areaRng = [
+            [0**2, areas[2]],
+            [0**2, areas[0]],
+            [areas[0], areas[1]],
+            [areas[1], areas[2]],
+        ]
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    ps_supercategory = cocoEval.eval['precision'][0, :, k, :, :]
+    ps_['ps_supercategory'] = ps_supercategory
+    # compute precision but ignore any class confusion
+    gt = copy.deepcopy(cocoGt)
+    for idx, ann in enumerate(gt.dataset['annotations']):
+        if ann['category_id'] != catId:
+            gt.dataset['annotations'][idx]['ignore'] = 1
+            gt.dataset['annotations'][idx]['iscrowd'] = 1
+            gt.dataset['annotations'][idx]['category_id'] = catId
+    cocoEval = COCOeval(gt, copy.deepcopy(dt), iou_type)
+    cocoEval.params.imgIds = imgIds
+    cocoEval.params.maxDets = [100]
+    cocoEval.params.iouThrs = [0.1]
+    cocoEval.params.useCats = 1
+    if areas:
+        cocoEval.params.areaRng = [
+            [0**2, areas[2]],
+            [0**2, areas[0]],
+            [areas[0], areas[1]],
+            [areas[1], areas[2]],
+        ]
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    ps_allcategory = cocoEval.eval['precision'][0, :, k, :, :]
+    ps_['ps_allcategory'] = ps_allcategory
+    return k, ps_
+
+
+def analyze_results(res_file,
+                    ann_file,
+                    res_types,
+                    out_dir,
+                    extraplots=None,
+                    areas=None,
+                    score_thr=None):
+    for res_type in res_types:
+        assert res_type in ['bbox', 'segm']
+    if areas:
+        assert (len(areas) == 3), '3 integers should be specified as areas, \
+            representing 3 area regions'
+
+    if score_thr:
+        assert score_thr >= 0, 'score_thr should be bigger than 0'
+
+    directory = os.path.dirname(out_dir + '/')
+    if not os.path.exists(directory):
+        print(f'-------------create {out_dir}-----------------')
+        os.makedirs(directory)
+
+    cocoGt = COCO(ann_file)
+    cocoDt = cocoGt.loadRes(res_file)
+    imgIds = cocoGt.getImgIds()
+
+    if score_thr:
+        cocoDt.dataset['annotations'] = list(
+            filter(lambda ann: ann['score'] >= score_thr,
+                   cocoDt.dataset['annotations']))
+        cocoDt.createIndex()
+
+    for res_type in res_types:
+        res_out_dir = out_dir + '/' + res_type + '/'
+        res_directory = os.path.dirname(res_out_dir)
+        if not os.path.exists(res_directory):
+            print(f'-------------create {res_out_dir}-----------------')
+            os.makedirs(res_directory)
+        iou_type = res_type
+        cocoEval = COCOeval(
+            copy.deepcopy(cocoGt), copy.deepcopy(cocoDt), iou_type)
+        cocoEval.params.imgIds = imgIds
+        cocoEval.params.iouThrs = [0.75, 0.5, 0.1]
+        cocoEval.params.maxDets = [100]
+        if areas:
+            cocoEval.params.areaRng = [
+                [0**2, areas[2]],
+                [0**2, areas[0]],
+                [areas[0], areas[1]],
+                [areas[1], areas[2]],
+            ]
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+        ps = cocoEval.eval['precision']
+        ps = np.vstack([ps, np.zeros((4, *ps.shape[1:]))])
+        catIds = cocoGt.getCatIds()
+        recThrs = cocoEval.params.recThrs
+        with Pool(processes=48) as pool:
+            args = [(k, cocoDt, cocoGt, catId, iou_type, areas)
+                    for k, catId in enumerate(catIds)]
+            analyze_results = pool.starmap(analyze_individual_category, args)
+        for k, catId in enumerate(catIds):
+            nm = cocoGt.loadCats(catId)[0]
+            print(f'--------------saving {k + 1}-{nm["name"]}---------------')
+            analyze_result = analyze_results[k]
+            assert k == analyze_result[0]
+            ps_supercategory = analyze_result[1]['ps_supercategory']
+            ps_allcategory = analyze_result[1]['ps_allcategory']
+            # compute precision but ignore superclass confusion
+            ps[3, :, k, :, :] = ps_supercategory
+            # compute precision but ignore any class confusion
+            ps[4, :, k, :, :] = ps_allcategory
+            # fill in background and false negative errors and plot
+            ps[ps == -1] = 0
+            ps[5, :, k, :, :] = ps[4, :, k, :, :] > 0
+            ps[6, :, k, :, :] = 1.0
+            makeplot(recThrs, ps[:, :, k], res_out_dir, nm['name'], iou_type)
+            if extraplots:
+                makebarplot(recThrs, ps[:, :, k], res_out_dir, nm['name'],
+                            iou_type)
+        makeplot(recThrs, ps, res_out_dir, 'allclass', iou_type)
+        if extraplots:
+            makebarplot(recThrs, ps, res_out_dir, 'allclass', iou_type)
+            make_gt_area_group_numbers_plot(
+                cocoEval=cocoEval, outDir=res_out_dir, verbose=True)
+            make_gt_area_histogram_plot(cocoEval=cocoEval, outDir=res_out_dir)
+
+
+def main():
+    parser = ArgumentParser(description='COCO Error Analysis Tool')
+    parser.add_argument('result', help='result file (json format) path')
+    parser.add_argument('out_dir', help='dir to save analyze result images')
+    parser.add_argument(
+        '--ann',
+        default='data/coco/annotations/instances_val2017.json',
+        help='annotation file path',
+    )
+    parser.add_argument(
+        '--types', type=str, nargs='+', default=['bbox'], help='result types')
+    parser.add_argument(
+        '--extraplots',
+        action='store_true',
+        help='export extra bar/stat plots')
+    parser.add_argument(
+        '--score-thr',
+        type=float,
+        default=None,
+        help='score threshold to filter detection bboxes, only applied'
+        'when users want to change it.',
+    )
+    parser.add_argument(
+        '--areas',
+        type=int,
+        nargs='+',
+        default=[1024, 9216, 10000000000],
+        help='area regions',
+    )
+    args = parser.parse_args()
+    analyze_results(
+        args.result,
+        args.ann,
+        args.types,
+        out_dir=args.out_dir,
+        extraplots=args.extraplots,
+        areas=args.areas,
+        score_thr=args.score_thr,
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/coco_occluded_separated_recall.py b/mmde/mmdet/.mim/tools/analysis_tools/coco_occluded_separated_recall.py
new file mode 100644
index 0000000000000000000000000000000000000000..e61f2ccd94517c47674747a0732380db6e2a18c9
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/coco_occluded_separated_recall.py
@@ -0,0 +1,48 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from argparse import ArgumentParser
+
+import mmengine
+from mmengine.logging import print_log
+
+from mmdet.datasets import CocoDataset
+from mmdet.evaluation import CocoOccludedSeparatedMetric
+
+
+def main():
+    parser = ArgumentParser(
+        description='Compute recall of COCO occluded and separated masks '
+        'presented in paper https://arxiv.org/abs/2210.10046.')
+    parser.add_argument('result', help='result file (pkl format) path')
+    parser.add_argument('--out', help='file path to save evaluation results')
+    parser.add_argument(
+        '--score-thr',
+        type=float,
+        default=0.3,
+        help='Score threshold for the recall calculation. Defaults to 0.3')
+    parser.add_argument(
+        '--iou-thr',
+        type=float,
+        default=0.75,
+        help='IoU threshold for the recall calculation. Defaults to 0.75.')
+    parser.add_argument(
+        '--ann',
+        default='data/coco/annotations/instances_val2017.json',
+        help='coco annotation file path')
+    args = parser.parse_args()
+
+    results = mmengine.load(args.result)
+    assert 'masks' in results[0]['pred_instances'], \
+        'The results must be predicted by instance segmentation model.'
+    metric = CocoOccludedSeparatedMetric(
+        ann_file=args.ann, iou_thr=args.iou_thr, score_thr=args.score_thr)
+    metric.dataset_meta = CocoDataset.METAINFO
+    for datasample in results:
+        metric.process(data_batch=None, data_samples=[datasample])
+    metric_res = metric.compute_metrics(metric.results)
+    if args.out is not None:
+        mmengine.dump(metric_res, args.out)
+        print_log(f'Evaluation results have been saved to {args.out}.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/confusion_matrix.py b/mmde/mmdet/.mim/tools/analysis_tools/confusion_matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1c4c8ec86f70d28dfe7dc3d0173df4f4b46b8c3
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/confusion_matrix.py
@@ -0,0 +1,273 @@
+import argparse
+import os
+
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.ticker import MultipleLocator
+from mmcv.ops import nms
+from mmengine import Config, DictAction
+from mmengine.fileio import load
+from mmengine.registry import init_default_scope
+from mmengine.utils import ProgressBar
+
+from mmdet.evaluation import bbox_overlaps
+from mmdet.registry import DATASETS
+from mmdet.utils import replace_cfg_vals, update_data_root
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Generate confusion matrix from detection results')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument(
+        'prediction_path', help='prediction path where test .pkl result')
+    parser.add_argument(
+        'save_dir', help='directory where confusion matrix will be saved')
+    parser.add_argument(
+        '--show', action='store_true', help='show confusion matrix')
+    parser.add_argument(
+        '--color-theme',
+        default='plasma',
+        help='theme of the matrix color map')
+    parser.add_argument(
+        '--score-thr',
+        type=float,
+        default=0.3,
+        help='score threshold to filter detection bboxes')
+    parser.add_argument(
+        '--tp-iou-thr',
+        type=float,
+        default=0.5,
+        help='IoU threshold to be considered as matched')
+    parser.add_argument(
+        '--nms-iou-thr',
+        type=float,
+        default=None,
+        help='nms IoU threshold, only applied when users want to change the'
+        'nms IoU threshold.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def calculate_confusion_matrix(dataset,
+                               results,
+                               score_thr=0,
+                               nms_iou_thr=None,
+                               tp_iou_thr=0.5):
+    """Calculate the confusion matrix.
+
+    Args:
+        dataset (Dataset): Test or val dataset.
+        results (list[ndarray]): A list of detection results in each image.
+        score_thr (float|optional): Score threshold to filter bboxes.
+            Default: 0.
+        nms_iou_thr (float|optional): nms IoU threshold, the detection results
+            have done nms in the detector, only applied when users want to
+            change the nms IoU threshold. Default: None.
+        tp_iou_thr (float|optional): IoU threshold to be considered as matched.
+            Default: 0.5.
+    """
+    num_classes = len(dataset.metainfo['classes'])
+    confusion_matrix = np.zeros(shape=[num_classes + 1, num_classes + 1])
+    assert len(dataset) == len(results)
+    prog_bar = ProgressBar(len(results))
+    for idx, per_img_res in enumerate(results):
+        res_bboxes = per_img_res['pred_instances']
+        gts = dataset.get_data_info(idx)['instances']
+        analyze_per_img_dets(confusion_matrix, gts, res_bboxes, score_thr,
+                             tp_iou_thr, nms_iou_thr)
+        prog_bar.update()
+    return confusion_matrix
+
+
+def analyze_per_img_dets(confusion_matrix,
+                         gts,
+                         result,
+                         score_thr=0,
+                         tp_iou_thr=0.5,
+                         nms_iou_thr=None):
+    """Analyze detection results on each image.
+
+    Args:
+        confusion_matrix (ndarray): The confusion matrix,
+            has shape (num_classes + 1, num_classes + 1).
+        gt_bboxes (ndarray): Ground truth bboxes, has shape (num_gt, 4).
+        gt_labels (ndarray): Ground truth labels, has shape (num_gt).
+        result (ndarray): Detection results, has shape
+            (num_classes, num_bboxes, 5).
+        score_thr (float): Score threshold to filter bboxes.
+            Default: 0.
+        tp_iou_thr (float): IoU threshold to be considered as matched.
+            Default: 0.5.
+        nms_iou_thr (float|optional): nms IoU threshold, the detection results
+            have done nms in the detector, only applied when users want to
+            change the nms IoU threshold. Default: None.
+    """
+    true_positives = np.zeros(len(gts))
+    gt_bboxes = []
+    gt_labels = []
+    for gt in gts:
+        gt_bboxes.append(gt['bbox'])
+        gt_labels.append(gt['bbox_label'])
+
+    gt_bboxes = np.array(gt_bboxes)
+    gt_labels = np.array(gt_labels)
+
+    unique_label = np.unique(result['labels'].numpy())
+
+    for det_label in unique_label:
+        mask = (result['labels'] == det_label)
+        det_bboxes = result['bboxes'][mask].numpy()
+        det_scores = result['scores'][mask].numpy()
+
+        if nms_iou_thr:
+            det_bboxes, _ = nms(
+                det_bboxes, det_scores, nms_iou_thr, score_threshold=score_thr)
+        ious = bbox_overlaps(det_bboxes[:, :4], gt_bboxes)
+        for i, score in enumerate(det_scores):
+            det_match = 0
+            if score >= score_thr:
+                for j, gt_label in enumerate(gt_labels):
+                    if ious[i, j] >= tp_iou_thr:
+                        det_match += 1
+                        if gt_label == det_label:
+                            true_positives[j] += 1  # TP
+                        confusion_matrix[gt_label, det_label] += 1
+                if det_match == 0:  # BG FP
+                    confusion_matrix[-1, det_label] += 1
+    for num_tp, gt_label in zip(true_positives, gt_labels):
+        if num_tp == 0:  # FN
+            confusion_matrix[gt_label, -1] += 1
+
+
+def plot_confusion_matrix(confusion_matrix,
+                          labels,
+                          save_dir=None,
+                          show=True,
+                          title='Normalized Confusion Matrix',
+                          color_theme='plasma'):
+    """Draw confusion matrix with matplotlib.
+
+    Args:
+        confusion_matrix (ndarray): The confusion matrix.
+        labels (list[str]): List of class names.
+        save_dir (str|optional): If set, save the confusion matrix plot to the
+            given path. Default: None.
+        show (bool): Whether to show the plot. Default: True.
+        title (str): Title of the plot. Default: `Normalized Confusion Matrix`.
+        color_theme (str): Theme of the matrix color map. Default: `plasma`.
+    """
+    # normalize the confusion matrix
+    per_label_sums = confusion_matrix.sum(axis=1)[:, np.newaxis]
+    confusion_matrix = \
+        confusion_matrix.astype(np.float32) / per_label_sums * 100
+
+    num_classes = len(labels)
+    fig, ax = plt.subplots(
+        figsize=(0.5 * num_classes, 0.5 * num_classes * 0.8), dpi=180)
+    cmap = plt.get_cmap(color_theme)
+    im = ax.imshow(confusion_matrix, cmap=cmap)
+    plt.colorbar(mappable=im, ax=ax)
+
+    title_font = {'weight': 'bold', 'size': 12}
+    ax.set_title(title, fontdict=title_font)
+    label_font = {'size': 10}
+    plt.ylabel('Ground Truth Label', fontdict=label_font)
+    plt.xlabel('Prediction Label', fontdict=label_font)
+
+    # draw locator
+    xmajor_locator = MultipleLocator(1)
+    xminor_locator = MultipleLocator(0.5)
+    ax.xaxis.set_major_locator(xmajor_locator)
+    ax.xaxis.set_minor_locator(xminor_locator)
+    ymajor_locator = MultipleLocator(1)
+    yminor_locator = MultipleLocator(0.5)
+    ax.yaxis.set_major_locator(ymajor_locator)
+    ax.yaxis.set_minor_locator(yminor_locator)
+
+    # draw grid
+    ax.grid(True, which='minor', linestyle='-')
+
+    # draw label
+    ax.set_xticks(np.arange(num_classes))
+    ax.set_yticks(np.arange(num_classes))
+    ax.set_xticklabels(labels)
+    ax.set_yticklabels(labels)
+
+    ax.tick_params(
+        axis='x', bottom=False, top=True, labelbottom=False, labeltop=True)
+    plt.setp(
+        ax.get_xticklabels(), rotation=45, ha='left', rotation_mode='anchor')
+
+    # draw confution matrix value
+    for i in range(num_classes):
+        for j in range(num_classes):
+            ax.text(
+                j,
+                i,
+                '{}%'.format(
+                    int(confusion_matrix[
+                        i,
+                        j]) if not np.isnan(confusion_matrix[i, j]) else -1),
+                ha='center',
+                va='center',
+                color='w',
+                size=7)
+
+    ax.set_ylim(len(confusion_matrix) - 0.5, -0.5)  # matplotlib>3.1.1
+
+    fig.tight_layout()
+    if save_dir is not None:
+        plt.savefig(
+            os.path.join(save_dir, 'confusion_matrix.png'), format='png')
+    if show:
+        plt.show()
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    results = load(args.prediction_path)
+
+    if not os.path.exists(args.save_dir):
+        os.makedirs(args.save_dir)
+
+    dataset = DATASETS.build(cfg.test_dataloader.dataset)
+
+    confusion_matrix = calculate_confusion_matrix(dataset, results,
+                                                  args.score_thr,
+                                                  args.nms_iou_thr,
+                                                  args.tp_iou_thr)
+    plot_confusion_matrix(
+        confusion_matrix,
+        dataset.metainfo['classes'] + ('background', ),
+        save_dir=args.save_dir,
+        show=args.show,
+        color_theme=args.color_theme)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/eval_metric.py b/mmde/mmdet/.mim/tools/analysis_tools/eval_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..450828735cacd79f59c4ab796301737b30adff1c
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/eval_metric.py
@@ -0,0 +1,50 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import mmengine
+from mmengine import Config, DictAction
+from mmengine.evaluator import Evaluator
+from mmengine.registry import init_default_scope
+
+from mmdet.registry import DATASETS
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Evaluate metric of the '
+                                     'results saved in pkl format')
+    parser.add_argument('config', help='Config of the model')
+    parser.add_argument('pkl_results', help='Results in pickle format')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    dataset = DATASETS.build(cfg.test_dataloader.dataset)
+    predictions = mmengine.load(args.pkl_results)
+
+    evaluator = Evaluator(cfg.val_evaluator)
+    evaluator.dataset_meta = dataset.metainfo
+    eval_results = evaluator.offline_evaluate(predictions)
+    print(eval_results)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/fuse_results.py b/mmde/mmdet/.mim/tools/analysis_tools/fuse_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f35123cbbb7b3ed9403c870505c022142422037
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/fuse_results.py
@@ -0,0 +1,142 @@
+import argparse
+
+from mmengine.fileio import dump, load
+from mmengine.logging import print_log
+from mmengine.utils import ProgressBar
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+
+from mmdet.models.utils import weighted_boxes_fusion
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Fusion image \
+        prediction results using Weighted \
+        Boxes Fusion from multiple models.')
+    parser.add_argument(
+        'pred-results',
+        type=str,
+        nargs='+',
+        help='files of prediction results \
+                    from multiple models, json format.')
+    parser.add_argument('--annotation', type=str, help='annotation file path')
+    parser.add_argument(
+        '--weights',
+        type=float,
+        nargs='*',
+        default=None,
+        help='weights for each model, '
+        'remember to correspond to the above prediction path.')
+    parser.add_argument(
+        '--fusion-iou-thr',
+        type=float,
+        default=0.55,
+        help='IoU value for boxes to be a match in wbf.')
+    parser.add_argument(
+        '--skip-box-thr',
+        type=float,
+        default=0.0,
+        help='exclude boxes with score lower than this variable in wbf.')
+    parser.add_argument(
+        '--conf-type',
+        type=str,
+        default='avg',
+        help='how to calculate confidence in weighted boxes in wbf.')
+    parser.add_argument(
+        '--eval-single',
+        action='store_true',
+        help='whether evaluate each single model result.')
+    parser.add_argument(
+        '--save-fusion-results',
+        action='store_true',
+        help='whether save fusion result')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='outputs',
+        help='Output directory of images or prediction results.')
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    assert len(args.models_name) == len(args.pred_results), \
+        'the quantities of model names and prediction results are not equal'
+
+    cocoGT = COCO(args.annotation)
+
+    predicts_raw = []
+
+    models_name = ['model_' + str(i) for i in range(len(args.pred_results))]
+
+    for model_name, path in \
+            zip(models_name, args.pred_results):
+        pred = load(path)
+        predicts_raw.append(pred)
+
+        if args.eval_single:
+            print_log(f'Evaluate {model_name}...')
+            cocoDt = cocoGT.loadRes(pred)
+            coco_eval = COCOeval(cocoGT, cocoDt, iouType='bbox')
+            coco_eval.evaluate()
+            coco_eval.accumulate()
+            coco_eval.summarize()
+
+    predict = {
+        str(image_id): {
+            'bboxes_list': [[] for _ in range(len(predicts_raw))],
+            'scores_list': [[] for _ in range(len(predicts_raw))],
+            'labels_list': [[] for _ in range(len(predicts_raw))]
+        }
+        for image_id in cocoGT.getImgIds()
+    }
+
+    for i, pred_single in enumerate(predicts_raw):
+        for pred in pred_single:
+            p = predict[str(pred['image_id'])]
+            p['bboxes_list'][i].append(pred['bbox'])
+            p['scores_list'][i].append(pred['score'])
+            p['labels_list'][i].append(pred['category_id'])
+
+    result = []
+    prog_bar = ProgressBar(len(predict))
+    for image_id, res in predict.items():
+        bboxes, scores, labels = weighted_boxes_fusion(
+            res['bboxes_list'],
+            res['scores_list'],
+            res['labels_list'],
+            weights=args.weights,
+            iou_thr=args.fusion_iou_thr,
+            skip_box_thr=args.skip_box_thr,
+            conf_type=args.conf_type)
+
+        for bbox, score, label in zip(bboxes, scores, labels):
+            result.append({
+                'bbox': bbox.numpy().tolist(),
+                'category_id': int(label),
+                'image_id': int(image_id),
+                'score': float(score)
+            })
+
+        prog_bar.update()
+
+    if args.save_fusion_results:
+        out_file = args.out_dir + '/fusion_results.json'
+        dump(result, file=out_file)
+        print_log(
+            f'Fusion results have been saved to {out_file}.', logger='current')
+
+    print_log('Evaluate fusion results using wbf...')
+    cocoDt = cocoGT.loadRes(result)
+    coco_eval = COCOeval(cocoGT, cocoDt, iouType='bbox')
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/get_flops.py b/mmde/mmdet/.mim/tools/analysis_tools/get_flops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a696ddcf7420623327864b86c55777f2ab7f9fad
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/get_flops.py
@@ -0,0 +1,140 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import tempfile
+from functools import partial
+from pathlib import Path
+
+import numpy as np
+import torch
+from mmengine.config import Config, DictAction
+from mmengine.logging import MMLogger
+from mmengine.model import revert_sync_batchnorm
+from mmengine.registry import init_default_scope
+from mmengine.runner import Runner
+from mmengine.utils import digit_version
+
+from mmdet.registry import MODELS
+
+try:
+    from mmengine.analysis import get_model_complexity_info
+    from mmengine.analysis.print_helper import _format_size
+except ImportError:
+    raise ImportError('Please upgrade mmengine >= 0.6.0')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Get a detector flops')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--num-images',
+        type=int,
+        default=100,
+        help='num images of calculate model flops')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def inference(args, logger):
+    if digit_version(torch.__version__) < digit_version('1.12'):
+        logger.warning(
+            'Some config files, such as configs/yolact and configs/detectors,'
+            'may have compatibility issues with torch.jit when torch<1.12. '
+            'If you want to calculate flops for these models, '
+            'please make sure your pytorch version is >=1.12.')
+
+    config_name = Path(args.config)
+    if not config_name.exists():
+        logger.error(f'{config_name} not found.')
+
+    cfg = Config.fromfile(args.config)
+    cfg.val_dataloader.batch_size = 1
+    cfg.work_dir = tempfile.TemporaryDirectory().name
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    # TODO: The following usage is temporary and not safe
+    # use hard code to convert mmSyncBN to SyncBN. This is a known
+    # bug in mmengine, mmSyncBN requires a distributed environment，
+    # this question involves models like configs/strong_baselines
+    if hasattr(cfg, 'head_norm_cfg'):
+        cfg['head_norm_cfg'] = dict(type='SyncBN', requires_grad=True)
+        cfg['model']['roi_head']['bbox_head']['norm_cfg'] = dict(
+            type='SyncBN', requires_grad=True)
+        cfg['model']['roi_head']['mask_head']['norm_cfg'] = dict(
+            type='SyncBN', requires_grad=True)
+
+    result = {}
+    avg_flops = []
+    data_loader = Runner.build_dataloader(cfg.val_dataloader)
+    model = MODELS.build(cfg.model)
+    if torch.cuda.is_available():
+        model = model.cuda()
+    model = revert_sync_batchnorm(model)
+    model.eval()
+    _forward = model.forward
+
+    for idx, data_batch in enumerate(data_loader):
+        if idx == args.num_images:
+            break
+        data = model.data_preprocessor(data_batch)
+        result['ori_shape'] = data['data_samples'][0].ori_shape
+        result['pad_shape'] = data['data_samples'][0].pad_shape
+        if hasattr(data['data_samples'][0], 'batch_input_shape'):
+            result['pad_shape'] = data['data_samples'][0].batch_input_shape
+        model.forward = partial(_forward, data_samples=data['data_samples'])
+        outputs = get_model_complexity_info(
+            model,
+            None,
+            inputs=data['inputs'],
+            show_table=False,
+            show_arch=False)
+        avg_flops.append(outputs['flops'])
+        params = outputs['params']
+        result['compute_type'] = 'dataloader: load a picture from the dataset'
+    del data_loader
+
+    mean_flops = _format_size(int(np.average(avg_flops)))
+    params = _format_size(params)
+    result['flops'] = mean_flops
+    result['params'] = params
+
+    return result
+
+
+def main():
+    args = parse_args()
+    logger = MMLogger.get_instance(name='MMLogger')
+    result = inference(args, logger)
+    split_line = '=' * 30
+    ori_shape = result['ori_shape']
+    pad_shape = result['pad_shape']
+    flops = result['flops']
+    params = result['params']
+    compute_type = result['compute_type']
+
+    if pad_shape != ori_shape:
+        print(f'{split_line}\nUse size divisor set input shape '
+              f'from {ori_shape} to {pad_shape}')
+    print(f'{split_line}\nCompute type: {compute_type}\n'
+          f'Input shape: {pad_shape}\nFlops: {flops}\n'
+          f'Params: {params}\n{split_line}')
+    print('!!!Please be cautious if you use the results in papers. '
+          'You may need to check if all ops are supported and verify '
+          'that the flops computation is correct.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/mot/browse_dataset.py b/mmde/mmdet/.mim/tools/analysis_tools/mot/browse_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b3722f2d08fc696fcc4d13cf6d6f46169d613bf
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/mot/browse_dataset.py
@@ -0,0 +1,85 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+
+import mmengine
+from mmengine import Config, DictAction
+from mmengine.registry import init_default_scope
+
+from mmdet.registry import DATASETS, VISUALIZERS
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Browse a dataset')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--output-dir',
+        default=None,
+        type=str,
+        help='If there is no display interface, you can save it')
+    parser.add_argument('--show', default=True, action='store_true')
+    parser.add_argument(
+        '--show-interval',
+        type=float,
+        default=2,
+        help='the interval of show (s)')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    dataset = DATASETS.build(cfg.train_dataloader.dataset)
+
+    visualizer = VISUALIZERS.build(cfg.visualizer)
+    visualizer.dataset_meta = dataset.metainfo
+
+    progress_bar = mmengine.ProgressBar(len(dataset))
+    for idx, item in enumerate(dataset):  # inputs data_samples
+        data_sample = item['data_samples']
+        input = item['inputs']
+        for img_idx in range(len(data_sample)):
+            img_data_sample = data_sample[img_idx]
+            img_path = img_data_sample.img_path
+            img = input[img_idx].permute(1, 2, 0).numpy()
+            out_file = osp.join(
+                args.output_dir,
+                str(idx).zfill(6),
+                f'img_{img_idx}.jpg') if args.output_dir is not None else None
+            img = img[..., [2, 1, 0]]  # bgr to rgb
+            visualizer.add_datasample(
+                osp.basename(img_path),
+                img,
+                data_sample=img_data_sample,
+                draw_pred=False,
+                show=args.show,
+                wait_time=args.show_interval,
+                out_file=out_file)
+            # Record file path mapping.
+            if args.output_dir is not None:
+                with open(
+                        osp.join(args.output_dir,
+                                 str(idx).zfill(6), 'info.txt'), 'a') as f:
+                    f.write(f'The source filepath of img_{img_idx}.jpg'
+                            f'is `{img_path}`.\n')
+        progress_bar.update()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/mot/dist_mot_search.sh b/mmde/mmdet/.mim/tools/analysis_tools/mot/dist_mot_search.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a1991c132b22f71bd22c90f4c1f1c274ae3a5388
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/mot/dist_mot_search.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-29500}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/mot_param_search.py $CONFIG --launcher pytorch ${@:3}
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/mot/mot_error_visualize.py b/mmde/mmdet/.mim/tools/analysis_tools/mot/mot_error_visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b3d3eebb45f5808cd9339174f87b753b40d4a13
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/mot/mot_error_visualize.py
@@ -0,0 +1,211 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+import re
+
+import mmcv
+import motmetrics as mm
+import numpy as np
+import pandas as pd
+from mmengine import Config
+from mmengine.logging import print_log
+from mmengine.registry import init_default_scope
+from torch.utils.data import Dataset
+
+from mmdet.registry import DATASETS
+from mmdet.utils import imshow_mot_errors
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='visualize errors for multiple object tracking')
+    parser.add_argument('config', help='path of the config file')
+    parser.add_argument(
+        '--result-dir', help='directory of the inference result')
+    parser.add_argument(
+        '--output-dir',
+        help='directory where painted images or videos will be saved')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='whether to show the results on the fly')
+    parser.add_argument(
+        '--fps', type=int, default=3, help='FPS of the output video')
+    parser.add_argument(
+        '--backend',
+        type=str,
+        choices=['cv2', 'plt'],
+        default='cv2',
+        help='backend of visualization')
+    args = parser.parse_args()
+    return args
+
+
+def compare_res_gts(results_dir: str, dataset: Dataset, video_name: str):
+    """Evaluate the results of the video.
+
+    Args:
+        results_dir (str): the directory of the MOT results.
+        dataset (Dataset): MOT dataset of the video to be evaluated.
+        video_name (str): Name of the video to be evaluated.
+
+    Returns:
+        tuple: (acc, res, gt), acc contains the results of MOT metrics,
+        res is the results of inference and gt is the ground truth.
+    """
+    if 'half-train' in dataset.ann_file:
+        gt_file = osp.join(dataset.data_prefix['img_path'],
+                           f'{video_name}/gt/gt_half-train.txt')
+        gt = mm.io.loadtxt(gt_file)
+        gt.index = gt.index.set_levels(
+            pd.factorize(gt.index.levels[0])[0] + 1, level=0)
+    elif 'half-val' in dataset.ann_file:
+        gt_file = osp.join(dataset.data_prefix['img_path'],
+                           f'{video_name}/gt/gt_half-val.txt')
+        gt = mm.io.loadtxt(gt_file)
+        gt.index = gt.index.set_levels(
+            pd.factorize(gt.index.levels[0])[0] + 1, level=0)
+    else:
+        gt_file = osp.join(dataset.data_prefix['img_path'],
+                           f'{video_name}/gt/gt.txt')
+        gt = mm.io.loadtxt(gt_file)
+        gt.index = gt.index.set_levels(
+            pd.factorize(gt.index.levels[0])[0] + 1, level=0)
+    res_file = osp.join(results_dir, f'{video_name}.txt')
+    res = mm.io.loadtxt(res_file)
+    ini_file = osp.join(dataset.data_prefix['img_path'],
+                        f'{video_name}/seqinfo.ini')
+    if osp.exists(ini_file):
+        acc, _ = mm.utils.CLEAR_MOT_M(gt, res, ini_file)
+    else:
+        acc = mm.utils.compare_to_groundtruth(gt, res)
+
+    return acc, res, gt
+
+
+def main():
+    args = parse_args()
+
+    assert args.show or args.out_dir, \
+        ('Please specify at least one operation (show the results '
+         '/ save the results) with the argument "--show" or "--out-dir"')
+
+    if args.out_dir is not None:
+        os.makedirs(args.out_dir, exist_ok=True)
+
+    print_log('This script visualizes the error for multiple object tracking. '
+              'By Default, the red bounding box denotes false positive, '
+              'the yellow bounding box denotes the false negative '
+              'and the blue bounding box denotes ID switch.')
+
+    cfg = Config.fromfile(args.config)
+
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+    dataset = DATASETS.build(cfg.val_dataloader.dataset)
+
+    # create index from frame_id to filename
+    filenames_dict = dict()
+    for i in range(len(dataset)):
+        video_info = dataset.get_data_info(i)
+        # the `data_info['file_name']` usually has the same format
+        # with "MOT17-09-DPM/img1/000003.jpg"
+        # split with both '\' and '/' to be compatible with different OS.
+        for data_info in video_info['images']:
+            split_path = re.split(r'[\\/]', data_info['file_name'])
+            video_name = split_path[-3]
+            frame_id = int(data_info['frame_id'] + 1)
+            if video_name not in filenames_dict:
+                filenames_dict[video_name] = dict()
+        # the data_info['img_path'] usually has the same format
+        # with `img_path_prefix + "MOT17-09-DPM/img1/000003.jpg"`
+            filenames_dict[video_name][frame_id] = data_info['img_path']
+    video_names = tuple(filenames_dict.keys())
+
+    for video_name in video_names:
+        print_log(f'Start processing video {video_name}')
+
+        acc, res, gt = compare_res_gts(args.result_dir, dataset, video_name)
+
+        frames_id_list = sorted(
+            list(set(acc.mot_events.index.get_level_values(0))))
+        for frame_id in frames_id_list:
+            # events in the current frame
+            events = acc.mot_events.xs(frame_id)
+            cur_res = res.loc[frame_id] if frame_id in res.index else None
+            cur_gt = gt.loc[frame_id] if frame_id in gt.index else None
+            # path of image
+            img = filenames_dict[video_name][frame_id]
+            fps = events[events.Type == 'FP']
+            fns = events[events.Type == 'MISS']
+            idsws = events[events.Type == 'SWITCH']
+
+            bboxes, ids, error_types = [], [], []
+            for fp_index in fps.index:
+                hid = events.loc[fp_index].HId
+                bboxes.append([
+                    cur_res.loc[hid].X, cur_res.loc[hid].Y,
+                    cur_res.loc[hid].X + cur_res.loc[hid].Width,
+                    cur_res.loc[hid].Y + cur_res.loc[hid].Height,
+                    cur_res.loc[hid].Confidence
+                ])
+                ids.append(hid)
+                # error_type = 0 denotes false positive error
+                error_types.append(0)
+            for fn_index in fns.index:
+                oid = events.loc[fn_index].OId
+                bboxes.append([
+                    cur_gt.loc[oid].X, cur_gt.loc[oid].Y,
+                    cur_gt.loc[oid].X + cur_gt.loc[oid].Width,
+                    cur_gt.loc[oid].Y + cur_gt.loc[oid].Height,
+                    cur_gt.loc[oid].Confidence
+                ])
+                ids.append(-1)
+                # error_type = 1 denotes false negative error
+                error_types.append(1)
+            for idsw_index in idsws.index:
+                hid = events.loc[idsw_index].HId
+                bboxes.append([
+                    cur_res.loc[hid].X, cur_res.loc[hid].Y,
+                    cur_res.loc[hid].X + cur_res.loc[hid].Width,
+                    cur_res.loc[hid].Y + cur_res.loc[hid].Height,
+                    cur_res.loc[hid].Confidence
+                ])
+                ids.append(hid)
+                # error_type = 2 denotes id switch
+                error_types.append(2)
+            if len(bboxes) == 0:
+                bboxes = np.zeros((0, 5), dtype=np.float32)
+            else:
+                bboxes = np.asarray(bboxes, dtype=np.float32)
+            ids = np.asarray(ids, dtype=np.int32)
+            error_types = np.asarray(error_types, dtype=np.int32)
+            imshow_mot_errors(
+                img,
+                bboxes,
+                ids,
+                error_types,
+                show=args.show,
+                out_file=osp.join(args.out_dir,
+                                  f'{video_name}/{frame_id:06d}.jpg')
+                if args.out_dir else None,
+                backend=args.backend)
+
+        print_log(f'Done! Visualization images are saved in '
+                  f'\'{args.out_dir}/{video_name}\'')
+
+        mmcv.frames2video(
+            f'{args.out_dir}/{video_name}',
+            f'{args.out_dir}/{video_name}.mp4',
+            fps=args.fps,
+            fourcc='mp4v',
+            start=frames_id_list[0],
+            end=frames_id_list[-1],
+            show_progress=False)
+        print_log(
+            f'Done! Visualization video is saved as '
+            f'\'{args.out_dir}/{video_name}.mp4\' with a FPS of {args.fps}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/mot/mot_param_search.py b/mmde/mmdet/.mim/tools/analysis_tools/mot/mot_param_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b531d181cfcf89bdc72ea298f7aa29128118e8d
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/mot/mot_param_search.py
@@ -0,0 +1,155 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+from itertools import product
+
+from mmengine.config import Config, DictAction
+from mmengine.dist import get_dist_info
+from mmengine.logging import MMLogger, print_log
+from mmengine.model import is_model_wrapper
+from mmengine.registry import init_default_scope
+from mmengine.runner import Runner
+from mmengine.runner.checkpoint import load_checkpoint
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet tracking test (and eval) a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('--checkpoint', help='checkpoint file')
+    parser.add_argument('--detector', help='detection checkpoint file')
+    parser.add_argument('--reid', help='reid checkpoint file')
+    parser.add_argument(
+        '--work-dir',
+        help='the directory to save the file containing evaluation metrics')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def get_search_params(cfg, search_params=None, prefix=None, logger=None):
+    if search_params is None:
+        search_params = dict()
+    for k, v in cfg.items():
+        if prefix is not None:
+            entire_k = prefix + '.' + k
+        else:
+            entire_k = k
+        if isinstance(v, list):
+            print_log(f'search `{entire_k}` in {v}.', logger)
+            search_params[entire_k] = v
+        if isinstance(v, dict):
+            search_params = get_search_params(v, search_params, entire_k,
+                                              logger)
+    return search_params
+
+
+def main():
+
+    args = parse_args()
+
+    # do not init the default scope here because it will be init in the runner
+
+    # load config
+    cfg = Config.fromfile(args.config)
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    cfg.load_from = args.checkpoint
+
+    logger = MMLogger.get_instance(name='ParamsSearcher', logger_name='Logger')
+    # get all cases
+    search_params = get_search_params(cfg.model.tracker, logger=logger)
+    search_params_names = tuple(search_params.keys())
+    all_search_cases = []
+    for values in product(*search_params.values()):
+        search = dict()
+        for k, v in zip(search_params_names, values):
+            search[k] = v
+        all_search_cases.append(search)
+
+    print_log(f'Totally {len(all_search_cases)} cases.', logger)
+
+    search_metrics = []
+    metrics_types = [cfg.test_evaluator.metric] if isinstance(
+        cfg.test_evaluator.metric, str) else cfg.test_evaluator.metric
+    if 'HOTA' in metrics_types:
+        search_metrics.extend(['HOTA', 'AssA', 'DetA'])
+    if 'CLEAR' in metrics_types:
+        search_metrics.extend(
+            ['MOTA', 'MOTP', 'IDSW', 'TP', 'FN', 'FP', 'Frag', 'MT', 'ML'])
+    if 'Identity' in metrics_types:
+        search_metrics.extend(['IDF1', 'IDTP', 'IDFN', 'IDFP', 'IDP', 'IDR'])
+    print_log(f'Record {search_metrics}.', logger)
+
+    runner = Runner.from_cfg(cfg)
+    if is_model_wrapper(runner.model):
+        model = runner.model.module
+    else:
+        model = runner.model
+
+    if args.detector:
+        assert not (args.checkpoint and args.detector), \
+            'Error: checkpoint and detector checkpoint cannot both exist'
+        load_checkpoint(model.detector, args.detector)
+
+    if args.reid:
+        assert (args.checkpoint is not None) or (args.detector is not None), \
+            'Error: checkpoint and detector checkpoint cannot both not exist'
+        assert not (args.checkpoint and args.reid), \
+            'Error: checkpoint and reid checkpoint cannot both exist'
+        load_checkpoint(model.reid, args.reid)
+
+    for case in all_search_cases:
+        for name, value in case.items():
+            if hasattr(runner.model, 'module'):
+                setattr(runner.model.module.tracker, name, value)
+            else:
+                setattr(runner.model.tracker, name, value)
+        runner.test()
+        rank, _ = get_dist_info()
+        if rank == 0:
+            _records = []
+            for metric in search_metrics:
+                res = runner.message_hub.get_scalar(
+                    'test/motchallenge-metric/' + metric).current()
+                if isinstance(res, float):
+                    _records.append(f'{res:.3f}')
+                else:
+                    _records.append(f'{res}')
+            print_log(f'-------------- {case}: {_records} --------------',
+                      logger)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/mot/slurm_mot_search.sh b/mmde/mmdet/.mim/tools/analysis_tools/mot/slurm_mot_search.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d54d7a68e1f729dfaef1946661f351e292ec8520
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/mot/slurm_mot_search.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+GPUS=$4
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-2}
+PY_ARGS=${@:5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u $(dirname "$0")/mot_param_search.py ${CONFIG} --launcher="slurm" ${PY_ARGS}
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/optimize_anchors.py b/mmde/mmdet/.mim/tools/analysis_tools/optimize_anchors.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b6a02b86443c4cbdbd0787ca3277534f3580806
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/optimize_anchors.py
@@ -0,0 +1,382 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Optimize anchor settings on a specific dataset.
+
+This script provides two method to optimize YOLO anchors including k-means
+anchor cluster and differential evolution. You can use ``--algorithm k-means``
+and ``--algorithm differential_evolution`` to switch two method.
+
+Example:
+    Use k-means anchor cluster::
+
+        python tools/analysis_tools/optimize_anchors.py ${CONFIG} \
+        --algorithm k-means --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \
+        --output-dir ${OUTPUT_DIR}
+    Use differential evolution to optimize anchors::
+
+        python tools/analysis_tools/optimize_anchors.py ${CONFIG} \
+        --algorithm differential_evolution \
+        --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \
+        --output-dir ${OUTPUT_DIR}
+"""
+import argparse
+import os.path as osp
+
+import numpy as np
+import torch
+from mmengine.config import Config
+from mmengine.fileio import dump
+from mmengine.logging import MMLogger
+from mmengine.registry import init_default_scope
+from mmengine.utils import ProgressBar
+from scipy.optimize import differential_evolution
+
+from mmdet.registry import DATASETS
+from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps,
+                                   bbox_xyxy_to_cxcywh)
+from mmdet.utils import replace_cfg_vals, update_data_root
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Optimize anchor parameters.')
+    parser.add_argument('config', help='Train config file path.')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for calculating.')
+    parser.add_argument(
+        '--input-shape',
+        type=int,
+        nargs='+',
+        default=[608, 608],
+        help='input image size')
+    parser.add_argument(
+        '--algorithm',
+        default='differential_evolution',
+        help='Algorithm used for anchor optimizing.'
+        'Support k-means and differential_evolution for YOLO.')
+    parser.add_argument(
+        '--iters',
+        default=1000,
+        type=int,
+        help='Maximum iterations for optimizer.')
+    parser.add_argument(
+        '--output-dir',
+        default=None,
+        type=str,
+        help='Path to save anchor optimize result.')
+
+    args = parser.parse_args()
+    return args
+
+
+class BaseAnchorOptimizer:
+    """Base class for anchor optimizer.
+
+    Args:
+        dataset (obj:`Dataset`): Dataset object.
+        input_shape (list[int]): Input image shape of the model.
+            Format in [width, height].
+        logger (obj:`logging.Logger`): The logger for logging.
+        device (str, optional): Device used for calculating.
+            Default: 'cuda:0'
+        out_dir (str, optional): Path to save anchor optimize result.
+            Default: None
+    """
+
+    def __init__(self,
+                 dataset,
+                 input_shape,
+                 logger,
+                 device='cuda:0',
+                 out_dir=None):
+        self.dataset = dataset
+        self.input_shape = input_shape
+        self.logger = logger
+        self.device = device
+        self.out_dir = out_dir
+        bbox_whs, img_shapes = self.get_whs_and_shapes()
+        ratios = img_shapes.max(1, keepdims=True) / np.array([input_shape])
+
+        # resize to input shape
+        self.bbox_whs = bbox_whs / ratios
+
+    def get_whs_and_shapes(self):
+        """Get widths and heights of bboxes and shapes of images.
+
+        Returns:
+            tuple[np.ndarray]: Array of bbox shapes and array of image
+            shapes with shape (num_bboxes, 2) in [width, height] format.
+        """
+        self.logger.info('Collecting bboxes from annotation...')
+        bbox_whs = []
+        img_shapes = []
+        prog_bar = ProgressBar(len(self.dataset))
+        for idx in range(len(self.dataset)):
+            data_info = self.dataset.get_data_info(idx)
+            img_shape = np.array([data_info['width'], data_info['height']])
+            gt_instances = data_info['instances']
+            for instance in gt_instances:
+                bbox = np.array(instance['bbox'])
+                wh = bbox[2:4] - bbox[0:2]
+                img_shapes.append(img_shape)
+                bbox_whs.append(wh)
+
+            prog_bar.update()
+        print('\n')
+        bbox_whs = np.array(bbox_whs)
+        img_shapes = np.array(img_shapes)
+        self.logger.info(f'Collected {bbox_whs.shape[0]} bboxes.')
+        return bbox_whs, img_shapes
+
+    def get_zero_center_bbox_tensor(self):
+        """Get a tensor of bboxes centered at (0, 0).
+
+        Returns:
+            Tensor: Tensor of bboxes with shape (num_bboxes, 4)
+            in [xmin, ymin, xmax, ymax] format.
+        """
+        whs = torch.from_numpy(self.bbox_whs).to(
+            self.device, dtype=torch.float32)
+        bboxes = bbox_cxcywh_to_xyxy(
+            torch.cat([torch.zeros_like(whs), whs], dim=1))
+        return bboxes
+
+    def optimize(self):
+        raise NotImplementedError
+
+    def save_result(self, anchors, path=None):
+        anchor_results = []
+        for w, h in anchors:
+            anchor_results.append([round(w), round(h)])
+        self.logger.info(f'Anchor optimize result:{anchor_results}')
+        if path:
+            json_path = osp.join(path, 'anchor_optimize_result.json')
+            dump(anchor_results, json_path)
+            self.logger.info(f'Result saved in {json_path}')
+
+
+class YOLOKMeansAnchorOptimizer(BaseAnchorOptimizer):
+    r"""YOLO anchor optimizer using k-means. Code refer to `AlexeyAB/darknet.
+    <https://github.com/AlexeyAB/darknet/blob/master/src/detector.c>`_.
+
+    Args:
+        num_anchors (int) : Number of anchors.
+        iters (int): Maximum iterations for k-means.
+    """
+
+    def __init__(self, num_anchors, iters, **kwargs):
+
+        super(YOLOKMeansAnchorOptimizer, self).__init__(**kwargs)
+        self.num_anchors = num_anchors
+        self.iters = iters
+
+    def optimize(self):
+        anchors = self.kmeans_anchors()
+        self.save_result(anchors, self.out_dir)
+
+    def kmeans_anchors(self):
+        self.logger.info(
+            f'Start cluster {self.num_anchors} YOLO anchors with K-means...')
+        bboxes = self.get_zero_center_bbox_tensor()
+        cluster_center_idx = torch.randint(
+            0, bboxes.shape[0], (self.num_anchors, )).to(self.device)
+
+        assignments = torch.zeros((bboxes.shape[0], )).to(self.device)
+        cluster_centers = bboxes[cluster_center_idx]
+        if self.num_anchors == 1:
+            cluster_centers = self.kmeans_maximization(bboxes, assignments,
+                                                       cluster_centers)
+            anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy()
+            anchors = sorted(anchors, key=lambda x: x[0] * x[1])
+            return anchors
+
+        prog_bar = ProgressBar(self.iters)
+        for i in range(self.iters):
+            converged, assignments = self.kmeans_expectation(
+                bboxes, assignments, cluster_centers)
+            if converged:
+                self.logger.info(f'K-means process has converged at iter {i}.')
+                break
+            cluster_centers = self.kmeans_maximization(bboxes, assignments,
+                                                       cluster_centers)
+            prog_bar.update()
+        print('\n')
+        avg_iou = bbox_overlaps(bboxes,
+                                cluster_centers).max(1)[0].mean().item()
+
+        anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy()
+        anchors = sorted(anchors, key=lambda x: x[0] * x[1])
+        self.logger.info(f'Anchor cluster finish. Average IOU: {avg_iou}')
+
+        return anchors
+
+    def kmeans_maximization(self, bboxes, assignments, centers):
+        """Maximization part of EM algorithm(Expectation-Maximization)"""
+        new_centers = torch.zeros_like(centers)
+        for i in range(centers.shape[0]):
+            mask = (assignments == i)
+            if mask.sum():
+                new_centers[i, :] = bboxes[mask].mean(0)
+        return new_centers
+
+    def kmeans_expectation(self, bboxes, assignments, centers):
+        """Expectation part of EM algorithm(Expectation-Maximization)"""
+        ious = bbox_overlaps(bboxes, centers)
+        closest = ious.argmax(1)
+        converged = (closest == assignments).all()
+        return converged, closest
+
+
+class YOLODEAnchorOptimizer(BaseAnchorOptimizer):
+    """YOLO anchor optimizer using differential evolution algorithm.
+
+    Args:
+        num_anchors (int) : Number of anchors.
+        iters (int): Maximum iterations for k-means.
+        strategy (str): The differential evolution strategy to use.
+            Should be one of:
+
+                - 'best1bin'
+                - 'best1exp'
+                - 'rand1exp'
+                - 'randtobest1exp'
+                - 'currenttobest1exp'
+                - 'best2exp'
+                - 'rand2exp'
+                - 'randtobest1bin'
+                - 'currenttobest1bin'
+                - 'best2bin'
+                - 'rand2bin'
+                - 'rand1bin'
+
+            Default: 'best1bin'.
+        population_size (int): Total population size of evolution algorithm.
+            Default: 15.
+        convergence_thr (float): Tolerance for convergence, the
+            optimizing stops when ``np.std(pop) <= abs(convergence_thr)
+            + convergence_thr * np.abs(np.mean(population_energies))``,
+            respectively. Default: 0.0001.
+        mutation (tuple[float]): Range of dithering randomly changes the
+            mutation constant. Default: (0.5, 1).
+        recombination (float): Recombination constant of crossover probability.
+            Default: 0.7.
+    """
+
+    def __init__(self,
+                 num_anchors,
+                 iters,
+                 strategy='best1bin',
+                 population_size=15,
+                 convergence_thr=0.0001,
+                 mutation=(0.5, 1),
+                 recombination=0.7,
+                 **kwargs):
+
+        super(YOLODEAnchorOptimizer, self).__init__(**kwargs)
+
+        self.num_anchors = num_anchors
+        self.iters = iters
+        self.strategy = strategy
+        self.population_size = population_size
+        self.convergence_thr = convergence_thr
+        self.mutation = mutation
+        self.recombination = recombination
+
+    def optimize(self):
+        anchors = self.differential_evolution()
+        self.save_result(anchors, self.out_dir)
+
+    def differential_evolution(self):
+        bboxes = self.get_zero_center_bbox_tensor()
+
+        bounds = []
+        for i in range(self.num_anchors):
+            bounds.extend([(0, self.input_shape[0]), (0, self.input_shape[1])])
+
+        result = differential_evolution(
+            func=self.avg_iou_cost,
+            bounds=bounds,
+            args=(bboxes, ),
+            strategy=self.strategy,
+            maxiter=self.iters,
+            popsize=self.population_size,
+            tol=self.convergence_thr,
+            mutation=self.mutation,
+            recombination=self.recombination,
+            updating='immediate',
+            disp=True)
+        self.logger.info(
+            f'Anchor evolution finish. Average IOU: {1 - result.fun}')
+        anchors = [(w, h) for w, h in zip(result.x[::2], result.x[1::2])]
+        anchors = sorted(anchors, key=lambda x: x[0] * x[1])
+        return anchors
+
+    @staticmethod
+    def avg_iou_cost(anchor_params, bboxes):
+        assert len(anchor_params) % 2 == 0
+        anchor_whs = torch.tensor(
+            [[w, h]
+             for w, h in zip(anchor_params[::2], anchor_params[1::2])]).to(
+                 bboxes.device, dtype=bboxes.dtype)
+        anchor_boxes = bbox_cxcywh_to_xyxy(
+            torch.cat([torch.zeros_like(anchor_whs), anchor_whs], dim=1))
+        ious = bbox_overlaps(bboxes, anchor_boxes)
+        max_ious, _ = ious.max(1)
+        cost = 1 - max_ious.mean().item()
+        return cost
+
+
+def main():
+    logger = MMLogger.get_current_instance()
+    args = parse_args()
+    cfg = args.config
+    cfg = Config.fromfile(cfg)
+    init_default_scope(cfg.get('default_scope', 'mmdet'))
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    input_shape = args.input_shape
+    assert len(input_shape) == 2
+
+    anchor_type = cfg.model.bbox_head.anchor_generator.type
+    assert anchor_type == 'YOLOAnchorGenerator', \
+        f'Only support optimize YOLOAnchor, but get {anchor_type}.'
+
+    base_sizes = cfg.model.bbox_head.anchor_generator.base_sizes
+    num_anchors = sum([len(sizes) for sizes in base_sizes])
+
+    train_data_cfg = cfg.train_dataloader
+    while 'dataset' in train_data_cfg:
+        train_data_cfg = train_data_cfg['dataset']
+    dataset = DATASETS.build(train_data_cfg)
+
+    if args.algorithm == 'k-means':
+        optimizer = YOLOKMeansAnchorOptimizer(
+            dataset=dataset,
+            input_shape=input_shape,
+            device=args.device,
+            num_anchors=num_anchors,
+            iters=args.iters,
+            logger=logger,
+            out_dir=args.output_dir)
+    elif args.algorithm == 'differential_evolution':
+        optimizer = YOLODEAnchorOptimizer(
+            dataset=dataset,
+            input_shape=input_shape,
+            device=args.device,
+            num_anchors=num_anchors,
+            iters=args.iters,
+            logger=logger,
+            out_dir=args.output_dir)
+    else:
+        raise NotImplementedError(
+            f'Only support k-means and differential_evolution, '
+            f'but get {args.algorithm}')
+
+    optimizer.optimize()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/robustness_eval.py b/mmde/mmdet/.mim/tools/analysis_tools/robustness_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..56e534176006f4b710d936cbe872755dccc0a2c7
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/robustness_eval.py
@@ -0,0 +1,263 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from argparse import ArgumentParser
+
+import numpy as np
+from mmengine.fileio import load
+
+
+def print_coco_results(results):
+
+    def _print(result, ap=1, iouThr=None, areaRng='all', maxDets=100):
+        titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
+        typeStr = '(AP)' if ap == 1 else '(AR)'
+        iouStr = '0.50:0.95' \
+            if iouThr is None else f'{iouThr:0.2f}'
+        iStr = f' {titleStr:<18} {typeStr} @[ IoU={iouStr:<9} | '
+        iStr += f'area={areaRng:>6s} | maxDets={maxDets:>3d} ] = {result:0.3f}'
+        print(iStr)
+
+    stats = np.zeros((12, ))
+    stats[0] = _print(results[0], 1)
+    stats[1] = _print(results[1], 1, iouThr=.5)
+    stats[2] = _print(results[2], 1, iouThr=.75)
+    stats[3] = _print(results[3], 1, areaRng='small')
+    stats[4] = _print(results[4], 1, areaRng='medium')
+    stats[5] = _print(results[5], 1, areaRng='large')
+    # TODO support recall metric
+    '''
+    stats[6] = _print(results[6], 0, maxDets=1)
+    stats[7] = _print(results[7], 0, maxDets=10)
+    stats[8] = _print(results[8], 0)
+    stats[9] = _print(results[9], 0, areaRng='small')
+    stats[10] = _print(results[10], 0, areaRng='medium')
+    stats[11] = _print(results[11], 0, areaRng='large')
+    '''
+
+
+def get_coco_style_results(filename,
+                           task='bbox',
+                           metric=None,
+                           prints='mPC',
+                           aggregate='benchmark'):
+
+    assert aggregate in ['benchmark', 'all']
+
+    if prints == 'all':
+        prints = ['P', 'mPC', 'rPC']
+    elif isinstance(prints, str):
+        prints = [prints]
+    for p in prints:
+        assert p in ['P', 'mPC', 'rPC']
+
+    if metric is None:
+        metrics = [
+            'mAP',
+            'mAP_50',
+            'mAP_75',
+            'mAP_s',
+            'mAP_m',
+            'mAP_l',
+        ]
+    elif isinstance(metric, list):
+        metrics = metric
+    else:
+        metrics = [metric]
+
+    for metric_name in metrics:
+        assert metric_name in [
+            'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
+        ]
+
+    eval_output = load(filename)
+
+    num_distortions = len(list(eval_output.keys()))
+    results = np.zeros((num_distortions, 6, len(metrics)), dtype='float32')
+
+    for corr_i, distortion in enumerate(eval_output):
+        for severity in eval_output[distortion]:
+            for metric_j, metric_name in enumerate(metrics):
+                metric_dict = eval_output[distortion][severity]
+
+                new_metric_dict = {}
+                for k, v in metric_dict.items():
+                    if '/' in k:
+                        new_metric_dict[k.split('/')[-1]] = v
+                mAP = new_metric_dict['_'.join((task, metric_name))]
+                results[corr_i, severity, metric_j] = mAP
+
+    P = results[0, 0, :]
+    if aggregate == 'benchmark':
+        mPC = np.mean(results[:15, 1:, :], axis=(0, 1))
+    else:
+        mPC = np.mean(results[:, 1:, :], axis=(0, 1))
+    rPC = mPC / P
+
+    print(f'\nmodel: {osp.basename(filename)}')
+    if metric is None:
+        if 'P' in prints:
+            print(f'Performance on Clean Data [P] ({task})')
+            print_coco_results(P)
+        if 'mPC' in prints:
+            print(f'Mean Performance under Corruption [mPC] ({task})')
+            print_coco_results(mPC)
+        if 'rPC' in prints:
+            print(f'Relative Performance under Corruption [rPC] ({task})')
+            print_coco_results(rPC)
+    else:
+        if 'P' in prints:
+            print(f'Performance on Clean Data [P] ({task})')
+            for metric_i, metric_name in enumerate(metrics):
+                print(f'{metric_name:5} =  {P[metric_i]:0.3f}')
+        if 'mPC' in prints:
+            print(f'Mean Performance under Corruption [mPC] ({task})')
+            for metric_i, metric_name in enumerate(metrics):
+                print(f'{metric_name:5} =  {mPC[metric_i]:0.3f}')
+        if 'rPC' in prints:
+            print(f'Relative Performance under Corruption [rPC] ({task})')
+            for metric_i, metric_name in enumerate(metrics):
+                print(f'{metric_name:5} => {rPC[metric_i] * 100:0.1f} %')
+
+    return results
+
+
+def get_voc_style_results(filename, prints='mPC', aggregate='benchmark'):
+
+    assert aggregate in ['benchmark', 'all']
+
+    if prints == 'all':
+        prints = ['P', 'mPC', 'rPC']
+    elif isinstance(prints, str):
+        prints = [prints]
+    for p in prints:
+        assert p in ['P', 'mPC', 'rPC']
+
+    eval_output = load(filename)
+
+    num_distortions = len(list(eval_output.keys()))
+    results = np.zeros((num_distortions, 6, 20), dtype='float32')
+
+    for i, distortion in enumerate(eval_output):
+        for severity in eval_output[distortion]:
+            mAP = [
+                eval_output[distortion][severity][j]['ap']
+                for j in range(len(eval_output[distortion][severity]))
+            ]
+            results[i, severity, :] = mAP
+
+    P = results[0, 0, :]
+    if aggregate == 'benchmark':
+        mPC = np.mean(results[:15, 1:, :], axis=(0, 1))
+    else:
+        mPC = np.mean(results[:, 1:, :], axis=(0, 1))
+    rPC = mPC / P
+
+    print(f'\nmodel: {osp.basename(filename)}')
+    if 'P' in prints:
+        print(f'Performance on Clean Data [P] in AP50 = {np.mean(P):0.3f}')
+    if 'mPC' in prints:
+        print('Mean Performance under Corruption [mPC] in AP50 = '
+              f'{np.mean(mPC):0.3f}')
+    if 'rPC' in prints:
+        print('Relative Performance under Corruption [rPC] in % = '
+              f'{np.mean(rPC) * 100:0.1f}')
+
+    return np.mean(results, axis=2, keepdims=True)
+
+
+def get_results(filename,
+                dataset='coco',
+                task='bbox',
+                metric=None,
+                prints='mPC',
+                aggregate='benchmark'):
+    assert dataset in ['coco', 'voc', 'cityscapes']
+
+    if dataset in ['coco', 'cityscapes']:
+        results = get_coco_style_results(
+            filename,
+            task=task,
+            metric=metric,
+            prints=prints,
+            aggregate=aggregate)
+    elif dataset == 'voc':
+        if task != 'bbox':
+            print('Only bbox analysis is supported for Pascal VOC')
+            print('Will report bbox results\n')
+        if metric not in [None, ['AP'], ['AP50']]:
+            print('Only the AP50 metric is supported for Pascal VOC')
+            print('Will report AP50 metric\n')
+        results = get_voc_style_results(
+            filename, prints=prints, aggregate=aggregate)
+
+    return results
+
+
+def get_distortions_from_file(filename):
+
+    eval_output = load(filename)
+
+    return get_distortions_from_results(eval_output)
+
+
+def get_distortions_from_results(eval_output):
+    distortions = []
+    for i, distortion in enumerate(eval_output):
+        distortions.append(distortion.replace('_', ' '))
+    return distortions
+
+
+def main():
+    parser = ArgumentParser(description='Corruption Result Analysis')
+    parser.add_argument('filename', help='result file path')
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        choices=['coco', 'voc', 'cityscapes'],
+        default='coco',
+        help='dataset type')
+    parser.add_argument(
+        '--task',
+        type=str,
+        nargs='+',
+        choices=['bbox', 'segm'],
+        default=['bbox'],
+        help='task to report')
+    parser.add_argument(
+        '--metric',
+        nargs='+',
+        choices=[
+            None, 'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'AR1', 'AR10',
+            'AR100', 'ARs', 'ARm', 'ARl'
+        ],
+        default=None,
+        help='metric to report')
+    parser.add_argument(
+        '--prints',
+        type=str,
+        nargs='+',
+        choices=['P', 'mPC', 'rPC'],
+        default='mPC',
+        help='corruption benchmark metric to print')
+    parser.add_argument(
+        '--aggregate',
+        type=str,
+        choices=['all', 'benchmark'],
+        default='benchmark',
+        help='aggregate all results or only those \
+        for benchmark corruptions')
+
+    args = parser.parse_args()
+
+    for task in args.task:
+        get_results(
+            args.filename,
+            dataset=args.dataset,
+            task=task,
+            metric=args.metric,
+            prints=args.prints,
+            aggregate=args.aggregate)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/analysis_tools/test_robustness.py b/mmde/mmdet/.mim/tools/analysis_tools/test_robustness.py
new file mode 100644
index 0000000000000000000000000000000000000000..a701d23fe5157b771ad3e5be13d7dde65e886012
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/analysis_tools/test_robustness.py
@@ -0,0 +1,239 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import copy
+import os
+import os.path as osp
+
+from mmengine.config import Config, DictAction
+from mmengine.dist import get_dist_info
+from mmengine.evaluator import DumpResults
+from mmengine.fileio import dump
+from mmengine.runner import Runner
+
+from mmdet.engine.hooks.utils import trigger_visualization_hook
+from mmdet.registry import RUNNERS
+from tools.analysis_tools.robustness_eval import get_results
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet test detector')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument(
+        '--out',
+        type=str,
+        help='dump predictions to a pickle file for offline evaluation')
+    parser.add_argument(
+        '--corruptions',
+        type=str,
+        nargs='+',
+        default='benchmark',
+        choices=[
+            'all', 'benchmark', 'noise', 'blur', 'weather', 'digital',
+            'holdout', 'None', 'gaussian_noise', 'shot_noise', 'impulse_noise',
+            'defocus_blur', 'glass_blur', 'motion_blur', 'zoom_blur', 'snow',
+            'frost', 'fog', 'brightness', 'contrast', 'elastic_transform',
+            'pixelate', 'jpeg_compression', 'speckle_noise', 'gaussian_blur',
+            'spatter', 'saturate'
+        ],
+        help='corruptions')
+    parser.add_argument(
+        '--work-dir',
+        help='the directory to save the file containing evaluation metrics')
+    parser.add_argument(
+        '--severities',
+        type=int,
+        nargs='+',
+        default=[0, 1, 2, 3, 4, 5],
+        help='corruption severity levels')
+    parser.add_argument(
+        '--summaries',
+        type=bool,
+        default=False,
+        help='Print summaries for every corruption and severity')
+    parser.add_argument('--show', action='store_true', help='show results')
+    parser.add_argument(
+        '--show-dir', help='directory where painted images will be saved')
+    parser.add_argument(
+        '--wait-time', type=float, default=2, help='the interval of show (s)')
+    parser.add_argument('--seed', type=int, default=None, help='random seed')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    parser.add_argument(
+        '--final-prints',
+        type=str,
+        nargs='+',
+        choices=['P', 'mPC', 'rPC'],
+        default='mPC',
+        help='corruption benchmark metric to print at the end')
+    parser.add_argument(
+        '--final-prints-aggregate',
+        type=str,
+        choices=['all', 'benchmark'],
+        default='benchmark',
+        help='aggregate all results or only those for benchmark corruptions')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def main():
+    args = parse_args()
+
+    assert args.out or args.show or args.show_dir, \
+        ('Please specify at least one operation (save or show the results) '
+         'with the argument "--out", "--show" or "show-dir"')
+
+    # load config
+    cfg = Config.fromfile(args.config)
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    cfg.model.backbone.init_cfg.type = None
+    cfg.test_dataloader.dataset.test_mode = True
+
+    cfg.load_from = args.checkpoint
+    if args.show or args.show_dir:
+        cfg = trigger_visualization_hook(cfg, args)
+
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    # add `DumpResults` dummy metric
+    if args.out is not None:
+        assert args.out.endswith(('.pkl', '.pickle')), \
+            'The dump file must be a pkl file.'
+        runner.test_evaluator.metrics.append(
+            DumpResults(out_file_path=args.out))
+
+    if 'all' in args.corruptions:
+        corruptions = [
+            'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur',
+            'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog',
+            'brightness', 'contrast', 'elastic_transform', 'pixelate',
+            'jpeg_compression', 'speckle_noise', 'gaussian_blur', 'spatter',
+            'saturate'
+        ]
+    elif 'benchmark' in args.corruptions:
+        corruptions = [
+            'gaussian_noise', 'shot_noise', 'impulse_noise', 'defocus_blur',
+            'glass_blur', 'motion_blur', 'zoom_blur', 'snow', 'frost', 'fog',
+            'brightness', 'contrast', 'elastic_transform', 'pixelate',
+            'jpeg_compression'
+        ]
+    elif 'noise' in args.corruptions:
+        corruptions = ['gaussian_noise', 'shot_noise', 'impulse_noise']
+    elif 'blur' in args.corruptions:
+        corruptions = [
+            'defocus_blur', 'glass_blur', 'motion_blur', 'zoom_blur'
+        ]
+    elif 'weather' in args.corruptions:
+        corruptions = ['snow', 'frost', 'fog', 'brightness']
+    elif 'digital' in args.corruptions:
+        corruptions = [
+            'contrast', 'elastic_transform', 'pixelate', 'jpeg_compression'
+        ]
+    elif 'holdout' in args.corruptions:
+        corruptions = ['speckle_noise', 'gaussian_blur', 'spatter', 'saturate']
+    elif 'None' in args.corruptions:
+        corruptions = ['None']
+        args.severities = [0]
+    else:
+        corruptions = args.corruptions
+
+    aggregated_results = {}
+    for corr_i, corruption in enumerate(corruptions):
+        aggregated_results[corruption] = {}
+        for sev_i, corruption_severity in enumerate(args.severities):
+            # evaluate severity 0 (= no corruption) only once
+            if corr_i > 0 and corruption_severity == 0:
+                aggregated_results[corruption][0] = \
+                    aggregated_results[corruptions[0]][0]
+                continue
+
+            test_loader_cfg = copy.deepcopy(cfg.test_dataloader)
+            # assign corruption and severity
+            if corruption_severity > 0:
+                corruption_trans = dict(
+                    type='Corrupt',
+                    corruption=corruption,
+                    severity=corruption_severity)
+                # TODO: hard coded "1", we assume that the first step is
+                # loading images, which needs to be fixed in the future
+                test_loader_cfg.dataset.pipeline.insert(1, corruption_trans)
+
+            test_loader = runner.build_dataloader(test_loader_cfg)
+
+            runner.test_loop.dataloader = test_loader
+            # set random seeds
+            if args.seed is not None:
+                runner.set_randomness(args.seed)
+
+            # print info
+            print(f'\nTesting {corruption} at severity {corruption_severity}')
+
+            eval_results = runner.test()
+            if args.out:
+                eval_results_filename = (
+                    osp.splitext(args.out)[0] + '_results' +
+                    osp.splitext(args.out)[1])
+                aggregated_results[corruption][
+                    corruption_severity] = eval_results
+                dump(aggregated_results, eval_results_filename)
+
+    rank, _ = get_dist_info()
+    if rank == 0:
+        eval_results_filename = (
+            osp.splitext(args.out)[0] + '_results' + osp.splitext(args.out)[1])
+        # print final results
+        print('\nAggregated results:')
+        prints = args.final_prints
+        aggregate = args.final_prints_aggregate
+
+        if cfg.dataset_type == 'VOCDataset':
+            get_results(
+                eval_results_filename,
+                dataset='voc',
+                prints=prints,
+                aggregate=aggregate)
+        else:
+            get_results(
+                eval_results_filename,
+                dataset='coco',
+                prints=prints,
+                aggregate=aggregate)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/ade20k2coco.py b/mmde/mmdet/.mim/tools/dataset_converters/ade20k2coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0b5ce86da86ac7fb4ece53281f494b395cb1aca
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/ade20k2coco.py
@@ -0,0 +1,367 @@
+import argparse
+import json
+import os
+from pathlib import Path
+
+import numpy as np
+import pycocotools.mask as mask_util
+from mmengine.utils import ProgressBar, mkdir_or_exist
+from panopticapi.utils import IdGenerator, save_json
+from PIL import Image
+
+from mmdet.datasets.ade20k import ADE20KPanopticDataset
+
+ORIGINAL_CATEGORIES = [
+    'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road, route',
+    'bed', 'window', 'grass', 'cabinet', 'sidewalk, pavement', 'person',
+    'earth, ground', 'door', 'table', 'mountain, mount', 'plant', 'curtain',
+    'chair', 'car', 'water', 'painting, picture', 'sofa', 'shelf', 'house',
+    'sea', 'mirror', 'rug', 'field', 'armchair', 'seat', 'fence', 'desk',
+    'rock, stone', 'wardrobe, closet, press', 'lamp', 'tub', 'rail', 'cushion',
+    'base, pedestal, stand', 'box', 'column, pillar', 'signboard, sign',
+    'chest of drawers, chest, bureau, dresser', 'counter', 'sand', 'sink',
+    'skyscraper', 'fireplace', 'refrigerator, icebox',
+    'grandstand, covered stand', 'path', 'stairs', 'runway',
+    'case, display case, showcase, vitrine',
+    'pool table, billiard table, snooker table', 'pillow',
+    'screen door, screen', 'stairway, staircase', 'river', 'bridge, span',
+    'bookcase', 'blind, screen', 'coffee table',
+    'toilet, can, commode, crapper, pot, potty, stool, throne', 'flower',
+    'book', 'hill', 'bench', 'countertop', 'stove', 'palm, palm tree',
+    'kitchen island', 'computer', 'swivel chair', 'boat', 'bar',
+    'arcade machine', 'hovel, hut, hutch, shack, shanty', 'bus', 'towel',
+    'light', 'truck', 'tower', 'chandelier', 'awning, sunshade, sunblind',
+    'street lamp', 'booth', 'tv', 'airplane', 'dirt track', 'clothes', 'pole',
+    'land, ground, soil',
+    'bannister, banister, balustrade, balusters, handrail',
+    'escalator, moving staircase, moving stairway',
+    'ottoman, pouf, pouffe, puff, hassock', 'bottle',
+    'buffet, counter, sideboard',
+    'poster, posting, placard, notice, bill, card', 'stage', 'van', 'ship',
+    'fountain',
+    'conveyer belt, conveyor belt, conveyer, conveyor, transporter', 'canopy',
+    'washer, automatic washer, washing machine', 'plaything, toy', 'pool',
+    'stool', 'barrel, cask', 'basket, handbasket', 'falls', 'tent', 'bag',
+    'minibike, motorbike', 'cradle', 'oven', 'ball', 'food, solid food',
+    'step, stair', 'tank, storage tank', 'trade name', 'microwave', 'pot',
+    'animal', 'bicycle', 'lake', 'dishwasher', 'screen', 'blanket, cover',
+    'sculpture', 'hood, exhaust hood', 'sconce', 'vase', 'traffic light',
+    'tray', 'trash can', 'fan', 'pier', 'crt screen', 'plate', 'monitor',
+    'bulletin board', 'shower', 'radiator', 'glass, drinking glass', 'clock',
+    'flag'
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert ADE20K annotations to COCO format')
+    parser.add_argument('src', help='ade20k data path')
+    parser.add_argument('--task', help='task name', default='panoptic')
+    args = parser.parse_args()
+    return args
+
+
+def prepare_instance_annotations(dataset_dir: str):
+    dataset_dir = Path(dataset_dir)
+    for name, dirname in [('train', 'training'), ('val', 'validation')]:
+        image_dir = dataset_dir / 'images' / dirname
+        instance_dir = dataset_dir / 'annotations_instance' / dirname
+
+        ann_id = 0
+
+        # json
+        out_file = dataset_dir / f'ade20k_instance_{name}.json'
+
+        # json config
+        instance_config_file = dataset_dir / 'imgCatIds.json'
+        with open(instance_config_file, 'r') as f:
+            category_dict = json.load(f)['categories']
+
+        # catid mapping
+        mapping_file = dataset_dir / 'categoryMapping.txt'
+        with open(mapping_file, 'r') as f:
+            map_id = {}
+            for i, line in enumerate(f.readlines()):
+                if i == 0:
+                    continue
+                ins_id, sem_id, _ = line.strip().split()
+                map_id[int(ins_id)] = int(sem_id) - 1
+
+        for cat in category_dict:
+            cat['id'] = map_id[cat['id']]
+
+        filenames = sorted(list(image_dir.iterdir()))
+
+        ann_dict = {}
+        images = []
+        annotations = []
+
+        progressbar = ProgressBar(len(filenames))
+        for filename in filenames:
+            image = {}
+            image_id = filename.stem
+
+            image['id'] = image_id
+            image['file_name'] = filename.name
+
+            original_format = np.array(Image.open(filename))
+            image['height'] = original_format.shape[0]
+            image['width'] = original_format.shape[1]
+
+            images.append(image)
+
+            instance_file = instance_dir / f'{image_id}.png'
+            ins_seg = np.array(Image.open(instance_file))
+            assert ins_seg.dtype == np.uint8
+
+            instance_cat_ids = ins_seg[..., 0]
+            instance_ins_ids = ins_seg[..., 1]
+
+            for thing_id in np.unique(instance_ins_ids):
+                if thing_id == 0:
+                    continue
+                mask = instance_ins_ids == thing_id
+                instance_cat_id = np.unique(instance_cat_ids[mask])
+                assert len(instance_cat_id) == 1
+
+                anno = {}
+                anno['id'] = ann_id
+                ann_id += 1
+                anno['image_id'] = image['id']
+                anno['iscrowd'] = int(0)
+                anno['category_id'] = int(map_id[instance_cat_id[0]])
+
+                inds = np.nonzero(mask)
+                ymin, ymax = inds[0].min(), inds[0].max()
+                xmin, xmax = inds[1].min(), inds[1].max()
+                anno['bbox'] = [
+                    int(xmin),
+                    int(ymin),
+                    int(xmax - xmin + 1),
+                    int(ymax - ymin + 1)
+                ]
+
+                rle = mask_util.encode(
+                    np.array(mask[:, :, np.newaxis], order='F',
+                             dtype='uint8'))[0]
+                rle['counts'] = rle['counts'].decode('utf-8')
+                anno['segmentation'] = rle
+                anno['area'] = int(mask_util.area(rle))
+                annotations.append(anno)
+            progressbar.update()
+
+        ann_dict['images'] = images
+        ann_dict['categories'] = category_dict
+        ann_dict['annotations'] = annotations
+        save_json(ann_dict, out_file)
+
+
+def prepare_panoptic_annotations(dataset_dir: str):
+    dataset_dir = Path(dataset_dir)
+
+    for name, dirname in [('train', 'training'), ('val', 'validation')]:
+        image_dir = dataset_dir / 'images' / dirname
+        semantic_dir = dataset_dir / 'annotations' / dirname
+        instance_dir = dataset_dir / 'annotations_instance' / dirname
+
+        # folder to store panoptic PNGs
+        out_folder = dataset_dir / f'ade20k_panoptic_{name}'
+        # json with segmentations information
+        out_file = dataset_dir / f'ade20k_panoptic_{name}.json'
+
+        mkdir_or_exist(out_folder)
+
+        # catid mapping
+        neworder_categories = []
+        all_classes = ORIGINAL_CATEGORIES
+        thing_classes = ADE20KPanopticDataset.METAINFO['thing_classes']
+        stuff_classes = ADE20KPanopticDataset.METAINFO['stuff_classes']
+        palette = ADE20KPanopticDataset.METAINFO['palette']
+
+        old_2_new_mapping = {}
+        new_2_old_mapping = {}
+        for i, t in enumerate(thing_classes):
+            j = list(all_classes).index(t)
+            old_2_new_mapping[j] = i
+            new_2_old_mapping[i] = j
+
+        for i, t in enumerate(stuff_classes):
+            j = list(all_classes).index(t)
+            old_2_new_mapping[j] = i + len(thing_classes)
+            new_2_old_mapping[i + len(thing_classes)] = j
+
+        for old, new in old_2_new_mapping.items():
+            neworder_categories.append({
+                'id': new,
+                'name': all_classes[old],
+                'isthing': int(new < len(thing_classes)),
+                'color': palette[new]
+            })
+        categories_dict = {cat['id']: cat for cat in neworder_categories}
+
+        panoptic_json_categories = neworder_categories[:]
+        panoptic_json_images = []
+        panoptic_json_annotations = []
+
+        filenames = sorted(list(image_dir.iterdir()))
+        progressbar = ProgressBar(len(filenames))
+        for filename in filenames:
+            panoptic_json_image = {}
+
+            image_id = filename.stem
+
+            panoptic_json_image['id'] = image_id
+            panoptic_json_image['file_name'] = filename.name
+
+            original_format = np.array(Image.open(filename))
+            panoptic_json_image['height'] = original_format.shape[0]
+            panoptic_json_image['width'] = original_format.shape[1]
+
+            pan_seg = np.zeros(
+                (original_format.shape[0], original_format.shape[1], 3),
+                dtype=np.uint8)
+            id_generator = IdGenerator(categories_dict)
+
+            filename_semantic = semantic_dir / f'{image_id}.png'
+            filename_instance = instance_dir / f'{image_id}.png'
+
+            sem_seg = np.array(Image.open(filename_semantic))
+            ins_seg = np.array(Image.open(filename_instance))
+
+            assert sem_seg.dtype == np.uint8
+            assert ins_seg.dtype == np.uint8
+
+            semantic_cat_ids = sem_seg - 1
+            instance_cat_ids = ins_seg[..., 0] - 1
+            # instance id starts from 1!
+            # because 0 is reserved as VOID label
+            instance_ins_ids = ins_seg[..., 1]
+
+            segm_info = []
+
+            # process stuffs
+            for semantic_cat_id in np.unique(semantic_cat_ids):
+                if semantic_cat_id == 255:
+                    continue
+                if categories_dict[old_2_new_mapping[int(
+                        semantic_cat_id)]]['isthing'] == 1:
+                    continue
+                mask = semantic_cat_ids == semantic_cat_id
+                # should not have any overlap
+                assert pan_seg[mask].sum() == 0
+
+                segment_id, color = id_generator.get_id_and_color(
+                    old_2_new_mapping[int(semantic_cat_id)])
+                pan_seg[mask] = color
+
+                area = np.sum(mask)
+                # bbox computation for a segment
+                hor = np.sum(mask, axis=0)
+                hor_idx = np.nonzero(hor)[0]
+                x = hor_idx[0]
+                width = hor_idx[-1] - x + 1
+                vert = np.sum(mask, axis=1)
+                vert_idx = np.nonzero(vert)[0]
+                y = vert_idx[0]
+                height = vert_idx[-1] - y + 1
+                bbox = [int(x), int(y), int(width), int(height)]
+
+                segm_info.append({
+                    'id':
+                    int(segment_id),
+                    'category_id':
+                    old_2_new_mapping[int(semantic_cat_id)],
+                    'area':
+                    int(area),
+                    'bbox':
+                    bbox,
+                    'iscrowd':
+                    0
+                })
+
+            # process things
+            for thing_id in np.unique(instance_ins_ids):
+                if thing_id == 0:
+                    continue
+                mask = instance_ins_ids == thing_id
+
+                instance_cat_id = np.unique(instance_cat_ids[mask])
+                assert len(instance_cat_id) == 1
+
+                segment_id, color = id_generator.get_id_and_color(
+                    instance_cat_id[0])
+                pan_seg[mask] = color
+
+                area = np.sum(mask)
+                # bbox computation for a segment
+                hor = np.sum(mask, axis=0)
+                hor_idx = np.nonzero(hor)[0]
+                x = hor_idx[-1] - x + 1
+                width = hor_idx[-1] - x + 1
+                vert = np.sum(mask, axis=1)
+                vert_idx = np.nonzero(vert)[0]
+                y = vert_idx[0]
+                height = vert_idx[-1] - y + 1
+                bbox = [int(x), int(y), int(width), int(height)]
+
+                segm_info.append({
+                    'id': int(segment_id),
+                    'category_id': int(instance_cat_id[0]),
+                    'area': int(area),
+                    'bbox': bbox,
+                    'iscrowd': 0
+                })
+
+            panoptic_json_annotation = {
+                'image_id': image_id,
+                'file_name': image_id + '.png',
+                'segments_info': segm_info
+            }
+
+            Image.fromarray(pan_seg).save(out_folder / f'{image_id}.png')
+
+            panoptic_json_images.append(panoptic_json_image)
+            panoptic_json_annotations.append(panoptic_json_annotation)
+
+            progressbar.update()
+
+        panoptic_json = {
+            'images': panoptic_json_images,
+            'annotations': panoptic_json_annotations,
+            'categories': panoptic_json_categories
+        }
+        save_json(panoptic_json, out_file)
+
+
+def main():
+    args = parse_args()
+    assert args.task in ['panoptic', 'instance']
+    src = args.src
+    if args.task == 'panoptic':
+        annotation_train_path = f'{src}/ade20k_panoptic_train'
+        annotation_val_path = f'{src}/ade20k_panoptic_val'
+        print('Preparing ADE20K panoptic annotations ...')
+        print(
+            f'Creating panoptic annotations to {annotation_train_path} and {annotation_val_path} ...'  # noqa
+        )
+        if os.path.exists(annotation_train_path) or os.path.exists(
+                annotation_val_path):
+            raise RuntimeError('Panoptic annotations already exist.')
+        prepare_panoptic_annotations(src)
+        print('Done.')
+    else:
+        annotation_train_path = f'{src}/ade20k_instance_train'
+        annotation_val_path = f'{src}/ade20k_instance_val'
+        print('Preparing ADE20K instance annotations ...')
+        print(
+            f'Creating instance annotations to {annotation_train_path} and {annotation_val_path} ...'  # noqa
+        )
+        if os.path.exists(annotation_train_path) or os.path.exists(
+                annotation_val_path):
+            raise RuntimeError('Instance annotations already exist.')
+        prepare_instance_annotations(src)
+        print('Done.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/cityscapes.py b/mmde/mmdet/.mim/tools/dataset_converters/cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..23ad431ce0b570b122ae5cf1afd50e9c2bdb1788
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/cityscapes.py
@@ -0,0 +1,153 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import glob
+import os.path as osp
+
+import cityscapesscripts.helpers.labels as CSLabels
+import mmcv
+import numpy as np
+import pycocotools.mask as maskUtils
+from mmengine.fileio import dump
+from mmengine.utils import (Timer, mkdir_or_exist, track_parallel_progress,
+                            track_progress)
+
+
+def collect_files(img_dir, gt_dir):
+    suffix = 'leftImg8bit.png'
+    files = []
+    for img_file in glob.glob(osp.join(img_dir, '**/*.png')):
+        assert img_file.endswith(suffix), img_file
+        inst_file = gt_dir + img_file[
+            len(img_dir):-len(suffix)] + 'gtFine_instanceIds.png'
+        # Note that labelIds are not converted to trainId for seg map
+        segm_file = gt_dir + img_file[
+            len(img_dir):-len(suffix)] + 'gtFine_labelIds.png'
+        files.append((img_file, inst_file, segm_file))
+    assert len(files), f'No images found in {img_dir}'
+    print(f'Loaded {len(files)} images from {img_dir}')
+
+    return files
+
+
+def collect_annotations(files, nproc=1):
+    print('Loading annotation images')
+    if nproc > 1:
+        images = track_parallel_progress(load_img_info, files, nproc=nproc)
+    else:
+        images = track_progress(load_img_info, files)
+
+    return images
+
+
+def load_img_info(files):
+    img_file, inst_file, segm_file = files
+    inst_img = mmcv.imread(inst_file, 'unchanged')
+    # ids < 24 are stuff labels (filtering them first is about 5% faster)
+    unique_inst_ids = np.unique(inst_img[inst_img >= 24])
+    anno_info = []
+    for inst_id in unique_inst_ids:
+        # For non-crowd annotations, inst_id // 1000 is the label_id
+        # Crowd annotations have <1000 instance ids
+        label_id = inst_id // 1000 if inst_id >= 1000 else inst_id
+        label = CSLabels.id2label[label_id]
+        if not label.hasInstances or label.ignoreInEval:
+            continue
+
+        category_id = label.id
+        iscrowd = int(inst_id < 1000)
+        mask = np.asarray(inst_img == inst_id, dtype=np.uint8, order='F')
+        mask_rle = maskUtils.encode(mask[:, :, None])[0]
+
+        area = maskUtils.area(mask_rle)
+        # convert to COCO style XYWH format
+        bbox = maskUtils.toBbox(mask_rle)
+
+        # for json encoding
+        mask_rle['counts'] = mask_rle['counts'].decode()
+
+        anno = dict(
+            iscrowd=iscrowd,
+            category_id=category_id,
+            bbox=bbox.tolist(),
+            area=area.tolist(),
+            segmentation=mask_rle)
+        anno_info.append(anno)
+    video_name = osp.basename(osp.dirname(img_file))
+    img_info = dict(
+        # remove img_prefix for filename
+        file_name=osp.join(video_name, osp.basename(img_file)),
+        height=inst_img.shape[0],
+        width=inst_img.shape[1],
+        anno_info=anno_info,
+        segm_file=osp.join(video_name, osp.basename(segm_file)))
+
+    return img_info
+
+
+def cvt_annotations(image_infos, out_json_name):
+    out_json = dict()
+    img_id = 0
+    ann_id = 0
+    out_json['images'] = []
+    out_json['categories'] = []
+    out_json['annotations'] = []
+    for image_info in image_infos:
+        image_info['id'] = img_id
+        anno_infos = image_info.pop('anno_info')
+        out_json['images'].append(image_info)
+        for anno_info in anno_infos:
+            anno_info['image_id'] = img_id
+            anno_info['id'] = ann_id
+            out_json['annotations'].append(anno_info)
+            ann_id += 1
+        img_id += 1
+    for label in CSLabels.labels:
+        if label.hasInstances and not label.ignoreInEval:
+            cat = dict(id=label.id, name=label.name)
+            out_json['categories'].append(cat)
+
+    if len(out_json['annotations']) == 0:
+        out_json.pop('annotations')
+
+    dump(out_json, out_json_name)
+    return out_json
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert Cityscapes annotations to COCO format')
+    parser.add_argument('cityscapes_path', help='cityscapes data path')
+    parser.add_argument('--img-dir', default='leftImg8bit', type=str)
+    parser.add_argument('--gt-dir', default='gtFine', type=str)
+    parser.add_argument('-o', '--out-dir', help='output path')
+    parser.add_argument(
+        '--nproc', default=1, type=int, help='number of process')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    cityscapes_path = args.cityscapes_path
+    out_dir = args.out_dir if args.out_dir else cityscapes_path
+    mkdir_or_exist(out_dir)
+
+    img_dir = osp.join(cityscapes_path, args.img_dir)
+    gt_dir = osp.join(cityscapes_path, args.gt_dir)
+
+    set_name = dict(
+        train='instancesonly_filtered_gtFine_train.json',
+        val='instancesonly_filtered_gtFine_val.json',
+        test='instancesonly_filtered_gtFine_test.json')
+
+    for split, json_name in set_name.items():
+        print(f'Converting {split} into {json_name}')
+        with Timer(print_tmpl='It took {}s to convert Cityscapes annotation'):
+            files = collect_files(
+                osp.join(img_dir, split), osp.join(gt_dir, split))
+            image_infos = collect_annotations(files, nproc=args.nproc)
+            cvt_annotations(image_infos, osp.join(out_dir, json_name))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/coco2odvg.py b/mmde/mmdet/.mim/tools/dataset_converters/coco2odvg.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa9bc86d6d2934717129ac872ca8eae609183914
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/coco2odvg.py
@@ -0,0 +1,345 @@
+import argparse
+import json
+import os.path
+
+import jsonlines
+from pycocotools.coco import COCO
+from tqdm import tqdm
+
+id_map = {
+    0: 1,
+    1: 2,
+    2: 3,
+    3: 4,
+    4: 5,
+    5: 6,
+    6: 7,
+    7: 8,
+    8: 9,
+    9: 10,
+    10: 11,
+    11: 13,
+    12: 14,
+    13: 15,
+    14: 16,
+    15: 17,
+    16: 18,
+    17: 19,
+    18: 20,
+    19: 21,
+    20: 22,
+    21: 23,
+    22: 24,
+    23: 25,
+    24: 27,
+    25: 28,
+    26: 31,
+    27: 32,
+    28: 33,
+    29: 34,
+    30: 35,
+    31: 36,
+    32: 37,
+    33: 38,
+    34: 39,
+    35: 40,
+    36: 41,
+    37: 42,
+    38: 43,
+    39: 44,
+    40: 46,
+    41: 47,
+    42: 48,
+    43: 49,
+    44: 50,
+    45: 51,
+    46: 52,
+    47: 53,
+    48: 54,
+    49: 55,
+    50: 56,
+    51: 57,
+    52: 58,
+    53: 59,
+    54: 60,
+    55: 61,
+    56: 62,
+    57: 63,
+    58: 64,
+    59: 65,
+    60: 67,
+    61: 70,
+    62: 72,
+    63: 73,
+    64: 74,
+    65: 75,
+    66: 76,
+    67: 77,
+    68: 78,
+    69: 79,
+    70: 80,
+    71: 81,
+    72: 82,
+    73: 84,
+    74: 85,
+    75: 86,
+    76: 87,
+    77: 88,
+    78: 89,
+    79: 90
+}
+key_list_coco = list(id_map.keys())
+val_list_coco = list(id_map.values())
+key_list_o365 = [i for i in range(365)]
+val_list_o365 = [i for i in range(1, 366)]
+key_list_v3det = [i for i in range(13204)]
+val_list_v3det = [i for i in range(1, 13205)]
+
+
+def dump_coco_label_map(args):
+    ori_map = {
+        '1': 'person',
+        '2': 'bicycle',
+        '3': 'car',
+        '4': 'motorcycle',
+        '5': 'airplane',
+        '6': 'bus',
+        '7': 'train',
+        '8': 'truck',
+        '9': 'boat',
+        '10': 'traffic light',
+        '11': 'fire hydrant',
+        '13': 'stop sign',
+        '14': 'parking meter',
+        '15': 'bench',
+        '16': 'bird',
+        '17': 'cat',
+        '18': 'dog',
+        '19': 'horse',
+        '20': 'sheep',
+        '21': 'cow',
+        '22': 'elephant',
+        '23': 'bear',
+        '24': 'zebra',
+        '25': 'giraffe',
+        '27': 'backpack',
+        '28': 'umbrella',
+        '31': 'handbag',
+        '32': 'tie',
+        '33': 'suitcase',
+        '34': 'frisbee',
+        '35': 'skis',
+        '36': 'snowboard',
+        '37': 'sports ball',
+        '38': 'kite',
+        '39': 'baseball bat',
+        '40': 'baseball glove',
+        '41': 'skateboard',
+        '42': 'surfboard',
+        '43': 'tennis racket',
+        '44': 'bottle',
+        '46': 'wine glass',
+        '47': 'cup',
+        '48': 'fork',
+        '49': 'knife',
+        '50': 'spoon',
+        '51': 'bowl',
+        '52': 'banana',
+        '53': 'apple',
+        '54': 'sandwich',
+        '55': 'orange',
+        '56': 'broccoli',
+        '57': 'carrot',
+        '58': 'hot dog',
+        '59': 'pizza',
+        '60': 'donut',
+        '61': 'cake',
+        '62': 'chair',
+        '63': 'couch',
+        '64': 'potted plant',
+        '65': 'bed',
+        '67': 'dining table',
+        '70': 'toilet',
+        '72': 'tv',
+        '73': 'laptop',
+        '74': 'mouse',
+        '75': 'remote',
+        '76': 'keyboard',
+        '77': 'cell phone',
+        '78': 'microwave',
+        '79': 'oven',
+        '80': 'toaster',
+        '81': 'sink',
+        '82': 'refrigerator',
+        '84': 'book',
+        '85': 'clock',
+        '86': 'vase',
+        '87': 'scissors',
+        '88': 'teddy bear',
+        '89': 'hair drier',
+        '90': 'toothbrush'
+    }
+    new_map = {}
+    for key, value in ori_map.items():
+        label = int(key)
+        ind = val_list_coco.index(label)
+        label_trans = key_list_coco[ind]
+        new_map[label_trans] = value
+    if args.output is None:
+        output = os.path.dirname(args.input) + '/coco2017_label_map.json'
+    else:
+        output = os.path.dirname(args.output) + '/coco2017_label_map.json'
+    with open(output, 'w') as f:
+        json.dump(new_map, f)
+
+
+def dump_o365v1_label_map(args):
+    with open(args.input, 'r') as f:
+        j = json.load(f)
+    o_dict = {}
+    for category in j['categories']:
+        index = str(int(category['id']) - 1)
+        name = category['name']
+        o_dict[index] = name
+    if args.output is None:
+        output = os.path.dirname(args.input) + '/o365v1_label_map.json'
+    else:
+        output = os.path.dirname(args.output) + '/o365v1_label_map.json'
+    with open(output, 'w') as f:
+        json.dump(o_dict, f)
+
+
+def dump_o365v2_label_map(args):
+    with open(args.input, 'r') as f:
+        j = json.load(f)
+    o_dict = {}
+    for category in j['categories']:
+        index = str(int(category['id']) - 1)
+        name = category['name']
+        o_dict[index] = name
+    if args.output is None:
+        output = os.path.dirname(args.input) + '/o365v2_label_map.json'
+    else:
+        output = os.path.dirname(args.output) + '/o365v2_label_map.json'
+    with open(output, 'w') as f:
+        json.dump(o_dict, f)
+
+
+def dump_v3det_label_map(args):
+    with open(args.input, 'r') as f:
+        j = json.load(f)
+    o_dict = {}
+    for category in j['categories']:
+        index = str(int(category['id']) - 1)
+        name = category['name']
+        o_dict[index] = name
+    if args.output is None:
+        output = os.path.dirname(args.input) + '/v3det_2023_v1_label_map.json'
+    else:
+        output = os.path.dirname(args.output) + '/v3det_2023_v1_label_map.json'
+    with open(output, 'w') as f:
+        json.dump(o_dict, f)
+
+
+def coco2odvg(args):
+    coco = COCO(args.input)
+    cats = coco.loadCats(coco.getCatIds())
+    nms = {cat['id']: cat['name'] for cat in cats}
+    metas = []
+    if args.output is None:
+        out_path = args.input[:-5] + '_od.json'
+    else:
+        out_path = args.output
+
+    if args.dataset == 'coco':
+        key_list = key_list_coco
+        val_list = val_list_coco
+        dump_coco_label_map(args)
+    elif args.dataset == 'o365v1':
+        key_list = key_list_o365
+        val_list = val_list_o365
+        dump_o365v1_label_map(args)
+    elif args.dataset == 'o365v2':
+        key_list = key_list_o365
+        val_list = val_list_o365
+        dump_o365v2_label_map(args)
+    elif args.dataset == 'v3det':
+        key_list = key_list_v3det
+        val_list = val_list_v3det
+        dump_v3det_label_map(args)
+
+    for img_id, img_info in tqdm(coco.imgs.items()):
+        # missing images
+        if args.dataset == 'o365v2' and img_id in [908726, 320532, 320534]:
+            print(img_info['file_name'])
+            continue
+        if args.dataset == 'o365v1' and img_id in [6, 19, 23]:
+            print(img_info['file_name'])
+            continue
+
+        if args.dataset == 'o365v2':
+            file_name = img_info['file_name']
+            if file_name.startswith('images/v2/'):
+                file_name = file_name.replace('images/v2/', '')
+            elif file_name.startswith('images/v1/'):
+                file_name = file_name.replace('images/v1/', '')
+            img_info['file_name'] = file_name
+
+        ann_ids = coco.getAnnIds(imgIds=img_id)
+        instance_list = []
+        for ann_id in ann_ids:
+            ann = coco.anns[ann_id]
+
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                continue
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+
+            if ann.get('iscrowd', False):
+                continue
+
+            bbox_xyxy = [x1, y1, x1 + w, y1 + h]
+            label = ann['category_id']
+            category = nms[label]
+            ind = val_list.index(label)
+            label_trans = key_list[ind]
+            instance_list.append({
+                'bbox': bbox_xyxy,
+                'label': label_trans,
+                'category': category
+            })
+        metas.append({
+            'filename': img_info['file_name'],
+            'height': img_info['height'],
+            'width': img_info['width'],
+            'detection': {
+                'instances': instance_list
+            }
+        })
+
+    with jsonlines.open(out_path, mode='w') as writer:
+        writer.write_all(metas)
+
+    print('save to {}'.format(out_path))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('coco to odvg format.', add_help=True)
+    parser.add_argument('input', type=str, help='input json file name')
+    parser.add_argument(
+        '--output', '-o', type=str, help='output json file name')
+    parser.add_argument(
+        '--dataset',
+        '-d',
+        required=True,
+        type=str,
+        choices=['coco', 'o365v1', 'o365v2', 'v3det'],
+    )
+    args = parser.parse_args()
+
+    coco2odvg(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/coco2ovd.py b/mmde/mmdet/.mim/tools/dataset_converters/coco2ovd.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc70145f9aa8c5f973f9540468a76806450b63f2
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/coco2ovd.py
@@ -0,0 +1,70 @@
+import argparse
+import json
+import os.path
+
+base_classes = ('person', 'bicycle', 'car', 'motorcycle', 'train', 'truck',
+                'boat', 'bench', 'bird', 'horse', 'sheep', 'bear', 'zebra',
+                'giraffe', 'backpack', 'handbag', 'suitcase', 'frisbee',
+                'skis', 'kite', 'surfboard', 'bottle', 'fork', 'spoon', 'bowl',
+                'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+                'pizza', 'donut', 'chair', 'bed', 'toilet', 'tv', 'laptop',
+                'mouse', 'remote', 'microwave', 'oven', 'toaster',
+                'refrigerator', 'book', 'clock', 'vase', 'toothbrush')
+
+novel_classes = ('airplane', 'bus', 'cat', 'dog', 'cow', 'elephant',
+                 'umbrella', 'tie', 'snowboard', 'skateboard', 'cup', 'knife',
+                 'cake', 'couch', 'keyboard', 'sink', 'scissors')
+
+
+def filter_annotation(anno_dict, split_name_list, class_id_to_split):
+    filtered_categories = []
+    for item in anno_dict['categories']:
+        if class_id_to_split.get(item['id']) in split_name_list:
+            item['split'] = class_id_to_split.get(item['id'])
+            filtered_categories.append(item)
+    anno_dict['categories'] = filtered_categories
+
+    filtered_images = []
+    filtered_annotations = []
+    useful_image_ids = set()
+    for item in anno_dict['annotations']:
+        if class_id_to_split.get(item['category_id']) in split_name_list:
+            filtered_annotations.append(item)
+            useful_image_ids.add(item['image_id'])
+    for item in anno_dict['images']:
+        if item['id'] in useful_image_ids:
+            filtered_images.append(item)
+    anno_dict['annotations'] = filtered_annotations
+    anno_dict['images'] = filtered_images
+
+
+def coco2ovd(args):
+    ann_path = os.path.join(args.data_root, 'annotations/')
+    with open(ann_path + 'instances_train2017.json', 'r') as fin:
+        coco_train_anno_all = json.load(fin)
+
+    class_id_to_split = {}
+    for item in coco_train_anno_all['categories']:
+        if item['name'] in base_classes:
+            class_id_to_split[item['id']] = 'seen'
+        elif item['name'] in novel_classes:
+            class_id_to_split[item['id']] = 'unseen'
+
+    filter_annotation(coco_train_anno_all, ['seen'], class_id_to_split)
+    with open(ann_path + 'instances_train2017_seen_2.json', 'w') as fout:
+        json.dump(coco_train_anno_all, fout)
+
+    with open(ann_path + 'instances_val2017.json', 'r') as fin:
+        coco_val_anno_all = json.load(fin)
+
+    filter_annotation(coco_val_anno_all, ['seen', 'unseen'], class_id_to_split)
+    with open(ann_path + 'instances_val2017_all_2.json', 'w') as fout:
+        json.dump(coco_val_anno_all, fout)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('coco to ovd format.', add_help=True)
+    parser.add_argument('data_root', type=str, help='coco root path')
+    args = parser.parse_args()
+
+    coco2ovd(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/coco_stuff164k.py b/mmde/mmdet/.mim/tools/dataset_converters/coco_stuff164k.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe1ff9f6b43a9a5aaad3b86be93db0d81c1bede2
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/coco_stuff164k.py
@@ -0,0 +1,254 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+from functools import partial
+from glob import glob
+
+import numpy as np
+from mmengine.utils import (mkdir_or_exist, track_parallel_progress,
+                            track_progress)
+from PIL import Image
+
+COCO_LEN = 123287
+
+clsID_to_trID = {
+    0: 0,
+    1: 1,
+    2: 2,
+    3: 3,
+    4: 4,
+    5: 5,
+    6: 6,
+    7: 7,
+    8: 8,
+    9: 9,
+    10: 10,
+    12: 11,
+    13: 12,
+    14: 13,
+    15: 14,
+    16: 15,
+    17: 16,
+    18: 17,
+    19: 18,
+    20: 19,
+    21: 20,
+    22: 21,
+    23: 22,
+    24: 23,
+    26: 24,
+    27: 25,
+    30: 26,
+    31: 27,
+    32: 28,
+    33: 29,
+    34: 30,
+    35: 31,
+    36: 32,
+    37: 33,
+    38: 34,
+    39: 35,
+    40: 36,
+    41: 37,
+    42: 38,
+    43: 39,
+    45: 40,
+    46: 41,
+    47: 42,
+    48: 43,
+    49: 44,
+    50: 45,
+    51: 46,
+    52: 47,
+    53: 48,
+    54: 49,
+    55: 50,
+    56: 51,
+    57: 52,
+    58: 53,
+    59: 54,
+    60: 55,
+    61: 56,
+    62: 57,
+    63: 58,
+    64: 59,
+    66: 60,
+    69: 61,
+    71: 62,
+    72: 63,
+    73: 64,
+    74: 65,
+    75: 66,
+    76: 67,
+    77: 68,
+    78: 69,
+    79: 70,
+    80: 71,
+    81: 72,
+    83: 73,
+    84: 74,
+    85: 75,
+    86: 76,
+    87: 77,
+    88: 78,
+    89: 79,
+    91: 80,
+    92: 81,
+    93: 82,
+    94: 83,
+    95: 84,
+    96: 85,
+    97: 86,
+    98: 87,
+    99: 88,
+    100: 89,
+    101: 90,
+    102: 91,
+    103: 92,
+    104: 93,
+    105: 94,
+    106: 95,
+    107: 96,
+    108: 97,
+    109: 98,
+    110: 99,
+    111: 100,
+    112: 101,
+    113: 102,
+    114: 103,
+    115: 104,
+    116: 105,
+    117: 106,
+    118: 107,
+    119: 108,
+    120: 109,
+    121: 110,
+    122: 111,
+    123: 112,
+    124: 113,
+    125: 114,
+    126: 115,
+    127: 116,
+    128: 117,
+    129: 118,
+    130: 119,
+    131: 120,
+    132: 121,
+    133: 122,
+    134: 123,
+    135: 124,
+    136: 125,
+    137: 126,
+    138: 127,
+    139: 128,
+    140: 129,
+    141: 130,
+    142: 131,
+    143: 132,
+    144: 133,
+    145: 134,
+    146: 135,
+    147: 136,
+    148: 137,
+    149: 138,
+    150: 139,
+    151: 140,
+    152: 141,
+    153: 142,
+    154: 143,
+    155: 144,
+    156: 145,
+    157: 146,
+    158: 147,
+    159: 148,
+    160: 149,
+    161: 150,
+    162: 151,
+    163: 152,
+    164: 153,
+    165: 154,
+    166: 155,
+    167: 156,
+    168: 157,
+    169: 158,
+    170: 159,
+    171: 160,
+    172: 161,
+    173: 162,
+    174: 163,
+    175: 164,
+    176: 165,
+    177: 166,
+    178: 167,
+    179: 168,
+    180: 169,
+    181: 170,
+    255: 255
+}
+
+
+def convert_to_trainID(maskpath, out_mask_dir, is_train):
+    mask = np.array(Image.open(maskpath))
+    mask_copy = mask.copy()
+    for clsID, trID in clsID_to_trID.items():
+        mask_copy[mask == clsID] = trID
+    seg_filename = osp.join(out_mask_dir, 'train2017',
+                            osp.basename(maskpath)) if is_train else osp.join(
+                                out_mask_dir, 'val2017',
+                                osp.basename(maskpath))
+    Image.fromarray(mask_copy).save(seg_filename, 'PNG')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=\
+        'Convert COCO Stuff 164k annotations to mmdet format')  # noqa
+    parser.add_argument('coco_path', help='coco stuff path')
+    parser.add_argument(
+        '--out-dir-name',
+        '-o',
+        default='stuffthingmaps_semseg',
+        help='output path')
+    parser.add_argument(
+        '--nproc', default=16, type=int, help='number of process')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    coco_path = args.coco_path
+    out_dir = osp.join(coco_path, args.out_dir_name)
+    nproc = args.nproc
+
+    mkdir_or_exist(osp.join(out_dir, 'train2017'))
+    mkdir_or_exist(osp.join(out_dir, 'val2017'))
+
+    train_list = glob(osp.join(coco_path, 'stuffthingmaps/train2017', '*.png'))
+    val_list = glob(osp.join(coco_path, 'stuffthingmaps/val2017', '*.png'))
+    assert (len(train_list) +
+            len(val_list)) == COCO_LEN, 'Wrong length of list {} & {}'.format(
+                len(train_list), len(val_list))
+
+    if args.nproc > 1:
+        track_parallel_progress(
+            partial(convert_to_trainID, out_mask_dir=out_dir, is_train=True),
+            train_list,
+            nproc=nproc)
+        track_parallel_progress(
+            partial(convert_to_trainID, out_mask_dir=out_dir, is_train=False),
+            val_list,
+            nproc=nproc)
+    else:
+        track_progress(
+            partial(convert_to_trainID, out_mask_dir=out_dir, is_train=True),
+            train_list)
+        track_progress(
+            partial(convert_to_trainID, out_mask_dir=out_dir, is_train=False),
+            val_list)
+
+    print('Done!')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/crowdhuman2coco.py b/mmde/mmdet/.mim/tools/dataset_converters/crowdhuman2coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..84af82daf99f83ba8ea6aa093a488cbd343e8165
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/crowdhuman2coco.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import os
+import os.path as osp
+from collections import defaultdict
+
+import mmengine
+from PIL import Image
+from tqdm import tqdm
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='CrowdHuman to COCO Video format')
+    parser.add_argument(
+        '-i',
+        '--input',
+        help='root directory of CrowdHuman annotations',
+    )
+    parser.add_argument(
+        '-o',
+        '--output',
+        help='directory to save coco formatted label file',
+    )
+    return parser.parse_args()
+
+
+def load_odgt(filename):
+    with open(filename, 'r') as f:
+        lines = f.readlines()
+    data_infos = [json.loads(line.strip('\n')) for line in lines]
+    return data_infos
+
+
+def convert_crowdhuman(ann_dir, save_dir, mode='train'):
+    """Convert CrowdHuman dataset in COCO style.
+
+    Args:
+        ann_dir (str): The path of CrowdHuman dataset.
+        save_dir (str): The path to save annotation files.
+        mode (str): Convert train dataset or validation dataset. Options are
+            'train', 'val'. Default: 'train'.
+    """
+    assert mode in ['train', 'val']
+
+    records = dict(img_id=1, ann_id=1)
+    outputs = defaultdict(list)
+    outputs['categories'] = [dict(id=1, name='pedestrian')]
+
+    data_infos = load_odgt(osp.join(ann_dir, f'annotation_{mode}.odgt'))
+    for data_info in tqdm(data_infos):
+        img_name = osp.join('Images', f"{data_info['ID']}.jpg")
+        img = Image.open(osp.join(ann_dir, mode, img_name))
+        width, height = img.size[:2]
+        image = dict(
+            file_name=img_name,
+            height=height,
+            width=width,
+            id=records['img_id'])
+        outputs['images'].append(image)
+
+        if mode != 'test':
+            for ann_info in data_info['gtboxes']:
+                bbox = ann_info['fbox']
+                if 'extra' in ann_info and 'ignore' in ann_info[
+                        'extra'] and ann_info['extra']['ignore'] == 1:
+                    iscrowd = True
+                else:
+                    iscrowd = False
+                ann = dict(
+                    id=records['ann_id'],
+                    image_id=records['img_id'],
+                    category_id=outputs['categories'][0]['id'],
+                    vis_bbox=ann_info['vbox'],
+                    bbox=bbox,
+                    area=bbox[2] * bbox[3],
+                    iscrowd=iscrowd)
+                outputs['annotations'].append(ann)
+                records['ann_id'] += 1
+        records['img_id'] += 1
+
+    if not osp.isdir(save_dir):
+        os.makedirs(save_dir)
+    mmengine.dump(outputs, osp.join(save_dir, f'crowdhuman_{mode}.json'))
+    print(f'-----CrowdHuman {mode} set------')
+    print(f'total {records["img_id"] - 1} images')
+    if mode != 'test':
+        print(f'{records["ann_id"] - 1} pedestrians are annotated.')
+    print('-----------------------')
+
+
+def main():
+    args = parse_args()
+    convert_crowdhuman(args.input, args.output, mode='train')
+    convert_crowdhuman(args.input, args.output, mode='val')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/extract_coco_from_mixed.py b/mmde/mmdet/.mim/tools/dataset_converters/extract_coco_from_mixed.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4777b0fd073b9bb1b5f8ef0ff16e63cd7f18e58
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/extract_coco_from_mixed.py
@@ -0,0 +1,45 @@
+import argparse
+import os.path as osp
+
+import mmengine
+from pycocotools.coco import COCO
+
+
+def extract_coco(args):
+    coco = COCO(args.mixed_ann)
+
+    json_data = mmengine.load(args.mixed_ann)
+    new_json_data = {
+        'info': json_data['info'],
+        'licenses': json_data['licenses'],
+        'categories': json_data['categories'],
+        'images': [],
+        'annotations': []
+    }
+    del json_data
+
+    img_ids = coco.getImgIds()
+    for img_id in img_ids:
+        img_info = coco.loadImgs([img_id])[0]
+        if img_info['data_source'] == 'coco':
+            new_json_data['images'].append(img_info)
+            ann_ids = coco.getAnnIds(imgIds=[img_id])
+            img_ann_info = coco.loadAnns(ann_ids)
+            new_json_data['annotations'].extend(img_ann_info)
+    if args.out_ann is None:
+        out_ann = osp.dirname(
+            args.mixed_ann) + '/final_mixed_train_only_coco.json'
+        mmengine.dump(new_json_data, out_ann)
+        print('save new json to {}'.format(out_ann))
+    else:
+        mmengine.dump(new_json_data, args.out_ann)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        'split mixed goldg to coco.', add_help=True)
+    parser.add_argument('mixed_ann', type=str)
+    parser.add_argument('--out-ann', '-o', type=str)
+    args = parser.parse_args()
+
+    extract_coco(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/fix_o365_names.py b/mmde/mmdet/.mim/tools/dataset_converters/fix_o365_names.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bb4a62843ced9a342dee7487b502cb9c137b93a
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/fix_o365_names.py
@@ -0,0 +1,35 @@
+# Reference: https://github.com/shenyunhang/APE/blob/main/datasets/tools/objects3652coco/fix_o365_names.py # noqa
+import argparse
+import copy
+import json
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--ann',
+        default='data/objects365v2/annotations/zhiyuan_objv2_train.json')
+    parser.add_argument(
+        '--fix_name_map',
+        default='tools/dataset_converters/zhiyuan_objv2_train_names_fix.csv')
+    args = parser.parse_args()
+
+    new_names = {}
+    old_names = {}
+    with open(args.fix_name_map, 'r') as f:
+        for line in f:
+            tmp = line.strip().split(',')
+            old_names[int(tmp[0])] = tmp[1]
+            new_names[int(tmp[0])] = tmp[2]
+    data = json.load(open(args.ann, 'r'))
+
+    cat_info = copy.deepcopy(data['categories'])
+
+    for x in cat_info:
+        if old_names[x['id']] != new_names[x['id']]:
+            print('Renaming', x['id'], x['name'], new_names[x['id']])
+            x['name'] = new_names[x['id']]
+
+    data['categories'] = cat_info
+    out_name = args.ann[:-5] + '_fixname.json'
+    print('Saving to', out_name)
+    json.dump(data, open(out_name, 'w'))
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/goldg2odvg.py b/mmde/mmdet/.mim/tools/dataset_converters/goldg2odvg.py
new file mode 100644
index 0000000000000000000000000000000000000000..5267553da011aec9f0824159e0c435ae11049acd
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/goldg2odvg.py
@@ -0,0 +1,136 @@
+import argparse
+
+import jsonlines
+from pycocotools.coco import COCO
+from tqdm import tqdm
+
+
+def _has_only_empty_bbox(anno):
+    return all(any(o <= 1 for o in obj['bbox'][2:]) for obj in anno)
+
+
+def has_valid_annotation(anno):
+    # if it's empty, there is no annotation
+    if len(anno) == 0:
+        return False
+    # if all boxes have close to zero area, there is no annotation
+    if _has_only_empty_bbox(anno):
+        return False
+    return True
+
+
+def goldg2odvg(args):
+    coco = COCO(args.input)
+    ids = list(sorted(coco.imgs.keys()))
+
+    out_results = []
+    for img_id in tqdm(ids):
+        if isinstance(img_id, str):
+            ann_ids = coco.getAnnIds(imgIds=[img_id], iscrowd=0)
+        else:
+            ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=0)
+        annos = coco.loadAnns(ann_ids)
+        if not has_valid_annotation(annos):
+            continue
+
+        img_info = coco.loadImgs(img_id)[0]
+        file_name = img_info['file_name']
+        caption = img_info['caption']
+
+        regions = {}
+
+        for anno in annos:
+            box = anno['bbox']
+            tokens_positive = anno['tokens_positive']
+            x1, y1, w, h = box
+            inter_w = max(0, min(x1 + w, int(img_info['width'])) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, int(img_info['height'])) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                continue
+            if anno['area'] <= 0 or w < 1 or h < 1:
+                continue
+
+            if anno.get('iscrowd', False):
+                continue
+            bbox_xyxy = [
+                x1, y1,
+                min(x1 + w, int(img_info['width'])),
+                min(y1 + h, int(img_info['height']))
+            ]
+
+            tokens_positive = sorted(tokens_positive, key=lambda x: x[0])
+
+            phrase = []
+            pre_end_index = -10
+            for token in tokens_positive:
+                start_index = token[0]
+                end_index = token[1]
+                if pre_end_index + 1 == start_index:
+                    if caption[token[0] - 1] == ' ':
+                        phrase[
+                            -1] = phrase[-1] + ' ' + caption[token[0]:token[1]]
+                    else:
+                        phrase.append(caption[token[0]:token[1]])
+                else:
+                    phrase.append(caption[token[0]:token[1]])
+                pre_end_index = end_index
+
+            key = ' '.join(phrase)
+
+            if key not in regions:
+                regions[key] = {
+                    'bbox': bbox_xyxy,
+                    'phrase': phrase,
+                    'tokens_positive': tokens_positive
+                }
+            else:
+                old_box = regions[key]['bbox']
+                if isinstance(old_box[0], list):
+                    old_box.append(bbox_xyxy)
+                else:
+                    old_box = [old_box, bbox_xyxy]
+
+                regions[key]['bbox'] = old_box
+
+        out_dict = {
+            'filename': file_name,
+            'height': int(img_info['height']),
+            'width': int(img_info['width']),
+            'grounding': {
+                'caption': caption
+            }
+        }
+
+        region_list = []
+        for key, value in regions.items():
+            phrase = value['phrase']
+            if len(phrase) == 1:
+                phrase = phrase[0]
+            region_list.append({
+                'bbox': value['bbox'],
+                'phrase': phrase,
+                'tokens_positive': value['tokens_positive']
+            })
+        out_dict['grounding']['regions'] = region_list
+        out_results.append(out_dict)
+
+    if args.out_ann is None:
+        out_path = args.input[:-5] + '_vg.json'
+    else:
+        out_path = args.out_ann
+
+    with jsonlines.open(out_path, mode='w') as writer:
+        writer.write_all(out_results)
+    print(f'save to {out_path}')
+
+
+# goldg+: final_mixed_train_no_coco.json +
+# final_flickr_separateGT_train.json +
+# final_mixed_train_only_coco.json
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('goldg to odvg format.', add_help=True)
+    parser.add_argument('input', type=str, help='input json file name')
+    parser.add_argument('--out-ann', '-o', type=str)
+    args = parser.parse_args()
+
+    goldg2odvg(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/grit2odvg.py b/mmde/mmdet/.mim/tools/dataset_converters/grit2odvg.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d1c6d1a5e760979e7a99a1bcba90c1e4ac0ccec
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/grit2odvg.py
@@ -0,0 +1,189 @@
+import argparse
+import json
+import multiprocessing
+import os
+import os.path as osp
+
+import emoji
+import jsonlines
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+is_debug = False
+
+
+def is_valid_caption(caption, rules={'↙️', '[CLS]', '[SEP]'}):
+    check_anno = caption.strip(
+    )[:-1]  # Remove the ending delimiter from the caption.
+    for ch in rules:
+        if ch in check_anno:
+            return False
+    return True
+
+
+def process_one_file(anno_file, result_queue):
+    print('processing', anno_file)
+    with open(anno_file, 'r') as f:
+        metas = json.load(f)
+
+    results = []
+    for meta in metas:
+        # print('============================')
+        file_name = meta['key'][0:5] + '/' + meta['key'] + '.jpg'
+        file_name = osp.join('images', file_name)
+
+        h = meta['height']
+        w = meta['width']
+
+        caption = meta['caption']
+        # Weird captions are filtered out from the beginning.
+        if not is_valid_caption(caption):
+            if is_debug:
+                print('=====caption filtered====', caption)
+            continue
+
+        # Captions exceeding 240 tokens are filtered out,
+        # where 240 is an empirical value.
+        tokenized = tokenizer([caption], return_tensors='pt')
+        if tokenized.input_ids.shape[1] >= 240:
+            if is_debug:
+                print('=====token filtered====', caption)
+            continue
+
+        ref_exps = meta['ref_exps']
+        ref_captions = [i[0:2] for i in ref_exps]
+        ref_token_positives = [i[0:2] for i in ref_exps]
+        ref_captions = [caption[int(i[0]):int(i[1])] for i in ref_captions]
+        ref_boxes = [i[2:6] for i in ref_exps]
+
+        regions = {}
+        for bbox, ref_caption, tokens_positive in zip(ref_boxes, ref_captions,
+                                                      ref_token_positives):
+            #  If the current reference includes special delimiters,
+            #  it will be filtered out.
+            if not is_valid_caption(
+                    caption, rules={'.', '？', ' ', "\'", "\""}):
+                if is_debug:
+                    print('=====ref filtered====', caption)
+                continue
+            # If the current reference contains non-ASCII characters,
+            # it will be filtered out.
+            if not str.isascii(caption):
+                if is_debug:
+                    print('=====ref filtered====', caption)
+                continue
+            # If the current reference includes non-ASCII characters,
+            # it will be filtered out.
+            if emoji.emoji_count(caption):
+                if is_debug:
+                    print('=====ref filtered====', caption)
+                continue
+
+            box = [
+                round(bbox[0] * w, 3),
+                round(bbox[1] * h, 3),
+                round((bbox[2]) * w, 3),
+                round((bbox[3]) * h, 3)
+            ]
+            x1, y1, x2, y2 = box
+            inter_w = max(0, min(x1 + w, int(w)) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, int(h)) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                if is_debug:
+                    print('=====wh filtered====', box)
+                continue
+            if w <= 1 or h <= 1:
+                if is_debug:
+                    print('=====area filtered====', box)
+                continue
+
+            if ref_caption not in regions:
+                regions[ref_caption] = {
+                    'bbox':
+                    box,
+                    'phrase':
+                    ref_caption,
+                    'tokens_positive':
+                    [[int(tokens_positive[0]),
+                      int(tokens_positive[1])]],
+                }
+            else:
+                old_box = regions[ref_caption]['bbox']
+                if isinstance(old_box[0], list):
+                    old_box.append(box)
+                else:
+                    old_box = [old_box, box]
+                regions[ref_caption]['bbox'] = old_box
+
+        if len(regions) > 0:
+            print('caption: ', caption)
+            print('regions', regions)
+        else:
+            if is_debug:
+                print('caption: ', caption)
+                print('regions', regions)
+
+        if len(regions) == 0:
+            continue
+
+        out_dict = {
+            'filename': file_name,
+            'height': int(h),
+            'width': int(w),
+            'grounding': {
+                'caption': caption
+            }
+        }
+
+        region_list = []
+        for key, value in regions.items():
+            phrase = value['phrase']
+            if len(phrase) == 1:
+                phrase = phrase[0]
+            region_list.append({
+                'bbox': value['bbox'],
+                'phrase': phrase,
+                'tokens_positive': value['tokens_positive']
+            })
+        out_dict['grounding']['regions'] = region_list
+        print(out_dict)
+        results.append(out_dict)
+    result_queue.put(results)
+
+
+def grit2odvg(args):
+    annotations_dir = osp.join(args.data_root, 'annotations')
+    annos_files = [
+        osp.join(annotations_dir, anno) for anno in os.listdir(annotations_dir)
+        if anno.endswith('.json') and not anno.endswith('vg.json')
+    ]
+
+    annos_files = annos_files[:2]
+
+    manager = multiprocessing.Manager()
+    result_queue = manager.Queue()
+    pool = multiprocessing.Pool(processes=min(len(annos_files), 16))
+
+    for anno_file in annos_files:
+        pool.apply_async(process_one_file, args=(anno_file, result_queue))
+
+    pool.close()
+    pool.join()
+
+    out_datas = []
+    while not result_queue.empty():
+        out_datas.extend(result_queue.get())
+
+    out_path = osp.join(args.data_root, 'grit20m_vg.json')
+    with jsonlines.open(out_path, mode='w') as writer:
+        writer.write_all(out_datas)
+    print('save to ', out_path)
+    print('total img: ', len(out_datas))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('grit to odvg format.', add_help=True)
+    parser.add_argument('data_root', type=str, help='input dir name')
+    args = parser.parse_args()
+
+    grit2odvg(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/grit_processing.py b/mmde/mmdet/.mim/tools/dataset_converters/grit_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebf3791a80e4c5056a0aaf23c49d6c1ba4ff68b2
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/grit_processing.py
@@ -0,0 +1,121 @@
+import argparse
+import json
+import logging
+import os
+import tarfile
+from functools import partial
+from multiprocessing import Pool
+
+
+def create_logger(output_file):
+    logger = logging.getLogger('grit_logger')
+    logger.setLevel(logging.INFO)  # set logger output level
+    formatter = logging.Formatter('%(asctime)s - %(message)s')
+
+    fh = logging.FileHandler(output_file)
+    fh.setLevel(logging.INFO)
+    fh.setFormatter(formatter)
+
+    console = logging.StreamHandler()
+    console.setLevel(logging.INFO)
+
+    logger.addHandler(fh)
+    logger.addHandler(console)
+
+    return logger
+
+
+def count_download_image(download_json_dir, logger):
+    parquet_files = [
+        f for f in os.listdir(download_json_dir) if f.endswith('.json')
+    ]
+    len = 0
+
+    for file in parquet_files:
+        with open(os.path.join(download_json_dir, file), 'r') as f:
+            data = json.load(f)
+            len = len + int(data['successes'])
+        logger.info(file + 'has ' + str(data['successes']) +
+                    ' successful images')
+
+    logger.info('all files finished.', str(len),
+                'images have been successfully downloaded.')
+
+
+def tar_processing(tar_path, output_dir, logger):
+    filepath = untar(tar_path, logger)
+    json_files = [f for f in os.listdir(filepath) if f.endswith('.json')]
+    all_data = []
+    cnt = 0
+
+    for file in json_files:
+        with open(os.path.join(filepath, file), 'r') as f:
+            df = json.load(f)
+        cnt = cnt + 1
+        all_data.extend([df])
+    dir_name = os.path.basename(filepath)
+    # write all data to a json file
+    logger.info(f'{dir_name} has {cnt} jsons')
+    json_name = os.path.basename(filepath) + '.json'
+    if not os.path.exists(os.path.join(output_dir, 'annotations')):
+        os.mkdir(os.path.join(output_dir, 'annotations'))
+    with open(os.path.join(output_dir, 'annotations', json_name), 'w') as f:
+        json.dump(all_data, f)
+    logger.info(f'{dir_name} completed')
+    cp_rm(filepath, output_dir)
+    return os.path.basename(filepath)
+
+
+def untar(filepath, logger):
+    if tarfile.is_tarfile(filepath):
+        new_folder = os.path.splitext(filepath)[0]
+        tar_name = os.path.basename(filepath)
+        with tarfile.open(filepath) as tar:
+            members = tar.getmembers()
+            if not os.path.exists(new_folder):
+                os.mkdir(new_folder)
+            else:
+                f = os.listdir(new_folder)
+                if len(members) == len(f):
+                    logger.info(f'{tar_name} already decompressed')
+                    return new_folder
+            logger.info(f'{tar_name} decompressing...')
+            os.system(f'tar -xf {filepath} -C {new_folder}')
+            logger.info(f'{tar_name} decompressed!')
+        return new_folder
+
+
+def cp_rm(filepath, output_dir):
+    # delete txt/json
+    for file in os.listdir(filepath):
+        if file.endswith('.txt') or file.endswith('.json'):
+            os.remove(os.path.join(filepath, file))
+    # move images to output dir
+    target_dir = os.path.join(output_dir, 'images')
+    if not os.path.exists(os.path.join(output_dir, 'images')):
+        os.mkdir(os.path.join(output_dir, 'images'))
+    os.system('mv -f {} {}'.format(filepath, target_dir))
+
+
+def main(args):
+    logger = create_logger(args.log_name)
+    all_file_name = [
+        os.path.join(args.image_dir, file)
+        for file in os.listdir(args.image_dir) if file.endswith('.tar')
+    ]
+    all_file_name.sort()
+    func = partial(tar_processing, output_dir=args.output_dir, logger=logger)
+    with Pool(processes=args.num_process) as pool:
+        result = pool.imap(func=func, iterable=all_file_name)  # noqa
+        # print(result)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('image_dir', type=str)  # grit raw directory
+    parser.add_argument('output_dir', type=str)
+    parser.add_argument('--num-process', default=10)
+    parser.add_argument('--log-name', type=str, default='grit_processing.log')
+    args = parser.parse_args()
+
+    main(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/images2coco.py b/mmde/mmdet/.mim/tools/dataset_converters/images2coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a893de8421ce8dffab5cd788c884400750d79f06
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/images2coco.py
@@ -0,0 +1,102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+
+from mmengine.fileio import dump, list_from_file
+from mmengine.utils import mkdir_or_exist, scandir, track_iter_progress
+from PIL import Image
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert images to coco format without annotations')
+    parser.add_argument('img_path', help='The root path of images')
+    parser.add_argument(
+        'classes', type=str, help='The text file name of storage class list')
+    parser.add_argument(
+        'out',
+        type=str,
+        help='The output annotation json file name, The save dir is in the '
+        'same directory as img_path')
+    parser.add_argument(
+        '-e',
+        '--exclude-extensions',
+        type=str,
+        nargs='+',
+        help='The suffix of images to be excluded, such as "png" and "bmp"')
+    args = parser.parse_args()
+    return args
+
+
+def collect_image_infos(path, exclude_extensions=None):
+    img_infos = []
+
+    images_generator = scandir(path, recursive=True)
+    for image_path in track_iter_progress(list(images_generator)):
+        if exclude_extensions is None or (
+                exclude_extensions is not None
+                and not image_path.lower().endswith(exclude_extensions)):
+            image_path = os.path.join(path, image_path)
+            img_pillow = Image.open(image_path)
+            img_info = {
+                'filename': image_path,
+                'width': img_pillow.width,
+                'height': img_pillow.height,
+            }
+            img_infos.append(img_info)
+    return img_infos
+
+
+def cvt_to_coco_json(img_infos, classes):
+    image_id = 0
+    coco = dict()
+    coco['images'] = []
+    coco['type'] = 'instance'
+    coco['categories'] = []
+    coco['annotations'] = []
+    image_set = set()
+
+    for category_id, name in enumerate(classes):
+        category_item = dict()
+        category_item['supercategory'] = str('none')
+        category_item['id'] = int(category_id)
+        category_item['name'] = str(name)
+        coco['categories'].append(category_item)
+
+    for img_dict in img_infos:
+        file_name = img_dict['filename']
+        assert file_name not in image_set
+        image_item = dict()
+        image_item['id'] = int(image_id)
+        image_item['file_name'] = str(file_name)
+        image_item['height'] = int(img_dict['height'])
+        image_item['width'] = int(img_dict['width'])
+        coco['images'].append(image_item)
+        image_set.add(file_name)
+
+        image_id += 1
+    return coco
+
+
+def main():
+    args = parse_args()
+    assert args.out.endswith(
+        'json'), 'The output file name must be json suffix'
+
+    # 1 load image list info
+    img_infos = collect_image_infos(args.img_path, args.exclude_extensions)
+
+    # 2 convert to coco format data
+    classes = list_from_file(args.classes)
+    coco_info = cvt_to_coco_json(img_infos, classes)
+
+    # 3 dump
+    save_dir = os.path.join(args.img_path, '..', 'annotations')
+    mkdir_or_exist(save_dir)
+    save_path = os.path.join(save_dir, args.out)
+    dump(coco_info, save_path)
+    print(f'save json file: {save_path}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/lvis2odvg.py b/mmde/mmdet/.mim/tools/dataset_converters/lvis2odvg.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce0c4381b35605bed9f0a0c9b3f5a8366141178a
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/lvis2odvg.py
@@ -0,0 +1,98 @@
+import argparse
+import json
+import os.path
+
+import jsonlines
+from lvis import LVIS
+from tqdm import tqdm
+
+key_list_lvis = [i for i in range(1203)]
+val_list_lvis = [i for i in range(1, 1204)]
+
+
+def dump_lvis_label_map(args):
+    with open(args.input, 'r') as f:
+        j = json.load(f)
+    o_dict = {}
+    for category in j['categories']:
+        index = str(int(category['id']) - 1)
+        name = category['name']
+        o_dict[index] = name
+    if args.output is None:
+        output = os.path.dirname(args.input) + '/lvis_v1_label_map.json'
+    else:
+        output = os.path.dirname(args.output) + '/lvis_v1_label_map.json'
+    with open(output, 'w') as f:
+        json.dump(o_dict, f)
+
+
+def lvis2odvg(args):
+    lvis = LVIS(args.input)
+    cats = lvis.load_cats(lvis.get_cat_ids())
+    nms = {cat['id']: cat['name'] for cat in cats}
+    metas = []
+    if args.output is None:
+        out_path = args.input[:-5] + '_od.json'
+    else:
+        out_path = args.output
+
+    key_list = key_list_lvis
+    val_list = val_list_lvis
+    dump_lvis_label_map(args)
+
+    for img_id, img_info in tqdm(lvis.imgs.items()):
+        file_name = img_info['coco_url'].replace(
+            'http://images.cocodataset.org/', '')
+        ann_ids = lvis.get_ann_ids(img_ids=[img_id])
+        raw_ann_info = lvis.load_anns(ann_ids)
+        instance_list = []
+        for ann in raw_ann_info:
+            if ann.get('ignore', False):
+                print(f'invalid ignore box of {ann}')
+                continue
+            x1, y1, w, h = ann['bbox']
+            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                print(f'invalid wh box of {ann}')
+                continue
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                print(f'invalid area box of {ann}, '
+                      f'w={img_info["width"]}, h={img_info["height"]}')
+                continue
+
+            if ann.get('iscrowd', False):
+                print(f'invalid iscrowd box of {ann}')
+                continue
+
+            bbox_xyxy = [x1, y1, x1 + w, y1 + h]
+            label = ann['category_id']
+            category = nms[label]
+            ind = val_list.index(label)
+            label_trans = key_list[ind]
+            instance_list.append({
+                'bbox': bbox_xyxy,
+                'label': label_trans,
+                'category': category
+            })
+        metas.append({
+            'filename': file_name,
+            'height': img_info['height'],
+            'width': img_info['width'],
+            'detection': {
+                'instances': instance_list
+            }
+        })
+
+    with jsonlines.open(out_path, mode='w') as writer:
+        writer.write_all(metas)
+
+    print('save to {}'.format(out_path))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('lvis to odvg format.', add_help=True)
+    parser.add_argument('input', type=str, help='input list name')
+    parser.add_argument('--output', '-o', type=str, help='input list name')
+    args = parser.parse_args()
+    lvis2odvg(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/lvis2ovd.py b/mmde/mmdet/.mim/tools/dataset_converters/lvis2ovd.py
new file mode 100644
index 0000000000000000000000000000000000000000..3405bf3ad4fb06685e6848c196958953e48dfdf7
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/lvis2ovd.py
@@ -0,0 +1,41 @@
+import argparse
+import json
+import os.path
+
+import jsonlines
+
+
+def lvis2ovd(args):
+    ann_path = os.path.join(args.data_root, 'annotations/')
+
+    lvis = json.load(open(ann_path + 'lvis_v1_val.json'))
+    base_class_ids = [
+        cat['id'] - 1 for cat in lvis['categories'] if cat['frequency'] != 'r'
+    ]
+
+    with open(ann_path + 'lvis_v1_train_od.json') as f:
+        data = [json.loads(d) for d in f]
+    for i in range(len(data)):
+        instance = [
+            inst for inst in data[i]['detection']['instances']
+            if inst['label'] in base_class_ids
+        ]
+        data[i]['detection']['instances'] = instance
+    with jsonlines.open(
+            ann_path + 'lvis_v1_train_od_norare.json', mode='w') as writer:
+        writer.write_all(data)
+
+    label_map = json.load(open(ann_path + 'lvis_v1_label_map.json'))
+    label_map = {
+        k: v
+        for k, v in label_map.items() if int(k) in base_class_ids
+    }
+    json.dump(label_map, open(ann_path + 'lvis_v1_label_map_norare.json', 'w'))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('lvis to ovd format.', add_help=True)
+    parser.add_argument('data_root', type=str, help='coco root path')
+    args = parser.parse_args()
+
+    lvis2ovd(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/mot2coco.py b/mmde/mmdet/.mim/tools/dataset_converters/mot2coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8e890212baab3cbac6d92fa1defab05d5520812
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/mot2coco.py
@@ -0,0 +1,220 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# This script converts MOT labels into COCO style.
+# Official website of the MOT dataset: https://motchallenge.net/
+#
+# Label format of MOT dataset:
+#   GTs:
+#       <frame_id> # starts from 1 but COCO style starts from 0,
+#       <instance_id>, <x1>, <y1>, <w>, <h>,
+#       <conf> # conf is annotated as 0 if the object is ignored,
+#       <class_id>, <visibility>
+#
+#   DETs and Results:
+#       <frame_id>, <instance_id>, <x1>, <y1>, <w>, <h>, <conf>,
+#       <x>, <y>, <z> # for 3D objects
+
+import argparse
+import os
+import os.path as osp
+from collections import defaultdict
+
+import mmengine
+import numpy as np
+from tqdm import tqdm
+
+# Classes in MOT:
+CLASSES = [
+    dict(id=1, name='pedestrian'),
+    dict(id=2, name='person_on_vehicle'),
+    dict(id=3, name='car'),
+    dict(id=4, name='bicycle'),
+    dict(id=5, name='motorbike'),
+    dict(id=6, name='non_mot_vehicle'),
+    dict(id=7, name='static_person'),
+    dict(id=8, name='distractor'),
+    dict(id=9, name='occluder'),
+    dict(id=10, name='occluder_on_ground'),
+    dict(id=11, name='occluder_full'),
+    dict(id=12, name='reflection'),
+    dict(id=13, name='crowd')
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert MOT label and detections to COCO-VID format.')
+    parser.add_argument('-i', '--input', help='path of MOT data')
+    parser.add_argument(
+        '-o', '--output', help='path to save coco formatted label file')
+    parser.add_argument(
+        '--convert-det',
+        action='store_true',
+        help='convert official detection results.')
+    parser.add_argument(
+        '--split-train',
+        action='store_true',
+        help='split the train set into half-train and half-validate.')
+    return parser.parse_args()
+
+
+def parse_gts(gts, is_mot15):
+    outputs = defaultdict(list)
+    for gt in gts:
+        gt = gt.strip().split(',')
+        frame_id, ins_id = map(int, gt[:2])
+        bbox = list(map(float, gt[2:6]))
+        if is_mot15:
+            conf = 1.
+            category_id = 1
+            visibility = 1.
+        else:
+            conf = float(gt[6])
+            category_id = int(gt[7])
+            visibility = float(gt[8])
+        anns = dict(
+            category_id=category_id,
+            bbox=bbox,
+            area=bbox[2] * bbox[3],
+            iscrowd=False,
+            visibility=visibility,
+            mot_instance_id=ins_id,
+            mot_conf=conf)
+        outputs[frame_id].append(anns)
+    return outputs
+
+
+def parse_dets(dets):
+    outputs = defaultdict(list)
+    for det in dets:
+        det = det.strip().split(',')
+        frame_id, ins_id = map(int, det[:2])
+        assert ins_id == -1
+        bbox = list(map(float, det[2:7]))
+        # [x1, y1, x2, y2] to be consistent with mmdet
+        bbox = [
+            bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3], bbox[4]
+        ]
+        outputs[frame_id].append(bbox)
+
+    return outputs
+
+
+def main():
+    args = parse_args()
+    if not osp.isdir(args.output):
+        os.makedirs(args.output)
+
+    sets = ['train', 'test']
+    if args.split_train:
+        sets += ['half-train', 'half-val']
+    vid_id, img_id, ann_id = 1, 1, 1
+
+    for subset in sets:
+        ins_id = 0
+        print(f'Converting {subset} set to COCO format')
+        if 'half' in subset:
+            in_folder = osp.join(args.input, 'train')
+        else:
+            in_folder = osp.join(args.input, subset)
+        out_file = osp.join(args.output, f'{subset}_cocoformat.json')
+        outputs = defaultdict(list)
+        outputs['categories'] = CLASSES
+        if args.convert_det:
+            det_file = osp.join(args.output, f'{subset}_detections.pkl')
+            detections = dict(det_bboxes=dict())
+        video_names = os.listdir(in_folder)
+        for video_name in tqdm(video_names):
+            # basic params
+            parse_gt = 'test' not in subset
+            ins_maps = dict()
+            # load video infos
+            video_folder = osp.join(in_folder, video_name)
+            infos = mmengine.list_from_file(f'{video_folder}/seqinfo.ini')
+            # video-level infos
+            assert video_name == infos[1].strip().split('=')[1]
+            img_folder = infos[2].strip().split('=')[1]
+            img_names = os.listdir(f'{video_folder}/{img_folder}')
+            img_names = sorted(img_names)
+            fps = int(infos[3].strip().split('=')[1])
+            num_imgs = int(infos[4].strip().split('=')[1])
+            assert num_imgs == len(img_names)
+            width = int(infos[5].strip().split('=')[1])
+            height = int(infos[6].strip().split('=')[1])
+            video = dict(
+                id=vid_id,
+                name=video_name,
+                fps=fps,
+                width=width,
+                height=height)
+            # parse annotations
+            if parse_gt:
+                gts = mmengine.list_from_file(f'{video_folder}/gt/gt.txt')
+                if 'MOT15' in video_folder:
+                    img2gts = parse_gts(gts, True)
+                else:
+                    img2gts = parse_gts(gts, False)
+            if args.convert_det:
+                dets = mmengine.list_from_file(f'{video_folder}/det/det.txt')
+                img2dets = parse_dets(dets)
+            # make half sets
+            if 'half' in subset:
+                split_frame = num_imgs // 2 + 1
+                if 'train' in subset:
+                    img_names = img_names[:split_frame]
+                elif 'val' in subset:
+                    img_names = img_names[split_frame:]
+                else:
+                    raise ValueError(
+                        'subset must be named with `train` or `val`')
+                mot_frame_ids = [str(int(_.split('.')[0])) for _ in img_names]
+                with open(f'{video_folder}/gt/gt_{subset}.txt', 'wt') as f:
+                    for gt in gts:
+                        if gt.split(',')[0] in mot_frame_ids:
+                            f.writelines(f'{gt}\n')
+            # image and box level infos
+            for frame_id, name in enumerate(img_names):
+                img_name = osp.join(video_name, img_folder, name)
+                mot_frame_id = int(name.split('.')[0])
+                image = dict(
+                    id=img_id,
+                    video_id=vid_id,
+                    file_name=img_name,
+                    height=height,
+                    width=width,
+                    frame_id=frame_id,
+                    mot_frame_id=mot_frame_id)
+                if parse_gt:
+                    gts = img2gts[mot_frame_id]
+                    for gt in gts:
+                        gt.update(id=ann_id, image_id=img_id)
+                        mot_ins_id = gt['mot_instance_id']
+                        if mot_ins_id in ins_maps:
+                            gt['instance_id'] = ins_maps[mot_ins_id]
+                        else:
+                            gt['instance_id'] = ins_id
+                            ins_maps[mot_ins_id] = ins_id
+                            ins_id += 1
+                        outputs['annotations'].append(gt)
+                        ann_id += 1
+                if args.convert_det:
+                    dets = np.array(img2dets[mot_frame_id])
+                    if dets.ndim == 1:
+                        assert len(dets) == 0
+                        dets = np.zeros((0, 5))
+                    detections['det_bboxes'][img_name] = [dets]
+                outputs['images'].append(image)
+                img_id += 1
+            outputs['videos'].append(video)
+            vid_id += 1
+            outputs['num_instances'] = ins_id
+        print(f'{subset} has {ins_id} instances.')
+        mmengine.dump(outputs, out_file)
+        if args.convert_det:
+            mmengine.dump(detections, det_file)
+            print(f'Done! Saved as {out_file} and {det_file}')
+        else:
+            print(f'Done! Saved as {out_file}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/mot2reid.py b/mmde/mmdet/.mim/tools/dataset_converters/mot2reid.py
new file mode 100644
index 0000000000000000000000000000000000000000..11228cc42f8fcbacb6c614dc5852e180a81ad93b
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/mot2reid.py
@@ -0,0 +1,191 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# This script converts MOT dataset into ReID dataset.
+# Official website of the MOT dataset: https://motchallenge.net/
+#
+# Label format of MOT dataset:
+#   GTs:
+#       <frame_id> # starts from 1,
+#       <instance_id>, <x1>, <y1>, <w>, <h>,
+#       <conf> # conf is annotated as 0 if the object is ignored,
+#       <class_id>, <visibility>
+#
+#   DETs and Results:
+#       <frame_id>, <instance_id>, <x1>, <y1>, <w>, <h>, <conf>,
+#       <x>, <y>, <z> # for 3D objects
+#
+# Classes in MOT:
+#   1: 'pedestrian'
+#   2: 'person on vehicle'
+#   3: 'car'
+#   4: 'bicycle'
+#   5: 'motorbike'
+#   6: 'non motorized vehicle'
+#   7: 'static person'
+#   8: 'distractor'
+#   9: 'occluder'
+#   10: 'occluder on the ground',
+#   11: 'occluder full'
+#   12: 'reflection'
+#
+#   USELESS classes and IGNORES classes will not be selected
+#   into the dataset for reid model training.
+import argparse
+import os
+import os.path as osp
+import random
+
+import mmcv
+import numpy as np
+from mmengine.fileio import list_from_file
+from tqdm import tqdm
+
+USELESS = [3, 4, 5, 6, 9, 10, 11]
+IGNORES = [2, 7, 8, 12, 13]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert MOT dataset into ReID dataset.')
+    parser.add_argument('-i', '--input', help='path of MOT data')
+    parser.add_argument('-o', '--output', help='path to save ReID dataset')
+    parser.add_argument(
+        '--val-split',
+        type=float,
+        default=0.2,
+        help='proportion of the validation dataset to the whole ReID dataset')
+    parser.add_argument(
+        '--vis-threshold',
+        type=float,
+        default=0.3,
+        help='threshold of visibility for each person')
+    parser.add_argument(
+        '--min-per-person',
+        type=int,
+        default=8,
+        help='minimum number of images for each person')
+    parser.add_argument(
+        '--max-per-person',
+        type=int,
+        default=1000,
+        help='maxmum number of images for each person')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    if not osp.isdir(args.output):
+        os.makedirs(args.output, exist_ok=True)
+
+    in_folder = osp.join(args.input, 'train')
+    video_names = os.listdir(in_folder)
+    if 'MOT17' in in_folder:
+        video_names = [
+            video_name for video_name in video_names if 'FRCNN' in video_name
+        ]
+    is_mot15 = True if 'MOT15' in in_folder else False
+    for video_name in tqdm(video_names):
+        # load video infos
+        video_folder = osp.join(in_folder, video_name)
+        infos = list_from_file(f'{video_folder}/seqinfo.ini')
+        # video-level infos
+        assert video_name == infos[1].strip().split('=')[1]
+        raw_img_folder = infos[2].strip().split('=')[1]
+        raw_img_names = os.listdir(f'{video_folder}/{raw_img_folder}')
+        raw_img_names = sorted(raw_img_names)
+        num_raw_imgs = int(infos[4].strip().split('=')[1])
+        assert num_raw_imgs == len(raw_img_names)
+
+        reid_train_folder = osp.join(args.output, 'imgs')
+        if not osp.exists(reid_train_folder):
+            os.makedirs(reid_train_folder)
+        gts = list_from_file(f'{video_folder}/gt/gt.txt')
+        last_frame_id = -1
+        for gt in gts:
+            gt = gt.strip().split(',')
+            frame_id, ins_id = map(int, gt[:2])
+            ltwh = list(map(float, gt[2:6]))
+            if is_mot15:
+                class_id = 1
+                visibility = 1.
+            else:
+                class_id = int(gt[7])
+                visibility = float(gt[8])
+            if class_id in USELESS:
+                continue
+            elif class_id in IGNORES:
+                continue
+            elif visibility < args.vis_threshold:
+                continue
+            reid_img_folder = osp.join(reid_train_folder,
+                                       f'{video_name}_{ins_id:06d}')
+            if not osp.exists(reid_img_folder):
+                os.makedirs(reid_img_folder)
+            idx = len(os.listdir(reid_img_folder))
+            reid_img_name = f'{idx:06d}.jpg'
+            if frame_id != last_frame_id:
+                raw_img_name = raw_img_names[frame_id - 1]
+                raw_img = mmcv.imread(
+                    f'{video_folder}/{raw_img_folder}/{raw_img_name}')
+                last_frame_id = frame_id
+            xyxy = np.asarray(
+                [ltwh[0], ltwh[1], ltwh[0] + ltwh[2], ltwh[1] + ltwh[3]])
+            reid_img = mmcv.imcrop(raw_img, xyxy)
+            mmcv.imwrite(reid_img, f'{reid_img_folder}/{reid_img_name}')
+
+    reid_meta_folder = osp.join(args.output, 'meta')
+    if not osp.exists(reid_meta_folder):
+        os.makedirs(reid_meta_folder)
+    reid_train_list = []
+    reid_val_list = []
+    reid_img_folder_names = sorted(os.listdir(reid_train_folder))
+    num_ids = len(reid_img_folder_names)
+    num_train_ids = int(num_ids * (1 - args.val_split))
+    train_label, val_label = 0, 0
+    random.seed(0)
+    for reid_img_folder_name in reid_img_folder_names[:num_train_ids]:
+        reid_img_names = os.listdir(
+            f'{reid_train_folder}/{reid_img_folder_name}')
+        # ignore ids whose number of image is less than min_per_person
+        if (len(reid_img_names) < args.min_per_person):
+            continue
+        # downsampling when there are too many images owned by one id
+        if (len(reid_img_names) > args.max_per_person):
+            reid_img_names = random.sample(reid_img_names, args.max_per_person)
+        # training set
+        for reid_img_name in reid_img_names:
+            reid_train_list.append(
+                f'{reid_img_folder_name}/{reid_img_name} {train_label}\n')
+        train_label += 1
+    reid_entire_dataset_list = reid_train_list.copy()
+    for reid_img_folder_name in reid_img_folder_names[num_train_ids:]:
+        reid_img_names = os.listdir(
+            f'{reid_train_folder}/{reid_img_folder_name}')
+        # ignore ids whose number of image is less than min_per_person
+        if (len(reid_img_names) < args.min_per_person):
+            continue
+        # downsampling when there are too many images owned by one id
+        if (len(reid_img_names) > args.max_per_person):
+            reid_img_names = random.sample(reid_img_names, args.max_per_person)
+        for reid_img_name in reid_img_names:
+            # validation set
+            reid_val_list.append(
+                f'{reid_img_folder_name}/{reid_img_name} {val_label}\n')
+            reid_entire_dataset_list.append(
+                f'{reid_img_folder_name}/{reid_img_name} '
+                f'{train_label + val_label}\n')
+        val_label += 1
+    with open(
+            osp.join(reid_meta_folder,
+                     f'train_{int(100 * (1 - args.val_split))}.txt'),
+            'w') as f:
+        f.writelines(reid_train_list)
+    with open(
+            osp.join(reid_meta_folder, f'val_{int(100 * args.val_split)}.txt'),
+            'w') as f:
+        f.writelines(reid_val_list)
+    with open(osp.join(reid_meta_folder, 'train.txt'), 'w') as f:
+        f.writelines(reid_entire_dataset_list)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/openimages2odvg.py b/mmde/mmdet/.mim/tools/dataset_converters/openimages2odvg.py
new file mode 100644
index 0000000000000000000000000000000000000000..d700a4146a32da90da2d04da1063d0408e3f56c5
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/openimages2odvg.py
@@ -0,0 +1,187 @@
+import argparse
+import copy
+import csv
+import json
+import os.path as osp
+
+import jsonlines
+from mmcv.image import imfrombytes
+from mmengine.fileio import get
+
+
+def _parse_label_file(label_file):
+    index_list = []
+    classes_names = []
+    with open(label_file, 'r') as f:
+        reader = csv.reader(f)
+        for line in reader:
+            classes_names.append(line[1])
+            index_list.append(line[0])
+    index_mapping = {index: i for i, index in enumerate(index_list)}
+    return classes_names, index_mapping
+
+
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+
+def oi2odvg(args):
+    ann_file = osp.join(args.input_dir, 'oidv6-train-annotations-bbox.csv')
+    label_file = osp.join(args.input_dir, 'class-descriptions-boxable.csv')
+
+    classes_names, index_mapping = _parse_label_file(label_file)
+
+    label_map = {}
+    for class_name, idx in index_mapping.items():
+        class_name = classes_names[idx]
+        label_map[str(idx)] = class_name
+
+    if args.out_ann is None:
+        output = osp.join(args.input_dir, 'openimages_label_map.json')
+    else:
+        output = osp.join(
+            osp.dirname(args.out_ann), 'openimages_label_map.json')
+    with open(output, 'w') as f:
+        json.dump(label_map, f)
+
+    metas = []
+    skip_count = 0
+    with open(ann_file, 'r') as f:
+        reader = csv.reader(f)
+        last_img_id = None
+        _filename_shape = [0, 0]
+        instances = []
+        for i, line in enumerate(reader):
+            if i == 0:
+                continue
+            img_id = line[0]
+            if last_img_id is None:
+                last_img_id = img_id
+            label_id = line[2]
+
+            filename = f'{img_id}.jpg'
+            label = index_mapping[label_id]
+            category = label_map[str(label)]
+            bbox = [
+                float(line[4]),  # xmin
+                float(line[6]),  # ymin
+                float(line[5]),  # xmax
+                float(line[7])  # ymax
+            ]
+
+            # is_occluded = True if int(line[8]) == 1 else False
+            # is_truncated = True if int(line[9]) == 1 else False
+            is_group_of = True if int(line[10]) == 1 else False
+            # is_depiction = True if int(line[11]) == 1 else False
+            # is_inside = True if int(line[12]) == 1 else False
+
+            # if any([is_occluded, is_truncated, is_group_of,
+            # is_depiction, is_inside]):
+            if is_group_of:
+                print(f'skip {filename} of one instance')
+                skip_count += 1
+                continue
+
+            # denormalize
+            if filename != _filename_shape[0]:
+                if args.img_prefix is not None:
+                    _filename = osp.join(
+                        osp.dirname(args.input_dir), args.img_prefix, filename)
+                else:
+                    _filename = osp.join(osp.dirname(args.input_dir), filename)
+                img_bytes = get(_filename, backend_args)
+                img = imfrombytes(img_bytes, flag='color')
+                shape = img.shape
+                _filename_shape = [filename, shape]
+            else:
+                shape = _filename_shape[1]
+
+            h, w = shape[:2]
+            bbox = [
+                max(bbox[0] * w, 0),
+                max(bbox[1] * h, 0),
+                min(bbox[2] * w, w),
+                min(bbox[3] * h, h)
+            ]
+
+            x1, y1, x2, y2 = bbox
+            inter_w = max(0, min(x2, w) - max(x1, 0))
+            inter_h = max(0, min(y2, h) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                continue
+            if w < 1 or h < 1:
+                continue
+
+            instance = {
+                'filename': filename,
+                'height': h,
+                'width': w,
+                'bbox': bbox,
+                'label': label,
+                'category': category
+            }
+
+            if img_id != last_img_id:
+                copy_instances = copy.deepcopy(instances)
+                for copy_instance in copy_instances:
+                    _filename = copy_instance.pop('filename')
+                    _h = copy_instance.pop('height')
+                    _w = copy_instance.pop('width')
+
+                meta_ifo = {
+                    'filename': _filename,
+                    'height': _h,
+                    'width': _w,
+                    'detection': {
+                        'instances': copy_instances
+                    }
+                }
+                metas.append(meta_ifo)
+                instances = []
+            instances.append(instance)
+            last_img_id = img_id
+
+        for instance in instances:
+            _filename = instance.pop('filename')
+            _h = instance.pop('height')
+            _w = instance.pop('width')
+        meta_ifo = {
+            'filename': _filename,
+            'height': _h,
+            'width': _w,
+            'detection': {
+                'instances': instances
+            }
+        }
+        metas.append(meta_ifo)
+
+    if args.out_ann is None:
+        out_path = osp.join(args.input_dir, 'oidv6-train-annotations_od.json')
+    else:
+        out_path = args.out_ann
+
+    with jsonlines.open(out_path, mode='w') as writer:
+        writer.write_all(metas)
+
+    print('skip {} instances'.format(skip_count))
+    print('save to {}'.format(out_path))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        'openimages to odvg format.', add_help=True)
+    parser.add_argument(
+        '--input-dir',
+        default='data/OpenImages/annotations',
+        type=str,
+        help='input list name')
+    parser.add_argument('--img-prefix', default='OpenImages/train/')
+    parser.add_argument('--out-ann', '-o', type=str)
+    args = parser.parse_args()
+
+    oi2odvg(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/pascal_voc.py b/mmde/mmdet/.mim/tools/dataset_converters/pascal_voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd238bfcf2826c4c57c5dc2f60969171421e9062
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/pascal_voc.py
@@ -0,0 +1,238 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+import xml.etree.ElementTree as ET
+
+import numpy as np
+from mmengine.fileio import dump, list_from_file
+from mmengine.utils import mkdir_or_exist, track_progress
+
+from mmdet.evaluation import voc_classes
+
+label_ids = {name: i for i, name in enumerate(voc_classes())}
+
+
+def parse_xml(args):
+    xml_path, img_path = args
+    tree = ET.parse(xml_path)
+    root = tree.getroot()
+    size = root.find('size')
+    w = int(size.find('width').text)
+    h = int(size.find('height').text)
+    bboxes = []
+    labels = []
+    bboxes_ignore = []
+    labels_ignore = []
+    for obj in root.findall('object'):
+        name = obj.find('name').text
+        label = label_ids[name]
+        difficult = int(obj.find('difficult').text)
+        bnd_box = obj.find('bndbox')
+        bbox = [
+            int(bnd_box.find('xmin').text),
+            int(bnd_box.find('ymin').text),
+            int(bnd_box.find('xmax').text),
+            int(bnd_box.find('ymax').text)
+        ]
+        if difficult:
+            bboxes_ignore.append(bbox)
+            labels_ignore.append(label)
+        else:
+            bboxes.append(bbox)
+            labels.append(label)
+    if not bboxes:
+        bboxes = np.zeros((0, 4))
+        labels = np.zeros((0, ))
+    else:
+        bboxes = np.array(bboxes, ndmin=2) - 1
+        labels = np.array(labels)
+    if not bboxes_ignore:
+        bboxes_ignore = np.zeros((0, 4))
+        labels_ignore = np.zeros((0, ))
+    else:
+        bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1
+        labels_ignore = np.array(labels_ignore)
+    annotation = {
+        'filename': img_path,
+        'width': w,
+        'height': h,
+        'ann': {
+            'bboxes': bboxes.astype(np.float32),
+            'labels': labels.astype(np.int64),
+            'bboxes_ignore': bboxes_ignore.astype(np.float32),
+            'labels_ignore': labels_ignore.astype(np.int64)
+        }
+    }
+    return annotation
+
+
+def cvt_annotations(devkit_path, years, split, out_file):
+    if not isinstance(years, list):
+        years = [years]
+    annotations = []
+    for year in years:
+        filelist = osp.join(devkit_path,
+                            f'VOC{year}/ImageSets/Main/{split}.txt')
+        if not osp.isfile(filelist):
+            print(f'filelist does not exist: {filelist}, '
+                  f'skip voc{year} {split}')
+            return
+        img_names = list_from_file(filelist)
+        xml_paths = [
+            osp.join(devkit_path, f'VOC{year}/Annotations/{img_name}.xml')
+            for img_name in img_names
+        ]
+        img_paths = [
+            f'VOC{year}/JPEGImages/{img_name}.jpg' for img_name in img_names
+        ]
+        part_annotations = track_progress(parse_xml,
+                                          list(zip(xml_paths, img_paths)))
+        annotations.extend(part_annotations)
+    if out_file.endswith('json'):
+        annotations = cvt_to_coco_json(annotations)
+    dump(annotations, out_file)
+    return annotations
+
+
+def cvt_to_coco_json(annotations):
+    image_id = 0
+    annotation_id = 0
+    coco = dict()
+    coco['images'] = []
+    coco['type'] = 'instance'
+    coco['categories'] = []
+    coco['annotations'] = []
+    image_set = set()
+
+    def addAnnItem(annotation_id, image_id, category_id, bbox, difficult_flag):
+        annotation_item = dict()
+        annotation_item['segmentation'] = []
+
+        seg = []
+        # bbox[] is x1,y1,x2,y2
+        # left_top
+        seg.append(int(bbox[0]))
+        seg.append(int(bbox[1]))
+        # left_bottom
+        seg.append(int(bbox[0]))
+        seg.append(int(bbox[3]))
+        # right_bottom
+        seg.append(int(bbox[2]))
+        seg.append(int(bbox[3]))
+        # right_top
+        seg.append(int(bbox[2]))
+        seg.append(int(bbox[1]))
+
+        annotation_item['segmentation'].append(seg)
+
+        xywh = np.array(
+            [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]])
+        annotation_item['area'] = int(xywh[2] * xywh[3])
+        if difficult_flag == 1:
+            annotation_item['ignore'] = 0
+            annotation_item['iscrowd'] = 1
+        else:
+            annotation_item['ignore'] = 0
+            annotation_item['iscrowd'] = 0
+        annotation_item['image_id'] = int(image_id)
+        annotation_item['bbox'] = xywh.astype(int).tolist()
+        annotation_item['category_id'] = int(category_id)
+        annotation_item['id'] = int(annotation_id)
+        coco['annotations'].append(annotation_item)
+        return annotation_id + 1
+
+    for category_id, name in enumerate(voc_classes()):
+        category_item = dict()
+        category_item['supercategory'] = str('none')
+        category_item['id'] = int(category_id)
+        category_item['name'] = str(name)
+        coco['categories'].append(category_item)
+
+    for ann_dict in annotations:
+        file_name = ann_dict['filename']
+        ann = ann_dict['ann']
+        assert file_name not in image_set
+        image_item = dict()
+        image_item['id'] = int(image_id)
+        image_item['file_name'] = str(file_name)
+        image_item['height'] = int(ann_dict['height'])
+        image_item['width'] = int(ann_dict['width'])
+        coco['images'].append(image_item)
+        image_set.add(file_name)
+
+        bboxes = ann['bboxes'][:, :4]
+        labels = ann['labels']
+        for bbox_id in range(len(bboxes)):
+            bbox = bboxes[bbox_id]
+            label = labels[bbox_id]
+            annotation_id = addAnnItem(
+                annotation_id, image_id, label, bbox, difficult_flag=0)
+
+        bboxes_ignore = ann['bboxes_ignore'][:, :4]
+        labels_ignore = ann['labels_ignore']
+        for bbox_id in range(len(bboxes_ignore)):
+            bbox = bboxes_ignore[bbox_id]
+            label = labels_ignore[bbox_id]
+            annotation_id = addAnnItem(
+                annotation_id, image_id, label, bbox, difficult_flag=1)
+
+        image_id += 1
+
+    return coco
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert PASCAL VOC annotations to mmdetection format')
+    parser.add_argument('devkit_path', help='pascal voc devkit path')
+    parser.add_argument('-o', '--out-dir', help='output path')
+    parser.add_argument(
+        '--out-format',
+        default='pkl',
+        choices=('pkl', 'coco'),
+        help='output format, "coco" indicates coco annotation format')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    devkit_path = args.devkit_path
+    out_dir = args.out_dir if args.out_dir else devkit_path
+    mkdir_or_exist(out_dir)
+
+    years = []
+    if osp.isdir(osp.join(devkit_path, 'VOC2007')):
+        years.append('2007')
+    if osp.isdir(osp.join(devkit_path, 'VOC2012')):
+        years.append('2012')
+    if '2007' in years and '2012' in years:
+        years.append(['2007', '2012'])
+    if not years:
+        raise IOError(f'The devkit path {devkit_path} contains neither '
+                      '"VOC2007" nor "VOC2012" subfolder')
+    out_fmt = f'.{args.out_format}'
+    if args.out_format == 'coco':
+        out_fmt = '.json'
+    for year in years:
+        if year == '2007':
+            prefix = 'voc07'
+        elif year == '2012':
+            prefix = 'voc12'
+        elif year == ['2007', '2012']:
+            prefix = 'voc0712'
+        for split in ['train', 'val', 'trainval']:
+            dataset_name = prefix + '_' + split
+            print(f'processing {dataset_name} ...')
+            cvt_annotations(devkit_path, year, split,
+                            osp.join(out_dir, dataset_name + out_fmt))
+        if not isinstance(year, list):
+            dataset_name = prefix + '_test'
+            print(f'processing {dataset_name} ...')
+            cvt_annotations(devkit_path, year, 'test',
+                            osp.join(out_dir, dataset_name + out_fmt))
+    print('Done!')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/prepare_coco_semantic_annos_from_panoptic_annos.py b/mmde/mmdet/.mim/tools/dataset_converters/prepare_coco_semantic_annos_from_panoptic_annos.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b9ee592cb35d222fe2de0ce3a5ee135cdd63d3d
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/prepare_coco_semantic_annos_from_panoptic_annos.py
@@ -0,0 +1,899 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from https://github.com/facebookresearch/Mask2Former/blob/main/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py # noqa
+
+import argparse
+import functools
+import json
+import multiprocessing as mp
+import os
+import time
+
+import numpy as np
+from panopticapi.utils import rgb2id
+from PIL import Image
+
+COCO_CATEGORIES = [
+    {
+        'color': [220, 20, 60],
+        'isthing': 1,
+        'id': 1,
+        'name': 'person'
+    },
+    {
+        'color': [119, 11, 32],
+        'isthing': 1,
+        'id': 2,
+        'name': 'bicycle'
+    },
+    {
+        'color': [0, 0, 142],
+        'isthing': 1,
+        'id': 3,
+        'name': 'car'
+    },
+    {
+        'color': [0, 0, 230],
+        'isthing': 1,
+        'id': 4,
+        'name': 'motorcycle'
+    },
+    {
+        'color': [106, 0, 228],
+        'isthing': 1,
+        'id': 5,
+        'name': 'airplane'
+    },
+    {
+        'color': [0, 60, 100],
+        'isthing': 1,
+        'id': 6,
+        'name': 'bus'
+    },
+    {
+        'color': [0, 80, 100],
+        'isthing': 1,
+        'id': 7,
+        'name': 'train'
+    },
+    {
+        'color': [0, 0, 70],
+        'isthing': 1,
+        'id': 8,
+        'name': 'truck'
+    },
+    {
+        'color': [0, 0, 192],
+        'isthing': 1,
+        'id': 9,
+        'name': 'boat'
+    },
+    {
+        'color': [250, 170, 30],
+        'isthing': 1,
+        'id': 10,
+        'name': 'traffic light'
+    },
+    {
+        'color': [100, 170, 30],
+        'isthing': 1,
+        'id': 11,
+        'name': 'fire hydrant'
+    },
+    {
+        'color': [220, 220, 0],
+        'isthing': 1,
+        'id': 13,
+        'name': 'stop sign'
+    },
+    {
+        'color': [175, 116, 175],
+        'isthing': 1,
+        'id': 14,
+        'name': 'parking meter'
+    },
+    {
+        'color': [250, 0, 30],
+        'isthing': 1,
+        'id': 15,
+        'name': 'bench'
+    },
+    {
+        'color': [165, 42, 42],
+        'isthing': 1,
+        'id': 16,
+        'name': 'bird'
+    },
+    {
+        'color': [255, 77, 255],
+        'isthing': 1,
+        'id': 17,
+        'name': 'cat'
+    },
+    {
+        'color': [0, 226, 252],
+        'isthing': 1,
+        'id': 18,
+        'name': 'dog'
+    },
+    {
+        'color': [182, 182, 255],
+        'isthing': 1,
+        'id': 19,
+        'name': 'horse'
+    },
+    {
+        'color': [0, 82, 0],
+        'isthing': 1,
+        'id': 20,
+        'name': 'sheep'
+    },
+    {
+        'color': [120, 166, 157],
+        'isthing': 1,
+        'id': 21,
+        'name': 'cow'
+    },
+    {
+        'color': [110, 76, 0],
+        'isthing': 1,
+        'id': 22,
+        'name': 'elephant'
+    },
+    {
+        'color': [174, 57, 255],
+        'isthing': 1,
+        'id': 23,
+        'name': 'bear'
+    },
+    {
+        'color': [199, 100, 0],
+        'isthing': 1,
+        'id': 24,
+        'name': 'zebra'
+    },
+    {
+        'color': [72, 0, 118],
+        'isthing': 1,
+        'id': 25,
+        'name': 'giraffe'
+    },
+    {
+        'color': [255, 179, 240],
+        'isthing': 1,
+        'id': 27,
+        'name': 'backpack'
+    },
+    {
+        'color': [0, 125, 92],
+        'isthing': 1,
+        'id': 28,
+        'name': 'umbrella'
+    },
+    {
+        'color': [209, 0, 151],
+        'isthing': 1,
+        'id': 31,
+        'name': 'handbag'
+    },
+    {
+        'color': [188, 208, 182],
+        'isthing': 1,
+        'id': 32,
+        'name': 'tie'
+    },
+    {
+        'color': [0, 220, 176],
+        'isthing': 1,
+        'id': 33,
+        'name': 'suitcase'
+    },
+    {
+        'color': [255, 99, 164],
+        'isthing': 1,
+        'id': 34,
+        'name': 'frisbee'
+    },
+    {
+        'color': [92, 0, 73],
+        'isthing': 1,
+        'id': 35,
+        'name': 'skis'
+    },
+    {
+        'color': [133, 129, 255],
+        'isthing': 1,
+        'id': 36,
+        'name': 'snowboard'
+    },
+    {
+        'color': [78, 180, 255],
+        'isthing': 1,
+        'id': 37,
+        'name': 'sports ball'
+    },
+    {
+        'color': [0, 228, 0],
+        'isthing': 1,
+        'id': 38,
+        'name': 'kite'
+    },
+    {
+        'color': [174, 255, 243],
+        'isthing': 1,
+        'id': 39,
+        'name': 'baseball bat'
+    },
+    {
+        'color': [45, 89, 255],
+        'isthing': 1,
+        'id': 40,
+        'name': 'baseball glove'
+    },
+    {
+        'color': [134, 134, 103],
+        'isthing': 1,
+        'id': 41,
+        'name': 'skateboard'
+    },
+    {
+        'color': [145, 148, 174],
+        'isthing': 1,
+        'id': 42,
+        'name': 'surfboard'
+    },
+    {
+        'color': [255, 208, 186],
+        'isthing': 1,
+        'id': 43,
+        'name': 'tennis racket'
+    },
+    {
+        'color': [197, 226, 255],
+        'isthing': 1,
+        'id': 44,
+        'name': 'bottle'
+    },
+    {
+        'color': [171, 134, 1],
+        'isthing': 1,
+        'id': 46,
+        'name': 'wine glass'
+    },
+    {
+        'color': [109, 63, 54],
+        'isthing': 1,
+        'id': 47,
+        'name': 'cup'
+    },
+    {
+        'color': [207, 138, 255],
+        'isthing': 1,
+        'id': 48,
+        'name': 'fork'
+    },
+    {
+        'color': [151, 0, 95],
+        'isthing': 1,
+        'id': 49,
+        'name': 'knife'
+    },
+    {
+        'color': [9, 80, 61],
+        'isthing': 1,
+        'id': 50,
+        'name': 'spoon'
+    },
+    {
+        'color': [84, 105, 51],
+        'isthing': 1,
+        'id': 51,
+        'name': 'bowl'
+    },
+    {
+        'color': [74, 65, 105],
+        'isthing': 1,
+        'id': 52,
+        'name': 'banana'
+    },
+    {
+        'color': [166, 196, 102],
+        'isthing': 1,
+        'id': 53,
+        'name': 'apple'
+    },
+    {
+        'color': [208, 195, 210],
+        'isthing': 1,
+        'id': 54,
+        'name': 'sandwich'
+    },
+    {
+        'color': [255, 109, 65],
+        'isthing': 1,
+        'id': 55,
+        'name': 'orange'
+    },
+    {
+        'color': [0, 143, 149],
+        'isthing': 1,
+        'id': 56,
+        'name': 'broccoli'
+    },
+    {
+        'color': [179, 0, 194],
+        'isthing': 1,
+        'id': 57,
+        'name': 'carrot'
+    },
+    {
+        'color': [209, 99, 106],
+        'isthing': 1,
+        'id': 58,
+        'name': 'hot dog'
+    },
+    {
+        'color': [5, 121, 0],
+        'isthing': 1,
+        'id': 59,
+        'name': 'pizza'
+    },
+    {
+        'color': [227, 255, 205],
+        'isthing': 1,
+        'id': 60,
+        'name': 'donut'
+    },
+    {
+        'color': [147, 186, 208],
+        'isthing': 1,
+        'id': 61,
+        'name': 'cake'
+    },
+    {
+        'color': [153, 69, 1],
+        'isthing': 1,
+        'id': 62,
+        'name': 'chair'
+    },
+    {
+        'color': [3, 95, 161],
+        'isthing': 1,
+        'id': 63,
+        'name': 'couch'
+    },
+    {
+        'color': [163, 255, 0],
+        'isthing': 1,
+        'id': 64,
+        'name': 'potted plant'
+    },
+    {
+        'color': [119, 0, 170],
+        'isthing': 1,
+        'id': 65,
+        'name': 'bed'
+    },
+    {
+        'color': [0, 182, 199],
+        'isthing': 1,
+        'id': 67,
+        'name': 'dining table'
+    },
+    {
+        'color': [0, 165, 120],
+        'isthing': 1,
+        'id': 70,
+        'name': 'toilet'
+    },
+    {
+        'color': [183, 130, 88],
+        'isthing': 1,
+        'id': 72,
+        'name': 'tv'
+    },
+    {
+        'color': [95, 32, 0],
+        'isthing': 1,
+        'id': 73,
+        'name': 'laptop'
+    },
+    {
+        'color': [130, 114, 135],
+        'isthing': 1,
+        'id': 74,
+        'name': 'mouse'
+    },
+    {
+        'color': [110, 129, 133],
+        'isthing': 1,
+        'id': 75,
+        'name': 'remote'
+    },
+    {
+        'color': [166, 74, 118],
+        'isthing': 1,
+        'id': 76,
+        'name': 'keyboard'
+    },
+    {
+        'color': [219, 142, 185],
+        'isthing': 1,
+        'id': 77,
+        'name': 'cell phone'
+    },
+    {
+        'color': [79, 210, 114],
+        'isthing': 1,
+        'id': 78,
+        'name': 'microwave'
+    },
+    {
+        'color': [178, 90, 62],
+        'isthing': 1,
+        'id': 79,
+        'name': 'oven'
+    },
+    {
+        'color': [65, 70, 15],
+        'isthing': 1,
+        'id': 80,
+        'name': 'toaster'
+    },
+    {
+        'color': [127, 167, 115],
+        'isthing': 1,
+        'id': 81,
+        'name': 'sink'
+    },
+    {
+        'color': [59, 105, 106],
+        'isthing': 1,
+        'id': 82,
+        'name': 'refrigerator'
+    },
+    {
+        'color': [142, 108, 45],
+        'isthing': 1,
+        'id': 84,
+        'name': 'book'
+    },
+    {
+        'color': [196, 172, 0],
+        'isthing': 1,
+        'id': 85,
+        'name': 'clock'
+    },
+    {
+        'color': [95, 54, 80],
+        'isthing': 1,
+        'id': 86,
+        'name': 'vase'
+    },
+    {
+        'color': [128, 76, 255],
+        'isthing': 1,
+        'id': 87,
+        'name': 'scissors'
+    },
+    {
+        'color': [201, 57, 1],
+        'isthing': 1,
+        'id': 88,
+        'name': 'teddy bear'
+    },
+    {
+        'color': [246, 0, 122],
+        'isthing': 1,
+        'id': 89,
+        'name': 'hair drier'
+    },
+    {
+        'color': [191, 162, 208],
+        'isthing': 1,
+        'id': 90,
+        'name': 'toothbrush'
+    },
+    {
+        'color': [255, 255, 128],
+        'isthing': 0,
+        'id': 92,
+        'name': 'banner'
+    },
+    {
+        'color': [147, 211, 203],
+        'isthing': 0,
+        'id': 93,
+        'name': 'blanket'
+    },
+    {
+        'color': [150, 100, 100],
+        'isthing': 0,
+        'id': 95,
+        'name': 'bridge'
+    },
+    {
+        'color': [168, 171, 172],
+        'isthing': 0,
+        'id': 100,
+        'name': 'cardboard'
+    },
+    {
+        'color': [146, 112, 198],
+        'isthing': 0,
+        'id': 107,
+        'name': 'counter'
+    },
+    {
+        'color': [210, 170, 100],
+        'isthing': 0,
+        'id': 109,
+        'name': 'curtain'
+    },
+    {
+        'color': [92, 136, 89],
+        'isthing': 0,
+        'id': 112,
+        'name': 'door-stuff'
+    },
+    {
+        'color': [218, 88, 184],
+        'isthing': 0,
+        'id': 118,
+        'name': 'floor-wood'
+    },
+    {
+        'color': [241, 129, 0],
+        'isthing': 0,
+        'id': 119,
+        'name': 'flower'
+    },
+    {
+        'color': [217, 17, 255],
+        'isthing': 0,
+        'id': 122,
+        'name': 'fruit'
+    },
+    {
+        'color': [124, 74, 181],
+        'isthing': 0,
+        'id': 125,
+        'name': 'gravel'
+    },
+    {
+        'color': [70, 70, 70],
+        'isthing': 0,
+        'id': 128,
+        'name': 'house'
+    },
+    {
+        'color': [255, 228, 255],
+        'isthing': 0,
+        'id': 130,
+        'name': 'light'
+    },
+    {
+        'color': [154, 208, 0],
+        'isthing': 0,
+        'id': 133,
+        'name': 'mirror-stuff'
+    },
+    {
+        'color': [193, 0, 92],
+        'isthing': 0,
+        'id': 138,
+        'name': 'net'
+    },
+    {
+        'color': [76, 91, 113],
+        'isthing': 0,
+        'id': 141,
+        'name': 'pillow'
+    },
+    {
+        'color': [255, 180, 195],
+        'isthing': 0,
+        'id': 144,
+        'name': 'platform'
+    },
+    {
+        'color': [106, 154, 176],
+        'isthing': 0,
+        'id': 145,
+        'name': 'playingfield'
+    },
+    {
+        'color': [230, 150, 140],
+        'isthing': 0,
+        'id': 147,
+        'name': 'railroad'
+    },
+    {
+        'color': [60, 143, 255],
+        'isthing': 0,
+        'id': 148,
+        'name': 'river'
+    },
+    {
+        'color': [128, 64, 128],
+        'isthing': 0,
+        'id': 149,
+        'name': 'road'
+    },
+    {
+        'color': [92, 82, 55],
+        'isthing': 0,
+        'id': 151,
+        'name': 'roof'
+    },
+    {
+        'color': [254, 212, 124],
+        'isthing': 0,
+        'id': 154,
+        'name': 'sand'
+    },
+    {
+        'color': [73, 77, 174],
+        'isthing': 0,
+        'id': 155,
+        'name': 'sea'
+    },
+    {
+        'color': [255, 160, 98],
+        'isthing': 0,
+        'id': 156,
+        'name': 'shelf'
+    },
+    {
+        'color': [255, 255, 255],
+        'isthing': 0,
+        'id': 159,
+        'name': 'snow'
+    },
+    {
+        'color': [104, 84, 109],
+        'isthing': 0,
+        'id': 161,
+        'name': 'stairs'
+    },
+    {
+        'color': [169, 164, 131],
+        'isthing': 0,
+        'id': 166,
+        'name': 'tent'
+    },
+    {
+        'color': [225, 199, 255],
+        'isthing': 0,
+        'id': 168,
+        'name': 'towel'
+    },
+    {
+        'color': [137, 54, 74],
+        'isthing': 0,
+        'id': 171,
+        'name': 'wall-brick'
+    },
+    {
+        'color': [135, 158, 223],
+        'isthing': 0,
+        'id': 175,
+        'name': 'wall-stone'
+    },
+    {
+        'color': [7, 246, 231],
+        'isthing': 0,
+        'id': 176,
+        'name': 'wall-tile'
+    },
+    {
+        'color': [107, 255, 200],
+        'isthing': 0,
+        'id': 177,
+        'name': 'wall-wood'
+    },
+    {
+        'color': [58, 41, 149],
+        'isthing': 0,
+        'id': 178,
+        'name': 'water-other'
+    },
+    {
+        'color': [183, 121, 142],
+        'isthing': 0,
+        'id': 180,
+        'name': 'window-blind'
+    },
+    {
+        'color': [255, 73, 97],
+        'isthing': 0,
+        'id': 181,
+        'name': 'window-other'
+    },
+    {
+        'color': [107, 142, 35],
+        'isthing': 0,
+        'id': 184,
+        'name': 'tree-merged'
+    },
+    {
+        'color': [190, 153, 153],
+        'isthing': 0,
+        'id': 185,
+        'name': 'fence-merged'
+    },
+    {
+        'color': [146, 139, 141],
+        'isthing': 0,
+        'id': 186,
+        'name': 'ceiling-merged'
+    },
+    {
+        'color': [70, 130, 180],
+        'isthing': 0,
+        'id': 187,
+        'name': 'sky-other-merged'
+    },
+    {
+        'color': [134, 199, 156],
+        'isthing': 0,
+        'id': 188,
+        'name': 'cabinet-merged'
+    },
+    {
+        'color': [209, 226, 140],
+        'isthing': 0,
+        'id': 189,
+        'name': 'table-merged'
+    },
+    {
+        'color': [96, 36, 108],
+        'isthing': 0,
+        'id': 190,
+        'name': 'floor-other-merged'
+    },
+    {
+        'color': [96, 96, 96],
+        'isthing': 0,
+        'id': 191,
+        'name': 'pavement-merged'
+    },
+    {
+        'color': [64, 170, 64],
+        'isthing': 0,
+        'id': 192,
+        'name': 'mountain-merged'
+    },
+    {
+        'color': [152, 251, 152],
+        'isthing': 0,
+        'id': 193,
+        'name': 'grass-merged'
+    },
+    {
+        'color': [208, 229, 228],
+        'isthing': 0,
+        'id': 194,
+        'name': 'dirt-merged'
+    },
+    {
+        'color': [206, 186, 171],
+        'isthing': 0,
+        'id': 195,
+        'name': 'paper-merged'
+    },
+    {
+        'color': [152, 161, 64],
+        'isthing': 0,
+        'id': 196,
+        'name': 'food-other-merged'
+    },
+    {
+        'color': [116, 112, 0],
+        'isthing': 0,
+        'id': 197,
+        'name': 'building-other-merged'
+    },
+    {
+        'color': [0, 114, 143],
+        'isthing': 0,
+        'id': 198,
+        'name': 'rock-merged'
+    },
+    {
+        'color': [102, 102, 156],
+        'isthing': 0,
+        'id': 199,
+        'name': 'wall-other-merged'
+    },
+    {
+        'color': [250, 141, 255],
+        'isthing': 0,
+        'id': 200,
+        'name': 'rug-merged'
+    },
+]
+
+
+def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments,
+                                  id_map):
+    panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32)
+    panoptic = rgb2id(panoptic)
+    output = np.zeros_like(panoptic, dtype=np.uint8) + 255
+    for seg in segments:
+        cat_id = seg['category_id']
+        new_cat_id = id_map[cat_id]
+        output[panoptic == seg['id']] = new_cat_id
+    Image.fromarray(output).save(output_semantic)
+
+
+def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root,
+                                         sem_seg_root, categories):
+    """Create semantic segmentation annotations from panoptic segmentation
+    annotations, to be used by PanopticFPN.
+
+    It maps all thing categories to class 0, and maps all
+    unlabeled pixels to class 255.
+    It maps all stuff categories to contiguous ids starting from 1.
+    Args:
+        panoptic_json (str): path to the panoptic json file, in COCO's format.
+        panoptic_root (str): a directory with panoptic annotation files, in
+            COCO's format.
+        sem_seg_root (str): a directory to output semantic annotation files
+        categories (list[dict]): category metadata. Each dict needs to have:
+            "id": corresponds to the "category_id" in the json annotations
+            "isthing": 0 or 1
+    """
+    os.makedirs(sem_seg_root, exist_ok=True)
+
+    id_map = {}  # map from category id to id in the output semantic annotation
+    assert len(categories) <= 254
+    for i, k in enumerate(categories):
+        id_map[k['id']] = i
+    # what is id = 0?
+    # id_map[0] = 255
+    print(id_map)
+
+    with open(panoptic_json) as f:
+        obj = json.load(f)
+
+    pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4))
+
+    def iter_annotations():
+        for anno in obj['annotations']:
+            file_name = anno['file_name']
+            segments = anno['segments_info']
+            input = os.path.join(panoptic_root, file_name)
+            output = os.path.join(sem_seg_root, file_name)
+            yield input, output, segments
+
+    print('Start writing to {} ...'.format(sem_seg_root))
+    start = time.time()
+    pool.starmap(
+        functools.partial(_process_panoptic_to_semantic, id_map=id_map),
+        iter_annotations(),
+        chunksize=100,
+    )
+    print('Finished. time: {:.2f}s'.format(time.time() - start))
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=\
+        'Convert COCO Stuff 164k annotations to mmdet format')  # noqa
+    parser.add_argument('coco_path', help='coco stuff path')
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    dataset_dir = args.coco_path
+    for s in ['val2017', 'train2017']:
+        separate_coco_semantic_from_panoptic(
+            os.path.join(dataset_dir,
+                         'annotations/panoptic_{}.json'.format(s)),
+            os.path.join(dataset_dir, 'annotations/panoptic_{}'.format(s)),
+            os.path.join(dataset_dir,
+                         'annotations/panoptic_semseg_{}'.format(s)),
+            COCO_CATEGORIES,
+        )
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/refcoco2odvg.py b/mmde/mmdet/.mim/tools/dataset_converters/refcoco2odvg.py
new file mode 100644
index 0000000000000000000000000000000000000000..c11869b3855d4be5fb1a005a48c2587c0f818433
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/refcoco2odvg.py
@@ -0,0 +1,147 @@
+import argparse
+import os.path as osp
+
+import jsonlines
+from pycocotools.coco import COCO
+from tqdm import tqdm
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='refcoco to odvg')
+    parser.add_argument('mdetr_anno_dir', type=str)
+    parser.add_argument('--out-dir', '-o', type=str)
+    args = parser.parse_args()
+    return args
+
+
+def _has_only_empty_bbox(anno):
+    return all(any(o <= 1 for o in obj['bbox'][2:]) for obj in anno)
+
+
+def has_valid_annotation(anno):
+    # if it's empty, there is no annotation
+    if len(anno) == 0:
+        return False
+    # if all boxes have close to zero area, there is no annotation
+    if _has_only_empty_bbox(anno):
+        return False
+    return True
+
+
+def process_item(args, filename):
+    path = osp.join(args.mdetr_anno_dir, filename)
+    coco = COCO(path)
+
+    ids = list(sorted(coco.imgs.keys()))
+
+    out_results = []
+    for img_id in tqdm(ids):
+        if isinstance(img_id, str):
+            ann_ids = coco.getAnnIds(imgIds=[img_id], iscrowd=0)
+        else:
+            ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=0)
+        annos = coco.loadAnns(ann_ids)
+        if not has_valid_annotation(annos):
+            continue
+
+        img_info = coco.loadImgs(img_id)[0]
+        file_name = img_info['file_name']
+        caption = img_info['caption']
+
+        regions = {}
+
+        for anno in annos:
+            box = anno['bbox']
+            tokens_positive = anno['tokens_positive']
+            x1, y1, w, h = box
+            inter_w = max(0, min(x1 + w, int(img_info['width'])) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, int(img_info['height'])) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                continue
+            if anno['area'] <= 0 or w < 1 or h < 1:
+                continue
+
+            if anno.get('iscrowd', False):
+                continue
+            bbox_xyxy = [
+                x1, y1,
+                min(x1 + w, int(img_info['width'])),
+                min(y1 + h, int(img_info['height']))
+            ]
+
+            tokens_positive = sorted(tokens_positive, key=lambda x: x[0])
+
+            phrase = []
+            pre_end_index = -10
+            for token in tokens_positive:
+                start_index = token[0]
+                end_index = token[1]
+                if pre_end_index + 1 == start_index:
+                    if caption[token[0] - 1] == ' ':
+                        phrase[
+                            -1] = phrase[-1] + ' ' + caption[token[0]:token[1]]
+                    else:
+                        phrase.append(caption[token[0]:token[1]])
+                else:
+                    phrase.append(caption[token[0]:token[1]])
+                pre_end_index = end_index
+
+            key = ' '.join(phrase)
+
+            if key not in regions:
+                regions[key] = {
+                    'bbox': bbox_xyxy,
+                    'phrase': phrase,
+                    'tokens_positive': tokens_positive
+                }
+            else:
+                old_box = regions[key]['bbox']
+                if isinstance(old_box[0], list):
+                    old_box.append(bbox_xyxy)
+                else:
+                    old_box = [old_box, bbox_xyxy]
+
+                regions[key]['bbox'] = old_box
+
+        out_dict = {
+            'filename': file_name,
+            'height': int(img_info['height']),
+            'width': int(img_info['width']),
+            'grounding': {
+                'caption': caption
+            }
+        }
+
+        region_list = []
+        for key, value in regions.items():
+            phrase = value['phrase']
+            if len(phrase) == 1:
+                phrase = phrase[0]
+            region_list.append({
+                'bbox': value['bbox'],
+                'phrase': phrase,
+                'tokens_positive': value['tokens_positive']
+            })
+        out_dict['grounding']['regions'] = region_list
+        out_results.append(out_dict)
+
+    if args.out_dir is None:
+        out_path = osp.join(args.mdetr_anno_dir, filename[:-5] + '_vg.json')
+    else:
+        out_path = osp.join(args.out_dir, filename[:-5] + '_vg.json')
+
+    with jsonlines.open(out_path, mode='w') as writer:
+        writer.write_all(out_results)
+    print(f'save to {out_path}')
+
+
+def main():
+    args = parse_args()
+    process_item(args, 'finetune_refcoco_train.json')
+    process_item(args, 'finetune_refcoco+_train.json')
+    process_item(args, 'finetune_refcocog_train.json')
+    process_item(args, 'finetune_grefcoco_train.json')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/remove_cocotrain2017_from_refcoco.py b/mmde/mmdet/.mim/tools/dataset_converters/remove_cocotrain2017_from_refcoco.py
new file mode 100644
index 0000000000000000000000000000000000000000..7de2a9ec4e2ef882d16e235e71f42c5241fc68a9
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/remove_cocotrain2017_from_refcoco.py
@@ -0,0 +1,110 @@
+import argparse
+import json
+import os.path as osp
+
+import mmengine
+from pycocotools.coco import COCO
+
+
+def diff_image_id(coco2017_train_ids, ref_ids):
+    set1 = set(coco2017_train_ids)
+    set2 = set(ref_ids)
+    intersection = set1.intersection(set2)
+    result = set1 - intersection
+    return result
+
+
+def gen_new_json(coco2017_train_path, json_data, coco2017_train_ids):
+    coco = COCO(coco2017_train_path)
+    new_json_data = {
+        'info': json_data['info'],
+        'licenses': json_data['licenses'],
+        'categories': json_data['categories'],
+        'images': [],
+        'annotations': []
+    }
+
+    for id in coco2017_train_ids:
+        ann_ids = coco.getAnnIds(imgIds=[id])
+        img_ann_info = coco.loadAnns(ann_ids)
+        img_info = coco.loadImgs([id])[0]
+
+        new_json_data['images'].append(img_info)
+        new_json_data['annotations'].extend(img_ann_info)
+    return new_json_data
+
+
+# coco2017 val and final_mixed_train.json have no intersection,
+# so deduplication is not necessary.
+
+# coco2017 val and datasets like refcoco based on coco2014 train
+# have no intersection, so deduplication is not necessary.
+
+
+# coco2017 train and datasets like refcoco based on coco2014
+# train have overlapping annotations in the validation set,
+# so deduplication is required.
+def exclude_coco(args):
+    with open(args.coco2017_train, 'r') as f:
+        coco2017_train = json.load(f)
+    coco2017_train_ids = [train['id'] for train in coco2017_train['images']]
+    orig_len = len(coco2017_train_ids)
+
+    with open(osp.join(args.mdetr_anno_dir, 'finetune_refcoco_val.json'),
+              'r') as f:
+        refcoco_ann = json.load(f)
+    refcoco_ids = [refcoco['original_id'] for refcoco in refcoco_ann['images']]
+    coco2017_train_ids = diff_image_id(coco2017_train_ids, refcoco_ids)
+
+    with open(
+            osp.join(args.mdetr_anno_dir, 'finetune_refcoco+_val.json'),
+            'r') as f:
+        refcoco_plus_ann = json.load(f)
+    refcoco_plus_ids = [
+        refcoco['original_id'] for refcoco in refcoco_plus_ann['images']
+    ]
+    coco2017_train_ids = diff_image_id(coco2017_train_ids, refcoco_plus_ids)
+
+    with open(
+            osp.join(args.mdetr_anno_dir, 'finetune_refcocog_val.json'),
+            'r') as f:
+        refcocog_ann = json.load(f)
+    refcocog_ids = [
+        refcoco['original_id'] for refcoco in refcocog_ann['images']
+    ]
+    coco2017_train_ids = diff_image_id(coco2017_train_ids, refcocog_ids)
+
+    with open(
+            osp.join(args.mdetr_anno_dir, 'finetune_grefcoco_val.json'),
+            'r') as f:
+        grefcoco_ann = json.load(f)
+    grefcoco_ids = [
+        refcoco['original_id'] for refcoco in grefcoco_ann['images']
+    ]
+    coco2017_train_ids = diff_image_id(coco2017_train_ids, grefcoco_ids)
+
+    coco2017_train_ids = list(coco2017_train_ids)
+    print(
+        'remove {} images from coco2017_train'.format(orig_len -
+                                                      len(coco2017_train_ids)))
+
+    new_json_data = gen_new_json(args.coco2017_train, coco2017_train,
+                                 coco2017_train_ids)
+    if args.out_ann is None:
+        out_ann = osp.dirname(
+            args.coco2017_train) + '/instances_train2017_norefval.json'
+        mmengine.dump(new_json_data, out_ann)
+        print('save new json to {}'.format(out_ann))
+    else:
+        mmengine.dump(new_json_data, args.out_ann)
+        print('save new json to {}'.format(args.out_ann))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('coco to odvg format.', add_help=True)
+    parser.add_argument('mdetr_anno_dir', type=str)
+    parser.add_argument('coco2017_train', type=str)
+    parser.add_argument('--out-ann', '-o', type=str)
+    args = parser.parse_args()
+
+    exclude_coco(args)
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/scripts/preprocess_coco2017.sh b/mmde/mmdet/.mim/tools/dataset_converters/scripts/preprocess_coco2017.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f2986d09430eb69b2f316bf5acf439ff7c30d1c9
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/scripts/preprocess_coco2017.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+DOWNLOAD_DIR=$1
+DATA_ROOT=$2
+
+unzip $DOWNLOAD_DIR/OpenDataLab___COCO_2017/raw/Images/val2017.zip -d $DATA_ROOT
+unzip $DOWNLOAD_DIR/OpenDataLab___COCO_2017/raw/Images/train2017.zip -d $DATA_ROOT
+unzip $DOWNLOAD_DIR/OpenDataLab___COCO_2017/raw/Images/test2017.zip -d $DATA_ROOT/
+unzip $DOWNLOAD_DIR/OpenDataLab___COCO_2017/raw/Images/unlabeled2017.zip -d $DATA_ROOT
+unzip $DOWNLOAD_DIR/OpenDataLab___COCO_2017/raw/Annotations/stuff_annotations_trainval2017.zip -d $DATA_ROOT/
+unzip $DOWNLOAD_DIR/OpenDataLab___COCO_2017/raw/Annotations/panoptic_annotations_trainval2017.zip -d $DATA_ROOT/
+unzip $DOWNLOAD_DIR/OpenDataLab___COCO_2017/raw/Annotations/image_info_unlabeled2017.zip -d $DATA_ROOT/
+unzip $DOWNLOAD_DIR/OpenDataLab___COCO_2017/raw/Annotations/image_info_test2017.zip -d $DATA_ROOT/
+unzip $DOWNLOAD_DIR/OpenDataLab___COCO_2017/raw/Annotations/annotations_trainval2017.zip -d $DATA_ROOT
+rm -rf $DOWNLOAD_DIR/OpenDataLab___COCO_2017
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/scripts/preprocess_voc2007.sh b/mmde/mmdet/.mim/tools/dataset_converters/scripts/preprocess_voc2007.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9d265c745ea7b78a63394c03face7d387683c05a
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/scripts/preprocess_voc2007.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+DOWNLOAD_DIR=$1
+DATA_ROOT=$2
+
+tar -xvf $DOWNLOAD_DIR/OpenDataLab___PASCAL_VOC2007/raw/VOCtrainval_06-Nov-2007.tar -C $DATA_ROOT
+tar -xvf $DOWNLOAD_DIR/OpenDataLab___PASCAL_VOC2007/raw/VOCtestnoimgs_06-Nov-2007.tar -C $DATA_ROOT
+rm -rf $DOWNLOAD_DIR/OpenDataLab___PASCAL_VOC2007
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/scripts/preprocess_voc2012.sh b/mmde/mmdet/.mim/tools/dataset_converters/scripts/preprocess_voc2012.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e6f9ba6d824d18b059bd8e0bb982eea39f125be3
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/scripts/preprocess_voc2012.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+DOWNLOAD_DIR=$1
+DATA_ROOT=$2
+
+tar -xvf $DOWNLOAD_DIR/OpenDataLab___PASCAL_VOC2012/raw/VOCtrainval_11-May-2012.tar -C $DATA_ROOT
+tar -xvf $DOWNLOAD_DIR/OpenDataLab___PASCAL_VOC2012/raw/VOC2012test.tar -C $DATA_ROOT
+rm -rf $DOWNLOAD_DIR/OpenDataLab___PASCAL_VOC2012
diff --git a/mmde/mmdet/.mim/tools/dataset_converters/youtubevis2coco.py b/mmde/mmdet/.mim/tools/dataset_converters/youtubevis2coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a864f43a30e68fee0de96eb6ff14a04dc2c3b79f
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dataset_converters/youtubevis2coco.py
@@ -0,0 +1,157 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import copy
+import os
+import os.path as osp
+from collections import defaultdict
+
+import mmengine
+from tqdm import tqdm
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='YouTube-VIS to COCO Video format')
+    parser.add_argument(
+        '-i',
+        '--input',
+        help='root directory of YouTube-VIS annotations',
+    )
+    parser.add_argument(
+        '-o',
+        '--output',
+        help='directory to save coco formatted label file',
+    )
+    parser.add_argument(
+        '--version',
+        choices=['2019', '2021'],
+        help='The version of YouTube-VIS Dataset',
+    )
+    return parser.parse_args()
+
+
+def convert_vis(ann_dir, save_dir, dataset_version, mode='train'):
+    """Convert YouTube-VIS dataset in COCO style.
+
+    Args:
+        ann_dir (str): The path of YouTube-VIS dataset.
+        save_dir (str): The path to save `VIS`.
+        dataset_version (str): The version of dataset. Options are '2019',
+            '2021'.
+        mode (str): Convert train dataset or validation dataset or test
+            dataset. Options are 'train', 'valid', 'test'. Default: 'train'.
+    """
+    assert dataset_version in ['2019', '2021']
+    assert mode in ['train', 'valid', 'test']
+    VIS = defaultdict(list)
+    records = dict(vid_id=1, img_id=1, ann_id=1, global_instance_id=1)
+    obj_num_classes = dict()
+
+    if dataset_version == '2019':
+        official_anns = mmengine.load(osp.join(ann_dir, f'{mode}.json'))
+    elif dataset_version == '2021':
+        official_anns = mmengine.load(
+            osp.join(ann_dir, mode, 'instances.json'))
+    VIS['categories'] = copy.deepcopy(official_anns['categories'])
+
+    has_annotations = mode == 'train'
+    if has_annotations:
+        vid_to_anns = defaultdict(list)
+        for ann_info in official_anns['annotations']:
+            vid_to_anns[ann_info['video_id']].append(ann_info)
+
+    video_infos = official_anns['videos']
+    for video_info in tqdm(video_infos):
+        video_name = video_info['file_names'][0].split(os.sep)[0]
+        video = dict(
+            id=video_info['id'],
+            name=video_name,
+            width=video_info['width'],
+            height=video_info['height'])
+        VIS['videos'].append(video)
+
+        num_frames = len(video_info['file_names'])
+        width = video_info['width']
+        height = video_info['height']
+        if has_annotations:
+            ann_infos_in_video = vid_to_anns[video_info['id']]
+            instance_id_maps = dict()
+
+        for frame_id in range(num_frames):
+            image = dict(
+                file_name=video_info['file_names'][frame_id],
+                height=height,
+                width=width,
+                id=records['img_id'],
+                frame_id=frame_id,
+                video_id=video_info['id'])
+            VIS['images'].append(image)
+
+            if has_annotations:
+                for ann_info in ann_infos_in_video:
+                    bbox = ann_info['bboxes'][frame_id]
+                    if bbox is None:
+                        continue
+
+                    category_id = ann_info['category_id']
+                    track_id = ann_info['id']
+                    segmentation = ann_info['segmentations'][frame_id]
+                    area = ann_info['areas'][frame_id]
+                    assert isinstance(category_id, int)
+                    assert isinstance(track_id, int)
+                    assert segmentation is not None
+                    assert area is not None
+
+                    if track_id in instance_id_maps:
+                        instance_id = instance_id_maps[track_id]
+                    else:
+                        instance_id = records['global_instance_id']
+                        records['global_instance_id'] += 1
+                        instance_id_maps[track_id] = instance_id
+
+                    ann = dict(
+                        id=records['ann_id'],
+                        video_id=video_info['id'],
+                        image_id=records['img_id'],
+                        category_id=category_id,
+                        instance_id=instance_id,
+                        bbox=bbox,
+                        segmentation=segmentation,
+                        area=area,
+                        iscrowd=ann_info['iscrowd'])
+
+                    if category_id not in obj_num_classes:
+                        obj_num_classes[category_id] = 1
+                    else:
+                        obj_num_classes[category_id] += 1
+
+                    VIS['annotations'].append(ann)
+                    records['ann_id'] += 1
+            records['img_id'] += 1
+        records['vid_id'] += 1
+
+    if not osp.isdir(save_dir):
+        os.makedirs(save_dir)
+    mmengine.dump(
+        VIS, osp.join(save_dir, f'youtube_vis_{dataset_version}_{mode}.json'))
+    print(f'-----YouTube VIS {dataset_version} {mode}------')
+    print(f'{records["vid_id"]- 1} videos')
+    print(f'{records["img_id"]- 1} images')
+    if has_annotations:
+        print(f'{records["ann_id"] - 1} objects')
+        print(f'{records["global_instance_id"] - 1} instances')
+    print('-----------------------')
+    if has_annotations:
+        for i in range(1, len(VIS['categories']) + 1):
+            class_name = VIS['categories'][i - 1]['name']
+            print(f'Class {i} {class_name} has {obj_num_classes[i]} objects.')
+
+
+def main():
+    args = parse_args()
+    for sub_set in ['train', 'valid', 'test']:
+        convert_vis(args.input, args.output, args.version, sub_set)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/deployment/mmdet2torchserve.py b/mmde/mmdet/.mim/tools/deployment/mmdet2torchserve.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d539e8e9f57df8d69a82905223406a4a85ce20b
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/deployment/mmdet2torchserve.py
@@ -0,0 +1,112 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from mmengine.config import Config
+from mmengine.utils import mkdir_or_exist
+
+try:
+    from model_archiver.model_packaging import package_model
+    from model_archiver.model_packaging_utils import ModelExportUtils
+except ImportError:
+    package_model = None
+
+
+def mmdet2torchserve(
+    config_file: str,
+    checkpoint_file: str,
+    output_folder: str,
+    model_name: str,
+    model_version: str = '1.0',
+    force: bool = False,
+):
+    """Converts MMDetection model (config + checkpoint) to TorchServe `.mar`.
+
+    Args:
+        config_file:
+            In MMDetection config format.
+            The contents vary for each task repository.
+        checkpoint_file:
+            In MMDetection checkpoint format.
+            The contents vary for each task repository.
+        output_folder:
+            Folder where `{model_name}.mar` will be created.
+            The file created will be in TorchServe archive format.
+        model_name:
+            If not None, used for naming the `{model_name}.mar` file
+            that will be created under `output_folder`.
+            If None, `{Path(checkpoint_file).stem}` will be used.
+        model_version:
+            Model's version.
+        force:
+            If True, if there is an existing `{model_name}.mar`
+            file under `output_folder` it will be overwritten.
+    """
+    mkdir_or_exist(output_folder)
+
+    config = Config.fromfile(config_file)
+
+    with TemporaryDirectory() as tmpdir:
+        config.dump(f'{tmpdir}/config.py')
+
+        args = Namespace(
+            **{
+                'model_file': f'{tmpdir}/config.py',
+                'config_file': f'{tmpdir}/config.py',
+                'serialized_file': checkpoint_file,
+                'handler': f'{Path(__file__).parent}/mmdet_handler.py',
+                'model_name': model_name or Path(checkpoint_file).stem,
+                'version': model_version,
+                'export_path': output_folder,
+                'force': force,
+                'requirements_file': None,
+                'extra_files': None,
+                'runtime': 'python',
+                'archive_format': 'default'
+            })
+        manifest = ModelExportUtils.generate_manifest_json(args)
+        package_model(args, manifest)
+
+
+def parse_args():
+    parser = ArgumentParser(
+        description='Convert MMDetection models to TorchServe `.mar` format.')
+    parser.add_argument('config', type=str, help='config file path')
+    parser.add_argument('checkpoint', type=str, help='checkpoint file path')
+    parser.add_argument(
+        '--output-folder',
+        type=str,
+        required=True,
+        help='Folder where `{model_name}.mar` will be created.')
+    parser.add_argument(
+        '--model-name',
+        type=str,
+        default=None,
+        help='If not None, used for naming the `{model_name}.mar`'
+        'file that will be created under `output_folder`.'
+        'If None, `{Path(checkpoint_file).stem}` will be used.')
+    parser.add_argument(
+        '--model-version',
+        type=str,
+        default='1.0',
+        help='Number used for versioning.')
+    parser.add_argument(
+        '-f',
+        '--force',
+        action='store_true',
+        help='overwrite the existing `{model_name}.mar`')
+    args = parser.parse_args()
+
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    if package_model is None:
+        raise ImportError('`torch-model-archiver` is required.'
+                          'Try: pip install torch-model-archiver')
+
+    mmdet2torchserve(args.config, args.checkpoint, args.output_folder,
+                     args.model_name, args.model_version, args.force)
diff --git a/mmde/mmdet/.mim/tools/deployment/mmdet_handler.py b/mmde/mmdet/.mim/tools/deployment/mmdet_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..28c93c99f6e3bb0898f8ac2237e890f4e261cc7f
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/deployment/mmdet_handler.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import base64
+import os
+
+import mmcv
+import numpy as np
+import torch
+from ts.torch_handler.base_handler import BaseHandler
+
+from mmdet.apis import inference_detector, init_detector
+
+
+class MMdetHandler(BaseHandler):
+    threshold = 0.5
+
+    def initialize(self, context):
+        properties = context.system_properties
+        self.map_location = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = torch.device(self.map_location + ':' +
+                                   str(properties.get('gpu_id')) if torch.cuda.
+                                   is_available() else self.map_location)
+        self.manifest = context.manifest
+
+        model_dir = properties.get('model_dir')
+        serialized_file = self.manifest['model']['serializedFile']
+        checkpoint = os.path.join(model_dir, serialized_file)
+        self.config_file = os.path.join(model_dir, 'config.py')
+
+        self.model = init_detector(self.config_file, checkpoint, self.device)
+        self.initialized = True
+
+    def preprocess(self, data):
+        images = []
+
+        for row in data:
+            image = row.get('data') or row.get('body')
+            if isinstance(image, str):
+                image = base64.b64decode(image)
+            image = mmcv.imfrombytes(image)
+            images.append(image)
+
+        return images
+
+    def inference(self, data, *args, **kwargs):
+        results = inference_detector(self.model, data)
+        return results
+
+    def postprocess(self, data):
+        # Format output following the example ObjectDetectionHandler format
+        output = []
+        for data_sample in data:
+            pred_instances = data_sample.pred_instances
+            bboxes = pred_instances.bboxes.cpu().numpy().astype(
+                np.float32).tolist()
+            labels = pred_instances.labels.cpu().numpy().astype(
+                np.int32).tolist()
+            scores = pred_instances.scores.cpu().numpy().astype(
+                np.float32).tolist()
+            preds = []
+            for idx in range(len(labels)):
+                cls_score, bbox, cls_label = scores[idx], bboxes[idx], labels[
+                    idx]
+                if cls_score >= self.threshold:
+                    class_name = self.model.dataset_meta['classes'][cls_label]
+                    result = dict(
+                        class_label=cls_label,
+                        class_name=class_name,
+                        bbox=bbox,
+                        score=cls_score)
+                    preds.append(result)
+            output.append(preds)
+        return output
diff --git a/mmde/mmdet/.mim/tools/deployment/test_torchserver.py b/mmde/mmdet/.mim/tools/deployment/test_torchserver.py
new file mode 100644
index 0000000000000000000000000000000000000000..5160a2fbdefb67550967047eef04b6104e4abd5f
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/deployment/test_torchserver.py
@@ -0,0 +1,113 @@
+import os
+from argparse import ArgumentParser
+
+import mmcv
+import requests
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.apis import inference_detector, init_detector
+from mmdet.registry import VISUALIZERS
+from mmdet.structures import DetDataSample
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('img', help='Image file')
+    parser.add_argument('config', help='Config file')
+    parser.add_argument('checkpoint', help='Checkpoint file')
+    parser.add_argument('model_name', help='The model name in the server')
+    parser.add_argument(
+        '--inference-addr',
+        default='127.0.0.1:8080',
+        help='Address and port of the inference server')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--score-thr', type=float, default=0.5, help='bbox score threshold')
+    parser.add_argument(
+        '--work-dir',
+        type=str,
+        default=None,
+        help='output directory to save drawn results.')
+    args = parser.parse_args()
+    return args
+
+
+def align_ts_output(inputs, metainfo, device):
+    bboxes = []
+    labels = []
+    scores = []
+    for i, pred in enumerate(inputs):
+        bboxes.append(pred['bbox'])
+        labels.append(pred['class_label'])
+        scores.append(pred['score'])
+    pred_instances = InstanceData(metainfo=metainfo)
+    pred_instances.bboxes = torch.tensor(
+        bboxes, dtype=torch.float32, device=device)
+    pred_instances.labels = torch.tensor(
+        labels, dtype=torch.int64, device=device)
+    pred_instances.scores = torch.tensor(
+        scores, dtype=torch.float32, device=device)
+    ts_data_sample = DetDataSample(pred_instances=pred_instances)
+    return ts_data_sample
+
+
+def main(args):
+    # build the model from a config file and a checkpoint file
+    model = init_detector(args.config, args.checkpoint, device=args.device)
+    # test a single image
+    pytorch_results = inference_detector(model, args.img)
+    keep = pytorch_results.pred_instances.scores >= args.score_thr
+    pytorch_results.pred_instances = pytorch_results.pred_instances[keep]
+
+    # init visualizer
+    visualizer = VISUALIZERS.build(model.cfg.visualizer)
+    # the dataset_meta is loaded from the checkpoint and
+    # then pass to the model in init_detector
+    visualizer.dataset_meta = model.dataset_meta
+
+    # show the results
+    img = mmcv.imread(args.img)
+    img = mmcv.imconvert(img, 'bgr', 'rgb')
+    pt_out_file = None
+    ts_out_file = None
+    if args.work_dir is not None:
+        os.makedirs(args.work_dir, exist_ok=True)
+        pt_out_file = os.path.join(args.work_dir, 'pytorch_result.png')
+        ts_out_file = os.path.join(args.work_dir, 'torchserve_result.png')
+    visualizer.add_datasample(
+        'pytorch_result',
+        img.copy(),
+        data_sample=pytorch_results,
+        draw_gt=False,
+        out_file=pt_out_file,
+        show=True,
+        wait_time=0)
+
+    url = 'http://' + args.inference_addr + '/predictions/' + args.model_name
+    with open(args.img, 'rb') as image:
+        response = requests.post(url, image)
+    metainfo = pytorch_results.pred_instances.metainfo
+    ts_results = align_ts_output(response.json(), metainfo, args.device)
+
+    visualizer.add_datasample(
+        'torchserve_result',
+        img,
+        data_sample=ts_results,
+        draw_gt=False,
+        out_file=ts_out_file,
+        show=True,
+        wait_time=0)
+
+    assert torch.allclose(pytorch_results.pred_instances.bboxes,
+                          ts_results.pred_instances.bboxes)
+    assert torch.allclose(pytorch_results.pred_instances.labels,
+                          ts_results.pred_instances.labels)
+    assert torch.allclose(pytorch_results.pred_instances.scores,
+                          ts_results.pred_instances.scores)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
diff --git a/mmde/mmdet/.mim/tools/dist_test.sh b/mmde/mmdet/.mim/tools/dist_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dea131b43ea8f1222661d20603d40c18ea7f28a1
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dist_test.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/test.py \
+    $CONFIG \
+    $CHECKPOINT \
+    --launcher pytorch \
+    ${@:4}
diff --git a/mmde/mmdet/.mim/tools/dist_test_tracking.sh b/mmde/mmdet/.mim/tools/dist_test_tracking.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fd282e07adaef0a735685829cce02dc1d853fdea
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dist_test_tracking.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/test_tracking.py \
+    $CONFIG \
+    --launcher pytorch \
+    ${@:3}
diff --git a/mmde/mmdet/.mim/tools/dist_train.sh b/mmde/mmdet/.mim/tools/dist_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3fca7641dec4090930c85991a079c28409529d4e
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/dist_train.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/train.py \
+    $CONFIG \
+    --launcher pytorch ${@:3}
diff --git a/mmde/mmdet/.mim/tools/misc/download_dataset.py b/mmde/mmdet/.mim/tools/misc/download_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d801d208c4d0c4369e8df76a52ad5324df88d7b
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/misc/download_dataset.py
@@ -0,0 +1,229 @@
+import argparse
+import tarfile
+from itertools import repeat
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from tarfile import TarFile
+from zipfile import ZipFile
+
+import torch
+from mmengine.utils.path import mkdir_or_exist
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Download datasets for training')
+    parser.add_argument(
+        '--dataset-name', type=str, help='dataset name', default='coco2017')
+    parser.add_argument(
+        '--save-dir',
+        type=str,
+        help='the dir to save dataset',
+        default='data/coco')
+    parser.add_argument(
+        '--unzip',
+        action='store_true',
+        help='whether unzip dataset or not, zipped files will be saved')
+    parser.add_argument(
+        '--delete',
+        action='store_true',
+        help='delete the download zipped files')
+    parser.add_argument(
+        '--threads', type=int, help='number of threading', default=4)
+    args = parser.parse_args()
+    return args
+
+
+def download(url, dir, unzip=True, delete=False, threads=1):
+
+    def download_one(url, dir):
+        f = dir / Path(url).name
+        if Path(url).is_file():
+            Path(url).rename(f)
+        elif not f.exists():
+            print(f'Downloading {url} to {f}')
+            torch.hub.download_url_to_file(url, f, progress=True)
+        if unzip and f.suffix in ('.zip', '.tar'):
+            print(f'Unzipping {f.name}')
+            if f.suffix == '.zip':
+                ZipFile(f).extractall(path=dir)
+            elif f.suffix == '.tar':
+                TarFile(f).extractall(path=dir)
+            if delete:
+                f.unlink()
+                print(f'Delete {f}')
+
+    dir = Path(dir)
+    if threads > 1:
+        pool = ThreadPool(threads)
+        pool.imap(lambda x: download_one(*x), zip(url, repeat(dir)))
+        pool.close()
+        pool.join()
+    else:
+        for u in [url] if isinstance(url, (str, Path)) else url:
+            download_one(u, dir)
+
+
+def download_objects365v2(url, dir, unzip=True, delete=False, threads=1):
+
+    def download_single(url, dir):
+
+        if 'train' in url:
+            saving_dir = dir / Path('train_zip')
+            mkdir_or_exist(saving_dir)
+            f = saving_dir / Path(url).name
+
+            unzip_dir = dir / Path('train')
+            mkdir_or_exist(unzip_dir)
+        elif 'val' in url:
+            saving_dir = dir / Path('val')
+            mkdir_or_exist(saving_dir)
+            f = saving_dir / Path(url).name
+
+            unzip_dir = dir / Path('val')
+            mkdir_or_exist(unzip_dir)
+        else:
+            raise NotImplementedError
+
+        if Path(url).is_file():
+            Path(url).rename(f)
+        elif not f.exists():
+            print(f'Downloading {url} to {f}')
+            torch.hub.download_url_to_file(url, f, progress=True)
+
+        if unzip and str(f).endswith('.tar.gz'):
+            print(f'Unzipping {f.name}')
+            tar = tarfile.open(f)
+            tar.extractall(path=unzip_dir)
+            if delete:
+                f.unlink()
+                print(f'Delete {f}')
+
+    # process annotations
+    full_url = []
+    for _url in url:
+        if 'zhiyuan_objv2_train.tar.gz' in _url or \
+                'zhiyuan_objv2_val.json' in _url:
+            full_url.append(_url)
+        elif 'train' in _url:
+            for i in range(51):
+                full_url.append(f'{_url}patch{i}.tar.gz')
+        elif 'val/images/v1' in _url:
+            for i in range(16):
+                full_url.append(f'{_url}patch{i}.tar.gz')
+        elif 'val/images/v2' in _url:
+            for i in range(16, 44):
+                full_url.append(f'{_url}patch{i}.tar.gz')
+        else:
+            raise NotImplementedError
+
+    dir = Path(dir)
+    if threads > 1:
+        pool = ThreadPool(threads)
+        pool.imap(lambda x: download_single(*x), zip(full_url, repeat(dir)))
+        pool.close()
+        pool.join()
+    else:
+        for u in full_url:
+            download_single(u, dir)
+
+
+def main():
+    args = parse_args()
+    path = Path(args.save_dir)
+    if not path.exists():
+        path.mkdir(parents=True, exist_ok=True)
+    data2url = dict(
+        # TODO: Support for downloading Panoptic Segmentation of COCO
+        coco2017=[
+            'http://images.cocodataset.org/zips/train2017.zip',
+            'http://images.cocodataset.org/zips/val2017.zip',
+            'http://images.cocodataset.org/zips/test2017.zip',
+            'http://images.cocodataset.org/zips/unlabeled2017.zip',
+            'http://images.cocodataset.org/annotations/annotations_trainval2017.zip',  # noqa
+            'http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip',  # noqa
+            'http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip',  # noqa
+            'http://images.cocodataset.org/annotations/image_info_test2017.zip',  # noqa
+            'http://images.cocodataset.org/annotations/image_info_unlabeled2017.zip',  # noqa
+        ],
+        coco2014=[
+            'http://images.cocodataset.org/zips/train2014.zip',
+            'http://images.cocodataset.org/zips/val2014.zip',
+            'http://images.cocodataset.org/zips/test2014.zip',
+            'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',  # noqa
+            'http://images.cocodataset.org/annotations/image_info_test2014.zip'  # noqa
+        ],
+        lvis=[
+            'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip',  # noqa
+            'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip',  # noqa
+        ],
+        voc2007=[
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',  # noqa
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',  # noqa
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar',  # noqa
+        ],
+        voc2012=[
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar',  # noqa
+        ],
+        balloon=[
+            # src link: https://github.com/matterport/Mask_RCNN/releases/download/v2.1/balloon_dataset.zip # noqa
+            'https://download.openmmlab.com/mmyolo/data/balloon_dataset.zip'
+        ],
+        # Note: There is no download link for Objects365-V1 right now. If you
+        # would like to download Objects365-V1, please visit
+        # http://www.objects365.org/ to concat the author.
+        objects365v2=[
+            # training annotations
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/train/zhiyuan_objv2_train.tar.gz',  # noqa
+            # validation annotations
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/val/zhiyuan_objv2_val.json',  # noqa
+            # training url root
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/train/',  # noqa
+            # validation url root_1
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/val/images/v1/',  # noqa
+            # validation url root_2
+            'https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/val/images/v2/'  # noqa
+        ],
+        ade20k_2016=[
+            # training images and semantic segmentation annotations
+            'http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip',  # noqa
+            # instance segmentation annotations
+            'http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar',  # noqa
+            # img categories ids
+            'https://raw.githubusercontent.com/CSAILVision/placeschallenge/master/instancesegmentation/imgCatIds.json',  # noqa
+            # category mapping
+            'https://raw.githubusercontent.com/CSAILVision/placeschallenge/master/instancesegmentation/categoryMapping.txt'  # noqa
+        ],
+        refcoco=[
+            # images
+            'http://images.cocodataset.org/zips/train2014.zip',
+            # refcoco annotations
+            'https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip',
+            # refcoco+ annotations
+            'https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip',
+            # refcocog annotations
+            'https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip'
+        ])
+    url = data2url.get(args.dataset_name, None)
+    if url is None:
+        print('Only support ADE20K, COCO, RefCOCO, VOC, LVIS, '
+              'balloon, and Objects365v2 now!')
+        return
+    if args.dataset_name == 'objects365v2':
+        download_objects365v2(
+            url,
+            dir=path,
+            unzip=args.unzip,
+            delete=args.delete,
+            threads=args.threads)
+    else:
+        download(
+            url,
+            dir=path,
+            unzip=args.unzip,
+            delete=args.delete,
+            threads=args.threads)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/misc/gen_coco_panoptic_test_info.py b/mmde/mmdet/.mim/tools/misc/gen_coco_panoptic_test_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc928e66f0a6e8b02488c8959f0a12fbf331bb5b
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/misc/gen_coco_panoptic_test_info.py
@@ -0,0 +1,33 @@
+import argparse
+import os.path as osp
+
+from mmengine.fileio import dump, load
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Generate COCO test image information '
+        'for COCO panoptic segmentation.')
+    parser.add_argument('data_root', help='Path to COCO annotation directory.')
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+    data_root = args.data_root
+    val_info = load(osp.join(data_root, 'panoptic_val2017.json'))
+    test_old_info = load(osp.join(data_root, 'image_info_test-dev2017.json'))
+
+    # replace categories from image_info_test-dev2017.json
+    # with categories from panoptic_val2017.json which
+    # has attribute `isthing`.
+    test_info = test_old_info
+    test_info.update({'categories': val_info['categories']})
+    dump(test_info, osp.join(data_root,
+                             'panoptic_image_info_test-dev2017.json'))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/misc/get_crowdhuman_id_hw.py b/mmde/mmdet/.mim/tools/misc/get_crowdhuman_id_hw.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ed9142a42383768cd57246676fc8e7011a38056
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/misc/get_crowdhuman_id_hw.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Get image shape on CrowdHuman dataset.
+
+Here is an example to run this script.
+
+Example:
+    python tools/misc/get_crowdhuman_id_hw.py ${CONFIG} \
+    --dataset ${DATASET_TYPE}
+"""
+import argparse
+import json
+import logging
+import os.path as osp
+from multiprocessing import Pool
+
+import mmcv
+from mmengine.config import Config
+from mmengine.fileio import dump, get, get_text
+from mmengine.logging import print_log
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Collect image metas')
+    parser.add_argument('config', help='Config file path')
+    parser.add_argument(
+        '--dataset',
+        choices=['train', 'val'],
+        help='Collect image metas from which dataset')
+    parser.add_argument(
+        '--nproc',
+        default=10,
+        type=int,
+        help='Processes used for get image metas')
+    args = parser.parse_args()
+    return args
+
+
+def get_image_metas(anno_str, img_prefix):
+    id_hw = {}
+    anno_dict = json.loads(anno_str)
+    img_path = osp.join(img_prefix, f"{anno_dict['ID']}.jpg")
+    img_id = anno_dict['ID']
+    img_bytes = get(img_path)
+    img = mmcv.imfrombytes(img_bytes, backend='cv2')
+    id_hw[img_id] = img.shape[:2]
+    return id_hw
+
+
+def main():
+    args = parse_args()
+
+    # get ann_file and img_prefix from config files
+    cfg = Config.fromfile(args.config)
+    dataset = args.dataset
+    dataloader_cfg = cfg.get(f'{dataset}_dataloader')
+    ann_file = osp.join(dataloader_cfg.dataset.data_root,
+                        dataloader_cfg.dataset.ann_file)
+    img_prefix = osp.join(dataloader_cfg.dataset.data_root,
+                          dataloader_cfg.dataset.data_prefix['img'])
+
+    # load image metas
+    print_log(
+        f'loading CrowdHuman {dataset} annotation...', level=logging.INFO)
+    anno_strs = get_text(ann_file).strip().split('\n')
+    pool = Pool(args.nproc)
+    # get image metas with multiple processes
+    id_hw_temp = pool.starmap(
+        get_image_metas,
+        zip(anno_strs, [img_prefix for _ in range(len(anno_strs))]),
+    )
+    pool.close()
+
+    # save image metas
+    id_hw = {}
+    for sub_dict in id_hw_temp:
+        id_hw.update(sub_dict)
+
+    data_root = osp.dirname(ann_file)
+    save_path = osp.join(data_root, f'id_hw_{dataset}.json')
+    print_log(
+        f'\nsaving "id_hw_{dataset}.json" in "{data_root}"',
+        level=logging.INFO)
+    dump(id_hw, save_path, file_format='json')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/misc/get_image_metas.py b/mmde/mmdet/.mim/tools/misc/get_image_metas.py
new file mode 100644
index 0000000000000000000000000000000000000000..5644fa8c1ab7c65583374d65a4e68b3faed86d42
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/misc/get_image_metas.py
@@ -0,0 +1,125 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Get image metas on a specific dataset.
+
+Here is an example to run this script.
+
+Example:
+    python tools/misc/get_image_metas.py ${CONFIG} \
+    --out ${OUTPUT FILE NAME}
+"""
+import argparse
+import csv
+import os.path as osp
+from multiprocessing import Pool
+
+import mmcv
+from mmengine.config import Config
+from mmengine.fileio import dump, get
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Collect image metas')
+    parser.add_argument('config', help='Config file path')
+    parser.add_argument(
+        '--dataset',
+        default='val',
+        choices=['train', 'val', 'test'],
+        help='Collect image metas from which dataset')
+    parser.add_argument(
+        '--out',
+        default='validation-image-metas.pkl',
+        help='The output image metas file name. The save dir is in the '
+        'same directory as `dataset.ann_file` path')
+    parser.add_argument(
+        '--nproc',
+        default=4,
+        type=int,
+        help='Processes used for get image metas')
+    args = parser.parse_args()
+    return args
+
+
+def get_metas_from_csv_style_ann_file(ann_file):
+    data_infos = []
+    cp_filename = None
+    with open(ann_file, 'r') as f:
+        reader = csv.reader(f)
+        for i, line in enumerate(reader):
+            if i == 0:
+                continue
+            img_id = line[0]
+            filename = f'{img_id}.jpg'
+            if filename != cp_filename:
+                data_infos.append(dict(filename=filename))
+                cp_filename = filename
+    return data_infos
+
+
+def get_metas_from_txt_style_ann_file(ann_file):
+    with open(ann_file) as f:
+        lines = f.readlines()
+    i = 0
+    data_infos = []
+    while i < len(lines):
+        filename = lines[i].rstrip()
+        data_infos.append(dict(filename=filename))
+        skip_lines = int(lines[i + 2]) + 3
+        i += skip_lines
+    return data_infos
+
+
+def get_image_metas(data_info, img_prefix):
+    filename = data_info.get('filename', None)
+    if filename is not None:
+        if img_prefix is not None:
+            filename = osp.join(img_prefix, filename)
+        img_bytes = get(filename)
+        img = mmcv.imfrombytes(img_bytes, flag='color')
+        shape = img.shape
+        meta = dict(filename=filename, ori_shape=shape)
+    else:
+        raise NotImplementedError('Missing `filename` in data_info')
+    return meta
+
+
+def main():
+    args = parse_args()
+    assert args.out.endswith('pkl'), 'The output file name must be pkl suffix'
+
+    # load config files
+    cfg = Config.fromfile(args.config)
+    dataloader_cfg = cfg.get(f'{args.dataset}_dataloader')
+    ann_file = osp.join(dataloader_cfg.dataset.data_root,
+                        dataloader_cfg.dataset.ann_file)
+    img_prefix = osp.join(dataloader_cfg.dataset.data_root,
+                          dataloader_cfg.dataset.data_prefix['img'])
+
+    print(f'{"-" * 5} Start Processing {"-" * 5}')
+    if ann_file.endswith('csv'):
+        data_infos = get_metas_from_csv_style_ann_file(ann_file)
+    elif ann_file.endswith('txt'):
+        data_infos = get_metas_from_txt_style_ann_file(ann_file)
+    else:
+        shuffix = ann_file.split('.')[-1]
+        raise NotImplementedError('File name must be csv or txt suffix but '
+                                  f'get {shuffix}')
+
+    print(f'Successfully load annotation file from {ann_file}')
+    print(f'Processing {len(data_infos)} images...')
+    pool = Pool(args.nproc)
+    # get image metas with multiple processes
+    image_metas = pool.starmap(
+        get_image_metas,
+        zip(data_infos, [img_prefix for _ in range(len(data_infos))]),
+    )
+    pool.close()
+
+    # save image metas
+    root_path = dataloader_cfg.dataset.ann_file.rsplit('/', 1)[0]
+    save_path = osp.join(root_path, args.out)
+    dump(image_metas, save_path, protocol=4)
+    print(f'Image meta file save to: {save_path}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/misc/print_config.py b/mmde/mmdet/.mim/tools/misc/print_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..291943bc2ba093080b6e4f5d498b1f0d615c8458
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/misc/print_config.py
@@ -0,0 +1,60 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+
+from mmengine import Config, DictAction
+
+from mmdet.utils import replace_cfg_vals, update_data_root
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Print the whole config')
+    parser.add_argument('config', help='config file path')
+    parser.add_argument(
+        '--save-path',
+        default=None,
+        help='save path of whole config, suffixed with .py, .json or .yml')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+
+    # replace the ${key} with the value of cfg.key
+    cfg = replace_cfg_vals(cfg)
+
+    # update data root according to MMDET_DATASETS
+    update_data_root(cfg)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    print(f'Config:\n{cfg.pretty_text}')
+
+    if args.save_path is not None:
+        save_path = args.save_path
+
+        suffix = os.path.splitext(save_path)[-1]
+        assert suffix in ['.py', '.json', '.yml']
+
+        if not os.path.exists(os.path.split(save_path)[0]):
+            os.makedirs(os.path.split(save_path)[0])
+        cfg.dump(save_path)
+        print(f'Config saving at {save_path}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/misc/split_coco.py b/mmde/mmdet/.mim/tools/misc/split_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..075623f3d7237eef3f4dfe343bbce8d53829f129
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/misc/split_coco.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+
+import numpy as np
+from mmengine.fileio import dump, load
+from mmengine.utils import mkdir_or_exist, track_parallel_progress
+
+prog_description = '''K-Fold coco split.
+
+To split coco data for semi-supervised object detection:
+    python tools/misc/split_coco.py
+'''
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--data-root',
+        type=str,
+        help='The data root of coco dataset.',
+        default='./data/coco/')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        help='The output directory of coco semi-supervised annotations.',
+        default='./data/coco/semi_anns/')
+    parser.add_argument(
+        '--labeled-percent',
+        type=float,
+        nargs='+',
+        help='The percentage of labeled data in the training set.',
+        default=[1, 2, 5, 10])
+    parser.add_argument(
+        '--fold',
+        type=int,
+        help='K-fold cross validation for semi-supervised object detection.',
+        default=5)
+    args = parser.parse_args()
+    return args
+
+
+def split_coco(data_root, out_dir, percent, fold):
+    """Split COCO data for Semi-supervised object detection.
+
+    Args:
+        data_root (str): The data root of coco dataset.
+        out_dir (str): The output directory of coco semi-supervised
+            annotations.
+        percent (float): The percentage of labeled data in the training set.
+        fold (int): The fold of dataset and set as random seed for data split.
+    """
+
+    def save_anns(name, images, annotations):
+        sub_anns = dict()
+        sub_anns['images'] = images
+        sub_anns['annotations'] = annotations
+        sub_anns['licenses'] = anns['licenses']
+        sub_anns['categories'] = anns['categories']
+        sub_anns['info'] = anns['info']
+
+        mkdir_or_exist(out_dir)
+        dump(sub_anns, f'{out_dir}/{name}.json')
+
+    # set random seed with the fold
+    np.random.seed(fold)
+    ann_file = osp.join(data_root, 'annotations/instances_train2017.json')
+    anns = load(ann_file)
+
+    image_list = anns['images']
+    labeled_total = int(percent / 100. * len(image_list))
+    labeled_inds = set(
+        np.random.choice(range(len(image_list)), size=labeled_total))
+    labeled_ids, labeled_images, unlabeled_images = [], [], []
+
+    for i in range(len(image_list)):
+        if i in labeled_inds:
+            labeled_images.append(image_list[i])
+            labeled_ids.append(image_list[i]['id'])
+        else:
+            unlabeled_images.append(image_list[i])
+
+    # get all annotations of labeled images
+    labeled_ids = set(labeled_ids)
+    labeled_annotations, unlabeled_annotations = [], []
+
+    for ann in anns['annotations']:
+        if ann['image_id'] in labeled_ids:
+            labeled_annotations.append(ann)
+        else:
+            unlabeled_annotations.append(ann)
+
+    # save labeled and unlabeled
+    labeled_name = f'instances_train2017.{fold}@{percent}'
+    unlabeled_name = f'instances_train2017.{fold}@{percent}-unlabeled'
+
+    save_anns(labeled_name, labeled_images, labeled_annotations)
+    save_anns(unlabeled_name, unlabeled_images, unlabeled_annotations)
+
+
+def multi_wrapper(args):
+    return split_coco(*args)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    arguments_list = [(args.data_root, args.out_dir, p, f)
+                      for f in range(1, args.fold + 1)
+                      for p in args.labeled_percent]
+    track_parallel_progress(multi_wrapper, arguments_list, args.fold)
diff --git a/mmde/mmdet/.mim/tools/misc/split_odvg.py b/mmde/mmdet/.mim/tools/misc/split_odvg.py
new file mode 100644
index 0000000000000000000000000000000000000000..37fae909859bc4b9da32e9d867c728b34b1983da
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/misc/split_odvg.py
@@ -0,0 +1,80 @@
+import argparse
+import json
+import os
+import shutil
+
+import jsonlines
+import numpy as np
+from mmengine.utils import ProgressBar, mkdir_or_exist
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('data_root', type=str, help='The data root.')
+    parser.add_argument('ann_file', type=str)
+    parser.add_argument('img_prefix', type=str)
+    parser.add_argument(
+        'out_dir',
+        type=str,
+        help='The output directory of coco semi-supervised annotations.')
+    parser.add_argument(
+        '--label-map-file', '-m', type=str, help='label map file')
+    parser.add_argument(
+        '--num-img',
+        '-n',
+        default=200,
+        type=int,
+        help='num of extract image, -1 means all images')
+    parser.add_argument('--seed', default=-1, type=int, help='seed')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    assert args.out_dir != args.data_root, \
+        'The file will be overwritten in place, ' \
+        'so the same folder is not allowed !'
+
+    seed = int(args.seed)
+    if seed != -1:
+        print(f'Set the global seed: {seed}')
+        np.random.seed(int(args.seed))
+
+    ann_file = os.path.join(args.data_root, args.ann_file)
+    with open(ann_file, 'r') as f:
+        data_list = [json.loads(line) for line in f]
+
+    np.random.shuffle(data_list)
+
+    num_img = args.num_img
+
+    progress_bar = ProgressBar(num_img)
+    for i in range(num_img):
+        file_name = data_list[i]['filename']
+        image_path = os.path.join(args.data_root, args.img_prefix, file_name)
+        out_image_dir = os.path.join(args.out_dir, args.img_prefix)
+        mkdir_or_exist(out_image_dir)
+        out_image_path = os.path.join(out_image_dir, file_name)
+        shutil.copyfile(image_path, out_image_path)
+
+        progress_bar.update()
+
+    out_path = os.path.join(args.out_dir, args.ann_file)
+    out_dir = os.path.dirname(out_path)
+    mkdir_or_exist(out_dir)
+
+    with jsonlines.open(out_path, mode='w') as writer:
+        writer.write_all(data_list[:num_img])
+
+    if args.label_map_file is not None:
+        out_dir = os.path.dirname(
+            os.path.join(args.out_dir, args.label_map_file))
+        mkdir_or_exist(out_dir)
+        shutil.copyfile(
+            os.path.join(args.data_root, args.label_map_file),
+            os.path.join(args.out_dir, args.label_map_file))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/detectron2_to_mmdet.py b/mmde/mmdet/.mim/tools/model_converters/detectron2_to_mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e55d1fad20a8a223cacf50300c819f446115a2d
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/detectron2_to_mmdet.py
@@ -0,0 +1,48 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from collections import OrderedDict
+
+import torch
+from mmengine.fileio import load
+from mmengine.runner import save_checkpoint
+
+
+def convert(src: str, dst: str, prefix: str = 'd2_model') -> None:
+    """Convert Detectron2 checkpoint to MMDetection style.
+
+    Args:
+        src (str): The Detectron2 checkpoint path, should endswith `pkl`.
+        dst (str): The MMDetection checkpoint path.
+        prefix (str): The prefix of MMDetection model, defaults to 'd2_model'.
+    """
+    # load arch_settings
+    assert src.endswith('pkl'), \
+        'the source Detectron2 checkpoint should endswith `pkl`.'
+    d2_model = load(src, encoding='latin1').get('model')
+    assert d2_model is not None
+
+    # convert to mmdet style
+    dst_state_dict = OrderedDict()
+    for name, value in d2_model.items():
+        if not isinstance(value, torch.Tensor):
+            value = torch.from_numpy(value)
+        dst_state_dict[f'{prefix}.{name}'] = value
+
+    mmdet_model = dict(state_dict=dst_state_dict, meta=dict())
+    save_checkpoint(mmdet_model, dst)
+    print(f'Convert Detectron2 model {src} to MMDetection model {dst}')
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert Detectron2 checkpoint to MMDetection style')
+    parser.add_argument('src', help='Detectron2 model path')
+    parser.add_argument('dst', help='MMDetectron model save path')
+    parser.add_argument(
+        '--prefix', default='d2_model', type=str, help='prefix of the model')
+    args = parser.parse_args()
+    convert(args.src, args.dst, args.prefix)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/detectron2pytorch.py b/mmde/mmdet/.mim/tools/model_converters/detectron2pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe0920ada194d4387029cdf45899f9fd31a7dd18
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/detectron2pytorch.py
@@ -0,0 +1,83 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from collections import OrderedDict
+
+import torch
+from mmengine.fileio import load
+
+arch_settings = {50: (3, 4, 6, 3), 101: (3, 4, 23, 3)}
+
+
+def convert_bn(blobs, state_dict, caffe_name, torch_name, converted_names):
+    # detectron replace bn with affine channel layer
+    state_dict[torch_name + '.bias'] = torch.from_numpy(blobs[caffe_name +
+                                                              '_b'])
+    state_dict[torch_name + '.weight'] = torch.from_numpy(blobs[caffe_name +
+                                                                '_s'])
+    bn_size = state_dict[torch_name + '.weight'].size()
+    state_dict[torch_name + '.running_mean'] = torch.zeros(bn_size)
+    state_dict[torch_name + '.running_var'] = torch.ones(bn_size)
+    converted_names.add(caffe_name + '_b')
+    converted_names.add(caffe_name + '_s')
+
+
+def convert_conv_fc(blobs, state_dict, caffe_name, torch_name,
+                    converted_names):
+    state_dict[torch_name + '.weight'] = torch.from_numpy(blobs[caffe_name +
+                                                                '_w'])
+    converted_names.add(caffe_name + '_w')
+    if caffe_name + '_b' in blobs:
+        state_dict[torch_name + '.bias'] = torch.from_numpy(blobs[caffe_name +
+                                                                  '_b'])
+        converted_names.add(caffe_name + '_b')
+
+
+def convert(src, dst, depth):
+    """Convert keys in detectron pretrained ResNet models to pytorch style."""
+    # load arch_settings
+    if depth not in arch_settings:
+        raise ValueError('Only support ResNet-50 and ResNet-101 currently')
+    block_nums = arch_settings[depth]
+    # load caffe model
+    caffe_model = load(src, encoding='latin1')
+    blobs = caffe_model['blobs'] if 'blobs' in caffe_model else caffe_model
+    # convert to pytorch style
+    state_dict = OrderedDict()
+    converted_names = set()
+    convert_conv_fc(blobs, state_dict, 'conv1', 'conv1', converted_names)
+    convert_bn(blobs, state_dict, 'res_conv1_bn', 'bn1', converted_names)
+    for i in range(1, len(block_nums) + 1):
+        for j in range(block_nums[i - 1]):
+            if j == 0:
+                convert_conv_fc(blobs, state_dict, f'res{i + 1}_{j}_branch1',
+                                f'layer{i}.{j}.downsample.0', converted_names)
+                convert_bn(blobs, state_dict, f'res{i + 1}_{j}_branch1_bn',
+                           f'layer{i}.{j}.downsample.1', converted_names)
+            for k, letter in enumerate(['a', 'b', 'c']):
+                convert_conv_fc(blobs, state_dict,
+                                f'res{i + 1}_{j}_branch2{letter}',
+                                f'layer{i}.{j}.conv{k+1}', converted_names)
+                convert_bn(blobs, state_dict,
+                           f'res{i + 1}_{j}_branch2{letter}_bn',
+                           f'layer{i}.{j}.bn{k + 1}', converted_names)
+    # check if all layers are converted
+    for key in blobs:
+        if key not in converted_names:
+            print(f'Not Convert: {key}')
+    # save checkpoint
+    checkpoint = dict()
+    checkpoint['state_dict'] = state_dict
+    torch.save(checkpoint, dst)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Convert model keys')
+    parser.add_argument('src', help='src detectron model path')
+    parser.add_argument('dst', help='save path')
+    parser.add_argument('depth', type=int, help='ResNet model depth')
+    args = parser.parse_args()
+    convert(args.src, args.dst, args.depth)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/detic_to_mmdet.py b/mmde/mmdet/.mim/tools/model_converters/detic_to_mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..25759cb4fda5b464d57e3e350bb1484800e8bd81
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/detic_to_mmdet.py
@@ -0,0 +1,195 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+from collections import OrderedDict
+
+import torch
+from mmengine.runner import CheckpointLoader
+
+convert_dict_fpn = {
+    'backbone.fpn_lateral3': 'neck.lateral_convs.0.conv',
+    'backbone.fpn_lateral4': 'neck.lateral_convs.1.conv',
+    'backbone.fpn_lateral5': 'neck.lateral_convs.2.conv',
+    'backbone.fpn_output3': 'neck.fpn_convs.0.conv',
+    'backbone.fpn_output4': 'neck.fpn_convs.1.conv',
+    'backbone.fpn_output5': 'neck.fpn_convs.2.conv',
+    'backbone.top_block.p6': 'neck.fpn_convs.3.conv',
+    'backbone.top_block.p7': 'neck.fpn_convs.4.conv',
+}
+
+convert_dict_rpn = {
+    'proposal_generator.centernet_head.bbox_tower.0':
+    'rpn_head.reg_convs.0.conv',
+    'proposal_generator.centernet_head.bbox_tower.1':
+    'rpn_head.reg_convs.0.gn',
+    'proposal_generator.centernet_head.bbox_tower.3':
+    'rpn_head.reg_convs.1.conv',
+    'proposal_generator.centernet_head.bbox_tower.4':
+    'rpn_head.reg_convs.1.gn',
+    'proposal_generator.centernet_head.bbox_tower.6':
+    'rpn_head.reg_convs.2.conv',
+    'proposal_generator.centernet_head.bbox_tower.7':
+    'rpn_head.reg_convs.2.gn',
+    'proposal_generator.centernet_head.bbox_tower.9':
+    'rpn_head.reg_convs.3.conv',
+    'proposal_generator.centernet_head.bbox_tower.10':
+    'rpn_head.reg_convs.3.gn',
+    'proposal_generator.centernet_head.bbox_pred': 'rpn_head.conv_reg',
+    'proposal_generator.centernet_head.scales.0.scale':
+    'rpn_head.scales.0.scale',
+    'proposal_generator.centernet_head.scales.1.scale':
+    'rpn_head.scales.1.scale',
+    'proposal_generator.centernet_head.scales.2.scale':
+    'rpn_head.scales.2.scale',
+    'proposal_generator.centernet_head.scales.3.scale':
+    'rpn_head.scales.3.scale',
+    'proposal_generator.centernet_head.scales.4.scale':
+    'rpn_head.scales.4.scale',
+    'proposal_generator.centernet_head.agn_hm': 'rpn_head.conv_cls',
+}
+
+convert_dict_roi = {
+    'roi_heads.box_head.0.fc1': 'roi_head.bbox_head.0.shared_fcs.0',
+    'roi_heads.box_head.0.fc2': 'roi_head.bbox_head.0.shared_fcs.1',
+    'roi_heads.box_head.1.fc1': 'roi_head.bbox_head.1.shared_fcs.0',
+    'roi_heads.box_head.1.fc2': 'roi_head.bbox_head.1.shared_fcs.1',
+    'roi_heads.box_head.2.fc1': 'roi_head.bbox_head.2.shared_fcs.0',
+    'roi_heads.box_head.2.fc2': 'roi_head.bbox_head.2.shared_fcs.1',
+    'roi_heads.box_predictor.0.freq_weight':
+    'roi_head.bbox_head.0.freq_weight',
+    'roi_heads.box_predictor.0.cls_score.zs_weight':
+    'roi_head.bbox_head.0.fc_cls.zs_weight',
+    'roi_heads.box_predictor.0.cls_score.linear':
+    'roi_head.bbox_head.0.fc_cls.linear',
+    'roi_heads.box_predictor.0.bbox_pred.0': 'roi_head.bbox_head.0.fc_reg.0',
+    'roi_heads.box_predictor.0.bbox_pred.2': 'roi_head.bbox_head.0.fc_reg.2',
+    'roi_heads.box_predictor.1.freq_weight':
+    'roi_head.bbox_head.1.freq_weight',
+    'roi_heads.box_predictor.1.cls_score.zs_weight':
+    'roi_head.bbox_head.1.fc_cls.zs_weight',
+    'roi_heads.box_predictor.1.cls_score.linear':
+    'roi_head.bbox_head.1.fc_cls.linear',
+    'roi_heads.box_predictor.1.bbox_pred.0': 'roi_head.bbox_head.1.fc_reg.0',
+    'roi_heads.box_predictor.1.bbox_pred.2': 'roi_head.bbox_head.1.fc_reg.2',
+    'roi_heads.box_predictor.2.freq_weight':
+    'roi_head.bbox_head.2.freq_weight',
+    'roi_heads.box_predictor.2.cls_score.zs_weight':
+    'roi_head.bbox_head.2.fc_cls.zs_weight',
+    'roi_heads.box_predictor.2.cls_score.linear':
+    'roi_head.bbox_head.2.fc_cls.linear',
+    'roi_heads.box_predictor.2.bbox_pred.0': 'roi_head.bbox_head.2.fc_reg.0',
+    'roi_heads.box_predictor.2.bbox_pred.2': 'roi_head.bbox_head.2.fc_reg.2',
+    'roi_heads.mask_head.mask_fcn1': 'roi_head.mask_head.convs.0.conv',
+    'roi_heads.mask_head.mask_fcn2': 'roi_head.mask_head.convs.1.conv',
+    'roi_heads.mask_head.mask_fcn3': 'roi_head.mask_head.convs.2.conv',
+    'roi_heads.mask_head.mask_fcn4': 'roi_head.mask_head.convs.3.conv',
+    'roi_heads.mask_head.deconv': 'roi_head.mask_head.upsample',
+    'roi_heads.mask_head.predictor': 'roi_head.mask_head.conv_logits',
+}
+
+
+def correct_unfold_reduction_order(x):
+    out_channel, in_channel = x.shape
+    x = x.reshape(out_channel, 4, in_channel // 4)
+    x = x[:, [0, 2, 1, 3], :].transpose(1, 2).reshape(out_channel, in_channel)
+    return x
+
+
+def correct_unfold_norm_order(x):
+    in_channel = x.shape[0]
+    x = x.reshape(4, in_channel // 4)
+    x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
+    return x
+
+
+def convert(ckpt):
+    new_ckpt = OrderedDict()
+
+    for k, v in list(ckpt.items()):
+        new_v = v
+        if 'backbone.bottom_up' in k:
+            new_k = k.replace('backbone.bottom_up', 'backbone')
+            # for Transformer backbone
+            if 'patch_embed.proj' in new_k:
+                new_k = new_k.replace('patch_embed.proj',
+                                      'patch_embed.projection')
+            elif 'pos_drop' in new_k:
+                new_k = new_k.replace('pos_drop', 'drop_after_pos')
+
+            if 'layers' in new_k:
+                new_k = new_k.replace('layers', 'stages')
+                if 'mlp.fc1' in new_k:
+                    new_k = new_k.replace('mlp.fc1', 'ffn.layers.0.0')
+                elif 'mlp.fc2' in new_k:
+                    new_k = new_k.replace('mlp.fc2', 'ffn.layers.1')
+                elif 'attn' in new_k:
+                    new_k = new_k.replace('attn', 'attn.w_msa')
+
+                if 'downsample' in k:
+                    if 'reduction.' in k:
+                        new_v = correct_unfold_reduction_order(v)
+                    elif 'norm.' in k:
+                        new_v = correct_unfold_norm_order(v)
+            # for resnet
+            if 'base.' in k:
+                new_k = new_k.replace('base.', '')
+
+        elif 'backbone.fpn' in k or 'backbone.top_block' in k:
+            old_k = k.replace('.weight', '')
+            old_k = old_k.replace('.bias', '')
+            new_k = k.replace(old_k, convert_dict_fpn[old_k])
+        elif 'proposal_generator' in k:
+            old_k = k.replace('.weight', '')
+            old_k = old_k.replace('.bias', '')
+            new_k = k.replace(old_k, convert_dict_rpn[old_k])
+        elif 'roi_heads' in k:
+            old_k = k.replace('.weight', '')
+            old_k = old_k.replace('.bias', '')
+            new_k = k.replace(old_k, convert_dict_roi[old_k])
+        else:
+            print('skip:', k)
+            continue
+
+        new_ckpt[new_k] = new_v
+    return new_ckpt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert keys in pretrained eva '
+        'models to mmpretrain style.')
+    parser.add_argument(
+        '--src',
+        default='Detic_LbaseI_CLIP_SwinB_896b32_4x_ft4x_max-size.pth',
+        help='src model path or url')
+    # The dst path must be a full path of the new checkpoint.
+    parser.add_argument(
+        '--dst',
+        default='detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.pth',
+        help='save path')
+    args = parser.parse_args()
+
+    checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
+
+    if 'model' in checkpoint:
+        state_dict = checkpoint['model']
+    else:
+        state_dict = checkpoint
+
+    weight = {}
+    new_state_dict = convert(state_dict)
+    if 'backbone.fc.weight' in new_state_dict.keys():
+        del [new_state_dict['backbone.fc.weight']]
+    if 'backbone.fc.bias' in new_state_dict.keys():
+        del [new_state_dict['backbone.fc.bias']]
+    weight['state_dict'] = new_state_dict
+    torch.save(weight, args.dst)
+
+    sha = subprocess.check_output(['sha256sum', args.dst]).decode()
+    final_file = args.dst.replace('.pth', '') + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', args.dst, final_file])
+    print(f'Done!!, save to {final_file}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/glip_to_mmdet.py b/mmde/mmdet/.mim/tools/model_converters/glip_to_mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..255addca5bdbf3023880d4371de018be280eb8e6
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/glip_to_mmdet.py
@@ -0,0 +1,125 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+from collections import OrderedDict
+
+import torch
+from mmengine.runner import CheckpointLoader
+
+convert_dict_fpn = {
+    'module.backbone.fpn.fpn_inner2': 'neck.lateral_convs.0.conv',
+    'module.backbone.fpn.fpn_inner3': 'neck.lateral_convs.1.conv',
+    'module.backbone.fpn.fpn_inner4': 'neck.lateral_convs.2.conv',
+    'module.backbone.fpn.fpn_layer2': 'neck.fpn_convs.0.conv',
+    'module.backbone.fpn.fpn_layer3': 'neck.fpn_convs.1.conv',
+    'module.backbone.fpn.fpn_layer4': 'neck.fpn_convs.2.conv',
+    'module.backbone.fpn.top_blocks.p6': 'neck.fpn_convs.3.conv',
+    'module.backbone.fpn.top_blocks.p7': 'neck.fpn_convs.4.conv',
+}
+
+
+def correct_unfold_reduction_order(x):
+    out_channel, in_channel = x.shape
+    x = x.reshape(out_channel, 4, in_channel // 4)
+    x = x[:, [0, 2, 1, 3], :].transpose(1, 2).reshape(out_channel, in_channel)
+    return x
+
+
+def correct_unfold_norm_order(x):
+    in_channel = x.shape[0]
+    x = x.reshape(4, in_channel // 4)
+    x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
+    return x
+
+
+def convert(ckpt):
+    new_ckpt = OrderedDict()
+
+    for k, v in list(ckpt.items()):
+        if 'anchor_generator' in k or 'resizer' in k or 'cls_logits' in k:
+            continue
+
+        new_v = v
+        if 'module.backbone.body' in k:
+            new_k = k.replace('module.backbone.body', 'backbone')
+            if 'patch_embed.proj' in new_k:
+                new_k = new_k.replace('patch_embed.proj',
+                                      'patch_embed.projection')
+            elif 'pos_drop' in new_k:
+                new_k = new_k.replace('pos_drop', 'drop_after_pos')
+
+            if 'layers' in new_k:
+                new_k = new_k.replace('layers', 'stages')
+                if 'mlp.fc1' in new_k:
+                    new_k = new_k.replace('mlp.fc1', 'ffn.layers.0.0')
+                elif 'mlp.fc2' in new_k:
+                    new_k = new_k.replace('mlp.fc2', 'ffn.layers.1')
+                elif 'attn' in new_k:
+                    new_k = new_k.replace('attn', 'attn.w_msa')
+
+                if 'downsample' in k:
+                    if 'reduction.' in k:
+                        new_v = correct_unfold_reduction_order(v)
+                    elif 'norm.' in k:
+                        new_v = correct_unfold_norm_order(v)
+
+        elif 'module.backbone.fpn' in k:
+            old_k = k.replace('.weight', '')
+            old_k = old_k.replace('.bias', '')
+            new_k = k.replace(old_k, convert_dict_fpn[old_k])
+        elif 'module.language_backbone' in k:
+            new_k = k.replace('module.language_backbone',
+                              'language_model.language_backbone')
+            if 'pooler' in k:
+                continue
+        elif 'module.rpn' in k:
+            if 'module.rpn.head.scales' in k:
+                new_k = k.replace('module.rpn.head.scales',
+                                  'bbox_head.head.scales')
+            else:
+                new_k = k.replace('module.rpn', 'bbox_head')
+
+            if 'anchor_generator' in k and 'resizer' in k:
+                continue
+        else:
+            print('skip:', k)
+            continue
+
+        if 'DyConv' in new_k:
+            new_k = new_k.replace('DyConv', 'dyconvs')
+
+        if 'AttnConv' in new_k:
+            new_k = new_k.replace('AttnConv', 'attnconv')
+
+        new_ckpt[new_k] = new_v
+    return new_ckpt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert keys to mmdet style.')
+    parser.add_argument(
+        'src', default='glip_a_tiny_o365.pth', help='src model path or url')
+    # The dst path must be a full path of the new checkpoint.
+    parser.add_argument(
+        '--dst', default='glip_tiny_a_mmdet.pth', help='save path')
+    args = parser.parse_args()
+
+    checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
+
+    if 'model' in checkpoint:
+        state_dict = checkpoint['model']
+    else:
+        state_dict = checkpoint
+
+    weight = convert(state_dict)
+    torch.save(weight, args.dst)
+
+    sha = subprocess.check_output(['sha256sum', args.dst]).decode()
+    final_file = args.dst.replace('.pth', '') + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', args.dst, final_file])
+    print(f'Done!!, save to {final_file}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/groundingdino_to_mmdet.py b/mmde/mmdet/.mim/tools/model_converters/groundingdino_to_mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5896731d7b1cb1e295631dd7bbbbcd4f8017cac
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/groundingdino_to_mmdet.py
@@ -0,0 +1,213 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+from collections import OrderedDict
+
+import torch
+from mmengine.runner import CheckpointLoader
+
+
+def correct_unfold_reduction_order(x):
+    out_channel, in_channel = x.shape
+    x = x.reshape(out_channel, 4, in_channel // 4)
+    x = x[:, [0, 2, 1, 3], :].transpose(1, 2).reshape(out_channel, in_channel)
+    return x
+
+
+def correct_unfold_norm_order(x):
+    in_channel = x.shape[0]
+    x = x.reshape(4, in_channel // 4)
+    x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
+    return x
+
+
+def convert(ckpt):
+    new_ckpt = OrderedDict()
+
+    for k, v in list(ckpt.items()):
+        new_v = v
+        #
+        if 'module' not in k:
+            # NOTE: swin-b has no module prefix and swin-t has module prefix
+            k = 'module.' + k
+        if 'module.bbox_embed' in k:
+            # NOTE: bbox_embed name is swin-b is different from swin-t
+            k = k.replace('module.bbox_embed',
+                          'module.transformer.decoder.bbox_embed')
+
+        if 'module.backbone.0' in k:
+            new_k = k.replace('module.backbone.0', 'backbone')
+            if 'patch_embed.proj' in new_k:
+                new_k = new_k.replace('patch_embed.proj',
+                                      'patch_embed.projection')
+            elif 'pos_drop' in new_k:
+                new_k = new_k.replace('pos_drop', 'drop_after_pos')
+
+            if 'layers' in new_k:
+                new_k = new_k.replace('layers', 'stages')
+                if 'mlp.fc1' in new_k:
+                    new_k = new_k.replace('mlp.fc1', 'ffn.layers.0.0')
+                elif 'mlp.fc2' in new_k:
+                    new_k = new_k.replace('mlp.fc2', 'ffn.layers.1')
+                elif 'attn' in new_k:
+                    new_k = new_k.replace('attn', 'attn.w_msa')
+
+                if 'downsample' in k:
+                    if 'reduction.' in k:
+                        new_v = correct_unfold_reduction_order(v)
+                    elif 'norm.' in k:
+                        new_v = correct_unfold_norm_order(v)
+
+        elif 'module.bert' in k:
+            new_k = k.replace('module.bert',
+                              'language_model.language_backbone.body.model')
+            # new_k = k.replace('module.bert', 'bert')
+
+        elif 'module.feat_map' in k:
+            new_k = k.replace('module.feat_map', 'text_feat_map')
+
+        elif 'module.input_proj' in k:
+            new_k = k.replace('module.input_proj', 'neck.convs')
+            if 'neck.convs.3' in new_k:
+                # extra convs for 4th scale
+                new_k = new_k.replace('neck.convs.3', 'neck.extra_convs.0')
+            if '0.weight' in new_k:
+                # 0.weight -> conv.weight
+                new_k = new_k.replace('0.weight', 'conv.weight')
+            if '0.bias' in new_k:
+                # 0.bias -> conv.bias
+                new_k = new_k.replace('0.bias', 'conv.bias')
+            if '1.weight' in new_k:
+                # 1.weight -> gn.weight
+                new_k = new_k.replace('1.weight', 'gn.weight')
+            if '1.bias' in new_k:
+                # 1.bias -> gn.bias
+                new_k = new_k.replace('1.bias', 'gn.bias')
+
+        elif 'module.transformer.level_embed' in k:
+            # module.transformer.level_embed -> level_embed
+            new_k = k.replace('module.transformer.level_embed', 'level_embed')
+
+        elif 'module.transformer.encoder' in k:
+            # if '.layers' in k:
+            new_k = k.replace('module.transformer.encoder', 'encoder')
+            if 'norm1' in new_k:
+                new_k = new_k.replace('norm1', 'norms.0')
+            if 'norm2' in new_k:
+                new_k = new_k.replace('norm2', 'norms.1')
+            if 'norm3' in new_k:
+                new_k = new_k.replace('norm3', 'norms.2')
+            if 'linear1' in new_k:
+                new_k = new_k.replace('linear1', 'ffn.layers.0.0')
+            if 'linear2' in new_k:
+                new_k = new_k.replace('linear2', 'ffn.layers.1')
+
+            if 'text_layers' in new_k and 'self_attn' in new_k:
+                new_k = new_k.replace('self_attn', 'self_attn.attn')
+
+        elif 'module.transformer.enc_output' in k:
+            if 'module.transformer.enc_output' in k and 'norm' not in k:
+                new_k = k.replace('module.transformer.enc_output',
+                                  'memory_trans_fc')
+            if 'module.transformer.enc_output_norm' in k:
+                new_k = k.replace('module.transformer.enc_output_norm',
+                                  'memory_trans_norm')
+
+        elif 'module.transformer.enc_out_bbox_embed.layers' in k:
+            # ugly version
+            if 'module.transformer.enc_out_bbox_embed.layers.0' in k:
+                new_k = k.replace(
+                    'module.transformer.enc_out_bbox_embed.layers.0',
+                    'bbox_head.reg_branches.6.0')
+            if 'module.transformer.enc_out_bbox_embed.layers.1' in k:
+                new_k = k.replace(
+                    'module.transformer.enc_out_bbox_embed.layers.1',
+                    'bbox_head.reg_branches.6.2')
+            if 'module.transformer.enc_out_bbox_embed.layers.2' in k:
+                new_k = k.replace(
+                    'module.transformer.enc_out_bbox_embed.layers.2',
+                    'bbox_head.reg_branches.6.4')
+
+        elif 'module.transformer.tgt_embed' in k:
+            new_k = k.replace('module.transformer.tgt_embed',
+                              'query_embedding')
+
+        elif 'module.transformer.decoder' in k:
+            new_k = k.replace('module.transformer.decoder', 'decoder')
+            if 'norm1' in new_k:
+                # norm1 in official GroundingDINO is the third norm in decoder
+                new_k = new_k.replace('norm1', 'norms.2')
+            if 'catext_norm' in new_k:
+                # catext_norm in official GroundingDINO is the
+                # second norm in decoder
+                new_k = new_k.replace('catext_norm', 'norms.1')
+            if 'norm2' in new_k:
+                # norm2 in official GroundingDINO is the first norm in decoder
+                new_k = new_k.replace('norm2', 'norms.0')
+            if 'norm3' in new_k:
+                new_k = new_k.replace('norm3', 'norms.3')
+            if 'ca_text' in new_k:
+                new_k = new_k.replace('ca_text', 'cross_attn_text')
+                if 'in_proj_weight' in new_k:
+                    new_k = new_k.replace('in_proj_weight',
+                                          'attn.in_proj_weight')
+                if 'in_proj_bias' in new_k:
+                    new_k = new_k.replace('in_proj_bias', 'attn.in_proj_bias')
+                if 'out_proj.weight' in new_k:
+                    new_k = new_k.replace('out_proj.weight',
+                                          'attn.out_proj.weight')
+                if 'out_proj.bias' in new_k:
+                    new_k = new_k.replace('out_proj.bias',
+                                          'attn.out_proj.bias')
+            if 'linear1' in new_k:
+                new_k = new_k.replace('linear1', 'ffn.layers.0.0')
+            if 'linear2' in new_k:
+                new_k = new_k.replace('linear2', 'ffn.layers.1')
+            if 'self_attn' in new_k:
+                new_k = new_k.replace('self_attn', 'self_attn.attn')
+            if 'bbox_embed' in new_k:
+                reg_layer_id = int(new_k.split('.')[2])
+                linear_id = int(new_k.split('.')[4])
+                weight_or_bias = new_k.split('.')[-1]
+                new_k = 'bbox_head.reg_branches.' + \
+                    str(reg_layer_id)+'.'+str(2*linear_id)+'.'+weight_or_bias
+
+        else:
+            print('skip:', k)
+            continue
+
+        new_ckpt[new_k] = new_v
+    return new_ckpt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert keys to mmdet style.')
+    parser.add_argument(
+        'src',
+        default='groundingdino_swint_ogc.pth.pth',
+        help='src model path or url')
+    # The dst path must be a full path of the new checkpoint.
+    parser.add_argument(
+        'dst',
+        default='groundingdino_swint_ogc.pth_mmdet.pth',
+        help='save path')
+    args = parser.parse_args()
+
+    checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
+
+    if 'model' in checkpoint:
+        state_dict = checkpoint['model']
+    else:
+        state_dict = checkpoint
+
+    weight = convert(state_dict)
+    torch.save(weight, args.dst)
+    sha = subprocess.check_output(['sha256sum', args.dst]).decode()
+    final_file = args.dst.replace('.pth', '') + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', args.dst, final_file])
+    print(f'Done!!, save to {final_file}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/publish_model.py b/mmde/mmdet/.mim/tools/model_converters/publish_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d3e4111e4f58b0cc4de2349f0a1859c1cf4400c
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/publish_model.py
@@ -0,0 +1,62 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+
+import torch
+from mmengine.logging import print_log
+from mmengine.utils import digit_version
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Process a checkpoint to be published')
+    parser.add_argument('in_file', help='input checkpoint filename')
+    parser.add_argument('out_file', help='output checkpoint filename')
+    parser.add_argument(
+        '--save-keys',
+        nargs='+',
+        type=str,
+        default=['meta', 'state_dict'],
+        help='keys to save in the published checkpoint')
+    args = parser.parse_args()
+    return args
+
+
+def process_checkpoint(in_file, out_file, save_keys=['meta', 'state_dict']):
+    checkpoint = torch.load(in_file, map_location='cpu')
+
+    # only keep `meta` and `state_dict` for smaller file size
+    ckpt_keys = list(checkpoint.keys())
+    for k in ckpt_keys:
+        if k not in save_keys:
+            print_log(
+                f'Key `{k}` will be removed because it is not in '
+                f'save_keys. If you want to keep it, '
+                f'please set --save-keys.',
+                logger='current')
+            checkpoint.pop(k, None)
+
+    # if it is necessary to remove some sensitive data in checkpoint['meta'],
+    # add the code here.
+    if digit_version(torch.__version__) >= digit_version('1.6'):
+        torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False)
+    else:
+        torch.save(checkpoint, out_file)
+    sha = subprocess.check_output(['sha256sum', out_file]).decode()
+    if out_file.endswith('.pth'):
+        out_file_name = out_file[:-4]
+    else:
+        out_file_name = out_file
+    final_file = out_file_name + f'-{sha[:8]}.pth'
+    subprocess.Popen(['mv', out_file, final_file])
+    print_log(
+        f'The published model is saved at {final_file}.', logger='current')
+
+
+def main():
+    args = parse_args()
+    process_checkpoint(args.in_file, args.out_file, args.save_keys)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/regnet2mmdet.py b/mmde/mmdet/.mim/tools/model_converters/regnet2mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbf8c8f33a90839fef055aea0a775e76ff84afd3
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/regnet2mmdet.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from collections import OrderedDict
+
+import torch
+
+
+def convert_stem(model_key, model_weight, state_dict, converted_names):
+    new_key = model_key.replace('stem.conv', 'conv1')
+    new_key = new_key.replace('stem.bn', 'bn1')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+    print(f'Convert {model_key} to {new_key}')
+
+
+def convert_head(model_key, model_weight, state_dict, converted_names):
+    new_key = model_key.replace('head.fc', 'fc')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+    print(f'Convert {model_key} to {new_key}')
+
+
+def convert_reslayer(model_key, model_weight, state_dict, converted_names):
+    split_keys = model_key.split('.')
+    layer, block, module = split_keys[:3]
+    block_id = int(block[1:])
+    layer_name = f'layer{int(layer[1:])}'
+    block_name = f'{block_id - 1}'
+
+    if block_id == 1 and module == 'bn':
+        new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}'
+    elif block_id == 1 and module == 'proj':
+        new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}'
+    elif module == 'f':
+        if split_keys[3] == 'a_bn':
+            module_name = 'bn1'
+        elif split_keys[3] == 'b_bn':
+            module_name = 'bn2'
+        elif split_keys[3] == 'c_bn':
+            module_name = 'bn3'
+        elif split_keys[3] == 'a':
+            module_name = 'conv1'
+        elif split_keys[3] == 'b':
+            module_name = 'conv2'
+        elif split_keys[3] == 'c':
+            module_name = 'conv3'
+        new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}'
+    else:
+        raise ValueError(f'Unsupported conversion of key {model_key}')
+    print(f'Convert {model_key} to {new_key}')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+
+
+def convert(src, dst):
+    """Convert keys in pycls pretrained RegNet models to mmdet style."""
+    # load caffe model
+    regnet_model = torch.load(src)
+    blobs = regnet_model['model_state']
+    # convert to pytorch style
+    state_dict = OrderedDict()
+    converted_names = set()
+    for key, weight in blobs.items():
+        if 'stem' in key:
+            convert_stem(key, weight, state_dict, converted_names)
+        elif 'head' in key:
+            convert_head(key, weight, state_dict, converted_names)
+        elif key.startswith('s'):
+            convert_reslayer(key, weight, state_dict, converted_names)
+
+    # check if all layers are converted
+    for key in blobs:
+        if key not in converted_names:
+            print(f'not converted: {key}')
+    # save checkpoint
+    checkpoint = dict()
+    checkpoint['state_dict'] = state_dict
+    torch.save(checkpoint, dst)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Convert model keys')
+    parser.add_argument('src', help='src detectron model path')
+    parser.add_argument('dst', help='save path')
+    args = parser.parse_args()
+    convert(args.src, args.dst)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/selfsup2mmdet.py b/mmde/mmdet/.mim/tools/model_converters/selfsup2mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc8cce1bd1cde22d09bd200b813bf67b4d066892
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/selfsup2mmdet.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from collections import OrderedDict
+
+import torch
+
+
+def moco_convert(src, dst):
+    """Convert keys in pycls pretrained moco models to mmdet style."""
+    # load caffe model
+    moco_model = torch.load(src)
+    blobs = moco_model['state_dict']
+    # convert to pytorch style
+    state_dict = OrderedDict()
+    for k, v in blobs.items():
+        if not k.startswith('module.encoder_q.'):
+            continue
+        old_k = k
+        k = k.replace('module.encoder_q.', '')
+        state_dict[k] = v
+        print(old_k, '->', k)
+    # save checkpoint
+    checkpoint = dict()
+    checkpoint['state_dict'] = state_dict
+    torch.save(checkpoint, dst)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Convert model keys')
+    parser.add_argument('src', help='src detectron model path')
+    parser.add_argument('dst', help='save path')
+    parser.add_argument(
+        '--selfsup', type=str, choices=['moco', 'swav'], help='save path')
+    args = parser.parse_args()
+    if args.selfsup == 'moco':
+        moco_convert(args.src, args.dst)
+    elif args.selfsup == 'swav':
+        print('SWAV does not need to convert the keys')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/swinv1_to_mmdet.py b/mmde/mmdet/.mim/tools/model_converters/swinv1_to_mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5de98f464a54afdd975e29416c809a3da371301f
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/swinv1_to_mmdet.py
@@ -0,0 +1,86 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+from collections import OrderedDict
+
+import torch
+from mmengine.runner import CheckpointLoader
+
+
+def swin_converter(ckpt):
+
+    new_ckpt = OrderedDict()
+
+    def correct_unfold_reduction_order(x):
+        out_channel, in_channel = x.shape
+        x = x.reshape(out_channel, 4, in_channel // 4)
+        x = x[:, [0, 2, 1, 3], :].transpose(1,
+                                            2).reshape(out_channel, in_channel)
+        return x
+
+    def correct_unfold_norm_order(x):
+        in_channel = x.shape[0]
+        x = x.reshape(4, in_channel // 4)
+        x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
+        return x
+
+    for k, v in ckpt.items():
+        if k.startswith('backbone.layers'):
+            new_v = v
+            if 'attn.' in k:
+                new_k = k.replace('attn.', 'attn.w_msa.')
+            elif 'mlp.' in k:
+                if 'mlp.fc1.' in k:
+                    new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.')
+                elif 'mlp.fc2.' in k:
+                    new_k = k.replace('mlp.fc2.', 'ffn.layers.1.')
+                else:
+                    new_k = k.replace('mlp.', 'ffn.')
+            elif 'downsample' in k:
+                new_k = k
+                if 'reduction.' in k:
+                    new_v = correct_unfold_reduction_order(v)
+                elif 'norm.' in k:
+                    new_v = correct_unfold_norm_order(v)
+            else:
+                new_k = k
+            new_k = new_k.replace('layers', 'stages', 1)
+        elif k.startswith('backbone.patch_embed'):
+            new_v = v
+            if 'proj' in k:
+                new_k = k.replace('proj', 'projection')
+            else:
+                new_k = k
+        else:
+            new_v = v
+            new_k = k
+
+        new_ckpt[new_k] = new_v
+
+    return new_ckpt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert keys to mmdet style.')
+    parser.add_argument('src', help='src model path or url')
+    # The dst path must be a full path of the new checkpoint.
+    parser.add_argument('dst', help='save path')
+    args = parser.parse_args()
+
+    checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
+
+    if 'state_dict' in checkpoint:
+        state_dict = checkpoint['state_dict']
+    else:
+        state_dict = checkpoint
+    torch.save(swin_converter(state_dict), args.dst)
+
+    sha = subprocess.check_output(['sha256sum', args.dst]).decode()
+    final_file = args.dst.replace('.pth', '') + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', args.dst, final_file])
+    print(f'Done!!, save to {final_file}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/upgrade_model_version.py b/mmde/mmdet/.mim/tools/model_converters/upgrade_model_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..f06e836a579062f25eca5e64c446d79dc390dce2
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/upgrade_model_version.py
@@ -0,0 +1,210 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import re
+import tempfile
+from collections import OrderedDict
+
+import torch
+from mmengine import Config
+
+
+def is_head(key):
+    valid_head_list = [
+        'bbox_head', 'mask_head', 'semantic_head', 'grid_head', 'mask_iou_head'
+    ]
+
+    return any(key.startswith(h) for h in valid_head_list)
+
+
+def parse_config(config_strings):
+    temp_file = tempfile.NamedTemporaryFile()
+    config_path = f'{temp_file.name}.py'
+    with open(config_path, 'w') as f:
+        f.write(config_strings)
+
+    config = Config.fromfile(config_path)
+    is_two_stage = True
+    is_ssd = False
+    is_retina = False
+    reg_cls_agnostic = False
+    if 'rpn_head' not in config.model:
+        is_two_stage = False
+        # check whether it is SSD
+        if config.model.bbox_head.type == 'SSDHead':
+            is_ssd = True
+        elif config.model.bbox_head.type == 'RetinaHead':
+            is_retina = True
+    elif isinstance(config.model['bbox_head'], list):
+        reg_cls_agnostic = True
+    elif 'reg_class_agnostic' in config.model.bbox_head:
+        reg_cls_agnostic = config.model.bbox_head \
+            .reg_class_agnostic
+    temp_file.close()
+    return is_two_stage, is_ssd, is_retina, reg_cls_agnostic
+
+
+def reorder_cls_channel(val, num_classes=81):
+    # bias
+    if val.dim() == 1:
+        new_val = torch.cat((val[1:], val[:1]), dim=0)
+    # weight
+    else:
+        out_channels, in_channels = val.shape[:2]
+        # conv_cls for softmax output
+        if out_channels != num_classes and out_channels % num_classes == 0:
+            new_val = val.reshape(-1, num_classes, in_channels, *val.shape[2:])
+            new_val = torch.cat((new_val[:, 1:], new_val[:, :1]), dim=1)
+            new_val = new_val.reshape(val.size())
+        # fc_cls
+        elif out_channels == num_classes:
+            new_val = torch.cat((val[1:], val[:1]), dim=0)
+        # agnostic | retina_cls | rpn_cls
+        else:
+            new_val = val
+
+    return new_val
+
+
+def truncate_cls_channel(val, num_classes=81):
+
+    # bias
+    if val.dim() == 1:
+        if val.size(0) % num_classes == 0:
+            new_val = val[:num_classes - 1]
+        else:
+            new_val = val
+    # weight
+    else:
+        out_channels, in_channels = val.shape[:2]
+        # conv_logits
+        if out_channels % num_classes == 0:
+            new_val = val.reshape(num_classes, in_channels, *val.shape[2:])[1:]
+            new_val = new_val.reshape(-1, *val.shape[1:])
+        # agnostic
+        else:
+            new_val = val
+
+    return new_val
+
+
+def truncate_reg_channel(val, num_classes=81):
+    # bias
+    if val.dim() == 1:
+        # fc_reg | rpn_reg
+        if val.size(0) % num_classes == 0:
+            new_val = val.reshape(num_classes, -1)[:num_classes - 1]
+            new_val = new_val.reshape(-1)
+        # agnostic
+        else:
+            new_val = val
+    # weight
+    else:
+        out_channels, in_channels = val.shape[:2]
+        # fc_reg | rpn_reg
+        if out_channels % num_classes == 0:
+            new_val = val.reshape(num_classes, -1, in_channels,
+                                  *val.shape[2:])[1:]
+            new_val = new_val.reshape(-1, *val.shape[1:])
+        # agnostic
+        else:
+            new_val = val
+
+    return new_val
+
+
+def convert(in_file, out_file, num_classes):
+    """Convert keys in checkpoints.
+
+    There can be some breaking changes during the development of mmdetection,
+    and this tool is used for upgrading checkpoints trained with old versions
+    to the latest one.
+    """
+    checkpoint = torch.load(in_file)
+    in_state_dict = checkpoint.pop('state_dict')
+    out_state_dict = OrderedDict()
+    meta_info = checkpoint['meta']
+    is_two_stage, is_ssd, is_retina, reg_cls_agnostic = parse_config(
+        '#' + meta_info['config'])
+    if meta_info['mmdet_version'] <= '0.5.3' and is_retina:
+        upgrade_retina = True
+    else:
+        upgrade_retina = False
+
+    # MMDetection v2.5.0 unifies the class order in RPN
+    # if the model is trained in version<v2.5.0
+    # The RPN model should be upgraded to be used in version>=2.5.0
+    if meta_info['mmdet_version'] < '2.5.0':
+        upgrade_rpn = True
+    else:
+        upgrade_rpn = False
+
+    for key, val in in_state_dict.items():
+        new_key = key
+        new_val = val
+        if is_two_stage and is_head(key):
+            new_key = 'roi_head.{}'.format(key)
+
+        # classification
+        if upgrade_rpn:
+            m = re.search(
+                r'(conv_cls|retina_cls|rpn_cls|fc_cls|fcos_cls|'
+                r'fovea_cls).(weight|bias)', new_key)
+        else:
+            m = re.search(
+                r'(conv_cls|retina_cls|fc_cls|fcos_cls|'
+                r'fovea_cls).(weight|bias)', new_key)
+        if m is not None:
+            print(f'reorder cls channels of {new_key}')
+            new_val = reorder_cls_channel(val, num_classes)
+
+        # regression
+        if upgrade_rpn:
+            m = re.search(r'(fc_reg).(weight|bias)', new_key)
+        else:
+            m = re.search(r'(fc_reg|rpn_reg).(weight|bias)', new_key)
+        if m is not None and not reg_cls_agnostic:
+            print(f'truncate regression channels of {new_key}')
+            new_val = truncate_reg_channel(val, num_classes)
+
+        # mask head
+        m = re.search(r'(conv_logits).(weight|bias)', new_key)
+        if m is not None:
+            print(f'truncate mask prediction channels of {new_key}')
+            new_val = truncate_cls_channel(val, num_classes)
+
+        m = re.search(r'(cls_convs|reg_convs).\d.(weight|bias)', key)
+        # Legacy issues in RetinaNet since V1.x
+        # Use ConvModule instead of nn.Conv2d in RetinaNet
+        # cls_convs.0.weight -> cls_convs.0.conv.weight
+        if m is not None and upgrade_retina:
+            param = m.groups()[1]
+            new_key = key.replace(param, f'conv.{param}')
+            out_state_dict[new_key] = val
+            print(f'rename the name of {key} to {new_key}')
+            continue
+
+        m = re.search(r'(cls_convs).\d.(weight|bias)', key)
+        if m is not None and is_ssd:
+            print(f'reorder cls channels of {new_key}')
+            new_val = reorder_cls_channel(val, num_classes)
+
+        out_state_dict[new_key] = new_val
+    checkpoint['state_dict'] = out_state_dict
+    torch.save(checkpoint, out_file)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Upgrade model version')
+    parser.add_argument('in_file', help='input checkpoint file')
+    parser.add_argument('out_file', help='output checkpoint file')
+    parser.add_argument(
+        '--num-classes',
+        type=int,
+        default=81,
+        help='number of classes of the original model')
+    args = parser.parse_args()
+    convert(args.in_file, args.out_file, args.num_classes)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/model_converters/upgrade_ssd_version.py b/mmde/mmdet/.mim/tools/model_converters/upgrade_ssd_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..98e96f68a372903ebad5a6c74acb68162cf1e52c
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/model_converters/upgrade_ssd_version.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import tempfile
+from collections import OrderedDict
+
+import torch
+from mmengine import Config
+from mmengine.utils import digit_version
+
+
+def parse_config(config_strings):
+    temp_file = tempfile.NamedTemporaryFile()
+    config_path = f'{temp_file.name}.py'
+    with open(config_path, 'w') as f:
+        f.write(config_strings)
+
+    config = Config.fromfile(config_path)
+    # check whether it is SSD
+    if config.model.bbox_head.type != 'SSDHead':
+        raise AssertionError('This is not a SSD model.')
+
+
+def convert(in_file, out_file):
+    checkpoint = torch.load(in_file)
+    in_state_dict = checkpoint.pop('state_dict')
+    out_state_dict = OrderedDict()
+    meta_info = checkpoint['meta']
+    parse_config('#' + meta_info['config'])
+    for key, value in in_state_dict.items():
+        if 'extra' in key:
+            layer_idx = int(key.split('.')[2])
+            new_key = 'neck.extra_layers.{}.{}.conv.'.format(
+                layer_idx // 2, layer_idx % 2) + key.split('.')[-1]
+        elif 'l2_norm' in key:
+            new_key = 'neck.l2_norm.weight'
+        elif 'bbox_head' in key:
+            new_key = key[:21] + '.0' + key[21:]
+        else:
+            new_key = key
+        out_state_dict[new_key] = value
+    checkpoint['state_dict'] = out_state_dict
+
+    if digit_version(torch.__version__) >= digit_version('1.6'):
+        torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False)
+    else:
+        torch.save(checkpoint, out_file)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Upgrade SSD version')
+    parser.add_argument('in_file', help='input checkpoint file')
+    parser.add_argument('out_file', help='output checkpoint file')
+
+    args = parser.parse_args()
+    convert(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/slurm_test.sh b/mmde/mmdet/.mim/tools/slurm_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6dd67e57442b741fc30f26102eb5afe16139edb1
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/slurm_test.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+CHECKPOINT=$4
+GPUS=${GPUS:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+PY_ARGS=${@:5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
diff --git a/mmde/mmdet/.mim/tools/slurm_test_tracking.sh b/mmde/mmdet/.mim/tools/slurm_test_tracking.sh
new file mode 100644
index 0000000000000000000000000000000000000000..16a2f1a43dd81982f713291def2ef390f1768f03
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/slurm_test_tracking.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+GPUS=${GPUS:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+PY_ARGS=${@:4}
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/test_tracking.py ${CONFIG} --launcher="slurm" ${PY_ARGS}
diff --git a/mmde/mmdet/.mim/tools/slurm_train.sh b/mmde/mmdet/.mim/tools/slurm_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b3feb3d9c7a6c33d82739cdf5ee10365673aaded
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/slurm_train.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+WORK_DIR=$4
+GPUS=${GPUS:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+PY_ARGS=${@:5}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
diff --git a/mmde/mmdet/.mim/tools/test.py b/mmde/mmdet/.mim/tools/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac7a1d099669b12e783f0e63646bf4288bfacbad
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/test.py
@@ -0,0 +1,149 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+import warnings
+from copy import deepcopy
+
+from mmengine import ConfigDict
+from mmengine.config import Config, DictAction
+from mmengine.runner import Runner
+
+from mmdet.engine.hooks.utils import trigger_visualization_hook
+from mmdet.evaluation import DumpDetResults
+from mmdet.registry import RUNNERS
+from mmdet.utils import setup_cache_size_limit_of_dynamo
+
+
+# TODO: support fuse_conv_bn and format_only
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet test (and eval) a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument(
+        '--work-dir',
+        help='the directory to save the file containing evaluation metrics')
+    parser.add_argument(
+        '--out',
+        type=str,
+        help='dump predictions to a pickle file for offline evaluation')
+    parser.add_argument(
+        '--show', action='store_true', help='show prediction results')
+    parser.add_argument(
+        '--show-dir',
+        help='directory where painted images will be saved. '
+        'If specified, it will be automatically saved '
+        'to the work_dir/timestamp/show_dir')
+    parser.add_argument(
+        '--wait-time', type=float, default=2, help='the interval of show (s)')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--tta', action='store_true')
+    # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
+    # will pass the `--local-rank` parameter to `tools/train.py` instead
+    # of `--local_rank`.
+    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def main():
+    args = parse_args()
+
+    # Reduce the number of repeated compilations and improve
+    # testing speed.
+    setup_cache_size_limit_of_dynamo()
+
+    # load config
+    cfg = Config.fromfile(args.config)
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    cfg.load_from = args.checkpoint
+
+    if args.show or args.show_dir:
+        cfg = trigger_visualization_hook(cfg, args)
+
+    if args.tta:
+
+        if 'tta_model' not in cfg:
+            warnings.warn('Cannot find ``tta_model`` in config, '
+                          'we will set it as default.')
+            cfg.tta_model = dict(
+                type='DetTTAModel',
+                tta_cfg=dict(
+                    nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))
+        if 'tta_pipeline' not in cfg:
+            warnings.warn('Cannot find ``tta_pipeline`` in config, '
+                          'we will set it as default.')
+            test_data_cfg = cfg.test_dataloader.dataset
+            while 'dataset' in test_data_cfg:
+                test_data_cfg = test_data_cfg['dataset']
+            cfg.tta_pipeline = deepcopy(test_data_cfg.pipeline)
+            flip_tta = dict(
+                type='TestTimeAug',
+                transforms=[
+                    [
+                        dict(type='RandomFlip', prob=1.),
+                        dict(type='RandomFlip', prob=0.)
+                    ],
+                    [
+                        dict(
+                            type='PackDetInputs',
+                            meta_keys=('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'flip',
+                                       'flip_direction'))
+                    ],
+                ])
+            cfg.tta_pipeline[-1] = flip_tta
+        cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model)
+        cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline
+
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    # add `DumpResults` dummy metric
+    if args.out is not None:
+        assert args.out.endswith(('.pkl', '.pickle')), \
+            'The dump file must be a pkl file.'
+        runner.test_evaluator.metrics.append(
+            DumpDetResults(out_file_path=args.out))
+
+    # start testing
+    runner.test()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/test_tracking.py b/mmde/mmdet/.mim/tools/test_tracking.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b928c0e84ed86c6d2547cb028e5c8488e3b78a0
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/test_tracking.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+
+from mmengine.config import Config, DictAction
+from mmengine.model import is_model_wrapper
+from mmengine.registry import RUNNERS
+from mmengine.runner import Runner
+from mmengine.runner.checkpoint import load_checkpoint
+
+from mmdet.utils import register_all_modules
+
+
+# TODO: support fuse_conv_bn, visualization, and format_only
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMTrack test (and eval) a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('--checkpoint', help='checkpoint file')
+    parser.add_argument('--detector', help='detection checkpoint file')
+    parser.add_argument('--reid', help='reid checkpoint file')
+    parser.add_argument(
+        '--work-dir',
+        help='the directory to save the file containing evaluation metrics')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local-rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def main():
+    args = parse_args()
+
+    # register all modules in mmtrack into the registries
+    # do not init the default scope here because it will be init in the runner
+    register_all_modules(init_default_scope=False)
+
+    # load config
+    cfg = Config.fromfile(args.config)
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    cfg.load_from = args.checkpoint
+
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    if is_model_wrapper(runner.model):
+        model = runner.model.module
+    else:
+        model = runner.model
+
+    if args.detector:
+        assert not (args.checkpoint and args.detector), \
+            'Error: checkpoint and detector checkpoint cannot both exist'
+        load_checkpoint(model.detector, args.detector)
+
+    if args.reid:
+        assert not (args.checkpoint and args.reid), \
+             'Error: checkpoint and reid checkpoint cannot both exist'
+        load_checkpoint(model.reid, args.reid)
+
+    # start testing
+    runner.test()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/.mim/tools/train.py b/mmde/mmdet/.mim/tools/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e5b71fbcaeb6c78c131f3229de7255bb07712a3
--- /dev/null
+++ b/mmde/mmdet/.mim/tools/train.py
@@ -0,0 +1,121 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+
+from mmengine.config import Config, DictAction
+from mmengine.registry import RUNNERS
+from mmengine.runner import Runner
+
+from mmdet.utils import setup_cache_size_limit_of_dynamo
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a detector')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument('--work-dir', help='the dir to save logs and models')
+    parser.add_argument(
+        '--amp',
+        action='store_true',
+        default=False,
+        help='enable automatic-mixed-precision training')
+    parser.add_argument(
+        '--auto-scale-lr',
+        action='store_true',
+        help='enable automatically scaling LR.')
+    parser.add_argument(
+        '--resume',
+        nargs='?',
+        type=str,
+        const='auto',
+        help='If specify checkpoint path, resume from it, while if not '
+        'specify, try to auto resume from the latest checkpoint '
+        'in the work directory.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
+    # will pass the `--local-rank` parameter to `tools/train.py` instead
+    # of `--local_rank`.
+    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    # Reduce the number of repeated compilations and improve
+    # training speed.
+    setup_cache_size_limit_of_dynamo()
+
+    # load config
+    cfg = Config.fromfile(args.config)
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    # enable automatic-mixed-precision training
+    if args.amp is True:
+        cfg.optim_wrapper.type = 'AmpOptimWrapper'
+        cfg.optim_wrapper.loss_scale = 'dynamic'
+
+    # enable automatically scaling LR
+    if args.auto_scale_lr:
+        if 'auto_scale_lr' in cfg and \
+                'enable' in cfg.auto_scale_lr and \
+                'base_batch_size' in cfg.auto_scale_lr:
+            cfg.auto_scale_lr.enable = True
+        else:
+            raise RuntimeError('Can not find "auto_scale_lr" or '
+                               '"auto_scale_lr.enable" or '
+                               '"auto_scale_lr.base_batch_size" in your'
+                               ' configuration file.')
+
+    # resume is determined in this priority: resume from > auto_resume
+    if args.resume == 'auto':
+        cfg.resume = True
+        cfg.load_from = None
+    elif args.resume is not None:
+        cfg.resume = True
+        cfg.load_from = args.resume
+
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    # start training
+    runner.train()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/mmdet/__init__.py b/mmde/mmdet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4697ca4955c68b8edd1dddb41fccb1962f72308a
--- /dev/null
+++ b/mmde/mmdet/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import mmengine
+from mmengine.utils import digit_version
+
+from .version import __version__, version_info
+
+mmcv_minimum_version = '2.0.0rc4'
+mmcv_maximum_version = '3.2.0'
+mmcv_version = digit_version(mmcv.__version__)
+
+mmengine_minimum_version = '0.7.1'
+mmengine_maximum_version = '1.0.0'
+mmengine_version = digit_version(mmengine.__version__)
+
+# assert (mmcv_version >= digit_version(mmcv_minimum_version)
+#         and mmcv_version < digit_version(mmcv_maximum_version)), \
+#     f'MMCV=={mmcv.__version__} is used but incompatible. ' \
+#     f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.'
+
+# assert (mmengine_version >= digit_version(mmengine_minimum_version)
+#         and mmengine_version < digit_version(mmengine_maximum_version)), \
+#     f'MMEngine=={mmengine.__version__} is used but incompatible. ' \
+#     f'Please install mmengine>={mmengine_minimum_version}, ' \
+#     f'<{mmengine_maximum_version}.'
+
+__all__ = ['__version__', 'version_info', 'digit_version']
diff --git a/mmde/mmdet/apis/__init__.py b/mmde/mmdet/apis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c89dc72914b11a73e91dc7e9404f41bf10b93c6c
--- /dev/null
+++ b/mmde/mmdet/apis/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .det_inferencer import DetInferencer
+from .inference import (async_inference_detector, inference_detector,
+                        inference_mot, init_detector, init_track_model)
+
+__all__ = [
+    'init_detector', 'async_inference_detector', 'inference_detector',
+    'DetInferencer', 'inference_mot', 'init_track_model'
+]
diff --git a/mmde/mmdet/apis/det_inferencer.py b/mmde/mmdet/apis/det_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce8532eb786558ca3807195781d8e380741cea00
--- /dev/null
+++ b/mmde/mmdet/apis/det_inferencer.py
@@ -0,0 +1,652 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+import warnings
+from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
+
+import mmcv
+import mmengine
+import numpy as np
+import torch.nn as nn
+from mmcv.transforms import LoadImageFromFile
+from mmengine.dataset import Compose
+from mmengine.fileio import (get_file_backend, isdir, join_path,
+                             list_dir_or_file)
+from mmengine.infer.infer import BaseInferencer, ModelType
+from mmengine.model.utils import revert_sync_batchnorm
+from mmengine.registry import init_default_scope
+from mmengine.runner.checkpoint import _load_checkpoint_to_model
+from mmengine.visualization import Visualizer
+from rich.progress import track
+
+from mmdet.evaluation import INSTANCE_OFFSET
+from mmdet.registry import DATASETS
+from mmdet.structures import DetDataSample
+from mmdet.structures.mask import encode_mask_results, mask2bbox
+from mmdet.utils import ConfigType
+from ..evaluation import get_classes
+
+try:
+    from panopticapi.evaluation import VOID
+    from panopticapi.utils import id2rgb
+except ImportError:
+    id2rgb = None
+    VOID = None
+
+InputType = Union[str, np.ndarray]
+InputsType = Union[InputType, Sequence[InputType]]
+PredType = List[DetDataSample]
+ImgType = Union[np.ndarray, Sequence[np.ndarray]]
+
+IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
+                  '.tiff', '.webp')
+
+
+class DetInferencer(BaseInferencer):
+    """Object Detection Inferencer.
+
+    Args:
+        model (str, optional): Path to the config file or the model name
+            defined in metafile. For example, it could be
+            "rtmdet-s" or 'rtmdet_s_8xb32-300e_coco' or
+            "configs/rtmdet/rtmdet_s_8xb32-300e_coco.py".
+            If model is not specified, user must provide the
+            `weights` saved by MMEngine which contains the config string.
+            Defaults to None.
+        weights (str, optional): Path to the checkpoint. If it is not specified
+            and model is a model name of metafile, the weights will be loaded
+            from metafile. Defaults to None.
+        device (str, optional): Device to run inference. If None, the available
+            device will be automatically used. Defaults to None.
+        scope (str, optional): The scope of the model. Defaults to mmdet.
+        palette (str): Color palette used for visualization. The order of
+            priority is palette -> config -> checkpoint. Defaults to 'none'.
+        show_progress (bool): Control whether to display the progress
+            bar during the inference process. Defaults to True.
+    """
+
+    preprocess_kwargs: set = set()
+    forward_kwargs: set = set()
+    visualize_kwargs: set = {
+        'return_vis',
+        'show',
+        'wait_time',
+        'draw_pred',
+        'pred_score_thr',
+        'img_out_dir',
+        'no_save_vis',
+    }
+    postprocess_kwargs: set = {
+        'print_result',
+        'pred_out_dir',
+        'return_datasamples',
+        'no_save_pred',
+    }
+
+    def __init__(self,
+                 model: Optional[Union[ModelType, str]] = None,
+                 weights: Optional[str] = None,
+                 device: Optional[str] = None,
+                 scope: Optional[str] = 'mmdet',
+                 palette: str = 'none',
+                 show_progress: bool = True) -> None:
+        # A global counter tracking the number of images processed, for
+        # naming of the output images
+        self.num_visualized_imgs = 0
+        self.num_predicted_imgs = 0
+        self.palette = palette
+        init_default_scope(scope)
+        super().__init__(
+            model=model, weights=weights, device=device, scope=scope)
+        self.model = revert_sync_batchnorm(self.model)
+        self.show_progress = show_progress
+
+    def _load_weights_to_model(self, model: nn.Module,
+                               checkpoint: Optional[dict],
+                               cfg: Optional[ConfigType]) -> None:
+        """Loading model weights and meta information from cfg and checkpoint.
+
+        Args:
+            model (nn.Module): Model to load weights and meta information.
+            checkpoint (dict, optional): The loaded checkpoint.
+            cfg (Config or ConfigDict, optional): The loaded config.
+        """
+
+        if checkpoint is not None:
+            _load_checkpoint_to_model(model, checkpoint)
+            checkpoint_meta = checkpoint.get('meta', {})
+            # save the dataset_meta in the model for convenience
+            if 'dataset_meta' in checkpoint_meta:
+                # mmdet 3.x, all keys should be lowercase
+                model.dataset_meta = {
+                    k.lower(): v
+                    for k, v in checkpoint_meta['dataset_meta'].items()
+                }
+            elif 'CLASSES' in checkpoint_meta:
+                # < mmdet 3.x
+                classes = checkpoint_meta['CLASSES']
+                model.dataset_meta = {'classes': classes}
+            else:
+                warnings.warn(
+                    'dataset_meta or class names are not saved in the '
+                    'checkpoint\'s meta data, use COCO classes by default.')
+                model.dataset_meta = {'classes': get_classes('coco')}
+        else:
+            warnings.warn('Checkpoint is not loaded, and the inference '
+                          'result is calculated by the randomly initialized '
+                          'model!')
+            warnings.warn('weights is None, use COCO classes by default.')
+            model.dataset_meta = {'classes': get_classes('coco')}
+
+        # Priority:  args.palette -> config -> checkpoint
+        if self.palette != 'none':
+            model.dataset_meta['palette'] = self.palette
+        else:
+            test_dataset_cfg = copy.deepcopy(cfg.test_dataloader.dataset)
+            # lazy init. We only need the metainfo.
+            test_dataset_cfg['lazy_init'] = True
+            metainfo = DATASETS.build(test_dataset_cfg).metainfo
+            cfg_palette = metainfo.get('palette', None)
+            if cfg_palette is not None:
+                model.dataset_meta['palette'] = cfg_palette
+            else:
+                if 'palette' not in model.dataset_meta:
+                    warnings.warn(
+                        'palette does not exist, random is used by default. '
+                        'You can also set the palette to customize.')
+                    model.dataset_meta['palette'] = 'random'
+
+    def _init_pipeline(self, cfg: ConfigType) -> Compose:
+        """Initialize the test pipeline."""
+        pipeline_cfg = cfg.test_dataloader.dataset.pipeline
+
+        # For inference, the key of ``img_id`` is not used.
+        if 'meta_keys' in pipeline_cfg[-1]:
+            pipeline_cfg[-1]['meta_keys'] = tuple(
+                meta_key for meta_key in pipeline_cfg[-1]['meta_keys']
+                if meta_key != 'img_id')
+
+        load_img_idx = self._get_transform_idx(
+            pipeline_cfg, ('LoadImageFromFile', LoadImageFromFile))
+        if load_img_idx == -1:
+            raise ValueError(
+                'LoadImageFromFile is not found in the test pipeline')
+        pipeline_cfg[load_img_idx]['type'] = 'mmdet.InferencerLoader'
+        return Compose(pipeline_cfg)
+
+    def _get_transform_idx(self, pipeline_cfg: ConfigType,
+                           name: Union[str, Tuple[str, type]]) -> int:
+        """Returns the index of the transform in a pipeline.
+
+        If the transform is not found, returns -1.
+        """
+        for i, transform in enumerate(pipeline_cfg):
+            if transform['type'] in name:
+                return i
+        return -1
+
+    def _init_visualizer(self, cfg: ConfigType) -> Optional[Visualizer]:
+        """Initialize visualizers.
+
+        Args:
+            cfg (ConfigType): Config containing the visualizer information.
+
+        Returns:
+            Visualizer or None: Visualizer initialized with config.
+        """
+        visualizer = super()._init_visualizer(cfg)
+        visualizer.dataset_meta = self.model.dataset_meta
+        return visualizer
+
+    def _inputs_to_list(self, inputs: InputsType) -> list:
+        """Preprocess the inputs to a list.
+
+        Preprocess inputs to a list according to its type:
+
+        - list or tuple: return inputs
+        - str:
+            - Directory path: return all files in the directory
+            - other cases: return a list containing the string. The string
+              could be a path to file, a url or other types of string according
+              to the task.
+
+        Args:
+            inputs (InputsType): Inputs for the inferencer.
+
+        Returns:
+            list: List of input for the :meth:`preprocess`.
+        """
+        if isinstance(inputs, str):
+            backend = get_file_backend(inputs)
+            if hasattr(backend, 'isdir') and isdir(inputs):
+                # Backends like HttpsBackend do not implement `isdir`, so only
+                # those backends that implement `isdir` could accept the inputs
+                # as a directory
+                filename_list = list_dir_or_file(
+                    inputs, list_dir=False, suffix=IMG_EXTENSIONS)
+                inputs = [
+                    join_path(inputs, filename) for filename in filename_list
+                ]
+
+        if not isinstance(inputs, (list, tuple)):
+            inputs = [inputs]
+
+        return list(inputs)
+
+    def preprocess(self, inputs: InputsType, batch_size: int = 1, **kwargs):
+        """Process the inputs into a model-feedable format.
+
+        Customize your preprocess by overriding this method. Preprocess should
+        return an iterable object, of which each item will be used as the
+        input of ``model.test_step``.
+
+        ``BaseInferencer.preprocess`` will return an iterable chunked data,
+        which will be used in __call__ like this:
+
+        .. code-block:: python
+
+            def __call__(self, inputs, batch_size=1, **kwargs):
+                chunked_data = self.preprocess(inputs, batch_size, **kwargs)
+                for batch in chunked_data:
+                    preds = self.forward(batch, **kwargs)
+
+        Args:
+            inputs (InputsType): Inputs given by user.
+            batch_size (int): batch size. Defaults to 1.
+
+        Yields:
+            Any: Data processed by the ``pipeline`` and ``collate_fn``.
+        """
+        chunked_data = self._get_chunk_data(inputs, batch_size)
+        yield from map(self.collate_fn, chunked_data)
+
+    def _get_chunk_data(self, inputs: Iterable, chunk_size: int):
+        """Get batch data from inputs.
+
+        Args:
+            inputs (Iterable): An iterable dataset.
+            chunk_size (int): Equivalent to batch size.
+
+        Yields:
+            list: batch data.
+        """
+        inputs_iter = iter(inputs)
+        while True:
+            try:
+                chunk_data = []
+                for _ in range(chunk_size):
+                    inputs_ = next(inputs_iter)
+                    if isinstance(inputs_, dict):
+                        if 'img' in inputs_:
+                            ori_inputs_ = inputs_['img']
+                        else:
+                            ori_inputs_ = inputs_['img_path']
+                        chunk_data.append(
+                            (ori_inputs_,
+                             self.pipeline(copy.deepcopy(inputs_))))
+                    else:
+                        chunk_data.append((inputs_, self.pipeline(inputs_)))
+                yield chunk_data
+            except StopIteration:
+                if chunk_data:
+                    yield chunk_data
+                break
+
+    # TODO: Video and Webcam are currently not supported and
+    #  may consume too much memory if your input folder has a lot of images.
+    #  We will be optimized later.
+    def __call__(
+            self,
+            inputs: InputsType,
+            batch_size: int = 1,
+            return_vis: bool = False,
+            show: bool = False,
+            wait_time: int = 0,
+            no_save_vis: bool = False,
+            draw_pred: bool = True,
+            pred_score_thr: float = 0.3,
+            return_datasamples: bool = False,
+            print_result: bool = False,
+            no_save_pred: bool = True,
+            out_dir: str = '',
+            # by open image task
+            texts: Optional[Union[str, list]] = None,
+            # by open panoptic task
+            stuff_texts: Optional[Union[str, list]] = None,
+            # by GLIP and Grounding DINO
+            custom_entities: bool = False,
+            # by Grounding DINO
+            tokens_positive: Optional[Union[int, list]] = None,
+            **kwargs) -> dict:
+        """Call the inferencer.
+
+        Args:
+            inputs (InputsType): Inputs for the inferencer.
+            batch_size (int): Inference batch size. Defaults to 1.
+            show (bool): Whether to display the visualization results in a
+                popup window. Defaults to False.
+            wait_time (float): The interval of show (s). Defaults to 0.
+            no_save_vis (bool): Whether to force not to save prediction
+                vis results. Defaults to False.
+            draw_pred (bool): Whether to draw predicted bounding boxes.
+                Defaults to True.
+            pred_score_thr (float): Minimum score of bboxes to draw.
+                Defaults to 0.3.
+            return_datasamples (bool): Whether to return results as
+                :obj:`DetDataSample`. Defaults to False.
+            print_result (bool): Whether to print the inference result w/o
+                visualization to the console. Defaults to False.
+            no_save_pred (bool): Whether to force not to save prediction
+                results. Defaults to True.
+            out_dir: Dir to save the inference results or
+                visualization. If left as empty, no file will be saved.
+                Defaults to ''.
+            texts (str | list[str]): Text prompts. Defaults to None.
+            stuff_texts (str | list[str]): Stuff text prompts of open
+                panoptic task. Defaults to None.
+            custom_entities (bool): Whether to use custom entities.
+                Defaults to False. Only used in GLIP and Grounding DINO.
+            **kwargs: Other keyword arguments passed to :meth:`preprocess`,
+                :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
+                Each key in kwargs should be in the corresponding set of
+                ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs``
+                and ``postprocess_kwargs``.
+
+        Returns:
+            dict: Inference and visualization results.
+        """
+        (
+            preprocess_kwargs,
+            forward_kwargs,
+            visualize_kwargs,
+            postprocess_kwargs,
+        ) = self._dispatch_kwargs(**kwargs)
+
+        ori_inputs = self._inputs_to_list(inputs)
+
+        if texts is not None and isinstance(texts, str):
+            texts = [texts] * len(ori_inputs)
+        if stuff_texts is not None and isinstance(stuff_texts, str):
+            stuff_texts = [stuff_texts] * len(ori_inputs)
+
+        # Currently only supports bs=1
+        tokens_positive = [tokens_positive] * len(ori_inputs)
+
+        if texts is not None:
+            assert len(texts) == len(ori_inputs)
+            for i in range(len(texts)):
+                if isinstance(ori_inputs[i], str):
+                    ori_inputs[i] = {
+                        'text': texts[i],
+                        'img_path': ori_inputs[i],
+                        'custom_entities': custom_entities,
+                        'tokens_positive': tokens_positive[i]
+                    }
+                else:
+                    ori_inputs[i] = {
+                        'text': texts[i],
+                        'img': ori_inputs[i],
+                        'custom_entities': custom_entities,
+                        'tokens_positive': tokens_positive[i]
+                    }
+        if stuff_texts is not None:
+            assert len(stuff_texts) == len(ori_inputs)
+            for i in range(len(stuff_texts)):
+                ori_inputs[i]['stuff_text'] = stuff_texts[i]
+
+        inputs = self.preprocess(
+            ori_inputs, batch_size=batch_size, **preprocess_kwargs)
+
+        results_dict = {'predictions': [], 'visualization': []}
+        for ori_imgs, data in (track(inputs, description='Inference')
+                               if self.show_progress else inputs):
+            preds = self.forward(data, **forward_kwargs)
+            visualization = self.visualize(
+                ori_imgs,
+                preds,
+                return_vis=return_vis,
+                show=show,
+                wait_time=wait_time,
+                draw_pred=draw_pred,
+                pred_score_thr=pred_score_thr,
+                no_save_vis=no_save_vis,
+                img_out_dir=out_dir,
+                **visualize_kwargs)
+            results = self.postprocess(
+                preds,
+                visualization,
+                return_datasamples=return_datasamples,
+                print_result=print_result,
+                no_save_pred=no_save_pred,
+                pred_out_dir=out_dir,
+                **postprocess_kwargs)
+            results_dict['predictions'].extend(results['predictions'])
+            if results['visualization'] is not None:
+                results_dict['visualization'].extend(results['visualization'])
+        return results_dict
+
+    def visualize(self,
+                  inputs: InputsType,
+                  preds: PredType,
+                  return_vis: bool = False,
+                  show: bool = False,
+                  wait_time: int = 0,
+                  draw_pred: bool = True,
+                  pred_score_thr: float = 0.3,
+                  no_save_vis: bool = False,
+                  img_out_dir: str = '',
+                  **kwargs) -> Union[List[np.ndarray], None]:
+        """Visualize predictions.
+
+        Args:
+            inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer.
+            preds (List[:obj:`DetDataSample`]): Predictions of the model.
+            return_vis (bool): Whether to return the visualization result.
+                Defaults to False.
+            show (bool): Whether to display the image in a popup window.
+                Defaults to False.
+            wait_time (float): The interval of show (s). Defaults to 0.
+            draw_pred (bool): Whether to draw predicted bounding boxes.
+                Defaults to True.
+            pred_score_thr (float): Minimum score of bboxes to draw.
+                Defaults to 0.3.
+            no_save_vis (bool): Whether to force not to save prediction
+                vis results. Defaults to False.
+            img_out_dir (str): Output directory of visualization results.
+                If left as empty, no file will be saved. Defaults to ''.
+
+        Returns:
+            List[np.ndarray] or None: Returns visualization results only if
+            applicable.
+        """
+        if no_save_vis is True:
+            img_out_dir = ''
+
+        if not show and img_out_dir == '' and not return_vis:
+            return None
+
+        if self.visualizer is None:
+            raise ValueError('Visualization needs the "visualizer" term'
+                             'defined in the config, but got None.')
+
+        results = []
+
+        for single_input, pred in zip(inputs, preds):
+            if isinstance(single_input, str):
+                img_bytes = mmengine.fileio.get(single_input)
+                img = mmcv.imfrombytes(img_bytes)
+                img = img[:, :, ::-1]
+                img_name = osp.basename(single_input)
+            elif isinstance(single_input, np.ndarray):
+                img = single_input.copy()
+                img_num = str(self.num_visualized_imgs).zfill(8)
+                img_name = f'{img_num}.jpg'
+            else:
+                raise ValueError('Unsupported input type: '
+                                 f'{type(single_input)}')
+
+            out_file = osp.join(img_out_dir, 'vis',
+                                img_name) if img_out_dir != '' else None
+
+            self.visualizer.add_datasample(
+                img_name,
+                img,
+                pred,
+                show=show,
+                wait_time=wait_time,
+                draw_gt=False,
+                draw_pred=draw_pred,
+                pred_score_thr=pred_score_thr,
+                out_file=out_file,
+            )
+            results.append(self.visualizer.get_image())
+            self.num_visualized_imgs += 1
+
+        return results
+
+    def postprocess(
+        self,
+        preds: PredType,
+        visualization: Optional[List[np.ndarray]] = None,
+        return_datasamples: bool = False,
+        print_result: bool = False,
+        no_save_pred: bool = False,
+        pred_out_dir: str = '',
+        **kwargs,
+    ) -> Dict:
+        """Process the predictions and visualization results from ``forward``
+        and ``visualize``.
+
+        This method should be responsible for the following tasks:
+
+        1. Convert datasamples into a json-serializable dict if needed.
+        2. Pack the predictions and visualization results and return them.
+        3. Dump or log the predictions.
+
+        Args:
+            preds (List[:obj:`DetDataSample`]): Predictions of the model.
+            visualization (Optional[np.ndarray]): Visualized predictions.
+            return_datasamples (bool): Whether to use Datasample to store
+                inference results. If False, dict will be used.
+            print_result (bool): Whether to print the inference result w/o
+                visualization to the console. Defaults to False.
+            no_save_pred (bool): Whether to force not to save prediction
+                results. Defaults to False.
+            pred_out_dir: Dir to save the inference results w/o
+                visualization. If left as empty, no file will be saved.
+                Defaults to ''.
+
+        Returns:
+            dict: Inference and visualization results with key ``predictions``
+            and ``visualization``.
+
+            - ``visualization`` (Any): Returned by :meth:`visualize`.
+            - ``predictions`` (dict or DataSample): Returned by
+                :meth:`forward` and processed in :meth:`postprocess`.
+                If ``return_datasamples=False``, it usually should be a
+                json-serializable dict containing only basic data elements such
+                as strings and numbers.
+        """
+        if no_save_pred is True:
+            pred_out_dir = ''
+
+        result_dict = {}
+        results = preds
+        if not return_datasamples:
+            results = []
+            for pred in preds:
+                result = self.pred2dict(pred, pred_out_dir)
+                results.append(result)
+        elif pred_out_dir != '':
+            warnings.warn('Currently does not support saving datasample '
+                          'when return_datasamples is set to True. '
+                          'Prediction results are not saved!')
+        # Add img to the results after printing and dumping
+        result_dict['predictions'] = results
+        if print_result:
+            print(result_dict)
+        result_dict['visualization'] = visualization
+        return result_dict
+
+    # TODO: The data format and fields saved in json need further discussion.
+    #  Maybe should include model name, timestamp, filename, image info etc.
+    def pred2dict(self,
+                  data_sample: DetDataSample,
+                  pred_out_dir: str = '') -> Dict:
+        """Extract elements necessary to represent a prediction into a
+        dictionary.
+
+        It's better to contain only basic data elements such as strings and
+        numbers in order to guarantee it's json-serializable.
+
+        Args:
+            data_sample (:obj:`DetDataSample`): Predictions of the model.
+            pred_out_dir: Dir to save the inference results w/o
+                visualization. If left as empty, no file will be saved.
+                Defaults to ''.
+
+        Returns:
+            dict: Prediction results.
+        """
+        is_save_pred = True
+        if pred_out_dir == '':
+            is_save_pred = False
+
+        if is_save_pred and 'img_path' in data_sample:
+            img_path = osp.basename(data_sample.img_path)
+            img_path = osp.splitext(img_path)[0]
+            out_img_path = osp.join(pred_out_dir, 'preds',
+                                    img_path + '_panoptic_seg.png')
+            out_json_path = osp.join(pred_out_dir, 'preds', img_path + '.json')
+        elif is_save_pred:
+            out_img_path = osp.join(
+                pred_out_dir, 'preds',
+                f'{self.num_predicted_imgs}_panoptic_seg.png')
+            out_json_path = osp.join(pred_out_dir, 'preds',
+                                     f'{self.num_predicted_imgs}.json')
+            self.num_predicted_imgs += 1
+
+        result = {}
+        if 'pred_instances' in data_sample:
+            masks = data_sample.pred_instances.get('masks')
+            pred_instances = data_sample.pred_instances.numpy()
+            result = {
+                'labels': pred_instances.labels.tolist(),
+                'scores': pred_instances.scores.tolist()
+            }
+            if 'bboxes' in pred_instances:
+                result['bboxes'] = pred_instances.bboxes.tolist()
+            if masks is not None:
+                if 'bboxes' not in pred_instances or pred_instances.bboxes.sum(
+                ) == 0:
+                    # Fake bbox, such as the SOLO.
+                    bboxes = mask2bbox(masks.cpu()).numpy().tolist()
+                    result['bboxes'] = bboxes
+                encode_masks = encode_mask_results(pred_instances.masks)
+                for encode_mask in encode_masks:
+                    if isinstance(encode_mask['counts'], bytes):
+                        encode_mask['counts'] = encode_mask['counts'].decode()
+                result['masks'] = encode_masks
+
+        if 'pred_panoptic_seg' in data_sample:
+            if VOID is None:
+                raise RuntimeError(
+                    'panopticapi is not installed, please install it by: '
+                    'pip install git+https://github.com/cocodataset/'
+                    'panopticapi.git.')
+
+            pan = data_sample.pred_panoptic_seg.sem_seg.cpu().numpy()[0]
+            pan[pan % INSTANCE_OFFSET == len(
+                self.model.dataset_meta['classes'])] = VOID
+            pan = id2rgb(pan).astype(np.uint8)
+
+            if is_save_pred:
+                mmcv.imwrite(pan[:, :, ::-1], out_img_path)
+                result['panoptic_seg_path'] = out_img_path
+            else:
+                result['panoptic_seg'] = pan
+
+        if is_save_pred:
+            mmengine.dump(result, out_json_path)
+
+        return result
diff --git a/mmde/mmdet/apis/inference.py b/mmde/mmdet/apis/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e6f914ecabf4b9c110a4fd15310bc97d0197db9
--- /dev/null
+++ b/mmde/mmdet/apis/inference.py
@@ -0,0 +1,372 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+from pathlib import Path
+from typing import Optional, Sequence, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.ops import RoIPool
+from mmcv.transforms import Compose
+from mmengine.config import Config
+from mmengine.dataset import default_collate
+from mmengine.model.utils import revert_sync_batchnorm
+from mmengine.registry import init_default_scope
+from mmengine.runner import load_checkpoint
+
+from mmdet.registry import DATASETS
+from mmdet.utils import ConfigType
+from ..evaluation import get_classes
+from ..registry import MODELS
+from ..structures import DetDataSample, SampleList
+from ..utils import get_test_pipeline_cfg
+
+
+def init_detector(
+    config: Union[str, Path, Config],
+    checkpoint: Optional[str] = None,
+    palette: str = 'none',
+    device: str = 'cuda:0',
+    cfg_options: Optional[dict] = None,
+) -> nn.Module:
+    """Initialize a detector from config file.
+
+    Args:
+        config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path,
+            :obj:`Path`, or the config object.
+        checkpoint (str, optional): Checkpoint path. If left as None, the model
+            will not load any weights.
+        palette (str): Color palette used for visualization. If palette
+            is stored in checkpoint, use checkpoint's palette first, otherwise
+            use externally passed palette. Currently, supports 'coco', 'voc',
+            'citys' and 'random'. Defaults to none.
+        device (str): The device where the anchors will be put on.
+            Defaults to cuda:0.
+        cfg_options (dict, optional): Options to override some settings in
+            the used config.
+
+    Returns:
+        nn.Module: The constructed detector.
+    """
+    if isinstance(config, (str, Path)):
+        config = Config.fromfile(config)
+    elif not isinstance(config, Config):
+        raise TypeError('config must be a filename or Config object, '
+                        f'but got {type(config)}')
+    if cfg_options is not None:
+        config.merge_from_dict(cfg_options)
+    elif 'init_cfg' in config.model.backbone:
+        config.model.backbone.init_cfg = None
+
+    scope = config.get('default_scope', 'mmdet')
+    if scope is not None:
+        init_default_scope(config.get('default_scope', 'mmdet'))
+
+    model = MODELS.build(config.model)
+    model = revert_sync_batchnorm(model)
+    if checkpoint is None:
+        warnings.simplefilter('once')
+        warnings.warn('checkpoint is None, use COCO classes by default.')
+        model.dataset_meta = {'classes': get_classes('coco')}
+    else:
+        checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
+        # Weights converted from elsewhere may not have meta fields.
+        checkpoint_meta = checkpoint.get('meta', {})
+
+        # save the dataset_meta in the model for convenience
+        if 'dataset_meta' in checkpoint_meta:
+            # mmdet 3.x, all keys should be lowercase
+            model.dataset_meta = {
+                k.lower(): v
+                for k, v in checkpoint_meta['dataset_meta'].items()
+            }
+        elif 'CLASSES' in checkpoint_meta:
+            # < mmdet 3.x
+            classes = checkpoint_meta['CLASSES']
+            model.dataset_meta = {'classes': classes}
+        else:
+            warnings.simplefilter('once')
+            warnings.warn(
+                'dataset_meta or class names are not saved in the '
+                'checkpoint\'s meta data, use COCO classes by default.')
+            model.dataset_meta = {'classes': get_classes('coco')}
+
+    # Priority:  args.palette -> config -> checkpoint
+    if palette != 'none':
+        model.dataset_meta['palette'] = palette
+    else:
+        test_dataset_cfg = copy.deepcopy(config.test_dataloader.dataset)
+        # lazy init. We only need the metainfo.
+        test_dataset_cfg['lazy_init'] = True
+        metainfo = DATASETS.build(test_dataset_cfg).metainfo
+        cfg_palette = metainfo.get('palette', None)
+        if cfg_palette is not None:
+            model.dataset_meta['palette'] = cfg_palette
+        else:
+            if 'palette' not in model.dataset_meta:
+                warnings.warn(
+                    'palette does not exist, random is used by default. '
+                    'You can also set the palette to customize.')
+                model.dataset_meta['palette'] = 'random'
+
+    model.cfg = config  # save the config in the model for convenience
+    model.to(device)
+    model.eval()
+    return model
+
+
+ImagesType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]]
+
+
+def inference_detector(
+    model: nn.Module,
+    imgs: ImagesType,
+    test_pipeline: Optional[Compose] = None,
+    text_prompt: Optional[str] = None,
+    custom_entities: bool = False,
+) -> Union[DetDataSample, SampleList]:
+    """Inference image(s) with the detector.
+
+    Args:
+        model (nn.Module): The loaded detector.
+        imgs (str, ndarray, Sequence[str/ndarray]):
+           Either image files or loaded images.
+        test_pipeline (:obj:`Compose`): Test pipeline.
+
+    Returns:
+        :obj:`DetDataSample` or list[:obj:`DetDataSample`]:
+        If imgs is a list or tuple, the same length list type results
+        will be returned, otherwise return the detection results directly.
+    """
+
+    if isinstance(imgs, (list, tuple)):
+        is_batch = True
+    else:
+        imgs = [imgs]
+        is_batch = False
+
+    cfg = model.cfg
+
+    if test_pipeline is None:
+        cfg = cfg.copy()
+        test_pipeline = get_test_pipeline_cfg(cfg)
+        if isinstance(imgs[0], np.ndarray):
+            # Calling this method across libraries will result
+            # in module unregistered error if not prefixed with mmdet.
+            test_pipeline[0].type = 'mmdet.LoadImageFromNDArray'
+
+        test_pipeline = Compose(test_pipeline)
+
+    if model.data_preprocessor.device.type == 'cpu':
+        for m in model.modules():
+            assert not isinstance(
+                m, RoIPool
+            ), 'CPU inference with RoIPool is not supported currently.'
+
+    result_list = []
+    for i, img in enumerate(imgs):
+        # prepare data
+        if isinstance(img, np.ndarray):
+            # TODO: remove img_id.
+            data_ = dict(img=img, img_id=0)
+        else:
+            # TODO: remove img_id.
+            data_ = dict(img_path=img, img_id=0)
+
+        if text_prompt:
+            data_['text'] = text_prompt
+            data_['custom_entities'] = custom_entities
+
+        # build the data pipeline
+        data_ = test_pipeline(data_)
+
+        data_['inputs'] = [data_['inputs']]
+        data_['data_samples'] = [data_['data_samples']]
+
+        # forward the model
+        with torch.no_grad():
+            results = model.test_step(data_)[0]
+
+        result_list.append(results)
+
+    if not is_batch:
+        return result_list[0]
+    else:
+        return result_list
+
+
+# TODO: Awaiting refactoring
+async def async_inference_detector(model, imgs):
+    """Async inference image(s) with the detector.
+
+    Args:
+        model (nn.Module): The loaded detector.
+        img (str | ndarray): Either image files or loaded images.
+
+    Returns:
+        Awaitable detection results.
+    """
+    if not isinstance(imgs, (list, tuple)):
+        imgs = [imgs]
+
+    cfg = model.cfg
+
+    if isinstance(imgs[0], np.ndarray):
+        cfg = cfg.copy()
+        # set loading pipeline type
+        cfg.data.test.pipeline[0].type = 'LoadImageFromNDArray'
+
+    # cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
+    test_pipeline = Compose(cfg.data.test.pipeline)
+
+    datas = []
+    for img in imgs:
+        # prepare data
+        if isinstance(img, np.ndarray):
+            # directly add img
+            data = dict(img=img)
+        else:
+            # add information into dict
+            data = dict(img_info=dict(filename=img), img_prefix=None)
+        # build the data pipeline
+        data = test_pipeline(data)
+        datas.append(data)
+
+    for m in model.modules():
+        assert not isinstance(
+            m,
+            RoIPool), 'CPU inference with RoIPool is not supported currently.'
+
+    # We don't restore `torch.is_grad_enabled()` value during concurrent
+    # inference since execution can overlap
+    torch.set_grad_enabled(False)
+    results = await model.aforward_test(data, rescale=True)
+    return results
+
+
+def build_test_pipeline(cfg: ConfigType) -> ConfigType:
+    """Build test_pipeline for mot/vis demo. In mot/vis infer, original
+    test_pipeline should remove the "LoadImageFromFile" and
+    "LoadTrackAnnotations".
+
+    Args:
+         cfg (ConfigDict): The loaded config.
+    Returns:
+         ConfigType: new test_pipeline
+    """
+    # remove the "LoadImageFromFile" and "LoadTrackAnnotations" in pipeline
+    transform_broadcaster = cfg.test_dataloader.dataset.pipeline[0].copy()
+    for transform in transform_broadcaster['transforms']:
+        if transform['type'] == 'Resize':
+            transform_broadcaster['transforms'] = transform
+    pack_track_inputs = cfg.test_dataloader.dataset.pipeline[-1].copy()
+    test_pipeline = Compose([transform_broadcaster, pack_track_inputs])
+
+    return test_pipeline
+
+
+def inference_mot(model: nn.Module, img: np.ndarray, frame_id: int,
+                  video_len: int) -> SampleList:
+    """Inference image(s) with the mot model.
+
+    Args:
+        model (nn.Module): The loaded mot model.
+        img (np.ndarray): Loaded image.
+        frame_id (int): frame id.
+        video_len (int): demo video length
+    Returns:
+        SampleList: The tracking data samples.
+    """
+    cfg = model.cfg
+    data = dict(
+        img=[img.astype(np.float32)],
+        frame_id=[frame_id],
+        ori_shape=[img.shape[:2]],
+        img_id=[frame_id + 1],
+        ori_video_length=[video_len])
+
+    test_pipeline = build_test_pipeline(cfg)
+    data = test_pipeline(data)
+
+    if not next(model.parameters()).is_cuda:
+        for m in model.modules():
+            assert not isinstance(
+                m, RoIPool
+            ), 'CPU inference with RoIPool is not supported currently.'
+
+    # forward the model
+    with torch.no_grad():
+        data = default_collate([data])
+        result = model.test_step(data)[0]
+    return result
+
+
+def init_track_model(config: Union[str, Config],
+                     checkpoint: Optional[str] = None,
+                     detector: Optional[str] = None,
+                     reid: Optional[str] = None,
+                     device: str = 'cuda:0',
+                     cfg_options: Optional[dict] = None) -> nn.Module:
+    """Initialize a model from config file.
+
+    Args:
+        config (str or :obj:`mmengine.Config`): Config file path or the config
+            object.
+        checkpoint (Optional[str], optional): Checkpoint path. Defaults to
+            None.
+        detector (Optional[str], optional): Detector Checkpoint path, use in
+            some tracking algorithms like sort.  Defaults to None.
+        reid (Optional[str], optional): Reid checkpoint path. use in
+            some tracking algorithms like sort. Defaults to None.
+        device (str, optional): The device that the model inferences on.
+            Defaults to `cuda:0`.
+        cfg_options (Optional[dict], optional): Options to override some
+            settings in the used config. Defaults to None.
+
+    Returns:
+        nn.Module: The constructed model.
+    """
+    if isinstance(config, str):
+        config = Config.fromfile(config)
+    elif not isinstance(config, Config):
+        raise TypeError('config must be a filename or Config object, '
+                        f'but got {type(config)}')
+    if cfg_options is not None:
+        config.merge_from_dict(cfg_options)
+
+    model = MODELS.build(config.model)
+
+    if checkpoint is not None:
+        checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
+        # Weights converted from elsewhere may not have meta fields.
+        checkpoint_meta = checkpoint.get('meta', {})
+        # save the dataset_meta in the model for convenience
+        if 'dataset_meta' in checkpoint_meta:
+            if 'CLASSES' in checkpoint_meta['dataset_meta']:
+                value = checkpoint_meta['dataset_meta'].pop('CLASSES')
+                checkpoint_meta['dataset_meta']['classes'] = value
+            model.dataset_meta = checkpoint_meta['dataset_meta']
+
+    if detector is not None:
+        assert not (checkpoint and detector), \
+            'Error: checkpoint and detector checkpoint cannot both exist'
+        load_checkpoint(model.detector, detector, map_location='cpu')
+
+    if reid is not None:
+        assert not (checkpoint and reid), \
+            'Error: checkpoint and reid checkpoint cannot both exist'
+        load_checkpoint(model.reid, reid, map_location='cpu')
+
+    # Some methods don't load checkpoints or checkpoints don't contain
+    # 'dataset_meta'
+    # VIS need dataset_meta, MOT don't need dataset_meta
+    if not hasattr(model, 'dataset_meta'):
+        warnings.warn('dataset_meta or class names are missed, '
+                      'use None by default.')
+        model.dataset_meta = {'classes': None}
+
+    model.cfg = config  # save the config in the model for convenience
+    model.to(device)
+    model.eval()
+    return model
diff --git a/mmde/mmdet/datasets/__init__.py b/mmde/mmdet/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..670c207cacf9ed0f9fee88bada119ee3aaa85eae
--- /dev/null
+++ b/mmde/mmdet/datasets/__init__.py
@@ -0,0 +1,53 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .ade20k import (ADE20KInstanceDataset, ADE20KPanopticDataset,
+                     ADE20KSegDataset)
+from .base_det_dataset import BaseDetDataset
+from .base_semseg_dataset import BaseSegDataset
+from .base_video_dataset import BaseVideoDataset
+from .cityscapes import CityscapesDataset
+from .coco import CocoDataset
+from .coco_caption import CocoCaptionDataset
+from .coco_panoptic import CocoPanopticDataset
+from .coco_semantic import CocoSegDataset
+from .crowdhuman import CrowdHumanDataset
+from .dataset_wrappers import ConcatDataset, MultiImageMixDataset
+from .deepfashion import DeepFashionDataset
+from .dod import DODDataset
+from .dsdl import DSDLDetDataset
+from .flickr30k import Flickr30kDataset
+from .isaid import iSAIDDataset
+from .lvis import LVISDataset, LVISV1Dataset, LVISV05Dataset
+from .mdetr_style_refcoco import MDETRStyleRefCocoDataset
+from .mot_challenge_dataset import MOTChallengeDataset
+from .objects365 import Objects365V1Dataset, Objects365V2Dataset
+from .odvg import ODVGDataset
+from .openimages import OpenImagesChallengeDataset, OpenImagesDataset
+from .refcoco import RefCocoDataset
+from .reid_dataset import ReIDDataset
+from .samplers import (AspectRatioBatchSampler, ClassAwareSampler,
+                       CustomSampleSizeSampler, GroupMultiSourceSampler,
+                       MultiSourceSampler, TrackAspectRatioBatchSampler,
+                       TrackImgSampler)
+from .utils import get_loading_pipeline
+from .v3det import V3DetDataset
+from .voc import VOCDataset
+from .wider_face import WIDERFaceDataset
+from .xml_style import XMLDataset
+from .youtube_vis_dataset import YouTubeVISDataset
+
+__all__ = [
+    'XMLDataset', 'CocoDataset', 'DeepFashionDataset', 'VOCDataset',
+    'CityscapesDataset', 'LVISDataset', 'LVISV05Dataset', 'LVISV1Dataset',
+    'WIDERFaceDataset', 'get_loading_pipeline', 'CocoPanopticDataset',
+    'MultiImageMixDataset', 'OpenImagesDataset', 'OpenImagesChallengeDataset',
+    'AspectRatioBatchSampler', 'ClassAwareSampler', 'MultiSourceSampler',
+    'GroupMultiSourceSampler', 'BaseDetDataset', 'CrowdHumanDataset',
+    'Objects365V1Dataset', 'Objects365V2Dataset', 'DSDLDetDataset',
+    'BaseVideoDataset', 'MOTChallengeDataset', 'TrackImgSampler',
+    'ReIDDataset', 'YouTubeVISDataset', 'TrackAspectRatioBatchSampler',
+    'ADE20KPanopticDataset', 'CocoCaptionDataset', 'RefCocoDataset',
+    'BaseSegDataset', 'ADE20KSegDataset', 'CocoSegDataset',
+    'ADE20KInstanceDataset', 'iSAIDDataset', 'V3DetDataset', 'ConcatDataset',
+    'ODVGDataset', 'MDETRStyleRefCocoDataset', 'DODDataset',
+    'CustomSampleSizeSampler', 'Flickr30kDataset'
+]
diff --git a/mmde/mmdet/datasets/ade20k.py b/mmde/mmdet/datasets/ade20k.py
new file mode 100644
index 0000000000000000000000000000000000000000..573271cb5d0cb83571564272895bddde9a5f6ad7
--- /dev/null
+++ b/mmde/mmdet/datasets/ade20k.py
@@ -0,0 +1,260 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import List
+
+from mmengine import fileio
+
+from mmdet.registry import DATASETS
+from .base_semseg_dataset import BaseSegDataset
+from .coco import CocoDataset
+from .coco_panoptic import CocoPanopticDataset
+
+ADE_PALETTE = [(120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50),
+               (4, 200, 3), (120, 120, 80), (140, 140, 140), (204, 5, 255),
+               (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7),
+               (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82),
+               (143, 255, 140), (204, 255, 4), (255, 51, 7), (204, 70, 3),
+               (0, 102, 200), (61, 230, 250), (255, 6, 51), (11, 102, 255),
+               (255, 7, 71), (255, 9, 224), (9, 7, 230), (220, 220, 220),
+               (255, 9, 92), (112, 9, 255), (8, 255, 214), (7, 255, 224),
+               (255, 184, 6), (10, 255, 71), (255, 41, 10), (7, 255, 255),
+               (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7),
+               (255, 122, 8), (0, 255, 20), (255, 8, 41), (255, 5, 153),
+               (6, 51, 255), (235, 12, 255), (160, 150, 20), (0, 163, 255),
+               (140, 140, 140), (250, 10, 15), (20, 255, 0), (31, 255, 0),
+               (255, 31, 0), (255, 224, 0), (153, 255, 0), (0, 0, 255),
+               (255, 71, 0), (0, 235, 255), (0, 173, 255), (31, 0, 255),
+               (11, 200, 200), (255, 82, 0), (0, 255, 245), (0, 61, 255),
+               (0, 255, 112), (0, 255, 133), (255, 0, 0), (255, 163, 0),
+               (255, 102, 0), (194, 255, 0), (0, 143, 255), (51, 255, 0),
+               (0, 82, 255), (0, 255, 41), (0, 255, 173), (10, 0, 255),
+               (173, 255, 0), (0, 255, 153), (255, 92, 0), (255, 0, 255),
+               (255, 0, 245), (255, 0, 102), (255, 173, 0), (255, 0, 20),
+               (255, 184, 184), (0, 31, 255), (0, 255, 61), (0, 71, 255),
+               (255, 0, 204), (0, 255, 194), (0, 255, 82), (0, 10, 255),
+               (0, 112, 255), (51, 0, 255), (0, 194, 255), (0, 122, 255),
+               (0, 255, 163), (255, 153, 0), (0, 255, 10), (255, 112, 0),
+               (143, 255, 0), (82, 0, 255), (163, 255, 0), (255, 235, 0),
+               (8, 184, 170), (133, 0, 255), (0, 255, 92), (184, 0, 255),
+               (255, 0, 31), (0, 184, 255), (0, 214, 255), (255, 0, 112),
+               (92, 255, 0), (0, 224, 255), (112, 224, 255), (70, 184, 160),
+               (163, 0, 255), (153, 0, 255), (71, 255, 0), (255, 0, 163),
+               (255, 204, 0), (255, 0, 143), (0, 255, 235), (133, 255, 0),
+               (255, 0, 235), (245, 0, 255), (255, 0, 122), (255, 245, 0),
+               (10, 190, 212), (214, 255, 0), (0, 204, 255), (20, 0, 255),
+               (255, 255, 0), (0, 153, 255), (0, 41, 255), (0, 255, 204),
+               (41, 0, 255), (41, 255, 0), (173, 0, 255), (0, 245, 255),
+               (71, 0, 255), (122, 0, 255), (0, 255, 184), (0, 92, 255),
+               (184, 255, 0), (0, 133, 255), (255, 214, 0), (25, 194, 194),
+               (102, 255, 0), (92, 0, 255)]
+
+
+@DATASETS.register_module()
+class ADE20KPanopticDataset(CocoPanopticDataset):
+    METAINFO = {
+        'classes':
+        ('bed', 'window', 'cabinet', 'person', 'door', 'table', 'curtain',
+         'chair', 'car', 'painting, picture', 'sofa', 'shelf', 'mirror',
+         'armchair', 'seat', 'fence', 'desk', 'wardrobe, closet, press',
+         'lamp', 'tub', 'rail', 'cushion', 'box', 'column, pillar',
+         'signboard, sign', 'chest of drawers, chest, bureau, dresser',
+         'counter', 'sink', 'fireplace', 'refrigerator, icebox', 'stairs',
+         'case, display case, showcase, vitrine',
+         'pool table, billiard table, snooker table', 'pillow',
+         'screen door, screen', 'bookcase', 'coffee table',
+         'toilet, can, commode, crapper, pot, potty, stool, throne', 'flower',
+         'book', 'bench', 'countertop', 'stove', 'palm, palm tree',
+         'kitchen island', 'computer', 'swivel chair', 'boat',
+         'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier',
+         'awning, sunshade, sunblind', 'street lamp', 'booth', 'tv',
+         'airplane', 'clothes', 'pole',
+         'bannister, banister, balustrade, balusters, handrail',
+         'ottoman, pouf, pouffe, puff, hassock', 'bottle', 'van', 'ship',
+         'fountain', 'washer, automatic washer, washing machine',
+         'plaything, toy', 'stool', 'barrel, cask', 'basket, handbasket',
+         'bag', 'minibike, motorbike', 'oven', 'ball', 'food, solid food',
+         'step, stair', 'trade name', 'microwave', 'pot', 'animal', 'bicycle',
+         'dishwasher', 'screen', 'sculpture', 'hood, exhaust hood', 'sconce',
+         'vase', 'traffic light', 'tray', 'trash can', 'fan', 'plate',
+         'monitor', 'bulletin board', 'radiator', 'glass, drinking glass',
+         'clock', 'flag', 'wall', 'building', 'sky', 'floor', 'tree',
+         'ceiling', 'road, route', 'grass', 'sidewalk, pavement',
+         'earth, ground', 'mountain, mount', 'plant', 'water', 'house', 'sea',
+         'rug', 'field', 'rock, stone', 'base, pedestal, stand', 'sand',
+         'skyscraper', 'grandstand, covered stand', 'path', 'runway',
+         'stairway, staircase', 'river', 'bridge, span', 'blind, screen',
+         'hill', 'bar', 'hovel, hut, hutch, shack, shanty', 'tower',
+         'dirt track', 'land, ground, soil',
+         'escalator, moving staircase, moving stairway',
+         'buffet, counter, sideboard',
+         'poster, posting, placard, notice, bill, card', 'stage',
+         'conveyer belt, conveyor belt, conveyer, conveyor, transporter',
+         'canopy', 'pool', 'falls', 'tent', 'cradle', 'tank, storage tank',
+         'lake', 'blanket, cover', 'pier', 'crt screen', 'shower'),
+        'thing_classes':
+        ('bed', 'window', 'cabinet', 'person', 'door', 'table', 'curtain',
+         'chair', 'car', 'painting, picture', 'sofa', 'shelf', 'mirror',
+         'armchair', 'seat', 'fence', 'desk', 'wardrobe, closet, press',
+         'lamp', 'tub', 'rail', 'cushion', 'box', 'column, pillar',
+         'signboard, sign', 'chest of drawers, chest, bureau, dresser',
+         'counter', 'sink', 'fireplace', 'refrigerator, icebox', 'stairs',
+         'case, display case, showcase, vitrine',
+         'pool table, billiard table, snooker table', 'pillow',
+         'screen door, screen', 'bookcase', 'coffee table',
+         'toilet, can, commode, crapper, pot, potty, stool, throne', 'flower',
+         'book', 'bench', 'countertop', 'stove', 'palm, palm tree',
+         'kitchen island', 'computer', 'swivel chair', 'boat',
+         'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier',
+         'awning, sunshade, sunblind', 'street lamp', 'booth', 'tv',
+         'airplane', 'clothes', 'pole',
+         'bannister, banister, balustrade, balusters, handrail',
+         'ottoman, pouf, pouffe, puff, hassock', 'bottle', 'van', 'ship',
+         'fountain', 'washer, automatic washer, washing machine',
+         'plaything, toy', 'stool', 'barrel, cask', 'basket, handbasket',
+         'bag', 'minibike, motorbike', 'oven', 'ball', 'food, solid food',
+         'step, stair', 'trade name', 'microwave', 'pot', 'animal', 'bicycle',
+         'dishwasher', 'screen', 'sculpture', 'hood, exhaust hood', 'sconce',
+         'vase', 'traffic light', 'tray', 'trash can', 'fan', 'plate',
+         'monitor', 'bulletin board', 'radiator', 'glass, drinking glass',
+         'clock', 'flag'),
+        'stuff_classes':
+        ('wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road, route',
+         'grass', 'sidewalk, pavement', 'earth, ground', 'mountain, mount',
+         'plant', 'water', 'house', 'sea', 'rug', 'field', 'rock, stone',
+         'base, pedestal, stand', 'sand', 'skyscraper',
+         'grandstand, covered stand', 'path', 'runway', 'stairway, staircase',
+         'river', 'bridge, span', 'blind, screen', 'hill', 'bar',
+         'hovel, hut, hutch, shack, shanty', 'tower', 'dirt track',
+         'land, ground, soil', 'escalator, moving staircase, moving stairway',
+         'buffet, counter, sideboard',
+         'poster, posting, placard, notice, bill, card', 'stage',
+         'conveyer belt, conveyor belt, conveyer, conveyor, transporter',
+         'canopy', 'pool', 'falls', 'tent', 'cradle', 'tank, storage tank',
+         'lake', 'blanket, cover', 'pier', 'crt screen', 'shower'),
+        'palette':
+        ADE_PALETTE
+    }
+
+
+@DATASETS.register_module()
+class ADE20KInstanceDataset(CocoDataset):
+    METAINFO = {
+        'classes':
+        ('bed', 'windowpane', 'cabinet', 'person', 'door', 'table', 'curtain',
+         'chair', 'car', 'painting', 'sofa', 'shelf', 'mirror', 'armchair',
+         'seat', 'fence', 'desk', 'wardrobe', 'lamp', 'bathtub', 'railing',
+         'cushion', 'box', 'column', 'signboard', 'chest of drawers',
+         'counter', 'sink', 'fireplace', 'refrigerator', 'stairs', 'case',
+         'pool table', 'pillow', 'screen door', 'bookcase', 'coffee table',
+         'toilet', 'flower', 'book', 'bench', 'countertop', 'stove', 'palm',
+         'kitchen island', 'computer', 'swivel chair', 'boat',
+         'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier',
+         'awning', 'streetlight', 'booth', 'television receiver', 'airplane',
+         'apparel', 'pole', 'bannister', 'ottoman', 'bottle', 'van', 'ship',
+         'fountain', 'washer', 'plaything', 'stool', 'barrel', 'basket', 'bag',
+         'minibike', 'oven', 'ball', 'food', 'step', 'trade name', 'microwave',
+         'pot', 'animal', 'bicycle', 'dishwasher', 'screen', 'sculpture',
+         'hood', 'sconce', 'vase', 'traffic light', 'tray', 'ashcan', 'fan',
+         'plate', 'monitor', 'bulletin board', 'radiator', 'glass', 'clock',
+         'flag'),
+        'palette': [(204, 5, 255), (230, 230, 230), (224, 5, 255),
+                    (150, 5, 61), (8, 255, 51), (255, 6, 82), (255, 51, 7),
+                    (204, 70, 3), (0, 102, 200), (255, 6, 51), (11, 102, 255),
+                    (255, 7, 71), (220, 220, 220), (8, 255, 214),
+                    (7, 255, 224), (255, 184, 6), (10, 255, 71), (7, 255, 255),
+                    (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7),
+                    (0, 255, 20), (255, 8, 41), (255, 5, 153), (6, 51, 255),
+                    (235, 12, 255), (0, 163, 255), (250, 10, 15), (20, 255, 0),
+                    (255, 224, 0), (0, 0, 255), (255, 71, 0), (0, 235, 255),
+                    (0, 173, 255), (0, 255, 245), (0, 255, 112), (0, 255, 133),
+                    (255, 0, 0), (255, 163, 0), (194, 255, 0), (0, 143, 255),
+                    (51, 255, 0), (0, 82, 255), (0, 255, 41), (0, 255, 173),
+                    (10, 0, 255), (173, 255, 0), (255, 92, 0), (255, 0, 245),
+                    (255, 0, 102), (255, 173, 0), (255, 0, 20), (0, 31, 255),
+                    (0, 255, 61), (0, 71, 255), (255, 0, 204), (0, 255, 194),
+                    (0, 255, 82), (0, 112, 255), (51, 0, 255), (0, 122, 255),
+                    (255, 153, 0), (0, 255, 10), (163, 255, 0), (255, 235, 0),
+                    (8, 184, 170), (184, 0, 255), (255, 0, 31), (0, 214, 255),
+                    (255, 0, 112), (92, 255, 0), (70, 184, 160), (163, 0, 255),
+                    (71, 255, 0), (255, 0, 163), (255, 204, 0), (255, 0, 143),
+                    (133, 255, 0), (255, 0, 235), (245, 0, 255), (255, 0, 122),
+                    (255, 245, 0), (214, 255, 0), (0, 204, 255), (255, 255, 0),
+                    (0, 153, 255), (0, 41, 255), (0, 255, 204), (41, 0, 255),
+                    (41, 255, 0), (173, 0, 255), (0, 245, 255), (0, 255, 184),
+                    (0, 92, 255), (184, 255, 0), (255, 214, 0), (25, 194, 194),
+                    (102, 255, 0), (92, 0, 255)],
+    }
+
+
+@DATASETS.register_module()
+class ADE20KSegDataset(BaseSegDataset):
+    """ADE20K dataset.
+
+    In segmentation map annotation for ADE20K, 0 stands for background, which
+    is not included in 150 categories. The ``img_suffix`` is fixed to '.jpg',
+    and ``seg_map_suffix`` is fixed to '.png'.
+    """
+    METAINFO = dict(
+        classes=('wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road',
+                 'bed ', 'windowpane', 'grass', 'cabinet', 'sidewalk',
+                 'person', 'earth', 'door', 'table', 'mountain', 'plant',
+                 'curtain', 'chair', 'car', 'water', 'painting', 'sofa',
+                 'shelf', 'house', 'sea', 'mirror', 'rug', 'field', 'armchair',
+                 'seat', 'fence', 'desk', 'rock', 'wardrobe', 'lamp',
+                 'bathtub', 'railing', 'cushion', 'base', 'box', 'column',
+                 'signboard', 'chest of drawers', 'counter', 'sand', 'sink',
+                 'skyscraper', 'fireplace', 'refrigerator', 'grandstand',
+                 'path', 'stairs', 'runway', 'case', 'pool table', 'pillow',
+                 'screen door', 'stairway', 'river', 'bridge', 'bookcase',
+                 'blind', 'coffee table', 'toilet', 'flower', 'book', 'hill',
+                 'bench', 'countertop', 'stove', 'palm', 'kitchen island',
+                 'computer', 'swivel chair', 'boat', 'bar', 'arcade machine',
+                 'hovel', 'bus', 'towel', 'light', 'truck', 'tower',
+                 'chandelier', 'awning', 'streetlight', 'booth',
+                 'television receiver', 'airplane', 'dirt track', 'apparel',
+                 'pole', 'land', 'bannister', 'escalator', 'ottoman', 'bottle',
+                 'buffet', 'poster', 'stage', 'van', 'ship', 'fountain',
+                 'conveyer belt', 'canopy', 'washer', 'plaything',
+                 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall',
+                 'tent', 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food',
+                 'step', 'tank', 'trade name', 'microwave', 'pot', 'animal',
+                 'bicycle', 'lake', 'dishwasher', 'screen', 'blanket',
+                 'sculpture', 'hood', 'sconce', 'vase', 'traffic light',
+                 'tray', 'ashcan', 'fan', 'pier', 'crt screen', 'plate',
+                 'monitor', 'bulletin board', 'shower', 'radiator', 'glass',
+                 'clock', 'flag'),
+        palette=ADE_PALETTE)
+
+    def __init__(self,
+                 img_suffix='.jpg',
+                 seg_map_suffix='.png',
+                 return_classes=False,
+                 **kwargs) -> None:
+        self.return_classes = return_classes
+        super().__init__(
+            img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotation from directory or annotation file.
+
+        Returns:
+            List[dict]: All data info of dataset.
+        """
+        data_list = []
+        img_dir = self.data_prefix.get('img_path', None)
+        ann_dir = self.data_prefix.get('seg_map_path', None)
+        for img in fileio.list_dir_or_file(
+                dir_path=img_dir,
+                list_dir=False,
+                suffix=self.img_suffix,
+                recursive=True,
+                backend_args=self.backend_args):
+            data_info = dict(img_path=osp.join(img_dir, img))
+            if ann_dir is not None:
+                seg_map = img.replace(self.img_suffix, self.seg_map_suffix)
+                data_info['seg_map_path'] = osp.join(ann_dir, seg_map)
+            data_info['label_map'] = self.label_map
+            if self.return_classes:
+                data_info['text'] = list(self._metainfo['classes'])
+            data_list.append(data_info)
+        return data_list
diff --git a/mmde/mmdet/datasets/api_wrappers/__init__.py b/mmde/mmdet/datasets/api_wrappers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e3c41a2f87b14d10339955208e0502aeeeb7082
--- /dev/null
+++ b/mmde/mmdet/datasets/api_wrappers/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .coco_api import COCO, COCOeval, COCOPanoptic
+from .cocoeval_mp import COCOevalMP
+
+__all__ = ['COCO', 'COCOeval', 'COCOPanoptic', 'COCOevalMP']
diff --git a/mmde/mmdet/datasets/api_wrappers/coco_api.py b/mmde/mmdet/datasets/api_wrappers/coco_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2d11a122e1860d1b097710ff98adfddc1508c5a
--- /dev/null
+++ b/mmde/mmdet/datasets/api_wrappers/coco_api.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# This file add snake case alias for coco api
+
+import warnings
+from collections import defaultdict
+from typing import List, Optional, Union
+
+import pycocotools
+from pycocotools.coco import COCO as _COCO
+from pycocotools.cocoeval import COCOeval as _COCOeval
+
+
+class COCO(_COCO):
+    """This class is almost the same as official pycocotools package.
+
+    It implements some snake case function aliases. So that the COCO class has
+    the same interface as LVIS class.
+    """
+
+    def __init__(self, annotation_file=None):
+        if getattr(pycocotools, '__version__', '0') >= '12.0.2':
+            warnings.warn(
+                'mmpycocotools is deprecated. Please install official pycocotools by "pip install pycocotools"',  # noqa: E501
+                UserWarning)
+        super().__init__(annotation_file=annotation_file)
+        self.img_ann_map = self.imgToAnns
+        self.cat_img_map = self.catToImgs
+
+    def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None):
+        return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd)
+
+    def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]):
+        return self.getCatIds(cat_names, sup_names, cat_ids)
+
+    def get_img_ids(self, img_ids=[], cat_ids=[]):
+        return self.getImgIds(img_ids, cat_ids)
+
+    def load_anns(self, ids):
+        return self.loadAnns(ids)
+
+    def load_cats(self, ids):
+        return self.loadCats(ids)
+
+    def load_imgs(self, ids):
+        return self.loadImgs(ids)
+
+
+# just for the ease of import
+COCOeval = _COCOeval
+
+
+class COCOPanoptic(COCO):
+    """This wrapper is for loading the panoptic style annotation file.
+
+    The format is shown in the CocoPanopticDataset class.
+
+    Args:
+        annotation_file (str, optional): Path of annotation file.
+            Defaults to None.
+    """
+
+    def __init__(self, annotation_file: Optional[str] = None) -> None:
+        super(COCOPanoptic, self).__init__(annotation_file)
+
+    def createIndex(self) -> None:
+        """Create index."""
+        # create index
+        print('creating index...')
+        # anns stores 'segment_id -> annotation'
+        anns, cats, imgs = {}, {}, {}
+        img_to_anns, cat_to_imgs = defaultdict(list), defaultdict(list)
+        if 'annotations' in self.dataset:
+            for ann in self.dataset['annotations']:
+                for seg_ann in ann['segments_info']:
+                    # to match with instance.json
+                    seg_ann['image_id'] = ann['image_id']
+                    img_to_anns[ann['image_id']].append(seg_ann)
+                    # segment_id is not unique in coco dataset orz...
+                    # annotations from different images but
+                    # may have same segment_id
+                    if seg_ann['id'] in anns.keys():
+                        anns[seg_ann['id']].append(seg_ann)
+                    else:
+                        anns[seg_ann['id']] = [seg_ann]
+
+            # filter out annotations from other images
+            img_to_anns_ = defaultdict(list)
+            for k, v in img_to_anns.items():
+                img_to_anns_[k] = [x for x in v if x['image_id'] == k]
+            img_to_anns = img_to_anns_
+
+        if 'images' in self.dataset:
+            for img_info in self.dataset['images']:
+                img_info['segm_file'] = img_info['file_name'].replace(
+                    '.jpg', '.png')
+                imgs[img_info['id']] = img_info
+
+        if 'categories' in self.dataset:
+            for cat in self.dataset['categories']:
+                cats[cat['id']] = cat
+
+        if 'annotations' in self.dataset and 'categories' in self.dataset:
+            for ann in self.dataset['annotations']:
+                for seg_ann in ann['segments_info']:
+                    cat_to_imgs[seg_ann['category_id']].append(ann['image_id'])
+
+        print('index created!')
+
+        self.anns = anns
+        self.imgToAnns = img_to_anns
+        self.catToImgs = cat_to_imgs
+        self.imgs = imgs
+        self.cats = cats
+
+    def load_anns(self,
+                  ids: Union[List[int], int] = []) -> Optional[List[dict]]:
+        """Load anns with the specified ids.
+
+        ``self.anns`` is a list of annotation lists instead of a
+        list of annotations.
+
+        Args:
+            ids (Union[List[int], int]): Integer ids specifying anns.
+
+        Returns:
+            anns (List[dict], optional): Loaded ann objects.
+        """
+        anns = []
+
+        if hasattr(ids, '__iter__') and hasattr(ids, '__len__'):
+            # self.anns is a list of annotation lists instead of
+            # a list of annotations
+            for id in ids:
+                anns += self.anns[id]
+            return anns
+        elif type(ids) == int:
+            return self.anns[ids]
diff --git a/mmde/mmdet/datasets/api_wrappers/cocoeval_mp.py b/mmde/mmdet/datasets/api_wrappers/cocoeval_mp.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3673ea7a7edc593cb49fb336f352a20c1b1015b
--- /dev/null
+++ b/mmde/mmdet/datasets/api_wrappers/cocoeval_mp.py
@@ -0,0 +1,296 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import itertools
+import time
+from collections import defaultdict
+
+import numpy as np
+import torch.multiprocessing as mp
+from mmengine.logging import MMLogger
+from pycocotools.cocoeval import COCOeval
+from tqdm import tqdm
+
+
+class COCOevalMP(COCOeval):
+
+    def _prepare(self):
+        '''
+        Prepare ._gts and ._dts for evaluation based on params
+        :return: None
+        '''
+
+        def _toMask(anns, coco):
+            # modify ann['segmentation'] by reference
+            for ann in anns:
+                rle = coco.annToRLE(ann)
+                ann['segmentation'] = rle
+
+        p = self.params
+        if p.useCats:
+            gts = []
+            dts = []
+            img_ids = set(p.imgIds)
+            cat_ids = set(p.catIds)
+            for gt in self.cocoGt.dataset['annotations']:
+                if (gt['category_id'] in cat_ids) and (gt['image_id']
+                                                       in img_ids):
+                    gts.append(gt)
+            for dt in self.cocoDt.dataset['annotations']:
+                if (dt['category_id'] in cat_ids) and (dt['image_id']
+                                                       in img_ids):
+                    dts.append(dt)
+            # gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # noqa
+            # dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # noqa
+            # gts=self.cocoGt.dataset['annotations']
+            # dts=self.cocoDt.dataset['annotations']
+        else:
+            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
+            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
+
+        # convert ground truth to mask if iouType == 'segm'
+        if p.iouType == 'segm':
+            _toMask(gts, self.cocoGt)
+            _toMask(dts, self.cocoDt)
+        # set ignore flag
+        for gt in gts:
+            gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
+            gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
+            if p.iouType == 'keypoints':
+                gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
+        self._gts = defaultdict(list)  # gt for evaluation
+        self._dts = defaultdict(list)  # dt for evaluation
+        for gt in gts:
+            self._gts[gt['image_id'], gt['category_id']].append(gt)
+        for dt in dts:
+            self._dts[dt['image_id'], dt['category_id']].append(dt)
+        self.evalImgs = defaultdict(
+            list)  # per-image per-category evaluation results
+        self.eval = {}  # accumulated evaluation results
+
+    def evaluate(self):
+        """Run per image evaluation on given images and store results (a list
+        of dict) in self.evalImgs.
+
+        :return: None
+        """
+        tic = time.time()
+        print('Running per image evaluation...')
+        p = self.params
+        # add backward compatibility if useSegm is specified in params
+        if p.useSegm is not None:
+            p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
+            print('useSegm (deprecated) is not None. Running {} evaluation'.
+                  format(p.iouType))
+        print('Evaluate annotation type *{}*'.format(p.iouType))
+        p.imgIds = list(np.unique(p.imgIds))
+        if p.useCats:
+            p.catIds = list(np.unique(p.catIds))
+        p.maxDets = sorted(p.maxDets)
+        self.params = p
+
+        # loop through images, area range, max detection number
+        catIds = p.catIds if p.useCats else [-1]
+
+        nproc = 8
+        split_size = len(catIds) // nproc
+        mp_params = []
+        for i in range(nproc):
+            begin = i * split_size
+            end = (i + 1) * split_size
+            if i == nproc - 1:
+                end = len(catIds)
+            mp_params.append((catIds[begin:end], ))
+
+        MMLogger.get_current_instance().info(
+            'start multi processing evaluation ...')
+        with mp.Pool(nproc) as pool:
+            self.evalImgs = pool.starmap(self._evaluateImg, mp_params)
+
+        self.evalImgs = list(itertools.chain(*self.evalImgs))
+
+        self._paramsEval = copy.deepcopy(self.params)
+        toc = time.time()
+        print('DONE (t={:0.2f}s).'.format(toc - tic))
+
+    def _evaluateImg(self, catids_chunk):
+        self._prepare()
+        p = self.params
+        maxDet = max(p.maxDets)
+        all_params = []
+        for catId in catids_chunk:
+            for areaRng in p.areaRng:
+                for imgId in p.imgIds:
+                    all_params.append((catId, areaRng, imgId))
+        evalImgs = [
+            self.evaluateImg(imgId, catId, areaRng, maxDet)
+            for catId, areaRng, imgId in tqdm(all_params)
+        ]
+        return evalImgs
+
+    def evaluateImg(self, imgId, catId, aRng, maxDet):
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return None
+
+        for g in gt:
+            if g['ignore'] or (g['area'] < aRng[0] or g['area'] > aRng[1]):
+                g['_ignore'] = 1
+            else:
+                g['_ignore'] = 0
+
+        # sort dt highest score first, sort gt ignore last
+        gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
+        gt = [gt[i] for i in gtind]
+        dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
+        dt = [dt[i] for i in dtind[0:maxDet]]
+        iscrowd = [int(o['iscrowd']) for o in gt]
+        # load computed ious
+        # ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] # noqa
+        ious = self.computeIoU(imgId, catId)
+        ious = ious[:, gtind] if len(ious) > 0 else ious
+
+        T = len(p.iouThrs)
+        G = len(gt)
+        D = len(dt)
+        gtm = np.zeros((T, G))
+        dtm = np.zeros((T, D))
+        gtIg = np.array([g['_ignore'] for g in gt])
+        dtIg = np.zeros((T, D))
+        if not len(ious) == 0:
+            for tind, t in enumerate(p.iouThrs):
+                for dind, d in enumerate(dt):
+                    # information about best match so far (m=-1 -> unmatched)
+                    iou = min([t, 1 - 1e-10])
+                    m = -1
+                    for gind, g in enumerate(gt):
+                        # if this gt already matched, and not a crowd, continue
+                        if gtm[tind, gind] > 0 and not iscrowd[gind]:
+                            continue
+                        # if dt matched to reg gt, and on ignore gt, stop
+                        if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
+                            break
+                        # continue to next gt unless better match made
+                        if ious[dind, gind] < iou:
+                            continue
+                        # if match successful and best so far,
+                        # store appropriately
+                        iou = ious[dind, gind]
+                        m = gind
+                    # if match made store id of match for both dt and gt
+                    if m == -1:
+                        continue
+                    dtIg[tind, dind] = gtIg[m]
+                    dtm[tind, dind] = gt[m]['id']
+                    gtm[tind, m] = d['id']
+        # set unmatched detections outside of area range to ignore
+        a = np.array([d['area'] < aRng[0] or d['area'] > aRng[1]
+                      for d in dt]).reshape((1, len(dt)))
+        dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T,
+                                                                      0)))
+        # store results for given image and category
+
+        return {
+            'image_id': imgId,
+            'category_id': catId,
+            'aRng': aRng,
+            'maxDet': maxDet,
+            'dtIds': [d['id'] for d in dt],
+            'gtIds': [g['id'] for g in gt],
+            'dtMatches': dtm,
+            'gtMatches': gtm,
+            'dtScores': [d['score'] for d in dt],
+            'gtIgnore': gtIg,
+            'dtIgnore': dtIg,
+        }
+
+    def summarize(self):
+        """Compute and display summary metrics for evaluation results.
+
+        Note this function can *only* be applied on the default parameter
+        setting
+        """
+
+        def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100):
+            p = self.params
+            iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'  # noqa
+            titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
+            typeStr = '(AP)' if ap == 1 else '(AR)'
+            iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
+                if iouThr is None else '{:0.2f}'.format(iouThr)
+
+            aind = [
+                i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng
+            ]
+            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+            if ap == 1:
+                # dimension of precision: [TxRxKxAxM]
+                s = self.eval['precision']
+                # IoU
+                if iouThr is not None:
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                s = s[:, :, :, aind, mind]
+            else:
+                # dimension of recall: [TxKxAxM]
+                s = self.eval['recall']
+                if iouThr is not None:
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                s = s[:, :, aind, mind]
+            if len(s[s > -1]) == 0:
+                mean_s = -1
+            else:
+                mean_s = np.mean(s[s > -1])
+            print(
+                iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets,
+                            mean_s))
+            return mean_s
+
+        def _summarizeDets():
+            stats = []
+            stats.append(_summarize(1, maxDets=self.params.maxDets[-1]))
+            stats.append(
+                _summarize(1, iouThr=.5, maxDets=self.params.maxDets[-1]))
+            stats.append(
+                _summarize(1, iouThr=.75, maxDets=self.params.maxDets[-1]))
+            for area_rng in ('small', 'medium', 'large'):
+                stats.append(
+                    _summarize(
+                        1, areaRng=area_rng, maxDets=self.params.maxDets[-1]))
+            for max_det in self.params.maxDets:
+                stats.append(_summarize(0, maxDets=max_det))
+            for area_rng in ('small', 'medium', 'large'):
+                stats.append(
+                    _summarize(
+                        0, areaRng=area_rng, maxDets=self.params.maxDets[-1]))
+            stats = np.array(stats)
+            return stats
+
+        def _summarizeKps():
+            stats = np.zeros((10, ))
+            stats[0] = _summarize(1, maxDets=20)
+            stats[1] = _summarize(1, maxDets=20, iouThr=.5)
+            stats[2] = _summarize(1, maxDets=20, iouThr=.75)
+            stats[3] = _summarize(1, maxDets=20, areaRng='medium')
+            stats[4] = _summarize(1, maxDets=20, areaRng='large')
+            stats[5] = _summarize(0, maxDets=20)
+            stats[6] = _summarize(0, maxDets=20, iouThr=.5)
+            stats[7] = _summarize(0, maxDets=20, iouThr=.75)
+            stats[8] = _summarize(0, maxDets=20, areaRng='medium')
+            stats[9] = _summarize(0, maxDets=20, areaRng='large')
+            return stats
+
+        if not self.eval:
+            raise Exception('Please run accumulate() first')
+        iouType = self.params.iouType
+        if iouType == 'segm' or iouType == 'bbox':
+            summarize = _summarizeDets
+        elif iouType == 'keypoints':
+            summarize = _summarizeKps
+        self.stats = summarize()
diff --git a/mmde/mmdet/datasets/base_det_dataset.py b/mmde/mmdet/datasets/base_det_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b3876d5c06eb7d3741a29fe8b0963a7e425ec1b
--- /dev/null
+++ b/mmde/mmdet/datasets/base_det_dataset.py
@@ -0,0 +1,131 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import List, Optional
+
+from mmengine.dataset import BaseDataset
+from mmengine.fileio import load
+from mmengine.utils import is_abs
+
+from ..registry import DATASETS
+
+
+@DATASETS.register_module()
+class BaseDetDataset(BaseDataset):
+    """Base dataset for detection.
+
+    Args:
+        proposal_file (str, optional): Proposals file path. Defaults to None.
+        file_client_args (dict): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        return_classes (bool): Whether to return class information
+            for open vocabulary-based algorithms. Defaults to False.
+        caption_prompt (dict, optional): Prompt for captioning.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 *args,
+                 seg_map_suffix: str = '.png',
+                 proposal_file: Optional[str] = None,
+                 file_client_args: dict = None,
+                 backend_args: dict = None,
+                 return_classes: bool = False,
+                 caption_prompt: Optional[dict] = None,
+                 **kwargs) -> None:
+        self.seg_map_suffix = seg_map_suffix
+        self.proposal_file = proposal_file
+        self.backend_args = backend_args
+        self.return_classes = return_classes
+        self.caption_prompt = caption_prompt
+        if self.caption_prompt is not None:
+            assert self.return_classes, \
+                'return_classes must be True when using caption_prompt'
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to'
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+        super().__init__(*args, **kwargs)
+
+    def full_init(self) -> None:
+        """Load annotation file and set ``BaseDataset._fully_initialized`` to
+        True.
+
+        If ``lazy_init=False``, ``full_init`` will be called during the
+        instantiation and ``self._fully_initialized`` will be set to True. If
+        ``obj._fully_initialized=False``, the class method decorated by
+        ``force_full_init`` will call ``full_init`` automatically.
+
+        Several steps to initialize annotation:
+
+            - load_data_list: Load annotations from annotation file.
+            - load_proposals: Load proposals from proposal file, if
+              `self.proposal_file` is not None.
+            - filter data information: Filter annotations according to
+              filter_cfg.
+            - slice_data: Slice dataset according to ``self._indices``
+            - serialize_data: Serialize ``self.data_list`` if
+            ``self.serialize_data`` is True.
+        """
+        if self._fully_initialized:
+            return
+        # load data information
+        self.data_list = self.load_data_list()
+        # get proposals from file
+        if self.proposal_file is not None:
+            self.load_proposals()
+        # filter illegal data, such as data that has no annotations.
+        self.data_list = self.filter_data()
+
+        # Get subset data according to indices.
+        if self._indices is not None:
+            self.data_list = self._get_unserialized_subset(self._indices)
+
+        # serialize data_list
+        if self.serialize_data:
+            self.data_bytes, self.data_address = self._serialize_data()
+
+        self._fully_initialized = True
+
+    def load_proposals(self) -> None:
+        """Load proposals from proposals file.
+
+        The `proposals_list` should be a dict[img_path: proposals]
+        with the same length as `data_list`. And the `proposals` should be
+        a `dict` or :obj:`InstanceData` usually contains following keys.
+
+            - bboxes (np.ndarry): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+            - scores (np.ndarry): Classification scores, has a shape
+              (num_instance, ).
+        """
+        # TODO: Add Unit Test after fully support Dump-Proposal Metric
+        if not is_abs(self.proposal_file):
+            self.proposal_file = osp.join(self.data_root, self.proposal_file)
+        proposals_list = load(
+            self.proposal_file, backend_args=self.backend_args)
+        assert len(self.data_list) == len(proposals_list)
+        for data_info in self.data_list:
+            img_path = data_info['img_path']
+            # `file_name` is the key to obtain the proposals from the
+            # `proposals_list`.
+            file_name = osp.join(
+                osp.split(osp.split(img_path)[0])[-1],
+                osp.split(img_path)[-1])
+            proposals = proposals_list[file_name]
+            data_info['proposals'] = proposals
+
+    def get_cat_ids(self, idx: int) -> List[int]:
+        """Get COCO category ids by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            List[int]: All categories in the image of specified index.
+        """
+        instances = self.get_data_info(idx)['instances']
+        return [instance['bbox_label'] for instance in instances]
diff --git a/mmde/mmdet/datasets/base_semseg_dataset.py b/mmde/mmdet/datasets/base_semseg_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d10f762a21a897ab8274fbe9eefab054691a7c60
--- /dev/null
+++ b/mmde/mmdet/datasets/base_semseg_dataset.py
@@ -0,0 +1,265 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+from typing import Callable, Dict, List, Optional, Sequence, Union
+
+import mmengine
+import mmengine.fileio as fileio
+import numpy as np
+from mmengine.dataset import BaseDataset, Compose
+
+from mmdet.registry import DATASETS
+
+
+@DATASETS.register_module()
+class BaseSegDataset(BaseDataset):
+    """Custom dataset for semantic segmentation. An example of file structure
+    is as followed.
+
+    .. code-block:: none
+
+        ├── data
+        │   ├── my_dataset
+        │   │   ├── img_dir
+        │   │   │   ├── train
+        │   │   │   │   ├── xxx{img_suffix}
+        │   │   │   │   ├── yyy{img_suffix}
+        │   │   │   │   ├── zzz{img_suffix}
+        │   │   │   ├── val
+        │   │   ├── ann_dir
+        │   │   │   ├── train
+        │   │   │   │   ├── xxx{seg_map_suffix}
+        │   │   │   │   ├── yyy{seg_map_suffix}
+        │   │   │   │   ├── zzz{seg_map_suffix}
+        │   │   │   ├── val
+
+    The img/gt_semantic_seg pair of BaseSegDataset should be of the same
+    except suffix. A valid img/gt_semantic_seg filename pair should be like
+    ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (extension is also included
+    in the suffix). If split is given, then ``xxx`` is specified in txt file.
+    Otherwise, all files in ``img_dir/``and ``ann_dir`` will be loaded.
+    Please refer to ``docs/en/tutorials/new_dataset.md`` for more details.
+
+
+    Args:
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as
+            specify classes to load. Defaults to None.
+        data_root (str, optional): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to None.
+        data_prefix (dict, optional): Prefix for training data. Defaults to
+            dict(img_path=None, seg_map_path=None).
+        img_suffix (str): Suffix of images. Default: '.jpg'
+        seg_map_suffix (str): Suffix of segmentation maps. Default: '.png'
+        filter_cfg (dict, optional): Config for filter data. Defaults to None.
+        indices (int or Sequence[int], optional): Support using first few
+            data in annotation file to facilitate training/testing on a smaller
+            dataset. Defaults to None which means using all ``data_infos``.
+        serialize_data (bool, optional): Whether to hold memory using
+            serialized objects, when enabled, data loader workers can use
+            shared RAM from master process instead of making a copy. Defaults
+            to True.
+        pipeline (list, optional): Processing pipeline. Defaults to [].
+        test_mode (bool, optional): ``test_mode=True`` means in test phase.
+            Defaults to False.
+        lazy_init (bool, optional): Whether to load annotation during
+            instantiation. In some cases, such as visualization, only the meta
+            information of the dataset is needed, which is not necessary to
+            load annotation file. ``Basedataset`` can skip load annotations to
+            save time by set ``lazy_init=True``. Defaults to False.
+        use_label_map (bool, optional): Whether to use label map.
+            Defaults to False.
+        max_refetch (int, optional): If ``Basedataset.prepare_data`` get a
+            None img. The maximum extra number of cycles to get a valid
+            image. Defaults to 1000.
+        backend_args (dict, Optional): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.htm
+            for details. Defaults to None.
+            Notes: mmcv>=2.0.0rc4 required.
+    """
+    METAINFO: dict = dict()
+
+    def __init__(self,
+                 ann_file: str = '',
+                 img_suffix='.jpg',
+                 seg_map_suffix='.png',
+                 metainfo: Optional[dict] = None,
+                 data_root: Optional[str] = None,
+                 data_prefix: dict = dict(img_path='', seg_map_path=''),
+                 filter_cfg: Optional[dict] = None,
+                 indices: Optional[Union[int, Sequence[int]]] = None,
+                 serialize_data: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 test_mode: bool = False,
+                 lazy_init: bool = False,
+                 use_label_map: bool = False,
+                 max_refetch: int = 1000,
+                 backend_args: Optional[dict] = None) -> None:
+
+        self.img_suffix = img_suffix
+        self.seg_map_suffix = seg_map_suffix
+        self.backend_args = backend_args.copy() if backend_args else None
+
+        self.data_root = data_root
+        self.data_prefix = copy.copy(data_prefix)
+        self.ann_file = ann_file
+        self.filter_cfg = copy.deepcopy(filter_cfg)
+        self._indices = indices
+        self.serialize_data = serialize_data
+        self.test_mode = test_mode
+        self.max_refetch = max_refetch
+        self.data_list: List[dict] = []
+        self.data_bytes: np.ndarray
+
+        # Set meta information.
+        self._metainfo = self._load_metainfo(copy.deepcopy(metainfo))
+
+        # Get label map for custom classes
+        new_classes = self._metainfo.get('classes', None)
+        self.label_map = self.get_label_map(
+            new_classes) if use_label_map else None
+        self._metainfo.update(dict(label_map=self.label_map))
+
+        # Update palette based on label map or generate palette
+        # if it is not defined
+        updated_palette = self._update_palette()
+        self._metainfo.update(dict(palette=updated_palette))
+
+        # Join paths.
+        if self.data_root is not None:
+            self._join_prefix()
+
+        # Build pipeline.
+        self.pipeline = Compose(pipeline)
+        # Full initialize the dataset.
+        if not lazy_init:
+            self.full_init()
+
+        if test_mode:
+            assert self._metainfo.get('classes') is not None, \
+                'dataset metainfo `classes` should be specified when testing'
+
+    @classmethod
+    def get_label_map(cls,
+                      new_classes: Optional[Sequence] = None
+                      ) -> Union[Dict, None]:
+        """Require label mapping.
+
+        The ``label_map`` is a dictionary, its keys are the old label ids and
+        its values are the new label ids, and is used for changing pixel
+        labels in load_annotations. If and only if old classes in cls.METAINFO
+        is not equal to new classes in self._metainfo and nether of them is not
+        None, `label_map` is not None.
+
+        Args:
+            new_classes (list, tuple, optional): The new classes name from
+                metainfo. Default to None.
+
+
+        Returns:
+            dict, optional: The mapping from old classes in cls.METAINFO to
+                new classes in self._metainfo
+        """
+        old_classes = cls.METAINFO.get('classes', None)
+        if (new_classes is not None and old_classes is not None
+                and list(new_classes) != list(old_classes)):
+
+            label_map = {}
+            if not set(new_classes).issubset(cls.METAINFO['classes']):
+                raise ValueError(
+                    f'new classes {new_classes} is not a '
+                    f'subset of classes {old_classes} in METAINFO.')
+            for i, c in enumerate(old_classes):
+                if c not in new_classes:
+                    # 0 is background
+                    label_map[i] = 0
+                else:
+                    label_map[i] = new_classes.index(c)
+            return label_map
+        else:
+            return None
+
+    def _update_palette(self) -> list:
+        """Update palette after loading metainfo.
+
+        If length of palette is equal to classes, just return the palette.
+        If palette is not defined, it will randomly generate a palette.
+        If classes is updated by customer, it will return the subset of
+        palette.
+
+        Returns:
+            Sequence: Palette for current dataset.
+        """
+        palette = self._metainfo.get('palette', [])
+        classes = self._metainfo.get('classes', [])
+        # palette does match classes
+        if len(palette) == len(classes):
+            return palette
+
+        if len(palette) == 0:
+            # Get random state before set seed, and restore
+            # random state later.
+            # It will prevent loss of randomness, as the palette
+            # may be different in each iteration if not specified.
+            # See: https://github.com/open-mmlab/mmdetection/issues/5844
+            state = np.random.get_state()
+            np.random.seed(42)
+            # random palette
+            new_palette = np.random.randint(
+                0, 255, size=(len(classes), 3)).tolist()
+            np.random.set_state(state)
+        elif len(palette) >= len(classes) and self.label_map is not None:
+            new_palette = []
+            # return subset of palette
+            for old_id, new_id in sorted(
+                    self.label_map.items(), key=lambda x: x[1]):
+                # 0 is background
+                if new_id != 0:
+                    new_palette.append(palette[old_id])
+            new_palette = type(palette)(new_palette)
+        elif len(palette) >= len(classes):
+            # Allow palette length is greater than classes.
+            return palette
+        else:
+            raise ValueError('palette does not match classes '
+                             f'as metainfo is {self._metainfo}.')
+        return new_palette
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotation from directory or annotation file.
+
+        Returns:
+            list[dict]: All data info of dataset.
+        """
+        data_list = []
+        img_dir = self.data_prefix.get('img_path', None)
+        ann_dir = self.data_prefix.get('seg_map_path', None)
+        if not osp.isdir(self.ann_file) and self.ann_file:
+            assert osp.isfile(self.ann_file), \
+                f'Failed to load `ann_file` {self.ann_file}'
+            lines = mmengine.list_from_file(
+                self.ann_file, backend_args=self.backend_args)
+            for line in lines:
+                img_name = line.strip()
+                data_info = dict(
+                    img_path=osp.join(img_dir, img_name + self.img_suffix))
+                if ann_dir is not None:
+                    seg_map = img_name + self.seg_map_suffix
+                    data_info['seg_map_path'] = osp.join(ann_dir, seg_map)
+                data_info['label_map'] = self.label_map
+                data_list.append(data_info)
+        else:
+            for img in fileio.list_dir_or_file(
+                    dir_path=img_dir,
+                    list_dir=False,
+                    suffix=self.img_suffix,
+                    recursive=True,
+                    backend_args=self.backend_args):
+                data_info = dict(img_path=osp.join(img_dir, img))
+                if ann_dir is not None:
+                    seg_map = img.replace(self.img_suffix, self.seg_map_suffix)
+                    data_info['seg_map_path'] = osp.join(ann_dir, seg_map)
+                data_info['label_map'] = self.label_map
+                data_list.append(data_info)
+            data_list = sorted(data_list, key=lambda x: x['img_path'])
+        return data_list
diff --git a/mmde/mmdet/datasets/base_video_dataset.py b/mmde/mmdet/datasets/base_video_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a4a7a25f16206f06c7b64a7ce4c3588efd5455e
--- /dev/null
+++ b/mmde/mmdet/datasets/base_video_dataset.py
@@ -0,0 +1,304 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+from collections import defaultdict
+from typing import Any, List, Tuple
+
+import mmengine.fileio as fileio
+from mmengine.dataset import BaseDataset
+from mmengine.logging import print_log
+
+from mmdet.datasets.api_wrappers import COCO
+from mmdet.registry import DATASETS
+
+
+@DATASETS.register_module()
+class BaseVideoDataset(BaseDataset):
+    """Base video dataset for VID, MOT and VIS tasks."""
+
+    META = dict(classes=None)
+    # ann_id is unique in coco dataset.
+    ANN_ID_UNIQUE = True
+
+    def __init__(self, *args, backend_args: dict = None, **kwargs):
+        self.backend_args = backend_args
+        super().__init__(*args, **kwargs)
+
+    def load_data_list(self) -> Tuple[List[dict], List]:
+        """Load annotations from an annotation file named as ``self.ann_file``.
+
+        Returns:
+            tuple(list[dict], list): A list of annotation and a list of
+            valid data indices.
+        """
+        with fileio.get_local_path(self.ann_file) as local_path:
+            self.coco = COCO(local_path)
+        # The order of returned `cat_ids` will not
+        # change with the order of the classes
+        self.cat_ids = self.coco.get_cat_ids(
+            cat_names=self.metainfo['classes'])
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.cat_img_map = copy.deepcopy(self.coco.cat_img_map)
+        # used in `filter_data`
+        self.img_ids_with_ann = set()
+
+        img_ids = self.coco.get_img_ids()
+        total_ann_ids = []
+        # if ``video_id`` is not in the annotation file, we will assign a big
+        # unique video_id for this video.
+        single_video_id = 100000
+        videos = {}
+        for img_id in img_ids:
+            raw_img_info = self.coco.load_imgs([img_id])[0]
+            raw_img_info['img_id'] = img_id
+            if 'video_id' not in raw_img_info:
+                single_video_id = single_video_id + 1
+                video_id = single_video_id
+            else:
+                video_id = raw_img_info['video_id']
+
+            if video_id not in videos:
+                videos[video_id] = {
+                    'video_id': video_id,
+                    'images': [],
+                    'video_length': 0
+                }
+
+            videos[video_id]['video_length'] += 1
+            ann_ids = self.coco.get_ann_ids(
+                img_ids=[img_id], cat_ids=self.cat_ids)
+            raw_ann_info = self.coco.load_anns(ann_ids)
+            total_ann_ids.extend(ann_ids)
+
+            parsed_data_info = self.parse_data_info(
+                dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info))
+
+            if len(parsed_data_info['instances']) > 0:
+                self.img_ids_with_ann.add(parsed_data_info['img_id'])
+
+            videos[video_id]['images'].append(parsed_data_info)
+
+        data_list = [v for v in videos.values()]
+
+        if self.ANN_ID_UNIQUE:
+            assert len(set(total_ann_ids)) == len(
+                total_ann_ids
+            ), f"Annotation ids in '{self.ann_file}' are not unique!"
+
+        del self.coco
+
+        return data_list
+
+    def parse_data_info(self, raw_data_info: dict) -> dict:
+        """Parse raw annotation to target format.
+
+        Args:
+            raw_data_info (dict): Raw data information loaded from
+                ``ann_file``.
+
+        Returns:
+            dict: Parsed annotation.
+        """
+        img_info = raw_data_info['raw_img_info']
+        ann_info = raw_data_info['raw_ann_info']
+        data_info = {}
+
+        data_info.update(img_info)
+        if self.data_prefix.get('img_path', None) is not None:
+            img_path = osp.join(self.data_prefix['img_path'],
+                                img_info['file_name'])
+        else:
+            img_path = img_info['file_name']
+        data_info['img_path'] = img_path
+
+        instances = []
+        for i, ann in enumerate(ann_info):
+            instance = {}
+
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                continue
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            if ann['category_id'] not in self.cat_ids:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+
+            if ann.get('iscrowd', False):
+                instance['ignore_flag'] = 1
+            else:
+                instance['ignore_flag'] = 0
+            instance['bbox'] = bbox
+            instance['bbox_label'] = self.cat2label[ann['category_id']]
+            if ann.get('segmentation', None):
+                instance['mask'] = ann['segmentation']
+            if ann.get('instance_id', None):
+                instance['instance_id'] = ann['instance_id']
+            else:
+                # image dataset usually has no `instance_id`.
+                # Therefore, we set it to `i`.
+                instance['instance_id'] = i
+            instances.append(instance)
+        data_info['instances'] = instances
+        return data_info
+
+    def filter_data(self) -> List[int]:
+        """Filter image annotations according to filter_cfg.
+
+        Returns:
+            list[int]: Filtered results.
+        """
+        if self.test_mode:
+            return self.data_list
+
+        num_imgs_before_filter = sum(
+            [len(info['images']) for info in self.data_list])
+        num_imgs_after_filter = 0
+
+        # obtain images that contain annotations of the required categories
+        ids_in_cat = set()
+        for i, class_id in enumerate(self.cat_ids):
+            ids_in_cat |= set(self.cat_img_map[class_id])
+        # merge the image id sets of the two conditions and use the merged set
+        # to filter out images if self.filter_empty_gt=True
+        ids_in_cat &= self.img_ids_with_ann
+
+        new_data_list = []
+        for video_data_info in self.data_list:
+            imgs_data_info = video_data_info['images']
+            valid_imgs_data_info = []
+
+            for data_info in imgs_data_info:
+                img_id = data_info['img_id']
+                width = data_info['width']
+                height = data_info['height']
+                # TODO: simplify these conditions
+                if self.filter_cfg is None:
+                    if img_id not in ids_in_cat:
+                        video_data_info['video_length'] -= 1
+                        continue
+                    if min(width, height) >= 32:
+                        valid_imgs_data_info.append(data_info)
+                        num_imgs_after_filter += 1
+                    else:
+                        video_data_info['video_length'] -= 1
+                else:
+                    if self.filter_cfg.get('filter_empty_gt',
+                                           True) and img_id not in ids_in_cat:
+                        video_data_info['video_length'] -= 1
+                        continue
+                    if min(width, height) >= self.filter_cfg.get(
+                            'min_size', 32):
+                        valid_imgs_data_info.append(data_info)
+                        num_imgs_after_filter += 1
+                    else:
+                        video_data_info['video_length'] -= 1
+                video_data_info['images'] = valid_imgs_data_info
+            new_data_list.append(video_data_info)
+
+        print_log(
+            'The number of samples before and after filtering: '
+            f'{num_imgs_before_filter} / {num_imgs_after_filter}', 'current')
+        return new_data_list
+
+    def prepare_data(self, idx) -> Any:
+        """Get date processed by ``self.pipeline``. Note that ``idx`` is a
+        video index in default since the base element of video dataset is a
+        video. However, in some cases, we need to specific both the video index
+        and frame index. For example, in traing mode, we may want to sample the
+        specific frames and all the frames must be sampled once in a epoch; in
+        test mode, we may want to output data of a single image rather than the
+        whole video for saving memory.
+
+        Args:
+            idx (int): The index of ``data_info``.
+
+        Returns:
+            Any: Depends on ``self.pipeline``.
+        """
+        if isinstance(idx, tuple):
+            assert len(idx) == 2, 'The length of idx must be 2: '
+            '(video_index, frame_index)'
+            video_idx, frame_idx = idx[0], idx[1]
+        else:
+            video_idx, frame_idx = idx, None
+
+        data_info = self.get_data_info(video_idx)
+        if self.test_mode:
+            # Support two test_mode: frame-level and video-level
+            final_data_info = defaultdict(list)
+            if frame_idx is None:
+                frames_idx_list = list(range(data_info['video_length']))
+            else:
+                frames_idx_list = [frame_idx]
+            for index in frames_idx_list:
+                frame_ann = data_info['images'][index]
+                frame_ann['video_id'] = data_info['video_id']
+                # Collate data_list (list of dict to dict of list)
+                for key, value in frame_ann.items():
+                    final_data_info[key].append(value)
+                # copy the info in video-level into img-level
+                # TODO: the value of this key is the same as that of
+                # `video_length` in test mode
+                final_data_info['ori_video_length'].append(
+                    data_info['video_length'])
+
+            final_data_info['video_length'] = [len(frames_idx_list)
+                                               ] * len(frames_idx_list)
+            return self.pipeline(final_data_info)
+        else:
+            # Specify `key_frame_id` for the frame sampling in the pipeline
+            if frame_idx is not None:
+                data_info['key_frame_id'] = frame_idx
+            return self.pipeline(data_info)
+
+    def get_cat_ids(self, index) -> List[int]:
+        """Following image detection, we provide this interface function. Get
+        category ids by video index and frame index.
+
+        Args:
+            index: The index of the dataset. It support two kinds of inputs:
+                Tuple:
+                    video_idx (int): Index of video.
+                    frame_idx (int): Index of frame.
+                Int: Index of video.
+
+        Returns:
+            List[int]: All categories in the image of specified video index
+            and frame index.
+        """
+        if isinstance(index, tuple):
+            assert len(
+                index
+            ) == 2, f'Expect the length of index is 2, but got {len(index)}'
+            video_idx, frame_idx = index
+            instances = self.get_data_info(
+                video_idx)['images'][frame_idx]['instances']
+            return [instance['bbox_label'] for instance in instances]
+        else:
+            cat_ids = []
+            for img in self.get_data_info(index)['images']:
+                for instance in img['instances']:
+                    cat_ids.append(instance['bbox_label'])
+            return cat_ids
+
+    @property
+    def num_all_imgs(self):
+        """Get the number of all the images in this video dataset."""
+        return sum(
+            [len(self.get_data_info(i)['images']) for i in range(len(self))])
+
+    def get_len_per_video(self, idx):
+        """Get length of one video.
+
+        Args:
+            idx (int): Index of video.
+
+        Returns:
+            int (int): The length of the video.
+        """
+        return len(self.get_data_info(idx)['images'])
diff --git a/mmde/mmdet/datasets/cityscapes.py b/mmde/mmdet/datasets/cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..09755eb1e8b0f0c278085bd2fafbb7247a3fc946
--- /dev/null
+++ b/mmde/mmdet/datasets/cityscapes.py
@@ -0,0 +1,61 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/datasets/cityscapes.py # noqa
+# and https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
+
+from typing import List
+
+from mmdet.registry import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class CityscapesDataset(CocoDataset):
+    """Dataset for Cityscapes."""
+
+    METAINFO = {
+        'classes': ('person', 'rider', 'car', 'truck', 'bus', 'train',
+                    'motorcycle', 'bicycle'),
+        'palette': [(220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70),
+                    (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)]
+    }
+
+    def filter_data(self) -> List[dict]:
+        """Filter annotations according to filter_cfg.
+
+        Returns:
+            List[dict]: Filtered results.
+        """
+        if self.test_mode:
+            return self.data_list
+
+        if self.filter_cfg is None:
+            return self.data_list
+
+        filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False)
+        min_size = self.filter_cfg.get('min_size', 0)
+
+        # obtain images that contain annotation
+        ids_with_ann = set(data_info['img_id'] for data_info in self.data_list)
+        # obtain images that contain annotations of the required categories
+        ids_in_cat = set()
+        for i, class_id in enumerate(self.cat_ids):
+            ids_in_cat |= set(self.cat_img_map[class_id])
+        # merge the image id sets of the two conditions and use the merged set
+        # to filter out images if self.filter_empty_gt=True
+        ids_in_cat &= ids_with_ann
+
+        valid_data_infos = []
+        for i, data_info in enumerate(self.data_list):
+            img_id = data_info['img_id']
+            width = data_info['width']
+            height = data_info['height']
+            all_is_crowd = all([
+                instance['ignore_flag'] == 1
+                for instance in data_info['instances']
+            ])
+            if filter_empty_gt and (img_id not in ids_in_cat or all_is_crowd):
+                continue
+            if min(width, height) >= min_size:
+                valid_data_infos.append(data_info)
+
+        return valid_data_infos
diff --git a/mmde/mmdet/datasets/coco.py b/mmde/mmdet/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cf21c4e667e3b565ea01d1eb95bcdbf171b90d0
--- /dev/null
+++ b/mmde/mmdet/datasets/coco.py
@@ -0,0 +1,201 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+from typing import List, Union
+
+from mmengine.fileio import get_local_path
+
+from mmdet.registry import DATASETS
+from .api_wrappers import COCO
+from .base_det_dataset import BaseDetDataset
+
+
+@DATASETS.register_module()
+class CocoDataset(BaseDetDataset):
+    """Dataset for COCO."""
+
+    METAINFO = {
+        'classes':
+        ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+         'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+         'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+         'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+         'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+         'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
+         'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
+         'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+         'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+         'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+         'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+         'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+         'scissors', 'teddy bear', 'hair drier', 'toothbrush'),
+        # palette is a list of color tuples, which is used for visualization.
+        'palette':
+        [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228),
+         (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30),
+         (100, 170, 30), (220, 220, 0), (175, 116, 175), (250, 0, 30),
+         (165, 42, 42), (255, 77, 255), (0, 226, 252), (182, 182, 255),
+         (0, 82, 0), (120, 166, 157), (110, 76, 0), (174, 57, 255),
+         (199, 100, 0), (72, 0, 118), (255, 179, 240), (0, 125, 92),
+         (209, 0, 151), (188, 208, 182), (0, 220, 176), (255, 99, 164),
+         (92, 0, 73), (133, 129, 255), (78, 180, 255), (0, 228, 0),
+         (174, 255, 243), (45, 89, 255), (134, 134, 103), (145, 148, 174),
+         (255, 208, 186), (197, 226, 255), (171, 134, 1), (109, 63, 54),
+         (207, 138, 255), (151, 0, 95), (9, 80, 61), (84, 105, 51),
+         (74, 65, 105), (166, 196, 102), (208, 195, 210), (255, 109, 65),
+         (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0),
+         (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161),
+         (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120),
+         (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133),
+         (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62),
+         (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45),
+         (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1),
+         (246, 0, 122), (191, 162, 208)]
+    }
+    COCOAPI = COCO
+    # ann_id is unique in coco dataset.
+    ANN_ID_UNIQUE = True
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of annotation.
+        """  # noqa: E501
+        with get_local_path(
+                self.ann_file, backend_args=self.backend_args) as local_path:
+            self.coco = self.COCOAPI(local_path)
+        # The order of returned `cat_ids` will not
+        # change with the order of the `classes`
+        self.cat_ids = self.coco.get_cat_ids(
+            cat_names=self.metainfo['classes'])
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.cat_img_map = copy.deepcopy(self.coco.cat_img_map)
+
+        img_ids = self.coco.get_img_ids()
+        data_list = []
+        total_ann_ids = []
+        for img_id in img_ids:
+            raw_img_info = self.coco.load_imgs([img_id])[0]
+            raw_img_info['img_id'] = img_id
+
+            ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
+            raw_ann_info = self.coco.load_anns(ann_ids)
+            total_ann_ids.extend(ann_ids)
+
+            parsed_data_info = self.parse_data_info({
+                'raw_ann_info':
+                raw_ann_info,
+                'raw_img_info':
+                raw_img_info
+            })
+            data_list.append(parsed_data_info)
+        if self.ANN_ID_UNIQUE:
+            assert len(set(total_ann_ids)) == len(
+                total_ann_ids
+            ), f"Annotation ids in '{self.ann_file}' are not unique!"
+
+        del self.coco
+
+        return data_list
+
+    def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]:
+        """Parse raw annotation to target format.
+
+        Args:
+            raw_data_info (dict): Raw data information load from ``ann_file``
+
+        Returns:
+            Union[dict, List[dict]]: Parsed annotation.
+        """
+        img_info = raw_data_info['raw_img_info']
+        ann_info = raw_data_info['raw_ann_info']
+
+        data_info = {}
+
+        # TODO: need to change data_prefix['img'] to data_prefix['img_path']
+        img_path = osp.join(self.data_prefix['img'], img_info['file_name'])
+        if self.data_prefix.get('seg', None):
+            seg_map_path = osp.join(
+                self.data_prefix['seg'],
+                img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix)
+        else:
+            seg_map_path = None
+        data_info['img_path'] = img_path
+        data_info['img_id'] = img_info['img_id']
+        data_info['seg_map_path'] = seg_map_path
+        data_info['height'] = img_info['height']
+        data_info['width'] = img_info['width']
+
+        if self.return_classes:
+            data_info['text'] = self.metainfo['classes']
+            data_info['caption_prompt'] = self.caption_prompt
+            data_info['custom_entities'] = True
+
+        instances = []
+        for i, ann in enumerate(ann_info):
+            instance = {}
+
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                continue
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            if ann['category_id'] not in self.cat_ids:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+
+            if ann.get('iscrowd', False):
+                instance['ignore_flag'] = 1
+            else:
+                instance['ignore_flag'] = 0
+            instance['bbox'] = bbox
+            instance['bbox_label'] = self.cat2label[ann['category_id']]
+
+            if ann.get('segmentation', None):
+                instance['mask'] = ann['segmentation']
+
+            instances.append(instance)
+        data_info['instances'] = instances
+        return data_info
+
+    def filter_data(self) -> List[dict]:
+        """Filter annotations according to filter_cfg.
+
+        Returns:
+            List[dict]: Filtered results.
+        """
+        if self.test_mode:
+            return self.data_list
+
+        if self.filter_cfg is None:
+            return self.data_list
+
+        filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False)
+        min_size = self.filter_cfg.get('min_size', 0)
+
+        # obtain images that contain annotation
+        ids_with_ann = set(data_info['img_id'] for data_info in self.data_list)
+        # obtain images that contain annotations of the required categories
+        ids_in_cat = set()
+        for i, class_id in enumerate(self.cat_ids):
+            ids_in_cat |= set(self.cat_img_map[class_id])
+        # merge the image id sets of the two conditions and use the merged set
+        # to filter out images if self.filter_empty_gt=True
+        ids_in_cat &= ids_with_ann
+
+        valid_data_infos = []
+        for i, data_info in enumerate(self.data_list):
+            img_id = data_info['img_id']
+            width = data_info['width']
+            height = data_info['height']
+            if filter_empty_gt and img_id not in ids_in_cat:
+                continue
+            if min(width, height) >= min_size:
+                valid_data_infos.append(data_info)
+
+        return valid_data_infos
diff --git a/mmde/mmdet/datasets/coco_caption.py b/mmde/mmdet/datasets/coco_caption.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee695fe9a768f2be5345c6ad6bafc74177f252c0
--- /dev/null
+++ b/mmde/mmdet/datasets/coco_caption.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from pathlib import Path
+from typing import List
+
+import mmengine
+from mmengine.dataset import BaseDataset
+from mmengine.fileio import get_file_backend
+
+from mmdet.registry import DATASETS
+
+
+@DATASETS.register_module()
+class CocoCaptionDataset(BaseDataset):
+    """COCO2014 Caption dataset."""
+
+    def load_data_list(self) -> List[dict]:
+        """Load data list."""
+        img_prefix = self.data_prefix['img_path']
+        annotations = mmengine.load(self.ann_file)
+        file_backend = get_file_backend(img_prefix)
+
+        data_list = []
+        for ann in annotations:
+            data_info = {
+                'img_id': Path(ann['image']).stem.split('_')[-1],
+                'img_path': file_backend.join_path(img_prefix, ann['image']),
+                'gt_caption': ann['caption'],
+            }
+
+            data_list.append(data_info)
+
+        return data_list
diff --git a/mmde/mmdet/datasets/coco_panoptic.py b/mmde/mmdet/datasets/coco_panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7a200e01d323e998afa782797e1cc92f75c70cf
--- /dev/null
+++ b/mmde/mmdet/datasets/coco_panoptic.py
@@ -0,0 +1,292 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Callable, List, Optional, Sequence, Union
+
+from mmdet.registry import DATASETS
+from .api_wrappers import COCOPanoptic
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class CocoPanopticDataset(CocoDataset):
+    """Coco dataset for Panoptic segmentation.
+
+    The annotation format is shown as follows. The `ann` field is optional
+    for testing.
+
+    .. code-block:: none
+
+        [
+            {
+                'filename': f'{image_id:012}.png',
+                'image_id':9
+                'segments_info':
+                [
+                    {
+                        'id': 8345037, (segment_id in panoptic png,
+                                        convert from rgb)
+                        'category_id': 51,
+                        'iscrowd': 0,
+                        'bbox': (x1, y1, w, h),
+                        'area': 24315
+                    },
+                    ...
+                ]
+            },
+            ...
+        ]
+
+    Args:
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_root (str, optional): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to None.
+        data_prefix (dict, optional): Prefix for training data. Defaults to
+            ``dict(img=None, ann=None, seg=None)``. The prefix ``seg`` which is
+            for panoptic segmentation map must be not None.
+        filter_cfg (dict, optional): Config for filter data. Defaults to None.
+        indices (int or Sequence[int], optional): Support using first few
+            data in annotation file to facilitate training/testing on a smaller
+            dataset. Defaults to None which means using all ``data_infos``.
+        serialize_data (bool, optional): Whether to hold memory using
+            serialized objects, when enabled, data loader workers can use
+            shared RAM from master process instead of making a copy. Defaults
+            to True.
+        pipeline (list, optional): Processing pipeline. Defaults to [].
+        test_mode (bool, optional): ``test_mode=True`` means in test phase.
+            Defaults to False.
+        lazy_init (bool, optional): Whether to load annotation during
+            instantiation. In some cases, such as visualization, only the meta
+            information of the dataset is needed, which is not necessary to
+            load annotation file. ``Basedataset`` can skip load annotations to
+            save time by set ``lazy_init=False``. Defaults to False.
+        max_refetch (int, optional): If ``Basedataset.prepare_data`` get a
+            None img. The maximum extra number of cycles to get a valid
+            image. Defaults to 1000.
+    """
+
+    METAINFO = {
+        'classes':
+        ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+         'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+         'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+         'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+         'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+         'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
+         'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
+         'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+         'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+         'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+         'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+         'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+         'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner',
+         'blanket', 'bridge', 'cardboard', 'counter', 'curtain', 'door-stuff',
+         'floor-wood', 'flower', 'fruit', 'gravel', 'house', 'light',
+         'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield',
+         'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow',
+         'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile',
+         'wall-wood', 'water-other', 'window-blind', 'window-other',
+         'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+         'cabinet-merged', 'table-merged', 'floor-other-merged',
+         'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged',
+         'paper-merged', 'food-other-merged', 'building-other-merged',
+         'rock-merged', 'wall-other-merged', 'rug-merged'),
+        'thing_classes':
+        ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+         'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+         'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+         'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+         'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+         'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
+         'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
+         'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+         'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+         'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+         'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+         'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+         'scissors', 'teddy bear', 'hair drier', 'toothbrush'),
+        'stuff_classes':
+        ('banner', 'blanket', 'bridge', 'cardboard', 'counter', 'curtain',
+         'door-stuff', 'floor-wood', 'flower', 'fruit', 'gravel', 'house',
+         'light', 'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield',
+         'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow',
+         'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile',
+         'wall-wood', 'water-other', 'window-blind', 'window-other',
+         'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+         'cabinet-merged', 'table-merged', 'floor-other-merged',
+         'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged',
+         'paper-merged', 'food-other-merged', 'building-other-merged',
+         'rock-merged', 'wall-other-merged', 'rug-merged'),
+        'palette':
+        [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228),
+         (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30),
+         (100, 170, 30), (220, 220, 0), (175, 116, 175), (250, 0, 30),
+         (165, 42, 42), (255, 77, 255), (0, 226, 252), (182, 182, 255),
+         (0, 82, 0), (120, 166, 157), (110, 76, 0), (174, 57, 255),
+         (199, 100, 0), (72, 0, 118), (255, 179, 240), (0, 125, 92),
+         (209, 0, 151), (188, 208, 182), (0, 220, 176), (255, 99, 164),
+         (92, 0, 73), (133, 129, 255), (78, 180, 255), (0, 228, 0),
+         (174, 255, 243), (45, 89, 255), (134, 134, 103), (145, 148, 174),
+         (255, 208, 186), (197, 226, 255), (171, 134, 1), (109, 63, 54),
+         (207, 138, 255), (151, 0, 95), (9, 80, 61), (84, 105, 51),
+         (74, 65, 105), (166, 196, 102), (208, 195, 210), (255, 109, 65),
+         (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0),
+         (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161),
+         (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120),
+         (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133),
+         (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62),
+         (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45),
+         (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1),
+         (246, 0, 122), (191, 162, 208), (255, 255, 128), (147, 211, 203),
+         (150, 100, 100), (168, 171, 172), (146, 112, 198), (210, 170, 100),
+         (92, 136, 89), (218, 88, 184), (241, 129, 0), (217, 17, 255),
+         (124, 74, 181), (70, 70, 70), (255, 228, 255), (154, 208, 0),
+         (193, 0, 92), (76, 91, 113), (255, 180, 195), (106, 154, 176),
+         (230, 150, 140), (60, 143, 255), (128, 64, 128), (92, 82, 55),
+         (254, 212, 124), (73, 77, 174), (255, 160, 98), (255, 255, 255),
+         (104, 84, 109), (169, 164, 131), (225, 199, 255), (137, 54, 74),
+         (135, 158, 223), (7, 246, 231), (107, 255, 200), (58, 41, 149),
+         (183, 121, 142), (255, 73, 97), (107, 142, 35), (190, 153, 153),
+         (146, 139, 141), (70, 130, 180), (134, 199, 156), (209, 226, 140),
+         (96, 36, 108), (96, 96, 96), (64, 170, 64), (152, 251, 152),
+         (208, 229, 228), (206, 186, 171), (152, 161, 64), (116, 112, 0),
+         (0, 114, 143), (102, 102, 156), (250, 141, 255)]
+    }
+    COCOAPI = COCOPanoptic
+    # ann_id is not unique in coco panoptic dataset.
+    ANN_ID_UNIQUE = False
+
+    def __init__(self,
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
+                 data_root: Optional[str] = None,
+                 data_prefix: dict = dict(img=None, ann=None, seg=None),
+                 filter_cfg: Optional[dict] = None,
+                 indices: Optional[Union[int, Sequence[int]]] = None,
+                 serialize_data: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 test_mode: bool = False,
+                 lazy_init: bool = False,
+                 max_refetch: int = 1000,
+                 backend_args: dict = None,
+                 **kwargs) -> None:
+        super().__init__(
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_root=data_root,
+            data_prefix=data_prefix,
+            filter_cfg=filter_cfg,
+            indices=indices,
+            serialize_data=serialize_data,
+            pipeline=pipeline,
+            test_mode=test_mode,
+            lazy_init=lazy_init,
+            max_refetch=max_refetch,
+            backend_args=backend_args,
+            **kwargs)
+
+    def parse_data_info(self, raw_data_info: dict) -> dict:
+        """Parse raw annotation to target format.
+
+        Args:
+            raw_data_info (dict): Raw data information load from ``ann_file``.
+
+        Returns:
+            dict: Parsed annotation.
+        """
+        img_info = raw_data_info['raw_img_info']
+        ann_info = raw_data_info['raw_ann_info']
+        # filter out unmatched annotations which have
+        # same segment_id but belong to other image
+        ann_info = [
+            ann for ann in ann_info if ann['image_id'] == img_info['img_id']
+        ]
+        data_info = {}
+
+        img_path = osp.join(self.data_prefix['img'], img_info['file_name'])
+        if self.data_prefix.get('seg', None):
+            seg_map_path = osp.join(
+                self.data_prefix['seg'],
+                img_info['file_name'].replace('.jpg', '.png'))
+        else:
+            seg_map_path = None
+        data_info['img_path'] = img_path
+        data_info['img_id'] = img_info['img_id']
+        data_info['seg_map_path'] = seg_map_path
+        data_info['height'] = img_info['height']
+        data_info['width'] = img_info['width']
+
+        if self.return_classes:
+            data_info['text'] = self.metainfo['thing_classes']
+            data_info['stuff_text'] = self.metainfo['stuff_classes']
+            data_info['custom_entities'] = True  # no important
+
+        instances = []
+        segments_info = []
+        for ann in ann_info:
+            instance = {}
+            x1, y1, w, h = ann['bbox']
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+            category_id = ann['category_id']
+            contiguous_cat_id = self.cat2label[category_id]
+
+            is_thing = self.coco.load_cats(ids=category_id)[0]['isthing']
+            if is_thing:
+                is_crowd = ann.get('iscrowd', False)
+                instance['bbox'] = bbox
+                instance['bbox_label'] = contiguous_cat_id
+                if not is_crowd:
+                    instance['ignore_flag'] = 0
+                else:
+                    instance['ignore_flag'] = 1
+                    is_thing = False
+
+            segment_info = {
+                'id': ann['id'],
+                'category': contiguous_cat_id,
+                'is_thing': is_thing
+            }
+            segments_info.append(segment_info)
+            if len(instance) > 0 and is_thing:
+                instances.append(instance)
+        data_info['instances'] = instances
+        data_info['segments_info'] = segments_info
+        return data_info
+
+    def filter_data(self) -> List[dict]:
+        """Filter images too small or without ground truth.
+
+        Returns:
+            List[dict]: ``self.data_list`` after filtering.
+        """
+        if self.test_mode:
+            return self.data_list
+
+        if self.filter_cfg is None:
+            return self.data_list
+
+        filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False)
+        min_size = self.filter_cfg.get('min_size', 0)
+
+        ids_with_ann = set()
+        # check whether images have legal thing annotations.
+        for data_info in self.data_list:
+            for segment_info in data_info['segments_info']:
+                if not segment_info['is_thing']:
+                    continue
+                ids_with_ann.add(data_info['img_id'])
+
+        valid_data_list = []
+        for data_info in self.data_list:
+            img_id = data_info['img_id']
+            width = data_info['width']
+            height = data_info['height']
+            if filter_empty_gt and img_id not in ids_with_ann:
+                continue
+            if min(width, height) >= min_size:
+                valid_data_list.append(data_info)
+
+        return valid_data_list
diff --git a/mmde/mmdet/datasets/coco_semantic.py b/mmde/mmdet/datasets/coco_semantic.py
new file mode 100644
index 0000000000000000000000000000000000000000..752568454456c1e5edcb2a24c6c2b46f042cb334
--- /dev/null
+++ b/mmde/mmdet/datasets/coco_semantic.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import DATASETS
+from .ade20k import ADE20KSegDataset
+
+
+@DATASETS.register_module()
+class CocoSegDataset(ADE20KSegDataset):
+    """COCO dataset.
+
+    In segmentation map annotation for COCO. The ``img_suffix`` is fixed to
+    '.jpg',  and ``seg_map_suffix`` is fixed to '.png'.
+    """
+
+    METAINFO = dict(
+        classes=(
+            'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+            'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
+            'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
+            'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
+            'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
+            'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
+            'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
+            'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
+            'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+            'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
+            'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
+            'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
+            'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+            'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner',
+            'blanket', 'branch', 'bridge', 'building-other', 'bush', 'cabinet',
+            'cage', 'cardboard', 'carpet', 'ceiling-other', 'ceiling-tile',
+            'cloth', 'clothes', 'clouds', 'counter', 'cupboard', 'curtain',
+            'desk-stuff', 'dirt', 'door-stuff', 'fence', 'floor-marble',
+            'floor-other', 'floor-stone', 'floor-tile', 'floor-wood', 'flower',
+            'fog', 'food-other', 'fruit', 'furniture-other', 'grass', 'gravel',
+            'ground-other', 'hill', 'house', 'leaves', 'light', 'mat', 'metal',
+            'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net',
+            'paper', 'pavement', 'pillow', 'plant-other', 'plastic',
+            'platform', 'playingfield', 'railing', 'railroad', 'river', 'road',
+            'rock', 'roof', 'rug', 'salad', 'sand', 'sea', 'shelf',
+            'sky-other', 'skyscraper', 'snow', 'solid-other', 'stairs',
+            'stone', 'straw', 'structural-other', 'table', 'tent',
+            'textile-other', 'towel', 'tree', 'vegetable', 'wall-brick',
+            'wall-concrete', 'wall-other', 'wall-panel', 'wall-stone',
+            'wall-tile', 'wall-wood', 'water-other', 'waterdrops',
+            'window-blind', 'window-other', 'wood'),
+        palette=[(120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50),
+                 (4, 200, 3), (120, 120, 80), (140, 140, 140), (204, 5, 255),
+                 (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7),
+                 (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82),
+                 (143, 255, 140), (204, 255, 4), (255, 51, 7), (204, 70, 3),
+                 (0, 102, 200), (61, 230, 250), (255, 6, 51), (11, 102, 255),
+                 (255, 7, 71), (255, 9, 224), (9, 7, 230), (220, 220, 220),
+                 (255, 9, 92), (112, 9, 255), (8, 255, 214), (7, 255, 224),
+                 (255, 184, 6), (10, 255, 71), (255, 41, 10), (7, 255, 255),
+                 (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7),
+                 (255, 122, 8), (0, 255, 20), (255, 8, 41), (255, 5, 153),
+                 (6, 51, 255), (235, 12, 255), (160, 150, 20), (0, 163, 255),
+                 (140, 140, 140), (250, 10, 15), (20, 255, 0), (31, 255, 0),
+                 (255, 31, 0), (255, 224, 0), (153, 255, 0), (0, 0, 255),
+                 (255, 71, 0), (0, 235, 255), (0, 173, 255), (31, 0, 255),
+                 (11, 200, 200), (255, 82, 0), (0, 255, 245), (0, 61, 255),
+                 (0, 255, 112), (0, 255, 133), (255, 0, 0), (255, 163, 0),
+                 (255, 102, 0), (194, 255, 0), (0, 143, 255), (51, 255, 0),
+                 (0, 82, 255), (0, 255, 41), (0, 255, 173), (10, 0, 255),
+                 (173, 255, 0), (0, 255, 153), (255, 92, 0), (255, 0, 255),
+                 (255, 0, 245), (255, 0, 102), (255, 173, 0), (255, 0, 20),
+                 (255, 184, 184), (0, 31, 255), (0, 255, 61), (0, 71, 255),
+                 (255, 0, 204), (0, 255, 194), (0, 255, 82), (0, 10, 255),
+                 (0, 112, 255), (51, 0, 255), (0, 194, 255), (0, 122, 255),
+                 (0, 255, 163), (255, 153, 0), (0, 255, 10), (255, 112, 0),
+                 (143, 255, 0), (82, 0, 255), (163, 255, 0), (255, 235, 0),
+                 (8, 184, 170), (133, 0, 255), (0, 255, 92), (184, 0, 255),
+                 (255, 0, 31), (0, 184, 255), (0, 214, 255), (255, 0, 112),
+                 (92, 255, 0), (0, 224, 255), (112, 224, 255), (70, 184, 160),
+                 (163, 0, 255), (153, 0, 255), (71, 255, 0), (255, 0, 163),
+                 (255, 204, 0), (255, 0, 143), (0, 255, 235), (133, 255, 0),
+                 (255, 0, 235), (245, 0, 255), (255, 0, 122), (255, 245, 0),
+                 (10, 190, 212), (214, 255, 0), (0, 204, 255), (20, 0, 255),
+                 (255, 255, 0), (0, 153, 255), (0, 41, 255), (0, 255, 204),
+                 (41, 0, 255), (41, 255, 0), (173, 0, 255), (0, 245, 255),
+                 (71, 0, 255), (122, 0, 255), (0, 255, 184), (0, 92, 255),
+                 (184, 255, 0), (0, 133, 255), (255, 214, 0), (25, 194, 194),
+                 (102, 255, 0), (92, 0, 255), (107, 255, 200), (58, 41, 149),
+                 (183, 121, 142), (255, 73, 97), (107, 142, 35),
+                 (190, 153, 153), (146, 139, 141), (70, 130, 180),
+                 (134, 199, 156), (209, 226, 140), (96, 36, 108), (96, 96, 96),
+                 (64, 170, 64), (152, 251, 152), (208, 229, 228),
+                 (206, 186, 171), (152, 161, 64), (116, 112, 0), (0, 114, 143),
+                 (102, 102, 156), (250, 141, 255)])
diff --git a/mmde/mmdet/datasets/crowdhuman.py b/mmde/mmdet/datasets/crowdhuman.py
new file mode 100644
index 0000000000000000000000000000000000000000..650176ee545ba6a10a816517553b3b77718d945b
--- /dev/null
+++ b/mmde/mmdet/datasets/crowdhuman.py
@@ -0,0 +1,159 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import logging
+import os.path as osp
+import warnings
+from typing import List, Union
+
+import mmcv
+from mmengine.dist import get_rank
+from mmengine.fileio import dump, get, get_text, load
+from mmengine.logging import print_log
+from mmengine.utils import ProgressBar
+
+from mmdet.registry import DATASETS
+from .base_det_dataset import BaseDetDataset
+
+
+@DATASETS.register_module()
+class CrowdHumanDataset(BaseDetDataset):
+    r"""Dataset for CrowdHuman.
+
+    Args:
+        data_root (str): The root directory for
+            ``data_prefix`` and ``ann_file``.
+        ann_file (str): Annotation file path.
+        extra_ann_file (str | optional):The path of extra image metas
+            for CrowdHuman. It can be created by CrowdHumanDataset
+            automatically or by tools/misc/get_crowdhuman_id_hw.py
+            manually. Defaults to None.
+    """
+
+    METAINFO = {
+        'classes': ('person', ),
+        # palette is a list of color tuples, which is used for visualization.
+        'palette': [(220, 20, 60)]
+    }
+
+    def __init__(self, data_root, ann_file, extra_ann_file=None, **kwargs):
+        # extra_ann_file record the size of each image. This file is
+        # automatically created when you first load the CrowdHuman
+        # dataset by mmdet.
+        if extra_ann_file is not None:
+            self.extra_ann_exist = True
+            self.extra_anns = load(extra_ann_file)
+        else:
+            ann_file_name = osp.basename(ann_file)
+            if 'train' in ann_file_name:
+                self.extra_ann_file = osp.join(data_root, 'id_hw_train.json')
+            elif 'val' in ann_file_name:
+                self.extra_ann_file = osp.join(data_root, 'id_hw_val.json')
+            self.extra_ann_exist = False
+            if not osp.isfile(self.extra_ann_file):
+                print_log(
+                    'extra_ann_file does not exist, prepare to collect '
+                    'image height and width...',
+                    level=logging.INFO)
+                self.extra_anns = {}
+            else:
+                self.extra_ann_exist = True
+                self.extra_anns = load(self.extra_ann_file)
+        super().__init__(data_root=data_root, ann_file=ann_file, **kwargs)
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of annotation.
+        """  # noqa: E501
+        anno_strs = get_text(
+            self.ann_file, backend_args=self.backend_args).strip().split('\n')
+        print_log('loading CrowdHuman annotation...', level=logging.INFO)
+        data_list = []
+        prog_bar = ProgressBar(len(anno_strs))
+        for i, anno_str in enumerate(anno_strs):
+            anno_dict = json.loads(anno_str)
+            parsed_data_info = self.parse_data_info(anno_dict)
+            data_list.append(parsed_data_info)
+            prog_bar.update()
+        if not self.extra_ann_exist and get_rank() == 0:
+            #  TODO: support file client
+            try:
+                dump(self.extra_anns, self.extra_ann_file, file_format='json')
+            except:  # noqa
+                warnings.warn(
+                    'Cache files can not be saved automatically! To speed up'
+                    'loading the dataset, please manually generate the cache'
+                    ' file by file tools/misc/get_crowdhuman_id_hw.py')
+
+            print_log(
+                f'\nsave extra_ann_file in {self.data_root}',
+                level=logging.INFO)
+
+        del self.extra_anns
+        print_log('\nDone', level=logging.INFO)
+        return data_list
+
+    def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]:
+        """Parse raw annotation to target format.
+
+        Args:
+            raw_data_info (dict): Raw data information load from ``ann_file``
+
+        Returns:
+            Union[dict, List[dict]]: Parsed annotation.
+        """
+        data_info = {}
+        img_path = osp.join(self.data_prefix['img'],
+                            f"{raw_data_info['ID']}.jpg")
+        data_info['img_path'] = img_path
+        data_info['img_id'] = raw_data_info['ID']
+
+        if not self.extra_ann_exist:
+            img_bytes = get(img_path, backend_args=self.backend_args)
+            img = mmcv.imfrombytes(img_bytes, backend='cv2')
+            data_info['height'], data_info['width'] = img.shape[:2]
+            self.extra_anns[raw_data_info['ID']] = img.shape[:2]
+            del img, img_bytes
+        else:
+            data_info['height'], data_info['width'] = self.extra_anns[
+                raw_data_info['ID']]
+
+        instances = []
+        for i, ann in enumerate(raw_data_info['gtboxes']):
+            instance = {}
+            if ann['tag'] not in self.metainfo['classes']:
+                instance['bbox_label'] = -1
+                instance['ignore_flag'] = 1
+            else:
+                instance['bbox_label'] = self.metainfo['classes'].index(
+                    ann['tag'])
+                instance['ignore_flag'] = 0
+            if 'extra' in ann:
+                if 'ignore' in ann['extra']:
+                    if ann['extra']['ignore'] != 0:
+                        instance['bbox_label'] = -1
+                        instance['ignore_flag'] = 1
+
+            x1, y1, w, h = ann['fbox']
+            bbox = [x1, y1, x1 + w, y1 + h]
+            instance['bbox'] = bbox
+
+            # Record the full bbox(fbox), head bbox(hbox) and visible
+            # bbox(vbox) as additional information. If you need to use
+            # this information, you just need to design the pipeline
+            # instead of overriding the CrowdHumanDataset.
+            instance['fbox'] = bbox
+            hbox = ann['hbox']
+            instance['hbox'] = [
+                hbox[0], hbox[1], hbox[0] + hbox[2], hbox[1] + hbox[3]
+            ]
+            vbox = ann['vbox']
+            instance['vbox'] = [
+                vbox[0], vbox[1], vbox[0] + vbox[2], vbox[1] + vbox[3]
+            ]
+
+            instances.append(instance)
+
+        data_info['instances'] = instances
+        return data_info
diff --git a/mmde/mmdet/datasets/dataset_wrappers.py b/mmde/mmdet/datasets/dataset_wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4e26e07c0f8a9e9f106bcd351f71e7b24d6ccf9
--- /dev/null
+++ b/mmde/mmdet/datasets/dataset_wrappers.py
@@ -0,0 +1,260 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import collections
+import copy
+from typing import List, Sequence, Union
+
+from mmengine.dataset import BaseDataset
+from mmengine.dataset import ConcatDataset as MMENGINE_ConcatDataset
+from mmengine.dataset import force_full_init
+
+from mmdet.registry import DATASETS, TRANSFORMS
+
+
+@DATASETS.register_module()
+class MultiImageMixDataset:
+    """A wrapper of multiple images mixed dataset.
+
+    Suitable for training on multiple images mixed data augmentation like
+    mosaic and mixup. For the augmentation pipeline of mixed image data,
+    the `get_indexes` method needs to be provided to obtain the image
+    indexes, and you can set `skip_flags` to change the pipeline running
+    process. At the same time, we provide the `dynamic_scale` parameter
+    to dynamically change the output image size.
+
+    Args:
+        dataset (:obj:`CustomDataset`): The dataset to be mixed.
+        pipeline (Sequence[dict]): Sequence of transform object or
+            config dict to be composed.
+        dynamic_scale (tuple[int], optional): The image scale can be changed
+            dynamically. Default to None. It is deprecated.
+        skip_type_keys (list[str], optional): Sequence of type string to
+            be skip pipeline. Default to None.
+        max_refetch (int): The maximum number of retry iterations for getting
+            valid results from the pipeline. If the number of iterations is
+            greater than `max_refetch`, but results is still None, then the
+            iteration is terminated and raise the error. Default: 15.
+    """
+
+    def __init__(self,
+                 dataset: Union[BaseDataset, dict],
+                 pipeline: Sequence[str],
+                 skip_type_keys: Union[Sequence[str], None] = None,
+                 max_refetch: int = 15,
+                 lazy_init: bool = False) -> None:
+        assert isinstance(pipeline, collections.abc.Sequence)
+        if skip_type_keys is not None:
+            assert all([
+                isinstance(skip_type_key, str)
+                for skip_type_key in skip_type_keys
+            ])
+        self._skip_type_keys = skip_type_keys
+
+        self.pipeline = []
+        self.pipeline_types = []
+        for transform in pipeline:
+            if isinstance(transform, dict):
+                self.pipeline_types.append(transform['type'])
+                transform = TRANSFORMS.build(transform)
+                self.pipeline.append(transform)
+            else:
+                raise TypeError('pipeline must be a dict')
+
+        self.dataset: BaseDataset
+        if isinstance(dataset, dict):
+            self.dataset = DATASETS.build(dataset)
+        elif isinstance(dataset, BaseDataset):
+            self.dataset = dataset
+        else:
+            raise TypeError(
+                'elements in datasets sequence should be config or '
+                f'`BaseDataset` instance, but got {type(dataset)}')
+
+        self._metainfo = self.dataset.metainfo
+        if hasattr(self.dataset, 'flag'):
+            self.flag = self.dataset.flag
+        self.num_samples = len(self.dataset)
+        self.max_refetch = max_refetch
+
+        self._fully_initialized = False
+        if not lazy_init:
+            self.full_init()
+
+    @property
+    def metainfo(self) -> dict:
+        """Get the meta information of the multi-image-mixed dataset.
+
+        Returns:
+            dict: The meta information of multi-image-mixed dataset.
+        """
+        return copy.deepcopy(self._metainfo)
+
+    def full_init(self):
+        """Loop to ``full_init`` each dataset."""
+        if self._fully_initialized:
+            return
+
+        self.dataset.full_init()
+        self._ori_len = len(self.dataset)
+        self._fully_initialized = True
+
+    @force_full_init
+    def get_data_info(self, idx: int) -> dict:
+        """Get annotation by index.
+
+        Args:
+            idx (int): Global index of ``ConcatDataset``.
+
+        Returns:
+            dict: The idx-th annotation of the datasets.
+        """
+        return self.dataset.get_data_info(idx)
+
+    @force_full_init
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        results = copy.deepcopy(self.dataset[idx])
+        for (transform, transform_type) in zip(self.pipeline,
+                                               self.pipeline_types):
+            if self._skip_type_keys is not None and \
+                    transform_type in self._skip_type_keys:
+                continue
+
+            if hasattr(transform, 'get_indexes'):
+                for i in range(self.max_refetch):
+                    # Make sure the results passed the loading pipeline
+                    # of the original dataset is not None.
+                    indexes = transform.get_indexes(self.dataset)
+                    if not isinstance(indexes, collections.abc.Sequence):
+                        indexes = [indexes]
+                    mix_results = [
+                        copy.deepcopy(self.dataset[index]) for index in indexes
+                    ]
+                    if None not in mix_results:
+                        results['mix_results'] = mix_results
+                        break
+                else:
+                    raise RuntimeError(
+                        'The loading pipeline of the original dataset'
+                        ' always return None. Please check the correctness '
+                        'of the dataset and its pipeline.')
+
+            for i in range(self.max_refetch):
+                # To confirm the results passed the training pipeline
+                # of the wrapper is not None.
+                updated_results = transform(copy.deepcopy(results))
+                if updated_results is not None:
+                    results = updated_results
+                    break
+            else:
+                raise RuntimeError(
+                    'The training pipeline of the dataset wrapper'
+                    ' always return None.Please check the correctness '
+                    'of the dataset and its pipeline.')
+
+            if 'mix_results' in results:
+                results.pop('mix_results')
+
+        return results
+
+    def update_skip_type_keys(self, skip_type_keys):
+        """Update skip_type_keys. It is called by an external hook.
+
+        Args:
+            skip_type_keys (list[str], optional): Sequence of type
+                string to be skip pipeline.
+        """
+        assert all([
+            isinstance(skip_type_key, str) for skip_type_key in skip_type_keys
+        ])
+        self._skip_type_keys = skip_type_keys
+
+
+@DATASETS.register_module()
+class ConcatDataset(MMENGINE_ConcatDataset):
+    """A wrapper of concatenated dataset.
+
+    Same as ``torch.utils.data.dataset.ConcatDataset``, support
+    lazy_init and get_dataset_source.
+
+    Note:
+        ``ConcatDataset`` should not inherit from ``BaseDataset`` since
+        ``get_subset`` and ``get_subset_`` could produce ambiguous meaning
+        sub-dataset which conflicts with original dataset. If you want to use
+        a sub-dataset of ``ConcatDataset``, you should set ``indices``
+        arguments for wrapped dataset which inherit from ``BaseDataset``.
+
+    Args:
+        datasets (Sequence[BaseDataset] or Sequence[dict]): A list of datasets
+            which will be concatenated.
+        lazy_init (bool, optional): Whether to load annotation during
+            instantiation. Defaults to False.
+        ignore_keys (List[str] or str): Ignore the keys that can be
+            unequal in `dataset.metainfo`. Defaults to None.
+            `New in version 0.3.0.`
+    """
+
+    def __init__(self,
+                 datasets: Sequence[Union[BaseDataset, dict]],
+                 lazy_init: bool = False,
+                 ignore_keys: Union[str, List[str], None] = None):
+        self.datasets: List[BaseDataset] = []
+        for i, dataset in enumerate(datasets):
+            if isinstance(dataset, dict):
+                self.datasets.append(DATASETS.build(dataset))
+            elif isinstance(dataset, BaseDataset):
+                self.datasets.append(dataset)
+            else:
+                raise TypeError(
+                    'elements in datasets sequence should be config or '
+                    f'`BaseDataset` instance, but got {type(dataset)}')
+        if ignore_keys is None:
+            self.ignore_keys = []
+        elif isinstance(ignore_keys, str):
+            self.ignore_keys = [ignore_keys]
+        elif isinstance(ignore_keys, list):
+            self.ignore_keys = ignore_keys
+        else:
+            raise TypeError('ignore_keys should be a list or str, '
+                            f'but got {type(ignore_keys)}')
+
+        meta_keys: set = set()
+        for dataset in self.datasets:
+            meta_keys |= dataset.metainfo.keys()
+        # if the metainfo of multiple datasets are the same, use metainfo
+        # of the first dataset, else the metainfo is a list with metainfo
+        # of all the datasets
+        is_all_same = True
+        self._metainfo_first = self.datasets[0].metainfo
+        for i, dataset in enumerate(self.datasets, 1):
+            for key in meta_keys:
+                if key in self.ignore_keys:
+                    continue
+                if key not in dataset.metainfo:
+                    is_all_same = False
+                    break
+                if self._metainfo_first[key] != dataset.metainfo[key]:
+                    is_all_same = False
+                    break
+
+        if is_all_same:
+            self._metainfo = self.datasets[0].metainfo
+        else:
+            self._metainfo = [dataset.metainfo for dataset in self.datasets]
+
+        self._fully_initialized = False
+        if not lazy_init:
+            self.full_init()
+
+            if is_all_same:
+                self._metainfo.update(
+                    dict(cumulative_sizes=self.cumulative_sizes))
+            else:
+                for i, dataset in enumerate(self.datasets):
+                    self._metainfo[i].update(
+                        dict(cumulative_sizes=self.cumulative_sizes))
+
+    def get_dataset_source(self, idx: int) -> int:
+        dataset_idx, _ = self._get_ori_dataset_idx(idx)
+        return dataset_idx
diff --git a/mmde/mmdet/datasets/deepfashion.py b/mmde/mmdet/datasets/deepfashion.py
new file mode 100644
index 0000000000000000000000000000000000000000..f853fc63398d598b90a88323e660ba6f4d81e2df
--- /dev/null
+++ b/mmde/mmdet/datasets/deepfashion.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class DeepFashionDataset(CocoDataset):
+    """Dataset for DeepFashion."""
+
+    METAINFO = {
+        'classes': ('top', 'skirt', 'leggings', 'dress', 'outer', 'pants',
+                    'bag', 'neckwear', 'headwear', 'eyeglass', 'belt',
+                    'footwear', 'hair', 'skin', 'face'),
+        # palette is a list of color tuples, which is used for visualization.
+        'palette': [(0, 192, 64), (0, 64, 96), (128, 192, 192), (0, 64, 64),
+                    (0, 192, 224), (0, 192, 192), (128, 192, 64), (0, 192, 96),
+                    (128, 32, 192), (0, 0, 224), (0, 0, 64), (0, 160, 192),
+                    (128, 0, 96), (128, 0, 192), (0, 32, 192)]
+    }
diff --git a/mmde/mmdet/datasets/dod.py b/mmde/mmdet/datasets/dod.py
new file mode 100644
index 0000000000000000000000000000000000000000..152d32aaf70c7fb5e3730d46d26e150fc1204f22
--- /dev/null
+++ b/mmde/mmdet/datasets/dod.py
@@ -0,0 +1,78 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import List, Optional
+
+import numpy as np
+
+from mmdet.registry import DATASETS
+from .base_det_dataset import BaseDetDataset
+
+try:
+    from d_cube import D3
+except ImportError:
+    D3 = None
+from .api_wrappers import COCO
+
+
+@DATASETS.register_module()
+class DODDataset(BaseDetDataset):
+
+    def __init__(self,
+                 *args,
+                 data_root: Optional[str] = '',
+                 data_prefix: dict = dict(img_path=''),
+                 **kwargs) -> None:
+        if D3 is None:
+            raise ImportError(
+                'Please install d3 by `pip install ddd-dataset`.')
+        pkl_anno_path = osp.join(data_root, data_prefix['anno'])
+        self.img_root = osp.join(data_root, data_prefix['img'])
+        self.d3 = D3(self.img_root, pkl_anno_path)
+
+        sent_infos = self.d3.load_sents()
+        classes = tuple([sent_info['raw_sent'] for sent_info in sent_infos])
+        super().__init__(
+            *args,
+            data_root=data_root,
+            data_prefix=data_prefix,
+            metainfo={'classes': classes},
+            **kwargs)
+
+    def load_data_list(self) -> List[dict]:
+        coco = COCO(self.ann_file)
+        data_list = []
+        img_ids = self.d3.get_img_ids()
+        for img_id in img_ids:
+            data_info = {}
+
+            img_info = self.d3.load_imgs(img_id)[0]
+            file_name = img_info['file_name']
+            img_path = osp.join(self.img_root, file_name)
+            data_info['img_path'] = img_path
+            data_info['img_id'] = img_id
+            data_info['height'] = img_info['height']
+            data_info['width'] = img_info['width']
+
+            group_ids = self.d3.get_group_ids(img_ids=[img_id])
+            sent_ids = self.d3.get_sent_ids(group_ids=group_ids)
+            sent_list = self.d3.load_sents(sent_ids=sent_ids)
+            text_list = [sent['raw_sent'] for sent in sent_list]
+            ann_ids = coco.get_ann_ids(img_ids=[img_id])
+            anno = coco.load_anns(ann_ids)
+
+            data_info['text'] = text_list
+            data_info['sent_ids'] = np.array([s for s in sent_ids])
+            data_info['custom_entities'] = True
+
+            instances = []
+            for i, ann in enumerate(anno):
+                instance = {}
+                x1, y1, w, h = ann['bbox']
+                bbox = [x1, y1, x1 + w, y1 + h]
+                instance['ignore_flag'] = 0
+                instance['bbox'] = bbox
+                instance['bbox_label'] = ann['category_id'] - 1
+                instances.append(instance)
+            data_info['instances'] = instances
+            data_list.append(data_info)
+        return data_list
diff --git a/mmde/mmdet/datasets/dsdl.py b/mmde/mmdet/datasets/dsdl.py
new file mode 100644
index 0000000000000000000000000000000000000000..75570a2a6396e0e7a4ce5cac5dbf2a23cd164629
--- /dev/null
+++ b/mmde/mmdet/datasets/dsdl.py
@@ -0,0 +1,192 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+from typing import List
+
+from mmdet.registry import DATASETS
+from .base_det_dataset import BaseDetDataset
+
+try:
+    from dsdl.dataset import DSDLDataset
+except ImportError:
+    DSDLDataset = None
+
+
+@DATASETS.register_module()
+class DSDLDetDataset(BaseDetDataset):
+    """Dataset for dsdl detection.
+
+    Args:
+        with_bbox(bool): Load bbox or not, defaults to be True.
+        with_polygon(bool): Load polygon or not, defaults to be False.
+        with_mask(bool): Load seg map mask or not, defaults to be False.
+        with_imagelevel_label(bool): Load image level label or not,
+            defaults to be False.
+        with_hierarchy(bool): Load hierarchy information or not,
+            defaults to be False.
+        specific_key_path(dict): Path of specific key which can not
+            be loaded by it's field name.
+        pre_transform(dict): pre-transform functions before loading.
+    """
+
+    METAINFO = {}
+
+    def __init__(self,
+                 with_bbox: bool = True,
+                 with_polygon: bool = False,
+                 with_mask: bool = False,
+                 with_imagelevel_label: bool = False,
+                 with_hierarchy: bool = False,
+                 specific_key_path: dict = {},
+                 pre_transform: dict = {},
+                 **kwargs) -> None:
+
+        if DSDLDataset is None:
+            raise RuntimeError(
+                'Package dsdl is not installed. Please run "pip install dsdl".'
+            )
+
+        self.with_hierarchy = with_hierarchy
+        self.specific_key_path = specific_key_path
+
+        loc_config = dict(type='LocalFileReader', working_dir='')
+        if kwargs.get('data_root'):
+            kwargs['ann_file'] = os.path.join(kwargs['data_root'],
+                                              kwargs['ann_file'])
+        self.required_fields = ['Image', 'ImageShape', 'Label', 'ignore_flag']
+        if with_bbox:
+            self.required_fields.append('Bbox')
+        if with_polygon:
+            self.required_fields.append('Polygon')
+        if with_mask:
+            self.required_fields.append('LabelMap')
+        if with_imagelevel_label:
+            self.required_fields.append('image_level_labels')
+            assert 'image_level_labels' in specific_key_path.keys(
+            ), '`image_level_labels` not specified in `specific_key_path` !'
+
+        self.extra_keys = [
+            key for key in self.specific_key_path.keys()
+            if key not in self.required_fields
+        ]
+
+        self.dsdldataset = DSDLDataset(
+            dsdl_yaml=kwargs['ann_file'],
+            location_config=loc_config,
+            required_fields=self.required_fields,
+            specific_key_path=specific_key_path,
+            transform=pre_transform,
+        )
+
+        BaseDetDataset.__init__(self, **kwargs)
+
+    def load_data_list(self) -> List[dict]:
+        """Load data info from an dsdl yaml file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of data info.
+        """
+        if self.with_hierarchy:
+            # get classes_names and relation_matrix
+            classes_names, relation_matrix = \
+                self.dsdldataset.class_dom.get_hierarchy_info()
+            self._metainfo['classes'] = tuple(classes_names)
+            self._metainfo['RELATION_MATRIX'] = relation_matrix
+
+        else:
+            self._metainfo['classes'] = tuple(self.dsdldataset.class_names)
+
+        data_list = []
+
+        for i, data in enumerate(self.dsdldataset):
+            # basic image info, including image id, path and size.
+            datainfo = dict(
+                img_id=i,
+                img_path=os.path.join(self.data_prefix['img_path'],
+                                      data['Image'][0].location),
+                width=data['ImageShape'][0].width,
+                height=data['ImageShape'][0].height,
+            )
+
+            # get image label info
+            if 'image_level_labels' in data.keys():
+                if self.with_hierarchy:
+                    # get leaf node name when using hierarchy classes
+                    datainfo['image_level_labels'] = [
+                        self._metainfo['classes'].index(i.leaf_node_name)
+                        for i in data['image_level_labels']
+                    ]
+                else:
+                    datainfo['image_level_labels'] = [
+                        self._metainfo['classes'].index(i.name)
+                        for i in data['image_level_labels']
+                    ]
+
+            # get semantic segmentation info
+            if 'LabelMap' in data.keys():
+                datainfo['seg_map_path'] = data['LabelMap']
+
+            # load instance info
+            instances = []
+            if 'Bbox' in data.keys():
+                for idx in range(len(data['Bbox'])):
+                    bbox = data['Bbox'][idx]
+                    if self.with_hierarchy:
+                        # get leaf node name when using hierarchy classes
+                        label = data['Label'][idx].leaf_node_name
+                        label_index = self._metainfo['classes'].index(label)
+                    else:
+                        label = data['Label'][idx].name
+                        label_index = self._metainfo['classes'].index(label)
+
+                    instance = {}
+                    instance['bbox'] = bbox.xyxy
+                    instance['bbox_label'] = label_index
+
+                    if 'ignore_flag' in data.keys():
+                        # get ignore flag
+                        instance['ignore_flag'] = data['ignore_flag'][idx]
+                    else:
+                        instance['ignore_flag'] = 0
+
+                    if 'Polygon' in data.keys():
+                        # get polygon info
+                        polygon = data['Polygon'][idx]
+                        instance['mask'] = polygon.openmmlabformat
+
+                    for key in self.extra_keys:
+                        # load extra instance info
+                        instance[key] = data[key][idx]
+
+                    instances.append(instance)
+
+            datainfo['instances'] = instances
+            # append a standard sample in data list
+            if len(datainfo['instances']) > 0:
+                data_list.append(datainfo)
+
+        return data_list
+
+    def filter_data(self) -> List[dict]:
+        """Filter annotations according to filter_cfg.
+
+        Returns:
+            List[dict]: Filtered results.
+        """
+        if self.test_mode:
+            return self.data_list
+
+        filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) \
+            if self.filter_cfg is not None else False
+        min_size = self.filter_cfg.get('min_size', 0) \
+            if self.filter_cfg is not None else 0
+
+        valid_data_list = []
+        for i, data_info in enumerate(self.data_list):
+            width = data_info['width']
+            height = data_info['height']
+            if filter_empty_gt and len(data_info['instances']) == 0:
+                continue
+            if min(width, height) >= min_size:
+                valid_data_list.append(data_info)
+
+        return valid_data_list
diff --git a/mmde/mmdet/datasets/flickr30k.py b/mmde/mmdet/datasets/flickr30k.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c76a41bc965bb0e8348c3d13e77d5c6e8ca08ce
--- /dev/null
+++ b/mmde/mmdet/datasets/flickr30k.py
@@ -0,0 +1,81 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import List
+
+from pycocotools.coco import COCO
+
+from mmdet.registry import DATASETS
+from .base_det_dataset import BaseDetDataset
+
+
+def convert_phrase_ids(phrase_ids: list) -> list:
+    unique_elements = sorted(set(phrase_ids))
+    element_to_new_label = {
+        element: label
+        for label, element in enumerate(unique_elements)
+    }
+    phrase_ids = [element_to_new_label[element] for element in phrase_ids]
+    return phrase_ids
+
+
+@DATASETS.register_module()
+class Flickr30kDataset(BaseDetDataset):
+    """Flickr30K Dataset."""
+
+    def load_data_list(self) -> List[dict]:
+
+        self.coco = COCO(self.ann_file)
+
+        self.ids = sorted(list(self.coco.imgs.keys()))
+
+        data_list = []
+        for img_id in self.ids:
+            if isinstance(img_id, str):
+                ann_ids = self.coco.getAnnIds(imgIds=[img_id], iscrowd=None)
+            else:
+                ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
+
+            coco_img = self.coco.loadImgs(img_id)[0]
+
+            caption = coco_img['caption']
+            file_name = coco_img['file_name']
+            img_path = osp.join(self.data_prefix['img'], file_name)
+            width = coco_img['width']
+            height = coco_img['height']
+            tokens_positive = coco_img['tokens_positive_eval']
+            phrases = [caption[i[0][0]:i[0][1]] for i in tokens_positive]
+            phrase_ids = []
+
+            instances = []
+            annos = self.coco.loadAnns(ann_ids)
+            for anno in annos:
+                instance = {
+                    'bbox': [
+                        anno['bbox'][0], anno['bbox'][1],
+                        anno['bbox'][0] + anno['bbox'][2],
+                        anno['bbox'][1] + anno['bbox'][3]
+                    ],
+                    'bbox_label':
+                    anno['category_id'],
+                    'ignore_flag':
+                    anno['iscrowd']
+                }
+                phrase_ids.append(anno['phrase_ids'])
+                instances.append(instance)
+
+            phrase_ids = convert_phrase_ids(phrase_ids)
+
+            data_list.append(
+                dict(
+                    img_path=img_path,
+                    img_id=img_id,
+                    height=height,
+                    width=width,
+                    instances=instances,
+                    text=caption,
+                    phrase_ids=phrase_ids,
+                    tokens_positive=tokens_positive,
+                    phrases=phrases,
+                ))
+
+        return data_list
diff --git a/mmde/mmdet/datasets/isaid.py b/mmde/mmdet/datasets/isaid.py
new file mode 100644
index 0000000000000000000000000000000000000000..87067d8459c4dd6e80e5f808f613e0bd600b5f2f
--- /dev/null
+++ b/mmde/mmdet/datasets/isaid.py
@@ -0,0 +1,25 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class iSAIDDataset(CocoDataset):
+    """Dataset for iSAID instance segmentation.
+
+    iSAID: A Large-scale Dataset for Instance Segmentation
+    in Aerial Images.
+
+    For more detail, please refer to "projects/iSAID/README.md"
+    """
+
+    METAINFO = dict(
+        classes=('background', 'ship', 'store_tank', 'baseball_diamond',
+                 'tennis_court', 'basketball_court', 'Ground_Track_Field',
+                 'Bridge', 'Large_Vehicle', 'Small_Vehicle', 'Helicopter',
+                 'Swimming_pool', 'Roundabout', 'Soccer_ball_field', 'plane',
+                 'Harbor'),
+        palette=[(0, 0, 0), (0, 0, 63), (0, 63, 63), (0, 63, 0), (0, 63, 127),
+                 (0, 63, 191), (0, 63, 255), (0, 127, 63), (0, 127, 127),
+                 (0, 0, 127), (0, 0, 191), (0, 0, 255), (0, 191, 127),
+                 (0, 127, 191), (0, 127, 255), (0, 100, 155)])
diff --git a/mmde/mmdet/datasets/lvis.py b/mmde/mmdet/datasets/lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9629f5d463da183f0b4ab4c5d0f7ff7b07e4348
--- /dev/null
+++ b/mmde/mmdet/datasets/lvis.py
@@ -0,0 +1,638 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+from typing import List
+
+from mmengine.fileio import get_local_path
+
+from mmdet.registry import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class LVISV05Dataset(CocoDataset):
+    """LVIS v0.5 dataset for detection."""
+
+    METAINFO = {
+        'classes':
+        ('acorn', 'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock',
+         'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet',
+         'antenna', 'apple', 'apple_juice', 'applesauce', 'apricot', 'apron',
+         'aquarium', 'armband', 'armchair', 'armoire', 'armor', 'artichoke',
+         'trash_can', 'ashtray', 'asparagus', 'atomizer', 'avocado', 'award',
+         'awning', 'ax', 'baby_buggy', 'basketball_backboard', 'backpack',
+         'handbag', 'suitcase', 'bagel', 'bagpipe', 'baguet', 'bait', 'ball',
+         'ballet_skirt', 'balloon', 'bamboo', 'banana', 'Band_Aid', 'bandage',
+         'bandanna', 'banjo', 'banner', 'barbell', 'barge', 'barrel',
+         'barrette', 'barrow', 'baseball_base', 'baseball', 'baseball_bat',
+         'baseball_cap', 'baseball_glove', 'basket', 'basketball_hoop',
+         'basketball', 'bass_horn', 'bat_(animal)', 'bath_mat', 'bath_towel',
+         'bathrobe', 'bathtub', 'batter_(food)', 'battery', 'beachball',
+         'bead', 'beaker', 'bean_curd', 'beanbag', 'beanie', 'bear', 'bed',
+         'bedspread', 'cow', 'beef_(food)', 'beeper', 'beer_bottle',
+         'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', 'belt_buckle',
+         'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', 'binder',
+         'binoculars', 'bird', 'birdfeeder', 'birdbath', 'birdcage',
+         'birdhouse', 'birthday_cake', 'birthday_card', 'biscuit_(bread)',
+         'pirate_flag', 'black_sheep', 'blackboard', 'blanket', 'blazer',
+         'blender', 'blimp', 'blinker', 'blueberry', 'boar', 'gameboard',
+         'boat', 'bobbin', 'bobby_pin', 'boiled_egg', 'bolo_tie', 'deadbolt',
+         'bolt', 'bonnet', 'book', 'book_bag', 'bookcase', 'booklet',
+         'bookmark', 'boom_microphone', 'boot', 'bottle', 'bottle_opener',
+         'bouquet', 'bow_(weapon)', 'bow_(decorative_ribbons)', 'bow-tie',
+         'bowl', 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'bowling_pin',
+         'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere',
+         'bread-bin', 'breechcloth', 'bridal_gown', 'briefcase',
+         'bristle_brush', 'broccoli', 'broach', 'broom', 'brownie',
+         'brussels_sprouts', 'bubble_gum', 'bucket', 'horse_buggy', 'bull',
+         'bulldog', 'bulldozer', 'bullet_train', 'bulletin_board',
+         'bulletproof_vest', 'bullhorn', 'corned_beef', 'bun', 'bunk_bed',
+         'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butcher_knife',
+         'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car',
+         'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf',
+         'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)',
+         'can', 'can_opener', 'candelabrum', 'candle', 'candle_holder',
+         'candy_bar', 'candy_cane', 'walking_cane', 'canister', 'cannon',
+         'canoe', 'cantaloup', 'canteen', 'cap_(headwear)', 'bottle_cap',
+         'cape', 'cappuccino', 'car_(automobile)', 'railcar_(part_of_a_train)',
+         'elevator_car', 'car_battery', 'identity_card', 'card', 'cardigan',
+         'cargo_ship', 'carnation', 'horse_carriage', 'carrot', 'tote_bag',
+         'cart', 'carton', 'cash_register', 'casserole', 'cassette', 'cast',
+         'cat', 'cauliflower', 'caviar', 'cayenne_(spice)', 'CD_player',
+         'celery', 'cellular_telephone', 'chain_mail', 'chair',
+         'chaise_longue', 'champagne', 'chandelier', 'chap', 'checkbook',
+         'checkerboard', 'cherry', 'chessboard',
+         'chest_of_drawers_(furniture)', 'chicken_(animal)', 'chicken_wire',
+         'chickpea', 'Chihuahua', 'chili_(vegetable)', 'chime', 'chinaware',
+         'crisp_(potato_chip)', 'poker_chip', 'chocolate_bar',
+         'chocolate_cake', 'chocolate_milk', 'chocolate_mousse', 'choker',
+         'chopping_board', 'chopstick', 'Christmas_tree', 'slide', 'cider',
+         'cigar_box', 'cigarette', 'cigarette_case', 'cistern', 'clarinet',
+         'clasp', 'cleansing_agent', 'clementine', 'clip', 'clipboard',
+         'clock', 'clock_tower', 'clothes_hamper', 'clothespin', 'clutch_bag',
+         'coaster', 'coat', 'coat_hanger', 'coatrack', 'cock', 'coconut',
+         'coffee_filter', 'coffee_maker', 'coffee_table', 'coffeepot', 'coil',
+         'coin', 'colander', 'coleslaw', 'coloring_material',
+         'combination_lock', 'pacifier', 'comic_book', 'computer_keyboard',
+         'concrete_mixer', 'cone', 'control', 'convertible_(automobile)',
+         'sofa_bed', 'cookie', 'cookie_jar', 'cooking_utensil',
+         'cooler_(for_food)', 'cork_(bottle_plug)', 'corkboard', 'corkscrew',
+         'edible_corn', 'cornbread', 'cornet', 'cornice', 'cornmeal', 'corset',
+         'romaine_lettuce', 'costume', 'cougar', 'coverall', 'cowbell',
+         'cowboy_hat', 'crab_(animal)', 'cracker', 'crape', 'crate', 'crayon',
+         'cream_pitcher', 'credit_card', 'crescent_roll', 'crib', 'crock_pot',
+         'crossbar', 'crouton', 'crow', 'crown', 'crucifix', 'cruise_ship',
+         'police_cruiser', 'crumb', 'crutch', 'cub_(animal)', 'cube',
+         'cucumber', 'cufflink', 'cup', 'trophy_cup', 'cupcake', 'hair_curler',
+         'curling_iron', 'curtain', 'cushion', 'custard', 'cutting_tool',
+         'cylinder', 'cymbal', 'dachshund', 'dagger', 'dartboard',
+         'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk',
+         'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table',
+         'tux', 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher',
+         'dishwasher_detergent', 'diskette', 'dispenser', 'Dixie_cup', 'dog',
+         'dog_collar', 'doll', 'dollar', 'dolphin', 'domestic_ass', 'eye_mask',
+         'doorbell', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly',
+         'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit',
+         'dresser', 'drill', 'drinking_fountain', 'drone', 'dropper',
+         'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling',
+         'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan',
+         'Dutch_oven', 'eagle', 'earphone', 'earplug', 'earring', 'easel',
+         'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater',
+         'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk',
+         'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan',
+         'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)',
+         'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)',
+         'fire_alarm', 'fire_engine', 'fire_extinguisher', 'fire_hose',
+         'fireplace', 'fireplug', 'fish', 'fish_(food)', 'fishbowl',
+         'fishing_boat', 'fishing_rod', 'flag', 'flagpole', 'flamingo',
+         'flannel', 'flash', 'flashlight', 'fleece', 'flip-flop_(sandal)',
+         'flipper_(footwear)', 'flower_arrangement', 'flute_glass', 'foal',
+         'folding_chair', 'food_processor', 'football_(American)',
+         'football_helmet', 'footstool', 'fork', 'forklift', 'freight_car',
+         'French_toast', 'freshener', 'frisbee', 'frog', 'fruit_juice',
+         'fruit_salad', 'frying_pan', 'fudge', 'funnel', 'futon', 'gag',
+         'garbage', 'garbage_truck', 'garden_hose', 'gargle', 'gargoyle',
+         'garlic', 'gasmask', 'gazelle', 'gelatin', 'gemstone', 'giant_panda',
+         'gift_wrap', 'ginger', 'giraffe', 'cincture',
+         'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles',
+         'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose',
+         'gorilla', 'gourd', 'surgical_gown', 'grape', 'grasshopper', 'grater',
+         'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle',
+         'grillroom', 'grinder_(tool)', 'grits', 'grizzly', 'grocery_bag',
+         'guacamole', 'guitar', 'gull', 'gun', 'hair_spray', 'hairbrush',
+         'hairnet', 'hairpin', 'ham', 'hamburger', 'hammer', 'hammock',
+         'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel',
+         'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw',
+         'hardback_book', 'harmonium', 'hat', 'hatbox', 'hatch', 'veil',
+         'headband', 'headboard', 'headlight', 'headscarf', 'headset',
+         'headstall_(for_horses)', 'hearing_aid', 'heart', 'heater',
+         'helicopter', 'helmet', 'heron', 'highchair', 'hinge', 'hippopotamus',
+         'hockey_stick', 'hog', 'home_plate_(baseball)', 'honey', 'fume_hood',
+         'hook', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce',
+         'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear',
+         'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate',
+         'ice_tea', 'igniter', 'incense', 'inhaler', 'iPod',
+         'iron_(for_clothing)', 'ironing_board', 'jacket', 'jam', 'jean',
+         'jeep', 'jelly_bean', 'jersey', 'jet_plane', 'jewelry', 'joystick',
+         'jumpsuit', 'kayak', 'keg', 'kennel', 'kettle', 'key', 'keycard',
+         'kilt', 'kimono', 'kitchen_sink', 'kitchen_table', 'kite', 'kitten',
+         'kiwi_fruit', 'knee_pad', 'knife', 'knight_(chess_piece)',
+         'knitting_needle', 'knob', 'knocker_(on_a_door)', 'koala', 'lab_coat',
+         'ladder', 'ladle', 'ladybug', 'lamb_(animal)', 'lamb-chop', 'lamp',
+         'lamppost', 'lampshade', 'lantern', 'lanyard', 'laptop_computer',
+         'lasagna', 'latch', 'lawn_mower', 'leather', 'legging_(clothing)',
+         'Lego', 'lemon', 'lemonade', 'lettuce', 'license_plate', 'life_buoy',
+         'life_jacket', 'lightbulb', 'lightning_rod', 'lime', 'limousine',
+         'linen_paper', 'lion', 'lip_balm', 'lipstick', 'liquor', 'lizard',
+         'Loafer_(type_of_shoe)', 'log', 'lollipop', 'lotion',
+         'speaker_(stereo_equipment)', 'loveseat', 'machine_gun', 'magazine',
+         'magnet', 'mail_slot', 'mailbox_(at_home)', 'mallet', 'mammoth',
+         'mandarin_orange', 'manger', 'manhole', 'map', 'marker', 'martini',
+         'mascot', 'mashed_potato', 'masher', 'mask', 'mast',
+         'mat_(gym_equipment)', 'matchbox', 'mattress', 'measuring_cup',
+         'measuring_stick', 'meatball', 'medicine', 'melon', 'microphone',
+         'microscope', 'microwave_oven', 'milestone', 'milk', 'minivan',
+         'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', 'money',
+         'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor',
+         'motor_scooter', 'motor_vehicle', 'motorboat', 'motorcycle',
+         'mound_(baseball)', 'mouse_(animal_rodent)',
+         'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom',
+         'music_stool', 'musical_instrument', 'nailfile', 'nameplate',
+         'napkin', 'neckerchief', 'necklace', 'necktie', 'needle', 'nest',
+         'newsstand', 'nightshirt', 'nosebag_(for_animals)',
+         'noseband_(for_animals)', 'notebook', 'notepad', 'nut', 'nutcracker',
+         'oar', 'octopus_(food)', 'octopus_(animal)', 'oil_lamp', 'olive_oil',
+         'omelet', 'onion', 'orange_(fruit)', 'orange_juice', 'oregano',
+         'ostrich', 'ottoman', 'overalls_(clothing)', 'owl', 'packet',
+         'inkpad', 'pad', 'paddle', 'padlock', 'paintbox', 'paintbrush',
+         'painting', 'pajamas', 'palette', 'pan_(for_cooking)',
+         'pan_(metal_container)', 'pancake', 'pantyhose', 'papaya',
+         'paperclip', 'paper_plate', 'paper_towel', 'paperback_book',
+         'paperweight', 'parachute', 'parakeet', 'parasail_(sports)',
+         'parchment', 'parka', 'parking_meter', 'parrot',
+         'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport',
+         'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter',
+         'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'pegboard',
+         'pelican', 'pen', 'pencil', 'pencil_box', 'pencil_sharpener',
+         'pendulum', 'penguin', 'pennant', 'penny_(coin)', 'pepper',
+         'pepper_mill', 'perfume', 'persimmon', 'baby', 'pet', 'petfood',
+         'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano',
+         'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow',
+         'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball',
+         'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)',
+         'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat',
+         'plate', 'platter', 'playing_card', 'playpen', 'pliers',
+         'plow_(farm_equipment)', 'pocket_watch', 'pocketknife',
+         'poker_(fire_stirring_tool)', 'pole', 'police_van', 'polo_shirt',
+         'poncho', 'pony', 'pool_table', 'pop_(soda)', 'portrait',
+         'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot',
+         'potato', 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn',
+         'printer', 'projectile_(weapon)', 'projector', 'propeller', 'prune',
+         'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher',
+         'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit',
+         'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', 'radish',
+         'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat',
+         'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt',
+         'recliner', 'record_player', 'red_cabbage', 'reflector',
+         'remote_control', 'rhinoceros', 'rib_(food)', 'rifle', 'ring',
+         'river_boat', 'road_map', 'robe', 'rocking_chair', 'roller_skate',
+         'Rollerblade', 'rolling_pin', 'root_beer',
+         'router_(computer_equipment)', 'rubber_band', 'runner_(carpet)',
+         'plastic_bag', 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag',
+         'safety_pin', 'sail', 'salad', 'salad_plate', 'salami',
+         'salmon_(fish)', 'salmon_(food)', 'salsa', 'saltshaker',
+         'sandal_(type_of_shoe)', 'sandwich', 'satchel', 'saucepan', 'saucer',
+         'sausage', 'sawhorse', 'saxophone', 'scale_(measuring_instrument)',
+         'scarecrow', 'scarf', 'school_bus', 'scissors', 'scoreboard',
+         'scrambled_eggs', 'scraper', 'scratcher', 'screwdriver',
+         'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane',
+         'seashell', 'seedling', 'serving_dish', 'sewing_machine', 'shaker',
+         'shampoo', 'shark', 'sharpener', 'Sharpie', 'shaver_(electric)',
+         'shaving_cream', 'shawl', 'shears', 'sheep', 'shepherd_dog',
+         'sherbert', 'shield', 'shirt', 'shoe', 'shopping_bag',
+         'shopping_cart', 'short_pants', 'shot_glass', 'shoulder_bag',
+         'shovel', 'shower_head', 'shower_curtain', 'shredder_(for_paper)',
+         'sieve', 'signboard', 'silo', 'sink', 'skateboard', 'skewer', 'ski',
+         'ski_boot', 'ski_parka', 'ski_pole', 'skirt', 'sled', 'sleeping_bag',
+         'sling_(bandage)', 'slipper_(footwear)', 'smoothie', 'snake',
+         'snowboard', 'snowman', 'snowmobile', 'soap', 'soccer_ball', 'sock',
+         'soda_fountain', 'carbonated_water', 'sofa', 'softball',
+         'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon',
+         'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)',
+         'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'sponge',
+         'spoon', 'sportswear', 'spotlight', 'squirrel',
+         'stapler_(stapling_machine)', 'starfish', 'statue_(sculpture)',
+         'steak_(food)', 'steak_knife', 'steamer_(kitchen_appliance)',
+         'steering_wheel', 'stencil', 'stepladder', 'step_stool',
+         'stereo_(sound_system)', 'stew', 'stirrer', 'stirrup',
+         'stockings_(leg_wear)', 'stool', 'stop_sign', 'brake_light', 'stove',
+         'strainer', 'strap', 'straw_(for_drinking)', 'strawberry',
+         'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer',
+         'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower',
+         'sunglasses', 'sunhat', 'sunscreen', 'surfboard', 'sushi', 'mop',
+         'sweat_pants', 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato',
+         'swimsuit', 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table',
+         'table', 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag',
+         'taillight', 'tambourine', 'army_tank', 'tank_(storage_vessel)',
+         'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure',
+         'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup',
+         'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth',
+         'telephone_pole', 'telephoto_lens', 'television_camera',
+         'television_set', 'tennis_ball', 'tennis_racket', 'tequila',
+         'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread',
+         'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer',
+         'tinfoil', 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster',
+         'toaster_oven', 'toilet', 'toilet_tissue', 'tomato', 'tongs',
+         'toolbox', 'toothbrush', 'toothpaste', 'toothpick', 'cover',
+         'tortilla', 'tow_truck', 'towel', 'towel_rack', 'toy',
+         'tractor_(farm_equipment)', 'traffic_light', 'dirt_bike',
+         'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', 'tray',
+         'tree_house', 'trench_coat', 'triangle_(musical_instrument)',
+         'tricycle', 'tripod', 'trousers', 'truck', 'truffle_(chocolate)',
+         'trunk', 'vat', 'turban', 'turkey_(bird)', 'turkey_(food)', 'turnip',
+         'turtle', 'turtleneck_(clothing)', 'typewriter', 'umbrella',
+         'underwear', 'unicycle', 'urinal', 'urn', 'vacuum_cleaner', 'valve',
+         'vase', 'vending_machine', 'vent', 'videotape', 'vinegar', 'violin',
+         'vodka', 'volleyball', 'vulture', 'waffle', 'waffle_iron', 'wagon',
+         'wagon_wheel', 'walking_stick', 'wall_clock', 'wall_socket', 'wallet',
+         'walrus', 'wardrobe', 'wasabi', 'automatic_washer', 'watch',
+         'water_bottle', 'water_cooler', 'water_faucet', 'water_filter',
+         'water_heater', 'water_jug', 'water_gun', 'water_scooter',
+         'water_ski', 'water_tower', 'watering_can', 'watermelon',
+         'weathervane', 'webcam', 'wedding_cake', 'wedding_ring', 'wet_suit',
+         'wheel', 'wheelchair', 'whipped_cream', 'whiskey', 'whistle', 'wick',
+         'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)',
+         'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket',
+         'wineglass', 'wing_chair', 'blinder_(for_horses)', 'wok', 'wolf',
+         'wooden_spoon', 'wreath', 'wrench', 'wristband', 'wristlet', 'yacht',
+         'yak', 'yogurt', 'yoke_(animal_equipment)', 'zebra', 'zucchini'),
+        'palette':
+        None
+    }
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of annotation.
+        """  # noqa: E501
+        try:
+            import lvis
+            if getattr(lvis, '__version__', '0') >= '10.5.3':
+                warnings.warn(
+                    'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"',  # noqa: E501
+                    UserWarning)
+            from lvis import LVIS
+        except ImportError:
+            raise ImportError(
+                'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".'  # noqa: E501
+            )
+        with get_local_path(
+                self.ann_file, backend_args=self.backend_args) as local_path:
+            self.lvis = LVIS(local_path)
+        self.cat_ids = self.lvis.get_cat_ids()
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.cat_img_map = copy.deepcopy(self.lvis.cat_img_map)
+
+        img_ids = self.lvis.get_img_ids()
+        data_list = []
+        total_ann_ids = []
+        for img_id in img_ids:
+            raw_img_info = self.lvis.load_imgs([img_id])[0]
+            raw_img_info['img_id'] = img_id
+            if raw_img_info['file_name'].startswith('COCO'):
+                # Convert form the COCO 2014 file naming convention of
+                # COCO_[train/val/test]2014_000000000000.jpg to the 2017
+                # naming convention of 000000000000.jpg
+                # (LVIS v1 will fix this naming issue)
+                raw_img_info['file_name'] = raw_img_info['file_name'][-16:]
+            ann_ids = self.lvis.get_ann_ids(img_ids=[img_id])
+            raw_ann_info = self.lvis.load_anns(ann_ids)
+            total_ann_ids.extend(ann_ids)
+
+            parsed_data_info = self.parse_data_info({
+                'raw_ann_info':
+                raw_ann_info,
+                'raw_img_info':
+                raw_img_info
+            })
+            data_list.append(parsed_data_info)
+        if self.ANN_ID_UNIQUE:
+            assert len(set(total_ann_ids)) == len(
+                total_ann_ids
+            ), f"Annotation ids in '{self.ann_file}' are not unique!"
+
+        del self.lvis
+
+        return data_list
+
+
+LVISDataset = LVISV05Dataset
+DATASETS.register_module(name='LVISDataset', module=LVISDataset)
+
+
+@DATASETS.register_module()
+class LVISV1Dataset(LVISDataset):
+    """LVIS v1 dataset for detection."""
+
+    METAINFO = {
+        'classes':
+        ('aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock',
+         'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet',
+         'antenna', 'apple', 'applesauce', 'apricot', 'apron', 'aquarium',
+         'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor',
+         'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer',
+         'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy',
+         'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel',
+         'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon',
+         'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo',
+         'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow',
+         'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap',
+         'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)',
+         'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)',
+         'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie',
+         'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper',
+         'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt',
+         'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor',
+         'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath',
+         'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card',
+         'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket',
+         'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry',
+         'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg',
+         'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase',
+         'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle',
+         'bottle_opener', 'bouquet', 'bow_(weapon)',
+         'bow_(decorative_ribbons)', 'bow-tie', 'bowl', 'pipe_bowl',
+         'bowler_hat', 'bowling_ball', 'box', 'boxing_glove', 'suspenders',
+         'bracelet', 'brass_plaque', 'brassiere', 'bread-bin', 'bread',
+         'breechcloth', 'bridal_gown', 'briefcase', 'broccoli', 'broach',
+         'broom', 'brownie', 'brussels_sprouts', 'bubble_gum', 'bucket',
+         'horse_buggy', 'bull', 'bulldog', 'bulldozer', 'bullet_train',
+         'bulletin_board', 'bulletproof_vest', 'bullhorn', 'bun', 'bunk_bed',
+         'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butter',
+         'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', 'cabinet',
+         'locker', 'cake', 'calculator', 'calendar', 'calf', 'camcorder',
+         'camel', 'camera', 'camera_lens', 'camper_(vehicle)', 'can',
+         'can_opener', 'candle', 'candle_holder', 'candy_bar', 'candy_cane',
+         'walking_cane', 'canister', 'canoe', 'cantaloup', 'canteen',
+         'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino',
+         'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car',
+         'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship',
+         'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton',
+         'cash_register', 'casserole', 'cassette', 'cast', 'cat',
+         'cauliflower', 'cayenne_(spice)', 'CD_player', 'celery',
+         'cellular_telephone', 'chain_mail', 'chair', 'chaise_longue',
+         'chalice', 'chandelier', 'chap', 'checkbook', 'checkerboard',
+         'cherry', 'chessboard', 'chicken_(animal)', 'chickpea',
+         'chili_(vegetable)', 'chime', 'chinaware', 'crisp_(potato_chip)',
+         'poker_chip', 'chocolate_bar', 'chocolate_cake', 'chocolate_milk',
+         'chocolate_mousse', 'choker', 'chopping_board', 'chopstick',
+         'Christmas_tree', 'slide', 'cider', 'cigar_box', 'cigarette',
+         'cigarette_case', 'cistern', 'clarinet', 'clasp', 'cleansing_agent',
+         'cleat_(for_securing_rope)', 'clementine', 'clip', 'clipboard',
+         'clippers_(for_plants)', 'cloak', 'clock', 'clock_tower',
+         'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', 'coat',
+         'coat_hanger', 'coatrack', 'cock', 'cockroach', 'cocoa_(beverage)',
+         'coconut', 'coffee_maker', 'coffee_table', 'coffeepot', 'coil',
+         'coin', 'colander', 'coleslaw', 'coloring_material',
+         'combination_lock', 'pacifier', 'comic_book', 'compass',
+         'computer_keyboard', 'condiment', 'cone', 'control',
+         'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie',
+         'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)',
+         'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet',
+         'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall',
+         'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker',
+         'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib',
+         'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown',
+         'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch',
+         'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup',
+         'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain',
+         'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard',
+         'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk',
+         'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table',
+         'tux', 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher',
+         'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup',
+         'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin',
+         'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove',
+         'dragonfly', 'drawer', 'underdrawers', 'dress', 'dress_hat',
+         'dress_suit', 'dresser', 'drill', 'drone', 'dropper',
+         'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling',
+         'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan', 'eagle',
+         'earphone', 'earplug', 'earring', 'easel', 'eclair', 'eel', 'egg',
+         'egg_roll', 'egg_yolk', 'eggbeater', 'eggplant', 'electric_chair',
+         'refrigerator', 'elephant', 'elk', 'envelope', 'eraser', 'escargot',
+         'eyepatch', 'falcon', 'fan', 'faucet', 'fedora', 'ferret',
+         'Ferris_wheel', 'ferry', 'fig_(fruit)', 'fighter_jet', 'figurine',
+         'file_cabinet', 'file_(tool)', 'fire_alarm', 'fire_engine',
+         'fire_extinguisher', 'fire_hose', 'fireplace', 'fireplug',
+         'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl', 'fishing_rod',
+         'flag', 'flagpole', 'flamingo', 'flannel', 'flap', 'flash',
+         'flashlight', 'fleece', 'flip-flop_(sandal)', 'flipper_(footwear)',
+         'flower_arrangement', 'flute_glass', 'foal', 'folding_chair',
+         'food_processor', 'football_(American)', 'football_helmet',
+         'footstool', 'fork', 'forklift', 'freight_car', 'French_toast',
+         'freshener', 'frisbee', 'frog', 'fruit_juice', 'frying_pan', 'fudge',
+         'funnel', 'futon', 'gag', 'garbage', 'garbage_truck', 'garden_hose',
+         'gargle', 'gargoyle', 'garlic', 'gasmask', 'gazelle', 'gelatin',
+         'gemstone', 'generator', 'giant_panda', 'gift_wrap', 'ginger',
+         'giraffe', 'cincture', 'glass_(drink_container)', 'globe', 'glove',
+         'goat', 'goggles', 'goldfish', 'golf_club', 'golfcart',
+         'gondola_(boat)', 'goose', 'gorilla', 'gourd', 'grape', 'grater',
+         'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle',
+         'grill', 'grits', 'grizzly', 'grocery_bag', 'guitar', 'gull', 'gun',
+         'hairbrush', 'hairnet', 'hairpin', 'halter_top', 'ham', 'hamburger',
+         'hammer', 'hammock', 'hamper', 'hamster', 'hair_dryer', 'hand_glass',
+         'hand_towel', 'handcart', 'handcuff', 'handkerchief', 'handle',
+         'handsaw', 'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil',
+         'headband', 'headboard', 'headlight', 'headscarf', 'headset',
+         'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet',
+         'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog',
+         'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah',
+         'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce',
+         'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear',
+         'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate',
+         'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board',
+         'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey',
+         'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak',
+         'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono',
+         'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit',
+         'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)',
+         'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)',
+         'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard',
+         'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather',
+         'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade',
+         'lettuce', 'license_plate', 'life_buoy', 'life_jacket', 'lightbulb',
+         'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor',
+         'lizard', 'log', 'lollipop', 'speaker_(stereo_equipment)', 'loveseat',
+         'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)',
+         'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange',
+         'manger', 'manhole', 'map', 'marker', 'martini', 'mascot',
+         'mashed_potato', 'masher', 'mask', 'mast', 'mat_(gym_equipment)',
+         'matchbox', 'mattress', 'measuring_cup', 'measuring_stick',
+         'meatball', 'medicine', 'melon', 'microphone', 'microscope',
+         'microwave_oven', 'milestone', 'milk', 'milk_can', 'milkshake',
+         'minivan', 'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)',
+         'money', 'monitor_(computer_equipment) computer_monitor', 'monkey',
+         'motor', 'motor_scooter', 'motor_vehicle', 'motorcycle',
+         'mound_(baseball)', 'mouse_(computer_equipment)', 'mousepad',
+         'muffin', 'mug', 'mushroom', 'music_stool', 'musical_instrument',
+         'nailfile', 'napkin', 'neckerchief', 'necklace', 'necktie', 'needle',
+         'nest', 'newspaper', 'newsstand', 'nightshirt',
+         'nosebag_(for_animals)', 'noseband_(for_animals)', 'notebook',
+         'notepad', 'nut', 'nutcracker', 'oar', 'octopus_(food)',
+         'octopus_(animal)', 'oil_lamp', 'olive_oil', 'omelet', 'onion',
+         'orange_(fruit)', 'orange_juice', 'ostrich', 'ottoman', 'oven',
+         'overalls_(clothing)', 'owl', 'packet', 'inkpad', 'pad', 'paddle',
+         'padlock', 'paintbrush', 'painting', 'pajamas', 'palette',
+         'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', 'pantyhose',
+         'papaya', 'paper_plate', 'paper_towel', 'paperback_book',
+         'paperweight', 'parachute', 'parakeet', 'parasail_(sports)',
+         'parasol', 'parchment', 'parka', 'parking_meter', 'parrot',
+         'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport',
+         'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter',
+         'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg',
+         'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box',
+         'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)',
+         'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet',
+         'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano',
+         'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow',
+         'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball',
+         'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)',
+         'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat',
+         'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)',
+         'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)',
+         'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)',
+         'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot',
+         'potato', 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn',
+         'pretzel', 'printer', 'projectile_(weapon)', 'projector', 'propeller',
+         'prune', 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin',
+         'puncher', 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt',
+         'rabbit', 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver',
+         'radish', 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry',
+         'rat', 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt',
+         'recliner', 'record_player', 'reflector', 'remote_control',
+         'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map',
+         'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade',
+         'rolling_pin', 'root_beer', 'router_(computer_equipment)',
+         'rubber_band', 'runner_(carpet)', 'plastic_bag',
+         'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin',
+         'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)',
+         'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)',
+         'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse',
+         'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf',
+         'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver',
+         'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane',
+         'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark',
+         'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl',
+         'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt',
+         'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass',
+         'shoulder_bag', 'shovel', 'shower_head', 'shower_cap',
+         'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink',
+         'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole',
+         'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)',
+         'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman',
+         'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball',
+         'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon',
+         'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)',
+         'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish',
+         'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)',
+         'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish',
+         'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel',
+         'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew',
+         'stirrer', 'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove',
+         'strainer', 'strap', 'straw_(for_drinking)', 'strawberry',
+         'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer',
+         'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower',
+         'sunglasses', 'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants',
+         'sweatband', 'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit',
+         'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table',
+         'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight',
+         'tambourine', 'army_tank', 'tank_(storage_vessel)',
+         'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure',
+         'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup',
+         'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth',
+         'telephone_pole', 'telephoto_lens', 'television_camera',
+         'television_set', 'tennis_ball', 'tennis_racket', 'tequila',
+         'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread',
+         'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer',
+         'tinfoil', 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster',
+         'toaster_oven', 'toilet', 'toilet_tissue', 'tomato', 'tongs',
+         'toolbox', 'toothbrush', 'toothpaste', 'toothpick', 'cover',
+         'tortilla', 'tow_truck', 'towel', 'towel_rack', 'toy',
+         'tractor_(farm_equipment)', 'traffic_light', 'dirt_bike',
+         'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', 'tray',
+         'trench_coat', 'triangle_(musical_instrument)', 'tricycle', 'tripod',
+         'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat', 'turban',
+         'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)',
+         'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn',
+         'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest',
+         'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture',
+         'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick',
+         'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe',
+         'washbasin', 'automatic_washer', 'watch', 'water_bottle',
+         'water_cooler', 'water_faucet', 'water_heater', 'water_jug',
+         'water_gun', 'water_scooter', 'water_ski', 'water_tower',
+         'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake',
+         'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream',
+         'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)',
+         'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket',
+         'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon',
+         'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt',
+         'yoke_(animal_equipment)', 'zebra', 'zucchini'),
+        'palette':
+        None
+    }
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of annotation.
+        """  # noqa: E501
+        try:
+            import lvis
+            if getattr(lvis, '__version__', '0') >= '10.5.3':
+                warnings.warn(
+                    'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"',  # noqa: E501
+                    UserWarning)
+            from lvis import LVIS
+        except ImportError:
+            raise ImportError(
+                'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".'  # noqa: E501
+            )
+        with get_local_path(
+                self.ann_file, backend_args=self.backend_args) as local_path:
+            self.lvis = LVIS(local_path)
+        self.cat_ids = self.lvis.get_cat_ids()
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.cat_img_map = copy.deepcopy(self.lvis.cat_img_map)
+
+        img_ids = self.lvis.get_img_ids()
+        data_list = []
+        total_ann_ids = []
+        for img_id in img_ids:
+            raw_img_info = self.lvis.load_imgs([img_id])[0]
+            raw_img_info['img_id'] = img_id
+            # coco_url is used in LVISv1 instead of file_name
+            # e.g. http://images.cocodataset.org/train2017/000000391895.jpg
+            # train/val split in specified in url
+            raw_img_info['file_name'] = raw_img_info['coco_url'].replace(
+                'http://images.cocodataset.org/', '')
+            ann_ids = self.lvis.get_ann_ids(img_ids=[img_id])
+            raw_ann_info = self.lvis.load_anns(ann_ids)
+            total_ann_ids.extend(ann_ids)
+            parsed_data_info = self.parse_data_info({
+                'raw_ann_info':
+                raw_ann_info,
+                'raw_img_info':
+                raw_img_info
+            })
+            data_list.append(parsed_data_info)
+        if self.ANN_ID_UNIQUE:
+            assert len(set(total_ann_ids)) == len(
+                total_ann_ids
+            ), f"Annotation ids in '{self.ann_file}' are not unique!"
+
+        del self.lvis
+
+        return data_list
diff --git a/mmde/mmdet/datasets/mdetr_style_refcoco.py b/mmde/mmdet/datasets/mdetr_style_refcoco.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc56dec49db72daddf929bcc65471ffc2ca6fb4d
--- /dev/null
+++ b/mmde/mmdet/datasets/mdetr_style_refcoco.py
@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import List
+
+from mmengine.fileio import get_local_path
+
+from mmdet.datasets import BaseDetDataset
+from mmdet.registry import DATASETS
+from .api_wrappers import COCO
+
+
+@DATASETS.register_module()
+class MDETRStyleRefCocoDataset(BaseDetDataset):
+    """RefCOCO dataset.
+
+    Only support evaluation now.
+    """
+
+    def load_data_list(self) -> List[dict]:
+        with get_local_path(
+                self.ann_file, backend_args=self.backend_args) as local_path:
+            coco = COCO(local_path)
+
+        img_ids = coco.get_img_ids()
+
+        data_infos = []
+        for img_id in img_ids:
+            raw_img_info = coco.load_imgs([img_id])[0]
+            ann_ids = coco.get_ann_ids(img_ids=[img_id])
+            raw_ann_info = coco.load_anns(ann_ids)
+
+            data_info = {}
+            img_path = osp.join(self.data_prefix['img'],
+                                raw_img_info['file_name'])
+            data_info['img_path'] = img_path
+            data_info['img_id'] = img_id
+            data_info['height'] = raw_img_info['height']
+            data_info['width'] = raw_img_info['width']
+            data_info['dataset_mode'] = raw_img_info['dataset_name']
+
+            data_info['text'] = raw_img_info['caption']
+            data_info['custom_entities'] = False
+            data_info['tokens_positive'] = -1
+
+            instances = []
+            for i, ann in enumerate(raw_ann_info):
+                instance = {}
+                x1, y1, w, h = ann['bbox']
+                bbox = [x1, y1, x1 + w, y1 + h]
+                instance['bbox'] = bbox
+                instance['bbox_label'] = ann['category_id']
+                instance['ignore_flag'] = 0
+                instances.append(instance)
+
+            data_info['instances'] = instances
+            data_infos.append(data_info)
+        return data_infos
diff --git a/mmde/mmdet/datasets/mot_challenge_dataset.py b/mmde/mmdet/datasets/mot_challenge_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffbdc48ebf8d4a4ba11a605c8bc2a479cf2a0c96
--- /dev/null
+++ b/mmde/mmdet/datasets/mot_challenge_dataset.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import List, Union
+
+from mmdet.registry import DATASETS
+from .base_video_dataset import BaseVideoDataset
+
+
+@DATASETS.register_module()
+class MOTChallengeDataset(BaseVideoDataset):
+    """Dataset for MOTChallenge.
+
+    Args:
+        visibility_thr (float, optional): The minimum visibility
+            for the objects during training. Default to -1.
+    """
+
+    METAINFO = {
+        'classes':
+        ('pedestrian', 'person_on_vehicle', 'car', 'bicycle', 'motorbike',
+         'non_mot_vehicle', 'static_person', 'distractor', 'occluder',
+         'occluder_on_ground', 'occluder_full', 'reflection', 'crowd')
+    }
+
+    def __init__(self, visibility_thr: float = -1, *args, **kwargs):
+        self.visibility_thr = visibility_thr
+        super().__init__(*args, **kwargs)
+
+    def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]:
+        """Parse raw annotation to target format. The difference between this
+        function and the one in ``BaseVideoDataset`` is that the parsing here
+        adds ``visibility`` and ``mot_conf``.
+
+        Args:
+            raw_data_info (dict): Raw data information load from ``ann_file``
+
+        Returns:
+            Union[dict, List[dict]]: Parsed annotation.
+        """
+        img_info = raw_data_info['raw_img_info']
+        ann_info = raw_data_info['raw_ann_info']
+        data_info = {}
+
+        data_info.update(img_info)
+        if self.data_prefix.get('img_path', None) is not None:
+            img_path = osp.join(self.data_prefix['img_path'],
+                                img_info['file_name'])
+        else:
+            img_path = img_info['file_name']
+        data_info['img_path'] = img_path
+
+        instances = []
+        for i, ann in enumerate(ann_info):
+            instance = {}
+
+            if (not self.test_mode) and (ann['visibility'] <
+                                         self.visibility_thr):
+                continue
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                continue
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            if ann['category_id'] not in self.cat_ids:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+
+            if ann.get('iscrowd', False):
+                instance['ignore_flag'] = 1
+            else:
+                instance['ignore_flag'] = 0
+            instance['bbox'] = bbox
+            instance['bbox_label'] = self.cat2label[ann['category_id']]
+            instance['instance_id'] = ann['instance_id']
+            instance['category_id'] = ann['category_id']
+            instance['mot_conf'] = ann['mot_conf']
+            instance['visibility'] = ann['visibility']
+            if len(instance) > 0:
+                instances.append(instance)
+        if not self.test_mode:
+            assert len(instances) > 0, f'No valid instances found in ' \
+                f'image {data_info["img_path"]}!'
+        data_info['instances'] = instances
+        return data_info
diff --git a/mmde/mmdet/datasets/objects365.py b/mmde/mmdet/datasets/objects365.py
new file mode 100644
index 0000000000000000000000000000000000000000..e99869bfa309635af3c03cbfa77f732db3f50637
--- /dev/null
+++ b/mmde/mmdet/datasets/objects365.py
@@ -0,0 +1,284 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+from typing import List
+
+from mmengine.fileio import get_local_path
+
+from mmdet.registry import DATASETS
+from .api_wrappers import COCO
+from .coco import CocoDataset
+
+# images exist in annotations but not in image folder.
+objv2_ignore_list = [
+    osp.join('patch16', 'objects365_v2_00908726.jpg'),
+    osp.join('patch6', 'objects365_v1_00320532.jpg'),
+    osp.join('patch6', 'objects365_v1_00320534.jpg'),
+]
+
+
+@DATASETS.register_module()
+class Objects365V1Dataset(CocoDataset):
+    """Objects365 v1 dataset for detection."""
+
+    METAINFO = {
+        'classes':
+        ('person', 'sneakers', 'chair', 'hat', 'lamp', 'bottle',
+         'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk',
+         'handbag', 'street lights', 'book', 'plate', 'helmet',
+         'leather shoes', 'pillow', 'glove', 'potted plant', 'bracelet',
+         'flower', 'tv', 'storage box', 'vase', 'bench', 'wine glass', 'boots',
+         'bowl', 'dining table', 'umbrella', 'boat', 'flag', 'speaker',
+         'trash bin/can', 'stool', 'backpack', 'couch', 'belt', 'carpet',
+         'basket', 'towel/napkin', 'slippers', 'barrel/bucket', 'coffee table',
+         'suv', 'toy', 'tie', 'bed', 'traffic light', 'pen/pencil',
+         'microphone', 'sandals', 'canned', 'necklace', 'mirror', 'faucet',
+         'bicycle', 'bread', 'high heels', 'ring', 'van', 'watch', 'sink',
+         'horse', 'fish', 'apple', 'camera', 'candle', 'teddy bear', 'cake',
+         'motorcycle', 'wild bird', 'laptop', 'knife', 'traffic sign',
+         'cell phone', 'paddle', 'truck', 'cow', 'power outlet', 'clock',
+         'drum', 'fork', 'bus', 'hanger', 'nightstand', 'pot/pan', 'sheep',
+         'guitar', 'traffic cone', 'tea pot', 'keyboard', 'tripod', 'hockey',
+         'fan', 'dog', 'spoon', 'blackboard/whiteboard', 'balloon',
+         'air conditioner', 'cymbal', 'mouse', 'telephone', 'pickup truck',
+         'orange', 'banana', 'airplane', 'luggage', 'skis', 'soccer',
+         'trolley', 'oven', 'remote', 'baseball glove', 'paper towel',
+         'refrigerator', 'train', 'tomato', 'machinery vehicle', 'tent',
+         'shampoo/shower gel', 'head phone', 'lantern', 'donut',
+         'cleaning products', 'sailboat', 'tangerine', 'pizza', 'kite',
+         'computer box', 'elephant', 'toiletries', 'gas stove', 'broccoli',
+         'toilet', 'stroller', 'shovel', 'baseball bat', 'microwave',
+         'skateboard', 'surfboard', 'surveillance camera', 'gun', 'life saver',
+         'cat', 'lemon', 'liquid soap', 'zebra', 'duck', 'sports car',
+         'giraffe', 'pumpkin', 'piano', 'stop sign', 'radiator', 'converter',
+         'tissue ', 'carrot', 'washing machine', 'vent', 'cookies',
+         'cutting/chopping board', 'tennis racket', 'candy',
+         'skating and skiing shoes', 'scissors', 'folder', 'baseball',
+         'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine',
+         'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear',
+         'american football', 'basketball', 'potato', 'paint brush', 'printer',
+         'billiards', 'fire hydrant', 'goose', 'projector', 'sausage',
+         'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball',
+         'chopsticks', 'electronic stove and gas stove', 'pie', 'frisbee',
+         'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender',
+         'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango',
+         'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion',
+         'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale',
+         'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple',
+         'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle',
+         'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar',
+         'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD',
+         'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado',
+         'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear',
+         'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn',
+         'key', 'screwdriver', 'globe', 'broom', 'pliers', 'volleyball',
+         'hammer', 'eggplant', 'trophy', 'dates', 'board eraser', 'rice',
+         'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel',
+         'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste',
+         'antelope', 'shrimp', 'rickshaw', 'trombone', 'pomegranate',
+         'coconut', 'jellyfish', 'mushroom', 'calculator', 'treadmill',
+         'butterfly', 'egg tart', 'cheese', 'pig', 'pomelo', 'race car',
+         'rice cooker', 'tuba', 'crosswalk sign', 'papaya', 'hair drier',
+         'green onion', 'chips', 'dolphin', 'sushi', 'urinal', 'donkey',
+         'electric drill', 'spring rolls', 'tortoise/turtle', 'parrot',
+         'flute', 'measuring cup', 'shark', 'steak', 'poker card',
+         'binoculars', 'llama', 'radish', 'noodles', 'yak', 'mop', 'crab',
+         'microscope', 'barbell', 'bread/bun', 'baozi', 'lion', 'red cabbage',
+         'polar bear', 'lighter', 'seal', 'mangosteen', 'comb', 'eraser',
+         'pitaya', 'scallop', 'pencil case', 'saw', 'table tennis paddle',
+         'okra', 'starfish', 'eagle', 'monkey', 'durian', 'game board',
+         'rabbit', 'french horn', 'ambulance', 'asparagus', 'hoverboard',
+         'pasta', 'target', 'hotair balloon', 'chainsaw', 'lobster', 'iron',
+         'flashlight'),
+        'palette':
+        None
+    }
+
+    COCOAPI = COCO
+    # ann_id is unique in coco dataset.
+    ANN_ID_UNIQUE = True
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of annotation.
+        """  # noqa: E501
+        with get_local_path(
+                self.ann_file, backend_args=self.backend_args) as local_path:
+            self.coco = self.COCOAPI(local_path)
+
+        # 'categories' list in objects365_train.json and objects365_val.json
+        # is inconsistent, need sort list(or dict) before get cat_ids.
+        cats = self.coco.cats
+        sorted_cats = {i: cats[i] for i in sorted(cats)}
+        self.coco.cats = sorted_cats
+        categories = self.coco.dataset['categories']
+        sorted_categories = sorted(categories, key=lambda i: i['id'])
+        self.coco.dataset['categories'] = sorted_categories
+        # The order of returned `cat_ids` will not
+        # change with the order of the `classes`
+        self.cat_ids = self.coco.get_cat_ids(
+            cat_names=self.metainfo['classes'])
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.cat_img_map = copy.deepcopy(self.coco.cat_img_map)
+
+        img_ids = self.coco.get_img_ids()
+        data_list = []
+        total_ann_ids = []
+        for img_id in img_ids:
+            raw_img_info = self.coco.load_imgs([img_id])[0]
+            raw_img_info['img_id'] = img_id
+
+            ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
+            raw_ann_info = self.coco.load_anns(ann_ids)
+            total_ann_ids.extend(ann_ids)
+
+            parsed_data_info = self.parse_data_info({
+                'raw_ann_info':
+                raw_ann_info,
+                'raw_img_info':
+                raw_img_info
+            })
+            data_list.append(parsed_data_info)
+        if self.ANN_ID_UNIQUE:
+            assert len(set(total_ann_ids)) == len(
+                total_ann_ids
+            ), f"Annotation ids in '{self.ann_file}' are not unique!"
+
+        del self.coco
+
+        return data_list
+
+
+@DATASETS.register_module()
+class Objects365V2Dataset(CocoDataset):
+    """Objects365 v2 dataset for detection."""
+    METAINFO = {
+        'classes':
+        ('Person', 'Sneakers', 'Chair', 'Other Shoes', 'Hat', 'Car', 'Lamp',
+         'Glasses', 'Bottle', 'Desk', 'Cup', 'Street Lights', 'Cabinet/shelf',
+         'Handbag/Satchel', 'Bracelet', 'Plate', 'Picture/Frame', 'Helmet',
+         'Book', 'Gloves', 'Storage box', 'Boat', 'Leather Shoes', 'Flower',
+         'Bench', 'Potted Plant', 'Bowl/Basin', 'Flag', 'Pillow', 'Boots',
+         'Vase', 'Microphone', 'Necklace', 'Ring', 'SUV', 'Wine Glass', 'Belt',
+         'Moniter/TV', 'Backpack', 'Umbrella', 'Traffic Light', 'Speaker',
+         'Watch', 'Tie', 'Trash bin Can', 'Slippers', 'Bicycle', 'Stool',
+         'Barrel/bucket', 'Van', 'Couch', 'Sandals', 'Bakset', 'Drum',
+         'Pen/Pencil', 'Bus', 'Wild Bird', 'High Heels', 'Motorcycle',
+         'Guitar', 'Carpet', 'Cell Phone', 'Bread', 'Camera', 'Canned',
+         'Truck', 'Traffic cone', 'Cymbal', 'Lifesaver', 'Towel',
+         'Stuffed Toy', 'Candle', 'Sailboat', 'Laptop', 'Awning', 'Bed',
+         'Faucet', 'Tent', 'Horse', 'Mirror', 'Power outlet', 'Sink', 'Apple',
+         'Air Conditioner', 'Knife', 'Hockey Stick', 'Paddle', 'Pickup Truck',
+         'Fork', 'Traffic Sign', 'Ballon', 'Tripod', 'Dog', 'Spoon', 'Clock',
+         'Pot', 'Cow', 'Cake', 'Dinning Table', 'Sheep', 'Hanger',
+         'Blackboard/Whiteboard', 'Napkin', 'Other Fish', 'Orange/Tangerine',
+         'Toiletry', 'Keyboard', 'Tomato', 'Lantern', 'Machinery Vehicle',
+         'Fan', 'Green Vegetables', 'Banana', 'Baseball Glove', 'Airplane',
+         'Mouse', 'Train', 'Pumpkin', 'Soccer', 'Skiboard', 'Luggage',
+         'Nightstand', 'Tea pot', 'Telephone', 'Trolley', 'Head Phone',
+         'Sports Car', 'Stop Sign', 'Dessert', 'Scooter', 'Stroller', 'Crane',
+         'Remote', 'Refrigerator', 'Oven', 'Lemon', 'Duck', 'Baseball Bat',
+         'Surveillance Camera', 'Cat', 'Jug', 'Broccoli', 'Piano', 'Pizza',
+         'Elephant', 'Skateboard', 'Surfboard', 'Gun',
+         'Skating and Skiing shoes', 'Gas stove', 'Donut', 'Bow Tie', 'Carrot',
+         'Toilet', 'Kite', 'Strawberry', 'Other Balls', 'Shovel', 'Pepper',
+         'Computer Box', 'Toilet Paper', 'Cleaning Products', 'Chopsticks',
+         'Microwave', 'Pigeon', 'Baseball', 'Cutting/chopping Board',
+         'Coffee Table', 'Side Table', 'Scissors', 'Marker', 'Pie', 'Ladder',
+         'Snowboard', 'Cookies', 'Radiator', 'Fire Hydrant', 'Basketball',
+         'Zebra', 'Grape', 'Giraffe', 'Potato', 'Sausage', 'Tricycle',
+         'Violin', 'Egg', 'Fire Extinguisher', 'Candy', 'Fire Truck',
+         'Billards', 'Converter', 'Bathtub', 'Wheelchair', 'Golf Club',
+         'Briefcase', 'Cucumber', 'Cigar/Cigarette ', 'Paint Brush', 'Pear',
+         'Heavy Truck', 'Hamburger', 'Extractor', 'Extention Cord', 'Tong',
+         'Tennis Racket', 'Folder', 'American Football', 'earphone', 'Mask',
+         'Kettle', 'Tennis', 'Ship', 'Swing', 'Coffee Machine', 'Slide',
+         'Carriage', 'Onion', 'Green beans', 'Projector', 'Frisbee',
+         'Washing Machine/Drying Machine', 'Chicken', 'Printer', 'Watermelon',
+         'Saxophone', 'Tissue', 'Toothbrush', 'Ice cream', 'Hotair ballon',
+         'Cello', 'French Fries', 'Scale', 'Trophy', 'Cabbage', 'Hot dog',
+         'Blender', 'Peach', 'Rice', 'Wallet/Purse', 'Volleyball', 'Deer',
+         'Goose', 'Tape', 'Tablet', 'Cosmetics', 'Trumpet', 'Pineapple',
+         'Golf Ball', 'Ambulance', 'Parking meter', 'Mango', 'Key', 'Hurdle',
+         'Fishing Rod', 'Medal', 'Flute', 'Brush', 'Penguin', 'Megaphone',
+         'Corn', 'Lettuce', 'Garlic', 'Swan', 'Helicopter', 'Green Onion',
+         'Sandwich', 'Nuts', 'Speed Limit Sign', 'Induction Cooker', 'Broom',
+         'Trombone', 'Plum', 'Rickshaw', 'Goldfish', 'Kiwi fruit',
+         'Router/modem', 'Poker Card', 'Toaster', 'Shrimp', 'Sushi', 'Cheese',
+         'Notepaper', 'Cherry', 'Pliers', 'CD', 'Pasta', 'Hammer', 'Cue',
+         'Avocado', 'Hamimelon', 'Flask', 'Mushroon', 'Screwdriver', 'Soap',
+         'Recorder', 'Bear', 'Eggplant', 'Board Eraser', 'Coconut',
+         'Tape Measur/ Ruler', 'Pig', 'Showerhead', 'Globe', 'Chips', 'Steak',
+         'Crosswalk Sign', 'Stapler', 'Campel', 'Formula 1 ', 'Pomegranate',
+         'Dishwasher', 'Crab', 'Hoverboard', 'Meat ball', 'Rice Cooker',
+         'Tuba', 'Calculator', 'Papaya', 'Antelope', 'Parrot', 'Seal',
+         'Buttefly', 'Dumbbell', 'Donkey', 'Lion', 'Urinal', 'Dolphin',
+         'Electric Drill', 'Hair Dryer', 'Egg tart', 'Jellyfish', 'Treadmill',
+         'Lighter', 'Grapefruit', 'Game board', 'Mop', 'Radish', 'Baozi',
+         'Target', 'French', 'Spring Rolls', 'Monkey', 'Rabbit', 'Pencil Case',
+         'Yak', 'Red Cabbage', 'Binoculars', 'Asparagus', 'Barbell', 'Scallop',
+         'Noddles', 'Comb', 'Dumpling', 'Oyster', 'Table Teniis paddle',
+         'Cosmetics Brush/Eyeliner Pencil', 'Chainsaw', 'Eraser', 'Lobster',
+         'Durian', 'Okra', 'Lipstick', 'Cosmetics Mirror', 'Curling',
+         'Table Tennis '),
+        'palette':
+        None
+    }
+
+    COCOAPI = COCO
+    # ann_id is unique in coco dataset.
+    ANN_ID_UNIQUE = True
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of annotation.
+        """  # noqa: E501
+        with get_local_path(
+                self.ann_file, backend_args=self.backend_args) as local_path:
+            self.coco = self.COCOAPI(local_path)
+        # The order of returned `cat_ids` will not
+        # change with the order of the `classes`
+        self.cat_ids = self.coco.get_cat_ids(
+            cat_names=self.metainfo['classes'])
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.cat_img_map = copy.deepcopy(self.coco.cat_img_map)
+
+        img_ids = self.coco.get_img_ids()
+        data_list = []
+        total_ann_ids = []
+        for img_id in img_ids:
+            raw_img_info = self.coco.load_imgs([img_id])[0]
+            raw_img_info['img_id'] = img_id
+
+            ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
+            raw_ann_info = self.coco.load_anns(ann_ids)
+            total_ann_ids.extend(ann_ids)
+
+            # file_name should be `patchX/xxx.jpg`
+            file_name = osp.join(
+                osp.split(osp.split(raw_img_info['file_name'])[0])[-1],
+                osp.split(raw_img_info['file_name'])[-1])
+
+            if file_name in objv2_ignore_list:
+                continue
+
+            raw_img_info['file_name'] = file_name
+            parsed_data_info = self.parse_data_info({
+                'raw_ann_info':
+                raw_ann_info,
+                'raw_img_info':
+                raw_img_info
+            })
+            data_list.append(parsed_data_info)
+        if self.ANN_ID_UNIQUE:
+            assert len(set(total_ann_ids)) == len(
+                total_ann_ids
+            ), f"Annotation ids in '{self.ann_file}' are not unique!"
+
+        del self.coco
+
+        return data_list
diff --git a/mmde/mmdet/datasets/odvg.py b/mmde/mmdet/datasets/odvg.py
new file mode 100644
index 0000000000000000000000000000000000000000..c73865f2ea724205640bea2c701c355bbd9135e3
--- /dev/null
+++ b/mmde/mmdet/datasets/odvg.py
@@ -0,0 +1,106 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from typing import List, Optional
+
+from mmengine.fileio import get_local_path
+
+from mmdet.registry import DATASETS
+from .base_det_dataset import BaseDetDataset
+
+
+@DATASETS.register_module()
+class ODVGDataset(BaseDetDataset):
+    """object detection and visual grounding dataset."""
+
+    def __init__(self,
+                 *args,
+                 data_root: str = '',
+                 label_map_file: Optional[str] = None,
+                 need_text: bool = True,
+                 **kwargs) -> None:
+        self.dataset_mode = 'VG'
+        self.need_text = need_text
+        if label_map_file:
+            label_map_file = osp.join(data_root, label_map_file)
+            with open(label_map_file, 'r') as file:
+                self.label_map = json.load(file)
+            self.dataset_mode = 'OD'
+        super().__init__(*args, data_root=data_root, **kwargs)
+        assert self.return_classes is True
+
+    def load_data_list(self) -> List[dict]:
+        with get_local_path(
+                self.ann_file, backend_args=self.backend_args) as local_path:
+            with open(local_path, 'r') as f:
+                data_list = [json.loads(line) for line in f]
+
+        out_data_list = []
+        for data in data_list:
+            data_info = {}
+            img_path = osp.join(self.data_prefix['img'], data['filename'])
+            data_info['img_path'] = img_path
+            data_info['height'] = data['height']
+            data_info['width'] = data['width']
+            if self.dataset_mode == 'OD':
+                if self.need_text:
+                    data_info['text'] = self.label_map
+                anno = data.get('detection', {})
+                instances = [obj for obj in anno.get('instances', [])]
+                bboxes = [obj['bbox'] for obj in instances]
+                bbox_labels = [str(obj['label']) for obj in instances]
+
+                instances = []
+                for bbox, label in zip(bboxes, bbox_labels):
+                    instance = {}
+                    x1, y1, x2, y2 = bbox
+                    inter_w = max(0, min(x2, data['width']) - max(x1, 0))
+                    inter_h = max(0, min(y2, data['height']) - max(y1, 0))
+                    if inter_w * inter_h == 0:
+                        continue
+                    if (x2 - x1) < 1 or (y2 - y1) < 1:
+                        continue
+                    instance['ignore_flag'] = 0
+                    instance['bbox'] = bbox
+                    instance['bbox_label'] = int(label)
+                    instances.append(instance)
+                data_info['instances'] = instances
+                data_info['dataset_mode'] = self.dataset_mode
+                out_data_list.append(data_info)
+            else:
+                anno = data['grounding']
+                data_info['text'] = anno['caption']
+                regions = anno['regions']
+
+                instances = []
+                phrases = {}
+                for i, region in enumerate(regions):
+                    bbox = region['bbox']
+                    phrase = region['phrase']
+                    tokens_positive = region['tokens_positive']
+                    if not isinstance(bbox[0], list):
+                        bbox = [bbox]
+                    for box in bbox:
+                        instance = {}
+                        x1, y1, x2, y2 = box
+                        inter_w = max(0, min(x2, data['width']) - max(x1, 0))
+                        inter_h = max(0, min(y2, data['height']) - max(y1, 0))
+                        if inter_w * inter_h == 0:
+                            continue
+                        if (x2 - x1) < 1 or (y2 - y1) < 1:
+                            continue
+                        instance['ignore_flag'] = 0
+                        instance['bbox'] = box
+                        instance['bbox_label'] = i
+                        phrases[i] = {
+                            'phrase': phrase,
+                            'tokens_positive': tokens_positive
+                        }
+                        instances.append(instance)
+                data_info['instances'] = instances
+                data_info['phrases'] = phrases
+                data_info['dataset_mode'] = self.dataset_mode
+                out_data_list.append(data_info)
+
+        del data_list
+        return out_data_list
diff --git a/mmde/mmdet/datasets/openimages.py b/mmde/mmdet/datasets/openimages.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3c6c8ec44fdfe86a653fc6a716009836f7d471c
--- /dev/null
+++ b/mmde/mmdet/datasets/openimages.py
@@ -0,0 +1,484 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import csv
+import os.path as osp
+from collections import defaultdict
+from typing import Dict, List, Optional
+
+import numpy as np
+from mmengine.fileio import get_local_path, load
+from mmengine.utils import is_abs
+
+from mmdet.registry import DATASETS
+from .base_det_dataset import BaseDetDataset
+
+
+@DATASETS.register_module()
+class OpenImagesDataset(BaseDetDataset):
+    """Open Images dataset for detection.
+
+    Args:
+        ann_file (str): Annotation file path.
+        label_file (str): File path of the label description file that
+            maps the classes names in MID format to their short
+            descriptions.
+        meta_file (str): File path to get image metas.
+        hierarchy_file (str): The file path of the class hierarchy.
+        image_level_ann_file (str): Human-verified image level annotation,
+            which is used in evaluation.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    METAINFO: dict = dict(dataset_type='oid_v6')
+
+    def __init__(self,
+                 label_file: str,
+                 meta_file: str,
+                 hierarchy_file: str,
+                 image_level_ann_file: Optional[str] = None,
+                 **kwargs) -> None:
+        self.label_file = label_file
+        self.meta_file = meta_file
+        self.hierarchy_file = hierarchy_file
+        self.image_level_ann_file = image_level_ann_file
+        super().__init__(**kwargs)
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of annotation.
+        """
+        classes_names, label_id_mapping = self._parse_label_file(
+            self.label_file)
+        self._metainfo['classes'] = classes_names
+        self.label_id_mapping = label_id_mapping
+
+        if self.image_level_ann_file is not None:
+            img_level_anns = self._parse_img_level_ann(
+                self.image_level_ann_file)
+        else:
+            img_level_anns = None
+
+        # OpenImagesMetric can get the relation matrix from the dataset meta
+        relation_matrix = self._get_relation_matrix(self.hierarchy_file)
+        self._metainfo['RELATION_MATRIX'] = relation_matrix
+
+        data_list = []
+        with get_local_path(
+                self.ann_file, backend_args=self.backend_args) as local_path:
+            with open(local_path, 'r') as f:
+                reader = csv.reader(f)
+                last_img_id = None
+                instances = []
+                for i, line in enumerate(reader):
+                    if i == 0:
+                        continue
+                    img_id = line[0]
+                    if last_img_id is None:
+                        last_img_id = img_id
+                    label_id = line[2]
+                    assert label_id in self.label_id_mapping
+                    label = int(self.label_id_mapping[label_id])
+                    bbox = [
+                        float(line[4]),  # xmin
+                        float(line[6]),  # ymin
+                        float(line[5]),  # xmax
+                        float(line[7])  # ymax
+                    ]
+                    is_occluded = True if int(line[8]) == 1 else False
+                    is_truncated = True if int(line[9]) == 1 else False
+                    is_group_of = True if int(line[10]) == 1 else False
+                    is_depiction = True if int(line[11]) == 1 else False
+                    is_inside = True if int(line[12]) == 1 else False
+
+                    instance = dict(
+                        bbox=bbox,
+                        bbox_label=label,
+                        ignore_flag=0,
+                        is_occluded=is_occluded,
+                        is_truncated=is_truncated,
+                        is_group_of=is_group_of,
+                        is_depiction=is_depiction,
+                        is_inside=is_inside)
+                    last_img_path = osp.join(self.data_prefix['img'],
+                                             f'{last_img_id}.jpg')
+                    if img_id != last_img_id:
+                        # switch to a new image, record previous image's data.
+                        data_info = dict(
+                            img_path=last_img_path,
+                            img_id=last_img_id,
+                            instances=instances,
+                        )
+                        data_list.append(data_info)
+                        instances = []
+                    instances.append(instance)
+                    last_img_id = img_id
+                data_list.append(
+                    dict(
+                        img_path=last_img_path,
+                        img_id=last_img_id,
+                        instances=instances,
+                    ))
+
+        # add image metas to data list
+        img_metas = load(
+            self.meta_file, file_format='pkl', backend_args=self.backend_args)
+        assert len(img_metas) == len(data_list)
+        for i, meta in enumerate(img_metas):
+            img_id = data_list[i]['img_id']
+            assert f'{img_id}.jpg' == osp.split(meta['filename'])[-1]
+            h, w = meta['ori_shape'][:2]
+            data_list[i]['height'] = h
+            data_list[i]['width'] = w
+            # denormalize bboxes
+            for j in range(len(data_list[i]['instances'])):
+                data_list[i]['instances'][j]['bbox'][0] *= w
+                data_list[i]['instances'][j]['bbox'][2] *= w
+                data_list[i]['instances'][j]['bbox'][1] *= h
+                data_list[i]['instances'][j]['bbox'][3] *= h
+            # add image-level annotation
+            if img_level_anns is not None:
+                img_labels = []
+                confidences = []
+                img_ann_list = img_level_anns.get(img_id, [])
+                for ann in img_ann_list:
+                    img_labels.append(int(ann['image_level_label']))
+                    confidences.append(float(ann['confidence']))
+                data_list[i]['image_level_labels'] = np.array(
+                    img_labels, dtype=np.int64)
+                data_list[i]['confidences'] = np.array(
+                    confidences, dtype=np.float32)
+        return data_list
+
+    def _parse_label_file(self, label_file: str) -> tuple:
+        """Get classes name and index mapping from cls-label-description file.
+
+        Args:
+            label_file (str): File path of the label description file that
+                maps the classes names in MID format to their short
+                descriptions.
+
+        Returns:
+            tuple: Class name of OpenImages.
+        """
+
+        index_list = []
+        classes_names = []
+        with get_local_path(
+                label_file, backend_args=self.backend_args) as local_path:
+            with open(local_path, 'r') as f:
+                reader = csv.reader(f)
+                for line in reader:
+                    # self.cat2label[line[0]] = line[1]
+                    classes_names.append(line[1])
+                    index_list.append(line[0])
+        index_mapping = {index: i for i, index in enumerate(index_list)}
+        return classes_names, index_mapping
+
+    def _parse_img_level_ann(self,
+                             img_level_ann_file: str) -> Dict[str, List[dict]]:
+        """Parse image level annotations from csv style ann_file.
+
+        Args:
+            img_level_ann_file (str): CSV style image level annotation
+                file path.
+
+        Returns:
+            Dict[str, List[dict]]: Annotations where item of the defaultdict
+            indicates an image, each of which has (n) dicts.
+            Keys of dicts are:
+
+                - `image_level_label` (int): Label id.
+                - `confidence` (float): Labels that are human-verified to be
+                  present in an image have confidence = 1 (positive labels).
+                  Labels that are human-verified to be absent from an image
+                  have confidence = 0 (negative labels). Machine-generated
+                  labels have fractional confidences, generally >= 0.5.
+                  The higher the confidence, the smaller the chance for
+                  the label to be a false positive.
+        """
+
+        item_lists = defaultdict(list)
+        with get_local_path(
+                img_level_ann_file,
+                backend_args=self.backend_args) as local_path:
+            with open(local_path, 'r') as f:
+                reader = csv.reader(f)
+                for i, line in enumerate(reader):
+                    if i == 0:
+                        continue
+                    img_id = line[0]
+                    item_lists[img_id].append(
+                        dict(
+                            image_level_label=int(
+                                self.label_id_mapping[line[2]]),
+                            confidence=float(line[3])))
+        return item_lists
+
+    def _get_relation_matrix(self, hierarchy_file: str) -> np.ndarray:
+        """Get the matrix of class hierarchy from the hierarchy file. Hierarchy
+        for 600 classes can be found at https://storage.googleapis.com/openimag
+        es/2018_04/bbox_labels_600_hierarchy_visualizer/circle.html.
+
+        Args:
+            hierarchy_file (str): File path to the hierarchy for classes.
+
+        Returns:
+            np.ndarray: The matrix of the corresponding relationship between
+            the parent class and the child class, of shape
+            (class_num, class_num).
+        """  # noqa
+
+        hierarchy = load(
+            hierarchy_file, file_format='json', backend_args=self.backend_args)
+        class_num = len(self._metainfo['classes'])
+        relation_matrix = np.eye(class_num, class_num)
+        relation_matrix = self._convert_hierarchy_tree(hierarchy,
+                                                       relation_matrix)
+        return relation_matrix
+
+    def _convert_hierarchy_tree(self,
+                                hierarchy_map: dict,
+                                relation_matrix: np.ndarray,
+                                parents: list = [],
+                                get_all_parents: bool = True) -> np.ndarray:
+        """Get matrix of the corresponding relationship between the parent
+        class and the child class.
+
+        Args:
+            hierarchy_map (dict): Including label name and corresponding
+                subcategory. Keys of dicts are:
+
+                - `LabeName` (str): Name of the label.
+                - `Subcategory` (dict | list): Corresponding subcategory(ies).
+            relation_matrix (ndarray): The matrix of the corresponding
+                relationship between the parent class and the child class,
+                of shape (class_num, class_num).
+            parents (list): Corresponding parent class.
+            get_all_parents (bool): Whether get all parent names.
+                Default: True
+
+        Returns:
+            ndarray: The matrix of the corresponding relationship between
+            the parent class and the child class, of shape
+            (class_num, class_num).
+        """
+
+        if 'Subcategory' in hierarchy_map:
+            for node in hierarchy_map['Subcategory']:
+                if 'LabelName' in node:
+                    children_name = node['LabelName']
+                    children_index = self.label_id_mapping[children_name]
+                    children = [children_index]
+                else:
+                    continue
+                if len(parents) > 0:
+                    for parent_index in parents:
+                        if get_all_parents:
+                            children.append(parent_index)
+                        relation_matrix[children_index, parent_index] = 1
+                relation_matrix = self._convert_hierarchy_tree(
+                    node, relation_matrix, parents=children)
+        return relation_matrix
+
+    def _join_prefix(self):
+        """Join ``self.data_root`` with annotation path."""
+        super()._join_prefix()
+        if not is_abs(self.label_file) and self.label_file:
+            self.label_file = osp.join(self.data_root, self.label_file)
+        if not is_abs(self.meta_file) and self.meta_file:
+            self.meta_file = osp.join(self.data_root, self.meta_file)
+        if not is_abs(self.hierarchy_file) and self.hierarchy_file:
+            self.hierarchy_file = osp.join(self.data_root, self.hierarchy_file)
+        if self.image_level_ann_file and not is_abs(self.image_level_ann_file):
+            self.image_level_ann_file = osp.join(self.data_root,
+                                                 self.image_level_ann_file)
+
+
+@DATASETS.register_module()
+class OpenImagesChallengeDataset(OpenImagesDataset):
+    """Open Images Challenge dataset for detection.
+
+    Args:
+        ann_file (str): Open Images Challenge box annotation in txt format.
+    """
+
+    METAINFO: dict = dict(dataset_type='oid_challenge')
+
+    def __init__(self, ann_file: str, **kwargs) -> None:
+        if not ann_file.endswith('txt'):
+            raise TypeError('The annotation file of Open Images Challenge '
+                            'should be a txt file.')
+
+        super().__init__(ann_file=ann_file, **kwargs)
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of annotation.
+        """
+        classes_names, label_id_mapping = self._parse_label_file(
+            self.label_file)
+        self._metainfo['classes'] = classes_names
+        self.label_id_mapping = label_id_mapping
+
+        if self.image_level_ann_file is not None:
+            img_level_anns = self._parse_img_level_ann(
+                self.image_level_ann_file)
+        else:
+            img_level_anns = None
+
+        # OpenImagesMetric can get the relation matrix from the dataset meta
+        relation_matrix = self._get_relation_matrix(self.hierarchy_file)
+        self._metainfo['RELATION_MATRIX'] = relation_matrix
+
+        data_list = []
+        with get_local_path(
+                self.ann_file, backend_args=self.backend_args) as local_path:
+            with open(local_path, 'r') as f:
+                lines = f.readlines()
+        i = 0
+        while i < len(lines):
+            instances = []
+            filename = lines[i].rstrip()
+            i += 2
+            img_gt_size = int(lines[i])
+            i += 1
+            for j in range(img_gt_size):
+                sp = lines[i + j].split()
+                instances.append(
+                    dict(
+                        bbox=[
+                            float(sp[1]),
+                            float(sp[2]),
+                            float(sp[3]),
+                            float(sp[4])
+                        ],
+                        bbox_label=int(sp[0]) - 1,  # labels begin from 1
+                        ignore_flag=0,
+                        is_group_ofs=True if int(sp[5]) == 1 else False))
+            i += img_gt_size
+            data_list.append(
+                dict(
+                    img_path=osp.join(self.data_prefix['img'], filename),
+                    instances=instances,
+                ))
+
+        # add image metas to data list
+        img_metas = load(
+            self.meta_file, file_format='pkl', backend_args=self.backend_args)
+        assert len(img_metas) == len(data_list)
+        for i, meta in enumerate(img_metas):
+            img_id = osp.split(data_list[i]['img_path'])[-1][:-4]
+            assert img_id == osp.split(meta['filename'])[-1][:-4]
+            h, w = meta['ori_shape'][:2]
+            data_list[i]['height'] = h
+            data_list[i]['width'] = w
+            data_list[i]['img_id'] = img_id
+            # denormalize bboxes
+            for j in range(len(data_list[i]['instances'])):
+                data_list[i]['instances'][j]['bbox'][0] *= w
+                data_list[i]['instances'][j]['bbox'][2] *= w
+                data_list[i]['instances'][j]['bbox'][1] *= h
+                data_list[i]['instances'][j]['bbox'][3] *= h
+            # add image-level annotation
+            if img_level_anns is not None:
+                img_labels = []
+                confidences = []
+                img_ann_list = img_level_anns.get(img_id, [])
+                for ann in img_ann_list:
+                    img_labels.append(int(ann['image_level_label']))
+                    confidences.append(float(ann['confidence']))
+                data_list[i]['image_level_labels'] = np.array(
+                    img_labels, dtype=np.int64)
+                data_list[i]['confidences'] = np.array(
+                    confidences, dtype=np.float32)
+        return data_list
+
+    def _parse_label_file(self, label_file: str) -> tuple:
+        """Get classes name and index mapping from cls-label-description file.
+
+        Args:
+            label_file (str): File path of the label description file that
+                maps the classes names in MID format to their short
+                descriptions.
+
+        Returns:
+            tuple: Class name of OpenImages.
+        """
+        label_list = []
+        id_list = []
+        index_mapping = {}
+        with get_local_path(
+                label_file, backend_args=self.backend_args) as local_path:
+            with open(local_path, 'r') as f:
+                reader = csv.reader(f)
+                for line in reader:
+                    label_name = line[0]
+                    label_id = int(line[2])
+                    label_list.append(line[1])
+                    id_list.append(label_id)
+                    index_mapping[label_name] = label_id - 1
+        indexes = np.argsort(id_list)
+        classes_names = []
+        for index in indexes:
+            classes_names.append(label_list[index])
+        return classes_names, index_mapping
+
+    def _parse_img_level_ann(self, image_level_ann_file):
+        """Parse image level annotations from csv style ann_file.
+
+        Args:
+            image_level_ann_file (str): CSV style image level annotation
+                file path.
+
+        Returns:
+            defaultdict[list[dict]]: Annotations where item of the defaultdict
+            indicates an image, each of which has (n) dicts.
+            Keys of dicts are:
+
+                - `image_level_label` (int): of shape 1.
+                - `confidence` (float): of shape 1.
+        """
+
+        item_lists = defaultdict(list)
+        with get_local_path(
+                image_level_ann_file,
+                backend_args=self.backend_args) as local_path:
+            with open(local_path, 'r') as f:
+                reader = csv.reader(f)
+                i = -1
+                for line in reader:
+                    i += 1
+                    if i == 0:
+                        continue
+                    else:
+                        img_id = line[0]
+                        label_id = line[1]
+                        assert label_id in self.label_id_mapping
+                        image_level_label = int(
+                            self.label_id_mapping[label_id])
+                        confidence = float(line[2])
+                        item_lists[img_id].append(
+                            dict(
+                                image_level_label=image_level_label,
+                                confidence=confidence))
+        return item_lists
+
+    def _get_relation_matrix(self, hierarchy_file: str) -> np.ndarray:
+        """Get the matrix of class hierarchy from the hierarchy file.
+
+        Args:
+            hierarchy_file (str): File path to the hierarchy for classes.
+
+        Returns:
+            np.ndarray: The matrix of the corresponding
+            relationship between the parent class and the child class,
+            of shape (class_num, class_num).
+        """
+        with get_local_path(
+                hierarchy_file, backend_args=self.backend_args) as local_path:
+            class_label_tree = np.load(local_path, allow_pickle=True)
+        return class_label_tree[1:, 1:]
diff --git a/mmde/mmdet/datasets/refcoco.py b/mmde/mmdet/datasets/refcoco.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dae75fd547216a5b69033cc821b93a1d9ac6abc
--- /dev/null
+++ b/mmde/mmdet/datasets/refcoco.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import collections
+import os.path as osp
+import random
+from typing import Dict, List
+
+import mmengine
+from mmengine.dataset import BaseDataset
+
+from mmdet.registry import DATASETS
+
+
+@DATASETS.register_module()
+class RefCocoDataset(BaseDataset):
+    """RefCOCO dataset.
+
+    The `Refcoco` and `Refcoco+` dataset is based on
+    `ReferItGame: Referring to Objects in Photographs of Natural Scenes
+    <http://tamaraberg.com/papers/referit.pdf>`_.
+
+    The `Refcocog` dataset is based on
+    `Generation and Comprehension of Unambiguous Object Descriptions
+    <https://arxiv.org/abs/1511.02283>`_.
+
+    Args:
+        ann_file (str): Annotation file path.
+        data_root (str): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to ''.
+        data_prefix (str): Prefix for training data.
+        split_file (str): Split file path.
+        split (str): Split name. Defaults to 'train'.
+        text_mode (str): Text mode. Defaults to 'random'.
+        **kwargs: Other keyword arguments in :class:`BaseDataset`.
+    """
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 split_file: str,
+                 data_prefix: Dict,
+                 split: str = 'train',
+                 text_mode: str = 'random',
+                 **kwargs):
+        self.split_file = split_file
+        self.split = split
+
+        assert text_mode in ['original', 'random', 'concat', 'select_first']
+        self.text_mode = text_mode
+        super().__init__(
+            data_root=data_root,
+            data_prefix=data_prefix,
+            ann_file=ann_file,
+            **kwargs,
+        )
+
+    def _join_prefix(self):
+        if not mmengine.is_abs(self.split_file) and self.split_file:
+            self.split_file = osp.join(self.data_root, self.split_file)
+
+        return super()._join_prefix()
+
+    def _init_refs(self):
+        """Initialize the refs for RefCOCO."""
+        anns, imgs = {}, {}
+        for ann in self.instances['annotations']:
+            anns[ann['id']] = ann
+        for img in self.instances['images']:
+            imgs[img['id']] = img
+
+        refs, ref_to_ann = {}, {}
+        for ref in self.splits:
+            # ids
+            ref_id = ref['ref_id']
+            ann_id = ref['ann_id']
+            # add mapping related to ref
+            refs[ref_id] = ref
+            ref_to_ann[ref_id] = anns[ann_id]
+
+        self.refs = refs
+        self.ref_to_ann = ref_to_ann
+
+    def load_data_list(self) -> List[dict]:
+        """Load data list."""
+        self.splits = mmengine.load(self.split_file, file_format='pkl')
+        self.instances = mmengine.load(self.ann_file, file_format='json')
+        self._init_refs()
+        img_prefix = self.data_prefix['img_path']
+
+        ref_ids = [
+            ref['ref_id'] for ref in self.splits if ref['split'] == self.split
+        ]
+        full_anno = []
+        for ref_id in ref_ids:
+            ref = self.refs[ref_id]
+            ann = self.ref_to_ann[ref_id]
+            ann.update(ref)
+            full_anno.append(ann)
+
+        image_id_list = []
+        final_anno = {}
+        for anno in full_anno:
+            image_id_list.append(anno['image_id'])
+            final_anno[anno['ann_id']] = anno
+        annotations = [value for key, value in final_anno.items()]
+
+        coco_train_id = []
+        image_annot = {}
+        for i in range(len(self.instances['images'])):
+            coco_train_id.append(self.instances['images'][i]['id'])
+            image_annot[self.instances['images'][i]
+                        ['id']] = self.instances['images'][i]
+
+        images = []
+        for image_id in list(set(image_id_list)):
+            images += [image_annot[image_id]]
+
+        data_list = []
+
+        grounding_dict = collections.defaultdict(list)
+        for anno in annotations:
+            image_id = int(anno['image_id'])
+            grounding_dict[image_id].append(anno)
+
+        join_path = mmengine.fileio.get_file_backend(img_prefix).join_path
+        for image in images:
+            img_id = image['id']
+            instances = []
+            sentences = []
+            for grounding_anno in grounding_dict[img_id]:
+                texts = [x['raw'].lower() for x in grounding_anno['sentences']]
+                # random select one text
+                if self.text_mode == 'random':
+                    idx = random.randint(0, len(texts) - 1)
+                    text = [texts[idx]]
+                # concat all texts
+                elif self.text_mode == 'concat':
+                    text = [''.join(texts)]
+                # select the first text
+                elif self.text_mode == 'select_first':
+                    text = [texts[0]]
+                # use all texts
+                elif self.text_mode == 'original':
+                    text = texts
+                else:
+                    raise ValueError(f'Invalid text mode "{self.text_mode}".')
+                ins = [{
+                    'mask': grounding_anno['segmentation'],
+                    'ignore_flag': 0
+                }] * len(text)
+                instances.extend(ins)
+                sentences.extend(text)
+            data_info = {
+                'img_path': join_path(img_prefix, image['file_name']),
+                'img_id': img_id,
+                'instances': instances,
+                'text': sentences
+            }
+            data_list.append(data_info)
+
+        if len(data_list) == 0:
+            raise ValueError(f'No sample in split "{self.split}".')
+
+        return data_list
diff --git a/mmde/mmdet/datasets/reid_dataset.py b/mmde/mmdet/datasets/reid_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eed3ee4f0358edf59d19695c2b28394336dffd3
--- /dev/null
+++ b/mmde/mmdet/datasets/reid_dataset.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+from collections import defaultdict
+from typing import Any, Dict, List
+
+import numpy as np
+from mmengine.dataset import BaseDataset
+from mmengine.utils import check_file_exist
+
+from mmdet.registry import DATASETS
+
+
+@DATASETS.register_module()
+class ReIDDataset(BaseDataset):
+    """Dataset for ReID.
+
+    Args:
+        triplet_sampler (dict, optional): The sampler for hard mining
+            triplet loss. Defaults to None.
+        keys: num_ids (int): The number of person ids.
+              ins_per_id (int): The number of image for each person.
+    """
+
+    def __init__(self, triplet_sampler: dict = None, *args, **kwargs):
+        self.triplet_sampler = triplet_sampler
+        super().__init__(*args, **kwargs)
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ''self.ann_file''.
+
+        Returns:
+              list[dict]: A list of annotation.
+        """
+        assert isinstance(self.ann_file, str)
+        check_file_exist(self.ann_file)
+        data_list = []
+        with open(self.ann_file) as f:
+            samples = [x.strip().split(' ') for x in f.readlines()]
+            for filename, gt_label in samples:
+                info = dict(img_prefix=self.data_prefix)
+                if self.data_prefix['img_path'] is not None:
+                    info['img_path'] = osp.join(self.data_prefix['img_path'],
+                                                filename)
+                else:
+                    info['img_path'] = filename
+                info['gt_label'] = np.array(gt_label, dtype=np.int64)
+                data_list.append(info)
+        self._parse_ann_info(data_list)
+        return data_list
+
+    def _parse_ann_info(self, data_list: List[dict]):
+        """Parse person id annotations."""
+        index_tmp_dic = defaultdict(list)  # pid->[idx1,...,idxN]
+        self.index_dic = dict()  # pid->array([idx1,...,idxN])
+        for idx, info in enumerate(data_list):
+            pid = info['gt_label']
+            index_tmp_dic[int(pid)].append(idx)
+        for pid, idxs in index_tmp_dic.items():
+            self.index_dic[pid] = np.asarray(idxs, dtype=np.int64)
+        self.pids = np.asarray(list(self.index_dic.keys()), dtype=np.int64)
+
+    def prepare_data(self, idx: int) -> Any:
+        """Get data processed by ''self.pipeline''.
+
+        Args:
+            idx (int): The index of ''data_info''
+
+        Returns:
+            Any: Depends on ''self.pipeline''
+        """
+        data_info = self.get_data_info(idx)
+        if self.triplet_sampler is not None:
+            img_info = self.triplet_sampling(data_info['gt_label'],
+                                             **self.triplet_sampler)
+            data_info = copy.deepcopy(img_info)  # triplet -> list
+        else:
+            data_info = copy.deepcopy(data_info)  # no triplet -> dict
+        return self.pipeline(data_info)
+
+    def triplet_sampling(self,
+                         pos_pid,
+                         num_ids: int = 8,
+                         ins_per_id: int = 4) -> Dict:
+        """Triplet sampler for hard mining triplet loss. First, for one
+        pos_pid, random sample ins_per_id images with same person id.
+
+        Then, random sample num_ids - 1 images for each negative id.
+        Finally, random sample ins_per_id images for each negative id.
+
+        Args:
+            pos_pid (ndarray): The person id of the anchor.
+            num_ids (int): The number of person ids.
+            ins_per_id (int): The number of images for each person.
+
+        Returns:
+            Dict: Annotation information of num_ids X ins_per_id images.
+        """
+        assert len(self.pids) >= num_ids, \
+            'The number of person ids in the training set must ' \
+            'be greater than the number of person ids in the sample.'
+
+        pos_idxs = self.index_dic[int(
+            pos_pid)]  # all positive idxs for pos_pid
+        idxs_list = []
+        # select positive samplers
+        idxs_list.extend(pos_idxs[np.random.choice(
+            pos_idxs.shape[0], ins_per_id, replace=True)])
+        # select negative ids
+        neg_pids = np.random.choice(
+            [i for i, _ in enumerate(self.pids) if i != pos_pid],
+            num_ids - 1,
+            replace=False)
+        # select negative samplers for each negative id
+        for neg_pid in neg_pids:
+            neg_idxs = self.index_dic[neg_pid]
+            idxs_list.extend(neg_idxs[np.random.choice(
+                neg_idxs.shape[0], ins_per_id, replace=True)])
+        # return the final triplet batch
+        triplet_img_infos = []
+        for idx in idxs_list:
+            triplet_img_infos.append(copy.deepcopy(self.get_data_info(idx)))
+        # Collect data_list scatters (list of dict -> dict of list)
+        out = dict()
+        for key in triplet_img_infos[0].keys():
+            out[key] = [_info[key] for _info in triplet_img_infos]
+        return out
diff --git a/mmde/mmdet/datasets/samplers/__init__.py b/mmde/mmdet/datasets/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ea0e4cb0628fc23bc034c51e503d8ceca5ee90c
--- /dev/null
+++ b/mmde/mmdet/datasets/samplers/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .batch_sampler import (AspectRatioBatchSampler,
+                            MultiDataAspectRatioBatchSampler,
+                            TrackAspectRatioBatchSampler)
+from .class_aware_sampler import ClassAwareSampler
+from .custom_sample_size_sampler import CustomSampleSizeSampler
+from .multi_data_sampler import MultiDataSampler
+from .multi_source_sampler import GroupMultiSourceSampler, MultiSourceSampler
+from .track_img_sampler import TrackImgSampler
+
+__all__ = [
+    'ClassAwareSampler', 'AspectRatioBatchSampler', 'MultiSourceSampler',
+    'GroupMultiSourceSampler', 'TrackImgSampler',
+    'TrackAspectRatioBatchSampler', 'MultiDataSampler',
+    'MultiDataAspectRatioBatchSampler', 'CustomSampleSizeSampler'
+]
diff --git a/mmde/mmdet/datasets/samplers/batch_sampler.py b/mmde/mmdet/datasets/samplers/batch_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c17789c4e3ea51f1fa140d039a679f797a7660f6
--- /dev/null
+++ b/mmde/mmdet/datasets/samplers/batch_sampler.py
@@ -0,0 +1,193 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+from torch.utils.data import BatchSampler, Sampler
+
+from mmdet.datasets.samplers.track_img_sampler import TrackImgSampler
+from mmdet.registry import DATA_SAMPLERS
+
+
+# TODO: maybe replace with a data_loader wrapper
+@DATA_SAMPLERS.register_module()
+class AspectRatioBatchSampler(BatchSampler):
+    """A sampler wrapper for grouping images with similar aspect ratio (< 1 or.
+
+    >= 1) into a same batch.
+
+    Args:
+        sampler (Sampler): Base sampler.
+        batch_size (int): Size of mini-batch.
+        drop_last (bool): If ``True``, the sampler will drop the last batch if
+            its size would be less than ``batch_size``.
+    """
+
+    def __init__(self,
+                 sampler: Sampler,
+                 batch_size: int,
+                 drop_last: bool = False) -> None:
+        if not isinstance(sampler, Sampler):
+            raise TypeError('sampler should be an instance of ``Sampler``, '
+                            f'but got {sampler}')
+        if not isinstance(batch_size, int) or batch_size <= 0:
+            raise ValueError('batch_size should be a positive integer value, '
+                             f'but got batch_size={batch_size}')
+        self.sampler = sampler
+        self.batch_size = batch_size
+        self.drop_last = drop_last
+        # two groups for w < h and w >= h
+        self._aspect_ratio_buckets = [[] for _ in range(2)]
+
+    def __iter__(self) -> Sequence[int]:
+        for idx in self.sampler:
+            data_info = self.sampler.dataset.get_data_info(idx)
+            width, height = data_info['width'], data_info['height']
+            bucket_id = 0 if width < height else 1
+            bucket = self._aspect_ratio_buckets[bucket_id]
+            bucket.append(idx)
+            # yield a batch of indices in the same aspect ratio group
+            if len(bucket) == self.batch_size:
+                yield bucket[:]
+                del bucket[:]
+
+        # yield the rest data and reset the bucket
+        left_data = self._aspect_ratio_buckets[0] + self._aspect_ratio_buckets[
+            1]
+        self._aspect_ratio_buckets = [[] for _ in range(2)]
+        while len(left_data) > 0:
+            if len(left_data) <= self.batch_size:
+                if not self.drop_last:
+                    yield left_data[:]
+                left_data = []
+            else:
+                yield left_data[:self.batch_size]
+                left_data = left_data[self.batch_size:]
+
+    def __len__(self) -> int:
+        if self.drop_last:
+            return len(self.sampler) // self.batch_size
+        else:
+            return (len(self.sampler) + self.batch_size - 1) // self.batch_size
+
+
+@DATA_SAMPLERS.register_module()
+class TrackAspectRatioBatchSampler(AspectRatioBatchSampler):
+    """A sampler wrapper for grouping images with similar aspect ratio (< 1 or.
+
+    >= 1) into a same batch.
+
+    Args:
+        sampler (Sampler): Base sampler.
+        batch_size (int): Size of mini-batch.
+        drop_last (bool): If ``True``, the sampler will drop the last batch if
+            its size would be less than ``batch_size``.
+    """
+
+    def __iter__(self) -> Sequence[int]:
+        for idx in self.sampler:
+            # hard code to solve TrackImgSampler
+            if isinstance(self.sampler, TrackImgSampler):
+                video_idx, _ = idx
+            else:
+                video_idx = idx
+            # video_idx
+            data_info = self.sampler.dataset.get_data_info(video_idx)
+            # data_info {video_id, images, video_length}
+            img_data_info = data_info['images'][0]
+            width, height = img_data_info['width'], img_data_info['height']
+            bucket_id = 0 if width < height else 1
+            bucket = self._aspect_ratio_buckets[bucket_id]
+            bucket.append(idx)
+            # yield a batch of indices in the same aspect ratio group
+            if len(bucket) == self.batch_size:
+                yield bucket[:]
+                del bucket[:]
+
+        # yield the rest data and reset the bucket
+        left_data = self._aspect_ratio_buckets[0] + self._aspect_ratio_buckets[
+            1]
+        self._aspect_ratio_buckets = [[] for _ in range(2)]
+        while len(left_data) > 0:
+            if len(left_data) <= self.batch_size:
+                if not self.drop_last:
+                    yield left_data[:]
+                left_data = []
+            else:
+                yield left_data[:self.batch_size]
+                left_data = left_data[self.batch_size:]
+
+
+@DATA_SAMPLERS.register_module()
+class MultiDataAspectRatioBatchSampler(BatchSampler):
+    """A sampler wrapper for grouping images with similar aspect ratio (< 1 or.
+
+    >= 1) into a same batch for multi-source datasets.
+
+    Args:
+        sampler (Sampler): Base sampler.
+        batch_size (Sequence(int)): Size of mini-batch for multi-source
+        datasets.
+        num_datasets(int): Number of multi-source datasets.
+        drop_last (bool): If ``True``, the sampler will drop the last batch if
+        its size would be less than ``batch_size``.
+    """
+
+    def __init__(self,
+                 sampler: Sampler,
+                 batch_size: Sequence[int],
+                 num_datasets: int,
+                 drop_last: bool = True) -> None:
+        if not isinstance(sampler, Sampler):
+            raise TypeError('sampler should be an instance of ``Sampler``, '
+                            f'but got {sampler}')
+        self.sampler = sampler
+        self.batch_size = batch_size
+        self.num_datasets = num_datasets
+        self.drop_last = drop_last
+        # two groups for w < h and w >= h for each dataset --> 2 * num_datasets
+        self._buckets = [[] for _ in range(2 * self.num_datasets)]
+
+    def __iter__(self) -> Sequence[int]:
+        for idx in self.sampler:
+            data_info = self.sampler.dataset.get_data_info(idx)
+            width, height = data_info['width'], data_info['height']
+            dataset_source_idx = self.sampler.dataset.get_dataset_source(idx)
+            aspect_ratio_bucket_id = 0 if width < height else 1
+            bucket_id = dataset_source_idx * 2 + aspect_ratio_bucket_id
+            bucket = self._buckets[bucket_id]
+            bucket.append(idx)
+            # yield a batch of indices in the same aspect ratio group
+            if len(bucket) == self.batch_size[dataset_source_idx]:
+                yield bucket[:]
+                del bucket[:]
+
+        # yield the rest data and reset the bucket
+        for i in range(self.num_datasets):
+            left_data = self._buckets[i * 2 + 0] + self._buckets[i * 2 + 1]
+            while len(left_data) > 0:
+                if len(left_data) <= self.batch_size[i]:
+                    if not self.drop_last:
+                        yield left_data[:]
+                    left_data = []
+                else:
+                    yield left_data[:self.batch_size[i]]
+                    left_data = left_data[self.batch_size[i]:]
+
+        self._buckets = [[] for _ in range(2 * self.num_datasets)]
+
+    def __len__(self) -> int:
+        sizes = [0 for _ in range(self.num_datasets)]
+        for idx in self.sampler:
+            dataset_source_idx = self.sampler.dataset.get_dataset_source(idx)
+            sizes[dataset_source_idx] += 1
+
+        if self.drop_last:
+            lens = 0
+            for i in range(self.num_datasets):
+                lens += sizes[i] // self.batch_size[i]
+            return lens
+        else:
+            lens = 0
+            for i in range(self.num_datasets):
+                lens += (sizes[i] + self.batch_size[i] -
+                         1) // self.batch_size[i]
+            return lens
diff --git a/mmde/mmdet/datasets/samplers/class_aware_sampler.py b/mmde/mmdet/datasets/samplers/class_aware_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ca2f9b3ffb7c780ab25cc3704b67589763259e0
--- /dev/null
+++ b/mmde/mmdet/datasets/samplers/class_aware_sampler.py
@@ -0,0 +1,192 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Dict, Iterator, Optional, Union
+
+import numpy as np
+import torch
+from mmengine.dataset import BaseDataset
+from mmengine.dist import get_dist_info, sync_random_seed
+from torch.utils.data import Sampler
+
+from mmdet.registry import DATA_SAMPLERS
+
+
+@DATA_SAMPLERS.register_module()
+class ClassAwareSampler(Sampler):
+    r"""Sampler that restricts data loading to the label of the dataset.
+
+    A class-aware sampling strategy to effectively tackle the
+    non-uniform class distribution. The length of the training data is
+    consistent with source data. Simple improvements based on `Relay
+    Backpropagation for Effective Learning of Deep Convolutional
+    Neural Networks <https://arxiv.org/abs/1512.05830>`_
+
+    The implementation logic is referred to
+    https://github.com/Sense-X/TSD/blob/master/mmdet/datasets/samplers/distributed_classaware_sampler.py
+
+    Args:
+        dataset: Dataset used for sampling.
+        seed (int, optional): random seed used to shuffle the sampler.
+            This number should be identical across all
+            processes in the distributed group. Defaults to None.
+        num_sample_class (int): The number of samples taken from each
+            per-label list. Defaults to 1.
+    """
+
+    def __init__(self,
+                 dataset: BaseDataset,
+                 seed: Optional[int] = None,
+                 num_sample_class: int = 1) -> None:
+        rank, world_size = get_dist_info()
+        self.rank = rank
+        self.world_size = world_size
+
+        self.dataset = dataset
+        self.epoch = 0
+        # Must be the same across all workers. If None, will use a
+        # random seed shared among workers
+        # (require synchronization among all workers)
+        if seed is None:
+            seed = sync_random_seed()
+        self.seed = seed
+
+        # The number of samples taken from each per-label list
+        assert num_sample_class > 0 and isinstance(num_sample_class, int)
+        self.num_sample_class = num_sample_class
+        # Get per-label image list from dataset
+        self.cat_dict = self.get_cat2imgs()
+
+        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / world_size))
+        self.total_size = self.num_samples * self.world_size
+
+        # get number of images containing each category
+        self.num_cat_imgs = [len(x) for x in self.cat_dict.values()]
+        # filter labels without images
+        self.valid_cat_inds = [
+            i for i, length in enumerate(self.num_cat_imgs) if length != 0
+        ]
+        self.num_classes = len(self.valid_cat_inds)
+
+    def get_cat2imgs(self) -> Dict[int, list]:
+        """Get a dict with class as key and img_ids as values.
+
+        Returns:
+            dict[int, list]: A dict of per-label image list,
+            the item of the dict indicates a label index,
+            corresponds to the image index that contains the label.
+        """
+        classes = self.dataset.metainfo.get('classes', None)
+        if classes is None:
+            raise ValueError('dataset metainfo must contain `classes`')
+        # sort the label index
+        cat2imgs = {i: [] for i in range(len(classes))}
+        for i in range(len(self.dataset)):
+            cat_ids = set(self.dataset.get_cat_ids(i))
+            for cat in cat_ids:
+                cat2imgs[cat].append(i)
+        return cat2imgs
+
+    def __iter__(self) -> Iterator[int]:
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch + self.seed)
+
+        # initialize label list
+        label_iter_list = RandomCycleIter(self.valid_cat_inds, generator=g)
+        # initialize each per-label image list
+        data_iter_dict = dict()
+        for i in self.valid_cat_inds:
+            data_iter_dict[i] = RandomCycleIter(self.cat_dict[i], generator=g)
+
+        def gen_cat_img_inds(cls_list, data_dict, num_sample_cls):
+            """Traverse the categories and extract `num_sample_cls` image
+            indexes of the corresponding categories one by one."""
+            id_indices = []
+            for _ in range(len(cls_list)):
+                cls_idx = next(cls_list)
+                for _ in range(num_sample_cls):
+                    id = next(data_dict[cls_idx])
+                    id_indices.append(id)
+            return id_indices
+
+        # deterministically shuffle based on epoch
+        num_bins = int(
+            math.ceil(self.total_size * 1.0 / self.num_classes /
+                      self.num_sample_class))
+        indices = []
+        for i in range(num_bins):
+            indices += gen_cat_img_inds(label_iter_list, data_iter_dict,
+                                        self.num_sample_class)
+
+        # fix extra samples to make it evenly divisible
+        if len(indices) >= self.total_size:
+            indices = indices[:self.total_size]
+        else:
+            indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # subsample
+        offset = self.num_samples * self.rank
+        indices = indices[offset:offset + self.num_samples]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self) -> int:
+        """The number of samples in this rank."""
+        return self.num_samples
+
+    def set_epoch(self, epoch: int) -> None:
+        """Sets the epoch for this sampler.
+
+        When :attr:`shuffle=True`, this ensures all replicas use a different
+        random ordering for each epoch. Otherwise, the next iteration of this
+        sampler will yield the same ordering.
+
+        Args:
+            epoch (int): Epoch number.
+        """
+        self.epoch = epoch
+
+
+class RandomCycleIter:
+    """Shuffle the list and do it again after the list have traversed.
+
+    The implementation logic is referred to
+    https://github.com/wutong16/DistributionBalancedLoss/blob/master/mllt/datasets/loader/sampler.py
+
+    Example:
+        >>> label_list = [0, 1, 2, 4, 5]
+        >>> g = torch.Generator()
+        >>> g.manual_seed(0)
+        >>> label_iter_list = RandomCycleIter(label_list, generator=g)
+        >>> index = next(label_iter_list)
+    Args:
+        data (list or ndarray): The data that needs to be shuffled.
+        generator: An torch.Generator object, which is used in setting the seed
+            for generating random numbers.
+    """  # noqa: W605
+
+    def __init__(self,
+                 data: Union[list, np.ndarray],
+                 generator: torch.Generator = None) -> None:
+        self.data = data
+        self.length = len(data)
+        self.index = torch.randperm(self.length, generator=generator).numpy()
+        self.i = 0
+        self.generator = generator
+
+    def __iter__(self) -> Iterator:
+        return self
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+    def __next__(self):
+        if self.i == self.length:
+            self.index = torch.randperm(
+                self.length, generator=self.generator).numpy()
+            self.i = 0
+        idx = self.data[self.index[self.i]]
+        self.i += 1
+        return idx
diff --git a/mmde/mmdet/datasets/samplers/custom_sample_size_sampler.py b/mmde/mmdet/datasets/samplers/custom_sample_size_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bedf6c66be81b091a6424bae6788953ba7763a3
--- /dev/null
+++ b/mmde/mmdet/datasets/samplers/custom_sample_size_sampler.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Iterator, Optional, Sequence, Sized
+
+import torch
+from mmengine.dist import get_dist_info, sync_random_seed
+from torch.utils.data import Sampler
+
+from mmdet.registry import DATA_SAMPLERS
+from .class_aware_sampler import RandomCycleIter
+
+
+@DATA_SAMPLERS.register_module()
+class CustomSampleSizeSampler(Sampler):
+
+    def __init__(self,
+                 dataset: Sized,
+                 dataset_size: Sequence[int],
+                 ratio_mode: bool = False,
+                 seed: Optional[int] = None,
+                 round_up: bool = True) -> None:
+        assert len(dataset.datasets) == len(dataset_size)
+        rank, world_size = get_dist_info()
+        self.rank = rank
+        self.world_size = world_size
+
+        self.dataset = dataset
+        if seed is None:
+            seed = sync_random_seed()
+        self.seed = seed
+        self.epoch = 0
+        self.round_up = round_up
+
+        total_size = 0
+        total_size_fake = 0
+        self.dataset_index = []
+        self.dataset_cycle_iter = []
+        new_dataset_size = []
+        for dataset, size in zip(dataset.datasets, dataset_size):
+            self.dataset_index.append(
+                list(range(total_size_fake,
+                           len(dataset) + total_size_fake)))
+            total_size_fake += len(dataset)
+            if size == -1:
+                total_size += len(dataset)
+                self.dataset_cycle_iter.append(None)
+                new_dataset_size.append(-1)
+            else:
+                if ratio_mode:
+                    size = int(size * len(dataset))
+                assert size <= len(
+                    dataset
+                ), f'dataset size {size} is larger than ' \
+                   f'dataset length {len(dataset)}'
+                total_size += size
+                new_dataset_size.append(size)
+
+                g = torch.Generator()
+                g.manual_seed(self.seed)
+                self.dataset_cycle_iter.append(
+                    RandomCycleIter(self.dataset_index[-1], generator=g))
+        self.dataset_size = new_dataset_size
+
+        if self.round_up:
+            self.num_samples = math.ceil(total_size / world_size)
+            self.total_size = self.num_samples * self.world_size
+        else:
+            self.num_samples = math.ceil((total_size - rank) / world_size)
+            self.total_size = total_size
+
+    def __iter__(self) -> Iterator[int]:
+        """Iterate the indices."""
+        # deterministically shuffle based on epoch and seed
+        g = torch.Generator()
+        g.manual_seed(self.seed + self.epoch)
+
+        out_index = []
+        for data_size, data_index, cycle_iter in zip(self.dataset_size,
+                                                     self.dataset_index,
+                                                     self.dataset_cycle_iter):
+            if data_size == -1:
+                out_index += data_index
+            else:
+                index = [next(cycle_iter) for _ in range(data_size)]
+                out_index += index
+
+        index = torch.randperm(len(out_index), generator=g).numpy().tolist()
+        indices = [out_index[i] for i in index]
+
+        if self.round_up:
+            indices = (
+                indices *
+                int(self.total_size / len(indices) + 1))[:self.total_size]
+        indices = indices[self.rank:self.total_size:self.world_size]
+        return iter(indices)
+
+    def __len__(self) -> int:
+        """The number of samples in this rank."""
+        return self.num_samples
+
+    def set_epoch(self, epoch: int) -> None:
+        """Sets the epoch for this sampler.
+
+        When :attr:`shuffle=True`, this ensures all replicas use a different
+        random ordering for each epoch. Otherwise, the next iteration of this
+        sampler will yield the same ordering.
+
+        Args:
+            epoch (int): Epoch number.
+        """
+        self.epoch = epoch
diff --git a/mmde/mmdet/datasets/samplers/multi_data_sampler.py b/mmde/mmdet/datasets/samplers/multi_data_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3a4b60d84122ce9eb2090095e9744c2bd73cc3d
--- /dev/null
+++ b/mmde/mmdet/datasets/samplers/multi_data_sampler.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Iterator, Optional, Sequence, Sized
+
+import torch
+from mmengine.dist import get_dist_info, sync_random_seed
+from mmengine.registry import DATA_SAMPLERS
+from torch.utils.data import Sampler
+
+
+@DATA_SAMPLERS.register_module()
+class MultiDataSampler(Sampler):
+    """The default data sampler for both distributed and non-distributed
+    environment.
+
+    It has several differences from the PyTorch ``DistributedSampler`` as
+    below:
+
+    1. This sampler supports non-distributed environment.
+
+    2. The round up behaviors are a little different.
+
+       - If ``round_up=True``, this sampler will add extra samples to make the
+         number of samples is evenly divisible by the world size. And
+         this behavior is the same as the ``DistributedSampler`` with
+         ``drop_last=False``.
+       - If ``round_up=False``, this sampler won't remove or add any samples
+         while the ``DistributedSampler`` with ``drop_last=True`` will remove
+         tail samples.
+
+    Args:
+        dataset (Sized): The dataset.
+        dataset_ratio (Sequence(int)) The ratios of different datasets.
+        seed (int, optional): Random seed used to shuffle the sampler if
+            :attr:`shuffle=True`. This number should be identical across all
+            processes in the distributed group. Defaults to None.
+        round_up (bool): Whether to add extra samples to make the number of
+            samples evenly divisible by the world size. Defaults to True.
+    """
+
+    def __init__(self,
+                 dataset: Sized,
+                 dataset_ratio: Sequence[int],
+                 seed: Optional[int] = None,
+                 round_up: bool = True) -> None:
+        rank, world_size = get_dist_info()
+        self.rank = rank
+        self.world_size = world_size
+
+        self.dataset = dataset
+        self.dataset_ratio = dataset_ratio
+
+        if seed is None:
+            seed = sync_random_seed()
+        self.seed = seed
+        self.epoch = 0
+        self.round_up = round_up
+
+        if self.round_up:
+            self.num_samples = math.ceil(len(self.dataset) / world_size)
+            self.total_size = self.num_samples * self.world_size
+        else:
+            self.num_samples = math.ceil(
+                (len(self.dataset) - rank) / world_size)
+            self.total_size = len(self.dataset)
+
+        self.sizes = [len(dataset) for dataset in self.dataset.datasets]
+
+        dataset_weight = [
+            torch.ones(s) * max(self.sizes) / s * r / sum(self.dataset_ratio)
+            for i, (r, s) in enumerate(zip(self.dataset_ratio, self.sizes))
+        ]
+        self.weights = torch.cat(dataset_weight)
+
+    def __iter__(self) -> Iterator[int]:
+        """Iterate the indices."""
+        # deterministically shuffle based on epoch and seed
+        g = torch.Generator()
+        g.manual_seed(self.seed + self.epoch)
+
+        indices = torch.multinomial(
+            self.weights, len(self.weights), generator=g,
+            replacement=True).tolist()
+
+        # add extra samples to make it evenly divisible
+        if self.round_up:
+            indices = (
+                indices *
+                int(self.total_size / len(indices) + 1))[:self.total_size]
+
+        # subsample
+        indices = indices[self.rank:self.total_size:self.world_size]
+
+        return iter(indices)
+
+    def __len__(self) -> int:
+        """The number of samples in this rank."""
+        return self.num_samples
+
+    def set_epoch(self, epoch: int) -> None:
+        """Sets the epoch for this sampler.
+
+        When :attr:`shuffle=True`, this ensures all replicas use a different
+        random ordering for each epoch. Otherwise, the next iteration of this
+        sampler will yield the same ordering.
+
+        Args:
+            epoch (int): Epoch number.
+        """
+        self.epoch = epoch
diff --git a/mmde/mmdet/datasets/samplers/multi_source_sampler.py b/mmde/mmdet/datasets/samplers/multi_source_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..6efcde35e1375547239825a8f78a9e74f7825290
--- /dev/null
+++ b/mmde/mmdet/datasets/samplers/multi_source_sampler.py
@@ -0,0 +1,214 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+from typing import Iterator, List, Optional, Sized, Union
+
+import numpy as np
+import torch
+from mmengine.dataset import BaseDataset
+from mmengine.dist import get_dist_info, sync_random_seed
+from torch.utils.data import Sampler
+
+from mmdet.registry import DATA_SAMPLERS
+
+
+@DATA_SAMPLERS.register_module()
+class MultiSourceSampler(Sampler):
+    r"""Multi-Source Infinite Sampler.
+
+    According to the sampling ratio, sample data from different
+    datasets to form batches.
+
+    Args:
+        dataset (Sized): The dataset.
+        batch_size (int): Size of mini-batch.
+        source_ratio (list[int | float]): The sampling ratio of different
+            source datasets in a mini-batch.
+        shuffle (bool): Whether shuffle the dataset or not. Defaults to True.
+        seed (int, optional): Random seed. If None, set a random seed.
+            Defaults to None.
+
+    Examples:
+        >>> dataset_type = 'ConcatDataset'
+        >>> sub_dataset_type = 'CocoDataset'
+        >>> data_root = 'data/coco/'
+        >>> sup_ann = '../coco_semi_annos/instances_train2017.1@10.json'
+        >>> unsup_ann = '../coco_semi_annos/' \
+        >>>             'instances_train2017.1@10-unlabeled.json'
+        >>> dataset = dict(type=dataset_type,
+        >>>     datasets=[
+        >>>         dict(
+        >>>             type=sub_dataset_type,
+        >>>             data_root=data_root,
+        >>>             ann_file=sup_ann,
+        >>>             data_prefix=dict(img='train2017/'),
+        >>>             filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        >>>             pipeline=sup_pipeline),
+        >>>         dict(
+        >>>             type=sub_dataset_type,
+        >>>             data_root=data_root,
+        >>>             ann_file=unsup_ann,
+        >>>             data_prefix=dict(img='train2017/'),
+        >>>             filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        >>>             pipeline=unsup_pipeline),
+        >>>         ])
+        >>>     train_dataloader = dict(
+        >>>         batch_size=5,
+        >>>         num_workers=5,
+        >>>         persistent_workers=True,
+        >>>         sampler=dict(type='MultiSourceSampler',
+        >>>             batch_size=5, source_ratio=[1, 4]),
+        >>>         batch_sampler=None,
+        >>>         dataset=dataset)
+    """
+
+    def __init__(self,
+                 dataset: Sized,
+                 batch_size: int,
+                 source_ratio: List[Union[int, float]],
+                 shuffle: bool = True,
+                 seed: Optional[int] = None) -> None:
+
+        assert hasattr(dataset, 'cumulative_sizes'),\
+            f'The dataset must be ConcatDataset, but get {dataset}'
+        assert isinstance(batch_size, int) and batch_size > 0, \
+            'batch_size must be a positive integer value, ' \
+            f'but got batch_size={batch_size}'
+        assert isinstance(source_ratio, list), \
+            f'source_ratio must be a list, but got source_ratio={source_ratio}'
+        assert len(source_ratio) == len(dataset.cumulative_sizes), \
+            'The length of source_ratio must be equal to ' \
+            f'the number of datasets, but got source_ratio={source_ratio}'
+
+        rank, world_size = get_dist_info()
+        self.rank = rank
+        self.world_size = world_size
+
+        self.dataset = dataset
+        self.cumulative_sizes = [0] + dataset.cumulative_sizes
+        self.batch_size = batch_size
+        self.source_ratio = source_ratio
+
+        self.num_per_source = [
+            int(batch_size * sr / sum(source_ratio)) for sr in source_ratio
+        ]
+        self.num_per_source[0] = batch_size - sum(self.num_per_source[1:])
+
+        assert sum(self.num_per_source) == batch_size, \
+            'The sum of num_per_source must be equal to ' \
+            f'batch_size, but get {self.num_per_source}'
+
+        self.seed = sync_random_seed() if seed is None else seed
+        self.shuffle = shuffle
+        self.source2inds = {
+            source: self._indices_of_rank(len(ds))
+            for source, ds in enumerate(dataset.datasets)
+        }
+
+    def _infinite_indices(self, sample_size: int) -> Iterator[int]:
+        """Infinitely yield a sequence of indices."""
+        g = torch.Generator()
+        g.manual_seed(self.seed)
+        while True:
+            if self.shuffle:
+                yield from torch.randperm(sample_size, generator=g).tolist()
+            else:
+                yield from torch.arange(sample_size).tolist()
+
+    def _indices_of_rank(self, sample_size: int) -> Iterator[int]:
+        """Slice the infinite indices by rank."""
+        yield from itertools.islice(
+            self._infinite_indices(sample_size), self.rank, None,
+            self.world_size)
+
+    def __iter__(self) -> Iterator[int]:
+        batch_buffer = []
+        while True:
+            for source, num in enumerate(self.num_per_source):
+                batch_buffer_per_source = []
+                for idx in self.source2inds[source]:
+                    idx += self.cumulative_sizes[source]
+                    batch_buffer_per_source.append(idx)
+                    if len(batch_buffer_per_source) == num:
+                        batch_buffer += batch_buffer_per_source
+                        break
+            yield from batch_buffer
+            batch_buffer = []
+
+    def __len__(self) -> int:
+        return len(self.dataset)
+
+    def set_epoch(self, epoch: int) -> None:
+        """Not supported in `epoch-based runner."""
+        pass
+
+
+@DATA_SAMPLERS.register_module()
+class GroupMultiSourceSampler(MultiSourceSampler):
+    r"""Group Multi-Source Infinite Sampler.
+
+    According to the sampling ratio, sample data from different
+    datasets but the same group to form batches.
+
+    Args:
+        dataset (Sized): The dataset.
+        batch_size (int): Size of mini-batch.
+        source_ratio (list[int | float]): The sampling ratio of different
+            source datasets in a mini-batch.
+        shuffle (bool): Whether shuffle the dataset or not. Defaults to True.
+        seed (int, optional): Random seed. If None, set a random seed.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 dataset: BaseDataset,
+                 batch_size: int,
+                 source_ratio: List[Union[int, float]],
+                 shuffle: bool = True,
+                 seed: Optional[int] = None) -> None:
+        super().__init__(
+            dataset=dataset,
+            batch_size=batch_size,
+            source_ratio=source_ratio,
+            shuffle=shuffle,
+            seed=seed)
+
+        self._get_source_group_info()
+        self.group_source2inds = [{
+            source:
+            self._indices_of_rank(self.group2size_per_source[source][group])
+            for source in range(len(dataset.datasets))
+        } for group in range(len(self.group_ratio))]
+
+    def _get_source_group_info(self) -> None:
+        self.group2size_per_source = [{0: 0, 1: 0}, {0: 0, 1: 0}]
+        self.group2inds_per_source = [{0: [], 1: []}, {0: [], 1: []}]
+        for source, dataset in enumerate(self.dataset.datasets):
+            for idx in range(len(dataset)):
+                data_info = dataset.get_data_info(idx)
+                width, height = data_info['width'], data_info['height']
+                group = 0 if width < height else 1
+                self.group2size_per_source[source][group] += 1
+                self.group2inds_per_source[source][group].append(idx)
+
+        self.group_sizes = np.zeros(2, dtype=np.int64)
+        for group2size in self.group2size_per_source:
+            for group, size in group2size.items():
+                self.group_sizes[group] += size
+        self.group_ratio = self.group_sizes / sum(self.group_sizes)
+
+    def __iter__(self) -> Iterator[int]:
+        batch_buffer = []
+        while True:
+            group = np.random.choice(
+                list(range(len(self.group_ratio))), p=self.group_ratio)
+            for source, num in enumerate(self.num_per_source):
+                batch_buffer_per_source = []
+                for idx in self.group_source2inds[group][source]:
+                    idx = self.group2inds_per_source[source][group][
+                        idx] + self.cumulative_sizes[source]
+                    batch_buffer_per_source.append(idx)
+                    if len(batch_buffer_per_source) == num:
+                        batch_buffer += batch_buffer_per_source
+                        break
+            yield from batch_buffer
+            batch_buffer = []
diff --git a/mmde/mmdet/datasets/samplers/track_img_sampler.py b/mmde/mmdet/datasets/samplers/track_img_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7db629f40f3f24bdf14cd852ccc4472d1d50f1b
--- /dev/null
+++ b/mmde/mmdet/datasets/samplers/track_img_sampler.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import random
+from typing import Iterator, Optional, Sized
+
+import numpy as np
+from mmengine.dataset import ClassBalancedDataset, ConcatDataset
+from mmengine.dist import get_dist_info, sync_random_seed
+from torch.utils.data import Sampler
+
+from mmdet.registry import DATA_SAMPLERS
+from ..base_video_dataset import BaseVideoDataset
+
+
+@DATA_SAMPLERS.register_module()
+class TrackImgSampler(Sampler):
+    """Sampler that providing image-level sampling outputs for video datasets
+    in tracking tasks. It could be both used in both distributed and
+    non-distributed environment.
+    If using the default sampler in pytorch, the subsequent data receiver will
+    get one video, which is not desired in some cases:
+    (Take a non-distributed environment as an example)
+    1. In test mode, we want only one image is fed into the data pipeline. This
+    is in consideration of memory usage since feeding the whole video commonly
+    requires a large amount of memory (>=20G on MOTChallenge17 dataset), which
+    is not available in some machines.
+    2. In training mode, we may want to make sure all the images in one video
+    are randomly sampled once in one epoch and this can not be guaranteed in
+    the default sampler in pytorch.
+
+    Args:
+        dataset (Sized): Dataset used for sampling.
+        seed (int, optional): random seed used to shuffle the sampler. This
+            number should be identical across all processes in the distributed
+            group. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        dataset: Sized,
+        seed: Optional[int] = None,
+    ) -> None:
+        rank, world_size = get_dist_info()
+        self.rank = rank
+        self.world_size = world_size
+        self.epoch = 0
+        if seed is None:
+            self.seed = sync_random_seed()
+        else:
+            self.seed = seed
+
+        self.dataset = dataset
+        self.indices = []
+        # Hard code here to handle different dataset wrapper
+        if isinstance(self.dataset, ConcatDataset):
+            cat_datasets = self.dataset.datasets
+            assert isinstance(
+                cat_datasets[0], BaseVideoDataset
+            ), f'expected BaseVideoDataset, but got {type(cat_datasets[0])}'
+            self.test_mode = cat_datasets[0].test_mode
+            assert not self.test_mode, "'ConcatDataset' should not exist in "
+            'test mode'
+            for dataset in cat_datasets:
+                num_videos = len(dataset)
+                for video_ind in range(num_videos):
+                    self.indices.extend([
+                        (video_ind, frame_ind) for frame_ind in range(
+                            dataset.get_len_per_video(video_ind))
+                    ])
+        elif isinstance(self.dataset, ClassBalancedDataset):
+            ori_dataset = self.dataset.dataset
+            assert isinstance(
+                ori_dataset, BaseVideoDataset
+            ), f'expected BaseVideoDataset, but got {type(ori_dataset)}'
+            self.test_mode = ori_dataset.test_mode
+            assert not self.test_mode, "'ClassBalancedDataset' should not "
+            'exist in test mode'
+            video_indices = self.dataset.repeat_indices
+            for index in video_indices:
+                self.indices.extend([(index, frame_ind) for frame_ind in range(
+                    ori_dataset.get_len_per_video(index))])
+        else:
+            assert isinstance(
+                self.dataset, BaseVideoDataset
+            ), 'TrackImgSampler is only supported in BaseVideoDataset or '
+            'dataset wrapper: ClassBalancedDataset and ConcatDataset, but '
+            f'got {type(self.dataset)} '
+            self.test_mode = self.dataset.test_mode
+            num_videos = len(self.dataset)
+
+            if self.test_mode:
+                # in test mode, the images belong to the same video must be put
+                # on the same device.
+                if num_videos < self.world_size:
+                    raise ValueError(f'only {num_videos} videos loaded,'
+                                     f'but {self.world_size} gpus were given.')
+                chunks = np.array_split(
+                    list(range(num_videos)), self.world_size)
+                for videos_inds in chunks:
+                    indices_chunk = []
+                    for video_ind in videos_inds:
+                        indices_chunk.extend([
+                            (video_ind, frame_ind) for frame_ind in range(
+                                self.dataset.get_len_per_video(video_ind))
+                        ])
+                    self.indices.append(indices_chunk)
+            else:
+                for video_ind in range(num_videos):
+                    self.indices.extend([
+                        (video_ind, frame_ind) for frame_ind in range(
+                            self.dataset.get_len_per_video(video_ind))
+                    ])
+
+        if self.test_mode:
+            self.num_samples = len(self.indices[self.rank])
+            self.total_size = sum(
+                [len(index_list) for index_list in self.indices])
+        else:
+            self.num_samples = int(
+                math.ceil(len(self.indices) * 1.0 / self.world_size))
+            self.total_size = self.num_samples * self.world_size
+
+    def __iter__(self) -> Iterator:
+        if self.test_mode:
+            # in test mode, the order of frames can not be shuffled.
+            indices = self.indices[self.rank]
+        else:
+            # deterministically shuffle based on epoch
+            rng = random.Random(self.epoch + self.seed)
+            indices = rng.sample(self.indices, len(self.indices))
+
+            # add extra samples to make it evenly divisible
+            indices += indices[:(self.total_size - len(indices))]
+            assert len(indices) == self.total_size
+
+            # subsample
+            indices = indices[self.rank:self.total_size:self.world_size]
+            assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
diff --git a/mmde/mmdet/datasets/transforms/__init__.py b/mmde/mmdet/datasets/transforms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab3478feb008443cb0e56bf5084261370e38327d
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/__init__.py
@@ -0,0 +1,45 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .augment_wrappers import AutoAugment, RandAugment
+from .colorspace import (AutoContrast, Brightness, Color, ColorTransform,
+                         Contrast, Equalize, Invert, Posterize, Sharpness,
+                         Solarize, SolarizeAdd)
+from .formatting import (ImageToTensor, PackDetInputs, PackReIDInputs,
+                         PackTrackInputs, ToTensor, Transpose)
+from .frame_sampling import BaseFrameSample, UniformRefFrameSample
+from .geometric import (GeomTransform, Rotate, ShearX, ShearY, TranslateX,
+                        TranslateY)
+from .instaboost import InstaBoost
+from .loading import (FilterAnnotations, InferencerLoader, LoadAnnotations,
+                      LoadEmptyAnnotations, LoadImageFromNDArray,
+                      LoadMultiChannelImageFromFiles, LoadPanopticAnnotations,
+                      LoadProposals, LoadTrackAnnotations)
+from .text_transformers import LoadTextAnnotations, RandomSamplingNegPos
+from .transformers_glip import GTBoxSubOne_GLIP, RandomFlip_GLIP
+from .transforms import (Albu, CachedMixUp, CachedMosaic, CopyPaste, CutOut,
+                         Expand, FixScaleResize, FixShapeResize,
+                         MinIoURandomCrop, MixUp, Mosaic, Pad,
+                         PhotoMetricDistortion, RandomAffine,
+                         RandomCenterCropPad, RandomCrop, RandomErasing,
+                         RandomFlip, RandomShift, Resize, ResizeShortestEdge,
+                         SegRescale, YOLOXHSVRandomAug)
+from .wrappers import MultiBranch, ProposalBroadcaster, RandomOrder
+
+__all__ = [
+    'PackDetInputs', 'ToTensor', 'ImageToTensor', 'Transpose',
+    'LoadImageFromNDArray', 'LoadAnnotations', 'LoadPanopticAnnotations',
+    'LoadMultiChannelImageFromFiles', 'LoadProposals', 'Resize', 'RandomFlip',
+    'RandomCrop', 'SegRescale', 'MinIoURandomCrop', 'Expand',
+    'PhotoMetricDistortion', 'Albu', 'InstaBoost', 'RandomCenterCropPad',
+    'AutoAugment', 'CutOut', 'ShearX', 'ShearY', 'Rotate', 'Color', 'Equalize',
+    'Brightness', 'Contrast', 'TranslateX', 'TranslateY', 'RandomShift',
+    'Mosaic', 'MixUp', 'RandomAffine', 'YOLOXHSVRandomAug', 'CopyPaste',
+    'FilterAnnotations', 'Pad', 'GeomTransform', 'ColorTransform',
+    'RandAugment', 'Sharpness', 'Solarize', 'SolarizeAdd', 'Posterize',
+    'AutoContrast', 'Invert', 'MultiBranch', 'RandomErasing',
+    'LoadEmptyAnnotations', 'RandomOrder', 'CachedMosaic', 'CachedMixUp',
+    'FixShapeResize', 'ProposalBroadcaster', 'InferencerLoader',
+    'LoadTrackAnnotations', 'BaseFrameSample', 'UniformRefFrameSample',
+    'PackTrackInputs', 'PackReIDInputs', 'FixScaleResize',
+    'ResizeShortestEdge', 'GTBoxSubOne_GLIP', 'RandomFlip_GLIP',
+    'RandomSamplingNegPos', 'LoadTextAnnotations'
+]
diff --git a/mmde/mmdet/datasets/transforms/augment_wrappers.py b/mmde/mmdet/datasets/transforms/augment_wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..19fae6efdf66aa4c26bb85a2f2c96a1e079320b8
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/augment_wrappers.py
@@ -0,0 +1,264 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Union
+
+import numpy as np
+from mmcv.transforms import RandomChoice
+from mmcv.transforms.utils import cache_randomness
+from mmengine.config import ConfigDict
+
+from mmdet.registry import TRANSFORMS
+
+# AutoAugment uses reinforcement learning to search for
+# some widely useful data augmentation strategies,
+# here we provide AUTOAUG_POLICIES_V0.
+# For AUTOAUG_POLICIES_V0, each tuple is an augmentation
+# operation of the form (operation, probability, magnitude).
+# Each element in policies is a policy that will be applied
+# sequentially on the image.
+
+# RandAugment defines a data augmentation search space, RANDAUG_SPACE,
+# sampling 1~3 data augmentations each time, and
+# setting the magnitude of each data augmentation randomly,
+# which will be applied sequentially on the image.
+
+_MAX_LEVEL = 10
+
+AUTOAUG_POLICIES_V0 = [
+    [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
+    [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
+    [('Color', 0.4, 1), ('Rotate', 0.6, 8)],
+    [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
+    [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
+    [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
+    [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
+    [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
+    [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
+    [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
+    [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
+    [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
+    [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
+    [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
+    [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
+    [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
+    [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
+    [('ShearY', 0.8, 0), ('Color', 0.6, 4)],
+    [('Color', 1.0, 0), ('Rotate', 0.6, 2)],
+    [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
+    [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
+    [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
+    [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
+    [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
+    [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
+]
+
+
+def policies_v0():
+    """Autoaugment policies that was used in AutoAugment Paper."""
+    policies = list()
+    for policy_args in AUTOAUG_POLICIES_V0:
+        policy = list()
+        for args in policy_args:
+            policy.append(dict(type=args[0], prob=args[1], level=args[2]))
+        policies.append(policy)
+    return policies
+
+
+RANDAUG_SPACE = [[dict(type='AutoContrast')], [dict(type='Equalize')],
+                 [dict(type='Invert')], [dict(type='Rotate')],
+                 [dict(type='Posterize')], [dict(type='Solarize')],
+                 [dict(type='SolarizeAdd')], [dict(type='Color')],
+                 [dict(type='Contrast')], [dict(type='Brightness')],
+                 [dict(type='Sharpness')], [dict(type='ShearX')],
+                 [dict(type='ShearY')], [dict(type='TranslateX')],
+                 [dict(type='TranslateY')]]
+
+
+def level_to_mag(level: Optional[int], min_mag: float,
+                 max_mag: float) -> float:
+    """Map from level to magnitude."""
+    if level is None:
+        return round(np.random.rand() * (max_mag - min_mag) + min_mag, 1)
+    else:
+        return round(level / _MAX_LEVEL * (max_mag - min_mag) + min_mag, 1)
+
+
+@TRANSFORMS.register_module()
+class AutoAugment(RandomChoice):
+    """Auto augmentation.
+
+    This data augmentation is proposed in `AutoAugment: Learning
+    Augmentation Policies from Data <https://arxiv.org/abs/1805.09501>`_
+    and in `Learning Data Augmentation Strategies for Object Detection
+    <https://arxiv.org/pdf/1906.11172>`_.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_bboxes_labels
+    - gt_masks
+    - gt_ignore_flags
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        policies (List[List[Union[dict, ConfigDict]]]):
+            The policies of auto augmentation.Each policy in ``policies``
+            is a specific augmentation policy, and is composed by several
+            augmentations. When AutoAugment is called, a random policy in
+            ``policies`` will be selected to augment images.
+            Defaults to policy_v0().
+        prob (list[float], optional): The probabilities associated
+            with each policy. The length should be equal to the policy
+            number and the sum should be 1. If not given, a uniform
+            distribution will be assumed. Defaults to None.
+
+    Examples:
+        >>> policies = [
+        >>>     [
+        >>>         dict(type='Sharpness', prob=0.0, level=8),
+        >>>         dict(type='ShearX', prob=0.4, level=0,)
+        >>>     ],
+        >>>     [
+        >>>         dict(type='Rotate', prob=0.6, level=10),
+        >>>         dict(type='Color', prob=1.0, level=6)
+        >>>     ]
+        >>> ]
+        >>> augmentation = AutoAugment(policies)
+        >>> img = np.ones(100, 100, 3)
+        >>> gt_bboxes = np.ones(10, 4)
+        >>> results = dict(img=img, gt_bboxes=gt_bboxes)
+        >>> results = augmentation(results)
+    """
+
+    def __init__(self,
+                 policies: List[List[Union[dict, ConfigDict]]] = policies_v0(),
+                 prob: Optional[List[float]] = None) -> None:
+        assert isinstance(policies, list) and len(policies) > 0, \
+            'Policies must be a non-empty list.'
+        for policy in policies:
+            assert isinstance(policy, list) and len(policy) > 0, \
+                'Each policy in policies must be a non-empty list.'
+            for augment in policy:
+                assert isinstance(augment, dict) and 'type' in augment, \
+                    'Each specific augmentation must be a dict with key' \
+                    ' "type".'
+        super().__init__(transforms=policies, prob=prob)
+        self.policies = policies
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}(policies={self.policies}, ' \
+               f'prob={self.prob})'
+
+
+@TRANSFORMS.register_module()
+class RandAugment(RandomChoice):
+    """Rand augmentation.
+
+    This data augmentation is proposed in `RandAugment:
+    Practical automated data augmentation with a reduced
+    search space <https://arxiv.org/abs/1909.13719>`_.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_bboxes_labels
+    - gt_masks
+    - gt_ignore_flags
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        aug_space (List[List[Union[dict, ConfigDict]]]): The augmentation space
+            of rand augmentation. Each augmentation transform in ``aug_space``
+            is a specific transform, and is composed by several augmentations.
+            When RandAugment is called, a random transform in ``aug_space``
+            will be selected to augment images. Defaults to aug_space.
+        aug_num (int): Number of augmentation to apply equentially.
+            Defaults to 2.
+        prob (list[float], optional): The probabilities associated with
+            each augmentation. The length should be equal to the
+            augmentation space and the sum should be 1. If not given,
+            a uniform distribution will be assumed. Defaults to None.
+
+    Examples:
+        >>> aug_space = [
+        >>>     dict(type='Sharpness'),
+        >>>     dict(type='ShearX'),
+        >>>     dict(type='Color'),
+        >>>     ],
+        >>> augmentation = RandAugment(aug_space)
+        >>> img = np.ones(100, 100, 3)
+        >>> gt_bboxes = np.ones(10, 4)
+        >>> results = dict(img=img, gt_bboxes=gt_bboxes)
+        >>> results = augmentation(results)
+    """
+
+    def __init__(self,
+                 aug_space: List[Union[dict, ConfigDict]] = RANDAUG_SPACE,
+                 aug_num: int = 2,
+                 prob: Optional[List[float]] = None) -> None:
+        assert isinstance(aug_space, list) and len(aug_space) > 0, \
+            'Augmentation space must be a non-empty list.'
+        for aug in aug_space:
+            assert isinstance(aug, list) and len(aug) == 1, \
+                'Each augmentation in aug_space must be a list.'
+            for transform in aug:
+                assert isinstance(transform, dict) and 'type' in transform, \
+                    'Each specific transform must be a dict with key' \
+                    ' "type".'
+        super().__init__(transforms=aug_space, prob=prob)
+        self.aug_space = aug_space
+        self.aug_num = aug_num
+
+    @cache_randomness
+    def random_pipeline_index(self):
+        indices = np.arange(len(self.transforms))
+        return np.random.choice(
+            indices, self.aug_num, p=self.prob, replace=False)
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to use RandAugment.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with RandAugment.
+        """
+        for idx in self.random_pipeline_index():
+            results = self.transforms[idx](results)
+        return results
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}(' \
+               f'aug_space={self.aug_space}, '\
+               f'aug_num={self.aug_num}, ' \
+               f'prob={self.prob})'
diff --git a/mmde/mmdet/datasets/transforms/colorspace.py b/mmde/mmdet/datasets/transforms/colorspace.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0ba2e97c7eedf65df5ab8942ee461f48a785f39
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/colorspace.py
@@ -0,0 +1,493 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Optional
+
+import mmcv
+import numpy as np
+from mmcv.transforms import BaseTransform
+from mmcv.transforms.utils import cache_randomness
+
+from mmdet.registry import TRANSFORMS
+from .augment_wrappers import _MAX_LEVEL, level_to_mag
+
+
+@TRANSFORMS.register_module()
+class ColorTransform(BaseTransform):
+    """Base class for color transformations. All color transformations need to
+    inherit from this base class. ``ColorTransform`` unifies the class
+    attributes and class functions of color transformations (Color, Brightness,
+    Contrast, Sharpness, Solarize, SolarizeAdd, Equalize, AutoContrast, Invert,
+    and Posterize), and only distort color channels, without impacting the
+    locations of the instances.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing the geometric
+            transformation and should be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for color transformation.
+            Defaults to 0.1.
+        max_mag (float): The maximum magnitude for color transformation.
+            Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        assert 0 <= prob <= 1.0, f'The probability of the transformation ' \
+                                 f'should be in range [0,1], got {prob}.'
+        assert level is None or isinstance(level, int), \
+            f'The level should be None or type int, got {type(level)}.'
+        assert level is None or 0 <= level <= _MAX_LEVEL, \
+            f'The level should be in range [0,{_MAX_LEVEL}], got {level}.'
+        assert isinstance(min_mag, float), \
+            f'min_mag should be type float, got {type(min_mag)}.'
+        assert isinstance(max_mag, float), \
+            f'max_mag should be type float, got {type(max_mag)}.'
+        assert min_mag <= max_mag, \
+            f'min_mag should smaller than max_mag, ' \
+            f'got min_mag={min_mag} and max_mag={max_mag}'
+        self.prob = prob
+        self.level = level
+        self.min_mag = min_mag
+        self.max_mag = max_mag
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Transform the image."""
+        pass
+
+    @cache_randomness
+    def _random_disable(self):
+        """Randomly disable the transform."""
+        return np.random.rand() > self.prob
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        return level_to_mag(self.level, self.min_mag, self.max_mag)
+
+    def transform(self, results: dict) -> dict:
+        """Transform function for images.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Transformed results.
+        """
+
+        if self._random_disable():
+            return results
+        mag = self._get_mag()
+        self._transform_img(results, mag)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, '
+        repr_str += f'level={self.level}, '
+        repr_str += f'min_mag={self.min_mag}, '
+        repr_str += f'max_mag={self.max_mag})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Color(ColorTransform):
+    """Adjust the color balance of the image, in a manner similar to the
+    controls on a colour TV set. A magnitude=0 gives a black & white image,
+    whereas magnitude=1 gives the original image. The bboxes, masks and
+    segmentations are not modified.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Color transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Color transformation.
+            Defaults to 0.1.
+        max_mag (float): The maximum magnitude for Color transformation.
+            Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        assert 0. <= min_mag <= 2.0, \
+            f'min_mag for Color should be in range [0,2], got {min_mag}.'
+        assert 0. <= max_mag <= 2.0, \
+            f'max_mag for Color should be in range [0,2], got {max_mag}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Apply Color transformation to image."""
+        # NOTE defaultly the image should be BGR format
+        img = results['img']
+        results['img'] = mmcv.adjust_color(img, mag).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class Brightness(ColorTransform):
+    """Adjust the brightness of the image. A magnitude=0 gives a black image,
+    whereas magnitude=1 gives the original image. The bboxes, masks and
+    segmentations are not modified.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Brightness transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Brightness transformation.
+            Defaults to 0.1.
+        max_mag (float): The maximum magnitude for Brightness transformation.
+            Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        assert 0. <= min_mag <= 2.0, \
+            f'min_mag for Brightness should be in range [0,2], got {min_mag}.'
+        assert 0. <= max_mag <= 2.0, \
+            f'max_mag for Brightness should be in range [0,2], got {max_mag}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Adjust the brightness of image."""
+        img = results['img']
+        results['img'] = mmcv.adjust_brightness(img, mag).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class Contrast(ColorTransform):
+    """Control the contrast of the image. A magnitude=0 gives a gray image,
+    whereas magnitude=1 gives the original imageThe bboxes, masks and
+    segmentations are not modified.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Contrast transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Contrast transformation.
+            Defaults to 0.1.
+        max_mag (float): The maximum magnitude for Contrast transformation.
+            Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        assert 0. <= min_mag <= 2.0, \
+            f'min_mag for Contrast should be in range [0,2], got {min_mag}.'
+        assert 0. <= max_mag <= 2.0, \
+            f'max_mag for Contrast should be in range [0,2], got {max_mag}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Adjust the image contrast."""
+        img = results['img']
+        results['img'] = mmcv.adjust_contrast(img, mag).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class Sharpness(ColorTransform):
+    """Adjust images sharpness. A positive magnitude would enhance the
+    sharpness and a negative magnitude would make the image blurry. A
+    magnitude=0 gives the origin img.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Sharpness transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Sharpness transformation.
+            Defaults to 0.1.
+        max_mag (float): The maximum magnitude for Sharpness transformation.
+            Defaults to 1.9.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.1,
+                 max_mag: float = 1.9) -> None:
+        assert 0. <= min_mag <= 2.0, \
+            f'min_mag for Sharpness should be in range [0,2], got {min_mag}.'
+        assert 0. <= max_mag <= 2.0, \
+            f'max_mag for Sharpness should be in range [0,2], got {max_mag}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Adjust the image sharpness."""
+        img = results['img']
+        results['img'] = mmcv.adjust_sharpness(img, mag).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class Solarize(ColorTransform):
+    """Solarize images (Invert all pixels above a threshold value of
+    magnitude.).
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Solarize transformation.
+            Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Solarize transformation.
+            Defaults to 0.0.
+        max_mag (float): The maximum magnitude for Solarize transformation.
+            Defaults to 256.0.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 256.0) -> None:
+        assert 0. <= min_mag <= 256.0, f'min_mag for Solarize should be ' \
+                                       f'in range [0, 256], got {min_mag}.'
+        assert 0. <= max_mag <= 256.0, f'max_mag for Solarize should be ' \
+                                       f'in range [0, 256], got {max_mag}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Invert all pixel values above magnitude."""
+        img = results['img']
+        results['img'] = mmcv.solarize(img, mag).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class SolarizeAdd(ColorTransform):
+    """SolarizeAdd images. For each pixel in the image that is less than 128,
+    add an additional amount to it decided by the magnitude.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing SolarizeAdd
+            transformation. Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for SolarizeAdd transformation.
+            Defaults to 0.0.
+        max_mag (float): The maximum magnitude for SolarizeAdd transformation.
+            Defaults to 110.0.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 110.0) -> None:
+        assert 0. <= min_mag <= 110.0, f'min_mag for SolarizeAdd should be ' \
+                                       f'in range [0, 110], got {min_mag}.'
+        assert 0. <= max_mag <= 110.0, f'max_mag for SolarizeAdd should be ' \
+                                       f'in range [0, 110], got {max_mag}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """SolarizeAdd the image."""
+        img = results['img']
+        img_solarized = np.where(img < 128, np.minimum(img + mag, 255), img)
+        results['img'] = img_solarized.astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class Posterize(ColorTransform):
+    """Posterize images (reduce the number of bits for each color channel).
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Posterize
+            transformation. Defaults to 1.0.
+        level (int, optional): Should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for Posterize transformation.
+            Defaults to 0.0.
+        max_mag (float): The maximum magnitude for Posterize transformation.
+            Defaults to 4.0.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 4.0) -> None:
+        assert 0. <= min_mag <= 8.0, f'min_mag for Posterize should be ' \
+                                     f'in range [0, 8], got {min_mag}.'
+        assert 0. <= max_mag <= 8.0, f'max_mag for Posterize should be ' \
+                                     f'in range [0, 8], got {max_mag}.'
+        super().__init__(
+            prob=prob, level=level, min_mag=min_mag, max_mag=max_mag)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Posterize the image."""
+        img = results['img']
+        results['img'] = mmcv.posterize(img, math.ceil(mag)).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class Equalize(ColorTransform):
+    """Equalize the image histogram. The bboxes, masks and segmentations are
+    not modified.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing Equalize transformation.
+            Defaults to 1.0.
+        level (int, optional): No use for Equalize transformation.
+            Defaults to None.
+        min_mag (float): No use for Equalize transformation. Defaults to 0.1.
+        max_mag (float): No use for Equalize transformation. Defaults to 1.9.
+    """
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Equalizes the histogram of one image."""
+        img = results['img']
+        results['img'] = mmcv.imequalize(img).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class AutoContrast(ColorTransform):
+    """Auto adjust image contrast.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing AutoContrast should
+             be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): No use for AutoContrast transformation.
+            Defaults to None.
+        min_mag (float): No use for AutoContrast transformation.
+            Defaults to 0.1.
+        max_mag (float): No use for AutoContrast transformation.
+            Defaults to 1.9.
+    """
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Auto adjust image contrast."""
+        img = results['img']
+        results['img'] = mmcv.auto_contrast(img).astype(img.dtype)
+
+
+@TRANSFORMS.register_module()
+class Invert(ColorTransform):
+    """Invert images.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing invert therefore should
+             be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): No use for Invert transformation.
+            Defaults to None.
+        min_mag (float): No use for Invert transformation. Defaults to 0.1.
+        max_mag (float): No use for Invert transformation. Defaults to 1.9.
+    """
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Invert the image."""
+        img = results['img']
+        results['img'] = mmcv.iminvert(img).astype(img.dtype)
diff --git a/mmde/mmdet/datasets/transforms/formatting.py b/mmde/mmdet/datasets/transforms/formatting.py
new file mode 100644
index 0000000000000000000000000000000000000000..05263807c0eab470b0c73f435d327ad8cadb60b3
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/formatting.py
@@ -0,0 +1,512 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence
+
+import numpy as np
+from mmcv.transforms import to_tensor
+from mmcv.transforms.base import BaseTransform
+from mmengine.structures import InstanceData, PixelData
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures import DetDataSample, ReIDDataSample, TrackDataSample
+from mmdet.structures.bbox import BaseBoxes
+
+
+@TRANSFORMS.register_module()
+class PackDetInputs(BaseTransform):
+    """Pack the inputs data for the detection / semantic segmentation /
+    panoptic segmentation.
+
+    The ``img_meta`` item is always populated.  The contents of the
+    ``img_meta`` dictionary depends on ``meta_keys``. By default this includes:
+
+        - ``img_id``: id of the image
+
+        - ``img_path``: path to the image file
+
+        - ``ori_shape``: original shape of the image as a tuple (h, w)
+
+        - ``img_shape``: shape of the image input to the network as a tuple \
+            (h, w).  Note that images may be zero padded on the \
+            bottom/right if the batch tensor is larger than this shape.
+
+        - ``scale_factor``: a float indicating the preprocessing scale
+
+        - ``flip``: a boolean indicating if image flip transform was used
+
+        - ``flip_direction``: the flipping direction
+
+    Args:
+        meta_keys (Sequence[str], optional): Meta keys to be converted to
+            ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
+            Default: ``('img_id', 'img_path', 'ori_shape', 'img_shape',
+            'scale_factor', 'flip', 'flip_direction')``
+    """
+    mapping_table = {
+        'gt_bboxes': 'bboxes',
+        'gt_bboxes_labels': 'labels',
+        'gt_masks': 'masks'
+    }
+
+    def __init__(self,
+                 meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                            'scale_factor', 'flip', 'flip_direction')):
+        self.meta_keys = meta_keys
+
+    def transform(self, results: dict) -> dict:
+        """Method to pack the input data.
+
+        Args:
+            results (dict): Result dict from the data pipeline.
+
+        Returns:
+            dict:
+
+            - 'inputs' (obj:`torch.Tensor`): The forward data of models.
+            - 'data_sample' (obj:`DetDataSample`): The annotation info of the
+                sample.
+        """
+        packed_results = dict()
+        if 'img' in results:
+            img = results['img']
+            if len(img.shape) < 3:
+                img = np.expand_dims(img, -1)
+            # To improve the computational speed by by 3-5 times, apply:
+            # If image is not contiguous, use
+            # `numpy.transpose()` followed by `numpy.ascontiguousarray()`
+            # If image is already contiguous, use
+            # `torch.permute()` followed by `torch.contiguous()`
+            # Refer to https://github.com/open-mmlab/mmdetection/pull/9533
+            # for more details
+            if not img.flags.c_contiguous:
+                img = np.ascontiguousarray(img.transpose(2, 0, 1))
+                img = to_tensor(img)
+            else:
+                img = to_tensor(img).permute(2, 0, 1).contiguous()
+
+            packed_results['inputs'] = img
+
+        if 'gt_ignore_flags' in results:
+            valid_idx = np.where(results['gt_ignore_flags'] == 0)[0]
+            ignore_idx = np.where(results['gt_ignore_flags'] == 1)[0]
+
+        data_sample = DetDataSample()
+        instance_data = InstanceData()
+        ignore_instance_data = InstanceData()
+
+        for key in self.mapping_table.keys():
+            if key not in results:
+                continue
+            if key == 'gt_masks' or isinstance(results[key], BaseBoxes):
+                if 'gt_ignore_flags' in results:
+                    instance_data[
+                        self.mapping_table[key]] = results[key][valid_idx]
+                    ignore_instance_data[
+                        self.mapping_table[key]] = results[key][ignore_idx]
+                else:
+                    instance_data[self.mapping_table[key]] = results[key]
+            else:
+                if 'gt_ignore_flags' in results:
+                    instance_data[self.mapping_table[key]] = to_tensor(
+                        results[key][valid_idx])
+                    ignore_instance_data[self.mapping_table[key]] = to_tensor(
+                        results[key][ignore_idx])
+                else:
+                    instance_data[self.mapping_table[key]] = to_tensor(
+                        results[key])
+        data_sample.gt_instances = instance_data
+        data_sample.ignored_instances = ignore_instance_data
+
+        if 'proposals' in results:
+            proposals = InstanceData(
+                bboxes=to_tensor(results['proposals']),
+                scores=to_tensor(results['proposals_scores']))
+            data_sample.proposals = proposals
+
+        if 'gt_seg_map' in results:
+            gt_sem_seg_data = dict(
+                sem_seg=to_tensor(results['gt_seg_map'][None, ...].copy()))
+            gt_sem_seg_data = PixelData(**gt_sem_seg_data)
+            if 'ignore_index' in results:
+                metainfo = dict(ignore_index=results['ignore_index'])
+                gt_sem_seg_data.set_metainfo(metainfo)
+            data_sample.gt_sem_seg = gt_sem_seg_data
+
+        img_meta = {}
+        for key in self.meta_keys:
+            if key in results:
+                img_meta[key] = results[key]
+        data_sample.set_metainfo(img_meta)
+        packed_results['data_samples'] = data_sample
+
+        return packed_results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(meta_keys={self.meta_keys})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ToTensor:
+    """Convert some results to :obj:`torch.Tensor` by given keys.
+
+    Args:
+        keys (Sequence[str]): Keys that need to be converted to Tensor.
+    """
+
+    def __init__(self, keys):
+        self.keys = keys
+
+    def __call__(self, results):
+        """Call function to convert data in results to :obj:`torch.Tensor`.
+
+        Args:
+            results (dict): Result dict contains the data to convert.
+
+        Returns:
+            dict: The result dict contains the data converted
+                to :obj:`torch.Tensor`.
+        """
+        for key in self.keys:
+            results[key] = to_tensor(results[key])
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + f'(keys={self.keys})'
+
+
+@TRANSFORMS.register_module()
+class ImageToTensor:
+    """Convert image to :obj:`torch.Tensor` by given keys.
+
+    The dimension order of input image is (H, W, C). The pipeline will convert
+    it to (C, H, W). If only 2 dimension (H, W) is given, the output would be
+    (1, H, W).
+
+    Args:
+        keys (Sequence[str]): Key of images to be converted to Tensor.
+    """
+
+    def __init__(self, keys):
+        self.keys = keys
+
+    def __call__(self, results):
+        """Call function to convert image in results to :obj:`torch.Tensor` and
+        transpose the channel order.
+
+        Args:
+            results (dict): Result dict contains the image data to convert.
+
+        Returns:
+            dict: The result dict contains the image converted
+                to :obj:`torch.Tensor` and permuted to (C, H, W) order.
+        """
+        for key in self.keys:
+            img = results[key]
+            if len(img.shape) < 3:
+                img = np.expand_dims(img, -1)
+            results[key] = to_tensor(img).permute(2, 0, 1).contiguous()
+
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + f'(keys={self.keys})'
+
+
+@TRANSFORMS.register_module()
+class Transpose:
+    """Transpose some results by given keys.
+
+    Args:
+        keys (Sequence[str]): Keys of results to be transposed.
+        order (Sequence[int]): Order of transpose.
+    """
+
+    def __init__(self, keys, order):
+        self.keys = keys
+        self.order = order
+
+    def __call__(self, results):
+        """Call function to transpose the channel order of data in results.
+
+        Args:
+            results (dict): Result dict contains the data to transpose.
+
+        Returns:
+            dict: The result dict contains the data transposed to \
+                ``self.order``.
+        """
+        for key in self.keys:
+            results[key] = results[key].transpose(self.order)
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + \
+            f'(keys={self.keys}, order={self.order})'
+
+
+@TRANSFORMS.register_module()
+class WrapFieldsToLists:
+    """Wrap fields of the data dictionary into lists for evaluation.
+
+    This class can be used as a last step of a test or validation
+    pipeline for single image evaluation or inference.
+
+    Example:
+        >>> test_pipeline = [
+        >>>    dict(type='LoadImageFromFile'),
+        >>>    dict(type='Normalize',
+                    mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True),
+        >>>    dict(type='Pad', size_divisor=32),
+        >>>    dict(type='ImageToTensor', keys=['img']),
+        >>>    dict(type='Collect', keys=['img']),
+        >>>    dict(type='WrapFieldsToLists')
+        >>> ]
+    """
+
+    def __call__(self, results):
+        """Call function to wrap fields into lists.
+
+        Args:
+            results (dict): Result dict contains the data to wrap.
+
+        Returns:
+            dict: The result dict where value of ``self.keys`` are wrapped \
+                into list.
+        """
+
+        # Wrap dict fields into lists
+        for key, val in results.items():
+            results[key] = [val]
+        return results
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}()'
+
+
+@TRANSFORMS.register_module()
+class PackTrackInputs(BaseTransform):
+    """Pack the inputs data for the multi object tracking and video instance
+    segmentation. All the information of images are packed to ``inputs``. All
+    the information except images are packed to ``data_samples``. In order to
+    get the original annotaiton and meta info, we add `instances` key into meta
+    keys.
+
+    Args:
+        meta_keys (Sequence[str]): Meta keys to be collected in
+            ``data_sample.metainfo``. Defaults to None.
+        default_meta_keys (tuple): Default meta keys. Defaults to ('img_id',
+            'img_path', 'ori_shape', 'img_shape', 'scale_factor',
+            'flip', 'flip_direction', 'frame_id', 'is_video_data',
+            'video_id', 'video_length', 'instances').
+    """
+    mapping_table = {
+        'gt_bboxes': 'bboxes',
+        'gt_bboxes_labels': 'labels',
+        'gt_masks': 'masks',
+        'gt_instances_ids': 'instances_ids'
+    }
+
+    def __init__(self,
+                 meta_keys: Optional[dict] = None,
+                 default_meta_keys: tuple = ('img_id', 'img_path', 'ori_shape',
+                                             'img_shape', 'scale_factor',
+                                             'flip', 'flip_direction',
+                                             'frame_id', 'video_id',
+                                             'video_length',
+                                             'ori_video_length', 'instances')):
+        self.meta_keys = default_meta_keys
+        if meta_keys is not None:
+            if isinstance(meta_keys, str):
+                meta_keys = (meta_keys, )
+            else:
+                assert isinstance(meta_keys, tuple), \
+                    'meta_keys must be str or tuple'
+            self.meta_keys += meta_keys
+
+    def transform(self, results: dict) -> dict:
+        """Method to pack the input data.
+        Args:
+            results (dict): Result dict from the data pipeline.
+        Returns:
+            dict:
+            - 'inputs' (dict[Tensor]): The forward data of models.
+            - 'data_samples' (obj:`TrackDataSample`): The annotation info of
+                the samples.
+        """
+        packed_results = dict()
+        packed_results['inputs'] = dict()
+
+        # 1. Pack images
+        if 'img' in results:
+            imgs = results['img']
+            imgs = np.stack(imgs, axis=0)
+            imgs = imgs.transpose(0, 3, 1, 2)
+            packed_results['inputs'] = to_tensor(imgs)
+
+        # 2. Pack InstanceData
+        if 'gt_ignore_flags' in results:
+            gt_ignore_flags_list = results['gt_ignore_flags']
+            valid_idx_list, ignore_idx_list = [], []
+            for gt_ignore_flags in gt_ignore_flags_list:
+                valid_idx = np.where(gt_ignore_flags == 0)[0]
+                ignore_idx = np.where(gt_ignore_flags == 1)[0]
+                valid_idx_list.append(valid_idx)
+                ignore_idx_list.append(ignore_idx)
+
+        assert 'img_id' in results, "'img_id' must contained in the results "
+        'for counting the number of images'
+
+        num_imgs = len(results['img_id'])
+        instance_data_list = [InstanceData() for _ in range(num_imgs)]
+        ignore_instance_data_list = [InstanceData() for _ in range(num_imgs)]
+
+        for key in self.mapping_table.keys():
+            if key not in results:
+                continue
+            if key == 'gt_masks':
+                mapped_key = self.mapping_table[key]
+                gt_masks_list = results[key]
+                if 'gt_ignore_flags' in results:
+                    for i, gt_mask in enumerate(gt_masks_list):
+                        valid_idx, ignore_idx = valid_idx_list[
+                            i], ignore_idx_list[i]
+                        instance_data_list[i][mapped_key] = gt_mask[valid_idx]
+                        ignore_instance_data_list[i][mapped_key] = gt_mask[
+                            ignore_idx]
+
+                else:
+                    for i, gt_mask in enumerate(gt_masks_list):
+                        instance_data_list[i][mapped_key] = gt_mask
+
+            else:
+                anns_list = results[key]
+                if 'gt_ignore_flags' in results:
+                    for i, ann in enumerate(anns_list):
+                        valid_idx, ignore_idx = valid_idx_list[
+                            i], ignore_idx_list[i]
+                        instance_data_list[i][
+                            self.mapping_table[key]] = to_tensor(
+                                ann[valid_idx])
+                        ignore_instance_data_list[i][
+                            self.mapping_table[key]] = to_tensor(
+                                ann[ignore_idx])
+                else:
+                    for i, ann in enumerate(anns_list):
+                        instance_data_list[i][
+                            self.mapping_table[key]] = to_tensor(ann)
+
+        det_data_samples_list = []
+        for i in range(num_imgs):
+            det_data_sample = DetDataSample()
+            det_data_sample.gt_instances = instance_data_list[i]
+            det_data_sample.ignored_instances = ignore_instance_data_list[i]
+            det_data_samples_list.append(det_data_sample)
+
+        # 3. Pack metainfo
+        for key in self.meta_keys:
+            if key not in results:
+                continue
+            img_metas_list = results[key]
+            for i, img_meta in enumerate(img_metas_list):
+                det_data_samples_list[i].set_metainfo({f'{key}': img_meta})
+
+        track_data_sample = TrackDataSample()
+        track_data_sample.video_data_samples = det_data_samples_list
+        if 'key_frame_flags' in results:
+            key_frame_flags = np.asarray(results['key_frame_flags'])
+            key_frames_inds = np.where(key_frame_flags)[0].tolist()
+            ref_frames_inds = np.where(~key_frame_flags)[0].tolist()
+            track_data_sample.set_metainfo(
+                dict(key_frames_inds=key_frames_inds))
+            track_data_sample.set_metainfo(
+                dict(ref_frames_inds=ref_frames_inds))
+
+        packed_results['data_samples'] = track_data_sample
+        return packed_results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'meta_keys={self.meta_keys}, '
+        repr_str += f'default_meta_keys={self.default_meta_keys})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class PackReIDInputs(BaseTransform):
+    """Pack the inputs data for the ReID. The ``meta_info`` item is always
+    populated. The contents of the ``meta_info`` dictionary depends on
+    ``meta_keys``. By default this includes:
+
+        - ``img_path``: path to the image file.
+        - ``ori_shape``: original shape of the image as a tuple (H, W).
+        - ``img_shape``: shape of the image input to the network as a tuple
+            (H, W). Note that images may be zero padded on the bottom/right
+          if the batch tensor is larger than this shape.
+        - ``scale``: scale of the image as a tuple (W, H).
+        - ``scale_factor``: a float indicating the pre-processing scale.
+        -  ``flip``: a boolean indicating if image flip transform was used.
+        - ``flip_direction``: the flipping direction.
+    Args:
+        meta_keys (Sequence[str], optional): The meta keys to saved in the
+            ``metainfo`` of the packed ``data_sample``.
+    """
+    default_meta_keys = ('img_path', 'ori_shape', 'img_shape', 'scale',
+                         'scale_factor')
+
+    def __init__(self, meta_keys: Sequence[str] = ()) -> None:
+        self.meta_keys = self.default_meta_keys
+        if meta_keys is not None:
+            if isinstance(meta_keys, str):
+                meta_keys = (meta_keys, )
+            else:
+                assert isinstance(meta_keys, tuple), \
+                    'meta_keys must be str or tuple.'
+            self.meta_keys += meta_keys
+
+    def transform(self, results: dict) -> dict:
+        """Method to pack the input data.
+        Args:
+            results (dict): Result dict from the data pipeline.
+        Returns:
+            dict:
+            - 'inputs' (dict[Tensor]): The forward data of models.
+            - 'data_samples' (obj:`ReIDDataSample`): The meta info of the
+                sample.
+        """
+        packed_results = dict(inputs=dict(), data_samples=None)
+        assert 'img' in results, 'Missing the key ``img``.'
+        _type = type(results['img'])
+        label = results['gt_label']
+
+        if _type == list:
+            img = results['img']
+            label = np.stack(label, axis=0)  # (N,)
+            assert all([type(v) == _type for v in results.values()]), \
+                'All items in the results must have the same type.'
+        else:
+            img = [results['img']]
+
+        img = np.stack(img, axis=3)  # (H, W, C, N)
+        img = img.transpose(3, 2, 0, 1)  # (N, C, H, W)
+        img = np.ascontiguousarray(img)
+
+        packed_results['inputs'] = to_tensor(img)
+
+        data_sample = ReIDDataSample()
+        data_sample.set_gt_label(label)
+
+        meta_info = dict()
+        for key in self.meta_keys:
+            meta_info[key] = results[key]
+        data_sample.set_metainfo(meta_info)
+        packed_results['data_samples'] = data_sample
+
+        return packed_results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(meta_keys={self.meta_keys})'
+        return repr_str
diff --git a/mmde/mmdet/datasets/transforms/frame_sampling.py b/mmde/mmdet/datasets/transforms/frame_sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..a91f1e7880f8f061f183dc30a01758d97b7d03da
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/frame_sampling.py
@@ -0,0 +1,177 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+from collections import defaultdict
+from typing import Dict, List, Optional, Union
+
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class BaseFrameSample(BaseTransform):
+    """Directly get the key frame, no reference frames.
+
+    Args:
+        collect_video_keys (list[str]): The keys of video info to be
+            collected.
+    """
+
+    def __init__(self,
+                 collect_video_keys: List[str] = ['video_id', 'video_length']):
+        self.collect_video_keys = collect_video_keys
+
+    def prepare_data(self, video_infos: dict,
+                     sampled_inds: List[int]) -> Dict[str, List]:
+        """Prepare data for the subsequent pipeline.
+
+        Args:
+            video_infos (dict): The whole video information.
+            sampled_inds (list[int]): The sampled frame indices.
+
+        Returns:
+            dict: The processed data information.
+        """
+        frames_anns = video_infos['images']
+        final_data_info = defaultdict(list)
+        # for data in frames_anns:
+        for index in sampled_inds:
+            data = frames_anns[index]
+            # copy the info in video-level into img-level
+            for key in self.collect_video_keys:
+                if key == 'video_length':
+                    data['ori_video_length'] = video_infos[key]
+                    data['video_length'] = len(sampled_inds)
+                else:
+                    data[key] = video_infos[key]
+            # Collate data_list (list of dict to dict of list)
+            for key, value in data.items():
+                final_data_info[key].append(value)
+
+        return final_data_info
+
+    def transform(self, video_infos: dict) -> Optional[Dict[str, List]]:
+        """Transform the video information.
+
+        Args:
+            video_infos (dict): The whole video information.
+
+        Returns:
+            dict: The data information of the key frames.
+        """
+        if 'key_frame_id' in video_infos:
+            key_frame_id = video_infos['key_frame_id']
+            assert isinstance(video_infos['key_frame_id'], int)
+        else:
+            key_frame_id = random.sample(
+                list(range(video_infos['video_length'])), 1)[0]
+        results = self.prepare_data(video_infos, [key_frame_id])
+
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(collect_video_keys={self.collect_video_keys})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class UniformRefFrameSample(BaseFrameSample):
+    """Uniformly sample reference frames.
+
+    Args:
+        num_ref_imgs (int): Number of reference frames to be sampled.
+        frame_range (int | list[int]): Range of frames to be sampled around
+            key frame. If int, the range is [-frame_range, frame_range].
+            Defaults to 10.
+        filter_key_img (bool): Whether to filter the key frame when
+            sampling reference frames. Defaults to True.
+        collect_video_keys (list[str]): The keys of video info to be
+            collected.
+    """
+
+    def __init__(self,
+                 num_ref_imgs: int = 1,
+                 frame_range: Union[int, List[int]] = 10,
+                 filter_key_img: bool = True,
+                 collect_video_keys: List[str] = ['video_id', 'video_length']):
+        self.num_ref_imgs = num_ref_imgs
+        self.filter_key_img = filter_key_img
+        if isinstance(frame_range, int):
+            assert frame_range >= 0, 'frame_range can not be a negative value.'
+            frame_range = [-frame_range, frame_range]
+        elif isinstance(frame_range, list):
+            assert len(frame_range) == 2, 'The length must be 2.'
+            assert frame_range[0] <= 0 and frame_range[1] >= 0
+            for i in frame_range:
+                assert isinstance(i, int), 'Each element must be int.'
+        else:
+            raise TypeError('The type of frame_range must be int or list.')
+        self.frame_range = frame_range
+        super().__init__(collect_video_keys=collect_video_keys)
+
+    def sampling_frames(self, video_length: int, key_frame_id: int):
+        """Sampling frames.
+
+        Args:
+            video_length (int): The length of the video.
+            key_frame_id (int): The key frame id.
+
+        Returns:
+            list[int]: The sampled frame indices.
+        """
+        if video_length > 1:
+            left = max(0, key_frame_id + self.frame_range[0])
+            right = min(key_frame_id + self.frame_range[1], video_length - 1)
+            frame_ids = list(range(0, video_length))
+
+            valid_ids = frame_ids[left:right + 1]
+            if self.filter_key_img and key_frame_id in valid_ids:
+                valid_ids.remove(key_frame_id)
+            assert len(
+                valid_ids
+            ) > 0, 'After filtering key frame, there are no valid frames'
+            if len(valid_ids) < self.num_ref_imgs:
+                valid_ids = valid_ids * self.num_ref_imgs
+            ref_frame_ids = random.sample(valid_ids, self.num_ref_imgs)
+        else:
+            ref_frame_ids = [key_frame_id] * self.num_ref_imgs
+
+        sampled_frames_ids = [key_frame_id] + ref_frame_ids
+        sampled_frames_ids = sorted(sampled_frames_ids)
+
+        key_frames_ind = sampled_frames_ids.index(key_frame_id)
+        key_frame_flags = [False] * len(sampled_frames_ids)
+        key_frame_flags[key_frames_ind] = True
+        return sampled_frames_ids, key_frame_flags
+
+    def transform(self, video_infos: dict) -> Optional[Dict[str, List]]:
+        """Transform the video information.
+
+        Args:
+            video_infos (dict): The whole video information.
+
+        Returns:
+            dict: The data information of the sampled frames.
+        """
+        if 'key_frame_id' in video_infos:
+            key_frame_id = video_infos['key_frame_id']
+            assert isinstance(video_infos['key_frame_id'], int)
+        else:
+            key_frame_id = random.sample(
+                list(range(video_infos['video_length'])), 1)[0]
+
+        (sampled_frames_ids, key_frame_flags) = self.sampling_frames(
+            video_infos['video_length'], key_frame_id=key_frame_id)
+        results = self.prepare_data(video_infos, sampled_frames_ids)
+        results['key_frame_flags'] = key_frame_flags
+
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_ref_imgs={self.num_ref_imgs}, '
+        repr_str += f'frame_range={self.frame_range}, '
+        repr_str += f'filter_key_img={self.filter_key_img}, '
+        repr_str += f'collect_video_keys={self.collect_video_keys})'
+        return repr_str
diff --git a/mmde/mmdet/datasets/transforms/geometric.py b/mmde/mmdet/datasets/transforms/geometric.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2cd6be258f73a69aa2c2b36fef64c6c4e46a2a4
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/geometric.py
@@ -0,0 +1,754 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from typing import Optional, Union
+
+import cv2
+import mmcv
+import numpy as np
+from mmcv.transforms import BaseTransform
+from mmcv.transforms.utils import cache_randomness
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures.bbox import autocast_box_type
+from .augment_wrappers import _MAX_LEVEL, level_to_mag
+
+
+@TRANSFORMS.register_module()
+class GeomTransform(BaseTransform):
+    """Base class for geometric transformations. All geometric transformations
+    need to inherit from this base class. ``GeomTransform`` unifies the class
+    attributes and class functions of geometric transformations (ShearX,
+    ShearY, Rotate, TranslateX, and TranslateY), and records the homography
+    matrix.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for performing the geometric
+            transformation and should be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for geometric transformation.
+            Defaults to 0.0.
+        max_mag (float): The maximum magnitude for geometric transformation.
+            Defaults to 1.0.
+        reversal_prob (float): The probability that reverses the geometric
+            transformation magnitude. Should be in range [0,1].
+            Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 1.0,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0 <= prob <= 1.0, f'The probability of the transformation ' \
+                                 f'should be in range [0,1], got {prob}.'
+        assert level is None or isinstance(level, int), \
+            f'The level should be None or type int, got {type(level)}.'
+        assert level is None or 0 <= level <= _MAX_LEVEL, \
+            f'The level should be in range [0,{_MAX_LEVEL}], got {level}.'
+        assert isinstance(min_mag, float), \
+            f'min_mag should be type float, got {type(min_mag)}.'
+        assert isinstance(max_mag, float), \
+            f'max_mag should be type float, got {type(max_mag)}.'
+        assert min_mag <= max_mag, \
+            f'min_mag should smaller than max_mag, ' \
+            f'got min_mag={min_mag} and max_mag={max_mag}'
+        assert isinstance(reversal_prob, float), \
+            f'reversal_prob should be type float, got {type(max_mag)}.'
+        assert 0 <= reversal_prob <= 1.0, \
+            f'The reversal probability of the transformation magnitude ' \
+            f'should be type float, got {type(reversal_prob)}.'
+        if isinstance(img_border_value, (float, int)):
+            img_border_value = tuple([float(img_border_value)] * 3)
+        elif isinstance(img_border_value, tuple):
+            assert len(img_border_value) == 3, \
+                f'img_border_value as tuple must have 3 elements, ' \
+                f'got {len(img_border_value)}.'
+            img_border_value = tuple([float(val) for val in img_border_value])
+        else:
+            raise ValueError(
+                'img_border_value must be float or tuple with 3 elements.')
+        assert np.all([0 <= val <= 255 for val in img_border_value]), 'all ' \
+            'elements of img_border_value should between range [0,255].' \
+            f'got {img_border_value}.'
+        self.prob = prob
+        self.level = level
+        self.min_mag = min_mag
+        self.max_mag = max_mag
+        self.reversal_prob = reversal_prob
+        self.img_border_value = img_border_value
+        self.mask_border_value = mask_border_value
+        self.seg_ignore_label = seg_ignore_label
+        self.interpolation = interpolation
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Transform the image."""
+        pass
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Transform the masks."""
+        pass
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Transform the segmentation map."""
+        pass
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for the geometric transformation."""
+        return np.eye(3, dtype=np.float32)
+
+    def _transform_bboxes(self, results: dict, mag: float) -> None:
+        """Transform the bboxes."""
+        results['gt_bboxes'].project_(self.homography_matrix)
+        results['gt_bboxes'].clip_(results['img_shape'])
+
+    def _record_homography_matrix(self, results: dict) -> None:
+        """Record the homography matrix for the geometric transformation."""
+        if results.get('homography_matrix', None) is None:
+            results['homography_matrix'] = self.homography_matrix
+        else:
+            results['homography_matrix'] = self.homography_matrix @ results[
+                'homography_matrix']
+
+    @cache_randomness
+    def _random_disable(self):
+        """Randomly disable the transform."""
+        return np.random.rand() > self.prob
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
+        return -mag if np.random.rand() > self.reversal_prob else mag
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function for images, bounding boxes, masks and semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Transformed results.
+        """
+
+        if self._random_disable():
+            return results
+        mag = self._get_mag()
+        self.homography_matrix = self._get_homography_matrix(results, mag)
+        self._record_homography_matrix(results)
+        self._transform_img(results, mag)
+        if results.get('gt_bboxes', None) is not None:
+            self._transform_bboxes(results, mag)
+        if results.get('gt_masks', None) is not None:
+            self._transform_masks(results, mag)
+        if results.get('gt_seg_map', None) is not None:
+            self._transform_seg(results, mag)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, '
+        repr_str += f'level={self.level}, '
+        repr_str += f'min_mag={self.min_mag}, '
+        repr_str += f'max_mag={self.max_mag}, '
+        repr_str += f'reversal_prob={self.reversal_prob}, '
+        repr_str += f'img_border_value={self.img_border_value}, '
+        repr_str += f'mask_border_value={self.mask_border_value}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
+        repr_str += f'interpolation={self.interpolation})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ShearX(GeomTransform):
+    """Shear the images, bboxes, masks and segmentation map horizontally.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for performing Shear and should be in
+            range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum angle for the horizontal shear.
+            Defaults to 0.0.
+        max_mag (float): The maximum angle for the horizontal shear.
+            Defaults to 30.0.
+        reversal_prob (float): The probability that reverses the horizontal
+            shear magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 30.0,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 90., \
+            f'min_mag angle for ShearX should be ' \
+            f'in range [0, 90], got {min_mag}.'
+        assert 0. <= max_mag <= 90., \
+            f'max_mag angle for ShearX should be ' \
+            f'in range [0, 90], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
+        mag = np.tan(mag * np.pi / 180)
+        return -mag if np.random.rand() > self.reversal_prob else mag
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for ShearX."""
+        return np.array([[1, mag, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Shear the image horizontally."""
+        results['img'] = mmcv.imshear(
+            results['img'],
+            mag,
+            direction='horizontal',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Shear the masks horizontally."""
+        results['gt_masks'] = results['gt_masks'].shear(
+            results['img_shape'],
+            mag,
+            direction='horizontal',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Shear the segmentation map horizontally."""
+        results['gt_seg_map'] = mmcv.imshear(
+            results['gt_seg_map'],
+            mag,
+            direction='horizontal',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class ShearY(GeomTransform):
+    """Shear the images, bboxes, masks and segmentation map vertically.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for performing ShearY and should be in
+            range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0,_MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum angle for the vertical shear.
+            Defaults to 0.0.
+        max_mag (float): The maximum angle for the vertical shear.
+            Defaults to 30.0.
+        reversal_prob (float): The probability that reverses the vertical
+            shear magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 30.,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 90., \
+            f'min_mag angle for ShearY should be ' \
+            f'in range [0, 90], got {min_mag}.'
+        assert 0. <= max_mag <= 90., \
+            f'max_mag angle for ShearY should be ' \
+            f'in range [0, 90], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
+        mag = np.tan(mag * np.pi / 180)
+        return -mag if np.random.rand() > self.reversal_prob else mag
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for ShearY."""
+        return np.array([[1, 0, 0], [mag, 1, 0], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Shear the image vertically."""
+        results['img'] = mmcv.imshear(
+            results['img'],
+            mag,
+            direction='vertical',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Shear the masks vertically."""
+        results['gt_masks'] = results['gt_masks'].shear(
+            results['img_shape'],
+            mag,
+            direction='vertical',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Shear the segmentation map vertically."""
+        results['gt_seg_map'] = mmcv.imshear(
+            results['gt_seg_map'],
+            mag,
+            direction='vertical',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class Rotate(GeomTransform):
+    """Rotate the images, bboxes, masks and segmentation map.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for perform transformation and
+            should be in range 0 to 1. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The maximum angle for rotation.
+            Defaults to 0.0.
+        max_mag (float): The maximum angle for rotation.
+            Defaults to 30.0.
+        reversal_prob (float): The probability that reverses the rotation
+            magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 30.0,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 180., \
+            f'min_mag for Rotate should be in range [0,180], got {min_mag}.'
+        assert 0. <= max_mag <= 180., \
+            f'max_mag for Rotate should be in range [0,180], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for Rotate."""
+        img_shape = results['img_shape']
+        center = ((img_shape[1] - 1) * 0.5, (img_shape[0] - 1) * 0.5)
+        cv2_rotation_matrix = cv2.getRotationMatrix2D(center, -mag, 1.0)
+        return np.concatenate(
+            [cv2_rotation_matrix,
+             np.array([0, 0, 1]).reshape((1, 3))]).astype(np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Rotate the image."""
+        results['img'] = mmcv.imrotate(
+            results['img'],
+            mag,
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Rotate the masks."""
+        results['gt_masks'] = results['gt_masks'].rotate(
+            results['img_shape'],
+            mag,
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Rotate the segmentation map."""
+        results['gt_seg_map'] = mmcv.imrotate(
+            results['gt_seg_map'],
+            mag,
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class TranslateX(GeomTransform):
+    """Translate the images, bboxes, masks and segmentation map horizontally.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for perform transformation and
+            should be in range 0 to 1. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum pixel's offset ratio for horizontal
+            translation. Defaults to 0.0.
+        max_mag (float): The maximum pixel's offset ratio for horizontal
+            translation. Defaults to 0.1.
+        reversal_prob (float): The probability that reverses the horizontal
+            translation magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 0.1,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 1., \
+            f'min_mag ratio for TranslateX should be ' \
+            f'in range [0, 1], got {min_mag}.'
+        assert 0. <= max_mag <= 1., \
+            f'max_mag ratio for TranslateX should be ' \
+            f'in range [0, 1], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for TranslateX."""
+        mag = int(results['img_shape'][1] * mag)
+        return np.array([[1, 0, mag], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Translate the image horizontally."""
+        mag = int(results['img_shape'][1] * mag)
+        results['img'] = mmcv.imtranslate(
+            results['img'],
+            mag,
+            direction='horizontal',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Translate the masks horizontally."""
+        mag = int(results['img_shape'][1] * mag)
+        results['gt_masks'] = results['gt_masks'].translate(
+            results['img_shape'],
+            mag,
+            direction='horizontal',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Translate the segmentation map horizontally."""
+        mag = int(results['img_shape'][1] * mag)
+        results['gt_seg_map'] = mmcv.imtranslate(
+            results['gt_seg_map'],
+            mag,
+            direction='horizontal',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class TranslateY(GeomTransform):
+    """Translate the images, bboxes, masks and segmentation map vertically.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for perform transformation and
+            should be in range 0 to 1. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum pixel's offset ratio for vertical
+            translation. Defaults to 0.0.
+        max_mag (float): The maximum pixel's offset ratio for vertical
+            translation. Defaults to 0.1.
+        reversal_prob (float): The probability that reverses the vertical
+            translation magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 0.1,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 1., \
+            f'min_mag ratio for TranslateY should be ' \
+            f'in range [0,1], got {min_mag}.'
+        assert 0. <= max_mag <= 1., \
+            f'max_mag ratio for TranslateY should be ' \
+            f'in range [0,1], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for TranslateY."""
+        mag = int(results['img_shape'][0] * mag)
+        return np.array([[1, 0, 0], [0, 1, mag], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Translate the image vertically."""
+        mag = int(results['img_shape'][0] * mag)
+        results['img'] = mmcv.imtranslate(
+            results['img'],
+            mag,
+            direction='vertical',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Translate masks vertically."""
+        mag = int(results['img_shape'][0] * mag)
+        results['gt_masks'] = results['gt_masks'].translate(
+            results['img_shape'],
+            mag,
+            direction='vertical',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Translate segmentation map vertically."""
+        mag = int(results['img_shape'][0] * mag)
+        results['gt_seg_map'] = mmcv.imtranslate(
+            results['gt_seg_map'],
+            mag,
+            direction='vertical',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
diff --git a/mmde/mmdet/datasets/transforms/instaboost.py b/mmde/mmdet/datasets/transforms/instaboost.py
new file mode 100644
index 0000000000000000000000000000000000000000..30dc1603643ec8d398bfade95f5ec1c9b8f89c8d
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/instaboost.py
@@ -0,0 +1,150 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import numpy as np
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class InstaBoost(BaseTransform):
+    r"""Data augmentation method in `InstaBoost: Boosting Instance
+    Segmentation Via Probability Map Guided Copy-Pasting
+    <https://arxiv.org/abs/1908.07801>`_.
+
+    Refer to https://github.com/GothicAi/Instaboost for implementation details.
+
+
+    Required Keys:
+
+    - img (np.uint8)
+    - instances
+
+    Modified Keys:
+
+    - img (np.uint8)
+    - instances
+
+    Args:
+        action_candidate (tuple): Action candidates. "normal", "horizontal", \
+            "vertical", "skip" are supported. Defaults to ('normal', \
+            'horizontal', 'skip').
+        action_prob (tuple): Corresponding action probabilities. Should be \
+            the same length as action_candidate. Defaults to (1, 0, 0).
+        scale (tuple): (min scale, max scale). Defaults to (0.8, 1.2).
+        dx (int): The maximum x-axis shift will be (instance width) / dx.
+            Defaults to 15.
+        dy (int): The maximum y-axis shift will be (instance height) / dy.
+            Defaults to 15.
+        theta (tuple): (min rotation degree, max rotation degree). \
+            Defaults to (-1, 1).
+        color_prob (float): Probability of images for color augmentation.
+            Defaults to 0.5.
+        hflag (bool): Whether to use heatmap guided. Defaults to False.
+        aug_ratio (float): Probability of applying this transformation. \
+            Defaults to 0.5.
+    """
+
+    def __init__(self,
+                 action_candidate: tuple = ('normal', 'horizontal', 'skip'),
+                 action_prob: tuple = (1, 0, 0),
+                 scale: tuple = (0.8, 1.2),
+                 dx: int = 15,
+                 dy: int = 15,
+                 theta: tuple = (-1, 1),
+                 color_prob: float = 0.5,
+                 hflag: bool = False,
+                 aug_ratio: float = 0.5) -> None:
+
+        import matplotlib
+        import matplotlib.pyplot as plt
+        default_backend = plt.get_backend()
+
+        try:
+            import instaboostfast as instaboost
+        except ImportError:
+            raise ImportError(
+                'Please run "pip install instaboostfast" '
+                'to install instaboostfast first for instaboost augmentation.')
+
+        # instaboost will modify the default backend
+        # and cause visualization to fail.
+        matplotlib.use(default_backend)
+
+        self.cfg = instaboost.InstaBoostConfig(action_candidate, action_prob,
+                                               scale, dx, dy, theta,
+                                               color_prob, hflag)
+        self.aug_ratio = aug_ratio
+
+    def _load_anns(self, results: dict) -> Tuple[list, list]:
+        """Convert raw anns to instaboost expected input format."""
+        anns = []
+        ignore_anns = []
+        for instance in results['instances']:
+            label = instance['bbox_label']
+            bbox = instance['bbox']
+            mask = instance['mask']
+            x1, y1, x2, y2 = bbox
+            # assert (x2 - x1) >= 1 and (y2 - y1) >= 1
+            bbox = [x1, y1, x2 - x1, y2 - y1]
+
+            if instance['ignore_flag'] == 0:
+                anns.append({
+                    'category_id': label,
+                    'segmentation': mask,
+                    'bbox': bbox
+                })
+            else:
+                # Ignore instances without data augmentation
+                ignore_anns.append(instance)
+        return anns, ignore_anns
+
+    def _parse_anns(self, results: dict, anns: list, ignore_anns: list,
+                    img: np.ndarray) -> dict:
+        """Restore the result of instaboost processing to the original anns
+        format."""
+        instances = []
+        for ann in anns:
+            x1, y1, w, h = ann['bbox']
+            # TODO: more essential bug need to be fixed in instaboost
+            if w <= 0 or h <= 0:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+            instances.append(
+                dict(
+                    bbox=bbox,
+                    bbox_label=ann['category_id'],
+                    mask=ann['segmentation'],
+                    ignore_flag=0))
+
+        instances.extend(ignore_anns)
+        results['img'] = img
+        results['instances'] = instances
+        return results
+
+    def transform(self, results) -> dict:
+        """The transform function."""
+        img = results['img']
+        ori_type = img.dtype
+        if 'instances' not in results or len(results['instances']) == 0:
+            return results
+
+        anns, ignore_anns = self._load_anns(results)
+        if np.random.choice([0, 1], p=[1 - self.aug_ratio, self.aug_ratio]):
+            try:
+                import instaboostfast as instaboost
+            except ImportError:
+                raise ImportError('Please run "pip install instaboostfast" '
+                                  'to install instaboostfast first.')
+            anns, img = instaboost.get_new_data(
+                anns, img.astype(np.uint8), self.cfg, background=None)
+
+        results = self._parse_anns(results, anns, ignore_anns,
+                                   img.astype(ori_type))
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(aug_ratio={self.aug_ratio})'
+        return repr_str
diff --git a/mmde/mmdet/datasets/transforms/loading.py b/mmde/mmdet/datasets/transforms/loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..722d4b0e7c830dfde2412746db1258b880167a2f
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/loading.py
@@ -0,0 +1,1074 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import mmcv
+import numpy as np
+import pycocotools.mask as maskUtils
+import torch
+from mmcv.transforms import BaseTransform
+from mmcv.transforms import LoadAnnotations as MMCV_LoadAnnotations
+from mmcv.transforms import LoadImageFromFile
+from mmengine.fileio import get
+from mmengine.structures import BaseDataElement
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures.bbox import get_box_type
+from mmdet.structures.bbox.box_type import autocast_box_type
+from mmdet.structures.mask import BitmapMasks, PolygonMasks
+
+
+@TRANSFORMS.register_module()
+class LoadImageFromNDArray(LoadImageFromFile):
+    """Load an image from ``results['img']``.
+
+    Similar with :obj:`LoadImageFromFile`, but the image has been loaded as
+    :obj:`np.ndarray` in ``results['img']``. Can be used when loading image
+    from webcam.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+    - img_path
+    - img_shape
+    - ori_shape
+
+    Args:
+        to_float32 (bool): Whether to convert the loaded image to a float32
+            numpy array. If set to False, the loaded image is an uint8 array.
+            Defaults to False.
+    """
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to add image meta information.
+
+        Args:
+            results (dict): Result dict with Webcam read image in
+                ``results['img']``.
+
+        Returns:
+            dict: The dict contains loaded image and meta information.
+        """
+
+        img = results['img']
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        results['img_path'] = None
+        results['img'] = img
+        results['img_shape'] = img.shape[:2]
+        results['ori_shape'] = img.shape[:2]
+        return results
+
+
+@TRANSFORMS.register_module()
+class LoadMultiChannelImageFromFiles(BaseTransform):
+    """Load multi-channel images from a list of separate channel files.
+
+    Required Keys:
+
+    - img_path
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - ori_shape
+
+    Args:
+        to_float32 (bool): Whether to convert the loaded image to a float32
+            numpy array. If set to False, the loaded image is an uint8 array.
+            Defaults to False.
+        color_type (str): The flag argument for :func:``mmcv.imfrombytes``.
+            Defaults to 'unchanged'.
+        imdecode_backend (str): The image decoding backend type. The backend
+            argument for :func:``mmcv.imfrombytes``.
+            See :func:``mmcv.imfrombytes`` for details.
+            Defaults to 'cv2'.
+        file_client_args (dict): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet >= 3.0.0rc7. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        to_float32: bool = False,
+        color_type: str = 'unchanged',
+        imdecode_backend: str = 'cv2',
+        file_client_args: dict = None,
+        backend_args: dict = None,
+    ) -> None:
+        self.to_float32 = to_float32
+        self.color_type = color_type
+        self.imdecode_backend = imdecode_backend
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to'
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+    def transform(self, results: dict) -> dict:
+        """Transform functions to load multiple images and get images meta
+        information.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded images and meta information.
+        """
+
+        assert isinstance(results['img_path'], list)
+        img = []
+        for name in results['img_path']:
+            img_bytes = get(name, backend_args=self.backend_args)
+            img.append(
+                mmcv.imfrombytes(
+                    img_bytes,
+                    flag=self.color_type,
+                    backend=self.imdecode_backend))
+        img = np.stack(img, axis=-1)
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        results['img'] = img
+        results['img_shape'] = img.shape[:2]
+        results['ori_shape'] = img.shape[:2]
+        return results
+
+    def __repr__(self):
+        repr_str = (f'{self.__class__.__name__}('
+                    f'to_float32={self.to_float32}, '
+                    f"color_type='{self.color_type}', "
+                    f"imdecode_backend='{self.imdecode_backend}', "
+                    f'backend_args={self.backend_args})')
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class LoadAnnotations(MMCV_LoadAnnotations):
+    """Load and process the ``instances`` and ``seg_map`` annotation provided
+    by dataset.
+
+    The annotation format is as the following:
+
+    .. code-block:: python
+
+        {
+            'instances':
+            [
+                {
+                # List of 4 numbers representing the bounding box of the
+                # instance, in (x1, y1, x2, y2) order.
+                'bbox': [x1, y1, x2, y2],
+
+                # Label of image classification.
+                'bbox_label': 1,
+
+                # Used in instance/panoptic segmentation. The segmentation mask
+                # of the instance or the information of segments.
+                # 1. If list[list[float]], it represents a list of polygons,
+                # one for each connected component of the object. Each
+                # list[float] is one simple polygon in the format of
+                # [x1, y1, ..., xn, yn] (n >= 3). The Xs and Ys are absolute
+                # coordinates in unit of pixels.
+                # 2. If dict, it represents the per-pixel segmentation mask in
+                # COCO's compressed RLE format. The dict should have keys
+                # “size” and “counts”.  Can be loaded by pycocotools
+                'mask': list[list[float]] or dict,
+
+                }
+            ]
+            # Filename of semantic or panoptic segmentation ground truth file.
+            'seg_map_path': 'a/b/c'
+        }
+
+    After this module, the annotation has been changed to the format below:
+
+    .. code-block:: python
+
+        {
+            # In (x1, y1, x2, y2) order, float type. N is the number of bboxes
+            # in an image
+            'gt_bboxes': BaseBoxes(N, 4)
+             # In int type.
+            'gt_bboxes_labels': np.ndarray(N, )
+             # In built-in class
+            'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W)
+             # In uint8 type.
+            'gt_seg_map': np.ndarray (H, W)
+             # in (x, y, v) order, float type.
+        }
+
+    Required Keys:
+
+    - height
+    - width
+    - instances
+
+      - bbox (optional)
+      - bbox_label
+      - mask (optional)
+      - ignore_flag
+
+    - seg_map_path (optional)
+
+    Added Keys:
+
+    - gt_bboxes (BaseBoxes[torch.float32])
+    - gt_bboxes_labels (np.int64)
+    - gt_masks (BitmapMasks | PolygonMasks)
+    - gt_seg_map (np.uint8)
+    - gt_ignore_flags (bool)
+
+    Args:
+        with_bbox (bool): Whether to parse and load the bbox annotation.
+            Defaults to True.
+        with_label (bool): Whether to parse and load the label annotation.
+            Defaults to True.
+        with_mask (bool): Whether to parse and load the mask annotation.
+             Default: False.
+        with_seg (bool): Whether to parse and load the semantic segmentation
+            annotation. Defaults to False.
+        poly2mask (bool): Whether to convert mask to bitmap. Default: True.
+        box_type (str): The box type used to wrap the bboxes. If ``box_type``
+            is None, gt_bboxes will keep being np.ndarray. Defaults to 'hbox'.
+        reduce_zero_label (bool): Whether reduce all label value
+            by 1. Usually used for datasets where 0 is background label.
+            Defaults to False.
+        ignore_index (int): The label index to be ignored.
+            Valid only if reduce_zero_label is true. Defaults is 255.
+        imdecode_backend (str): The image decoding backend type. The backend
+            argument for :func:``mmcv.imfrombytes``.
+            See :fun:``mmcv.imfrombytes`` for details.
+            Defaults to 'cv2'.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(
+            self,
+            with_mask: bool = False,
+            poly2mask: bool = True,
+            box_type: str = 'hbox',
+            # use for semseg
+            reduce_zero_label: bool = False,
+            ignore_index: int = 255,
+            **kwargs) -> None:
+        super(LoadAnnotations, self).__init__(**kwargs)
+        self.with_mask = with_mask
+        self.poly2mask = poly2mask
+        self.box_type = box_type
+        self.reduce_zero_label = reduce_zero_label
+        self.ignore_index = ignore_index
+
+    def _load_bboxes(self, results: dict) -> None:
+        """Private function to load bounding box annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
+        Returns:
+            dict: The dict contains loaded bounding box annotations.
+        """
+        gt_bboxes = []
+        gt_ignore_flags = []
+        for instance in results.get('instances', []):
+            gt_bboxes.append(instance['bbox'])
+            gt_ignore_flags.append(instance['ignore_flag'])
+        if self.box_type is None:
+            results['gt_bboxes'] = np.array(
+                gt_bboxes, dtype=np.float32).reshape((-1, 4))
+        else:
+            _, box_type_cls = get_box_type(self.box_type)
+            results['gt_bboxes'] = box_type_cls(gt_bboxes, dtype=torch.float32)
+        results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool)
+
+    def _load_labels(self, results: dict) -> None:
+        """Private function to load label annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
+
+        Returns:
+            dict: The dict contains loaded label annotations.
+        """
+        gt_bboxes_labels = []
+        for instance in results.get('instances', []):
+            gt_bboxes_labels.append(instance['bbox_label'])
+        # TODO: Inconsistent with mmcv, consider how to deal with it later.
+        results['gt_bboxes_labels'] = np.array(
+            gt_bboxes_labels, dtype=np.int64)
+
+    def _poly2mask(self, mask_ann: Union[list, dict], img_h: int,
+                   img_w: int) -> np.ndarray:
+        """Private function to convert masks represented with polygon to
+        bitmaps.
+
+        Args:
+            mask_ann (list | dict): Polygon mask annotation input.
+            img_h (int): The height of output mask.
+            img_w (int): The width of output mask.
+
+        Returns:
+            np.ndarray: The decode bitmap mask of shape (img_h, img_w).
+        """
+
+        if isinstance(mask_ann, list):
+            # polygon -- a single object might consist of multiple parts
+            # we merge all parts into one mask rle code
+            rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+            rle = maskUtils.merge(rles)
+        elif isinstance(mask_ann['counts'], list):
+            # uncompressed RLE
+            rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+        else:
+            # rle
+            rle = mask_ann
+        mask = maskUtils.decode(rle)
+        return mask
+
+    def _process_masks(self, results: dict) -> list:
+        """Process gt_masks and filter invalid polygons.
+
+        Args:
+            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
+
+        Returns:
+            list: Processed gt_masks.
+        """
+        gt_masks = []
+        gt_ignore_flags = []
+        for instance in results.get('instances', []):
+            gt_mask = instance['mask']
+            # If the annotation of segmentation mask is invalid,
+            # ignore the whole instance.
+            if isinstance(gt_mask, list):
+                gt_mask = [
+                    np.array(polygon) for polygon in gt_mask
+                    if len(polygon) % 2 == 0 and len(polygon) >= 6
+                ]
+                if len(gt_mask) == 0:
+                    # ignore this instance and set gt_mask to a fake mask
+                    instance['ignore_flag'] = 1
+                    gt_mask = [np.zeros(6)]
+            elif not self.poly2mask:
+                # `PolygonMasks` requires a ploygon of format List[np.array],
+                # other formats are invalid.
+                instance['ignore_flag'] = 1
+                gt_mask = [np.zeros(6)]
+            elif isinstance(gt_mask, dict) and \
+                    not (gt_mask.get('counts') is not None and
+                         gt_mask.get('size') is not None and
+                         isinstance(gt_mask['counts'], (list, str))):
+                # if gt_mask is a dict, it should include `counts` and `size`,
+                # so that `BitmapMasks` can uncompressed RLE
+                instance['ignore_flag'] = 1
+                gt_mask = [np.zeros(6)]
+            gt_masks.append(gt_mask)
+            # re-process gt_ignore_flags
+            gt_ignore_flags.append(instance['ignore_flag'])
+        results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool)
+        return gt_masks
+
+    def _load_masks(self, results: dict) -> None:
+        """Private function to load mask annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
+        """
+        h, w = results['ori_shape']
+        gt_masks = self._process_masks(results)
+        if self.poly2mask:
+            gt_masks = BitmapMasks(
+                [self._poly2mask(mask, h, w) for mask in gt_masks], h, w)
+        else:
+            # fake polygon masks will be ignored in `PackDetInputs`
+            gt_masks = PolygonMasks([mask for mask in gt_masks], h, w)
+        results['gt_masks'] = gt_masks
+
+    def _load_seg_map(self, results: dict) -> None:
+        """Private function to load semantic segmentation annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmcv.BaseDataset``.
+
+        Returns:
+            dict: The dict contains loaded semantic segmentation annotations.
+        """
+        if results.get('seg_map_path', None) is None:
+            return
+
+        img_bytes = get(
+            results['seg_map_path'], backend_args=self.backend_args)
+        gt_semantic_seg = mmcv.imfrombytes(
+            img_bytes, flag='unchanged',
+            backend=self.imdecode_backend).squeeze()
+
+        if self.reduce_zero_label:
+            # avoid using underflow conversion
+            gt_semantic_seg[gt_semantic_seg == 0] = self.ignore_index
+            gt_semantic_seg = gt_semantic_seg - 1
+            gt_semantic_seg[gt_semantic_seg == self.ignore_index -
+                            1] = self.ignore_index
+
+        # modify if custom classes
+        if results.get('label_map', None) is not None:
+            # Add deep copy to solve bug of repeatedly
+            # replace `gt_semantic_seg`, which is reported in
+            # https://github.com/open-mmlab/mmsegmentation/pull/1445/
+            gt_semantic_seg_copy = gt_semantic_seg.copy()
+            for old_id, new_id in results['label_map'].items():
+                gt_semantic_seg[gt_semantic_seg_copy == old_id] = new_id
+        results['gt_seg_map'] = gt_semantic_seg
+        results['ignore_index'] = self.ignore_index
+
+    def transform(self, results: dict) -> dict:
+        """Function to load multiple types annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
+
+        Returns:
+            dict: The dict contains loaded bounding box, label and
+            semantic segmentation.
+        """
+
+        if self.with_bbox:
+            self._load_bboxes(results)
+        if self.with_label:
+            self._load_labels(results)
+        if self.with_mask:
+            self._load_masks(results)
+        if self.with_seg:
+            self._load_seg_map(results)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(with_bbox={self.with_bbox}, '
+        repr_str += f'with_label={self.with_label}, '
+        repr_str += f'with_mask={self.with_mask}, '
+        repr_str += f'with_seg={self.with_seg}, '
+        repr_str += f'poly2mask={self.poly2mask}, '
+        repr_str += f"imdecode_backend='{self.imdecode_backend}', "
+        repr_str += f'backend_args={self.backend_args})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class LoadPanopticAnnotations(LoadAnnotations):
+    """Load multiple types of panoptic annotations.
+
+    The annotation format is as the following:
+
+    .. code-block:: python
+
+        {
+            'instances':
+            [
+                {
+                # List of 4 numbers representing the bounding box of the
+                # instance, in (x1, y1, x2, y2) order.
+                'bbox': [x1, y1, x2, y2],
+
+                # Label of image classification.
+                'bbox_label': 1,
+                },
+                ...
+            ]
+            'segments_info':
+            [
+                {
+                # id = cls_id + instance_id * INSTANCE_OFFSET
+                'id': int,
+
+                # Contiguous category id defined in dataset.
+                'category': int
+
+                # Thing flag.
+                'is_thing': bool
+                },
+                ...
+            ]
+
+            # Filename of semantic or panoptic segmentation ground truth file.
+            'seg_map_path': 'a/b/c'
+        }
+
+    After this module, the annotation has been changed to the format below:
+
+    .. code-block:: python
+
+        {
+            # In (x1, y1, x2, y2) order, float type. N is the number of bboxes
+            # in an image
+            'gt_bboxes': BaseBoxes(N, 4)
+             # In int type.
+            'gt_bboxes_labels': np.ndarray(N, )
+             # In built-in class
+            'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W)
+             # In uint8 type.
+            'gt_seg_map': np.ndarray (H, W)
+             # in (x, y, v) order, float type.
+        }
+
+    Required Keys:
+
+    - height
+    - width
+    - instances
+      - bbox
+      - bbox_label
+      - ignore_flag
+    - segments_info
+      - id
+      - category
+      - is_thing
+    - seg_map_path
+
+    Added Keys:
+
+    - gt_bboxes (BaseBoxes[torch.float32])
+    - gt_bboxes_labels (np.int64)
+    - gt_masks (BitmapMasks | PolygonMasks)
+    - gt_seg_map (np.uint8)
+    - gt_ignore_flags (bool)
+
+    Args:
+        with_bbox (bool): Whether to parse and load the bbox annotation.
+            Defaults to True.
+        with_label (bool): Whether to parse and load the label annotation.
+            Defaults to True.
+        with_mask (bool): Whether to parse and load the mask annotation.
+             Defaults to True.
+        with_seg (bool): Whether to parse and load the semantic segmentation
+            annotation. Defaults to False.
+        box_type (str): The box mode used to wrap the bboxes.
+        imdecode_backend (str): The image decoding backend type. The backend
+            argument for :func:``mmcv.imfrombytes``.
+            See :fun:``mmcv.imfrombytes`` for details.
+            Defaults to 'cv2'.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet >= 3.0.0rc7. Defaults to None.
+    """
+
+    def __init__(self,
+                 with_bbox: bool = True,
+                 with_label: bool = True,
+                 with_mask: bool = True,
+                 with_seg: bool = True,
+                 box_type: str = 'hbox',
+                 imdecode_backend: str = 'cv2',
+                 backend_args: dict = None) -> None:
+        try:
+            from panopticapi import utils
+        except ImportError:
+            raise ImportError(
+                'panopticapi is not installed, please install it by: '
+                'pip install git+https://github.com/cocodataset/'
+                'panopticapi.git.')
+        self.rgb2id = utils.rgb2id
+
+        super(LoadPanopticAnnotations, self).__init__(
+            with_bbox=with_bbox,
+            with_label=with_label,
+            with_mask=with_mask,
+            with_seg=with_seg,
+            with_keypoints=False,
+            box_type=box_type,
+            imdecode_backend=imdecode_backend,
+            backend_args=backend_args)
+
+    def _load_masks_and_semantic_segs(self, results: dict) -> None:
+        """Private function to load mask and semantic segmentation annotations.
+
+        In gt_semantic_seg, the foreground label is from ``0`` to
+        ``num_things - 1``, the background label is from ``num_things`` to
+        ``num_things + num_stuff - 1``, 255 means the ignored label (``VOID``).
+
+        Args:
+            results (dict): Result dict from :obj:``mmdet.CustomDataset``.
+        """
+        # seg_map_path is None, when inference on the dataset without gts.
+        if results.get('seg_map_path', None) is None:
+            return
+
+        img_bytes = get(
+            results['seg_map_path'], backend_args=self.backend_args)
+        pan_png = mmcv.imfrombytes(
+            img_bytes, flag='color', channel_order='rgb').squeeze()
+        pan_png = self.rgb2id(pan_png)
+
+        gt_masks = []
+        gt_seg = np.zeros_like(pan_png) + 255  # 255 as ignore
+
+        for segment_info in results['segments_info']:
+            mask = (pan_png == segment_info['id'])
+            gt_seg = np.where(mask, segment_info['category'], gt_seg)
+
+            # The legal thing masks
+            if segment_info.get('is_thing'):
+                gt_masks.append(mask.astype(np.uint8))
+
+        if self.with_mask:
+            h, w = results['ori_shape']
+            gt_masks = BitmapMasks(gt_masks, h, w)
+            results['gt_masks'] = gt_masks
+
+        if self.with_seg:
+            results['gt_seg_map'] = gt_seg
+
+    def transform(self, results: dict) -> dict:
+        """Function to load multiple types panoptic annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmdet.CustomDataset``.
+
+        Returns:
+            dict: The dict contains loaded bounding box, label, mask and
+                semantic segmentation annotations.
+        """
+
+        if self.with_bbox:
+            self._load_bboxes(results)
+        if self.with_label:
+            self._load_labels(results)
+        if self.with_mask or self.with_seg:
+            # The tasks completed by '_load_masks' and '_load_semantic_segs'
+            # in LoadAnnotations are merged to one function.
+            self._load_masks_and_semantic_segs(results)
+
+        return results
+
+
+@TRANSFORMS.register_module()
+class LoadProposals(BaseTransform):
+    """Load proposal pipeline.
+
+    Required Keys:
+
+    - proposals
+
+    Modified Keys:
+
+    - proposals
+
+    Args:
+        num_max_proposals (int, optional): Maximum number of proposals to load.
+            If not specified, all proposals will be loaded.
+    """
+
+    def __init__(self, num_max_proposals: Optional[int] = None) -> None:
+        self.num_max_proposals = num_max_proposals
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to load proposals from file.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded proposal annotations.
+        """
+
+        proposals = results['proposals']
+        # the type of proposals should be `dict` or `InstanceData`
+        assert isinstance(proposals, dict) \
+               or isinstance(proposals, BaseDataElement)
+        bboxes = proposals['bboxes'].astype(np.float32)
+        assert bboxes.shape[1] == 4, \
+            f'Proposals should have shapes (n, 4), but found {bboxes.shape}'
+
+        if 'scores' in proposals:
+            scores = proposals['scores'].astype(np.float32)
+            assert bboxes.shape[0] == scores.shape[0]
+        else:
+            scores = np.zeros(bboxes.shape[0], dtype=np.float32)
+
+        if self.num_max_proposals is not None:
+            # proposals should sort by scores during dumping the proposals
+            bboxes = bboxes[:self.num_max_proposals]
+            scores = scores[:self.num_max_proposals]
+
+        if len(bboxes) == 0:
+            bboxes = np.zeros((0, 4), dtype=np.float32)
+            scores = np.zeros(0, dtype=np.float32)
+
+        results['proposals'] = bboxes
+        results['proposals_scores'] = scores
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + \
+               f'(num_max_proposals={self.num_max_proposals})'
+
+
+@TRANSFORMS.register_module()
+class FilterAnnotations(BaseTransform):
+    """Filter invalid annotations.
+
+    Required Keys:
+
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+
+    Modified Keys:
+
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_masks (optional)
+    - gt_ignore_flags (optional)
+
+    Args:
+        min_gt_bbox_wh (tuple[float]): Minimum width and height of ground truth
+            boxes. Default: (1., 1.)
+        min_gt_mask_area (int): Minimum foreground area of ground truth masks.
+            Default: 1
+        by_box (bool): Filter instances with bounding boxes not meeting the
+            min_gt_bbox_wh threshold. Default: True
+        by_mask (bool): Filter instances with masks not meeting
+            min_gt_mask_area threshold. Default: False
+        keep_empty (bool): Whether to return None when it
+            becomes an empty bbox after filtering. Defaults to True.
+    """
+
+    def __init__(self,
+                 min_gt_bbox_wh: Tuple[int, int] = (1, 1),
+                 min_gt_mask_area: int = 1,
+                 by_box: bool = True,
+                 by_mask: bool = False,
+                 keep_empty: bool = True) -> None:
+        # TODO: add more filter options
+        assert by_box or by_mask
+        self.min_gt_bbox_wh = min_gt_bbox_wh
+        self.min_gt_mask_area = min_gt_mask_area
+        self.by_box = by_box
+        self.by_mask = by_mask
+        self.keep_empty = keep_empty
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> Union[dict, None]:
+        """Transform function to filter annotations.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        assert 'gt_bboxes' in results
+        gt_bboxes = results['gt_bboxes']
+        if gt_bboxes.shape[0] == 0:
+            return results
+
+        tests = []
+        if self.by_box:
+            tests.append(
+                ((gt_bboxes.widths > self.min_gt_bbox_wh[0]) &
+                 (gt_bboxes.heights > self.min_gt_bbox_wh[1])).numpy())
+        if self.by_mask:
+            assert 'gt_masks' in results
+            gt_masks = results['gt_masks']
+            tests.append(gt_masks.areas >= self.min_gt_mask_area)
+
+        keep = tests[0]
+        for t in tests[1:]:
+            keep = keep & t
+
+        if not keep.any():
+            if self.keep_empty:
+                return None
+
+        keys = ('gt_bboxes', 'gt_bboxes_labels', 'gt_masks', 'gt_ignore_flags')
+        for key in keys:
+            if key in results:
+                results[key] = results[key][keep]
+
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__ + \
+               f'(min_gt_bbox_wh={self.min_gt_bbox_wh}, ' \
+               f'keep_empty={self.keep_empty})'
+
+
+@TRANSFORMS.register_module()
+class LoadEmptyAnnotations(BaseTransform):
+    """Load Empty Annotations for unlabeled images.
+
+    Added Keys:
+    - gt_bboxes (np.float32)
+    - gt_bboxes_labels (np.int64)
+    - gt_masks (BitmapMasks | PolygonMasks)
+    - gt_seg_map (np.uint8)
+    - gt_ignore_flags (bool)
+
+    Args:
+        with_bbox (bool): Whether to load the pseudo bbox annotation.
+            Defaults to True.
+        with_label (bool): Whether to load the pseudo label annotation.
+            Defaults to True.
+        with_mask (bool): Whether to load the pseudo mask annotation.
+             Default: False.
+        with_seg (bool): Whether to load the pseudo semantic segmentation
+            annotation. Defaults to False.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+    """
+
+    def __init__(self,
+                 with_bbox: bool = True,
+                 with_label: bool = True,
+                 with_mask: bool = False,
+                 with_seg: bool = False,
+                 seg_ignore_label: int = 255) -> None:
+        self.with_bbox = with_bbox
+        self.with_label = with_label
+        self.with_mask = with_mask
+        self.with_seg = with_seg
+        self.seg_ignore_label = seg_ignore_label
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to load empty annotations.
+
+        Args:
+            results (dict): Result dict.
+        Returns:
+            dict: Updated result dict.
+        """
+        if self.with_bbox:
+            results['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
+            results['gt_ignore_flags'] = np.zeros((0, ), dtype=bool)
+        if self.with_label:
+            results['gt_bboxes_labels'] = np.zeros((0, ), dtype=np.int64)
+        if self.with_mask:
+            # TODO: support PolygonMasks
+            h, w = results['img_shape']
+            gt_masks = np.zeros((0, h, w), dtype=np.uint8)
+            results['gt_masks'] = BitmapMasks(gt_masks, h, w)
+        if self.with_seg:
+            h, w = results['img_shape']
+            results['gt_seg_map'] = self.seg_ignore_label * np.ones(
+                (h, w), dtype=np.uint8)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(with_bbox={self.with_bbox}, '
+        repr_str += f'with_label={self.with_label}, '
+        repr_str += f'with_mask={self.with_mask}, '
+        repr_str += f'with_seg={self.with_seg}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class InferencerLoader(BaseTransform):
+    """Load an image from ``results['img']``.
+
+    Similar with :obj:`LoadImageFromFile`, but the image has been loaded as
+    :obj:`np.ndarray` in ``results['img']``. Can be used when loading image
+    from webcam.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+    - img_path
+    - img_shape
+    - ori_shape
+
+    Args:
+        to_float32 (bool): Whether to convert the loaded image to a float32
+            numpy array. If set to False, the loaded image is an uint8 array.
+            Defaults to False.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__()
+        self.from_file = TRANSFORMS.build(
+            dict(type='LoadImageFromFile', **kwargs))
+        self.from_ndarray = TRANSFORMS.build(
+            dict(type='mmdet.LoadImageFromNDArray', **kwargs))
+
+    def transform(self, results: Union[str, np.ndarray, dict]) -> dict:
+        """Transform function to add image meta information.
+
+        Args:
+            results (str, np.ndarray or dict): The result.
+
+        Returns:
+            dict: The dict contains loaded image and meta information.
+        """
+        if isinstance(results, str):
+            inputs = dict(img_path=results)
+        elif isinstance(results, np.ndarray):
+            inputs = dict(img=results)
+        elif isinstance(results, dict):
+            inputs = results
+        else:
+            raise NotImplementedError
+
+        if 'img' in inputs:
+            return self.from_ndarray(inputs)
+        return self.from_file(inputs)
+
+
+@TRANSFORMS.register_module()
+class LoadTrackAnnotations(LoadAnnotations):
+    """Load and process the ``instances`` and ``seg_map`` annotation provided
+    by dataset. It must load ``instances_ids`` which is only used in the
+    tracking tasks. The annotation format is as the following:
+
+    .. code-block:: python
+        {
+            'instances':
+            [
+                {
+                # List of 4 numbers representing the bounding box of the
+                # instance, in (x1, y1, x2, y2) order.
+                'bbox': [x1, y1, x2, y2],
+                # Label of image classification.
+                'bbox_label': 1,
+                # Used in tracking.
+                # Id of instances.
+                'instance_id': 100,
+                # Used in instance/panoptic segmentation. The segmentation mask
+                # of the instance or the information of segments.
+                # 1. If list[list[float]], it represents a list of polygons,
+                # one for each connected component of the object. Each
+                # list[float] is one simple polygon in the format of
+                # [x1, y1, ..., xn, yn] (n >= 3). The Xs and Ys are absolute
+                # coordinates in unit of pixels.
+                # 2. If dict, it represents the per-pixel segmentation mask in
+                # COCO's compressed RLE format. The dict should have keys
+                # “size” and “counts”.  Can be loaded by pycocotools
+                'mask': list[list[float]] or dict,
+                }
+            ]
+            # Filename of semantic or panoptic segmentation ground truth file.
+            'seg_map_path': 'a/b/c'
+        }
+
+    After this module, the annotation has been changed to the format below:
+    .. code-block:: python
+        {
+            # In (x1, y1, x2, y2) order, float type. N is the number of bboxes
+            # in an image
+            'gt_bboxes': np.ndarray(N, 4)
+             # In int type.
+            'gt_bboxes_labels': np.ndarray(N, )
+             # In built-in class
+            'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W)
+             # In uint8 type.
+            'gt_seg_map': np.ndarray (H, W)
+             # in (x, y, v) order, float type.
+        }
+
+    Required Keys:
+
+    - height (optional)
+    - width (optional)
+    - instances
+      - bbox (optional)
+      - bbox_label
+      - instance_id (optional)
+      - mask (optional)
+      - ignore_flag (optional)
+    - seg_map_path (optional)
+
+    Added Keys:
+
+    - gt_bboxes (np.float32)
+    - gt_bboxes_labels (np.int32)
+    - gt_instances_ids (np.int32)
+    - gt_masks (BitmapMasks | PolygonMasks)
+    - gt_seg_map (np.uint8)
+    - gt_ignore_flags (np.bool)
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+
+    def _load_bboxes(self, results: dict) -> None:
+        """Private function to load bounding box annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmcv.BaseDataset``.
+
+        Returns:
+            dict: The dict contains loaded bounding box annotations.
+        """
+        gt_bboxes = []
+        gt_ignore_flags = []
+        # TODO: use bbox_type
+        for instance in results['instances']:
+            # The datasets which are only format in evaluation don't have
+            # groundtruth boxes.
+            if 'bbox' in instance:
+                gt_bboxes.append(instance['bbox'])
+            if 'ignore_flag' in instance:
+                gt_ignore_flags.append(instance['ignore_flag'])
+
+        # TODO: check this case
+        if len(gt_bboxes) != len(gt_ignore_flags):
+            # There may be no ``gt_ignore_flags`` in some cases, we treat them
+            # as all False in order to keep the length of ``gt_bboxes`` and
+            # ``gt_ignore_flags`` the same
+            gt_ignore_flags = [False] * len(gt_bboxes)
+
+        results['gt_bboxes'] = np.array(
+            gt_bboxes, dtype=np.float32).reshape(-1, 4)
+        results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool)
+
+    def _load_instances_ids(self, results: dict) -> None:
+        """Private function to load instances id annotations.
+
+        Args:
+            results (dict): Result dict from :obj :obj:``mmcv.BaseDataset``.
+
+        Returns:
+            dict: The dict containing instances id annotations.
+        """
+        gt_instances_ids = []
+        for instance in results['instances']:
+            gt_instances_ids.append(instance['instance_id'])
+        results['gt_instances_ids'] = np.array(
+            gt_instances_ids, dtype=np.int32)
+
+    def transform(self, results: dict) -> dict:
+        """Function to load multiple types annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmcv.BaseDataset``.
+
+        Returns:
+            dict: The dict contains loaded bounding box, label, instances id
+            and semantic segmentation and keypoints annotations.
+        """
+        results = super().transform(results)
+        self._load_instances_ids(results)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(with_bbox={self.with_bbox}, '
+        repr_str += f'with_label={self.with_label}, '
+        repr_str += f'with_mask={self.with_mask}, '
+        repr_str += f'with_seg={self.with_seg}, '
+        repr_str += f'poly2mask={self.poly2mask}, '
+        repr_str += f"imdecode_backend='{self.imdecode_backend}', "
+        repr_str += f'file_client_args={self.file_client_args})'
+        return repr_str
diff --git a/mmde/mmdet/datasets/transforms/text_transformers.py b/mmde/mmdet/datasets/transforms/text_transformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..25304d5fe451860547dea24b78f645d32536e2ae
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/text_transformers.py
@@ -0,0 +1,255 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures.bbox import BaseBoxes
+
+try:
+    from transformers import AutoTokenizer
+    from transformers import BertModel as HFBertModel
+except ImportError:
+    AutoTokenizer = None
+    HFBertModel = None
+
+import random
+import re
+
+import numpy as np
+
+
+def clean_name(name):
+    name = re.sub(r'\(.*\)', '', name)
+    name = re.sub(r'_', ' ', name)
+    name = re.sub(r'  ', ' ', name)
+    name = name.lower()
+    return name
+
+
+def check_for_positive_overflow(gt_bboxes, gt_labels, text, tokenizer,
+                                max_tokens):
+    # Check if we have too many positive labels
+    # generate a caption by appending the positive labels
+    positive_label_list = np.unique(gt_labels).tolist()
+    # random shuffule so we can sample different annotations
+    # at different epochs
+    random.shuffle(positive_label_list)
+
+    kept_lables = []
+    length = 0
+
+    for index, label in enumerate(positive_label_list):
+
+        label_text = clean_name(text[str(label)]) + '. '
+
+        tokenized = tokenizer.tokenize(label_text)
+
+        length += len(tokenized)
+
+        if length > max_tokens:
+            break
+        else:
+            kept_lables.append(label)
+
+    keep_box_index = []
+    keep_gt_labels = []
+    for i in range(len(gt_labels)):
+        if gt_labels[i] in kept_lables:
+            keep_box_index.append(i)
+            keep_gt_labels.append(gt_labels[i])
+
+    return gt_bboxes[keep_box_index], np.array(
+        keep_gt_labels, dtype=np.long), length
+
+
+def generate_senetence_given_labels(positive_label_list, negative_label_list,
+                                    text):
+    label_to_positions = {}
+
+    label_list = negative_label_list + positive_label_list
+
+    random.shuffle(label_list)
+
+    pheso_caption = ''
+
+    label_remap_dict = {}
+    for index, label in enumerate(label_list):
+
+        start_index = len(pheso_caption)
+
+        pheso_caption += clean_name(text[str(label)])
+
+        end_index = len(pheso_caption)
+
+        if label in positive_label_list:
+            label_to_positions[index] = [[start_index, end_index]]
+            label_remap_dict[int(label)] = index
+
+        # if index != len(label_list) - 1:
+        #     pheso_caption += '. '
+        pheso_caption += '. '
+
+    return label_to_positions, pheso_caption, label_remap_dict
+
+
+@TRANSFORMS.register_module()
+class RandomSamplingNegPos(BaseTransform):
+
+    def __init__(self,
+                 tokenizer_name,
+                 num_sample_negative=85,
+                 max_tokens=256,
+                 full_sampling_prob=0.5,
+                 label_map_file=None):
+        if AutoTokenizer is None:
+            raise RuntimeError(
+                'transformers is not installed, please install it by: '
+                'pip install transformers.')
+
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        self.num_sample_negative = num_sample_negative
+        self.full_sampling_prob = full_sampling_prob
+        self.max_tokens = max_tokens
+        self.label_map = None
+        if label_map_file:
+            with open(label_map_file, 'r') as file:
+                self.label_map = json.load(file)
+
+    def transform(self, results: dict) -> dict:
+        if 'phrases' in results:
+            return self.vg_aug(results)
+        else:
+            return self.od_aug(results)
+
+    def vg_aug(self, results):
+        gt_bboxes = results['gt_bboxes']
+        if isinstance(gt_bboxes, BaseBoxes):
+            gt_bboxes = gt_bboxes.tensor
+        gt_labels = results['gt_bboxes_labels']
+        text = results['text'].lower().strip()
+        if not text.endswith('.'):
+            text = text + '. '
+
+        phrases = results['phrases']
+        # TODO: add neg
+        positive_label_list = np.unique(gt_labels).tolist()
+        label_to_positions = {}
+        for label in positive_label_list:
+            label_to_positions[label] = phrases[label]['tokens_positive']
+
+        results['gt_bboxes'] = gt_bboxes
+        results['gt_bboxes_labels'] = gt_labels
+
+        results['text'] = text
+        results['tokens_positive'] = label_to_positions
+        return results
+
+    def od_aug(self, results):
+        gt_bboxes = results['gt_bboxes']
+        if isinstance(gt_bboxes, BaseBoxes):
+            gt_bboxes = gt_bboxes.tensor
+        gt_labels = results['gt_bboxes_labels']
+
+        if 'text' not in results:
+            assert self.label_map is not None
+            text = self.label_map
+        else:
+            text = results['text']
+
+        original_box_num = len(gt_labels)
+        # If the category name is in the format of 'a/b' (in object365),
+        # we randomly select one of them.
+        for key, value in text.items():
+            if '/' in value:
+                text[key] = random.choice(value.split('/')).strip()
+
+        gt_bboxes, gt_labels, positive_caption_length = \
+            check_for_positive_overflow(gt_bboxes, gt_labels,
+                                        text, self.tokenizer, self.max_tokens)
+
+        if len(gt_bboxes) < original_box_num:
+            print('WARNING: removed {} boxes due to positive caption overflow'.
+                  format(original_box_num - len(gt_bboxes)))
+
+        valid_negative_indexes = list(text.keys())
+
+        positive_label_list = np.unique(gt_labels).tolist()
+        full_negative = self.num_sample_negative
+
+        if full_negative > len(valid_negative_indexes):
+            full_negative = len(valid_negative_indexes)
+
+        outer_prob = random.random()
+
+        if outer_prob < self.full_sampling_prob:
+            # c. probability_full: add both all positive and all negatives
+            num_negatives = full_negative
+        else:
+            if random.random() < 1.0:
+                num_negatives = np.random.choice(max(1, full_negative)) + 1
+            else:
+                num_negatives = full_negative
+
+        # Keep some negatives
+        negative_label_list = set()
+        if num_negatives != -1:
+            if num_negatives > len(valid_negative_indexes):
+                num_negatives = len(valid_negative_indexes)
+
+            for i in np.random.choice(
+                    valid_negative_indexes, size=num_negatives, replace=False):
+                if i not in positive_label_list:
+                    negative_label_list.add(i)
+
+        random.shuffle(positive_label_list)
+
+        negative_label_list = list(negative_label_list)
+        random.shuffle(negative_label_list)
+
+        negative_max_length = self.max_tokens - positive_caption_length
+        screened_negative_label_list = []
+
+        for negative_label in negative_label_list:
+            label_text = clean_name(text[str(negative_label)]) + '. '
+
+            tokenized = self.tokenizer.tokenize(label_text)
+
+            negative_max_length -= len(tokenized)
+
+            if negative_max_length > 0:
+                screened_negative_label_list.append(negative_label)
+            else:
+                break
+        negative_label_list = screened_negative_label_list
+        label_to_positions, pheso_caption, label_remap_dict = \
+            generate_senetence_given_labels(positive_label_list,
+                                            negative_label_list, text)
+
+        # label remap
+        if len(gt_labels) > 0:
+            gt_labels = np.vectorize(lambda x: label_remap_dict[x])(gt_labels)
+
+        results['gt_bboxes'] = gt_bboxes
+        results['gt_bboxes_labels'] = gt_labels
+
+        results['text'] = pheso_caption
+        results['tokens_positive'] = label_to_positions
+
+        return results
+
+
+@TRANSFORMS.register_module()
+class LoadTextAnnotations(BaseTransform):
+
+    def transform(self, results: dict) -> dict:
+        if 'phrases' in results:
+            tokens_positive = [
+                phrase['tokens_positive']
+                for phrase in results['phrases'].values()
+            ]
+            results['tokens_positive'] = tokens_positive
+        else:
+            text = results['text']
+            results['text'] = list(text.values())
+        return results
diff --git a/mmde/mmdet/datasets/transforms/transformers_glip.py b/mmde/mmdet/datasets/transforms/transformers_glip.py
new file mode 100644
index 0000000000000000000000000000000000000000..60c4f87d1b86c13f886da27584114b6420b8b8cb
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/transformers_glip.py
@@ -0,0 +1,66 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures.bbox import HorizontalBoxes, autocast_box_type
+from .transforms import RandomFlip
+
+
+@TRANSFORMS.register_module()
+class GTBoxSubOne_GLIP(BaseTransform):
+    """Subtract 1 from the x2 and y2 coordinates of the gt_bboxes."""
+
+    def transform(self, results: dict) -> dict:
+        if 'gt_bboxes' in results:
+            gt_bboxes = results['gt_bboxes']
+            if isinstance(gt_bboxes, np.ndarray):
+                gt_bboxes[:, 2:] -= 1
+                results['gt_bboxes'] = gt_bboxes
+            elif isinstance(gt_bboxes, HorizontalBoxes):
+                gt_bboxes = results['gt_bboxes'].tensor
+                gt_bboxes[:, 2:] -= 1
+                results['gt_bboxes'] = HorizontalBoxes(gt_bboxes)
+            else:
+                raise NotImplementedError
+        return results
+
+
+@TRANSFORMS.register_module()
+class RandomFlip_GLIP(RandomFlip):
+    """Flip the image & bboxes & masks & segs horizontally or vertically.
+
+    When using horizontal flipping, the corresponding bbox x-coordinate needs
+    to be additionally subtracted by one.
+    """
+
+    @autocast_box_type()
+    def _flip(self, results: dict) -> None:
+        """Flip images, bounding boxes, and semantic segmentation map."""
+        # flip image
+        results['img'] = mmcv.imflip(
+            results['img'], direction=results['flip_direction'])
+
+        img_shape = results['img'].shape[:2]
+
+        # flip bboxes
+        if results.get('gt_bboxes', None) is not None:
+            results['gt_bboxes'].flip_(img_shape, results['flip_direction'])
+            # Only change this line
+            if results['flip_direction'] == 'horizontal':
+                results['gt_bboxes'].translate_([-1, 0])
+
+        # TODO: check it
+        # flip masks
+        if results.get('gt_masks', None) is not None:
+            results['gt_masks'] = results['gt_masks'].flip(
+                results['flip_direction'])
+
+        # flip segs
+        if results.get('gt_seg_map', None) is not None:
+            results['gt_seg_map'] = mmcv.imflip(
+                results['gt_seg_map'], direction=results['flip_direction'])
+
+        # record homography matrix for flip
+        self._record_homography_matrix(results)
diff --git a/mmde/mmdet/datasets/transforms/transforms.py b/mmde/mmdet/datasets/transforms/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..c50b987db33c91f759f6c89580f605631ce4f558
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/transforms.py
@@ -0,0 +1,3856 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import inspect
+import math
+import warnings
+from typing import List, Optional, Sequence, Tuple, Union
+
+import cv2
+import mmcv
+import numpy as np
+from mmcv.image import imresize
+from mmcv.image.geometric import _scale_size
+from mmcv.transforms import BaseTransform
+from mmcv.transforms import Pad as MMCV_Pad
+from mmcv.transforms import RandomFlip as MMCV_RandomFlip
+from mmcv.transforms import Resize as MMCV_Resize
+from mmcv.transforms.utils import avoid_cache_randomness, cache_randomness
+from mmengine.dataset import BaseDataset
+from mmengine.utils import is_str
+from numpy import random
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures.bbox import HorizontalBoxes, autocast_box_type
+from mmdet.structures.mask import BitmapMasks, PolygonMasks
+from mmdet.utils import log_img_scale
+
+try:
+    from imagecorruptions import corrupt
+except ImportError:
+    corrupt = None
+
+try:
+    import albumentations
+    from albumentations import Compose
+except ImportError:
+    albumentations = None
+    Compose = None
+
+Number = Union[int, float]
+
+
+def _fixed_scale_size(
+    size: Tuple[int, int],
+    scale: Union[float, int, tuple],
+) -> Tuple[int, int]:
+    """Rescale a size by a ratio.
+
+    Args:
+        size (tuple[int]): (w, h).
+        scale (float | tuple(float)): Scaling factor.
+
+    Returns:
+        tuple[int]: scaled size.
+    """
+    if isinstance(scale, (float, int)):
+        scale = (scale, scale)
+    w, h = size
+    # don't need o.5 offset
+    return int(w * float(scale[0])), int(h * float(scale[1]))
+
+
+def rescale_size(old_size: tuple,
+                 scale: Union[float, int, tuple],
+                 return_scale: bool = False) -> tuple:
+    """Calculate the new size to be rescaled to.
+
+    Args:
+        old_size (tuple[int]): The old size (w, h) of image.
+        scale (float | tuple[int]): The scaling factor or maximum size.
+            If it is a float number, then the image will be rescaled by this
+            factor, else if it is a tuple of 2 integers, then the image will
+            be rescaled as large as possible within the scale.
+        return_scale (bool): Whether to return the scaling factor besides the
+            rescaled image size.
+
+    Returns:
+        tuple[int]: The new rescaled image size.
+    """
+    w, h = old_size
+    if isinstance(scale, (float, int)):
+        if scale <= 0:
+            raise ValueError(f'Invalid scale {scale}, must be positive.')
+        scale_factor = scale
+    elif isinstance(scale, tuple):
+        max_long_edge = max(scale)
+        max_short_edge = min(scale)
+        scale_factor = min(max_long_edge / max(h, w),
+                           max_short_edge / min(h, w))
+    else:
+        raise TypeError(
+            f'Scale must be a number or tuple of int, but got {type(scale)}')
+    # only change this
+    new_size = _fixed_scale_size((w, h), scale_factor)
+
+    if return_scale:
+        return new_size, scale_factor
+    else:
+        return new_size
+
+
+def imrescale(
+    img: np.ndarray,
+    scale: Union[float, Tuple[int, int]],
+    return_scale: bool = False,
+    interpolation: str = 'bilinear',
+    backend: Optional[str] = None
+) -> Union[np.ndarray, Tuple[np.ndarray, float]]:
+    """Resize image while keeping the aspect ratio.
+
+    Args:
+        img (ndarray): The input image.
+        scale (float | tuple[int]): The scaling factor or maximum size.
+            If it is a float number, then the image will be rescaled by this
+            factor, else if it is a tuple of 2 integers, then the image will
+            be rescaled as large as possible within the scale.
+        return_scale (bool): Whether to return the scaling factor besides the
+            rescaled image.
+        interpolation (str): Same as :func:`resize`.
+        backend (str | None): Same as :func:`resize`.
+
+    Returns:
+        ndarray: The rescaled image.
+    """
+    h, w = img.shape[:2]
+    new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
+    rescaled_img = imresize(
+        img, new_size, interpolation=interpolation, backend=backend)
+    if return_scale:
+        return rescaled_img, scale_factor
+    else:
+        return rescaled_img
+
+
+@TRANSFORMS.register_module()
+class Resize(MMCV_Resize):
+    """Resize images & bbox & seg.
+
+    This transform resizes the input image according to ``scale`` or
+    ``scale_factor``. Bboxes, masks, and seg map are then resized
+    with the same scale factor.
+    if ``scale`` and ``scale_factor`` are both set, it will use ``scale`` to
+    resize.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+
+    Added Keys:
+
+    - scale
+    - scale_factor
+    - keep_ratio
+    - homography_matrix
+
+    Args:
+        scale (int or tuple): Images scales for resizing. Defaults to None
+        scale_factor (float or tuple[float]): Scale factors for resizing.
+            Defaults to None.
+        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+            image. Defaults to False.
+        clip_object_border (bool): Whether to clip the objects
+            outside the border of the image. In some dataset like MOT17, the gt
+            bboxes are allowed to cross the border of images. Therefore, we
+            don't need to clip the gt bboxes in these cases. Defaults to True.
+        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
+            These two backends generates slightly different results. Defaults
+            to 'cv2'.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def _resize_masks(self, results: dict) -> None:
+        """Resize masks with ``results['scale']``"""
+        if results.get('gt_masks', None) is not None:
+            if self.keep_ratio:
+                results['gt_masks'] = results['gt_masks'].rescale(
+                    results['scale'])
+            else:
+                results['gt_masks'] = results['gt_masks'].resize(
+                    results['img_shape'])
+
+    def _resize_bboxes(self, results: dict) -> None:
+        """Resize bounding boxes with ``results['scale_factor']``."""
+        if results.get('gt_bboxes', None) is not None:
+            results['gt_bboxes'].rescale_(results['scale_factor'])
+            if self.clip_object_border:
+                results['gt_bboxes'].clip_(results['img_shape'])
+
+    def _record_homography_matrix(self, results: dict) -> None:
+        """Record the homography matrix for the Resize."""
+        w_scale, h_scale = results['scale_factor']
+        homography_matrix = np.array(
+            [[w_scale, 0, 0], [0, h_scale, 0], [0, 0, 1]], dtype=np.float32)
+        if results.get('homography_matrix', None) is None:
+            results['homography_matrix'] = homography_matrix
+        else:
+            results['homography_matrix'] = homography_matrix @ results[
+                'homography_matrix']
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to resize images, bounding boxes and semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
+            'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys
+            are updated in result dict.
+        """
+        if self.scale:
+            results['scale'] = self.scale
+        else:
+            img_shape = results['img'].shape[:2]
+            results['scale'] = _scale_size(img_shape[::-1], self.scale_factor)
+        self._resize_img(results)
+        self._resize_bboxes(results)
+        self._resize_masks(results)
+        self._resize_seg(results)
+        self._record_homography_matrix(results)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(scale={self.scale}, '
+        repr_str += f'scale_factor={self.scale_factor}, '
+        repr_str += f'keep_ratio={self.keep_ratio}, '
+        repr_str += f'clip_object_border={self.clip_object_border}), '
+        repr_str += f'backend={self.backend}), '
+        repr_str += f'interpolation={self.interpolation})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class FixScaleResize(Resize):
+    """Compared to Resize, FixScaleResize fixes the scaling issue when
+    `keep_ratio=true`."""
+
+    def _resize_img(self, results):
+        """Resize images with ``results['scale']``."""
+        if results.get('img', None) is not None:
+            if self.keep_ratio:
+                img, scale_factor = imrescale(
+                    results['img'],
+                    results['scale'],
+                    interpolation=self.interpolation,
+                    return_scale=True,
+                    backend=self.backend)
+                new_h, new_w = img.shape[:2]
+                h, w = results['img'].shape[:2]
+                w_scale = new_w / w
+                h_scale = new_h / h
+            else:
+                img, w_scale, h_scale = mmcv.imresize(
+                    results['img'],
+                    results['scale'],
+                    interpolation=self.interpolation,
+                    return_scale=True,
+                    backend=self.backend)
+            results['img'] = img
+            results['img_shape'] = img.shape[:2]
+            results['scale_factor'] = (w_scale, h_scale)
+            results['keep_ratio'] = self.keep_ratio
+
+
+@TRANSFORMS.register_module()
+class ResizeShortestEdge(BaseTransform):
+    """Resize the image and mask while keeping the aspect ratio unchanged.
+
+    Modified from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py#L130 # noqa:E501
+
+    This transform attempts to scale the shorter edge to the given
+    `scale`, as long as the longer edge does not exceed `max_size`.
+    If `max_size` is reached, then downscale so that the longer
+    edge does not exceed `max_size`.
+
+    Required Keys:
+        - img
+        - gt_seg_map (optional)
+    Modified Keys:
+        - img
+        - img_shape
+        - gt_seg_map (optional))
+    Added Keys:
+        - scale
+        - scale_factor
+        - keep_ratio
+
+    Args:
+        scale (Union[int, Tuple[int, int]]): The target short edge length.
+            If it's tuple, will select the min value as the short edge length.
+        max_size (int): The maximum allowed longest edge length.
+    """
+
+    def __init__(self,
+                 scale: Union[int, Tuple[int, int]],
+                 max_size: Optional[int] = None,
+                 resize_type: str = 'Resize',
+                 **resize_kwargs) -> None:
+        super().__init__()
+        self.scale = scale
+        self.max_size = max_size
+
+        self.resize_cfg = dict(type=resize_type, **resize_kwargs)
+        self.resize = TRANSFORMS.build({'scale': 0, **self.resize_cfg})
+
+    def _get_output_shape(
+            self, img: np.ndarray,
+            short_edge_length: Union[int, Tuple[int, int]]) -> Tuple[int, int]:
+        """Compute the target image shape with the given `short_edge_length`.
+
+        Args:
+            img (np.ndarray): The input image.
+            short_edge_length (Union[int, Tuple[int, int]]): The target short
+                edge length. If it's tuple, will select the min value as the
+                short edge length.
+        """
+        h, w = img.shape[:2]
+        if isinstance(short_edge_length, int):
+            size = short_edge_length * 1.0
+        elif isinstance(short_edge_length, tuple):
+            size = min(short_edge_length) * 1.0
+        scale = size / min(h, w)
+        if h < w:
+            new_h, new_w = size, scale * w
+        else:
+            new_h, new_w = scale * h, size
+
+        if self.max_size and max(new_h, new_w) > self.max_size:
+            scale = self.max_size * 1.0 / max(new_h, new_w)
+            new_h *= scale
+            new_w *= scale
+
+        new_h = int(new_h + 0.5)
+        new_w = int(new_w + 0.5)
+        return new_w, new_h
+
+    def transform(self, results: dict) -> dict:
+        self.resize.scale = self._get_output_shape(results['img'], self.scale)
+        return self.resize(results)
+
+
+@TRANSFORMS.register_module()
+class FixShapeResize(Resize):
+    """Resize images & bbox & seg to the specified size.
+
+    This transform resizes the input image according to ``width`` and
+    ``height``. Bboxes, masks, and seg map are then resized
+    with the same parameters.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+
+    Added Keys:
+
+    - scale
+    - scale_factor
+    - keep_ratio
+    - homography_matrix
+
+    Args:
+        width (int): width for resizing.
+        height (int): height for resizing.
+            Defaults to None.
+        pad_val (Number | dict[str, Number], optional): Padding value for if
+            the pad_mode is "constant".  If it is a single number, the value
+            to pad the image is the number and to pad the semantic
+            segmentation map is 255. If it is a dict, it should have the
+            following keys:
+
+            - img: The value to pad the image.
+            - seg: The value to pad the semantic segmentation map.
+            Defaults to dict(img=0, seg=255).
+        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+            image. Defaults to False.
+        clip_object_border (bool): Whether to clip the objects
+            outside the border of the image. In some dataset like MOT17, the gt
+            bboxes are allowed to cross the border of images. Therefore, we
+            don't need to clip the gt bboxes in these cases. Defaults to True.
+        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
+            These two backends generates slightly different results. Defaults
+            to 'cv2'.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 width: int,
+                 height: int,
+                 pad_val: Union[Number, dict] = dict(img=0, seg=255),
+                 keep_ratio: bool = False,
+                 clip_object_border: bool = True,
+                 backend: str = 'cv2',
+                 interpolation: str = 'bilinear') -> None:
+        assert width is not None and height is not None, (
+            '`width` and'
+            '`height` can not be `None`')
+
+        self.width = width
+        self.height = height
+        self.scale = (width, height)
+
+        self.backend = backend
+        self.interpolation = interpolation
+        self.keep_ratio = keep_ratio
+        self.clip_object_border = clip_object_border
+
+        if keep_ratio is True:
+            # padding to the fixed size when keep_ratio=True
+            self.pad_transform = Pad(size=self.scale, pad_val=pad_val)
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to resize images, bounding boxes and semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
+            'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys
+            are updated in result dict.
+        """
+        img = results['img']
+        h, w = img.shape[:2]
+        if self.keep_ratio:
+            scale_factor = min(self.width / w, self.height / h)
+            results['scale_factor'] = (scale_factor, scale_factor)
+            real_w, real_h = int(w * float(scale_factor) +
+                                 0.5), int(h * float(scale_factor) + 0.5)
+            img, scale_factor = mmcv.imrescale(
+                results['img'], (real_w, real_h),
+                interpolation=self.interpolation,
+                return_scale=True,
+                backend=self.backend)
+            # the w_scale and h_scale has minor difference
+            # a real fix should be done in the mmcv.imrescale in the future
+            results['img'] = img
+            results['img_shape'] = img.shape[:2]
+            results['keep_ratio'] = self.keep_ratio
+            results['scale'] = (real_w, real_h)
+        else:
+            results['scale'] = (self.width, self.height)
+            results['scale_factor'] = (self.width / w, self.height / h)
+            super()._resize_img(results)
+
+        self._resize_bboxes(results)
+        self._resize_masks(results)
+        self._resize_seg(results)
+        self._record_homography_matrix(results)
+        if self.keep_ratio:
+            self.pad_transform(results)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(width={self.width}, height={self.height}, '
+        repr_str += f'keep_ratio={self.keep_ratio}, '
+        repr_str += f'clip_object_border={self.clip_object_border}), '
+        repr_str += f'backend={self.backend}), '
+        repr_str += f'interpolation={self.interpolation})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomFlip(MMCV_RandomFlip):
+    """Flip the image & bbox & mask & segmentation map. Added or Updated keys:
+    flip, flip_direction, img, gt_bboxes, and gt_seg_map. There are 3 flip
+    modes:
+
+     - ``prob`` is float, ``direction`` is string: the image will be
+         ``direction``ly flipped with probability of ``prob`` .
+         E.g., ``prob=0.5``, ``direction='horizontal'``,
+         then image will be horizontally flipped with probability of 0.5.
+     - ``prob`` is float, ``direction`` is list of string: the image will
+         be ``direction[i]``ly flipped with probability of
+         ``prob/len(direction)``.
+         E.g., ``prob=0.5``, ``direction=['horizontal', 'vertical']``,
+         then image will be horizontally flipped with probability of 0.25,
+         vertically with probability of 0.25.
+     - ``prob`` is list of float, ``direction`` is list of string:
+         given ``len(prob) == len(direction)``, the image will
+         be ``direction[i]``ly flipped with probability of ``prob[i]``.
+         E.g., ``prob=[0.3, 0.5]``, ``direction=['horizontal',
+         'vertical']``, then image will be horizontally flipped with
+         probability of 0.3, vertically with probability of 0.5.
+
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - flip
+    - flip_direction
+    - homography_matrix
+
+
+    Args:
+         prob (float | list[float], optional): The flipping probability.
+             Defaults to None.
+         direction(str | list[str]): The flipping direction. Options
+             If input is a list, the length must equal ``prob``. Each
+             element in ``prob`` indicates the flip probability of
+             corresponding direction. Defaults to 'horizontal'.
+    """
+
+    def _record_homography_matrix(self, results: dict) -> None:
+        """Record the homography matrix for the RandomFlip."""
+        cur_dir = results['flip_direction']
+        h, w = results['img'].shape[:2]
+
+        if cur_dir == 'horizontal':
+            homography_matrix = np.array([[-1, 0, w], [0, 1, 0], [0, 0, 1]],
+                                         dtype=np.float32)
+        elif cur_dir == 'vertical':
+            homography_matrix = np.array([[1, 0, 0], [0, -1, h], [0, 0, 1]],
+                                         dtype=np.float32)
+        elif cur_dir == 'diagonal':
+            homography_matrix = np.array([[-1, 0, w], [0, -1, h], [0, 0, 1]],
+                                         dtype=np.float32)
+        else:
+            homography_matrix = np.eye(3, dtype=np.float32)
+
+        if results.get('homography_matrix', None) is None:
+            results['homography_matrix'] = homography_matrix
+        else:
+            results['homography_matrix'] = homography_matrix @ results[
+                'homography_matrix']
+
+    @autocast_box_type()
+    def _flip(self, results: dict) -> None:
+        """Flip images, bounding boxes, and semantic segmentation map."""
+        # flip image
+        results['img'] = mmcv.imflip(
+            results['img'], direction=results['flip_direction'])
+
+        img_shape = results['img'].shape[:2]
+
+        # flip bboxes
+        if results.get('gt_bboxes', None) is not None:
+            results['gt_bboxes'].flip_(img_shape, results['flip_direction'])
+
+        # flip masks
+        if results.get('gt_masks', None) is not None:
+            results['gt_masks'] = results['gt_masks'].flip(
+                results['flip_direction'])
+
+        # flip segs
+        if results.get('gt_seg_map', None) is not None:
+            results['gt_seg_map'] = mmcv.imflip(
+                results['gt_seg_map'], direction=results['flip_direction'])
+
+        # record homography matrix for flip
+        self._record_homography_matrix(results)
+
+
+@TRANSFORMS.register_module()
+class RandomShift(BaseTransform):
+    """Shift the image and box given shift pixels and probability.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32])
+    - gt_bboxes_labels (np.int64)
+    - gt_ignore_flags (bool) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_bboxes_labels
+    - gt_ignore_flags (bool) (optional)
+
+    Args:
+        prob (float): Probability of shifts. Defaults to 0.5.
+        max_shift_px (int): The max pixels for shifting. Defaults to 32.
+        filter_thr_px (int): The width and height threshold for filtering.
+            The bbox and the rest of the targets below the width and
+            height threshold will be filtered. Defaults to 1.
+    """
+
+    def __init__(self,
+                 prob: float = 0.5,
+                 max_shift_px: int = 32,
+                 filter_thr_px: int = 1) -> None:
+        assert 0 <= prob <= 1
+        assert max_shift_px >= 0
+        self.prob = prob
+        self.max_shift_px = max_shift_px
+        self.filter_thr_px = int(filter_thr_px)
+
+    @cache_randomness
+    def _random_prob(self) -> float:
+        return random.uniform(0, 1)
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to random shift images, bounding boxes.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Shift results.
+        """
+        if self._random_prob() < self.prob:
+            img_shape = results['img'].shape[:2]
+
+            random_shift_x = random.randint(-self.max_shift_px,
+                                            self.max_shift_px)
+            random_shift_y = random.randint(-self.max_shift_px,
+                                            self.max_shift_px)
+            new_x = max(0, random_shift_x)
+            ori_x = max(0, -random_shift_x)
+            new_y = max(0, random_shift_y)
+            ori_y = max(0, -random_shift_y)
+
+            # TODO: support mask and semantic segmentation maps.
+            bboxes = results['gt_bboxes'].clone()
+            bboxes.translate_([random_shift_x, random_shift_y])
+
+            # clip border
+            bboxes.clip_(img_shape)
+
+            # remove invalid bboxes
+            valid_inds = (bboxes.widths > self.filter_thr_px).numpy() & (
+                bboxes.heights > self.filter_thr_px).numpy()
+            # If the shift does not contain any gt-bbox area, skip this
+            # image.
+            if not valid_inds.any():
+                return results
+            bboxes = bboxes[valid_inds]
+            results['gt_bboxes'] = bboxes
+            results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
+                valid_inds]
+
+            if results.get('gt_ignore_flags', None) is not None:
+                results['gt_ignore_flags'] = \
+                    results['gt_ignore_flags'][valid_inds]
+
+            # shift img
+            img = results['img']
+            new_img = np.zeros_like(img)
+            img_h, img_w = img.shape[:2]
+            new_h = img_h - np.abs(random_shift_y)
+            new_w = img_w - np.abs(random_shift_x)
+            new_img[new_y:new_y + new_h, new_x:new_x + new_w] \
+                = img[ori_y:ori_y + new_h, ori_x:ori_x + new_w]
+            results['img'] = new_img
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, '
+        repr_str += f'max_shift_px={self.max_shift_px}, '
+        repr_str += f'filter_thr_px={self.filter_thr_px})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Pad(MMCV_Pad):
+    """Pad the image & segmentation map.
+
+    There are three padding modes: (1) pad to a fixed size and (2) pad to the
+    minimum size that is divisible by some number. and (3)pad to square. Also,
+    pad to square and pad to the minimum size can be used as the same time.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - pad_shape
+    - pad_fixed_size
+    - pad_size_divisor
+
+    Args:
+        size (tuple, optional): Fixed padding size.
+            Expected padding shape (width, height). Defaults to None.
+        size_divisor (int, optional): The divisor of padded size. Defaults to
+            None.
+        pad_to_square (bool): Whether to pad the image into a square.
+            Currently only used for YOLOX. Defaults to False.
+        pad_val (Number | dict[str, Number], optional) - Padding value for if
+            the pad_mode is "constant".  If it is a single number, the value
+            to pad the image is the number and to pad the semantic
+            segmentation map is 255. If it is a dict, it should have the
+            following keys:
+
+            - img: The value to pad the image.
+            - seg: The value to pad the semantic segmentation map.
+            Defaults to dict(img=0, seg=255).
+        padding_mode (str): Type of padding. Should be: constant, edge,
+            reflect or symmetric. Defaults to 'constant'.
+
+            - constant: pads with a constant value, this value is specified
+              with pad_val.
+            - edge: pads with the last value at the edge of the image.
+            - reflect: pads with reflection of image without repeating the last
+              value on the edge. For example, padding [1, 2, 3, 4] with 2
+              elements on both sides in reflect mode will result in
+              [3, 2, 1, 2, 3, 4, 3, 2].
+            - symmetric: pads with reflection of image repeating the last value
+              on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
+              both sides in symmetric mode will result in
+              [2, 1, 1, 2, 3, 4, 4, 3]
+    """
+
+    def _pad_masks(self, results: dict) -> None:
+        """Pad masks according to ``results['pad_shape']``."""
+        if results.get('gt_masks', None) is not None:
+            pad_val = self.pad_val.get('masks', 0)
+            pad_shape = results['pad_shape'][:2]
+            results['gt_masks'] = results['gt_masks'].pad(
+                pad_shape, pad_val=pad_val)
+
+    def transform(self, results: dict) -> dict:
+        """Call function to pad images, masks, semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        self._pad_img(results)
+        self._pad_seg(results)
+        self._pad_masks(results)
+        return results
+
+
+@TRANSFORMS.register_module()
+class RandomCrop(BaseTransform):
+    """Random crop the image & bboxes & masks.
+
+    The absolute ``crop_size`` is sampled based on ``crop_type`` and
+    ``image_size``, then the cropped results are generated.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_masks (optional)
+    - gt_ignore_flags (optional)
+    - gt_seg_map (optional)
+    - gt_instances_ids (options, only used in MOT/VIS)
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        crop_size (tuple): The relative ratio or absolute pixels of
+            (width, height).
+        crop_type (str, optional): One of "relative_range", "relative",
+            "absolute", "absolute_range". "relative" randomly crops
+            (h * crop_size[0], w * crop_size[1]) part from an input of size
+            (h, w). "relative_range" uniformly samples relative crop size from
+            range [crop_size[0], 1] and [crop_size[1], 1] for height and width
+            respectively. "absolute" crops from an input with absolute size
+            (crop_size[0], crop_size[1]). "absolute_range" uniformly samples
+            crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w
+            in range [crop_size[0], min(w, crop_size[1])].
+            Defaults to "absolute".
+        allow_negative_crop (bool, optional): Whether to allow a crop that does
+            not contain any bbox area. Defaults to False.
+        recompute_bbox (bool, optional): Whether to re-compute the boxes based
+            on cropped instance masks. Defaults to False.
+        bbox_clip_border (bool, optional): Whether clip the objects outside
+            the border of the image. Defaults to True.
+
+    Note:
+        - If the image is smaller than the absolute crop size, return the
+            original image.
+        - The keys for bboxes, labels and masks must be aligned. That is,
+          ``gt_bboxes`` corresponds to ``gt_labels`` and ``gt_masks``, and
+          ``gt_bboxes_ignore`` corresponds to ``gt_labels_ignore`` and
+          ``gt_masks_ignore``.
+        - If the crop does not contain any gt-bbox region and
+          ``allow_negative_crop`` is set to False, skip this image.
+    """
+
+    def __init__(self,
+                 crop_size: tuple,
+                 crop_type: str = 'absolute',
+                 allow_negative_crop: bool = False,
+                 recompute_bbox: bool = False,
+                 bbox_clip_border: bool = True) -> None:
+        if crop_type not in [
+                'relative_range', 'relative', 'absolute', 'absolute_range'
+        ]:
+            raise ValueError(f'Invalid crop_type {crop_type}.')
+        if crop_type in ['absolute', 'absolute_range']:
+            assert crop_size[0] > 0 and crop_size[1] > 0
+            assert isinstance(crop_size[0], int) and isinstance(
+                crop_size[1], int)
+            if crop_type == 'absolute_range':
+                assert crop_size[0] <= crop_size[1]
+        else:
+            assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1
+        self.crop_size = crop_size
+        self.crop_type = crop_type
+        self.allow_negative_crop = allow_negative_crop
+        self.bbox_clip_border = bbox_clip_border
+        self.recompute_bbox = recompute_bbox
+
+    def _crop_data(self, results: dict, crop_size: Tuple[int, int],
+                   allow_negative_crop: bool) -> Union[dict, None]:
+        """Function to randomly crop images, bounding boxes, masks, semantic
+        segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+            crop_size (Tuple[int, int]): Expected absolute size after
+                cropping, (h, w).
+            allow_negative_crop (bool): Whether to allow a crop that does not
+                contain any bbox area.
+
+        Returns:
+            results (Union[dict, None]): Randomly cropped results, 'img_shape'
+                key in result dict is updated according to crop size. None will
+                be returned when there is no valid bbox after cropping.
+        """
+        assert crop_size[0] > 0 and crop_size[1] > 0
+        img = results['img']
+        margin_h = max(img.shape[0] - crop_size[0], 0)
+        margin_w = max(img.shape[1] - crop_size[1], 0)
+        offset_h, offset_w = self._rand_offset((margin_h, margin_w))
+        crop_y1, crop_y2 = offset_h, offset_h + crop_size[0]
+        crop_x1, crop_x2 = offset_w, offset_w + crop_size[1]
+
+        # Record the homography matrix for the RandomCrop
+        homography_matrix = np.array(
+            [[1, 0, -offset_w], [0, 1, -offset_h], [0, 0, 1]],
+            dtype=np.float32)
+        if results.get('homography_matrix', None) is None:
+            results['homography_matrix'] = homography_matrix
+        else:
+            results['homography_matrix'] = homography_matrix @ results[
+                'homography_matrix']
+
+        # crop the image
+        img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
+        img_shape = img.shape
+        results['img'] = img
+        results['img_shape'] = img_shape[:2]
+
+        # crop bboxes accordingly and clip to the image boundary
+        if results.get('gt_bboxes', None) is not None:
+            bboxes = results['gt_bboxes']
+            bboxes.translate_([-offset_w, -offset_h])
+            if self.bbox_clip_border:
+                bboxes.clip_(img_shape[:2])
+            valid_inds = bboxes.is_inside(img_shape[:2]).numpy()
+            # If the crop does not contain any gt-bbox area and
+            # allow_negative_crop is False, skip this image.
+            if (not valid_inds.any() and not allow_negative_crop):
+                return None
+
+            results['gt_bboxes'] = bboxes[valid_inds]
+
+            if results.get('gt_ignore_flags', None) is not None:
+                results['gt_ignore_flags'] = \
+                    results['gt_ignore_flags'][valid_inds]
+
+            if results.get('gt_bboxes_labels', None) is not None:
+                results['gt_bboxes_labels'] = \
+                    results['gt_bboxes_labels'][valid_inds]
+
+            if results.get('gt_masks', None) is not None:
+                results['gt_masks'] = results['gt_masks'][
+                    valid_inds.nonzero()[0]].crop(
+                        np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))
+                if self.recompute_bbox:
+                    results['gt_bboxes'] = results['gt_masks'].get_bboxes(
+                        type(results['gt_bboxes']))
+
+            # We should remove the instance ids corresponding to invalid boxes.
+            if results.get('gt_instances_ids', None) is not None:
+                results['gt_instances_ids'] = \
+                    results['gt_instances_ids'][valid_inds]
+
+        # crop semantic seg
+        if results.get('gt_seg_map', None) is not None:
+            results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2,
+                                                          crop_x1:crop_x2]
+
+        return results
+
+    @cache_randomness
+    def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]:
+        """Randomly generate crop offset.
+
+        Args:
+            margin (Tuple[int, int]): The upper bound for the offset generated
+                randomly.
+
+        Returns:
+            Tuple[int, int]: The random offset for the crop.
+        """
+        margin_h, margin_w = margin
+        offset_h = np.random.randint(0, margin_h + 1)
+        offset_w = np.random.randint(0, margin_w + 1)
+
+        return offset_h, offset_w
+
+    @cache_randomness
+    def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]:
+        """Randomly generates the absolute crop size based on `crop_type` and
+        `image_size`.
+
+        Args:
+            image_size (Tuple[int, int]): (h, w).
+
+        Returns:
+            crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute pixels.
+        """
+        h, w = image_size
+        if self.crop_type == 'absolute':
+            return min(self.crop_size[1], h), min(self.crop_size[0], w)
+        elif self.crop_type == 'absolute_range':
+            crop_h = np.random.randint(
+                min(h, self.crop_size[0]),
+                min(h, self.crop_size[1]) + 1)
+            crop_w = np.random.randint(
+                min(w, self.crop_size[0]),
+                min(w, self.crop_size[1]) + 1)
+            return crop_h, crop_w
+        elif self.crop_type == 'relative':
+            crop_w, crop_h = self.crop_size
+            return int(h * crop_h + 0.5), int(w * crop_w + 0.5)
+        else:
+            # 'relative_range'
+            crop_size = np.asarray(self.crop_size, dtype=np.float32)
+            crop_h, crop_w = crop_size + np.random.rand(2) * (1 - crop_size)
+            return int(h * crop_h + 0.5), int(w * crop_w + 0.5)
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> Union[dict, None]:
+        """Transform function to randomly crop images, bounding boxes, masks,
+        semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            results (Union[dict, None]): Randomly cropped results, 'img_shape'
+                key in result dict is updated according to crop size. None will
+                be returned when there is no valid bbox after cropping.
+        """
+        image_size = results['img'].shape[:2]
+        crop_size = self._get_crop_size(image_size)
+        results = self._crop_data(results, crop_size, self.allow_negative_crop)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(crop_size={self.crop_size}, '
+        repr_str += f'crop_type={self.crop_type}, '
+        repr_str += f'allow_negative_crop={self.allow_negative_crop}, '
+        repr_str += f'recompute_bbox={self.recompute_bbox}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class SegRescale(BaseTransform):
+    """Rescale semantic segmentation maps.
+
+    This transform rescale the ``gt_seg_map`` according to ``scale_factor``.
+
+    Required Keys:
+
+    - gt_seg_map
+
+    Modified Keys:
+
+    - gt_seg_map
+
+    Args:
+        scale_factor (float): The scale factor of the final output. Defaults
+            to 1.
+        backend (str): Image rescale backend, choices are 'cv2' and 'pillow'.
+            These two backends generates slightly different results. Defaults
+            to 'cv2'.
+    """
+
+    def __init__(self, scale_factor: float = 1, backend: str = 'cv2') -> None:
+        self.scale_factor = scale_factor
+        self.backend = backend
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to scale the semantic segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with semantic segmentation map scaled.
+        """
+        if self.scale_factor != 1:
+            results['gt_seg_map'] = mmcv.imrescale(
+                results['gt_seg_map'],
+                self.scale_factor,
+                interpolation='nearest',
+                backend=self.backend)
+
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(scale_factor={self.scale_factor}, '
+        repr_str += f'backend={self.backend})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class PhotoMetricDistortion(BaseTransform):
+    """Apply photometric distortion to image sequentially, every transformation
+    is applied with a probability of 0.5. The position of random contrast is in
+    second or second to last.
+
+    1. random brightness
+    2. random contrast (mode 0)
+    3. convert color from BGR to HSV
+    4. random saturation
+    5. random hue
+    6. convert color from HSV to BGR
+    7. random contrast (mode 1)
+    8. randomly swap channels
+
+    Required Keys:
+
+    - img (np.uint8)
+
+    Modified Keys:
+
+    - img (np.float32)
+
+    Args:
+        brightness_delta (int): delta of brightness.
+        contrast_range (sequence): range of contrast.
+        saturation_range (sequence): range of saturation.
+        hue_delta (int): delta of hue.
+    """
+
+    def __init__(self,
+                 brightness_delta: int = 32,
+                 contrast_range: Sequence[Number] = (0.5, 1.5),
+                 saturation_range: Sequence[Number] = (0.5, 1.5),
+                 hue_delta: int = 18) -> None:
+        self.brightness_delta = brightness_delta
+        self.contrast_lower, self.contrast_upper = contrast_range
+        self.saturation_lower, self.saturation_upper = saturation_range
+        self.hue_delta = hue_delta
+
+    @cache_randomness
+    def _random_flags(self) -> Sequence[Number]:
+        mode = random.randint(2)
+        brightness_flag = random.randint(2)
+        contrast_flag = random.randint(2)
+        saturation_flag = random.randint(2)
+        hue_flag = random.randint(2)
+        swap_flag = random.randint(2)
+        delta_value = random.uniform(-self.brightness_delta,
+                                     self.brightness_delta)
+        alpha_value = random.uniform(self.contrast_lower, self.contrast_upper)
+        saturation_value = random.uniform(self.saturation_lower,
+                                          self.saturation_upper)
+        hue_value = random.uniform(-self.hue_delta, self.hue_delta)
+        swap_value = random.permutation(3)
+
+        return (mode, brightness_flag, contrast_flag, saturation_flag,
+                hue_flag, swap_flag, delta_value, alpha_value,
+                saturation_value, hue_value, swap_value)
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to perform photometric distortion on images.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images distorted.
+        """
+        assert 'img' in results, '`img` is not found in results'
+        img = results['img']
+        img = img.astype(np.float32)
+
+        (mode, brightness_flag, contrast_flag, saturation_flag, hue_flag,
+         swap_flag, delta_value, alpha_value, saturation_value, hue_value,
+         swap_value) = self._random_flags()
+
+        # random brightness
+        if brightness_flag:
+            img += delta_value
+
+        # mode == 0 --> do random contrast first
+        # mode == 1 --> do random contrast last
+        if mode == 1:
+            if contrast_flag:
+                img *= alpha_value
+
+        # convert color from BGR to HSV
+        img = mmcv.bgr2hsv(img)
+
+        # random saturation
+        if saturation_flag:
+            img[..., 1] *= saturation_value
+            # For image(type=float32), after convert bgr to hsv by opencv,
+            # valid saturation value range is [0, 1]
+            if saturation_value > 1:
+                img[..., 1] = img[..., 1].clip(0, 1)
+
+        # random hue
+        if hue_flag:
+            img[..., 0] += hue_value
+            img[..., 0][img[..., 0] > 360] -= 360
+            img[..., 0][img[..., 0] < 0] += 360
+
+        # convert color from HSV to BGR
+        img = mmcv.hsv2bgr(img)
+
+        # random contrast
+        if mode == 0:
+            if contrast_flag:
+                img *= alpha_value
+
+        # randomly swap channels
+        if swap_flag:
+            img = img[..., swap_value]
+
+        results['img'] = img
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(brightness_delta={self.brightness_delta}, '
+        repr_str += 'contrast_range='
+        repr_str += f'{(self.contrast_lower, self.contrast_upper)}, '
+        repr_str += 'saturation_range='
+        repr_str += f'{(self.saturation_lower, self.saturation_upper)}, '
+        repr_str += f'hue_delta={self.hue_delta})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Expand(BaseTransform):
+    """Random expand the image & bboxes & masks & segmentation map.
+
+    Randomly place the original image on a canvas of ``ratio`` x original image
+    size filled with mean values. The ratio is in the range of ratio_range.
+
+    Required Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+
+    Args:
+        mean (sequence): mean value of dataset.
+        to_rgb (bool): if need to convert the order of mean to align with RGB.
+        ratio_range (sequence)): range of expand ratio.
+        seg_ignore_label (int): label of ignore segmentation map.
+        prob (float): probability of applying this transformation
+    """
+
+    def __init__(self,
+                 mean: Sequence[Number] = (0, 0, 0),
+                 to_rgb: bool = True,
+                 ratio_range: Sequence[Number] = (1, 4),
+                 seg_ignore_label: int = None,
+                 prob: float = 0.5) -> None:
+        self.to_rgb = to_rgb
+        self.ratio_range = ratio_range
+        if to_rgb:
+            self.mean = mean[::-1]
+        else:
+            self.mean = mean
+        self.min_ratio, self.max_ratio = ratio_range
+        self.seg_ignore_label = seg_ignore_label
+        self.prob = prob
+
+    @cache_randomness
+    def _random_prob(self) -> float:
+        return random.uniform(0, 1)
+
+    @cache_randomness
+    def _random_ratio(self) -> float:
+        return random.uniform(self.min_ratio, self.max_ratio)
+
+    @cache_randomness
+    def _random_left_top(self, ratio: float, h: int,
+                         w: int) -> Tuple[int, int]:
+        left = int(random.uniform(0, w * ratio - w))
+        top = int(random.uniform(0, h * ratio - h))
+        return left, top
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to expand images, bounding boxes, masks,
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images, bounding boxes, masks, segmentation
+                map expanded.
+        """
+        if self._random_prob() > self.prob:
+            return results
+        assert 'img' in results, '`img` is not found in results'
+        img = results['img']
+        h, w, c = img.shape
+        ratio = self._random_ratio()
+        # speedup expand when meets large image
+        if np.all(self.mean == self.mean[0]):
+            expand_img = np.empty((int(h * ratio), int(w * ratio), c),
+                                  img.dtype)
+            expand_img.fill(self.mean[0])
+        else:
+            expand_img = np.full((int(h * ratio), int(w * ratio), c),
+                                 self.mean,
+                                 dtype=img.dtype)
+        left, top = self._random_left_top(ratio, h, w)
+        expand_img[top:top + h, left:left + w] = img
+        results['img'] = expand_img
+        results['img_shape'] = expand_img.shape[:2]
+
+        # expand bboxes
+        if results.get('gt_bboxes', None) is not None:
+            results['gt_bboxes'].translate_([left, top])
+
+        # expand masks
+        if results.get('gt_masks', None) is not None:
+            results['gt_masks'] = results['gt_masks'].expand(
+                int(h * ratio), int(w * ratio), top, left)
+
+        # expand segmentation map
+        if results.get('gt_seg_map', None) is not None:
+            gt_seg = results['gt_seg_map']
+            expand_gt_seg = np.full((int(h * ratio), int(w * ratio)),
+                                    self.seg_ignore_label,
+                                    dtype=gt_seg.dtype)
+            expand_gt_seg[top:top + h, left:left + w] = gt_seg
+            results['gt_seg_map'] = expand_gt_seg
+
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
+        repr_str += f'prob={self.prob})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class MinIoURandomCrop(BaseTransform):
+    """Random crop the image & bboxes & masks & segmentation map, the cropped
+    patches have minimum IoU requirement with original image & bboxes & masks.
+
+    & segmentation map, the IoU threshold is randomly selected from min_ious.
+
+
+    Required Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_bboxes_labels
+    - gt_masks
+    - gt_ignore_flags
+    - gt_seg_map
+
+
+    Args:
+        min_ious (Sequence[float]): minimum IoU threshold for all intersections
+            with bounding boxes.
+        min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
+        where a >= min_crop_size).
+        bbox_clip_border (bool, optional): Whether clip the objects outside
+            the border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 min_ious: Sequence[float] = (0.1, 0.3, 0.5, 0.7, 0.9),
+                 min_crop_size: float = 0.3,
+                 bbox_clip_border: bool = True) -> None:
+
+        self.min_ious = min_ious
+        self.sample_mode = (1, *min_ious, 0)
+        self.min_crop_size = min_crop_size
+        self.bbox_clip_border = bbox_clip_border
+
+    @cache_randomness
+    def _random_mode(self) -> Number:
+        return random.choice(self.sample_mode)
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to crop images and bounding boxes with minimum
+        IoU constraint.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images and bounding boxes cropped, \
+                'img_shape' key is updated.
+        """
+        assert 'img' in results, '`img` is not found in results'
+        assert 'gt_bboxes' in results, '`gt_bboxes` is not found in results'
+        img = results['img']
+        boxes = results['gt_bboxes']
+        h, w, c = img.shape
+        while True:
+            mode = self._random_mode()
+            self.mode = mode
+            if mode == 1:
+                return results
+
+            min_iou = self.mode
+            for i in range(50):
+                new_w = random.uniform(self.min_crop_size * w, w)
+                new_h = random.uniform(self.min_crop_size * h, h)
+
+                # h / w in [0.5, 2]
+                if new_h / new_w < 0.5 or new_h / new_w > 2:
+                    continue
+
+                left = random.uniform(w - new_w)
+                top = random.uniform(h - new_h)
+
+                patch = np.array(
+                    (int(left), int(top), int(left + new_w), int(top + new_h)))
+                # Line or point crop is not allowed
+                if patch[2] == patch[0] or patch[3] == patch[1]:
+                    continue
+                overlaps = boxes.overlaps(
+                    HorizontalBoxes(patch.reshape(-1, 4).astype(np.float32)),
+                    boxes).numpy().reshape(-1)
+                if len(overlaps) > 0 and overlaps.min() < min_iou:
+                    continue
+
+                # center of boxes should inside the crop img
+                # only adjust boxes and instance masks when the gt is not empty
+                if len(overlaps) > 0:
+                    # adjust boxes
+                    def is_center_of_bboxes_in_patch(boxes, patch):
+                        centers = boxes.centers.numpy()
+                        mask = ((centers[:, 0] > patch[0]) *
+                                (centers[:, 1] > patch[1]) *
+                                (centers[:, 0] < patch[2]) *
+                                (centers[:, 1] < patch[3]))
+                        return mask
+
+                    mask = is_center_of_bboxes_in_patch(boxes, patch)
+                    if not mask.any():
+                        continue
+                    if results.get('gt_bboxes', None) is not None:
+                        boxes = results['gt_bboxes']
+                        mask = is_center_of_bboxes_in_patch(boxes, patch)
+                        boxes = boxes[mask]
+                        boxes.translate_([-patch[0], -patch[1]])
+                        if self.bbox_clip_border:
+                            boxes.clip_(
+                                [patch[3] - patch[1], patch[2] - patch[0]])
+                        results['gt_bboxes'] = boxes
+
+                        # ignore_flags
+                        if results.get('gt_ignore_flags', None) is not None:
+                            results['gt_ignore_flags'] = \
+                                results['gt_ignore_flags'][mask]
+
+                        # labels
+                        if results.get('gt_bboxes_labels', None) is not None:
+                            results['gt_bboxes_labels'] = results[
+                                'gt_bboxes_labels'][mask]
+
+                        # mask fields
+                        if results.get('gt_masks', None) is not None:
+                            results['gt_masks'] = results['gt_masks'][
+                                mask.nonzero()[0]].crop(patch)
+                # adjust the img no matter whether the gt is empty before crop
+                img = img[patch[1]:patch[3], patch[0]:patch[2]]
+                results['img'] = img
+                results['img_shape'] = img.shape[:2]
+
+                # seg fields
+                if results.get('gt_seg_map', None) is not None:
+                    results['gt_seg_map'] = results['gt_seg_map'][
+                        patch[1]:patch[3], patch[0]:patch[2]]
+                return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(min_ious={self.min_ious}, '
+        repr_str += f'min_crop_size={self.min_crop_size}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Corrupt(BaseTransform):
+    """Corruption augmentation.
+
+    Corruption transforms implemented based on
+    `imagecorruptions <https://github.com/bethgelab/imagecorruptions>`_.
+
+    Required Keys:
+
+    - img (np.uint8)
+
+
+    Modified Keys:
+
+    - img (np.uint8)
+
+
+    Args:
+        corruption (str): Corruption name.
+        severity (int): The severity of corruption. Defaults to 1.
+    """
+
+    def __init__(self, corruption: str, severity: int = 1) -> None:
+        self.corruption = corruption
+        self.severity = severity
+
+    def transform(self, results: dict) -> dict:
+        """Call function to corrupt image.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images corrupted.
+        """
+
+        if corrupt is None:
+            raise RuntimeError('imagecorruptions is not installed')
+        results['img'] = corrupt(
+            results['img'].astype(np.uint8),
+            corruption_name=self.corruption,
+            severity=self.severity)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(corruption={self.corruption}, '
+        repr_str += f'severity={self.severity})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+@avoid_cache_randomness
+class Albu(BaseTransform):
+    """Albumentation augmentation.
+
+    Adds custom transformations from Albumentations library.
+    Please, visit `https://albumentations.readthedocs.io`
+    to get more information.
+
+    Required Keys:
+
+    - img (np.uint8)
+    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+
+    Modified Keys:
+
+    - img (np.uint8)
+    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - img_shape (tuple)
+
+    An example of ``transforms`` is as followed:
+
+    .. code-block::
+
+        [
+            dict(
+                type='ShiftScaleRotate',
+                shift_limit=0.0625,
+                scale_limit=0.0,
+                rotate_limit=0,
+                interpolation=1,
+                p=0.5),
+            dict(
+                type='RandomBrightnessContrast',
+                brightness_limit=[0.1, 0.3],
+                contrast_limit=[0.1, 0.3],
+                p=0.2),
+            dict(type='ChannelShuffle', p=0.1),
+            dict(
+                type='OneOf',
+                transforms=[
+                    dict(type='Blur', blur_limit=3, p=1.0),
+                    dict(type='MedianBlur', blur_limit=3, p=1.0)
+                ],
+                p=0.1),
+        ]
+
+    Args:
+        transforms (list[dict]): A list of albu transformations
+        bbox_params (dict, optional): Bbox_params for albumentation `Compose`
+        keymap (dict, optional): Contains
+            {'input key':'albumentation-style key'}
+        skip_img_without_anno (bool): Whether to skip the image if no ann left
+            after aug. Defaults to False.
+    """
+
+    def __init__(self,
+                 transforms: List[dict],
+                 bbox_params: Optional[dict] = None,
+                 keymap: Optional[dict] = None,
+                 skip_img_without_anno: bool = False) -> None:
+        if Compose is None:
+            raise RuntimeError('albumentations is not installed')
+
+        # Args will be modified later, copying it will be safer
+        transforms = copy.deepcopy(transforms)
+        if bbox_params is not None:
+            bbox_params = copy.deepcopy(bbox_params)
+        if keymap is not None:
+            keymap = copy.deepcopy(keymap)
+        self.transforms = transforms
+        self.filter_lost_elements = False
+        self.skip_img_without_anno = skip_img_without_anno
+
+        # A simple workaround to remove masks without boxes
+        if (isinstance(bbox_params, dict) and 'label_fields' in bbox_params
+                and 'filter_lost_elements' in bbox_params):
+            self.filter_lost_elements = True
+            self.origin_label_fields = bbox_params['label_fields']
+            bbox_params['label_fields'] = ['idx_mapper']
+            del bbox_params['filter_lost_elements']
+
+        self.bbox_params = (
+            self.albu_builder(bbox_params) if bbox_params else None)
+        self.aug = Compose([self.albu_builder(t) for t in self.transforms],
+                           bbox_params=self.bbox_params)
+
+        if not keymap:
+            self.keymap_to_albu = {
+                'img': 'image',
+                'gt_masks': 'masks',
+                'gt_bboxes': 'bboxes'
+            }
+        else:
+            self.keymap_to_albu = keymap
+        self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()}
+
+    def albu_builder(self, cfg: dict) -> albumentations:
+        """Import a module from albumentations.
+
+        It inherits some of :func:`build_from_cfg` logic.
+
+        Args:
+            cfg (dict): Config dict. It should at least contain the key "type".
+
+        Returns:
+            obj: The constructed object.
+        """
+
+        assert isinstance(cfg, dict) and 'type' in cfg
+        args = cfg.copy()
+        obj_type = args.pop('type')
+        if is_str(obj_type):
+            if albumentations is None:
+                raise RuntimeError('albumentations is not installed')
+            obj_cls = getattr(albumentations, obj_type)
+        elif inspect.isclass(obj_type):
+            obj_cls = obj_type
+        else:
+            raise TypeError(
+                f'type must be a str or valid type, but got {type(obj_type)}')
+
+        if 'transforms' in args:
+            args['transforms'] = [
+                self.albu_builder(transform)
+                for transform in args['transforms']
+            ]
+
+        return obj_cls(**args)
+
+    @staticmethod
+    def mapper(d: dict, keymap: dict) -> dict:
+        """Dictionary mapper. Renames keys according to keymap provided.
+
+        Args:
+            d (dict): old dict
+            keymap (dict): {'old_key':'new_key'}
+        Returns:
+            dict: new dict.
+        """
+        updated_dict = {}
+        for k, v in zip(d.keys(), d.values()):
+            new_k = keymap.get(k, k)
+            updated_dict[new_k] = d[k]
+        return updated_dict
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> Union[dict, None]:
+        """Transform function of Albu."""
+        # TODO: gt_seg_map is not currently supported
+        # dict to albumentations format
+        results = self.mapper(results, self.keymap_to_albu)
+        results, ori_masks = self._preprocess_results(results)
+        results = self.aug(**results)
+        results = self._postprocess_results(results, ori_masks)
+        if results is None:
+            return None
+        # back to the original format
+        results = self.mapper(results, self.keymap_back)
+        results['img_shape'] = results['img'].shape[:2]
+        return results
+
+    def _preprocess_results(self, results: dict) -> tuple:
+        """Pre-processing results to facilitate the use of Albu."""
+        if 'bboxes' in results:
+            # to list of boxes
+            if not isinstance(results['bboxes'], HorizontalBoxes):
+                raise NotImplementedError(
+                    'Albu only supports horizontal boxes now')
+            bboxes = results['bboxes'].numpy()
+            results['bboxes'] = [x for x in bboxes]
+            # add pseudo-field for filtration
+            if self.filter_lost_elements:
+                results['idx_mapper'] = np.arange(len(results['bboxes']))
+
+        # TODO: Support mask structure in albu
+        ori_masks = None
+        if 'masks' in results:
+            if isinstance(results['masks'], PolygonMasks):
+                raise NotImplementedError(
+                    'Albu only supports BitMap masks now')
+            ori_masks = results['masks']
+            if albumentations.__version__ < '0.5':
+                results['masks'] = results['masks'].masks
+            else:
+                results['masks'] = [mask for mask in results['masks'].masks]
+
+        return results, ori_masks
+
+    def _postprocess_results(
+            self,
+            results: dict,
+            ori_masks: Optional[Union[BitmapMasks,
+                                      PolygonMasks]] = None) -> dict:
+        """Post-processing Albu output."""
+        # albumentations may return np.array or list on different versions
+        if 'gt_bboxes_labels' in results and isinstance(
+                results['gt_bboxes_labels'], list):
+            results['gt_bboxes_labels'] = np.array(
+                results['gt_bboxes_labels'], dtype=np.int64)
+        if 'gt_ignore_flags' in results and isinstance(
+                results['gt_ignore_flags'], list):
+            results['gt_ignore_flags'] = np.array(
+                results['gt_ignore_flags'], dtype=bool)
+
+        if 'bboxes' in results:
+            if isinstance(results['bboxes'], list):
+                results['bboxes'] = np.array(
+                    results['bboxes'], dtype=np.float32)
+            results['bboxes'] = results['bboxes'].reshape(-1, 4)
+            results['bboxes'] = HorizontalBoxes(results['bboxes'])
+
+            # filter label_fields
+            if self.filter_lost_elements:
+
+                for label in self.origin_label_fields:
+                    results[label] = np.array(
+                        [results[label][i] for i in results['idx_mapper']])
+                if 'masks' in results:
+                    assert ori_masks is not None
+                    results['masks'] = np.array(
+                        [results['masks'][i] for i in results['idx_mapper']])
+                    results['masks'] = ori_masks.__class__(
+                        results['masks'],
+                        results['masks'][0].shape[0],
+                        results['masks'][0].shape[1],
+                    )
+                if (not len(results['idx_mapper'])
+                        and self.skip_img_without_anno):
+                    return None
+            elif 'masks' in results:
+                results['masks'] = ori_masks.__class__(results['masks'],
+                                                       ori_masks.height,
+                                                       ori_masks.width)
+
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__ + f'(transforms={self.transforms})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+@avoid_cache_randomness
+class RandomCenterCropPad(BaseTransform):
+    """Random center crop and random around padding for CornerNet.
+
+    This operation generates randomly cropped image from the original image and
+    pads it simultaneously. Different from :class:`RandomCrop`, the output
+    shape may not equal to ``crop_size`` strictly. We choose a random value
+    from ``ratios`` and the output shape could be larger or smaller than
+    ``crop_size``. The padding operation is also different from :class:`Pad`,
+    here we use around padding instead of right-bottom padding.
+
+    The relation between output image (padding image) and original image:
+
+    .. code:: text
+
+                        output image
+
+               +----------------------------+
+               |          padded area       |
+        +------|----------------------------|----------+
+        |      |         cropped area       |          |
+        |      |         +---------------+  |          |
+        |      |         |    .   center |  |          | original image
+        |      |         |        range  |  |          |
+        |      |         +---------------+  |          |
+        +------|----------------------------|----------+
+               |          padded area       |
+               +----------------------------+
+
+    There are 5 main areas in the figure:
+
+    - output image: output image of this operation, also called padding
+      image in following instruction.
+    - original image: input image of this operation.
+    - padded area: non-intersect area of output image and original image.
+    - cropped area: the overlap of output image and original image.
+    - center range: a smaller area where random center chosen from.
+      center range is computed by ``border`` and original image's shape
+      to avoid our random center is too close to original image's border.
+
+    Also this operation act differently in train and test mode, the summary
+    pipeline is listed below.
+
+    Train pipeline:
+
+    1. Choose a ``random_ratio`` from ``ratios``, the shape of padding image
+       will be ``random_ratio * crop_size``.
+    2. Choose a ``random_center`` in center range.
+    3. Generate padding image with center matches the ``random_center``.
+    4. Initialize the padding image with pixel value equals to ``mean``.
+    5. Copy the cropped area to padding image.
+    6. Refine annotations.
+
+    Test pipeline:
+
+    1. Compute output shape according to ``test_pad_mode``.
+    2. Generate padding image with center matches the original image
+       center.
+    3. Initialize the padding image with pixel value equals to ``mean``.
+    4. Copy the ``cropped area`` to padding image.
+
+    Required Keys:
+
+    - img (np.float32)
+    - img_shape (tuple)
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+
+    Modified Keys:
+
+    - img (np.float32)
+    - img_shape (tuple)
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+
+    Args:
+        crop_size (tuple, optional): expected size after crop, final size will
+            computed according to ratio. Requires  (width, height)
+            in train mode, and None in test mode.
+        ratios (tuple, optional): random select a ratio from tuple and crop
+            image to (crop_size[0] * ratio) * (crop_size[1] * ratio).
+            Only available in train mode. Defaults to (0.9, 1.0, 1.1).
+        border (int, optional): max distance from center select area to image
+            border. Only available in train mode. Defaults to 128.
+        mean (sequence, optional): Mean values of 3 channels.
+        std (sequence, optional): Std values of 3 channels.
+        to_rgb (bool, optional): Whether to convert the image from BGR to RGB.
+        test_mode (bool): whether involve random variables in transform.
+            In train mode, crop_size is fixed, center coords and ratio is
+            random selected from predefined lists. In test mode, crop_size
+            is image's original shape, center coords and ratio is fixed.
+            Defaults to False.
+        test_pad_mode (tuple, optional): padding method and padding shape
+            value, only available in test mode. Default is using
+            'logical_or' with 127 as padding shape value.
+
+            - 'logical_or': final_shape = input_shape | padding_shape_value
+            - 'size_divisor': final_shape = int(
+              ceil(input_shape / padding_shape_value) * padding_shape_value)
+
+            Defaults to ('logical_or', 127).
+        test_pad_add_pix (int): Extra padding pixel in test mode.
+            Defaults to 0.
+        bbox_clip_border (bool): Whether clip the objects outside
+            the border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 crop_size: Optional[tuple] = None,
+                 ratios: Optional[tuple] = (0.9, 1.0, 1.1),
+                 border: Optional[int] = 128,
+                 mean: Optional[Sequence] = None,
+                 std: Optional[Sequence] = None,
+                 to_rgb: Optional[bool] = None,
+                 test_mode: bool = False,
+                 test_pad_mode: Optional[tuple] = ('logical_or', 127),
+                 test_pad_add_pix: int = 0,
+                 bbox_clip_border: bool = True) -> None:
+        if test_mode:
+            assert crop_size is None, 'crop_size must be None in test mode'
+            assert ratios is None, 'ratios must be None in test mode'
+            assert border is None, 'border must be None in test mode'
+            assert isinstance(test_pad_mode, (list, tuple))
+            assert test_pad_mode[0] in ['logical_or', 'size_divisor']
+        else:
+            assert isinstance(crop_size, (list, tuple))
+            assert crop_size[0] > 0 and crop_size[1] > 0, (
+                'crop_size must > 0 in train mode')
+            assert isinstance(ratios, (list, tuple))
+            assert test_pad_mode is None, (
+                'test_pad_mode must be None in train mode')
+
+        self.crop_size = crop_size
+        self.ratios = ratios
+        self.border = border
+        # We do not set default value to mean, std and to_rgb because these
+        # hyper-parameters are easy to forget but could affect the performance.
+        # Please use the same setting as Normalize for performance assurance.
+        assert mean is not None and std is not None and to_rgb is not None
+        self.to_rgb = to_rgb
+        self.input_mean = mean
+        self.input_std = std
+        if to_rgb:
+            self.mean = mean[::-1]
+            self.std = std[::-1]
+        else:
+            self.mean = mean
+            self.std = std
+        self.test_mode = test_mode
+        self.test_pad_mode = test_pad_mode
+        self.test_pad_add_pix = test_pad_add_pix
+        self.bbox_clip_border = bbox_clip_border
+
+    def _get_border(self, border, size):
+        """Get final border for the target size.
+
+        This function generates a ``final_border`` according to image's shape.
+        The area between ``final_border`` and ``size - final_border`` is the
+        ``center range``. We randomly choose center from the ``center range``
+        to avoid our random center is too close to original image's border.
+        Also ``center range`` should be larger than 0.
+
+        Args:
+            border (int): The initial border, default is 128.
+            size (int): The width or height of original image.
+        Returns:
+            int: The final border.
+        """
+        k = 2 * border / size
+        i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k)))
+        return border // i
+
+    def _filter_boxes(self, patch, boxes):
+        """Check whether the center of each box is in the patch.
+
+        Args:
+            patch (list[int]): The cropped area, [left, top, right, bottom].
+            boxes (numpy array, (N x 4)): Ground truth boxes.
+
+        Returns:
+            mask (numpy array, (N,)): Each box is inside or outside the patch.
+        """
+        center = boxes.centers.numpy()
+        mask = (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * (
+            center[:, 0] < patch[2]) * (
+                center[:, 1] < patch[3])
+        return mask
+
+    def _crop_image_and_paste(self, image, center, size):
+        """Crop image with a given center and size, then paste the cropped
+        image to a blank image with two centers align.
+
+        This function is equivalent to generating a blank image with ``size``
+        as its shape. Then cover it on the original image with two centers (
+        the center of blank image and the random center of original image)
+        aligned. The overlap area is paste from the original image and the
+        outside area is filled with ``mean pixel``.
+
+        Args:
+            image (np array, H x W x C): Original image.
+            center (list[int]): Target crop center coord.
+            size (list[int]): Target crop size. [target_h, target_w]
+
+        Returns:
+            cropped_img (np array, target_h x target_w x C): Cropped image.
+            border (np array, 4): The distance of four border of
+                ``cropped_img`` to the original image area, [top, bottom,
+                left, right]
+            patch (list[int]): The cropped area, [left, top, right, bottom].
+        """
+        center_y, center_x = center
+        target_h, target_w = size
+        img_h, img_w, img_c = image.shape
+
+        x0 = max(0, center_x - target_w // 2)
+        x1 = min(center_x + target_w // 2, img_w)
+        y0 = max(0, center_y - target_h // 2)
+        y1 = min(center_y + target_h // 2, img_h)
+        patch = np.array((int(x0), int(y0), int(x1), int(y1)))
+
+        left, right = center_x - x0, x1 - center_x
+        top, bottom = center_y - y0, y1 - center_y
+
+        cropped_center_y, cropped_center_x = target_h // 2, target_w // 2
+        cropped_img = np.zeros((target_h, target_w, img_c), dtype=image.dtype)
+        for i in range(img_c):
+            cropped_img[:, :, i] += self.mean[i]
+        y_slice = slice(cropped_center_y - top, cropped_center_y + bottom)
+        x_slice = slice(cropped_center_x - left, cropped_center_x + right)
+        cropped_img[y_slice, x_slice, :] = image[y0:y1, x0:x1, :]
+
+        border = np.array([
+            cropped_center_y - top, cropped_center_y + bottom,
+            cropped_center_x - left, cropped_center_x + right
+        ],
+                          dtype=np.float32)
+
+        return cropped_img, border, patch
+
+    def _train_aug(self, results):
+        """Random crop and around padding the original image.
+
+        Args:
+            results (dict): Image infomations in the augment pipeline.
+
+        Returns:
+            results (dict): The updated dict.
+        """
+        img = results['img']
+        h, w, c = img.shape
+        gt_bboxes = results['gt_bboxes']
+        while True:
+            scale = random.choice(self.ratios)
+            new_h = int(self.crop_size[1] * scale)
+            new_w = int(self.crop_size[0] * scale)
+            h_border = self._get_border(self.border, h)
+            w_border = self._get_border(self.border, w)
+
+            for i in range(50):
+                center_x = random.randint(low=w_border, high=w - w_border)
+                center_y = random.randint(low=h_border, high=h - h_border)
+
+                cropped_img, border, patch = self._crop_image_and_paste(
+                    img, [center_y, center_x], [new_h, new_w])
+
+                if len(gt_bboxes) == 0:
+                    results['img'] = cropped_img
+                    results['img_shape'] = cropped_img.shape[:2]
+                    return results
+
+                # if image do not have valid bbox, any crop patch is valid.
+                mask = self._filter_boxes(patch, gt_bboxes)
+                if not mask.any():
+                    continue
+
+                results['img'] = cropped_img
+                results['img_shape'] = cropped_img.shape[:2]
+
+                x0, y0, x1, y1 = patch
+
+                left_w, top_h = center_x - x0, center_y - y0
+                cropped_center_x, cropped_center_y = new_w // 2, new_h // 2
+
+                # crop bboxes accordingly and clip to the image boundary
+                gt_bboxes = gt_bboxes[mask]
+                gt_bboxes.translate_([
+                    cropped_center_x - left_w - x0,
+                    cropped_center_y - top_h - y0
+                ])
+                if self.bbox_clip_border:
+                    gt_bboxes.clip_([new_h, new_w])
+                keep = gt_bboxes.is_inside([new_h, new_w]).numpy()
+                gt_bboxes = gt_bboxes[keep]
+
+                results['gt_bboxes'] = gt_bboxes
+
+                # ignore_flags
+                if results.get('gt_ignore_flags', None) is not None:
+                    gt_ignore_flags = results['gt_ignore_flags'][mask]
+                    results['gt_ignore_flags'] = \
+                        gt_ignore_flags[keep]
+
+                # labels
+                if results.get('gt_bboxes_labels', None) is not None:
+                    gt_labels = results['gt_bboxes_labels'][mask]
+                    results['gt_bboxes_labels'] = gt_labels[keep]
+
+                if 'gt_masks' in results or 'gt_seg_map' in results:
+                    raise NotImplementedError(
+                        'RandomCenterCropPad only supports bbox.')
+
+                return results
+
+    def _test_aug(self, results):
+        """Around padding the original image without cropping.
+
+        The padding mode and value are from ``test_pad_mode``.
+
+        Args:
+            results (dict): Image infomations in the augment pipeline.
+
+        Returns:
+            results (dict): The updated dict.
+        """
+        img = results['img']
+        h, w, c = img.shape
+        if self.test_pad_mode[0] in ['logical_or']:
+            # self.test_pad_add_pix is only used for centernet
+            target_h = (h | self.test_pad_mode[1]) + self.test_pad_add_pix
+            target_w = (w | self.test_pad_mode[1]) + self.test_pad_add_pix
+        elif self.test_pad_mode[0] in ['size_divisor']:
+            divisor = self.test_pad_mode[1]
+            target_h = int(np.ceil(h / divisor)) * divisor
+            target_w = int(np.ceil(w / divisor)) * divisor
+        else:
+            raise NotImplementedError(
+                'RandomCenterCropPad only support two testing pad mode:'
+                'logical-or and size_divisor.')
+
+        cropped_img, border, _ = self._crop_image_and_paste(
+            img, [h // 2, w // 2], [target_h, target_w])
+        results['img'] = cropped_img
+        results['img_shape'] = cropped_img.shape[:2]
+        results['border'] = border
+        return results
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        img = results['img']
+        assert img.dtype == np.float32, (
+            'RandomCenterCropPad needs the input image of dtype np.float32,'
+            ' please set "to_float32=True" in "LoadImageFromFile" pipeline')
+        h, w, c = img.shape
+        assert c == len(self.mean)
+        if self.test_mode:
+            return self._test_aug(results)
+        else:
+            return self._train_aug(results)
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(crop_size={self.crop_size}, '
+        repr_str += f'ratios={self.ratios}, '
+        repr_str += f'border={self.border}, '
+        repr_str += f'mean={self.input_mean}, '
+        repr_str += f'std={self.input_std}, '
+        repr_str += f'to_rgb={self.to_rgb}, '
+        repr_str += f'test_mode={self.test_mode}, '
+        repr_str += f'test_pad_mode={self.test_pad_mode}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class CutOut(BaseTransform):
+    """CutOut operation.
+
+    Randomly drop some regions of image used in
+    `Cutout <https://arxiv.org/abs/1708.04552>`_.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        n_holes (int or tuple[int, int]): Number of regions to be dropped.
+            If it is given as a list, number of holes will be randomly
+            selected from the closed interval [``n_holes[0]``, ``n_holes[1]``].
+        cutout_shape (tuple[int, int] or list[tuple[int, int]], optional):
+            The candidate shape of dropped regions. It can be
+            ``tuple[int, int]`` to use a fixed cutout shape, or
+            ``list[tuple[int, int]]`` to randomly choose shape
+            from the list. Defaults to None.
+        cutout_ratio (tuple[float, float] or list[tuple[float, float]],
+            optional): The candidate ratio of dropped regions. It can be
+            ``tuple[float, float]`` to use a fixed ratio or
+            ``list[tuple[float, float]]`` to randomly choose ratio
+            from the list. Please note that ``cutout_shape`` and
+            ``cutout_ratio`` cannot be both given at the same time.
+            Defaults to None.
+        fill_in (tuple[float, float, float] or tuple[int, int, int]): The value
+            of pixel to fill in the dropped regions. Defaults to (0, 0, 0).
+    """
+
+    def __init__(
+        self,
+        n_holes: Union[int, Tuple[int, int]],
+        cutout_shape: Optional[Union[Tuple[int, int],
+                                     List[Tuple[int, int]]]] = None,
+        cutout_ratio: Optional[Union[Tuple[float, float],
+                                     List[Tuple[float, float]]]] = None,
+        fill_in: Union[Tuple[float, float, float], Tuple[int, int,
+                                                         int]] = (0, 0, 0)
+    ) -> None:
+
+        assert (cutout_shape is None) ^ (cutout_ratio is None), \
+            'Either cutout_shape or cutout_ratio should be specified.'
+        assert (isinstance(cutout_shape, (list, tuple))
+                or isinstance(cutout_ratio, (list, tuple)))
+        if isinstance(n_holes, tuple):
+            assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1]
+        else:
+            n_holes = (n_holes, n_holes)
+        self.n_holes = n_holes
+        self.fill_in = fill_in
+        self.with_ratio = cutout_ratio is not None
+        self.candidates = cutout_ratio if self.with_ratio else cutout_shape
+        if not isinstance(self.candidates, list):
+            self.candidates = [self.candidates]
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Call function to drop some regions of image."""
+        h, w, c = results['img'].shape
+        n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1)
+        for _ in range(n_holes):
+            x1 = np.random.randint(0, w)
+            y1 = np.random.randint(0, h)
+            index = np.random.randint(0, len(self.candidates))
+            if not self.with_ratio:
+                cutout_w, cutout_h = self.candidates[index]
+            else:
+                cutout_w = int(self.candidates[index][0] * w)
+                cutout_h = int(self.candidates[index][1] * h)
+
+            x2 = np.clip(x1 + cutout_w, 0, w)
+            y2 = np.clip(y1 + cutout_h, 0, h)
+            results['img'][y1:y2, x1:x2, :] = self.fill_in
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(n_holes={self.n_holes}, '
+        repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio
+                     else f'cutout_shape={self.candidates}, ')
+        repr_str += f'fill_in={self.fill_in})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Mosaic(BaseTransform):
+    """Mosaic augmentation.
+
+    Given 4 images, mosaic transform combines them into
+    one output image. The output image is composed of the parts from each sub-
+    image.
+
+    .. code:: text
+
+                        mosaic transform
+                           center_x
+                +------------------------------+
+                |       pad        |  pad      |
+                |      +-----------+           |
+                |      |           |           |
+                |      |  image1   |--------+  |
+                |      |           |        |  |
+                |      |           | image2 |  |
+     center_y   |----+-------------+-----------|
+                |    |   cropped   |           |
+                |pad |   image3    |  image4   |
+                |    |             |           |
+                +----|-------------+-----------+
+                     |             |
+                     +-------------+
+
+     The mosaic transform steps are as follows:
+
+         1. Choose the mosaic center as the intersections of 4 images
+         2. Get the left top image according to the index, and randomly
+            sample another 3 images from the custom dataset.
+         3. Sub image will be cropped if image is larger than mosaic patch
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - mix_results (List[dict])
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+
+    Args:
+        img_scale (Sequence[int]): Image size before mosaic pipeline of single
+            image. The shape order should be (width, height).
+            Defaults to (640, 640).
+        center_ratio_range (Sequence[float]): Center ratio range of mosaic
+            output. Defaults to (0.5, 1.5).
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        pad_val (int): Pad value. Defaults to 114.
+        prob (float): Probability of applying this transformation.
+            Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 img_scale: Tuple[int, int] = (640, 640),
+                 center_ratio_range: Tuple[float, float] = (0.5, 1.5),
+                 bbox_clip_border: bool = True,
+                 pad_val: float = 114.0,
+                 prob: float = 1.0) -> None:
+        assert isinstance(img_scale, tuple)
+        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \
+                                 f'got {prob}.'
+
+        log_img_scale(img_scale, skip_square=True, shape_order='wh')
+        self.img_scale = img_scale
+        self.center_ratio_range = center_ratio_range
+        self.bbox_clip_border = bbox_clip_border
+        self.pad_val = pad_val
+        self.prob = prob
+
+    @cache_randomness
+    def get_indexes(self, dataset: BaseDataset) -> int:
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+
+        Returns:
+            list: indexes.
+        """
+
+        indexes = [random.randint(0, len(dataset)) for _ in range(3)]
+        return indexes
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Mosaic transform function.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        if random.uniform(0, 1) > self.prob:
+            return results
+
+        assert 'mix_results' in results
+        mosaic_bboxes = []
+        mosaic_bboxes_labels = []
+        mosaic_ignore_flags = []
+        if len(results['img'].shape) == 3:
+            mosaic_img = np.full(
+                (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3),
+                self.pad_val,
+                dtype=results['img'].dtype)
+        else:
+            mosaic_img = np.full(
+                (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)),
+                self.pad_val,
+                dtype=results['img'].dtype)
+
+        # mosaic center x, y
+        center_x = int(
+            random.uniform(*self.center_ratio_range) * self.img_scale[0])
+        center_y = int(
+            random.uniform(*self.center_ratio_range) * self.img_scale[1])
+        center_position = (center_x, center_y)
+
+        loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
+        for i, loc in enumerate(loc_strs):
+            if loc == 'top_left':
+                results_patch = copy.deepcopy(results)
+            else:
+                results_patch = copy.deepcopy(results['mix_results'][i - 1])
+
+            img_i = results_patch['img']
+            h_i, w_i = img_i.shape[:2]
+            # keep_ratio resize
+            scale_ratio_i = min(self.img_scale[1] / h_i,
+                                self.img_scale[0] / w_i)
+            img_i = mmcv.imresize(
+                img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))
+
+            # compute the combine parameters
+            paste_coord, crop_coord = self._mosaic_combine(
+                loc, center_position, img_i.shape[:2][::-1])
+            x1_p, y1_p, x2_p, y2_p = paste_coord
+            x1_c, y1_c, x2_c, y2_c = crop_coord
+
+            # crop and paste image
+            mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]
+
+            # adjust coordinate
+            gt_bboxes_i = results_patch['gt_bboxes']
+            gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
+            gt_ignore_flags_i = results_patch['gt_ignore_flags']
+
+            padw = x1_p - x1_c
+            padh = y1_p - y1_c
+            gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
+            gt_bboxes_i.translate_([padw, padh])
+            mosaic_bboxes.append(gt_bboxes_i)
+            mosaic_bboxes_labels.append(gt_bboxes_labels_i)
+            mosaic_ignore_flags.append(gt_ignore_flags_i)
+
+        mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
+        mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
+        mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)
+
+        if self.bbox_clip_border:
+            mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]])
+        # remove outside bboxes
+        inside_inds = mosaic_bboxes.is_inside(
+            [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy()
+        mosaic_bboxes = mosaic_bboxes[inside_inds]
+        mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
+        mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
+
+        results['img'] = mosaic_img
+        results['img_shape'] = mosaic_img.shape[:2]
+        results['gt_bboxes'] = mosaic_bboxes
+        results['gt_bboxes_labels'] = mosaic_bboxes_labels
+        results['gt_ignore_flags'] = mosaic_ignore_flags
+        return results
+
+    def _mosaic_combine(
+            self, loc: str, center_position_xy: Sequence[float],
+            img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]:
+        """Calculate global coordinate of mosaic image and local coordinate of
+        cropped sub-image.
+
+        Args:
+            loc (str): Index for the sub-image, loc in ('top_left',
+              'top_right', 'bottom_left', 'bottom_right').
+            center_position_xy (Sequence[float]): Mixing center for 4 images,
+                (x, y).
+            img_shape_wh (Sequence[int]): Width and height of sub-image
+
+        Returns:
+            tuple[tuple[float]]: Corresponding coordinate of pasting and
+                cropping
+                - paste_coord (tuple): paste corner coordinate in mosaic image.
+                - crop_coord (tuple): crop corner coordinate in mosaic image.
+        """
+        assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right')
+        if loc == 'top_left':
+            # index0 to top left part of image
+            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
+                             max(center_position_xy[1] - img_shape_wh[1], 0), \
+                             center_position_xy[0], \
+                             center_position_xy[1]
+            crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - (
+                y2 - y1), img_shape_wh[0], img_shape_wh[1]
+
+        elif loc == 'top_right':
+            # index1 to top right part of image
+            x1, y1, x2, y2 = center_position_xy[0], \
+                             max(center_position_xy[1] - img_shape_wh[1], 0), \
+                             min(center_position_xy[0] + img_shape_wh[0],
+                                 self.img_scale[0] * 2), \
+                             center_position_xy[1]
+            crop_coord = 0, img_shape_wh[1] - (y2 - y1), min(
+                img_shape_wh[0], x2 - x1), img_shape_wh[1]
+
+        elif loc == 'bottom_left':
+            # index2 to bottom left part of image
+            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
+                             center_position_xy[1], \
+                             center_position_xy[0], \
+                             min(self.img_scale[1] * 2, center_position_xy[1] +
+                                 img_shape_wh[1])
+            crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min(
+                y2 - y1, img_shape_wh[1])
+
+        else:
+            # index3 to bottom right part of image
+            x1, y1, x2, y2 = center_position_xy[0], \
+                             center_position_xy[1], \
+                             min(center_position_xy[0] + img_shape_wh[0],
+                                 self.img_scale[0] * 2), \
+                             min(self.img_scale[1] * 2, center_position_xy[1] +
+                                 img_shape_wh[1])
+            crop_coord = 0, 0, min(img_shape_wh[0],
+                                   x2 - x1), min(y2 - y1, img_shape_wh[1])
+
+        paste_coord = x1, y1, x2, y2
+        return paste_coord, crop_coord
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'center_ratio_range={self.center_ratio_range}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'prob={self.prob})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class MixUp(BaseTransform):
+    """MixUp data augmentation.
+
+    .. code:: text
+
+                         mixup transform
+                +------------------------------+
+                | mixup image   |              |
+                |      +--------|--------+     |
+                |      |        |        |     |
+                |---------------+        |     |
+                |      |                 |     |
+                |      |      image      |     |
+                |      |                 |     |
+                |      |                 |     |
+                |      |-----------------+     |
+                |             pad              |
+                +------------------------------+
+
+     The mixup transform steps are as follows:
+
+        1. Another random image is picked by dataset and embedded in
+           the top left patch(after padding and resizing)
+        2. The target of mixup transform is the weighted average of mixup
+           image and origin image.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - mix_results (List[dict])
+
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+
+
+    Args:
+        img_scale (Sequence[int]): Image output size after mixup pipeline.
+            The shape order should be (width, height). Defaults to (640, 640).
+        ratio_range (Sequence[float]): Scale ratio of mixup image.
+            Defaults to (0.5, 1.5).
+        flip_ratio (float): Horizontal flip ratio of mixup image.
+            Defaults to 0.5.
+        pad_val (int): Pad value. Defaults to 114.
+        max_iters (int): The maximum number of iterations. If the number of
+            iterations is greater than `max_iters`, but gt_bbox is still
+            empty, then the iteration is terminated. Defaults to 15.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+    """
+
+    def __init__(self,
+                 img_scale: Tuple[int, int] = (640, 640),
+                 ratio_range: Tuple[float, float] = (0.5, 1.5),
+                 flip_ratio: float = 0.5,
+                 pad_val: float = 114.0,
+                 max_iters: int = 15,
+                 bbox_clip_border: bool = True) -> None:
+        assert isinstance(img_scale, tuple)
+        log_img_scale(img_scale, skip_square=True, shape_order='wh')
+        self.dynamic_scale = img_scale
+        self.ratio_range = ratio_range
+        self.flip_ratio = flip_ratio
+        self.pad_val = pad_val
+        self.max_iters = max_iters
+        self.bbox_clip_border = bbox_clip_border
+
+    @cache_randomness
+    def get_indexes(self, dataset: BaseDataset) -> int:
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+
+        Returns:
+            list: indexes.
+        """
+
+        for i in range(self.max_iters):
+            index = random.randint(0, len(dataset))
+            gt_bboxes_i = dataset[index]['gt_bboxes']
+            if len(gt_bboxes_i) != 0:
+                break
+
+        return index
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """MixUp transform function.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Updated result dict.
+        """
+
+        assert 'mix_results' in results
+        assert len(
+            results['mix_results']) == 1, 'MixUp only support 2 images now !'
+
+        if results['mix_results'][0]['gt_bboxes'].shape[0] == 0:
+            # empty bbox
+            return results
+
+        retrieve_results = results['mix_results'][0]
+        retrieve_img = retrieve_results['img']
+
+        jit_factor = random.uniform(*self.ratio_range)
+        is_flip = random.uniform(0, 1) > self.flip_ratio
+
+        if len(retrieve_img.shape) == 3:
+            out_img = np.ones(
+                (self.dynamic_scale[1], self.dynamic_scale[0], 3),
+                dtype=retrieve_img.dtype) * self.pad_val
+        else:
+            out_img = np.ones(
+                self.dynamic_scale[::-1],
+                dtype=retrieve_img.dtype) * self.pad_val
+
+        # 1. keep_ratio resize
+        scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0],
+                          self.dynamic_scale[0] / retrieve_img.shape[1])
+        retrieve_img = mmcv.imresize(
+            retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
+                           int(retrieve_img.shape[0] * scale_ratio)))
+
+        # 2. paste
+        out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img
+
+        # 3. scale jit
+        scale_ratio *= jit_factor
+        out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
+                                          int(out_img.shape[0] * jit_factor)))
+
+        # 4. flip
+        if is_flip:
+            out_img = out_img[:, ::-1, :]
+
+        # 5. random crop
+        ori_img = results['img']
+        origin_h, origin_w = out_img.shape[:2]
+        target_h, target_w = ori_img.shape[:2]
+        padded_img = np.ones((max(origin_h, target_h), max(
+            origin_w, target_w), 3)) * self.pad_val
+        padded_img = padded_img.astype(np.uint8)
+        padded_img[:origin_h, :origin_w] = out_img
+
+        x_offset, y_offset = 0, 0
+        if padded_img.shape[0] > target_h:
+            y_offset = random.randint(0, padded_img.shape[0] - target_h)
+        if padded_img.shape[1] > target_w:
+            x_offset = random.randint(0, padded_img.shape[1] - target_w)
+        padded_cropped_img = padded_img[y_offset:y_offset + target_h,
+                                        x_offset:x_offset + target_w]
+
+        # 6. adjust bbox
+        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
+        retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio])
+        if self.bbox_clip_border:
+            retrieve_gt_bboxes.clip_([origin_h, origin_w])
+
+        if is_flip:
+            retrieve_gt_bboxes.flip_([origin_h, origin_w],
+                                     direction='horizontal')
+
+        # 7. filter
+        cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone()
+        cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset])
+        if self.bbox_clip_border:
+            cp_retrieve_gt_bboxes.clip_([target_h, target_w])
+
+        # 8. mix up
+        ori_img = ori_img.astype(np.float32)
+        mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32)
+
+        retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
+        retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']
+
+        mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat(
+            (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0)
+        mixup_gt_bboxes_labels = np.concatenate(
+            (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
+        mixup_gt_ignore_flags = np.concatenate(
+            (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)
+
+        # remove outside bbox
+        inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy()
+        mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
+        mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds]
+        mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds]
+
+        results['img'] = mixup_img.astype(np.uint8)
+        results['img_shape'] = mixup_img.shape[:2]
+        results['gt_bboxes'] = mixup_gt_bboxes
+        results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
+        results['gt_ignore_flags'] = mixup_gt_ignore_flags
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(dynamic_scale={self.dynamic_scale}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'flip_ratio={self.flip_ratio}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'max_iters={self.max_iters}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomAffine(BaseTransform):
+    """Random affine transform data augmentation.
+
+    This operation randomly generates affine transform matrix which including
+    rotation, translation, shear and scaling transforms.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+
+    Args:
+        max_rotate_degree (float): Maximum degrees of rotation transform.
+            Defaults to 10.
+        max_translate_ratio (float): Maximum ratio of translation.
+            Defaults to 0.1.
+        scaling_ratio_range (tuple[float]): Min and max ratio of
+            scaling transform. Defaults to (0.5, 1.5).
+        max_shear_degree (float): Maximum degrees of shear
+            transform. Defaults to 2.
+        border (tuple[int]): Distance from width and height sides of input
+            image to adjust output shape. Only used in mosaic dataset.
+            Defaults to (0, 0).
+        border_val (tuple[int]): Border padding values of 3 channels.
+            Defaults to (114, 114, 114).
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+    """
+
+    def __init__(self,
+                 max_rotate_degree: float = 10.0,
+                 max_translate_ratio: float = 0.1,
+                 scaling_ratio_range: Tuple[float, float] = (0.5, 1.5),
+                 max_shear_degree: float = 2.0,
+                 border: Tuple[int, int] = (0, 0),
+                 border_val: Tuple[int, int, int] = (114, 114, 114),
+                 bbox_clip_border: bool = True) -> None:
+        assert 0 <= max_translate_ratio <= 1
+        assert scaling_ratio_range[0] <= scaling_ratio_range[1]
+        assert scaling_ratio_range[0] > 0
+        self.max_rotate_degree = max_rotate_degree
+        self.max_translate_ratio = max_translate_ratio
+        self.scaling_ratio_range = scaling_ratio_range
+        self.max_shear_degree = max_shear_degree
+        self.border = border
+        self.border_val = border_val
+        self.bbox_clip_border = bbox_clip_border
+
+    @cache_randomness
+    def _get_random_homography_matrix(self, height, width):
+        # Rotation
+        rotation_degree = random.uniform(-self.max_rotate_degree,
+                                         self.max_rotate_degree)
+        rotation_matrix = self._get_rotation_matrix(rotation_degree)
+
+        # Scaling
+        scaling_ratio = random.uniform(self.scaling_ratio_range[0],
+                                       self.scaling_ratio_range[1])
+        scaling_matrix = self._get_scaling_matrix(scaling_ratio)
+
+        # Shear
+        x_degree = random.uniform(-self.max_shear_degree,
+                                  self.max_shear_degree)
+        y_degree = random.uniform(-self.max_shear_degree,
+                                  self.max_shear_degree)
+        shear_matrix = self._get_shear_matrix(x_degree, y_degree)
+
+        # Translation
+        trans_x = random.uniform(-self.max_translate_ratio,
+                                 self.max_translate_ratio) * width
+        trans_y = random.uniform(-self.max_translate_ratio,
+                                 self.max_translate_ratio) * height
+        translate_matrix = self._get_translation_matrix(trans_x, trans_y)
+
+        warp_matrix = (
+            translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix)
+        return warp_matrix
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        img = results['img']
+        height = img.shape[0] + self.border[1] * 2
+        width = img.shape[1] + self.border[0] * 2
+
+        warp_matrix = self._get_random_homography_matrix(height, width)
+
+        img = cv2.warpPerspective(
+            img,
+            warp_matrix,
+            dsize=(width, height),
+            borderValue=self.border_val)
+        results['img'] = img
+        results['img_shape'] = img.shape[:2]
+
+        bboxes = results['gt_bboxes']
+        num_bboxes = len(bboxes)
+        if num_bboxes:
+            bboxes.project_(warp_matrix)
+            if self.bbox_clip_border:
+                bboxes.clip_([height, width])
+            # remove outside bbox
+            valid_index = bboxes.is_inside([height, width]).numpy()
+            results['gt_bboxes'] = bboxes[valid_index]
+            results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
+                valid_index]
+            results['gt_ignore_flags'] = results['gt_ignore_flags'][
+                valid_index]
+
+            if 'gt_masks' in results:
+                raise NotImplementedError('RandomAffine only supports bbox.')
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(max_rotate_degree={self.max_rotate_degree}, '
+        repr_str += f'max_translate_ratio={self.max_translate_ratio}, '
+        repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, '
+        repr_str += f'max_shear_degree={self.max_shear_degree}, '
+        repr_str += f'border={self.border}, '
+        repr_str += f'border_val={self.border_val}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+    @staticmethod
+    def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray:
+        radian = math.radians(rotate_degrees)
+        rotation_matrix = np.array(
+            [[np.cos(radian), -np.sin(radian), 0.],
+             [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]],
+            dtype=np.float32)
+        return rotation_matrix
+
+    @staticmethod
+    def _get_scaling_matrix(scale_ratio: float) -> np.ndarray:
+        scaling_matrix = np.array(
+            [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
+            dtype=np.float32)
+        return scaling_matrix
+
+    @staticmethod
+    def _get_shear_matrix(x_shear_degrees: float,
+                          y_shear_degrees: float) -> np.ndarray:
+        x_radian = math.radians(x_shear_degrees)
+        y_radian = math.radians(y_shear_degrees)
+        shear_matrix = np.array([[1, np.tan(x_radian), 0.],
+                                 [np.tan(y_radian), 1, 0.], [0., 0., 1.]],
+                                dtype=np.float32)
+        return shear_matrix
+
+    @staticmethod
+    def _get_translation_matrix(x: float, y: float) -> np.ndarray:
+        translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]],
+                                      dtype=np.float32)
+        return translation_matrix
+
+
+@TRANSFORMS.register_module()
+class YOLOXHSVRandomAug(BaseTransform):
+    """Apply HSV augmentation to image sequentially. It is referenced from
+    https://github.com/Megvii-
+    BaseDetection/YOLOX/blob/main/yolox/data/data_augment.py#L21.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        hue_delta (int): delta of hue. Defaults to 5.
+        saturation_delta (int): delta of saturation. Defaults to 30.
+        value_delta (int): delat of value. Defaults to 30.
+    """
+
+    def __init__(self,
+                 hue_delta: int = 5,
+                 saturation_delta: int = 30,
+                 value_delta: int = 30) -> None:
+        self.hue_delta = hue_delta
+        self.saturation_delta = saturation_delta
+        self.value_delta = value_delta
+
+    @cache_randomness
+    def _get_hsv_gains(self):
+        hsv_gains = np.random.uniform(-1, 1, 3) * [
+            self.hue_delta, self.saturation_delta, self.value_delta
+        ]
+        # random selection of h, s, v
+        hsv_gains *= np.random.randint(0, 2, 3)
+        # prevent overflow
+        hsv_gains = hsv_gains.astype(np.int16)
+        return hsv_gains
+
+    def transform(self, results: dict) -> dict:
+        img = results['img']
+        hsv_gains = self._get_hsv_gains()
+        img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16)
+
+        img_hsv[..., 0] = (img_hsv[..., 0] + hsv_gains[0]) % 180
+        img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_gains[1], 0, 255)
+        img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_gains[2], 0, 255)
+        cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img)
+
+        results['img'] = img
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(hue_delta={self.hue_delta}, '
+        repr_str += f'saturation_delta={self.saturation_delta}, '
+        repr_str += f'value_delta={self.value_delta})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class CopyPaste(BaseTransform):
+    """Simple Copy-Paste is a Strong Data Augmentation Method for Instance
+    Segmentation The simple copy-paste transform steps are as follows:
+
+    1. The destination image is already resized with aspect ratio kept,
+       cropped and padded.
+    2. Randomly select a source image, which is also already resized
+       with aspect ratio kept, cropped and padded in a similar way
+       as the destination image.
+    3. Randomly select some objects from the source image.
+    4. Paste these source objects to the destination image directly,
+       due to the source and destination image have the same size.
+    5. Update object masks of the destination image, for some origin objects
+       may be occluded.
+    6. Generate bboxes from the updated destination masks and
+       filter some objects which are totally occluded, and adjust bboxes
+       which are partly occluded.
+    7. Append selected source bboxes, masks, and labels.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_masks (BitmapMasks) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+    - gt_masks (optional)
+
+    Args:
+        max_num_pasted (int): The maximum number of pasted objects.
+            Defaults to 100.
+        bbox_occluded_thr (int): The threshold of occluded bbox.
+            Defaults to 10.
+        mask_occluded_thr (int): The threshold of occluded mask.
+            Defaults to 300.
+        selected (bool): Whether select objects or not. If select is False,
+            all objects of the source image will be pasted to the
+            destination image.
+            Defaults to True.
+        paste_by_box (bool): Whether use boxes as masks when masks are not
+            available.
+            Defaults to False.
+    """
+
+    def __init__(
+        self,
+        max_num_pasted: int = 100,
+        bbox_occluded_thr: int = 10,
+        mask_occluded_thr: int = 300,
+        selected: bool = True,
+        paste_by_box: bool = False,
+    ) -> None:
+        self.max_num_pasted = max_num_pasted
+        self.bbox_occluded_thr = bbox_occluded_thr
+        self.mask_occluded_thr = mask_occluded_thr
+        self.selected = selected
+        self.paste_by_box = paste_by_box
+
+    @cache_randomness
+    def get_indexes(self, dataset: BaseDataset) -> int:
+        """Call function to collect indexes.s.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+        Returns:
+            list: Indexes.
+        """
+        return random.randint(0, len(dataset))
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to make a copy-paste of image.
+
+        Args:
+            results (dict): Result dict.
+        Returns:
+            dict: Result dict with copy-paste transformed.
+        """
+
+        assert 'mix_results' in results
+        num_images = len(results['mix_results'])
+        assert num_images == 1, \
+            f'CopyPaste only supports processing 2 images, got {num_images}'
+        if self.selected:
+            selected_results = self._select_object(results['mix_results'][0])
+        else:
+            selected_results = results['mix_results'][0]
+        return self._copy_paste(results, selected_results)
+
+    @cache_randomness
+    def _get_selected_inds(self, num_bboxes: int) -> np.ndarray:
+        max_num_pasted = min(num_bboxes + 1, self.max_num_pasted)
+        num_pasted = np.random.randint(0, max_num_pasted)
+        return np.random.choice(num_bboxes, size=num_pasted, replace=False)
+
+    def get_gt_masks(self, results: dict) -> BitmapMasks:
+        """Get gt_masks originally or generated based on bboxes.
+
+        If gt_masks is not contained in results,
+        it will be generated based on gt_bboxes.
+        Args:
+            results (dict): Result dict.
+        Returns:
+            BitmapMasks: gt_masks, originally or generated based on bboxes.
+        """
+        if results.get('gt_masks', None) is not None:
+            if self.paste_by_box:
+                warnings.warn('gt_masks is already contained in results, '
+                              'so paste_by_box is disabled.')
+            return results['gt_masks']
+        else:
+            if not self.paste_by_box:
+                raise RuntimeError('results does not contain masks.')
+            return results['gt_bboxes'].create_masks(results['img'].shape[:2])
+
+    def _select_object(self, results: dict) -> dict:
+        """Select some objects from the source results."""
+        bboxes = results['gt_bboxes']
+        labels = results['gt_bboxes_labels']
+        masks = self.get_gt_masks(results)
+        ignore_flags = results['gt_ignore_flags']
+
+        selected_inds = self._get_selected_inds(bboxes.shape[0])
+
+        selected_bboxes = bboxes[selected_inds]
+        selected_labels = labels[selected_inds]
+        selected_masks = masks[selected_inds]
+        selected_ignore_flags = ignore_flags[selected_inds]
+
+        results['gt_bboxes'] = selected_bboxes
+        results['gt_bboxes_labels'] = selected_labels
+        results['gt_masks'] = selected_masks
+        results['gt_ignore_flags'] = selected_ignore_flags
+        return results
+
+    def _copy_paste(self, dst_results: dict, src_results: dict) -> dict:
+        """CopyPaste transform function.
+
+        Args:
+            dst_results (dict): Result dict of the destination image.
+            src_results (dict): Result dict of the source image.
+        Returns:
+            dict: Updated result dict.
+        """
+        dst_img = dst_results['img']
+        dst_bboxes = dst_results['gt_bboxes']
+        dst_labels = dst_results['gt_bboxes_labels']
+        dst_masks = self.get_gt_masks(dst_results)
+        dst_ignore_flags = dst_results['gt_ignore_flags']
+
+        src_img = src_results['img']
+        src_bboxes = src_results['gt_bboxes']
+        src_labels = src_results['gt_bboxes_labels']
+        src_masks = src_results['gt_masks']
+        src_ignore_flags = src_results['gt_ignore_flags']
+
+        if len(src_bboxes) == 0:
+            return dst_results
+
+        # update masks and generate bboxes from updated masks
+        composed_mask = np.where(np.any(src_masks.masks, axis=0), 1, 0)
+        updated_dst_masks = self._get_updated_masks(dst_masks, composed_mask)
+        updated_dst_bboxes = updated_dst_masks.get_bboxes(type(dst_bboxes))
+        assert len(updated_dst_bboxes) == len(updated_dst_masks)
+
+        # filter totally occluded objects
+        l1_distance = (updated_dst_bboxes.tensor - dst_bboxes.tensor).abs()
+        bboxes_inds = (l1_distance <= self.bbox_occluded_thr).all(
+            dim=-1).numpy()
+        masks_inds = updated_dst_masks.masks.sum(
+            axis=(1, 2)) > self.mask_occluded_thr
+        valid_inds = bboxes_inds | masks_inds
+
+        # Paste source objects to destination image directly
+        img = dst_img * (1 - composed_mask[..., np.newaxis]
+                         ) + src_img * composed_mask[..., np.newaxis]
+        bboxes = src_bboxes.cat([updated_dst_bboxes[valid_inds], src_bboxes])
+        labels = np.concatenate([dst_labels[valid_inds], src_labels])
+        masks = np.concatenate(
+            [updated_dst_masks.masks[valid_inds], src_masks.masks])
+        ignore_flags = np.concatenate(
+            [dst_ignore_flags[valid_inds], src_ignore_flags])
+
+        dst_results['img'] = img
+        dst_results['gt_bboxes'] = bboxes
+        dst_results['gt_bboxes_labels'] = labels
+        dst_results['gt_masks'] = BitmapMasks(masks, masks.shape[1],
+                                              masks.shape[2])
+        dst_results['gt_ignore_flags'] = ignore_flags
+
+        return dst_results
+
+    def _get_updated_masks(self, masks: BitmapMasks,
+                           composed_mask: np.ndarray) -> BitmapMasks:
+        """Update masks with composed mask."""
+        assert masks.masks.shape[-2:] == composed_mask.shape[-2:], \
+            'Cannot compare two arrays of different size'
+        masks.masks = np.where(composed_mask, 0, masks.masks)
+        return masks
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(max_num_pasted={self.max_num_pasted}, '
+        repr_str += f'bbox_occluded_thr={self.bbox_occluded_thr}, '
+        repr_str += f'mask_occluded_thr={self.mask_occluded_thr}, '
+        repr_str += f'selected={self.selected}), '
+        repr_str += f'paste_by_box={self.paste_by_box})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomErasing(BaseTransform):
+    """RandomErasing operation.
+
+    Random Erasing randomly selects a rectangle region
+    in an image and erases its pixels with random values.
+    `RandomErasing <https://arxiv.org/abs/1708.04896>`_.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_masks (BitmapMasks) (optional)
+
+    Modified Keys:
+    - img
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+    - gt_masks (optional)
+
+    Args:
+        n_patches (int or tuple[int, int]): Number of regions to be dropped.
+            If it is given as a tuple, number of patches will be randomly
+            selected from the closed interval [``n_patches[0]``,
+            ``n_patches[1]``].
+        ratio (float or tuple[float, float]): The ratio of erased regions.
+            It can be ``float`` to use a fixed ratio or ``tuple[float, float]``
+            to randomly choose ratio from the interval.
+        squared (bool): Whether to erase square region. Defaults to True.
+        bbox_erased_thr (float): The threshold for the maximum area proportion
+            of the bbox to be erased. When the proportion of the area where the
+            bbox is erased is greater than the threshold, the bbox will be
+            removed. Defaults to 0.9.
+        img_border_value (int or float or tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equals ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+    """
+
+    def __init__(
+        self,
+        n_patches: Union[int, Tuple[int, int]],
+        ratio: Union[float, Tuple[float, float]],
+        squared: bool = True,
+        bbox_erased_thr: float = 0.9,
+        img_border_value: Union[int, float, tuple] = 128,
+        mask_border_value: int = 0,
+        seg_ignore_label: int = 255,
+    ) -> None:
+        if isinstance(n_patches, tuple):
+            assert len(n_patches) == 2 and 0 <= n_patches[0] < n_patches[1]
+        else:
+            n_patches = (n_patches, n_patches)
+        if isinstance(ratio, tuple):
+            assert len(ratio) == 2 and 0 <= ratio[0] < ratio[1] <= 1
+        else:
+            ratio = (ratio, ratio)
+
+        self.n_patches = n_patches
+        self.ratio = ratio
+        self.squared = squared
+        self.bbox_erased_thr = bbox_erased_thr
+        self.img_border_value = img_border_value
+        self.mask_border_value = mask_border_value
+        self.seg_ignore_label = seg_ignore_label
+
+    @cache_randomness
+    def _get_patches(self, img_shape: Tuple[int, int]) -> List[list]:
+        """Get patches for random erasing."""
+        patches = []
+        n_patches = np.random.randint(self.n_patches[0], self.n_patches[1] + 1)
+        for _ in range(n_patches):
+            if self.squared:
+                ratio = np.random.random() * (self.ratio[1] -
+                                              self.ratio[0]) + self.ratio[0]
+                ratio = (ratio, ratio)
+            else:
+                ratio = (np.random.random() * (self.ratio[1] - self.ratio[0]) +
+                         self.ratio[0], np.random.random() *
+                         (self.ratio[1] - self.ratio[0]) + self.ratio[0])
+            ph, pw = int(img_shape[0] * ratio[0]), int(img_shape[1] * ratio[1])
+            px1, py1 = np.random.randint(0,
+                                         img_shape[1] - pw), np.random.randint(
+                                             0, img_shape[0] - ph)
+            px2, py2 = px1 + pw, py1 + ph
+            patches.append([px1, py1, px2, py2])
+        return np.array(patches)
+
+    def _transform_img(self, results: dict, patches: List[list]) -> None:
+        """Random erasing the image."""
+        for patch in patches:
+            px1, py1, px2, py2 = patch
+            results['img'][py1:py2, px1:px2, :] = self.img_border_value
+
+    def _transform_bboxes(self, results: dict, patches: List[list]) -> None:
+        """Random erasing the bboxes."""
+        bboxes = results['gt_bboxes']
+        # TODO: unify the logic by using operators in BaseBoxes.
+        assert isinstance(bboxes, HorizontalBoxes)
+        bboxes = bboxes.numpy()
+        left_top = np.maximum(bboxes[:, None, :2], patches[:, :2])
+        right_bottom = np.minimum(bboxes[:, None, 2:], patches[:, 2:])
+        wh = np.maximum(right_bottom - left_top, 0)
+        inter_areas = wh[:, :, 0] * wh[:, :, 1]
+        bbox_areas = (bboxes[:, 2] - bboxes[:, 0]) * (
+            bboxes[:, 3] - bboxes[:, 1])
+        bboxes_erased_ratio = inter_areas.sum(-1) / (bbox_areas + 1e-7)
+        valid_inds = bboxes_erased_ratio < self.bbox_erased_thr
+        results['gt_bboxes'] = HorizontalBoxes(bboxes[valid_inds])
+        results['gt_bboxes_labels'] = results['gt_bboxes_labels'][valid_inds]
+        results['gt_ignore_flags'] = results['gt_ignore_flags'][valid_inds]
+        if results.get('gt_masks', None) is not None:
+            results['gt_masks'] = results['gt_masks'][valid_inds]
+
+    def _transform_masks(self, results: dict, patches: List[list]) -> None:
+        """Random erasing the masks."""
+        for patch in patches:
+            px1, py1, px2, py2 = patch
+            results['gt_masks'].masks[:, py1:py2,
+                                      px1:px2] = self.mask_border_value
+
+    def _transform_seg(self, results: dict, patches: List[list]) -> None:
+        """Random erasing the segmentation map."""
+        for patch in patches:
+            px1, py1, px2, py2 = patch
+            results['gt_seg_map'][py1:py2, px1:px2] = self.seg_ignore_label
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to erase some regions of image."""
+        patches = self._get_patches(results['img_shape'])
+        self._transform_img(results, patches)
+        if results.get('gt_bboxes', None) is not None:
+            self._transform_bboxes(results, patches)
+        if results.get('gt_masks', None) is not None:
+            self._transform_masks(results, patches)
+        if results.get('gt_seg_map', None) is not None:
+            self._transform_seg(results, patches)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(n_patches={self.n_patches}, '
+        repr_str += f'ratio={self.ratio}, '
+        repr_str += f'squared={self.squared}, '
+        repr_str += f'bbox_erased_thr={self.bbox_erased_thr}, '
+        repr_str += f'img_border_value={self.img_border_value}, '
+        repr_str += f'mask_border_value={self.mask_border_value}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class CachedMosaic(Mosaic):
+    """Cached mosaic augmentation.
+
+    Cached mosaic transform will random select images from the cache
+    and combine them into one output image.
+
+    .. code:: text
+
+                        mosaic transform
+                           center_x
+                +------------------------------+
+                |       pad        |  pad      |
+                |      +-----------+           |
+                |      |           |           |
+                |      |  image1   |--------+  |
+                |      |           |        |  |
+                |      |           | image2 |  |
+     center_y   |----+-------------+-----------|
+                |    |   cropped   |           |
+                |pad |   image3    |  image4   |
+                |    |             |           |
+                +----|-------------+-----------+
+                     |             |
+                     +-------------+
+
+     The cached mosaic transform steps are as follows:
+
+         1. Append the results from the last transform into the cache.
+         2. Choose the mosaic center as the intersections of 4 images
+         3. Get the left top image according to the index, and randomly
+            sample another 3 images from the result cache.
+         4. Sub image will be cropped if image is larger than mosaic patch
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (np.float32) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+
+    Args:
+        img_scale (Sequence[int]): Image size before mosaic pipeline of single
+            image. The shape order should be (width, height).
+            Defaults to (640, 640).
+        center_ratio_range (Sequence[float]): Center ratio range of mosaic
+            output. Defaults to (0.5, 1.5).
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        pad_val (int): Pad value. Defaults to 114.
+        prob (float): Probability of applying this transformation.
+            Defaults to 1.0.
+        max_cached_images (int): The maximum length of the cache. The larger
+            the cache, the stronger the randomness of this transform. As a
+            rule of thumb, providing 10 caches for each image suffices for
+            randomness. Defaults to 40.
+        random_pop (bool): Whether to randomly pop a result from the cache
+            when the cache is full. If set to False, use FIFO popping method.
+            Defaults to True.
+    """
+
+    def __init__(self,
+                 *args,
+                 max_cached_images: int = 40,
+                 random_pop: bool = True,
+                 **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.results_cache = []
+        self.random_pop = random_pop
+        assert max_cached_images >= 4, 'The length of cache must >= 4, ' \
+                                       f'but got {max_cached_images}.'
+        self.max_cached_images = max_cached_images
+
+    @cache_randomness
+    def get_indexes(self, cache: list) -> list:
+        """Call function to collect indexes.
+
+        Args:
+            cache (list): The results cache.
+
+        Returns:
+            list: indexes.
+        """
+
+        indexes = [random.randint(0, len(cache) - 1) for _ in range(3)]
+        return indexes
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Mosaic transform function.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        # cache and pop images
+        self.results_cache.append(copy.deepcopy(results))
+        if len(self.results_cache) > self.max_cached_images:
+            if self.random_pop:
+                index = random.randint(0, len(self.results_cache) - 1)
+            else:
+                index = 0
+            self.results_cache.pop(index)
+
+        if len(self.results_cache) <= 4:
+            return results
+
+        if random.uniform(0, 1) > self.prob:
+            return results
+        indices = self.get_indexes(self.results_cache)
+        mix_results = [copy.deepcopy(self.results_cache[i]) for i in indices]
+
+        # TODO: refactor mosaic to reuse these code.
+        mosaic_bboxes = []
+        mosaic_bboxes_labels = []
+        mosaic_ignore_flags = []
+        mosaic_masks = []
+        with_mask = True if 'gt_masks' in results else False
+
+        if len(results['img'].shape) == 3:
+            mosaic_img = np.full(
+                (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3),
+                self.pad_val,
+                dtype=results['img'].dtype)
+        else:
+            mosaic_img = np.full(
+                (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)),
+                self.pad_val,
+                dtype=results['img'].dtype)
+
+        # mosaic center x, y
+        center_x = int(
+            random.uniform(*self.center_ratio_range) * self.img_scale[0])
+        center_y = int(
+            random.uniform(*self.center_ratio_range) * self.img_scale[1])
+        center_position = (center_x, center_y)
+
+        loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
+        for i, loc in enumerate(loc_strs):
+            if loc == 'top_left':
+                results_patch = copy.deepcopy(results)
+            else:
+                results_patch = copy.deepcopy(mix_results[i - 1])
+
+            img_i = results_patch['img']
+            h_i, w_i = img_i.shape[:2]
+            # keep_ratio resize
+            scale_ratio_i = min(self.img_scale[1] / h_i,
+                                self.img_scale[0] / w_i)
+            img_i = mmcv.imresize(
+                img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))
+
+            # compute the combine parameters
+            paste_coord, crop_coord = self._mosaic_combine(
+                loc, center_position, img_i.shape[:2][::-1])
+            x1_p, y1_p, x2_p, y2_p = paste_coord
+            x1_c, y1_c, x2_c, y2_c = crop_coord
+
+            # crop and paste image
+            mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]
+
+            # adjust coordinate
+            gt_bboxes_i = results_patch['gt_bboxes']
+            gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
+            gt_ignore_flags_i = results_patch['gt_ignore_flags']
+
+            padw = x1_p - x1_c
+            padh = y1_p - y1_c
+            gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
+            gt_bboxes_i.translate_([padw, padh])
+            mosaic_bboxes.append(gt_bboxes_i)
+            mosaic_bboxes_labels.append(gt_bboxes_labels_i)
+            mosaic_ignore_flags.append(gt_ignore_flags_i)
+            if with_mask and results_patch.get('gt_masks', None) is not None:
+                gt_masks_i = results_patch['gt_masks']
+                gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i))
+                gt_masks_i = gt_masks_i.translate(
+                    out_shape=(int(self.img_scale[0] * 2),
+                               int(self.img_scale[1] * 2)),
+                    offset=padw,
+                    direction='horizontal')
+                gt_masks_i = gt_masks_i.translate(
+                    out_shape=(int(self.img_scale[0] * 2),
+                               int(self.img_scale[1] * 2)),
+                    offset=padh,
+                    direction='vertical')
+                mosaic_masks.append(gt_masks_i)
+
+        mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
+        mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
+        mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)
+
+        if self.bbox_clip_border:
+            mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]])
+        # remove outside bboxes
+        inside_inds = mosaic_bboxes.is_inside(
+            [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy()
+        mosaic_bboxes = mosaic_bboxes[inside_inds]
+        mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
+        mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
+
+        results['img'] = mosaic_img
+        results['img_shape'] = mosaic_img.shape[:2]
+        results['gt_bboxes'] = mosaic_bboxes
+        results['gt_bboxes_labels'] = mosaic_bboxes_labels
+        results['gt_ignore_flags'] = mosaic_ignore_flags
+
+        if with_mask:
+            mosaic_masks = mosaic_masks[0].cat(mosaic_masks)
+            results['gt_masks'] = mosaic_masks[inside_inds]
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'center_ratio_range={self.center_ratio_range}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'prob={self.prob}, '
+        repr_str += f'max_cached_images={self.max_cached_images}, '
+        repr_str += f'random_pop={self.random_pop})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class CachedMixUp(BaseTransform):
+    """Cached mixup data augmentation.
+
+    .. code:: text
+
+                         mixup transform
+                +------------------------------+
+                | mixup image   |              |
+                |      +--------|--------+     |
+                |      |        |        |     |
+                |---------------+        |     |
+                |      |                 |     |
+                |      |      image      |     |
+                |      |                 |     |
+                |      |                 |     |
+                |      |-----------------+     |
+                |             pad              |
+                +------------------------------+
+
+     The cached mixup transform steps are as follows:
+
+        1. Append the results from the last transform into the cache.
+        2. Another random image is picked from the cache and embedded in
+           the top left patch(after padding and resizing)
+        3. The target of mixup transform is the weighted average of mixup
+           image and origin image.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (np.float32) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - mix_results (List[dict])
+
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+
+
+    Args:
+        img_scale (Sequence[int]): Image output size after mixup pipeline.
+            The shape order should be (width, height). Defaults to (640, 640).
+        ratio_range (Sequence[float]): Scale ratio of mixup image.
+            Defaults to (0.5, 1.5).
+        flip_ratio (float): Horizontal flip ratio of mixup image.
+            Defaults to 0.5.
+        pad_val (int): Pad value. Defaults to 114.
+        max_iters (int): The maximum number of iterations. If the number of
+            iterations is greater than `max_iters`, but gt_bbox is still
+            empty, then the iteration is terminated. Defaults to 15.
+        bbox_clip_border (bool, optional): Whether to clip the objects outside
+            the border of the image. In some dataset like MOT17, the gt bboxes
+            are allowed to cross the border of images. Therefore, we don't
+            need to clip the gt bboxes in these cases. Defaults to True.
+        max_cached_images (int): The maximum length of the cache. The larger
+            the cache, the stronger the randomness of this transform. As a
+            rule of thumb, providing 10 caches for each image suffices for
+            randomness. Defaults to 20.
+        random_pop (bool): Whether to randomly pop a result from the cache
+            when the cache is full. If set to False, use FIFO popping method.
+            Defaults to True.
+        prob (float): Probability of applying this transformation.
+            Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 img_scale: Tuple[int, int] = (640, 640),
+                 ratio_range: Tuple[float, float] = (0.5, 1.5),
+                 flip_ratio: float = 0.5,
+                 pad_val: float = 114.0,
+                 max_iters: int = 15,
+                 bbox_clip_border: bool = True,
+                 max_cached_images: int = 20,
+                 random_pop: bool = True,
+                 prob: float = 1.0) -> None:
+        assert isinstance(img_scale, tuple)
+        assert max_cached_images >= 2, 'The length of cache must >= 2, ' \
+                                       f'but got {max_cached_images}.'
+        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \
+                                 f'got {prob}.'
+        self.dynamic_scale = img_scale
+        self.ratio_range = ratio_range
+        self.flip_ratio = flip_ratio
+        self.pad_val = pad_val
+        self.max_iters = max_iters
+        self.bbox_clip_border = bbox_clip_border
+        self.results_cache = []
+
+        self.max_cached_images = max_cached_images
+        self.random_pop = random_pop
+        self.prob = prob
+
+    @cache_randomness
+    def get_indexes(self, cache: list) -> int:
+        """Call function to collect indexes.
+
+        Args:
+            cache (list): The result cache.
+
+        Returns:
+            int: index.
+        """
+
+        for i in range(self.max_iters):
+            index = random.randint(0, len(cache) - 1)
+            gt_bboxes_i = cache[index]['gt_bboxes']
+            if len(gt_bboxes_i) != 0:
+                break
+        return index
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """MixUp transform function.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        # cache and pop images
+        self.results_cache.append(copy.deepcopy(results))
+        if len(self.results_cache) > self.max_cached_images:
+            if self.random_pop:
+                index = random.randint(0, len(self.results_cache) - 1)
+            else:
+                index = 0
+            self.results_cache.pop(index)
+
+        if len(self.results_cache) <= 1:
+            return results
+
+        if random.uniform(0, 1) > self.prob:
+            return results
+
+        index = self.get_indexes(self.results_cache)
+        retrieve_results = copy.deepcopy(self.results_cache[index])
+
+        # TODO: refactor mixup to reuse these code.
+        if retrieve_results['gt_bboxes'].shape[0] == 0:
+            # empty bbox
+            return results
+
+        retrieve_img = retrieve_results['img']
+        with_mask = True if 'gt_masks' in results else False
+
+        jit_factor = random.uniform(*self.ratio_range)
+        is_flip = random.uniform(0, 1) > self.flip_ratio
+
+        if len(retrieve_img.shape) == 3:
+            out_img = np.ones(
+                (self.dynamic_scale[1], self.dynamic_scale[0], 3),
+                dtype=retrieve_img.dtype) * self.pad_val
+        else:
+            out_img = np.ones(
+                self.dynamic_scale[::-1],
+                dtype=retrieve_img.dtype) * self.pad_val
+
+        # 1. keep_ratio resize
+        scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0],
+                          self.dynamic_scale[0] / retrieve_img.shape[1])
+        retrieve_img = mmcv.imresize(
+            retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
+                           int(retrieve_img.shape[0] * scale_ratio)))
+
+        # 2. paste
+        out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img
+
+        # 3. scale jit
+        scale_ratio *= jit_factor
+        out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
+                                          int(out_img.shape[0] * jit_factor)))
+
+        # 4. flip
+        if is_flip:
+            out_img = out_img[:, ::-1, :]
+
+        # 5. random crop
+        ori_img = results['img']
+        origin_h, origin_w = out_img.shape[:2]
+        target_h, target_w = ori_img.shape[:2]
+        padded_img = np.ones((max(origin_h, target_h), max(
+            origin_w, target_w), 3)) * self.pad_val
+        padded_img = padded_img.astype(np.uint8)
+        padded_img[:origin_h, :origin_w] = out_img
+
+        x_offset, y_offset = 0, 0
+        if padded_img.shape[0] > target_h:
+            y_offset = random.randint(0, padded_img.shape[0] - target_h)
+        if padded_img.shape[1] > target_w:
+            x_offset = random.randint(0, padded_img.shape[1] - target_w)
+        padded_cropped_img = padded_img[y_offset:y_offset + target_h,
+                                        x_offset:x_offset + target_w]
+
+        # 6. adjust bbox
+        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
+        retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio])
+        if with_mask:
+            retrieve_gt_masks = retrieve_results['gt_masks'].rescale(
+                scale_ratio)
+
+        if self.bbox_clip_border:
+            retrieve_gt_bboxes.clip_([origin_h, origin_w])
+
+        if is_flip:
+            retrieve_gt_bboxes.flip_([origin_h, origin_w],
+                                     direction='horizontal')
+            if with_mask:
+                retrieve_gt_masks = retrieve_gt_masks.flip()
+
+        # 7. filter
+        cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone()
+        cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset])
+        if with_mask:
+            retrieve_gt_masks = retrieve_gt_masks.translate(
+                out_shape=(target_h, target_w),
+                offset=-x_offset,
+                direction='horizontal')
+            retrieve_gt_masks = retrieve_gt_masks.translate(
+                out_shape=(target_h, target_w),
+                offset=-y_offset,
+                direction='vertical')
+
+        if self.bbox_clip_border:
+            cp_retrieve_gt_bboxes.clip_([target_h, target_w])
+
+        # 8. mix up
+        ori_img = ori_img.astype(np.float32)
+        mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32)
+
+        retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
+        retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']
+
+        mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat(
+            (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0)
+        mixup_gt_bboxes_labels = np.concatenate(
+            (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
+        mixup_gt_ignore_flags = np.concatenate(
+            (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)
+        if with_mask:
+            mixup_gt_masks = retrieve_gt_masks.cat(
+                [results['gt_masks'], retrieve_gt_masks])
+
+        # remove outside bbox
+        inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy()
+        mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
+        mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds]
+        mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds]
+        if with_mask:
+            mixup_gt_masks = mixup_gt_masks[inside_inds]
+
+        results['img'] = mixup_img.astype(np.uint8)
+        results['img_shape'] = mixup_img.shape[:2]
+        results['gt_bboxes'] = mixup_gt_bboxes
+        results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
+        results['gt_ignore_flags'] = mixup_gt_ignore_flags
+        if with_mask:
+            results['gt_masks'] = mixup_gt_masks
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(dynamic_scale={self.dynamic_scale}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'flip_ratio={self.flip_ratio}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'max_iters={self.max_iters}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '
+        repr_str += f'max_cached_images={self.max_cached_images}, '
+        repr_str += f'random_pop={self.random_pop}, '
+        repr_str += f'prob={self.prob})'
+        return repr_str
diff --git a/mmde/mmdet/datasets/transforms/wrappers.py b/mmde/mmdet/datasets/transforms/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a17711c06bfbd4dc0038dce9ea7796d1476c37e
--- /dev/null
+++ b/mmde/mmdet/datasets/transforms/wrappers.py
@@ -0,0 +1,277 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+from mmcv.transforms import BaseTransform, Compose
+from mmcv.transforms.utils import cache_random_params, cache_randomness
+
+from mmdet.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class MultiBranch(BaseTransform):
+    r"""Multiple branch pipeline wrapper.
+
+    Generate multiple data-augmented versions of the same image.
+    `MultiBranch` needs to specify the branch names of all
+    pipelines of the dataset, perform corresponding data augmentation
+    for the current branch, and return None for other branches,
+    which ensures the consistency of return format across
+    different samples.
+
+    Args:
+        branch_field (list): List of branch names.
+        branch_pipelines (dict): Dict of different pipeline configs
+            to be composed.
+
+    Examples:
+        >>> branch_field = ['sup', 'unsup_teacher', 'unsup_student']
+        >>> sup_pipeline = [
+        >>>     dict(type='LoadImageFromFile'),
+        >>>     dict(type='LoadAnnotations', with_bbox=True),
+        >>>     dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+        >>>     dict(type='RandomFlip', prob=0.5),
+        >>>     dict(
+        >>>         type='MultiBranch',
+        >>>         branch_field=branch_field,
+        >>>         sup=dict(type='PackDetInputs'))
+        >>>     ]
+        >>> weak_pipeline = [
+        >>>     dict(type='LoadImageFromFile'),
+        >>>     dict(type='LoadAnnotations', with_bbox=True),
+        >>>     dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+        >>>     dict(type='RandomFlip', prob=0.0),
+        >>>     dict(
+        >>>         type='MultiBranch',
+        >>>         branch_field=branch_field,
+        >>>         sup=dict(type='PackDetInputs'))
+        >>>     ]
+        >>> strong_pipeline = [
+        >>>     dict(type='LoadImageFromFile'),
+        >>>     dict(type='LoadAnnotations', with_bbox=True),
+        >>>     dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+        >>>     dict(type='RandomFlip', prob=1.0),
+        >>>     dict(
+        >>>         type='MultiBranch',
+        >>>         branch_field=branch_field,
+        >>>         sup=dict(type='PackDetInputs'))
+        >>>     ]
+        >>> unsup_pipeline = [
+        >>>     dict(type='LoadImageFromFile'),
+        >>>     dict(type='LoadEmptyAnnotations'),
+        >>>     dict(
+        >>>         type='MultiBranch',
+        >>>         branch_field=branch_field,
+        >>>         unsup_teacher=weak_pipeline,
+        >>>         unsup_student=strong_pipeline)
+        >>>     ]
+        >>> from mmcv.transforms import Compose
+        >>> sup_branch = Compose(sup_pipeline)
+        >>> unsup_branch = Compose(unsup_pipeline)
+        >>> print(sup_branch)
+        >>> Compose(
+        >>>     LoadImageFromFile(ignore_empty=False, to_float32=False, color_type='color', imdecode_backend='cv2') # noqa
+        >>>     LoadAnnotations(with_bbox=True, with_label=True, with_mask=False, with_seg=False, poly2mask=True, imdecode_backend='cv2') # noqa
+        >>>     Resize(scale=(1333, 800), scale_factor=None, keep_ratio=True, clip_object_border=True), backend=cv2), interpolation=bilinear) # noqa
+        >>>     RandomFlip(prob=0.5, direction=horizontal)
+        >>>     MultiBranch(branch_pipelines=['sup'])
+        >>> )
+        >>> print(unsup_branch)
+        >>> Compose(
+        >>>     LoadImageFromFile(ignore_empty=False, to_float32=False, color_type='color', imdecode_backend='cv2') # noqa
+        >>>     LoadEmptyAnnotations(with_bbox=True, with_label=True, with_mask=False, with_seg=False, seg_ignore_label=255) # noqa
+        >>>     MultiBranch(branch_pipelines=['unsup_teacher', 'unsup_student'])
+        >>> )
+    """
+
+    def __init__(self, branch_field: List[str],
+                 **branch_pipelines: dict) -> None:
+        self.branch_field = branch_field
+        self.branch_pipelines = {
+            branch: Compose(pipeline)
+            for branch, pipeline in branch_pipelines.items()
+        }
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to apply transforms sequentially.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict:
+
+            - 'inputs' (Dict[str, obj:`torch.Tensor`]): The forward data of
+                models from different branches.
+            - 'data_sample' (Dict[str,obj:`DetDataSample`]): The annotation
+                info of the sample from different branches.
+        """
+
+        multi_results = {}
+        for branch in self.branch_field:
+            multi_results[branch] = {'inputs': None, 'data_samples': None}
+        for branch, pipeline in self.branch_pipelines.items():
+            branch_results = pipeline(copy.deepcopy(results))
+            # If one branch pipeline returns None,
+            # it will sample another data from dataset.
+            if branch_results is None:
+                return None
+            multi_results[branch] = branch_results
+
+        format_results = {}
+        for branch, results in multi_results.items():
+            for key in results.keys():
+                if format_results.get(key, None) is None:
+                    format_results[key] = {branch: results[key]}
+                else:
+                    format_results[key][branch] = results[key]
+        return format_results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(branch_pipelines={list(self.branch_pipelines.keys())})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomOrder(Compose):
+    """Shuffle the transform Sequence."""
+
+    @cache_randomness
+    def _random_permutation(self):
+        return np.random.permutation(len(self.transforms))
+
+    def transform(self, results: Dict) -> Optional[Dict]:
+        """Transform function to apply transforms in random order.
+
+        Args:
+            results (dict): A result dict contains the results to transform.
+
+        Returns:
+            dict or None: Transformed results.
+        """
+        inds = self._random_permutation()
+        for idx in inds:
+            t = self.transforms[idx]
+            results = t(results)
+            if results is None:
+                return None
+        return results
+
+    def __repr__(self):
+        """Compute the string representation."""
+        format_string = self.__class__.__name__ + '('
+        for t in self.transforms:
+            format_string += f'{t.__class__.__name__}, '
+        format_string += ')'
+        return format_string
+
+
+@TRANSFORMS.register_module()
+class ProposalBroadcaster(BaseTransform):
+    """A transform wrapper to apply the wrapped transforms to process both
+    `gt_bboxes` and `proposals` without adding any codes. It will do the
+    following steps:
+
+        1. Scatter the broadcasting targets to a list of inputs of the wrapped
+           transforms. The type of the list should be list[dict, dict], which
+           the first is the original inputs, the second is the processing
+           results that `gt_bboxes` being rewritten by the `proposals`.
+        2. Apply ``self.transforms``, with same random parameters, which is
+           sharing with a context manager. The type of the outputs is a
+           list[dict, dict].
+        3. Gather the outputs, update the `proposals` in the first item of
+           the outputs with the `gt_bboxes` in the second .
+
+    Args:
+         transforms (list, optional): Sequence of transform
+            object or config dict to be wrapped. Defaults to [].
+
+    Note: The `TransformBroadcaster` in MMCV can achieve the same operation as
+          `ProposalBroadcaster`, but need to set more complex parameters.
+
+    Examples:
+        >>> pipeline = [
+        >>>     dict(type='LoadImageFromFile'),
+        >>>     dict(type='LoadProposals', num_max_proposals=2000),
+        >>>     dict(type='LoadAnnotations', with_bbox=True),
+        >>>     dict(
+        >>>         type='ProposalBroadcaster',
+        >>>         transforms=[
+        >>>             dict(type='Resize', scale=(1333, 800),
+        >>>                  keep_ratio=True),
+        >>>             dict(type='RandomFlip', prob=0.5),
+        >>>         ]),
+        >>>     dict(type='PackDetInputs')]
+    """
+
+    def __init__(self, transforms: List[Union[dict, Callable]] = []) -> None:
+        self.transforms = Compose(transforms)
+
+    def transform(self, results: dict) -> dict:
+        """Apply wrapped transform functions to process both `gt_bboxes` and
+        `proposals`.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        assert results.get('proposals', None) is not None, \
+            '`proposals` should be in the results, please delete ' \
+            '`ProposalBroadcaster` in your configs, or check whether ' \
+            'you have load proposals successfully.'
+
+        inputs = self._process_input(results)
+        outputs = self._apply_transforms(inputs)
+        outputs = self._process_output(outputs)
+        return outputs
+
+    def _process_input(self, data: dict) -> list:
+        """Scatter the broadcasting targets to a list of inputs of the wrapped
+        transforms.
+
+        Args:
+            data (dict): The original input data.
+
+        Returns:
+            list[dict]: A list of input data.
+        """
+        cp_data = copy.deepcopy(data)
+        cp_data['gt_bboxes'] = cp_data['proposals']
+        scatters = [data, cp_data]
+        return scatters
+
+    def _apply_transforms(self, inputs: list) -> list:
+        """Apply ``self.transforms``.
+
+        Args:
+            inputs (list[dict, dict]): list of input data.
+
+        Returns:
+            list[dict]: The output of the wrapped pipeline.
+        """
+        assert len(inputs) == 2
+        ctx = cache_random_params
+        with ctx(self.transforms):
+            output_scatters = [self.transforms(_input) for _input in inputs]
+        return output_scatters
+
+    def _process_output(self, output_scatters: list) -> dict:
+        """Gathering and renaming data items.
+
+        Args:
+            output_scatters (list[dict, dict]): The output of the wrapped
+                pipeline.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        assert isinstance(output_scatters, list) and \
+               isinstance(output_scatters[0], dict) and \
+               len(output_scatters) == 2
+        outputs = output_scatters[0]
+        outputs['proposals'] = output_scatters[1]['gt_bboxes']
+        return outputs
diff --git a/mmde/mmdet/datasets/utils.py b/mmde/mmdet/datasets/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d794eb4b06ec9db56ff3a5fc7b817d1d9332a989
--- /dev/null
+++ b/mmde/mmdet/datasets/utils.py
@@ -0,0 +1,48 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.transforms import LoadImageFromFile
+
+from mmdet.datasets.transforms import LoadAnnotations, LoadPanopticAnnotations
+from mmdet.registry import TRANSFORMS
+
+
+def get_loading_pipeline(pipeline):
+    """Only keep loading image and annotations related configuration.
+
+    Args:
+        pipeline (list[dict]): Data pipeline configs.
+
+    Returns:
+        list[dict]: The new pipeline list with only keep
+            loading image and annotations related configuration.
+
+    Examples:
+        >>> pipelines = [
+        ...    dict(type='LoadImageFromFile'),
+        ...    dict(type='LoadAnnotations', with_bbox=True),
+        ...    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+        ...    dict(type='RandomFlip', flip_ratio=0.5),
+        ...    dict(type='Normalize', **img_norm_cfg),
+        ...    dict(type='Pad', size_divisor=32),
+        ...    dict(type='DefaultFormatBundle'),
+        ...    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+        ...    ]
+        >>> expected_pipelines = [
+        ...    dict(type='LoadImageFromFile'),
+        ...    dict(type='LoadAnnotations', with_bbox=True)
+        ...    ]
+        >>> assert expected_pipelines ==\
+        ...        get_loading_pipeline(pipelines)
+    """
+    loading_pipeline_cfg = []
+    for cfg in pipeline:
+        obj_cls = TRANSFORMS.get(cfg['type'])
+        # TODO:use more elegant way to distinguish loading modules
+        if obj_cls is not None and obj_cls in (LoadImageFromFile,
+                                               LoadAnnotations,
+                                               LoadPanopticAnnotations):
+            loading_pipeline_cfg.append(cfg)
+    assert len(loading_pipeline_cfg) == 2, \
+        'The data pipeline in your config file must include ' \
+        'loading image and annotations related pipeline.'
+    return loading_pipeline_cfg
diff --git a/mmde/mmdet/datasets/v3det.py b/mmde/mmdet/datasets/v3det.py
new file mode 100644
index 0000000000000000000000000000000000000000..25bfe3bc718841143653c54954240186c3376955
--- /dev/null
+++ b/mmde/mmdet/datasets/v3det.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path
+from typing import Optional
+
+import mmengine
+
+from mmdet.registry import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class V3DetDataset(CocoDataset):
+    """Dataset for V3Det."""
+
+    METAINFO = {
+        'classes': None,
+        'palette': None,
+    }
+
+    def __init__(
+            self,
+            *args,
+            metainfo: Optional[dict] = None,
+            data_root: str = '',
+            label_file='annotations/category_name_13204_v3det_2023_v1.txt',  # noqa
+            **kwargs) -> None:
+        class_names = tuple(
+            mmengine.list_from_file(os.path.join(data_root, label_file)))
+        if metainfo is None:
+            metainfo = {'classes': class_names}
+        super().__init__(
+            *args, data_root=data_root, metainfo=metainfo, **kwargs)
diff --git a/mmde/mmdet/datasets/voc.py b/mmde/mmdet/datasets/voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..65e73f2f0bd4f2b16d5237cd3b5f342e44cf0438
--- /dev/null
+++ b/mmde/mmdet/datasets/voc.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import DATASETS
+from .xml_style import XMLDataset
+
+
+@DATASETS.register_module()
+class VOCDataset(XMLDataset):
+    """Dataset for PASCAL VOC."""
+
+    METAINFO = {
+        'classes':
+        ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
+         'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
+         'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'),
+        # palette is a list of color tuples, which is used for visualization.
+        'palette': [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192),
+                    (197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255),
+                    (153, 69, 1), (120, 166, 157), (0, 182, 199),
+                    (0, 226, 252), (182, 182, 255), (0, 0, 230), (220, 20, 60),
+                    (163, 255, 0), (0, 82, 0), (3, 95, 161), (0, 80, 100),
+                    (183, 130, 88)]
+    }
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if 'VOC2007' in self.sub_data_root:
+            self._metainfo['dataset_type'] = 'VOC2007'
+        elif 'VOC2012' in self.sub_data_root:
+            self._metainfo['dataset_type'] = 'VOC2012'
+        else:
+            self._metainfo['dataset_type'] = None
diff --git a/mmde/mmdet/datasets/wider_face.py b/mmde/mmdet/datasets/wider_face.py
new file mode 100644
index 0000000000000000000000000000000000000000..62c7fff869ab970b6f96908a998ba6feb25ea205
--- /dev/null
+++ b/mmde/mmdet/datasets/wider_face.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import xml.etree.ElementTree as ET
+
+from mmengine.dist import is_main_process
+from mmengine.fileio import get_local_path, list_from_file
+from mmengine.utils import ProgressBar
+
+from mmdet.registry import DATASETS
+from mmdet.utils.typing_utils import List, Union
+from .xml_style import XMLDataset
+
+
+@DATASETS.register_module()
+class WIDERFaceDataset(XMLDataset):
+    """Reader for the WIDER Face dataset in PASCAL VOC format.
+
+    Conversion scripts can be found in
+    https://github.com/sovrasov/wider-face-pascal-voc-annotations
+    """
+    METAINFO = {'classes': ('face', ), 'palette': [(0, 255, 0)]}
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotation from XML style ann_file.
+
+        Returns:
+            list[dict]: Annotation info from XML file.
+        """
+        assert self._metainfo.get('classes', None) is not None, \
+            'classes in `XMLDataset` can not be None.'
+        self.cat2label = {
+            cat: i
+            for i, cat in enumerate(self._metainfo['classes'])
+        }
+
+        data_list = []
+        img_ids = list_from_file(self.ann_file, backend_args=self.backend_args)
+
+        # loading process takes around 10 mins
+        if is_main_process():
+            prog_bar = ProgressBar(len(img_ids))
+
+        for img_id in img_ids:
+            raw_img_info = {}
+            raw_img_info['img_id'] = img_id
+            raw_img_info['file_name'] = f'{img_id}.jpg'
+            parsed_data_info = self.parse_data_info(raw_img_info)
+            data_list.append(parsed_data_info)
+
+            if is_main_process():
+                prog_bar.update()
+        return data_list
+
+    def parse_data_info(self, img_info: dict) -> Union[dict, List[dict]]:
+        """Parse raw annotation to target format.
+
+        Args:
+            img_info (dict): Raw image information, usually it includes
+                `img_id`, `file_name`, and `xml_path`.
+
+        Returns:
+            Union[dict, List[dict]]: Parsed annotation.
+        """
+        data_info = {}
+        img_id = img_info['img_id']
+        xml_path = osp.join(self.data_prefix['img'], 'Annotations',
+                            f'{img_id}.xml')
+        data_info['img_id'] = img_id
+        data_info['xml_path'] = xml_path
+
+        # deal with xml file
+        with get_local_path(
+                xml_path, backend_args=self.backend_args) as local_path:
+            raw_ann_info = ET.parse(local_path)
+        root = raw_ann_info.getroot()
+        size = root.find('size')
+        width = int(size.find('width').text)
+        height = int(size.find('height').text)
+        folder = root.find('folder').text
+        img_path = osp.join(self.data_prefix['img'], folder,
+                            img_info['file_name'])
+        data_info['img_path'] = img_path
+
+        data_info['height'] = height
+        data_info['width'] = width
+
+        # Coordinates are in range [0, width - 1 or height - 1]
+        data_info['instances'] = self._parse_instance_info(
+            raw_ann_info, minus_one=False)
+        return data_info
diff --git a/mmde/mmdet/datasets/xml_style.py b/mmde/mmdet/datasets/xml_style.py
new file mode 100644
index 0000000000000000000000000000000000000000..06045ea0092238abdac9622511b336586858f8f5
--- /dev/null
+++ b/mmde/mmdet/datasets/xml_style.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import xml.etree.ElementTree as ET
+from typing import List, Optional, Union
+
+import mmcv
+from mmengine.fileio import get, get_local_path, list_from_file
+
+from mmdet.registry import DATASETS
+from .base_det_dataset import BaseDetDataset
+
+
+@DATASETS.register_module()
+class XMLDataset(BaseDetDataset):
+    """XML dataset for detection.
+
+    Args:
+        img_subdir (str): Subdir where images are stored. Default: JPEGImages.
+        ann_subdir (str): Subdir where annotations are. Default: Annotations.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(self,
+                 img_subdir: str = 'JPEGImages',
+                 ann_subdir: str = 'Annotations',
+                 **kwargs) -> None:
+        self.img_subdir = img_subdir
+        self.ann_subdir = ann_subdir
+        super().__init__(**kwargs)
+
+    @property
+    def sub_data_root(self) -> str:
+        """Return the sub data root."""
+        return self.data_prefix.get('sub_data_root', '')
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotation from XML style ann_file.
+
+        Returns:
+            list[dict]: Annotation info from XML file.
+        """
+        assert self._metainfo.get('classes', None) is not None, \
+            '`classes` in `XMLDataset` can not be None.'
+        self.cat2label = {
+            cat: i
+            for i, cat in enumerate(self._metainfo['classes'])
+        }
+
+        data_list = []
+        img_ids = list_from_file(self.ann_file, backend_args=self.backend_args)
+        for img_id in img_ids:
+            file_name = osp.join(self.img_subdir, f'{img_id}.jpg')
+            xml_path = osp.join(self.sub_data_root, self.ann_subdir,
+                                f'{img_id}.xml')
+
+            raw_img_info = {}
+            raw_img_info['img_id'] = img_id
+            raw_img_info['file_name'] = file_name
+            raw_img_info['xml_path'] = xml_path
+
+            parsed_data_info = self.parse_data_info(raw_img_info)
+            data_list.append(parsed_data_info)
+        return data_list
+
+    @property
+    def bbox_min_size(self) -> Optional[int]:
+        """Return the minimum size of bounding boxes in the images."""
+        if self.filter_cfg is not None:
+            return self.filter_cfg.get('bbox_min_size', None)
+        else:
+            return None
+
+    def parse_data_info(self, img_info: dict) -> Union[dict, List[dict]]:
+        """Parse raw annotation to target format.
+
+        Args:
+            img_info (dict): Raw image information, usually it includes
+                `img_id`, `file_name`, and `xml_path`.
+
+        Returns:
+            Union[dict, List[dict]]: Parsed annotation.
+        """
+        data_info = {}
+        img_path = osp.join(self.sub_data_root, img_info['file_name'])
+        data_info['img_path'] = img_path
+        data_info['img_id'] = img_info['img_id']
+        data_info['xml_path'] = img_info['xml_path']
+
+        # deal with xml file
+        with get_local_path(
+                img_info['xml_path'],
+                backend_args=self.backend_args) as local_path:
+            raw_ann_info = ET.parse(local_path)
+        root = raw_ann_info.getroot()
+        size = root.find('size')
+        if size is not None:
+            width = int(size.find('width').text)
+            height = int(size.find('height').text)
+        else:
+            img_bytes = get(img_path, backend_args=self.backend_args)
+            img = mmcv.imfrombytes(img_bytes, backend='cv2')
+            height, width = img.shape[:2]
+            del img, img_bytes
+
+        data_info['height'] = height
+        data_info['width'] = width
+
+        data_info['instances'] = self._parse_instance_info(
+            raw_ann_info, minus_one=True)
+
+        return data_info
+
+    def _parse_instance_info(self,
+                             raw_ann_info: ET,
+                             minus_one: bool = True) -> List[dict]:
+        """parse instance information.
+
+        Args:
+            raw_ann_info (ElementTree): ElementTree object.
+            minus_one (bool): Whether to subtract 1 from the coordinates.
+                Defaults to True.
+
+        Returns:
+            List[dict]: List of instances.
+        """
+        instances = []
+        for obj in raw_ann_info.findall('object'):
+            instance = {}
+            name = obj.find('name').text
+            if name not in self._metainfo['classes']:
+                continue
+            difficult = obj.find('difficult')
+            difficult = 0 if difficult is None else int(difficult.text)
+            bnd_box = obj.find('bndbox')
+            bbox = [
+                int(float(bnd_box.find('xmin').text)),
+                int(float(bnd_box.find('ymin').text)),
+                int(float(bnd_box.find('xmax').text)),
+                int(float(bnd_box.find('ymax').text))
+            ]
+
+            # VOC needs to subtract 1 from the coordinates
+            if minus_one:
+                bbox = [x - 1 for x in bbox]
+
+            ignore = False
+            if self.bbox_min_size is not None:
+                assert not self.test_mode
+                w = bbox[2] - bbox[0]
+                h = bbox[3] - bbox[1]
+                if w < self.bbox_min_size or h < self.bbox_min_size:
+                    ignore = True
+            if difficult or ignore:
+                instance['ignore_flag'] = 1
+            else:
+                instance['ignore_flag'] = 0
+            instance['bbox'] = bbox
+            instance['bbox_label'] = self.cat2label[name]
+            instances.append(instance)
+        return instances
+
+    def filter_data(self) -> List[dict]:
+        """Filter annotations according to filter_cfg.
+
+        Returns:
+            List[dict]: Filtered results.
+        """
+        if self.test_mode:
+            return self.data_list
+
+        filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) \
+            if self.filter_cfg is not None else False
+        min_size = self.filter_cfg.get('min_size', 0) \
+            if self.filter_cfg is not None else 0
+
+        valid_data_infos = []
+        for i, data_info in enumerate(self.data_list):
+            width = data_info['width']
+            height = data_info['height']
+            if filter_empty_gt and len(data_info['instances']) == 0:
+                continue
+            if min(width, height) >= min_size:
+                valid_data_infos.append(data_info)
+
+        return valid_data_infos
diff --git a/mmde/mmdet/datasets/youtube_vis_dataset.py b/mmde/mmdet/datasets/youtube_vis_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..38c3d3909f1b8fd795c181546094056c54c9c4b2
--- /dev/null
+++ b/mmde/mmdet/datasets/youtube_vis_dataset.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import DATASETS
+from .base_video_dataset import BaseVideoDataset
+
+
+@DATASETS.register_module()
+class YouTubeVISDataset(BaseVideoDataset):
+    """YouTube VIS dataset for video instance segmentation.
+
+    Args:
+        dataset_version (str): Select dataset year version.
+    """
+
+    def __init__(self, dataset_version: str, *args, **kwargs):
+        self.set_dataset_classes(dataset_version)
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def set_dataset_classes(cls, dataset_version: str) -> None:
+        """Pass the category of the corresponding year to metainfo.
+
+        Args:
+            dataset_version (str): Select dataset year version.
+        """
+        classes_2019_version = ('person', 'giant_panda', 'lizard', 'parrot',
+                                'skateboard', 'sedan', 'ape', 'dog', 'snake',
+                                'monkey', 'hand', 'rabbit', 'duck', 'cat',
+                                'cow', 'fish', 'train', 'horse', 'turtle',
+                                'bear', 'motorbike', 'giraffe', 'leopard',
+                                'fox', 'deer', 'owl', 'surfboard', 'airplane',
+                                'truck', 'zebra', 'tiger', 'elephant',
+                                'snowboard', 'boat', 'shark', 'mouse', 'frog',
+                                'eagle', 'earless_seal', 'tennis_racket')
+
+        classes_2021_version = ('airplane', 'bear', 'bird', 'boat', 'car',
+                                'cat', 'cow', 'deer', 'dog', 'duck',
+                                'earless_seal', 'elephant', 'fish',
+                                'flying_disc', 'fox', 'frog', 'giant_panda',
+                                'giraffe', 'horse', 'leopard', 'lizard',
+                                'monkey', 'motorbike', 'mouse', 'parrot',
+                                'person', 'rabbit', 'shark', 'skateboard',
+                                'snake', 'snowboard', 'squirrel', 'surfboard',
+                                'tennis_racket', 'tiger', 'train', 'truck',
+                                'turtle', 'whale', 'zebra')
+
+        if dataset_version == '2019':
+            cls.METAINFO = dict(classes=classes_2019_version)
+        elif dataset_version == '2021':
+            cls.METAINFO = dict(classes=classes_2021_version)
+        else:
+            raise NotImplementedError('Not supported YouTubeVIS dataset'
+                                      f'version: {dataset_version}')
diff --git a/mmde/mmdet/engine/__init__.py b/mmde/mmdet/engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c91ace6ffa20948af572d3a0fd594e8a0b091775
--- /dev/null
+++ b/mmde/mmdet/engine/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .hooks import *  # noqa: F401, F403
+from .optimizers import *  # noqa: F401, F403
+from .runner import *  # noqa: F401, F403
+from .schedulers import *  # noqa: F401, F403
diff --git a/mmde/mmdet/engine/hooks/__init__.py b/mmde/mmdet/engine/hooks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..889fa557adef87e2251c625a7353503226beb079
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .checkloss_hook import CheckInvalidLossHook
+from .mean_teacher_hook import MeanTeacherHook
+from .memory_profiler_hook import MemoryProfilerHook
+from .num_class_check_hook import NumClassCheckHook
+from .pipeline_switch_hook import PipelineSwitchHook
+from .set_epoch_info_hook import SetEpochInfoHook
+from .sync_norm_hook import SyncNormHook
+from .utils import trigger_visualization_hook
+from .visualization_hook import (DetVisualizationHook,
+                                 GroundingVisualizationHook,
+                                 TrackVisualizationHook)
+from .yolox_mode_switch_hook import YOLOXModeSwitchHook
+
+__all__ = [
+    'YOLOXModeSwitchHook', 'SyncNormHook', 'CheckInvalidLossHook',
+    'SetEpochInfoHook', 'MemoryProfilerHook', 'DetVisualizationHook',
+    'NumClassCheckHook', 'MeanTeacherHook', 'trigger_visualization_hook',
+    'PipelineSwitchHook', 'TrackVisualizationHook',
+    'GroundingVisualizationHook'
+]
diff --git a/mmde/mmdet/engine/hooks/checkloss_hook.py b/mmde/mmdet/engine/hooks/checkloss_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ebfcd5dfcd7ae329399723d3a9c0fc0a0d722ef
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/checkloss_hook.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmengine.hooks import Hook
+from mmengine.runner import Runner
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class CheckInvalidLossHook(Hook):
+    """Check invalid loss hook.
+
+    This hook will regularly check whether the loss is valid
+    during training.
+
+    Args:
+        interval (int): Checking interval (every k iterations).
+            Default: 50.
+    """
+
+    def __init__(self, interval: int = 50) -> None:
+        self.interval = interval
+
+    def after_train_iter(self,
+                         runner: Runner,
+                         batch_idx: int,
+                         data_batch: Optional[dict] = None,
+                         outputs: Optional[dict] = None) -> None:
+        """Regularly check whether the loss is valid every n iterations.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the training process.
+            batch_idx (int): The index of the current batch in the train loop.
+            data_batch (dict, Optional): Data from dataloader.
+                Defaults to None.
+            outputs (dict, Optional): Outputs from model. Defaults to None.
+        """
+        if self.every_n_train_iters(runner, self.interval):
+            assert torch.isfinite(outputs['loss']), \
+                runner.logger.info('loss become infinite or NaN!')
diff --git a/mmde/mmdet/engine/hooks/mean_teacher_hook.py b/mmde/mmdet/engine/hooks/mean_teacher_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..b924c0a5934248d05e7ce1add50e7574b739b9c7
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/mean_teacher_hook.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch.nn as nn
+from mmengine.hooks import Hook
+from mmengine.model import is_model_wrapper
+from mmengine.runner import Runner
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class MeanTeacherHook(Hook):
+    """Mean Teacher Hook.
+
+    Mean Teacher is an efficient semi-supervised learning method in
+    `Mean Teacher <https://arxiv.org/abs/1703.01780>`_.
+    This method requires two models with exactly the same structure,
+    as the student model and the teacher model, respectively.
+    The student model updates the parameters through gradient descent,
+    and the teacher model updates the parameters through
+    exponential moving average of the student model.
+    Compared with the student model, the teacher model
+    is smoother and accumulates more knowledge.
+
+    Args:
+        momentum (float): The momentum used for updating teacher's parameter.
+            Teacher's parameter are updated with the formula:
+           `teacher = (1-momentum) * teacher + momentum * student`.
+            Defaults to 0.001.
+        interval (int): Update teacher's parameter every interval iteration.
+            Defaults to 1.
+        skip_buffers (bool): Whether to skip the model buffers, such as
+            batchnorm running stats (running_mean, running_var), it does not
+            perform the ema operation. Default to True.
+    """
+
+    def __init__(self,
+                 momentum: float = 0.001,
+                 interval: int = 1,
+                 skip_buffer=True) -> None:
+        assert 0 < momentum < 1
+        self.momentum = momentum
+        self.interval = interval
+        self.skip_buffers = skip_buffer
+
+    def before_train(self, runner: Runner) -> None:
+        """To check that teacher model and student model exist."""
+        model = runner.model
+        if is_model_wrapper(model):
+            model = model.module
+        assert hasattr(model, 'teacher')
+        assert hasattr(model, 'student')
+        # only do it at initial stage
+        if runner.iter == 0:
+            self.momentum_update(model, 1)
+
+    def after_train_iter(self,
+                         runner: Runner,
+                         batch_idx: int,
+                         data_batch: Optional[dict] = None,
+                         outputs: Optional[dict] = None) -> None:
+        """Update teacher's parameter every self.interval iterations."""
+        if (runner.iter + 1) % self.interval != 0:
+            return
+        model = runner.model
+        if is_model_wrapper(model):
+            model = model.module
+        self.momentum_update(model, self.momentum)
+
+    def momentum_update(self, model: nn.Module, momentum: float) -> None:
+        """Compute the moving average of the parameters using exponential
+        moving average."""
+        if self.skip_buffers:
+            for (src_name, src_parm), (dst_name, dst_parm) in zip(
+                    model.student.named_parameters(),
+                    model.teacher.named_parameters()):
+                dst_parm.data.mul_(1 - momentum).add_(
+                    src_parm.data, alpha=momentum)
+        else:
+            for (src_parm,
+                 dst_parm) in zip(model.student.state_dict().values(),
+                                  model.teacher.state_dict().values()):
+                # exclude num_tracking
+                if dst_parm.dtype.is_floating_point:
+                    dst_parm.data.mul_(1 - momentum).add_(
+                        src_parm.data, alpha=momentum)
diff --git a/mmde/mmdet/engine/hooks/memory_profiler_hook.py b/mmde/mmdet/engine/hooks/memory_profiler_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dcdcae0b669ade46026d28c46b35f35d90b504b
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/memory_profiler_hook.py
@@ -0,0 +1,121 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence
+
+from mmengine.hooks import Hook
+from mmengine.runner import Runner
+
+from mmdet.registry import HOOKS
+from mmdet.structures import DetDataSample
+
+
+@HOOKS.register_module()
+class MemoryProfilerHook(Hook):
+    """Memory profiler hook recording memory information including virtual
+    memory, swap memory, and the memory of the current process.
+
+    Args:
+        interval (int): Checking interval (every k iterations).
+            Default: 50.
+    """
+
+    def __init__(self, interval: int = 50) -> None:
+        try:
+            from psutil import swap_memory, virtual_memory
+            self._swap_memory = swap_memory
+            self._virtual_memory = virtual_memory
+        except ImportError:
+            raise ImportError('psutil is not installed, please install it by: '
+                              'pip install psutil')
+
+        try:
+            from memory_profiler import memory_usage
+            self._memory_usage = memory_usage
+        except ImportError:
+            raise ImportError(
+                'memory_profiler is not installed, please install it by: '
+                'pip install memory_profiler')
+
+        self.interval = interval
+
+    def _record_memory_information(self, runner: Runner) -> None:
+        """Regularly record memory information.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the training or evaluation
+                process.
+        """
+        # in Byte
+        virtual_memory = self._virtual_memory()
+        swap_memory = self._swap_memory()
+        # in MB
+        process_memory = self._memory_usage()[0]
+        factor = 1024 * 1024
+        runner.logger.info(
+            'Memory information '
+            'available_memory: '
+            f'{round(virtual_memory.available / factor)} MB, '
+            'used_memory: '
+            f'{round(virtual_memory.used / factor)} MB, '
+            f'memory_utilization: {virtual_memory.percent} %, '
+            'available_swap_memory: '
+            f'{round((swap_memory.total - swap_memory.used) / factor)}'
+            ' MB, '
+            f'used_swap_memory: {round(swap_memory.used / factor)} MB, '
+            f'swap_memory_utilization: {swap_memory.percent} %, '
+            'current_process_memory: '
+            f'{round(process_memory)} MB')
+
+    def after_train_iter(self,
+                         runner: Runner,
+                         batch_idx: int,
+                         data_batch: Optional[dict] = None,
+                         outputs: Optional[dict] = None) -> None:
+        """Regularly record memory information.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the training process.
+            batch_idx (int): The index of the current batch in the train loop.
+            data_batch (dict, optional): Data from dataloader.
+                Defaults to None.
+            outputs (dict, optional): Outputs from model. Defaults to None.
+        """
+        if self.every_n_inner_iters(batch_idx, self.interval):
+            self._record_memory_information(runner)
+
+    def after_val_iter(
+            self,
+            runner: Runner,
+            batch_idx: int,
+            data_batch: Optional[dict] = None,
+            outputs: Optional[Sequence[DetDataSample]] = None) -> None:
+        """Regularly record memory information.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the validation process.
+            batch_idx (int): The index of the current batch in the val loop.
+            data_batch (dict, optional): Data from dataloader.
+                Defaults to None.
+            outputs (Sequence[:obj:`DetDataSample`], optional):
+                Outputs from model. Defaults to None.
+        """
+        if self.every_n_inner_iters(batch_idx, self.interval):
+            self._record_memory_information(runner)
+
+    def after_test_iter(
+            self,
+            runner: Runner,
+            batch_idx: int,
+            data_batch: Optional[dict] = None,
+            outputs: Optional[Sequence[DetDataSample]] = None) -> None:
+        """Regularly record memory information.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the testing process.
+            batch_idx (int): The index of the current batch in the test loop.
+            data_batch (dict, optional): Data from dataloader.
+                Defaults to None.
+            outputs (Sequence[:obj:`DetDataSample`], optional):
+                Outputs from model. Defaults to None.
+        """
+        if self.every_n_inner_iters(batch_idx, self.interval):
+            self._record_memory_information(runner)
diff --git a/mmde/mmdet/engine/hooks/num_class_check_hook.py b/mmde/mmdet/engine/hooks/num_class_check_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..6588473acfbd3ffe8e80eb163aa7ee449332e6b8
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/num_class_check_hook.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn import VGG
+from mmengine.hooks import Hook
+from mmengine.runner import Runner
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class NumClassCheckHook(Hook):
+    """Check whether the `num_classes` in head matches the length of `classes`
+    in `dataset.metainfo`."""
+
+    def _check_head(self, runner: Runner, mode: str) -> None:
+        """Check whether the `num_classes` in head matches the length of
+        `classes` in `dataset.metainfo`.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the training or evaluation
+                process.
+        """
+        assert mode in ['train', 'val']
+        model = runner.model
+        dataset = runner.train_dataloader.dataset if mode == 'train' else \
+            runner.val_dataloader.dataset
+        if dataset.metainfo.get('classes', None) is None:
+            runner.logger.warning(
+                f'Please set `classes` '
+                f'in the {dataset.__class__.__name__} `metainfo` and'
+                f'check if it is consistent with the `num_classes` '
+                f'of head')
+        else:
+            classes = dataset.metainfo['classes']
+            assert type(classes) is not str, \
+                (f'`classes` in {dataset.__class__.__name__}'
+                 f'should be a tuple of str.'
+                 f'Add comma if number of classes is 1 as '
+                 f'classes = ({classes},)')
+            from mmdet.models.roi_heads.mask_heads import FusedSemanticHead
+            for name, module in model.named_modules():
+                if hasattr(module, 'num_classes') and not name.endswith(
+                        'rpn_head') and not isinstance(
+                            module, (VGG, FusedSemanticHead)):
+                    assert module.num_classes == len(classes), \
+                        (f'The `num_classes` ({module.num_classes}) in '
+                         f'{module.__class__.__name__} of '
+                         f'{model.__class__.__name__} does not matches '
+                         f'the length of `classes` '
+                         f'{len(classes)}) in '
+                         f'{dataset.__class__.__name__}')
+
+    def before_train_epoch(self, runner: Runner) -> None:
+        """Check whether the training dataset is compatible with head.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the training or evaluation
+                process.
+        """
+        self._check_head(runner, 'train')
+
+    def before_val_epoch(self, runner: Runner) -> None:
+        """Check whether the dataset in val epoch is compatible with head.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the training or evaluation
+                process.
+        """
+        self._check_head(runner, 'val')
diff --git a/mmde/mmdet/engine/hooks/pipeline_switch_hook.py b/mmde/mmdet/engine/hooks/pipeline_switch_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5abd897803b11793ebace86e45aac8f59938545
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/pipeline_switch_hook.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.transforms import Compose
+from mmengine.hooks import Hook
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class PipelineSwitchHook(Hook):
+    """Switch data pipeline at switch_epoch.
+
+    Args:
+        switch_epoch (int): switch pipeline at this epoch.
+        switch_pipeline (list[dict]): the pipeline to switch to.
+    """
+
+    def __init__(self, switch_epoch, switch_pipeline):
+        self.switch_epoch = switch_epoch
+        self.switch_pipeline = switch_pipeline
+        self._restart_dataloader = False
+        self._has_switched = False
+
+    def before_train_epoch(self, runner):
+        """switch pipeline."""
+        epoch = runner.epoch
+        train_loader = runner.train_dataloader
+        if epoch >= self.switch_epoch and not self._has_switched:
+            runner.logger.info('Switch pipeline now!')
+            # The dataset pipeline cannot be updated when persistent_workers
+            # is True, so we need to force the dataloader's multi-process
+            # restart. This is a very hacky approach.
+            train_loader.dataset.pipeline = Compose(self.switch_pipeline)
+            if hasattr(train_loader, 'persistent_workers'
+                       ) and train_loader.persistent_workers is True:
+                train_loader._DataLoader__initialized = False
+                train_loader._iterator = None
+                self._restart_dataloader = True
+            self._has_switched = True
+        else:
+            # Once the restart is complete, we need to restore
+            # the initialization flag.
+            if self._restart_dataloader:
+                train_loader._DataLoader__initialized = True
diff --git a/mmde/mmdet/engine/hooks/set_epoch_info_hook.py b/mmde/mmdet/engine/hooks/set_epoch_info_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..183f3167445dc0818e4fa37bdd2049d3876ed031
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/set_epoch_info_hook.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.hooks import Hook
+from mmengine.model.wrappers import is_model_wrapper
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class SetEpochInfoHook(Hook):
+    """Set runner's epoch information to the model."""
+
+    def before_train_epoch(self, runner):
+        epoch = runner.epoch
+        model = runner.model
+        if is_model_wrapper(model):
+            model = model.module
+        model.set_epoch(epoch)
diff --git a/mmde/mmdet/engine/hooks/sync_norm_hook.py b/mmde/mmdet/engine/hooks/sync_norm_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1734380c83157c911568098abfce761fb3c9a1f
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/sync_norm_hook.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+
+from mmengine.dist import get_dist_info
+from mmengine.hooks import Hook
+from torch import nn
+
+from mmdet.registry import HOOKS
+from mmdet.utils import all_reduce_dict
+
+
+def get_norm_states(module: nn.Module) -> OrderedDict:
+    """Get the state_dict of batch norms in the module."""
+    async_norm_states = OrderedDict()
+    for name, child in module.named_modules():
+        if isinstance(child, nn.modules.batchnorm._NormBase):
+            for k, v in child.state_dict().items():
+                async_norm_states['.'.join([name, k])] = v
+    return async_norm_states
+
+
+@HOOKS.register_module()
+class SyncNormHook(Hook):
+    """Synchronize Norm states before validation, currently used in YOLOX."""
+
+    def before_val_epoch(self, runner):
+        """Synchronizing norm."""
+        module = runner.model
+        _, world_size = get_dist_info()
+        if world_size == 1:
+            return
+        norm_states = get_norm_states(module)
+        if len(norm_states) == 0:
+            return
+        # TODO: use `all_reduce_dict` in mmengine
+        norm_states = all_reduce_dict(norm_states, op='mean')
+        module.load_state_dict(norm_states, strict=False)
diff --git a/mmde/mmdet/engine/hooks/utils.py b/mmde/mmdet/engine/hooks/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d267cfe77be163c0520568b7b7936f4453914aab
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/utils.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+def trigger_visualization_hook(cfg, args):
+    default_hooks = cfg.default_hooks
+    if 'visualization' in default_hooks:
+        visualization_hook = default_hooks['visualization']
+        # Turn on visualization
+        visualization_hook['draw'] = True
+        if args.show:
+            visualization_hook['show'] = True
+            visualization_hook['wait_time'] = args.wait_time
+        if args.show_dir:
+            visualization_hook['test_out_dir'] = args.show_dir
+    else:
+        raise RuntimeError(
+            'VisualizationHook must be included in default_hooks.'
+            'refer to usage '
+            '"visualization=dict(type=\'VisualizationHook\')"')
+
+    return cfg
diff --git a/mmde/mmdet/engine/hooks/visualization_hook.py b/mmde/mmdet/engine/hooks/visualization_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..3408186b6ef9c4195745b0c740519541572d27d2
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/visualization_hook.py
@@ -0,0 +1,515 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import warnings
+from typing import Optional, Sequence
+
+import mmcv
+import numpy as np
+from mmengine.fileio import get
+from mmengine.hooks import Hook
+from mmengine.runner import Runner
+from mmengine.utils import mkdir_or_exist
+from mmengine.visualization import Visualizer
+
+from mmdet.datasets.samplers import TrackImgSampler
+from mmdet.registry import HOOKS
+from mmdet.structures import DetDataSample, TrackDataSample
+from mmdet.structures.bbox import BaseBoxes
+from mmdet.visualization.palette import _get_adaptive_scales
+
+
+@HOOKS.register_module()
+class DetVisualizationHook(Hook):
+    """Detection Visualization Hook. Used to visualize validation and testing
+    process prediction results.
+
+    In the testing phase:
+
+    1. If ``show`` is True, it means that only the prediction results are
+        visualized without storing data, so ``vis_backends`` needs to
+        be excluded.
+    2. If ``test_out_dir`` is specified, it means that the prediction results
+        need to be saved to ``test_out_dir``. In order to avoid vis_backends
+        also storing data, so ``vis_backends`` needs to be excluded.
+    3. ``vis_backends`` takes effect if the user does not specify ``show``
+        and `test_out_dir``. You can set ``vis_backends`` to WandbVisBackend or
+        TensorboardVisBackend to store the prediction result in Wandb or
+        Tensorboard.
+
+    Args:
+        draw (bool): whether to draw prediction results. If it is False,
+            it means that no drawing will be done. Defaults to False.
+        interval (int): The interval of visualization. Defaults to 50.
+        score_thr (float): The threshold to visualize the bboxes
+            and masks. Defaults to 0.3.
+        show (bool): Whether to display the drawn image. Default to False.
+        wait_time (float): The interval of show (s). Defaults to 0.
+        test_out_dir (str, optional): directory where painted images
+            will be saved in testing process.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(self,
+                 draw: bool = False,
+                 interval: int = 50,
+                 score_thr: float = 0.3,
+                 show: bool = False,
+                 wait_time: float = 0.,
+                 test_out_dir: Optional[str] = None,
+                 backend_args: dict = None):
+        self._visualizer: Visualizer = Visualizer.get_current_instance()
+        self.interval = interval
+        self.score_thr = score_thr
+        self.show = show
+        if self.show:
+            # No need to think about vis backends.
+            self._visualizer._vis_backends = {}
+            warnings.warn('The show is True, it means that only '
+                          'the prediction results are visualized '
+                          'without storing data, so vis_backends '
+                          'needs to be excluded.')
+
+        self.wait_time = wait_time
+        self.backend_args = backend_args
+        self.draw = draw
+        self.test_out_dir = test_out_dir
+        self._test_index = 0
+
+    def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
+                       outputs: Sequence[DetDataSample]) -> None:
+        """Run after every ``self.interval`` validation iterations.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the validation process.
+            batch_idx (int): The index of the current batch in the val loop.
+            data_batch (dict): Data from dataloader.
+            outputs (Sequence[:obj:`DetDataSample`]]): A batch of data samples
+                that contain annotations and predictions.
+        """
+        if self.draw is False:
+            return
+
+        # There is no guarantee that the same batch of images
+        # is visualized for each evaluation.
+        total_curr_iter = runner.iter + batch_idx
+
+        # Visualize only the first data
+        img_path = outputs[0].img_path
+        img_bytes = get(img_path, backend_args=self.backend_args)
+        img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
+
+        if total_curr_iter % self.interval == 0:
+            self._visualizer.add_datasample(
+                osp.basename(img_path) if self.show else 'val_img',
+                img,
+                data_sample=outputs[0],
+                show=self.show,
+                wait_time=self.wait_time,
+                pred_score_thr=self.score_thr,
+                step=total_curr_iter)
+
+    def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
+                        outputs: Sequence[DetDataSample]) -> None:
+        """Run after every testing iterations.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the testing process.
+            batch_idx (int): The index of the current batch in the val loop.
+            data_batch (dict): Data from dataloader.
+            outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples
+                that contain annotations and predictions.
+        """
+        if self.draw is False:
+            return
+
+        if self.test_out_dir is not None:
+            self.test_out_dir = osp.join(runner.work_dir, runner.timestamp,
+                                         self.test_out_dir)
+            mkdir_or_exist(self.test_out_dir)
+
+        for data_sample in outputs:
+            self._test_index += 1
+
+            img_path = data_sample.img_path
+            img_bytes = get(img_path, backend_args=self.backend_args)
+            img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
+
+            out_file = None
+            if self.test_out_dir is not None:
+                out_file = osp.basename(img_path)
+                out_file = osp.join(self.test_out_dir, out_file)
+
+            self._visualizer.add_datasample(
+                osp.basename(img_path) if self.show else 'test_img',
+                img,
+                data_sample=data_sample,
+                show=self.show,
+                wait_time=self.wait_time,
+                pred_score_thr=self.score_thr,
+                out_file=out_file,
+                step=self._test_index)
+
+
+@HOOKS.register_module()
+class TrackVisualizationHook(Hook):
+    """Tracking Visualization Hook. Used to visualize validation and testing
+    process prediction results.
+
+    In the testing phase:
+
+    1. If ``show`` is True, it means that only the prediction results are
+        visualized without storing data, so ``vis_backends`` needs to
+        be excluded.
+    2. If ``test_out_dir`` is specified, it means that the prediction results
+        need to be saved to ``test_out_dir``. In order to avoid vis_backends
+        also storing data, so ``vis_backends`` needs to be excluded.
+    3. ``vis_backends`` takes effect if the user does not specify ``show``
+        and `test_out_dir``. You can set ``vis_backends`` to WandbVisBackend or
+        TensorboardVisBackend to store the prediction result in Wandb or
+        Tensorboard.
+
+    Args:
+        draw (bool): whether to draw prediction results. If it is False,
+            it means that no drawing will be done. Defaults to False.
+        frame_interval (int): The interval of visualization. Defaults to 30.
+        score_thr (float): The threshold to visualize the bboxes
+            and masks. Defaults to 0.3.
+        show (bool): Whether to display the drawn image. Default to False.
+        wait_time (float): The interval of show (s). Defaults to 0.
+        test_out_dir (str, optional): directory where painted images
+            will be saved in testing process.
+        backend_args (dict): Arguments to instantiate a file client.
+            Defaults to ``None``.
+    """
+
+    def __init__(self,
+                 draw: bool = False,
+                 frame_interval: int = 30,
+                 score_thr: float = 0.3,
+                 show: bool = False,
+                 wait_time: float = 0.,
+                 test_out_dir: Optional[str] = None,
+                 backend_args: dict = None) -> None:
+        self._visualizer: Visualizer = Visualizer.get_current_instance()
+        self.frame_interval = frame_interval
+        self.score_thr = score_thr
+        self.show = show
+        if self.show:
+            # No need to think about vis backends.
+            self._visualizer._vis_backends = {}
+            warnings.warn('The show is True, it means that only '
+                          'the prediction results are visualized '
+                          'without storing data, so vis_backends '
+                          'needs to be excluded.')
+
+        self.wait_time = wait_time
+        self.backend_args = backend_args
+        self.draw = draw
+        self.test_out_dir = test_out_dir
+        self.image_idx = 0
+
+    def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
+                       outputs: Sequence[TrackDataSample]) -> None:
+        """Run after every ``self.interval`` validation iteration.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the validation process.
+            batch_idx (int): The index of the current batch in the val loop.
+            data_batch (dict): Data from dataloader.
+            outputs (Sequence[:obj:`TrackDataSample`]): Outputs from model.
+        """
+        if self.draw is False:
+            return
+
+        assert len(outputs) == 1, \
+            'only batch_size=1 is supported while validating.'
+
+        sampler = runner.val_dataloader.sampler
+        if isinstance(sampler, TrackImgSampler):
+            if self.every_n_inner_iters(batch_idx, self.frame_interval):
+                total_curr_iter = runner.iter + batch_idx
+                track_data_sample = outputs[0]
+                self.visualize_single_image(track_data_sample[0],
+                                            total_curr_iter)
+        else:
+            # video visualization DefaultSampler
+            if self.every_n_inner_iters(batch_idx, 1):
+                track_data_sample = outputs[0]
+                video_length = len(track_data_sample)
+
+                for frame_id in range(video_length):
+                    if frame_id % self.frame_interval == 0:
+                        total_curr_iter = runner.iter + self.image_idx + \
+                                          frame_id
+                        img_data_sample = track_data_sample[frame_id]
+                        self.visualize_single_image(img_data_sample,
+                                                    total_curr_iter)
+                self.image_idx = self.image_idx + video_length
+
+    def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
+                        outputs: Sequence[TrackDataSample]) -> None:
+        """Run after every testing iteration.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the testing process.
+            batch_idx (int): The index of the current batch in the test loop.
+            data_batch (dict): Data from dataloader.
+            outputs (Sequence[:obj:`TrackDataSample`]): Outputs from model.
+        """
+        if self.draw is False:
+            return
+
+        assert len(outputs) == 1, \
+            'only batch_size=1 is supported while testing.'
+
+        if self.test_out_dir is not None:
+            self.test_out_dir = osp.join(runner.work_dir, runner.timestamp,
+                                         self.test_out_dir)
+            mkdir_or_exist(self.test_out_dir)
+
+        sampler = runner.test_dataloader.sampler
+        if isinstance(sampler, TrackImgSampler):
+            if self.every_n_inner_iters(batch_idx, self.frame_interval):
+                track_data_sample = outputs[0]
+                self.visualize_single_image(track_data_sample[0], batch_idx)
+        else:
+            # video visualization DefaultSampler
+            if self.every_n_inner_iters(batch_idx, 1):
+                track_data_sample = outputs[0]
+                video_length = len(track_data_sample)
+
+                for frame_id in range(video_length):
+                    if frame_id % self.frame_interval == 0:
+                        img_data_sample = track_data_sample[frame_id]
+                        self.visualize_single_image(img_data_sample,
+                                                    self.image_idx + frame_id)
+                self.image_idx = self.image_idx + video_length
+
+    def visualize_single_image(self, img_data_sample: DetDataSample,
+                               step: int) -> None:
+        """
+        Args:
+            img_data_sample (DetDataSample): single image output.
+            step (int): The index of the current image.
+        """
+        img_path = img_data_sample.img_path
+        img_bytes = get(img_path, backend_args=self.backend_args)
+        img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
+
+        out_file = None
+        if self.test_out_dir is not None:
+            video_name = img_path.split('/')[-3]
+            mkdir_or_exist(osp.join(self.test_out_dir, video_name))
+            out_file = osp.join(self.test_out_dir, video_name,
+                                osp.basename(img_path))
+
+        self._visualizer.add_datasample(
+            osp.basename(img_path) if self.show else 'test_img',
+            img,
+            data_sample=img_data_sample,
+            show=self.show,
+            wait_time=self.wait_time,
+            pred_score_thr=self.score_thr,
+            out_file=out_file,
+            step=step)
+
+
+def draw_all_character(visualizer, characters, w):
+    start_index = 2
+    y_index = 5
+    for char in characters:
+        if isinstance(char, str):
+            visualizer.draw_texts(
+                str(char),
+                positions=np.array([start_index, y_index]),
+                colors=(0, 0, 0),
+                font_families='monospace')
+            start_index += len(char) * 8
+        else:
+            visualizer.draw_texts(
+                str(char[0]),
+                positions=np.array([start_index, y_index]),
+                colors=char[1],
+                font_families='monospace')
+            start_index += len(char[0]) * 8
+
+        if start_index > w - 10:
+            start_index = 2
+            y_index += 15
+
+    drawn_text = visualizer.get_image()
+    return drawn_text
+
+
+@HOOKS.register_module()
+class GroundingVisualizationHook(DetVisualizationHook):
+
+    def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
+                        outputs: Sequence[DetDataSample]) -> None:
+        """Run after every testing iterations.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the testing process.
+            batch_idx (int): The index of the current batch in the val loop.
+            data_batch (dict): Data from dataloader.
+            outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples
+                that contain annotations and predictions.
+        """
+        if self.draw is False:
+            return
+
+        if self.test_out_dir is not None:
+            self.test_out_dir = osp.join(runner.work_dir, runner.timestamp,
+                                         self.test_out_dir)
+            mkdir_or_exist(self.test_out_dir)
+
+        for data_sample in outputs:
+            data_sample = data_sample.cpu()
+
+            self._test_index += 1
+
+            img_path = data_sample.img_path
+            img_bytes = get(img_path, backend_args=self.backend_args)
+            img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
+
+            out_file = None
+            if self.test_out_dir is not None:
+                out_file = osp.basename(img_path)
+                out_file = osp.join(self.test_out_dir, out_file)
+
+            text = data_sample.text
+            if isinstance(text, str):  # VG
+                gt_instances = data_sample.gt_instances
+                tokens_positive = data_sample.tokens_positive
+                if 'phrase_ids' in data_sample:
+                    # flickr30k
+                    gt_labels = data_sample.phrase_ids
+                else:
+                    gt_labels = gt_instances.labels
+                gt_bboxes = gt_instances.get('bboxes', None)
+                if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes):
+                    gt_instances.bboxes = gt_bboxes.tensor
+                print(gt_labels, tokens_positive, gt_bboxes, img_path)
+                pred_instances = data_sample.pred_instances
+                pred_instances = pred_instances[
+                    pred_instances.scores > self.score_thr]
+                pred_labels = pred_instances.labels
+                pred_bboxes = pred_instances.bboxes
+                pred_scores = pred_instances.scores
+
+                max_label = 0
+                if len(gt_labels) > 0:
+                    max_label = max(gt_labels)
+                if len(pred_labels) > 0:
+                    max_label = max(max(pred_labels), max_label)
+
+                max_label = int(max(max_label, 0))
+                palette = np.random.randint(0, 256, size=(max_label + 1, 3))
+                bbox_palette = [tuple(c) for c in palette]
+                # bbox_palette = get_palette('random', max_label + 1)
+                if len(gt_labels) >= len(pred_labels):
+                    colors = [bbox_palette[label] for label in gt_labels]
+                else:
+                    colors = [bbox_palette[label] for label in pred_labels]
+
+                self._visualizer.set_image(img)
+
+                for label, bbox, color in zip(gt_labels, gt_bboxes, colors):
+                    self._visualizer.draw_bboxes(
+                        bbox, edge_colors=color, face_colors=color, alpha=0.3)
+                    self._visualizer.draw_bboxes(
+                        bbox, edge_colors=color, alpha=1)
+
+                drawn_img = self._visualizer.get_image()
+
+                new_image = np.ones(
+                    (100, img.shape[1], 3), dtype=np.uint8) * 255
+                self._visualizer.set_image(new_image)
+
+                if tokens_positive == -1:  # REC
+                    gt_tokens_positive = [[]]
+                else:  # Phrase Grounding
+                    gt_tokens_positive = [
+                        tokens_positive[label] for label in gt_labels
+                    ]
+                split_by_character = [char for char in text]
+                characters = []
+                start_index = 0
+                end_index = 0
+                for w in split_by_character:
+                    end_index += len(w)
+                    is_find = False
+                    for i, positive in enumerate(gt_tokens_positive):
+                        for p in positive:
+                            if start_index >= p[0] and end_index <= p[1]:
+                                characters.append([w, colors[i]])
+                                is_find = True
+                                break
+                        if is_find:
+                            break
+                    if not is_find:
+                        characters.append([w, (0, 0, 0)])
+                    start_index = end_index
+
+                drawn_text = draw_all_character(self._visualizer, characters,
+                                                img.shape[1])
+                drawn_gt_img = np.concatenate((drawn_img, drawn_text), axis=0)
+
+                self._visualizer.set_image(img)
+
+                for label, bbox, color in zip(pred_labels, pred_bboxes,
+                                              colors):
+                    self._visualizer.draw_bboxes(
+                        bbox, edge_colors=color, face_colors=color, alpha=0.3)
+                    self._visualizer.draw_bboxes(
+                        bbox, edge_colors=color, alpha=1)
+                print(pred_labels, pred_bboxes, pred_scores, colors)
+                areas = (pred_bboxes[:, 3] - pred_bboxes[:, 1]) * (
+                    pred_bboxes[:, 2] - pred_bboxes[:, 0])
+                scales = _get_adaptive_scales(areas)
+                score = [str(round(s.item(), 2)) for s in pred_scores]
+                font_sizes = [int(13 * scales[i]) for i in range(len(scales))]
+                self._visualizer.draw_texts(
+                    score,
+                    pred_bboxes[:, :2].int(),
+                    colors=(255, 255, 255),
+                    font_sizes=font_sizes,
+                    bboxes=[{
+                        'facecolor': 'black',
+                        'alpha': 0.8,
+                        'pad': 0.7,
+                        'edgecolor': 'none'
+                    }] * len(pred_bboxes))
+
+                drawn_img = self._visualizer.get_image()
+
+                new_image = np.ones(
+                    (100, img.shape[1], 3), dtype=np.uint8) * 255
+                self._visualizer.set_image(new_image)
+                drawn_text = draw_all_character(self._visualizer, characters,
+                                                img.shape[1])
+                drawn_pred_img = np.concatenate((drawn_img, drawn_text),
+                                                axis=0)
+                drawn_img = np.concatenate((drawn_gt_img, drawn_pred_img),
+                                           axis=1)
+
+                if self.show:
+                    self._visualizer.show(
+                        drawn_img,
+                        win_name=osp.basename(img_path),
+                        wait_time=self.wait_time)
+                if out_file is not None:
+                    mmcv.imwrite(drawn_img[..., ::-1], out_file)
+                else:
+                    self.add_image('test_img', drawn_img, self._test_index)
+            else:  # OD
+                self._visualizer.add_datasample(
+                    osp.basename(img_path) if self.show else 'test_img',
+                    img,
+                    data_sample=data_sample,
+                    show=self.show,
+                    wait_time=self.wait_time,
+                    pred_score_thr=self.score_thr,
+                    out_file=out_file,
+                    step=self._test_index)
diff --git a/mmde/mmdet/engine/hooks/yolox_mode_switch_hook.py b/mmde/mmdet/engine/hooks/yolox_mode_switch_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..05a2c69068bedd1c6fb3836e1fc34568e9f6bc83
--- /dev/null
+++ b/mmde/mmdet/engine/hooks/yolox_mode_switch_hook.py
@@ -0,0 +1,66 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+from mmengine.hooks import Hook
+from mmengine.model import is_model_wrapper
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class YOLOXModeSwitchHook(Hook):
+    """Switch the mode of YOLOX during training.
+
+    This hook turns off the mosaic and mixup data augmentation and switches
+    to use L1 loss in bbox_head.
+
+    Args:
+        num_last_epochs (int): The number of latter epochs in the end of the
+            training to close the data augmentation and switch to L1 loss.
+            Defaults to 15.
+       skip_type_keys (Sequence[str], optional): Sequence of type string to be
+            skip pipeline. Defaults to ('Mosaic', 'RandomAffine', 'MixUp').
+    """
+
+    def __init__(
+        self,
+        num_last_epochs: int = 15,
+        skip_type_keys: Sequence[str] = ('Mosaic', 'RandomAffine', 'MixUp')
+    ) -> None:
+        self.num_last_epochs = num_last_epochs
+        self.skip_type_keys = skip_type_keys
+        self._restart_dataloader = False
+        self._has_switched = False
+
+    def before_train_epoch(self, runner) -> None:
+        """Close mosaic and mixup augmentation and switches to use L1 loss."""
+        epoch = runner.epoch
+        train_loader = runner.train_dataloader
+        model = runner.model
+        # TODO: refactor after mmengine using model wrapper
+        if is_model_wrapper(model):
+            model = model.module
+        epoch_to_be_switched = ((epoch + 1) >=
+                                runner.max_epochs - self.num_last_epochs)
+        if epoch_to_be_switched and not self._has_switched:
+            runner.logger.info('No mosaic and mixup aug now!')
+            # The dataset pipeline cannot be updated when persistent_workers
+            # is True, so we need to force the dataloader's multi-process
+            # restart. This is a very hacky approach.
+            train_loader.dataset.update_skip_type_keys(self.skip_type_keys)
+            if hasattr(train_loader, 'persistent_workers'
+                       ) and train_loader.persistent_workers is True:
+                train_loader._DataLoader__initialized = False
+                train_loader._iterator = None
+                self._restart_dataloader = True
+            runner.logger.info('Add additional L1 loss now!')
+            if hasattr(model, 'detector'):
+                model.detector.bbox_head.use_l1 = True
+            else:
+                model.bbox_head.use_l1 = True
+            self._has_switched = True
+        else:
+            # Once the restart is complete, we need to restore
+            # the initialization flag.
+            if self._restart_dataloader:
+                train_loader._DataLoader__initialized = True
diff --git a/mmde/mmdet/engine/optimizers/__init__.py b/mmde/mmdet/engine/optimizers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..83db069ee34cad0888bbf388d3cc7030ba49bbbb
--- /dev/null
+++ b/mmde/mmdet/engine/optimizers/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .layer_decay_optimizer_constructor import \
+    LearningRateDecayOptimizerConstructor
+
+__all__ = ['LearningRateDecayOptimizerConstructor']
diff --git a/mmde/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py b/mmde/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py
new file mode 100644
index 0000000000000000000000000000000000000000..73028a0aef698d63dcba8c4935d6ef6c577d0f46
--- /dev/null
+++ b/mmde/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py
@@ -0,0 +1,158 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+from typing import List
+
+import torch.nn as nn
+from mmengine.dist import get_dist_info
+from mmengine.logging import MMLogger
+from mmengine.optim import DefaultOptimWrapperConstructor
+
+from mmdet.registry import OPTIM_WRAPPER_CONSTRUCTORS
+
+
+def get_layer_id_for_convnext(var_name, max_layer_id):
+    """Get the layer id to set the different learning rates in ``layer_wise``
+    decay_type.
+
+    Args:
+        var_name (str): The key of the model.
+        max_layer_id (int): Maximum layer id.
+
+    Returns:
+        int: The id number corresponding to different learning rate in
+        ``LearningRateDecayOptimizerConstructor``.
+    """
+
+    if var_name in ('backbone.cls_token', 'backbone.mask_token',
+                    'backbone.pos_embed'):
+        return 0
+    elif var_name.startswith('backbone.downsample_layers'):
+        stage_id = int(var_name.split('.')[2])
+        if stage_id == 0:
+            layer_id = 0
+        elif stage_id == 1:
+            layer_id = 2
+        elif stage_id == 2:
+            layer_id = 3
+        elif stage_id == 3:
+            layer_id = max_layer_id
+        return layer_id
+    elif var_name.startswith('backbone.stages'):
+        stage_id = int(var_name.split('.')[2])
+        block_id = int(var_name.split('.')[3])
+        if stage_id == 0:
+            layer_id = 1
+        elif stage_id == 1:
+            layer_id = 2
+        elif stage_id == 2:
+            layer_id = 3 + block_id // 3
+        elif stage_id == 3:
+            layer_id = max_layer_id
+        return layer_id
+    else:
+        return max_layer_id + 1
+
+
+def get_stage_id_for_convnext(var_name, max_stage_id):
+    """Get the stage id to set the different learning rates in ``stage_wise``
+    decay_type.
+
+    Args:
+        var_name (str): The key of the model.
+        max_stage_id (int): Maximum stage id.
+
+    Returns:
+        int: The id number corresponding to different learning rate in
+        ``LearningRateDecayOptimizerConstructor``.
+    """
+
+    if var_name in ('backbone.cls_token', 'backbone.mask_token',
+                    'backbone.pos_embed'):
+        return 0
+    elif var_name.startswith('backbone.downsample_layers'):
+        return 0
+    elif var_name.startswith('backbone.stages'):
+        stage_id = int(var_name.split('.')[2])
+        return stage_id + 1
+    else:
+        return max_stage_id - 1
+
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
+class LearningRateDecayOptimizerConstructor(DefaultOptimWrapperConstructor):
+    # Different learning rates are set for different layers of backbone.
+    # Note: Currently, this optimizer constructor is built for ConvNeXt.
+
+    def add_params(self, params: List[dict], module: nn.Module,
+                   **kwargs) -> None:
+        """Add all parameters of module to the params list.
+
+        The parameters of the given module will be added to the list of param
+        groups, with specific rules defined by paramwise_cfg.
+
+        Args:
+            params (list[dict]): A list of param groups, it will be modified
+                in place.
+            module (nn.Module): The module to be added.
+        """
+        logger = MMLogger.get_current_instance()
+
+        parameter_groups = {}
+        logger.info(f'self.paramwise_cfg is {self.paramwise_cfg}')
+        num_layers = self.paramwise_cfg.get('num_layers') + 2
+        decay_rate = self.paramwise_cfg.get('decay_rate')
+        decay_type = self.paramwise_cfg.get('decay_type', 'layer_wise')
+        logger.info('Build LearningRateDecayOptimizerConstructor  '
+                    f'{decay_type} {decay_rate} - {num_layers}')
+        weight_decay = self.base_wd
+        for name, param in module.named_parameters():
+            if not param.requires_grad:
+                continue  # frozen weights
+            if len(param.shape) == 1 or name.endswith('.bias') or name in (
+                    'pos_embed', 'cls_token'):
+                group_name = 'no_decay'
+                this_weight_decay = 0.
+            else:
+                group_name = 'decay'
+                this_weight_decay = weight_decay
+            if 'layer_wise' in decay_type:
+                if 'ConvNeXt' in module.backbone.__class__.__name__:
+                    layer_id = get_layer_id_for_convnext(
+                        name, self.paramwise_cfg.get('num_layers'))
+                    logger.info(f'set param {name} as id {layer_id}')
+                else:
+                    raise NotImplementedError()
+            elif decay_type == 'stage_wise':
+                if 'ConvNeXt' in module.backbone.__class__.__name__:
+                    layer_id = get_stage_id_for_convnext(name, num_layers)
+                    logger.info(f'set param {name} as id {layer_id}')
+                else:
+                    raise NotImplementedError()
+            group_name = f'layer_{layer_id}_{group_name}'
+
+            if group_name not in parameter_groups:
+                scale = decay_rate**(num_layers - layer_id - 1)
+
+                parameter_groups[group_name] = {
+                    'weight_decay': this_weight_decay,
+                    'params': [],
+                    'param_names': [],
+                    'lr_scale': scale,
+                    'group_name': group_name,
+                    'lr': scale * self.base_lr,
+                }
+
+            parameter_groups[group_name]['params'].append(param)
+            parameter_groups[group_name]['param_names'].append(name)
+        rank, _ = get_dist_info()
+        if rank == 0:
+            to_display = {}
+            for key in parameter_groups:
+                to_display[key] = {
+                    'param_names': parameter_groups[key]['param_names'],
+                    'lr_scale': parameter_groups[key]['lr_scale'],
+                    'lr': parameter_groups[key]['lr'],
+                    'weight_decay': parameter_groups[key]['weight_decay'],
+                }
+            logger.info(f'Param groups = {json.dumps(to_display, indent=2)}')
+        params.extend(parameter_groups.values())
diff --git a/mmde/mmdet/engine/runner/__init__.py b/mmde/mmdet/engine/runner/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8bcce4448e48e2d64354ba6770f9f426fb3d869
--- /dev/null
+++ b/mmde/mmdet/engine/runner/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .loops import TeacherStudentValLoop
+
+__all__ = ['TeacherStudentValLoop']
diff --git a/mmde/mmdet/engine/runner/loops.py b/mmde/mmdet/engine/runner/loops.py
new file mode 100644
index 0000000000000000000000000000000000000000..afe53afa5c80facf3ba6c224bd358e0859dade32
--- /dev/null
+++ b/mmde/mmdet/engine/runner/loops.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.model import is_model_wrapper
+from mmengine.runner import ValLoop
+
+from mmdet.registry import LOOPS
+
+
+@LOOPS.register_module()
+class TeacherStudentValLoop(ValLoop):
+    """Loop for validation of model teacher and student."""
+
+    def run(self):
+        """Launch validation for model teacher and student."""
+        self.runner.call_hook('before_val')
+        self.runner.call_hook('before_val_epoch')
+        self.runner.model.eval()
+
+        model = self.runner.model
+        if is_model_wrapper(model):
+            model = model.module
+        assert hasattr(model, 'teacher')
+        assert hasattr(model, 'student')
+
+        predict_on = model.semi_test_cfg.get('predict_on', None)
+        multi_metrics = dict()
+        for _predict_on in ['teacher', 'student']:
+            model.semi_test_cfg['predict_on'] = _predict_on
+            for idx, data_batch in enumerate(self.dataloader):
+                self.run_iter(idx, data_batch)
+            # compute metrics
+            metrics = self.evaluator.evaluate(len(self.dataloader.dataset))
+            multi_metrics.update(
+                {'/'.join((_predict_on, k)): v
+                 for k, v in metrics.items()})
+        model.semi_test_cfg['predict_on'] = predict_on
+
+        self.runner.call_hook('after_val_epoch', metrics=multi_metrics)
+        self.runner.call_hook('after_val')
diff --git a/mmde/mmdet/engine/schedulers/__init__.py b/mmde/mmdet/engine/schedulers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..01261646fa8255c643e86ba0517019760a50d387
--- /dev/null
+++ b/mmde/mmdet/engine/schedulers/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .quadratic_warmup import (QuadraticWarmupLR, QuadraticWarmupMomentum,
+                               QuadraticWarmupParamScheduler)
+
+__all__ = [
+    'QuadraticWarmupParamScheduler', 'QuadraticWarmupMomentum',
+    'QuadraticWarmupLR'
+]
diff --git a/mmde/mmdet/engine/schedulers/quadratic_warmup.py b/mmde/mmdet/engine/schedulers/quadratic_warmup.py
new file mode 100644
index 0000000000000000000000000000000000000000..639b47854887786bf3f81d6d0a375033d190d91e
--- /dev/null
+++ b/mmde/mmdet/engine/schedulers/quadratic_warmup.py
@@ -0,0 +1,131 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.scheduler.lr_scheduler import LRSchedulerMixin
+from mmengine.optim.scheduler.momentum_scheduler import MomentumSchedulerMixin
+from mmengine.optim.scheduler.param_scheduler import INF, _ParamScheduler
+from torch.optim import Optimizer
+
+from mmdet.registry import PARAM_SCHEDULERS
+
+
+@PARAM_SCHEDULERS.register_module()
+class QuadraticWarmupParamScheduler(_ParamScheduler):
+    r"""Warm up the parameter value of each parameter group by quadratic
+    formula:
+
+    .. math::
+
+        X_{t} = X_{t-1} + \frac{2t+1}{{(end-begin)}^{2}} \times X_{base}
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        param_name (str): Name of the parameter to be adjusted, such as
+            ``lr``, ``momentum``.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 optimizer: Optimizer,
+                 param_name: str,
+                 begin: int = 0,
+                 end: int = INF,
+                 last_step: int = -1,
+                 by_epoch: bool = True,
+                 verbose: bool = False):
+        if end >= INF:
+            raise ValueError('``end`` must be less than infinity,'
+                             'Please set ``end`` parameter of '
+                             '``QuadraticWarmupScheduler`` as the '
+                             'number of warmup end.')
+        self.total_iters = end - begin
+        super().__init__(
+            optimizer=optimizer,
+            param_name=param_name,
+            begin=begin,
+            end=end,
+            last_step=last_step,
+            by_epoch=by_epoch,
+            verbose=verbose)
+
+    @classmethod
+    def build_iter_from_epoch(cls,
+                              *args,
+                              begin=0,
+                              end=INF,
+                              by_epoch=True,
+                              epoch_length=None,
+                              **kwargs):
+        """Build an iter-based instance of this scheduler from an epoch-based
+        config."""
+        assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \
+                         'be converted to iter-based.'
+        assert epoch_length is not None and epoch_length > 0, \
+            f'`epoch_length` must be a positive integer, ' \
+            f'but got {epoch_length}.'
+        by_epoch = False
+        begin = begin * epoch_length
+        if end != INF:
+            end = end * epoch_length
+        return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs)
+
+    def _get_value(self):
+        """Compute value using chainable form of the scheduler."""
+        if self.last_step == 0:
+            return [
+                base_value * (2 * self.last_step + 1) / self.total_iters**2
+                for base_value in self.base_values
+            ]
+
+        return [
+            group[self.param_name] + base_value *
+            (2 * self.last_step + 1) / self.total_iters**2
+            for base_value, group in zip(self.base_values,
+                                         self.optimizer.param_groups)
+        ]
+
+
+@PARAM_SCHEDULERS.register_module()
+class QuadraticWarmupLR(LRSchedulerMixin, QuadraticWarmupParamScheduler):
+    """Warm up the learning rate of each parameter group by quadratic formula.
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class QuadraticWarmupMomentum(MomentumSchedulerMixin,
+                              QuadraticWarmupParamScheduler):
+    """Warm up the momentum value of each parameter group by quadratic formula.
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
diff --git a/mmde/mmdet/evaluation/__init__.py b/mmde/mmdet/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..126dea092eb1a4affab9fbe3fb043f5b373607ee
--- /dev/null
+++ b/mmde/mmdet/evaluation/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .evaluator import *  # noqa: F401,F403
+from .functional import *  # noqa: F401,F403
+from .metrics import *  # noqa: F401,F403
diff --git a/mmde/mmdet/evaluation/evaluator/__init__.py b/mmde/mmdet/evaluation/evaluator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b13fe99548e7e2e4c6e196a2da22b9c8cbec8a3
--- /dev/null
+++ b/mmde/mmdet/evaluation/evaluator/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .multi_datasets_evaluator import MultiDatasetsEvaluator
+
+__all__ = ['MultiDatasetsEvaluator']
diff --git a/mmde/mmdet/evaluation/evaluator/multi_datasets_evaluator.py b/mmde/mmdet/evaluation/evaluator/multi_datasets_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cff1cf210e644e11b348f3aa757119ac579170d
--- /dev/null
+++ b/mmde/mmdet/evaluation/evaluator/multi_datasets_evaluator.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from collections import OrderedDict
+from typing import Sequence, Union
+
+from mmengine.dist import (broadcast_object_list, collect_results,
+                           is_main_process)
+from mmengine.evaluator import BaseMetric, Evaluator
+from mmengine.evaluator.metric import _to_cpu
+from mmengine.registry import EVALUATOR
+
+from mmdet.utils import ConfigType
+
+
+@EVALUATOR.register_module()
+class MultiDatasetsEvaluator(Evaluator):
+    """Wrapper class to compose class: `ConcatDataset` and multiple
+    :class:`BaseMetric` instances.
+    The metrics will be evaluated on each dataset slice separately. The name of
+    the each metric is the concatenation of the dataset prefix, the metric
+    prefix and the key of metric - e.g.
+    `dataset_prefix/metric_prefix/accuracy`.
+
+    Args:
+        metrics (dict or BaseMetric or Sequence): The config of metrics.
+        dataset_prefixes (Sequence[str]): The prefix of each dataset. The
+            length of this sequence should be the same as the length of the
+            datasets.
+    """
+
+    def __init__(self, metrics: Union[ConfigType, BaseMetric, Sequence],
+                 dataset_prefixes: Sequence[str]) -> None:
+        super().__init__(metrics)
+        self.dataset_prefixes = dataset_prefixes
+        self._setups = False
+
+    def _get_cumulative_sizes(self):
+        # ConcatDataset have a property `cumulative_sizes`
+        if isinstance(self.dataset_meta, Sequence):
+            dataset_slices = self.dataset_meta[0]['cumulative_sizes']
+            if not self._setups:
+                self._setups = True
+                for dataset_meta, metric in zip(self.dataset_meta,
+                                                self.metrics):
+                    metric.dataset_meta = dataset_meta
+        else:
+            dataset_slices = self.dataset_meta['cumulative_sizes']
+        return dataset_slices
+
+    def evaluate(self, size: int) -> dict:
+        """Invoke ``evaluate`` method of each metric and collect the metrics
+        dictionary.
+
+        Args:
+            size (int): Length of the entire validation dataset. When batch
+                size > 1, the dataloader may pad some data samples to make
+                sure all ranks have the same length of dataset slice. The
+                ``collect_results`` function will drop the padded data based on
+                this size.
+
+        Returns:
+            dict: Evaluation results of all metrics. The keys are the names
+            of the metrics, and the values are corresponding results.
+        """
+        metrics_results = OrderedDict()
+        dataset_slices = self._get_cumulative_sizes()
+        assert len(dataset_slices) == len(self.dataset_prefixes)
+
+        for dataset_prefix, start, end, metric in zip(
+                self.dataset_prefixes, [0] + dataset_slices[:-1],
+                dataset_slices, self.metrics):
+            if len(metric.results) == 0:
+                warnings.warn(
+                    f'{metric.__class__.__name__} got empty `self.results`.'
+                    'Please ensure that the processed results are properly '
+                    'added into `self.results` in `process` method.')
+
+            results = collect_results(metric.results, size,
+                                      metric.collect_device)
+
+            if is_main_process():
+                # cast all tensors in results list to cpu
+                results = _to_cpu(results)
+                _metrics = metric.compute_metrics(
+                    results[start:end])  # type: ignore
+
+                if metric.prefix:
+                    final_prefix = '/'.join((dataset_prefix, metric.prefix))
+                else:
+                    final_prefix = dataset_prefix
+                print(f'================{final_prefix}================')
+                metric_results = {
+                    '/'.join((final_prefix, k)): v
+                    for k, v in _metrics.items()
+                }
+
+                # Check metric name conflicts
+                for name in metric_results.keys():
+                    if name in metrics_results:
+                        raise ValueError(
+                            'There are multiple evaluation results with '
+                            f'the same metric name {name}. Please make '
+                            'sure all metrics have different prefixes.')
+                metrics_results.update(metric_results)
+            metric.results.clear()
+        if is_main_process():
+            metrics_results = [metrics_results]
+        else:
+            metrics_results = [None]  # type: ignore
+        broadcast_object_list(metrics_results)
+        return metrics_results[0]
diff --git a/mmde/mmdet/evaluation/functional/__init__.py b/mmde/mmdet/evaluation/functional/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..96d58ebd3ab0dd714a6f361622a7faf2a09486cb
--- /dev/null
+++ b/mmde/mmdet/evaluation/functional/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .bbox_overlaps import bbox_overlaps
+from .cityscapes_utils import evaluateImgLists
+from .class_names import (cityscapes_classes, coco_classes,
+                          coco_panoptic_classes, dataset_aliases, get_classes,
+                          imagenet_det_classes, imagenet_vid_classes,
+                          objects365v1_classes, objects365v2_classes,
+                          oid_challenge_classes, oid_v6_classes, voc_classes)
+from .mean_ap import average_precision, eval_map, print_map_summary
+from .panoptic_utils import (INSTANCE_OFFSET, pq_compute_multi_core,
+                             pq_compute_single_core)
+from .recall import (eval_recalls, plot_iou_recall, plot_num_recall,
+                     print_recall_summary)
+from .ytvis import YTVIS
+from .ytviseval import YTVISeval
+
+__all__ = [
+    'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes',
+    'coco_classes', 'cityscapes_classes', 'dataset_aliases', 'get_classes',
+    'average_precision', 'eval_map', 'print_map_summary', 'eval_recalls',
+    'print_recall_summary', 'plot_num_recall', 'plot_iou_recall',
+    'oid_v6_classes', 'oid_challenge_classes', 'INSTANCE_OFFSET',
+    'pq_compute_single_core', 'pq_compute_multi_core', 'bbox_overlaps',
+    'objects365v1_classes', 'objects365v2_classes', 'coco_panoptic_classes',
+    'evaluateImgLists', 'YTVIS', 'YTVISeval'
+]
diff --git a/mmde/mmdet/evaluation/functional/bbox_overlaps.py b/mmde/mmdet/evaluation/functional/bbox_overlaps.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d6eb82fcfc8d5444dd2a13b7d95b978f8206a55
--- /dev/null
+++ b/mmde/mmdet/evaluation/functional/bbox_overlaps.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+
+def bbox_overlaps(bboxes1,
+                  bboxes2,
+                  mode='iou',
+                  eps=1e-6,
+                  use_legacy_coordinate=False):
+    """Calculate the ious between each bbox of bboxes1 and bboxes2.
+
+    Args:
+        bboxes1 (ndarray): Shape (n, 4)
+        bboxes2 (ndarray): Shape (k, 4)
+        mode (str): IOU (intersection over union) or IOF (intersection
+            over foreground)
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+            Note when function is used in `VOCDataset`, it should be
+            True to align with the official implementation
+            `http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar`
+            Default: False.
+
+    Returns:
+        ious (ndarray): Shape (n, k)
+    """
+
+    assert mode in ['iou', 'iof']
+    if not use_legacy_coordinate:
+        extra_length = 0.
+    else:
+        extra_length = 1.
+    bboxes1 = bboxes1.astype(np.float32)
+    bboxes2 = bboxes2.astype(np.float32)
+    rows = bboxes1.shape[0]
+    cols = bboxes2.shape[0]
+    ious = np.zeros((rows, cols), dtype=np.float32)
+    if rows * cols == 0:
+        return ious
+    exchange = False
+    if bboxes1.shape[0] > bboxes2.shape[0]:
+        bboxes1, bboxes2 = bboxes2, bboxes1
+        ious = np.zeros((cols, rows), dtype=np.float32)
+        exchange = True
+    area1 = (bboxes1[:, 2] - bboxes1[:, 0] + extra_length) * (
+        bboxes1[:, 3] - bboxes1[:, 1] + extra_length)
+    area2 = (bboxes2[:, 2] - bboxes2[:, 0] + extra_length) * (
+        bboxes2[:, 3] - bboxes2[:, 1] + extra_length)
+    for i in range(bboxes1.shape[0]):
+        x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
+        y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
+        x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
+        y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
+        overlap = np.maximum(x_end - x_start + extra_length, 0) * np.maximum(
+            y_end - y_start + extra_length, 0)
+        if mode == 'iou':
+            union = area1[i] + area2 - overlap
+        else:
+            union = area1[i] if not exchange else area2
+        union = np.maximum(union, eps)
+        ious[i, :] = overlap / union
+    if exchange:
+        ious = ious.T
+    return ious
diff --git a/mmde/mmdet/evaluation/functional/cityscapes_utils.py b/mmde/mmdet/evaluation/functional/cityscapes_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ced3680deefe333af7cca3675a6359c02dd96f8
--- /dev/null
+++ b/mmde/mmdet/evaluation/functional/cityscapes_utils.py
@@ -0,0 +1,302 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) https://github.com/mcordts/cityscapesScripts
+# A wrapper of `cityscapesscripts` which supports loading groundtruth
+# image from `backend_args`.
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Optional, Union
+
+import mmcv
+import numpy as np
+from mmengine.fileio import get
+
+try:
+    import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval  # noqa: E501
+    from cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling import \
+        CArgs  # noqa: E501
+    from cityscapesscripts.evaluation.instance import Instance
+    from cityscapesscripts.helpers.csHelpers import (id2label, labels,
+                                                     writeDict2JSON)
+    HAS_CITYSCAPESAPI = True
+except ImportError:
+    CArgs = object
+    HAS_CITYSCAPESAPI = False
+
+
+def evaluateImgLists(prediction_list: list,
+                     groundtruth_list: list,
+                     args: CArgs,
+                     backend_args: Optional[dict] = None,
+                     dump_matches: bool = False) -> dict:
+    """A wrapper of obj:``cityscapesscripts.evaluation.
+
+    evalInstanceLevelSemanticLabeling.evaluateImgLists``. Support loading
+    groundtruth image from file backend.
+    Args:
+        prediction_list (list): A list of prediction txt file.
+        groundtruth_list (list): A list of groundtruth image file.
+        args (CArgs): A global object setting in
+            obj:``cityscapesscripts.evaluation.
+            evalInstanceLevelSemanticLabeling``
+        backend_args (dict, optional): Arguments to instantiate the
+            preifx of uri corresponding backend. Defaults to None.
+        dump_matches (bool): whether dump matches.json. Defaults to False.
+    Returns:
+        dict: The computed metric.
+    """
+    if not HAS_CITYSCAPESAPI:
+        raise RuntimeError('Failed to import `cityscapesscripts`.'
+                           'Please try to install official '
+                           'cityscapesscripts by '
+                           '"pip install cityscapesscripts"')
+    # determine labels of interest
+    CSEval.setInstanceLabels(args)
+    # get dictionary of all ground truth instances
+    gt_instances = getGtInstances(
+        groundtruth_list, args, backend_args=backend_args)
+    # match predictions and ground truth
+    matches = matchGtWithPreds(prediction_list, groundtruth_list, gt_instances,
+                               args, backend_args)
+    if dump_matches:
+        CSEval.writeDict2JSON(matches, 'matches.json')
+    # evaluate matches
+    apScores = CSEval.evaluateMatches(matches, args)
+    # averages
+    avgDict = CSEval.computeAverages(apScores, args)
+    # result dict
+    resDict = CSEval.prepareJSONDataForResults(avgDict, apScores, args)
+    if args.JSONOutput:
+        # create output folder if necessary
+        path = os.path.dirname(args.exportFile)
+        CSEval.ensurePath(path)
+        # Write APs to JSON
+        CSEval.writeDict2JSON(resDict, args.exportFile)
+
+    CSEval.printResults(avgDict, args)
+
+    return resDict
+
+
+def matchGtWithPreds(prediction_list: list,
+                     groundtruth_list: list,
+                     gt_instances: dict,
+                     args: CArgs,
+                     backend_args=None):
+    """A wrapper of obj:``cityscapesscripts.evaluation.
+
+    evalInstanceLevelSemanticLabeling.matchGtWithPreds``. Support loading
+    groundtruth image from file backend.
+    Args:
+        prediction_list (list): A list of prediction txt file.
+        groundtruth_list (list): A list of groundtruth image file.
+        gt_instances (dict): Groundtruth dict.
+        args (CArgs): A global object setting in
+            obj:``cityscapesscripts.evaluation.
+            evalInstanceLevelSemanticLabeling``
+        backend_args (dict, optional): Arguments to instantiate the
+            preifx of uri corresponding backend. Defaults to None.
+    Returns:
+        dict: The processed prediction and groundtruth result.
+    """
+    if not HAS_CITYSCAPESAPI:
+        raise RuntimeError('Failed to import `cityscapesscripts`.'
+                           'Please try to install official '
+                           'cityscapesscripts by '
+                           '"pip install cityscapesscripts"')
+    matches: dict = dict()
+    if not args.quiet:
+        print(f'Matching {len(prediction_list)} pairs of images...')
+
+    count = 0
+    for (pred, gt) in zip(prediction_list, groundtruth_list):
+        # Read input files
+        gt_image = readGTImage(gt, backend_args)
+        pred_info = readPredInfo(pred)
+        # Get and filter ground truth instances
+        unfiltered_instances = gt_instances[gt]
+        cur_gt_instances_orig = CSEval.filterGtInstances(
+            unfiltered_instances, args)
+
+        # Try to assign all predictions
+        (cur_gt_instances,
+         cur_pred_instances) = CSEval.assignGt2Preds(cur_gt_instances_orig,
+                                                     gt_image, pred_info, args)
+
+        # append to global dict
+        matches[gt] = {}
+        matches[gt]['groundTruth'] = cur_gt_instances
+        matches[gt]['prediction'] = cur_pred_instances
+
+        count += 1
+        if not args.quiet:
+            print(f'\rImages Processed: {count}', end=' ')
+            sys.stdout.flush()
+
+    if not args.quiet:
+        print('')
+
+    return matches
+
+
+def readGTImage(image_file: Union[str, Path],
+                backend_args: Optional[dict] = None) -> np.ndarray:
+    """Read an image from path.
+
+    Same as obj:``cityscapesscripts.evaluation.
+    evalInstanceLevelSemanticLabeling.readGTImage``, but support loading
+    groundtruth image from file backend.
+    Args:
+        image_file (str or Path): Either a str or pathlib.Path.
+        backend_args (dict, optional): Instantiates the corresponding file
+            backend. It may contain `backend` key to specify the file
+            backend. If it contains, the file backend corresponding to this
+            value will be used and initialized with the remaining values,
+            otherwise the corresponding file backend will be selected
+            based on the prefix of the file path. Defaults to None.
+    Returns:
+        np.ndarray: The groundtruth image.
+    """
+    img_bytes = get(image_file, backend_args=backend_args)
+    img = mmcv.imfrombytes(img_bytes, flag='unchanged', backend='pillow')
+    return img
+
+
+def readPredInfo(prediction_file: str) -> dict:
+    """A wrapper of obj:``cityscapesscripts.evaluation.
+
+    evalInstanceLevelSemanticLabeling.readPredInfo``.
+    Args:
+        prediction_file (str): The prediction txt file.
+    Returns:
+        dict: The processed prediction results.
+    """
+    if not HAS_CITYSCAPESAPI:
+        raise RuntimeError('Failed to import `cityscapesscripts`.'
+                           'Please try to install official '
+                           'cityscapesscripts by '
+                           '"pip install cityscapesscripts"')
+    printError = CSEval.printError
+
+    predInfo = {}
+    if (not os.path.isfile(prediction_file)):
+        printError(f"Infofile '{prediction_file}' "
+                   'for the predictions not found.')
+    with open(prediction_file) as f:
+        for line in f:
+            splittedLine = line.split(' ')
+            if len(splittedLine) != 3:
+                printError('Invalid prediction file. Expected content: '
+                           'relPathPrediction1 labelIDPrediction1 '
+                           'confidencePrediction1')
+            if os.path.isabs(splittedLine[0]):
+                printError('Invalid prediction file. First entry in each '
+                           'line must be a relative path.')
+
+            filename = os.path.join(
+                os.path.dirname(prediction_file), splittedLine[0])
+
+            imageInfo = {}
+            imageInfo['labelID'] = int(float(splittedLine[1]))
+            imageInfo['conf'] = float(splittedLine[2])  # type: ignore
+            predInfo[filename] = imageInfo
+
+    return predInfo
+
+
+def getGtInstances(groundtruth_list: list,
+                   args: CArgs,
+                   backend_args: Optional[dict] = None) -> dict:
+    """A wrapper of obj:``cityscapesscripts.evaluation.
+
+    evalInstanceLevelSemanticLabeling.getGtInstances``. Support loading
+    groundtruth image from file backend.
+    Args:
+        groundtruth_list (list): A list of groundtruth image file.
+        args (CArgs): A global object setting in
+            obj:``cityscapesscripts.evaluation.
+            evalInstanceLevelSemanticLabeling``
+        backend_args (dict, optional): Arguments to instantiate the
+            preifx of uri corresponding backend. Defaults to None.
+    Returns:
+        dict: The computed metric.
+    """
+    if not HAS_CITYSCAPESAPI:
+        raise RuntimeError('Failed to import `cityscapesscripts`.'
+                           'Please try to install official '
+                           'cityscapesscripts by '
+                           '"pip install cityscapesscripts"')
+    # if there is a global statistics json, then load it
+    if (os.path.isfile(args.gtInstancesFile)):
+        if not args.quiet:
+            print('Loading ground truth instances from JSON.')
+        with open(args.gtInstancesFile) as json_file:
+            gt_instances = json.load(json_file)
+    # otherwise create it
+    else:
+        if (not args.quiet):
+            print('Creating ground truth instances from png files.')
+        gt_instances = instances2dict(
+            groundtruth_list, args, backend_args=backend_args)
+        writeDict2JSON(gt_instances, args.gtInstancesFile)
+
+    return gt_instances
+
+
+def instances2dict(image_list: list,
+                   args: CArgs,
+                   backend_args: Optional[dict] = None) -> dict:
+    """A wrapper of obj:``cityscapesscripts.evaluation.
+
+    evalInstanceLevelSemanticLabeling.instances2dict``. Support loading
+    groundtruth image from file backend.
+    Args:
+        image_list (list): A list of image file.
+        args (CArgs): A global object setting in
+            obj:``cityscapesscripts.evaluation.
+            evalInstanceLevelSemanticLabeling``
+        backend_args (dict, optional): Arguments to instantiate the
+            preifx of uri corresponding backend. Defaults to None.
+    Returns:
+        dict: The processed groundtruth results.
+    """
+    if not HAS_CITYSCAPESAPI:
+        raise RuntimeError('Failed to import `cityscapesscripts`.'
+                           'Please try to install official '
+                           'cityscapesscripts by '
+                           '"pip install cityscapesscripts"')
+    imgCount = 0
+    instanceDict = {}
+
+    if not isinstance(image_list, list):
+        image_list = [image_list]
+
+    if not args.quiet:
+        print(f'Processing {len(image_list)} images...')
+
+    for image_name in image_list:
+        # Load image
+        img_bytes = get(image_name, backend_args=backend_args)
+        imgNp = mmcv.imfrombytes(img_bytes, flag='unchanged', backend='pillow')
+
+        # Initialize label categories
+        instances: dict = {}
+        for label in labels:
+            instances[label.name] = []
+
+        # Loop through all instance ids in instance image
+        for instanceId in np.unique(imgNp):
+            instanceObj = Instance(imgNp, instanceId)
+
+            instances[id2label[instanceObj.labelID].name].append(
+                instanceObj.toDict())
+
+        instanceDict[image_name] = instances
+        imgCount += 1
+
+        if not args.quiet:
+            print(f'\rImages Processed: {imgCount}', end=' ')
+            sys.stdout.flush()
+
+    return instanceDict
diff --git a/mmde/mmdet/evaluation/functional/class_names.py b/mmde/mmdet/evaluation/functional/class_names.py
new file mode 100644
index 0000000000000000000000000000000000000000..623a89cfdc06ab04831afd3423d5f725acc881f0
--- /dev/null
+++ b/mmde/mmdet/evaluation/functional/class_names.py
@@ -0,0 +1,762 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.utils import is_str
+
+
+def wider_face_classes() -> list:
+    """Class names of WIDERFace."""
+    return ['face']
+
+
+def voc_classes() -> list:
+    """Class names of PASCAL VOC."""
+    return [
+        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
+        'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
+        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
+    ]
+
+
+def imagenet_det_classes() -> list:
+    """Class names of ImageNet Det."""
+    return [
+        'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo',
+        'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam',
+        'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap',
+        'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder',
+        'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito',
+        'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle',
+        'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker',
+        'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew',
+        'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper',
+        'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly',
+        'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig',
+        'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog',
+        'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart',
+        'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger',
+        'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim',
+        'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse',
+        'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle',
+        'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard',
+        'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can',
+        'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace',
+        'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume',
+        'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza',
+        'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine',
+        'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse',
+        'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator',
+        'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler',
+        'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver',
+        'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile',
+        'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula',
+        'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer',
+        'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine',
+        'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie',
+        'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet',
+        'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin',
+        'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft',
+        'whale', 'wine_bottle', 'zebra'
+    ]
+
+
+def imagenet_vid_classes() -> list:
+    """Class names of ImageNet VID."""
+    return [
+        'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car',
+        'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda',
+        'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit',
+        'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle',
+        'watercraft', 'whale', 'zebra'
+    ]
+
+
+def coco_classes() -> list:
+    """Class names of COCO."""
+    return [
+        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+        'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign',
+        'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+        'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+        'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard',
+        'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork',
+        'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+        'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair',
+        'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv',
+        'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave',
+        'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+        'scissors', 'teddy_bear', 'hair_drier', 'toothbrush'
+    ]
+
+
+def coco_panoptic_classes() -> list:
+    """Class names of COCO panoptic."""
+    return [
+        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+        'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+        'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+        'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+        'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
+        'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
+        'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+        'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+        'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+        'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+        'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner',
+        'blanket', 'bridge', 'cardboard', 'counter', 'curtain', 'door-stuff',
+        'floor-wood', 'flower', 'fruit', 'gravel', 'house', 'light',
+        'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield',
+        'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow',
+        'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile',
+        'wall-wood', 'water-other', 'window-blind', 'window-other',
+        'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+        'cabinet-merged', 'table-merged', 'floor-other-merged',
+        'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged',
+        'paper-merged', 'food-other-merged', 'building-other-merged',
+        'rock-merged', 'wall-other-merged', 'rug-merged'
+    ]
+
+
+def cityscapes_classes() -> list:
+    """Class names of Cityscapes."""
+    return [
+        'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
+        'bicycle'
+    ]
+
+
+def oid_challenge_classes() -> list:
+    """Class names of Open Images Challenge."""
+    return [
+        'Footwear', 'Jeans', 'House', 'Tree', 'Woman', 'Man', 'Land vehicle',
+        'Person', 'Wheel', 'Bus', 'Human face', 'Bird', 'Dress', 'Girl',
+        'Vehicle', 'Building', 'Cat', 'Car', 'Belt', 'Elephant', 'Dessert',
+        'Butterfly', 'Train', 'Guitar', 'Poster', 'Book', 'Boy', 'Bee',
+        'Flower', 'Window', 'Hat', 'Human head', 'Dog', 'Human arm', 'Drink',
+        'Human mouth', 'Human hair', 'Human nose', 'Human hand', 'Table',
+        'Marine invertebrates', 'Fish', 'Sculpture', 'Rose', 'Street light',
+        'Glasses', 'Fountain', 'Skyscraper', 'Swimwear', 'Brassiere', 'Drum',
+        'Duck', 'Countertop', 'Furniture', 'Ball', 'Human leg', 'Boat',
+        'Balloon', 'Bicycle helmet', 'Goggles', 'Door', 'Human eye', 'Shirt',
+        'Toy', 'Teddy bear', 'Pasta', 'Tomato', 'Human ear',
+        'Vehicle registration plate', 'Microphone', 'Musical keyboard',
+        'Tower', 'Houseplant', 'Flowerpot', 'Fruit', 'Vegetable',
+        'Musical instrument', 'Suit', 'Motorcycle', 'Bagel', 'French fries',
+        'Hamburger', 'Chair', 'Salt and pepper shakers', 'Snail', 'Airplane',
+        'Horse', 'Laptop', 'Computer keyboard', 'Football helmet', 'Cocktail',
+        'Juice', 'Tie', 'Computer monitor', 'Human beard', 'Bottle',
+        'Saxophone', 'Lemon', 'Mouse', 'Sock', 'Cowboy hat', 'Sun hat',
+        'Football', 'Porch', 'Sunglasses', 'Lobster', 'Crab', 'Picture frame',
+        'Van', 'Crocodile', 'Surfboard', 'Shorts', 'Helicopter', 'Helmet',
+        'Sports uniform', 'Taxi', 'Swan', 'Goose', 'Coat', 'Jacket', 'Handbag',
+        'Flag', 'Skateboard', 'Television', 'Tire', 'Spoon', 'Palm tree',
+        'Stairs', 'Salad', 'Castle', 'Oven', 'Microwave oven', 'Wine',
+        'Ceiling fan', 'Mechanical fan', 'Cattle', 'Truck', 'Box', 'Ambulance',
+        'Desk', 'Wine glass', 'Reptile', 'Tank', 'Traffic light', 'Billboard',
+        'Tent', 'Insect', 'Spider', 'Treadmill', 'Cupboard', 'Shelf',
+        'Seat belt', 'Human foot', 'Bicycle', 'Bicycle wheel', 'Couch',
+        'Bookcase', 'Fedora', 'Backpack', 'Bench', 'Oyster',
+        'Moths and butterflies', 'Lavender', 'Waffle', 'Fork', 'Animal',
+        'Accordion', 'Mobile phone', 'Plate', 'Coffee cup', 'Saucer',
+        'Platter', 'Dagger', 'Knife', 'Bull', 'Tortoise', 'Sea turtle', 'Deer',
+        'Weapon', 'Apple', 'Ski', 'Taco', 'Traffic sign', 'Beer', 'Necklace',
+        'Sunflower', 'Piano', 'Organ', 'Harpsichord', 'Bed', 'Cabinetry',
+        'Nightstand', 'Curtain', 'Chest of drawers', 'Drawer', 'Parrot',
+        'Sandal', 'High heels', 'Tableware', 'Cart', 'Mushroom', 'Kite',
+        'Missile', 'Seafood', 'Camera', 'Paper towel', 'Toilet paper',
+        'Sombrero', 'Radish', 'Lighthouse', 'Segway', 'Pig', 'Watercraft',
+        'Golf cart', 'studio couch', 'Dolphin', 'Whale', 'Earrings', 'Otter',
+        'Sea lion', 'Whiteboard', 'Monkey', 'Gondola', 'Zebra',
+        'Baseball glove', 'Scarf', 'Adhesive tape', 'Trousers', 'Scoreboard',
+        'Lily', 'Carnivore', 'Power plugs and sockets', 'Office building',
+        'Sandwich', 'Swimming pool', 'Headphones', 'Tin can', 'Crown', 'Doll',
+        'Cake', 'Frog', 'Beetle', 'Ant', 'Gas stove', 'Canoe', 'Falcon',
+        'Blue jay', 'Egg', 'Fire hydrant', 'Raccoon', 'Muffin', 'Wall clock',
+        'Coffee', 'Mug', 'Tea', 'Bear', 'Waste container', 'Home appliance',
+        'Candle', 'Lion', 'Mirror', 'Starfish', 'Marine mammal', 'Wheelchair',
+        'Umbrella', 'Alpaca', 'Violin', 'Cello', 'Brown bear', 'Canary', 'Bat',
+        'Ruler', 'Plastic bag', 'Penguin', 'Watermelon', 'Harbor seal', 'Pen',
+        'Pumpkin', 'Harp', 'Kitchen appliance', 'Roller skates', 'Bust',
+        'Coffee table', 'Tennis ball', 'Tennis racket', 'Ladder', 'Boot',
+        'Bowl', 'Stop sign', 'Volleyball', 'Eagle', 'Paddle', 'Chicken',
+        'Skull', 'Lamp', 'Beehive', 'Maple', 'Sink', 'Goldfish', 'Tripod',
+        'Coconut', 'Bidet', 'Tap', 'Bathroom cabinet', 'Toilet',
+        'Filing cabinet', 'Pretzel', 'Table tennis racket', 'Bronze sculpture',
+        'Rocket', 'Mouse', 'Hamster', 'Lizard', 'Lifejacket', 'Goat',
+        'Washing machine', 'Trumpet', 'Horn', 'Trombone', 'Sheep',
+        'Tablet computer', 'Pillow', 'Kitchen & dining room table',
+        'Parachute', 'Raven', 'Glove', 'Loveseat', 'Christmas tree',
+        'Shellfish', 'Rifle', 'Shotgun', 'Sushi', 'Sparrow', 'Bread',
+        'Toaster', 'Watch', 'Asparagus', 'Artichoke', 'Suitcase', 'Antelope',
+        'Broccoli', 'Ice cream', 'Racket', 'Banana', 'Cookie', 'Cucumber',
+        'Dragonfly', 'Lynx', 'Caterpillar', 'Light bulb', 'Office supplies',
+        'Miniskirt', 'Skirt', 'Fireplace', 'Potato', 'Light switch',
+        'Croissant', 'Cabbage', 'Ladybug', 'Handgun', 'Luggage and bags',
+        'Window blind', 'Snowboard', 'Baseball bat', 'Digital clock',
+        'Serving tray', 'Infant bed', 'Sofa bed', 'Guacamole', 'Fox', 'Pizza',
+        'Snowplow', 'Jet ski', 'Refrigerator', 'Lantern', 'Convenience store',
+        'Sword', 'Rugby ball', 'Owl', 'Ostrich', 'Pancake', 'Strawberry',
+        'Carrot', 'Tart', 'Dice', 'Turkey', 'Rabbit', 'Invertebrate', 'Vase',
+        'Stool', 'Swim cap', 'Shower', 'Clock', 'Jellyfish', 'Aircraft',
+        'Chopsticks', 'Orange', 'Snake', 'Sewing machine', 'Kangaroo', 'Mixer',
+        'Food processor', 'Shrimp', 'Towel', 'Porcupine', 'Jaguar', 'Cannon',
+        'Limousine', 'Mule', 'Squirrel', 'Kitchen knife', 'Tiara', 'Tiger',
+        'Bow and arrow', 'Candy', 'Rhinoceros', 'Shark', 'Cricket ball',
+        'Doughnut', 'Plumbing fixture', 'Camel', 'Polar bear', 'Coin',
+        'Printer', 'Blender', 'Giraffe', 'Billiard table', 'Kettle',
+        'Dinosaur', 'Pineapple', 'Zucchini', 'Jug', 'Barge', 'Teapot',
+        'Golf ball', 'Binoculars', 'Scissors', 'Hot dog', 'Door handle',
+        'Seahorse', 'Bathtub', 'Leopard', 'Centipede', 'Grapefruit', 'Snowman',
+        'Cheetah', 'Alarm clock', 'Grape', 'Wrench', 'Wok', 'Bell pepper',
+        'Cake stand', 'Barrel', 'Woodpecker', 'Flute', 'Corded phone',
+        'Willow', 'Punching bag', 'Pomegranate', 'Telephone', 'Pear',
+        'Common fig', 'Bench', 'Wood-burning stove', 'Burrito', 'Nail',
+        'Turtle', 'Submarine sandwich', 'Drinking straw', 'Peach', 'Popcorn',
+        'Frying pan', 'Picnic basket', 'Honeycomb', 'Envelope', 'Mango',
+        'Cutting board', 'Pitcher', 'Stationary bicycle', 'Dumbbell',
+        'Personal care', 'Dog bed', 'Snowmobile', 'Oboe', 'Briefcase',
+        'Squash', 'Tick', 'Slow cooker', 'Coffeemaker', 'Measuring cup',
+        'Crutch', 'Stretcher', 'Screwdriver', 'Flashlight', 'Spatula',
+        'Pressure cooker', 'Ring binder', 'Beaker', 'Torch', 'Winter melon'
+    ]
+
+
+def oid_v6_classes() -> list:
+    """Class names of Open Images V6."""
+    return [
+        'Tortoise', 'Container', 'Magpie', 'Sea turtle', 'Football',
+        'Ambulance', 'Ladder', 'Toothbrush', 'Syringe', 'Sink', 'Toy',
+        'Organ (Musical Instrument)', 'Cassette deck', 'Apple', 'Human eye',
+        'Cosmetics', 'Paddle', 'Snowman', 'Beer', 'Chopsticks', 'Human beard',
+        'Bird', 'Parking meter', 'Traffic light', 'Croissant', 'Cucumber',
+        'Radish', 'Towel', 'Doll', 'Skull', 'Washing machine', 'Glove', 'Tick',
+        'Belt', 'Sunglasses', 'Banjo', 'Cart', 'Ball', 'Backpack', 'Bicycle',
+        'Home appliance', 'Centipede', 'Boat', 'Surfboard', 'Boot',
+        'Headphones', 'Hot dog', 'Shorts', 'Fast food', 'Bus', 'Boy',
+        'Screwdriver', 'Bicycle wheel', 'Barge', 'Laptop', 'Miniskirt',
+        'Drill (Tool)', 'Dress', 'Bear', 'Waffle', 'Pancake', 'Brown bear',
+        'Woodpecker', 'Blue jay', 'Pretzel', 'Bagel', 'Tower', 'Teapot',
+        'Person', 'Bow and arrow', 'Swimwear', 'Beehive', 'Brassiere', 'Bee',
+        'Bat (Animal)', 'Starfish', 'Popcorn', 'Burrito', 'Chainsaw',
+        'Balloon', 'Wrench', 'Tent', 'Vehicle registration plate', 'Lantern',
+        'Toaster', 'Flashlight', 'Billboard', 'Tiara', 'Limousine', 'Necklace',
+        'Carnivore', 'Scissors', 'Stairs', 'Computer keyboard', 'Printer',
+        'Traffic sign', 'Chair', 'Shirt', 'Poster', 'Cheese', 'Sock',
+        'Fire hydrant', 'Land vehicle', 'Earrings', 'Tie', 'Watercraft',
+        'Cabinetry', 'Suitcase', 'Muffin', 'Bidet', 'Snack', 'Snowmobile',
+        'Clock', 'Medical equipment', 'Cattle', 'Cello', 'Jet ski', 'Camel',
+        'Coat', 'Suit', 'Desk', 'Cat', 'Bronze sculpture', 'Juice', 'Gondola',
+        'Beetle', 'Cannon', 'Computer mouse', 'Cookie', 'Office building',
+        'Fountain', 'Coin', 'Calculator', 'Cocktail', 'Computer monitor',
+        'Box', 'Stapler', 'Christmas tree', 'Cowboy hat', 'Hiking equipment',
+        'Studio couch', 'Drum', 'Dessert', 'Wine rack', 'Drink', 'Zucchini',
+        'Ladle', 'Human mouth', 'Dairy Product', 'Dice', 'Oven', 'Dinosaur',
+        'Ratchet (Device)', 'Couch', 'Cricket ball', 'Winter melon', 'Spatula',
+        'Whiteboard', 'Pencil sharpener', 'Door', 'Hat', 'Shower', 'Eraser',
+        'Fedora', 'Guacamole', 'Dagger', 'Scarf', 'Dolphin', 'Sombrero',
+        'Tin can', 'Mug', 'Tap', 'Harbor seal', 'Stretcher', 'Can opener',
+        'Goggles', 'Human body', 'Roller skates', 'Coffee cup',
+        'Cutting board', 'Blender', 'Plumbing fixture', 'Stop sign',
+        'Office supplies', 'Volleyball (Ball)', 'Vase', 'Slow cooker',
+        'Wardrobe', 'Coffee', 'Whisk', 'Paper towel', 'Personal care', 'Food',
+        'Sun hat', 'Tree house', 'Flying disc', 'Skirt', 'Gas stove',
+        'Salt and pepper shakers', 'Mechanical fan', 'Face powder', 'Fax',
+        'Fruit', 'French fries', 'Nightstand', 'Barrel', 'Kite', 'Tart',
+        'Treadmill', 'Fox', 'Flag', 'French horn', 'Window blind',
+        'Human foot', 'Golf cart', 'Jacket', 'Egg (Food)', 'Street light',
+        'Guitar', 'Pillow', 'Human leg', 'Isopod', 'Grape', 'Human ear',
+        'Power plugs and sockets', 'Panda', 'Giraffe', 'Woman', 'Door handle',
+        'Rhinoceros', 'Bathtub', 'Goldfish', 'Houseplant', 'Goat',
+        'Baseball bat', 'Baseball glove', 'Mixing bowl',
+        'Marine invertebrates', 'Kitchen utensil', 'Light switch', 'House',
+        'Horse', 'Stationary bicycle', 'Hammer', 'Ceiling fan', 'Sofa bed',
+        'Adhesive tape', 'Harp', 'Sandal', 'Bicycle helmet', 'Saucer',
+        'Harpsichord', 'Human hair', 'Heater', 'Harmonica', 'Hamster',
+        'Curtain', 'Bed', 'Kettle', 'Fireplace', 'Scale', 'Drinking straw',
+        'Insect', 'Hair dryer', 'Kitchenware', 'Indoor rower', 'Invertebrate',
+        'Food processor', 'Bookcase', 'Refrigerator', 'Wood-burning stove',
+        'Punching bag', 'Common fig', 'Cocktail shaker', 'Jaguar (Animal)',
+        'Golf ball', 'Fashion accessory', 'Alarm clock', 'Filing cabinet',
+        'Artichoke', 'Table', 'Tableware', 'Kangaroo', 'Koala', 'Knife',
+        'Bottle', 'Bottle opener', 'Lynx', 'Lavender (Plant)', 'Lighthouse',
+        'Dumbbell', 'Human head', 'Bowl', 'Humidifier', 'Porch', 'Lizard',
+        'Billiard table', 'Mammal', 'Mouse', 'Motorcycle',
+        'Musical instrument', 'Swim cap', 'Frying pan', 'Snowplow',
+        'Bathroom cabinet', 'Missile', 'Bust', 'Man', 'Waffle iron', 'Milk',
+        'Ring binder', 'Plate', 'Mobile phone', 'Baked goods', 'Mushroom',
+        'Crutch', 'Pitcher (Container)', 'Mirror', 'Personal flotation device',
+        'Table tennis racket', 'Pencil case', 'Musical keyboard', 'Scoreboard',
+        'Briefcase', 'Kitchen knife', 'Nail (Construction)', 'Tennis ball',
+        'Plastic bag', 'Oboe', 'Chest of drawers', 'Ostrich', 'Piano', 'Girl',
+        'Plant', 'Potato', 'Hair spray', 'Sports equipment', 'Pasta',
+        'Penguin', 'Pumpkin', 'Pear', 'Infant bed', 'Polar bear', 'Mixer',
+        'Cupboard', 'Jacuzzi', 'Pizza', 'Digital clock', 'Pig', 'Reptile',
+        'Rifle', 'Lipstick', 'Skateboard', 'Raven', 'High heels', 'Red panda',
+        'Rose', 'Rabbit', 'Sculpture', 'Saxophone', 'Shotgun', 'Seafood',
+        'Submarine sandwich', 'Snowboard', 'Sword', 'Picture frame', 'Sushi',
+        'Loveseat', 'Ski', 'Squirrel', 'Tripod', 'Stethoscope', 'Submarine',
+        'Scorpion', 'Segway', 'Training bench', 'Snake', 'Coffee table',
+        'Skyscraper', 'Sheep', 'Television', 'Trombone', 'Tea', 'Tank', 'Taco',
+        'Telephone', 'Torch', 'Tiger', 'Strawberry', 'Trumpet', 'Tree',
+        'Tomato', 'Train', 'Tool', 'Picnic basket', 'Cooking spray',
+        'Trousers', 'Bowling equipment', 'Football helmet', 'Truck',
+        'Measuring cup', 'Coffeemaker', 'Violin', 'Vehicle', 'Handbag',
+        'Paper cutter', 'Wine', 'Weapon', 'Wheel', 'Worm', 'Wok', 'Whale',
+        'Zebra', 'Auto part', 'Jug', 'Pizza cutter', 'Cream', 'Monkey', 'Lion',
+        'Bread', 'Platter', 'Chicken', 'Eagle', 'Helicopter', 'Owl', 'Duck',
+        'Turtle', 'Hippopotamus', 'Crocodile', 'Toilet', 'Toilet paper',
+        'Squid', 'Clothing', 'Footwear', 'Lemon', 'Spider', 'Deer', 'Frog',
+        'Banana', 'Rocket', 'Wine glass', 'Countertop', 'Tablet computer',
+        'Waste container', 'Swimming pool', 'Dog', 'Book', 'Elephant', 'Shark',
+        'Candle', 'Leopard', 'Axe', 'Hand dryer', 'Soap dispenser',
+        'Porcupine', 'Flower', 'Canary', 'Cheetah', 'Palm tree', 'Hamburger',
+        'Maple', 'Building', 'Fish', 'Lobster', 'Garden Asparagus',
+        'Furniture', 'Hedgehog', 'Airplane', 'Spoon', 'Otter', 'Bull',
+        'Oyster', 'Horizontal bar', 'Convenience store', 'Bomb', 'Bench',
+        'Ice cream', 'Caterpillar', 'Butterfly', 'Parachute', 'Orange',
+        'Antelope', 'Beaker', 'Moths and butterflies', 'Window', 'Closet',
+        'Castle', 'Jellyfish', 'Goose', 'Mule', 'Swan', 'Peach', 'Coconut',
+        'Seat belt', 'Raccoon', 'Chisel', 'Fork', 'Lamp', 'Camera',
+        'Squash (Plant)', 'Racket', 'Human face', 'Human arm', 'Vegetable',
+        'Diaper', 'Unicycle', 'Falcon', 'Chime', 'Snail', 'Shellfish',
+        'Cabbage', 'Carrot', 'Mango', 'Jeans', 'Flowerpot', 'Pineapple',
+        'Drawer', 'Stool', 'Envelope', 'Cake', 'Dragonfly', 'Common sunflower',
+        'Microwave oven', 'Honeycomb', 'Marine mammal', 'Sea lion', 'Ladybug',
+        'Shelf', 'Watch', 'Candy', 'Salad', 'Parrot', 'Handgun', 'Sparrow',
+        'Van', 'Grinder', 'Spice rack', 'Light bulb', 'Corded phone',
+        'Sports uniform', 'Tennis racket', 'Wall clock', 'Serving tray',
+        'Kitchen & dining room table', 'Dog bed', 'Cake stand',
+        'Cat furniture', 'Bathroom accessory', 'Facial tissue holder',
+        'Pressure cooker', 'Kitchen appliance', 'Tire', 'Ruler',
+        'Luggage and bags', 'Microphone', 'Broccoli', 'Umbrella', 'Pastry',
+        'Grapefruit', 'Band-aid', 'Animal', 'Bell pepper', 'Turkey', 'Lily',
+        'Pomegranate', 'Doughnut', 'Glasses', 'Human nose', 'Pen', 'Ant',
+        'Car', 'Aircraft', 'Human hand', 'Skunk', 'Teddy bear', 'Watermelon',
+        'Cantaloupe', 'Dishwasher', 'Flute', 'Balance beam', 'Sandwich',
+        'Shrimp', 'Sewing machine', 'Binoculars', 'Rays and skates', 'Ipod',
+        'Accordion', 'Willow', 'Crab', 'Crown', 'Seahorse', 'Perfume',
+        'Alpaca', 'Taxi', 'Canoe', 'Remote control', 'Wheelchair',
+        'Rugby ball', 'Armadillo', 'Maracas', 'Helmet'
+    ]
+
+
+def objects365v1_classes() -> list:
+    """Class names of Objects365 V1."""
+    return [
+        'person', 'sneakers', 'chair', 'hat', 'lamp', 'bottle',
+        'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk',
+        'handbag', 'street lights', 'book', 'plate', 'helmet', 'leather shoes',
+        'pillow', 'glove', 'potted plant', 'bracelet', 'flower', 'tv',
+        'storage box', 'vase', 'bench', 'wine glass', 'boots', 'bowl',
+        'dining table', 'umbrella', 'boat', 'flag', 'speaker', 'trash bin/can',
+        'stool', 'backpack', 'couch', 'belt', 'carpet', 'basket',
+        'towel/napkin', 'slippers', 'barrel/bucket', 'coffee table', 'suv',
+        'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', 'microphone',
+        'sandals', 'canned', 'necklace', 'mirror', 'faucet', 'bicycle',
+        'bread', 'high heels', 'ring', 'van', 'watch', 'sink', 'horse', 'fish',
+        'apple', 'camera', 'candle', 'teddy bear', 'cake', 'motorcycle',
+        'wild bird', 'laptop', 'knife', 'traffic sign', 'cell phone', 'paddle',
+        'truck', 'cow', 'power outlet', 'clock', 'drum', 'fork', 'bus',
+        'hanger', 'nightstand', 'pot/pan', 'sheep', 'guitar', 'traffic cone',
+        'tea pot', 'keyboard', 'tripod', 'hockey', 'fan', 'dog', 'spoon',
+        'blackboard/whiteboard', 'balloon', 'air conditioner', 'cymbal',
+        'mouse', 'telephone', 'pickup truck', 'orange', 'banana', 'airplane',
+        'luggage', 'skis', 'soccer', 'trolley', 'oven', 'remote',
+        'baseball glove', 'paper towel', 'refrigerator', 'train', 'tomato',
+        'machinery vehicle', 'tent', 'shampoo/shower gel', 'head phone',
+        'lantern', 'donut', 'cleaning products', 'sailboat', 'tangerine',
+        'pizza', 'kite', 'computer box', 'elephant', 'toiletries', 'gas stove',
+        'broccoli', 'toilet', 'stroller', 'shovel', 'baseball bat',
+        'microwave', 'skateboard', 'surfboard', 'surveillance camera', 'gun',
+        'life saver', 'cat', 'lemon', 'liquid soap', 'zebra', 'duck',
+        'sports car', 'giraffe', 'pumpkin', 'piano', 'stop sign', 'radiator',
+        'converter', 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies',
+        'cutting/chopping board', 'tennis racket', 'candy',
+        'skating and skiing shoes', 'scissors', 'folder', 'baseball',
+        'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine',
+        'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear',
+        'american football', 'basketball', 'potato', 'paint brush', 'printer',
+        'billiards', 'fire hydrant', 'goose', 'projector', 'sausage',
+        'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball',
+        'chopsticks', 'electronic stove and gas stove', 'pie', 'frisbee',
+        'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender',
+        'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango',
+        'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion',
+        'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale',
+        'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple',
+        'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle',
+        'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar',
+        'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD',
+        'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado',
+        'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear',
+        'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn',
+        'key', 'screwdriver', 'globe', 'broom', 'pliers', 'volleyball',
+        'hammer', 'eggplant', 'trophy', 'dates', 'board eraser', 'rice',
+        'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel',
+        'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', 'antelope',
+        'shrimp', 'rickshaw', 'trombone', 'pomegranate', 'coconut',
+        'jellyfish', 'mushroom', 'calculator', 'treadmill', 'butterfly',
+        'egg tart', 'cheese', 'pig', 'pomelo', 'race car', 'rice cooker',
+        'tuba', 'crosswalk sign', 'papaya', 'hair drier', 'green onion',
+        'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill',
+        'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup',
+        'shark', 'steak', 'poker card', 'binoculars', 'llama', 'radish',
+        'noodles', 'yak', 'mop', 'crab', 'microscope', 'barbell', 'bread/bun',
+        'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'seal',
+        'mangosteen', 'comb', 'eraser', 'pitaya', 'scallop', 'pencil case',
+        'saw', 'table tennis paddle', 'okra', 'starfish', 'eagle', 'monkey',
+        'durian', 'game board', 'rabbit', 'french horn', 'ambulance',
+        'asparagus', 'hoverboard', 'pasta', 'target', 'hotair balloon',
+        'chainsaw', 'lobster', 'iron', 'flashlight'
+    ]
+
+
+def objects365v2_classes() -> list:
+    """Class names of Objects365 V2."""
+    return [
+        'Person', 'Sneakers', 'Chair', 'Other Shoes', 'Hat', 'Car', 'Lamp',
+        'Glasses', 'Bottle', 'Desk', 'Cup', 'Street Lights', 'Cabinet/shelf',
+        'Handbag/Satchel', 'Bracelet', 'Plate', 'Picture/Frame', 'Helmet',
+        'Book', 'Gloves', 'Storage box', 'Boat', 'Leather Shoes', 'Flower',
+        'Bench', 'Potted Plant', 'Bowl/Basin', 'Flag', 'Pillow', 'Boots',
+        'Vase', 'Microphone', 'Necklace', 'Ring', 'SUV', 'Wine Glass', 'Belt',
+        'Moniter/TV', 'Backpack', 'Umbrella', 'Traffic Light', 'Speaker',
+        'Watch', 'Tie', 'Trash bin Can', 'Slippers', 'Bicycle', 'Stool',
+        'Barrel/bucket', 'Van', 'Couch', 'Sandals', 'Bakset', 'Drum',
+        'Pen/Pencil', 'Bus', 'Wild Bird', 'High Heels', 'Motorcycle', 'Guitar',
+        'Carpet', 'Cell Phone', 'Bread', 'Camera', 'Canned', 'Truck',
+        'Traffic cone', 'Cymbal', 'Lifesaver', 'Towel', 'Stuffed Toy',
+        'Candle', 'Sailboat', 'Laptop', 'Awning', 'Bed', 'Faucet', 'Tent',
+        'Horse', 'Mirror', 'Power outlet', 'Sink', 'Apple', 'Air Conditioner',
+        'Knife', 'Hockey Stick', 'Paddle', 'Pickup Truck', 'Fork',
+        'Traffic Sign', 'Ballon', 'Tripod', 'Dog', 'Spoon', 'Clock', 'Pot',
+        'Cow', 'Cake', 'Dinning Table', 'Sheep', 'Hanger',
+        'Blackboard/Whiteboard', 'Napkin', 'Other Fish', 'Orange/Tangerine',
+        'Toiletry', 'Keyboard', 'Tomato', 'Lantern', 'Machinery Vehicle',
+        'Fan', 'Green Vegetables', 'Banana', 'Baseball Glove', 'Airplane',
+        'Mouse', 'Train', 'Pumpkin', 'Soccer', 'Skiboard', 'Luggage',
+        'Nightstand', 'Tea pot', 'Telephone', 'Trolley', 'Head Phone',
+        'Sports Car', 'Stop Sign', 'Dessert', 'Scooter', 'Stroller', 'Crane',
+        'Remote', 'Refrigerator', 'Oven', 'Lemon', 'Duck', 'Baseball Bat',
+        'Surveillance Camera', 'Cat', 'Jug', 'Broccoli', 'Piano', 'Pizza',
+        'Elephant', 'Skateboard', 'Surfboard', 'Gun',
+        'Skating and Skiing shoes', 'Gas stove', 'Donut', 'Bow Tie', 'Carrot',
+        'Toilet', 'Kite', 'Strawberry', 'Other Balls', 'Shovel', 'Pepper',
+        'Computer Box', 'Toilet Paper', 'Cleaning Products', 'Chopsticks',
+        'Microwave', 'Pigeon', 'Baseball', 'Cutting/chopping Board',
+        'Coffee Table', 'Side Table', 'Scissors', 'Marker', 'Pie', 'Ladder',
+        'Snowboard', 'Cookies', 'Radiator', 'Fire Hydrant', 'Basketball',
+        'Zebra', 'Grape', 'Giraffe', 'Potato', 'Sausage', 'Tricycle', 'Violin',
+        'Egg', 'Fire Extinguisher', 'Candy', 'Fire Truck', 'Billards',
+        'Converter', 'Bathtub', 'Wheelchair', 'Golf Club', 'Briefcase',
+        'Cucumber', 'Cigar/Cigarette ', 'Paint Brush', 'Pear', 'Heavy Truck',
+        'Hamburger', 'Extractor', 'Extention Cord', 'Tong', 'Tennis Racket',
+        'Folder', 'American Football', 'earphone', 'Mask', 'Kettle', 'Tennis',
+        'Ship', 'Swing', 'Coffee Machine', 'Slide', 'Carriage', 'Onion',
+        'Green beans', 'Projector', 'Frisbee',
+        'Washing Machine/Drying Machine', 'Chicken', 'Printer', 'Watermelon',
+        'Saxophone', 'Tissue', 'Toothbrush', 'Ice cream', 'Hotair ballon',
+        'Cello', 'French Fries', 'Scale', 'Trophy', 'Cabbage', 'Hot dog',
+        'Blender', 'Peach', 'Rice', 'Wallet/Purse', 'Volleyball', 'Deer',
+        'Goose', 'Tape', 'Tablet', 'Cosmetics', 'Trumpet', 'Pineapple',
+        'Golf Ball', 'Ambulance', 'Parking meter', 'Mango', 'Key', 'Hurdle',
+        'Fishing Rod', 'Medal', 'Flute', 'Brush', 'Penguin', 'Megaphone',
+        'Corn', 'Lettuce', 'Garlic', 'Swan', 'Helicopter', 'Green Onion',
+        'Sandwich', 'Nuts', 'Speed Limit Sign', 'Induction Cooker', 'Broom',
+        'Trombone', 'Plum', 'Rickshaw', 'Goldfish', 'Kiwi fruit',
+        'Router/modem', 'Poker Card', 'Toaster', 'Shrimp', 'Sushi', 'Cheese',
+        'Notepaper', 'Cherry', 'Pliers', 'CD', 'Pasta', 'Hammer', 'Cue',
+        'Avocado', 'Hamimelon', 'Flask', 'Mushroon', 'Screwdriver', 'Soap',
+        'Recorder', 'Bear', 'Eggplant', 'Board Eraser', 'Coconut',
+        'Tape Measur/ Ruler', 'Pig', 'Showerhead', 'Globe', 'Chips', 'Steak',
+        'Crosswalk Sign', 'Stapler', 'Campel', 'Formula 1 ', 'Pomegranate',
+        'Dishwasher', 'Crab', 'Hoverboard', 'Meat ball', 'Rice Cooker', 'Tuba',
+        'Calculator', 'Papaya', 'Antelope', 'Parrot', 'Seal', 'Buttefly',
+        'Dumbbell', 'Donkey', 'Lion', 'Urinal', 'Dolphin', 'Electric Drill',
+        'Hair Dryer', 'Egg tart', 'Jellyfish', 'Treadmill', 'Lighter',
+        'Grapefruit', 'Game board', 'Mop', 'Radish', 'Baozi', 'Target',
+        'French', 'Spring Rolls', 'Monkey', 'Rabbit', 'Pencil Case', 'Yak',
+        'Red Cabbage', 'Binoculars', 'Asparagus', 'Barbell', 'Scallop',
+        'Noddles', 'Comb', 'Dumpling', 'Oyster', 'Table Teniis paddle',
+        'Cosmetics Brush/Eyeliner Pencil', 'Chainsaw', 'Eraser', 'Lobster',
+        'Durian', 'Okra', 'Lipstick', 'Cosmetics Mirror', 'Curling',
+        'Table Tennis '
+    ]
+
+
+def lvis_classes() -> list:
+    """Class names of LVIS."""
+    return [
+        'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', 'alcohol',
+        'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', 'antenna',
+        'apple', 'applesauce', 'apricot', 'apron', 'aquarium',
+        'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor',
+        'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer',
+        'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy',
+        'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel',
+        'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon',
+        'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo',
+        'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow',
+        'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap',
+        'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)',
+        'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)',
+        'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie',
+        'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper',
+        'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt',
+        'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor',
+        'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath',
+        'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card',
+        'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket',
+        'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry',
+        'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg',
+        'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase',
+        'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle',
+        'bottle_opener', 'bouquet', 'bow_(weapon)', 'bow_(decorative_ribbons)',
+        'bow-tie', 'bowl', 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'box',
+        'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere',
+        'bread-bin', 'bread', 'breechcloth', 'bridal_gown', 'briefcase',
+        'broccoli', 'broach', 'broom', 'brownie', 'brussels_sprouts',
+        'bubble_gum', 'bucket', 'horse_buggy', 'bull', 'bulldog', 'bulldozer',
+        'bullet_train', 'bulletin_board', 'bulletproof_vest', 'bullhorn',
+        'bun', 'bunk_bed', 'buoy', 'burrito', 'bus_(vehicle)', 'business_card',
+        'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car',
+        'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf',
+        'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)',
+        'can', 'can_opener', 'candle', 'candle_holder', 'candy_bar',
+        'candy_cane', 'walking_cane', 'canister', 'canoe', 'cantaloup',
+        'canteen', 'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino',
+        'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car',
+        'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship',
+        'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton',
+        'cash_register', 'casserole', 'cassette', 'cast', 'cat', 'cauliflower',
+        'cayenne_(spice)', 'CD_player', 'celery', 'cellular_telephone',
+        'chain_mail', 'chair', 'chaise_longue', 'chalice', 'chandelier',
+        'chap', 'checkbook', 'checkerboard', 'cherry', 'chessboard',
+        'chicken_(animal)', 'chickpea', 'chili_(vegetable)', 'chime',
+        'chinaware', 'crisp_(potato_chip)', 'poker_chip', 'chocolate_bar',
+        'chocolate_cake', 'chocolate_milk', 'chocolate_mousse', 'choker',
+        'chopping_board', 'chopstick', 'Christmas_tree', 'slide', 'cider',
+        'cigar_box', 'cigarette', 'cigarette_case', 'cistern', 'clarinet',
+        'clasp', 'cleansing_agent', 'cleat_(for_securing_rope)', 'clementine',
+        'clip', 'clipboard', 'clippers_(for_plants)', 'cloak', 'clock',
+        'clock_tower', 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster',
+        'coat', 'coat_hanger', 'coatrack', 'cock', 'cockroach',
+        'cocoa_(beverage)', 'coconut', 'coffee_maker', 'coffee_table',
+        'coffeepot', 'coil', 'coin', 'colander', 'coleslaw',
+        'coloring_material', 'combination_lock', 'pacifier', 'comic_book',
+        'compass', 'computer_keyboard', 'condiment', 'cone', 'control',
+        'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie',
+        'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)',
+        'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet',
+        'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall',
+        'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker',
+        'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib',
+        'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown',
+        'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch',
+        'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup',
+        'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain',
+        'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard',
+        'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk',
+        'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', 'tux',
+        'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher',
+        'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup',
+        'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin',
+        'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly',
+        'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit',
+        'dresser', 'drill', 'drone', 'dropper', 'drum_(musical_instrument)',
+        'drumstick', 'duck', 'duckling', 'duct_tape', 'duffel_bag', 'dumbbell',
+        'dumpster', 'dustpan', 'eagle', 'earphone', 'earplug', 'earring',
+        'easel', 'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater',
+        'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk',
+        'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan',
+        'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)',
+        'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', 'fire_alarm',
+        'fire_engine', 'fire_extinguisher', 'fire_hose', 'fireplace',
+        'fireplug', 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl',
+        'fishing_rod', 'flag', 'flagpole', 'flamingo', 'flannel', 'flap',
+        'flash', 'flashlight', 'fleece', 'flip-flop_(sandal)',
+        'flipper_(footwear)', 'flower_arrangement', 'flute_glass', 'foal',
+        'folding_chair', 'food_processor', 'football_(American)',
+        'football_helmet', 'footstool', 'fork', 'forklift', 'freight_car',
+        'French_toast', 'freshener', 'frisbee', 'frog', 'fruit_juice',
+        'frying_pan', 'fudge', 'funnel', 'futon', 'gag', 'garbage',
+        'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', 'garlic',
+        'gasmask', 'gazelle', 'gelatin', 'gemstone', 'generator',
+        'giant_panda', 'gift_wrap', 'ginger', 'giraffe', 'cincture',
+        'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles',
+        'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose',
+        'gorilla', 'gourd', 'grape', 'grater', 'gravestone', 'gravy_boat',
+        'green_bean', 'green_onion', 'griddle', 'grill', 'grits', 'grizzly',
+        'grocery_bag', 'guitar', 'gull', 'gun', 'hairbrush', 'hairnet',
+        'hairpin', 'halter_top', 'ham', 'hamburger', 'hammer', 'hammock',
+        'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel',
+        'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw',
+        'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', 'headband',
+        'headboard', 'headlight', 'headscarf', 'headset',
+        'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet',
+        'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog',
+        'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah',
+        'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce',
+        'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear',
+        'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate',
+        'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board',
+        'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey',
+        'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak',
+        'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono',
+        'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit',
+        'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)',
+        'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)',
+        'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard',
+        'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather',
+        'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', 'lettuce',
+        'license_plate', 'life_buoy', 'life_jacket', 'lightbulb',
+        'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor',
+        'lizard', 'log', 'lollipop', 'speaker_(stereo_equipment)', 'loveseat',
+        'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)',
+        'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', 'manger',
+        'manhole', 'map', 'marker', 'martini', 'mascot', 'mashed_potato',
+        'masher', 'mask', 'mast', 'mat_(gym_equipment)', 'matchbox',
+        'mattress', 'measuring_cup', 'measuring_stick', 'meatball', 'medicine',
+        'melon', 'microphone', 'microscope', 'microwave_oven', 'milestone',
+        'milk', 'milk_can', 'milkshake', 'minivan', 'mint_candy', 'mirror',
+        'mitten', 'mixer_(kitchen_tool)', 'money',
+        'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor',
+        'motor_scooter', 'motor_vehicle', 'motorcycle', 'mound_(baseball)',
+        'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom',
+        'music_stool', 'musical_instrument', 'nailfile', 'napkin',
+        'neckerchief', 'necklace', 'necktie', 'needle', 'nest', 'newspaper',
+        'newsstand', 'nightshirt', 'nosebag_(for_animals)',
+        'noseband_(for_animals)', 'notebook', 'notepad', 'nut', 'nutcracker',
+        'oar', 'octopus_(food)', 'octopus_(animal)', 'oil_lamp', 'olive_oil',
+        'omelet', 'onion', 'orange_(fruit)', 'orange_juice', 'ostrich',
+        'ottoman', 'oven', 'overalls_(clothing)', 'owl', 'packet', 'inkpad',
+        'pad', 'paddle', 'padlock', 'paintbrush', 'painting', 'pajamas',
+        'palette', 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake',
+        'pantyhose', 'papaya', 'paper_plate', 'paper_towel', 'paperback_book',
+        'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', 'parasol',
+        'parchment', 'parka', 'parking_meter', 'parrot',
+        'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport',
+        'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter',
+        'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg',
+        'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box',
+        'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)',
+        'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet',
+        'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano',
+        'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow',
+        'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball',
+        'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)',
+        'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat',
+        'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)',
+        'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)',
+        'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)',
+        'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', 'potato',
+        'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', 'pretzel',
+        'printer', 'projectile_(weapon)', 'projector', 'propeller', 'prune',
+        'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher',
+        'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit',
+        'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', 'radish',
+        'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat',
+        'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt',
+        'recliner', 'record_player', 'reflector', 'remote_control',
+        'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map',
+        'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade',
+        'rolling_pin', 'root_beer', 'router_(computer_equipment)',
+        'rubber_band', 'runner_(carpet)', 'plastic_bag',
+        'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin',
+        'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)',
+        'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)',
+        'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse',
+        'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf',
+        'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver',
+        'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane',
+        'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark',
+        'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl',
+        'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt',
+        'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass',
+        'shoulder_bag', 'shovel', 'shower_head', 'shower_cap',
+        'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink',
+        'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole',
+        'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)',
+        'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman',
+        'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball',
+        'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon',
+        'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)',
+        'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish',
+        'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)',
+        'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish',
+        'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel',
+        'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', 'stirrer',
+        'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', 'strainer',
+        'strap', 'straw_(for_drinking)', 'strawberry', 'street_sign',
+        'streetlight', 'string_cheese', 'stylus', 'subwoofer', 'sugar_bowl',
+        'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', 'sunglasses',
+        'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', 'sweatband',
+        'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit', 'sword',
+        'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table',
+        'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight',
+        'tambourine', 'army_tank', 'tank_(storage_vessel)',
+        'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure',
+        'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup',
+        'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth',
+        'telephone_pole', 'telephoto_lens', 'television_camera',
+        'television_set', 'tennis_ball', 'tennis_racket', 'tequila',
+        'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread',
+        'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', 'tinfoil',
+        'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', 'toaster_oven',
+        'toilet', 'toilet_tissue', 'tomato', 'tongs', 'toolbox', 'toothbrush',
+        'toothpaste', 'toothpick', 'cover', 'tortilla', 'tow_truck', 'towel',
+        'towel_rack', 'toy', 'tractor_(farm_equipment)', 'traffic_light',
+        'dirt_bike', 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline',
+        'tray', 'trench_coat', 'triangle_(musical_instrument)', 'tricycle',
+        'tripod', 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat',
+        'turban', 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)',
+        'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn',
+        'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest',
+        'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture',
+        'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick',
+        'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe',
+        'washbasin', 'automatic_washer', 'watch', 'water_bottle',
+        'water_cooler', 'water_faucet', 'water_heater', 'water_jug',
+        'water_gun', 'water_scooter', 'water_ski', 'water_tower',
+        'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake',
+        'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream',
+        'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)',
+        'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket',
+        'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon',
+        'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt',
+        'yoke_(animal_equipment)', 'zebra', 'zucchini'
+    ]
+
+
+dataset_aliases = {
+    'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'],
+    'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'],
+    'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'],
+    'coco': ['coco', 'mscoco', 'ms_coco'],
+    'coco_panoptic': ['coco_panoptic', 'panoptic'],
+    'wider_face': ['WIDERFaceDataset', 'wider_face', 'WIDERFace'],
+    'cityscapes': ['cityscapes'],
+    'oid_challenge': ['oid_challenge', 'openimages_challenge'],
+    'oid_v6': ['oid_v6', 'openimages_v6'],
+    'objects365v1': ['objects365v1', 'obj365v1'],
+    'objects365v2': ['objects365v2', 'obj365v2'],
+    'lvis': ['lvis', 'lvis_v1'],
+}
+
+
+def get_classes(dataset) -> list:
+    """Get class names of a dataset."""
+    alias2name = {}
+    for name, aliases in dataset_aliases.items():
+        for alias in aliases:
+            alias2name[alias] = name
+
+    if is_str(dataset):
+        if dataset in alias2name:
+            labels = eval(alias2name[dataset] + '_classes()')
+        else:
+            raise ValueError(f'Unrecognized dataset: {dataset}')
+    else:
+        raise TypeError(f'dataset must a str, but got {type(dataset)}')
+    return labels
diff --git a/mmde/mmdet/evaluation/functional/mean_ap.py b/mmde/mmdet/evaluation/functional/mean_ap.py
new file mode 100644
index 0000000000000000000000000000000000000000..989972a48467f74fa915fa6f3807d0db3becdba2
--- /dev/null
+++ b/mmde/mmdet/evaluation/functional/mean_ap.py
@@ -0,0 +1,792 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from multiprocessing import Pool
+
+import numpy as np
+from mmengine.logging import print_log
+from mmengine.utils import is_str
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+from .class_names import get_classes
+
+
+def average_precision(recalls, precisions, mode='area'):
+    """Calculate average precision (for single or multiple scales).
+
+    Args:
+        recalls (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        precisions (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        mode (str): 'area' or '11points', 'area' means calculating the area
+            under precision-recall curve, '11points' means calculating
+            the average precision of recalls at [0, 0.1, ..., 1]
+
+    Returns:
+        float or ndarray: calculated average precision
+    """
+    no_scale = False
+    if recalls.ndim == 1:
+        no_scale = True
+        recalls = recalls[np.newaxis, :]
+        precisions = precisions[np.newaxis, :]
+    assert recalls.shape == precisions.shape and recalls.ndim == 2
+    num_scales = recalls.shape[0]
+    ap = np.zeros(num_scales, dtype=np.float32)
+    if mode == 'area':
+        zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
+        ones = np.ones((num_scales, 1), dtype=recalls.dtype)
+        mrec = np.hstack((zeros, recalls, ones))
+        mpre = np.hstack((zeros, precisions, zeros))
+        for i in range(mpre.shape[1] - 1, 0, -1):
+            mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
+        for i in range(num_scales):
+            ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
+            ap[i] = np.sum(
+                (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
+    elif mode == '11points':
+        for i in range(num_scales):
+            for thr in np.arange(0, 1 + 1e-3, 0.1):
+                precs = precisions[i, recalls[i, :] >= thr]
+                prec = precs.max() if precs.size > 0 else 0
+                ap[i] += prec
+        ap /= 11
+    else:
+        raise ValueError(
+            'Unrecognized mode, only "area" and "11points" are supported')
+    if no_scale:
+        ap = ap[0]
+    return ap
+
+
+def tpfp_imagenet(det_bboxes,
+                  gt_bboxes,
+                  gt_bboxes_ignore=None,
+                  default_iou_thr=0.5,
+                  area_ranges=None,
+                  use_legacy_coordinate=False,
+                  **kwargs):
+    """Check if detected bboxes are true positive or false positive.
+
+    Args:
+        det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+        gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+        gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+            of shape (k, 4). Defaults to None
+        default_iou_thr (float): IoU threshold to be considered as matched for
+            medium and large bboxes (small ones have special rules).
+            Defaults to 0.5.
+        area_ranges (list[tuple] | None): Range of bbox areas to be evaluated,
+            in the format [(min1, max1), (min2, max2), ...]. Defaults to None.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+            Defaults to False.
+
+    Returns:
+        tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+        each array is (num_scales, m).
+    """
+
+    if not use_legacy_coordinate:
+        extra_length = 0.
+    else:
+        extra_length = 1.
+
+    # an indicator of ignored gts
+    gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0],
+                  dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+    # stack gt_bboxes and gt_bboxes_ignore for convenience
+    gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_gts), each row is tp or fp
+    # of a certain scale.
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp
+    ious = bbox_overlaps(
+        det_bboxes, gt_bboxes - 1, use_legacy_coordinate=use_legacy_coordinate)
+    gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length
+    gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length
+    iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)),
+                          default_iou_thr)
+    # sort all detections by scores in descending order
+    sort_inds = np.argsort(-det_bboxes[:, -1])
+    for k, (min_area, max_area) in enumerate(area_ranges):
+        gt_covered = np.zeros(num_gts, dtype=bool)
+        # if no area range is specified, gt_area_ignore is all False
+        if min_area is None:
+            gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+        else:
+            gt_areas = gt_w * gt_h
+            gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+        for i in sort_inds:
+            max_iou = -1
+            matched_gt = -1
+            # find best overlapped available gt
+            for j in range(num_gts):
+                # different from PASCAL VOC: allow finding other gts if the
+                # best overlapped ones are already matched by other det bboxes
+                if gt_covered[j]:
+                    continue
+                elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou:
+                    max_iou = ious[i, j]
+                    matched_gt = j
+            # there are 4 cases for a det bbox:
+            # 1. it matches a gt, tp = 1, fp = 0
+            # 2. it matches an ignored gt, tp = 0, fp = 0
+            # 3. it matches no gt and within area range, tp = 0, fp = 1
+            # 4. it matches no gt but is beyond area range, tp = 0, fp = 0
+            if matched_gt >= 0:
+                gt_covered[matched_gt] = 1
+                if not (gt_ignore_inds[matched_gt]
+                        or gt_area_ignore[matched_gt]):
+                    tp[k, i] = 1
+            elif min_area is None:
+                fp[k, i] = 1
+            else:
+                bbox = det_bboxes[i, :4]
+                area = (bbox[2] - bbox[0] + extra_length) * (
+                    bbox[3] - bbox[1] + extra_length)
+                if area >= min_area and area < max_area:
+                    fp[k, i] = 1
+    return tp, fp
+
+
+def tpfp_default(det_bboxes,
+                 gt_bboxes,
+                 gt_bboxes_ignore=None,
+                 iou_thr=0.5,
+                 area_ranges=None,
+                 use_legacy_coordinate=False,
+                 **kwargs):
+    """Check if detected bboxes are true positive or false positive.
+
+    Args:
+        det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+        gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+        gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+            of shape (k, 4). Defaults to None
+        iou_thr (float): IoU threshold to be considered as matched.
+            Defaults to 0.5.
+        area_ranges (list[tuple] | None): Range of bbox areas to be
+            evaluated, in the format [(min1, max1), (min2, max2), ...].
+            Defaults to None.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+            Defaults to False.
+
+    Returns:
+        tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+        each array is (num_scales, m).
+    """
+
+    if not use_legacy_coordinate:
+        extra_length = 0.
+    else:
+        extra_length = 1.
+
+    # an indicator of ignored gts
+    gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0],
+                  dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+    # stack gt_bboxes and gt_bboxes_ignore for convenience
+    gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of
+    # a certain scale
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+
+    # if there is no gt bboxes in this image, then all det bboxes
+    # within area range are false positives
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp
+
+    ious = bbox_overlaps(
+        det_bboxes, gt_bboxes, use_legacy_coordinate=use_legacy_coordinate)
+    # for each det, the max iou with all gts
+    ious_max = ious.max(axis=1)
+    # for each det, which gt overlaps most with it
+    ious_argmax = ious.argmax(axis=1)
+    # sort all dets in descending order by scores
+    sort_inds = np.argsort(-det_bboxes[:, -1])
+    for k, (min_area, max_area) in enumerate(area_ranges):
+        gt_covered = np.zeros(num_gts, dtype=bool)
+        # if no area range is specified, gt_area_ignore is all False
+        if min_area is None:
+            gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+        else:
+            gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length) * (
+                gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length)
+            gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+        for i in sort_inds:
+            if ious_max[i] >= iou_thr:
+                matched_gt = ious_argmax[i]
+                if not (gt_ignore_inds[matched_gt]
+                        or gt_area_ignore[matched_gt]):
+                    if not gt_covered[matched_gt]:
+                        gt_covered[matched_gt] = True
+                        tp[k, i] = 1
+                    else:
+                        fp[k, i] = 1
+                # otherwise ignore this detected bbox, tp = 0, fp = 0
+            elif min_area is None:
+                fp[k, i] = 1
+            else:
+                bbox = det_bboxes[i, :4]
+                area = (bbox[2] - bbox[0] + extra_length) * (
+                    bbox[3] - bbox[1] + extra_length)
+                if area >= min_area and area < max_area:
+                    fp[k, i] = 1
+    return tp, fp
+
+
+def tpfp_openimages(det_bboxes,
+                    gt_bboxes,
+                    gt_bboxes_ignore=None,
+                    iou_thr=0.5,
+                    area_ranges=None,
+                    use_legacy_coordinate=False,
+                    gt_bboxes_group_of=None,
+                    use_group_of=True,
+                    ioa_thr=0.5,
+                    **kwargs):
+    """Check if detected bboxes are true positive or false positive.
+
+    Args:
+        det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+        gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+        gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+            of shape (k, 4). Defaults to None
+        iou_thr (float): IoU threshold to be considered as matched.
+            Defaults to 0.5.
+        area_ranges (list[tuple] | None): Range of bbox areas to be
+            evaluated, in the format [(min1, max1), (min2, max2), ...].
+            Defaults to None.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+            Defaults to False.
+        gt_bboxes_group_of (ndarray): GT group_of of this image, of shape
+            (k, 1). Defaults to None
+        use_group_of (bool): Whether to use group of when calculate TP and FP,
+            which only used in OpenImages evaluation. Defaults to True.
+        ioa_thr (float | None): IoA threshold to be considered as matched,
+            which only used in OpenImages evaluation. Defaults to 0.5.
+
+    Returns:
+        tuple[np.ndarray]: Returns a tuple (tp, fp, det_bboxes), where
+        (tp, fp) whose elements are 0 and 1. The shape of each array is
+        (num_scales, m). (det_bboxes) whose will filter those are not
+        matched by group of gts when processing Open Images evaluation.
+        The shape is (num_scales, m).
+    """
+
+    if not use_legacy_coordinate:
+        extra_length = 0.
+    else:
+        extra_length = 1.
+
+    # an indicator of ignored gts
+    gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0],
+                  dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+    # stack gt_bboxes and gt_bboxes_ignore for convenience
+    gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of
+    # a certain scale
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+
+    # if there is no gt bboxes in this image, then all det bboxes
+    # within area range are false positives
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp, det_bboxes
+
+    if gt_bboxes_group_of is not None and use_group_of:
+        # if handle group-of boxes, divided gt boxes into two parts:
+        # non-group-of and group-of.Then calculate ious and ioas through
+        # non-group-of group-of gts respectively. This only used in
+        # OpenImages evaluation.
+        assert gt_bboxes_group_of.shape[0] == gt_bboxes.shape[0]
+        non_group_gt_bboxes = gt_bboxes[~gt_bboxes_group_of]
+        group_gt_bboxes = gt_bboxes[gt_bboxes_group_of]
+        num_gts_group = group_gt_bboxes.shape[0]
+        ious = bbox_overlaps(det_bboxes, non_group_gt_bboxes)
+        ioas = bbox_overlaps(det_bboxes, group_gt_bboxes, mode='iof')
+    else:
+        # if not consider group-of boxes, only calculate ious through gt boxes
+        ious = bbox_overlaps(
+            det_bboxes, gt_bboxes, use_legacy_coordinate=use_legacy_coordinate)
+        ioas = None
+
+    if ious.shape[1] > 0:
+        # for each det, the max iou with all gts
+        ious_max = ious.max(axis=1)
+        # for each det, which gt overlaps most with it
+        ious_argmax = ious.argmax(axis=1)
+        # sort all dets in descending order by scores
+        sort_inds = np.argsort(-det_bboxes[:, -1])
+        for k, (min_area, max_area) in enumerate(area_ranges):
+            gt_covered = np.zeros(num_gts, dtype=bool)
+            # if no area range is specified, gt_area_ignore is all False
+            if min_area is None:
+                gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+            else:
+                gt_areas = (
+                    gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length) * (
+                        gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length)
+                gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+            for i in sort_inds:
+                if ious_max[i] >= iou_thr:
+                    matched_gt = ious_argmax[i]
+                    if not (gt_ignore_inds[matched_gt]
+                            or gt_area_ignore[matched_gt]):
+                        if not gt_covered[matched_gt]:
+                            gt_covered[matched_gt] = True
+                            tp[k, i] = 1
+                        else:
+                            fp[k, i] = 1
+                    # otherwise ignore this detected bbox, tp = 0, fp = 0
+                elif min_area is None:
+                    fp[k, i] = 1
+                else:
+                    bbox = det_bboxes[i, :4]
+                    area = (bbox[2] - bbox[0] + extra_length) * (
+                        bbox[3] - bbox[1] + extra_length)
+                    if area >= min_area and area < max_area:
+                        fp[k, i] = 1
+    else:
+        # if there is no no-group-of gt bboxes in this image,
+        # then all det bboxes within area range are false positives.
+        # Only used in OpenImages evaluation.
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+
+    if ioas is None or ioas.shape[1] <= 0:
+        return tp, fp, det_bboxes
+    else:
+        # The evaluation of group-of TP and FP are done in two stages:
+        # 1. All detections are first matched to non group-of boxes; true
+        #    positives are determined.
+        # 2. Detections that are determined as false positives are matched
+        #    against group-of boxes and calculated group-of TP and FP.
+        # Only used in OpenImages evaluation.
+        det_bboxes_group = np.zeros(
+            (num_scales, ioas.shape[1], det_bboxes.shape[1]), dtype=float)
+        match_group_of = np.zeros((num_scales, num_dets), dtype=bool)
+        tp_group = np.zeros((num_scales, num_gts_group), dtype=np.float32)
+        ioas_max = ioas.max(axis=1)
+        # for each det, which gt overlaps most with it
+        ioas_argmax = ioas.argmax(axis=1)
+        # sort all dets in descending order by scores
+        sort_inds = np.argsort(-det_bboxes[:, -1])
+        for k, (min_area, max_area) in enumerate(area_ranges):
+            box_is_covered = tp[k]
+            # if no area range is specified, gt_area_ignore is all False
+            if min_area is None:
+                gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+            else:
+                gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
+                    gt_bboxes[:, 3] - gt_bboxes[:, 1])
+                gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+            for i in sort_inds:
+                matched_gt = ioas_argmax[i]
+                if not box_is_covered[i]:
+                    if ioas_max[i] >= ioa_thr:
+                        if not (gt_ignore_inds[matched_gt]
+                                or gt_area_ignore[matched_gt]):
+                            if not tp_group[k, matched_gt]:
+                                tp_group[k, matched_gt] = 1
+                                match_group_of[k, i] = True
+                            else:
+                                match_group_of[k, i] = True
+
+                            if det_bboxes_group[k, matched_gt, -1] < \
+                                    det_bboxes[i, -1]:
+                                det_bboxes_group[k, matched_gt] = \
+                                    det_bboxes[i]
+
+        fp_group = (tp_group <= 0).astype(float)
+        tps = []
+        fps = []
+        # concatenate tp, fp, and det-boxes which not matched group of
+        # gt boxes and tp_group, fp_group, and det_bboxes_group which
+        # matched group of boxes respectively.
+        for i in range(num_scales):
+            tps.append(
+                np.concatenate((tp[i][~match_group_of[i]], tp_group[i])))
+            fps.append(
+                np.concatenate((fp[i][~match_group_of[i]], fp_group[i])))
+            det_bboxes = np.concatenate(
+                (det_bboxes[~match_group_of[i]], det_bboxes_group[i]))
+
+        tp = np.vstack(tps)
+        fp = np.vstack(fps)
+        return tp, fp, det_bboxes
+
+
+def get_cls_results(det_results, annotations, class_id):
+    """Get det results and gt information of a certain class.
+
+    Args:
+        det_results (list[list]): Same as `eval_map()`.
+        annotations (list[dict]): Same as `eval_map()`.
+        class_id (int): ID of a specific class.
+
+    Returns:
+        tuple[list[np.ndarray]]: detected bboxes, gt bboxes, ignored gt bboxes
+    """
+    cls_dets = [img_res[class_id] for img_res in det_results]
+    cls_gts = []
+    cls_gts_ignore = []
+    for ann in annotations:
+        gt_inds = ann['labels'] == class_id
+        cls_gts.append(ann['bboxes'][gt_inds, :])
+
+        if ann.get('labels_ignore', None) is not None:
+            ignore_inds = ann['labels_ignore'] == class_id
+            cls_gts_ignore.append(ann['bboxes_ignore'][ignore_inds, :])
+        else:
+            cls_gts_ignore.append(np.empty((0, 4), dtype=np.float32))
+
+    return cls_dets, cls_gts, cls_gts_ignore
+
+
+def get_cls_group_ofs(annotations, class_id):
+    """Get `gt_group_of` of a certain class, which is used in Open Images.
+
+    Args:
+        annotations (list[dict]): Same as `eval_map()`.
+        class_id (int): ID of a specific class.
+
+    Returns:
+        list[np.ndarray]: `gt_group_of` of a certain class.
+    """
+    gt_group_ofs = []
+    for ann in annotations:
+        gt_inds = ann['labels'] == class_id
+        if ann.get('gt_is_group_ofs', None) is not None:
+            gt_group_ofs.append(ann['gt_is_group_ofs'][gt_inds])
+        else:
+            gt_group_ofs.append(np.empty((0, 1), dtype=bool))
+
+    return gt_group_ofs
+
+
+def eval_map(det_results,
+             annotations,
+             scale_ranges=None,
+             iou_thr=0.5,
+             ioa_thr=None,
+             dataset=None,
+             logger=None,
+             tpfp_fn=None,
+             nproc=4,
+             use_legacy_coordinate=False,
+             use_group_of=False,
+             eval_mode='area'):
+    """Evaluate mAP of a dataset.
+
+    Args:
+        det_results (list[list]): [[cls1_det, cls2_det, ...], ...].
+            The outer list indicates images, and the inner list indicates
+            per-class detected bboxes.
+        annotations (list[dict]): Ground truth annotations where each item of
+            the list indicates an image. Keys of annotations are:
+
+            - `bboxes`: numpy array of shape (n, 4)
+            - `labels`: numpy array of shape (n, )
+            - `bboxes_ignore` (optional): numpy array of shape (k, 4)
+            - `labels_ignore` (optional): numpy array of shape (k, )
+        scale_ranges (list[tuple] | None): Range of scales to be evaluated,
+            in the format [(min1, max1), (min2, max2), ...]. A range of
+            (32, 64) means the area range between (32**2, 64**2).
+            Defaults to None.
+        iou_thr (float): IoU threshold to be considered as matched.
+            Defaults to 0.5.
+        ioa_thr (float | None): IoA threshold to be considered as matched,
+            which only used in OpenImages evaluation. Defaults to None.
+        dataset (list[str] | str | None): Dataset name or dataset classes,
+            there are minor differences in metrics for different datasets, e.g.
+            "voc", "imagenet_det", etc. Defaults to None.
+        logger (logging.Logger | str | None): The way to print the mAP
+            summary. See `mmengine.logging.print_log()` for details.
+            Defaults to None.
+        tpfp_fn (callable | None): The function used to determine true/
+            false positives. If None, :func:`tpfp_default` is used as default
+            unless dataset is 'det' or 'vid' (:func:`tpfp_imagenet` in this
+            case). If it is given as a function, then this function is used
+            to evaluate tp & fp. Default None.
+        nproc (int): Processes used for computing TP and FP.
+            Defaults to 4.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+            Defaults to False.
+        use_group_of (bool): Whether to use group of when calculate TP and FP,
+            which only used in OpenImages evaluation. Defaults to False.
+        eval_mode (str): 'area' or '11points', 'area' means calculating the
+            area under precision-recall curve, '11points' means calculating
+            the average precision of recalls at [0, 0.1, ..., 1],
+            PASCAL VOC2007 uses `11points` as default evaluate mode, while
+            others are 'area'. Defaults to 'area'.
+
+    Returns:
+        tuple: (mAP, [dict, dict, ...])
+    """
+    assert len(det_results) == len(annotations)
+    assert eval_mode in ['area', '11points'], \
+        f'Unrecognized {eval_mode} mode, only "area" and "11points" ' \
+        'are supported'
+    if not use_legacy_coordinate:
+        extra_length = 0.
+    else:
+        extra_length = 1.
+
+    num_imgs = len(det_results)
+    num_scales = len(scale_ranges) if scale_ranges is not None else 1
+    num_classes = len(det_results[0])  # positive class num
+    area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges]
+                   if scale_ranges is not None else None)
+
+    # There is no need to use multi processes to process
+    # when num_imgs = 1 .
+    if num_imgs > 1:
+        assert nproc > 0, 'nproc must be at least one.'
+        nproc = min(nproc, num_imgs)
+        pool = Pool(nproc)
+
+    eval_results = []
+    for i in range(num_classes):
+        # get gt and det bboxes of this class
+        cls_dets, cls_gts, cls_gts_ignore = get_cls_results(
+            det_results, annotations, i)
+        # choose proper function according to datasets to compute tp and fp
+        if tpfp_fn is None:
+            if dataset in ['det', 'vid']:
+                tpfp_fn = tpfp_imagenet
+            elif dataset in ['oid_challenge', 'oid_v6'] \
+                    or use_group_of is True:
+                tpfp_fn = tpfp_openimages
+            else:
+                tpfp_fn = tpfp_default
+        if not callable(tpfp_fn):
+            raise ValueError(
+                f'tpfp_fn has to be a function or None, but got {tpfp_fn}')
+
+        if num_imgs > 1:
+            # compute tp and fp for each image with multiple processes
+            args = []
+            if use_group_of:
+                # used in Open Images Dataset evaluation
+                gt_group_ofs = get_cls_group_ofs(annotations, i)
+                args.append(gt_group_ofs)
+                args.append([use_group_of for _ in range(num_imgs)])
+            if ioa_thr is not None:
+                args.append([ioa_thr for _ in range(num_imgs)])
+
+            tpfp = pool.starmap(
+                tpfp_fn,
+                zip(cls_dets, cls_gts, cls_gts_ignore,
+                    [iou_thr for _ in range(num_imgs)],
+                    [area_ranges for _ in range(num_imgs)],
+                    [use_legacy_coordinate for _ in range(num_imgs)], *args))
+        else:
+            tpfp = tpfp_fn(
+                cls_dets[0],
+                cls_gts[0],
+                cls_gts_ignore[0],
+                iou_thr,
+                area_ranges,
+                use_legacy_coordinate,
+                gt_bboxes_group_of=(get_cls_group_ofs(annotations, i)[0]
+                                    if use_group_of else None),
+                use_group_of=use_group_of,
+                ioa_thr=ioa_thr)
+            tpfp = [tpfp]
+
+        if use_group_of:
+            tp, fp, cls_dets = tuple(zip(*tpfp))
+        else:
+            tp, fp = tuple(zip(*tpfp))
+        # calculate gt number of each scale
+        # ignored gts or gts beyond the specific scale are not counted
+        num_gts = np.zeros(num_scales, dtype=int)
+        for j, bbox in enumerate(cls_gts):
+            if area_ranges is None:
+                num_gts[0] += bbox.shape[0]
+            else:
+                gt_areas = (bbox[:, 2] - bbox[:, 0] + extra_length) * (
+                    bbox[:, 3] - bbox[:, 1] + extra_length)
+                for k, (min_area, max_area) in enumerate(area_ranges):
+                    num_gts[k] += np.sum((gt_areas >= min_area)
+                                         & (gt_areas < max_area))
+        # sort all det bboxes by score, also sort tp and fp
+        cls_dets = np.vstack(cls_dets)
+        num_dets = cls_dets.shape[0]
+        sort_inds = np.argsort(-cls_dets[:, -1])
+        tp = np.hstack(tp)[:, sort_inds]
+        fp = np.hstack(fp)[:, sort_inds]
+        # calculate recall and precision with tp and fp
+        tp = np.cumsum(tp, axis=1)
+        fp = np.cumsum(fp, axis=1)
+        eps = np.finfo(np.float32).eps
+        recalls = tp / np.maximum(num_gts[:, np.newaxis], eps)
+        precisions = tp / np.maximum((tp + fp), eps)
+        # calculate AP
+        if scale_ranges is None:
+            recalls = recalls[0, :]
+            precisions = precisions[0, :]
+            num_gts = num_gts.item()
+        ap = average_precision(recalls, precisions, eval_mode)
+        eval_results.append({
+            'num_gts': num_gts,
+            'num_dets': num_dets,
+            'recall': recalls,
+            'precision': precisions,
+            'ap': ap
+        })
+
+    if num_imgs > 1:
+        pool.close()
+
+    if scale_ranges is not None:
+        # shape (num_classes, num_scales)
+        all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results])
+        all_num_gts = np.vstack(
+            [cls_result['num_gts'] for cls_result in eval_results])
+        mean_ap = []
+        for i in range(num_scales):
+            if np.any(all_num_gts[:, i] > 0):
+                mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean())
+            else:
+                mean_ap.append(0.0)
+    else:
+        aps = []
+        for cls_result in eval_results:
+            if cls_result['num_gts'] > 0:
+                aps.append(cls_result['ap'])
+        mean_ap = np.array(aps).mean().item() if aps else 0.0
+
+    print_map_summary(
+        mean_ap, eval_results, dataset, area_ranges, logger=logger)
+
+    return mean_ap, eval_results
+
+
+def print_map_summary(mean_ap,
+                      results,
+                      dataset=None,
+                      scale_ranges=None,
+                      logger=None):
+    """Print mAP and results of each class.
+
+    A table will be printed to show the gts/dets/recall/AP of each class and
+    the mAP.
+
+    Args:
+        mean_ap (float): Calculated from `eval_map()`.
+        results (list[dict]): Calculated from `eval_map()`.
+        dataset (list[str] | str | None): Dataset name or dataset classes.
+        scale_ranges (list[tuple] | None): Range of scales to be evaluated.
+        logger (logging.Logger | str | None): The way to print the mAP
+            summary. See `mmengine.logging.print_log()` for details.
+            Defaults to None.
+    """
+
+    if logger == 'silent':
+        return
+
+    if isinstance(results[0]['ap'], np.ndarray):
+        num_scales = len(results[0]['ap'])
+    else:
+        num_scales = 1
+
+    if scale_ranges is not None:
+        assert len(scale_ranges) == num_scales
+
+    num_classes = len(results)
+
+    recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
+    aps = np.zeros((num_scales, num_classes), dtype=np.float32)
+    num_gts = np.zeros((num_scales, num_classes), dtype=int)
+    for i, cls_result in enumerate(results):
+        if cls_result['recall'].size > 0:
+            recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1]
+        aps[:, i] = cls_result['ap']
+        num_gts[:, i] = cls_result['num_gts']
+
+    if dataset is None:
+        label_names = [str(i) for i in range(num_classes)]
+    elif is_str(dataset):
+        label_names = get_classes(dataset)
+    else:
+        label_names = dataset
+
+    if not isinstance(mean_ap, list):
+        mean_ap = [mean_ap]
+
+    header = ['class', 'gts', 'dets', 'recall', 'ap']
+    for i in range(num_scales):
+        if scale_ranges is not None:
+            print_log(f'Scale range {scale_ranges[i]}', logger=logger)
+        table_data = [header]
+        for j in range(num_classes):
+            row_data = [
+                label_names[j], num_gts[i, j], results[j]['num_dets'],
+                f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}'
+            ]
+            table_data.append(row_data)
+        table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}'])
+        table = AsciiTable(table_data)
+        table.inner_footing_row_border = True
+        print_log('\n' + table.table, logger=logger)
diff --git a/mmde/mmdet/evaluation/functional/panoptic_utils.py b/mmde/mmdet/evaluation/functional/panoptic_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6faa8ed52bc46c2cb74b1974b8daa521e616e996
--- /dev/null
+++ b/mmde/mmdet/evaluation/functional/panoptic_utils.py
@@ -0,0 +1,228 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+# Copyright (c) 2018, Alexander Kirillov
+# This file supports `backend_args` for `panopticapi`,
+# the source code is copied from `panopticapi`,
+# only the way to load the gt images is modified.
+import multiprocessing
+import os
+
+import mmcv
+import numpy as np
+from mmengine.fileio import get
+
+# A custom value to distinguish instance ID and category ID; need to
+# be greater than the number of categories.
+# For a pixel in the panoptic result map:
+#   pan_id = ins_id * INSTANCE_OFFSET + cat_id
+INSTANCE_OFFSET = 1000
+
+try:
+    from panopticapi.evaluation import OFFSET, VOID, PQStat
+    from panopticapi.utils import rgb2id
+except ImportError:
+    PQStat = None
+    rgb2id = None
+    VOID = 0
+    OFFSET = 256 * 256 * 256
+
+
+def pq_compute_single_core(proc_id,
+                           annotation_set,
+                           gt_folder,
+                           pred_folder,
+                           categories,
+                           backend_args=None,
+                           print_log=False):
+    """The single core function to evaluate the metric of Panoptic
+    Segmentation.
+
+    Same as the function with the same name in `panopticapi`. Only the function
+    to load the images is changed to use the file client.
+
+    Args:
+        proc_id (int): The id of the mini process.
+        gt_folder (str): The path of the ground truth images.
+        pred_folder (str): The path of the prediction images.
+        categories (str): The categories of the dataset.
+        backend_args (object): The Backend of the dataset. If None,
+            the backend will be set to `local`.
+        print_log (bool): Whether to print the log. Defaults to False.
+    """
+    if PQStat is None:
+        raise RuntimeError(
+            'panopticapi is not installed, please install it by: '
+            'pip install git+https://github.com/cocodataset/'
+            'panopticapi.git.')
+
+    pq_stat = PQStat()
+
+    idx = 0
+    for gt_ann, pred_ann in annotation_set:
+        if print_log and idx % 100 == 0:
+            print('Core: {}, {} from {} images processed'.format(
+                proc_id, idx, len(annotation_set)))
+        idx += 1
+        # The gt images can be on the local disk or `ceph`, so we use
+        # backend here.
+        img_bytes = get(
+            os.path.join(gt_folder, gt_ann['file_name']),
+            backend_args=backend_args)
+        pan_gt = mmcv.imfrombytes(img_bytes, flag='color', channel_order='rgb')
+        pan_gt = rgb2id(pan_gt)
+
+        # The predictions can only be on the local dist now.
+        pan_pred = mmcv.imread(
+            os.path.join(pred_folder, pred_ann['file_name']),
+            flag='color',
+            channel_order='rgb')
+        pan_pred = rgb2id(pan_pred)
+
+        gt_segms = {el['id']: el for el in gt_ann['segments_info']}
+        pred_segms = {el['id']: el for el in pred_ann['segments_info']}
+
+        # predicted segments area calculation + prediction sanity checks
+        pred_labels_set = set(el['id'] for el in pred_ann['segments_info'])
+        labels, labels_cnt = np.unique(pan_pred, return_counts=True)
+        for label, label_cnt in zip(labels, labels_cnt):
+            if label not in pred_segms:
+                if label == VOID:
+                    continue
+                raise KeyError(
+                    'In the image with ID {} segment with ID {} is '
+                    'presented in PNG and not presented in JSON.'.format(
+                        gt_ann['image_id'], label))
+            pred_segms[label]['area'] = label_cnt
+            pred_labels_set.remove(label)
+            if pred_segms[label]['category_id'] not in categories:
+                raise KeyError(
+                    'In the image with ID {} segment with ID {} has '
+                    'unknown category_id {}.'.format(
+                        gt_ann['image_id'], label,
+                        pred_segms[label]['category_id']))
+        if len(pred_labels_set) != 0:
+            raise KeyError(
+                'In the image with ID {} the following segment IDs {} '
+                'are presented in JSON and not presented in PNG.'.format(
+                    gt_ann['image_id'], list(pred_labels_set)))
+
+        # confusion matrix calculation
+        pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype(
+            np.uint64)
+        gt_pred_map = {}
+        labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True)
+        for label, intersection in zip(labels, labels_cnt):
+            gt_id = label // OFFSET
+            pred_id = label % OFFSET
+            gt_pred_map[(gt_id, pred_id)] = intersection
+
+        # count all matched pairs
+        gt_matched = set()
+        pred_matched = set()
+        for label_tuple, intersection in gt_pred_map.items():
+            gt_label, pred_label = label_tuple
+            if gt_label not in gt_segms:
+                continue
+            if pred_label not in pred_segms:
+                continue
+            if gt_segms[gt_label]['iscrowd'] == 1:
+                continue
+            if gt_segms[gt_label]['category_id'] != pred_segms[pred_label][
+                    'category_id']:
+                continue
+
+            union = pred_segms[pred_label]['area'] + gt_segms[gt_label][
+                'area'] - intersection - gt_pred_map.get((VOID, pred_label), 0)
+            iou = intersection / union
+            if iou > 0.5:
+                pq_stat[gt_segms[gt_label]['category_id']].tp += 1
+                pq_stat[gt_segms[gt_label]['category_id']].iou += iou
+                gt_matched.add(gt_label)
+                pred_matched.add(pred_label)
+
+        # count false positives
+        crowd_labels_dict = {}
+        for gt_label, gt_info in gt_segms.items():
+            if gt_label in gt_matched:
+                continue
+            # crowd segments are ignored
+            if gt_info['iscrowd'] == 1:
+                crowd_labels_dict[gt_info['category_id']] = gt_label
+                continue
+            pq_stat[gt_info['category_id']].fn += 1
+
+        # count false positives
+        for pred_label, pred_info in pred_segms.items():
+            if pred_label in pred_matched:
+                continue
+            # intersection of the segment with VOID
+            intersection = gt_pred_map.get((VOID, pred_label), 0)
+            # plus intersection with corresponding CROWD region if it exists
+            if pred_info['category_id'] in crowd_labels_dict:
+                intersection += gt_pred_map.get(
+                    (crowd_labels_dict[pred_info['category_id']], pred_label),
+                    0)
+            # predicted segment is ignored if more than half of
+            # the segment correspond to VOID and CROWD regions
+            if intersection / pred_info['area'] > 0.5:
+                continue
+            pq_stat[pred_info['category_id']].fp += 1
+
+    if print_log:
+        print('Core: {}, all {} images processed'.format(
+            proc_id, len(annotation_set)))
+    return pq_stat
+
+
+def pq_compute_multi_core(matched_annotations_list,
+                          gt_folder,
+                          pred_folder,
+                          categories,
+                          backend_args=None,
+                          nproc=32):
+    """Evaluate the metrics of Panoptic Segmentation with multithreading.
+
+    Same as the function with the same name in `panopticapi`.
+
+    Args:
+        matched_annotations_list (list): The matched annotation list. Each
+            element is a tuple of annotations of the same image with the
+            format (gt_anns, pred_anns).
+        gt_folder (str): The path of the ground truth images.
+        pred_folder (str): The path of the prediction images.
+        categories (str): The categories of the dataset.
+        backend_args (object): The file client of the dataset. If None,
+            the backend will be set to `local`.
+        nproc (int): Number of processes for panoptic quality computing.
+            Defaults to 32. When `nproc` exceeds the number of cpu cores,
+            the number of cpu cores is used.
+    """
+    if PQStat is None:
+        raise RuntimeError(
+            'panopticapi is not installed, please install it by: '
+            'pip install git+https://github.com/cocodataset/'
+            'panopticapi.git.')
+
+    cpu_num = min(nproc, multiprocessing.cpu_count())
+
+    annotations_split = np.array_split(matched_annotations_list, cpu_num)
+    print('Number of cores: {}, images per core: {}'.format(
+        cpu_num, len(annotations_split[0])))
+    workers = multiprocessing.Pool(processes=cpu_num)
+    processes = []
+    for proc_id, annotation_set in enumerate(annotations_split):
+        p = workers.apply_async(pq_compute_single_core,
+                                (proc_id, annotation_set, gt_folder,
+                                 pred_folder, categories, backend_args))
+        processes.append(p)
+
+    # Close the process pool, otherwise it will lead to memory
+    # leaking problems.
+    workers.close()
+    workers.join()
+
+    pq_stat = PQStat()
+    for p in processes:
+        pq_stat += p.get()
+
+    return pq_stat
diff --git a/mmde/mmdet/evaluation/functional/recall.py b/mmde/mmdet/evaluation/functional/recall.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bce2bf3614ab454dbbdf48efc4650018cc71b13
--- /dev/null
+++ b/mmde/mmdet/evaluation/functional/recall.py
@@ -0,0 +1,199 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections.abc import Sequence
+
+import numpy as np
+from mmengine.logging import print_log
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+
+
+def _recalls(all_ious, proposal_nums, thrs):
+
+    img_num = all_ious.shape[0]
+    total_gt_num = sum([ious.shape[0] for ious in all_ious])
+
+    _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)
+    for k, proposal_num in enumerate(proposal_nums):
+        tmp_ious = np.zeros(0)
+        for i in range(img_num):
+            ious = all_ious[i][:, :proposal_num].copy()
+            gt_ious = np.zeros((ious.shape[0]))
+            if ious.size == 0:
+                tmp_ious = np.hstack((tmp_ious, gt_ious))
+                continue
+            for j in range(ious.shape[0]):
+                gt_max_overlaps = ious.argmax(axis=1)
+                max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]
+                gt_idx = max_ious.argmax()
+                gt_ious[j] = max_ious[gt_idx]
+                box_idx = gt_max_overlaps[gt_idx]
+                ious[gt_idx, :] = -1
+                ious[:, box_idx] = -1
+            tmp_ious = np.hstack((tmp_ious, gt_ious))
+        _ious[k, :] = tmp_ious
+
+    _ious = np.fliplr(np.sort(_ious, axis=1))
+    recalls = np.zeros((proposal_nums.size, thrs.size))
+    for i, thr in enumerate(thrs):
+        recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num)
+
+    return recalls
+
+
+def set_recall_param(proposal_nums, iou_thrs):
+    """Check proposal_nums and iou_thrs and set correct format."""
+    if isinstance(proposal_nums, Sequence):
+        _proposal_nums = np.array(proposal_nums)
+    elif isinstance(proposal_nums, int):
+        _proposal_nums = np.array([proposal_nums])
+    else:
+        _proposal_nums = proposal_nums
+
+    if iou_thrs is None:
+        _iou_thrs = np.array([0.5])
+    elif isinstance(iou_thrs, Sequence):
+        _iou_thrs = np.array(iou_thrs)
+    elif isinstance(iou_thrs, float):
+        _iou_thrs = np.array([iou_thrs])
+    else:
+        _iou_thrs = iou_thrs
+
+    return _proposal_nums, _iou_thrs
+
+
+def eval_recalls(gts,
+                 proposals,
+                 proposal_nums=None,
+                 iou_thrs=0.5,
+                 logger=None,
+                 use_legacy_coordinate=False):
+    """Calculate recalls.
+
+    Args:
+        gts (list[ndarray]): a list of arrays of shape (n, 4)
+        proposals (list[ndarray]): a list of arrays of shape (k, 4) or (k, 5)
+        proposal_nums (int | Sequence[int]): Top N proposals to be evaluated.
+        iou_thrs (float | Sequence[float]): IoU thresholds. Default: 0.5.
+        logger (logging.Logger | str | None): The way to print the recall
+            summary. See `mmengine.logging.print_log()` for details.
+            Default: None.
+        use_legacy_coordinate (bool): Whether use coordinate system
+            in mmdet v1.x. "1" was added to both height and width
+            which means w, h should be
+            computed as 'x2 - x1 + 1` and 'y2 - y1 + 1'. Default: False.
+
+
+    Returns:
+        ndarray: recalls of different ious and proposal nums
+    """
+
+    img_num = len(gts)
+    assert img_num == len(proposals)
+    proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)
+    all_ious = []
+    for i in range(img_num):
+        if proposals[i].ndim == 2 and proposals[i].shape[1] == 5:
+            scores = proposals[i][:, 4]
+            sort_idx = np.argsort(scores)[::-1]
+            img_proposal = proposals[i][sort_idx, :]
+        else:
+            img_proposal = proposals[i]
+        prop_num = min(img_proposal.shape[0], proposal_nums[-1])
+        if gts[i] is None or gts[i].shape[0] == 0:
+            ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)
+        else:
+            ious = bbox_overlaps(
+                gts[i],
+                img_proposal[:prop_num, :4],
+                use_legacy_coordinate=use_legacy_coordinate)
+        all_ious.append(ious)
+    all_ious = np.array(all_ious)
+    recalls = _recalls(all_ious, proposal_nums, iou_thrs)
+
+    print_recall_summary(recalls, proposal_nums, iou_thrs, logger=logger)
+    return recalls
+
+
+def print_recall_summary(recalls,
+                         proposal_nums,
+                         iou_thrs,
+                         row_idxs=None,
+                         col_idxs=None,
+                         logger=None):
+    """Print recalls in a table.
+
+    Args:
+        recalls (ndarray): calculated from `bbox_recalls`
+        proposal_nums (ndarray or list): top N proposals
+        iou_thrs (ndarray or list): iou thresholds
+        row_idxs (ndarray): which rows(proposal nums) to print
+        col_idxs (ndarray): which cols(iou thresholds) to print
+        logger (logging.Logger | str | None): The way to print the recall
+            summary. See `mmengine.logging.print_log()` for details.
+            Default: None.
+    """
+    proposal_nums = np.array(proposal_nums, dtype=np.int32)
+    iou_thrs = np.array(iou_thrs)
+    if row_idxs is None:
+        row_idxs = np.arange(proposal_nums.size)
+    if col_idxs is None:
+        col_idxs = np.arange(iou_thrs.size)
+    row_header = [''] + iou_thrs[col_idxs].tolist()
+    table_data = [row_header]
+    for i, num in enumerate(proposal_nums[row_idxs]):
+        row = [f'{val:.3f}' for val in recalls[row_idxs[i], col_idxs].tolist()]
+        row.insert(0, num)
+        table_data.append(row)
+    table = AsciiTable(table_data)
+    print_log('\n' + table.table, logger=logger)
+
+
+def plot_num_recall(recalls, proposal_nums):
+    """Plot Proposal_num-Recalls curve.
+
+    Args:
+        recalls(ndarray or list): shape (k,)
+        proposal_nums(ndarray or list): same shape as `recalls`
+    """
+    if isinstance(proposal_nums, np.ndarray):
+        _proposal_nums = proposal_nums.tolist()
+    else:
+        _proposal_nums = proposal_nums
+    if isinstance(recalls, np.ndarray):
+        _recalls = recalls.tolist()
+    else:
+        _recalls = recalls
+
+    import matplotlib.pyplot as plt
+    f = plt.figure()
+    plt.plot([0] + _proposal_nums, [0] + _recalls)
+    plt.xlabel('Proposal num')
+    plt.ylabel('Recall')
+    plt.axis([0, proposal_nums.max(), 0, 1])
+    f.show()
+
+
+def plot_iou_recall(recalls, iou_thrs):
+    """Plot IoU-Recalls curve.
+
+    Args:
+        recalls(ndarray or list): shape (k,)
+        iou_thrs(ndarray or list): same shape as `recalls`
+    """
+    if isinstance(iou_thrs, np.ndarray):
+        _iou_thrs = iou_thrs.tolist()
+    else:
+        _iou_thrs = iou_thrs
+    if isinstance(recalls, np.ndarray):
+        _recalls = recalls.tolist()
+    else:
+        _recalls = recalls
+
+    import matplotlib.pyplot as plt
+    f = plt.figure()
+    plt.plot(_iou_thrs + [1.0], _recalls + [0.])
+    plt.xlabel('IoU')
+    plt.ylabel('Recall')
+    plt.axis([iou_thrs.min(), 1, 0, 1])
+    f.show()
diff --git a/mmde/mmdet/evaluation/functional/ytvis.py b/mmde/mmdet/evaluation/functional/ytvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..c65a7e9bc956c7de42e0d6e511dabb3d7325782d
--- /dev/null
+++ b/mmde/mmdet/evaluation/functional/ytvis.py
@@ -0,0 +1,305 @@
+# Copyright (c) Github URL
+# Copied from
+# https://github.com/youtubevos/cocoapi/blob/master/PythonAPI/pycocotools/ytvos.py
+__author__ = 'ychfan'
+# Interface for accessing the YouTubeVIS dataset.
+
+# The following API functions are defined:
+#  YTVIS       - YTVIS api class that loads YouTubeVIS annotation file
+#  and prepare data structures.
+#  decodeMask - Decode binary mask M encoded via run-length encoding.
+#  encodeMask - Encode binary mask M using run-length encoding.
+#  getAnnIds  - Get ann ids that satisfy given filter conditions.
+#  getCatIds  - Get cat ids that satisfy given filter conditions.
+#  getImgIds  - Get img ids that satisfy given filter conditions.
+#  loadAnns   - Load anns with the specified ids.
+#  loadCats   - Load cats with the specified ids.
+#  loadImgs   - Load imgs with the specified ids.
+#  annToMask  - Convert segmentation in an annotation to binary mask.
+#  loadRes    - Load algorithm results and create API for accessing them.
+
+# Microsoft COCO Toolbox.      version 2.0
+# Data, paper, and tutorials available at:  http://mscoco.org/
+# Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
+# Licensed under the Simplified BSD License [see bsd.txt]
+
+import copy
+import itertools
+import json
+import sys
+import time
+from collections import defaultdict
+
+import numpy as np
+from pycocotools import mask as maskUtils
+
+PYTHON_VERSION = sys.version_info[0]
+
+
+def _isArrayLike(obj):
+    return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
+
+
+class YTVIS:
+
+    def __init__(self, annotation_file=None):
+        """Constructor of Microsoft COCO helper class for reading and
+        visualizing annotations.
+
+        :param annotation_file (str | dict): location of annotation file or
+            dict results.
+        :param image_folder (str): location to the folder that hosts images.
+        :return:
+        """
+        # load dataset
+        self.dataset, self.anns, self.cats, self.vids = dict(), dict(), dict(
+        ), dict()
+        self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list)
+        if annotation_file is not None:
+            print('loading annotations into memory...')
+            tic = time.time()
+            if type(annotation_file) == str:
+                dataset = json.load(open(annotation_file, 'r'))
+            else:
+                dataset = annotation_file
+            assert type(
+                dataset
+            ) == dict, 'annotation file format {} not supported'.format(
+                type(dataset))
+            print('Done (t={:0.2f}s)'.format(time.time() - tic))
+            self.dataset = dataset
+            self.createIndex()
+
+    def createIndex(self):
+        # create index
+        print('creating index...')
+        anns, cats, vids = {}, {}, {}
+        vidToAnns, catToVids = defaultdict(list), defaultdict(list)
+        if 'annotations' in self.dataset:
+            for ann in self.dataset['annotations']:
+                vidToAnns[ann['video_id']].append(ann)
+                anns[ann['id']] = ann
+
+        if 'videos' in self.dataset:
+            for vid in self.dataset['videos']:
+                vids[vid['id']] = vid
+
+        if 'categories' in self.dataset:
+            for cat in self.dataset['categories']:
+                cats[cat['id']] = cat
+
+        if 'annotations' in self.dataset and 'categories' in self.dataset:
+            for ann in self.dataset['annotations']:
+                catToVids[ann['category_id']].append(ann['video_id'])
+
+        print('index created!')
+
+        # create class members
+        self.anns = anns
+        self.vidToAnns = vidToAnns
+        self.catToVids = catToVids
+        self.vids = vids
+        self.cats = cats
+
+    def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None):
+        """Get ann ids that satisfy given filter conditions. default skips that
+        filter.
+
+        :param vidIds  (int array)     : get anns for given vids
+               catIds  (int array)     : get anns for given cats
+               areaRng (float array)   : get anns for given area range
+               iscrowd (boolean)       : get anns for given crowd label
+        :return: ids (int array)       : integer array of ann ids
+        """
+        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
+        catIds = catIds if _isArrayLike(catIds) else [catIds]
+
+        if len(vidIds) == len(catIds) == len(areaRng) == 0:
+            anns = self.dataset['annotations']
+        else:
+            if not len(vidIds) == 0:
+                lists = [
+                    self.vidToAnns[vidId] for vidId in vidIds
+                    if vidId in self.vidToAnns
+                ]
+                anns = list(itertools.chain.from_iterable(lists))
+            else:
+                anns = self.dataset['annotations']
+            anns = anns if len(catIds) == 0 else [
+                ann for ann in anns if ann['category_id'] in catIds
+            ]
+            anns = anns if len(areaRng) == 0 else [
+                ann for ann in anns if ann['avg_area'] > areaRng[0]
+                and ann['avg_area'] < areaRng[1]
+            ]
+        if iscrowd is not None:
+            ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
+        else:
+            ids = [ann['id'] for ann in anns]
+        return ids
+
+    def getCatIds(self, catNms=[], supNms=[], catIds=[]):
+        """filtering parameters. default skips that filter.
+
+        :param catNms (str array)  : get cats for given cat names
+        :param supNms (str array)  : get cats for given supercategory names
+        :param catIds (int array)  : get cats for given cat ids
+        :return: ids (int array)   : integer array of cat ids
+        """
+        catNms = catNms if _isArrayLike(catNms) else [catNms]
+        supNms = supNms if _isArrayLike(supNms) else [supNms]
+        catIds = catIds if _isArrayLike(catIds) else [catIds]
+
+        if len(catNms) == len(supNms) == len(catIds) == 0:
+            cats = self.dataset['categories']
+        else:
+            cats = self.dataset['categories']
+            cats = cats if len(catNms) == 0 else [
+                cat for cat in cats if cat['name'] in catNms
+            ]
+            cats = cats if len(supNms) == 0 else [
+                cat for cat in cats if cat['supercategory'] in supNms
+            ]
+            cats = cats if len(catIds) == 0 else [
+                cat for cat in cats if cat['id'] in catIds
+            ]
+        ids = [cat['id'] for cat in cats]
+        return ids
+
+    def getVidIds(self, vidIds=[], catIds=[]):
+        """Get vid ids that satisfy given filter conditions.
+
+        :param vidIds (int array) : get vids for given ids
+        :param catIds (int array) : get vids with all given cats
+        :return: ids (int array)  : integer array of vid ids
+        """
+        vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
+        catIds = catIds if _isArrayLike(catIds) else [catIds]
+
+        if len(vidIds) == len(catIds) == 0:
+            ids = self.vids.keys()
+        else:
+            ids = set(vidIds)
+            for i, catId in enumerate(catIds):
+                if i == 0 and len(ids) == 0:
+                    ids = set(self.catToVids[catId])
+                else:
+                    ids &= set(self.catToVids[catId])
+        return list(ids)
+
+    def loadAnns(self, ids=[]):
+        """Load anns with the specified ids.
+
+        :param ids (int array)       : integer ids specifying anns
+        :return: anns (object array) : loaded ann objects
+        """
+        if _isArrayLike(ids):
+            return [self.anns[id] for id in ids]
+        elif type(ids) == int:
+            return [self.anns[ids]]
+
+    def loadCats(self, ids=[]):
+        """Load cats with the specified ids.
+
+        :param ids (int array)       : integer ids specifying cats
+        :return: cats (object array) : loaded cat objects
+        """
+        if _isArrayLike(ids):
+            return [self.cats[id] for id in ids]
+        elif type(ids) == int:
+            return [self.cats[ids]]
+
+    def loadVids(self, ids=[]):
+        """Load anns with the specified ids.
+
+        :param ids (int array)       : integer ids specifying vid
+        :return: vids (object array) : loaded vid objects
+        """
+        if _isArrayLike(ids):
+            return [self.vids[id] for id in ids]
+        elif type(ids) == int:
+            return [self.vids[ids]]
+
+    def loadRes(self, resFile):
+        """Load result file and return a result api object.
+
+        :param   resFile (str)     : file name of result file
+        :return: res (obj)         : result api object
+        """
+        res = YTVIS()
+        res.dataset['videos'] = [img for img in self.dataset['videos']]
+
+        print('Loading and preparing results...')
+        tic = time.time()
+        if type(resFile) == str or (PYTHON_VERSION == 2
+                                    and type(resFile) == str):
+            anns = json.load(open(resFile))
+        elif type(resFile) == np.ndarray:
+            anns = self.loadNumpyAnnotations(resFile)
+        else:
+            anns = resFile
+        assert type(anns) == list, 'results in not an array of objects'
+        annsVidIds = [ann['video_id'] for ann in anns]
+        assert set(annsVidIds) == (set(annsVidIds) & set(self.getVidIds())), \
+               'Results do not correspond to current coco set'
+        if 'segmentations' in anns[0]:
+            res.dataset['categories'] = copy.deepcopy(
+                self.dataset['categories'])
+            for id, ann in enumerate(anns):
+                ann['areas'] = []
+                if 'bboxes' not in ann:
+                    ann['bboxes'] = []
+                for seg in ann['segmentations']:
+                    # now only support compressed RLE format
+                    # as segmentation results
+                    if seg:
+                        ann['areas'].append(maskUtils.area(seg))
+                        if len(ann['bboxes']) < len(ann['areas']):
+                            ann['bboxes'].append(maskUtils.toBbox(seg))
+                    else:
+                        ann['areas'].append(None)
+                        if len(ann['bboxes']) < len(ann['areas']):
+                            ann['bboxes'].append(None)
+                ann['id'] = id + 1
+                l_ori = [a for a in ann['areas'] if a]
+                if len(l_ori) == 0:
+                    ann['avg_area'] = 0
+                else:
+                    ann['avg_area'] = np.array(l_ori).mean()
+                ann['iscrowd'] = 0
+        print('DONE (t={:0.2f}s)'.format(time.time() - tic))
+
+        res.dataset['annotations'] = anns
+        res.createIndex()
+        return res
+
+    def annToRLE(self, ann, frameId):
+        """Convert annotation which can be polygons, uncompressed RLE to RLE.
+
+        :return: binary mask (numpy 2D array)
+        """
+        t = self.vids[ann['video_id']]
+        h, w = t['height'], t['width']
+        segm = ann['segmentations'][frameId]
+        if type(segm) == list:
+            # polygon -- a single object might consist of multiple parts
+            # we merge all parts into one mask rle code
+            rles = maskUtils.frPyObjects(segm, h, w)
+            rle = maskUtils.merge(rles)
+        elif type(segm['counts']) == list:
+            # uncompressed RLE
+            rle = maskUtils.frPyObjects(segm, h, w)
+        else:
+            # rle
+            rle = segm
+        return rle
+
+    def annToMask(self, ann, frameId):
+        """Convert annotation which can be polygons, uncompressed RLE, or RLE
+        to binary mask.
+
+        :return: binary mask (numpy 2D array)
+        """
+        rle = self.annToRLE(ann, frameId)
+        m = maskUtils.decode(rle)
+        return m
diff --git a/mmde/mmdet/evaluation/functional/ytviseval.py b/mmde/mmdet/evaluation/functional/ytviseval.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdaf110d37c61b4e02873a4dc83e1722a70a29f1
--- /dev/null
+++ b/mmde/mmdet/evaluation/functional/ytviseval.py
@@ -0,0 +1,623 @@
+# Copyright (c) Github URL
+# Copied from
+# https://github.com/youtubevos/cocoapi/blob/master/PythonAPI/pycocotools/ytvoseval.py
+__author__ = 'ychfan'
+
+import copy
+import datetime
+import time
+from collections import defaultdict
+
+import numpy as np
+from pycocotools import mask as maskUtils
+
+
+class YTVISeval:
+    # Interface for evaluating video instance segmentation on
+    # the YouTubeVIS dataset.
+    #
+    # The usage for YTVISeval is as follows:
+    #  cocoGt=..., cocoDt=...       # load dataset and results
+    #  E = YTVISeval(cocoGt,cocoDt); # initialize YTVISeval object
+    #  E.params.recThrs = ...;      # set parameters as desired
+    #  E.evaluate();                # run per image evaluation
+    #  E.accumulate();              # accumulate per image results
+    #  E.summarize();               # display summary metrics of results
+    # For example usage see evalDemo.m and http://mscoco.org/.
+    #
+    # The evaluation parameters are as follows (defaults in brackets):
+    #  imgIds     - [all] N img ids to use for evaluation
+    #  catIds     - [all] K cat ids to use for evaluation
+    #  iouThrs    - [.5:.05:.95] T=10 IoU thresholds for evaluation
+    #  recThrs    - [0:.01:1] R=101 recall thresholds for evaluation
+    #  areaRng    - [...] A=4 object area ranges for evaluation
+    #  maxDets    - [1 10 100] M=3 thresholds on max detections per image
+    #  iouType    - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints'
+    #  iouType replaced the now DEPRECATED useSegm parameter.
+    #  useCats    - [1] if true use category labels for evaluation
+    # Note: if useCats=0 category labels are ignored as in proposal scoring.
+    # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
+    #
+    # evaluate(): evaluates detections on every image and every category and
+    # concats the results into the "evalImgs" with fields:
+    #  dtIds      - [1xD] id for each of the D detections (dt)
+    #  gtIds      - [1xG] id for each of the G ground truths (gt)
+    #  dtMatches  - [TxD] matching gt id at each IoU or 0
+    #  gtMatches  - [TxG] matching dt id at each IoU or 0
+    #  dtScores   - [1xD] confidence of each dt
+    #  gtIgnore   - [1xG] ignore flag for each gt
+    #  dtIgnore   - [TxD] ignore flag for each dt at each IoU
+    #
+    # accumulate(): accumulates the per-image, per-category evaluation
+    # results in "evalImgs" into the dictionary "eval" with fields:
+    #  params     - parameters used for evaluation
+    #  date       - date evaluation was performed
+    #  counts     - [T,R,K,A,M] parameter dimensions (see above)
+    #  precision  - [TxRxKxAxM] precision for every evaluation setting
+    #  recall     - [TxKxAxM] max recall for every evaluation setting
+    # Note: precision and recall==-1 for settings with no gt objects.
+    #
+    # See also coco, mask, pycocoDemo, pycocoEvalDemo
+    #
+    # Microsoft COCO Toolbox.      version 2.0
+    # Data, paper, and tutorials available at:  http://mscoco.org/
+    # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
+    # Licensed under the Simplified BSD License [see coco/license.txt]
+    def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
+        """Initialize CocoEval using coco APIs for gt and dt.
+
+        :param cocoGt: coco object with ground truth annotations
+        :param cocoDt: coco object with detection results
+        :return: None
+        """
+        if not iouType:
+            print('iouType not specified. use default iouType segm')
+        self.cocoGt = cocoGt  # ground truth COCO API
+        self.cocoDt = cocoDt  # detections COCO API
+        self.params = {}  # evaluation parameters
+        self.evalVids = defaultdict(
+            list)  # per-image per-category evaluation results [KxAxI] elements
+        self.eval = {}  # accumulated evaluation results
+        self._gts = defaultdict(list)  # gt for evaluation
+        self._dts = defaultdict(list)  # dt for evaluation
+        self.params = Params(iouType=iouType)  # parameters
+        self._paramsEval = {}  # parameters for evaluation
+        self.stats = []  # result summarization
+        self.ious = {}  # ious between all gts and dts
+        if cocoGt is not None:
+            self.params.vidIds = sorted(cocoGt.getVidIds())
+            self.params.catIds = sorted(cocoGt.getCatIds())
+
+    def _prepare(self):
+        '''
+        Prepare ._gts and ._dts for evaluation based on params
+        :return: None
+        '''
+
+        def _toMask(anns, coco):
+            # modify ann['segmentation'] by reference
+            for ann in anns:
+                for i, a in enumerate(ann['segmentations']):
+                    if a:
+                        rle = coco.annToRLE(ann, i)
+                        ann['segmentations'][i] = rle
+                l_ori = [a for a in ann['areas'] if a]
+                if len(l_ori) == 0:
+                    ann['avg_area'] = 0
+                else:
+                    ann['avg_area'] = np.array(l_ori).mean()
+
+        p = self.params
+        if p.useCats:
+            gts = self.cocoGt.loadAnns(
+                self.cocoGt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds))
+            dts = self.cocoDt.loadAnns(
+                self.cocoDt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds))
+        else:
+            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds))
+            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds))
+
+        # convert ground truth to mask if iouType == 'segm'
+        if p.iouType == 'segm':
+            _toMask(gts, self.cocoGt)
+            _toMask(dts, self.cocoDt)
+        # set ignore flag
+        for gt in gts:
+            gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
+            gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
+            if p.iouType == 'keypoints':
+                gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
+        self._gts = defaultdict(list)  # gt for evaluation
+        self._dts = defaultdict(list)  # dt for evaluation
+        for gt in gts:
+            self._gts[gt['video_id'], gt['category_id']].append(gt)
+        for dt in dts:
+            self._dts[dt['video_id'], dt['category_id']].append(dt)
+        self.evalVids = defaultdict(
+            list)  # per-image per-category evaluation results
+        self.eval = {}  # accumulated evaluation results
+
+    def evaluate(self):
+        '''
+        Run per image evaluation on given images and store
+        results (a list of dict) in self.evalVids
+        :return: None
+        '''
+        tic = time.time()
+        print('Running per image evaluation...')
+        p = self.params
+        # add backward compatibility if useSegm is specified in params
+        if p.useSegm is not None:
+            p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
+            print('useSegm (deprecated) is not None. Running {} evaluation'.
+                  format(p.iouType))
+        print('Evaluate annotation type *{}*'.format(p.iouType))
+        p.vidIds = list(np.unique(p.vidIds))
+        if p.useCats:
+            p.catIds = list(np.unique(p.catIds))
+        p.maxDets = sorted(p.maxDets)
+        self.params = p
+
+        self._prepare()
+        # loop through images, area range, max detection number
+        catIds = p.catIds if p.useCats else [-1]
+
+        if p.iouType == 'segm' or p.iouType == 'bbox':
+            computeIoU = self.computeIoU
+        elif p.iouType == 'keypoints':
+            computeIoU = self.computeOks
+        self.ious = {(vidId, catId): computeIoU(vidId, catId)
+                     for vidId in p.vidIds for catId in catIds}
+
+        evaluateVid = self.evaluateVid
+        maxDet = p.maxDets[-1]
+
+        self.evalImgs = [
+            evaluateVid(vidId, catId, areaRng, maxDet) for catId in catIds
+            for areaRng in p.areaRng for vidId in p.vidIds
+        ]
+        self._paramsEval = copy.deepcopy(self.params)
+        toc = time.time()
+        print('DONE (t={:0.2f}s).'.format(toc - tic))
+
+    def computeIoU(self, vidId, catId):
+        p = self.params
+        if p.useCats:
+            gt = self._gts[vidId, catId]
+            dt = self._dts[vidId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[vidId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[vidId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return []
+        inds = np.argsort([-d['score'] for d in dt], kind='mergesort')
+        dt = [dt[i] for i in inds]
+        if len(dt) > p.maxDets[-1]:
+            dt = dt[0:p.maxDets[-1]]
+
+        if p.iouType == 'segm':
+            g = [g['segmentations'] for g in gt]
+            d = [d['segmentations'] for d in dt]
+        elif p.iouType == 'bbox':
+            g = [g['bboxes'] for g in gt]
+            d = [d['bboxes'] for d in dt]
+        else:
+            raise Exception('unknown iouType for iou computation')
+
+        # compute iou between each dt and gt region
+
+        def iou_seq(d_seq, g_seq):
+            i = .0
+            u = .0
+            for d, g in zip(d_seq, g_seq):
+                if d and g:
+                    i += maskUtils.area(maskUtils.merge([d, g], True))
+                    u += maskUtils.area(maskUtils.merge([d, g], False))
+                elif not d and g:
+                    u += maskUtils.area(g)
+                elif d and not g:
+                    u += maskUtils.area(d)
+            if not u > .0:
+                print('Mask sizes in video {} and category {} may not match!'.
+                      format(vidId, catId))
+            iou = i / u if u > .0 else .0
+            return iou
+
+        ious = np.zeros([len(d), len(g)])
+        for i, j in np.ndindex(ious.shape):
+            ious[i, j] = iou_seq(d[i], g[j])
+
+        return ious
+
+    def computeOks(self, imgId, catId):
+        p = self.params
+
+        gts = self._gts[imgId, catId]
+        dts = self._dts[imgId, catId]
+        inds = np.argsort([-d['score'] for d in dts], kind='mergesort')
+        dts = [dts[i] for i in inds]
+        if len(dts) > p.maxDets[-1]:
+            dts = dts[0:p.maxDets[-1]]
+        # if len(gts) == 0 and len(dts) == 0:
+        if len(gts) == 0 or len(dts) == 0:
+            return []
+        ious = np.zeros((len(dts), len(gts)))
+        sigmas = np.array([
+            .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07,
+            .87, .87, .89, .89
+        ]) / 10.0
+        vars = (sigmas * 2)**2
+        k = len(sigmas)
+        # compute oks between each detection and ground truth object
+        for j, gt in enumerate(gts):
+            # create bounds for ignore regions(double the gt bbox)
+            g = np.array(gt['keypoints'])
+            xg = g[0::3]
+            yg = g[1::3]
+            vg = g[2::3]
+            k1 = np.count_nonzero(vg > 0)
+            bb = gt['bbox']
+            x0 = bb[0] - bb[2]
+            x1 = bb[0] + bb[2] * 2
+            y0 = bb[1] - bb[3]
+            y1 = bb[1] + bb[3] * 2
+            for i, dt in enumerate(dts):
+                d = np.array(dt['keypoints'])
+                xd = d[0::3]
+                yd = d[1::3]
+                if k1 > 0:
+                    # measure the per-keypoint distance if keypoints visible
+                    dx = xd - xg
+                    dy = yd - yg
+                else:
+                    # measure minimum distance to keypoints
+                    z = np.zeros((k))
+                    dx = np.max((z, x0 - xd), axis=0) + np.max(
+                        (z, xd - x1), axis=0)
+                    dy = np.max((z, y0 - yd), axis=0) + np.max(
+                        (z, yd - y1), axis=0)
+                e = (dx**2 + dy**2) / vars / (gt['avg_area'] +
+                                              np.spacing(1)) / 2
+                if k1 > 0:
+                    e = e[vg > 0]
+                ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
+        return ious
+
+    def evaluateVid(self, vidId, catId, aRng, maxDet):
+        '''
+        perform evaluation for single category and image
+        :return: dict (single image results)
+        '''
+        p = self.params
+        if p.useCats:
+            gt = self._gts[vidId, catId]
+            dt = self._dts[vidId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[vidId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[vidId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return None
+
+        for g in gt:
+            if g['ignore'] or (g['avg_area'] < aRng[0]
+                               or g['avg_area'] > aRng[1]):
+                g['_ignore'] = 1
+            else:
+                g['_ignore'] = 0
+
+        # sort dt highest score first, sort gt ignore last
+        gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
+        gt = [gt[i] for i in gtind]
+        dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
+        dt = [dt[i] for i in dtind[0:maxDet]]
+        iscrowd = [int(o['iscrowd']) for o in gt]
+        # load computed ious
+        ious = self.ious[vidId, catId][:, gtind] if len(
+            self.ious[vidId, catId]) > 0 else self.ious[vidId, catId]
+
+        T = len(p.iouThrs)
+        G = len(gt)
+        D = len(dt)
+        gtm = np.zeros((T, G))
+        dtm = np.zeros((T, D))
+        gtIg = np.array([g['_ignore'] for g in gt])
+        dtIg = np.zeros((T, D))
+        if not len(ious) == 0:
+            for tind, t in enumerate(p.iouThrs):
+                for dind, d in enumerate(dt):
+                    # information about best match so far (m=-1 -> unmatched)
+                    iou = min([t, 1 - 1e-10])
+                    m = -1
+                    for gind, g in enumerate(gt):
+                        # if this gt already matched, and not a crowd, continue
+                        if gtm[tind, gind] > 0 and not iscrowd[gind]:
+                            continue
+                        # if dt matched to reg gt, and on ignore gt, stop
+                        if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
+                            break
+                        # continue to next gt unless better match made
+                        if ious[dind, gind] < iou:
+                            continue
+                        # if match successful and best so far,
+                        # store appropriately
+                        iou = ious[dind, gind]
+                        m = gind
+                    # if match made store id of match for both dt and gt
+                    if m == -1:
+                        continue
+                    dtIg[tind, dind] = gtIg[m]
+                    dtm[tind, dind] = gt[m]['id']
+                    gtm[tind, m] = d['id']
+        # set unmatched detections outside of area range to ignore
+        a = np.array([
+            d['avg_area'] < aRng[0] or d['avg_area'] > aRng[1] for d in dt
+        ]).reshape((1, len(dt)))
+        dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T,
+                                                                      0)))
+        # store results for given image and category
+        return {
+            'video_id': vidId,
+            'category_id': catId,
+            'aRng': aRng,
+            'maxDet': maxDet,
+            'dtIds': [d['id'] for d in dt],
+            'gtIds': [g['id'] for g in gt],
+            'dtMatches': dtm,
+            'gtMatches': gtm,
+            'dtScores': [d['score'] for d in dt],
+            'gtIgnore': gtIg,
+            'dtIgnore': dtIg,
+        }
+
+    def accumulate(self, p=None):
+        """Accumulate per image evaluation results and store the result in
+        self.eval.
+
+        :param p: input params for evaluation
+        :return: None
+        """
+        print('Accumulating evaluation results...')
+        tic = time.time()
+        if not self.evalImgs:
+            print('Please run evaluate() first')
+        # allows input customized parameters
+        if p is None:
+            p = self.params
+        p.catIds = p.catIds if p.useCats == 1 else [-1]
+        T = len(p.iouThrs)
+        R = len(p.recThrs)
+        K = len(p.catIds) if p.useCats else 1
+        A = len(p.areaRng)
+        M = len(p.maxDets)
+        precision = -np.ones(
+            (T, R, K, A, M))  # -1 for the precision of absent categories
+        recall = -np.ones((T, K, A, M))
+        scores = -np.ones((T, R, K, A, M))
+
+        # create dictionary for future indexing
+        _pe = self._paramsEval
+        catIds = _pe.catIds if _pe.useCats else [-1]
+        setK = set(catIds)
+        setA = set(map(tuple, _pe.areaRng))
+        setM = set(_pe.maxDets)
+        setI = set(_pe.vidIds)
+        # get inds to evaluate
+        k_list = [n for n, k in enumerate(p.catIds) if k in setK]
+        m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
+        a_list = [
+            n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng))
+            if a in setA
+        ]
+        i_list = [n for n, i in enumerate(p.vidIds) if i in setI]
+        I0 = len(_pe.vidIds)
+        A0 = len(_pe.areaRng)
+        # retrieve E at each category, area range, and max number of detections
+        for k, k0 in enumerate(k_list):
+            Nk = k0 * A0 * I0
+            for a, a0 in enumerate(a_list):
+                Na = a0 * I0
+                for m, maxDet in enumerate(m_list):
+                    E = [self.evalImgs[Nk + Na + i] for i in i_list]
+                    E = [e for e in E if e is not None]
+                    if len(E) == 0:
+                        continue
+                    dtScores = np.concatenate(
+                        [e['dtScores'][0:maxDet] for e in E])
+
+                    inds = np.argsort(-dtScores, kind='mergesort')
+                    dtScoresSorted = dtScores[inds]
+
+                    dtm = np.concatenate(
+                        [e['dtMatches'][:, 0:maxDet] for e in E], axis=1)[:,
+                                                                          inds]
+                    dtIg = np.concatenate(
+                        [e['dtIgnore'][:, 0:maxDet] for e in E], axis=1)[:,
+                                                                         inds]
+                    gtIg = np.concatenate([e['gtIgnore'] for e in E])
+                    npig = np.count_nonzero(gtIg == 0)
+                    if npig == 0:
+                        continue
+                    tps = np.logical_and(dtm, np.logical_not(dtIg))
+                    fps = np.logical_and(
+                        np.logical_not(dtm), np.logical_not(dtIg))
+
+                    tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float)
+                    fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float)
+                    for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
+                        tp = np.array(tp)
+                        fp = np.array(fp)
+                        nd_ori = len(tp)
+                        rc = tp / npig
+                        pr = tp / (fp + tp + np.spacing(1))
+                        q = np.zeros((R, ))
+                        ss = np.zeros((R, ))
+
+                        if nd_ori:
+                            recall[t, k, a, m] = rc[-1]
+                        else:
+                            recall[t, k, a, m] = 0
+
+                        # use python array gets significant speed improvement
+                        pr = pr.tolist()
+                        q = q.tolist()
+
+                        for i in range(nd_ori - 1, 0, -1):
+                            if pr[i] > pr[i - 1]:
+                                pr[i - 1] = pr[i]
+
+                        inds = np.searchsorted(rc, p.recThrs, side='left')
+                        try:
+                            for ri, pi in enumerate(inds):
+                                q[ri] = pr[pi]
+                                ss[ri] = dtScoresSorted[pi]
+                        except Exception:
+                            pass
+                        precision[t, :, k, a, m] = np.array(q)
+                        scores[t, :, k, a, m] = np.array(ss)
+        self.eval = {
+            'params': p,
+            'counts': [T, R, K, A, M],
+            'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            'precision': precision,
+            'recall': recall,
+            'scores': scores,
+        }
+        toc = time.time()
+        print('DONE (t={:0.2f}s).'.format(toc - tic))
+
+    def summarize(self):
+        """Compute and display summary metrics for evaluation results.
+
+        Note this function can *only* be applied on the default parameter
+        setting
+        """
+
+        def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100):
+            p = self.params
+            iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | ' \
+                   'maxDets={:>3d} ] = {:0.3f}'
+            titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
+            typeStr = '(AP)' if ap == 1 else '(AR)'
+            iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
+                if iouThr is None else '{:0.2f}'.format(iouThr)
+
+            aind = [
+                i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng
+            ]
+            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+            if ap == 1:
+                # dimension of precision: [TxRxKxAxM]
+                s = self.eval['precision']
+                # IoU
+                if iouThr is not None:
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                s = s[:, :, :, aind, mind]
+            else:
+                # dimension of recall: [TxKxAxM]
+                s = self.eval['recall']
+                if iouThr is not None:
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                s = s[:, :, aind, mind]
+            if len(s[s > -1]) == 0:
+                mean_s = -1
+            else:
+                mean_s = np.mean(s[s > -1])
+            print(
+                iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets,
+                            mean_s))
+            return mean_s
+
+        def _summarizeDets():
+            stats = np.zeros((12, ))
+            stats[0] = _summarize(1)
+            stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2])
+            stats[2] = _summarize(
+                1, iouThr=.75, maxDets=self.params.maxDets[2])
+            stats[3] = _summarize(
+                1, areaRng='small', maxDets=self.params.maxDets[2])
+            stats[4] = _summarize(
+                1, areaRng='medium', maxDets=self.params.maxDets[2])
+            stats[5] = _summarize(
+                1, areaRng='large', maxDets=self.params.maxDets[2])
+            stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
+            stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
+            stats[9] = _summarize(
+                0, areaRng='small', maxDets=self.params.maxDets[2])
+            stats[10] = _summarize(
+                0, areaRng='medium', maxDets=self.params.maxDets[2])
+            stats[11] = _summarize(
+                0, areaRng='large', maxDets=self.params.maxDets[2])
+            return stats
+
+        def _summarizeKps():
+            stats = np.zeros((10, ))
+            stats[0] = _summarize(1, maxDets=20)
+            stats[1] = _summarize(1, maxDets=20, iouThr=.5)
+            stats[2] = _summarize(1, maxDets=20, iouThr=.75)
+            stats[3] = _summarize(1, maxDets=20, areaRng='medium')
+            stats[4] = _summarize(1, maxDets=20, areaRng='large')
+            stats[5] = _summarize(0, maxDets=20)
+            stats[6] = _summarize(0, maxDets=20, iouThr=.5)
+            stats[7] = _summarize(0, maxDets=20, iouThr=.75)
+            stats[8] = _summarize(0, maxDets=20, areaRng='medium')
+            stats[9] = _summarize(0, maxDets=20, areaRng='large')
+            return stats
+
+        if not self.eval:
+            raise Exception('Please run accumulate() first')
+        iouType = self.params.iouType
+        if iouType == 'segm' or iouType == 'bbox':
+            summarize = _summarizeDets
+        elif iouType == 'keypoints':
+            summarize = _summarizeKps
+        self.stats = summarize()
+
+    def __str__(self):
+        self.summarize()
+
+
+class Params:
+    """Params for coco evaluation api."""
+
+    def setDetParams(self):
+        self.vidIds = []
+        self.catIds = []
+        # np.arange causes trouble.  the data point on arange
+        # is slightly larger than the true value
+        self.iouThrs = np.linspace(
+            .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+        self.recThrs = np.linspace(
+            .0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
+        self.maxDets = [1, 10, 100]
+        self.areaRng = [[0**2, 1e5**2], [0**2, 128**2], [128**2, 256**2],
+                        [256**2, 1e5**2]]
+        self.areaRngLbl = ['all', 'small', 'medium', 'large']
+        self.useCats = 1
+
+    def setKpParams(self):
+        self.vidIds = []
+        self.catIds = []
+        # np.arange causes trouble.  the data point on arange
+        # is slightly larger than the true value
+        self.iouThrs = np.linspace(
+            .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+        self.recThrs = np.linspace(
+            .0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
+        self.maxDets = [20]
+        self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]]
+        self.areaRngLbl = ['all', 'medium', 'large']
+        self.useCats = 1
+
+    def __init__(self, iouType='segm'):
+        if iouType == 'segm' or iouType == 'bbox':
+            self.setDetParams()
+        elif iouType == 'keypoints':
+            self.setKpParams()
+        else:
+            raise Exception('iouType not supported')
+        self.iouType = iouType
+        # useSegm is deprecated
+        self.useSegm = None
diff --git a/mmde/mmdet/evaluation/metrics/__init__.py b/mmde/mmdet/evaluation/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ad040cf6ffe3ada4b77e6a6b9caee3ad7afdf1d
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/__init__.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_video_metric import BaseVideoMetric
+from .cityscapes_metric import CityScapesMetric
+from .coco_caption_metric import COCOCaptionMetric
+from .coco_metric import CocoMetric
+from .coco_occluded_metric import CocoOccludedSeparatedMetric
+from .coco_panoptic_metric import CocoPanopticMetric
+from .coco_video_metric import CocoVideoMetric
+from .crowdhuman_metric import CrowdHumanMetric
+from .dod_metric import DODCocoMetric
+from .dump_det_results import DumpDetResults
+from .dump_odvg_results import DumpODVGResults
+from .dump_proposals_metric import DumpProposals
+from .flickr30k_metric import Flickr30kMetric
+from .grefcoco_metric import gRefCOCOMetric
+from .lvis_metric import LVISMetric
+from .mot_challenge_metric import MOTChallengeMetric
+from .openimages_metric import OpenImagesMetric
+from .ov_coco_metric import OVCocoMetric
+from .refexp_metric import RefExpMetric
+from .refseg_metric import RefSegMetric
+from .reid_metric import ReIDMetrics
+from .semseg_metric import SemSegMetric
+from .voc_metric import VOCMetric
+from .youtube_vis_metric import YouTubeVISMetric
+
+__all__ = [
+    'CityScapesMetric', 'CocoMetric', 'CocoPanopticMetric', 'OpenImagesMetric',
+    'VOCMetric', 'LVISMetric', 'CrowdHumanMetric', 'DumpProposals',
+    'CocoOccludedSeparatedMetric', 'DumpDetResults', 'BaseVideoMetric',
+    'MOTChallengeMetric', 'CocoVideoMetric', 'ReIDMetrics', 'YouTubeVISMetric',
+    'COCOCaptionMetric', 'SemSegMetric', 'RefSegMetric', 'RefExpMetric',
+    'gRefCOCOMetric', 'DODCocoMetric', 'DumpODVGResults', 'Flickr30kMetric',
+    'OVCocoMetric'
+]
diff --git a/mmde/mmdet/evaluation/metrics/base_video_metric.py b/mmde/mmdet/evaluation/metrics/base_video_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..90c7cdcbed5f12b59b6978ccba7576d6d2c25c5e
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/base_video_metric.py
@@ -0,0 +1,173 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import warnings
+from typing import Optional, Sequence
+
+import torch
+from mmengine.dist import (barrier, broadcast, broadcast_object_list,
+                           get_dist_info, is_main_process)
+from mmengine.evaluator import BaseMetric
+from mmengine.utils import mkdir_or_exist
+
+
+class BaseVideoMetric(BaseMetric):
+    """Base class for a metric in video task.
+
+    The metric first processes each batch of data_samples and predictions,
+    and appends the processed results to the results list. Then it
+    collects all results together from all ranks if distributed training
+    is used. Finally, it computes the metrics of the entire dataset.
+
+    A subclass of class:`BaseVideoMetric` should assign a meaningful value
+    to the class attribute `default_prefix`. See the argument `prefix` for
+    details.
+    """
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for track_data_sample in data_samples:
+            video_data_samples = track_data_sample['video_data_samples']
+            ori_video_len = video_data_samples[0].ori_video_length
+            if ori_video_len == len(video_data_samples):
+                # video process
+                self.process_video(video_data_samples)
+            else:
+                # image process
+                self.process_image(video_data_samples, ori_video_len)
+
+    def evaluate(self, size: int = 1) -> dict:
+        """Evaluate the model performance of the whole dataset after processing
+        all batches.
+
+        Args:
+            size (int): Length of the entire validation dataset.
+
+        Returns:
+            dict: Evaluation metrics dict on the val dataset. The keys are the
+            names of the metrics, and the values are corresponding results.
+        """
+        if len(self.results) == 0:
+            warnings.warn(
+                f'{self.__class__.__name__} got empty `self.results`. Please '
+                'ensure that the processed results are properly added into '
+                '`self.results` in `process` method.')
+
+        results = collect_tracking_results(self.results, self.collect_device)
+
+        if is_main_process():
+            _metrics = self.compute_metrics(results)  # type: ignore
+            # Add prefix to metric names
+            if self.prefix:
+                _metrics = {
+                    '/'.join((self.prefix, k)): v
+                    for k, v in _metrics.items()
+                }
+            metrics = [_metrics]
+        else:
+            metrics = [None]  # type: ignore
+
+        broadcast_object_list(metrics)
+
+        # reset the results list
+        self.results.clear()
+        return metrics[0]
+
+
+def collect_tracking_results(results: list,
+                             device: str = 'cpu',
+                             tmpdir: Optional[str] = None) -> Optional[list]:
+    """Collected results in distributed environments. different from the
+    function mmengine.dist.collect_results, tracking compute metrics don't use
+    paramenter size, which means length of the entire validation dataset.
+    because it's equal to video num, but compute metrics need image num.
+
+    Args:
+        results (list): Result list containing result parts to be
+            collected. Each item of ``result_part`` should be a picklable
+            object.
+        device (str): Device name. Optional values are 'cpu' and 'gpu'.
+        tmpdir (str | None): Temporal directory for collected results to
+            store. If set to None, it will create a temporal directory for it.
+            ``tmpdir`` should be None when device is 'gpu'. Defaults to None.
+
+    Returns:
+        list or None: The collected results.
+    """
+    if device not in ['gpu', 'cpu']:
+        raise NotImplementedError(
+            f"device must be 'cpu' or 'gpu', but got {device}")
+
+    if device == 'gpu':
+        assert tmpdir is None, 'tmpdir should be None when device is "gpu"'
+        raise NotImplementedError('GPU collecting has not been supported yet')
+    else:
+        return collect_tracking_results_cpu(results, tmpdir)
+
+
+def collect_tracking_results_cpu(result_part: list,
+                                 tmpdir: Optional[str] = None
+                                 ) -> Optional[list]:
+    """Collect results on cpu mode.
+
+    Saves the results on different gpus to 'tmpdir' and collects them by the
+    rank 0 worker.
+
+    Args:
+        result_part (list): The part of prediction results.
+        tmpdir (str): Path of directory to save the temporary results from
+            different gpus under cpu mode. If is None, use `tempfile.mkdtemp()`
+            to make a temporary path. Defaults to None.
+
+    Returns:
+        list or None: The collected results.
+    """
+    rank, world_size = get_dist_info()
+    if world_size == 1:
+        return result_part
+
+    # create a tmp dir if it is not specified
+    if tmpdir is None:
+        MAX_LEN = 512
+        # 32 is whitespace
+        dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8)
+        if rank == 0:
+            mkdir_or_exist('.dist_test')
+            tmpdir = tempfile.mkdtemp(dir='.dist_test')
+            tmpdir = torch.tensor(
+                bytearray(tmpdir.encode()), dtype=torch.uint8)
+            dir_tensor[:len(tmpdir)] = tmpdir
+        broadcast(dir_tensor, 0)
+        tmpdir = dir_tensor.numpy().tobytes().decode().rstrip()
+    else:
+        mkdir_or_exist(tmpdir)
+
+    # dump the part result to the dir
+    with open(osp.join(tmpdir, f'part_{rank}.pkl'), 'wb') as f:  # type: ignore
+        pickle.dump(result_part, f, protocol=2)
+
+    barrier()
+
+    # collect all parts
+    if rank != 0:
+        return None
+    else:
+        # load results of all parts from tmp dir
+        part_list = []
+        for i in range(world_size):
+            path = osp.join(tmpdir, f'part_{i}.pkl')  # type: ignore
+            with open(path, 'rb') as f:
+                part_list.extend(pickle.load(f))
+        shutil.rmtree(tmpdir)
+        return part_list
diff --git a/mmde/mmdet/evaluation/metrics/cityscapes_metric.py b/mmde/mmdet/evaluation/metrics/cityscapes_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5cdc179a3c76ef3742dd3ee6692c7deb9905459
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/cityscapes_metric.py
@@ -0,0 +1,205 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import shutil
+import tempfile
+from collections import OrderedDict
+from typing import Dict, Optional, Sequence
+
+import mmcv
+import numpy as np
+from mmengine.dist import is_main_process
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger
+
+from mmdet.registry import METRICS
+
+try:
+    import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval  # noqa: E501
+    import cityscapesscripts.helpers.labels as CSLabels
+
+    from mmdet.evaluation.functional import evaluateImgLists
+    HAS_CITYSCAPESAPI = True
+except ImportError:
+    HAS_CITYSCAPESAPI = False
+
+
+@METRICS.register_module()
+class CityScapesMetric(BaseMetric):
+    """CityScapes metric for instance segmentation.
+
+    Args:
+        outfile_prefix (str): The prefix of txt and png files. The txt and
+            png file will be save in a directory whose path is
+            "outfile_prefix.results/".
+        seg_prefix (str, optional): Path to the directory which contains the
+            cityscapes instance segmentation masks. It's necessary when
+            training and validation. It could be None when infer on test
+            dataset. Defaults to None.
+        format_only (bool): Format the output results without perform
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+        dump_matches (bool): Whether dump matches.json file during evaluating.
+            Defaults to False.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+    default_prefix: Optional[str] = 'cityscapes'
+
+    def __init__(self,
+                 outfile_prefix: str,
+                 seg_prefix: Optional[str] = None,
+                 format_only: bool = False,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 dump_matches: bool = False,
+                 file_client_args: dict = None,
+                 backend_args: dict = None) -> None:
+
+        if not HAS_CITYSCAPESAPI:
+            raise RuntimeError('Failed to import `cityscapesscripts`.'
+                               'Please try to install official '
+                               'cityscapesscripts by '
+                               '"pip install cityscapesscripts"')
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        self.tmp_dir = None
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, 'outfile_prefix must be not'
+            'None when format_only is True, otherwise the result files will'
+            'be saved to a temp directory which will be cleaned up at the end.'
+        else:
+            assert seg_prefix is not None, '`seg_prefix` is necessary when '
+            'computing the CityScapes metrics'
+
+        if outfile_prefix is None:
+            self.tmp_dir = tempfile.TemporaryDirectory()
+            self.outfile_prefix = osp.join(self.tmp_dir.name, 'results')
+        else:
+            # the directory to save predicted panoptic segmentation mask
+            self.outfile_prefix = osp.join(outfile_prefix, 'results')  # type: ignore # yapf: disable # noqa: E501
+
+        dir_name = osp.expanduser(self.outfile_prefix)
+
+        if osp.exists(dir_name) and is_main_process():
+            logger: MMLogger = MMLogger.get_current_instance()
+            logger.info('remove previous results.')
+            shutil.rmtree(dir_name)
+        os.makedirs(dir_name, exist_ok=True)
+
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to'
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+        self.seg_prefix = seg_prefix
+        self.dump_matches = dump_matches
+
+    def __del__(self) -> None:
+        """Clean up the results if necessary."""
+        if self.tmp_dir is not None:
+            self.tmp_dir.cleanup()
+
+    # TODO: data_batch is no longer needed, consider adjusting the
+    #  parameter position
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            # parse pred
+            result = dict()
+            pred = data_sample['pred_instances']
+            filename = data_sample['img_path']
+            basename = osp.splitext(osp.basename(filename))[0]
+            pred_txt = osp.join(self.outfile_prefix, basename + '_pred.txt')
+            result['pred_txt'] = pred_txt
+            labels = pred['labels'].cpu().numpy()
+            masks = pred['masks'].cpu().numpy().astype(np.uint8)
+            if 'mask_scores' in pred:
+                # some detectors use different scores for bbox and mask
+                mask_scores = pred['mask_scores'].cpu().numpy()
+            else:
+                mask_scores = pred['scores'].cpu().numpy()
+
+            with open(pred_txt, 'w') as f:
+                for i, (label, mask, mask_score) in enumerate(
+                        zip(labels, masks, mask_scores)):
+                    class_name = self.dataset_meta['classes'][label]
+                    class_id = CSLabels.name2label[class_name].id
+                    png_filename = osp.join(
+                        self.outfile_prefix,
+                        basename + f'_{i}_{class_name}.png')
+                    mmcv.imwrite(mask, png_filename)
+                    f.write(f'{osp.basename(png_filename)} '
+                            f'{class_id} {mask_score}\n')
+
+            # parse gt
+            gt = dict()
+            img_path = filename.replace('leftImg8bit.png',
+                                        'gtFine_instanceIds.png')
+            gt['file_name'] = img_path.replace('leftImg8bit', 'gtFine')
+
+            self.results.append((gt, result))
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+                the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        if self.format_only:
+            logger.info(
+                f'results are saved to {osp.dirname(self.outfile_prefix)}')
+            return OrderedDict()
+        logger.info('starts to compute metric')
+
+        gts, preds = zip(*results)
+        # set global states in cityscapes evaluation API
+        gt_instances_file = osp.join(self.outfile_prefix, 'gtInstances.json')  # type: ignore # yapf: disable # noqa: E501
+        # split gt and prediction list
+        gts, preds = zip(*results)
+        CSEval.args.JSONOutput = False
+        CSEval.args.colorized = False
+        CSEval.args.gtInstancesFile = gt_instances_file
+
+        groundTruthImgList = [gt['file_name'] for gt in gts]
+        predictionImgList = [pred['pred_txt'] for pred in preds]
+        CSEval_results = evaluateImgLists(
+            predictionImgList,
+            groundTruthImgList,
+            CSEval.args,
+            self.backend_args,
+            dump_matches=self.dump_matches)['averages']
+
+        eval_results = OrderedDict()
+        eval_results['mAP'] = CSEval_results['allAp']
+        eval_results['AP@50'] = CSEval_results['allAp50%']
+
+        return eval_results
diff --git a/mmde/mmdet/evaluation/metrics/coco_caption_metric.py b/mmde/mmdet/evaluation/metrics/coco_caption_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8c7350150f73d8d568597b352e33ad2a202c609
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/coco_caption_metric.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os
+import tempfile
+from typing import List, Optional
+
+from mmengine.evaluator import BaseMetric
+from mmengine.utils import track_iter_progress
+from pycocotools.coco import COCO
+
+from mmdet.registry import METRICS
+
+try:
+    from pycocoevalcap.eval import COCOEvalCap
+except ImportError:
+    COCOEvalCap = None
+
+
+@METRICS.register_module()
+class COCOCaptionMetric(BaseMetric):
+    """Coco Caption evaluation wrapper.
+
+    Save the generated captions and transform into coco format.
+    Calling COCO API for caption metrics.
+
+    Args:
+        ann_file (str): the path for the COCO format caption ground truth
+            json file, load for evaluations.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Should be modified according to the
+            `retrieval_type` for unambiguous results. Defaults to TR.
+    """
+
+    def __init__(self,
+                 ann_file: str,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None):
+        if COCOEvalCap is None:
+            raise RuntimeError(
+                'COCOEvalCap is not installed, please install it by: '
+                'pip install pycocoevalcap')
+
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.ann_file = ann_file
+
+    def process(self, data_batch, data_samples):
+        """Process one batch of data samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to computed the metrics when all batches have been processed.
+
+        Args:
+            data_batch: A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+
+        for data_sample in data_samples:
+            result = dict()
+
+            result['caption'] = data_sample['pred_caption']
+            result['image_id'] = int(data_sample['img_id'])
+
+            # Save the result to `self.results`.
+            self.results.append(result)
+
+    def compute_metrics(self, results: List):
+        """Compute the metrics from processed results.
+
+        Args:
+            results (dict): The processed results of each batch.
+
+        Returns:
+            Dict: The computed metrics. The keys are the names of the metrics,
+            and the values are corresponding results.
+        """
+        # NOTICE: don't access `self.results` from the method.
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+
+            eval_result_file = save_result(
+                result=results,
+                result_dir=temp_dir,
+                filename='caption_pred',
+                remove_duplicate='image_id',
+            )
+
+            coco_val = coco_caption_eval(eval_result_file, self.ann_file)
+
+        return coco_val
+
+
+def save_result(result, result_dir, filename, remove_duplicate=''):
+    """Saving predictions as json file for evaluation."""
+    # combine results from all processes
+    if remove_duplicate:
+        result_new = []
+        id_list = []
+        for res in track_iter_progress(result):
+            if res[remove_duplicate] not in id_list:
+                id_list.append(res[remove_duplicate])
+                result_new.append(res)
+        result = result_new
+
+    final_result_file_url = os.path.join(result_dir, '%s.json' % filename)
+    print(f'result file saved to {final_result_file_url}')
+    json.dump(result, open(final_result_file_url, 'w'))
+
+    return final_result_file_url
+
+
+def coco_caption_eval(results_file, ann_file):
+    """Evaluation between gt json and prediction json files."""
+    # create coco object and coco_result object
+    coco = COCO(ann_file)
+    coco_result = coco.loadRes(results_file)
+
+    # create coco_eval object by taking coco and coco_result
+    coco_eval = COCOEvalCap(coco, coco_result)
+
+    # make sure the image ids are the same
+    coco_eval.params['image_id'] = coco_result.getImgIds()
+
+    # This will take some times at the first run
+    coco_eval.evaluate()
+
+    # print output evaluation scores
+    for metric, score in coco_eval.eval.items():
+        print(f'{metric}: {score:.3f}')
+
+    return coco_eval.eval
diff --git a/mmde/mmdet/evaluation/metrics/coco_metric.py b/mmde/mmdet/evaluation/metrics/coco_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfdc66e03b96e62366a921c137fc5a5727e26302
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/coco_metric.py
@@ -0,0 +1,597 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import datetime
+import itertools
+import os.path as osp
+import tempfile
+from collections import OrderedDict
+from typing import Dict, List, Optional, Sequence, Union
+
+import numpy as np
+import torch
+from mmengine.evaluator import BaseMetric
+from mmengine.fileio import dump, get_local_path, load
+from mmengine.logging import MMLogger
+from terminaltables import AsciiTable
+
+from mmdet.datasets.api_wrappers import COCO, COCOeval, COCOevalMP
+from mmdet.registry import METRICS
+from mmdet.structures.mask import encode_mask_results
+from ..functional import eval_recalls
+
+
+@METRICS.register_module()
+class CocoMetric(BaseMetric):
+    """COCO evaluation metric.
+
+    Evaluate AR, AP, and mAP for detection tasks including proposal/box
+    detection and instance segmentation. Please refer to
+    https://cocodataset.org/#detection-eval for more details.
+
+    Args:
+        ann_file (str, optional): Path to the coco format annotation file.
+            If not specified, ground truth annotations from the dataset will
+            be converted to coco format. Defaults to None.
+        metric (str | List[str]): Metrics to be evaluated. Valid metrics
+            include 'bbox', 'segm', 'proposal', and 'proposal_fast'.
+            Defaults to 'bbox'.
+        classwise (bool): Whether to evaluate the metric class-wise.
+            Defaults to False.
+        proposal_nums (Sequence[int]): Numbers of proposals to be evaluated.
+            Defaults to (100, 300, 1000).
+        iou_thrs (float | List[float], optional): IoU threshold to compute AP
+            and AR. If not specified, IoUs from 0.5 to 0.95 will be used.
+            Defaults to None.
+        metric_items (List[str], optional): Metric result names to be
+            recorded in the evaluation result. Defaults to None.
+        format_only (bool): Format the output results without perform
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        outfile_prefix (str, optional): The prefix of json files. It includes
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Defaults to None.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+        sort_categories (bool): Whether sort categories in annotations. Only
+            used for `Objects365V1Dataset`. Defaults to False.
+        use_mp_eval (bool): Whether to use mul-processing evaluation
+    """
+    default_prefix: Optional[str] = 'coco'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 metric: Union[str, List[str]] = 'bbox',
+                 classwise: bool = False,
+                 proposal_nums: Sequence[int] = (100, 300, 1000),
+                 iou_thrs: Optional[Union[float, Sequence[float]]] = None,
+                 metric_items: Optional[Sequence[str]] = None,
+                 format_only: bool = False,
+                 outfile_prefix: Optional[str] = None,
+                 file_client_args: dict = None,
+                 backend_args: dict = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 sort_categories: bool = False,
+                 use_mp_eval: bool = False) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        # coco evaluation metrics
+        self.metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
+        for metric in self.metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(
+                    "metric should be one of 'bbox', 'segm', 'proposal', "
+                    f"'proposal_fast', but got {metric}.")
+
+        # do class wise evaluation, default False
+        self.classwise = classwise
+        # whether to use multi processing evaluation, default False
+        self.use_mp_eval = use_mp_eval
+
+        # proposal_nums used to compute recall or precision.
+        self.proposal_nums = list(proposal_nums)
+
+        # iou_thrs used to compute recall or precision.
+        if iou_thrs is None:
+            iou_thrs = np.linspace(
+                .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+        self.iou_thrs = iou_thrs
+        self.metric_items = metric_items
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, 'outfile_prefix must be not'
+            'None when format_only is True, otherwise the result files will'
+            'be saved to a temp directory which will be cleaned up at the end.'
+
+        self.outfile_prefix = outfile_prefix
+
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to'
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+        # if ann_file is not specified,
+        # initialize coco api with the converted dataset
+        if ann_file is not None:
+            with get_local_path(
+                    ann_file, backend_args=self.backend_args) as local_path:
+                self._coco_api = COCO(local_path)
+                if sort_categories:
+                    # 'categories' list in objects365_train.json and
+                    # objects365_val.json is inconsistent, need sort
+                    # list(or dict) before get cat_ids.
+                    cats = self._coco_api.cats
+                    sorted_cats = {i: cats[i] for i in sorted(cats)}
+                    self._coco_api.cats = sorted_cats
+                    categories = self._coco_api.dataset['categories']
+                    sorted_categories = sorted(
+                        categories, key=lambda i: i['id'])
+                    self._coco_api.dataset['categories'] = sorted_categories
+        else:
+            self._coco_api = None
+
+        # handle dataset lazy init
+        self.cat_ids = None
+        self.img_ids = None
+
+    def fast_eval_recall(self,
+                         results: List[dict],
+                         proposal_nums: Sequence[int],
+                         iou_thrs: Sequence[float],
+                         logger: Optional[MMLogger] = None) -> np.ndarray:
+        """Evaluate proposal recall with COCO's fast_eval_recall.
+
+        Args:
+            results (List[dict]): Results of the dataset.
+            proposal_nums (Sequence[int]): Proposal numbers used for
+                evaluation.
+            iou_thrs (Sequence[float]): IoU thresholds used for evaluation.
+            logger (MMLogger, optional): Logger used for logging the recall
+                summary.
+        Returns:
+            np.ndarray: Averaged recall results.
+        """
+        gt_bboxes = []
+        pred_bboxes = [result['bboxes'] for result in results]
+        for i in range(len(self.img_ids)):
+            ann_ids = self._coco_api.get_ann_ids(img_ids=self.img_ids[i])
+            ann_info = self._coco_api.load_anns(ann_ids)
+            if len(ann_info) == 0:
+                gt_bboxes.append(np.zeros((0, 4)))
+                continue
+            bboxes = []
+            for ann in ann_info:
+                if ann.get('ignore', False) or ann['iscrowd']:
+                    continue
+                x1, y1, w, h = ann['bbox']
+                bboxes.append([x1, y1, x1 + w, y1 + h])
+            bboxes = np.array(bboxes, dtype=np.float32)
+            if bboxes.shape[0] == 0:
+                bboxes = np.zeros((0, 4))
+            gt_bboxes.append(bboxes)
+
+        recalls = eval_recalls(
+            gt_bboxes, pred_bboxes, proposal_nums, iou_thrs, logger=logger)
+        ar = recalls.mean(axis=1)
+        return ar
+
+    def xyxy2xywh(self, bbox: np.ndarray) -> list:
+        """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO
+        evaluation.
+
+        Args:
+            bbox (numpy.ndarray): The bounding boxes, shape (4, ), in
+                ``xyxy`` order.
+
+        Returns:
+            list[float]: The converted bounding boxes, in ``xywh`` order.
+        """
+
+        _bbox: List = bbox.tolist()
+        return [
+            _bbox[0],
+            _bbox[1],
+            _bbox[2] - _bbox[0],
+            _bbox[3] - _bbox[1],
+        ]
+
+    def results2json(self, results: Sequence[dict],
+                     outfile_prefix: str) -> dict:
+        """Dump the detection results to a COCO style json file.
+
+        There are 3 types of results: proposals, bbox predictions, mask
+        predictions, and they have different data types. This method will
+        automatically recognize the type, and dump them to json files.
+
+        Args:
+            results (Sequence[dict]): Testing results of the
+                dataset.
+            outfile_prefix (str): The filename prefix of the json files. If the
+                prefix is "somepath/xxx", the json files will be named
+                "somepath/xxx.bbox.json", "somepath/xxx.segm.json",
+                "somepath/xxx.proposal.json".
+
+        Returns:
+            dict: Possible keys are "bbox", "segm", "proposal", and
+            values are corresponding filenames.
+        """
+        bbox_json_results = []
+        segm_json_results = [] if 'masks' in results[0] else None
+        for idx, result in enumerate(results):
+            image_id = result.get('img_id', idx)
+            labels = result['labels']
+            bboxes = result['bboxes']
+            scores = result['scores']
+            # bbox results
+            for i, label in enumerate(labels):
+                data = dict()
+                data['image_id'] = image_id
+                data['bbox'] = self.xyxy2xywh(bboxes[i])
+                data['score'] = float(scores[i])
+                data['category_id'] = self.cat_ids[label]
+                bbox_json_results.append(data)
+
+            if segm_json_results is None:
+                continue
+
+            # segm results
+            masks = result['masks']
+            mask_scores = result.get('mask_scores', scores)
+            for i, label in enumerate(labels):
+                data = dict()
+                data['image_id'] = image_id
+                data['bbox'] = self.xyxy2xywh(bboxes[i])
+                data['score'] = float(mask_scores[i])
+                data['category_id'] = self.cat_ids[label]
+                if isinstance(masks[i]['counts'], bytes):
+                    masks[i]['counts'] = masks[i]['counts'].decode()
+                data['segmentation'] = masks[i]
+                segm_json_results.append(data)
+
+        result_files = dict()
+        result_files['bbox'] = f'{outfile_prefix}.bbox.json'
+        result_files['proposal'] = f'{outfile_prefix}.bbox.json'
+        dump(bbox_json_results, result_files['bbox'])
+
+        if segm_json_results is not None:
+            result_files['segm'] = f'{outfile_prefix}.segm.json'
+            dump(segm_json_results, result_files['segm'])
+
+        return result_files
+
+    def gt_to_coco_json(self, gt_dicts: Sequence[dict],
+                        outfile_prefix: str) -> str:
+        """Convert ground truth to coco format json file.
+
+        Args:
+            gt_dicts (Sequence[dict]): Ground truth of the dataset.
+            outfile_prefix (str): The filename prefix of the json files. If the
+                prefix is "somepath/xxx", the json file will be named
+                "somepath/xxx.gt.json".
+        Returns:
+            str: The filename of the json file.
+        """
+        categories = [
+            dict(id=id, name=name)
+            for id, name in enumerate(self.dataset_meta['classes'])
+        ]
+        image_infos = []
+        annotations = []
+
+        for idx, gt_dict in enumerate(gt_dicts):
+            img_id = gt_dict.get('img_id', idx)
+            image_info = dict(
+                id=img_id,
+                width=gt_dict['width'],
+                height=gt_dict['height'],
+                file_name='')
+            image_infos.append(image_info)
+            for ann in gt_dict['anns']:
+                label = ann['bbox_label']
+                bbox = ann['bbox']
+                coco_bbox = [
+                    bbox[0],
+                    bbox[1],
+                    bbox[2] - bbox[0],
+                    bbox[3] - bbox[1],
+                ]
+
+                annotation = dict(
+                    id=len(annotations) +
+                    1,  # coco api requires id starts with 1
+                    image_id=img_id,
+                    bbox=coco_bbox,
+                    iscrowd=ann.get('ignore_flag', 0),
+                    category_id=int(label),
+                    area=coco_bbox[2] * coco_bbox[3])
+                if ann.get('mask', None):
+                    mask = ann['mask']
+                    # area = mask_util.area(mask)
+                    if isinstance(mask, dict) and isinstance(
+                            mask['counts'], bytes):
+                        mask['counts'] = mask['counts'].decode()
+                    annotation['segmentation'] = mask
+                    # annotation['area'] = float(area)
+                annotations.append(annotation)
+
+        info = dict(
+            date_created=str(datetime.datetime.now()),
+            description='Coco json file converted by mmdet CocoMetric.')
+        coco_json = dict(
+            info=info,
+            images=image_infos,
+            categories=categories,
+            licenses=None,
+        )
+        if len(annotations) > 0:
+            coco_json['annotations'] = annotations
+        converted_json_path = f'{outfile_prefix}.gt.json'
+        dump(coco_json, converted_json_path)
+        return converted_json_path
+
+    # TODO: data_batch is no longer needed, consider adjusting the
+    #  parameter position
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            result = dict()
+            pred = data_sample['pred_instances']
+            result['img_id'] = data_sample['img_id']
+            result['bboxes'] = pred['bboxes'].cpu().numpy()
+            result['scores'] = pred['scores'].cpu().numpy()
+            result['labels'] = pred['labels'].cpu().numpy()
+            # encode mask to RLE
+            if 'masks' in pred:
+                result['masks'] = encode_mask_results(
+                    pred['masks'].detach().cpu().numpy()) if isinstance(
+                        pred['masks'], torch.Tensor) else pred['masks']
+            # some detectors use different scores for bbox and mask
+            if 'mask_scores' in pred:
+                result['mask_scores'] = pred['mask_scores'].cpu().numpy()
+
+            # parse gt
+            gt = dict()
+            gt['width'] = data_sample['ori_shape'][1]
+            gt['height'] = data_sample['ori_shape'][0]
+            gt['img_id'] = data_sample['img_id']
+            if self._coco_api is None:
+                # TODO: Need to refactor to support LoadAnnotations
+                assert 'instances' in data_sample, \
+                    'ground truth is required for evaluation when ' \
+                    '`ann_file` is not provided'
+                gt['anns'] = data_sample['instances']
+            # add converted result to the results list
+            self.results.append((gt, result))
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        # split gt and prediction list
+        gts, preds = zip(*results)
+
+        tmp_dir = None
+        if self.outfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            outfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            outfile_prefix = self.outfile_prefix
+
+        if self._coco_api is None:
+            # use converted gt json file to initialize coco api
+            logger.info('Converting ground truth to coco format...')
+            coco_json_path = self.gt_to_coco_json(
+                gt_dicts=gts, outfile_prefix=outfile_prefix)
+            self._coco_api = COCO(coco_json_path)
+
+        # handle lazy init
+        if self.cat_ids is None:
+            self.cat_ids = self._coco_api.get_cat_ids(
+                cat_names=self.dataset_meta['classes'])
+        if self.img_ids is None:
+            self.img_ids = self._coco_api.get_img_ids()
+
+        # convert predictions to coco format and dump to json file
+        result_files = self.results2json(preds, outfile_prefix)
+
+        eval_results = OrderedDict()
+        if self.format_only:
+            logger.info('results are saved in '
+                        f'{osp.dirname(outfile_prefix)}')
+            return eval_results
+
+        for metric in self.metrics:
+            logger.info(f'Evaluating {metric}...')
+
+            # TODO: May refactor fast_eval_recall to an independent metric?
+            # fast eval recall
+            if metric == 'proposal_fast':
+                ar = self.fast_eval_recall(
+                    preds, self.proposal_nums, self.iou_thrs, logger=logger)
+                log_msg = []
+                for i, num in enumerate(self.proposal_nums):
+                    eval_results[f'AR@{num}'] = ar[i]
+                    log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
+                log_msg = ''.join(log_msg)
+                logger.info(log_msg)
+                continue
+
+            # evaluate proposal, bbox and segm
+            iou_type = 'bbox' if metric == 'proposal' else metric
+            if metric not in result_files:
+                raise KeyError(f'{metric} is not in results')
+            try:
+                predictions = load(result_files[metric])
+                if iou_type == 'segm':
+                    # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331  # noqa
+                    # When evaluating mask AP, if the results contain bbox,
+                    # cocoapi will use the box area instead of the mask area
+                    # for calculating the instance area. Though the overall AP
+                    # is not affected, this leads to different
+                    # small/medium/large mask AP results.
+                    for x in predictions:
+                        x.pop('bbox')
+                coco_dt = self._coco_api.loadRes(predictions)
+
+            except IndexError:
+                logger.error(
+                    'The testing results of the whole dataset is empty.')
+                break
+
+            if self.use_mp_eval:
+                coco_eval = COCOevalMP(self._coco_api, coco_dt, iou_type)
+            else:
+                coco_eval = COCOeval(self._coco_api, coco_dt, iou_type)
+
+            coco_eval.params.catIds = self.cat_ids
+            coco_eval.params.imgIds = self.img_ids
+            coco_eval.params.maxDets = list(self.proposal_nums)
+            coco_eval.params.iouThrs = self.iou_thrs
+
+            # mapping of cocoEval.stats
+            coco_metric_names = {
+                'mAP': 0,
+                'mAP_50': 1,
+                'mAP_75': 2,
+                'mAP_s': 3,
+                'mAP_m': 4,
+                'mAP_l': 5,
+                'AR@100': 6,
+                'AR@300': 7,
+                'AR@1000': 8,
+                'AR_s@1000': 9,
+                'AR_m@1000': 10,
+                'AR_l@1000': 11
+            }
+            metric_items = self.metric_items
+            if metric_items is not None:
+                for metric_item in metric_items:
+                    if metric_item not in coco_metric_names:
+                        raise KeyError(
+                            f'metric item "{metric_item}" is not supported')
+
+            if metric == 'proposal':
+                coco_eval.params.useCats = 0
+                coco_eval.evaluate()
+                coco_eval.accumulate()
+                coco_eval.summarize()
+                if metric_items is None:
+                    metric_items = [
+                        'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
+                        'AR_m@1000', 'AR_l@1000'
+                    ]
+
+                for item in metric_items:
+                    val = float(
+                        f'{coco_eval.stats[coco_metric_names[item]]:.3f}')
+                    eval_results[item] = val
+            else:
+                coco_eval.evaluate()
+                coco_eval.accumulate()
+                coco_eval.summarize()
+                if self.classwise:  # Compute per-category AP
+                    # Compute per-category AP
+                    # from https://github.com/facebookresearch/detectron2/
+                    precisions = coco_eval.eval['precision']
+                    # precision: (iou, recall, cls, area range, max dets)
+                    assert len(self.cat_ids) == precisions.shape[2]
+
+                    results_per_category = []
+                    for idx, cat_id in enumerate(self.cat_ids):
+                        t = []
+                        # area range index 0: all area ranges
+                        # max dets index -1: typically 100 per image
+                        nm = self._coco_api.loadCats(cat_id)[0]
+                        precision = precisions[:, :, idx, 0, -1]
+                        precision = precision[precision > -1]
+                        if precision.size:
+                            ap = np.mean(precision)
+                        else:
+                            ap = float('nan')
+                        t.append(f'{nm["name"]}')
+                        t.append(f'{round(ap, 3)}')
+                        eval_results[f'{nm["name"]}_precision'] = round(ap, 3)
+
+                        # indexes of IoU  @50 and @75
+                        for iou in [0, 5]:
+                            precision = precisions[iou, :, idx, 0, -1]
+                            precision = precision[precision > -1]
+                            if precision.size:
+                                ap = np.mean(precision)
+                            else:
+                                ap = float('nan')
+                            t.append(f'{round(ap, 3)}')
+
+                        # indexes of area of small, median and large
+                        for area in [1, 2, 3]:
+                            precision = precisions[:, :, idx, area, -1]
+                            precision = precision[precision > -1]
+                            if precision.size:
+                                ap = np.mean(precision)
+                            else:
+                                ap = float('nan')
+                            t.append(f'{round(ap, 3)}')
+                        results_per_category.append(tuple(t))
+
+                    num_columns = len(results_per_category[0])
+                    results_flatten = list(
+                        itertools.chain(*results_per_category))
+                    headers = [
+                        'category', 'mAP', 'mAP_50', 'mAP_75', 'mAP_s',
+                        'mAP_m', 'mAP_l'
+                    ]
+                    results_2d = itertools.zip_longest(*[
+                        results_flatten[i::num_columns]
+                        for i in range(num_columns)
+                    ])
+                    table_data = [headers]
+                    table_data += [result for result in results_2d]
+                    table = AsciiTable(table_data)
+                    logger.info('\n' + table.table)
+
+                if metric_items is None:
+                    metric_items = [
+                        'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
+                    ]
+
+                for metric_item in metric_items:
+                    key = f'{metric}_{metric_item}'
+                    val = coco_eval.stats[coco_metric_names[metric_item]]
+                    eval_results[key] = float(f'{round(val, 3)}')
+
+                ap = coco_eval.stats[:6]
+                logger.info(f'{metric}_mAP_copypaste: {ap[0]:.3f} '
+                            f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
+                            f'{ap[4]:.3f} {ap[5]:.3f}')
+
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+        return eval_results
diff --git a/mmde/mmdet/evaluation/metrics/coco_occluded_metric.py b/mmde/mmdet/evaluation/metrics/coco_occluded_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..81235a04e6ee1929cfd6b5cdc284d239765b0d69
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/coco_occluded_metric.py
@@ -0,0 +1,204 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Union
+
+import mmengine
+import numpy as np
+from mmengine.fileio import load
+from mmengine.logging import print_log
+from pycocotools import mask as coco_mask
+from terminaltables import AsciiTable
+
+from mmdet.registry import METRICS
+from .coco_metric import CocoMetric
+
+
+@METRICS.register_module()
+class CocoOccludedSeparatedMetric(CocoMetric):
+    """Metric of separated and occluded masks which presented in paper `A Tri-
+    Layer Plugin to Improve Occluded Detection.
+
+    <https://arxiv.org/abs/2210.10046>`_.
+
+    Separated COCO and Occluded COCO are automatically generated subsets of
+    COCO val dataset, collecting separated objects and partially occluded
+    objects for a large variety of categories. In this way, we define
+    occlusion into two major categories: separated and partially occluded.
+
+    - Separation: target object segmentation mask is separated into distinct
+      regions by the occluder.
+    - Partial Occlusion: target object is partially occluded but the
+      segmentation mask is connected.
+
+    These two new scalable real-image datasets are to benchmark a model's
+    capability to detect occluded objects of 80 common categories.
+
+    Please cite the paper if you use this dataset:
+
+    @article{zhan2022triocc,
+        title={A Tri-Layer Plugin to Improve Occluded Detection},
+        author={Zhan, Guanqi and Xie, Weidi and Zisserman, Andrew},
+        journal={British Machine Vision Conference},
+        year={2022}
+    }
+
+    Args:
+        occluded_ann (str): Path to the occluded coco annotation file.
+        separated_ann (str): Path to the separated coco annotation file.
+        score_thr (float): Score threshold of the detection masks.
+            Defaults to 0.3.
+        iou_thr (float): IoU threshold for the recall calculation.
+            Defaults to 0.75.
+        metric (str | List[str]): Metrics to be evaluated. Valid metrics
+            include 'bbox', 'segm', 'proposal', and 'proposal_fast'.
+            Defaults to 'bbox'.
+    """
+    default_prefix: Optional[str] = 'coco'
+
+    def __init__(
+            self,
+            *args,
+            occluded_ann:
+        str = 'https://www.robots.ox.ac.uk/~vgg/research/tpod/datasets/occluded_coco.pkl',  # noqa
+            separated_ann:
+        str = 'https://www.robots.ox.ac.uk/~vgg/research/tpod/datasets/separated_coco.pkl',  # noqa
+            score_thr: float = 0.3,
+            iou_thr: float = 0.75,
+            metric: Union[str, List[str]] = ['bbox', 'segm'],
+            **kwargs) -> None:
+        super().__init__(*args, metric=metric, **kwargs)
+        self.occluded_ann = load(occluded_ann)
+        self.separated_ann = load(separated_ann)
+        self.score_thr = score_thr
+        self.iou_thr = iou_thr
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        coco_metric_res = super().compute_metrics(results)
+        eval_res = self.evaluate_occluded_separated(results)
+        coco_metric_res.update(eval_res)
+        return coco_metric_res
+
+    def evaluate_occluded_separated(self, results: List[tuple]) -> dict:
+        """Compute the recall of occluded and separated masks.
+
+        Args:
+            results (list[tuple]): Testing results of the dataset.
+
+        Returns:
+            dict[str, float]: The recall of occluded and separated masks.
+        """
+        dict_det = {}
+        print_log('processing detection results...')
+        prog_bar = mmengine.ProgressBar(len(results))
+        for i in range(len(results)):
+            gt, dt = results[i]
+            img_id = dt['img_id']
+            cur_img_name = self._coco_api.imgs[img_id]['file_name']
+            if cur_img_name not in dict_det.keys():
+                dict_det[cur_img_name] = []
+
+            for bbox, score, label, mask in zip(dt['bboxes'], dt['scores'],
+                                                dt['labels'], dt['masks']):
+                cur_binary_mask = coco_mask.decode(mask)
+                dict_det[cur_img_name].append([
+                    score, self.dataset_meta['classes'][label],
+                    cur_binary_mask, bbox
+                ])
+            dict_det[cur_img_name].sort(
+                key=lambda x: (-x[0], x[3][0], x[3][1])
+            )  # rank by confidence from high to low, avoid same confidence
+            prog_bar.update()
+        print_log('\ncomputing occluded mask recall...', logger='current')
+        occluded_correct_num, occluded_recall = self.compute_recall(
+            dict_det, gt_ann=self.occluded_ann, is_occ=True)
+        print_log(
+            f'\nCOCO occluded mask recall: {occluded_recall:.2f}%',
+            logger='current')
+        print_log(
+            f'COCO occluded mask success num: {occluded_correct_num}',
+            logger='current')
+        print_log('computing separated mask recall...', logger='current')
+        separated_correct_num, separated_recall = self.compute_recall(
+            dict_det, gt_ann=self.separated_ann, is_occ=False)
+        print_log(
+            f'\nCOCO separated mask recall: {separated_recall:.2f}%',
+            logger='current')
+        print_log(
+            f'COCO separated mask success num: {separated_correct_num}',
+            logger='current')
+        table_data = [
+            ['mask type', 'recall', 'num correct'],
+            ['occluded', f'{occluded_recall:.2f}%', occluded_correct_num],
+            ['separated', f'{separated_recall:.2f}%', separated_correct_num]
+        ]
+        table = AsciiTable(table_data)
+        print_log('\n' + table.table, logger='current')
+        return dict(
+            occluded_recall=occluded_recall, separated_recall=separated_recall)
+
+    def compute_recall(self,
+                       result_dict: dict,
+                       gt_ann: list,
+                       is_occ: bool = True) -> tuple:
+        """Compute the recall of occluded or separated masks.
+
+        Args:
+            result_dict (dict): Processed mask results.
+            gt_ann (list): Occluded or separated coco annotations.
+            is_occ (bool): Whether the annotation is occluded mask.
+                Defaults to True.
+        Returns:
+            tuple: number of correct masks and the recall.
+        """
+        correct = 0
+        prog_bar = mmengine.ProgressBar(len(gt_ann))
+        for iter_i in range(len(gt_ann)):
+            cur_item = gt_ann[iter_i]
+            cur_img_name = cur_item[0]
+            cur_gt_bbox = cur_item[3]
+            if is_occ:
+                cur_gt_bbox = [
+                    cur_gt_bbox[0], cur_gt_bbox[1],
+                    cur_gt_bbox[0] + cur_gt_bbox[2],
+                    cur_gt_bbox[1] + cur_gt_bbox[3]
+                ]
+            cur_gt_class = cur_item[1]
+            cur_gt_mask = coco_mask.decode(cur_item[4])
+
+            assert cur_img_name in result_dict.keys()
+            cur_detections = result_dict[cur_img_name]
+
+            correct_flag = False
+            for i in range(len(cur_detections)):
+                cur_det_confidence = cur_detections[i][0]
+                if cur_det_confidence < self.score_thr:
+                    break
+                cur_det_class = cur_detections[i][1]
+                if cur_det_class != cur_gt_class:
+                    continue
+                cur_det_mask = cur_detections[i][2]
+                cur_iou = self.mask_iou(cur_det_mask, cur_gt_mask)
+                if cur_iou >= self.iou_thr:
+                    correct_flag = True
+                    break
+            if correct_flag:
+                correct += 1
+            prog_bar.update()
+        recall = correct / len(gt_ann) * 100
+        return correct, recall
+
+    def mask_iou(self, mask1: np.ndarray, mask2: np.ndarray) -> np.ndarray:
+        """Compute IoU between two masks."""
+        mask1_area = np.count_nonzero(mask1 == 1)
+        mask2_area = np.count_nonzero(mask2 == 1)
+        intersection = np.count_nonzero(np.logical_and(mask1 == 1, mask2 == 1))
+        iou = intersection / (mask1_area + mask2_area - intersection)
+        return iou
diff --git a/mmde/mmdet/evaluation/metrics/coco_panoptic_metric.py b/mmde/mmdet/evaluation/metrics/coco_panoptic_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..f86be916f9cacbdd1160d0fdb3dd6b5d43399299
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/coco_panoptic_metric.py
@@ -0,0 +1,618 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import datetime
+import itertools
+import os.path as osp
+import tempfile
+from typing import Dict, Optional, Sequence, Tuple, Union
+
+import mmcv
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.fileio import dump, get_local_path, load
+from mmengine.logging import MMLogger, print_log
+from terminaltables import AsciiTable
+
+from mmdet.datasets.api_wrappers import COCOPanoptic
+from mmdet.registry import METRICS
+from ..functional import (INSTANCE_OFFSET, pq_compute_multi_core,
+                          pq_compute_single_core)
+
+try:
+    import panopticapi
+    from panopticapi.evaluation import VOID, PQStat
+    from panopticapi.utils import id2rgb, rgb2id
+except ImportError:
+    panopticapi = None
+    id2rgb = None
+    rgb2id = None
+    VOID = None
+    PQStat = None
+
+
+@METRICS.register_module()
+class CocoPanopticMetric(BaseMetric):
+    """COCO panoptic segmentation evaluation metric.
+
+    Evaluate PQ, SQ RQ for panoptic segmentation tasks. Please refer to
+    https://cocodataset.org/#panoptic-eval for more details.
+
+    Args:
+        ann_file (str, optional): Path to the coco format annotation file.
+            If not specified, ground truth annotations from the dataset will
+            be converted to coco format. Defaults to None.
+        seg_prefix (str, optional): Path to the directory which contains the
+            coco panoptic segmentation mask. It should be specified when
+            evaluate. Defaults to None.
+        classwise (bool): Whether to evaluate the metric class-wise.
+            Defaults to False.
+        outfile_prefix (str, optional): The prefix of json files. It includes
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created.
+            It should be specified when format_only is True. Defaults to None.
+        format_only (bool): Format the output results without perform
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        nproc (int): Number of processes for panoptic quality computing.
+            Defaults to 32. When ``nproc`` exceeds the number of cpu cores,
+            the number of cpu cores is used.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+    default_prefix: Optional[str] = 'coco_panoptic'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 seg_prefix: Optional[str] = None,
+                 classwise: bool = False,
+                 format_only: bool = False,
+                 outfile_prefix: Optional[str] = None,
+                 nproc: int = 32,
+                 file_client_args: dict = None,
+                 backend_args: dict = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        if panopticapi is None:
+            raise RuntimeError(
+                'panopticapi is not installed, please install it by: '
+                'pip install git+https://github.com/cocodataset/'
+                'panopticapi.git.')
+
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.classwise = classwise
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, 'outfile_prefix must be not'
+            'None when format_only is True, otherwise the result files will'
+            'be saved to a temp directory which will be cleaned up at the end.'
+
+        self.tmp_dir = None
+        # outfile_prefix should be a prefix of a path which points to a shared
+        # storage when train or test with multi nodes.
+        self.outfile_prefix = outfile_prefix
+        if outfile_prefix is None:
+            self.tmp_dir = tempfile.TemporaryDirectory()
+            self.outfile_prefix = osp.join(self.tmp_dir.name, 'results')
+        # the directory to save predicted panoptic segmentation mask
+        self.seg_out_dir = f'{self.outfile_prefix}.panoptic'
+        self.nproc = nproc
+        self.seg_prefix = seg_prefix
+
+        self.cat_ids = None
+        self.cat2label = None
+
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to'
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+        if ann_file:
+            with get_local_path(
+                    ann_file, backend_args=self.backend_args) as local_path:
+                self._coco_api = COCOPanoptic(local_path)
+            self.categories = self._coco_api.cats
+        else:
+            self._coco_api = None
+            self.categories = None
+
+    def __del__(self) -> None:
+        """Clean up."""
+        if self.tmp_dir is not None:
+            self.tmp_dir.cleanup()
+
+    def gt_to_coco_json(self, gt_dicts: Sequence[dict],
+                        outfile_prefix: str) -> Tuple[str, str]:
+        """Convert ground truth to coco panoptic segmentation format json file.
+
+        Args:
+            gt_dicts (Sequence[dict]): Ground truth of the dataset.
+            outfile_prefix (str): The filename prefix of the json file. If the
+                prefix is "somepath/xxx", the json file will be named
+                "somepath/xxx.gt.json".
+
+        Returns:
+            Tuple[str, str]: The filename of the json file and the name of the\
+                directory which contains panoptic segmentation masks.
+        """
+        assert len(gt_dicts) > 0, 'gt_dicts is empty.'
+        gt_folder = osp.dirname(gt_dicts[0]['seg_map_path'])
+        converted_json_path = f'{outfile_prefix}.gt.json'
+
+        categories = []
+        for id, name in enumerate(self.dataset_meta['classes']):
+            isthing = 1 if name in self.dataset_meta['thing_classes'] else 0
+            categories.append({'id': id, 'name': name, 'isthing': isthing})
+
+        image_infos = []
+        annotations = []
+        for gt_dict in gt_dicts:
+            img_id = gt_dict['image_id']
+            image_info = {
+                'id': img_id,
+                'width': gt_dict['width'],
+                'height': gt_dict['height'],
+                'file_name': osp.split(gt_dict['seg_map_path'])[-1]
+            }
+            image_infos.append(image_info)
+
+            pan_png = mmcv.imread(gt_dict['seg_map_path']).squeeze()
+            pan_png = pan_png[:, :, ::-1]
+            pan_png = rgb2id(pan_png)
+            segments_info = []
+            for segment_info in gt_dict['segments_info']:
+                id = segment_info['id']
+                label = segment_info['category']
+                mask = pan_png == id
+                isthing = categories[label]['isthing']
+                if isthing:
+                    iscrowd = 1 if not segment_info['is_thing'] else 0
+                else:
+                    iscrowd = 0
+
+                new_segment_info = {
+                    'id': id,
+                    'category_id': label,
+                    'isthing': isthing,
+                    'iscrowd': iscrowd,
+                    'area': mask.sum()
+                }
+                segments_info.append(new_segment_info)
+
+            segm_file = image_info['file_name'].replace('.jpg', '.png')
+            annotation = dict(
+                image_id=img_id,
+                segments_info=segments_info,
+                file_name=segm_file)
+            annotations.append(annotation)
+            pan_png = id2rgb(pan_png)
+
+        info = dict(
+            date_created=str(datetime.datetime.now()),
+            description='Coco json file converted by mmdet CocoPanopticMetric.'
+        )
+        coco_json = dict(
+            info=info,
+            images=image_infos,
+            categories=categories,
+            licenses=None,
+        )
+        if len(annotations) > 0:
+            coco_json['annotations'] = annotations
+        dump(coco_json, converted_json_path)
+        return converted_json_path, gt_folder
+
+    def result2json(self, results: Sequence[dict],
+                    outfile_prefix: str) -> Tuple[str, str]:
+        """Dump the panoptic results to a COCO style json file and a directory.
+
+        Args:
+            results (Sequence[dict]): Testing results of the dataset.
+            outfile_prefix (str): The filename prefix of the json files and the
+                directory.
+
+        Returns:
+            Tuple[str, str]: The json file and the directory which contains \
+                panoptic segmentation masks. The filename of the json is
+                "somepath/xxx.panoptic.json" and name of the directory is
+                "somepath/xxx.panoptic".
+        """
+        label2cat = dict((v, k) for (k, v) in self.cat2label.items())
+        pred_annotations = []
+        for idx in range(len(results)):
+            result = results[idx]
+            for segment_info in result['segments_info']:
+                sem_label = segment_info['category_id']
+                # convert sem_label to json label
+                cat_id = label2cat[sem_label]
+                segment_info['category_id'] = label2cat[sem_label]
+                is_thing = self.categories[cat_id]['isthing']
+                segment_info['isthing'] = is_thing
+            pred_annotations.append(result)
+        pan_json_results = dict(annotations=pred_annotations)
+        json_filename = f'{outfile_prefix}.panoptic.json'
+        dump(pan_json_results, json_filename)
+        return json_filename, (
+            self.seg_out_dir
+            if self.tmp_dir is None else tempfile.gettempdir())
+
+    def _parse_predictions(self,
+                           pred: dict,
+                           img_id: int,
+                           segm_file: str,
+                           label2cat=None) -> dict:
+        """Parse panoptic segmentation predictions.
+
+        Args:
+            pred (dict): Panoptic segmentation predictions.
+            img_id (int): Image id.
+            segm_file (str): Segmentation file name.
+            label2cat (dict): Mapping from label to category id.
+                Defaults to None.
+
+        Returns:
+            dict: Parsed predictions.
+        """
+        result = dict()
+        result['img_id'] = img_id
+        # shape (1, H, W) -> (H, W)
+        pan = pred['pred_panoptic_seg']['sem_seg'].cpu().numpy()[0]
+        ignore_index = pred['pred_panoptic_seg'].get(
+            'ignore_index', len(self.dataset_meta['classes']))
+        pan_labels = np.unique(pan)
+        segments_info = []
+        for pan_label in pan_labels:
+            sem_label = pan_label % INSTANCE_OFFSET
+            # We reserve the length of dataset_meta['classes']
+            # and ignore_index for VOID label
+            if sem_label == len(
+                    self.dataset_meta['classes']) or sem_label == ignore_index:
+                continue
+            mask = pan == pan_label
+            area = mask.sum()
+            segments_info.append({
+                'id':
+                int(pan_label),
+                # when ann_file provided, sem_label should be cat_id, otherwise
+                # sem_label should be a continuous id, not the cat_id
+                # defined in dataset
+                'category_id':
+                label2cat[sem_label] if label2cat else sem_label,
+                'area':
+                int(area)
+            })
+        # evaluation script uses 0 for VOID label.
+        pan[pan % INSTANCE_OFFSET == len(self.dataset_meta['classes'])] = VOID
+        pan[pan % INSTANCE_OFFSET == ignore_index] = VOID
+
+        pan = id2rgb(pan).astype(np.uint8)
+        mmcv.imwrite(pan[:, :, ::-1], osp.join(self.seg_out_dir, segm_file))
+        result = {
+            'image_id': img_id,
+            'segments_info': segments_info,
+            'file_name': segm_file
+        }
+
+        return result
+
+    def _compute_batch_pq_stats(self, data_samples: Sequence[dict]):
+        """Process gts and predictions when ``outfile_prefix`` is not set, gts
+        are from dataset or a json file which is defined by ``ann_file``.
+
+        Intermediate results, ``pq_stats``, are computed here and put into
+        ``self.results``.
+        """
+        if self._coco_api is None:
+            categories = dict()
+            for id, name in enumerate(self.dataset_meta['classes']):
+                isthing = 1 if name in self.dataset_meta['thing_classes']\
+                    else 0
+                categories[id] = {'id': id, 'name': name, 'isthing': isthing}
+            label2cat = None
+        else:
+            categories = self.categories
+            cat_ids = self._coco_api.get_cat_ids(
+                cat_names=self.dataset_meta['classes'])
+            label2cat = {i: cat_id for i, cat_id in enumerate(cat_ids)}
+
+        for data_sample in data_samples:
+            # parse pred
+            img_id = data_sample['img_id']
+            segm_file = osp.basename(data_sample['img_path']).replace(
+                '.jpg', '.png')
+            result = self._parse_predictions(
+                pred=data_sample,
+                img_id=img_id,
+                segm_file=segm_file,
+                label2cat=label2cat)
+
+            # parse gt
+            gt = dict()
+            gt['image_id'] = img_id
+            gt['width'] = data_sample['ori_shape'][1]
+            gt['height'] = data_sample['ori_shape'][0]
+            gt['file_name'] = segm_file
+
+            if self._coco_api is None:
+                # get segments_info from data_sample
+                seg_map_path = osp.join(self.seg_prefix, segm_file)
+                pan_png = mmcv.imread(seg_map_path).squeeze()
+                pan_png = pan_png[:, :, ::-1]
+                pan_png = rgb2id(pan_png)
+                segments_info = []
+
+                for segment_info in data_sample['segments_info']:
+                    id = segment_info['id']
+                    label = segment_info['category']
+                    mask = pan_png == id
+                    isthing = categories[label]['isthing']
+                    if isthing:
+                        iscrowd = 1 if not segment_info['is_thing'] else 0
+                    else:
+                        iscrowd = 0
+
+                    new_segment_info = {
+                        'id': id,
+                        'category_id': label,
+                        'isthing': isthing,
+                        'iscrowd': iscrowd,
+                        'area': mask.sum()
+                    }
+                    segments_info.append(new_segment_info)
+            else:
+                # get segments_info from annotation file
+                segments_info = self._coco_api.imgToAnns[img_id]
+
+            gt['segments_info'] = segments_info
+
+            pq_stats = pq_compute_single_core(
+                proc_id=0,
+                annotation_set=[(gt, result)],
+                gt_folder=self.seg_prefix,
+                pred_folder=self.seg_out_dir,
+                categories=categories,
+                backend_args=self.backend_args)
+
+            self.results.append(pq_stats)
+
+    def _process_gt_and_predictions(self, data_samples: Sequence[dict]):
+        """Process gts and predictions when ``outfile_prefix`` is set.
+
+        The predictions will be saved to directory specified by
+        ``outfile_predfix``. The matched pair (gt, result) will be put into
+        ``self.results``.
+        """
+        for data_sample in data_samples:
+            # parse pred
+            img_id = data_sample['img_id']
+            segm_file = osp.basename(data_sample['img_path']).replace(
+                '.jpg', '.png')
+            result = self._parse_predictions(
+                pred=data_sample, img_id=img_id, segm_file=segm_file)
+
+            # parse gt
+            gt = dict()
+            gt['image_id'] = img_id
+            gt['width'] = data_sample['ori_shape'][1]
+            gt['height'] = data_sample['ori_shape'][0]
+
+            if self._coco_api is None:
+                # get segments_info from dataset
+                gt['segments_info'] = data_sample['segments_info']
+                gt['seg_map_path'] = data_sample['seg_map_path']
+
+            self.results.append((gt, result))
+
+    # TODO: data_batch is no longer needed, consider adjusting the
+    #  parameter position
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        # If ``self.tmp_dir`` is none, it will save gt and predictions to
+        # self.results, otherwise, it will compute pq_stats here.
+        if self.tmp_dir is None:
+            self._process_gt_and_predictions(data_samples)
+        else:
+            self._compute_batch_pq_stats(data_samples)
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch. There
+                are two cases:
+
+                - When ``outfile_prefix`` is not provided, the elements in
+                  results are pq_stats which can be summed directly to get PQ.
+                - When ``outfile_prefix`` is provided, the elements in
+                  results are tuples like (gt, pred).
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+                the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        if self.tmp_dir is None:
+            # do evaluation after collect all the results
+
+            # split gt and prediction list
+            gts, preds = zip(*results)
+
+            if self._coco_api is None:
+                # use converted gt json file to initialize coco api
+                logger.info('Converting ground truth to coco format...')
+                coco_json_path, gt_folder = self.gt_to_coco_json(
+                    gt_dicts=gts, outfile_prefix=self.outfile_prefix)
+                self._coco_api = COCOPanoptic(coco_json_path)
+            else:
+                gt_folder = self.seg_prefix
+
+            self.cat_ids = self._coco_api.get_cat_ids(
+                cat_names=self.dataset_meta['classes'])
+            self.cat2label = {
+                cat_id: i
+                for i, cat_id in enumerate(self.cat_ids)
+            }
+            self.img_ids = self._coco_api.get_img_ids()
+            self.categories = self._coco_api.cats
+
+            # convert predictions to coco format and dump to json file
+            json_filename, pred_folder = self.result2json(
+                results=preds, outfile_prefix=self.outfile_prefix)
+
+            if self.format_only:
+                logger.info('results are saved in '
+                            f'{osp.dirname(self.outfile_prefix)}')
+                return dict()
+
+            imgs = self._coco_api.imgs
+            gt_json = self._coco_api.img_ann_map
+            gt_json = [{
+                'image_id': k,
+                'segments_info': v,
+                'file_name': imgs[k]['segm_file']
+            } for k, v in gt_json.items()]
+            pred_json = load(json_filename)
+            pred_json = dict(
+                (el['image_id'], el) for el in pred_json['annotations'])
+
+            # match the gt_anns and pred_anns in the same image
+            matched_annotations_list = []
+            for gt_ann in gt_json:
+                img_id = gt_ann['image_id']
+                if img_id not in pred_json.keys():
+                    raise Exception('no prediction for the image'
+                                    ' with id: {}'.format(img_id))
+                matched_annotations_list.append((gt_ann, pred_json[img_id]))
+
+            pq_stat = pq_compute_multi_core(
+                matched_annotations_list,
+                gt_folder,
+                pred_folder,
+                self.categories,
+                backend_args=self.backend_args,
+                nproc=self.nproc)
+
+        else:
+            # aggregate the results generated in process
+            if self._coco_api is None:
+                categories = dict()
+                for id, name in enumerate(self.dataset_meta['classes']):
+                    isthing = 1 if name in self.dataset_meta[
+                        'thing_classes'] else 0
+                    categories[id] = {
+                        'id': id,
+                        'name': name,
+                        'isthing': isthing
+                    }
+                self.categories = categories
+
+            pq_stat = PQStat()
+            for result in results:
+                pq_stat += result
+
+        metrics = [('All', None), ('Things', True), ('Stuff', False)]
+        pq_results = {}
+
+        for name, isthing in metrics:
+            pq_results[name], classwise_results = pq_stat.pq_average(
+                self.categories, isthing=isthing)
+            if name == 'All':
+                pq_results['classwise'] = classwise_results
+
+        classwise_results = None
+        if self.classwise:
+            classwise_results = {
+                k: v
+                for k, v in zip(self.dataset_meta['classes'],
+                                pq_results['classwise'].values())
+            }
+
+        print_panoptic_table(pq_results, classwise_results, logger=logger)
+        results = parse_pq_results(pq_results)
+
+        return results
+
+
+def parse_pq_results(pq_results: dict) -> dict:
+    """Parse the Panoptic Quality results.
+
+    Args:
+        pq_results (dict): Panoptic Quality results.
+
+    Returns:
+        dict: Panoptic Quality results parsed.
+    """
+    result = dict()
+    result['PQ'] = 100 * pq_results['All']['pq']
+    result['SQ'] = 100 * pq_results['All']['sq']
+    result['RQ'] = 100 * pq_results['All']['rq']
+    result['PQ_th'] = 100 * pq_results['Things']['pq']
+    result['SQ_th'] = 100 * pq_results['Things']['sq']
+    result['RQ_th'] = 100 * pq_results['Things']['rq']
+    result['PQ_st'] = 100 * pq_results['Stuff']['pq']
+    result['SQ_st'] = 100 * pq_results['Stuff']['sq']
+    result['RQ_st'] = 100 * pq_results['Stuff']['rq']
+    return result
+
+
+def print_panoptic_table(
+        pq_results: dict,
+        classwise_results: Optional[dict] = None,
+        logger: Optional[Union['MMLogger', str]] = None) -> None:
+    """Print the panoptic evaluation results table.
+
+    Args:
+        pq_results(dict): The Panoptic Quality results.
+        classwise_results(dict, optional): The classwise Panoptic Quality.
+            results. The keys are class names and the values are metrics.
+            Defaults to None.
+        logger (:obj:`MMLogger` | str, optional): Logger used for printing
+            related information during evaluation. Default: None.
+    """
+
+    headers = ['', 'PQ', 'SQ', 'RQ', 'categories']
+    data = [headers]
+    for name in ['All', 'Things', 'Stuff']:
+        numbers = [
+            f'{(pq_results[name][k] * 100):0.3f}' for k in ['pq', 'sq', 'rq']
+        ]
+        row = [name] + numbers + [pq_results[name]['n']]
+        data.append(row)
+    table = AsciiTable(data)
+    print_log('Panoptic Evaluation Results:\n' + table.table, logger=logger)
+
+    if classwise_results is not None:
+        class_metrics = [(name, ) + tuple(f'{(metrics[k] * 100):0.3f}'
+                                          for k in ['pq', 'sq', 'rq'])
+                         for name, metrics in classwise_results.items()]
+        num_columns = min(8, len(class_metrics) * 4)
+        results_flatten = list(itertools.chain(*class_metrics))
+        headers = ['category', 'PQ', 'SQ', 'RQ'] * (num_columns // 4)
+        results_2d = itertools.zip_longest(
+            *[results_flatten[i::num_columns] for i in range(num_columns)])
+        data = [headers]
+        data += [result for result in results_2d]
+        table = AsciiTable(data)
+        print_log(
+            'Classwise Panoptic Evaluation Results:\n' + table.table,
+            logger=logger)
diff --git a/mmde/mmdet/evaluation/metrics/coco_video_metric.py b/mmde/mmdet/evaluation/metrics/coco_video_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5c75d025a6109762db21a600e3d866764caf1cb
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/coco_video_metric.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Sequence
+
+from mmengine.dist import broadcast_object_list, is_main_process
+
+from mmdet.registry import METRICS
+from .base_video_metric import collect_tracking_results
+from .coco_metric import CocoMetric
+
+
+@METRICS.register_module()
+class CocoVideoMetric(CocoMetric):
+    """COCO evaluation metric.
+
+    Evaluate AR, AP, and mAP for detection tasks including proposal/box
+    detection and instance segmentation. Please refer to
+    https://cocodataset.org/#detection-eval for more details.
+    """
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for track_data_sample in data_samples:
+            video_data_samples = track_data_sample['video_data_samples']
+            ori_video_len = video_data_samples[0].ori_video_length
+            video_len = len(video_data_samples)
+            if ori_video_len == video_len:
+                # video process
+                for frame_id in range(video_len):
+                    img_data_sample = video_data_samples[frame_id].to_dict()
+                    super().process(None, [img_data_sample])
+            else:
+                # image process
+                img_data_sample = video_data_samples[0].to_dict()
+                super().process(None, [img_data_sample])
+
+    def evaluate(self, size: int = 1) -> dict:
+        """Evaluate the model performance of the whole dataset after processing
+        all batches.
+
+        Args:
+            size (int): Length of the entire validation dataset.
+        Returns:
+            dict: Evaluation metrics dict on the val dataset. The keys are the
+            names of the metrics, and the values are corresponding results.
+        """
+        if len(self.results) == 0:
+            warnings.warn(
+                f'{self.__class__.__name__} got empty `self.results`. Please '
+                'ensure that the processed results are properly added into '
+                '`self.results` in `process` method.')
+
+        results = collect_tracking_results(self.results, self.collect_device)
+
+        if is_main_process():
+            _metrics = self.compute_metrics(results)  # type: ignore
+            # Add prefix to metric names
+            if self.prefix:
+                _metrics = {
+                    '/'.join((self.prefix, k)): v
+                    for k, v in _metrics.items()
+                }
+            metrics = [_metrics]
+        else:
+            metrics = [None]  # type: ignore
+
+        broadcast_object_list(metrics)
+
+        # reset the results list
+        self.results.clear()
+        return metrics[0]
diff --git a/mmde/mmdet/evaluation/metrics/crowdhuman_metric.py b/mmde/mmdet/evaluation/metrics/crowdhuman_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..50ac210ae8606bab6cada69418334c113c90fb38
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/crowdhuman_metric.py
@@ -0,0 +1,824 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import json
+import os.path as osp
+import tempfile
+from collections import OrderedDict
+from multiprocessing import Process, Queue
+from typing import Dict, List, Optional, Sequence, Union
+
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.fileio import dump, get_text, load
+from mmengine.logging import MMLogger
+from scipy.sparse import csr_matrix
+from scipy.sparse.csgraph import maximum_bipartite_matching
+
+from mmdet.evaluation.functional.bbox_overlaps import bbox_overlaps
+from mmdet.registry import METRICS
+
+PERSON_CLASSES = ['background', 'person']
+
+
+@METRICS.register_module()
+class CrowdHumanMetric(BaseMetric):
+    """CrowdHuman evaluation metric.
+
+    Evaluate Average Precision (AP), Miss Rate (MR) and Jaccard Index (JI)
+    for detection tasks.
+
+    Args:
+        ann_file (str): Path to the annotation file.
+        metric (str | List[str]): Metrics to be evaluated. Valid metrics
+            include 'AP', 'MR' and 'JI'. Defaults to 'AP'.
+        format_only (bool): Format the output results without perform
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        outfile_prefix (str, optional): The prefix of json files. It includes
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Defaults to None.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+        eval_mode (int): Select the mode of evaluate. Valid mode include
+            0(just body box), 1(just head box) and 2(both of them).
+            Defaults to 0.
+        iou_thres (float): IoU threshold. Defaults to 0.5.
+        compare_matching_method (str, optional): Matching method to compare
+            the detection results with the ground_truth when compute 'AP'
+            and 'MR'.Valid method include VOC and None(CALTECH). Default to
+            None.
+        mr_ref (str): Different parameter selection to calculate MR. Valid
+            ref include CALTECH_-2 and CALTECH_-4. Defaults to CALTECH_-2.
+        num_ji_process (int): The number of processes to evaluation JI.
+            Defaults to 10.
+    """
+    default_prefix: Optional[str] = 'crowd_human'
+
+    def __init__(self,
+                 ann_file: str,
+                 metric: Union[str, List[str]] = ['AP', 'MR', 'JI'],
+                 format_only: bool = False,
+                 outfile_prefix: Optional[str] = None,
+                 file_client_args: dict = None,
+                 backend_args: dict = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 eval_mode: int = 0,
+                 iou_thres: float = 0.5,
+                 compare_matching_method: Optional[str] = None,
+                 mr_ref: str = 'CALTECH_-2',
+                 num_ji_process: int = 10) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        self.ann_file = ann_file
+        # crowdhuman evaluation metrics
+        self.metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['MR', 'AP', 'JI']
+        for metric in self.metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(f"metric should be one of 'MR', 'AP', 'JI',"
+                               f'but got {metric}.')
+
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, 'outfile_prefix must be not'
+            'None when format_only is True, otherwise the result files will'
+            'be saved to a temp directory which will be cleaned up at the end.'
+        self.outfile_prefix = outfile_prefix
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to'
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+        assert eval_mode in [0, 1, 2], \
+            "Unknown eval mode. mr_ref should be one of '0', '1', '2'."
+        assert compare_matching_method is None or \
+               compare_matching_method == 'VOC', \
+               'The alternative compare_matching_method is VOC.' \
+               'This parameter defaults to CALTECH(None)'
+        assert mr_ref == 'CALTECH_-2' or mr_ref == 'CALTECH_-4', \
+            "mr_ref should be one of 'CALTECH_-2', 'CALTECH_-4'."
+        self.eval_mode = eval_mode
+        self.iou_thres = iou_thres
+        self.compare_matching_method = compare_matching_method
+        self.mr_ref = mr_ref
+        self.num_ji_process = num_ji_process
+
+    @staticmethod
+    def results2json(results: Sequence[dict], outfile_prefix: str) -> str:
+        """Dump the detection results to a json file."""
+        result_file_path = f'{outfile_prefix}.json'
+        bbox_json_results = []
+        for i, result in enumerate(results):
+            ann, pred = result
+            dump_dict = dict()
+            dump_dict['ID'] = ann['ID']
+            dump_dict['width'] = ann['width']
+            dump_dict['height'] = ann['height']
+            dtboxes = []
+            bboxes = pred.tolist()
+            for _, single_bbox in enumerate(bboxes):
+                temp_dict = dict()
+                x1, y1, x2, y2, score = single_bbox
+                temp_dict['box'] = [x1, y1, x2 - x1, y2 - y1]
+                temp_dict['score'] = score
+                temp_dict['tag'] = 1
+                dtboxes.append(temp_dict)
+            dump_dict['dtboxes'] = dtboxes
+            bbox_json_results.append(dump_dict)
+        dump(bbox_json_results, result_file_path)
+        return result_file_path
+
+    def process(self, data_batch: Sequence[dict],
+                data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            ann = dict()
+            ann['ID'] = data_sample['img_id']
+            ann['width'] = data_sample['ori_shape'][1]
+            ann['height'] = data_sample['ori_shape'][0]
+            pred_bboxes = data_sample['pred_instances']['bboxes'].cpu().numpy()
+            pred_scores = data_sample['pred_instances']['scores'].cpu().numpy()
+
+            pred_bbox_scores = np.hstack(
+                [pred_bboxes, pred_scores.reshape((-1, 1))])
+
+            self.results.append((ann, pred_bbox_scores))
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            eval_results(Dict[str, float]): The computed metrics.
+            The keys are the names of the metrics, and the values
+            are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        tmp_dir = None
+        if self.outfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            outfile_prefix = osp.join(tmp_dir.name, 'result')
+        else:
+            outfile_prefix = self.outfile_prefix
+
+        # convert predictions to coco format and dump to json file
+        result_file = self.results2json(results, outfile_prefix)
+        eval_results = OrderedDict()
+        if self.format_only:
+            logger.info(f'results are saved in {osp.dirname(outfile_prefix)}')
+            return eval_results
+
+        # load evaluation samples
+        eval_samples = self.load_eval_samples(result_file)
+
+        if 'AP' in self.metrics or 'MR' in self.metrics:
+            score_list = self.compare(eval_samples)
+            gt_num = sum([eval_samples[i].gt_num for i in eval_samples])
+            ign_num = sum([eval_samples[i].ign_num for i in eval_samples])
+            gt_num = gt_num - ign_num
+            img_num = len(eval_samples)
+
+        for metric in self.metrics:
+            logger.info(f'Evaluating {metric}...')
+            if metric == 'AP':
+                AP = self.eval_ap(score_list, gt_num, img_num)
+                eval_results['mAP'] = float(f'{round(AP, 4)}')
+            if metric == 'MR':
+                MR = self.eval_mr(score_list, gt_num, img_num)
+                eval_results['mMR'] = float(f'{round(MR, 4)}')
+            if metric == 'JI':
+                JI = self.eval_ji(eval_samples)
+                eval_results['JI'] = float(f'{round(JI, 4)}')
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+
+        return eval_results
+
+    def load_eval_samples(self, result_file):
+        """Load data from annotations file and detection results.
+
+        Args:
+            result_file (str): The file path of the saved detection results.
+
+        Returns:
+            Dict[Image]: The detection result packaged by Image
+        """
+        gt_str = get_text(
+            self.ann_file, backend_args=self.backend_args).strip().split('\n')
+        gt_records = [json.loads(line) for line in gt_str]
+
+        pred_records = load(result_file, backend_args=self.backend_args)
+        eval_samples = dict()
+        for gt_record, pred_record in zip(gt_records, pred_records):
+            assert gt_record['ID'] == pred_record['ID'], \
+                'please set val_dataloader.sampler.shuffle=False and try again'
+            eval_samples[pred_record['ID']] = Image(self.eval_mode)
+            eval_samples[pred_record['ID']].load(gt_record, 'box', None,
+                                                 PERSON_CLASSES, True)
+            eval_samples[pred_record['ID']].load(pred_record, 'box', None,
+                                                 PERSON_CLASSES, False)
+            eval_samples[pred_record['ID']].clip_all_boader()
+        return eval_samples
+
+    def compare(self, samples):
+        """Match the detection results with the ground_truth.
+
+        Args:
+            samples (dict[Image]): The detection result packaged by Image.
+
+        Returns:
+            score_list(list[tuple[ndarray, int, str]]): Matching result.
+            a list of tuples (dtbox, label, imgID) in the descending
+            sort of dtbox.score.
+        """
+        score_list = list()
+        for id in samples:
+            if self.compare_matching_method == 'VOC':
+                result = samples[id].compare_voc(self.iou_thres)
+            else:
+                result = samples[id].compare_caltech(self.iou_thres)
+            score_list.extend(result)
+        # In the descending sort of dtbox score.
+        score_list.sort(key=lambda x: x[0][-1], reverse=True)
+        return score_list
+
+    @staticmethod
+    def eval_ap(score_list, gt_num, img_num):
+        """Evaluate by average precision.
+
+        Args:
+            score_list(list[tuple[ndarray, int, str]]): Matching result.
+                a list of tuples (dtbox, label, imgID) in the descending
+                sort of dtbox.score.
+            gt_num(int): The number of gt boxes in the entire dataset.
+            img_num(int): The number of images in the entire dataset.
+
+        Returns:
+            ap(float): result of average precision.
+        """
+
+        # calculate general ap score
+        def _calculate_map(_recall, _precision):
+            assert len(_recall) == len(_precision)
+            area = 0
+            for k in range(1, len(_recall)):
+                delta_h = (_precision[k - 1] + _precision[k]) / 2
+                delta_w = _recall[k] - _recall[k - 1]
+                area += delta_w * delta_h
+            return area
+
+        tp, fp = 0.0, 0.0
+        rpX, rpY = list(), list()
+
+        fpn = []
+        recalln = []
+        thr = []
+        fppi = []
+        for i, item in enumerate(score_list):
+            if item[1] == 1:
+                tp += 1.0
+            elif item[1] == 0:
+                fp += 1.0
+            fn = gt_num - tp
+            recall = tp / (tp + fn)
+            precision = tp / (tp + fp)
+            rpX.append(recall)
+            rpY.append(precision)
+            fpn.append(fp)
+            recalln.append(tp)
+            thr.append(item[0][-1])
+            fppi.append(fp / img_num)
+
+        ap = _calculate_map(rpX, rpY)
+        return ap
+
+    def eval_mr(self, score_list, gt_num, img_num):
+        """Evaluate by Caltech-style log-average miss rate.
+
+        Args:
+            score_list(list[tuple[ndarray, int, str]]): Matching result.
+                a list of tuples (dtbox, label, imgID) in the descending
+                sort of dtbox.score.
+            gt_num(int): The number of gt boxes in the entire dataset.
+            img_num(int): The number of image in the entire dataset.
+
+        Returns:
+            mr(float): result of miss rate.
+        """
+
+        # find greater_than
+        def _find_gt(lst, target):
+            for idx, _item in enumerate(lst):
+                if _item >= target:
+                    return idx
+            return len(lst) - 1
+
+        if self.mr_ref == 'CALTECH_-2':
+            # CALTECH_MRREF_2: anchor points (from 10^-2 to 1) as in
+            # P.Dollar's paper
+            ref = [
+                0.0100, 0.0178, 0.03160, 0.0562, 0.1000, 0.1778, 0.3162,
+                0.5623, 1.000
+            ]
+        else:
+            # CALTECH_MRREF_4: anchor points (from 10^-4 to 1) as in
+            # S.Zhang's paper
+            ref = [
+                0.0001, 0.0003, 0.00100, 0.0032, 0.0100, 0.0316, 0.1000,
+                0.3162, 1.000
+            ]
+
+        tp, fp = 0.0, 0.0
+        fppiX, fppiY = list(), list()
+        for i, item in enumerate(score_list):
+            if item[1] == 1:
+                tp += 1.0
+            elif item[1] == 0:
+                fp += 1.0
+
+            fn = gt_num - tp
+            recall = tp / (tp + fn)
+            missrate = 1.0 - recall
+            fppi = fp / img_num
+            fppiX.append(fppi)
+            fppiY.append(missrate)
+
+        score = list()
+        for pos in ref:
+            argmin = _find_gt(fppiX, pos)
+            if argmin >= 0:
+                score.append(fppiY[argmin])
+        score = np.array(score)
+        mr = np.exp(np.log(score).mean())
+        return mr
+
+    def eval_ji(self, samples):
+        """Evaluate by JI using multi_process.
+
+        Args:
+            samples(Dict[str, Image]): The detection result packaged by Image.
+
+        Returns:
+            ji(float): result of jaccard index.
+        """
+        import math
+        res_line = []
+        res_ji = []
+        for i in range(10):
+            score_thr = 1e-1 * i
+            total = len(samples)
+            stride = math.ceil(total / self.num_ji_process)
+            result_queue = Queue(10000)
+            results, procs = [], []
+            records = list(samples.items())
+            for i in range(self.num_ji_process):
+                start = i * stride
+                end = np.min([start + stride, total])
+                sample_data = dict(records[start:end])
+                p = Process(
+                    target=self.compute_ji_with_ignore,
+                    args=(result_queue, sample_data, score_thr))
+                p.start()
+                procs.append(p)
+            for i in range(total):
+                t = result_queue.get()
+                results.append(t)
+            for p in procs:
+                p.join()
+            line, mean_ratio = self.gather(results)
+            line = 'score_thr:{:.1f}, {}'.format(score_thr, line)
+            res_line.append(line)
+            res_ji.append(mean_ratio)
+        return max(res_ji)
+
+    def compute_ji_with_ignore(self, result_queue, dt_result, score_thr):
+        """Compute JI with ignore.
+
+        Args:
+            result_queue(Queue): The Queue for save compute result when
+                multi_process.
+            dt_result(dict[Image]): Detection result packaged by Image.
+            score_thr(float): The threshold of detection score.
+        Returns:
+            dict: compute result.
+        """
+        for ID, record in dt_result.items():
+            gt_boxes = record.gt_boxes
+            dt_boxes = record.dt_boxes
+            keep = dt_boxes[:, -1] > score_thr
+            dt_boxes = dt_boxes[keep][:, :-1]
+
+            gt_tag = np.array(gt_boxes[:, -1] != -1)
+            matches = self.compute_ji_matching(dt_boxes, gt_boxes[gt_tag, :4])
+            # get the unmatched_indices
+            matched_indices = np.array([j for (j, _) in matches])
+            unmatched_indices = list(
+                set(np.arange(dt_boxes.shape[0])) - set(matched_indices))
+            num_ignore_dt = self.get_ignores(dt_boxes[unmatched_indices],
+                                             gt_boxes[~gt_tag, :4])
+            matched_indices = np.array([j for (_, j) in matches])
+            unmatched_indices = list(
+                set(np.arange(gt_boxes[gt_tag].shape[0])) -
+                set(matched_indices))
+            num_ignore_gt = self.get_ignores(
+                gt_boxes[gt_tag][unmatched_indices], gt_boxes[~gt_tag, :4])
+            # compute results
+            eps = 1e-6
+            k = len(matches)
+            m = gt_tag.sum() - num_ignore_gt
+            n = dt_boxes.shape[0] - num_ignore_dt
+            ratio = k / (m + n - k + eps)
+            recall = k / (m + eps)
+            cover = k / (n + eps)
+            noise = 1 - cover
+            result_dict = dict(
+                ratio=ratio,
+                recall=recall,
+                cover=cover,
+                noise=noise,
+                k=k,
+                m=m,
+                n=n)
+            result_queue.put_nowait(result_dict)
+
+    @staticmethod
+    def gather(results):
+        """Integrate test results."""
+        assert len(results)
+        img_num = 0
+        for result in results:
+            if result['n'] != 0 or result['m'] != 0:
+                img_num += 1
+        mean_ratio = np.sum([rb['ratio'] for rb in results]) / img_num
+        valids = np.sum([rb['k'] for rb in results])
+        total = np.sum([rb['n'] for rb in results])
+        gtn = np.sum([rb['m'] for rb in results])
+        line = 'mean_ratio:{:.4f}, valids:{}, total:{}, gtn:{}'\
+            .format(mean_ratio, valids, total, gtn)
+        return line, mean_ratio
+
+    def compute_ji_matching(self, dt_boxes, gt_boxes):
+        """Match the annotation box for each detection box.
+
+        Args:
+            dt_boxes(ndarray): Detection boxes.
+            gt_boxes(ndarray): Ground_truth boxes.
+
+        Returns:
+            matches_(list[tuple[int, int]]): Match result.
+        """
+        assert dt_boxes.shape[-1] > 3 and gt_boxes.shape[-1] > 3
+        if dt_boxes.shape[0] < 1 or gt_boxes.shape[0] < 1:
+            return list()
+
+        ious = bbox_overlaps(dt_boxes, gt_boxes, mode='iou')
+        input_ = copy.deepcopy(ious)
+        input_[input_ < self.iou_thres] = 0
+        match_scipy = maximum_bipartite_matching(
+            csr_matrix(input_), perm_type='column')
+        matches_ = []
+        for i in range(len(match_scipy)):
+            if match_scipy[i] != -1:
+                matches_.append((i, int(match_scipy[i])))
+        return matches_
+
+    def get_ignores(self, dt_boxes, gt_boxes):
+        """Get the number of ignore bboxes."""
+        if gt_boxes.size:
+            ioas = bbox_overlaps(dt_boxes, gt_boxes, mode='iof')
+            ioas = np.max(ioas, axis=1)
+            rows = np.where(ioas > self.iou_thres)[0]
+            return len(rows)
+        else:
+            return 0
+
+
+class Image(object):
+    """Data structure for evaluation of CrowdHuman.
+
+    Note:
+        This implementation is modified from https://github.com/Purkialo/
+        CrowdDet/blob/master/lib/evaluate/APMRToolkits/image.py
+
+    Args:
+        mode (int): Select the mode of evaluate. Valid mode include
+            0(just body box), 1(just head box) and 2(both of them).
+            Defaults to 0.
+    """
+
+    def __init__(self, mode):
+        self.ID = None
+        self.width = None
+        self.height = None
+        self.dt_boxes = None
+        self.gt_boxes = None
+        self.eval_mode = mode
+
+        self.ign_num = None
+        self.gt_num = None
+        self.dt_num = None
+
+    def load(self, record, body_key, head_key, class_names, gt_flag):
+        """Loading information for evaluation.
+
+        Args:
+            record (dict): Label information or test results.
+                The format might look something like this:
+                {
+                    'ID': '273271,c9db000d5146c15',
+                    'gtboxes': [
+                        {'fbox': [72, 202, 163, 503], 'tag': 'person', ...},
+                        {'fbox': [199, 180, 144, 499], 'tag': 'person', ...},
+                        ...
+                    ]
+                }
+                or:
+                {
+                    'ID': '273271,c9db000d5146c15',
+                    'width': 800,
+                    'height': 1067,
+                    'dtboxes': [
+                        {
+                            'box': [306.22, 205.95, 164.05, 394.04],
+                            'score': 0.99,
+                            'tag': 1
+                        },
+                        {
+                            'box': [403.60, 178.66, 157.15, 421.33],
+                            'score': 0.99,
+                            'tag': 1
+                        },
+                        ...
+                    ]
+                }
+            body_key (str, None): key of detection body box.
+                Valid when loading detection results and self.eval_mode!=1.
+            head_key (str, None): key of detection head box.
+                Valid when loading detection results and self.eval_mode!=0.
+            class_names (list[str]):class names of data set.
+                Defaults to ['background', 'person'].
+            gt_flag (bool): Indicate whether record is ground truth
+                or predicting the outcome.
+        """
+        if 'ID' in record and self.ID is None:
+            self.ID = record['ID']
+        if 'width' in record and self.width is None:
+            self.width = record['width']
+        if 'height' in record and self.height is None:
+            self.height = record['height']
+        if gt_flag:
+            self.gt_num = len(record['gtboxes'])
+            body_bbox, head_bbox = self.load_gt_boxes(record, 'gtboxes',
+                                                      class_names)
+            if self.eval_mode == 0:
+                self.gt_boxes = body_bbox
+                self.ign_num = (body_bbox[:, -1] == -1).sum()
+            elif self.eval_mode == 1:
+                self.gt_boxes = head_bbox
+                self.ign_num = (head_bbox[:, -1] == -1).sum()
+            else:
+                gt_tag = np.array([
+                    body_bbox[i, -1] != -1 and head_bbox[i, -1] != -1
+                    for i in range(len(body_bbox))
+                ])
+                self.ign_num = (gt_tag == 0).sum()
+                self.gt_boxes = np.hstack(
+                    (body_bbox[:, :-1], head_bbox[:, :-1],
+                     gt_tag.reshape(-1, 1)))
+
+        if not gt_flag:
+            self.dt_num = len(record['dtboxes'])
+            if self.eval_mode == 0:
+                self.dt_boxes = self.load_det_boxes(record, 'dtboxes',
+                                                    body_key, 'score')
+            elif self.eval_mode == 1:
+                self.dt_boxes = self.load_det_boxes(record, 'dtboxes',
+                                                    head_key, 'score')
+            else:
+                body_dtboxes = self.load_det_boxes(record, 'dtboxes', body_key,
+                                                   'score')
+                head_dtboxes = self.load_det_boxes(record, 'dtboxes', head_key,
+                                                   'score')
+                self.dt_boxes = np.hstack((body_dtboxes, head_dtboxes))
+
+    @staticmethod
+    def load_gt_boxes(dict_input, key_name, class_names):
+        """load ground_truth and transform [x, y, w, h] to [x1, y1, x2, y2]"""
+        assert key_name in dict_input
+        if len(dict_input[key_name]) < 1:
+            return np.empty([0, 5])
+        head_bbox = []
+        body_bbox = []
+        for rb in dict_input[key_name]:
+            if rb['tag'] in class_names:
+                body_tag = class_names.index(rb['tag'])
+                head_tag = copy.deepcopy(body_tag)
+            else:
+                body_tag = -1
+                head_tag = -1
+            if 'extra' in rb:
+                if 'ignore' in rb['extra']:
+                    if rb['extra']['ignore'] != 0:
+                        body_tag = -1
+                        head_tag = -1
+            if 'head_attr' in rb:
+                if 'ignore' in rb['head_attr']:
+                    if rb['head_attr']['ignore'] != 0:
+                        head_tag = -1
+            head_bbox.append(np.hstack((rb['hbox'], head_tag)))
+            body_bbox.append(np.hstack((rb['fbox'], body_tag)))
+        head_bbox = np.array(head_bbox)
+        head_bbox[:, 2:4] += head_bbox[:, :2]
+        body_bbox = np.array(body_bbox)
+        body_bbox[:, 2:4] += body_bbox[:, :2]
+        return body_bbox, head_bbox
+
+    @staticmethod
+    def load_det_boxes(dict_input, key_name, key_box, key_score, key_tag=None):
+        """load detection boxes."""
+        assert key_name in dict_input
+        if len(dict_input[key_name]) < 1:
+            return np.empty([0, 5])
+        else:
+            assert key_box in dict_input[key_name][0]
+            if key_score:
+                assert key_score in dict_input[key_name][0]
+            if key_tag:
+                assert key_tag in dict_input[key_name][0]
+        if key_score:
+            if key_tag:
+                bboxes = np.vstack([
+                    np.hstack((rb[key_box], rb[key_score], rb[key_tag]))
+                    for rb in dict_input[key_name]
+                ])
+            else:
+                bboxes = np.vstack([
+                    np.hstack((rb[key_box], rb[key_score]))
+                    for rb in dict_input[key_name]
+                ])
+        else:
+            if key_tag:
+                bboxes = np.vstack([
+                    np.hstack((rb[key_box], rb[key_tag]))
+                    for rb in dict_input[key_name]
+                ])
+            else:
+                bboxes = np.vstack(
+                    [rb[key_box] for rb in dict_input[key_name]])
+        bboxes[:, 2:4] += bboxes[:, :2]
+        return bboxes
+
+    def clip_all_boader(self):
+        """Make sure boxes are within the image range."""
+
+        def _clip_boundary(boxes, height, width):
+            assert boxes.shape[-1] >= 4
+            boxes[:, 0] = np.minimum(np.maximum(boxes[:, 0], 0), width - 1)
+            boxes[:, 1] = np.minimum(np.maximum(boxes[:, 1], 0), height - 1)
+            boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], width), 0)
+            boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], height), 0)
+            return boxes
+
+        assert self.dt_boxes.shape[-1] >= 4
+        assert self.gt_boxes.shape[-1] >= 4
+        assert self.width is not None and self.height is not None
+        if self.eval_mode == 2:
+            self.dt_boxes[:, :4] = _clip_boundary(self.dt_boxes[:, :4],
+                                                  self.height, self.width)
+            self.gt_boxes[:, :4] = _clip_boundary(self.gt_boxes[:, :4],
+                                                  self.height, self.width)
+            self.dt_boxes[:, 4:8] = _clip_boundary(self.dt_boxes[:, 4:8],
+                                                   self.height, self.width)
+            self.gt_boxes[:, 4:8] = _clip_boundary(self.gt_boxes[:, 4:8],
+                                                   self.height, self.width)
+        else:
+            self.dt_boxes = _clip_boundary(self.dt_boxes, self.height,
+                                           self.width)
+            self.gt_boxes = _clip_boundary(self.gt_boxes, self.height,
+                                           self.width)
+
+    def compare_voc(self, thres):
+        """Match the detection results with the ground_truth by VOC.
+
+        Args:
+            thres (float): IOU threshold.
+
+        Returns:
+            score_list(list[tuple[ndarray, int, str]]): Matching result.
+            a list of tuples (dtbox, label, imgID) in the descending
+            sort of dtbox.score.
+        """
+        if self.dt_boxes is None:
+            return list()
+        dtboxes = self.dt_boxes
+        gtboxes = self.gt_boxes if self.gt_boxes is not None else list()
+        dtboxes.sort(key=lambda x: x.score, reverse=True)
+        gtboxes.sort(key=lambda x: x.ign)
+
+        score_list = list()
+        for i, dt in enumerate(dtboxes):
+            maxpos = -1
+            maxiou = thres
+
+            for j, gt in enumerate(gtboxes):
+                overlap = dt.iou(gt)
+                if overlap > maxiou:
+                    maxiou = overlap
+                    maxpos = j
+
+            if maxpos >= 0:
+                if gtboxes[maxpos].ign == 0:
+                    gtboxes[maxpos].matched = 1
+                    dtboxes[i].matched = 1
+                    score_list.append((dt, self.ID))
+                else:
+                    dtboxes[i].matched = -1
+            else:
+                dtboxes[i].matched = 0
+                score_list.append((dt, self.ID))
+        return score_list
+
+    def compare_caltech(self, thres):
+        """Match the detection results with the ground_truth by Caltech
+        matching strategy.
+
+        Args:
+            thres (float): IOU threshold.
+
+        Returns:
+            score_list(list[tuple[ndarray, int, str]]): Matching result.
+            a list of tuples (dtbox, label, imgID) in the descending
+            sort of dtbox.score.
+        """
+        if self.dt_boxes is None or self.gt_boxes is None:
+            return list()
+
+        dtboxes = self.dt_boxes if self.dt_boxes is not None else list()
+        gtboxes = self.gt_boxes if self.gt_boxes is not None else list()
+        dt_matched = np.zeros(dtboxes.shape[0])
+        gt_matched = np.zeros(gtboxes.shape[0])
+
+        dtboxes = np.array(sorted(dtboxes, key=lambda x: x[-1], reverse=True))
+        gtboxes = np.array(sorted(gtboxes, key=lambda x: x[-1], reverse=True))
+        if len(dtboxes):
+            overlap_iou = bbox_overlaps(dtboxes, gtboxes, mode='iou')
+            overlap_ioa = bbox_overlaps(dtboxes, gtboxes, mode='iof')
+        else:
+            return list()
+
+        score_list = list()
+        for i, dt in enumerate(dtboxes):
+            maxpos = -1
+            maxiou = thres
+            for j, gt in enumerate(gtboxes):
+                if gt_matched[j] == 1:
+                    continue
+                if gt[-1] > 0:
+                    overlap = overlap_iou[i][j]
+                    if overlap > maxiou:
+                        maxiou = overlap
+                        maxpos = j
+                else:
+                    if maxpos >= 0:
+                        break
+                    else:
+                        overlap = overlap_ioa[i][j]
+                        if overlap > thres:
+                            maxiou = overlap
+                            maxpos = j
+            if maxpos >= 0:
+                if gtboxes[maxpos, -1] > 0:
+                    gt_matched[maxpos] = 1
+                    dt_matched[i] = 1
+                    score_list.append((dt, 1, self.ID))
+                else:
+                    dt_matched[i] = -1
+            else:
+                dt_matched[i] = 0
+                score_list.append((dt, 0, self.ID))
+        return score_list
diff --git a/mmde/mmdet/evaluation/metrics/dod_metric.py b/mmde/mmdet/evaluation/metrics/dod_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..b47d07219dad112a336123444e58c72978953439
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/dod_metric.py
@@ -0,0 +1,169 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import defaultdict
+from typing import List, Optional, Sequence
+
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.fileio import get_local_path
+from mmengine.logging import MMLogger
+
+from mmdet.datasets.api_wrappers import COCO, COCOeval
+from mmdet.registry import METRICS
+
+
+@METRICS.register_module()
+class DODCocoMetric(BaseMetric):
+
+    default_prefix: Optional[str] = 'dod'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 collect_device: str = 'cpu',
+                 outfile_prefix: Optional[str] = None,
+                 backend_args: dict = None,
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.outfile_prefix = outfile_prefix
+        with get_local_path(ann_file, backend_args=backend_args) as local_path:
+            self._coco_api = COCO(local_path)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        for data_sample in data_samples:
+            result = dict()
+            pred = data_sample['pred_instances']
+            result['img_id'] = data_sample['img_id']
+            result['bboxes'] = pred['bboxes'].cpu().numpy()
+            result['scores'] = pred['scores'].cpu().numpy()
+
+            result['labels'] = pred['labels'].cpu().numpy()
+            result['labels'] = data_sample['sent_ids'][result['labels']]
+            self.results.append(result)
+
+    def xyxy2xywh(self, bbox: np.ndarray) -> list:
+        """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO
+        evaluation.
+
+        Args:
+            bbox (numpy.ndarray): The bounding boxes, shape (4, ), in
+                ``xyxy`` order.
+
+        Returns:
+            list[float]: The converted bounding boxes, in ``xywh`` order.
+        """
+
+        _bbox: List = bbox.tolist()
+        return [
+            _bbox[0],
+            _bbox[1],
+            _bbox[2] - _bbox[0],
+            _bbox[3] - _bbox[1],
+        ]
+
+    def results2json(self, results: Sequence[dict]) -> list:
+        """Dump the detection results to a COCO style json file.
+
+        There are 3 types of results: proposals, bbox predictions, mask
+        predictions, and they have different data types. This method will
+        automatically recognize the type, and dump them to json files.
+
+        Args:
+            results (Sequence[dict]): Testing results of the
+                dataset.
+
+        Returns:
+            dict: Possible keys are "bbox", "segm", "proposal", and
+            values are corresponding filenames.
+        """
+        bbox_json_results = []
+        for idx, result in enumerate(results):
+            image_id = result.get('img_id', idx)
+            labels = result['labels']
+            bboxes = result['bboxes']
+            scores = result['scores']
+            for i, label in enumerate(labels):
+                data = dict()
+                data['image_id'] = image_id
+                data['bbox'] = self.xyxy2xywh(bboxes[i])
+                data['score'] = float(scores[i])
+                data['category_id'] = label
+                bbox_json_results.append(data)
+        return bbox_json_results
+
+    def compute_metrics(self, results: list) -> dict:
+        logger: MMLogger = MMLogger.get_current_instance()
+        result_files = self.results2json(results)
+        d3_res = self._coco_api.loadRes(result_files)
+        cocoEval = COCOeval(self._coco_api, d3_res, 'bbox')
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+        cocoEval.summarize()
+
+        aps = cocoEval.eval['precision'][:, :, :, 0, -1]
+        category_ids = self._coco_api.getCatIds()
+        category_names = [
+            cat['name'] for cat in self._coco_api.loadCats(category_ids)
+        ]
+
+        aps_lens = defaultdict(list)
+        counter_lens = defaultdict(int)
+        for i in range(len(category_names)):
+            ap = aps[:, :, i]
+            ap_value = ap[ap > -1].mean()
+            if not np.isnan(ap_value):
+                len_ref = len(category_names[i].split(' '))
+                aps_lens[len_ref].append(ap_value)
+                counter_lens[len_ref] += 1
+
+        ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)])
+        ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)])
+        ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)])
+        ap_sum_very_long = sum([
+            sum(aps_lens[i]) for i in range(10,
+                                            max(counter_lens.keys()) + 1)
+        ])
+        c_sum_short = sum([counter_lens[i] for i in range(1, 4)])
+        c_sum_mid = sum([counter_lens[i] for i in range(4, 7)])
+        c_sum_long = sum([counter_lens[i] for i in range(7, 10)])
+        c_sum_very_long = sum(
+            [counter_lens[i] for i in range(10,
+                                            max(counter_lens.keys()) + 1)])
+        map_short = ap_sum_short / c_sum_short
+        map_mid = ap_sum_mid / c_sum_mid
+        map_long = ap_sum_long / c_sum_long
+        map_very_long = ap_sum_very_long / c_sum_very_long
+
+        coco_metric_names = {
+            'mAP': 0,
+            'mAP_50': 1,
+            'mAP_75': 2,
+            'mAP_s': 3,
+            'mAP_m': 4,
+            'mAP_l': 5,
+            'AR@100': 6,
+            'AR@300': 7,
+            'AR@1000': 8,
+            'AR_s@1000': 9,
+            'AR_m@1000': 10,
+            'AR_l@1000': 11
+        }
+        metric_items = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l']
+
+        eval_results = {}
+        for metric_item in metric_items:
+            key = f'{metric_item}'
+            val = cocoEval.stats[coco_metric_names[metric_item]]
+            eval_results[key] = float(f'{round(val, 3)}')
+
+        ap = cocoEval.stats[:6]
+        logger.info(f'mAP_copypaste: {ap[0]:.3f} '
+                    f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
+                    f'{ap[4]:.3f} {ap[5]:.3f}')
+
+        logger.info(f'mAP over reference length: short - {map_short:.4f}, '
+                    f'mid - {map_mid:.4f}, long - {map_long:.4f}, '
+                    f'very long - {map_very_long:.4f}')
+        eval_results['mAP_short'] = float(f'{round(map_short, 3)}')
+        eval_results['mAP_mid'] = float(f'{round(map_mid, 3)}')
+        eval_results['mAP_long'] = float(f'{round(map_long, 3)}')
+        eval_results['mAP_very_long'] = float(f'{round(map_very_long, 3)}')
+        return eval_results
diff --git a/mmde/mmdet/evaluation/metrics/dump_det_results.py b/mmde/mmdet/evaluation/metrics/dump_det_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3071d19a6ad0199458d13dfe6f570f181a5ea7f
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/dump_det_results.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Sequence
+
+from mmengine.evaluator import DumpResults
+from mmengine.evaluator.metric import _to_cpu
+
+from mmdet.registry import METRICS
+from mmdet.structures.mask import encode_mask_results
+
+
+@METRICS.register_module()
+class DumpDetResults(DumpResults):
+    """Dump model predictions to a pickle file for offline evaluation.
+
+    Different from `DumpResults` in MMEngine, it compresses instance
+    segmentation masks into RLE format.
+
+    Args:
+        out_file_path (str): Path of the dumped file. Must end with '.pkl'
+            or '.pickle'.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+    """
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """transfer tensors in predictions to CPU."""
+        data_samples = _to_cpu(data_samples)
+        for data_sample in data_samples:
+            # remove gt
+            data_sample.pop('gt_instances', None)
+            data_sample.pop('ignored_instances', None)
+            data_sample.pop('gt_panoptic_seg', None)
+
+            if 'pred_instances' in data_sample:
+                pred = data_sample['pred_instances']
+                # encode mask to RLE
+                if 'masks' in pred:
+                    pred['masks'] = encode_mask_results(pred['masks'].numpy())
+            if 'pred_panoptic_seg' in data_sample:
+                warnings.warn(
+                    'Panoptic segmentation map will not be compressed. '
+                    'The dumped file will be extremely large! '
+                    'Suggest using `CocoPanopticMetric` to save the coco '
+                    'format json and segmentation png files directly.')
+        self.results.extend(data_samples)
diff --git a/mmde/mmdet/evaluation/metrics/dump_odvg_results.py b/mmde/mmdet/evaluation/metrics/dump_odvg_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1446b0538053e14b6b9b21bebc6d91c9564d9b5
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/dump_odvg_results.py
@@ -0,0 +1,138 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any, Optional, Sequence
+
+from mmcv.ops import batched_nms
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import print_log
+
+from mmdet.registry import METRICS
+
+try:
+    import jsonlines
+except ImportError:
+    jsonlines = None
+
+
+@METRICS.register_module()
+class DumpODVGResults(BaseMetric):
+    default_prefix: Optional[str] = 'pl_odvg'
+
+    def __init__(self,
+                 outfile_path,
+                 img_prefix: str,
+                 score_thr: float = 0.1,
+                 collect_device: str = 'cpu',
+                 nms_thr: float = 0.5,
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.outfile_path = outfile_path
+        self.score_thr = score_thr
+        self.img_prefix = img_prefix
+        self.nms_thr = nms_thr
+
+        if jsonlines is None:
+            raise ImportError('Please run "pip install jsonlines" to install '
+                              'this package.')
+
+    def process(self, data_batch: Any, data_samples: Sequence[dict]) -> None:
+        for data_sample in data_samples:
+            result = {}
+
+            filename = data_sample['img_path']
+            filename = filename.replace(self.img_prefix, '')
+            if filename.startswith('/'):
+                filename = filename[1:]
+            result['filename'] = filename
+
+            height = data_sample['ori_shape'][0]
+            width = data_sample['ori_shape'][1]
+            result['height'] = height
+            result['width'] = width
+
+            pred_instances = data_sample['pred_instances']
+
+            bboxes = pred_instances['bboxes'].cpu()
+            scores = pred_instances['scores'].cpu()
+            labels = pred_instances['labels'].cpu()
+
+            bboxes = bboxes[scores > self.score_thr]
+            labels = labels[scores > self.score_thr]
+            scores = scores[scores > self.score_thr]
+
+            if 'tokens_positive' in data_sample:
+                task = 'vg'
+            else:
+                task = 'od'
+
+            if task == 'od':
+                classes_name = data_sample['text']
+                result['detection'] = {}
+
+                if len(bboxes) > 0:
+                    det_bboxes, keep = batched_nms(
+                        bboxes, scores, labels,
+                        dict(type='nms', iou_threshold=self.nms_thr))
+                    _scores = det_bboxes[:, -1]
+                    _bboxes = det_bboxes[:, :-1]
+                    _labels = labels[keep]
+
+                    instances = []
+                    _bboxes = _bboxes.numpy().tolist()
+                    _scores = _scores.numpy().tolist()
+                    _labels = _labels.numpy().tolist()
+                    for bbox, score, label in zip(_bboxes, _scores, _labels):
+                        round_bbox = [round(b, 2) for b in bbox]
+                        round_score = round(score, 2)
+                        instances.append({
+                            'bbox': round_bbox,
+                            'score': round_score,
+                            'label': label,
+                            'category': classes_name[label]
+                        })
+                    result['detection']['instances'] = instances
+                else:
+                    result['detection']['instances'] = []
+                self.results.append(result)
+            else:
+                caption = data_sample['text']
+                result['grounding'] = {}
+                result['grounding']['caption'] = caption
+
+                tokens_positive = data_sample['tokens_positive']
+
+                region_list = []
+                for label, positive in enumerate(tokens_positive):
+                    phrase = [caption[pos[0]:pos[1]] for pos in positive]
+
+                    _bboxes = bboxes[labels == label]
+                    _scores = scores[labels == label]
+                    det_bboxes, _ = batched_nms(
+                        _bboxes,
+                        _scores,
+                        None,
+                        dict(type='nms', iou_threshold=self.nms_thr),
+                        class_agnostic=True)
+                    _scores = det_bboxes[:, -1].numpy().tolist()
+                    _bboxes = det_bboxes[:, :-1].numpy().tolist()
+
+                    round_bboxes = []
+                    for bbox in _bboxes:
+                        round_bboxes.append([round(b, 2) for b in bbox])
+                    _scores = [[round(s, 2) for s in _scores]]
+                    region = {
+                        'phrase': phrase,
+                        'bbox': round_bboxes,
+                        'score': _scores,
+                        'tokens_positive': positive
+                    }
+                    region_list.append(region)
+                result['grounding']['regions'] = region_list
+                self.results.append(result)
+
+    def compute_metrics(self, results: list) -> dict:
+        with jsonlines.open(self.outfile_path, mode='w') as writer:
+            writer.write_all(results)
+        print_log(
+            f'Results has been saved to {self.outfile_path}.',
+            logger='current')
+        return {}
diff --git a/mmde/mmdet/evaluation/metrics/dump_proposals_metric.py b/mmde/mmdet/evaluation/metrics/dump_proposals_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e9c53654c15d4b1f7e6555a9a7c53f844cb071f
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/dump_proposals_metric.py
@@ -0,0 +1,119 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+from typing import Optional, Sequence
+
+from mmengine.dist import is_main_process
+from mmengine.evaluator import BaseMetric
+from mmengine.fileio import dump
+from mmengine.logging import MMLogger
+from mmengine.structures import InstanceData
+
+from mmdet.registry import METRICS
+
+
+@METRICS.register_module()
+class DumpProposals(BaseMetric):
+    """Dump proposals pseudo metric.
+
+    Args:
+        output_dir (str): The root directory for ``proposals_file``.
+            Defaults to ''.
+        proposals_file (str): Proposals file path. Defaults to 'proposals.pkl'.
+        num_max_proposals (int, optional): Maximum number of proposals to dump.
+            If not specified, all proposals will be dumped.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+
+    default_prefix: Optional[str] = 'dump_proposals'
+
+    def __init__(self,
+                 output_dir: str = '',
+                 proposals_file: str = 'proposals.pkl',
+                 num_max_proposals: Optional[int] = None,
+                 file_client_args: dict = None,
+                 backend_args: dict = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.num_max_proposals = num_max_proposals
+        # TODO: update after mmengine finish refactor fileio.
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to'
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+        self.output_dir = output_dir
+        assert proposals_file.endswith(('.pkl', '.pickle')), \
+            'The output file must be a pkl file.'
+
+        self.proposals_file = os.path.join(self.output_dir, proposals_file)
+        if is_main_process():
+            os.makedirs(self.output_dir, exist_ok=True)
+
+    def process(self, data_batch: Sequence[dict],
+                data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            pred = data_sample['pred_instances']
+            # `bboxes` is sorted by `scores`
+            ranked_scores, rank_inds = pred['scores'].sort(descending=True)
+            ranked_bboxes = pred['bboxes'][rank_inds, :]
+
+            ranked_bboxes = ranked_bboxes.cpu().numpy()
+            ranked_scores = ranked_scores.cpu().numpy()
+
+            pred_instance = InstanceData()
+            pred_instance.bboxes = ranked_bboxes
+            pred_instance.scores = ranked_scores
+            if self.num_max_proposals is not None:
+                pred_instance = pred_instance[:self.num_max_proposals]
+
+            img_path = data_sample['img_path']
+            # `file_name` is the key to obtain the proposals from the
+            # `proposals_list`.
+            file_name = osp.join(
+                osp.split(osp.split(img_path)[0])[-1],
+                osp.split(img_path)[-1])
+            result = {file_name: pred_instance}
+            self.results.append(result)
+
+    def compute_metrics(self, results: list) -> dict:
+        """Dump the processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            dict: An empty dict.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        dump_results = {}
+        for result in results:
+            dump_results.update(result)
+        dump(
+            dump_results,
+            file=self.proposals_file,
+            backend_args=self.backend_args)
+        logger.info(f'Results are saved at {self.proposals_file}')
+        return {}
diff --git a/mmde/mmdet/evaluation/metrics/flickr30k_metric.py b/mmde/mmdet/evaluation/metrics/flickr30k_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8b64bfda46b3e8cc4a1053d10082eff9bc421e8
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/flickr30k_metric.py
@@ -0,0 +1,165 @@
+# Copyright (c) OpenMMLab. All rights reserved
+from collections import defaultdict
+from typing import Dict, List, Optional, Sequence
+
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger
+
+from mmdet.registry import METRICS
+from ..functional import bbox_overlaps
+
+
+class RecallTracker:
+    """Utility class to track recall@k for various k, split by categories."""
+
+    def __init__(self, topk: Sequence[int]):
+        """
+        Parameters:
+           - topk : tuple of ints corresponding to the recalls being
+           tracked (eg, recall@1, recall@10, ...)
+        """
+
+        self.total_byk_bycat: Dict[int, Dict[str, int]] = {
+            k: defaultdict(int)
+            for k in topk
+        }
+        self.positives_byk_bycat: Dict[int, Dict[str, int]] = {
+            k: defaultdict(int)
+            for k in topk
+        }
+
+    def add_positive(self, k: int, category: str):
+        """Log a positive hit @k for given category."""
+        if k not in self.total_byk_bycat:
+            raise RuntimeError(f'{k} is not a valid recall threshold')
+        self.total_byk_bycat[k][category] += 1
+        self.positives_byk_bycat[k][category] += 1
+
+    def add_negative(self, k: int, category: str):
+        """Log a negative hit @k for given category."""
+        if k not in self.total_byk_bycat:
+            raise RuntimeError(f'{k} is not a valid recall threshold')
+        self.total_byk_bycat[k][category] += 1
+
+    def report(self) -> Dict[str, Dict[str, float]]:
+        """Return a condensed report of the results as a dict of dict.
+
+        report[k][cat] is the recall@k for the given category
+        """
+        report: Dict[str, Dict[str, float]] = {}
+        for k in self.total_byk_bycat:
+            assert k in self.positives_byk_bycat
+            report[str(k)] = {
+                cat:
+                self.positives_byk_bycat[k][cat] / self.total_byk_bycat[k][cat]
+                for cat in self.total_byk_bycat[k]
+            }
+        return report
+
+
+@METRICS.register_module()
+class Flickr30kMetric(BaseMetric):
+    """Phrase Grounding Metric."""
+
+    def __init__(
+        self,
+        topk: Sequence[int] = (1, 5, 10, -1),
+        iou_thrs: float = 0.5,
+        merge_boxes: bool = False,
+        collect_device: str = 'cpu',
+        prefix: Optional[str] = None,
+    ) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        self.iou_thrs = iou_thrs
+        self.topk = topk
+        self.merge = merge_boxes
+
+    def merge_boxes(self, boxes: List[List[int]]) -> List[List[int]]:
+        """Return the boxes corresponding to the smallest enclosing box
+        containing all the provided boxes The boxes are expected in [x1, y1,
+        x2, y2] format."""
+        if len(boxes) == 1:
+            return boxes
+
+        np_boxes = np.asarray(boxes)
+
+        return [[
+            np.boxes[:, 0].min(), np_boxes[:, 1].min(), np_boxes[:, 2].max(),
+            np_boxes[:, 3].max()
+        ]]
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            pred = data_sample['pred_instances']
+            gt = data_sample['gt_instances']['bboxes']
+            gt_label = data_sample['phrase_ids']
+            phrases = data_sample['phrases']
+            assert len(gt) == len(gt_label)
+
+            self.results.append((pred, gt, gt_label, phrases))
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        pred_list, gt_list, gt_label_list, phrase_list = zip(*results)
+
+        recall_tracker = RecallTracker(self.topk)
+
+        for pred, gt_boxes, gt_labels, phrases in zip(pred_list, gt_list,
+                                                      gt_label_list,
+                                                      phrase_list):
+            pred_boxes = pred['bboxes'].cpu().numpy()
+            pred_labels = pred['labels'].cpu().numpy()
+            for i, phrase in enumerate(phrases):
+                cur_index = pred_labels == i
+                cur_boxes = pred_boxes[cur_index]
+                tar_index = [
+                    index for index, value in enumerate(gt_labels)
+                    if value == i
+                ]
+                tar_boxes = gt_boxes[tar_index]
+                if self.merge:
+                    tar_boxes = self.merge_boxes(tar_boxes)
+                if len(cur_boxes) == 0:
+                    cur_boxes = [[0., 0., 0., 0.]]
+                ious = bbox_overlaps(
+                    np.asarray(cur_boxes), np.asarray(tar_boxes))
+                for k in self.topk:
+                    if k == -1:
+                        maxi = ious.max()
+                    else:
+                        assert k > 0
+                        maxi = ious[:k].max()
+                    if maxi >= self.iou_thrs:
+                        recall_tracker.add_positive(k, 'all')
+                        # TODO: do not support class-wise evaluation yet
+                        # for phrase_type in phrase['phrase_type']:
+                        #     recall_tracker.add_positive(k, phrase_type)
+                    else:
+                        recall_tracker.add_negative(k, 'all')
+                        # for phrase_type in phrase['phrase_type']:
+                        #     recall_tracker.add_negative(k, phrase_type)
+
+        results = recall_tracker.report()
+        logger.info(results)
+        return results
diff --git a/mmde/mmdet/evaluation/metrics/grefcoco_metric.py b/mmde/mmdet/evaluation/metrics/grefcoco_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..55cc638c5e4de11480a6858d15309017ba59a16a
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/grefcoco_metric.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Sequence
+
+import numpy as np
+import torch
+from mmengine.evaluator import BaseMetric
+from mmengine.fileio import get_local_path
+from mmengine.logging import MMLogger
+
+from mmdet.datasets.api_wrappers import COCO
+from mmdet.registry import METRICS
+from ..functional import bbox_overlaps
+
+
+# refer from https://github.com/henghuiding/gRefCOCO/blob/main/mdetr/datasets/refexp.py # noqa
+@METRICS.register_module()
+class gRefCOCOMetric(BaseMetric):
+    default_prefix: Optional[str] = 'grefcoco'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 metric: str = 'bbox',
+                 iou_thrs: float = 0.5,
+                 thresh_score: float = 0.7,
+                 thresh_f1: float = 1.0,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.metric = metric
+        self.iou_thrs = iou_thrs
+        self.thresh_score = thresh_score
+        self.thresh_f1 = thresh_f1
+
+        with get_local_path(ann_file) as local_path:
+            self.coco = COCO(local_path)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        for data_sample in data_samples:
+            result = dict()
+            pred = data_sample['pred_instances']
+            result['img_id'] = data_sample['img_id']
+            result['bboxes'] = pred['bboxes'].cpu()
+            result['scores'] = pred['scores'].cpu()
+            self.results.append(result)
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        correct_image = 0
+        num_image = 0
+        nt = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
+
+        for result in results:
+            img_id = result['img_id']
+            TP = 0
+
+            ann_ids = self.coco.getAnnIds(imgIds=img_id)
+            target = self.coco.loadAnns(ann_ids[0])
+
+            converted_bbox_all = []
+            no_target_flag = False
+            for one_target in target:
+                if one_target['category_id'] == -1:
+                    no_target_flag = True
+                target_bbox = one_target['bbox']
+                converted_bbox = [
+                    target_bbox[0],
+                    target_bbox[1],
+                    target_bbox[2] + target_bbox[0],
+                    target_bbox[3] + target_bbox[1],
+                ]
+                converted_bbox_all.append(
+                    np.array(converted_bbox).reshape(-1, 4))
+            gt_bbox_all = np.concatenate(converted_bbox_all, axis=0)
+
+            idx = result['scores'] >= self.thresh_score
+            filtered_boxes = result['bboxes'][idx]
+
+            iou = bbox_overlaps(filtered_boxes.numpy(), gt_bbox_all)
+            iou = torch.from_numpy(iou)
+
+            num_prediction = filtered_boxes.shape[0]
+            num_gt = gt_bbox_all.shape[0]
+            if no_target_flag:
+                if num_prediction >= 1:
+                    nt['FN'] += 1
+                else:
+                    nt['TP'] += 1
+                if num_prediction >= 1:
+                    f_1 = 0.
+                else:
+                    f_1 = 1.0
+            else:
+                if num_prediction >= 1:
+                    nt['TN'] += 1
+                else:
+                    nt['FP'] += 1
+                for i in range(min(num_prediction, num_gt)):
+                    top_value, top_index = torch.topk(iou.flatten(0, 1), 1)
+                    if top_value < self.iou_thrs:
+                        break
+                    else:
+                        top_index_x = top_index // num_gt
+                        top_index_y = top_index % num_gt
+                        TP += 1
+                        iou[top_index_x[0], :] = 0.0
+                        iou[:, top_index_y[0]] = 0.0
+                FP = num_prediction - TP
+                FN = num_gt - TP
+                f_1 = 2 * TP / (2 * TP + FP + FN)
+
+            if f_1 >= self.thresh_f1:
+                correct_image += 1
+            num_image += 1
+
+        score = correct_image / max(num_image, 1)
+        results = {
+            'F1_score': score,
+            'T_acc': nt['TN'] / (nt['TN'] + nt['FP']),
+            'N_acc': nt['TP'] / (nt['TP'] + nt['FN'])
+        }
+        logger.info(results)
+        return results
diff --git a/mmde/mmdet/evaluation/metrics/lvis_metric.py b/mmde/mmdet/evaluation/metrics/lvis_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..a861c6ee7b48adb2e428dcdaa97e8dc7ba476a6c
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/lvis_metric.py
@@ -0,0 +1,534 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+import logging
+import os.path as osp
+import tempfile
+import warnings
+from collections import OrderedDict, defaultdict
+from typing import Dict, List, Optional, Sequence, Union
+
+import numpy as np
+import torch
+from mmengine.dist import (all_gather_object, broadcast_object_list,
+                           is_main_process)
+from mmengine.evaluator import BaseMetric
+from mmengine.evaluator.metric import _to_cpu
+from mmengine.fileio import get_local_path
+from mmengine.logging import MMLogger, print_log
+from terminaltables import AsciiTable
+
+from mmdet.registry import METRICS
+from mmdet.structures.mask import encode_mask_results
+from ..functional import eval_recalls
+from .coco_metric import CocoMetric
+
+try:
+    import lvis
+
+    if getattr(lvis, '__version__', '0') >= '10.5.3':
+        warnings.warn(
+            'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"',  # noqa: E501
+            UserWarning)
+    from lvis import LVIS, LVISEval, LVISResults
+except ImportError:
+    lvis = None
+    LVISEval = None
+    LVISResults = None
+
+
+@METRICS.register_module()
+class LVISMetric(CocoMetric):
+    """LVIS evaluation metric.
+
+    Args:
+        ann_file (str, optional): Path to the coco format annotation file.
+            If not specified, ground truth annotations from the dataset will
+            be converted to coco format. Defaults to None.
+        metric (str | List[str]): Metrics to be evaluated. Valid metrics
+            include 'bbox', 'segm', 'proposal', and 'proposal_fast'.
+            Defaults to 'bbox'.
+        classwise (bool): Whether to evaluate the metric class-wise.
+            Defaults to False.
+        proposal_nums (Sequence[int]): Numbers of proposals to be evaluated.
+            Defaults to (100, 300, 1000).
+        iou_thrs (float | List[float], optional): IoU threshold to compute AP
+            and AR. If not specified, IoUs from 0.5 to 0.95 will be used.
+            Defaults to None.
+        metric_items (List[str], optional): Metric result names to be
+            recorded in the evaluation result. Defaults to None.
+        format_only (bool): Format the output results without perform
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        outfile_prefix (str, optional): The prefix of json files. It includes
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    default_prefix: Optional[str] = 'lvis'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 metric: Union[str, List[str]] = 'bbox',
+                 classwise: bool = False,
+                 proposal_nums: Sequence[int] = (100, 300, 1000),
+                 iou_thrs: Optional[Union[float, Sequence[float]]] = None,
+                 metric_items: Optional[Sequence[str]] = None,
+                 format_only: bool = False,
+                 outfile_prefix: Optional[str] = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 file_client_args: dict = None,
+                 backend_args: dict = None) -> None:
+        if lvis is None:
+            raise RuntimeError(
+                'Package lvis is not installed. Please run "pip install '
+                'git+https://github.com/lvis-dataset/lvis-api.git".')
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        # coco evaluation metrics
+        self.metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
+        for metric in self.metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(
+                    "metric should be one of 'bbox', 'segm', 'proposal', "
+                    f"'proposal_fast', but got {metric}.")
+
+        # do class wise evaluation, default False
+        self.classwise = classwise
+
+        # proposal_nums used to compute recall or precision.
+        self.proposal_nums = list(proposal_nums)
+
+        # iou_thrs used to compute recall or precision.
+        if iou_thrs is None:
+            iou_thrs = np.linspace(
+                .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+        self.iou_thrs = iou_thrs
+        self.metric_items = metric_items
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, 'outfile_prefix must be not'
+            'None when format_only is True, otherwise the result files will'
+            'be saved to a temp directory which will be cleaned up at the end.'
+
+        self.outfile_prefix = outfile_prefix
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to'
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+        # if ann_file is not specified,
+        # initialize lvis api with the converted dataset
+        if ann_file is not None:
+            with get_local_path(
+                    ann_file, backend_args=self.backend_args) as local_path:
+                self._lvis_api = LVIS(local_path)
+        else:
+            self._lvis_api = None
+
+        # handle dataset lazy init
+        self.cat_ids = None
+        self.img_ids = None
+
+    def fast_eval_recall(self,
+                         results: List[dict],
+                         proposal_nums: Sequence[int],
+                         iou_thrs: Sequence[float],
+                         logger: Optional[MMLogger] = None) -> np.ndarray:
+        """Evaluate proposal recall with LVIS's fast_eval_recall.
+
+        Args:
+            results (List[dict]): Results of the dataset.
+            proposal_nums (Sequence[int]): Proposal numbers used for
+                evaluation.
+            iou_thrs (Sequence[float]): IoU thresholds used for evaluation.
+            logger (MMLogger, optional): Logger used for logging the recall
+                summary.
+        Returns:
+            np.ndarray: Averaged recall results.
+        """
+        gt_bboxes = []
+        pred_bboxes = [result['bboxes'] for result in results]
+        for i in range(len(self.img_ids)):
+            ann_ids = self._lvis_api.get_ann_ids(img_ids=[self.img_ids[i]])
+            ann_info = self._lvis_api.load_anns(ann_ids)
+            if len(ann_info) == 0:
+                gt_bboxes.append(np.zeros((0, 4)))
+                continue
+            bboxes = []
+            for ann in ann_info:
+                x1, y1, w, h = ann['bbox']
+                bboxes.append([x1, y1, x1 + w, y1 + h])
+            bboxes = np.array(bboxes, dtype=np.float32)
+            if bboxes.shape[0] == 0:
+                bboxes = np.zeros((0, 4))
+            gt_bboxes.append(bboxes)
+
+        recalls = eval_recalls(
+            gt_bboxes, pred_bboxes, proposal_nums, iou_thrs, logger=logger)
+        ar = recalls.mean(axis=1)
+        return ar
+
+    # TODO: data_batch is no longer needed, consider adjusting the
+    #  parameter position
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            result = dict()
+            pred = data_sample['pred_instances']
+            result['img_id'] = data_sample['img_id']
+            result['bboxes'] = pred['bboxes'].cpu().numpy()
+            result['scores'] = pred['scores'].cpu().numpy()
+            result['labels'] = pred['labels'].cpu().numpy()
+            # encode mask to RLE
+            if 'masks' in pred:
+                result['masks'] = encode_mask_results(
+                    pred['masks'].detach().cpu().numpy())
+            # some detectors use different scores for bbox and mask
+            if 'mask_scores' in pred:
+                result['mask_scores'] = pred['mask_scores'].cpu().numpy()
+
+            # parse gt
+            gt = dict()
+            gt['width'] = data_sample['ori_shape'][1]
+            gt['height'] = data_sample['ori_shape'][0]
+            gt['img_id'] = data_sample['img_id']
+            if self._lvis_api is None:
+                # TODO: Need to refactor to support LoadAnnotations
+                assert 'instances' in data_sample, \
+                    'ground truth is required for evaluation when ' \
+                    '`ann_file` is not provided'
+                gt['anns'] = data_sample['instances']
+            # add converted result to the results list
+            self.results.append((gt, result))
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        # split gt and prediction list
+        gts, preds = zip(*results)
+
+        tmp_dir = None
+        if self.outfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            outfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            outfile_prefix = self.outfile_prefix
+
+        if self._lvis_api is None:
+            # use converted gt json file to initialize coco api
+            logger.info('Converting ground truth to coco format...')
+            coco_json_path = self.gt_to_coco_json(
+                gt_dicts=gts, outfile_prefix=outfile_prefix)
+            self._lvis_api = LVIS(coco_json_path)
+
+        # handle lazy init
+        if self.cat_ids is None:
+            self.cat_ids = self._lvis_api.get_cat_ids()
+        if self.img_ids is None:
+            self.img_ids = self._lvis_api.get_img_ids()
+
+        # convert predictions to coco format and dump to json file
+        result_files = self.results2json(preds, outfile_prefix)
+
+        eval_results = OrderedDict()
+        if self.format_only:
+            logger.info('results are saved in '
+                        f'{osp.dirname(outfile_prefix)}')
+            return eval_results
+
+        lvis_gt = self._lvis_api
+
+        for metric in self.metrics:
+            logger.info(f'Evaluating {metric}...')
+
+            # TODO: May refactor fast_eval_recall to an independent metric?
+            # fast eval recall
+            if metric == 'proposal_fast':
+                ar = self.fast_eval_recall(
+                    preds, self.proposal_nums, self.iou_thrs, logger=logger)
+                log_msg = []
+                for i, num in enumerate(self.proposal_nums):
+                    eval_results[f'AR@{num}'] = ar[i]
+                    log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
+                log_msg = ''.join(log_msg)
+                logger.info(log_msg)
+                continue
+
+            try:
+                lvis_dt = LVISResults(lvis_gt, result_files[metric])
+            except IndexError:
+                logger.info(
+                    'The testing results of the whole dataset is empty.')
+                break
+
+            iou_type = 'bbox' if metric == 'proposal' else metric
+            lvis_eval = LVISEval(lvis_gt, lvis_dt, iou_type)
+            lvis_eval.params.imgIds = self.img_ids
+            metric_items = self.metric_items
+            if metric == 'proposal':
+                lvis_eval.params.useCats = 0
+                lvis_eval.params.maxDets = list(self.proposal_nums)
+                lvis_eval.evaluate()
+                lvis_eval.accumulate()
+                lvis_eval.summarize()
+                if metric_items is None:
+                    metric_items = ['AR@300', 'ARs@300', 'ARm@300', 'ARl@300']
+                for k, v in lvis_eval.get_results().items():
+                    if k in metric_items:
+                        val = float('{:.3f}'.format(float(v)))
+                        eval_results[k] = val
+
+            else:
+                lvis_eval.evaluate()
+                lvis_eval.accumulate()
+                lvis_eval.summarize()
+                lvis_results = lvis_eval.get_results()
+                if self.classwise:  # Compute per-category AP
+                    # Compute per-category AP
+                    # from https://github.com/facebookresearch/detectron2/
+                    precisions = lvis_eval.eval['precision']
+                    # precision: (iou, recall, cls, area range, max dets)
+                    assert len(self.cat_ids) == precisions.shape[2]
+
+                    results_per_category = []
+                    for idx, catId in enumerate(self.cat_ids):
+                        # area range index 0: all area ranges
+                        # max dets index -1: typically 100 per image
+                        # the dimensions of precisions are
+                        # [num_thrs, num_recalls, num_cats, num_area_rngs]
+                        nm = self._lvis_api.load_cats([catId])[0]
+                        precision = precisions[:, :, idx, 0]
+                        precision = precision[precision > -1]
+                        if precision.size:
+                            ap = np.mean(precision)
+                        else:
+                            ap = float('nan')
+                        results_per_category.append(
+                            (f'{nm["name"]}', f'{float(ap):0.3f}'))
+                        eval_results[f'{nm["name"]}_precision'] = round(ap, 3)
+
+                    num_columns = min(6, len(results_per_category) * 2)
+                    results_flatten = list(
+                        itertools.chain(*results_per_category))
+                    headers = ['category', 'AP'] * (num_columns // 2)
+                    results_2d = itertools.zip_longest(*[
+                        results_flatten[i::num_columns]
+                        for i in range(num_columns)
+                    ])
+                    table_data = [headers]
+                    table_data += [result for result in results_2d]
+                    table = AsciiTable(table_data)
+                    logger.info('\n' + table.table)
+
+                if metric_items is None:
+                    metric_items = [
+                        'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'APr',
+                        'APc', 'APf'
+                    ]
+
+                for k, v in lvis_results.items():
+                    if k in metric_items:
+                        key = '{}_{}'.format(metric, k)
+                        val = float('{:.3f}'.format(float(v)))
+                        eval_results[key] = val
+
+            lvis_eval.print_results()
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+        return eval_results
+
+
+def _merge_lists(listA, listB, maxN, key):
+    result = []
+    indA, indB = 0, 0
+    while (indA < len(listA) or indB < len(listB)) and len(result) < maxN:
+        if (indB < len(listB)) and (indA >= len(listA)
+                                    or key(listA[indA]) < key(listB[indB])):
+            result.append(listB[indB])
+            indB += 1
+        else:
+            result.append(listA[indA])
+            indA += 1
+    return result
+
+
+@METRICS.register_module()
+class LVISFixedAPMetric(BaseMetric):
+    default_prefix: Optional[str] = 'lvis_fixed_ap'
+
+    def __init__(self,
+                 ann_file: str,
+                 topk: int = 10000,
+                 format_only: bool = False,
+                 outfile_prefix: Optional[str] = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 backend_args: dict = None) -> None:
+
+        if lvis is None:
+            raise RuntimeError(
+                'Package lvis is not installed. Please run "pip install '
+                'git+https://github.com/lvis-dataset/lvis-api.git".')
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, 'outfile_prefix must be not'
+            'None when format_only is True, otherwise the result files will'
+            'be saved to a temp directory which will be cleaned up at the end.'
+
+        self.outfile_prefix = outfile_prefix
+        self.backend_args = backend_args
+
+        with get_local_path(
+                ann_file, backend_args=self.backend_args) as local_path:
+            self._lvis_api = LVIS(local_path)
+
+        self.cat_ids = self._lvis_api.get_cat_ids()
+
+        self.results = {}
+        self.topk = topk
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        cur_results = []
+        for data_sample in data_samples:
+            pred = data_sample['pred_instances']
+            xmin, ymin, xmax, ymax = pred['bboxes'].cpu().unbind(1)
+            boxes = torch.stack((xmin, ymin, xmax - xmin, ymax - ymin),
+                                dim=1).tolist()
+
+            scores = pred['scores'].cpu().numpy()
+            labels = pred['labels'].cpu().numpy()
+
+            if len(boxes) == 0:
+                continue
+
+            cur_results.extend([{
+                'image_id': data_sample['img_id'],
+                'category_id': self.cat_ids[labels[k]],
+                'bbox': box,
+                'score': scores[k],
+            } for k, box in enumerate(boxes)])
+
+        by_cat = defaultdict(list)
+        for ann in cur_results:
+            by_cat[ann['category_id']].append(ann)
+
+        for cat, cat_anns in by_cat.items():
+            if cat not in self.results:
+                self.results[cat] = []
+
+            cur = sorted(
+                cat_anns, key=lambda x: x['score'], reverse=True)[:self.topk]
+            self.results[cat] = _merge_lists(
+                self.results[cat], cur, self.topk, key=lambda x: x['score'])
+
+    def compute_metrics(self, results: dict) -> dict:
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        new_results = []
+
+        missing_dets_cats = set()
+        for cat, cat_anns in results.items():
+            if len(cat_anns) < self.topk:
+                missing_dets_cats.add(cat)
+            new_results.extend(
+                sorted(cat_anns, key=lambda x: x['score'],
+                       reverse=True)[:self.topk])
+
+        if missing_dets_cats:
+            logger.info(
+                f'\n===\n'
+                f'{len(missing_dets_cats)} classes had less than {self.topk} '
+                f'detections!\n Outputting {self.topk} detections for each '
+                f'class will improve AP further.\n ===')
+
+        new_results = LVISResults(self._lvis_api, new_results, max_dets=-1)
+        lvis_eval = LVISEval(self._lvis_api, new_results, iou_type='bbox')
+        params = lvis_eval.params
+        params.max_dets = -1  # No limit on detections per image.
+        lvis_eval.run()
+        lvis_eval.print_results()
+        metrics = {
+            k: v
+            for k, v in lvis_eval.results.items() if k.startswith('AP')
+        }
+        logger.info(f'mAP_copypaste: {metrics}')
+        return metrics
+
+    def evaluate(self, size: int) -> dict:
+        if len(self.results) == 0:
+            print_log(
+                f'{self.__class__.__name__} got empty `self.results`. Please '
+                'ensure that the processed results are properly added into '
+                '`self.results` in `process` method.',
+                logger='current',
+                level=logging.WARNING)
+
+        all_cats = all_gather_object(self.results)
+        results = defaultdict(list)
+        for cats in all_cats:
+            for cat, cat_anns in cats.items():
+                results[cat].extend(cat_anns)
+
+        if is_main_process():
+            # cast all tensors in results list to cpu
+            results = _to_cpu(results)
+            _metrics = self.compute_metrics(results)  # type: ignore
+            # Add prefix to metric names
+            if self.prefix:
+                _metrics = {
+                    '/'.join((self.prefix, k)): v
+                    for k, v in _metrics.items()
+                }
+            metrics = [_metrics]
+        else:
+            metrics = [None]  # type: ignore
+
+        broadcast_object_list(metrics)
+
+        # reset the results
+        self.results = {}
+        return metrics[0]
diff --git a/mmde/mmdet/evaluation/metrics/mot_challenge_metric.py b/mmde/mmdet/evaluation/metrics/mot_challenge_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5513c44e81de7dd869d4c5c802bfac0387bdbf6
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/mot_challenge_metric.py
@@ -0,0 +1,443 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import shutil
+import tempfile
+from collections import defaultdict
+from typing import List, Optional, Union
+
+import numpy as np
+import torch
+
+try:
+    import trackeval
+except ImportError:
+    trackeval = None
+from mmengine.dist import (all_gather_object, barrier, broadcast,
+                           broadcast_object_list, get_dist_info,
+                           is_main_process)
+from mmengine.logging import MMLogger
+
+from mmdet.registry import METRICS, TASK_UTILS
+from .base_video_metric import BaseVideoMetric
+
+
+def get_tmpdir() -> str:
+    """return the same tmpdir for all processes."""
+    rank, world_size = get_dist_info()
+    MAX_LEN = 512
+    # 32 is whitespace
+    dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8)
+    if rank == 0:
+        tmpdir = tempfile.mkdtemp()
+        tmpdir = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8)
+        dir_tensor[:len(tmpdir)] = tmpdir
+    broadcast(dir_tensor, 0)
+    tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+    return tmpdir
+
+
+@METRICS.register_module()
+class MOTChallengeMetric(BaseVideoMetric):
+    """Evaluation metrics for MOT Challenge.
+
+    Args:
+        metric (str | list[str]): Metrics to be evaluated. Options are
+            'HOTA', 'CLEAR', 'Identity'.
+            Defaults to ['HOTA', 'CLEAR', 'Identity'].
+        outfile_prefix (str, optional): Path to save the formatted results.
+            Defaults to None.
+        track_iou_thr (float): IoU threshold for tracking evaluation.
+            Defaults to 0.5.
+        benchmark (str): Benchmark to be evaluated. Defaults to 'MOT17'.
+        format_only (bool): If True, only formatting the results to the
+            official format and not performing evaluation. Defaults to False.
+        postprocess_tracklet_cfg (List[dict], optional): configs for tracklets
+            postprocessing methods. `InterpolateTracklets` is supported.
+            Defaults to []
+            - InterpolateTracklets:
+                - min_num_frames (int, optional): The minimum length of a
+                    track that will be interpolated. Defaults to 5.
+                - max_num_frames (int, optional): The maximum disconnected
+                    length in a track. Defaults to 20.
+                - use_gsi (bool, optional): Whether to use the GSI (Gaussian-
+                    smoothed interpolation) method. Defaults to False.
+                - smooth_tau (int, optional): smoothing parameter in GSI.
+                    Defaults to 10.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Default: None
+    Returns:
+    """
+    TRACKER = 'default-tracker'
+    allowed_metrics = ['HOTA', 'CLEAR', 'Identity']
+    allowed_benchmarks = ['MOT15', 'MOT16', 'MOT17', 'MOT20', 'DanceTrack']
+    default_prefix: Optional[str] = 'motchallenge-metric'
+
+    def __init__(self,
+                 metric: Union[str, List[str]] = ['HOTA', 'CLEAR', 'Identity'],
+                 outfile_prefix: Optional[str] = None,
+                 track_iou_thr: float = 0.5,
+                 benchmark: str = 'MOT17',
+                 format_only: bool = False,
+                 use_postprocess: bool = False,
+                 postprocess_tracklet_cfg: Optional[List[dict]] = [],
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        if trackeval is None:
+            raise RuntimeError(
+                'trackeval is not installed,'
+                'please install it by: pip install'
+                'git+https://github.com/JonathonLuiten/TrackEval.git'
+                'trackeval need low version numpy, please install it'
+                'by: pip install -U numpy==1.23.5')
+        if isinstance(metric, list):
+            metrics = metric
+        elif isinstance(metric, str):
+            metrics = [metric]
+        else:
+            raise TypeError('metric must be a list or a str.')
+        for metric in metrics:
+            if metric not in self.allowed_metrics:
+                raise KeyError(f'metric {metric} is not supported.')
+        self.metrics = metrics
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, 'outfile_prefix must be not'
+            'None when format_only is True, otherwise the result files will'
+            'be saved to a temp directory which will be cleaned up at the end.'
+        self.use_postprocess = use_postprocess
+        self.postprocess_tracklet_cfg = postprocess_tracklet_cfg.copy()
+        self.postprocess_tracklet_methods = [
+            TASK_UTILS.build(cfg) for cfg in self.postprocess_tracklet_cfg
+        ]
+        assert benchmark in self.allowed_benchmarks
+        self.benchmark = benchmark
+        self.track_iou_thr = track_iou_thr
+        self.tmp_dir = tempfile.TemporaryDirectory()
+        self.tmp_dir.name = get_tmpdir()
+        self.seq_info = defaultdict(
+            lambda: dict(seq_length=-1, gt_tracks=[], pred_tracks=[]))
+        self.gt_dir = self._get_gt_dir()
+        self.pred_dir = self._get_pred_dir(outfile_prefix)
+        self.seqmap = osp.join(self.pred_dir, 'videoseq.txt')
+        with open(self.seqmap, 'w') as f:
+            f.write('name\n')
+
+    def __del__(self):
+        # To avoid tmpdir being cleaned up too early, because in multiple
+        # consecutive ValLoops, the value of `self.tmp_dir.name` is unchanged,
+        # and calling `tmp_dir.cleanup()` in compute_metrics will cause errors.
+        self.tmp_dir.cleanup()
+
+    def _get_pred_dir(self, outfile_prefix):
+        """Get directory to save the prediction results."""
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        if outfile_prefix is None:
+            outfile_prefix = self.tmp_dir.name
+        else:
+            if osp.exists(outfile_prefix) and is_main_process():
+                logger.info('remove previous results.')
+                shutil.rmtree(outfile_prefix)
+        pred_dir = osp.join(outfile_prefix, self.TRACKER)
+        os.makedirs(pred_dir, exist_ok=True)
+        return pred_dir
+
+    def _get_gt_dir(self):
+        """Get directory to save the gt files."""
+        output_dir = osp.join(self.tmp_dir.name, 'gt')
+        os.makedirs(output_dir, exist_ok=True)
+        return output_dir
+
+    def transform_gt_and_pred(self, img_data_sample, video, frame_id):
+
+        video = img_data_sample['img_path'].split(os.sep)[-3]
+        # load gts
+        if 'instances' in img_data_sample:
+            gt_instances = img_data_sample['instances']
+            gt_tracks = [
+                np.array([
+                    frame_id + 1, gt_instances[i]['instance_id'],
+                    gt_instances[i]['bbox'][0], gt_instances[i]['bbox'][1],
+                    gt_instances[i]['bbox'][2] - gt_instances[i]['bbox'][0],
+                    gt_instances[i]['bbox'][3] - gt_instances[i]['bbox'][1],
+                    gt_instances[i]['mot_conf'],
+                    gt_instances[i]['category_id'],
+                    gt_instances[i]['visibility']
+                ]) for i in range(len(gt_instances))
+            ]
+            self.seq_info[video]['gt_tracks'].extend(gt_tracks)
+
+        # load predictions
+        assert 'pred_track_instances' in img_data_sample
+        if self.use_postprocess:
+            pred_instances = img_data_sample['pred_track_instances']
+            pred_tracks = [
+                pred_instances['bboxes'][i]
+                for i in range(len(pred_instances['bboxes']))
+            ]
+        else:
+            pred_instances = img_data_sample['pred_track_instances']
+            pred_tracks = [
+                np.array([
+                    frame_id + 1, pred_instances['instances_id'][i].cpu(),
+                    pred_instances['bboxes'][i][0].cpu(),
+                    pred_instances['bboxes'][i][1].cpu(),
+                    (pred_instances['bboxes'][i][2] -
+                     pred_instances['bboxes'][i][0]).cpu(),
+                    (pred_instances['bboxes'][i][3] -
+                     pred_instances['bboxes'][i][1]).cpu(),
+                    pred_instances['scores'][i].cpu()
+                ]) for i in range(len(pred_instances['instances_id']))
+            ]
+        self.seq_info[video]['pred_tracks'].extend(pred_tracks)
+
+    def process_image(self, data_samples, video_len):
+
+        img_data_sample = data_samples[0].to_dict()
+        video = img_data_sample['img_path'].split(os.sep)[-3]
+        frame_id = img_data_sample['frame_id']
+        if self.seq_info[video]['seq_length'] == -1:
+            self.seq_info[video]['seq_length'] = video_len
+        self.transform_gt_and_pred(img_data_sample, video, frame_id)
+
+        if frame_id == video_len - 1:
+            # postprocessing
+            if self.postprocess_tracklet_cfg:
+                info = self.seq_info[video]
+                pred_tracks = np.array(info['pred_tracks'])
+                for postprocess_tracklet_methods in \
+                        self.postprocess_tracklet_methods:
+                    pred_tracks = postprocess_tracklet_methods\
+                        .forward(pred_tracks)
+                info['pred_tracks'] = pred_tracks
+            self._save_one_video_gts_preds(video)
+
+    def process_video(self, data_samples):
+
+        video_len = len(data_samples)
+        for frame_id in range(video_len):
+            img_data_sample = data_samples[frame_id].to_dict()
+            # load basic info
+            video = img_data_sample['img_path'].split(os.sep)[-3]
+            if self.seq_info[video]['seq_length'] == -1:
+                self.seq_info[video]['seq_length'] = video_len
+            self.transform_gt_and_pred(img_data_sample, video, frame_id)
+
+        if self.postprocess_tracklet_cfg:
+            info = self.seq_info[video]
+            pred_tracks = np.array(info['pred_tracks'])
+            for postprocess_tracklet_methods in \
+                    self.postprocess_tracklet_methods:
+                pred_tracks = postprocess_tracklet_methods \
+                    .forward(pred_tracks)
+            info['pred_tracks'] = pred_tracks
+        self._save_one_video_gts_preds(video)
+
+    def _save_one_video_gts_preds(self, seq: str) -> None:
+        """Save the gt and prediction results."""
+        info = self.seq_info[seq]
+        # save predictions
+        pred_file = osp.join(self.pred_dir, seq + '.txt')
+
+        pred_tracks = np.array(info['pred_tracks'])
+
+        with open(pred_file, 'wt') as f:
+            for tracks in pred_tracks:
+                line = '%d,%d,%.3f,%.3f,%.3f,%.3f,%.3f,-1,-1,-1\n' % (
+                    tracks[0], tracks[1], tracks[2], tracks[3], tracks[4],
+                    tracks[5], tracks[6])
+                f.writelines(line)
+
+        info['pred_tracks'] = []
+        # save gts
+        if info['gt_tracks']:
+            gt_file = osp.join(self.gt_dir, seq + '.txt')
+            with open(gt_file, 'wt') as f:
+                for tracks in info['gt_tracks']:
+                    line = '%d,%d,%d,%d,%d,%d,%d,%d,%.5f\n' % (
+                        tracks[0], tracks[1], tracks[2], tracks[3], tracks[4],
+                        tracks[5], tracks[6], tracks[7], tracks[8])
+                    f.writelines(line)
+            info['gt_tracks'].clear()
+        # save seq info
+        with open(self.seqmap, 'a') as f:
+            f.write(seq + '\n')
+            f.close()
+
+    def compute_metrics(self, results: list = None) -> dict:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+                Defaults to None.
+
+        Returns:
+            dict: The computed metrics. The keys are the names of the metrics,
+            and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        # NOTICE: don't access `self.results` from the method.
+        eval_results = dict()
+
+        if self.format_only:
+            return eval_results
+
+        eval_config = trackeval.Evaluator.get_default_eval_config()
+
+        # need to split out the tracker name
+        # caused by the implementation of TrackEval
+        pred_dir_tmp = self.pred_dir.rsplit(osp.sep, 1)[0]
+        dataset_config = self.get_dataset_cfg(self.gt_dir, pred_dir_tmp)
+
+        evaluator = trackeval.Evaluator(eval_config)
+        dataset = [trackeval.datasets.MotChallenge2DBox(dataset_config)]
+        metrics = [
+            getattr(trackeval.metrics,
+                    metric)(dict(METRICS=[metric], THRESHOLD=0.5))
+            for metric in self.metrics
+        ]
+        output_res, _ = evaluator.evaluate(dataset, metrics)
+        output_res = output_res['MotChallenge2DBox'][
+            self.TRACKER]['COMBINED_SEQ']['pedestrian']
+
+        if 'HOTA' in self.metrics:
+            logger.info('Evaluating HOTA Metrics...')
+            eval_results['HOTA'] = np.average(output_res['HOTA']['HOTA'])
+            eval_results['AssA'] = np.average(output_res['HOTA']['AssA'])
+            eval_results['DetA'] = np.average(output_res['HOTA']['DetA'])
+
+        if 'CLEAR' in self.metrics:
+            logger.info('Evaluating CLEAR Metrics...')
+            eval_results['MOTA'] = np.average(output_res['CLEAR']['MOTA'])
+            eval_results['MOTP'] = np.average(output_res['CLEAR']['MOTP'])
+            eval_results['IDSW'] = np.average(output_res['CLEAR']['IDSW'])
+            eval_results['TP'] = np.average(output_res['CLEAR']['CLR_TP'])
+            eval_results['FP'] = np.average(output_res['CLEAR']['CLR_FP'])
+            eval_results['FN'] = np.average(output_res['CLEAR']['CLR_FN'])
+            eval_results['Frag'] = np.average(output_res['CLEAR']['Frag'])
+            eval_results['MT'] = np.average(output_res['CLEAR']['MT'])
+            eval_results['ML'] = np.average(output_res['CLEAR']['ML'])
+
+        if 'Identity' in self.metrics:
+            logger.info('Evaluating Identity Metrics...')
+            eval_results['IDF1'] = np.average(output_res['Identity']['IDF1'])
+            eval_results['IDTP'] = np.average(output_res['Identity']['IDTP'])
+            eval_results['IDFN'] = np.average(output_res['Identity']['IDFN'])
+            eval_results['IDFP'] = np.average(output_res['Identity']['IDFP'])
+            eval_results['IDP'] = np.average(output_res['Identity']['IDP'])
+            eval_results['IDR'] = np.average(output_res['Identity']['IDR'])
+
+        return eval_results
+
+    def evaluate(self, size: int = 1) -> dict:
+        """Evaluate the model performance of the whole dataset after processing
+        all batches.
+
+        Args:
+            size (int): Length of the entire validation dataset.
+                Defaults to None.
+
+        Returns:
+            dict: Evaluation metrics dict on the val dataset. The keys are the
+            names of the metrics, and the values are corresponding results.
+        """
+        # wait for all processes to complete prediction.
+        barrier()
+
+        # gather seq_info and convert the list of dict to a dict.
+        # convert self.seq_info to dict first to make it picklable.
+        gathered_seq_info = all_gather_object(dict(self.seq_info))
+        all_seq_info = dict()
+        for _seq_info in gathered_seq_info:
+            all_seq_info.update(_seq_info)
+        self.seq_info = all_seq_info
+
+        if is_main_process():
+            _metrics = self.compute_metrics()  # type: ignore
+            # Add prefix to metric names
+            if self.prefix:
+                _metrics = {
+                    '/'.join((self.prefix, k)): v
+                    for k, v in _metrics.items()
+                }
+            metrics = [_metrics]
+        else:
+            metrics = [None]  # type: ignore
+
+        broadcast_object_list(metrics)
+
+        # reset the results list
+        self.results.clear()
+        return metrics[0]
+
+    def get_dataset_cfg(self, gt_folder: str, tracker_folder: str):
+        """Get default configs for trackeval.datasets.MotChallenge2DBox.
+
+        Args:
+            gt_folder (str): the name of the GT folder
+            tracker_folder (str): the name of the tracker folder
+
+        Returns:
+            Dataset Configs for MotChallenge2DBox.
+        """
+        dataset_config = dict(
+            # Location of GT data
+            GT_FOLDER=gt_folder,
+            # Trackers location
+            TRACKERS_FOLDER=tracker_folder,
+            # Where to save eval results
+            # (if None, same as TRACKERS_FOLDER)
+            OUTPUT_FOLDER=None,
+            # Use self.TRACKER as the default tracker
+            TRACKERS_TO_EVAL=[self.TRACKER],
+            # Option values: ['pedestrian']
+            CLASSES_TO_EVAL=['pedestrian'],
+            # Option Values: 'MOT15', 'MOT16', 'MOT17', 'MOT20', 'DanceTrack'
+            BENCHMARK=self.benchmark,
+            # Option Values: 'train', 'test'
+            SPLIT_TO_EVAL='val' if self.benchmark == 'DanceTrack' else 'train',
+            # Whether tracker input files are zipped
+            INPUT_AS_ZIP=False,
+            # Whether to print current config
+            PRINT_CONFIG=True,
+            # Whether to perform preprocessing
+            # (never done for MOT15)
+            DO_PREPROC=False if self.benchmark == 'MOT15' else True,
+            # Tracker files are in
+            # TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
+            TRACKER_SUB_FOLDER='',
+            # Output files are saved in
+            # OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
+            OUTPUT_SUB_FOLDER='',
+            # Names of trackers to display
+            # (if None: TRACKERS_TO_EVAL)
+            TRACKER_DISPLAY_NAMES=None,
+            # Where seqmaps are found
+            # (if None: GT_FOLDER/seqmaps)
+            SEQMAP_FOLDER=None,
+            # Directly specify seqmap file
+            # (if none use seqmap_folder/benchmark-split_to_eval)
+            SEQMAP_FILE=self.seqmap,
+            # If not None, specify sequences to eval
+            # and their number of timesteps
+            SEQ_INFO={
+                seq: info['seq_length']
+                for seq, info in self.seq_info.items()
+            },
+            # '{gt_folder}/{seq}.txt'
+            GT_LOC_FORMAT='{gt_folder}/{seq}.txt',
+            # If False, data is in GT_FOLDER/BENCHMARK-SPLIT_TO_EVAL/ and in
+            # TRACKERS_FOLDER/BENCHMARK-SPLIT_TO_EVAL/tracker/
+            # If True, the middle 'benchmark-split' folder is skipped for both.
+            SKIP_SPLIT_FOL=True,
+        )
+
+        return dataset_config
diff --git a/mmde/mmdet/evaluation/metrics/openimages_metric.py b/mmde/mmdet/evaluation/metrics/openimages_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..d75c59e0e711c90bb1e5fbcc1529e95864e99e9a
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/openimages_metric.py
@@ -0,0 +1,237 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from collections import OrderedDict
+from typing import List, Optional, Sequence, Union
+
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger, print_log
+
+from mmdet.registry import METRICS
+from ..functional import eval_map
+
+
+@METRICS.register_module()
+class OpenImagesMetric(BaseMetric):
+    """OpenImages evaluation metric.
+
+    Evaluate detection mAP for OpenImages. Please refer to
+    https://storage.googleapis.com/openimages/web/evaluation.html for more
+    details.
+
+    Args:
+        iou_thrs (float or List[float]): IoU threshold. Defaults to 0.5.
+        ioa_thrs (float or List[float]): IoA threshold. Defaults to 0.5.
+        scale_ranges (List[tuple], optional): Scale ranges for evaluating
+            mAP. If not specified, all bounding boxes would be included in
+            evaluation. Defaults to None
+        use_group_of (bool): Whether consider group of groud truth bboxes
+            during evaluating. Defaults to True.
+        get_supercategory (bool): Whether to get parent class of the
+            current class. Default: True.
+        filter_labels (bool): Whether filter unannotated classes.
+            Default: True.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+    default_prefix: Optional[str] = 'openimages'
+
+    def __init__(self,
+                 iou_thrs: Union[float, List[float]] = 0.5,
+                 ioa_thrs: Union[float, List[float]] = 0.5,
+                 scale_ranges: Optional[List[tuple]] = None,
+                 use_group_of: bool = True,
+                 get_supercategory: bool = True,
+                 filter_labels: bool = True,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.iou_thrs = [iou_thrs] if isinstance(iou_thrs, float) else iou_thrs
+        self.ioa_thrs = [ioa_thrs] if (isinstance(ioa_thrs, float)
+                                       or ioa_thrs is None) else ioa_thrs
+        assert isinstance(self.iou_thrs, list) and isinstance(
+            self.ioa_thrs, list)
+        assert len(self.iou_thrs) == len(self.ioa_thrs)
+
+        self.scale_ranges = scale_ranges
+        self.use_group_of = use_group_of
+        self.get_supercategory = get_supercategory
+        self.filter_labels = filter_labels
+
+    def _get_supercategory_ann(self, instances: List[dict]) -> List[dict]:
+        """Get parent classes's annotation of the corresponding class.
+
+        Args:
+            instances (List[dict]): A list of annotations of the instances.
+
+        Returns:
+            List[dict]: Annotations extended with super-category.
+        """
+        supercat_instances = []
+        relation_matrix = self.dataset_meta['RELATION_MATRIX']
+        for instance in instances:
+            labels = np.where(relation_matrix[instance['bbox_label']])[0]
+            for label in labels:
+                if label == instance['bbox_label']:
+                    continue
+                new_instance = copy.deepcopy(instance)
+                new_instance['bbox_label'] = label
+                supercat_instances.append(new_instance)
+        return supercat_instances
+
+    def _process_predictions(self, pred_bboxes: np.ndarray,
+                             pred_scores: np.ndarray, pred_labels: np.ndarray,
+                             gt_instances: list,
+                             image_level_labels: np.ndarray) -> tuple:
+        """Process results of the corresponding class of the detection bboxes.
+
+        Note: It will choose to do the following two processing according to
+        the parameters:
+
+        1. Whether to add parent classes of the corresponding class of the
+        detection bboxes.
+
+        2. Whether to ignore the classes that unannotated on that image.
+
+        Args:
+            pred_bboxes (np.ndarray): bboxes predicted by the model
+            pred_scores (np.ndarray): scores predicted by the model
+            pred_labels (np.ndarray): labels predicted by the model
+            gt_instances (list): ground truth annotations
+            image_level_labels (np.ndarray): human-verified image level labels
+
+        Returns:
+            tuple: Processed bboxes, scores, and labels.
+        """
+        processed_bboxes = copy.deepcopy(pred_bboxes)
+        processed_scores = copy.deepcopy(pred_scores)
+        processed_labels = copy.deepcopy(pred_labels)
+        gt_labels = np.array([ins['bbox_label'] for ins in gt_instances],
+                             dtype=np.int64)
+        if image_level_labels is not None:
+            allowed_classes = np.unique(
+                np.append(gt_labels, image_level_labels))
+        else:
+            allowed_classes = np.unique(gt_labels)
+        relation_matrix = self.dataset_meta['RELATION_MATRIX']
+        pred_classes = np.unique(pred_labels)
+        for pred_class in pred_classes:
+            classes = np.where(relation_matrix[pred_class])[0]
+            for cls in classes:
+                if (cls in allowed_classes and cls != pred_class
+                        and self.get_supercategory):
+                    # add super-supercategory preds
+                    index = np.where(pred_labels == pred_class)[0]
+                    processed_scores = np.concatenate(
+                        [processed_scores, pred_scores[index]])
+                    processed_bboxes = np.concatenate(
+                        [processed_bboxes, pred_bboxes[index]])
+                    extend_labels = np.full(index.shape, cls, dtype=np.int64)
+                    processed_labels = np.concatenate(
+                        [processed_labels, extend_labels])
+                elif cls not in allowed_classes and self.filter_labels:
+                    # remove unannotated preds
+                    index = np.where(processed_labels != cls)[0]
+                    processed_scores = processed_scores[index]
+                    processed_bboxes = processed_bboxes[index]
+                    processed_labels = processed_labels[index]
+        return processed_bboxes, processed_scores, processed_labels
+
+    # TODO: data_batch is no longer needed, consider adjusting the
+    #  parameter position
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            gt = copy.deepcopy(data_sample)
+            # add super-category instances
+            # TODO: Need to refactor to support LoadAnnotations
+            instances = gt['instances']
+            if self.get_supercategory:
+                supercat_instances = self._get_supercategory_ann(instances)
+                instances.extend(supercat_instances)
+            gt_labels = []
+            gt_bboxes = []
+            is_group_ofs = []
+            for ins in instances:
+                gt_labels.append(ins['bbox_label'])
+                gt_bboxes.append(ins['bbox'])
+                is_group_ofs.append(ins['is_group_of'])
+            ann = dict(
+                labels=np.array(gt_labels, dtype=np.int64),
+                bboxes=np.array(gt_bboxes, dtype=np.float32).reshape((-1, 4)),
+                gt_is_group_ofs=np.array(is_group_ofs, dtype=bool))
+
+            image_level_labels = gt.get('image_level_labels', None)
+            pred = data_sample['pred_instances']
+            pred_bboxes = pred['bboxes'].cpu().numpy()
+            pred_scores = pred['scores'].cpu().numpy()
+            pred_labels = pred['labels'].cpu().numpy()
+
+            pred_bboxes, pred_scores, pred_labels = self._process_predictions(
+                pred_bboxes, pred_scores, pred_labels, instances,
+                image_level_labels)
+
+            dets = []
+            for label in range(len(self.dataset_meta['classes'])):
+                index = np.where(pred_labels == label)[0]
+                pred_bbox_scores = np.hstack(
+                    [pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
+                dets.append(pred_bbox_scores)
+            self.results.append((ann, dets))
+
+    def compute_metrics(self, results: list) -> dict:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            dict: The computed metrics. The keys are the names of the metrics,
+            and the values are corresponding results.
+        """
+        logger = MMLogger.get_current_instance()
+        gts, preds = zip(*results)
+        eval_results = OrderedDict()
+        # get dataset type
+        dataset_type = self.dataset_meta.get('dataset_type')
+        if dataset_type not in ['oid_challenge', 'oid_v6']:
+            dataset_type = 'oid_v6'
+            print_log(
+                'Cannot infer dataset type from the length of the'
+                ' classes. Set `oid_v6` as dataset type.',
+                logger='current')
+        mean_aps = []
+        for i, (iou_thr,
+                ioa_thr) in enumerate(zip(self.iou_thrs, self.ioa_thrs)):
+            if self.use_group_of:
+                assert ioa_thr is not None, 'ioa_thr must have value when' \
+                                            ' using group_of in evaluation.'
+            print_log(f'\n{"-" * 15}iou_thr, ioa_thr: {iou_thr}, {ioa_thr}'
+                      f'{"-" * 15}')
+            mean_ap, _ = eval_map(
+                preds,
+                gts,
+                scale_ranges=self.scale_ranges,
+                iou_thr=iou_thr,
+                ioa_thr=ioa_thr,
+                dataset=dataset_type,
+                logger=logger,
+                use_group_of=self.use_group_of)
+
+            mean_aps.append(mean_ap)
+            eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3)
+        eval_results['mAP'] = sum(mean_aps) / len(mean_aps)
+        return eval_results
diff --git a/mmde/mmdet/evaluation/metrics/ov_coco_metric.py b/mmde/mmdet/evaluation/metrics/ov_coco_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..08cb902514914947551a5047c9900947738adf24
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/ov_coco_metric.py
@@ -0,0 +1,266 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+import os.path as osp
+import tempfile
+from collections import OrderedDict
+from typing import Dict
+
+import numpy as np
+from mmengine.fileio import load
+from mmengine.logging import MMLogger
+from terminaltables import AsciiTable
+
+from mmdet.datasets.api_wrappers import COCO, COCOeval, COCOevalMP
+from mmdet.registry import METRICS
+from .coco_metric import CocoMetric
+
+
+@METRICS.register_module()
+class OVCocoMetric(CocoMetric):
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        # split gt and prediction list
+        gts, preds = zip(*results)
+
+        tmp_dir = None
+        if self.outfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            outfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            outfile_prefix = self.outfile_prefix
+
+        if self._coco_api is None:
+            # use converted gt json file to initialize coco api
+            logger.info('Converting ground truth to coco format...')
+            coco_json_path = self.gt_to_coco_json(
+                gt_dicts=gts, outfile_prefix=outfile_prefix)
+            self._coco_api = COCO(coco_json_path)
+
+        # handle lazy init
+        if self.cat_ids is None:
+            self.cat_ids = self._coco_api.get_cat_ids(
+                cat_names=self.dataset_meta['classes'])
+            self.base_cat_ids = self._coco_api.get_cat_ids(
+                cat_names=self.dataset_meta['base_classes'])
+            self.novel_cat_ids = self._coco_api.get_cat_ids(
+                cat_names=self.dataset_meta['novel_classes'])
+
+        if self.img_ids is None:
+            self.img_ids = self._coco_api.get_img_ids()
+
+        # convert predictions to coco format and dump to json file
+        result_files = self.results2json(preds, outfile_prefix)
+
+        eval_results = OrderedDict()
+        if self.format_only:
+            logger.info('results are saved in '
+                        f'{osp.dirname(outfile_prefix)}')
+            return eval_results
+
+        for metric in self.metrics:
+            logger.info(f'Evaluating {metric}...')
+
+            # TODO: May refactor fast_eval_recall to an independent metric?
+            # fast eval recall
+            if metric == 'proposal_fast':
+                ar = self.fast_eval_recall(
+                    preds, self.proposal_nums, self.iou_thrs, logger=logger)
+                log_msg = []
+                for i, num in enumerate(self.proposal_nums):
+                    eval_results[f'AR@{num}'] = ar[i]
+                    log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
+                log_msg = ''.join(log_msg)
+                logger.info(log_msg)
+                continue
+
+            # evaluate proposal, bbox and segm
+            iou_type = 'bbox' if metric == 'proposal' else metric
+            if metric not in result_files:
+                raise KeyError(f'{metric} is not in results')
+            try:
+                predictions = load(result_files[metric])
+                if iou_type == 'segm':
+                    # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331  # noqa
+                    # When evaluating mask AP, if the results contain bbox,
+                    # cocoapi will use the box area instead of the mask area
+                    # for calculating the instance area. Though the overall AP
+                    # is not affected, this leads to different
+                    # small/medium/large mask AP results.
+                    for x in predictions:
+                        x.pop('bbox')
+                coco_dt = self._coco_api.loadRes(predictions)
+
+            except IndexError:
+                logger.error(
+                    'The testing results of the whole dataset is empty.')
+                break
+
+            if self.use_mp_eval:
+                coco_eval = COCOevalMP(self._coco_api, coco_dt, iou_type)
+            else:
+                coco_eval = COCOeval(self._coco_api, coco_dt, iou_type)
+
+            coco_eval.params.catIds = self.cat_ids
+            coco_eval.params.imgIds = self.img_ids
+            coco_eval.params.maxDets = list(self.proposal_nums)
+            coco_eval.params.iouThrs = self.iou_thrs
+
+            # mapping of cocoEval.stats
+            coco_metric_names = {
+                'mAP': 0,
+                'mAP_50': 1,
+                'mAP_75': 2,
+                'mAP_s': 3,
+                'mAP_m': 4,
+                'mAP_l': 5,
+                'AR@100': 6,
+                'AR@300': 7,
+                'AR@1000': 8,
+                'AR_s@1000': 9,
+                'AR_m@1000': 10,
+                'AR_l@1000': 11
+            }
+            metric_items = self.metric_items
+            if metric_items is not None:
+                for metric_item in metric_items:
+                    if metric_item not in coco_metric_names:
+                        raise KeyError(
+                            f'metric item "{metric_item}" is not supported')
+
+            if metric == 'proposal':
+                coco_eval.params.useCats = 0
+                coco_eval.evaluate()
+                coco_eval.accumulate()
+                coco_eval.summarize()
+                if metric_items is None:
+                    metric_items = [
+                        'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
+                        'AR_m@1000', 'AR_l@1000'
+                    ]
+
+                for item in metric_items:
+                    val = float(
+                        f'{coco_eval.stats[coco_metric_names[item]]:.3f}')
+                    eval_results[item] = val
+            else:
+                coco_eval.evaluate()
+                coco_eval.accumulate()
+                coco_eval.summarize()
+                if self.classwise:  # Compute per-category AP
+                    # Compute per-category AP
+                    # from https://github.com/facebookresearch/detectron2/
+                    precisions = coco_eval.eval['precision']
+                    # precision: (iou, recall, cls, area range, max dets)
+                    assert len(self.cat_ids) == precisions.shape[2]
+
+                    results_per_category = []
+                    for idx, cat_id in enumerate(self.cat_ids):
+                        t = []
+                        # area range index 0: all area ranges
+                        # max dets index -1: typically 100 per image
+                        nm = self._coco_api.loadCats(cat_id)[0]
+                        precision = precisions[:, :, idx, 0, -1]
+                        precision = precision[precision > -1]
+                        if precision.size:
+                            ap = np.mean(precision)
+                        else:
+                            ap = float('nan')
+                        t.append(f'{nm["name"]}')
+                        t.append(f'{round(ap, 3)}')
+                        eval_results[f'{nm["name"]}_precision'] = round(ap, 3)
+
+                        # indexes of IoU  @50 and @75
+                        for iou in [0, 5]:
+                            precision = precisions[iou, :, idx, 0, -1]
+                            precision = precision[precision > -1]
+                            if precision.size:
+                                ap = np.mean(precision)
+                            else:
+                                ap = float('nan')
+                            t.append(f'{round(ap, 3)}')
+
+                        # indexes of area of small, median and large
+                        for area in [1, 2, 3]:
+                            precision = precisions[:, :, idx, area, -1]
+                            precision = precision[precision > -1]
+                            if precision.size:
+                                ap = np.mean(precision)
+                            else:
+                                ap = float('nan')
+                            t.append(f'{round(ap, 3)}')
+                        results_per_category.append(tuple(t))
+
+                    num_columns = len(results_per_category[0])
+                    results_flatten = list(
+                        itertools.chain(*results_per_category))
+                    headers = [
+                        'category', 'mAP', 'mAP_50', 'mAP_75', 'mAP_s',
+                        'mAP_m', 'mAP_l'
+                    ]
+                    results_2d = itertools.zip_longest(*[
+                        results_flatten[i::num_columns]
+                        for i in range(num_columns)
+                    ])
+                    table_data = [headers]
+                    table_data += [result for result in results_2d]
+                    table = AsciiTable(table_data)
+                    logger.info('\n' + table.table)
+
+                # ------------get novel_ap50 and base_ap50---------
+                precisions = coco_eval.eval['precision']
+                assert len(self.cat_ids) == precisions.shape[2]
+                base_inds, novel_inds = [], []
+
+                for idx, catId in enumerate(self.cat_ids):
+                    if catId in self.base_cat_ids:
+                        base_inds.append(idx)
+                    if catId in self.novel_cat_ids:
+                        novel_inds.append(idx)
+
+                base_ap = precisions[:, :, base_inds, 0, -1]
+                novel_ap = precisions[:, :, novel_inds, 0, -1]
+                base_ap50 = precisions[0, :, base_inds, 0, -1]
+                novel_ap50 = precisions[0, :, novel_inds, 0, -1]
+
+                eval_results['base_ap'] = np.mean(
+                    base_ap[base_ap > -1]) if len(
+                        base_ap[base_ap > -1]) else -1
+                eval_results['novel_ap'] = np.mean(
+                    novel_ap[novel_ap > -1]) if len(
+                        novel_ap[novel_ap > -1]) else -1
+                eval_results['base_ap50'] = np.mean(
+                    base_ap50[base_ap50 > -1]) if len(
+                        base_ap50[base_ap50 > -1]) else -1
+                eval_results['novel_ap50'] = np.mean(
+                    novel_ap50[novel_ap50 > -1]) if len(
+                        novel_ap50[novel_ap50 > -1]) else -1
+                # ------------get novel_ap50 and base_ap50---------
+                if metric_items is None:
+                    metric_items = [
+                        'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
+                    ]
+
+                for metric_item in metric_items:
+                    key = f'{metric}_{metric_item}'
+                    val = coco_eval.stats[coco_metric_names[metric_item]]
+                    eval_results[key] = float(f'{round(val, 3)}')
+
+                ap = coco_eval.stats[:6]
+                logger.info(f'{metric}_mAP_copypaste: {ap[0]:.3f} '
+                            f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
+                            f'{ap[4]:.3f} {ap[5]:.3f}')
+
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+        return eval_results
diff --git a/mmde/mmdet/evaluation/metrics/refexp_metric.py b/mmde/mmdet/evaluation/metrics/refexp_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bcdf1629b9bcd9519e0160769810168017a6d0d
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/refexp_metric.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Sequence
+
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.fileio import get_local_path
+from mmengine.logging import MMLogger
+
+from mmdet.datasets.api_wrappers import COCO
+from mmdet.registry import METRICS
+from ..functional import bbox_overlaps
+
+
+@METRICS.register_module()
+class RefExpMetric(BaseMetric):
+    default_prefix: Optional[str] = 'refexp'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 metric: str = 'bbox',
+                 topk=(1, 5, 10),
+                 iou_thrs: float = 0.5,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.metric = metric
+        self.topk = topk
+        self.iou_thrs = iou_thrs
+
+        with get_local_path(ann_file) as local_path:
+            self.coco = COCO(local_path)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        for data_sample in data_samples:
+            result = dict()
+            pred = data_sample['pred_instances']
+            result['img_id'] = data_sample['img_id']
+            result['bboxes'] = pred['bboxes'].cpu().numpy()
+            result['scores'] = pred['scores'].cpu().numpy()
+            self.results.append(result)
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        dataset2score = {
+            'refcoco': {k: 0.0
+                        for k in self.topk},
+            'refcoco+': {k: 0.0
+                         for k in self.topk},
+            'refcocog': {k: 0.0
+                         for k in self.topk},
+        }
+        dataset2count = {'refcoco': 0.0, 'refcoco+': 0.0, 'refcocog': 0.0}
+
+        for result in results:
+            img_id = result['img_id']
+
+            ann_ids = self.coco.getAnnIds(imgIds=img_id)
+            assert len(ann_ids) == 1
+            img_info = self.coco.loadImgs(img_id)[0]
+            target = self.coco.loadAnns(ann_ids[0])
+
+            target_bbox = target[0]['bbox']
+            converted_bbox = [
+                target_bbox[0],
+                target_bbox[1],
+                target_bbox[2] + target_bbox[0],
+                target_bbox[3] + target_bbox[1],
+            ]
+            iou = bbox_overlaps(result['bboxes'],
+                                np.array(converted_bbox).reshape(-1, 4))
+            for k in self.topk:
+                if max(iou[:k]) >= self.iou_thrs:
+                    dataset2score[img_info['dataset_name']][k] += 1.0
+            dataset2count[img_info['dataset_name']] += 1.0
+
+        for key, value in dataset2score.items():
+            for k in self.topk:
+                try:
+                    value[k] /= dataset2count[key]
+                except Exception as e:
+                    print(e)
+
+        results = {}
+        mean_precision = 0.0
+        for key, value in dataset2score.items():
+            results[key] = sorted([v for k, v in value.items()])
+            mean_precision += sum(results[key])
+            logger.info(
+                f' Dataset: {key} - Precision @ 1, 5, 10: {results[key]}')
+
+        # `mean_precision` key is used for saving the best checkpoint
+        out_results = {'mean_precision': mean_precision / 9.0}
+
+        for i, k in enumerate(self.topk):
+            out_results[f'refcoco_precision@{k}'] = results['refcoco'][i]
+        for i, k in enumerate(self.topk):
+            out_results[f'refcoco+_precision@{k}'] = results['refcoco+'][i]
+        for i, k in enumerate(self.topk):
+            out_results[f'refcocog_precision@{k}'] = results['refcocog'][i]
+        return out_results
diff --git a/mmde/mmdet/evaluation/metrics/refseg_metric.py b/mmde/mmdet/evaluation/metrics/refseg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..0faee07007e809ef08e86a88e8b11c2be1a64034
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/refseg_metric.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+import torch
+from mmengine.evaluator import BaseMetric
+
+from mmdet.registry import METRICS
+
+
+@METRICS.register_module()
+class RefSegMetric(BaseMetric):
+    """Referring Expression Segmentation Metric."""
+
+    def __init__(self, metric: Sequence = ('cIoU', 'mIoU'), **kwargs):
+        super().__init__(**kwargs)
+        assert set(metric).issubset(['cIoU', 'mIoU']), \
+            f'Only support cIoU and mIoU, but got {metric}'
+        assert len(metric) > 0, 'metrics should not be empty'
+        self.metrics = metric
+
+    def compute_iou(self, pred_seg: torch.Tensor,
+                    gt_seg: torch.Tensor) -> tuple:
+        overlap = pred_seg & gt_seg
+        union = pred_seg | gt_seg
+        return overlap, union
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data and data_samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        for data_sample in data_samples:
+            pred_label = data_sample['pred_instances']['masks'].bool()
+            label = data_sample['gt_masks'].to_tensor(
+                pred_label.dtype, pred_label.device).bool()
+            # calculate iou
+            overlap, union = self.compute_iou(pred_label, label)
+
+            bs = len(pred_label)
+            iou = overlap.reshape(bs, -1).sum(-1) * 1.0 / union.reshape(
+                bs, -1).sum(-1)
+            iou = torch.nan_to_num_(iou, nan=0.0)
+            self.results.append((overlap.sum(), union.sum(), iou.sum(), bs))
+
+    def compute_metrics(self, results: list) -> dict:
+        results = tuple(zip(*results))
+        assert len(results) == 4
+        cum_i = sum(results[0])
+        cum_u = sum(results[1])
+        iou = sum(results[2])
+        seg_total = sum(results[3])
+
+        metrics = {}
+        if 'cIoU' in self.metrics:
+            metrics['cIoU'] = cum_i * 100 / cum_u
+        if 'mIoU' in self.metrics:
+            metrics['mIoU'] = iou * 100 / seg_total
+        return metrics
diff --git a/mmde/mmdet/evaluation/metrics/reid_metric.py b/mmde/mmdet/evaluation/metrics/reid_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..d74df1433cdb093cfb0377b734fc5479401e09e7
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/reid_metric.py
@@ -0,0 +1,138 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+import numpy as np
+import torch
+from mmengine.evaluator import BaseMetric
+
+from mmdet.registry import METRICS
+
+
+@METRICS.register_module()
+class ReIDMetrics(BaseMetric):
+    """mAP and CMC evaluation metrics for the ReID task.
+
+    Args:
+        metric (str | list[str]): Metrics to be evaluated.
+            Default value is `mAP`.
+        metric_options: (dict, optional): Options for calculating metrics.
+            Allowed keys are 'rank_list' and 'max_rank'. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Default: None
+    """
+    allowed_metrics = ['mAP', 'CMC']
+    default_prefix: Optional[str] = 'reid-metric'
+
+    def __init__(self,
+                 metric: Union[str, Sequence[str]] = 'mAP',
+                 metric_options: Optional[dict] = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device, prefix)
+
+        if isinstance(metric, list):
+            metrics = metric
+        elif isinstance(metric, str):
+            metrics = [metric]
+        else:
+            raise TypeError('metric must be a list or a str.')
+        for metric in metrics:
+            if metric not in self.allowed_metrics:
+                raise KeyError(f'metric {metric} is not supported.')
+        self.metrics = metrics
+
+        self.metric_options = metric_options or dict(
+            rank_list=[1, 5, 10, 20], max_rank=20)
+        for rank in self.metric_options['rank_list']:
+            assert 1 <= rank <= self.metric_options['max_rank']
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            pred_feature = data_sample['pred_feature']
+            assert isinstance(pred_feature, torch.Tensor)
+            gt_label = data_sample.get('gt_label', data_sample['gt_label'])
+            assert isinstance(gt_label['label'], torch.Tensor)
+            result = dict(
+                pred_feature=pred_feature.data.cpu(),
+                gt_label=gt_label['label'].cpu())
+            self.results.append(result)
+
+    def compute_metrics(self, results: list) -> dict:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            dict: The computed metrics. The keys are the names of the metrics,
+            and the values are corresponding results.
+        """
+        # NOTICE: don't access `self.results` from the method.
+        metrics = {}
+
+        pids = torch.cat([result['gt_label'] for result in results]).numpy()
+        features = torch.stack([result['pred_feature'] for result in results])
+
+        n, c = features.size()
+        mat = torch.pow(features, 2).sum(dim=1, keepdim=True).expand(n, n)
+        distmat = mat + mat.t()
+        distmat.addmm_(features, features.t(), beta=1, alpha=-2)
+        distmat = distmat.numpy()
+
+        indices = np.argsort(distmat, axis=1)
+        matches = (pids[indices] == pids[:, np.newaxis]).astype(np.int32)
+
+        all_cmc = []
+        all_AP = []
+        num_valid_q = 0.
+        for q_idx in range(n):
+            # remove self
+            raw_cmc = matches[q_idx][1:]
+            if not np.any(raw_cmc):
+                # this condition is true when query identity
+                # does not appear in gallery
+                continue
+
+            cmc = raw_cmc.cumsum()
+            cmc[cmc > 1] = 1
+
+            all_cmc.append(cmc[:self.metric_options['max_rank']])
+            num_valid_q += 1.
+
+            # compute average precision
+            num_rel = raw_cmc.sum()
+            tmp_cmc = raw_cmc.cumsum()
+            tmp_cmc = [x / (i + 1.) for i, x in enumerate(tmp_cmc)]
+            tmp_cmc = np.asarray(tmp_cmc) * raw_cmc
+            AP = tmp_cmc.sum() / num_rel
+            all_AP.append(AP)
+
+        assert num_valid_q > 0, \
+            'Error: all query identities do not appear in gallery'
+
+        all_cmc = np.asarray(all_cmc)
+        all_cmc = all_cmc.sum(0) / num_valid_q
+        mAP = np.mean(all_AP)
+
+        if 'mAP' in self.metrics:
+            metrics['mAP'] = np.around(mAP, decimals=3)
+        if 'CMC' in self.metrics:
+            for rank in self.metric_options['rank_list']:
+                metrics[f'R{rank}'] = np.around(all_cmc[rank - 1], decimals=3)
+
+        return metrics
diff --git a/mmde/mmdet/evaluation/metrics/semseg_metric.py b/mmde/mmdet/evaluation/metrics/semseg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..3215f6788a6155bdbceb6a91259008b4d851868e
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/semseg_metric.py
@@ -0,0 +1,279 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from collections import OrderedDict
+from typing import Dict, Optional, Sequence, Union
+
+import numpy as np
+import torch
+from mmcv import imwrite
+from mmengine.dist import is_main_process
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger, print_log
+from mmengine.utils import mkdir_or_exist
+from PIL import Image
+
+try:
+    from prettytable import PrettyTable
+except ImportError:
+    PrettyTable = None
+
+from mmdet.registry import METRICS
+
+
+@METRICS.register_module()
+class SemSegMetric(BaseMetric):
+    """mIoU evaluation metric.
+
+    Args:
+        iou_metrics (list[str] | str): Metrics to be calculated, the options
+            includes 'mIoU', 'mDice' and 'mFscore'.
+        beta (int): Determines the weight of recall in the combined score.
+            Default: 1.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        output_dir (str): The directory for output prediction. Defaults to
+            None.
+        format_only (bool): Only format result for results commit without
+            perform evaluation. It is useful when you want to save the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+
+    def __init__(self,
+                 iou_metrics: Sequence[str] = ['mIoU'],
+                 beta: int = 1,
+                 collect_device: str = 'cpu',
+                 output_dir: Optional[str] = None,
+                 format_only: bool = False,
+                 backend_args: dict = None,
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        if isinstance(iou_metrics, str):
+            iou_metrics = [iou_metrics]
+        if not set(iou_metrics).issubset(set(['mIoU', 'mDice', 'mFscore'])):
+            raise KeyError(f'metrics {iou_metrics} is not supported. '
+                           f'Only supports mIoU/mDice/mFscore.')
+        self.metrics = iou_metrics
+        self.beta = beta
+        self.output_dir = output_dir
+        if self.output_dir and is_main_process():
+            mkdir_or_exist(self.output_dir)
+        self.format_only = format_only
+        self.backend_args = backend_args
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data and data_samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        num_classes = len(self.dataset_meta['classes'])
+        for data_sample in data_samples:
+            pred_label = data_sample['pred_sem_seg']['sem_seg'].squeeze()
+            # format_only always for test dataset without ground truth
+            if not self.format_only:
+                label = data_sample['gt_sem_seg']['sem_seg'].squeeze().to(
+                    pred_label)
+                ignore_index = data_sample['pred_sem_seg'].get(
+                    'ignore_index', 255)
+                self.results.append(
+                    self._compute_pred_stats(pred_label, label, num_classes,
+                                             ignore_index))
+
+            # format_result
+            if self.output_dir is not None:
+                basename = osp.splitext(osp.basename(
+                    data_sample['img_path']))[0]
+                png_filename = osp.abspath(
+                    osp.join(self.output_dir, f'{basename}.png'))
+                output_mask = pred_label.cpu().numpy()
+                output = Image.fromarray(output_mask.astype(np.uint8))
+                imwrite(output, png_filename, backend_args=self.backend_args)
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+                the metrics, and the values are corresponding results. The key
+                mainly includes aAcc, mIoU, mAcc, mDice, mFscore, mPrecision,
+                mRecall.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        if self.format_only:
+            logger.info(f'results are saved to {osp.dirname(self.output_dir)}')
+            return OrderedDict()
+
+        ret_metrics = self.get_return_metrics(results)
+
+        # summary table
+        ret_metrics_summary = OrderedDict({
+            ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2)
+            for ret_metric, ret_metric_value in ret_metrics.items()
+        })
+        metrics = dict()
+        for key, val in ret_metrics_summary.items():
+            if key == 'aAcc':
+                metrics[key] = val
+            else:
+                metrics['m' + key] = val
+
+        print_semantic_table(ret_metrics, self.dataset_meta['classes'], logger)
+
+        return metrics
+
+    def _compute_pred_stats(self, pred_label: torch.tensor,
+                            label: torch.tensor, num_classes: int,
+                            ignore_index: int):
+        """Parse semantic segmentation predictions.
+
+        Args:
+            pred_label (torch.tensor): Prediction segmentation map
+                or predict result filename. The shape is (H, W).
+            label (torch.tensor): Ground truth segmentation map
+                or label filename. The shape is (H, W).
+            num_classes (int): Number of categories.
+
+        Returns:
+            torch.Tensor: The intersection of prediction and ground truth
+                histogram on all classes.
+            torch.Tensor: The union of prediction and ground truth histogram on
+                all classes.
+            torch.Tensor: The prediction histogram on all classes.
+            torch.Tensor: The ground truth histogram on all classes.
+        """
+        assert pred_label.shape == label.shape
+        mask = label != ignore_index
+        label, pred_label = label[mask], pred_label[mask]
+
+        intersect = pred_label[pred_label == label]
+        area_intersect = torch.histc(
+            intersect.float(), bins=num_classes, min=0, max=num_classes - 1)
+        area_pred_label = torch.histc(
+            pred_label.float(), bins=num_classes, min=0, max=num_classes - 1)
+        area_label = torch.histc(
+            label.float(), bins=num_classes, min=0, max=num_classes - 1)
+        area_union = area_pred_label + area_label - area_intersect
+        result = dict(
+            area_intersect=area_intersect,
+            area_union=area_union,
+            area_pred_label=area_pred_label,
+            area_label=area_label)
+        return result
+
+    def get_return_metrics(self, results: list) -> dict:
+        """Calculate evaluation metrics.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, np.ndarray]: per category evaluation metrics,
+                shape (num_classes, ).
+        """
+
+        def f_score(precision, recall, beta=1):
+            """calculate the f-score value.
+
+            Args:
+                precision (float | torch.Tensor): The precision value.
+                recall (float | torch.Tensor): The recall value.
+                beta (int): Determines the weight of recall in the combined
+                    score. Default: 1.
+
+            Returns:
+                [torch.tensor]: The f-score value.
+            """
+            score = (1 + beta**2) * (precision * recall) / (
+                (beta**2 * precision) + recall)
+            return score
+
+        total_area_intersect = sum([r['area_intersect'] for r in results])
+        total_area_union = sum([r['area_union'] for r in results])
+        total_area_pred_label = sum([r['area_pred_label'] for r in results])
+        total_area_label = sum([r['area_label'] for r in results])
+
+        all_acc = total_area_intersect / total_area_label
+        ret_metrics = OrderedDict({'aAcc': all_acc})
+        for metric in self.metrics:
+            if metric == 'mIoU':
+                iou = total_area_intersect / total_area_union
+                acc = total_area_intersect / total_area_label
+                ret_metrics['IoU'] = iou
+                ret_metrics['Acc'] = acc
+            elif metric == 'mDice':
+                dice = 2 * total_area_intersect / (
+                    total_area_pred_label + total_area_label)
+                acc = total_area_intersect / total_area_label
+                ret_metrics['Dice'] = dice
+                ret_metrics['Acc'] = acc
+            elif metric == 'mFscore':
+                precision = total_area_intersect / total_area_pred_label
+                recall = total_area_intersect / total_area_label
+                f_value = torch.tensor([
+                    f_score(x[0], x[1], self.beta)
+                    for x in zip(precision, recall)
+                ])
+                ret_metrics['Fscore'] = f_value
+                ret_metrics['Precision'] = precision
+                ret_metrics['Recall'] = recall
+
+        ret_metrics = {
+            metric: value.cpu().numpy()
+            for metric, value in ret_metrics.items()
+        }
+
+        return ret_metrics
+
+
+def print_semantic_table(
+        results: dict,
+        class_names: list,
+        logger: Optional[Union['MMLogger', str]] = None) -> None:
+    """Print semantic segmentation evaluation results table.
+
+    Args:
+        results (dict): The evaluation results.
+        class_names (list): Class names.
+        logger (MMLogger | str, optional): Logger used for printing.
+            Default: None.
+    """
+    # each class table
+    results.pop('aAcc', None)
+    ret_metrics_class = OrderedDict({
+        ret_metric: np.round(ret_metric_value * 100, 2)
+        for ret_metric, ret_metric_value in results.items()
+    })
+
+    print_log('per class results:', logger)
+    if PrettyTable:
+        class_table_data = PrettyTable()
+        ret_metrics_class.update({'Class': class_names})
+        ret_metrics_class.move_to_end('Class', last=False)
+        for key, val in ret_metrics_class.items():
+            class_table_data.add_column(key, val)
+        print_log('\n' + class_table_data.get_string(), logger=logger)
+    else:
+        logger.warning(
+            '`prettytable` is not installed, for better table format, '
+            'please consider installing it with "pip install prettytable"')
+        print_result = {}
+        for class_name, iou, acc in zip(class_names, ret_metrics_class['IoU'],
+                                        ret_metrics_class['Acc']):
+            print_result[class_name] = {'IoU': iou, 'Acc': acc}
+        print_log(print_result, logger)
diff --git a/mmde/mmdet/evaluation/metrics/voc_metric.py b/mmde/mmdet/evaluation/metrics/voc_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..32d8c075de9c8b4fb842ad7f64f87a10c4d68546
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/voc_metric.py
@@ -0,0 +1,176 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+from collections import OrderedDict
+from typing import List, Optional, Sequence, Union
+
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger
+
+from mmdet.registry import METRICS
+from ..functional import eval_map, eval_recalls
+
+
+@METRICS.register_module()
+class VOCMetric(BaseMetric):
+    """Pascal VOC evaluation metric.
+
+    Args:
+        iou_thrs (float or List[float]): IoU threshold. Defaults to 0.5.
+        scale_ranges (List[tuple], optional): Scale ranges for evaluating
+            mAP. If not specified, all bounding boxes would be included in
+            evaluation. Defaults to None.
+        metric (str | list[str]): Metrics to be evaluated. Options are
+            'mAP', 'recall'. If is list, the first setting in the list will
+             be used to evaluate metric.
+        proposal_nums (Sequence[int]): Proposal number used for evaluating
+            recalls, such as recall@100, recall@1000.
+            Default: (100, 300, 1000).
+        eval_mode (str): 'area' or '11points', 'area' means calculating the
+            area under precision-recall curve, '11points' means calculating
+            the average precision of recalls at [0, 0.1, ..., 1].
+            The PASCAL VOC2007 defaults to use '11points', while PASCAL
+            VOC2012 defaults to use 'area'.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+
+    default_prefix: Optional[str] = 'pascal_voc'
+
+    def __init__(self,
+                 iou_thrs: Union[float, List[float]] = 0.5,
+                 scale_ranges: Optional[List[tuple]] = None,
+                 metric: Union[str, List[str]] = 'mAP',
+                 proposal_nums: Sequence[int] = (100, 300, 1000),
+                 eval_mode: str = '11points',
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.iou_thrs = [iou_thrs] if isinstance(iou_thrs, float) \
+            else iou_thrs
+        self.scale_ranges = scale_ranges
+        # voc evaluation metrics
+        if not isinstance(metric, str):
+            assert len(metric) == 1
+            metric = metric[0]
+        allowed_metrics = ['recall', 'mAP']
+        if metric not in allowed_metrics:
+            raise KeyError(
+                f"metric should be one of 'recall', 'mAP', but got {metric}.")
+        self.metric = metric
+        self.proposal_nums = proposal_nums
+        assert eval_mode in ['area', '11points'], \
+            'Unrecognized mode, only "area" and "11points" are supported'
+        self.eval_mode = eval_mode
+
+    # TODO: data_batch is no longer needed, consider adjusting the
+    #  parameter position
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            gt = copy.deepcopy(data_sample)
+            # TODO: Need to refactor to support LoadAnnotations
+            gt_instances = gt['gt_instances']
+            gt_ignore_instances = gt['ignored_instances']
+            ann = dict(
+                labels=gt_instances['labels'].cpu().numpy(),
+                bboxes=gt_instances['bboxes'].cpu().numpy(),
+                bboxes_ignore=gt_ignore_instances['bboxes'].cpu().numpy(),
+                labels_ignore=gt_ignore_instances['labels'].cpu().numpy())
+
+            pred = data_sample['pred_instances']
+            pred_bboxes = pred['bboxes'].cpu().numpy()
+            pred_scores = pred['scores'].cpu().numpy()
+            pred_labels = pred['labels'].cpu().numpy()
+
+            dets = []
+            for label in range(len(self.dataset_meta['classes'])):
+                index = np.where(pred_labels == label)[0]
+                pred_bbox_scores = np.hstack(
+                    [pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
+                dets.append(pred_bbox_scores)
+
+            self.results.append((ann, dets))
+
+    def compute_metrics(self, results: list) -> dict:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            dict: The computed metrics. The keys are the names of the metrics,
+            and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        gts, preds = zip(*results)
+        eval_results = OrderedDict()
+        if self.metric == 'mAP':
+            assert isinstance(self.iou_thrs, list)
+            dataset_type = self.dataset_meta.get('dataset_type')
+            if dataset_type in ['VOC2007', 'VOC2012']:
+                dataset_name = 'voc'
+                if dataset_type == 'VOC2007' and self.eval_mode != '11points':
+                    warnings.warn('Pascal VOC2007 uses `11points` as default '
+                                  'evaluate mode, but you are using '
+                                  f'{self.eval_mode}.')
+                elif dataset_type == 'VOC2012' and self.eval_mode != 'area':
+                    warnings.warn('Pascal VOC2012 uses `area` as default '
+                                  'evaluate mode, but you are using '
+                                  f'{self.eval_mode}.')
+            else:
+                dataset_name = self.dataset_meta['classes']
+
+            mean_aps = []
+            for iou_thr in self.iou_thrs:
+                logger.info(f'\n{"-" * 15}iou_thr: {iou_thr}{"-" * 15}')
+                # Follow the official implementation,
+                # http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar
+                # we should use the legacy coordinate system in mmdet 1.x,
+                # which means w, h should be computed as 'x2 - x1 + 1` and
+                # `y2 - y1 + 1`
+                mean_ap, _ = eval_map(
+                    preds,
+                    gts,
+                    scale_ranges=self.scale_ranges,
+                    iou_thr=iou_thr,
+                    dataset=dataset_name,
+                    logger=logger,
+                    eval_mode=self.eval_mode,
+                    use_legacy_coordinate=True)
+                mean_aps.append(mean_ap)
+                eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3)
+            eval_results['mAP'] = sum(mean_aps) / len(mean_aps)
+            eval_results.move_to_end('mAP', last=False)
+        elif self.metric == 'recall':
+            gt_bboxes = [gt['bboxes'] for gt in gts]
+            pr_bboxes = [pred[0] for pred in preds]
+            recalls = eval_recalls(
+                gt_bboxes,
+                pr_bboxes,
+                self.proposal_nums,
+                self.iou_thrs,
+                logger=logger,
+                use_legacy_coordinate=True)
+            for i, num in enumerate(self.proposal_nums):
+                for j, iou_thr in enumerate(self.iou_thrs):
+                    eval_results[f'recall@{num}@{iou_thr}'] = recalls[i, j]
+            if recalls.shape[1] > 1:
+                ar = recalls.mean(axis=1)
+                for i, num in enumerate(self.proposal_nums):
+                    eval_results[f'AR@{num}'] = ar[i]
+        return eval_results
diff --git a/mmde/mmdet/evaluation/metrics/youtube_vis_metric.py b/mmde/mmdet/evaluation/metrics/youtube_vis_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..5abc77a591c7ee5d67cdf4dc4c4926c84894ba1d
--- /dev/null
+++ b/mmde/mmdet/evaluation/metrics/youtube_vis_metric.py
@@ -0,0 +1,426 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+import warnings
+import zipfile
+from collections import OrderedDict, defaultdict
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import mmengine
+import numpy as np
+from mmengine.dist import (all_gather_object, barrier, broadcast_object_list,
+                           is_main_process)
+from mmengine.logging import MMLogger
+
+from mmdet.registry import METRICS
+from mmdet.structures.mask import encode_mask_results
+from ..functional import YTVIS, YTVISeval
+from .base_video_metric import BaseVideoMetric, collect_tracking_results
+
+
+@METRICS.register_module()
+class YouTubeVISMetric(BaseVideoMetric):
+    """mAP evaluation metrics for the VIS task.
+
+    Args:
+        metric (str | list[str]): Metrics to be evaluated.
+            Default value is `youtube_vis_ap`.
+        metric_items (List[str], optional): Metric result names to be
+            recorded in the evaluation result. Defaults to None.
+        outfile_prefix (str | None): The prefix of json files. It includes
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonyms metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Default: None
+        format_only (bool): If True, only formatting the results to the
+            official format and not performing evaluation. Defaults to False.
+    """
+
+    default_prefix: Optional[str] = 'youtube_vis'
+
+    def __init__(self,
+                 metric: Union[str, List[str]] = 'youtube_vis_ap',
+                 metric_items: Optional[Sequence[str]] = None,
+                 outfile_prefix: Optional[str] = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 format_only: bool = False) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        # vis evaluation metrics
+        self.metrics = metric if isinstance(metric, list) else [metric]
+        self.format_only = format_only
+        allowed_metrics = ['youtube_vis_ap']
+        for metric in self.metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(
+                    f"metric should be 'youtube_vis_ap', but got {metric}.")
+
+        self.metric_items = metric_items
+        self.outfile_prefix = outfile_prefix
+        self.per_video_res = []
+        self.categories = []
+        self._vis_meta_info = defaultdict(list)  # record video and image infos
+
+    def process_video(self, data_samples):
+
+        video_length = len(data_samples)
+        for frame_id in range(video_length):
+            result = dict()
+            img_data_sample = data_samples[frame_id].to_dict()
+            pred = img_data_sample['pred_track_instances']
+            video_id = img_data_sample['video_id']
+
+            result['img_id'] = img_data_sample['img_id']
+            result['bboxes'] = pred['bboxes'].cpu().numpy()
+            result['scores'] = pred['scores'].cpu().numpy()
+            result['labels'] = pred['labels'].cpu().numpy()
+            result['instances_id'] = pred['instances_id'].cpu().numpy()
+            # encode mask to RLE
+            assert 'masks' in pred, \
+                'masks must exist in YouTube-VIS metric'
+            result['masks'] = encode_mask_results(
+                pred['masks'].detach().cpu().numpy())
+
+            # parse gt
+            gt = dict()
+            gt['width'] = img_data_sample['ori_shape'][1]
+            gt['height'] = img_data_sample['ori_shape'][0]
+            gt['img_id'] = img_data_sample['img_id']
+            gt['frame_id'] = frame_id
+            gt['video_id'] = video_id
+            gt['video_length'] = video_length
+
+            if 'instances' in img_data_sample:
+                gt['anns'] = img_data_sample['instances']
+            else:
+                gt['anns'] = dict()
+            self.per_video_res.append((result, gt))
+
+        preds, gts = zip(*self.per_video_res)
+        # format the results
+        # we must format gts first to update self._vis_meta_info
+        gt_results = self._format_one_video_gts(gts)
+        pred_results = self._format_one_video_preds(preds)
+        self.per_video_res.clear()
+        # add converted result to the results list
+        self.results.append((pred_results, gt_results))
+
+    def compute_metrics(self, results: List) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (List): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        # split gt and prediction list
+        tmp_pred_results, tmp_gt_results = zip(*results)
+        gt_results = self.format_gts(tmp_gt_results)
+        pred_results = self.format_preds(tmp_pred_results)
+
+        if self.format_only:
+            self.save_pred_results(pred_results)
+            return dict()
+
+        ytvis = YTVIS(gt_results)
+
+        ytvis_dets = ytvis.loadRes(pred_results)
+        vid_ids = ytvis.getVidIds()
+
+        iou_type = metric = 'segm'
+        eval_results = OrderedDict()
+        ytvisEval = YTVISeval(ytvis, ytvis_dets, iou_type)
+        ytvisEval.params.vidIds = vid_ids
+        ytvisEval.evaluate()
+        ytvisEval.accumulate()
+        ytvisEval.summarize()
+
+        coco_metric_names = {
+            'mAP': 0,
+            'mAP_50': 1,
+            'mAP_75': 2,
+            'mAP_s': 3,
+            'mAP_m': 4,
+            'mAP_l': 5,
+            'AR@1': 6,
+            'AR@10': 7,
+            'AR@100': 8,
+            'AR_s@100': 9,
+            'AR_m@100': 10,
+            'AR_l@100': 11
+        }
+        metric_items = self.metric_items
+        if metric_items is not None:
+            for metric_item in metric_items:
+                if metric_item not in coco_metric_names:
+                    raise KeyError(
+                        f'metric item "{metric_item}" is not supported')
+
+        if metric_items is None:
+            metric_items = [
+                'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
+            ]
+        for metric_item in metric_items:
+            key = f'{metric}_{metric_item}'
+            val = float(
+                f'{ytvisEval.stats[coco_metric_names[metric_item]]:.3f}')
+            eval_results[key] = val
+
+        return eval_results
+
+    def format_gts(self, gts: Tuple[List]) -> dict:
+        """Gather all ground-truth from self.results."""
+        self.categories = [
+            dict(id=id + 1, name=name)
+            for id, name in enumerate(self.dataset_meta['classes'])
+        ]
+        gt_results = dict(
+            categories=self.categories,
+            videos=self._vis_meta_info['videos'],
+            annotations=[])
+        for gt_result in gts:
+            gt_results['annotations'].extend(gt_result)
+        return gt_results
+
+    def format_preds(self, preds: Tuple[List]) -> List:
+        """Gather all predictions from self.results."""
+        pred_results = []
+        for pred_result in preds:
+            pred_results.extend(pred_result)
+        return pred_results
+
+    def _format_one_video_preds(self, pred_dicts: Tuple[dict]) -> List:
+        """Convert the annotation to the format of YouTube-VIS.
+
+        This operation is to make it easier to use the official eval API.
+
+        Args:
+            pred_dicts (Tuple[dict]): Prediction of the dataset.
+
+        Returns:
+            List: The formatted predictions.
+        """
+        # Collate preds scatters (tuple of dict to dict of list)
+        preds = defaultdict(list)
+        for pred in pred_dicts:
+            for key in pred.keys():
+                preds[key].append(pred[key])
+
+        img_infos = self._vis_meta_info['images']
+        vid_infos = self._vis_meta_info['videos']
+        inds = [i for i, _ in enumerate(img_infos) if _['frame_id'] == 0]
+        inds.append(len(img_infos))
+        json_results = []
+        video_id = vid_infos[-1]['id']
+        # collect data for each instances in a video.
+        collect_data = dict()
+        for frame_id, (masks, scores, labels, ids) in enumerate(
+                zip(preds['masks'], preds['scores'], preds['labels'],
+                    preds['instances_id'])):
+
+            assert len(masks) == len(labels)
+            for j, id in enumerate(ids):
+                if id not in collect_data:
+                    collect_data[id] = dict(
+                        category_ids=[], scores=[], segmentations=dict())
+                collect_data[id]['category_ids'].append(labels[j])
+                collect_data[id]['scores'].append(scores[j])
+                if isinstance(masks[j]['counts'], bytes):
+                    masks[j]['counts'] = masks[j]['counts'].decode()
+                collect_data[id]['segmentations'][frame_id] = masks[j]
+
+        # transform the collected data into official format
+        for id, id_data in collect_data.items():
+            output = dict()
+            output['video_id'] = video_id
+            output['score'] = np.array(id_data['scores']).mean().item()
+            # majority voting for sequence category
+            output['category_id'] = np.bincount(
+                np.array(id_data['category_ids'])).argmax().item() + 1
+            output['segmentations'] = []
+            for frame_id in range(inds[-1] - inds[-2]):
+                if frame_id in id_data['segmentations']:
+                    output['segmentations'].append(
+                        id_data['segmentations'][frame_id])
+                else:
+                    output['segmentations'].append(None)
+            json_results.append(output)
+
+        return json_results
+
+    def _format_one_video_gts(self, gt_dicts: Tuple[dict]) -> List:
+        """Convert the annotation to the format of YouTube-VIS.
+
+        This operation is to make it easier to use the official eval API.
+
+        Args:
+            gt_dicts (Tuple[dict]): Ground truth of the dataset.
+
+        Returns:
+            list: The formatted gts.
+        """
+        video_infos = []
+        image_infos = []
+        instance_infos = defaultdict(list)
+        len_videos = dict()  # mapping from instance_id to video_length
+        vis_anns = []
+
+        # get video infos
+        for gt_dict in gt_dicts:
+            frame_id = gt_dict['frame_id']
+            video_id = gt_dict['video_id']
+            img_id = gt_dict['img_id']
+            image_info = dict(
+                id=img_id,
+                width=gt_dict['width'],
+                height=gt_dict['height'],
+                frame_id=frame_id,
+                file_name='')
+            image_infos.append(image_info)
+            if frame_id == 0:
+                video_info = dict(
+                    id=video_id,
+                    width=gt_dict['width'],
+                    height=gt_dict['height'],
+                    file_name='')
+                video_infos.append(video_info)
+
+            for ann in gt_dict['anns']:
+                label = ann['bbox_label']
+                bbox = ann['bbox']
+                instance_id = ann['instance_id']
+                # update video length
+                len_videos[instance_id] = gt_dict['video_length']
+                coco_bbox = [
+                    bbox[0],
+                    bbox[1],
+                    bbox[2] - bbox[0],
+                    bbox[3] - bbox[1],
+                ]
+
+                annotation = dict(
+                    video_id=video_id,
+                    frame_id=frame_id,
+                    bbox=coco_bbox,
+                    instance_id=instance_id,
+                    iscrowd=ann.get('ignore_flag', 0),
+                    category_id=int(label) + 1,
+                    area=coco_bbox[2] * coco_bbox[3])
+                if ann.get('mask', None):
+                    mask = ann['mask']
+                    # area = mask_util.area(mask)
+                    if isinstance(mask, dict) and isinstance(
+                            mask['counts'], bytes):
+                        mask['counts'] = mask['counts'].decode()
+                    annotation['segmentation'] = mask
+
+                instance_infos[instance_id].append(annotation)
+
+        # update vis meta info
+        self._vis_meta_info['images'].extend(image_infos)
+        self._vis_meta_info['videos'].extend(video_infos)
+
+        for instance_id, ann_infos in instance_infos.items():
+            cur_video_len = len_videos[instance_id]
+            segm = [None] * cur_video_len
+            bbox = [None] * cur_video_len
+            area = [None] * cur_video_len
+            # In the official format, no instances are represented by
+            # 'None', however, only images with instances are recorded
+            # in the current annotations, so we need to use 'None' to
+            # initialize these lists.
+            for ann_info in ann_infos:
+                frame_id = ann_info['frame_id']
+                segm[frame_id] = ann_info['segmentation']
+                bbox[frame_id] = ann_info['bbox']
+                area[frame_id] = ann_info['area']
+            instance = dict(
+                category_id=ann_infos[0]['category_id'],
+                segmentations=segm,
+                bboxes=bbox,
+                video_id=ann_infos[0]['video_id'],
+                areas=area,
+                id=instance_id,
+                iscrowd=ann_infos[0]['iscrowd'])
+            vis_anns.append(instance)
+        return vis_anns
+
+    def save_pred_results(self, pred_results: List) -> None:
+        """Save the results to a zip file (standard format for YouTube-VIS
+        Challenge).
+
+        Args:
+            pred_results (list): Testing results of the
+                dataset.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        if self.outfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            outfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            outfile_prefix = self.outfile_prefix
+        mmengine.dump(pred_results, f'{outfile_prefix}.json')
+        # zip the json file in order to submit to the test server.
+        zip_file_name = f'{outfile_prefix}.submission_file.zip'
+        zf = zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED)
+        logger.info(f"zip the 'results.json' into '{zip_file_name}', "
+                    'please submmit the zip file to the test server')
+        zf.write(f'{outfile_prefix}.json', 'results.json')
+        zf.close()
+
+    def evaluate(self, size: int) -> dict:
+        """Evaluate the model performance of the whole dataset after processing
+        all batches.
+
+        Args:
+            size (int): Length of the entire validation dataset.
+
+        Returns:
+            dict: Evaluation metrics dict on the val dataset. The keys are the
+            names of the metrics, and the values are corresponding results.
+        """
+        # wait for all processes to complete prediction.
+        barrier()
+
+        if len(self.results) == 0:
+            warnings.warn(
+                f'{self.__class__.__name__} got empty `self.results`. Please '
+                'ensure that the processed results are properly added into '
+                '`self.results` in `process` method.')
+
+        results = collect_tracking_results(self.results, self.collect_device)
+
+        # gather seq_info
+        gathered_seq_info = all_gather_object(self._vis_meta_info['videos'])
+        all_seq_info = []
+        for _seq_info in gathered_seq_info:
+            all_seq_info.extend(_seq_info)
+        # update self._vis_meta_info
+        self._vis_meta_info = dict(videos=all_seq_info)
+
+        if is_main_process():
+            _metrics = self.compute_metrics(results)  # type: ignore
+            # Add prefix to metric names
+            if self.prefix:
+                _metrics = {
+                    '/'.join((self.prefix, k)): v
+                    for k, v in _metrics.items()
+                }
+            metrics = [_metrics]
+        else:
+            metrics = [None]  # type: ignore
+
+        broadcast_object_list(metrics)
+
+        # reset the results list
+        self.results.clear()
+        # reset the vis_meta_info
+        self._vis_meta_info.clear()
+        return metrics[0]
diff --git a/mmde/mmdet/models/__init__.py b/mmde/mmdet/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0a0d5e8d350d81e72787ff73fd85c2176783b43
--- /dev/null
+++ b/mmde/mmdet/models/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .backbones import *  # noqa: F401,F403
+from .data_preprocessors import *  # noqa: F401,F403
+from .dense_heads import *  # noqa: F401,F403
+from .detectors import *  # noqa: F401,F403
+from .language_models import *  # noqa: F401,F403
+from .layers import *  # noqa: F401,F403
+from .losses import *  # noqa: F401,F403
+from .mot import *  # noqa: F401,F403
+from .necks import *  # noqa: F401,F403
+from .reid import *  # noqa: F401,F403
+from .roi_heads import *  # noqa: F401,F403
+from .seg_heads import *  # noqa: F401,F403
+from .task_modules import *  # noqa: F401,F403
+from .test_time_augs import *  # noqa: F401,F403
+from .trackers import *  # noqa: F401,F403
+from .tracking_heads import *  # noqa: F401,F403
+from .vis import *  # noqa: F401,F403
diff --git a/mmde/mmdet/models/backbones/__init__.py b/mmde/mmdet/models/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e16ff85f7037b36fb2046fcbcd3af523050a6516
--- /dev/null
+++ b/mmde/mmdet/models/backbones/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .csp_darknet import CSPDarknet
+from .cspnext import CSPNeXt
+from .darknet import Darknet
+from .detectors_resnet import DetectoRS_ResNet
+from .detectors_resnext import DetectoRS_ResNeXt
+from .efficientnet import EfficientNet
+from .hourglass import HourglassNet
+from .hrnet import HRNet
+from .mobilenet_v2 import MobileNetV2
+from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2
+from .regnet import RegNet
+from .res2net import Res2Net
+from .resnest import ResNeSt
+from .resnet import ResNet, ResNetV1d
+from .resnext import ResNeXt
+from .ssd_vgg import SSDVGG
+from .swin import SwinTransformer
+from .trident_resnet import TridentResNet
+
+__all__ = [
+    'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet',
+    'MobileNetV2', 'Res2Net', 'HourglassNet', 'DetectoRS_ResNet',
+    'DetectoRS_ResNeXt', 'Darknet', 'ResNeSt', 'TridentResNet', 'CSPDarknet',
+    'SwinTransformer', 'PyramidVisionTransformer',
+    'PyramidVisionTransformerV2', 'EfficientNet', 'CSPNeXt'
+]
diff --git a/mmde/mmdet/models/backbones/csp_darknet.py b/mmde/mmdet/models/backbones/csp_darknet.py
new file mode 100644
index 0000000000000000000000000000000000000000..a890b486f255befa23fe5a3e9746f8f9298ac33f
--- /dev/null
+++ b/mmde/mmdet/models/backbones/csp_darknet.py
@@ -0,0 +1,286 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.registry import MODELS
+from ..layers import CSPLayer
+
+
+class Focus(nn.Module):
+    """Focus width and height information into channel space.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        kernel_size (int): The kernel size of the convolution. Default: 1
+        stride (int): The stride of the convolution. Default: 1
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish').
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=1,
+                 stride=1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish')):
+        super().__init__()
+        self.conv = ConvModule(
+            in_channels * 4,
+            out_channels,
+            kernel_size,
+            stride,
+            padding=(kernel_size - 1) // 2,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+    def forward(self, x):
+        # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
+        patch_top_left = x[..., ::2, ::2]
+        patch_top_right = x[..., ::2, 1::2]
+        patch_bot_left = x[..., 1::2, ::2]
+        patch_bot_right = x[..., 1::2, 1::2]
+        x = torch.cat(
+            (
+                patch_top_left,
+                patch_bot_left,
+                patch_top_right,
+                patch_bot_right,
+            ),
+            dim=1,
+        )
+        return self.conv(x)
+
+
+class SPPBottleneck(BaseModule):
+    """Spatial pyramid pooling layer used in YOLOv3-SPP.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        kernel_sizes (tuple[int]): Sequential of kernel sizes of pooling
+            layers. Default: (5, 9, 13).
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish').
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_sizes=(5, 9, 13),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        mid_channels = in_channels // 2
+        self.conv1 = ConvModule(
+            in_channels,
+            mid_channels,
+            1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.poolings = nn.ModuleList([
+            nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
+            for ks in kernel_sizes
+        ])
+        conv2_channels = mid_channels * (len(kernel_sizes) + 1)
+        self.conv2 = ConvModule(
+            conv2_channels,
+            out_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        with torch.cuda.amp.autocast(enabled=False):
+            x = torch.cat(
+                [x] + [pooling(x) for pooling in self.poolings], dim=1)
+        x = self.conv2(x)
+        return x
+
+
+@MODELS.register_module()
+class CSPDarknet(BaseModule):
+    """CSP-Darknet backbone used in YOLOv5 and YOLOX.
+
+    Args:
+        arch (str): Architecture of CSP-Darknet, from {P5, P6}.
+            Default: P5.
+        deepen_factor (float): Depth multiplier, multiply number of
+            blocks in CSP layer by this amount. Default: 1.0.
+        widen_factor (float): Width multiplier, multiply number of
+            channels in each layer by this amount. Default: 1.0.
+        out_indices (Sequence[int]): Output from which stages.
+            Default: (2, 3, 4).
+        frozen_stages (int): Stages to be frozen (stop grad and set eval
+            mode). -1 means not freezing any parameters. Default: -1.
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Default: False.
+        arch_ovewrite(list): Overwrite default arch settings. Default: None.
+        spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP
+            layers. Default: (5, 9, 13).
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True).
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    Example:
+        >>> from mmdet.models import CSPDarknet
+        >>> import torch
+        >>> self = CSPDarknet(depth=53)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)
+    """
+    # From left to right:
+    # in_channels, out_channels, num_blocks, add_identity, use_spp
+    arch_settings = {
+        'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+               [256, 512, 9, True, False], [512, 1024, 3, False, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+               [256, 512, 9, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, False, True]]
+    }
+
+    def __init__(self,
+                 arch='P5',
+                 deepen_factor=1.0,
+                 widen_factor=1.0,
+                 out_indices=(2, 3, 4),
+                 frozen_stages=-1,
+                 use_depthwise=False,
+                 arch_ovewrite=None,
+                 spp_kernal_sizes=(5, 9, 13),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 norm_eval=False,
+                 init_cfg=dict(
+                     type='Kaiming',
+                     layer='Conv2d',
+                     a=math.sqrt(5),
+                     distribution='uniform',
+                     mode='fan_in',
+                     nonlinearity='leaky_relu')):
+        super().__init__(init_cfg)
+        arch_setting = self.arch_settings[arch]
+        if arch_ovewrite:
+            arch_setting = arch_ovewrite
+        assert set(out_indices).issubset(
+            i for i in range(len(arch_setting) + 1))
+        if frozen_stages not in range(-1, len(arch_setting) + 1):
+            raise ValueError('frozen_stages must be in range(-1, '
+                             'len(arch_setting) + 1). But received '
+                             f'{frozen_stages}')
+
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.use_depthwise = use_depthwise
+        self.norm_eval = norm_eval
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+
+        self.stem = Focus(
+            3,
+            int(arch_setting[0][0] * widen_factor),
+            kernel_size=3,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.layers = ['stem']
+
+        for i, (in_channels, out_channels, num_blocks, add_identity,
+                use_spp) in enumerate(arch_setting):
+            in_channels = int(in_channels * widen_factor)
+            out_channels = int(out_channels * widen_factor)
+            num_blocks = max(round(num_blocks * deepen_factor), 1)
+            stage = []
+            conv_layer = conv(
+                in_channels,
+                out_channels,
+                3,
+                stride=2,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            stage.append(conv_layer)
+            if use_spp:
+                spp = SPPBottleneck(
+                    out_channels,
+                    out_channels,
+                    kernel_sizes=spp_kernal_sizes,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg)
+                stage.append(spp)
+            csp_layer = CSPLayer(
+                out_channels,
+                out_channels,
+                num_blocks=num_blocks,
+                add_identity=add_identity,
+                use_depthwise=use_depthwise,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            stage.append(csp_layer)
+            self.add_module(f'stage{i + 1}', nn.Sequential(*stage))
+            self.layers.append(f'stage{i + 1}')
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            for i in range(self.frozen_stages + 1):
+                m = getattr(self, self.layers[i])
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def train(self, mode=True):
+        super(CSPDarknet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+    def forward(self, x):
+        outs = []
+        for i, layer_name in enumerate(self.layers):
+            layer = getattr(self, layer_name)
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
diff --git a/mmde/mmdet/models/backbones/cspnext.py b/mmde/mmdet/models/backbones/cspnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..269725a70224047a1f7f7564ba8199e38df25cc8
--- /dev/null
+++ b/mmde/mmdet/models/backbones/cspnext.py
@@ -0,0 +1,195 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Sequence, Tuple
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from ..layers import CSPLayer
+from .csp_darknet import SPPBottleneck
+
+
+@MODELS.register_module()
+class CSPNeXt(BaseModule):
+    """CSPNeXt backbone used in RTMDet.
+
+    Args:
+        arch (str): Architecture of CSPNeXt, from {P5, P6}.
+            Defaults to P5.
+        expand_ratio (float): Ratio to adjust the number of channels of the
+            hidden layer. Defaults to 0.5.
+        deepen_factor (float): Depth multiplier, multiply number of
+            blocks in CSP layer by this amount. Defaults to 1.0.
+        widen_factor (float): Width multiplier, multiply number of
+            channels in each layer by this amount. Defaults to 1.0.
+        out_indices (Sequence[int]): Output from which stages.
+            Defaults to (2, 3, 4).
+        frozen_stages (int): Stages to be frozen (stop grad and set eval
+            mode). -1 means not freezing any parameters. Defaults to -1.
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Defaults to False.
+        arch_ovewrite (list): Overwrite default arch settings.
+            Defaults to None.
+        spp_kernel_sizes: (tuple[int]): Sequential of kernel sizes of SPP
+            layers. Defaults to (5, 9, 13).
+        channel_attention (bool): Whether to add channel attention in each
+            stage. Defaults to True.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
+            config norm layer. Defaults to dict(type='BN', requires_grad=True).
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='SiLU').
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`]): Initialization config dict.
+    """
+    # From left to right:
+    # in_channels, out_channels, num_blocks, add_identity, use_spp
+    arch_settings = {
+        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 6, True, False], [512, 1024, 3, False, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 6, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, False, True]]
+    }
+
+    def __init__(
+        self,
+        arch: str = 'P5',
+        deepen_factor: float = 1.0,
+        widen_factor: float = 1.0,
+        out_indices: Sequence[int] = (2, 3, 4),
+        frozen_stages: int = -1,
+        use_depthwise: bool = False,
+        expand_ratio: float = 0.5,
+        arch_ovewrite: dict = None,
+        spp_kernel_sizes: Sequence[int] = (5, 9, 13),
+        channel_attention: bool = True,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
+        act_cfg: ConfigType = dict(type='SiLU'),
+        norm_eval: bool = False,
+        init_cfg: OptMultiConfig = dict(
+            type='Kaiming',
+            layer='Conv2d',
+            a=math.sqrt(5),
+            distribution='uniform',
+            mode='fan_in',
+            nonlinearity='leaky_relu')
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        arch_setting = self.arch_settings[arch]
+        if arch_ovewrite:
+            arch_setting = arch_ovewrite
+        assert set(out_indices).issubset(
+            i for i in range(len(arch_setting) + 1))
+        if frozen_stages not in range(-1, len(arch_setting) + 1):
+            raise ValueError('frozen_stages must be in range(-1, '
+                             'len(arch_setting) + 1). But received '
+                             f'{frozen_stages}')
+
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.use_depthwise = use_depthwise
+        self.norm_eval = norm_eval
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+        self.stem = nn.Sequential(
+            ConvModule(
+                3,
+                int(arch_setting[0][0] * widen_factor // 2),
+                3,
+                padding=1,
+                stride=2,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg),
+            ConvModule(
+                int(arch_setting[0][0] * widen_factor // 2),
+                int(arch_setting[0][0] * widen_factor // 2),
+                3,
+                padding=1,
+                stride=1,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg),
+            ConvModule(
+                int(arch_setting[0][0] * widen_factor // 2),
+                int(arch_setting[0][0] * widen_factor),
+                3,
+                padding=1,
+                stride=1,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg))
+        self.layers = ['stem']
+
+        for i, (in_channels, out_channels, num_blocks, add_identity,
+                use_spp) in enumerate(arch_setting):
+            in_channels = int(in_channels * widen_factor)
+            out_channels = int(out_channels * widen_factor)
+            num_blocks = max(round(num_blocks * deepen_factor), 1)
+            stage = []
+            conv_layer = conv(
+                in_channels,
+                out_channels,
+                3,
+                stride=2,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            stage.append(conv_layer)
+            if use_spp:
+                spp = SPPBottleneck(
+                    out_channels,
+                    out_channels,
+                    kernel_sizes=spp_kernel_sizes,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg)
+                stage.append(spp)
+            csp_layer = CSPLayer(
+                out_channels,
+                out_channels,
+                num_blocks=num_blocks,
+                add_identity=add_identity,
+                use_depthwise=use_depthwise,
+                use_cspnext_block=True,
+                expand_ratio=expand_ratio,
+                channel_attention=channel_attention,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            stage.append(csp_layer)
+            self.add_module(f'stage{i + 1}', nn.Sequential(*stage))
+            self.layers.append(f'stage{i + 1}')
+
+    def _freeze_stages(self) -> None:
+        if self.frozen_stages >= 0:
+            for i in range(self.frozen_stages + 1):
+                m = getattr(self, self.layers[i])
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def train(self, mode=True) -> None:
+        super().train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+    def forward(self, x: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]:
+        outs = []
+        for i, layer_name in enumerate(self.layers):
+            layer = getattr(self, layer_name)
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
diff --git a/mmde/mmdet/models/backbones/darknet.py b/mmde/mmdet/models/backbones/darknet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d44da1e03f04a7e0801c10e5338277cf6244ab1
--- /dev/null
+++ b/mmde/mmdet/models/backbones/darknet.py
@@ -0,0 +1,213 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+
+import warnings
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.registry import MODELS
+
+
+class ResBlock(BaseModule):
+    """The basic residual block used in Darknet. Each ResBlock consists of two
+    ConvModules and the input is added to the final output. Each ConvModule is
+    composed of Conv, BN, and LeakyReLU. In YoloV3 paper, the first convLayer
+    has half of the number of the filters as much as the second convLayer. The
+    first convLayer has filter size of 1x1 and the second one has the filter
+    size of 3x3.
+
+    Args:
+        in_channels (int): The input channels. Must be even.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+                 init_cfg=None):
+        super(ResBlock, self).__init__(init_cfg)
+        assert in_channels % 2 == 0  # ensure the in_channels is even
+        half_in_channels = in_channels // 2
+
+        # shortcut
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        self.conv1 = ConvModule(in_channels, half_in_channels, 1, **cfg)
+        self.conv2 = ConvModule(
+            half_in_channels, in_channels, 3, padding=1, **cfg)
+
+    def forward(self, x):
+        residual = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+        out = out + residual
+
+        return out
+
+
+@MODELS.register_module()
+class Darknet(BaseModule):
+    """Darknet backbone.
+
+    Args:
+        depth (int): Depth of Darknet. Currently only support 53.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters. Default: -1.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Example:
+        >>> from mmdet.models import Darknet
+        >>> import torch
+        >>> self = Darknet(depth=53)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)
+    """
+
+    # Dict(depth: (layers, channels))
+    arch_settings = {
+        53: ((1, 2, 8, 8, 4), ((32, 64), (64, 128), (128, 256), (256, 512),
+                               (512, 1024)))
+    }
+
+    def __init__(self,
+                 depth=53,
+                 out_indices=(3, 4, 5),
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
+                 norm_eval=True,
+                 pretrained=None,
+                 init_cfg=None):
+        super(Darknet, self).__init__(init_cfg)
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalid depth {depth} for darknet')
+
+        self.depth = depth
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.layers, self.channels = self.arch_settings[depth]
+
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        self.conv1 = ConvModule(3, 32, 3, padding=1, **cfg)
+
+        self.cr_blocks = ['conv1']
+        for i, n_layers in enumerate(self.layers):
+            layer_name = f'conv_res_block{i + 1}'
+            in_c, out_c = self.channels[i]
+            self.add_module(
+                layer_name,
+                self.make_conv_res_block(in_c, out_c, n_layers, **cfg))
+            self.cr_blocks.append(layer_name)
+
+        self.norm_eval = norm_eval
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        outs = []
+        for i, layer_name in enumerate(self.cr_blocks):
+            cr_block = getattr(self, layer_name)
+            x = cr_block(x)
+            if i in self.out_indices:
+                outs.append(x)
+
+        return tuple(outs)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            for i in range(self.frozen_stages):
+                m = getattr(self, self.cr_blocks[i])
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def train(self, mode=True):
+        super(Darknet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+    @staticmethod
+    def make_conv_res_block(in_channels,
+                            out_channels,
+                            res_repeat,
+                            conv_cfg=None,
+                            norm_cfg=dict(type='BN', requires_grad=True),
+                            act_cfg=dict(type='LeakyReLU',
+                                         negative_slope=0.1)):
+        """In Darknet backbone, ConvLayer is usually followed by ResBlock. This
+        function will make that. The Conv layers always have 3x3 filters with
+        stride=2. The number of the filters in Conv layer is the same as the
+        out channels of the ResBlock.
+
+        Args:
+            in_channels (int): The number of input channels.
+            out_channels (int): The number of output channels.
+            res_repeat (int): The number of ResBlocks.
+            conv_cfg (dict): Config dict for convolution layer. Default: None.
+            norm_cfg (dict): Dictionary to construct and config norm layer.
+                Default: dict(type='BN', requires_grad=True)
+            act_cfg (dict): Config dict for activation layer.
+                Default: dict(type='LeakyReLU', negative_slope=0.1).
+        """
+
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        model = nn.Sequential()
+        model.add_module(
+            'conv',
+            ConvModule(
+                in_channels, out_channels, 3, stride=2, padding=1, **cfg))
+        for idx in range(res_repeat):
+            model.add_module('res{}'.format(idx),
+                             ResBlock(out_channels, **cfg))
+        return model
diff --git a/mmde/mmdet/models/backbones/detectors_resnet.py b/mmde/mmdet/models/backbones/detectors_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f33424fce4a933d675f1f1d3d4ad89e0173c5f9e
--- /dev/null
+++ b/mmde/mmdet/models/backbones/detectors_resnet.py
@@ -0,0 +1,353 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.logging import MMLogger
+from mmengine.model import Sequential, constant_init, kaiming_init
+from mmengine.runner.checkpoint import load_checkpoint
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.registry import MODELS
+from .resnet import BasicBlock
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNet
+
+
+class Bottleneck(_Bottleneck):
+    r"""Bottleneck for the ResNet backbone in `DetectoRS
+    <https://arxiv.org/pdf/2006.02334.pdf>`_.
+
+    This bottleneck allows the users to specify whether to use
+    SAC (Switchable Atrous Convolution) and RFP (Recursive Feature Pyramid).
+
+    Args:
+         inplanes (int): The number of input channels.
+         planes (int): The number of output channels before expansion.
+         rfp_inplanes (int, optional): The number of channels from RFP.
+             Default: None. If specified, an additional conv layer will be
+             added for ``rfp_feat``. Otherwise, the structure is the same as
+             base class.
+         sac (dict, optional): Dictionary to construct SAC. Default: None.
+         init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 rfp_inplanes=None,
+                 sac=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(Bottleneck, self).__init__(
+            inplanes, planes, init_cfg=init_cfg, **kwargs)
+
+        assert sac is None or isinstance(sac, dict)
+        self.sac = sac
+        self.with_sac = sac is not None
+        if self.with_sac:
+            self.conv2 = build_conv_layer(
+                self.sac,
+                planes,
+                planes,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                bias=False)
+
+        self.rfp_inplanes = rfp_inplanes
+        if self.rfp_inplanes:
+            self.rfp_conv = build_conv_layer(
+                None,
+                self.rfp_inplanes,
+                planes * self.expansion,
+                1,
+                stride=1,
+                bias=True)
+            if init_cfg is None:
+                self.init_cfg = dict(
+                    type='Constant', val=0, override=dict(name='rfp_conv'))
+
+    def rfp_forward(self, x, rfp_feat):
+        """The forward function that also takes the RFP features as input."""
+
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        if self.rfp_inplanes:
+            rfp_feat = self.rfp_conv(rfp_feat)
+            out = out + rfp_feat
+
+        out = self.relu(out)
+
+        return out
+
+
+class ResLayer(Sequential):
+    """ResLayer to build ResNet style backbone for RPF in detectoRS.
+
+    The difference between this module and base class is that we pass
+    ``rfp_inplanes`` to the first block.
+
+    Args:
+        block (nn.Module): block used to build ResLayer.
+        inplanes (int): inplanes of block.
+        planes (int): planes of block.
+        num_blocks (int): number of blocks.
+        stride (int): stride of the first block. Default: 1
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottleneck. Default: False
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: dict(type='BN')
+        downsample_first (bool): Downsample at the first block or last block.
+            False for Hourglass, True for ResNet. Default: True
+        rfp_inplanes (int, optional): The number of channels from RFP.
+            Default: None. If specified, an additional conv layer will be
+            added for ``rfp_feat``. Otherwise, the structure is the same as
+            base class.
+    """
+
+    def __init__(self,
+                 block,
+                 inplanes,
+                 planes,
+                 num_blocks,
+                 stride=1,
+                 avg_down=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 downsample_first=True,
+                 rfp_inplanes=None,
+                 **kwargs):
+        self.block = block
+        assert downsample_first, f'downsample_first={downsample_first} is ' \
+                                 'not supported in DetectoRS'
+
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = []
+            conv_stride = stride
+            if avg_down and stride != 1:
+                conv_stride = 1
+                downsample.append(
+                    nn.AvgPool2d(
+                        kernel_size=stride,
+                        stride=stride,
+                        ceil_mode=True,
+                        count_include_pad=False))
+            downsample.extend([
+                build_conv_layer(
+                    conv_cfg,
+                    inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=conv_stride,
+                    bias=False),
+                build_norm_layer(norm_cfg, planes * block.expansion)[1]
+            ])
+            downsample = nn.Sequential(*downsample)
+
+        layers = []
+        layers.append(
+            block(
+                inplanes=inplanes,
+                planes=planes,
+                stride=stride,
+                downsample=downsample,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                rfp_inplanes=rfp_inplanes,
+                **kwargs))
+        inplanes = planes * block.expansion
+        for _ in range(1, num_blocks):
+            layers.append(
+                block(
+                    inplanes=inplanes,
+                    planes=planes,
+                    stride=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    **kwargs))
+
+        super(ResLayer, self).__init__(*layers)
+
+
+@MODELS.register_module()
+class DetectoRS_ResNet(ResNet):
+    """ResNet backbone for DetectoRS.
+
+    Args:
+        sac (dict, optional): Dictionary to construct SAC (Switchable Atrous
+            Convolution). Default: None.
+        stage_with_sac (list): Which stage to use sac. Default: (False, False,
+            False, False).
+        rfp_inplanes (int, optional): The number of channels from RFP.
+            Default: None. If specified, an additional conv layer will be
+            added for ``rfp_feat``. Otherwise, the structure is the same as
+            base class.
+        output_img (bool): If ``True``, the input image will be inserted into
+            the starting position of output. Default: False.
+    """
+
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 sac=None,
+                 stage_with_sac=(False, False, False, False),
+                 rfp_inplanes=None,
+                 output_img=False,
+                 pretrained=None,
+                 init_cfg=None,
+                 **kwargs):
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        self.pretrained = pretrained
+        if init_cfg is not None:
+            assert isinstance(init_cfg, dict), \
+                f'init_cfg must be a dict, but got {type(init_cfg)}'
+            if 'type' in init_cfg:
+                assert init_cfg.get('type') == 'Pretrained', \
+                    'Only can initialize module by loading a pretrained model'
+            else:
+                raise KeyError('`init_cfg` must contain the key "type"')
+            self.pretrained = init_cfg.get('checkpoint')
+        self.sac = sac
+        self.stage_with_sac = stage_with_sac
+        self.rfp_inplanes = rfp_inplanes
+        self.output_img = output_img
+        super(DetectoRS_ResNet, self).__init__(**kwargs)
+
+        self.inplanes = self.stem_channels
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = self.strides[i]
+            dilation = self.dilations[i]
+            dcn = self.dcn if self.stage_with_dcn[i] else None
+            sac = self.sac if self.stage_with_sac[i] else None
+            if self.plugins is not None:
+                stage_plugins = self.make_stage_plugins(self.plugins, i)
+            else:
+                stage_plugins = None
+            planes = self.base_channels * 2**i
+            res_layer = self.make_res_layer(
+                block=self.block,
+                inplanes=self.inplanes,
+                planes=planes,
+                num_blocks=num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                avg_down=self.avg_down,
+                with_cp=self.with_cp,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                dcn=dcn,
+                sac=sac,
+                rfp_inplanes=rfp_inplanes if i > 0 else None,
+                plugins=stage_plugins)
+            self.inplanes = planes * self.block.expansion
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+    # In order to be properly initialized by RFP
+    def init_weights(self):
+        # Calling this method will cause parameter initialization exception
+        # super(DetectoRS_ResNet, self).init_weights()
+
+        if isinstance(self.pretrained, str):
+            logger = MMLogger.get_current_instance()
+            load_checkpoint(self, self.pretrained, strict=False, logger=logger)
+        elif self.pretrained is None:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d):
+                    kaiming_init(m)
+                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
+                    constant_init(m, 1)
+
+            if self.dcn is not None:
+                for m in self.modules():
+                    if isinstance(m, Bottleneck) and hasattr(
+                            m.conv2, 'conv_offset'):
+                        constant_init(m.conv2.conv_offset, 0)
+
+            if self.zero_init_residual:
+                for m in self.modules():
+                    if isinstance(m, Bottleneck):
+                        constant_init(m.norm3, 0)
+                    elif isinstance(m, BasicBlock):
+                        constant_init(m.norm2, 0)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer`` for DetectoRS."""
+        return ResLayer(**kwargs)
+
+    def forward(self, x):
+        """Forward function."""
+        outs = list(super(DetectoRS_ResNet, self).forward(x))
+        if self.output_img:
+            outs.insert(0, x)
+        return tuple(outs)
+
+    def rfp_forward(self, x, rfp_feats):
+        """Forward function for RFP."""
+        if self.deep_stem:
+            x = self.stem(x)
+        else:
+            x = self.conv1(x)
+            x = self.norm1(x)
+            x = self.relu(x)
+        x = self.maxpool(x)
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            rfp_feat = rfp_feats[i] if i > 0 else None
+            for layer in res_layer:
+                x = layer.rfp_forward(x, rfp_feat)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
diff --git a/mmde/mmdet/models/backbones/detectors_resnext.py b/mmde/mmdet/models/backbones/detectors_resnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bbd63154bb47910e27cf6a75e4b359e050063e1
--- /dev/null
+++ b/mmde/mmdet/models/backbones/detectors_resnext.py
@@ -0,0 +1,123 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+from mmcv.cnn import build_conv_layer, build_norm_layer
+
+from mmdet.registry import MODELS
+from .detectors_resnet import Bottleneck as _Bottleneck
+from .detectors_resnet import DetectoRS_ResNet
+
+
+class Bottleneck(_Bottleneck):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 groups=1,
+                 base_width=4,
+                 base_channels=64,
+                 **kwargs):
+        """Bottleneck block for ResNeXt.
+
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+        it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottleneck, self).__init__(inplanes, planes, **kwargs)
+
+        if groups == 1:
+            width = self.planes
+        else:
+            width = math.floor(self.planes *
+                               (base_width / base_channels)) * groups
+
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, width, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(
+            self.norm_cfg, width, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            self.norm_cfg, self.planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            self.inplanes,
+            width,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        fallback_on_stride = False
+        self.with_modulated_dcn = False
+        if self.with_dcn:
+            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)
+        if self.with_sac:
+            self.conv2 = build_conv_layer(
+                self.sac,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+        elif not self.with_dcn or fallback_on_stride:
+            self.conv2 = build_conv_layer(
+                self.conv_cfg,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+        else:
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            self.conv2 = build_conv_layer(
+                self.dcn,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.conv3 = build_conv_layer(
+            self.conv_cfg,
+            width,
+            self.planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+
+@MODELS.register_module()
+class DetectoRS_ResNeXt(DetectoRS_ResNet):
+    """ResNeXt backbone for DetectoRS.
+
+    Args:
+        groups (int): The number of groups in ResNeXt.
+        base_width (int): The base width of ResNeXt.
+    """
+
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self, groups=1, base_width=4, **kwargs):
+        self.groups = groups
+        self.base_width = base_width
+        super(DetectoRS_ResNeXt, self).__init__(**kwargs)
+
+    def make_res_layer(self, **kwargs):
+        return super().make_res_layer(
+            groups=self.groups,
+            base_width=self.base_width,
+            base_channels=self.base_channels,
+            **kwargs)
diff --git a/mmde/mmdet/models/backbones/efficientnet.py b/mmde/mmdet/models/backbones/efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8484afe2e34e2bf8327e8aefedb968bd9a1e7792
--- /dev/null
+++ b/mmde/mmdet/models/backbones/efficientnet.py
@@ -0,0 +1,418 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn.bricks import ConvModule, DropPath
+from mmengine.model import BaseModule, Sequential
+
+from mmdet.registry import MODELS
+from ..layers import InvertedResidual, SELayer
+from ..utils import make_divisible
+
+
+class EdgeResidual(BaseModule):
+    """Edge Residual Block.
+
+    Args:
+        in_channels (int): The input channels of this module.
+        out_channels (int): The output channels of this module.
+        mid_channels (int): The input channels of the second convolution.
+        kernel_size (int): The kernel size of the first convolution.
+            Defaults to 3.
+        stride (int): The stride of the first convolution. Defaults to 1.
+        se_cfg (dict, optional): Config dict for se layer. Defaults to None,
+            which means no se layer.
+        with_residual (bool): Use residual connection. Defaults to True.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Defaults to None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to ``dict(type='BN')``.
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to ``dict(type='ReLU')``.
+        drop_path_rate (float): stochastic depth rate. Defaults to 0.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Defaults to False.
+        init_cfg (dict | list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 mid_channels,
+                 kernel_size=3,
+                 stride=1,
+                 se_cfg=None,
+                 with_residual=True,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 drop_path_rate=0.,
+                 with_cp=False,
+                 init_cfg=None,
+                 **kwargs):
+        super(EdgeResidual, self).__init__(init_cfg=init_cfg)
+        assert stride in [1, 2]
+        self.with_cp = with_cp
+        self.drop_path = DropPath(
+            drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.with_se = se_cfg is not None
+        self.with_residual = (
+            stride == 1 and in_channels == out_channels and with_residual)
+
+        if self.with_se:
+            assert isinstance(se_cfg, dict)
+
+        self.conv1 = ConvModule(
+            in_channels=in_channels,
+            out_channels=mid_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=kernel_size // 2,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        if self.with_se:
+            self.se = SELayer(**se_cfg)
+
+        self.conv2 = ConvModule(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=stride,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            out = x
+            out = self.conv1(out)
+
+            if self.with_se:
+                out = self.se(out)
+
+            out = self.conv2(out)
+
+            if self.with_residual:
+                return x + self.drop_path(out)
+            else:
+                return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        return out
+
+
+def model_scaling(layer_setting, arch_setting):
+    """Scaling operation to the layer's parameters according to the
+    arch_setting."""
+    # scale width
+    new_layer_setting = copy.deepcopy(layer_setting)
+    for layer_cfg in new_layer_setting:
+        for block_cfg in layer_cfg:
+            block_cfg[1] = make_divisible(block_cfg[1] * arch_setting[0], 8)
+
+    # scale depth
+    split_layer_setting = [new_layer_setting[0]]
+    for layer_cfg in new_layer_setting[1:-1]:
+        tmp_index = [0]
+        for i in range(len(layer_cfg) - 1):
+            if layer_cfg[i + 1][1] != layer_cfg[i][1]:
+                tmp_index.append(i + 1)
+        tmp_index.append(len(layer_cfg))
+        for i in range(len(tmp_index) - 1):
+            split_layer_setting.append(layer_cfg[tmp_index[i]:tmp_index[i +
+                                                                        1]])
+    split_layer_setting.append(new_layer_setting[-1])
+
+    num_of_layers = [len(layer_cfg) for layer_cfg in split_layer_setting[1:-1]]
+    new_layers = [
+        int(math.ceil(arch_setting[1] * num)) for num in num_of_layers
+    ]
+
+    merge_layer_setting = [split_layer_setting[0]]
+    for i, layer_cfg in enumerate(split_layer_setting[1:-1]):
+        if new_layers[i] <= num_of_layers[i]:
+            tmp_layer_cfg = layer_cfg[:new_layers[i]]
+        else:
+            tmp_layer_cfg = copy.deepcopy(layer_cfg) + [layer_cfg[-1]] * (
+                new_layers[i] - num_of_layers[i])
+        if tmp_layer_cfg[0][3] == 1 and i != 0:
+            merge_layer_setting[-1] += tmp_layer_cfg.copy()
+        else:
+            merge_layer_setting.append(tmp_layer_cfg.copy())
+    merge_layer_setting.append(split_layer_setting[-1])
+
+    return merge_layer_setting
+
+
+@MODELS.register_module()
+class EfficientNet(BaseModule):
+    """EfficientNet backbone.
+
+    Args:
+        arch (str): Architecture of efficientnet. Defaults to b0.
+        out_indices (Sequence[int]): Output from which stages.
+            Defaults to (6, ).
+        frozen_stages (int): Stages to be frozen (all param fixed).
+            Defaults to 0, which means not freezing any parameters.
+        conv_cfg (dict): Config dict for convolution layer.
+            Defaults to None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to dict(type='Swish').
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Defaults to False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Defaults to False.
+    """
+
+    # Parameters to build layers.
+    # 'b' represents the architecture of normal EfficientNet family includes
+    # 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8'.
+    # 'e' represents the architecture of EfficientNet-EdgeTPU including 'es',
+    # 'em', 'el'.
+    # 6 parameters are needed to construct a layer, From left to right:
+    # - kernel_size: The kernel size of the block
+    # - out_channel: The number of out_channels of the block
+    # - se_ratio: The sequeeze ratio of SELayer.
+    # - stride: The stride of the block
+    # - expand_ratio: The expand_ratio of the mid_channels
+    # - block_type: -1: Not a block, 0: InvertedResidual, 1: EdgeResidual
+    layer_settings = {
+        'b': [[[3, 32, 0, 2, 0, -1]],
+              [[3, 16, 4, 1, 1, 0]],
+              [[3, 24, 4, 2, 6, 0],
+               [3, 24, 4, 1, 6, 0]],
+              [[5, 40, 4, 2, 6, 0],
+               [5, 40, 4, 1, 6, 0]],
+              [[3, 80, 4, 2, 6, 0],
+               [3, 80, 4, 1, 6, 0],
+               [3, 80, 4, 1, 6, 0],
+               [5, 112, 4, 1, 6, 0],
+               [5, 112, 4, 1, 6, 0],
+               [5, 112, 4, 1, 6, 0]],
+              [[5, 192, 4, 2, 6, 0],
+               [5, 192, 4, 1, 6, 0],
+               [5, 192, 4, 1, 6, 0],
+               [5, 192, 4, 1, 6, 0],
+               [3, 320, 4, 1, 6, 0]],
+              [[1, 1280, 0, 1, 0, -1]]
+              ],
+        'e': [[[3, 32, 0, 2, 0, -1]],
+              [[3, 24, 0, 1, 3, 1]],
+              [[3, 32, 0, 2, 8, 1],
+               [3, 32, 0, 1, 8, 1]],
+              [[3, 48, 0, 2, 8, 1],
+               [3, 48, 0, 1, 8, 1],
+               [3, 48, 0, 1, 8, 1],
+               [3, 48, 0, 1, 8, 1]],
+              [[5, 96, 0, 2, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0]],
+              [[5, 192, 0, 2, 8, 0],
+               [5, 192, 0, 1, 8, 0]],
+              [[1, 1280, 0, 1, 0, -1]]
+              ]
+    }  # yapf: disable
+
+    # Parameters to build different kinds of architecture.
+    # From left to right: scaling factor for width, scaling factor for depth,
+    # resolution.
+    arch_settings = {
+        'b0': (1.0, 1.0, 224),
+        'b1': (1.0, 1.1, 240),
+        'b2': (1.1, 1.2, 260),
+        'b3': (1.2, 1.4, 300),
+        'b4': (1.4, 1.8, 380),
+        'b5': (1.6, 2.2, 456),
+        'b6': (1.8, 2.6, 528),
+        'b7': (2.0, 3.1, 600),
+        'b8': (2.2, 3.6, 672),
+        'es': (1.0, 1.0, 224),
+        'em': (1.0, 1.1, 240),
+        'el': (1.2, 1.4, 300)
+    }
+
+    def __init__(self,
+                 arch='b0',
+                 drop_path_rate=0.,
+                 out_indices=(6, ),
+                 frozen_stages=0,
+                 conv_cfg=dict(type='Conv2dAdaptivePadding'),
+                 norm_cfg=dict(type='BN', eps=1e-3),
+                 act_cfg=dict(type='Swish'),
+                 norm_eval=False,
+                 with_cp=False,
+                 init_cfg=[
+                     dict(type='Kaiming', layer='Conv2d'),
+                     dict(
+                         type='Constant',
+                         layer=['_BatchNorm', 'GroupNorm'],
+                         val=1)
+                 ]):
+        super(EfficientNet, self).__init__(init_cfg)
+        assert arch in self.arch_settings, \
+            f'"{arch}" is not one of the arch_settings ' \
+            f'({", ".join(self.arch_settings.keys())})'
+        self.arch_setting = self.arch_settings[arch]
+        self.layer_setting = self.layer_settings[arch[:1]]
+        for index in out_indices:
+            if index not in range(0, len(self.layer_setting)):
+                raise ValueError('the item in out_indices must in '
+                                 f'range(0, {len(self.layer_setting)}). '
+                                 f'But received {index}')
+
+        if frozen_stages not in range(len(self.layer_setting) + 1):
+            raise ValueError('frozen_stages must be in range(0, '
+                             f'{len(self.layer_setting) + 1}). '
+                             f'But received {frozen_stages}')
+        self.drop_path_rate = drop_path_rate
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+
+        self.layer_setting = model_scaling(self.layer_setting,
+                                           self.arch_setting)
+        block_cfg_0 = self.layer_setting[0][0]
+        block_cfg_last = self.layer_setting[-1][0]
+        self.in_channels = make_divisible(block_cfg_0[1], 8)
+        self.out_channels = block_cfg_last[1]
+        self.layers = nn.ModuleList()
+        self.layers.append(
+            ConvModule(
+                in_channels=3,
+                out_channels=self.in_channels,
+                kernel_size=block_cfg_0[0],
+                stride=block_cfg_0[3],
+                padding=block_cfg_0[0] // 2,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                act_cfg=self.act_cfg))
+        self.make_layer()
+        # Avoid building unused layers in mmdetection.
+        if len(self.layers) < max(self.out_indices) + 1:
+            self.layers.append(
+                ConvModule(
+                    in_channels=self.in_channels,
+                    out_channels=self.out_channels,
+                    kernel_size=block_cfg_last[0],
+                    stride=block_cfg_last[3],
+                    padding=block_cfg_last[0] // 2,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg))
+
+    def make_layer(self):
+        # Without the first and the final conv block.
+        layer_setting = self.layer_setting[1:-1]
+
+        total_num_blocks = sum([len(x) for x in layer_setting])
+        block_idx = 0
+        dpr = [
+            x.item()
+            for x in torch.linspace(0, self.drop_path_rate, total_num_blocks)
+        ]  # stochastic depth decay rule
+
+        for i, layer_cfg in enumerate(layer_setting):
+            # Avoid building unused layers in mmdetection.
+            if i > max(self.out_indices) - 1:
+                break
+            layer = []
+            for i, block_cfg in enumerate(layer_cfg):
+                (kernel_size, out_channels, se_ratio, stride, expand_ratio,
+                 block_type) = block_cfg
+
+                mid_channels = int(self.in_channels * expand_ratio)
+                out_channels = make_divisible(out_channels, 8)
+                if se_ratio <= 0:
+                    se_cfg = None
+                else:
+                    # In mmdetection, the `divisor` is deleted to align
+                    # the logic of SELayer with mmpretrain.
+                    se_cfg = dict(
+                        channels=mid_channels,
+                        ratio=expand_ratio * se_ratio,
+                        act_cfg=(self.act_cfg, dict(type='Sigmoid')))
+                if block_type == 1:  # edge tpu
+                    if i > 0 and expand_ratio == 3:
+                        with_residual = False
+                        expand_ratio = 4
+                    else:
+                        with_residual = True
+                    mid_channels = int(self.in_channels * expand_ratio)
+                    if se_cfg is not None:
+                        # In mmdetection, the `divisor` is deleted to align
+                        # the logic of SELayer with mmpretrain.
+                        se_cfg = dict(
+                            channels=mid_channels,
+                            ratio=se_ratio * expand_ratio,
+                            act_cfg=(self.act_cfg, dict(type='Sigmoid')))
+                    block = partial(EdgeResidual, with_residual=with_residual)
+                else:
+                    block = InvertedResidual
+                layer.append(
+                    block(
+                        in_channels=self.in_channels,
+                        out_channels=out_channels,
+                        mid_channels=mid_channels,
+                        kernel_size=kernel_size,
+                        stride=stride,
+                        se_cfg=se_cfg,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg,
+                        drop_path_rate=dpr[block_idx],
+                        with_cp=self.with_cp,
+                        # In mmdetection, `with_expand_conv` is set to align
+                        # the logic of InvertedResidual with mmpretrain.
+                        with_expand_conv=(mid_channels != self.in_channels)))
+                self.in_channels = out_channels
+                block_idx += 1
+            self.layers.append(Sequential(*layer))
+
+    def forward(self, x):
+        outs = []
+        for i, layer in enumerate(self.layers):
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+
+        return tuple(outs)
+
+    def _freeze_stages(self):
+        for i in range(self.frozen_stages):
+            m = self.layers[i]
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def train(self, mode=True):
+        super(EfficientNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eval()
diff --git a/mmde/mmdet/models/backbones/hourglass.py b/mmde/mmdet/models/backbones/hourglass.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb58799f7b32138b3f58383419ddce9aa6d5ca18
--- /dev/null
+++ b/mmde/mmdet/models/backbones/hourglass.py
@@ -0,0 +1,225 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptMultiConfig
+from ..layers import ResLayer
+from .resnet import BasicBlock
+
+
+class HourglassModule(BaseModule):
+    """Hourglass Module for HourglassNet backbone.
+
+    Generate module recursively and use BasicBlock as the base unit.
+
+    Args:
+        depth (int): Depth of current HourglassModule.
+        stage_channels (list[int]): Feature channels of sub-modules in current
+            and follow-up HourglassModule.
+        stage_blocks (list[int]): Number of sub-modules stacked in current and
+            follow-up HourglassModule.
+        norm_cfg (ConfigType): Dictionary to construct and config norm layer.
+            Defaults to `dict(type='BN', requires_grad=True)`
+        upsample_cfg (ConfigType): Config dict for interpolate layer.
+            Defaults to `dict(mode='nearest')`
+       init_cfg (dict or ConfigDict, optional): the config to control the
+           initialization.
+    """
+
+    def __init__(self,
+                 depth: int,
+                 stage_channels: List[int],
+                 stage_blocks: List[int],
+                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
+                 upsample_cfg: ConfigType = dict(mode='nearest'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg)
+
+        self.depth = depth
+
+        cur_block = stage_blocks[0]
+        next_block = stage_blocks[1]
+
+        cur_channel = stage_channels[0]
+        next_channel = stage_channels[1]
+
+        self.up1 = ResLayer(
+            BasicBlock, cur_channel, cur_channel, cur_block, norm_cfg=norm_cfg)
+
+        self.low1 = ResLayer(
+            BasicBlock,
+            cur_channel,
+            next_channel,
+            cur_block,
+            stride=2,
+            norm_cfg=norm_cfg)
+
+        if self.depth > 1:
+            self.low2 = HourglassModule(depth - 1, stage_channels[1:],
+                                        stage_blocks[1:])
+        else:
+            self.low2 = ResLayer(
+                BasicBlock,
+                next_channel,
+                next_channel,
+                next_block,
+                norm_cfg=norm_cfg)
+
+        self.low3 = ResLayer(
+            BasicBlock,
+            next_channel,
+            cur_channel,
+            cur_block,
+            norm_cfg=norm_cfg,
+            downsample_first=False)
+
+        self.up2 = F.interpolate
+        self.upsample_cfg = upsample_cfg
+
+    def forward(self, x: torch.Tensor) -> nn.Module:
+        """Forward function."""
+        up1 = self.up1(x)
+        low1 = self.low1(x)
+        low2 = self.low2(low1)
+        low3 = self.low3(low2)
+        # Fixing `scale factor` (e.g. 2) is common for upsampling, but
+        # in some cases the spatial size is mismatched and error will arise.
+        if 'scale_factor' in self.upsample_cfg:
+            up2 = self.up2(low3, **self.upsample_cfg)
+        else:
+            shape = up1.shape[2:]
+            up2 = self.up2(low3, size=shape, **self.upsample_cfg)
+        return up1 + up2
+
+
+@MODELS.register_module()
+class HourglassNet(BaseModule):
+    """HourglassNet backbone.
+
+    Stacked Hourglass Networks for Human Pose Estimation.
+    More details can be found in the `paper
+    <https://arxiv.org/abs/1603.06937>`_ .
+
+    Args:
+        downsample_times (int): Downsample times in a HourglassModule.
+        num_stacks (int): Number of HourglassModule modules stacked,
+            1 for Hourglass-52, 2 for Hourglass-104.
+        stage_channels (Sequence[int]): Feature channel of each sub-module in a
+            HourglassModule.
+        stage_blocks (Sequence[int]): Number of sub-modules stacked in a
+            HourglassModule.
+        feat_channel (int): Feature channel of conv after a HourglassModule.
+        norm_cfg (norm_cfg): Dictionary to construct and config norm layer.
+       init_cfg (dict or ConfigDict, optional): the config to control the
+           initialization.
+
+    Example:
+        >>> from mmdet.models import HourglassNet
+        >>> import torch
+        >>> self = HourglassNet()
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 511, 511)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_output in level_outputs:
+        ...     print(tuple(level_output.shape))
+        (1, 256, 128, 128)
+        (1, 256, 128, 128)
+    """
+
+    def __init__(self,
+                 downsample_times: int = 5,
+                 num_stacks: int = 2,
+                 stage_channels: Sequence = (256, 256, 384, 384, 384, 512),
+                 stage_blocks: Sequence = (2, 2, 2, 2, 2, 4),
+                 feat_channel: int = 256,
+                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
+                 init_cfg: OptMultiConfig = None) -> None:
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super().__init__(init_cfg)
+
+        self.num_stacks = num_stacks
+        assert self.num_stacks >= 1
+        assert len(stage_channels) == len(stage_blocks)
+        assert len(stage_channels) > downsample_times
+
+        cur_channel = stage_channels[0]
+
+        self.stem = nn.Sequential(
+            ConvModule(
+                3, cur_channel // 2, 7, padding=3, stride=2,
+                norm_cfg=norm_cfg),
+            ResLayer(
+                BasicBlock,
+                cur_channel // 2,
+                cur_channel,
+                1,
+                stride=2,
+                norm_cfg=norm_cfg))
+
+        self.hourglass_modules = nn.ModuleList([
+            HourglassModule(downsample_times, stage_channels, stage_blocks)
+            for _ in range(num_stacks)
+        ])
+
+        self.inters = ResLayer(
+            BasicBlock,
+            cur_channel,
+            cur_channel,
+            num_stacks - 1,
+            norm_cfg=norm_cfg)
+
+        self.conv1x1s = nn.ModuleList([
+            ConvModule(
+                cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None)
+            for _ in range(num_stacks - 1)
+        ])
+
+        self.out_convs = nn.ModuleList([
+            ConvModule(
+                cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg)
+            for _ in range(num_stacks)
+        ])
+
+        self.remap_convs = nn.ModuleList([
+            ConvModule(
+                feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None)
+            for _ in range(num_stacks - 1)
+        ])
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def init_weights(self) -> None:
+        """Init module weights."""
+        # Training Centripetal Model needs to reset parameters for Conv2d
+        super().init_weights()
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                m.reset_parameters()
+
+    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+        """Forward function."""
+        inter_feat = self.stem(x)
+        out_feats = []
+
+        for ind in range(self.num_stacks):
+            single_hourglass = self.hourglass_modules[ind]
+            out_conv = self.out_convs[ind]
+
+            hourglass_feat = single_hourglass(inter_feat)
+            out_feat = out_conv(hourglass_feat)
+            out_feats.append(out_feat)
+
+            if ind < self.num_stacks - 1:
+                inter_feat = self.conv1x1s[ind](
+                    inter_feat) + self.remap_convs[ind](
+                        out_feat)
+                inter_feat = self.inters[ind](self.relu(inter_feat))
+
+        return out_feats
diff --git a/mmde/mmdet/models/backbones/hrnet.py b/mmde/mmdet/models/backbones/hrnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..77bd3cc7125bb7ba03cd201ab3a55174b01dde50
--- /dev/null
+++ b/mmde/mmdet/models/backbones/hrnet.py
@@ -0,0 +1,589 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule, ModuleList, Sequential
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.registry import MODELS
+from .resnet import BasicBlock, Bottleneck
+
+
+class HRModule(BaseModule):
+    """High-Resolution Module for HRNet.
+
+    In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange
+    is in this module.
+    """
+
+    def __init__(self,
+                 num_branches,
+                 blocks,
+                 num_blocks,
+                 in_channels,
+                 num_channels,
+                 multiscale_output=True,
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 block_init_cfg=None,
+                 init_cfg=None):
+        super(HRModule, self).__init__(init_cfg)
+        self.block_init_cfg = block_init_cfg
+        self._check_branches(num_branches, num_blocks, in_channels,
+                             num_channels)
+
+        self.in_channels = in_channels
+        self.num_branches = num_branches
+
+        self.multiscale_output = multiscale_output
+        self.norm_cfg = norm_cfg
+        self.conv_cfg = conv_cfg
+        self.with_cp = with_cp
+        self.branches = self._make_branches(num_branches, blocks, num_blocks,
+                                            num_channels)
+        self.fuse_layers = self._make_fuse_layers()
+        self.relu = nn.ReLU(inplace=False)
+
+    def _check_branches(self, num_branches, num_blocks, in_channels,
+                        num_channels):
+        if num_branches != len(num_blocks):
+            error_msg = f'NUM_BRANCHES({num_branches}) ' \
+                        f'!= NUM_BLOCKS({len(num_blocks)})'
+            raise ValueError(error_msg)
+
+        if num_branches != len(num_channels):
+            error_msg = f'NUM_BRANCHES({num_branches}) ' \
+                        f'!= NUM_CHANNELS({len(num_channels)})'
+            raise ValueError(error_msg)
+
+        if num_branches != len(in_channels):
+            error_msg = f'NUM_BRANCHES({num_branches}) ' \
+                        f'!= NUM_INCHANNELS({len(in_channels)})'
+            raise ValueError(error_msg)
+
+    def _make_one_branch(self,
+                         branch_index,
+                         block,
+                         num_blocks,
+                         num_channels,
+                         stride=1):
+        downsample = None
+        if stride != 1 or \
+                self.in_channels[branch_index] != \
+                num_channels[branch_index] * block.expansion:
+            downsample = nn.Sequential(
+                build_conv_layer(
+                    self.conv_cfg,
+                    self.in_channels[branch_index],
+                    num_channels[branch_index] * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, num_channels[branch_index] *
+                                 block.expansion)[1])
+
+        layers = []
+        layers.append(
+            block(
+                self.in_channels[branch_index],
+                num_channels[branch_index],
+                stride,
+                downsample=downsample,
+                with_cp=self.with_cp,
+                norm_cfg=self.norm_cfg,
+                conv_cfg=self.conv_cfg,
+                init_cfg=self.block_init_cfg))
+        self.in_channels[branch_index] = \
+            num_channels[branch_index] * block.expansion
+        for i in range(1, num_blocks[branch_index]):
+            layers.append(
+                block(
+                    self.in_channels[branch_index],
+                    num_channels[branch_index],
+                    with_cp=self.with_cp,
+                    norm_cfg=self.norm_cfg,
+                    conv_cfg=self.conv_cfg,
+                    init_cfg=self.block_init_cfg))
+
+        return Sequential(*layers)
+
+    def _make_branches(self, num_branches, block, num_blocks, num_channels):
+        branches = []
+
+        for i in range(num_branches):
+            branches.append(
+                self._make_one_branch(i, block, num_blocks, num_channels))
+
+        return ModuleList(branches)
+
+    def _make_fuse_layers(self):
+        if self.num_branches == 1:
+            return None
+
+        num_branches = self.num_branches
+        in_channels = self.in_channels
+        fuse_layers = []
+        num_out_branches = num_branches if self.multiscale_output else 1
+        for i in range(num_out_branches):
+            fuse_layer = []
+            for j in range(num_branches):
+                if j > i:
+                    fuse_layer.append(
+                        nn.Sequential(
+                            build_conv_layer(
+                                self.conv_cfg,
+                                in_channels[j],
+                                in_channels[i],
+                                kernel_size=1,
+                                stride=1,
+                                padding=0,
+                                bias=False),
+                            build_norm_layer(self.norm_cfg, in_channels[i])[1],
+                            nn.Upsample(
+                                scale_factor=2**(j - i), mode='nearest')))
+                elif j == i:
+                    fuse_layer.append(None)
+                else:
+                    conv_downsamples = []
+                    for k in range(i - j):
+                        if k == i - j - 1:
+                            conv_downsamples.append(
+                                nn.Sequential(
+                                    build_conv_layer(
+                                        self.conv_cfg,
+                                        in_channels[j],
+                                        in_channels[i],
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=1,
+                                        bias=False),
+                                    build_norm_layer(self.norm_cfg,
+                                                     in_channels[i])[1]))
+                        else:
+                            conv_downsamples.append(
+                                nn.Sequential(
+                                    build_conv_layer(
+                                        self.conv_cfg,
+                                        in_channels[j],
+                                        in_channels[j],
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=1,
+                                        bias=False),
+                                    build_norm_layer(self.norm_cfg,
+                                                     in_channels[j])[1],
+                                    nn.ReLU(inplace=False)))
+                    fuse_layer.append(nn.Sequential(*conv_downsamples))
+            fuse_layers.append(nn.ModuleList(fuse_layer))
+
+        return nn.ModuleList(fuse_layers)
+
+    def forward(self, x):
+        """Forward function."""
+        if self.num_branches == 1:
+            return [self.branches[0](x[0])]
+
+        for i in range(self.num_branches):
+            x[i] = self.branches[i](x[i])
+
+        x_fuse = []
+        for i in range(len(self.fuse_layers)):
+            y = 0
+            for j in range(self.num_branches):
+                if i == j:
+                    y += x[j]
+                else:
+                    y += self.fuse_layers[i][j](x[j])
+            x_fuse.append(self.relu(y))
+        return x_fuse
+
+
+@MODELS.register_module()
+class HRNet(BaseModule):
+    """HRNet backbone.
+
+    `High-Resolution Representations for Labeling Pixels and Regions
+    arXiv: <https://arxiv.org/abs/1904.04514>`_.
+
+    Args:
+        extra (dict): Detailed configuration for each stage of HRNet.
+            There must be 4 stages, the configuration for each stage must have
+            5 keys:
+
+                - num_modules(int): The number of HRModule in this stage.
+                - num_branches(int): The number of branches in the HRModule.
+                - block(str): The type of convolution block.
+                - num_blocks(tuple): The number of blocks in each branch.
+                    The length must be equal to num_branches.
+                - num_channels(tuple): The number of channels in each branch.
+                    The length must be equal to num_branches.
+        in_channels (int): Number of input image channels. Default: 3.
+        conv_cfg (dict): Dictionary to construct and config conv layer.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: True.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity. Default: False.
+        multiscale_output (bool): Whether to output multi-level features
+            produced by multiple branches. If False, only the first level
+            feature will be output. Default: True.
+        pretrained (str, optional): Model pretrained path. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+
+    Example:
+        >>> from mmdet.models import HRNet
+        >>> import torch
+        >>> extra = dict(
+        >>>     stage1=dict(
+        >>>         num_modules=1,
+        >>>         num_branches=1,
+        >>>         block='BOTTLENECK',
+        >>>         num_blocks=(4, ),
+        >>>         num_channels=(64, )),
+        >>>     stage2=dict(
+        >>>         num_modules=1,
+        >>>         num_branches=2,
+        >>>         block='BASIC',
+        >>>         num_blocks=(4, 4),
+        >>>         num_channels=(32, 64)),
+        >>>     stage3=dict(
+        >>>         num_modules=4,
+        >>>         num_branches=3,
+        >>>         block='BASIC',
+        >>>         num_blocks=(4, 4, 4),
+        >>>         num_channels=(32, 64, 128)),
+        >>>     stage4=dict(
+        >>>         num_modules=3,
+        >>>         num_branches=4,
+        >>>         block='BASIC',
+        >>>         num_blocks=(4, 4, 4, 4),
+        >>>         num_channels=(32, 64, 128, 256)))
+        >>> self = HRNet(extra, in_channels=1)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 1, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 32, 8, 8)
+        (1, 64, 4, 4)
+        (1, 128, 2, 2)
+        (1, 256, 1, 1)
+    """
+
+    blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck}
+
+    def __init__(self,
+                 extra,
+                 in_channels=3,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 norm_eval=True,
+                 with_cp=False,
+                 zero_init_residual=False,
+                 multiscale_output=True,
+                 pretrained=None,
+                 init_cfg=None):
+        super(HRNet, self).__init__(init_cfg)
+
+        self.pretrained = pretrained
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        # Assert configurations of 4 stages are in extra
+        assert 'stage1' in extra and 'stage2' in extra \
+               and 'stage3' in extra and 'stage4' in extra
+        # Assert whether the length of `num_blocks` and `num_channels` are
+        # equal to `num_branches`
+        for i in range(4):
+            cfg = extra[f'stage{i + 1}']
+            assert len(cfg['num_blocks']) == cfg['num_branches'] and \
+                   len(cfg['num_channels']) == cfg['num_branches']
+
+        self.extra = extra
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+        self.zero_init_residual = zero_init_residual
+
+        # stem net
+        self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            in_channels,
+            64,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False)
+
+        self.add_module(self.norm1_name, norm1)
+        self.conv2 = build_conv_layer(
+            self.conv_cfg,
+            64,
+            64,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.relu = nn.ReLU(inplace=True)
+
+        # stage 1
+        self.stage1_cfg = self.extra['stage1']
+        num_channels = self.stage1_cfg['num_channels'][0]
+        block_type = self.stage1_cfg['block']
+        num_blocks = self.stage1_cfg['num_blocks'][0]
+
+        block = self.blocks_dict[block_type]
+        stage1_out_channels = num_channels * block.expansion
+        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)
+
+        # stage 2
+        self.stage2_cfg = self.extra['stage2']
+        num_channels = self.stage2_cfg['num_channels']
+        block_type = self.stage2_cfg['block']
+
+        block = self.blocks_dict[block_type]
+        num_channels = [channel * block.expansion for channel in num_channels]
+        self.transition1 = self._make_transition_layer([stage1_out_channels],
+                                                       num_channels)
+        self.stage2, pre_stage_channels = self._make_stage(
+            self.stage2_cfg, num_channels)
+
+        # stage 3
+        self.stage3_cfg = self.extra['stage3']
+        num_channels = self.stage3_cfg['num_channels']
+        block_type = self.stage3_cfg['block']
+
+        block = self.blocks_dict[block_type]
+        num_channels = [channel * block.expansion for channel in num_channels]
+        self.transition2 = self._make_transition_layer(pre_stage_channels,
+                                                       num_channels)
+        self.stage3, pre_stage_channels = self._make_stage(
+            self.stage3_cfg, num_channels)
+
+        # stage 4
+        self.stage4_cfg = self.extra['stage4']
+        num_channels = self.stage4_cfg['num_channels']
+        block_type = self.stage4_cfg['block']
+
+        block = self.blocks_dict[block_type]
+        num_channels = [channel * block.expansion for channel in num_channels]
+        self.transition3 = self._make_transition_layer(pre_stage_channels,
+                                                       num_channels)
+        self.stage4, pre_stage_channels = self._make_stage(
+            self.stage4_cfg, num_channels, multiscale_output=multiscale_output)
+
+    @property
+    def norm1(self):
+        """nn.Module: the normalization layer named "norm1" """
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: the normalization layer named "norm2" """
+        return getattr(self, self.norm2_name)
+
+    def _make_transition_layer(self, num_channels_pre_layer,
+                               num_channels_cur_layer):
+        num_branches_cur = len(num_channels_cur_layer)
+        num_branches_pre = len(num_channels_pre_layer)
+
+        transition_layers = []
+        for i in range(num_branches_cur):
+            if i < num_branches_pre:
+                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
+                    transition_layers.append(
+                        nn.Sequential(
+                            build_conv_layer(
+                                self.conv_cfg,
+                                num_channels_pre_layer[i],
+                                num_channels_cur_layer[i],
+                                kernel_size=3,
+                                stride=1,
+                                padding=1,
+                                bias=False),
+                            build_norm_layer(self.norm_cfg,
+                                             num_channels_cur_layer[i])[1],
+                            nn.ReLU(inplace=True)))
+                else:
+                    transition_layers.append(None)
+            else:
+                conv_downsamples = []
+                for j in range(i + 1 - num_branches_pre):
+                    in_channels = num_channels_pre_layer[-1]
+                    out_channels = num_channels_cur_layer[i] \
+                        if j == i - num_branches_pre else in_channels
+                    conv_downsamples.append(
+                        nn.Sequential(
+                            build_conv_layer(
+                                self.conv_cfg,
+                                in_channels,
+                                out_channels,
+                                kernel_size=3,
+                                stride=2,
+                                padding=1,
+                                bias=False),
+                            build_norm_layer(self.norm_cfg, out_channels)[1],
+                            nn.ReLU(inplace=True)))
+                transition_layers.append(nn.Sequential(*conv_downsamples))
+
+        return nn.ModuleList(transition_layers)
+
+    def _make_layer(self, block, inplanes, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                build_conv_layer(
+                    self.conv_cfg,
+                    inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, planes * block.expansion)[1])
+
+        layers = []
+        block_init_cfg = None
+        if self.pretrained is None and not hasattr(
+                self, 'init_cfg') and self.zero_init_residual:
+            if block is BasicBlock:
+                block_init_cfg = dict(
+                    type='Constant', val=0, override=dict(name='norm2'))
+            elif block is Bottleneck:
+                block_init_cfg = dict(
+                    type='Constant', val=0, override=dict(name='norm3'))
+        layers.append(
+            block(
+                inplanes,
+                planes,
+                stride,
+                downsample=downsample,
+                with_cp=self.with_cp,
+                norm_cfg=self.norm_cfg,
+                conv_cfg=self.conv_cfg,
+                init_cfg=block_init_cfg,
+            ))
+        inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(
+                block(
+                    inplanes,
+                    planes,
+                    with_cp=self.with_cp,
+                    norm_cfg=self.norm_cfg,
+                    conv_cfg=self.conv_cfg,
+                    init_cfg=block_init_cfg))
+
+        return Sequential(*layers)
+
+    def _make_stage(self, layer_config, in_channels, multiscale_output=True):
+        num_modules = layer_config['num_modules']
+        num_branches = layer_config['num_branches']
+        num_blocks = layer_config['num_blocks']
+        num_channels = layer_config['num_channels']
+        block = self.blocks_dict[layer_config['block']]
+
+        hr_modules = []
+        block_init_cfg = None
+        if self.pretrained is None and not hasattr(
+                self, 'init_cfg') and self.zero_init_residual:
+            if block is BasicBlock:
+                block_init_cfg = dict(
+                    type='Constant', val=0, override=dict(name='norm2'))
+            elif block is Bottleneck:
+                block_init_cfg = dict(
+                    type='Constant', val=0, override=dict(name='norm3'))
+
+        for i in range(num_modules):
+            # multi_scale_output is only used for the last module
+            if not multiscale_output and i == num_modules - 1:
+                reset_multiscale_output = False
+            else:
+                reset_multiscale_output = True
+
+            hr_modules.append(
+                HRModule(
+                    num_branches,
+                    block,
+                    num_blocks,
+                    in_channels,
+                    num_channels,
+                    reset_multiscale_output,
+                    with_cp=self.with_cp,
+                    norm_cfg=self.norm_cfg,
+                    conv_cfg=self.conv_cfg,
+                    block_init_cfg=block_init_cfg))
+
+        return Sequential(*hr_modules), in_channels
+
+    def forward(self, x):
+        """Forward function."""
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.norm2(x)
+        x = self.relu(x)
+        x = self.layer1(x)
+
+        x_list = []
+        for i in range(self.stage2_cfg['num_branches']):
+            if self.transition1[i] is not None:
+                x_list.append(self.transition1[i](x))
+            else:
+                x_list.append(x)
+        y_list = self.stage2(x_list)
+
+        x_list = []
+        for i in range(self.stage3_cfg['num_branches']):
+            if self.transition2[i] is not None:
+                x_list.append(self.transition2[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage3(x_list)
+
+        x_list = []
+        for i in range(self.stage4_cfg['num_branches']):
+            if self.transition3[i] is not None:
+                x_list.append(self.transition3[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage4(x_list)
+
+        return y_list
+
+    def train(self, mode=True):
+        """Convert the model into training mode will keeping the normalization
+        layer freezed."""
+        super(HRNet, self).train(mode)
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/mmde/mmdet/models/backbones/mobilenet_v2.py b/mmde/mmdet/models/backbones/mobilenet_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4fd0519ad4d5106e1acb82624d6393052596ce8
--- /dev/null
+++ b/mmde/mmdet/models/backbones/mobilenet_v2.py
@@ -0,0 +1,198 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.registry import MODELS
+from ..layers import InvertedResidual
+from ..utils import make_divisible
+
+
+@MODELS.register_module()
+class MobileNetV2(BaseModule):
+    """MobileNetV2 backbone.
+
+    Args:
+        widen_factor (float): Width multiplier, multiply number of
+            channels in each layer by this amount. Default: 1.0.
+        out_indices (Sequence[int], optional): Output from which stages.
+            Default: (1, 2, 4, 7).
+        frozen_stages (int): Stages to be frozen (all param fixed).
+            Default: -1, which means not freezing any parameters.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU6').
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    # Parameters to build layers. 4 parameters are needed to construct a
+    # layer, from left to right: expand_ratio, channel, num_blocks, stride.
+    arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2],
+                     [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2],
+                     [6, 320, 1, 1]]
+
+    def __init__(self,
+                 widen_factor=1.,
+                 out_indices=(1, 2, 4, 7),
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU6'),
+                 norm_eval=False,
+                 with_cp=False,
+                 pretrained=None,
+                 init_cfg=None):
+        super(MobileNetV2, self).__init__(init_cfg)
+
+        self.pretrained = pretrained
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.widen_factor = widen_factor
+        self.out_indices = out_indices
+        if not set(out_indices).issubset(set(range(0, 8))):
+            raise ValueError('out_indices must be a subset of range'
+                             f'(0, 8). But received {out_indices}')
+
+        if frozen_stages not in range(-1, 8):
+            raise ValueError('frozen_stages must be in range(-1, 8). '
+                             f'But received {frozen_stages}')
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+
+        self.in_channels = make_divisible(32 * widen_factor, 8)
+
+        self.conv1 = ConvModule(
+            in_channels=3,
+            out_channels=self.in_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+        self.layers = []
+
+        for i, layer_cfg in enumerate(self.arch_settings):
+            expand_ratio, channel, num_blocks, stride = layer_cfg
+            out_channels = make_divisible(channel * widen_factor, 8)
+            inverted_res_layer = self.make_layer(
+                out_channels=out_channels,
+                num_blocks=num_blocks,
+                stride=stride,
+                expand_ratio=expand_ratio)
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, inverted_res_layer)
+            self.layers.append(layer_name)
+
+        if widen_factor > 1.0:
+            self.out_channel = int(1280 * widen_factor)
+        else:
+            self.out_channel = 1280
+
+        layer = ConvModule(
+            in_channels=self.in_channels,
+            out_channels=self.out_channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+        self.add_module('conv2', layer)
+        self.layers.append('conv2')
+
+    def make_layer(self, out_channels, num_blocks, stride, expand_ratio):
+        """Stack InvertedResidual blocks to build a layer for MobileNetV2.
+
+        Args:
+            out_channels (int): out_channels of block.
+            num_blocks (int): number of blocks.
+            stride (int): stride of the first block. Default: 1
+            expand_ratio (int): Expand the number of channels of the
+                hidden layer in InvertedResidual by this ratio. Default: 6.
+        """
+        layers = []
+        for i in range(num_blocks):
+            if i >= 1:
+                stride = 1
+            layers.append(
+                InvertedResidual(
+                    self.in_channels,
+                    out_channels,
+                    mid_channels=int(round(self.in_channels * expand_ratio)),
+                    stride=stride,
+                    with_expand_conv=expand_ratio != 1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    with_cp=self.with_cp))
+            self.in_channels = out_channels
+
+        return nn.Sequential(*layers)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            for param in self.conv1.parameters():
+                param.requires_grad = False
+        for i in range(1, self.frozen_stages + 1):
+            layer = getattr(self, f'layer{i}')
+            layer.eval()
+            for param in layer.parameters():
+                param.requires_grad = False
+
+    def forward(self, x):
+        """Forward function."""
+        x = self.conv1(x)
+        outs = []
+        for i, layer_name in enumerate(self.layers):
+            layer = getattr(self, layer_name)
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep normalization layer
+        frozen."""
+        super(MobileNetV2, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/mmde/mmdet/models/backbones/pvt.py b/mmde/mmdet/models/backbones/pvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b250f63c1b22f21a892faf4c41ccc2d20e83e13
--- /dev/null
+++ b/mmde/mmdet/models/backbones/pvt.py
@@ -0,0 +1,665 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import warnings
+from collections import OrderedDict
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer
+from mmcv.cnn.bricks.drop import build_dropout
+from mmcv.cnn.bricks.transformer import MultiheadAttention
+from mmengine.logging import MMLogger
+from mmengine.model import (BaseModule, ModuleList, Sequential, constant_init,
+                            normal_init, trunc_normal_init)
+from mmengine.model.weight_init import trunc_normal_
+from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict
+from torch.nn.modules.utils import _pair as to_2tuple
+
+from mmdet.registry import MODELS
+from ..layers import PatchEmbed, nchw_to_nlc, nlc_to_nchw
+
+
+class MixFFN(BaseModule):
+    """An implementation of MixFFN of PVT.
+
+    The differences between MixFFN & FFN:
+        1. Use 1X1 Conv to replace Linear layer.
+        2. Introduce 3X3 Depth-wise Conv to encode positional information.
+
+    Args:
+        embed_dims (int): The feature dimension. Same as
+            `MultiheadAttention`.
+        feedforward_channels (int): The hidden dimension of FFNs.
+        act_cfg (dict, optional): The activation config for FFNs.
+            Default: dict(type='GELU').
+        ffn_drop (float, optional): Probability of an element to be
+            zeroed in FFN. Default 0.0.
+        dropout_layer (obj:`ConfigDict`): The dropout_layer used
+            when adding the shortcut.
+            Default: None.
+        use_conv (bool): If True, add 3x3 DWConv between two Linear layers.
+            Defaults: False.
+        init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 feedforward_channels,
+                 act_cfg=dict(type='GELU'),
+                 ffn_drop=0.,
+                 dropout_layer=None,
+                 use_conv=False,
+                 init_cfg=None):
+        super(MixFFN, self).__init__(init_cfg=init_cfg)
+
+        self.embed_dims = embed_dims
+        self.feedforward_channels = feedforward_channels
+        self.act_cfg = act_cfg
+        activate = build_activation_layer(act_cfg)
+
+        in_channels = embed_dims
+        fc1 = Conv2d(
+            in_channels=in_channels,
+            out_channels=feedforward_channels,
+            kernel_size=1,
+            stride=1,
+            bias=True)
+        if use_conv:
+            # 3x3 depth wise conv to provide positional encode information
+            dw_conv = Conv2d(
+                in_channels=feedforward_channels,
+                out_channels=feedforward_channels,
+                kernel_size=3,
+                stride=1,
+                padding=(3 - 1) // 2,
+                bias=True,
+                groups=feedforward_channels)
+        fc2 = Conv2d(
+            in_channels=feedforward_channels,
+            out_channels=in_channels,
+            kernel_size=1,
+            stride=1,
+            bias=True)
+        drop = nn.Dropout(ffn_drop)
+        layers = [fc1, activate, drop, fc2, drop]
+        if use_conv:
+            layers.insert(1, dw_conv)
+        self.layers = Sequential(*layers)
+        self.dropout_layer = build_dropout(
+            dropout_layer) if dropout_layer else torch.nn.Identity()
+
+    def forward(self, x, hw_shape, identity=None):
+        out = nlc_to_nchw(x, hw_shape)
+        out = self.layers(out)
+        out = nchw_to_nlc(out)
+        if identity is None:
+            identity = x
+        return identity + self.dropout_layer(out)
+
+
+class SpatialReductionAttention(MultiheadAttention):
+    """An implementation of Spatial Reduction Attention of PVT.
+
+    This module is modified from MultiheadAttention which is a module from
+    mmcv.cnn.bricks.transformer.
+
+    Args:
+        embed_dims (int): The embedding dimension.
+        num_heads (int): Parallel attention heads.
+        attn_drop (float): A Dropout layer on attn_output_weights.
+            Default: 0.0.
+        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
+            Default: 0.0.
+        dropout_layer (obj:`ConfigDict`): The dropout_layer used
+            when adding the shortcut. Default: None.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default: False.
+        qkv_bias (bool): enable bias for qkv if True. Default: True.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        sr_ratio (int): The ratio of spatial reduction of Spatial Reduction
+            Attention of PVT. Default: 1.
+        init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 dropout_layer=None,
+                 batch_first=True,
+                 qkv_bias=True,
+                 norm_cfg=dict(type='LN'),
+                 sr_ratio=1,
+                 init_cfg=None):
+        super().__init__(
+            embed_dims,
+            num_heads,
+            attn_drop,
+            proj_drop,
+            batch_first=batch_first,
+            dropout_layer=dropout_layer,
+            bias=qkv_bias,
+            init_cfg=init_cfg)
+
+        self.sr_ratio = sr_ratio
+        if sr_ratio > 1:
+            self.sr = Conv2d(
+                in_channels=embed_dims,
+                out_channels=embed_dims,
+                kernel_size=sr_ratio,
+                stride=sr_ratio)
+            # The ret[0] of build_norm_layer is norm name.
+            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+
+        # handle the BC-breaking from https://github.com/open-mmlab/mmcv/pull/1418 # noqa
+        from mmdet import digit_version, mmcv_version
+        if mmcv_version < digit_version('1.3.17'):
+            warnings.warn('The legacy version of forward function in'
+                          'SpatialReductionAttention is deprecated in'
+                          'mmcv>=1.3.17 and will no longer support in the'
+                          'future. Please upgrade your mmcv.')
+            self.forward = self.legacy_forward
+
+    def forward(self, x, hw_shape, identity=None):
+
+        x_q = x
+        if self.sr_ratio > 1:
+            x_kv = nlc_to_nchw(x, hw_shape)
+            x_kv = self.sr(x_kv)
+            x_kv = nchw_to_nlc(x_kv)
+            x_kv = self.norm(x_kv)
+        else:
+            x_kv = x
+
+        if identity is None:
+            identity = x_q
+
+        # Because the dataflow('key', 'query', 'value') of
+        # ``torch.nn.MultiheadAttention`` is (num_queries, batch,
+        # embed_dims), We should adjust the shape of dataflow from
+        # batch_first (batch, num_queries, embed_dims) to num_queries_first
+        # (num_queries ,batch, embed_dims), and recover ``attn_output``
+        # from num_queries_first to batch_first.
+        if self.batch_first:
+            x_q = x_q.transpose(0, 1)
+            x_kv = x_kv.transpose(0, 1)
+
+        out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
+
+        if self.batch_first:
+            out = out.transpose(0, 1)
+
+        return identity + self.dropout_layer(self.proj_drop(out))
+
+    def legacy_forward(self, x, hw_shape, identity=None):
+        """multi head attention forward in mmcv version < 1.3.17."""
+        x_q = x
+        if self.sr_ratio > 1:
+            x_kv = nlc_to_nchw(x, hw_shape)
+            x_kv = self.sr(x_kv)
+            x_kv = nchw_to_nlc(x_kv)
+            x_kv = self.norm(x_kv)
+        else:
+            x_kv = x
+
+        if identity is None:
+            identity = x_q
+
+        out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
+
+        return identity + self.dropout_layer(self.proj_drop(out))
+
+
+class PVTEncoderLayer(BaseModule):
+    """Implements one encoder layer in PVT.
+
+    Args:
+        embed_dims (int): The feature dimension.
+        num_heads (int): Parallel attention heads.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        drop_rate (float): Probability of an element to be zeroed.
+            after the feed forward layer. Default: 0.0.
+        attn_drop_rate (float): The drop out rate for attention layer.
+            Default: 0.0.
+        drop_path_rate (float): stochastic depth rate. Default: 0.0.
+        qkv_bias (bool): enable bias for qkv if True.
+            Default: True.
+        act_cfg (dict): The activation config for FFNs.
+            Default: dict(type='GELU').
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        sr_ratio (int): The ratio of spatial reduction of Spatial Reduction
+            Attention of PVT. Default: 1.
+        use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN.
+            Default: False.
+        init_cfg (dict, optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 feedforward_channels,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 qkv_bias=True,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 sr_ratio=1,
+                 use_conv_ffn=False,
+                 init_cfg=None):
+        super(PVTEncoderLayer, self).__init__(init_cfg=init_cfg)
+
+        # The ret[0] of build_norm_layer is norm name.
+        self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
+
+        self.attn = SpatialReductionAttention(
+            embed_dims=embed_dims,
+            num_heads=num_heads,
+            attn_drop=attn_drop_rate,
+            proj_drop=drop_rate,
+            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+            qkv_bias=qkv_bias,
+            norm_cfg=norm_cfg,
+            sr_ratio=sr_ratio)
+
+        # The ret[0] of build_norm_layer is norm name.
+        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
+
+        self.ffn = MixFFN(
+            embed_dims=embed_dims,
+            feedforward_channels=feedforward_channels,
+            ffn_drop=drop_rate,
+            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+            use_conv=use_conv_ffn,
+            act_cfg=act_cfg)
+
+    def forward(self, x, hw_shape):
+        x = self.attn(self.norm1(x), hw_shape, identity=x)
+        x = self.ffn(self.norm2(x), hw_shape, identity=x)
+
+        return x
+
+
+class AbsolutePositionEmbedding(BaseModule):
+    """An implementation of the absolute position embedding in PVT.
+
+    Args:
+        pos_shape (int): The shape of the absolute position embedding.
+        pos_dim (int): The dimension of the absolute position embedding.
+        drop_rate (float): Probability of an element to be zeroed.
+            Default: 0.0.
+    """
+
+    def __init__(self, pos_shape, pos_dim, drop_rate=0., init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        if isinstance(pos_shape, int):
+            pos_shape = to_2tuple(pos_shape)
+        elif isinstance(pos_shape, tuple):
+            if len(pos_shape) == 1:
+                pos_shape = to_2tuple(pos_shape[0])
+            assert len(pos_shape) == 2, \
+                f'The size of image should have length 1 or 2, ' \
+                f'but got {len(pos_shape)}'
+        self.pos_shape = pos_shape
+        self.pos_dim = pos_dim
+
+        self.pos_embed = nn.Parameter(
+            torch.zeros(1, pos_shape[0] * pos_shape[1], pos_dim))
+        self.drop = nn.Dropout(p=drop_rate)
+
+    def init_weights(self):
+        trunc_normal_(self.pos_embed, std=0.02)
+
+    def resize_pos_embed(self, pos_embed, input_shape, mode='bilinear'):
+        """Resize pos_embed weights.
+
+        Resize pos_embed using bilinear interpolate method.
+
+        Args:
+            pos_embed (torch.Tensor): Position embedding weights.
+            input_shape (tuple): Tuple for (downsampled input image height,
+                downsampled input image width).
+            mode (str): Algorithm used for upsampling:
+                ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
+                ``'trilinear'``. Default: ``'bilinear'``.
+
+        Return:
+            torch.Tensor: The resized pos_embed of shape [B, L_new, C].
+        """
+        assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]'
+        pos_h, pos_w = self.pos_shape
+        pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):]
+        pos_embed_weight = pos_embed_weight.reshape(
+            1, pos_h, pos_w, self.pos_dim).permute(0, 3, 1, 2).contiguous()
+        pos_embed_weight = F.interpolate(
+            pos_embed_weight, size=input_shape, mode=mode)
+        pos_embed_weight = torch.flatten(pos_embed_weight,
+                                         2).transpose(1, 2).contiguous()
+        pos_embed = pos_embed_weight
+
+        return pos_embed
+
+    def forward(self, x, hw_shape, mode='bilinear'):
+        pos_embed = self.resize_pos_embed(self.pos_embed, hw_shape, mode)
+        return self.drop(x + pos_embed)
+
+
+@MODELS.register_module()
+class PyramidVisionTransformer(BaseModule):
+    """Pyramid Vision Transformer (PVT)
+
+    Implementation of `Pyramid Vision Transformer: A Versatile Backbone for
+    Dense Prediction without Convolutions
+    <https://arxiv.org/pdf/2102.12122.pdf>`_.
+
+    Args:
+        pretrain_img_size (int | tuple[int]): The size of input image when
+            pretrain. Defaults: 224.
+        in_channels (int): Number of input channels. Default: 3.
+        embed_dims (int): Embedding dimension. Default: 64.
+        num_stags (int): The num of stages. Default: 4.
+        num_layers (Sequence[int]): The layer number of each transformer encode
+            layer. Default: [3, 4, 6, 3].
+        num_heads (Sequence[int]): The attention heads of each transformer
+            encode layer. Default: [1, 2, 5, 8].
+        patch_sizes (Sequence[int]): The patch_size of each patch embedding.
+            Default: [4, 2, 2, 2].
+        strides (Sequence[int]): The stride of each patch embedding.
+            Default: [4, 2, 2, 2].
+        paddings (Sequence[int]): The padding of each patch embedding.
+            Default: [0, 0, 0, 0].
+        sr_ratios (Sequence[int]): The spatial reduction rate of each
+            transformer encode layer. Default: [8, 4, 2, 1].
+        out_indices (Sequence[int] | int): Output from which stages.
+            Default: (0, 1, 2, 3).
+        mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the
+            embedding dim of each transformer encode layer.
+            Default: [8, 8, 4, 4].
+        qkv_bias (bool): Enable bias for qkv if True. Default: True.
+        drop_rate (float): Probability of an element to be zeroed.
+            Default 0.0.
+        attn_drop_rate (float): The drop out rate for attention layer.
+            Default 0.0.
+        drop_path_rate (float): stochastic depth rate. Default 0.1.
+        use_abs_pos_embed (bool): If True, add absolute position embedding to
+            the patch embedding. Defaults: True.
+        use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN.
+            Default: False.
+        act_cfg (dict): The activation config for FFNs.
+            Default: dict(type='GELU').
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        pretrained (str, optional): model pretrained path. Default: None.
+        convert_weights (bool): The flag indicates whether the
+            pre-trained model is from the original repo. We may need
+            to convert some keys to make it compatible.
+            Default: True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 in_channels=3,
+                 embed_dims=64,
+                 num_stages=4,
+                 num_layers=[3, 4, 6, 3],
+                 num_heads=[1, 2, 5, 8],
+                 patch_sizes=[4, 2, 2, 2],
+                 strides=[4, 2, 2, 2],
+                 paddings=[0, 0, 0, 0],
+                 sr_ratios=[8, 4, 2, 1],
+                 out_indices=(0, 1, 2, 3),
+                 mlp_ratios=[8, 8, 4, 4],
+                 qkv_bias=True,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.1,
+                 use_abs_pos_embed=True,
+                 norm_after_stage=False,
+                 use_conv_ffn=False,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN', eps=1e-6),
+                 pretrained=None,
+                 convert_weights=True,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        self.convert_weights = convert_weights
+        if isinstance(pretrain_img_size, int):
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+        elif isinstance(pretrain_img_size, tuple):
+            if len(pretrain_img_size) == 1:
+                pretrain_img_size = to_2tuple(pretrain_img_size[0])
+            assert len(pretrain_img_size) == 2, \
+                f'The size of image should have length 1 or 2, ' \
+                f'but got {len(pretrain_img_size)}'
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be setting at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            self.init_cfg = init_cfg
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.embed_dims = embed_dims
+
+        self.num_stages = num_stages
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.patch_sizes = patch_sizes
+        self.strides = strides
+        self.sr_ratios = sr_ratios
+        assert num_stages == len(num_layers) == len(num_heads) \
+               == len(patch_sizes) == len(strides) == len(sr_ratios)
+
+        self.out_indices = out_indices
+        assert max(out_indices) < self.num_stages
+        self.pretrained = pretrained
+
+        # transformer encoder
+        dpr = [
+            x.item()
+            for x in torch.linspace(0, drop_path_rate, sum(num_layers))
+        ]  # stochastic num_layer decay rule
+
+        cur = 0
+        self.layers = ModuleList()
+        for i, num_layer in enumerate(num_layers):
+            embed_dims_i = embed_dims * num_heads[i]
+            patch_embed = PatchEmbed(
+                in_channels=in_channels,
+                embed_dims=embed_dims_i,
+                kernel_size=patch_sizes[i],
+                stride=strides[i],
+                padding=paddings[i],
+                bias=True,
+                norm_cfg=norm_cfg)
+
+            layers = ModuleList()
+            if use_abs_pos_embed:
+                pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1])
+                pos_embed = AbsolutePositionEmbedding(
+                    pos_shape=pos_shape,
+                    pos_dim=embed_dims_i,
+                    drop_rate=drop_rate)
+                layers.append(pos_embed)
+            layers.extend([
+                PVTEncoderLayer(
+                    embed_dims=embed_dims_i,
+                    num_heads=num_heads[i],
+                    feedforward_channels=mlp_ratios[i] * embed_dims_i,
+                    drop_rate=drop_rate,
+                    attn_drop_rate=attn_drop_rate,
+                    drop_path_rate=dpr[cur + idx],
+                    qkv_bias=qkv_bias,
+                    act_cfg=act_cfg,
+                    norm_cfg=norm_cfg,
+                    sr_ratio=sr_ratios[i],
+                    use_conv_ffn=use_conv_ffn) for idx in range(num_layer)
+            ])
+            in_channels = embed_dims_i
+            # The ret[0] of build_norm_layer is norm name.
+            if norm_after_stage:
+                norm = build_norm_layer(norm_cfg, embed_dims_i)[1]
+            else:
+                norm = nn.Identity()
+            self.layers.append(ModuleList([patch_embed, layers, norm]))
+            cur += num_layer
+
+    def init_weights(self):
+        logger = MMLogger.get_current_instance()
+        if self.init_cfg is None:
+            logger.warn(f'No pre-trained weights for '
+                        f'{self.__class__.__name__}, '
+                        f'training start from scratch')
+            for m in self.modules():
+                if isinstance(m, nn.Linear):
+                    trunc_normal_init(m, std=.02, bias=0.)
+                elif isinstance(m, nn.LayerNorm):
+                    constant_init(m, 1.0)
+                elif isinstance(m, nn.Conv2d):
+                    fan_out = m.kernel_size[0] * m.kernel_size[
+                        1] * m.out_channels
+                    fan_out //= m.groups
+                    normal_init(m, 0, math.sqrt(2.0 / fan_out))
+                elif isinstance(m, AbsolutePositionEmbedding):
+                    m.init_weights()
+        else:
+            assert 'checkpoint' in self.init_cfg, f'Only support ' \
+                                                  f'specify `Pretrained` in ' \
+                                                  f'`init_cfg` in ' \
+                                                  f'{self.__class__.__name__} '
+            checkpoint = CheckpointLoader.load_checkpoint(
+                self.init_cfg.checkpoint, logger=logger, map_location='cpu')
+            logger.warn(f'Load pre-trained model for '
+                        f'{self.__class__.__name__} from original repo')
+            if 'state_dict' in checkpoint:
+                state_dict = checkpoint['state_dict']
+            elif 'model' in checkpoint:
+                state_dict = checkpoint['model']
+            else:
+                state_dict = checkpoint
+            if self.convert_weights:
+                # Because pvt backbones are not supported by mmpretrain,
+                # so we need to convert pre-trained weights to match this
+                # implementation.
+                state_dict = pvt_convert(state_dict)
+            load_state_dict(self, state_dict, strict=False, logger=logger)
+
+    def forward(self, x):
+        outs = []
+
+        for i, layer in enumerate(self.layers):
+            x, hw_shape = layer[0](x)
+
+            for block in layer[1]:
+                x = block(x, hw_shape)
+            x = layer[2](x)
+            x = nlc_to_nchw(x, hw_shape)
+            if i in self.out_indices:
+                outs.append(x)
+
+        return outs
+
+
+@MODELS.register_module()
+class PyramidVisionTransformerV2(PyramidVisionTransformer):
+    """Implementation of `PVTv2: Improved Baselines with Pyramid Vision
+    Transformer <https://arxiv.org/pdf/2106.13797.pdf>`_."""
+
+    def __init__(self, **kwargs):
+        super(PyramidVisionTransformerV2, self).__init__(
+            patch_sizes=[7, 3, 3, 3],
+            paddings=[3, 1, 1, 1],
+            use_abs_pos_embed=False,
+            norm_after_stage=True,
+            use_conv_ffn=True,
+            **kwargs)
+
+
+def pvt_convert(ckpt):
+    new_ckpt = OrderedDict()
+    # Process the concat between q linear weights and kv linear weights
+    use_abs_pos_embed = False
+    use_conv_ffn = False
+    for k in ckpt.keys():
+        if k.startswith('pos_embed'):
+            use_abs_pos_embed = True
+        if k.find('dwconv') >= 0:
+            use_conv_ffn = True
+    for k, v in ckpt.items():
+        if k.startswith('head'):
+            continue
+        if k.startswith('norm.'):
+            continue
+        if k.startswith('cls_token'):
+            continue
+        if k.startswith('pos_embed'):
+            stage_i = int(k.replace('pos_embed', ''))
+            new_k = k.replace(f'pos_embed{stage_i}',
+                              f'layers.{stage_i - 1}.1.0.pos_embed')
+            if stage_i == 4 and v.size(1) == 50:  # 1 (cls token) + 7 * 7
+                new_v = v[:, 1:, :]  # remove cls token
+            else:
+                new_v = v
+        elif k.startswith('patch_embed'):
+            stage_i = int(k.split('.')[0].replace('patch_embed', ''))
+            new_k = k.replace(f'patch_embed{stage_i}',
+                              f'layers.{stage_i - 1}.0')
+            new_v = v
+            if 'proj.' in new_k:
+                new_k = new_k.replace('proj.', 'projection.')
+        elif k.startswith('block'):
+            stage_i = int(k.split('.')[0].replace('block', ''))
+            layer_i = int(k.split('.')[1])
+            new_layer_i = layer_i + use_abs_pos_embed
+            new_k = k.replace(f'block{stage_i}.{layer_i}',
+                              f'layers.{stage_i - 1}.1.{new_layer_i}')
+            new_v = v
+            if 'attn.q.' in new_k:
+                sub_item_k = k.replace('q.', 'kv.')
+                new_k = new_k.replace('q.', 'attn.in_proj_')
+                new_v = torch.cat([v, ckpt[sub_item_k]], dim=0)
+            elif 'attn.kv.' in new_k:
+                continue
+            elif 'attn.proj.' in new_k:
+                new_k = new_k.replace('proj.', 'attn.out_proj.')
+            elif 'attn.sr.' in new_k:
+                new_k = new_k.replace('sr.', 'sr.')
+            elif 'mlp.' in new_k:
+                string = f'{new_k}-'
+                new_k = new_k.replace('mlp.', 'ffn.layers.')
+                if 'fc1.weight' in new_k or 'fc2.weight' in new_k:
+                    new_v = v.reshape((*v.shape, 1, 1))
+                new_k = new_k.replace('fc1.', '0.')
+                new_k = new_k.replace('dwconv.dwconv.', '1.')
+                if use_conv_ffn:
+                    new_k = new_k.replace('fc2.', '4.')
+                else:
+                    new_k = new_k.replace('fc2.', '3.')
+                string += f'{new_k} {v.shape}-{new_v.shape}'
+        elif k.startswith('norm'):
+            stage_i = int(k[4])
+            new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i - 1}.2')
+            new_v = v
+        else:
+            new_k = k
+            new_v = v
+        new_ckpt[new_k] = new_v
+
+    return new_ckpt
diff --git a/mmde/mmdet/models/backbones/regnet.py b/mmde/mmdet/models/backbones/regnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..55d3ce075f0cec68de4537a71ed569151d684562
--- /dev/null
+++ b/mmde/mmdet/models/backbones/regnet.py
@@ -0,0 +1,356 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import numpy as np
+import torch.nn as nn
+from mmcv.cnn import build_conv_layer, build_norm_layer
+
+from mmdet.registry import MODELS
+from .resnet import ResNet
+from .resnext import Bottleneck
+
+
+@MODELS.register_module()
+class RegNet(ResNet):
+    """RegNet backbone.
+
+    More details can be found in `paper <https://arxiv.org/abs/2003.13678>`_ .
+
+    Args:
+        arch (dict): The parameter of RegNets.
+
+            - w0 (int): initial width
+            - wa (float): slope of width
+            - wm (float): quantization parameter to quantize the width
+            - depth (int): depth of the backbone
+            - group_w (int): width of group
+            - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        base_channels (int): Base channels after stem layer.
+        in_channels (int): Number of input image channels. Default: 3.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+            not freezing any parameters.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Example:
+        >>> from mmdet.models import RegNet
+        >>> import torch
+        >>> self = RegNet(
+                arch=dict(
+                    w0=88,
+                    wa=26.31,
+                    wm=2.25,
+                    group_w=48,
+                    depth=25,
+                    bot_mul=1.0))
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 96, 8, 8)
+        (1, 192, 4, 4)
+        (1, 432, 2, 2)
+        (1, 1008, 1, 1)
+    """
+    arch_settings = {
+        'regnetx_400mf':
+        dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
+        'regnetx_800mf':
+        dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0),
+        'regnetx_1.6gf':
+        dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0),
+        'regnetx_3.2gf':
+        dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0),
+        'regnetx_4.0gf':
+        dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0),
+        'regnetx_6.4gf':
+        dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0),
+        'regnetx_8.0gf':
+        dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0),
+        'regnetx_12gf':
+        dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0),
+    }
+
+    def __init__(self,
+                 arch,
+                 in_channels=3,
+                 stem_channels=32,
+                 base_channels=32,
+                 strides=(2, 2, 2, 2),
+                 dilations=(1, 1, 1, 1),
+                 out_indices=(0, 1, 2, 3),
+                 style='pytorch',
+                 deep_stem=False,
+                 avg_down=False,
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=True,
+                 dcn=None,
+                 stage_with_dcn=(False, False, False, False),
+                 plugins=None,
+                 with_cp=False,
+                 zero_init_residual=True,
+                 pretrained=None,
+                 init_cfg=None):
+        super(ResNet, self).__init__(init_cfg)
+
+        # Generate RegNet parameters first
+        if isinstance(arch, str):
+            assert arch in self.arch_settings, \
+                f'"arch": "{arch}" is not one of the' \
+                ' arch_settings'
+            arch = self.arch_settings[arch]
+        elif not isinstance(arch, dict):
+            raise ValueError('Expect "arch" to be either a string '
+                             f'or a dict, got {type(arch)}')
+
+        widths, num_stages = self.generate_regnet(
+            arch['w0'],
+            arch['wa'],
+            arch['wm'],
+            arch['depth'],
+        )
+        # Convert to per stage format
+        stage_widths, stage_blocks = self.get_stages_from_blocks(widths)
+        # Generate group widths and bot muls
+        group_widths = [arch['group_w'] for _ in range(num_stages)]
+        self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)]
+        # Adjust the compatibility of stage_widths and group_widths
+        stage_widths, group_widths = self.adjust_width_group(
+            stage_widths, self.bottleneck_ratio, group_widths)
+
+        # Group params by stage
+        self.stage_widths = stage_widths
+        self.group_widths = group_widths
+        self.depth = sum(stage_blocks)
+        self.stem_channels = stem_channels
+        self.base_channels = base_channels
+        self.num_stages = num_stages
+        assert num_stages >= 1 and num_stages <= 4
+        self.strides = strides
+        self.dilations = dilations
+        assert len(strides) == len(dilations) == num_stages
+        self.out_indices = out_indices
+        assert max(out_indices) < num_stages
+        self.style = style
+        self.deep_stem = deep_stem
+        self.avg_down = avg_down
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.with_cp = with_cp
+        self.norm_eval = norm_eval
+        self.dcn = dcn
+        self.stage_with_dcn = stage_with_dcn
+        if dcn is not None:
+            assert len(stage_with_dcn) == num_stages
+        self.plugins = plugins
+        self.zero_init_residual = zero_init_residual
+        self.block = Bottleneck
+        expansion_bak = self.block.expansion
+        self.block.expansion = 1
+        self.stage_blocks = stage_blocks[:num_stages]
+
+        self._make_stem_layer(in_channels, stem_channels)
+
+        block_init_cfg = None
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+                if self.zero_init_residual:
+                    block_init_cfg = dict(
+                        type='Constant', val=0, override=dict(name='norm3'))
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.inplanes = stem_channels
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = self.strides[i]
+            dilation = self.dilations[i]
+            group_width = self.group_widths[i]
+            width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i]))
+            stage_groups = width // group_width
+
+            dcn = self.dcn if self.stage_with_dcn[i] else None
+            if self.plugins is not None:
+                stage_plugins = self.make_stage_plugins(self.plugins, i)
+            else:
+                stage_plugins = None
+
+            res_layer = self.make_res_layer(
+                block=self.block,
+                inplanes=self.inplanes,
+                planes=self.stage_widths[i],
+                num_blocks=num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                avg_down=self.avg_down,
+                with_cp=self.with_cp,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                dcn=dcn,
+                plugins=stage_plugins,
+                groups=stage_groups,
+                base_width=group_width,
+                base_channels=self.stage_widths[i],
+                init_cfg=block_init_cfg)
+            self.inplanes = self.stage_widths[i]
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+        self.feat_dim = stage_widths[-1]
+        self.block.expansion = expansion_bak
+
+    def _make_stem_layer(self, in_channels, base_channels):
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            in_channels,
+            base_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False)
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, base_channels, postfix=1)
+        self.add_module(self.norm1_name, norm1)
+        self.relu = nn.ReLU(inplace=True)
+
+    def generate_regnet(self,
+                        initial_width,
+                        width_slope,
+                        width_parameter,
+                        depth,
+                        divisor=8):
+        """Generates per block width from RegNet parameters.
+
+        Args:
+            initial_width ([int]): Initial width of the backbone
+            width_slope ([float]): Slope of the quantized linear function
+            width_parameter ([int]): Parameter used to quantize the width.
+            depth ([int]): Depth of the backbone.
+            divisor (int, optional): The divisor of channels. Defaults to 8.
+
+        Returns:
+            list, int: return a list of widths of each stage and the number \
+                of stages
+        """
+        assert width_slope >= 0
+        assert initial_width > 0
+        assert width_parameter > 1
+        assert initial_width % divisor == 0
+        widths_cont = np.arange(depth) * width_slope + initial_width
+        ks = np.round(
+            np.log(widths_cont / initial_width) / np.log(width_parameter))
+        widths = initial_width * np.power(width_parameter, ks)
+        widths = np.round(np.divide(widths, divisor)) * divisor
+        num_stages = len(np.unique(widths))
+        widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist()
+        return widths, num_stages
+
+    @staticmethod
+    def quantize_float(number, divisor):
+        """Converts a float to closest non-zero int divisible by divisor.
+
+        Args:
+            number (int): Original number to be quantized.
+            divisor (int): Divisor used to quantize the number.
+
+        Returns:
+            int: quantized number that is divisible by devisor.
+        """
+        return int(round(number / divisor) * divisor)
+
+    def adjust_width_group(self, widths, bottleneck_ratio, groups):
+        """Adjusts the compatibility of widths and groups.
+
+        Args:
+            widths (list[int]): Width of each stage.
+            bottleneck_ratio (float): Bottleneck ratio.
+            groups (int): number of groups in each stage
+
+        Returns:
+            tuple(list): The adjusted widths and groups of each stage.
+        """
+        bottleneck_width = [
+            int(w * b) for w, b in zip(widths, bottleneck_ratio)
+        ]
+        groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)]
+        bottleneck_width = [
+            self.quantize_float(w_bot, g)
+            for w_bot, g in zip(bottleneck_width, groups)
+        ]
+        widths = [
+            int(w_bot / b)
+            for w_bot, b in zip(bottleneck_width, bottleneck_ratio)
+        ]
+        return widths, groups
+
+    def get_stages_from_blocks(self, widths):
+        """Gets widths/stage_blocks of network at each stage.
+
+        Args:
+            widths (list[int]): Width in each stage.
+
+        Returns:
+            tuple(list): width and depth of each stage
+        """
+        width_diff = [
+            width != width_prev
+            for width, width_prev in zip(widths + [0], [0] + widths)
+        ]
+        stage_widths = [
+            width for width, diff in zip(widths, width_diff[:-1]) if diff
+        ]
+        stage_blocks = np.diff([
+            depth for depth, diff in zip(range(len(width_diff)), width_diff)
+            if diff
+        ]).tolist()
+        return stage_widths, stage_blocks
+
+    def forward(self, x):
+        """Forward function."""
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu(x)
+
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
diff --git a/mmde/mmdet/models/backbones/res2net.py b/mmde/mmdet/models/backbones/res2net.py
new file mode 100644
index 0000000000000000000000000000000000000000..958fc88465c6769cb4c50907c92335331e8b7834
--- /dev/null
+++ b/mmde/mmdet/models/backbones/res2net.py
@@ -0,0 +1,327 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.model import Sequential
+
+from mmdet.registry import MODELS
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNet
+
+
+class Bottle2neck(_Bottleneck):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 scales=4,
+                 base_width=26,
+                 base_channels=64,
+                 stage_type='normal',
+                 **kwargs):
+        """Bottle2neck block for Res2Net.
+
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+        it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottle2neck, self).__init__(inplanes, planes, **kwargs)
+        assert scales > 1, 'Res2Net degenerates to ResNet when scales = 1.'
+        width = int(math.floor(self.planes * (base_width / base_channels)))
+
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, width * scales, postfix=1)
+        self.norm3_name, norm3 = build_norm_layer(
+            self.norm_cfg, self.planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            self.inplanes,
+            width * scales,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+
+        if stage_type == 'stage' and self.conv2_stride != 1:
+            self.pool = nn.AvgPool2d(
+                kernel_size=3, stride=self.conv2_stride, padding=1)
+        convs = []
+        bns = []
+
+        fallback_on_stride = False
+        if self.with_dcn:
+            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)
+        if not self.with_dcn or fallback_on_stride:
+            for i in range(scales - 1):
+                convs.append(
+                    build_conv_layer(
+                        self.conv_cfg,
+                        width,
+                        width,
+                        kernel_size=3,
+                        stride=self.conv2_stride,
+                        padding=self.dilation,
+                        dilation=self.dilation,
+                        bias=False))
+                bns.append(
+                    build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1])
+            self.convs = nn.ModuleList(convs)
+            self.bns = nn.ModuleList(bns)
+        else:
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            for i in range(scales - 1):
+                convs.append(
+                    build_conv_layer(
+                        self.dcn,
+                        width,
+                        width,
+                        kernel_size=3,
+                        stride=self.conv2_stride,
+                        padding=self.dilation,
+                        dilation=self.dilation,
+                        bias=False))
+                bns.append(
+                    build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1])
+            self.convs = nn.ModuleList(convs)
+            self.bns = nn.ModuleList(bns)
+
+        self.conv3 = build_conv_layer(
+            self.conv_cfg,
+            width * scales,
+            self.planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+        self.stage_type = stage_type
+        self.scales = scales
+        self.width = width
+        delattr(self, 'conv2')
+        delattr(self, self.norm2_name)
+
+    def forward(self, x):
+        """Forward function."""
+
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            spx = torch.split(out, self.width, 1)
+            sp = self.convs[0](spx[0].contiguous())
+            sp = self.relu(self.bns[0](sp))
+            out = sp
+            for i in range(1, self.scales - 1):
+                if self.stage_type == 'stage':
+                    sp = spx[i]
+                else:
+                    sp = sp + spx[i]
+                sp = self.convs[i](sp.contiguous())
+                sp = self.relu(self.bns[i](sp))
+                out = torch.cat((out, sp), 1)
+
+            if self.stage_type == 'normal' or self.conv2_stride == 1:
+                out = torch.cat((out, spx[self.scales - 1]), 1)
+            elif self.stage_type == 'stage':
+                out = torch.cat((out, self.pool(spx[self.scales - 1])), 1)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+class Res2Layer(Sequential):
+    """Res2Layer to build Res2Net style backbone.
+
+    Args:
+        block (nn.Module): block used to build ResLayer.
+        inplanes (int): inplanes of block.
+        planes (int): planes of block.
+        num_blocks (int): number of blocks.
+        stride (int): stride of the first block. Default: 1
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottle2neck. Default: False
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: dict(type='BN')
+        scales (int): Scales used in Res2Net. Default: 4
+        base_width (int): Basic width of each scale. Default: 26
+    """
+
+    def __init__(self,
+                 block,
+                 inplanes,
+                 planes,
+                 num_blocks,
+                 stride=1,
+                 avg_down=True,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 scales=4,
+                 base_width=26,
+                 **kwargs):
+        self.block = block
+
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.AvgPool2d(
+                    kernel_size=stride,
+                    stride=stride,
+                    ceil_mode=True,
+                    count_include_pad=False),
+                build_conv_layer(
+                    conv_cfg,
+                    inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=1,
+                    bias=False),
+                build_norm_layer(norm_cfg, planes * block.expansion)[1],
+            )
+
+        layers = []
+        layers.append(
+            block(
+                inplanes=inplanes,
+                planes=planes,
+                stride=stride,
+                downsample=downsample,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                scales=scales,
+                base_width=base_width,
+                stage_type='stage',
+                **kwargs))
+        inplanes = planes * block.expansion
+        for i in range(1, num_blocks):
+            layers.append(
+                block(
+                    inplanes=inplanes,
+                    planes=planes,
+                    stride=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    scales=scales,
+                    base_width=base_width,
+                    **kwargs))
+        super(Res2Layer, self).__init__(*layers)
+
+
+@MODELS.register_module()
+class Res2Net(ResNet):
+    """Res2Net backbone.
+
+    Args:
+        scales (int): Scales used in Res2Net. Default: 4
+        base_width (int): Basic width of each scale. Default: 26
+        depth (int): Depth of res2net, from {50, 101, 152}.
+        in_channels (int): Number of input image channels. Default: 3.
+        num_stages (int): Res2net stages. Default: 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottle2neck.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        plugins (list[dict]): List of plugins for stages, each dict contains:
+
+            - cfg (dict, required): Cfg dict to build plugin.
+            - position (str, required): Position inside block to insert
+              plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'.
+            - stages (tuple[bool], optional): Stages to apply plugin, length
+              should be same as 'num_stages'.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Example:
+        >>> from mmdet.models import Res2Net
+        >>> import torch
+        >>> self = Res2Net(depth=50, scales=4, base_width=26)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 8, 8)
+        (1, 512, 4, 4)
+        (1, 1024, 2, 2)
+        (1, 2048, 1, 1)
+    """
+
+    arch_settings = {
+        50: (Bottle2neck, (3, 4, 6, 3)),
+        101: (Bottle2neck, (3, 4, 23, 3)),
+        152: (Bottle2neck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 scales=4,
+                 base_width=26,
+                 style='pytorch',
+                 deep_stem=True,
+                 avg_down=True,
+                 pretrained=None,
+                 init_cfg=None,
+                 **kwargs):
+        self.scales = scales
+        self.base_width = base_width
+        super(Res2Net, self).__init__(
+            style='pytorch',
+            deep_stem=True,
+            avg_down=True,
+            pretrained=pretrained,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def make_res_layer(self, **kwargs):
+        return Res2Layer(
+            scales=self.scales,
+            base_width=self.base_width,
+            base_channels=self.base_channels,
+            **kwargs)
diff --git a/mmde/mmdet/models/backbones/resnest.py b/mmde/mmdet/models/backbones/resnest.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4466c4cc416237bee1f870b52e3c20a849c5a60
--- /dev/null
+++ b/mmde/mmdet/models/backbones/resnest.py
@@ -0,0 +1,322 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from ..layers import ResLayer
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNetV1d
+
+
+class RSoftmax(nn.Module):
+    """Radix Softmax module in ``SplitAttentionConv2d``.
+
+    Args:
+        radix (int): Radix of input.
+        groups (int): Groups of input.
+    """
+
+    def __init__(self, radix, groups):
+        super().__init__()
+        self.radix = radix
+        self.groups = groups
+
+    def forward(self, x):
+        batch = x.size(0)
+        if self.radix > 1:
+            x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2)
+            x = F.softmax(x, dim=1)
+            x = x.reshape(batch, -1)
+        else:
+            x = torch.sigmoid(x)
+        return x
+
+
+class SplitAttentionConv2d(BaseModule):
+    """Split-Attention Conv2d in ResNeSt.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        channels (int): Number of intermediate channels.
+        kernel_size (int | tuple[int]): Size of the convolution kernel.
+        stride (int | tuple[int]): Stride of the convolution.
+        padding (int | tuple[int]): Zero-padding added to both sides of
+        dilation (int | tuple[int]): Spacing between kernel elements.
+        groups (int): Number of blocked connections from input channels to
+            output channels.
+        groups (int): Same as nn.Conv2d.
+        radix (int): Radix of SpltAtConv2d. Default: 2
+        reduction_factor (int): Reduction factor of inter_channels. Default: 4.
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        dcn (dict): Config dict for DCN. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 radix=2,
+                 reduction_factor=4,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 init_cfg=None):
+        super(SplitAttentionConv2d, self).__init__(init_cfg)
+        inter_channels = max(in_channels * radix // reduction_factor, 32)
+        self.radix = radix
+        self.groups = groups
+        self.channels = channels
+        self.with_dcn = dcn is not None
+        self.dcn = dcn
+        fallback_on_stride = False
+        if self.with_dcn:
+            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)
+        if self.with_dcn and not fallback_on_stride:
+            assert conv_cfg is None, 'conv_cfg must be None for DCN'
+            conv_cfg = dcn
+        self.conv = build_conv_layer(
+            conv_cfg,
+            in_channels,
+            channels * radix,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups * radix,
+            bias=False)
+        # To be consistent with original implementation, starting from 0
+        self.norm0_name, norm0 = build_norm_layer(
+            norm_cfg, channels * radix, postfix=0)
+        self.add_module(self.norm0_name, norm0)
+        self.relu = nn.ReLU(inplace=True)
+        self.fc1 = build_conv_layer(
+            None, channels, inter_channels, 1, groups=self.groups)
+        self.norm1_name, norm1 = build_norm_layer(
+            norm_cfg, inter_channels, postfix=1)
+        self.add_module(self.norm1_name, norm1)
+        self.fc2 = build_conv_layer(
+            None, inter_channels, channels * radix, 1, groups=self.groups)
+        self.rsoftmax = RSoftmax(radix, groups)
+
+    @property
+    def norm0(self):
+        """nn.Module: the normalization layer named "norm0" """
+        return getattr(self, self.norm0_name)
+
+    @property
+    def norm1(self):
+        """nn.Module: the normalization layer named "norm1" """
+        return getattr(self, self.norm1_name)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm0(x)
+        x = self.relu(x)
+
+        batch, rchannel = x.shape[:2]
+        batch = x.size(0)
+        if self.radix > 1:
+            splits = x.view(batch, self.radix, -1, *x.shape[2:])
+            gap = splits.sum(dim=1)
+        else:
+            gap = x
+        gap = F.adaptive_avg_pool2d(gap, 1)
+        gap = self.fc1(gap)
+
+        gap = self.norm1(gap)
+        gap = self.relu(gap)
+
+        atten = self.fc2(gap)
+        atten = self.rsoftmax(atten).view(batch, -1, 1, 1)
+
+        if self.radix > 1:
+            attens = atten.view(batch, self.radix, -1, *atten.shape[2:])
+            out = torch.sum(attens * splits, dim=1)
+        else:
+            out = atten * x
+        return out.contiguous()
+
+
+class Bottleneck(_Bottleneck):
+    """Bottleneck block for ResNeSt.
+
+    Args:
+        inplane (int): Input planes of this block.
+        planes (int): Middle planes of this block.
+        groups (int): Groups of conv2.
+        base_width (int): Base of width in terms of base channels. Default: 4.
+        base_channels (int): Base of channels for calculating width.
+            Default: 64.
+        radix (int): Radix of SpltAtConv2d. Default: 2
+        reduction_factor (int): Reduction factor of inter_channels in
+            SplitAttentionConv2d. Default: 4.
+        avg_down_stride (bool): Whether to use average pool for stride in
+            Bottleneck. Default: True.
+        kwargs (dict): Key word arguments for base class.
+    """
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 groups=1,
+                 base_width=4,
+                 base_channels=64,
+                 radix=2,
+                 reduction_factor=4,
+                 avg_down_stride=True,
+                 **kwargs):
+        """Bottleneck block for ResNeSt."""
+        super(Bottleneck, self).__init__(inplanes, planes, **kwargs)
+
+        if groups == 1:
+            width = self.planes
+        else:
+            width = math.floor(self.planes *
+                               (base_width / base_channels)) * groups
+
+        self.avg_down_stride = avg_down_stride and self.conv2_stride > 1
+
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, width, postfix=1)
+        self.norm3_name, norm3 = build_norm_layer(
+            self.norm_cfg, self.planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            self.inplanes,
+            width,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        self.with_modulated_dcn = False
+        self.conv2 = SplitAttentionConv2d(
+            width,
+            width,
+            kernel_size=3,
+            stride=1 if self.avg_down_stride else self.conv2_stride,
+            padding=self.dilation,
+            dilation=self.dilation,
+            groups=groups,
+            radix=radix,
+            reduction_factor=reduction_factor,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            dcn=self.dcn)
+        delattr(self, self.norm2_name)
+
+        if self.avg_down_stride:
+            self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1)
+
+        self.conv3 = build_conv_layer(
+            self.conv_cfg,
+            width,
+            self.planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+
+            if self.avg_down_stride:
+                out = self.avd_layer(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+@MODELS.register_module()
+class ResNeSt(ResNetV1d):
+    """ResNeSt backbone.
+
+    Args:
+        groups (int): Number of groups of Bottleneck. Default: 1
+        base_width (int): Base width of Bottleneck. Default: 4
+        radix (int): Radix of SplitAttentionConv2d. Default: 2
+        reduction_factor (int): Reduction factor of inter_channels in
+            SplitAttentionConv2d. Default: 4.
+        avg_down_stride (bool): Whether to use average pool for stride in
+            Bottleneck. Default: True.
+        kwargs (dict): Keyword arguments for ResNet.
+    """
+
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3)),
+        200: (Bottleneck, (3, 24, 36, 3))
+    }
+
+    def __init__(self,
+                 groups=1,
+                 base_width=4,
+                 radix=2,
+                 reduction_factor=4,
+                 avg_down_stride=True,
+                 **kwargs):
+        self.groups = groups
+        self.base_width = base_width
+        self.radix = radix
+        self.reduction_factor = reduction_factor
+        self.avg_down_stride = avg_down_stride
+        super(ResNeSt, self).__init__(**kwargs)
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer``."""
+        return ResLayer(
+            groups=self.groups,
+            base_width=self.base_width,
+            base_channels=self.base_channels,
+            radix=self.radix,
+            reduction_factor=self.reduction_factor,
+            avg_down_stride=self.avg_down_stride,
+            **kwargs)
diff --git a/mmde/mmdet/models/backbones/resnet.py b/mmde/mmdet/models/backbones/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6f48f94f286e3c5e3179f752a7b36ea77c0d45
--- /dev/null
+++ b/mmde/mmdet/models/backbones/resnet.py
@@ -0,0 +1,672 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer
+from mmengine.model import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet.registry import MODELS
+from ..layers import ResLayer
+
+
+class BasicBlock(BaseModule):
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 plugins=None,
+                 init_cfg=None):
+        super(BasicBlock, self).__init__(init_cfg)
+        assert dcn is None, 'Not implemented yet.'
+        assert plugins is None, 'Not implemented yet.'
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            3,
+            stride=stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        self.conv2 = build_conv_layer(
+            conv_cfg, planes, planes, 3, padding=1, bias=False)
+        self.add_module(self.norm2_name, norm2)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.with_cp = with_cp
+
+    @property
+    def norm1(self):
+        """nn.Module: normalization layer after the first convolution layer"""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: normalization layer after the second convolution layer"""
+        return getattr(self, self.norm2_name)
+
+    def forward(self, x):
+        """Forward function."""
+
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(BaseModule):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 plugins=None,
+                 init_cfg=None):
+        """Bottleneck block for ResNet.
+
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+        it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottleneck, self).__init__(init_cfg)
+        assert style in ['pytorch', 'caffe']
+        assert dcn is None or isinstance(dcn, dict)
+        assert plugins is None or isinstance(plugins, list)
+        if plugins is not None:
+            allowed_position = ['after_conv1', 'after_conv2', 'after_conv3']
+            assert all(p['position'] in allowed_position for p in plugins)
+
+        self.inplanes = inplanes
+        self.planes = planes
+        self.stride = stride
+        self.dilation = dilation
+        self.style = style
+        self.with_cp = with_cp
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.dcn = dcn
+        self.with_dcn = dcn is not None
+        self.plugins = plugins
+        self.with_plugins = plugins is not None
+
+        if self.with_plugins:
+            # collect plugins for conv1/conv2/conv3
+            self.after_conv1_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv1'
+            ]
+            self.after_conv2_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv2'
+            ]
+            self.after_conv3_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv3'
+            ]
+
+        if self.style == 'pytorch':
+            self.conv1_stride = 1
+            self.conv2_stride = stride
+        else:
+            self.conv1_stride = stride
+            self.conv2_stride = 1
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            norm_cfg, planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        fallback_on_stride = False
+        if self.with_dcn:
+            fallback_on_stride = dcn.pop('fallback_on_stride', False)
+        if not self.with_dcn or fallback_on_stride:
+            self.conv2 = build_conv_layer(
+                conv_cfg,
+                planes,
+                planes,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=dilation,
+                dilation=dilation,
+                bias=False)
+        else:
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            self.conv2 = build_conv_layer(
+                dcn,
+                planes,
+                planes,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=dilation,
+                dilation=dilation,
+                bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.conv3 = build_conv_layer(
+            conv_cfg,
+            planes,
+            planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+
+        if self.with_plugins:
+            self.after_conv1_plugin_names = self.make_block_plugins(
+                planes, self.after_conv1_plugins)
+            self.after_conv2_plugin_names = self.make_block_plugins(
+                planes, self.after_conv2_plugins)
+            self.after_conv3_plugin_names = self.make_block_plugins(
+                planes * self.expansion, self.after_conv3_plugins)
+
+    def make_block_plugins(self, in_channels, plugins):
+        """make plugins for block.
+
+        Args:
+            in_channels (int): Input channels of plugin.
+            plugins (list[dict]): List of plugins cfg to build.
+
+        Returns:
+            list[str]: List of the names of plugin.
+        """
+        assert isinstance(plugins, list)
+        plugin_names = []
+        for plugin in plugins:
+            plugin = plugin.copy()
+            name, layer = build_plugin_layer(
+                plugin,
+                in_channels=in_channels,
+                postfix=plugin.pop('postfix', ''))
+            assert not hasattr(self, name), f'duplicate plugin {name}'
+            self.add_module(name, layer)
+            plugin_names.append(name)
+        return plugin_names
+
+    def forward_plugin(self, x, plugin_names):
+        out = x
+        for name in plugin_names:
+            out = getattr(self, name)(out)
+        return out
+
+    @property
+    def norm1(self):
+        """nn.Module: normalization layer after the first convolution layer"""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: normalization layer after the second convolution layer"""
+        return getattr(self, self.norm2_name)
+
+    @property
+    def norm3(self):
+        """nn.Module: normalization layer after the third convolution layer"""
+        return getattr(self, self.norm3_name)
+
+    def forward(self, x):
+        """Forward function."""
+
+        def _inner_forward(x):
+            identity = x
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+@MODELS.register_module()
+class ResNet(BaseModule):
+    """ResNet backbone.
+
+    Args:
+        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+        stem_channels (int | None): Number of stem channels. If not specified,
+            it will be the same as `base_channels`. Default: None.
+        base_channels (int): Number of base channels of res layer. Default: 64.
+        in_channels (int): Number of input image channels. Default: 3.
+        num_stages (int): Resnet stages. Default: 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottleneck.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        plugins (list[dict]): List of plugins for stages, each dict contains:
+
+            - cfg (dict, required): Cfg dict to build plugin.
+            - position (str, required): Position inside block to insert
+              plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'.
+            - stages (tuple[bool], optional): Stages to apply plugin, length
+              should be same as 'num_stages'.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Example:
+        >>> from mmdet.models import ResNet
+        >>> import torch
+        >>> self = ResNet(depth=18)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 64, 8, 8)
+        (1, 128, 4, 4)
+        (1, 256, 2, 2)
+        (1, 512, 1, 1)
+    """
+
+    arch_settings = {
+        18: (BasicBlock, (2, 2, 2, 2)),
+        34: (BasicBlock, (3, 4, 6, 3)),
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 depth,
+                 in_channels=3,
+                 stem_channels=None,
+                 base_channels=64,
+                 num_stages=4,
+                 strides=(1, 2, 2, 2),
+                 dilations=(1, 1, 1, 1),
+                 out_indices=(0, 1, 2, 3),
+                 style='pytorch',
+                 deep_stem=False,
+                 avg_down=False,
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=True,
+                 dcn=None,
+                 stage_with_dcn=(False, False, False, False),
+                 plugins=None,
+                 with_cp=False,
+                 zero_init_residual=True,
+                 pretrained=None,
+                 init_cfg=None):
+        super(ResNet, self).__init__(init_cfg)
+        self.zero_init_residual = zero_init_residual
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalid depth {depth} for resnet')
+
+        block_init_cfg = None
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+                block = self.arch_settings[depth][0]
+                if self.zero_init_residual:
+                    if block is BasicBlock:
+                        block_init_cfg = dict(
+                            type='Constant',
+                            val=0,
+                            override=dict(name='norm2'))
+                    elif block is Bottleneck:
+                        block_init_cfg = dict(
+                            type='Constant',
+                            val=0,
+                            override=dict(name='norm3'))
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.depth = depth
+        if stem_channels is None:
+            stem_channels = base_channels
+        self.stem_channels = stem_channels
+        self.base_channels = base_channels
+        self.num_stages = num_stages
+        assert num_stages >= 1 and num_stages <= 4
+        self.strides = strides
+        self.dilations = dilations
+        assert len(strides) == len(dilations) == num_stages
+        self.out_indices = out_indices
+        assert max(out_indices) < num_stages
+        self.style = style
+        self.deep_stem = deep_stem
+        self.avg_down = avg_down
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.with_cp = with_cp
+        self.norm_eval = norm_eval
+        self.dcn = dcn
+        self.stage_with_dcn = stage_with_dcn
+        if dcn is not None:
+            assert len(stage_with_dcn) == num_stages
+        self.plugins = plugins
+        self.block, stage_blocks = self.arch_settings[depth]
+        self.stage_blocks = stage_blocks[:num_stages]
+        self.inplanes = stem_channels
+
+        self._make_stem_layer(in_channels, stem_channels)
+
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = strides[i]
+            dilation = dilations[i]
+            dcn = self.dcn if self.stage_with_dcn[i] else None
+            if plugins is not None:
+                stage_plugins = self.make_stage_plugins(plugins, i)
+            else:
+                stage_plugins = None
+            planes = base_channels * 2**i
+            res_layer = self.make_res_layer(
+                block=self.block,
+                inplanes=self.inplanes,
+                planes=planes,
+                num_blocks=num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                avg_down=self.avg_down,
+                with_cp=with_cp,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                dcn=dcn,
+                plugins=stage_plugins,
+                init_cfg=block_init_cfg)
+            self.inplanes = planes * self.block.expansion
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+        self.feat_dim = self.block.expansion * base_channels * 2**(
+            len(self.stage_blocks) - 1)
+
+    def make_stage_plugins(self, plugins, stage_idx):
+        """Make plugins for ResNet ``stage_idx`` th stage.
+
+        Currently we support to insert ``context_block``,
+        ``empirical_attention_block``, ``nonlocal_block`` into the backbone
+        like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of
+        Bottleneck.
+
+        An example of plugins format could be:
+
+        Examples:
+            >>> plugins=[
+            ...     dict(cfg=dict(type='xxx', arg1='xxx'),
+            ...          stages=(False, True, True, True),
+            ...          position='after_conv2'),
+            ...     dict(cfg=dict(type='yyy'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3'),
+            ...     dict(cfg=dict(type='zzz', postfix='1'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3'),
+            ...     dict(cfg=dict(type='zzz', postfix='2'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3')
+            ... ]
+            >>> self = ResNet(depth=18)
+            >>> stage_plugins = self.make_stage_plugins(plugins, 0)
+            >>> assert len(stage_plugins) == 3
+
+        Suppose ``stage_idx=0``, the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1-> conv2->conv3->yyy->zzz1->zzz2
+
+        Suppose 'stage_idx=1', the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2
+
+        If stages is missing, the plugin would be applied to all stages.
+
+        Args:
+            plugins (list[dict]): List of plugins cfg to build. The postfix is
+                required if multiple same type plugins are inserted.
+            stage_idx (int): Index of stage to build
+
+        Returns:
+            list[dict]: Plugins for current stage
+        """
+        stage_plugins = []
+        for plugin in plugins:
+            plugin = plugin.copy()
+            stages = plugin.pop('stages', None)
+            assert stages is None or len(stages) == self.num_stages
+            # whether to insert plugin into current stage
+            if stages is None or stages[stage_idx]:
+                stage_plugins.append(plugin)
+
+        return stage_plugins
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer``."""
+        return ResLayer(**kwargs)
+
+    @property
+    def norm1(self):
+        """nn.Module: the normalization layer named "norm1" """
+        return getattr(self, self.norm1_name)
+
+    def _make_stem_layer(self, in_channels, stem_channels):
+        if self.deep_stem:
+            self.stem = nn.Sequential(
+                build_conv_layer(
+                    self.conv_cfg,
+                    in_channels,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+                nn.ReLU(inplace=True),
+                build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+                nn.ReLU(inplace=True),
+                build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels)[1],
+                nn.ReLU(inplace=True))
+        else:
+            self.conv1 = build_conv_layer(
+                self.conv_cfg,
+                in_channels,
+                stem_channels,
+                kernel_size=7,
+                stride=2,
+                padding=3,
+                bias=False)
+            self.norm1_name, norm1 = build_norm_layer(
+                self.norm_cfg, stem_channels, postfix=1)
+            self.add_module(self.norm1_name, norm1)
+            self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            if self.deep_stem:
+                self.stem.eval()
+                for param in self.stem.parameters():
+                    param.requires_grad = False
+            else:
+                self.norm1.eval()
+                for m in [self.conv1, self.norm1]:
+                    for param in m.parameters():
+                        param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'layer{i}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def forward(self, x):
+        """Forward function."""
+        if self.deep_stem:
+            x = self.stem(x)
+        else:
+            x = self.conv1(x)
+            x = self.norm1(x)
+            x = self.relu(x)
+        x = self.maxpool(x)
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep normalization layer
+        freezed."""
+        super(ResNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+
+@MODELS.register_module()
+class ResNetV1d(ResNet):
+    r"""ResNetV1d variant described in `Bag of Tricks
+    <https://arxiv.org/pdf/1812.01187.pdf>`_.
+
+    Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in
+    the input stem with three 3x3 convs. And in the downsampling block, a 2x2
+    avg_pool with stride 2 is added before conv, whose stride is changed to 1.
+    """
+
+    def __init__(self, **kwargs):
+        super(ResNetV1d, self).__init__(
+            deep_stem=True, avg_down=True, **kwargs)
diff --git a/mmde/mmdet/models/backbones/resnext.py b/mmde/mmdet/models/backbones/resnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..df3d79e046c3ab9b289bcfeb6f937c87f6c09bfa
--- /dev/null
+++ b/mmde/mmdet/models/backbones/resnext.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+from mmcv.cnn import build_conv_layer, build_norm_layer
+
+from mmdet.registry import MODELS
+from ..layers import ResLayer
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNet
+
+
+class Bottleneck(_Bottleneck):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 groups=1,
+                 base_width=4,
+                 base_channels=64,
+                 **kwargs):
+        """Bottleneck block for ResNeXt.
+
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+        it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottleneck, self).__init__(inplanes, planes, **kwargs)
+
+        if groups == 1:
+            width = self.planes
+        else:
+            width = math.floor(self.planes *
+                               (base_width / base_channels)) * groups
+
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, width, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(
+            self.norm_cfg, width, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            self.norm_cfg, self.planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            self.inplanes,
+            width,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        fallback_on_stride = False
+        self.with_modulated_dcn = False
+        if self.with_dcn:
+            fallback_on_stride = self.dcn.pop('fallback_on_stride', False)
+        if not self.with_dcn or fallback_on_stride:
+            self.conv2 = build_conv_layer(
+                self.conv_cfg,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+        else:
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            self.conv2 = build_conv_layer(
+                self.dcn,
+                width,
+                width,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=self.dilation,
+                dilation=self.dilation,
+                groups=groups,
+                bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.conv3 = build_conv_layer(
+            self.conv_cfg,
+            width,
+            self.planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+        if self.with_plugins:
+            self._del_block_plugins(self.after_conv1_plugin_names +
+                                    self.after_conv2_plugin_names +
+                                    self.after_conv3_plugin_names)
+            self.after_conv1_plugin_names = self.make_block_plugins(
+                width, self.after_conv1_plugins)
+            self.after_conv2_plugin_names = self.make_block_plugins(
+                width, self.after_conv2_plugins)
+            self.after_conv3_plugin_names = self.make_block_plugins(
+                self.planes * self.expansion, self.after_conv3_plugins)
+
+    def _del_block_plugins(self, plugin_names):
+        """delete plugins for block if exist.
+
+        Args:
+            plugin_names (list[str]): List of plugins name to delete.
+        """
+        assert isinstance(plugin_names, list)
+        for plugin_name in plugin_names:
+            del self._modules[plugin_name]
+
+
+@MODELS.register_module()
+class ResNeXt(ResNet):
+    """ResNeXt backbone.
+
+    Args:
+        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+        in_channels (int): Number of input image channels. Default: 3.
+        num_stages (int): Resnet stages. Default: 4.
+        groups (int): Group of resnext.
+        base_width (int): Base width of resnext.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+            not freezing any parameters.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+    """
+
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self, groups=1, base_width=4, **kwargs):
+        self.groups = groups
+        self.base_width = base_width
+        super(ResNeXt, self).__init__(**kwargs)
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer``"""
+        return ResLayer(
+            groups=self.groups,
+            base_width=self.base_width,
+            base_channels=self.base_channels,
+            **kwargs)
diff --git a/mmde/mmdet/models/backbones/ssd_vgg.py b/mmde/mmdet/models/backbones/ssd_vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..843e82e2722f93b9b2abb5180c827c8f2a430b48
--- /dev/null
+++ b/mmde/mmdet/models/backbones/ssd_vgg.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmcv.cnn import VGG
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from ..necks import ssd_neck
+
+
+@MODELS.register_module()
+class SSDVGG(VGG, BaseModule):
+    """VGG Backbone network for single-shot-detection.
+
+    Args:
+        depth (int): Depth of vgg, from {11, 13, 16, 19}.
+        with_last_pool (bool): Whether to add a pooling layer at the last
+            of the model
+        ceil_mode (bool): When True, will use `ceil` instead of `floor`
+            to compute the output shape.
+        out_indices (Sequence[int]): Output from which stages.
+        out_feature_indices (Sequence[int]): Output from which feature map.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+        input_size (int, optional): Deprecated argumment.
+            Width and height of input, from {300, 512}.
+        l2_norm_scale (float, optional) : Deprecated argumment.
+            L2 normalization layer init scale.
+
+    Example:
+        >>> self = SSDVGG(input_size=300, depth=11)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 300, 300)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 1024, 19, 19)
+        (1, 512, 10, 10)
+        (1, 256, 5, 5)
+        (1, 256, 3, 3)
+        (1, 256, 1, 1)
+    """
+    extra_setting = {
+        300: (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256),
+        512: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128),
+    }
+
+    def __init__(self,
+                 depth,
+                 with_last_pool=False,
+                 ceil_mode=True,
+                 out_indices=(3, 4),
+                 out_feature_indices=(22, 34),
+                 pretrained=None,
+                 init_cfg=None,
+                 input_size=None,
+                 l2_norm_scale=None):
+        # TODO: in_channels for mmcv.VGG
+        super(SSDVGG, self).__init__(
+            depth,
+            with_last_pool=with_last_pool,
+            ceil_mode=ceil_mode,
+            out_indices=out_indices)
+
+        self.features.add_module(
+            str(len(self.features)),
+            nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
+        self.features.add_module(
+            str(len(self.features)),
+            nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6))
+        self.features.add_module(
+            str(len(self.features)), nn.ReLU(inplace=True))
+        self.features.add_module(
+            str(len(self.features)), nn.Conv2d(1024, 1024, kernel_size=1))
+        self.features.add_module(
+            str(len(self.features)), nn.ReLU(inplace=True))
+        self.out_feature_indices = out_feature_indices
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+
+        if init_cfg is not None:
+            self.init_cfg = init_cfg
+        elif isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            self.init_cfg = [
+                dict(type='Kaiming', layer='Conv2d'),
+                dict(type='Constant', val=1, layer='BatchNorm2d'),
+                dict(type='Normal', std=0.01, layer='Linear'),
+            ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        if input_size is not None:
+            warnings.warn('DeprecationWarning: input_size is deprecated')
+        if l2_norm_scale is not None:
+            warnings.warn('DeprecationWarning: l2_norm_scale in VGG is '
+                          'deprecated, it has been moved to SSDNeck.')
+
+    def init_weights(self, pretrained=None):
+        super(VGG, self).init_weights()
+
+    def forward(self, x):
+        """Forward function."""
+        outs = []
+        for i, layer in enumerate(self.features):
+            x = layer(x)
+            if i in self.out_feature_indices:
+                outs.append(x)
+
+        if len(outs) == 1:
+            return outs[0]
+        else:
+            return tuple(outs)
+
+
+class L2Norm(ssd_neck.L2Norm):
+
+    def __init__(self, **kwargs):
+        super(L2Norm, self).__init__(**kwargs)
+        warnings.warn('DeprecationWarning: L2Norm in ssd_vgg.py '
+                      'is deprecated, please use L2Norm in '
+                      'mmdet/models/necks/ssd_neck.py instead')
diff --git a/mmde/mmdet/models/backbones/swin.py b/mmde/mmdet/models/backbones/swin.py
new file mode 100644
index 0000000000000000000000000000000000000000..062190fa077d7b01e0c1db76bea0cfb5dc7b6620
--- /dev/null
+++ b/mmde/mmdet/models/backbones/swin.py
@@ -0,0 +1,819 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from collections import OrderedDict
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN, build_dropout
+from mmengine.logging import MMLogger
+from mmengine.model import BaseModule, ModuleList
+from mmengine.model.weight_init import (constant_init, trunc_normal_,
+                                        trunc_normal_init)
+from mmengine.runner.checkpoint import CheckpointLoader
+from mmengine.utils import to_2tuple
+
+from mmdet.registry import MODELS
+from ..layers import PatchEmbed, PatchMerging
+
+
+class WindowMSA(BaseModule):
+    """Window based multi-head self-attention (W-MSA) module with relative
+    position bias.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (tuple[int]): The height and width of the window.
+        qkv_bias (bool, optional):  If True, add a learnable bias to q, k, v.
+            Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        attn_drop_rate (float, optional): Dropout ratio of attention weight.
+            Default: 0.0
+        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.
+        init_cfg (dict | None, optional): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 window_size,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop_rate=0.,
+                 proj_drop_rate=0.,
+                 init_cfg=None):
+
+        super().__init__()
+        self.embed_dims = embed_dims
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_embed_dims = embed_dims // num_heads
+        self.scale = qk_scale or head_embed_dims**-0.5
+        self.init_cfg = init_cfg
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
+                        num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+
+        # About 2x faster than original impl
+        Wh, Ww = self.window_size
+        rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww)
+        rel_position_index = rel_index_coords + rel_index_coords.T
+        rel_position_index = rel_position_index.flip(1).contiguous()
+        self.register_buffer('relative_position_index', rel_position_index)
+
+        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop_rate)
+        self.proj = nn.Linear(embed_dims, embed_dims)
+        self.proj_drop = nn.Dropout(proj_drop_rate)
+
+        self.softmax = nn.Softmax(dim=-1)
+
+    def init_weights(self):
+        trunc_normal_(self.relative_position_bias_table, std=0.02)
+
+    def forward(self, x, mask=None):
+        """
+        Args:
+
+            x (tensor): input features with shape of (num_windows*B, N, C)
+            mask (tensor | None, Optional): mask with shape of (num_windows,
+                Wh*Ww, Wh*Ww), value should be between (-inf, 0].
+        """
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
+        # make torchscript happy (cannot use tensor as tuple)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.view(-1)].view(
+                self.window_size[0] * self.window_size[1],
+                self.window_size[0] * self.window_size[1],
+                -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(
+            2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B // nW, nW, self.num_heads, N,
+                             N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+        attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+    @staticmethod
+    def double_step_seq(step1, len1, step2, len2):
+        seq1 = torch.arange(0, step1 * len1, step1)
+        seq2 = torch.arange(0, step2 * len2, step2)
+        return (seq1[:, None] + seq2[None, :]).reshape(1, -1)
+
+
+class ShiftWindowMSA(BaseModule):
+    """Shifted Window Multihead Self-Attention Module.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): The height and width of the window.
+        shift_size (int, optional): The shift step of each window towards
+            right-bottom. If zero, act as regular window-msa. Defaults to 0.
+        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
+            Default: True
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Defaults: None.
+        attn_drop_rate (float, optional): Dropout ratio of attention weight.
+            Defaults: 0.
+        proj_drop_rate (float, optional): Dropout ratio of output.
+            Defaults: 0.
+        dropout_layer (dict, optional): The dropout_layer used before output.
+            Defaults: dict(type='DropPath', drop_prob=0.).
+        init_cfg (dict, optional): The extra config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 window_size,
+                 shift_size=0,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop_rate=0,
+                 proj_drop_rate=0,
+                 dropout_layer=dict(type='DropPath', drop_prob=0.),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+
+        self.window_size = window_size
+        self.shift_size = shift_size
+        assert 0 <= self.shift_size < self.window_size
+
+        self.w_msa = WindowMSA(
+            embed_dims=embed_dims,
+            num_heads=num_heads,
+            window_size=to_2tuple(window_size),
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop_rate=attn_drop_rate,
+            proj_drop_rate=proj_drop_rate,
+            init_cfg=None)
+
+        self.drop = build_dropout(dropout_layer)
+
+    def forward(self, query, hw_shape):
+        B, L, C = query.shape
+        H, W = hw_shape
+        assert L == H * W, 'input feature has wrong size'
+        query = query.view(B, H, W, C)
+
+        # pad feature maps to multiples of window size
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b))
+        H_pad, W_pad = query.shape[1], query.shape[2]
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_query = torch.roll(
+                query,
+                shifts=(-self.shift_size, -self.shift_size),
+                dims=(1, 2))
+
+            # calculate attention mask for SW-MSA
+            img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device)
+            h_slices = (slice(0, -self.window_size),
+                        slice(-self.window_size,
+                              -self.shift_size), slice(-self.shift_size, None))
+            w_slices = (slice(0, -self.window_size),
+                        slice(-self.window_size,
+                              -self.shift_size), slice(-self.shift_size, None))
+            cnt = 0
+            for h in h_slices:
+                for w in w_slices:
+                    img_mask[:, h, w, :] = cnt
+                    cnt += 1
+
+            # nW, window_size, window_size, 1
+            mask_windows = self.window_partition(img_mask)
+            mask_windows = mask_windows.view(
+                -1, self.window_size * self.window_size)
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            attn_mask = attn_mask.masked_fill(attn_mask != 0,
+                                              float(-100.0)).masked_fill(
+                                                  attn_mask == 0, float(0.0))
+        else:
+            shifted_query = query
+            attn_mask = None
+
+        # nW*B, window_size, window_size, C
+        query_windows = self.window_partition(shifted_query)
+        # nW*B, window_size*window_size, C
+        query_windows = query_windows.view(-1, self.window_size**2, C)
+
+        # W-MSA/SW-MSA (nW*B, window_size*window_size, C)
+        attn_windows = self.w_msa(query_windows, mask=attn_mask)
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size,
+                                         self.window_size, C)
+
+        # B H' W' C
+        shifted_x = self.window_reverse(attn_windows, H_pad, W_pad)
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                dims=(1, 2))
+        else:
+            x = shifted_x
+
+        if pad_r > 0 or pad_b:
+            x = x[:, :H, :W, :].contiguous()
+
+        x = x.view(B, H * W, C)
+
+        x = self.drop(x)
+        return x
+
+    def window_reverse(self, windows, H, W):
+        """
+        Args:
+            windows: (num_windows*B, window_size, window_size, C)
+            H (int): Height of image
+            W (int): Width of image
+        Returns:
+            x: (B, H, W, C)
+        """
+        window_size = self.window_size
+        B = int(windows.shape[0] / (H * W / window_size / window_size))
+        x = windows.view(B, H // window_size, W // window_size, window_size,
+                         window_size, -1)
+        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+        return x
+
+    def window_partition(self, x):
+        """
+        Args:
+            x: (B, H, W, C)
+        Returns:
+            windows: (num_windows*B, window_size, window_size, C)
+        """
+        B, H, W, C = x.shape
+        window_size = self.window_size
+        x = x.view(B, H // window_size, window_size, W // window_size,
+                   window_size, C)
+        windows = x.permute(0, 1, 3, 2, 4, 5).contiguous()
+        windows = windows.view(-1, window_size, window_size, C)
+        return windows
+
+
+class SwinBlock(BaseModule):
+    """"
+    Args:
+        embed_dims (int): The feature dimension.
+        num_heads (int): Parallel attention heads.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        window_size (int, optional): The local window scale. Default: 7.
+        shift (bool, optional): whether to shift window or not. Default False.
+        qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        drop_rate (float, optional): Dropout rate. Default: 0.
+        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
+        drop_path_rate (float, optional): Stochastic depth rate. Default: 0.
+        act_cfg (dict, optional): The config dict of activation function.
+            Default: dict(type='GELU').
+        norm_cfg (dict, optional): The config dict of normalization.
+            Default: dict(type='LN').
+        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
+            will save some memory while slowing down the training speed.
+            Default: False.
+        init_cfg (dict | list | None, optional): The init config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 feedforward_channels,
+                 window_size=7,
+                 shift=False,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 with_cp=False,
+                 init_cfg=None):
+
+        super(SwinBlock, self).__init__()
+
+        self.init_cfg = init_cfg
+        self.with_cp = with_cp
+
+        self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
+        self.attn = ShiftWindowMSA(
+            embed_dims=embed_dims,
+            num_heads=num_heads,
+            window_size=window_size,
+            shift_size=window_size // 2 if shift else 0,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop_rate=attn_drop_rate,
+            proj_drop_rate=drop_rate,
+            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+            init_cfg=None)
+
+        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
+        self.ffn = FFN(
+            embed_dims=embed_dims,
+            feedforward_channels=feedforward_channels,
+            num_fcs=2,
+            ffn_drop=drop_rate,
+            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+            act_cfg=act_cfg,
+            add_identity=True,
+            init_cfg=None)
+
+    def forward(self, x, hw_shape):
+
+        def _inner_forward(x):
+            identity = x
+            x = self.norm1(x)
+            x = self.attn(x, hw_shape)
+
+            x = x + identity
+
+            identity = x
+            x = self.norm2(x)
+            x = self.ffn(x, identity=identity)
+
+            return x
+
+        if self.with_cp and x.requires_grad:
+            x = cp.checkpoint(_inner_forward, x)
+        else:
+            x = _inner_forward(x)
+
+        return x
+
+
+class SwinBlockSequence(BaseModule):
+    """Implements one stage in Swin Transformer.
+
+    Args:
+        embed_dims (int): The feature dimension.
+        num_heads (int): Parallel attention heads.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        depth (int): The number of blocks in this stage.
+        window_size (int, optional): The local window scale. Default: 7.
+        qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        drop_rate (float, optional): Dropout rate. Default: 0.
+        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
+        drop_path_rate (float | list[float], optional): Stochastic depth
+            rate. Default: 0.
+        downsample (BaseModule | None, optional): The downsample operation
+            module. Default: None.
+        act_cfg (dict, optional): The config dict of activation function.
+            Default: dict(type='GELU').
+        norm_cfg (dict, optional): The config dict of normalization.
+            Default: dict(type='LN').
+        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
+            will save some memory while slowing down the training speed.
+            Default: False.
+        init_cfg (dict | list | None, optional): The init config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 feedforward_channels,
+                 depth,
+                 window_size=7,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 downsample=None,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 with_cp=False,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        if isinstance(drop_path_rate, list):
+            drop_path_rates = drop_path_rate
+            assert len(drop_path_rates) == depth
+        else:
+            drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)]
+
+        self.blocks = ModuleList()
+        for i in range(depth):
+            block = SwinBlock(
+                embed_dims=embed_dims,
+                num_heads=num_heads,
+                feedforward_channels=feedforward_channels,
+                window_size=window_size,
+                shift=False if i % 2 == 0 else True,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop_rate=drop_rate,
+                attn_drop_rate=attn_drop_rate,
+                drop_path_rate=drop_path_rates[i],
+                act_cfg=act_cfg,
+                norm_cfg=norm_cfg,
+                with_cp=with_cp,
+                init_cfg=None)
+            self.blocks.append(block)
+
+        self.downsample = downsample
+
+    def forward(self, x, hw_shape):
+        for block in self.blocks:
+            x = block(x, hw_shape)
+
+        if self.downsample:
+            x_down, down_hw_shape = self.downsample(x, hw_shape)
+            return x_down, down_hw_shape, x, hw_shape
+        else:
+            return x, hw_shape, x, hw_shape
+
+
+@MODELS.register_module()
+class SwinTransformer(BaseModule):
+    """ Swin Transformer
+    A PyTorch implement of : `Swin Transformer:
+    Hierarchical Vision Transformer using Shifted Windows`  -
+        https://arxiv.org/abs/2103.14030
+
+    Inspiration from
+    https://github.com/microsoft/Swin-Transformer
+
+    Args:
+        pretrain_img_size (int | tuple[int]): The size of input image when
+            pretrain. Defaults: 224.
+        in_channels (int): The num of input channels.
+            Defaults: 3.
+        embed_dims (int): The feature dimension. Default: 96.
+        patch_size (int | tuple[int]): Patch size. Default: 4.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (int): Ratio of mlp hidden dim to embedding dim.
+            Default: 4.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+            Default: (2, 2, 6, 2).
+        num_heads (tuple[int]): Parallel attention heads of each Swin
+            Transformer stage. Default: (3, 6, 12, 24).
+        strides (tuple[int]): The patch merging or patch embedding stride of
+            each Swin Transformer stage. (In swin, we set kernel size equal to
+            stride.) Default: (4, 2, 2, 2).
+        out_indices (tuple[int]): Output from which stages.
+            Default: (0, 1, 2, 3).
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key,
+            value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        patch_norm (bool): If add a norm layer for patch embed and patch
+            merging. Default: True.
+        drop_rate (float): Dropout rate. Defaults: 0.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Defaults: 0.1.
+        use_abs_pos_embed (bool): If True, add absolute position embedding to
+            the patch embedding. Defaults: False.
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='GELU').
+        norm_cfg (dict): Config dict for normalization layer at
+            output of backone. Defaults: dict(type='LN').
+        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
+            will save some memory while slowing down the training speed.
+            Default: False.
+        pretrained (str, optional): model pretrained path. Default: None.
+        convert_weights (bool): The flag indicates whether the
+            pre-trained model is from the original repo. We may need
+            to convert some keys to make it compatible.
+            Default: False.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            Default: -1 (-1 means not freezing any parameters).
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 in_channels=3,
+                 embed_dims=96,
+                 patch_size=4,
+                 window_size=7,
+                 mlp_ratio=4,
+                 depths=(2, 2, 6, 2),
+                 num_heads=(3, 6, 12, 24),
+                 strides=(4, 2, 2, 2),
+                 out_indices=(0, 1, 2, 3),
+                 qkv_bias=True,
+                 qk_scale=None,
+                 patch_norm=True,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.1,
+                 use_abs_pos_embed=False,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 with_cp=False,
+                 pretrained=None,
+                 convert_weights=False,
+                 frozen_stages=-1,
+                 init_cfg=None):
+        self.convert_weights = convert_weights
+        self.frozen_stages = frozen_stages
+        if isinstance(pretrain_img_size, int):
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+        elif isinstance(pretrain_img_size, tuple):
+            if len(pretrain_img_size) == 1:
+                pretrain_img_size = to_2tuple(pretrain_img_size[0])
+            assert len(pretrain_img_size) == 2, \
+                f'The size of image should have length 1 or 2, ' \
+                f'but got {len(pretrain_img_size)}'
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            self.init_cfg = init_cfg
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        super(SwinTransformer, self).__init__(init_cfg=init_cfg)
+
+        num_layers = len(depths)
+        self.out_indices = out_indices
+        self.use_abs_pos_embed = use_abs_pos_embed
+
+        assert strides[0] == patch_size, 'Use non-overlapping patch embed.'
+
+        self.patch_embed = PatchEmbed(
+            in_channels=in_channels,
+            embed_dims=embed_dims,
+            conv_type='Conv2d',
+            kernel_size=patch_size,
+            stride=strides[0],
+            norm_cfg=norm_cfg if patch_norm else None,
+            init_cfg=None)
+
+        if self.use_abs_pos_embed:
+            patch_row = pretrain_img_size[0] // patch_size
+            patch_col = pretrain_img_size[1] // patch_size
+            num_patches = patch_row * patch_col
+            self.absolute_pos_embed = nn.Parameter(
+                torch.zeros((1, num_patches, embed_dims)))
+
+        self.drop_after_pos = nn.Dropout(p=drop_rate)
+
+        # set stochastic depth decay rule
+        total_depth = sum(depths)
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, total_depth)
+        ]
+
+        self.stages = ModuleList()
+        in_channels = embed_dims
+        for i in range(num_layers):
+            if i < num_layers - 1:
+                downsample = PatchMerging(
+                    in_channels=in_channels,
+                    out_channels=2 * in_channels,
+                    stride=strides[i + 1],
+                    norm_cfg=norm_cfg if patch_norm else None,
+                    init_cfg=None)
+            else:
+                downsample = None
+
+            stage = SwinBlockSequence(
+                embed_dims=in_channels,
+                num_heads=num_heads[i],
+                feedforward_channels=mlp_ratio * in_channels,
+                depth=depths[i],
+                window_size=window_size,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop_rate=drop_rate,
+                attn_drop_rate=attn_drop_rate,
+                drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])],
+                downsample=downsample,
+                act_cfg=act_cfg,
+                norm_cfg=norm_cfg,
+                with_cp=with_cp,
+                init_cfg=None)
+            self.stages.append(stage)
+            if downsample:
+                in_channels = downsample.out_channels
+
+        self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)]
+        # Add a norm layer for each output
+        for i in out_indices:
+            layer = build_norm_layer(norm_cfg, self.num_features[i])[1]
+            layer_name = f'norm{i}'
+            self.add_module(layer_name, layer)
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep layers freezed."""
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+            if self.use_abs_pos_embed:
+                self.absolute_pos_embed.requires_grad = False
+            self.drop_after_pos.eval()
+
+        for i in range(1, self.frozen_stages + 1):
+
+            if (i - 1) in self.out_indices:
+                norm_layer = getattr(self, f'norm{i-1}')
+                norm_layer.eval()
+                for param in norm_layer.parameters():
+                    param.requires_grad = False
+
+            m = self.stages[i - 1]
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def init_weights(self):
+        logger = MMLogger.get_current_instance()
+        if self.init_cfg is None:
+            logger.warn(f'No pre-trained weights for '
+                        f'{self.__class__.__name__}, '
+                        f'training start from scratch')
+            if self.use_abs_pos_embed:
+                trunc_normal_(self.absolute_pos_embed, std=0.02)
+            for m in self.modules():
+                if isinstance(m, nn.Linear):
+                    trunc_normal_init(m, std=.02, bias=0.)
+                elif isinstance(m, nn.LayerNorm):
+                    constant_init(m, 1.0)
+        else:
+            assert 'checkpoint' in self.init_cfg, f'Only support ' \
+                                                  f'specify `Pretrained` in ' \
+                                                  f'`init_cfg` in ' \
+                                                  f'{self.__class__.__name__} '
+            ckpt = CheckpointLoader.load_checkpoint(
+                self.init_cfg.checkpoint, logger=logger, map_location='cpu')
+            if 'state_dict' in ckpt:
+                _state_dict = ckpt['state_dict']
+            elif 'model' in ckpt:
+                _state_dict = ckpt['model']
+            else:
+                _state_dict = ckpt
+            if self.convert_weights:
+                # supported loading weight from original repo,
+                _state_dict = swin_converter(_state_dict)
+
+            state_dict = OrderedDict()
+            for k, v in _state_dict.items():
+                if k.startswith('backbone.'):
+                    state_dict[k[9:]] = v
+
+            # strip prefix of state_dict
+            if list(state_dict.keys())[0].startswith('module.'):
+                state_dict = {k[7:]: v for k, v in state_dict.items()}
+
+            # reshape absolute position embedding
+            if state_dict.get('absolute_pos_embed') is not None:
+                absolute_pos_embed = state_dict['absolute_pos_embed']
+                N1, L, C1 = absolute_pos_embed.size()
+                N2, C2, H, W = self.absolute_pos_embed.size()
+                if N1 != N2 or C1 != C2 or L != H * W:
+                    logger.warning('Error in loading absolute_pos_embed, pass')
+                else:
+                    state_dict['absolute_pos_embed'] = absolute_pos_embed.view(
+                        N2, H, W, C2).permute(0, 3, 1, 2).contiguous()
+
+            # interpolate position bias table if needed
+            relative_position_bias_table_keys = [
+                k for k in state_dict.keys()
+                if 'relative_position_bias_table' in k
+            ]
+            for table_key in relative_position_bias_table_keys:
+                table_pretrained = state_dict[table_key]
+                table_current = self.state_dict()[table_key]
+                L1, nH1 = table_pretrained.size()
+                L2, nH2 = table_current.size()
+                if nH1 != nH2:
+                    logger.warning(f'Error in loading {table_key}, pass')
+                elif L1 != L2:
+                    S1 = int(L1**0.5)
+                    S2 = int(L2**0.5)
+                    table_pretrained_resized = F.interpolate(
+                        table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1),
+                        size=(S2, S2),
+                        mode='bicubic')
+                    state_dict[table_key] = table_pretrained_resized.view(
+                        nH2, L2).permute(1, 0).contiguous()
+
+            # load state_dict
+            self.load_state_dict(state_dict, False)
+
+    def forward(self, x):
+        x, hw_shape = self.patch_embed(x)
+
+        if self.use_abs_pos_embed:
+            x = x + self.absolute_pos_embed
+        x = self.drop_after_pos(x)
+
+        outs = []
+        for i, stage in enumerate(self.stages):
+            x, hw_shape, out, out_hw_shape = stage(x, hw_shape)
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                out = norm_layer(out)
+                out = out.view(-1, *out_hw_shape,
+                               self.num_features[i]).permute(0, 3, 1,
+                                                             2).contiguous()
+                outs.append(out)
+
+        return outs
+
+
+def swin_converter(ckpt):
+
+    new_ckpt = OrderedDict()
+
+    def correct_unfold_reduction_order(x):
+        out_channel, in_channel = x.shape
+        x = x.reshape(out_channel, 4, in_channel // 4)
+        x = x[:, [0, 2, 1, 3], :].transpose(1,
+                                            2).reshape(out_channel, in_channel)
+        return x
+
+    def correct_unfold_norm_order(x):
+        in_channel = x.shape[0]
+        x = x.reshape(4, in_channel // 4)
+        x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
+        return x
+
+    for k, v in ckpt.items():
+        if k.startswith('head'):
+            continue
+        elif k.startswith('layers'):
+            new_v = v
+            if 'attn.' in k:
+                new_k = k.replace('attn.', 'attn.w_msa.')
+            elif 'mlp.' in k:
+                if 'mlp.fc1.' in k:
+                    new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.')
+                elif 'mlp.fc2.' in k:
+                    new_k = k.replace('mlp.fc2.', 'ffn.layers.1.')
+                else:
+                    new_k = k.replace('mlp.', 'ffn.')
+            elif 'downsample' in k:
+                new_k = k
+                if 'reduction.' in k:
+                    new_v = correct_unfold_reduction_order(v)
+                elif 'norm.' in k:
+                    new_v = correct_unfold_norm_order(v)
+            else:
+                new_k = k
+            new_k = new_k.replace('layers', 'stages', 1)
+        elif k.startswith('patch_embed'):
+            new_v = v
+            if 'proj' in k:
+                new_k = k.replace('proj', 'projection')
+            else:
+                new_k = k
+        else:
+            new_v = v
+            new_k = k
+
+        new_ckpt['backbone.' + new_k] = new_v
+
+    return new_ckpt
diff --git a/mmde/mmdet/models/backbones/trident_resnet.py b/mmde/mmdet/models/backbones/trident_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..22c76354522ff8533b094df6858ec361ba400c1e
--- /dev/null
+++ b/mmde/mmdet/models/backbones/trident_resnet.py
@@ -0,0 +1,298 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.backbones.resnet import Bottleneck, ResNet
+from mmdet.registry import MODELS
+
+
+class TridentConv(BaseModule):
+    """Trident Convolution Module.
+
+    Args:
+        in_channels (int): Number of channels in input.
+        out_channels (int): Number of channels in output.
+        kernel_size (int): Size of convolution kernel.
+        stride (int, optional): Convolution stride. Default: 1.
+        trident_dilations (tuple[int, int, int], optional): Dilations of
+            different trident branch. Default: (1, 2, 3).
+        test_branch_idx (int, optional): In inference, all 3 branches will
+            be used if `test_branch_idx==-1`, otherwise only branch with
+            index `test_branch_idx` will be used. Default: 1.
+        bias (bool, optional): Whether to use bias in convolution or not.
+            Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 trident_dilations=(1, 2, 3),
+                 test_branch_idx=1,
+                 bias=False,
+                 init_cfg=None):
+        super(TridentConv, self).__init__(init_cfg)
+        self.num_branch = len(trident_dilations)
+        self.with_bias = bias
+        self.test_branch_idx = test_branch_idx
+        self.stride = _pair(stride)
+        self.kernel_size = _pair(kernel_size)
+        self.paddings = _pair(trident_dilations)
+        self.dilations = trident_dilations
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.bias = bias
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels, *self.kernel_size))
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.bias = None
+
+    def extra_repr(self):
+        tmpstr = f'in_channels={self.in_channels}'
+        tmpstr += f', out_channels={self.out_channels}'
+        tmpstr += f', kernel_size={self.kernel_size}'
+        tmpstr += f', num_branch={self.num_branch}'
+        tmpstr += f', test_branch_idx={self.test_branch_idx}'
+        tmpstr += f', stride={self.stride}'
+        tmpstr += f', paddings={self.paddings}'
+        tmpstr += f', dilations={self.dilations}'
+        tmpstr += f', bias={self.bias}'
+        return tmpstr
+
+    def forward(self, inputs):
+        if self.training or self.test_branch_idx == -1:
+            outputs = [
+                F.conv2d(input, self.weight, self.bias, self.stride, padding,
+                         dilation) for input, dilation, padding in zip(
+                             inputs, self.dilations, self.paddings)
+            ]
+        else:
+            assert len(inputs) == 1
+            outputs = [
+                F.conv2d(inputs[0], self.weight, self.bias, self.stride,
+                         self.paddings[self.test_branch_idx],
+                         self.dilations[self.test_branch_idx])
+            ]
+
+        return outputs
+
+
+# Since TridentNet is defined over ResNet50 and ResNet101, here we
+# only support TridentBottleneckBlock.
+class TridentBottleneck(Bottleneck):
+    """BottleBlock for TridentResNet.
+
+    Args:
+        trident_dilations (tuple[int, int, int]): Dilations of different
+            trident branch.
+        test_branch_idx (int): In inference, all 3 branches will be used
+            if `test_branch_idx==-1`, otherwise only branch with index
+            `test_branch_idx` will be used.
+        concat_output (bool): Whether to concat the output list to a Tensor.
+            `True` only in the last Block.
+    """
+
+    def __init__(self, trident_dilations, test_branch_idx, concat_output,
+                 **kwargs):
+
+        super(TridentBottleneck, self).__init__(**kwargs)
+        self.trident_dilations = trident_dilations
+        self.num_branch = len(trident_dilations)
+        self.concat_output = concat_output
+        self.test_branch_idx = test_branch_idx
+        self.conv2 = TridentConv(
+            self.planes,
+            self.planes,
+            kernel_size=3,
+            stride=self.conv2_stride,
+            bias=False,
+            trident_dilations=self.trident_dilations,
+            test_branch_idx=test_branch_idx,
+            init_cfg=dict(
+                type='Kaiming',
+                distribution='uniform',
+                mode='fan_in',
+                override=dict(name='conv2')))
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            num_branch = (
+                self.num_branch
+                if self.training or self.test_branch_idx == -1 else 1)
+            identity = x
+            if not isinstance(x, list):
+                x = (x, ) * num_branch
+                identity = x
+                if self.downsample is not None:
+                    identity = [self.downsample(b) for b in x]
+
+            out = [self.conv1(b) for b in x]
+            out = [self.norm1(b) for b in out]
+            out = [self.relu(b) for b in out]
+
+            if self.with_plugins:
+                for k in range(len(out)):
+                    out[k] = self.forward_plugin(out[k],
+                                                 self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+            out = [self.norm2(b) for b in out]
+            out = [self.relu(b) for b in out]
+            if self.with_plugins:
+                for k in range(len(out)):
+                    out[k] = self.forward_plugin(out[k],
+                                                 self.after_conv2_plugin_names)
+
+            out = [self.conv3(b) for b in out]
+            out = [self.norm3(b) for b in out]
+
+            if self.with_plugins:
+                for k in range(len(out)):
+                    out[k] = self.forward_plugin(out[k],
+                                                 self.after_conv3_plugin_names)
+
+            out = [
+                out_b + identity_b for out_b, identity_b in zip(out, identity)
+            ]
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = [self.relu(b) for b in out]
+        if self.concat_output:
+            out = torch.cat(out, dim=0)
+        return out
+
+
+def make_trident_res_layer(block,
+                           inplanes,
+                           planes,
+                           num_blocks,
+                           stride=1,
+                           trident_dilations=(1, 2, 3),
+                           style='pytorch',
+                           with_cp=False,
+                           conv_cfg=None,
+                           norm_cfg=dict(type='BN'),
+                           dcn=None,
+                           plugins=None,
+                           test_branch_idx=-1):
+    """Build Trident Res Layers."""
+
+    downsample = None
+    if stride != 1 or inplanes != planes * block.expansion:
+        downsample = []
+        conv_stride = stride
+        downsample.extend([
+            build_conv_layer(
+                conv_cfg,
+                inplanes,
+                planes * block.expansion,
+                kernel_size=1,
+                stride=conv_stride,
+                bias=False),
+            build_norm_layer(norm_cfg, planes * block.expansion)[1]
+        ])
+        downsample = nn.Sequential(*downsample)
+
+    layers = []
+    for i in range(num_blocks):
+        layers.append(
+            block(
+                inplanes=inplanes,
+                planes=planes,
+                stride=stride if i == 0 else 1,
+                trident_dilations=trident_dilations,
+                downsample=downsample if i == 0 else None,
+                style=style,
+                with_cp=with_cp,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                dcn=dcn,
+                plugins=plugins,
+                test_branch_idx=test_branch_idx,
+                concat_output=True if i == num_blocks - 1 else False))
+        inplanes = planes * block.expansion
+    return nn.Sequential(*layers)
+
+
+@MODELS.register_module()
+class TridentResNet(ResNet):
+    """The stem layer, stage 1 and stage 2 in Trident ResNet are identical to
+    ResNet, while in stage 3, Trident BottleBlock is utilized to replace the
+    normal BottleBlock to yield trident output. Different branch shares the
+    convolution weight but uses different dilations to achieve multi-scale
+    output.
+
+                               / stage3(b0) \
+    x - stem - stage1 - stage2 - stage3(b1) - output
+                               \ stage3(b2) /
+
+    Args:
+        depth (int): Depth of resnet, from {50, 101, 152}.
+        num_branch (int): Number of branches in TridentNet.
+        test_branch_idx (int): In inference, all 3 branches will be used
+            if `test_branch_idx==-1`, otherwise only branch with index
+            `test_branch_idx` will be used.
+        trident_dilations (tuple[int]): Dilations of different trident branch.
+            len(trident_dilations) should be equal to num_branch.
+    """  # noqa
+
+    def __init__(self, depth, num_branch, test_branch_idx, trident_dilations,
+                 **kwargs):
+
+        assert num_branch == len(trident_dilations)
+        assert depth in (50, 101, 152)
+        super(TridentResNet, self).__init__(depth, **kwargs)
+        assert self.num_stages == 3
+        self.test_branch_idx = test_branch_idx
+        self.num_branch = num_branch
+
+        last_stage_idx = self.num_stages - 1
+        stride = self.strides[last_stage_idx]
+        dilation = trident_dilations
+        dcn = self.dcn if self.stage_with_dcn[last_stage_idx] else None
+        if self.plugins is not None:
+            stage_plugins = self.make_stage_plugins(self.plugins,
+                                                    last_stage_idx)
+        else:
+            stage_plugins = None
+        planes = self.base_channels * 2**last_stage_idx
+        res_layer = make_trident_res_layer(
+            TridentBottleneck,
+            inplanes=(self.block.expansion * self.base_channels *
+                      2**(last_stage_idx - 1)),
+            planes=planes,
+            num_blocks=self.stage_blocks[last_stage_idx],
+            stride=stride,
+            trident_dilations=dilation,
+            style=self.style,
+            with_cp=self.with_cp,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            dcn=dcn,
+            plugins=stage_plugins,
+            test_branch_idx=self.test_branch_idx)
+
+        layer_name = f'layer{last_stage_idx + 1}'
+
+        self.__setattr__(layer_name, res_layer)
+        self.res_layers.pop(last_stage_idx)
+        self.res_layers.insert(last_stage_idx, layer_name)
+
+        self._freeze_stages()
diff --git a/mmde/mmdet/models/data_preprocessors/__init__.py b/mmde/mmdet/models/data_preprocessors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..201a1da6a4f320a17cea9c65d5c102bfdd7700d8
--- /dev/null
+++ b/mmde/mmdet/models/data_preprocessors/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .data_preprocessor import (BatchFixedSizePad, BatchResize,
+                                BatchSyncRandomResize, BoxInstDataPreprocessor,
+                                DetDataPreprocessor,
+                                MultiBranchDataPreprocessor)
+from .reid_data_preprocessor import ReIDDataPreprocessor
+from .track_data_preprocessor import TrackDataPreprocessor
+
+__all__ = [
+    'DetDataPreprocessor', 'BatchSyncRandomResize', 'BatchFixedSizePad',
+    'MultiBranchDataPreprocessor', 'BatchResize', 'BoxInstDataPreprocessor',
+    'TrackDataPreprocessor', 'ReIDDataPreprocessor'
+]
diff --git a/mmde/mmdet/models/data_preprocessors/data_preprocessor.py b/mmde/mmdet/models/data_preprocessors/data_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..55b5c35b3a4888c95c6646df3fa080347afe4704
--- /dev/null
+++ b/mmde/mmdet/models/data_preprocessors/data_preprocessor.py
@@ -0,0 +1,793 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+from numbers import Number
+from typing import List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.dist import barrier, broadcast, get_dist_info
+from mmengine.logging import MessageHub
+from mmengine.model import BaseDataPreprocessor, ImgDataPreprocessor
+from mmengine.structures import PixelData
+from mmengine.utils import is_seq_of
+from torch import Tensor
+
+from mmdet.models.utils import unfold_wo_center
+from mmdet.models.utils.misc import samplelist_boxtype2tensor
+from mmdet.registry import MODELS
+from mmdet.structures import DetDataSample
+from mmdet.structures.mask import BitmapMasks
+from mmdet.utils import ConfigType
+
+try:
+    import skimage
+except ImportError:
+    skimage = None
+
+
+@MODELS.register_module()
+class DetDataPreprocessor(ImgDataPreprocessor):
+    """Image pre-processor for detection tasks.
+
+    Comparing with the :class:`mmengine.ImgDataPreprocessor`,
+
+    1. It supports batch augmentations.
+    2. It will additionally append batch_input_shape and pad_shape
+    to data_samples considering the object detection task.
+
+    It provides the data pre-processing as follows
+
+    - Collate and move data to the target device.
+    - Pad inputs to the maximum size of current batch with defined
+      ``pad_value``. The padding size can be divisible by a defined
+      ``pad_size_divisor``
+    - Stack inputs to batch_inputs.
+    - Convert inputs from bgr to rgb if the shape of input is (3, H, W).
+    - Normalize image with defined std and mean.
+    - Do batch augmentations during training.
+
+    Args:
+        mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
+            Defaults to None.
+        std (Sequence[Number], optional): The pixel standard deviation of
+            R, G, B channels. Defaults to None.
+        pad_size_divisor (int): The size of padded image should be
+            divisible by ``pad_size_divisor``. Defaults to 1.
+        pad_value (Number): The padded pixel value. Defaults to 0.
+        pad_mask (bool): Whether to pad instance masks. Defaults to False.
+        mask_pad_value (int): The padded pixel value for instance masks.
+            Defaults to 0.
+        pad_seg (bool): Whether to pad semantic segmentation maps.
+            Defaults to False.
+        seg_pad_value (int): The padded pixel value for semantic
+            segmentation maps. Defaults to 255.
+        bgr_to_rgb (bool): whether to convert image from BGR to RGB.
+            Defaults to False.
+        rgb_to_bgr (bool): whether to convert image from RGB to RGB.
+            Defaults to False.
+        boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
+            bboxes data to ``Tensor`` type. Defaults to True.
+        non_blocking (bool): Whether block current process
+            when transferring data to device. Defaults to False.
+        batch_augments (list[dict], optional): Batch-level augmentations
+    """
+
+    def __init__(self,
+                 mean: Sequence[Number] = None,
+                 std: Sequence[Number] = None,
+                 pad_size_divisor: int = 1,
+                 pad_value: Union[float, int] = 0,
+                 pad_mask: bool = False,
+                 mask_pad_value: int = 0,
+                 pad_seg: bool = False,
+                 seg_pad_value: int = 255,
+                 bgr_to_rgb: bool = False,
+                 rgb_to_bgr: bool = False,
+                 boxtype2tensor: bool = True,
+                 non_blocking: Optional[bool] = False,
+                 batch_augments: Optional[List[dict]] = None):
+        super().__init__(
+            mean=mean,
+            std=std,
+            pad_size_divisor=pad_size_divisor,
+            pad_value=pad_value,
+            bgr_to_rgb=bgr_to_rgb,
+            rgb_to_bgr=rgb_to_bgr,
+            non_blocking=non_blocking)
+        if batch_augments is not None:
+            self.batch_augments = nn.ModuleList(
+                [MODELS.build(aug) for aug in batch_augments])
+        else:
+            self.batch_augments = None
+        self.pad_mask = pad_mask
+        self.mask_pad_value = mask_pad_value
+        self.pad_seg = pad_seg
+        self.seg_pad_value = seg_pad_value
+        self.boxtype2tensor = boxtype2tensor
+
+    def forward(self, data: dict, training: bool = False) -> dict:
+        """Perform normalization,padding and bgr2rgb conversion based on
+        ``BaseDataPreprocessor``.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+
+        Returns:
+            dict: Data in the same format as the model input.
+        """
+        batch_pad_shape = self._get_pad_shape(data)
+        data = super().forward(data=data, training=training)
+        inputs, data_samples = data['inputs'], data['data_samples']
+
+        if data_samples is not None:
+            # NOTE the batched image size information may be useful, e.g.
+            # in DETR, this is needed for the construction of masks, which is
+            # then used for the transformer_head.
+            batch_input_shape = tuple(inputs[0].size()[-2:])
+            for data_sample, pad_shape in zip(data_samples, batch_pad_shape):
+                data_sample.set_metainfo({
+                    'batch_input_shape': batch_input_shape,
+                    'pad_shape': pad_shape
+                })
+
+            if self.boxtype2tensor:
+                samplelist_boxtype2tensor(data_samples)
+
+            if self.pad_mask and training:
+                self.pad_gt_masks(data_samples)
+
+            if self.pad_seg and training:
+                self.pad_gt_sem_seg(data_samples)
+
+        if training and self.batch_augments is not None:
+            for batch_aug in self.batch_augments:
+                inputs, data_samples = batch_aug(inputs, data_samples)
+
+        return {'inputs': inputs, 'data_samples': data_samples}
+
+    def _get_pad_shape(self, data: dict) -> List[tuple]:
+        """Get the pad_shape of each image based on data and
+        pad_size_divisor."""
+        _batch_inputs = data['inputs']
+        # Process data with `pseudo_collate`.
+        if is_seq_of(_batch_inputs, torch.Tensor):
+            batch_pad_shape = []
+            for ori_input in _batch_inputs:
+                pad_h = int(
+                    np.ceil(ori_input.shape[1] /
+                            self.pad_size_divisor)) * self.pad_size_divisor
+                pad_w = int(
+                    np.ceil(ori_input.shape[2] /
+                            self.pad_size_divisor)) * self.pad_size_divisor
+                batch_pad_shape.append((pad_h, pad_w))
+        # Process data with `default_collate`.
+        elif isinstance(_batch_inputs, torch.Tensor):
+            assert _batch_inputs.dim() == 4, (
+                'The input of `ImgDataPreprocessor` should be a NCHW tensor '
+                'or a list of tensor, but got a tensor with shape: '
+                f'{_batch_inputs.shape}')
+            pad_h = int(
+                np.ceil(_batch_inputs.shape[2] /
+                        self.pad_size_divisor)) * self.pad_size_divisor
+            pad_w = int(
+                np.ceil(_batch_inputs.shape[3] /
+                        self.pad_size_divisor)) * self.pad_size_divisor
+            batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0]
+        else:
+            raise TypeError('Output of `cast_data` should be a dict '
+                            'or a tuple with inputs and data_samples, but got'
+                            f'{type(data)}: {data}')
+        return batch_pad_shape
+
+    def pad_gt_masks(self,
+                     batch_data_samples: Sequence[DetDataSample]) -> None:
+        """Pad gt_masks to shape of batch_input_shape."""
+        if 'masks' in batch_data_samples[0].gt_instances:
+            for data_samples in batch_data_samples:
+                masks = data_samples.gt_instances.masks
+                data_samples.gt_instances.masks = masks.pad(
+                    data_samples.batch_input_shape,
+                    pad_val=self.mask_pad_value)
+
+    def pad_gt_sem_seg(self,
+                       batch_data_samples: Sequence[DetDataSample]) -> None:
+        """Pad gt_sem_seg to shape of batch_input_shape."""
+        if 'gt_sem_seg' in batch_data_samples[0]:
+            for data_samples in batch_data_samples:
+                gt_sem_seg = data_samples.gt_sem_seg.sem_seg
+                h, w = gt_sem_seg.shape[-2:]
+                pad_h, pad_w = data_samples.batch_input_shape
+                gt_sem_seg = F.pad(
+                    gt_sem_seg,
+                    pad=(0, max(pad_w - w, 0), 0, max(pad_h - h, 0)),
+                    mode='constant',
+                    value=self.seg_pad_value)
+                data_samples.gt_sem_seg = PixelData(sem_seg=gt_sem_seg)
+
+
+@MODELS.register_module()
+class BatchSyncRandomResize(nn.Module):
+    """Batch random resize which synchronizes the random size across ranks.
+
+    Args:
+        random_size_range (tuple): The multi-scale random range during
+            multi-scale training.
+        interval (int): The iter interval of change
+            image size. Defaults to 10.
+        size_divisor (int): Image size divisible factor.
+            Defaults to 32.
+    """
+
+    def __init__(self,
+                 random_size_range: Tuple[int, int],
+                 interval: int = 10,
+                 size_divisor: int = 32) -> None:
+        super().__init__()
+        self.rank, self.world_size = get_dist_info()
+        self._input_size = None
+        self._random_size_range = (round(random_size_range[0] / size_divisor),
+                                   round(random_size_range[1] / size_divisor))
+        self._interval = interval
+        self._size_divisor = size_divisor
+
+    def forward(
+        self, inputs: Tensor, data_samples: List[DetDataSample]
+    ) -> Tuple[Tensor, List[DetDataSample]]:
+        """resize a batch of images and bboxes to shape ``self._input_size``"""
+        h, w = inputs.shape[-2:]
+        if self._input_size is None:
+            self._input_size = (h, w)
+        scale_y = self._input_size[0] / h
+        scale_x = self._input_size[1] / w
+        if scale_x != 1 or scale_y != 1:
+            inputs = F.interpolate(
+                inputs,
+                size=self._input_size,
+                mode='bilinear',
+                align_corners=False)
+            for data_sample in data_samples:
+                img_shape = (int(data_sample.img_shape[0] * scale_y),
+                             int(data_sample.img_shape[1] * scale_x))
+                pad_shape = (int(data_sample.pad_shape[0] * scale_y),
+                             int(data_sample.pad_shape[1] * scale_x))
+                data_sample.set_metainfo({
+                    'img_shape': img_shape,
+                    'pad_shape': pad_shape,
+                    'batch_input_shape': self._input_size
+                })
+                data_sample.gt_instances.bboxes[
+                    ...,
+                    0::2] = data_sample.gt_instances.bboxes[...,
+                                                            0::2] * scale_x
+                data_sample.gt_instances.bboxes[
+                    ...,
+                    1::2] = data_sample.gt_instances.bboxes[...,
+                                                            1::2] * scale_y
+                if 'ignored_instances' in data_sample:
+                    data_sample.ignored_instances.bboxes[
+                        ..., 0::2] = data_sample.ignored_instances.bboxes[
+                            ..., 0::2] * scale_x
+                    data_sample.ignored_instances.bboxes[
+                        ..., 1::2] = data_sample.ignored_instances.bboxes[
+                            ..., 1::2] * scale_y
+        message_hub = MessageHub.get_current_instance()
+        if (message_hub.get_info('iter') + 1) % self._interval == 0:
+            self._input_size = self._get_random_size(
+                aspect_ratio=float(w / h), device=inputs.device)
+        return inputs, data_samples
+
+    def _get_random_size(self, aspect_ratio: float,
+                         device: torch.device) -> Tuple[int, int]:
+        """Randomly generate a shape in ``_random_size_range`` and broadcast to
+        all ranks."""
+        tensor = torch.LongTensor(2).to(device)
+        if self.rank == 0:
+            size = random.randint(*self._random_size_range)
+            size = (self._size_divisor * size,
+                    self._size_divisor * int(aspect_ratio * size))
+            tensor[0] = size[0]
+            tensor[1] = size[1]
+        barrier()
+        broadcast(tensor, 0)
+        input_size = (tensor[0].item(), tensor[1].item())
+        return input_size
+
+
+@MODELS.register_module()
+class BatchFixedSizePad(nn.Module):
+    """Fixed size padding for batch images.
+
+    Args:
+        size (Tuple[int, int]): Fixed padding size. Expected padding
+            shape (h, w). Defaults to None.
+        img_pad_value (int): The padded pixel value for images.
+            Defaults to 0.
+        pad_mask (bool): Whether to pad instance masks. Defaults to False.
+        mask_pad_value (int): The padded pixel value for instance masks.
+            Defaults to 0.
+        pad_seg (bool): Whether to pad semantic segmentation maps.
+            Defaults to False.
+        seg_pad_value (int): The padded pixel value for semantic
+            segmentation maps. Defaults to 255.
+    """
+
+    def __init__(self,
+                 size: Tuple[int, int],
+                 img_pad_value: int = 0,
+                 pad_mask: bool = False,
+                 mask_pad_value: int = 0,
+                 pad_seg: bool = False,
+                 seg_pad_value: int = 255) -> None:
+        super().__init__()
+        self.size = size
+        self.pad_mask = pad_mask
+        self.pad_seg = pad_seg
+        self.img_pad_value = img_pad_value
+        self.mask_pad_value = mask_pad_value
+        self.seg_pad_value = seg_pad_value
+
+    def forward(
+        self,
+        inputs: Tensor,
+        data_samples: Optional[List[dict]] = None
+    ) -> Tuple[Tensor, Optional[List[dict]]]:
+        """Pad image, instance masks, segmantic segmentation maps."""
+        src_h, src_w = inputs.shape[-2:]
+        dst_h, dst_w = self.size
+
+        if src_h >= dst_h and src_w >= dst_w:
+            return inputs, data_samples
+
+        inputs = F.pad(
+            inputs,
+            pad=(0, max(0, dst_w - src_w), 0, max(0, dst_h - src_h)),
+            mode='constant',
+            value=self.img_pad_value)
+
+        if data_samples is not None:
+            # update batch_input_shape
+            for data_sample in data_samples:
+                data_sample.set_metainfo({
+                    'batch_input_shape': (dst_h, dst_w),
+                    'pad_shape': (dst_h, dst_w)
+                })
+
+            if self.pad_mask:
+                for data_sample in data_samples:
+                    masks = data_sample.gt_instances.masks
+                    data_sample.gt_instances.masks = masks.pad(
+                        (dst_h, dst_w), pad_val=self.mask_pad_value)
+
+            if self.pad_seg:
+                for data_sample in data_samples:
+                    gt_sem_seg = data_sample.gt_sem_seg.sem_seg
+                    h, w = gt_sem_seg.shape[-2:]
+                    gt_sem_seg = F.pad(
+                        gt_sem_seg,
+                        pad=(0, max(0, dst_w - w), 0, max(0, dst_h - h)),
+                        mode='constant',
+                        value=self.seg_pad_value)
+                    data_sample.gt_sem_seg = PixelData(sem_seg=gt_sem_seg)
+
+        return inputs, data_samples
+
+
+@MODELS.register_module()
+class MultiBranchDataPreprocessor(BaseDataPreprocessor):
+    """DataPreprocessor wrapper for multi-branch data.
+
+    Take semi-supervised object detection as an example, assume that
+    the ratio of labeled data and unlabeled data in a batch is 1:2,
+    `sup` indicates the branch where the labeled data is augmented,
+    `unsup_teacher` and `unsup_student` indicate the branches where
+    the unlabeled data is augmented by different pipeline.
+
+    The input format of multi-branch data is shown as below :
+
+    .. code-block:: none
+        {
+            'inputs':
+                {
+                    'sup': [Tensor, None, None],
+                    'unsup_teacher': [None, Tensor, Tensor],
+                    'unsup_student': [None, Tensor, Tensor],
+                },
+            'data_sample':
+                {
+                    'sup': [DetDataSample, None, None],
+                    'unsup_teacher': [None, DetDataSample, DetDataSample],
+                    'unsup_student': [NOne, DetDataSample, DetDataSample],
+                }
+        }
+
+    The format of multi-branch data
+    after filtering None is shown as below :
+
+    .. code-block:: none
+        {
+            'inputs':
+                {
+                    'sup': [Tensor],
+                    'unsup_teacher': [Tensor, Tensor],
+                    'unsup_student': [Tensor, Tensor],
+                },
+            'data_sample':
+                {
+                    'sup': [DetDataSample],
+                    'unsup_teacher': [DetDataSample, DetDataSample],
+                    'unsup_student': [DetDataSample, DetDataSample],
+                }
+        }
+
+    In order to reuse `DetDataPreprocessor` for the data
+    from different branches, the format of multi-branch data
+    grouped by branch is as below :
+
+    .. code-block:: none
+        {
+            'sup':
+                {
+                    'inputs': [Tensor]
+                    'data_sample': [DetDataSample, DetDataSample]
+                },
+            'unsup_teacher':
+                {
+                    'inputs': [Tensor, Tensor]
+                    'data_sample': [DetDataSample, DetDataSample]
+                },
+            'unsup_student':
+                {
+                    'inputs': [Tensor, Tensor]
+                    'data_sample': [DetDataSample, DetDataSample]
+                },
+        }
+
+    After preprocessing data from different branches,
+    the multi-branch data needs to be reformatted as:
+
+    .. code-block:: none
+        {
+            'inputs':
+                {
+                    'sup': [Tensor],
+                    'unsup_teacher': [Tensor, Tensor],
+                    'unsup_student': [Tensor, Tensor],
+                },
+            'data_sample':
+                {
+                    'sup': [DetDataSample],
+                    'unsup_teacher': [DetDataSample, DetDataSample],
+                    'unsup_student': [DetDataSample, DetDataSample],
+                }
+        }
+
+    Args:
+        data_preprocessor (:obj:`ConfigDict` or dict): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+    """
+
+    def __init__(self, data_preprocessor: ConfigType) -> None:
+        super().__init__()
+        self.data_preprocessor = MODELS.build(data_preprocessor)
+
+    def forward(self, data: dict, training: bool = False) -> dict:
+        """Perform normalization,padding and bgr2rgb conversion based on
+        ``BaseDataPreprocessor`` for multi-branch data.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+
+        Returns:
+            dict:
+
+            - 'inputs' (Dict[str, obj:`torch.Tensor`]): The forward data of
+                models from different branches.
+            - 'data_sample' (Dict[str, obj:`DetDataSample`]): The annotation
+                info of the sample from different branches.
+        """
+
+        if training is False:
+            return self.data_preprocessor(data, training)
+
+        # Filter out branches with a value of None
+        for key in data.keys():
+            for branch in data[key].keys():
+                data[key][branch] = list(
+                    filter(lambda x: x is not None, data[key][branch]))
+
+        # Group data by branch
+        multi_branch_data = {}
+        for key in data.keys():
+            for branch in data[key].keys():
+                if multi_branch_data.get(branch, None) is None:
+                    multi_branch_data[branch] = {key: data[key][branch]}
+                elif multi_branch_data[branch].get(key, None) is None:
+                    multi_branch_data[branch][key] = data[key][branch]
+                else:
+                    multi_branch_data[branch][key].append(data[key][branch])
+
+        # Preprocess data from different branches
+        for branch, _data in multi_branch_data.items():
+            multi_branch_data[branch] = self.data_preprocessor(_data, training)
+
+        # Format data by inputs and data_samples
+        format_data = {}
+        for branch in multi_branch_data.keys():
+            for key in multi_branch_data[branch].keys():
+                if format_data.get(key, None) is None:
+                    format_data[key] = {branch: multi_branch_data[branch][key]}
+                elif format_data[key].get(branch, None) is None:
+                    format_data[key][branch] = multi_branch_data[branch][key]
+                else:
+                    format_data[key][branch].append(
+                        multi_branch_data[branch][key])
+
+        return format_data
+
+    @property
+    def device(self):
+        return self.data_preprocessor.device
+
+    def to(self, device: Optional[Union[int, torch.device]], *args,
+           **kwargs) -> nn.Module:
+        """Overrides this method to set the :attr:`device`
+
+        Args:
+            device (int or torch.device, optional): The desired device of the
+                parameters and buffers in this module.
+
+        Returns:
+            nn.Module: The model itself.
+        """
+
+        return self.data_preprocessor.to(device, *args, **kwargs)
+
+    def cuda(self, *args, **kwargs) -> nn.Module:
+        """Overrides this method to set the :attr:`device`
+
+        Returns:
+            nn.Module: The model itself.
+        """
+
+        return self.data_preprocessor.cuda(*args, **kwargs)
+
+    def cpu(self, *args, **kwargs) -> nn.Module:
+        """Overrides this method to set the :attr:`device`
+
+        Returns:
+            nn.Module: The model itself.
+        """
+
+        return self.data_preprocessor.cpu(*args, **kwargs)
+
+
+@MODELS.register_module()
+class BatchResize(nn.Module):
+    """Batch resize during training. This implementation is modified from
+    https://github.com/Purkialo/CrowdDet/blob/master/lib/data/CrowdHuman.py.
+
+    It provides the data pre-processing as follows:
+    - A batch of all images will pad to a uniform size and stack them into
+      a torch.Tensor by `DetDataPreprocessor`.
+    - `BatchFixShapeResize` resize all images to the target size.
+    - Padding images to make sure the size of image can be divisible by
+      ``pad_size_divisor``.
+
+    Args:
+        scale (tuple): Images scales for resizing.
+        pad_size_divisor (int): Image size divisible factor.
+            Defaults to 1.
+        pad_value (Number): The padded pixel value. Defaults to 0.
+    """
+
+    def __init__(
+        self,
+        scale: tuple,
+        pad_size_divisor: int = 1,
+        pad_value: Union[float, int] = 0,
+    ) -> None:
+        super().__init__()
+        self.min_size = min(scale)
+        self.max_size = max(scale)
+        self.pad_size_divisor = pad_size_divisor
+        self.pad_value = pad_value
+
+    def forward(
+        self, inputs: Tensor, data_samples: List[DetDataSample]
+    ) -> Tuple[Tensor, List[DetDataSample]]:
+        """resize a batch of images and bboxes."""
+
+        batch_height, batch_width = inputs.shape[-2:]
+        target_height, target_width, scale = self.get_target_size(
+            batch_height, batch_width)
+
+        inputs = F.interpolate(
+            inputs,
+            size=(target_height, target_width),
+            mode='bilinear',
+            align_corners=False)
+
+        inputs = self.get_padded_tensor(inputs, self.pad_value)
+
+        if data_samples is not None:
+            batch_input_shape = tuple(inputs.size()[-2:])
+            for data_sample in data_samples:
+                img_shape = [
+                    int(scale * _) for _ in list(data_sample.img_shape)
+                ]
+                data_sample.set_metainfo({
+                    'img_shape': tuple(img_shape),
+                    'batch_input_shape': batch_input_shape,
+                    'pad_shape': batch_input_shape,
+                    'scale_factor': (scale, scale)
+                })
+
+                data_sample.gt_instances.bboxes *= scale
+                data_sample.ignored_instances.bboxes *= scale
+
+        return inputs, data_samples
+
+    def get_target_size(self, height: int,
+                        width: int) -> Tuple[int, int, float]:
+        """Get the target size of a batch of images based on data and scale."""
+        im_size_min = np.min([height, width])
+        im_size_max = np.max([height, width])
+        scale = self.min_size / im_size_min
+        if scale * im_size_max > self.max_size:
+            scale = self.max_size / im_size_max
+        target_height, target_width = int(round(height * scale)), int(
+            round(width * scale))
+        return target_height, target_width, scale
+
+    def get_padded_tensor(self, tensor: Tensor, pad_value: int) -> Tensor:
+        """Pad images according to pad_size_divisor."""
+        assert tensor.ndim == 4
+        target_height, target_width = tensor.shape[-2], tensor.shape[-1]
+        divisor = self.pad_size_divisor
+        padded_height = (target_height + divisor - 1) // divisor * divisor
+        padded_width = (target_width + divisor - 1) // divisor * divisor
+        padded_tensor = torch.ones([
+            tensor.shape[0], tensor.shape[1], padded_height, padded_width
+        ]) * pad_value
+        padded_tensor = padded_tensor.type_as(tensor)
+        padded_tensor[:, :, :target_height, :target_width] = tensor
+        return padded_tensor
+
+
+@MODELS.register_module()
+class BoxInstDataPreprocessor(DetDataPreprocessor):
+    """Pseudo mask pre-processor for BoxInst.
+
+    Comparing with the :class:`mmdet.DetDataPreprocessor`,
+
+    1. It generates masks using box annotations.
+    2. It computes the images color similarity in LAB color space.
+
+    Args:
+        mask_stride (int): The mask output stride in boxinst. Defaults to 4.
+        pairwise_size (int): The size of neighborhood for each pixel.
+            Defaults to 3.
+        pairwise_dilation (int): The dilation of neighborhood for each pixel.
+            Defaults to 2.
+        pairwise_color_thresh (float): The thresh of image color similarity.
+            Defaults to 0.3.
+        bottom_pixels_removed (int): The length of removed pixels in bottom.
+            It is caused by the annotation error in coco dataset.
+            Defaults to 10.
+    """
+
+    def __init__(self,
+                 *arg,
+                 mask_stride: int = 4,
+                 pairwise_size: int = 3,
+                 pairwise_dilation: int = 2,
+                 pairwise_color_thresh: float = 0.3,
+                 bottom_pixels_removed: int = 10,
+                 **kwargs) -> None:
+        super().__init__(*arg, **kwargs)
+        self.mask_stride = mask_stride
+        self.pairwise_size = pairwise_size
+        self.pairwise_dilation = pairwise_dilation
+        self.pairwise_color_thresh = pairwise_color_thresh
+        self.bottom_pixels_removed = bottom_pixels_removed
+
+        if skimage is None:
+            raise RuntimeError('skimage is not installed,\
+                 please install it by: pip install scikit-image')
+
+    def get_images_color_similarity(self, inputs: Tensor,
+                                    image_masks: Tensor) -> Tensor:
+        """Compute the image color similarity in LAB color space."""
+        assert inputs.dim() == 4
+        assert inputs.size(0) == 1
+
+        unfolded_images = unfold_wo_center(
+            inputs,
+            kernel_size=self.pairwise_size,
+            dilation=self.pairwise_dilation)
+        diff = inputs[:, :, None] - unfolded_images
+        similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5)
+
+        unfolded_weights = unfold_wo_center(
+            image_masks[None, None],
+            kernel_size=self.pairwise_size,
+            dilation=self.pairwise_dilation)
+        unfolded_weights = torch.max(unfolded_weights, dim=1)[0]
+
+        return similarity * unfolded_weights
+
+    def forward(self, data: dict, training: bool = False) -> dict:
+        """Get pseudo mask labels using color similarity."""
+        det_data = super().forward(data, training)
+        inputs, data_samples = det_data['inputs'], det_data['data_samples']
+
+        if training:
+            # get image masks and remove bottom pixels
+            b_img_h, b_img_w = data_samples[0].batch_input_shape
+            img_masks = []
+            for i in range(inputs.shape[0]):
+                img_h, img_w = data_samples[i].img_shape
+                img_mask = inputs.new_ones((img_h, img_w))
+                pixels_removed = int(self.bottom_pixels_removed *
+                                     float(img_h) / float(b_img_h))
+                if pixels_removed > 0:
+                    img_mask[-pixels_removed:, :] = 0
+                pad_w = b_img_w - img_w
+                pad_h = b_img_h - img_h
+                img_mask = F.pad(img_mask, (0, pad_w, 0, pad_h), 'constant',
+                                 0.)
+                img_masks.append(img_mask)
+            img_masks = torch.stack(img_masks, dim=0)
+            start = int(self.mask_stride // 2)
+            img_masks = img_masks[:, start::self.mask_stride,
+                                  start::self.mask_stride]
+
+            # Get origin rgb image for color similarity
+            ori_imgs = inputs * self.std + self.mean
+            downsampled_imgs = F.avg_pool2d(
+                ori_imgs.float(),
+                kernel_size=self.mask_stride,
+                stride=self.mask_stride,
+                padding=0)
+
+            # Compute color similarity for pseudo mask generation
+            for im_i, data_sample in enumerate(data_samples):
+                # TODO: Support rgb2lab in mmengine?
+                images_lab = skimage.color.rgb2lab(
+                    downsampled_imgs[im_i].byte().permute(1, 2,
+                                                          0).cpu().numpy())
+                images_lab = torch.as_tensor(
+                    images_lab, device=ori_imgs.device, dtype=torch.float32)
+                images_lab = images_lab.permute(2, 0, 1)[None]
+                images_color_similarity = self.get_images_color_similarity(
+                    images_lab, img_masks[im_i])
+                pairwise_mask = (images_color_similarity >=
+                                 self.pairwise_color_thresh).float()
+
+                per_im_bboxes = data_sample.gt_instances.bboxes
+                if per_im_bboxes.shape[0] > 0:
+                    per_im_masks = []
+                    for per_box in per_im_bboxes:
+                        mask_full = torch.zeros((b_img_h, b_img_w),
+                                                device=self.device).float()
+                        mask_full[int(per_box[1]):int(per_box[3] + 1),
+                                  int(per_box[0]):int(per_box[2] + 1)] = 1.0
+                        per_im_masks.append(mask_full)
+                    per_im_masks = torch.stack(per_im_masks, dim=0)
+                    pairwise_masks = torch.cat(
+                        [pairwise_mask for _ in range(per_im_bboxes.shape[0])],
+                        dim=0)
+                else:
+                    per_im_masks = torch.zeros((0, b_img_h, b_img_w))
+                    pairwise_masks = torch.zeros(
+                        (0, self.pairwise_size**2 - 1, b_img_h, b_img_w))
+
+                # TODO: Support BitmapMasks with tensor?
+                data_sample.gt_instances.masks = BitmapMasks(
+                    per_im_masks.cpu().numpy(), b_img_h, b_img_w)
+                data_sample.gt_instances.pairwise_masks = pairwise_masks
+        return {'inputs': inputs, 'data_samples': data_samples}
diff --git a/mmde/mmdet/models/data_preprocessors/reid_data_preprocessor.py b/mmde/mmdet/models/data_preprocessors/reid_data_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d0a1d45d97ba350e8845c6620f3b73f05545e61
--- /dev/null
+++ b/mmde/mmdet/models/data_preprocessors/reid_data_preprocessor.py
@@ -0,0 +1,216 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from numbers import Number
+from typing import Optional, Sequence
+
+import torch
+import torch.nn.functional as F
+from mmengine.model import BaseDataPreprocessor, stack_batch
+
+from mmdet.registry import MODELS
+
+try:
+    import mmpretrain
+    from mmpretrain.models.utils.batch_augments import RandomBatchAugment
+    from mmpretrain.structures import (batch_label_to_onehot, cat_batch_labels,
+                                       tensor_split)
+except ImportError:
+    mmpretrain = None
+
+
+def stack_batch_scores(elements, device=None):
+    """Stack the ``score`` of a batch of :obj:`LabelData` to a tensor.
+
+    Args:
+        elements (List[LabelData]): A batch of :obj`LabelData`.
+        device (torch.device, optional): The output device of the batch label.
+            Defaults to None.
+    Returns:
+        torch.Tensor: The stacked score tensor.
+    """
+    item = elements[0]
+    if 'score' not in item._data_fields:
+        return None
+
+    batch_score = torch.stack([element.score for element in elements])
+    if device is not None:
+        batch_score = batch_score.to(device)
+    return batch_score
+
+
+@MODELS.register_module()
+class ReIDDataPreprocessor(BaseDataPreprocessor):
+    """Image pre-processor for classification tasks.
+
+    Comparing with the :class:`mmengine.model.ImgDataPreprocessor`,
+
+    1. It won't do normalization if ``mean`` is not specified.
+    2. It does normalization and color space conversion after stacking batch.
+    3. It supports batch augmentations like mixup and cutmix.
+
+    It provides the data pre-processing as follows
+
+    - Collate and move data to the target device.
+    - Pad inputs to the maximum size of current batch with defined
+      ``pad_value``. The padding size can be divisible by a defined
+      ``pad_size_divisor``
+    - Stack inputs to batch_inputs.
+    - Convert inputs from bgr to rgb if the shape of input is (3, H, W).
+    - Normalize image with defined std and mean.
+    - Do batch augmentations like Mixup and Cutmix during training.
+
+    Args:
+        mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
+            Defaults to None.
+        std (Sequence[Number], optional): The pixel standard deviation of
+            R, G, B channels. Defaults to None.
+        pad_size_divisor (int): The size of padded image should be
+            divisible by ``pad_size_divisor``. Defaults to 1.
+        pad_value (Number): The padded pixel value. Defaults to 0.
+        to_rgb (bool): whether to convert image from BGR to RGB.
+            Defaults to False.
+        to_onehot (bool): Whether to generate one-hot format gt-labels and set
+            to data samples. Defaults to False.
+        num_classes (int, optional): The number of classes. Defaults to None.
+        batch_augments (dict, optional): The batch augmentations settings,
+            including "augments" and "probs". For more details, see
+            :class:`mmpretrain.models.RandomBatchAugment`.
+    """
+
+    def __init__(self,
+                 mean: Sequence[Number] = None,
+                 std: Sequence[Number] = None,
+                 pad_size_divisor: int = 1,
+                 pad_value: Number = 0,
+                 to_rgb: bool = False,
+                 to_onehot: bool = False,
+                 num_classes: Optional[int] = None,
+                 batch_augments: Optional[dict] = None):
+        if mmpretrain is None:
+            raise RuntimeError('Please run "pip install openmim" and '
+                               'run "mim install mmpretrain" to '
+                               'install mmpretrain first.')
+        super().__init__()
+        self.pad_size_divisor = pad_size_divisor
+        self.pad_value = pad_value
+        self.to_rgb = to_rgb
+        self.to_onehot = to_onehot
+        self.num_classes = num_classes
+
+        if mean is not None:
+            assert std is not None, 'To enable the normalization in ' \
+                'preprocessing, please specify both `mean` and `std`.'
+            # Enable the normalization in preprocessing.
+            self._enable_normalize = True
+            self.register_buffer('mean',
+                                 torch.tensor(mean).view(-1, 1, 1), False)
+            self.register_buffer('std',
+                                 torch.tensor(std).view(-1, 1, 1), False)
+        else:
+            self._enable_normalize = False
+
+        if batch_augments is not None:
+            self.batch_augments = RandomBatchAugment(**batch_augments)
+            if not self.to_onehot:
+                from mmengine.logging import MMLogger
+                MMLogger.get_current_instance().info(
+                    'Because batch augmentations are enabled, the data '
+                    'preprocessor automatically enables the `to_onehot` '
+                    'option to generate one-hot format labels.')
+                self.to_onehot = True
+        else:
+            self.batch_augments = None
+
+    def forward(self, data: dict, training: bool = False) -> dict:
+        """Perform normalization, padding, bgr2rgb conversion and batch
+        augmentation based on ``BaseDataPreprocessor``.
+
+        Args:
+            data (dict): data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+
+        Returns:
+            dict: Data in the same format as the model input.
+        """
+        inputs = self.cast_data(data['inputs'])
+
+        if isinstance(inputs, torch.Tensor):
+            # The branch if use `default_collate` as the collate_fn in the
+            # dataloader.
+
+            # ------ To RGB ------
+            if self.to_rgb and inputs.size(1) == 3:
+                inputs = inputs.flip(1)
+
+            # -- Normalization ---
+            inputs = inputs.float()
+            if self._enable_normalize:
+                inputs = (inputs - self.mean) / self.std
+
+            # ------ Padding -----
+            if self.pad_size_divisor > 1:
+                h, w = inputs.shape[-2:]
+
+                target_h = math.ceil(
+                    h / self.pad_size_divisor) * self.pad_size_divisor
+                target_w = math.ceil(
+                    w / self.pad_size_divisor) * self.pad_size_divisor
+                pad_h = target_h - h
+                pad_w = target_w - w
+                inputs = F.pad(inputs, (0, pad_w, 0, pad_h), 'constant',
+                               self.pad_value)
+        else:
+            # The branch if use `pseudo_collate` as the collate_fn in the
+            # dataloader.
+
+            processed_inputs = []
+            for input_ in inputs:
+                # ------ To RGB ------
+                if self.to_rgb and input_.size(0) == 3:
+                    input_ = input_.flip(0)
+
+                # -- Normalization ---
+                input_ = input_.float()
+                if self._enable_normalize:
+                    input_ = (input_ - self.mean) / self.std
+
+                processed_inputs.append(input_)
+            # Combine padding and stack
+            inputs = stack_batch(processed_inputs, self.pad_size_divisor,
+                                 self.pad_value)
+
+        data_samples = data.get('data_samples', None)
+        sample_item = data_samples[0] if data_samples is not None else None
+        if 'gt_label' in sample_item:
+            gt_labels = [sample.gt_label for sample in data_samples]
+            gt_labels_tensor = [gt_label.label for gt_label in gt_labels]
+            batch_label, label_indices = cat_batch_labels(gt_labels_tensor)
+            batch_label = batch_label.to(self.device)
+
+            batch_score = stack_batch_scores(gt_labels, device=self.device)
+            if batch_score is None and self.to_onehot:
+                assert batch_label is not None, \
+                    'Cannot generate onehot format labels because no labels.'
+                num_classes = self.num_classes or data_samples[0].get(
+                    'num_classes')
+                assert num_classes is not None, \
+                    'Cannot generate one-hot format labels because not set ' \
+                    '`num_classes` in `data_preprocessor`.'
+                batch_score = batch_label_to_onehot(batch_label, label_indices,
+                                                    num_classes)
+
+            # ----- Batch Augmentations ----
+            if training and self.batch_augments is not None:
+                inputs, batch_score = self.batch_augments(inputs, batch_score)
+
+            # ----- scatter labels and scores to data samples ---
+            if batch_label is not None:
+                for sample, label in zip(
+                        data_samples, tensor_split(batch_label,
+                                                   label_indices)):
+                    sample.set_gt_label(label)
+            if batch_score is not None:
+                for sample, score in zip(data_samples, batch_score):
+                    sample.set_gt_score(score)
+
+        return {'inputs': inputs, 'data_samples': data_samples}
diff --git a/mmde/mmdet/models/data_preprocessors/track_data_preprocessor.py b/mmde/mmdet/models/data_preprocessors/track_data_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..40a65b8eaebacdaddd574768fbb00e8c5a072d85
--- /dev/null
+++ b/mmde/mmdet/models/data_preprocessors/track_data_preprocessor.py
@@ -0,0 +1,266 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Sequence, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from mmengine.model.utils import stack_batch
+
+from mmdet.models.utils.misc import samplelist_boxtype2tensor
+from mmdet.registry import MODELS
+from mmdet.structures import TrackDataSample
+from mmdet.structures.mask import BitmapMasks
+from .data_preprocessor import DetDataPreprocessor
+
+
+@MODELS.register_module()
+class TrackDataPreprocessor(DetDataPreprocessor):
+    """Image pre-processor for tracking tasks.
+
+        Accepts the data sampled by the dataloader, and preprocesses
+        it into the format of the model input. ``TrackDataPreprocessor``
+        provides the tracking data pre-processing as follows:
+
+        - Collate and move data to the target device.
+        - Pad inputs to the maximum size of current batch with defined
+          ``pad_value``. The padding size can be divisible by a defined
+          ``pad_size_divisor``
+        - Stack inputs to inputs.
+        - Convert inputs from bgr to rgb if the shape of input is (1, 3, H, W).
+        - Normalize image with defined std and mean.
+        - Do batch augmentations during training.
+        - Record the information of ``batch_input_shape`` and ``pad_shape``.
+
+        Args:
+            mean (Sequence[Number], optional): The pixel mean of R, G, B
+                channels. Defaults to None.
+            std (Sequence[Number], optional): The pixel standard deviation of
+                R, G, B channels. Defaults to None.
+            pad_size_divisor (int): The size of padded image should be
+                divisible by ``pad_size_divisor``. Defaults to 1.
+            pad_value (Number): The padded pixel value. Defaults to 0.
+            pad_mask (bool): Whether to pad instance masks. Defaults to False.
+            mask_pad_value (int): The padded pixel value for instance masks.
+                Defaults to 0.
+            bgr_to_rgb (bool): whether to convert image from BGR to RGB.
+                Defaults to False.
+            rgb_to_bgr (bool): whether to convert image from RGB to RGB.
+                Defaults to False.
+            use_det_processor: (bool): whether to use DetDataPreprocessor
+                in training phrase. This is mainly for some tracking models
+                fed into one image rather than a group of image in training.
+                Defaults to False.
+    .       boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
+                bboxes data to ``Tensor`` type. Defaults to True.
+            batch_augments (list[dict], optional): Batch-level augmentations
+    """
+
+    def __init__(self,
+                 mean: Optional[Sequence[Union[float, int]]] = None,
+                 std: Optional[Sequence[Union[float, int]]] = None,
+                 use_det_processor: bool = False,
+                 **kwargs):
+        super().__init__(mean=mean, std=std, **kwargs)
+        self.use_det_processor = use_det_processor
+        if mean is not None and not self.use_det_processor:
+            # overwrite the ``register_bufffer`` in ``ImgDataPreprocessor``
+            # since the shape of ``mean`` and ``std`` in tracking tasks must be
+            # (T, C, H, W), which T is the temporal length of the video.
+            self.register_buffer('mean',
+                                 torch.tensor(mean).view(1, -1, 1, 1), False)
+            self.register_buffer('std',
+                                 torch.tensor(std).view(1, -1, 1, 1), False)
+
+    def forward(self, data: dict, training: bool = False) -> Dict:
+        """Perform normalization,padding and bgr2rgb conversion based on
+        ``TrackDataPreprocessor``.
+
+        Args:
+            data (dict): data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+
+        Returns:
+            Tuple[Dict[str, List[torch.Tensor]], OptSampleList]: Data in the
+            same format as the model input.
+        """
+        if self.use_det_processor and training:
+            batch_pad_shape = self._get_pad_shape(data)
+        else:
+            batch_pad_shape = self._get_track_pad_shape(data)
+
+        data = self.cast_data(data)
+        imgs, data_samples = data['inputs'], data['data_samples']
+
+        if self.use_det_processor and training:
+            assert imgs[0].dim() == 3, \
+                'Only support the 3 dims when use detpreprocessor in training'
+            if self._channel_conversion:
+                imgs = [_img[[2, 1, 0], ...] for _img in imgs]
+            # Convert to `float`
+            imgs = [_img.float() for _img in imgs]
+            if self._enable_normalize:
+                imgs = [(_img - self.mean) / self.std for _img in imgs]
+            inputs = stack_batch(imgs, self.pad_size_divisor, self.pad_value)
+        else:
+            assert imgs[0].dim() == 4, \
+                'Only support the 4 dims when use trackprocessor in training'
+            # The shape of imgs[0] is (T, C, H, W).
+            channel = imgs[0].size(1)
+            if self._channel_conversion and channel == 3:
+                imgs = [_img[:, [2, 1, 0], ...] for _img in imgs]
+            # change to `float`
+            imgs = [_img.float() for _img in imgs]
+            if self._enable_normalize:
+                imgs = [(_img - self.mean) / self.std for _img in imgs]
+            inputs = stack_track_batch(imgs, self.pad_size_divisor,
+                                       self.pad_value)
+
+        if data_samples is not None:
+            # NOTE the batched image size information may be useful, e.g.
+            # in DETR, this is needed for the construction of masks, which is
+            # then used for the transformer_head.
+            batch_input_shape = tuple(inputs.size()[-2:])
+            if self.use_det_processor and training:
+                for data_sample, pad_shape in zip(data_samples,
+                                                  batch_pad_shape):
+                    data_sample.set_metainfo({
+                        'batch_input_shape': batch_input_shape,
+                        'pad_shape': pad_shape
+                    })
+                if self.boxtype2tensor:
+                    samplelist_boxtype2tensor(data_samples)
+                if self.pad_mask:
+                    self.pad_gt_masks(data_samples)
+            else:
+                for track_data_sample, pad_shapes in zip(
+                        data_samples, batch_pad_shape):
+                    for i in range(len(track_data_sample)):
+                        det_data_sample = track_data_sample[i]
+                        det_data_sample.set_metainfo({
+                            'batch_input_shape': batch_input_shape,
+                            'pad_shape': pad_shapes[i]
+                        })
+                if self.pad_mask and training:
+                    self.pad_track_gt_masks(data_samples)
+
+        if training and self.batch_augments is not None:
+            for batch_aug in self.batch_augments:
+                if self.use_det_processor and training:
+                    inputs, data_samples = batch_aug(inputs, data_samples)
+                else:
+                    # we only support T==1 when using batch augments.
+                    # Only yolox need batch_aug, and yolox can only process
+                    # (N, C, H, W) shape.
+                    # The shape of `inputs` is (N, T, C, H, W), hence, we use
+                    # inputs[:, 0] to change the shape to (N, C, H, W).
+                    assert inputs.size(1) == 1 and len(
+                        data_samples[0]
+                    ) == 1, 'Only support the number of sequence images equals to 1 when using batch augment.'  # noqa: E501
+                    det_data_samples = [
+                        track_data_sample[0]
+                        for track_data_sample in data_samples
+                    ]
+                    aug_inputs, aug_det_samples = batch_aug(
+                        inputs[:, 0], det_data_samples)
+                    inputs = aug_inputs.unsqueeze(1)
+                    for track_data_sample, det_sample in zip(
+                            data_samples, aug_det_samples):
+                        track_data_sample.video_data_samples = [det_sample]
+
+        # Note: inputs may contain large number of frames, so we must make
+        # sure that the mmeory is contiguous for stable forward
+        inputs = inputs.contiguous()
+
+        return dict(inputs=inputs, data_samples=data_samples)
+
+    def _get_track_pad_shape(self, data: dict) -> Dict[str, List]:
+        """Get the pad_shape of each image based on data and pad_size_divisor.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+
+        Returns:
+            Dict[str, List]: The shape of padding.
+        """
+        batch_pad_shape = dict()
+        batch_pad_shape = []
+        for imgs in data['inputs']:
+            # The sequence images in one sample among a batch have the same
+            # original shape
+            pad_h = int(np.ceil(imgs.shape[-2] /
+                                self.pad_size_divisor)) * self.pad_size_divisor
+            pad_w = int(np.ceil(imgs.shape[-1] /
+                                self.pad_size_divisor)) * self.pad_size_divisor
+            pad_shapes = [(pad_h, pad_w)] * imgs.size(0)
+            batch_pad_shape.append(pad_shapes)
+        return batch_pad_shape
+
+    def pad_track_gt_masks(self,
+                           data_samples: Sequence[TrackDataSample]) -> None:
+        """Pad gt_masks to shape of batch_input_shape."""
+        if 'masks' in data_samples[0][0].get('gt_instances', None):
+            for track_data_sample in data_samples:
+                for i in range(len(track_data_sample)):
+                    det_data_sample = track_data_sample[i]
+                    masks = det_data_sample.gt_instances.masks
+                    # TODO: whether to use BitmapMasks
+                    assert isinstance(masks, BitmapMasks)
+                    batch_input_shape = det_data_sample.batch_input_shape
+                    det_data_sample.gt_instances.masks = masks.pad(
+                        batch_input_shape, pad_val=self.mask_pad_value)
+
+
+def stack_track_batch(tensors: List[torch.Tensor],
+                      pad_size_divisor: int = 0,
+                      pad_value: Union[int, float] = 0) -> torch.Tensor:
+    """Stack multiple tensors to form a batch and pad the images to the max
+    shape use the right bottom padding mode in these images. If
+    ``pad_size_divisor > 0``, add padding to ensure the common height and width
+    is divisible by ``pad_size_divisor``. The difference between this function
+    and ``stack_batch`` in MMEngine is that this function can process batch
+    sequence images with shape (N, T, C, H, W).
+
+    Args:
+        tensors (List[Tensor]): The input multiple tensors. each is a
+            TCHW 4D-tensor. T denotes the number of key/reference frames.
+        pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding
+            to ensure the common height and width is divisible by
+            ``pad_size_divisor``. This depends on the model, and many
+            models need a divisibility of 32. Defaults to 0
+        pad_value (int, float): The padding value. Defaults to 0
+
+    Returns:
+       Tensor: The NTCHW 5D-tensor. N denotes the batch size.
+    """
+    assert isinstance(tensors, list), \
+        f'Expected input type to be list, but got {type(tensors)}'
+    assert len(set([tensor.ndim for tensor in tensors])) == 1, \
+        f'Expected the dimensions of all tensors must be the same, ' \
+        f'but got {[tensor.ndim for tensor in tensors]}'
+    assert tensors[0].ndim == 4, f'Expected tensor dimension to be 4, ' \
+                                 f'but got {tensors[0].ndim}'
+    assert len(set([tensor.shape[0] for tensor in tensors])) == 1, \
+        f'Expected the channels of all tensors must be the same, ' \
+        f'but got {[tensor.shape[0] for tensor in tensors]}'
+
+    tensor_sizes = [(tensor.shape[-2], tensor.shape[-1]) for tensor in tensors]
+    max_size = np.stack(tensor_sizes).max(0)
+
+    if pad_size_divisor > 1:
+        # the last two dims are H,W, both subject to divisibility requirement
+        max_size = (
+            max_size +
+            (pad_size_divisor - 1)) // pad_size_divisor * pad_size_divisor
+
+    padded_samples = []
+    for tensor in tensors:
+        padding_size = [
+            0, max_size[-1] - tensor.shape[-1], 0,
+            max_size[-2] - tensor.shape[-2]
+        ]
+        if sum(padding_size) == 0:
+            padded_samples.append(tensor)
+        else:
+            padded_samples.append(F.pad(tensor, padding_size, value=pad_value))
+
+    return torch.stack(padded_samples, dim=0)
diff --git a/mmde/mmdet/models/dense_heads/__init__.py b/mmde/mmdet/models/dense_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9b55ec2a4230a741e9a2c696ec434bf9cc8bafa
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/__init__.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .anchor_free_head import AnchorFreeHead
+from .anchor_head import AnchorHead
+from .atss_head import ATSSHead
+from .atss_vlfusion_head import ATSSVLFusionHead
+from .autoassign_head import AutoAssignHead
+from .boxinst_head import BoxInstBboxHead, BoxInstMaskHead
+from .cascade_rpn_head import CascadeRPNHead, StageCascadeRPNHead
+from .centernet_head import CenterNetHead
+from .centernet_update_head import CenterNetUpdateHead
+from .centripetal_head import CentripetalHead
+from .condinst_head import CondInstBboxHead, CondInstMaskHead
+from .conditional_detr_head import ConditionalDETRHead
+from .corner_head import CornerHead
+from .dab_detr_head import DABDETRHead
+from .ddod_head import DDODHead
+from .ddq_detr_head import DDQDETRHead
+from .deformable_detr_head import DeformableDETRHead
+from .detr_head import DETRHead
+from .dino_head import DINOHead
+from .embedding_rpn_head import EmbeddingRPNHead
+from .fcos_head import FCOSHead
+from .fovea_head import FoveaHead
+from .free_anchor_retina_head import FreeAnchorRetinaHead
+from .fsaf_head import FSAFHead
+from .ga_retina_head import GARetinaHead
+from .ga_rpn_head import GARPNHead
+from .gfl_head import GFLHead
+from .grounding_dino_head import GroundingDINOHead
+from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead
+from .lad_head import LADHead
+from .ld_head import LDHead
+from .mask2former_head import Mask2FormerHead
+from .maskformer_head import MaskFormerHead
+from .nasfcos_head import NASFCOSHead
+from .paa_head import PAAHead
+from .pisa_retinanet_head import PISARetinaHead
+from .pisa_ssd_head import PISASSDHead
+from .reppoints_head import RepPointsHead
+from .retina_head import RetinaHead
+from .retina_sepbn_head import RetinaSepBNHead
+from .rpn_head import RPNHead
+from .rtmdet_head import RTMDetHead, RTMDetSepBNHead
+from .rtmdet_ins_head import RTMDetInsHead, RTMDetInsSepBNHead
+from .sabl_retina_head import SABLRetinaHead
+from .solo_head import DecoupledSOLOHead, DecoupledSOLOLightHead, SOLOHead
+from .solov2_head import SOLOV2Head
+from .ssd_head import SSDHead
+from .tood_head import TOODHead
+from .vfnet_head import VFNetHead
+from .yolact_head import YOLACTHead, YOLACTProtonet
+from .yolo_head import YOLOV3Head
+from .yolof_head import YOLOFHead
+from .yolox_head import YOLOXHead
+
+__all__ = [
+    'AnchorFreeHead', 'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption',
+    'RPNHead', 'GARPNHead', 'RetinaHead', 'RetinaSepBNHead', 'GARetinaHead',
+    'SSDHead', 'FCOSHead', 'RepPointsHead', 'FoveaHead',
+    'FreeAnchorRetinaHead', 'ATSSHead', 'FSAFHead', 'NASFCOSHead',
+    'PISARetinaHead', 'PISASSDHead', 'GFLHead', 'CornerHead', 'YOLACTHead',
+    'YOLACTProtonet', 'YOLOV3Head', 'PAAHead', 'SABLRetinaHead',
+    'CentripetalHead', 'VFNetHead', 'StageCascadeRPNHead', 'CascadeRPNHead',
+    'EmbeddingRPNHead', 'LDHead', 'AutoAssignHead', 'DETRHead', 'YOLOFHead',
+    'DeformableDETRHead', 'CenterNetHead', 'YOLOXHead', 'SOLOHead',
+    'DecoupledSOLOHead', 'DecoupledSOLOLightHead', 'SOLOV2Head', 'LADHead',
+    'TOODHead', 'MaskFormerHead', 'Mask2FormerHead', 'DDODHead',
+    'CenterNetUpdateHead', 'RTMDetHead', 'RTMDetSepBNHead', 'CondInstBboxHead',
+    'CondInstMaskHead', 'RTMDetInsHead', 'RTMDetInsSepBNHead',
+    'BoxInstBboxHead', 'BoxInstMaskHead', 'ConditionalDETRHead', 'DINOHead',
+    'ATSSVLFusionHead', 'DABDETRHead', 'DDQDETRHead', 'GroundingDINOHead'
+]
diff --git a/mmde/mmdet/models/dense_heads/anchor_free_head.py b/mmde/mmdet/models/dense_heads/anchor_free_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..90a9b3625b8fef12a2ee3a964c89597b597cb2ec
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/anchor_free_head.py
@@ -0,0 +1,317 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import abstractmethod
+from typing import Any, List, Sequence, Tuple, Union
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from numpy import ndarray
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
+                         OptInstanceList)
+from ..task_modules.prior_generators import MlvlPointGenerator
+from ..utils import multi_apply
+from .base_dense_head import BaseDenseHead
+
+StrideType = Union[Sequence[int], Sequence[Tuple[int, int]]]
+
+
+@MODELS.register_module()
+class AnchorFreeHead(BaseDenseHead):
+    """Anchor-free head (FCOS, Fovea, RepPoints, etc.).
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels. Used in child classes.
+        stacked_convs (int): Number of stacking convs of the head.
+        strides (Sequence[int] or Sequence[Tuple[int, int]]): Downsample
+            factor of each feature map.
+        dcn_on_last_conv (bool): If true, use dcn in the last layer of
+            towers. Defaults to False.
+        conv_bias (bool or str): If specified as `auto`, it will be decided by
+            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
+            None, otherwise False. Default: "auto".
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+        bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. Defaults
+            'DistancePointBBoxCoder'.
+        conv_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
+            normalization layer. Defaults to None.
+        train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of
+            anchor-free head.
+        test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of
+            anchor-free head.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+    """  # noqa: W605
+
+    _version = 1
+
+    def __init__(
+        self,
+        num_classes: int,
+        in_channels: int,
+        feat_channels: int = 256,
+        stacked_convs: int = 4,
+        strides: StrideType = (4, 8, 16, 32, 64),
+        dcn_on_last_conv: bool = False,
+        conv_bias: Union[bool, str] = 'auto',
+        loss_cls: ConfigType = dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox: ConfigType = dict(type='IoULoss', loss_weight=1.0),
+        bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'),
+        conv_cfg: OptConfigType = None,
+        norm_cfg: OptConfigType = None,
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        init_cfg: MultiConfig = dict(
+            type='Normal',
+            layer='Conv2d',
+            std=0.01,
+            override=dict(
+                type='Normal', name='conv_cls', std=0.01, bias_prob=0.01))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.strides = strides
+        self.dcn_on_last_conv = dcn_on_last_conv
+        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
+        self.conv_bias = conv_bias
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+
+        self.prior_generator = MlvlPointGenerator(strides)
+
+        # In order to keep a more general interface and be consistent with
+        # anchor_head. We can think of point like one anchor
+        self.num_base_priors = self.prior_generator.num_base_priors[0]
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.fp16_enabled = False
+
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self._init_cls_convs()
+        self._init_reg_convs()
+        self._init_predictor()
+
+    def _init_cls_convs(self) -> None:
+        """Initialize classification conv layers of the head."""
+        self.cls_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
+                conv_cfg = dict(type='DCNv2')
+            else:
+                conv_cfg = self.conv_cfg
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.conv_bias))
+
+    def _init_reg_convs(self) -> None:
+        """Initialize bbox regression conv layers of the head."""
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
+                conv_cfg = dict(type='DCNv2')
+            else:
+                conv_cfg = self.conv_cfg
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.conv_bias))
+
+    def _init_predictor(self) -> None:
+        """Initialize predictor layers of the head."""
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+        self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+
+    def _load_from_state_dict(self, state_dict: dict, prefix: str,
+                              local_metadata: dict, strict: bool,
+                              missing_keys: Union[List[str], str],
+                              unexpected_keys: Union[List[str], str],
+                              error_msgs: Union[List[str], str]) -> None:
+        """Hack some keys of the model state dict so that can load checkpoints
+        of previous version."""
+        version = local_metadata.get('version', None)
+        if version is None:
+            # the key is different in early versions
+            # for example, 'fcos_cls' become 'conv_cls' now
+            bbox_head_keys = [
+                k for k in state_dict.keys() if k.startswith(prefix)
+            ]
+            ori_predictor_keys = []
+            new_predictor_keys = []
+            # e.g. 'fcos_cls' or 'fcos_reg'
+            for key in bbox_head_keys:
+                ori_predictor_keys.append(key)
+                key = key.split('.')
+                if len(key) < 2:
+                    conv_name = None
+                elif key[1].endswith('cls'):
+                    conv_name = 'conv_cls'
+                elif key[1].endswith('reg'):
+                    conv_name = 'conv_reg'
+                elif key[1].endswith('centerness'):
+                    conv_name = 'conv_centerness'
+                else:
+                    conv_name = None
+                if conv_name is not None:
+                    key[1] = conv_name
+                    new_predictor_keys.append('.'.join(key))
+                else:
+                    ori_predictor_keys.pop(-1)
+            for i in range(len(new_predictor_keys)):
+                state_dict[new_predictor_keys[i]] = state_dict.pop(
+                    ori_predictor_keys[i])
+        super()._load_from_state_dict(state_dict, prefix, local_metadata,
+                                      strict, missing_keys, unexpected_keys,
+                                      error_msgs)
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually contain classification scores and bbox predictions.
+
+            - cls_scores (list[Tensor]): Box scores for each scale level, \
+            each is a 4D-tensor, the channel number is \
+            num_points * num_classes.
+            - bbox_preds (list[Tensor]): Box energies / deltas for each scale \
+            level, each is a 4D-tensor, the channel number is num_points * 4.
+        """
+        return multi_apply(self.forward_single, x)[:2]
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+
+        Returns:
+            tuple: Scores for each class, bbox predictions, features
+            after classification and regression conv layers, some
+            models needs these features like FCOS.
+        """
+        cls_feat = x
+        reg_feat = x
+
+        for cls_layer in self.cls_convs:
+            cls_feat = cls_layer(cls_feat)
+        cls_score = self.conv_cls(cls_feat)
+
+        for reg_layer in self.reg_convs:
+            reg_feat = reg_layer(reg_feat)
+        bbox_pred = self.conv_reg(reg_feat)
+        return cls_score, bbox_pred, cls_feat, reg_feat
+
+    @abstractmethod
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * 4.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+        """
+
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_targets(self, points: List[Tensor],
+                    batch_gt_instances: InstanceList) -> Any:
+        """Compute regression, classification and centerness targets for points
+        in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+        """
+        raise NotImplementedError
+
+    # TODO refactor aug_test
+    def aug_test(self,
+                 aug_batch_feats: List[Tensor],
+                 aug_batch_img_metas: List[List[Tensor]],
+                 rescale: bool = False) -> List[ndarray]:
+        """Test function with test time augmentation.
+
+        Args:
+            aug_batch_feats (list[Tensor]): the outer list indicates test-time
+                augmentations and inner Tensor should have a shape NxCxHxW,
+                which contains features for all images in the batch.
+            aug_batch_img_metas (list[list[dict]]): the outer list indicates
+                test-time augs (multiscale, flip, etc.) and the inner list
+                indicates images in a batch. each dict has image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[ndarray]: bbox results of each class
+        """
+        return self.aug_test_bboxes(
+            aug_batch_feats, aug_batch_img_metas, rescale=rescale)
diff --git a/mmde/mmdet/models/dense_heads/anchor_head.py b/mmde/mmdet/models/dense_heads/anchor_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..4578caca818550397875a0df34c128f461e6ec75
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/anchor_head.py
@@ -0,0 +1,530 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, cat_boxes, get_box_tensor
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList, OptMultiConfig)
+from ..task_modules.prior_generators import (AnchorGenerator,
+                                             anchor_inside_flags)
+from ..task_modules.samplers import PseudoSampler
+from ..utils import images_to_levels, multi_apply, unmap
+from .base_dense_head import BaseDenseHead
+
+
+@MODELS.register_module()
+class AnchorHead(BaseDenseHead):
+    """Anchor-based head (RPN, RetinaNet, SSD, etc.).
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels. Used in child classes.
+        anchor_generator (dict): Config dict for anchor generator
+        bbox_coder (dict): Config of bounding box coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Default False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of localization loss.
+        train_cfg (dict): Training config of anchor head.
+        test_cfg (dict): Testing config of anchor head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """  # noqa: W605
+
+    def __init__(
+        self,
+        num_classes: int,
+        in_channels: int,
+        feat_channels: int = 256,
+        anchor_generator: ConfigType = dict(
+            type='AnchorGenerator',
+            scales=[8, 16, 32],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder: ConfigType = dict(
+            type='DeltaXYWHBBoxCoder',
+            clip_border=True,
+            target_means=(.0, .0, .0, .0),
+            target_stds=(1.0, 1.0, 1.0, 1.0)),
+        reg_decoded_bbox: bool = False,
+        loss_cls: ConfigType = dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox: ConfigType = dict(
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        init_cfg: OptMultiConfig = dict(
+            type='Normal', layer='Conv2d', std=0.01)
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.feat_channels = feat_channels
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1
+
+        if self.cls_out_channels <= 0:
+            raise ValueError(f'num_classes={num_classes} is too small')
+        self.reg_decoded_bbox = reg_decoded_bbox
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            if train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler(context=self)
+
+        self.fp16_enabled = False
+
+        self.prior_generator = TASK_UTILS.build(anchor_generator)
+
+        # Usually the numbers of anchors for each level are the same
+        # except SSD detectors. So it is an int in the most dense
+        # heads but a list of int in SSDHead
+        self.num_base_priors = self.prior_generator.num_base_priors[0]
+        self._init_layers()
+
+    @property
+    def num_anchors(self) -> int:
+        warnings.warn('DeprecationWarning: `num_anchors` is deprecated, '
+                      'for consistency or also use '
+                      '`num_base_priors` instead')
+        return self.prior_generator.num_base_priors[0]
+
+    @property
+    def anchor_generator(self) -> AnchorGenerator:
+        warnings.warn('DeprecationWarning: anchor_generator is deprecated, '
+                      'please use "prior_generator" instead')
+        return self.prior_generator
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.conv_cls = nn.Conv2d(self.in_channels,
+                                  self.num_base_priors * self.cls_out_channels,
+                                  1)
+        reg_dim = self.bbox_coder.encode_size
+        self.conv_reg = nn.Conv2d(self.in_channels,
+                                  self.num_base_priors * reg_dim, 1)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level \
+                    the channels number is num_base_priors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single scale \
+                    level, the channels number is num_base_priors * 4.
+        """
+        cls_score = self.conv_cls(x)
+        bbox_pred = self.conv_reg(x)
+        return cls_score, bbox_pred
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores and bbox prediction.
+
+                - cls_scores (list[Tensor]): Classification scores for all \
+                    scale levels, each is a 4D-tensor, the channels number \
+                    is num_base_priors * num_classes.
+                - bbox_preds (list[Tensor]): Box energies / deltas for all \
+                    scale levels, each is a 4D-tensor, the channels number \
+                    is num_base_priors * 4.
+        """
+        return multi_apply(self.forward_single, x)
+
+    def get_anchors(self,
+                    featmap_sizes: List[tuple],
+                    batch_img_metas: List[dict],
+                    device: Union[torch.device, str] = 'cuda') \
+            -> Tuple[List[List[Tensor]], List[List[Tensor]]]:
+        """Get anchors according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            batch_img_metas (list[dict]): Image meta info.
+            device (torch.device | str): Device for returned tensors.
+                Defaults to cuda.
+
+        Returns:
+            tuple:
+
+                - anchor_list (list[list[Tensor]]): Anchors of each image.
+                - valid_flag_list (list[list[Tensor]]): Valid flags of each
+                  image.
+        """
+        num_imgs = len(batch_img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # anchors for one time
+        multi_level_anchors = self.prior_generator.grid_priors(
+            featmap_sizes, device=device)
+        anchor_list = [multi_level_anchors for _ in range(num_imgs)]
+
+        # for each image, we compute valid flags of multi level anchors
+        valid_flag_list = []
+        for img_id, img_meta in enumerate(batch_img_metas):
+            multi_level_flags = self.prior_generator.valid_flags(
+                featmap_sizes, img_meta['pad_shape'], device)
+            valid_flag_list.append(multi_level_flags)
+
+        return anchor_list, valid_flag_list
+
+    def _get_targets_single(self,
+                            flat_anchors: Union[Tensor, BaseBoxes],
+                            valid_flags: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Args:
+            flat_anchors (Tensor or :obj:`BaseBoxes`): Multi-level anchors
+                of the image, which are concatenated into a single tensor
+                or box type of shape (num_anchors, 4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                    shape (num_anchors, ).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.  Defaults to True.
+
+        Returns:
+            tuple:
+
+                - labels (Tensor): Labels of each level.
+                - label_weights (Tensor): Label weights of each level.
+                - bbox_targets (Tensor): BBox targets of each level.
+                - bbox_weights (Tensor): BBox weights of each level.
+                - pos_inds (Tensor): positive samples indexes.
+                - neg_inds (Tensor): negative samples indexes.
+                - sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg['allowed_border'])
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags]
+
+        pred_instances = InstanceData(priors=anchors)
+        assign_result = self.assigner.assign(pred_instances, gt_instances,
+                                             gt_instances_ignore)
+        # No sampling is required except for RPN and
+        # Guided Anchoring algorithms
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_anchors = anchors.shape[0]
+        target_dim = gt_instances.bboxes.size(-1) if self.reg_decoded_bbox \
+            else self.bbox_coder.encode_size
+        bbox_targets = anchors.new_zeros(num_valid_anchors, target_dim)
+        bbox_weights = anchors.new_zeros(num_valid_anchors, target_dim)
+
+        # TODO: Considering saving memory, is it necessary to be long?
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        # `bbox_coder.encode` accepts tensor or box type inputs and generates
+        # tensor targets. If regressing decoded boxes, the code will convert
+        # box type `pos_bbox_targets` to tensor.
+        if len(pos_inds) > 0:
+            if not self.reg_decoded_bbox:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_priors, sampling_result.pos_gt_bboxes)
+            else:
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+                pos_bbox_targets = get_box_tensor(pos_bbox_targets)
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags,
+                fill=self.num_classes)  # fill bg label
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds, sampling_result)
+
+    def get_targets(self,
+                    anchor_list: List[List[Tensor]],
+                    valid_flag_list: List[List[Tensor]],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs: bool = True,
+                    return_sampling_results: bool = False) -> tuple:
+        """Compute regression and classification targets for anchors in
+        multiple images.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+            return_sampling_results (bool): Whether to return the sampling
+                results. Defaults to False.
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_targets_list (list[Tensor]): BBox targets of each level.
+                - bbox_weights_list (list[Tensor]): BBox weights of each level.
+                - avg_factor (int): Average factor that is used to average
+                  the loss. When using sampling method, avg_factor is usually
+                  the sum of positive and negative priors. When using
+                  `PseudoSampler`, `avg_factor` is usually equal to the number
+                  of positive priors.
+
+            additional_returns: This function enables user-defined returns from
+                `self._get_targets_single`. These returns are currently refined
+                to properties at each feature map (i.e. having HxW dimension).
+                The results will be concatenated after the end
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors to a single tensor
+        concat_anchor_list = []
+        concat_valid_flag_list = []
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            concat_anchor_list.append(cat_boxes(anchor_list[i]))
+            concat_valid_flag_list.append(torch.cat(valid_flag_list[i]))
+
+        # compute targets for each image
+        results = multi_apply(
+            self._get_targets_single,
+            concat_anchor_list,
+            concat_valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore,
+            unmap_outputs=unmap_outputs)
+        (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights,
+         pos_inds_list, neg_inds_list, sampling_results_list) = results[:7]
+        rest_results = list(results[7:])  # user-added return values
+        # Get `avg_factor` of all images, which calculate in `SamplingResult`.
+        # When using sampling method, avg_factor is usually the sum of
+        # positive and negative priors. When using `PseudoSampler`,
+        # `avg_factor` is usually equal to the number of positive priors.
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        # update `_raw_positive_infos`, which will be used when calling
+        # `get_positive_infos`.
+        self._raw_positive_infos.update(sampling_results=sampling_results_list)
+        # split targets to a list w.r.t. multiple levels
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors)
+        res = (labels_list, label_weights_list, bbox_targets_list,
+               bbox_weights_list, avg_factor)
+        if return_sampling_results:
+            res = res + (sampling_results_list, )
+        for i, r in enumerate(rest_results):  # user-added return values
+            rest_results[i] = images_to_levels(r, num_level_anchors)
+
+        return res + tuple(rest_results)
+
+    def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                            anchors: Tensor, labels: Tensor,
+                            label_weights: Tensor, bbox_targets: Tensor,
+                            bbox_weights: Tensor, avg_factor: int) -> tuple:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            cls_score (Tensor): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W).
+            bbox_pred (Tensor): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors)
+            bbox_targets (Tensor): BBox regression targets of each anchor
+                weight shape (N, num_total_anchors, 4).
+            bbox_weights (Tensor): BBox regression loss weights of each anchor
+                with shape (N, num_total_anchors, 4).
+            avg_factor (int): Average factor that is used to average the loss.
+
+        Returns:
+            tuple: loss components.
+        """
+        # classification loss
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        cls_score = cls_score.permute(0, 2, 3,
+                                      1).reshape(-1, self.cls_out_channels)
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=avg_factor)
+        # regression loss
+        target_dim = bbox_targets.size(-1)
+        bbox_targets = bbox_targets.reshape(-1, target_dim)
+        bbox_weights = bbox_weights.reshape(-1, target_dim)
+        bbox_pred = bbox_pred.permute(0, 2, 3,
+                                      1).reshape(-1,
+                                                 self.bbox_coder.encode_size)
+        if self.reg_decoded_bbox:
+            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+            # is applied directly on the decoded bounding boxes, it
+            # decodes the already encoded coordinates to absolute format.
+            anchors = anchors.reshape(-1, anchors.size(-1))
+            bbox_pred = self.bbox_coder.decode(anchors, bbox_pred)
+            bbox_pred = get_box_tensor(bbox_pred)
+        loss_bbox = self.loss_bbox(
+            bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor)
+        return loss_cls, loss_bbox
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         avg_factor) = cls_reg_targets
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors and flags to a single tensor
+        concat_anchor_list = []
+        for i in range(len(anchor_list)):
+            concat_anchor_list.append(cat_boxes(anchor_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_by_feat_single,
+            cls_scores,
+            bbox_preds,
+            all_anchor_list,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            avg_factor=avg_factor)
+        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
diff --git a/mmde/mmdet/models/dense_heads/atss_head.py b/mmde/mmdet/models/dense_heads/atss_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ce71b3eff5e0ed624ec7ae16e8db80c90e8ffa1
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/atss_head.py
@@ -0,0 +1,524 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
+                         OptInstanceList, reduce_mean)
+from ..task_modules.prior_generators import anchor_inside_flags
+from ..utils import images_to_levels, multi_apply, unmap
+from .anchor_head import AnchorHead
+
+
+@MODELS.register_module()
+class ATSSHead(AnchorHead):
+    """Detection Head of `ATSS <https://arxiv.org/abs/1912.02424>`_.
+
+    ATSS head structure is similar with FCOS, however ATSS use anchor boxes
+    and assign label by Adaptive Training Sample Selection instead max-iou.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        pred_kernel_size (int): Kernel size of ``nn.Conv2d``
+        stacked_convs (int): Number of stacking convs of the head.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to ``dict(type='GN', num_groups=32,
+            requires_grad=True)``.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Defaults to False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        loss_centerness (:obj:`ConfigDict` or dict): Config of centerness loss.
+            Defaults to ``dict(type='CrossEntropyLoss', use_sigmoid=True,
+            loss_weight=1.0)``.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`]): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 pred_kernel_size: int = 3,
+                 stacked_convs: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(
+                     type='GN', num_groups=32, requires_grad=True),
+                 reg_decoded_bbox: bool = True,
+                 loss_centerness: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 init_cfg: MultiConfig = dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='atss_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs) -> None:
+        self.pred_kernel_size = pred_kernel_size
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            reg_decoded_bbox=reg_decoded_bbox,
+            init_cfg=init_cfg,
+            **kwargs)
+
+        self.sampling = False
+        self.loss_centerness = MODELS.build(loss_centerness)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        pred_pad_size = self.pred_kernel_size // 2
+        self.atss_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_anchors * self.cls_out_channels,
+            self.pred_kernel_size,
+            padding=pred_pad_size)
+        self.atss_reg = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * 4,
+            self.pred_kernel_size,
+            padding=pred_pad_size)
+        self.atss_centerness = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * 1,
+            self.pred_kernel_size,
+            padding=pred_pad_size)
+        self.scales = nn.ModuleList(
+            [Scale(1.0) for _ in self.prior_generator.strides])
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of classification scores and bbox prediction
+                cls_scores (list[Tensor]): Classification scores for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_anchors * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_anchors * 4.
+        """
+        return multi_apply(self.forward_single, x, self.scales)
+
+    def forward_single(self, x: Tensor, scale: Scale) -> Sequence[Tensor]:
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level
+                    the channels number is num_anchors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single scale
+                    level, the channels number is num_anchors * 4.
+                centerness (Tensor): Centerness for a single scale level, the
+                    channel number is (N, num_anchors * 1, H, W).
+        """
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.atss_cls(cls_feat)
+        # we just follow atss, not apply exp in bbox_pred
+        bbox_pred = scale(self.atss_reg(reg_feat)).float()
+        centerness = self.atss_centerness(reg_feat)
+        return cls_score, bbox_pred, centerness
+
+    def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor,
+                            bbox_pred: Tensor, centerness: Tensor,
+                            labels: Tensor, label_weights: Tensor,
+                            bbox_targets: Tensor, avg_factor: float) -> dict:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            cls_score (Tensor): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W).
+            bbox_pred (Tensor): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors)
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (N, num_total_anchors, 4).
+            avg_factor (float): Average factor that is used to average
+                the loss. When using sampling method, avg_factor is usually
+                the sum of positive and negative priors. When using
+                `PseudoSampler`, `avg_factor` is usually equal to the number
+                of positive priors.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        anchors = anchors.reshape(-1, 4)
+        cls_score = cls_score.permute(0, 2, 3, 1).reshape(
+            -1, self.cls_out_channels).contiguous()
+        bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+        centerness = centerness.permute(0, 2, 3, 1).reshape(-1)
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+
+        # classification loss
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=avg_factor)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero().squeeze(1)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+            pos_anchors = anchors[pos_inds]
+            pos_centerness = centerness[pos_inds]
+
+            centerness_targets = self.centerness_target(
+                pos_anchors, pos_bbox_targets)
+            pos_decode_bbox_pred = self.bbox_coder.decode(
+                pos_anchors, pos_bbox_pred)
+
+            # regression loss
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_bbox_targets,
+                weight=centerness_targets,
+                avg_factor=1.0)
+
+            # centerness loss
+            loss_centerness = self.loss_centerness(
+                pos_centerness, centerness_targets, avg_factor=avg_factor)
+
+        else:
+            loss_bbox = bbox_pred.sum() * 0
+            loss_centerness = centerness.sum() * 0
+            centerness_targets = bbox_targets.new_tensor(0.)
+
+        return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum()
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            centernesses: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            centernesses (list[Tensor]): Centerness for each scale
+                level with shape (N, num_anchors * 1, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, avg_factor) = cls_reg_targets
+        avg_factor = reduce_mean(
+            torch.tensor(avg_factor, dtype=torch.float, device=device)).item()
+
+        losses_cls, losses_bbox, loss_centerness, \
+            bbox_avg_factor = multi_apply(
+                self.loss_by_feat_single,
+                anchor_list,
+                cls_scores,
+                bbox_preds,
+                centernesses,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                avg_factor=avg_factor)
+
+        bbox_avg_factor = sum(bbox_avg_factor)
+        bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item()
+        losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox))
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox=losses_bbox,
+            loss_centerness=loss_centerness)
+
+    def centerness_target(self, anchors: Tensor, gts: Tensor) -> Tensor:
+        """Calculate the centerness between anchors and gts.
+
+        Only calculate pos centerness targets, otherwise there may be nan.
+
+        Args:
+            anchors (Tensor): Anchors with shape (N, 4), "xyxy" format.
+            gts (Tensor): Ground truth bboxes with shape (N, 4), "xyxy" format.
+
+        Returns:
+            Tensor: Centerness between anchors and gts.
+        """
+        anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2
+        anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2
+        l_ = anchors_cx - gts[:, 0]
+        t_ = anchors_cy - gts[:, 1]
+        r_ = gts[:, 2] - anchors_cx
+        b_ = gts[:, 3] - anchors_cy
+
+        left_right = torch.stack([l_, r_], dim=1)
+        top_bottom = torch.stack([t_, b_], dim=1)
+        centerness = torch.sqrt(
+            (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) *
+            (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]))
+        assert not torch.isnan(centerness).any()
+        return centerness
+
+    def get_targets(self,
+                    anchor_list: List[List[Tensor]],
+                    valid_flag_list: List[List[Tensor]],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs: bool = True) -> tuple:
+        """Get targets for ATSS head.
+
+        This method is almost the same as `AnchorHead.get_targets()`. Besides
+        returning the targets as the parent method does, it also returns the
+        anchors as the first element of the returned tuple.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        num_level_anchors_list = [num_level_anchors] * num_imgs
+
+        # concat all level anchors and flags to a single tensor
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            anchor_list[i] = torch.cat(anchor_list[i])
+            valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+        (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+         all_bbox_weights, pos_inds_list, neg_inds_list,
+         sampling_results_list) = multi_apply(
+             self._get_targets_single,
+             anchor_list,
+             valid_flag_list,
+             num_level_anchors_list,
+             batch_gt_instances,
+             batch_img_metas,
+             batch_gt_instances_ignore,
+             unmap_outputs=unmap_outputs)
+        # Get `avg_factor` of all images, which calculate in `SamplingResult`.
+        # When using sampling method, avg_factor is usually the sum of
+        # positive and negative priors. When using `PseudoSampler`,
+        # `avg_factor` is usually equal to the number of positive priors.
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        # split targets to a list w.r.t. multiple levels
+        anchors_list = images_to_levels(all_anchors, num_level_anchors)
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors)
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, bbox_weights_list, avg_factor)
+
+    def _get_targets_single(self,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            num_level_anchors: List[int],
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors ,4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                    shape (num_anchors,).
+            num_level_anchors (List[int]): Number of anchors of each scale
+                level.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: N is the number of total anchors in the image.
+                labels (Tensor): Labels of all anchors in the image with shape
+                    (N,).
+                label_weights (Tensor): Label weights of all anchor in the
+                    image with shape (N,).
+                bbox_targets (Tensor): BBox targets of all anchors in the
+                    image with shape (N, 4).
+                bbox_weights (Tensor): BBox weights of all anchors in the
+                    image with shape (N, 4)
+                pos_inds (Tensor): Indices of positive anchor with shape
+                    (num_pos,).
+                neg_inds (Tensor): Indices of negative anchor with shape
+                    (num_neg,).
+                sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg['allowed_border'])
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+
+        num_level_anchors_inside = self.get_num_level_anchors_inside(
+            num_level_anchors, inside_flags)
+        pred_instances = InstanceData(priors=anchors)
+        assign_result = self.assigner.assign(pred_instances,
+                                             num_level_anchors_inside,
+                                             gt_instances, gt_instances_ignore)
+
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            if self.reg_decoded_bbox:
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+            else:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_priors, sampling_result.pos_gt_bboxes)
+
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            anchors = unmap(anchors, num_total_anchors, inside_flags)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (anchors, labels, label_weights, bbox_targets, bbox_weights,
+                pos_inds, neg_inds, sampling_result)
+
+    def get_num_level_anchors_inside(self, num_level_anchors, inside_flags):
+        """Get the number of valid anchors in every level."""
+
+        split_inside_flags = torch.split(inside_flags, num_level_anchors)
+        num_level_anchors_inside = [
+            int(flags.sum()) for flags in split_inside_flags
+        ]
+        return num_level_anchors_inside
diff --git a/mmde/mmdet/models/dense_heads/atss_vlfusion_head.py b/mmde/mmdet/models/dense_heads/atss_vlfusion_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5cd28b4a040ba447130aed07629f6312f95dcf3
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/atss_vlfusion_head.py
@@ -0,0 +1,949 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import math
+from typing import Callable, List, Optional, Sequence, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Scale
+from mmcv.ops.modulated_deform_conv import ModulatedDeformConv2d
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModel
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+try:
+    from transformers import BertConfig
+except ImportError:
+    BertConfig = None
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import cat_boxes
+from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
+from ..utils import (BertEncoderLayer, VLFuse, filter_scores_and_topk,
+                     permute_and_flatten, select_single_mlvl,
+                     unpack_gt_instances)
+from ..utils.vlfuse_helper import MAX_CLAMP_VALUE
+from .atss_head import ATSSHead
+
+
+def convert_grounding_to_cls_scores(logits: Tensor,
+                                    positive_maps: List[dict]) -> Tensor:
+    """Convert logits to class scores."""
+    assert len(positive_maps) == logits.shape[0]  # batch size
+
+    scores = torch.zeros(logits.shape[0], logits.shape[1],
+                         len(positive_maps[0])).to(logits.device)
+    if positive_maps is not None:
+        if all(x == positive_maps[0] for x in positive_maps):
+            # only need to compute once
+            positive_map = positive_maps[0]
+            for label_j in positive_map:
+                scores[:, :, label_j -
+                       1] = logits[:, :,
+                                   torch.LongTensor(positive_map[label_j]
+                                                    )].mean(-1)
+        else:
+            for i, positive_map in enumerate(positive_maps):
+                for label_j in positive_map:
+                    scores[i, :, label_j - 1] = logits[
+                        i, :, torch.LongTensor(positive_map[label_j])].mean(-1)
+    return scores
+
+
+class Conv3x3Norm(nn.Module):
+    """Conv3x3 and norm."""
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 stride: int,
+                 groups: int = 1,
+                 use_dcn: bool = False,
+                 norm_type: Optional[Union[Sequence, str]] = None):
+        super().__init__()
+
+        if use_dcn:
+            self.conv = ModulatedDeformConv2d(
+                in_channels,
+                out_channels,
+                kernel_size=3,
+                stride=stride,
+                padding=1,
+                groups=groups)
+        else:
+            self.conv = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=3,
+                stride=stride,
+                padding=1,
+                groups=groups)
+
+        if isinstance(norm_type, Sequence):
+            assert len(norm_type) == 2
+            assert norm_type[0] == 'gn'
+            gn_group = norm_type[1]
+            norm_type = norm_type[0]
+
+        if norm_type == 'bn':
+            bn_op = nn.BatchNorm2d(out_channels)
+        elif norm_type == 'gn':
+            bn_op = nn.GroupNorm(
+                num_groups=gn_group, num_channels=out_channels)
+        if norm_type is not None:
+            self.bn = bn_op
+        else:
+            self.bn = None
+
+    def forward(self, x, **kwargs):
+        x = self.conv(x, **kwargs)
+        if self.bn:
+            x = self.bn(x)
+        return x
+
+
+class DyReLU(nn.Module):
+    """Dynamic ReLU."""
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 expand_ratio: int = 4):
+        super().__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.expand_ratio = expand_ratio
+        self.out_channels = out_channels
+
+        self.fc = nn.Sequential(
+            nn.Linear(in_channels, in_channels // expand_ratio),
+            nn.ReLU(inplace=True),
+            nn.Linear(in_channels // expand_ratio,
+                      out_channels * self.expand_ratio),
+            nn.Hardsigmoid(inplace=True))
+
+    def forward(self, x) -> Tensor:
+        x_out = x
+        b, c, h, w = x.size()
+        x = self.avg_pool(x).view(b, c)
+        x = self.fc(x).view(b, -1, 1, 1)
+
+        a1, b1, a2, b2 = torch.split(x, self.out_channels, dim=1)
+        a1 = (a1 - 0.5) * 2 + 1.0
+        a2 = (a2 - 0.5) * 2
+        b1 = b1 - 0.5
+        b2 = b2 - 0.5
+        out = torch.max(x_out * a1 + b1, x_out * a2 + b2)
+        return out
+
+
+class DyConv(nn.Module):
+    """Dynamic Convolution."""
+
+    def __init__(self,
+                 conv_func: Callable,
+                 in_channels: int,
+                 out_channels: int,
+                 use_dyfuse: bool = True,
+                 use_dyrelu: bool = False,
+                 use_dcn: bool = False):
+        super().__init__()
+
+        self.dyconvs = nn.ModuleList()
+        self.dyconvs.append(conv_func(in_channels, out_channels, 1))
+        self.dyconvs.append(conv_func(in_channels, out_channels, 1))
+        self.dyconvs.append(conv_func(in_channels, out_channels, 2))
+
+        if use_dyfuse:
+            self.attnconv = nn.Sequential(
+                nn.AdaptiveAvgPool2d(1),
+                nn.Conv2d(in_channels, 1, kernel_size=1),
+                nn.ReLU(inplace=True))
+            self.h_sigmoid = nn.Hardsigmoid(inplace=True)
+        else:
+            self.attnconv = None
+
+        if use_dyrelu:
+            self.relu = DyReLU(in_channels, out_channels)
+        else:
+            self.relu = nn.ReLU()
+
+        if use_dcn:
+            self.offset = nn.Conv2d(
+                in_channels, 27, kernel_size=3, stride=1, padding=1)
+        else:
+            self.offset = None
+
+        self.init_weights()
+
+    def init_weights(self):
+        for m in self.dyconvs.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.normal_(m.weight.data, 0, 0.01)
+                if m.bias is not None:
+                    m.bias.data.zero_()
+        if self.attnconv is not None:
+            for m in self.attnconv.modules():
+                if isinstance(m, nn.Conv2d):
+                    nn.init.normal_(m.weight.data, 0, 0.01)
+                    if m.bias is not None:
+                        m.bias.data.zero_()
+
+    def forward(self, inputs: dict) -> dict:
+        visual_feats = inputs['visual']
+
+        out_vis_feats = []
+        for level, feature in enumerate(visual_feats):
+
+            offset_conv_args = {}
+            if self.offset is not None:
+                offset_mask = self.offset(feature)
+                offset = offset_mask[:, :18, :, :]
+                mask = offset_mask[:, 18:, :, :].sigmoid()
+                offset_conv_args = dict(offset=offset, mask=mask)
+
+            temp_feats = [self.dyconvs[1](feature, **offset_conv_args)]
+
+            if level > 0:
+                temp_feats.append(self.dyconvs[2](visual_feats[level - 1],
+                                                  **offset_conv_args))
+            if level < len(visual_feats) - 1:
+                temp_feats.append(
+                    F.upsample_bilinear(
+                        self.dyconvs[0](visual_feats[level + 1],
+                                        **offset_conv_args),
+                        size=[feature.size(2),
+                              feature.size(3)]))
+            mean_feats = torch.mean(
+                torch.stack(temp_feats), dim=0, keepdim=False)
+
+            if self.attnconv is not None:
+                attn_feat = []
+                res_feat = []
+                for feat in temp_feats:
+                    res_feat.append(feat)
+                    attn_feat.append(self.attnconv(feat))
+
+                res_feat = torch.stack(res_feat)
+                spa_pyr_attn = self.h_sigmoid(torch.stack(attn_feat))
+
+                mean_feats = torch.mean(
+                    res_feat * spa_pyr_attn, dim=0, keepdim=False)
+
+            out_vis_feats.append(mean_feats)
+
+        out_vis_feats = [self.relu(item) for item in out_vis_feats]
+
+        features_dict = {'visual': out_vis_feats, 'lang': inputs['lang']}
+
+        return features_dict
+
+
+class VLFusionModule(BaseModel):
+    """Visual-lang Fusion Module."""
+
+    def __init__(self,
+                 in_channels: int,
+                 feat_channels: int,
+                 num_base_priors: int,
+                 early_fuse: bool = False,
+                 num_dyhead_blocks: int = 6,
+                 lang_model_name: str = 'bert-base-uncased',
+                 use_dyrelu: bool = True,
+                 use_dyfuse: bool = True,
+                 use_dcn: bool = True,
+                 use_checkpoint: bool = False,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        if BertConfig is None:
+            raise RuntimeError(
+                'transformers is not installed, please install it by: '
+                'pip install transformers.')
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.num_base_priors = num_base_priors
+        self.early_fuse = early_fuse
+        self.num_dyhead_blocks = num_dyhead_blocks
+        self.use_dyrelu = use_dyrelu
+        self.use_dyfuse = use_dyfuse
+        self.use_dcn = use_dcn
+        self.use_checkpoint = use_checkpoint
+
+        self.lang_cfg = BertConfig.from_pretrained(lang_model_name)
+        self.lang_dim = self.lang_cfg.hidden_size
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the model."""
+        bias_value = -math.log((1 - 0.01) / 0.01)
+
+        dyhead_tower = []
+        for i in range(self.num_dyhead_blocks):
+            if self.early_fuse:
+                # cross-modality fusion
+                dyhead_tower.append(VLFuse(use_checkpoint=self.use_checkpoint))
+                # lang branch
+                dyhead_tower.append(
+                    BertEncoderLayer(
+                        self.lang_cfg,
+                        clamp_min_for_underflow=True,
+                        clamp_max_for_overflow=True))
+
+            # vision branch
+            dyhead_tower.append(
+                DyConv(
+                    lambda i, o, s: Conv3x3Norm(
+                        i, o, s, use_dcn=self.use_dcn, norm_type=['gn', 16]),
+                    self.in_channels if i == 0 else self.feat_channels,
+                    self.feat_channels,
+                    use_dyrelu=(self.use_dyrelu
+                                and self.in_channels == self.feat_channels)
+                    if i == 0 else self.use_dyrelu,
+                    use_dyfuse=(self.use_dyfuse
+                                and self.in_channels == self.feat_channels)
+                    if i == 0 else self.use_dyfuse,
+                    use_dcn=(self.use_dcn
+                             and self.in_channels == self.feat_channels)
+                    if i == 0 else self.use_dcn,
+                ))
+
+        self.add_module('dyhead_tower', nn.Sequential(*dyhead_tower))
+
+        self.bbox_pred = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 4, kernel_size=1)
+        self.centerness = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 1, kernel_size=1)
+        self.dot_product_projection_text = nn.Linear(
+            self.lang_dim,
+            self.num_base_priors * self.feat_channels,
+            bias=True)
+        self.log_scale = nn.Parameter(torch.Tensor([0.0]), requires_grad=True)
+        self.bias_lang = nn.Parameter(
+            torch.zeros(self.lang_dim), requires_grad=True)
+        self.bias0 = nn.Parameter(
+            torch.Tensor([bias_value]), requires_grad=True)
+        self.scales = nn.ModuleList([Scale(1.0) for _ in range(5)])
+
+    def forward(self, visual_feats: Tuple[Tensor],
+                language_feats: dict) -> Tuple:
+        feat_inputs = {'visual': visual_feats, 'lang': language_feats}
+        dyhead_tower = self.dyhead_tower(feat_inputs)
+
+        if self.early_fuse:
+            embedding = dyhead_tower['lang']['hidden']
+        else:
+            embedding = language_feats['embedded']
+
+        embedding = F.normalize(embedding, p=2, dim=-1)
+        dot_product_proj_tokens = self.dot_product_projection_text(embedding /
+                                                                   2.0)
+        dot_product_proj_tokens_bias = torch.matmul(
+            embedding, self.bias_lang) + self.bias0
+
+        bbox_preds = []
+        centerness = []
+        cls_logits = []
+
+        for i, feature in enumerate(visual_feats):
+            visual = dyhead_tower['visual'][i]
+            B, C, H, W = visual.shape
+
+            bbox_pred = self.scales[i](self.bbox_pred(visual))
+            bbox_preds.append(bbox_pred)
+            centerness.append(self.centerness(visual))
+
+            dot_product_proj_queries = permute_and_flatten(
+                visual, B, self.num_base_priors, C, H, W)
+
+            bias = dot_product_proj_tokens_bias.unsqueeze(1).repeat(
+                1, self.num_base_priors, 1)
+            dot_product_logit = (
+                torch.matmul(dot_product_proj_queries,
+                             dot_product_proj_tokens.transpose(-1, -2)) /
+                self.log_scale.exp()) + bias
+            dot_product_logit = torch.clamp(
+                dot_product_logit, max=MAX_CLAMP_VALUE)
+            dot_product_logit = torch.clamp(
+                dot_product_logit, min=-MAX_CLAMP_VALUE)
+            cls_logits.append(dot_product_logit)
+
+        return bbox_preds, centerness, cls_logits
+
+
+@MODELS.register_module()
+class ATSSVLFusionHead(ATSSHead):
+    """ATSS head with visual-language fusion module.
+
+    Args:
+        early_fuse (bool): Whether to fuse visual and language features
+            Defaults to False.
+        use_checkpoint (bool): Whether to use checkpoint. Defaults to False.
+        num_dyhead_blocks (int): Number of dynamic head blocks. Defaults to 6.
+        lang_model_name (str): Name of the language model.
+            Defaults to 'bert-base-uncased'.
+    """
+
+    def __init__(self,
+                 *args,
+                 early_fuse: bool = False,
+                 use_checkpoint: bool = False,
+                 num_dyhead_blocks: int = 6,
+                 lang_model_name: str = 'bert-base-uncased',
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(*args, **kwargs, init_cfg=init_cfg)
+        self.head = VLFusionModule(
+            in_channels=self.in_channels,
+            feat_channels=self.feat_channels,
+            num_base_priors=self.num_base_priors,
+            early_fuse=early_fuse,
+            use_checkpoint=use_checkpoint,
+            num_dyhead_blocks=num_dyhead_blocks,
+            lang_model_name=lang_model_name)
+        self.text_masks = None
+
+    def _init_layers(self) -> None:
+        """No need to initialize the ATSS head layer."""
+        pass
+
+    def forward(self, visual_feats: Tuple[Tensor],
+                language_feats: dict) -> Tuple[Tensor]:
+        """Forward function."""
+        bbox_preds, centerness, cls_logits = self.head(visual_feats,
+                                                       language_feats)
+        return cls_logits, bbox_preds, centerness
+
+    def loss(self, visual_feats: Tuple[Tensor], language_feats: dict,
+             batch_data_samples):
+        outputs = unpack_gt_instances(batch_data_samples)
+        (batch_gt_instances, batch_gt_instances_ignore,
+         batch_img_metas) = outputs
+
+        outs = self(visual_feats, language_feats)
+        self.text_masks = language_feats['masks']
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            centernesses: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            centernesses (list[Tensor]): Centerness for each scale
+                level with shape (N, num_anchors * 1, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, avg_factor) = cls_reg_targets
+        avg_factor = reduce_mean(
+            torch.tensor(avg_factor, dtype=torch.float, device=device)).item()
+
+        anchors = torch.cat(anchor_list, dim=1)
+        labels = torch.cat(labels_list, dim=1)
+        label_weights = torch.cat(label_weights_list, dim=1)
+        bbox_targets = torch.cat(bbox_targets_list, dim=1)
+        cls_scores = torch.cat(cls_scores, dim=1)
+
+        centernesses_ = []
+        bbox_preds_ = []
+        for bbox_pred, centerness in zip(bbox_preds, centernesses):
+            centernesses_.append(
+                centerness.permute(0, 2, 3,
+                                   1).reshape(cls_scores.size(0), -1, 1))
+            bbox_preds_.append(
+                bbox_pred.permute(0, 2, 3,
+                                  1).reshape(cls_scores.size(0), -1, 4))
+        bbox_preds = torch.cat(bbox_preds_, dim=1)
+        centernesses = torch.cat(centernesses_, dim=1)
+
+        losses_cls, losses_bbox, loss_centerness, bbox_avg_factor = \
+            self._loss_by_feat(
+                anchors,
+                cls_scores,
+                bbox_preds,
+                centernesses,
+                labels,
+                label_weights,
+                bbox_targets,
+                avg_factor=avg_factor)
+
+        bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item()
+        losses_bbox = losses_bbox / bbox_avg_factor
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox=losses_bbox,
+            loss_centerness=loss_centerness)
+
+    def _loss_by_feat(self, anchors: Tensor, cls_score: Tensor,
+                      bbox_pred: Tensor, centerness: Tensor, labels: Tensor,
+                      label_weights: Tensor, bbox_targets: Tensor,
+                      avg_factor: float) -> dict:
+        """Calculate the loss of all scale level based on the features
+        extracted by the detection head.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        anchors = anchors.reshape(-1, 4)
+
+        # ===== this change =====
+        pos_inds = (labels.sum(-1) > 0).reshape(-1)
+
+        # Loss is not computed for the padded regions of the text.
+        assert (self.text_masks.dim() == 2)
+        text_mask = (self.text_masks > 0).unsqueeze(1)
+        text_mask = text_mask.repeat(1, cls_score.size(1), 1)
+        cls_score = torch.masked_select(cls_score, text_mask).contiguous()
+        labels = torch.masked_select(labels, text_mask)
+        label_weights = label_weights[...,
+                                      None].repeat(1, 1, text_mask.size(-1))
+        label_weights = torch.masked_select(label_weights, text_mask)
+
+        bbox_pred = bbox_pred.reshape(-1, 4)
+        centerness = centerness.reshape(-1)
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+
+        # classification loss
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=avg_factor)
+
+        if pos_inds.sum() > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+            pos_anchors = anchors[pos_inds]
+            pos_centerness = centerness[pos_inds]
+
+            centerness_targets = self.centerness_target(
+                pos_anchors, pos_bbox_targets)
+
+            if torch.isnan(centerness_targets).any():
+                print('=====Centerness includes NaN=====')
+                mask = ~torch.isnan(centerness_targets)
+                centerness_targets = centerness_targets[mask]
+                pos_centerness = pos_centerness[mask]
+                pos_anchors = pos_anchors[mask]
+                pos_bbox_targets = pos_bbox_targets[mask]
+                pos_bbox_pred = pos_bbox_pred[mask]
+
+                if pos_bbox_targets.shape[0] == 0:
+                    loss_bbox = bbox_pred.sum() * 0
+                    loss_centerness = centerness.sum() * 0
+                    centerness_targets = bbox_targets.new_tensor(0.)
+                    return loss_cls, loss_bbox, loss_centerness, \
+                        centerness_targets.sum()
+
+            # The decoding process takes the offset into consideration.
+            pos_anchors[:, 2:] += 1
+            pos_decode_bbox_pred = self.bbox_coder.decode(
+                pos_anchors, pos_bbox_pred)
+
+            # regression loss
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_bbox_targets,
+                weight=centerness_targets,
+                avg_factor=1.0)
+
+            # centerness loss
+            loss_centerness = self.loss_centerness(
+                pos_centerness, centerness_targets, avg_factor=avg_factor)
+        else:
+            loss_bbox = bbox_pred.sum() * 0
+            loss_centerness = centerness.sum() * 0
+            centerness_targets = bbox_targets.new_tensor(0.)
+
+        return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum()
+
+    def _get_targets_single(self,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            num_level_anchors: List[int],
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors ,4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                    shape (num_anchors,).
+            num_level_anchors (List[int]): Number of anchors of each scale
+                level.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: N is the number of total anchors in the image.
+                labels (Tensor): Labels of all anchors in the image with shape
+                    (N,).
+                label_weights (Tensor): Label weights of all anchor in the
+                    image with shape (N,).
+                bbox_targets (Tensor): BBox targets of all anchors in the
+                    image with shape (N, 4).
+                bbox_weights (Tensor): BBox weights of all anchors in the
+                    image with shape (N, 4)
+                pos_inds (Tensor): Indices of positive anchor with shape
+                    (num_pos,).
+                neg_inds (Tensor): Indices of negative anchor with shape
+                    (num_neg,).
+                sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        anchors = flat_anchors
+        # Align the official implementation
+        anchors[:, 2:] -= 1
+
+        num_level_anchors_inside = num_level_anchors
+        pred_instances = InstanceData(priors=anchors)
+        assign_result = self.assigner.assign(pred_instances,
+                                             num_level_anchors_inside,
+                                             gt_instances, gt_instances_ignore)
+
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+
+        # ===== this change =====
+        labels = anchors.new_full((num_valid_anchors, self.feat_channels),
+                                  0,
+                                  dtype=torch.float32)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            if self.reg_decoded_bbox:
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+            else:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_priors, sampling_result.pos_gt_bboxes)
+
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+
+            # ===== this change =====
+            labels[pos_inds] = gt_instances.positive_maps[
+                sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        return (anchors, labels, label_weights, bbox_targets, bbox_weights,
+                pos_inds, neg_inds, sampling_result)
+
+    def centerness_target(self, anchors: Tensor, gts: Tensor) -> Tensor:
+        """Calculate the centerness between anchors and gts.
+
+        Only calculate pos centerness targets, otherwise there may be nan.
+
+        Args:
+            anchors (Tensor): Anchors with shape (N, 4), "xyxy" format.
+            gts (Tensor): Ground truth bboxes with shape (N, 4), "xyxy" format.
+
+        Returns:
+            Tensor: Centerness between anchors and gts.
+        """
+        anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2
+        anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2
+        l_ = anchors_cx - gts[:, 0]
+        t_ = anchors_cy - gts[:, 1]
+        r_ = gts[:, 2] - anchors_cx
+        b_ = gts[:, 3] - anchors_cy
+
+        left_right = torch.stack([l_, r_], dim=1)
+        top_bottom = torch.stack([t_, b_], dim=1)
+        centerness = torch.sqrt(
+            (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) *
+            (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]))
+        # assert not torch.isnan(centerness).any()
+        return centerness
+
+    def predict(self,
+                visual_feats: Tuple[Tensor],
+                language_feats: dict,
+                batch_data_samples,
+                rescale: bool = True):
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            visual_feats (tuple[Tensor]): Multi-level visual features from the
+                upstream network, each is a 4D-tensor.
+            language_feats (dict): Language features from the upstream network.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        batch_token_positive_maps = [
+            data_samples.token_positive_map
+            for data_samples in batch_data_samples
+        ]
+        outs = self(visual_feats, language_feats)
+
+        predictions = self.predict_by_feat(
+            *outs,
+            batch_img_metas=batch_img_metas,
+            batch_token_positive_maps=batch_token_positive_maps,
+            rescale=rescale)
+        return predictions
+
+    def predict_by_feat(self,
+                        cls_logits: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        score_factors: List[Tensor],
+                        batch_img_metas: Optional[List[dict]] = None,
+                        batch_token_positive_maps: Optional[List[dict]] = None,
+                        cfg: Optional[ConfigDict] = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Note: When score_factors is not None, the cls_scores are
+        usually multiplied by it then obtain the real score used in NMS,
+        such as CenterNess in FCOS, IoU branch in ATSS.
+
+        Args:
+            cls_logits (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            score_factors (list[Tensor], optional): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 1, H, W). Defaults to None.
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            batch_token_positive_maps (list[dict], Optional): Batch token
+                positive map. Defaults to None.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(bbox_preds) == len(score_factors)
+        num_levels = len(bbox_preds)
+
+        featmap_sizes = [bbox_preds[i].shape[-2:] for i in range(num_levels)]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device)
+
+        result_list = []
+
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            token_positive_maps = batch_token_positive_maps[img_id]
+            bbox_pred_list = select_single_mlvl(
+                bbox_preds, img_id, detach=True)
+            score_factor_list = select_single_mlvl(
+                score_factors, img_id, detach=True)
+            cls_logit_list = select_single_mlvl(
+                cls_logits, img_id, detach=True)
+
+            results = self._predict_by_feat_single(
+                bbox_pred_list=bbox_pred_list,
+                score_factor_list=score_factor_list,
+                cls_logit_list=cls_logit_list,
+                mlvl_priors=mlvl_priors,
+                token_positive_maps=token_positive_maps,
+                img_meta=img_meta,
+                cfg=cfg,
+                rescale=rescale,
+                with_nms=with_nms)
+            result_list.append(results)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                cls_logit_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                token_positive_maps: dict,
+                                img_meta: dict,
+                                cfg: ConfigDict,
+                                rescale: bool = True,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            cls_logit_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid. In all
+                anchor-based methods, it has shape (num_priors, 4). In
+                all anchor-free methods, it has shape (num_priors, 2)
+                when `with_stride=True`, otherwise it still has shape
+                (num_priors, 4).
+            token_positive_maps (dict): Token positive map.
+            img_meta (dict): Image meta info.
+            cfg (mmengine.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+        score_thr = cfg.get('score_thr', 0)
+
+        mlvl_bbox_preds = []
+        mlvl_valid_priors = []
+        mlvl_scores = []
+        mlvl_labels = []
+
+        for level_idx, (bbox_pred, score_factor, cls_logit, priors) in \
+                enumerate(zip(bbox_pred_list,
+                              score_factor_list, cls_logit_list, mlvl_priors)):
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(
+                -1, self.bbox_coder.encode_size)
+            score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid()
+
+            scores = convert_grounding_to_cls_scores(
+                logits=cls_logit.sigmoid()[None],
+                positive_maps=[token_positive_maps])[0]
+
+            results = filter_scores_and_topk(
+                scores, score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+
+            scores, labels, keep_idxs, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+            score_factor = score_factor[keep_idxs]
+            scores = torch.sqrt(scores * score_factor)
+
+            mlvl_bbox_preds.append(bbox_pred)
+            mlvl_valid_priors.append(priors)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        bbox_pred = torch.cat(mlvl_bbox_preds)
+        priors = cat_boxes(mlvl_valid_priors)
+        bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape)
+
+        results = InstanceData()
+        results.bboxes = bboxes
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+
+        predictions = self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+        if len(predictions) > 0:
+            # Note: GLIP adopts a very strange bbox decoder logic,
+            # and if 1 is not added here, it will not align with
+            # the official mAP.
+            predictions.bboxes[:, 2:] = predictions.bboxes[:, 2:] + 1
+        return predictions
diff --git a/mmde/mmdet/models/dense_heads/autoassign_head.py b/mmde/mmdet/models/dense_heads/autoassign_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2b30ff0d7d41205f0a92ede7b8eb10a234c5942
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/autoassign_head.py
@@ -0,0 +1,524 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Scale
+from mmengine.model import bias_init_with_prob, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
+from ..task_modules.prior_generators import MlvlPointGenerator
+from ..utils import levels_to_images, multi_apply
+from .fcos_head import FCOSHead
+
+EPS = 1e-12
+
+
+class CenterPrior(nn.Module):
+    """Center Weighting module to adjust the category-specific prior
+    distributions.
+
+    Args:
+        force_topk (bool): When no point falls into gt_bbox, forcibly
+            select the k points closest to the center to calculate
+            the center prior. Defaults to False.
+        topk (int): The number of points used to calculate the
+            center prior when no point falls in gt_bbox. Only work when
+            force_topk if True. Defaults to 9.
+        num_classes (int): The class number of dataset. Defaults to 80.
+        strides (Sequence[int]): The stride of each input feature map.
+            Defaults to (8, 16, 32, 64, 128).
+    """
+
+    def __init__(
+        self,
+        force_topk: bool = False,
+        topk: int = 9,
+        num_classes: int = 80,
+        strides: Sequence[int] = (8, 16, 32, 64, 128)
+    ) -> None:
+        super().__init__()
+        self.mean = nn.Parameter(torch.zeros(num_classes, 2))
+        self.sigma = nn.Parameter(torch.ones(num_classes, 2))
+        self.strides = strides
+        self.force_topk = force_topk
+        self.topk = topk
+
+    def forward(self, anchor_points_list: List[Tensor],
+                gt_instances: InstanceData,
+                inside_gt_bbox_mask: Tensor) -> Tuple[Tensor, Tensor]:
+        """Get the center prior of each point on the feature map for each
+        instance.
+
+        Args:
+            anchor_points_list (list[Tensor]): list of coordinate
+                of points on feature map. Each with shape
+                (num_points, 2).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            inside_gt_bbox_mask (Tensor): Tensor of bool type,
+                with shape of (num_points, num_gt), each
+                value is used to mark whether this point falls
+                within a certain gt.
+
+        Returns:
+            tuple[Tensor, Tensor]:
+
+            - center_prior_weights(Tensor): Float tensor with shape  of \
+            (num_points, num_gt). Each value represents the center \
+            weighting coefficient.
+            - inside_gt_bbox_mask (Tensor): Tensor of bool type, with shape \
+            of (num_points, num_gt), each value is used to mark whether this \
+            point falls within a certain gt or is the topk nearest points for \
+            a specific gt_bbox.
+        """
+        gt_bboxes = gt_instances.bboxes
+        labels = gt_instances.labels
+
+        inside_gt_bbox_mask = inside_gt_bbox_mask.clone()
+        num_gts = len(labels)
+        num_points = sum([len(item) for item in anchor_points_list])
+        if num_gts == 0:
+            return gt_bboxes.new_zeros(num_points,
+                                       num_gts), inside_gt_bbox_mask
+        center_prior_list = []
+        for slvl_points, stride in zip(anchor_points_list, self.strides):
+            # slvl_points: points from single level in FPN, has shape (h*w, 2)
+            # single_level_points has shape (h*w, num_gt, 2)
+            single_level_points = slvl_points[:, None, :].expand(
+                (slvl_points.size(0), len(gt_bboxes), 2))
+            gt_center_x = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2)
+            gt_center_y = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2)
+            gt_center = torch.stack((gt_center_x, gt_center_y), dim=1)
+            gt_center = gt_center[None]
+            # instance_center has shape (1, num_gt, 2)
+            instance_center = self.mean[labels][None]
+            # instance_sigma has shape (1, num_gt, 2)
+            instance_sigma = self.sigma[labels][None]
+            # distance has shape (num_points, num_gt, 2)
+            distance = (((single_level_points - gt_center) / float(stride) -
+                         instance_center)**2)
+            center_prior = torch.exp(-distance /
+                                     (2 * instance_sigma**2)).prod(dim=-1)
+            center_prior_list.append(center_prior)
+        center_prior_weights = torch.cat(center_prior_list, dim=0)
+
+        if self.force_topk:
+            gt_inds_no_points_inside = torch.nonzero(
+                inside_gt_bbox_mask.sum(0) == 0).reshape(-1)
+            if gt_inds_no_points_inside.numel():
+                topk_center_index = \
+                    center_prior_weights[:, gt_inds_no_points_inside].topk(
+                                                             self.topk,
+                                                             dim=0)[1]
+                temp_mask = inside_gt_bbox_mask[:, gt_inds_no_points_inside]
+                inside_gt_bbox_mask[:, gt_inds_no_points_inside] = \
+                    torch.scatter(temp_mask,
+                                  dim=0,
+                                  index=topk_center_index,
+                                  src=torch.ones_like(
+                                    topk_center_index,
+                                    dtype=torch.bool))
+
+        center_prior_weights[~inside_gt_bbox_mask] = 0
+        return center_prior_weights, inside_gt_bbox_mask
+
+
+@MODELS.register_module()
+class AutoAssignHead(FCOSHead):
+    """AutoAssignHead head used in AutoAssign.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2007.03496>`_ .
+
+    Args:
+        force_topk (bool): Used in center prior initialization to
+            handle extremely small gt. Default is False.
+        topk (int): The number of points used to calculate the
+            center prior when no point falls in gt_bbox. Only work when
+            force_topk if True. Defaults to 9.
+        pos_loss_weight (float): The loss weight of positive loss
+            and with default value 0.25.
+        neg_loss_weight (float): The loss weight of negative loss
+            and with default value 0.75.
+        center_loss_weight (float): The loss weight of center prior
+            loss and with default value 0.75.
+    """
+
+    def __init__(self,
+                 *args,
+                 force_topk: bool = False,
+                 topk: int = 9,
+                 pos_loss_weight: float = 0.25,
+                 neg_loss_weight: float = 0.75,
+                 center_loss_weight: float = 0.75,
+                 **kwargs) -> None:
+        super().__init__(*args, conv_bias=True, **kwargs)
+        self.center_prior = CenterPrior(
+            force_topk=force_topk,
+            topk=topk,
+            num_classes=self.num_classes,
+            strides=self.strides)
+        self.pos_loss_weight = pos_loss_weight
+        self.neg_loss_weight = neg_loss_weight
+        self.center_loss_weight = center_loss_weight
+        self.prior_generator = MlvlPointGenerator(self.strides, offset=0)
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head.
+
+        In particular, we have special initialization for classified conv's and
+        regression conv's bias
+        """
+
+        super(AutoAssignHead, self).init_weights()
+        bias_cls = bias_init_with_prob(0.02)
+        normal_init(self.conv_cls, std=0.01, bias=bias_cls)
+        normal_init(self.conv_reg, std=0.01, bias=4.0)
+
+    def forward_single(self, x: Tensor, scale: Scale,
+                       stride: int) -> Tuple[Tensor, Tensor, Tensor]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            stride (int): The corresponding stride for feature maps, only
+                used to normalize the bbox prediction when self.norm_on_bbox
+                is True.
+
+        Returns:
+            tuple[Tensor, Tensor, Tensor]: scores for each class, bbox
+            predictions and centerness predictions of input feature maps.
+        """
+        cls_score, bbox_pred, cls_feat, reg_feat = super(
+            FCOSHead, self).forward_single(x)
+        centerness = self.conv_centerness(reg_feat)
+        # scale the bbox_pred of different level
+        # float to avoid overflow when enabling FP16
+        bbox_pred = scale(bbox_pred).float()
+        # bbox_pred needed for gradient computation has been modified
+        # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace
+        # F.relu(bbox_pred) with bbox_pred.clamp(min=0)
+        bbox_pred = bbox_pred.clamp(min=0)
+        bbox_pred *= stride
+        return cls_score, bbox_pred, centerness
+
+    def get_pos_loss_single(self, cls_score: Tensor, objectness: Tensor,
+                            reg_loss: Tensor, gt_instances: InstanceData,
+                            center_prior_weights: Tensor) -> Tuple[Tensor]:
+        """Calculate the positive loss of all points in gt_bboxes.
+
+        Args:
+            cls_score (Tensor): All category scores for each point on
+                the feature map. The shape is (num_points, num_class).
+            objectness (Tensor): Foreground probability of all points,
+                has shape (num_points, 1).
+            reg_loss (Tensor): The regression loss of each gt_bbox and each
+                prediction box, has shape of (num_points, num_gt).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            center_prior_weights (Tensor): Float tensor with shape
+                of (num_points, num_gt). Each value represents
+                the center weighting coefficient.
+
+        Returns:
+            tuple[Tensor]:
+
+            - pos_loss (Tensor): The positive loss of all points in the \
+            gt_bboxes.
+        """
+        gt_labels = gt_instances.labels
+        # p_loc: localization confidence
+        p_loc = torch.exp(-reg_loss)
+        # p_cls: classification confidence
+        p_cls = (cls_score * objectness)[:, gt_labels]
+        # p_pos: joint confidence indicator
+        p_pos = p_cls * p_loc
+
+        # 3 is a hyper-parameter to control the contributions of high and
+        # low confidence locations towards positive losses.
+        confidence_weight = torch.exp(p_pos * 3)
+        p_pos_weight = (confidence_weight * center_prior_weights) / (
+            (confidence_weight * center_prior_weights).sum(
+                0, keepdim=True)).clamp(min=EPS)
+        reweighted_p_pos = (p_pos * p_pos_weight).sum(0)
+        pos_loss = F.binary_cross_entropy(
+            reweighted_p_pos,
+            torch.ones_like(reweighted_p_pos),
+            reduction='none')
+        pos_loss = pos_loss.sum() * self.pos_loss_weight
+        return pos_loss,
+
+    def get_neg_loss_single(self, cls_score: Tensor, objectness: Tensor,
+                            gt_instances: InstanceData, ious: Tensor,
+                            inside_gt_bbox_mask: Tensor) -> Tuple[Tensor]:
+        """Calculate the negative loss of all points in feature map.
+
+        Args:
+            cls_score (Tensor): All category scores for each point on
+                the feature map. The shape is (num_points, num_class).
+            objectness (Tensor): Foreground probability of all points
+                and is shape of (num_points, 1).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            ious (Tensor): Float tensor with shape of (num_points, num_gt).
+                Each value represent the iou of pred_bbox and gt_bboxes.
+            inside_gt_bbox_mask (Tensor): Tensor of bool type,
+                with shape of (num_points, num_gt), each
+                value is used to mark whether this point falls
+                within a certain gt.
+
+        Returns:
+            tuple[Tensor]:
+
+            - neg_loss (Tensor): The negative loss of all points in the \
+            feature map.
+        """
+        gt_labels = gt_instances.labels
+        num_gts = len(gt_labels)
+        joint_conf = (cls_score * objectness)
+        p_neg_weight = torch.ones_like(joint_conf)
+        if num_gts > 0:
+            # the order of dinmension would affect the value of
+            # p_neg_weight, we strictly follow the original
+            # implementation.
+            inside_gt_bbox_mask = inside_gt_bbox_mask.permute(1, 0)
+            ious = ious.permute(1, 0)
+
+            foreground_idxs = torch.nonzero(inside_gt_bbox_mask, as_tuple=True)
+            temp_weight = (1 / (1 - ious[foreground_idxs]).clamp_(EPS))
+
+            def normalize(x):
+                return (x - x.min() + EPS) / (x.max() - x.min() + EPS)
+
+            for instance_idx in range(num_gts):
+                idxs = foreground_idxs[0] == instance_idx
+                if idxs.any():
+                    temp_weight[idxs] = normalize(temp_weight[idxs])
+
+            p_neg_weight[foreground_idxs[1],
+                         gt_labels[foreground_idxs[0]]] = 1 - temp_weight
+
+        logits = (joint_conf * p_neg_weight)
+        neg_loss = (
+            logits**2 * F.binary_cross_entropy(
+                logits, torch.zeros_like(logits), reduction='none'))
+        neg_loss = neg_loss.sum() * self.neg_loss_weight
+        return neg_loss,
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        objectnesses: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * 4.
+            objectnesses (list[Tensor]): objectness for each scale level, each
+                is a 4D-tensor, the channel number is num_points * 1.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        assert len(cls_scores) == len(bbox_preds) == len(objectnesses)
+        all_num_gt = sum([len(item) for item in batch_gt_instances])
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        all_level_points = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device)
+        inside_gt_bbox_mask_list, bbox_targets_list = self.get_targets(
+            all_level_points, batch_gt_instances)
+
+        center_prior_weight_list = []
+        temp_inside_gt_bbox_mask_list = []
+        for gt_instances, inside_gt_bbox_mask in zip(batch_gt_instances,
+                                                     inside_gt_bbox_mask_list):
+            center_prior_weight, inside_gt_bbox_mask = \
+                self.center_prior(all_level_points, gt_instances,
+                                  inside_gt_bbox_mask)
+            center_prior_weight_list.append(center_prior_weight)
+            temp_inside_gt_bbox_mask_list.append(inside_gt_bbox_mask)
+        inside_gt_bbox_mask_list = temp_inside_gt_bbox_mask_list
+        mlvl_points = torch.cat(all_level_points, dim=0)
+        bbox_preds = levels_to_images(bbox_preds)
+        cls_scores = levels_to_images(cls_scores)
+        objectnesses = levels_to_images(objectnesses)
+
+        reg_loss_list = []
+        ious_list = []
+        num_points = len(mlvl_points)
+
+        for bbox_pred, encoded_targets, inside_gt_bbox_mask in zip(
+                bbox_preds, bbox_targets_list, inside_gt_bbox_mask_list):
+            temp_num_gt = encoded_targets.size(1)
+            expand_mlvl_points = mlvl_points[:, None, :].expand(
+                num_points, temp_num_gt, 2).reshape(-1, 2)
+            encoded_targets = encoded_targets.reshape(-1, 4)
+            expand_bbox_pred = bbox_pred[:, None, :].expand(
+                num_points, temp_num_gt, 4).reshape(-1, 4)
+            decoded_bbox_preds = self.bbox_coder.decode(
+                expand_mlvl_points, expand_bbox_pred)
+            decoded_target_preds = self.bbox_coder.decode(
+                expand_mlvl_points, encoded_targets)
+            with torch.no_grad():
+                ious = bbox_overlaps(
+                    decoded_bbox_preds, decoded_target_preds, is_aligned=True)
+                ious = ious.reshape(num_points, temp_num_gt)
+                if temp_num_gt:
+                    ious = ious.max(
+                        dim=-1, keepdim=True).values.repeat(1, temp_num_gt)
+                else:
+                    ious = ious.new_zeros(num_points, temp_num_gt)
+                ious[~inside_gt_bbox_mask] = 0
+                ious_list.append(ious)
+            loss_bbox = self.loss_bbox(
+                decoded_bbox_preds,
+                decoded_target_preds,
+                weight=None,
+                reduction_override='none')
+            reg_loss_list.append(loss_bbox.reshape(num_points, temp_num_gt))
+
+        cls_scores = [item.sigmoid() for item in cls_scores]
+        objectnesses = [item.sigmoid() for item in objectnesses]
+        pos_loss_list, = multi_apply(self.get_pos_loss_single, cls_scores,
+                                     objectnesses, reg_loss_list,
+                                     batch_gt_instances,
+                                     center_prior_weight_list)
+        pos_avg_factor = reduce_mean(
+            bbox_pred.new_tensor(all_num_gt)).clamp_(min=1)
+        pos_loss = sum(pos_loss_list) / pos_avg_factor
+
+        neg_loss_list, = multi_apply(self.get_neg_loss_single, cls_scores,
+                                     objectnesses, batch_gt_instances,
+                                     ious_list, inside_gt_bbox_mask_list)
+        neg_avg_factor = sum(item.data.sum()
+                             for item in center_prior_weight_list)
+        neg_avg_factor = reduce_mean(neg_avg_factor).clamp_(min=1)
+        neg_loss = sum(neg_loss_list) / neg_avg_factor
+
+        center_loss = []
+        for i in range(len(batch_img_metas)):
+
+            if inside_gt_bbox_mask_list[i].any():
+                center_loss.append(
+                    len(batch_gt_instances[i]) /
+                    center_prior_weight_list[i].sum().clamp_(min=EPS))
+            # when width or height of gt_bbox is smaller than stride of p3
+            else:
+                center_loss.append(center_prior_weight_list[i].sum() * 0)
+
+        center_loss = torch.stack(center_loss).mean() * self.center_loss_weight
+
+        # avoid dead lock in DDP
+        if all_num_gt == 0:
+            pos_loss = bbox_preds[0].sum() * 0
+            dummy_center_prior_loss = self.center_prior.mean.sum(
+            ) * 0 + self.center_prior.sigma.sum() * 0
+            center_loss = objectnesses[0].sum() * 0 + dummy_center_prior_loss
+
+        loss = dict(
+            loss_pos=pos_loss, loss_neg=neg_loss, loss_center=center_loss)
+
+        return loss
+
+    def get_targets(
+            self, points: List[Tensor], batch_gt_instances: InstanceList
+    ) -> Tuple[List[Tensor], List[Tensor]]:
+        """Compute regression targets and each point inside or outside gt_bbox
+        in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of all fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            tuple(list[Tensor], list[Tensor]):
+
+            - inside_gt_bbox_mask_list (list[Tensor]): Each Tensor is with \
+            bool type and shape of (num_points, num_gt), each value is used \
+            to mark whether this point falls within a certain gt.
+            - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \
+            level. Each tensor has shape (num_points, num_gt, 4).
+        """
+
+        concat_points = torch.cat(points, dim=0)
+        # the number of points per img, per lvl
+        inside_gt_bbox_mask_list, bbox_targets_list = multi_apply(
+            self._get_targets_single, batch_gt_instances, points=concat_points)
+        return inside_gt_bbox_mask_list, bbox_targets_list
+
+    def _get_targets_single(self, gt_instances: InstanceData,
+                            points: Tensor) -> Tuple[Tensor, Tensor]:
+        """Compute regression targets and each point inside or outside gt_bbox
+        for a single image.
+
+        Args:
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            points (Tensor): Points of all fpn level, has shape
+                (num_points, 2).
+
+        Returns:
+            tuple[Tensor, Tensor]: Containing the following Tensors:
+
+            - inside_gt_bbox_mask (Tensor): Bool tensor with shape \
+            (num_points, num_gt), each value is used to mark whether this \
+            point falls within a certain gt.
+            - bbox_targets (Tensor): BBox targets of each points with each \
+            gt_bboxes, has shape (num_points, num_gt, 4).
+        """
+        gt_bboxes = gt_instances.bboxes
+        num_points = points.size(0)
+        num_gts = gt_bboxes.size(0)
+        gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
+        xs, ys = points[:, 0], points[:, 1]
+        xs = xs[:, None]
+        ys = ys[:, None]
+        left = xs - gt_bboxes[..., 0]
+        right = gt_bboxes[..., 2] - xs
+        top = ys - gt_bboxes[..., 1]
+        bottom = gt_bboxes[..., 3] - ys
+        bbox_targets = torch.stack((left, top, right, bottom), -1)
+        if num_gts:
+            inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0
+        else:
+            inside_gt_bbox_mask = bbox_targets.new_zeros((num_points, num_gts),
+                                                         dtype=torch.bool)
+
+        return inside_gt_bbox_mask, bbox_targets
diff --git a/mmde/mmdet/models/dense_heads/base_dense_head.py b/mmde/mmdet/models/dense_heads/base_dense_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4469e02c469d029cc2791289dbf41554d6a53
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/base_dense_head.py
@@ -0,0 +1,583 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from abc import ABCMeta, abstractmethod
+from inspect import signature
+from typing import List, Optional, Tuple
+
+import torch
+from mmcv.ops import batched_nms
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, constant_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import (cat_boxes, get_box_tensor, get_box_wh,
+                                   scale_boxes)
+from mmdet.utils import InstanceList, OptMultiConfig
+from ..test_time_augs import merge_aug_results
+from ..utils import (filter_scores_and_topk, select_single_mlvl,
+                     unpack_gt_instances)
+
+
+class BaseDenseHead(BaseModule, metaclass=ABCMeta):
+    """Base class for DenseHeads.
+
+    1. The ``init_weights`` method is used to initialize densehead's
+    model parameters. After detector initialization, ``init_weights``
+    is triggered when ``detector.init_weights()`` is called externally.
+
+    2. The ``loss`` method is used to calculate the loss of densehead,
+    which includes two steps: (1) the densehead model performs forward
+    propagation to obtain the feature maps (2) The ``loss_by_feat`` method
+    is called based on the feature maps to calculate the loss.
+
+    .. code:: text
+
+    loss(): forward() -> loss_by_feat()
+
+    3. The ``predict`` method is used to predict detection results,
+    which includes two steps: (1) the densehead model performs forward
+    propagation to obtain the feature maps (2) The ``predict_by_feat`` method
+    is called based on the feature maps to predict detection results including
+    post-processing.
+
+    .. code:: text
+
+    predict(): forward() -> predict_by_feat()
+
+    4. The ``loss_and_predict`` method is used to return loss and detection
+    results at the same time. It will call densehead's ``forward``,
+    ``loss_by_feat`` and ``predict_by_feat`` methods in order.  If one-stage is
+    used as RPN, the densehead needs to return both losses and predictions.
+    This predictions is used as the proposal of roihead.
+
+    .. code:: text
+
+    loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat()
+    """
+
+    def __init__(self, init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        # `_raw_positive_infos` will be used in `get_positive_infos`, which
+        # can get positive information.
+        self._raw_positive_infos = dict()
+
+    def init_weights(self) -> None:
+        """Initialize the weights."""
+        super().init_weights()
+        # avoid init_cfg overwrite the initialization of `conv_offset`
+        for m in self.modules():
+            # DeformConv2dPack, ModulatedDeformConv2dPack
+            if hasattr(m, 'conv_offset'):
+                constant_init(m.conv_offset, 0)
+
+    def get_positive_infos(self) -> InstanceList:
+        """Get positive information from sampling results.
+
+        Returns:
+            list[:obj:`InstanceData`]: Positive information of each image,
+            usually including positive bboxes, positive labels, positive
+            priors, etc.
+        """
+        if len(self._raw_positive_infos) == 0:
+            return None
+
+        sampling_results = self._raw_positive_infos.get(
+            'sampling_results', None)
+        assert sampling_results is not None
+        positive_infos = []
+        for sampling_result in enumerate(sampling_results):
+            pos_info = InstanceData()
+            pos_info.bboxes = sampling_result.pos_gt_bboxes
+            pos_info.labels = sampling_result.pos_gt_labels
+            pos_info.priors = sampling_result.pos_priors
+            pos_info.pos_assigned_gt_inds = \
+                sampling_result.pos_assigned_gt_inds
+            pos_info.pos_inds = sampling_result.pos_inds
+            positive_infos.append(pos_info)
+        return positive_infos
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        outs = self(x)
+
+        outputs = unpack_gt_instances(batch_data_samples)
+        (batch_gt_instances, batch_gt_instances_ignore,
+         batch_img_metas) = outputs
+
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    @abstractmethod
+    def loss_by_feat(self, **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head."""
+        pass
+
+    def loss_and_predict(
+        self,
+        x: Tuple[Tensor],
+        batch_data_samples: SampleList,
+        proposal_cfg: Optional[ConfigDict] = None
+    ) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            x (tuple[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+            proposal_cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+
+        Returns:
+            tuple: the return value is a tuple contains:
+
+                - losses: (dict[str, Tensor]): A dictionary of loss components.
+                - predictions (list[:obj:`InstanceData`]): Detection
+                  results of each image after the post process.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        (batch_gt_instances, batch_gt_instances_ignore,
+         batch_img_metas) = outputs
+
+        outs = self(x)
+
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg)
+        return losses, predictions
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        outs = self(x)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, rescale=rescale)
+        return predictions
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        score_factors: Optional[List[Tensor]] = None,
+                        batch_img_metas: Optional[List[dict]] = None,
+                        cfg: Optional[ConfigDict] = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Note: When score_factors is not None, the cls_scores are
+        usually multiplied by it then obtain the real score used in NMS,
+        such as CenterNess in FCOS, IoU branch in ATSS.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            score_factors (list[Tensor], optional): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 1, H, W). Defaults to None.
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        if score_factors is None:
+            # e.g. Retina, FreeAnchor, Foveabox, etc.
+            with_score_factors = False
+        else:
+            # e.g. FCOS, PAA, ATSS, AutoAssign, etc.
+            with_score_factors = True
+            assert len(cls_scores) == len(score_factors)
+
+        num_levels = len(cls_scores)
+
+        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=cls_scores[0].dtype,
+            device=cls_scores[0].device)
+
+        result_list = []
+
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            cls_score_list = select_single_mlvl(
+                cls_scores, img_id, detach=True)
+            bbox_pred_list = select_single_mlvl(
+                bbox_preds, img_id, detach=True)
+            if with_score_factors:
+                score_factor_list = select_single_mlvl(
+                    score_factors, img_id, detach=True)
+            else:
+                score_factor_list = [None for _ in range(num_levels)]
+
+            results = self._predict_by_feat_single(
+                cls_score_list=cls_score_list,
+                bbox_pred_list=bbox_pred_list,
+                score_factor_list=score_factor_list,
+                mlvl_priors=mlvl_priors,
+                img_meta=img_meta,
+                cfg=cfg,
+                rescale=rescale,
+                with_nms=with_nms)
+            result_list.append(results)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigDict,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid. In all
+                anchor-based methods, it has shape (num_priors, 4). In
+                all anchor-free methods, it has shape (num_priors, 2)
+                when `with_stride=True`, otherwise it still has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (mmengine.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        if score_factor_list[0] is None:
+            # e.g. Retina, FreeAnchor, etc.
+            with_score_factors = False
+        else:
+            # e.g. FCOS, PAA, ATSS, etc.
+            with_score_factors = True
+
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bbox_preds = []
+        mlvl_valid_priors = []
+        mlvl_scores = []
+        mlvl_labels = []
+        if with_score_factors:
+            mlvl_score_factors = []
+        else:
+            mlvl_score_factors = None
+        for level_idx, (cls_score, bbox_pred, score_factor, priors) in \
+                enumerate(zip(cls_score_list, bbox_pred_list,
+                              score_factor_list, mlvl_priors)):
+
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            dim = self.bbox_coder.encode_size
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim)
+            if with_score_factors:
+                score_factor = score_factor.permute(1, 2,
+                                                    0).reshape(-1).sigmoid()
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+
+            # the `custom_cls_channels` parameter is derived from
+            # CrossEntropyCustomLoss and FocalCustomLoss, and is currently used
+            # in v3det.
+            if getattr(self.loss_cls, 'custom_cls_channels', False):
+                scores = self.loss_cls.get_activation(cls_score)
+            elif self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                # remind that we set FG labels to [0, num_class-1]
+                # since mmdet v2.0
+                # BG cat_id: num_class
+                scores = cls_score.softmax(-1)[:, :-1]
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            score_thr = cfg.get('score_thr', 0)
+
+            results = filter_scores_and_topk(
+                scores, score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, keep_idxs, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+
+            if with_score_factors:
+                score_factor = score_factor[keep_idxs]
+
+            mlvl_bbox_preds.append(bbox_pred)
+            mlvl_valid_priors.append(priors)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+            if with_score_factors:
+                mlvl_score_factors.append(score_factor)
+
+        bbox_pred = torch.cat(mlvl_bbox_preds)
+        priors = cat_boxes(mlvl_valid_priors)
+        bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape)
+
+        results = InstanceData()
+        results.bboxes = bboxes
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+        if with_score_factors:
+            results.score_factors = torch.cat(mlvl_score_factors)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+    def _bbox_post_process(self,
+                           results: InstanceData,
+                           cfg: ConfigDict,
+                           rescale: bool = False,
+                           with_nms: bool = True,
+                           img_meta: Optional[dict] = None) -> InstanceData:
+        """bbox post-processing method.
+
+        The boxes would be rescaled to the original image scale and do
+        the nms operation. Usually `with_nms` is False is used for aug test.
+
+        Args:
+            results (:obj:`InstaceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            cfg (ConfigDict): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default to True.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            scale_factor = [1 / s for s in img_meta['scale_factor']]
+            results.bboxes = scale_boxes(results.bboxes, scale_factor)
+
+        if hasattr(results, 'score_factors'):
+            # TODO: Add sqrt operation in order to be consistent with
+            #  the paper.
+            score_factors = results.pop('score_factors')
+            results.scores = results.scores * score_factors
+
+        # filter small size bboxes
+        if cfg.get('min_bbox_size', -1) >= 0:
+            w, h = get_box_wh(results.bboxes)
+            valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
+            if not valid_mask.all():
+                results = results[valid_mask]
+
+        # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg
+        if with_nms and results.bboxes.numel() > 0:
+            bboxes = get_box_tensor(results.bboxes)
+            det_bboxes, keep_idxs = batched_nms(bboxes, results.scores,
+                                                results.labels, cfg.nms)
+            results = results[keep_idxs]
+            # some nms would reweight the score, such as softnms
+            results.scores = det_bboxes[:, -1]
+            results = results[:cfg.max_per_img]
+
+        return results
+
+    def aug_test(self,
+                 aug_batch_feats,
+                 aug_batch_img_metas,
+                 rescale=False,
+                 with_ori_nms=False,
+                 **kwargs):
+        """Test function with test time augmentation.
+
+        Args:
+            aug_batch_feats (list[tuple[Tensor]]): The outer list
+                indicates test-time augmentations and inner tuple
+                indicate the multi-level feats from
+                FPN, each Tensor should have a shape (B, C, H, W),
+            aug_batch_img_metas (list[list[dict]]): Meta information
+                of images under the different test-time augs
+                (multiscale, flip, etc.). The outer list indicate
+                the
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+            with_ori_nms (bool): Whether execute the nms in original head.
+                Defaults to False. It will be `True` when the head is
+                adopted as `rpn_head`.
+
+        Returns:
+            list(obj:`InstanceData`): Detection results of the
+            input images. Each item usually contains\
+            following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance,)
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances,).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        # TODO: remove this for detr and deformdetr
+        sig_of_get_results = signature(self.get_results)
+        get_results_args = [
+            p.name for p in sig_of_get_results.parameters.values()
+        ]
+        get_results_single_sig = signature(self._get_results_single)
+        get_results_single_sig_args = [
+            p.name for p in get_results_single_sig.parameters.values()
+        ]
+        assert ('with_nms' in get_results_args) and \
+               ('with_nms' in get_results_single_sig_args), \
+               f'{self.__class__.__name__}' \
+               'does not support test-time augmentation '
+
+        num_imgs = len(aug_batch_img_metas[0])
+        aug_batch_results = []
+        for x, img_metas in zip(aug_batch_feats, aug_batch_img_metas):
+            outs = self.forward(x)
+            batch_instance_results = self.get_results(
+                *outs,
+                img_metas=img_metas,
+                cfg=self.test_cfg,
+                rescale=False,
+                with_nms=with_ori_nms,
+                **kwargs)
+            aug_batch_results.append(batch_instance_results)
+
+        # after merging, bboxes will be rescaled to the original image
+        batch_results = merge_aug_results(aug_batch_results,
+                                          aug_batch_img_metas)
+
+        final_results = []
+        for img_id in range(num_imgs):
+            results = batch_results[img_id]
+            det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores,
+                                                results.labels,
+                                                self.test_cfg.nms)
+            results = results[keep_idxs]
+            # some nms operation may reweight the score such as softnms
+            results.scores = det_bboxes[:, -1]
+            results = results[:self.test_cfg.max_per_img]
+            if rescale:
+                # all results have been mapped to the original scale
+                # in `merge_aug_results`, so just pass
+                pass
+            else:
+                # map to the first aug image scale
+                scale_factor = results.bboxes.new_tensor(
+                    aug_batch_img_metas[0][img_id]['scale_factor'])
+                results.bboxes = \
+                    results.bboxes * scale_factor
+
+            final_results.append(results)
+
+        return final_results
diff --git a/mmde/mmdet/models/dense_heads/base_mask_head.py b/mmde/mmdet/models/dense_heads/base_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..7183d782829aa15bf12b9e2f7ade999c84d0593f
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/base_mask_head.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import List, Tuple, Union
+
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.structures import SampleList
+from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig
+from ..utils import unpack_gt_instances
+
+
+class BaseMaskHead(BaseModule, metaclass=ABCMeta):
+    """Base class for mask heads used in One-Stage Instance Segmentation."""
+
+    def __init__(self, init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+
+    @abstractmethod
+    def loss_by_feat(self, *args, **kwargs):
+        """Calculate the loss based on the features extracted by the mask
+        head."""
+        pass
+
+    @abstractmethod
+    def predict_by_feat(self, *args, **kwargs):
+        """Transform a batch of output features extracted from the head into
+        mask results."""
+        pass
+
+    def loss(self,
+             x: Union[List[Tensor], Tuple[Tensor]],
+             batch_data_samples: SampleList,
+             positive_infos: OptInstanceList = None,
+             **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the mask head on
+        the features of the upstream network.
+
+        Args:
+            x (list[Tensor] | tuple[Tensor]): Features from FPN.
+                Each has a shape (B, C, H, W).
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+            positive_infos (list[:obj:`InstanceData`], optional): Information
+                of positive samples. Used when the label assignment is
+                done outside the MaskHead, e.g., BboxHead in
+                YOLACT or CondInst, etc. When the label assignment is done in
+                MaskHead, it would be None, like SOLO or SOLOv2. All values
+                in it should have shape (num_positive_samples, *).
+
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        if positive_infos is None:
+            outs = self(x)
+        else:
+            outs = self(x, positive_infos)
+
+        assert isinstance(outs, tuple), 'Forward results should be a tuple, ' \
+                                        'even if only one item is returned'
+
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \
+            = outputs
+        for gt_instances, img_metas in zip(batch_gt_instances,
+                                           batch_img_metas):
+            img_shape = img_metas['batch_input_shape']
+            gt_masks = gt_instances.masks.pad(img_shape)
+            gt_instances.masks = gt_masks
+
+        losses = self.loss_by_feat(
+            *outs,
+            batch_gt_instances=batch_gt_instances,
+            batch_img_metas=batch_img_metas,
+            positive_infos=positive_infos,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            **kwargs)
+        return losses
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False,
+                results_list: OptInstanceList = None,
+                **kwargs) -> InstanceList:
+        """Test function without test-time augmentation.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+            results_list (list[obj:`InstanceData`], optional): Detection
+                results of each image after the post process. Only exist
+                if there is a `bbox_head`, like `YOLACT`, `CondInst`, etc.
+
+        Returns:
+            list[obj:`InstanceData`]: Instance segmentation
+            results of each image after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance,)
+                - labels (Tensor): Has a shape (num_instances,).
+                - masks (Tensor): Processed mask results, has a
+                  shape (num_instances, h, w).
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        if results_list is None:
+            outs = self(x)
+        else:
+            outs = self(x, results_list)
+
+        results_list = self.predict_by_feat(
+            *outs,
+            batch_img_metas=batch_img_metas,
+            rescale=rescale,
+            results_list=results_list,
+            **kwargs)
+
+        return results_list
diff --git a/mmde/mmdet/models/dense_heads/boxinst_head.py b/mmde/mmdet/models/dense_heads/boxinst_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d6e8f7777a852cad89b709e59af2d8e12b343a6
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/boxinst_head.py
@@ -0,0 +1,252 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from mmengine import MessageHub
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import InstanceList
+from ..utils.misc import unfold_wo_center
+from .condinst_head import CondInstBboxHead, CondInstMaskHead
+
+
+@MODELS.register_module()
+class BoxInstBboxHead(CondInstBboxHead):
+    """BoxInst box head used in https://arxiv.org/abs/2012.02310."""
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+
+@MODELS.register_module()
+class BoxInstMaskHead(CondInstMaskHead):
+    """BoxInst mask head used in https://arxiv.org/abs/2012.02310.
+
+    This head outputs the mask for BoxInst.
+
+    Args:
+        pairwise_size (dict): The size of neighborhood for each pixel.
+            Defaults to 3.
+        pairwise_dilation (int): The dilation of neighborhood for each pixel.
+            Defaults to 2.
+        warmup_iters (int): Warmup iterations for pair-wise loss.
+            Defaults to 10000.
+    """
+
+    def __init__(self,
+                 *arg,
+                 pairwise_size: int = 3,
+                 pairwise_dilation: int = 2,
+                 warmup_iters: int = 10000,
+                 **kwargs) -> None:
+        self.pairwise_size = pairwise_size
+        self.pairwise_dilation = pairwise_dilation
+        self.warmup_iters = warmup_iters
+        super().__init__(*arg, **kwargs)
+
+    def get_pairwise_affinity(self, mask_logits: Tensor) -> Tensor:
+        """Compute the pairwise affinity for each pixel."""
+        log_fg_prob = F.logsigmoid(mask_logits).unsqueeze(1)
+        log_bg_prob = F.logsigmoid(-mask_logits).unsqueeze(1)
+
+        log_fg_prob_unfold = unfold_wo_center(
+            log_fg_prob,
+            kernel_size=self.pairwise_size,
+            dilation=self.pairwise_dilation)
+        log_bg_prob_unfold = unfold_wo_center(
+            log_bg_prob,
+            kernel_size=self.pairwise_size,
+            dilation=self.pairwise_dilation)
+
+        # the probability of making the same prediction:
+        # p_i * p_j + (1 - p_i) * (1 - p_j)
+        # we compute the the probability in log space
+        # to avoid numerical instability
+        log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold
+        log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold
+
+        # TODO: Figure out the difference between it and directly sum
+        max_ = torch.max(log_same_fg_prob, log_same_bg_prob)
+        log_same_prob = torch.log(
+            torch.exp(log_same_fg_prob - max_) +
+            torch.exp(log_same_bg_prob - max_)) + max_
+
+        return -log_same_prob[:, 0]
+
+    def loss_by_feat(self, mask_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict], positive_infos: InstanceList,
+                     **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the mask head.
+
+        Args:
+            mask_preds (list[Tensor]): List of predicted masks, each has
+                shape (num_classes, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``masks``,
+                and ``labels`` attributes.
+            batch_img_metas (list[dict]): Meta information of multiple images.
+            positive_infos (List[:obj:``InstanceData``]): Information of
+                positive samples of each image that are assigned in detection
+                head.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert positive_infos is not None, \
+            'positive_infos should not be None in `BoxInstMaskHead`'
+        losses = dict()
+
+        loss_mask_project = 0.
+        loss_mask_pairwise = 0.
+        num_imgs = len(mask_preds)
+        total_pos = 0.
+        avg_fatcor = 0.
+
+        for idx in range(num_imgs):
+            (mask_pred, pos_mask_targets, pos_pairwise_masks, num_pos) = \
+                self._get_targets_single(
+                mask_preds[idx], batch_gt_instances[idx],
+                positive_infos[idx])
+            # mask loss
+            total_pos += num_pos
+            if num_pos == 0 or pos_mask_targets is None:
+                loss_project = mask_pred.new_zeros(1).mean()
+                loss_pairwise = mask_pred.new_zeros(1).mean()
+                avg_fatcor += 0.
+            else:
+                # compute the project term
+                loss_project_x = self.loss_mask(
+                    mask_pred.max(dim=1, keepdim=True)[0],
+                    pos_mask_targets.max(dim=1, keepdim=True)[0],
+                    reduction_override='none').sum()
+                loss_project_y = self.loss_mask(
+                    mask_pred.max(dim=2, keepdim=True)[0],
+                    pos_mask_targets.max(dim=2, keepdim=True)[0],
+                    reduction_override='none').sum()
+                loss_project = loss_project_x + loss_project_y
+                # compute the pairwise term
+                pairwise_affinity = self.get_pairwise_affinity(mask_pred)
+                avg_fatcor += pos_pairwise_masks.sum().clamp(min=1.0)
+                loss_pairwise = (pairwise_affinity * pos_pairwise_masks).sum()
+
+            loss_mask_project += loss_project
+            loss_mask_pairwise += loss_pairwise
+
+        if total_pos == 0:
+            total_pos += 1  # avoid nan
+        if avg_fatcor == 0:
+            avg_fatcor += 1  # avoid nan
+        loss_mask_project = loss_mask_project / total_pos
+        loss_mask_pairwise = loss_mask_pairwise / avg_fatcor
+        message_hub = MessageHub.get_current_instance()
+        iter = message_hub.get_info('iter')
+        warmup_factor = min(iter / float(self.warmup_iters), 1.0)
+        loss_mask_pairwise *= warmup_factor
+
+        losses.update(
+            loss_mask_project=loss_mask_project,
+            loss_mask_pairwise=loss_mask_pairwise)
+        return losses
+
+    def _get_targets_single(self, mask_preds: Tensor,
+                            gt_instances: InstanceData,
+                            positive_info: InstanceData):
+        """Compute targets for predictions of single image.
+
+        Args:
+            mask_preds (Tensor): Predicted prototypes with shape
+                (num_classes, H, W).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes``, ``labels``,
+                and ``masks`` attributes.
+            positive_info (:obj:`InstanceData`): Information of positive
+                samples that are assigned in detection head. It usually
+                contains following keys.
+
+                    - pos_assigned_gt_inds (Tensor): Assigner GT indexes of
+                      positive proposals, has shape (num_pos, )
+                    - pos_inds (Tensor): Positive index of image, has
+                      shape (num_pos, ).
+                    - param_pred (Tensor): Positive param preditions
+                      with shape (num_pos, num_params).
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+            - mask_preds (Tensor): Positive predicted mask with shape
+              (num_pos, mask_h, mask_w).
+            - pos_mask_targets (Tensor): Positive mask targets with shape
+              (num_pos, mask_h, mask_w).
+            - pos_pairwise_masks (Tensor): Positive pairwise masks with
+              shape: (num_pos, num_neighborhood, mask_h, mask_w).
+            - num_pos (int): Positive numbers.
+        """
+        gt_bboxes = gt_instances.bboxes
+        device = gt_bboxes.device
+        # Note that gt_masks are generated by full box
+        # from BoxInstDataPreprocessor
+        gt_masks = gt_instances.masks.to_tensor(
+            dtype=torch.bool, device=device).float()
+        # Note that pairwise_masks are generated by image color similarity
+        # from BoxInstDataPreprocessor
+        pairwise_masks = gt_instances.pairwise_masks
+        pairwise_masks = pairwise_masks.to(device=device)
+
+        # process with mask targets
+        pos_assigned_gt_inds = positive_info.get('pos_assigned_gt_inds')
+        scores = positive_info.get('scores')
+        centernesses = positive_info.get('centernesses')
+        num_pos = pos_assigned_gt_inds.size(0)
+
+        if gt_masks.size(0) == 0 or num_pos == 0:
+            return mask_preds, None, None, 0
+        # Since we're producing (near) full image masks,
+        # it'd take too much vram to backprop on every single mask.
+        # Thus we select only a subset.
+        if (self.max_masks_to_train != -1) and \
+           (num_pos > self.max_masks_to_train):
+            perm = torch.randperm(num_pos)
+            select = perm[:self.max_masks_to_train]
+            mask_preds = mask_preds[select]
+            pos_assigned_gt_inds = pos_assigned_gt_inds[select]
+            num_pos = self.max_masks_to_train
+        elif self.topk_masks_per_img != -1:
+            unique_gt_inds = pos_assigned_gt_inds.unique()
+            num_inst_per_gt = max(
+                int(self.topk_masks_per_img / len(unique_gt_inds)), 1)
+
+            keep_mask_preds = []
+            keep_pos_assigned_gt_inds = []
+            for gt_ind in unique_gt_inds:
+                per_inst_pos_inds = (pos_assigned_gt_inds == gt_ind)
+                mask_preds_per_inst = mask_preds[per_inst_pos_inds]
+                gt_inds_per_inst = pos_assigned_gt_inds[per_inst_pos_inds]
+                if sum(per_inst_pos_inds) > num_inst_per_gt:
+                    per_inst_scores = scores[per_inst_pos_inds].sigmoid().max(
+                        dim=1)[0]
+                    per_inst_centerness = centernesses[
+                        per_inst_pos_inds].sigmoid().reshape(-1, )
+                    select = (per_inst_scores * per_inst_centerness).topk(
+                        k=num_inst_per_gt, dim=0)[1]
+                    mask_preds_per_inst = mask_preds_per_inst[select]
+                    gt_inds_per_inst = gt_inds_per_inst[select]
+                keep_mask_preds.append(mask_preds_per_inst)
+                keep_pos_assigned_gt_inds.append(gt_inds_per_inst)
+            mask_preds = torch.cat(keep_mask_preds)
+            pos_assigned_gt_inds = torch.cat(keep_pos_assigned_gt_inds)
+            num_pos = pos_assigned_gt_inds.size(0)
+
+        # Follow the origin implement
+        start = int(self.mask_out_stride // 2)
+        gt_masks = gt_masks[:, start::self.mask_out_stride,
+                            start::self.mask_out_stride]
+        gt_masks = gt_masks.gt(0.5).float()
+        pos_mask_targets = gt_masks[pos_assigned_gt_inds]
+        pos_pairwise_masks = pairwise_masks[pos_assigned_gt_inds]
+        pos_pairwise_masks = pos_pairwise_masks * pos_mask_targets.unsqueeze(1)
+
+        return (mask_preds, pos_mask_targets, pos_pairwise_masks, num_pos)
diff --git a/mmde/mmdet/models/dense_heads/cascade_rpn_head.py b/mmde/mmdet/models/dense_heads/cascade_rpn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8686cc2c9118094df34a04fdeabd87daa636707
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/cascade_rpn_head.py
@@ -0,0 +1,1110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import division
+import copy
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from mmcv.ops import DeformConv2d
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, ModuleList
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import SampleList
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig,
+                         OptInstanceList, OptMultiConfig)
+from ..task_modules.assigners import RegionAssigner
+from ..task_modules.samplers import PseudoSampler
+from ..utils import (images_to_levels, multi_apply, select_single_mlvl,
+                     unpack_gt_instances)
+from .base_dense_head import BaseDenseHead
+from .rpn_head import RPNHead
+
+
+class AdaptiveConv(BaseModule):
+    """AdaptiveConv used to adapt the sampling location with the anchors.
+
+    Args:
+        in_channels (int): Number of channels in the input image.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int or tuple[int]): Size of the conv kernel.
+            Defaults to 3.
+        stride (int or tuple[int]): Stride of the convolution. Defaults to 1.
+        padding (int or tuple[int]): Zero-padding added to both sides of
+            the input. Defaults to 1.
+        dilation (int or tuple[int]): Spacing between kernel elements.
+            Defaults to 3.
+        groups (int): Number of blocked connections from input channels to
+            output channels. Defaults to 1.
+        bias (bool): If set True, adds a learnable bias to the output.
+            Defaults to False.
+        adapt_type (str): Type of adaptive conv, can be either ``offset``
+            (arbitrary anchors) or 'dilation' (uniform anchor).
+            Defaults to 'dilation'.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
+            list[dict]): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int]] = 3,
+        stride: Union[int, Tuple[int]] = 1,
+        padding: Union[int, Tuple[int]] = 1,
+        dilation: Union[int, Tuple[int]] = 3,
+        groups: int = 1,
+        bias: bool = False,
+        adapt_type: str = 'dilation',
+        init_cfg: MultiConfig = dict(
+            type='Normal', std=0.01, override=dict(name='conv'))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert adapt_type in ['offset', 'dilation']
+        self.adapt_type = adapt_type
+
+        assert kernel_size == 3, 'Adaptive conv only supports kernels 3'
+        if self.adapt_type == 'offset':
+            assert stride == 1 and padding == 1 and groups == 1, \
+                'Adaptive conv offset mode only supports padding: {1}, ' \
+                f'stride: {1}, groups: {1}'
+            self.conv = DeformConv2d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                padding=padding,
+                stride=stride,
+                groups=groups,
+                bias=bias)
+        else:
+            self.conv = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                padding=dilation,
+                dilation=dilation)
+
+    def forward(self, x: Tensor, offset: Tensor) -> Tensor:
+        """Forward function."""
+        if self.adapt_type == 'offset':
+            N, _, H, W = x.shape
+            assert offset is not None
+            assert H * W == offset.shape[1]
+            # reshape [N, NA, 18] to (N, 18, H, W)
+            offset = offset.permute(0, 2, 1).reshape(N, -1, H, W)
+            offset = offset.contiguous()
+            x = self.conv(x, offset)
+        else:
+            assert offset is None
+            x = self.conv(x)
+        return x
+
+
+@MODELS.register_module()
+class StageCascadeRPNHead(RPNHead):
+    """Stage of CascadeRPNHead.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        anchor_generator (:obj:`ConfigDict` or dict): anchor generator config.
+        adapt_cfg (:obj:`ConfigDict` or dict): adaptation config.
+        bridged_feature (bool): whether update rpn feature. Defaults to False.
+        with_cls (bool): whether use classification branch. Defaults to True.
+        init_cfg :obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 anchor_generator: ConfigType = dict(
+                     type='AnchorGenerator',
+                     scales=[8],
+                     ratios=[1.0],
+                     strides=[4, 8, 16, 32, 64]),
+                 adapt_cfg: ConfigType = dict(type='dilation', dilation=3),
+                 bridged_feature: bool = False,
+                 with_cls: bool = True,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        self.with_cls = with_cls
+        self.anchor_strides = anchor_generator['strides']
+        self.anchor_scales = anchor_generator['scales']
+        self.bridged_feature = bridged_feature
+        self.adapt_cfg = adapt_cfg
+        super().__init__(
+            in_channels=in_channels,
+            anchor_generator=anchor_generator,
+            init_cfg=init_cfg,
+            **kwargs)
+
+        # override sampling and sampler
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            # use PseudoSampler when sampling is False
+            if self.train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler(context=self)
+
+        if init_cfg is None:
+            self.init_cfg = dict(
+                type='Normal', std=0.01, override=[dict(name='rpn_reg')])
+            if self.with_cls:
+                self.init_cfg['override'].append(dict(name='rpn_cls'))
+
+    def _init_layers(self) -> None:
+        """Init layers of a CascadeRPN stage."""
+        adapt_cfg = copy.deepcopy(self.adapt_cfg)
+        adapt_cfg['adapt_type'] = adapt_cfg.pop('type')
+        self.rpn_conv = AdaptiveConv(self.in_channels, self.feat_channels,
+                                     **adapt_cfg)
+        if self.with_cls:
+            self.rpn_cls = nn.Conv2d(self.feat_channels,
+                                     self.num_anchors * self.cls_out_channels,
+                                     1)
+        self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward_single(self, x: Tensor, offset: Tensor) -> Tuple[Tensor]:
+        """Forward function of single scale."""
+        bridged_x = x
+        x = self.relu(self.rpn_conv(x, offset))
+        if self.bridged_feature:
+            bridged_x = x  # update feature
+        cls_score = self.rpn_cls(x) if self.with_cls else None
+        bbox_pred = self.rpn_reg(x)
+        return bridged_x, cls_score, bbox_pred
+
+    def forward(
+            self,
+            feats: List[Tensor],
+            offset_list: Optional[List[Tensor]] = None) -> Tuple[List[Tensor]]:
+        """Forward function."""
+        if offset_list is None:
+            offset_list = [None for _ in range(len(feats))]
+        return multi_apply(self.forward_single, feats, offset_list)
+
+    def _region_targets_single(self, flat_anchors: Tensor, valid_flags: Tensor,
+                               gt_instances: InstanceData, img_meta: dict,
+                               gt_instances_ignore: InstanceData,
+                               featmap_sizes: List[Tuple[int, int]],
+                               num_level_anchors: List[int]) -> tuple:
+        """Get anchor targets based on region for single level.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors, 4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                    shape (num_anchors, ).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            featmap_sizes (list[Tuple[int, int]]): Feature map size each level.
+            num_level_anchors (list[int]): The number of anchors in each level.
+
+        Returns:
+            tuple:
+
+                - labels (Tensor): Labels of each level.
+                - label_weights (Tensor): Label weights of each level.
+                - bbox_targets (Tensor): BBox targets of each level.
+                - bbox_weights (Tensor): BBox weights of each level.
+                - pos_inds (Tensor): positive samples indexes.
+                - neg_inds (Tensor): negative samples indexes.
+                - sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        pred_instances = InstanceData()
+        pred_instances.priors = flat_anchors
+        pred_instances.valid_flags = valid_flags
+
+        assign_result = self.assigner.assign(
+            pred_instances,
+            gt_instances,
+            img_meta,
+            featmap_sizes,
+            num_level_anchors,
+            self.anchor_scales[0],
+            self.anchor_strides,
+            gt_instances_ignore=gt_instances_ignore,
+            allowed_border=self.train_cfg['allowed_border'])
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_anchors = flat_anchors.shape[0]
+        bbox_targets = torch.zeros_like(flat_anchors)
+        bbox_weights = torch.zeros_like(flat_anchors)
+        labels = flat_anchors.new_zeros(num_anchors, dtype=torch.long)
+        label_weights = flat_anchors.new_zeros(num_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            if not self.reg_decoded_bbox:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+            else:
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds, sampling_result)
+
+    def region_targets(
+        self,
+        anchor_list: List[List[Tensor]],
+        valid_flag_list: List[List[Tensor]],
+        featmap_sizes: List[Tuple[int, int]],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None,
+        return_sampling_results: bool = False,
+    ) -> tuple:
+        """Compute regression and classification targets for anchors when using
+        RegionAssigner.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image.
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image.
+            featmap_sizes (list[Tuple[int, int]]): Feature map size each level.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            tuple:
+
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_targets_list (list[Tensor]): BBox targets of each level.
+                - bbox_weights_list (list[Tensor]): BBox weights of each level.
+                - avg_factor (int): Average factor that is used to average
+                  the loss. When using sampling method, avg_factor is usually
+                  the sum of positive and negative priors. When using
+                  ``PseudoSampler``, ``avg_factor`` is usually equal to the
+                  number of positive priors.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors to a single tensor
+        concat_anchor_list = []
+        concat_valid_flag_list = []
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+            concat_valid_flag_list.append(torch.cat(valid_flag_list[i]))
+
+        # compute targets for each image
+        (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights,
+         pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply(
+             self._region_targets_single,
+             concat_anchor_list,
+             concat_valid_flag_list,
+             batch_gt_instances,
+             batch_img_metas,
+             batch_gt_instances_ignore,
+             featmap_sizes=featmap_sizes,
+             num_level_anchors=num_level_anchors)
+        # no valid anchors
+        if any([labels is None for labels in all_labels]):
+            return None
+        # sampled anchors of all images
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        # split targets to a list w.r.t. multiple levels
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors)
+        res = (labels_list, label_weights_list, bbox_targets_list,
+               bbox_weights_list, avg_factor)
+        if return_sampling_results:
+            res = res + (sampling_results_list, )
+        return res
+
+    def get_targets(
+        self,
+        anchor_list: List[List[Tensor]],
+        valid_flag_list: List[List[Tensor]],
+        featmap_sizes: List[Tuple[int, int]],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None,
+        return_sampling_results: bool = False,
+    ) -> tuple:
+        """Compute regression and classification targets for anchors.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image.
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image.
+            featmap_sizes (list[Tuple[int, int]]): Feature map size each level.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            return_sampling_results (bool): Whether to return the sampling
+                results. Defaults to False.
+
+        Returns:
+            tuple:
+
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_targets_list (list[Tensor]): BBox targets of each level.
+                - bbox_weights_list (list[Tensor]): BBox weights of each level.
+                - avg_factor (int): Average factor that is used to average
+                  the loss. When using sampling method, avg_factor is usually
+                  the sum of positive and negative priors. When using
+                  ``PseudoSampler``, ``avg_factor`` is usually equal to the
+                  number of positive priors.
+        """
+        if isinstance(self.assigner, RegionAssigner):
+            cls_reg_targets = self.region_targets(
+                anchor_list,
+                valid_flag_list,
+                featmap_sizes,
+                batch_gt_instances,
+                batch_img_metas,
+                batch_gt_instances_ignore=batch_gt_instances_ignore,
+                return_sampling_results=return_sampling_results)
+        else:
+            cls_reg_targets = super().get_targets(
+                anchor_list,
+                valid_flag_list,
+                batch_gt_instances,
+                batch_img_metas,
+                batch_gt_instances_ignore=batch_gt_instances_ignore,
+                return_sampling_results=return_sampling_results)
+        return cls_reg_targets
+
+    def anchor_offset(self, anchor_list: List[List[Tensor]],
+                      anchor_strides: List[int],
+                      featmap_sizes: List[Tuple[int, int]]) -> List[Tensor]:
+        """ Get offset for deformable conv based on anchor shape
+        NOTE: currently support deformable kernel_size=3 and dilation=1
+
+        Args:
+            anchor_list (list[list[tensor])): [NI, NLVL, NA, 4] list of
+                multi-level anchors
+            anchor_strides (list[int]): anchor stride of each level
+
+        Returns:
+            list[tensor]: offset of DeformConv kernel with shapes of
+            [NLVL, NA, 2, 18].
+        """
+
+        def _shape_offset(anchors, stride, ks=3, dilation=1):
+            # currently support kernel_size=3 and dilation=1
+            assert ks == 3 and dilation == 1
+            pad = (ks - 1) // 2
+            idx = torch.arange(-pad, pad + 1, dtype=dtype, device=device)
+            yy, xx = torch.meshgrid(idx, idx)  # return order matters
+            xx = xx.reshape(-1)
+            yy = yy.reshape(-1)
+            w = (anchors[:, 2] - anchors[:, 0]) / stride
+            h = (anchors[:, 3] - anchors[:, 1]) / stride
+            w = w / (ks - 1) - dilation
+            h = h / (ks - 1) - dilation
+            offset_x = w[:, None] * xx  # (NA, ks**2)
+            offset_y = h[:, None] * yy  # (NA, ks**2)
+            return offset_x, offset_y
+
+        def _ctr_offset(anchors, stride, featmap_size):
+            feat_h, feat_w = featmap_size
+            assert len(anchors) == feat_h * feat_w
+
+            x = (anchors[:, 0] + anchors[:, 2]) * 0.5
+            y = (anchors[:, 1] + anchors[:, 3]) * 0.5
+            # compute centers on feature map
+            x = x / stride
+            y = y / stride
+            # compute predefine centers
+            xx = torch.arange(0, feat_w, device=anchors.device)
+            yy = torch.arange(0, feat_h, device=anchors.device)
+            yy, xx = torch.meshgrid(yy, xx)
+            xx = xx.reshape(-1).type_as(x)
+            yy = yy.reshape(-1).type_as(y)
+
+            offset_x = x - xx  # (NA, )
+            offset_y = y - yy  # (NA, )
+            return offset_x, offset_y
+
+        num_imgs = len(anchor_list)
+        num_lvls = len(anchor_list[0])
+        dtype = anchor_list[0][0].dtype
+        device = anchor_list[0][0].device
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+
+        offset_list = []
+        for i in range(num_imgs):
+            mlvl_offset = []
+            for lvl in range(num_lvls):
+                c_offset_x, c_offset_y = _ctr_offset(anchor_list[i][lvl],
+                                                     anchor_strides[lvl],
+                                                     featmap_sizes[lvl])
+                s_offset_x, s_offset_y = _shape_offset(anchor_list[i][lvl],
+                                                       anchor_strides[lvl])
+
+                # offset = ctr_offset + shape_offset
+                offset_x = s_offset_x + c_offset_x[:, None]
+                offset_y = s_offset_y + c_offset_y[:, None]
+
+                # offset order (y0, x0, y1, x2, .., y8, x8, y9, x9)
+                offset = torch.stack([offset_y, offset_x], dim=-1)
+                offset = offset.reshape(offset.size(0), -1)  # [NA, 2*ks**2]
+                mlvl_offset.append(offset)
+            offset_list.append(torch.cat(mlvl_offset))  # [totalNA, 2*ks**2]
+        offset_list = images_to_levels(offset_list, num_level_anchors)
+        return offset_list
+
+    def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                            anchors: Tensor, labels: Tensor,
+                            label_weights: Tensor, bbox_targets: Tensor,
+                            bbox_weights: Tensor, avg_factor: int) -> tuple:
+        """Loss function on single scale."""
+        # classification loss
+        if self.with_cls:
+            labels = labels.reshape(-1)
+            label_weights = label_weights.reshape(-1)
+            cls_score = cls_score.permute(0, 2, 3,
+                                          1).reshape(-1, self.cls_out_channels)
+            loss_cls = self.loss_cls(
+                cls_score, labels, label_weights, avg_factor=avg_factor)
+        # regression loss
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        bbox_weights = bbox_weights.reshape(-1, 4)
+        bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+        if self.reg_decoded_bbox:
+            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+            # is applied directly on the decoded bounding boxes, it
+            # decodes the already encoded coordinates to absolute format.
+            anchors = anchors.reshape(-1, 4)
+            bbox_pred = self.bbox_coder.decode(anchors, bbox_pred)
+        loss_reg = self.loss_bbox(
+            bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor)
+        if self.with_cls:
+            return loss_cls, loss_reg
+        return None, loss_reg
+
+    def loss_by_feat(
+        self,
+        anchor_list: List[List[Tensor]],
+        valid_flag_list: List[List[Tensor]],
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Compute losses of the head.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image.
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds]
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            featmap_sizes,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            return_sampling_results=True)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         avg_factor, sampling_results_list) = cls_reg_targets
+        if not sampling_results_list[0].avg_factor_with_neg:
+            # 200 is hard-coded average factor,
+            # which follows guided anchoring.
+            avg_factor = sum([label.numel() for label in labels_list]) / 200.0
+
+        # change per image, per level anchor_list to per_level, per_image
+        mlvl_anchor_list = list(zip(*anchor_list))
+        # concat mlvl_anchor_list
+        mlvl_anchor_list = [
+            torch.cat(anchors, dim=0) for anchors in mlvl_anchor_list
+        ]
+
+        losses = multi_apply(
+            self.loss_by_feat_single,
+            cls_scores,
+            bbox_preds,
+            mlvl_anchor_list,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            avg_factor=avg_factor)
+        if self.with_cls:
+            return dict(loss_rpn_cls=losses[0], loss_rpn_reg=losses[1])
+        return dict(loss_rpn_reg=losses[1])
+
+    def predict_by_feat(self,
+                        anchor_list: List[List[Tensor]],
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        batch_img_metas: List[dict],
+                        cfg: Optional[ConfigDict] = None,
+                        rescale: bool = False) -> InstanceList:
+        """Get proposal predict. Overriding to enable input ``anchor_list``
+        from outside.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image.
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            batch_img_metas (list[dict], Optional): Image meta info.
+            cfg (:obj:`ConfigDict`, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_score_list = select_single_mlvl(cls_scores, img_id)
+            bbox_pred_list = select_single_mlvl(bbox_preds, img_id)
+            proposals = self._predict_by_feat_single(
+                cls_scores=cls_score_list,
+                bbox_preds=bbox_pred_list,
+                mlvl_anchors=anchor_list[img_id],
+                img_meta=batch_img_metas[img_id],
+                cfg=cfg,
+                rescale=rescale)
+            result_list.append(proposals)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_scores: List[Tensor],
+                                bbox_preds: List[Tensor],
+                                mlvl_anchors: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigDict,
+                                rescale: bool = False) -> InstanceData:
+        """Transform outputs of a single image into bbox predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has
+                shape (num_anchors * 4, H, W).
+            mlvl_anchors (list[Tensor]): Box reference from all scale
+                levels of a single image, each item has shape
+                (num_total_anchors, 4).
+            img_shape (tuple[int]): Shape of the input image,
+                (height, width, 3).
+            scale_factor (ndarray): Scale factor of the image arange as
+                (w_scale, h_scale, w_scale, h_scale).
+            cfg (:obj:`ConfigDict`): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        # bboxes from different level should be independent during NMS,
+        # level_ids are used as labels for batched NMS to separate them
+        level_ids = []
+        mlvl_scores = []
+        mlvl_bbox_preds = []
+        mlvl_valid_anchors = []
+        nms_pre = cfg.get('nms_pre', -1)
+        for idx in range(len(cls_scores)):
+            rpn_cls_score = cls_scores[idx]
+            rpn_bbox_pred = bbox_preds[idx]
+            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
+            rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
+            if self.use_sigmoid_cls:
+                rpn_cls_score = rpn_cls_score.reshape(-1)
+                scores = rpn_cls_score.sigmoid()
+            else:
+                rpn_cls_score = rpn_cls_score.reshape(-1, 2)
+                # We set FG labels to [0, num_class-1] and BG label to
+                # num_class in RPN head since mmdet v2.5, which is unified to
+                # be consistent with other head since mmdet v2.0. In mmdet v2.0
+                # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head.
+                scores = rpn_cls_score.softmax(dim=1)[:, 0]
+            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            anchors = mlvl_anchors[idx]
+
+            if 0 < nms_pre < scores.shape[0]:
+                # sort is faster than topk
+                # _, topk_inds = scores.topk(cfg.nms_pre)
+                ranked_scores, rank_inds = scores.sort(descending=True)
+                topk_inds = rank_inds[:nms_pre]
+                scores = ranked_scores[:nms_pre]
+                rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
+                anchors = anchors[topk_inds, :]
+            mlvl_scores.append(scores)
+            mlvl_bbox_preds.append(rpn_bbox_pred)
+            mlvl_valid_anchors.append(anchors)
+            level_ids.append(
+                scores.new_full((scores.size(0), ), idx, dtype=torch.long))
+
+        anchors = torch.cat(mlvl_valid_anchors)
+        rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
+        bboxes = self.bbox_coder.decode(
+            anchors, rpn_bbox_pred, max_shape=img_meta['img_shape'])
+
+        proposals = InstanceData()
+        proposals.bboxes = bboxes
+        proposals.scores = torch.cat(mlvl_scores)
+        proposals.level_ids = torch.cat(level_ids)
+
+        return self._bbox_post_process(
+            results=proposals, cfg=cfg, rescale=rescale, img_meta=img_meta)
+
+    def refine_bboxes(self, anchor_list: List[List[Tensor]],
+                      bbox_preds: List[Tensor],
+                      img_metas: List[dict]) -> List[List[Tensor]]:
+        """Refine bboxes through stages."""
+        num_levels = len(bbox_preds)
+        new_anchor_list = []
+        for img_id in range(len(img_metas)):
+            mlvl_anchors = []
+            for i in range(num_levels):
+                bbox_pred = bbox_preds[i][img_id].detach()
+                bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+                img_shape = img_metas[img_id]['img_shape']
+                bboxes = self.bbox_coder.decode(anchor_list[img_id][i],
+                                                bbox_pred, img_shape)
+                mlvl_anchors.append(bboxes)
+            new_anchor_list.append(mlvl_anchors)
+        return new_anchor_list
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, _, batch_img_metas = outputs
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in x]
+        device = x[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        if self.adapt_cfg['type'] == 'offset':
+            offset_list = self.anchor_offset(anchor_list, self.anchor_strides,
+                                             featmap_sizes)
+        else:
+            offset_list = None
+
+        x, cls_score, bbox_pred = self(x, offset_list)
+        rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, bbox_pred,
+                           batch_gt_instances, batch_img_metas)
+        losses = self.loss_by_feat(*rpn_loss_inputs)
+
+        return losses
+
+    def loss_and_predict(
+        self,
+        x: Tuple[Tensor],
+        batch_data_samples: SampleList,
+        proposal_cfg: Optional[ConfigDict] = None,
+    ) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            x (tuple[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+            proposal_cfg (:obj`ConfigDict`, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+
+        Returns:
+            tuple: the return value is a tuple contains:
+
+                - losses: (dict[str, Tensor]): A dictionary of loss components.
+                - predictions (list[:obj:`InstanceData`]): Detection
+                  results of each image after the post process.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, _, batch_img_metas = outputs
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in x]
+        device = x[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        if self.adapt_cfg['type'] == 'offset':
+            offset_list = self.anchor_offset(anchor_list, self.anchor_strides,
+                                             featmap_sizes)
+        else:
+            offset_list = None
+
+        x, cls_score, bbox_pred = self(x, offset_list)
+        rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, bbox_pred,
+                           batch_gt_instances, batch_img_metas)
+        losses = self.loss_by_feat(*rpn_loss_inputs)
+
+        predictions = self.predict_by_feat(
+            anchor_list,
+            cls_score,
+            bbox_pred,
+            batch_img_metas=batch_img_metas,
+            cfg=proposal_cfg)
+        return losses, predictions
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in x]
+        device = x[0].device
+        anchor_list, _ = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        if self.adapt_cfg['type'] == 'offset':
+            offset_list = self.anchor_offset(anchor_list, self.anchor_strides,
+                                             featmap_sizes)
+        else:
+            offset_list = None
+
+        x, cls_score, bbox_pred = self(x, offset_list)
+        predictions = self.stages[-1].predict_by_feat(
+            anchor_list,
+            cls_score,
+            bbox_pred,
+            batch_img_metas=batch_img_metas,
+            rescale=rescale)
+        return predictions
+
+
+@MODELS.register_module()
+class CascadeRPNHead(BaseDenseHead):
+    """The CascadeRPNHead will predict more accurate region proposals, which is
+    required for two-stage detectors (such as Fast/Faster R-CNN). CascadeRPN
+    consists of a sequence of RPNStage to progressively improve the accuracy of
+    the detected proposals.
+
+    More details can be found in ``https://arxiv.org/abs/1909.06720``.
+
+    Args:
+        num_stages (int): number of CascadeRPN stages.
+        stages (list[:obj:`ConfigDict` or dict]): list of configs to build
+            the stages.
+        train_cfg (list[:obj:`ConfigDict` or dict]): list of configs at
+            training time each stage.
+        test_cfg (:obj:`ConfigDict` or dict): config at testing time.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
+            list[dict]): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 num_stages: int,
+                 stages: List[ConfigType],
+                 train_cfg: List[ConfigType],
+                 test_cfg: ConfigType,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert num_classes == 1, 'Only support num_classes == 1'
+        assert num_stages == len(stages)
+        self.num_stages = num_stages
+        # Be careful! Pretrained weights cannot be loaded when use
+        # nn.ModuleList
+        self.stages = ModuleList()
+        for i in range(len(stages)):
+            train_cfg_i = train_cfg[i] if train_cfg is not None else None
+            stages[i].update(train_cfg=train_cfg_i)
+            stages[i].update(test_cfg=test_cfg)
+            self.stages.append(MODELS.build(stages[i]))
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def loss_by_feat(self):
+        """loss_by_feat() is implemented in StageCascadeRPNHead."""
+        pass
+
+    def predict_by_feat(self):
+        """predict_by_feat() is implemented in StageCascadeRPNHead."""
+        pass
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, _, batch_img_metas = outputs
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in x]
+        device = x[0].device
+        anchor_list, valid_flag_list = self.stages[0].get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        losses = dict()
+
+        for i in range(self.num_stages):
+            stage = self.stages[i]
+
+            if stage.adapt_cfg['type'] == 'offset':
+                offset_list = stage.anchor_offset(anchor_list,
+                                                  stage.anchor_strides,
+                                                  featmap_sizes)
+            else:
+                offset_list = None
+            x, cls_score, bbox_pred = stage(x, offset_list)
+            rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score,
+                               bbox_pred, batch_gt_instances, batch_img_metas)
+            stage_loss = stage.loss_by_feat(*rpn_loss_inputs)
+            for name, value in stage_loss.items():
+                losses['s{}.{}'.format(i, name)] = value
+
+            # refine boxes
+            if i < self.num_stages - 1:
+                anchor_list = stage.refine_bboxes(anchor_list, bbox_pred,
+                                                  batch_img_metas)
+
+        return losses
+
+    def loss_and_predict(
+        self,
+        x: Tuple[Tensor],
+        batch_data_samples: SampleList,
+        proposal_cfg: Optional[ConfigDict] = None,
+    ) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            x (tuple[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+            proposal_cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+
+        Returns:
+            tuple: the return value is a tuple contains:
+
+                - losses: (dict[str, Tensor]): A dictionary of loss components.
+                - predictions (list[:obj:`InstanceData`]): Detection
+                  results of each image after the post process.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, _, batch_img_metas = outputs
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in x]
+        device = x[0].device
+        anchor_list, valid_flag_list = self.stages[0].get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        losses = dict()
+
+        for i in range(self.num_stages):
+            stage = self.stages[i]
+
+            if stage.adapt_cfg['type'] == 'offset':
+                offset_list = stage.anchor_offset(anchor_list,
+                                                  stage.anchor_strides,
+                                                  featmap_sizes)
+            else:
+                offset_list = None
+            x, cls_score, bbox_pred = stage(x, offset_list)
+            rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score,
+                               bbox_pred, batch_gt_instances, batch_img_metas)
+            stage_loss = stage.loss_by_feat(*rpn_loss_inputs)
+            for name, value in stage_loss.items():
+                losses['s{}.{}'.format(i, name)] = value
+
+            # refine boxes
+            if i < self.num_stages - 1:
+                anchor_list = stage.refine_bboxes(anchor_list, bbox_pred,
+                                                  batch_img_metas)
+
+        predictions = self.stages[-1].predict_by_feat(
+            anchor_list,
+            cls_score,
+            bbox_pred,
+            batch_img_metas=batch_img_metas,
+            cfg=proposal_cfg)
+        return losses, predictions
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in x]
+        device = x[0].device
+        anchor_list, _ = self.stages[0].get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        for i in range(self.num_stages):
+            stage = self.stages[i]
+            if stage.adapt_cfg['type'] == 'offset':
+                offset_list = stage.anchor_offset(anchor_list,
+                                                  stage.anchor_strides,
+                                                  featmap_sizes)
+            else:
+                offset_list = None
+            x, cls_score, bbox_pred = stage(x, offset_list)
+            if i < self.num_stages - 1:
+                anchor_list = stage.refine_bboxes(anchor_list, bbox_pred,
+                                                  batch_img_metas)
+
+        predictions = self.stages[-1].predict_by_feat(
+            anchor_list,
+            cls_score,
+            bbox_pred,
+            batch_img_metas=batch_img_metas,
+            rescale=rescale)
+        return predictions
diff --git a/mmde/mmdet/models/dense_heads/centernet_head.py b/mmde/mmdet/models/dense_heads/centernet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..09f3e599eb176965e53f270014cbd326858b7c17
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/centernet_head.py
@@ -0,0 +1,447 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.ops import batched_nms
+from mmengine.config import ConfigDict
+from mmengine.model import bias_init_with_prob, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList, OptMultiConfig)
+from ..utils import (gaussian_radius, gen_gaussian_target, get_local_maximum,
+                     get_topk_from_heatmap, multi_apply,
+                     transpose_and_gather_feat)
+from .base_dense_head import BaseDenseHead
+
+
+@MODELS.register_module()
+class CenterNetHead(BaseDenseHead):
+    """Objects as Points Head. CenterHead use center_point to indicate object's
+    position. Paper link <https://arxiv.org/abs/1904.07850>
+
+    Args:
+        in_channels (int): Number of channel in the input feature map.
+        feat_channels (int): Number of channel in the intermediate feature map.
+        num_classes (int): Number of categories excluding the background
+            category.
+        loss_center_heatmap (:obj:`ConfigDict` or dict): Config of center
+            heatmap loss. Defaults to
+            dict(type='GaussianFocalLoss', loss_weight=1.0)
+        loss_wh (:obj:`ConfigDict` or dict): Config of wh loss. Defaults to
+             dict(type='L1Loss', loss_weight=0.1).
+        loss_offset (:obj:`ConfigDict` or dict): Config of offset loss.
+            Defaults to dict(type='L1Loss', loss_weight=1.0).
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config.
+            Useless in CenterNet, but we keep this variable for
+            SingleStageDetector.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config
+            of CenterNet.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`], optional): Initialization
+            config dict.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 feat_channels: int,
+                 num_classes: int,
+                 loss_center_heatmap: ConfigType = dict(
+                     type='GaussianFocalLoss', loss_weight=1.0),
+                 loss_wh: ConfigType = dict(type='L1Loss', loss_weight=0.1),
+                 loss_offset: ConfigType = dict(
+                     type='L1Loss', loss_weight=1.0),
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.heatmap_head = self._build_head(in_channels, feat_channels,
+                                             num_classes)
+        self.wh_head = self._build_head(in_channels, feat_channels, 2)
+        self.offset_head = self._build_head(in_channels, feat_channels, 2)
+
+        self.loss_center_heatmap = MODELS.build(loss_center_heatmap)
+        self.loss_wh = MODELS.build(loss_wh)
+        self.loss_offset = MODELS.build(loss_offset)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.fp16_enabled = False
+
+    def _build_head(self, in_channels: int, feat_channels: int,
+                    out_channels: int) -> nn.Sequential:
+        """Build head for each branch."""
+        layer = nn.Sequential(
+            nn.Conv2d(in_channels, feat_channels, kernel_size=3, padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(feat_channels, out_channels, kernel_size=1))
+        return layer
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        bias_init = bias_init_with_prob(0.1)
+        self.heatmap_head[-1].bias.data.fill_(bias_init)
+        for head in [self.wh_head, self.offset_head]:
+            for m in head.modules():
+                if isinstance(m, nn.Conv2d):
+                    normal_init(m, std=0.001)
+
+    def forward(self, x: Tuple[Tensor, ...]) -> Tuple[List[Tensor]]:
+        """Forward features. Notice CenterNet head does not use FPN.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            center_heatmap_preds (list[Tensor]): center predict heatmaps for
+                all levels, the channels number is num_classes.
+            wh_preds (list[Tensor]): wh predicts for all levels, the channels
+                number is 2.
+            offset_preds (list[Tensor]): offset predicts for all levels, the
+               channels number is 2.
+        """
+        return multi_apply(self.forward_single, x)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]:
+        """Forward feature of a single level.
+
+        Args:
+            x (Tensor): Feature of a single level.
+
+        Returns:
+            center_heatmap_pred (Tensor): center predict heatmaps, the
+               channels number is num_classes.
+            wh_pred (Tensor): wh predicts, the channels number is 2.
+            offset_pred (Tensor): offset predicts, the channels number is 2.
+        """
+        center_heatmap_pred = self.heatmap_head(x).sigmoid()
+        wh_pred = self.wh_head(x)
+        offset_pred = self.offset_head(x)
+        return center_heatmap_pred, wh_pred, offset_pred
+
+    def loss_by_feat(
+            self,
+            center_heatmap_preds: List[Tensor],
+            wh_preds: List[Tensor],
+            offset_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Compute losses of the head.
+
+        Args:
+            center_heatmap_preds (list[Tensor]): center predict heatmaps for
+               all levels with shape (B, num_classes, H, W).
+            wh_preds (list[Tensor]): wh predicts for all levels with
+               shape (B, 2, H, W).
+            offset_preds (list[Tensor]): offset predicts for all levels
+               with shape (B, 2, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: which has components below:
+                - loss_center_heatmap (Tensor): loss of center heatmap.
+                - loss_wh (Tensor): loss of hw heatmap
+                - loss_offset (Tensor): loss of offset heatmap.
+        """
+        assert len(center_heatmap_preds) == len(wh_preds) == len(
+            offset_preds) == 1
+        center_heatmap_pred = center_heatmap_preds[0]
+        wh_pred = wh_preds[0]
+        offset_pred = offset_preds[0]
+
+        gt_bboxes = [
+            gt_instances.bboxes for gt_instances in batch_gt_instances
+        ]
+        gt_labels = [
+            gt_instances.labels for gt_instances in batch_gt_instances
+        ]
+        img_shape = batch_img_metas[0]['batch_input_shape']
+        target_result, avg_factor = self.get_targets(gt_bboxes, gt_labels,
+                                                     center_heatmap_pred.shape,
+                                                     img_shape)
+
+        center_heatmap_target = target_result['center_heatmap_target']
+        wh_target = target_result['wh_target']
+        offset_target = target_result['offset_target']
+        wh_offset_target_weight = target_result['wh_offset_target_weight']
+
+        # Since the channel of wh_target and offset_target is 2, the avg_factor
+        # of loss_center_heatmap is always 1/2 of loss_wh and loss_offset.
+        loss_center_heatmap = self.loss_center_heatmap(
+            center_heatmap_pred, center_heatmap_target, avg_factor=avg_factor)
+        loss_wh = self.loss_wh(
+            wh_pred,
+            wh_target,
+            wh_offset_target_weight,
+            avg_factor=avg_factor * 2)
+        loss_offset = self.loss_offset(
+            offset_pred,
+            offset_target,
+            wh_offset_target_weight,
+            avg_factor=avg_factor * 2)
+        return dict(
+            loss_center_heatmap=loss_center_heatmap,
+            loss_wh=loss_wh,
+            loss_offset=loss_offset)
+
+    def get_targets(self, gt_bboxes: List[Tensor], gt_labels: List[Tensor],
+                    feat_shape: tuple, img_shape: tuple) -> Tuple[dict, int]:
+        """Compute regression and classification targets in multiple images.
+
+        Args:
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): class indices corresponding to each box.
+            feat_shape (tuple): feature map shape with value [B, _, H, W]
+            img_shape (tuple): image shape.
+
+        Returns:
+            tuple[dict, float]: The float value is mean avg_factor, the dict
+            has components below:
+               - center_heatmap_target (Tensor): targets of center heatmap, \
+                   shape (B, num_classes, H, W).
+               - wh_target (Tensor): targets of wh predict, shape \
+                   (B, 2, H, W).
+               - offset_target (Tensor): targets of offset predict, shape \
+                   (B, 2, H, W).
+               - wh_offset_target_weight (Tensor): weights of wh and offset \
+                   predict, shape (B, 2, H, W).
+        """
+        img_h, img_w = img_shape[:2]
+        bs, _, feat_h, feat_w = feat_shape
+
+        width_ratio = float(feat_w / img_w)
+        height_ratio = float(feat_h / img_h)
+
+        center_heatmap_target = gt_bboxes[-1].new_zeros(
+            [bs, self.num_classes, feat_h, feat_w])
+        wh_target = gt_bboxes[-1].new_zeros([bs, 2, feat_h, feat_w])
+        offset_target = gt_bboxes[-1].new_zeros([bs, 2, feat_h, feat_w])
+        wh_offset_target_weight = gt_bboxes[-1].new_zeros(
+            [bs, 2, feat_h, feat_w])
+
+        for batch_id in range(bs):
+            gt_bbox = gt_bboxes[batch_id]
+            gt_label = gt_labels[batch_id]
+            center_x = (gt_bbox[:, [0]] + gt_bbox[:, [2]]) * width_ratio / 2
+            center_y = (gt_bbox[:, [1]] + gt_bbox[:, [3]]) * height_ratio / 2
+            gt_centers = torch.cat((center_x, center_y), dim=1)
+
+            for j, ct in enumerate(gt_centers):
+                ctx_int, cty_int = ct.int()
+                ctx, cty = ct
+                scale_box_h = (gt_bbox[j][3] - gt_bbox[j][1]) * height_ratio
+                scale_box_w = (gt_bbox[j][2] - gt_bbox[j][0]) * width_ratio
+                radius = gaussian_radius([scale_box_h, scale_box_w],
+                                         min_overlap=0.3)
+                radius = max(0, int(radius))
+                ind = gt_label[j]
+                gen_gaussian_target(center_heatmap_target[batch_id, ind],
+                                    [ctx_int, cty_int], radius)
+
+                wh_target[batch_id, 0, cty_int, ctx_int] = scale_box_w
+                wh_target[batch_id, 1, cty_int, ctx_int] = scale_box_h
+
+                offset_target[batch_id, 0, cty_int, ctx_int] = ctx - ctx_int
+                offset_target[batch_id, 1, cty_int, ctx_int] = cty - cty_int
+
+                wh_offset_target_weight[batch_id, :, cty_int, ctx_int] = 1
+
+        avg_factor = max(1, center_heatmap_target.eq(1).sum())
+        target_result = dict(
+            center_heatmap_target=center_heatmap_target,
+            wh_target=wh_target,
+            offset_target=offset_target,
+            wh_offset_target_weight=wh_offset_target_weight)
+        return target_result, avg_factor
+
+    def predict_by_feat(self,
+                        center_heatmap_preds: List[Tensor],
+                        wh_preds: List[Tensor],
+                        offset_preds: List[Tensor],
+                        batch_img_metas: Optional[List[dict]] = None,
+                        rescale: bool = True,
+                        with_nms: bool = False) -> InstanceList:
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            center_heatmap_preds (list[Tensor]): Center predict heatmaps for
+                all levels with shape (B, num_classes, H, W).
+            wh_preds (list[Tensor]): WH predicts for all levels with
+                shape (B, 2, H, W).
+            offset_preds (list[Tensor]): Offset predicts for all levels
+                with shape (B, 2, H, W).
+            batch_img_metas (list[dict], optional): Batch image meta info.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to True.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Instance segmentation
+            results of each image after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(center_heatmap_preds) == len(wh_preds) == len(
+            offset_preds) == 1
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            result_list.append(
+                self._predict_by_feat_single(
+                    center_heatmap_preds[0][img_id:img_id + 1, ...],
+                    wh_preds[0][img_id:img_id + 1, ...],
+                    offset_preds[0][img_id:img_id + 1, ...],
+                    batch_img_metas[img_id],
+                    rescale=rescale,
+                    with_nms=with_nms))
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                center_heatmap_pred: Tensor,
+                                wh_pred: Tensor,
+                                offset_pred: Tensor,
+                                img_meta: dict,
+                                rescale: bool = True,
+                                with_nms: bool = False) -> InstanceData:
+        """Transform outputs of a single image into bbox results.
+
+        Args:
+            center_heatmap_pred (Tensor): Center heatmap for current level with
+                shape (1, num_classes, H, W).
+            wh_pred (Tensor): WH heatmap for current level with shape
+                (1, num_classes, H, W).
+            offset_pred (Tensor): Offset for current level with shape
+                (1, corner_offset_channels, H, W).
+            img_meta (dict): Meta information of current image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to True.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to False.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        batch_det_bboxes, batch_labels = self._decode_heatmap(
+            center_heatmap_pred,
+            wh_pred,
+            offset_pred,
+            img_meta['batch_input_shape'],
+            k=self.test_cfg.topk,
+            kernel=self.test_cfg.local_maximum_kernel)
+
+        det_bboxes = batch_det_bboxes.view([-1, 5])
+        det_labels = batch_labels.view(-1)
+
+        batch_border = det_bboxes.new_tensor(img_meta['border'])[...,
+                                                                 [2, 0, 2, 0]]
+        det_bboxes[..., :4] -= batch_border
+
+        if rescale and 'scale_factor' in img_meta:
+            det_bboxes[..., :4] /= det_bboxes.new_tensor(
+                img_meta['scale_factor']).repeat((1, 2))
+
+        if with_nms:
+            det_bboxes, det_labels = self._bboxes_nms(det_bboxes, det_labels,
+                                                      self.test_cfg)
+        results = InstanceData()
+        results.bboxes = det_bboxes[..., :4]
+        results.scores = det_bboxes[..., 4]
+        results.labels = det_labels
+        return results
+
+    def _decode_heatmap(self,
+                        center_heatmap_pred: Tensor,
+                        wh_pred: Tensor,
+                        offset_pred: Tensor,
+                        img_shape: tuple,
+                        k: int = 100,
+                        kernel: int = 3) -> Tuple[Tensor, Tensor]:
+        """Transform outputs into detections raw bbox prediction.
+
+        Args:
+            center_heatmap_pred (Tensor): center predict heatmap,
+               shape (B, num_classes, H, W).
+            wh_pred (Tensor): wh predict, shape (B, 2, H, W).
+            offset_pred (Tensor): offset predict, shape (B, 2, H, W).
+            img_shape (tuple): image shape in hw format.
+            k (int): Get top k center keypoints from heatmap. Defaults to 100.
+            kernel (int): Max pooling kernel for extract local maximum pixels.
+               Defaults to 3.
+
+        Returns:
+            tuple[Tensor]: Decoded output of CenterNetHead, containing
+               the following Tensors:
+
+              - batch_bboxes (Tensor): Coords of each box with shape (B, k, 5)
+              - batch_topk_labels (Tensor): Categories of each box with \
+                  shape (B, k)
+        """
+        height, width = center_heatmap_pred.shape[2:]
+        inp_h, inp_w = img_shape
+
+        center_heatmap_pred = get_local_maximum(
+            center_heatmap_pred, kernel=kernel)
+
+        *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
+            center_heatmap_pred, k=k)
+        batch_scores, batch_index, batch_topk_labels = batch_dets
+
+        wh = transpose_and_gather_feat(wh_pred, batch_index)
+        offset = transpose_and_gather_feat(offset_pred, batch_index)
+        topk_xs = topk_xs + offset[..., 0]
+        topk_ys = topk_ys + offset[..., 1]
+        tl_x = (topk_xs - wh[..., 0] / 2) * (inp_w / width)
+        tl_y = (topk_ys - wh[..., 1] / 2) * (inp_h / height)
+        br_x = (topk_xs + wh[..., 0] / 2) * (inp_w / width)
+        br_y = (topk_ys + wh[..., 1] / 2) * (inp_h / height)
+
+        batch_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], dim=2)
+        batch_bboxes = torch.cat((batch_bboxes, batch_scores[..., None]),
+                                 dim=-1)
+        return batch_bboxes, batch_topk_labels
+
+    def _bboxes_nms(self, bboxes: Tensor, labels: Tensor,
+                    cfg: ConfigDict) -> Tuple[Tensor, Tensor]:
+        """bboxes nms."""
+        if labels.numel() > 0:
+            max_num = cfg.max_per_img
+            bboxes, keep = batched_nms(bboxes[:, :4], bboxes[:,
+                                                             -1].contiguous(),
+                                       labels, cfg.nms)
+            if max_num > 0:
+                bboxes = bboxes[:max_num]
+                labels = labels[keep][:max_num]
+
+        return bboxes, labels
diff --git a/mmde/mmdet/models/dense_heads/centernet_update_head.py b/mmde/mmdet/models/dense_heads/centernet_update_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..00cfcb89806209c9416b1bd7e9a14d82a4911175
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/centernet_update_head.py
@@ -0,0 +1,624 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import Scale
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox2distance
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList, reduce_mean)
+from ..utils import multi_apply
+from .anchor_free_head import AnchorFreeHead
+
+INF = 1000000000
+RangeType = Sequence[Tuple[int, int]]
+
+
+def _transpose(tensor_list: List[Tensor],
+               num_point_list: list) -> List[Tensor]:
+    """This function is used to transpose image first tensors to level first
+    ones."""
+    for img_idx in range(len(tensor_list)):
+        tensor_list[img_idx] = torch.split(
+            tensor_list[img_idx], num_point_list, dim=0)
+
+    tensors_level_first = []
+    for targets_per_level in zip(*tensor_list):
+        tensors_level_first.append(torch.cat(targets_per_level, dim=0))
+    return tensors_level_first
+
+
+@MODELS.register_module()
+class CenterNetUpdateHead(AnchorFreeHead):
+    """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2.
+    Paper link `<https://arxiv.org/abs/2103.07461>`_.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channel in the input feature map.
+        regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple
+            level points.
+        hm_min_radius (int): Heatmap target minimum radius of cls branch.
+            Defaults to 4.
+        hm_min_overlap (float): Heatmap target minimum overlap of cls branch.
+            Defaults to 0.8.
+        more_pos_thresh (float): The filtering threshold when the cls branch
+            adds more positive samples. Defaults to 0.2.
+        more_pos_topk (int): The maximum number of additional positive samples
+            added to each gt. Defaults to 9.
+        soft_weight_on_reg (bool): Whether to use the soft target of the
+            cls branch as the soft weight of the bbox branch.
+            Defaults to False.
+        loss_cls (:obj:`ConfigDict` or dict): Config of cls loss. Defaults to
+            dict(type='GaussianFocalLoss', loss_weight=1.0)
+        loss_bbox (:obj:`ConfigDict` or dict): Config of bbox loss. Defaults to
+             dict(type='GIoULoss', loss_weight=2.0).
+        norm_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct
+            and config norm layer.  Defaults to
+            ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config.
+            Unused in CenterNet. Reserved for compatibility with
+            SingleStageDetector.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config
+            of CenterNet.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 regress_ranges: RangeType = ((0, 80), (64, 160), (128, 320),
+                                              (256, 640), (512, INF)),
+                 hm_min_radius: int = 4,
+                 hm_min_overlap: float = 0.8,
+                 more_pos_thresh: float = 0.2,
+                 more_pos_topk: int = 9,
+                 soft_weight_on_reg: bool = False,
+                 loss_cls: ConfigType = dict(
+                     type='GaussianFocalLoss',
+                     pos_weight=0.25,
+                     neg_weight=0.75,
+                     loss_weight=1.0),
+                 loss_bbox: ConfigType = dict(
+                     type='GIoULoss', loss_weight=2.0),
+                 norm_cfg: OptConfigType = dict(
+                     type='GN', num_groups=32, requires_grad=True),
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 **kwargs) -> None:
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            norm_cfg=norm_cfg,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            **kwargs)
+        self.soft_weight_on_reg = soft_weight_on_reg
+        self.hm_min_radius = hm_min_radius
+        self.more_pos_thresh = more_pos_thresh
+        self.more_pos_topk = more_pos_topk
+        self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap)
+        self.sigmoid_clamp = 0.0001
+
+        # GaussianFocalLoss must be sigmoid mode
+        self.use_sigmoid_cls = True
+        self.cls_out_channels = num_classes
+
+        self.regress_ranges = regress_ranges
+        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
+
+    def _init_predictor(self) -> None:
+        """Initialize predictor layers of the head."""
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.num_classes, 3, padding=1)
+        self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of each level outputs.
+
+            - cls_scores (list[Tensor]): Box scores for each scale level, \
+            each is a 4D-tensor, the channel number is num_classes.
+            - bbox_preds (list[Tensor]): Box energies / deltas for each \
+            scale level, each is a 4D-tensor, the channel number is 4.
+        """
+        return multi_apply(self.forward_single, x, self.scales, self.strides)
+
+    def forward_single(self, x: Tensor, scale: Scale,
+                       stride: int) -> Tuple[Tensor, Tensor]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            stride (int): The corresponding stride for feature maps.
+
+        Returns:
+            tuple: scores for each class, bbox predictions of
+            input feature maps.
+        """
+        cls_score, bbox_pred, _, _ = super().forward_single(x)
+        # scale the bbox_pred of different level
+        # float to avoid overflow when enabling FP16
+        bbox_pred = scale(bbox_pred).float()
+        # bbox_pred needed for gradient computation has been modified
+        # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace
+        # F.relu(bbox_pred) with bbox_pred.clamp(min=0)
+        bbox_pred = bbox_pred.clamp(min=0)
+        if not self.training:
+            bbox_pred *= stride
+        return cls_score, bbox_pred
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is 4.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_imgs = cls_scores[0].size(0)
+        assert len(cls_scores) == len(bbox_preds)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        all_level_points = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device)
+
+        # 1 flatten outputs
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_cls_scores = torch.cat(flatten_cls_scores)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
+
+        # repeat points to align with bbox_preds
+        flatten_points = torch.cat(
+            [points.repeat(num_imgs, 1) for points in all_level_points])
+
+        assert (torch.isfinite(flatten_bbox_preds).all().item())
+
+        # 2 calc reg and cls branch targets
+        cls_targets, bbox_targets = self.get_targets(all_level_points,
+                                                     batch_gt_instances)
+
+        # 3 add more pos index for cls branch
+        featmap_sizes = flatten_points.new_tensor(featmap_sizes)
+        pos_inds, cls_labels = self.add_cls_pos_inds(flatten_points,
+                                                     flatten_bbox_preds,
+                                                     featmap_sizes,
+                                                     batch_gt_instances)
+
+        # 4 calc cls loss
+        if pos_inds is None:
+            # num_gts=0
+            num_pos_cls = bbox_preds[0].new_tensor(0, dtype=torch.float)
+        else:
+            num_pos_cls = bbox_preds[0].new_tensor(
+                len(pos_inds), dtype=torch.float)
+        num_pos_cls = max(reduce_mean(num_pos_cls), 1.0)
+        flatten_cls_scores = flatten_cls_scores.sigmoid().clamp(
+            min=self.sigmoid_clamp, max=1 - self.sigmoid_clamp)
+        cls_loss = self.loss_cls(
+            flatten_cls_scores,
+            cls_targets,
+            pos_inds=pos_inds,
+            pos_labels=cls_labels,
+            avg_factor=num_pos_cls)
+
+        # 5 calc reg loss
+        pos_bbox_inds = torch.nonzero(
+            bbox_targets.max(dim=1)[0] >= 0).squeeze(1)
+        pos_bbox_preds = flatten_bbox_preds[pos_bbox_inds]
+        pos_bbox_targets = bbox_targets[pos_bbox_inds]
+
+        bbox_weight_map = cls_targets.max(dim=1)[0]
+        bbox_weight_map = bbox_weight_map[pos_bbox_inds]
+        bbox_weight_map = bbox_weight_map if self.soft_weight_on_reg \
+            else torch.ones_like(bbox_weight_map)
+        num_pos_bbox = max(reduce_mean(bbox_weight_map.sum()), 1.0)
+
+        if len(pos_bbox_inds) > 0:
+            pos_points = flatten_points[pos_bbox_inds]
+            pos_decoded_bbox_preds = self.bbox_coder.decode(
+                pos_points, pos_bbox_preds)
+            pos_decoded_target_preds = self.bbox_coder.decode(
+                pos_points, pos_bbox_targets)
+            bbox_loss = self.loss_bbox(
+                pos_decoded_bbox_preds,
+                pos_decoded_target_preds,
+                weight=bbox_weight_map,
+                avg_factor=num_pos_bbox)
+        else:
+            bbox_loss = flatten_bbox_preds.sum() * 0
+
+        return dict(loss_cls=cls_loss, loss_bbox=bbox_loss)
+
+    def get_targets(
+        self,
+        points: List[Tensor],
+        batch_gt_instances: InstanceList,
+    ) -> Tuple[Tensor, Tensor]:
+        """Compute classification and bbox targets for points in multiple
+        images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            tuple: Targets of each level.
+
+            - concat_lvl_labels (Tensor): Labels of all level and batch.
+            - concat_lvl_bbox_targets (Tensor): BBox targets of all \
+            level and batch.
+        """
+        assert len(points) == len(self.regress_ranges)
+
+        num_levels = len(points)
+        # the number of points per img, per lvl
+        num_points = [center.size(0) for center in points]
+
+        # expand regress ranges to align with points
+        expanded_regress_ranges = [
+            points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
+                points[i]) for i in range(num_levels)
+        ]
+        # concat all levels points and regress ranges
+        concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
+        concat_points = torch.cat(points, dim=0)
+        concat_strides = torch.cat([
+            concat_points.new_ones(num_points[i]) * self.strides[i]
+            for i in range(num_levels)
+        ])
+
+        # get labels and bbox_targets of each image
+        cls_targets_list, bbox_targets_list = multi_apply(
+            self._get_targets_single,
+            batch_gt_instances,
+            points=concat_points,
+            regress_ranges=concat_regress_ranges,
+            strides=concat_strides)
+
+        bbox_targets_list = _transpose(bbox_targets_list, num_points)
+        cls_targets_list = _transpose(cls_targets_list, num_points)
+        concat_lvl_bbox_targets = torch.cat(bbox_targets_list, 0)
+        concat_lvl_cls_targets = torch.cat(cls_targets_list, dim=0)
+        return concat_lvl_cls_targets, concat_lvl_bbox_targets
+
+    def _get_targets_single(self, gt_instances: InstanceData, points: Tensor,
+                            regress_ranges: Tensor,
+                            strides: Tensor) -> Tuple[Tensor, Tensor]:
+        """Compute classification and bbox targets for a single image."""
+        num_points = points.size(0)
+        num_gts = len(gt_instances)
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+
+        if num_gts == 0:
+            return gt_labels.new_full((num_points,
+                                       self.num_classes),
+                                      self.num_classes), \
+                   gt_bboxes.new_full((num_points, 4), -1)
+
+        # Calculate the regression tblr target corresponding to all points
+        points = points[:, None].expand(num_points, num_gts, 2)
+        gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
+        strides = strides[:, None, None].expand(num_points, num_gts, 2)
+
+        bbox_target = bbox2distance(points, gt_bboxes)  # M x N x 4
+
+        # condition1: inside a gt bbox
+        inside_gt_bbox_mask = bbox_target.min(dim=2)[0] > 0  # M x N
+
+        # condition2: Calculate the nearest points from
+        # the upper, lower, left and right ranges from
+        # the center of the gt bbox
+        centers = ((gt_bboxes[..., [0, 1]] + gt_bboxes[..., [2, 3]]) / 2)
+        centers_discret = ((centers / strides).int() * strides).float() + \
+            strides / 2
+
+        centers_discret_dist = points - centers_discret
+        dist_x = centers_discret_dist[..., 0].abs()
+        dist_y = centers_discret_dist[..., 1].abs()
+        inside_gt_center3x3_mask = (dist_x <= strides[..., 0]) & \
+                                   (dist_y <= strides[..., 0])
+
+        # condition3: limit the regression range for each location
+        bbox_target_wh = bbox_target[..., :2] + bbox_target[..., 2:]
+        crit = (bbox_target_wh**2).sum(dim=2)**0.5 / 2
+        inside_fpn_level_mask = (crit >= regress_ranges[:, [0]]) & \
+                                (crit <= regress_ranges[:, [1]])
+        bbox_target_mask = inside_gt_bbox_mask & \
+            inside_gt_center3x3_mask & \
+            inside_fpn_level_mask
+
+        # Calculate the distance weight map
+        gt_center_peak_mask = ((centers_discret_dist**2).sum(dim=2) == 0)
+        weighted_dist = ((points - centers)**2).sum(dim=2)  # M x N
+        weighted_dist[gt_center_peak_mask] = 0
+
+        areas = (gt_bboxes[..., 2] - gt_bboxes[..., 0]) * (
+            gt_bboxes[..., 3] - gt_bboxes[..., 1])
+        radius = self.delta**2 * 2 * areas
+        radius = torch.clamp(radius, min=self.hm_min_radius**2)
+        weighted_dist = weighted_dist / radius
+
+        # Calculate bbox_target
+        bbox_weighted_dist = weighted_dist.clone()
+        bbox_weighted_dist[bbox_target_mask == 0] = INF * 1.0
+        min_dist, min_inds = bbox_weighted_dist.min(dim=1)
+        bbox_target = bbox_target[range(len(bbox_target)),
+                                  min_inds]  # M x N x 4 --> M x 4
+        bbox_target[min_dist == INF] = -INF
+
+        # Convert to feature map scale
+        bbox_target /= strides[:, 0, :].repeat(1, 2)
+
+        # Calculate cls_target
+        cls_target = self._create_heatmaps_from_dist(weighted_dist, gt_labels)
+
+        return cls_target, bbox_target
+
+    @torch.no_grad()
+    def add_cls_pos_inds(
+        self, flatten_points: Tensor, flatten_bbox_preds: Tensor,
+        featmap_sizes: Tensor, batch_gt_instances: InstanceList
+    ) -> Tuple[Optional[Tensor], Optional[Tensor]]:
+        """Provide additional adaptive positive samples to the classification
+        branch.
+
+        Args:
+            flatten_points (Tensor): The point after flatten, including
+                batch image and all levels. The shape is (N, 2).
+            flatten_bbox_preds (Tensor): The bbox predicts after flatten,
+                including batch image and all levels. The shape is (N, 4).
+            featmap_sizes (Tensor): Feature map size of all layers.
+                The shape is (5, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+           tuple:
+
+           - pos_inds (Tensor): Adaptively selected positive sample index.
+           - cls_labels (Tensor): Corresponding positive class label.
+        """
+        outputs = self._get_center3x3_region_index_targets(
+            batch_gt_instances, featmap_sizes)
+        cls_labels, fpn_level_masks, center3x3_inds, \
+            center3x3_bbox_targets, center3x3_masks = outputs
+
+        num_gts, total_level, K = cls_labels.shape[0], len(
+            self.strides), center3x3_masks.shape[-1]
+
+        if num_gts == 0:
+            return None, None
+
+        # The out-of-bounds index is forcibly set to 0
+        # to prevent loss calculation errors
+        center3x3_inds[center3x3_masks == 0] = 0
+        reg_pred_center3x3 = flatten_bbox_preds[center3x3_inds]
+        center3x3_points = flatten_points[center3x3_inds].view(-1, 2)
+
+        center3x3_bbox_targets_expand = center3x3_bbox_targets.view(
+            -1, 4).clamp(min=0)
+
+        pos_decoded_bbox_preds = self.bbox_coder.decode(
+            center3x3_points, reg_pred_center3x3.view(-1, 4))
+        pos_decoded_target_preds = self.bbox_coder.decode(
+            center3x3_points, center3x3_bbox_targets_expand)
+        center3x3_bbox_loss = self.loss_bbox(
+            pos_decoded_bbox_preds,
+            pos_decoded_target_preds,
+            None,
+            reduction_override='none').view(num_gts, total_level,
+                                            K) / self.loss_bbox.loss_weight
+
+        # Invalid index Loss set to infinity
+        center3x3_bbox_loss[center3x3_masks == 0] = INF
+
+        # 4 is the center point of the sampled 9 points, the center point
+        # of gt bbox after discretization.
+        # The center point of gt bbox after discretization
+        # must be a positive sample, so we force its loss to be set to 0.
+        center3x3_bbox_loss.view(-1, K)[fpn_level_masks.view(-1), 4] = 0
+        center3x3_bbox_loss = center3x3_bbox_loss.view(num_gts, -1)
+
+        loss_thr = torch.kthvalue(
+            center3x3_bbox_loss, self.more_pos_topk, dim=1)[0]
+
+        loss_thr[loss_thr > self.more_pos_thresh] = self.more_pos_thresh
+        new_pos = center3x3_bbox_loss < loss_thr.view(num_gts, 1)
+        pos_inds = center3x3_inds.view(num_gts, -1)[new_pos]
+        cls_labels = cls_labels.view(num_gts,
+                                     1).expand(num_gts,
+                                               total_level * K)[new_pos]
+        return pos_inds, cls_labels
+
+    def _create_heatmaps_from_dist(self, weighted_dist: Tensor,
+                                   cls_labels: Tensor) -> Tensor:
+        """Generate heatmaps of classification branch based on weighted
+        distance map."""
+        heatmaps = weighted_dist.new_zeros(
+            (weighted_dist.shape[0], self.num_classes))
+        for c in range(self.num_classes):
+            inds = (cls_labels == c)  # N
+            if inds.int().sum() == 0:
+                continue
+            heatmaps[:, c] = torch.exp(-weighted_dist[:, inds].min(dim=1)[0])
+            zeros = heatmaps[:, c] < 1e-4
+            heatmaps[zeros, c] = 0
+        return heatmaps
+
+    def _get_center3x3_region_index_targets(self,
+                                            bacth_gt_instances: InstanceList,
+                                            shapes_per_level: Tensor) -> tuple:
+        """Get the center (and the 3x3 region near center) locations and target
+        of each objects."""
+        cls_labels = []
+        inside_fpn_level_masks = []
+        center3x3_inds = []
+        center3x3_masks = []
+        center3x3_bbox_targets = []
+
+        total_levels = len(self.strides)
+        batch = len(bacth_gt_instances)
+
+        shapes_per_level = shapes_per_level.long()
+        area_per_level = (shapes_per_level[:, 0] * shapes_per_level[:, 1])
+
+        # Select a total of 9 positions of 3x3 in the center of the gt bbox
+        # as candidate positive samples
+        K = 9
+        dx = shapes_per_level.new_tensor([-1, 0, 1, -1, 0, 1, -1, 0,
+                                          1]).view(1, 1, K)
+        dy = shapes_per_level.new_tensor([-1, -1, -1, 0, 0, 0, 1, 1,
+                                          1]).view(1, 1, K)
+
+        regress_ranges = shapes_per_level.new_tensor(self.regress_ranges).view(
+            len(self.regress_ranges), 2)  # L x 2
+        strides = shapes_per_level.new_tensor(self.strides)
+
+        start_coord_pre_level = []
+        _start = 0
+        for level in range(total_levels):
+            start_coord_pre_level.append(_start)
+            _start = _start + batch * area_per_level[level]
+        start_coord_pre_level = shapes_per_level.new_tensor(
+            start_coord_pre_level).view(1, total_levels, 1)
+        area_per_level = area_per_level.view(1, total_levels, 1)
+
+        for im_i in range(batch):
+            gt_instance = bacth_gt_instances[im_i]
+            gt_bboxes = gt_instance.bboxes
+            gt_labels = gt_instance.labels
+            num_gts = gt_bboxes.shape[0]
+            if num_gts == 0:
+                continue
+
+            cls_labels.append(gt_labels)
+
+            gt_bboxes = gt_bboxes[:, None].expand(num_gts, total_levels, 4)
+            expanded_strides = strides[None, :,
+                                       None].expand(num_gts, total_levels, 2)
+            expanded_regress_ranges = regress_ranges[None].expand(
+                num_gts, total_levels, 2)
+            expanded_shapes_per_level = shapes_per_level[None].expand(
+                num_gts, total_levels, 2)
+
+            # calc reg_target
+            centers = ((gt_bboxes[..., [0, 1]] + gt_bboxes[..., [2, 3]]) / 2)
+            centers_inds = (centers / expanded_strides).long()
+            centers_discret = centers_inds * expanded_strides \
+                + expanded_strides // 2
+
+            bbox_target = bbox2distance(centers_discret,
+                                        gt_bboxes)  # M x N x 4
+
+            # calc inside_fpn_level_mask
+            bbox_target_wh = bbox_target[..., :2] + bbox_target[..., 2:]
+            crit = (bbox_target_wh**2).sum(dim=2)**0.5 / 2
+            inside_fpn_level_mask = \
+                (crit >= expanded_regress_ranges[..., 0]) & \
+                (crit <= expanded_regress_ranges[..., 1])
+
+            inside_gt_bbox_mask = bbox_target.min(dim=2)[0] >= 0
+            inside_fpn_level_mask = inside_gt_bbox_mask & inside_fpn_level_mask
+            inside_fpn_level_masks.append(inside_fpn_level_mask)
+
+            # calc center3x3_ind and mask
+            expand_ws = expanded_shapes_per_level[..., 1:2].expand(
+                num_gts, total_levels, K)
+            expand_hs = expanded_shapes_per_level[..., 0:1].expand(
+                num_gts, total_levels, K)
+            centers_inds_x = centers_inds[..., 0:1]
+            centers_inds_y = centers_inds[..., 1:2]
+
+            center3x3_idx = start_coord_pre_level + \
+                im_i * area_per_level + \
+                (centers_inds_y + dy) * expand_ws + \
+                (centers_inds_x + dx)
+            center3x3_mask = \
+                ((centers_inds_y + dy) < expand_hs) & \
+                ((centers_inds_y + dy) >= 0) & \
+                ((centers_inds_x + dx) < expand_ws) & \
+                ((centers_inds_x + dx) >= 0)
+
+            # recalc center3x3 region reg target
+            bbox_target = bbox_target / expanded_strides.repeat(1, 1, 2)
+            center3x3_bbox_target = bbox_target[..., None, :].expand(
+                num_gts, total_levels, K, 4).clone()
+            center3x3_bbox_target[..., 0] += dx
+            center3x3_bbox_target[..., 1] += dy
+            center3x3_bbox_target[..., 2] -= dx
+            center3x3_bbox_target[..., 3] -= dy
+            # update center3x3_mask
+            center3x3_mask = center3x3_mask & (
+                center3x3_bbox_target.min(dim=3)[0] >= 0)  # n x L x K
+
+            center3x3_inds.append(center3x3_idx)
+            center3x3_masks.append(center3x3_mask)
+            center3x3_bbox_targets.append(center3x3_bbox_target)
+
+        if len(inside_fpn_level_masks) > 0:
+            cls_labels = torch.cat(cls_labels, dim=0)
+            inside_fpn_level_masks = torch.cat(inside_fpn_level_masks, dim=0)
+            center3x3_inds = torch.cat(center3x3_inds, dim=0).long()
+            center3x3_bbox_targets = torch.cat(center3x3_bbox_targets, dim=0)
+            center3x3_masks = torch.cat(center3x3_masks, dim=0)
+        else:
+            cls_labels = shapes_per_level.new_zeros(0).long()
+            inside_fpn_level_masks = shapes_per_level.new_zeros(
+                (0, total_levels)).bool()
+            center3x3_inds = shapes_per_level.new_zeros(
+                (0, total_levels, K)).long()
+            center3x3_bbox_targets = shapes_per_level.new_zeros(
+                (0, total_levels, K, 4)).float()
+            center3x3_masks = shapes_per_level.new_zeros(
+                (0, total_levels, K)).bool()
+        return cls_labels, inside_fpn_level_masks, center3x3_inds, \
+            center3x3_bbox_targets, center3x3_masks
diff --git a/mmde/mmdet/models/dense_heads/centripetal_head.py b/mmde/mmdet/models/dense_heads/centripetal_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..18f6601ff82394864d53351b10b40f51eb2aec6b
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/centripetal_head.py
@@ -0,0 +1,459 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import DeformConv2d
+from mmengine.model import normal_init
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import (ConfigType, InstanceList, OptInstanceList,
+                         OptMultiConfig)
+from ..utils import multi_apply
+from .corner_head import CornerHead
+
+
+@MODELS.register_module()
+class CentripetalHead(CornerHead):
+    """Head of CentripetalNet: Pursuing High-quality Keypoint Pairs for Object
+    Detection.
+
+    CentripetalHead inherits from :class:`CornerHead`. It removes the
+    embedding branch and adds guiding shift and centripetal shift branches.
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2003.09119>`_ .
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        num_feat_levels (int): Levels of feature from the previous module.
+            2 for HourglassNet-104 and 1 for HourglassNet-52. HourglassNet-104
+            outputs the final feature and intermediate supervision feature and
+            HourglassNet-52 only outputs the final feature. Defaults to 2.
+        corner_emb_channels (int): Channel of embedding vector. Defaults to 1.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config.
+            Useless in CornerHead, but we keep this variable for
+            SingleStageDetector.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            CornerHead.
+        loss_heatmap (:obj:`ConfigDict` or dict): Config of corner heatmap
+            loss. Defaults to GaussianFocalLoss.
+        loss_embedding (:obj:`ConfigDict` or dict): Config of corner embedding
+            loss. Defaults to AssociativeEmbeddingLoss.
+        loss_offset (:obj:`ConfigDict` or dict): Config of corner offset loss.
+            Defaults to SmoothL1Loss.
+        loss_guiding_shift (:obj:`ConfigDict` or dict): Config of
+            guiding shift loss. Defaults to SmoothL1Loss.
+        loss_centripetal_shift (:obj:`ConfigDict` or dict): Config of
+            centripetal shift loss. Defaults to SmoothL1Loss.
+       init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+           the initialization.
+    """
+
+    def __init__(self,
+                 *args,
+                 centripetal_shift_channels: int = 2,
+                 guiding_shift_channels: int = 2,
+                 feat_adaption_conv_kernel: int = 3,
+                 loss_guiding_shift: ConfigType = dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=0.05),
+                 loss_centripetal_shift: ConfigType = dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=1),
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        assert centripetal_shift_channels == 2, (
+            'CentripetalHead only support centripetal_shift_channels == 2')
+        self.centripetal_shift_channels = centripetal_shift_channels
+        assert guiding_shift_channels == 2, (
+            'CentripetalHead only support guiding_shift_channels == 2')
+        self.guiding_shift_channels = guiding_shift_channels
+        self.feat_adaption_conv_kernel = feat_adaption_conv_kernel
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+        self.loss_guiding_shift = MODELS.build(loss_guiding_shift)
+        self.loss_centripetal_shift = MODELS.build(loss_centripetal_shift)
+
+    def _init_centripetal_layers(self) -> None:
+        """Initialize centripetal layers.
+
+        Including feature adaption deform convs (feat_adaption), deform offset
+        prediction convs (dcn_off), guiding shift (guiding_shift) and
+        centripetal shift ( centripetal_shift). Each branch has two parts:
+        prefix `tl_` for top-left and `br_` for bottom-right.
+        """
+        self.tl_feat_adaption = nn.ModuleList()
+        self.br_feat_adaption = nn.ModuleList()
+        self.tl_dcn_offset = nn.ModuleList()
+        self.br_dcn_offset = nn.ModuleList()
+        self.tl_guiding_shift = nn.ModuleList()
+        self.br_guiding_shift = nn.ModuleList()
+        self.tl_centripetal_shift = nn.ModuleList()
+        self.br_centripetal_shift = nn.ModuleList()
+
+        for _ in range(self.num_feat_levels):
+            self.tl_feat_adaption.append(
+                DeformConv2d(self.in_channels, self.in_channels,
+                             self.feat_adaption_conv_kernel, 1, 1))
+            self.br_feat_adaption.append(
+                DeformConv2d(self.in_channels, self.in_channels,
+                             self.feat_adaption_conv_kernel, 1, 1))
+
+            self.tl_guiding_shift.append(
+                self._make_layers(
+                    out_channels=self.guiding_shift_channels,
+                    in_channels=self.in_channels))
+            self.br_guiding_shift.append(
+                self._make_layers(
+                    out_channels=self.guiding_shift_channels,
+                    in_channels=self.in_channels))
+
+            self.tl_dcn_offset.append(
+                ConvModule(
+                    self.guiding_shift_channels,
+                    self.feat_adaption_conv_kernel**2 *
+                    self.guiding_shift_channels,
+                    1,
+                    bias=False,
+                    act_cfg=None))
+            self.br_dcn_offset.append(
+                ConvModule(
+                    self.guiding_shift_channels,
+                    self.feat_adaption_conv_kernel**2 *
+                    self.guiding_shift_channels,
+                    1,
+                    bias=False,
+                    act_cfg=None))
+
+            self.tl_centripetal_shift.append(
+                self._make_layers(
+                    out_channels=self.centripetal_shift_channels,
+                    in_channels=self.in_channels))
+            self.br_centripetal_shift.append(
+                self._make_layers(
+                    out_channels=self.centripetal_shift_channels,
+                    in_channels=self.in_channels))
+
+    def _init_layers(self) -> None:
+        """Initialize layers for CentripetalHead.
+
+        Including two parts: CornerHead layers and CentripetalHead layers
+        """
+        super()._init_layers()  # using _init_layers in CornerHead
+        self._init_centripetal_layers()
+
+    def init_weights(self) -> None:
+        super().init_weights()
+        for i in range(self.num_feat_levels):
+            normal_init(self.tl_feat_adaption[i], std=0.01)
+            normal_init(self.br_feat_adaption[i], std=0.01)
+            normal_init(self.tl_dcn_offset[i].conv, std=0.1)
+            normal_init(self.br_dcn_offset[i].conv, std=0.1)
+            _ = [x.conv.reset_parameters() for x in self.tl_guiding_shift[i]]
+            _ = [x.conv.reset_parameters() for x in self.br_guiding_shift[i]]
+            _ = [
+                x.conv.reset_parameters() for x in self.tl_centripetal_shift[i]
+            ]
+            _ = [
+                x.conv.reset_parameters() for x in self.br_centripetal_shift[i]
+            ]
+
+    def forward_single(self, x: Tensor, lvl_ind: int) -> List[Tensor]:
+        """Forward feature of a single level.
+
+        Args:
+            x (Tensor): Feature of a single level.
+            lvl_ind (int): Level index of current feature.
+
+        Returns:
+            tuple[Tensor]: A tuple of CentripetalHead's output for current
+            feature level. Containing the following Tensors:
+
+                - tl_heat (Tensor): Predicted top-left corner heatmap.
+                - br_heat (Tensor): Predicted bottom-right corner heatmap.
+                - tl_off (Tensor): Predicted top-left offset heatmap.
+                - br_off (Tensor): Predicted bottom-right offset heatmap.
+                - tl_guiding_shift (Tensor): Predicted top-left guiding shift
+                  heatmap.
+                - br_guiding_shift (Tensor): Predicted bottom-right guiding
+                  shift heatmap.
+                - tl_centripetal_shift (Tensor): Predicted top-left centripetal
+                  shift heatmap.
+                - br_centripetal_shift (Tensor): Predicted bottom-right
+                  centripetal shift heatmap.
+        """
+        tl_heat, br_heat, _, _, tl_off, br_off, tl_pool, br_pool = super(
+        ).forward_single(
+            x, lvl_ind, return_pool=True)
+
+        tl_guiding_shift = self.tl_guiding_shift[lvl_ind](tl_pool)
+        br_guiding_shift = self.br_guiding_shift[lvl_ind](br_pool)
+
+        tl_dcn_offset = self.tl_dcn_offset[lvl_ind](tl_guiding_shift.detach())
+        br_dcn_offset = self.br_dcn_offset[lvl_ind](br_guiding_shift.detach())
+
+        tl_feat_adaption = self.tl_feat_adaption[lvl_ind](tl_pool,
+                                                          tl_dcn_offset)
+        br_feat_adaption = self.br_feat_adaption[lvl_ind](br_pool,
+                                                          br_dcn_offset)
+
+        tl_centripetal_shift = self.tl_centripetal_shift[lvl_ind](
+            tl_feat_adaption)
+        br_centripetal_shift = self.br_centripetal_shift[lvl_ind](
+            br_feat_adaption)
+
+        result_list = [
+            tl_heat, br_heat, tl_off, br_off, tl_guiding_shift,
+            br_guiding_shift, tl_centripetal_shift, br_centripetal_shift
+        ]
+        return result_list
+
+    def loss_by_feat(
+            self,
+            tl_heats: List[Tensor],
+            br_heats: List[Tensor],
+            tl_offs: List[Tensor],
+            br_offs: List[Tensor],
+            tl_guiding_shifts: List[Tensor],
+            br_guiding_shifts: List[Tensor],
+            tl_centripetal_shifts: List[Tensor],
+            br_centripetal_shifts: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            tl_heats (list[Tensor]): Top-left corner heatmaps for each level
+                with shape (N, num_classes, H, W).
+            br_heats (list[Tensor]): Bottom-right corner heatmaps for each
+                level with shape (N, num_classes, H, W).
+            tl_offs (list[Tensor]): Top-left corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            br_offs (list[Tensor]): Bottom-right corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each
+                level with shape (N, guiding_shift_channels, H, W).
+            br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for
+                each level with shape (N, guiding_shift_channels, H, W).
+            tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts
+                for each level with shape (N, centripetal_shift_channels, H,
+                W).
+            br_centripetal_shifts (list[Tensor]): Bottom-right centripetal
+                shifts for each level with shape (N,
+                centripetal_shift_channels, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Specify which bounding boxes can be ignored when computing
+                the loss.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components. Containing the
+            following losses:
+
+                - det_loss (list[Tensor]): Corner keypoint losses of all
+                  feature levels.
+                - off_loss (list[Tensor]): Corner offset losses of all feature
+                  levels.
+                - guiding_loss (list[Tensor]): Guiding shift losses of all
+                  feature levels.
+                - centripetal_loss (list[Tensor]): Centripetal shift losses of
+                  all feature levels.
+        """
+        gt_bboxes = [
+            gt_instances.bboxes for gt_instances in batch_gt_instances
+        ]
+        gt_labels = [
+            gt_instances.labels for gt_instances in batch_gt_instances
+        ]
+
+        targets = self.get_targets(
+            gt_bboxes,
+            gt_labels,
+            tl_heats[-1].shape,
+            batch_img_metas[0]['batch_input_shape'],
+            with_corner_emb=self.with_corner_emb,
+            with_guiding_shift=True,
+            with_centripetal_shift=True)
+        mlvl_targets = [targets for _ in range(self.num_feat_levels)]
+        [det_losses, off_losses, guiding_losses, centripetal_losses
+         ] = multi_apply(self.loss_by_feat_single, tl_heats, br_heats, tl_offs,
+                         br_offs, tl_guiding_shifts, br_guiding_shifts,
+                         tl_centripetal_shifts, br_centripetal_shifts,
+                         mlvl_targets)
+        loss_dict = dict(
+            det_loss=det_losses,
+            off_loss=off_losses,
+            guiding_loss=guiding_losses,
+            centripetal_loss=centripetal_losses)
+        return loss_dict
+
+    def loss_by_feat_single(self, tl_hmp: Tensor, br_hmp: Tensor,
+                            tl_off: Tensor, br_off: Tensor,
+                            tl_guiding_shift: Tensor, br_guiding_shift: Tensor,
+                            tl_centripetal_shift: Tensor,
+                            br_centripetal_shift: Tensor,
+                            targets: dict) -> Tuple[Tensor, ...]:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            tl_hmp (Tensor): Top-left corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            br_hmp (Tensor): Bottom-right corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            tl_off (Tensor): Top-left corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            br_off (Tensor): Bottom-right corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            tl_guiding_shift (Tensor): Top-left guiding shift for current level
+                with shape (N, guiding_shift_channels, H, W).
+            br_guiding_shift (Tensor): Bottom-right guiding shift for current
+                level with shape (N, guiding_shift_channels, H, W).
+            tl_centripetal_shift (Tensor): Top-left centripetal shift for
+                current level with shape (N, centripetal_shift_channels, H, W).
+            br_centripetal_shift (Tensor): Bottom-right centripetal shift for
+                current level with shape (N, centripetal_shift_channels, H, W).
+            targets (dict): Corner target generated by `get_targets`.
+
+        Returns:
+            tuple[torch.Tensor]: Losses of the head's different branches
+            containing the following losses:
+
+                - det_loss (Tensor): Corner keypoint loss.
+                - off_loss (Tensor): Corner offset loss.
+                - guiding_loss (Tensor): Guiding shift loss.
+                - centripetal_loss (Tensor): Centripetal shift loss.
+        """
+        targets['corner_embedding'] = None
+
+        det_loss, _, _, off_loss = super().loss_by_feat_single(
+            tl_hmp, br_hmp, None, None, tl_off, br_off, targets)
+
+        gt_tl_guiding_shift = targets['topleft_guiding_shift']
+        gt_br_guiding_shift = targets['bottomright_guiding_shift']
+        gt_tl_centripetal_shift = targets['topleft_centripetal_shift']
+        gt_br_centripetal_shift = targets['bottomright_centripetal_shift']
+
+        gt_tl_heatmap = targets['topleft_heatmap']
+        gt_br_heatmap = targets['bottomright_heatmap']
+        # We only compute the offset loss at the real corner position.
+        # The value of real corner would be 1 in heatmap ground truth.
+        # The mask is computed in class agnostic mode and its shape is
+        # batch * 1 * width * height.
+        tl_mask = gt_tl_heatmap.eq(1).sum(1).gt(0).unsqueeze(1).type_as(
+            gt_tl_heatmap)
+        br_mask = gt_br_heatmap.eq(1).sum(1).gt(0).unsqueeze(1).type_as(
+            gt_br_heatmap)
+
+        # Guiding shift loss
+        tl_guiding_loss = self.loss_guiding_shift(
+            tl_guiding_shift,
+            gt_tl_guiding_shift,
+            tl_mask,
+            avg_factor=tl_mask.sum())
+        br_guiding_loss = self.loss_guiding_shift(
+            br_guiding_shift,
+            gt_br_guiding_shift,
+            br_mask,
+            avg_factor=br_mask.sum())
+        guiding_loss = (tl_guiding_loss + br_guiding_loss) / 2.0
+        # Centripetal shift loss
+        tl_centripetal_loss = self.loss_centripetal_shift(
+            tl_centripetal_shift,
+            gt_tl_centripetal_shift,
+            tl_mask,
+            avg_factor=tl_mask.sum())
+        br_centripetal_loss = self.loss_centripetal_shift(
+            br_centripetal_shift,
+            gt_br_centripetal_shift,
+            br_mask,
+            avg_factor=br_mask.sum())
+        centripetal_loss = (tl_centripetal_loss + br_centripetal_loss) / 2.0
+
+        return det_loss, off_loss, guiding_loss, centripetal_loss
+
+    def predict_by_feat(self,
+                        tl_heats: List[Tensor],
+                        br_heats: List[Tensor],
+                        tl_offs: List[Tensor],
+                        br_offs: List[Tensor],
+                        tl_guiding_shifts: List[Tensor],
+                        br_guiding_shifts: List[Tensor],
+                        tl_centripetal_shifts: List[Tensor],
+                        br_centripetal_shifts: List[Tensor],
+                        batch_img_metas: Optional[List[dict]] = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            tl_heats (list[Tensor]): Top-left corner heatmaps for each level
+                with shape (N, num_classes, H, W).
+            br_heats (list[Tensor]): Bottom-right corner heatmaps for each
+                level with shape (N, num_classes, H, W).
+            tl_offs (list[Tensor]): Top-left corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            br_offs (list[Tensor]): Bottom-right corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each
+                level with shape (N, guiding_shift_channels, H, W). Useless in
+                this function, we keep this arg because it's the raw output
+                from CentripetalHead.
+            br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for
+                each level with shape (N, guiding_shift_channels, H, W).
+                Useless in this function, we keep this arg because it's the
+                raw output from CentripetalHead.
+            tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts
+                for each level with shape (N, centripetal_shift_channels, H,
+                W).
+            br_centripetal_shifts (list[Tensor]): Bottom-right centripetal
+                shifts for each level with shape (N,
+                centripetal_shift_channels, H, W).
+            batch_img_metas (list[dict], optional): Batch image meta info.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len(
+            batch_img_metas)
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            result_list.append(
+                self._predict_by_feat_single(
+                    tl_heats[-1][img_id:img_id + 1, :],
+                    br_heats[-1][img_id:img_id + 1, :],
+                    tl_offs[-1][img_id:img_id + 1, :],
+                    br_offs[-1][img_id:img_id + 1, :],
+                    batch_img_metas[img_id],
+                    tl_emb=None,
+                    br_emb=None,
+                    tl_centripetal_shift=tl_centripetal_shifts[-1][
+                        img_id:img_id + 1, :],
+                    br_centripetal_shift=br_centripetal_shifts[-1][
+                        img_id:img_id + 1, :],
+                    rescale=rescale,
+                    with_nms=with_nms))
+
+        return result_list
diff --git a/mmde/mmdet/models/dense_heads/condinst_head.py b/mmde/mmdet/models/dense_heads/condinst_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..35a25e6339a8161314cb0523e7181f9d400023ac
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/condinst_head.py
@@ -0,0 +1,1226 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, Scale
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, kaiming_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import cat_boxes
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
+                         OptInstanceList, reduce_mean)
+from ..task_modules.prior_generators import MlvlPointGenerator
+from ..utils import (aligned_bilinear, filter_scores_and_topk, multi_apply,
+                     relative_coordinate_maps, select_single_mlvl)
+from ..utils.misc import empty_instances
+from .base_mask_head import BaseMaskHead
+from .fcos_head import FCOSHead
+
+INF = 1e8
+
+
+@MODELS.register_module()
+class CondInstBboxHead(FCOSHead):
+    """CondInst box head used in https://arxiv.org/abs/1904.02689.
+
+    Note that CondInst Bbox Head is a extension of FCOS head.
+    Two differences are described as follows:
+
+    1. CondInst box head predicts a set of params for each instance.
+    2. CondInst box head return the pos_gt_inds and pos_inds.
+
+    Args:
+        num_params (int): Number of params for instance segmentation.
+    """
+
+    def __init__(self, *args, num_params: int = 169, **kwargs) -> None:
+        self.num_params = num_params
+        super().__init__(*args, **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        super()._init_layers()
+        self.controller = nn.Conv2d(
+            self.feat_channels, self.num_params, 3, padding=1)
+
+    def forward_single(self, x: Tensor, scale: Scale,
+                       stride: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            stride (int): The corresponding stride for feature maps, only
+                used to normalize the bbox prediction when self.norm_on_bbox
+                is True.
+
+        Returns:
+            tuple: scores for each class, bbox predictions, centerness
+            predictions and param predictions of input feature maps.
+        """
+        cls_score, bbox_pred, cls_feat, reg_feat = \
+            super(FCOSHead, self).forward_single(x)
+        if self.centerness_on_reg:
+            centerness = self.conv_centerness(reg_feat)
+        else:
+            centerness = self.conv_centerness(cls_feat)
+        # scale the bbox_pred of different level
+        # float to avoid overflow when enabling FP16
+        bbox_pred = scale(bbox_pred).float()
+        if self.norm_on_bbox:
+            # bbox_pred needed for gradient computation has been modified
+            # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace
+            # F.relu(bbox_pred) with bbox_pred.clamp(min=0)
+            bbox_pred = bbox_pred.clamp(min=0)
+            if not self.training:
+                bbox_pred *= stride
+        else:
+            bbox_pred = bbox_pred.exp()
+        param_pred = self.controller(reg_feat)
+        return cls_score, bbox_pred, centerness, param_pred
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        centernesses: List[Tensor],
+        param_preds: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * 4.
+            centernesses (list[Tensor]): centerness for each scale level, each
+                is a 4D-tensor, the channel number is num_points * 1.
+            param_preds (List[Tensor]): param_pred for each scale level, each
+                is a 4D-tensor, the channel number is num_params.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(centernesses)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        # Need stride for rel coord compute
+        all_level_points_strides = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device,
+            with_stride=True)
+        all_level_points = [i[:, :2] for i in all_level_points_strides]
+        all_level_strides = [i[:, 2] for i in all_level_points_strides]
+        labels, bbox_targets, pos_inds_list, pos_gt_inds_list = \
+            self.get_targets(all_level_points, batch_gt_instances)
+
+        num_imgs = cls_scores[0].size(0)
+        # flatten cls_scores, bbox_preds and centerness
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_centerness = [
+            centerness.permute(0, 2, 3, 1).reshape(-1)
+            for centerness in centernesses
+        ]
+        flatten_cls_scores = torch.cat(flatten_cls_scores)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
+        flatten_centerness = torch.cat(flatten_centerness)
+        flatten_labels = torch.cat(labels)
+        flatten_bbox_targets = torch.cat(bbox_targets)
+        # repeat points to align with bbox_preds
+        flatten_points = torch.cat(
+            [points.repeat(num_imgs, 1) for points in all_level_points])
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((flatten_labels >= 0)
+                    & (flatten_labels < bg_class_ind)).nonzero().reshape(-1)
+        num_pos = torch.tensor(
+            len(pos_inds), dtype=torch.float, device=bbox_preds[0].device)
+        num_pos = max(reduce_mean(num_pos), 1.0)
+        loss_cls = self.loss_cls(
+            flatten_cls_scores, flatten_labels, avg_factor=num_pos)
+
+        pos_bbox_preds = flatten_bbox_preds[pos_inds]
+        pos_centerness = flatten_centerness[pos_inds]
+        pos_bbox_targets = flatten_bbox_targets[pos_inds]
+        pos_centerness_targets = self.centerness_target(pos_bbox_targets)
+        # centerness weighted iou loss
+        centerness_denorm = max(
+            reduce_mean(pos_centerness_targets.sum().detach()), 1e-6)
+
+        if len(pos_inds) > 0:
+            pos_points = flatten_points[pos_inds]
+            pos_decoded_bbox_preds = self.bbox_coder.decode(
+                pos_points, pos_bbox_preds)
+            pos_decoded_target_preds = self.bbox_coder.decode(
+                pos_points, pos_bbox_targets)
+            loss_bbox = self.loss_bbox(
+                pos_decoded_bbox_preds,
+                pos_decoded_target_preds,
+                weight=pos_centerness_targets,
+                avg_factor=centerness_denorm)
+            loss_centerness = self.loss_centerness(
+                pos_centerness, pos_centerness_targets, avg_factor=num_pos)
+        else:
+            loss_bbox = pos_bbox_preds.sum()
+            loss_centerness = pos_centerness.sum()
+
+        self._raw_positive_infos.update(cls_scores=cls_scores)
+        self._raw_positive_infos.update(centernesses=centernesses)
+        self._raw_positive_infos.update(param_preds=param_preds)
+        self._raw_positive_infos.update(all_level_points=all_level_points)
+        self._raw_positive_infos.update(all_level_strides=all_level_strides)
+        self._raw_positive_infos.update(pos_gt_inds_list=pos_gt_inds_list)
+        self._raw_positive_infos.update(pos_inds_list=pos_inds_list)
+
+        return dict(
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_centerness=loss_centerness)
+
+    def get_targets(
+        self, points: List[Tensor], batch_gt_instances: InstanceList
+    ) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]:
+        """Compute regression, classification and centerness targets for points
+        in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            tuple: Targets of each level.
+
+            - concat_lvl_labels (list[Tensor]): Labels of each level.
+            - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \
+            level.
+            - pos_inds_list (list[Tensor]): pos_inds of each image.
+            - pos_gt_inds_list (List[Tensor]): pos_gt_inds of each image.
+        """
+        assert len(points) == len(self.regress_ranges)
+        num_levels = len(points)
+        # expand regress ranges to align with points
+        expanded_regress_ranges = [
+            points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
+                points[i]) for i in range(num_levels)
+        ]
+        # concat all levels points and regress ranges
+        concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
+        concat_points = torch.cat(points, dim=0)
+
+        # the number of points per img, per lvl
+        num_points = [center.size(0) for center in points]
+
+        # get labels and bbox_targets of each image
+        labels_list, bbox_targets_list, pos_inds_list, pos_gt_inds_list = \
+            multi_apply(
+                self._get_targets_single,
+                batch_gt_instances,
+                points=concat_points,
+                regress_ranges=concat_regress_ranges,
+                num_points_per_lvl=num_points)
+
+        # split to per img, per level
+        labels_list = [labels.split(num_points, 0) for labels in labels_list]
+        bbox_targets_list = [
+            bbox_targets.split(num_points, 0)
+            for bbox_targets in bbox_targets_list
+        ]
+
+        # concat per level image
+        concat_lvl_labels = []
+        concat_lvl_bbox_targets = []
+        for i in range(num_levels):
+            concat_lvl_labels.append(
+                torch.cat([labels[i] for labels in labels_list]))
+            bbox_targets = torch.cat(
+                [bbox_targets[i] for bbox_targets in bbox_targets_list])
+            if self.norm_on_bbox:
+                bbox_targets = bbox_targets / self.strides[i]
+            concat_lvl_bbox_targets.append(bbox_targets)
+        return (concat_lvl_labels, concat_lvl_bbox_targets, pos_inds_list,
+                pos_gt_inds_list)
+
+    def _get_targets_single(
+        self, gt_instances: InstanceData, points: Tensor,
+        regress_ranges: Tensor, num_points_per_lvl: List[int]
+    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+        """Compute regression and classification targets for a single image."""
+        num_points = points.size(0)
+        num_gts = len(gt_instances)
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        gt_masks = gt_instances.get('masks', None)
+
+        if num_gts == 0:
+            return gt_labels.new_full((num_points,), self.num_classes), \
+                   gt_bboxes.new_zeros((num_points, 4)), \
+                   gt_bboxes.new_zeros((0,), dtype=torch.int64), \
+                   gt_bboxes.new_zeros((0,), dtype=torch.int64)
+
+        areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
+            gt_bboxes[:, 3] - gt_bboxes[:, 1])
+        # TODO: figure out why these two are different
+        # areas = areas[None].expand(num_points, num_gts)
+        areas = areas[None].repeat(num_points, 1)
+        regress_ranges = regress_ranges[:, None, :].expand(
+            num_points, num_gts, 2)
+        gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
+        xs, ys = points[:, 0], points[:, 1]
+        xs = xs[:, None].expand(num_points, num_gts)
+        ys = ys[:, None].expand(num_points, num_gts)
+
+        left = xs - gt_bboxes[..., 0]
+        right = gt_bboxes[..., 2] - xs
+        top = ys - gt_bboxes[..., 1]
+        bottom = gt_bboxes[..., 3] - ys
+        bbox_targets = torch.stack((left, top, right, bottom), -1)
+
+        if self.center_sampling:
+            # condition1: inside a `center bbox`
+            radius = self.center_sample_radius
+            # if gt_mask not None, use gt mask's centroid to determine
+            # the center region rather than gt_bbox center
+            if gt_masks is None:
+                center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2
+                center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2
+            else:
+                h, w = gt_masks.height, gt_masks.width
+                masks = gt_masks.to_tensor(
+                    dtype=torch.bool, device=gt_bboxes.device)
+                yys = torch.arange(
+                    0, h, dtype=torch.float32, device=masks.device)
+                xxs = torch.arange(
+                    0, w, dtype=torch.float32, device=masks.device)
+                # m00/m10/m01 represent the moments of a contour
+                # centroid is computed by m00/m10 and m00/m01
+                m00 = masks.sum(dim=-1).sum(dim=-1).clamp(min=1e-6)
+                m10 = (masks * xxs).sum(dim=-1).sum(dim=-1)
+                m01 = (masks * yys[:, None]).sum(dim=-1).sum(dim=-1)
+                center_xs = m10 / m00
+                center_ys = m01 / m00
+
+                center_xs = center_xs[None].expand(num_points, num_gts)
+                center_ys = center_ys[None].expand(num_points, num_gts)
+            center_gts = torch.zeros_like(gt_bboxes)
+            stride = center_xs.new_zeros(center_xs.shape)
+
+            # project the points on current lvl back to the `original` sizes
+            lvl_begin = 0
+            for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl):
+                lvl_end = lvl_begin + num_points_lvl
+                stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius
+                lvl_begin = lvl_end
+
+            x_mins = center_xs - stride
+            y_mins = center_ys - stride
+            x_maxs = center_xs + stride
+            y_maxs = center_ys + stride
+            center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0],
+                                             x_mins, gt_bboxes[..., 0])
+            center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1],
+                                             y_mins, gt_bboxes[..., 1])
+            center_gts[..., 2] = torch.where(x_maxs > gt_bboxes[..., 2],
+                                             gt_bboxes[..., 2], x_maxs)
+            center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3],
+                                             gt_bboxes[..., 3], y_maxs)
+
+            cb_dist_left = xs - center_gts[..., 0]
+            cb_dist_right = center_gts[..., 2] - xs
+            cb_dist_top = ys - center_gts[..., 1]
+            cb_dist_bottom = center_gts[..., 3] - ys
+            center_bbox = torch.stack(
+                (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1)
+            inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
+        else:
+            # condition1: inside a gt bbox
+            inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0
+
+        # condition2: limit the regression range for each location
+        max_regress_distance = bbox_targets.max(-1)[0]
+        inside_regress_range = (
+            (max_regress_distance >= regress_ranges[..., 0])
+            & (max_regress_distance <= regress_ranges[..., 1]))
+
+        # if there are still more than one objects for a location,
+        # we choose the one with minimal area
+        areas[inside_gt_bbox_mask == 0] = INF
+        areas[inside_regress_range == 0] = INF
+        min_area, min_area_inds = areas.min(dim=1)
+
+        labels = gt_labels[min_area_inds]
+        labels[min_area == INF] = self.num_classes  # set as BG
+        bbox_targets = bbox_targets[range(num_points), min_area_inds]
+
+        # return pos_inds & pos_gt_inds
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero().reshape(-1)
+        pos_gt_inds = min_area_inds[labels < self.num_classes]
+        return labels, bbox_targets, pos_inds, pos_gt_inds
+
+    def get_positive_infos(self) -> InstanceList:
+        """Get positive information from sampling results.
+
+        Returns:
+            list[:obj:`InstanceData`]: Positive information of each image,
+            usually including positive bboxes, positive labels, positive
+            priors, etc.
+        """
+        assert len(self._raw_positive_infos) > 0
+
+        pos_gt_inds_list = self._raw_positive_infos['pos_gt_inds_list']
+        pos_inds_list = self._raw_positive_infos['pos_inds_list']
+        num_imgs = len(pos_gt_inds_list)
+
+        cls_score_list = []
+        centerness_list = []
+        param_pred_list = []
+        point_list = []
+        stride_list = []
+        for cls_score_per_lvl, centerness_per_lvl, param_pred_per_lvl,\
+            point_per_lvl, stride_per_lvl in \
+            zip(self._raw_positive_infos['cls_scores'],
+                self._raw_positive_infos['centernesses'],
+                self._raw_positive_infos['param_preds'],
+                self._raw_positive_infos['all_level_points'],
+                self._raw_positive_infos['all_level_strides']):
+            cls_score_per_lvl = \
+                cls_score_per_lvl.permute(
+                    0, 2, 3, 1).reshape(num_imgs, -1, self.num_classes)
+            centerness_per_lvl = \
+                centerness_per_lvl.permute(
+                    0, 2, 3, 1).reshape(num_imgs, -1, 1)
+            param_pred_per_lvl = \
+                param_pred_per_lvl.permute(
+                    0, 2, 3, 1).reshape(num_imgs, -1, self.num_params)
+            point_per_lvl = point_per_lvl.unsqueeze(0).repeat(num_imgs, 1, 1)
+            stride_per_lvl = stride_per_lvl.unsqueeze(0).repeat(num_imgs, 1)
+
+            cls_score_list.append(cls_score_per_lvl)
+            centerness_list.append(centerness_per_lvl)
+            param_pred_list.append(param_pred_per_lvl)
+            point_list.append(point_per_lvl)
+            stride_list.append(stride_per_lvl)
+        cls_scores = torch.cat(cls_score_list, dim=1)
+        centernesses = torch.cat(centerness_list, dim=1)
+        param_preds = torch.cat(param_pred_list, dim=1)
+        all_points = torch.cat(point_list, dim=1)
+        all_strides = torch.cat(stride_list, dim=1)
+
+        positive_infos = []
+        for i, (pos_gt_inds,
+                pos_inds) in enumerate(zip(pos_gt_inds_list, pos_inds_list)):
+            pos_info = InstanceData()
+            pos_info.points = all_points[i][pos_inds]
+            pos_info.strides = all_strides[i][pos_inds]
+            pos_info.scores = cls_scores[i][pos_inds]
+            pos_info.centernesses = centernesses[i][pos_inds]
+            pos_info.param_preds = param_preds[i][pos_inds]
+            pos_info.pos_assigned_gt_inds = pos_gt_inds
+            pos_info.pos_inds = pos_inds
+            positive_infos.append(pos_info)
+        return positive_infos
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        score_factors: Optional[List[Tensor]] = None,
+                        param_preds: Optional[List[Tensor]] = None,
+                        batch_img_metas: Optional[List[dict]] = None,
+                        cfg: Optional[ConfigDict] = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Note: When score_factors is not None, the cls_scores are
+        usually multiplied by it then obtain the real score used in NMS,
+        such as CenterNess in FCOS, IoU branch in ATSS.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            score_factors (list[Tensor], optional): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 1, H, W). Defaults to None.
+            param_preds (list[Tensor], optional): Params for all scale
+                level, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_params, H, W)
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        if score_factors is None:
+            # e.g. Retina, FreeAnchor, Foveabox, etc.
+            with_score_factors = False
+        else:
+            # e.g. FCOS, PAA, ATSS, AutoAssign, etc.
+            with_score_factors = True
+            assert len(cls_scores) == len(score_factors)
+
+        num_levels = len(cls_scores)
+
+        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
+        all_level_points_strides = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device,
+            with_stride=True)
+        all_level_points = [i[:, :2] for i in all_level_points_strides]
+        all_level_strides = [i[:, 2] for i in all_level_points_strides]
+
+        result_list = []
+
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            cls_score_list = select_single_mlvl(
+                cls_scores, img_id, detach=True)
+            bbox_pred_list = select_single_mlvl(
+                bbox_preds, img_id, detach=True)
+            if with_score_factors:
+                score_factor_list = select_single_mlvl(
+                    score_factors, img_id, detach=True)
+            else:
+                score_factor_list = [None for _ in range(num_levels)]
+            param_pred_list = select_single_mlvl(
+                param_preds, img_id, detach=True)
+
+            results = self._predict_by_feat_single(
+                cls_score_list=cls_score_list,
+                bbox_pred_list=bbox_pred_list,
+                score_factor_list=score_factor_list,
+                param_pred_list=param_pred_list,
+                mlvl_points=all_level_points,
+                mlvl_strides=all_level_strides,
+                img_meta=img_meta,
+                cfg=cfg,
+                rescale=rescale,
+                with_nms=with_nms)
+            result_list.append(results)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                param_pred_list: List[Tensor],
+                                mlvl_points: List[Tensor],
+                                mlvl_strides: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigDict,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            param_pred_list (List[Tensor]): Param predition from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_params, H, W).
+            mlvl_points (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid.
+                It has shape (num_priors, 2)
+            mlvl_strides (List[Tensor]):  Each element in the list is
+                the stride of a single level in feature pyramid.
+                It has shape (num_priors, 1)
+            img_meta (dict): Image meta info.
+            cfg (mmengine.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        if score_factor_list[0] is None:
+            # e.g. Retina, FreeAnchor, etc.
+            with_score_factors = False
+        else:
+            # e.g. FCOS, PAA, ATSS, etc.
+            with_score_factors = True
+
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bbox_preds = []
+        mlvl_param_preds = []
+        mlvl_valid_points = []
+        mlvl_valid_strides = []
+        mlvl_scores = []
+        mlvl_labels = []
+        if with_score_factors:
+            mlvl_score_factors = []
+        else:
+            mlvl_score_factors = None
+        for level_idx, (cls_score, bbox_pred, score_factor,
+                        param_pred, points, strides) in \
+                enumerate(zip(cls_score_list, bbox_pred_list,
+                              score_factor_list, param_pred_list,
+                              mlvl_points, mlvl_strides)):
+
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            dim = self.bbox_coder.encode_size
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim)
+            if with_score_factors:
+                score_factor = score_factor.permute(1, 2,
+                                                    0).reshape(-1).sigmoid()
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                # remind that we set FG labels to [0, num_class-1]
+                # since mmdet v2.0
+                # BG cat_id: num_class
+                scores = cls_score.softmax(-1)[:, :-1]
+
+            param_pred = param_pred.permute(1, 2,
+                                            0).reshape(-1, self.num_params)
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            score_thr = cfg.get('score_thr', 0)
+
+            results = filter_scores_and_topk(
+                scores, score_thr, nms_pre,
+                dict(
+                    bbox_pred=bbox_pred,
+                    param_pred=param_pred,
+                    points=points,
+                    strides=strides))
+            scores, labels, keep_idxs, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            param_pred = filtered_results['param_pred']
+            points = filtered_results['points']
+            strides = filtered_results['strides']
+
+            if with_score_factors:
+                score_factor = score_factor[keep_idxs]
+
+            mlvl_bbox_preds.append(bbox_pred)
+            mlvl_param_preds.append(param_pred)
+            mlvl_valid_points.append(points)
+            mlvl_valid_strides.append(strides)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+            if with_score_factors:
+                mlvl_score_factors.append(score_factor)
+
+        bbox_pred = torch.cat(mlvl_bbox_preds)
+        priors = cat_boxes(mlvl_valid_points)
+        bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape)
+
+        results = InstanceData()
+        results.bboxes = bboxes
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+        results.param_preds = torch.cat(mlvl_param_preds)
+        results.points = torch.cat(mlvl_valid_points)
+        results.strides = torch.cat(mlvl_valid_strides)
+        if with_score_factors:
+            results.score_factors = torch.cat(mlvl_score_factors)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+
+class MaskFeatModule(BaseModule):
+    """CondInst mask feature map branch used in \
+    https://arxiv.org/abs/1904.02689.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels of the mask feature
+             map branch.
+        start_level (int): The starting feature map level from RPN that
+             will be used to predict the mask feature map.
+        end_level (int): The ending feature map level from rpn that
+             will be used to predict the mask feature map.
+        out_channels (int): Number of output channels of the mask feature
+             map branch. This is the channel count of the mask
+             feature map that to be dynamically convolved with the predicted
+             kernel.
+        mask_stride (int): Downsample factor of the mask feature map output.
+            Defaults to 4.
+        num_stacked_convs (int): Number of convs in mask feature branch.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 feat_channels: int,
+                 start_level: int,
+                 end_level: int,
+                 out_channels: int,
+                 mask_stride: int = 4,
+                 num_stacked_convs: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None,
+                 init_cfg: MultiConfig = [
+                     dict(type='Normal', layer='Conv2d', std=0.01)
+                 ],
+                 **kwargs) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.start_level = start_level
+        self.end_level = end_level
+        self.mask_stride = mask_stride
+        self.num_stacked_convs = num_stacked_convs
+        assert start_level >= 0 and end_level >= start_level
+        self.out_channels = out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.convs_all_levels = nn.ModuleList()
+        for i in range(self.start_level, self.end_level + 1):
+            convs_per_level = nn.Sequential()
+            convs_per_level.add_module(
+                f'conv{i}',
+                ConvModule(
+                    self.in_channels,
+                    self.feat_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    inplace=False,
+                    bias=False))
+            self.convs_all_levels.append(convs_per_level)
+
+        conv_branch = []
+        for _ in range(self.num_stacked_convs):
+            conv_branch.append(
+                ConvModule(
+                    self.feat_channels,
+                    self.feat_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=False))
+        self.conv_branch = nn.Sequential(*conv_branch)
+
+        self.conv_pred = nn.Conv2d(
+            self.feat_channels, self.out_channels, 1, stride=1)
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        super().init_weights()
+        kaiming_init(self.convs_all_levels, a=1, distribution='uniform')
+        kaiming_init(self.conv_branch, a=1, distribution='uniform')
+        kaiming_init(self.conv_pred, a=1, distribution='uniform')
+
+    def forward(self, x: Tuple[Tensor]) -> Tensor:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            Tensor: The predicted mask feature map.
+        """
+        inputs = x[self.start_level:self.end_level + 1]
+        assert len(inputs) == (self.end_level - self.start_level + 1)
+        feature_add_all_level = self.convs_all_levels[0](inputs[0])
+        target_h, target_w = feature_add_all_level.size()[2:]
+        for i in range(1, len(inputs)):
+            input_p = inputs[i]
+            x_p = self.convs_all_levels[i](input_p)
+            h, w = x_p.size()[2:]
+            factor_h = target_h // h
+            factor_w = target_w // w
+            assert factor_h == factor_w
+            feature_per_level = aligned_bilinear(x_p, factor_h)
+            feature_add_all_level = feature_add_all_level + \
+                feature_per_level
+
+        feature_add_all_level = self.conv_branch(feature_add_all_level)
+        feature_pred = self.conv_pred(feature_add_all_level)
+        return feature_pred
+
+
+@MODELS.register_module()
+class CondInstMaskHead(BaseMaskHead):
+    """CondInst mask head used in https://arxiv.org/abs/1904.02689.
+
+    This head outputs the mask for CondInst.
+
+    Args:
+        mask_feature_head (dict): Config of CondInstMaskFeatHead.
+        num_layers (int): Number of dynamic conv layers.
+        feat_channels (int): Number of channels in the dynamic conv.
+        mask_out_stride (int): The stride of the mask feat.
+        size_of_interest (int): The size of the region used in rel coord.
+        max_masks_to_train (int): Maximum number of masks to train for
+            each image.
+        loss_segm (:obj:`ConfigDict` or dict, optional): Config of
+            segmentation loss.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config
+            of head.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            head.
+    """
+
+    def __init__(self,
+                 mask_feature_head: ConfigType,
+                 num_layers: int = 3,
+                 feat_channels: int = 8,
+                 mask_out_stride: int = 4,
+                 size_of_interest: int = 8,
+                 max_masks_to_train: int = -1,
+                 topk_masks_per_img: int = -1,
+                 loss_mask: ConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None) -> None:
+        super().__init__()
+        self.mask_feature_head = MaskFeatModule(**mask_feature_head)
+        self.mask_feat_stride = self.mask_feature_head.mask_stride
+        self.in_channels = self.mask_feature_head.out_channels
+        self.num_layers = num_layers
+        self.feat_channels = feat_channels
+        self.size_of_interest = size_of_interest
+        self.mask_out_stride = mask_out_stride
+        self.max_masks_to_train = max_masks_to_train
+        self.topk_masks_per_img = topk_masks_per_img
+        self.prior_generator = MlvlPointGenerator([self.mask_feat_stride])
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.loss_mask = MODELS.build(loss_mask)
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        weight_nums, bias_nums = [], []
+        for i in range(self.num_layers):
+            if i == 0:
+                weight_nums.append((self.in_channels + 2) * self.feat_channels)
+                bias_nums.append(self.feat_channels)
+            elif i == self.num_layers - 1:
+                weight_nums.append(self.feat_channels * 1)
+                bias_nums.append(1)
+            else:
+                weight_nums.append(self.feat_channels * self.feat_channels)
+                bias_nums.append(self.feat_channels)
+
+        self.weight_nums = weight_nums
+        self.bias_nums = bias_nums
+        self.num_params = sum(weight_nums) + sum(bias_nums)
+
+    def parse_dynamic_params(
+            self, params: Tensor) -> Tuple[List[Tensor], List[Tensor]]:
+        """parse the dynamic params for dynamic conv."""
+        num_insts = params.size(0)
+        params_splits = list(
+            torch.split_with_sizes(
+                params, self.weight_nums + self.bias_nums, dim=1))
+        weight_splits = params_splits[:self.num_layers]
+        bias_splits = params_splits[self.num_layers:]
+        for i in range(self.num_layers):
+            if i < self.num_layers - 1:
+                weight_splits[i] = weight_splits[i].reshape(
+                    num_insts * self.in_channels, -1, 1, 1)
+                bias_splits[i] = bias_splits[i].reshape(num_insts *
+                                                        self.in_channels)
+            else:
+                # out_channels x in_channels x 1 x 1
+                weight_splits[i] = weight_splits[i].reshape(
+                    num_insts * 1, -1, 1, 1)
+                bias_splits[i] = bias_splits[i].reshape(num_insts)
+
+        return weight_splits, bias_splits
+
+    def dynamic_conv_forward(self, features: Tensor, weights: List[Tensor],
+                             biases: List[Tensor], num_insts: int) -> Tensor:
+        """dynamic forward, each layer follow a relu."""
+        n_layers = len(weights)
+        x = features
+        for i, (w, b) in enumerate(zip(weights, biases)):
+            x = F.conv2d(x, w, bias=b, stride=1, padding=0, groups=num_insts)
+            if i < n_layers - 1:
+                x = F.relu(x)
+        return x
+
+    def forward(self, x: tuple, positive_infos: InstanceList) -> tuple:
+        """Forward feature from the upstream network to get prototypes and
+        linearly combine the prototypes, using masks coefficients, into
+        instance masks. Finally, crop the instance masks with given bboxes.
+
+        Args:
+            x (Tuple[Tensor]): Feature from the upstream network, which is
+                a 4D-tensor.
+            positive_infos (List[:obj:``InstanceData``]): Positive information
+                that calculate from detect head.
+
+        Returns:
+            tuple: Predicted instance segmentation masks
+        """
+        mask_feats = self.mask_feature_head(x)
+        return multi_apply(self.forward_single, mask_feats, positive_infos)
+
+    def forward_single(self, mask_feat: Tensor,
+                       positive_info: InstanceData) -> Tensor:
+        """Forward features of a each image."""
+        pos_param_preds = positive_info.get('param_preds')
+        pos_points = positive_info.get('points')
+        pos_strides = positive_info.get('strides')
+
+        num_inst = pos_param_preds.shape[0]
+        mask_feat = mask_feat[None].repeat(num_inst, 1, 1, 1)
+        _, _, H, W = mask_feat.size()
+        if num_inst == 0:
+            return (pos_param_preds.new_zeros((0, 1, H, W)), )
+
+        locations = self.prior_generator.single_level_grid_priors(
+            mask_feat.size()[2:], 0, device=mask_feat.device)
+
+        rel_coords = relative_coordinate_maps(locations, pos_points,
+                                              pos_strides,
+                                              self.size_of_interest,
+                                              mask_feat.size()[2:])
+        mask_head_inputs = torch.cat([rel_coords, mask_feat], dim=1)
+        mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W)
+
+        weights, biases = self.parse_dynamic_params(pos_param_preds)
+        mask_preds = self.dynamic_conv_forward(mask_head_inputs, weights,
+                                               biases, num_inst)
+        mask_preds = mask_preds.reshape(-1, H, W)
+        mask_preds = aligned_bilinear(
+            mask_preds.unsqueeze(0),
+            int(self.mask_feat_stride / self.mask_out_stride)).squeeze(0)
+
+        return (mask_preds, )
+
+    def loss_by_feat(self, mask_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict], positive_infos: InstanceList,
+                     **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the mask head.
+
+        Args:
+            mask_preds (list[Tensor]): List of predicted masks, each has
+                shape (num_classes, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``masks``,
+                and ``labels`` attributes.
+            batch_img_metas (list[dict]): Meta information of multiple images.
+            positive_infos (List[:obj:``InstanceData``]): Information of
+                positive samples of each image that are assigned in detection
+                head.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert positive_infos is not None, \
+            'positive_infos should not be None in `CondInstMaskHead`'
+        losses = dict()
+
+        loss_mask = 0.
+        num_imgs = len(mask_preds)
+        total_pos = 0
+
+        for idx in range(num_imgs):
+            (mask_pred, pos_mask_targets, num_pos) = \
+                self._get_targets_single(
+                mask_preds[idx], batch_gt_instances[idx],
+                positive_infos[idx])
+            # mask loss
+            total_pos += num_pos
+            if num_pos == 0 or pos_mask_targets is None:
+                loss = mask_pred.new_zeros(1).mean()
+            else:
+                loss = self.loss_mask(
+                    mask_pred, pos_mask_targets,
+                    reduction_override='none').sum()
+            loss_mask += loss
+
+        if total_pos == 0:
+            total_pos += 1  # avoid nan
+        loss_mask = loss_mask / total_pos
+        losses.update(loss_mask=loss_mask)
+        return losses
+
+    def _get_targets_single(self, mask_preds: Tensor,
+                            gt_instances: InstanceData,
+                            positive_info: InstanceData):
+        """Compute targets for predictions of single image.
+
+        Args:
+            mask_preds (Tensor): Predicted prototypes with shape
+                (num_classes, H, W).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes``, ``labels``,
+                and ``masks`` attributes.
+            positive_info (:obj:`InstanceData`): Information of positive
+                samples that are assigned in detection head. It usually
+                contains following keys.
+
+                    - pos_assigned_gt_inds (Tensor): Assigner GT indexes of
+                      positive proposals, has shape (num_pos, )
+                    - pos_inds (Tensor): Positive index of image, has
+                      shape (num_pos, ).
+                    - param_pred (Tensor): Positive param preditions
+                      with shape (num_pos, num_params).
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+            - mask_preds (Tensor): Positive predicted mask with shape
+              (num_pos, mask_h, mask_w).
+            - pos_mask_targets (Tensor): Positive mask targets with shape
+              (num_pos, mask_h, mask_w).
+            - num_pos (int): Positive numbers.
+        """
+        gt_bboxes = gt_instances.bboxes
+        device = gt_bboxes.device
+        gt_masks = gt_instances.masks.to_tensor(
+            dtype=torch.bool, device=device).float()
+
+        # process with mask targets
+        pos_assigned_gt_inds = positive_info.get('pos_assigned_gt_inds')
+        scores = positive_info.get('scores')
+        centernesses = positive_info.get('centernesses')
+        num_pos = pos_assigned_gt_inds.size(0)
+
+        if gt_masks.size(0) == 0 or num_pos == 0:
+            return mask_preds, None, 0
+        # Since we're producing (near) full image masks,
+        # it'd take too much vram to backprop on every single mask.
+        # Thus we select only a subset.
+        if (self.max_masks_to_train != -1) and \
+           (num_pos > self.max_masks_to_train):
+            perm = torch.randperm(num_pos)
+            select = perm[:self.max_masks_to_train]
+            mask_preds = mask_preds[select]
+            pos_assigned_gt_inds = pos_assigned_gt_inds[select]
+            num_pos = self.max_masks_to_train
+        elif self.topk_masks_per_img != -1:
+            unique_gt_inds = pos_assigned_gt_inds.unique()
+            num_inst_per_gt = max(
+                int(self.topk_masks_per_img / len(unique_gt_inds)), 1)
+
+            keep_mask_preds = []
+            keep_pos_assigned_gt_inds = []
+            for gt_ind in unique_gt_inds:
+                per_inst_pos_inds = (pos_assigned_gt_inds == gt_ind)
+                mask_preds_per_inst = mask_preds[per_inst_pos_inds]
+                gt_inds_per_inst = pos_assigned_gt_inds[per_inst_pos_inds]
+                if sum(per_inst_pos_inds) > num_inst_per_gt:
+                    per_inst_scores = scores[per_inst_pos_inds].sigmoid().max(
+                        dim=1)[0]
+                    per_inst_centerness = centernesses[
+                        per_inst_pos_inds].sigmoid().reshape(-1, )
+                    select = (per_inst_scores * per_inst_centerness).topk(
+                        k=num_inst_per_gt, dim=0)[1]
+                    mask_preds_per_inst = mask_preds_per_inst[select]
+                    gt_inds_per_inst = gt_inds_per_inst[select]
+                keep_mask_preds.append(mask_preds_per_inst)
+                keep_pos_assigned_gt_inds.append(gt_inds_per_inst)
+            mask_preds = torch.cat(keep_mask_preds)
+            pos_assigned_gt_inds = torch.cat(keep_pos_assigned_gt_inds)
+            num_pos = pos_assigned_gt_inds.size(0)
+
+        # Follow the origin implement
+        start = int(self.mask_out_stride // 2)
+        gt_masks = gt_masks[:, start::self.mask_out_stride,
+                            start::self.mask_out_stride]
+        gt_masks = gt_masks.gt(0.5).float()
+        pos_mask_targets = gt_masks[pos_assigned_gt_inds]
+
+        return (mask_preds, pos_mask_targets, num_pos)
+
+    def predict_by_feat(self,
+                        mask_preds: List[Tensor],
+                        results_list: InstanceList,
+                        batch_img_metas: List[dict],
+                        rescale: bool = True,
+                        **kwargs) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        mask results.
+
+        Args:
+            mask_preds (list[Tensor]): Predicted prototypes with shape
+                (num_classes, H, W).
+            results_list (List[:obj:``InstanceData``]): BBoxHead results.
+            batch_img_metas (list[dict]): Meta information of all images.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Processed results of multiple
+            images.Each :obj:`InstanceData` usually contains
+            following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        assert len(mask_preds) == len(results_list) == len(batch_img_metas)
+
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            results = results_list[img_id]
+            bboxes = results.bboxes
+            mask_pred = mask_preds[img_id]
+            if bboxes.shape[0] == 0 or mask_pred.shape[0] == 0:
+                results_list[img_id] = empty_instances(
+                    [img_meta],
+                    bboxes.device,
+                    task_type='mask',
+                    instance_results=[results])[0]
+            else:
+                im_mask = self._predict_by_feat_single(
+                    mask_preds=mask_pred,
+                    bboxes=bboxes,
+                    img_meta=img_meta,
+                    rescale=rescale)
+                results.masks = im_mask
+        return results_list
+
+    def _predict_by_feat_single(self,
+                                mask_preds: Tensor,
+                                bboxes: Tensor,
+                                img_meta: dict,
+                                rescale: bool,
+                                cfg: OptConfigType = None):
+        """Transform a single image's features extracted from the head into
+        mask results.
+
+        Args:
+            mask_preds (Tensor): Predicted prototypes, has shape [H, W, N].
+            img_meta (dict): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If rescale is False, then returned masks will
+                fit the scale of imgs[0].
+            cfg (dict, optional): Config used in test phase.
+                Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Processed results of single image.
+             it usually contains following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat(
+            (1, 2))
+        img_h, img_w = img_meta['img_shape'][:2]
+        ori_h, ori_w = img_meta['ori_shape'][:2]
+
+        mask_preds = mask_preds.sigmoid().unsqueeze(0)
+        mask_preds = aligned_bilinear(mask_preds, self.mask_out_stride)
+        mask_preds = mask_preds[:, :, :img_h, :img_w]
+        if rescale:  # in-placed rescale the bboxes
+            scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat(
+                (1, 2))
+            bboxes /= scale_factor
+
+            masks = F.interpolate(
+                mask_preds, (ori_h, ori_w),
+                mode='bilinear',
+                align_corners=False).squeeze(0) > cfg.mask_thr
+        else:
+            masks = mask_preds.squeeze(0) > cfg.mask_thr
+
+        return masks
diff --git a/mmde/mmdet/models/dense_heads/conditional_detr_head.py b/mmde/mmdet/models/dense_heads/conditional_detr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc2df2c215667121c5fe329f369510ecd4666faf
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/conditional_detr_head.py
@@ -0,0 +1,168 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+from mmengine.model import bias_init_with_prob
+from torch import Tensor
+
+from mmdet.models.layers.transformer import inverse_sigmoid
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import InstanceList
+from .detr_head import DETRHead
+
+
+@MODELS.register_module()
+class ConditionalDETRHead(DETRHead):
+    """Head of Conditional DETR. Conditional DETR: Conditional DETR for Fast
+    Training Convergence. More details can be found in the `paper.
+
+    <https://arxiv.org/abs/2108.06152>`_ .
+    """
+
+    def init_weights(self):
+        """Initialize weights of the transformer head."""
+        super().init_weights()
+        # The initialization below for transformer head is very
+        # important as we use Focal_loss for loss_cls
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            nn.init.constant_(self.fc_cls.bias, bias_init)
+
+    def forward(self, hidden_states: Tensor,
+                references: Tensor) -> Tuple[Tensor, Tensor]:
+        """"Forward function.
+
+        Args:
+            hidden_states (Tensor): Features from transformer decoder. If
+                `return_intermediate_dec` is True output has shape
+                (num_decoder_layers, bs, num_queries, dim), else has shape (1,
+                bs, num_queries, dim) which only contains the last layer
+                outputs.
+            references (Tensor): References from transformer decoder, has
+                shape (bs, num_queries, 2).
+        Returns:
+            tuple[Tensor]: results of head containing the following tensor.
+
+            - layers_cls_scores (Tensor): Outputs from the classification head,
+              shape (num_decoder_layers, bs, num_queries, cls_out_channels).
+              Note cls_out_channels should include background.
+            - layers_bbox_preds (Tensor): Sigmoid outputs from the regression
+              head with normalized coordinate format (cx, cy, w, h), has shape
+              (num_decoder_layers, bs, num_queries, 4).
+        """
+
+        references_unsigmoid = inverse_sigmoid(references)
+        layers_bbox_preds = []
+        for layer_id in range(hidden_states.shape[0]):
+            tmp_reg_preds = self.fc_reg(
+                self.activate(self.reg_ffn(hidden_states[layer_id])))
+            tmp_reg_preds[..., :2] += references_unsigmoid
+            outputs_coord = tmp_reg_preds.sigmoid()
+            layers_bbox_preds.append(outputs_coord)
+        layers_bbox_preds = torch.stack(layers_bbox_preds)
+
+        layers_cls_scores = self.fc_cls(hidden_states)
+        return layers_cls_scores, layers_bbox_preds
+
+    def loss(self, hidden_states: Tensor, references: Tensor,
+             batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            hidden_states (Tensor): Features from the transformer decoder, has
+                shape (num_decoder_layers, bs, num_queries, dim).
+            references (Tensor): References from the transformer decoder, has
+               shape (num_decoder_layers, bs, num_queries, 2).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states, references)
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    def loss_and_predict(
+            self, hidden_states: Tensor, references: Tensor,
+            batch_data_samples: SampleList) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples. Over-write because
+        img_metas are needed as inputs for bbox_head.
+
+        Args:
+            hidden_states (Tensor): Features from the transformer decoder, has
+                shape (num_decoder_layers, bs, num_queries, dim).
+            references (Tensor): References from the transformer decoder, has
+                shape (num_decoder_layers, bs, num_queries, 2).
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+
+        Returns:
+            tuple: The return value is a tuple contains:
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+            - predictions (list[:obj:`InstanceData`]): Detection
+              results of each image after the post process.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states, references)
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas)
+        return losses, predictions
+
+    def predict(self,
+                hidden_states: Tensor,
+                references: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network. Over-write
+        because img_metas are needed as inputs for bbox_head.
+
+        Args:
+            hidden_states (Tensor): Features from the transformer decoder, has
+                shape (num_decoder_layers, bs, num_queries, dim).
+            references (Tensor): References from the transformer decoder, has
+                shape (num_decoder_layers, bs, num_queries, 2).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        last_layer_hidden_state = hidden_states[-1].unsqueeze(0)
+        outs = self(last_layer_hidden_state, references)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, rescale=rescale)
+
+        return predictions
diff --git a/mmde/mmdet/models/dense_heads/corner_head.py b/mmde/mmdet/models/dense_heads/corner_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cec71d50947ff58224ae698ec9c2f9406b58efb
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/corner_head.py
@@ -0,0 +1,1084 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from logging import warning
+from math import ceil, log
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import CornerPool, batched_nms
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, bias_init_with_prob
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList, OptMultiConfig)
+from ..utils import (gather_feat, gaussian_radius, gen_gaussian_target,
+                     get_local_maximum, get_topk_from_heatmap, multi_apply,
+                     transpose_and_gather_feat)
+from .base_dense_head import BaseDenseHead
+
+
+class BiCornerPool(BaseModule):
+    """Bidirectional Corner Pooling Module (TopLeft, BottomRight, etc.)
+
+    Args:
+        in_channels (int): Input channels of module.
+        directions (list[str]): Directions of two CornerPools.
+        out_channels (int): Output channels of module.
+        feat_channels (int): Feature channels of module.
+        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct
+            and config norm layer.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to
+            control the initialization.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 directions: List[int],
+                 feat_channels: int = 128,
+                 out_channels: int = 128,
+                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg)
+        self.direction1_conv = ConvModule(
+            in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg)
+        self.direction2_conv = ConvModule(
+            in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg)
+
+        self.aftpool_conv = ConvModule(
+            feat_channels,
+            out_channels,
+            3,
+            padding=1,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+        self.conv1 = ConvModule(
+            in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+        self.conv2 = ConvModule(
+            in_channels, out_channels, 3, padding=1, norm_cfg=norm_cfg)
+
+        self.direction1_pool = CornerPool(directions[0])
+        self.direction2_pool = CornerPool(directions[1])
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tensor): Input feature of BiCornerPool.
+
+        Returns:
+            conv2 (tensor): Output feature of BiCornerPool.
+        """
+        direction1_conv = self.direction1_conv(x)
+        direction2_conv = self.direction2_conv(x)
+        direction1_feat = self.direction1_pool(direction1_conv)
+        direction2_feat = self.direction2_pool(direction2_conv)
+        aftpool_conv = self.aftpool_conv(direction1_feat + direction2_feat)
+        conv1 = self.conv1(x)
+        relu = self.relu(aftpool_conv + conv1)
+        conv2 = self.conv2(relu)
+        return conv2
+
+
+@MODELS.register_module()
+class CornerHead(BaseDenseHead):
+    """Head of CornerNet: Detecting Objects as Paired Keypoints.
+
+    Code is modified from the `official github repo
+    <https://github.com/princeton-vl/CornerNet/blob/master/models/py_utils/
+    kp.py#L73>`_ .
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/1808.01244>`_ .
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        num_feat_levels (int): Levels of feature from the previous module.
+            2 for HourglassNet-104 and 1 for HourglassNet-52. Because
+            HourglassNet-104 outputs the final feature and intermediate
+            supervision feature and HourglassNet-52 only outputs the final
+            feature. Defaults to 2.
+        corner_emb_channels (int): Channel of embedding vector. Defaults to 1.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config.
+            Useless in CornerHead, but we keep this variable for
+            SingleStageDetector.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            CornerHead.
+        loss_heatmap (:obj:`ConfigDict` or dict): Config of corner heatmap
+            loss. Defaults to GaussianFocalLoss.
+        loss_embedding (:obj:`ConfigDict` or dict): Config of corner embedding
+            loss. Defaults to AssociativeEmbeddingLoss.
+        loss_offset (:obj:`ConfigDict` or dict): Config of corner offset loss.
+            Defaults to SmoothL1Loss.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 num_feat_levels: int = 2,
+                 corner_emb_channels: int = 1,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 loss_heatmap: ConfigType = dict(
+                     type='GaussianFocalLoss',
+                     alpha=2.0,
+                     gamma=4.0,
+                     loss_weight=1),
+                 loss_embedding: ConfigType = dict(
+                     type='AssociativeEmbeddingLoss',
+                     pull_weight=0.25,
+                     push_weight=0.25),
+                 loss_offset: ConfigType = dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=1),
+                 init_cfg: OptMultiConfig = None) -> None:
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super().__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.corner_emb_channels = corner_emb_channels
+        self.with_corner_emb = self.corner_emb_channels > 0
+        self.corner_offset_channels = 2
+        self.num_feat_levels = num_feat_levels
+        self.loss_heatmap = MODELS.build(
+            loss_heatmap) if loss_heatmap is not None else None
+        self.loss_embedding = MODELS.build(
+            loss_embedding) if loss_embedding is not None else None
+        self.loss_offset = MODELS.build(
+            loss_offset) if loss_offset is not None else None
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        self._init_layers()
+
+    def _make_layers(self,
+                     out_channels: int,
+                     in_channels: int = 256,
+                     feat_channels: int = 256) -> nn.Sequential:
+        """Initialize conv sequential for CornerHead."""
+        return nn.Sequential(
+            ConvModule(in_channels, feat_channels, 3, padding=1),
+            ConvModule(
+                feat_channels, out_channels, 1, norm_cfg=None, act_cfg=None))
+
+    def _init_corner_kpt_layers(self) -> None:
+        """Initialize corner keypoint layers.
+
+        Including corner heatmap branch and corner offset branch. Each branch
+        has two parts: prefix `tl_` for top-left and `br_` for bottom-right.
+        """
+        self.tl_pool, self.br_pool = nn.ModuleList(), nn.ModuleList()
+        self.tl_heat, self.br_heat = nn.ModuleList(), nn.ModuleList()
+        self.tl_off, self.br_off = nn.ModuleList(), nn.ModuleList()
+
+        for _ in range(self.num_feat_levels):
+            self.tl_pool.append(
+                BiCornerPool(
+                    self.in_channels, ['top', 'left'],
+                    out_channels=self.in_channels))
+            self.br_pool.append(
+                BiCornerPool(
+                    self.in_channels, ['bottom', 'right'],
+                    out_channels=self.in_channels))
+
+            self.tl_heat.append(
+                self._make_layers(
+                    out_channels=self.num_classes,
+                    in_channels=self.in_channels))
+            self.br_heat.append(
+                self._make_layers(
+                    out_channels=self.num_classes,
+                    in_channels=self.in_channels))
+
+            self.tl_off.append(
+                self._make_layers(
+                    out_channels=self.corner_offset_channels,
+                    in_channels=self.in_channels))
+            self.br_off.append(
+                self._make_layers(
+                    out_channels=self.corner_offset_channels,
+                    in_channels=self.in_channels))
+
+    def _init_corner_emb_layers(self) -> None:
+        """Initialize corner embedding layers.
+
+        Only include corner embedding branch with two parts: prefix `tl_` for
+        top-left and `br_` for bottom-right.
+        """
+        self.tl_emb, self.br_emb = nn.ModuleList(), nn.ModuleList()
+
+        for _ in range(self.num_feat_levels):
+            self.tl_emb.append(
+                self._make_layers(
+                    out_channels=self.corner_emb_channels,
+                    in_channels=self.in_channels))
+            self.br_emb.append(
+                self._make_layers(
+                    out_channels=self.corner_emb_channels,
+                    in_channels=self.in_channels))
+
+    def _init_layers(self) -> None:
+        """Initialize layers for CornerHead.
+
+        Including two parts: corner keypoint layers and corner embedding layers
+        """
+        self._init_corner_kpt_layers()
+        if self.with_corner_emb:
+            self._init_corner_emb_layers()
+
+    def init_weights(self) -> None:
+        super().init_weights()
+        bias_init = bias_init_with_prob(0.1)
+        for i in range(self.num_feat_levels):
+            # The initialization of parameters are different between
+            # nn.Conv2d and ConvModule. Our experiments show that
+            # using the original initialization of nn.Conv2d increases
+            # the final mAP by about 0.2%
+            self.tl_heat[i][-1].conv.reset_parameters()
+            self.tl_heat[i][-1].conv.bias.data.fill_(bias_init)
+            self.br_heat[i][-1].conv.reset_parameters()
+            self.br_heat[i][-1].conv.bias.data.fill_(bias_init)
+            self.tl_off[i][-1].conv.reset_parameters()
+            self.br_off[i][-1].conv.reset_parameters()
+            if self.with_corner_emb:
+                self.tl_emb[i][-1].conv.reset_parameters()
+                self.br_emb[i][-1].conv.reset_parameters()
+
+    def forward(self, feats: Tuple[Tensor]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of corner heatmaps, offset heatmaps and
+            embedding heatmaps.
+                - tl_heats (list[Tensor]): Top-left corner heatmaps for all
+                  levels, each is a 4D-tensor, the channels number is
+                  num_classes.
+                - br_heats (list[Tensor]): Bottom-right corner heatmaps for all
+                  levels, each is a 4D-tensor, the channels number is
+                  num_classes.
+                - tl_embs (list[Tensor] | list[None]): Top-left embedding
+                  heatmaps for all levels, each is a 4D-tensor or None.
+                  If not None, the channels number is corner_emb_channels.
+                - br_embs (list[Tensor] | list[None]): Bottom-right embedding
+                  heatmaps for all levels, each is a 4D-tensor or None.
+                  If not None, the channels number is corner_emb_channels.
+                - tl_offs (list[Tensor]): Top-left offset heatmaps for all
+                  levels, each is a 4D-tensor. The channels number is
+                  corner_offset_channels.
+                - br_offs (list[Tensor]): Bottom-right offset heatmaps for all
+                  levels, each is a 4D-tensor. The channels number is
+                  corner_offset_channels.
+        """
+        lvl_ind = list(range(self.num_feat_levels))
+        return multi_apply(self.forward_single, feats, lvl_ind)
+
+    def forward_single(self,
+                       x: Tensor,
+                       lvl_ind: int,
+                       return_pool: bool = False) -> List[Tensor]:
+        """Forward feature of a single level.
+
+        Args:
+            x (Tensor): Feature of a single level.
+            lvl_ind (int): Level index of current feature.
+            return_pool (bool): Return corner pool feature or not.
+                Defaults to False.
+
+        Returns:
+            tuple[Tensor]: A tuple of CornerHead's output for current feature
+            level. Containing the following Tensors:
+
+                - tl_heat (Tensor): Predicted top-left corner heatmap.
+                - br_heat (Tensor): Predicted bottom-right corner heatmap.
+                - tl_emb (Tensor | None): Predicted top-left embedding heatmap.
+                  None for `self.with_corner_emb == False`.
+                - br_emb (Tensor | None): Predicted bottom-right embedding
+                  heatmap. None for `self.with_corner_emb == False`.
+                - tl_off (Tensor): Predicted top-left offset heatmap.
+                - br_off (Tensor): Predicted bottom-right offset heatmap.
+                - tl_pool (Tensor): Top-left corner pool feature. Not must
+                  have.
+                - br_pool (Tensor): Bottom-right corner pool feature. Not must
+                  have.
+        """
+        tl_pool = self.tl_pool[lvl_ind](x)
+        tl_heat = self.tl_heat[lvl_ind](tl_pool)
+        br_pool = self.br_pool[lvl_ind](x)
+        br_heat = self.br_heat[lvl_ind](br_pool)
+
+        tl_emb, br_emb = None, None
+        if self.with_corner_emb:
+            tl_emb = self.tl_emb[lvl_ind](tl_pool)
+            br_emb = self.br_emb[lvl_ind](br_pool)
+
+        tl_off = self.tl_off[lvl_ind](tl_pool)
+        br_off = self.br_off[lvl_ind](br_pool)
+
+        result_list = [tl_heat, br_heat, tl_emb, br_emb, tl_off, br_off]
+        if return_pool:
+            result_list.append(tl_pool)
+            result_list.append(br_pool)
+
+        return result_list
+
+    def get_targets(self,
+                    gt_bboxes: List[Tensor],
+                    gt_labels: List[Tensor],
+                    feat_shape: Sequence[int],
+                    img_shape: Sequence[int],
+                    with_corner_emb: bool = False,
+                    with_guiding_shift: bool = False,
+                    with_centripetal_shift: bool = False) -> dict:
+        """Generate corner targets.
+
+        Including corner heatmap, corner offset.
+
+        Optional: corner embedding, corner guiding shift, centripetal shift.
+
+        For CornerNet, we generate corner heatmap, corner offset and corner
+        embedding from this function.
+
+        For CentripetalNet, we generate corner heatmap, corner offset, guiding
+        shift and centripetal shift from this function.
+
+        Args:
+            gt_bboxes (list[Tensor]): Ground truth bboxes of each image, each
+                has shape (num_gt, 4).
+            gt_labels (list[Tensor]): Ground truth labels of each box, each has
+                shape (num_gt, ).
+            feat_shape (Sequence[int]): Shape of output feature,
+                [batch, channel, height, width].
+            img_shape (Sequence[int]): Shape of input image,
+                [height, width, channel].
+            with_corner_emb (bool): Generate corner embedding target or not.
+                Defaults to False.
+            with_guiding_shift (bool): Generate guiding shift target or not.
+                Defaults to False.
+            with_centripetal_shift (bool): Generate centripetal shift target or
+                not. Defaults to False.
+
+        Returns:
+            dict: Ground truth of corner heatmap, corner offset, corner
+            embedding, guiding shift and centripetal shift. Containing the
+            following keys:
+
+                - topleft_heatmap (Tensor): Ground truth top-left corner
+                  heatmap.
+                - bottomright_heatmap (Tensor): Ground truth bottom-right
+                  corner heatmap.
+                - topleft_offset (Tensor): Ground truth top-left corner offset.
+                - bottomright_offset (Tensor): Ground truth bottom-right corner
+                  offset.
+                - corner_embedding (list[list[list[int]]]): Ground truth corner
+                  embedding. Not must have.
+                - topleft_guiding_shift (Tensor): Ground truth top-left corner
+                  guiding shift. Not must have.
+                - bottomright_guiding_shift (Tensor): Ground truth bottom-right
+                  corner guiding shift. Not must have.
+                - topleft_centripetal_shift (Tensor): Ground truth top-left
+                  corner centripetal shift. Not must have.
+                - bottomright_centripetal_shift (Tensor): Ground truth
+                  bottom-right corner centripetal shift. Not must have.
+        """
+        batch_size, _, height, width = feat_shape
+        img_h, img_w = img_shape[:2]
+
+        width_ratio = float(width / img_w)
+        height_ratio = float(height / img_h)
+
+        gt_tl_heatmap = gt_bboxes[-1].new_zeros(
+            [batch_size, self.num_classes, height, width])
+        gt_br_heatmap = gt_bboxes[-1].new_zeros(
+            [batch_size, self.num_classes, height, width])
+        gt_tl_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width])
+        gt_br_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width])
+
+        if with_corner_emb:
+            match = []
+
+        # Guiding shift is a kind of offset, from center to corner
+        if with_guiding_shift:
+            gt_tl_guiding_shift = gt_bboxes[-1].new_zeros(
+                [batch_size, 2, height, width])
+            gt_br_guiding_shift = gt_bboxes[-1].new_zeros(
+                [batch_size, 2, height, width])
+        # Centripetal shift is also a kind of offset, from center to corner
+        # and normalized by log.
+        if with_centripetal_shift:
+            gt_tl_centripetal_shift = gt_bboxes[-1].new_zeros(
+                [batch_size, 2, height, width])
+            gt_br_centripetal_shift = gt_bboxes[-1].new_zeros(
+                [batch_size, 2, height, width])
+
+        for batch_id in range(batch_size):
+            # Ground truth of corner embedding per image is a list of coord set
+            corner_match = []
+            for box_id in range(len(gt_labels[batch_id])):
+                left, top, right, bottom = gt_bboxes[batch_id][box_id]
+                center_x = (left + right) / 2.0
+                center_y = (top + bottom) / 2.0
+                label = gt_labels[batch_id][box_id]
+
+                # Use coords in the feature level to generate ground truth
+                scale_left = left * width_ratio
+                scale_right = right * width_ratio
+                scale_top = top * height_ratio
+                scale_bottom = bottom * height_ratio
+                scale_center_x = center_x * width_ratio
+                scale_center_y = center_y * height_ratio
+
+                # Int coords on feature map/ground truth tensor
+                left_idx = int(min(scale_left, width - 1))
+                right_idx = int(min(scale_right, width - 1))
+                top_idx = int(min(scale_top, height - 1))
+                bottom_idx = int(min(scale_bottom, height - 1))
+
+                # Generate gaussian heatmap
+                scale_box_width = ceil(scale_right - scale_left)
+                scale_box_height = ceil(scale_bottom - scale_top)
+                radius = gaussian_radius((scale_box_height, scale_box_width),
+                                         min_overlap=0.3)
+                radius = max(0, int(radius))
+                gt_tl_heatmap[batch_id, label] = gen_gaussian_target(
+                    gt_tl_heatmap[batch_id, label], [left_idx, top_idx],
+                    radius)
+                gt_br_heatmap[batch_id, label] = gen_gaussian_target(
+                    gt_br_heatmap[batch_id, label], [right_idx, bottom_idx],
+                    radius)
+
+                # Generate corner offset
+                left_offset = scale_left - left_idx
+                top_offset = scale_top - top_idx
+                right_offset = scale_right - right_idx
+                bottom_offset = scale_bottom - bottom_idx
+                gt_tl_offset[batch_id, 0, top_idx, left_idx] = left_offset
+                gt_tl_offset[batch_id, 1, top_idx, left_idx] = top_offset
+                gt_br_offset[batch_id, 0, bottom_idx, right_idx] = right_offset
+                gt_br_offset[batch_id, 1, bottom_idx,
+                             right_idx] = bottom_offset
+
+                # Generate corner embedding
+                if with_corner_emb:
+                    corner_match.append([[top_idx, left_idx],
+                                         [bottom_idx, right_idx]])
+                # Generate guiding shift
+                if with_guiding_shift:
+                    gt_tl_guiding_shift[batch_id, 0, top_idx,
+                                        left_idx] = scale_center_x - left_idx
+                    gt_tl_guiding_shift[batch_id, 1, top_idx,
+                                        left_idx] = scale_center_y - top_idx
+                    gt_br_guiding_shift[batch_id, 0, bottom_idx,
+                                        right_idx] = right_idx - scale_center_x
+                    gt_br_guiding_shift[
+                        batch_id, 1, bottom_idx,
+                        right_idx] = bottom_idx - scale_center_y
+                # Generate centripetal shift
+                if with_centripetal_shift:
+                    gt_tl_centripetal_shift[batch_id, 0, top_idx,
+                                            left_idx] = log(scale_center_x -
+                                                            scale_left)
+                    gt_tl_centripetal_shift[batch_id, 1, top_idx,
+                                            left_idx] = log(scale_center_y -
+                                                            scale_top)
+                    gt_br_centripetal_shift[batch_id, 0, bottom_idx,
+                                            right_idx] = log(scale_right -
+                                                             scale_center_x)
+                    gt_br_centripetal_shift[batch_id, 1, bottom_idx,
+                                            right_idx] = log(scale_bottom -
+                                                             scale_center_y)
+
+            if with_corner_emb:
+                match.append(corner_match)
+
+        target_result = dict(
+            topleft_heatmap=gt_tl_heatmap,
+            topleft_offset=gt_tl_offset,
+            bottomright_heatmap=gt_br_heatmap,
+            bottomright_offset=gt_br_offset)
+
+        if with_corner_emb:
+            target_result.update(corner_embedding=match)
+        if with_guiding_shift:
+            target_result.update(
+                topleft_guiding_shift=gt_tl_guiding_shift,
+                bottomright_guiding_shift=gt_br_guiding_shift)
+        if with_centripetal_shift:
+            target_result.update(
+                topleft_centripetal_shift=gt_tl_centripetal_shift,
+                bottomright_centripetal_shift=gt_br_centripetal_shift)
+
+        return target_result
+
+    def loss_by_feat(
+            self,
+            tl_heats: List[Tensor],
+            br_heats: List[Tensor],
+            tl_embs: List[Tensor],
+            br_embs: List[Tensor],
+            tl_offs: List[Tensor],
+            br_offs: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            tl_heats (list[Tensor]): Top-left corner heatmaps for each level
+                with shape (N, num_classes, H, W).
+            br_heats (list[Tensor]): Bottom-right corner heatmaps for each
+                level with shape (N, num_classes, H, W).
+            tl_embs (list[Tensor]): Top-left corner embeddings for each level
+                with shape (N, corner_emb_channels, H, W).
+            br_embs (list[Tensor]): Bottom-right corner embeddings for each
+                level with shape (N, corner_emb_channels, H, W).
+            tl_offs (list[Tensor]): Top-left corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            br_offs (list[Tensor]): Bottom-right corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Specify which bounding boxes can be ignored when computing
+                the loss.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components. Containing the
+            following losses:
+
+                - det_loss (list[Tensor]): Corner keypoint losses of all
+                  feature levels.
+                - pull_loss (list[Tensor]): Part one of AssociativeEmbedding
+                  losses of all feature levels.
+                - push_loss (list[Tensor]): Part two of AssociativeEmbedding
+                  losses of all feature levels.
+                - off_loss (list[Tensor]): Corner offset losses of all feature
+                  levels.
+        """
+        gt_bboxes = [
+            gt_instances.bboxes for gt_instances in batch_gt_instances
+        ]
+        gt_labels = [
+            gt_instances.labels for gt_instances in batch_gt_instances
+        ]
+
+        targets = self.get_targets(
+            gt_bboxes,
+            gt_labels,
+            tl_heats[-1].shape,
+            batch_img_metas[0]['batch_input_shape'],
+            with_corner_emb=self.with_corner_emb)
+        mlvl_targets = [targets for _ in range(self.num_feat_levels)]
+        det_losses, pull_losses, push_losses, off_losses = multi_apply(
+            self.loss_by_feat_single, tl_heats, br_heats, tl_embs, br_embs,
+            tl_offs, br_offs, mlvl_targets)
+        loss_dict = dict(det_loss=det_losses, off_loss=off_losses)
+        if self.with_corner_emb:
+            loss_dict.update(pull_loss=pull_losses, push_loss=push_losses)
+        return loss_dict
+
+    def loss_by_feat_single(self, tl_hmp: Tensor, br_hmp: Tensor,
+                            tl_emb: Optional[Tensor], br_emb: Optional[Tensor],
+                            tl_off: Tensor, br_off: Tensor,
+                            targets: dict) -> Tuple[Tensor, ...]:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            tl_hmp (Tensor): Top-left corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            br_hmp (Tensor): Bottom-right corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            tl_emb (Tensor, optional): Top-left corner embedding for current
+                level with shape (N, corner_emb_channels, H, W).
+            br_emb (Tensor, optional): Bottom-right corner embedding for
+                current level with shape (N, corner_emb_channels, H, W).
+            tl_off (Tensor): Top-left corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            br_off (Tensor): Bottom-right corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            targets (dict): Corner target generated by `get_targets`.
+
+        Returns:
+            tuple[torch.Tensor]: Losses of the head's different branches
+            containing the following losses:
+
+                - det_loss (Tensor): Corner keypoint loss.
+                - pull_loss (Tensor): Part one of AssociativeEmbedding loss.
+                - push_loss (Tensor): Part two of AssociativeEmbedding loss.
+                - off_loss (Tensor): Corner offset loss.
+        """
+        gt_tl_hmp = targets['topleft_heatmap']
+        gt_br_hmp = targets['bottomright_heatmap']
+        gt_tl_off = targets['topleft_offset']
+        gt_br_off = targets['bottomright_offset']
+        gt_embedding = targets['corner_embedding']
+
+        # Detection loss
+        tl_det_loss = self.loss_heatmap(
+            tl_hmp.sigmoid(),
+            gt_tl_hmp,
+            avg_factor=max(1,
+                           gt_tl_hmp.eq(1).sum()))
+        br_det_loss = self.loss_heatmap(
+            br_hmp.sigmoid(),
+            gt_br_hmp,
+            avg_factor=max(1,
+                           gt_br_hmp.eq(1).sum()))
+        det_loss = (tl_det_loss + br_det_loss) / 2.0
+
+        # AssociativeEmbedding loss
+        if self.with_corner_emb and self.loss_embedding is not None:
+            pull_loss, push_loss = self.loss_embedding(tl_emb, br_emb,
+                                                       gt_embedding)
+        else:
+            pull_loss, push_loss = None, None
+
+        # Offset loss
+        # We only compute the offset loss at the real corner position.
+        # The value of real corner would be 1 in heatmap ground truth.
+        # The mask is computed in class agnostic mode and its shape is
+        # batch * 1 * width * height.
+        tl_off_mask = gt_tl_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as(
+            gt_tl_hmp)
+        br_off_mask = gt_br_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as(
+            gt_br_hmp)
+        tl_off_loss = self.loss_offset(
+            tl_off,
+            gt_tl_off,
+            tl_off_mask,
+            avg_factor=max(1, tl_off_mask.sum()))
+        br_off_loss = self.loss_offset(
+            br_off,
+            gt_br_off,
+            br_off_mask,
+            avg_factor=max(1, br_off_mask.sum()))
+
+        off_loss = (tl_off_loss + br_off_loss) / 2.0
+
+        return det_loss, pull_loss, push_loss, off_loss
+
+    def predict_by_feat(self,
+                        tl_heats: List[Tensor],
+                        br_heats: List[Tensor],
+                        tl_embs: List[Tensor],
+                        br_embs: List[Tensor],
+                        tl_offs: List[Tensor],
+                        br_offs: List[Tensor],
+                        batch_img_metas: Optional[List[dict]] = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            tl_heats (list[Tensor]): Top-left corner heatmaps for each level
+                with shape (N, num_classes, H, W).
+            br_heats (list[Tensor]): Bottom-right corner heatmaps for each
+                level with shape (N, num_classes, H, W).
+            tl_embs (list[Tensor]): Top-left corner embeddings for each level
+                with shape (N, corner_emb_channels, H, W).
+            br_embs (list[Tensor]): Bottom-right corner embeddings for each
+                level with shape (N, corner_emb_channels, H, W).
+            tl_offs (list[Tensor]): Top-left corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            br_offs (list[Tensor]): Bottom-right corner offsets for each level
+                with shape (N, corner_offset_channels, H, W).
+            batch_img_metas (list[dict], optional): Batch image meta info.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len(
+            batch_img_metas)
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            result_list.append(
+                self._predict_by_feat_single(
+                    tl_heats[-1][img_id:img_id + 1, :],
+                    br_heats[-1][img_id:img_id + 1, :],
+                    tl_offs[-1][img_id:img_id + 1, :],
+                    br_offs[-1][img_id:img_id + 1, :],
+                    batch_img_metas[img_id],
+                    tl_emb=tl_embs[-1][img_id:img_id + 1, :],
+                    br_emb=br_embs[-1][img_id:img_id + 1, :],
+                    rescale=rescale,
+                    with_nms=with_nms))
+
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                tl_heat: Tensor,
+                                br_heat: Tensor,
+                                tl_off: Tensor,
+                                br_off: Tensor,
+                                img_meta: dict,
+                                tl_emb: Optional[Tensor] = None,
+                                br_emb: Optional[Tensor] = None,
+                                tl_centripetal_shift: Optional[Tensor] = None,
+                                br_centripetal_shift: Optional[Tensor] = None,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            tl_heat (Tensor): Top-left corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            br_heat (Tensor): Bottom-right corner heatmap for current level
+                with shape (N, num_classes, H, W).
+            tl_off (Tensor): Top-left corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            br_off (Tensor): Bottom-right corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            img_meta (dict): Meta information of current image, e.g.,
+                image size, scaling factor, etc.
+            tl_emb (Tensor): Top-left corner embedding for current level with
+                shape (N, corner_emb_channels, H, W).
+            br_emb (Tensor): Bottom-right corner embedding for current level
+                with shape (N, corner_emb_channels, H, W).
+            tl_centripetal_shift: Top-left corner's centripetal shift for
+                current level with shape (N, 2, H, W).
+            br_centripetal_shift: Bottom-right corner's centripetal shift for
+                current level with shape (N, 2, H, W).
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        if isinstance(img_meta, (list, tuple)):
+            img_meta = img_meta[0]
+
+        batch_bboxes, batch_scores, batch_clses = self._decode_heatmap(
+            tl_heat=tl_heat.sigmoid(),
+            br_heat=br_heat.sigmoid(),
+            tl_off=tl_off,
+            br_off=br_off,
+            tl_emb=tl_emb,
+            br_emb=br_emb,
+            tl_centripetal_shift=tl_centripetal_shift,
+            br_centripetal_shift=br_centripetal_shift,
+            img_meta=img_meta,
+            k=self.test_cfg.corner_topk,
+            kernel=self.test_cfg.local_maximum_kernel,
+            distance_threshold=self.test_cfg.distance_threshold)
+
+        if rescale and 'scale_factor' in img_meta:
+            batch_bboxes /= batch_bboxes.new_tensor(
+                img_meta['scale_factor']).repeat((1, 2))
+
+        bboxes = batch_bboxes.view([-1, 4])
+        scores = batch_scores.view(-1)
+        clses = batch_clses.view(-1)
+
+        det_bboxes = torch.cat([bboxes, scores.unsqueeze(-1)], -1)
+        keepinds = (det_bboxes[:, -1] > -0.1)
+        det_bboxes = det_bboxes[keepinds]
+        det_labels = clses[keepinds]
+
+        if with_nms:
+            det_bboxes, det_labels = self._bboxes_nms(det_bboxes, det_labels,
+                                                      self.test_cfg)
+
+        results = InstanceData()
+        results.bboxes = det_bboxes[..., :4]
+        results.scores = det_bboxes[..., 4]
+        results.labels = det_labels
+        return results
+
+    def _bboxes_nms(self, bboxes: Tensor, labels: Tensor,
+                    cfg: ConfigDict) -> Tuple[Tensor, Tensor]:
+        """bboxes nms."""
+        if 'nms_cfg' in cfg:
+            warning.warn('nms_cfg in test_cfg will be deprecated. '
+                         'Please rename it as nms')
+        if 'nms' not in cfg:
+            cfg.nms = cfg.nms_cfg
+
+        if labels.numel() > 0:
+            max_num = cfg.max_per_img
+            bboxes, keep = batched_nms(bboxes[:, :4], bboxes[:,
+                                                             -1].contiguous(),
+                                       labels, cfg.nms)
+            if max_num > 0:
+                bboxes = bboxes[:max_num]
+                labels = labels[keep][:max_num]
+
+        return bboxes, labels
+
+    def _decode_heatmap(self,
+                        tl_heat: Tensor,
+                        br_heat: Tensor,
+                        tl_off: Tensor,
+                        br_off: Tensor,
+                        tl_emb: Optional[Tensor] = None,
+                        br_emb: Optional[Tensor] = None,
+                        tl_centripetal_shift: Optional[Tensor] = None,
+                        br_centripetal_shift: Optional[Tensor] = None,
+                        img_meta: Optional[dict] = None,
+                        k: int = 100,
+                        kernel: int = 3,
+                        distance_threshold: float = 0.5,
+                        num_dets: int = 1000) -> Tuple[Tensor, Tensor, Tensor]:
+        """Transform outputs into detections raw bbox prediction.
+
+        Args:
+            tl_heat (Tensor): Top-left corner heatmap for current level with
+                shape (N, num_classes, H, W).
+            br_heat (Tensor): Bottom-right corner heatmap for current level
+                with shape (N, num_classes, H, W).
+            tl_off (Tensor): Top-left corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            br_off (Tensor): Bottom-right corner offset for current level with
+                shape (N, corner_offset_channels, H, W).
+            tl_emb (Tensor, Optional): Top-left corner embedding for current
+                level with shape (N, corner_emb_channels, H, W).
+            br_emb (Tensor, Optional): Bottom-right corner embedding for
+                current level with shape (N, corner_emb_channels, H, W).
+            tl_centripetal_shift (Tensor, Optional): Top-left centripetal shift
+                for current level with shape (N, 2, H, W).
+            br_centripetal_shift (Tensor, Optional): Bottom-right centripetal
+                shift for current level with shape (N, 2, H, W).
+            img_meta (dict): Meta information of current image, e.g.,
+                image size, scaling factor, etc.
+            k (int): Get top k corner keypoints from heatmap.
+            kernel (int): Max pooling kernel for extract local maximum pixels.
+            distance_threshold (float): Distance threshold. Top-left and
+                bottom-right corner keypoints with feature distance less than
+                the threshold will be regarded as keypoints from same object.
+            num_dets (int): Num of raw boxes before doing nms.
+
+        Returns:
+            tuple[torch.Tensor]: Decoded output of CornerHead, containing the
+            following Tensors:
+
+            - bboxes (Tensor): Coords of each box.
+            - scores (Tensor): Scores of each box.
+            - clses (Tensor): Categories of each box.
+        """
+        with_embedding = tl_emb is not None and br_emb is not None
+        with_centripetal_shift = (
+            tl_centripetal_shift is not None
+            and br_centripetal_shift is not None)
+        assert with_embedding + with_centripetal_shift == 1
+        batch, _, height, width = tl_heat.size()
+        if torch.onnx.is_in_onnx_export():
+            inp_h, inp_w = img_meta['pad_shape_for_onnx'][:2]
+        else:
+            inp_h, inp_w = img_meta['batch_input_shape'][:2]
+
+        # perform nms on heatmaps
+        tl_heat = get_local_maximum(tl_heat, kernel=kernel)
+        br_heat = get_local_maximum(br_heat, kernel=kernel)
+
+        tl_scores, tl_inds, tl_clses, tl_ys, tl_xs = get_topk_from_heatmap(
+            tl_heat, k=k)
+        br_scores, br_inds, br_clses, br_ys, br_xs = get_topk_from_heatmap(
+            br_heat, k=k)
+
+        # We use repeat instead of expand here because expand is a
+        # shallow-copy function. Thus it could cause unexpected testing result
+        # sometimes. Using expand will decrease about 10% mAP during testing
+        # compared to repeat.
+        tl_ys = tl_ys.view(batch, k, 1).repeat(1, 1, k)
+        tl_xs = tl_xs.view(batch, k, 1).repeat(1, 1, k)
+        br_ys = br_ys.view(batch, 1, k).repeat(1, k, 1)
+        br_xs = br_xs.view(batch, 1, k).repeat(1, k, 1)
+
+        tl_off = transpose_and_gather_feat(tl_off, tl_inds)
+        tl_off = tl_off.view(batch, k, 1, 2)
+        br_off = transpose_and_gather_feat(br_off, br_inds)
+        br_off = br_off.view(batch, 1, k, 2)
+
+        tl_xs = tl_xs + tl_off[..., 0]
+        tl_ys = tl_ys + tl_off[..., 1]
+        br_xs = br_xs + br_off[..., 0]
+        br_ys = br_ys + br_off[..., 1]
+
+        if with_centripetal_shift:
+            tl_centripetal_shift = transpose_and_gather_feat(
+                tl_centripetal_shift, tl_inds).view(batch, k, 1, 2).exp()
+            br_centripetal_shift = transpose_and_gather_feat(
+                br_centripetal_shift, br_inds).view(batch, 1, k, 2).exp()
+
+            tl_ctxs = tl_xs + tl_centripetal_shift[..., 0]
+            tl_ctys = tl_ys + tl_centripetal_shift[..., 1]
+            br_ctxs = br_xs - br_centripetal_shift[..., 0]
+            br_ctys = br_ys - br_centripetal_shift[..., 1]
+
+        # all possible boxes based on top k corners (ignoring class)
+        tl_xs *= (inp_w / width)
+        tl_ys *= (inp_h / height)
+        br_xs *= (inp_w / width)
+        br_ys *= (inp_h / height)
+
+        if with_centripetal_shift:
+            tl_ctxs *= (inp_w / width)
+            tl_ctys *= (inp_h / height)
+            br_ctxs *= (inp_w / width)
+            br_ctys *= (inp_h / height)
+
+        x_off, y_off = 0, 0  # no crop
+        if not torch.onnx.is_in_onnx_export():
+            # since `RandomCenterCropPad` is done on CPU with numpy and it's
+            # not dynamic traceable when exporting to ONNX, thus 'border'
+            # does not appears as key in 'img_meta'. As a tmp solution,
+            # we move this 'border' handle part to the postprocess after
+            # finished exporting to ONNX, which is handle in
+            # `mmdet/core/export/model_wrappers.py`. Though difference between
+            # pytorch and exported onnx model, it might be ignored since
+            # comparable performance is achieved between them (e.g. 40.4 vs
+            # 40.6 on COCO val2017, for CornerNet without test-time flip)
+            if 'border' in img_meta:
+                x_off = img_meta['border'][2]
+                y_off = img_meta['border'][0]
+
+        tl_xs -= x_off
+        tl_ys -= y_off
+        br_xs -= x_off
+        br_ys -= y_off
+
+        zeros = tl_xs.new_zeros(*tl_xs.size())
+        tl_xs = torch.where(tl_xs > 0.0, tl_xs, zeros)
+        tl_ys = torch.where(tl_ys > 0.0, tl_ys, zeros)
+        br_xs = torch.where(br_xs > 0.0, br_xs, zeros)
+        br_ys = torch.where(br_ys > 0.0, br_ys, zeros)
+
+        bboxes = torch.stack((tl_xs, tl_ys, br_xs, br_ys), dim=3)
+        area_bboxes = ((br_xs - tl_xs) * (br_ys - tl_ys)).abs()
+
+        if with_centripetal_shift:
+            tl_ctxs -= x_off
+            tl_ctys -= y_off
+            br_ctxs -= x_off
+            br_ctys -= y_off
+
+            tl_ctxs *= tl_ctxs.gt(0.0).type_as(tl_ctxs)
+            tl_ctys *= tl_ctys.gt(0.0).type_as(tl_ctys)
+            br_ctxs *= br_ctxs.gt(0.0).type_as(br_ctxs)
+            br_ctys *= br_ctys.gt(0.0).type_as(br_ctys)
+
+            ct_bboxes = torch.stack((tl_ctxs, tl_ctys, br_ctxs, br_ctys),
+                                    dim=3)
+            area_ct_bboxes = ((br_ctxs - tl_ctxs) * (br_ctys - tl_ctys)).abs()
+
+            rcentral = torch.zeros_like(ct_bboxes)
+            # magic nums from paper section 4.1
+            mu = torch.ones_like(area_bboxes) / 2.4
+            mu[area_bboxes > 3500] = 1 / 2.1  # large bbox have smaller mu
+
+            bboxes_center_x = (bboxes[..., 0] + bboxes[..., 2]) / 2
+            bboxes_center_y = (bboxes[..., 1] + bboxes[..., 3]) / 2
+            rcentral[..., 0] = bboxes_center_x - mu * (bboxes[..., 2] -
+                                                       bboxes[..., 0]) / 2
+            rcentral[..., 1] = bboxes_center_y - mu * (bboxes[..., 3] -
+                                                       bboxes[..., 1]) / 2
+            rcentral[..., 2] = bboxes_center_x + mu * (bboxes[..., 2] -
+                                                       bboxes[..., 0]) / 2
+            rcentral[..., 3] = bboxes_center_y + mu * (bboxes[..., 3] -
+                                                       bboxes[..., 1]) / 2
+            area_rcentral = ((rcentral[..., 2] - rcentral[..., 0]) *
+                             (rcentral[..., 3] - rcentral[..., 1])).abs()
+            dists = area_ct_bboxes / area_rcentral
+
+            tl_ctx_inds = (ct_bboxes[..., 0] <= rcentral[..., 0]) | (
+                ct_bboxes[..., 0] >= rcentral[..., 2])
+            tl_cty_inds = (ct_bboxes[..., 1] <= rcentral[..., 1]) | (
+                ct_bboxes[..., 1] >= rcentral[..., 3])
+            br_ctx_inds = (ct_bboxes[..., 2] <= rcentral[..., 0]) | (
+                ct_bboxes[..., 2] >= rcentral[..., 2])
+            br_cty_inds = (ct_bboxes[..., 3] <= rcentral[..., 1]) | (
+                ct_bboxes[..., 3] >= rcentral[..., 3])
+
+        if with_embedding:
+            tl_emb = transpose_and_gather_feat(tl_emb, tl_inds)
+            tl_emb = tl_emb.view(batch, k, 1)
+            br_emb = transpose_and_gather_feat(br_emb, br_inds)
+            br_emb = br_emb.view(batch, 1, k)
+            dists = torch.abs(tl_emb - br_emb)
+
+        tl_scores = tl_scores.view(batch, k, 1).repeat(1, 1, k)
+        br_scores = br_scores.view(batch, 1, k).repeat(1, k, 1)
+
+        scores = (tl_scores + br_scores) / 2  # scores for all possible boxes
+
+        # tl and br should have same class
+        tl_clses = tl_clses.view(batch, k, 1).repeat(1, 1, k)
+        br_clses = br_clses.view(batch, 1, k).repeat(1, k, 1)
+        cls_inds = (tl_clses != br_clses)
+
+        # reject boxes based on distances
+        dist_inds = dists > distance_threshold
+
+        # reject boxes based on widths and heights
+        width_inds = (br_xs <= tl_xs)
+        height_inds = (br_ys <= tl_ys)
+
+        # No use `scores[cls_inds]`, instead we use `torch.where` here.
+        # Since only 1-D indices with type 'tensor(bool)' are supported
+        # when exporting to ONNX, any other bool indices with more dimensions
+        # (e.g. 2-D bool tensor) as input parameter in node is invalid
+        negative_scores = -1 * torch.ones_like(scores)
+        scores = torch.where(cls_inds, negative_scores, scores)
+        scores = torch.where(width_inds, negative_scores, scores)
+        scores = torch.where(height_inds, negative_scores, scores)
+        scores = torch.where(dist_inds, negative_scores, scores)
+
+        if with_centripetal_shift:
+            scores[tl_ctx_inds] = -1
+            scores[tl_cty_inds] = -1
+            scores[br_ctx_inds] = -1
+            scores[br_cty_inds] = -1
+
+        scores = scores.view(batch, -1)
+        scores, inds = torch.topk(scores, num_dets)
+        scores = scores.unsqueeze(2)
+
+        bboxes = bboxes.view(batch, -1, 4)
+        bboxes = gather_feat(bboxes, inds)
+
+        clses = tl_clses.contiguous().view(batch, -1, 1)
+        clses = gather_feat(clses, inds)
+
+        return bboxes, scores, clses
diff --git a/mmde/mmdet/models/dense_heads/dab_detr_head.py b/mmde/mmdet/models/dense_heads/dab_detr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..892833ffce5f17f6f9e82e67b7d32c6b9c1bafc0
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/dab_detr_head.py
@@ -0,0 +1,106 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch.nn as nn
+from mmcv.cnn import Linear
+from mmengine.model import bias_init_with_prob, constant_init
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import InstanceList
+from ..layers import MLP, inverse_sigmoid
+from .conditional_detr_head import ConditionalDETRHead
+
+
+@MODELS.register_module()
+class DABDETRHead(ConditionalDETRHead):
+    """Head of DAB-DETR. DAB-DETR: Dynamic Anchor Boxes are Better Queries for
+    DETR.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2201.12329>`_ .
+    """
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the transformer head."""
+        # cls branch
+        self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
+        # reg branch
+        self.fc_reg = MLP(self.embed_dims, self.embed_dims, 4, 3)
+
+    def init_weights(self) -> None:
+        """initialize weights."""
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            nn.init.constant_(self.fc_cls.bias, bias_init)
+        constant_init(self.fc_reg.layers[-1], 0., bias=0.)
+
+    def forward(self, hidden_states: Tensor,
+                references: Tensor) -> Tuple[Tensor, Tensor]:
+        """"Forward function.
+
+        Args:
+            hidden_states (Tensor): Features from transformer decoder. If
+                `return_intermediate_dec` is True output has shape
+                (num_decoder_layers, bs, num_queries, dim), else has shape (1,
+                bs, num_queries, dim) which only contains the last layer
+                outputs.
+            references (Tensor): References from transformer decoder. If
+                `return_intermediate_dec` is True output has shape
+                (num_decoder_layers, bs, num_queries, 2/4), else has shape (1,
+                bs, num_queries, 2/4)
+                which only contains the last layer reference.
+        Returns:
+            tuple[Tensor]: results of head containing the following tensor.
+
+            - layers_cls_scores (Tensor): Outputs from the classification head,
+              shape (num_decoder_layers, bs, num_queries, cls_out_channels).
+              Note cls_out_channels should include background.
+            - layers_bbox_preds (Tensor): Sigmoid outputs from the regression
+              head with normalized coordinate format (cx, cy, w, h), has shape
+              (num_decoder_layers, bs, num_queries, 4).
+        """
+        layers_cls_scores = self.fc_cls(hidden_states)
+        references_before_sigmoid = inverse_sigmoid(references, eps=1e-3)
+        tmp_reg_preds = self.fc_reg(hidden_states)
+        tmp_reg_preds[..., :references_before_sigmoid.
+                      size(-1)] += references_before_sigmoid
+        layers_bbox_preds = tmp_reg_preds.sigmoid()
+        return layers_cls_scores, layers_bbox_preds
+
+    def predict(self,
+                hidden_states: Tensor,
+                references: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network. Over-write
+        because img_metas are needed as inputs for bbox_head.
+
+        Args:
+            hidden_states (Tensor): Feature from the transformer decoder, has
+                shape (num_decoder_layers, bs, num_queries, dim).
+            references (Tensor): references from the transformer decoder, has
+                shape (num_decoder_layers, bs, num_queries, 2/4).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        last_layer_hidden_state = hidden_states[-1].unsqueeze(0)
+        last_layer_reference = references[-1].unsqueeze(0)
+        outs = self(last_layer_hidden_state, last_layer_reference)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, rescale=rescale)
+        return predictions
diff --git a/mmde/mmdet/models/dense_heads/ddod_head.py b/mmde/mmdet/models/dense_heads/ddod_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..64e91ff0135230a8d634c5964eb520e1461c872a
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/ddod_head.py
@@ -0,0 +1,794 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale
+from mmengine.model import bias_init_with_prob, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList, reduce_mean)
+from ..task_modules.prior_generators import anchor_inside_flags
+from ..utils import images_to_levels, multi_apply, unmap
+from .anchor_head import AnchorHead
+
+EPS = 1e-12
+
+
+@MODELS.register_module()
+class DDODHead(AnchorHead):
+    """Detection Head of `DDOD <https://arxiv.org/abs/2107.02963>`_.
+
+    DDOD head decomposes conjunctions lying in most current one-stage
+    detectors via label assignment disentanglement, spatial feature
+    disentanglement, and pyramid supervision disentanglement.
+
+    Args:
+        num_classes (int): Number of categories excluding the
+            background category.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): The number of stacked Conv. Defaults to 4.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        use_dcn (bool): Use dcn, Same as ATSS when False. Defaults to True.
+        norm_cfg (:obj:`ConfigDict` or dict): Normal config of ddod head.
+            Defaults to dict(type='GN', num_groups=32, requires_grad=True).
+        loss_iou (:obj:`ConfigDict` or dict): Config of IoU loss. Defaults to
+            dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0).
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 stacked_convs: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 use_dcn: bool = True,
+                 norm_cfg: ConfigType = dict(
+                     type='GN', num_groups=32, requires_grad=True),
+                 loss_iou: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 **kwargs) -> None:
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.use_dcn = use_dcn
+        super().__init__(num_classes, in_channels, **kwargs)
+
+        if self.train_cfg:
+            self.cls_assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            self.reg_assigner = TASK_UTILS.build(
+                self.train_cfg['reg_assigner'])
+        self.loss_iou = MODELS.build(loss_iou)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=dict(type='DCN', deform_groups=1)
+                    if i == 0 and self.use_dcn else self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=dict(type='DCN', deform_groups=1)
+                    if i == 0 and self.use_dcn else self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.atss_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.atss_reg = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+        self.atss_iou = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 1, 3, padding=1)
+        self.scales = nn.ModuleList(
+            [Scale(1.0) for _ in self.prior_generator.strides])
+
+        # we use the global list in loss
+        self.cls_num_pos_samples_per_level = [
+            0. for _ in range(len(self.prior_generator.strides))
+        ]
+        self.reg_num_pos_samples_per_level = [
+            0. for _ in range(len(self.prior_generator.strides))
+        ]
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        for m in self.cls_convs:
+            normal_init(m.conv, std=0.01)
+        for m in self.reg_convs:
+            normal_init(m.conv, std=0.01)
+        normal_init(self.atss_reg, std=0.01)
+        normal_init(self.atss_iou, std=0.01)
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.atss_cls, std=0.01, bias=bias_cls)
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores, bbox predictions,
+            and iou predictions.
+
+            - cls_scores (list[Tensor]): Classification scores for all \
+            scale levels, each is a 4D-tensor, the channels number is \
+            num_base_priors * num_classes.
+            - bbox_preds (list[Tensor]): Box energies / deltas for all \
+            scale levels, each is a 4D-tensor, the channels number is \
+            num_base_priors * 4.
+            - iou_preds (list[Tensor]): IoU scores for all scale levels, \
+            each is a 4D-tensor, the channels number is num_base_priors * 1.
+        """
+        return multi_apply(self.forward_single, x, self.scales)
+
+    def forward_single(self, x: Tensor, scale: Scale) -> Sequence[Tensor]:
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+
+        Returns:
+            tuple:
+
+            - cls_score (Tensor): Cls scores for a single scale level \
+            the channels number is num_base_priors * num_classes.
+            - bbox_pred (Tensor): Box energies / deltas for a single \
+            scale level, the channels number is num_base_priors * 4.
+            - iou_pred (Tensor): Iou for a single scale level, the \
+            channel number is (N, num_base_priors * 1, H, W).
+        """
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.atss_cls(cls_feat)
+        # we just follow atss, not apply exp in bbox_pred
+        bbox_pred = scale(self.atss_reg(reg_feat)).float()
+        iou_pred = self.atss_iou(reg_feat)
+        return cls_score, bbox_pred, iou_pred
+
+    def loss_cls_by_feat_single(self, cls_score: Tensor, labels: Tensor,
+                                label_weights: Tensor,
+                                reweight_factor: List[float],
+                                avg_factor: float) -> Tuple[Tensor]:
+        """Compute cls loss of a single scale level.
+
+        Args:
+            cls_score (Tensor): Box scores for each scale level
+                Has shape (N, num_base_priors * num_classes, H, W).
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors)
+            reweight_factor (List[float]): Reweight factor for cls and reg
+                loss.
+            avg_factor (float): Average factor that is used to average
+                the loss. When using sampling method, avg_factor is usually
+                the sum of positive and negative priors. When using
+                `PseudoSampler`, `avg_factor` is usually equal to the number
+                of positive priors.
+
+        Returns:
+            Tuple[Tensor]: A tuple of loss components.
+        """
+        cls_score = cls_score.permute(0, 2, 3, 1).reshape(
+            -1, self.cls_out_channels).contiguous()
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=avg_factor)
+        return reweight_factor * loss_cls,
+
+    def loss_reg_by_feat_single(self, anchors: Tensor, bbox_pred: Tensor,
+                                iou_pred: Tensor, labels,
+                                label_weights: Tensor, bbox_targets: Tensor,
+                                bbox_weights: Tensor,
+                                reweight_factor: List[float],
+                                avg_factor: float) -> Tuple[Tensor, Tensor]:
+        """Compute reg loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            bbox_pred (Tensor): Box energies / deltas for each scale
+                level with shape (N, num_base_priors * 4, H, W).
+            iou_pred (Tensor): Iou for a single scale level, the
+                channel number is (N, num_base_priors * 1, H, W).
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors)
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (N, num_total_anchors, 4).
+            bbox_weights (Tensor): BBox weights of all anchors in the
+                image with shape (N, 4)
+            reweight_factor (List[float]): Reweight factor for cls and reg
+                loss.
+            avg_factor (float): Average factor that is used to average
+                the loss. When using sampling method, avg_factor is usually
+                the sum of positive and negative priors. When using
+                `PseudoSampler`, `avg_factor` is usually equal to the number
+                of positive priors.
+        Returns:
+            Tuple[Tensor, Tensor]: A tuple of loss components.
+        """
+        anchors = anchors.reshape(-1, 4)
+        bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+        iou_pred = iou_pred.permute(0, 2, 3, 1).reshape(-1, )
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        bbox_weights = bbox_weights.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+
+        iou_targets = label_weights.new_zeros(labels.shape)
+        iou_weights = label_weights.new_zeros(labels.shape)
+        iou_weights[(bbox_weights.sum(axis=1) > 0).nonzero(
+            as_tuple=False)] = 1.
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    &
+                    (labels < bg_class_ind)).nonzero(as_tuple=False).squeeze(1)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+            pos_anchors = anchors[pos_inds]
+
+            pos_decode_bbox_pred = self.bbox_coder.decode(
+                pos_anchors, pos_bbox_pred)
+            pos_decode_bbox_targets = self.bbox_coder.decode(
+                pos_anchors, pos_bbox_targets)
+
+            # regression loss
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_decode_bbox_targets,
+                avg_factor=avg_factor)
+
+            iou_targets[pos_inds] = bbox_overlaps(
+                pos_decode_bbox_pred.detach(),
+                pos_decode_bbox_targets,
+                is_aligned=True)
+            loss_iou = self.loss_iou(
+                iou_pred, iou_targets, iou_weights, avg_factor=avg_factor)
+        else:
+            loss_bbox = bbox_pred.sum() * 0
+            loss_iou = iou_pred.sum() * 0
+
+        return reweight_factor * loss_bbox, reweight_factor * loss_iou
+
+    def calc_reweight_factor(self, labels_list: List[Tensor]) -> List[float]:
+        """Compute reweight_factor for regression and classification loss."""
+        # get pos samples for each level
+        bg_class_ind = self.num_classes
+        for ii, each_level_label in enumerate(labels_list):
+            pos_inds = ((each_level_label >= 0) &
+                        (each_level_label < bg_class_ind)).nonzero(
+                            as_tuple=False).squeeze(1)
+            self.cls_num_pos_samples_per_level[ii] += len(pos_inds)
+        # get reweight factor from 1 ~ 2 with bilinear interpolation
+        min_pos_samples = min(self.cls_num_pos_samples_per_level)
+        max_pos_samples = max(self.cls_num_pos_samples_per_level)
+        interval = 1. / (max_pos_samples - min_pos_samples + 1e-10)
+        reweight_factor_per_level = []
+        for pos_samples in self.cls_num_pos_samples_per_level:
+            factor = 2. - (pos_samples - min_pos_samples) * interval
+            reweight_factor_per_level.append(factor)
+        return reweight_factor_per_level
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            iou_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_base_priors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_base_priors * 4, H, W)
+            iou_preds (list[Tensor]): Score factor for all scale level,
+                each is a 4D-tensor, has shape (batch_size, 1, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        # calculate common vars for cls and reg assigners at once
+        targets_com = self.process_predictions_and_anchors(
+            anchor_list, valid_flag_list, cls_scores, bbox_preds,
+            batch_img_metas, batch_gt_instances_ignore)
+        (anchor_list, valid_flag_list, num_level_anchors_list, cls_score_list,
+         bbox_pred_list, batch_gt_instances_ignore) = targets_com
+
+        # classification branch assigner
+        cls_targets = self.get_cls_targets(
+            anchor_list,
+            valid_flag_list,
+            num_level_anchors_list,
+            cls_score_list,
+            bbox_pred_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+
+        (cls_anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, avg_factor) = cls_targets
+
+        avg_factor = reduce_mean(
+            torch.tensor(avg_factor, dtype=torch.float, device=device)).item()
+        avg_factor = max(avg_factor, 1.0)
+
+        reweight_factor_per_level = self.calc_reweight_factor(labels_list)
+
+        cls_losses_cls, = multi_apply(
+            self.loss_cls_by_feat_single,
+            cls_scores,
+            labels_list,
+            label_weights_list,
+            reweight_factor_per_level,
+            avg_factor=avg_factor)
+
+        # regression branch assigner
+        reg_targets = self.get_reg_targets(
+            anchor_list,
+            valid_flag_list,
+            num_level_anchors_list,
+            cls_score_list,
+            bbox_pred_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+
+        (reg_anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, avg_factor) = reg_targets
+
+        avg_factor = reduce_mean(
+            torch.tensor(avg_factor, dtype=torch.float, device=device)).item()
+        avg_factor = max(avg_factor, 1.0)
+
+        reweight_factor_per_level = self.calc_reweight_factor(labels_list)
+
+        reg_losses_bbox, reg_losses_iou = multi_apply(
+            self.loss_reg_by_feat_single,
+            reg_anchor_list,
+            bbox_preds,
+            iou_preds,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            reweight_factor_per_level,
+            avg_factor=avg_factor)
+
+        return dict(
+            loss_cls=cls_losses_cls,
+            loss_bbox=reg_losses_bbox,
+            loss_iou=reg_losses_iou)
+
+    def process_predictions_and_anchors(
+            self,
+            anchor_list: List[List[Tensor]],
+            valid_flag_list: List[List[Tensor]],
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> tuple:
+        """Compute common vars for regression and classification targets.
+
+        Args:
+            anchor_list (List[List[Tensor]]): anchors of each image.
+            valid_flag_list (List[List[Tensor]]): Valid flags of each image.
+            cls_scores (List[Tensor]): Classification scores for all scale
+                levels, each is a 4D-tensor, the channels number is
+                num_base_priors * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for all scale
+                levels, each is a 4D-tensor, the channels number is
+                num_base_priors * 4.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Return:
+            tuple[Tensor]: A tuple of common loss vars.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        num_level_anchors_list = [num_level_anchors] * num_imgs
+
+        anchor_list_ = []
+        valid_flag_list_ = []
+        # concat all level anchors and flags to a single tensor
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            anchor_list_.append(torch.cat(anchor_list[i]))
+            valid_flag_list_.append(torch.cat(valid_flag_list[i]))
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None for _ in range(num_imgs)]
+
+        num_levels = len(cls_scores)
+        cls_score_list = []
+        bbox_pred_list = []
+
+        mlvl_cls_score_list = [
+            cls_score.permute(0, 2, 3, 1).reshape(
+                num_imgs, -1, self.num_base_priors * self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        mlvl_bbox_pred_list = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  self.num_base_priors * 4)
+            for bbox_pred in bbox_preds
+        ]
+
+        for i in range(num_imgs):
+            mlvl_cls_tensor_list = [
+                mlvl_cls_score_list[j][i] for j in range(num_levels)
+            ]
+            mlvl_bbox_tensor_list = [
+                mlvl_bbox_pred_list[j][i] for j in range(num_levels)
+            ]
+            cat_mlvl_cls_score = torch.cat(mlvl_cls_tensor_list, dim=0)
+            cat_mlvl_bbox_pred = torch.cat(mlvl_bbox_tensor_list, dim=0)
+            cls_score_list.append(cat_mlvl_cls_score)
+            bbox_pred_list.append(cat_mlvl_bbox_pred)
+        return (anchor_list_, valid_flag_list_, num_level_anchors_list,
+                cls_score_list, bbox_pred_list, batch_gt_instances_ignore)
+
+    def get_cls_targets(self,
+                        anchor_list: List[Tensor],
+                        valid_flag_list: List[Tensor],
+                        num_level_anchors_list: List[int],
+                        cls_score_list: List[Tensor],
+                        bbox_pred_list: List[Tensor],
+                        batch_gt_instances: InstanceList,
+                        batch_img_metas: List[dict],
+                        batch_gt_instances_ignore: OptInstanceList = None,
+                        unmap_outputs: bool = True) -> tuple:
+        """Get cls targets for DDOD head.
+
+        This method is almost the same as `AnchorHead.get_targets()`.
+        Besides returning the targets as the parent  method does,
+        it also returns the anchors as the first element of the
+        returned tuple.
+
+        Args:
+            anchor_list (list[Tensor]): anchors of each image.
+            valid_flag_list (list[Tensor]): Valid flags of each image.
+            num_level_anchors_list (list[Tensor]): Number of anchors of each
+                scale level of all image.
+            cls_score_list (list[Tensor]): Classification scores for all scale
+                levels, each is a 4D-tensor, the channels number is
+                num_base_priors * num_classes.
+            bbox_pred_list (list[Tensor]): Box energies / deltas for all scale
+                levels, each is a 4D-tensor, the channels number is
+                num_base_priors * 4.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Return:
+            tuple[Tensor]: A tuple of cls targets components.
+        """
+        (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+         all_bbox_weights, pos_inds_list, neg_inds_list,
+         sampling_results_list) = multi_apply(
+             self._get_targets_single,
+             anchor_list,
+             valid_flag_list,
+             cls_score_list,
+             bbox_pred_list,
+             num_level_anchors_list,
+             batch_gt_instances,
+             batch_img_metas,
+             batch_gt_instances_ignore,
+             unmap_outputs=unmap_outputs,
+             is_cls_assigner=True)
+        # Get `avg_factor` of all images, which calculate in `SamplingResult`.
+        # When using sampling method, avg_factor is usually the sum of
+        # positive and negative priors. When using `PseudoSampler`,
+        # `avg_factor` is usually equal to the number of positive priors.
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        # split targets to a list w.r.t. multiple levels
+        anchors_list = images_to_levels(all_anchors, num_level_anchors_list[0])
+        labels_list = images_to_levels(all_labels, num_level_anchors_list[0])
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors_list[0])
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors_list[0])
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors_list[0])
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, bbox_weights_list, avg_factor)
+
+    def get_reg_targets(self,
+                        anchor_list: List[Tensor],
+                        valid_flag_list: List[Tensor],
+                        num_level_anchors_list: List[int],
+                        cls_score_list: List[Tensor],
+                        bbox_pred_list: List[Tensor],
+                        batch_gt_instances: InstanceList,
+                        batch_img_metas: List[dict],
+                        batch_gt_instances_ignore: OptInstanceList = None,
+                        unmap_outputs: bool = True) -> tuple:
+        """Get reg targets for DDOD head.
+
+        This method is almost the same as `AnchorHead.get_targets()` when
+        is_cls_assigner is False. Besides returning the targets as the parent
+        method does, it also returns the anchors as the first element of the
+        returned tuple.
+
+        Args:
+            anchor_list (list[Tensor]): anchors of each image.
+            valid_flag_list (list[Tensor]): Valid flags of each image.
+            num_level_anchors_list (list[Tensor]): Number of anchors of each
+                scale level of all image.
+            cls_score_list (list[Tensor]): Classification scores for all scale
+                levels, each is a 4D-tensor, the channels number is
+                num_base_priors * num_classes.
+            bbox_pred_list (list[Tensor]): Box energies / deltas for all scale
+                levels, each is a 4D-tensor, the channels number is
+                num_base_priors * 4.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Return:
+            tuple[Tensor]: A tuple of reg targets components.
+        """
+        (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+         all_bbox_weights, pos_inds_list, neg_inds_list,
+         sampling_results_list) = multi_apply(
+             self._get_targets_single,
+             anchor_list,
+             valid_flag_list,
+             cls_score_list,
+             bbox_pred_list,
+             num_level_anchors_list,
+             batch_gt_instances,
+             batch_img_metas,
+             batch_gt_instances_ignore,
+             unmap_outputs=unmap_outputs,
+             is_cls_assigner=False)
+        # Get `avg_factor` of all images, which calculate in `SamplingResult`.
+        # When using sampling method, avg_factor is usually the sum of
+        # positive and negative priors. When using `PseudoSampler`,
+        # `avg_factor` is usually equal to the number of positive priors.
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        # split targets to a list w.r.t. multiple levels
+        anchors_list = images_to_levels(all_anchors, num_level_anchors_list[0])
+        labels_list = images_to_levels(all_labels, num_level_anchors_list[0])
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors_list[0])
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors_list[0])
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors_list[0])
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, bbox_weights_list, avg_factor)
+
+    def _get_targets_single(self,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            cls_scores: Tensor,
+                            bbox_preds: Tensor,
+                            num_level_anchors: List[int],
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True,
+                            is_cls_assigner: bool = True) -> tuple:
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image,
+                which are concatenated into a single tensor of shape
+                (num_base_priors, 4).
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                shape (num_base_priors,).
+            cls_scores (Tensor): Classification scores for all scale
+                levels of the image.
+            bbox_preds (Tensor): Box energies / deltas for all scale
+                levels of the image.
+            num_level_anchors (List[int]): Number of anchors of each
+                scale level.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+            is_cls_assigner (bool): Classification or regression.
+                Defaults to True.
+
+        Returns:
+            tuple: N is the number of total anchors in the image.
+            - anchors (Tensor): all anchors in the image with shape (N, 4).
+            - labels (Tensor): Labels of all anchors in the image with \
+            shape (N, ).
+            - label_weights (Tensor): Label weights of all anchor in the \
+            image with shape (N, ).
+            - bbox_targets (Tensor): BBox targets of all anchors in the \
+            image with shape (N, 4).
+            - bbox_weights (Tensor): BBox weights of all anchors in the \
+            image with shape (N, 4)
+            - pos_inds (Tensor): Indices of positive anchor with shape \
+            (num_pos, ).
+            - neg_inds (Tensor): Indices of negative anchor with shape \
+            (num_neg, ).
+            - sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg['allowed_border'])
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+
+        num_level_anchors_inside = self.get_num_level_anchors_inside(
+            num_level_anchors, inside_flags)
+        bbox_preds_valid = bbox_preds[inside_flags, :]
+        cls_scores_valid = cls_scores[inside_flags, :]
+
+        assigner = self.cls_assigner if is_cls_assigner else self.reg_assigner
+
+        # decode prediction out of assigner
+        bbox_preds_valid = self.bbox_coder.decode(anchors, bbox_preds_valid)
+        pred_instances = InstanceData(
+            priors=anchors, bboxes=bbox_preds_valid, scores=cls_scores_valid)
+
+        assign_result = assigner.assign(
+            pred_instances=pred_instances,
+            num_level_priors=num_level_anchors_inside,
+            gt_instances=gt_instances,
+            gt_instances_ignore=gt_instances_ignore)
+        sampling_result = self.sampler.sample(
+            assign_result=assign_result,
+            pred_instances=pred_instances,
+            gt_instances=gt_instances)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            pos_bbox_targets = self.bbox_coder.encode(
+                sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            anchors = unmap(anchors, num_total_anchors, inside_flags)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (anchors, labels, label_weights, bbox_targets, bbox_weights,
+                pos_inds, neg_inds, sampling_result)
+
+    def get_num_level_anchors_inside(self, num_level_anchors: List[int],
+                                     inside_flags: Tensor) -> List[int]:
+        """Get the anchors of each scale level inside.
+
+        Args:
+            num_level_anchors (list[int]): Number of anchors of each
+                scale level.
+            inside_flags (Tensor): Multi level inside flags of the image,
+                which are concatenated into a single tensor of
+                shape (num_base_priors,).
+
+        Returns:
+            list[int]: Number of anchors of each scale level inside.
+        """
+        split_inside_flags = torch.split(inside_flags, num_level_anchors)
+        num_level_anchors_inside = [
+            int(flags.sum()) for flags in split_inside_flags
+        ]
+        return num_level_anchors_inside
diff --git a/mmde/mmdet/models/dense_heads/ddq_detr_head.py b/mmde/mmdet/models/dense_heads/ddq_detr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0580653ac264ea0a597eec76624ab7eb3c7f6a10
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/ddq_detr_head.py
@@ -0,0 +1,550 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Dict, List, Tuple
+
+import torch
+from mmengine.model import bias_init_with_prob, constant_init
+from torch import Tensor, nn
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox_cxcywh_to_xyxy
+from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
+from ..layers import inverse_sigmoid
+from ..losses import DDQAuxLoss
+from ..utils import multi_apply
+from .dino_head import DINOHead
+
+
+@MODELS.register_module()
+class DDQDETRHead(DINOHead):
+    r"""Head of DDQDETR: Dense Distinct Query for
+        End-to-End Object Detection.
+
+    Code is modified from the `official github repo
+        <https://github.com/jshilong/DDQ>`_.
+
+    More details can be found in the `paper
+        <https://arxiv.org/abs/2303.12776>`_ .
+
+    Args:
+        aux_num_pos (int): Number of positive targets assigned to a
+            perdicted object. Defaults to 4.
+    """
+
+    def __init__(self, *args, aux_num_pos=4, **kwargs):
+        super(DDQDETRHead, self).__init__(*args, **kwargs)
+        self.aux_loss_for_dense = DDQAuxLoss(
+            train_cfg=dict(
+                assigner=dict(type='TopkHungarianAssigner', topk=aux_num_pos),
+                alpha=1,
+                beta=6))
+
+    def _init_layers(self) -> None:
+        """Initialize classification branch and regression branch of aux head
+        for dense queries."""
+        super(DDQDETRHead, self)._init_layers()
+        # If decoder `num_layers` = 6 and `as_two_stage` = True, then:
+        #   1) 6 main heads are required for
+        #       each decoder output of distinct queries.
+        #   2) 1 main head is required for `output_memory` of distinct queries.
+        #   3) 1 aux head is required for `output_memory` of dense queries,
+        #       which is done by code below this comment.
+        # So 8 heads are required in sum.
+        # aux head for dense queries on encoder feature map
+        self.cls_branches.append(copy.deepcopy(self.cls_branches[-1]))
+        self.reg_branches.append(copy.deepcopy(self.reg_branches[-1]))
+
+        # If decoder `num_layers` = 6 and `as_two_stage` = True, then:
+        #   6 aux heads are required for each decoder output of dense queries.
+        # So 8 + 6 = 14 heads and heads are requires in sum.
+        # self.num_pred_layer is 7
+        # aux head for dense queries in decoder
+        self.aux_cls_branches = nn.ModuleList([
+            copy.deepcopy(self.cls_branches[-1])
+            for _ in range(self.num_pred_layer - 1)
+        ])
+        self.aux_reg_branches = nn.ModuleList([
+            copy.deepcopy(self.reg_branches[-1])
+            for _ in range(self.num_pred_layer - 1)
+        ])
+
+    def init_weights(self) -> None:
+        """Initialize weights of the Deformable DETR head."""
+        bias_init = bias_init_with_prob(0.01)
+        for m in self.cls_branches:
+            nn.init.constant_(m.bias, bias_init)
+        for m in self.aux_cls_branches:
+            nn.init.constant_(m.bias, bias_init)
+        for m in self.reg_branches:
+            constant_init(m[-1], 0, bias=0)
+        for m in self.reg_branches:
+            nn.init.constant_(m[-1].bias.data[2:], 0.0)
+
+        for m in self.aux_reg_branches:
+            constant_init(m[-1], 0, bias=0)
+
+        for m in self.aux_reg_branches:
+            nn.init.constant_(m[-1].bias.data[2:], 0.0)
+
+    def forward(self, hidden_states: Tensor,
+                references: List[Tensor]) -> Tuple[Tensor]:
+        """Forward function.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries_total,
+                dim), where `num_queries_total` is the sum of
+                `num_denoising_queries`, `num_queries` and `num_dense_queries`
+                when `self.training` is `True`, else `num_queries`.
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). Each reference has shape (bs,
+                num_queries_total, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+
+        Returns:
+            tuple[Tensor]: results of head containing the following tensors.
+
+            - all_layers_outputs_classes (Tensor): Outputs from the
+              classification head, has shape (num_decoder_layers, bs,
+              num_queries_total, cls_out_channels).
+            - all_layers_outputs_coords (Tensor): Sigmoid outputs from the
+              regression head with normalized coordinate format (cx, cy, w,
+              h), has shape (num_decoder_layers, bs, num_queries_total, 4)
+              with the last dimension arranged as (cx, cy, w, h).
+        """
+        all_layers_outputs_classes = []
+        all_layers_outputs_coords = []
+        if self.training:
+            num_dense = self.cache_dict['num_dense_queries']
+        for layer_id in range(hidden_states.shape[0]):
+            reference = inverse_sigmoid(references[layer_id])
+            hidden_state = hidden_states[layer_id]
+            if self.training:
+                dense_hidden_state = hidden_state[:, -num_dense:]
+                hidden_state = hidden_state[:, :-num_dense]
+
+            outputs_class = self.cls_branches[layer_id](hidden_state)
+            tmp_reg_preds = self.reg_branches[layer_id](hidden_state)
+            if self.training:
+                dense_outputs_class = self.aux_cls_branches[layer_id](
+                    dense_hidden_state)
+                dense_tmp_reg_preds = self.aux_reg_branches[layer_id](
+                    dense_hidden_state)
+                outputs_class = torch.cat([outputs_class, dense_outputs_class],
+                                          dim=1)
+                tmp_reg_preds = torch.cat([tmp_reg_preds, dense_tmp_reg_preds],
+                                          dim=1)
+
+            if reference.shape[-1] == 4:
+                tmp_reg_preds += reference
+            else:
+                assert reference.shape[-1] == 2
+                tmp_reg_preds[..., :2] += reference
+            outputs_coord = tmp_reg_preds.sigmoid()
+            all_layers_outputs_classes.append(outputs_class)
+            all_layers_outputs_coords.append(outputs_coord)
+
+        all_layers_outputs_classes = torch.stack(all_layers_outputs_classes)
+        all_layers_outputs_coords = torch.stack(all_layers_outputs_coords)
+
+        return all_layers_outputs_classes, all_layers_outputs_coords
+
+    def loss(self,
+             hidden_states: Tensor,
+             references: List[Tensor],
+             enc_outputs_class: Tensor,
+             enc_outputs_coord: Tensor,
+             batch_data_samples: SampleList,
+             dn_meta: Dict[str, int],
+             aux_enc_outputs_class=None,
+             aux_enc_outputs_coord=None) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the queries of the upstream network.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries_total,
+                dim), where `num_queries_total` is the sum of
+                `num_denoising_queries`, `num_queries` and `num_dense_queries`
+                when `self.training` is `True`, else `num_queries`.
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). Each reference has shape (bs,
+                num_queries_total, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            enc_outputs_class (Tensor): The top k classification score of
+                each point on encoder feature map, has shape (bs, num_queries,
+                cls_out_channels).
+            enc_outputs_coord (Tensor): The proposal generated from points
+                with top k score, has shape (bs, num_queries, 4) with the
+                last dimension arranged as (cx, cy, w, h).
+            batch_data_samples (list[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+            aux_enc_outputs_class (Tensor): The `dense_topk` classification
+                score of each point on encoder feature map, has shape (bs,
+                num_dense_queries, cls_out_channels).
+                It is `None` when `self.training` is `False`.
+            aux_enc_outputs_coord (Tensor): The proposal generated from points
+                with `dense_topk` score, has shape (bs, num_dense_queries, 4)
+                with the last dimension arranged as (cx, cy, w, h).
+                It is `None` when `self.training` is `False`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states, references)
+        loss_inputs = outs + (enc_outputs_class, enc_outputs_coord,
+                              batch_gt_instances, batch_img_metas, dn_meta)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        aux_enc_outputs_coord = bbox_cxcywh_to_xyxy(aux_enc_outputs_coord)
+        aux_enc_outputs_coord_list = []
+        for img_id in range(len(aux_enc_outputs_coord)):
+            det_bboxes = aux_enc_outputs_coord[img_id]
+            img_shape = batch_img_metas[img_id]['img_shape']
+            det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
+            det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
+            aux_enc_outputs_coord_list.append(det_bboxes)
+        aux_enc_outputs_coord = torch.stack(aux_enc_outputs_coord_list)
+        aux_loss = self.aux_loss_for_dense.loss(
+            aux_enc_outputs_class.sigmoid(), aux_enc_outputs_coord,
+            [item.bboxes for item in batch_gt_instances],
+            [item.labels for item in batch_gt_instances], batch_img_metas)
+        for k, v in aux_loss.items():
+            losses[f'aux_enc_{k}'] = v
+
+        return losses
+
+    def loss_by_feat(
+        self,
+        all_layers_cls_scores: Tensor,
+        all_layers_bbox_preds: Tensor,
+        enc_cls_scores: Tensor,
+        enc_bbox_preds: Tensor,
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        dn_meta: Dict[str, int],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Loss function.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification scores of all
+                decoder layers, has shape (num_decoder_layers, bs,
+                num_queries_total, cls_out_channels).
+            all_layers_bbox_preds (Tensor): Bbox coordinates of all decoder
+                layers. Each has shape (num_decoder_layers, bs,
+                num_queries_total, 4) with normalized coordinate format
+                (cx, cy, w, h).
+            enc_cls_scores (Tensor): The top k score of each point on
+                encoder feature map, has shape (bs, num_queries,
+                cls_out_channels).
+            enc_bbox_preds (Tensor): The proposal generated from points
+                with top k score, has shape (bs, num_queries, 4) with the
+                last dimension arranged as (cx, cy, w, h).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+                group collation, including 'num_denoising_queries' and
+                'num_denoising_groups'. It will be used for split outputs of
+                denoising and matching parts and loss calculation.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        (all_layers_matching_cls_scores, all_layers_matching_bbox_preds,
+         all_layers_denoising_cls_scores, all_layers_denoising_bbox_preds) = \
+            self.split_outputs(
+                all_layers_cls_scores, all_layers_bbox_preds, dn_meta)
+
+        num_dense_queries = dn_meta['num_dense_queries']
+        num_layer = all_layers_matching_bbox_preds.size(0)
+        dense_all_layers_matching_cls_scores = all_layers_matching_cls_scores[:, :,  # noqa: E501
+                                                                              -num_dense_queries:]  # noqa: E501
+        dense_all_layers_matching_bbox_preds = all_layers_matching_bbox_preds[:, :,  # noqa: E501
+                                                                              -num_dense_queries:]  # noqa: E501
+
+        all_layers_matching_cls_scores = all_layers_matching_cls_scores[:, :, :  # noqa: E501
+                                                                        -num_dense_queries]  # noqa: E501
+        all_layers_matching_bbox_preds = all_layers_matching_bbox_preds[:, :, :  # noqa: E501
+                                                                        -num_dense_queries]  # noqa: E501
+
+        loss_dict = self.loss_for_distinct_queries(
+            all_layers_matching_cls_scores, all_layers_matching_bbox_preds,
+            batch_gt_instances, batch_img_metas, batch_gt_instances_ignore)
+
+        if enc_cls_scores is not None:
+
+            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
+                self.loss_by_feat_single(
+                    enc_cls_scores, enc_bbox_preds,
+                    batch_gt_instances=batch_gt_instances,
+                    batch_img_metas=batch_img_metas)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+            loss_dict['enc_loss_iou'] = enc_losses_iou
+
+        if all_layers_denoising_cls_scores is not None:
+            dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn(
+                all_layers_denoising_cls_scores,
+                all_layers_denoising_bbox_preds,
+                batch_gt_instances=batch_gt_instances,
+                batch_img_metas=batch_img_metas,
+                dn_meta=dn_meta)
+            loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
+            loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
+            loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
+            for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \
+                    enumerate(zip(dn_losses_cls[:-1], dn_losses_bbox[:-1],
+                                  dn_losses_iou[:-1])):
+                loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
+                loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
+                loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
+
+        for l_id in range(num_layer):
+            cls_scores = dense_all_layers_matching_cls_scores[l_id].sigmoid()
+            bbox_preds = dense_all_layers_matching_bbox_preds[l_id]
+
+            bbox_preds = bbox_cxcywh_to_xyxy(bbox_preds)
+            bbox_preds_list = []
+            for img_id in range(len(bbox_preds)):
+                det_bboxes = bbox_preds[img_id]
+                img_shape = batch_img_metas[img_id]['img_shape']
+                det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
+                det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
+                bbox_preds_list.append(det_bboxes)
+            bbox_preds = torch.stack(bbox_preds_list)
+            aux_loss = self.aux_loss_for_dense.loss(
+                cls_scores, bbox_preds,
+                [item.bboxes for item in batch_gt_instances],
+                [item.labels for item in batch_gt_instances], batch_img_metas)
+            for k, v in aux_loss.items():
+                loss_dict[f'{l_id}_aux_{k}'] = v
+
+        return loss_dict
+
+    def loss_for_distinct_queries(
+        self,
+        all_layers_cls_scores: Tensor,
+        all_layers_bbox_preds: Tensor,
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Calculate the loss of distinct queries, that is, excluding denoising
+        and dense queries. Only select the distinct queries in decoder for
+        loss.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification scores of all
+                decoder layers, has shape (num_decoder_layers, bs,
+                num_queries, cls_out_channels).
+            all_layers_bbox_preds (Tensor): Bbox coordinates of all decoder
+                layers. It has shape (num_decoder_layers, bs,
+                num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image,
+            e.g., image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert batch_gt_instances_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            'for batch_gt_instances_ignore setting to None.'
+
+        losses_cls, losses_bbox, losses_iou = multi_apply(
+            self._loss_for_distinct_queries_single,
+            all_layers_cls_scores,
+            all_layers_bbox_preds,
+            [i for i in range(len(all_layers_bbox_preds))],
+            batch_gt_instances=batch_gt_instances,
+            batch_img_metas=batch_img_metas)
+
+        loss_dict = dict()
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_bbox'] = losses_bbox[-1]
+        loss_dict['loss_iou'] = losses_iou[-1]
+        # loss from other decoder layers
+        num_dec_layer = 0
+        for loss_cls_i, loss_bbox_i, loss_iou_i in \
+                zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
+            num_dec_layer += 1
+        return loss_dict
+
+    def _loss_for_distinct_queries_single(self, cls_scores, bbox_preds, l_id,
+                                          batch_gt_instances, batch_img_metas):
+        """Calculate the loss for outputs from a single decoder layer of
+        distinct queries, that is, excluding denoising and dense queries. Only
+        select the distinct queries in decoder for loss.
+
+        Args:
+            cls_scores (Tensor): Classification scores of a single
+                decoder layer, has shape (bs, num_queries, cls_out_channels).
+            bbox_preds (Tensor): Bbox coordinates of a single decoder
+                layer. It has shape (bs, num_queries, 4) with the last
+                dimension arranged as (cx, cy, w, h).
+            l_id (int): Decoder layer index for these outputs.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image,
+            e.g., image size, scaling factor, etc.
+
+        Returns:
+            Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and
+            `loss_iou`.
+        """
+        num_imgs = cls_scores.size(0)
+        if 0 < l_id:
+            batch_mask = [
+                self.cache_dict['distinct_query_mask'][l_id - 1][
+                    img_id * self.cache_dict['num_heads']][0]
+                for img_id in range(num_imgs)
+            ]
+        else:
+            batch_mask = [
+                torch.ones(len(cls_scores[i]),
+                           device=cls_scores.device).bool()
+                for i in range(num_imgs)
+            ]
+        # only select the distinct queries in decoder for loss
+        cls_scores_list = [
+            cls_scores[i][batch_mask[i]] for i in range(num_imgs)
+        ]
+        bbox_preds_list = [
+            bbox_preds[i][batch_mask[i]] for i in range(num_imgs)
+        ]
+        cls_scores = torch.cat(cls_scores_list)
+
+        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
+                                           batch_gt_instances, batch_img_metas)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # classification loss
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+        cls_avg_factor = max(cls_avg_factor, 1)
+
+        loss_cls = self.loss_cls(
+            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes across all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # construct factors used for rescale bboxes
+        factors = []
+        for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds_list):
+            img_h, img_w, = img_meta['img_shape']
+            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+                                           img_h]).unsqueeze(0).repeat(
+                                               bbox_pred.size(0), 1)
+            factors.append(factor)
+        factors = torch.cat(factors, 0)
+
+        # DETR regress the relative position of boxes (cxcywh) in the image,
+        # thus the learning target is normalized by the image size. So here
+        # we need to re-scale them for calculating IoU loss
+        bbox_preds = torch.cat(bbox_preds_list)
+        bbox_preds = bbox_preds.reshape(-1, 4)
+        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
+        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
+
+        # regression IoU loss, defaultly GIoU loss
+        loss_iou = self.loss_iou(
+            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
+
+        # regression L1 loss
+        loss_bbox = self.loss_bbox(
+            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
+        return loss_cls, loss_bbox, loss_iou
+
+    def predict_by_feat(self,
+                        layer_cls_scores: Tensor,
+                        layer_bbox_preds: Tensor,
+                        batch_img_metas: List[dict],
+                        rescale: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            layer_cls_scores (Tensor): Classification scores of all
+                decoder layers, has shape (num_decoder_layers, bs,
+                num_queries, cls_out_channels).
+            layer_bbox_preds (Tensor): Bbox coordinates of all decoder layers.
+                Each has shape (num_decoder_layers, bs, num_queries, 4)
+                with normalized coordinate format (cx, cy, w, h).
+            batch_img_metas (list[dict]): Meta information of each image.
+            rescale (bool, optional): If `True`, return boxes in original
+                image space. Default `False`.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        cls_scores = layer_cls_scores[-1]
+        bbox_preds = layer_bbox_preds[-1]
+
+        num_imgs = cls_scores.size(0)
+        # -1 is last layer input query mask
+
+        batch_mask = [
+            self.cache_dict['distinct_query_mask'][-1][
+                img_id * self.cache_dict['num_heads']][0]
+            for img_id in range(num_imgs)
+        ]
+
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_score = cls_scores[img_id][batch_mask[img_id]]
+            bbox_pred = bbox_preds[img_id][batch_mask[img_id]]
+            img_meta = batch_img_metas[img_id]
+            results = self._predict_by_feat_single(cls_score, bbox_pred,
+                                                   img_meta, rescale)
+            result_list.append(results)
+        return result_list
diff --git a/mmde/mmdet/models/dense_heads/deformable_detr_head.py b/mmde/mmdet/models/dense_heads/deformable_detr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..adedd4aa6b533bcfece618eed4045c95bf0fdebb
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/deformable_detr_head.py
@@ -0,0 +1,329 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import Linear
+from mmengine.model import bias_init_with_prob, constant_init
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import InstanceList, OptInstanceList
+from ..layers import inverse_sigmoid
+from .detr_head import DETRHead
+
+
+@MODELS.register_module()
+class DeformableDETRHead(DETRHead):
+    r"""Head of DeformDETR: Deformable DETR: Deformable Transformers for
+    End-to-End Object Detection.
+
+    Code is modified from the `official github repo
+    <https://github.com/fundamentalvision/Deformable-DETR>`_.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2010.04159>`_ .
+
+    Args:
+        share_pred_layer (bool): Whether to share parameters for all the
+            prediction layers. Defaults to `False`.
+        num_pred_layer (int): The number of the prediction layers.
+            Defaults to 6.
+        as_two_stage (bool, optional): Whether to generate the proposal
+            from the outputs of encoder. Defaults to `False`.
+    """
+
+    def __init__(self,
+                 *args,
+                 share_pred_layer: bool = False,
+                 num_pred_layer: int = 6,
+                 as_two_stage: bool = False,
+                 **kwargs) -> None:
+        self.share_pred_layer = share_pred_layer
+        self.num_pred_layer = num_pred_layer
+        self.as_two_stage = as_two_stage
+
+        super().__init__(*args, **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize classification branch and regression branch of head."""
+        fc_cls = Linear(self.embed_dims, self.cls_out_channels)
+        reg_branch = []
+        for _ in range(self.num_reg_fcs):
+            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
+            reg_branch.append(nn.ReLU())
+        reg_branch.append(Linear(self.embed_dims, 4))
+        reg_branch = nn.Sequential(*reg_branch)
+
+        if self.share_pred_layer:
+            self.cls_branches = nn.ModuleList(
+                [fc_cls for _ in range(self.num_pred_layer)])
+            self.reg_branches = nn.ModuleList(
+                [reg_branch for _ in range(self.num_pred_layer)])
+        else:
+            self.cls_branches = nn.ModuleList(
+                [copy.deepcopy(fc_cls) for _ in range(self.num_pred_layer)])
+            self.reg_branches = nn.ModuleList([
+                copy.deepcopy(reg_branch) for _ in range(self.num_pred_layer)
+            ])
+
+    def init_weights(self) -> None:
+        """Initialize weights of the Deformable DETR head."""
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            for m in self.cls_branches:
+                if hasattr(m, 'bias') and m.bias is not None:
+                    nn.init.constant_(m.bias, bias_init)
+        for m in self.reg_branches:
+            constant_init(m[-1], 0, bias=0)
+        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
+        if self.as_two_stage:
+            for m in self.reg_branches:
+                nn.init.constant_(m[-1].bias.data[2:], 0.0)
+
+    def forward(self, hidden_states: Tensor,
+                references: List[Tensor]) -> Tuple[Tensor, Tensor]:
+        """Forward function.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries, dim).
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries, 4) when `as_two_stage` of the detector is `True`,
+                otherwise (bs, num_queries, 2). Each `inter_reference` has
+                shape (bs, num_queries, 4) when `with_box_refine` of the
+                detector is `True`, otherwise (bs, num_queries, 2). The
+                coordinates are arranged as (cx, cy) when the last dimension is
+                2, and (cx, cy, w, h) when it is 4.
+
+        Returns:
+            tuple[Tensor]: results of head containing the following tensor.
+
+            - all_layers_outputs_classes (Tensor): Outputs from the
+              classification head, has shape (num_decoder_layers, bs,
+              num_queries, cls_out_channels).
+            - all_layers_outputs_coords (Tensor): Sigmoid outputs from the
+              regression head with normalized coordinate format (cx, cy, w,
+              h), has shape (num_decoder_layers, bs, num_queries, 4) with the
+              last dimension arranged as (cx, cy, w, h).
+        """
+        all_layers_outputs_classes = []
+        all_layers_outputs_coords = []
+
+        for layer_id in range(hidden_states.shape[0]):
+            reference = inverse_sigmoid(references[layer_id])
+            # NOTE The last reference will not be used.
+            hidden_state = hidden_states[layer_id]
+            outputs_class = self.cls_branches[layer_id](hidden_state)
+            tmp_reg_preds = self.reg_branches[layer_id](hidden_state)
+            if reference.shape[-1] == 4:
+                # When `layer` is 0 and `as_two_stage` of the detector
+                # is `True`, or when `layer` is greater than 0 and
+                # `with_box_refine` of the detector is `True`.
+                tmp_reg_preds += reference
+            else:
+                # When `layer` is 0 and `as_two_stage` of the detector
+                # is `False`, or when `layer` is greater than 0 and
+                # `with_box_refine` of the detector is `False`.
+                assert reference.shape[-1] == 2
+                tmp_reg_preds[..., :2] += reference
+            outputs_coord = tmp_reg_preds.sigmoid()
+            all_layers_outputs_classes.append(outputs_class)
+            all_layers_outputs_coords.append(outputs_coord)
+
+        all_layers_outputs_classes = torch.stack(all_layers_outputs_classes)
+        all_layers_outputs_coords = torch.stack(all_layers_outputs_coords)
+
+        return all_layers_outputs_classes, all_layers_outputs_coords
+
+    def loss(self, hidden_states: Tensor, references: List[Tensor],
+             enc_outputs_class: Tensor, enc_outputs_coord: Tensor,
+             batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the queries of the upstream network.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, num_queries, bs, dim).
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries, 4) when `as_two_stage` of the detector is `True`,
+                otherwise (bs, num_queries, 2). Each `inter_reference` has
+                shape (bs, num_queries, 4) when `with_box_refine` of the
+                detector is `True`, otherwise (bs, num_queries, 2). The
+                coordinates are arranged as (cx, cy) when the last dimension is
+                2, and (cx, cy, w, h) when it is 4.
+            enc_outputs_class (Tensor): The score of each point on encode
+                feature map, has shape (bs, num_feat_points, cls_out_channels).
+                Only when `as_two_stage` is `True` it would be passed in,
+                otherwise it would be `None`.
+            enc_outputs_coord (Tensor): The proposal generate from the encode
+                feature map, has shape (bs, num_feat_points, 4) with the last
+                dimension arranged as (cx, cy, w, h). Only when `as_two_stage`
+                is `True` it would be passed in, otherwise it would be `None`.
+            batch_data_samples (list[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states, references)
+        loss_inputs = outs + (enc_outputs_class, enc_outputs_coord,
+                              batch_gt_instances, batch_img_metas)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    def loss_by_feat(
+        self,
+        all_layers_cls_scores: Tensor,
+        all_layers_bbox_preds: Tensor,
+        enc_cls_scores: Tensor,
+        enc_bbox_preds: Tensor,
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Loss function.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification scores of all
+                decoder layers, has shape (num_decoder_layers, bs, num_queries,
+                cls_out_channels).
+            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
+                layers. Each is a 4D-tensor with normalized coordinate format
+                (cx, cy, w, h) and has shape (num_decoder_layers, bs,
+                num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            enc_cls_scores (Tensor): The score of each point on encode
+                feature map, has shape (bs, num_feat_points, cls_out_channels).
+                Only when `as_two_stage` is `True` it would be passes in,
+                otherwise, it would be `None`.
+            enc_bbox_preds (Tensor): The proposal generate from the encode
+                feature map, has shape (bs, num_feat_points, 4) with the last
+                dimension arranged as (cx, cy, w, h). Only when `as_two_stage`
+                is `True` it would be passed in, otherwise it would be `None`.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        loss_dict = super().loss_by_feat(all_layers_cls_scores,
+                                         all_layers_bbox_preds,
+                                         batch_gt_instances, batch_img_metas,
+                                         batch_gt_instances_ignore)
+
+        # loss of proposal generated from encode feature map.
+        if enc_cls_scores is not None:
+            proposal_gt_instances = copy.deepcopy(batch_gt_instances)
+            for i in range(len(proposal_gt_instances)):
+                proposal_gt_instances[i].labels = torch.zeros_like(
+                    proposal_gt_instances[i].labels)
+            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
+                self.loss_by_feat_single(
+                    enc_cls_scores, enc_bbox_preds,
+                    batch_gt_instances=proposal_gt_instances,
+                    batch_img_metas=batch_img_metas)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+            loss_dict['enc_loss_iou'] = enc_losses_iou
+        return loss_dict
+
+    def predict(self,
+                hidden_states: Tensor,
+                references: List[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> InstanceList:
+        """Perform forward propagation and loss calculation of the detection
+        head on the queries of the upstream network.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, num_queries, bs, dim).
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries, 4) when `as_two_stage` of the detector is `True`,
+                otherwise (bs, num_queries, 2). Each `inter_reference` has
+                shape (bs, num_queries, 4) when `with_box_refine` of the
+                detector is `True`, otherwise (bs, num_queries, 2). The
+                coordinates are arranged as (cx, cy) when the last dimension is
+                2, and (cx, cy, w, h) when it is 4.
+            batch_data_samples (list[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): If `True`, return boxes in original
+                image space. Defaults to `True`.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        outs = self(hidden_states, references)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, rescale=rescale)
+        return predictions
+
+    def predict_by_feat(self,
+                        all_layers_cls_scores: Tensor,
+                        all_layers_bbox_preds: Tensor,
+                        batch_img_metas: List[Dict],
+                        rescale: bool = False) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification scores of all
+                decoder layers, has shape (num_decoder_layers, bs, num_queries,
+                cls_out_channels).
+            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
+                layers. Each is a 4D-tensor with normalized coordinate format
+                (cx, cy, w, h) and shape (num_decoder_layers, bs, num_queries,
+                4) with the last dimension arranged as (cx, cy, w, h).
+            batch_img_metas (list[dict]): Meta information of each image.
+            rescale (bool, optional): If `True`, return boxes in original
+                image space. Default `False`.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        cls_scores = all_layers_cls_scores[-1]
+        bbox_preds = all_layers_bbox_preds[-1]
+
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_score = cls_scores[img_id]
+            bbox_pred = bbox_preds[img_id]
+            img_meta = batch_img_metas[img_id]
+            results = self._predict_by_feat_single(cls_score, bbox_pred,
+                                                   img_meta, rescale)
+            result_list.append(results)
+        return result_list
diff --git a/mmde/mmdet/models/dense_heads/dense_test_mixins.py b/mmde/mmdet/models/dense_heads/dense_test_mixins.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7526d48430d6bc6b82777980d0bef418e80b91c
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/dense_test_mixins.py
@@ -0,0 +1,215 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import sys
+import warnings
+from inspect import signature
+
+import torch
+from mmcv.ops import batched_nms
+from mmengine.structures import InstanceData
+
+from mmdet.structures.bbox import bbox_mapping_back
+from ..test_time_augs import merge_aug_proposals
+
+if sys.version_info >= (3, 7):
+    from mmdet.utils.contextmanagers import completed
+
+
+class BBoxTestMixin(object):
+    """Mixin class for testing det bboxes via DenseHead."""
+
+    def simple_test_bboxes(self, feats, img_metas, rescale=False):
+        """Test det bboxes without test-time augmentation, can be applied in
+        DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``,
+        etc.
+
+        Args:
+            feats (tuple[torch.Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            img_metas (list[dict]): List of image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each
+                image after the post process. \
+                Each item usually contains following keys. \
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance,)
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances,).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        warnings.warn('You are calling `simple_test_bboxes` in '
+                      '`dense_test_mixins`, but the `dense_test_mixins`'
+                      'will be deprecated soon. Please use '
+                      '`simple_test` instead.')
+        outs = self.forward(feats)
+        results_list = self.get_results(
+            *outs, img_metas=img_metas, rescale=rescale)
+        return results_list
+
+    def aug_test_bboxes(self, feats, img_metas, rescale=False):
+        """Test det bboxes with test time augmentation, can be applied in
+        DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``,
+        etc.
+
+        Args:
+            feats (list[Tensor]): the outer list indicates test-time
+                augmentations and inner Tensor should have a shape NxCxHxW,
+                which contains features for all images in the batch.
+            img_metas (list[list[dict]]): the outer list indicates test-time
+                augs (multiscale, flip, etc.) and the inner list indicates
+                images in a batch. each dict has image information.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+                The first item is ``bboxes`` with shape (n, 5),
+                where 5 represent (tl_x, tl_y, br_x, br_y, score).
+                The shape of the second tensor in the tuple is ``labels``
+                with shape (n,). The length of list should always be 1.
+        """
+
+        warnings.warn('You are calling `aug_test_bboxes` in '
+                      '`dense_test_mixins`, but the `dense_test_mixins`'
+                      'will be deprecated soon. Please use '
+                      '`aug_test` instead.')
+        # check with_nms argument
+        gb_sig = signature(self.get_results)
+        gb_args = [p.name for p in gb_sig.parameters.values()]
+        gbs_sig = signature(self._get_results_single)
+        gbs_args = [p.name for p in gbs_sig.parameters.values()]
+        assert ('with_nms' in gb_args) and ('with_nms' in gbs_args), \
+            f'{self.__class__.__name__}' \
+            ' does not support test-time augmentation'
+
+        aug_bboxes = []
+        aug_scores = []
+        aug_labels = []
+        for x, img_meta in zip(feats, img_metas):
+            # only one image in the batch
+            outs = self.forward(x)
+            bbox_outputs = self.get_results(
+                *outs,
+                img_metas=img_meta,
+                cfg=self.test_cfg,
+                rescale=False,
+                with_nms=False)[0]
+            aug_bboxes.append(bbox_outputs.bboxes)
+            aug_scores.append(bbox_outputs.scores)
+            if len(bbox_outputs) >= 3:
+                aug_labels.append(bbox_outputs.labels)
+
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = self.merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas)
+        merged_labels = torch.cat(aug_labels, dim=0) if aug_labels else None
+
+        if merged_bboxes.numel() == 0:
+            det_bboxes = torch.cat([merged_bboxes, merged_scores[:, None]], -1)
+            return [
+                (det_bboxes, merged_labels),
+            ]
+
+        det_bboxes, keep_idxs = batched_nms(merged_bboxes, merged_scores,
+                                            merged_labels, self.test_cfg.nms)
+        det_bboxes = det_bboxes[:self.test_cfg.max_per_img]
+        det_labels = merged_labels[keep_idxs][:self.test_cfg.max_per_img]
+
+        if rescale:
+            _det_bboxes = det_bboxes
+        else:
+            _det_bboxes = det_bboxes.clone()
+            _det_bboxes[:, :4] *= det_bboxes.new_tensor(
+                img_metas[0][0]['scale_factor'])
+
+        results = InstanceData()
+        results.bboxes = _det_bboxes[:, :4]
+        results.scores = _det_bboxes[:, 4]
+        results.labels = det_labels
+        return [results]
+
+    def aug_test_rpn(self, feats, img_metas):
+        """Test with augmentation for only for ``RPNHead`` and its variants,
+        e.g., ``GARPNHead``, etc.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                        a 4D-tensor.
+            img_metas (list[dict]): Meta info of each image.
+
+        Returns:
+            list[Tensor]: Proposals of each image, each item has shape (n, 5),
+                where 5 represent (tl_x, tl_y, br_x, br_y, score).
+        """
+        samples_per_gpu = len(img_metas[0])
+        aug_proposals = [[] for _ in range(samples_per_gpu)]
+        for x, img_meta in zip(feats, img_metas):
+            results_list = self.simple_test_rpn(x, img_meta)
+            for i, results in enumerate(results_list):
+                proposals = torch.cat(
+                    [results.bboxes, results.scores[:, None]], dim=-1)
+                aug_proposals[i].append(proposals)
+        # reorganize the order of 'img_metas' to match the dimensions
+        # of 'aug_proposals'
+        aug_img_metas = []
+        for i in range(samples_per_gpu):
+            aug_img_meta = []
+            for j in range(len(img_metas)):
+                aug_img_meta.append(img_metas[j][i])
+            aug_img_metas.append(aug_img_meta)
+        # after merging, proposals will be rescaled to the original image size
+
+        merged_proposals = []
+        for proposals, aug_img_meta in zip(aug_proposals, aug_img_metas):
+            merged_proposal = merge_aug_proposals(proposals, aug_img_meta,
+                                                  self.test_cfg)
+            results = InstanceData()
+            results.bboxes = merged_proposal[:, :4]
+            results.scores = merged_proposal[:, 4]
+            merged_proposals.append(results)
+        return merged_proposals
+
+    if sys.version_info >= (3, 7):
+
+        async def async_simple_test_rpn(self, x, img_metas):
+            sleep_interval = self.test_cfg.pop('async_sleep_interval', 0.025)
+            async with completed(
+                    __name__, 'rpn_head_forward',
+                    sleep_interval=sleep_interval):
+                rpn_outs = self(x)
+
+            proposal_list = self.get_results(*rpn_outs, img_metas=img_metas)
+            return proposal_list
+
+    def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas):
+        """Merge augmented detection bboxes and scores.
+
+        Args:
+            aug_bboxes (list[Tensor]): shape (n, 4*#class)
+            aug_scores (list[Tensor] or None): shape (n, #class)
+            img_shapes (list[Tensor]): shape (3, ).
+
+        Returns:
+            tuple[Tensor]: ``bboxes`` with shape (n,4), where
+            4 represent (tl_x, tl_y, br_x, br_y)
+            and ``scores`` with shape (n,).
+        """
+        recovered_bboxes = []
+        for bboxes, img_info in zip(aug_bboxes, img_metas):
+            img_shape = img_info[0]['img_shape']
+            scale_factor = img_info[0]['scale_factor']
+            flip = img_info[0]['flip']
+            flip_direction = img_info[0]['flip_direction']
+            bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip,
+                                       flip_direction)
+            recovered_bboxes.append(bboxes)
+        bboxes = torch.cat(recovered_bboxes, dim=0)
+        if aug_scores is None:
+            return bboxes
+        else:
+            scores = torch.cat(aug_scores, dim=0)
+            return bboxes, scores
diff --git a/mmde/mmdet/models/dense_heads/detr_head.py b/mmde/mmdet/models/dense_heads/detr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..9daeb4740057c1f07095ffbf97b73ea40fc93106
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/detr_head.py
@@ -0,0 +1,634 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Linear
+from mmcv.cnn.bricks.transformer import FFN
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps,
+                                   bbox_xyxy_to_cxcywh)
+from mmdet.utils import (ConfigType, InstanceList, OptInstanceList,
+                         OptMultiConfig, reduce_mean)
+from ..losses import QualityFocalLoss
+from ..utils import multi_apply
+
+
+@MODELS.register_module()
+class DETRHead(BaseModule):
+    r"""Head of DETR. DETR:End-to-End Object Detection with Transformers.
+
+    More details can be found in the `paper
+    <https://arxiv.org/pdf/2005.12872>`_ .
+
+    Args:
+        num_classes (int): Number of categories excluding the background.
+        embed_dims (int): The dims of Transformer embedding.
+        num_reg_fcs (int): Number of fully-connected layers used in `FFN`,
+            which is then used for the regression head. Defaults to 2.
+        sync_cls_avg_factor (bool): Whether to sync the `avg_factor` of
+            all ranks. Default to `False`.
+        loss_cls (:obj:`ConfigDict` or dict): Config of the classification
+            loss. Defaults to `CrossEntropyLoss`.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of the regression bbox
+            loss. Defaults to `L1Loss`.
+        loss_iou (:obj:`ConfigDict` or dict): Config of the regression iou
+            loss. Defaults to `GIoULoss`.
+        train_cfg (:obj:`ConfigDict` or dict): Training config of transformer
+            head.
+        test_cfg (:obj:`ConfigDict` or dict): Testing config of transformer
+            head.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    _version = 2
+
+    def __init__(
+            self,
+            num_classes: int,
+            embed_dims: int = 256,
+            num_reg_fcs: int = 2,
+            sync_cls_avg_factor: bool = False,
+            loss_cls: ConfigType = dict(
+                type='CrossEntropyLoss',
+                bg_cls_weight=0.1,
+                use_sigmoid=False,
+                loss_weight=1.0,
+                class_weight=1.0),
+            loss_bbox: ConfigType = dict(type='L1Loss', loss_weight=5.0),
+            loss_iou: ConfigType = dict(type='GIoULoss', loss_weight=2.0),
+            train_cfg: ConfigType = dict(
+                assigner=dict(
+                    type='HungarianAssigner',
+                    match_costs=[
+                        dict(type='ClassificationCost', weight=1.),
+                        dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+                        dict(type='IoUCost', iou_mode='giou', weight=2.0)
+                    ])),
+            test_cfg: ConfigType = dict(max_per_img=100),
+            init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.bg_cls_weight = 0
+        self.sync_cls_avg_factor = sync_cls_avg_factor
+        class_weight = loss_cls.get('class_weight', None)
+        if class_weight is not None and (self.__class__ is DETRHead):
+            assert isinstance(class_weight, float), 'Expected ' \
+                'class_weight to have type float. Found ' \
+                f'{type(class_weight)}.'
+            # NOTE following the official DETR repo, bg_cls_weight means
+            # relative classification weight of the no-object class.
+            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
+            assert isinstance(bg_cls_weight, float), 'Expected ' \
+                'bg_cls_weight to have type float. Found ' \
+                f'{type(bg_cls_weight)}.'
+            class_weight = torch.ones(num_classes + 1) * class_weight
+            # set background class as the last indice
+            class_weight[num_classes] = bg_cls_weight
+            loss_cls.update({'class_weight': class_weight})
+            if 'bg_cls_weight' in loss_cls:
+                loss_cls.pop('bg_cls_weight')
+            self.bg_cls_weight = bg_cls_weight
+
+        if train_cfg:
+            assert 'assigner' in train_cfg, 'assigner should be provided ' \
+                                            'when train_cfg is set.'
+            assigner = train_cfg['assigner']
+            self.assigner = TASK_UTILS.build(assigner)
+            if train_cfg.get('sampler', None) is not None:
+                raise RuntimeError('DETR do not build sampler.')
+        self.num_classes = num_classes
+        self.embed_dims = embed_dims
+        self.num_reg_fcs = num_reg_fcs
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.loss_iou = MODELS.build(loss_iou)
+
+        if self.loss_cls.use_sigmoid:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1
+
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the transformer head."""
+        # cls branch
+        self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
+        # reg branch
+        self.activate = nn.ReLU()
+        self.reg_ffn = FFN(
+            self.embed_dims,
+            self.embed_dims,
+            self.num_reg_fcs,
+            dict(type='ReLU', inplace=True),
+            dropout=0.0,
+            add_residual=False)
+        # NOTE the activations of reg_branch here is the same as
+        # those in transformer, but they are actually different
+        # in DAB-DETR (prelu in transformer and relu in reg_branch)
+        self.fc_reg = Linear(self.embed_dims, 4)
+
+    def forward(self, hidden_states: Tensor) -> Tuple[Tensor]:
+        """"Forward function.
+
+        Args:
+            hidden_states (Tensor): Features from transformer decoder. If
+                `return_intermediate_dec` in detr.py is True output has shape
+                (num_decoder_layers, bs, num_queries, dim), else has shape
+                (1, bs, num_queries, dim) which only contains the last layer
+                outputs.
+        Returns:
+            tuple[Tensor]: results of head containing the following tensor.
+
+            - layers_cls_scores (Tensor): Outputs from the classification head,
+              shape (num_decoder_layers, bs, num_queries, cls_out_channels).
+              Note cls_out_channels should include background.
+            - layers_bbox_preds (Tensor): Sigmoid outputs from the regression
+              head with normalized coordinate format (cx, cy, w, h), has shape
+              (num_decoder_layers, bs, num_queries, 4).
+        """
+        layers_cls_scores = self.fc_cls(hidden_states)
+        layers_bbox_preds = self.fc_reg(
+            self.activate(self.reg_ffn(hidden_states))).sigmoid()
+        return layers_cls_scores, layers_bbox_preds
+
+    def loss(self, hidden_states: Tensor,
+             batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            hidden_states (Tensor): Feature from the transformer decoder, has
+                shape (num_decoder_layers, bs, num_queries, cls_out_channels)
+                or (num_decoder_layers, num_queries, bs, cls_out_channels).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states)
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    def loss_by_feat(
+        self,
+        all_layers_cls_scores: Tensor,
+        all_layers_bbox_preds: Tensor,
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """"Loss function.
+
+        Only outputs from the last feature level are used for computing
+        losses by default.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification outputs
+                of each decoder layers. Each is a 4D-tensor, has shape
+                (num_decoder_layers, bs, num_queries, cls_out_channels).
+            all_layers_bbox_preds (Tensor): Sigmoid regression
+                outputs of each decoder layers. Each is a 4D-tensor with
+                normalized coordinate format (cx, cy, w, h) and shape
+                (num_decoder_layers, bs, num_queries, 4).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert batch_gt_instances_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            'for batch_gt_instances_ignore setting to None.'
+
+        losses_cls, losses_bbox, losses_iou = multi_apply(
+            self.loss_by_feat_single,
+            all_layers_cls_scores,
+            all_layers_bbox_preds,
+            batch_gt_instances=batch_gt_instances,
+            batch_img_metas=batch_img_metas)
+
+        loss_dict = dict()
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_bbox'] = losses_bbox[-1]
+        loss_dict['loss_iou'] = losses_iou[-1]
+        # loss from other decoder layers
+        num_dec_layer = 0
+        for loss_cls_i, loss_bbox_i, loss_iou_i in \
+                zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
+            num_dec_layer += 1
+        return loss_dict
+
+    def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
+                            batch_gt_instances: InstanceList,
+                            batch_img_metas: List[dict]) -> Tuple[Tensor]:
+        """Loss function for outputs from a single decoder layer of a single
+        feature level.
+
+        Args:
+            cls_scores (Tensor): Box score logits from a single decoder layer
+                for all images, has shape (bs, num_queries, cls_out_channels).
+            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+                for all images, with normalized coordinate (cx, cy, w, h) and
+                shape (bs, num_queries, 4).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+        Returns:
+            Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and
+            `loss_iou`.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
+        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
+                                           batch_gt_instances, batch_img_metas)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # classification loss
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+        cls_avg_factor = max(cls_avg_factor, 1)
+
+        if isinstance(self.loss_cls, QualityFocalLoss):
+            bg_class_ind = self.num_classes
+            pos_inds = ((labels >= 0)
+                        & (labels < bg_class_ind)).nonzero().squeeze(1)
+            scores = label_weights.new_zeros(labels.shape)
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_decode_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets)
+            pos_bbox_pred = bbox_preds.reshape(-1, 4)[pos_inds]
+            pos_decode_bbox_pred = bbox_cxcywh_to_xyxy(pos_bbox_pred)
+            scores[pos_inds] = bbox_overlaps(
+                pos_decode_bbox_pred.detach(),
+                pos_decode_bbox_targets,
+                is_aligned=True)
+            loss_cls = self.loss_cls(
+                cls_scores, (labels, scores),
+                label_weights,
+                avg_factor=cls_avg_factor)
+        else:
+            loss_cls = self.loss_cls(
+                cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes across all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # construct factors used for rescale bboxes
+        factors = []
+        for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds):
+            img_h, img_w, = img_meta['img_shape']
+            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+                                           img_h]).unsqueeze(0).repeat(
+                                               bbox_pred.size(0), 1)
+            factors.append(factor)
+        factors = torch.cat(factors, 0)
+
+        # DETR regress the relative position of boxes (cxcywh) in the image,
+        # thus the learning target is normalized by the image size. So here
+        # we need to re-scale them for calculating IoU loss
+        bbox_preds = bbox_preds.reshape(-1, 4)
+        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
+        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
+
+        # regression IoU loss, defaultly GIoU loss
+        loss_iou = self.loss_iou(
+            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
+
+        # regression L1 loss
+        loss_bbox = self.loss_bbox(
+            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
+        return loss_cls, loss_bbox, loss_iou
+
+    def get_targets(self, cls_scores_list: List[Tensor],
+                    bbox_preds_list: List[Tensor],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict]) -> tuple:
+        """Compute regression and classification targets for a batch image.
+
+        Outputs from a single decoder layer of a single feature level are used.
+
+        Args:
+            cls_scores_list (list[Tensor]): Box score logits from a single
+                decoder layer for each image, has shape [num_queries,
+                cls_out_channels].
+            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
+                decoder layer for each image, with normalized coordinate
+                (cx, cy, w, h) and shape [num_queries, 4].
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+        Returns:
+            tuple: a tuple containing the following targets.
+
+            - labels_list (list[Tensor]): Labels for all images.
+            - label_weights_list (list[Tensor]): Label weights for all images.
+            - bbox_targets_list (list[Tensor]): BBox targets for all images.
+            - bbox_weights_list (list[Tensor]): BBox weights for all images.
+            - num_total_pos (int): Number of positive samples in all images.
+            - num_total_neg (int): Number of negative samples in all images.
+        """
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         pos_inds_list,
+         neg_inds_list) = multi_apply(self._get_targets_single,
+                                      cls_scores_list, bbox_preds_list,
+                                      batch_gt_instances, batch_img_metas)
+        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, num_total_pos, num_total_neg)
+
+    def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict) -> tuple:
+        """Compute regression and classification targets for one image.
+
+        Outputs from a single decoder layer of a single feature level are used.
+
+        Args:
+            cls_score (Tensor): Box score logits from a single decoder layer
+                for one image. Shape [num_queries, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
+                for one image, with normalized coordinate (cx, cy, w, h) and
+                shape [num_queries, 4].
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for one image.
+
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+
+            - labels (Tensor): Labels of each image.
+            - label_weights (Tensor]): Label weights of each image.
+            - bbox_targets (Tensor): BBox targets of each image.
+            - bbox_weights (Tensor): BBox weights of each image.
+            - pos_inds (Tensor): Sampled positive indices for each image.
+            - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+        img_h, img_w = img_meta['img_shape']
+        factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+                                       img_h]).unsqueeze(0)
+        num_bboxes = bbox_pred.size(0)
+        # convert bbox_pred from xywh, normalized to xyxy, unnormalized
+        bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred)
+        bbox_pred = bbox_pred * factor
+
+        pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred)
+        # assigner and sampler
+        assign_result = self.assigner.assign(
+            pred_instances=pred_instances,
+            gt_instances=gt_instances,
+            img_meta=img_meta)
+
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        pos_inds = torch.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+        neg_inds = torch.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+        pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+        pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :]
+
+        # label targets
+        labels = gt_bboxes.new_full((num_bboxes, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_bboxes)
+
+        # bbox targets
+        bbox_targets = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype)
+        bbox_weights = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype)
+        bbox_weights[pos_inds] = 1.0
+
+        # DETR regress the relative position of boxes (cxcywh) in the image.
+        # Thus the learning target should be normalized by the image size, also
+        # the box format should be converted from defaultly x1y1x2y2 to cxcywh.
+        pos_gt_bboxes_normalized = pos_gt_bboxes / factor
+        pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
+        bbox_targets[pos_inds] = pos_gt_bboxes_targets
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
+
+    def loss_and_predict(
+            self, hidden_states: Tuple[Tensor],
+            batch_data_samples: SampleList) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples. Over-write because
+        img_metas are needed as inputs for bbox_head.
+
+        Args:
+            hidden_states (tuple[Tensor]): Feature from the transformer
+                decoder, has shape (num_decoder_layers, bs, num_queries, dim).
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+
+        Returns:
+            tuple: the return value is a tuple contains:
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+            - predictions (list[:obj:`InstanceData`]): Detection
+              results of each image after the post process.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states)
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas)
+        return losses, predictions
+
+    def predict(self,
+                hidden_states: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network. Over-write
+        because img_metas are needed as inputs for bbox_head.
+
+        Args:
+            hidden_states (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        last_layer_hidden_state = hidden_states[-1].unsqueeze(0)
+        outs = self(last_layer_hidden_state)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, rescale=rescale)
+
+        return predictions
+
+    def predict_by_feat(self,
+                        layer_cls_scores: Tensor,
+                        layer_bbox_preds: Tensor,
+                        batch_img_metas: List[dict],
+                        rescale: bool = True) -> InstanceList:
+        """Transform network outputs for a batch into bbox predictions.
+
+        Args:
+            layer_cls_scores (Tensor): Classification outputs of the last or
+                all decoder layer. Each is a 4D-tensor, has shape
+                (num_decoder_layers, bs, num_queries, cls_out_channels).
+            layer_bbox_preds (Tensor): Sigmoid regression outputs of the last
+                or all decoder layer. Each is a 4D-tensor with normalized
+                coordinate format (cx, cy, w, h) and shape
+                (num_decoder_layers, bs, num_queries, 4).
+            batch_img_metas (list[dict]): Meta information of each image.
+            rescale (bool, optional): If `True`, return boxes in original
+                image space. Defaults to `True`.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        # NOTE only using outputs from the last feature level,
+        # and only the outputs from the last decoder layer is used.
+        cls_scores = layer_cls_scores[-1]
+        bbox_preds = layer_bbox_preds[-1]
+
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_score = cls_scores[img_id]
+            bbox_pred = bbox_preds[img_id]
+            img_meta = batch_img_metas[img_id]
+            results = self._predict_by_feat_single(cls_score, bbox_pred,
+                                                   img_meta, rescale)
+            result_list.append(results)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_score: Tensor,
+                                bbox_pred: Tensor,
+                                img_meta: dict,
+                                rescale: bool = True) -> InstanceData:
+        """Transform outputs from the last decoder layer into bbox predictions
+        for each image.
+
+        Args:
+            cls_score (Tensor): Box score logits from the last decoder layer
+                for each image. Shape [num_queries, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
+                for each image, with coordinate format (cx, cy, w, h) and
+                shape [num_queries, 4].
+            img_meta (dict): Image meta info.
+            rescale (bool): If True, return boxes in original image
+                space. Default True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_score) == len(bbox_pred)  # num_queries
+        max_per_img = self.test_cfg.get('max_per_img', len(cls_score))
+        img_shape = img_meta['img_shape']
+        # exclude background
+        if self.loss_cls.use_sigmoid:
+            cls_score = cls_score.sigmoid()
+            scores, indexes = cls_score.view(-1).topk(max_per_img)
+            det_labels = indexes % self.num_classes
+            bbox_index = indexes // self.num_classes
+            bbox_pred = bbox_pred[bbox_index]
+        else:
+            scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
+            scores, bbox_index = scores.topk(max_per_img)
+            bbox_pred = bbox_pred[bbox_index]
+            det_labels = det_labels[bbox_index]
+
+        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
+        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
+        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
+        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
+        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            det_bboxes /= det_bboxes.new_tensor(
+                img_meta['scale_factor']).repeat((1, 2))
+
+        results = InstanceData()
+        results.bboxes = det_bboxes
+        results.scores = scores
+        results.labels = det_labels
+        return results
diff --git a/mmde/mmdet/models/dense_heads/dino_head.py b/mmde/mmdet/models/dense_heads/dino_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..54f46d1474f97f2d183926a6dc68a0be79f7cef1
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/dino_head.py
@@ -0,0 +1,479 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps,
+                                   bbox_xyxy_to_cxcywh)
+from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
+from ..losses import QualityFocalLoss
+from ..utils import multi_apply
+from .deformable_detr_head import DeformableDETRHead
+
+
+@MODELS.register_module()
+class DINOHead(DeformableDETRHead):
+    r"""Head of the DINO: DETR with Improved DeNoising Anchor Boxes
+    for End-to-End Object Detection
+
+    Code is modified from the `official github repo
+    <https://github.com/IDEA-Research/DINO>`_.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2203.03605>`_ .
+    """
+
+    def loss(self, hidden_states: Tensor, references: List[Tensor],
+             enc_outputs_class: Tensor, enc_outputs_coord: Tensor,
+             batch_data_samples: SampleList, dn_meta: Dict[str, int]) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the queries of the upstream network.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries_total,
+                dim), where `num_queries_total` is the sum of
+                `num_denoising_queries` and `num_matching_queries` when
+                `self.training` is `True`, else `num_matching_queries`.
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries_total, 4) and each `inter_reference` has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            enc_outputs_class (Tensor): The score of each point on encode
+                feature map, has shape (bs, num_feat_points, cls_out_channels).
+            enc_outputs_coord (Tensor): The proposal generate from the
+                encode feature map, has shape (bs, num_feat_points, 4) with the
+                last dimension arranged as (cx, cy, w, h).
+            batch_data_samples (list[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states, references)
+        loss_inputs = outs + (enc_outputs_class, enc_outputs_coord,
+                              batch_gt_instances, batch_img_metas, dn_meta)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    def loss_by_feat(
+        self,
+        all_layers_cls_scores: Tensor,
+        all_layers_bbox_preds: Tensor,
+        enc_cls_scores: Tensor,
+        enc_bbox_preds: Tensor,
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        dn_meta: Dict[str, int],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Loss function.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification scores of all
+                decoder layers, has shape (num_decoder_layers, bs,
+                num_queries_total, cls_out_channels), where
+                `num_queries_total` is the sum of `num_denoising_queries`
+                and `num_matching_queries`.
+            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
+                layers. Each is a 4D-tensor with normalized coordinate format
+                (cx, cy, w, h) and has shape (num_decoder_layers, bs,
+                num_queries_total, 4).
+            enc_cls_scores (Tensor): The score of each point on encode
+                feature map, has shape (bs, num_feat_points, cls_out_channels).
+            enc_bbox_preds (Tensor): The proposal generate from the encode
+                feature map, has shape (bs, num_feat_points, 4) with the last
+                dimension arranged as (cx, cy, w, h).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+                group collation, including 'num_denoising_queries' and
+                'num_denoising_groups'. It will be used for split outputs of
+                denoising and matching parts and loss calculation.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        # extract denoising and matching part of outputs
+        (all_layers_matching_cls_scores, all_layers_matching_bbox_preds,
+         all_layers_denoising_cls_scores, all_layers_denoising_bbox_preds) = \
+            self.split_outputs(
+                all_layers_cls_scores, all_layers_bbox_preds, dn_meta)
+
+        loss_dict = super(DeformableDETRHead, self).loss_by_feat(
+            all_layers_matching_cls_scores, all_layers_matching_bbox_preds,
+            batch_gt_instances, batch_img_metas, batch_gt_instances_ignore)
+        # NOTE DETRHead.loss_by_feat but not DeformableDETRHead.loss_by_feat
+        # is called, because the encoder loss calculations are different
+        # between DINO and DeformableDETR.
+
+        # loss of proposal generated from encode feature map.
+        if enc_cls_scores is not None:
+            # NOTE The enc_loss calculation of the DINO is
+            # different from that of Deformable DETR.
+            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
+                self.loss_by_feat_single(
+                    enc_cls_scores, enc_bbox_preds,
+                    batch_gt_instances=batch_gt_instances,
+                    batch_img_metas=batch_img_metas)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+            loss_dict['enc_loss_iou'] = enc_losses_iou
+
+        if all_layers_denoising_cls_scores is not None:
+            # calculate denoising loss from all decoder layers
+            dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn(
+                all_layers_denoising_cls_scores,
+                all_layers_denoising_bbox_preds,
+                batch_gt_instances=batch_gt_instances,
+                batch_img_metas=batch_img_metas,
+                dn_meta=dn_meta)
+            # collate denoising loss
+            loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
+            loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
+            loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
+            for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \
+                    enumerate(zip(dn_losses_cls[:-1], dn_losses_bbox[:-1],
+                                  dn_losses_iou[:-1])):
+                loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
+                loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
+                loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
+        return loss_dict
+
+    def loss_dn(self, all_layers_denoising_cls_scores: Tensor,
+                all_layers_denoising_bbox_preds: Tensor,
+                batch_gt_instances: InstanceList, batch_img_metas: List[dict],
+                dn_meta: Dict[str, int]) -> Tuple[List[Tensor]]:
+        """Calculate denoising loss.
+
+        Args:
+            all_layers_denoising_cls_scores (Tensor): Classification scores of
+                all decoder layers in denoising part, has shape (
+                num_decoder_layers, bs, num_denoising_queries,
+                cls_out_channels).
+            all_layers_denoising_bbox_preds (Tensor): Regression outputs of all
+                decoder layers in denoising part. Each is a 4D-tensor with
+                normalized coordinate format (cx, cy, w, h) and has shape
+                (num_decoder_layers, bs, num_denoising_queries, 4).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+
+        Returns:
+            Tuple[List[Tensor]]: The loss_dn_cls, loss_dn_bbox, and loss_dn_iou
+            of each decoder layers.
+        """
+        return multi_apply(
+            self._loss_dn_single,
+            all_layers_denoising_cls_scores,
+            all_layers_denoising_bbox_preds,
+            batch_gt_instances=batch_gt_instances,
+            batch_img_metas=batch_img_metas,
+            dn_meta=dn_meta)
+
+    def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor,
+                        batch_gt_instances: InstanceList,
+                        batch_img_metas: List[dict],
+                        dn_meta: Dict[str, int]) -> Tuple[Tensor]:
+        """Denoising loss for outputs from a single decoder layer.
+
+        Args:
+            dn_cls_scores (Tensor): Classification scores of a single decoder
+                layer in denoising part, has shape (bs, num_denoising_queries,
+                cls_out_channels).
+            dn_bbox_preds (Tensor): Regression outputs of a single decoder
+                layer in denoising part. Each is a 4D-tensor with normalized
+                coordinate format (cx, cy, w, h) and has shape
+                (bs, num_denoising_queries, 4).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+
+        Returns:
+            Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and
+            `loss_iou`.
+        """
+        cls_reg_targets = self.get_dn_targets(batch_gt_instances,
+                                              batch_img_metas, dn_meta)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # classification loss
+        cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = \
+            num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+        cls_avg_factor = max(cls_avg_factor, 1)
+
+        if len(cls_scores) > 0:
+            if isinstance(self.loss_cls, QualityFocalLoss):
+                bg_class_ind = self.num_classes
+                pos_inds = ((labels >= 0)
+                            & (labels < bg_class_ind)).nonzero().squeeze(1)
+                scores = label_weights.new_zeros(labels.shape)
+                pos_bbox_targets = bbox_targets[pos_inds]
+                pos_decode_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets)
+                pos_bbox_pred = dn_bbox_preds.reshape(-1, 4)[pos_inds]
+                pos_decode_bbox_pred = bbox_cxcywh_to_xyxy(pos_bbox_pred)
+                scores[pos_inds] = bbox_overlaps(
+                    pos_decode_bbox_pred.detach(),
+                    pos_decode_bbox_targets,
+                    is_aligned=True)
+                loss_cls = self.loss_cls(
+                    cls_scores, (labels, scores),
+                    weight=label_weights,
+                    avg_factor=cls_avg_factor)
+            else:
+                loss_cls = self.loss_cls(
+                    cls_scores,
+                    labels,
+                    label_weights,
+                    avg_factor=cls_avg_factor)
+        else:
+            loss_cls = torch.zeros(
+                1, dtype=cls_scores.dtype, device=cls_scores.device)
+
+        # Compute the average number of gt boxes across all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # construct factors used for rescale bboxes
+        factors = []
+        for img_meta, bbox_pred in zip(batch_img_metas, dn_bbox_preds):
+            img_h, img_w = img_meta['img_shape']
+            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+                                           img_h]).unsqueeze(0).repeat(
+                                               bbox_pred.size(0), 1)
+            factors.append(factor)
+        factors = torch.cat(factors)
+
+        # DETR regress the relative position of boxes (cxcywh) in the image,
+        # thus the learning target is normalized by the image size. So here
+        # we need to re-scale them for calculating IoU loss
+        bbox_preds = dn_bbox_preds.reshape(-1, 4)
+        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
+        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
+
+        # regression IoU loss, defaultly GIoU loss
+        loss_iou = self.loss_iou(
+            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
+
+        # regression L1 loss
+        loss_bbox = self.loss_bbox(
+            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
+        return loss_cls, loss_bbox, loss_iou
+
+    def get_dn_targets(self, batch_gt_instances: InstanceList,
+                       batch_img_metas: dict, dn_meta: Dict[str,
+                                                            int]) -> tuple:
+        """Get targets in denoising part for a batch of images.
+
+        Args:
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+
+        Returns:
+            tuple: a tuple containing the following targets.
+
+            - labels_list (list[Tensor]): Labels for all images.
+            - label_weights_list (list[Tensor]): Label weights for all images.
+            - bbox_targets_list (list[Tensor]): BBox targets for all images.
+            - bbox_weights_list (list[Tensor]): BBox weights for all images.
+            - num_total_pos (int): Number of positive samples in all images.
+            - num_total_neg (int): Number of negative samples in all images.
+        """
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         pos_inds_list, neg_inds_list) = multi_apply(
+             self._get_dn_targets_single,
+             batch_gt_instances,
+             batch_img_metas,
+             dn_meta=dn_meta)
+        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, num_total_pos, num_total_neg)
+
+    def _get_dn_targets_single(self, gt_instances: InstanceData,
+                               img_meta: dict, dn_meta: Dict[str,
+                                                             int]) -> tuple:
+        """Get targets in denoising part for one image.
+
+        Args:
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for one image.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+
+            - labels (Tensor): Labels of each image.
+            - label_weights (Tensor]): Label weights of each image.
+            - bbox_targets (Tensor): BBox targets of each image.
+            - bbox_weights (Tensor): BBox weights of each image.
+            - pos_inds (Tensor): Sampled positive indices for each image.
+            - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        num_groups = dn_meta['num_denoising_groups']
+        num_denoising_queries = dn_meta['num_denoising_queries']
+        num_queries_each_group = int(num_denoising_queries / num_groups)
+        device = gt_bboxes.device
+
+        if len(gt_labels) > 0:
+            t = torch.arange(len(gt_labels), dtype=torch.long, device=device)
+            t = t.unsqueeze(0).repeat(num_groups, 1)
+            pos_assigned_gt_inds = t.flatten()
+            pos_inds = torch.arange(
+                num_groups, dtype=torch.long, device=device)
+            pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t
+            pos_inds = pos_inds.flatten()
+        else:
+            pos_inds = pos_assigned_gt_inds = \
+                gt_bboxes.new_tensor([], dtype=torch.long)
+
+        neg_inds = pos_inds + num_queries_each_group // 2
+
+        # label targets
+        labels = gt_bboxes.new_full((num_denoising_queries, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_denoising_queries)
+
+        # bbox targets
+        bbox_targets = torch.zeros(num_denoising_queries, 4, device=device)
+        bbox_weights = torch.zeros(num_denoising_queries, 4, device=device)
+        bbox_weights[pos_inds] = 1.0
+        img_h, img_w = img_meta['img_shape']
+
+        # DETR regress the relative position of boxes (cxcywh) in the image.
+        # Thus the learning target should be normalized by the image size, also
+        # the box format should be converted from defaultly x1y1x2y2 to cxcywh.
+        factor = gt_bboxes.new_tensor([img_w, img_h, img_w,
+                                       img_h]).unsqueeze(0)
+        gt_bboxes_normalized = gt_bboxes / factor
+        gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized)
+        bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1])
+
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
+
+    @staticmethod
+    def split_outputs(all_layers_cls_scores: Tensor,
+                      all_layers_bbox_preds: Tensor,
+                      dn_meta: Dict[str, int]) -> Tuple[Tensor]:
+        """Split outputs of the denoising part and the matching part.
+
+        For the total outputs of `num_queries_total` length, the former
+        `num_denoising_queries` outputs are from denoising queries, and
+        the rest `num_matching_queries` ones are from matching queries,
+        where `num_queries_total` is the sum of `num_denoising_queries` and
+        `num_matching_queries`.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification scores of all
+                decoder layers, has shape (num_decoder_layers, bs,
+                num_queries_total, cls_out_channels).
+            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
+                layers. Each is a 4D-tensor with normalized coordinate format
+                (cx, cy, w, h) and has shape (num_decoder_layers, bs,
+                num_queries_total, 4).
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'.
+
+        Returns:
+            Tuple[Tensor]: a tuple containing the following outputs.
+
+            - all_layers_matching_cls_scores (Tensor): Classification scores
+              of all decoder layers in matching part, has shape
+              (num_decoder_layers, bs, num_matching_queries, cls_out_channels).
+            - all_layers_matching_bbox_preds (Tensor): Regression outputs of
+              all decoder layers in matching part. Each is a 4D-tensor with
+              normalized coordinate format (cx, cy, w, h) and has shape
+              (num_decoder_layers, bs, num_matching_queries, 4).
+            - all_layers_denoising_cls_scores (Tensor): Classification scores
+              of all decoder layers in denoising part, has shape
+              (num_decoder_layers, bs, num_denoising_queries,
+              cls_out_channels).
+            - all_layers_denoising_bbox_preds (Tensor): Regression outputs of
+              all decoder layers in denoising part. Each is a 4D-tensor with
+              normalized coordinate format (cx, cy, w, h) and has shape
+              (num_decoder_layers, bs, num_denoising_queries, 4).
+        """
+        num_denoising_queries = dn_meta['num_denoising_queries']
+        if dn_meta is not None:
+            all_layers_denoising_cls_scores = \
+                all_layers_cls_scores[:, :, : num_denoising_queries, :]
+            all_layers_denoising_bbox_preds = \
+                all_layers_bbox_preds[:, :, : num_denoising_queries, :]
+            all_layers_matching_cls_scores = \
+                all_layers_cls_scores[:, :, num_denoising_queries:, :]
+            all_layers_matching_bbox_preds = \
+                all_layers_bbox_preds[:, :, num_denoising_queries:, :]
+        else:
+            all_layers_denoising_cls_scores = None
+            all_layers_denoising_bbox_preds = None
+            all_layers_matching_cls_scores = all_layers_cls_scores
+            all_layers_matching_bbox_preds = all_layers_bbox_preds
+        return (all_layers_matching_cls_scores, all_layers_matching_bbox_preds,
+                all_layers_denoising_cls_scores,
+                all_layers_denoising_bbox_preds)
diff --git a/mmde/mmdet/models/dense_heads/embedding_rpn_head.py b/mmde/mmdet/models/dense_heads/embedding_rpn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..97e84fa83b892c0274615d582fe43a6693541617
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/embedding_rpn_head.py
@@ -0,0 +1,132 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox_cxcywh_to_xyxy
+from mmdet.structures.det_data_sample import SampleList
+from mmdet.utils import InstanceList, OptConfigType
+
+
+@MODELS.register_module()
+class EmbeddingRPNHead(BaseModule):
+    """RPNHead in the `Sparse R-CNN <https://arxiv.org/abs/2011.12450>`_ .
+
+    Unlike traditional RPNHead, this module does not need FPN input, but just
+    decode `init_proposal_bboxes` and expand the first dimension of
+    `init_proposal_bboxes` and `init_proposal_features` to the batch_size.
+
+    Args:
+        num_proposals (int): Number of init_proposals. Defaults to 100.
+        proposal_feature_channel (int): Channel number of
+            init_proposal_feature. Defaults to 256.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_proposals: int = 100,
+                 proposal_feature_channel: int = 256,
+                 init_cfg: OptConfigType = None,
+                 **kwargs) -> None:
+        # `**kwargs` is necessary to avoid some potential error.
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super().__init__(init_cfg=init_cfg)
+        self.num_proposals = num_proposals
+        self.proposal_feature_channel = proposal_feature_channel
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize a sparse set of proposal boxes and proposal features."""
+        self.init_proposal_bboxes = nn.Embedding(self.num_proposals, 4)
+        self.init_proposal_features = nn.Embedding(
+            self.num_proposals, self.proposal_feature_channel)
+
+    def init_weights(self) -> None:
+        """Initialize the init_proposal_bboxes as normalized.
+
+        [c_x, c_y, w, h], and we initialize it to the size of  the entire
+        image.
+        """
+        super().init_weights()
+        nn.init.constant_(self.init_proposal_bboxes.weight[:, :2], 0.5)
+        nn.init.constant_(self.init_proposal_bboxes.weight[:, 2:], 1)
+
+    def _decode_init_proposals(self, x: List[Tensor],
+                               batch_data_samples: SampleList) -> InstanceList:
+        """Decode init_proposal_bboxes according to the size of images and
+        expand dimension of init_proposal_features to batch_size.
+
+        Args:
+            x (list[Tensor]): List of FPN features.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            List[:obj:`InstanceData`:] Detection results of each image.
+            Each item usually contains following keys.
+
+            - proposals: Decoded proposal bboxes,
+              has shape (num_proposals, 4).
+            - features: init_proposal_features, expanded proposal
+              features, has shape
+              (num_proposals, proposal_feature_channel).
+            - imgs_whwh: Tensor with shape
+              (num_proposals, 4), the dimension means
+              [img_width, img_height, img_width, img_height].
+        """
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+
+        proposals = self.init_proposal_bboxes.weight.clone()
+        proposals = bbox_cxcywh_to_xyxy(proposals)
+        imgs_whwh = []
+        for meta in batch_img_metas:
+            h, w = meta['img_shape'][:2]
+            imgs_whwh.append(x[0].new_tensor([[w, h, w, h]]))
+        imgs_whwh = torch.cat(imgs_whwh, dim=0)
+        imgs_whwh = imgs_whwh[:, None, :]
+        proposals = proposals * imgs_whwh
+
+        rpn_results_list = []
+        for idx in range(len(batch_img_metas)):
+            rpn_results = InstanceData()
+            rpn_results.bboxes = proposals[idx]
+            rpn_results.imgs_whwh = imgs_whwh[idx].repeat(
+                self.num_proposals, 1)
+            rpn_results.features = self.init_proposal_features.weight.clone()
+            rpn_results_list.append(rpn_results)
+        return rpn_results_list
+
+    def loss(self, *args, **kwargs):
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network."""
+        raise NotImplementedError(
+            'EmbeddingRPNHead does not have `loss`, please use '
+            '`predict` or `loss_and_predict` instead.')
+
+    def predict(self, x: List[Tensor], batch_data_samples: SampleList,
+                **kwargs) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network."""
+        # `**kwargs` is necessary to avoid some potential error.
+        return self._decode_init_proposals(
+            x=x, batch_data_samples=batch_data_samples)
+
+    def loss_and_predict(self, x: List[Tensor], batch_data_samples: SampleList,
+                         **kwargs) -> tuple:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples."""
+        # `**kwargs` is necessary to avoid some potential error.
+        predictions = self._decode_init_proposals(
+            x=x, batch_data_samples=batch_data_samples)
+
+        return dict(), predictions
diff --git a/mmde/mmdet/models/dense_heads/fcos_head.py b/mmde/mmdet/models/dense_heads/fcos_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba4d4640010c7e8e7c6a4db3e0fce887b4105217
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/fcos_head.py
@@ -0,0 +1,476 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import Scale
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.layers import NormedConv2d
+from mmdet.registry import MODELS
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig,
+                         OptInstanceList, RangeType, reduce_mean)
+from ..utils import multi_apply
+from .anchor_free_head import AnchorFreeHead
+
+INF = 1e8
+
+
+@MODELS.register_module()
+class FCOSHead(AnchorFreeHead):
+    """Anchor-free head used in `FCOS <https://arxiv.org/abs/1904.01355>`_.
+
+    The FCOS head does not use anchor boxes. Instead bounding boxes are
+    predicted at each pixel and a centerness measure is used to suppress
+    low-quality predictions.
+    Here norm_on_bbox, centerness_on_reg, dcn_on_last_conv are training
+    tricks used in official repo, which will bring remarkable mAP gains
+    of up to 4.9. Please see https://github.com/tianzhi0549/FCOS for
+    more detail.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        strides (Sequence[int] or Sequence[Tuple[int, int]]): Strides of points
+            in multiple feature levels. Defaults to (4, 8, 16, 32, 64).
+        regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple
+            level points.
+        center_sampling (bool): If true, use center sampling.
+            Defaults to False.
+        center_sample_radius (float): Radius of center sampling.
+            Defaults to 1.5.
+        norm_on_bbox (bool): If true, normalize the regression targets with
+            FPN strides. Defaults to False.
+        centerness_on_reg (bool): If true, position centerness on the
+            regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
+            Defaults to False.
+        conv_bias (bool or str): If specified as `auto`, it will be decided by
+            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
+            None, otherwise False. Defaults to "auto".
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+        loss_centerness (:obj:`ConfigDict`, or dict): Config of centerness
+            loss.
+        norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and
+            config norm layer.  Defaults to
+            ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``.
+        cls_predictor_cfg (:obj:`ConfigDict` or dict): dictionary to construct and
+            config conv_cls. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+
+    Example:
+        >>> self = FCOSHead(11, 7)
+        >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
+        >>> cls_score, bbox_pred, centerness = self.forward(feats)
+        >>> assert len(cls_score) == len(self.scales)
+    """  # noqa: E501
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 regress_ranges: RangeType = ((-1, 64), (64, 128), (128, 256),
+                                              (256, 512), (512, INF)),
+                 center_sampling: bool = False,
+                 center_sample_radius: float = 1.5,
+                 norm_on_bbox: bool = False,
+                 centerness_on_reg: bool = False,
+                 loss_cls: ConfigType = dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 loss_bbox: ConfigType = dict(type='IoULoss', loss_weight=1.0),
+                 loss_centerness: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 norm_cfg: ConfigType = dict(
+                     type='GN', num_groups=32, requires_grad=True),
+                 cls_predictor_cfg=None,
+                 init_cfg: MultiConfig = dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='conv_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs) -> None:
+        self.regress_ranges = regress_ranges
+        self.center_sampling = center_sampling
+        self.center_sample_radius = center_sample_radius
+        self.norm_on_bbox = norm_on_bbox
+        self.centerness_on_reg = centerness_on_reg
+        self.cls_predictor_cfg = cls_predictor_cfg
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            norm_cfg=norm_cfg,
+            init_cfg=init_cfg,
+            **kwargs)
+        self.loss_centerness = MODELS.build(loss_centerness)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        super()._init_layers()
+        self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1)
+        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
+        if self.cls_predictor_cfg is not None:
+            self.cls_predictor_cfg.pop('type')
+            self.conv_cls = NormedConv2d(
+                self.feat_channels,
+                self.cls_out_channels,
+                1,
+                padding=0,
+                **self.cls_predictor_cfg)
+
+    def forward(
+            self, x: Tuple[Tensor]
+    ) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of each level outputs.
+
+            - cls_scores (list[Tensor]): Box scores for each scale level, \
+            each is a 4D-tensor, the channel number is \
+            num_points * num_classes.
+            - bbox_preds (list[Tensor]): Box energies / deltas for each \
+            scale level, each is a 4D-tensor, the channel number is \
+            num_points * 4.
+            - centernesses (list[Tensor]): centerness for each scale level, \
+            each is a 4D-tensor, the channel number is num_points * 1.
+        """
+        return multi_apply(self.forward_single, x, self.scales, self.strides)
+
+    def forward_single(self, x: Tensor, scale: Scale,
+                       stride: int) -> Tuple[Tensor, Tensor, Tensor]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            stride (int): The corresponding stride for feature maps, only
+                used to normalize the bbox prediction when self.norm_on_bbox
+                is True.
+
+        Returns:
+            tuple: scores for each class, bbox predictions and centerness
+            predictions of input feature maps.
+        """
+        cls_score, bbox_pred, cls_feat, reg_feat = super().forward_single(x)
+        if self.centerness_on_reg:
+            centerness = self.conv_centerness(reg_feat)
+        else:
+            centerness = self.conv_centerness(cls_feat)
+        # scale the bbox_pred of different level
+        # float to avoid overflow when enabling FP16
+        bbox_pred = scale(bbox_pred).float()
+        if self.norm_on_bbox:
+            # bbox_pred needed for gradient computation has been modified
+            # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace
+            # F.relu(bbox_pred) with bbox_pred.clamp(min=0)
+            bbox_pred = bbox_pred.clamp(min=0)
+            if not self.training:
+                bbox_pred *= stride
+        else:
+            bbox_pred = bbox_pred.exp()
+        return cls_score, bbox_pred, centerness
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        centernesses: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * 4.
+            centernesses (list[Tensor]): centerness for each scale level, each
+                is a 4D-tensor, the channel number is num_points * 1.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(centernesses)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        all_level_points = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device)
+        labels, bbox_targets = self.get_targets(all_level_points,
+                                                batch_gt_instances)
+
+        num_imgs = cls_scores[0].size(0)
+        # flatten cls_scores, bbox_preds and centerness
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_centerness = [
+            centerness.permute(0, 2, 3, 1).reshape(-1)
+            for centerness in centernesses
+        ]
+        flatten_cls_scores = torch.cat(flatten_cls_scores)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
+        flatten_centerness = torch.cat(flatten_centerness)
+        flatten_labels = torch.cat(labels)
+        flatten_bbox_targets = torch.cat(bbox_targets)
+        # repeat points to align with bbox_preds
+        flatten_points = torch.cat(
+            [points.repeat(num_imgs, 1) for points in all_level_points])
+
+        losses = dict()
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((flatten_labels >= 0)
+                    & (flatten_labels < bg_class_ind)).nonzero().reshape(-1)
+        num_pos = torch.tensor(
+            len(pos_inds), dtype=torch.float, device=bbox_preds[0].device)
+        num_pos = max(reduce_mean(num_pos), 1.0)
+        loss_cls = self.loss_cls(
+            flatten_cls_scores, flatten_labels, avg_factor=num_pos)
+
+        if getattr(self.loss_cls, 'custom_accuracy', False):
+            acc = self.loss_cls.get_accuracy(flatten_cls_scores,
+                                             flatten_labels)
+            losses.update(acc)
+
+        pos_bbox_preds = flatten_bbox_preds[pos_inds]
+        pos_centerness = flatten_centerness[pos_inds]
+        pos_bbox_targets = flatten_bbox_targets[pos_inds]
+        pos_centerness_targets = self.centerness_target(pos_bbox_targets)
+        # centerness weighted iou loss
+        centerness_denorm = max(
+            reduce_mean(pos_centerness_targets.sum().detach()), 1e-6)
+
+        if len(pos_inds) > 0:
+            pos_points = flatten_points[pos_inds]
+            pos_decoded_bbox_preds = self.bbox_coder.decode(
+                pos_points, pos_bbox_preds)
+            pos_decoded_target_preds = self.bbox_coder.decode(
+                pos_points, pos_bbox_targets)
+            loss_bbox = self.loss_bbox(
+                pos_decoded_bbox_preds,
+                pos_decoded_target_preds,
+                weight=pos_centerness_targets,
+                avg_factor=centerness_denorm)
+            loss_centerness = self.loss_centerness(
+                pos_centerness, pos_centerness_targets, avg_factor=num_pos)
+        else:
+            loss_bbox = pos_bbox_preds.sum()
+            loss_centerness = pos_centerness.sum()
+
+        losses['loss_cls'] = loss_cls
+        losses['loss_bbox'] = loss_bbox
+        losses['loss_centerness'] = loss_centerness
+
+        return losses
+
+    def get_targets(
+            self, points: List[Tensor], batch_gt_instances: InstanceList
+    ) -> Tuple[List[Tensor], List[Tensor]]:
+        """Compute regression, classification and centerness targets for points
+        in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            tuple: Targets of each level.
+
+            - concat_lvl_labels (list[Tensor]): Labels of each level.
+            - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \
+            level.
+        """
+        assert len(points) == len(self.regress_ranges)
+        num_levels = len(points)
+        # expand regress ranges to align with points
+        expanded_regress_ranges = [
+            points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
+                points[i]) for i in range(num_levels)
+        ]
+        # concat all levels points and regress ranges
+        concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
+        concat_points = torch.cat(points, dim=0)
+
+        # the number of points per img, per lvl
+        num_points = [center.size(0) for center in points]
+
+        # get labels and bbox_targets of each image
+        labels_list, bbox_targets_list = multi_apply(
+            self._get_targets_single,
+            batch_gt_instances,
+            points=concat_points,
+            regress_ranges=concat_regress_ranges,
+            num_points_per_lvl=num_points)
+
+        # split to per img, per level
+        labels_list = [labels.split(num_points, 0) for labels in labels_list]
+        bbox_targets_list = [
+            bbox_targets.split(num_points, 0)
+            for bbox_targets in bbox_targets_list
+        ]
+
+        # concat per level image
+        concat_lvl_labels = []
+        concat_lvl_bbox_targets = []
+        for i in range(num_levels):
+            concat_lvl_labels.append(
+                torch.cat([labels[i] for labels in labels_list]))
+            bbox_targets = torch.cat(
+                [bbox_targets[i] for bbox_targets in bbox_targets_list])
+            if self.norm_on_bbox:
+                bbox_targets = bbox_targets / self.strides[i]
+            concat_lvl_bbox_targets.append(bbox_targets)
+        return concat_lvl_labels, concat_lvl_bbox_targets
+
+    def _get_targets_single(
+            self, gt_instances: InstanceData, points: Tensor,
+            regress_ranges: Tensor,
+            num_points_per_lvl: List[int]) -> Tuple[Tensor, Tensor]:
+        """Compute regression and classification targets for a single image."""
+        num_points = points.size(0)
+        num_gts = len(gt_instances)
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+
+        if num_gts == 0:
+            return gt_labels.new_full((num_points,), self.num_classes), \
+                   gt_bboxes.new_zeros((num_points, 4))
+
+        areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
+            gt_bboxes[:, 3] - gt_bboxes[:, 1])
+        # TODO: figure out why these two are different
+        # areas = areas[None].expand(num_points, num_gts)
+        areas = areas[None].repeat(num_points, 1)
+        regress_ranges = regress_ranges[:, None, :].expand(
+            num_points, num_gts, 2)
+        gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
+        xs, ys = points[:, 0], points[:, 1]
+        xs = xs[:, None].expand(num_points, num_gts)
+        ys = ys[:, None].expand(num_points, num_gts)
+
+        left = xs - gt_bboxes[..., 0]
+        right = gt_bboxes[..., 2] - xs
+        top = ys - gt_bboxes[..., 1]
+        bottom = gt_bboxes[..., 3] - ys
+        bbox_targets = torch.stack((left, top, right, bottom), -1)
+
+        if self.center_sampling:
+            # condition1: inside a `center bbox`
+            radius = self.center_sample_radius
+            center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2
+            center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2
+            center_gts = torch.zeros_like(gt_bboxes)
+            stride = center_xs.new_zeros(center_xs.shape)
+
+            # project the points on current lvl back to the `original` sizes
+            lvl_begin = 0
+            for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl):
+                lvl_end = lvl_begin + num_points_lvl
+                stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius
+                lvl_begin = lvl_end
+
+            x_mins = center_xs - stride
+            y_mins = center_ys - stride
+            x_maxs = center_xs + stride
+            y_maxs = center_ys + stride
+            center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0],
+                                             x_mins, gt_bboxes[..., 0])
+            center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1],
+                                             y_mins, gt_bboxes[..., 1])
+            center_gts[..., 2] = torch.where(x_maxs > gt_bboxes[..., 2],
+                                             gt_bboxes[..., 2], x_maxs)
+            center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3],
+                                             gt_bboxes[..., 3], y_maxs)
+
+            cb_dist_left = xs - center_gts[..., 0]
+            cb_dist_right = center_gts[..., 2] - xs
+            cb_dist_top = ys - center_gts[..., 1]
+            cb_dist_bottom = center_gts[..., 3] - ys
+            center_bbox = torch.stack(
+                (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1)
+            inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
+        else:
+            # condition1: inside a gt bbox
+            inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0
+
+        # condition2: limit the regression range for each location
+        max_regress_distance = bbox_targets.max(-1)[0]
+        inside_regress_range = (
+            (max_regress_distance >= regress_ranges[..., 0])
+            & (max_regress_distance <= regress_ranges[..., 1]))
+
+        # if there are still more than one objects for a location,
+        # we choose the one with minimal area
+        areas[inside_gt_bbox_mask == 0] = INF
+        areas[inside_regress_range == 0] = INF
+        min_area, min_area_inds = areas.min(dim=1)
+
+        labels = gt_labels[min_area_inds]
+        labels[min_area == INF] = self.num_classes  # set as BG
+        bbox_targets = bbox_targets[range(num_points), min_area_inds]
+
+        return labels, bbox_targets
+
+    def centerness_target(self, pos_bbox_targets: Tensor) -> Tensor:
+        """Compute centerness targets.
+
+        Args:
+            pos_bbox_targets (Tensor): BBox targets of positive bboxes in shape
+                (num_pos, 4)
+
+        Returns:
+            Tensor: Centerness target.
+        """
+        # only calculate pos centerness targets, otherwise there may be nan
+        left_right = pos_bbox_targets[:, [0, 2]]
+        top_bottom = pos_bbox_targets[:, [1, 3]]
+        if len(left_right) == 0:
+            centerness_targets = left_right[..., 0]
+        else:
+            centerness_targets = (
+                left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
+                    top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
+        return torch.sqrt(centerness_targets)
diff --git a/mmde/mmdet/models/dense_heads/fovea_head.py b/mmde/mmdet/models/dense_heads/fovea_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..89353deac7f0189c1e464288521ee8e4238f0107
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/fovea_head.py
@@ -0,0 +1,509 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import DeformConv2d
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig
+from ..utils import filter_scores_and_topk, multi_apply
+from .anchor_free_head import AnchorFreeHead
+
+INF = 1e8
+
+
+class FeatureAlign(BaseModule):
+    """Feature Align Module.
+
+    Feature Align Module is implemented based on DCN v1.
+    It uses anchor shape prediction rather than feature map to
+    predict offsets of deform conv layer.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        out_channels (int): Number of channels in the output feature map.
+        kernel_size (int): Size of the convolution kernel.
+            ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``.
+        deform_groups: (int): Group number of DCN in
+            FeatureAdaption module.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        deform_groups: int = 4,
+        init_cfg: OptMultiConfig = dict(
+            type='Normal',
+            layer='Conv2d',
+            std=0.1,
+            override=dict(type='Normal', name='conv_adaption', std=0.01))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        offset_channels = kernel_size * kernel_size * 2
+        self.conv_offset = nn.Conv2d(
+            4, deform_groups * offset_channels, 1, bias=False)
+        self.conv_adaption = DeformConv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            padding=(kernel_size - 1) // 2,
+            deform_groups=deform_groups)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x: Tensor, shape: Tensor) -> Tensor:
+        """Forward function of feature align module.
+
+        Args:
+            x (Tensor): Features from the upstream network.
+            shape (Tensor): Exponential of bbox predictions.
+
+        Returns:
+            x (Tensor): The aligned features.
+        """
+        offset = self.conv_offset(shape)
+        x = self.relu(self.conv_adaption(x, offset))
+        return x
+
+
+@MODELS.register_module()
+class FoveaHead(AnchorFreeHead):
+    """Detection Head of `FoveaBox: Beyond Anchor-based Object Detector.
+
+    <https://arxiv.org/abs/1904.03797>`_.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        base_edge_list (list[int]): List of edges.
+        scale_ranges (list[tuple]): Range of scales.
+        sigma (float): Super parameter of ``FoveaHead``.
+        with_deform (bool):  Whether use deform conv.
+        deform_groups (int): Deformable conv group size.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 base_edge_list: List[int] = (16, 32, 64, 128, 256),
+                 scale_ranges: List[tuple] = ((8, 32), (16, 64), (32, 128),
+                                              (64, 256), (128, 512)),
+                 sigma: float = 0.4,
+                 with_deform: bool = False,
+                 deform_groups: int = 4,
+                 init_cfg: OptMultiConfig = dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='conv_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs) -> None:
+        self.base_edge_list = base_edge_list
+        self.scale_ranges = scale_ranges
+        self.sigma = sigma
+        self.with_deform = with_deform
+        self.deform_groups = deform_groups
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        # box branch
+        super()._init_reg_convs()
+        self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+
+        # cls branch
+        if not self.with_deform:
+            super()._init_cls_convs()
+            self.conv_cls = nn.Conv2d(
+                self.feat_channels, self.cls_out_channels, 3, padding=1)
+        else:
+            self.cls_convs = nn.ModuleList()
+            self.cls_convs.append(
+                ConvModule(
+                    self.feat_channels, (self.feat_channels * 4),
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.norm_cfg is None))
+            self.cls_convs.append(
+                ConvModule((self.feat_channels * 4), (self.feat_channels * 4),
+                           1,
+                           stride=1,
+                           padding=0,
+                           conv_cfg=self.conv_cfg,
+                           norm_cfg=self.norm_cfg,
+                           bias=self.norm_cfg is None))
+            self.feature_adaption = FeatureAlign(
+                self.feat_channels,
+                self.feat_channels,
+                kernel_size=3,
+                deform_groups=self.deform_groups)
+            self.conv_cls = nn.Conv2d(
+                int(self.feat_channels * 4),
+                self.cls_out_channels,
+                3,
+                padding=1)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+
+        Returns:
+            tuple: scores for each class and bbox predictions of input
+            feature maps.
+        """
+        cls_feat = x
+        reg_feat = x
+        for reg_layer in self.reg_convs:
+            reg_feat = reg_layer(reg_feat)
+        bbox_pred = self.conv_reg(reg_feat)
+        if self.with_deform:
+            cls_feat = self.feature_adaption(cls_feat, bbox_pred.exp())
+        for cls_layer in self.cls_convs:
+            cls_feat = cls_layer(cls_feat)
+        cls_score = self.conv_cls(cls_feat)
+        return cls_score, bbox_pred
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_priors * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_priors * 4.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=bbox_preds[0].dtype,
+            device=bbox_preds[0].device)
+        num_imgs = cls_scores[0].size(0)
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_cls_scores = torch.cat(flatten_cls_scores)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
+        flatten_labels, flatten_bbox_targets = self.get_targets(
+            batch_gt_instances, featmap_sizes, priors)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        pos_inds = ((flatten_labels >= 0)
+                    & (flatten_labels < self.num_classes)).nonzero().view(-1)
+        num_pos = len(pos_inds)
+
+        loss_cls = self.loss_cls(
+            flatten_cls_scores, flatten_labels, avg_factor=num_pos + num_imgs)
+        if num_pos > 0:
+            pos_bbox_preds = flatten_bbox_preds[pos_inds]
+            pos_bbox_targets = flatten_bbox_targets[pos_inds]
+            pos_weights = pos_bbox_targets.new_ones(pos_bbox_targets.size())
+            loss_bbox = self.loss_bbox(
+                pos_bbox_preds,
+                pos_bbox_targets,
+                pos_weights,
+                avg_factor=num_pos)
+        else:
+            loss_bbox = torch.tensor(
+                0,
+                dtype=flatten_bbox_preds.dtype,
+                device=flatten_bbox_preds.device)
+        return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
+
+    def get_targets(
+            self, batch_gt_instances: InstanceList, featmap_sizes: List[tuple],
+            priors_list: List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
+        """Compute regression and classification for priors in multiple images.
+
+        Args:
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            featmap_sizes (list[tuple]): Size tuple of feature maps.
+            priors_list (list[Tensor]): Priors list of each fpn level, each has
+                shape (num_priors, 2).
+
+        Returns:
+            tuple: Targets of each level.
+
+            - flatten_labels (list[Tensor]): Labels of each level.
+            - flatten_bbox_targets (list[Tensor]): BBox targets of each
+              level.
+        """
+        label_list, bbox_target_list = multi_apply(
+            self._get_targets_single,
+            batch_gt_instances,
+            featmap_size_list=featmap_sizes,
+            priors_list=priors_list)
+        flatten_labels = [
+            torch.cat([
+                labels_level_img.flatten() for labels_level_img in labels_level
+            ]) for labels_level in zip(*label_list)
+        ]
+        flatten_bbox_targets = [
+            torch.cat([
+                bbox_targets_level_img.reshape(-1, 4)
+                for bbox_targets_level_img in bbox_targets_level
+            ]) for bbox_targets_level in zip(*bbox_target_list)
+        ]
+        flatten_labels = torch.cat(flatten_labels)
+        flatten_bbox_targets = torch.cat(flatten_bbox_targets)
+        return flatten_labels, flatten_bbox_targets
+
+    def _get_targets_single(self,
+                            gt_instances: InstanceData,
+                            featmap_size_list: List[tuple] = None,
+                            priors_list: List[Tensor] = None) -> tuple:
+        """Compute regression and classification targets for a single image.
+
+        Args:
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            featmap_size_list (list[tuple]): Size tuple of feature maps.
+            priors_list (list[Tensor]): Priors of each fpn level, each has
+                shape (num_priors, 2).
+
+        Returns:
+            tuple:
+
+            - label_list (list[Tensor]): Labels of all anchors in the image.
+            - box_target_list (list[Tensor]): BBox targets of all anchors in
+              the image.
+        """
+        gt_bboxes_raw = gt_instances.bboxes
+        gt_labels_raw = gt_instances.labels
+        gt_areas = torch.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) *
+                              (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1]))
+        label_list = []
+        bbox_target_list = []
+        # for each pyramid, find the cls and box target
+        for base_len, (lower_bound, upper_bound), stride, featmap_size, \
+            priors in zip(self.base_edge_list, self.scale_ranges,
+                          self.strides, featmap_size_list, priors_list):
+            # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+            priors = priors.view(*featmap_size, 2)
+            x, y = priors[..., 0], priors[..., 1]
+            labels = gt_labels_raw.new_full(featmap_size, self.num_classes)
+            bbox_targets = gt_bboxes_raw.new_ones(featmap_size[0],
+                                                  featmap_size[1], 4)
+            # scale assignment
+            hit_indices = ((gt_areas >= lower_bound) &
+                           (gt_areas <= upper_bound)).nonzero().flatten()
+            if len(hit_indices) == 0:
+                label_list.append(labels)
+                bbox_target_list.append(torch.log(bbox_targets))
+                continue
+            _, hit_index_order = torch.sort(-gt_areas[hit_indices])
+            hit_indices = hit_indices[hit_index_order]
+            gt_bboxes = gt_bboxes_raw[hit_indices, :] / stride
+            gt_labels = gt_labels_raw[hit_indices]
+            half_w = 0.5 * (gt_bboxes[:, 2] - gt_bboxes[:, 0])
+            half_h = 0.5 * (gt_bboxes[:, 3] - gt_bboxes[:, 1])
+            # valid fovea area: left, right, top, down
+            pos_left = torch.ceil(
+                gt_bboxes[:, 0] + (1 - self.sigma) * half_w - 0.5).long(). \
+                clamp(0, featmap_size[1] - 1)
+            pos_right = torch.floor(
+                gt_bboxes[:, 0] + (1 + self.sigma) * half_w - 0.5).long(). \
+                clamp(0, featmap_size[1] - 1)
+            pos_top = torch.ceil(
+                gt_bboxes[:, 1] + (1 - self.sigma) * half_h - 0.5).long(). \
+                clamp(0, featmap_size[0] - 1)
+            pos_down = torch.floor(
+                gt_bboxes[:, 1] + (1 + self.sigma) * half_h - 0.5).long(). \
+                clamp(0, featmap_size[0] - 1)
+            for px1, py1, px2, py2, label, (gt_x1, gt_y1, gt_x2, gt_y2) in \
+                    zip(pos_left, pos_top, pos_right, pos_down, gt_labels,
+                        gt_bboxes_raw[hit_indices, :]):
+                labels[py1:py2 + 1, px1:px2 + 1] = label
+                bbox_targets[py1:py2 + 1, px1:px2 + 1, 0] = \
+                    (x[py1:py2 + 1, px1:px2 + 1] - gt_x1) / base_len
+                bbox_targets[py1:py2 + 1, px1:px2 + 1, 1] = \
+                    (y[py1:py2 + 1, px1:px2 + 1] - gt_y1) / base_len
+                bbox_targets[py1:py2 + 1, px1:px2 + 1, 2] = \
+                    (gt_x2 - x[py1:py2 + 1, px1:px2 + 1]) / base_len
+                bbox_targets[py1:py2 + 1, px1:px2 + 1, 3] = \
+                    (gt_y2 - y[py1:py2 + 1, px1:px2 + 1]) / base_len
+            bbox_targets = bbox_targets.clamp(min=1. / 16, max=16.)
+            label_list.append(labels)
+            bbox_target_list.append(torch.log(bbox_targets))
+        return label_list, bbox_target_list
+
+    # Same as base_dense_head/_predict_by_feat_single except self._bbox_decode
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: Optional[ConfigDict] = None,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid, has shape
+                (num_priors, 2).
+            img_meta (dict): Image meta info.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_score_list) == len(bbox_pred_list)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_labels = []
+        for level_idx, (cls_score, bbox_pred, stride, base_len, priors) in \
+                enumerate(zip(cls_score_list, bbox_pred_list, self.strides,
+                              self.base_edge_list, mlvl_priors)):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+
+            scores = cls_score.permute(1, 2, 0).reshape(
+                -1, self.cls_out_channels).sigmoid()
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, _, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+
+            bboxes = self._bbox_decode(priors, bbox_pred, base_len, img_shape)
+
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        results = InstanceData()
+        results.bboxes = torch.cat(mlvl_bboxes)
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+    def _bbox_decode(self, priors: Tensor, bbox_pred: Tensor, base_len: int,
+                     max_shape: int) -> Tensor:
+        """Function to decode bbox.
+
+        Args:
+            priors (Tensor): Center proiors of an image, has shape
+                (num_instances, 2).
+            bbox_preds (Tensor): Box energies / deltas for all instances,
+                has shape (batch_size, num_instances, 4).
+            base_len (int): The base length.
+            max_shape (int): The max shape of bbox.
+
+        Returns:
+            Tensor: Decoded bboxes in (tl_x, tl_y, br_x, br_y) format. Has
+            shape (batch_size, num_instances, 4).
+        """
+        bbox_pred = bbox_pred.exp()
+
+        y = priors[:, 1]
+        x = priors[:, 0]
+        x1 = (x - base_len * bbox_pred[:, 0]). \
+            clamp(min=0, max=max_shape[1] - 1)
+        y1 = (y - base_len * bbox_pred[:, 1]). \
+            clamp(min=0, max=max_shape[0] - 1)
+        x2 = (x + base_len * bbox_pred[:, 2]). \
+            clamp(min=0, max=max_shape[1] - 1)
+        y2 = (y + base_len * bbox_pred[:, 3]). \
+            clamp(min=0, max=max_shape[0] - 1)
+        decoded_bboxes = torch.stack([x1, y1, x2, y2], -1)
+        return decoded_bboxes
diff --git a/mmde/mmdet/models/dense_heads/free_anchor_retina_head.py b/mmde/mmdet/models/dense_heads/free_anchor_retina_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..df6fb9202c32735121bf7738e332fbfc5ac7e6bd
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/free_anchor_retina_head.py
@@ -0,0 +1,312 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import InstanceList, OptConfigType, OptInstanceList
+from ..utils import multi_apply
+from .retina_head import RetinaHead
+
+EPS = 1e-12
+
+
+@MODELS.register_module()
+class FreeAnchorRetinaHead(RetinaHead):
+    """FreeAnchor RetinaHead used in https://arxiv.org/abs/1909.02466.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Defaults to 4.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): dictionary to
+            construct and config conv layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): dictionary to
+            construct and config norm layer. Defaults to
+            norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
+        pre_anchor_topk (int): Number of boxes that be token in each bag.
+            Defaults to 50
+        bbox_thr (float): The threshold of the saturated linear function.
+            It is usually the same with the IoU threshold used in NMS.
+            Defaults to 0.6.
+        gamma (float): Gamma parameter in focal loss. Defaults to 2.0.
+        alpha (float): Alpha parameter in focal loss. Defaults to 0.5.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 stacked_convs: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None,
+                 pre_anchor_topk: int = 50,
+                 bbox_thr: float = 0.6,
+                 gamma: float = 2.0,
+                 alpha: float = 0.5,
+                 **kwargs) -> None:
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            stacked_convs=stacked_convs,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            **kwargs)
+
+        self.pre_anchor_topk = pre_anchor_topk
+        self.bbox_thr = bbox_thr
+        self.gamma = gamma
+        self.alpha = alpha
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, _ = self.get_anchors(
+            featmap_sizes=featmap_sizes,
+            batch_img_metas=batch_img_metas,
+            device=device)
+        concat_anchor_list = [torch.cat(anchor) for anchor in anchor_list]
+
+        # concatenate each level
+        cls_scores = [
+            cls.permute(0, 2, 3,
+                        1).reshape(cls.size(0), -1, self.cls_out_channels)
+            for cls in cls_scores
+        ]
+        bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(bbox_pred.size(0), -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        cls_scores = torch.cat(cls_scores, dim=1)
+        cls_probs = torch.sigmoid(cls_scores)
+        bbox_preds = torch.cat(bbox_preds, dim=1)
+
+        box_probs, positive_losses, num_pos_list = multi_apply(
+            self.positive_loss_single, cls_probs, bbox_preds,
+            concat_anchor_list, batch_gt_instances)
+
+        num_pos = sum(num_pos_list)
+        positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos)
+
+        # box_prob: P{a_{j} \in A_{+}}
+        box_probs = torch.stack(box_probs, dim=0)
+
+        # negative_loss:
+        # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B||
+        negative_loss = self.negative_bag_loss(cls_probs, box_probs).sum() / \
+            max(1, num_pos * self.pre_anchor_topk)
+
+        # avoid the absence of gradients in regression subnet
+        # when no ground-truth in a batch
+        if num_pos == 0:
+            positive_loss = bbox_preds.sum() * 0
+
+        losses = {
+            'positive_bag_loss': positive_loss,
+            'negative_bag_loss': negative_loss
+        }
+        return losses
+
+    def positive_loss_single(self, cls_prob: Tensor, bbox_pred: Tensor,
+                             flat_anchors: Tensor,
+                             gt_instances: InstanceData) -> tuple:
+        """Compute positive loss.
+
+        Args:
+            cls_prob (Tensor): Classification probability of shape
+                (num_anchors, num_classes).
+            bbox_pred (Tensor): Box probability of shape (num_anchors, 4).
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors, 4)
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            tuple:
+
+                - box_prob (Tensor): Box probability of shape (num_anchors, 4).
+                - positive_loss (Tensor): Positive loss of shape (num_pos, ).
+                - num_pos (int): positive samples indexes.
+        """
+
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        with torch.no_grad():
+            if len(gt_bboxes) == 0:
+                image_box_prob = torch.zeros(
+                    flat_anchors.size(0),
+                    self.cls_out_channels).type_as(bbox_pred)
+            else:
+                # box_localization: a_{j}^{loc}, shape: [j, 4]
+                pred_boxes = self.bbox_coder.decode(flat_anchors, bbox_pred)
+
+                # object_box_iou: IoU_{ij}^{loc}, shape: [i, j]
+                object_box_iou = bbox_overlaps(gt_bboxes, pred_boxes)
+
+                # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j]
+                t1 = self.bbox_thr
+                t2 = object_box_iou.max(
+                    dim=1, keepdim=True).values.clamp(min=t1 + 1e-12)
+                object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp(
+                    min=0, max=1)
+
+                # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j]
+                num_obj = gt_labels.size(0)
+                indices = torch.stack(
+                    [torch.arange(num_obj).type_as(gt_labels), gt_labels],
+                    dim=0)
+                object_cls_box_prob = torch.sparse_coo_tensor(
+                    indices, object_box_prob)
+
+                # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j]
+                """
+                from "start" to "end" implement:
+                image_box_iou = torch.sparse.max(object_cls_box_prob,
+                                                 dim=0).t()
+
+                """
+                # start
+                box_cls_prob = torch.sparse.sum(
+                    object_cls_box_prob, dim=0).to_dense()
+
+                indices = torch.nonzero(box_cls_prob, as_tuple=False).t_()
+                if indices.numel() == 0:
+                    image_box_prob = torch.zeros(
+                        flat_anchors.size(0),
+                        self.cls_out_channels).type_as(object_box_prob)
+                else:
+                    nonzero_box_prob = torch.where(
+                        (gt_labels.unsqueeze(dim=-1) == indices[0]),
+                        object_box_prob[:, indices[1]],
+                        torch.tensor(
+                            [0]).type_as(object_box_prob)).max(dim=0).values
+
+                    # upmap to shape [j, c]
+                    image_box_prob = torch.sparse_coo_tensor(
+                        indices.flip([0]),
+                        nonzero_box_prob,
+                        size=(flat_anchors.size(0),
+                              self.cls_out_channels)).to_dense()
+                # end
+            box_prob = image_box_prob
+
+        # construct bags for objects
+        match_quality_matrix = bbox_overlaps(gt_bboxes, flat_anchors)
+        _, matched = torch.topk(
+            match_quality_matrix, self.pre_anchor_topk, dim=1, sorted=False)
+        del match_quality_matrix
+
+        # matched_cls_prob: P_{ij}^{cls}
+        matched_cls_prob = torch.gather(
+            cls_prob[matched], 2,
+            gt_labels.view(-1, 1, 1).repeat(1, self.pre_anchor_topk,
+                                            1)).squeeze(2)
+
+        # matched_box_prob: P_{ij}^{loc}
+        matched_anchors = flat_anchors[matched]
+        matched_object_targets = self.bbox_coder.encode(
+            matched_anchors,
+            gt_bboxes.unsqueeze(dim=1).expand_as(matched_anchors))
+        loss_bbox = self.loss_bbox(
+            bbox_pred[matched],
+            matched_object_targets,
+            reduction_override='none').sum(-1)
+        matched_box_prob = torch.exp(-loss_bbox)
+
+        # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )}
+        num_pos = len(gt_bboxes)
+        positive_loss = self.positive_bag_loss(matched_cls_prob,
+                                               matched_box_prob)
+
+        return box_prob, positive_loss, num_pos
+
+    def positive_bag_loss(self, matched_cls_prob: Tensor,
+                          matched_box_prob: Tensor) -> Tensor:
+        """Compute positive bag loss.
+
+        :math:`-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )`.
+
+        :math:`P_{ij}^{cls}`: matched_cls_prob, classification probability of matched samples.
+
+        :math:`P_{ij}^{loc}`: matched_box_prob, box probability of matched samples.
+
+        Args:
+            matched_cls_prob (Tensor): Classification probability of matched
+                samples in shape (num_gt, pre_anchor_topk).
+            matched_box_prob (Tensor): BBox probability of matched samples,
+                in shape (num_gt, pre_anchor_topk).
+
+        Returns:
+            Tensor: Positive bag loss in shape (num_gt,).
+        """  # noqa: E501, W605
+        # bag_prob = Mean-max(matched_prob)
+        matched_prob = matched_cls_prob * matched_box_prob
+        weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None)
+        weight /= weight.sum(dim=1).unsqueeze(dim=-1)
+        bag_prob = (weight * matched_prob).sum(dim=1)
+        # positive_bag_loss = -self.alpha * log(bag_prob)
+        return self.alpha * F.binary_cross_entropy(
+            bag_prob, torch.ones_like(bag_prob), reduction='none')
+
+    def negative_bag_loss(self, cls_prob: Tensor, box_prob: Tensor) -> Tensor:
+        """Compute negative bag loss.
+
+        :math:`FL((1 - P_{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}))`.
+
+        :math:`P_{a_{j} \in A_{+}}`: Box_probability of matched samples.
+
+        :math:`P_{j}^{bg}`: Classification probability of negative samples.
+
+        Args:
+            cls_prob (Tensor): Classification probability, in shape
+                (num_img, num_anchors, num_classes).
+            box_prob (Tensor): Box probability, in shape
+                (num_img, num_anchors, num_classes).
+
+        Returns:
+            Tensor: Negative bag loss in shape (num_img, num_anchors,
+            num_classes).
+        """  # noqa: E501, W605
+        prob = cls_prob * (1 - box_prob)
+        # There are some cases when neg_prob = 0.
+        # This will cause the neg_prob.log() to be inf without clamp.
+        prob = prob.clamp(min=EPS, max=1 - EPS)
+        negative_bag_loss = prob**self.gamma * F.binary_cross_entropy(
+            prob, torch.zeros_like(prob), reduction='none')
+        return (1 - self.alpha) * negative_bag_loss
diff --git a/mmde/mmdet/models/dense_heads/fsaf_head.py b/mmde/mmdet/models/dense_heads/fsaf_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a01c487406693253eb17b883cac9ed06cf95802
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/fsaf_head.py
@@ -0,0 +1,458 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig
+from ..losses.accuracy import accuracy
+from ..losses.utils import weight_reduce_loss
+from ..task_modules.prior_generators import anchor_inside_flags
+from ..utils import images_to_levels, multi_apply, unmap
+from .retina_head import RetinaHead
+
+
+@MODELS.register_module()
+class FSAFHead(RetinaHead):
+    """Anchor-free head used in `FSAF <https://arxiv.org/abs/1903.00621>`_.
+
+    The head contains two subnetworks. The first classifies anchor boxes and
+    the second regresses deltas for the anchors (num_anchors is 1 for anchor-
+    free methods)
+
+    Args:
+        *args: Same as its base class in :class:`RetinaHead`
+        score_threshold (float, optional): The score_threshold to calculate
+            positive recall. If given, prediction scores lower than this value
+            is counted as incorrect prediction. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+        **kwargs: Same as its base class in :class:`RetinaHead`
+
+    Example:
+        >>> import torch
+        >>> self = FSAFHead(11, 7)
+        >>> x = torch.rand(1, 7, 32, 32)
+        >>> cls_score, bbox_pred = self.forward_single(x)
+        >>> # Each anchor predicts a score for each class except background
+        >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors
+        >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors
+        >>> assert cls_per_anchor == self.num_classes
+        >>> assert box_per_anchor == 4
+    """
+
+    def __init__(self,
+                 *args,
+                 score_threshold: Optional[float] = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        # The positive bias in self.retina_reg conv is to prevent predicted \
+        #  bbox with 0 area
+        if init_cfg is None:
+            init_cfg = dict(
+                type='Normal',
+                layer='Conv2d',
+                std=0.01,
+                override=[
+                    dict(
+                        type='Normal',
+                        name='retina_cls',
+                        std=0.01,
+                        bias_prob=0.01),
+                    dict(
+                        type='Normal', name='retina_reg', std=0.01, bias=0.25)
+                ])
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+        self.score_threshold = score_threshold
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        """Forward feature map of a single scale level.
+
+        Args:
+            x (Tensor): Feature map of a single scale level.
+
+        Returns:
+            tuple[Tensor, Tensor]:
+
+            - cls_score (Tensor): Box scores for each scale level Has \
+            shape (N, num_points * num_classes, H, W).
+            - bbox_pred (Tensor): Box energies / deltas for each scale \
+            level with shape (N, num_points * 4, H, W).
+        """
+        cls_score, bbox_pred = super().forward_single(x)
+        # relu: TBLR encoder only accepts positive bbox_pred
+        return cls_score, self.relu(bbox_pred)
+
+    def _get_targets_single(self,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Most of the codes are the same with the base class :obj: `AnchorHead`,
+        except that it also collects and returns the matched gt index in the
+        image (from 0 to num_gt-1). If the anchor bbox is not matched to any
+        gt, the corresponding value in pos_gt_inds is -1.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors, 4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                    shape (num_anchors, ).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.  Defaults to True.
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg['allowed_border'])
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+        # Assign gt and sample anchors
+        anchors = flat_anchors[inside_flags.type(torch.bool), :]
+
+        pred_instances = InstanceData(priors=anchors)
+        assign_result = self.assigner.assign(pred_instances, gt_instances,
+                                             gt_instances_ignore)
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(
+            (num_valid_anchors, self.cls_out_channels), dtype=torch.float)
+        pos_gt_inds = anchors.new_full((num_valid_anchors, ),
+                                       -1,
+                                       dtype=torch.long)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        if len(pos_inds) > 0:
+            if not self.reg_decoded_bbox:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+            else:
+                # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+                # is applied directly on the decoded bounding boxes, both
+                # the predicted boxes and regression targets should be with
+                # absolute coordinate format.
+                pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+            # The assigned gt_index for each anchor. (0-based)
+            pos_gt_inds[pos_inds] = sampling_result.pos_assigned_gt_inds
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # shadowed_labels is a tensor composed of tuples
+        #  (anchor_inds, class_label) that indicate those anchors lying in the
+        #  outer region of a gt or overlapped by another gt with a smaller
+        #  area.
+        #
+        # Therefore, only the shadowed labels are ignored for loss calculation.
+        # the key `shadowed_labels` is defined in :obj:`CenterRegionAssigner`
+        shadowed_labels = assign_result.get_extra_property('shadowed_labels')
+        if shadowed_labels is not None and shadowed_labels.numel():
+            if len(shadowed_labels.shape) == 2:
+                idx_, label_ = shadowed_labels[:, 0], shadowed_labels[:, 1]
+                assert (labels[idx_] != label_).all(), \
+                    'One label cannot be both positive and ignored'
+                label_weights[idx_, label_] = 0
+            else:
+                label_weights[shadowed_labels] = 0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags,
+                fill=self.num_classes)  # fill bg label
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+            pos_gt_inds = unmap(
+                pos_gt_inds, num_total_anchors, inside_flags, fill=-1)
+
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds, sampling_result, pos_gt_inds)
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_points * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_points * 4, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        for i in range(len(bbox_preds)):  # loop over fpn level
+            # avoid 0 area of the predicted bbox
+            bbox_preds[i] = bbox_preds[i].clamp(min=1e-4)
+        # TODO: It may directly use the base-class loss function.
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+        batch_size = len(batch_img_metas)
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            return_sampling_results=True)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         avg_factor, sampling_results_list,
+         pos_assigned_gt_inds_list) = cls_reg_targets
+
+        num_gts = np.array(list(map(len, batch_gt_instances)))
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors and flags to a single tensor
+        concat_anchor_list = []
+        for i in range(len(anchor_list)):
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_by_feat_single,
+            cls_scores,
+            bbox_preds,
+            all_anchor_list,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            avg_factor=avg_factor)
+
+        # `pos_assigned_gt_inds_list` (length: fpn_levels) stores the assigned
+        # gt index of each anchor bbox in each fpn level.
+        cum_num_gts = list(np.cumsum(num_gts))  # length of batch_size
+        for i, assign in enumerate(pos_assigned_gt_inds_list):
+            # loop over fpn levels
+            for j in range(1, batch_size):
+                # loop over batch size
+                # Convert gt indices in each img to those in the batch
+                assign[j][assign[j] >= 0] += int(cum_num_gts[j - 1])
+            pos_assigned_gt_inds_list[i] = assign.flatten()
+            labels_list[i] = labels_list[i].flatten()
+        num_gts = num_gts.sum()  # total number of gt in the batch
+        # The unique label index of each gt in the batch
+        label_sequence = torch.arange(num_gts, device=device)
+        # Collect the average loss of each gt in each level
+        with torch.no_grad():
+            loss_levels, = multi_apply(
+                self.collect_loss_level_single,
+                losses_cls,
+                losses_bbox,
+                pos_assigned_gt_inds_list,
+                labels_seq=label_sequence)
+            # Shape: (fpn_levels, num_gts). Loss of each gt at each fpn level
+            loss_levels = torch.stack(loss_levels, dim=0)
+            # Locate the best fpn level for loss back-propagation
+            if loss_levels.numel() == 0:  # zero gt
+                argmin = loss_levels.new_empty((num_gts, ), dtype=torch.long)
+            else:
+                _, argmin = loss_levels.min(dim=0)
+
+        # Reweight the loss of each (anchor, label) pair, so that only those
+        #  at the best gt level are back-propagated.
+        losses_cls, losses_bbox, pos_inds = multi_apply(
+            self.reweight_loss_single,
+            losses_cls,
+            losses_bbox,
+            pos_assigned_gt_inds_list,
+            labels_list,
+            list(range(len(losses_cls))),
+            min_levels=argmin)
+        num_pos = torch.cat(pos_inds, 0).sum().float()
+        pos_recall = self.calculate_pos_recall(cls_scores, labels_list,
+                                               pos_inds)
+
+        if num_pos == 0:  # No gt
+            num_total_neg = sum(
+                [results.num_neg for results in sampling_results_list])
+            avg_factor = num_pos + num_total_neg
+        else:
+            avg_factor = num_pos
+        for i in range(len(losses_cls)):
+            losses_cls[i] /= avg_factor
+            losses_bbox[i] /= avg_factor
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox=losses_bbox,
+            num_pos=num_pos / batch_size,
+            pos_recall=pos_recall)
+
+    def calculate_pos_recall(self, cls_scores: List[Tensor],
+                             labels_list: List[Tensor],
+                             pos_inds: List[Tensor]) -> Tensor:
+        """Calculate positive recall with score threshold.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores at all fpn levels.
+                Each tensor is in shape (N, num_classes * num_anchors, H, W)
+            labels_list (list[Tensor]): The label that each anchor is assigned
+                to. Shape (N * H * W * num_anchors, )
+            pos_inds (list[Tensor]): List of bool tensors indicating whether
+                the anchor is assigned to a positive label.
+                Shape (N * H * W * num_anchors, )
+
+        Returns:
+            Tensor: A single float number indicating the positive recall.
+        """
+        with torch.no_grad():
+            num_class = self.num_classes
+            scores = [
+                cls.permute(0, 2, 3, 1).reshape(-1, num_class)[pos]
+                for cls, pos in zip(cls_scores, pos_inds)
+            ]
+            labels = [
+                label.reshape(-1)[pos]
+                for label, pos in zip(labels_list, pos_inds)
+            ]
+            scores = torch.cat(scores, dim=0)
+            labels = torch.cat(labels, dim=0)
+            if self.use_sigmoid_cls:
+                scores = scores.sigmoid()
+            else:
+                scores = scores.softmax(dim=1)
+
+            return accuracy(scores, labels, thresh=self.score_threshold)
+
+    def collect_loss_level_single(self, cls_loss: Tensor, reg_loss: Tensor,
+                                  assigned_gt_inds: Tensor,
+                                  labels_seq: Tensor) -> Tensor:
+        """Get the average loss in each FPN level w.r.t. each gt label.
+
+        Args:
+            cls_loss (Tensor): Classification loss of each feature map pixel,
+              shape (num_anchor, num_class)
+            reg_loss (Tensor): Regression loss of each feature map pixel,
+              shape (num_anchor, 4)
+            assigned_gt_inds (Tensor): It indicates which gt the prior is
+              assigned to (0-based, -1: no assignment). shape (num_anchor),
+            labels_seq: The rank of labels. shape (num_gt)
+
+        Returns:
+            Tensor: shape (num_gt), average loss of each gt in this level
+        """
+        if len(reg_loss.shape) == 2:  # iou loss has shape (num_prior, 4)
+            reg_loss = reg_loss.sum(dim=-1)  # sum loss in tblr dims
+        if len(cls_loss.shape) == 2:
+            cls_loss = cls_loss.sum(dim=-1)  # sum loss in class dims
+        loss = cls_loss + reg_loss
+        assert loss.size(0) == assigned_gt_inds.size(0)
+        # Default loss value is 1e6 for a layer where no anchor is positive
+        #  to ensure it will not be chosen to back-propagate gradient
+        losses_ = loss.new_full(labels_seq.shape, 1e6)
+        for i, l in enumerate(labels_seq):
+            match = assigned_gt_inds == l
+            if match.any():
+                losses_[i] = loss[match].mean()
+        return losses_,
+
+    def reweight_loss_single(self, cls_loss: Tensor, reg_loss: Tensor,
+                             assigned_gt_inds: Tensor, labels: Tensor,
+                             level: int, min_levels: Tensor) -> tuple:
+        """Reweight loss values at each level.
+
+        Reassign loss values at each level by masking those where the
+        pre-calculated loss is too large. Then return the reduced losses.
+
+        Args:
+            cls_loss (Tensor): Element-wise classification loss.
+              Shape: (num_anchors, num_classes)
+            reg_loss (Tensor): Element-wise regression loss.
+              Shape: (num_anchors, 4)
+            assigned_gt_inds (Tensor): The gt indices that each anchor bbox
+              is assigned to. -1 denotes a negative anchor, otherwise it is the
+              gt index (0-based). Shape: (num_anchors, ),
+            labels (Tensor): Label assigned to anchors. Shape: (num_anchors, ).
+            level (int): The current level index in the pyramid
+              (0-4 for RetinaNet)
+            min_levels (Tensor): The best-matching level for each gt.
+              Shape: (num_gts, ),
+
+        Returns:
+            tuple:
+
+            - cls_loss: Reduced corrected classification loss. Scalar.
+            - reg_loss: Reduced corrected regression loss. Scalar.
+            - pos_flags (Tensor): Corrected bool tensor indicating the \
+            final positive anchors. Shape: (num_anchors, ).
+        """
+        loc_weight = torch.ones_like(reg_loss)
+        cls_weight = torch.ones_like(cls_loss)
+        pos_flags = assigned_gt_inds >= 0  # positive pixel flag
+        pos_indices = torch.nonzero(pos_flags, as_tuple=False).flatten()
+
+        if pos_flags.any():  # pos pixels exist
+            pos_assigned_gt_inds = assigned_gt_inds[pos_flags]
+            zeroing_indices = (min_levels[pos_assigned_gt_inds] != level)
+            neg_indices = pos_indices[zeroing_indices]
+
+            if neg_indices.numel():
+                pos_flags[neg_indices] = 0
+                loc_weight[neg_indices] = 0
+                # Only the weight corresponding to the label is
+                #  zeroed out if not selected
+                zeroing_labels = labels[neg_indices]
+                assert (zeroing_labels >= 0).all()
+                cls_weight[neg_indices, zeroing_labels] = 0
+
+        # Weighted loss for both cls and reg loss
+        cls_loss = weight_reduce_loss(cls_loss, cls_weight, reduction='sum')
+        reg_loss = weight_reduce_loss(reg_loss, loc_weight, reduction='sum')
+
+        return cls_loss, reg_loss, pos_flags
diff --git a/mmde/mmdet/models/dense_heads/ga_retina_head.py b/mmde/mmdet/models/dense_heads/ga_retina_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..569910b365126e90638256f0d10addfa230fd141
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/ga_retina_head.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import MaskedConv2d
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead
+
+
+@MODELS.register_module()
+class GARetinaHead(GuidedAnchorHead):
+    """Guided-Anchor-based RetinaNet head."""
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 stacked_convs: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        if init_cfg is None:
+            init_cfg = dict(
+                type='Normal',
+                layer='Conv2d',
+                std=0.01,
+                override=[
+                    dict(
+                        type='Normal',
+                        name='conv_loc',
+                        std=0.01,
+                        bias_prob=0.01),
+                    dict(
+                        type='Normal',
+                        name='retina_cls',
+                        std=0.01,
+                        bias_prob=0.01)
+                ])
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+
+        self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1)
+        num_anchors = self.square_anchor_generator.num_base_priors[0]
+        self.conv_shape = nn.Conv2d(self.feat_channels, num_anchors * 2, 1)
+        self.feature_adaption_cls = FeatureAdaption(
+            self.feat_channels,
+            self.feat_channels,
+            kernel_size=3,
+            deform_groups=self.deform_groups)
+        self.feature_adaption_reg = FeatureAdaption(
+            self.feat_channels,
+            self.feat_channels,
+            kernel_size=3,
+            deform_groups=self.deform_groups)
+        self.retina_cls = MaskedConv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.retina_reg = MaskedConv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor]:
+        """Forward feature map of a single scale level."""
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+
+        loc_pred = self.conv_loc(cls_feat)
+        shape_pred = self.conv_shape(reg_feat)
+
+        cls_feat = self.feature_adaption_cls(cls_feat, shape_pred)
+        reg_feat = self.feature_adaption_reg(reg_feat, shape_pred)
+
+        if not self.training:
+            mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
+        else:
+            mask = None
+        cls_score = self.retina_cls(cls_feat, mask)
+        bbox_pred = self.retina_reg(reg_feat, mask)
+        return cls_score, bbox_pred, shape_pred, loc_pred
diff --git a/mmde/mmdet/models/dense_heads/ga_rpn_head.py b/mmde/mmdet/models/dense_heads/ga_rpn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..9614463165533358b8465420a87dfa47e7de1177
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/ga_rpn_head.py
@@ -0,0 +1,222 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.ops import nms
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList
+from .guided_anchor_head import GuidedAnchorHead
+
+
+@MODELS.register_module()
+class GARPNHead(GuidedAnchorHead):
+    """Guided-Anchor-based RPN head."""
+
+    def __init__(self,
+                 in_channels: int,
+                 num_classes: int = 1,
+                 init_cfg: MultiConfig = dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='conv_loc',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs) -> None:
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.rpn_conv = nn.Conv2d(
+            self.in_channels, self.feat_channels, 3, padding=1)
+        super(GARPNHead, self)._init_layers()
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor]:
+        """Forward feature of a single scale level."""
+
+        x = self.rpn_conv(x)
+        x = F.relu(x, inplace=True)
+        (cls_score, bbox_pred, shape_pred,
+         loc_pred) = super().forward_single(x)
+        return cls_score, bbox_pred, shape_pred, loc_pred
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            shape_preds: List[Tensor],
+            loc_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            shape_preds (list[Tensor]): shape predictions for each scale
+                level with shape (N, 1, H, W).
+            loc_preds (list[Tensor]): location predictions for each scale
+                level with shape (N, num_anchors * 2, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        losses = super().loss_by_feat(
+            cls_scores,
+            bbox_preds,
+            shape_preds,
+            loc_preds,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        return dict(
+            loss_rpn_cls=losses['loss_cls'],
+            loss_rpn_bbox=losses['loss_bbox'],
+            loss_anchor_shape=losses['loss_shape'],
+            loss_anchor_loc=losses['loss_loc'])
+
+    def _predict_by_feat_single(self,
+                                cls_scores: List[Tensor],
+                                bbox_preds: List[Tensor],
+                                mlvl_anchors: List[Tensor],
+                                mlvl_masks: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigType,
+                                rescale: bool = False) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            mlvl_anchors (list[Tensor]): Each element in the list is
+                the anchors of a single level in feature pyramid. it has
+                shape (num_priors, 4).
+            mlvl_masks (list[Tensor]): Each element in the list is location
+                masks of a single level.
+            img_meta (dict): Image meta info.
+            cfg (:obj:`ConfigDict` or dict): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4), the last
+              dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        assert cfg.nms.get('type', 'nms') == 'nms', 'GARPNHead only support ' \
+            'naive nms.'
+
+        mlvl_proposals = []
+        for idx in range(len(cls_scores)):
+            rpn_cls_score = cls_scores[idx]
+            rpn_bbox_pred = bbox_preds[idx]
+            anchors = mlvl_anchors[idx]
+            mask = mlvl_masks[idx]
+            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
+            # if no location is kept, end.
+            if mask.sum() == 0:
+                continue
+            rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
+            if self.use_sigmoid_cls:
+                rpn_cls_score = rpn_cls_score.reshape(-1)
+                scores = rpn_cls_score.sigmoid()
+            else:
+                rpn_cls_score = rpn_cls_score.reshape(-1, 2)
+                # remind that we set FG labels to [0, num_class-1]
+                # since mmdet v2.0
+                # BG cat_id: num_class
+                scores = rpn_cls_score.softmax(dim=1)[:, :-1]
+            # filter scores, bbox_pred w.r.t. mask.
+            # anchors are filtered in get_anchors() beforehand.
+            scores = scores[mask]
+            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1,
+                                                                   4)[mask, :]
+            if scores.dim() == 0:
+                rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0)
+                anchors = anchors.unsqueeze(0)
+                scores = scores.unsqueeze(0)
+            # filter anchors, bbox_pred, scores w.r.t. scores
+            if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
+                _, topk_inds = scores.topk(cfg.nms_pre)
+                rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
+                anchors = anchors[topk_inds, :]
+                scores = scores[topk_inds]
+            # get proposals w.r.t. anchors and rpn_bbox_pred
+            proposals = self.bbox_coder.decode(
+                anchors, rpn_bbox_pred, max_shape=img_meta['img_shape'])
+            # filter out too small bboxes
+            if cfg.min_bbox_size >= 0:
+                w = proposals[:, 2] - proposals[:, 0]
+                h = proposals[:, 3] - proposals[:, 1]
+                valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
+                if not valid_mask.all():
+                    proposals = proposals[valid_mask]
+                    scores = scores[valid_mask]
+
+            # NMS in current level
+            proposals, _ = nms(proposals, scores, cfg.nms.iou_threshold)
+            proposals = proposals[:cfg.nms_post, :]
+            mlvl_proposals.append(proposals)
+        proposals = torch.cat(mlvl_proposals, 0)
+        if cfg.get('nms_across_levels', False):
+            # NMS across multi levels
+            proposals, _ = nms(proposals[:, :4], proposals[:, -1],
+                               cfg.nms.iou_threshold)
+            proposals = proposals[:cfg.max_per_img, :]
+        else:
+            scores = proposals[:, 4]
+            num = min(cfg.max_per_img, proposals.shape[0])
+            _, topk_inds = scores.topk(num)
+            proposals = proposals[topk_inds, :]
+
+        bboxes = proposals[:, :-1]
+        scores = proposals[:, -1]
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            bboxes /= bboxes.new_tensor(img_meta['scale_factor']).repeat(
+                (1, 2))
+
+        results = InstanceData()
+        results.bboxes = bboxes
+        results.scores = scores
+        results.labels = scores.new_zeros(scores.size(0), dtype=torch.long)
+        return results
diff --git a/mmde/mmdet/models/dense_heads/gfl_head.py b/mmde/mmdet/models/dense_heads/gfl_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..be43d9b4da39da602b3b87bd3c9739c67367615b
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/gfl_head.py
@@ -0,0 +1,667 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, Scale
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
+                         OptInstanceList, reduce_mean)
+from ..task_modules.prior_generators import anchor_inside_flags
+from ..task_modules.samplers import PseudoSampler
+from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply,
+                     unmap)
+from .anchor_head import AnchorHead
+
+
+class Integral(nn.Module):
+    """A fixed layer for calculating integral result from distribution.
+
+    This layer calculates the target location by :math: ``sum{P(y_i) * y_i}``,
+    P(y_i) denotes the softmax vector that represents the discrete distribution
+    y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max}
+
+    Args:
+        reg_max (int): The maximal value of the discrete set. Defaults to 16.
+            You may want to reset it according to your new dataset or related
+            settings.
+    """
+
+    def __init__(self, reg_max: int = 16) -> None:
+        super().__init__()
+        self.reg_max = reg_max
+        self.register_buffer('project',
+                             torch.linspace(0, self.reg_max, self.reg_max + 1))
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward feature from the regression head to get integral result of
+        bounding box location.
+
+        Args:
+            x (Tensor): Features of the regression head, shape (N, 4*(n+1)),
+                n is self.reg_max.
+
+        Returns:
+            x (Tensor): Integral result of box locations, i.e., distance
+                offsets from the box center in four directions, shape (N, 4).
+        """
+        x = F.softmax(x.reshape(-1, self.reg_max + 1), dim=1)
+        x = F.linear(x, self.project.type_as(x)).reshape(-1, 4)
+        return x
+
+
+@MODELS.register_module()
+class GFLHead(AnchorHead):
+    """Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection.
+
+    GFL head structure is similar with ATSS, however GFL uses
+    1) joint representation for classification and localization quality, and
+    2) flexible General distribution for bounding box locations,
+    which are supervised by
+    Quality Focal Loss (QFL) and Distribution Focal Loss (DFL), respectively
+
+    https://arxiv.org/abs/2006.04388
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Defaults to 4.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct
+            and config conv layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and
+            config norm layer. Default: dict(type='GN', num_groups=32,
+            requires_grad=True).
+        loss_qfl (:obj:`ConfigDict` or dict): Config of Quality Focal Loss
+            (QFL).
+        bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. Defaults
+             to 'DistancePointBBoxCoder'.
+        reg_max (int): Max value of integral set :math: ``{0, ..., reg_max}``
+            in QFL setting. Defaults to 16.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`]): Initialization config dict.
+    Example:
+        >>> self = GFLHead(11, 7)
+        >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
+        >>> cls_quality_score, bbox_pred = self.forward(feats)
+        >>> assert len(cls_quality_score) == len(self.scales)
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 stacked_convs: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(
+                     type='GN', num_groups=32, requires_grad=True),
+                 loss_dfl: ConfigType = dict(
+                     type='DistributionFocalLoss', loss_weight=0.25),
+                 bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'),
+                 reg_max: int = 16,
+                 init_cfg: MultiConfig = dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='gfl_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs) -> None:
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.reg_max = reg_max
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            bbox_coder=bbox_coder,
+            init_cfg=init_cfg,
+            **kwargs)
+
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            if self.train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler(context=self)
+
+        self.integral = Integral(self.reg_max)
+        self.loss_dfl = MODELS.build(loss_dfl)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU()
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        assert self.num_anchors == 1, 'anchor free version'
+        self.gfl_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+        self.gfl_reg = nn.Conv2d(
+            self.feat_channels, 4 * (self.reg_max + 1), 3, padding=1)
+        self.scales = nn.ModuleList(
+            [Scale(1.0) for _ in self.prior_generator.strides])
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of classification scores and bbox prediction
+
+            - cls_scores (list[Tensor]): Classification and quality (IoU)
+              joint scores for all scale levels, each is a 4D-tensor,
+              the channel number is num_classes.
+            - bbox_preds (list[Tensor]): Box distribution logits for all
+              scale levels, each is a 4D-tensor, the channel number is
+              4*(n+1), n is max value of integral set.
+        """
+        return multi_apply(self.forward_single, x, self.scales)
+
+    def forward_single(self, x: Tensor, scale: Scale) -> Sequence[Tensor]:
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+
+        Returns:
+            tuple:
+
+            - cls_score (Tensor): Cls and quality joint scores for a single
+              scale level the channel number is num_classes.
+            - bbox_pred (Tensor): Box distribution logits for a single scale
+              level, the channel number is 4*(n+1), n is max value of
+              integral set.
+        """
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.gfl_cls(cls_feat)
+        bbox_pred = scale(self.gfl_reg(reg_feat)).float()
+        return cls_score, bbox_pred
+
+    def anchor_center(self, anchors: Tensor) -> Tensor:
+        """Get anchor centers from anchors.
+
+        Args:
+            anchors (Tensor): Anchor list with shape (N, 4), ``xyxy`` format.
+
+        Returns:
+            Tensor: Anchor centers with shape (N, 2), ``xy`` format.
+        """
+        anchors_cx = (anchors[..., 2] + anchors[..., 0]) / 2
+        anchors_cy = (anchors[..., 3] + anchors[..., 1]) / 2
+        return torch.stack([anchors_cx, anchors_cy], dim=-1)
+
+    def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor,
+                            bbox_pred: Tensor, labels: Tensor,
+                            label_weights: Tensor, bbox_targets: Tensor,
+                            stride: Tuple[int], avg_factor: int) -> dict:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            cls_score (Tensor): Cls and quality joint scores for each scale
+                level has shape (N, num_classes, H, W).
+            bbox_pred (Tensor): Box distribution logits for each scale
+                level with shape (N, 4*(n+1), H, W), n is max value of integral
+                set.
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors)
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (N, num_total_anchors, 4).
+            stride (Tuple[int]): Stride in this scale level.
+            avg_factor (int): Average factor that is used to average
+                the loss. When using sampling method, avg_factor is usually
+                the sum of positive and negative priors. When using
+                `PseudoSampler`, `avg_factor` is usually equal to the number
+                of positive priors.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert stride[0] == stride[1], 'h stride is not equal to w stride!'
+        anchors = anchors.reshape(-1, 4)
+        cls_score = cls_score.permute(0, 2, 3,
+                                      1).reshape(-1, self.cls_out_channels)
+        bbox_pred = bbox_pred.permute(0, 2, 3,
+                                      1).reshape(-1, 4 * (self.reg_max + 1))
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero().squeeze(1)
+        score = label_weights.new_zeros(labels.shape)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+            pos_anchors = anchors[pos_inds]
+            pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0]
+
+            weight_targets = cls_score.detach().sigmoid()
+            weight_targets = weight_targets.max(dim=1)[0][pos_inds]
+            pos_bbox_pred_corners = self.integral(pos_bbox_pred)
+            pos_decode_bbox_pred = self.bbox_coder.decode(
+                pos_anchor_centers, pos_bbox_pred_corners)
+            pos_decode_bbox_targets = pos_bbox_targets / stride[0]
+            score[pos_inds] = bbox_overlaps(
+                pos_decode_bbox_pred.detach(),
+                pos_decode_bbox_targets,
+                is_aligned=True)
+            pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1)
+            target_corners = self.bbox_coder.encode(pos_anchor_centers,
+                                                    pos_decode_bbox_targets,
+                                                    self.reg_max).reshape(-1)
+
+            # regression loss
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_decode_bbox_targets,
+                weight=weight_targets,
+                avg_factor=1.0)
+
+            # dfl loss
+            loss_dfl = self.loss_dfl(
+                pred_corners,
+                target_corners,
+                weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
+                avg_factor=4.0)
+        else:
+            loss_bbox = bbox_pred.sum() * 0
+            loss_dfl = bbox_pred.sum() * 0
+            weight_targets = bbox_pred.new_tensor(0)
+
+        # cls (qfl) loss
+        loss_cls = self.loss_cls(
+            cls_score, (labels, score),
+            weight=label_weights,
+            avg_factor=avg_factor)
+
+        return loss_cls, loss_bbox, loss_dfl, weight_targets.sum()
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Cls and quality scores for each scale
+                level has shape (N, num_classes, H, W).
+            bbox_preds (list[Tensor]): Box distribution logits for each scale
+                level with shape (N, 4*(n+1), H, W), n is max value of integral
+                set.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, avg_factor) = cls_reg_targets
+
+        avg_factor = reduce_mean(
+            torch.tensor(avg_factor, dtype=torch.float, device=device)).item()
+
+        losses_cls, losses_bbox, losses_dfl,\
+            avg_factor = multi_apply(
+                self.loss_by_feat_single,
+                anchor_list,
+                cls_scores,
+                bbox_preds,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                self.prior_generator.strides,
+                avg_factor=avg_factor)
+
+        avg_factor = sum(avg_factor)
+        avg_factor = reduce_mean(avg_factor).clamp_(min=1).item()
+        losses_bbox = list(map(lambda x: x / avg_factor, losses_bbox))
+        losses_dfl = list(map(lambda x: x / avg_factor, losses_dfl))
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dfl=losses_dfl)
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigDict,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image. GFL head does not need this value.
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid, has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (:obj: `ConfigDict`): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            tuple[Tensor]: Results of detected bboxes and labels. If with_nms
+            is False and mlvl_score_factor is None, return mlvl_bboxes and
+            mlvl_scores, else return mlvl_bboxes, mlvl_scores and
+            mlvl_score_factor. Usually with_nms is False is used for aug
+            test. If with_nms is True, then return the following format
+
+            - det_bboxes (Tensor): Predicted bboxes with shape
+              [num_bboxes, 5], where the first 4 columns are bounding
+              box positions (tl_x, tl_y, br_x, br_y) and the 5-th
+              column are scores between 0 and 1.
+            - det_labels (Tensor): Predicted labels of the corresponding
+              box with shape [num_bboxes].
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_labels = []
+        for level_idx, (cls_score, bbox_pred, stride, priors) in enumerate(
+                zip(cls_score_list, bbox_pred_list,
+                    self.prior_generator.strides, mlvl_priors)):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            assert stride[0] == stride[1]
+
+            bbox_pred = bbox_pred.permute(1, 2, 0)
+            bbox_pred = self.integral(bbox_pred) * stride[0]
+
+            scores = cls_score.permute(1, 2, 0).reshape(
+                -1, self.cls_out_channels).sigmoid()
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, _, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+
+            bboxes = self.bbox_coder.decode(
+                self.anchor_center(priors), bbox_pred, max_shape=img_shape)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        results = InstanceData()
+        results.bboxes = torch.cat(mlvl_bboxes)
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+    def get_targets(self,
+                    anchor_list: List[Tensor],
+                    valid_flag_list: List[Tensor],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs=True) -> tuple:
+        """Get targets for GFL head.
+
+        This method is almost the same as `AnchorHead.get_targets()`. Besides
+        returning the targets as the parent method does, it also returns the
+        anchors as the first element of the returned tuple.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        num_level_anchors_list = [num_level_anchors] * num_imgs
+
+        # concat all level anchors and flags to a single tensor
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            anchor_list[i] = torch.cat(anchor_list[i])
+            valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+        (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+         all_bbox_weights, pos_inds_list, neg_inds_list,
+         sampling_results_list) = multi_apply(
+             self._get_targets_single,
+             anchor_list,
+             valid_flag_list,
+             num_level_anchors_list,
+             batch_gt_instances,
+             batch_img_metas,
+             batch_gt_instances_ignore,
+             unmap_outputs=unmap_outputs)
+        # Get `avg_factor` of all images, which calculate in `SamplingResult`.
+        # When using sampling method, avg_factor is usually the sum of
+        # positive and negative priors. When using `PseudoSampler`,
+        # `avg_factor` is usually equal to the number of positive priors.
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        # split targets to a list w.r.t. multiple levels
+        anchors_list = images_to_levels(all_anchors, num_level_anchors)
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors)
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, bbox_weights_list, avg_factor)
+
+    def _get_targets_single(self,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            num_level_anchors: List[int],
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors, 4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                    shape (num_anchors,).
+            num_level_anchors (list[int]): Number of anchors of each scale
+                level.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+
+        Returns:
+            tuple: N is the number of total anchors in the image.
+
+            - anchors (Tensor): All anchors in the image with shape (N, 4).
+            - labels (Tensor): Labels of all anchors in the image with
+              shape (N,).
+            - label_weights (Tensor): Label weights of all anchor in the
+              image with shape (N,).
+            - bbox_targets (Tensor): BBox targets of all anchors in the
+              image with shape (N, 4).
+            - bbox_weights (Tensor): BBox weights of all anchors in the
+              image with shape (N, 4).
+            - pos_inds (Tensor): Indices of positive anchor with shape
+              (num_pos,).
+            - neg_inds (Tensor): Indices of negative anchor with shape
+              (num_neg,).
+            - sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg['allowed_border'])
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+        num_level_anchors_inside = self.get_num_level_anchors_inside(
+            num_level_anchors, inside_flags)
+        pred_instances = InstanceData(priors=anchors)
+        assign_result = self.assigner.assign(
+            pred_instances=pred_instances,
+            num_level_priors=num_level_anchors_inside,
+            gt_instances=gt_instances,
+            gt_instances_ignore=gt_instances_ignore)
+
+        sampling_result = self.sampler.sample(
+            assign_result=assign_result,
+            pred_instances=pred_instances,
+            gt_instances=gt_instances)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            anchors = unmap(anchors, num_total_anchors, inside_flags)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (anchors, labels, label_weights, bbox_targets, bbox_weights,
+                pos_inds, neg_inds, sampling_result)
+
+    def get_num_level_anchors_inside(self, num_level_anchors: List[int],
+                                     inside_flags: Tensor) -> List[int]:
+        """Get the number of valid anchors in every level."""
+
+        split_inside_flags = torch.split(inside_flags, num_level_anchors)
+        num_level_anchors_inside = [
+            int(flags.sum()) for flags in split_inside_flags
+        ]
+        return num_level_anchors_inside
diff --git a/mmde/mmdet/models/dense_heads/grounding_dino_head.py b/mmde/mmdet/models/dense_heads/grounding_dino_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8088322546f24ae6f3e60aff1378d5c2feefdcf0
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/grounding_dino_head.py
@@ -0,0 +1,774 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import math
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import Linear
+from mmengine.model import constant_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.losses import QualityFocalLoss
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh
+from mmdet.utils import InstanceList, reduce_mean
+from ..layers import inverse_sigmoid
+from .atss_vlfusion_head import convert_grounding_to_cls_scores
+from .dino_head import DINOHead
+
+
+class ContrastiveEmbed(nn.Module):
+    """text visual ContrastiveEmbed layer.
+
+    Args:
+        max_text_len (int, optional): Maximum length of text.
+        log_scale (Optional[Union[str, float]]):  The initial value of a
+          learnable parameter to multiply with the similarity
+          matrix to normalize the output.  Defaults to 0.0.
+          - If set to 'auto', the similarity matrix will be normalized by
+            a fixed value ``sqrt(d_c)`` where ``d_c`` is the channel number.
+          - If set to 'none' or ``None``, there is no normalization applied.
+          - If set to a float number, the similarity matrix will be multiplied
+            by ``exp(log_scale)``, where ``log_scale`` is learnable.
+        bias (bool, optional): Whether to add bias to the output.
+          If set to ``True``, a learnable bias that is initialized as -4.6
+          will be added to the output. Useful when training from scratch.
+          Defaults to False.
+    """
+
+    def __init__(self,
+                 max_text_len: int = 256,
+                 log_scale: Optional[Union[str, float]] = None,
+                 bias: bool = False):
+        super().__init__()
+        self.max_text_len = max_text_len
+        self.log_scale = log_scale
+        if isinstance(log_scale, float):
+            self.log_scale = nn.Parameter(
+                torch.Tensor([float(log_scale)]), requires_grad=True)
+        elif log_scale not in ['auto', 'none', None]:
+            raise ValueError(f'log_scale should be one of '
+                             f'"auto", "none", None, but got {log_scale}')
+
+        self.bias = None
+        if bias:
+            bias_value = -math.log((1 - 0.01) / 0.01)
+            self.bias = nn.Parameter(
+                torch.Tensor([bias_value]), requires_grad=True)
+
+    def forward(self, visual_feat: Tensor, text_feat: Tensor,
+                text_token_mask: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            visual_feat (Tensor): Visual features.
+            text_feat (Tensor): Text features.
+            text_token_mask (Tensor): A mask used for text feats.
+
+        Returns:
+            Tensor: Classification score.
+        """
+        res = visual_feat @ text_feat.transpose(-1, -2)
+        if isinstance(self.log_scale, nn.Parameter):
+            res = res * self.log_scale.exp()
+        elif self.log_scale == 'auto':
+            # NOTE: similar to the normalizer in self-attention
+            res = res / math.sqrt(visual_feat.shape[-1])
+        if self.bias is not None:
+            res = res + self.bias
+        res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))
+
+        new_res = torch.full((*res.shape[:-1], self.max_text_len),
+                             float('-inf'),
+                             device=res.device)
+        new_res[..., :res.shape[-1]] = res
+
+        return new_res
+
+
+@MODELS.register_module()
+class GroundingDINOHead(DINOHead):
+    """Head of the Grounding DINO: Marrying DINO with Grounded Pre-Training for
+    Open-Set Object Detection.
+
+    Args:
+        contrastive_cfg (dict, optional): Contrastive config that contains
+          keys like ``max_text_len``. Defaults to dict(max_text_len=256).
+    """
+
+    def __init__(self, contrastive_cfg=dict(max_text_len=256), **kwargs):
+        self.contrastive_cfg = contrastive_cfg
+        self.max_text_len = contrastive_cfg.get('max_text_len', 256)
+        super().__init__(**kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize classification branch and regression branch of head."""
+        fc_cls = ContrastiveEmbed(**self.contrastive_cfg)
+        reg_branch = []
+        for _ in range(self.num_reg_fcs):
+            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
+            reg_branch.append(nn.ReLU())
+        reg_branch.append(Linear(self.embed_dims, 4))
+        reg_branch = nn.Sequential(*reg_branch)
+
+        # NOTE: due to the fc_cls is a contrastive embedding and don't
+        # have any trainable parameters,we do not need to copy it.
+        if self.share_pred_layer:
+            self.cls_branches = nn.ModuleList(
+                [fc_cls for _ in range(self.num_pred_layer)])
+            self.reg_branches = nn.ModuleList(
+                [reg_branch for _ in range(self.num_pred_layer)])
+        else:
+            self.cls_branches = nn.ModuleList(
+                [copy.deepcopy(fc_cls) for _ in range(self.num_pred_layer)])
+            self.reg_branches = nn.ModuleList([
+                copy.deepcopy(reg_branch) for _ in range(self.num_pred_layer)
+            ])
+
+    def init_weights(self) -> None:
+        """Initialize weights of the Deformable DETR head."""
+        for m in self.reg_branches:
+            constant_init(m[-1], 0, bias=0)
+        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
+        if self.as_two_stage:
+            for m in self.reg_branches:
+                nn.init.constant_(m[-1].bias.data[2:], 0.0)
+
+    def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict) -> tuple:
+        """Compute regression and classification targets for one image.
+
+        Outputs from a single decoder layer of a single feature level are used.
+
+        Args:
+            cls_score (Tensor): Box score logits from a single decoder layer
+                for one image. Shape [num_queries, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
+                for one image, with normalized coordinate (cx, cy, w, h) and
+                shape [num_queries, 4].
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for one image.
+
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+
+            - labels (Tensor): Labels of each image.
+            - label_weights (Tensor]): Label weights of each image.
+            - bbox_targets (Tensor): BBox targets of each image.
+            - bbox_weights (Tensor): BBox weights of each image.
+            - pos_inds (Tensor): Sampled positive indices for each image.
+            - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+        img_h, img_w = img_meta['img_shape']
+        factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+                                       img_h]).unsqueeze(0)
+        num_bboxes = bbox_pred.size(0)
+        # convert bbox_pred from xywh, normalized to xyxy, unnormalized
+        bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred)
+        bbox_pred = bbox_pred * factor
+
+        pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred)
+        # assigner and sampler
+        assign_result = self.assigner.assign(
+            pred_instances=pred_instances,
+            gt_instances=gt_instances,
+            img_meta=img_meta)
+        gt_bboxes = gt_instances.bboxes
+
+        pos_inds = torch.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+        neg_inds = torch.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+        pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+        pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :]
+
+        # Major changes. The labels are 0-1 binary labels for each bbox
+        # and text tokens.
+        labels = gt_bboxes.new_full((num_bboxes, self.max_text_len),
+                                    0,
+                                    dtype=torch.float32)
+        labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_bboxes)
+
+        # bbox targets
+        bbox_targets = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype)
+        bbox_weights = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype)
+        bbox_weights[pos_inds] = 1.0
+
+        # DETR regress the relative position of boxes (cxcywh) in the image.
+        # Thus the learning target should be normalized by the image size, also
+        # the box format should be converted from defaultly x1y1x2y2 to cxcywh.
+        pos_gt_bboxes_normalized = pos_gt_bboxes / factor
+        pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
+        bbox_targets[pos_inds] = pos_gt_bboxes_targets
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        references: List[Tensor],
+        memory_text: Tensor,
+        text_token_mask: Tensor,
+    ) -> Tuple[Tensor]:
+        """Forward function.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries, dim).
+            references (List[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries, 4) when `as_two_stage` of the detector is `True`,
+                otherwise (bs, num_queries, 2). Each `inter_reference` has
+                shape (bs, num_queries, 4) when `with_box_refine` of the
+                detector is `True`, otherwise (bs, num_queries, 2). The
+                coordinates are arranged as (cx, cy) when the last dimension is
+                2, and (cx, cy, w, h) when it is 4.
+            memory_text (Tensor): Memory text. It has shape (bs, len_text,
+                text_embed_dims).
+            text_token_mask (Tensor): Text token mask. It has shape (bs,
+                len_text).
+
+        Returns:
+            tuple[Tensor]: results of head containing the following tensor.
+
+            - all_layers_outputs_classes (Tensor): Outputs from the
+              classification head, has shape (num_decoder_layers, bs,
+              num_queries, cls_out_channels).
+            - all_layers_outputs_coords (Tensor): Sigmoid outputs from the
+              regression head with normalized coordinate format (cx, cy, w,
+              h), has shape (num_decoder_layers, bs, num_queries, 4) with the
+              last dimension arranged as (cx, cy, w, h).
+        """
+        all_layers_outputs_classes = []
+        all_layers_outputs_coords = []
+
+        for layer_id in range(hidden_states.shape[0]):
+            reference = inverse_sigmoid(references[layer_id])
+            # NOTE The last reference will not be used.
+            hidden_state = hidden_states[layer_id]
+            outputs_class = self.cls_branches[layer_id](hidden_state,
+                                                        memory_text,
+                                                        text_token_mask)
+            tmp_reg_preds = self.reg_branches[layer_id](hidden_state)
+            if reference.shape[-1] == 4:
+                # When `layer` is 0 and `as_two_stage` of the detector
+                # is `True`, or when `layer` is greater than 0 and
+                # `with_box_refine` of the detector is `True`.
+                tmp_reg_preds += reference
+            else:
+                # When `layer` is 0 and `as_two_stage` of the detector
+                # is `False`, or when `layer` is greater than 0 and
+                # `with_box_refine` of the detector is `False`.
+                assert reference.shape[-1] == 2
+                tmp_reg_preds[..., :2] += reference
+            outputs_coord = tmp_reg_preds.sigmoid()
+            all_layers_outputs_classes.append(outputs_class)
+            all_layers_outputs_coords.append(outputs_coord)
+
+        all_layers_outputs_classes = torch.stack(all_layers_outputs_classes)
+        all_layers_outputs_coords = torch.stack(all_layers_outputs_coords)
+
+        return all_layers_outputs_classes, all_layers_outputs_coords
+
+    def predict(self,
+                hidden_states: Tensor,
+                references: List[Tensor],
+                memory_text: Tensor,
+                text_token_mask: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> InstanceList:
+        """Perform forward propagation and loss calculation of the detection
+        head on the queries of the upstream network.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, num_queries, bs, dim).
+            references (List[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries, 4) when `as_two_stage` of the detector is `True`,
+                otherwise (bs, num_queries, 2). Each `inter_reference` has
+                shape (bs, num_queries, 4) when `with_box_refine` of the
+                detector is `True`, otherwise (bs, num_queries, 2). The
+                coordinates are arranged as (cx, cy) when the last dimension is
+                2, and (cx, cy, w, h) when it is 4.
+            memory_text (Tensor): Memory text. It has shape (bs, len_text,
+                text_embed_dims).
+            text_token_mask (Tensor): Text token mask. It has shape (bs,
+                len_text).
+            batch_data_samples (SampleList): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): If `True`, return boxes in original
+                image space. Defaults to `True`.
+
+        Returns:
+            InstanceList: Detection results of each image
+                after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        batch_token_positive_maps = [
+            data_samples.token_positive_map
+            for data_samples in batch_data_samples
+        ]
+
+        outs = self(hidden_states, references, memory_text, text_token_mask)
+
+        predictions = self.predict_by_feat(
+            *outs,
+            batch_img_metas=batch_img_metas,
+            batch_token_positive_maps=batch_token_positive_maps,
+            rescale=rescale)
+        return predictions
+
+    def predict_by_feat(self,
+                        all_layers_cls_scores: Tensor,
+                        all_layers_bbox_preds: Tensor,
+                        batch_img_metas: List[Dict],
+                        batch_token_positive_maps: Optional[List[dict]] = None,
+                        rescale: bool = False) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            all_layers_cls_scores (Tensor):  Classification scores of all
+                decoder layers, has shape (num_decoder_layers, bs, num_queries,
+                cls_out_channels).
+            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
+                layers. Each is a 4D-tensor with normalized coordinate format
+                (cx, cy, w, h) and shape (num_decoder_layers, bs, num_queries,
+                4) with the last dimension arranged as (cx, cy, w, h).
+            batch_img_metas (List[Dict]): _description_
+            batch_token_positive_maps (list[dict], Optional): Batch token
+                positive map. Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cls_scores = all_layers_cls_scores[-1]
+        bbox_preds = all_layers_bbox_preds[-1]
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_score = cls_scores[img_id]
+            bbox_pred = bbox_preds[img_id]
+            img_meta = batch_img_metas[img_id]
+            token_positive_maps = batch_token_positive_maps[img_id]
+            results = self._predict_by_feat_single(cls_score, bbox_pred,
+                                                   token_positive_maps,
+                                                   img_meta, rescale)
+            result_list.append(results)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_score: Tensor,
+                                bbox_pred: Tensor,
+                                token_positive_maps: dict,
+                                img_meta: dict,
+                                rescale: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_score (Tensor): Box score logits from the last decoder layer
+                for each image. Shape [num_queries, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
+                for each image, with coordinate format (cx, cy, w, h) and
+                shape [num_queries, 4].
+            token_positive_maps (dict): Token positive map.
+            img_meta (dict): Image meta info.
+            rescale (bool, optional): If True, return boxes in original image
+                space. Default True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_score) == len(bbox_pred)  # num_queries
+        max_per_img = self.test_cfg.get('max_per_img', len(cls_score))
+        img_shape = img_meta['img_shape']
+
+        if token_positive_maps is not None:
+            cls_score = convert_grounding_to_cls_scores(
+                logits=cls_score.sigmoid()[None],
+                positive_maps=[token_positive_maps])[0]
+            scores, indexes = cls_score.view(-1).topk(max_per_img)
+            num_classes = cls_score.shape[-1]
+            det_labels = indexes % num_classes
+            bbox_index = indexes // num_classes
+            bbox_pred = bbox_pred[bbox_index]
+        else:
+            cls_score = cls_score.sigmoid()
+            scores, _ = cls_score.max(-1)
+            scores, indexes = scores.topk(max_per_img)
+            bbox_pred = bbox_pred[indexes]
+            det_labels = scores.new_zeros(scores.shape, dtype=torch.long)
+
+        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
+        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
+        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
+        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
+        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            det_bboxes /= det_bboxes.new_tensor(
+                img_meta['scale_factor']).repeat((1, 2))
+        results = InstanceData()
+        results.bboxes = det_bboxes
+        results.scores = scores
+        results.labels = det_labels
+        return results
+
+    def loss(self, hidden_states: Tensor, references: List[Tensor],
+             memory_text: Tensor, text_token_mask: Tensor,
+             enc_outputs_class: Tensor, enc_outputs_coord: Tensor,
+             batch_data_samples: SampleList, dn_meta: Dict[str, int]) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the queries of the upstream network.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries_total,
+                dim), where `num_queries_total` is the sum of
+                `num_denoising_queries` and `num_matching_queries` when
+                `self.training` is `True`, else `num_matching_queries`.
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries_total, 4) and each `inter_reference` has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            memory_text (Tensor): Memory text. It has shape (bs, len_text,
+                text_embed_dims).
+            enc_outputs_class (Tensor): The score of each point on encode
+                feature map, has shape (bs, num_feat_points, cls_out_channels).
+            enc_outputs_coord (Tensor): The proposal generate from the
+                encode feature map, has shape (bs, num_feat_points, 4) with the
+                last dimension arranged as (cx, cy, w, h).
+            batch_data_samples (list[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        batch_gt_instances = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+
+        outs = self(hidden_states, references, memory_text, text_token_mask)
+        self.text_masks = text_token_mask
+        loss_inputs = outs + (enc_outputs_class, enc_outputs_coord,
+                              batch_gt_instances, batch_img_metas, dn_meta)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
+                            batch_gt_instances: InstanceList,
+                            batch_img_metas: List[dict]) -> Tuple[Tensor]:
+        """Loss function for outputs from a single decoder layer of a single
+        feature level.
+
+        Args:
+            cls_scores (Tensor): Box score logits from a single decoder layer
+                for all images, has shape (bs, num_queries, cls_out_channels).
+            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+                for all images, with normalized coordinate (cx, cy, w, h) and
+                shape (bs, num_queries, 4).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+        Returns:
+            Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and
+            `loss_iou`.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
+        with torch.no_grad():
+            cls_reg_targets = self.get_targets(cls_scores_list,
+                                               bbox_preds_list,
+                                               batch_gt_instances,
+                                               batch_img_metas)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.stack(labels_list, 0)
+        label_weights = torch.stack(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # ===== this change =====
+        # Loss is not computed for the padded regions of the text.
+        assert (self.text_masks.dim() == 2)
+        text_masks = self.text_masks.new_zeros(
+            (self.text_masks.size(0), self.max_text_len))
+        text_masks[:, :self.text_masks.size(1)] = self.text_masks
+        text_mask = (text_masks > 0).unsqueeze(1)
+        text_mask = text_mask.repeat(1, cls_scores.size(1), 1)
+        cls_scores = torch.masked_select(cls_scores, text_mask).contiguous()
+
+        labels = torch.masked_select(labels, text_mask)
+        label_weights = label_weights[...,
+                                      None].repeat(1, 1, text_mask.size(-1))
+        label_weights = torch.masked_select(label_weights, text_mask)
+
+        # classification loss
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+        cls_avg_factor = max(cls_avg_factor, 1)
+
+        if isinstance(self.loss_cls, QualityFocalLoss):
+            raise NotImplementedError(
+                'QualityFocalLoss for GroundingDINOHead is not supported yet.')
+        else:
+            loss_cls = self.loss_cls(
+                cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes across all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # construct factors used for rescale bboxes
+        factors = []
+        for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds):
+            img_h, img_w, = img_meta['img_shape']
+            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+                                           img_h]).unsqueeze(0).repeat(
+                                               bbox_pred.size(0), 1)
+            factors.append(factor)
+        factors = torch.cat(factors, 0)
+
+        # DETR regress the relative position of boxes (cxcywh) in the image,
+        # thus the learning target is normalized by the image size. So here
+        # we need to re-scale them for calculating IoU loss
+        bbox_preds = bbox_preds.reshape(-1, 4)
+        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
+        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
+
+        # regression IoU loss, defaultly GIoU loss
+        loss_iou = self.loss_iou(
+            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
+
+        # regression L1 loss
+        loss_bbox = self.loss_bbox(
+            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
+        return loss_cls, loss_bbox, loss_iou
+
+    def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor,
+                        batch_gt_instances: InstanceList,
+                        batch_img_metas: List[dict],
+                        dn_meta: Dict[str, int]) -> Tuple[Tensor]:
+        """Denoising loss for outputs from a single decoder layer.
+
+        Args:
+            dn_cls_scores (Tensor): Classification scores of a single decoder
+                layer in denoising part, has shape (bs, num_denoising_queries,
+                cls_out_channels).
+            dn_bbox_preds (Tensor): Regression outputs of a single decoder
+                layer in denoising part. Each is a 4D-tensor with normalized
+                coordinate format (cx, cy, w, h) and has shape
+                (bs, num_denoising_queries, 4).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+
+        Returns:
+            Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and
+            `loss_iou`.
+        """
+        cls_reg_targets = self.get_dn_targets(batch_gt_instances,
+                                              batch_img_metas, dn_meta)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.stack(labels_list, 0)
+        label_weights = torch.stack(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+        # ===== this change =====
+        # Loss is not computed for the padded regions of the text.
+        assert (self.text_masks.dim() == 2)
+        text_masks = self.text_masks.new_zeros(
+            (self.text_masks.size(0), self.max_text_len))
+        text_masks[:, :self.text_masks.size(1)] = self.text_masks
+        text_mask = (text_masks > 0).unsqueeze(1)
+        text_mask = text_mask.repeat(1, dn_cls_scores.size(1), 1)
+        cls_scores = torch.masked_select(dn_cls_scores, text_mask).contiguous()
+        labels = torch.masked_select(labels, text_mask)
+        label_weights = label_weights[...,
+                                      None].repeat(1, 1, text_mask.size(-1))
+        label_weights = torch.masked_select(label_weights, text_mask)
+        # =======================
+
+        # classification loss
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = \
+            num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+        cls_avg_factor = max(cls_avg_factor, 1)
+
+        if len(cls_scores) > 0:
+            if isinstance(self.loss_cls, QualityFocalLoss):
+                raise NotImplementedError('QualityFocalLoss is not supported')
+            else:
+                loss_cls = self.loss_cls(
+                    cls_scores,
+                    labels,
+                    label_weights,
+                    avg_factor=cls_avg_factor)
+        else:
+            loss_cls = torch.zeros(
+                1, dtype=cls_scores.dtype, device=cls_scores.device)
+
+        # Compute the average number of gt boxes across all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # construct factors used for rescale bboxes
+        factors = []
+        for img_meta, bbox_pred in zip(batch_img_metas, dn_bbox_preds):
+            img_h, img_w = img_meta['img_shape']
+            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+                                           img_h]).unsqueeze(0).repeat(
+                                               bbox_pred.size(0), 1)
+            factors.append(factor)
+        factors = torch.cat(factors)
+
+        # DETR regress the relative position of boxes (cxcywh) in the image,
+        # thus the learning target is normalized by the image size. So here
+        # we need to re-scale them for calculating IoU loss
+        bbox_preds = dn_bbox_preds.reshape(-1, 4)
+        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
+        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
+
+        # regression IoU loss, defaultly GIoU loss
+        loss_iou = self.loss_iou(
+            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
+
+        # regression L1 loss
+        loss_bbox = self.loss_bbox(
+            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
+        return loss_cls, loss_bbox, loss_iou
+
+    def _get_dn_targets_single(self, gt_instances: InstanceData,
+                               img_meta: dict, dn_meta: Dict[str,
+                                                             int]) -> tuple:
+        """Get targets in denoising part for one image.
+
+        Args:
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for one image.
+            dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+
+            - labels (Tensor): Labels of each image.
+            - label_weights (Tensor]): Label weights of each image.
+            - bbox_targets (Tensor): BBox targets of each image.
+            - bbox_weights (Tensor): BBox weights of each image.
+            - pos_inds (Tensor): Sampled positive indices for each image.
+            - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        num_groups = dn_meta['num_denoising_groups']
+        num_denoising_queries = dn_meta['num_denoising_queries']
+        num_queries_each_group = int(num_denoising_queries / num_groups)
+        device = gt_bboxes.device
+
+        if len(gt_labels) > 0:
+            t = torch.arange(len(gt_labels), dtype=torch.long, device=device)
+            t = t.unsqueeze(0).repeat(num_groups, 1)
+            pos_assigned_gt_inds = t.flatten()
+            pos_inds = torch.arange(
+                num_groups, dtype=torch.long, device=device)
+            pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t
+            pos_inds = pos_inds.flatten()
+        else:
+            pos_inds = pos_assigned_gt_inds = \
+                gt_bboxes.new_tensor([], dtype=torch.long)
+
+        neg_inds = pos_inds + num_queries_each_group // 2
+        # label targets
+        # this change
+        labels = gt_bboxes.new_full((num_denoising_queries, self.max_text_len),
+                                    0,
+                                    dtype=torch.float32)
+        labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_denoising_queries)
+
+        # bbox targets
+        bbox_targets = torch.zeros(num_denoising_queries, 4, device=device)
+        bbox_weights = torch.zeros(num_denoising_queries, 4, device=device)
+        bbox_weights[pos_inds] = 1.0
+        img_h, img_w = img_meta['img_shape']
+
+        # DETR regress the relative position of boxes (cxcywh) in the image.
+        # Thus the learning target should be normalized by the image size, also
+        # the box format should be converted from defaultly x1y1x2y2 to cxcywh.
+        factor = gt_bboxes.new_tensor([img_w, img_h, img_w,
+                                       img_h]).unsqueeze(0)
+        gt_bboxes_normalized = gt_bboxes / factor
+        gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized)
+        bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1])
+
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
diff --git a/mmde/mmdet/models/dense_heads/guided_anchor_head.py b/mmde/mmdet/models/dense_heads/guided_anchor_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..59f6dd3336e66065dc88b702e925965d4089c72f
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/guided_anchor_head.py
@@ -0,0 +1,994 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.ops import DeformConv2d, MaskedConv2d
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
+                         OptInstanceList)
+from ..layers import multiclass_nms
+from ..task_modules.prior_generators import anchor_inside_flags, calc_region
+from ..task_modules.samplers import PseudoSampler
+from ..utils import images_to_levels, multi_apply, unmap
+from .anchor_head import AnchorHead
+
+
+class FeatureAdaption(BaseModule):
+    """Feature Adaption Module.
+
+    Feature Adaption Module is implemented based on DCN v1.
+    It uses anchor shape prediction rather than feature map to
+    predict offsets of deform conv layer.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        out_channels (int): Number of channels in the output feature map.
+        kernel_size (int): Deformable conv kernel size. Defaults to 3.
+        deform_groups (int): Deformable conv group size. Defaults to 4.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
+            list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        deform_groups: int = 4,
+        init_cfg: MultiConfig = dict(
+            type='Normal',
+            layer='Conv2d',
+            std=0.1,
+            override=dict(type='Normal', name='conv_adaption', std=0.01))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        offset_channels = kernel_size * kernel_size * 2
+        self.conv_offset = nn.Conv2d(
+            2, deform_groups * offset_channels, 1, bias=False)
+        self.conv_adaption = DeformConv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            padding=(kernel_size - 1) // 2,
+            deform_groups=deform_groups)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x: Tensor, shape: Tensor) -> Tensor:
+        offset = self.conv_offset(shape.detach())
+        x = self.relu(self.conv_adaption(x, offset))
+        return x
+
+
+@MODELS.register_module()
+class GuidedAnchorHead(AnchorHead):
+    """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.).
+
+    This GuidedAnchorHead will predict high-quality feature guided
+    anchors and locations where anchors will be kept in inference.
+    There are mainly 3 categories of bounding-boxes.
+
+    - Sampled 9 pairs for target assignment. (approxes)
+    - The square boxes where the predicted anchors are based on. (squares)
+    - Guided anchors.
+
+    Please refer to https://arxiv.org/abs/1901.03278 for more details.
+
+    Args:
+        num_classes (int): Number of classes.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels. Defaults to 256.
+        approx_anchor_generator (:obj:`ConfigDict` or dict): Config dict
+            for approx generator
+        square_anchor_generator (:obj:`ConfigDict` or dict): Config dict
+            for square generator
+        anchor_coder (:obj:`ConfigDict` or dict): Config dict for anchor coder
+        bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Defaults to False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        deform_groups: (int): Group number of DCN in FeatureAdaption module.
+            Defaults to 4.
+        loc_filter_thr (float): Threshold to filter out unconcerned regions.
+            Defaults to 0.01.
+        loss_loc (:obj:`ConfigDict` or dict): Config of location loss.
+        loss_shape (:obj:`ConfigDict` or dict): Config of anchor shape loss.
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of bbox regression loss.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
+            list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        in_channels: int,
+        feat_channels: int = 256,
+        approx_anchor_generator: ConfigType = dict(
+            type='AnchorGenerator',
+            octave_base_scale=8,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        square_anchor_generator: ConfigType = dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[8],
+            strides=[4, 8, 16, 32, 64]),
+        anchor_coder: ConfigType = dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        bbox_coder: ConfigType = dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        reg_decoded_bbox: bool = False,
+        deform_groups: int = 4,
+        loc_filter_thr: float = 0.01,
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        loss_loc: ConfigType = dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape: ConfigType = dict(
+            type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
+        loss_cls: ConfigType = dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox: ConfigType = dict(
+            type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
+        init_cfg: MultiConfig = dict(
+            type='Normal',
+            layer='Conv2d',
+            std=0.01,
+            override=dict(
+                type='Normal', name='conv_loc', std=0.01, lbias_prob=0.01))
+    ) -> None:
+        super(AnchorHead, self).__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.feat_channels = feat_channels
+        self.deform_groups = deform_groups
+        self.loc_filter_thr = loc_filter_thr
+
+        # build approx_anchor_generator and square_anchor_generator
+        assert (approx_anchor_generator['octave_base_scale'] ==
+                square_anchor_generator['scales'][0])
+        assert (approx_anchor_generator['strides'] ==
+                square_anchor_generator['strides'])
+        self.approx_anchor_generator = TASK_UTILS.build(
+            approx_anchor_generator)
+        self.square_anchor_generator = TASK_UTILS.build(
+            square_anchor_generator)
+        self.approxs_per_octave = self.approx_anchor_generator \
+            .num_base_priors[0]
+
+        self.reg_decoded_bbox = reg_decoded_bbox
+
+        # one anchor per location
+        self.num_base_priors = self.square_anchor_generator.num_base_priors[0]
+
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = self.num_classes
+        else:
+            self.cls_out_channels = self.num_classes + 1
+
+        # build bbox_coder
+        self.anchor_coder = TASK_UTILS.build(anchor_coder)
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+
+        # build losses
+        self.loss_loc = MODELS.build(loss_loc)
+        self.loss_shape = MODELS.build(loss_shape)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            # use PseudoSampler when no sampler in train_cfg
+            if train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler()
+
+            self.ga_assigner = TASK_UTILS.build(self.train_cfg['ga_assigner'])
+            if train_cfg.get('ga_sampler', None) is not None:
+                self.ga_sampler = TASK_UTILS.build(
+                    self.train_cfg['ga_sampler'],
+                    default_args=dict(context=self))
+            else:
+                self.ga_sampler = PseudoSampler()
+
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.conv_loc = nn.Conv2d(self.in_channels, 1, 1)
+        self.conv_shape = nn.Conv2d(self.in_channels, self.num_base_priors * 2,
+                                    1)
+        self.feature_adaption = FeatureAdaption(
+            self.in_channels,
+            self.feat_channels,
+            kernel_size=3,
+            deform_groups=self.deform_groups)
+        self.conv_cls = MaskedConv2d(
+            self.feat_channels, self.num_base_priors * self.cls_out_channels,
+            1)
+        self.conv_reg = MaskedConv2d(self.feat_channels,
+                                     self.num_base_priors * 4, 1)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor]:
+        """Forward feature of a single scale level."""
+        loc_pred = self.conv_loc(x)
+        shape_pred = self.conv_shape(x)
+        x = self.feature_adaption(x, shape_pred)
+        # masked conv is only used during inference for speed-up
+        if not self.training:
+            mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
+        else:
+            mask = None
+        cls_score = self.conv_cls(x, mask)
+        bbox_pred = self.conv_reg(x, mask)
+        return cls_score, bbox_pred, shape_pred, loc_pred
+
+    def forward(self, x: List[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward features from the upstream network."""
+        return multi_apply(self.forward_single, x)
+
+    def get_sampled_approxs(self,
+                            featmap_sizes: List[Tuple[int, int]],
+                            batch_img_metas: List[dict],
+                            device: str = 'cuda') -> tuple:
+        """Get sampled approxs and inside flags according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            batch_img_metas (list[dict]): Image meta info.
+            device (str): device for returned tensors
+
+        Returns:
+            tuple: approxes of each image, inside flags of each image
+        """
+        num_imgs = len(batch_img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # approxes for one time
+        multi_level_approxs = self.approx_anchor_generator.grid_priors(
+            featmap_sizes, device=device)
+        approxs_list = [multi_level_approxs for _ in range(num_imgs)]
+
+        # for each image, we compute inside flags of multi level approxes
+        inside_flag_list = []
+        for img_id, img_meta in enumerate(batch_img_metas):
+            multi_level_flags = []
+            multi_level_approxs = approxs_list[img_id]
+
+            # obtain valid flags for each approx first
+            multi_level_approx_flags = self.approx_anchor_generator \
+                .valid_flags(featmap_sizes,
+                             img_meta['pad_shape'],
+                             device=device)
+
+            for i, flags in enumerate(multi_level_approx_flags):
+                approxs = multi_level_approxs[i]
+                inside_flags_list = []
+                for j in range(self.approxs_per_octave):
+                    split_valid_flags = flags[j::self.approxs_per_octave]
+                    split_approxs = approxs[j::self.approxs_per_octave, :]
+                    inside_flags = anchor_inside_flags(
+                        split_approxs, split_valid_flags,
+                        img_meta['img_shape'][:2],
+                        self.train_cfg['allowed_border'])
+                    inside_flags_list.append(inside_flags)
+                # inside_flag for a position is true if any anchor in this
+                # position is true
+                inside_flags = (
+                    torch.stack(inside_flags_list, 0).sum(dim=0) > 0)
+                multi_level_flags.append(inside_flags)
+            inside_flag_list.append(multi_level_flags)
+        return approxs_list, inside_flag_list
+
+    def get_anchors(self,
+                    featmap_sizes: List[Tuple[int, int]],
+                    shape_preds: List[Tensor],
+                    loc_preds: List[Tensor],
+                    batch_img_metas: List[dict],
+                    use_loc_filter: bool = False,
+                    device: str = 'cuda') -> tuple:
+        """Get squares according to feature map sizes and guided anchors.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            shape_preds (list[tensor]): Multi-level shape predictions.
+            loc_preds (list[tensor]): Multi-level location predictions.
+            batch_img_metas (list[dict]): Image meta info.
+            use_loc_filter (bool): Use loc filter or not. Defaults to False
+            device (str): device for returned tensors.
+                Defaults to `cuda`.
+
+        Returns:
+            tuple: square approxs of each image, guided anchors of each image,
+            loc masks of each image.
+        """
+        num_imgs = len(batch_img_metas)
+        num_levels = len(featmap_sizes)
+
+        # since feature map sizes of all images are the same, we only compute
+        # squares for one time
+        multi_level_squares = self.square_anchor_generator.grid_priors(
+            featmap_sizes, device=device)
+        squares_list = [multi_level_squares for _ in range(num_imgs)]
+
+        # for each image, we compute multi level guided anchors
+        guided_anchors_list = []
+        loc_mask_list = []
+        for img_id, img_meta in enumerate(batch_img_metas):
+            multi_level_guided_anchors = []
+            multi_level_loc_mask = []
+            for i in range(num_levels):
+                squares = squares_list[img_id][i]
+                shape_pred = shape_preds[i][img_id]
+                loc_pred = loc_preds[i][img_id]
+                guided_anchors, loc_mask = self._get_guided_anchors_single(
+                    squares,
+                    shape_pred,
+                    loc_pred,
+                    use_loc_filter=use_loc_filter)
+                multi_level_guided_anchors.append(guided_anchors)
+                multi_level_loc_mask.append(loc_mask)
+            guided_anchors_list.append(multi_level_guided_anchors)
+            loc_mask_list.append(multi_level_loc_mask)
+        return squares_list, guided_anchors_list, loc_mask_list
+
+    def _get_guided_anchors_single(
+            self,
+            squares: Tensor,
+            shape_pred: Tensor,
+            loc_pred: Tensor,
+            use_loc_filter: bool = False) -> Tuple[Tensor]:
+        """Get guided anchors and loc masks for a single level.
+
+        Args:
+            squares (tensor): Squares of a single level.
+            shape_pred (tensor): Shape predictions of a single level.
+            loc_pred (tensor): Loc predictions of a single level.
+            use_loc_filter (list[tensor]): Use loc filter or not.
+                Defaults to False.
+
+        Returns:
+            tuple: guided anchors, location masks
+        """
+        # calculate location filtering mask
+        loc_pred = loc_pred.sigmoid().detach()
+        if use_loc_filter:
+            loc_mask = loc_pred >= self.loc_filter_thr
+        else:
+            loc_mask = loc_pred >= 0.0
+        mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_base_priors)
+        mask = mask.contiguous().view(-1)
+        # calculate guided anchors
+        squares = squares[mask]
+        anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view(
+            -1, 2).detach()[mask]
+        bbox_deltas = anchor_deltas.new_full(squares.size(), 0)
+        bbox_deltas[:, 2:] = anchor_deltas
+        guided_anchors = self.anchor_coder.decode(
+            squares, bbox_deltas, wh_ratio_clip=1e-6)
+        return guided_anchors, mask
+
+    def ga_loc_targets(self, batch_gt_instances: InstanceList,
+                       featmap_sizes: List[Tuple[int, int]]) -> tuple:
+        """Compute location targets for guided anchoring.
+
+        Each feature map is divided into positive, negative and ignore regions.
+        - positive regions: target 1, weight 1
+        - ignore regions: target 0, weight 0
+        - negative regions: target 0, weight 0.1
+
+        Args:
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            featmap_sizes (list[tuple]): Multi level sizes of each feature
+                maps.
+
+        Returns:
+            tuple: Returns a tuple containing location targets.
+        """
+        anchor_scale = self.approx_anchor_generator.octave_base_scale
+        anchor_strides = self.approx_anchor_generator.strides
+        # Currently only supports same stride in x and y direction.
+        for stride in anchor_strides:
+            assert (stride[0] == stride[1])
+        anchor_strides = [stride[0] for stride in anchor_strides]
+
+        center_ratio = self.train_cfg['center_ratio']
+        ignore_ratio = self.train_cfg['ignore_ratio']
+        img_per_gpu = len(batch_gt_instances)
+        num_lvls = len(featmap_sizes)
+        r1 = (1 - center_ratio) / 2
+        r2 = (1 - ignore_ratio) / 2
+        all_loc_targets = []
+        all_loc_weights = []
+        all_ignore_map = []
+        for lvl_id in range(num_lvls):
+            h, w = featmap_sizes[lvl_id]
+            loc_targets = torch.zeros(
+                img_per_gpu,
+                1,
+                h,
+                w,
+                device=batch_gt_instances[0].bboxes.device,
+                dtype=torch.float32)
+            loc_weights = torch.full_like(loc_targets, -1)
+            ignore_map = torch.zeros_like(loc_targets)
+            all_loc_targets.append(loc_targets)
+            all_loc_weights.append(loc_weights)
+            all_ignore_map.append(ignore_map)
+        for img_id in range(img_per_gpu):
+            gt_bboxes = batch_gt_instances[img_id].bboxes
+            scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
+                               (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
+            min_anchor_size = scale.new_full(
+                (1, ), float(anchor_scale * anchor_strides[0]))
+            # assign gt bboxes to different feature levels w.r.t. their scales
+            target_lvls = torch.floor(
+                torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
+            target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
+            for gt_id in range(gt_bboxes.size(0)):
+                lvl = target_lvls[gt_id].item()
+                # rescaled to corresponding feature map
+                gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl]
+                # calculate ignore regions
+                ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
+                    gt_, r2, featmap_sizes[lvl])
+                # calculate positive (center) regions
+                ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region(
+                    gt_, r1, featmap_sizes[lvl])
+                all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
+                                     ctr_x1:ctr_x2 + 1] = 1
+                all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
+                                     ignore_x1:ignore_x2 + 1] = 0
+                all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
+                                     ctr_x1:ctr_x2 + 1] = 1
+                # calculate ignore map on nearby low level feature
+                if lvl > 0:
+                    d_lvl = lvl - 1
+                    # rescaled to corresponding feature map
+                    gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl]
+                    ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
+                        gt_, r2, featmap_sizes[d_lvl])
+                    all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
+                                          ignore_x1:ignore_x2 + 1] = 1
+                # calculate ignore map on nearby high level feature
+                if lvl < num_lvls - 1:
+                    u_lvl = lvl + 1
+                    # rescaled to corresponding feature map
+                    gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl]
+                    ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
+                        gt_, r2, featmap_sizes[u_lvl])
+                    all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
+                                          ignore_x1:ignore_x2 + 1] = 1
+        for lvl_id in range(num_lvls):
+            # ignore negative regions w.r.t. ignore map
+            all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0)
+                                    & (all_ignore_map[lvl_id] > 0)] = 0
+            # set negative regions with weight 0.1
+            all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1
+        # loc average factor to balance loss
+        loc_avg_factor = sum(
+            [t.size(0) * t.size(-1) * t.size(-2)
+             for t in all_loc_targets]) / 200
+        return all_loc_targets, all_loc_weights, loc_avg_factor
+
+    def _ga_shape_target_single(self,
+                                flat_approxs: Tensor,
+                                inside_flags: Tensor,
+                                flat_squares: Tensor,
+                                gt_instances: InstanceData,
+                                gt_instances_ignore: Optional[InstanceData],
+                                img_meta: dict,
+                                unmap_outputs: bool = True) -> tuple:
+        """Compute guided anchoring targets.
+
+        This function returns sampled anchors and gt bboxes directly
+        rather than calculates regression targets.
+
+        Args:
+            flat_approxs (Tensor): flat approxs of a single image,
+                shape (n, 4)
+            inside_flags (Tensor): inside flags of a single image,
+                shape (n, ).
+            flat_squares (Tensor): flat squares of a single image,
+                shape (approxs_per_octave * n, 4)
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+            img_meta (dict): Meta info of a single image.
+            unmap_outputs (bool): unmap outputs or not.
+
+        Returns:
+            tuple: Returns a tuple containing shape targets of each image.
+        """
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+        # assign gt and sample anchors
+        num_square = flat_squares.size(0)
+        approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4)
+        approxs = approxs[inside_flags, ...]
+        squares = flat_squares[inside_flags, :]
+
+        pred_instances = InstanceData()
+        pred_instances.priors = squares
+        pred_instances.approxs = approxs
+
+        assign_result = self.ga_assigner.assign(
+            pred_instances=pred_instances,
+            gt_instances=gt_instances,
+            gt_instances_ignore=gt_instances_ignore)
+        sampling_result = self.ga_sampler.sample(
+            assign_result=assign_result,
+            pred_instances=pred_instances,
+            gt_instances=gt_instances)
+
+        bbox_anchors = torch.zeros_like(squares)
+        bbox_gts = torch.zeros_like(squares)
+        bbox_weights = torch.zeros_like(squares)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes
+            bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes
+            bbox_weights[pos_inds, :] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_squares.size(0)
+            bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags)
+            bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags)
+            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+        return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds,
+                sampling_result)
+
+    def ga_shape_targets(self,
+                         approx_list: List[List[Tensor]],
+                         inside_flag_list: List[List[Tensor]],
+                         square_list: List[List[Tensor]],
+                         batch_gt_instances: InstanceList,
+                         batch_img_metas: List[dict],
+                         batch_gt_instances_ignore: OptInstanceList = None,
+                         unmap_outputs: bool = True) -> tuple:
+        """Compute guided anchoring targets.
+
+        Args:
+            approx_list (list[list[Tensor]]): Multi level approxs of each
+                image.
+            inside_flag_list (list[list[Tensor]]): Multi level inside flags
+                of each image.
+            square_list (list[list[Tensor]]): Multi level squares of each
+                image.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): unmap outputs or not. Defaults to None.
+
+        Returns:
+            tuple:  Returns a tuple containing shape targets.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(approx_list) == len(inside_flag_list) == len(
+            square_list) == num_imgs
+        # anchor number of multi levels
+        num_level_squares = [squares.size(0) for squares in square_list[0]]
+        # concat all level anchors and flags to a single tensor
+        inside_flag_flat_list = []
+        approx_flat_list = []
+        square_flat_list = []
+        for i in range(num_imgs):
+            assert len(square_list[i]) == len(inside_flag_list[i])
+            inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
+            approx_flat_list.append(torch.cat(approx_list[i]))
+            square_flat_list.append(torch.cat(square_list[i]))
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None for _ in range(num_imgs)]
+        (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list,
+         neg_inds_list, sampling_results_list) = multi_apply(
+             self._ga_shape_target_single,
+             approx_flat_list,
+             inside_flag_flat_list,
+             square_flat_list,
+             batch_gt_instances,
+             batch_gt_instances_ignore,
+             batch_img_metas,
+             unmap_outputs=unmap_outputs)
+        # sampled anchors of all images
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        # split targets to a list w.r.t. multiple levels
+        bbox_anchors_list = images_to_levels(all_bbox_anchors,
+                                             num_level_squares)
+        bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares)
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_squares)
+        return (bbox_anchors_list, bbox_gts_list, bbox_weights_list,
+                avg_factor)
+
+    def loss_shape_single(self, shape_pred: Tensor, bbox_anchors: Tensor,
+                          bbox_gts: Tensor, anchor_weights: Tensor,
+                          avg_factor: int) -> Tensor:
+        """Compute shape loss in single level."""
+        shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2)
+        bbox_anchors = bbox_anchors.contiguous().view(-1, 4)
+        bbox_gts = bbox_gts.contiguous().view(-1, 4)
+        anchor_weights = anchor_weights.contiguous().view(-1, 4)
+        bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0)
+        bbox_deltas[:, 2:] += shape_pred
+        # filter out negative samples to speed-up weighted_bounded_iou_loss
+        inds = torch.nonzero(
+            anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1)
+        bbox_deltas_ = bbox_deltas[inds]
+        bbox_anchors_ = bbox_anchors[inds]
+        bbox_gts_ = bbox_gts[inds]
+        anchor_weights_ = anchor_weights[inds]
+        pred_anchors_ = self.anchor_coder.decode(
+            bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6)
+        loss_shape = self.loss_shape(
+            pred_anchors_, bbox_gts_, anchor_weights_, avg_factor=avg_factor)
+        return loss_shape
+
+    def loss_loc_single(self, loc_pred: Tensor, loc_target: Tensor,
+                        loc_weight: Tensor, avg_factor: float) -> Tensor:
+        """Compute location loss in single level."""
+        loss_loc = self.loss_loc(
+            loc_pred.reshape(-1, 1),
+            loc_target.reshape(-1).long(),
+            loc_weight.reshape(-1),
+            avg_factor=avg_factor)
+        return loss_loc
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            shape_preds: List[Tensor],
+            loc_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            shape_preds (list[Tensor]): shape predictions for each scale
+                level with shape (N, 1, H, W).
+            loc_preds (list[Tensor]): location predictions for each scale
+                level with shape (N, num_anchors * 2, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.approx_anchor_generator.num_levels
+
+        device = cls_scores[0].device
+
+        # get loc targets
+        loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets(
+            batch_gt_instances, featmap_sizes)
+
+        # get sampled approxes
+        approxs_list, inside_flag_list = self.get_sampled_approxs(
+            featmap_sizes, batch_img_metas, device=device)
+        # get squares and guided anchors
+        squares_list, guided_anchors_list, _ = self.get_anchors(
+            featmap_sizes,
+            shape_preds,
+            loc_preds,
+            batch_img_metas,
+            device=device)
+
+        # get shape targets
+        shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list,
+                                              squares_list, batch_gt_instances,
+                                              batch_img_metas)
+        (bbox_anchors_list, bbox_gts_list, anchor_weights_list,
+         ga_avg_factor) = shape_targets
+
+        # get anchor targets
+        cls_reg_targets = self.get_targets(
+            guided_anchors_list,
+            inside_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         avg_factor) = cls_reg_targets
+
+        # anchor number of multi levels
+        num_level_anchors = [
+            anchors.size(0) for anchors in guided_anchors_list[0]
+        ]
+        # concat all level anchors to a single tensor
+        concat_anchor_list = []
+        for i in range(len(guided_anchors_list)):
+            concat_anchor_list.append(torch.cat(guided_anchors_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+
+        # get classification and bbox regression losses
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_by_feat_single,
+            cls_scores,
+            bbox_preds,
+            all_anchor_list,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            avg_factor=avg_factor)
+
+        # get anchor location loss
+        losses_loc = []
+        for i in range(len(loc_preds)):
+            loss_loc = self.loss_loc_single(
+                loc_preds[i],
+                loc_targets[i],
+                loc_weights[i],
+                avg_factor=loc_avg_factor)
+            losses_loc.append(loss_loc)
+
+        # get anchor shape loss
+        losses_shape = []
+        for i in range(len(shape_preds)):
+            loss_shape = self.loss_shape_single(
+                shape_preds[i],
+                bbox_anchors_list[i],
+                bbox_gts_list[i],
+                anchor_weights_list[i],
+                avg_factor=ga_avg_factor)
+            losses_shape.append(loss_shape)
+
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox=losses_bbox,
+            loss_shape=losses_shape,
+            loss_loc=losses_loc)
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        shape_preds: List[Tensor],
+                        loc_preds: List[Tensor],
+                        batch_img_metas: List[dict],
+                        cfg: OptConfigType = None,
+                        rescale: bool = False) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            shape_preds (list[Tensor]): shape predictions for each scale
+                level with shape (N, 1, H, W).
+            loc_preds (list[Tensor]): location predictions for each scale
+                level with shape (N, num_anchors * 2, H, W).
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4), the last
+              dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len(
+            loc_preds)
+        num_levels = len(cls_scores)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        device = cls_scores[0].device
+        # get guided anchors
+        _, guided_anchors, loc_masks = self.get_anchors(
+            featmap_sizes,
+            shape_preds,
+            loc_preds,
+            batch_img_metas,
+            use_loc_filter=not self.training,
+            device=device)
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_score_list = [
+                cls_scores[i][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_pred_list = [
+                bbox_preds[i][img_id].detach() for i in range(num_levels)
+            ]
+            guided_anchor_list = [
+                guided_anchors[img_id][i].detach() for i in range(num_levels)
+            ]
+            loc_mask_list = [
+                loc_masks[img_id][i].detach() for i in range(num_levels)
+            ]
+            proposals = self._predict_by_feat_single(
+                cls_scores=cls_score_list,
+                bbox_preds=bbox_pred_list,
+                mlvl_anchors=guided_anchor_list,
+                mlvl_masks=loc_mask_list,
+                img_meta=batch_img_metas[img_id],
+                cfg=cfg,
+                rescale=rescale)
+            result_list.append(proposals)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_scores: List[Tensor],
+                                bbox_preds: List[Tensor],
+                                mlvl_anchors: List[Tensor],
+                                mlvl_masks: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigType,
+                                rescale: bool = False) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            mlvl_anchors (list[Tensor]): Each element in the list is
+                the anchors of a single level in feature pyramid. it has
+                shape (num_priors, 4).
+            mlvl_masks (list[Tensor]): Each element in the list is location
+                masks of a single level.
+            img_meta (dict): Image meta info.
+            cfg (:obj:`ConfigDict` or dict): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4), the last
+              dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
+        mlvl_bbox_preds = []
+        mlvl_valid_anchors = []
+        mlvl_scores = []
+        for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds,
+                                                       mlvl_anchors,
+                                                       mlvl_masks):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            # if no location is kept, end.
+            if mask.sum() == 0:
+                continue
+            # reshape scores and bbox_pred
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            # filter scores, bbox_pred w.r.t. mask.
+            # anchors are filtered in get_anchors() beforehand.
+            scores = scores[mask, :]
+            bbox_pred = bbox_pred[mask, :]
+            if scores.dim() == 0:
+                anchors = anchors.unsqueeze(0)
+                scores = scores.unsqueeze(0)
+                bbox_pred = bbox_pred.unsqueeze(0)
+            # filter anchors, bbox_pred, scores w.r.t. scores
+            nms_pre = cfg.get('nms_pre', -1)
+            if nms_pre > 0 and scores.shape[0] > nms_pre:
+                if self.use_sigmoid_cls:
+                    max_scores, _ = scores.max(dim=1)
+                else:
+                    # remind that we set FG labels to [0, num_class-1]
+                    # since mmdet v2.0
+                    # BG cat_id: num_class
+                    max_scores, _ = scores[:, :-1].max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                anchors = anchors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+
+            mlvl_bbox_preds.append(bbox_pred)
+            mlvl_valid_anchors.append(anchors)
+            mlvl_scores.append(scores)
+
+        mlvl_bbox_preds = torch.cat(mlvl_bbox_preds)
+        mlvl_anchors = torch.cat(mlvl_valid_anchors)
+        mlvl_scores = torch.cat(mlvl_scores)
+        mlvl_bboxes = self.bbox_coder.decode(
+            mlvl_anchors, mlvl_bbox_preds, max_shape=img_meta['img_shape'])
+
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            mlvl_bboxes /= mlvl_bboxes.new_tensor(
+                img_meta['scale_factor']).repeat((1, 2))
+
+        if self.use_sigmoid_cls:
+            # Add a dummy background class to the backend when using sigmoid
+            # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
+            # BG cat_id: num_class
+            padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+            mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
+        # multi class NMS
+        det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
+                                                cfg.score_thr, cfg.nms,
+                                                cfg.max_per_img)
+
+        results = InstanceData()
+        results.bboxes = det_bboxes[:, :-1]
+        results.scores = det_bboxes[:, -1]
+        results.labels = det_labels
+        return results
diff --git a/mmde/mmdet/models/dense_heads/lad_head.py b/mmde/mmdet/models/dense_heads/lad_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1218e1f88206704d4f414d151ccd34a189ac5d0
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/lad_head.py
@@ -0,0 +1,226 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import InstanceList, OptInstanceList
+from ..utils import levels_to_images, multi_apply, unpack_gt_instances
+from .paa_head import PAAHead
+
+
+@MODELS.register_module()
+class LADHead(PAAHead):
+    """Label Assignment Head from the paper: `Improving Object Detection by
+    Label Assignment Distillation <https://arxiv.org/pdf/2108.10520.pdf>`_"""
+
+    def get_label_assignment(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            iou_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> tuple:
+        """Get label assignment (from teacher).
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            iou_preds (list[Tensor]): iou_preds for each scale
+                level with shape (N, num_anchors * 1, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            tuple: Returns a tuple containing label assignment variables.
+
+            - labels (Tensor): Labels of all anchors, each with
+              shape (num_anchors,).
+            - labels_weight (Tensor): Label weights of all anchor.
+              each with shape (num_anchors,).
+            - bboxes_target (Tensor): BBox targets of all anchors.
+              each with shape (num_anchors, 4).
+            - bboxes_weight (Tensor): BBox weights of all anchors.
+              each with shape (num_anchors, 4).
+            - pos_inds_flatten (Tensor): Contains all index of positive
+              sample in all anchor.
+            - pos_anchors (Tensor): Positive anchors.
+            - num_pos (int): Number of positive anchors.
+        """
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+        )
+        (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds,
+         pos_gt_index) = cls_reg_targets
+        cls_scores = levels_to_images(cls_scores)
+        cls_scores = [
+            item.reshape(-1, self.cls_out_channels) for item in cls_scores
+        ]
+        bbox_preds = levels_to_images(bbox_preds)
+        bbox_preds = [item.reshape(-1, 4) for item in bbox_preds]
+        pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list,
+                                       cls_scores, bbox_preds, labels,
+                                       labels_weight, bboxes_target,
+                                       bboxes_weight, pos_inds)
+
+        with torch.no_grad():
+            reassign_labels, reassign_label_weight, \
+                reassign_bbox_weights, num_pos = multi_apply(
+                    self.paa_reassign,
+                    pos_losses_list,
+                    labels,
+                    labels_weight,
+                    bboxes_weight,
+                    pos_inds,
+                    pos_gt_index,
+                    anchor_list)
+            num_pos = sum(num_pos)
+        # convert all tensor list to a flatten tensor
+        labels = torch.cat(reassign_labels, 0).view(-1)
+        flatten_anchors = torch.cat(
+            [torch.cat(item, 0) for item in anchor_list])
+        labels_weight = torch.cat(reassign_label_weight, 0).view(-1)
+        bboxes_target = torch.cat(bboxes_target,
+                                  0).view(-1, bboxes_target[0].size(-1))
+
+        pos_inds_flatten = ((labels >= 0)
+                            &
+                            (labels < self.num_classes)).nonzero().reshape(-1)
+
+        if num_pos:
+            pos_anchors = flatten_anchors[pos_inds_flatten]
+        else:
+            pos_anchors = None
+
+        label_assignment_results = (labels, labels_weight, bboxes_target,
+                                    bboxes_weight, pos_inds_flatten,
+                                    pos_anchors, num_pos)
+        return label_assignment_results
+
+    def loss(self, x: List[Tensor], label_assignment_results: tuple,
+             batch_data_samples: SampleList) -> dict:
+        """Forward train with the available label assignment (student receives
+        from teacher).
+
+        Args:
+            x (list[Tensor]): Features from FPN.
+            label_assignment_results (tuple): As the outputs defined in the
+                function `self.get_label_assignment`.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            losses: (dict[str, Tensor]): A dictionary of loss components.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \
+            = outputs
+
+        outs = self(x)
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas)
+        losses = self.loss_by_feat(
+            *loss_inputs,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            label_assignment_results=label_assignment_results)
+        return losses
+
+    def loss_by_feat(self,
+                     cls_scores: List[Tensor],
+                     bbox_preds: List[Tensor],
+                     iou_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict],
+                     batch_gt_instances_ignore: OptInstanceList = None,
+                     label_assignment_results: Optional[tuple] = None) -> dict:
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            iou_preds (list[Tensor]): iou_preds for each scale
+                level with shape (N, num_anchors * 1, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            label_assignment_results (tuple, optional): As the outputs defined
+                in the function `self.get_
+                label_assignment`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss gmm_assignment.
+        """
+
+        (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds_flatten,
+         pos_anchors, num_pos) = label_assignment_results
+
+        cls_scores = levels_to_images(cls_scores)
+        cls_scores = [
+            item.reshape(-1, self.cls_out_channels) for item in cls_scores
+        ]
+        bbox_preds = levels_to_images(bbox_preds)
+        bbox_preds = [item.reshape(-1, 4) for item in bbox_preds]
+        iou_preds = levels_to_images(iou_preds)
+        iou_preds = [item.reshape(-1, 1) for item in iou_preds]
+
+        # convert all tensor list to a flatten tensor
+        cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1))
+        bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1))
+        iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1))
+
+        losses_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            labels_weight,
+            avg_factor=max(num_pos, len(batch_img_metas)))  # avoid num_pos=0
+        if num_pos:
+            pos_bbox_pred = self.bbox_coder.decode(
+                pos_anchors, bbox_preds[pos_inds_flatten])
+            pos_bbox_target = bboxes_target[pos_inds_flatten]
+            iou_target = bbox_overlaps(
+                pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True)
+            losses_iou = self.loss_centerness(
+                iou_preds[pos_inds_flatten],
+                iou_target.unsqueeze(-1),
+                avg_factor=num_pos)
+            losses_bbox = self.loss_bbox(
+                pos_bbox_pred, pos_bbox_target, avg_factor=num_pos)
+
+        else:
+            losses_iou = iou_preds.sum() * 0
+            losses_bbox = bbox_preds.sum() * 0
+
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou)
diff --git a/mmde/mmdet/models/dense_heads/ld_head.py b/mmde/mmdet/models/dense_heads/ld_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2558fac97ee26ff89c5fa1b386f5ce68c3ad384d
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/ld_head.py
@@ -0,0 +1,257 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean
+from ..utils import multi_apply, unpack_gt_instances
+from .gfl_head import GFLHead
+
+
+@MODELS.register_module()
+class LDHead(GFLHead):
+    """Localization distillation Head. (Short description)
+
+    It utilizes the learned bbox distributions to transfer the localization
+    dark knowledge from teacher to student. Original paper: `Localization
+    Distillation for Object Detection. <https://arxiv.org/abs/2102.12252>`_
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        loss_ld (:obj:`ConfigDict` or dict): Config of Localization
+            Distillation Loss (LD), T is the temperature for distillation.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 loss_ld: ConfigType = dict(
+                     type='LocalizationDistillationLoss',
+                     loss_weight=0.25,
+                     T=10),
+                 **kwargs) -> dict:
+
+        super().__init__(
+            num_classes=num_classes, in_channels=in_channels, **kwargs)
+        self.loss_ld = MODELS.build(loss_ld)
+
+    def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor,
+                            bbox_pred: Tensor, labels: Tensor,
+                            label_weights: Tensor, bbox_targets: Tensor,
+                            stride: Tuple[int], soft_targets: Tensor,
+                            avg_factor: int):
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            cls_score (Tensor): Cls and quality joint scores for each scale
+                level has shape (N, num_classes, H, W).
+            bbox_pred (Tensor): Box distribution logits for each scale
+                level with shape (N, 4*(n+1), H, W), n is max value of integral
+                set.
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors)
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (N, num_total_anchors, 4).
+            stride (tuple): Stride in this scale level.
+            soft_targets (Tensor): Soft BBox regression targets.
+            avg_factor (int): Average factor that is used to average
+                the loss. When using sampling method, avg_factor is usually
+                the sum of positive and negative priors. When using
+                `PseudoSampler`, `avg_factor` is usually equal to the number
+                of positive priors.
+
+        Returns:
+            dict[tuple, Tensor]: Loss components and weight targets.
+        """
+        assert stride[0] == stride[1], 'h stride is not equal to w stride!'
+        anchors = anchors.reshape(-1, 4)
+        cls_score = cls_score.permute(0, 2, 3,
+                                      1).reshape(-1, self.cls_out_channels)
+        bbox_pred = bbox_pred.permute(0, 2, 3,
+                                      1).reshape(-1, 4 * (self.reg_max + 1))
+        soft_targets = soft_targets.permute(0, 2, 3,
+                                            1).reshape(-1,
+                                                       4 * (self.reg_max + 1))
+
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero().squeeze(1)
+        score = label_weights.new_zeros(labels.shape)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+            pos_anchors = anchors[pos_inds]
+            pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0]
+
+            weight_targets = cls_score.detach().sigmoid()
+            weight_targets = weight_targets.max(dim=1)[0][pos_inds]
+            pos_bbox_pred_corners = self.integral(pos_bbox_pred)
+            pos_decode_bbox_pred = self.bbox_coder.decode(
+                pos_anchor_centers, pos_bbox_pred_corners)
+            pos_decode_bbox_targets = pos_bbox_targets / stride[0]
+            score[pos_inds] = bbox_overlaps(
+                pos_decode_bbox_pred.detach(),
+                pos_decode_bbox_targets,
+                is_aligned=True)
+            pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1)
+            pos_soft_targets = soft_targets[pos_inds]
+            soft_corners = pos_soft_targets.reshape(-1, self.reg_max + 1)
+
+            target_corners = self.bbox_coder.encode(pos_anchor_centers,
+                                                    pos_decode_bbox_targets,
+                                                    self.reg_max).reshape(-1)
+
+            # regression loss
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_decode_bbox_targets,
+                weight=weight_targets,
+                avg_factor=1.0)
+
+            # dfl loss
+            loss_dfl = self.loss_dfl(
+                pred_corners,
+                target_corners,
+                weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
+                avg_factor=4.0)
+
+            # ld loss
+            loss_ld = self.loss_ld(
+                pred_corners,
+                soft_corners,
+                weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
+                avg_factor=4.0)
+
+        else:
+            loss_ld = bbox_pred.sum() * 0
+            loss_bbox = bbox_pred.sum() * 0
+            loss_dfl = bbox_pred.sum() * 0
+            weight_targets = bbox_pred.new_tensor(0)
+
+        # cls (qfl) loss
+        loss_cls = self.loss_cls(
+            cls_score, (labels, score),
+            weight=label_weights,
+            avg_factor=avg_factor)
+
+        return loss_cls, loss_bbox, loss_dfl, loss_ld, weight_targets.sum()
+
+    def loss(self, x: List[Tensor], out_teacher: Tuple[Tensor],
+             batch_data_samples: SampleList) -> dict:
+        """
+        Args:
+            x (list[Tensor]): Features from FPN.
+            out_teacher (tuple[Tensor]): The output of teacher.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            tuple[dict, list]: The loss components and proposals of each image.
+
+            - losses (dict[str, Tensor]): A dictionary of loss components.
+            - proposal_list (list[Tensor]): Proposals of each image.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \
+            = outputs
+
+        outs = self(x)
+        soft_targets = out_teacher[1]
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
+                              soft_targets)
+        losses = self.loss_by_feat(
+            *loss_inputs, batch_gt_instances_ignore=batch_gt_instances_ignore)
+
+        return losses
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            soft_targets: List[Tensor],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Cls and quality scores for each scale
+                level has shape (N, num_classes, H, W).
+            bbox_preds (list[Tensor]): Box distribution logits for each scale
+                level with shape (N, 4*(n+1), H, W), n is max value of integral
+                set.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            soft_targets (list[Tensor]): Soft BBox regression targets.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, avg_factor) = cls_reg_targets
+
+        avg_factor = reduce_mean(
+            torch.tensor(avg_factor, dtype=torch.float, device=device)).item()
+
+        losses_cls, losses_bbox, losses_dfl, losses_ld, \
+            avg_factor = multi_apply(
+                self.loss_by_feat_single,
+                anchor_list,
+                cls_scores,
+                bbox_preds,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                self.prior_generator.strides,
+                soft_targets,
+                avg_factor=avg_factor)
+
+        avg_factor = sum(avg_factor) + 1e-6
+        avg_factor = reduce_mean(avg_factor).item()
+        losses_bbox = [x / avg_factor for x in losses_bbox]
+        losses_dfl = [x / avg_factor for x in losses_dfl]
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox=losses_bbox,
+            loss_dfl=losses_dfl,
+            loss_ld=losses_ld)
diff --git a/mmde/mmdet/models/dense_heads/mask2former_head.py b/mmde/mmdet/models/dense_heads/mask2former_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d47c655255f92819646b8ea304b9736ec30660
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/mask2former_head.py
@@ -0,0 +1,459 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d
+from mmcv.ops import point_sample
+from mmengine.model import ModuleList, caffe2_xavier_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig, reduce_mean
+from ..layers import Mask2FormerTransformerDecoder, SinePositionalEncoding
+from ..utils import get_uncertain_point_coords_with_randomness
+from .anchor_free_head import AnchorFreeHead
+from .maskformer_head import MaskFormerHead
+
+
+@MODELS.register_module()
+class Mask2FormerHead(MaskFormerHead):
+    """Implements the Mask2Former head.
+
+    See `Masked-attention Mask Transformer for Universal Image
+    Segmentation <https://arxiv.org/pdf/2112.01527>`_ for details.
+
+    Args:
+        in_channels (list[int]): Number of channels in the input feature map.
+        feat_channels (int): Number of channels for features.
+        out_channels (int): Number of channels for output.
+        num_things_classes (int): Number of things.
+        num_stuff_classes (int): Number of stuff.
+        num_queries (int): Number of query in Transformer decoder.
+        pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel
+            decoder. Defaults to None.
+        enforce_decoder_input_project (bool, optional): Whether to add
+            a layer to change the embed_dim of tranformer encoder in
+            pixel decoder to the embed_dim of transformer decoder.
+            Defaults to False.
+        transformer_decoder (:obj:`ConfigDict` or dict): Config for
+            transformer decoder. Defaults to None.
+        positional_encoding (:obj:`ConfigDict` or dict): Config for
+            transformer decoder position encoding. Defaults to
+            dict(num_feats=128, normalize=True).
+        loss_cls (:obj:`ConfigDict` or dict): Config of the classification
+            loss. Defaults to None.
+        loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss.
+            Defaults to None.
+        loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss.
+            Defaults to None.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
+            Mask2Former head.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            Mask2Former head.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: List[int],
+                 feat_channels: int,
+                 out_channels: int,
+                 num_things_classes: int = 80,
+                 num_stuff_classes: int = 53,
+                 num_queries: int = 100,
+                 num_transformer_feat_level: int = 3,
+                 pixel_decoder: ConfigType = ...,
+                 enforce_decoder_input_project: bool = False,
+                 transformer_decoder: ConfigType = ...,
+                 positional_encoding: ConfigType = dict(
+                     num_feats=128, normalize=True),
+                 loss_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=2.0,
+                     reduction='mean',
+                     class_weight=[1.0] * 133 + [0.1]),
+                 loss_mask: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     reduction='mean',
+                     loss_weight=5.0),
+                 loss_dice: ConfigType = dict(
+                     type='DiceLoss',
+                     use_sigmoid=True,
+                     activate=True,
+                     reduction='mean',
+                     naive_dice=True,
+                     eps=1.0,
+                     loss_weight=5.0),
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        super(AnchorFreeHead, self).__init__(init_cfg=init_cfg)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = self.num_things_classes + self.num_stuff_classes
+        self.num_queries = num_queries
+        self.num_transformer_feat_level = num_transformer_feat_level
+        self.num_heads = transformer_decoder.layer_cfg.cross_attn_cfg.num_heads
+        self.num_transformer_decoder_layers = transformer_decoder.num_layers
+        assert pixel_decoder.encoder.layer_cfg. \
+            self_attn_cfg.num_levels == num_transformer_feat_level
+        pixel_decoder_ = copy.deepcopy(pixel_decoder)
+        pixel_decoder_.update(
+            in_channels=in_channels,
+            feat_channels=feat_channels,
+            out_channels=out_channels)
+        self.pixel_decoder = MODELS.build(pixel_decoder_)
+        self.transformer_decoder = Mask2FormerTransformerDecoder(
+            **transformer_decoder)
+        self.decoder_embed_dims = self.transformer_decoder.embed_dims
+
+        self.decoder_input_projs = ModuleList()
+        # from low resolution to high resolution
+        for _ in range(num_transformer_feat_level):
+            if (self.decoder_embed_dims != feat_channels
+                    or enforce_decoder_input_project):
+                self.decoder_input_projs.append(
+                    Conv2d(
+                        feat_channels, self.decoder_embed_dims, kernel_size=1))
+            else:
+                self.decoder_input_projs.append(nn.Identity())
+        self.decoder_positional_encoding = SinePositionalEncoding(
+            **positional_encoding)
+        self.query_embed = nn.Embedding(self.num_queries, feat_channels)
+        self.query_feat = nn.Embedding(self.num_queries, feat_channels)
+        # from low resolution to high resolution
+        self.level_embed = nn.Embedding(self.num_transformer_feat_level,
+                                        feat_channels)
+
+        self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
+        self.mask_embed = nn.Sequential(
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, out_channels))
+
+        self.test_cfg = test_cfg
+        self.train_cfg = train_cfg
+        if train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            self.sampler = TASK_UTILS.build(
+                self.train_cfg['sampler'], default_args=dict(context=self))
+            self.num_points = self.train_cfg.get('num_points', 12544)
+            self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0)
+            self.importance_sample_ratio = self.train_cfg.get(
+                'importance_sample_ratio', 0.75)
+
+        self.class_weight = loss_cls.class_weight
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_mask = MODELS.build(loss_mask)
+        self.loss_dice = MODELS.build(loss_dice)
+
+    def init_weights(self) -> None:
+        for m in self.decoder_input_projs:
+            if isinstance(m, Conv2d):
+                caffe2_xavier_init(m, bias=0)
+
+        self.pixel_decoder.init_weights()
+
+        for p in self.transformer_decoder.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_normal_(p)
+
+    def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict) -> Tuple[Tensor]:
+        """Compute classification and mask targets for one image.
+
+        Args:
+            cls_score (Tensor): Mask score logits from a single decoder layer
+                for one image. Shape (num_queries, cls_out_channels).
+            mask_pred (Tensor): Mask logits for a single decoder layer for one
+                image. Shape (num_queries, h, w).
+            gt_instances (:obj:`InstanceData`): It contains ``labels`` and
+                ``masks``.
+            img_meta (dict): Image informtation.
+
+        Returns:
+            tuple[Tensor]: A tuple containing the following for one image.
+
+                - labels (Tensor): Labels of each image. \
+                    shape (num_queries, ).
+                - label_weights (Tensor): Label weights of each image. \
+                    shape (num_queries, ).
+                - mask_targets (Tensor): Mask targets of each image. \
+                    shape (num_queries, h, w).
+                - mask_weights (Tensor): Mask weights of each image. \
+                    shape (num_queries, ).
+                - pos_inds (Tensor): Sampled positive indices for each \
+                    image.
+                - neg_inds (Tensor): Sampled negative indices for each \
+                    image.
+                - sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        gt_labels = gt_instances.labels
+        gt_masks = gt_instances.masks
+        # sample points
+        num_queries = cls_score.shape[0]
+        num_gts = gt_labels.shape[0]
+
+        point_coords = torch.rand((1, self.num_points, 2),
+                                  device=cls_score.device)
+        # shape (num_queries, num_points)
+        mask_points_pred = point_sample(
+            mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1,
+                                                        1)).squeeze(1)
+        # shape (num_gts, num_points)
+        gt_points_masks = point_sample(
+            gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1,
+                                                               1)).squeeze(1)
+
+        sampled_gt_instances = InstanceData(
+            labels=gt_labels, masks=gt_points_masks)
+        sampled_pred_instances = InstanceData(
+            scores=cls_score, masks=mask_points_pred)
+        # assign and sample
+        assign_result = self.assigner.assign(
+            pred_instances=sampled_pred_instances,
+            gt_instances=sampled_gt_instances,
+            img_meta=img_meta)
+        pred_instances = InstanceData(scores=cls_score, masks=mask_pred)
+        sampling_result = self.sampler.sample(
+            assign_result=assign_result,
+            pred_instances=pred_instances,
+            gt_instances=gt_instances)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label target
+        labels = gt_labels.new_full((self.num_queries, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_labels.new_ones((self.num_queries, ))
+
+        # mask target
+        mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds]
+        mask_weights = mask_pred.new_zeros((self.num_queries, ))
+        mask_weights[pos_inds] = 1.0
+
+        return (labels, label_weights, mask_targets, mask_weights, pos_inds,
+                neg_inds, sampling_result)
+
+    def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor,
+                             batch_gt_instances: List[InstanceData],
+                             batch_img_metas: List[dict]) -> Tuple[Tensor]:
+        """Loss function for outputs from a single decoder layer.
+
+        Args:
+            cls_scores (Tensor): Mask score logits from a single decoder layer
+                for all images. Shape (batch_size, num_queries,
+                cls_out_channels). Note `cls_out_channels` should includes
+                background.
+            mask_preds (Tensor): Mask logits for a pixel decoder for all
+                images. Shape (batch_size, num_queries, h, w).
+            batch_gt_instances (list[obj:`InstanceData`]): each contains
+                ``labels`` and ``masks``.
+            batch_img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[Tensor]: Loss components for outputs from a single \
+                decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         avg_factor) = self.get_targets(cls_scores_list, mask_preds_list,
+                                        batch_gt_instances, batch_img_metas)
+        # shape (batch_size, num_queries)
+        labels = torch.stack(labels_list, dim=0)
+        # shape (batch_size, num_queries)
+        label_weights = torch.stack(label_weights_list, dim=0)
+        # shape (num_total_gts, h, w)
+        mask_targets = torch.cat(mask_targets_list, dim=0)
+        # shape (batch_size, num_queries)
+        mask_weights = torch.stack(mask_weights_list, dim=0)
+
+        # classfication loss
+        # shape (batch_size * num_queries, )
+        cls_scores = cls_scores.flatten(0, 1)
+        labels = labels.flatten(0, 1)
+        label_weights = label_weights.flatten(0, 1)
+
+        class_weight = cls_scores.new_tensor(self.class_weight)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            label_weights,
+            avg_factor=class_weight[labels].sum())
+
+        num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor]))
+        num_total_masks = max(num_total_masks, 1)
+
+        # extract positive ones
+        # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
+        mask_preds = mask_preds[mask_weights > 0]
+
+        if mask_targets.shape[0] == 0:
+            # zero match
+            loss_dice = mask_preds.sum()
+            loss_mask = mask_preds.sum()
+            return loss_cls, loss_mask, loss_dice
+
+        with torch.no_grad():
+            points_coords = get_uncertain_point_coords_with_randomness(
+                mask_preds.unsqueeze(1), None, self.num_points,
+                self.oversample_ratio, self.importance_sample_ratio)
+            # shape (num_total_gts, h, w) -> (num_total_gts, num_points)
+            mask_point_targets = point_sample(
+                mask_targets.unsqueeze(1).float(), points_coords).squeeze(1)
+        # shape (num_queries, h, w) -> (num_queries, num_points)
+        mask_point_preds = point_sample(
+            mask_preds.unsqueeze(1), points_coords).squeeze(1)
+
+        # dice loss
+        loss_dice = self.loss_dice(
+            mask_point_preds, mask_point_targets, avg_factor=num_total_masks)
+
+        # mask loss
+        # shape (num_queries, num_points) -> (num_queries * num_points, )
+        mask_point_preds = mask_point_preds.reshape(-1)
+        # shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
+        mask_point_targets = mask_point_targets.reshape(-1)
+        loss_mask = self.loss_mask(
+            mask_point_preds,
+            mask_point_targets,
+            avg_factor=num_total_masks * self.num_points)
+
+        return loss_cls, loss_mask, loss_dice
+
+    def _forward_head(self, decoder_out: Tensor, mask_feature: Tensor,
+                      attn_mask_target_size: Tuple[int, int]) -> Tuple[Tensor]:
+        """Forward for head part which is called after every decoder layer.
+
+        Args:
+            decoder_out (Tensor): in shape (batch_size, num_queries, c).
+            mask_feature (Tensor): in shape (batch_size, c, h, w).
+            attn_mask_target_size (tuple[int, int]): target attention
+                mask size.
+
+        Returns:
+            tuple: A tuple contain three elements.
+
+                - cls_pred (Tensor): Classification scores in shape \
+                    (batch_size, num_queries, cls_out_channels). \
+                    Note `cls_out_channels` should includes background.
+                - mask_pred (Tensor): Mask scores in shape \
+                    (batch_size, num_queries,h, w).
+                - attn_mask (Tensor): Attention mask in shape \
+                    (batch_size * num_heads, num_queries, h, w).
+        """
+        decoder_out = self.transformer_decoder.post_norm(decoder_out)
+        # shape (num_queries, batch_size, c)
+        cls_pred = self.cls_embed(decoder_out)
+        # shape (num_queries, batch_size, c)
+        mask_embed = self.mask_embed(decoder_out)
+        # shape (num_queries, batch_size, h, w)
+        mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature)
+        attn_mask = F.interpolate(
+            mask_pred,
+            attn_mask_target_size,
+            mode='bilinear',
+            align_corners=False)
+        # shape (num_queries, batch_size, h, w) ->
+        #   (batch_size * num_head, num_queries, h, w)
+        attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat(
+            (1, self.num_heads, 1, 1)).flatten(0, 1)
+        attn_mask = attn_mask.sigmoid() < 0.5
+        attn_mask = attn_mask.detach()
+
+        return cls_pred, mask_pred, attn_mask
+
+    def forward(self, x: List[Tensor],
+                batch_data_samples: SampleList) -> Tuple[List[Tensor]]:
+        """Forward function.
+
+        Args:
+            x (list[Tensor]): Multi scale Features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            tuple[list[Tensor]]: A tuple contains two elements.
+
+                - cls_pred_list (list[Tensor)]: Classification logits \
+                    for each decoder layer. Each is a 3D-tensor with shape \
+                    (batch_size, num_queries, cls_out_channels). \
+                    Note `cls_out_channels` should includes background.
+                - mask_pred_list (list[Tensor]): Mask logits for each \
+                    decoder layer. Each with shape (batch_size, num_queries, \
+                    h, w).
+        """
+        batch_size = x[0].shape[0]
+        mask_features, multi_scale_memorys = self.pixel_decoder(x)
+        # multi_scale_memorys (from low resolution to high resolution)
+        decoder_inputs = []
+        decoder_positional_encodings = []
+        for i in range(self.num_transformer_feat_level):
+            decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i])
+            # shape (batch_size, c, h, w) -> (batch_size, h*w, c)
+            decoder_input = decoder_input.flatten(2).permute(0, 2, 1)
+            level_embed = self.level_embed.weight[i].view(1, 1, -1)
+            decoder_input = decoder_input + level_embed
+            # shape (batch_size, c, h, w) -> (batch_size, h*w, c)
+            mask = decoder_input.new_zeros(
+                (batch_size, ) + multi_scale_memorys[i].shape[-2:],
+                dtype=torch.bool)
+            decoder_positional_encoding = self.decoder_positional_encoding(
+                mask)
+            decoder_positional_encoding = decoder_positional_encoding.flatten(
+                2).permute(0, 2, 1)
+            decoder_inputs.append(decoder_input)
+            decoder_positional_encodings.append(decoder_positional_encoding)
+        # shape (num_queries, c) -> (batch_size, num_queries, c)
+        query_feat = self.query_feat.weight.unsqueeze(0).repeat(
+            (batch_size, 1, 1))
+        query_embed = self.query_embed.weight.unsqueeze(0).repeat(
+            (batch_size, 1, 1))
+
+        cls_pred_list = []
+        mask_pred_list = []
+        cls_pred, mask_pred, attn_mask = self._forward_head(
+            query_feat, mask_features, multi_scale_memorys[0].shape[-2:])
+        cls_pred_list.append(cls_pred)
+        mask_pred_list.append(mask_pred)
+
+        for i in range(self.num_transformer_decoder_layers):
+            level_idx = i % self.num_transformer_feat_level
+            # if a mask is all True(all background), then set it all False.
+            mask_sum = (attn_mask.sum(-1) != attn_mask.shape[-1]).unsqueeze(-1)
+            attn_mask = attn_mask & mask_sum
+            # cross_attn + self_attn
+            layer = self.transformer_decoder.layers[i]
+            query_feat = layer(
+                query=query_feat,
+                key=decoder_inputs[level_idx],
+                value=decoder_inputs[level_idx],
+                query_pos=query_embed,
+                key_pos=decoder_positional_encodings[level_idx],
+                cross_attn_mask=attn_mask,
+                query_key_padding_mask=None,
+                # here we do not apply masking on padded region
+                key_padding_mask=None)
+            cls_pred, mask_pred, attn_mask = self._forward_head(
+                query_feat, mask_features, multi_scale_memorys[
+                    (i + 1) % self.num_transformer_feat_level].shape[-2:])
+
+            cls_pred_list.append(cls_pred)
+            mask_pred_list.append(mask_pred)
+
+        return cls_pred_list, mask_pred_list
diff --git a/mmde/mmdet/models/dense_heads/maskformer_head.py b/mmde/mmdet/models/dense_heads/maskformer_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..24c0655ee1c36e0110cf6578d1c095c50a297d81
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/maskformer_head.py
@@ -0,0 +1,601 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d
+from mmengine.model import caffe2_xavier_init
+from mmengine.structures import InstanceData, PixelData
+from torch import Tensor
+
+from mmdet.models.layers.pixel_decoder import PixelDecoder
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import SampleList
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptMultiConfig, reduce_mean)
+from ..layers import DetrTransformerDecoder, SinePositionalEncoding
+from ..utils import multi_apply, preprocess_panoptic_gt
+from .anchor_free_head import AnchorFreeHead
+
+
+@MODELS.register_module()
+class MaskFormerHead(AnchorFreeHead):
+    """Implements the MaskFormer head.
+
+    See `Per-Pixel Classification is Not All You Need for Semantic
+    Segmentation <https://arxiv.org/pdf/2107.06278>`_ for details.
+
+    Args:
+        in_channels (list[int]): Number of channels in the input feature map.
+        feat_channels (int): Number of channels for feature.
+        out_channels (int): Number of channels for output.
+        num_things_classes (int): Number of things.
+        num_stuff_classes (int): Number of stuff.
+        num_queries (int): Number of query in Transformer.
+        pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel
+            decoder.
+        enforce_decoder_input_project (bool): Whether to add a layer
+            to change the embed_dim of transformer encoder in pixel decoder to
+            the embed_dim of transformer decoder. Defaults to False.
+        transformer_decoder (:obj:`ConfigDict` or dict): Config for
+            transformer decoder.
+        positional_encoding (:obj:`ConfigDict` or dict): Config for
+            transformer decoder position encoding.
+        loss_cls (:obj:`ConfigDict` or dict): Config of the classification
+            loss. Defaults to `CrossEntropyLoss`.
+        loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss.
+            Defaults to `FocalLoss`.
+        loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss.
+            Defaults to `DiceLoss`.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
+            MaskFormer head.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            MaskFormer head.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: List[int],
+                 feat_channels: int,
+                 out_channels: int,
+                 num_things_classes: int = 80,
+                 num_stuff_classes: int = 53,
+                 num_queries: int = 100,
+                 pixel_decoder: ConfigType = ...,
+                 enforce_decoder_input_project: bool = False,
+                 transformer_decoder: ConfigType = ...,
+                 positional_encoding: ConfigType = dict(
+                     num_feats=128, normalize=True),
+                 loss_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0,
+                     class_weight=[1.0] * 133 + [0.1]),
+                 loss_mask: ConfigType = dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=20.0),
+                 loss_dice: ConfigType = dict(
+                     type='DiceLoss',
+                     use_sigmoid=True,
+                     activate=True,
+                     naive_dice=True,
+                     loss_weight=1.0),
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        super(AnchorFreeHead, self).__init__(init_cfg=init_cfg)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = self.num_things_classes + self.num_stuff_classes
+        self.num_queries = num_queries
+
+        pixel_decoder.update(
+            in_channels=in_channels,
+            feat_channels=feat_channels,
+            out_channels=out_channels)
+        self.pixel_decoder = MODELS.build(pixel_decoder)
+        self.transformer_decoder = DetrTransformerDecoder(
+            **transformer_decoder)
+        self.decoder_embed_dims = self.transformer_decoder.embed_dims
+        if type(self.pixel_decoder) == PixelDecoder and (
+                self.decoder_embed_dims != in_channels[-1]
+                or enforce_decoder_input_project):
+            self.decoder_input_proj = Conv2d(
+                in_channels[-1], self.decoder_embed_dims, kernel_size=1)
+        else:
+            self.decoder_input_proj = nn.Identity()
+        self.decoder_pe = SinePositionalEncoding(**positional_encoding)
+        self.query_embed = nn.Embedding(self.num_queries, out_channels)
+
+        self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
+        self.mask_embed = nn.Sequential(
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, out_channels))
+
+        self.test_cfg = test_cfg
+        self.train_cfg = train_cfg
+        if train_cfg:
+            self.assigner = TASK_UTILS.build(train_cfg['assigner'])
+            self.sampler = TASK_UTILS.build(
+                train_cfg['sampler'], default_args=dict(context=self))
+
+        self.class_weight = loss_cls.class_weight
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_mask = MODELS.build(loss_mask)
+        self.loss_dice = MODELS.build(loss_dice)
+
+    def init_weights(self) -> None:
+        if isinstance(self.decoder_input_proj, Conv2d):
+            caffe2_xavier_init(self.decoder_input_proj, bias=0)
+
+        self.pixel_decoder.init_weights()
+
+        for p in self.transformer_decoder.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+
+    def preprocess_gt(
+            self, batch_gt_instances: InstanceList,
+            batch_gt_semantic_segs: List[Optional[PixelData]]) -> InstanceList:
+        """Preprocess the ground truth for all images.
+
+        Args:
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``labels``, each is
+                ground truth labels of each bbox, with shape (num_gts, )
+                and ``masks``, each is ground truth masks of each instances
+                of a image, shape (num_gts, h, w).
+            gt_semantic_seg (list[Optional[PixelData]]): Ground truth of
+                semantic segmentation, each with the shape (1, h, w).
+                [0, num_thing_class - 1] means things,
+                [num_thing_class, num_class-1] means stuff,
+                255 means VOID. It's None when training instance segmentation.
+
+        Returns:
+            list[obj:`InstanceData`]: each contains the following keys
+
+                - labels (Tensor): Ground truth class indices\
+                    for a image, with shape (n, ), n is the sum of\
+                    number of stuff type and number of instance in a image.
+                - masks (Tensor): Ground truth mask for a\
+                    image, with shape (n, h, w).
+        """
+        num_things_list = [self.num_things_classes] * len(batch_gt_instances)
+        num_stuff_list = [self.num_stuff_classes] * len(batch_gt_instances)
+        gt_labels_list = [
+            gt_instances['labels'] for gt_instances in batch_gt_instances
+        ]
+        gt_masks_list = [
+            gt_instances['masks'] for gt_instances in batch_gt_instances
+        ]
+        gt_semantic_segs = [
+            None if gt_semantic_seg is None else gt_semantic_seg.sem_seg
+            for gt_semantic_seg in batch_gt_semantic_segs
+        ]
+        targets = multi_apply(preprocess_panoptic_gt, gt_labels_list,
+                              gt_masks_list, gt_semantic_segs, num_things_list,
+                              num_stuff_list)
+        labels, masks = targets
+        batch_gt_instances = [
+            InstanceData(labels=label, masks=mask)
+            for label, mask in zip(labels, masks)
+        ]
+        return batch_gt_instances
+
+    def get_targets(
+        self,
+        cls_scores_list: List[Tensor],
+        mask_preds_list: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        return_sampling_results: bool = False
+    ) -> Tuple[List[Union[Tensor, int]]]:
+        """Compute classification and mask targets for all images for a decoder
+        layer.
+
+        Args:
+            cls_scores_list (list[Tensor]): Mask score logits from a single
+                decoder layer for all images. Each with shape (num_queries,
+                cls_out_channels).
+            mask_preds_list (list[Tensor]): Mask logits from a single decoder
+                layer for all images. Each with shape (num_queries, h, w).
+            batch_gt_instances (list[obj:`InstanceData`]): each contains
+                ``labels`` and ``masks``.
+            batch_img_metas (list[dict]): List of image meta information.
+            return_sampling_results (bool): Whether to return the sampling
+                results. Defaults to False.
+
+        Returns:
+            tuple: a tuple containing the following targets.
+
+                - labels_list (list[Tensor]): Labels of all images.\
+                    Each with shape (num_queries, ).
+                - label_weights_list (list[Tensor]): Label weights\
+                    of all images. Each with shape (num_queries, ).
+                - mask_targets_list (list[Tensor]): Mask targets of\
+                    all images. Each with shape (num_queries, h, w).
+                - mask_weights_list (list[Tensor]): Mask weights of\
+                    all images. Each with shape (num_queries, ).
+                - avg_factor (int): Average factor that is used to average\
+                    the loss. When using sampling method, avg_factor is
+                    usually the sum of positive and negative priors. When
+                    using `MaskPseudoSampler`, `avg_factor` is usually equal
+                    to the number of positive priors.
+
+            additional_returns: This function enables user-defined returns from
+                `self._get_targets_single`. These returns are currently refined
+                to properties at each feature map (i.e. having HxW dimension).
+                The results will be concatenated after the end.
+        """
+        results = multi_apply(self._get_targets_single, cls_scores_list,
+                              mask_preds_list, batch_gt_instances,
+                              batch_img_metas)
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         pos_inds_list, neg_inds_list, sampling_results_list) = results[:7]
+        rest_results = list(results[7:])
+
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+
+        res = (labels_list, label_weights_list, mask_targets_list,
+               mask_weights_list, avg_factor)
+        if return_sampling_results:
+            res = res + (sampling_results_list)
+
+        return res + tuple(rest_results)
+
+    def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict) -> Tuple[Tensor]:
+        """Compute classification and mask targets for one image.
+
+        Args:
+            cls_score (Tensor): Mask score logits from a single decoder layer
+                for one image. Shape (num_queries, cls_out_channels).
+            mask_pred (Tensor): Mask logits for a single decoder layer for one
+                image. Shape (num_queries, h, w).
+            gt_instances (:obj:`InstanceData`): It contains ``labels`` and
+                ``masks``.
+            img_meta (dict): Image informtation.
+
+        Returns:
+            tuple: a tuple containing the following for one image.
+
+                - labels (Tensor): Labels of each image.
+                    shape (num_queries, ).
+                - label_weights (Tensor): Label weights of each image.
+                    shape (num_queries, ).
+                - mask_targets (Tensor): Mask targets of each image.
+                    shape (num_queries, h, w).
+                - mask_weights (Tensor): Mask weights of each image.
+                    shape (num_queries, ).
+                - pos_inds (Tensor): Sampled positive indices for each image.
+                - neg_inds (Tensor): Sampled negative indices for each image.
+                - sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        gt_masks = gt_instances.masks
+        gt_labels = gt_instances.labels
+
+        target_shape = mask_pred.shape[-2:]
+        if gt_masks.shape[0] > 0:
+            gt_masks_downsampled = F.interpolate(
+                gt_masks.unsqueeze(1).float(), target_shape,
+                mode='nearest').squeeze(1).long()
+        else:
+            gt_masks_downsampled = gt_masks
+
+        pred_instances = InstanceData(scores=cls_score, masks=mask_pred)
+        downsampled_gt_instances = InstanceData(
+            labels=gt_labels, masks=gt_masks_downsampled)
+        # assign and sample
+        assign_result = self.assigner.assign(
+            pred_instances=pred_instances,
+            gt_instances=downsampled_gt_instances,
+            img_meta=img_meta)
+        sampling_result = self.sampler.sample(
+            assign_result=assign_result,
+            pred_instances=pred_instances,
+            gt_instances=gt_instances)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label target
+        labels = gt_labels.new_full((self.num_queries, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_labels.new_ones(self.num_queries)
+
+        # mask target
+        mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds]
+        mask_weights = mask_pred.new_zeros((self.num_queries, ))
+        mask_weights[pos_inds] = 1.0
+
+        return (labels, label_weights, mask_targets, mask_weights, pos_inds,
+                neg_inds, sampling_result)
+
+    def loss_by_feat(self, all_cls_scores: Tensor, all_mask_preds: Tensor,
+                     batch_gt_instances: List[InstanceData],
+                     batch_img_metas: List[dict]) -> Dict[str, Tensor]:
+        """Loss function.
+
+        Args:
+            all_cls_scores (Tensor): Classification scores for all decoder
+                layers with shape (num_decoder, batch_size, num_queries,
+                cls_out_channels). Note `cls_out_channels` should includes
+                background.
+            all_mask_preds (Tensor): Mask scores for all decoder layers with
+                shape (num_decoder, batch_size, num_queries, h, w).
+            batch_gt_instances (list[obj:`InstanceData`]): each contains
+                ``labels`` and ``masks``.
+            batch_img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_dec_layers = len(all_cls_scores)
+        batch_gt_instances_list = [
+            batch_gt_instances for _ in range(num_dec_layers)
+        ]
+        img_metas_list = [batch_img_metas for _ in range(num_dec_layers)]
+        losses_cls, losses_mask, losses_dice = multi_apply(
+            self._loss_by_feat_single, all_cls_scores, all_mask_preds,
+            batch_gt_instances_list, img_metas_list)
+
+        loss_dict = dict()
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_mask'] = losses_mask[-1]
+        loss_dict['loss_dice'] = losses_dice[-1]
+        # loss from other decoder layers
+        num_dec_layer = 0
+        for loss_cls_i, loss_mask_i, loss_dice_i in zip(
+                losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_mask'] = loss_mask_i
+            loss_dict[f'd{num_dec_layer}.loss_dice'] = loss_dice_i
+            num_dec_layer += 1
+        return loss_dict
+
+    def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor,
+                             batch_gt_instances: List[InstanceData],
+                             batch_img_metas: List[dict]) -> Tuple[Tensor]:
+        """Loss function for outputs from a single decoder layer.
+
+        Args:
+            cls_scores (Tensor): Mask score logits from a single decoder layer
+                for all images. Shape (batch_size, num_queries,
+                cls_out_channels). Note `cls_out_channels` should includes
+                background.
+            mask_preds (Tensor): Mask logits for a pixel decoder for all
+                images. Shape (batch_size, num_queries, h, w).
+            batch_gt_instances (list[obj:`InstanceData`]): each contains
+                ``labels`` and ``masks``.
+            batch_img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[Tensor]: Loss components for outputs from a single decoder\
+                layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
+
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         avg_factor) = self.get_targets(cls_scores_list, mask_preds_list,
+                                        batch_gt_instances, batch_img_metas)
+        # shape (batch_size, num_queries)
+        labels = torch.stack(labels_list, dim=0)
+        # shape (batch_size, num_queries)
+        label_weights = torch.stack(label_weights_list, dim=0)
+        # shape (num_total_gts, h, w)
+        mask_targets = torch.cat(mask_targets_list, dim=0)
+        # shape (batch_size, num_queries)
+        mask_weights = torch.stack(mask_weights_list, dim=0)
+
+        # classfication loss
+        # shape (batch_size * num_queries, )
+        cls_scores = cls_scores.flatten(0, 1)
+        labels = labels.flatten(0, 1)
+        label_weights = label_weights.flatten(0, 1)
+
+        class_weight = cls_scores.new_tensor(self.class_weight)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            label_weights,
+            avg_factor=class_weight[labels].sum())
+
+        num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor]))
+        num_total_masks = max(num_total_masks, 1)
+
+        # extract positive ones
+        # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
+        mask_preds = mask_preds[mask_weights > 0]
+        target_shape = mask_targets.shape[-2:]
+
+        if mask_targets.shape[0] == 0:
+            # zero match
+            loss_dice = mask_preds.sum()
+            loss_mask = mask_preds.sum()
+            return loss_cls, loss_mask, loss_dice
+
+        # upsample to shape of target
+        # shape (num_total_gts, h, w)
+        mask_preds = F.interpolate(
+            mask_preds.unsqueeze(1),
+            target_shape,
+            mode='bilinear',
+            align_corners=False).squeeze(1)
+
+        # dice loss
+        loss_dice = self.loss_dice(
+            mask_preds, mask_targets, avg_factor=num_total_masks)
+
+        # mask loss
+        # FocalLoss support input of shape (n, num_class)
+        h, w = mask_preds.shape[-2:]
+        # shape (num_total_gts, h, w) -> (num_total_gts * h * w, 1)
+        mask_preds = mask_preds.reshape(-1, 1)
+        # shape (num_total_gts, h, w) -> (num_total_gts * h * w)
+        mask_targets = mask_targets.reshape(-1)
+        # target is (1 - mask_targets) !!!
+        loss_mask = self.loss_mask(
+            mask_preds, 1 - mask_targets, avg_factor=num_total_masks * h * w)
+
+        return loss_cls, loss_mask, loss_dice
+
+    def forward(self, x: Tuple[Tensor],
+                batch_data_samples: SampleList) -> Tuple[Tensor]:
+        """Forward function.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each
+                is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            tuple[Tensor]: a tuple contains two elements.
+
+                - all_cls_scores (Tensor): Classification scores for each\
+                    scale level. Each is a 4D-tensor with shape\
+                    (num_decoder, batch_size, num_queries, cls_out_channels).\
+                    Note `cls_out_channels` should includes background.
+                - all_mask_preds (Tensor): Mask scores for each decoder\
+                    layer. Each with shape (num_decoder, batch_size,\
+                    num_queries, h, w).
+        """
+        batch_img_metas = [
+            data_sample.metainfo for data_sample in batch_data_samples
+        ]
+        batch_size = x[0].shape[0]
+        input_img_h, input_img_w = batch_img_metas[0]['batch_input_shape']
+        padding_mask = x[-1].new_ones((batch_size, input_img_h, input_img_w),
+                                      dtype=torch.float32)
+        for i in range(batch_size):
+            img_h, img_w = batch_img_metas[i]['img_shape']
+            padding_mask[i, :img_h, :img_w] = 0
+        padding_mask = F.interpolate(
+            padding_mask.unsqueeze(1), size=x[-1].shape[-2:],
+            mode='nearest').to(torch.bool).squeeze(1)
+        # when backbone is swin, memory is output of last stage of swin.
+        # when backbone is r50, memory is output of tranformer encoder.
+        mask_features, memory = self.pixel_decoder(x, batch_img_metas)
+        pos_embed = self.decoder_pe(padding_mask)
+        memory = self.decoder_input_proj(memory)
+        # shape (batch_size, c, h, w) -> (batch_size, h*w, c)
+        memory = memory.flatten(2).permute(0, 2, 1)
+        pos_embed = pos_embed.flatten(2).permute(0, 2, 1)
+        # shape (batch_size, h * w)
+        padding_mask = padding_mask.flatten(1)
+        # shape = (num_queries, embed_dims)
+        query_embed = self.query_embed.weight
+        # shape = (batch_size, num_queries, embed_dims)
+        query_embed = query_embed.unsqueeze(0).repeat(batch_size, 1, 1)
+        target = torch.zeros_like(query_embed)
+        # shape (num_decoder, num_queries, batch_size, embed_dims)
+        out_dec = self.transformer_decoder(
+            query=target,
+            key=memory,
+            value=memory,
+            query_pos=query_embed,
+            key_pos=pos_embed,
+            key_padding_mask=padding_mask)
+
+        # cls_scores
+        all_cls_scores = self.cls_embed(out_dec)
+
+        # mask_preds
+        mask_embed = self.mask_embed(out_dec)
+        all_mask_preds = torch.einsum('lbqc,bchw->lbqhw', mask_embed,
+                                      mask_features)
+
+        return all_cls_scores, all_mask_preds
+
+    def loss(
+        self,
+        x: Tuple[Tensor],
+        batch_data_samples: SampleList,
+    ) -> Dict[str, Tensor]:
+        """Perform forward propagation and loss calculation of the panoptic
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the upstream
+                network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        batch_img_metas = []
+        batch_gt_instances = []
+        batch_gt_semantic_segs = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances)
+            if 'gt_sem_seg' in data_sample:
+                batch_gt_semantic_segs.append(data_sample.gt_sem_seg)
+            else:
+                batch_gt_semantic_segs.append(None)
+
+        # forward
+        all_cls_scores, all_mask_preds = self(x, batch_data_samples)
+
+        # preprocess ground truth
+        batch_gt_instances = self.preprocess_gt(batch_gt_instances,
+                                                batch_gt_semantic_segs)
+
+        # loss
+        losses = self.loss_by_feat(all_cls_scores, all_mask_preds,
+                                   batch_gt_instances, batch_img_metas)
+
+        return losses
+
+    def predict(self, x: Tuple[Tensor],
+                batch_data_samples: SampleList) -> Tuple[Tensor]:
+        """Test without augmentaton.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            tuple[Tensor]: A tuple contains two tensors.
+
+                - mask_cls_results (Tensor): Mask classification logits,\
+                    shape (batch_size, num_queries, cls_out_channels).
+                    Note `cls_out_channels` should includes background.
+                - mask_pred_results (Tensor): Mask logits, shape \
+                    (batch_size, num_queries, h, w).
+        """
+        batch_img_metas = [
+            data_sample.metainfo for data_sample in batch_data_samples
+        ]
+        all_cls_scores, all_mask_preds = self(x, batch_data_samples)
+        mask_cls_results = all_cls_scores[-1]
+        mask_pred_results = all_mask_preds[-1]
+
+        # upsample masks
+        img_shape = batch_img_metas[0]['batch_input_shape']
+        mask_pred_results = F.interpolate(
+            mask_pred_results,
+            size=(img_shape[0], img_shape[1]),
+            mode='bilinear',
+            align_corners=False)
+
+        return mask_cls_results, mask_pred_results
diff --git a/mmde/mmdet/models/dense_heads/nasfcos_head.py b/mmde/mmdet/models/dense_heads/nasfcos_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..14ee62a7910d90a108fefb2acef00c91ab83ecc8
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/nasfcos_head.py
@@ -0,0 +1,114 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale
+
+from mmdet.models.dense_heads.fcos_head import FCOSHead
+from mmdet.registry import MODELS
+from mmdet.utils import OptMultiConfig
+
+
+@MODELS.register_module()
+class NASFCOSHead(FCOSHead):
+    """Anchor-free head used in `NASFCOS <https://arxiv.org/abs/1906.04423>`_.
+
+    It is quite similar with FCOS head, except for the searched structure of
+    classification branch and bbox regression branch, where a structure of
+    "dconv3x3, conv3x3, dconv3x3, conv1x1" is utilized instead.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        strides (Sequence[int] or Sequence[Tuple[int, int]]): Strides of points
+            in multiple feature levels. Defaults to (4, 8, 16, 32, 64).
+        regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple
+            level points.
+        center_sampling (bool): If true, use center sampling.
+            Defaults to False.
+        center_sample_radius (float): Radius of center sampling.
+            Defaults to 1.5.
+        norm_on_bbox (bool): If true, normalize the regression targets with
+            FPN strides. Defaults to False.
+        centerness_on_reg (bool): If true, position centerness on the
+            regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
+            Defaults to False.
+        conv_bias (bool or str): If specified as `auto`, it will be decided by
+            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
+            None, otherwise False. Defaults to "auto".
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+        loss_centerness (:obj:`ConfigDict`, or dict): Config of centerness
+            loss.
+        norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and
+            config norm layer.  Defaults to
+            ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], opitonal): Initialization config dict.
+    """  # noqa: E501
+
+    def __init__(self,
+                 *args,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        if init_cfg is None:
+            init_cfg = [
+                dict(type='Caffe2Xavier', layer=['ConvModule', 'Conv2d']),
+                dict(
+                    type='Normal',
+                    std=0.01,
+                    override=[
+                        dict(name='conv_reg'),
+                        dict(name='conv_centerness'),
+                        dict(
+                            name='conv_cls',
+                            type='Normal',
+                            std=0.01,
+                            bias_prob=0.01)
+                    ]),
+            ]
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        dconv3x3_config = dict(
+            type='DCNv2',
+            kernel_size=3,
+            use_bias=True,
+            deform_groups=2,
+            padding=1)
+        conv3x3_config = dict(type='Conv', kernel_size=3, padding=1)
+        conv1x1_config = dict(type='Conv', kernel_size=1)
+
+        self.arch_config = [
+            dconv3x3_config, conv3x3_config, dconv3x3_config, conv1x1_config
+        ]
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i, op_ in enumerate(self.arch_config):
+            op = copy.deepcopy(op_)
+            chn = self.in_channels if i == 0 else self.feat_channels
+            assert isinstance(op, dict)
+            use_bias = op.pop('use_bias', False)
+            padding = op.pop('padding', 0)
+            kernel_size = op.pop('kernel_size')
+            module = ConvModule(
+                chn,
+                self.feat_channels,
+                kernel_size,
+                stride=1,
+                padding=padding,
+                norm_cfg=self.norm_cfg,
+                bias=use_bias,
+                conv_cfg=op)
+
+            self.cls_convs.append(copy.deepcopy(module))
+            self.reg_convs.append(copy.deepcopy(module))
+
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+        self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+        self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1)
+
+        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
diff --git a/mmde/mmdet/models/dense_heads/paa_head.py b/mmde/mmdet/models/dense_heads/paa_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c1f453d2788b354970254e8875068e824c370d4
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/paa_head.py
@@ -0,0 +1,730 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList)
+from ..layers import multiclass_nms
+from ..utils import levels_to_images, multi_apply
+from . import ATSSHead
+
+EPS = 1e-12
+try:
+    import sklearn.mixture as skm
+except ImportError:
+    skm = None
+
+
+@MODELS.register_module()
+class PAAHead(ATSSHead):
+    """Head of PAAAssignment: Probabilistic Anchor Assignment with IoU
+    Prediction for Object Detection.
+
+    Code is modified from the `official github repo
+    <https://github.com/kkhoot/PAA/blob/master/paa_core
+    /modeling/rpn/paa/loss.py>`_.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/2007.08103>`_ .
+
+    Args:
+        topk (int): Select topk samples with smallest loss in
+            each level.
+        score_voting (bool): Whether to use score voting in post-process.
+        covariance_type : String describing the type of covariance parameters
+            to be used in :class:`sklearn.mixture.GaussianMixture`.
+            It must be one of:
+
+            - 'full': each component has its own general covariance matrix
+            - 'tied': all components share the same general covariance matrix
+            - 'diag': each component has its own diagonal covariance matrix
+            - 'spherical': each component has its own single variance
+            Default: 'diag'. From 'full' to 'spherical', the gmm fitting
+            process is faster yet the performance could be influenced. For most
+            cases, 'diag' should be a good choice.
+    """
+
+    def __init__(self,
+                 *args,
+                 topk: int = 9,
+                 score_voting: bool = True,
+                 covariance_type: str = 'diag',
+                 **kwargs):
+        # topk used in paa reassign process
+        self.topk = topk
+        self.with_score_voting = score_voting
+        self.covariance_type = covariance_type
+        super().__init__(*args, **kwargs)
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            iou_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            iou_preds (list[Tensor]): iou_preds for each scale
+                level with shape (N, num_anchors * 1, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss gmm_assignment.
+        """
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+        )
+        (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds,
+         pos_gt_index) = cls_reg_targets
+        cls_scores = levels_to_images(cls_scores)
+        cls_scores = [
+            item.reshape(-1, self.cls_out_channels) for item in cls_scores
+        ]
+        bbox_preds = levels_to_images(bbox_preds)
+        bbox_preds = [item.reshape(-1, 4) for item in bbox_preds]
+        iou_preds = levels_to_images(iou_preds)
+        iou_preds = [item.reshape(-1, 1) for item in iou_preds]
+        pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list,
+                                       cls_scores, bbox_preds, labels,
+                                       labels_weight, bboxes_target,
+                                       bboxes_weight, pos_inds)
+
+        with torch.no_grad():
+            reassign_labels, reassign_label_weight, \
+                reassign_bbox_weights, num_pos = multi_apply(
+                    self.paa_reassign,
+                    pos_losses_list,
+                    labels,
+                    labels_weight,
+                    bboxes_weight,
+                    pos_inds,
+                    pos_gt_index,
+                    anchor_list)
+            num_pos = sum(num_pos)
+        # convert all tensor list to a flatten tensor
+        cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1))
+        bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1))
+        iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1))
+        labels = torch.cat(reassign_labels, 0).view(-1)
+        flatten_anchors = torch.cat(
+            [torch.cat(item, 0) for item in anchor_list])
+        labels_weight = torch.cat(reassign_label_weight, 0).view(-1)
+        bboxes_target = torch.cat(bboxes_target,
+                                  0).view(-1, bboxes_target[0].size(-1))
+
+        pos_inds_flatten = ((labels >= 0)
+                            &
+                            (labels < self.num_classes)).nonzero().reshape(-1)
+
+        losses_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            labels_weight,
+            avg_factor=max(num_pos, len(batch_img_metas)))  # avoid num_pos=0
+        if num_pos:
+            pos_bbox_pred = self.bbox_coder.decode(
+                flatten_anchors[pos_inds_flatten],
+                bbox_preds[pos_inds_flatten])
+            pos_bbox_target = bboxes_target[pos_inds_flatten]
+            iou_target = bbox_overlaps(
+                pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True)
+            losses_iou = self.loss_centerness(
+                iou_preds[pos_inds_flatten],
+                iou_target.unsqueeze(-1),
+                avg_factor=num_pos)
+            losses_bbox = self.loss_bbox(
+                pos_bbox_pred,
+                pos_bbox_target,
+                iou_target.clamp(min=EPS),
+                avg_factor=iou_target.sum())
+        else:
+            losses_iou = iou_preds.sum() * 0
+            losses_bbox = bbox_preds.sum() * 0
+
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou)
+
+    def get_pos_loss(self, anchors: List[Tensor], cls_score: Tensor,
+                     bbox_pred: Tensor, label: Tensor, label_weight: Tensor,
+                     bbox_target: dict, bbox_weight: Tensor,
+                     pos_inds: Tensor) -> Tensor:
+        """Calculate loss of all potential positive samples obtained from first
+        match process.
+
+        Args:
+            anchors (list[Tensor]): Anchors of each scale.
+            cls_score (Tensor): Box scores of single image with shape
+                (num_anchors, num_classes)
+            bbox_pred (Tensor): Box energies / deltas of single image
+                with shape (num_anchors, 4)
+            label (Tensor): classification target of each anchor with
+                shape (num_anchors,)
+            label_weight (Tensor): Classification loss weight of each
+                anchor with shape (num_anchors).
+            bbox_target (dict): Regression target of each anchor with
+                shape (num_anchors, 4).
+            bbox_weight (Tensor): Bbox weight of each anchor with shape
+                (num_anchors, 4).
+            pos_inds (Tensor): Index of all positive samples got from
+                first assign process.
+
+        Returns:
+            Tensor: Losses of all positive samples in single image.
+        """
+        if not len(pos_inds):
+            return cls_score.new([]),
+        anchors_all_level = torch.cat(anchors, 0)
+        pos_scores = cls_score[pos_inds]
+        pos_bbox_pred = bbox_pred[pos_inds]
+        pos_label = label[pos_inds]
+        pos_label_weight = label_weight[pos_inds]
+        pos_bbox_target = bbox_target[pos_inds]
+        pos_bbox_weight = bbox_weight[pos_inds]
+        pos_anchors = anchors_all_level[pos_inds]
+        pos_bbox_pred = self.bbox_coder.decode(pos_anchors, pos_bbox_pred)
+
+        # to keep loss dimension
+        loss_cls = self.loss_cls(
+            pos_scores,
+            pos_label,
+            pos_label_weight,
+            avg_factor=1.0,
+            reduction_override='none')
+
+        loss_bbox = self.loss_bbox(
+            pos_bbox_pred,
+            pos_bbox_target,
+            pos_bbox_weight,
+            avg_factor=1.0,  # keep same loss weight before reassign
+            reduction_override='none')
+
+        loss_cls = loss_cls.sum(-1)
+        pos_loss = loss_bbox + loss_cls
+        return pos_loss,
+
+    def paa_reassign(self, pos_losses: Tensor, label: Tensor,
+                     label_weight: Tensor, bbox_weight: Tensor,
+                     pos_inds: Tensor, pos_gt_inds: Tensor,
+                     anchors: List[Tensor]) -> tuple:
+        """Fit loss to GMM distribution and separate positive, ignore, negative
+        samples again with GMM model.
+
+        Args:
+            pos_losses (Tensor): Losses of all positive samples in
+                single image.
+            label (Tensor): classification target of each anchor with
+                shape (num_anchors,)
+            label_weight (Tensor): Classification loss weight of each
+                anchor with shape (num_anchors).
+            bbox_weight (Tensor): Bbox weight of each anchor with shape
+                (num_anchors, 4).
+            pos_inds (Tensor): Index of all positive samples got from
+                first assign process.
+            pos_gt_inds (Tensor): Gt_index of all positive samples got
+                from first assign process.
+            anchors (list[Tensor]): Anchors of each scale.
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+                - label (Tensor): classification target of each anchor after
+                  paa assign, with shape (num_anchors,)
+                - label_weight (Tensor): Classification loss weight of each
+                  anchor after paa assign, with shape (num_anchors).
+                - bbox_weight (Tensor): Bbox weight of each anchor with shape
+                  (num_anchors, 4).
+                - num_pos (int): The number of positive samples after paa
+                  assign.
+        """
+        if not len(pos_inds):
+            return label, label_weight, bbox_weight, 0
+        label = label.clone()
+        label_weight = label_weight.clone()
+        bbox_weight = bbox_weight.clone()
+        num_gt = pos_gt_inds.max() + 1
+        num_level = len(anchors)
+        num_anchors_each_level = [item.size(0) for item in anchors]
+        num_anchors_each_level.insert(0, 0)
+        inds_level_interval = np.cumsum(num_anchors_each_level)
+        pos_level_mask = []
+        for i in range(num_level):
+            mask = (pos_inds >= inds_level_interval[i]) & (
+                pos_inds < inds_level_interval[i + 1])
+            pos_level_mask.append(mask)
+        pos_inds_after_paa = [label.new_tensor([])]
+        ignore_inds_after_paa = [label.new_tensor([])]
+        for gt_ind in range(num_gt):
+            pos_inds_gmm = []
+            pos_loss_gmm = []
+            gt_mask = pos_gt_inds == gt_ind
+            for level in range(num_level):
+                level_mask = pos_level_mask[level]
+                level_gt_mask = level_mask & gt_mask
+                value, topk_inds = pos_losses[level_gt_mask].topk(
+                    min(level_gt_mask.sum(), self.topk), largest=False)
+                pos_inds_gmm.append(pos_inds[level_gt_mask][topk_inds])
+                pos_loss_gmm.append(value)
+            pos_inds_gmm = torch.cat(pos_inds_gmm)
+            pos_loss_gmm = torch.cat(pos_loss_gmm)
+            # fix gmm need at least two sample
+            if len(pos_inds_gmm) < 2:
+                continue
+            device = pos_inds_gmm.device
+            pos_loss_gmm, sort_inds = pos_loss_gmm.sort()
+            pos_inds_gmm = pos_inds_gmm[sort_inds]
+            pos_loss_gmm = pos_loss_gmm.view(-1, 1).cpu().numpy()
+            min_loss, max_loss = pos_loss_gmm.min(), pos_loss_gmm.max()
+            means_init = np.array([min_loss, max_loss]).reshape(2, 1)
+            weights_init = np.array([0.5, 0.5])
+            precisions_init = np.array([1.0, 1.0]).reshape(2, 1, 1)  # full
+            if self.covariance_type == 'spherical':
+                precisions_init = precisions_init.reshape(2)
+            elif self.covariance_type == 'diag':
+                precisions_init = precisions_init.reshape(2, 1)
+            elif self.covariance_type == 'tied':
+                precisions_init = np.array([[1.0]])
+            if skm is None:
+                raise ImportError('Please run "pip install sklearn" '
+                                  'to install sklearn first.')
+            gmm = skm.GaussianMixture(
+                2,
+                weights_init=weights_init,
+                means_init=means_init,
+                precisions_init=precisions_init,
+                covariance_type=self.covariance_type)
+            gmm.fit(pos_loss_gmm)
+            gmm_assignment = gmm.predict(pos_loss_gmm)
+            scores = gmm.score_samples(pos_loss_gmm)
+            gmm_assignment = torch.from_numpy(gmm_assignment).to(device)
+            scores = torch.from_numpy(scores).to(device)
+
+            pos_inds_temp, ignore_inds_temp = self.gmm_separation_scheme(
+                gmm_assignment, scores, pos_inds_gmm)
+            pos_inds_after_paa.append(pos_inds_temp)
+            ignore_inds_after_paa.append(ignore_inds_temp)
+
+        pos_inds_after_paa = torch.cat(pos_inds_after_paa)
+        ignore_inds_after_paa = torch.cat(ignore_inds_after_paa)
+        reassign_mask = (pos_inds.unsqueeze(1) != pos_inds_after_paa).all(1)
+        reassign_ids = pos_inds[reassign_mask]
+        label[reassign_ids] = self.num_classes
+        label_weight[ignore_inds_after_paa] = 0
+        bbox_weight[reassign_ids] = 0
+        num_pos = len(pos_inds_after_paa)
+        return label, label_weight, bbox_weight, num_pos
+
+    def gmm_separation_scheme(self, gmm_assignment: Tensor, scores: Tensor,
+                              pos_inds_gmm: Tensor) -> Tuple[Tensor, Tensor]:
+        """A general separation scheme for gmm model.
+
+        It separates a GMM distribution of candidate samples into three
+        parts, 0 1 and uncertain areas, and you can implement other
+        separation schemes by rewriting this function.
+
+        Args:
+            gmm_assignment (Tensor): The prediction of GMM which is of shape
+                (num_samples,). The 0/1 value indicates the distribution
+                that each sample comes from.
+            scores (Tensor): The probability of sample coming from the
+                fit GMM distribution. The tensor is of shape (num_samples,).
+            pos_inds_gmm (Tensor): All the indexes of samples which are used
+                to fit GMM model. The tensor is of shape (num_samples,)
+
+        Returns:
+            tuple[Tensor, Tensor]: The indices of positive and ignored samples.
+
+                - pos_inds_temp (Tensor): Indices of positive samples.
+                - ignore_inds_temp (Tensor): Indices of ignore samples.
+        """
+        # The implementation is (c) in Fig.3 in origin paper instead of (b).
+        # You can refer to issues such as
+        # https://github.com/kkhoot/PAA/issues/8 and
+        # https://github.com/kkhoot/PAA/issues/9.
+        fgs = gmm_assignment == 0
+        pos_inds_temp = fgs.new_tensor([], dtype=torch.long)
+        ignore_inds_temp = fgs.new_tensor([], dtype=torch.long)
+        if fgs.nonzero().numel():
+            _, pos_thr_ind = scores[fgs].topk(1)
+            pos_inds_temp = pos_inds_gmm[fgs][:pos_thr_ind + 1]
+            ignore_inds_temp = pos_inds_gmm.new_tensor([])
+        return pos_inds_temp, ignore_inds_temp
+
+    def get_targets(self,
+                    anchor_list: List[List[Tensor]],
+                    valid_flag_list: List[List[Tensor]],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs: bool = True) -> tuple:
+        """Get targets for PAA head.
+
+        This method is almost the same as `AnchorHead.get_targets()`. We direct
+        return the results from _get_targets_single instead map it to levels
+        by images_to_levels function.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+                - labels (list[Tensor]): Labels of all anchors, each with
+                    shape (num_anchors,).
+                - label_weights (list[Tensor]): Label weights of all anchor.
+                    each with shape (num_anchors,).
+                - bbox_targets (list[Tensor]): BBox targets of all anchors.
+                    each with shape (num_anchors, 4).
+                - bbox_weights (list[Tensor]): BBox weights of all anchors.
+                    each with shape (num_anchors, 4).
+                - pos_inds (list[Tensor]): Contains all index of positive
+                    sample in all anchor.
+                - gt_inds (list[Tensor]): Contains all gt_index of positive
+                    sample in all anchor.
+        """
+
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+        concat_anchor_list = []
+        concat_valid_flag_list = []
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+            concat_valid_flag_list.append(torch.cat(valid_flag_list[i]))
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+        results = multi_apply(
+            self._get_targets_single,
+            concat_anchor_list,
+            concat_valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore,
+            unmap_outputs=unmap_outputs)
+
+        (labels, label_weights, bbox_targets, bbox_weights, valid_pos_inds,
+         valid_neg_inds, sampling_result) = results
+
+        # Due to valid flag of anchors, we have to calculate the real pos_inds
+        # in origin anchor set.
+        pos_inds = []
+        for i, single_labels in enumerate(labels):
+            pos_mask = (0 <= single_labels) & (
+                single_labels < self.num_classes)
+            pos_inds.append(pos_mask.nonzero().view(-1))
+
+        gt_inds = [item.pos_assigned_gt_inds for item in sampling_result]
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                gt_inds)
+
+    def _get_targets_single(self,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        This method is same as `AnchorHead._get_targets_single()`.
+        """
+        assert unmap_outputs, 'We must map outputs back to the original' \
+                              'set of anchors in PAAhead'
+        return super(ATSSHead, self)._get_targets_single(
+            flat_anchors,
+            valid_flags,
+            gt_instances,
+            img_meta,
+            gt_instances_ignore,
+            unmap_outputs=True)
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        score_factors: Optional[List[Tensor]] = None,
+                        batch_img_metas: Optional[List[dict]] = None,
+                        cfg: OptConfigType = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        This method is same as `BaseDenseHead.get_results()`.
+        """
+        assert with_nms, 'PAA only supports "with_nms=True" now and it ' \
+                         'means PAAHead does not support ' \
+                         'test-time augmentation'
+        return super().predict_by_feat(
+            cls_scores=cls_scores,
+            bbox_preds=bbox_preds,
+            score_factors=score_factors,
+            batch_img_metas=batch_img_metas,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms)
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: OptConfigType = None,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factors from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid, has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (:obj:`ConfigDict` or dict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_score_factors = []
+        for level_idx, (cls_score, bbox_pred, score_factor, priors) in \
+                enumerate(zip(cls_score_list, bbox_pred_list,
+                              score_factor_list, mlvl_priors)):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            scores = cls_score.permute(1, 2, 0).reshape(
+                -1, self.cls_out_channels).sigmoid()
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid()
+
+            if 0 < nms_pre < scores.shape[0]:
+                max_scores, _ = (scores *
+                                 score_factor[:, None]).sqrt().max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                priors = priors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+                score_factor = score_factor[topk_inds]
+
+            bboxes = self.bbox_coder.decode(
+                priors, bbox_pred, max_shape=img_shape)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_score_factors.append(score_factor)
+
+        results = InstanceData()
+        results.bboxes = torch.cat(mlvl_bboxes)
+        results.scores = torch.cat(mlvl_scores)
+        results.score_factors = torch.cat(mlvl_score_factors)
+
+        return self._bbox_post_process(results, cfg, rescale, with_nms,
+                                       img_meta)
+
+    def _bbox_post_process(self,
+                           results: InstanceData,
+                           cfg: ConfigType,
+                           rescale: bool = False,
+                           with_nms: bool = True,
+                           img_meta: Optional[dict] = None):
+        """bbox post-processing method.
+
+        The boxes would be rescaled to the original image scale and do
+        the nms operation. Usually with_nms is False is used for aug test.
+
+        Args:
+            results (:obj:`InstaceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            cfg (:obj:`ConfigDict` or dict): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        if rescale:
+            results.bboxes /= results.bboxes.new_tensor(
+                img_meta['scale_factor']).repeat((1, 2))
+        # Add a dummy background class to the backend when using sigmoid
+        # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
+        # BG cat_id: num_class
+        padding = results.scores.new_zeros(results.scores.shape[0], 1)
+        mlvl_scores = torch.cat([results.scores, padding], dim=1)
+
+        mlvl_nms_scores = (mlvl_scores * results.score_factors[:, None]).sqrt()
+        det_bboxes, det_labels = multiclass_nms(
+            results.bboxes,
+            mlvl_nms_scores,
+            cfg.score_thr,
+            cfg.nms,
+            cfg.max_per_img,
+            score_factors=None)
+        if self.with_score_voting and len(det_bboxes) > 0:
+            det_bboxes, det_labels = self.score_voting(det_bboxes, det_labels,
+                                                       results.bboxes,
+                                                       mlvl_nms_scores,
+                                                       cfg.score_thr)
+        nms_results = InstanceData()
+        nms_results.bboxes = det_bboxes[:, :-1]
+        nms_results.scores = det_bboxes[:, -1]
+        nms_results.labels = det_labels
+        return nms_results
+
+    def score_voting(self, det_bboxes: Tensor, det_labels: Tensor,
+                     mlvl_bboxes: Tensor, mlvl_nms_scores: Tensor,
+                     score_thr: float) -> Tuple[Tensor, Tensor]:
+        """Implementation of score voting method works on each remaining boxes
+        after NMS procedure.
+
+        Args:
+            det_bboxes (Tensor): Remaining boxes after NMS procedure,
+                with shape (k, 5), each dimension means
+                (x1, y1, x2, y2, score).
+            det_labels (Tensor): The label of remaining boxes, with shape
+                (k, 1),Labels are 0-based.
+            mlvl_bboxes (Tensor): All boxes before the NMS procedure,
+                with shape (num_anchors,4).
+            mlvl_nms_scores (Tensor): The scores of all boxes which is used
+                in the NMS procedure, with shape (num_anchors, num_class)
+            score_thr (float): The score threshold of bboxes.
+
+        Returns:
+            tuple: Usually returns a tuple containing voting results.
+
+                - det_bboxes_voted (Tensor): Remaining boxes after
+                    score voting procedure, with shape (k, 5), each
+                    dimension means (x1, y1, x2, y2, score).
+                - det_labels_voted (Tensor): Label of remaining bboxes
+                    after voting, with shape (num_anchors,).
+        """
+        candidate_mask = mlvl_nms_scores > score_thr
+        candidate_mask_nonzeros = candidate_mask.nonzero(as_tuple=False)
+        candidate_inds = candidate_mask_nonzeros[:, 0]
+        candidate_labels = candidate_mask_nonzeros[:, 1]
+        candidate_bboxes = mlvl_bboxes[candidate_inds]
+        candidate_scores = mlvl_nms_scores[candidate_mask]
+        det_bboxes_voted = []
+        det_labels_voted = []
+        for cls in range(self.cls_out_channels):
+            candidate_cls_mask = candidate_labels == cls
+            if not candidate_cls_mask.any():
+                continue
+            candidate_cls_scores = candidate_scores[candidate_cls_mask]
+            candidate_cls_bboxes = candidate_bboxes[candidate_cls_mask]
+            det_cls_mask = det_labels == cls
+            det_cls_bboxes = det_bboxes[det_cls_mask].view(
+                -1, det_bboxes.size(-1))
+            det_candidate_ious = bbox_overlaps(det_cls_bboxes[:, :4],
+                                               candidate_cls_bboxes)
+            for det_ind in range(len(det_cls_bboxes)):
+                single_det_ious = det_candidate_ious[det_ind]
+                pos_ious_mask = single_det_ious > 0.01
+                pos_ious = single_det_ious[pos_ious_mask]
+                pos_bboxes = candidate_cls_bboxes[pos_ious_mask]
+                pos_scores = candidate_cls_scores[pos_ious_mask]
+                pis = (torch.exp(-(1 - pos_ious)**2 / 0.025) *
+                       pos_scores)[:, None]
+                voted_box = torch.sum(
+                    pis * pos_bboxes, dim=0) / torch.sum(
+                        pis, dim=0)
+                voted_score = det_cls_bboxes[det_ind][-1:][None, :]
+                det_bboxes_voted.append(
+                    torch.cat((voted_box[None, :], voted_score), dim=1))
+                det_labels_voted.append(cls)
+
+        det_bboxes_voted = torch.cat(det_bboxes_voted, dim=0)
+        det_labels_voted = det_labels.new_tensor(det_labels_voted)
+        return det_bboxes_voted, det_labels_voted
diff --git a/mmde/mmdet/models/dense_heads/pisa_retinanet_head.py b/mmde/mmdet/models/dense_heads/pisa_retinanet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..85fd54f5be3605d0994c2a2d4d9d7deac4c0f284
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/pisa_retinanet_head.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import InstanceList, OptInstanceList
+from ..losses import carl_loss, isr_p
+from ..utils import images_to_levels
+from .retina_head import RetinaHead
+
+
+@MODELS.register_module()
+class PISARetinaHead(RetinaHead):
+    """PISA Retinanet Head.
+
+    The head owns the same structure with Retinanet Head, but differs in two
+        aspects:
+        1. Importance-based Sample Reweighting Positive (ISR-P) is applied to
+            change the positive loss weights.
+        2. Classification-aware regression loss is adopted as a third loss.
+    """
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: Loss dict, comprise classification loss, regression loss and
+            carl loss.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            return_sampling_results=True)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         avg_factor, sampling_results_list) = cls_reg_targets
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors and flags to a single tensor
+        concat_anchor_list = []
+        for i in range(len(anchor_list)):
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+
+        num_imgs = len(batch_img_metas)
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, label_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_cls_scores = torch.cat(
+            flatten_cls_scores, dim=1).reshape(-1,
+                                               flatten_cls_scores[0].size(-1))
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_bbox_preds = torch.cat(
+            flatten_bbox_preds, dim=1).view(-1, flatten_bbox_preds[0].size(-1))
+        flatten_labels = torch.cat(labels_list, dim=1).reshape(-1)
+        flatten_label_weights = torch.cat(
+            label_weights_list, dim=1).reshape(-1)
+        flatten_anchors = torch.cat(all_anchor_list, dim=1).reshape(-1, 4)
+        flatten_bbox_targets = torch.cat(
+            bbox_targets_list, dim=1).reshape(-1, 4)
+        flatten_bbox_weights = torch.cat(
+            bbox_weights_list, dim=1).reshape(-1, 4)
+
+        # Apply ISR-P
+        isr_cfg = self.train_cfg.get('isr', None)
+        if isr_cfg is not None:
+            all_targets = (flatten_labels, flatten_label_weights,
+                           flatten_bbox_targets, flatten_bbox_weights)
+            with torch.no_grad():
+                all_targets = isr_p(
+                    flatten_cls_scores,
+                    flatten_bbox_preds,
+                    all_targets,
+                    flatten_anchors,
+                    sampling_results_list,
+                    bbox_coder=self.bbox_coder,
+                    loss_cls=self.loss_cls,
+                    num_class=self.num_classes,
+                    **self.train_cfg['isr'])
+            (flatten_labels, flatten_label_weights, flatten_bbox_targets,
+             flatten_bbox_weights) = all_targets
+
+        # For convenience we compute loss once instead separating by fpn level,
+        # so that we don't need to separate the weights by level again.
+        # The result should be the same
+        losses_cls = self.loss_cls(
+            flatten_cls_scores,
+            flatten_labels,
+            flatten_label_weights,
+            avg_factor=avg_factor)
+        losses_bbox = self.loss_bbox(
+            flatten_bbox_preds,
+            flatten_bbox_targets,
+            flatten_bbox_weights,
+            avg_factor=avg_factor)
+        loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+
+        # CARL Loss
+        carl_cfg = self.train_cfg.get('carl', None)
+        if carl_cfg is not None:
+            loss_carl = carl_loss(
+                flatten_cls_scores,
+                flatten_labels,
+                flatten_bbox_preds,
+                flatten_bbox_targets,
+                self.loss_bbox,
+                **self.train_cfg['carl'],
+                avg_factor=avg_factor,
+                sigmoid=True,
+                num_class=self.num_classes)
+            loss_dict.update(loss_carl)
+
+        return loss_dict
diff --git a/mmde/mmdet/models/dense_heads/pisa_ssd_head.py b/mmde/mmdet/models/dense_heads/pisa_ssd_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec09cb40a9c95d3f9889d736b80dfccef07f6fd1
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/pisa_ssd_head.py
@@ -0,0 +1,182 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import InstanceList, OptInstanceList
+from ..losses import CrossEntropyLoss, SmoothL1Loss, carl_loss, isr_p
+from ..utils import multi_apply
+from .ssd_head import SSDHead
+
+
+# TODO: add loss evaluator for SSD
+@MODELS.register_module()
+class PISASSDHead(SSDHead):
+    """Implementation of `PISA SSD head <https://arxiv.org/abs/1904.04821>`_
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (Sequence[int]): Number of channels in the input feature
+            map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Defaults to 0.
+        feat_channels (int): Number of hidden channels when stacked_convs
+            > 0. Defaults to 256.
+        use_depthwise (bool): Whether to use DepthwiseSeparableConv.
+            Defaults to False.
+        conv_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct
+            and config conv layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct
+            and config norm layer. Defaults to None.
+        act_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct
+            and config activation layer. Defaults to None.
+        anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor
+            generator.
+        bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Defaults to False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of
+            anchor head.
+        test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of
+            anchor head.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], Optional): Initialization config dict.
+    """  # noqa: W605
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Union[List[Tensor], Tensor]]:
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Union[List[Tensor], Tensor]]: A dictionary of loss
+            components. the dict has components below:
+
+            - loss_cls (list[Tensor]): A list containing each feature map \
+            classification loss.
+            - loss_bbox (list[Tensor]): A list containing each feature map \
+            regression loss.
+            - loss_carl (Tensor): The loss of CARL.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            unmap_outputs=False,
+            return_sampling_results=True)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         avg_factor, sampling_results_list) = cls_reg_targets
+
+        num_images = len(batch_img_metas)
+        all_cls_scores = torch.cat([
+            s.permute(0, 2, 3, 1).reshape(
+                num_images, -1, self.cls_out_channels) for s in cls_scores
+        ], 1)
+        all_labels = torch.cat(labels_list, -1).view(num_images, -1)
+        all_label_weights = torch.cat(label_weights_list,
+                                      -1).view(num_images, -1)
+        all_bbox_preds = torch.cat([
+            b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
+            for b in bbox_preds
+        ], -2)
+        all_bbox_targets = torch.cat(bbox_targets_list,
+                                     -2).view(num_images, -1, 4)
+        all_bbox_weights = torch.cat(bbox_weights_list,
+                                     -2).view(num_images, -1, 4)
+
+        # concat all level anchors to a single tensor
+        all_anchors = []
+        for i in range(num_images):
+            all_anchors.append(torch.cat(anchor_list[i]))
+
+        isr_cfg = self.train_cfg.get('isr', None)
+        all_targets = (all_labels.view(-1), all_label_weights.view(-1),
+                       all_bbox_targets.view(-1,
+                                             4), all_bbox_weights.view(-1, 4))
+        # apply ISR-P
+        if isr_cfg is not None:
+            all_targets = isr_p(
+                all_cls_scores.view(-1, all_cls_scores.size(-1)),
+                all_bbox_preds.view(-1, 4),
+                all_targets,
+                torch.cat(all_anchors),
+                sampling_results_list,
+                loss_cls=CrossEntropyLoss(),
+                bbox_coder=self.bbox_coder,
+                **self.train_cfg['isr'],
+                num_class=self.num_classes)
+            (new_labels, new_label_weights, new_bbox_targets,
+             new_bbox_weights) = all_targets
+            all_labels = new_labels.view(all_labels.shape)
+            all_label_weights = new_label_weights.view(all_label_weights.shape)
+            all_bbox_targets = new_bbox_targets.view(all_bbox_targets.shape)
+            all_bbox_weights = new_bbox_weights.view(all_bbox_weights.shape)
+
+        # add CARL loss
+        carl_loss_cfg = self.train_cfg.get('carl', None)
+        if carl_loss_cfg is not None:
+            loss_carl = carl_loss(
+                all_cls_scores.view(-1, all_cls_scores.size(-1)),
+                all_targets[0],
+                all_bbox_preds.view(-1, 4),
+                all_targets[2],
+                SmoothL1Loss(beta=1.),
+                **self.train_cfg['carl'],
+                avg_factor=avg_factor,
+                num_class=self.num_classes)
+
+        # check NaN and Inf
+        assert torch.isfinite(all_cls_scores).all().item(), \
+            'classification scores become infinite or NaN!'
+        assert torch.isfinite(all_bbox_preds).all().item(), \
+            'bbox predications become infinite or NaN!'
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_by_feat_single,
+            all_cls_scores,
+            all_bbox_preds,
+            all_anchors,
+            all_labels,
+            all_label_weights,
+            all_bbox_targets,
+            all_bbox_weights,
+            avg_factor=avg_factor)
+        loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+        if carl_loss_cfg is not None:
+            loss_dict.update(loss_carl)
+        return loss_dict
diff --git a/mmde/mmdet/models/dense_heads/reppoints_head.py b/mmde/mmdet/models/dense_heads/reppoints_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..22f3e3401a4abd9cc35b41d24efe23e5655a905e
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/reppoints_head.py
@@ -0,0 +1,885 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Sequence, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import DeformConv2d
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList
+from ..task_modules.prior_generators import MlvlPointGenerator
+from ..task_modules.samplers import PseudoSampler
+from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply,
+                     unmap)
+from .anchor_free_head import AnchorFreeHead
+
+
+@MODELS.register_module()
+class RepPointsHead(AnchorFreeHead):
+    """RepPoint head.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        point_feat_channels (int): Number of channels of points features.
+        num_points (int): Number of points.
+        gradient_mul (float): The multiplier to gradients from
+            points refinement and recognition.
+        point_strides (Sequence[int]): points strides.
+        point_base_scale (int): bbox scale for assigning labels.
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox_init (:obj:`ConfigDict` or dict): Config of initial points
+            loss.
+        loss_bbox_refine (:obj:`ConfigDict` or dict): Config of points loss in
+            refinement.
+        use_grid_points (bool): If we use bounding box representation, the
+        reppoints is represented as grid points on the bounding box.
+        center_init (bool): Whether to use center point assignment.
+        transform_method (str): The methods to transform RepPoints to bbox.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 point_feat_channels: int = 256,
+                 num_points: int = 9,
+                 gradient_mul: float = 0.1,
+                 point_strides: Sequence[int] = [8, 16, 32, 64, 128],
+                 point_base_scale: int = 4,
+                 loss_cls: ConfigType = dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 loss_bbox_init: ConfigType = dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.5),
+                 loss_bbox_refine: ConfigType = dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+                 use_grid_points: bool = False,
+                 center_init: bool = True,
+                 transform_method: str = 'moment',
+                 moment_mul: float = 0.01,
+                 init_cfg: MultiConfig = dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='reppoints_cls_out',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs) -> None:
+        self.num_points = num_points
+        self.point_feat_channels = point_feat_channels
+        self.use_grid_points = use_grid_points
+        self.center_init = center_init
+
+        # we use deform conv to extract points features
+        self.dcn_kernel = int(np.sqrt(num_points))
+        self.dcn_pad = int((self.dcn_kernel - 1) / 2)
+        assert self.dcn_kernel * self.dcn_kernel == num_points, \
+            'The points number should be a square number.'
+        assert self.dcn_kernel % 2 == 1, \
+            'The points number should be an odd square number.'
+        dcn_base = np.arange(-self.dcn_pad,
+                             self.dcn_pad + 1).astype(np.float64)
+        dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
+        dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
+        dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(
+            (-1))
+        self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)
+
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            loss_cls=loss_cls,
+            init_cfg=init_cfg,
+            **kwargs)
+
+        self.gradient_mul = gradient_mul
+        self.point_base_scale = point_base_scale
+        self.point_strides = point_strides
+        self.prior_generator = MlvlPointGenerator(
+            self.point_strides, offset=0.)
+
+        if self.train_cfg:
+            self.init_assigner = TASK_UTILS.build(
+                self.train_cfg['init']['assigner'])
+            self.refine_assigner = TASK_UTILS.build(
+                self.train_cfg['refine']['assigner'])
+
+            if self.train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler(context=self)
+
+        self.transform_method = transform_method
+        if self.transform_method == 'moment':
+            self.moment_transfer = nn.Parameter(
+                data=torch.zeros(2), requires_grad=True)
+            self.moment_mul = moment_mul
+
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = self.num_classes
+        else:
+            self.cls_out_channels = self.num_classes + 1
+        self.loss_bbox_init = MODELS.build(loss_bbox_init)
+        self.loss_bbox_refine = MODELS.build(loss_bbox_refine)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        pts_out_dim = 4 if self.use_grid_points else 2 * self.num_points
+        self.reppoints_cls_conv = DeformConv2d(self.feat_channels,
+                                               self.point_feat_channels,
+                                               self.dcn_kernel, 1,
+                                               self.dcn_pad)
+        self.reppoints_cls_out = nn.Conv2d(self.point_feat_channels,
+                                           self.cls_out_channels, 1, 1, 0)
+        self.reppoints_pts_init_conv = nn.Conv2d(self.feat_channels,
+                                                 self.point_feat_channels, 3,
+                                                 1, 1)
+        self.reppoints_pts_init_out = nn.Conv2d(self.point_feat_channels,
+                                                pts_out_dim, 1, 1, 0)
+        self.reppoints_pts_refine_conv = DeformConv2d(self.feat_channels,
+                                                      self.point_feat_channels,
+                                                      self.dcn_kernel, 1,
+                                                      self.dcn_pad)
+        self.reppoints_pts_refine_out = nn.Conv2d(self.point_feat_channels,
+                                                  pts_out_dim, 1, 1, 0)
+
+    def points2bbox(self, pts: Tensor, y_first: bool = True) -> Tensor:
+        """Converting the points set into bounding box.
+
+        Args:
+            pts (Tensor): the input points sets (fields), each points
+                set (fields) is represented as 2n scalar.
+            y_first (bool): if y_first=True, the point set is
+                represented as [y1, x1, y2, x2 ... yn, xn], otherwise
+                the point set is represented as
+                [x1, y1, x2, y2 ... xn, yn]. Defaults to True.
+
+        Returns:
+            Tensor: each points set is converting to a bbox [x1, y1, x2, y2].
+        """
+        pts_reshape = pts.view(pts.shape[0], -1, 2, *pts.shape[2:])
+        pts_y = pts_reshape[:, :, 0, ...] if y_first else pts_reshape[:, :, 1,
+                                                                      ...]
+        pts_x = pts_reshape[:, :, 1, ...] if y_first else pts_reshape[:, :, 0,
+                                                                      ...]
+        if self.transform_method == 'minmax':
+            bbox_left = pts_x.min(dim=1, keepdim=True)[0]
+            bbox_right = pts_x.max(dim=1, keepdim=True)[0]
+            bbox_up = pts_y.min(dim=1, keepdim=True)[0]
+            bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
+            bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
+                             dim=1)
+        elif self.transform_method == 'partial_minmax':
+            pts_y = pts_y[:, :4, ...]
+            pts_x = pts_x[:, :4, ...]
+            bbox_left = pts_x.min(dim=1, keepdim=True)[0]
+            bbox_right = pts_x.max(dim=1, keepdim=True)[0]
+            bbox_up = pts_y.min(dim=1, keepdim=True)[0]
+            bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
+            bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
+                             dim=1)
+        elif self.transform_method == 'moment':
+            pts_y_mean = pts_y.mean(dim=1, keepdim=True)
+            pts_x_mean = pts_x.mean(dim=1, keepdim=True)
+            pts_y_std = torch.std(pts_y - pts_y_mean, dim=1, keepdim=True)
+            pts_x_std = torch.std(pts_x - pts_x_mean, dim=1, keepdim=True)
+            moment_transfer = (self.moment_transfer * self.moment_mul) + (
+                self.moment_transfer.detach() * (1 - self.moment_mul))
+            moment_width_transfer = moment_transfer[0]
+            moment_height_transfer = moment_transfer[1]
+            half_width = pts_x_std * torch.exp(moment_width_transfer)
+            half_height = pts_y_std * torch.exp(moment_height_transfer)
+            bbox = torch.cat([
+                pts_x_mean - half_width, pts_y_mean - half_height,
+                pts_x_mean + half_width, pts_y_mean + half_height
+            ],
+                             dim=1)
+        else:
+            raise NotImplementedError
+        return bbox
+
+    def gen_grid_from_reg(self, reg: Tensor,
+                          previous_boxes: Tensor) -> Tuple[Tensor]:
+        """Base on the previous bboxes and regression values, we compute the
+        regressed bboxes and generate the grids on the bboxes.
+
+        Args:
+            reg (Tensor): the regression value to previous bboxes.
+            previous_boxes (Tensor): previous bboxes.
+
+        Returns:
+            Tuple[Tensor]: generate grids on the regressed bboxes.
+        """
+        b, _, h, w = reg.shape
+        bxy = (previous_boxes[:, :2, ...] + previous_boxes[:, 2:, ...]) / 2.
+        bwh = (previous_boxes[:, 2:, ...] -
+               previous_boxes[:, :2, ...]).clamp(min=1e-6)
+        grid_topleft = bxy + bwh * reg[:, :2, ...] - 0.5 * bwh * torch.exp(
+            reg[:, 2:, ...])
+        grid_wh = bwh * torch.exp(reg[:, 2:, ...])
+        grid_left = grid_topleft[:, [0], ...]
+        grid_top = grid_topleft[:, [1], ...]
+        grid_width = grid_wh[:, [0], ...]
+        grid_height = grid_wh[:, [1], ...]
+        intervel = torch.linspace(0., 1., self.dcn_kernel).view(
+            1, self.dcn_kernel, 1, 1).type_as(reg)
+        grid_x = grid_left + grid_width * intervel
+        grid_x = grid_x.unsqueeze(1).repeat(1, self.dcn_kernel, 1, 1, 1)
+        grid_x = grid_x.view(b, -1, h, w)
+        grid_y = grid_top + grid_height * intervel
+        grid_y = grid_y.unsqueeze(2).repeat(1, 1, self.dcn_kernel, 1, 1)
+        grid_y = grid_y.view(b, -1, h, w)
+        grid_yx = torch.stack([grid_y, grid_x], dim=2)
+        grid_yx = grid_yx.view(b, -1, h, w)
+        regressed_bbox = torch.cat([
+            grid_left, grid_top, grid_left + grid_width, grid_top + grid_height
+        ], 1)
+        return grid_yx, regressed_bbox
+
+    def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]:
+        return multi_apply(self.forward_single, feats)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor]:
+        """Forward feature map of a single FPN level."""
+        dcn_base_offset = self.dcn_base_offset.type_as(x)
+        # If we use center_init, the initial reppoints is from center points.
+        # If we use bounding bbox representation, the initial reppoints is
+        #   from regular grid placed on a pre-defined bbox.
+        if self.use_grid_points or not self.center_init:
+            scale = self.point_base_scale / 2
+            points_init = dcn_base_offset / dcn_base_offset.max() * scale
+            bbox_init = x.new_tensor([-scale, -scale, scale,
+                                      scale]).view(1, 4, 1, 1)
+        else:
+            points_init = 0
+        cls_feat = x
+        pts_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            pts_feat = reg_conv(pts_feat)
+        # initialize reppoints
+        pts_out_init = self.reppoints_pts_init_out(
+            self.relu(self.reppoints_pts_init_conv(pts_feat)))
+        if self.use_grid_points:
+            pts_out_init, bbox_out_init = self.gen_grid_from_reg(
+                pts_out_init, bbox_init.detach())
+        else:
+            pts_out_init = pts_out_init + points_init
+        # refine and classify reppoints
+        pts_out_init_grad_mul = (1 - self.gradient_mul) * pts_out_init.detach(
+        ) + self.gradient_mul * pts_out_init
+        dcn_offset = pts_out_init_grad_mul - dcn_base_offset
+        cls_out = self.reppoints_cls_out(
+            self.relu(self.reppoints_cls_conv(cls_feat, dcn_offset)))
+        pts_out_refine = self.reppoints_pts_refine_out(
+            self.relu(self.reppoints_pts_refine_conv(pts_feat, dcn_offset)))
+        if self.use_grid_points:
+            pts_out_refine, bbox_out_refine = self.gen_grid_from_reg(
+                pts_out_refine, bbox_out_init.detach())
+        else:
+            pts_out_refine = pts_out_refine + pts_out_init.detach()
+
+        if self.training:
+            return cls_out, pts_out_init, pts_out_refine
+        else:
+            return cls_out, self.points2bbox(pts_out_refine)
+
+    def get_points(self, featmap_sizes: List[Tuple[int]],
+                   batch_img_metas: List[dict], device: str) -> tuple:
+        """Get points according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            batch_img_metas (list[dict]): Image meta info.
+
+        Returns:
+            tuple: points of each image, valid flags of each image
+        """
+        num_imgs = len(batch_img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # points center for one time
+        multi_level_points = self.prior_generator.grid_priors(
+            featmap_sizes, device=device, with_stride=True)
+        points_list = [[point.clone() for point in multi_level_points]
+                       for _ in range(num_imgs)]
+
+        # for each image, we compute valid flags of multi level grids
+        valid_flag_list = []
+        for img_id, img_meta in enumerate(batch_img_metas):
+            multi_level_flags = self.prior_generator.valid_flags(
+                featmap_sizes, img_meta['pad_shape'], device=device)
+            valid_flag_list.append(multi_level_flags)
+
+        return points_list, valid_flag_list
+
+    def centers_to_bboxes(self, point_list: List[Tensor]) -> List[Tensor]:
+        """Get bboxes according to center points.
+
+        Only used in :class:`MaxIoUAssigner`.
+        """
+        bbox_list = []
+        for i_img, point in enumerate(point_list):
+            bbox = []
+            for i_lvl in range(len(self.point_strides)):
+                scale = self.point_base_scale * self.point_strides[i_lvl] * 0.5
+                bbox_shift = torch.Tensor([-scale, -scale, scale,
+                                           scale]).view(1, 4).type_as(point[0])
+                bbox_center = torch.cat(
+                    [point[i_lvl][:, :2], point[i_lvl][:, :2]], dim=1)
+                bbox.append(bbox_center + bbox_shift)
+            bbox_list.append(bbox)
+        return bbox_list
+
+    def offset_to_pts(self, center_list: List[Tensor],
+                      pred_list: List[Tensor]) -> List[Tensor]:
+        """Change from point offset to point coordinate."""
+        pts_list = []
+        for i_lvl in range(len(self.point_strides)):
+            pts_lvl = []
+            for i_img in range(len(center_list)):
+                pts_center = center_list[i_img][i_lvl][:, :2].repeat(
+                    1, self.num_points)
+                pts_shift = pred_list[i_lvl][i_img]
+                yx_pts_shift = pts_shift.permute(1, 2, 0).view(
+                    -1, 2 * self.num_points)
+                y_pts_shift = yx_pts_shift[..., 0::2]
+                x_pts_shift = yx_pts_shift[..., 1::2]
+                xy_pts_shift = torch.stack([x_pts_shift, y_pts_shift], -1)
+                xy_pts_shift = xy_pts_shift.view(*yx_pts_shift.shape[:-1], -1)
+                pts = xy_pts_shift * self.point_strides[i_lvl] + pts_center
+                pts_lvl.append(pts)
+            pts_lvl = torch.stack(pts_lvl, 0)
+            pts_list.append(pts_lvl)
+        return pts_list
+
+    def _get_targets_single(self,
+                            flat_proposals: Tensor,
+                            valid_flags: Tensor,
+                            gt_instances: InstanceData,
+                            gt_instances_ignore: InstanceData,
+                            stage: str = 'init',
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute corresponding GT box and classification targets for
+        proposals.
+
+        Args:
+            flat_proposals (Tensor): Multi level points of a image.
+            valid_flags (Tensor): Multi level valid flags of a image.
+            gt_instances (InstanceData): It usually includes ``bboxes`` and
+                ``labels`` attributes.
+            gt_instances_ignore (InstanceData): It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+            stage (str): 'init' or 'refine'. Generate target for
+                init stage or refine stage. Defaults to 'init'.
+            unmap_outputs (bool): Whether to map outputs back to
+                the original set of anchors. Defaults to True.
+
+        Returns:
+            tuple:
+
+                - labels (Tensor): Labels of each level.
+                - label_weights (Tensor): Label weights of each level.
+                - bbox_targets (Tensor): BBox targets of each level.
+                - bbox_weights (Tensor): BBox weights of each level.
+                - pos_inds (Tensor): positive samples indexes.
+                - neg_inds (Tensor): negative samples indexes.
+                - sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        inside_flags = valid_flags
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid proposal inside the image boundary. Please '
+                'check the image size.')
+        # assign gt and sample proposals
+        proposals = flat_proposals[inside_flags, :]
+        pred_instances = InstanceData(priors=proposals)
+
+        if stage == 'init':
+            assigner = self.init_assigner
+            pos_weight = self.train_cfg['init']['pos_weight']
+        else:
+            assigner = self.refine_assigner
+            pos_weight = self.train_cfg['refine']['pos_weight']
+
+        assign_result = assigner.assign(pred_instances, gt_instances,
+                                        gt_instances_ignore)
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_proposals = proposals.shape[0]
+        bbox_gt = proposals.new_zeros([num_valid_proposals, 4])
+        pos_proposals = torch.zeros_like(proposals)
+        proposals_weights = proposals.new_zeros([num_valid_proposals, 4])
+        labels = proposals.new_full((num_valid_proposals, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        label_weights = proposals.new_zeros(
+            num_valid_proposals, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            bbox_gt[pos_inds, :] = sampling_result.pos_gt_bboxes
+            pos_proposals[pos_inds, :] = proposals[pos_inds, :]
+            proposals_weights[pos_inds, :] = 1.0
+
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if pos_weight <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = pos_weight
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of proposals
+        if unmap_outputs:
+            num_total_proposals = flat_proposals.size(0)
+            labels = unmap(
+                labels,
+                num_total_proposals,
+                inside_flags,
+                fill=self.num_classes)  # fill bg label
+            label_weights = unmap(label_weights, num_total_proposals,
+                                  inside_flags)
+            bbox_gt = unmap(bbox_gt, num_total_proposals, inside_flags)
+            pos_proposals = unmap(pos_proposals, num_total_proposals,
+                                  inside_flags)
+            proposals_weights = unmap(proposals_weights, num_total_proposals,
+                                      inside_flags)
+
+        return (labels, label_weights, bbox_gt, pos_proposals,
+                proposals_weights, pos_inds, neg_inds, sampling_result)
+
+    def get_targets(self,
+                    proposals_list: List[Tensor],
+                    valid_flag_list: List[Tensor],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    stage: str = 'init',
+                    unmap_outputs: bool = True,
+                    return_sampling_results: bool = False) -> tuple:
+        """Compute corresponding GT box and classification targets for
+        proposals.
+
+        Args:
+            proposals_list (list[Tensor]): Multi level points/bboxes of each
+                image.
+            valid_flag_list (list[Tensor]): Multi level valid flags of each
+                image.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            stage (str): 'init' or 'refine'. Generate target for init stage or
+                refine stage.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+            return_sampling_results (bool): Whether to return the sampling
+                results. Defaults to False.
+
+        Returns:
+            tuple:
+
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_gt_list (list[Tensor]): Ground truth bbox of each level.
+                - proposals_list (list[Tensor]): Proposals(points/bboxes) of
+                  each level.
+                - proposal_weights_list (list[Tensor]): Proposal weights of
+                  each level.
+                - avg_factor (int): Average factor that is used to average
+                  the loss. When using sampling method, avg_factor is usually
+                  the sum of positive and negative priors. When using
+                  `PseudoSampler`, `avg_factor` is usually equal to the number
+                  of positive priors.
+        """
+        assert stage in ['init', 'refine']
+        num_imgs = len(batch_img_metas)
+        assert len(proposals_list) == len(valid_flag_list) == num_imgs
+
+        # points number of multi levels
+        num_level_proposals = [points.size(0) for points in proposals_list[0]]
+
+        # concat all level points and flags to a single tensor
+        for i in range(num_imgs):
+            assert len(proposals_list[i]) == len(valid_flag_list[i])
+            proposals_list[i] = torch.cat(proposals_list[i])
+            valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+
+        (all_labels, all_label_weights, all_bbox_gt, all_proposals,
+         all_proposal_weights, pos_inds_list, neg_inds_list,
+         sampling_results_list) = multi_apply(
+             self._get_targets_single,
+             proposals_list,
+             valid_flag_list,
+             batch_gt_instances,
+             batch_gt_instances_ignore,
+             stage=stage,
+             unmap_outputs=unmap_outputs)
+
+        # sampled points of all images
+        avg_refactor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        labels_list = images_to_levels(all_labels, num_level_proposals)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_proposals)
+        bbox_gt_list = images_to_levels(all_bbox_gt, num_level_proposals)
+        proposals_list = images_to_levels(all_proposals, num_level_proposals)
+        proposal_weights_list = images_to_levels(all_proposal_weights,
+                                                 num_level_proposals)
+        res = (labels_list, label_weights_list, bbox_gt_list, proposals_list,
+               proposal_weights_list, avg_refactor)
+        if return_sampling_results:
+            res = res + (sampling_results_list, )
+
+        return res
+
+    def loss_by_feat_single(self, cls_score: Tensor, pts_pred_init: Tensor,
+                            pts_pred_refine: Tensor, labels: Tensor,
+                            label_weights, bbox_gt_init: Tensor,
+                            bbox_weights_init: Tensor, bbox_gt_refine: Tensor,
+                            bbox_weights_refine: Tensor, stride: int,
+                            avg_factor_init: int,
+                            avg_factor_refine: int) -> Tuple[Tensor]:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            cls_score (Tensor): Box scores for each scale level
+                Has shape (N, num_classes, h_i, w_i).
+            pts_pred_init (Tensor): Points of shape
+                (batch_size, h_i * w_i, num_points * 2).
+            pts_pred_refine (Tensor): Points refined of shape
+                (batch_size, h_i * w_i, num_points * 2).
+            labels (Tensor): Ground truth class indices with shape
+                (batch_size, h_i * w_i).
+            label_weights (Tensor): Label weights of shape
+                (batch_size, h_i * w_i).
+            bbox_gt_init (Tensor): BBox regression targets in the init stage
+                of shape (batch_size, h_i * w_i, 4).
+            bbox_weights_init (Tensor): BBox regression loss weights in the
+                init stage of shape (batch_size, h_i * w_i, 4).
+            bbox_gt_refine (Tensor): BBox regression targets in the refine
+                stage of shape (batch_size, h_i * w_i, 4).
+            bbox_weights_refine (Tensor): BBox regression loss weights in the
+                refine stage of shape (batch_size, h_i * w_i, 4).
+            stride (int): Point stride.
+            avg_factor_init (int): Average factor that is used to average
+                the loss in the init stage.
+            avg_factor_refine (int): Average factor that is used to average
+                the loss in the refine stage.
+
+        Returns:
+            Tuple[Tensor]: loss components.
+        """
+        # classification loss
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        cls_score = cls_score.permute(0, 2, 3,
+                                      1).reshape(-1, self.cls_out_channels)
+        cls_score = cls_score.contiguous()
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=avg_factor_refine)
+
+        # points loss
+        bbox_gt_init = bbox_gt_init.reshape(-1, 4)
+        bbox_weights_init = bbox_weights_init.reshape(-1, 4)
+        bbox_pred_init = self.points2bbox(
+            pts_pred_init.reshape(-1, 2 * self.num_points), y_first=False)
+        bbox_gt_refine = bbox_gt_refine.reshape(-1, 4)
+        bbox_weights_refine = bbox_weights_refine.reshape(-1, 4)
+        bbox_pred_refine = self.points2bbox(
+            pts_pred_refine.reshape(-1, 2 * self.num_points), y_first=False)
+        normalize_term = self.point_base_scale * stride
+        loss_pts_init = self.loss_bbox_init(
+            bbox_pred_init / normalize_term,
+            bbox_gt_init / normalize_term,
+            bbox_weights_init,
+            avg_factor=avg_factor_init)
+        loss_pts_refine = self.loss_bbox_refine(
+            bbox_pred_refine / normalize_term,
+            bbox_gt_refine / normalize_term,
+            bbox_weights_refine,
+            avg_factor=avg_factor_refine)
+        return loss_cls, loss_pts_init, loss_pts_refine
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        pts_preds_init: List[Tensor],
+        pts_preds_refine: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Tensor]:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, of shape (batch_size, num_classes, h, w).
+            pts_preds_init (list[Tensor]): Points for each scale level, each is
+                a 3D-tensor, of shape (batch_size, h_i * w_i, num_points * 2).
+            pts_preds_refine (list[Tensor]): Points refined for each scale
+                level, each is a 3D-tensor, of shape
+                (batch_size, h_i * w_i, num_points * 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        device = cls_scores[0].device
+
+        # target for initial stage
+        center_list, valid_flag_list = self.get_points(featmap_sizes,
+                                                       batch_img_metas, device)
+        pts_coordinate_preds_init = self.offset_to_pts(center_list,
+                                                       pts_preds_init)
+        if self.train_cfg['init']['assigner']['type'] == 'PointAssigner':
+            # Assign target for center list
+            candidate_list = center_list
+        else:
+            # transform center list to bbox list and
+            #   assign target for bbox list
+            bbox_list = self.centers_to_bboxes(center_list)
+            candidate_list = bbox_list
+        cls_reg_targets_init = self.get_targets(
+            proposals_list=candidate_list,
+            valid_flag_list=valid_flag_list,
+            batch_gt_instances=batch_gt_instances,
+            batch_img_metas=batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            stage='init',
+            return_sampling_results=False)
+        (*_, bbox_gt_list_init, candidate_list_init, bbox_weights_list_init,
+         avg_factor_init) = cls_reg_targets_init
+
+        # target for refinement stage
+        center_list, valid_flag_list = self.get_points(featmap_sizes,
+                                                       batch_img_metas, device)
+        pts_coordinate_preds_refine = self.offset_to_pts(
+            center_list, pts_preds_refine)
+        bbox_list = []
+        for i_img, center in enumerate(center_list):
+            bbox = []
+            for i_lvl in range(len(pts_preds_refine)):
+                bbox_preds_init = self.points2bbox(
+                    pts_preds_init[i_lvl].detach())
+                bbox_shift = bbox_preds_init * self.point_strides[i_lvl]
+                bbox_center = torch.cat(
+                    [center[i_lvl][:, :2], center[i_lvl][:, :2]], dim=1)
+                bbox.append(bbox_center +
+                            bbox_shift[i_img].permute(1, 2, 0).reshape(-1, 4))
+            bbox_list.append(bbox)
+        cls_reg_targets_refine = self.get_targets(
+            proposals_list=bbox_list,
+            valid_flag_list=valid_flag_list,
+            batch_gt_instances=batch_gt_instances,
+            batch_img_metas=batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            stage='refine',
+            return_sampling_results=False)
+        (labels_list, label_weights_list, bbox_gt_list_refine,
+         candidate_list_refine, bbox_weights_list_refine,
+         avg_factor_refine) = cls_reg_targets_refine
+
+        # compute loss
+        losses_cls, losses_pts_init, losses_pts_refine = multi_apply(
+            self.loss_by_feat_single,
+            cls_scores,
+            pts_coordinate_preds_init,
+            pts_coordinate_preds_refine,
+            labels_list,
+            label_weights_list,
+            bbox_gt_list_init,
+            bbox_weights_list_init,
+            bbox_gt_list_refine,
+            bbox_weights_list_refine,
+            self.point_strides,
+            avg_factor_init=avg_factor_init,
+            avg_factor_refine=avg_factor_refine)
+        loss_dict_all = {
+            'loss_cls': losses_cls,
+            'loss_pts_init': losses_pts_init,
+            'loss_pts_refine': losses_pts_refine
+        }
+        return loss_dict_all
+
+    # Same as base_dense_head/_get_bboxes_single except self._bbox_decode
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigDict,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform outputs of a single image into bbox predictions.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image. RepPoints head does not need
+                this value.
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid, has shape
+                (num_priors, 2).
+            img_meta (dict): Image meta info.
+            cfg (:obj:`ConfigDict`): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_score_list) == len(bbox_pred_list)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_labels = []
+        for level_idx, (cls_score, bbox_pred, priors) in enumerate(
+                zip(cls_score_list, bbox_pred_list, mlvl_priors)):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)[:, :-1]
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, _, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+
+            bboxes = self._bbox_decode(priors, bbox_pred,
+                                       self.point_strides[level_idx],
+                                       img_shape)
+
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        results = InstanceData()
+        results.bboxes = torch.cat(mlvl_bboxes)
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+    def _bbox_decode(self, points: Tensor, bbox_pred: Tensor, stride: int,
+                     max_shape: Tuple[int, int]) -> Tensor:
+        """Decode the prediction to bounding box.
+
+        Args:
+            points (Tensor): shape (h_i * w_i, 2).
+            bbox_pred (Tensor): shape (h_i * w_i, 4).
+            stride (int): Stride for bbox_pred in different level.
+            max_shape (Tuple[int, int]): image shape.
+
+        Returns:
+            Tensor: Bounding boxes decoded.
+        """
+        bbox_pos_center = torch.cat([points[:, :2], points[:, :2]], dim=1)
+        bboxes = bbox_pred * stride + bbox_pos_center
+        x1 = bboxes[:, 0].clamp(min=0, max=max_shape[1])
+        y1 = bboxes[:, 1].clamp(min=0, max=max_shape[0])
+        x2 = bboxes[:, 2].clamp(min=0, max=max_shape[1])
+        y2 = bboxes[:, 3].clamp(min=0, max=max_shape[0])
+        decoded_bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+        return decoded_bboxes
diff --git a/mmde/mmdet/models/dense_heads/retina_head.py b/mmde/mmdet/models/dense_heads/retina_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..be3ae74d81ba38609646f0d0406098ecbdcef688
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/retina_head.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+
+from mmdet.registry import MODELS
+from .anchor_head import AnchorHead
+
+
+@MODELS.register_module()
+class RetinaHead(AnchorHead):
+    r"""An anchor-based head used in `RetinaNet
+    <https://arxiv.org/pdf/1708.02002.pdf>`_.
+
+    The head contains two subnetworks. The first classifies anchor boxes and
+    the second regresses deltas for the anchors.
+
+    Example:
+        >>> import torch
+        >>> self = RetinaHead(11, 7)
+        >>> x = torch.rand(1, 7, 32, 32)
+        >>> cls_score, bbox_pred = self.forward_single(x)
+        >>> # Each anchor predicts a score for each class except background
+        >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors
+        >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors
+        >>> assert cls_per_anchor == (self.num_classes)
+        >>> assert box_per_anchor == 4
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 anchor_generator=dict(
+                     type='AnchorGenerator',
+                     octave_base_scale=4,
+                     scales_per_octave=3,
+                     ratios=[0.5, 1.0, 2.0],
+                     strides=[8, 16, 32, 64, 128]),
+                 init_cfg=dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='retina_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs):
+        assert stacked_convs >= 0, \
+            '`stacked_convs` must be non-negative integers, ' \
+            f'but got {stacked_convs} instead.'
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        super(RetinaHead, self).__init__(
+            num_classes,
+            in_channels,
+            anchor_generator=anchor_generator,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        in_channels = self.in_channels
+        for i in range(self.stacked_convs):
+            self.cls_convs.append(
+                ConvModule(
+                    in_channels,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    in_channels,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            in_channels = self.feat_channels
+        self.retina_cls = nn.Conv2d(
+            in_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        reg_dim = self.bbox_coder.encode_size
+        self.retina_reg = nn.Conv2d(
+            in_channels, self.num_base_priors * reg_dim, 3, padding=1)
+
+    def forward_single(self, x):
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level
+                    the channels number is num_anchors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single scale
+                    level, the channels number is num_anchors * 4.
+        """
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.retina_cls(cls_feat)
+        bbox_pred = self.retina_reg(reg_feat)
+        return cls_score, bbox_pred
diff --git a/mmde/mmdet/models/dense_heads/retina_sepbn_head.py b/mmde/mmdet/models/dense_heads/retina_sepbn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..681a39983a08670adaa3e24a4099c4f26bc967ce
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/retina_sepbn_head.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import bias_init_with_prob, normal_init
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .anchor_head import AnchorHead
+
+
+@MODELS.register_module()
+class RetinaSepBNHead(AnchorHead):
+    """"RetinaHead with separate BN.
+
+    In RetinaHead, conv/norm layers are shared across different FPN levels,
+    while in RetinaSepBNHead, conv layers are shared across different FPN
+    levels, but BN layers are separated.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 num_ins: int,
+                 in_channels: int,
+                 stacked_convs: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.num_ins = num_ins
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.num_ins):
+            cls_convs = nn.ModuleList()
+            reg_convs = nn.ModuleList()
+            for j in range(self.stacked_convs):
+                chn = self.in_channels if j == 0 else self.feat_channels
+                cls_convs.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+                reg_convs.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+            self.cls_convs.append(cls_convs)
+            self.reg_convs.append(reg_convs)
+        for i in range(self.stacked_convs):
+            for j in range(1, self.num_ins):
+                self.cls_convs[j][i].conv = self.cls_convs[0][i].conv
+                self.reg_convs[j][i].conv = self.reg_convs[0][i].conv
+        self.retina_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.retina_reg = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        super().init_weights()
+        for m in self.cls_convs[0]:
+            normal_init(m.conv, std=0.01)
+        for m in self.reg_convs[0]:
+            normal_init(m.conv, std=0.01)
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.retina_cls, std=0.01, bias=bias_cls)
+        normal_init(self.retina_reg, std=0.01)
+
+    def forward(self, feats: Tuple[Tensor]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of classification scores and bbox prediction
+
+                - cls_scores (list[Tensor]): Classification scores for all
+                  scale levels, each is a 4D-tensor, the channels number is
+                  num_anchors * num_classes.
+                - bbox_preds (list[Tensor]): Box energies / deltas for all
+                  scale levels, each is a 4D-tensor, the channels number is
+                  num_anchors * 4.
+        """
+        cls_scores = []
+        bbox_preds = []
+        for i, x in enumerate(feats):
+            cls_feat = feats[i]
+            reg_feat = feats[i]
+            for cls_conv in self.cls_convs[i]:
+                cls_feat = cls_conv(cls_feat)
+            for reg_conv in self.reg_convs[i]:
+                reg_feat = reg_conv(reg_feat)
+            cls_score = self.retina_cls(cls_feat)
+            bbox_pred = self.retina_reg(reg_feat)
+            cls_scores.append(cls_score)
+            bbox_preds.append(bbox_pred)
+        return cls_scores, bbox_preds
diff --git a/mmde/mmdet/models/dense_heads/rpn_head.py b/mmde/mmdet/models/dense_heads/rpn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b544009d2ffc4c3c9065707a0a8a72c577eb432
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/rpn_head.py
@@ -0,0 +1,302 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.ops import batched_nms
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import (cat_boxes, empty_box_as, get_box_tensor,
+                                   get_box_wh, scale_boxes)
+from mmdet.utils import InstanceList, MultiConfig, OptInstanceList
+from .anchor_head import AnchorHead
+
+
+@MODELS.register_module()
+class RPNHead(AnchorHead):
+    """Implementation of RPN head.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        num_classes (int): Number of categories excluding the background
+            category. Defaults to 1.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
+            list[dict]): Initialization config dict.
+        num_convs (int): Number of convolution layers in the head.
+            Defaults to 1.
+    """  # noqa: W605
+
+    def __init__(self,
+                 in_channels: int,
+                 num_classes: int = 1,
+                 init_cfg: MultiConfig = dict(
+                     type='Normal', layer='Conv2d', std=0.01),
+                 num_convs: int = 1,
+                 **kwargs) -> None:
+        self.num_convs = num_convs
+        assert num_classes == 1
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        if self.num_convs > 1:
+            rpn_convs = []
+            for i in range(self.num_convs):
+                if i == 0:
+                    in_channels = self.in_channels
+                else:
+                    in_channels = self.feat_channels
+                # use ``inplace=False`` to avoid error: one of the variables
+                # needed for gradient computation has been modified by an
+                # inplace operation.
+                rpn_convs.append(
+                    ConvModule(
+                        in_channels,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        inplace=False))
+            self.rpn_conv = nn.Sequential(*rpn_convs)
+        else:
+            self.rpn_conv = nn.Conv2d(
+                self.in_channels, self.feat_channels, 3, padding=1)
+        self.rpn_cls = nn.Conv2d(self.feat_channels,
+                                 self.num_base_priors * self.cls_out_channels,
+                                 1)
+        reg_dim = self.bbox_coder.encode_size
+        self.rpn_reg = nn.Conv2d(self.feat_channels,
+                                 self.num_base_priors * reg_dim, 1)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level \
+                    the channels number is num_base_priors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single scale \
+                    level, the channels number is num_base_priors * 4.
+        """
+        x = self.rpn_conv(x)
+        x = F.relu(x)
+        rpn_cls_score = self.rpn_cls(x)
+        rpn_bbox_pred = self.rpn_reg(x)
+        return rpn_cls_score, rpn_bbox_pred
+
+    def loss_by_feat(self,
+                     cls_scores: List[Tensor],
+                     bbox_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict],
+                     batch_gt_instances_ignore: OptInstanceList = None) \
+            -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            batch_gt_instances (list[obj:InstanceData]): Batch of gt_instance.
+                It usually includes ``bboxes`` and ``labels`` attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[obj:InstanceData], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        losses = super().loss_by_feat(
+            cls_scores,
+            bbox_preds,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        return dict(
+            loss_rpn_cls=losses['loss_cls'], loss_rpn_bbox=losses['loss_bbox'])
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigDict,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Be compatible with
+                BaseDenseHead. Not used in RPNHead.
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid. In all
+                anchor-based methods, it has shape (num_priors, 4). In
+                all anchor-free methods, it has shape (num_priors, 2)
+                when `with_stride=True`, otherwise it still has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (ConfigDict, optional): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bbox_preds = []
+        mlvl_valid_priors = []
+        mlvl_scores = []
+        level_ids = []
+        for level_idx, (cls_score, bbox_pred, priors) in \
+                enumerate(zip(cls_score_list, bbox_pred_list,
+                              mlvl_priors)):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            reg_dim = self.bbox_coder.encode_size
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, reg_dim)
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                # remind that we set FG labels to [0] since mmdet v2.0
+                # BG cat_id: 1
+                scores = cls_score.softmax(-1)[:, :-1]
+
+            scores = torch.squeeze(scores)
+            if 0 < nms_pre < scores.shape[0]:
+                # sort is faster than topk
+                # _, topk_inds = scores.topk(cfg.nms_pre)
+                ranked_scores, rank_inds = scores.sort(descending=True)
+                topk_inds = rank_inds[:nms_pre]
+                scores = ranked_scores[:nms_pre]
+                bbox_pred = bbox_pred[topk_inds, :]
+                priors = priors[topk_inds]
+
+            mlvl_bbox_preds.append(bbox_pred)
+            mlvl_valid_priors.append(priors)
+            mlvl_scores.append(scores)
+
+            # use level id to implement the separate level nms
+            level_ids.append(
+                scores.new_full((scores.size(0), ),
+                                level_idx,
+                                dtype=torch.long))
+
+        bbox_pred = torch.cat(mlvl_bbox_preds)
+        priors = cat_boxes(mlvl_valid_priors)
+        bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape)
+
+        results = InstanceData()
+        results.bboxes = bboxes
+        results.scores = torch.cat(mlvl_scores)
+        results.level_ids = torch.cat(level_ids)
+
+        return self._bbox_post_process(
+            results=results, cfg=cfg, rescale=rescale, img_meta=img_meta)
+
+    def _bbox_post_process(self,
+                           results: InstanceData,
+                           cfg: ConfigDict,
+                           rescale: bool = False,
+                           with_nms: bool = True,
+                           img_meta: Optional[dict] = None) -> InstanceData:
+        """bbox post-processing method.
+
+        The boxes would be rescaled to the original image scale and do
+        the nms operation.
+
+        Args:
+            results (:obj:`InstaceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            cfg (ConfigDict): Test / postprocessing configuration.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default to True.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert with_nms, '`with_nms` must be True in RPNHead'
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            scale_factor = [1 / s for s in img_meta['scale_factor']]
+            results.bboxes = scale_boxes(results.bboxes, scale_factor)
+
+        # filter small size bboxes
+        if cfg.get('min_bbox_size', -1) >= 0:
+            w, h = get_box_wh(results.bboxes)
+            valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
+            if not valid_mask.all():
+                results = results[valid_mask]
+
+        if results.bboxes.numel() > 0:
+            bboxes = get_box_tensor(results.bboxes)
+            det_bboxes, keep_idxs = batched_nms(bboxes, results.scores,
+                                                results.level_ids, cfg.nms)
+            results = results[keep_idxs]
+            # some nms would reweight the score, such as softnms
+            results.scores = det_bboxes[:, -1]
+            results = results[:cfg.max_per_img]
+            # TODO: This would unreasonably show the 0th class label
+            #  in visualization
+            results.labels = results.scores.new_zeros(
+                len(results), dtype=torch.long)
+            del results.level_ids
+        else:
+            # To avoid some potential error
+            results_ = InstanceData()
+            results_.bboxes = empty_box_as(results.bboxes)
+            results_.scores = results.scores.new_zeros(0)
+            results_.labels = results.scores.new_zeros(0)
+            results = results_
+        return results
diff --git a/mmde/mmdet/models/dense_heads/rtmdet_head.py b/mmde/mmdet/models/dense_heads/rtmdet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae0ee6d2f35a0fa46ba0b8de21054433d0420b65
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/rtmdet_head.py
@@ -0,0 +1,692 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule, Scale, is_norm
+from mmengine.model import bias_init_with_prob, constant_init, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures.bbox import distance2bbox
+from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean
+from ..layers.transformer import inverse_sigmoid
+from ..task_modules import anchor_inside_flags
+from ..utils import (images_to_levels, multi_apply, sigmoid_geometric_mean,
+                     unmap)
+from .atss_head import ATSSHead
+
+
+@MODELS.register_module()
+class RTMDetHead(ATSSHead):
+    """Detection Head of RTMDet.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        with_objectness (bool): Whether to add an objectness branch.
+            Defaults to True.
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Default: dict(type='ReLU')
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 with_objectness: bool = True,
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 **kwargs) -> None:
+        self.act_cfg = act_cfg
+        self.with_objectness = with_objectness
+        super().__init__(num_classes, in_channels, **kwargs)
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg))
+        pred_pad_size = self.pred_kernel_size // 2
+        self.rtm_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            self.pred_kernel_size,
+            padding=pred_pad_size)
+        self.rtm_reg = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * 4,
+            self.pred_kernel_size,
+            padding=pred_pad_size)
+        if self.with_objectness:
+            self.rtm_obj = nn.Conv2d(
+                self.feat_channels,
+                1,
+                self.pred_kernel_size,
+                padding=pred_pad_size)
+
+        self.scales = nn.ModuleList(
+            [Scale(1.0) for _ in self.prior_generator.strides])
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, mean=0, std=0.01)
+            if is_norm(m):
+                constant_init(m, 1)
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.rtm_cls, std=0.01, bias=bias_cls)
+        normal_init(self.rtm_reg, std=0.01)
+        if self.with_objectness:
+            normal_init(self.rtm_obj, std=0.01, bias=bias_cls)
+
+    def forward(self, feats: Tuple[Tensor, ...]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of classification scores and bbox prediction
+            - cls_scores (list[Tensor]): Classification scores for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_base_priors * num_classes.
+            - bbox_preds (list[Tensor]): Box energies / deltas for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_base_priors * 4.
+        """
+
+        cls_scores = []
+        bbox_preds = []
+        for idx, (x, scale, stride) in enumerate(
+                zip(feats, self.scales, self.prior_generator.strides)):
+            cls_feat = x
+            reg_feat = x
+
+            for cls_layer in self.cls_convs:
+                cls_feat = cls_layer(cls_feat)
+            cls_score = self.rtm_cls(cls_feat)
+
+            for reg_layer in self.reg_convs:
+                reg_feat = reg_layer(reg_feat)
+
+            if self.with_objectness:
+                objectness = self.rtm_obj(reg_feat)
+                cls_score = inverse_sigmoid(
+                    sigmoid_geometric_mean(cls_score, objectness))
+
+            reg_dist = scale(self.rtm_reg(reg_feat).exp()).float() * stride[0]
+
+            cls_scores.append(cls_score)
+            bbox_preds.append(reg_dist)
+        return tuple(cls_scores), tuple(bbox_preds)
+
+    def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                            labels: Tensor, label_weights: Tensor,
+                            bbox_targets: Tensor, assign_metrics: Tensor,
+                            stride: List[int]):
+        """Compute loss of a single scale level.
+
+        Args:
+            cls_score (Tensor): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W).
+            bbox_pred (Tensor): Decoded bboxes for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors).
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (N, num_total_anchors, 4).
+            assign_metrics (Tensor): Assign metrics with shape
+                (N, num_total_anchors).
+            stride (List[int]): Downsample stride of the feature map.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert stride[0] == stride[1], 'h stride is not equal to w stride!'
+        cls_score = cls_score.permute(0, 2, 3, 1).reshape(
+            -1, self.cls_out_channels).contiguous()
+        bbox_pred = bbox_pred.reshape(-1, 4)
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        assign_metrics = assign_metrics.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        targets = (labels, assign_metrics)
+
+        loss_cls = self.loss_cls(
+            cls_score, targets, label_weights, avg_factor=1.0)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero().squeeze(1)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+
+            pos_decode_bbox_pred = pos_bbox_pred
+            pos_decode_bbox_targets = pos_bbox_targets
+
+            # regression loss
+            pos_bbox_weight = assign_metrics[pos_inds]
+
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_decode_bbox_targets,
+                weight=pos_bbox_weight,
+                avg_factor=1.0)
+        else:
+            loss_bbox = bbox_pred.sum() * 0
+            pos_bbox_weight = bbox_targets.new_tensor(0.)
+
+        return loss_cls, loss_bbox, assign_metrics.sum(), pos_bbox_weight.sum()
+
+    def loss_by_feat(self,
+                     cls_scores: List[Tensor],
+                     bbox_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict],
+                     batch_gt_instances_ignore: OptInstanceList = None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Decoded box for each scale
+                level with shape (N, num_anchors * 4, H, W) in
+                [tl_x, tl_y, br_x, br_y] format.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_imgs = len(batch_img_metas)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        flatten_cls_scores = torch.cat([
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  self.cls_out_channels)
+            for cls_score in cls_scores
+        ], 1)
+        decoded_bboxes = []
+        for anchor, bbox_pred in zip(anchor_list[0], bbox_preds):
+            anchor = anchor.reshape(-1, 4)
+            bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            bbox_pred = distance2bbox(anchor, bbox_pred)
+            decoded_bboxes.append(bbox_pred)
+
+        flatten_bboxes = torch.cat(decoded_bboxes, 1)
+
+        cls_reg_targets = self.get_targets(
+            flatten_cls_scores,
+            flatten_bboxes,
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         assign_metrics_list, sampling_results_list) = cls_reg_targets
+
+        losses_cls, losses_bbox,\
+            cls_avg_factors, bbox_avg_factors = multi_apply(
+                self.loss_by_feat_single,
+                cls_scores,
+                decoded_bboxes,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                assign_metrics_list,
+                self.prior_generator.strides)
+
+        cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item()
+        losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls))
+
+        bbox_avg_factor = reduce_mean(
+            sum(bbox_avg_factors)).clamp_(min=1).item()
+        losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox))
+        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+
+    def get_targets(self,
+                    cls_scores: Tensor,
+                    bbox_preds: Tensor,
+                    anchor_list: List[List[Tensor]],
+                    valid_flag_list: List[List[Tensor]],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs=True):
+        """Compute regression and classification targets for anchors in
+        multiple images.
+
+        Args:
+            cls_scores (Tensor): Classification predictions of images,
+                a 3D-Tensor with shape [num_imgs, num_priors, num_classes].
+            bbox_preds (Tensor): Decoded bboxes predictions of one image,
+                a 3D-Tensor with shape [num_imgs, num_priors, 4] in [tl_x,
+                tl_y, br_x, br_y] format.
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+
+        Returns:
+            tuple: a tuple containing learning targets.
+
+            - anchors_list (list[list[Tensor]]): Anchors of each level.
+            - labels_list (list[Tensor]): Labels of each level.
+            - label_weights_list (list[Tensor]): Label weights of each
+              level.
+            - bbox_targets_list (list[Tensor]): BBox targets of each level.
+            - assign_metrics_list (list[Tensor]): alignment metrics of each
+              level.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+
+        # concat all level anchors and flags to a single tensor
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            anchor_list[i] = torch.cat(anchor_list[i])
+            valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+        # anchor_list: list(b * [-1, 4])
+        (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+         all_assign_metrics, sampling_results_list) = multi_apply(
+             self._get_targets_single,
+             cls_scores.detach(),
+             bbox_preds.detach(),
+             anchor_list,
+             valid_flag_list,
+             batch_gt_instances,
+             batch_img_metas,
+             batch_gt_instances_ignore,
+             unmap_outputs=unmap_outputs)
+        # no valid anchors
+        if any([labels is None for labels in all_labels]):
+            return None
+
+        # split targets to a list w.r.t. multiple levels
+        anchors_list = images_to_levels(all_anchors, num_level_anchors)
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        assign_metrics_list = images_to_levels(all_assign_metrics,
+                                               num_level_anchors)
+
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, assign_metrics_list, sampling_results_list)
+
+    def _get_targets_single(self,
+                            cls_scores: Tensor,
+                            bbox_preds: Tensor,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs=True):
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            cls_scores (list(Tensor)): Box scores for each image.
+            bbox_preds (list(Tensor)): Box energies / deltas for each image.
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors ,4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                    shape (num_anchors,).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+
+        Returns:
+            tuple: N is the number of total anchors in the image.
+
+            - anchors (Tensor): All anchors in the image with shape (N, 4).
+            - labels (Tensor): Labels of all anchors in the image with shape
+              (N,).
+            - label_weights (Tensor): Label weights of all anchor in the
+              image with shape (N,).
+            - bbox_targets (Tensor): BBox targets of all anchors in the
+              image with shape (N, 4).
+            - norm_alignment_metrics (Tensor): Normalized alignment metrics
+              of all priors in the image with shape (N,).
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg['allowed_border'])
+        if not inside_flags.any():
+            return (None, ) * 7
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+
+        pred_instances = InstanceData(
+            scores=cls_scores[inside_flags, :],
+            bboxes=bbox_preds[inside_flags, :],
+            priors=anchors)
+
+        assign_result = self.assigner.assign(pred_instances, gt_instances,
+                                             gt_instances_ignore)
+
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+        assign_metrics = anchors.new_zeros(
+            num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            # point-based
+            pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        class_assigned_gt_inds = torch.unique(
+            sampling_result.pos_assigned_gt_inds)
+        for gt_inds in class_assigned_gt_inds:
+            gt_class_inds = pos_inds[sampling_result.pos_assigned_gt_inds ==
+                                     gt_inds]
+            assign_metrics[gt_class_inds] = assign_result.max_overlaps[
+                gt_class_inds]
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            anchors = unmap(anchors, num_total_anchors, inside_flags)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            assign_metrics = unmap(assign_metrics, num_total_anchors,
+                                   inside_flags)
+        return (anchors, labels, label_weights, bbox_targets, assign_metrics,
+                sampling_result)
+
+    def get_anchors(self,
+                    featmap_sizes: List[tuple],
+                    batch_img_metas: List[dict],
+                    device: Union[torch.device, str] = 'cuda') \
+            -> Tuple[List[List[Tensor]], List[List[Tensor]]]:
+        """Get anchors according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            batch_img_metas (list[dict]): Image meta info.
+            device (torch.device or str): Device for returned tensors.
+                Defaults to cuda.
+
+        Returns:
+            tuple:
+
+            - anchor_list (list[list[Tensor]]): Anchors of each image.
+            - valid_flag_list (list[list[Tensor]]): Valid flags of each
+              image.
+        """
+        num_imgs = len(batch_img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # anchors for one time
+        multi_level_anchors = self.prior_generator.grid_priors(
+            featmap_sizes, device=device, with_stride=True)
+        anchor_list = [multi_level_anchors for _ in range(num_imgs)]
+
+        # for each image, we compute valid flags of multi level anchors
+        valid_flag_list = []
+        for img_id, img_meta in enumerate(batch_img_metas):
+            multi_level_flags = self.prior_generator.valid_flags(
+                featmap_sizes, img_meta['pad_shape'], device)
+            valid_flag_list.append(multi_level_flags)
+        return anchor_list, valid_flag_list
+
+
+@MODELS.register_module()
+class RTMDetSepBNHead(RTMDetHead):
+    """RTMDetHead with separated BN layers and shared conv layers.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        share_conv (bool): Whether to share conv layers between stages.
+            Defaults to True.
+        use_depthwise (bool): Whether to use depthwise separable convolution in
+            head. Defaults to False.
+        norm_cfg (:obj:`ConfigDict` or dict)): Config dict for normalization
+            layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (:obj:`ConfigDict` or dict)): Config dict for activation layer.
+            Defaults to dict(type='SiLU').
+        pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 share_conv: bool = True,
+                 use_depthwise: bool = False,
+                 norm_cfg: ConfigType = dict(
+                     type='BN', momentum=0.03, eps=0.001),
+                 act_cfg: ConfigType = dict(type='SiLU'),
+                 pred_kernel_size: int = 1,
+                 exp_on_reg=False,
+                 **kwargs) -> None:
+        self.share_conv = share_conv
+        self.exp_on_reg = exp_on_reg
+        self.use_depthwise = use_depthwise
+        super().__init__(
+            num_classes,
+            in_channels,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            pred_kernel_size=pred_kernel_size,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        conv = DepthwiseSeparableConvModule \
+            if self.use_depthwise else ConvModule
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+
+        self.rtm_cls = nn.ModuleList()
+        self.rtm_reg = nn.ModuleList()
+        if self.with_objectness:
+            self.rtm_obj = nn.ModuleList()
+        for n in range(len(self.prior_generator.strides)):
+            cls_convs = nn.ModuleList()
+            reg_convs = nn.ModuleList()
+            for i in range(self.stacked_convs):
+                chn = self.in_channels if i == 0 else self.feat_channels
+                cls_convs.append(
+                    conv(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                reg_convs.append(
+                    conv(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+            self.cls_convs.append(cls_convs)
+            self.reg_convs.append(reg_convs)
+
+            self.rtm_cls.append(
+                nn.Conv2d(
+                    self.feat_channels,
+                    self.num_base_priors * self.cls_out_channels,
+                    self.pred_kernel_size,
+                    padding=self.pred_kernel_size // 2))
+            self.rtm_reg.append(
+                nn.Conv2d(
+                    self.feat_channels,
+                    self.num_base_priors * 4,
+                    self.pred_kernel_size,
+                    padding=self.pred_kernel_size // 2))
+            if self.with_objectness:
+                self.rtm_obj.append(
+                    nn.Conv2d(
+                        self.feat_channels,
+                        1,
+                        self.pred_kernel_size,
+                        padding=self.pred_kernel_size // 2))
+
+        if self.share_conv:
+            for n in range(len(self.prior_generator.strides)):
+                for i in range(self.stacked_convs):
+                    self.cls_convs[n][i].conv = self.cls_convs[0][i].conv
+                    self.reg_convs[n][i].conv = self.reg_convs[0][i].conv
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, mean=0, std=0.01)
+            if is_norm(m):
+                constant_init(m, 1)
+        bias_cls = bias_init_with_prob(0.01)
+        for rtm_cls, rtm_reg in zip(self.rtm_cls, self.rtm_reg):
+            normal_init(rtm_cls, std=0.01, bias=bias_cls)
+            normal_init(rtm_reg, std=0.01)
+        if self.with_objectness:
+            for rtm_obj in self.rtm_obj:
+                normal_init(rtm_obj, std=0.01, bias=bias_cls)
+
+    def forward(self, feats: Tuple[Tensor, ...]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of classification scores and bbox prediction
+
+            - cls_scores (tuple[Tensor]): Classification scores for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_anchors * num_classes.
+            - bbox_preds (tuple[Tensor]): Box energies / deltas for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_anchors * 4.
+        """
+
+        cls_scores = []
+        bbox_preds = []
+        for idx, (x, stride) in enumerate(
+                zip(feats, self.prior_generator.strides)):
+            cls_feat = x
+            reg_feat = x
+
+            for cls_layer in self.cls_convs[idx]:
+                cls_feat = cls_layer(cls_feat)
+            cls_score = self.rtm_cls[idx](cls_feat)
+
+            for reg_layer in self.reg_convs[idx]:
+                reg_feat = reg_layer(reg_feat)
+
+            if self.with_objectness:
+                objectness = self.rtm_obj[idx](reg_feat)
+                cls_score = inverse_sigmoid(
+                    sigmoid_geometric_mean(cls_score, objectness))
+            if self.exp_on_reg:
+                reg_dist = self.rtm_reg[idx](reg_feat).exp() * stride[0]
+            else:
+                reg_dist = self.rtm_reg[idx](reg_feat) * stride[0]
+            cls_scores.append(cls_score)
+            bbox_preds.append(reg_dist)
+        return tuple(cls_scores), tuple(bbox_preds)
diff --git a/mmde/mmdet/models/dense_heads/rtmdet_ins_head.py b/mmde/mmdet/models/dense_heads/rtmdet_ins_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..261a57fe485245dcbe41696c9237258f829ca25a
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/rtmdet_ins_head.py
@@ -0,0 +1,1034 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import math
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, is_norm
+from mmcv.ops import batched_nms
+from mmengine.model import (BaseModule, bias_init_with_prob, constant_init,
+                            normal_init)
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.layers.transformer import inverse_sigmoid
+from mmdet.models.utils import (filter_scores_and_topk, multi_apply,
+                                select_single_mlvl, sigmoid_geometric_mean)
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import (cat_boxes, distance2bbox, get_box_tensor,
+                                   get_box_wh, scale_boxes)
+from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean
+from .rtmdet_head import RTMDetHead
+
+
+@MODELS.register_module()
+class RTMDetInsHead(RTMDetHead):
+    """Detection Head of RTMDet-Ins.
+
+    Args:
+        num_prototypes (int): Number of mask prototype features extracted
+            from the mask head. Defaults to 8.
+        dyconv_channels (int): Channel of the dynamic conv layers.
+            Defaults to 8.
+        num_dyconvs (int): Number of the dynamic convolution layers.
+            Defaults to 3.
+        mask_loss_stride (int): Down sample stride of the masks for loss
+            computation. Defaults to 4.
+        loss_mask (:obj:`ConfigDict` or dict): Config dict for mask loss.
+    """
+
+    def __init__(self,
+                 *args,
+                 num_prototypes: int = 8,
+                 dyconv_channels: int = 8,
+                 num_dyconvs: int = 3,
+                 mask_loss_stride: int = 4,
+                 loss_mask=dict(
+                     type='DiceLoss',
+                     loss_weight=2.0,
+                     eps=5e-6,
+                     reduction='mean'),
+                 **kwargs) -> None:
+        self.num_prototypes = num_prototypes
+        self.num_dyconvs = num_dyconvs
+        self.dyconv_channels = dyconv_channels
+        self.mask_loss_stride = mask_loss_stride
+        super().__init__(*args, **kwargs)
+        self.loss_mask = MODELS.build(loss_mask)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        super()._init_layers()
+        # a branch to predict kernels of dynamic convs
+        self.kernel_convs = nn.ModuleList()
+        # calculate num dynamic parameters
+        weight_nums, bias_nums = [], []
+        for i in range(self.num_dyconvs):
+            if i == 0:
+                weight_nums.append(
+                    # mask prototype and coordinate features
+                    (self.num_prototypes + 2) * self.dyconv_channels)
+                bias_nums.append(self.dyconv_channels * 1)
+            elif i == self.num_dyconvs - 1:
+                weight_nums.append(self.dyconv_channels * 1)
+                bias_nums.append(1)
+            else:
+                weight_nums.append(self.dyconv_channels * self.dyconv_channels)
+                bias_nums.append(self.dyconv_channels * 1)
+        self.weight_nums = weight_nums
+        self.bias_nums = bias_nums
+        self.num_gen_params = sum(weight_nums) + sum(bias_nums)
+
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.kernel_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg))
+        pred_pad_size = self.pred_kernel_size // 2
+        self.rtm_kernel = nn.Conv2d(
+            self.feat_channels,
+            self.num_gen_params,
+            self.pred_kernel_size,
+            padding=pred_pad_size)
+        self.mask_head = MaskFeatModule(
+            in_channels=self.in_channels,
+            feat_channels=self.feat_channels,
+            stacked_convs=4,
+            num_levels=len(self.prior_generator.strides),
+            num_prototypes=self.num_prototypes,
+            act_cfg=self.act_cfg,
+            norm_cfg=self.norm_cfg)
+
+    def forward(self, feats: Tuple[Tensor, ...]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of classification scores and bbox prediction
+            - cls_scores (list[Tensor]): Classification scores for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_base_priors * num_classes.
+            - bbox_preds (list[Tensor]): Box energies / deltas for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_base_priors * 4.
+            - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_gen_params.
+            - mask_feat (Tensor): Output feature of the mask head. Each is a
+              4D-tensor, the channels number is num_prototypes.
+        """
+        mask_feat = self.mask_head(feats)
+
+        cls_scores = []
+        bbox_preds = []
+        kernel_preds = []
+        for idx, (x, scale, stride) in enumerate(
+                zip(feats, self.scales, self.prior_generator.strides)):
+            cls_feat = x
+            reg_feat = x
+            kernel_feat = x
+
+            for cls_layer in self.cls_convs:
+                cls_feat = cls_layer(cls_feat)
+            cls_score = self.rtm_cls(cls_feat)
+
+            for kernel_layer in self.kernel_convs:
+                kernel_feat = kernel_layer(kernel_feat)
+            kernel_pred = self.rtm_kernel(kernel_feat)
+
+            for reg_layer in self.reg_convs:
+                reg_feat = reg_layer(reg_feat)
+
+            if self.with_objectness:
+                objectness = self.rtm_obj(reg_feat)
+                cls_score = inverse_sigmoid(
+                    sigmoid_geometric_mean(cls_score, objectness))
+
+            reg_dist = scale(self.rtm_reg(reg_feat)) * stride[0]
+
+            cls_scores.append(cls_score)
+            bbox_preds.append(reg_dist)
+            kernel_preds.append(kernel_pred)
+        return tuple(cls_scores), tuple(bbox_preds), tuple(
+            kernel_preds), mask_feat
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        kernel_preds: List[Tensor],
+                        mask_feat: Tensor,
+                        score_factors: Optional[List[Tensor]] = None,
+                        batch_img_metas: Optional[List[dict]] = None,
+                        cfg: Optional[ConfigType] = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Note: When score_factors is not None, the cls_scores are
+        usually multiplied by it then obtain the real score used in NMS,
+        such as CenterNess in FCOS, IoU branch in ATSS.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            kernel_preds (list[Tensor]): Kernel predictions of dynamic
+                convs for all scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_params, H, W).
+            mask_feat (Tensor): Mask prototype features extracted from the
+                mask head, has shape (batch_size, num_prototypes, H, W).
+            score_factors (list[Tensor], optional): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 1, H, W). Defaults to None.
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, h, w).
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        if score_factors is None:
+            # e.g. Retina, FreeAnchor, Foveabox, etc.
+            with_score_factors = False
+        else:
+            # e.g. FCOS, PAA, ATSS, AutoAssign, etc.
+            with_score_factors = True
+            assert len(cls_scores) == len(score_factors)
+
+        num_levels = len(cls_scores)
+
+        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=cls_scores[0].dtype,
+            device=cls_scores[0].device,
+            with_stride=True)
+
+        result_list = []
+
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            cls_score_list = select_single_mlvl(
+                cls_scores, img_id, detach=True)
+            bbox_pred_list = select_single_mlvl(
+                bbox_preds, img_id, detach=True)
+            kernel_pred_list = select_single_mlvl(
+                kernel_preds, img_id, detach=True)
+            if with_score_factors:
+                score_factor_list = select_single_mlvl(
+                    score_factors, img_id, detach=True)
+            else:
+                score_factor_list = [None for _ in range(num_levels)]
+
+            results = self._predict_by_feat_single(
+                cls_score_list=cls_score_list,
+                bbox_pred_list=bbox_pred_list,
+                kernel_pred_list=kernel_pred_list,
+                mask_feat=mask_feat[img_id],
+                score_factor_list=score_factor_list,
+                mlvl_priors=mlvl_priors,
+                img_meta=img_meta,
+                cfg=cfg,
+                rescale=rescale,
+                with_nms=with_nms)
+            result_list.append(results)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                kernel_pred_list: List[Tensor],
+                                mask_feat: Tensor,
+                                score_factor_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigType,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox and mask results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            kernel_preds (list[Tensor]): Kernel predictions of dynamic
+                convs for all scale levels of a single image, each is a
+                4D-tensor, has shape (num_params, H, W).
+            mask_feat (Tensor): Mask prototype features of a single image
+                extracted from the mask head, has shape (num_prototypes, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid. In all
+                anchor-based methods, it has shape (num_priors, 4). In
+                all anchor-free methods, it has shape (num_priors, 2)
+                when `with_stride=True`, otherwise it still has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (mmengine.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, h, w).
+        """
+        if score_factor_list[0] is None:
+            # e.g. Retina, FreeAnchor, etc.
+            with_score_factors = False
+        else:
+            # e.g. FCOS, PAA, ATSS, etc.
+            with_score_factors = True
+
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bbox_preds = []
+        mlvl_kernels = []
+        mlvl_valid_priors = []
+        mlvl_scores = []
+        mlvl_labels = []
+        if with_score_factors:
+            mlvl_score_factors = []
+        else:
+            mlvl_score_factors = None
+
+        for level_idx, (cls_score, bbox_pred, kernel_pred,
+                        score_factor, priors) in \
+                enumerate(zip(cls_score_list, bbox_pred_list, kernel_pred_list,
+                              score_factor_list, mlvl_priors)):
+
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            dim = self.bbox_coder.encode_size
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim)
+            if with_score_factors:
+                score_factor = score_factor.permute(1, 2,
+                                                    0).reshape(-1).sigmoid()
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            kernel_pred = kernel_pred.permute(1, 2, 0).reshape(
+                -1, self.num_gen_params)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                # remind that we set FG labels to [0, num_class-1]
+                # since mmdet v2.0
+                # BG cat_id: num_class
+                scores = cls_score.softmax(-1)[:, :-1]
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            score_thr = cfg.get('score_thr', 0)
+
+            results = filter_scores_and_topk(
+                scores, score_thr, nms_pre,
+                dict(
+                    bbox_pred=bbox_pred,
+                    priors=priors,
+                    kernel_pred=kernel_pred))
+            scores, labels, keep_idxs, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+            kernel_pred = filtered_results['kernel_pred']
+
+            if with_score_factors:
+                score_factor = score_factor[keep_idxs]
+
+            mlvl_bbox_preds.append(bbox_pred)
+            mlvl_valid_priors.append(priors)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+            mlvl_kernels.append(kernel_pred)
+
+            if with_score_factors:
+                mlvl_score_factors.append(score_factor)
+
+        bbox_pred = torch.cat(mlvl_bbox_preds)
+        priors = cat_boxes(mlvl_valid_priors)
+        bboxes = self.bbox_coder.decode(
+            priors[..., :2], bbox_pred, max_shape=img_shape)
+
+        results = InstanceData()
+        results.bboxes = bboxes
+        results.priors = priors
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+        results.kernels = torch.cat(mlvl_kernels)
+        if with_score_factors:
+            results.score_factors = torch.cat(mlvl_score_factors)
+
+        return self._bbox_mask_post_process(
+            results=results,
+            mask_feat=mask_feat,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+    def _bbox_mask_post_process(
+            self,
+            results: InstanceData,
+            mask_feat,
+            cfg: ConfigType,
+            rescale: bool = False,
+            with_nms: bool = True,
+            img_meta: Optional[dict] = None) -> InstanceData:
+        """bbox and mask post-processing method.
+
+        The boxes would be rescaled to the original image scale and do
+        the nms operation. Usually `with_nms` is False is used for aug test.
+
+        Args:
+            results (:obj:`InstaceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            cfg (ConfigDict): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default to True.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, h, w).
+        """
+        stride = self.prior_generator.strides[0][0]
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            scale_factor = [1 / s for s in img_meta['scale_factor']]
+            results.bboxes = scale_boxes(results.bboxes, scale_factor)
+
+        if hasattr(results, 'score_factors'):
+            # TODO: Add sqrt operation in order to be consistent with
+            #  the paper.
+            score_factors = results.pop('score_factors')
+            results.scores = results.scores * score_factors
+
+        # filter small size bboxes
+        if cfg.get('min_bbox_size', -1) >= 0:
+            w, h = get_box_wh(results.bboxes)
+            valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
+            if not valid_mask.all():
+                results = results[valid_mask]
+
+        # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg
+        assert with_nms, 'with_nms must be True for RTMDet-Ins'
+        if results.bboxes.numel() > 0:
+            bboxes = get_box_tensor(results.bboxes)
+            det_bboxes, keep_idxs = batched_nms(bboxes, results.scores,
+                                                results.labels, cfg.nms)
+            results = results[keep_idxs]
+            # some nms would reweight the score, such as softnms
+            results.scores = det_bboxes[:, -1]
+            results = results[:cfg.max_per_img]
+
+            # process masks
+            mask_logits = self._mask_predict_by_feat_single(
+                mask_feat, results.kernels, results.priors)
+
+            mask_logits = F.interpolate(
+                mask_logits.unsqueeze(0), scale_factor=stride, mode='bilinear')
+            if rescale:
+                ori_h, ori_w = img_meta['ori_shape'][:2]
+                mask_logits = F.interpolate(
+                    mask_logits,
+                    size=[
+                        math.ceil(mask_logits.shape[-2] * scale_factor[0]),
+                        math.ceil(mask_logits.shape[-1] * scale_factor[1])
+                    ],
+                    mode='bilinear',
+                    align_corners=False)[..., :ori_h, :ori_w]
+            masks = mask_logits.sigmoid().squeeze(0)
+            masks = masks > cfg.mask_thr_binary
+            results.masks = masks
+        else:
+            h, w = img_meta['ori_shape'][:2] if rescale else img_meta[
+                'img_shape'][:2]
+            results.masks = torch.zeros(
+                size=(results.bboxes.shape[0], h, w),
+                dtype=torch.bool,
+                device=results.bboxes.device)
+
+        return results
+
+    def parse_dynamic_params(self, flatten_kernels: Tensor) -> tuple:
+        """split kernel head prediction to conv weight and bias."""
+        n_inst = flatten_kernels.size(0)
+        n_layers = len(self.weight_nums)
+        params_splits = list(
+            torch.split_with_sizes(
+                flatten_kernels, self.weight_nums + self.bias_nums, dim=1))
+        weight_splits = params_splits[:n_layers]
+        bias_splits = params_splits[n_layers:]
+        for i in range(n_layers):
+            if i < n_layers - 1:
+                weight_splits[i] = weight_splits[i].reshape(
+                    n_inst * self.dyconv_channels, -1, 1, 1)
+                bias_splits[i] = bias_splits[i].reshape(n_inst *
+                                                        self.dyconv_channels)
+            else:
+                weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1)
+                bias_splits[i] = bias_splits[i].reshape(n_inst)
+
+        return weight_splits, bias_splits
+
+    def _mask_predict_by_feat_single(self, mask_feat: Tensor, kernels: Tensor,
+                                     priors: Tensor) -> Tensor:
+        """Generate mask logits from mask features with dynamic convs.
+
+        Args:
+            mask_feat (Tensor): Mask prototype features.
+                Has shape (num_prototypes, H, W).
+            kernels (Tensor): Kernel parameters for each instance.
+                Has shape (num_instance, num_params)
+            priors (Tensor): Center priors for each instance.
+                Has shape (num_instance, 4).
+        Returns:
+            Tensor: Instance segmentation masks for each instance.
+                Has shape (num_instance, H, W).
+        """
+        num_inst = priors.shape[0]
+        h, w = mask_feat.size()[-2:]
+        if num_inst < 1:
+            return torch.empty(
+                size=(num_inst, h, w),
+                dtype=mask_feat.dtype,
+                device=mask_feat.device)
+        if len(mask_feat.shape) < 4:
+            mask_feat.unsqueeze(0)
+
+        coord = self.prior_generator.single_level_grid_priors(
+            (h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2)
+        num_inst = priors.shape[0]
+        points = priors[:, :2].reshape(-1, 1, 2)
+        strides = priors[:, 2:].reshape(-1, 1, 2)
+        relative_coord = (points - coord).permute(0, 2, 1) / (
+            strides[..., 0].reshape(-1, 1, 1) * 8)
+        relative_coord = relative_coord.reshape(num_inst, 2, h, w)
+
+        mask_feat = torch.cat(
+            [relative_coord,
+             mask_feat.repeat(num_inst, 1, 1, 1)], dim=1)
+        weights, biases = self.parse_dynamic_params(kernels)
+
+        n_layers = len(weights)
+        x = mask_feat.reshape(1, -1, h, w)
+        for i, (weight, bias) in enumerate(zip(weights, biases)):
+            x = F.conv2d(
+                x, weight, bias=bias, stride=1, padding=0, groups=num_inst)
+            if i < n_layers - 1:
+                x = F.relu(x)
+        x = x.reshape(num_inst, h, w)
+        return x
+
+    def loss_mask_by_feat(self, mask_feats: Tensor, flatten_kernels: Tensor,
+                          sampling_results_list: list,
+                          batch_gt_instances: InstanceList) -> Tensor:
+        """Compute instance segmentation loss.
+
+        Args:
+            mask_feats (list[Tensor]): Mask prototype features extracted from
+                the mask head. Has shape (N, num_prototypes, H, W)
+            flatten_kernels (list[Tensor]): Kernels of the dynamic conv layers.
+                Has shape (N, num_instances, num_params)
+            sampling_results_list (list[:obj:`SamplingResults`]) Batch of
+                assignment results.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            Tensor: The mask loss tensor.
+        """
+        batch_pos_mask_logits = []
+        pos_gt_masks = []
+        for idx, (mask_feat, kernels, sampling_results,
+                  gt_instances) in enumerate(
+                      zip(mask_feats, flatten_kernels, sampling_results_list,
+                          batch_gt_instances)):
+            pos_priors = sampling_results.pos_priors
+            pos_inds = sampling_results.pos_inds
+            pos_kernels = kernels[pos_inds]  # n_pos, num_gen_params
+            pos_mask_logits = self._mask_predict_by_feat_single(
+                mask_feat, pos_kernels, pos_priors)
+            if gt_instances.masks.numel() == 0:
+                gt_masks = torch.empty_like(gt_instances.masks)
+            else:
+                gt_masks = gt_instances.masks[
+                    sampling_results.pos_assigned_gt_inds, :]
+            batch_pos_mask_logits.append(pos_mask_logits)
+            pos_gt_masks.append(gt_masks)
+
+        pos_gt_masks = torch.cat(pos_gt_masks, 0)
+        batch_pos_mask_logits = torch.cat(batch_pos_mask_logits, 0)
+
+        # avg_factor
+        num_pos = batch_pos_mask_logits.shape[0]
+        num_pos = reduce_mean(mask_feats.new_tensor([num_pos
+                                                     ])).clamp_(min=1).item()
+
+        if batch_pos_mask_logits.shape[0] == 0:
+            return mask_feats.sum() * 0
+
+        scale = self.prior_generator.strides[0][0] // self.mask_loss_stride
+        # upsample pred masks
+        batch_pos_mask_logits = F.interpolate(
+            batch_pos_mask_logits.unsqueeze(0),
+            scale_factor=scale,
+            mode='bilinear',
+            align_corners=False).squeeze(0)
+        # downsample gt masks
+        pos_gt_masks = pos_gt_masks[:, self.mask_loss_stride //
+                                    2::self.mask_loss_stride,
+                                    self.mask_loss_stride //
+                                    2::self.mask_loss_stride]
+
+        loss_mask = self.loss_mask(
+            batch_pos_mask_logits,
+            pos_gt_masks,
+            weight=None,
+            avg_factor=num_pos)
+
+        return loss_mask
+
+    def loss_by_feat(self,
+                     cls_scores: List[Tensor],
+                     bbox_preds: List[Tensor],
+                     kernel_preds: List[Tensor],
+                     mask_feat: Tensor,
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict],
+                     batch_gt_instances_ignore: OptInstanceList = None):
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Decoded box for each scale
+                level with shape (N, num_anchors * 4, H, W) in
+                [tl_x, tl_y, br_x, br_y] format.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_imgs = len(batch_img_metas)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        flatten_cls_scores = torch.cat([
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  self.cls_out_channels)
+            for cls_score in cls_scores
+        ], 1)
+        flatten_kernels = torch.cat([
+            kernel_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                    self.num_gen_params)
+            for kernel_pred in kernel_preds
+        ], 1)
+        decoded_bboxes = []
+        for anchor, bbox_pred in zip(anchor_list[0], bbox_preds):
+            anchor = anchor.reshape(-1, 4)
+            bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            bbox_pred = distance2bbox(anchor, bbox_pred)
+            decoded_bboxes.append(bbox_pred)
+
+        flatten_bboxes = torch.cat(decoded_bboxes, 1)
+        for gt_instances in batch_gt_instances:
+            gt_instances.masks = gt_instances.masks.to_tensor(
+                dtype=torch.bool, device=device)
+
+        cls_reg_targets = self.get_targets(
+            flatten_cls_scores,
+            flatten_bboxes,
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         assign_metrics_list, sampling_results_list) = cls_reg_targets
+
+        losses_cls, losses_bbox,\
+            cls_avg_factors, bbox_avg_factors = multi_apply(
+                self.loss_by_feat_single,
+                cls_scores,
+                decoded_bboxes,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                assign_metrics_list,
+                self.prior_generator.strides)
+
+        cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item()
+        losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls))
+
+        bbox_avg_factor = reduce_mean(
+            sum(bbox_avg_factors)).clamp_(min=1).item()
+        losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox))
+
+        loss_mask = self.loss_mask_by_feat(mask_feat, flatten_kernels,
+                                           sampling_results_list,
+                                           batch_gt_instances)
+        loss = dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_mask=loss_mask)
+        return loss
+
+
+class MaskFeatModule(BaseModule):
+    """Mask feature head used in RTMDet-Ins.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels of the mask feature
+             map branch.
+        num_levels (int): The starting feature map level from RPN that
+             will be used to predict the mask feature map.
+        num_prototypes (int): Number of output channel of the mask feature
+             map branch. This is the channel count of the mask
+             feature map that to be dynamically convolved with the predicted
+             kernel.
+        stacked_convs (int): Number of convs in mask feature branch.
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Default: dict(type='ReLU', inplace=True)
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        feat_channels: int = 256,
+        stacked_convs: int = 4,
+        num_levels: int = 3,
+        num_prototypes: int = 8,
+        act_cfg: ConfigType = dict(type='ReLU', inplace=True),
+        norm_cfg: ConfigType = dict(type='BN')
+    ) -> None:
+        super().__init__(init_cfg=None)
+        self.num_levels = num_levels
+        self.fusion_conv = nn.Conv2d(num_levels * in_channels, in_channels, 1)
+        convs = []
+        for i in range(stacked_convs):
+            in_c = in_channels if i == 0 else feat_channels
+            convs.append(
+                ConvModule(
+                    in_c,
+                    feat_channels,
+                    3,
+                    padding=1,
+                    act_cfg=act_cfg,
+                    norm_cfg=norm_cfg))
+        self.stacked_convs = nn.Sequential(*convs)
+        self.projection = nn.Conv2d(
+            feat_channels, num_prototypes, kernel_size=1)
+
+    def forward(self, features: Tuple[Tensor, ...]) -> Tensor:
+        # multi-level feature fusion
+        fusion_feats = [features[0]]
+        size = features[0].shape[-2:]
+        for i in range(1, self.num_levels):
+            f = F.interpolate(features[i], size=size, mode='bilinear')
+            fusion_feats.append(f)
+        fusion_feats = torch.cat(fusion_feats, dim=1)
+        fusion_feats = self.fusion_conv(fusion_feats)
+        # pred mask feats
+        mask_features = self.stacked_convs(fusion_feats)
+        mask_features = self.projection(mask_features)
+        return mask_features
+
+
+@MODELS.register_module()
+class RTMDetInsSepBNHead(RTMDetInsHead):
+    """Detection Head of RTMDet-Ins with sep-bn layers.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        share_conv (bool): Whether to share conv layers between stages.
+            Defaults to True.
+        norm_cfg (:obj:`ConfigDict` or dict)): Config dict for normalization
+            layer. Defaults to dict(type='BN').
+        act_cfg (:obj:`ConfigDict` or dict)): Config dict for activation layer.
+            Defaults to dict(type='SiLU', inplace=True).
+        pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 share_conv: bool = True,
+                 with_objectness: bool = False,
+                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
+                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
+                 pred_kernel_size: int = 1,
+                 **kwargs) -> None:
+        self.share_conv = share_conv
+        super().__init__(
+            num_classes,
+            in_channels,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            pred_kernel_size=pred_kernel_size,
+            with_objectness=with_objectness,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        self.kernel_convs = nn.ModuleList()
+
+        self.rtm_cls = nn.ModuleList()
+        self.rtm_reg = nn.ModuleList()
+        self.rtm_kernel = nn.ModuleList()
+        self.rtm_obj = nn.ModuleList()
+
+        # calculate num dynamic parameters
+        weight_nums, bias_nums = [], []
+        for i in range(self.num_dyconvs):
+            if i == 0:
+                weight_nums.append(
+                    (self.num_prototypes + 2) * self.dyconv_channels)
+                bias_nums.append(self.dyconv_channels)
+            elif i == self.num_dyconvs - 1:
+                weight_nums.append(self.dyconv_channels)
+                bias_nums.append(1)
+            else:
+                weight_nums.append(self.dyconv_channels * self.dyconv_channels)
+                bias_nums.append(self.dyconv_channels)
+        self.weight_nums = weight_nums
+        self.bias_nums = bias_nums
+        self.num_gen_params = sum(weight_nums) + sum(bias_nums)
+        pred_pad_size = self.pred_kernel_size // 2
+
+        for n in range(len(self.prior_generator.strides)):
+            cls_convs = nn.ModuleList()
+            reg_convs = nn.ModuleList()
+            kernel_convs = nn.ModuleList()
+            for i in range(self.stacked_convs):
+                chn = self.in_channels if i == 0 else self.feat_channels
+                cls_convs.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                reg_convs.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                kernel_convs.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+            self.cls_convs.append(cls_convs)
+            self.reg_convs.append(cls_convs)
+            self.kernel_convs.append(kernel_convs)
+
+            self.rtm_cls.append(
+                nn.Conv2d(
+                    self.feat_channels,
+                    self.num_base_priors * self.cls_out_channels,
+                    self.pred_kernel_size,
+                    padding=pred_pad_size))
+            self.rtm_reg.append(
+                nn.Conv2d(
+                    self.feat_channels,
+                    self.num_base_priors * 4,
+                    self.pred_kernel_size,
+                    padding=pred_pad_size))
+            self.rtm_kernel.append(
+                nn.Conv2d(
+                    self.feat_channels,
+                    self.num_gen_params,
+                    self.pred_kernel_size,
+                    padding=pred_pad_size))
+            if self.with_objectness:
+                self.rtm_obj.append(
+                    nn.Conv2d(
+                        self.feat_channels,
+                        1,
+                        self.pred_kernel_size,
+                        padding=pred_pad_size))
+
+        if self.share_conv:
+            for n in range(len(self.prior_generator.strides)):
+                for i in range(self.stacked_convs):
+                    self.cls_convs[n][i].conv = self.cls_convs[0][i].conv
+                    self.reg_convs[n][i].conv = self.reg_convs[0][i].conv
+
+        self.mask_head = MaskFeatModule(
+            in_channels=self.in_channels,
+            feat_channels=self.feat_channels,
+            stacked_convs=4,
+            num_levels=len(self.prior_generator.strides),
+            num_prototypes=self.num_prototypes,
+            act_cfg=self.act_cfg,
+            norm_cfg=self.norm_cfg)
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, mean=0, std=0.01)
+            if is_norm(m):
+                constant_init(m, 1)
+        bias_cls = bias_init_with_prob(0.01)
+        for rtm_cls, rtm_reg, rtm_kernel in zip(self.rtm_cls, self.rtm_reg,
+                                                self.rtm_kernel):
+            normal_init(rtm_cls, std=0.01, bias=bias_cls)
+            normal_init(rtm_reg, std=0.01, bias=1)
+        if self.with_objectness:
+            for rtm_obj in self.rtm_obj:
+                normal_init(rtm_obj, std=0.01, bias=bias_cls)
+
+    def forward(self, feats: Tuple[Tensor, ...]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of classification scores and bbox prediction
+            - cls_scores (list[Tensor]): Classification scores for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_base_priors * num_classes.
+            - bbox_preds (list[Tensor]): Box energies / deltas for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_base_priors * 4.
+            - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale
+              levels, each is a 4D-tensor, the channels number is
+              num_gen_params.
+            - mask_feat (Tensor): Output feature of the mask head. Each is a
+              4D-tensor, the channels number is num_prototypes.
+        """
+        mask_feat = self.mask_head(feats)
+
+        cls_scores = []
+        bbox_preds = []
+        kernel_preds = []
+        for idx, (x, stride) in enumerate(
+                zip(feats, self.prior_generator.strides)):
+            cls_feat = x
+            reg_feat = x
+            kernel_feat = x
+
+            for cls_layer in self.cls_convs[idx]:
+                cls_feat = cls_layer(cls_feat)
+            cls_score = self.rtm_cls[idx](cls_feat)
+
+            for kernel_layer in self.kernel_convs[idx]:
+                kernel_feat = kernel_layer(kernel_feat)
+            kernel_pred = self.rtm_kernel[idx](kernel_feat)
+
+            for reg_layer in self.reg_convs[idx]:
+                reg_feat = reg_layer(reg_feat)
+
+            if self.with_objectness:
+                objectness = self.rtm_obj[idx](reg_feat)
+                cls_score = inverse_sigmoid(
+                    sigmoid_geometric_mean(cls_score, objectness))
+
+            reg_dist = F.relu(self.rtm_reg[idx](reg_feat)) * stride[0]
+
+            cls_scores.append(cls_score)
+            bbox_preds.append(reg_dist)
+            kernel_preds.append(kernel_pred)
+        return tuple(cls_scores), tuple(bbox_preds), tuple(
+            kernel_preds), mask_feat
diff --git a/mmde/mmdet/models/dense_heads/sabl_retina_head.py b/mmde/mmdet/models/dense_heads/sabl_retina_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cd1b71cc2c80035a0378180da70caddf853375d
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/sabl_retina_head.py
@@ -0,0 +1,706 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
+                         OptInstanceList)
+from ..task_modules.samplers import PseudoSampler
+from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply,
+                     unmap)
+from .base_dense_head import BaseDenseHead
+from .guided_anchor_head import GuidedAnchorHead
+
+
+@MODELS.register_module()
+class SABLRetinaHead(BaseDenseHead):
+    """Side-Aware Boundary Localization (SABL) for RetinaNet.
+
+    The anchor generation, assigning and sampling in SABLRetinaHead
+    are the same as GuidedAnchorHead for guided anchoring.
+
+    Please refer to https://arxiv.org/abs/1912.04260 for more details.
+
+    Args:
+        num_classes (int): Number of classes.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): Number of Convs for classification and
+            regression branches. Defaults to 4.
+        feat_channels (int): Number of hidden channels. Defaults to 256.
+        approx_anchor_generator (:obj:`ConfigType` or dict): Config dict for
+            approx generator.
+        square_anchor_generator (:obj:`ConfigDict` or dict): Config dict for
+            square generator.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            ConvModule. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            Norm Layer. Defaults to None.
+        bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Default False. It should be ``True`` when
+            using ``IoULoss``, ``GIoULoss``, or ``DIoULoss`` in the bbox head.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
+            SABLRetinaHead.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            SABLRetinaHead.
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox_cls (:obj:`ConfigDict` or dict): Config of classification
+            loss for bbox branch.
+        loss_bbox_reg (:obj:`ConfigDict` or dict): Config of regression loss
+            for bbox branch.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        in_channels: int,
+        stacked_convs: int = 4,
+        feat_channels: int = 256,
+        approx_anchor_generator: ConfigType = dict(
+            type='AnchorGenerator',
+            octave_base_scale=4,
+            scales_per_octave=3,
+            ratios=[0.5, 1.0, 2.0],
+            strides=[8, 16, 32, 64, 128]),
+        square_anchor_generator: ConfigType = dict(
+            type='AnchorGenerator',
+            ratios=[1.0],
+            scales=[4],
+            strides=[8, 16, 32, 64, 128]),
+        conv_cfg: OptConfigType = None,
+        norm_cfg: OptConfigType = None,
+        bbox_coder: ConfigType = dict(
+            type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0),
+        reg_decoded_bbox: bool = False,
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        loss_cls: ConfigType = dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox_cls: ConfigType = dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5),
+        loss_bbox_reg: ConfigType = dict(
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5),
+        init_cfg: MultiConfig = dict(
+            type='Normal',
+            layer='Conv2d',
+            std=0.01,
+            override=dict(
+                type='Normal', name='retina_cls', std=0.01, bias_prob=0.01))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.feat_channels = feat_channels
+        self.num_buckets = bbox_coder['num_buckets']
+        self.side_num = int(np.ceil(self.num_buckets / 2))
+
+        assert (approx_anchor_generator['octave_base_scale'] ==
+                square_anchor_generator['scales'][0])
+        assert (approx_anchor_generator['strides'] ==
+                square_anchor_generator['strides'])
+
+        self.approx_anchor_generator = TASK_UTILS.build(
+            approx_anchor_generator)
+        self.square_anchor_generator = TASK_UTILS.build(
+            square_anchor_generator)
+        self.approxs_per_octave = (
+            self.approx_anchor_generator.num_base_priors[0])
+
+        # one anchor per location
+        self.num_base_priors = self.square_anchor_generator.num_base_priors[0]
+
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        self.reg_decoded_bbox = reg_decoded_bbox
+
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox_cls = MODELS.build(loss_bbox_cls)
+        self.loss_bbox_reg = MODELS.build(loss_bbox_reg)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            # use PseudoSampler when sampling is False
+            if 'sampler' in self.train_cfg:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler(context=self)
+
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.retina_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+        self.retina_bbox_reg = nn.Conv2d(
+            self.feat_channels, self.side_num * 4, 3, padding=1)
+        self.retina_bbox_cls = nn.Conv2d(
+            self.feat_channels, self.side_num * 4, 3, padding=1)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.retina_cls(cls_feat)
+        bbox_cls_pred = self.retina_bbox_cls(reg_feat)
+        bbox_reg_pred = self.retina_bbox_reg(reg_feat)
+        bbox_pred = (bbox_cls_pred, bbox_reg_pred)
+        return cls_score, bbox_pred
+
+    def forward(self, feats: List[Tensor]) -> Tuple[List[Tensor]]:
+        return multi_apply(self.forward_single, feats)
+
+    def get_anchors(
+        self,
+        featmap_sizes: List[tuple],
+        img_metas: List[dict],
+        device: Union[torch.device, str] = 'cuda'
+    ) -> Tuple[List[List[Tensor]], List[List[Tensor]]]:
+        """Get squares according to feature map sizes and guided anchors.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+            device (torch.device | str): device for returned tensors
+
+        Returns:
+            tuple: square approxs of each image
+        """
+        num_imgs = len(img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # squares for one time
+        multi_level_squares = self.square_anchor_generator.grid_priors(
+            featmap_sizes, device=device)
+        squares_list = [multi_level_squares for _ in range(num_imgs)]
+
+        return squares_list
+
+    def get_targets(self,
+                    approx_list: List[List[Tensor]],
+                    inside_flag_list: List[List[Tensor]],
+                    square_list: List[List[Tensor]],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas,
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs=True) -> tuple:
+        """Compute bucketing targets.
+
+        Args:
+            approx_list (list[list[Tensor]]): Multi level approxs of each
+                image.
+            inside_flag_list (list[list[Tensor]]): Multi level inside flags of
+                each image.
+            square_list (list[list[Tensor]]): Multi level squares of each
+                image.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+
+        Returns:
+            tuple: Returns a tuple containing learning targets.
+
+            - labels_list (list[Tensor]): Labels of each level.
+            - label_weights_list (list[Tensor]): Label weights of each level.
+            - bbox_cls_targets_list (list[Tensor]): BBox cls targets of \
+            each level.
+            - bbox_cls_weights_list (list[Tensor]): BBox cls weights of \
+            each level.
+            - bbox_reg_targets_list (list[Tensor]): BBox reg targets of \
+            each level.
+            - bbox_reg_weights_list (list[Tensor]): BBox reg weights of \
+            each level.
+            - num_total_pos (int): Number of positive samples in all images.
+            - num_total_neg (int): Number of negative samples in all images.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(approx_list) == len(inside_flag_list) == len(
+            square_list) == num_imgs
+        # anchor number of multi levels
+        num_level_squares = [squares.size(0) for squares in square_list[0]]
+        # concat all level anchors and flags to a single tensor
+        inside_flag_flat_list = []
+        approx_flat_list = []
+        square_flat_list = []
+        for i in range(num_imgs):
+            assert len(square_list[i]) == len(inside_flag_list[i])
+            inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
+            approx_flat_list.append(torch.cat(approx_list[i]))
+            square_flat_list.append(torch.cat(square_list[i]))
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None for _ in range(num_imgs)]
+        (all_labels, all_label_weights, all_bbox_cls_targets,
+         all_bbox_cls_weights, all_bbox_reg_targets, all_bbox_reg_weights,
+         pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply(
+             self._get_targets_single,
+             approx_flat_list,
+             inside_flag_flat_list,
+             square_flat_list,
+             batch_gt_instances,
+             batch_img_metas,
+             batch_gt_instances_ignore,
+             unmap_outputs=unmap_outputs)
+
+        # sampled anchors of all images
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        # split targets to a list w.r.t. multiple levels
+        labels_list = images_to_levels(all_labels, num_level_squares)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_squares)
+        bbox_cls_targets_list = images_to_levels(all_bbox_cls_targets,
+                                                 num_level_squares)
+        bbox_cls_weights_list = images_to_levels(all_bbox_cls_weights,
+                                                 num_level_squares)
+        bbox_reg_targets_list = images_to_levels(all_bbox_reg_targets,
+                                                 num_level_squares)
+        bbox_reg_weights_list = images_to_levels(all_bbox_reg_weights,
+                                                 num_level_squares)
+        return (labels_list, label_weights_list, bbox_cls_targets_list,
+                bbox_cls_weights_list, bbox_reg_targets_list,
+                bbox_reg_weights_list, avg_factor)
+
+    def _get_targets_single(self,
+                            flat_approxs: Tensor,
+                            inside_flags: Tensor,
+                            flat_squares: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Args:
+            flat_approxs (Tensor): flat approxs of a single image,
+                shape (n, 4)
+            inside_flags (Tensor): inside flags of a single image,
+                shape (n, ).
+            flat_squares (Tensor): flat squares of a single image,
+                shape (approxs_per_octave * n, 4)
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.  Defaults to True.
+
+        Returns:
+            tuple:
+
+            - labels_list (Tensor): Labels in a single image.
+            - label_weights (Tensor): Label weights in a single image.
+            - bbox_cls_targets (Tensor): BBox cls targets in a single image.
+            - bbox_cls_weights (Tensor): BBox cls weights in a single image.
+            - bbox_reg_targets (Tensor): BBox reg targets in a single image.
+            - bbox_reg_weights (Tensor): BBox reg weights in a single image.
+            - num_total_pos (int): Number of positive samples in a single \
+            image.
+            - num_total_neg (int): Number of negative samples in a single \
+            image.
+            - sampling_result (:obj:`SamplingResult`): Sampling result object.
+        """
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+        # assign gt and sample anchors
+        num_square = flat_squares.size(0)
+        approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4)
+        approxs = approxs[inside_flags, ...]
+        squares = flat_squares[inside_flags, :]
+
+        pred_instances = InstanceData()
+        pred_instances.priors = squares
+        pred_instances.approxs = approxs
+        assign_result = self.assigner.assign(pred_instances, gt_instances,
+                                             gt_instances_ignore)
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_squares = squares.shape[0]
+        bbox_cls_targets = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        bbox_cls_weights = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        bbox_reg_targets = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        bbox_reg_weights = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        labels = squares.new_full((num_valid_squares, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = squares.new_zeros(num_valid_squares, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            (pos_bbox_reg_targets, pos_bbox_reg_weights, pos_bbox_cls_targets,
+             pos_bbox_cls_weights) = self.bbox_coder.encode(
+                 sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+
+            bbox_cls_targets[pos_inds, :] = pos_bbox_cls_targets
+            bbox_reg_targets[pos_inds, :] = pos_bbox_reg_targets
+            bbox_cls_weights[pos_inds, :] = pos_bbox_cls_weights
+            bbox_reg_weights[pos_inds, :] = pos_bbox_reg_weights
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_squares.size(0)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_cls_targets = unmap(bbox_cls_targets, num_total_anchors,
+                                     inside_flags)
+            bbox_cls_weights = unmap(bbox_cls_weights, num_total_anchors,
+                                     inside_flags)
+            bbox_reg_targets = unmap(bbox_reg_targets, num_total_anchors,
+                                     inside_flags)
+            bbox_reg_weights = unmap(bbox_reg_weights, num_total_anchors,
+                                     inside_flags)
+        return (labels, label_weights, bbox_cls_targets, bbox_cls_weights,
+                bbox_reg_targets, bbox_reg_weights, pos_inds, neg_inds,
+                sampling_result)
+
+    def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                            labels: Tensor, label_weights: Tensor,
+                            bbox_cls_targets: Tensor, bbox_cls_weights: Tensor,
+                            bbox_reg_targets: Tensor, bbox_reg_weights: Tensor,
+                            avg_factor: float) -> Tuple[Tensor]:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            cls_score (Tensor): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W).
+            bbox_pred (Tensor): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            labels (Tensor): Labels in a single image.
+            label_weights (Tensor): Label weights in a single level.
+            bbox_cls_targets (Tensor): BBox cls targets in a single level.
+            bbox_cls_weights (Tensor): BBox cls weights in a single level.
+            bbox_reg_targets (Tensor): BBox reg targets in a single level.
+            bbox_reg_weights (Tensor): BBox reg weights in a single level.
+            avg_factor (int): Average factor that is used to average the loss.
+
+        Returns:
+            tuple: loss components.
+        """
+        # classification loss
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        cls_score = cls_score.permute(0, 2, 3,
+                                      1).reshape(-1, self.cls_out_channels)
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=avg_factor)
+        # regression loss
+        bbox_cls_targets = bbox_cls_targets.reshape(-1, self.side_num * 4)
+        bbox_cls_weights = bbox_cls_weights.reshape(-1, self.side_num * 4)
+        bbox_reg_targets = bbox_reg_targets.reshape(-1, self.side_num * 4)
+        bbox_reg_weights = bbox_reg_weights.reshape(-1, self.side_num * 4)
+        (bbox_cls_pred, bbox_reg_pred) = bbox_pred
+        bbox_cls_pred = bbox_cls_pred.permute(0, 2, 3, 1).reshape(
+            -1, self.side_num * 4)
+        bbox_reg_pred = bbox_reg_pred.permute(0, 2, 3, 1).reshape(
+            -1, self.side_num * 4)
+        loss_bbox_cls = self.loss_bbox_cls(
+            bbox_cls_pred,
+            bbox_cls_targets.long(),
+            bbox_cls_weights,
+            avg_factor=avg_factor * 4 * self.side_num)
+        loss_bbox_reg = self.loss_bbox_reg(
+            bbox_reg_pred,
+            bbox_reg_targets,
+            bbox_reg_weights,
+            avg_factor=avg_factor * 4 * self.bbox_coder.offset_topk)
+        return loss_cls, loss_bbox_cls, loss_bbox_reg
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.approx_anchor_generator.num_levels
+
+        device = cls_scores[0].device
+
+        # get sampled approxes
+        approxs_list, inside_flag_list = GuidedAnchorHead.get_sampled_approxs(
+            self, featmap_sizes, batch_img_metas, device=device)
+
+        square_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        cls_reg_targets = self.get_targets(
+            approxs_list,
+            inside_flag_list,
+            square_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        (labels_list, label_weights_list, bbox_cls_targets_list,
+         bbox_cls_weights_list, bbox_reg_targets_list, bbox_reg_weights_list,
+         avg_factor) = cls_reg_targets
+
+        losses_cls, losses_bbox_cls, losses_bbox_reg = multi_apply(
+            self.loss_by_feat_single,
+            cls_scores,
+            bbox_preds,
+            labels_list,
+            label_weights_list,
+            bbox_cls_targets_list,
+            bbox_cls_weights_list,
+            bbox_reg_targets_list,
+            bbox_reg_weights_list,
+            avg_factor=avg_factor)
+        return dict(
+            loss_cls=losses_cls,
+            loss_bbox_cls=losses_bbox_cls,
+            loss_bbox_reg=losses_bbox_reg)
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        batch_img_metas: List[dict],
+                        cfg: Optional[ConfigDict] = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Note: When score_factors is not None, the cls_scores are
+        usually multiplied by it then obtain the real score used in NMS,
+        such as CenterNess in FCOS, IoU branch in ATSS.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+            cfg (:obj:`ConfigDict`, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_scores) == len(bbox_preds)
+        num_levels = len(cls_scores)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+
+        device = cls_scores[0].device
+        mlvl_anchors = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_score_list = [
+                cls_scores[i][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_cls_pred_list = [
+                bbox_preds[i][0][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_reg_pred_list = [
+                bbox_preds[i][1][img_id].detach() for i in range(num_levels)
+            ]
+            proposals = self._predict_by_feat_single(
+                cls_scores=cls_score_list,
+                bbox_cls_preds=bbox_cls_pred_list,
+                bbox_reg_preds=bbox_reg_pred_list,
+                mlvl_anchors=mlvl_anchors[img_id],
+                img_meta=batch_img_metas[img_id],
+                cfg=cfg,
+                rescale=rescale,
+                with_nms=with_nms)
+            result_list.append(proposals)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_scores: List[Tensor],
+                                bbox_cls_preds: List[Tensor],
+                                bbox_reg_preds: List[Tensor],
+                                mlvl_anchors: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigDict,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        cfg = self.test_cfg if cfg is None else cfg
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_confids = []
+        mlvl_labels = []
+        assert len(cls_scores) == len(bbox_cls_preds) == len(
+            bbox_reg_preds) == len(mlvl_anchors)
+        for cls_score, bbox_cls_pred, bbox_reg_pred, anchors in zip(
+                cls_scores, bbox_cls_preds, bbox_reg_preds, mlvl_anchors):
+            assert cls_score.size()[-2:] == bbox_cls_pred.size(
+            )[-2:] == bbox_reg_pred.size()[-2::]
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)[:, :-1]
+            bbox_cls_pred = bbox_cls_pred.permute(1, 2, 0).reshape(
+                -1, self.side_num * 4)
+            bbox_reg_pred = bbox_reg_pred.permute(1, 2, 0).reshape(
+                -1, self.side_num * 4)
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(
+                    anchors=anchors,
+                    bbox_cls_pred=bbox_cls_pred,
+                    bbox_reg_pred=bbox_reg_pred))
+            scores, labels, _, filtered_results = results
+
+            anchors = filtered_results['anchors']
+            bbox_cls_pred = filtered_results['bbox_cls_pred']
+            bbox_reg_pred = filtered_results['bbox_reg_pred']
+
+            bbox_preds = [
+                bbox_cls_pred.contiguous(),
+                bbox_reg_pred.contiguous()
+            ]
+            bboxes, confids = self.bbox_coder.decode(
+                anchors.contiguous(),
+                bbox_preds,
+                max_shape=img_meta['img_shape'])
+
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_confids.append(confids)
+            mlvl_labels.append(labels)
+
+        results = InstanceData()
+        results.bboxes = torch.cat(mlvl_bboxes)
+        results.scores = torch.cat(mlvl_scores)
+        results.score_factors = torch.cat(mlvl_confids)
+        results.labels = torch.cat(mlvl_labels)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
diff --git a/mmde/mmdet/models/dense_heads/solo_head.py b/mmde/mmdet/models/dense_heads/solo_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cf338451358b01899faa4b299d33fafd7262d21
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/solo_head.py
@@ -0,0 +1,1263 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import mmcv
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.utils.misc import floordiv
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType
+from ..layers import mask_matrix_nms
+from ..utils import center_of_mass, generate_coordinate, multi_apply
+from .base_mask_head import BaseMaskHead
+
+
+@MODELS.register_module()
+class SOLOHead(BaseMaskHead):
+    """SOLO mask head used in `SOLO: Segmenting Objects by Locations.
+
+    <https://arxiv.org/abs/1912.04488>`_
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels. Used in child classes.
+            Defaults to 256.
+        stacked_convs (int): Number of stacking convs of the head.
+            Defaults to 4.
+        strides (tuple): Downsample factor of each feature map.
+        scale_ranges (tuple[tuple[int, int]]): Area range of multiple
+            level masks, in the format [(min1, max1), (min2, max2), ...].
+            A range of (16, 64) means the area range between (16, 64).
+        pos_scale (float): Constant scale factor to control the center region.
+        num_grids (list[int]): Divided image into a uniform grids, each
+            feature map has a different grid value. The number of output
+            channels is grid ** 2. Defaults to [40, 36, 24, 16, 12].
+        cls_down_index (int): The index of downsample operation in
+            classification branch. Defaults to 0.
+        loss_mask (dict): Config of mask loss.
+        loss_cls (dict): Config of classification loss.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Defaults to norm_cfg=dict(type='GN', num_groups=32,
+            requires_grad=True).
+        train_cfg (dict): Training config of head.
+        test_cfg (dict): Testing config of head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        in_channels: int,
+        feat_channels: int = 256,
+        stacked_convs: int = 4,
+        strides: tuple = (4, 8, 16, 32, 64),
+        scale_ranges: tuple = ((8, 32), (16, 64), (32, 128), (64, 256), (128,
+                                                                         512)),
+        pos_scale: float = 0.2,
+        num_grids: list = [40, 36, 24, 16, 12],
+        cls_down_index: int = 0,
+        loss_mask: ConfigType = dict(
+            type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
+        loss_cls: ConfigType = dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        norm_cfg: ConfigType = dict(
+            type='GN', num_groups=32, requires_grad=True),
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        init_cfg: MultiConfig = [
+            dict(type='Normal', layer='Conv2d', std=0.01),
+            dict(
+                type='Normal',
+                std=0.01,
+                bias_prob=0.01,
+                override=dict(name='conv_mask_list')),
+            dict(
+                type='Normal',
+                std=0.01,
+                bias_prob=0.01,
+                override=dict(name='conv_cls'))
+        ]
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.cls_out_channels = self.num_classes
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.strides = strides
+        self.num_grids = num_grids
+        # number of FPN feats
+        self.num_levels = len(strides)
+        assert self.num_levels == len(scale_ranges) == len(num_grids)
+        self.scale_ranges = scale_ranges
+        self.pos_scale = pos_scale
+
+        self.cls_down_index = cls_down_index
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_mask = MODELS.build(loss_mask)
+        self.norm_cfg = norm_cfg
+        self.init_cfg = init_cfg
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.mask_convs = nn.ModuleList()
+        self.cls_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels + 2 if i == 0 else self.feat_channels
+            self.mask_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+        self.conv_mask_list = nn.ModuleList()
+        for num_grid in self.num_grids:
+            self.conv_mask_list.append(
+                nn.Conv2d(self.feat_channels, num_grid**2, 1))
+
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+    def resize_feats(self, x: Tuple[Tensor]) -> List[Tensor]:
+        """Downsample the first feat and upsample last feat in feats.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            list[Tensor]: Features after resizing, each is a 4D-tensor.
+        """
+        out = []
+        for i in range(len(x)):
+            if i == 0:
+                out.append(
+                    F.interpolate(x[0], scale_factor=0.5, mode='bilinear'))
+            elif i == len(x) - 1:
+                out.append(
+                    F.interpolate(
+                        x[i], size=x[i - 1].shape[-2:], mode='bilinear'))
+            else:
+                out.append(x[i])
+        return out
+
+    def forward(self, x: Tuple[Tensor]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores and mask prediction.
+
+                - mlvl_mask_preds (list[Tensor]): Multi-level mask prediction.
+                  Each element in the list has shape
+                  (batch_size, num_grids**2 ,h ,w).
+                - mlvl_cls_preds (list[Tensor]): Multi-level scores.
+                  Each element in the list has shape
+                  (batch_size, num_classes, num_grids ,num_grids).
+        """
+        assert len(x) == self.num_levels
+        feats = self.resize_feats(x)
+        mlvl_mask_preds = []
+        mlvl_cls_preds = []
+        for i in range(self.num_levels):
+            x = feats[i]
+            mask_feat = x
+            cls_feat = x
+            # generate and concat the coordinate
+            coord_feat = generate_coordinate(mask_feat.size(),
+                                             mask_feat.device)
+            mask_feat = torch.cat([mask_feat, coord_feat], 1)
+
+            for mask_layer in (self.mask_convs):
+                mask_feat = mask_layer(mask_feat)
+
+            mask_feat = F.interpolate(
+                mask_feat, scale_factor=2, mode='bilinear')
+            mask_preds = self.conv_mask_list[i](mask_feat)
+
+            # cls branch
+            for j, cls_layer in enumerate(self.cls_convs):
+                if j == self.cls_down_index:
+                    num_grid = self.num_grids[i]
+                    cls_feat = F.interpolate(
+                        cls_feat, size=num_grid, mode='bilinear')
+                cls_feat = cls_layer(cls_feat)
+
+            cls_pred = self.conv_cls(cls_feat)
+
+            if not self.training:
+                feat_wh = feats[0].size()[-2:]
+                upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2)
+                mask_preds = F.interpolate(
+                    mask_preds.sigmoid(), size=upsampled_size, mode='bilinear')
+                cls_pred = cls_pred.sigmoid()
+                # get local maximum
+                local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1)
+                keep_mask = local_max[:, :, :-1, :-1] == cls_pred
+                cls_pred = cls_pred * keep_mask
+
+            mlvl_mask_preds.append(mask_preds)
+            mlvl_cls_preds.append(cls_pred)
+        return mlvl_mask_preds, mlvl_cls_preds
+
+    def loss_by_feat(self, mlvl_mask_preds: List[Tensor],
+                     mlvl_cls_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict], **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the mask head.
+
+        Args:
+            mlvl_mask_preds (list[Tensor]): Multi-level mask prediction.
+                Each element in the list has shape
+                (batch_size, num_grids**2 ,h ,w).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``masks``,
+                and ``labels`` attributes.
+            batch_img_metas (list[dict]): Meta information of multiple images.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_levels = self.num_levels
+        num_imgs = len(batch_img_metas)
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in mlvl_mask_preds]
+
+        # `BoolTensor` in `pos_masks` represent
+        # whether the corresponding point is
+        # positive
+        pos_mask_targets, labels, pos_masks = multi_apply(
+            self._get_targets_single,
+            batch_gt_instances,
+            featmap_sizes=featmap_sizes)
+
+        # change from the outside list meaning multi images
+        # to the outside list meaning multi levels
+        mlvl_pos_mask_targets = [[] for _ in range(num_levels)]
+        mlvl_pos_mask_preds = [[] for _ in range(num_levels)]
+        mlvl_pos_masks = [[] for _ in range(num_levels)]
+        mlvl_labels = [[] for _ in range(num_levels)]
+        for img_id in range(num_imgs):
+            assert num_levels == len(pos_mask_targets[img_id])
+            for lvl in range(num_levels):
+                mlvl_pos_mask_targets[lvl].append(
+                    pos_mask_targets[img_id][lvl])
+                mlvl_pos_mask_preds[lvl].append(
+                    mlvl_mask_preds[lvl][img_id, pos_masks[img_id][lvl], ...])
+                mlvl_pos_masks[lvl].append(pos_masks[img_id][lvl].flatten())
+                mlvl_labels[lvl].append(labels[img_id][lvl].flatten())
+
+        # cat multiple image
+        temp_mlvl_cls_preds = []
+        for lvl in range(num_levels):
+            mlvl_pos_mask_targets[lvl] = torch.cat(
+                mlvl_pos_mask_targets[lvl], dim=0)
+            mlvl_pos_mask_preds[lvl] = torch.cat(
+                mlvl_pos_mask_preds[lvl], dim=0)
+            mlvl_pos_masks[lvl] = torch.cat(mlvl_pos_masks[lvl], dim=0)
+            mlvl_labels[lvl] = torch.cat(mlvl_labels[lvl], dim=0)
+            temp_mlvl_cls_preds.append(mlvl_cls_preds[lvl].permute(
+                0, 2, 3, 1).reshape(-1, self.cls_out_channels))
+
+        num_pos = sum(item.sum() for item in mlvl_pos_masks)
+        # dice loss
+        loss_mask = []
+        for pred, target in zip(mlvl_pos_mask_preds, mlvl_pos_mask_targets):
+            if pred.size()[0] == 0:
+                loss_mask.append(pred.sum().unsqueeze(0))
+                continue
+            loss_mask.append(
+                self.loss_mask(pred, target, reduction_override='none'))
+        if num_pos > 0:
+            loss_mask = torch.cat(loss_mask).sum() / num_pos
+        else:
+            loss_mask = torch.cat(loss_mask).mean()
+
+        flatten_labels = torch.cat(mlvl_labels)
+        flatten_cls_preds = torch.cat(temp_mlvl_cls_preds)
+        loss_cls = self.loss_cls(
+            flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1)
+        return dict(loss_mask=loss_mask, loss_cls=loss_cls)
+
+    def _get_targets_single(self,
+                            gt_instances: InstanceData,
+                            featmap_sizes: Optional[list] = None) -> tuple:
+        """Compute targets for predictions of single image.
+
+        Args:
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes``, ``labels``,
+                and ``masks`` attributes.
+            featmap_sizes (list[:obj:`torch.size`]): Size of each
+                feature map from feature pyramid, each element
+                means (feat_h, feat_w). Defaults to None.
+
+        Returns:
+            Tuple: Usually returns a tuple containing targets for predictions.
+
+                - mlvl_pos_mask_targets (list[Tensor]): Each element represent
+                  the binary mask targets for positive points in this
+                  level, has shape (num_pos, out_h, out_w).
+                - mlvl_labels (list[Tensor]): Each element is
+                  classification labels for all
+                  points in this level, has shape
+                  (num_grid, num_grid).
+                - mlvl_pos_masks (list[Tensor]): Each element is
+                  a `BoolTensor` to represent whether the
+                  corresponding point in single level
+                  is positive, has shape (num_grid **2).
+        """
+        gt_labels = gt_instances.labels
+        device = gt_labels.device
+
+        gt_bboxes = gt_instances.bboxes
+        gt_areas = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
+                              (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
+
+        gt_masks = gt_instances.masks.to_tensor(
+            dtype=torch.bool, device=device)
+
+        mlvl_pos_mask_targets = []
+        mlvl_labels = []
+        mlvl_pos_masks = []
+        for (lower_bound, upper_bound), stride, featmap_size, num_grid \
+                in zip(self.scale_ranges, self.strides,
+                       featmap_sizes, self.num_grids):
+
+            mask_target = torch.zeros(
+                [num_grid**2, featmap_size[0], featmap_size[1]],
+                dtype=torch.uint8,
+                device=device)
+            # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+            labels = torch.zeros([num_grid, num_grid],
+                                 dtype=torch.int64,
+                                 device=device) + self.num_classes
+            pos_mask = torch.zeros([num_grid**2],
+                                   dtype=torch.bool,
+                                   device=device)
+
+            gt_inds = ((gt_areas >= lower_bound) &
+                       (gt_areas <= upper_bound)).nonzero().flatten()
+            if len(gt_inds) == 0:
+                mlvl_pos_mask_targets.append(
+                    mask_target.new_zeros(0, featmap_size[0], featmap_size[1]))
+                mlvl_labels.append(labels)
+                mlvl_pos_masks.append(pos_mask)
+                continue
+            hit_gt_bboxes = gt_bboxes[gt_inds]
+            hit_gt_labels = gt_labels[gt_inds]
+            hit_gt_masks = gt_masks[gt_inds, ...]
+
+            pos_w_ranges = 0.5 * (hit_gt_bboxes[:, 2] -
+                                  hit_gt_bboxes[:, 0]) * self.pos_scale
+            pos_h_ranges = 0.5 * (hit_gt_bboxes[:, 3] -
+                                  hit_gt_bboxes[:, 1]) * self.pos_scale
+
+            # Make sure hit_gt_masks has a value
+            valid_mask_flags = hit_gt_masks.sum(dim=-1).sum(dim=-1) > 0
+            output_stride = stride / 2
+
+            for gt_mask, gt_label, pos_h_range, pos_w_range, \
+                valid_mask_flag in \
+                    zip(hit_gt_masks, hit_gt_labels, pos_h_ranges,
+                        pos_w_ranges, valid_mask_flags):
+                if not valid_mask_flag:
+                    continue
+                upsampled_size = (featmap_sizes[0][0] * 4,
+                                  featmap_sizes[0][1] * 4)
+                center_h, center_w = center_of_mass(gt_mask)
+
+                coord_w = int(
+                    floordiv((center_w / upsampled_size[1]), (1. / num_grid),
+                             rounding_mode='trunc'))
+                coord_h = int(
+                    floordiv((center_h / upsampled_size[0]), (1. / num_grid),
+                             rounding_mode='trunc'))
+
+                # left, top, right, down
+                top_box = max(
+                    0,
+                    int(
+                        floordiv(
+                            (center_h - pos_h_range) / upsampled_size[0],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                down_box = min(
+                    num_grid - 1,
+                    int(
+                        floordiv(
+                            (center_h + pos_h_range) / upsampled_size[0],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                left_box = max(
+                    0,
+                    int(
+                        floordiv(
+                            (center_w - pos_w_range) / upsampled_size[1],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                right_box = min(
+                    num_grid - 1,
+                    int(
+                        floordiv(
+                            (center_w + pos_w_range) / upsampled_size[1],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+
+                top = max(top_box, coord_h - 1)
+                down = min(down_box, coord_h + 1)
+                left = max(coord_w - 1, left_box)
+                right = min(right_box, coord_w + 1)
+
+                labels[top:(down + 1), left:(right + 1)] = gt_label
+                # ins
+                gt_mask = np.uint8(gt_mask.cpu().numpy())
+                # Follow the original implementation, F.interpolate is
+                # different from cv2 and opencv
+                gt_mask = mmcv.imrescale(gt_mask, scale=1. / output_stride)
+                gt_mask = torch.from_numpy(gt_mask).to(device=device)
+
+                for i in range(top, down + 1):
+                    for j in range(left, right + 1):
+                        index = int(i * num_grid + j)
+                        mask_target[index, :gt_mask.shape[0], :gt_mask.
+                                    shape[1]] = gt_mask
+                        pos_mask[index] = True
+            mlvl_pos_mask_targets.append(mask_target[pos_mask])
+            mlvl_labels.append(labels)
+            mlvl_pos_masks.append(pos_mask)
+        return mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks
+
+    def predict_by_feat(self, mlvl_mask_preds: List[Tensor],
+                        mlvl_cls_scores: List[Tensor],
+                        batch_img_metas: List[dict], **kwargs) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        mask results.
+
+        Args:
+            mlvl_mask_preds (list[Tensor]): Multi-level mask prediction.
+                Each element in the list has shape
+                (batch_size, num_grids**2 ,h ,w).
+            mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids ,num_grids).
+            batch_img_metas (list[dict]): Meta information of all images.
+
+        Returns:
+            list[:obj:`InstanceData`]: Processed results of multiple
+            images.Each :obj:`InstanceData` usually contains
+            following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        mlvl_cls_scores = [
+            item.permute(0, 2, 3, 1) for item in mlvl_cls_scores
+        ]
+        assert len(mlvl_mask_preds) == len(mlvl_cls_scores)
+        num_levels = len(mlvl_cls_scores)
+
+        results_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_pred_list = [
+                mlvl_cls_scores[lvl][img_id].view(-1, self.cls_out_channels)
+                for lvl in range(num_levels)
+            ]
+            mask_pred_list = [
+                mlvl_mask_preds[lvl][img_id] for lvl in range(num_levels)
+            ]
+
+            cls_pred_list = torch.cat(cls_pred_list, dim=0)
+            mask_pred_list = torch.cat(mask_pred_list, dim=0)
+            img_meta = batch_img_metas[img_id]
+
+            results = self._predict_by_feat_single(
+                cls_pred_list, mask_pred_list, img_meta=img_meta)
+            results_list.append(results)
+
+        return results_list
+
+    def _predict_by_feat_single(self,
+                                cls_scores: Tensor,
+                                mask_preds: Tensor,
+                                img_meta: dict,
+                                cfg: OptConfigType = None) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        mask results.
+
+        Args:
+            cls_scores (Tensor): Classification score of all points
+                in single image, has shape (num_points, num_classes).
+            mask_preds (Tensor): Mask prediction of all points in
+                single image, has shape (num_points, feat_h, feat_w).
+            img_meta (dict): Meta information of corresponding image.
+            cfg (dict, optional): Config used in test phase.
+                Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Processed results of single image.
+             it usually contains following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+
+        def empty_results(cls_scores, ori_shape):
+            """Generate a empty results."""
+            results = InstanceData()
+            results.scores = cls_scores.new_ones(0)
+            results.masks = cls_scores.new_zeros(0, *ori_shape)
+            results.labels = cls_scores.new_ones(0)
+            results.bboxes = cls_scores.new_zeros(0, 4)
+            return results
+
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_scores) == len(mask_preds)
+
+        featmap_size = mask_preds.size()[-2:]
+
+        h, w = img_meta['img_shape'][:2]
+        upsampled_size = (featmap_size[0] * 4, featmap_size[1] * 4)
+
+        score_mask = (cls_scores > cfg.score_thr)
+        cls_scores = cls_scores[score_mask]
+        if len(cls_scores) == 0:
+            return empty_results(cls_scores, img_meta['ori_shape'][:2])
+
+        inds = score_mask.nonzero()
+        cls_labels = inds[:, 1]
+
+        # Filter the mask mask with an area is smaller than
+        # stride of corresponding feature level
+        lvl_interval = cls_labels.new_tensor(self.num_grids).pow(2).cumsum(0)
+        strides = cls_scores.new_ones(lvl_interval[-1])
+        strides[:lvl_interval[0]] *= self.strides[0]
+        for lvl in range(1, self.num_levels):
+            strides[lvl_interval[lvl -
+                                 1]:lvl_interval[lvl]] *= self.strides[lvl]
+        strides = strides[inds[:, 0]]
+        mask_preds = mask_preds[inds[:, 0]]
+
+        masks = mask_preds > cfg.mask_thr
+        sum_masks = masks.sum((1, 2)).float()
+        keep = sum_masks > strides
+        if keep.sum() == 0:
+            return empty_results(cls_scores, img_meta['ori_shape'][:2])
+        masks = masks[keep]
+        mask_preds = mask_preds[keep]
+        sum_masks = sum_masks[keep]
+        cls_scores = cls_scores[keep]
+        cls_labels = cls_labels[keep]
+
+        # maskness.
+        mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks
+        cls_scores *= mask_scores
+
+        scores, labels, _, keep_inds = mask_matrix_nms(
+            masks,
+            cls_labels,
+            cls_scores,
+            mask_area=sum_masks,
+            nms_pre=cfg.nms_pre,
+            max_num=cfg.max_per_img,
+            kernel=cfg.kernel,
+            sigma=cfg.sigma,
+            filter_thr=cfg.filter_thr)
+        # mask_matrix_nms may return an empty Tensor
+        if len(keep_inds) == 0:
+            return empty_results(cls_scores, img_meta['ori_shape'][:2])
+        mask_preds = mask_preds[keep_inds]
+        mask_preds = F.interpolate(
+            mask_preds.unsqueeze(0), size=upsampled_size,
+            mode='bilinear')[:, :, :h, :w]
+        mask_preds = F.interpolate(
+            mask_preds, size=img_meta['ori_shape'][:2],
+            mode='bilinear').squeeze(0)
+        masks = mask_preds > cfg.mask_thr
+
+        results = InstanceData()
+        results.masks = masks
+        results.labels = labels
+        results.scores = scores
+        # create an empty bbox in InstanceData to avoid bugs when
+        # calculating metrics.
+        results.bboxes = results.scores.new_zeros(len(scores), 4)
+        return results
+
+
+@MODELS.register_module()
+class DecoupledSOLOHead(SOLOHead):
+    """Decoupled SOLO mask head used in `SOLO: Segmenting Objects by Locations.
+
+    <https://arxiv.org/abs/1912.04488>`_
+
+    Args:
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 *args,
+                 init_cfg: MultiConfig = [
+                     dict(type='Normal', layer='Conv2d', std=0.01),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_mask_list_x')),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_mask_list_y')),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_cls'))
+                 ],
+                 **kwargs) -> None:
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self) -> None:
+        self.mask_convs_x = nn.ModuleList()
+        self.mask_convs_y = nn.ModuleList()
+        self.cls_convs = nn.ModuleList()
+
+        for i in range(self.stacked_convs):
+            chn = self.in_channels + 1 if i == 0 else self.feat_channels
+            self.mask_convs_x.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+            self.mask_convs_y.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+
+        self.conv_mask_list_x = nn.ModuleList()
+        self.conv_mask_list_y = nn.ModuleList()
+        for num_grid in self.num_grids:
+            self.conv_mask_list_x.append(
+                nn.Conv2d(self.feat_channels, num_grid, 3, padding=1))
+            self.conv_mask_list_y.append(
+                nn.Conv2d(self.feat_channels, num_grid, 3, padding=1))
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores and mask prediction.
+
+                - mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction
+                  from x branch. Each element in the list has shape
+                  (batch_size, num_grids ,h ,w).
+                - mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction
+                  from y branch. Each element in the list has shape
+                  (batch_size, num_grids ,h ,w).
+                - mlvl_cls_preds (list[Tensor]): Multi-level scores.
+                  Each element in the list has shape
+                  (batch_size, num_classes, num_grids ,num_grids).
+        """
+        assert len(x) == self.num_levels
+        feats = self.resize_feats(x)
+        mask_preds_x = []
+        mask_preds_y = []
+        cls_preds = []
+        for i in range(self.num_levels):
+            x = feats[i]
+            mask_feat = x
+            cls_feat = x
+            # generate and concat the coordinate
+            coord_feat = generate_coordinate(mask_feat.size(),
+                                             mask_feat.device)
+            mask_feat_x = torch.cat([mask_feat, coord_feat[:, 0:1, ...]], 1)
+            mask_feat_y = torch.cat([mask_feat, coord_feat[:, 1:2, ...]], 1)
+
+            for mask_layer_x, mask_layer_y in \
+                    zip(self.mask_convs_x, self.mask_convs_y):
+                mask_feat_x = mask_layer_x(mask_feat_x)
+                mask_feat_y = mask_layer_y(mask_feat_y)
+
+            mask_feat_x = F.interpolate(
+                mask_feat_x, scale_factor=2, mode='bilinear')
+            mask_feat_y = F.interpolate(
+                mask_feat_y, scale_factor=2, mode='bilinear')
+
+            mask_pred_x = self.conv_mask_list_x[i](mask_feat_x)
+            mask_pred_y = self.conv_mask_list_y[i](mask_feat_y)
+
+            # cls branch
+            for j, cls_layer in enumerate(self.cls_convs):
+                if j == self.cls_down_index:
+                    num_grid = self.num_grids[i]
+                    cls_feat = F.interpolate(
+                        cls_feat, size=num_grid, mode='bilinear')
+                cls_feat = cls_layer(cls_feat)
+
+            cls_pred = self.conv_cls(cls_feat)
+
+            if not self.training:
+                feat_wh = feats[0].size()[-2:]
+                upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2)
+                mask_pred_x = F.interpolate(
+                    mask_pred_x.sigmoid(),
+                    size=upsampled_size,
+                    mode='bilinear')
+                mask_pred_y = F.interpolate(
+                    mask_pred_y.sigmoid(),
+                    size=upsampled_size,
+                    mode='bilinear')
+                cls_pred = cls_pred.sigmoid()
+                # get local maximum
+                local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1)
+                keep_mask = local_max[:, :, :-1, :-1] == cls_pred
+                cls_pred = cls_pred * keep_mask
+
+            mask_preds_x.append(mask_pred_x)
+            mask_preds_y.append(mask_pred_y)
+            cls_preds.append(cls_pred)
+        return mask_preds_x, mask_preds_y, cls_preds
+
+    def loss_by_feat(self, mlvl_mask_preds_x: List[Tensor],
+                     mlvl_mask_preds_y: List[Tensor],
+                     mlvl_cls_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict], **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the mask head.
+
+        Args:
+            mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction
+                from x branch. Each element in the list has shape
+                (batch_size, num_grids ,h ,w).
+            mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction
+                from y branch. Each element in the list has shape
+                (batch_size, num_grids ,h ,w).
+            mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids ,num_grids).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``masks``,
+                and ``labels`` attributes.
+            batch_img_metas (list[dict]): Meta information of multiple images.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_levels = self.num_levels
+        num_imgs = len(batch_img_metas)
+        featmap_sizes = [featmap.size()[-2:] for featmap in mlvl_mask_preds_x]
+
+        pos_mask_targets, labels, xy_pos_indexes = multi_apply(
+            self._get_targets_single,
+            batch_gt_instances,
+            featmap_sizes=featmap_sizes)
+
+        # change from the outside list meaning multi images
+        # to the outside list meaning multi levels
+        mlvl_pos_mask_targets = [[] for _ in range(num_levels)]
+        mlvl_pos_mask_preds_x = [[] for _ in range(num_levels)]
+        mlvl_pos_mask_preds_y = [[] for _ in range(num_levels)]
+        mlvl_labels = [[] for _ in range(num_levels)]
+        for img_id in range(num_imgs):
+
+            for lvl in range(num_levels):
+                mlvl_pos_mask_targets[lvl].append(
+                    pos_mask_targets[img_id][lvl])
+                mlvl_pos_mask_preds_x[lvl].append(
+                    mlvl_mask_preds_x[lvl][img_id,
+                                           xy_pos_indexes[img_id][lvl][:, 1]])
+                mlvl_pos_mask_preds_y[lvl].append(
+                    mlvl_mask_preds_y[lvl][img_id,
+                                           xy_pos_indexes[img_id][lvl][:, 0]])
+                mlvl_labels[lvl].append(labels[img_id][lvl].flatten())
+
+        # cat multiple image
+        temp_mlvl_cls_preds = []
+        for lvl in range(num_levels):
+            mlvl_pos_mask_targets[lvl] = torch.cat(
+                mlvl_pos_mask_targets[lvl], dim=0)
+            mlvl_pos_mask_preds_x[lvl] = torch.cat(
+                mlvl_pos_mask_preds_x[lvl], dim=0)
+            mlvl_pos_mask_preds_y[lvl] = torch.cat(
+                mlvl_pos_mask_preds_y[lvl], dim=0)
+            mlvl_labels[lvl] = torch.cat(mlvl_labels[lvl], dim=0)
+            temp_mlvl_cls_preds.append(mlvl_cls_preds[lvl].permute(
+                0, 2, 3, 1).reshape(-1, self.cls_out_channels))
+
+        num_pos = 0.
+        # dice loss
+        loss_mask = []
+        for pred_x, pred_y, target in \
+                zip(mlvl_pos_mask_preds_x,
+                    mlvl_pos_mask_preds_y, mlvl_pos_mask_targets):
+            num_masks = pred_x.size(0)
+            if num_masks == 0:
+                # make sure can get grad
+                loss_mask.append((pred_x.sum() + pred_y.sum()).unsqueeze(0))
+                continue
+            num_pos += num_masks
+            pred_mask = pred_y.sigmoid() * pred_x.sigmoid()
+            loss_mask.append(
+                self.loss_mask(pred_mask, target, reduction_override='none'))
+        if num_pos > 0:
+            loss_mask = torch.cat(loss_mask).sum() / num_pos
+        else:
+            loss_mask = torch.cat(loss_mask).mean()
+
+        # cate
+        flatten_labels = torch.cat(mlvl_labels)
+        flatten_cls_preds = torch.cat(temp_mlvl_cls_preds)
+
+        loss_cls = self.loss_cls(
+            flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1)
+        return dict(loss_mask=loss_mask, loss_cls=loss_cls)
+
+    def _get_targets_single(self,
+                            gt_instances: InstanceData,
+                            featmap_sizes: Optional[list] = None) -> tuple:
+        """Compute targets for predictions of single image.
+
+        Args:
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes``, ``labels``,
+                and ``masks`` attributes.
+            featmap_sizes (list[:obj:`torch.size`]): Size of each
+                feature map from feature pyramid, each element
+                means (feat_h, feat_w). Defaults to None.
+
+        Returns:
+            Tuple: Usually returns a tuple containing targets for predictions.
+
+                - mlvl_pos_mask_targets (list[Tensor]): Each element represent
+                  the binary mask targets for positive points in this
+                  level, has shape (num_pos, out_h, out_w).
+                - mlvl_labels (list[Tensor]): Each element is
+                  classification labels for all
+                  points in this level, has shape
+                  (num_grid, num_grid).
+                - mlvl_xy_pos_indexes (list[Tensor]): Each element
+                  in the list contains the index of positive samples in
+                  corresponding level, has shape (num_pos, 2), last
+                  dimension 2 present (index_x, index_y).
+        """
+        mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks = \
+            super()._get_targets_single(gt_instances,
+                                        featmap_sizes=featmap_sizes)
+
+        mlvl_xy_pos_indexes = [(item - self.num_classes).nonzero()
+                               for item in mlvl_labels]
+
+        return mlvl_pos_mask_targets, mlvl_labels, mlvl_xy_pos_indexes
+
+    def predict_by_feat(self, mlvl_mask_preds_x: List[Tensor],
+                        mlvl_mask_preds_y: List[Tensor],
+                        mlvl_cls_scores: List[Tensor],
+                        batch_img_metas: List[dict], **kwargs) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        mask results.
+
+        Args:
+            mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction
+                from x branch. Each element in the list has shape
+                (batch_size, num_grids ,h ,w).
+            mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction
+                from y branch. Each element in the list has shape
+                (batch_size, num_grids ,h ,w).
+            mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes ,num_grids ,num_grids).
+            batch_img_metas (list[dict]): Meta information of all images.
+
+        Returns:
+            list[:obj:`InstanceData`]: Processed results of multiple
+            images.Each :obj:`InstanceData` usually contains
+            following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        mlvl_cls_scores = [
+            item.permute(0, 2, 3, 1) for item in mlvl_cls_scores
+        ]
+        assert len(mlvl_mask_preds_x) == len(mlvl_cls_scores)
+        num_levels = len(mlvl_cls_scores)
+
+        results_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_pred_list = [
+                mlvl_cls_scores[i][img_id].view(
+                    -1, self.cls_out_channels).detach()
+                for i in range(num_levels)
+            ]
+            mask_pred_list_x = [
+                mlvl_mask_preds_x[i][img_id] for i in range(num_levels)
+            ]
+            mask_pred_list_y = [
+                mlvl_mask_preds_y[i][img_id] for i in range(num_levels)
+            ]
+
+            cls_pred_list = torch.cat(cls_pred_list, dim=0)
+            mask_pred_list_x = torch.cat(mask_pred_list_x, dim=0)
+            mask_pred_list_y = torch.cat(mask_pred_list_y, dim=0)
+            img_meta = batch_img_metas[img_id]
+
+            results = self._predict_by_feat_single(
+                cls_pred_list,
+                mask_pred_list_x,
+                mask_pred_list_y,
+                img_meta=img_meta)
+            results_list.append(results)
+        return results_list
+
+    def _predict_by_feat_single(self,
+                                cls_scores: Tensor,
+                                mask_preds_x: Tensor,
+                                mask_preds_y: Tensor,
+                                img_meta: dict,
+                                cfg: OptConfigType = None) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        mask results.
+
+        Args:
+            cls_scores (Tensor): Classification score of all points
+                in single image, has shape (num_points, num_classes).
+            mask_preds_x (Tensor): Mask prediction of x branch of
+                all points in single image, has shape
+                (sum_num_grids, feat_h, feat_w).
+            mask_preds_y (Tensor): Mask prediction of y branch of
+                all points in single image, has shape
+                (sum_num_grids, feat_h, feat_w).
+            img_meta (dict): Meta information of corresponding image.
+            cfg (dict): Config used in test phase.
+
+        Returns:
+            :obj:`InstanceData`: Processed results of single image.
+             it usually contains following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+
+        def empty_results(cls_scores, ori_shape):
+            """Generate a empty results."""
+            results = InstanceData()
+            results.scores = cls_scores.new_ones(0)
+            results.masks = cls_scores.new_zeros(0, *ori_shape)
+            results.labels = cls_scores.new_ones(0)
+            results.bboxes = cls_scores.new_zeros(0, 4)
+            return results
+
+        cfg = self.test_cfg if cfg is None else cfg
+
+        featmap_size = mask_preds_x.size()[-2:]
+
+        h, w = img_meta['img_shape'][:2]
+        upsampled_size = (featmap_size[0] * 4, featmap_size[1] * 4)
+
+        score_mask = (cls_scores > cfg.score_thr)
+        cls_scores = cls_scores[score_mask]
+        inds = score_mask.nonzero()
+        lvl_interval = inds.new_tensor(self.num_grids).pow(2).cumsum(0)
+        num_all_points = lvl_interval[-1]
+        lvl_start_index = inds.new_ones(num_all_points)
+        num_grids = inds.new_ones(num_all_points)
+        seg_size = inds.new_tensor(self.num_grids).cumsum(0)
+        mask_lvl_start_index = inds.new_ones(num_all_points)
+        strides = inds.new_ones(num_all_points)
+
+        lvl_start_index[:lvl_interval[0]] *= 0
+        mask_lvl_start_index[:lvl_interval[0]] *= 0
+        num_grids[:lvl_interval[0]] *= self.num_grids[0]
+        strides[:lvl_interval[0]] *= self.strides[0]
+
+        for lvl in range(1, self.num_levels):
+            lvl_start_index[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \
+                lvl_interval[lvl - 1]
+            mask_lvl_start_index[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \
+                seg_size[lvl - 1]
+            num_grids[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \
+                self.num_grids[lvl]
+            strides[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \
+                self.strides[lvl]
+
+        lvl_start_index = lvl_start_index[inds[:, 0]]
+        mask_lvl_start_index = mask_lvl_start_index[inds[:, 0]]
+        num_grids = num_grids[inds[:, 0]]
+        strides = strides[inds[:, 0]]
+
+        y_lvl_offset = (inds[:, 0] - lvl_start_index) // num_grids
+        x_lvl_offset = (inds[:, 0] - lvl_start_index) % num_grids
+        y_inds = mask_lvl_start_index + y_lvl_offset
+        x_inds = mask_lvl_start_index + x_lvl_offset
+
+        cls_labels = inds[:, 1]
+        mask_preds = mask_preds_x[x_inds, ...] * mask_preds_y[y_inds, ...]
+
+        masks = mask_preds > cfg.mask_thr
+        sum_masks = masks.sum((1, 2)).float()
+        keep = sum_masks > strides
+        if keep.sum() == 0:
+            return empty_results(cls_scores, img_meta['ori_shape'][:2])
+
+        masks = masks[keep]
+        mask_preds = mask_preds[keep]
+        sum_masks = sum_masks[keep]
+        cls_scores = cls_scores[keep]
+        cls_labels = cls_labels[keep]
+
+        # maskness.
+        mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks
+        cls_scores *= mask_scores
+
+        scores, labels, _, keep_inds = mask_matrix_nms(
+            masks,
+            cls_labels,
+            cls_scores,
+            mask_area=sum_masks,
+            nms_pre=cfg.nms_pre,
+            max_num=cfg.max_per_img,
+            kernel=cfg.kernel,
+            sigma=cfg.sigma,
+            filter_thr=cfg.filter_thr)
+        # mask_matrix_nms may return an empty Tensor
+        if len(keep_inds) == 0:
+            return empty_results(cls_scores, img_meta['ori_shape'][:2])
+        mask_preds = mask_preds[keep_inds]
+        mask_preds = F.interpolate(
+            mask_preds.unsqueeze(0), size=upsampled_size,
+            mode='bilinear')[:, :, :h, :w]
+        mask_preds = F.interpolate(
+            mask_preds, size=img_meta['ori_shape'][:2],
+            mode='bilinear').squeeze(0)
+        masks = mask_preds > cfg.mask_thr
+
+        results = InstanceData()
+        results.masks = masks
+        results.labels = labels
+        results.scores = scores
+        # create an empty bbox in InstanceData to avoid bugs when
+        # calculating metrics.
+        results.bboxes = results.scores.new_zeros(len(scores), 4)
+
+        return results
+
+
+@MODELS.register_module()
+class DecoupledSOLOLightHead(DecoupledSOLOHead):
+    """Decoupled Light SOLO mask head used in `SOLO: Segmenting Objects by
+    Locations <https://arxiv.org/abs/1912.04488>`_
+
+    Args:
+        with_dcn (bool): Whether use dcn in mask_convs and cls_convs,
+            Defaults to False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 *args,
+                 dcn_cfg: OptConfigType = None,
+                 init_cfg: MultiConfig = [
+                     dict(type='Normal', layer='Conv2d', std=0.01),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_mask_list_x')),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_mask_list_y')),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_cls'))
+                 ],
+                 **kwargs) -> None:
+        assert dcn_cfg is None or isinstance(dcn_cfg, dict)
+        self.dcn_cfg = dcn_cfg
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+
+    def _init_layers(self) -> None:
+        self.mask_convs = nn.ModuleList()
+        self.cls_convs = nn.ModuleList()
+
+        for i in range(self.stacked_convs):
+            if self.dcn_cfg is not None \
+                    and i == self.stacked_convs - 1:
+                conv_cfg = self.dcn_cfg
+            else:
+                conv_cfg = None
+
+            chn = self.in_channels + 2 if i == 0 else self.feat_channels
+            self.mask_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg))
+
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg))
+
+        self.conv_mask_list_x = nn.ModuleList()
+        self.conv_mask_list_y = nn.ModuleList()
+        for num_grid in self.num_grids:
+            self.conv_mask_list_x.append(
+                nn.Conv2d(self.feat_channels, num_grid, 3, padding=1))
+            self.conv_mask_list_y.append(
+                nn.Conv2d(self.feat_channels, num_grid, 3, padding=1))
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores and mask prediction.
+
+                - mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction
+                  from x branch. Each element in the list has shape
+                  (batch_size, num_grids ,h ,w).
+                - mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction
+                  from y branch. Each element in the list has shape
+                  (batch_size, num_grids ,h ,w).
+                - mlvl_cls_preds (list[Tensor]): Multi-level scores.
+                  Each element in the list has shape
+                  (batch_size, num_classes, num_grids ,num_grids).
+        """
+        assert len(x) == self.num_levels
+        feats = self.resize_feats(x)
+        mask_preds_x = []
+        mask_preds_y = []
+        cls_preds = []
+        for i in range(self.num_levels):
+            x = feats[i]
+            mask_feat = x
+            cls_feat = x
+            # generate and concat the coordinate
+            coord_feat = generate_coordinate(mask_feat.size(),
+                                             mask_feat.device)
+            mask_feat = torch.cat([mask_feat, coord_feat], 1)
+
+            for mask_layer in self.mask_convs:
+                mask_feat = mask_layer(mask_feat)
+
+            mask_feat = F.interpolate(
+                mask_feat, scale_factor=2, mode='bilinear')
+
+            mask_pred_x = self.conv_mask_list_x[i](mask_feat)
+            mask_pred_y = self.conv_mask_list_y[i](mask_feat)
+
+            # cls branch
+            for j, cls_layer in enumerate(self.cls_convs):
+                if j == self.cls_down_index:
+                    num_grid = self.num_grids[i]
+                    cls_feat = F.interpolate(
+                        cls_feat, size=num_grid, mode='bilinear')
+                cls_feat = cls_layer(cls_feat)
+
+            cls_pred = self.conv_cls(cls_feat)
+
+            if not self.training:
+                feat_wh = feats[0].size()[-2:]
+                upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2)
+                mask_pred_x = F.interpolate(
+                    mask_pred_x.sigmoid(),
+                    size=upsampled_size,
+                    mode='bilinear')
+                mask_pred_y = F.interpolate(
+                    mask_pred_y.sigmoid(),
+                    size=upsampled_size,
+                    mode='bilinear')
+                cls_pred = cls_pred.sigmoid()
+                # get local maximum
+                local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1)
+                keep_mask = local_max[:, :, :-1, :-1] == cls_pred
+                cls_pred = cls_pred * keep_mask
+
+            mask_preds_x.append(mask_pred_x)
+            mask_preds_y.append(mask_pred_y)
+            cls_preds.append(cls_pred)
+        return mask_preds_x, mask_preds_y, cls_preds
diff --git a/mmde/mmdet/models/dense_heads/solov2_head.py b/mmde/mmdet/models/dense_heads/solov2_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..35b9df0c45148cb18e8afb659b10dd0b9e866b99
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/solov2_head.py
@@ -0,0 +1,799 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import List, Optional, Tuple
+
+import mmcv
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.utils.misc import floordiv
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType
+from ..layers import mask_matrix_nms
+from ..utils import center_of_mass, generate_coordinate, multi_apply
+from .solo_head import SOLOHead
+
+
+class MaskFeatModule(BaseModule):
+    """SOLOv2 mask feature map branch used in `SOLOv2: Dynamic and Fast
+    Instance Segmentation. <https://arxiv.org/pdf/2003.10152>`_
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels of the mask feature
+             map branch.
+        start_level (int): The starting feature map level from RPN that
+             will be used to predict the mask feature map.
+        end_level (int): The ending feature map level from rpn that
+             will be used to predict the mask feature map.
+        out_channels (int): Number of output channels of the mask feature
+             map branch. This is the channel count of the mask
+             feature map that to be dynamically convolved with the predicted
+             kernel.
+        mask_stride (int): Downsample factor of the mask feature map output.
+            Defaults to 4.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        feat_channels: int,
+        start_level: int,
+        end_level: int,
+        out_channels: int,
+        mask_stride: int = 4,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: OptConfigType = None,
+        init_cfg: MultiConfig = [
+            dict(type='Normal', layer='Conv2d', std=0.01)
+        ]
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.start_level = start_level
+        self.end_level = end_level
+        self.mask_stride = mask_stride
+        assert start_level >= 0 and end_level >= start_level
+        self.out_channels = out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self._init_layers()
+        self.fp16_enabled = False
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.convs_all_levels = nn.ModuleList()
+        for i in range(self.start_level, self.end_level + 1):
+            convs_per_level = nn.Sequential()
+            if i == 0:
+                convs_per_level.add_module(
+                    f'conv{i}',
+                    ConvModule(
+                        self.in_channels,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        inplace=False))
+                self.convs_all_levels.append(convs_per_level)
+                continue
+
+            for j in range(i):
+                if j == 0:
+                    if i == self.end_level:
+                        chn = self.in_channels + 2
+                    else:
+                        chn = self.in_channels
+                    convs_per_level.add_module(
+                        f'conv{j}',
+                        ConvModule(
+                            chn,
+                            self.feat_channels,
+                            3,
+                            padding=1,
+                            conv_cfg=self.conv_cfg,
+                            norm_cfg=self.norm_cfg,
+                            inplace=False))
+                    convs_per_level.add_module(
+                        f'upsample{j}',
+                        nn.Upsample(
+                            scale_factor=2,
+                            mode='bilinear',
+                            align_corners=False))
+                    continue
+
+                convs_per_level.add_module(
+                    f'conv{j}',
+                    ConvModule(
+                        self.feat_channels,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        inplace=False))
+                convs_per_level.add_module(
+                    f'upsample{j}',
+                    nn.Upsample(
+                        scale_factor=2, mode='bilinear', align_corners=False))
+
+            self.convs_all_levels.append(convs_per_level)
+
+        self.conv_pred = ConvModule(
+            self.feat_channels,
+            self.out_channels,
+            1,
+            padding=0,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg)
+
+    def forward(self, x: Tuple[Tensor]) -> Tensor:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            Tensor: The predicted mask feature map.
+        """
+        inputs = x[self.start_level:self.end_level + 1]
+        assert len(inputs) == (self.end_level - self.start_level + 1)
+        feature_add_all_level = self.convs_all_levels[0](inputs[0])
+        for i in range(1, len(inputs)):
+            input_p = inputs[i]
+            if i == len(inputs) - 1:
+                coord_feat = generate_coordinate(input_p.size(),
+                                                 input_p.device)
+                input_p = torch.cat([input_p, coord_feat], 1)
+
+            feature_add_all_level = feature_add_all_level + \
+                self.convs_all_levels[i](input_p)
+
+        feature_pred = self.conv_pred(feature_add_all_level)
+        return feature_pred
+
+
+@MODELS.register_module()
+class SOLOV2Head(SOLOHead):
+    """SOLOv2 mask head used in `SOLOv2: Dynamic and Fast Instance
+    Segmentation. <https://arxiv.org/pdf/2003.10152>`_
+
+    Args:
+        mask_feature_head (dict): Config of SOLOv2MaskFeatHead.
+        dynamic_conv_size (int): Dynamic Conv kernel size. Defaults to 1.
+        dcn_cfg (dict): Dcn conv configurations in kernel_convs and cls_conv.
+            Defaults to None.
+        dcn_apply_to_all_conv (bool): Whether to use dcn in every layer of
+            kernel_convs and cls_convs, or only the last layer. It shall be set
+            `True` for the normal version of SOLOv2 and `False` for the
+            light-weight version. Defaults to True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 *args,
+                 mask_feature_head: ConfigType,
+                 dynamic_conv_size: int = 1,
+                 dcn_cfg: OptConfigType = None,
+                 dcn_apply_to_all_conv: bool = True,
+                 init_cfg: MultiConfig = [
+                     dict(type='Normal', layer='Conv2d', std=0.01),
+                     dict(
+                         type='Normal',
+                         std=0.01,
+                         bias_prob=0.01,
+                         override=dict(name='conv_cls'))
+                 ],
+                 **kwargs) -> None:
+        assert dcn_cfg is None or isinstance(dcn_cfg, dict)
+        self.dcn_cfg = dcn_cfg
+        self.with_dcn = dcn_cfg is not None
+        self.dcn_apply_to_all_conv = dcn_apply_to_all_conv
+        self.dynamic_conv_size = dynamic_conv_size
+        mask_out_channels = mask_feature_head.get('out_channels')
+        self.kernel_out_channels = \
+            mask_out_channels * self.dynamic_conv_size * self.dynamic_conv_size
+
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+
+        # update the in_channels of mask_feature_head
+        if mask_feature_head.get('in_channels', None) is not None:
+            if mask_feature_head.in_channels != self.in_channels:
+                warnings.warn('The `in_channels` of SOLOv2MaskFeatHead and '
+                              'SOLOv2Head should be same, changing '
+                              'mask_feature_head.in_channels to '
+                              f'{self.in_channels}')
+                mask_feature_head.update(in_channels=self.in_channels)
+        else:
+            mask_feature_head.update(in_channels=self.in_channels)
+
+        self.mask_feature_head = MaskFeatModule(**mask_feature_head)
+        self.mask_stride = self.mask_feature_head.mask_stride
+        self.fp16_enabled = False
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.cls_convs = nn.ModuleList()
+        self.kernel_convs = nn.ModuleList()
+        conv_cfg = None
+        for i in range(self.stacked_convs):
+            if self.with_dcn:
+                if self.dcn_apply_to_all_conv:
+                    conv_cfg = self.dcn_cfg
+                elif i == self.stacked_convs - 1:
+                    # light head
+                    conv_cfg = self.dcn_cfg
+
+            chn = self.in_channels + 2 if i == 0 else self.feat_channels
+            self.kernel_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.norm_cfg is None))
+
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.norm_cfg is None))
+
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+        self.conv_kernel = nn.Conv2d(
+            self.feat_channels, self.kernel_out_channels, 3, padding=1)
+
+    def forward(self, x):
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores, mask prediction,
+            and mask features.
+
+                - mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel
+                  prediction. The kernel is used to generate instance
+                  segmentation masks by dynamic convolution. Each element in
+                  the list has shape
+                  (batch_size, kernel_out_channels, num_grids, num_grids).
+                - mlvl_cls_preds (list[Tensor]): Multi-level scores. Each
+                  element in the list has shape
+                  (batch_size, num_classes, num_grids, num_grids).
+                - mask_feats (Tensor): Unified mask feature map used to
+                  generate instance segmentation masks by dynamic convolution.
+                  Has shape (batch_size, mask_out_channels, h, w).
+        """
+        assert len(x) == self.num_levels
+        mask_feats = self.mask_feature_head(x)
+        ins_kernel_feats = self.resize_feats(x)
+        mlvl_kernel_preds = []
+        mlvl_cls_preds = []
+        for i in range(self.num_levels):
+            ins_kernel_feat = ins_kernel_feats[i]
+            # ins branch
+            # concat coord
+            coord_feat = generate_coordinate(ins_kernel_feat.size(),
+                                             ins_kernel_feat.device)
+            ins_kernel_feat = torch.cat([ins_kernel_feat, coord_feat], 1)
+
+            # kernel branch
+            kernel_feat = ins_kernel_feat
+            kernel_feat = F.interpolate(
+                kernel_feat,
+                size=self.num_grids[i],
+                mode='bilinear',
+                align_corners=False)
+
+            cate_feat = kernel_feat[:, :-2, :, :]
+
+            kernel_feat = kernel_feat.contiguous()
+            for i, kernel_conv in enumerate(self.kernel_convs):
+                kernel_feat = kernel_conv(kernel_feat)
+            kernel_pred = self.conv_kernel(kernel_feat)
+
+            # cate branch
+            cate_feat = cate_feat.contiguous()
+            for i, cls_conv in enumerate(self.cls_convs):
+                cate_feat = cls_conv(cate_feat)
+            cate_pred = self.conv_cls(cate_feat)
+
+            mlvl_kernel_preds.append(kernel_pred)
+            mlvl_cls_preds.append(cate_pred)
+
+        return mlvl_kernel_preds, mlvl_cls_preds, mask_feats
+
+    def _get_targets_single(self,
+                            gt_instances: InstanceData,
+                            featmap_sizes: Optional[list] = None) -> tuple:
+        """Compute targets for predictions of single image.
+
+        Args:
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes``, ``labels``,
+                and ``masks`` attributes.
+            featmap_sizes (list[:obj:`torch.size`]): Size of each
+                feature map from feature pyramid, each element
+                means (feat_h, feat_w). Defaults to None.
+
+        Returns:
+            Tuple: Usually returns a tuple containing targets for predictions.
+
+                - mlvl_pos_mask_targets (list[Tensor]): Each element represent
+                  the binary mask targets for positive points in this
+                  level, has shape (num_pos, out_h, out_w).
+                - mlvl_labels (list[Tensor]): Each element is
+                  classification labels for all
+                  points in this level, has shape
+                  (num_grid, num_grid).
+                - mlvl_pos_masks  (list[Tensor]): Each element is
+                  a `BoolTensor` to represent whether the
+                  corresponding point in single level
+                  is positive, has shape (num_grid **2).
+                - mlvl_pos_indexes  (list[list]): Each element
+                  in the list contains the positive index in
+                  corresponding level, has shape (num_pos).
+        """
+        gt_labels = gt_instances.labels
+        device = gt_labels.device
+
+        gt_bboxes = gt_instances.bboxes
+        gt_areas = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
+                              (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
+        gt_masks = gt_instances.masks.to_tensor(
+            dtype=torch.bool, device=device)
+
+        mlvl_pos_mask_targets = []
+        mlvl_pos_indexes = []
+        mlvl_labels = []
+        mlvl_pos_masks = []
+        for (lower_bound, upper_bound), num_grid \
+                in zip(self.scale_ranges, self.num_grids):
+            mask_target = []
+            # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+            pos_index = []
+            labels = torch.zeros([num_grid, num_grid],
+                                 dtype=torch.int64,
+                                 device=device) + self.num_classes
+            pos_mask = torch.zeros([num_grid**2],
+                                   dtype=torch.bool,
+                                   device=device)
+
+            gt_inds = ((gt_areas >= lower_bound) &
+                       (gt_areas <= upper_bound)).nonzero().flatten()
+            if len(gt_inds) == 0:
+                mlvl_pos_mask_targets.append(
+                    torch.zeros([0, featmap_sizes[0], featmap_sizes[1]],
+                                dtype=torch.uint8,
+                                device=device))
+                mlvl_labels.append(labels)
+                mlvl_pos_masks.append(pos_mask)
+                mlvl_pos_indexes.append([])
+                continue
+            hit_gt_bboxes = gt_bboxes[gt_inds]
+            hit_gt_labels = gt_labels[gt_inds]
+            hit_gt_masks = gt_masks[gt_inds, ...]
+
+            pos_w_ranges = 0.5 * (hit_gt_bboxes[:, 2] -
+                                  hit_gt_bboxes[:, 0]) * self.pos_scale
+            pos_h_ranges = 0.5 * (hit_gt_bboxes[:, 3] -
+                                  hit_gt_bboxes[:, 1]) * self.pos_scale
+
+            # Make sure hit_gt_masks has a value
+            valid_mask_flags = hit_gt_masks.sum(dim=-1).sum(dim=-1) > 0
+
+            for gt_mask, gt_label, pos_h_range, pos_w_range, \
+                valid_mask_flag in \
+                    zip(hit_gt_masks, hit_gt_labels, pos_h_ranges,
+                        pos_w_ranges, valid_mask_flags):
+                if not valid_mask_flag:
+                    continue
+                upsampled_size = (featmap_sizes[0] * self.mask_stride,
+                                  featmap_sizes[1] * self.mask_stride)
+                center_h, center_w = center_of_mass(gt_mask)
+
+                coord_w = int(
+                    floordiv((center_w / upsampled_size[1]), (1. / num_grid),
+                             rounding_mode='trunc'))
+                coord_h = int(
+                    floordiv((center_h / upsampled_size[0]), (1. / num_grid),
+                             rounding_mode='trunc'))
+
+                # left, top, right, down
+                top_box = max(
+                    0,
+                    int(
+                        floordiv(
+                            (center_h - pos_h_range) / upsampled_size[0],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                down_box = min(
+                    num_grid - 1,
+                    int(
+                        floordiv(
+                            (center_h + pos_h_range) / upsampled_size[0],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                left_box = max(
+                    0,
+                    int(
+                        floordiv(
+                            (center_w - pos_w_range) / upsampled_size[1],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+                right_box = min(
+                    num_grid - 1,
+                    int(
+                        floordiv(
+                            (center_w + pos_w_range) / upsampled_size[1],
+                            (1. / num_grid),
+                            rounding_mode='trunc')))
+
+                top = max(top_box, coord_h - 1)
+                down = min(down_box, coord_h + 1)
+                left = max(coord_w - 1, left_box)
+                right = min(right_box, coord_w + 1)
+
+                labels[top:(down + 1), left:(right + 1)] = gt_label
+                # ins
+                gt_mask = np.uint8(gt_mask.cpu().numpy())
+                # Follow the original implementation, F.interpolate is
+                # different from cv2 and opencv
+                gt_mask = mmcv.imrescale(gt_mask, scale=1. / self.mask_stride)
+                gt_mask = torch.from_numpy(gt_mask).to(device=device)
+
+                for i in range(top, down + 1):
+                    for j in range(left, right + 1):
+                        index = int(i * num_grid + j)
+                        this_mask_target = torch.zeros(
+                            [featmap_sizes[0], featmap_sizes[1]],
+                            dtype=torch.uint8,
+                            device=device)
+                        this_mask_target[:gt_mask.shape[0], :gt_mask.
+                                         shape[1]] = gt_mask
+                        mask_target.append(this_mask_target)
+                        pos_mask[index] = True
+                        pos_index.append(index)
+            if len(mask_target) == 0:
+                mask_target = torch.zeros(
+                    [0, featmap_sizes[0], featmap_sizes[1]],
+                    dtype=torch.uint8,
+                    device=device)
+            else:
+                mask_target = torch.stack(mask_target, 0)
+            mlvl_pos_mask_targets.append(mask_target)
+            mlvl_labels.append(labels)
+            mlvl_pos_masks.append(pos_mask)
+            mlvl_pos_indexes.append(pos_index)
+        return (mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks,
+                mlvl_pos_indexes)
+
+    def loss_by_feat(self, mlvl_kernel_preds: List[Tensor],
+                     mlvl_cls_preds: List[Tensor], mask_feats: Tensor,
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict], **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the mask head.
+
+        Args:
+            mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel
+                prediction. The kernel is used to generate instance
+                segmentation masks by dynamic convolution. Each element in the
+                list has shape
+                (batch_size, kernel_out_channels, num_grids, num_grids).
+            mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids, num_grids).
+            mask_feats (Tensor): Unified mask feature map used to generate
+                instance segmentation masks by dynamic convolution. Has shape
+                (batch_size, mask_out_channels, h, w).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``masks``,
+                and ``labels`` attributes.
+            batch_img_metas (list[dict]): Meta information of multiple images.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        featmap_sizes = mask_feats.size()[-2:]
+
+        pos_mask_targets, labels, pos_masks, pos_indexes = multi_apply(
+            self._get_targets_single,
+            batch_gt_instances,
+            featmap_sizes=featmap_sizes)
+
+        mlvl_mask_targets = [
+            torch.cat(lvl_mask_targets, 0)
+            for lvl_mask_targets in zip(*pos_mask_targets)
+        ]
+
+        mlvl_pos_kernel_preds = []
+        for lvl_kernel_preds, lvl_pos_indexes in zip(mlvl_kernel_preds,
+                                                     zip(*pos_indexes)):
+            lvl_pos_kernel_preds = []
+            for img_lvl_kernel_preds, img_lvl_pos_indexes in zip(
+                    lvl_kernel_preds, lvl_pos_indexes):
+                img_lvl_pos_kernel_preds = img_lvl_kernel_preds.view(
+                    img_lvl_kernel_preds.shape[0], -1)[:, img_lvl_pos_indexes]
+                lvl_pos_kernel_preds.append(img_lvl_pos_kernel_preds)
+            mlvl_pos_kernel_preds.append(lvl_pos_kernel_preds)
+
+        # make multilevel mlvl_mask_pred
+        mlvl_mask_preds = []
+        for lvl_pos_kernel_preds in mlvl_pos_kernel_preds:
+            lvl_mask_preds = []
+            for img_id, img_lvl_pos_kernel_pred in enumerate(
+                    lvl_pos_kernel_preds):
+                if img_lvl_pos_kernel_pred.size()[-1] == 0:
+                    continue
+                img_mask_feats = mask_feats[[img_id]]
+                h, w = img_mask_feats.shape[-2:]
+                num_kernel = img_lvl_pos_kernel_pred.shape[1]
+                img_lvl_mask_pred = F.conv2d(
+                    img_mask_feats,
+                    img_lvl_pos_kernel_pred.permute(1, 0).view(
+                        num_kernel, -1, self.dynamic_conv_size,
+                        self.dynamic_conv_size),
+                    stride=1).view(-1, h, w)
+                lvl_mask_preds.append(img_lvl_mask_pred)
+            if len(lvl_mask_preds) == 0:
+                lvl_mask_preds = None
+            else:
+                lvl_mask_preds = torch.cat(lvl_mask_preds, 0)
+            mlvl_mask_preds.append(lvl_mask_preds)
+        # dice loss
+        num_pos = 0
+        for img_pos_masks in pos_masks:
+            for lvl_img_pos_masks in img_pos_masks:
+                # Fix `Tensor` object has no attribute `count_nonzero()`
+                # in PyTorch 1.6, the type of `lvl_img_pos_masks`
+                # should be `torch.bool`.
+                num_pos += lvl_img_pos_masks.nonzero().numel()
+        loss_mask = []
+        for lvl_mask_preds, lvl_mask_targets in zip(mlvl_mask_preds,
+                                                    mlvl_mask_targets):
+            if lvl_mask_preds is None:
+                continue
+            loss_mask.append(
+                self.loss_mask(
+                    lvl_mask_preds,
+                    lvl_mask_targets,
+                    reduction_override='none'))
+        if num_pos > 0:
+            loss_mask = torch.cat(loss_mask).sum() / num_pos
+        else:
+            loss_mask = mask_feats.sum() * 0
+
+        # cate
+        flatten_labels = [
+            torch.cat(
+                [img_lvl_labels.flatten() for img_lvl_labels in lvl_labels])
+            for lvl_labels in zip(*labels)
+        ]
+        flatten_labels = torch.cat(flatten_labels)
+
+        flatten_cls_preds = [
+            lvl_cls_preds.permute(0, 2, 3, 1).reshape(-1, self.num_classes)
+            for lvl_cls_preds in mlvl_cls_preds
+        ]
+        flatten_cls_preds = torch.cat(flatten_cls_preds)
+
+        loss_cls = self.loss_cls(
+            flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1)
+        return dict(loss_mask=loss_mask, loss_cls=loss_cls)
+
+    def predict_by_feat(self, mlvl_kernel_preds: List[Tensor],
+                        mlvl_cls_scores: List[Tensor], mask_feats: Tensor,
+                        batch_img_metas: List[dict], **kwargs) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        mask results.
+
+        Args:
+            mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel
+                prediction. The kernel is used to generate instance
+                segmentation masks by dynamic convolution. Each element in the
+                list has shape
+                (batch_size, kernel_out_channels, num_grids, num_grids).
+            mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids, num_grids).
+            mask_feats (Tensor): Unified mask feature map used to generate
+                instance segmentation masks by dynamic convolution. Has shape
+                (batch_size, mask_out_channels, h, w).
+            batch_img_metas (list[dict]): Meta information of all images.
+
+        Returns:
+            list[:obj:`InstanceData`]: Processed results of multiple
+            images.Each :obj:`InstanceData` usually contains
+            following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        num_levels = len(mlvl_cls_scores)
+        assert len(mlvl_kernel_preds) == len(mlvl_cls_scores)
+
+        for lvl in range(num_levels):
+            cls_scores = mlvl_cls_scores[lvl]
+            cls_scores = cls_scores.sigmoid()
+            local_max = F.max_pool2d(cls_scores, 2, stride=1, padding=1)
+            keep_mask = local_max[:, :, :-1, :-1] == cls_scores
+            cls_scores = cls_scores * keep_mask
+            mlvl_cls_scores[lvl] = cls_scores.permute(0, 2, 3, 1)
+
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            img_cls_pred = [
+                mlvl_cls_scores[lvl][img_id].view(-1, self.cls_out_channels)
+                for lvl in range(num_levels)
+            ]
+            img_mask_feats = mask_feats[[img_id]]
+            img_kernel_pred = [
+                mlvl_kernel_preds[lvl][img_id].permute(1, 2, 0).view(
+                    -1, self.kernel_out_channels) for lvl in range(num_levels)
+            ]
+            img_cls_pred = torch.cat(img_cls_pred, dim=0)
+            img_kernel_pred = torch.cat(img_kernel_pred, dim=0)
+            result = self._predict_by_feat_single(
+                img_kernel_pred,
+                img_cls_pred,
+                img_mask_feats,
+                img_meta=batch_img_metas[img_id])
+            result_list.append(result)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                kernel_preds: Tensor,
+                                cls_scores: Tensor,
+                                mask_feats: Tensor,
+                                img_meta: dict,
+                                cfg: OptConfigType = None) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        mask results.
+
+        Args:
+            kernel_preds (Tensor): Dynamic kernel prediction of all points
+                in single image, has shape
+                (num_points, kernel_out_channels).
+            cls_scores (Tensor): Classification score of all points
+                in single image, has shape (num_points, num_classes).
+            mask_feats (Tensor): Mask prediction of all points in
+                single image, has shape (num_points, feat_h, feat_w).
+            img_meta (dict): Meta information of corresponding image.
+            cfg (dict, optional): Config used in test phase.
+                Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Processed results of single image.
+             it usually contains following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+
+        def empty_results(cls_scores, ori_shape):
+            """Generate a empty results."""
+            results = InstanceData()
+            results.scores = cls_scores.new_ones(0)
+            results.masks = cls_scores.new_zeros(0, *ori_shape)
+            results.labels = cls_scores.new_ones(0)
+            results.bboxes = cls_scores.new_zeros(0, 4)
+            return results
+
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(kernel_preds) == len(cls_scores)
+
+        featmap_size = mask_feats.size()[-2:]
+
+        # overall info
+        h, w = img_meta['img_shape'][:2]
+        upsampled_size = (featmap_size[0] * self.mask_stride,
+                          featmap_size[1] * self.mask_stride)
+
+        # process.
+        score_mask = (cls_scores > cfg.score_thr)
+        cls_scores = cls_scores[score_mask]
+        if len(cls_scores) == 0:
+            return empty_results(cls_scores, img_meta['ori_shape'][:2])
+
+        # cate_labels & kernel_preds
+        inds = score_mask.nonzero()
+        cls_labels = inds[:, 1]
+        kernel_preds = kernel_preds[inds[:, 0]]
+
+        # trans vector.
+        lvl_interval = cls_labels.new_tensor(self.num_grids).pow(2).cumsum(0)
+        strides = kernel_preds.new_ones(lvl_interval[-1])
+
+        strides[:lvl_interval[0]] *= self.strides[0]
+        for lvl in range(1, self.num_levels):
+            strides[lvl_interval[lvl -
+                                 1]:lvl_interval[lvl]] *= self.strides[lvl]
+        strides = strides[inds[:, 0]]
+
+        # mask encoding.
+        kernel_preds = kernel_preds.view(
+            kernel_preds.size(0), -1, self.dynamic_conv_size,
+            self.dynamic_conv_size)
+        mask_preds = F.conv2d(
+            mask_feats, kernel_preds, stride=1).squeeze(0).sigmoid()
+        # mask.
+        masks = mask_preds > cfg.mask_thr
+        sum_masks = masks.sum((1, 2)).float()
+        keep = sum_masks > strides
+        if keep.sum() == 0:
+            return empty_results(cls_scores, img_meta['ori_shape'][:2])
+        masks = masks[keep]
+        mask_preds = mask_preds[keep]
+        sum_masks = sum_masks[keep]
+        cls_scores = cls_scores[keep]
+        cls_labels = cls_labels[keep]
+
+        # maskness.
+        mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks
+        cls_scores *= mask_scores
+
+        scores, labels, _, keep_inds = mask_matrix_nms(
+            masks,
+            cls_labels,
+            cls_scores,
+            mask_area=sum_masks,
+            nms_pre=cfg.nms_pre,
+            max_num=cfg.max_per_img,
+            kernel=cfg.kernel,
+            sigma=cfg.sigma,
+            filter_thr=cfg.filter_thr)
+        if len(keep_inds) == 0:
+            return empty_results(cls_scores, img_meta['ori_shape'][:2])
+        mask_preds = mask_preds[keep_inds]
+        mask_preds = F.interpolate(
+            mask_preds.unsqueeze(0),
+            size=upsampled_size,
+            mode='bilinear',
+            align_corners=False)[:, :, :h, :w]
+        mask_preds = F.interpolate(
+            mask_preds,
+            size=img_meta['ori_shape'][:2],
+            mode='bilinear',
+            align_corners=False).squeeze(0)
+        masks = mask_preds > cfg.mask_thr
+
+        results = InstanceData()
+        results.masks = masks
+        results.labels = labels
+        results.scores = scores
+        # create an empty bbox in InstanceData to avoid bugs when
+        # calculating metrics.
+        results.bboxes = results.scores.new_zeros(len(scores), 4)
+
+        return results
diff --git a/mmde/mmdet/models/dense_heads/ssd_head.py b/mmde/mmdet/models/dense_heads/ssd_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..950df29110d914cc888bc16c6cbf1856f604a1de
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/ssd_head.py
@@ -0,0 +1,362 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList
+from ..losses import smooth_l1_loss
+from ..task_modules.samplers import PseudoSampler
+from ..utils import multi_apply
+from .anchor_head import AnchorHead
+
+
+# TODO: add loss evaluator for SSD
+@MODELS.register_module()
+class SSDHead(AnchorHead):
+    """Implementation of `SSD head <https://arxiv.org/abs/1512.02325>`_
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (Sequence[int]): Number of channels in the input feature
+            map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Defaults to 0.
+        feat_channels (int): Number of hidden channels when stacked_convs
+            > 0. Defaults to 256.
+        use_depthwise (bool): Whether to use DepthwiseSeparableConv.
+            Defaults to False.
+        conv_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct
+            and config conv layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct
+            and config norm layer. Defaults to None.
+        act_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct
+            and config activation layer. Defaults to None.
+        anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor
+            generator.
+        bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Defaults to False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of
+            anchor head.
+        test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of
+            anchor head.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], Optional): Initialization config dict.
+    """  # noqa: W605
+
+    def __init__(
+        self,
+        num_classes: int = 80,
+        in_channels: Sequence[int] = (512, 1024, 512, 256, 256, 256),
+        stacked_convs: int = 0,
+        feat_channels: int = 256,
+        use_depthwise: bool = False,
+        conv_cfg: Optional[ConfigType] = None,
+        norm_cfg: Optional[ConfigType] = None,
+        act_cfg: Optional[ConfigType] = None,
+        anchor_generator: ConfigType = dict(
+            type='SSDAnchorGenerator',
+            scale_major=False,
+            input_size=300,
+            strides=[8, 16, 32, 64, 100, 300],
+            ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
+            basesize_ratio_range=(0.1, 0.9)),
+        bbox_coder: ConfigType = dict(
+            type='DeltaXYWHBBoxCoder',
+            clip_border=True,
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0],
+        ),
+        reg_decoded_bbox: bool = False,
+        train_cfg: Optional[ConfigType] = None,
+        test_cfg: Optional[ConfigType] = None,
+        init_cfg: MultiConfig = dict(
+            type='Xavier', layer='Conv2d', distribution='uniform', bias=0)
+    ) -> None:
+        super(AnchorHead, self).__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.stacked_convs = stacked_convs
+        self.feat_channels = feat_channels
+        self.use_depthwise = use_depthwise
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+
+        self.cls_out_channels = num_classes + 1  # add background class
+        self.prior_generator = TASK_UTILS.build(anchor_generator)
+
+        # Usually the numbers of anchors for each level are the same
+        # except SSD detectors. So it is an int in the most dense
+        # heads but a list of int in SSDHead
+        self.num_base_priors = self.prior_generator.num_base_priors
+
+        self._init_layers()
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.reg_decoded_bbox = reg_decoded_bbox
+        self.use_sigmoid_cls = False
+        self.cls_focal_loss = False
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            if self.train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler(context=self)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        # TODO: Use registry to choose ConvModule type
+        conv = DepthwiseSeparableConvModule \
+            if self.use_depthwise else ConvModule
+
+        for channel, num_base_priors in zip(self.in_channels,
+                                            self.num_base_priors):
+            cls_layers = []
+            reg_layers = []
+            in_channel = channel
+            # build stacked conv tower, not used in default ssd
+            for i in range(self.stacked_convs):
+                cls_layers.append(
+                    conv(
+                        in_channel,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                reg_layers.append(
+                    conv(
+                        in_channel,
+                        self.feat_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                in_channel = self.feat_channels
+            # SSD-Lite head
+            if self.use_depthwise:
+                cls_layers.append(
+                    ConvModule(
+                        in_channel,
+                        in_channel,
+                        3,
+                        padding=1,
+                        groups=in_channel,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                reg_layers.append(
+                    ConvModule(
+                        in_channel,
+                        in_channel,
+                        3,
+                        padding=1,
+                        groups=in_channel,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+            cls_layers.append(
+                nn.Conv2d(
+                    in_channel,
+                    num_base_priors * self.cls_out_channels,
+                    kernel_size=1 if self.use_depthwise else 3,
+                    padding=0 if self.use_depthwise else 1))
+            reg_layers.append(
+                nn.Conv2d(
+                    in_channel,
+                    num_base_priors * 4,
+                    kernel_size=1 if self.use_depthwise else 3,
+                    padding=0 if self.use_depthwise else 1))
+            self.cls_convs.append(nn.Sequential(*cls_layers))
+            self.reg_convs.append(nn.Sequential(*reg_layers))
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor]]: A tuple of cls_scores list and
+            bbox_preds list.
+
+            - cls_scores (list[Tensor]): Classification scores for all scale \
+            levels, each is a 4D-tensor, the channels number is \
+            num_anchors * num_classes.
+            - bbox_preds (list[Tensor]): Box energies / deltas for all scale \
+            levels, each is a 4D-tensor, the channels number is \
+            num_anchors * 4.
+        """
+        cls_scores = []
+        bbox_preds = []
+        for feat, reg_conv, cls_conv in zip(x, self.reg_convs, self.cls_convs):
+            cls_scores.append(cls_conv(feat))
+            bbox_preds.append(reg_conv(feat))
+        return cls_scores, bbox_preds
+
+    def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                            anchor: Tensor, labels: Tensor,
+                            label_weights: Tensor, bbox_targets: Tensor,
+                            bbox_weights: Tensor,
+                            avg_factor: int) -> Tuple[Tensor, Tensor]:
+        """Compute loss of a single image.
+
+        Args:
+            cls_score (Tensor): Box scores for eachimage
+                Has shape (num_total_anchors, num_classes).
+            bbox_pred (Tensor): Box energies / deltas for each image
+                level with shape (num_total_anchors, 4).
+            anchors (Tensor): Box reference for each scale level with shape
+                (num_total_anchors, 4).
+            labels (Tensor): Labels of each anchors with shape
+                (num_total_anchors,).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (num_total_anchors,)
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (num_total_anchors, 4).
+            bbox_weights (Tensor): BBox regression loss weights of each anchor
+                with shape (num_total_anchors, 4).
+            avg_factor (int): Average factor that is used to average
+                the loss. When using sampling method, avg_factor is usually
+                the sum of positive and negative priors. When using
+                `PseudoSampler`, `avg_factor` is usually equal to the number
+                of positive priors.
+
+        Returns:
+            Tuple[Tensor, Tensor]: A tuple of cls loss and bbox loss of one
+            feature map.
+        """
+
+        loss_cls_all = F.cross_entropy(
+            cls_score, labels, reduction='none') * label_weights
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero(
+            as_tuple=False).reshape(-1)
+        neg_inds = (labels == self.num_classes).nonzero(
+            as_tuple=False).view(-1)
+
+        num_pos_samples = pos_inds.size(0)
+        num_neg_samples = self.train_cfg['neg_pos_ratio'] * num_pos_samples
+        if num_neg_samples > neg_inds.size(0):
+            num_neg_samples = neg_inds.size(0)
+        topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
+        loss_cls_pos = loss_cls_all[pos_inds].sum()
+        loss_cls_neg = topk_loss_cls_neg.sum()
+        loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor
+
+        if self.reg_decoded_bbox:
+            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+            # is applied directly on the decoded bounding boxes, it
+            # decodes the already encoded coordinates to absolute format.
+            bbox_pred = self.bbox_coder.decode(anchor, bbox_pred)
+
+        loss_bbox = smooth_l1_loss(
+            bbox_pred,
+            bbox_targets,
+            bbox_weights,
+            beta=self.train_cfg['smoothl1_beta'],
+            avg_factor=avg_factor)
+        return loss_cls[None], loss_bbox
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, List[Tensor]]:
+        """Compute losses of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, list[Tensor]]: A dictionary of loss components. the dict
+            has components below:
+
+            - loss_cls (list[Tensor]): A list containing each feature map \
+            classification loss.
+            - loss_bbox (list[Tensor]): A list containing each feature map \
+            regression loss.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            unmap_outputs=True)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         avg_factor) = cls_reg_targets
+
+        num_images = len(batch_img_metas)
+        all_cls_scores = torch.cat([
+            s.permute(0, 2, 3, 1).reshape(
+                num_images, -1, self.cls_out_channels) for s in cls_scores
+        ], 1)
+        all_labels = torch.cat(labels_list, -1).view(num_images, -1)
+        all_label_weights = torch.cat(label_weights_list,
+                                      -1).view(num_images, -1)
+        all_bbox_preds = torch.cat([
+            b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
+            for b in bbox_preds
+        ], -2)
+        all_bbox_targets = torch.cat(bbox_targets_list,
+                                     -2).view(num_images, -1, 4)
+        all_bbox_weights = torch.cat(bbox_weights_list,
+                                     -2).view(num_images, -1, 4)
+
+        # concat all level anchors to a single tensor
+        all_anchors = []
+        for i in range(num_images):
+            all_anchors.append(torch.cat(anchor_list[i]))
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_by_feat_single,
+            all_cls_scores,
+            all_bbox_preds,
+            all_anchors,
+            all_labels,
+            all_label_weights,
+            all_bbox_targets,
+            all_bbox_weights,
+            avg_factor=avg_factor)
+        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
diff --git a/mmde/mmdet/models/dense_heads/tood_head.py b/mmde/mmdet/models/dense_heads/tood_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c59598d89289df6d1a87c7b6fde112429ac8f45
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/tood_head.py
@@ -0,0 +1,805 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, Scale
+from mmcv.ops import deform_conv2d
+from mmengine import MessageHub
+from mmengine.config import ConfigDict
+from mmengine.model import bias_init_with_prob, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures.bbox import distance2bbox
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList, reduce_mean)
+from ..task_modules.prior_generators import anchor_inside_flags
+from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply,
+                     sigmoid_geometric_mean, unmap)
+from .atss_head import ATSSHead
+
+
+class TaskDecomposition(nn.Module):
+    """Task decomposition module in task-aligned predictor of TOOD.
+
+    Args:
+        feat_channels (int): Number of feature channels in TOOD head.
+        stacked_convs (int): Number of conv layers in TOOD head.
+        la_down_rate (int): Downsample rate of layer attention.
+            Defaults to 8.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, optional):  Config dict for
+        normalization layer. Defaults to None.
+    """
+
+    def __init__(self,
+                 feat_channels: int,
+                 stacked_convs: int,
+                 la_down_rate: int = 8,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None) -> None:
+        super().__init__()
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.in_channels = self.feat_channels * self.stacked_convs
+        self.norm_cfg = norm_cfg
+        self.layer_attention = nn.Sequential(
+            nn.Conv2d(self.in_channels, self.in_channels // la_down_rate, 1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                self.in_channels // la_down_rate,
+                self.stacked_convs,
+                1,
+                padding=0), nn.Sigmoid())
+
+        self.reduction_conv = ConvModule(
+            self.in_channels,
+            self.feat_channels,
+            1,
+            stride=1,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            bias=norm_cfg is None)
+
+    def init_weights(self) -> None:
+        """Initialize the parameters."""
+        for m in self.layer_attention.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, std=0.001)
+        normal_init(self.reduction_conv.conv, std=0.01)
+
+    def forward(self,
+                feat: Tensor,
+                avg_feat: Optional[Tensor] = None) -> Tensor:
+        """Forward function of task decomposition module."""
+        b, c, h, w = feat.shape
+        if avg_feat is None:
+            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
+        weight = self.layer_attention(avg_feat)
+
+        # here we first compute the product between layer attention weight and
+        # conv weight, and then compute the convolution between new conv weight
+        # and feature map, in order to save memory and FLOPs.
+        conv_weight = weight.reshape(
+            b, 1, self.stacked_convs,
+            1) * self.reduction_conv.conv.weight.reshape(
+                1, self.feat_channels, self.stacked_convs, self.feat_channels)
+        conv_weight = conv_weight.reshape(b, self.feat_channels,
+                                          self.in_channels)
+        feat = feat.reshape(b, self.in_channels, h * w)
+        feat = torch.bmm(conv_weight, feat).reshape(b, self.feat_channels, h,
+                                                    w)
+        if self.norm_cfg is not None:
+            feat = self.reduction_conv.norm(feat)
+        feat = self.reduction_conv.activate(feat)
+
+        return feat
+
+
+@MODELS.register_module()
+class TOODHead(ATSSHead):
+    """TOODHead used in `TOOD: Task-aligned One-stage Object Detection.
+
+    <https://arxiv.org/abs/2108.07755>`_.
+
+    TOOD uses Task-aligned head (T-head) and is optimized by Task Alignment
+    Learning (TAL).
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        num_dcn (int): Number of deformable convolution in the head.
+            Defaults to 0.
+        anchor_type (str): If set to ``anchor_free``, the head will use centers
+            to regress bboxes. If set to ``anchor_based``, the head will
+            regress bboxes based on anchors. Defaults to ``anchor_free``.
+        initial_loss_cls (:obj:`ConfigDict` or dict): Config of initial loss.
+
+    Example:
+        >>> self = TOODHead(11, 7)
+        >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
+        >>> cls_score, bbox_pred = self.forward(feats)
+        >>> assert len(cls_score) == len(self.scales)
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 num_dcn: int = 0,
+                 anchor_type: str = 'anchor_free',
+                 initial_loss_cls: ConfigType = dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     activated=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 **kwargs) -> None:
+        assert anchor_type in ['anchor_free', 'anchor_based']
+        self.num_dcn = num_dcn
+        self.anchor_type = anchor_type
+        super().__init__(
+            num_classes=num_classes, in_channels=in_channels, **kwargs)
+
+        if self.train_cfg:
+            self.initial_epoch = self.train_cfg['initial_epoch']
+            self.initial_assigner = TASK_UTILS.build(
+                self.train_cfg['initial_assigner'])
+            self.initial_loss_cls = MODELS.build(initial_loss_cls)
+            self.assigner = self.initial_assigner
+            self.alignment_assigner = TASK_UTILS.build(
+                self.train_cfg['assigner'])
+            self.alpha = self.train_cfg['alpha']
+            self.beta = self.train_cfg['beta']
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.inter_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            if i < self.num_dcn:
+                conv_cfg = dict(type='DCNv2', deform_groups=4)
+            else:
+                conv_cfg = self.conv_cfg
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.inter_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg))
+
+        self.cls_decomp = TaskDecomposition(self.feat_channels,
+                                            self.stacked_convs,
+                                            self.stacked_convs * 8,
+                                            self.conv_cfg, self.norm_cfg)
+        self.reg_decomp = TaskDecomposition(self.feat_channels,
+                                            self.stacked_convs,
+                                            self.stacked_convs * 8,
+                                            self.conv_cfg, self.norm_cfg)
+
+        self.tood_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.tood_reg = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+
+        self.cls_prob_module = nn.Sequential(
+            nn.Conv2d(self.feat_channels * self.stacked_convs,
+                      self.feat_channels // 4, 1), nn.ReLU(inplace=True),
+            nn.Conv2d(self.feat_channels // 4, 1, 3, padding=1))
+        self.reg_offset_module = nn.Sequential(
+            nn.Conv2d(self.feat_channels * self.stacked_convs,
+                      self.feat_channels // 4, 1), nn.ReLU(inplace=True),
+            nn.Conv2d(self.feat_channels // 4, 4 * 2, 3, padding=1))
+
+        self.scales = nn.ModuleList(
+            [Scale(1.0) for _ in self.prior_generator.strides])
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        bias_cls = bias_init_with_prob(0.01)
+        for m in self.inter_convs:
+            normal_init(m.conv, std=0.01)
+        for m in self.cls_prob_module:
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, std=0.01)
+        for m in self.reg_offset_module:
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, std=0.001)
+        normal_init(self.cls_prob_module[-1], std=0.01, bias=bias_cls)
+
+        self.cls_decomp.init_weights()
+        self.reg_decomp.init_weights()
+
+        normal_init(self.tood_cls, std=0.01, bias=bias_cls)
+        normal_init(self.tood_reg, std=0.01)
+
+    def forward(self, feats: Tuple[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually a tuple of classification scores and bbox prediction
+                cls_scores (list[Tensor]): Classification scores for all scale
+                    levels, each is a 4D-tensor, the channels number is
+                    num_anchors * num_classes.
+                bbox_preds (list[Tensor]): Decoded box for all scale levels,
+                    each is a 4D-tensor, the channels number is
+                    num_anchors * 4. In [tl_x, tl_y, br_x, br_y] format.
+        """
+        cls_scores = []
+        bbox_preds = []
+        for idx, (x, scale, stride) in enumerate(
+                zip(feats, self.scales, self.prior_generator.strides)):
+            b, c, h, w = x.shape
+            anchor = self.prior_generator.single_level_grid_priors(
+                (h, w), idx, device=x.device)
+            anchor = torch.cat([anchor for _ in range(b)])
+            # extract task interactive features
+            inter_feats = []
+            for inter_conv in self.inter_convs:
+                x = inter_conv(x)
+                inter_feats.append(x)
+            feat = torch.cat(inter_feats, 1)
+
+            # task decomposition
+            avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
+            cls_feat = self.cls_decomp(feat, avg_feat)
+            reg_feat = self.reg_decomp(feat, avg_feat)
+
+            # cls prediction and alignment
+            cls_logits = self.tood_cls(cls_feat)
+            cls_prob = self.cls_prob_module(feat)
+            cls_score = sigmoid_geometric_mean(cls_logits, cls_prob)
+
+            # reg prediction and alignment
+            if self.anchor_type == 'anchor_free':
+                reg_dist = scale(self.tood_reg(reg_feat).exp()).float()
+                reg_dist = reg_dist.permute(0, 2, 3, 1).reshape(-1, 4)
+                reg_bbox = distance2bbox(
+                    self.anchor_center(anchor) / stride[0],
+                    reg_dist).reshape(b, h, w, 4).permute(0, 3, 1,
+                                                          2)  # (b, c, h, w)
+            elif self.anchor_type == 'anchor_based':
+                reg_dist = scale(self.tood_reg(reg_feat)).float()
+                reg_dist = reg_dist.permute(0, 2, 3, 1).reshape(-1, 4)
+                reg_bbox = self.bbox_coder.decode(anchor, reg_dist).reshape(
+                    b, h, w, 4).permute(0, 3, 1, 2) / stride[0]
+            else:
+                raise NotImplementedError(
+                    f'Unknown anchor type: {self.anchor_type}.'
+                    f'Please use `anchor_free` or `anchor_based`.')
+            reg_offset = self.reg_offset_module(feat)
+            bbox_pred = self.deform_sampling(reg_bbox.contiguous(),
+                                             reg_offset.contiguous())
+
+            # After deform_sampling, some boxes will become invalid (The
+            # left-top point is at the right or bottom of the right-bottom
+            # point), which will make the GIoULoss negative.
+            invalid_bbox_idx = (bbox_pred[:, [0]] > bbox_pred[:, [2]]) | \
+                               (bbox_pred[:, [1]] > bbox_pred[:, [3]])
+            invalid_bbox_idx = invalid_bbox_idx.expand_as(bbox_pred)
+            bbox_pred = torch.where(invalid_bbox_idx, reg_bbox, bbox_pred)
+
+            cls_scores.append(cls_score)
+            bbox_preds.append(bbox_pred)
+        return tuple(cls_scores), tuple(bbox_preds)
+
+    def deform_sampling(self, feat: Tensor, offset: Tensor) -> Tensor:
+        """Sampling the feature x according to offset.
+
+        Args:
+            feat (Tensor): Feature
+            offset (Tensor): Spatial offset for feature sampling
+        """
+        # it is an equivalent implementation of bilinear interpolation
+        b, c, h, w = feat.shape
+        weight = feat.new_ones(c, 1, 1, 1)
+        y = deform_conv2d(feat, offset, weight, 1, 0, 1, c, c)
+        return y
+
+    def anchor_center(self, anchors: Tensor) -> Tensor:
+        """Get anchor centers from anchors.
+
+        Args:
+            anchors (Tensor): Anchor list with shape (N, 4), "xyxy" format.
+
+        Returns:
+            Tensor: Anchor centers with shape (N, 2), "xy" format.
+        """
+        anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2
+        anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2
+        return torch.stack([anchors_cx, anchors_cy], dim=-1)
+
+    def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor,
+                            bbox_pred: Tensor, labels: Tensor,
+                            label_weights: Tensor, bbox_targets: Tensor,
+                            alignment_metrics: Tensor,
+                            stride: Tuple[int, int]) -> dict:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            cls_score (Tensor): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W).
+            bbox_pred (Tensor): Decoded bboxes for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            labels (Tensor): Labels of each anchors with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors).
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (N, num_total_anchors, 4).
+            alignment_metrics (Tensor): Alignment metrics with shape
+                (N, num_total_anchors).
+            stride (Tuple[int, int]): Downsample stride of the feature map.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert stride[0] == stride[1], 'h stride is not equal to w stride!'
+        anchors = anchors.reshape(-1, 4)
+        cls_score = cls_score.permute(0, 2, 3, 1).reshape(
+            -1, self.cls_out_channels).contiguous()
+        bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        alignment_metrics = alignment_metrics.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        targets = labels if self.epoch < self.initial_epoch else (
+            labels, alignment_metrics)
+        cls_loss_func = self.initial_loss_cls \
+            if self.epoch < self.initial_epoch else self.loss_cls
+
+        loss_cls = cls_loss_func(
+            cls_score, targets, label_weights, avg_factor=1.0)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero().squeeze(1)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+            pos_anchors = anchors[pos_inds]
+
+            pos_decode_bbox_pred = pos_bbox_pred
+            pos_decode_bbox_targets = pos_bbox_targets / stride[0]
+
+            # regression loss
+            pos_bbox_weight = self.centerness_target(
+                pos_anchors, pos_bbox_targets
+            ) if self.epoch < self.initial_epoch else alignment_metrics[
+                pos_inds]
+
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_decode_bbox_targets,
+                weight=pos_bbox_weight,
+                avg_factor=1.0)
+        else:
+            loss_bbox = bbox_pred.sum() * 0
+            pos_bbox_weight = bbox_targets.new_tensor(0.)
+
+        return loss_cls, loss_bbox, alignment_metrics.sum(
+        ), pos_bbox_weight.sum()
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Decoded box for each scale
+                level with shape (N, num_anchors * 4, H, W) in
+                [tl_x, tl_y, br_x, br_y] format.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        num_imgs = len(batch_img_metas)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        flatten_cls_scores = torch.cat([
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  self.cls_out_channels)
+            for cls_score in cls_scores
+        ], 1)
+        flatten_bbox_preds = torch.cat([
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) * stride[0]
+            for bbox_pred, stride in zip(bbox_preds,
+                                         self.prior_generator.strides)
+        ], 1)
+
+        cls_reg_targets = self.get_targets(
+            flatten_cls_scores,
+            flatten_bbox_preds,
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         alignment_metrics_list) = cls_reg_targets
+
+        losses_cls, losses_bbox, \
+            cls_avg_factors, bbox_avg_factors = multi_apply(
+                self.loss_by_feat_single,
+                anchor_list,
+                cls_scores,
+                bbox_preds,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                alignment_metrics_list,
+                self.prior_generator.strides)
+
+        cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item()
+        losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls))
+
+        bbox_avg_factor = reduce_mean(
+            sum(bbox_avg_factors)).clamp_(min=1).item()
+        losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox))
+        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: Optional[ConfigDict] = None,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factor from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid. In all
+                anchor-based methods, it has shape (num_priors, 4). In
+                all anchor-free methods, it has shape (num_priors, 2)
+                when `with_stride=True`, otherwise it still has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (:obj:`ConfigDict`, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            tuple[Tensor]: Results of detected bboxes and labels. If with_nms
+                is False and mlvl_score_factor is None, return mlvl_bboxes and
+                mlvl_scores, else return mlvl_bboxes, mlvl_scores and
+                mlvl_score_factor. Usually with_nms is False is used for aug
+                test. If with_nms is True, then return the following format
+
+                - det_bboxes (Tensor): Predicted bboxes with shape \
+                    [num_bboxes, 5], where the first 4 columns are bounding \
+                    box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
+                    column are scores between 0 and 1.
+                - det_labels (Tensor): Predicted labels of the corresponding \
+                    box with shape [num_bboxes].
+        """
+
+        cfg = self.test_cfg if cfg is None else cfg
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_labels = []
+        for cls_score, bbox_pred, priors, stride in zip(
+                cls_score_list, bbox_pred_list, mlvl_priors,
+                self.prior_generator.strides):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) * stride[0]
+            scores = cls_score.permute(1, 2,
+                                       0).reshape(-1, self.cls_out_channels)
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, keep_idxs, filtered_results = results
+
+            bboxes = filtered_results['bbox_pred']
+
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        results = InstanceData()
+        results.bboxes = torch.cat(mlvl_bboxes)
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+    def get_targets(self,
+                    cls_scores: List[List[Tensor]],
+                    bbox_preds: List[List[Tensor]],
+                    anchor_list: List[List[Tensor]],
+                    valid_flag_list: List[List[Tensor]],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs: bool = True) -> tuple:
+        """Compute regression and classification targets for anchors in
+        multiple images.
+
+        Args:
+            cls_scores (list[list[Tensor]]): Classification predictions of
+                images, a 3D-Tensor with shape [num_imgs, num_priors,
+                num_classes].
+            bbox_preds (list[list[Tensor]]): Decoded bboxes predictions of one
+                image, a 3D-Tensor with shape [num_imgs, num_priors, 4] in
+                [tl_x, tl_y, br_x, br_y] format.
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, )
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: a tuple containing learning targets.
+
+                - anchors_list (list[list[Tensor]]): Anchors of each level.
+                - labels_list (list[Tensor]): Labels of each level.
+                - label_weights_list (list[Tensor]): Label weights of each
+                  level.
+                - bbox_targets_list (list[Tensor]): BBox targets of each level.
+                - norm_alignment_metrics_list (list[Tensor]): Normalized
+                  alignment metrics of each level.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        num_level_anchors_list = [num_level_anchors] * num_imgs
+
+        # concat all level anchors and flags to a single tensor
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            anchor_list[i] = torch.cat(anchor_list[i])
+            valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+        # anchor_list: list(b * [-1, 4])
+
+        # get epoch information from message hub
+        message_hub = MessageHub.get_current_instance()
+        self.epoch = message_hub.get_info('epoch')
+
+        if self.epoch < self.initial_epoch:
+            (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+             all_bbox_weights, pos_inds_list, neg_inds_list,
+             sampling_result) = multi_apply(
+                 super()._get_targets_single,
+                 anchor_list,
+                 valid_flag_list,
+                 num_level_anchors_list,
+                 batch_gt_instances,
+                 batch_img_metas,
+                 batch_gt_instances_ignore,
+                 unmap_outputs=unmap_outputs)
+            all_assign_metrics = [
+                weight[..., 0] for weight in all_bbox_weights
+            ]
+        else:
+            (all_anchors, all_labels, all_label_weights, all_bbox_targets,
+             all_assign_metrics) = multi_apply(
+                 self._get_targets_single,
+                 cls_scores,
+                 bbox_preds,
+                 anchor_list,
+                 valid_flag_list,
+                 batch_gt_instances,
+                 batch_img_metas,
+                 batch_gt_instances_ignore,
+                 unmap_outputs=unmap_outputs)
+
+        # split targets to a list w.r.t. multiple levels
+        anchors_list = images_to_levels(all_anchors, num_level_anchors)
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        norm_alignment_metrics_list = images_to_levels(all_assign_metrics,
+                                                       num_level_anchors)
+
+        return (anchors_list, labels_list, label_weights_list,
+                bbox_targets_list, norm_alignment_metrics_list)
+
+    def _get_targets_single(self,
+                            cls_scores: Tensor,
+                            bbox_preds: Tensor,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression, classification targets for anchors in a single
+        image.
+
+        Args:
+            cls_scores (Tensor): Box scores for each image.
+            bbox_preds (Tensor): Box energies / deltas for each image.
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors ,4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                    shape (num_anchors,).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: N is the number of total anchors in the image.
+                anchors (Tensor): All anchors in the image with shape (N, 4).
+                labels (Tensor): Labels of all anchors in the image with shape
+                    (N,).
+                label_weights (Tensor): Label weights of all anchor in the
+                    image with shape (N,).
+                bbox_targets (Tensor): BBox targets of all anchors in the
+                    image with shape (N, 4).
+                norm_alignment_metrics (Tensor): Normalized alignment metrics
+                    of all priors in the image with shape (N,).
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg['allowed_border'])
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+        pred_instances = InstanceData(
+            priors=anchors,
+            scores=cls_scores[inside_flags, :],
+            bboxes=bbox_preds[inside_flags, :])
+        assign_result = self.alignment_assigner.assign(pred_instances,
+                                                       gt_instances,
+                                                       gt_instances_ignore,
+                                                       self.alpha, self.beta)
+        assign_ious = assign_result.max_overlaps
+        assign_metrics = assign_result.assign_metrics
+
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+        norm_alignment_metrics = anchors.new_zeros(
+            num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            # point-based
+            pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        class_assigned_gt_inds = torch.unique(
+            sampling_result.pos_assigned_gt_inds)
+        for gt_inds in class_assigned_gt_inds:
+            gt_class_inds = pos_inds[sampling_result.pos_assigned_gt_inds ==
+                                     gt_inds]
+            pos_alignment_metrics = assign_metrics[gt_class_inds]
+            pos_ious = assign_ious[gt_class_inds]
+            pos_norm_alignment_metrics = pos_alignment_metrics / (
+                pos_alignment_metrics.max() + 10e-8) * pos_ious.max()
+            norm_alignment_metrics[gt_class_inds] = pos_norm_alignment_metrics
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            anchors = unmap(anchors, num_total_anchors, inside_flags)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags, fill=self.num_classes)
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+            norm_alignment_metrics = unmap(norm_alignment_metrics,
+                                           num_total_anchors, inside_flags)
+        return (anchors, labels, label_weights, bbox_targets,
+                norm_alignment_metrics)
diff --git a/mmde/mmdet/models/dense_heads/vfnet_head.py b/mmde/mmdet/models/dense_heads/vfnet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..430b06d085d94760d56a7ea083eaf23bd32b1f53
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/vfnet_head.py
@@ -0,0 +1,722 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale
+from mmcv.ops import DeformConv2d
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig,
+                         OptInstanceList, RangeType, reduce_mean)
+from ..task_modules.prior_generators import MlvlPointGenerator
+from ..task_modules.samplers import PseudoSampler
+from ..utils import multi_apply
+from .atss_head import ATSSHead
+from .fcos_head import FCOSHead
+
+INF = 1e8
+
+
+@MODELS.register_module()
+class VFNetHead(ATSSHead, FCOSHead):
+    """Head of `VarifocalNet (VFNet): An IoU-aware Dense Object
+    Detector.<https://arxiv.org/abs/2008.13367>`_.
+
+    The VFNet predicts IoU-aware classification scores which mix the
+    object presence confidence and object localization accuracy as the
+    detection score. It is built on the FCOS architecture and uses ATSS
+    for defining positive/negative training examples. The VFNet is trained
+    with Varifocal Loss and empolys star-shaped deformable convolution to
+    extract features for a bbox.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple
+            level points.
+        center_sampling (bool): If true, use center sampling. Defaults to False.
+        center_sample_radius (float): Radius of center sampling. Defaults to 1.5.
+        sync_num_pos (bool): If true, synchronize the number of positive
+            examples across GPUs. Defaults to True
+        gradient_mul (float): The multiplier to gradients from bbox refinement
+            and recognition. Defaults to 0.1.
+        bbox_norm_type (str): The bbox normalization type, 'reg_denom' or
+            'stride'. Defaults to reg_denom
+        loss_cls_fl (:obj:`ConfigDict` or dict): Config of focal loss.
+        use_vfl (bool): If true, use varifocal loss for training.
+            Defaults to True.
+        loss_cls (:obj:`ConfigDict` or dict): Config of varifocal loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss,
+            GIoU Loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization
+            refinement loss, GIoU Loss.
+        norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and
+            config norm layer. Defaults to norm_cfg=dict(type='GN',
+            num_groups=32, requires_grad=True).
+        use_atss (bool): If true, use ATSS to define positive/negative
+            examples. Defaults to True.
+        anchor_generator (:obj:`ConfigDict` or dict): Config of anchor
+            generator for ATSS.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`]): Initialization config dict.
+
+    Example:
+        >>> self = VFNetHead(11, 7)
+        >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
+        >>> cls_score, bbox_pred, bbox_pred_refine= self.forward(feats)
+        >>> assert len(cls_score) == len(self.scales)
+    """  # noqa: E501
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 regress_ranges: RangeType = ((-1, 64), (64, 128), (128, 256),
+                                              (256, 512), (512, INF)),
+                 center_sampling: bool = False,
+                 center_sample_radius: float = 1.5,
+                 sync_num_pos: bool = True,
+                 gradient_mul: float = 0.1,
+                 bbox_norm_type: str = 'reg_denom',
+                 loss_cls_fl: ConfigType = dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 use_vfl: bool = True,
+                 loss_cls: ConfigType = dict(
+                     type='VarifocalLoss',
+                     use_sigmoid=True,
+                     alpha=0.75,
+                     gamma=2.0,
+                     iou_weighted=True,
+                     loss_weight=1.0),
+                 loss_bbox: ConfigType = dict(
+                     type='GIoULoss', loss_weight=1.5),
+                 loss_bbox_refine: ConfigType = dict(
+                     type='GIoULoss', loss_weight=2.0),
+                 norm_cfg: ConfigType = dict(
+                     type='GN', num_groups=32, requires_grad=True),
+                 use_atss: bool = True,
+                 reg_decoded_bbox: bool = True,
+                 anchor_generator: ConfigType = dict(
+                     type='AnchorGenerator',
+                     ratios=[1.0],
+                     octave_base_scale=8,
+                     scales_per_octave=1,
+                     center_offset=0.0,
+                     strides=[8, 16, 32, 64, 128]),
+                 init_cfg: MultiConfig = dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='vfnet_cls',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs) -> None:
+        # dcn base offsets, adapted from reppoints_head.py
+        self.num_dconv_points = 9
+        self.dcn_kernel = int(np.sqrt(self.num_dconv_points))
+        self.dcn_pad = int((self.dcn_kernel - 1) / 2)
+        dcn_base = np.arange(-self.dcn_pad,
+                             self.dcn_pad + 1).astype(np.float64)
+        dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
+        dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
+        dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(
+            (-1))
+        self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)
+
+        super(FCOSHead, self).__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            norm_cfg=norm_cfg,
+            init_cfg=init_cfg,
+            **kwargs)
+        self.regress_ranges = regress_ranges
+        self.reg_denoms = [
+            regress_range[-1] for regress_range in regress_ranges
+        ]
+        self.reg_denoms[-1] = self.reg_denoms[-2] * 2
+        self.center_sampling = center_sampling
+        self.center_sample_radius = center_sample_radius
+        self.sync_num_pos = sync_num_pos
+        self.bbox_norm_type = bbox_norm_type
+        self.gradient_mul = gradient_mul
+        self.use_vfl = use_vfl
+        if self.use_vfl:
+            self.loss_cls = MODELS.build(loss_cls)
+        else:
+            self.loss_cls = MODELS.build(loss_cls_fl)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.loss_bbox_refine = MODELS.build(loss_bbox_refine)
+
+        # for getting ATSS targets
+        self.use_atss = use_atss
+        self.reg_decoded_bbox = reg_decoded_bbox
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+
+        self.anchor_center_offset = anchor_generator['center_offset']
+
+        self.num_base_priors = self.prior_generator.num_base_priors[0]
+
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            if self.train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler()
+        # only be used in `get_atss_targets` when `use_atss` is True
+        self.atss_prior_generator = TASK_UTILS.build(anchor_generator)
+
+        self.fcos_prior_generator = MlvlPointGenerator(
+            anchor_generator['strides'],
+            self.anchor_center_offset if self.use_atss else 0.5)
+
+        # In order to reuse the `get_bboxes` in `BaseDenseHead.
+        # Only be used in testing phase.
+        self.prior_generator = self.fcos_prior_generator
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        super(FCOSHead, self)._init_cls_convs()
+        super(FCOSHead, self)._init_reg_convs()
+        self.relu = nn.ReLU()
+        self.vfnet_reg_conv = ConvModule(
+            self.feat_channels,
+            self.feat_channels,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            bias=self.conv_bias)
+        self.vfnet_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+        self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
+
+        self.vfnet_reg_refine_dconv = DeformConv2d(
+            self.feat_channels,
+            self.feat_channels,
+            self.dcn_kernel,
+            1,
+            padding=self.dcn_pad)
+        self.vfnet_reg_refine = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+        self.scales_refine = nn.ModuleList([Scale(1.0) for _ in self.strides])
+
+        self.vfnet_cls_dconv = DeformConv2d(
+            self.feat_channels,
+            self.feat_channels,
+            self.dcn_kernel,
+            1,
+            padding=self.dcn_pad)
+        self.vfnet_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+
+            - cls_scores (list[Tensor]): Box iou-aware scores for each scale
+              level, each is a 4D-tensor, the channel number is
+              num_points * num_classes.
+            - bbox_preds (list[Tensor]): Box offsets for each
+              scale level, each is a 4D-tensor, the channel number is
+              num_points * 4.
+            - bbox_preds_refine (list[Tensor]): Refined Box offsets for
+              each scale level, each is a 4D-tensor, the channel
+              number is num_points * 4.
+        """
+        return multi_apply(self.forward_single, x, self.scales,
+                           self.scales_refine, self.strides, self.reg_denoms)
+
+    def forward_single(self, x: Tensor, scale: Scale, scale_refine: Scale,
+                       stride: int, reg_denom: int) -> tuple:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            scale_refine (:obj: `mmcv.cnn.Scale`): Learnable scale module to
+                resize the refined bbox prediction.
+            stride (int): The corresponding stride for feature maps,
+                used to normalize the bbox prediction when
+                bbox_norm_type = 'stride'.
+            reg_denom (int): The corresponding regression range for feature
+                maps, only used to normalize the bbox prediction when
+                bbox_norm_type = 'reg_denom'.
+
+        Returns:
+            tuple: iou-aware cls scores for each box, bbox predictions and
+            refined bbox predictions of input feature maps.
+        """
+        cls_feat = x
+        reg_feat = x
+
+        for cls_layer in self.cls_convs:
+            cls_feat = cls_layer(cls_feat)
+
+        for reg_layer in self.reg_convs:
+            reg_feat = reg_layer(reg_feat)
+
+        # predict the bbox_pred of different level
+        reg_feat_init = self.vfnet_reg_conv(reg_feat)
+        if self.bbox_norm_type == 'reg_denom':
+            bbox_pred = scale(
+                self.vfnet_reg(reg_feat_init)).float().exp() * reg_denom
+        elif self.bbox_norm_type == 'stride':
+            bbox_pred = scale(
+                self.vfnet_reg(reg_feat_init)).float().exp() * stride
+        else:
+            raise NotImplementedError
+
+        # compute star deformable convolution offsets
+        # converting dcn_offset to reg_feat.dtype thus VFNet can be
+        # trained with FP16
+        dcn_offset = self.star_dcn_offset(bbox_pred, self.gradient_mul,
+                                          stride).to(reg_feat.dtype)
+
+        # refine the bbox_pred
+        reg_feat = self.relu(self.vfnet_reg_refine_dconv(reg_feat, dcn_offset))
+        bbox_pred_refine = scale_refine(
+            self.vfnet_reg_refine(reg_feat)).float().exp()
+        bbox_pred_refine = bbox_pred_refine * bbox_pred.detach()
+
+        # predict the iou-aware cls score
+        cls_feat = self.relu(self.vfnet_cls_dconv(cls_feat, dcn_offset))
+        cls_score = self.vfnet_cls(cls_feat)
+
+        if self.training:
+            return cls_score, bbox_pred, bbox_pred_refine
+        else:
+            return cls_score, bbox_pred_refine
+
+    def star_dcn_offset(self, bbox_pred: Tensor, gradient_mul: float,
+                        stride: int) -> Tensor:
+        """Compute the star deformable conv offsets.
+
+        Args:
+            bbox_pred (Tensor): Predicted bbox distance offsets (l, r, t, b).
+            gradient_mul (float): Gradient multiplier.
+            stride (int): The corresponding stride for feature maps,
+                used to project the bbox onto the feature map.
+
+        Returns:
+            Tensor: The offsets for deformable convolution.
+        """
+        dcn_base_offset = self.dcn_base_offset.type_as(bbox_pred)
+        bbox_pred_grad_mul = (1 - gradient_mul) * bbox_pred.detach() + \
+            gradient_mul * bbox_pred
+        # map to the feature map scale
+        bbox_pred_grad_mul = bbox_pred_grad_mul / stride
+        N, C, H, W = bbox_pred.size()
+
+        x1 = bbox_pred_grad_mul[:, 0, :, :]
+        y1 = bbox_pred_grad_mul[:, 1, :, :]
+        x2 = bbox_pred_grad_mul[:, 2, :, :]
+        y2 = bbox_pred_grad_mul[:, 3, :, :]
+        bbox_pred_grad_mul_offset = bbox_pred.new_zeros(
+            N, 2 * self.num_dconv_points, H, W)
+        bbox_pred_grad_mul_offset[:, 0, :, :] = -1.0 * y1  # -y1
+        bbox_pred_grad_mul_offset[:, 1, :, :] = -1.0 * x1  # -x1
+        bbox_pred_grad_mul_offset[:, 2, :, :] = -1.0 * y1  # -y1
+        bbox_pred_grad_mul_offset[:, 4, :, :] = -1.0 * y1  # -y1
+        bbox_pred_grad_mul_offset[:, 5, :, :] = x2  # x2
+        bbox_pred_grad_mul_offset[:, 7, :, :] = -1.0 * x1  # -x1
+        bbox_pred_grad_mul_offset[:, 11, :, :] = x2  # x2
+        bbox_pred_grad_mul_offset[:, 12, :, :] = y2  # y2
+        bbox_pred_grad_mul_offset[:, 13, :, :] = -1.0 * x1  # -x1
+        bbox_pred_grad_mul_offset[:, 14, :, :] = y2  # y2
+        bbox_pred_grad_mul_offset[:, 16, :, :] = y2  # y2
+        bbox_pred_grad_mul_offset[:, 17, :, :] = x2  # x2
+        dcn_offset = bbox_pred_grad_mul_offset - dcn_base_offset
+
+        return dcn_offset
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            bbox_preds_refine: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box iou-aware scores for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box offsets for each
+                scale level, each is a 4D-tensor, the channel number is
+                num_points * 4.
+            bbox_preds_refine (list[Tensor]): Refined Box offsets for
+                each scale level, each is a 4D-tensor, the channel
+                number is num_points * 4.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        all_level_points = self.fcos_prior_generator.grid_priors(
+            featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device)
+        labels, label_weights, bbox_targets, bbox_weights = self.get_targets(
+            cls_scores,
+            all_level_points,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+
+        num_imgs = cls_scores[0].size(0)
+        # flatten cls_scores, bbox_preds and bbox_preds_refine
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3,
+                              1).reshape(-1,
+                                         self.cls_out_channels).contiguous()
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
+            for bbox_pred in bbox_preds
+        ]
+        flatten_bbox_preds_refine = [
+            bbox_pred_refine.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
+            for bbox_pred_refine in bbox_preds_refine
+        ]
+        flatten_cls_scores = torch.cat(flatten_cls_scores)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
+        flatten_bbox_preds_refine = torch.cat(flatten_bbox_preds_refine)
+        flatten_labels = torch.cat(labels)
+        flatten_bbox_targets = torch.cat(bbox_targets)
+        # repeat points to align with bbox_preds
+        flatten_points = torch.cat(
+            [points.repeat(num_imgs, 1) for points in all_level_points])
+
+        # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = torch.where(
+            ((flatten_labels >= 0) & (flatten_labels < bg_class_ind)) > 0)[0]
+        num_pos = len(pos_inds)
+
+        pos_bbox_preds = flatten_bbox_preds[pos_inds]
+        pos_bbox_preds_refine = flatten_bbox_preds_refine[pos_inds]
+        pos_labels = flatten_labels[pos_inds]
+
+        # sync num_pos across all gpus
+        if self.sync_num_pos:
+            num_pos_avg_per_gpu = reduce_mean(
+                pos_inds.new_tensor(num_pos).float()).item()
+            num_pos_avg_per_gpu = max(num_pos_avg_per_gpu, 1.0)
+        else:
+            num_pos_avg_per_gpu = num_pos
+
+        pos_bbox_targets = flatten_bbox_targets[pos_inds]
+        pos_points = flatten_points[pos_inds]
+
+        pos_decoded_bbox_preds = self.bbox_coder.decode(
+            pos_points, pos_bbox_preds)
+        pos_decoded_target_preds = self.bbox_coder.decode(
+            pos_points, pos_bbox_targets)
+        iou_targets_ini = bbox_overlaps(
+            pos_decoded_bbox_preds,
+            pos_decoded_target_preds.detach(),
+            is_aligned=True).clamp(min=1e-6)
+        bbox_weights_ini = iou_targets_ini.clone().detach()
+        bbox_avg_factor_ini = reduce_mean(
+            bbox_weights_ini.sum()).clamp_(min=1).item()
+
+        pos_decoded_bbox_preds_refine = \
+            self.bbox_coder.decode(pos_points, pos_bbox_preds_refine)
+        iou_targets_rf = bbox_overlaps(
+            pos_decoded_bbox_preds_refine,
+            pos_decoded_target_preds.detach(),
+            is_aligned=True).clamp(min=1e-6)
+        bbox_weights_rf = iou_targets_rf.clone().detach()
+        bbox_avg_factor_rf = reduce_mean(
+            bbox_weights_rf.sum()).clamp_(min=1).item()
+
+        if num_pos > 0:
+            loss_bbox = self.loss_bbox(
+                pos_decoded_bbox_preds,
+                pos_decoded_target_preds.detach(),
+                weight=bbox_weights_ini,
+                avg_factor=bbox_avg_factor_ini)
+
+            loss_bbox_refine = self.loss_bbox_refine(
+                pos_decoded_bbox_preds_refine,
+                pos_decoded_target_preds.detach(),
+                weight=bbox_weights_rf,
+                avg_factor=bbox_avg_factor_rf)
+
+            # build IoU-aware cls_score targets
+            if self.use_vfl:
+                pos_ious = iou_targets_rf.clone().detach()
+                cls_iou_targets = torch.zeros_like(flatten_cls_scores)
+                cls_iou_targets[pos_inds, pos_labels] = pos_ious
+        else:
+            loss_bbox = pos_bbox_preds.sum() * 0
+            loss_bbox_refine = pos_bbox_preds_refine.sum() * 0
+            if self.use_vfl:
+                cls_iou_targets = torch.zeros_like(flatten_cls_scores)
+
+        if self.use_vfl:
+            loss_cls = self.loss_cls(
+                flatten_cls_scores,
+                cls_iou_targets,
+                avg_factor=num_pos_avg_per_gpu)
+        else:
+            loss_cls = self.loss_cls(
+                flatten_cls_scores,
+                flatten_labels,
+                weight=label_weights,
+                avg_factor=num_pos_avg_per_gpu)
+
+        return dict(
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_bbox_rf=loss_bbox_refine)
+
+    def get_targets(
+            self,
+            cls_scores: List[Tensor],
+            mlvl_points: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> tuple:
+        """A wrapper for computing ATSS and FCOS targets for points in multiple
+        images.
+
+        Args:
+            cls_scores (list[Tensor]): Box iou-aware scores for each scale
+                level with shape (N, num_points * num_classes, H, W).
+            mlvl_points (list[Tensor]): Points of each fpn level, each has
+                shape (num_points, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            tuple:
+
+            - labels_list (list[Tensor]): Labels of each level.
+            - label_weights (Tensor/None): Label weights of all levels.
+            - bbox_targets_list (list[Tensor]): Regression targets of each
+              level, (l, t, r, b).
+            - bbox_weights (Tensor/None): Bbox weights of all levels.
+        """
+        if self.use_atss:
+            return self.get_atss_targets(cls_scores, mlvl_points,
+                                         batch_gt_instances, batch_img_metas,
+                                         batch_gt_instances_ignore)
+        else:
+            self.norm_on_bbox = False
+            return self.get_fcos_targets(mlvl_points, batch_gt_instances)
+
+    def _get_targets_single(self, *args, **kwargs):
+        """Avoid ambiguity in multiple inheritance."""
+        if self.use_atss:
+            return ATSSHead._get_targets_single(self, *args, **kwargs)
+        else:
+            return FCOSHead._get_targets_single(self, *args, **kwargs)
+
+    def get_fcos_targets(self, points: List[Tensor],
+                         batch_gt_instances: InstanceList) -> tuple:
+        """Compute FCOS regression and classification targets for points in
+        multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            tuple:
+
+            - labels (list[Tensor]): Labels of each level.
+            - label_weights: None, to be compatible with ATSS targets.
+            - bbox_targets (list[Tensor]): BBox targets of each level.
+            - bbox_weights: None, to be compatible with ATSS targets.
+        """
+        labels, bbox_targets = FCOSHead.get_targets(self, points,
+                                                    batch_gt_instances)
+        label_weights = None
+        bbox_weights = None
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    def get_anchors(self,
+                    featmap_sizes: List[Tuple],
+                    batch_img_metas: List[dict],
+                    device: str = 'cuda') -> tuple:
+        """Get anchors according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            batch_img_metas (list[dict]): Image meta info.
+            device (str): Device for returned tensors
+
+        Returns:
+            tuple:
+
+            - anchor_list (list[Tensor]): Anchors of each image.
+            - valid_flag_list (list[Tensor]): Valid flags of each image.
+        """
+        num_imgs = len(batch_img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # anchors for one time
+        multi_level_anchors = self.atss_prior_generator.grid_priors(
+            featmap_sizes, device=device)
+        anchor_list = [multi_level_anchors for _ in range(num_imgs)]
+
+        # for each image, we compute valid flags of multi level anchors
+        valid_flag_list = []
+        for img_id, img_meta in enumerate(batch_img_metas):
+            multi_level_flags = self.atss_prior_generator.valid_flags(
+                featmap_sizes, img_meta['pad_shape'], device=device)
+            valid_flag_list.append(multi_level_flags)
+
+        return anchor_list, valid_flag_list
+
+    def get_atss_targets(
+            self,
+            cls_scores: List[Tensor],
+            mlvl_points: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> tuple:
+        """A wrapper for computing ATSS targets for points in multiple images.
+
+        Args:
+            cls_scores (list[Tensor]): Box iou-aware scores for each scale
+                level with shape (N, num_points * num_classes, H, W).
+            mlvl_points (list[Tensor]): Points of each fpn level, each has
+                shape (num_points, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            tuple:
+
+            - labels_list (list[Tensor]): Labels of each level.
+            - label_weights (Tensor): Label weights of all levels.
+            - bbox_targets_list (list[Tensor]): Regression targets of each
+              level, (l, t, r, b).
+            - bbox_weights (Tensor): Bbox weights of all levels.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(
+            featmap_sizes
+        ) == self.atss_prior_generator.num_levels == \
+            self.fcos_prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        cls_reg_targets = ATSSHead.get_targets(
+            self,
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore,
+            unmap_outputs=True)
+
+        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, avg_factor) = cls_reg_targets
+
+        bbox_targets_list = [
+            bbox_targets.reshape(-1, 4) for bbox_targets in bbox_targets_list
+        ]
+
+        num_imgs = len(batch_img_metas)
+        # transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format
+        bbox_targets_list = self.transform_bbox_targets(
+            bbox_targets_list, mlvl_points, num_imgs)
+
+        labels_list = [labels.reshape(-1) for labels in labels_list]
+        label_weights_list = [
+            label_weights.reshape(-1) for label_weights in label_weights_list
+        ]
+        bbox_weights_list = [
+            bbox_weights.reshape(-1) for bbox_weights in bbox_weights_list
+        ]
+        label_weights = torch.cat(label_weights_list)
+        bbox_weights = torch.cat(bbox_weights_list)
+        return labels_list, label_weights, bbox_targets_list, bbox_weights
+
+    def transform_bbox_targets(self, decoded_bboxes: List[Tensor],
+                               mlvl_points: List[Tensor],
+                               num_imgs: int) -> List[Tensor]:
+        """Transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format.
+
+        Args:
+            decoded_bboxes (list[Tensor]): Regression targets of each level,
+                in the form of (x1, y1, x2, y2).
+            mlvl_points (list[Tensor]): Points of each fpn level, each has
+                shape (num_points, 2).
+            num_imgs (int): the number of images in a batch.
+
+        Returns:
+            bbox_targets (list[Tensor]): Regression targets of each level in
+                the form of (l, t, r, b).
+        """
+        # TODO: Re-implemented in Class PointCoder
+        assert len(decoded_bboxes) == len(mlvl_points)
+        num_levels = len(decoded_bboxes)
+        mlvl_points = [points.repeat(num_imgs, 1) for points in mlvl_points]
+        bbox_targets = []
+        for i in range(num_levels):
+            bbox_target = self.bbox_coder.encode(mlvl_points[i],
+                                                 decoded_bboxes[i])
+            bbox_targets.append(bbox_target)
+
+        return bbox_targets
+
+    def _load_from_state_dict(self, state_dict: dict, prefix: str,
+                              local_metadata: dict, strict: bool,
+                              missing_keys: Union[List[str], str],
+                              unexpected_keys: Union[List[str], str],
+                              error_msgs: Union[List[str], str]) -> None:
+        """Override the method in the parent class to avoid changing para's
+        name."""
+        pass
diff --git a/mmde/mmdet/models/dense_heads/yolact_head.py b/mmde/mmdet/models/dense_heads/yolact_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..3390c136a31bee81134667eb28ad8829ddb84cc3
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/yolact_head.py
@@ -0,0 +1,1193 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import List, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule, ModuleList
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList, OptMultiConfig)
+from ..layers import fast_nms
+from ..utils import images_to_levels, multi_apply, select_single_mlvl
+from ..utils.misc import empty_instances
+from .anchor_head import AnchorHead
+from .base_mask_head import BaseMaskHead
+
+
+@MODELS.register_module()
+class YOLACTHead(AnchorHead):
+    """YOLACT box head used in https://arxiv.org/abs/1904.02689.
+
+    Note that YOLACT head is a light version of RetinaNet head.
+    Four differences are described as follows:
+
+    1. YOLACT box head has three-times fewer anchors.
+    2. YOLACT box head shares the convs for box and cls branches.
+    3. YOLACT box head uses OHEM instead of Focal loss.
+    4. YOLACT box head predicts a set of mask coefficients for each box.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        anchor_generator (:obj:`ConfigDict` or dict): Config dict for
+            anchor generator
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+        num_head_convs (int): Number of the conv layers shared by
+            box and cls branches.
+        num_protos (int): Number of the mask coefficients.
+        use_ohem (bool): If true, ``loss_single_OHEM`` will be used for
+            cls loss calculation. If false, ``loss_single`` will be used.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to
+            construct and config conv layer.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to
+            construct and config norm layer.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 anchor_generator: ConfigType = dict(
+                     type='AnchorGenerator',
+                     octave_base_scale=3,
+                     scales_per_octave=1,
+                     ratios=[0.5, 1.0, 2.0],
+                     strides=[8, 16, 32, 64, 128]),
+                 loss_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     reduction='none',
+                     loss_weight=1.0),
+                 loss_bbox: ConfigType = dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=1.5),
+                 num_head_convs: int = 1,
+                 num_protos: int = 32,
+                 use_ohem: bool = True,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = dict(
+                     type='Xavier',
+                     distribution='uniform',
+                     bias=0,
+                     layer='Conv2d'),
+                 **kwargs) -> None:
+        self.num_head_convs = num_head_convs
+        self.num_protos = num_protos
+        self.use_ohem = use_ohem
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            anchor_generator=anchor_generator,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.head_convs = ModuleList()
+        for i in range(self.num_head_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.head_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.conv_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.conv_reg = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+        self.conv_coeff = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.num_protos,
+            3,
+            padding=1)
+
+    def forward_single(self, x: Tensor) -> tuple:
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+
+        Returns:
+            tuple:
+
+            - cls_score (Tensor): Cls scores for a single scale level
+              the channels number is num_anchors * num_classes.
+            - bbox_pred (Tensor): Box energies / deltas for a single scale
+              level, the channels number is num_anchors * 4.
+            - coeff_pred (Tensor): Mask coefficients for a single scale
+              level, the channels number is num_anchors * num_protos.
+        """
+        for head_conv in self.head_convs:
+            x = head_conv(x)
+        cls_score = self.conv_cls(x)
+        bbox_pred = self.conv_reg(x)
+        coeff_pred = self.conv_coeff(x).tanh()
+        return cls_score, bbox_pred, coeff_pred
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            coeff_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the bbox head.
+
+        When ``self.use_ohem == True``, it functions like ``SSDHead.loss``,
+        otherwise, it follows ``AnchorHead.loss``.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            coeff_preds (list[Tensor]): Mask coefficients for each scale
+                level with shape (N, num_anchors * num_protos, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            unmap_outputs=not self.use_ohem,
+            return_sampling_results=True)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         avg_factor, sampling_results) = cls_reg_targets
+
+        if self.use_ohem:
+            num_images = len(batch_img_metas)
+            all_cls_scores = torch.cat([
+                s.permute(0, 2, 3, 1).reshape(
+                    num_images, -1, self.cls_out_channels) for s in cls_scores
+            ], 1)
+            all_labels = torch.cat(labels_list, -1).view(num_images, -1)
+            all_label_weights = torch.cat(label_weights_list,
+                                          -1).view(num_images, -1)
+            all_bbox_preds = torch.cat([
+                b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
+                for b in bbox_preds
+            ], -2)
+            all_bbox_targets = torch.cat(bbox_targets_list,
+                                         -2).view(num_images, -1, 4)
+            all_bbox_weights = torch.cat(bbox_weights_list,
+                                         -2).view(num_images, -1, 4)
+
+            # concat all level anchors to a single tensor
+            all_anchors = []
+            for i in range(num_images):
+                all_anchors.append(torch.cat(anchor_list[i]))
+
+            # check NaN and Inf
+            assert torch.isfinite(all_cls_scores).all().item(), \
+                'classification scores become infinite or NaN!'
+            assert torch.isfinite(all_bbox_preds).all().item(), \
+                'bbox predications become infinite or NaN!'
+
+            losses_cls, losses_bbox = multi_apply(
+                self.OHEMloss_by_feat_single,
+                all_cls_scores,
+                all_bbox_preds,
+                all_anchors,
+                all_labels,
+                all_label_weights,
+                all_bbox_targets,
+                all_bbox_weights,
+                avg_factor=avg_factor)
+        else:
+            # anchor number of multi levels
+            num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+            # concat all level anchors and flags to a single tensor
+            concat_anchor_list = []
+            for i in range(len(anchor_list)):
+                concat_anchor_list.append(torch.cat(anchor_list[i]))
+            all_anchor_list = images_to_levels(concat_anchor_list,
+                                               num_level_anchors)
+            losses_cls, losses_bbox = multi_apply(
+                self.loss_by_feat_single,
+                cls_scores,
+                bbox_preds,
+                all_anchor_list,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                bbox_weights_list,
+                avg_factor=avg_factor)
+        losses = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+        # update `_raw_positive_infos`, which will be used when calling
+        # `get_positive_infos`.
+        self._raw_positive_infos.update(coeff_preds=coeff_preds)
+        return losses
+
+    def OHEMloss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                                anchors: Tensor, labels: Tensor,
+                                label_weights: Tensor, bbox_targets: Tensor,
+                                bbox_weights: Tensor,
+                                avg_factor: int) -> tuple:
+        """Compute loss of a single image. Similar to
+        func:``SSDHead.loss_by_feat_single``
+
+        Args:
+            cls_score (Tensor): Box scores for eachimage
+                Has shape (num_total_anchors, num_classes).
+            bbox_pred (Tensor): Box energies / deltas for each image
+                level with shape (num_total_anchors, 4).
+            anchors (Tensor): Box reference for each scale level with shape
+                (num_total_anchors, 4).
+            labels (Tensor): Labels of each anchors with shape
+                (num_total_anchors,).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (num_total_anchors,)
+            bbox_targets (Tensor): BBox regression targets of each anchor with
+                shape (num_total_anchors, 4).
+            bbox_weights (Tensor): BBox regression loss weights of each anchor
+                with shape (num_total_anchors, 4).
+            avg_factor (int): Average factor that is used to average
+                the loss. When using sampling method, avg_factor is usually
+                the sum of positive and negative priors. When using
+                `PseudoSampler`, `avg_factor` is usually equal to the number
+                of positive priors.
+
+        Returns:
+            Tuple[Tensor, Tensor]: A tuple of cls loss and bbox loss of one
+            feature map.
+        """
+
+        loss_cls_all = self.loss_cls(cls_score, labels, label_weights)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero(
+            as_tuple=False).reshape(-1)
+        neg_inds = (labels == self.num_classes).nonzero(
+            as_tuple=False).view(-1)
+
+        num_pos_samples = pos_inds.size(0)
+        if num_pos_samples == 0:
+            num_neg_samples = neg_inds.size(0)
+        else:
+            num_neg_samples = self.train_cfg['neg_pos_ratio'] * \
+                              num_pos_samples
+            if num_neg_samples > neg_inds.size(0):
+                num_neg_samples = neg_inds.size(0)
+        topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
+        loss_cls_pos = loss_cls_all[pos_inds].sum()
+        loss_cls_neg = topk_loss_cls_neg.sum()
+        loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor
+        if self.reg_decoded_bbox:
+            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+            # is applied directly on the decoded bounding boxes, it
+            # decodes the already encoded coordinates to absolute format.
+            bbox_pred = self.bbox_coder.decode(anchors, bbox_pred)
+        loss_bbox = self.loss_bbox(
+            bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor)
+        return loss_cls[None], loss_bbox
+
+    def get_positive_infos(self) -> InstanceList:
+        """Get positive information from sampling results.
+
+        Returns:
+            list[:obj:`InstanceData`]: Positive Information of each image,
+            usually including positive bboxes, positive labels, positive
+            priors, positive coeffs, etc.
+        """
+        assert len(self._raw_positive_infos) > 0
+        sampling_results = self._raw_positive_infos['sampling_results']
+        num_imgs = len(sampling_results)
+
+        coeff_pred_list = []
+        for coeff_pred_per_level in self._raw_positive_infos['coeff_preds']:
+            coeff_pred_per_level = \
+                coeff_pred_per_level.permute(
+                    0, 2, 3, 1).reshape(num_imgs, -1, self.num_protos)
+            coeff_pred_list.append(coeff_pred_per_level)
+        coeff_preds = torch.cat(coeff_pred_list, dim=1)
+
+        pos_info_list = []
+        for idx, sampling_result in enumerate(sampling_results):
+            pos_info = InstanceData()
+            coeff_preds_single = coeff_preds[idx]
+            pos_info.pos_assigned_gt_inds = \
+                sampling_result.pos_assigned_gt_inds
+            pos_info.pos_inds = sampling_result.pos_inds
+            pos_info.coeffs = coeff_preds_single[sampling_result.pos_inds]
+            pos_info.bboxes = sampling_result.pos_gt_bboxes
+            pos_info_list.append(pos_info)
+        return pos_info_list
+
+    def predict_by_feat(self,
+                        cls_scores,
+                        bbox_preds,
+                        coeff_preds,
+                        batch_img_metas,
+                        cfg=None,
+                        rescale=True,
+                        **kwargs):
+        """Similar to func:``AnchorHead.get_bboxes``, but additionally
+        processes coeff_preds.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                with shape (N, num_anchors * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W)
+            coeff_preds (list[Tensor]): Mask coefficients for each scale
+                level with shape (N, num_anchors * num_protos, H, W)
+            batch_img_metas (list[dict]): Batch image meta info.
+            cfg (:obj:`Config` | None): Test / postprocessing configuration,
+                if None, test_cfg would be used
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - coeffs (Tensor): the predicted mask coefficients of
+                  instance inside the corresponding box has a shape
+                  (n, num_protos).
+        """
+        assert len(cls_scores) == len(bbox_preds)
+        num_levels = len(cls_scores)
+
+        device = cls_scores[0].device
+        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes, device=device)
+
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            cls_score_list = select_single_mlvl(cls_scores, img_id)
+            bbox_pred_list = select_single_mlvl(bbox_preds, img_id)
+            coeff_pred_list = select_single_mlvl(coeff_preds, img_id)
+            results = self._predict_by_feat_single(
+                cls_score_list=cls_score_list,
+                bbox_pred_list=bbox_pred_list,
+                coeff_preds_list=coeff_pred_list,
+                mlvl_priors=mlvl_priors,
+                img_meta=img_meta,
+                cfg=cfg,
+                rescale=rescale)
+            result_list.append(results)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                coeff_preds_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigType,
+                                rescale: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results. Similar to func:``AnchorHead._predict_by_feat_single``,
+        but additionally processes coeff_preds_list and uses fast NMS instead
+        of traditional NMS.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores for a single scale level
+                Has shape (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas for a single
+                scale level with shape (num_priors * 4, H, W).
+            coeff_preds_list (list[Tensor]): Mask coefficients for a single
+                scale level with shape (num_priors * num_protos, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid,
+                has shape (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (mmengine.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - coeffs (Tensor): the predicted mask coefficients of
+                  instance inside the corresponding box has a shape
+                  (n, num_protos).
+        """
+        assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_priors)
+
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bbox_preds = []
+        mlvl_valid_priors = []
+        mlvl_scores = []
+        mlvl_coeffs = []
+        for cls_score, bbox_pred, coeff_pred, priors in \
+                zip(cls_score_list, bbox_pred_list,
+                    coeff_preds_list, mlvl_priors):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            coeff_pred = coeff_pred.permute(1, 2,
+                                            0).reshape(-1, self.num_protos)
+
+            if 0 < nms_pre < scores.shape[0]:
+                # Get maximum scores for foreground classes.
+                if self.use_sigmoid_cls:
+                    max_scores, _ = scores.max(dim=1)
+                else:
+                    # remind that we set FG labels to [0, num_class-1]
+                    # since mmdet v2.0
+                    # BG cat_id: num_class
+                    max_scores, _ = scores[:, :-1].max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                priors = priors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+                coeff_pred = coeff_pred[topk_inds, :]
+
+            mlvl_bbox_preds.append(bbox_pred)
+            mlvl_valid_priors.append(priors)
+            mlvl_scores.append(scores)
+            mlvl_coeffs.append(coeff_pred)
+
+        bbox_pred = torch.cat(mlvl_bbox_preds)
+        priors = torch.cat(mlvl_valid_priors)
+        multi_bboxes = self.bbox_coder.decode(
+            priors, bbox_pred, max_shape=img_shape)
+
+        multi_scores = torch.cat(mlvl_scores)
+        multi_coeffs = torch.cat(mlvl_coeffs)
+
+        return self._bbox_post_process(
+            multi_bboxes=multi_bboxes,
+            multi_scores=multi_scores,
+            multi_coeffs=multi_coeffs,
+            cfg=cfg,
+            rescale=rescale,
+            img_meta=img_meta)
+
+    def _bbox_post_process(self,
+                           multi_bboxes: Tensor,
+                           multi_scores: Tensor,
+                           multi_coeffs: Tensor,
+                           cfg: ConfigType,
+                           rescale: bool = False,
+                           img_meta: Optional[dict] = None,
+                           **kwargs) -> InstanceData:
+        """bbox post-processing method.
+
+        The boxes would be rescaled to the original image scale and do
+        the nms operation. Usually `with_nms` is False is used for aug test.
+
+        Args:
+            multi_bboxes (Tensor): Predicted bbox that concat all levels.
+            multi_scores (Tensor): Bbox scores that concat all levels.
+            multi_coeffs (Tensor): Mask coefficients  that concat all levels.
+            cfg (ConfigDict): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default to False.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - coeffs (Tensor): the predicted mask coefficients of
+                  instance inside the corresponding box has a shape
+                  (n, num_protos).
+        """
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            multi_bboxes /= multi_bboxes.new_tensor(
+                img_meta['scale_factor']).repeat((1, 2))
+            # mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
+
+        if self.use_sigmoid_cls:
+            # Add a dummy background class to the backend when using sigmoid
+            # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
+            # BG cat_id: num_class
+
+            padding = multi_scores.new_zeros(multi_scores.shape[0], 1)
+            multi_scores = torch.cat([multi_scores, padding], dim=1)
+        det_bboxes, det_labels, det_coeffs = fast_nms(
+            multi_bboxes, multi_scores, multi_coeffs, cfg.score_thr,
+            cfg.iou_thr, cfg.top_k, cfg.max_per_img)
+        results = InstanceData()
+        results.bboxes = det_bboxes[:, :4]
+        results.scores = det_bboxes[:, -1]
+        results.labels = det_labels
+        results.coeffs = det_coeffs
+        return results
+
+
+@MODELS.register_module()
+class YOLACTProtonet(BaseMaskHead):
+    """YOLACT mask head used in https://arxiv.org/abs/1904.02689.
+
+    This head outputs the mask prototypes for YOLACT.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        proto_channels (tuple[int]): Output channels of protonet convs.
+        proto_kernel_sizes (tuple[int]): Kernel sizes of protonet convs.
+        include_last_relu (bool): If keep the last relu of protonet.
+        num_protos (int): Number of prototypes.
+        num_classes (int): Number of categories excluding the background
+            category.
+        loss_mask_weight (float): Reweight the mask loss by this factor.
+        max_masks_to_train (int): Maximum number of masks to train for
+            each image.
+        with_seg_branch (bool): Whether to apply a semantic segmentation
+            branch and calculate loss during training to increase
+            performance with no speed penalty. Defaults to True.
+        loss_segm (:obj:`ConfigDict` or dict, optional): Config of
+            semantic segmentation loss.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config
+            of head.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            head.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        in_channels: int = 256,
+        proto_channels: tuple = (256, 256, 256, None, 256, 32),
+        proto_kernel_sizes: tuple = (3, 3, 3, -2, 3, 1),
+        include_last_relu: bool = True,
+        num_protos: int = 32,
+        loss_mask_weight: float = 1.0,
+        max_masks_to_train: int = 100,
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        with_seg_branch: bool = True,
+        loss_segm: ConfigType = dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        init_cfg=dict(
+            type='Xavier',
+            distribution='uniform',
+            override=dict(name='protonet'))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.proto_channels = proto_channels
+        self.proto_kernel_sizes = proto_kernel_sizes
+        self.include_last_relu = include_last_relu
+
+        # Segmentation branch
+        self.with_seg_branch = with_seg_branch
+        self.segm_branch = SegmentationModule(
+            num_classes=num_classes, in_channels=in_channels) \
+            if with_seg_branch else None
+        self.loss_segm = MODELS.build(loss_segm) if with_seg_branch else None
+
+        self.loss_mask_weight = loss_mask_weight
+        self.num_protos = num_protos
+        self.num_classes = num_classes
+        self.max_masks_to_train = max_masks_to_train
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        # Possible patterns:
+        # ( 256, 3) -> conv
+        # ( 256,-2) -> deconv
+        # (None,-2) -> bilinear interpolate
+        in_channels = self.in_channels
+        protonets = ModuleList()
+        for num_channels, kernel_size in zip(self.proto_channels,
+                                             self.proto_kernel_sizes):
+            if kernel_size > 0:
+                layer = nn.Conv2d(
+                    in_channels,
+                    num_channels,
+                    kernel_size,
+                    padding=kernel_size // 2)
+            else:
+                if num_channels is None:
+                    layer = InterpolateModule(
+                        scale_factor=-kernel_size,
+                        mode='bilinear',
+                        align_corners=False)
+                else:
+                    layer = nn.ConvTranspose2d(
+                        in_channels,
+                        num_channels,
+                        -kernel_size,
+                        padding=kernel_size // 2)
+            protonets.append(layer)
+            protonets.append(nn.ReLU(inplace=True))
+            in_channels = num_channels if num_channels is not None \
+                else in_channels
+        if not self.include_last_relu:
+            protonets = protonets[:-1]
+        self.protonet = nn.Sequential(*protonets)
+
+    def forward(self, x: tuple, positive_infos: InstanceList) -> tuple:
+        """Forward feature from the upstream network to get prototypes and
+        linearly combine the prototypes, using masks coefficients, into
+        instance masks. Finally, crop the instance masks with given bboxes.
+
+        Args:
+            x (Tuple[Tensor]): Feature from the upstream network, which is
+                a 4D-tensor.
+            positive_infos (List[:obj:``InstanceData``]): Positive information
+                that calculate from detect head.
+
+        Returns:
+            tuple: Predicted instance segmentation masks and
+            semantic segmentation map.
+        """
+        # YOLACT used single feature map to get segmentation masks
+        single_x = x[0]
+
+        # YOLACT segmentation branch, if not training or segmentation branch
+        # is None, will not process the forward function.
+        if self.segm_branch is not None and self.training:
+            segm_preds = self.segm_branch(single_x)
+        else:
+            segm_preds = None
+        # YOLACT mask head
+        prototypes = self.protonet(single_x)
+        prototypes = prototypes.permute(0, 2, 3, 1).contiguous()
+
+        num_imgs = single_x.size(0)
+
+        mask_pred_list = []
+        for idx in range(num_imgs):
+            cur_prototypes = prototypes[idx]
+            pos_coeffs = positive_infos[idx].coeffs
+
+            # Linearly combine the prototypes with the mask coefficients
+            mask_preds = cur_prototypes @ pos_coeffs.t()
+            mask_preds = torch.sigmoid(mask_preds)
+            mask_pred_list.append(mask_preds)
+        return mask_pred_list, segm_preds
+
+    def loss_by_feat(self, mask_preds: List[Tensor], segm_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict], positive_infos: InstanceList,
+                     **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the mask head.
+
+        Args:
+            mask_preds (list[Tensor]): List of predicted prototypes, each has
+                shape (num_classes, H, W).
+            segm_preds (Tensor):  Predicted semantic segmentation map with
+                shape (N, num_classes, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``masks``,
+                and ``labels`` attributes.
+            batch_img_metas (list[dict]): Meta information of multiple images.
+            positive_infos (List[:obj:``InstanceData``]): Information of
+                positive samples of each image that are assigned in detection
+                head.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert positive_infos is not None, \
+            'positive_infos should not be None in `YOLACTProtonet`'
+        losses = dict()
+
+        # crop
+        croped_mask_pred = self.crop_mask_preds(mask_preds, batch_img_metas,
+                                                positive_infos)
+
+        loss_mask = []
+        loss_segm = []
+        num_imgs, _, mask_h, mask_w = segm_preds.size()
+        assert num_imgs == len(croped_mask_pred)
+        segm_avg_factor = num_imgs * mask_h * mask_w
+        total_pos = 0
+
+        if self.segm_branch is not None:
+            assert segm_preds is not None
+
+        for idx in range(num_imgs):
+            img_meta = batch_img_metas[idx]
+
+            (mask_preds, pos_mask_targets, segm_targets, num_pos,
+             gt_bboxes_for_reweight) = self._get_targets_single(
+                 croped_mask_pred[idx], segm_preds[idx],
+                 batch_gt_instances[idx], positive_infos[idx])
+
+            # segmentation loss
+            if self.with_seg_branch:
+                if segm_targets is None:
+                    loss = segm_preds[idx].sum() * 0.
+                else:
+                    loss = self.loss_segm(
+                        segm_preds[idx],
+                        segm_targets,
+                        avg_factor=segm_avg_factor)
+                loss_segm.append(loss)
+            # mask loss
+            total_pos += num_pos
+            if num_pos == 0 or pos_mask_targets is None:
+                loss = mask_preds.sum() * 0.
+            else:
+                mask_preds = torch.clamp(mask_preds, 0, 1)
+                loss = F.binary_cross_entropy(
+                    mask_preds, pos_mask_targets,
+                    reduction='none') * self.loss_mask_weight
+
+                h, w = img_meta['img_shape'][:2]
+                gt_bboxes_width = (gt_bboxes_for_reweight[:, 2] -
+                                   gt_bboxes_for_reweight[:, 0]) / w
+                gt_bboxes_height = (gt_bboxes_for_reweight[:, 3] -
+                                    gt_bboxes_for_reweight[:, 1]) / h
+                loss = loss.mean(dim=(1,
+                                      2)) / gt_bboxes_width / gt_bboxes_height
+                loss = torch.sum(loss)
+            loss_mask.append(loss)
+
+        if total_pos == 0:
+            total_pos += 1  # avoid nan
+        loss_mask = [x / total_pos for x in loss_mask]
+
+        losses.update(loss_mask=loss_mask)
+        if self.with_seg_branch:
+            losses.update(loss_segm=loss_segm)
+
+        return losses
+
+    def _get_targets_single(self, mask_preds: Tensor, segm_pred: Tensor,
+                            gt_instances: InstanceData,
+                            positive_info: InstanceData):
+        """Compute targets for predictions of single image.
+
+        Args:
+            mask_preds (Tensor): Predicted prototypes with shape
+                (num_classes, H, W).
+            segm_pred (Tensor): Predicted semantic segmentation map
+                with shape (num_classes, H, W).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes``, ``labels``,
+                and ``masks`` attributes.
+            positive_info (:obj:`InstanceData`): Information of positive
+                samples that are assigned in detection head. It usually
+                contains following keys.
+
+                    - pos_assigned_gt_inds (Tensor): Assigner GT indexes of
+                      positive proposals, has shape (num_pos, )
+                    - pos_inds (Tensor): Positive index of image, has
+                      shape (num_pos, ).
+                    - coeffs (Tensor): Positive mask coefficients
+                      with shape (num_pos, num_protos).
+                    - bboxes (Tensor): Positive bboxes with shape
+                      (num_pos, 4)
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+            - mask_preds (Tensor): Positive predicted mask with shape
+              (num_pos, mask_h, mask_w).
+            - pos_mask_targets (Tensor): Positive mask targets with shape
+              (num_pos, mask_h, mask_w).
+            - segm_targets (Tensor): Semantic segmentation targets with shape
+              (num_classes, segm_h, segm_w).
+            - num_pos (int): Positive numbers.
+            - gt_bboxes_for_reweight (Tensor): GT bboxes that match to the
+              positive priors has shape (num_pos, 4).
+        """
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        device = gt_bboxes.device
+        gt_masks = gt_instances.masks.to_tensor(
+            dtype=torch.bool, device=device).float()
+        if gt_masks.size(0) == 0:
+            return mask_preds, None, None, 0, None
+
+        # process with semantic segmentation targets
+        if segm_pred is not None:
+            num_classes, segm_h, segm_w = segm_pred.size()
+            with torch.no_grad():
+                downsampled_masks = F.interpolate(
+                    gt_masks.unsqueeze(0), (segm_h, segm_w),
+                    mode='bilinear',
+                    align_corners=False).squeeze(0)
+                downsampled_masks = downsampled_masks.gt(0.5).float()
+                segm_targets = torch.zeros_like(segm_pred, requires_grad=False)
+                for obj_idx in range(downsampled_masks.size(0)):
+                    segm_targets[gt_labels[obj_idx] - 1] = torch.max(
+                        segm_targets[gt_labels[obj_idx] - 1],
+                        downsampled_masks[obj_idx])
+        else:
+            segm_targets = None
+        # process with mask targets
+        pos_assigned_gt_inds = positive_info.pos_assigned_gt_inds
+        num_pos = pos_assigned_gt_inds.size(0)
+        # Since we're producing (near) full image masks,
+        # it'd take too much vram to backprop on every single mask.
+        # Thus we select only a subset.
+        if num_pos > self.max_masks_to_train:
+            perm = torch.randperm(num_pos)
+            select = perm[:self.max_masks_to_train]
+            mask_preds = mask_preds[select]
+            pos_assigned_gt_inds = pos_assigned_gt_inds[select]
+            num_pos = self.max_masks_to_train
+
+        gt_bboxes_for_reweight = gt_bboxes[pos_assigned_gt_inds]
+
+        mask_h, mask_w = mask_preds.shape[-2:]
+        gt_masks = F.interpolate(
+            gt_masks.unsqueeze(0), (mask_h, mask_w),
+            mode='bilinear',
+            align_corners=False).squeeze(0)
+        gt_masks = gt_masks.gt(0.5).float()
+        pos_mask_targets = gt_masks[pos_assigned_gt_inds]
+
+        return (mask_preds, pos_mask_targets, segm_targets, num_pos,
+                gt_bboxes_for_reweight)
+
+    def crop_mask_preds(self, mask_preds: List[Tensor],
+                        batch_img_metas: List[dict],
+                        positive_infos: InstanceList) -> list:
+        """Crop predicted masks by zeroing out everything not in the predicted
+        bbox.
+
+        Args:
+            mask_preds (list[Tensor]): Predicted prototypes with shape
+                (num_classes, H, W).
+            batch_img_metas (list[dict]): Meta information of multiple images.
+            positive_infos (List[:obj:``InstanceData``]): Positive
+                information that calculate from detect head.
+
+        Returns:
+            list: The cropped masks.
+        """
+        croped_mask_preds = []
+        for img_meta, mask_preds, cur_info in zip(batch_img_metas, mask_preds,
+                                                  positive_infos):
+            bboxes_for_cropping = copy.deepcopy(cur_info.bboxes)
+            h, w = img_meta['img_shape'][:2]
+            bboxes_for_cropping[:, 0::2] /= w
+            bboxes_for_cropping[:, 1::2] /= h
+            mask_preds = self.crop_single(mask_preds, bboxes_for_cropping)
+            mask_preds = mask_preds.permute(2, 0, 1).contiguous()
+            croped_mask_preds.append(mask_preds)
+        return croped_mask_preds
+
+    def crop_single(self,
+                    masks: Tensor,
+                    boxes: Tensor,
+                    padding: int = 1) -> Tensor:
+        """Crop single predicted masks by zeroing out everything not in the
+        predicted bbox.
+
+        Args:
+            masks (Tensor): Predicted prototypes, has shape [H, W, N].
+            boxes (Tensor): Bbox coords in relative point form with
+                shape [N, 4].
+            padding (int): Image padding size.
+
+        Return:
+            Tensor: The cropped masks.
+        """
+        h, w, n = masks.size()
+        x1, x2 = self.sanitize_coordinates(
+            boxes[:, 0], boxes[:, 2], w, padding, cast=False)
+        y1, y2 = self.sanitize_coordinates(
+            boxes[:, 1], boxes[:, 3], h, padding, cast=False)
+
+        rows = torch.arange(
+            w, device=masks.device, dtype=x1.dtype).view(1, -1,
+                                                         1).expand(h, w, n)
+        cols = torch.arange(
+            h, device=masks.device, dtype=x1.dtype).view(-1, 1,
+                                                         1).expand(h, w, n)
+
+        masks_left = rows >= x1.view(1, 1, -1)
+        masks_right = rows < x2.view(1, 1, -1)
+        masks_up = cols >= y1.view(1, 1, -1)
+        masks_down = cols < y2.view(1, 1, -1)
+
+        crop_mask = masks_left * masks_right * masks_up * masks_down
+
+        return masks * crop_mask.float()
+
+    def sanitize_coordinates(self,
+                             x1: Tensor,
+                             x2: Tensor,
+                             img_size: int,
+                             padding: int = 0,
+                             cast: bool = True) -> tuple:
+        """Sanitizes the input coordinates so that x1 < x2, x1 != x2, x1 >= 0,
+        and x2 <= image_size. Also converts from relative to absolute
+        coordinates and casts the results to long tensors.
+
+        Warning: this does things in-place behind the scenes so
+        copy if necessary.
+
+        Args:
+            x1 (Tensor): shape (N, ).
+            x2 (Tensor): shape (N, ).
+            img_size (int): Size of the input image.
+            padding (int): x1 >= padding, x2 <= image_size-padding.
+            cast (bool): If cast is false, the result won't be cast to longs.
+
+        Returns:
+            tuple:
+
+            - x1 (Tensor): Sanitized _x1.
+            - x2 (Tensor): Sanitized _x2.
+        """
+        x1 = x1 * img_size
+        x2 = x2 * img_size
+        if cast:
+            x1 = x1.long()
+            x2 = x2.long()
+        x1 = torch.min(x1, x2)
+        x2 = torch.max(x1, x2)
+        x1 = torch.clamp(x1 - padding, min=0)
+        x2 = torch.clamp(x2 + padding, max=img_size)
+        return x1, x2
+
+    def predict_by_feat(self,
+                        mask_preds: List[Tensor],
+                        segm_preds: Tensor,
+                        results_list: InstanceList,
+                        batch_img_metas: List[dict],
+                        rescale: bool = True,
+                        **kwargs) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        mask results.
+
+        Args:
+            mask_preds (list[Tensor]): Predicted prototypes with shape
+                (num_classes, H, W).
+            results_list (List[:obj:``InstanceData``]): BBoxHead results.
+            batch_img_metas (list[dict]): Meta information of all images.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Processed results of multiple
+            images.Each :obj:`InstanceData` usually contains
+            following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        assert len(mask_preds) == len(results_list) == len(batch_img_metas)
+
+        croped_mask_pred = self.crop_mask_preds(mask_preds, batch_img_metas,
+                                                results_list)
+
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            results = results_list[img_id]
+            bboxes = results.bboxes
+            mask_preds = croped_mask_pred[img_id]
+            if bboxes.shape[0] == 0 or mask_preds.shape[0] == 0:
+                results_list[img_id] = empty_instances(
+                    [img_meta],
+                    bboxes.device,
+                    task_type='mask',
+                    instance_results=[results])[0]
+            else:
+                im_mask = self._predict_by_feat_single(
+                    mask_preds=croped_mask_pred[img_id],
+                    bboxes=bboxes,
+                    img_meta=img_meta,
+                    rescale=rescale)
+                results.masks = im_mask
+        return results_list
+
+    def _predict_by_feat_single(self,
+                                mask_preds: Tensor,
+                                bboxes: Tensor,
+                                img_meta: dict,
+                                rescale: bool,
+                                cfg: OptConfigType = None):
+        """Transform a single image's features extracted from the head into
+        mask results.
+
+        Args:
+            mask_preds (Tensor): Predicted prototypes, has shape [H, W, N].
+            bboxes (Tensor): Bbox coords in relative point form with
+                shape [N, 4].
+            img_meta (dict): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If rescale is False, then returned masks will
+                fit the scale of imgs[0].
+            cfg (dict, optional): Config used in test phase.
+                Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Processed results of single image.
+             it usually contains following keys.
+
+                - scores (Tensor): Classification scores, has shape
+                  (num_instance,).
+                - labels (Tensor): Has shape (num_instances,).
+                - masks (Tensor): Processed mask results, has
+                  shape (num_instances, h, w).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat(
+            (1, 2))
+        img_h, img_w = img_meta['ori_shape'][:2]
+        if rescale:  # in-placed rescale the bboxes
+            scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat(
+                (1, 2))
+            bboxes /= scale_factor
+        else:
+            w_scale, h_scale = scale_factor[0, 0], scale_factor[0, 1]
+            img_h = np.round(img_h * h_scale.item()).astype(np.int32)
+            img_w = np.round(img_w * w_scale.item()).astype(np.int32)
+
+        masks = F.interpolate(
+            mask_preds.unsqueeze(0), (img_h, img_w),
+            mode='bilinear',
+            align_corners=False).squeeze(0) > cfg.mask_thr
+
+        if cfg.mask_thr_binary < 0:
+            # for visualization and debugging
+            masks = (masks * 255).to(dtype=torch.uint8)
+
+        return masks
+
+
+class SegmentationModule(BaseModule):
+    """YOLACT segmentation branch used in <https://arxiv.org/abs/1904.02689>`_
+
+    In mmdet v2.x `segm_loss` is calculated in YOLACTSegmHead, while in
+    mmdet v3.x `SegmentationModule` is used to obtain the predicted semantic
+    segmentation map and `segm_loss` is calculated in YOLACTProtonet.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        in_channels: int = 256,
+        init_cfg: ConfigType = dict(
+            type='Xavier',
+            distribution='uniform',
+            override=dict(name='segm_conv'))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.segm_conv = nn.Conv2d(
+            self.in_channels, self.num_classes, kernel_size=1)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward feature from the upstream network.
+
+        Args:
+            x (Tensor): Feature from the upstream network, which is
+                a 4D-tensor.
+
+        Returns:
+            Tensor: Predicted semantic segmentation map with shape
+                (N, num_classes, H, W).
+        """
+        return self.segm_conv(x)
+
+
+class InterpolateModule(BaseModule):
+    """This is a module version of F.interpolate.
+
+    Any arguments you give it just get passed along for the ride.
+    """
+
+    def __init__(self, *args, init_cfg=None, **kwargs) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.args = args
+        self.kwargs = kwargs
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward features from the upstream network.
+
+        Args:
+            x (Tensor): Feature from the upstream network, which is
+                a 4D-tensor.
+
+        Returns:
+            Tensor: A 4D-tensor feature map.
+        """
+        return F.interpolate(x, *self.args, **self.kwargs)
diff --git a/mmde/mmdet/models/dense_heads/yolo_head.py b/mmde/mmdet/models/dense_heads/yolo_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f63afbbc94353e16e4c67ec5bc0b6cd1200de07
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/yolo_head.py
@@ -0,0 +1,527 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+
+import copy
+import warnings
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, is_norm
+from mmengine.model import bias_init_with_prob, constant_init, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList)
+from ..task_modules.samplers import PseudoSampler
+from ..utils import filter_scores_and_topk, images_to_levels, multi_apply
+from .base_dense_head import BaseDenseHead
+
+
+@MODELS.register_module()
+class YOLOV3Head(BaseDenseHead):
+    """YOLOV3Head Paper link: https://arxiv.org/abs/1804.02767.
+
+    Args:
+        num_classes (int): The number of object classes (w/o background)
+        in_channels (Sequence[int]): Number of input channels per scale.
+        out_channels (Sequence[int]): The number of output channels per scale
+            before the final 1x1 layer. Default: (1024, 512, 256).
+        anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor
+            generator.
+        bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder.
+        featmap_strides (Sequence[int]): The stride of each scale.
+            Should be in descending order. Defaults to (32, 16, 8).
+        one_hot_smoother (float): Set a non-zero value to enable label-smooth
+            Defaults to 0.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
+            config norm layer. Defaults to dict(type='BN', requires_grad=True).
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='LeakyReLU', negative_slope=0.1).
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_conf (:obj:`ConfigDict` or dict): Config of confidence loss.
+        loss_xy (:obj:`ConfigDict` or dict): Config of xy coordinate loss.
+        loss_wh (:obj:`ConfigDict` or dict): Config of wh coordinate loss.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
+            YOLOV3 head. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            YOLOV3 head. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: Sequence[int],
+                 out_channels: Sequence[int] = (1024, 512, 256),
+                 anchor_generator: ConfigType = dict(
+                     type='YOLOAnchorGenerator',
+                     base_sizes=[[(116, 90), (156, 198), (373, 326)],
+                                 [(30, 61), (62, 45), (59, 119)],
+                                 [(10, 13), (16, 30), (33, 23)]],
+                     strides=[32, 16, 8]),
+                 bbox_coder: ConfigType = dict(type='YOLOBBoxCoder'),
+                 featmap_strides: Sequence[int] = (32, 16, 8),
+                 one_hot_smoother: float = 0.,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
+                 act_cfg: ConfigType = dict(
+                     type='LeakyReLU', negative_slope=0.1),
+                 loss_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_conf: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_xy: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_wh: ConfigType = dict(type='MSELoss', loss_weight=1.0),
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None) -> None:
+        super().__init__(init_cfg=None)
+        # Check params
+        assert (len(in_channels) == len(out_channels) == len(featmap_strides))
+
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.featmap_strides = featmap_strides
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            if train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], context=self)
+            else:
+                self.sampler = PseudoSampler()
+
+        self.one_hot_smoother = one_hot_smoother
+
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+
+        self.prior_generator = TASK_UTILS.build(anchor_generator)
+
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_conf = MODELS.build(loss_conf)
+        self.loss_xy = MODELS.build(loss_xy)
+        self.loss_wh = MODELS.build(loss_wh)
+
+        self.num_base_priors = self.prior_generator.num_base_priors[0]
+        assert len(
+            self.prior_generator.num_base_priors) == len(featmap_strides)
+        self._init_layers()
+
+    @property
+    def num_levels(self) -> int:
+        """int: number of feature map levels"""
+        return len(self.featmap_strides)
+
+    @property
+    def num_attrib(self) -> int:
+        """int: number of attributes in pred_map, bboxes (4) +
+        objectness (1) + num_classes"""
+
+        return 5 + self.num_classes
+
+    def _init_layers(self) -> None:
+        """initialize conv layers in YOLOv3 head."""
+        self.convs_bridge = nn.ModuleList()
+        self.convs_pred = nn.ModuleList()
+        for i in range(self.num_levels):
+            conv_bridge = ConvModule(
+                self.in_channels[i],
+                self.out_channels[i],
+                3,
+                padding=1,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                act_cfg=self.act_cfg)
+            conv_pred = nn.Conv2d(self.out_channels[i],
+                                  self.num_base_priors * self.num_attrib, 1)
+
+            self.convs_bridge.append(conv_bridge)
+            self.convs_pred.append(conv_pred)
+
+    def init_weights(self) -> None:
+        """initialize weights."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, mean=0, std=0.01)
+            if is_norm(m):
+                constant_init(m, 1)
+
+        # Use prior in model initialization to improve stability
+        for conv_pred, stride in zip(self.convs_pred, self.featmap_strides):
+            bias = conv_pred.bias.reshape(self.num_base_priors, -1)
+            # init objectness with prior of 8 objects per feature map
+            # refer to https://github.com/ultralytics/yolov3
+            nn.init.constant_(bias.data[:, 4],
+                              bias_init_with_prob(8 / (608 / stride)**2))
+            nn.init.constant_(bias.data[:, 5:], bias_init_with_prob(0.01))
+
+    def forward(self, x: Tuple[Tensor, ...]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple[Tensor]: A tuple of multi-level predication map, each is a
+                4D-tensor of shape (batch_size, 5+num_classes, height, width).
+        """
+
+        assert len(x) == self.num_levels
+        pred_maps = []
+        for i in range(self.num_levels):
+            feat = x[i]
+            feat = self.convs_bridge[i](feat)
+            pred_map = self.convs_pred[i](feat)
+            pred_maps.append(pred_map)
+
+        return tuple(pred_maps),
+
+    def predict_by_feat(self,
+                        pred_maps: Sequence[Tensor],
+                        batch_img_metas: Optional[List[dict]],
+                        cfg: OptConfigType = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results. It has been accelerated since PR #5991.
+
+        Args:
+            pred_maps (Sequence[Tensor]): Raw predictions for a batch of
+                images.
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            cfg (:obj:`ConfigDict` or dict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(pred_maps) == self.num_levels
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+
+        num_imgs = len(batch_img_metas)
+        featmap_sizes = [pred_map.shape[-2:] for pred_map in pred_maps]
+
+        mlvl_anchors = self.prior_generator.grid_priors(
+            featmap_sizes, device=pred_maps[0].device)
+        flatten_preds = []
+        flatten_strides = []
+        for pred, stride in zip(pred_maps, self.featmap_strides):
+            pred = pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                    self.num_attrib)
+            pred[..., :2].sigmoid_()
+            flatten_preds.append(pred)
+            flatten_strides.append(
+                pred.new_tensor(stride).expand(pred.size(1)))
+
+        flatten_preds = torch.cat(flatten_preds, dim=1)
+        flatten_bbox_preds = flatten_preds[..., :4]
+        flatten_objectness = flatten_preds[..., 4].sigmoid()
+        flatten_cls_scores = flatten_preds[..., 5:].sigmoid()
+        flatten_anchors = torch.cat(mlvl_anchors)
+        flatten_strides = torch.cat(flatten_strides)
+        flatten_bboxes = self.bbox_coder.decode(flatten_anchors,
+                                                flatten_bbox_preds,
+                                                flatten_strides.unsqueeze(-1))
+        results_list = []
+        for (bboxes, scores, objectness,
+             img_meta) in zip(flatten_bboxes, flatten_cls_scores,
+                              flatten_objectness, batch_img_metas):
+            # Filtering out all predictions with conf < conf_thr
+            conf_thr = cfg.get('conf_thr', -1)
+            if conf_thr > 0:
+                conf_inds = objectness >= conf_thr
+                bboxes = bboxes[conf_inds, :]
+                scores = scores[conf_inds, :]
+                objectness = objectness[conf_inds]
+
+            score_thr = cfg.get('score_thr', 0)
+            nms_pre = cfg.get('nms_pre', -1)
+            scores, labels, keep_idxs, _ = filter_scores_and_topk(
+                scores, score_thr, nms_pre)
+
+            results = InstanceData(
+                scores=scores,
+                labels=labels,
+                bboxes=bboxes[keep_idxs],
+                score_factors=objectness[keep_idxs],
+            )
+            results = self._bbox_post_process(
+                results=results,
+                cfg=cfg,
+                rescale=rescale,
+                with_nms=with_nms,
+                img_meta=img_meta)
+            results_list.append(results)
+        return results_list
+
+    def loss_by_feat(
+            self,
+            pred_maps: Sequence[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            pred_maps (list[Tensor]): Prediction map for each scale level,
+                shape (N, num_anchors * num_attrib, H, W)
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        num_imgs = len(batch_img_metas)
+        device = pred_maps[0][0].device
+
+        featmap_sizes = [
+            pred_maps[i].shape[-2:] for i in range(self.num_levels)
+        ]
+        mlvl_anchors = self.prior_generator.grid_priors(
+            featmap_sizes, device=device)
+        anchor_list = [mlvl_anchors for _ in range(num_imgs)]
+
+        responsible_flag_list = []
+        for img_id in range(num_imgs):
+            responsible_flag_list.append(
+                self.responsible_flags(featmap_sizes,
+                                       batch_gt_instances[img_id].bboxes,
+                                       device))
+
+        target_maps_list, neg_maps_list = self.get_targets(
+            anchor_list, responsible_flag_list, batch_gt_instances)
+
+        losses_cls, losses_conf, losses_xy, losses_wh = multi_apply(
+            self.loss_by_feat_single, pred_maps, target_maps_list,
+            neg_maps_list)
+
+        return dict(
+            loss_cls=losses_cls,
+            loss_conf=losses_conf,
+            loss_xy=losses_xy,
+            loss_wh=losses_wh)
+
+    def loss_by_feat_single(self, pred_map: Tensor, target_map: Tensor,
+                            neg_map: Tensor) -> tuple:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            pred_map (Tensor): Raw predictions for a single level.
+            target_map (Tensor): The Ground-Truth target for a single level.
+            neg_map (Tensor): The negative masks for a single level.
+
+        Returns:
+            tuple:
+                loss_cls (Tensor): Classification loss.
+                loss_conf (Tensor): Confidence loss.
+                loss_xy (Tensor): Regression loss of x, y coordinate.
+                loss_wh (Tensor): Regression loss of w, h coordinate.
+        """
+
+        num_imgs = len(pred_map)
+        pred_map = pred_map.permute(0, 2, 3,
+                                    1).reshape(num_imgs, -1, self.num_attrib)
+        neg_mask = neg_map.float()
+        pos_mask = target_map[..., 4]
+        pos_and_neg_mask = neg_mask + pos_mask
+        pos_mask = pos_mask.unsqueeze(dim=-1)
+        if torch.max(pos_and_neg_mask) > 1.:
+            warnings.warn('There is overlap between pos and neg sample.')
+            pos_and_neg_mask = pos_and_neg_mask.clamp(min=0., max=1.)
+
+        pred_xy = pred_map[..., :2]
+        pred_wh = pred_map[..., 2:4]
+        pred_conf = pred_map[..., 4]
+        pred_label = pred_map[..., 5:]
+
+        target_xy = target_map[..., :2]
+        target_wh = target_map[..., 2:4]
+        target_conf = target_map[..., 4]
+        target_label = target_map[..., 5:]
+
+        loss_cls = self.loss_cls(pred_label, target_label, weight=pos_mask)
+        loss_conf = self.loss_conf(
+            pred_conf, target_conf, weight=pos_and_neg_mask)
+        loss_xy = self.loss_xy(pred_xy, target_xy, weight=pos_mask)
+        loss_wh = self.loss_wh(pred_wh, target_wh, weight=pos_mask)
+
+        return loss_cls, loss_conf, loss_xy, loss_wh
+
+    def get_targets(self, anchor_list: List[List[Tensor]],
+                    responsible_flag_list: List[List[Tensor]],
+                    batch_gt_instances: List[InstanceData]) -> tuple:
+        """Compute target maps for anchors in multiple images.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_total_anchors, 4).
+            responsible_flag_list (list[list[Tensor]]): Multi level responsible
+                flags of each image. Each element is a tensor of shape
+                (num_total_anchors, )
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+                - target_map_list (list[Tensor]): Target map of each level.
+                - neg_map_list (list[Tensor]): Negative map of each level.
+        """
+        num_imgs = len(anchor_list)
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+
+        results = multi_apply(self._get_targets_single, anchor_list,
+                              responsible_flag_list, batch_gt_instances)
+
+        all_target_maps, all_neg_maps = results
+        assert num_imgs == len(all_target_maps) == len(all_neg_maps)
+        target_maps_list = images_to_levels(all_target_maps, num_level_anchors)
+        neg_maps_list = images_to_levels(all_neg_maps, num_level_anchors)
+
+        return target_maps_list, neg_maps_list
+
+    def _get_targets_single(self, anchors: List[Tensor],
+                            responsible_flags: List[Tensor],
+                            gt_instances: InstanceData) -> tuple:
+        """Generate matching bounding box prior and converted GT.
+
+        Args:
+            anchors (List[Tensor]): Multi-level anchors of the image.
+            responsible_flags (List[Tensor]): Multi-level responsible flags of
+                anchors
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            tuple:
+                target_map (Tensor): Predication target map of each
+                    scale level, shape (num_total_anchors,
+                    5+num_classes)
+                neg_map (Tensor): Negative map of each scale level,
+                    shape (num_total_anchors,)
+        """
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        anchor_strides = []
+        for i in range(len(anchors)):
+            anchor_strides.append(
+                torch.tensor(self.featmap_strides[i],
+                             device=gt_bboxes.device).repeat(len(anchors[i])))
+        concat_anchors = torch.cat(anchors)
+        concat_responsible_flags = torch.cat(responsible_flags)
+
+        anchor_strides = torch.cat(anchor_strides)
+        assert len(anchor_strides) == len(concat_anchors) == \
+               len(concat_responsible_flags)
+        pred_instances = InstanceData(
+            priors=concat_anchors, responsible_flags=concat_responsible_flags)
+
+        assign_result = self.assigner.assign(pred_instances, gt_instances)
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        target_map = concat_anchors.new_zeros(
+            concat_anchors.size(0), self.num_attrib)
+
+        target_map[sampling_result.pos_inds, :4] = self.bbox_coder.encode(
+            sampling_result.pos_priors, sampling_result.pos_gt_bboxes,
+            anchor_strides[sampling_result.pos_inds])
+
+        target_map[sampling_result.pos_inds, 4] = 1
+
+        gt_labels_one_hot = F.one_hot(
+            gt_labels, num_classes=self.num_classes).float()
+        if self.one_hot_smoother != 0:  # label smooth
+            gt_labels_one_hot = gt_labels_one_hot * (
+                1 - self.one_hot_smoother
+            ) + self.one_hot_smoother / self.num_classes
+        target_map[sampling_result.pos_inds, 5:] = gt_labels_one_hot[
+            sampling_result.pos_assigned_gt_inds]
+
+        neg_map = concat_anchors.new_zeros(
+            concat_anchors.size(0), dtype=torch.uint8)
+        neg_map[sampling_result.neg_inds] = 1
+
+        return target_map, neg_map
+
+    def responsible_flags(self, featmap_sizes: List[tuple], gt_bboxes: Tensor,
+                          device: str) -> List[Tensor]:
+        """Generate responsible anchor flags of grid cells in multiple scales.
+
+        Args:
+            featmap_sizes (List[tuple]): List of feature map sizes in multiple
+                feature levels.
+            gt_bboxes (Tensor): Ground truth boxes, shape (n, 4).
+            device (str): Device where the anchors will be put on.
+
+        Return:
+            List[Tensor]: responsible flags of anchors in multiple level
+        """
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_responsible_flags = []
+        for i in range(self.num_levels):
+            anchor_stride = self.prior_generator.strides[i]
+            feat_h, feat_w = featmap_sizes[i]
+            gt_cx = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5).to(device)
+            gt_cy = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5).to(device)
+            gt_grid_x = torch.floor(gt_cx / anchor_stride[0]).long()
+            gt_grid_y = torch.floor(gt_cy / anchor_stride[1]).long()
+            # row major indexing
+            gt_bboxes_grid_idx = gt_grid_y * feat_w + gt_grid_x
+
+            responsible_grid = torch.zeros(
+                feat_h * feat_w, dtype=torch.uint8, device=device)
+            responsible_grid[gt_bboxes_grid_idx] = 1
+
+            responsible_grid = responsible_grid[:, None].expand(
+                responsible_grid.size(0),
+                self.prior_generator.num_base_priors[i]).contiguous().view(-1)
+
+            multi_level_responsible_flags.append(responsible_grid)
+        return multi_level_responsible_flags
diff --git a/mmde/mmdet/models/dense_heads/yolof_head.py b/mmde/mmdet/models/dense_heads/yolof_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e5e6b7a92861bcd2ba3824df1f94270ba51160
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/yolof_head.py
@@ -0,0 +1,399 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, is_norm
+from mmengine.model import bias_init_with_prob, constant_init, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean
+from ..task_modules.prior_generators import anchor_inside_flags
+from ..utils import levels_to_images, multi_apply, unmap
+from .anchor_head import AnchorHead
+
+INF = 1e8
+
+
+@MODELS.register_module()
+class YOLOFHead(AnchorHead):
+    """Detection Head of `YOLOF <https://arxiv.org/abs/2103.09460>`_
+
+    Args:
+        num_classes (int): The number of object classes (w/o background)
+        in_channels (list[int]): The number of input channels per scale.
+        cls_num_convs (int): The number of convolutions of cls branch.
+           Defaults to 2.
+        reg_num_convs (int): The number of convolutions of reg branch.
+           Defaults to 4.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to ``dict(type='BN', requires_grad=True)``.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: List[int],
+                 num_cls_convs: int = 2,
+                 num_reg_convs: int = 4,
+                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
+                 **kwargs) -> None:
+        self.num_cls_convs = num_cls_convs
+        self.num_reg_convs = num_reg_convs
+        self.norm_cfg = norm_cfg
+        super().__init__(
+            num_classes=num_classes, in_channels=in_channels, **kwargs)
+
+    def _init_layers(self) -> None:
+        cls_subnet = []
+        bbox_subnet = []
+        for i in range(self.num_cls_convs):
+            cls_subnet.append(
+                ConvModule(
+                    self.in_channels,
+                    self.in_channels,
+                    kernel_size=3,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+        for i in range(self.num_reg_convs):
+            bbox_subnet.append(
+                ConvModule(
+                    self.in_channels,
+                    self.in_channels,
+                    kernel_size=3,
+                    padding=1,
+                    norm_cfg=self.norm_cfg))
+        self.cls_subnet = nn.Sequential(*cls_subnet)
+        self.bbox_subnet = nn.Sequential(*bbox_subnet)
+        self.cls_score = nn.Conv2d(
+            self.in_channels,
+            self.num_base_priors * self.num_classes,
+            kernel_size=3,
+            stride=1,
+            padding=1)
+        self.bbox_pred = nn.Conv2d(
+            self.in_channels,
+            self.num_base_priors * 4,
+            kernel_size=3,
+            stride=1,
+            padding=1)
+        self.object_pred = nn.Conv2d(
+            self.in_channels,
+            self.num_base_priors,
+            kernel_size=3,
+            stride=1,
+            padding=1)
+
+    def init_weights(self) -> None:
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, mean=0, std=0.01)
+            if is_norm(m):
+                constant_init(m, 1)
+
+        # Use prior in model initialization to improve stability
+        bias_cls = bias_init_with_prob(0.01)
+        torch.nn.init.constant_(self.cls_score.bias, bias_cls)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        """Forward feature of a single scale level.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+
+        Returns:
+            tuple:
+                normalized_cls_score (Tensor): Normalized Cls scores for a \
+                    single scale level, the channels number is \
+                    num_base_priors * num_classes.
+                bbox_reg (Tensor): Box energies / deltas for a single scale \
+                    level, the channels number is num_base_priors * 4.
+        """
+        cls_score = self.cls_score(self.cls_subnet(x))
+        N, _, H, W = cls_score.shape
+        cls_score = cls_score.view(N, -1, self.num_classes, H, W)
+
+        reg_feat = self.bbox_subnet(x)
+        bbox_reg = self.bbox_pred(reg_feat)
+        objectness = self.object_pred(reg_feat)
+
+        # implicit objectness
+        objectness = objectness.view(N, -1, 1, H, W)
+        normalized_cls_score = cls_score + objectness - torch.log(
+            1. + torch.clamp(cls_score.exp(), max=INF) +
+            torch.clamp(objectness.exp(), max=INF))
+        normalized_cls_score = normalized_cls_score.view(N, -1, H, W)
+        return normalized_cls_score, bbox_reg
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        assert len(cls_scores) == 1
+        assert self.prior_generator.num_levels == 1
+
+        device = cls_scores[0].device
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        # The output level is always 1
+        anchor_list = [anchors[0] for anchors in anchor_list]
+        valid_flag_list = [valid_flags[0] for valid_flags in valid_flag_list]
+
+        cls_scores_list = levels_to_images(cls_scores)
+        bbox_preds_list = levels_to_images(bbox_preds)
+
+        cls_reg_targets = self.get_targets(
+            cls_scores_list,
+            bbox_preds_list,
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        if cls_reg_targets is None:
+            return None
+        (batch_labels, batch_label_weights, avg_factor, batch_bbox_weights,
+         batch_pos_predicted_boxes, batch_target_boxes) = cls_reg_targets
+
+        flatten_labels = batch_labels.reshape(-1)
+        batch_label_weights = batch_label_weights.reshape(-1)
+        cls_score = cls_scores[0].permute(0, 2, 3,
+                                          1).reshape(-1, self.cls_out_channels)
+
+        avg_factor = reduce_mean(
+            torch.tensor(avg_factor, dtype=torch.float, device=device)).item()
+
+        # classification loss
+        loss_cls = self.loss_cls(
+            cls_score,
+            flatten_labels,
+            batch_label_weights,
+            avg_factor=avg_factor)
+
+        # regression loss
+        if batch_pos_predicted_boxes.shape[0] == 0:
+            # no pos sample
+            loss_bbox = batch_pos_predicted_boxes.sum() * 0
+        else:
+            loss_bbox = self.loss_bbox(
+                batch_pos_predicted_boxes,
+                batch_target_boxes,
+                batch_bbox_weights.float(),
+                avg_factor=avg_factor)
+
+        return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
+
+    def get_targets(self,
+                    cls_scores_list: List[Tensor],
+                    bbox_preds_list: List[Tensor],
+                    anchor_list: List[Tensor],
+                    valid_flag_list: List[Tensor],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs: bool = True):
+        """Compute regression and classification targets for anchors in
+        multiple images.
+
+        Args:
+            cls_scores_list (list[Tensor]): Classification scores of
+                each image. each is a 4D-tensor, the shape is
+                (h * w, num_anchors * num_classes).
+            bbox_preds_list (list[Tensor]): Bbox preds of each image.
+                each is a 4D-tensor, the shape is (h * w, num_anchors * 4).
+            anchor_list (list[Tensor]): Anchors of each image. Each element of
+                is a tensor of shape (h * w * num_anchors, 4).
+            valid_flag_list (list[Tensor]): Valid flags of each image. Each
+               element of is a tensor of shape (h * w * num_anchors, )
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+                - batch_labels (Tensor): Label of all images. Each element \
+                    of is a tensor of shape (batch, h * w * num_anchors)
+                - batch_label_weights (Tensor): Label weights of all images \
+                    of is a tensor of shape (batch, h * w * num_anchors)
+                - num_total_pos (int): Number of positive samples in all \
+                    images.
+                - num_total_neg (int): Number of negative samples in all \
+                    images.
+            additional_returns: This function enables user-defined returns from
+                `self._get_targets_single`. These returns are currently refined
+                to properties at each feature map (i.e. having HxW dimension).
+                The results will be concatenated after the end
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+        results = multi_apply(
+            self._get_targets_single,
+            bbox_preds_list,
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore,
+            unmap_outputs=unmap_outputs)
+        (all_labels, all_label_weights, pos_inds, neg_inds,
+         sampling_results_list) = results[:5]
+        # Get `avg_factor` of all images, which calculate in `SamplingResult`.
+        # When using sampling method, avg_factor is usually the sum of
+        # positive and negative priors. When using `PseudoSampler`,
+        # `avg_factor` is usually equal to the number of positive priors.
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        rest_results = list(results[5:])  # user-added return values
+
+        batch_labels = torch.stack(all_labels, 0)
+        batch_label_weights = torch.stack(all_label_weights, 0)
+
+        res = (batch_labels, batch_label_weights, avg_factor)
+        for i, rests in enumerate(rest_results):  # user-added return values
+            rest_results[i] = torch.cat(rests, 0)
+
+        return res + tuple(rest_results)
+
+    def _get_targets_single(self,
+                            bbox_preds: Tensor,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Args:
+            bbox_preds (Tensor): Bbox prediction of the image, which
+                shape is (h * w ,4)
+            flat_anchors (Tensor): Anchors of the image, which shape is
+                (h * w * num_anchors ,4)
+            valid_flags (Tensor): Valid flags of the image, which shape is
+                (h * w * num_anchors,).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple:
+                labels (Tensor): Labels of image, which shape is
+                    (h * w * num_anchors, ).
+                label_weights (Tensor): Label weights of image, which shape is
+                    (h * w * num_anchors, ).
+                pos_inds (Tensor): Pos index of image.
+                neg_inds (Tensor): Neg index of image.
+                sampling_result (obj:`SamplingResult`): Sampling result.
+                pos_bbox_weights (Tensor): The Weight of using to calculate
+                    the bbox branch loss, which shape is (num, ).
+                pos_predicted_boxes (Tensor): boxes predicted value of
+                    using to calculate the bbox branch loss, which shape is
+                    (num, 4).
+                pos_target_boxes (Tensor): boxes target value of
+                    using to calculate the bbox branch loss, which shape is
+                    (num, 4).
+        """
+        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                           img_meta['img_shape'][:2],
+                                           self.train_cfg['allowed_border'])
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+
+        # assign gt and sample anchors
+        anchors = flat_anchors[inside_flags, :]
+        bbox_preds = bbox_preds.reshape(-1, 4)
+        bbox_preds = bbox_preds[inside_flags, :]
+
+        # decoded bbox
+        decoder_bbox_preds = self.bbox_coder.decode(anchors, bbox_preds)
+        pred_instances = InstanceData(
+            priors=anchors, decoder_priors=decoder_bbox_preds)
+        assign_result = self.assigner.assign(pred_instances, gt_instances,
+                                             gt_instances_ignore)
+
+        pos_bbox_weights = assign_result.get_extra_property('pos_idx')
+        pos_predicted_boxes = assign_result.get_extra_property(
+            'pos_predicted_boxes')
+        pos_target_boxes = assign_result.get_extra_property('target_boxes')
+
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+        num_valid_anchors = anchors.shape[0]
+        labels = anchors.new_full((num_valid_anchors, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_anchors.size(0)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags,
+                fill=self.num_classes)  # fill bg label
+            label_weights = unmap(label_weights, num_total_anchors,
+                                  inside_flags)
+
+        return (labels, label_weights, pos_inds, neg_inds, sampling_result,
+                pos_bbox_weights, pos_predicted_boxes, pos_target_boxes)
diff --git a/mmde/mmdet/models/dense_heads/yolox_head.py b/mmde/mmdet/models/dense_heads/yolox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..00fe1e42766e4ca0052cf31d2e940dfab73fb200
--- /dev/null
+++ b/mmde/mmdet/models/dense_heads/yolox_head.py
@@ -0,0 +1,618 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import List, Optional, Sequence, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmcv.ops.nms import batched_nms
+from mmengine.config import ConfigDict
+from mmengine.model import bias_init_with_prob
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures.bbox import bbox_xyxy_to_cxcywh
+from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList,
+                         OptMultiConfig, reduce_mean)
+from ..task_modules.prior_generators import MlvlPointGenerator
+from ..task_modules.samplers import PseudoSampler
+from ..utils import multi_apply
+from .base_dense_head import BaseDenseHead
+
+
+@MODELS.register_module()
+class YOLOXHead(BaseDenseHead):
+    """YOLOXHead head used in `YOLOX <https://arxiv.org/abs/2107.08430>`_.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels in stacking convs.
+            Defaults to 256
+        stacked_convs (int): Number of stacking convs of the head.
+            Defaults to (8, 16, 32).
+        strides (Sequence[int]): Downsample factor of each feature map.
+             Defaults to None.
+        use_depthwise (bool): Whether to depthwise separable convolution in
+            blocks. Defaults to False.
+        dcn_on_last_conv (bool): If true, use dcn in the last layer of
+            towers. Defaults to False.
+        conv_bias (bool or str): If specified as `auto`, it will be decided by
+            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
+            None, otherwise False. Defaults to "auto".
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to None.
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+        loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss.
+        loss_l1 (:obj:`ConfigDict` or dict): Config of L1 loss.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
+            anchor head. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            anchor head. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        in_channels: int,
+        feat_channels: int = 256,
+        stacked_convs: int = 2,
+        strides: Sequence[int] = (8, 16, 32),
+        use_depthwise: bool = False,
+        dcn_on_last_conv: bool = False,
+        conv_bias: Union[bool, str] = 'auto',
+        conv_cfg: OptConfigType = None,
+        norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
+        act_cfg: ConfigType = dict(type='Swish'),
+        loss_cls: ConfigType = dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            loss_weight=1.0),
+        loss_bbox: ConfigType = dict(
+            type='IoULoss',
+            mode='square',
+            eps=1e-16,
+            reduction='sum',
+            loss_weight=5.0),
+        loss_obj: ConfigType = dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            loss_weight=1.0),
+        loss_l1: ConfigType = dict(
+            type='L1Loss', reduction='sum', loss_weight=1.0),
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        init_cfg: OptMultiConfig = dict(
+            type='Kaiming',
+            layer='Conv2d',
+            a=math.sqrt(5),
+            distribution='uniform',
+            mode='fan_in',
+            nonlinearity='leaky_relu')
+    ) -> None:
+
+        super().__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.cls_out_channels = num_classes
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.strides = strides
+        self.use_depthwise = use_depthwise
+        self.dcn_on_last_conv = dcn_on_last_conv
+        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
+        self.conv_bias = conv_bias
+        self.use_sigmoid_cls = True
+
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+
+        self.loss_cls: nn.Module = MODELS.build(loss_cls)
+        self.loss_bbox: nn.Module = MODELS.build(loss_bbox)
+        self.loss_obj: nn.Module = MODELS.build(loss_obj)
+
+        self.use_l1 = False  # This flag will be modified by hooks.
+        self.loss_l1: nn.Module = MODELS.build(loss_l1)
+
+        self.prior_generator = MlvlPointGenerator(strides, offset=0)
+
+        self.test_cfg = test_cfg
+        self.train_cfg = train_cfg
+
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            # YOLOX does not support sampling
+            self.sampler = PseudoSampler()
+
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize heads for all level feature maps."""
+        self.multi_level_cls_convs = nn.ModuleList()
+        self.multi_level_reg_convs = nn.ModuleList()
+        self.multi_level_conv_cls = nn.ModuleList()
+        self.multi_level_conv_reg = nn.ModuleList()
+        self.multi_level_conv_obj = nn.ModuleList()
+        for _ in self.strides:
+            self.multi_level_cls_convs.append(self._build_stacked_convs())
+            self.multi_level_reg_convs.append(self._build_stacked_convs())
+            conv_cls, conv_reg, conv_obj = self._build_predictor()
+            self.multi_level_conv_cls.append(conv_cls)
+            self.multi_level_conv_reg.append(conv_reg)
+            self.multi_level_conv_obj.append(conv_obj)
+
+    def _build_stacked_convs(self) -> nn.Sequential:
+        """Initialize conv layers of a single level head."""
+        conv = DepthwiseSeparableConvModule \
+            if self.use_depthwise else ConvModule
+        stacked_convs = []
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
+                conv_cfg = dict(type='DCNv2')
+            else:
+                conv_cfg = self.conv_cfg
+            stacked_convs.append(
+                conv(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    bias=self.conv_bias))
+        return nn.Sequential(*stacked_convs)
+
+    def _build_predictor(self) -> Tuple[nn.Module, nn.Module, nn.Module]:
+        """Initialize predictor layers of a single level head."""
+        conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1)
+        conv_reg = nn.Conv2d(self.feat_channels, 4, 1)
+        conv_obj = nn.Conv2d(self.feat_channels, 1, 1)
+        return conv_cls, conv_reg, conv_obj
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        super(YOLOXHead, self).init_weights()
+        # Use prior in model initialization to improve stability
+        bias_init = bias_init_with_prob(0.01)
+        for conv_cls, conv_obj in zip(self.multi_level_conv_cls,
+                                      self.multi_level_conv_obj):
+            conv_cls.bias.data.fill_(bias_init)
+            conv_obj.bias.data.fill_(bias_init)
+
+    def forward_single(self, x: Tensor, cls_convs: nn.Module,
+                       reg_convs: nn.Module, conv_cls: nn.Module,
+                       conv_reg: nn.Module,
+                       conv_obj: nn.Module) -> Tuple[Tensor, Tensor, Tensor]:
+        """Forward feature of a single scale level."""
+
+        cls_feat = cls_convs(x)
+        reg_feat = reg_convs(x)
+
+        cls_score = conv_cls(cls_feat)
+        bbox_pred = conv_reg(reg_feat)
+        objectness = conv_obj(reg_feat)
+
+        return cls_score, bbox_pred, objectness
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (Tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+        Returns:
+            Tuple[List]: A tuple of multi-level classification scores, bbox
+            predictions, and objectnesses.
+        """
+
+        return multi_apply(self.forward_single, x, self.multi_level_cls_convs,
+                           self.multi_level_reg_convs,
+                           self.multi_level_conv_cls,
+                           self.multi_level_conv_reg,
+                           self.multi_level_conv_obj)
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        objectnesses: Optional[List[Tensor]],
+                        batch_img_metas: Optional[List[dict]] = None,
+                        cfg: Optional[ConfigDict] = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> List[InstanceData]:
+        """Transform a batch of output features extracted by the head into
+        bbox results.
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            objectnesses (list[Tensor], Optional): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, 1, H, W).
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(objectnesses)
+        cfg = self.test_cfg if cfg is None else cfg
+
+        num_imgs = len(batch_img_metas)
+        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=cls_scores[0].dtype,
+            device=cls_scores[0].device,
+            with_stride=True)
+
+        # flatten cls_scores, bbox_preds and objectness
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_objectness = [
+            objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
+            for objectness in objectnesses
+        ]
+
+        flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
+        flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
+        flatten_priors = torch.cat(mlvl_priors)
+
+        flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds)
+
+        result_list = []
+        for img_id, img_meta in enumerate(batch_img_metas):
+            max_scores, labels = torch.max(flatten_cls_scores[img_id], 1)
+            valid_mask = flatten_objectness[
+                img_id] * max_scores >= cfg.score_thr
+            results = InstanceData(
+                bboxes=flatten_bboxes[img_id][valid_mask],
+                scores=max_scores[valid_mask] *
+                flatten_objectness[img_id][valid_mask],
+                labels=labels[valid_mask])
+
+            result_list.append(
+                self._bbox_post_process(
+                    results=results,
+                    cfg=cfg,
+                    rescale=rescale,
+                    with_nms=with_nms,
+                    img_meta=img_meta))
+
+        return result_list
+
+    def _bbox_decode(self, priors: Tensor, bbox_preds: Tensor) -> Tensor:
+        """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x,
+        tl_y, br_x, br_y).
+
+        Args:
+            priors (Tensor): Center proiors of an image, has shape
+                (num_instances, 2).
+            bbox_preds (Tensor): Box energies / deltas for all instances,
+                has shape (batch_size, num_instances, 4).
+
+        Returns:
+            Tensor: Decoded bboxes in (tl_x, tl_y, br_x, br_y) format. Has
+            shape (batch_size, num_instances, 4).
+        """
+        xys = (bbox_preds[..., :2] * priors[:, 2:]) + priors[:, :2]
+        whs = bbox_preds[..., 2:].exp() * priors[:, 2:]
+
+        tl_x = (xys[..., 0] - whs[..., 0] / 2)
+        tl_y = (xys[..., 1] - whs[..., 1] / 2)
+        br_x = (xys[..., 0] + whs[..., 0] / 2)
+        br_y = (xys[..., 1] + whs[..., 1] / 2)
+
+        decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1)
+        return decoded_bboxes
+
+    def _bbox_post_process(self,
+                           results: InstanceData,
+                           cfg: ConfigDict,
+                           rescale: bool = False,
+                           with_nms: bool = True,
+                           img_meta: Optional[dict] = None) -> InstanceData:
+        """bbox post-processing method.
+
+        The boxes would be rescaled to the original image scale and do
+        the nms operation. Usually `with_nms` is False is used for aug test.
+
+        Args:
+            results (:obj:`InstaceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            cfg (mmengine.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default to True.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            results.bboxes /= results.bboxes.new_tensor(
+                img_meta['scale_factor']).repeat((1, 2))
+
+        if with_nms and results.bboxes.numel() > 0:
+            det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores,
+                                                results.labels, cfg.nms)
+            results = results[keep_idxs]
+            # some nms would reweight the score, such as softnms
+            results.scores = det_bboxes[:, -1]
+        return results
+
+    def loss_by_feat(
+            self,
+            cls_scores: Sequence[Tensor],
+            bbox_preds: Sequence[Tensor],
+            objectnesses: Sequence[Tensor],
+            batch_gt_instances: Sequence[InstanceData],
+            batch_img_metas: Sequence[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (Sequence[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_priors * num_classes.
+            bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_priors * 4.
+            objectnesses (Sequence[Tensor]): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, 1, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+        Returns:
+            dict[str, Tensor]: A dictionary of losses.
+        """
+        num_imgs = len(batch_img_metas)
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+
+        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
+        mlvl_priors = self.prior_generator.grid_priors(
+            featmap_sizes,
+            dtype=cls_scores[0].dtype,
+            device=cls_scores[0].device,
+            with_stride=True)
+
+        flatten_cls_preds = [
+            cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                 self.cls_out_channels)
+            for cls_pred in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_objectness = [
+            objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
+            for objectness in objectnesses
+        ]
+
+        flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
+        flatten_objectness = torch.cat(flatten_objectness, dim=1)
+        flatten_priors = torch.cat(mlvl_priors)
+        flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds)
+
+        (pos_masks, cls_targets, obj_targets, bbox_targets, l1_targets,
+         num_fg_imgs) = multi_apply(
+             self._get_targets_single,
+             flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1),
+             flatten_cls_preds.detach(), flatten_bboxes.detach(),
+             flatten_objectness.detach(), batch_gt_instances, batch_img_metas,
+             batch_gt_instances_ignore)
+
+        # The experimental results show that 'reduce_mean' can improve
+        # performance on the COCO dataset.
+        num_pos = torch.tensor(
+            sum(num_fg_imgs),
+            dtype=torch.float,
+            device=flatten_cls_preds.device)
+        num_total_samples = max(reduce_mean(num_pos), 1.0)
+
+        pos_masks = torch.cat(pos_masks, 0)
+        cls_targets = torch.cat(cls_targets, 0)
+        obj_targets = torch.cat(obj_targets, 0)
+        bbox_targets = torch.cat(bbox_targets, 0)
+        if self.use_l1:
+            l1_targets = torch.cat(l1_targets, 0)
+
+        loss_obj = self.loss_obj(flatten_objectness.view(-1, 1),
+                                 obj_targets) / num_total_samples
+        if num_pos > 0:
+            loss_cls = self.loss_cls(
+                flatten_cls_preds.view(-1, self.num_classes)[pos_masks],
+                cls_targets) / num_total_samples
+            loss_bbox = self.loss_bbox(
+                flatten_bboxes.view(-1, 4)[pos_masks],
+                bbox_targets) / num_total_samples
+        else:
+            # Avoid cls and reg branch not participating in the gradient
+            # propagation when there is no ground-truth in the images.
+            # For more details, please refer to
+            # https://github.com/open-mmlab/mmdetection/issues/7298
+            loss_cls = flatten_cls_preds.sum() * 0
+            loss_bbox = flatten_bboxes.sum() * 0
+
+        loss_dict = dict(
+            loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj)
+
+        if self.use_l1:
+            if num_pos > 0:
+                loss_l1 = self.loss_l1(
+                    flatten_bbox_preds.view(-1, 4)[pos_masks],
+                    l1_targets) / num_total_samples
+            else:
+                # Avoid cls and reg branch not participating in the gradient
+                # propagation when there is no ground-truth in the images.
+                # For more details, please refer to
+                # https://github.com/open-mmlab/mmdetection/issues/7298
+                loss_l1 = flatten_bbox_preds.sum() * 0
+            loss_dict.update(loss_l1=loss_l1)
+
+        return loss_dict
+
+    @torch.no_grad()
+    def _get_targets_single(
+            self,
+            priors: Tensor,
+            cls_preds: Tensor,
+            decoded_bboxes: Tensor,
+            objectness: Tensor,
+            gt_instances: InstanceData,
+            img_meta: dict,
+            gt_instances_ignore: Optional[InstanceData] = None) -> tuple:
+        """Compute classification, regression, and objectness targets for
+        priors in a single image.
+
+        Args:
+            priors (Tensor): All priors of one image, a 2D-Tensor with shape
+                [num_priors, 4] in [cx, xy, stride_w, stride_y] format.
+            cls_preds (Tensor): Classification predictions of one image,
+                a 2D-Tensor with shape [num_priors, num_classes]
+            decoded_bboxes (Tensor): Decoded bboxes predictions of one image,
+                a 2D-Tensor with shape [num_priors, 4] in [tl_x, tl_y,
+                br_x, br_y] format.
+            objectness (Tensor): Objectness predictions of one image,
+                a 1D-Tensor with shape [num_priors]
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should includes ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+        Returns:
+            tuple:
+                foreground_mask (list[Tensor]): Binary mask of foreground
+                targets.
+                cls_target (list[Tensor]): Classification targets of an image.
+                obj_target (list[Tensor]): Objectness targets of an image.
+                bbox_target (list[Tensor]): BBox targets of an image.
+                l1_target (int): BBox L1 targets of an image.
+                num_pos_per_img (int): Number of positive samples in an image.
+        """
+
+        num_priors = priors.size(0)
+        num_gts = len(gt_instances)
+        # No target
+        if num_gts == 0:
+            cls_target = cls_preds.new_zeros((0, self.num_classes))
+            bbox_target = cls_preds.new_zeros((0, 4))
+            l1_target = cls_preds.new_zeros((0, 4))
+            obj_target = cls_preds.new_zeros((num_priors, 1))
+            foreground_mask = cls_preds.new_zeros(num_priors).bool()
+            return (foreground_mask, cls_target, obj_target, bbox_target,
+                    l1_target, 0)
+
+        # YOLOX uses center priors with 0.5 offset to assign targets,
+        # but use center priors without offset to regress bboxes.
+        offset_priors = torch.cat(
+            [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1)
+
+        scores = cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid()
+        pred_instances = InstanceData(
+            bboxes=decoded_bboxes, scores=scores.sqrt_(), priors=offset_priors)
+        assign_result = self.assigner.assign(
+            pred_instances=pred_instances,
+            gt_instances=gt_instances,
+            gt_instances_ignore=gt_instances_ignore)
+
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+        pos_inds = sampling_result.pos_inds
+        num_pos_per_img = pos_inds.size(0)
+
+        pos_ious = assign_result.max_overlaps[pos_inds]
+        # IOU aware classification score
+        cls_target = F.one_hot(sampling_result.pos_gt_labels,
+                               self.num_classes) * pos_ious.unsqueeze(-1)
+        obj_target = torch.zeros_like(objectness).unsqueeze(-1)
+        obj_target[pos_inds] = 1
+        bbox_target = sampling_result.pos_gt_bboxes
+        l1_target = cls_preds.new_zeros((num_pos_per_img, 4))
+        if self.use_l1:
+            l1_target = self._get_l1_target(l1_target, bbox_target,
+                                            priors[pos_inds])
+        foreground_mask = torch.zeros_like(objectness).to(torch.bool)
+        foreground_mask[pos_inds] = 1
+        return (foreground_mask, cls_target, obj_target, bbox_target,
+                l1_target, num_pos_per_img)
+
+    def _get_l1_target(self,
+                       l1_target: Tensor,
+                       gt_bboxes: Tensor,
+                       priors: Tensor,
+                       eps: float = 1e-8) -> Tensor:
+        """Convert gt bboxes to center offset and log width height."""
+        gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes)
+        l1_target[:, :2] = (gt_cxcywh[:, :2] - priors[:, :2]) / priors[:, 2:]
+        l1_target[:, 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps)
+        return l1_target
diff --git a/mmde/mmdet/models/detectors/__init__.py b/mmde/mmdet/models/detectors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5a06d2813c810504e12592506be9347111d6696
--- /dev/null
+++ b/mmde/mmdet/models/detectors/__init__.py
@@ -0,0 +1,75 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .atss import ATSS
+from .autoassign import AutoAssign
+from .base import BaseDetector
+from .base_detr import DetectionTransformer
+from .boxinst import BoxInst
+from .cascade_rcnn import CascadeRCNN
+from .centernet import CenterNet
+from .condinst import CondInst
+from .conditional_detr import ConditionalDETR
+from .cornernet import CornerNet
+from .crowddet import CrowdDet
+from .d2_wrapper import Detectron2Wrapper
+from .dab_detr import DABDETR
+from .ddod import DDOD
+from .ddq_detr import DDQDETR
+from .deformable_detr import DeformableDETR
+from .detr import DETR
+from .dino import DINO
+from .fast_rcnn import FastRCNN
+from .faster_rcnn import FasterRCNN
+from .fcos import FCOS
+from .fovea import FOVEA
+from .fsaf import FSAF
+from .gfl import GFL
+from .glip import GLIP
+from .grid_rcnn import GridRCNN
+from .grounding_dino import GroundingDINO
+from .htc import HybridTaskCascade
+from .kd_one_stage import KnowledgeDistillationSingleStageDetector
+from .lad import LAD
+from .mask2former import Mask2Former
+from .mask_rcnn import MaskRCNN
+from .mask_scoring_rcnn import MaskScoringRCNN
+from .maskformer import MaskFormer
+from .nasfcos import NASFCOS
+from .paa import PAA
+from .panoptic_fpn import PanopticFPN
+from .panoptic_two_stage_segmentor import TwoStagePanopticSegmentor
+from .point_rend import PointRend
+from .queryinst import QueryInst
+from .reppoints_detector import RepPointsDetector
+from .retinanet import RetinaNet
+from .rpn import RPN
+from .rtmdet import RTMDet
+from .scnet import SCNet
+from .semi_base import SemiBaseDetector
+from .single_stage import SingleStageDetector
+from .soft_teacher import SoftTeacher
+from .solo import SOLO
+from .solov2 import SOLOv2
+from .sparse_rcnn import SparseRCNN
+from .tood import TOOD
+from .trident_faster_rcnn import TridentFasterRCNN
+from .two_stage import TwoStageDetector
+from .vfnet import VFNet
+from .yolact import YOLACT
+from .yolo import YOLOV3
+from .yolof import YOLOF
+from .yolox import YOLOX
+
+__all__ = [
+    'ATSS', 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN',
+    'KnowledgeDistillationSingleStageDetector', 'FastRCNN', 'FasterRCNN',
+    'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade', 'RetinaNet', 'FCOS',
+    'GridRCNN', 'MaskScoringRCNN', 'RepPointsDetector', 'FOVEA', 'FSAF',
+    'NASFCOS', 'PointRend', 'GFL', 'CornerNet', 'PAA', 'YOLOV3', 'YOLACT',
+    'VFNet', 'DETR', 'TridentFasterRCNN', 'SparseRCNN', 'SCNet', 'SOLO',
+    'SOLOv2', 'DeformableDETR', 'AutoAssign', 'YOLOF', 'CenterNet', 'YOLOX',
+    'TwoStagePanopticSegmentor', 'PanopticFPN', 'QueryInst', 'LAD', 'TOOD',
+    'MaskFormer', 'DDOD', 'Mask2Former', 'SemiBaseDetector', 'SoftTeacher',
+    'RTMDet', 'Detectron2Wrapper', 'CrowdDet', 'CondInst', 'BoxInst',
+    'DetectionTransformer', 'ConditionalDETR', 'DINO', 'DABDETR', 'GLIP',
+    'DDQDETR', 'GroundingDINO'
+]
diff --git a/mmde/mmdet/models/detectors/atss.py b/mmde/mmdet/models/detectors/atss.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bfcc728dc4cc33c0b705a2ab22a4e3f4ad7386d
--- /dev/null
+++ b/mmde/mmdet/models/detectors/atss.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class ATSS(SingleStageDetector):
+    """Implementation of `ATSS <https://arxiv.org/abs/1912.02424>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of ATSS. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of ATSS. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/autoassign.py b/mmde/mmdet/models/detectors/autoassign.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b3570fe6e0c3812a72bc677038bb4e76b05576
--- /dev/null
+++ b/mmde/mmdet/models/detectors/autoassign.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class AutoAssign(SingleStageDetector):
+    """Implementation of `AutoAssign: Differentiable Label Assignment for Dense
+    Object Detection <https://arxiv.org/abs/2007.03496>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of AutoAssign. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of AutoAssign. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/base.py b/mmde/mmdet/models/detectors/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a193b0ca9ca3d2b42fda452004d5c97421f426c
--- /dev/null
+++ b/mmde/mmdet/models/detectors/base.py
@@ -0,0 +1,156 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Tuple, Union
+
+import torch
+from mmengine.model import BaseModel
+from torch import Tensor
+
+from mmdet.structures import DetDataSample, OptSampleList, SampleList
+from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig
+from ..utils import samplelist_boxtype2tensor
+
+ForwardResults = Union[Dict[str, torch.Tensor], List[DetDataSample],
+                       Tuple[torch.Tensor], torch.Tensor]
+
+
+class BaseDetector(BaseModel, metaclass=ABCMeta):
+    """Base class for detectors.
+
+    Args:
+       data_preprocessor (dict or ConfigDict, optional): The pre-process
+           config of :class:`BaseDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+       init_cfg (dict or ConfigDict, optional): the config to control the
+           initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+
+    @property
+    def with_neck(self) -> bool:
+        """bool: whether the detector has a neck"""
+        return hasattr(self, 'neck') and self.neck is not None
+
+    # TODO: these properties need to be carefully handled
+    # for both single stage & two stage detectors
+    @property
+    def with_shared_head(self) -> bool:
+        """bool: whether the detector has a shared head in the RoI Head"""
+        return hasattr(self, 'roi_head') and self.roi_head.with_shared_head
+
+    @property
+    def with_bbox(self) -> bool:
+        """bool: whether the detector has a bbox head"""
+        return ((hasattr(self, 'roi_head') and self.roi_head.with_bbox)
+                or (hasattr(self, 'bbox_head') and self.bbox_head is not None))
+
+    @property
+    def with_mask(self) -> bool:
+        """bool: whether the detector has a mask head"""
+        return ((hasattr(self, 'roi_head') and self.roi_head.with_mask)
+                or (hasattr(self, 'mask_head') and self.mask_head is not None))
+
+    def forward(self,
+                inputs: torch.Tensor,
+                data_samples: OptSampleList = None,
+                mode: str = 'tensor') -> ForwardResults:
+        """The unified entry for a forward process in both training and test.
+
+        The method should accept three modes: "tensor", "predict" and "loss":
+
+        - "tensor": Forward the whole network and return tensor or tuple of
+        tensor without any post-processing, same as a common nn.Module.
+        - "predict": Forward and return the predictions, which are fully
+        processed to a list of :obj:`DetDataSample`.
+        - "loss": Forward and return a dict of losses according to the given
+        inputs and data samples.
+
+        Note that this method doesn't handle either back propagation or
+        parameter update, which are supposed to be done in :meth:`train_step`.
+
+        Args:
+            inputs (torch.Tensor): The input tensor with shape
+                (N, C, ...) in general.
+            data_samples (list[:obj:`DetDataSample`], optional): A batch of
+                data samples that contain annotations and predictions.
+                Defaults to None.
+            mode (str): Return what kind of value. Defaults to 'tensor'.
+
+        Returns:
+            The return type depends on ``mode``.
+
+            - If ``mode="tensor"``, return a tensor or a tuple of tensor.
+            - If ``mode="predict"``, return a list of :obj:`DetDataSample`.
+            - If ``mode="loss"``, return a dict of tensor.
+        """
+        if mode == 'loss':
+            return self.loss(inputs, data_samples)
+        elif mode == 'predict':
+            return self.predict(inputs, data_samples)
+        elif mode == 'tensor':
+            return self._forward(inputs, data_samples)
+        else:
+            raise RuntimeError(f'Invalid mode "{mode}". '
+                               'Only supports loss, predict and tensor mode')
+
+    @abstractmethod
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> Union[dict, tuple]:
+        """Calculate losses from a batch of inputs and data samples."""
+        pass
+
+    @abstractmethod
+    def predict(self, batch_inputs: Tensor,
+                batch_data_samples: SampleList) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing."""
+        pass
+
+    @abstractmethod
+    def _forward(self,
+                 batch_inputs: Tensor,
+                 batch_data_samples: OptSampleList = None):
+        """Network forward process.
+
+        Usually includes backbone, neck and head forward without any post-
+        processing.
+        """
+        pass
+
+    @abstractmethod
+    def extract_feat(self, batch_inputs: Tensor):
+        """Extract features from images."""
+        pass
+
+    def add_pred_to_datasample(self, data_samples: SampleList,
+                               results_list: InstanceList) -> SampleList:
+        """Add predictions to `DetDataSample`.
+
+        Args:
+            data_samples (list[:obj:`DetDataSample`], optional): A batch of
+                data samples that contain annotations and predictions.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+
+        Returns:
+            list[:obj:`DetDataSample`]: Detection results of the
+            input images. Each DetDataSample usually contain
+            'pred_instances'. And the ``pred_instances`` usually
+            contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        for data_sample, pred_instances in zip(data_samples, results_list):
+            data_sample.pred_instances = pred_instances
+        samplelist_boxtype2tensor(data_samples)
+        return data_samples
diff --git a/mmde/mmdet/models/detectors/base_detr.py b/mmde/mmdet/models/detectors/base_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..88f00ec7408c389a1eb06beac6b383007f80b893
--- /dev/null
+++ b/mmde/mmdet/models/detectors/base_detr.py
@@ -0,0 +1,332 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Tuple, Union
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptSampleList, SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .base import BaseDetector
+
+
+@MODELS.register_module()
+class DetectionTransformer(BaseDetector, metaclass=ABCMeta):
+    r"""Base class for Detection Transformer.
+
+    In Detection Transformer, an encoder is used to process output features of
+    neck, then several queries interact with the encoder features using a
+    decoder and do the regression and classification with the bounding box
+    head.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): Config of the backbone.
+        neck (:obj:`ConfigDict` or dict, optional): Config of the neck.
+            Defaults to None.
+        encoder (:obj:`ConfigDict` or dict, optional): Config of the
+            Transformer encoder. Defaults to None.
+        decoder (:obj:`ConfigDict` or dict, optional): Config of the
+            Transformer decoder. Defaults to None.
+        bbox_head (:obj:`ConfigDict` or dict, optional): Config for the
+            bounding box head module. Defaults to None.
+        positional_encoding (:obj:`ConfigDict` or dict, optional): Config
+            of the positional encoding module. Defaults to None.
+        num_queries (int, optional): Number of decoder query in Transformer.
+            Defaults to 100.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
+            the bounding box head module. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            the bounding box head module. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`BaseDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 encoder: OptConfigType = None,
+                 decoder: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 positional_encoding: OptConfigType = None,
+                 num_queries: int = 100,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        # process args
+        bbox_head.update(train_cfg=train_cfg)
+        bbox_head.update(test_cfg=test_cfg)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.encoder = encoder
+        self.decoder = decoder
+        self.positional_encoding = positional_encoding
+        self.num_queries = num_queries
+
+        # init model layers
+        self.backbone = MODELS.build(backbone)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+        self.bbox_head = MODELS.build(bbox_head)
+        self._init_layers()
+
+    @abstractmethod
+    def _init_layers(self) -> None:
+        """Initialize layers except for backbone, neck and bbox_head."""
+        pass
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> Union[dict, list]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (bs, dim, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components
+        """
+        img_feats = self.extract_feat(batch_inputs)
+        head_inputs_dict = self.forward_transformer(img_feats,
+                                                    batch_data_samples)
+        losses = self.bbox_head.loss(
+            **head_inputs_dict, batch_data_samples=batch_data_samples)
+
+        return losses
+
+    def predict(self,
+                batch_inputs: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs, has shape (bs, dim, H, W).
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`DetDataSample`]: Detection results of the input images.
+            Each DetDataSample usually contain 'pred_instances'. And the
+            `pred_instances` usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        img_feats = self.extract_feat(batch_inputs)
+        head_inputs_dict = self.forward_transformer(img_feats,
+                                                    batch_data_samples)
+        results_list = self.bbox_head.predict(
+            **head_inputs_dict,
+            rescale=rescale,
+            batch_data_samples=batch_data_samples)
+        batch_data_samples = self.add_pred_to_datasample(
+            batch_data_samples, results_list)
+        return batch_data_samples
+
+    def _forward(
+            self,
+            batch_inputs: Tensor,
+            batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+         Args:
+            batch_inputs (Tensor): Inputs, has shape (bs, dim, H, W).
+            batch_data_samples (List[:obj:`DetDataSample`], optional): The
+                batch data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+                Defaults to None.
+
+        Returns:
+            tuple[Tensor]: A tuple of features from ``bbox_head`` forward.
+        """
+        img_feats = self.extract_feat(batch_inputs)
+        head_inputs_dict = self.forward_transformer(img_feats,
+                                                    batch_data_samples)
+        results = self.bbox_head.forward(**head_inputs_dict)
+        return results
+
+    def forward_transformer(self,
+                            img_feats: Tuple[Tensor],
+                            batch_data_samples: OptSampleList = None) -> Dict:
+        """Forward process of Transformer, which includes four steps:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'. We
+        summarized the parameters flow of the existing DETR-like detector,
+        which can be illustrated as follow:
+
+        .. code:: text
+
+                 img_feats & batch_data_samples
+                               |
+                               V
+                      +-----------------+
+                      | pre_transformer |
+                      +-----------------+
+                          |          |
+                          |          V
+                          |    +-----------------+
+                          |    | forward_encoder |
+                          |    +-----------------+
+                          |             |
+                          |             V
+                          |     +---------------+
+                          |     |  pre_decoder  |
+                          |     +---------------+
+                          |         |       |
+                          V         V       |
+                      +-----------------+   |
+                      | forward_decoder |   |
+                      +-----------------+   |
+                                |           |
+                                V           V
+                               head_inputs_dict
+
+        Args:
+            img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each
+                    feature map has shape (bs, dim, H, W).
+            batch_data_samples (list[:obj:`DetDataSample`], optional): The
+                batch data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+                Defaults to None.
+
+        Returns:
+            dict: The dictionary of bbox_head function inputs, which always
+            includes the `hidden_states` of the decoder output and may contain
+            `references` including the initial and intermediate references.
+        """
+        encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer(
+            img_feats, batch_data_samples)
+
+        encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict)
+
+        tmp_dec_in, head_inputs_dict = self.pre_decoder(**encoder_outputs_dict)
+        decoder_inputs_dict.update(tmp_dec_in)
+
+        decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict)
+        head_inputs_dict.update(decoder_outputs_dict)
+        return head_inputs_dict
+
+    def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]:
+        """Extract features.
+
+        Args:
+            batch_inputs (Tensor): Image tensor, has shape (bs, dim, H, W).
+
+        Returns:
+            tuple[Tensor]: Tuple of feature maps from neck. Each feature map
+            has shape (bs, dim, H, W).
+        """
+        x = self.backbone(batch_inputs)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    @abstractmethod
+    def pre_transformer(
+            self,
+            img_feats: Tuple[Tensor],
+            batch_data_samples: OptSampleList = None) -> Tuple[Dict, Dict]:
+        """Process image features before feeding them to the transformer.
+
+        Args:
+            img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each
+                feature map has shape (bs, dim, H, W).
+            batch_data_samples (list[:obj:`DetDataSample`], optional): The
+                batch data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+                Defaults to None.
+
+        Returns:
+            tuple[dict, dict]: The first dict contains the inputs of encoder
+            and the second dict contains the inputs of decoder.
+
+            - encoder_inputs_dict (dict): The keyword args dictionary of
+              `self.forward_encoder()`, which includes 'feat', 'feat_mask',
+              'feat_pos', and other algorithm-specific arguments.
+            - decoder_inputs_dict (dict): The keyword args dictionary of
+              `self.forward_decoder()`, which includes 'memory_mask', and
+              other algorithm-specific arguments.
+        """
+        pass
+
+    @abstractmethod
+    def forward_encoder(self, feat: Tensor, feat_mask: Tensor,
+                        feat_pos: Tensor, **kwargs) -> Dict:
+        """Forward with Transformer encoder.
+
+        Args:
+            feat (Tensor): Sequential features, has shape (bs, num_feat_points,
+                dim).
+            feat_mask (Tensor): ByteTensor, the padding mask of the features,
+                has shape (bs, num_feat_points).
+            feat_pos (Tensor): The positional embeddings of the features, has
+                shape (bs, num_feat_points, dim).
+
+        Returns:
+            dict: The dictionary of encoder outputs, which includes the
+            `memory` of the encoder output and other algorithm-specific
+            arguments.
+        """
+        pass
+
+    @abstractmethod
+    def pre_decoder(self, memory: Tensor, **kwargs) -> Tuple[Dict, Dict]:
+        """Prepare intermediate variables before entering Transformer decoder,
+        such as `query`, `query_pos`, and `reference_points`.
+
+        Args:
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+
+        Returns:
+            tuple[dict, dict]: The first dict contains the inputs of decoder
+            and the second dict contains the inputs of the bbox_head function.
+
+            - decoder_inputs_dict (dict): The keyword dictionary args of
+              `self.forward_decoder()`, which includes 'query', 'query_pos',
+              'memory', and other algorithm-specific arguments.
+            - head_inputs_dict (dict): The keyword dictionary args of the
+              bbox_head functions, which is usually empty, or includes
+              `enc_outputs_class` and `enc_outputs_class` when the detector
+              support 'two stage' or 'query selection' strategies.
+        """
+        pass
+
+    @abstractmethod
+    def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor,
+                        **kwargs) -> Dict:
+        """Forward with Transformer decoder.
+
+        Args:
+            query (Tensor): The queries of decoder inputs, has shape
+                (bs, num_queries, dim).
+            query_pos (Tensor): The positional queries of decoder inputs,
+                has shape (bs, num_queries, dim).
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+
+        Returns:
+            dict: The dictionary of decoder outputs, which includes the
+            `hidden_states` of the decoder output, `references` including
+            the initial and intermediate reference_points, and other
+            algorithm-specific arguments.
+        """
+        pass
diff --git a/mmde/mmdet/models/detectors/boxinst.py b/mmde/mmdet/models/detectors/boxinst.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca6b0bdd90a2a7e78f429a6822dbde6f809426da
--- /dev/null
+++ b/mmde/mmdet/models/detectors/boxinst.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage_instance_seg import SingleStageInstanceSegmentor
+
+
+@MODELS.register_module()
+class BoxInst(SingleStageInstanceSegmentor):
+    """Implementation of `BoxInst <https://arxiv.org/abs/2012.02310>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 mask_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/cascade_rcnn.py b/mmde/mmdet/models/detectors/cascade_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecf733ff104b99436fcc74130b0ccea12a0fa6d0
--- /dev/null
+++ b/mmde/mmdet/models/detectors/cascade_rcnn.py
@@ -0,0 +1,29 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class CascadeRCNN(TwoStageDetector):
+    r"""Implementation of `Cascade R-CNN: Delving into High Quality Object
+    Detection <https://arxiv.org/abs/1906.09756>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 rpn_head: OptConfigType = None,
+                 roi_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/centernet.py b/mmde/mmdet/models/detectors/centernet.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c6622d6280227ecba9ede4aabf72c22a764e11d
--- /dev/null
+++ b/mmde/mmdet/models/detectors/centernet.py
@@ -0,0 +1,29 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class CenterNet(SingleStageDetector):
+    """Implementation of CenterNet(Objects as Points)
+
+    <https://arxiv.org/abs/1904.07850>.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/condinst.py b/mmde/mmdet/models/detectors/condinst.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed2dc99eea3faf7b03a3970d46a372d28eb89fe1
--- /dev/null
+++ b/mmde/mmdet/models/detectors/condinst.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage_instance_seg import SingleStageInstanceSegmentor
+
+
+@MODELS.register_module()
+class CondInst(SingleStageInstanceSegmentor):
+    """Implementation of `CondInst <https://arxiv.org/abs/2003.05664>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 mask_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/conditional_detr.py b/mmde/mmdet/models/detectors/conditional_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..d57868e63a2ece085a7e5b67ee93c921ba334830
--- /dev/null
+++ b/mmde/mmdet/models/detectors/conditional_detr.py
@@ -0,0 +1,74 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+import torch.nn as nn
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from ..layers import (ConditionalDetrTransformerDecoder,
+                      DetrTransformerEncoder, SinePositionalEncoding)
+from .detr import DETR
+
+
+@MODELS.register_module()
+class ConditionalDETR(DETR):
+    r"""Implementation of `Conditional DETR for Fast Training Convergence.
+
+    <https://arxiv.org/abs/2108.06152>`_.
+
+    Code is modified from the `official github repo
+    <https://github.com/Atten4Vis/ConditionalDETR>`_.
+    """
+
+    def _init_layers(self) -> None:
+        """Initialize layers except for backbone, neck and bbox_head."""
+        self.positional_encoding = SinePositionalEncoding(
+            **self.positional_encoding)
+        self.encoder = DetrTransformerEncoder(**self.encoder)
+        self.decoder = ConditionalDetrTransformerDecoder(**self.decoder)
+        self.embed_dims = self.encoder.embed_dims
+        # NOTE The embed_dims is typically passed from the inside out.
+        # For example in DETR, The embed_dims is passed as
+        # self_attn -> the first encoder layer -> encoder -> detector.
+        self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims)
+
+        num_feats = self.positional_encoding.num_feats
+        assert num_feats * 2 == self.embed_dims, \
+            f'embed_dims should be exactly 2 times of num_feats. ' \
+            f'Found {self.embed_dims} and {num_feats}.'
+
+    def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor,
+                        memory_mask: Tensor, memory_pos: Tensor) -> Dict:
+        """Forward with Transformer decoder.
+
+        Args:
+            query (Tensor): The queries of decoder inputs, has shape
+                (bs, num_queries, dim).
+            query_pos (Tensor): The positional queries of decoder inputs,
+                has shape (bs, num_queries, dim).
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points).
+            memory_pos (Tensor): The positional embeddings of memory, has
+                shape (bs, num_feat_points, dim).
+
+        Returns:
+            dict: The dictionary of decoder outputs, which includes the
+            `hidden_states` and `references` of the decoder output.
+
+            - hidden_states (Tensor): Has shape
+                (num_decoder_layers, bs, num_queries, dim)
+            - references (Tensor): Has shape
+                (bs, num_queries, 2)
+        """
+
+        hidden_states, references = self.decoder(
+            query=query,
+            key=memory,
+            query_pos=query_pos,
+            key_pos=memory_pos,
+            key_padding_mask=memory_mask)
+        head_inputs_dict = dict(
+            hidden_states=hidden_states, references=references)
+        return head_inputs_dict
diff --git a/mmde/mmdet/models/detectors/cornernet.py b/mmde/mmdet/models/detectors/cornernet.py
new file mode 100644
index 0000000000000000000000000000000000000000..946af4dbe6ae339d44f8db265ff7f11b9e02d239
--- /dev/null
+++ b/mmde/mmdet/models/detectors/cornernet.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class CornerNet(SingleStageDetector):
+    """CornerNet.
+
+    This detector is the implementation of the paper `CornerNet: Detecting
+    Objects as Paired Keypoints <https://arxiv.org/abs/1808.01244>`_ .
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/crowddet.py b/mmde/mmdet/models/detectors/crowddet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f43bc08aa95756324381ee4182f001a008613c8
--- /dev/null
+++ b/mmde/mmdet/models/detectors/crowddet.py
@@ -0,0 +1,45 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class CrowdDet(TwoStageDetector):
+    """Implementation of `CrowdDet <https://arxiv.org/abs/2003.09163>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        rpn_head (:obj:`ConfigDict` or dict): The rpn config.
+        roi_head (:obj:`ConfigDict` or dict): The roi config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of FCOS. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of FCOS. Defaults to None.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 rpn_head: ConfigType,
+                 roi_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
diff --git a/mmde/mmdet/models/detectors/d2_wrapper.py b/mmde/mmdet/models/detectors/d2_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a2daa413e8fe0397ec37008d781ce449e7a26fd
--- /dev/null
+++ b/mmde/mmdet/models/detectors/d2_wrapper.py
@@ -0,0 +1,291 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import BaseBoxes
+from mmdet.structures.mask import BitmapMasks, PolygonMasks
+from mmdet.utils import ConfigType
+from .base import BaseDetector
+
+try:
+    import detectron2
+    from detectron2.config import get_cfg
+    from detectron2.modeling import build_model
+    from detectron2.structures.masks import BitMasks as D2_BitMasks
+    from detectron2.structures.masks import PolygonMasks as D2_PolygonMasks
+    from detectron2.utils.events import EventStorage
+except ImportError:
+    detectron2 = None
+
+
+def _to_cfgnode_list(cfg: ConfigType,
+                     config_list: list = [],
+                     father_name: str = 'MODEL') -> tuple:
+    """Convert the key and value of mmengine.ConfigDict into a list.
+
+    Args:
+        cfg (ConfigDict): The detectron2 model config.
+        config_list (list): A list contains the key and value of ConfigDict.
+            Defaults to [].
+        father_name (str): The father name add before the key.
+            Defaults to "MODEL".
+
+    Returns:
+        tuple:
+
+        - config_list: A list contains the key and value of ConfigDict.
+        - father_name (str): The father name add before the key.
+          Defaults to "MODEL".
+    """
+    for key, value in cfg.items():
+        name = f'{father_name}.{key.upper()}'
+        if isinstance(value, ConfigDict) or isinstance(value, dict):
+            config_list, fater_name = \
+                _to_cfgnode_list(value, config_list, name)
+        else:
+            config_list.append(name)
+            config_list.append(value)
+
+    return config_list, father_name
+
+
+def convert_d2_pred_to_datasample(data_samples: SampleList,
+                                  d2_results_list: list) -> SampleList:
+    """Convert the Detectron2's result to DetDataSample.
+
+    Args:
+        data_samples (list[:obj:`DetDataSample`]): The batch
+            data samples. It usually includes information such
+            as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+        d2_results_list (list): The list of the results of Detectron2's model.
+
+    Returns:
+        list[:obj:`DetDataSample`]: Detection results of the
+        input images. Each DetDataSample usually contain
+        'pred_instances'. And the ``pred_instances`` usually
+        contains following keys.
+
+        - scores (Tensor): Classification scores, has a shape
+          (num_instance, )
+        - labels (Tensor): Labels of bboxes, has a shape
+          (num_instances, ).
+        - bboxes (Tensor): Has a shape (num_instances, 4),
+          the last dimension 4 arrange as (x1, y1, x2, y2).
+    """
+    assert len(data_samples) == len(d2_results_list)
+    for data_sample, d2_results in zip(data_samples, d2_results_list):
+        d2_instance = d2_results['instances']
+
+        results = InstanceData()
+        results.bboxes = d2_instance.pred_boxes.tensor
+        results.scores = d2_instance.scores
+        results.labels = d2_instance.pred_classes
+
+        if d2_instance.has('pred_masks'):
+            results.masks = d2_instance.pred_masks
+        data_sample.pred_instances = results
+
+    return data_samples
+
+
+@MODELS.register_module()
+class Detectron2Wrapper(BaseDetector):
+    """Wrapper of a Detectron2 model. Input/output formats of this class follow
+    MMDetection's convention, so a Detectron2 model can be trained and
+    evaluated in MMDetection.
+
+    Args:
+        detector (:obj:`ConfigDict` or dict): The module config of
+            Detectron2.
+        bgr_to_rgb (bool): whether to convert image from BGR to RGB.
+            Defaults to False.
+        rgb_to_bgr (bool): whether to convert image from RGB to BGR.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 detector: ConfigType,
+                 bgr_to_rgb: bool = False,
+                 rgb_to_bgr: bool = False) -> None:
+        if detectron2 is None:
+            raise ImportError('Please install Detectron2 first')
+        assert not (bgr_to_rgb and rgb_to_bgr), (
+            '`bgr2rgb` and `rgb2bgr` cannot be set to True at the same time')
+        super().__init__()
+        self._channel_conversion = rgb_to_bgr or bgr_to_rgb
+        cfgnode_list, _ = _to_cfgnode_list(detector)
+        self.cfg = get_cfg()
+        self.cfg.merge_from_list(cfgnode_list)
+        self.d2_model = build_model(self.cfg)
+        self.storage = EventStorage()
+
+    def init_weights(self) -> None:
+        """Initialization Backbone.
+
+        NOTE: The initialization of other layers are in Detectron2,
+        if users want to change the initialization way, please
+        change the code in Detectron2.
+        """
+        from detectron2.checkpoint import DetectionCheckpointer
+        checkpointer = DetectionCheckpointer(model=self.d2_model)
+        checkpointer.load(self.cfg.MODEL.WEIGHTS, checkpointables=[])
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> Union[dict, tuple]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        The inputs will first convert to the Detectron2 type and feed into
+        D2 models.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        d2_batched_inputs = self._convert_to_d2_inputs(
+            batch_inputs=batch_inputs,
+            batch_data_samples=batch_data_samples,
+            training=True)
+
+        with self.storage as storage:  # noqa
+            losses = self.d2_model(d2_batched_inputs)
+        # storage contains some training information, such as cls_accuracy.
+        # you can use storage.latest() to get the detail information
+        return losses
+
+    def predict(self, batch_inputs: Tensor,
+                batch_data_samples: SampleList) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        The inputs will first convert to the Detectron2 type and feed into
+        D2 models. And the results will convert back to the MMDet type.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+
+        Returns:
+            list[:obj:`DetDataSample`]: Detection results of the
+            input images. Each DetDataSample usually contain
+            'pred_instances'. And the ``pred_instances`` usually
+            contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        d2_batched_inputs = self._convert_to_d2_inputs(
+            batch_inputs=batch_inputs,
+            batch_data_samples=batch_data_samples,
+            training=False)
+        # results in detectron2 has already rescale
+        d2_results_list = self.d2_model(d2_batched_inputs)
+        batch_data_samples = convert_d2_pred_to_datasample(
+            data_samples=batch_data_samples, d2_results_list=d2_results_list)
+
+        return batch_data_samples
+
+    def _forward(self, *args, **kwargs):
+        """Network forward process.
+
+        Usually includes backbone, neck and head forward without any post-
+        processing.
+        """
+        raise NotImplementedError(
+            f'`_forward` is not implemented in {self.__class__.__name__}')
+
+    def extract_feat(self, *args, **kwargs):
+        """Extract features from images.
+
+        `extract_feat` will not be used in obj:``Detectron2Wrapper``.
+        """
+        pass
+
+    def _convert_to_d2_inputs(self,
+                              batch_inputs: Tensor,
+                              batch_data_samples: SampleList,
+                              training=True) -> list:
+        """Convert inputs type to support Detectron2's model.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+            training (bool): Whether to enable training time processing.
+
+        Returns:
+            list[dict]: A list of dict, which will be fed into Detectron2's
+            model. And the dict usually contains following keys.
+
+            - image (Tensor): Image in (C, H, W) format.
+            - instances (Instances): GT Instance.
+            - height (int): the output height resolution of the model
+            - width (int): the output width resolution of the model
+        """
+        from detectron2.data.detection_utils import filter_empty_instances
+        from detectron2.structures import Boxes, Instances
+
+        batched_d2_inputs = []
+        for image, data_samples in zip(batch_inputs, batch_data_samples):
+            d2_inputs = dict()
+            # deal with metainfo
+            meta_info = data_samples.metainfo
+            d2_inputs['file_name'] = meta_info['img_path']
+            d2_inputs['height'], d2_inputs['width'] = meta_info['ori_shape']
+            d2_inputs['image_id'] = meta_info['img_id']
+            # deal with image
+            if self._channel_conversion:
+                image = image[[2, 1, 0], ...]
+            d2_inputs['image'] = image
+            # deal with gt_instances
+            gt_instances = data_samples.gt_instances
+            d2_instances = Instances(meta_info['img_shape'])
+
+            gt_boxes = gt_instances.bboxes
+            # TODO: use mmdet.structures.box.get_box_tensor after PR 8658
+            #  has merged
+            if isinstance(gt_boxes, BaseBoxes):
+                gt_boxes = gt_boxes.tensor
+            d2_instances.gt_boxes = Boxes(gt_boxes)
+
+            d2_instances.gt_classes = gt_instances.labels
+            if gt_instances.get('masks', None) is not None:
+                gt_masks = gt_instances.masks
+                if isinstance(gt_masks, PolygonMasks):
+                    d2_instances.gt_masks = D2_PolygonMasks(gt_masks.masks)
+                elif isinstance(gt_masks, BitmapMasks):
+                    d2_instances.gt_masks = D2_BitMasks(gt_masks.masks)
+                else:
+                    raise TypeError('The type of `gt_mask` can be '
+                                    '`PolygonMasks` or `BitMasks`, but get '
+                                    f'{type(gt_masks)}.')
+            # convert to cpu and convert back to cuda to avoid
+            # some potential error
+            if training:
+                device = gt_boxes.device
+                d2_instances = filter_empty_instances(
+                    d2_instances.to('cpu')).to(device)
+                d2_inputs['instances'] = d2_instances
+            batched_d2_inputs.append(d2_inputs)
+
+        return batched_d2_inputs
diff --git a/mmde/mmdet/models/detectors/dab_detr.py b/mmde/mmdet/models/detectors/dab_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..b61301cf6660924f0832f4068841a4664797c585
--- /dev/null
+++ b/mmde/mmdet/models/detectors/dab_detr.py
@@ -0,0 +1,139 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Tuple
+
+from mmengine.model import uniform_init
+from torch import Tensor, nn
+
+from mmdet.registry import MODELS
+from ..layers import SinePositionalEncoding
+from ..layers.transformer import (DABDetrTransformerDecoder,
+                                  DABDetrTransformerEncoder, inverse_sigmoid)
+from .detr import DETR
+
+
+@MODELS.register_module()
+class DABDETR(DETR):
+    r"""Implementation of `DAB-DETR:
+    Dynamic Anchor Boxes are Better Queries for DETR.
+
+    <https://arxiv.org/abs/2201.12329>`_.
+
+    Code is modified from the `official github repo
+    <https://github.com/IDEA-Research/DAB-DETR>`_.
+
+    Args:
+        with_random_refpoints (bool): Whether to randomly initialize query
+            embeddings and not update them during training.
+            Defaults to False.
+        num_patterns (int): Inspired by Anchor-DETR. Defaults to 0.
+    """
+
+    def __init__(self,
+                 *args,
+                 with_random_refpoints: bool = False,
+                 num_patterns: int = 0,
+                 **kwargs) -> None:
+        self.with_random_refpoints = with_random_refpoints
+        assert isinstance(num_patterns, int), \
+            f'num_patterns should be int but {num_patterns}.'
+        self.num_patterns = num_patterns
+
+        super().__init__(*args, **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers except for backbone, neck and bbox_head."""
+        self.positional_encoding = SinePositionalEncoding(
+            **self.positional_encoding)
+        self.encoder = DABDetrTransformerEncoder(**self.encoder)
+        self.decoder = DABDetrTransformerDecoder(**self.decoder)
+        self.embed_dims = self.encoder.embed_dims
+        self.query_dim = self.decoder.query_dim
+        self.query_embedding = nn.Embedding(self.num_queries, self.query_dim)
+        if self.num_patterns > 0:
+            self.patterns = nn.Embedding(self.num_patterns, self.embed_dims)
+
+        num_feats = self.positional_encoding.num_feats
+        assert num_feats * 2 == self.embed_dims, \
+            f'embed_dims should be exactly 2 times of num_feats. ' \
+            f'Found {self.embed_dims} and {num_feats}.'
+
+    def init_weights(self) -> None:
+        """Initialize weights for Transformer and other components."""
+        super(DABDETR, self).init_weights()
+        if self.with_random_refpoints:
+            uniform_init(self.query_embedding)
+            self.query_embedding.weight.data[:, :2] = \
+                inverse_sigmoid(self.query_embedding.weight.data[:, :2])
+            self.query_embedding.weight.data[:, :2].requires_grad = False
+
+    def pre_decoder(self, memory: Tensor) -> Tuple[Dict, Dict]:
+        """Prepare intermediate variables before entering Transformer decoder,
+        such as `query`, `query_pos`.
+
+        Args:
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+
+        Returns:
+            tuple[dict, dict]: The first dict contains the inputs of decoder
+            and the second dict contains the inputs of the bbox_head function.
+
+            - decoder_inputs_dict (dict): The keyword args dictionary of
+                `self.forward_decoder()`, which includes 'query', 'query_pos',
+                'memory' and 'reg_branches'.
+            - head_inputs_dict (dict): The keyword args dictionary of the
+                bbox_head functions, which is usually empty, or includes
+                `enc_outputs_class` and `enc_outputs_class` when the detector
+                support 'two stage' or 'query selection' strategies.
+        """
+        batch_size = memory.size(0)
+        query_pos = self.query_embedding.weight
+        query_pos = query_pos.unsqueeze(0).repeat(batch_size, 1, 1)
+        if self.num_patterns == 0:
+            query = query_pos.new_zeros(batch_size, self.num_queries,
+                                        self.embed_dims)
+        else:
+            query = self.patterns.weight[:, None, None, :]\
+                .repeat(1, self.num_queries, batch_size, 1)\
+                .view(-1, batch_size, self.embed_dims)\
+                .permute(1, 0, 2)
+            query_pos = query_pos.repeat(1, self.num_patterns, 1)
+
+        decoder_inputs_dict = dict(
+            query_pos=query_pos, query=query, memory=memory)
+        head_inputs_dict = dict()
+        return decoder_inputs_dict, head_inputs_dict
+
+    def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor,
+                        memory_mask: Tensor, memory_pos: Tensor) -> Dict:
+        """Forward with Transformer decoder.
+
+        Args:
+            query (Tensor): The queries of decoder inputs, has shape
+                (bs, num_queries, dim).
+            query_pos (Tensor): The positional queries of decoder inputs,
+                has shape (bs, num_queries, dim).
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points).
+            memory_pos (Tensor): The positional embeddings of memory, has
+                shape (bs, num_feat_points, dim).
+
+        Returns:
+            dict: The dictionary of decoder outputs, which includes the
+            `hidden_states` and `references` of the decoder output.
+        """
+
+        hidden_states, references = self.decoder(
+            query=query,
+            key=memory,
+            query_pos=query_pos,
+            key_pos=memory_pos,
+            key_padding_mask=memory_mask,
+            reg_branches=self.bbox_head.
+            fc_reg  # iterative refinement for anchor boxes
+        )
+        head_inputs_dict = dict(
+            hidden_states=hidden_states, references=references)
+        return head_inputs_dict
diff --git a/mmde/mmdet/models/detectors/ddod.py b/mmde/mmdet/models/detectors/ddod.py
new file mode 100644
index 0000000000000000000000000000000000000000..3503a40c8eb6d6c0496ea0f31740acecf774113a
--- /dev/null
+++ b/mmde/mmdet/models/detectors/ddod.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class DDOD(SingleStageDetector):
+    """Implementation of `DDOD <https://arxiv.org/pdf/2107.02963.pdf>`_.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of ATSS. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of ATSS. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/ddq_detr.py b/mmde/mmdet/models/detectors/ddq_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..57d4959d50ddd7a761d5e5c7a29d1f7f233f838a
--- /dev/null
+++ b/mmde/mmdet/models/detectors/ddq_detr.py
@@ -0,0 +1,274 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Tuple
+
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import MultiScaleDeformableAttention, batched_nms
+from torch import Tensor, nn
+from torch.nn.init import normal_
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptSampleList
+from mmdet.structures.bbox import bbox_cxcywh_to_xyxy
+from mmdet.utils import OptConfigType
+from ..layers import DDQTransformerDecoder
+from ..utils import align_tensor
+from .deformable_detr import DeformableDETR
+from .dino import DINO
+
+
+@MODELS.register_module()
+class DDQDETR(DINO):
+    r"""Implementation of `Dense Distinct Query for
+    End-to-End Object Detection <https://arxiv.org/abs/2303.12776>`_
+
+    Code is modified from the `official github repo
+    <https://github.com/jshilong/DDQ>`_.
+
+    Args:
+        dense_topk_ratio (float): Ratio of num_dense queries to num_queries.
+            Defaults to 1.5.
+        dqs_cfg (:obj:`ConfigDict` or dict, optional): Config of
+            Distinct Queries Selection. Defaults to nms with
+            `iou_threshold` = 0.8.
+    """
+
+    def __init__(self,
+                 *args,
+                 dense_topk_ratio: float = 1.5,
+                 dqs_cfg: OptConfigType = dict(type='nms', iou_threshold=0.8),
+                 **kwargs):
+        self.dense_topk_ratio = dense_topk_ratio
+        self.decoder_cfg = kwargs['decoder']
+        self.dqs_cfg = dqs_cfg
+        super().__init__(*args, **kwargs)
+
+        # a share dict in all moduls
+        # pass some intermediate results and config parameters
+        cache_dict = dict()
+        for m in self.modules():
+            m.cache_dict = cache_dict
+        # first element is the start index of matching queries
+        # second element is the number of matching queries
+        self.cache_dict['dis_query_info'] = [0, 0]
+
+        # mask for distinct queries in each decoder layer
+        self.cache_dict['distinct_query_mask'] = []
+        # pass to decoder do the dqs
+        self.cache_dict['cls_branches'] = self.bbox_head.cls_branches
+        # Used to construct the attention mask after dqs
+        self.cache_dict['num_heads'] = self.encoder.layers[
+            0].self_attn.num_heads
+        # pass to decoder to do the dqs
+        self.cache_dict['dqs_cfg'] = self.dqs_cfg
+
+    def _init_layers(self) -> None:
+        """Initialize layers except for backbone, neck and bbox_head."""
+        super(DDQDETR, self)._init_layers()
+        self.decoder = DDQTransformerDecoder(**self.decoder_cfg)
+        self.query_embedding = None
+        self.query_map = nn.Linear(self.embed_dims, self.embed_dims)
+
+    def init_weights(self) -> None:
+        """Initialize weights for Transformer and other components."""
+        super(DeformableDETR, self).init_weights()
+        for coder in self.encoder, self.decoder:
+            for p in coder.parameters():
+                if p.dim() > 1:
+                    nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MultiScaleDeformableAttention):
+                m.init_weights()
+        nn.init.xavier_uniform_(self.memory_trans_fc.weight)
+        normal_(self.level_embed)
+
+    def pre_decoder(
+        self,
+        memory: Tensor,
+        memory_mask: Tensor,
+        spatial_shapes: Tensor,
+        batch_data_samples: OptSampleList = None,
+    ) -> Tuple[Dict]:
+        """Prepare intermediate variables before entering Transformer decoder,
+        such as `query`, `memory`, and `reference_points`.
+
+        Args:
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points). Will only be used when
+                `as_two_stage` is `True`.
+            spatial_shapes (Tensor): Spatial shapes of features in all levels.
+                With shape (num_levels, 2), last dimension represents (h, w).
+                Will only be used when `as_two_stage` is `True`.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+                Defaults to None.
+
+        Returns:
+            tuple[dict]: The decoder_inputs_dict and head_inputs_dict.
+
+            - decoder_inputs_dict (dict): The keyword dictionary args of
+              `self.forward_decoder()`, which includes 'query', 'memory',
+              `reference_points`, and `dn_mask`. The reference points of
+              decoder input here are 4D boxes, although it has `points`
+              in its name.
+            - head_inputs_dict (dict): The keyword dictionary args of the
+              bbox_head functions, which includes `topk_score`, `topk_coords`,
+              `dense_topk_score`, `dense_topk_coords`,
+              and `dn_meta`, when `self.training` is `True`, else is empty.
+        """
+        bs, _, c = memory.shape
+        output_memory, output_proposals = self.gen_encoder_output_proposals(
+            memory, memory_mask, spatial_shapes)
+        enc_outputs_class = self.bbox_head.cls_branches[
+            self.decoder.num_layers](
+                output_memory)
+        enc_outputs_coord_unact = self.bbox_head.reg_branches[
+            self.decoder.num_layers](output_memory) + output_proposals
+
+        if self.training:
+            # aux dense branch particularly in DDQ DETR, which doesn't exist
+            #   in DINO.
+            # -1 is the aux head for the encoder
+            dense_enc_outputs_class = self.bbox_head.cls_branches[-1](
+                output_memory)
+            dense_enc_outputs_coord_unact = self.bbox_head.reg_branches[-1](
+                output_memory) + output_proposals
+
+        topk = self.num_queries
+        dense_topk = int(topk * self.dense_topk_ratio)
+
+        proposals = enc_outputs_coord_unact.sigmoid()
+        proposals = bbox_cxcywh_to_xyxy(proposals)
+        scores = enc_outputs_class.max(-1)[0].sigmoid()
+
+        if self.training:
+            # aux dense branch particularly in DDQ DETR, which doesn't exist
+            #   in DINO.
+            dense_proposals = dense_enc_outputs_coord_unact.sigmoid()
+            dense_proposals = bbox_cxcywh_to_xyxy(dense_proposals)
+            dense_scores = dense_enc_outputs_class.max(-1)[0].sigmoid()
+
+        num_imgs = len(scores)
+        topk_score = []
+        topk_coords_unact = []
+        # Distinct query.
+        query = []
+
+        dense_topk_score = []
+        dense_topk_coords_unact = []
+        dense_query = []
+
+        for img_id in range(num_imgs):
+            single_proposals = proposals[img_id]
+            single_scores = scores[img_id]
+
+            # `batched_nms` of class scores and bbox coordinations is used
+            #   particularly by DDQ DETR for region proposal generation,
+            #   instead of `topk` of class scores by DINO.
+            _, keep_idxs = batched_nms(
+                single_proposals, single_scores,
+                torch.ones(len(single_scores), device=single_scores.device),
+                self.cache_dict['dqs_cfg'])
+
+            if self.training:
+                # aux dense branch particularly in DDQ DETR, which doesn't
+                #   exist in DINO.
+                dense_single_proposals = dense_proposals[img_id]
+                dense_single_scores = dense_scores[img_id]
+                # sort according the score
+                # Only sort by classification score, neither nms nor topk is
+                #   required. So input parameter `nms_cfg` = None.
+                _, dense_keep_idxs = batched_nms(
+                    dense_single_proposals, dense_single_scores,
+                    torch.ones(
+                        len(dense_single_scores),
+                        device=dense_single_scores.device), None)
+
+                dense_topk_score.append(dense_enc_outputs_class[img_id]
+                                        [dense_keep_idxs][:dense_topk])
+                dense_topk_coords_unact.append(
+                    dense_enc_outputs_coord_unact[img_id][dense_keep_idxs]
+                    [:dense_topk])
+
+            topk_score.append(enc_outputs_class[img_id][keep_idxs][:topk])
+
+            # Instead of initializing the content part with transformed
+            #   coordinates in Deformable DETR, we fuse the feature map
+            #   embedding of distinct positions as the content part, which
+            #   makes the initial queries more distinct.
+            topk_coords_unact.append(
+                enc_outputs_coord_unact[img_id][keep_idxs][:topk])
+
+            map_memory = self.query_map(memory[img_id].detach())
+            query.append(map_memory[keep_idxs][:topk])
+            if self.training:
+                # aux dense branch particularly in DDQ DETR, which doesn't
+                # exist in DINO.
+                dense_query.append(map_memory[dense_keep_idxs][:dense_topk])
+
+        topk_score = align_tensor(topk_score, topk)
+        topk_coords_unact = align_tensor(topk_coords_unact, topk)
+        query = align_tensor(query, topk)
+        if self.training:
+            dense_topk_score = align_tensor(dense_topk_score)
+            dense_topk_coords_unact = align_tensor(dense_topk_coords_unact)
+
+            dense_query = align_tensor(dense_query)
+            num_dense_queries = dense_query.size(1)
+        if self.training:
+            query = torch.cat([query, dense_query], dim=1)
+            topk_coords_unact = torch.cat(
+                [topk_coords_unact, dense_topk_coords_unact], dim=1)
+
+        topk_coords = topk_coords_unact.sigmoid()
+        if self.training:
+            dense_topk_coords = topk_coords[:, -num_dense_queries:]
+            topk_coords = topk_coords[:, :-num_dense_queries]
+
+        topk_coords_unact = topk_coords_unact.detach()
+
+        if self.training:
+            dn_label_query, dn_bbox_query, dn_mask, dn_meta = \
+                self.dn_query_generator(batch_data_samples)
+            query = torch.cat([dn_label_query, query], dim=1)
+            reference_points = torch.cat([dn_bbox_query, topk_coords_unact],
+                                         dim=1)
+
+            # Update `dn_mask` to add mask for dense queries.
+            ori_size = dn_mask.size(-1)
+            new_size = dn_mask.size(-1) + num_dense_queries
+            new_dn_mask = dn_mask.new_ones((new_size, new_size)).bool()
+            dense_mask = torch.zeros(num_dense_queries,
+                                     num_dense_queries).bool()
+            self.cache_dict['dis_query_info'] = [dn_label_query.size(1), topk]
+
+            new_dn_mask[ori_size:, ori_size:] = dense_mask
+            new_dn_mask[:ori_size, :ori_size] = dn_mask
+            dn_meta['num_dense_queries'] = num_dense_queries
+            dn_mask = new_dn_mask
+            self.cache_dict['num_dense_queries'] = num_dense_queries
+            self.decoder.aux_reg_branches = self.bbox_head.aux_reg_branches
+
+        else:
+            self.cache_dict['dis_query_info'] = [0, topk]
+            reference_points = topk_coords_unact
+            dn_mask, dn_meta = None, None
+
+        reference_points = reference_points.sigmoid()
+
+        decoder_inputs_dict = dict(
+            query=query,
+            memory=memory,
+            reference_points=reference_points,
+            dn_mask=dn_mask)
+        head_inputs_dict = dict(
+            enc_outputs_class=topk_score,
+            enc_outputs_coord=topk_coords,
+            aux_enc_outputs_class=dense_topk_score,
+            aux_enc_outputs_coord=dense_topk_coords,
+            dn_meta=dn_meta) if self.training else dict()
+
+        return decoder_inputs_dict, head_inputs_dict
diff --git a/mmde/mmdet/models/detectors/deformable_detr.py b/mmde/mmdet/models/detectors/deformable_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..0eb5cd2f95204542d5a9ace1a6d92e0b858c139f
--- /dev/null
+++ b/mmde/mmdet/models/detectors/deformable_detr.py
@@ -0,0 +1,572 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Dict, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention
+from mmengine.model import xavier_init
+from torch import Tensor, nn
+from torch.nn.init import normal_
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptSampleList
+from mmdet.utils import OptConfigType
+from ..layers import (DeformableDetrTransformerDecoder,
+                      DeformableDetrTransformerEncoder, SinePositionalEncoding)
+from .base_detr import DetectionTransformer
+
+
+@MODELS.register_module()
+class DeformableDETR(DetectionTransformer):
+    r"""Implementation of `Deformable DETR: Deformable Transformers for
+    End-to-End Object Detection <https://arxiv.org/abs/2010.04159>`_
+
+    Code is modified from the `official github repo
+    <https://github.com/fundamentalvision/Deformable-DETR>`_.
+
+    Args:
+        decoder (:obj:`ConfigDict` or dict, optional): Config of the
+            Transformer decoder. Defaults to None.
+        bbox_head (:obj:`ConfigDict` or dict, optional): Config for the
+            bounding box head module. Defaults to None.
+        with_box_refine (bool, optional): Whether to refine the references
+            in the decoder. Defaults to `False`.
+        as_two_stage (bool, optional): Whether to generate the proposal
+            from the outputs of encoder. Defaults to `False`.
+        num_feature_levels (int, optional): Number of feature levels.
+            Defaults to 4.
+    """
+
+    def __init__(self,
+                 *args,
+                 decoder: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 with_box_refine: bool = False,
+                 as_two_stage: bool = False,
+                 num_feature_levels: int = 4,
+                 **kwargs) -> None:
+        self.with_box_refine = with_box_refine
+        self.as_two_stage = as_two_stage
+        self.num_feature_levels = num_feature_levels
+
+        if bbox_head is not None:
+            assert 'share_pred_layer' not in bbox_head and \
+                   'num_pred_layer' not in bbox_head and \
+                   'as_two_stage' not in bbox_head, \
+                'The two keyword args `share_pred_layer`, `num_pred_layer`, ' \
+                'and `as_two_stage are set in `detector.__init__()`, users ' \
+                'should not set them in `bbox_head` config.'
+            # The last prediction layer is used to generate proposal
+            # from encode feature map when `as_two_stage` is `True`.
+            # And all the prediction layers should share parameters
+            # when `with_box_refine` is `True`.
+            bbox_head['share_pred_layer'] = not with_box_refine
+            bbox_head['num_pred_layer'] = (decoder['num_layers'] + 1) \
+                if self.as_two_stage else decoder['num_layers']
+            bbox_head['as_two_stage'] = as_two_stage
+
+        super().__init__(*args, decoder=decoder, bbox_head=bbox_head, **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers except for backbone, neck and bbox_head."""
+        self.positional_encoding = SinePositionalEncoding(
+            **self.positional_encoding)
+        self.encoder = DeformableDetrTransformerEncoder(**self.encoder)
+        self.decoder = DeformableDetrTransformerDecoder(**self.decoder)
+        self.embed_dims = self.encoder.embed_dims
+        if not self.as_two_stage:
+            self.query_embedding = nn.Embedding(self.num_queries,
+                                                self.embed_dims * 2)
+            # NOTE The query_embedding will be split into query and query_pos
+            # in self.pre_decoder, hence, the embed_dims are doubled.
+
+        num_feats = self.positional_encoding.num_feats
+        assert num_feats * 2 == self.embed_dims, \
+            'embed_dims should be exactly 2 times of num_feats. ' \
+            f'Found {self.embed_dims} and {num_feats}.'
+
+        self.level_embed = nn.Parameter(
+            torch.Tensor(self.num_feature_levels, self.embed_dims))
+
+        if self.as_two_stage:
+            self.memory_trans_fc = nn.Linear(self.embed_dims, self.embed_dims)
+            self.memory_trans_norm = nn.LayerNorm(self.embed_dims)
+            self.pos_trans_fc = nn.Linear(self.embed_dims * 2,
+                                          self.embed_dims * 2)
+            self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2)
+        else:
+            self.reference_points_fc = nn.Linear(self.embed_dims, 2)
+
+    def init_weights(self) -> None:
+        """Initialize weights for Transformer and other components."""
+        super().init_weights()
+        for coder in self.encoder, self.decoder:
+            for p in coder.parameters():
+                if p.dim() > 1:
+                    nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MultiScaleDeformableAttention):
+                m.init_weights()
+        if self.as_two_stage:
+            nn.init.xavier_uniform_(self.memory_trans_fc.weight)
+            nn.init.xavier_uniform_(self.pos_trans_fc.weight)
+        else:
+            xavier_init(
+                self.reference_points_fc, distribution='uniform', bias=0.)
+        normal_(self.level_embed)
+
+    def pre_transformer(
+            self,
+            mlvl_feats: Tuple[Tensor],
+            batch_data_samples: OptSampleList = None) -> Tuple[Dict]:
+        """Process image features before feeding them to the transformer.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            mlvl_feats (tuple[Tensor]): Multi-level features that may have
+                different resolutions, output from neck. Each feature has
+                shape (bs, dim, h_lvl, w_lvl), where 'lvl' means 'layer'.
+            batch_data_samples (list[:obj:`DetDataSample`], optional): The
+                batch data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+                Defaults to None.
+
+        Returns:
+            tuple[dict]: The first dict contains the inputs of encoder and the
+            second dict contains the inputs of decoder.
+
+            - encoder_inputs_dict (dict): The keyword args dictionary of
+              `self.forward_encoder()`, which includes 'feat', 'feat_mask',
+              and 'feat_pos'.
+            - decoder_inputs_dict (dict): The keyword args dictionary of
+              `self.forward_decoder()`, which includes 'memory_mask'.
+        """
+        batch_size = mlvl_feats[0].size(0)
+
+        # construct binary masks for the transformer.
+        assert batch_data_samples is not None
+        batch_input_shape = batch_data_samples[0].batch_input_shape
+        input_img_h, input_img_w = batch_input_shape
+        img_shape_list = [sample.img_shape for sample in batch_data_samples]
+        same_shape_flag = all([
+            s[0] == input_img_h and s[1] == input_img_w for s in img_shape_list
+        ])
+        # support torch2onnx without feeding masks
+        if torch.onnx.is_in_onnx_export() or same_shape_flag:
+            mlvl_masks = []
+            mlvl_pos_embeds = []
+            for feat in mlvl_feats:
+                mlvl_masks.append(None)
+                mlvl_pos_embeds.append(
+                    self.positional_encoding(None, input=feat))
+        else:
+            masks = mlvl_feats[0].new_ones(
+                (batch_size, input_img_h, input_img_w))
+            for img_id in range(batch_size):
+                img_h, img_w = img_shape_list[img_id]
+                masks[img_id, :img_h, :img_w] = 0
+            # NOTE following the official DETR repo, non-zero
+            # values representing ignored positions, while
+            # zero values means valid positions.
+
+            mlvl_masks = []
+            mlvl_pos_embeds = []
+            for feat in mlvl_feats:
+                mlvl_masks.append(
+                    F.interpolate(masks[None], size=feat.shape[-2:]).to(
+                        torch.bool).squeeze(0))
+                mlvl_pos_embeds.append(
+                    self.positional_encoding(mlvl_masks[-1]))
+
+        feat_flatten = []
+        lvl_pos_embed_flatten = []
+        mask_flatten = []
+        spatial_shapes = []
+        for lvl, (feat, mask, pos_embed) in enumerate(
+                zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):
+            batch_size, c, h, w = feat.shape
+            spatial_shape = torch._shape_as_tensor(feat)[2:].to(feat.device)
+            # [bs, c, h_lvl, w_lvl] -> [bs, h_lvl*w_lvl, c]
+            feat = feat.view(batch_size, c, -1).permute(0, 2, 1)
+            pos_embed = pos_embed.view(batch_size, c, -1).permute(0, 2, 1)
+            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
+            # [bs, h_lvl, w_lvl] -> [bs, h_lvl*w_lvl]
+            if mask is not None:
+                mask = mask.flatten(1)
+
+            feat_flatten.append(feat)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            mask_flatten.append(mask)
+            spatial_shapes.append(spatial_shape)
+
+        # (bs, num_feat_points, dim)
+        feat_flatten = torch.cat(feat_flatten, 1)
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        # (bs, num_feat_points), where num_feat_points = sum_lvl(h_lvl*w_lvl)
+        if mask_flatten[0] is not None:
+            mask_flatten = torch.cat(mask_flatten, 1)
+        else:
+            mask_flatten = None
+
+        # (num_level, 2)
+        spatial_shapes = torch.cat(spatial_shapes).view(-1, 2)
+        level_start_index = torch.cat((
+            spatial_shapes.new_zeros((1, )),  # (num_level)
+            spatial_shapes.prod(1).cumsum(0)[:-1]))
+        if mlvl_masks[0] is not None:
+            valid_ratios = torch.stack(  # (bs, num_level, 2)
+                [self.get_valid_ratio(m) for m in mlvl_masks], 1)
+        else:
+            valid_ratios = mlvl_feats[0].new_ones(batch_size, len(mlvl_feats),
+                                                  2)
+
+        encoder_inputs_dict = dict(
+            feat=feat_flatten,
+            feat_mask=mask_flatten,
+            feat_pos=lvl_pos_embed_flatten,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios)
+        decoder_inputs_dict = dict(
+            memory_mask=mask_flatten,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios)
+        return encoder_inputs_dict, decoder_inputs_dict
+
+    def forward_encoder(self, feat: Tensor, feat_mask: Tensor,
+                        feat_pos: Tensor, spatial_shapes: Tensor,
+                        level_start_index: Tensor,
+                        valid_ratios: Tensor) -> Dict:
+        """Forward with Transformer encoder.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            feat (Tensor): Sequential features, has shape (bs, num_feat_points,
+                dim).
+            feat_mask (Tensor): ByteTensor, the padding mask of the features,
+                has shape (bs, num_feat_points).
+            feat_pos (Tensor): The positional embeddings of the features, has
+                shape (bs, num_feat_points, dim).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+
+        Returns:
+            dict: The dictionary of encoder outputs, which includes the
+            `memory` of the encoder output.
+        """
+        memory = self.encoder(
+            query=feat,
+            query_pos=feat_pos,
+            key_padding_mask=feat_mask,  # for self_attn
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios)
+        encoder_outputs_dict = dict(
+            memory=memory,
+            memory_mask=feat_mask,
+            spatial_shapes=spatial_shapes)
+        return encoder_outputs_dict
+
+    def pre_decoder(self, memory: Tensor, memory_mask: Tensor,
+                    spatial_shapes: Tensor) -> Tuple[Dict, Dict]:
+        """Prepare intermediate variables before entering Transformer decoder,
+        such as `query`, `query_pos`, and `reference_points`.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points). It will only be used when
+                `as_two_stage` is `True`.
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+                It will only be used when `as_two_stage` is `True`.
+
+        Returns:
+            tuple[dict, dict]: The decoder_inputs_dict and head_inputs_dict.
+
+            - decoder_inputs_dict (dict): The keyword dictionary args of
+              `self.forward_decoder()`, which includes 'query', 'query_pos',
+              'memory', and `reference_points`. The reference_points of
+              decoder input here are 4D boxes when `as_two_stage` is `True`,
+              otherwise 2D points, although it has `points` in its name.
+              The reference_points in encoder is always 2D points.
+            - head_inputs_dict (dict): The keyword dictionary args of the
+              bbox_head functions, which includes `enc_outputs_class` and
+              `enc_outputs_coord`. They are both `None` when 'as_two_stage'
+              is `False`. The dict is empty when `self.training` is `False`.
+        """
+        batch_size, _, c = memory.shape
+        if self.as_two_stage:
+            output_memory, output_proposals = \
+                self.gen_encoder_output_proposals(
+                    memory, memory_mask, spatial_shapes)
+            enc_outputs_class = self.bbox_head.cls_branches[
+                self.decoder.num_layers](
+                    output_memory)
+            enc_outputs_coord_unact = self.bbox_head.reg_branches[
+                self.decoder.num_layers](output_memory) + output_proposals
+            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
+            # We only use the first channel in enc_outputs_class as foreground,
+            # the other (num_classes - 1) channels are actually not used.
+            # Its targets are set to be 0s, which indicates the first
+            # class (foreground) because we use [0, num_classes - 1] to
+            # indicate class labels, background class is indicated by
+            # num_classes (similar convention in RPN).
+            # See https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/deformable_detr_head.py#L241 # noqa
+            # This follows the official implementation of Deformable DETR.
+            topk_proposals = torch.topk(
+                enc_outputs_class[..., 0], self.num_queries, dim=1)[1]
+            topk_coords_unact = torch.gather(
+                enc_outputs_coord_unact, 1,
+                topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+            topk_coords_unact = topk_coords_unact.detach()
+            reference_points = topk_coords_unact.sigmoid()
+            pos_trans_out = self.pos_trans_fc(
+                self.get_proposal_pos_embed(topk_coords_unact))
+            pos_trans_out = self.pos_trans_norm(pos_trans_out)
+            query_pos, query = torch.split(pos_trans_out, c, dim=2)
+        else:
+            enc_outputs_class, enc_outputs_coord = None, None
+            query_embed = self.query_embedding.weight
+            query_pos, query = torch.split(query_embed, c, dim=1)
+            query_pos = query_pos.unsqueeze(0).expand(batch_size, -1, -1)
+            query = query.unsqueeze(0).expand(batch_size, -1, -1)
+            reference_points = self.reference_points_fc(query_pos).sigmoid()
+
+        decoder_inputs_dict = dict(
+            query=query,
+            query_pos=query_pos,
+            memory=memory,
+            reference_points=reference_points)
+        head_inputs_dict = dict(
+            enc_outputs_class=enc_outputs_class,
+            enc_outputs_coord=enc_outputs_coord) if self.training else dict()
+        return decoder_inputs_dict, head_inputs_dict
+
+    def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor,
+                        memory_mask: Tensor, reference_points: Tensor,
+                        spatial_shapes: Tensor, level_start_index: Tensor,
+                        valid_ratios: Tensor) -> Dict:
+        """Forward with Transformer decoder.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            query (Tensor): The queries of decoder inputs, has shape
+                (bs, num_queries, dim).
+            query_pos (Tensor): The positional queries of decoder inputs,
+                has shape (bs, num_queries, dim).
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points).
+            reference_points (Tensor): The initial reference, has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h) when `as_two_stage` is `True`, otherwise has
+                shape (bs, num_queries, 2) with the last dimension arranged as
+                (cx, cy).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+
+        Returns:
+            dict: The dictionary of decoder outputs, which includes the
+            `hidden_states` of the decoder output and `references` including
+            the initial and intermediate reference_points.
+        """
+        inter_states, inter_references = self.decoder(
+            query=query,
+            value=memory,
+            query_pos=query_pos,
+            key_padding_mask=memory_mask,  # for cross_attn
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            reg_branches=self.bbox_head.reg_branches
+            if self.with_box_refine else None)
+        references = [reference_points, *inter_references]
+        decoder_outputs_dict = dict(
+            hidden_states=inter_states, references=references)
+        return decoder_outputs_dict
+
+    @staticmethod
+    def get_valid_ratio(mask: Tensor) -> Tensor:
+        """Get the valid radios of feature map in a level.
+
+        .. code:: text
+
+                    |---> valid_W <---|
+                 ---+-----------------+-----+---
+                  A |                 |     | A
+                  | |                 |     | |
+                  | |                 |     | |
+            valid_H |                 |     | |
+                  | |                 |     | H
+                  | |                 |     | |
+                  V |                 |     | |
+                 ---+-----------------+     | |
+                    |                       | V
+                    +-----------------------+---
+                    |---------> W <---------|
+
+          The valid_ratios are defined as:
+                r_h = valid_H / H,  r_w = valid_W / W
+          They are the factors to re-normalize the relative coordinates of the
+          image to the relative coordinates of the current level feature map.
+
+        Args:
+            mask (Tensor): Binary mask of a feature map, has shape (bs, H, W).
+
+        Returns:
+            Tensor: valid ratios [r_w, r_h] of a feature map, has shape (1, 2).
+        """
+        _, H, W = mask.shape
+        valid_H = torch.sum(~mask[:, :, 0], 1)
+        valid_W = torch.sum(~mask[:, 0, :], 1)
+        valid_ratio_h = valid_H.float() / H
+        valid_ratio_w = valid_W.float() / W
+        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+        return valid_ratio
+
+    def gen_encoder_output_proposals(
+            self, memory: Tensor, memory_mask: Tensor,
+            spatial_shapes: Tensor) -> Tuple[Tensor, Tensor]:
+        """Generate proposals from encoded memory. The function will only be
+        used when `as_two_stage` is `True`.
+
+        Args:
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+
+        Returns:
+            tuple: A tuple of transformed memory and proposals.
+
+            - output_memory (Tensor): The transformed memory for obtaining
+              top-k proposals, has shape (bs, num_feat_points, dim).
+            - output_proposals (Tensor): The inverse-normalized proposal, has
+              shape (batch_size, num_keys, 4) with the last dimension arranged
+              as (cx, cy, w, h).
+        """
+
+        bs = memory.size(0)
+        proposals = []
+        _cur = 0  # start index in the sequence of the current level
+        for lvl, HW in enumerate(spatial_shapes):
+            H, W = HW
+
+            if memory_mask is not None:
+                mask_flatten_ = memory_mask[:, _cur:(_cur + H * W)].view(
+                    bs, H, W, 1)
+                valid_H = torch.sum(~mask_flatten_[:, :, 0, 0],
+                                    1).unsqueeze(-1)
+                valid_W = torch.sum(~mask_flatten_[:, 0, :, 0],
+                                    1).unsqueeze(-1)
+                scale = torch.cat([valid_W, valid_H], 1).view(bs, 1, 1, 2)
+            else:
+                if not isinstance(HW, torch.Tensor):
+                    HW = memory.new_tensor(HW)
+                scale = HW.unsqueeze(0).flip(dims=[0, 1]).view(1, 1, 1, 2)
+            grid_y, grid_x = torch.meshgrid(
+                torch.linspace(
+                    0, H - 1, H, dtype=torch.float32, device=memory.device),
+                torch.linspace(
+                    0, W - 1, W, dtype=torch.float32, device=memory.device))
+            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+            grid = (grid.unsqueeze(0).expand(bs, -1, -1, -1) + 0.5) / scale
+            wh = torch.ones_like(grid) * 0.05 * (2.0**lvl)
+            proposal = torch.cat((grid, wh), -1).view(bs, -1, 4)
+            proposals.append(proposal)
+            _cur += (H * W)
+        output_proposals = torch.cat(proposals, 1)
+        # do not use `all` to make it exportable to onnx
+        output_proposals_valid = (
+            (output_proposals > 0.01) & (output_proposals < 0.99)).sum(
+                -1, keepdim=True) == output_proposals.shape[-1]
+        # inverse_sigmoid
+        output_proposals = torch.log(output_proposals / (1 - output_proposals))
+        if memory_mask is not None:
+            output_proposals = output_proposals.masked_fill(
+                memory_mask.unsqueeze(-1), float('inf'))
+        output_proposals = output_proposals.masked_fill(
+            ~output_proposals_valid, float('inf'))
+
+        output_memory = memory
+        if memory_mask is not None:
+            output_memory = output_memory.masked_fill(
+                memory_mask.unsqueeze(-1), float(0))
+        output_memory = output_memory.masked_fill(~output_proposals_valid,
+                                                  float(0))
+        output_memory = self.memory_trans_fc(output_memory)
+        output_memory = self.memory_trans_norm(output_memory)
+        # [bs, sum(hw), 2]
+        return output_memory, output_proposals
+
+    @staticmethod
+    def get_proposal_pos_embed(proposals: Tensor,
+                               num_pos_feats: int = 128,
+                               temperature: int = 10000) -> Tensor:
+        """Get the position embedding of the proposal.
+
+        Args:
+            proposals (Tensor): Not normalized proposals, has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            num_pos_feats (int, optional): The feature dimension for each
+                position along x, y, w, and h-axis. Note the final returned
+                dimension for each position is 4 times of num_pos_feats.
+                Default to 128.
+            temperature (int, optional): The temperature used for scaling the
+                position embedding. Defaults to 10000.
+
+        Returns:
+            Tensor: The position embedding of proposal, has shape
+            (bs, num_queries, num_pos_feats * 4), with the last dimension
+            arranged as (cx, cy, w, h)
+        """
+        scale = 2 * math.pi
+        dim_t = torch.arange(
+            num_pos_feats, dtype=torch.float32, device=proposals.device)
+        dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats)
+        # N, L, 4
+        proposals = proposals.sigmoid() * scale
+        # N, L, 4, 128
+        pos = proposals[:, :, :, None] / dim_t
+        # N, L, 4, 64, 2
+        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()),
+                          dim=4).flatten(2)
+        return pos
diff --git a/mmde/mmdet/models/detectors/detr.py b/mmde/mmdet/models/detectors/detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..7895e9ecb4eb66cb75d173c191c2128c3f55c197
--- /dev/null
+++ b/mmde/mmdet/models/detectors/detr.py
@@ -0,0 +1,225 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptSampleList
+from ..layers import (DetrTransformerDecoder, DetrTransformerEncoder,
+                      SinePositionalEncoding)
+from .base_detr import DetectionTransformer
+
+
+@MODELS.register_module()
+class DETR(DetectionTransformer):
+    r"""Implementation of `DETR: End-to-End Object Detection with Transformers.
+
+    <https://arxiv.org/pdf/2005.12872>`_.
+
+    Code is modified from the `official github repo
+    <https://github.com/facebookresearch/detr>`_.
+    """
+
+    def _init_layers(self) -> None:
+        """Initialize layers except for backbone, neck and bbox_head."""
+        self.positional_encoding = SinePositionalEncoding(
+            **self.positional_encoding)
+        self.encoder = DetrTransformerEncoder(**self.encoder)
+        self.decoder = DetrTransformerDecoder(**self.decoder)
+        self.embed_dims = self.encoder.embed_dims
+        # NOTE The embed_dims is typically passed from the inside out.
+        # For example in DETR, The embed_dims is passed as
+        # self_attn -> the first encoder layer -> encoder -> detector.
+        self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims)
+
+        num_feats = self.positional_encoding.num_feats
+        assert num_feats * 2 == self.embed_dims, \
+            'embed_dims should be exactly 2 times of num_feats. ' \
+            f'Found {self.embed_dims} and {num_feats}.'
+
+    def init_weights(self) -> None:
+        """Initialize weights for Transformer and other components."""
+        super().init_weights()
+        for coder in self.encoder, self.decoder:
+            for p in coder.parameters():
+                if p.dim() > 1:
+                    nn.init.xavier_uniform_(p)
+
+    def pre_transformer(
+            self,
+            img_feats: Tuple[Tensor],
+            batch_data_samples: OptSampleList = None) -> Tuple[Dict, Dict]:
+        """Prepare the inputs of the Transformer.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            img_feats (Tuple[Tensor]): Tuple of features output from the neck,
+                has shape (bs, c, h, w).
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such as
+                `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+                Defaults to None.
+
+        Returns:
+            tuple[dict, dict]: The first dict contains the inputs of encoder
+            and the second dict contains the inputs of decoder.
+
+            - encoder_inputs_dict (dict): The keyword args dictionary of
+              `self.forward_encoder()`, which includes 'feat', 'feat_mask',
+              and 'feat_pos'.
+            - decoder_inputs_dict (dict): The keyword args dictionary of
+              `self.forward_decoder()`, which includes 'memory_mask',
+              and 'memory_pos'.
+        """
+
+        feat = img_feats[-1]  # NOTE img_feats contains only one feature.
+        batch_size, feat_dim, _, _ = feat.shape
+        # construct binary masks which for the transformer.
+        assert batch_data_samples is not None
+        batch_input_shape = batch_data_samples[0].batch_input_shape
+        input_img_h, input_img_w = batch_input_shape
+        img_shape_list = [sample.img_shape for sample in batch_data_samples]
+        same_shape_flag = all([
+            s[0] == input_img_h and s[1] == input_img_w for s in img_shape_list
+        ])
+        if torch.onnx.is_in_onnx_export() or same_shape_flag:
+            masks = None
+            # [batch_size, embed_dim, h, w]
+            pos_embed = self.positional_encoding(masks, input=feat)
+        else:
+            masks = feat.new_ones((batch_size, input_img_h, input_img_w))
+            for img_id in range(batch_size):
+                img_h, img_w = img_shape_list[img_id]
+                masks[img_id, :img_h, :img_w] = 0
+            # NOTE following the official DETR repo, non-zero values represent
+            # ignored positions, while zero values mean valid positions.
+
+            masks = F.interpolate(
+                masks.unsqueeze(1),
+                size=feat.shape[-2:]).to(torch.bool).squeeze(1)
+            # [batch_size, embed_dim, h, w]
+            pos_embed = self.positional_encoding(masks)
+
+        # use `view` instead of `flatten` for dynamically exporting to ONNX
+        # [bs, c, h, w] -> [bs, h*w, c]
+        feat = feat.view(batch_size, feat_dim, -1).permute(0, 2, 1)
+        pos_embed = pos_embed.view(batch_size, feat_dim, -1).permute(0, 2, 1)
+        # [bs, h, w] -> [bs, h*w]
+        if masks is not None:
+            masks = masks.view(batch_size, -1)
+
+        # prepare transformer_inputs_dict
+        encoder_inputs_dict = dict(
+            feat=feat, feat_mask=masks, feat_pos=pos_embed)
+        decoder_inputs_dict = dict(memory_mask=masks, memory_pos=pos_embed)
+        return encoder_inputs_dict, decoder_inputs_dict
+
+    def forward_encoder(self, feat: Tensor, feat_mask: Tensor,
+                        feat_pos: Tensor) -> Dict:
+        """Forward with Transformer encoder.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            feat (Tensor): Sequential features, has shape (bs, num_feat_points,
+                dim).
+            feat_mask (Tensor): ByteTensor, the padding mask of the features,
+                has shape (bs, num_feat_points).
+            feat_pos (Tensor): The positional embeddings of the features, has
+                shape (bs, num_feat_points, dim).
+
+        Returns:
+            dict: The dictionary of encoder outputs, which includes the
+            `memory` of the encoder output.
+        """
+        memory = self.encoder(
+            query=feat, query_pos=feat_pos,
+            key_padding_mask=feat_mask)  # for self_attn
+        encoder_outputs_dict = dict(memory=memory)
+        return encoder_outputs_dict
+
+    def pre_decoder(self, memory: Tensor) -> Tuple[Dict, Dict]:
+        """Prepare intermediate variables before entering Transformer decoder,
+        such as `query`, `query_pos`.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+
+        Returns:
+            tuple[dict, dict]: The first dict contains the inputs of decoder
+            and the second dict contains the inputs of the bbox_head function.
+
+            - decoder_inputs_dict (dict): The keyword args dictionary of
+              `self.forward_decoder()`, which includes 'query', 'query_pos',
+              'memory'.
+            - head_inputs_dict (dict): The keyword args dictionary of the
+              bbox_head functions, which is usually empty, or includes
+              `enc_outputs_class` and `enc_outputs_class` when the detector
+              support 'two stage' or 'query selection' strategies.
+        """
+
+        batch_size = memory.size(0)  # (bs, num_feat_points, dim)
+        query_pos = self.query_embedding.weight
+        # (num_queries, dim) -> (bs, num_queries, dim)
+        query_pos = query_pos.unsqueeze(0).repeat(batch_size, 1, 1)
+        query = torch.zeros_like(query_pos)
+
+        decoder_inputs_dict = dict(
+            query_pos=query_pos, query=query, memory=memory)
+        head_inputs_dict = dict()
+        return decoder_inputs_dict, head_inputs_dict
+
+    def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor,
+                        memory_mask: Tensor, memory_pos: Tensor) -> Dict:
+        """Forward with Transformer decoder.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            query (Tensor): The queries of decoder inputs, has shape
+                (bs, num_queries, dim).
+            query_pos (Tensor): The positional queries of decoder inputs,
+                has shape (bs, num_queries, dim).
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points).
+            memory_pos (Tensor): The positional embeddings of memory, has
+                shape (bs, num_feat_points, dim).
+
+        Returns:
+            dict: The dictionary of decoder outputs, which includes the
+            `hidden_states` of the decoder output.
+
+            - hidden_states (Tensor): Has shape
+              (num_decoder_layers, bs, num_queries, dim)
+        """
+
+        hidden_states = self.decoder(
+            query=query,
+            key=memory,
+            value=memory,
+            query_pos=query_pos,
+            key_pos=memory_pos,
+            key_padding_mask=memory_mask)  # for cross_attn
+
+        head_inputs_dict = dict(hidden_states=hidden_states)
+        return head_inputs_dict
diff --git a/mmde/mmdet/models/detectors/dino.py b/mmde/mmdet/models/detectors/dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..ade47f531d27246511cafc2997a07d58677538a7
--- /dev/null
+++ b/mmde/mmdet/models/detectors/dino.py
@@ -0,0 +1,287 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Tuple
+
+import torch
+from torch import Tensor, nn
+from torch.nn.init import normal_
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptSampleList
+from mmdet.utils import OptConfigType
+from ..layers import (CdnQueryGenerator, DeformableDetrTransformerEncoder,
+                      DinoTransformerDecoder, SinePositionalEncoding)
+from .deformable_detr import DeformableDETR, MultiScaleDeformableAttention
+
+
+@MODELS.register_module()
+class DINO(DeformableDETR):
+    r"""Implementation of `DINO: DETR with Improved DeNoising Anchor Boxes
+    for End-to-End Object Detection <https://arxiv.org/abs/2203.03605>`_
+
+    Code is modified from the `official github repo
+    <https://github.com/IDEA-Research/DINO>`_.
+
+    Args:
+        dn_cfg (:obj:`ConfigDict` or dict, optional): Config of denoising
+            query generator. Defaults to `None`.
+    """
+
+    def __init__(self, *args, dn_cfg: OptConfigType = None, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        assert self.as_two_stage, 'as_two_stage must be True for DINO'
+        assert self.with_box_refine, 'with_box_refine must be True for DINO'
+
+        if dn_cfg is not None:
+            assert 'num_classes' not in dn_cfg and \
+                   'num_queries' not in dn_cfg and \
+                   'hidden_dim' not in dn_cfg, \
+                'The three keyword args `num_classes`, `embed_dims`, and ' \
+                '`num_matching_queries` are set in `detector.__init__()`, ' \
+                'users should not set them in `dn_cfg` config.'
+            dn_cfg['num_classes'] = self.bbox_head.num_classes
+            dn_cfg['embed_dims'] = self.embed_dims
+            dn_cfg['num_matching_queries'] = self.num_queries
+        self.dn_query_generator = CdnQueryGenerator(**dn_cfg)
+
+    def _init_layers(self) -> None:
+        """Initialize layers except for backbone, neck and bbox_head."""
+        self.positional_encoding = SinePositionalEncoding(
+            **self.positional_encoding)
+        self.encoder = DeformableDetrTransformerEncoder(**self.encoder)
+        self.decoder = DinoTransformerDecoder(**self.decoder)
+        self.embed_dims = self.encoder.embed_dims
+        self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims)
+        # NOTE In DINO, the query_embedding only contains content
+        # queries, while in Deformable DETR, the query_embedding
+        # contains both content and spatial queries, and in DETR,
+        # it only contains spatial queries.
+
+        num_feats = self.positional_encoding.num_feats
+        assert num_feats * 2 == self.embed_dims, \
+            f'embed_dims should be exactly 2 times of num_feats. ' \
+            f'Found {self.embed_dims} and {num_feats}.'
+
+        self.level_embed = nn.Parameter(
+            torch.Tensor(self.num_feature_levels, self.embed_dims))
+        self.memory_trans_fc = nn.Linear(self.embed_dims, self.embed_dims)
+        self.memory_trans_norm = nn.LayerNorm(self.embed_dims)
+
+    def init_weights(self) -> None:
+        """Initialize weights for Transformer and other components."""
+        super(DeformableDETR, self).init_weights()
+        for coder in self.encoder, self.decoder:
+            for p in coder.parameters():
+                if p.dim() > 1:
+                    nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MultiScaleDeformableAttention):
+                m.init_weights()
+        nn.init.xavier_uniform_(self.memory_trans_fc.weight)
+        nn.init.xavier_uniform_(self.query_embedding.weight)
+        normal_(self.level_embed)
+
+    def forward_transformer(
+        self,
+        img_feats: Tuple[Tensor],
+        batch_data_samples: OptSampleList = None,
+    ) -> Dict:
+        """Forward process of Transformer.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+        The difference is that the ground truth in `batch_data_samples` is
+        required for the `pre_decoder` to prepare the query of DINO.
+        Additionally, DINO inherits the `pre_transformer` method and the
+        `forward_encoder` method of DeformableDETR. More details about the
+        two methods can be found in `mmdet/detector/deformable_detr.py`.
+
+        Args:
+            img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each
+                feature map has shape (bs, dim, H, W).
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+                Defaults to None.
+
+        Returns:
+            dict: The dictionary of bbox_head function inputs, which always
+            includes the `hidden_states` of the decoder output and may contain
+            `references` including the initial and intermediate references.
+        """
+        encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer(
+            img_feats, batch_data_samples)
+
+        encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict)
+
+        tmp_dec_in, head_inputs_dict = self.pre_decoder(
+            **encoder_outputs_dict, batch_data_samples=batch_data_samples)
+        decoder_inputs_dict.update(tmp_dec_in)
+
+        decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict)
+        head_inputs_dict.update(decoder_outputs_dict)
+        return head_inputs_dict
+
+    def pre_decoder(
+        self,
+        memory: Tensor,
+        memory_mask: Tensor,
+        spatial_shapes: Tensor,
+        batch_data_samples: OptSampleList = None,
+    ) -> Tuple[Dict]:
+        """Prepare intermediate variables before entering Transformer decoder,
+        such as `query`, `query_pos`, and `reference_points`.
+
+        Args:
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points). Will only be used when
+                `as_two_stage` is `True`.
+            spatial_shapes (Tensor): Spatial shapes of features in all levels.
+                With shape (num_levels, 2), last dimension represents (h, w).
+                Will only be used when `as_two_stage` is `True`.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+                Defaults to None.
+
+        Returns:
+            tuple[dict]: The decoder_inputs_dict and head_inputs_dict.
+
+            - decoder_inputs_dict (dict): The keyword dictionary args of
+              `self.forward_decoder()`, which includes 'query', 'memory',
+              `reference_points`, and `dn_mask`. The reference points of
+              decoder input here are 4D boxes, although it has `points`
+              in its name.
+            - head_inputs_dict (dict): The keyword dictionary args of the
+              bbox_head functions, which includes `topk_score`, `topk_coords`,
+              and `dn_meta` when `self.training` is `True`, else is empty.
+        """
+        bs, _, c = memory.shape
+        cls_out_features = self.bbox_head.cls_branches[
+            self.decoder.num_layers].out_features
+
+        output_memory, output_proposals = self.gen_encoder_output_proposals(
+            memory, memory_mask, spatial_shapes)
+        enc_outputs_class = self.bbox_head.cls_branches[
+            self.decoder.num_layers](
+                output_memory)
+        enc_outputs_coord_unact = self.bbox_head.reg_branches[
+            self.decoder.num_layers](output_memory) + output_proposals
+
+        # NOTE The DINO selects top-k proposals according to scores of
+        # multi-class classification, while DeformDETR, where the input
+        # is `enc_outputs_class[..., 0]` selects according to scores of
+        # binary classification.
+        topk_indices = torch.topk(
+            enc_outputs_class.max(-1)[0], k=self.num_queries, dim=1)[1]
+        topk_score = torch.gather(
+            enc_outputs_class, 1,
+            topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features))
+        topk_coords_unact = torch.gather(
+            enc_outputs_coord_unact, 1,
+            topk_indices.unsqueeze(-1).repeat(1, 1, 4))
+        topk_coords = topk_coords_unact.sigmoid()
+        topk_coords_unact = topk_coords_unact.detach()
+
+        query = self.query_embedding.weight[:, None, :]
+        query = query.repeat(1, bs, 1).transpose(0, 1)
+        if self.training:
+            dn_label_query, dn_bbox_query, dn_mask, dn_meta = \
+                self.dn_query_generator(batch_data_samples)
+            query = torch.cat([dn_label_query, query], dim=1)
+            reference_points = torch.cat([dn_bbox_query, topk_coords_unact],
+                                         dim=1)
+        else:
+            reference_points = topk_coords_unact
+            dn_mask, dn_meta = None, None
+        reference_points = reference_points.sigmoid()
+
+        decoder_inputs_dict = dict(
+            query=query,
+            memory=memory,
+            reference_points=reference_points,
+            dn_mask=dn_mask)
+        # NOTE DINO calculates encoder losses on scores and coordinates
+        # of selected top-k encoder queries, while DeformDETR is of all
+        # encoder queries.
+        head_inputs_dict = dict(
+            enc_outputs_class=topk_score,
+            enc_outputs_coord=topk_coords,
+            dn_meta=dn_meta) if self.training else dict()
+        return decoder_inputs_dict, head_inputs_dict
+
+    def forward_decoder(self,
+                        query: Tensor,
+                        memory: Tensor,
+                        memory_mask: Tensor,
+                        reference_points: Tensor,
+                        spatial_shapes: Tensor,
+                        level_start_index: Tensor,
+                        valid_ratios: Tensor,
+                        dn_mask: Optional[Tensor] = None,
+                        **kwargs) -> Dict:
+        """Forward with Transformer decoder.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            query (Tensor): The queries of decoder inputs, has shape
+                (bs, num_queries_total, dim), where `num_queries_total` is the
+                sum of `num_denoising_queries` and `num_matching_queries` when
+                `self.training` is `True`, else `num_matching_queries`.
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points).
+            reference_points (Tensor): The initial reference, has shape
+                (bs, num_queries_total, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+            dn_mask (Tensor, optional): The attention mask to prevent
+                information leakage from different denoising groups and
+                matching parts, will be used as `self_attn_mask` of the
+                `self.decoder`, has shape (num_queries_total,
+                num_queries_total).
+                It is `None` when `self.training` is `False`.
+
+        Returns:
+            dict: The dictionary of decoder outputs, which includes the
+            `hidden_states` of the decoder output and `references` including
+            the initial and intermediate reference_points.
+        """
+        inter_states, references = self.decoder(
+            query=query,
+            value=memory,
+            key_padding_mask=memory_mask,
+            self_attn_mask=dn_mask,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            reg_branches=self.bbox_head.reg_branches,
+            **kwargs)
+
+        if len(query) == self.num_queries:
+            # NOTE: This is to make sure label_embeding can be involved to
+            # produce loss even if there is no denoising query (no ground truth
+            # target in this GPU), otherwise, this will raise runtime error in
+            # distributed training.
+            inter_states[0] += \
+                self.dn_query_generator.label_embedding.weight[0, 0] * 0.0
+
+        decoder_outputs_dict = dict(
+            hidden_states=inter_states, references=list(references))
+        return decoder_outputs_dict
diff --git a/mmde/mmdet/models/detectors/fast_rcnn.py b/mmde/mmdet/models/detectors/fast_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b39050fdc2989eb5c870704e1c1417987d53d46
--- /dev/null
+++ b/mmde/mmdet/models/detectors/fast_rcnn.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class FastRCNN(TwoStageDetector):
+    """Implementation of `Fast R-CNN <https://arxiv.org/abs/1504.08083>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 roi_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
diff --git a/mmde/mmdet/models/detectors/faster_rcnn.py b/mmde/mmdet/models/detectors/faster_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..36109e3200a2d8e7d8a1032f7028e47a7699fb6a
--- /dev/null
+++ b/mmde/mmdet/models/detectors/faster_rcnn.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class FasterRCNN(TwoStageDetector):
+    """Implementation of `Faster R-CNN <https://arxiv.org/abs/1506.01497>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 rpn_head: ConfigType,
+                 roi_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
diff --git a/mmde/mmdet/models/detectors/fcos.py b/mmde/mmdet/models/detectors/fcos.py
new file mode 100644
index 0000000000000000000000000000000000000000..c628059313ac80644ec2ba2c806e7baf2e418a41
--- /dev/null
+++ b/mmde/mmdet/models/detectors/fcos.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class FCOS(SingleStageDetector):
+    """Implementation of `FCOS <https://arxiv.org/abs/1904.01355>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of FCOS. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of FCOS. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/fovea.py b/mmde/mmdet/models/detectors/fovea.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e4f21caa239147e3b81e66280aa1da043715b42
--- /dev/null
+++ b/mmde/mmdet/models/detectors/fovea.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class FOVEA(SingleStageDetector):
+    """Implementation of `FoveaBox <https://arxiv.org/abs/1904.03797>`_
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of FOVEA. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of FOVEA. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/fsaf.py b/mmde/mmdet/models/detectors/fsaf.py
new file mode 100644
index 0000000000000000000000000000000000000000..01b40273341f2a85cfa427f8adfc945a1b7da58a
--- /dev/null
+++ b/mmde/mmdet/models/detectors/fsaf.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class FSAF(SingleStageDetector):
+    """Implementation of `FSAF <https://arxiv.org/abs/1903.00621>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/gfl.py b/mmde/mmdet/models/detectors/gfl.py
new file mode 100644
index 0000000000000000000000000000000000000000..c26821af68c224d4b55a1ca3d2be4c6e1d1b155d
--- /dev/null
+++ b/mmde/mmdet/models/detectors/gfl.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class GFL(SingleStageDetector):
+    """Implementation of `GFL <https://arxiv.org/abs/2006.04388>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of GFL. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of GFL. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/glip.py b/mmde/mmdet/models/detectors/glip.py
new file mode 100644
index 0000000000000000000000000000000000000000..45cfe7d39fd7b8d9e9bc37c49fe369ff87bc68d9
--- /dev/null
+++ b/mmde/mmdet/models/detectors/glip.py
@@ -0,0 +1,590 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import re
+import warnings
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+def find_noun_phrases(caption: str) -> list:
+    """Find noun phrases in a caption using nltk.
+    Args:
+        caption (str): The caption to analyze.
+
+    Returns:
+        list: List of noun phrases found in the caption.
+
+    Examples:
+        >>> caption = 'There is two cat and a remote in the picture'
+        >>> find_noun_phrases(caption) # ['cat', 'a remote', 'the picture']
+    """
+    try:
+        import nltk
+        nltk.download('punkt', download_dir='~/nltk_data')
+        nltk.download('averaged_perceptron_tagger', download_dir='~/nltk_data')
+    except ImportError:
+        raise RuntimeError('nltk is not installed, please install it by: '
+                           'pip install nltk.')
+
+    caption = caption.lower()
+    tokens = nltk.word_tokenize(caption)
+    pos_tags = nltk.pos_tag(tokens)
+
+    grammar = 'NP: {<DT>?<JJ.*>*<NN.*>+}'
+    cp = nltk.RegexpParser(grammar)
+    result = cp.parse(pos_tags)
+
+    noun_phrases = []
+    for subtree in result.subtrees():
+        if subtree.label() == 'NP':
+            noun_phrases.append(' '.join(t[0] for t in subtree.leaves()))
+
+    return noun_phrases
+
+
+def remove_punctuation(text: str) -> str:
+    """Remove punctuation from a text.
+    Args:
+        text (str): The input text.
+
+    Returns:
+        str: The text with punctuation removed.
+    """
+    punctuation = [
+        '|', ':', ';', '@', '(', ')', '[', ']', '{', '}', '^', '\'', '\"', '’',
+        '`', '?', '$', '%', '#', '!', '&', '*', '+', ',', '.'
+    ]
+    for p in punctuation:
+        text = text.replace(p, '')
+    return text.strip()
+
+
+def run_ner(caption: str) -> Tuple[list, list]:
+    """Run NER on a caption and return the tokens and noun phrases.
+    Args:
+        caption (str): The input caption.
+
+    Returns:
+        Tuple[List, List]: A tuple containing the tokens and noun phrases.
+            - tokens_positive (List): A list of token positions.
+            - noun_phrases (List): A list of noun phrases.
+    """
+    noun_phrases = find_noun_phrases(caption)
+    noun_phrases = [remove_punctuation(phrase) for phrase in noun_phrases]
+    noun_phrases = [phrase for phrase in noun_phrases if phrase != '']
+    print('noun_phrases:', noun_phrases)
+    relevant_phrases = noun_phrases
+    labels = noun_phrases
+
+    tokens_positive = []
+    for entity, label in zip(relevant_phrases, labels):
+        try:
+            # search all occurrences and mark them as different entities
+            # TODO: Not Robust
+            for m in re.finditer(entity, caption.lower()):
+                tokens_positive.append([[m.start(), m.end()]])
+        except Exception:
+            print('noun entities:', noun_phrases)
+            print('entity:', entity)
+            print('caption:', caption.lower())
+    return tokens_positive, noun_phrases
+
+
+def create_positive_map(tokenized,
+                        tokens_positive: list,
+                        max_num_entities: int = 256) -> Tensor:
+    """construct a map such that positive_map[i,j] = True
+    if box i is associated to token j
+
+    Args:
+        tokenized: The tokenized input.
+        tokens_positive (list): A list of token ranges
+            associated with positive boxes.
+        max_num_entities (int, optional): The maximum number of entities.
+            Defaults to 256.
+
+    Returns:
+        torch.Tensor: The positive map.
+
+    Raises:
+        Exception: If an error occurs during token-to-char mapping.
+    """
+    positive_map = torch.zeros((len(tokens_positive), max_num_entities),
+                               dtype=torch.float)
+
+    for j, tok_list in enumerate(tokens_positive):
+        for (beg, end) in tok_list:
+            try:
+                beg_pos = tokenized.char_to_token(beg)
+                end_pos = tokenized.char_to_token(end - 1)
+            except Exception as e:
+                print('beg:', beg, 'end:', end)
+                print('token_positive:', tokens_positive)
+                raise e
+            if beg_pos is None:
+                try:
+                    beg_pos = tokenized.char_to_token(beg + 1)
+                    if beg_pos is None:
+                        beg_pos = tokenized.char_to_token(beg + 2)
+                except Exception:
+                    beg_pos = None
+            if end_pos is None:
+                try:
+                    end_pos = tokenized.char_to_token(end - 2)
+                    if end_pos is None:
+                        end_pos = tokenized.char_to_token(end - 3)
+                except Exception:
+                    end_pos = None
+            if beg_pos is None or end_pos is None:
+                continue
+
+            assert beg_pos is not None and end_pos is not None
+            positive_map[j, beg_pos:end_pos + 1].fill_(1)
+    return positive_map / (positive_map.sum(-1)[:, None] + 1e-6)
+
+
+def create_positive_map_label_to_token(positive_map: Tensor,
+                                       plus: int = 0) -> dict:
+    """Create a dictionary mapping the label to the token.
+    Args:
+        positive_map (Tensor): The positive map tensor.
+        plus (int, optional): Value added to the label for indexing.
+            Defaults to 0.
+
+    Returns:
+        dict: The dictionary mapping the label to the token.
+    """
+    positive_map_label_to_token = {}
+    for i in range(len(positive_map)):
+        positive_map_label_to_token[i + plus] = torch.nonzero(
+            positive_map[i], as_tuple=True)[0].tolist()
+    return positive_map_label_to_token
+
+
+def clean_label_name(name: str) -> str:
+    name = re.sub(r'\(.*\)', '', name)
+    name = re.sub(r'_', ' ', name)
+    name = re.sub(r'  ', ' ', name)
+    return name
+
+
+def chunks(lst: list, n: int) -> list:
+    """Yield successive n-sized chunks from lst."""
+    all_ = []
+    for i in range(0, len(lst), n):
+        data_index = lst[i:i + n]
+        all_.append(data_index)
+    counter = 0
+    for i in all_:
+        counter += len(i)
+    assert (counter == len(lst))
+
+    return all_
+
+
+@MODELS.register_module()
+class GLIP(SingleStageDetector):
+    """Implementation of `GLIP <https://arxiv.org/abs/2112.03857>`_
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        language_model (:obj:`ConfigDict` or dict): The language model config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of GLIP. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of GLIP. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 language_model: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+        self.language_model = MODELS.build(language_model)
+
+        self._special_tokens = '. '
+
+    def to_enhance_text_prompts(self, original_caption, enhanced_text_prompts):
+        caption_string = ''
+        tokens_positive = []
+        for idx, word in enumerate(original_caption):
+            if word in enhanced_text_prompts:
+                enhanced_text_dict = enhanced_text_prompts[word]
+                if 'prefix' in enhanced_text_dict:
+                    caption_string += enhanced_text_dict['prefix']
+                start_i = len(caption_string)
+                if 'name' in enhanced_text_dict:
+                    caption_string += enhanced_text_dict['name']
+                else:
+                    caption_string += word
+                end_i = len(caption_string)
+                tokens_positive.append([[start_i, end_i]])
+
+                if 'suffix' in enhanced_text_dict:
+                    caption_string += enhanced_text_dict['suffix']
+            else:
+                tokens_positive.append(
+                    [[len(caption_string),
+                      len(caption_string) + len(word)]])
+                caption_string += word
+
+            if idx != len(original_caption) - 1:
+                caption_string += self._special_tokens
+        return caption_string, tokens_positive
+
+    def to_plain_text_prompts(self, original_caption):
+        caption_string = ''
+        tokens_positive = []
+        for idx, word in enumerate(original_caption):
+            tokens_positive.append(
+                [[len(caption_string),
+                  len(caption_string) + len(word)]])
+            caption_string += word
+            if idx != len(original_caption) - 1:
+                caption_string += self._special_tokens
+        return caption_string, tokens_positive
+
+    def get_tokens_and_prompts(
+        self,
+        original_caption: Union[str, list, tuple],
+        custom_entities: bool = False,
+        enhanced_text_prompts: Optional[ConfigType] = None
+    ) -> Tuple[dict, str, list, list]:
+        """Get the tokens positive and prompts for the caption."""
+        if isinstance(original_caption, (list, tuple)) or custom_entities:
+            if custom_entities and isinstance(original_caption, str):
+                original_caption = original_caption.strip(self._special_tokens)
+                original_caption = original_caption.split(self._special_tokens)
+                original_caption = list(
+                    filter(lambda x: len(x) > 0, original_caption))
+
+            original_caption = [clean_label_name(i) for i in original_caption]
+
+            if custom_entities and enhanced_text_prompts is not None:
+                caption_string, tokens_positive = self.to_enhance_text_prompts(
+                    original_caption, enhanced_text_prompts)
+            else:
+                caption_string, tokens_positive = self.to_plain_text_prompts(
+                    original_caption)
+
+            tokenized = self.language_model.tokenizer([caption_string],
+                                                      return_tensors='pt')
+            entities = original_caption
+        else:
+            original_caption = original_caption.strip(self._special_tokens)
+            tokenized = self.language_model.tokenizer([original_caption],
+                                                      return_tensors='pt')
+            tokens_positive, noun_phrases = run_ner(original_caption)
+            entities = noun_phrases
+            caption_string = original_caption
+
+        return tokenized, caption_string, tokens_positive, entities
+
+    def get_positive_map(self, tokenized, tokens_positive):
+        positive_map = create_positive_map(tokenized, tokens_positive)
+        positive_map_label_to_token = create_positive_map_label_to_token(
+            positive_map, plus=1)
+        return positive_map_label_to_token, positive_map
+
+    def get_tokens_positive_and_prompts(
+        self,
+        original_caption: Union[str, list, tuple],
+        custom_entities: bool = False,
+        enhanced_text_prompt: Optional[ConfigType] = None,
+        tokens_positive: Optional[list] = None,
+    ) -> Tuple[dict, str, Tensor, list]:
+        if tokens_positive is not None:
+            if tokens_positive == -1:
+                if not original_caption.endswith('.'):
+                    original_caption = original_caption + self._special_tokens
+                return None, original_caption, None, original_caption
+            else:
+                if not original_caption.endswith('.'):
+                    original_caption = original_caption + self._special_tokens
+                tokenized = self.language_model.tokenizer([original_caption],
+                                                          return_tensors='pt')
+                positive_map_label_to_token, positive_map = \
+                    self.get_positive_map(tokenized, tokens_positive)
+
+                entities = []
+                for token_positive in tokens_positive:
+                    instance_entities = []
+                    for t in token_positive:
+                        instance_entities.append(original_caption[t[0]:t[1]])
+                    entities.append(' / '.join(instance_entities))
+                return positive_map_label_to_token, original_caption, \
+                    positive_map, entities
+
+        chunked_size = self.test_cfg.get('chunked_size', -1)
+        if not self.training and chunked_size > 0:
+            assert isinstance(original_caption,
+                              (list, tuple)) or custom_entities is True
+            all_output = self.get_tokens_positive_and_prompts_chunked(
+                original_caption, enhanced_text_prompt)
+            positive_map_label_to_token, \
+                caption_string, \
+                positive_map, \
+                entities = all_output
+        else:
+            tokenized, caption_string, tokens_positive, entities = \
+                self.get_tokens_and_prompts(
+                    original_caption, custom_entities, enhanced_text_prompt)
+            positive_map_label_to_token, positive_map = self.get_positive_map(
+                tokenized, tokens_positive)
+            if tokenized.input_ids.shape[1] > self.language_model.max_tokens:
+                warnings.warn('Inputting a text that is too long will result '
+                              'in poor prediction performance. '
+                              'Please reduce the text length.')
+        return positive_map_label_to_token, caption_string, \
+            positive_map, entities
+
+    def get_tokens_positive_and_prompts_chunked(
+            self,
+            original_caption: Union[list, tuple],
+            enhanced_text_prompts: Optional[ConfigType] = None):
+        chunked_size = self.test_cfg.get('chunked_size', -1)
+        original_caption = [clean_label_name(i) for i in original_caption]
+
+        original_caption_chunked = chunks(original_caption, chunked_size)
+        ids_chunked = chunks(
+            list(range(1,
+                       len(original_caption) + 1)), chunked_size)
+
+        positive_map_label_to_token_chunked = []
+        caption_string_chunked = []
+        positive_map_chunked = []
+        entities_chunked = []
+
+        for i in range(len(ids_chunked)):
+            if enhanced_text_prompts is not None:
+                caption_string, tokens_positive = self.to_enhance_text_prompts(
+                    original_caption_chunked[i], enhanced_text_prompts)
+            else:
+                caption_string, tokens_positive = self.to_plain_text_prompts(
+                    original_caption_chunked[i])
+            tokenized = self.language_model.tokenizer([caption_string],
+                                                      return_tensors='pt')
+            if tokenized.input_ids.shape[1] > self.language_model.max_tokens:
+                warnings.warn('Inputting a text that is too long will result '
+                              'in poor prediction performance. '
+                              'Please reduce the --chunked-size.')
+            positive_map_label_to_token, positive_map = self.get_positive_map(
+                tokenized, tokens_positive)
+
+            caption_string_chunked.append(caption_string)
+            positive_map_label_to_token_chunked.append(
+                positive_map_label_to_token)
+            positive_map_chunked.append(positive_map)
+            entities_chunked.append(original_caption_chunked[i])
+
+        return positive_map_label_to_token_chunked, \
+            caption_string_chunked, \
+            positive_map_chunked, \
+            entities_chunked
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> Union[dict, list]:
+        # TODO: Only open vocabulary tasks are supported for training now.
+        text_prompts = [
+            data_samples.text for data_samples in batch_data_samples
+        ]
+
+        gt_labels = [
+            data_samples.gt_instances.labels
+            for data_samples in batch_data_samples
+        ]
+
+        new_text_prompts = []
+        positive_maps = []
+        if len(set(text_prompts)) == 1:
+            # All the text prompts are the same,
+            # so there is no need to calculate them multiple times.
+            tokenized, caption_string, tokens_positive, _ = \
+                self.get_tokens_and_prompts(
+                    text_prompts[0], True)
+            new_text_prompts = [caption_string] * len(batch_inputs)
+            for gt_label in gt_labels:
+                new_tokens_positive = [
+                    tokens_positive[label] for label in gt_label
+                ]
+                _, positive_map = self.get_positive_map(
+                    tokenized, new_tokens_positive)
+                positive_maps.append(positive_map)
+        else:
+            for text_prompt, gt_label in zip(text_prompts, gt_labels):
+                tokenized, caption_string, tokens_positive, _ = \
+                    self.get_tokens_and_prompts(
+                        text_prompt, True)
+                new_tokens_positive = [
+                    tokens_positive[label] for label in gt_label
+                ]
+                _, positive_map = self.get_positive_map(
+                    tokenized, new_tokens_positive)
+                positive_maps.append(positive_map)
+                new_text_prompts.append(caption_string)
+
+        language_dict_features = self.language_model(new_text_prompts)
+        for i, data_samples in enumerate(batch_data_samples):
+            # .bool().float() is very important
+            positive_map = positive_maps[i].to(
+                batch_inputs.device).bool().float()
+            data_samples.gt_instances.positive_maps = positive_map
+
+        visual_features = self.extract_feat(batch_inputs)
+
+        losses = self.bbox_head.loss(visual_features, language_dict_features,
+                                     batch_data_samples)
+        return losses
+
+    def predict(self,
+                batch_inputs: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`DetDataSample`]: Detection results of the
+            input images. Each DetDataSample usually contain
+            'pred_instances'. And the ``pred_instances`` usually
+            contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - label_names (List[str]): Label names of bboxes.
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        text_prompts = []
+        enhanced_text_prompts = []
+        tokens_positives = []
+        for data_samples in batch_data_samples:
+            text_prompts.append(data_samples.text)
+            if 'caption_prompt' in data_samples:
+                enhanced_text_prompts.append(data_samples.caption_prompt)
+            else:
+                enhanced_text_prompts.append(None)
+            tokens_positives.append(data_samples.get('tokens_positive', None))
+
+        if 'custom_entities' in batch_data_samples[0]:
+            # Assuming that the `custom_entities` flag
+            # inside a batch is always the same. For single image inference
+            custom_entities = batch_data_samples[0].custom_entities
+        else:
+            custom_entities = False
+
+        if len(set(text_prompts)) == 1:
+            # All the text prompts are the same,
+            # so there is no need to calculate them multiple times.
+            _positive_maps_and_prompts = [
+                self.get_tokens_positive_and_prompts(
+                    text_prompts[0], custom_entities, enhanced_text_prompts[0],
+                    tokens_positives[0])
+            ] * len(batch_inputs)
+        else:
+            _positive_maps_and_prompts = [
+                self.get_tokens_positive_and_prompts(text_prompt,
+                                                     custom_entities,
+                                                     enhanced_text_prompt,
+                                                     tokens_positive)
+                for text_prompt, enhanced_text_prompt, tokens_positive in zip(
+                    text_prompts, enhanced_text_prompts, tokens_positives)
+            ]
+
+        token_positive_maps, text_prompts, _, entities = zip(
+            *_positive_maps_and_prompts)
+
+        visual_features = self.extract_feat(batch_inputs)
+
+        if isinstance(text_prompts[0], list):
+            # chunked text prompts, only bs=1 is supported
+            assert len(batch_inputs) == 1
+            count = 0
+            results_list = []
+
+            entities = [[item for lst in entities[0] for item in lst]]
+
+            for b in range(len(text_prompts[0])):
+                text_prompts_once = [text_prompts[0][b]]
+                token_positive_maps_once = token_positive_maps[0][b]
+                language_dict_features = self.language_model(text_prompts_once)
+                batch_data_samples[
+                    0].token_positive_map = token_positive_maps_once
+
+                pred_instances = self.bbox_head.predict(
+                    copy.deepcopy(visual_features),
+                    language_dict_features,
+                    batch_data_samples,
+                    rescale=rescale)[0]
+
+                if len(pred_instances) > 0:
+                    pred_instances.labels += count
+                count += len(token_positive_maps_once)
+                results_list.append(pred_instances)
+            results_list = [results_list[0].cat(results_list)]
+        else:
+            language_dict_features = self.language_model(list(text_prompts))
+
+            for i, data_samples in enumerate(batch_data_samples):
+                data_samples.token_positive_map = token_positive_maps[i]
+
+            results_list = self.bbox_head.predict(
+                visual_features,
+                language_dict_features,
+                batch_data_samples,
+                rescale=rescale)
+
+        for data_sample, pred_instances, entity in zip(batch_data_samples,
+                                                       results_list, entities):
+            if len(pred_instances) > 0:
+                label_names = []
+                for labels in pred_instances.labels:
+                    if labels >= len(entity):
+                        warnings.warn(
+                            'The unexpected output indicates an issue with '
+                            'named entity recognition. You can try '
+                            'setting custom_entities=True and running '
+                            'again to see if it helps.')
+                        label_names.append('unobject')
+                    else:
+                        label_names.append(entity[labels])
+                # for visualization
+                pred_instances.label_names = label_names
+            data_sample.pred_instances = pred_instances
+        return batch_data_samples
diff --git a/mmde/mmdet/models/detectors/grid_rcnn.py b/mmde/mmdet/models/detectors/grid_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bcb5b033edc620f1cf61b986c345961b719e6f1
--- /dev/null
+++ b/mmde/mmdet/models/detectors/grid_rcnn.py
@@ -0,0 +1,33 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class GridRCNN(TwoStageDetector):
+    """Grid R-CNN.
+
+    This detector is the implementation of:
+    - Grid R-CNN (https://arxiv.org/abs/1811.12030)
+    - Grid R-CNN Plus: Faster and Better (https://arxiv.org/abs/1906.05688)
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 rpn_head: ConfigType,
+                 roi_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/grounding_dino.py b/mmde/mmdet/models/detectors/grounding_dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ec9d14e63401bb14ff39c0df0ae7bcdb38c28d0
--- /dev/null
+++ b/mmde/mmdet/models/detectors/grounding_dino.py
@@ -0,0 +1,612 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import re
+import warnings
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptSampleList, SampleList
+from mmdet.utils import ConfigType
+from ..layers import SinePositionalEncoding
+from ..layers.transformer.grounding_dino_layers import (
+    GroundingDinoTransformerDecoder, GroundingDinoTransformerEncoder)
+from .dino import DINO
+from .glip import (create_positive_map, create_positive_map_label_to_token,
+                   run_ner)
+
+
+def clean_label_name(name: str) -> str:
+    name = re.sub(r'\(.*\)', '', name)
+    name = re.sub(r'_', ' ', name)
+    name = re.sub(r'  ', ' ', name)
+    return name
+
+
+def chunks(lst: list, n: int) -> list:
+    """Yield successive n-sized chunks from lst."""
+    all_ = []
+    for i in range(0, len(lst), n):
+        data_index = lst[i:i + n]
+        all_.append(data_index)
+    counter = 0
+    for i in all_:
+        counter += len(i)
+    assert (counter == len(lst))
+
+    return all_
+
+
+@MODELS.register_module()
+class GroundingDINO(DINO):
+    """Implementation of `Grounding DINO: Marrying DINO with Grounded Pre-
+    Training for Open-Set Object Detection.
+
+    <https://arxiv.org/abs/2303.05499>`_
+
+    Code is modified from the `official github repo
+    <https://github.com/IDEA-Research/GroundingDINO>`_.
+    """
+
+    def __init__(self, language_model, *args, **kwargs) -> None:
+
+        self.language_model_cfg = language_model
+        self._special_tokens = '. '
+        super().__init__(*args, **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers except for backbone, neck and bbox_head."""
+        self.positional_encoding = SinePositionalEncoding(
+            **self.positional_encoding)
+        self.encoder = GroundingDinoTransformerEncoder(**self.encoder)
+        self.decoder = GroundingDinoTransformerDecoder(**self.decoder)
+        self.embed_dims = self.encoder.embed_dims
+        self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims)
+        num_feats = self.positional_encoding.num_feats
+        assert num_feats * 2 == self.embed_dims, \
+            f'embed_dims should be exactly 2 times of num_feats. ' \
+            f'Found {self.embed_dims} and {num_feats}.'
+
+        self.level_embed = nn.Parameter(
+            torch.Tensor(self.num_feature_levels, self.embed_dims))
+        self.memory_trans_fc = nn.Linear(self.embed_dims, self.embed_dims)
+        self.memory_trans_norm = nn.LayerNorm(self.embed_dims)
+
+        # text modules
+        self.language_model = MODELS.build(self.language_model_cfg)
+        self.text_feat_map = nn.Linear(
+            self.language_model.language_backbone.body.language_dim,
+            self.embed_dims,
+            bias=True)
+
+    def init_weights(self) -> None:
+        """Initialize weights for Transformer and other components."""
+        super().init_weights()
+        nn.init.constant_(self.text_feat_map.bias.data, 0)
+        nn.init.xavier_uniform_(self.text_feat_map.weight.data)
+
+    def to_enhance_text_prompts(self, original_caption, enhanced_text_prompts):
+        caption_string = ''
+        tokens_positive = []
+        for idx, word in enumerate(original_caption):
+            if word in enhanced_text_prompts:
+                enhanced_text_dict = enhanced_text_prompts[word]
+                if 'prefix' in enhanced_text_dict:
+                    caption_string += enhanced_text_dict['prefix']
+                start_i = len(caption_string)
+                if 'name' in enhanced_text_dict:
+                    caption_string += enhanced_text_dict['name']
+                else:
+                    caption_string += word
+                end_i = len(caption_string)
+                tokens_positive.append([[start_i, end_i]])
+
+                if 'suffix' in enhanced_text_dict:
+                    caption_string += enhanced_text_dict['suffix']
+            else:
+                tokens_positive.append(
+                    [[len(caption_string),
+                      len(caption_string) + len(word)]])
+                caption_string += word
+            caption_string += self._special_tokens
+        return caption_string, tokens_positive
+
+    def to_plain_text_prompts(self, original_caption):
+        caption_string = ''
+        tokens_positive = []
+        for idx, word in enumerate(original_caption):
+            tokens_positive.append(
+                [[len(caption_string),
+                  len(caption_string) + len(word)]])
+            caption_string += word
+            caption_string += self._special_tokens
+        return caption_string, tokens_positive
+
+    def get_tokens_and_prompts(
+        self,
+        original_caption: Union[str, list, tuple],
+        custom_entities: bool = False,
+        enhanced_text_prompts: Optional[ConfigType] = None
+    ) -> Tuple[dict, str, list]:
+        """Get the tokens positive and prompts for the caption."""
+        if isinstance(original_caption, (list, tuple)) or custom_entities:
+            if custom_entities and isinstance(original_caption, str):
+                original_caption = original_caption.strip(self._special_tokens)
+                original_caption = original_caption.split(self._special_tokens)
+                original_caption = list(
+                    filter(lambda x: len(x) > 0, original_caption))
+
+            original_caption = [clean_label_name(i) for i in original_caption]
+
+            if custom_entities and enhanced_text_prompts is not None:
+                caption_string, tokens_positive = self.to_enhance_text_prompts(
+                    original_caption, enhanced_text_prompts)
+            else:
+                caption_string, tokens_positive = self.to_plain_text_prompts(
+                    original_caption)
+
+            # NOTE: Tokenizer in Grounding DINO is different from
+            # that in GLIP. The tokenizer in GLIP will pad the
+            # caption_string to max_length, while the tokenizer
+            # in Grounding DINO will not.
+            tokenized = self.language_model.tokenizer(
+                [caption_string],
+                padding='max_length'
+                if self.language_model.pad_to_max else 'longest',
+                return_tensors='pt')
+            entities = original_caption
+        else:
+            if not original_caption.endswith('.'):
+                original_caption = original_caption + self._special_tokens
+            # NOTE: Tokenizer in Grounding DINO is different from
+            # that in GLIP. The tokenizer in GLIP will pad the
+            # caption_string to max_length, while the tokenizer
+            # in Grounding DINO will not.
+            tokenized = self.language_model.tokenizer(
+                [original_caption],
+                padding='max_length'
+                if self.language_model.pad_to_max else 'longest',
+                return_tensors='pt')
+            tokens_positive, noun_phrases = run_ner(original_caption)
+            entities = noun_phrases
+            caption_string = original_caption
+
+        return tokenized, caption_string, tokens_positive, entities
+
+    def get_positive_map(self, tokenized, tokens_positive):
+        positive_map = create_positive_map(
+            tokenized,
+            tokens_positive,
+            max_num_entities=self.bbox_head.cls_branches[
+                self.decoder.num_layers].max_text_len)
+        positive_map_label_to_token = create_positive_map_label_to_token(
+            positive_map, plus=1)
+        return positive_map_label_to_token, positive_map
+
+    def get_tokens_positive_and_prompts(
+        self,
+        original_caption: Union[str, list, tuple],
+        custom_entities: bool = False,
+        enhanced_text_prompt: Optional[ConfigType] = None,
+        tokens_positive: Optional[list] = None,
+    ) -> Tuple[dict, str, Tensor, list]:
+        """Get the tokens positive and prompts for the caption.
+
+        Args:
+            original_caption (str): The original caption, e.g. 'bench . car .'
+            custom_entities (bool, optional): Whether to use custom entities.
+                If ``True``, the ``original_caption`` should be a list of
+                strings, each of which is a word. Defaults to False.
+
+        Returns:
+            Tuple[dict, str, dict, str]: The dict is a mapping from each entity
+            id, which is numbered from 1, to its positive token id.
+            The str represents the prompts.
+        """
+        if tokens_positive is not None:
+            if tokens_positive == -1:
+                if not original_caption.endswith('.'):
+                    original_caption = original_caption + self._special_tokens
+                return None, original_caption, None, original_caption
+            else:
+                if not original_caption.endswith('.'):
+                    original_caption = original_caption + self._special_tokens
+                tokenized = self.language_model.tokenizer(
+                    [original_caption],
+                    padding='max_length'
+                    if self.language_model.pad_to_max else 'longest',
+                    return_tensors='pt')
+                positive_map_label_to_token, positive_map = \
+                    self.get_positive_map(tokenized, tokens_positive)
+
+                entities = []
+                for token_positive in tokens_positive:
+                    instance_entities = []
+                    for t in token_positive:
+                        instance_entities.append(original_caption[t[0]:t[1]])
+                    entities.append(' / '.join(instance_entities))
+                return positive_map_label_to_token, original_caption, \
+                    positive_map, entities
+
+        chunked_size = self.test_cfg.get('chunked_size', -1)
+        if not self.training and chunked_size > 0:
+            assert isinstance(original_caption,
+                              (list, tuple)) or custom_entities is True
+            all_output = self.get_tokens_positive_and_prompts_chunked(
+                original_caption, enhanced_text_prompt)
+            positive_map_label_to_token, \
+                caption_string, \
+                positive_map, \
+                entities = all_output
+        else:
+            tokenized, caption_string, tokens_positive, entities = \
+                self.get_tokens_and_prompts(
+                    original_caption, custom_entities, enhanced_text_prompt)
+            positive_map_label_to_token, positive_map = self.get_positive_map(
+                tokenized, tokens_positive)
+        return positive_map_label_to_token, caption_string, \
+            positive_map, entities
+
+    def get_tokens_positive_and_prompts_chunked(
+            self,
+            original_caption: Union[list, tuple],
+            enhanced_text_prompts: Optional[ConfigType] = None):
+        chunked_size = self.test_cfg.get('chunked_size', -1)
+        original_caption = [clean_label_name(i) for i in original_caption]
+
+        original_caption_chunked = chunks(original_caption, chunked_size)
+        ids_chunked = chunks(
+            list(range(1,
+                       len(original_caption) + 1)), chunked_size)
+
+        positive_map_label_to_token_chunked = []
+        caption_string_chunked = []
+        positive_map_chunked = []
+        entities_chunked = []
+
+        for i in range(len(ids_chunked)):
+            if enhanced_text_prompts is not None:
+                caption_string, tokens_positive = self.to_enhance_text_prompts(
+                    original_caption_chunked[i], enhanced_text_prompts)
+            else:
+                caption_string, tokens_positive = self.to_plain_text_prompts(
+                    original_caption_chunked[i])
+            tokenized = self.language_model.tokenizer([caption_string],
+                                                      return_tensors='pt')
+            if tokenized.input_ids.shape[1] > self.language_model.max_tokens:
+                warnings.warn('Inputting a text that is too long will result '
+                              'in poor prediction performance. '
+                              'Please reduce the --chunked-size.')
+            positive_map_label_to_token, positive_map = self.get_positive_map(
+                tokenized, tokens_positive)
+
+            caption_string_chunked.append(caption_string)
+            positive_map_label_to_token_chunked.append(
+                positive_map_label_to_token)
+            positive_map_chunked.append(positive_map)
+            entities_chunked.append(original_caption_chunked[i])
+
+        return positive_map_label_to_token_chunked, \
+            caption_string_chunked, \
+            positive_map_chunked, \
+            entities_chunked
+
+    def forward_transformer(
+        self,
+        img_feats: Tuple[Tensor],
+        text_dict: Dict,
+        batch_data_samples: OptSampleList = None,
+    ) -> Dict:
+        encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer(
+            img_feats, batch_data_samples)
+
+        encoder_outputs_dict = self.forward_encoder(
+            **encoder_inputs_dict, text_dict=text_dict)
+
+        tmp_dec_in, head_inputs_dict = self.pre_decoder(
+            **encoder_outputs_dict, batch_data_samples=batch_data_samples)
+        decoder_inputs_dict.update(tmp_dec_in)
+
+        decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict)
+        head_inputs_dict.update(decoder_outputs_dict)
+        return head_inputs_dict
+
+    def forward_encoder(self, feat: Tensor, feat_mask: Tensor,
+                        feat_pos: Tensor, spatial_shapes: Tensor,
+                        level_start_index: Tensor, valid_ratios: Tensor,
+                        text_dict: Dict) -> Dict:
+        text_token_mask = text_dict['text_token_mask']
+        memory, memory_text = self.encoder(
+            query=feat,
+            query_pos=feat_pos,
+            key_padding_mask=feat_mask,  # for self_attn
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            # for text encoder
+            memory_text=text_dict['embedded'],
+            text_attention_mask=~text_token_mask,
+            position_ids=text_dict['position_ids'],
+            text_self_attention_masks=text_dict['masks'])
+        encoder_outputs_dict = dict(
+            memory=memory,
+            memory_mask=feat_mask,
+            spatial_shapes=spatial_shapes,
+            memory_text=memory_text,
+            text_token_mask=text_token_mask)
+        return encoder_outputs_dict
+
+    def pre_decoder(
+        self,
+        memory: Tensor,
+        memory_mask: Tensor,
+        spatial_shapes: Tensor,
+        memory_text: Tensor,
+        text_token_mask: Tensor,
+        batch_data_samples: OptSampleList = None,
+    ) -> Tuple[Dict]:
+        bs, _, c = memory.shape
+
+        output_memory, output_proposals = self.gen_encoder_output_proposals(
+            memory, memory_mask, spatial_shapes)
+
+        enc_outputs_class = self.bbox_head.cls_branches[
+            self.decoder.num_layers](output_memory, memory_text,
+                                     text_token_mask)
+        cls_out_features = self.bbox_head.cls_branches[
+            self.decoder.num_layers].max_text_len
+        enc_outputs_coord_unact = self.bbox_head.reg_branches[
+            self.decoder.num_layers](output_memory) + output_proposals
+
+        # NOTE The DINO selects top-k proposals according to scores of
+        # multi-class classification, while DeformDETR, where the input
+        # is `enc_outputs_class[..., 0]` selects according to scores of
+        # binary classification.
+        topk_indices = torch.topk(
+            enc_outputs_class.max(-1)[0], k=self.num_queries, dim=1)[1]
+
+        topk_score = torch.gather(
+            enc_outputs_class, 1,
+            topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features))
+        topk_coords_unact = torch.gather(
+            enc_outputs_coord_unact, 1,
+            topk_indices.unsqueeze(-1).repeat(1, 1, 4))
+        topk_coords = topk_coords_unact.sigmoid()
+        topk_coords_unact = topk_coords_unact.detach()
+
+        query = self.query_embedding.weight[:, None, :]
+        query = query.repeat(1, bs, 1).transpose(0, 1)
+        if self.training:
+            dn_label_query, dn_bbox_query, dn_mask, dn_meta = \
+                self.dn_query_generator(batch_data_samples)
+            query = torch.cat([dn_label_query, query], dim=1)
+            reference_points = torch.cat([dn_bbox_query, topk_coords_unact],
+                                         dim=1)
+        else:
+            reference_points = topk_coords_unact
+            dn_mask, dn_meta = None, None
+        reference_points = reference_points.sigmoid()
+
+        decoder_inputs_dict = dict(
+            query=query,
+            memory=memory,
+            reference_points=reference_points,
+            dn_mask=dn_mask,
+            memory_text=memory_text,
+            text_attention_mask=~text_token_mask,
+        )
+        # NOTE DINO calculates encoder losses on scores and coordinates
+        # of selected top-k encoder queries, while DeformDETR is of all
+        # encoder queries.
+        head_inputs_dict = dict(
+            enc_outputs_class=topk_score,
+            enc_outputs_coord=topk_coords,
+            dn_meta=dn_meta) if self.training else dict()
+        # append text_feats to head_inputs_dict
+        head_inputs_dict['memory_text'] = memory_text
+        head_inputs_dict['text_token_mask'] = text_token_mask
+        return decoder_inputs_dict, head_inputs_dict
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> Union[dict, list]:
+        text_prompts = [
+            data_samples.text for data_samples in batch_data_samples
+        ]
+
+        gt_labels = [
+            data_samples.gt_instances.labels
+            for data_samples in batch_data_samples
+        ]
+
+        if 'tokens_positive' in batch_data_samples[0]:
+            tokens_positive = [
+                data_samples.tokens_positive
+                for data_samples in batch_data_samples
+            ]
+            positive_maps = []
+            for token_positive, text_prompt, gt_label in zip(
+                    tokens_positive, text_prompts, gt_labels):
+                tokenized = self.language_model.tokenizer(
+                    [text_prompt],
+                    padding='max_length'
+                    if self.language_model.pad_to_max else 'longest',
+                    return_tensors='pt')
+                new_tokens_positive = [
+                    token_positive[label.item()] for label in gt_label
+                ]
+                _, positive_map = self.get_positive_map(
+                    tokenized, new_tokens_positive)
+                positive_maps.append(positive_map)
+            new_text_prompts = text_prompts
+        else:
+            new_text_prompts = []
+            positive_maps = []
+            if len(set(text_prompts)) == 1:
+                # All the text prompts are the same,
+                # so there is no need to calculate them multiple times.
+                tokenized, caption_string, tokens_positive, _ = \
+                    self.get_tokens_and_prompts(
+                        text_prompts[0], True)
+                new_text_prompts = [caption_string] * len(batch_inputs)
+                for gt_label in gt_labels:
+                    new_tokens_positive = [
+                        tokens_positive[label] for label in gt_label
+                    ]
+                    _, positive_map = self.get_positive_map(
+                        tokenized, new_tokens_positive)
+                    positive_maps.append(positive_map)
+            else:
+                for text_prompt, gt_label in zip(text_prompts, gt_labels):
+                    tokenized, caption_string, tokens_positive, _ = \
+                        self.get_tokens_and_prompts(
+                            text_prompt, True)
+                    new_tokens_positive = [
+                        tokens_positive[label] for label in gt_label
+                    ]
+                    _, positive_map = self.get_positive_map(
+                        tokenized, new_tokens_positive)
+                    positive_maps.append(positive_map)
+                    new_text_prompts.append(caption_string)
+
+        text_dict = self.language_model(new_text_prompts)
+        if self.text_feat_map is not None:
+            text_dict['embedded'] = self.text_feat_map(text_dict['embedded'])
+
+        for i, data_samples in enumerate(batch_data_samples):
+            positive_map = positive_maps[i].to(
+                batch_inputs.device).bool().float()
+            text_token_mask = text_dict['text_token_mask'][i]
+            data_samples.gt_instances.positive_maps = positive_map
+            data_samples.gt_instances.text_token_mask = \
+                text_token_mask.unsqueeze(0).repeat(
+                    len(positive_map), 1)
+
+        visual_features = self.extract_feat(batch_inputs)
+        head_inputs_dict = self.forward_transformer(visual_features, text_dict,
+                                                    batch_data_samples)
+
+        losses = self.bbox_head.loss(
+            **head_inputs_dict, batch_data_samples=batch_data_samples)
+        return losses
+
+    def predict(self, batch_inputs, batch_data_samples, rescale: bool = True):
+        text_prompts = []
+        enhanced_text_prompts = []
+        tokens_positives = []
+        for data_samples in batch_data_samples:
+            text_prompts.append(data_samples.text)
+            if 'caption_prompt' in data_samples:
+                enhanced_text_prompts.append(data_samples.caption_prompt)
+            else:
+                enhanced_text_prompts.append(None)
+            tokens_positives.append(data_samples.get('tokens_positive', None))
+
+        if 'custom_entities' in batch_data_samples[0]:
+            # Assuming that the `custom_entities` flag
+            # inside a batch is always the same. For single image inference
+            custom_entities = batch_data_samples[0].custom_entities
+        else:
+            custom_entities = False
+        if len(text_prompts) == 1:
+            # All the text prompts are the same,
+            # so there is no need to calculate them multiple times.
+            _positive_maps_and_prompts = [
+                self.get_tokens_positive_and_prompts(
+                    text_prompts[0], custom_entities, enhanced_text_prompts[0],
+                    tokens_positives[0])
+            ] * len(batch_inputs)
+        else:
+            _positive_maps_and_prompts = [
+                self.get_tokens_positive_and_prompts(text_prompt,
+                                                     custom_entities,
+                                                     enhanced_text_prompt,
+                                                     tokens_positive)
+                for text_prompt, enhanced_text_prompt, tokens_positive in zip(
+                    text_prompts, enhanced_text_prompts, tokens_positives)
+            ]
+        token_positive_maps, text_prompts, _, entities = zip(
+            *_positive_maps_and_prompts)
+
+        # image feature extraction
+        visual_feats = self.extract_feat(batch_inputs)
+
+        if isinstance(text_prompts[0], list):
+            # chunked text prompts, only bs=1 is supported
+            assert len(batch_inputs) == 1
+            count = 0
+            results_list = []
+
+            entities = [[item for lst in entities[0] for item in lst]]
+
+            for b in range(len(text_prompts[0])):
+                text_prompts_once = [text_prompts[0][b]]
+                token_positive_maps_once = token_positive_maps[0][b]
+                text_dict = self.language_model(text_prompts_once)
+                # text feature map layer
+                if self.text_feat_map is not None:
+                    text_dict['embedded'] = self.text_feat_map(
+                        text_dict['embedded'])
+
+                batch_data_samples[
+                    0].token_positive_map = token_positive_maps_once
+
+                head_inputs_dict = self.forward_transformer(
+                    copy.deepcopy(visual_feats), text_dict, batch_data_samples)
+                pred_instances = self.bbox_head.predict(
+                    **head_inputs_dict,
+                    rescale=rescale,
+                    batch_data_samples=batch_data_samples)[0]
+
+                if len(pred_instances) > 0:
+                    pred_instances.labels += count
+                count += len(token_positive_maps_once)
+                results_list.append(pred_instances)
+            results_list = [results_list[0].cat(results_list)]
+            is_rec_tasks = [False] * len(results_list)
+        else:
+            # extract text feats
+            text_dict = self.language_model(list(text_prompts))
+            # text feature map layer
+            if self.text_feat_map is not None:
+                text_dict['embedded'] = self.text_feat_map(
+                    text_dict['embedded'])
+
+            is_rec_tasks = []
+            for i, data_samples in enumerate(batch_data_samples):
+                if token_positive_maps[i] is not None:
+                    is_rec_tasks.append(False)
+                else:
+                    is_rec_tasks.append(True)
+                data_samples.token_positive_map = token_positive_maps[i]
+
+            head_inputs_dict = self.forward_transformer(
+                visual_feats, text_dict, batch_data_samples)
+            results_list = self.bbox_head.predict(
+                **head_inputs_dict,
+                rescale=rescale,
+                batch_data_samples=batch_data_samples)
+
+        for data_sample, pred_instances, entity, is_rec_task in zip(
+                batch_data_samples, results_list, entities, is_rec_tasks):
+            if len(pred_instances) > 0:
+                label_names = []
+                for labels in pred_instances.labels:
+                    if is_rec_task:
+                        label_names.append(entity)
+                        continue
+                    if labels >= len(entity):
+                        warnings.warn(
+                            'The unexpected output indicates an issue with '
+                            'named entity recognition. You can try '
+                            'setting custom_entities=True and running '
+                            'again to see if it helps.')
+                        label_names.append('unobject')
+                    else:
+                        label_names.append(entity[labels])
+                # for visualization
+                pred_instances.label_names = label_names
+            data_sample.pred_instances = pred_instances
+        return batch_data_samples
diff --git a/mmde/mmdet/models/detectors/htc.py b/mmde/mmdet/models/detectors/htc.py
new file mode 100644
index 0000000000000000000000000000000000000000..22a2aa889a59fd0e0afeb95a7369028def6e4fa9
--- /dev/null
+++ b/mmde/mmdet/models/detectors/htc.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from .cascade_rcnn import CascadeRCNN
+
+
+@MODELS.register_module()
+class HybridTaskCascade(CascadeRCNN):
+    """Implementation of `HTC <https://arxiv.org/abs/1901.07518>`_"""
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+
+    @property
+    def with_semantic(self) -> bool:
+        """bool: whether the detector has a semantic head"""
+        return self.roi_head.with_semantic
diff --git a/mmde/mmdet/models/detectors/kd_one_stage.py b/mmde/mmdet/models/detectors/kd_one_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a4a1bb564c0f6e4cabe32a5c01cfea252ecfb7d
--- /dev/null
+++ b/mmde/mmdet/models/detectors/kd_one_stage.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from pathlib import Path
+from typing import Any, Optional, Union
+
+import torch
+import torch.nn as nn
+from mmengine.config import Config
+from mmengine.runner import load_checkpoint
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class KnowledgeDistillationSingleStageDetector(SingleStageDetector):
+    r"""Implementation of `Distilling the Knowledge in a Neural Network.
+    <https://arxiv.org/abs/1503.02531>`_.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        teacher_config (:obj:`ConfigDict` | dict | str | Path): Config file
+            path or the config object of teacher model.
+        teacher_ckpt (str, optional): Checkpoint path of teacher model.
+            If left as None, the model will not load any weights.
+            Defaults to True.
+        eval_teacher (bool): Set the train mode for teacher.
+            Defaults to True.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of ATSS. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of ATSS. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+    """
+
+    def __init__(
+        self,
+        backbone: ConfigType,
+        neck: ConfigType,
+        bbox_head: ConfigType,
+        teacher_config: Union[ConfigType, str, Path],
+        teacher_ckpt: Optional[str] = None,
+        eval_teacher: bool = True,
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        data_preprocessor: OptConfigType = None,
+    ) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor)
+        self.eval_teacher = eval_teacher
+        # Build teacher model
+        if isinstance(teacher_config, (str, Path)):
+            teacher_config = Config.fromfile(teacher_config)
+        self.teacher_model = MODELS.build(teacher_config['model'])
+        if teacher_ckpt is not None:
+            load_checkpoint(
+                self.teacher_model, teacher_ckpt, map_location='cpu')
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> dict:
+        """
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        x = self.extract_feat(batch_inputs)
+        with torch.no_grad():
+            teacher_x = self.teacher_model.extract_feat(batch_inputs)
+            out_teacher = self.teacher_model.bbox_head(teacher_x)
+        losses = self.bbox_head.loss(x, out_teacher, batch_data_samples)
+        return losses
+
+    def cuda(self, device: Optional[str] = None) -> nn.Module:
+        """Since teacher_model is registered as a plain object, it is necessary
+        to put the teacher model to cuda when calling ``cuda`` function."""
+        self.teacher_model.cuda(device=device)
+        return super().cuda(device=device)
+
+    def to(self, device: Optional[str] = None) -> nn.Module:
+        """Since teacher_model is registered as a plain object, it is necessary
+        to put the teacher model to other device when calling ``to``
+        function."""
+        self.teacher_model.to(device=device)
+        return super().to(device=device)
+
+    def train(self, mode: bool = True) -> None:
+        """Set the same train mode for teacher and student model."""
+        if self.eval_teacher:
+            self.teacher_model.train(False)
+        else:
+            self.teacher_model.train(mode)
+        super().train(mode)
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        """Set attribute, i.e. self.name = value
+
+        This reloading prevent the teacher model from being registered as a
+        nn.Module. The teacher module is registered as a plain object, so that
+        the teacher parameters will not show up when calling
+        ``self.parameters``, ``self.modules``, ``self.children`` methods.
+        """
+        if name == 'teacher_model':
+            object.__setattr__(self, name, value)
+        else:
+            super().__setattr__(name, value)
diff --git a/mmde/mmdet/models/detectors/lad.py b/mmde/mmdet/models/detectors/lad.py
new file mode 100644
index 0000000000000000000000000000000000000000..008f898772988715c67783d9218ff39c4dd95d80
--- /dev/null
+++ b/mmde/mmdet/models/detectors/lad.py
@@ -0,0 +1,93 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from mmengine.runner import load_checkpoint
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType
+from ..utils.misc import unpack_gt_instances
+from .kd_one_stage import KnowledgeDistillationSingleStageDetector
+
+
+@MODELS.register_module()
+class LAD(KnowledgeDistillationSingleStageDetector):
+    """Implementation of `LAD <https://arxiv.org/pdf/2108.10520.pdf>`_."""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 teacher_backbone: ConfigType,
+                 teacher_neck: ConfigType,
+                 teacher_bbox_head: ConfigType,
+                 teacher_ckpt: Optional[str] = None,
+                 eval_teacher: bool = True,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None) -> None:
+        super(KnowledgeDistillationSingleStageDetector, self).__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor)
+        self.eval_teacher = eval_teacher
+        self.teacher_model = nn.Module()
+        self.teacher_model.backbone = MODELS.build(teacher_backbone)
+        if teacher_neck is not None:
+            self.teacher_model.neck = MODELS.build(teacher_neck)
+        teacher_bbox_head.update(train_cfg=train_cfg)
+        teacher_bbox_head.update(test_cfg=test_cfg)
+        self.teacher_model.bbox_head = MODELS.build(teacher_bbox_head)
+        if teacher_ckpt is not None:
+            load_checkpoint(
+                self.teacher_model, teacher_ckpt, map_location='cpu')
+
+    @property
+    def with_teacher_neck(self) -> bool:
+        """bool: whether the detector has a teacher_neck"""
+        return hasattr(self.teacher_model, 'neck') and \
+            self.teacher_model.neck is not None
+
+    def extract_teacher_feat(self, batch_inputs: Tensor) -> Tensor:
+        """Directly extract teacher features from the backbone+neck."""
+        x = self.teacher_model.backbone(batch_inputs)
+        if self.with_teacher_neck:
+            x = self.teacher_model.neck(x)
+        return x
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> dict:
+        """
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \
+            = outputs
+        # get label assignment from the teacher
+        with torch.no_grad():
+            x_teacher = self.extract_teacher_feat(batch_inputs)
+            outs_teacher = self.teacher_model.bbox_head(x_teacher)
+            label_assignment_results = \
+                self.teacher_model.bbox_head.get_label_assignment(
+                    *outs_teacher, batch_gt_instances, batch_img_metas,
+                    batch_gt_instances_ignore)
+
+        # the student use the label assignment from the teacher to learn
+        x = self.extract_feat(batch_inputs)
+        losses = self.bbox_head.loss(x, label_assignment_results,
+                                     batch_data_samples)
+        return losses
diff --git a/mmde/mmdet/models/detectors/mask2former.py b/mmde/mmdet/models/detectors/mask2former.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f38ef44e482039fdf7476d048eee5df2a96fd9b
--- /dev/null
+++ b/mmde/mmdet/models/detectors/mask2former.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .maskformer import MaskFormer
+
+
+@MODELS.register_module()
+class Mask2Former(MaskFormer):
+    r"""Implementation of `Masked-attention Mask
+    Transformer for Universal Image Segmentation
+    <https://arxiv.org/pdf/2112.01527>`_."""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 panoptic_head: OptConfigType = None,
+                 panoptic_fusion_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            panoptic_head=panoptic_head,
+            panoptic_fusion_head=panoptic_fusion_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/mask_rcnn.py b/mmde/mmdet/models/detectors/mask_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..880ee1e8ac3926d618ef47985549d3214175ee73
--- /dev/null
+++ b/mmde/mmdet/models/detectors/mask_rcnn.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import ConfigDict
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class MaskRCNN(TwoStageDetector):
+    """Implementation of `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_"""
+
+    def __init__(self,
+                 backbone: ConfigDict,
+                 rpn_head: ConfigDict,
+                 roi_head: ConfigDict,
+                 train_cfg: ConfigDict,
+                 test_cfg: ConfigDict,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
diff --git a/mmde/mmdet/models/detectors/mask_scoring_rcnn.py b/mmde/mmdet/models/detectors/mask_scoring_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..e09d3a1041f929113962e42bdf8b169e52dabe25
--- /dev/null
+++ b/mmde/mmdet/models/detectors/mask_scoring_rcnn.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class MaskScoringRCNN(TwoStageDetector):
+    """Mask Scoring RCNN.
+
+    https://arxiv.org/abs/1903.00241
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 rpn_head: ConfigType,
+                 roi_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/maskformer.py b/mmde/mmdet/models/detectors/maskformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7493c00e1b87cf9b2fbd2c80f1e642f6eb2bea55
--- /dev/null
+++ b/mmde/mmdet/models/detectors/maskformer.py
@@ -0,0 +1,170 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class MaskFormer(SingleStageDetector):
+    r"""Implementation of `Per-Pixel Classification is
+    NOT All You Need for Semantic Segmentation
+    <https://arxiv.org/pdf/2107.06278>`_."""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 panoptic_head: OptConfigType = None,
+                 panoptic_fusion_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super(SingleStageDetector, self).__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+
+        panoptic_head_ = panoptic_head.deepcopy()
+        panoptic_head_.update(train_cfg=train_cfg)
+        panoptic_head_.update(test_cfg=test_cfg)
+        self.panoptic_head = MODELS.build(panoptic_head_)
+
+        panoptic_fusion_head_ = panoptic_fusion_head.deepcopy()
+        panoptic_fusion_head_.update(test_cfg=test_cfg)
+        self.panoptic_fusion_head = MODELS.build(panoptic_fusion_head_)
+
+        self.num_things_classes = self.panoptic_head.num_things_classes
+        self.num_stuff_classes = self.panoptic_head.num_stuff_classes
+        self.num_classes = self.panoptic_head.num_classes
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        x = self.extract_feat(batch_inputs)
+        losses = self.panoptic_head.loss(x, batch_data_samples)
+        return losses
+
+    def predict(self,
+                batch_inputs: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`DetDataSample`]: Detection results of the
+            input images. Each DetDataSample usually contain
+            'pred_instances' and `pred_panoptic_seg`. And the
+            ``pred_instances`` usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+
+            And the ``pred_panoptic_seg`` contains the following key
+
+                - sem_seg (Tensor): panoptic segmentation mask, has a
+                    shape (1, h, w).
+        """
+        feats = self.extract_feat(batch_inputs)
+        mask_cls_results, mask_pred_results = self.panoptic_head.predict(
+            feats, batch_data_samples)
+        results_list = self.panoptic_fusion_head.predict(
+            mask_cls_results,
+            mask_pred_results,
+            batch_data_samples,
+            rescale=rescale)
+        results = self.add_pred_to_datasample(batch_data_samples, results_list)
+
+        return results
+
+    def add_pred_to_datasample(self, data_samples: SampleList,
+                               results_list: List[dict]) -> SampleList:
+        """Add predictions to `DetDataSample`.
+
+        Args:
+            data_samples (list[:obj:`DetDataSample`], optional): A batch of
+                data samples that contain annotations and predictions.
+            results_list (List[dict]): Instance segmentation, segmantic
+                segmentation and panoptic segmentation results.
+
+        Returns:
+            list[:obj:`DetDataSample`]: Detection results of the
+            input images. Each DetDataSample usually contain
+            'pred_instances' and `pred_panoptic_seg`. And the
+            ``pred_instances`` usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+
+            And the ``pred_panoptic_seg`` contains the following key
+
+                - sem_seg (Tensor): panoptic segmentation mask, has a
+                    shape (1, h, w).
+        """
+        for data_sample, pred_results in zip(data_samples, results_list):
+            if 'pan_results' in pred_results:
+                data_sample.pred_panoptic_seg = pred_results['pan_results']
+
+            if 'ins_results' in pred_results:
+                data_sample.pred_instances = pred_results['ins_results']
+
+            assert 'sem_results' not in pred_results, 'segmantic ' \
+                'segmentation results are not supported yet.'
+
+        return data_samples
+
+    def _forward(self, batch_inputs: Tensor,
+                 batch_data_samples: SampleList) -> Tuple[List[Tensor]]:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+         Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            tuple[List[Tensor]]: A tuple of features from ``panoptic_head``
+            forward.
+        """
+        feats = self.extract_feat(batch_inputs)
+        results = self.panoptic_head.forward(feats, batch_data_samples)
+        return results
diff --git a/mmde/mmdet/models/detectors/nasfcos.py b/mmde/mmdet/models/detectors/nasfcos.py
new file mode 100644
index 0000000000000000000000000000000000000000..da2b911bcfc6b0ba51b00d9b3948a3df7af2e74f
--- /dev/null
+++ b/mmde/mmdet/models/detectors/nasfcos.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class NASFCOS(SingleStageDetector):
+    """Implementation of `NAS-FCOS: Fast Neural Architecture Search for Object
+    Detection. <https://arxiv.org/abs/1906.0442>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of NASFCOS. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of NASFCOS. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/paa.py b/mmde/mmdet/models/detectors/paa.py
new file mode 100644
index 0000000000000000000000000000000000000000..094306b2fbd18ba45536470ec80443e4ff793e67
--- /dev/null
+++ b/mmde/mmdet/models/detectors/paa.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class PAA(SingleStageDetector):
+    """Implementation of `PAA <https://arxiv.org/pdf/2007.08103.pdf>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of PAA. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of PAA. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/panoptic_fpn.py b/mmde/mmdet/models/detectors/panoptic_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae63ccc38931daa60b4e62f94dcf9f44574d3669
--- /dev/null
+++ b/mmde/mmdet/models/detectors/panoptic_fpn.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .panoptic_two_stage_segmentor import TwoStagePanopticSegmentor
+
+
+@MODELS.register_module()
+class PanopticFPN(TwoStagePanopticSegmentor):
+    r"""Implementation of `Panoptic feature pyramid
+    networks <https://arxiv.org/pdf/1901.02446>`_"""
+
+    def __init__(
+            self,
+            backbone: ConfigType,
+            neck: OptConfigType = None,
+            rpn_head: OptConfigType = None,
+            roi_head: OptConfigType = None,
+            train_cfg: OptConfigType = None,
+            test_cfg: OptConfigType = None,
+            data_preprocessor: OptConfigType = None,
+            init_cfg: OptMultiConfig = None,
+            # for panoptic segmentation
+            semantic_head: OptConfigType = None,
+            panoptic_fusion_head: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg,
+            semantic_head=semantic_head,
+            panoptic_fusion_head=panoptic_fusion_head)
diff --git a/mmde/mmdet/models/detectors/panoptic_two_stage_segmentor.py b/mmde/mmdet/models/detectors/panoptic_two_stage_segmentor.py
new file mode 100644
index 0000000000000000000000000000000000000000..879edbe1ac6a0f482fdd740f4058e508e728414d
--- /dev/null
+++ b/mmde/mmdet/models/detectors/panoptic_two_stage_segmentor.py
@@ -0,0 +1,234 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import List
+
+import torch
+from mmengine.structures import PixelData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class TwoStagePanopticSegmentor(TwoStageDetector):
+    """Base class of Two-stage Panoptic Segmentor.
+
+    As well as the components in TwoStageDetector, Panoptic Segmentor has extra
+    semantic_head and panoptic_fusion_head.
+    """
+
+    def __init__(
+            self,
+            backbone: ConfigType,
+            neck: OptConfigType = None,
+            rpn_head: OptConfigType = None,
+            roi_head: OptConfigType = None,
+            train_cfg: OptConfigType = None,
+            test_cfg: OptConfigType = None,
+            data_preprocessor: OptConfigType = None,
+            init_cfg: OptMultiConfig = None,
+            # for panoptic segmentation
+            semantic_head: OptConfigType = None,
+            panoptic_fusion_head: OptConfigType = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+
+        if semantic_head is not None:
+            self.semantic_head = MODELS.build(semantic_head)
+
+        if panoptic_fusion_head is not None:
+            panoptic_cfg = test_cfg.panoptic if test_cfg is not None else None
+            panoptic_fusion_head_ = panoptic_fusion_head.deepcopy()
+            panoptic_fusion_head_.update(test_cfg=panoptic_cfg)
+            self.panoptic_fusion_head = MODELS.build(panoptic_fusion_head_)
+
+            self.num_things_classes = self.panoptic_fusion_head.\
+                num_things_classes
+            self.num_stuff_classes = self.panoptic_fusion_head.\
+                num_stuff_classes
+            self.num_classes = self.panoptic_fusion_head.num_classes
+
+    @property
+    def with_semantic_head(self) -> bool:
+        """bool: whether the detector has semantic head"""
+        return hasattr(self,
+                       'semantic_head') and self.semantic_head is not None
+
+    @property
+    def with_panoptic_fusion_head(self) -> bool:
+        """bool: whether the detector has panoptic fusion head"""
+        return hasattr(self, 'panoptic_fusion_head') and \
+            self.panoptic_fusion_head is not None
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> dict:
+        """
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        x = self.extract_feat(batch_inputs)
+
+        losses = dict()
+
+        # RPN forward and loss
+        if self.with_rpn:
+            proposal_cfg = self.train_cfg.get('rpn_proposal',
+                                              self.test_cfg.rpn)
+            rpn_data_samples = copy.deepcopy(batch_data_samples)
+            # set cat_id of gt_labels to 0 in RPN
+            for data_sample in rpn_data_samples:
+                data_sample.gt_instances.labels = \
+                    torch.zeros_like(data_sample.gt_instances.labels)
+
+            rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict(
+                x, rpn_data_samples, proposal_cfg=proposal_cfg)
+            # avoid get same name with roi_head loss
+            keys = rpn_losses.keys()
+            for key in list(keys):
+                if 'loss' in key and 'rpn' not in key:
+                    rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
+            losses.update(rpn_losses)
+        else:
+            # TODO: Not support currently, should have a check at Fast R-CNN
+            assert batch_data_samples[0].get('proposals', None) is not None
+            # use pre-defined proposals in InstanceData for the second stage
+            # to extract ROI features.
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        roi_losses = self.roi_head.loss(x, rpn_results_list,
+                                        batch_data_samples)
+        losses.update(roi_losses)
+
+        semantic_loss = self.semantic_head.loss(x, batch_data_samples)
+        losses.update(semantic_loss)
+
+        return losses
+
+    def predict(self,
+                batch_inputs: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            List[:obj:`DetDataSample`]: Return the packed panoptic segmentation
+                results of input images. Each DetDataSample usually contains
+                'pred_panoptic_seg'. And the 'pred_panoptic_seg' has a key
+                ``sem_seg``, which is a tensor of shape (1, h, w).
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        x = self.extract_feat(batch_inputs)
+
+        # If there are no pre-defined proposals, use RPN to get proposals
+        if batch_data_samples[0].get('proposals', None) is None:
+            rpn_results_list = self.rpn_head.predict(
+                x, batch_data_samples, rescale=False)
+        else:
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        results_list = self.roi_head.predict(
+            x, rpn_results_list, batch_data_samples, rescale=rescale)
+
+        seg_preds = self.semantic_head.predict(x, batch_img_metas, rescale)
+
+        results_list = self.panoptic_fusion_head.predict(
+            results_list, seg_preds)
+
+        batch_data_samples = self.add_pred_to_datasample(
+            batch_data_samples, results_list)
+        return batch_data_samples
+
+    # TODO the code has not been verified and needs to be refactored later.
+    def _forward(self, batch_inputs: Tensor,
+                 batch_data_samples: SampleList) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+
+        Returns:
+            tuple: A tuple of features from ``rpn_head``, ``roi_head`` and
+                ``semantic_head`` forward.
+        """
+        results = ()
+        x = self.extract_feat(batch_inputs)
+        rpn_outs = self.rpn_head.forward(x)
+        results = results + (rpn_outs)
+
+        # If there are no pre-defined proposals, use RPN to get proposals
+        if batch_data_samples[0].get('proposals', None) is None:
+            batch_img_metas = [
+                data_samples.metainfo for data_samples in batch_data_samples
+            ]
+            rpn_results_list = self.rpn_head.predict_by_feat(
+                *rpn_outs, batch_img_metas=batch_img_metas, rescale=False)
+        else:
+            # TODO: Not checked currently.
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        # roi_head
+        roi_outs = self.roi_head(x, rpn_results_list)
+        results = results + (roi_outs)
+
+        # semantic_head
+        sem_outs = self.semantic_head.forward(x)
+        results = results + (sem_outs['seg_preds'], )
+
+        return results
+
+    def add_pred_to_datasample(self, data_samples: SampleList,
+                               results_list: List[PixelData]) -> SampleList:
+        """Add predictions to `DetDataSample`.
+
+        Args:
+            data_samples (list[:obj:`DetDataSample`]): The
+                annotation data of every samples.
+            results_list (List[PixelData]): Panoptic segmentation results of
+                each image.
+
+        Returns:
+            List[:obj:`DetDataSample`]: Return the packed panoptic segmentation
+                results of input images. Each DetDataSample usually contains
+                'pred_panoptic_seg'. And the 'pred_panoptic_seg' has a key
+                ``sem_seg``, which is a tensor of shape (1, h, w).
+        """
+
+        for data_sample, pred_panoptic_seg in zip(data_samples, results_list):
+            data_sample.pred_panoptic_seg = pred_panoptic_seg
+        return data_samples
diff --git a/mmde/mmdet/models/detectors/point_rend.py b/mmde/mmdet/models/detectors/point_rend.py
new file mode 100644
index 0000000000000000000000000000000000000000..5062ac0c945e79bd53e66e1642aec51113475cad
--- /dev/null
+++ b/mmde/mmdet/models/detectors/point_rend.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import ConfigDict
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class PointRend(TwoStageDetector):
+    """PointRend: Image Segmentation as Rendering
+
+    This detector is the implementation of
+    `PointRend <https://arxiv.org/abs/1912.08193>`_.
+
+    """
+
+    def __init__(self,
+                 backbone: ConfigDict,
+                 rpn_head: ConfigDict,
+                 roi_head: ConfigDict,
+                 train_cfg: ConfigDict,
+                 test_cfg: ConfigDict,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
diff --git a/mmde/mmdet/models/detectors/queryinst.py b/mmde/mmdet/models/detectors/queryinst.py
new file mode 100644
index 0000000000000000000000000000000000000000..400ce20c01f5c3825e343f2d32accf740c5dd55c
--- /dev/null
+++ b/mmde/mmdet/models/detectors/queryinst.py
@@ -0,0 +1,29 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .sparse_rcnn import SparseRCNN
+
+
+@MODELS.register_module()
+class QueryInst(SparseRCNN):
+    r"""Implementation of
+    `Instances as Queries <http://arxiv.org/abs/2105.01928>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 rpn_head: ConfigType,
+                 roi_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/reppoints_detector.py b/mmde/mmdet/models/detectors/reppoints_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..d86cec2ecda0671939e227c50f00379e81d3ac9c
--- /dev/null
+++ b/mmde/mmdet/models/detectors/reppoints_detector.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class RepPointsDetector(SingleStageDetector):
+    """RepPoints: Point Set Representation for Object Detection.
+
+        This detector is the implementation of:
+        - RepPoints detector (https://arxiv.org/pdf/1904.11490)
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/retinanet.py b/mmde/mmdet/models/detectors/retinanet.py
new file mode 100644
index 0000000000000000000000000000000000000000..03e3cb20e5bda603e9384d83688a56fa590e6de8
--- /dev/null
+++ b/mmde/mmdet/models/detectors/retinanet.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class RetinaNet(SingleStageDetector):
+    """Implementation of `RetinaNet <https://arxiv.org/abs/1708.02002>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/rpn.py b/mmde/mmdet/models/detectors/rpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..72fe8521fcc9bc796801b2dd68269bb57aaab984
--- /dev/null
+++ b/mmde/mmdet/models/detectors/rpn.py
@@ -0,0 +1,81 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class RPN(SingleStageDetector):
+    """Implementation of Region Proposal Network.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 rpn_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        super(SingleStageDetector, self).__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        self.neck = MODELS.build(neck) if neck is not None else None
+        rpn_train_cfg = train_cfg['rpn'] if train_cfg is not None else None
+        rpn_head_num_classes = rpn_head.get('num_classes', 1)
+        if rpn_head_num_classes != 1:
+            warnings.warn('The `num_classes` should be 1 in RPN, but get '
+                          f'{rpn_head_num_classes}, please set '
+                          'rpn_head.num_classes = 1 in your config file.')
+            rpn_head.update(num_classes=1)
+        rpn_head.update(train_cfg=rpn_train_cfg)
+        rpn_head.update(test_cfg=test_cfg['rpn'])
+        self.bbox_head = MODELS.build(rpn_head)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> dict:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        x = self.extract_feat(batch_inputs)
+
+        # set cat_id of gt_labels to 0 in RPN
+        rpn_data_samples = copy.deepcopy(batch_data_samples)
+        for data_sample in rpn_data_samples:
+            data_sample.gt_instances.labels = \
+                torch.zeros_like(data_sample.gt_instances.labels)
+
+        losses = self.bbox_head.loss(x, rpn_data_samples)
+        return losses
diff --git a/mmde/mmdet/models/detectors/rtmdet.py b/mmde/mmdet/models/detectors/rtmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..b43e053fc41a4b8400bbc0946fffedfa735b9451
--- /dev/null
+++ b/mmde/mmdet/models/detectors/rtmdet.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmengine.dist import get_world_size
+from mmengine.logging import print_log
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class RTMDet(SingleStageDetector):
+    """Implementation of RTMDet.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of ATSS. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of ATSS. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+        use_syncbn (bool): Whether to use SyncBatchNorm. Defaults to True.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 use_syncbn: bool = True) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+
+        # TODO: Waiting for mmengine support
+        if use_syncbn and get_world_size() > 1:
+            torch.nn.SyncBatchNorm.convert_sync_batchnorm(self)
+            print_log('Using SyncBatchNorm()', 'current')
diff --git a/mmde/mmdet/models/detectors/scnet.py b/mmde/mmdet/models/detectors/scnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..606a0203869f1731a21d811f06c4781f5cd90d8d
--- /dev/null
+++ b/mmde/mmdet/models/detectors/scnet.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from .cascade_rcnn import CascadeRCNN
+
+
+@MODELS.register_module()
+class SCNet(CascadeRCNN):
+    """Implementation of `SCNet <https://arxiv.org/abs/2012.10150>`_"""
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
diff --git a/mmde/mmdet/models/detectors/semi_base.py b/mmde/mmdet/models/detectors/semi_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3f0c8c030830e188bf3ad245d5b3cb471ecb04f
--- /dev/null
+++ b/mmde/mmdet/models/detectors/semi_base.py
@@ -0,0 +1,266 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from mmdet.models.utils import (filter_gt_instances, rename_loss_dict,
+                                reweight_loss_dict)
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox_project
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .base import BaseDetector
+
+
+@MODELS.register_module()
+class SemiBaseDetector(BaseDetector):
+    """Base class for semi-supervised detectors.
+
+    Semi-supervised detectors typically consisting of a teacher model
+    updated by exponential moving average and a student model updated
+    by gradient descent.
+
+    Args:
+        detector (:obj:`ConfigDict` or dict): The detector config.
+        semi_train_cfg (:obj:`ConfigDict` or dict, optional):
+            The semi-supervised training config.
+        semi_test_cfg (:obj:`ConfigDict` or dict, optional):
+            The semi-supervised testing config.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 detector: ConfigType,
+                 semi_train_cfg: OptConfigType = None,
+                 semi_test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.student = MODELS.build(detector)
+        self.teacher = MODELS.build(detector)
+        self.semi_train_cfg = semi_train_cfg
+        self.semi_test_cfg = semi_test_cfg
+        if self.semi_train_cfg.get('freeze_teacher', True) is True:
+            self.freeze(self.teacher)
+
+    @staticmethod
+    def freeze(model: nn.Module):
+        """Freeze the model."""
+        model.eval()
+        for param in model.parameters():
+            param.requires_grad = False
+
+    def loss(self, multi_batch_inputs: Dict[str, Tensor],
+             multi_batch_data_samples: Dict[str, SampleList]) -> dict:
+        """Calculate losses from multi-branch inputs and data samples.
+
+        Args:
+            multi_batch_inputs (Dict[str, Tensor]): The dict of multi-branch
+                input images, each value with shape (N, C, H, W).
+                Each value should usually be mean centered and std scaled.
+            multi_batch_data_samples (Dict[str, List[:obj:`DetDataSample`]]):
+                The dict of multi-branch data samples.
+
+        Returns:
+            dict: A dictionary of loss components
+        """
+        losses = dict()
+        losses.update(**self.loss_by_gt_instances(
+            multi_batch_inputs['sup'], multi_batch_data_samples['sup']))
+
+        origin_pseudo_data_samples, batch_info = self.get_pseudo_instances(
+            multi_batch_inputs['unsup_teacher'],
+            multi_batch_data_samples['unsup_teacher'])
+        multi_batch_data_samples[
+            'unsup_student'] = self.project_pseudo_instances(
+                origin_pseudo_data_samples,
+                multi_batch_data_samples['unsup_student'])
+        losses.update(**self.loss_by_pseudo_instances(
+            multi_batch_inputs['unsup_student'],
+            multi_batch_data_samples['unsup_student'], batch_info))
+        return losses
+
+    def loss_by_gt_instances(self, batch_inputs: Tensor,
+                             batch_data_samples: SampleList) -> dict:
+        """Calculate losses from a batch of inputs and ground-truth data
+        samples.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components
+        """
+
+        losses = self.student.loss(batch_inputs, batch_data_samples)
+        sup_weight = self.semi_train_cfg.get('sup_weight', 1.)
+        return rename_loss_dict('sup_', reweight_loss_dict(losses, sup_weight))
+
+    def loss_by_pseudo_instances(self,
+                                 batch_inputs: Tensor,
+                                 batch_data_samples: SampleList,
+                                 batch_info: Optional[dict] = None) -> dict:
+        """Calculate losses from a batch of inputs and pseudo data samples.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`,
+                which are `pseudo_instance` or `pseudo_panoptic_seg`
+                or `pseudo_sem_seg` in fact.
+            batch_info (dict): Batch information of teacher model
+                forward propagation process. Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components
+        """
+        batch_data_samples = filter_gt_instances(
+            batch_data_samples, score_thr=self.semi_train_cfg.cls_pseudo_thr)
+        losses = self.student.loss(batch_inputs, batch_data_samples)
+        pseudo_instances_num = sum([
+            len(data_samples.gt_instances)
+            for data_samples in batch_data_samples
+        ])
+        unsup_weight = self.semi_train_cfg.get(
+            'unsup_weight', 1.) if pseudo_instances_num > 0 else 0.
+        return rename_loss_dict('unsup_',
+                                reweight_loss_dict(losses, unsup_weight))
+
+    @torch.no_grad()
+    def get_pseudo_instances(
+            self, batch_inputs: Tensor, batch_data_samples: SampleList
+    ) -> Tuple[SampleList, Optional[dict]]:
+        """Get pseudo instances from teacher model."""
+        self.teacher.eval()
+        results_list = self.teacher.predict(
+            batch_inputs, batch_data_samples, rescale=False)
+        batch_info = {}
+        for data_samples, results in zip(batch_data_samples, results_list):
+            data_samples.gt_instances = results.pred_instances
+            data_samples.gt_instances.bboxes = bbox_project(
+                data_samples.gt_instances.bboxes,
+                torch.from_numpy(data_samples.homography_matrix).inverse().to(
+                    self.data_preprocessor.device), data_samples.ori_shape)
+        return batch_data_samples, batch_info
+
+    def project_pseudo_instances(self, batch_pseudo_instances: SampleList,
+                                 batch_data_samples: SampleList) -> SampleList:
+        """Project pseudo instances."""
+        for pseudo_instances, data_samples in zip(batch_pseudo_instances,
+                                                  batch_data_samples):
+            data_samples.gt_instances = copy.deepcopy(
+                pseudo_instances.gt_instances)
+            data_samples.gt_instances.bboxes = bbox_project(
+                data_samples.gt_instances.bboxes,
+                torch.tensor(data_samples.homography_matrix).to(
+                    self.data_preprocessor.device), data_samples.img_shape)
+        wh_thr = self.semi_train_cfg.get('min_pseudo_bbox_wh', (1e-2, 1e-2))
+        return filter_gt_instances(batch_data_samples, wh_thr=wh_thr)
+
+    def predict(self, batch_inputs: Tensor,
+                batch_data_samples: SampleList) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`DetDataSample`]: Return the detection results of the
+            input images. The returns value is DetDataSample,
+            which usually contain 'pred_instances'. And the
+            ``pred_instances`` usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        if self.semi_test_cfg.get('predict_on', 'teacher') == 'teacher':
+            return self.teacher(
+                batch_inputs, batch_data_samples, mode='predict')
+        else:
+            return self.student(
+                batch_inputs, batch_data_samples, mode='predict')
+
+    def _forward(self, batch_inputs: Tensor,
+                 batch_data_samples: SampleList) -> SampleList:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+
+        Returns:
+            tuple: A tuple of features from ``rpn_head`` and ``roi_head``
+            forward.
+        """
+        if self.semi_test_cfg.get('forward_on', 'teacher') == 'teacher':
+            return self.teacher(
+                batch_inputs, batch_data_samples, mode='tensor')
+        else:
+            return self.student(
+                batch_inputs, batch_data_samples, mode='tensor')
+
+    def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]:
+        """Extract features.
+
+        Args:
+            batch_inputs (Tensor): Image tensor with shape (N, C, H ,W).
+
+        Returns:
+            tuple[Tensor]: Multi-level features that may have
+            different resolutions.
+        """
+        if self.semi_test_cfg.get('extract_feat_on', 'teacher') == 'teacher':
+            return self.teacher.extract_feat(batch_inputs)
+        else:
+            return self.student.extract_feat(batch_inputs)
+
+    def _load_from_state_dict(self, state_dict: dict, prefix: str,
+                              local_metadata: dict, strict: bool,
+                              missing_keys: Union[List[str], str],
+                              unexpected_keys: Union[List[str], str],
+                              error_msgs: Union[List[str], str]) -> None:
+        """Add teacher and student prefixes to model parameter names."""
+        if not any([
+                'student' in key or 'teacher' in key
+                for key in state_dict.keys()
+        ]):
+            keys = list(state_dict.keys())
+            state_dict.update({'teacher.' + k: state_dict[k] for k in keys})
+            state_dict.update({'student.' + k: state_dict[k] for k in keys})
+            for k in keys:
+                state_dict.pop(k)
+        return super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
diff --git a/mmde/mmdet/models/detectors/single_stage.py b/mmde/mmdet/models/detectors/single_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..06c074085967bbc9040d93e5eb446b67a006087e
--- /dev/null
+++ b/mmde/mmdet/models/detectors/single_stage.py
@@ -0,0 +1,149 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptSampleList, SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .base import BaseDetector
+
+
+@MODELS.register_module()
+class SingleStageDetector(BaseDetector):
+    """Base class for single-stage detectors.
+
+    Single-stage detectors directly and densely predict bounding boxes on the
+    output features of the backbone+neck.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+        bbox_head.update(train_cfg=train_cfg)
+        bbox_head.update(test_cfg=test_cfg)
+        self.bbox_head = MODELS.build(bbox_head)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def _load_from_state_dict(self, state_dict: dict, prefix: str,
+                              local_metadata: dict, strict: bool,
+                              missing_keys: Union[List[str], str],
+                              unexpected_keys: Union[List[str], str],
+                              error_msgs: Union[List[str], str]) -> None:
+        """Exchange bbox_head key to rpn_head key when loading two-stage
+        weights into single-stage model."""
+        bbox_head_prefix = prefix + '.bbox_head' if prefix else 'bbox_head'
+        bbox_head_keys = [
+            k for k in state_dict.keys() if k.startswith(bbox_head_prefix)
+        ]
+        rpn_head_prefix = prefix + '.rpn_head' if prefix else 'rpn_head'
+        rpn_head_keys = [
+            k for k in state_dict.keys() if k.startswith(rpn_head_prefix)
+        ]
+        if len(bbox_head_keys) == 0 and len(rpn_head_keys) != 0:
+            for rpn_head_key in rpn_head_keys:
+                bbox_head_key = bbox_head_prefix + \
+                                rpn_head_key[len(rpn_head_prefix):]
+                state_dict[bbox_head_key] = state_dict.pop(rpn_head_key)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata,
+                                      strict, missing_keys, unexpected_keys,
+                                      error_msgs)
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> Union[dict, list]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        x = self.extract_feat(batch_inputs)
+        losses = self.bbox_head.loss(x, batch_data_samples)
+        return losses
+
+    def predict(self,
+                batch_inputs: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`DetDataSample`]: Detection results of the
+            input images. Each DetDataSample usually contain
+            'pred_instances'. And the ``pred_instances`` usually
+            contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        x = self.extract_feat(batch_inputs)
+        results_list = self.bbox_head.predict(
+            x, batch_data_samples, rescale=rescale)
+        batch_data_samples = self.add_pred_to_datasample(
+            batch_data_samples, results_list)
+        return batch_data_samples
+
+    def _forward(
+            self,
+            batch_inputs: Tensor,
+            batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+         Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+
+        Returns:
+            tuple[list]: A tuple of features from ``bbox_head`` forward.
+        """
+        x = self.extract_feat(batch_inputs)
+        results = self.bbox_head.forward(x)
+        return results
+
+    def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]:
+        """Extract features.
+
+        Args:
+            batch_inputs (Tensor): Image tensor with shape (N, C, H ,W).
+
+        Returns:
+            tuple[Tensor]: Multi-level features that may have
+            different resolutions.
+        """
+        x = self.backbone(batch_inputs)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
diff --git a/mmde/mmdet/models/detectors/single_stage_instance_seg.py b/mmde/mmdet/models/detectors/single_stage_instance_seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..acb5f0d2f8e4636b86b4b66cbf5c4916d0dae16f
--- /dev/null
+++ b/mmde/mmdet/models/detectors/single_stage_instance_seg.py
@@ -0,0 +1,180 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Tuple
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptSampleList, SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .base import BaseDetector
+
+INF = 1e8
+
+
+@MODELS.register_module()
+class SingleStageInstanceSegmentor(BaseDetector):
+    """Base class for single-stage instance segmentors."""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 mask_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+        else:
+            self.neck = None
+        if bbox_head is not None:
+            bbox_head.update(train_cfg=copy.deepcopy(train_cfg))
+            bbox_head.update(test_cfg=copy.deepcopy(test_cfg))
+            self.bbox_head = MODELS.build(bbox_head)
+        else:
+            self.bbox_head = None
+
+        assert mask_head, f'`mask_head` must ' \
+                          f'be implemented in {self.__class__.__name__}'
+        mask_head.update(train_cfg=copy.deepcopy(train_cfg))
+        mask_head.update(test_cfg=copy.deepcopy(test_cfg))
+        self.mask_head = MODELS.build(mask_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]:
+        """Extract features.
+
+        Args:
+            batch_inputs (Tensor): Image tensor with shape (N, C, H ,W).
+
+        Returns:
+            tuple[Tensor]: Multi-level features that may have different
+            resolutions.
+        """
+        x = self.backbone(batch_inputs)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def _forward(self,
+                 batch_inputs: Tensor,
+                 batch_data_samples: OptSampleList = None,
+                 **kwargs) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+         Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+
+        Returns:
+            tuple: A tuple of features from ``bbox_head`` forward.
+        """
+        outs = ()
+        # backbone
+        x = self.extract_feat(batch_inputs)
+        # bbox_head
+        positive_infos = None
+        if self.with_bbox:
+            assert batch_data_samples is not None
+            bbox_outs = self.bbox_head.forward(x)
+            outs = outs + (bbox_outs, )
+            # It is necessary to use `bbox_head.loss` to update
+            # `_raw_positive_infos` which will be used in `get_positive_infos`
+            # positive_infos will be used in the following mask head.
+            _ = self.bbox_head.loss(x, batch_data_samples, **kwargs)
+            positive_infos = self.bbox_head.get_positive_infos()
+        # mask_head
+        if positive_infos is None:
+            mask_outs = self.mask_head.forward(x)
+        else:
+            mask_outs = self.mask_head.forward(x, positive_infos)
+        outs = outs + (mask_outs, )
+        return outs
+
+    def loss(self, batch_inputs: Tensor, batch_data_samples: SampleList,
+             **kwargs) -> dict:
+        """
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        x = self.extract_feat(batch_inputs)
+        losses = dict()
+
+        positive_infos = None
+        # CondInst and YOLACT have bbox_head
+        if self.with_bbox:
+            bbox_losses = self.bbox_head.loss(x, batch_data_samples, **kwargs)
+            losses.update(bbox_losses)
+            # get positive information from bbox head, which will be used
+            # in the following mask head.
+            positive_infos = self.bbox_head.get_positive_infos()
+
+        mask_loss = self.mask_head.loss(
+            x, batch_data_samples, positive_infos=positive_infos, **kwargs)
+        # avoid loss override
+        assert not set(mask_loss.keys()) & set(losses.keys())
+
+        losses.update(mask_loss)
+        return losses
+
+    def predict(self,
+                batch_inputs: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True,
+                **kwargs) -> SampleList:
+        """Perform forward propagation of the mask head and predict mask
+        results on the features of the upstream network.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`DetDataSample`]: Detection results of the
+            input images. Each DetDataSample usually contain
+            'pred_instances'. And the ``pred_instances`` usually
+            contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+                (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+                the last dimension 4 arrange as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        x = self.extract_feat(batch_inputs)
+        if self.with_bbox:
+            # the bbox branch does not need to be scaled to the original
+            # image scale, because the mask branch will scale both bbox
+            # and mask at the same time.
+            bbox_rescale = rescale if not self.with_mask else False
+            results_list = self.bbox_head.predict(
+                x, batch_data_samples, rescale=bbox_rescale)
+        else:
+            results_list = None
+
+        results_list = self.mask_head.predict(
+            x, batch_data_samples, rescale=rescale, results_list=results_list)
+
+        batch_data_samples = self.add_pred_to_datasample(
+            batch_data_samples, results_list)
+        return batch_data_samples
diff --git a/mmde/mmdet/models/detectors/soft_teacher.py b/mmde/mmdet/models/detectors/soft_teacher.py
new file mode 100644
index 0000000000000000000000000000000000000000..80853f1d8399c70008923067777a2581671ede0b
--- /dev/null
+++ b/mmde/mmdet/models/detectors/soft_teacher.py
@@ -0,0 +1,378 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import List, Optional, Tuple
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.utils import (filter_gt_instances, rename_loss_dict,
+                                reweight_loss_dict)
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi, bbox_project
+from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig
+from ..utils.misc import unpack_gt_instances
+from .semi_base import SemiBaseDetector
+
+
+@MODELS.register_module()
+class SoftTeacher(SemiBaseDetector):
+    r"""Implementation of `End-to-End Semi-Supervised Object Detection
+    with Soft Teacher <https://arxiv.org/abs/2106.09018>`_
+
+    Args:
+        detector (:obj:`ConfigDict` or dict): The detector config.
+        semi_train_cfg (:obj:`ConfigDict` or dict, optional):
+            The semi-supervised training config.
+        semi_test_cfg (:obj:`ConfigDict` or dict, optional):
+            The semi-supervised testing config.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 detector: ConfigType,
+                 semi_train_cfg: OptConfigType = None,
+                 semi_test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            detector=detector,
+            semi_train_cfg=semi_train_cfg,
+            semi_test_cfg=semi_test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+
+    def loss_by_pseudo_instances(self,
+                                 batch_inputs: Tensor,
+                                 batch_data_samples: SampleList,
+                                 batch_info: Optional[dict] = None) -> dict:
+        """Calculate losses from a batch of inputs and pseudo data samples.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`,
+                which are `pseudo_instance` or `pseudo_panoptic_seg`
+                or `pseudo_sem_seg` in fact.
+            batch_info (dict): Batch information of teacher model
+                forward propagation process. Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components
+        """
+
+        x = self.student.extract_feat(batch_inputs)
+
+        losses = {}
+        rpn_losses, rpn_results_list = self.rpn_loss_by_pseudo_instances(
+            x, batch_data_samples)
+        losses.update(**rpn_losses)
+        losses.update(**self.rcnn_cls_loss_by_pseudo_instances(
+            x, rpn_results_list, batch_data_samples, batch_info))
+        losses.update(**self.rcnn_reg_loss_by_pseudo_instances(
+            x, rpn_results_list, batch_data_samples))
+        unsup_weight = self.semi_train_cfg.get('unsup_weight', 1.)
+        return rename_loss_dict('unsup_',
+                                reweight_loss_dict(losses, unsup_weight))
+
+    @torch.no_grad()
+    def get_pseudo_instances(
+            self, batch_inputs: Tensor, batch_data_samples: SampleList
+    ) -> Tuple[SampleList, Optional[dict]]:
+        """Get pseudo instances from teacher model."""
+        assert self.teacher.with_bbox, 'Bbox head must be implemented.'
+        x = self.teacher.extract_feat(batch_inputs)
+
+        # If there are no pre-defined proposals, use RPN to get proposals
+        if batch_data_samples[0].get('proposals', None) is None:
+            rpn_results_list = self.teacher.rpn_head.predict(
+                x, batch_data_samples, rescale=False)
+        else:
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        results_list = self.teacher.roi_head.predict(
+            x, rpn_results_list, batch_data_samples, rescale=False)
+
+        for data_samples, results in zip(batch_data_samples, results_list):
+            data_samples.gt_instances = results
+
+        batch_data_samples = filter_gt_instances(
+            batch_data_samples,
+            score_thr=self.semi_train_cfg.pseudo_label_initial_score_thr)
+
+        reg_uncs_list = self.compute_uncertainty_with_aug(
+            x, batch_data_samples)
+
+        for data_samples, reg_uncs in zip(batch_data_samples, reg_uncs_list):
+            data_samples.gt_instances['reg_uncs'] = reg_uncs
+            data_samples.gt_instances.bboxes = bbox_project(
+                data_samples.gt_instances.bboxes,
+                torch.from_numpy(data_samples.homography_matrix).inverse().to(
+                    self.data_preprocessor.device), data_samples.ori_shape)
+
+        batch_info = {
+            'feat': x,
+            'img_shape': [],
+            'homography_matrix': [],
+            'metainfo': []
+        }
+        for data_samples in batch_data_samples:
+            batch_info['img_shape'].append(data_samples.img_shape)
+            batch_info['homography_matrix'].append(
+                torch.from_numpy(data_samples.homography_matrix).to(
+                    self.data_preprocessor.device))
+            batch_info['metainfo'].append(data_samples.metainfo)
+        return batch_data_samples, batch_info
+
+    def rpn_loss_by_pseudo_instances(self, x: Tuple[Tensor],
+                                     batch_data_samples: SampleList) -> dict:
+        """Calculate rpn loss from a batch of inputs and pseudo data samples.
+
+        Args:
+            x (tuple[Tensor]): Features from FPN.
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`,
+                which are `pseudo_instance` or `pseudo_panoptic_seg`
+                or `pseudo_sem_seg` in fact.
+        Returns:
+            dict: A dictionary of rpn loss components
+        """
+
+        rpn_data_samples = copy.deepcopy(batch_data_samples)
+        rpn_data_samples = filter_gt_instances(
+            rpn_data_samples, score_thr=self.semi_train_cfg.rpn_pseudo_thr)
+        proposal_cfg = self.student.train_cfg.get('rpn_proposal',
+                                                  self.student.test_cfg.rpn)
+        # set cat_id of gt_labels to 0 in RPN
+        for data_sample in rpn_data_samples:
+            data_sample.gt_instances.labels = \
+                torch.zeros_like(data_sample.gt_instances.labels)
+
+        rpn_losses, rpn_results_list = self.student.rpn_head.loss_and_predict(
+            x, rpn_data_samples, proposal_cfg=proposal_cfg)
+        for key in rpn_losses.keys():
+            if 'loss' in key and 'rpn' not in key:
+                rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
+        return rpn_losses, rpn_results_list
+
+    def rcnn_cls_loss_by_pseudo_instances(self, x: Tuple[Tensor],
+                                          unsup_rpn_results_list: InstanceList,
+                                          batch_data_samples: SampleList,
+                                          batch_info: dict) -> dict:
+        """Calculate classification loss from a batch of inputs and pseudo data
+        samples.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            unsup_rpn_results_list (list[:obj:`InstanceData`]):
+                List of region proposals.
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`,
+                which are `pseudo_instance` or `pseudo_panoptic_seg`
+                or `pseudo_sem_seg` in fact.
+            batch_info (dict): Batch information of teacher model
+                forward propagation process.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of rcnn
+                classification loss components
+        """
+        rpn_results_list = copy.deepcopy(unsup_rpn_results_list)
+        cls_data_samples = copy.deepcopy(batch_data_samples)
+        cls_data_samples = filter_gt_instances(
+            cls_data_samples, score_thr=self.semi_train_cfg.cls_pseudo_thr)
+
+        outputs = unpack_gt_instances(cls_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, _ = outputs
+
+        # assign gts and sample proposals
+        num_imgs = len(cls_data_samples)
+        sampling_results = []
+        for i in range(num_imgs):
+            # rename rpn_results.bboxes to rpn_results.priors
+            rpn_results = rpn_results_list[i]
+            rpn_results.priors = rpn_results.pop('bboxes')
+            assign_result = self.student.roi_head.bbox_assigner.assign(
+                rpn_results, batch_gt_instances[i],
+                batch_gt_instances_ignore[i])
+            sampling_result = self.student.roi_head.bbox_sampler.sample(
+                assign_result,
+                rpn_results,
+                batch_gt_instances[i],
+                feats=[lvl_feat[i][None] for lvl_feat in x])
+            sampling_results.append(sampling_result)
+
+        selected_bboxes = [res.priors for res in sampling_results]
+        rois = bbox2roi(selected_bboxes)
+        bbox_results = self.student.roi_head._bbox_forward(x, rois)
+        # cls_reg_targets is a tuple of labels, label_weights,
+        # and bbox_targets, bbox_weights
+        cls_reg_targets = self.student.roi_head.bbox_head.get_targets(
+            sampling_results, self.student.train_cfg.rcnn)
+
+        selected_results_list = []
+        for bboxes, data_samples, teacher_matrix, teacher_img_shape in zip(
+                selected_bboxes, batch_data_samples,
+                batch_info['homography_matrix'], batch_info['img_shape']):
+            student_matrix = torch.tensor(
+                data_samples.homography_matrix, device=teacher_matrix.device)
+            homography_matrix = teacher_matrix @ student_matrix.inverse()
+            projected_bboxes = bbox_project(bboxes, homography_matrix,
+                                            teacher_img_shape)
+            selected_results_list.append(InstanceData(bboxes=projected_bboxes))
+
+        with torch.no_grad():
+            results_list = self.teacher.roi_head.predict_bbox(
+                batch_info['feat'],
+                batch_info['metainfo'],
+                selected_results_list,
+                rcnn_test_cfg=None,
+                rescale=False)
+            bg_score = torch.cat(
+                [results.scores[:, -1] for results in results_list])
+            # cls_reg_targets[0] is labels
+            neg_inds = cls_reg_targets[
+                0] == self.student.roi_head.bbox_head.num_classes
+            # cls_reg_targets[1] is label_weights
+            cls_reg_targets[1][neg_inds] = bg_score[neg_inds].detach()
+
+        losses = self.student.roi_head.bbox_head.loss(
+            bbox_results['cls_score'], bbox_results['bbox_pred'], rois,
+            *cls_reg_targets)
+        # cls_reg_targets[1] is label_weights
+        losses['loss_cls'] = losses['loss_cls'] * len(
+            cls_reg_targets[1]) / max(sum(cls_reg_targets[1]), 1.0)
+        return losses
+
+    def rcnn_reg_loss_by_pseudo_instances(
+            self, x: Tuple[Tensor], unsup_rpn_results_list: InstanceList,
+            batch_data_samples: SampleList) -> dict:
+        """Calculate rcnn regression loss from a batch of inputs and pseudo
+        data samples.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            unsup_rpn_results_list (list[:obj:`InstanceData`]):
+                List of region proposals.
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`,
+                which are `pseudo_instance` or `pseudo_panoptic_seg`
+                or `pseudo_sem_seg` in fact.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of rcnn
+                regression loss components
+        """
+        rpn_results_list = copy.deepcopy(unsup_rpn_results_list)
+        reg_data_samples = copy.deepcopy(batch_data_samples)
+        for data_samples in reg_data_samples:
+            if data_samples.gt_instances.bboxes.shape[0] > 0:
+                data_samples.gt_instances = data_samples.gt_instances[
+                    data_samples.gt_instances.reg_uncs <
+                    self.semi_train_cfg.reg_pseudo_thr]
+        roi_losses = self.student.roi_head.loss(x, rpn_results_list,
+                                                reg_data_samples)
+        return {'loss_bbox': roi_losses['loss_bbox']}
+
+    def compute_uncertainty_with_aug(
+            self, x: Tuple[Tensor],
+            batch_data_samples: SampleList) -> List[Tensor]:
+        """Compute uncertainty with augmented bboxes.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`,
+                which are `pseudo_instance` or `pseudo_panoptic_seg`
+                or `pseudo_sem_seg` in fact.
+
+        Returns:
+            list[Tensor]: A list of uncertainty for pseudo bboxes.
+        """
+        auged_results_list = self.aug_box(batch_data_samples,
+                                          self.semi_train_cfg.jitter_times,
+                                          self.semi_train_cfg.jitter_scale)
+        # flatten
+        auged_results_list = [
+            InstanceData(bboxes=auged.reshape(-1, auged.shape[-1]))
+            for auged in auged_results_list
+        ]
+
+        self.teacher.roi_head.test_cfg = None
+        results_list = self.teacher.roi_head.predict(
+            x, auged_results_list, batch_data_samples, rescale=False)
+        self.teacher.roi_head.test_cfg = self.teacher.test_cfg.rcnn
+
+        reg_channel = max(
+            [results.bboxes.shape[-1] for results in results_list]) // 4
+        bboxes = [
+            results.bboxes.reshape(self.semi_train_cfg.jitter_times, -1,
+                                   results.bboxes.shape[-1])
+            if results.bboxes.numel() > 0 else results.bboxes.new_zeros(
+                self.semi_train_cfg.jitter_times, 0, 4 * reg_channel).float()
+            for results in results_list
+        ]
+
+        box_unc = [bbox.std(dim=0) for bbox in bboxes]
+        bboxes = [bbox.mean(dim=0) for bbox in bboxes]
+        labels = [
+            data_samples.gt_instances.labels
+            for data_samples in batch_data_samples
+        ]
+        if reg_channel != 1:
+            bboxes = [
+                bbox.reshape(bbox.shape[0], reg_channel,
+                             4)[torch.arange(bbox.shape[0]), label]
+                for bbox, label in zip(bboxes, labels)
+            ]
+            box_unc = [
+                unc.reshape(unc.shape[0], reg_channel,
+                            4)[torch.arange(unc.shape[0]), label]
+                for unc, label in zip(box_unc, labels)
+            ]
+
+        box_shape = [(bbox[:, 2:4] - bbox[:, :2]).clamp(min=1.0)
+                     for bbox in bboxes]
+        box_unc = [
+            torch.mean(
+                unc / wh[:, None, :].expand(-1, 2, 2).reshape(-1, 4), dim=-1)
+            if wh.numel() > 0 else unc for unc, wh in zip(box_unc, box_shape)
+        ]
+        return box_unc
+
+    @staticmethod
+    def aug_box(batch_data_samples, times, frac):
+        """Augment bboxes with jitter."""
+
+        def _aug_single(box):
+            box_scale = box[:, 2:4] - box[:, :2]
+            box_scale = (
+                box_scale.clamp(min=1)[:, None, :].expand(-1, 2,
+                                                          2).reshape(-1, 4))
+            aug_scale = box_scale * frac  # [n,4]
+
+            offset = (
+                torch.randn(times, box.shape[0], 4, device=box.device) *
+                aug_scale[None, ...])
+            new_box = box.clone()[None, ...].expand(times, box.shape[0],
+                                                    -1) + offset
+            return new_box
+
+        return [
+            _aug_single(data_samples.gt_instances.bboxes)
+            for data_samples in batch_data_samples
+        ]
diff --git a/mmde/mmdet/models/detectors/solo.py b/mmde/mmdet/models/detectors/solo.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bf47ba24941e09fd795b241a3f6aa0b67ae3380
--- /dev/null
+++ b/mmde/mmdet/models/detectors/solo.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage_instance_seg import SingleStageInstanceSegmentor
+
+
+@MODELS.register_module()
+class SOLO(SingleStageInstanceSegmentor):
+    """`SOLO: Segmenting Objects by Locations
+    <https://arxiv.org/abs/1912.04488>`_
+
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 mask_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/solov2.py b/mmde/mmdet/models/detectors/solov2.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eefe4c532267be1480d13b8d73fc54bf694e81c
--- /dev/null
+++ b/mmde/mmdet/models/detectors/solov2.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage_instance_seg import SingleStageInstanceSegmentor
+
+
+@MODELS.register_module()
+class SOLOv2(SingleStageInstanceSegmentor):
+    """`SOLOv2: Dynamic and Fast Instance Segmentation
+    <https://arxiv.org/abs/2003.10152>`_
+
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 mask_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/sparse_rcnn.py b/mmde/mmdet/models/detectors/sparse_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..75442a69e472953854ded9fc8c30ac4ab30535d3
--- /dev/null
+++ b/mmde/mmdet/models/detectors/sparse_rcnn.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class SparseRCNN(TwoStageDetector):
+    r"""Implementation of `Sparse R-CNN: End-to-End Object Detection with
+    Learnable Proposals <https://arxiv.org/abs/2011.12450>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 rpn_head: OptConfigType = None,
+                 roi_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+        assert self.with_rpn, 'Sparse R-CNN and QueryInst ' \
+            'do not support external proposals'
diff --git a/mmde/mmdet/models/detectors/tood.py b/mmde/mmdet/models/detectors/tood.py
new file mode 100644
index 0000000000000000000000000000000000000000..38720482c5451471f5a66a6cf689dbed6100c9fa
--- /dev/null
+++ b/mmde/mmdet/models/detectors/tood.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class TOOD(SingleStageDetector):
+    r"""Implementation of `TOOD: Task-aligned One-stage Object Detection.
+    <https://arxiv.org/abs/2108.07755>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of TOOD. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of TOOD. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/trident_faster_rcnn.py b/mmde/mmdet/models/detectors/trident_faster_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..4244925beaebea820f836b41ab5463f5f499f4d0
--- /dev/null
+++ b/mmde/mmdet/models/detectors/trident_faster_rcnn.py
@@ -0,0 +1,81 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .faster_rcnn import FasterRCNN
+
+
+@MODELS.register_module()
+class TridentFasterRCNN(FasterRCNN):
+    """Implementation of `TridentNet <https://arxiv.org/abs/1901.01892>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 rpn_head: ConfigType,
+                 roi_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+        assert self.backbone.num_branch == self.roi_head.num_branch
+        assert self.backbone.test_branch_idx == self.roi_head.test_branch_idx
+        self.num_branch = self.backbone.num_branch
+        self.test_branch_idx = self.backbone.test_branch_idx
+
+    def _forward(self, batch_inputs: Tensor,
+                 batch_data_samples: SampleList) -> tuple:
+        """copy the ``batch_data_samples`` to fit multi-branch."""
+        num_branch = self.num_branch \
+            if self.training or self.test_branch_idx == -1 else 1
+        trident_data_samples = batch_data_samples * num_branch
+        return super()._forward(
+            batch_inputs=batch_inputs, batch_data_samples=trident_data_samples)
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> dict:
+        """copy the ``batch_data_samples`` to fit multi-branch."""
+        num_branch = self.num_branch \
+            if self.training or self.test_branch_idx == -1 else 1
+        trident_data_samples = batch_data_samples * num_branch
+        return super().loss(
+            batch_inputs=batch_inputs, batch_data_samples=trident_data_samples)
+
+    def predict(self,
+                batch_inputs: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """copy the ``batch_data_samples`` to fit multi-branch."""
+        num_branch = self.num_branch \
+            if self.training or self.test_branch_idx == -1 else 1
+        trident_data_samples = batch_data_samples * num_branch
+        return super().predict(
+            batch_inputs=batch_inputs,
+            batch_data_samples=trident_data_samples,
+            rescale=rescale)
+
+    # TODO need to refactor
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Test with augmentations.
+
+        If rescale is False, then returned bboxes and masks will fit the scale
+        of imgs[0].
+        """
+        x = self.extract_feats(imgs)
+        num_branch = (self.num_branch if self.test_branch_idx == -1 else 1)
+        trident_img_metas = [img_metas * num_branch for img_metas in img_metas]
+        proposal_list = self.rpn_head.aug_test_rpn(x, trident_img_metas)
+        return self.roi_head.aug_test(
+            x, proposal_list, img_metas, rescale=rescale)
diff --git a/mmde/mmdet/models/detectors/two_stage.py b/mmde/mmdet/models/detectors/two_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e83df9eb5ce837636e10c4592fe26a7edce1657
--- /dev/null
+++ b/mmde/mmdet/models/detectors/two_stage.py
@@ -0,0 +1,243 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+from typing import List, Tuple, Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .base import BaseDetector
+
+
+@MODELS.register_module()
+class TwoStageDetector(BaseDetector):
+    """Base class for two-stage detectors.
+
+    Two-stage detectors typically consisting of a region proposal network and a
+    task-specific regression head.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 rpn_head: OptConfigType = None,
+                 roi_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+
+        if rpn_head is not None:
+            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
+            rpn_head_ = rpn_head.copy()
+            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
+            rpn_head_num_classes = rpn_head_.get('num_classes', None)
+            if rpn_head_num_classes is None:
+                rpn_head_.update(num_classes=1)
+            else:
+                if rpn_head_num_classes != 1:
+                    warnings.warn(
+                        'The `num_classes` should be 1 in RPN, but get '
+                        f'{rpn_head_num_classes}, please set '
+                        'rpn_head.num_classes = 1 in your config file.')
+                    rpn_head_.update(num_classes=1)
+            self.rpn_head = MODELS.build(rpn_head_)
+
+        if roi_head is not None:
+            # update train and test cfg here for now
+            # TODO: refactor assigner & sampler
+            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
+            roi_head.update(train_cfg=rcnn_train_cfg)
+            roi_head.update(test_cfg=test_cfg.rcnn)
+            self.roi_head = MODELS.build(roi_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def _load_from_state_dict(self, state_dict: dict, prefix: str,
+                              local_metadata: dict, strict: bool,
+                              missing_keys: Union[List[str], str],
+                              unexpected_keys: Union[List[str], str],
+                              error_msgs: Union[List[str], str]) -> None:
+        """Exchange bbox_head key to rpn_head key when loading single-stage
+        weights into two-stage model."""
+        bbox_head_prefix = prefix + '.bbox_head' if prefix else 'bbox_head'
+        bbox_head_keys = [
+            k for k in state_dict.keys() if k.startswith(bbox_head_prefix)
+        ]
+        rpn_head_prefix = prefix + '.rpn_head' if prefix else 'rpn_head'
+        rpn_head_keys = [
+            k for k in state_dict.keys() if k.startswith(rpn_head_prefix)
+        ]
+        if len(bbox_head_keys) != 0 and len(rpn_head_keys) == 0:
+            for bbox_head_key in bbox_head_keys:
+                rpn_head_key = rpn_head_prefix + \
+                               bbox_head_key[len(bbox_head_prefix):]
+                state_dict[rpn_head_key] = state_dict.pop(bbox_head_key)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata,
+                                      strict, missing_keys, unexpected_keys,
+                                      error_msgs)
+
+    @property
+    def with_rpn(self) -> bool:
+        """bool: whether the detector has RPN"""
+        return hasattr(self, 'rpn_head') and self.rpn_head is not None
+
+    @property
+    def with_roi_head(self) -> bool:
+        """bool: whether the detector has a RoI head"""
+        return hasattr(self, 'roi_head') and self.roi_head is not None
+
+    def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]:
+        """Extract features.
+
+        Args:
+            batch_inputs (Tensor): Image tensor with shape (N, C, H ,W).
+
+        Returns:
+            tuple[Tensor]: Multi-level features that may have
+            different resolutions.
+        """
+        x = self.backbone(batch_inputs)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def _forward(self, batch_inputs: Tensor,
+                 batch_data_samples: SampleList) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+
+        Returns:
+            tuple: A tuple of features from ``rpn_head`` and ``roi_head``
+            forward.
+        """
+        results = ()
+        x = self.extract_feat(batch_inputs)
+
+        if self.with_rpn:
+            rpn_results_list = self.rpn_head.predict(
+                x, batch_data_samples, rescale=False)
+        else:
+            assert batch_data_samples[0].get('proposals', None) is not None
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+        roi_outs = self.roi_head.forward(x, rpn_results_list,
+                                         batch_data_samples)
+        results = results + (roi_outs, )
+        return results
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> dict:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs (Tensor): Input images of shape (N, C, H, W).
+                These should usually be mean centered and std scaled.
+            batch_data_samples (List[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components
+        """
+        x = self.extract_feat(batch_inputs)
+
+        losses = dict()
+
+        # RPN forward and loss
+        if self.with_rpn:
+            proposal_cfg = self.train_cfg.get('rpn_proposal',
+                                              self.test_cfg.rpn)
+            rpn_data_samples = copy.deepcopy(batch_data_samples)
+            # set cat_id of gt_labels to 0 in RPN
+            for data_sample in rpn_data_samples:
+                data_sample.gt_instances.labels = \
+                    torch.zeros_like(data_sample.gt_instances.labels)
+
+            rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict(
+                x, rpn_data_samples, proposal_cfg=proposal_cfg)
+            # avoid get same name with roi_head loss
+            keys = rpn_losses.keys()
+            for key in list(keys):
+                if 'loss' in key and 'rpn' not in key:
+                    rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
+            losses.update(rpn_losses)
+        else:
+            assert batch_data_samples[0].get('proposals', None) is not None
+            # use pre-defined proposals in InstanceData for the second stage
+            # to extract ROI features.
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        roi_losses = self.roi_head.loss(x, rpn_results_list,
+                                        batch_data_samples)
+        losses.update(roi_losses)
+
+        return losses
+
+    def predict(self,
+                batch_inputs: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`DetDataSample`]: Return the detection results of the
+            input images. The returns value is DetDataSample,
+            which usually contain 'pred_instances'. And the
+            ``pred_instances`` usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        x = self.extract_feat(batch_inputs)
+
+        # If there are no pre-defined proposals, use RPN to get proposals
+        if batch_data_samples[0].get('proposals', None) is None:
+            rpn_results_list = self.rpn_head.predict(
+                x, batch_data_samples, rescale=False)
+        else:
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        results_list = self.roi_head.predict(
+            x, rpn_results_list, batch_data_samples, rescale=rescale)
+
+        batch_data_samples = self.add_pred_to_datasample(
+            batch_data_samples, results_list)
+        return batch_data_samples
diff --git a/mmde/mmdet/models/detectors/vfnet.py b/mmde/mmdet/models/detectors/vfnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..a695513faa7d37756d7716cbca0e457060400518
--- /dev/null
+++ b/mmde/mmdet/models/detectors/vfnet.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class VFNet(SingleStageDetector):
+    """Implementation of `VarifocalNet
+    (VFNet).<https://arxiv.org/abs/2008.13367>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of VFNet. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of VFNet. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/yolact.py b/mmde/mmdet/models/detectors/yolact.py
new file mode 100644
index 0000000000000000000000000000000000000000..f15fb7b70263b0c4018751067771b1365af96f67
--- /dev/null
+++ b/mmde/mmdet/models/detectors/yolact.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage_instance_seg import SingleStageInstanceSegmentor
+
+
+@MODELS.register_module()
+class YOLACT(SingleStageInstanceSegmentor):
+    """Implementation of `YOLACT <https://arxiv.org/abs/1904.02689>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 mask_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/yolo.py b/mmde/mmdet/models/detectors/yolo.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cb9a9cd250a2c26af22032b1ed4bb5a7a8af605
--- /dev/null
+++ b/mmde/mmdet/models/detectors/yolo.py
@@ -0,0 +1,45 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class YOLOV3(SingleStageDetector):
+    r"""Implementation of `Yolov3: An incremental improvement
+    <https://arxiv.org/abs/1804.02767>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of YOLOX. Default: None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of YOLOX. Default: None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional):
+            Model preprocessing config for processing the input data.
+            it usually includes ``to_rgb``, ``pad_size_divisor``,
+            ``pad_value``, ``mean`` and ``std``. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/yolof.py b/mmde/mmdet/models/detectors/yolof.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6d98b9134a7f422fa7ea1f1a1e0d548d36603e8
--- /dev/null
+++ b/mmde/mmdet/models/detectors/yolof.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class YOLOF(SingleStageDetector):
+    r"""Implementation of `You Only Look One-level Feature
+    <https://arxiv.org/abs/2103.09460>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of YOLOF. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of YOLOF. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional):
+            Model preprocessing config for processing the input data.
+            it usually includes ``to_rgb``, ``pad_size_divisor``,
+            ``pad_value``, ``mean`` and ``std``. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/detectors/yolox.py b/mmde/mmdet/models/detectors/yolox.py
new file mode 100644
index 0000000000000000000000000000000000000000..df9190c93f7b043910fbce3bd5ee8dc0ef7b5f68
--- /dev/null
+++ b/mmde/mmdet/models/detectors/yolox.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class YOLOX(SingleStageDetector):
+    r"""Implementation of `YOLOX: Exceeding YOLO Series in 2021
+    <https://arxiv.org/abs/2107.08430>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of YOLOX. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of YOLOX. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet/models/language_models/__init__.py b/mmde/mmdet/models/language_models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..70f1a22c7c01624ba3235f1737f8aea1e26a19fe
--- /dev/null
+++ b/mmde/mmdet/models/language_models/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .bert import BertModel
+
+__all__ = ['BertModel']
diff --git a/mmde/mmdet/models/language_models/bert.py b/mmde/mmdet/models/language_models/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..efb0f46bad6eb0734a324c32a7b05f2795604265
--- /dev/null
+++ b/mmde/mmdet/models/language_models/bert.py
@@ -0,0 +1,231 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+from typing import Sequence
+
+import torch
+from mmengine.model import BaseModel
+from torch import nn
+
+try:
+    from transformers import AutoTokenizer, BertConfig
+    from transformers import BertModel as HFBertModel
+except ImportError:
+    AutoTokenizer = None
+    HFBertModel = None
+
+from mmdet.registry import MODELS
+
+
+def generate_masks_with_special_tokens_and_transfer_map(
+        tokenized, special_tokens_list):
+    """Generate attention mask between each pair of special tokens.
+
+    Only token pairs in between two special tokens are attended to
+    and thus the attention mask for these pairs is positive.
+
+    Args:
+        input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
+        special_tokens_mask (list): special tokens mask.
+
+    Returns:
+        Tuple(Tensor, Tensor):
+        - attention_mask is the attention mask between each tokens.
+          Only token pairs in between two special tokens are positive.
+          Shape: [bs, num_token, num_token].
+        - position_ids is the position id of tokens within each valid sentence.
+          The id starts from 0 whenenver a special token is encountered.
+          Shape: [bs, num_token]
+    """
+    input_ids = tokenized['input_ids']
+    bs, num_token = input_ids.shape
+    # special_tokens_mask:
+    # bs, num_token. 1 for special tokens. 0 for normal tokens
+    special_tokens_mask = torch.zeros((bs, num_token),
+                                      device=input_ids.device).bool()
+
+    for special_token in special_tokens_list:
+        special_tokens_mask |= input_ids == special_token
+
+    # idxs: each row is a list of indices of special tokens
+    idxs = torch.nonzero(special_tokens_mask)
+
+    # generate attention mask and positional ids
+    attention_mask = (
+        torch.eye(num_token,
+                  device=input_ids.device).bool().unsqueeze(0).repeat(
+                      bs, 1, 1))
+    position_ids = torch.zeros((bs, num_token), device=input_ids.device)
+    previous_col = 0
+    for i in range(idxs.shape[0]):
+        row, col = idxs[i]
+        if (col == 0) or (col == num_token - 1):
+            attention_mask[row, col, col] = True
+            position_ids[row, col] = 0
+        else:
+            attention_mask[row, previous_col + 1:col + 1,
+                           previous_col + 1:col + 1] = True
+            position_ids[row, previous_col + 1:col + 1] = torch.arange(
+                0, col - previous_col, device=input_ids.device)
+        previous_col = col
+
+    return attention_mask, position_ids.to(torch.long)
+
+
+@MODELS.register_module()
+class BertModel(BaseModel):
+    """BERT model for language embedding only encoder.
+
+    Args:
+        name (str, optional): name of the pretrained BERT model from
+            HuggingFace. Defaults to bert-base-uncased.
+        max_tokens (int, optional): maximum number of tokens to be
+            used for BERT. Defaults to 256.
+        pad_to_max (bool, optional): whether to pad the tokens to max_tokens.
+             Defaults to True.
+        use_sub_sentence_represent (bool, optional): whether to use sub
+            sentence represent introduced in `Grounding DINO
+            <https://arxiv.org/abs/2303.05499>`. Defaults to False.
+        special_tokens_list (list, optional): special tokens used to split
+            subsentence. It cannot be None when `use_sub_sentence_represent`
+            is True. Defaults to None.
+        add_pooling_layer (bool, optional): whether to adding pooling
+            layer in bert encoder. Defaults to False.
+        num_layers_of_embedded (int, optional): number of layers of
+            the embedded model. Defaults to 1.
+        use_checkpoint (bool, optional): whether to use gradient checkpointing.
+             Defaults to False.
+    """
+
+    def __init__(self,
+                 name: str = 'bert-base-uncased',
+                 max_tokens: int = 256,
+                 pad_to_max: bool = True,
+                 use_sub_sentence_represent: bool = False,
+                 special_tokens_list: list = None,
+                 add_pooling_layer: bool = False,
+                 num_layers_of_embedded: int = 1,
+                 use_checkpoint: bool = False,
+                 **kwargs) -> None:
+
+        super().__init__(**kwargs)
+        self.max_tokens = max_tokens
+        self.pad_to_max = pad_to_max
+
+        if AutoTokenizer is None:
+            raise RuntimeError(
+                'transformers is not installed, please install it by: '
+                'pip install transformers.')
+
+        self.tokenizer = AutoTokenizer.from_pretrained(name)
+        self.language_backbone = nn.Sequential(
+            OrderedDict([('body',
+                          BertEncoder(
+                              name,
+                              add_pooling_layer=add_pooling_layer,
+                              num_layers_of_embedded=num_layers_of_embedded,
+                              use_checkpoint=use_checkpoint))]))
+
+        self.use_sub_sentence_represent = use_sub_sentence_represent
+        if self.use_sub_sentence_represent:
+            assert special_tokens_list is not None, \
+                'special_tokens should not be None \
+                    if use_sub_sentence_represent is True'
+
+            self.special_tokens = self.tokenizer.convert_tokens_to_ids(
+                special_tokens_list)
+
+    def forward(self, captions: Sequence[str], **kwargs) -> dict:
+        """Forward function."""
+        device = next(self.language_backbone.parameters()).device
+        tokenized = self.tokenizer.batch_encode_plus(
+            captions,
+            max_length=self.max_tokens,
+            padding='max_length' if self.pad_to_max else 'longest',
+            return_special_tokens_mask=True,
+            return_tensors='pt',
+            truncation=True).to(device)
+        input_ids = tokenized.input_ids
+        if self.use_sub_sentence_represent:
+            attention_mask, position_ids = \
+                generate_masks_with_special_tokens_and_transfer_map(
+                    tokenized, self.special_tokens)
+            token_type_ids = tokenized['token_type_ids']
+
+        else:
+            attention_mask = tokenized.attention_mask
+            position_ids = None
+            token_type_ids = None
+
+        tokenizer_input = {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'position_ids': position_ids,
+            'token_type_ids': token_type_ids
+        }
+        language_dict_features = self.language_backbone(tokenizer_input)
+        if self.use_sub_sentence_represent:
+            language_dict_features['position_ids'] = position_ids
+            language_dict_features[
+                'text_token_mask'] = tokenized.attention_mask.bool()
+        return language_dict_features
+
+
+class BertEncoder(nn.Module):
+    """BERT encoder for language embedding.
+
+    Args:
+        name (str): name of the pretrained BERT model from HuggingFace.
+                Defaults to bert-base-uncased.
+        add_pooling_layer (bool): whether to add a pooling layer.
+        num_layers_of_embedded (int): number of layers of the embedded model.
+                Defaults to 1.
+        use_checkpoint (bool): whether to use gradient checkpointing.
+                Defaults to False.
+    """
+
+    def __init__(self,
+                 name: str,
+                 add_pooling_layer: bool = False,
+                 num_layers_of_embedded: int = 1,
+                 use_checkpoint: bool = False):
+        super().__init__()
+        if BertConfig is None:
+            raise RuntimeError(
+                'transformers is not installed, please install it by: '
+                'pip install transformers.')
+        config = BertConfig.from_pretrained(name)
+        config.gradient_checkpointing = use_checkpoint
+        # only encoder
+        self.model = HFBertModel.from_pretrained(
+            name, add_pooling_layer=add_pooling_layer, config=config)
+        self.language_dim = config.hidden_size
+        self.num_layers_of_embedded = num_layers_of_embedded
+
+    def forward(self, x) -> dict:
+        mask = x['attention_mask']
+
+        outputs = self.model(
+            input_ids=x['input_ids'],
+            attention_mask=mask,
+            position_ids=x['position_ids'],
+            token_type_ids=x['token_type_ids'],
+            output_hidden_states=True,
+        )
+
+        # outputs has 13 layers, 1 input layer and 12 hidden layers
+        encoded_layers = outputs.hidden_states[1:]
+        features = torch.stack(encoded_layers[-self.num_layers_of_embedded:],
+                               1).mean(1)
+        # language embedding has shape [len(phrase), seq_len, language_dim]
+        features = features / self.num_layers_of_embedded
+        if mask.dim() == 2:
+            embedded = features * mask.unsqueeze(-1).float()
+        else:
+            embedded = features
+
+        results = {
+            'embedded': embedded,
+            'masks': mask,
+            'hidden': encoded_layers[-1]
+        }
+        return results
diff --git a/mmde/mmdet/models/layers/__init__.py b/mmde/mmdet/models/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3c41f64d11bbdb7f2c8e128a2e28b2845159589
--- /dev/null
+++ b/mmde/mmdet/models/layers/__init__.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .activations import SiLU
+from .bbox_nms import fast_nms, multiclass_nms
+from .brick_wrappers import (AdaptiveAvgPool2d, FrozenBatchNorm2d,
+                             adaptive_avg_pool2d)
+from .conv_upsample import ConvUpsample
+from .csp_layer import CSPLayer
+from .dropblock import DropBlock
+from .ema import ExpMomentumEMA
+from .inverted_residual import InvertedResidual
+from .matrix_nms import mask_matrix_nms
+from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder
+from .normed_predictor import NormedConv2d, NormedLinear
+from .pixel_decoder import PixelDecoder, TransformerEncoderPixelDecoder
+from .positional_encoding import (LearnedPositionalEncoding,
+                                  SinePositionalEncoding,
+                                  SinePositionalEncoding3D)
+from .res_layer import ResLayer, SimplifiedBasicBlock
+from .se_layer import ChannelAttention, DyReLU, SELayer
+# yapf: disable
+from .transformer import (MLP, AdaptivePadding, CdnQueryGenerator,
+                          ConditionalAttention,
+                          ConditionalDetrTransformerDecoder,
+                          ConditionalDetrTransformerDecoderLayer,
+                          DABDetrTransformerDecoder,
+                          DABDetrTransformerDecoderLayer,
+                          DABDetrTransformerEncoder, DDQTransformerDecoder,
+                          DeformableDetrTransformerDecoder,
+                          DeformableDetrTransformerDecoderLayer,
+                          DeformableDetrTransformerEncoder,
+                          DeformableDetrTransformerEncoderLayer,
+                          DetrTransformerDecoder, DetrTransformerDecoderLayer,
+                          DetrTransformerEncoder, DetrTransformerEncoderLayer,
+                          DinoTransformerDecoder, DynamicConv,
+                          Mask2FormerTransformerDecoder,
+                          Mask2FormerTransformerDecoderLayer,
+                          Mask2FormerTransformerEncoder, PatchEmbed,
+                          PatchMerging, coordinate_to_encoding,
+                          inverse_sigmoid, nchw_to_nlc, nlc_to_nchw)
+
+# yapf: enable
+
+__all__ = [
+    'fast_nms', 'multiclass_nms', 'mask_matrix_nms', 'DropBlock',
+    'PixelDecoder', 'TransformerEncoderPixelDecoder',
+    'MSDeformAttnPixelDecoder', 'ResLayer', 'PatchMerging',
+    'SinePositionalEncoding', 'LearnedPositionalEncoding', 'DynamicConv',
+    'SimplifiedBasicBlock', 'NormedLinear', 'NormedConv2d', 'InvertedResidual',
+    'SELayer', 'ConvUpsample', 'CSPLayer', 'adaptive_avg_pool2d',
+    'AdaptiveAvgPool2d', 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'DyReLU',
+    'ExpMomentumEMA', 'inverse_sigmoid', 'ChannelAttention', 'SiLU', 'MLP',
+    'DetrTransformerEncoderLayer', 'DetrTransformerDecoderLayer',
+    'DetrTransformerEncoder', 'DetrTransformerDecoder',
+    'DeformableDetrTransformerEncoder', 'DeformableDetrTransformerDecoder',
+    'DeformableDetrTransformerEncoderLayer',
+    'DeformableDetrTransformerDecoderLayer', 'AdaptivePadding',
+    'coordinate_to_encoding', 'ConditionalAttention',
+    'DABDetrTransformerDecoderLayer', 'DABDetrTransformerDecoder',
+    'DABDetrTransformerEncoder', 'DDQTransformerDecoder',
+    'ConditionalDetrTransformerDecoder',
+    'ConditionalDetrTransformerDecoderLayer', 'DinoTransformerDecoder',
+    'CdnQueryGenerator', 'Mask2FormerTransformerEncoder',
+    'Mask2FormerTransformerDecoderLayer', 'Mask2FormerTransformerDecoder',
+    'SinePositionalEncoding3D', 'FrozenBatchNorm2d'
+]
diff --git a/mmde/mmdet/models/layers/activations.py b/mmde/mmdet/models/layers/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e73ef42180ccd3dddb4bcca224c0b4eb5da807c
--- /dev/null
+++ b/mmde/mmdet/models/layers/activations.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmengine.utils import digit_version
+
+from mmdet.registry import MODELS
+
+if digit_version(torch.__version__) >= digit_version('1.7.0'):
+    from torch.nn import SiLU
+else:
+
+    class SiLU(nn.Module):
+        """Sigmoid Weighted Liner Unit."""
+
+        def __init__(self, inplace=True):
+            super().__init__()
+
+        def forward(self, inputs) -> torch.Tensor:
+            return inputs * torch.sigmoid(inputs)
+
+
+MODELS.register_module(module=SiLU, name='SiLU')
diff --git a/mmde/mmdet/models/layers/bbox_nms.py b/mmde/mmdet/models/layers/bbox_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd67a45f60ca98c354e095127ab7dbb9653deca5
--- /dev/null
+++ b/mmde/mmdet/models/layers/bbox_nms.py
@@ -0,0 +1,184 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import torch
+from mmcv.ops.nms import batched_nms
+from torch import Tensor
+
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import ConfigType
+
+
+def multiclass_nms(
+    multi_bboxes: Tensor,
+    multi_scores: Tensor,
+    score_thr: float,
+    nms_cfg: ConfigType,
+    max_num: int = -1,
+    score_factors: Optional[Tensor] = None,
+    return_inds: bool = False,
+    box_dim: int = 4
+) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]:
+    """NMS for multi-class bboxes.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class), where the last column
+            contains scores of the background class, but this will be ignored.
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        nms_cfg (Union[:obj:`ConfigDict`, dict]): a dict that contains
+            the arguments of nms operations.
+        max_num (int, optional): if there are more than max_num bboxes after
+            NMS, only top max_num will be kept. Default to -1.
+        score_factors (Tensor, optional): The factors multiplied to scores
+            before applying NMS. Default to None.
+        return_inds (bool, optional): Whether return the indices of kept
+            bboxes. Default to False.
+        box_dim (int): The dimension of boxes. Defaults to 4.
+
+    Returns:
+        Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]:
+            (dets, labels, indices (optional)), tensors of shape (k, 5),
+            (k), and (k). Dets are boxes with scores. Labels are 0-based.
+    """
+    num_classes = multi_scores.size(1) - 1
+    # exclude background category
+    if multi_bboxes.shape[1] > box_dim:
+        bboxes = multi_bboxes.view(multi_scores.size(0), -1, box_dim)
+    else:
+        bboxes = multi_bboxes[:, None].expand(
+            multi_scores.size(0), num_classes, box_dim)
+
+    scores = multi_scores[:, :-1]
+
+    labels = torch.arange(num_classes, dtype=torch.long, device=scores.device)
+    labels = labels.view(1, -1).expand_as(scores)
+
+    bboxes = bboxes.reshape(-1, box_dim)
+    scores = scores.reshape(-1)
+    labels = labels.reshape(-1)
+
+    if not torch.onnx.is_in_onnx_export():
+        # NonZero not supported  in TensorRT
+        # remove low scoring boxes
+        valid_mask = scores > score_thr
+    # multiply score_factor after threshold to preserve more bboxes, improve
+    # mAP by 1% for YOLOv3
+    if score_factors is not None:
+        # expand the shape to match original shape of score
+        score_factors = score_factors.view(-1, 1).expand(
+            multi_scores.size(0), num_classes)
+        score_factors = score_factors.reshape(-1)
+        scores = scores * score_factors
+
+    if not torch.onnx.is_in_onnx_export():
+        # NonZero not supported  in TensorRT
+        inds = valid_mask.nonzero(as_tuple=False).squeeze(1)
+        bboxes, scores, labels = bboxes[inds], scores[inds], labels[inds]
+    else:
+        # TensorRT NMS plugin has invalid output filled with -1
+        # add dummy data to make detection output correct.
+        bboxes = torch.cat([bboxes, bboxes.new_zeros(1, box_dim)], dim=0)
+        scores = torch.cat([scores, scores.new_zeros(1)], dim=0)
+        labels = torch.cat([labels, labels.new_zeros(1)], dim=0)
+
+    if bboxes.numel() == 0:
+        if torch.onnx.is_in_onnx_export():
+            raise RuntimeError('[ONNX Error] Can not record NMS '
+                               'as it has not been executed this time')
+        dets = torch.cat([bboxes, scores[:, None]], -1)
+        if return_inds:
+            return dets, labels, inds
+        else:
+            return dets, labels
+
+    dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)
+
+    if max_num > 0:
+        dets = dets[:max_num]
+        keep = keep[:max_num]
+
+    if return_inds:
+        return dets, labels[keep], inds[keep]
+    else:
+        return dets, labels[keep]
+
+
+def fast_nms(
+    multi_bboxes: Tensor,
+    multi_scores: Tensor,
+    multi_coeffs: Tensor,
+    score_thr: float,
+    iou_thr: float,
+    top_k: int,
+    max_num: int = -1
+) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]:
+    """Fast NMS in `YOLACT <https://arxiv.org/abs/1904.02689>`_.
+
+    Fast NMS allows already-removed detections to suppress other detections so
+    that every instance can be decided to be kept or discarded in parallel,
+    which is not possible in traditional NMS. This relaxation allows us to
+    implement Fast NMS entirely in standard GPU-accelerated matrix operations.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class+1), where the last column
+            contains scores of the background class, but this will be ignored.
+        multi_coeffs (Tensor): shape (n, #class*coeffs_dim).
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        iou_thr (float): IoU threshold to be considered as conflicted.
+        top_k (int): if there are more than top_k bboxes before NMS,
+            only top top_k will be kept.
+        max_num (int): if there are more than max_num bboxes after NMS,
+            only top max_num will be kept. If -1, keep all the bboxes.
+            Default: -1.
+
+    Returns:
+        Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]:
+            (dets, labels, coefficients), tensors of shape (k, 5), (k, 1),
+            and (k, coeffs_dim). Dets are boxes with scores.
+            Labels are 0-based.
+    """
+
+    scores = multi_scores[:, :-1].t()  # [#class, n]
+    scores, idx = scores.sort(1, descending=True)
+
+    idx = idx[:, :top_k].contiguous()
+    scores = scores[:, :top_k]  # [#class, topk]
+    num_classes, num_dets = idx.size()
+    boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4)
+    coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1)
+
+    iou = bbox_overlaps(boxes, boxes)  # [#class, topk, topk]
+    iou.triu_(diagonal=1)
+    iou_max, _ = iou.max(dim=1)
+
+    # Now just filter out the ones higher than the threshold
+    keep = iou_max <= iou_thr
+
+    # Second thresholding introduces 0.2 mAP gain at negligible time cost
+    keep *= scores > score_thr
+
+    # Assign each kept detection to its corresponding class
+    classes = torch.arange(
+        num_classes, device=boxes.device)[:, None].expand_as(keep)
+    classes = classes[keep]
+
+    boxes = boxes[keep]
+    coeffs = coeffs[keep]
+    scores = scores[keep]
+
+    # Only keep the top max_num highest scores across all classes
+    scores, idx = scores.sort(0, descending=True)
+    if max_num > 0:
+        idx = idx[:max_num]
+        scores = scores[:max_num]
+
+    classes = classes[idx]
+    boxes = boxes[idx]
+    coeffs = coeffs[idx]
+
+    cls_dets = torch.cat([boxes, scores[:, None]], dim=1)
+    return cls_dets, classes, coeffs
diff --git a/mmde/mmdet/models/layers/brick_wrappers.py b/mmde/mmdet/models/layers/brick_wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ecb8499de329132561dfedb8f55c36080787b31
--- /dev/null
+++ b/mmde/mmdet/models/layers/brick_wrappers.py
@@ -0,0 +1,138 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn.bricks.wrappers import NewEmptyTensorOp, obsolete_torch_version
+
+from mmdet.registry import MODELS
+
+if torch.__version__ == 'parrots':
+    TORCH_VERSION = torch.__version__
+else:
+    # torch.__version__ could be 1.3.1+cu92, we only need the first two
+    # for comparison
+    TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2])
+
+
+def adaptive_avg_pool2d(input, output_size):
+    """Handle empty batch dimension to adaptive_avg_pool2d.
+
+    Args:
+        input (tensor): 4D tensor.
+        output_size (int, tuple[int,int]): the target output size.
+    """
+    if input.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+        output_size = [*input.shape[:2], *output_size]
+        empty = NewEmptyTensorOp.apply(input, output_size)
+        return empty
+    else:
+        return F.adaptive_avg_pool2d(input, output_size)
+
+
+class AdaptiveAvgPool2d(nn.AdaptiveAvgPool2d):
+    """Handle empty batch dimension to AdaptiveAvgPool2d."""
+
+    def forward(self, x):
+        # PyTorch 1.9 does not support empty tensor inference yet
+        if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
+            output_size = self.output_size
+            if isinstance(output_size, int):
+                output_size = [output_size, output_size]
+            else:
+                output_size = [
+                    v if v is not None else d
+                    for v, d in zip(output_size,
+                                    x.size()[-2:])
+                ]
+            output_size = [*x.shape[:2], *output_size]
+            empty = NewEmptyTensorOp.apply(x, output_size)
+            return empty
+
+        return super().forward(x)
+
+
+# Modified from
+# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py#L13 # noqa
+@MODELS.register_module('FrozenBN')
+class FrozenBatchNorm2d(nn.Module):
+    """BatchNorm2d where the batch statistics and the affine parameters are
+    fixed.
+
+    It contains non-trainable buffers called
+    "weight" and "bias", "running_mean", "running_var",
+    initialized to perform identity transformation.
+    Args:
+       num_features (int):  :math:`C` from an expected input of size
+            :math:`(N, C, H, W)`.
+       eps (float): a value added to the denominator for numerical stability.
+            Default: 1e-5
+    """
+
+    def __init__(self, num_features, eps=1e-5, **kwargs):
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.register_buffer('weight', torch.ones(num_features))
+        self.register_buffer('bias', torch.zeros(num_features))
+        self.register_buffer('running_mean', torch.zeros(num_features))
+        self.register_buffer('running_var', torch.ones(num_features) - eps)
+
+    def forward(self, x):
+        if x.requires_grad:
+            # When gradients are needed, F.batch_norm will use extra memory
+            # because its backward op computes gradients for weight/bias
+            # as well.
+            scale = self.weight * (self.running_var + self.eps).rsqrt()
+            bias = self.bias - self.running_mean * scale
+            scale = scale.reshape(1, -1, 1, 1)
+            bias = bias.reshape(1, -1, 1, 1)
+            out_dtype = x.dtype  # may be half
+            return x * scale.to(out_dtype) + bias.to(out_dtype)
+        else:
+            # When gradients are not needed, F.batch_norm is a single fused op
+            # and provide more optimization opportunities.
+            return F.batch_norm(
+                x,
+                self.running_mean,
+                self.running_var,
+                self.weight,
+                self.bias,
+                training=False,
+                eps=self.eps,
+            )
+
+    def __repr__(self):
+        return 'FrozenBatchNorm2d(num_features={}, eps={})'.format(
+            self.num_features, self.eps)
+
+    @classmethod
+    def convert_frozen_batchnorm(cls, module):
+        """Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
+
+        Args:
+            module (torch.nn.Module):
+        Returns:
+            If module is BatchNorm/SyncBatchNorm, returns a new module.
+            Otherwise, in-place convert module and return it.
+        Similar to convert_sync_batchnorm in
+        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
+        """
+        bn_module = nn.modules.batchnorm
+        bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
+        res = module
+        if isinstance(module, bn_module):
+            res = cls(module.num_features)
+            if module.affine:
+                res.weight.data = module.weight.data.clone().detach()
+                res.bias.data = module.bias.data.clone().detach()
+            res.running_mean.data = module.running_mean.data
+            res.running_var.data = module.running_var.data
+            res.eps = module.eps
+        else:
+            for name, child in module.named_children():
+                new_child = cls.convert_frozen_batchnorm(child)
+                if new_child is not child:
+                    res.add_module(name, new_child)
+        return res
diff --git a/mmde/mmdet/models/layers/conv_upsample.py b/mmde/mmdet/models/layers/conv_upsample.py
new file mode 100644
index 0000000000000000000000000000000000000000..32505875a2162330ed7d00455f088d08d94f679e
--- /dev/null
+++ b/mmde/mmdet/models/layers/conv_upsample.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule, ModuleList
+
+
+class ConvUpsample(BaseModule):
+    """ConvUpsample performs 2x upsampling after Conv.
+
+    There are several `ConvModule` layers. In the first few layers, upsampling
+    will be applied after each layer of convolution. The number of upsampling
+    must be no more than the number of ConvModule layers.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        inner_channels (int): Number of channels produced by the convolution.
+        num_layers (int): Number of convolution layers.
+        num_upsample (int | optional): Number of upsampling layer. Must be no
+            more than num_layers. Upsampling will be applied after the first
+            ``num_upsample`` layers of convolution. Default: ``num_layers``.
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        init_cfg (dict): Config dict for initialization. Default: None.
+        kwargs (key word augments): Other augments used in ConvModule.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 inner_channels,
+                 num_layers=1,
+                 num_upsample=None,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(ConvUpsample, self).__init__(init_cfg)
+        if num_upsample is None:
+            num_upsample = num_layers
+        assert num_upsample <= num_layers, \
+            f'num_upsample({num_upsample})must be no more than ' \
+            f'num_layers({num_layers})'
+        self.num_layers = num_layers
+        self.num_upsample = num_upsample
+        self.conv = ModuleList()
+        for i in range(num_layers):
+            self.conv.append(
+                ConvModule(
+                    in_channels,
+                    inner_channels,
+                    3,
+                    padding=1,
+                    stride=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    **kwargs))
+            in_channels = inner_channels
+
+    def forward(self, x):
+        num_upsample = self.num_upsample
+        for i in range(self.num_layers):
+            x = self.conv[i](x)
+            if num_upsample > 0:
+                num_upsample -= 1
+                x = F.interpolate(
+                    x, scale_factor=2, mode='bilinear', align_corners=False)
+        return x
diff --git a/mmde/mmdet/models/layers/csp_layer.py b/mmde/mmdet/models/layers/csp_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8b547b8994862bfe14739033bb6b254ef886f29
--- /dev/null
+++ b/mmde/mmdet/models/layers/csp_layer.py
@@ -0,0 +1,246 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .se_layer import ChannelAttention
+
+
+class DarknetBottleneck(BaseModule):
+    """The basic bottleneck block used in Darknet.
+
+    Each ResBlock consists of two ConvModules and the input is added to the
+    final output. Each ConvModule is composed of Conv, BN, and LeakyReLU.
+    The first convLayer has filter size of 1x1 and the second one has the
+    filter size of 3x3.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        expansion (float): The kernel size of the convolution.
+            Defaults to 0.5.
+        add_identity (bool): Whether to add identity to the out.
+            Defaults to True.
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Defaults to False.
+        conv_cfg (dict): Config dict for convolution layer. Defaults to None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to dict(type='Swish').
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 expansion: float = 0.5,
+                 add_identity: bool = True,
+                 use_depthwise: bool = False,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(
+                     type='BN', momentum=0.03, eps=0.001),
+                 act_cfg: ConfigType = dict(type='Swish'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        hidden_channels = int(out_channels * expansion)
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+        self.conv1 = ConvModule(
+            in_channels,
+            hidden_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.conv2 = conv(
+            hidden_channels,
+            out_channels,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.add_identity = \
+            add_identity and in_channels == out_channels
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function."""
+        identity = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+
+        if self.add_identity:
+            return out + identity
+        else:
+            return out
+
+
+class CSPNeXtBlock(BaseModule):
+    """The basic bottleneck block used in CSPNeXt.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        expansion (float): Expand ratio of the hidden channel. Defaults to 0.5.
+        add_identity (bool): Whether to add identity to the out. Only works
+            when in_channels == out_channels. Defaults to True.
+        use_depthwise (bool): Whether to use depthwise separable convolution.
+            Defaults to False.
+        kernel_size (int): The kernel size of the second convolution layer.
+            Defaults to 5.
+        conv_cfg (dict): Config dict for convolution layer. Defaults to None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to dict(type='SiLU').
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 expansion: float = 0.5,
+                 add_identity: bool = True,
+                 use_depthwise: bool = False,
+                 kernel_size: int = 5,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(
+                     type='BN', momentum=0.03, eps=0.001),
+                 act_cfg: ConfigType = dict(type='SiLU'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        hidden_channels = int(out_channels * expansion)
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+        self.conv1 = conv(
+            in_channels,
+            hidden_channels,
+            3,
+            stride=1,
+            padding=1,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.conv2 = DepthwiseSeparableConvModule(
+            hidden_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=kernel_size // 2,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.add_identity = \
+            add_identity and in_channels == out_channels
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function."""
+        identity = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+
+        if self.add_identity:
+            return out + identity
+        else:
+            return out
+
+
+class CSPLayer(BaseModule):
+    """Cross Stage Partial Layer.
+
+    Args:
+        in_channels (int): The input channels of the CSP layer.
+        out_channels (int): The output channels of the CSP layer.
+        expand_ratio (float): Ratio to adjust the number of channels of the
+            hidden layer. Defaults to 0.5.
+        num_blocks (int): Number of blocks. Defaults to 1.
+        add_identity (bool): Whether to add identity in blocks.
+            Defaults to True.
+        use_cspnext_block (bool): Whether to use CSPNeXt block.
+            Defaults to False.
+        use_depthwise (bool): Whether to use depthwise separable convolution in
+            blocks. Defaults to False.
+        channel_attention (bool): Whether to add channel attention in each
+            stage. Defaults to True.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Defaults to None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to dict(type='BN')
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to dict(type='Swish')
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 expand_ratio: float = 0.5,
+                 num_blocks: int = 1,
+                 add_identity: bool = True,
+                 use_depthwise: bool = False,
+                 use_cspnext_block: bool = False,
+                 channel_attention: bool = False,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(
+                     type='BN', momentum=0.03, eps=0.001),
+                 act_cfg: ConfigType = dict(type='Swish'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        block = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck
+        mid_channels = int(out_channels * expand_ratio)
+        self.channel_attention = channel_attention
+        self.main_conv = ConvModule(
+            in_channels,
+            mid_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.short_conv = ConvModule(
+            in_channels,
+            mid_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.final_conv = ConvModule(
+            2 * mid_channels,
+            out_channels,
+            1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        self.blocks = nn.Sequential(*[
+            block(
+                mid_channels,
+                mid_channels,
+                1.0,
+                add_identity,
+                use_depthwise,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg) for _ in range(num_blocks)
+        ])
+        if channel_attention:
+            self.attention = ChannelAttention(2 * mid_channels)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function."""
+        x_short = self.short_conv(x)
+
+        x_main = self.main_conv(x)
+        x_main = self.blocks(x_main)
+
+        x_final = torch.cat((x_main, x_short), dim=1)
+
+        if self.channel_attention:
+            x_final = self.attention(x_final)
+        return self.final_conv(x_final)
diff --git a/mmde/mmdet/models/layers/dropblock.py b/mmde/mmdet/models/layers/dropblock.py
new file mode 100644
index 0000000000000000000000000000000000000000..7938199b761d637afdb1b2c62dbca01d1bf629eb
--- /dev/null
+++ b/mmde/mmdet/models/layers/dropblock.py
@@ -0,0 +1,86 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet.registry import MODELS
+
+eps = 1e-6
+
+
+@MODELS.register_module()
+class DropBlock(nn.Module):
+    """Randomly drop some regions of feature maps.
+
+     Please refer to the method proposed in `DropBlock
+     <https://arxiv.org/abs/1810.12890>`_ for details.
+
+    Args:
+        drop_prob (float): The probability of dropping each block.
+        block_size (int): The size of dropped blocks.
+        warmup_iters (int): The drop probability will linearly increase
+            from `0` to `drop_prob` during the first `warmup_iters` iterations.
+            Default: 2000.
+    """
+
+    def __init__(self, drop_prob, block_size, warmup_iters=2000, **kwargs):
+        super(DropBlock, self).__init__()
+        assert block_size % 2 == 1
+        assert 0 < drop_prob <= 1
+        assert warmup_iters >= 0
+        self.drop_prob = drop_prob
+        self.block_size = block_size
+        self.warmup_iters = warmup_iters
+        self.iter_cnt = 0
+
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): Input feature map on which some areas will be randomly
+                dropped.
+
+        Returns:
+            Tensor: The tensor after DropBlock layer.
+        """
+        if not self.training:
+            return x
+        self.iter_cnt += 1
+        N, C, H, W = list(x.shape)
+        gamma = self._compute_gamma((H, W))
+        mask_shape = (N, C, H - self.block_size + 1, W - self.block_size + 1)
+        mask = torch.bernoulli(torch.full(mask_shape, gamma, device=x.device))
+
+        mask = F.pad(mask, [self.block_size // 2] * 4, value=0)
+        mask = F.max_pool2d(
+            input=mask,
+            stride=(1, 1),
+            kernel_size=(self.block_size, self.block_size),
+            padding=self.block_size // 2)
+        mask = 1 - mask
+        x = x * mask * mask.numel() / (eps + mask.sum())
+        return x
+
+    def _compute_gamma(self, feat_size):
+        """Compute the value of gamma according to paper. gamma is the
+        parameter of bernoulli distribution, which controls the number of
+        features to drop.
+
+        gamma = (drop_prob * fm_area) / (drop_area * keep_area)
+
+        Args:
+            feat_size (tuple[int, int]): The height and width of feature map.
+
+        Returns:
+            float: The value of gamma.
+        """
+        gamma = (self.drop_prob * feat_size[0] * feat_size[1])
+        gamma /= ((feat_size[0] - self.block_size + 1) *
+                  (feat_size[1] - self.block_size + 1))
+        gamma /= (self.block_size**2)
+        factor = (1.0 if self.iter_cnt > self.warmup_iters else self.iter_cnt /
+                  self.warmup_iters)
+        return gamma * factor
+
+    def extra_repr(self):
+        return (f'drop_prob={self.drop_prob}, block_size={self.block_size}, '
+                f'warmup_iters={self.warmup_iters}')
diff --git a/mmde/mmdet/models/layers/ema.py b/mmde/mmdet/models/layers/ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..bce503c4641f7391a7bd7d722c05f4e49bd07db9
--- /dev/null
+++ b/mmde/mmdet/models/layers/ema.py
@@ -0,0 +1,66 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from mmengine.model import ExponentialMovingAverage
+from torch import Tensor
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class ExpMomentumEMA(ExponentialMovingAverage):
+    """Exponential moving average (EMA) with exponential momentum strategy,
+    which is used in YOLOX.
+
+    Args:
+        model (nn.Module): The model to be averaged.
+        momentum (float): The momentum used for updating ema parameter.
+            Ema's parameter are updated with the formula:
+           `averaged_param = (1-momentum) * averaged_param + momentum *
+           source_param`. Defaults to 0.0002.
+        gamma (int): Use a larger momentum early in training and gradually
+            annealing to a smaller value to update the ema model smoothly. The
+            momentum is calculated as
+            `(1 - momentum) * exp(-(1 + steps) / gamma) + momentum`.
+            Defaults to 2000.
+        interval (int): Interval between two updates. Defaults to 1.
+        device (torch.device, optional): If provided, the averaged model will
+            be stored on the :attr:`device`. Defaults to None.
+        update_buffers (bool): if True, it will compute running averages for
+            both the parameters and the buffers of the model. Defaults to
+            False.
+    """
+
+    def __init__(self,
+                 model: nn.Module,
+                 momentum: float = 0.0002,
+                 gamma: int = 2000,
+                 interval=1,
+                 device: Optional[torch.device] = None,
+                 update_buffers: bool = False) -> None:
+        super().__init__(
+            model=model,
+            momentum=momentum,
+            interval=interval,
+            device=device,
+            update_buffers=update_buffers)
+        assert gamma > 0, f'gamma must be greater than 0, but got {gamma}'
+        self.gamma = gamma
+
+    def avg_func(self, averaged_param: Tensor, source_param: Tensor,
+                 steps: int) -> None:
+        """Compute the moving average of the parameters using the exponential
+        momentum strategy.
+
+        Args:
+            averaged_param (Tensor): The averaged parameters.
+            source_param (Tensor): The source parameters.
+            steps (int): The number of times the parameters have been
+                updated.
+        """
+        momentum = (1 - self.momentum) * math.exp(
+            -float(1 + steps) / self.gamma) + self.momentum
+        averaged_param.mul_(1 - momentum).add_(source_param, alpha=momentum)
diff --git a/mmde/mmdet/models/layers/inverted_residual.py b/mmde/mmdet/models/layers/inverted_residual.py
new file mode 100644
index 0000000000000000000000000000000000000000..a174ccc8835a1ee720f9cdaa7c5be210f5be8113
--- /dev/null
+++ b/mmde/mmdet/models/layers/inverted_residual.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import ConvModule
+from mmcv.cnn.bricks import DropPath
+from mmengine.model import BaseModule
+
+from .se_layer import SELayer
+
+
+class InvertedResidual(BaseModule):
+    """Inverted Residual Block.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        mid_channels (int): The input channels of the depthwise convolution.
+        kernel_size (int): The kernel size of the depthwise convolution.
+            Default: 3.
+        stride (int): The stride of the depthwise convolution. Default: 1.
+        se_cfg (dict): Config dict for se layer. Default: None, which means no
+            se layer.
+        with_expand_conv (bool): Use expand conv or not. If set False,
+            mid_channels must be the same with in_channels.
+            Default: True.
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU').
+        drop_path_rate (float): stochastic depth rate. Defaults to 0.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Returns:
+        Tensor: The output tensor.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 mid_channels,
+                 kernel_size=3,
+                 stride=1,
+                 se_cfg=None,
+                 with_expand_conv=True,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 drop_path_rate=0.,
+                 with_cp=False,
+                 init_cfg=None):
+        super(InvertedResidual, self).__init__(init_cfg)
+        self.with_res_shortcut = (stride == 1 and in_channels == out_channels)
+        assert stride in [1, 2], f'stride must in [1, 2]. ' \
+            f'But received {stride}.'
+        self.with_cp = with_cp
+        self.drop_path = DropPath(
+            drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.with_se = se_cfg is not None
+        self.with_expand_conv = with_expand_conv
+
+        if self.with_se:
+            assert isinstance(se_cfg, dict)
+        if not self.with_expand_conv:
+            assert mid_channels == in_channels
+
+        if self.with_expand_conv:
+            self.expand_conv = ConvModule(
+                in_channels=in_channels,
+                out_channels=mid_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+        self.depthwise_conv = ConvModule(
+            in_channels=mid_channels,
+            out_channels=mid_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=kernel_size // 2,
+            groups=mid_channels,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        if self.with_se:
+            self.se = SELayer(**se_cfg)
+
+        self.linear_conv = ConvModule(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            out = x
+
+            if self.with_expand_conv:
+                out = self.expand_conv(out)
+
+            out = self.depthwise_conv(out)
+
+            if self.with_se:
+                out = self.se(out)
+
+            out = self.linear_conv(out)
+
+            if self.with_res_shortcut:
+                return x + self.drop_path(out)
+            else:
+                return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        return out
diff --git a/mmde/mmdet/models/layers/matrix_nms.py b/mmde/mmdet/models/layers/matrix_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dc8c4f74e28127fb69ccc684f0bdb2bd3943b20
--- /dev/null
+++ b/mmde/mmdet/models/layers/matrix_nms.py
@@ -0,0 +1,121 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def mask_matrix_nms(masks,
+                    labels,
+                    scores,
+                    filter_thr=-1,
+                    nms_pre=-1,
+                    max_num=-1,
+                    kernel='gaussian',
+                    sigma=2.0,
+                    mask_area=None):
+    """Matrix NMS for multi-class masks.
+
+    Args:
+        masks (Tensor): Has shape (num_instances, h, w)
+        labels (Tensor): Labels of corresponding masks,
+            has shape (num_instances,).
+        scores (Tensor): Mask scores of corresponding masks,
+            has shape (num_instances).
+        filter_thr (float): Score threshold to filter the masks
+            after matrix nms. Default: -1, which means do not
+            use filter_thr.
+        nms_pre (int): The max number of instances to do the matrix nms.
+            Default: -1, which means do not use nms_pre.
+        max_num (int, optional): If there are more than max_num masks after
+            matrix, only top max_num will be kept. Default: -1, which means
+            do not use max_num.
+        kernel (str): 'linear' or 'gaussian'.
+        sigma (float): std in gaussian method.
+        mask_area (Tensor): The sum of seg_masks.
+
+    Returns:
+        tuple(Tensor): Processed mask results.
+
+            - scores (Tensor): Updated scores, has shape (n,).
+            - labels (Tensor): Remained labels, has shape (n,).
+            - masks (Tensor): Remained masks, has shape (n, w, h).
+            - keep_inds (Tensor): The indices number of
+                the remaining mask in the input mask, has shape (n,).
+    """
+    assert len(labels) == len(masks) == len(scores)
+    if len(labels) == 0:
+        return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros(
+            0, *masks.shape[-2:]), labels.new_zeros(0)
+    if mask_area is None:
+        mask_area = masks.sum((1, 2)).float()
+    else:
+        assert len(masks) == len(mask_area)
+
+    # sort and keep top nms_pre
+    scores, sort_inds = torch.sort(scores, descending=True)
+
+    keep_inds = sort_inds
+    if nms_pre > 0 and len(sort_inds) > nms_pre:
+        sort_inds = sort_inds[:nms_pre]
+        keep_inds = keep_inds[:nms_pre]
+        scores = scores[:nms_pre]
+    masks = masks[sort_inds]
+    mask_area = mask_area[sort_inds]
+    labels = labels[sort_inds]
+
+    num_masks = len(labels)
+    flatten_masks = masks.reshape(num_masks, -1).float()
+    # inter.
+    inter_matrix = torch.mm(flatten_masks, flatten_masks.transpose(1, 0))
+    expanded_mask_area = mask_area.expand(num_masks, num_masks)
+    # Upper triangle iou matrix.
+    iou_matrix = (inter_matrix /
+                  (expanded_mask_area + expanded_mask_area.transpose(1, 0) -
+                   inter_matrix)).triu(diagonal=1)
+    # label_specific matrix.
+    expanded_labels = labels.expand(num_masks, num_masks)
+    # Upper triangle label matrix.
+    label_matrix = (expanded_labels == expanded_labels.transpose(
+        1, 0)).triu(diagonal=1)
+
+    # IoU compensation
+    compensate_iou, _ = (iou_matrix * label_matrix).max(0)
+    compensate_iou = compensate_iou.expand(num_masks,
+                                           num_masks).transpose(1, 0)
+
+    # IoU decay
+    decay_iou = iou_matrix * label_matrix
+
+    # Calculate the decay_coefficient
+    if kernel == 'gaussian':
+        decay_matrix = torch.exp(-1 * sigma * (decay_iou**2))
+        compensate_matrix = torch.exp(-1 * sigma * (compensate_iou**2))
+        decay_coefficient, _ = (decay_matrix / compensate_matrix).min(0)
+    elif kernel == 'linear':
+        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
+        decay_coefficient, _ = decay_matrix.min(0)
+    else:
+        raise NotImplementedError(
+            f'{kernel} kernel is not supported in matrix nms!')
+    # update the score.
+    scores = scores * decay_coefficient
+
+    if filter_thr > 0:
+        keep = scores >= filter_thr
+        keep_inds = keep_inds[keep]
+        if not keep.any():
+            return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros(
+                0, *masks.shape[-2:]), labels.new_zeros(0)
+        masks = masks[keep]
+        scores = scores[keep]
+        labels = labels[keep]
+
+    # sort and keep top max_num
+    scores, sort_inds = torch.sort(scores, descending=True)
+    keep_inds = keep_inds[sort_inds]
+    if max_num > 0 and len(sort_inds) > max_num:
+        sort_inds = sort_inds[:max_num]
+        keep_inds = keep_inds[:max_num]
+        scores = scores[:max_num]
+    masks = masks[sort_inds]
+    labels = labels[sort_inds]
+
+    return scores, labels, masks, keep_inds
diff --git a/mmde/mmdet/models/layers/msdeformattn_pixel_decoder.py b/mmde/mmdet/models/layers/msdeformattn_pixel_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..a67dc3c4437f83ebe1c82d12b3ed91f429030ce7
--- /dev/null
+++ b/mmde/mmdet/models/layers/msdeformattn_pixel_decoder.py
@@ -0,0 +1,246 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, ConvModule
+from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention
+from mmengine.model import (BaseModule, ModuleList, caffe2_xavier_init,
+                            normal_init, xavier_init)
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptMultiConfig
+from ..task_modules.prior_generators import MlvlPointGenerator
+from .positional_encoding import SinePositionalEncoding
+from .transformer import Mask2FormerTransformerEncoder
+
+
+@MODELS.register_module()
+class MSDeformAttnPixelDecoder(BaseModule):
+    """Pixel decoder with multi-scale deformable attention.
+
+    Args:
+        in_channels (list[int] | tuple[int]): Number of channels in the
+            input feature maps.
+        strides (list[int] | tuple[int]): Output strides of feature from
+            backbone.
+        feat_channels (int): Number of channels for feature.
+        out_channels (int): Number of channels for output.
+        num_outs (int): Number of output scales.
+        norm_cfg (:obj:`ConfigDict` or dict): Config for normalization.
+            Defaults to dict(type='GN', num_groups=32).
+        act_cfg (:obj:`ConfigDict` or dict): Config for activation.
+            Defaults to dict(type='ReLU').
+        encoder (:obj:`ConfigDict` or dict): Config for transformer
+            encoder. Defaults to None.
+        positional_encoding (:obj:`ConfigDict` or dict): Config for
+            transformer encoder position encoding. Defaults to
+            dict(num_feats=128, normalize=True).
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: Union[List[int],
+                                    Tuple[int]] = [256, 512, 1024, 2048],
+                 strides: Union[List[int], Tuple[int]] = [4, 8, 16, 32],
+                 feat_channels: int = 256,
+                 out_channels: int = 256,
+                 num_outs: int = 3,
+                 norm_cfg: ConfigType = dict(type='GN', num_groups=32),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 encoder: ConfigType = None,
+                 positional_encoding: ConfigType = dict(
+                     num_feats=128, normalize=True),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.strides = strides
+        self.num_input_levels = len(in_channels)
+        self.num_encoder_levels = \
+            encoder.layer_cfg.self_attn_cfg.num_levels
+        assert self.num_encoder_levels >= 1, \
+            'num_levels in attn_cfgs must be at least one'
+        input_conv_list = []
+        # from top to down (low to high resolution)
+        for i in range(self.num_input_levels - 1,
+                       self.num_input_levels - self.num_encoder_levels - 1,
+                       -1):
+            input_conv = ConvModule(
+                in_channels[i],
+                feat_channels,
+                kernel_size=1,
+                norm_cfg=norm_cfg,
+                act_cfg=None,
+                bias=True)
+            input_conv_list.append(input_conv)
+        self.input_convs = ModuleList(input_conv_list)
+
+        self.encoder = Mask2FormerTransformerEncoder(**encoder)
+        self.postional_encoding = SinePositionalEncoding(**positional_encoding)
+        # high resolution to low resolution
+        self.level_encoding = nn.Embedding(self.num_encoder_levels,
+                                           feat_channels)
+
+        # fpn-like structure
+        self.lateral_convs = ModuleList()
+        self.output_convs = ModuleList()
+        self.use_bias = norm_cfg is None
+        # from top to down (low to high resolution)
+        # fpn for the rest features that didn't pass in encoder
+        for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1,
+                       -1):
+            lateral_conv = ConvModule(
+                in_channels[i],
+                feat_channels,
+                kernel_size=1,
+                bias=self.use_bias,
+                norm_cfg=norm_cfg,
+                act_cfg=None)
+            output_conv = ConvModule(
+                feat_channels,
+                feat_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=self.use_bias,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            self.lateral_convs.append(lateral_conv)
+            self.output_convs.append(output_conv)
+
+        self.mask_feature = Conv2d(
+            feat_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+        self.num_outs = num_outs
+        self.point_generator = MlvlPointGenerator(strides)
+
+    def init_weights(self) -> None:
+        """Initialize weights."""
+        for i in range(0, self.num_encoder_levels):
+            xavier_init(
+                self.input_convs[i].conv,
+                gain=1,
+                bias=0,
+                distribution='uniform')
+
+        for i in range(0, self.num_input_levels - self.num_encoder_levels):
+            caffe2_xavier_init(self.lateral_convs[i].conv, bias=0)
+            caffe2_xavier_init(self.output_convs[i].conv, bias=0)
+
+        caffe2_xavier_init(self.mask_feature, bias=0)
+
+        normal_init(self.level_encoding, mean=0, std=1)
+        for p in self.encoder.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_normal_(p)
+
+        # init_weights defined in MultiScaleDeformableAttention
+        for m in self.encoder.layers.modules():
+            if isinstance(m, MultiScaleDeformableAttention):
+                m.init_weights()
+
+    def forward(self, feats: List[Tensor]) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+            feats (list[Tensor]): Feature maps of each level. Each has
+                shape of (batch_size, c, h, w).
+
+        Returns:
+            tuple: A tuple containing the following:
+
+                - mask_feature (Tensor): shape (batch_size, c, h, w).
+                - multi_scale_features (list[Tensor]): Multi scale \
+                        features, each in shape (batch_size, c, h, w).
+        """
+        # generate padding mask for each level, for each image
+        batch_size = feats[0].shape[0]
+        encoder_input_list = []
+        padding_mask_list = []
+        level_positional_encoding_list = []
+        spatial_shapes = []
+        reference_points_list = []
+        for i in range(self.num_encoder_levels):
+            level_idx = self.num_input_levels - i - 1
+            feat = feats[level_idx]
+            feat_projected = self.input_convs[i](feat)
+            feat_hw = torch._shape_as_tensor(feat)[2:].to(feat.device)
+
+            # no padding
+            padding_mask_resized = feat.new_zeros(
+                (batch_size, ) + feat.shape[-2:], dtype=torch.bool)
+            pos_embed = self.postional_encoding(padding_mask_resized)
+            level_embed = self.level_encoding.weight[i]
+            level_pos_embed = level_embed.view(1, -1, 1, 1) + pos_embed
+            # (h_i * w_i, 2)
+            reference_points = self.point_generator.single_level_grid_priors(
+                feat.shape[-2:], level_idx, device=feat.device)
+            # normalize
+            feat_wh = feat_hw.unsqueeze(0).flip(dims=[0, 1])
+            factor = feat_wh * self.strides[level_idx]
+            reference_points = reference_points / factor
+
+            # shape (batch_size, c, h_i, w_i) -> (h_i * w_i, batch_size, c)
+            feat_projected = feat_projected.flatten(2).permute(0, 2, 1)
+            level_pos_embed = level_pos_embed.flatten(2).permute(0, 2, 1)
+            padding_mask_resized = padding_mask_resized.flatten(1)
+
+            encoder_input_list.append(feat_projected)
+            padding_mask_list.append(padding_mask_resized)
+            level_positional_encoding_list.append(level_pos_embed)
+            spatial_shapes.append(feat_hw)
+            reference_points_list.append(reference_points)
+        # shape (batch_size, total_num_queries),
+        # total_num_queries=sum([., h_i * w_i,.])
+        padding_masks = torch.cat(padding_mask_list, dim=1)
+        # shape (total_num_queries, batch_size, c)
+        encoder_inputs = torch.cat(encoder_input_list, dim=1)
+        level_positional_encodings = torch.cat(
+            level_positional_encoding_list, dim=1)
+        # shape (num_encoder_levels, 2), from low
+        # resolution to high resolution
+        num_queries_per_level = [e[0] * e[1] for e in spatial_shapes]
+        spatial_shapes = torch.cat(spatial_shapes).view(-1, 2)
+        # shape (0, h_0*w_0, h_0*w_0+h_1*w_1, ...)
+        level_start_index = torch.cat((spatial_shapes.new_zeros(
+            (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        reference_points = torch.cat(reference_points_list, dim=0)
+        reference_points = reference_points[None, :, None].repeat(
+            batch_size, 1, self.num_encoder_levels, 1)
+        valid_radios = reference_points.new_ones(
+            (batch_size, self.num_encoder_levels, 2))
+        # shape (num_total_queries, batch_size, c)
+        memory = self.encoder(
+            query=encoder_inputs,
+            query_pos=level_positional_encodings,
+            key_padding_mask=padding_masks,
+            spatial_shapes=spatial_shapes,
+            reference_points=reference_points,
+            level_start_index=level_start_index,
+            valid_ratios=valid_radios)
+        # (batch_size, c, num_total_queries)
+        memory = memory.permute(0, 2, 1)
+
+        # from low resolution to high resolution
+        outs = torch.split(memory, num_queries_per_level, dim=-1)
+        outs = [
+            x.reshape(batch_size, -1, spatial_shapes[i][0],
+                      spatial_shapes[i][1]) for i, x in enumerate(outs)
+        ]
+
+        for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1,
+                       -1):
+            x = feats[i]
+            cur_feat = self.lateral_convs[i](x)
+            y = cur_feat + F.interpolate(
+                outs[-1],
+                size=cur_feat.shape[-2:],
+                mode='bilinear',
+                align_corners=False)
+            y = self.output_convs[i](y)
+            outs.append(y)
+        multi_scale_features = outs[:self.num_outs]
+
+        mask_feature = self.mask_feature(outs[-1])
+        return mask_feature, multi_scale_features
diff --git a/mmde/mmdet/models/layers/normed_predictor.py b/mmde/mmdet/models/layers/normed_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..592194b1dbbb8582f4c642bf29135573e1f8c3c8
--- /dev/null
+++ b/mmde/mmdet/models/layers/normed_predictor.py
@@ -0,0 +1,99 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.utils import digit_version
+from torch import Tensor
+
+from mmdet.registry import MODELS
+
+MODELS.register_module('Linear', module=nn.Linear)
+
+
+@MODELS.register_module(name='NormedLinear')
+class NormedLinear(nn.Linear):
+    """Normalized Linear Layer.
+
+    Args:
+        tempeature (float, optional): Tempeature term. Defaults to 20.
+        power (int, optional): Power term. Defaults to 1.0.
+        eps (float, optional): The minimal value of divisor to
+             keep numerical stability. Defaults to 1e-6.
+    """
+
+    def __init__(self,
+                 *args,
+                 tempearture: float = 20,
+                 power: int = 1.0,
+                 eps: float = 1e-6,
+                 **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.tempearture = tempearture
+        self.power = power
+        self.eps = eps
+        self.init_weights()
+
+    def init_weights(self) -> None:
+        """Initialize the weights."""
+        nn.init.normal_(self.weight, mean=0, std=0.01)
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function for `NormedLinear`."""
+        weight_ = self.weight / (
+            self.weight.norm(dim=1, keepdim=True).pow(self.power) + self.eps)
+        x_ = x / (x.norm(dim=1, keepdim=True).pow(self.power) + self.eps)
+        x_ = x_ * self.tempearture
+
+        return F.linear(x_, weight_, self.bias)
+
+
+@MODELS.register_module(name='NormedConv2d')
+class NormedConv2d(nn.Conv2d):
+    """Normalized Conv2d Layer.
+
+    Args:
+        tempeature (float, optional): Tempeature term. Defaults to 20.
+        power (int, optional): Power term. Defaults to 1.0.
+        eps (float, optional): The minimal value of divisor to
+             keep numerical stability. Defaults to 1e-6.
+        norm_over_kernel (bool, optional): Normalize over kernel.
+             Defaults to False.
+    """
+
+    def __init__(self,
+                 *args,
+                 tempearture: float = 20,
+                 power: int = 1.0,
+                 eps: float = 1e-6,
+                 norm_over_kernel: bool = False,
+                 **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.tempearture = tempearture
+        self.power = power
+        self.norm_over_kernel = norm_over_kernel
+        self.eps = eps
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function for `NormedConv2d`."""
+        if not self.norm_over_kernel:
+            weight_ = self.weight / (
+                self.weight.norm(dim=1, keepdim=True).pow(self.power) +
+                self.eps)
+        else:
+            weight_ = self.weight / (
+                self.weight.view(self.weight.size(0), -1).norm(
+                    dim=1, keepdim=True).pow(self.power)[..., None, None] +
+                self.eps)
+        x_ = x / (x.norm(dim=1, keepdim=True).pow(self.power) + self.eps)
+        x_ = x_ * self.tempearture
+
+        if hasattr(self, 'conv2d_forward'):
+            x_ = self.conv2d_forward(x_, weight_)
+        else:
+            if digit_version(torch.__version__) >= digit_version('1.8'):
+                x_ = self._conv_forward(x_, weight_, self.bias)
+            else:
+                x_ = self._conv_forward(x_, weight_)
+        return x_
diff --git a/mmde/mmdet/models/layers/pixel_decoder.py b/mmde/mmdet/models/layers/pixel_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb61434045eb9996276518577800132e4a25eb3e
--- /dev/null
+++ b/mmde/mmdet/models/layers/pixel_decoder.py
@@ -0,0 +1,249 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, ConvModule
+from mmengine.model import BaseModule, ModuleList, caffe2_xavier_init
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptMultiConfig
+from .positional_encoding import SinePositionalEncoding
+from .transformer import DetrTransformerEncoder
+
+
+@MODELS.register_module()
+class PixelDecoder(BaseModule):
+    """Pixel decoder with a structure like fpn.
+
+    Args:
+        in_channels (list[int] | tuple[int]): Number of channels in the
+            input feature maps.
+        feat_channels (int): Number channels for feature.
+        out_channels (int): Number channels for output.
+        norm_cfg (:obj:`ConfigDict` or dict): Config for normalization.
+            Defaults to dict(type='GN', num_groups=32).
+        act_cfg (:obj:`ConfigDict` or dict): Config for activation.
+            Defaults to dict(type='ReLU').
+        encoder (:obj:`ConfigDict` or dict): Config for transorformer
+            encoder.Defaults to None.
+        positional_encoding (:obj:`ConfigDict` or dict): Config for
+            transformer encoder position encoding. Defaults to
+            dict(type='SinePositionalEncoding', num_feats=128,
+            normalize=True).
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: Union[List[int], Tuple[int]],
+                 feat_channels: int,
+                 out_channels: int,
+                 norm_cfg: ConfigType = dict(type='GN', num_groups=32),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.num_inputs = len(in_channels)
+        self.lateral_convs = ModuleList()
+        self.output_convs = ModuleList()
+        self.use_bias = norm_cfg is None
+        for i in range(0, self.num_inputs - 1):
+            lateral_conv = ConvModule(
+                in_channels[i],
+                feat_channels,
+                kernel_size=1,
+                bias=self.use_bias,
+                norm_cfg=norm_cfg,
+                act_cfg=None)
+            output_conv = ConvModule(
+                feat_channels,
+                feat_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=self.use_bias,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            self.lateral_convs.append(lateral_conv)
+            self.output_convs.append(output_conv)
+
+        self.last_feat_conv = ConvModule(
+            in_channels[-1],
+            feat_channels,
+            kernel_size=3,
+            padding=1,
+            stride=1,
+            bias=self.use_bias,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.mask_feature = Conv2d(
+            feat_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+    def init_weights(self) -> None:
+        """Initialize weights."""
+        for i in range(0, self.num_inputs - 2):
+            caffe2_xavier_init(self.lateral_convs[i].conv, bias=0)
+            caffe2_xavier_init(self.output_convs[i].conv, bias=0)
+
+        caffe2_xavier_init(self.mask_feature, bias=0)
+        caffe2_xavier_init(self.last_feat_conv, bias=0)
+
+    def forward(self, feats: List[Tensor],
+                batch_img_metas: List[dict]) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+            feats (list[Tensor]): Feature maps of each level. Each has
+                shape of (batch_size, c, h, w).
+            batch_img_metas (list[dict]): List of image information.
+                Pass in for creating more accurate padding mask. Not
+                used here.
+
+        Returns:
+            tuple[Tensor, Tensor]: a tuple containing the following:
+
+                - mask_feature (Tensor): Shape (batch_size, c, h, w).
+                - memory (Tensor): Output of last stage of backbone.\
+                        Shape (batch_size, c, h, w).
+        """
+        y = self.last_feat_conv(feats[-1])
+        for i in range(self.num_inputs - 2, -1, -1):
+            x = feats[i]
+            cur_feat = self.lateral_convs[i](x)
+            y = cur_feat + \
+                F.interpolate(y, size=cur_feat.shape[-2:], mode='nearest')
+            y = self.output_convs[i](y)
+
+        mask_feature = self.mask_feature(y)
+        memory = feats[-1]
+        return mask_feature, memory
+
+
+@MODELS.register_module()
+class TransformerEncoderPixelDecoder(PixelDecoder):
+    """Pixel decoder with transormer encoder inside.
+
+    Args:
+        in_channels (list[int] | tuple[int]): Number of channels in the
+            input feature maps.
+        feat_channels (int): Number channels for feature.
+        out_channels (int): Number channels for output.
+        norm_cfg (:obj:`ConfigDict` or dict): Config for normalization.
+            Defaults to dict(type='GN', num_groups=32).
+        act_cfg (:obj:`ConfigDict` or dict): Config for activation.
+            Defaults to dict(type='ReLU').
+        encoder (:obj:`ConfigDict` or dict): Config for transformer encoder.
+            Defaults to None.
+        positional_encoding (:obj:`ConfigDict` or dict): Config for
+            transformer encoder position encoding. Defaults to
+            dict(num_feats=128, normalize=True).
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: Union[List[int], Tuple[int]],
+                 feat_channels: int,
+                 out_channels: int,
+                 norm_cfg: ConfigType = dict(type='GN', num_groups=32),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 encoder: ConfigType = None,
+                 positional_encoding: ConfigType = dict(
+                     num_feats=128, normalize=True),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            in_channels=in_channels,
+            feat_channels=feat_channels,
+            out_channels=out_channels,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            init_cfg=init_cfg)
+        self.last_feat_conv = None
+
+        self.encoder = DetrTransformerEncoder(**encoder)
+        self.encoder_embed_dims = self.encoder.embed_dims
+        assert self.encoder_embed_dims == feat_channels, 'embed_dims({}) of ' \
+            'tranformer encoder must equal to feat_channels({})'.format(
+                feat_channels, self.encoder_embed_dims)
+        self.positional_encoding = SinePositionalEncoding(
+            **positional_encoding)
+        self.encoder_in_proj = Conv2d(
+            in_channels[-1], feat_channels, kernel_size=1)
+        self.encoder_out_proj = ConvModule(
+            feat_channels,
+            feat_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=self.use_bias,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+    def init_weights(self) -> None:
+        """Initialize weights."""
+        for i in range(0, self.num_inputs - 2):
+            caffe2_xavier_init(self.lateral_convs[i].conv, bias=0)
+            caffe2_xavier_init(self.output_convs[i].conv, bias=0)
+
+        caffe2_xavier_init(self.mask_feature, bias=0)
+        caffe2_xavier_init(self.encoder_in_proj, bias=0)
+        caffe2_xavier_init(self.encoder_out_proj.conv, bias=0)
+
+        for p in self.encoder.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+
+    def forward(self, feats: List[Tensor],
+                batch_img_metas: List[dict]) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+            feats (list[Tensor]): Feature maps of each level. Each has
+                shape of (batch_size, c, h, w).
+            batch_img_metas (list[dict]): List of image information. Pass in
+                for creating more accurate padding mask.
+
+        Returns:
+            tuple: a tuple containing the following:
+
+                - mask_feature (Tensor): shape (batch_size, c, h, w).
+                - memory (Tensor): shape (batch_size, c, h, w).
+        """
+        feat_last = feats[-1]
+        bs, c, h, w = feat_last.shape
+        input_img_h, input_img_w = batch_img_metas[0]['batch_input_shape']
+        padding_mask = feat_last.new_ones((bs, input_img_h, input_img_w),
+                                          dtype=torch.float32)
+        for i in range(bs):
+            img_h, img_w = batch_img_metas[i]['img_shape']
+            padding_mask[i, :img_h, :img_w] = 0
+        padding_mask = F.interpolate(
+            padding_mask.unsqueeze(1),
+            size=feat_last.shape[-2:],
+            mode='nearest').to(torch.bool).squeeze(1)
+
+        pos_embed = self.positional_encoding(padding_mask)
+        feat_last = self.encoder_in_proj(feat_last)
+        # (batch_size, c, h, w) -> (batch_size, num_queries, c)
+        feat_last = feat_last.flatten(2).permute(0, 2, 1)
+        pos_embed = pos_embed.flatten(2).permute(0, 2, 1)
+        # (batch_size, h, w) -> (batch_size, h*w)
+        padding_mask = padding_mask.flatten(1)
+        memory = self.encoder(
+            query=feat_last,
+            query_pos=pos_embed,
+            key_padding_mask=padding_mask)
+        # (batch_size, num_queries, c) -> (batch_size, c, h, w)
+        memory = memory.permute(0, 2, 1).view(bs, self.encoder_embed_dims, h,
+                                              w)
+        y = self.encoder_out_proj(memory)
+        for i in range(self.num_inputs - 2, -1, -1):
+            x = feats[i]
+            cur_feat = self.lateral_convs[i](x)
+            y = cur_feat + \
+                F.interpolate(y, size=cur_feat.shape[-2:], mode='nearest')
+            y = self.output_convs[i](y)
+
+        mask_feature = self.mask_feature(y)
+        return mask_feature, memory
diff --git a/mmde/mmdet/models/layers/positional_encoding.py b/mmde/mmdet/models/layers/positional_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..87080d81a9f155839d453b8671103e5d51fbf88a
--- /dev/null
+++ b/mmde/mmdet/models/layers/positional_encoding.py
@@ -0,0 +1,269 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import MultiConfig, OptMultiConfig
+
+
+@MODELS.register_module()
+class SinePositionalEncoding(BaseModule):
+    """Position encoding with sine and cosine functions.
+
+    See `End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+
+    Args:
+        num_feats (int): The feature dimension for each position
+            along x-axis or y-axis. Note the final returned dimension
+            for each position is 2 times of this value.
+        temperature (int, optional): The temperature used for scaling
+            the position embedding. Defaults to 10000.
+        normalize (bool, optional): Whether to normalize the position
+            embedding. Defaults to False.
+        scale (float, optional): A scale factor that scales the position
+            embedding. The scale will be used only when `normalize` is True.
+            Defaults to 2*pi.
+        eps (float, optional): A value added to the denominator for
+            numerical stability. Defaults to 1e-6.
+        offset (float): offset add to embed when do the normalization.
+            Defaults to 0.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None
+    """
+
+    def __init__(self,
+                 num_feats: int,
+                 temperature: int = 10000,
+                 normalize: bool = False,
+                 scale: float = 2 * math.pi,
+                 eps: float = 1e-6,
+                 offset: float = 0.,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        if normalize:
+            assert isinstance(scale, (float, int)), 'when normalize is set,' \
+                'scale should be provided and in float or int type, ' \
+                f'found {type(scale)}'
+        self.num_feats = num_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        self.scale = scale
+        self.eps = eps
+        self.offset = offset
+
+    def forward(self, mask: Tensor, input: Optional[Tensor] = None) -> Tensor:
+        """Forward function for `SinePositionalEncoding`.
+
+        Args:
+            mask (Tensor): ByteTensor mask. Non-zero values representing
+                ignored positions, while zero values means valid positions
+                for this image. Shape [bs, h, w].
+            input (Tensor, optional): Input image/feature Tensor.
+                Shape [bs, c, h, w]
+
+        Returns:
+            pos (Tensor): Returned position embedding with shape
+                [bs, num_feats*2, h, w].
+        """
+        assert not (mask is None and input is None)
+
+        if mask is not None:
+            B, H, W = mask.size()
+            device = mask.device
+            # For convenience of exporting to ONNX,
+            # it's required to convert
+            # `masks` from bool to int.
+            mask = mask.to(torch.int)
+            not_mask = 1 - mask  # logical_not
+            y_embed = not_mask.cumsum(1, dtype=torch.float32)
+            x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        else:
+            # single image or batch image with no padding
+            B, _, H, W = input.shape
+            device = input.device
+            x_embed = torch.arange(
+                1, W + 1, dtype=torch.float32, device=device)
+            x_embed = x_embed.view(1, 1, -1).repeat(B, H, 1)
+            y_embed = torch.arange(
+                1, H + 1, dtype=torch.float32, device=device)
+            y_embed = y_embed.view(1, -1, 1).repeat(B, 1, W)
+        if self.normalize:
+            y_embed = (y_embed + self.offset) / \
+                      (y_embed[:, -1:, :] + self.eps) * self.scale
+            x_embed = (x_embed + self.offset) / \
+                      (x_embed[:, :, -1:] + self.eps) * self.scale
+        dim_t = torch.arange(
+            self.num_feats, dtype=torch.float32, device=device)
+        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats)
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        # use `view` instead of `flatten` for dynamically exporting to ONNX
+
+        pos_x = torch.stack(
+            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
+            dim=4).view(B, H, W, -1)
+        pos_y = torch.stack(
+            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
+            dim=4).view(B, H, W, -1)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+
+    def __repr__(self) -> str:
+        """str: a string that describes the module"""
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_feats={self.num_feats}, '
+        repr_str += f'temperature={self.temperature}, '
+        repr_str += f'normalize={self.normalize}, '
+        repr_str += f'scale={self.scale}, '
+        repr_str += f'eps={self.eps})'
+        return repr_str
+
+
+@MODELS.register_module()
+class LearnedPositionalEncoding(BaseModule):
+    """Position embedding with learnable embedding weights.
+
+    Args:
+        num_feats (int): The feature dimension for each position
+            along x-axis or y-axis. The final returned dimension for
+            each position is 2 times of this value.
+        row_num_embed (int, optional): The dictionary size of row embeddings.
+            Defaults to 50.
+        col_num_embed (int, optional): The dictionary size of col embeddings.
+            Defaults to 50.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        num_feats: int,
+        row_num_embed: int = 50,
+        col_num_embed: int = 50,
+        init_cfg: MultiConfig = dict(type='Uniform', layer='Embedding')
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.row_embed = nn.Embedding(row_num_embed, num_feats)
+        self.col_embed = nn.Embedding(col_num_embed, num_feats)
+        self.num_feats = num_feats
+        self.row_num_embed = row_num_embed
+        self.col_num_embed = col_num_embed
+
+    def forward(self, mask: Tensor) -> Tensor:
+        """Forward function for `LearnedPositionalEncoding`.
+
+        Args:
+            mask (Tensor): ByteTensor mask. Non-zero values representing
+                ignored positions, while zero values means valid positions
+                for this image. Shape [bs, h, w].
+
+        Returns:
+            pos (Tensor): Returned position embedding with shape
+                [bs, num_feats*2, h, w].
+        """
+        h, w = mask.shape[-2:]
+        x = torch.arange(w, device=mask.device)
+        y = torch.arange(h, device=mask.device)
+        x_embed = self.col_embed(x)
+        y_embed = self.row_embed(y)
+        pos = torch.cat(
+            (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat(
+                1, w, 1)),
+            dim=-1).permute(2, 0,
+                            1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1)
+        return pos
+
+    def __repr__(self) -> str:
+        """str: a string that describes the module"""
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_feats={self.num_feats}, '
+        repr_str += f'row_num_embed={self.row_num_embed}, '
+        repr_str += f'col_num_embed={self.col_num_embed})'
+        return repr_str
+
+
+@MODELS.register_module()
+class SinePositionalEncoding3D(SinePositionalEncoding):
+    """Position encoding with sine and cosine functions.
+
+    See `End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+
+    Args:
+        num_feats (int): The feature dimension for each position
+            along x-axis or y-axis. Note the final returned dimension
+            for each position is 2 times of this value.
+        temperature (int, optional): The temperature used for scaling
+            the position embedding. Defaults to 10000.
+        normalize (bool, optional): Whether to normalize the position
+            embedding. Defaults to False.
+        scale (float, optional): A scale factor that scales the position
+            embedding. The scale will be used only when `normalize` is True.
+            Defaults to 2*pi.
+        eps (float, optional): A value added to the denominator for
+            numerical stability. Defaults to 1e-6.
+        offset (float): offset add to embed when do the normalization.
+            Defaults to 0.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def forward(self, mask: Tensor) -> Tensor:
+        """Forward function for `SinePositionalEncoding3D`.
+
+        Args:
+            mask (Tensor): ByteTensor mask. Non-zero values representing
+                ignored positions, while zero values means valid positions
+                for this image. Shape [bs, t, h, w].
+
+        Returns:
+            pos (Tensor): Returned position embedding with shape
+                [bs, num_feats*2, h, w].
+        """
+        assert mask.dim() == 4,\
+            f'{mask.shape} should be a 4-dimensional Tensor,' \
+            f' got {mask.dim()}-dimensional Tensor instead '
+        # For convenience of exporting to ONNX, it's required to convert
+        # `masks` from bool to int.
+        mask = mask.to(torch.int)
+        not_mask = 1 - mask  # logical_not
+        z_embed = not_mask.cumsum(1, dtype=torch.float32)
+        y_embed = not_mask.cumsum(2, dtype=torch.float32)
+        x_embed = not_mask.cumsum(3, dtype=torch.float32)
+        if self.normalize:
+            z_embed = (z_embed + self.offset) / \
+                      (z_embed[:, -1:, :, :] + self.eps) * self.scale
+            y_embed = (y_embed + self.offset) / \
+                      (y_embed[:, :, -1:, :] + self.eps) * self.scale
+            x_embed = (x_embed + self.offset) / \
+                      (x_embed[:, :, :, -1:] + self.eps) * self.scale
+        dim_t = torch.arange(
+            self.num_feats, dtype=torch.float32, device=mask.device)
+        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats)
+
+        dim_t_z = torch.arange((self.num_feats * 2),
+                               dtype=torch.float32,
+                               device=mask.device)
+        dim_t_z = self.temperature**(2 * (dim_t_z // 2) / (self.num_feats * 2))
+
+        pos_x = x_embed[:, :, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, :, None] / dim_t
+        pos_z = z_embed[:, :, :, :, None] / dim_t_z
+        # use `view` instead of `flatten` for dynamically exporting to ONNX
+        B, T, H, W = mask.size()
+        pos_x = torch.stack(
+            (pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()),
+            dim=5).view(B, T, H, W, -1)
+        pos_y = torch.stack(
+            (pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()),
+            dim=5).view(B, T, H, W, -1)
+        pos_z = torch.stack(
+            (pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()),
+            dim=5).view(B, T, H, W, -1)
+        pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3)
+        return pos
diff --git a/mmde/mmdet/models/layers/res_layer.py b/mmde/mmdet/models/layers/res_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff24d3e8562d1c3c724b35f7dc10cafe48e47650
--- /dev/null
+++ b/mmde/mmdet/models/layers/res_layer.py
@@ -0,0 +1,195 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule, Sequential
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+
+
+class ResLayer(Sequential):
+    """ResLayer to build ResNet style backbone.
+
+    Args:
+        block (nn.Module): block used to build ResLayer.
+        inplanes (int): inplanes of block.
+        planes (int): planes of block.
+        num_blocks (int): number of blocks.
+        stride (int): stride of the first block. Defaults to 1
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottleneck. Defaults to False
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Defaults to None
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Defaults to dict(type='BN')
+        downsample_first (bool): Downsample at the first block or last block.
+            False for Hourglass, True for ResNet. Defaults to True
+    """
+
+    def __init__(self,
+                 block: BaseModule,
+                 inplanes: int,
+                 planes: int,
+                 num_blocks: int,
+                 stride: int = 1,
+                 avg_down: bool = False,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 downsample_first: bool = True,
+                 **kwargs) -> None:
+        self.block = block
+
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = []
+            conv_stride = stride
+            if avg_down:
+                conv_stride = 1
+                downsample.append(
+                    nn.AvgPool2d(
+                        kernel_size=stride,
+                        stride=stride,
+                        ceil_mode=True,
+                        count_include_pad=False))
+            downsample.extend([
+                build_conv_layer(
+                    conv_cfg,
+                    inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=conv_stride,
+                    bias=False),
+                build_norm_layer(norm_cfg, planes * block.expansion)[1]
+            ])
+            downsample = nn.Sequential(*downsample)
+
+        layers = []
+        if downsample_first:
+            layers.append(
+                block(
+                    inplanes=inplanes,
+                    planes=planes,
+                    stride=stride,
+                    downsample=downsample,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    **kwargs))
+            inplanes = planes * block.expansion
+            for _ in range(1, num_blocks):
+                layers.append(
+                    block(
+                        inplanes=inplanes,
+                        planes=planes,
+                        stride=1,
+                        conv_cfg=conv_cfg,
+                        norm_cfg=norm_cfg,
+                        **kwargs))
+
+        else:  # downsample_first=False is for HourglassModule
+            for _ in range(num_blocks - 1):
+                layers.append(
+                    block(
+                        inplanes=inplanes,
+                        planes=inplanes,
+                        stride=1,
+                        conv_cfg=conv_cfg,
+                        norm_cfg=norm_cfg,
+                        **kwargs))
+            layers.append(
+                block(
+                    inplanes=inplanes,
+                    planes=planes,
+                    stride=stride,
+                    downsample=downsample,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    **kwargs))
+        super().__init__(*layers)
+
+
+class SimplifiedBasicBlock(BaseModule):
+    """Simplified version of original basic residual block. This is used in
+    `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    - Norm layer is now optional
+    - Last ReLU in forward function is removed
+    """
+    expansion = 1
+
+    def __init__(self,
+                 inplanes: int,
+                 planes: int,
+                 stride: int = 1,
+                 dilation: int = 1,
+                 downsample: Optional[Sequential] = None,
+                 style: ConfigType = 'pytorch',
+                 with_cp: bool = False,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 dcn: OptConfigType = None,
+                 plugins: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert dcn is None, 'Not implemented yet.'
+        assert plugins is None, 'Not implemented yet.'
+        assert not with_cp, 'Not implemented yet.'
+        self.with_norm = norm_cfg is not None
+        with_bias = True if norm_cfg is None else False
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            3,
+            stride=stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=with_bias)
+        if self.with_norm:
+            self.norm1_name, norm1 = build_norm_layer(
+                norm_cfg, planes, postfix=1)
+            self.add_module(self.norm1_name, norm1)
+        self.conv2 = build_conv_layer(
+            conv_cfg, planes, planes, 3, padding=1, bias=with_bias)
+        if self.with_norm:
+            self.norm2_name, norm2 = build_norm_layer(
+                norm_cfg, planes, postfix=2)
+            self.add_module(self.norm2_name, norm2)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.with_cp = with_cp
+
+    @property
+    def norm1(self) -> Optional[BaseModule]:
+        """nn.Module: normalization layer after the first convolution layer"""
+        return getattr(self, self.norm1_name) if self.with_norm else None
+
+    @property
+    def norm2(self) -> Optional[BaseModule]:
+        """nn.Module: normalization layer after the second convolution layer"""
+        return getattr(self, self.norm2_name) if self.with_norm else None
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function for SimplifiedBasicBlock."""
+
+        identity = x
+
+        out = self.conv1(x)
+        if self.with_norm:
+            out = self.norm1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        if self.with_norm:
+            out = self.norm2(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+
+        return out
diff --git a/mmde/mmdet/models/layers/se_layer.py b/mmde/mmdet/models/layers/se_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5598dabaf6f3b3a09f4348fcd65ff39897b7068f
--- /dev/null
+++ b/mmde/mmdet/models/layers/se_layer.py
@@ -0,0 +1,162 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from mmengine.utils import digit_version, is_tuple_of
+from torch import Tensor
+
+from mmdet.utils import MultiConfig, OptConfigType, OptMultiConfig
+
+
+class SELayer(BaseModule):
+    """Squeeze-and-Excitation Module.
+
+    Args:
+        channels (int): The input (and output) channels of the SE layer.
+        ratio (int): Squeeze ratio in SELayer, the intermediate channel will be
+            ``int(channels/ratio)``. Defaults to 16.
+        conv_cfg (None or dict): Config dict for convolution layer.
+            Defaults to None, which means using conv2d.
+        act_cfg (dict or Sequence[dict]): Config dict for activation layer.
+            If act_cfg is a dict, two activation layers will be configurated
+            by this dict. If act_cfg is a sequence of dicts, the first
+            activation layer will be configurated by the first dict and the
+            second activation layer will be configurated by the second dict.
+            Defaults to (dict(type='ReLU'), dict(type='Sigmoid'))
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None
+    """
+
+    def __init__(self,
+                 channels: int,
+                 ratio: int = 16,
+                 conv_cfg: OptConfigType = None,
+                 act_cfg: MultiConfig = (dict(type='ReLU'),
+                                         dict(type='Sigmoid')),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        if isinstance(act_cfg, dict):
+            act_cfg = (act_cfg, act_cfg)
+        assert len(act_cfg) == 2
+        assert is_tuple_of(act_cfg, dict)
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = ConvModule(
+            in_channels=channels,
+            out_channels=int(channels / ratio),
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[0])
+        self.conv2 = ConvModule(
+            in_channels=int(channels / ratio),
+            out_channels=channels,
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[1])
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function for SELayer."""
+        out = self.global_avgpool(x)
+        out = self.conv1(out)
+        out = self.conv2(out)
+        return x * out
+
+
+class DyReLU(BaseModule):
+    """Dynamic ReLU (DyReLU) module.
+
+    See `Dynamic ReLU <https://arxiv.org/abs/2003.10027>`_ for details.
+    Current implementation is specialized for task-aware attention in DyHead.
+    HSigmoid arguments in default act_cfg follow DyHead official code.
+    https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py
+
+    Args:
+        channels (int): The input (and output) channels of DyReLU module.
+        ratio (int): Squeeze ratio in Squeeze-and-Excitation-like module,
+            the intermediate channel will be ``int(channels/ratio)``.
+            Defaults to 4.
+        conv_cfg (None or dict): Config dict for convolution layer.
+            Defaults to None, which means using conv2d.
+        act_cfg (dict or Sequence[dict]): Config dict for activation layer.
+            If act_cfg is a dict, two activation layers will be configurated
+            by this dict. If act_cfg is a sequence of dicts, the first
+            activation layer will be configurated by the first dict and the
+            second activation layer will be configurated by the second dict.
+            Defaults to (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0,
+            divisor=6.0))
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None
+    """
+
+    def __init__(self,
+                 channels: int,
+                 ratio: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 act_cfg: MultiConfig = (dict(type='ReLU'),
+                                         dict(
+                                             type='HSigmoid',
+                                             bias=3.0,
+                                             divisor=6.0)),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        if isinstance(act_cfg, dict):
+            act_cfg = (act_cfg, act_cfg)
+        assert len(act_cfg) == 2
+        assert is_tuple_of(act_cfg, dict)
+        self.channels = channels
+        self.expansion = 4  # for a1, b1, a2, b2
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = ConvModule(
+            in_channels=channels,
+            out_channels=int(channels / ratio),
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[0])
+        self.conv2 = ConvModule(
+            in_channels=int(channels / ratio),
+            out_channels=channels * self.expansion,
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[1])
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function."""
+        coeffs = self.global_avgpool(x)
+        coeffs = self.conv1(coeffs)
+        coeffs = self.conv2(coeffs) - 0.5  # value range: [-0.5, 0.5]
+        a1, b1, a2, b2 = torch.split(coeffs, self.channels, dim=1)
+        a1 = a1 * 2.0 + 1.0  # [-1.0, 1.0] + 1.0
+        a2 = a2 * 2.0  # [-1.0, 1.0]
+        out = torch.max(x * a1 + b1, x * a2 + b2)
+        return out
+
+
+class ChannelAttention(BaseModule):
+    """Channel attention Module.
+
+    Args:
+        channels (int): The input (and output) channels of the attention layer.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None
+    """
+
+    def __init__(self, channels: int, init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
+        if digit_version(torch.__version__) < (1, 7, 0):
+            self.act = nn.Hardsigmoid()
+        else:
+            self.act = nn.Hardsigmoid(inplace=True)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function for ChannelAttention."""
+        with torch.cuda.amp.autocast(enabled=False):
+            out = self.global_avgpool(x)
+        out = self.fc(out)
+        out = self.act(out)
+        return x * out
diff --git a/mmde/mmdet/models/layers/transformer/__init__.py b/mmde/mmdet/models/layers/transformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..839d936412673d765cd9f89a44a366a64976bb9c
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/__init__.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .conditional_detr_layers import (ConditionalDetrTransformerDecoder,
+                                      ConditionalDetrTransformerDecoderLayer)
+from .dab_detr_layers import (DABDetrTransformerDecoder,
+                              DABDetrTransformerDecoderLayer,
+                              DABDetrTransformerEncoder)
+from .ddq_detr_layers import DDQTransformerDecoder
+from .deformable_detr_layers import (DeformableDetrTransformerDecoder,
+                                     DeformableDetrTransformerDecoderLayer,
+                                     DeformableDetrTransformerEncoder,
+                                     DeformableDetrTransformerEncoderLayer)
+from .detr_layers import (DetrTransformerDecoder, DetrTransformerDecoderLayer,
+                          DetrTransformerEncoder, DetrTransformerEncoderLayer)
+from .dino_layers import CdnQueryGenerator, DinoTransformerDecoder
+from .grounding_dino_layers import (GroundingDinoTransformerDecoder,
+                                    GroundingDinoTransformerDecoderLayer,
+                                    GroundingDinoTransformerEncoder)
+from .mask2former_layers import (Mask2FormerTransformerDecoder,
+                                 Mask2FormerTransformerDecoderLayer,
+                                 Mask2FormerTransformerEncoder)
+from .utils import (MLP, AdaptivePadding, ConditionalAttention, DynamicConv,
+                    PatchEmbed, PatchMerging, coordinate_to_encoding,
+                    inverse_sigmoid, nchw_to_nlc, nlc_to_nchw)
+
+__all__ = [
+    'nlc_to_nchw', 'nchw_to_nlc', 'AdaptivePadding', 'PatchEmbed',
+    'PatchMerging', 'inverse_sigmoid', 'DynamicConv', 'MLP',
+    'DetrTransformerEncoder', 'DetrTransformerDecoder',
+    'DetrTransformerEncoderLayer', 'DetrTransformerDecoderLayer',
+    'DeformableDetrTransformerEncoder', 'DeformableDetrTransformerDecoder',
+    'DeformableDetrTransformerEncoderLayer',
+    'DeformableDetrTransformerDecoderLayer', 'coordinate_to_encoding',
+    'ConditionalAttention', 'DABDetrTransformerDecoderLayer',
+    'DABDetrTransformerDecoder', 'DABDetrTransformerEncoder',
+    'DDQTransformerDecoder', 'ConditionalDetrTransformerDecoder',
+    'ConditionalDetrTransformerDecoderLayer', 'DinoTransformerDecoder',
+    'CdnQueryGenerator', 'Mask2FormerTransformerEncoder',
+    'Mask2FormerTransformerDecoderLayer', 'Mask2FormerTransformerDecoder',
+    'GroundingDinoTransformerDecoderLayer', 'GroundingDinoTransformerEncoder',
+    'GroundingDinoTransformerDecoder'
+]
diff --git a/mmde/mmdet/models/layers/transformer/conditional_detr_layers.py b/mmde/mmdet/models/layers/transformer/conditional_detr_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6db12a1340c758996e8c0e96f0b21cbc6fa928c9
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/conditional_detr_layers.py
@@ -0,0 +1,170 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN
+from torch import Tensor
+from torch.nn import ModuleList
+
+from .detr_layers import DetrTransformerDecoder, DetrTransformerDecoderLayer
+from .utils import MLP, ConditionalAttention, coordinate_to_encoding
+
+
+class ConditionalDetrTransformerDecoder(DetrTransformerDecoder):
+    """Decoder of Conditional DETR."""
+
+    def _init_layers(self) -> None:
+        """Initialize decoder layers and other layers."""
+        self.layers = ModuleList([
+            ConditionalDetrTransformerDecoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        self.embed_dims = self.layers[0].embed_dims
+        self.post_norm = build_norm_layer(self.post_norm_cfg,
+                                          self.embed_dims)[1]
+        # conditional detr affline
+        self.query_scale = MLP(self.embed_dims, self.embed_dims,
+                               self.embed_dims, 2)
+        self.ref_point_head = MLP(self.embed_dims, self.embed_dims, 2, 2)
+        # we have substitute 'qpos_proj' with 'qpos_sine_proj' except for
+        # the first decoder layer), so 'qpos_proj' should be deleted
+        # in other layers.
+        for layer_id in range(self.num_layers - 1):
+            self.layers[layer_id + 1].cross_attn.qpos_proj = None
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor = None,
+                query_pos: Tensor = None,
+                key_pos: Tensor = None,
+                key_padding_mask: Tensor = None):
+        """Forward function of decoder.
+
+        Args:
+            query (Tensor): The input query with shape
+                (bs, num_queries, dim).
+            key (Tensor): The input key with shape (bs, num_keys, dim) If
+                `None`, the `query` will be used. Defaults to `None`.
+            query_pos (Tensor): The positional encoding for `query`, with the
+                same shape as `query`. If not `None`, it will be added to
+                `query` before forward function. Defaults to `None`.
+            key_pos (Tensor): The positional encoding for `key`, with the
+                same shape as `key`. If not `None`, it will be added to
+                `key` before forward function. If `None`, and `query_pos`
+                has the same shape as `key`, then `query_pos` will be used
+                as `key_pos`. Defaults to `None`.
+            key_padding_mask (Tensor): ByteTensor with shape (bs, num_keys).
+                Defaults to `None`.
+        Returns:
+            List[Tensor]: forwarded results with shape (num_decoder_layers,
+            bs, num_queries, dim) if `return_intermediate` is True, otherwise
+            with shape (1, bs, num_queries, dim). References with shape
+            (bs, num_queries, 2).
+        """
+        reference_unsigmoid = self.ref_point_head(
+            query_pos)  # [bs, num_queries, 2]
+        reference = reference_unsigmoid.sigmoid()
+        reference_xy = reference[..., :2]
+        intermediate = []
+        for layer_id, layer in enumerate(self.layers):
+            if layer_id == 0:
+                pos_transformation = 1
+            else:
+                pos_transformation = self.query_scale(query)
+            # get sine embedding for the query reference
+            ref_sine_embed = coordinate_to_encoding(coord_tensor=reference_xy)
+            # apply transformation
+            ref_sine_embed = ref_sine_embed * pos_transformation
+            query = layer(
+                query,
+                key=key,
+                query_pos=query_pos,
+                key_pos=key_pos,
+                key_padding_mask=key_padding_mask,
+                ref_sine_embed=ref_sine_embed,
+                is_first=(layer_id == 0))
+            if self.return_intermediate:
+                intermediate.append(self.post_norm(query))
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), reference
+
+        query = self.post_norm(query)
+        return query.unsqueeze(0), reference
+
+
+class ConditionalDetrTransformerDecoderLayer(DetrTransformerDecoderLayer):
+    """Implements decoder layer in Conditional DETR transformer."""
+
+    def _init_layers(self):
+        """Initialize self-attention, cross-attention, FFN, and
+        normalization."""
+        self.self_attn = ConditionalAttention(**self.self_attn_cfg)
+        self.cross_attn = ConditionalAttention(**self.cross_attn_cfg)
+        self.embed_dims = self.self_attn.embed_dims
+        self.ffn = FFN(**self.ffn_cfg)
+        norms_list = [
+            build_norm_layer(self.norm_cfg, self.embed_dims)[1]
+            for _ in range(3)
+        ]
+        self.norms = ModuleList(norms_list)
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor = None,
+                query_pos: Tensor = None,
+                key_pos: Tensor = None,
+                self_attn_masks: Tensor = None,
+                cross_attn_masks: Tensor = None,
+                key_padding_mask: Tensor = None,
+                ref_sine_embed: Tensor = None,
+                is_first: bool = False):
+        """
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim)
+            key (Tensor, optional): The input key, has shape (bs, num_keys,
+                dim). If `None`, the `query` will be used. Defaults to `None`.
+            query_pos (Tensor, optional): The positional encoding for `query`,
+                has the same shape as `query`. If not `None`, it will be
+                added to `query` before forward function. Defaults to `None`.
+            ref_sine_embed (Tensor): The positional encoding for query in
+                cross attention, with the same shape as `x`. Defaults to None.
+            key_pos (Tensor, optional): The positional encoding for `key`, has
+                the same shape as `key`. If not None, it will be added to
+                `key` before forward function. If None, and `query_pos` has
+                the same shape as `key`, then `query_pos` will be used for
+                `key_pos`. Defaults to None.
+            self_attn_masks (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), Same in `nn.MultiheadAttention.
+                forward`. Defaults to None.
+            cross_attn_masks (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), Same in `nn.MultiheadAttention.
+                forward`. Defaults to None.
+            key_padding_mask (Tensor, optional): ByteTensor, has shape
+                (bs, num_keys). Defaults to None.
+            is_first (bool): A indicator to tell whether the current layer
+                is the first layer of the decoder. Defaults to False.
+
+        Returns:
+            Tensor: Forwarded results, has shape (bs, num_queries, dim).
+        """
+        query = self.self_attn(
+            query=query,
+            key=query,
+            query_pos=query_pos,
+            key_pos=query_pos,
+            attn_mask=self_attn_masks)
+        query = self.norms[0](query)
+        query = self.cross_attn(
+            query=query,
+            key=key,
+            query_pos=query_pos,
+            key_pos=key_pos,
+            attn_mask=cross_attn_masks,
+            key_padding_mask=key_padding_mask,
+            ref_sine_embed=ref_sine_embed,
+            is_first=is_first)
+        query = self.norms[1](query)
+        query = self.ffn(query)
+        query = self.norms[2](query)
+
+        return query
diff --git a/mmde/mmdet/models/layers/transformer/dab_detr_layers.py b/mmde/mmdet/models/layers/transformer/dab_detr_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8a6e7724a1b1ca18f26dd10455f3e3a4d696460
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/dab_detr_layers.py
@@ -0,0 +1,298 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN
+from mmengine.model import ModuleList
+from torch import Tensor
+
+from .detr_layers import (DetrTransformerDecoder, DetrTransformerDecoderLayer,
+                          DetrTransformerEncoder, DetrTransformerEncoderLayer)
+from .utils import (MLP, ConditionalAttention, coordinate_to_encoding,
+                    inverse_sigmoid)
+
+
+class DABDetrTransformerDecoderLayer(DetrTransformerDecoderLayer):
+    """Implements decoder layer in DAB-DETR transformer."""
+
+    def _init_layers(self):
+        """Initialize self-attention, cross-attention, FFN, normalization and
+        others."""
+        self.self_attn = ConditionalAttention(**self.self_attn_cfg)
+        self.cross_attn = ConditionalAttention(**self.cross_attn_cfg)
+        self.embed_dims = self.self_attn.embed_dims
+        self.ffn = FFN(**self.ffn_cfg)
+        norms_list = [
+            build_norm_layer(self.norm_cfg, self.embed_dims)[1]
+            for _ in range(3)
+        ]
+        self.norms = ModuleList(norms_list)
+        self.keep_query_pos = self.cross_attn.keep_query_pos
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor,
+                query_pos: Tensor,
+                key_pos: Tensor,
+                ref_sine_embed: Tensor = None,
+                self_attn_masks: Tensor = None,
+                cross_attn_masks: Tensor = None,
+                key_padding_mask: Tensor = None,
+                is_first: bool = False,
+                **kwargs) -> Tensor:
+        """
+        Args:
+            query (Tensor): The input query with shape [bs, num_queries,
+                dim].
+            key (Tensor): The key tensor with shape [bs, num_keys,
+                dim].
+            query_pos (Tensor): The positional encoding for query in self
+                attention, with the same shape as `x`.
+            key_pos (Tensor): The positional encoding for `key`, with the
+                same shape as `key`.
+            ref_sine_embed (Tensor): The positional encoding for query in
+                cross attention, with the same shape as `x`.
+                Defaults to None.
+            self_attn_masks (Tensor): ByteTensor mask with shape [num_queries,
+                num_keys]. Same in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            cross_attn_masks (Tensor): ByteTensor mask with shape [num_queries,
+                num_keys]. Same in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+                Defaults to None.
+            is_first (bool): A indicator to tell whether the current layer
+                is the first layer of the decoder.
+                Defaults to False.
+
+        Returns:
+            Tensor: forwarded results with shape
+            [bs, num_queries, dim].
+        """
+
+        query = self.self_attn(
+            query=query,
+            key=query,
+            query_pos=query_pos,
+            key_pos=query_pos,
+            attn_mask=self_attn_masks,
+            **kwargs)
+        query = self.norms[0](query)
+        query = self.cross_attn(
+            query=query,
+            key=key,
+            query_pos=query_pos,
+            key_pos=key_pos,
+            ref_sine_embed=ref_sine_embed,
+            attn_mask=cross_attn_masks,
+            key_padding_mask=key_padding_mask,
+            is_first=is_first,
+            **kwargs)
+        query = self.norms[1](query)
+        query = self.ffn(query)
+        query = self.norms[2](query)
+
+        return query
+
+
+class DABDetrTransformerDecoder(DetrTransformerDecoder):
+    """Decoder of DAB-DETR.
+
+    Args:
+        query_dim (int): The last dimension of query pos,
+            4 for anchor format, 2 for point format.
+            Defaults to 4.
+        query_scale_type (str): Type of transformation applied
+            to content query. Defaults to `cond_elewise`.
+        with_modulated_hw_attn (bool): Whether to inject h&w info
+            during cross conditional attention. Defaults to True.
+    """
+
+    def __init__(self,
+                 *args,
+                 query_dim: int = 4,
+                 query_scale_type: str = 'cond_elewise',
+                 with_modulated_hw_attn: bool = True,
+                 **kwargs):
+
+        self.query_dim = query_dim
+        self.query_scale_type = query_scale_type
+        self.with_modulated_hw_attn = with_modulated_hw_attn
+
+        super().__init__(*args, **kwargs)
+
+    def _init_layers(self):
+        """Initialize decoder layers and other layers."""
+        assert self.query_dim in [2, 4], \
+            f'{"dab-detr only supports anchor prior or reference point prior"}'
+        assert self.query_scale_type in [
+            'cond_elewise', 'cond_scalar', 'fix_elewise'
+        ]
+
+        self.layers = ModuleList([
+            DABDetrTransformerDecoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+
+        embed_dims = self.layers[0].embed_dims
+        self.embed_dims = embed_dims
+
+        self.post_norm = build_norm_layer(self.post_norm_cfg, embed_dims)[1]
+        if self.query_scale_type == 'cond_elewise':
+            self.query_scale = MLP(embed_dims, embed_dims, embed_dims, 2)
+        elif self.query_scale_type == 'cond_scalar':
+            self.query_scale = MLP(embed_dims, embed_dims, 1, 2)
+        elif self.query_scale_type == 'fix_elewise':
+            self.query_scale = nn.Embedding(self.num_layers, embed_dims)
+        else:
+            raise NotImplementedError('Unknown query_scale_type: {}'.format(
+                self.query_scale_type))
+
+        self.ref_point_head = MLP(self.query_dim // 2 * embed_dims, embed_dims,
+                                  embed_dims, 2)
+
+        if self.with_modulated_hw_attn and self.query_dim == 4:
+            self.ref_anchor_head = MLP(embed_dims, embed_dims, 2, 2)
+
+        self.keep_query_pos = self.layers[0].keep_query_pos
+        if not self.keep_query_pos:
+            for layer_id in range(self.num_layers - 1):
+                self.layers[layer_id + 1].cross_attn.qpos_proj = None
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor,
+                query_pos: Tensor,
+                key_pos: Tensor,
+                reg_branches: nn.Module,
+                key_padding_mask: Tensor = None,
+                **kwargs) -> List[Tensor]:
+        """Forward function of decoder.
+
+        Args:
+            query (Tensor): The input query with shape (bs, num_queries, dim).
+            key (Tensor): The input key with shape (bs, num_keys, dim).
+            query_pos (Tensor): The positional encoding for `query`, with the
+                same shape as `query`.
+            key_pos (Tensor): The positional encoding for `key`, with the
+                same shape as `key`.
+            reg_branches (nn.Module): The regression branch for dynamically
+                updating references in each layer.
+            key_padding_mask (Tensor): ByteTensor with shape (bs, num_keys).
+                Defaults to `None`.
+
+        Returns:
+            List[Tensor]: forwarded results with shape (num_decoder_layers,
+            bs, num_queries, dim) if `return_intermediate` is True, otherwise
+            with shape (1, bs, num_queries, dim). references with shape
+            (num_decoder_layers, bs, num_queries, 2/4).
+        """
+        output = query
+        unsigmoid_references = query_pos
+
+        reference_points = unsigmoid_references.sigmoid()
+        intermediate_reference_points = [reference_points]
+
+        intermediate = []
+        for layer_id, layer in enumerate(self.layers):
+            obj_center = reference_points[..., :self.query_dim]
+            ref_sine_embed = coordinate_to_encoding(
+                coord_tensor=obj_center, num_feats=self.embed_dims // 2)
+            query_pos = self.ref_point_head(
+                ref_sine_embed)  # [bs, nq, 2c] -> [bs, nq, c]
+            # For the first decoder layer, do not apply transformation
+            if self.query_scale_type != 'fix_elewise':
+                if layer_id == 0:
+                    pos_transformation = 1
+                else:
+                    pos_transformation = self.query_scale(output)
+            else:
+                pos_transformation = self.query_scale.weight[layer_id]
+            # apply transformation
+            ref_sine_embed = ref_sine_embed[
+                ..., :self.embed_dims] * pos_transformation
+            # modulated height and weight attention
+            if self.with_modulated_hw_attn:
+                assert obj_center.size(-1) == 4
+                ref_hw = self.ref_anchor_head(output).sigmoid()
+                ref_sine_embed[..., self.embed_dims // 2:] *= \
+                    (ref_hw[..., 0] / obj_center[..., 2]).unsqueeze(-1)
+                ref_sine_embed[..., : self.embed_dims // 2] *= \
+                    (ref_hw[..., 1] / obj_center[..., 3]).unsqueeze(-1)
+
+            output = layer(
+                output,
+                key,
+                query_pos=query_pos,
+                ref_sine_embed=ref_sine_embed,
+                key_pos=key_pos,
+                key_padding_mask=key_padding_mask,
+                is_first=(layer_id == 0),
+                **kwargs)
+            # iter update
+            tmp_reg_preds = reg_branches(output)
+            tmp_reg_preds[..., :self.query_dim] += inverse_sigmoid(
+                reference_points)
+            new_reference_points = tmp_reg_preds[
+                ..., :self.query_dim].sigmoid()
+            if layer_id != self.num_layers - 1:
+                intermediate_reference_points.append(new_reference_points)
+            reference_points = new_reference_points.detach()
+
+            if self.return_intermediate:
+                intermediate.append(self.post_norm(output))
+
+        output = self.post_norm(output)
+
+        if self.return_intermediate:
+            return [
+                torch.stack(intermediate),
+                torch.stack(intermediate_reference_points),
+            ]
+        else:
+            return [
+                output.unsqueeze(0),
+                torch.stack(intermediate_reference_points)
+            ]
+
+
+class DABDetrTransformerEncoder(DetrTransformerEncoder):
+    """Encoder of DAB-DETR."""
+
+    def _init_layers(self):
+        """Initialize encoder layers."""
+        self.layers = ModuleList([
+            DetrTransformerEncoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        embed_dims = self.layers[0].embed_dims
+        self.embed_dims = embed_dims
+        self.query_scale = MLP(embed_dims, embed_dims, embed_dims, 2)
+
+    def forward(self, query: Tensor, query_pos: Tensor,
+                key_padding_mask: Tensor, **kwargs):
+        """Forward function of encoder.
+
+        Args:
+            query (Tensor): Input queries of encoder, has shape
+                (bs, num_queries, dim).
+            query_pos (Tensor): The positional embeddings of the queries, has
+                shape (bs, num_feat_points, dim).
+            key_padding_mask (Tensor): ByteTensor, the key padding mask
+                of the queries, has shape (bs, num_feat_points).
+
+        Returns:
+            Tensor: With shape (num_queries, bs, dim).
+        """
+
+        for layer in self.layers:
+            pos_scales = self.query_scale(query)
+            query = layer(
+                query,
+                query_pos=query_pos * pos_scales,
+                key_padding_mask=key_padding_mask,
+                **kwargs)
+
+        return query
diff --git a/mmde/mmdet/models/layers/transformer/ddq_detr_layers.py b/mmde/mmdet/models/layers/transformer/ddq_detr_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..57664c7ea2bdd17681ccdabe9140eb043a99e155
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/ddq_detr_layers.py
@@ -0,0 +1,223 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import torch
+from mmcv.ops import batched_nms
+from torch import Tensor, nn
+
+from mmdet.structures.bbox import bbox_cxcywh_to_xyxy
+from .deformable_detr_layers import DeformableDetrTransformerDecoder
+from .utils import MLP, coordinate_to_encoding, inverse_sigmoid
+
+
+class DDQTransformerDecoder(DeformableDetrTransformerDecoder):
+    """Transformer decoder of DDQ."""
+
+    def _init_layers(self) -> None:
+        """Initialize encoder layers."""
+        super()._init_layers()
+        self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims,
+                                  self.embed_dims, 2)
+        self.norm = nn.LayerNorm(self.embed_dims)
+
+    def select_distinct_queries(self, reference_points: Tensor, query: Tensor,
+                                self_attn_mask: Tensor, layer_index):
+        """Get updated `self_attn_mask` for distinct queries selection, it is
+        used in self attention layers of decoder.
+
+        Args:
+            reference_points (Tensor): The input reference of decoder,
+                has shape (bs, num_queries, 4) with the last dimension
+                arranged as (cx, cy, w, h).
+            query (Tensor): The input query of decoder, has shape
+                (bs, num_queries, dims).
+            self_attn_mask (Tensor): The input self attention mask of
+                last decoder layer, has shape (bs, num_queries_total,
+                num_queries_total).
+            layer_index (int): Last decoder layer index, used to get
+                classification score of last layer output, for
+                distinct queries selection.
+
+        Returns:
+            Tensor: `self_attn_mask` used in self attention layers
+                of decoder, has shape (bs, num_queries_total,
+                num_queries_total).
+        """
+        num_imgs = len(reference_points)
+        dis_start, num_dis = self.cache_dict['dis_query_info']
+        # shape of self_attn_mask
+        # (batch⋅num_heads, num_queries, embed_dims)
+        dis_mask = self_attn_mask[:, dis_start:dis_start + num_dis,
+                                  dis_start:dis_start + num_dis]
+        # cls_branches from DDQDETRHead
+        scores = self.cache_dict['cls_branches'][layer_index](
+            query[:, dis_start:dis_start + num_dis]).sigmoid().max(-1).values
+        proposals = reference_points[:, dis_start:dis_start + num_dis]
+        proposals = bbox_cxcywh_to_xyxy(proposals)
+
+        attn_mask_list = []
+        for img_id in range(num_imgs):
+            single_proposals = proposals[img_id]
+            single_scores = scores[img_id]
+            attn_mask = ~dis_mask[img_id * self.cache_dict['num_heads']][0]
+            # distinct query inds in this layer
+            ori_index = attn_mask.nonzero().view(-1)
+            _, keep_idxs = batched_nms(single_proposals[ori_index],
+                                       single_scores[ori_index],
+                                       torch.ones(len(ori_index)),
+                                       self.cache_dict['dqs_cfg'])
+
+            real_keep_index = ori_index[keep_idxs]
+
+            attn_mask = torch.ones_like(dis_mask[0]).bool()
+            # such a attn_mask give best result
+            # If it requires to keep index i, then all cells in row or column
+            #   i should be kept in `attn_mask` . For example, if
+            #   `real_keep_index` = [1, 4], and `attn_mask` size = [8, 8],
+            #   then all cells at rows or columns [1, 4] should be kept, and
+            #   all the other cells should be masked out. So the value of
+            #  `attn_mask` should be:
+            #
+            # target\source   0 1 2 3 4 5 6 7
+            #             0 [ 0 1 0 0 1 0 0 0 ]
+            #             1 [ 1 1 1 1 1 1 1 1 ]
+            #             2 [ 0 1 0 0 1 0 0 0 ]
+            #             3 [ 0 1 0 0 1 0 0 0 ]
+            #             4 [ 1 1 1 1 1 1 1 1 ]
+            #             5 [ 0 1 0 0 1 0 0 0 ]
+            #             6 [ 0 1 0 0 1 0 0 0 ]
+            #             7 [ 0 1 0 0 1 0 0 0 ]
+            attn_mask[real_keep_index] = False
+            attn_mask[:, real_keep_index] = False
+
+            attn_mask = attn_mask[None].repeat(self.cache_dict['num_heads'], 1,
+                                               1)
+            attn_mask_list.append(attn_mask)
+        attn_mask = torch.cat(attn_mask_list)
+        self_attn_mask = copy.deepcopy(self_attn_mask)
+        self_attn_mask[:, dis_start:dis_start + num_dis,
+                       dis_start:dis_start + num_dis] = attn_mask
+        # will be used in loss and inference
+        self.cache_dict['distinct_query_mask'].append(~attn_mask)
+        return self_attn_mask
+
+    def forward(self, query: Tensor, value: Tensor, key_padding_mask: Tensor,
+                self_attn_mask: Tensor, reference_points: Tensor,
+                spatial_shapes: Tensor, level_start_index: Tensor,
+                valid_ratios: Tensor, reg_branches: nn.ModuleList,
+                **kwargs) -> Tensor:
+        """Forward function of Transformer decoder.
+
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries,
+                dims).
+            value (Tensor): The input values, has shape (bs, num_value, dim).
+            key_padding_mask (Tensor): The `key_padding_mask` of `cross_attn`
+                input. ByteTensor, has shape (bs, num_value).
+            self_attn_mask (Tensor): The attention mask to prevent information
+                leakage from different denoising groups, distinct queries and
+                dense queries, has shape (num_queries_total,
+                num_queries_total). It will be updated for distinct queries
+                selection in this forward function. It is `None` when
+                `self.training` is `False`.
+            reference_points (Tensor): The initial reference, has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+            reg_branches: (obj:`nn.ModuleList`): Used for refining the
+                regression results.
+
+        Returns:
+            tuple[Tensor]: Output queries and references of Transformer
+                decoder
+
+            - query (Tensor): Output embeddings of the last decoder, has
+              shape (bs, num_queries, embed_dims) when `return_intermediate`
+              is `False`. Otherwise, Intermediate output embeddings of all
+              decoder layers, has shape (num_decoder_layers, bs, num_queries,
+              embed_dims).
+            - reference_points (Tensor): The reference of the last decoder
+              layer, has shape (bs, num_queries, 4) when `return_intermediate`
+              is `False`. Otherwise, Intermediate references of all decoder
+              layers, has shape (1 + num_decoder_layers, bs, num_queries, 4).
+              The coordinates are arranged as (cx, cy, w, h).
+        """
+        intermediate = []
+        intermediate_reference_points = [reference_points]
+        self.cache_dict['distinct_query_mask'] = []
+        if self_attn_mask is None:
+            self_attn_mask = torch.zeros((query.size(1), query.size(1)),
+                                         device=query.device).bool()
+        # shape is (batch*number_heads, num_queries, num_queries)
+        self_attn_mask = self_attn_mask[None].repeat(
+            len(query) * self.cache_dict['num_heads'], 1, 1)
+        for layer_index, layer in enumerate(self.layers):
+            if reference_points.shape[-1] == 4:
+                reference_points_input = \
+                    reference_points[:, :, None] * torch.cat(
+                        [valid_ratios, valid_ratios], -1)[:, None]
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = \
+                    reference_points[:, :, None] * valid_ratios[:, None]
+
+            query_sine_embed = coordinate_to_encoding(
+                reference_points_input[:, :, 0, :],
+                num_feats=self.embed_dims // 2)
+            query_pos = self.ref_point_head(query_sine_embed)
+
+            query = layer(
+                query,
+                query_pos=query_pos,
+                value=value,
+                key_padding_mask=key_padding_mask,
+                self_attn_mask=self_attn_mask,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                reference_points=reference_points_input,
+                **kwargs)
+
+            if not self.training:
+                tmp = reg_branches[layer_index](query)
+                assert reference_points.shape[-1] == 4
+                new_reference_points = tmp + inverse_sigmoid(
+                    reference_points, eps=1e-3)
+                new_reference_points = new_reference_points.sigmoid()
+                reference_points = new_reference_points.detach()
+                if layer_index < (len(self.layers) - 1):
+                    self_attn_mask = self.select_distinct_queries(
+                        reference_points, query, self_attn_mask, layer_index)
+
+            else:
+                num_dense = self.cache_dict['num_dense_queries']
+                tmp = reg_branches[layer_index](query[:, :-num_dense])
+                tmp_dense = self.aux_reg_branches[layer_index](
+                    query[:, -num_dense:])
+
+                tmp = torch.cat([tmp, tmp_dense], dim=1)
+                assert reference_points.shape[-1] == 4
+                new_reference_points = tmp + inverse_sigmoid(
+                    reference_points, eps=1e-3)
+                new_reference_points = new_reference_points.sigmoid()
+                reference_points = new_reference_points.detach()
+                if layer_index < (len(self.layers) - 1):
+                    self_attn_mask = self.select_distinct_queries(
+                        reference_points, query, self_attn_mask, layer_index)
+
+            if self.return_intermediate:
+                intermediate.append(self.norm(query))
+                intermediate_reference_points.append(new_reference_points)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(
+                intermediate_reference_points)
+
+        return query, reference_points
diff --git a/mmde/mmdet/models/layers/transformer/deformable_detr_layers.py b/mmde/mmdet/models/layers/transformer/deformable_detr_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..da6325d61270eb3546a39d5487587bc0610434d6
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/deformable_detr_layers.py
@@ -0,0 +1,265 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import torch
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
+from mmcv.ops import MultiScaleDeformableAttention
+from mmengine.model import ModuleList
+from torch import Tensor, nn
+
+from .detr_layers import (DetrTransformerDecoder, DetrTransformerDecoderLayer,
+                          DetrTransformerEncoder, DetrTransformerEncoderLayer)
+from .utils import inverse_sigmoid
+
+try:
+    from fairscale.nn.checkpoint import checkpoint_wrapper
+except Exception:
+    checkpoint_wrapper = None
+
+
+class DeformableDetrTransformerEncoder(DetrTransformerEncoder):
+    """Transformer encoder of Deformable DETR."""
+
+    def _init_layers(self) -> None:
+        """Initialize encoder layers."""
+        self.layers = ModuleList([
+            DeformableDetrTransformerEncoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+
+        if self.num_cp > 0:
+            if checkpoint_wrapper is None:
+                raise NotImplementedError(
+                    'If you want to reduce GPU memory usage, \
+                    please install fairscale by executing the \
+                    following command: pip install fairscale.')
+            for i in range(self.num_cp):
+                self.layers[i] = checkpoint_wrapper(self.layers[i])
+
+        self.embed_dims = self.layers[0].embed_dims
+
+    def forward(self, query: Tensor, query_pos: Tensor,
+                key_padding_mask: Tensor, spatial_shapes: Tensor,
+                level_start_index: Tensor, valid_ratios: Tensor,
+                **kwargs) -> Tensor:
+        """Forward function of Transformer encoder.
+
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            query_pos (Tensor): The positional encoding for query, has shape
+                (bs, num_queries, dim).
+            key_padding_mask (Tensor): The `key_padding_mask` of `self_attn`
+                input. ByteTensor, has shape (bs, num_queries).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+
+        Returns:
+            Tensor: Output queries of Transformer encoder, which is also
+            called 'encoder output embeddings' or 'memory', has shape
+            (bs, num_queries, dim)
+        """
+        reference_points = self.get_encoder_reference_points(
+            spatial_shapes, valid_ratios, device=query.device)
+        for layer in self.layers:
+            query = layer(
+                query=query,
+                query_pos=query_pos,
+                key_padding_mask=key_padding_mask,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                reference_points=reference_points,
+                **kwargs)
+        return query
+
+    @staticmethod
+    def get_encoder_reference_points(
+            spatial_shapes: Tensor, valid_ratios: Tensor,
+            device: Union[torch.device, str]) -> Tensor:
+        """Get the reference points used in encoder.
+
+        Args:
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+            device (obj:`device` or str): The device acquired by the
+                `reference_points`.
+
+        Returns:
+            Tensor: Reference points used in decoder, has shape (bs, length,
+            num_levels, 2).
+        """
+
+        reference_points_list = []
+        for lvl, (H, W) in enumerate(spatial_shapes):
+            ref_y, ref_x = torch.meshgrid(
+                torch.linspace(
+                    0.5, H - 0.5, H, dtype=torch.float32, device=device),
+                torch.linspace(
+                    0.5, W - 0.5, W, dtype=torch.float32, device=device))
+            ref_y = ref_y.reshape(-1)[None] / (
+                valid_ratios[:, None, lvl, 1] * H)
+            ref_x = ref_x.reshape(-1)[None] / (
+                valid_ratios[:, None, lvl, 0] * W)
+            ref = torch.stack((ref_x, ref_y), -1)
+            reference_points_list.append(ref)
+        reference_points = torch.cat(reference_points_list, 1)
+        # [bs, sum(hw), num_level, 2]
+        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+        return reference_points
+
+
+class DeformableDetrTransformerDecoder(DetrTransformerDecoder):
+    """Transformer Decoder of Deformable DETR."""
+
+    def _init_layers(self) -> None:
+        """Initialize decoder layers."""
+        self.layers = ModuleList([
+            DeformableDetrTransformerDecoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        self.embed_dims = self.layers[0].embed_dims
+        if self.post_norm_cfg is not None:
+            raise ValueError('There is not post_norm in '
+                             f'{self._get_name()}')
+
+    def forward(self,
+                query: Tensor,
+                query_pos: Tensor,
+                value: Tensor,
+                key_padding_mask: Tensor,
+                reference_points: Tensor,
+                spatial_shapes: Tensor,
+                level_start_index: Tensor,
+                valid_ratios: Tensor,
+                reg_branches: Optional[nn.Module] = None,
+                **kwargs) -> Tuple[Tensor]:
+        """Forward function of Transformer decoder.
+
+        Args:
+            query (Tensor): The input queries, has shape (bs, num_queries,
+                dim).
+            query_pos (Tensor): The input positional query, has shape
+                (bs, num_queries, dim). It will be added to `query` before
+                forward function.
+            value (Tensor): The input values, has shape (bs, num_value, dim).
+            key_padding_mask (Tensor): The `key_padding_mask` of `cross_attn`
+                input. ByteTensor, has shape (bs, num_value).
+            reference_points (Tensor): The initial reference, has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h) when `as_two_stage` is `True`, otherwise has
+                shape (bs, num_queries, 2) with the last dimension arranged
+                as (cx, cy).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+            reg_branches: (obj:`nn.ModuleList`, optional): Used for refining
+                the regression results. Only would be passed when
+                `with_box_refine` is `True`, otherwise would be `None`.
+
+        Returns:
+            tuple[Tensor]: Outputs of Deformable Transformer Decoder.
+
+            - output (Tensor): Output embeddings of the last decoder, has
+              shape (num_queries, bs, embed_dims) when `return_intermediate`
+              is `False`. Otherwise, Intermediate output embeddings of all
+              decoder layers, has shape (num_decoder_layers, num_queries, bs,
+              embed_dims).
+            - reference_points (Tensor): The reference of the last decoder
+              layer, has shape (bs, num_queries, 4)  when `return_intermediate`
+              is `False`. Otherwise, Intermediate references of all decoder
+              layers, has shape (num_decoder_layers, bs, num_queries, 4). The
+              coordinates are arranged as (cx, cy, w, h)
+        """
+        output = query
+        intermediate = []
+        intermediate_reference_points = []
+        for layer_id, layer in enumerate(self.layers):
+            if reference_points.shape[-1] == 4:
+                reference_points_input = \
+                    reference_points[:, :, None] * \
+                    torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = \
+                    reference_points[:, :, None] * \
+                    valid_ratios[:, None]
+            output = layer(
+                output,
+                query_pos=query_pos,
+                value=value,
+                key_padding_mask=key_padding_mask,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                reference_points=reference_points_input,
+                **kwargs)
+
+            if reg_branches is not None:
+                tmp_reg_preds = reg_branches[layer_id](output)
+                if reference_points.shape[-1] == 4:
+                    new_reference_points = tmp_reg_preds + inverse_sigmoid(
+                        reference_points)
+                    new_reference_points = new_reference_points.sigmoid()
+                else:
+                    assert reference_points.shape[-1] == 2
+                    new_reference_points = tmp_reg_preds
+                    new_reference_points[..., :2] = tmp_reg_preds[
+                        ..., :2] + inverse_sigmoid(reference_points)
+                    new_reference_points = new_reference_points.sigmoid()
+                reference_points = new_reference_points.detach()
+
+            if self.return_intermediate:
+                intermediate.append(output)
+                intermediate_reference_points.append(reference_points)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(
+                intermediate_reference_points)
+
+        return output, reference_points
+
+
+class DeformableDetrTransformerEncoderLayer(DetrTransformerEncoderLayer):
+    """Encoder layer of Deformable DETR."""
+
+    def _init_layers(self) -> None:
+        """Initialize self_attn, ffn, and norms."""
+        self.self_attn = MultiScaleDeformableAttention(**self.self_attn_cfg)
+        self.embed_dims = self.self_attn.embed_dims
+        self.ffn = FFN(**self.ffn_cfg)
+        norms_list = [
+            build_norm_layer(self.norm_cfg, self.embed_dims)[1]
+            for _ in range(2)
+        ]
+        self.norms = ModuleList(norms_list)
+
+
+class DeformableDetrTransformerDecoderLayer(DetrTransformerDecoderLayer):
+    """Decoder layer of Deformable DETR."""
+
+    def _init_layers(self) -> None:
+        """Initialize self_attn, cross-attn, ffn, and norms."""
+        self.self_attn = MultiheadAttention(**self.self_attn_cfg)
+        self.cross_attn = MultiScaleDeformableAttention(**self.cross_attn_cfg)
+        self.embed_dims = self.self_attn.embed_dims
+        self.ffn = FFN(**self.ffn_cfg)
+        norms_list = [
+            build_norm_layer(self.norm_cfg, self.embed_dims)[1]
+            for _ in range(3)
+        ]
+        self.norms = ModuleList(norms_list)
diff --git a/mmde/mmdet/models/layers/transformer/detr_layers.py b/mmde/mmdet/models/layers/transformer/detr_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a83dd2faa660ed8f54bdd08271db1fcf6b53886
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/detr_layers.py
@@ -0,0 +1,374 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import torch
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
+from mmengine import ConfigDict
+from mmengine.model import BaseModule, ModuleList
+from torch import Tensor
+
+from mmdet.utils import ConfigType, OptConfigType
+
+try:
+    from fairscale.nn.checkpoint import checkpoint_wrapper
+except Exception:
+    checkpoint_wrapper = None
+
+
+class DetrTransformerEncoder(BaseModule):
+    """Encoder of DETR.
+
+    Args:
+        num_layers (int): Number of encoder layers.
+        layer_cfg (:obj:`ConfigDict` or dict): the config of each encoder
+            layer. All the layers will share the same config.
+        num_cp (int): Number of checkpointing blocks in encoder layer.
+            Default to -1.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_layers: int,
+                 layer_cfg: ConfigType,
+                 num_cp: int = -1,
+                 init_cfg: OptConfigType = None) -> None:
+
+        super().__init__(init_cfg=init_cfg)
+        self.num_layers = num_layers
+        self.layer_cfg = layer_cfg
+        self.num_cp = num_cp
+        assert self.num_cp <= self.num_layers
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize encoder layers."""
+        self.layers = ModuleList([
+            DetrTransformerEncoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+
+        if self.num_cp > 0:
+            if checkpoint_wrapper is None:
+                raise NotImplementedError(
+                    'If you want to reduce GPU memory usage, \
+                    please install fairscale by executing the \
+                    following command: pip install fairscale.')
+            for i in range(self.num_cp):
+                self.layers[i] = checkpoint_wrapper(self.layers[i])
+
+        self.embed_dims = self.layers[0].embed_dims
+
+    def forward(self, query: Tensor, query_pos: Tensor,
+                key_padding_mask: Tensor, **kwargs) -> Tensor:
+        """Forward function of encoder.
+
+        Args:
+            query (Tensor): Input queries of encoder, has shape
+                (bs, num_queries, dim).
+            query_pos (Tensor): The positional embeddings of the queries, has
+                shape (bs, num_queries, dim).
+            key_padding_mask (Tensor): The `key_padding_mask` of `self_attn`
+                input. ByteTensor, has shape (bs, num_queries).
+
+        Returns:
+            Tensor: Has shape (bs, num_queries, dim) if `batch_first` is
+            `True`, otherwise (num_queries, bs, dim).
+        """
+        for layer in self.layers:
+            query = layer(query, query_pos, key_padding_mask, **kwargs)
+        return query
+
+
+class DetrTransformerDecoder(BaseModule):
+    """Decoder of DETR.
+
+    Args:
+        num_layers (int): Number of decoder layers.
+        layer_cfg (:obj:`ConfigDict` or dict): the config of each encoder
+            layer. All the layers will share the same config.
+        post_norm_cfg (:obj:`ConfigDict` or dict, optional): Config of the
+            post normalization layer. Defaults to `LN`.
+        return_intermediate (bool, optional): Whether to return outputs of
+            intermediate layers. Defaults to `True`,
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_layers: int,
+                 layer_cfg: ConfigType,
+                 post_norm_cfg: OptConfigType = dict(type='LN'),
+                 return_intermediate: bool = True,
+                 init_cfg: Union[dict, ConfigDict] = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.layer_cfg = layer_cfg
+        self.num_layers = num_layers
+        self.post_norm_cfg = post_norm_cfg
+        self.return_intermediate = return_intermediate
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize decoder layers."""
+        self.layers = ModuleList([
+            DetrTransformerDecoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        self.embed_dims = self.layers[0].embed_dims
+        self.post_norm = build_norm_layer(self.post_norm_cfg,
+                                          self.embed_dims)[1]
+
+    def forward(self, query: Tensor, key: Tensor, value: Tensor,
+                query_pos: Tensor, key_pos: Tensor, key_padding_mask: Tensor,
+                **kwargs) -> Tensor:
+        """Forward function of decoder
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            key (Tensor): The input key, has shape (bs, num_keys, dim).
+            value (Tensor): The input value with the same shape as `key`.
+            query_pos (Tensor): The positional encoding for `query`, with the
+                same shape as `query`.
+            key_pos (Tensor): The positional encoding for `key`, with the
+                same shape as `key`.
+            key_padding_mask (Tensor): The `key_padding_mask` of `cross_attn`
+                input. ByteTensor, has shape (bs, num_value).
+
+        Returns:
+            Tensor: The forwarded results will have shape
+            (num_decoder_layers, bs, num_queries, dim) if
+            `return_intermediate` is `True` else (1, bs, num_queries, dim).
+        """
+        intermediate = []
+        for layer in self.layers:
+            query = layer(
+                query,
+                key=key,
+                value=value,
+                query_pos=query_pos,
+                key_pos=key_pos,
+                key_padding_mask=key_padding_mask,
+                **kwargs)
+            if self.return_intermediate:
+                intermediate.append(self.post_norm(query))
+        query = self.post_norm(query)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate)
+
+        return query.unsqueeze(0)
+
+
+class DetrTransformerEncoderLayer(BaseModule):
+    """Implements encoder layer in DETR transformer.
+
+    Args:
+        self_attn_cfg (:obj:`ConfigDict` or dict, optional): Config for self
+            attention.
+        ffn_cfg (:obj:`ConfigDict` or dict, optional): Config for FFN.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Config for
+            normalization layers. All the layers will share the same
+            config. Defaults to `LN`.
+        init_cfg (:obj:`ConfigDict` or dict, optional): Config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 self_attn_cfg: OptConfigType = dict(
+                     embed_dims=256, num_heads=8, dropout=0.0),
+                 ffn_cfg: OptConfigType = dict(
+                     embed_dims=256,
+                     feedforward_channels=1024,
+                     num_fcs=2,
+                     ffn_drop=0.,
+                     act_cfg=dict(type='ReLU', inplace=True)),
+                 norm_cfg: OptConfigType = dict(type='LN'),
+                 init_cfg: OptConfigType = None) -> None:
+
+        super().__init__(init_cfg=init_cfg)
+
+        self.self_attn_cfg = self_attn_cfg
+        if 'batch_first' not in self.self_attn_cfg:
+            self.self_attn_cfg['batch_first'] = True
+        else:
+            assert self.self_attn_cfg['batch_first'] is True, 'First \
+            dimension of all DETRs in mmdet is `batch`, \
+            please set `batch_first` flag.'
+
+        self.ffn_cfg = ffn_cfg
+        self.norm_cfg = norm_cfg
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize self-attention, FFN, and normalization."""
+        self.self_attn = MultiheadAttention(**self.self_attn_cfg)
+        self.embed_dims = self.self_attn.embed_dims
+        self.ffn = FFN(**self.ffn_cfg)
+        norms_list = [
+            build_norm_layer(self.norm_cfg, self.embed_dims)[1]
+            for _ in range(2)
+        ]
+        self.norms = ModuleList(norms_list)
+
+    def forward(self, query: Tensor, query_pos: Tensor,
+                key_padding_mask: Tensor, **kwargs) -> Tensor:
+        """Forward function of an encoder layer.
+
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            query_pos (Tensor): The positional encoding for query, with
+                the same shape as `query`.
+            key_padding_mask (Tensor): The `key_padding_mask` of `self_attn`
+                input. ByteTensor. has shape (bs, num_queries).
+        Returns:
+            Tensor: forwarded results, has shape (bs, num_queries, dim).
+        """
+        query = self.self_attn(
+            query=query,
+            key=query,
+            value=query,
+            query_pos=query_pos,
+            key_pos=query_pos,
+            key_padding_mask=key_padding_mask,
+            **kwargs)
+        query = self.norms[0](query)
+        query = self.ffn(query)
+        query = self.norms[1](query)
+
+        return query
+
+
+class DetrTransformerDecoderLayer(BaseModule):
+    """Implements decoder layer in DETR transformer.
+
+    Args:
+        self_attn_cfg (:obj:`ConfigDict` or dict, optional): Config for self
+            attention.
+        cross_attn_cfg (:obj:`ConfigDict` or dict, optional): Config for cross
+            attention.
+        ffn_cfg (:obj:`ConfigDict` or dict, optional): Config for FFN.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Config for
+            normalization layers. All the layers will share the same
+            config. Defaults to `LN`.
+        init_cfg (:obj:`ConfigDict` or dict, optional): Config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 self_attn_cfg: OptConfigType = dict(
+                     embed_dims=256,
+                     num_heads=8,
+                     dropout=0.0,
+                     batch_first=True),
+                 cross_attn_cfg: OptConfigType = dict(
+                     embed_dims=256,
+                     num_heads=8,
+                     dropout=0.0,
+                     batch_first=True),
+                 ffn_cfg: OptConfigType = dict(
+                     embed_dims=256,
+                     feedforward_channels=1024,
+                     num_fcs=2,
+                     ffn_drop=0.,
+                     act_cfg=dict(type='ReLU', inplace=True),
+                 ),
+                 norm_cfg: OptConfigType = dict(type='LN'),
+                 init_cfg: OptConfigType = None) -> None:
+
+        super().__init__(init_cfg=init_cfg)
+
+        self.self_attn_cfg = self_attn_cfg
+        self.cross_attn_cfg = cross_attn_cfg
+        if 'batch_first' not in self.self_attn_cfg:
+            self.self_attn_cfg['batch_first'] = True
+        else:
+            assert self.self_attn_cfg['batch_first'] is True, 'First \
+            dimension of all DETRs in mmdet is `batch`, \
+            please set `batch_first` flag.'
+
+        if 'batch_first' not in self.cross_attn_cfg:
+            self.cross_attn_cfg['batch_first'] = True
+        else:
+            assert self.cross_attn_cfg['batch_first'] is True, 'First \
+            dimension of all DETRs in mmdet is `batch`, \
+            please set `batch_first` flag.'
+
+        self.ffn_cfg = ffn_cfg
+        self.norm_cfg = norm_cfg
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        """Initialize self-attention, FFN, and normalization."""
+        self.self_attn = MultiheadAttention(**self.self_attn_cfg)
+        self.cross_attn = MultiheadAttention(**self.cross_attn_cfg)
+        self.embed_dims = self.self_attn.embed_dims
+        self.ffn = FFN(**self.ffn_cfg)
+        norms_list = [
+            build_norm_layer(self.norm_cfg, self.embed_dims)[1]
+            for _ in range(3)
+        ]
+        self.norms = ModuleList(norms_list)
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor = None,
+                value: Tensor = None,
+                query_pos: Tensor = None,
+                key_pos: Tensor = None,
+                self_attn_mask: Tensor = None,
+                cross_attn_mask: Tensor = None,
+                key_padding_mask: Tensor = None,
+                **kwargs) -> Tensor:
+        """
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            key (Tensor, optional): The input key, has shape (bs, num_keys,
+                dim). If `None`, the `query` will be used. Defaults to `None`.
+            value (Tensor, optional): The input value, has the same shape as
+                `key`, as in `nn.MultiheadAttention.forward`. If `None`, the
+                `key` will be used. Defaults to `None`.
+            query_pos (Tensor, optional): The positional encoding for `query`,
+                has the same shape as `query`. If not `None`, it will be added
+                to `query` before forward function. Defaults to `None`.
+            key_pos (Tensor, optional): The positional encoding for `key`, has
+                the same shape as `key`. If not `None`, it will be added to
+                `key` before forward function. If None, and `query_pos` has the
+                same shape as `key`, then `query_pos` will be used for
+                `key_pos`. Defaults to None.
+            self_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            cross_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor, optional): The `key_padding_mask` of
+                `self_attn` input. ByteTensor, has shape (bs, num_value).
+                Defaults to None.
+
+        Returns:
+            Tensor: forwarded results, has shape (bs, num_queries, dim).
+        """
+
+        query = self.self_attn(
+            query=query,
+            key=query,
+            value=query,
+            query_pos=query_pos,
+            key_pos=query_pos,
+            attn_mask=self_attn_mask,
+            **kwargs)
+        query = self.norms[0](query)
+        query = self.cross_attn(
+            query=query,
+            key=key,
+            value=value,
+            query_pos=query_pos,
+            key_pos=key_pos,
+            attn_mask=cross_attn_mask,
+            key_padding_mask=key_padding_mask,
+            **kwargs)
+        query = self.norms[1](query)
+        query = self.ffn(query)
+        query = self.norms[2](query)
+
+        return query
diff --git a/mmde/mmdet/models/layers/transformer/dino_layers.py b/mmde/mmdet/models/layers/transformer/dino_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..64610d0a7c0121a88f5e4279b6f854924230237e
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/dino_layers.py
@@ -0,0 +1,562 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Tuple, Union
+
+import torch
+from mmengine.model import BaseModule
+from torch import Tensor, nn
+
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox_xyxy_to_cxcywh
+from mmdet.utils import OptConfigType
+from .deformable_detr_layers import DeformableDetrTransformerDecoder
+from .utils import MLP, coordinate_to_encoding, inverse_sigmoid
+
+
+class DinoTransformerDecoder(DeformableDetrTransformerDecoder):
+    """Transformer decoder of DINO."""
+
+    def _init_layers(self) -> None:
+        """Initialize decoder layers."""
+        super()._init_layers()
+        self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims,
+                                  self.embed_dims, 2)
+        self.norm = nn.LayerNorm(self.embed_dims)
+
+    def forward(self, query: Tensor, value: Tensor, key_padding_mask: Tensor,
+                self_attn_mask: Tensor, reference_points: Tensor,
+                spatial_shapes: Tensor, level_start_index: Tensor,
+                valid_ratios: Tensor, reg_branches: nn.ModuleList,
+                **kwargs) -> Tuple[Tensor]:
+        """Forward function of Transformer decoder.
+
+        Args:
+            query (Tensor): The input query, has shape (num_queries, bs, dim).
+            value (Tensor): The input values, has shape (num_value, bs, dim).
+            key_padding_mask (Tensor): The `key_padding_mask` of `self_attn`
+                input. ByteTensor, has shape (num_queries, bs).
+            self_attn_mask (Tensor): The attention mask to prevent information
+                leakage from different denoising groups and matching parts, has
+                shape (num_queries_total, num_queries_total). It is `None` when
+                `self.training` is `False`.
+            reference_points (Tensor): The initial reference, has shape
+                (bs, num_queries, 4) with the last dimension arranged as
+                (cx, cy, w, h).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+            reg_branches: (obj:`nn.ModuleList`): Used for refining the
+                regression results.
+
+        Returns:
+            tuple[Tensor]: Output queries and references of Transformer
+                decoder
+
+            - query (Tensor): Output embeddings of the last decoder, has
+              shape (num_queries, bs, embed_dims) when `return_intermediate`
+              is `False`. Otherwise, Intermediate output embeddings of all
+              decoder layers, has shape (num_decoder_layers, num_queries, bs,
+              embed_dims).
+            - reference_points (Tensor): The reference of the last decoder
+              layer, has shape (bs, num_queries, 4)  when `return_intermediate`
+              is `False`. Otherwise, Intermediate references of all decoder
+              layers, has shape (num_decoder_layers, bs, num_queries, 4). The
+              coordinates are arranged as (cx, cy, w, h)
+        """
+        intermediate = []
+        intermediate_reference_points = [reference_points]
+        for lid, layer in enumerate(self.layers):
+            if reference_points.shape[-1] == 4:
+                reference_points_input = \
+                    reference_points[:, :, None] * torch.cat(
+                        [valid_ratios, valid_ratios], -1)[:, None]
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = \
+                    reference_points[:, :, None] * valid_ratios[:, None]
+
+            query_sine_embed = coordinate_to_encoding(
+                reference_points_input[:, :, 0, :])
+            query_pos = self.ref_point_head(query_sine_embed)
+
+            query = layer(
+                query,
+                query_pos=query_pos,
+                value=value,
+                key_padding_mask=key_padding_mask,
+                self_attn_mask=self_attn_mask,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                reference_points=reference_points_input,
+                **kwargs)
+
+            if reg_branches is not None:
+                tmp = reg_branches[lid](query)
+                assert reference_points.shape[-1] == 4
+                new_reference_points = tmp + inverse_sigmoid(
+                    reference_points, eps=1e-3)
+                new_reference_points = new_reference_points.sigmoid()
+                reference_points = new_reference_points.detach()
+
+            if self.return_intermediate:
+                intermediate.append(self.norm(query))
+                intermediate_reference_points.append(new_reference_points)
+                # NOTE this is for the "Look Forward Twice" module,
+                # in the DeformDETR, reference_points was appended.
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(
+                intermediate_reference_points)
+
+        return query, reference_points
+
+
+class CdnQueryGenerator(BaseModule):
+    """Implement query generator of the Contrastive denoising (CDN) proposed in
+    `DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object
+    Detection <https://arxiv.org/abs/2203.03605>`_
+
+    Code is modified from the `official github repo
+    <https://github.com/IDEA-Research/DINO>`_.
+
+    Args:
+        num_classes (int): Number of object classes.
+        embed_dims (int): The embedding dimensions of the generated queries.
+        num_matching_queries (int): The queries number of the matching part.
+            Used for generating dn_mask.
+        label_noise_scale (float): The scale of label noise, defaults to 0.5.
+        box_noise_scale (float): The scale of box noise, defaults to 1.0.
+        group_cfg (:obj:`ConfigDict` or dict, optional): The config of the
+            denoising queries grouping, includes `dynamic`, `num_dn_queries`,
+            and `num_groups`. Two grouping strategies, 'static dn groups' and
+            'dynamic dn groups', are supported. When `dynamic` is `False`,
+            the `num_groups` should be set, and the number of denoising query
+            groups will always be `num_groups`. When `dynamic` is `True`, the
+            `num_dn_queries` should be set, and the group number will be
+            dynamic to ensure that the denoising queries number will not exceed
+            `num_dn_queries` to prevent large fluctuations of memory. Defaults
+            to `None`.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 embed_dims: int,
+                 num_matching_queries: int,
+                 label_noise_scale: float = 0.5,
+                 box_noise_scale: float = 1.0,
+                 group_cfg: OptConfigType = None) -> None:
+        super().__init__()
+        self.num_classes = num_classes
+        self.embed_dims = embed_dims
+        self.num_matching_queries = num_matching_queries
+        self.label_noise_scale = label_noise_scale
+        self.box_noise_scale = box_noise_scale
+
+        # prepare grouping strategy
+        group_cfg = {} if group_cfg is None else group_cfg
+        self.dynamic_dn_groups = group_cfg.get('dynamic', True)
+        if self.dynamic_dn_groups:
+            if 'num_dn_queries' not in group_cfg:
+                warnings.warn("'num_dn_queries' should be set when using "
+                              'dynamic dn groups, use 100 as default.')
+            self.num_dn_queries = group_cfg.get('num_dn_queries', 100)
+            assert isinstance(self.num_dn_queries, int), \
+                f'Expected the num_dn_queries to have type int, but got ' \
+                f'{self.num_dn_queries}({type(self.num_dn_queries)}). '
+        else:
+            assert 'num_groups' in group_cfg, \
+                'num_groups should be set when using static dn groups'
+            self.num_groups = group_cfg['num_groups']
+            assert isinstance(self.num_groups, int), \
+                f'Expected the num_groups to have type int, but got ' \
+                f'{self.num_groups}({type(self.num_groups)}). '
+
+        # NOTE The original repo of DINO set the num_embeddings 92 for coco,
+        # 91 (0~90) of which represents target classes and the 92 (91)
+        # indicates `Unknown` class. However, the embedding of `unknown` class
+        # is not used in the original DINO.
+        # TODO: num_classes + 1 or num_classes ?
+        self.label_embedding = nn.Embedding(self.num_classes, self.embed_dims)
+
+    def __call__(self, batch_data_samples: SampleList) -> tuple:
+        """Generate contrastive denoising (cdn) queries with ground truth.
+
+        Descriptions of the Number Values in code and comments:
+            - num_target_total: the total target number of the input batch
+              samples.
+            - max_num_target: the max target number of the input batch samples.
+            - num_noisy_targets: the total targets number after adding noise,
+              i.e., num_target_total * num_groups * 2.
+            - num_denoising_queries: the length of the output batched queries,
+              i.e., max_num_target * num_groups * 2.
+
+        NOTE The format of input bboxes in batch_data_samples is unnormalized
+        (x, y, x, y), and the output bbox queries are embedded by normalized
+        (cx, cy, w, h) format bboxes going through inverse_sigmoid.
+
+        Args:
+            batch_data_samples (list[:obj:`DetDataSample`]): List of the batch
+                data samples, each includes `gt_instance` which has attributes
+                `bboxes` and `labels`. The `bboxes` has unnormalized coordinate
+                format (x, y, x, y).
+
+        Returns:
+            tuple: The outputs of the dn query generator.
+
+            - dn_label_query (Tensor): The output content queries for denoising
+              part, has shape (bs, num_denoising_queries, dim), where
+              `num_denoising_queries = max_num_target * num_groups * 2`.
+            - dn_bbox_query (Tensor): The output reference bboxes as positions
+              of queries for denoising part, which are embedded by normalized
+              (cx, cy, w, h) format bboxes going through inverse_sigmoid, has
+              shape (bs, num_denoising_queries, 4) with the last dimension
+              arranged as (cx, cy, w, h).
+            - attn_mask (Tensor): The attention mask to prevent information
+              leakage from different denoising groups and matching parts,
+              will be used as `self_attn_mask` of the `decoder`, has shape
+              (num_queries_total, num_queries_total), where `num_queries_total`
+              is the sum of `num_denoising_queries` and `num_matching_queries`.
+            - dn_meta (Dict[str, int]): The dictionary saves information about
+              group collation, including 'num_denoising_queries' and
+              'num_denoising_groups'. It will be used for split outputs of
+              denoising and matching parts and loss calculation.
+        """
+        # normalize bbox and collate ground truth (gt)
+        gt_labels_list = []
+        gt_bboxes_list = []
+        for sample in batch_data_samples:
+            img_h, img_w = sample.img_shape
+            bboxes = sample.gt_instances.bboxes
+            factor = bboxes.new_tensor([img_w, img_h, img_w,
+                                        img_h]).unsqueeze(0)
+            bboxes_normalized = bboxes / factor
+            gt_bboxes_list.append(bboxes_normalized)
+            gt_labels_list.append(sample.gt_instances.labels)
+        gt_labels = torch.cat(gt_labels_list)  # (num_target_total, 4)
+        gt_bboxes = torch.cat(gt_bboxes_list)
+
+        num_target_list = [len(bboxes) for bboxes in gt_bboxes_list]
+        max_num_target = max(num_target_list)
+        num_groups = self.get_num_groups(max_num_target)
+
+        dn_label_query = self.generate_dn_label_query(gt_labels, num_groups)
+        dn_bbox_query = self.generate_dn_bbox_query(gt_bboxes, num_groups)
+
+        # The `batch_idx` saves the batch index of the corresponding sample
+        # for each target, has shape (num_target_total).
+        batch_idx = torch.cat([
+            torch.full_like(t.long(), i) for i, t in enumerate(gt_labels_list)
+        ])
+        dn_label_query, dn_bbox_query = self.collate_dn_queries(
+            dn_label_query, dn_bbox_query, batch_idx, len(batch_data_samples),
+            num_groups)
+
+        attn_mask = self.generate_dn_mask(
+            max_num_target, num_groups, device=dn_label_query.device)
+
+        dn_meta = dict(
+            num_denoising_queries=int(max_num_target * 2 * num_groups),
+            num_denoising_groups=num_groups)
+
+        return dn_label_query, dn_bbox_query, attn_mask, dn_meta
+
+    def get_num_groups(self, max_num_target: int = None) -> int:
+        """Calculate denoising query groups number.
+
+        Two grouping strategies, 'static dn groups' and 'dynamic dn groups',
+        are supported. When `self.dynamic_dn_groups` is `False`, the number
+        of denoising query groups will always be `self.num_groups`. When
+        `self.dynamic_dn_groups` is `True`, the group number will be dynamic,
+        ensuring the denoising queries number will not exceed
+        `self.num_dn_queries` to prevent large fluctuations of memory.
+
+        NOTE The `num_group` is shared for different samples in a batch. When
+        the target numbers in the samples varies, the denoising queries of the
+        samples containing fewer targets are padded to the max length.
+
+        Args:
+            max_num_target (int, optional): The max target number of the batch
+                samples. It will only be used when `self.dynamic_dn_groups` is
+                `True`. Defaults to `None`.
+
+        Returns:
+            int: The denoising group number of the current batch.
+        """
+        if self.dynamic_dn_groups:
+            assert max_num_target is not None, \
+                'group_queries should be provided when using ' \
+                'dynamic dn groups'
+            if max_num_target == 0:
+                num_groups = 1
+            else:
+                num_groups = self.num_dn_queries // max_num_target
+        else:
+            num_groups = self.num_groups
+        if num_groups < 1:
+            num_groups = 1
+        return int(num_groups)
+
+    def generate_dn_label_query(self, gt_labels: Tensor,
+                                num_groups: int) -> Tensor:
+        """Generate noisy labels and their query embeddings.
+
+        The strategy for generating noisy labels is: Randomly choose labels of
+        `self.label_noise_scale * 0.5` proportion and override each of them
+        with a random object category label.
+
+        NOTE Not add noise to all labels. Besides, the `self.label_noise_scale
+        * 0.5` arg is the ratio of the chosen positions, which is higher than
+        the actual proportion of noisy labels, because the labels to override
+        may be correct. And the gap becomes larger as the number of target
+        categories decreases. The users should notice this and modify the scale
+        arg or the corresponding logic according to specific dataset.
+
+        Args:
+            gt_labels (Tensor): The concatenated gt labels of all samples
+                in the batch, has shape (num_target_total, ) where
+                `num_target_total = sum(num_target_list)`.
+            num_groups (int): The number of denoising query groups.
+
+        Returns:
+            Tensor: The query embeddings of noisy labels, has shape
+            (num_noisy_targets, embed_dims), where `num_noisy_targets =
+            num_target_total * num_groups * 2`.
+        """
+        assert self.label_noise_scale > 0
+        gt_labels_expand = gt_labels.repeat(2 * num_groups,
+                                            1).view(-1)  # Note `* 2`  # noqa
+        p = torch.rand_like(gt_labels_expand.float())
+        chosen_indice = torch.nonzero(p < (self.label_noise_scale * 0.5)).view(
+            -1)  # Note `* 0.5`
+        new_labels = torch.randint_like(chosen_indice, 0, self.num_classes)
+        noisy_labels_expand = gt_labels_expand.scatter(0, chosen_indice,
+                                                       new_labels)
+        dn_label_query = self.label_embedding(noisy_labels_expand)
+        return dn_label_query
+
+    def generate_dn_bbox_query(self, gt_bboxes: Tensor,
+                               num_groups: int) -> Tensor:
+        """Generate noisy bboxes and their query embeddings.
+
+        The strategy for generating noisy bboxes is as follow:
+
+        .. code:: text
+
+            +--------------------+
+            |      negative      |
+            |    +----------+    |
+            |    | positive |    |
+            |    |    +-----|----+------------+
+            |    |    |     |    |            |
+            |    +----+-----+    |            |
+            |         |          |            |
+            +---------+----------+            |
+                      |                       |
+                      |        gt bbox        |
+                      |                       |
+                      |             +---------+----------+
+                      |             |         |          |
+                      |             |    +----+-----+    |
+                      |             |    |    |     |    |
+                      +-------------|--- +----+     |    |
+                                    |    | positive |    |
+                                    |    +----------+    |
+                                    |      negative      |
+                                    +--------------------+
+
+         The random noise is added to the top-left and down-right point
+         positions, hence, normalized (x, y, x, y) format of bboxes are
+         required. The noisy bboxes of positive queries have the points
+         both within the inner square, while those of negative queries
+         have the points both between the inner and outer squares.
+
+        Besides, the length of outer square is twice as long as that of
+        the inner square, i.e., self.box_noise_scale * w_or_h / 2.
+        NOTE The noise is added to all the bboxes. Moreover, there is still
+        unconsidered case when one point is within the positive square and
+        the others is between the inner and outer squares.
+
+        Args:
+            gt_bboxes (Tensor): The concatenated gt bboxes of all samples
+                in the batch, has shape (num_target_total, 4) with the last
+                dimension arranged as (cx, cy, w, h) where
+                `num_target_total = sum(num_target_list)`.
+            num_groups (int): The number of denoising query groups.
+
+        Returns:
+            Tensor: The output noisy bboxes, which are embedded by normalized
+            (cx, cy, w, h) format bboxes going through inverse_sigmoid, has
+            shape (num_noisy_targets, 4) with the last dimension arranged as
+            (cx, cy, w, h), where
+            `num_noisy_targets = num_target_total * num_groups * 2`.
+        """
+        assert self.box_noise_scale > 0
+        device = gt_bboxes.device
+
+        # expand gt_bboxes as groups
+        gt_bboxes_expand = gt_bboxes.repeat(2 * num_groups, 1)  # xyxy
+
+        # obtain index of negative queries in gt_bboxes_expand
+        positive_idx = torch.arange(
+            len(gt_bboxes), dtype=torch.long, device=device)
+        positive_idx = positive_idx.unsqueeze(0).repeat(num_groups, 1)
+        positive_idx += 2 * len(gt_bboxes) * torch.arange(
+            num_groups, dtype=torch.long, device=device)[:, None]
+        positive_idx = positive_idx.flatten()
+        negative_idx = positive_idx + len(gt_bboxes)
+
+        # determine the sign of each element in the random part of the added
+        # noise to be positive or negative randomly.
+        rand_sign = torch.randint_like(
+            gt_bboxes_expand, low=0, high=2,
+            dtype=torch.float32) * 2.0 - 1.0  # [low, high), 1 or -1, randomly
+
+        # calculate the random part of the added noise
+        rand_part = torch.rand_like(gt_bboxes_expand)  # [0, 1)
+        rand_part[negative_idx] += 1.0  # pos: [0, 1); neg: [1, 2)
+        rand_part *= rand_sign  # pos: (-1, 1); neg: (-2, -1] U [1, 2)
+
+        # add noise to the bboxes
+        bboxes_whwh = bbox_xyxy_to_cxcywh(gt_bboxes_expand)[:, 2:].repeat(1, 2)
+        noisy_bboxes_expand = gt_bboxes_expand + torch.mul(
+            rand_part, bboxes_whwh) * self.box_noise_scale / 2  # xyxy
+        noisy_bboxes_expand = noisy_bboxes_expand.clamp(min=0.0, max=1.0)
+        noisy_bboxes_expand = bbox_xyxy_to_cxcywh(noisy_bboxes_expand)
+
+        dn_bbox_query = inverse_sigmoid(noisy_bboxes_expand, eps=1e-3)
+        return dn_bbox_query
+
+    def collate_dn_queries(self, input_label_query: Tensor,
+                           input_bbox_query: Tensor, batch_idx: Tensor,
+                           batch_size: int, num_groups: int) -> Tuple[Tensor]:
+        """Collate generated queries to obtain batched dn queries.
+
+        The strategy for query collation is as follow:
+
+        .. code:: text
+
+                    input_queries (num_target_total, query_dim)
+            P_A1 P_B1 P_B2 N_A1 N_B1 N_B2 P'A1 P'B1 P'B2 N'A1 N'B1 N'B2
+              |________ group1 ________|    |________ group2 ________|
+                                         |
+                                         V
+                      P_A1 Pad0 N_A1 Pad0 P'A1 Pad0 N'A1 Pad0
+                      P_B1 P_B2 N_B1 N_B2 P'B1 P'B2 N'B1 N'B2
+                       |____ group1 ____| |____ group2 ____|
+             batched_queries (batch_size, max_num_target, query_dim)
+
+            where query_dim is 4 for bbox and self.embed_dims for label.
+            Notation: _-group 1; '-group 2;
+                      A-Sample1(has 1 target); B-sample2(has 2 targets)
+
+        Args:
+            input_label_query (Tensor): The generated label queries of all
+                targets, has shape (num_target_total, embed_dims) where
+                `num_target_total = sum(num_target_list)`.
+            input_bbox_query (Tensor): The generated bbox queries of all
+                targets, has shape (num_target_total, 4) with the last
+                dimension arranged as (cx, cy, w, h).
+            batch_idx (Tensor): The batch index of the corresponding sample
+                for each target, has shape (num_target_total).
+            batch_size (int): The size of the input batch.
+            num_groups (int): The number of denoising query groups.
+
+        Returns:
+            tuple[Tensor]: Output batched label and bbox queries.
+            - batched_label_query (Tensor): The output batched label queries,
+              has shape (batch_size, max_num_target, embed_dims).
+            - batched_bbox_query (Tensor): The output batched bbox queries,
+              has shape (batch_size, max_num_target, 4) with the last dimension
+              arranged as (cx, cy, w, h).
+        """
+        device = input_label_query.device
+        num_target_list = [
+            torch.sum(batch_idx == idx) for idx in range(batch_size)
+        ]
+        max_num_target = max(num_target_list)
+        num_denoising_queries = int(max_num_target * 2 * num_groups)
+
+        map_query_index = torch.cat([
+            torch.arange(num_target, device=device)
+            for num_target in num_target_list
+        ])
+        map_query_index = torch.cat([
+            map_query_index + max_num_target * i for i in range(2 * num_groups)
+        ]).long()
+        batch_idx_expand = batch_idx.repeat(2 * num_groups, 1).view(-1)
+        mapper = (batch_idx_expand, map_query_index)
+
+        batched_label_query = torch.zeros(
+            batch_size, num_denoising_queries, self.embed_dims, device=device)
+        batched_bbox_query = torch.zeros(
+            batch_size, num_denoising_queries, 4, device=device)
+
+        batched_label_query[mapper] = input_label_query
+        batched_bbox_query[mapper] = input_bbox_query
+        return batched_label_query, batched_bbox_query
+
+    def generate_dn_mask(self, max_num_target: int, num_groups: int,
+                         device: Union[torch.device, str]) -> Tensor:
+        """Generate attention mask to prevent information leakage from
+        different denoising groups and matching parts.
+
+        .. code:: text
+
+                        0 0 0 0 1 1 1 1 0 0 0 0 0
+                        0 0 0 0 1 1 1 1 0 0 0 0 0
+                        0 0 0 0 1 1 1 1 0 0 0 0 0
+                        0 0 0 0 1 1 1 1 0 0 0 0 0
+                        1 1 1 1 0 0 0 0 0 0 0 0 0
+                        1 1 1 1 0 0 0 0 0 0 0 0 0
+                        1 1 1 1 0 0 0 0 0 0 0 0 0
+                        1 1 1 1 0 0 0 0 0 0 0 0 0
+                        1 1 1 1 1 1 1 1 0 0 0 0 0
+                        1 1 1 1 1 1 1 1 0 0 0 0 0
+                        1 1 1 1 1 1 1 1 0 0 0 0 0
+                        1 1 1 1 1 1 1 1 0 0 0 0 0
+                        1 1 1 1 1 1 1 1 0 0 0 0 0
+         max_num_target |_|           |_________| num_matching_queries
+                        |_____________| num_denoising_queries
+
+               1 -> True  (Masked), means 'can not see'.
+               0 -> False (UnMasked), means 'can see'.
+
+        Args:
+            max_num_target (int): The max target number of the input batch
+                samples.
+            num_groups (int): The number of denoising query groups.
+            device (obj:`device` or str): The device of generated mask.
+
+        Returns:
+            Tensor: The attention mask to prevent information leakage from
+            different denoising groups and matching parts, will be used as
+            `self_attn_mask` of the `decoder`, has shape (num_queries_total,
+            num_queries_total), where `num_queries_total` is the sum of
+            `num_denoising_queries` and `num_matching_queries`.
+        """
+        num_denoising_queries = int(max_num_target * 2 * num_groups)
+        num_queries_total = num_denoising_queries + self.num_matching_queries
+        attn_mask = torch.zeros(
+            num_queries_total,
+            num_queries_total,
+            device=device,
+            dtype=torch.bool)
+        # Make the matching part cannot see the denoising groups
+        attn_mask[num_denoising_queries:, :num_denoising_queries] = True
+        # Make the denoising groups cannot see each other
+        for i in range(num_groups):
+            # Mask rows of one group per step.
+            row_scope = slice(max_num_target * 2 * i,
+                              max_num_target * 2 * (i + 1))
+            left_scope = slice(max_num_target * 2 * i)
+            right_scope = slice(max_num_target * 2 * (i + 1),
+                                num_denoising_queries)
+            attn_mask[row_scope, right_scope] = True
+            attn_mask[row_scope, left_scope] = True
+        return attn_mask
diff --git a/mmde/mmdet/models/layers/transformer/grounding_dino_layers.py b/mmde/mmdet/models/layers/transformer/grounding_dino_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c285768f36af98075607b43e48e6f1018125ad1
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/grounding_dino_layers.py
@@ -0,0 +1,270 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
+from mmcv.ops import MultiScaleDeformableAttention
+from mmengine.model import ModuleList
+from torch import Tensor
+
+from mmdet.models.utils.vlfuse_helper import SingleScaleBiAttentionBlock
+from mmdet.utils import ConfigType, OptConfigType
+from .deformable_detr_layers import (DeformableDetrTransformerDecoderLayer,
+                                     DeformableDetrTransformerEncoder,
+                                     DeformableDetrTransformerEncoderLayer)
+from .detr_layers import DetrTransformerEncoderLayer
+from .dino_layers import DinoTransformerDecoder
+from .utils import MLP, get_text_sine_pos_embed
+
+try:
+    from fairscale.nn.checkpoint import checkpoint_wrapper
+except Exception:
+    checkpoint_wrapper = None
+
+
+class GroundingDinoTransformerDecoderLayer(
+        DeformableDetrTransformerDecoderLayer):
+
+    def __init__(self,
+                 cross_attn_text_cfg: OptConfigType = dict(
+                     embed_dims=256,
+                     num_heads=8,
+                     dropout=0.0,
+                     batch_first=True),
+                 **kwargs) -> None:
+        """Decoder layer of Deformable DETR."""
+        self.cross_attn_text_cfg = cross_attn_text_cfg
+        if 'batch_first' not in self.cross_attn_text_cfg:
+            self.cross_attn_text_cfg['batch_first'] = True
+        super().__init__(**kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize self_attn, cross-attn, ffn, and norms."""
+        self.self_attn = MultiheadAttention(**self.self_attn_cfg)
+        self.cross_attn_text = MultiheadAttention(**self.cross_attn_text_cfg)
+        self.cross_attn = MultiScaleDeformableAttention(**self.cross_attn_cfg)
+        self.embed_dims = self.self_attn.embed_dims
+        self.ffn = FFN(**self.ffn_cfg)
+        norms_list = [
+            build_norm_layer(self.norm_cfg, self.embed_dims)[1]
+            for _ in range(4)
+        ]
+        self.norms = ModuleList(norms_list)
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor = None,
+                value: Tensor = None,
+                query_pos: Tensor = None,
+                key_pos: Tensor = None,
+                self_attn_mask: Tensor = None,
+                cross_attn_mask: Tensor = None,
+                key_padding_mask: Tensor = None,
+                memory_text: Tensor = None,
+                text_attention_mask: Tensor = None,
+                **kwargs) -> Tensor:
+        """Implements decoder layer in Grounding DINO transformer.
+
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            key (Tensor, optional): The input key, has shape (bs, num_keys,
+                dim). If `None`, the `query` will be used. Defaults to `None`.
+            value (Tensor, optional): The input value, has the same shape as
+                `key`, as in `nn.MultiheadAttention.forward`. If `None`, the
+                `key` will be used. Defaults to `None`.
+            query_pos (Tensor, optional): The positional encoding for `query`,
+                has the same shape as `query`. If not `None`, it will be added
+                to `query` before forward function. Defaults to `None`.
+            key_pos (Tensor, optional): The positional encoding for `key`, has
+                the same shape as `key`. If not `None`, it will be added to
+                `key` before forward function. If None, and `query_pos` has the
+                same shape as `key`, then `query_pos` will be used for
+                `key_pos`. Defaults to None.
+            self_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            cross_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor, optional): The `key_padding_mask` of
+                `self_attn` input. ByteTensor, has shape (bs, num_value).
+                Defaults to None.
+            memory_text (Tensor): Memory text. It has shape (bs, len_text,
+                text_embed_dims).
+            text_attention_mask (Tensor): Text token mask. It has shape (bs,
+                len_text).
+
+        Returns:
+            Tensor: forwarded results, has shape (bs, num_queries, dim).
+        """
+        # self attention
+        query = self.self_attn(
+            query=query,
+            key=query,
+            value=query,
+            query_pos=query_pos,
+            key_pos=query_pos,
+            attn_mask=self_attn_mask,
+            **kwargs)
+        query = self.norms[0](query)
+        # cross attention between query and text
+        query = self.cross_attn_text(
+            query=query,
+            query_pos=query_pos,
+            key=memory_text,
+            value=memory_text,
+            key_padding_mask=text_attention_mask)
+        query = self.norms[1](query)
+        # cross attention between query and image
+        query = self.cross_attn(
+            query=query,
+            key=key,
+            value=value,
+            query_pos=query_pos,
+            key_pos=key_pos,
+            attn_mask=cross_attn_mask,
+            key_padding_mask=key_padding_mask,
+            **kwargs)
+        query = self.norms[2](query)
+        query = self.ffn(query)
+        query = self.norms[3](query)
+
+        return query
+
+
+class GroundingDinoTransformerEncoder(DeformableDetrTransformerEncoder):
+
+    def __init__(self, text_layer_cfg: ConfigType,
+                 fusion_layer_cfg: ConfigType, **kwargs) -> None:
+        self.text_layer_cfg = text_layer_cfg
+        self.fusion_layer_cfg = fusion_layer_cfg
+        super().__init__(**kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize encoder layers."""
+        self.layers = ModuleList([
+            DeformableDetrTransformerEncoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        self.text_layers = ModuleList([
+            DetrTransformerEncoderLayer(**self.text_layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        self.fusion_layers = ModuleList([
+            SingleScaleBiAttentionBlock(**self.fusion_layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        self.embed_dims = self.layers[0].embed_dims
+        if self.num_cp > 0:
+            if checkpoint_wrapper is None:
+                raise NotImplementedError(
+                    'If you want to reduce GPU memory usage, \
+                    please install fairscale by executing the \
+                    following command: pip install fairscale.')
+            for i in range(self.num_cp):
+                self.layers[i] = checkpoint_wrapper(self.layers[i])
+                self.fusion_layers[i] = checkpoint_wrapper(
+                    self.fusion_layers[i])
+
+    def forward(self,
+                query: Tensor,
+                query_pos: Tensor,
+                key_padding_mask: Tensor,
+                spatial_shapes: Tensor,
+                level_start_index: Tensor,
+                valid_ratios: Tensor,
+                memory_text: Tensor = None,
+                text_attention_mask: Tensor = None,
+                pos_text: Tensor = None,
+                text_self_attention_masks: Tensor = None,
+                position_ids: Tensor = None):
+        """Forward function of Transformer encoder.
+
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            query_pos (Tensor): The positional encoding for query, has shape
+                (bs, num_queries, dim).
+            key_padding_mask (Tensor): The `key_padding_mask` of `self_attn`
+                input. ByteTensor, has shape (bs, num_queries).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+            memory_text (Tensor, optional): Memory text. It has shape (bs,
+                len_text, text_embed_dims).
+            text_attention_mask (Tensor, optional): Text token mask. It has
+                shape (bs,len_text).
+            pos_text (Tensor, optional): The positional encoding for text.
+                Defaults to None.
+            text_self_attention_masks (Tensor, optional): Text self attention
+                mask. Defaults to None.
+            position_ids (Tensor, optional): Text position ids.
+                Defaults to None.
+        """
+        output = query
+        reference_points = self.get_encoder_reference_points(
+            spatial_shapes, valid_ratios, device=query.device)
+        if self.text_layers:
+            # generate pos_text
+            bs, n_text, _ = memory_text.shape
+            if pos_text is None and position_ids is None:
+                pos_text = (
+                    torch.arange(n_text,
+                                 device=memory_text.device).float().unsqueeze(
+                                     0).unsqueeze(-1).repeat(bs, 1, 1))
+                pos_text = get_text_sine_pos_embed(
+                    pos_text, num_pos_feats=256, exchange_xy=False)
+            if position_ids is not None:
+                pos_text = get_text_sine_pos_embed(
+                    position_ids[..., None],
+                    num_pos_feats=256,
+                    exchange_xy=False)
+
+        # main process
+        for layer_id, layer in enumerate(self.layers):
+            if self.fusion_layers:
+                output, memory_text = self.fusion_layers[layer_id](
+                    visual_feature=output,
+                    lang_feature=memory_text,
+                    attention_mask_v=key_padding_mask,
+                    attention_mask_l=text_attention_mask,
+                )
+            if self.text_layers:
+                text_num_heads = self.text_layers[
+                    layer_id].self_attn_cfg.num_heads
+                memory_text = self.text_layers[layer_id](
+                    query=memory_text,
+                    query_pos=(pos_text if pos_text is not None else None),
+                    attn_mask=~text_self_attention_masks.repeat(
+                        text_num_heads, 1, 1),  # note we use ~ for mask here
+                    key_padding_mask=None,
+                )
+            output = layer(
+                query=output,
+                query_pos=query_pos,
+                reference_points=reference_points,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                key_padding_mask=key_padding_mask)
+        return output, memory_text
+
+
+class GroundingDinoTransformerDecoder(DinoTransformerDecoder):
+
+    def _init_layers(self) -> None:
+        """Initialize decoder layers."""
+        self.layers = ModuleList([
+            GroundingDinoTransformerDecoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        self.embed_dims = self.layers[0].embed_dims
+        if self.post_norm_cfg is not None:
+            raise ValueError('There is not post_norm in '
+                             f'{self._get_name()}')
+        self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims,
+                                  self.embed_dims, 2)
+        self.norm = nn.LayerNorm(self.embed_dims)
diff --git a/mmde/mmdet/models/layers/transformer/mask2former_layers.py b/mmde/mmdet/models/layers/transformer/mask2former_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcc604e277d91151334ed520d78e6a5a8f388036
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/mask2former_layers.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn import build_norm_layer
+from mmengine.model import ModuleList
+from torch import Tensor
+
+from .deformable_detr_layers import DeformableDetrTransformerEncoder
+from .detr_layers import DetrTransformerDecoder, DetrTransformerDecoderLayer
+
+
+class Mask2FormerTransformerEncoder(DeformableDetrTransformerEncoder):
+    """Encoder in PixelDecoder of Mask2Former."""
+
+    def forward(self, query: Tensor, query_pos: Tensor,
+                key_padding_mask: Tensor, spatial_shapes: Tensor,
+                level_start_index: Tensor, valid_ratios: Tensor,
+                reference_points: Tensor, **kwargs) -> Tensor:
+        """Forward function of Transformer encoder.
+
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            query_pos (Tensor): The positional encoding for query, has shape
+                (bs, num_queries, dim). If not None, it will be added to the
+                `query` before forward function. Defaults to None.
+            key_padding_mask (Tensor): The `key_padding_mask` of `self_attn`
+                input. ByteTensor, has shape (bs, num_queries).
+            spatial_shapes (Tensor): Spatial shapes of features in all levels,
+                has shape (num_levels, 2), last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels, ) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            valid_ratios (Tensor): The ratios of the valid width and the valid
+                height relative to the width and the height of features in all
+                levels, has shape (bs, num_levels, 2).
+            reference_points (Tensor): The initial reference, has shape
+                (bs, num_queries, 2) with the last dimension arranged
+                as (cx, cy).
+
+        Returns:
+            Tensor: Output queries of Transformer encoder, which is also
+            called 'encoder output embeddings' or 'memory', has shape
+            (bs, num_queries, dim)
+        """
+        for layer in self.layers:
+            query = layer(
+                query=query,
+                query_pos=query_pos,
+                key_padding_mask=key_padding_mask,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                reference_points=reference_points,
+                **kwargs)
+        return query
+
+
+class Mask2FormerTransformerDecoder(DetrTransformerDecoder):
+    """Decoder of Mask2Former."""
+
+    def _init_layers(self) -> None:
+        """Initialize decoder layers."""
+        self.layers = ModuleList([
+            Mask2FormerTransformerDecoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        self.embed_dims = self.layers[0].embed_dims
+        self.post_norm = build_norm_layer(self.post_norm_cfg,
+                                          self.embed_dims)[1]
+
+
+class Mask2FormerTransformerDecoderLayer(DetrTransformerDecoderLayer):
+    """Implements decoder layer in Mask2Former transformer."""
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor = None,
+                value: Tensor = None,
+                query_pos: Tensor = None,
+                key_pos: Tensor = None,
+                self_attn_mask: Tensor = None,
+                cross_attn_mask: Tensor = None,
+                key_padding_mask: Tensor = None,
+                **kwargs) -> Tensor:
+        """
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            key (Tensor, optional): The input key, has shape (bs, num_keys,
+                dim). If `None`, the `query` will be used. Defaults to `None`.
+            value (Tensor, optional): The input value, has the same shape as
+                `key`, as in `nn.MultiheadAttention.forward`. If `None`, the
+                `key` will be used. Defaults to `None`.
+            query_pos (Tensor, optional): The positional encoding for `query`,
+                has the same shape as `query`. If not `None`, it will be added
+                to `query` before forward function. Defaults to `None`.
+            key_pos (Tensor, optional): The positional encoding for `key`, has
+                the same shape as `key`. If not `None`, it will be added to
+                `key` before forward function. If None, and `query_pos` has the
+                same shape as `key`, then `query_pos` will be used for
+                `key_pos`. Defaults to None.
+            self_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            cross_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor, optional): The `key_padding_mask` of
+                `self_attn` input. ByteTensor, has shape (bs, num_value).
+                Defaults to None.
+
+        Returns:
+            Tensor: forwarded results, has shape (bs, num_queries, dim).
+        """
+
+        query = self.cross_attn(
+            query=query,
+            key=key,
+            value=value,
+            query_pos=query_pos,
+            key_pos=key_pos,
+            attn_mask=cross_attn_mask,
+            key_padding_mask=key_padding_mask,
+            **kwargs)
+        query = self.norms[0](query)
+        query = self.self_attn(
+            query=query,
+            key=query,
+            value=query,
+            query_pos=query_pos,
+            key_pos=query_pos,
+            attn_mask=self_attn_mask,
+            **kwargs)
+        query = self.norms[1](query)
+        query = self.ffn(query)
+        query = self.norms[2](query)
+
+        return query
diff --git a/mmde/mmdet/models/layers/transformer/utils.py b/mmde/mmdet/models/layers/transformer/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e43a172ca7175b23c82f60894faf38ec6c437e3
--- /dev/null
+++ b/mmde/mmdet/models/layers/transformer/utils.py
@@ -0,0 +1,915 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import warnings
+from typing import Optional, Sequence, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer,
+                      build_norm_layer)
+from mmcv.cnn.bricks.drop import Dropout
+from mmengine.model import BaseModule, ModuleList
+from mmengine.utils import to_2tuple
+from torch import Tensor, nn
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType, OptMultiConfig
+
+
+def nlc_to_nchw(x: Tensor, hw_shape: Sequence[int]) -> Tensor:
+    """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor of shape [N, L, C] before conversion.
+        hw_shape (Sequence[int]): The height and width of output feature map.
+
+    Returns:
+        Tensor: The output tensor of shape [N, C, H, W] after conversion.
+    """
+    H, W = hw_shape
+    assert len(x.shape) == 3
+    B, L, C = x.shape
+    assert L == H * W, 'The seq_len does not match H, W'
+    return x.transpose(1, 2).reshape(B, C, H, W).contiguous()
+
+
+def nchw_to_nlc(x):
+    """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
+
+    Returns:
+        Tensor: The output tensor of shape [N, L, C] after conversion.
+    """
+    assert len(x.shape) == 4
+    return x.flatten(2).transpose(1, 2).contiguous()
+
+
+def coordinate_to_encoding(coord_tensor: Tensor,
+                           num_feats: int = 128,
+                           temperature: int = 10000,
+                           scale: float = 2 * math.pi):
+    """Convert coordinate tensor to positional encoding.
+
+    Args:
+        coord_tensor (Tensor): Coordinate tensor to be converted to
+            positional encoding. With the last dimension as 2 or 4.
+        num_feats (int, optional): The feature dimension for each position
+            along x-axis or y-axis. Note the final returned dimension
+            for each position is 2 times of this value. Defaults to 128.
+        temperature (int, optional): The temperature used for scaling
+            the position embedding. Defaults to 10000.
+        scale (float, optional): A scale factor that scales the position
+            embedding. The scale will be used only when `normalize` is True.
+            Defaults to 2*pi.
+    Returns:
+        Tensor: Returned encoded positional tensor.
+    """
+    dim_t = torch.arange(
+        num_feats, dtype=torch.float32, device=coord_tensor.device)
+    dim_t = temperature**(2 * (dim_t // 2) / num_feats)
+    x_embed = coord_tensor[..., 0] * scale
+    y_embed = coord_tensor[..., 1] * scale
+    pos_x = x_embed[..., None] / dim_t
+    pos_y = y_embed[..., None] / dim_t
+    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()),
+                        dim=-1).flatten(2)
+    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()),
+                        dim=-1).flatten(2)
+    if coord_tensor.size(-1) == 2:
+        pos = torch.cat((pos_y, pos_x), dim=-1)
+    elif coord_tensor.size(-1) == 4:
+        w_embed = coord_tensor[..., 2] * scale
+        pos_w = w_embed[..., None] / dim_t
+        pos_w = torch.stack((pos_w[..., 0::2].sin(), pos_w[..., 1::2].cos()),
+                            dim=-1).flatten(2)
+
+        h_embed = coord_tensor[..., 3] * scale
+        pos_h = h_embed[..., None] / dim_t
+        pos_h = torch.stack((pos_h[..., 0::2].sin(), pos_h[..., 1::2].cos()),
+                            dim=-1).flatten(2)
+
+        pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=-1)
+    else:
+        raise ValueError('Unknown pos_tensor shape(-1):{}'.format(
+            coord_tensor.size(-1)))
+    return pos
+
+
+def inverse_sigmoid(x: Tensor, eps: float = 1e-5) -> Tensor:
+    """Inverse function of sigmoid.
+
+    Args:
+        x (Tensor): The tensor to do the inverse.
+        eps (float): EPS avoid numerical overflow. Defaults 1e-5.
+    Returns:
+        Tensor: The x has passed the inverse function of sigmoid, has the same
+        shape with input.
+    """
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)
+    x2 = (1 - x).clamp(min=eps)
+    return torch.log(x1 / x2)
+
+
+class AdaptivePadding(nn.Module):
+    """Applies padding to input (if needed) so that input can get fully covered
+    by filter you specified. It support two modes "same" and "corner". The
+    "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around
+    input. The "corner"  mode would pad zero to bottom right.
+
+    Args:
+        kernel_size (int | tuple): Size of the kernel:
+        stride (int | tuple): Stride of the filter. Default: 1:
+        dilation (int | tuple): Spacing between kernel elements.
+            Default: 1
+        padding (str): Support "same" and "corner", "corner" mode
+            would pad zero to bottom right, and "same" mode would
+            pad zero around input. Default: "corner".
+    Example:
+        >>> kernel_size = 16
+        >>> stride = 16
+        >>> dilation = 1
+        >>> input = torch.rand(1, 1, 15, 17)
+        >>> adap_pad = AdaptivePadding(
+        >>>     kernel_size=kernel_size,
+        >>>     stride=stride,
+        >>>     dilation=dilation,
+        >>>     padding="corner")
+        >>> out = adap_pad(input)
+        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+        >>> input = torch.rand(1, 1, 16, 17)
+        >>> out = adap_pad(input)
+        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+    """
+
+    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
+
+        super(AdaptivePadding, self).__init__()
+
+        assert padding in ('same', 'corner')
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        padding = to_2tuple(padding)
+        dilation = to_2tuple(dilation)
+
+        self.padding = padding
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+
+    def get_pad_shape(self, input_shape):
+        input_h, input_w = input_shape
+        kernel_h, kernel_w = self.kernel_size
+        stride_h, stride_w = self.stride
+        output_h = math.ceil(input_h / stride_h)
+        output_w = math.ceil(input_w / stride_w)
+        pad_h = max((output_h - 1) * stride_h +
+                    (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
+        pad_w = max((output_w - 1) * stride_w +
+                    (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
+        return pad_h, pad_w
+
+    def forward(self, x):
+        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
+        if pad_h > 0 or pad_w > 0:
+            if self.padding == 'corner':
+                x = F.pad(x, [0, pad_w, 0, pad_h])
+            elif self.padding == 'same':
+                x = F.pad(x, [
+                    pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
+                    pad_h - pad_h // 2
+                ])
+        return x
+
+
+class PatchEmbed(BaseModule):
+    """Image to Patch Embedding.
+
+    We use a conv layer to implement PatchEmbed.
+
+    Args:
+        in_channels (int): The num of input channels. Default: 3
+        embed_dims (int): The dimensions of embedding. Default: 768
+        conv_type (str): The config dict for embedding
+            conv layer type selection. Default: "Conv2d.
+        kernel_size (int): The kernel_size of embedding conv. Default: 16.
+        stride (int): The slide stride of embedding conv.
+            Default: None (Would be set as `kernel_size`).
+        padding (int | tuple | string ): The padding length of
+            embedding conv. When it is a string, it means the mode
+            of adaptive padding, support "same" and "corner" now.
+            Default: "corner".
+        dilation (int): The dilation rate of embedding conv. Default: 1.
+        bias (bool): Bias of embed conv. Default: True.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: None.
+        input_size (int | tuple | None): The size of input, which will be
+            used to calculate the out size. Only work when `dynamic_size`
+            is False. Default: None.
+        init_cfg (`mmengine.ConfigDict`, optional): The Config for
+            initialization. Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int = 3,
+                 embed_dims: int = 768,
+                 conv_type: str = 'Conv2d',
+                 kernel_size: int = 16,
+                 stride: int = 16,
+                 padding: Union[int, tuple, str] = 'corner',
+                 dilation: int = 1,
+                 bias: bool = True,
+                 norm_cfg: OptConfigType = None,
+                 input_size: Union[int, tuple] = None,
+                 init_cfg: OptConfigType = None) -> None:
+        super(PatchEmbed, self).__init__(init_cfg=init_cfg)
+
+        self.embed_dims = embed_dims
+        if stride is None:
+            stride = kernel_size
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        dilation = to_2tuple(dilation)
+
+        if isinstance(padding, str):
+            self.adap_padding = AdaptivePadding(
+                kernel_size=kernel_size,
+                stride=stride,
+                dilation=dilation,
+                padding=padding)
+            # disable the padding of conv
+            padding = 0
+        else:
+            self.adap_padding = None
+        padding = to_2tuple(padding)
+
+        self.projection = build_conv_layer(
+            dict(type=conv_type),
+            in_channels=in_channels,
+            out_channels=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            bias=bias)
+
+        if norm_cfg is not None:
+            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+        else:
+            self.norm = None
+
+        if input_size:
+            input_size = to_2tuple(input_size)
+            # `init_out_size` would be used outside to
+            # calculate the num_patches
+            # when `use_abs_pos_embed` outside
+            self.init_input_size = input_size
+            if self.adap_padding:
+                pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
+                input_h, input_w = input_size
+                input_h = input_h + pad_h
+                input_w = input_w + pad_w
+                input_size = (input_h, input_w)
+
+            # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+            h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
+                     (kernel_size[0] - 1) - 1) // stride[0] + 1
+            w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
+                     (kernel_size[1] - 1) - 1) // stride[1] + 1
+            self.init_out_size = (h_out, w_out)
+        else:
+            self.init_input_size = None
+            self.init_out_size = None
+
+    def forward(self, x: Tensor) -> Tuple[Tensor, Tuple[int]]:
+        """
+        Args:
+            x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
+
+        Returns:
+            tuple: Contains merged results and its spatial shape.
+
+                - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
+                - out_size (tuple[int]): Spatial shape of x, arrange as
+                    (out_h, out_w).
+        """
+
+        if self.adap_padding:
+            x = self.adap_padding(x)
+
+        x = self.projection(x)
+        out_size = (x.shape[2], x.shape[3])
+        x = x.flatten(2).transpose(1, 2)
+        if self.norm is not None:
+            x = self.norm(x)
+        return x, out_size
+
+
+class PatchMerging(BaseModule):
+    """Merge patch feature map.
+
+    This layer groups feature map by kernel_size, and applies norm and linear
+    layers to the grouped feature map. Our implementation uses `nn.Unfold` to
+    merge patch, which is about 25% faster than original implementation.
+    Instead, we need to modify pretrained models for compatibility.
+
+    Args:
+        in_channels (int): The num of input channels.
+            to gets fully covered by filter and stride you specified..
+            Default: True.
+        out_channels (int): The num of output channels.
+        kernel_size (int | tuple, optional): the kernel size in the unfold
+            layer. Defaults to 2.
+        stride (int | tuple, optional): the stride of the sliding blocks in the
+            unfold layer. Default: None. (Would be set as `kernel_size`)
+        padding (int | tuple | string ): The padding length of
+            embedding conv. When it is a string, it means the mode
+            of adaptive padding, support "same" and "corner" now.
+            Default: "corner".
+        dilation (int | tuple, optional): dilation parameter in the unfold
+            layer. Default: 1.
+        bias (bool, optional): Whether to add bias in linear layer or not.
+            Defaults: False.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: dict(type='LN').
+        init_cfg (dict, optional): The extra config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Optional[Union[int, tuple]] = 2,
+                 stride: Optional[Union[int, tuple]] = None,
+                 padding: Union[int, tuple, str] = 'corner',
+                 dilation: Optional[Union[int, tuple]] = 1,
+                 bias: Optional[bool] = False,
+                 norm_cfg: OptConfigType = dict(type='LN'),
+                 init_cfg: OptConfigType = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if stride:
+            stride = stride
+        else:
+            stride = kernel_size
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        dilation = to_2tuple(dilation)
+
+        if isinstance(padding, str):
+            self.adap_padding = AdaptivePadding(
+                kernel_size=kernel_size,
+                stride=stride,
+                dilation=dilation,
+                padding=padding)
+            # disable the padding of unfold
+            padding = 0
+        else:
+            self.adap_padding = None
+
+        padding = to_2tuple(padding)
+        self.sampler = nn.Unfold(
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding=padding,
+            stride=stride)
+
+        sample_dim = kernel_size[0] * kernel_size[1] * in_channels
+
+        if norm_cfg is not None:
+            self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+        else:
+            self.norm = None
+
+        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
+    def forward(self, x: Tensor,
+                input_size: Tuple[int]) -> Tuple[Tensor, Tuple[int]]:
+        """
+        Args:
+            x (Tensor): Has shape (B, H*W, C_in).
+            input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
+                Default: None.
+
+        Returns:
+            tuple: Contains merged results and its spatial shape.
+
+                - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
+                - out_size (tuple[int]): Spatial shape of x, arrange as
+                    (Merged_H, Merged_W).
+        """
+        B, L, C = x.shape
+        assert isinstance(input_size, Sequence), f'Expect ' \
+                                                 f'input_size is ' \
+                                                 f'`Sequence` ' \
+                                                 f'but get {input_size}'
+
+        H, W = input_size
+        assert L == H * W, 'input feature has wrong size'
+
+        x = x.view(B, H, W, C).permute([0, 3, 1, 2])  # B, C, H, W
+        # Use nn.Unfold to merge patch. About 25% faster than original method,
+        # but need to modify pretrained model for compatibility
+
+        if self.adap_padding:
+            x = self.adap_padding(x)
+            H, W = x.shape[-2:]
+
+        x = self.sampler(x)
+        # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
+
+        out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
+                 (self.sampler.kernel_size[0] - 1) -
+                 1) // self.sampler.stride[0] + 1
+        out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
+                 (self.sampler.kernel_size[1] - 1) -
+                 1) // self.sampler.stride[1] + 1
+
+        output_size = (out_h, out_w)
+        x = x.transpose(1, 2)  # B, H/2*W/2, 4*C
+        x = self.norm(x) if self.norm else x
+        x = self.reduction(x)
+        return x, output_size
+
+
+class ConditionalAttention(BaseModule):
+    """A wrapper of conditional attention, dropout and residual connection.
+
+    Args:
+        embed_dims (int): The embedding dimension.
+        num_heads (int): Parallel attention heads.
+        attn_drop (float): A Dropout layer on attn_output_weights.
+            Default: 0.0.
+        proj_drop: A Dropout layer after `nn.MultiheadAttention`.
+            Default: 0.0.
+        cross_attn (bool): Whether the attention module is for cross attention.
+            Default: False
+        keep_query_pos (bool): Whether to transform query_pos before cross
+            attention.
+            Default: False.
+        batch_first (bool): When it is True, Key, Query and Value are shape of
+            (batch, n, embed_dim), otherwise (n, batch, embed_dim).
+             Default: True.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims: int,
+                 num_heads: int,
+                 attn_drop: float = 0.,
+                 proj_drop: float = 0.,
+                 cross_attn: bool = False,
+                 keep_query_pos: bool = False,
+                 batch_first: bool = True,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(init_cfg=init_cfg)
+
+        assert batch_first is True, 'Set `batch_first`\
+        to False is NOT supported in ConditionalAttention. \
+        First dimension of all DETRs in mmdet is `batch`, \
+        please set `batch_first` to True.'
+
+        self.cross_attn = cross_attn
+        self.keep_query_pos = keep_query_pos
+        self.embed_dims = embed_dims
+        self.num_heads = num_heads
+        self.attn_drop = Dropout(attn_drop)
+        self.proj_drop = Dropout(proj_drop)
+
+        self._init_layers()
+
+    def _init_layers(self):
+        """Initialize layers for qkv projection."""
+        embed_dims = self.embed_dims
+        self.qcontent_proj = Linear(embed_dims, embed_dims)
+        self.qpos_proj = Linear(embed_dims, embed_dims)
+        self.kcontent_proj = Linear(embed_dims, embed_dims)
+        self.kpos_proj = Linear(embed_dims, embed_dims)
+        self.v_proj = Linear(embed_dims, embed_dims)
+        if self.cross_attn:
+            self.qpos_sine_proj = Linear(embed_dims, embed_dims)
+        self.out_proj = Linear(embed_dims, embed_dims)
+
+        nn.init.constant_(self.out_proj.bias, 0.)
+
+    def forward_attn(self,
+                     query: Tensor,
+                     key: Tensor,
+                     value: Tensor,
+                     attn_mask: Tensor = None,
+                     key_padding_mask: Tensor = None) -> Tuple[Tensor]:
+        """Forward process for `ConditionalAttention`.
+
+        Args:
+            query (Tensor): The input query with shape [bs, num_queries,
+                embed_dims].
+            key (Tensor): The key tensor with shape [bs, num_keys,
+                embed_dims].
+                If None, the `query` will be used. Defaults to None.
+            value (Tensor): The value tensor with same shape as `key`.
+                Same in `nn.MultiheadAttention.forward`. Defaults to None.
+                If None, the `key` will be used.
+            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
+                num_keys]. Same in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+                Defaults to None.
+        Returns:
+            Tuple[Tensor]: Attention outputs of shape :math:`(N, L, E)`,
+            where :math:`N` is the batch size, :math:`L` is the target
+            sequence length , and :math:`E` is the embedding dimension
+            `embed_dim`. Attention weights per head of shape :math:`
+            (num_heads, L, S)`. where :math:`N` is batch size, :math:`L`
+            is target sequence length, and :math:`S` is the source sequence
+            length.
+        """
+        assert key.size(1) == value.size(1), \
+            f'{"key, value must have the same sequence length"}'
+        assert query.size(0) == key.size(0) == value.size(0), \
+            f'{"batch size must be equal for query, key, value"}'
+        assert query.size(2) == key.size(2), \
+            f'{"q_dims, k_dims must be equal"}'
+        assert value.size(2) == self.embed_dims, \
+            f'{"v_dims must be equal to embed_dims"}'
+
+        bs, tgt_len, hidden_dims = query.size()
+        _, src_len, _ = key.size()
+        head_dims = hidden_dims // self.num_heads
+        v_head_dims = self.embed_dims // self.num_heads
+        assert head_dims * self.num_heads == hidden_dims, \
+            f'{"hidden_dims must be divisible by num_heads"}'
+        scaling = float(head_dims)**-0.5
+
+        q = query * scaling
+        k = key
+        v = value
+
+        if attn_mask is not None:
+            assert attn_mask.dtype == torch.float32 or \
+                   attn_mask.dtype == torch.float64 or \
+                   attn_mask.dtype == torch.float16 or \
+                   attn_mask.dtype == torch.uint8 or \
+                   attn_mask.dtype == torch.bool, \
+                   'Only float, byte, and bool types are supported for \
+                    attn_mask'
+
+            if attn_mask.dtype == torch.uint8:
+                warnings.warn('Byte tensor for attn_mask is deprecated.\
+                     Use bool tensor instead.')
+                attn_mask = attn_mask.to(torch.bool)
+            if attn_mask.dim() == 2:
+                attn_mask = attn_mask.unsqueeze(0)
+                if list(attn_mask.size()) != [1, query.size(1), key.size(1)]:
+                    raise RuntimeError(
+                        'The size of the 2D attn_mask is not correct.')
+            elif attn_mask.dim() == 3:
+                if list(attn_mask.size()) != [
+                        bs * self.num_heads,
+                        query.size(1),
+                        key.size(1)
+                ]:
+                    raise RuntimeError(
+                        'The size of the 3D attn_mask is not correct.')
+            else:
+                raise RuntimeError(
+                    "attn_mask's dimension {} is not supported".format(
+                        attn_mask.dim()))
+        # attn_mask's dim is 3 now.
+
+        if key_padding_mask is not None and key_padding_mask.dtype == int:
+            key_padding_mask = key_padding_mask.to(torch.bool)
+
+        q = q.contiguous().view(bs, tgt_len, self.num_heads,
+                                head_dims).permute(0, 2, 1, 3).flatten(0, 1)
+        if k is not None:
+            k = k.contiguous().view(bs, src_len, self.num_heads,
+                                    head_dims).permute(0, 2, 1,
+                                                       3).flatten(0, 1)
+        if v is not None:
+            v = v.contiguous().view(bs, src_len, self.num_heads,
+                                    v_head_dims).permute(0, 2, 1,
+                                                         3).flatten(0, 1)
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bs
+            assert key_padding_mask.size(1) == src_len
+
+        attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+        assert list(attn_output_weights.size()) == [
+            bs * self.num_heads, tgt_len, src_len
+        ]
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                attn_output_weights.masked_fill_(attn_mask, float('-inf'))
+            else:
+                attn_output_weights += attn_mask
+
+        if key_padding_mask is not None:
+            attn_output_weights = attn_output_weights.view(
+                bs, self.num_heads, tgt_len, src_len)
+            attn_output_weights = attn_output_weights.masked_fill(
+                key_padding_mask.unsqueeze(1).unsqueeze(2),
+                float('-inf'),
+            )
+            attn_output_weights = attn_output_weights.view(
+                bs * self.num_heads, tgt_len, src_len)
+
+        attn_output_weights = F.softmax(
+            attn_output_weights -
+            attn_output_weights.max(dim=-1, keepdim=True)[0],
+            dim=-1)
+        attn_output_weights = self.attn_drop(attn_output_weights)
+
+        attn_output = torch.bmm(attn_output_weights, v)
+        assert list(
+            attn_output.size()) == [bs * self.num_heads, tgt_len, v_head_dims]
+        attn_output = attn_output.view(bs, self.num_heads, tgt_len,
+                                       v_head_dims).permute(0, 2, 1,
+                                                            3).flatten(2)
+        attn_output = self.out_proj(attn_output)
+
+        # average attention weights over heads
+        attn_output_weights = attn_output_weights.view(bs, self.num_heads,
+                                                       tgt_len, src_len)
+        return attn_output, attn_output_weights.sum(dim=1) / self.num_heads
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor,
+                query_pos: Tensor = None,
+                ref_sine_embed: Tensor = None,
+                key_pos: Tensor = None,
+                attn_mask: Tensor = None,
+                key_padding_mask: Tensor = None,
+                is_first: bool = False) -> Tensor:
+        """Forward function for `ConditionalAttention`.
+        Args:
+            query (Tensor): The input query with shape [bs, num_queries,
+                embed_dims].
+            key (Tensor): The key tensor with shape [bs, num_keys,
+                embed_dims].
+                If None, the `query` will be used. Defaults to None.
+            query_pos (Tensor): The positional encoding for query in self
+                attention, with the same shape as `x`. If not None, it will
+                be added to `x` before forward function.
+                Defaults to None.
+            query_sine_embed (Tensor): The positional encoding for query in
+                cross attention, with the same shape as `x`. If not None, it
+                will be added to `x` before forward function.
+                Defaults to None.
+            key_pos (Tensor): The positional encoding for `key`, with the
+                same shape as `key`. Defaults to None. If not None, it will
+                be added to `key` before forward function. If None, and
+                `query_pos` has the same shape as `key`, then `query_pos`
+                will be used for `key_pos`. Defaults to None.
+            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
+                num_keys]. Same in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+                Defaults to None.
+            is_first (bool): A indicator to tell whether the current layer
+                is the first layer of the decoder.
+                Defaults to False.
+        Returns:
+            Tensor: forwarded results with shape
+            [bs, num_queries, embed_dims].
+        """
+
+        if self.cross_attn:
+            q_content = self.qcontent_proj(query)
+            k_content = self.kcontent_proj(key)
+            v = self.v_proj(key)
+
+            bs, nq, c = q_content.size()
+            _, hw, _ = k_content.size()
+
+            k_pos = self.kpos_proj(key_pos)
+            if is_first or self.keep_query_pos:
+                q_pos = self.qpos_proj(query_pos)
+                q = q_content + q_pos
+                k = k_content + k_pos
+            else:
+                q = q_content
+                k = k_content
+            q = q.view(bs, nq, self.num_heads, c // self.num_heads)
+            query_sine_embed = self.qpos_sine_proj(ref_sine_embed)
+            query_sine_embed = query_sine_embed.view(bs, nq, self.num_heads,
+                                                     c // self.num_heads)
+            q = torch.cat([q, query_sine_embed], dim=3).view(bs, nq, 2 * c)
+            k = k.view(bs, hw, self.num_heads, c // self.num_heads)
+            k_pos = k_pos.view(bs, hw, self.num_heads, c // self.num_heads)
+            k = torch.cat([k, k_pos], dim=3).view(bs, hw, 2 * c)
+            ca_output = self.forward_attn(
+                query=q,
+                key=k,
+                value=v,
+                attn_mask=attn_mask,
+                key_padding_mask=key_padding_mask)[0]
+            query = query + self.proj_drop(ca_output)
+        else:
+            q_content = self.qcontent_proj(query)
+            q_pos = self.qpos_proj(query_pos)
+            k_content = self.kcontent_proj(query)
+            k_pos = self.kpos_proj(query_pos)
+            v = self.v_proj(query)
+            q = q_content if q_pos is None else q_content + q_pos
+            k = k_content if k_pos is None else k_content + k_pos
+            sa_output = self.forward_attn(
+                query=q,
+                key=k,
+                value=v,
+                attn_mask=attn_mask,
+                key_padding_mask=key_padding_mask)[0]
+            query = query + self.proj_drop(sa_output)
+
+        return query
+
+
+class MLP(BaseModule):
+    """Very simple multi-layer perceptron (also called FFN) with relu. Mostly
+    used in DETR series detectors.
+
+    Args:
+        input_dim (int): Feature dim of the input tensor.
+        hidden_dim (int): Feature dim of the hidden layer.
+        output_dim (int): Feature dim of the output tensor.
+        num_layers (int): Number of FFN layers. As the last
+            layer of MLP only contains FFN (Linear).
+    """
+
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int,
+                 num_layers: int) -> None:
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = ModuleList(
+            Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function of MLP.
+
+        Args:
+            x (Tensor): The input feature, has shape
+                (num_queries, bs, input_dim).
+        Returns:
+            Tensor: The output feature, has shape
+                (num_queries, bs, output_dim).
+        """
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+@MODELS.register_module()
+class DynamicConv(BaseModule):
+    """Implements Dynamic Convolution.
+
+    This module generate parameters for each sample and
+    use bmm to implement 1*1 convolution. Code is modified
+    from the `official github repo <https://github.com/PeizeSun/
+    SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/head.py#L258>`_ .
+
+    Args:
+        in_channels (int): The input feature channel.
+            Defaults to 256.
+        feat_channels (int): The inner feature channel.
+            Defaults to 64.
+        out_channels (int, optional): The output feature channel.
+            When not specified, it will be set to `in_channels`
+            by default
+        input_feat_shape (int): The shape of input feature.
+            Defaults to 7.
+        with_proj (bool): Project two-dimentional feature to
+            one-dimentional feature. Default to True.
+        act_cfg (dict): The activation config for DynamicConv.
+        norm_cfg (dict): Config dict for normalization layer. Default
+            layer normalization.
+        init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int = 256,
+                 feat_channels: int = 64,
+                 out_channels: Optional[int] = None,
+                 input_feat_shape: int = 7,
+                 with_proj: bool = True,
+                 act_cfg: OptConfigType = dict(type='ReLU', inplace=True),
+                 norm_cfg: OptConfigType = dict(type='LN'),
+                 init_cfg: OptConfigType = None) -> None:
+        super(DynamicConv, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.out_channels_raw = out_channels
+        self.input_feat_shape = input_feat_shape
+        self.with_proj = with_proj
+        self.act_cfg = act_cfg
+        self.norm_cfg = norm_cfg
+        self.out_channels = out_channels if out_channels else in_channels
+
+        self.num_params_in = self.in_channels * self.feat_channels
+        self.num_params_out = self.out_channels * self.feat_channels
+        self.dynamic_layer = nn.Linear(
+            self.in_channels, self.num_params_in + self.num_params_out)
+
+        self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1]
+        self.norm_out = build_norm_layer(norm_cfg, self.out_channels)[1]
+
+        self.activation = build_activation_layer(act_cfg)
+
+        num_output = self.out_channels * input_feat_shape**2
+        if self.with_proj:
+            self.fc_layer = nn.Linear(num_output, self.out_channels)
+            self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1]
+
+    def forward(self, param_feature: Tensor, input_feature: Tensor) -> Tensor:
+        """Forward function for `DynamicConv`.
+
+        Args:
+            param_feature (Tensor): The feature can be used
+                to generate the parameter, has shape
+                (num_all_proposals, in_channels).
+            input_feature (Tensor): Feature that
+                interact with parameters, has shape
+                (num_all_proposals, in_channels, H, W).
+
+        Returns:
+            Tensor: The output feature has shape
+            (num_all_proposals, out_channels).
+        """
+        input_feature = input_feature.flatten(2).permute(2, 0, 1)
+
+        input_feature = input_feature.permute(1, 0, 2)
+        parameters = self.dynamic_layer(param_feature)
+
+        param_in = parameters[:, :self.num_params_in].view(
+            -1, self.in_channels, self.feat_channels)
+        param_out = parameters[:, -self.num_params_out:].view(
+            -1, self.feat_channels, self.out_channels)
+
+        # input_feature has shape (num_all_proposals, H*W, in_channels)
+        # param_in has shape (num_all_proposals, in_channels, feat_channels)
+        # feature has shape (num_all_proposals, H*W, feat_channels)
+        features = torch.bmm(input_feature, param_in)
+        features = self.norm_in(features)
+        features = self.activation(features)
+
+        # param_out has shape (batch_size, feat_channels, out_channels)
+        features = torch.bmm(features, param_out)
+        features = self.norm_out(features)
+        features = self.activation(features)
+
+        if self.with_proj:
+            features = features.flatten(1)
+            features = self.fc_layer(features)
+            features = self.fc_norm(features)
+            features = self.activation(features)
+
+        return features
+
+
+def get_text_sine_pos_embed(
+    pos_tensor: torch.Tensor,
+    num_pos_feats: int = 128,
+    temperature: int = 10000,
+    exchange_xy: bool = True,
+):
+    """generate sine position embedding from a position tensor
+    Args:
+        pos_tensor (torch.Tensor): shape: [..., n].
+        num_pos_feats (int): projected shape for each float in the tensor.
+        temperature (int): temperature in the sine/cosine function.
+        exchange_xy (bool, optional): exchange pos x and pos y. For example,
+            input tensor is [x,y], the results will be [pos(y), pos(x)].
+            Defaults to True.
+    Returns:
+        pos_embed (torch.Tensor): shape: [..., n*num_pos_feats].
+    """
+    scale = 2 * math.pi
+    dim_t = torch.arange(
+        num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
+    dim_t = temperature**(2 * torch.div(dim_t, 2, rounding_mode='floor') /
+                          num_pos_feats)
+
+    def sine_func(x: torch.Tensor):
+        sin_x = x * scale / dim_t
+        sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()),
+                            dim=3).flatten(2)
+        return sin_x
+
+    pos_res = [
+        sine_func(x)
+        for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)
+    ]
+    if exchange_xy:
+        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
+    pos_res = torch.cat(pos_res, dim=-1)
+    return pos_res
diff --git a/mmde/mmdet/models/losses/__init__.py b/mmde/mmdet/models/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c57a3a96879c6bd5eb61c300d316e2b4579b287
--- /dev/null
+++ b/mmde/mmdet/models/losses/__init__.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .accuracy import Accuracy, accuracy
+from .ae_loss import AssociativeEmbeddingLoss
+from .balanced_l1_loss import BalancedL1Loss, balanced_l1_loss
+from .cross_entropy_loss import (CrossEntropyCustomLoss, CrossEntropyLoss,
+                                 binary_cross_entropy, cross_entropy,
+                                 mask_cross_entropy)
+from .ddq_detr_aux_loss import DDQAuxLoss
+from .dice_loss import DiceLoss
+from .eqlv2_loss import EQLV2Loss
+from .focal_loss import FocalCustomLoss, FocalLoss, sigmoid_focal_loss
+from .gaussian_focal_loss import GaussianFocalLoss
+from .gfocal_loss import DistributionFocalLoss, QualityFocalLoss
+from .ghm_loss import GHMC, GHMR
+from .iou_loss import (BoundedIoULoss, CIoULoss, DIoULoss, EIoULoss, GIoULoss,
+                       IoULoss, SIoULoss, bounded_iou_loss, iou_loss)
+from .kd_loss import KnowledgeDistillationKLDivLoss
+from .l2_loss import L2Loss
+from .margin_loss import MarginL2Loss
+from .mse_loss import MSELoss, mse_loss
+from .multipos_cross_entropy_loss import MultiPosCrossEntropyLoss
+from .pisa_loss import carl_loss, isr_p
+from .seesaw_loss import SeesawLoss
+from .smooth_l1_loss import L1Loss, SmoothL1Loss, l1_loss, smooth_l1_loss
+from .triplet_loss import TripletLoss
+from .utils import reduce_loss, weight_reduce_loss, weighted_loss
+from .varifocal_loss import VarifocalLoss
+
+__all__ = [
+    'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy',
+    'mask_cross_entropy', 'CrossEntropyLoss', 'sigmoid_focal_loss',
+    'FocalLoss', 'smooth_l1_loss', 'SmoothL1Loss', 'balanced_l1_loss',
+    'BalancedL1Loss', 'mse_loss', 'MSELoss', 'iou_loss', 'bounded_iou_loss',
+    'IoULoss', 'BoundedIoULoss', 'GIoULoss', 'DIoULoss', 'CIoULoss',
+    'EIoULoss', 'SIoULoss', 'GHMC', 'GHMR', 'reduce_loss',
+    'weight_reduce_loss', 'weighted_loss', 'L1Loss', 'l1_loss', 'isr_p',
+    'carl_loss', 'AssociativeEmbeddingLoss', 'GaussianFocalLoss',
+    'QualityFocalLoss', 'DistributionFocalLoss', 'VarifocalLoss',
+    'KnowledgeDistillationKLDivLoss', 'SeesawLoss', 'DiceLoss', 'EQLV2Loss',
+    'MarginL2Loss', 'MultiPosCrossEntropyLoss', 'L2Loss', 'TripletLoss',
+    'DDQAuxLoss', 'CrossEntropyCustomLoss', 'FocalCustomLoss'
+]
diff --git a/mmde/mmdet/models/losses/accuracy.py b/mmde/mmdet/models/losses/accuracy.py
new file mode 100644
index 0000000000000000000000000000000000000000..d68484e13965ced3bd6b104071d22657a9b3fde6
--- /dev/null
+++ b/mmde/mmdet/models/losses/accuracy.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+
+
+def accuracy(pred, target, topk=1, thresh=None):
+    """Calculate accuracy according to the prediction and target.
+
+    Args:
+        pred (torch.Tensor): The model prediction, shape (N, num_class)
+        target (torch.Tensor): The target of each prediction, shape (N, )
+        topk (int | tuple[int], optional): If the predictions in ``topk``
+            matches the target, the predictions will be regarded as
+            correct ones. Defaults to 1.
+        thresh (float, optional): If not None, predictions with scores under
+            this threshold are considered incorrect. Default to None.
+
+    Returns:
+        float | tuple[float]: If the input ``topk`` is a single integer,
+            the function will return a single float as accuracy. If
+            ``topk`` is a tuple containing multiple integers, the
+            function will return a tuple containing accuracies of
+            each ``topk`` number.
+    """
+    assert isinstance(topk, (int, tuple))
+    if isinstance(topk, int):
+        topk = (topk, )
+        return_single = True
+    else:
+        return_single = False
+
+    maxk = max(topk)
+    if pred.size(0) == 0:
+        accu = [pred.new_tensor(0.) for i in range(len(topk))]
+        return accu[0] if return_single else accu
+    assert pred.ndim == 2 and target.ndim == 1
+    assert pred.size(0) == target.size(0)
+    assert maxk <= pred.size(1), \
+        f'maxk {maxk} exceeds pred dimension {pred.size(1)}'
+    pred_value, pred_label = pred.topk(maxk, dim=1)
+    pred_label = pred_label.t()  # transpose to shape (maxk, N)
+    correct = pred_label.eq(target.view(1, -1).expand_as(pred_label))
+    if thresh is not None:
+        # Only prediction values larger than thresh are counted as correct
+        correct = correct & (pred_value > thresh).t()
+    res = []
+    for k in topk:
+        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+        res.append(correct_k.mul_(100.0 / pred.size(0)))
+    return res[0] if return_single else res
+
+
+class Accuracy(nn.Module):
+
+    def __init__(self, topk=(1, ), thresh=None):
+        """Module to calculate the accuracy.
+
+        Args:
+            topk (tuple, optional): The criterion used to calculate the
+                accuracy. Defaults to (1,).
+            thresh (float, optional): If not None, predictions with scores
+                under this threshold are considered incorrect. Default to None.
+        """
+        super().__init__()
+        self.topk = topk
+        self.thresh = thresh
+
+    def forward(self, pred, target):
+        """Forward function to calculate accuracy.
+
+        Args:
+            pred (torch.Tensor): Prediction of models.
+            target (torch.Tensor): Target for each prediction.
+
+        Returns:
+            tuple[float]: The accuracies under different topk criterions.
+        """
+        return accuracy(pred, target, self.topk, self.thresh)
diff --git a/mmde/mmdet/models/losses/ae_loss.py b/mmde/mmdet/models/losses/ae_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..2aa7d696be4b937a2d45545a8309aaa936fe5f22
--- /dev/null
+++ b/mmde/mmdet/models/losses/ae_loss.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet.registry import MODELS
+
+
+def ae_loss_per_image(tl_preds, br_preds, match):
+    """Associative Embedding Loss in one image.
+
+    Associative Embedding Loss including two parts: pull loss and push loss.
+    Pull loss makes embedding vectors from same object closer to each other.
+    Push loss distinguish embedding vector from different objects, and makes
+        the gap between them is large enough.
+
+    During computing, usually there are 3 cases:
+        - no object in image: both pull loss and push loss will be 0.
+        - one object in image: push loss will be 0 and pull loss is computed
+            by the two corner of the only object.
+        - more than one objects in image: pull loss is computed by corner pairs
+            from each object, push loss is computed by each object with all
+            other objects. We use confusion matrix with 0 in diagonal to
+            compute the push loss.
+
+    Args:
+        tl_preds (tensor): Embedding feature map of left-top corner.
+        br_preds (tensor): Embedding feature map of bottim-right corner.
+        match (list): Downsampled coordinates pair of each ground truth box.
+    """
+
+    tl_list, br_list, me_list = [], [], []
+    if len(match) == 0:  # no object in image
+        pull_loss = tl_preds.sum() * 0.
+        push_loss = tl_preds.sum() * 0.
+    else:
+        for m in match:
+            [tl_y, tl_x], [br_y, br_x] = m
+            tl_e = tl_preds[:, tl_y, tl_x].view(-1, 1)
+            br_e = br_preds[:, br_y, br_x].view(-1, 1)
+            tl_list.append(tl_e)
+            br_list.append(br_e)
+            me_list.append((tl_e + br_e) / 2.0)
+
+        tl_list = torch.cat(tl_list)
+        br_list = torch.cat(br_list)
+        me_list = torch.cat(me_list)
+
+        assert tl_list.size() == br_list.size()
+
+        # N is object number in image, M is dimension of embedding vector
+        N, M = tl_list.size()
+
+        pull_loss = (tl_list - me_list).pow(2) + (br_list - me_list).pow(2)
+        pull_loss = pull_loss.sum() / N
+
+        margin = 1  # exp setting of CornerNet, details in section 3.3 of paper
+
+        # confusion matrix of push loss
+        conf_mat = me_list.expand((N, N, M)).permute(1, 0, 2) - me_list
+        conf_weight = 1 - torch.eye(N).type_as(me_list)
+        conf_mat = conf_weight * (margin - conf_mat.sum(-1).abs())
+
+        if N > 1:  # more than one object in current image
+            push_loss = F.relu(conf_mat).sum() / (N * (N - 1))
+        else:
+            push_loss = tl_preds.sum() * 0.
+
+    return pull_loss, push_loss
+
+
+@MODELS.register_module()
+class AssociativeEmbeddingLoss(nn.Module):
+    """Associative Embedding Loss.
+
+    More details can be found in
+    `Associative Embedding <https://arxiv.org/abs/1611.05424>`_ and
+    `CornerNet <https://arxiv.org/abs/1808.01244>`_ .
+    Code is modified from `kp_utils.py <https://github.com/princeton-vl/CornerNet/blob/master/models/py_utils/kp_utils.py#L180>`_  # noqa: E501
+
+    Args:
+        pull_weight (float): Loss weight for corners from same object.
+        push_weight (float): Loss weight for corners from different object.
+    """
+
+    def __init__(self, pull_weight=0.25, push_weight=0.25):
+        super(AssociativeEmbeddingLoss, self).__init__()
+        self.pull_weight = pull_weight
+        self.push_weight = push_weight
+
+    def forward(self, pred, target, match):
+        """Forward function."""
+        batch = pred.size(0)
+        pull_all, push_all = 0.0, 0.0
+        for i in range(batch):
+            pull, push = ae_loss_per_image(pred[i], target[i], match[i])
+
+            pull_all += self.pull_weight * pull
+            push_all += self.push_weight * push
+
+        return pull_all, push_all
diff --git a/mmde/mmdet/models/losses/balanced_l1_loss.py b/mmde/mmdet/models/losses/balanced_l1_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..25adaab2239e871476d9d4e3cbb1a238c3043041
--- /dev/null
+++ b/mmde/mmdet/models/losses/balanced_l1_loss.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+
+from mmdet.registry import MODELS
+from .utils import weighted_loss
+
+
+@weighted_loss
+def balanced_l1_loss(pred,
+                     target,
+                     beta=1.0,
+                     alpha=0.5,
+                     gamma=1.5,
+                     reduction='mean'):
+    """Calculate balanced L1 loss.
+
+    Please see the `Libra R-CNN <https://arxiv.org/pdf/1904.02701.pdf>`_
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, 4).
+        target (torch.Tensor): The learning target of the prediction with
+            shape (N, 4).
+        beta (float): The loss is a piecewise function of prediction and target
+            and ``beta`` serves as a threshold for the difference between the
+            prediction and target. Defaults to 1.0.
+        alpha (float): The denominator ``alpha`` in the balanced L1 loss.
+            Defaults to 0.5.
+        gamma (float): The ``gamma`` in the balanced L1 loss.
+            Defaults to 1.5.
+        reduction (str, optional): The method that reduces the loss to a
+            scalar. Options are "none", "mean" and "sum".
+
+    Returns:
+        torch.Tensor: The calculated loss
+    """
+    assert beta > 0
+    if target.numel() == 0:
+        return pred.sum() * 0
+
+    assert pred.size() == target.size()
+
+    diff = torch.abs(pred - target)
+    b = np.e**(gamma / alpha) - 1
+    loss = torch.where(
+        diff < beta, alpha / b *
+        (b * diff + 1) * torch.log(b * diff / beta + 1) - alpha * diff,
+        gamma * diff + gamma / b - alpha * beta)
+
+    return loss
+
+
+@MODELS.register_module()
+class BalancedL1Loss(nn.Module):
+    """Balanced L1 Loss.
+
+    arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019)
+
+    Args:
+        alpha (float): The denominator ``alpha`` in the balanced L1 loss.
+            Defaults to 0.5.
+        gamma (float): The ``gamma`` in the balanced L1 loss. Defaults to 1.5.
+        beta (float, optional): The loss is a piecewise function of prediction
+            and target. ``beta`` serves as a threshold for the difference
+            between the prediction and target. Defaults to 1.0.
+        reduction (str, optional): The method that reduces the loss to a
+            scalar. Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+    """
+
+    def __init__(self,
+                 alpha=0.5,
+                 gamma=1.5,
+                 beta=1.0,
+                 reduction='mean',
+                 loss_weight=1.0):
+        super(BalancedL1Loss, self).__init__()
+        self.alpha = alpha
+        self.gamma = gamma
+        self.beta = beta
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None,
+                **kwargs):
+        """Forward function of loss.
+
+        Args:
+            pred (torch.Tensor): The prediction with shape (N, 4).
+            target (torch.Tensor): The learning target of the prediction with
+                shape (N, 4).
+            weight (torch.Tensor, optional): Sample-wise loss weight with
+                shape (N, ).
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+            torch.Tensor: The calculated loss
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * balanced_l1_loss(
+            pred,
+            target,
+            weight,
+            alpha=self.alpha,
+            gamma=self.gamma,
+            beta=self.beta,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss_bbox
diff --git a/mmde/mmdet/models/losses/cross_entropy_loss.py b/mmde/mmdet/models/losses/cross_entropy_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..49fac7743ceddd2454f44b76c63d514de43b5aef
--- /dev/null
+++ b/mmde/mmdet/models/losses/cross_entropy_loss.py
@@ -0,0 +1,401 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet.registry import MODELS
+from .accuracy import accuracy
+from .utils import weight_reduce_loss
+
+
+def cross_entropy(pred,
+                  label,
+                  weight=None,
+                  reduction='mean',
+                  avg_factor=None,
+                  class_weight=None,
+                  ignore_index=-100,
+                  avg_non_ignore=False):
+    """Calculate the CrossEntropy loss.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C), C is the number
+            of classes.
+        label (torch.Tensor): The learning label of the prediction.
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        reduction (str, optional): The method used to reduce the loss.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+        class_weight (list[float], optional): The weight for each class.
+        ignore_index (int | None): The label index to be ignored.
+            If None, it will be set to default value. Default: -100.
+        avg_non_ignore (bool): The flag decides to whether the loss is
+            only averaged over non-ignored targets. Default: False.
+
+    Returns:
+        torch.Tensor: The calculated loss
+    """
+    # The default value of ignore_index is the same as F.cross_entropy
+    ignore_index = -100 if ignore_index is None else ignore_index
+    # element-wise losses
+    loss = F.cross_entropy(
+        pred,
+        label,
+        weight=class_weight,
+        reduction='none',
+        ignore_index=ignore_index)
+
+    # average loss over non-ignored elements
+    # pytorch's official cross_entropy average loss over non-ignored elements
+    # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660  # noqa
+    if (avg_factor is None) and avg_non_ignore and reduction == 'mean':
+        avg_factor = label.numel() - (label == ignore_index).sum().item()
+
+    # apply weights and do the reduction
+    if weight is not None:
+        weight = weight.float()
+    loss = weight_reduce_loss(
+        loss, weight=weight, reduction=reduction, avg_factor=avg_factor)
+
+    return loss
+
+
+def _expand_onehot_labels(labels, label_weights, label_channels, ignore_index):
+    """Expand onehot labels to match the size of prediction."""
+    bin_labels = labels.new_full((labels.size(0), label_channels), 0)
+    valid_mask = (labels >= 0) & (labels != ignore_index)
+    inds = torch.nonzero(
+        valid_mask & (labels < label_channels), as_tuple=False)
+
+    if inds.numel() > 0:
+        bin_labels[inds, labels[inds]] = 1
+
+    valid_mask = valid_mask.view(-1, 1).expand(labels.size(0),
+                                               label_channels).float()
+    if label_weights is None:
+        bin_label_weights = valid_mask
+    else:
+        bin_label_weights = label_weights.view(-1, 1).repeat(1, label_channels)
+        bin_label_weights *= valid_mask
+
+    return bin_labels, bin_label_weights, valid_mask
+
+
+def binary_cross_entropy(pred,
+                         label,
+                         weight=None,
+                         reduction='mean',
+                         avg_factor=None,
+                         class_weight=None,
+                         ignore_index=-100,
+                         avg_non_ignore=False):
+    """Calculate the binary CrossEntropy loss.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, 1) or (N, ).
+            When the shape of pred is (N, 1), label will be expanded to
+            one-hot format, and when the shape of pred is (N, ), label
+            will not be expanded to one-hot format.
+        label (torch.Tensor): The learning label of the prediction,
+            with shape (N, ).
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        reduction (str, optional): The method used to reduce the loss.
+            Options are "none", "mean" and "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+        class_weight (list[float], optional): The weight for each class.
+        ignore_index (int | None): The label index to be ignored.
+            If None, it will be set to default value. Default: -100.
+        avg_non_ignore (bool): The flag decides to whether the loss is
+            only averaged over non-ignored targets. Default: False.
+
+    Returns:
+        torch.Tensor: The calculated loss.
+    """
+    # The default value of ignore_index is the same as F.cross_entropy
+    ignore_index = -100 if ignore_index is None else ignore_index
+
+    if pred.dim() != label.dim():
+        label, weight, valid_mask = _expand_onehot_labels(
+            label, weight, pred.size(-1), ignore_index)
+    else:
+        # should mask out the ignored elements
+        valid_mask = ((label >= 0) & (label != ignore_index)).float()
+        if weight is not None:
+            # The inplace writing method will have a mismatched broadcast
+            # shape error if the weight and valid_mask dimensions
+            # are inconsistent such as (B,N,1) and (B,N,C).
+            weight = weight * valid_mask
+        else:
+            weight = valid_mask
+
+    # average loss over non-ignored elements
+    if (avg_factor is None) and avg_non_ignore and reduction == 'mean':
+        avg_factor = valid_mask.sum().item()
+
+    # weighted element-wise losses
+    weight = weight.float()
+    loss = F.binary_cross_entropy_with_logits(
+        pred, label.float(), pos_weight=class_weight, reduction='none')
+    # do the reduction for the weighted loss
+    loss = weight_reduce_loss(
+        loss, weight, reduction=reduction, avg_factor=avg_factor)
+
+    return loss
+
+
+def mask_cross_entropy(pred,
+                       target,
+                       label,
+                       reduction='mean',
+                       avg_factor=None,
+                       class_weight=None,
+                       ignore_index=None,
+                       **kwargs):
+    """Calculate the CrossEntropy loss for masks.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C, *), C is the
+            number of classes. The trailing * indicates arbitrary shape.
+        target (torch.Tensor): The learning label of the prediction.
+        label (torch.Tensor): ``label`` indicates the class label of the mask
+            corresponding object. This will be used to select the mask in the
+            of the class which the object belongs to when the mask prediction
+            if not class-agnostic.
+        reduction (str, optional): The method used to reduce the loss.
+            Options are "none", "mean" and "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+        class_weight (list[float], optional): The weight for each class.
+        ignore_index (None): Placeholder, to be consistent with other loss.
+            Default: None.
+
+    Returns:
+        torch.Tensor: The calculated loss
+
+    Example:
+        >>> N, C = 3, 11
+        >>> H, W = 2, 2
+        >>> pred = torch.randn(N, C, H, W) * 1000
+        >>> target = torch.rand(N, H, W)
+        >>> label = torch.randint(0, C, size=(N,))
+        >>> reduction = 'mean'
+        >>> avg_factor = None
+        >>> class_weights = None
+        >>> loss = mask_cross_entropy(pred, target, label, reduction,
+        >>>                           avg_factor, class_weights)
+        >>> assert loss.shape == (1,)
+    """
+    assert ignore_index is None, 'BCE loss does not support ignore_index'
+    # TODO: handle these two reserved arguments
+    assert reduction == 'mean' and avg_factor is None
+    num_rois = pred.size()[0]
+    inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
+    pred_slice = pred[inds, label].squeeze(1)
+    return F.binary_cross_entropy_with_logits(
+        pred_slice, target, weight=class_weight, reduction='mean')[None]
+
+
+@MODELS.register_module()
+class CrossEntropyLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=False,
+                 use_mask=False,
+                 reduction='mean',
+                 class_weight=None,
+                 ignore_index=None,
+                 loss_weight=1.0,
+                 avg_non_ignore=False):
+        """CrossEntropyLoss.
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+                of softmax. Defaults to False.
+            use_mask (bool, optional): Whether to use mask cross entropy loss.
+                Defaults to False.
+            reduction (str, optional): . Defaults to 'mean'.
+                Options are "none", "mean" and "sum".
+            class_weight (list[float], optional): Weight of each class.
+                Defaults to None.
+            ignore_index (int | None): The label index to be ignored.
+                Defaults to None.
+            loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
+            avg_non_ignore (bool): The flag decides to whether the loss is
+                only averaged over non-ignored targets. Default: False.
+        """
+        super(CrossEntropyLoss, self).__init__()
+        assert (use_sigmoid is False) or (use_mask is False)
+        self.use_sigmoid = use_sigmoid
+        self.use_mask = use_mask
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.class_weight = class_weight
+        self.ignore_index = ignore_index
+        self.avg_non_ignore = avg_non_ignore
+        if ((ignore_index is not None) and not self.avg_non_ignore
+                and self.reduction == 'mean'):
+            warnings.warn(
+                'Default ``avg_non_ignore`` is False, if you would like to '
+                'ignore the certain label and average loss over non-ignore '
+                'labels, which is the same with PyTorch official '
+                'cross_entropy, set ``avg_non_ignore=True``.')
+
+        if self.use_sigmoid:
+            self.cls_criterion = binary_cross_entropy
+        elif self.use_mask:
+            self.cls_criterion = mask_cross_entropy
+        else:
+            self.cls_criterion = cross_entropy
+
+    def extra_repr(self):
+        """Extra repr."""
+        s = f'avg_non_ignore={self.avg_non_ignore}'
+        return s
+
+    def forward(self,
+                cls_score,
+                label,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None,
+                ignore_index=None,
+                **kwargs):
+        """Forward function.
+
+        Args:
+            cls_score (torch.Tensor): The prediction.
+            label (torch.Tensor): The learning label of the prediction.
+            weight (torch.Tensor, optional): Sample-wise loss weight.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The method used to reduce the
+                loss. Options are "none", "mean" and "sum".
+            ignore_index (int | None): The label index to be ignored.
+                If not None, it will override the default value. Default: None.
+        Returns:
+            torch.Tensor: The calculated loss.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if ignore_index is None:
+            ignore_index = self.ignore_index
+
+        if self.class_weight is not None:
+            class_weight = cls_score.new_tensor(
+                self.class_weight, device=cls_score.device)
+        else:
+            class_weight = None
+        loss_cls = self.loss_weight * self.cls_criterion(
+            cls_score,
+            label,
+            weight,
+            class_weight=class_weight,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            ignore_index=ignore_index,
+            avg_non_ignore=self.avg_non_ignore,
+            **kwargs)
+        return loss_cls
+
+
+@MODELS.register_module()
+class CrossEntropyCustomLoss(CrossEntropyLoss):
+
+    def __init__(self,
+                 use_sigmoid=False,
+                 use_mask=False,
+                 reduction='mean',
+                 num_classes=-1,
+                 class_weight=None,
+                 ignore_index=None,
+                 loss_weight=1.0,
+                 avg_non_ignore=False):
+        """CrossEntropyCustomLoss.
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+                of softmax. Defaults to False.
+            use_mask (bool, optional): Whether to use mask cross entropy loss.
+                Defaults to False.
+            reduction (str, optional): . Defaults to 'mean'.
+                Options are "none", "mean" and "sum".
+            num_classes (int): Number of classes to classify.
+            class_weight (list[float], optional): Weight of each class.
+                Defaults to None.
+            ignore_index (int | None): The label index to be ignored.
+                Defaults to None.
+            loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
+            avg_non_ignore (bool): The flag decides to whether the loss is
+                only averaged over non-ignored targets. Default: False.
+        """
+        super(CrossEntropyCustomLoss, self).__init__()
+        assert (use_sigmoid is False) or (use_mask is False)
+        self.use_sigmoid = use_sigmoid
+        self.use_mask = use_mask
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.class_weight = class_weight
+        self.ignore_index = ignore_index
+        self.avg_non_ignore = avg_non_ignore
+        if ((ignore_index is not None) and not self.avg_non_ignore
+                and self.reduction == 'mean'):
+            warnings.warn(
+                'Default ``avg_non_ignore`` is False, if you would like to '
+                'ignore the certain label and average loss over non-ignore '
+                'labels, which is the same with PyTorch official '
+                'cross_entropy, set ``avg_non_ignore=True``.')
+
+        if self.use_sigmoid:
+            self.cls_criterion = binary_cross_entropy
+        elif self.use_mask:
+            self.cls_criterion = mask_cross_entropy
+        else:
+            self.cls_criterion = cross_entropy
+
+        self.num_classes = num_classes
+
+        assert self.num_classes != -1
+
+        # custom output channels of the classifier
+        self.custom_cls_channels = True
+        # custom activation of cls_score
+        self.custom_activation = True
+        # custom accuracy of the classsifier
+        self.custom_accuracy = True
+
+    def get_cls_channels(self, num_classes):
+        assert num_classes == self.num_classes
+        if not self.use_sigmoid:
+            return num_classes + 1
+        else:
+            return num_classes
+
+    def get_activation(self, cls_score):
+
+        fine_cls_score = cls_score[:, :self.num_classes]
+
+        if not self.use_sigmoid:
+            bg_score = cls_score[:, [-1]]
+            new_score = torch.cat([fine_cls_score, bg_score], dim=-1)
+            scores = F.softmax(new_score, dim=-1)
+        else:
+            score_classes = fine_cls_score.sigmoid()
+            score_neg = 1 - score_classes.sum(dim=1, keepdim=True)
+            score_neg = score_neg.clamp(min=0, max=1)
+            scores = torch.cat([score_classes, score_neg], dim=1)
+
+        return scores
+
+    def get_accuracy(self, cls_score, labels):
+
+        fine_cls_score = cls_score[:, :self.num_classes]
+
+        pos_inds = labels < self.num_classes
+        acc_classes = accuracy(fine_cls_score[pos_inds], labels[pos_inds])
+        acc = dict()
+        acc['acc_classes'] = acc_classes
+        return acc
diff --git a/mmde/mmdet/models/losses/ddq_detr_aux_loss.py b/mmde/mmdet/models/losses/ddq_detr_aux_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..41f1c7166e6c7d05c5414cd04ad3eb3cd467f1b6
--- /dev/null
+++ b/mmde/mmdet/models/losses/ddq_detr_aux_loss.py
@@ -0,0 +1,303 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmengine.structures import BaseDataElement
+
+from mmdet.models.utils import multi_apply
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import reduce_mean
+
+
+class DDQAuxLoss(nn.Module):
+    """DDQ auxiliary branches loss for dense queries.
+
+    Args:
+        loss_cls (dict):
+            Configuration of classification loss function.
+        loss_bbox (dict):
+            Configuration of bbox regression loss function.
+        train_cfg (dict):
+            Configuration of gt targets assigner for each predicted bbox.
+    """
+
+    def __init__(
+        self,
+        loss_cls=dict(
+            type='QualityFocalLoss',
+            use_sigmoid=True,
+            activated=True,  # use probability instead of logit as input
+            beta=2.0,
+            loss_weight=1.0),
+        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
+        train_cfg=dict(
+            assigner=dict(type='TopkHungarianAssigner', topk=8),
+            alpha=1,
+            beta=6),
+    ):
+        super(DDQAuxLoss, self).__init__()
+        self.train_cfg = train_cfg
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+
+        sampler_cfg = dict(type='PseudoSampler')
+        self.sampler = TASK_UTILS.build(sampler_cfg)
+
+    def loss_single(self, cls_score, bbox_pred, labels, label_weights,
+                    bbox_targets, alignment_metrics):
+        """Calculate auxiliary branches loss for dense queries for one image.
+
+        Args:
+            cls_score (Tensor): Predicted normalized classification
+                scores for one image, has shape (num_dense_queries,
+                cls_out_channels).
+            bbox_pred (Tensor): Predicted unnormalized bbox coordinates
+                for one image, has shape (num_dense_queries, 4) with the
+                last dimension arranged as (x1, y1, x2, y2).
+            labels (Tensor): Labels for one image.
+            label_weights (Tensor): Label weights for one image.
+            bbox_targets (Tensor): Bbox targets for one image.
+            alignment_metrics (Tensor): Normalized alignment metrics for one
+                image.
+
+        Returns:
+            tuple: A tuple of loss components and loss weights.
+        """
+        bbox_targets = bbox_targets.reshape(-1, 4)
+        labels = labels.reshape(-1)
+        alignment_metrics = alignment_metrics.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        targets = (labels, alignment_metrics)
+        cls_loss_func = self.loss_cls
+
+        loss_cls = cls_loss_func(
+            cls_score, targets, label_weights, avg_factor=1.0)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = cls_score.size(-1)
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero().squeeze(1)
+
+        if len(pos_inds) > 0:
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_bbox_pred = bbox_pred[pos_inds]
+
+            pos_decode_bbox_pred = pos_bbox_pred
+            pos_decode_bbox_targets = pos_bbox_targets
+
+            # regression loss
+            pos_bbox_weight = alignment_metrics[pos_inds]
+
+            loss_bbox = self.loss_bbox(
+                pos_decode_bbox_pred,
+                pos_decode_bbox_targets,
+                weight=pos_bbox_weight,
+                avg_factor=1.0)
+        else:
+            loss_bbox = bbox_pred.sum() * 0
+            pos_bbox_weight = bbox_targets.new_tensor(0.)
+
+        return loss_cls, loss_bbox, alignment_metrics.sum(
+        ), pos_bbox_weight.sum()
+
+    def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas,
+             **kwargs):
+        """Calculate auxiliary branches loss for dense queries.
+
+        Args:
+            cls_scores (Tensor): Predicted normalized classification
+                scores, has shape (bs, num_dense_queries,
+                cls_out_channels).
+            bbox_preds (Tensor): Predicted unnormalized bbox coordinates,
+                has shape (bs, num_dense_queries, 4) with the last
+                dimension arranged as (x1, y1, x2, y2).
+            gt_bboxes (list[Tensor]): List of unnormalized ground truth
+                bboxes for each image, each has shape (num_gt, 4) with the
+                last dimension arranged as (x1, y1, x2, y2).
+                NOTE: num_gt is dynamic for each image.
+            gt_labels (list[Tensor]): List of ground truth classification
+                index for each image, each has shape (num_gt,).
+                NOTE: num_gt is dynamic for each image.
+            img_metas (list[dict]): Meta information for one image,
+                e.g., image size, scaling factor, etc.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        flatten_cls_scores = cls_scores
+        flatten_bbox_preds = bbox_preds
+
+        cls_reg_targets = self.get_targets(
+            flatten_cls_scores,
+            flatten_bbox_preds,
+            gt_bboxes,
+            img_metas,
+            gt_labels_list=gt_labels,
+        )
+        (labels_list, label_weights_list, bbox_targets_list,
+         alignment_metrics_list) = cls_reg_targets
+
+        losses_cls, losses_bbox, \
+            cls_avg_factors, bbox_avg_factors = multi_apply(
+                self.loss_single,
+                flatten_cls_scores,
+                flatten_bbox_preds,
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                alignment_metrics_list,
+                )
+
+        cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item()
+        losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls))
+
+        bbox_avg_factor = reduce_mean(
+            sum(bbox_avg_factors)).clamp_(min=1).item()
+        losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox))
+        return dict(aux_loss_cls=losses_cls, aux_loss_bbox=losses_bbox)
+
+    def get_targets(self,
+                    cls_scores,
+                    bbox_preds,
+                    gt_bboxes_list,
+                    img_metas,
+                    gt_labels_list=None,
+                    **kwargs):
+        """Compute regression and classification targets for a batch images.
+
+        Args:
+            cls_scores (Tensor): Predicted normalized classification
+                scores, has shape (bs, num_dense_queries,
+                cls_out_channels).
+            bbox_preds (Tensor): Predicted unnormalized bbox coordinates,
+                has shape (bs, num_dense_queries, 4) with the last
+                dimension arranged as (x1, y1, x2, y2).
+            gt_bboxes_list (List[Tensor]): List of unnormalized ground truth
+                bboxes for each image, each has shape (num_gt, 4) with the
+                last dimension arranged as (x1, y1, x2, y2).
+                NOTE: num_gt is dynamic for each image.
+            img_metas (list[dict]): Meta information for one image,
+                e.g., image size, scaling factor, etc.
+            gt_labels_list (list[Tensor]): List of ground truth classification
+                    index for each image, each has shape (num_gt,).
+                    NOTE: num_gt is dynamic for each image.
+                    Default: None.
+
+        Returns:
+            tuple: a tuple containing the following targets.
+
+            - all_labels (list[Tensor]): Labels for all images.
+            - all_label_weights (list[Tensor]): Label weights for all images.
+            - all_bbox_targets (list[Tensor]): Bbox targets for all images.
+            - all_assign_metrics (list[Tensor]): Normalized alignment metrics
+                for all images.
+        """
+        (all_labels, all_label_weights, all_bbox_targets,
+         all_assign_metrics) = multi_apply(self._get_target_single, cls_scores,
+                                           bbox_preds, gt_bboxes_list,
+                                           gt_labels_list, img_metas)
+
+        return (all_labels, all_label_weights, all_bbox_targets,
+                all_assign_metrics)
+
+    def _get_target_single(self, cls_scores, bbox_preds, gt_bboxes, gt_labels,
+                           img_meta, **kwargs):
+        """Compute regression and classification targets for one image.
+
+        Args:
+            cls_scores (Tensor): Predicted normalized classification
+                scores for one image, has shape (num_dense_queries,
+                cls_out_channels).
+            bbox_preds (Tensor): Predicted unnormalized bbox coordinates
+                for one image, has shape (num_dense_queries, 4) with the
+                last dimension arranged as (x1, y1, x2, y2).
+            gt_bboxes (Tensor): Unnormalized ground truth
+                bboxes for one image, has shape (num_gt, 4) with the
+                last dimension arranged as (x1, y1, x2, y2).
+                NOTE: num_gt is dynamic for each image.
+            gt_labels (Tensor): Ground truth classification
+                    index for the image, has shape (num_gt,).
+                    NOTE: num_gt is dynamic for each image.
+            img_meta (dict): Meta information for one image.
+
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+
+            - labels (Tensor): Labels for one image.
+            - label_weights (Tensor): Label weights for one image.
+            - bbox_targets (Tensor): Bbox targets for one image.
+            - norm_alignment_metrics (Tensor): Normalized alignment
+                metrics for one image.
+        """
+        if len(gt_labels) == 0:
+            num_valid_anchors = len(cls_scores)
+            bbox_targets = torch.zeros_like(bbox_preds)
+            labels = bbox_preds.new_full((num_valid_anchors, ),
+                                         cls_scores.size(-1),
+                                         dtype=torch.long)
+            label_weights = bbox_preds.new_zeros(
+                num_valid_anchors, dtype=torch.float)
+            norm_alignment_metrics = bbox_preds.new_zeros(
+                num_valid_anchors, dtype=torch.float)
+            return (labels, label_weights, bbox_targets,
+                    norm_alignment_metrics)
+
+        assign_result = self.assigner.assign(cls_scores, bbox_preds, gt_bboxes,
+                                             gt_labels, img_meta)
+        assign_ious = assign_result.max_overlaps
+        assign_metrics = assign_result.assign_metrics
+
+        pred_instances = BaseDataElement()
+        gt_instances = BaseDataElement()
+
+        pred_instances.bboxes = bbox_preds
+        gt_instances.bboxes = gt_bboxes
+
+        pred_instances.priors = cls_scores
+        gt_instances.labels = gt_labels
+
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_anchors = len(cls_scores)
+        bbox_targets = torch.zeros_like(bbox_preds)
+        labels = bbox_preds.new_full((num_valid_anchors, ),
+                                     cls_scores.size(-1),
+                                     dtype=torch.long)
+        label_weights = bbox_preds.new_zeros(
+            num_valid_anchors, dtype=torch.float)
+        norm_alignment_metrics = bbox_preds.new_zeros(
+            num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            # point-based
+            pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+
+            if gt_labels is None:
+                # Only dense_heads gives gt_labels as None
+                # Foreground is the first class since v2.5.0
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+
+            label_weights[pos_inds] = 1.0
+
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        class_assigned_gt_inds = torch.unique(
+            sampling_result.pos_assigned_gt_inds)
+        for gt_inds in class_assigned_gt_inds:
+            gt_class_inds = sampling_result.pos_assigned_gt_inds == gt_inds
+            pos_alignment_metrics = assign_metrics[gt_class_inds]
+            pos_ious = assign_ious[gt_class_inds]
+            pos_norm_alignment_metrics = pos_alignment_metrics / (
+                pos_alignment_metrics.max() + 10e-8) * pos_ious.max()
+            norm_alignment_metrics[
+                pos_inds[gt_class_inds]] = pos_norm_alignment_metrics
+
+        return (labels, label_weights, bbox_targets, norm_alignment_metrics)
diff --git a/mmde/mmdet/models/losses/dice_loss.py b/mmde/mmdet/models/losses/dice_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d5cac1e9710a6a72fe0401db22b8b72cfe058f9
--- /dev/null
+++ b/mmde/mmdet/models/losses/dice_loss.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+
+from mmdet.registry import MODELS
+from .utils import weight_reduce_loss
+
+
+def dice_loss(pred,
+              target,
+              weight=None,
+              eps=1e-3,
+              reduction='mean',
+              naive_dice=False,
+              avg_factor=None):
+    """Calculate dice loss, there are two forms of dice loss is supported:
+
+        - the one proposed in `V-Net: Fully Convolutional Neural
+            Networks for Volumetric Medical Image Segmentation
+            <https://arxiv.org/abs/1606.04797>`_.
+        - the dice loss in which the power of the number in the
+            denominator is the first power instead of the second
+            power.
+
+    Args:
+        pred (torch.Tensor): The prediction, has a shape (n, *)
+        target (torch.Tensor): The learning label of the prediction,
+            shape (n, *), same shape of pred.
+        weight (torch.Tensor, optional): The weight of loss for each
+            prediction, has a shape (n,). Defaults to None.
+        eps (float): Avoid dividing by zero. Default: 1e-3.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'.
+            Options are "none", "mean" and "sum".
+        naive_dice (bool, optional): If false, use the dice
+                loss defined in the V-Net paper, otherwise, use the
+                naive dice loss in which the power of the number in the
+                denominator is the first power instead of the second
+                power.Defaults to False.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+    """
+
+    input = pred.flatten(1)
+    target = target.flatten(1).float()
+
+    a = torch.sum(input * target, 1)
+    if naive_dice:
+        b = torch.sum(input, 1)
+        c = torch.sum(target, 1)
+        d = (2 * a + eps) / (b + c + eps)
+    else:
+        b = torch.sum(input * input, 1) + eps
+        c = torch.sum(target * target, 1) + eps
+        d = (2 * a) / (b + c)
+
+    loss = 1 - d
+    if weight is not None:
+        assert weight.ndim == loss.ndim
+        assert len(weight) == len(pred)
+    loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+    return loss
+
+
+@MODELS.register_module()
+class DiceLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 activate=True,
+                 reduction='mean',
+                 naive_dice=False,
+                 loss_weight=1.0,
+                 eps=1e-3):
+        """Compute dice loss.
+
+        Args:
+            use_sigmoid (bool, optional): Whether to the prediction is
+                used for sigmoid or softmax. Defaults to True.
+            activate (bool): Whether to activate the predictions inside,
+                this will disable the inside sigmoid operation.
+                Defaults to True.
+            reduction (str, optional): The method used
+                to reduce the loss. Options are "none",
+                "mean" and "sum". Defaults to 'mean'.
+            naive_dice (bool, optional): If false, use the dice
+                loss defined in the V-Net paper, otherwise, use the
+                naive dice loss in which the power of the number in the
+                denominator is the first power instead of the second
+                power. Defaults to False.
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+            eps (float): Avoid dividing by zero. Defaults to 1e-3.
+        """
+
+        super(DiceLoss, self).__init__()
+        self.use_sigmoid = use_sigmoid
+        self.reduction = reduction
+        self.naive_dice = naive_dice
+        self.loss_weight = loss_weight
+        self.eps = eps
+        self.activate = activate
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                reduction_override=None,
+                avg_factor=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction, has a shape (n, *).
+            target (torch.Tensor): The label of the prediction,
+                shape (n, *), same shape of pred.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction, has a shape (n,). Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+            torch.Tensor: The calculated loss
+        """
+
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+
+        if self.activate:
+            if self.use_sigmoid:
+                pred = pred.sigmoid()
+            else:
+                raise NotImplementedError
+
+        loss = self.loss_weight * dice_loss(
+            pred,
+            target,
+            weight,
+            eps=self.eps,
+            reduction=reduction,
+            naive_dice=self.naive_dice,
+            avg_factor=avg_factor)
+
+        return loss
diff --git a/mmde/mmdet/models/losses/eqlv2_loss.py b/mmde/mmdet/models/losses/eqlv2_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea1f4a9a8f7c71119c2bed743d714a34ab4db82c
--- /dev/null
+++ b/mmde/mmdet/models/losses/eqlv2_loss.py
@@ -0,0 +1,173 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+from functools import partial
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.logging import print_log
+from torch import Tensor
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class EQLV2Loss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid: bool = True,
+                 reduction: str = 'mean',
+                 class_weight: Optional[Tensor] = None,
+                 loss_weight: float = 1.0,
+                 num_classes: int = 1203,
+                 use_distributed: bool = False,
+                 mu: float = 0.8,
+                 alpha: float = 4.0,
+                 gamma: int = 12,
+                 vis_grad: bool = False,
+                 test_with_obj: bool = True) -> None:
+        """`Equalization Loss v2 <https://arxiv.org/abs/2012.08548>`_
+
+        Args:
+            use_sigmoid (bool): EQLv2 uses the sigmoid function to transform
+                the predicted logits to an estimated probability distribution.
+            reduction (str, optional): The method used to reduce the loss into
+                a scalar. Defaults to 'mean'.
+            class_weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            loss_weight (float, optional): The weight of the total EQLv2 loss.
+                Defaults to 1.0.
+            num_classes (int): 1203 for lvis v1.0, 1230 for lvis v0.5.
+            use_distributed (bool, float): EQLv2 will calculate the gradients
+                on all GPUs if there is any. Change to True if you are using
+                distributed training. Default to False.
+            mu (float, optional): Defaults to 0.8
+            alpha (float, optional): A balance factor for the negative part of
+                EQLV2 Loss. Defaults to 4.0.
+            gamma (int, optional): The gamma for calculating the modulating
+                factor. Defaults to 12.
+            vis_grad (bool, optional): Default to False.
+            test_with_obj (bool, optional): Default to True.
+
+        Returns:
+            None.
+        """
+        super().__init__()
+        self.use_sigmoid = True
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.class_weight = class_weight
+        self.num_classes = num_classes
+        self.group = True
+
+        # cfg for eqlv2
+        self.vis_grad = vis_grad
+        self.mu = mu
+        self.alpha = alpha
+        self.gamma = gamma
+        self.use_distributed = use_distributed
+
+        # initial variables
+        self.register_buffer('pos_grad', torch.zeros(self.num_classes))
+        self.register_buffer('neg_grad', torch.zeros(self.num_classes))
+        # At the beginning of training, we set a high value (eg. 100)
+        # for the initial gradient ratio so that the weight for pos
+        # gradients and neg gradients are 1.
+        self.register_buffer('pos_neg', torch.ones(self.num_classes) * 100)
+
+        self.test_with_obj = test_with_obj
+
+        def _func(x, gamma, mu):
+            return 1 / (1 + torch.exp(-gamma * (x - mu)))
+
+        self.map_func = partial(_func, gamma=self.gamma, mu=self.mu)
+
+        print_log(
+            f'build EQL v2, gamma: {gamma}, mu: {mu}, alpha: {alpha}',
+            logger='current',
+            level=logging.DEBUG)
+
+    def forward(self,
+                cls_score: Tensor,
+                label: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[Tensor] = None) -> Tensor:
+        """`Equalization Loss v2 <https://arxiv.org/abs/2012.08548>`_
+
+        Args:
+            cls_score (Tensor): The prediction with shape (N, C), C is the
+                number of classes.
+            label (Tensor): The ground truth label of the predicted target with
+                shape (N, C), C is the number of classes.
+            weight (Tensor, optional): The weight of loss for each prediction.
+                Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+           Tensor: The calculated loss
+        """
+        self.n_i, self.n_c = cls_score.size()
+        self.gt_classes = label
+        self.pred_class_logits = cls_score
+
+        def expand_label(pred, gt_classes):
+            target = pred.new_zeros(self.n_i, self.n_c)
+            target[torch.arange(self.n_i), gt_classes] = 1
+            return target
+
+        target = expand_label(cls_score, label)
+
+        pos_w, neg_w = self.get_weight(cls_score)
+
+        weight = pos_w * target + neg_w * (1 - target)
+
+        cls_loss = F.binary_cross_entropy_with_logits(
+            cls_score, target, reduction='none')
+        cls_loss = torch.sum(cls_loss * weight) / self.n_i
+
+        self.collect_grad(cls_score.detach(), target.detach(), weight.detach())
+
+        return self.loss_weight * cls_loss
+
+    def get_channel_num(self, num_classes):
+        num_channel = num_classes + 1
+        return num_channel
+
+    def get_activation(self, pred):
+        pred = torch.sigmoid(pred)
+        n_i, n_c = pred.size()
+        bg_score = pred[:, -1].view(n_i, 1)
+        if self.test_with_obj:
+            pred[:, :-1] *= (1 - bg_score)
+        return pred
+
+    def collect_grad(self, pred, target, weight):
+        prob = torch.sigmoid(pred)
+        grad = target * (prob - 1) + (1 - target) * prob
+        grad = torch.abs(grad)
+
+        # do not collect grad for objectiveness branch [:-1]
+        pos_grad = torch.sum(grad * target * weight, dim=0)[:-1]
+        neg_grad = torch.sum(grad * (1 - target) * weight, dim=0)[:-1]
+
+        if self.use_distributed:
+            dist.all_reduce(pos_grad)
+            dist.all_reduce(neg_grad)
+
+        self.pos_grad += pos_grad
+        self.neg_grad += neg_grad
+        self.pos_neg = self.pos_grad / (self.neg_grad + 1e-10)
+
+    def get_weight(self, pred):
+        neg_w = torch.cat([self.map_func(self.pos_neg), pred.new_ones(1)])
+        pos_w = 1 + self.alpha * (1 - neg_w)
+        neg_w = neg_w.view(1, -1).expand(self.n_i, self.n_c)
+        pos_w = pos_w.view(1, -1).expand(self.n_i, self.n_c)
+        return pos_w, neg_w
diff --git a/mmde/mmdet/models/losses/focal_loss.py b/mmde/mmdet/models/losses/focal_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..15bef293a591a7f4c099febdaa82abaf7fb4928a
--- /dev/null
+++ b/mmde/mmdet/models/losses/focal_loss.py
@@ -0,0 +1,371 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss
+
+from mmdet.registry import MODELS
+from .accuracy import accuracy
+from .utils import weight_reduce_loss
+
+
+# This method is only for debugging
+def py_sigmoid_focal_loss(pred,
+                          target,
+                          weight=None,
+                          gamma=2.0,
+                          alpha=0.25,
+                          reduction='mean',
+                          avg_factor=None):
+    """PyTorch version of `Focal Loss <https://arxiv.org/abs/1708.02002>`_.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C), C is the
+            number of classes
+        target (torch.Tensor): The learning label of the prediction.
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 2.0.
+        alpha (float, optional): A balanced form for Focal Loss.
+            Defaults to 0.25.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+    """
+    pred_sigmoid = pred.sigmoid()
+    target = target.type_as(pred)
+    # Actually, pt here denotes (1 - pt) in the Focal Loss paper
+    pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target)
+    # Thus it's pt.pow(gamma) rather than (1 - pt).pow(gamma)
+    focal_weight = (alpha * target + (1 - alpha) *
+                    (1 - target)) * pt.pow(gamma)
+    loss = F.binary_cross_entropy_with_logits(
+        pred, target, reduction='none') * focal_weight
+    if weight is not None:
+        if weight.shape != loss.shape:
+            if weight.size(0) == loss.size(0):
+                # For most cases, weight is of shape (num_priors, ),
+                #  which means it does not have the second axis num_class
+                weight = weight.view(-1, 1)
+            else:
+                # Sometimes, weight per anchor per class is also needed. e.g.
+                #  in FSAF. But it may be flattened of shape
+                #  (num_priors x num_class, ), while loss is still of shape
+                #  (num_priors, num_class).
+                assert weight.numel() == loss.numel()
+                weight = weight.view(loss.size(0), -1)
+        assert weight.ndim == loss.ndim
+    loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+    return loss
+
+
+def py_focal_loss_with_prob(pred,
+                            target,
+                            weight=None,
+                            gamma=2.0,
+                            alpha=0.25,
+                            reduction='mean',
+                            avg_factor=None):
+    """PyTorch version of `Focal Loss <https://arxiv.org/abs/1708.02002>`_.
+    Different from `py_sigmoid_focal_loss`, this function accepts probability
+    as input.
+
+    Args:
+        pred (torch.Tensor): The prediction probability with shape (N, C),
+            C is the number of classes.
+        target (torch.Tensor): The learning label of the prediction.
+            The target shape support (N,C) or (N,), (N,C) means one-hot form.
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 2.0.
+        alpha (float, optional): A balanced form for Focal Loss.
+            Defaults to 0.25.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+    """
+    if pred.dim() != target.dim():
+        num_classes = pred.size(1)
+        target = F.one_hot(target, num_classes=num_classes + 1)
+        target = target[:, :num_classes]
+
+    target = target.type_as(pred)
+    pt = (1 - pred) * target + pred * (1 - target)
+    focal_weight = (alpha * target + (1 - alpha) *
+                    (1 - target)) * pt.pow(gamma)
+    loss = F.binary_cross_entropy(
+        pred, target, reduction='none') * focal_weight
+    if weight is not None:
+        if weight.shape != loss.shape:
+            if weight.size(0) == loss.size(0):
+                # For most cases, weight is of shape (num_priors, ),
+                #  which means it does not have the second axis num_class
+                weight = weight.view(-1, 1)
+            else:
+                # Sometimes, weight per anchor per class is also needed. e.g.
+                #  in FSAF. But it may be flattened of shape
+                #  (num_priors x num_class, ), while loss is still of shape
+                #  (num_priors, num_class).
+                assert weight.numel() == loss.numel()
+                weight = weight.view(loss.size(0), -1)
+        assert weight.ndim == loss.ndim
+    loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+    return loss
+
+
+def sigmoid_focal_loss(pred,
+                       target,
+                       weight=None,
+                       gamma=2.0,
+                       alpha=0.25,
+                       reduction='mean',
+                       avg_factor=None):
+    r"""A wrapper of cuda version `Focal Loss
+    <https://arxiv.org/abs/1708.02002>`_.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C), C is the number
+            of classes.
+        target (torch.Tensor): The learning label of the prediction.
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 2.0.
+        alpha (float, optional): A balanced form for Focal Loss.
+            Defaults to 0.25.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+    """
+    # Function.apply does not accept keyword arguments, so the decorator
+    # "weighted_loss" is not applicable
+    loss = _sigmoid_focal_loss(pred.contiguous(), target.contiguous(), gamma,
+                               alpha, None, 'none')
+    if weight is not None:
+        if weight.shape != loss.shape:
+            if weight.size(0) == loss.size(0):
+                # For most cases, weight is of shape (num_priors, ),
+                #  which means it does not have the second axis num_class
+                weight = weight.view(-1, 1)
+            else:
+                # Sometimes, weight per anchor per class is also needed. e.g.
+                #  in FSAF. But it may be flattened of shape
+                #  (num_priors x num_class, ), while loss is still of shape
+                #  (num_priors, num_class).
+                assert weight.numel() == loss.numel()
+                weight = weight.view(loss.size(0), -1)
+        assert weight.ndim == loss.ndim
+    loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+    return loss
+
+
+@MODELS.register_module()
+class FocalLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 gamma=2.0,
+                 alpha=0.25,
+                 reduction='mean',
+                 loss_weight=1.0,
+                 activated=False):
+        """`Focal Loss <https://arxiv.org/abs/1708.02002>`_
+
+        Args:
+            use_sigmoid (bool, optional): Whether to the prediction is
+                used for sigmoid or softmax. Defaults to True.
+            gamma (float, optional): The gamma for calculating the modulating
+                factor. Defaults to 2.0.
+            alpha (float, optional): A balanced form for Focal Loss.
+                Defaults to 0.25.
+            reduction (str, optional): The method used to reduce the loss into
+                a scalar. Defaults to 'mean'. Options are "none", "mean" and
+                "sum".
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+            activated (bool, optional): Whether the input is activated.
+                If True, it means the input has been activated and can be
+                treated as probabilities. Else, it should be treated as logits.
+                Defaults to False.
+        """
+        super(FocalLoss, self).__init__()
+        assert use_sigmoid is True, 'Only sigmoid focal loss supported now.'
+        self.use_sigmoid = use_sigmoid
+        self.gamma = gamma
+        self.alpha = alpha
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.activated = activated
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning label of the prediction.
+                The target shape support (N,C) or (N,), (N,C) means
+                one-hot form.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+            torch.Tensor: The calculated loss
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if self.use_sigmoid:
+            if self.activated:
+                calculate_loss_func = py_focal_loss_with_prob
+            else:
+                if pred.dim() == target.dim():
+                    # this means that target is already in One-Hot form.
+                    calculate_loss_func = py_sigmoid_focal_loss
+                elif torch.cuda.is_available() and pred.is_cuda:
+                    calculate_loss_func = sigmoid_focal_loss
+                else:
+                    num_classes = pred.size(1)
+                    target = F.one_hot(target, num_classes=num_classes + 1)
+                    target = target[:, :num_classes]
+                    calculate_loss_func = py_sigmoid_focal_loss
+
+            loss_cls = self.loss_weight * calculate_loss_func(
+                pred,
+                target,
+                weight,
+                gamma=self.gamma,
+                alpha=self.alpha,
+                reduction=reduction,
+                avg_factor=avg_factor)
+
+        else:
+            raise NotImplementedError
+        return loss_cls
+
+
+@MODELS.register_module()
+class FocalCustomLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 num_classes=-1,
+                 gamma=2.0,
+                 alpha=0.25,
+                 reduction='mean',
+                 loss_weight=1.0,
+                 activated=False):
+        """`Focal Loss for V3Det <https://arxiv.org/abs/1708.02002>`_
+
+        Args:
+            use_sigmoid (bool, optional): Whether to the prediction is
+                used for sigmoid or softmax. Defaults to True.
+            num_classes (int): Number of classes to classify.
+            gamma (float, optional): The gamma for calculating the modulating
+                factor. Defaults to 2.0.
+            alpha (float, optional): A balanced form for Focal Loss.
+                Defaults to 0.25.
+            reduction (str, optional): The method used to reduce the loss into
+                a scalar. Defaults to 'mean'. Options are "none", "mean" and
+                "sum".
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+            activated (bool, optional): Whether the input is activated.
+                If True, it means the input has been activated and can be
+                treated as probabilities. Else, it should be treated as logits.
+                Defaults to False.
+        """
+        super(FocalCustomLoss, self).__init__()
+        assert use_sigmoid is True, 'Only sigmoid focal loss supported now.'
+        self.use_sigmoid = use_sigmoid
+        self.num_classes = num_classes
+        self.gamma = gamma
+        self.alpha = alpha
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.activated = activated
+
+        assert self.num_classes != -1
+
+        # custom output channels of the classifier
+        self.custom_cls_channels = True
+        # custom activation of cls_score
+        self.custom_activation = True
+        # custom accuracy of the classsifier
+        self.custom_accuracy = True
+
+    def get_cls_channels(self, num_classes):
+        assert num_classes == self.num_classes
+        return num_classes
+
+    def get_activation(self, cls_score):
+
+        fine_cls_score = cls_score[:, :self.num_classes]
+
+        score_classes = fine_cls_score.sigmoid()
+
+        return score_classes
+
+    def get_accuracy(self, cls_score, labels):
+
+        fine_cls_score = cls_score[:, :self.num_classes]
+
+        pos_inds = labels < self.num_classes
+        acc_classes = accuracy(fine_cls_score[pos_inds], labels[pos_inds])
+        acc = dict()
+        acc['acc_classes'] = acc_classes
+        return acc
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning label of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+            torch.Tensor: The calculated loss
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if self.use_sigmoid:
+
+            num_classes = pred.size(1)
+            target = F.one_hot(target, num_classes=num_classes + 1)
+            target = target[:, :num_classes]
+            calculate_loss_func = py_sigmoid_focal_loss
+
+            loss_cls = self.loss_weight * calculate_loss_func(
+                pred,
+                target,
+                weight,
+                gamma=self.gamma,
+                alpha=self.alpha,
+                reduction=reduction,
+                avg_factor=avg_factor)
+
+        else:
+            raise NotImplementedError
+        return loss_cls
diff --git a/mmde/mmdet/models/losses/gaussian_focal_loss.py b/mmde/mmdet/models/losses/gaussian_focal_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..14fa8da462a5e7cabde2166878a1b9f2ccc16d62
--- /dev/null
+++ b/mmde/mmdet/models/losses/gaussian_focal_loss.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Union
+
+import torch.nn as nn
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .utils import weight_reduce_loss, weighted_loss
+
+
+@weighted_loss
+def gaussian_focal_loss(pred: Tensor,
+                        gaussian_target: Tensor,
+                        alpha: float = 2.0,
+                        gamma: float = 4.0,
+                        pos_weight: float = 1.0,
+                        neg_weight: float = 1.0) -> Tensor:
+    """`Focal Loss <https://arxiv.org/abs/1708.02002>`_ for targets in gaussian
+    distribution.
+
+    Args:
+        pred (torch.Tensor): The prediction.
+        gaussian_target (torch.Tensor): The learning target of the prediction
+            in gaussian distribution.
+        alpha (float, optional): A balanced form for Focal Loss.
+            Defaults to 2.0.
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 4.0.
+        pos_weight(float): Positive sample loss weight. Defaults to 1.0.
+        neg_weight(float): Negative sample loss weight. Defaults to 1.0.
+    """
+    eps = 1e-12
+    pos_weights = gaussian_target.eq(1)
+    neg_weights = (1 - gaussian_target).pow(gamma)
+    pos_loss = -(pred + eps).log() * (1 - pred).pow(alpha) * pos_weights
+    neg_loss = -(1 - pred + eps).log() * pred.pow(alpha) * neg_weights
+    return pos_weight * pos_loss + neg_weight * neg_loss
+
+
+def gaussian_focal_loss_with_pos_inds(
+        pred: Tensor,
+        gaussian_target: Tensor,
+        pos_inds: Tensor,
+        pos_labels: Tensor,
+        alpha: float = 2.0,
+        gamma: float = 4.0,
+        pos_weight: float = 1.0,
+        neg_weight: float = 1.0,
+        reduction: str = 'mean',
+        avg_factor: Optional[Union[int, float]] = None) -> Tensor:
+    """`Focal Loss <https://arxiv.org/abs/1708.02002>`_ for targets in gaussian
+    distribution.
+
+    Note: The index with a value of 1 in ``gaussian_target`` in the
+    ``gaussian_focal_loss`` function is a positive sample, but in
+    ``gaussian_focal_loss_with_pos_inds`` the positive sample is passed
+    in through the ``pos_inds`` parameter.
+
+    Args:
+        pred (torch.Tensor): The prediction. The shape is (N, num_classes).
+        gaussian_target (torch.Tensor): The learning target of the prediction
+            in gaussian distribution. The shape is (N, num_classes).
+        pos_inds (torch.Tensor): The positive sample index.
+            The shape is (M, ).
+        pos_labels (torch.Tensor): The label corresponding to the positive
+            sample index. The shape is (M, ).
+        alpha (float, optional): A balanced form for Focal Loss.
+            Defaults to 2.0.
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 4.0.
+        pos_weight(float): Positive sample loss weight. Defaults to 1.0.
+        neg_weight(float): Negative sample loss weight. Defaults to 1.0.
+        reduction (str): Options are "none", "mean" and "sum".
+            Defaults to 'mean`.
+        avg_factor (int, float, optional): Average factor that is used to
+            average the loss. Defaults to None.
+    """
+    eps = 1e-12
+    neg_weights = (1 - gaussian_target).pow(gamma)
+
+    pos_pred_pix = pred[pos_inds]
+    pos_pred = pos_pred_pix.gather(1, pos_labels.unsqueeze(1))
+    pos_loss = -(pos_pred + eps).log() * (1 - pos_pred).pow(alpha)
+    pos_loss = weight_reduce_loss(pos_loss, None, reduction, avg_factor)
+
+    neg_loss = -(1 - pred + eps).log() * pred.pow(alpha) * neg_weights
+    neg_loss = weight_reduce_loss(neg_loss, None, reduction, avg_factor)
+
+    return pos_weight * pos_loss + neg_weight * neg_loss
+
+
+@MODELS.register_module()
+class GaussianFocalLoss(nn.Module):
+    """GaussianFocalLoss is a variant of focal loss.
+
+    More details can be found in the `paper
+    <https://arxiv.org/abs/1808.01244>`_
+    Code is modified from `kp_utils.py
+    <https://github.com/princeton-vl/CornerNet/blob/master/models/py_utils/kp_utils.py#L152>`_  # noqa: E501
+    Please notice that the target in GaussianFocalLoss is a gaussian heatmap,
+    not 0/1 binary target.
+
+    Args:
+        alpha (float): Power of prediction.
+        gamma (float): Power of target for negative samples.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Loss weight of current loss.
+        pos_weight(float): Positive sample loss weight. Defaults to 1.0.
+        neg_weight(float): Negative sample loss weight. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 alpha: float = 2.0,
+                 gamma: float = 4.0,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0,
+                 pos_weight: float = 1.0,
+                 neg_weight: float = 1.0) -> None:
+        super().__init__()
+        self.alpha = alpha
+        self.gamma = gamma
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.pos_weight = pos_weight
+        self.neg_weight = neg_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                pos_inds: Optional[Tensor] = None,
+                pos_labels: Optional[Tensor] = None,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[Union[int, float]] = None,
+                reduction_override: Optional[str] = None) -> Tensor:
+        """Forward function.
+
+        If you want to manually determine which positions are
+        positive samples, you can set the pos_index and pos_label
+        parameter. Currently, only the CenterNet update version uses
+        the parameter.
+
+        Args:
+            pred (torch.Tensor): The prediction. The shape is (N, num_classes).
+            target (torch.Tensor): The learning target of the prediction
+                in gaussian distribution. The shape is (N, num_classes).
+            pos_inds (torch.Tensor): The positive sample index.
+                Defaults to None.
+            pos_labels (torch.Tensor): The label corresponding to the positive
+                sample index. Defaults to None.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, float, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if pos_inds is not None:
+            assert pos_labels is not None
+            # Only used by centernet update version
+            loss_reg = self.loss_weight * gaussian_focal_loss_with_pos_inds(
+                pred,
+                target,
+                pos_inds,
+                pos_labels,
+                alpha=self.alpha,
+                gamma=self.gamma,
+                pos_weight=self.pos_weight,
+                neg_weight=self.neg_weight,
+                reduction=reduction,
+                avg_factor=avg_factor)
+        else:
+            loss_reg = self.loss_weight * gaussian_focal_loss(
+                pred,
+                target,
+                weight,
+                alpha=self.alpha,
+                gamma=self.gamma,
+                pos_weight=self.pos_weight,
+                neg_weight=self.neg_weight,
+                reduction=reduction,
+                avg_factor=avg_factor)
+        return loss_reg
diff --git a/mmde/mmdet/models/losses/gfocal_loss.py b/mmde/mmdet/models/losses/gfocal_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3a1172207e859039ca5ed7e0604d8b787131c29
--- /dev/null
+++ b/mmde/mmdet/models/losses/gfocal_loss.py
@@ -0,0 +1,295 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet.models.losses.utils import weighted_loss
+from mmdet.registry import MODELS
+
+
+@weighted_loss
+def quality_focal_loss(pred, target, beta=2.0):
+    r"""Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning
+    Qualified and Distributed Bounding Boxes for Dense Object Detection
+    <https://arxiv.org/abs/2006.04388>`_.
+
+    Args:
+        pred (torch.Tensor): Predicted joint representation of classification
+            and quality (IoU) estimation with shape (N, C), C is the number of
+            classes.
+        target (tuple([torch.Tensor])): Target category label with shape (N,)
+            and target quality label with shape (N,).
+        beta (float): The beta parameter for calculating the modulating factor.
+            Defaults to 2.0.
+
+    Returns:
+        torch.Tensor: Loss tensor with shape (N,).
+    """
+    assert len(target) == 2, """target for QFL must be a tuple of two elements,
+        including category label and quality label, respectively"""
+    # label denotes the category id, score denotes the quality score
+    label, score = target
+
+    # negatives are supervised by 0 quality score
+    pred_sigmoid = pred.sigmoid()
+    scale_factor = pred_sigmoid
+    zerolabel = scale_factor.new_zeros(pred.shape)
+    loss = F.binary_cross_entropy_with_logits(
+        pred, zerolabel, reduction='none') * scale_factor.pow(beta)
+
+    # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+    bg_class_ind = pred.size(1)
+    pos = ((label >= 0) & (label < bg_class_ind)).nonzero().squeeze(1)
+    pos_label = label[pos].long()
+    # positives are supervised by bbox quality (IoU) score
+    scale_factor = score[pos] - pred_sigmoid[pos, pos_label]
+    loss[pos, pos_label] = F.binary_cross_entropy_with_logits(
+        pred[pos, pos_label], score[pos],
+        reduction='none') * scale_factor.abs().pow(beta)
+
+    loss = loss.sum(dim=1, keepdim=False)
+    return loss
+
+
+@weighted_loss
+def quality_focal_loss_tensor_target(pred, target, beta=2.0, activated=False):
+    """`QualityFocal Loss <https://arxiv.org/abs/2008.13367>`_
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, C), C is the
+            number of classes
+        target (torch.Tensor): The learning target of the iou-aware
+            classification score with shape (N, C), C is the number of classes.
+        beta (float): The beta parameter for calculating the modulating factor.
+            Defaults to 2.0.
+        activated (bool): Whether the input is activated.
+            If True, it means the input has been activated and can be
+            treated as probabilities. Else, it should be treated as logits.
+            Defaults to False.
+    """
+    # pred and target should be of the same size
+    assert pred.size() == target.size()
+    if activated:
+        pred_sigmoid = pred
+        loss_function = F.binary_cross_entropy
+    else:
+        pred_sigmoid = pred.sigmoid()
+        loss_function = F.binary_cross_entropy_with_logits
+
+    scale_factor = pred_sigmoid
+    target = target.type_as(pred)
+
+    zerolabel = scale_factor.new_zeros(pred.shape)
+    loss = loss_function(
+        pred, zerolabel, reduction='none') * scale_factor.pow(beta)
+
+    pos = (target != 0)
+    scale_factor = target[pos] - pred_sigmoid[pos]
+    loss[pos] = loss_function(
+        pred[pos], target[pos],
+        reduction='none') * scale_factor.abs().pow(beta)
+
+    loss = loss.sum(dim=1, keepdim=False)
+    return loss
+
+
+@weighted_loss
+def quality_focal_loss_with_prob(pred, target, beta=2.0):
+    r"""Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning
+    Qualified and Distributed Bounding Boxes for Dense Object Detection
+    <https://arxiv.org/abs/2006.04388>`_.
+    Different from `quality_focal_loss`, this function accepts probability
+    as input.
+
+    Args:
+        pred (torch.Tensor): Predicted joint representation of classification
+            and quality (IoU) estimation with shape (N, C), C is the number of
+            classes.
+        target (tuple([torch.Tensor])): Target category label with shape (N,)
+            and target quality label with shape (N,).
+        beta (float): The beta parameter for calculating the modulating factor.
+            Defaults to 2.0.
+
+    Returns:
+        torch.Tensor: Loss tensor with shape (N,).
+    """
+    assert len(target) == 2, """target for QFL must be a tuple of two elements,
+        including category label and quality label, respectively"""
+    # label denotes the category id, score denotes the quality score
+    label, score = target
+
+    # negatives are supervised by 0 quality score
+    pred_sigmoid = pred
+    scale_factor = pred_sigmoid
+    zerolabel = scale_factor.new_zeros(pred.shape)
+    loss = F.binary_cross_entropy(
+        pred, zerolabel, reduction='none') * scale_factor.pow(beta)
+
+    # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+    bg_class_ind = pred.size(1)
+    pos = ((label >= 0) & (label < bg_class_ind)).nonzero().squeeze(1)
+    pos_label = label[pos].long()
+    # positives are supervised by bbox quality (IoU) score
+    scale_factor = score[pos] - pred_sigmoid[pos, pos_label]
+    loss[pos, pos_label] = F.binary_cross_entropy(
+        pred[pos, pos_label], score[pos],
+        reduction='none') * scale_factor.abs().pow(beta)
+
+    loss = loss.sum(dim=1, keepdim=False)
+    return loss
+
+
+@weighted_loss
+def distribution_focal_loss(pred, label):
+    r"""Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning
+    Qualified and Distributed Bounding Boxes for Dense Object Detection
+    <https://arxiv.org/abs/2006.04388>`_.
+
+    Args:
+        pred (torch.Tensor): Predicted general distribution of bounding boxes
+            (before softmax) with shape (N, n+1), n is the max value of the
+            integral set `{0, ..., n}` in paper.
+        label (torch.Tensor): Target distance label for bounding boxes with
+            shape (N,).
+
+    Returns:
+        torch.Tensor: Loss tensor with shape (N,).
+    """
+    dis_left = label.long()
+    dis_right = dis_left + 1
+    weight_left = dis_right.float() - label
+    weight_right = label - dis_left.float()
+    loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \
+        + F.cross_entropy(pred, dis_right, reduction='none') * weight_right
+    return loss
+
+
+@MODELS.register_module()
+class QualityFocalLoss(nn.Module):
+    r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss:
+    Learning Qualified and Distributed Bounding Boxes for Dense Object
+    Detection <https://arxiv.org/abs/2006.04388>`_.
+
+    Args:
+        use_sigmoid (bool): Whether sigmoid operation is conducted in QFL.
+            Defaults to True.
+        beta (float): The beta parameter for calculating the modulating factor.
+            Defaults to 2.0.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Loss weight of current loss.
+        activated (bool, optional): Whether the input is activated.
+            If True, it means the input has been activated and can be
+            treated as probabilities. Else, it should be treated as logits.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 beta=2.0,
+                 reduction='mean',
+                 loss_weight=1.0,
+                 activated=False):
+        super(QualityFocalLoss, self).__init__()
+        assert use_sigmoid is True, 'Only sigmoid in QFL supported now.'
+        self.use_sigmoid = use_sigmoid
+        self.beta = beta
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.activated = activated
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): Predicted joint representation of
+                classification and quality (IoU) estimation with shape (N, C),
+                C is the number of classes.
+            target (Union(tuple([torch.Tensor]),Torch.Tensor)): The type is
+                tuple, it should be included Target category label with
+                shape (N,) and target quality label with shape (N,).The type
+                is torch.Tensor, the target should be one-hot form with
+                soft weights.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if self.use_sigmoid:
+            if self.activated:
+                calculate_loss_func = quality_focal_loss_with_prob
+            else:
+                calculate_loss_func = quality_focal_loss
+            if isinstance(target, torch.Tensor):
+                # the target shape with (N,C) or (N,C,...), which means
+                # the target is one-hot form with soft weights.
+                calculate_loss_func = partial(
+                    quality_focal_loss_tensor_target, activated=self.activated)
+
+            loss_cls = self.loss_weight * calculate_loss_func(
+                pred,
+                target,
+                weight,
+                beta=self.beta,
+                reduction=reduction,
+                avg_factor=avg_factor)
+        else:
+            raise NotImplementedError
+        return loss_cls
+
+
+@MODELS.register_module()
+class DistributionFocalLoss(nn.Module):
+    r"""Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss:
+    Learning Qualified and Distributed Bounding Boxes for Dense Object
+    Detection <https://arxiv.org/abs/2006.04388>`_.
+
+    Args:
+        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.
+        loss_weight (float): Loss weight of current loss.
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(DistributionFocalLoss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): Predicted general distribution of bounding
+                boxes (before softmax) with shape (N, n+1), n is the max value
+                of the integral set `{0, ..., n}` in paper.
+            target (torch.Tensor): Target distance label for bounding boxes
+                with shape (N,).
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_cls = self.loss_weight * distribution_focal_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_cls
diff --git a/mmde/mmdet/models/losses/ghm_loss.py b/mmde/mmdet/models/losses/ghm_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..a874c0038cc4a77769705a3a06a95a56d3e8dd2d
--- /dev/null
+++ b/mmde/mmdet/models/losses/ghm_loss.py
@@ -0,0 +1,213 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet.registry import MODELS
+from .utils import weight_reduce_loss
+
+
+def _expand_onehot_labels(labels, label_weights, label_channels):
+    bin_labels = labels.new_full((labels.size(0), label_channels), 0)
+    inds = torch.nonzero(
+        (labels >= 0) & (labels < label_channels), as_tuple=False).squeeze()
+    if inds.numel() > 0:
+        bin_labels[inds, labels[inds]] = 1
+    bin_label_weights = label_weights.view(-1, 1).expand(
+        label_weights.size(0), label_channels)
+    return bin_labels, bin_label_weights
+
+
+# TODO: code refactoring to make it consistent with other losses
+@MODELS.register_module()
+class GHMC(nn.Module):
+    """GHM Classification Loss.
+
+    Details of the theorem can be viewed in the paper
+    `Gradient Harmonized Single-stage Detector
+    <https://arxiv.org/abs/1811.05181>`_.
+
+    Args:
+        bins (int): Number of the unit regions for distribution calculation.
+        momentum (float): The parameter for moving average.
+        use_sigmoid (bool): Can only be true for BCE based loss now.
+        loss_weight (float): The weight of the total GHM-C loss.
+        reduction (str): Options are "none", "mean" and "sum".
+            Defaults to "mean"
+    """
+
+    def __init__(self,
+                 bins=10,
+                 momentum=0,
+                 use_sigmoid=True,
+                 loss_weight=1.0,
+                 reduction='mean'):
+        super(GHMC, self).__init__()
+        self.bins = bins
+        self.momentum = momentum
+        edges = torch.arange(bins + 1).float() / bins
+        self.register_buffer('edges', edges)
+        self.edges[-1] += 1e-6
+        if momentum > 0:
+            acc_sum = torch.zeros(bins)
+            self.register_buffer('acc_sum', acc_sum)
+        self.use_sigmoid = use_sigmoid
+        if not self.use_sigmoid:
+            raise NotImplementedError
+        self.loss_weight = loss_weight
+        self.reduction = reduction
+
+    def forward(self,
+                pred,
+                target,
+                label_weight,
+                reduction_override=None,
+                **kwargs):
+        """Calculate the GHM-C loss.
+
+        Args:
+            pred (float tensor of size [batch_num, class_num]):
+                The direct prediction of classification fc layer.
+            target (float tensor of size [batch_num, class_num]):
+                Binary class target for each sample.
+            label_weight (float tensor of size [batch_num, class_num]):
+                the value is 1 if the sample is valid and 0 if ignored.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        Returns:
+            The gradient harmonized loss.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        # the target should be binary class label
+        if pred.dim() != target.dim():
+            target, label_weight = _expand_onehot_labels(
+                target, label_weight, pred.size(-1))
+        target, label_weight = target.float(), label_weight.float()
+        edges = self.edges
+        mmt = self.momentum
+        weights = torch.zeros_like(pred)
+
+        # gradient length
+        g = torch.abs(pred.sigmoid().detach() - target)
+
+        valid = label_weight > 0
+        tot = max(valid.float().sum().item(), 1.0)
+        n = 0  # n valid bins
+        for i in range(self.bins):
+            inds = (g >= edges[i]) & (g < edges[i + 1]) & valid
+            num_in_bin = inds.sum().item()
+            if num_in_bin > 0:
+                if mmt > 0:
+                    self.acc_sum[i] = mmt * self.acc_sum[i] \
+                        + (1 - mmt) * num_in_bin
+                    weights[inds] = tot / self.acc_sum[i]
+                else:
+                    weights[inds] = tot / num_in_bin
+                n += 1
+        if n > 0:
+            weights = weights / n
+
+        loss = F.binary_cross_entropy_with_logits(
+            pred, target, reduction='none')
+        loss = weight_reduce_loss(
+            loss, weights, reduction=reduction, avg_factor=tot)
+        return loss * self.loss_weight
+
+
+# TODO: code refactoring to make it consistent with other losses
+@MODELS.register_module()
+class GHMR(nn.Module):
+    """GHM Regression Loss.
+
+    Details of the theorem can be viewed in the paper
+    `Gradient Harmonized Single-stage Detector
+    <https://arxiv.org/abs/1811.05181>`_.
+
+    Args:
+        mu (float): The parameter for the Authentic Smooth L1 loss.
+        bins (int): Number of the unit regions for distribution calculation.
+        momentum (float): The parameter for moving average.
+        loss_weight (float): The weight of the total GHM-R loss.
+        reduction (str): Options are "none", "mean" and "sum".
+            Defaults to "mean"
+    """
+
+    def __init__(self,
+                 mu=0.02,
+                 bins=10,
+                 momentum=0,
+                 loss_weight=1.0,
+                 reduction='mean'):
+        super(GHMR, self).__init__()
+        self.mu = mu
+        self.bins = bins
+        edges = torch.arange(bins + 1).float() / bins
+        self.register_buffer('edges', edges)
+        self.edges[-1] = 1e3
+        self.momentum = momentum
+        if momentum > 0:
+            acc_sum = torch.zeros(bins)
+            self.register_buffer('acc_sum', acc_sum)
+        self.loss_weight = loss_weight
+        self.reduction = reduction
+
+    # TODO: support reduction parameter
+    def forward(self,
+                pred,
+                target,
+                label_weight,
+                avg_factor=None,
+                reduction_override=None):
+        """Calculate the GHM-R loss.
+
+        Args:
+            pred (float tensor of size [batch_num, 4 (* class_num)]):
+                The prediction of box regression layer. Channel number can be 4
+                or 4 * class_num depending on whether it is class-agnostic.
+            target (float tensor of size [batch_num, 4 (* class_num)]):
+                The target regression values with the same size of pred.
+            label_weight (float tensor of size [batch_num, 4 (* class_num)]):
+                The weight of each sample, 0 if ignored.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        Returns:
+            The gradient harmonized loss.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        mu = self.mu
+        edges = self.edges
+        mmt = self.momentum
+
+        # ASL1 loss
+        diff = pred - target
+        loss = torch.sqrt(diff * diff + mu * mu) - mu
+
+        # gradient length
+        g = torch.abs(diff / torch.sqrt(mu * mu + diff * diff)).detach()
+        weights = torch.zeros_like(g)
+
+        valid = label_weight > 0
+        tot = max(label_weight.float().sum().item(), 1.0)
+        n = 0  # n: valid bins
+        for i in range(self.bins):
+            inds = (g >= edges[i]) & (g < edges[i + 1]) & valid
+            num_in_bin = inds.sum().item()
+            if num_in_bin > 0:
+                n += 1
+                if mmt > 0:
+                    self.acc_sum[i] = mmt * self.acc_sum[i] \
+                        + (1 - mmt) * num_in_bin
+                    weights[inds] = tot / self.acc_sum[i]
+                else:
+                    weights[inds] = tot / num_in_bin
+        if n > 0:
+            weights /= n
+        loss = weight_reduce_loss(
+            loss, weights, reduction=reduction, avg_factor=tot)
+        return loss * self.loss_weight
diff --git a/mmde/mmdet/models/losses/iou_loss.py b/mmde/mmdet/models/losses/iou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8a2b977868cef6f4039b49277bfc853ffc720bd
--- /dev/null
+++ b/mmde/mmdet/models/losses/iou_loss.py
@@ -0,0 +1,926 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import warnings
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox_overlaps
+from .utils import weighted_loss
+
+
+@weighted_loss
+def iou_loss(pred: Tensor,
+             target: Tensor,
+             linear: bool = False,
+             mode: str = 'log',
+             eps: float = 1e-6) -> Tensor:
+    """IoU loss.
+
+    Computing the IoU loss between a set of predicted bboxes and target bboxes.
+    The loss is calculated as negative log of IoU.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        linear (bool, optional): If True, use linear scale of loss instead of
+            log scale. Default: False.
+        mode (str): Loss scaling mode, including "linear", "square", and "log".
+            Default: 'log'
+        eps (float): Epsilon to avoid log(0).
+
+    Return:
+        Tensor: Loss tensor.
+    """
+    assert mode in ['linear', 'square', 'log']
+    if linear:
+        mode = 'linear'
+        warnings.warn('DeprecationWarning: Setting "linear=True" in '
+                      'iou_loss is deprecated, please use "mode=`linear`" '
+                      'instead.')
+    # avoid fp16 overflow
+    if pred.dtype == torch.float16:
+        fp16 = True
+        pred = pred.to(torch.float32)
+    else:
+        fp16 = False
+
+    ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps)
+
+    if fp16:
+        ious = ious.to(torch.float16)
+
+    if mode == 'linear':
+        loss = 1 - ious
+    elif mode == 'square':
+        loss = 1 - ious**2
+    elif mode == 'log':
+        loss = -ious.log()
+    else:
+        raise NotImplementedError
+    return loss
+
+
+@weighted_loss
+def bounded_iou_loss(pred: Tensor,
+                     target: Tensor,
+                     beta: float = 0.2,
+                     eps: float = 1e-3) -> Tensor:
+    """BIoULoss.
+
+    This is an implementation of paper
+    `Improving Object Localization with Fitness NMS and Bounded IoU Loss.
+    <https://arxiv.org/abs/1711.00164>`_.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        beta (float, optional): Beta parameter in smoothl1.
+        eps (float, optional): Epsilon to avoid NaN values.
+
+    Return:
+        Tensor: Loss tensor.
+    """
+    pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5
+    pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5
+    pred_w = pred[:, 2] - pred[:, 0]
+    pred_h = pred[:, 3] - pred[:, 1]
+    with torch.no_grad():
+        target_ctrx = (target[:, 0] + target[:, 2]) * 0.5
+        target_ctry = (target[:, 1] + target[:, 3]) * 0.5
+        target_w = target[:, 2] - target[:, 0]
+        target_h = target[:, 3] - target[:, 1]
+
+    dx = target_ctrx - pred_ctrx
+    dy = target_ctry - pred_ctry
+
+    loss_dx = 1 - torch.max(
+        (target_w - 2 * dx.abs()) /
+        (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx))
+    loss_dy = 1 - torch.max(
+        (target_h - 2 * dy.abs()) /
+        (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy))
+    loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w /
+                            (target_w + eps))
+    loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h /
+                            (target_h + eps))
+    # view(..., -1) does not work for empty tensor
+    loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh],
+                            dim=-1).flatten(1)
+
+    loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta,
+                       loss_comb - 0.5 * beta)
+    return loss
+
+
+@weighted_loss
+def giou_loss(pred: Tensor, target: Tensor, eps: float = 1e-7) -> Tensor:
+    r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding
+    Box Regression <https://arxiv.org/abs/1902.09630>`_.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        eps (float): Epsilon to avoid log(0).
+
+    Return:
+        Tensor: Loss tensor.
+    """
+    # avoid fp16 overflow
+    if pred.dtype == torch.float16:
+        fp16 = True
+        pred = pred.to(torch.float32)
+    else:
+        fp16 = False
+
+    gious = bbox_overlaps(pred, target, mode='giou', is_aligned=True, eps=eps)
+
+    if fp16:
+        gious = gious.to(torch.float16)
+
+    loss = 1 - gious
+    return loss
+
+
+@weighted_loss
+def diou_loss(pred: Tensor, target: Tensor, eps: float = 1e-7) -> Tensor:
+    r"""Implementation of `Distance-IoU Loss: Faster and Better
+    Learning for Bounding Box Regression https://arxiv.org/abs/1911.08287`_.
+
+    Code is modified from https://github.com/Zzh-tju/DIoU.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        eps (float): Epsilon to avoid log(0).
+
+    Return:
+        Tensor: Loss tensor.
+    """
+    # overlap
+    lt = torch.max(pred[:, :2], target[:, :2])
+    rb = torch.min(pred[:, 2:], target[:, 2:])
+    wh = (rb - lt).clamp(min=0)
+    overlap = wh[:, 0] * wh[:, 1]
+
+    # union
+    ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
+    ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
+    union = ap + ag - overlap + eps
+
+    # IoU
+    ious = overlap / union
+
+    # enclose area
+    enclose_x1y1 = torch.min(pred[:, :2], target[:, :2])
+    enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:])
+    enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0)
+
+    cw = enclose_wh[:, 0]
+    ch = enclose_wh[:, 1]
+
+    c2 = cw**2 + ch**2 + eps
+
+    b1_x1, b1_y1 = pred[:, 0], pred[:, 1]
+    b1_x2, b1_y2 = pred[:, 2], pred[:, 3]
+    b2_x1, b2_y1 = target[:, 0], target[:, 1]
+    b2_x2, b2_y2 = target[:, 2], target[:, 3]
+
+    left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
+    right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
+    rho2 = left + right
+
+    # DIoU
+    dious = ious - rho2 / c2
+    loss = 1 - dious
+    return loss
+
+
+@weighted_loss
+def ciou_loss(pred: Tensor, target: Tensor, eps: float = 1e-7) -> Tensor:
+    r"""`Implementation of paper `Enhancing Geometric Factors into
+    Model Learning and Inference for Object Detection and Instance
+    Segmentation <https://arxiv.org/abs/2005.03572>`_.
+
+    Code is modified from https://github.com/Zzh-tju/CIoU.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        eps (float): Epsilon to avoid log(0).
+
+    Return:
+        Tensor: Loss tensor.
+    """
+    # overlap
+    lt = torch.max(pred[:, :2], target[:, :2])
+    rb = torch.min(pred[:, 2:], target[:, 2:])
+    wh = (rb - lt).clamp(min=0)
+    overlap = wh[:, 0] * wh[:, 1]
+
+    # union
+    ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
+    ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
+    union = ap + ag - overlap + eps
+
+    # IoU
+    ious = overlap / union
+
+    # enclose area
+    enclose_x1y1 = torch.min(pred[:, :2], target[:, :2])
+    enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:])
+    enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0)
+
+    cw = enclose_wh[:, 0]
+    ch = enclose_wh[:, 1]
+
+    c2 = cw**2 + ch**2 + eps
+
+    b1_x1, b1_y1 = pred[:, 0], pred[:, 1]
+    b1_x2, b1_y2 = pred[:, 2], pred[:, 3]
+    b2_x1, b2_y1 = target[:, 0], target[:, 1]
+    b2_x2, b2_y2 = target[:, 2], target[:, 3]
+
+    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
+    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
+
+    left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
+    right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
+    rho2 = left + right
+
+    factor = 4 / math.pi**2
+    v = factor * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
+
+    with torch.no_grad():
+        alpha = (ious > 0.5).float() * v / (1 - ious + v)
+
+    # CIoU
+    cious = ious - (rho2 / c2 + alpha * v)
+    loss = 1 - cious.clamp(min=-1.0, max=1.0)
+    return loss
+
+
+@weighted_loss
+def eiou_loss(pred: Tensor,
+              target: Tensor,
+              smooth_point: float = 0.1,
+              eps: float = 1e-7) -> Tensor:
+    r"""Implementation of paper `Extended-IoU Loss: A Systematic
+    IoU-Related Method: Beyond Simplified Regression for Better
+    Localization <https://ieeexplore.ieee.org/abstract/document/9429909>`_
+
+    Code is modified from https://github.com//ShiqiYu/libfacedetection.train.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        smooth_point (float): hyperparameter, default is 0.1.
+        eps (float): Epsilon to avoid log(0).
+
+    Return:
+        Tensor: Loss tensor.
+    """
+    px1, py1, px2, py2 = pred[:, 0], pred[:, 1], pred[:, 2], pred[:, 3]
+    tx1, ty1, tx2, ty2 = target[:, 0], target[:, 1], target[:, 2], target[:, 3]
+
+    # extent top left
+    ex1 = torch.min(px1, tx1)
+    ey1 = torch.min(py1, ty1)
+
+    # intersection coordinates
+    ix1 = torch.max(px1, tx1)
+    iy1 = torch.max(py1, ty1)
+    ix2 = torch.min(px2, tx2)
+    iy2 = torch.min(py2, ty2)
+
+    # extra
+    xmin = torch.min(ix1, ix2)
+    ymin = torch.min(iy1, iy2)
+    xmax = torch.max(ix1, ix2)
+    ymax = torch.max(iy1, iy2)
+
+    # Intersection
+    intersection = (ix2 - ex1) * (iy2 - ey1) + (xmin - ex1) * (ymin - ey1) - (
+        ix1 - ex1) * (ymax - ey1) - (xmax - ex1) * (
+            iy1 - ey1)
+    # Union
+    union = (px2 - px1) * (py2 - py1) + (tx2 - tx1) * (
+        ty2 - ty1) - intersection + eps
+    # IoU
+    ious = 1 - (intersection / union)
+
+    # Smooth-EIoU
+    smooth_sign = (ious < smooth_point).detach().float()
+    loss = 0.5 * smooth_sign * (ious**2) / smooth_point + (1 - smooth_sign) * (
+        ious - 0.5 * smooth_point)
+    return loss
+
+
+@weighted_loss
+def siou_loss(pred, target, eps=1e-7, neg_gamma=False):
+    r"""`Implementation of paper `SIoU Loss: More Powerful Learning
+    for Bounding Box Regression <https://arxiv.org/abs/2205.12740>`_.
+
+    Code is modified from https://github.com/meituan/YOLOv6.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        eps (float): Eps to avoid log(0).
+        neg_gamma (bool): `True` follows original implementation in paper.
+
+    Return:
+        Tensor: Loss tensor.
+    """
+    # overlap
+    lt = torch.max(pred[:, :2], target[:, :2])
+    rb = torch.min(pred[:, 2:], target[:, 2:])
+    wh = (rb - lt).clamp(min=0)
+    overlap = wh[:, 0] * wh[:, 1]
+
+    # union
+    ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
+    ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
+    union = ap + ag - overlap + eps
+
+    # IoU
+    ious = overlap / union
+
+    # enclose area
+    enclose_x1y1 = torch.min(pred[:, :2], target[:, :2])
+    enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:])
+    # modified clamp threshold zero to eps to avoid NaN
+    enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=eps)
+
+    cw = enclose_wh[:, 0]
+    ch = enclose_wh[:, 1]
+
+    b1_x1, b1_y1 = pred[:, 0], pred[:, 1]
+    b1_x2, b1_y2 = pred[:, 2], pred[:, 3]
+    b2_x1, b2_y1 = target[:, 0], target[:, 1]
+    b2_x2, b2_y2 = target[:, 2], target[:, 3]
+
+    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
+    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
+
+    # angle cost
+    s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + eps
+    s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + eps
+
+    sigma = torch.pow(s_cw**2 + s_ch**2, 0.5)
+
+    sin_alpha_1 = torch.abs(s_cw) / sigma
+    sin_alpha_2 = torch.abs(s_ch) / sigma
+    threshold = pow(2, 0.5) / 2
+    sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1)
+    angle_cost = torch.cos(torch.asin(sin_alpha) * 2 - math.pi / 2)
+
+    # distance cost
+    rho_x = (s_cw / cw)**2
+    rho_y = (s_ch / ch)**2
+
+    # `neg_gamma=True` follows original implementation in paper
+    # but setting `neg_gamma=False` makes training more stable.
+    gamma = angle_cost - 2 if neg_gamma else 2 - angle_cost
+    distance_cost = 2 - torch.exp(gamma * rho_x) - torch.exp(gamma * rho_y)
+
+    # shape cost
+    omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2)
+    omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2)
+    shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow(
+        1 - torch.exp(-1 * omiga_h), 4)
+
+    # SIoU
+    sious = ious - 0.5 * (distance_cost + shape_cost)
+    loss = 1 - sious.clamp(min=-1.0, max=1.0)
+    return loss
+
+
+@MODELS.register_module()
+class IoULoss(nn.Module):
+    """IoULoss.
+
+    Computing the IoU loss between a set of predicted bboxes and target bboxes.
+
+    Args:
+        linear (bool): If True, use linear scale of loss else determined
+            by mode. Default: False.
+        eps (float): Epsilon to avoid log(0).
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+        mode (str): Loss scaling mode, including "linear", "square", and "log".
+            Default: 'log'
+    """
+
+    def __init__(self,
+                 linear: bool = False,
+                 eps: float = 1e-6,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0,
+                 mode: str = 'log') -> None:
+        super().__init__()
+        assert mode in ['linear', 'square', 'log']
+        if linear:
+            mode = 'linear'
+            warnings.warn('DeprecationWarning: Setting "linear=True" in '
+                          'IOULoss is deprecated, please use "mode=`linear`" '
+                          'instead.')
+        self.mode = mode
+        self.linear = linear
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+                shape (n, 4).
+            target (Tensor): The learning target of the prediction,
+                shape (n, 4).
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+
+        Return:
+            Tensor: Loss tensor.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # iou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * iou_loss(
+            pred,
+            target,
+            weight,
+            mode=self.mode,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss
+
+
+@MODELS.register_module()
+class BoundedIoULoss(nn.Module):
+    """BIoULoss.
+
+    This is an implementation of paper
+    `Improving Object Localization with Fitness NMS and Bounded IoU Loss.
+    <https://arxiv.org/abs/1711.00164>`_.
+
+    Args:
+        beta (float, optional): Beta parameter in smoothl1.
+        eps (float, optional): Epsilon to avoid NaN values.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self,
+                 beta: float = 0.2,
+                 eps: float = 1e-3,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super().__init__()
+        self.beta = beta
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+                shape (n, 4).
+            target (Tensor): The learning target of the prediction,
+                shape (n, 4).
+            weight (Optional[Tensor], optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (Optional[int], optional): Average factor that is used
+                to average the loss. Defaults to None.
+            reduction_override (Optional[str], optional): The reduction method
+                used to override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        if weight is not None and not torch.any(weight > 0):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss = self.loss_weight * bounded_iou_loss(
+            pred,
+            target,
+            weight,
+            beta=self.beta,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss
+
+
+@MODELS.register_module()
+class GIoULoss(nn.Module):
+    r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding
+    Box Regression <https://arxiv.org/abs/1902.09630>`_.
+
+    Args:
+        eps (float): Epsilon to avoid log(0).
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self,
+                 eps: float = 1e-6,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super().__init__()
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+                shape (n, 4).
+            target (Tensor): The learning target of the prediction,
+                shape (n, 4).
+            weight (Optional[Tensor], optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (Optional[int], optional): Average factor that is used
+                to average the loss. Defaults to None.
+            reduction_override (Optional[str], optional): The reduction method
+                used to override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        if weight is not None and not torch.any(weight > 0):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # giou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * giou_loss(
+            pred,
+            target,
+            weight,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss
+
+
+@MODELS.register_module()
+class DIoULoss(nn.Module):
+    r"""Implementation of `Distance-IoU Loss: Faster and Better
+    Learning for Bounding Box Regression https://arxiv.org/abs/1911.08287`_.
+
+    Code is modified from https://github.com/Zzh-tju/DIoU.
+
+    Args:
+        eps (float): Epsilon to avoid log(0).
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self,
+                 eps: float = 1e-6,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super().__init__()
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+                shape (n, 4).
+            target (Tensor): The learning target of the prediction,
+                shape (n, 4).
+            weight (Optional[Tensor], optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (Optional[int], optional): Average factor that is used
+                to average the loss. Defaults to None.
+            reduction_override (Optional[str], optional): The reduction method
+                used to override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        if weight is not None and not torch.any(weight > 0):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # giou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * diou_loss(
+            pred,
+            target,
+            weight,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss
+
+
+@MODELS.register_module()
+class CIoULoss(nn.Module):
+    r"""`Implementation of paper `Enhancing Geometric Factors into
+    Model Learning and Inference for Object Detection and Instance
+    Segmentation <https://arxiv.org/abs/2005.03572>`_.
+
+    Code is modified from https://github.com/Zzh-tju/CIoU.
+
+    Args:
+        eps (float): Epsilon to avoid log(0).
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+
+    def __init__(self,
+                 eps: float = 1e-6,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super().__init__()
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+                shape (n, 4).
+            target (Tensor): The learning target of the prediction,
+                shape (n, 4).
+            weight (Optional[Tensor], optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (Optional[int], optional): Average factor that is used
+                to average the loss. Defaults to None.
+            reduction_override (Optional[str], optional): The reduction method
+                used to override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        if weight is not None and not torch.any(weight > 0):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # giou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * ciou_loss(
+            pred,
+            target,
+            weight,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss
+
+
+@MODELS.register_module()
+class EIoULoss(nn.Module):
+    r"""Implementation of paper `Extended-IoU Loss: A Systematic
+    IoU-Related Method: Beyond Simplified Regression for Better
+    Localization <https://ieeexplore.ieee.org/abstract/document/9429909>`_
+
+    Code is modified from https://github.com//ShiqiYu/libfacedetection.train.
+
+    Args:
+        eps (float): Epsilon to avoid log(0).
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+        smooth_point (float): hyperparameter, default is 0.1.
+    """
+
+    def __init__(self,
+                 eps: float = 1e-6,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0,
+                 smooth_point: float = 0.1) -> None:
+        super().__init__()
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.smooth_point = smooth_point
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+                shape (n, 4).
+            target (Tensor): The learning target of the prediction,
+                shape (n, 4).
+            weight (Optional[Tensor], optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (Optional[int], optional): Average factor that is used
+                to average the loss. Defaults to None.
+            reduction_override (Optional[str], optional): The reduction method
+                used to override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        if weight is not None and not torch.any(weight > 0):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if weight is not None and weight.dim() > 1:
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * eiou_loss(
+            pred,
+            target,
+            weight,
+            smooth_point=self.smooth_point,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss
+
+
+@MODELS.register_module()
+class SIoULoss(nn.Module):
+    r"""`Implementation of paper `SIoU Loss: More Powerful Learning
+    for Bounding Box Regression <https://arxiv.org/abs/2205.12740>`_.
+
+    Code is modified from https://github.com/meituan/YOLOv6.
+
+    Args:
+        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+            shape (n, 4).
+        target (Tensor): Corresponding gt bboxes, shape (n, 4).
+        eps (float): Eps to avoid log(0).
+        neg_gamma (bool): `True` follows original implementation in paper.
+
+    Return:
+        Tensor: Loss tensor.
+    """
+
+    def __init__(self,
+                 eps: float = 1e-6,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0,
+                 neg_gamma: bool = False) -> None:
+        super().__init__()
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.neg_gamma = neg_gamma
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+                shape (n, 4).
+            target (Tensor): The learning target of the prediction,
+                shape (n, 4).
+            weight (Optional[Tensor], optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (Optional[int], optional): Average factor that is used
+                to average the loss. Defaults to None.
+            reduction_override (Optional[str], optional): The reduction method
+                used to override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        if weight is not None and not torch.any(weight > 0):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # giou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * siou_loss(
+            pred,
+            target,
+            weight,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            neg_gamma=self.neg_gamma,
+            **kwargs)
+        return loss
diff --git a/mmde/mmdet/models/losses/kd_loss.py b/mmde/mmdet/models/losses/kd_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a7d5ef24a0b0d7d7390a27c7cd9cbfdbe61d823
--- /dev/null
+++ b/mmde/mmdet/models/losses/kd_loss.py
@@ -0,0 +1,95 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .utils import weighted_loss
+
+
+@weighted_loss
+def knowledge_distillation_kl_div_loss(pred: Tensor,
+                                       soft_label: Tensor,
+                                       T: int,
+                                       detach_target: bool = True) -> Tensor:
+    r"""Loss function for knowledge distilling using KL divergence.
+
+    Args:
+        pred (Tensor): Predicted logits with shape (N, n + 1).
+        soft_label (Tensor): Target logits with shape (N, N + 1).
+        T (int): Temperature for distillation.
+        detach_target (bool): Remove soft_label from automatic differentiation
+
+    Returns:
+        Tensor: Loss tensor with shape (N,).
+    """
+    assert pred.size() == soft_label.size()
+    target = F.softmax(soft_label / T, dim=1)
+    if detach_target:
+        target = target.detach()
+
+    kd_loss = F.kl_div(
+        F.log_softmax(pred / T, dim=1), target, reduction='none').mean(1) * (
+            T * T)
+
+    return kd_loss
+
+
+@MODELS.register_module()
+class KnowledgeDistillationKLDivLoss(nn.Module):
+    """Loss function for knowledge distilling using KL divergence.
+
+    Args:
+        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.
+        loss_weight (float): Loss weight of current loss.
+        T (int): Temperature for distillation.
+    """
+
+    def __init__(self,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0,
+                 T: int = 10) -> None:
+        super().__init__()
+        assert T >= 1
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.T = T
+
+    def forward(self,
+                pred: Tensor,
+                soft_label: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted logits with shape (N, n + 1).
+            soft_label (Tensor): Target logits with shape (N, N + 1).
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+
+        loss_kd = self.loss_weight * knowledge_distillation_kl_div_loss(
+            pred,
+            soft_label,
+            weight,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            T=self.T)
+
+        return loss_kd
diff --git a/mmde/mmdet/models/losses/l2_loss.py b/mmde/mmdet/models/losses/l2_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..6210a3007b2c39540f022925cc93181c7328e42d
--- /dev/null
+++ b/mmde/mmdet/models/losses/l2_loss.py
@@ -0,0 +1,139 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .utils import weighted_loss
+
+
+@weighted_loss
+def l2_loss(pred: Tensor, target: Tensor) -> Tensor:
+    """L2 loss.
+
+    Args:
+        pred (torch.Tensor): The prediction.
+        target (torch.Tensor): The learning target of the prediction.
+
+    Returns:
+        torch.Tensor: Calculated loss
+    """
+    assert pred.size() == target.size()
+    loss = torch.abs(pred - target)**2
+    return loss
+
+
+@MODELS.register_module()
+class L2Loss(BaseModule):
+    """L2 loss.
+
+    Args:
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of loss.
+    """
+
+    def __init__(self,
+                 neg_pos_ub: int = -1,
+                 pos_margin: float = -1,
+                 neg_margin: float = -1,
+                 hard_mining: bool = False,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0):
+        super(L2Loss, self).__init__()
+        self.neg_pos_ub = neg_pos_ub
+        self.pos_margin = pos_margin
+        self.neg_margin = neg_margin
+        self.hard_mining = hard_mining
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[float] = None,
+                reduction_override: Optional[str] = None) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (float, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        pred, weight, avg_factor = self.update_weight(pred, target, weight,
+                                                      avg_factor)
+        loss_bbox = self.loss_weight * l2_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
+
+    def update_weight(self, pred: Tensor, target: Tensor, weight: Tensor,
+                      avg_factor: float) -> Tuple[Tensor, Tensor, float]:
+        """Update the weight according to targets."""
+        if weight is None:
+            weight = target.new_ones(target.size())
+
+        invalid_inds = weight <= 0
+        target[invalid_inds] = -1
+        pos_inds = target == 1
+        neg_inds = target == 0
+
+        if self.pos_margin > 0:
+            pred[pos_inds] -= self.pos_margin
+        if self.neg_margin > 0:
+            pred[neg_inds] -= self.neg_margin
+        pred = torch.clamp(pred, min=0, max=1)
+
+        num_pos = int((target == 1).sum())
+        num_neg = int((target == 0).sum())
+        if self.neg_pos_ub > 0 and num_neg / (num_pos +
+                                              1e-6) > self.neg_pos_ub:
+            num_neg = num_pos * self.neg_pos_ub
+            neg_idx = torch.nonzero(target == 0, as_tuple=False)
+
+            if self.hard_mining:
+                costs = l2_loss(
+                    pred, target, reduction='none')[neg_idx[:, 0],
+                                                    neg_idx[:, 1]].detach()
+                neg_idx = neg_idx[costs.topk(num_neg)[1], :]
+            else:
+                neg_idx = self.random_choice(neg_idx, num_neg)
+
+            new_neg_inds = neg_inds.new_zeros(neg_inds.size()).bool()
+            new_neg_inds[neg_idx[:, 0], neg_idx[:, 1]] = True
+
+            invalid_neg_inds = torch.logical_xor(neg_inds, new_neg_inds)
+            weight[invalid_neg_inds] = 0
+
+        avg_factor = (weight > 0).sum()
+        return pred, weight, avg_factor
+
+    @staticmethod
+    def random_choice(gallery: Union[list, np.ndarray, Tensor],
+                      num: int) -> np.ndarray:
+        """Random select some elements from the gallery.
+
+        It seems that Pytorch's implementation is slower than numpy so we use
+        numpy to randperm the indices.
+        """
+        assert len(gallery) >= num
+        if isinstance(gallery, list):
+            gallery = np.array(gallery)
+        cands = np.arange(len(gallery))
+        np.random.shuffle(cands)
+        rand_inds = cands[:num]
+        if not isinstance(gallery, np.ndarray):
+            rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device)
+        return gallery[rand_inds]
diff --git a/mmde/mmdet/models/losses/margin_loss.py b/mmde/mmdet/models/losses/margin_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..0609e1db50edf89c8ae8b65709e8ab786f580366
--- /dev/null
+++ b/mmde/mmdet/models/losses/margin_loss.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .mse_loss import mse_loss
+
+
+@MODELS.register_module()
+class MarginL2Loss(BaseModule):
+    """L2 loss with margin.
+
+    Args:
+        neg_pos_ub (int, optional): The upper bound of negative to positive
+            samples in hard mining. Defaults to -1.
+        pos_margin (float, optional): The similarity margin for positive
+            samples in hard mining. Defaults to -1.
+        neg_margin (float, optional): The similarity margin for negative
+            samples in hard mining. Defaults to -1.
+        hard_mining (bool, optional): Whether to use hard mining. Defaults to
+            False.
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum". Defaults to "mean".
+        loss_weight (float, optional): The weight of loss. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 neg_pos_ub: int = -1,
+                 pos_margin: float = -1,
+                 neg_margin: float = -1,
+                 hard_mining: bool = False,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0):
+        super(MarginL2Loss, self).__init__()
+        self.neg_pos_ub = neg_pos_ub
+        self.pos_margin = pos_margin
+        self.neg_margin = neg_margin
+        self.hard_mining = hard_mining
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[float] = None,
+                reduction_override: Optional[str] = None) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (float, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        pred, weight, avg_factor = self.update_weight(pred, target, weight,
+                                                      avg_factor)
+        loss_bbox = self.loss_weight * mse_loss(
+            pred,
+            target.float(),
+            weight.float(),
+            reduction=reduction,
+            avg_factor=avg_factor)
+        return loss_bbox
+
+    def update_weight(self, pred: Tensor, target: Tensor, weight: Tensor,
+                      avg_factor: float) -> Tuple[Tensor, Tensor, float]:
+        """Update the weight according to targets.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor): The weight of loss for each prediction.
+            avg_factor (float): Average factor that is used to average the
+                loss.
+
+        Returns:
+            tuple[torch.Tensor]: The updated prediction, weight and average
+            factor.
+        """
+        if weight is None:
+            weight = target.new_ones(target.size())
+
+        invalid_inds = weight <= 0
+        target[invalid_inds] = -1
+        pos_inds = target == 1
+        neg_inds = target == 0
+
+        if self.pos_margin > 0:
+            pred[pos_inds] -= self.pos_margin
+        if self.neg_margin > 0:
+            pred[neg_inds] -= self.neg_margin
+        pred = torch.clamp(pred, min=0, max=1)
+
+        num_pos = int((target == 1).sum())
+        num_neg = int((target == 0).sum())
+        if self.neg_pos_ub > 0 and num_neg / (num_pos +
+                                              1e-6) > self.neg_pos_ub:
+            num_neg = num_pos * self.neg_pos_ub
+            neg_idx = torch.nonzero(target == 0, as_tuple=False)
+
+            if self.hard_mining:
+                costs = mse_loss(
+                    pred, target.float(),
+                    reduction='none')[neg_idx[:, 0], neg_idx[:, 1]].detach()
+                neg_idx = neg_idx[costs.topk(num_neg)[1], :]
+            else:
+                neg_idx = self.random_choice(neg_idx, num_neg)
+
+            new_neg_inds = neg_inds.new_zeros(neg_inds.size()).bool()
+            new_neg_inds[neg_idx[:, 0], neg_idx[:, 1]] = True
+
+            invalid_neg_inds = torch.logical_xor(neg_inds, new_neg_inds)
+            weight[invalid_neg_inds] = 0
+
+        avg_factor = (weight > 0).sum()
+        return pred, weight, avg_factor
+
+    @staticmethod
+    def random_choice(gallery: Union[list, np.ndarray, Tensor],
+                      num: int) -> np.ndarray:
+        """Random select some elements from the gallery.
+
+        It seems that Pytorch's implementation is slower than numpy so we use
+        numpy to randperm the indices.
+
+        Args:
+            gallery (list | np.ndarray | torch.Tensor): The gallery from
+                which to sample.
+            num (int): The number of elements to sample.
+        """
+        assert len(gallery) >= num
+        if isinstance(gallery, list):
+            gallery = np.array(gallery)
+        cands = np.arange(len(gallery))
+        np.random.shuffle(cands)
+        rand_inds = cands[:num]
+        if not isinstance(gallery, np.ndarray):
+            rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device)
+        return gallery[rand_inds]
diff --git a/mmde/mmdet/models/losses/mse_loss.py b/mmde/mmdet/models/losses/mse_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..6048218ad36a8105e7fa182f40fae93ef7c9268f
--- /dev/null
+++ b/mmde/mmdet/models/losses/mse_loss.py
@@ -0,0 +1,69 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .utils import weighted_loss
+
+
+@weighted_loss
+def mse_loss(pred: Tensor, target: Tensor) -> Tensor:
+    """A Wrapper of MSE loss.
+    Args:
+        pred (Tensor): The prediction.
+        target (Tensor): The learning target of the prediction.
+
+    Returns:
+        Tensor: loss Tensor
+    """
+    return F.mse_loss(pred, target, reduction='none')
+
+
+@MODELS.register_module()
+class MSELoss(nn.Module):
+    """MSELoss.
+
+    Args:
+        reduction (str, optional): The method that reduces the loss to a
+            scalar. Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+    """
+
+    def __init__(self,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super().__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None) -> Tensor:
+        """Forward function of loss.
+
+        Args:
+            pred (Tensor): The prediction.
+            target (Tensor): The learning target of the prediction.
+            weight (Tensor, optional): Weight of the loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            Tensor: The calculated loss.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss = self.loss_weight * mse_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss
diff --git a/mmde/mmdet/models/losses/multipos_cross_entropy_loss.py b/mmde/mmdet/models/losses/multipos_cross_entropy_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7d1561ed414b7c15412b5e746dff39ca0c53ba1
--- /dev/null
+++ b/mmde/mmdet/models/losses/multipos_cross_entropy_loss.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .utils import weight_reduce_loss
+
+
+@MODELS.register_module()
+class MultiPosCrossEntropyLoss(BaseModule):
+    """multi-positive targets cross entropy loss.
+
+    Args:
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum". Defaults to "mean".
+        loss_weight (float, optional): The weight of loss. Defaults to 1.0.
+    """
+
+    def __init__(self, reduction: str = 'mean', loss_weight: float = 1.0):
+        super(MultiPosCrossEntropyLoss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def multi_pos_cross_entropy(self,
+                                pred: Tensor,
+                                label: Tensor,
+                                weight: Optional[Tensor] = None,
+                                reduction: str = 'mean',
+                                avg_factor: Optional[float] = None) -> Tensor:
+        """Multi-positive targets cross entropy loss.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            label (torch.Tensor): The assigned label of the prediction.
+            weight (torch.Tensor): The element-wise weight.
+            reduction (str): Same as built-in losses of PyTorch.
+            avg_factor (float): Average factor when computing
+                the mean of losses.
+
+        Returns:
+            torch.Tensor: Calculated loss
+        """
+
+        pos_inds = (label >= 1)
+        neg_inds = (label == 0)
+        pred_pos = pred * pos_inds.float()
+        pred_neg = pred * neg_inds.float()
+        # use -inf to mask out unwanted elements.
+        pred_pos[neg_inds] = pred_pos[neg_inds] + float('inf')
+        pred_neg[pos_inds] = pred_neg[pos_inds] + float('-inf')
+
+        _pos_expand = torch.repeat_interleave(pred_pos, pred.shape[1], dim=1)
+        _neg_expand = pred_neg.repeat(1, pred.shape[1])
+
+        x = torch.nn.functional.pad((_neg_expand - _pos_expand), (0, 1),
+                                    'constant', 0)
+        loss = torch.logsumexp(x, dim=1)
+
+        # apply weights and do the reduction
+        if weight is not None:
+            weight = weight.float()
+        loss = weight_reduce_loss(
+            loss, weight=weight, reduction=reduction, avg_factor=avg_factor)
+
+        return loss
+
+    def forward(self,
+                cls_score: Tensor,
+                label: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[float] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            cls_score (torch.Tensor): The classification score.
+            label (torch.Tensor): The assigned label of the prediction.
+            weight (torch.Tensor): The element-wise weight.
+            avg_factor (float): Average factor when computing
+                the mean of losses.
+            reduction_override (str): Same as built-in losses of PyTorch.
+
+        Returns:
+            torch.Tensor: Calculated loss
+        """
+        assert cls_score.size() == label.size()
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_cls = self.loss_weight * self.multi_pos_cross_entropy(
+            cls_score,
+            label,
+            weight,
+            reduction=reduction,
+            avg_factor=avg_factor)
+        return loss_cls
diff --git a/mmde/mmdet/models/losses/pisa_loss.py b/mmde/mmdet/models/losses/pisa_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..b192aa0dbc7eb554755eb2f242eab0ea7f1fc650
--- /dev/null
+++ b/mmde/mmdet/models/losses/pisa_loss.py
@@ -0,0 +1,187 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from mmdet.structures.bbox import bbox_overlaps
+from ..task_modules.coders import BaseBBoxCoder
+from ..task_modules.samplers import SamplingResult
+
+
+def isr_p(cls_score: Tensor,
+          bbox_pred: Tensor,
+          bbox_targets: Tuple[Tensor],
+          rois: Tensor,
+          sampling_results: List[SamplingResult],
+          loss_cls: nn.Module,
+          bbox_coder: BaseBBoxCoder,
+          k: float = 2,
+          bias: float = 0,
+          num_class: int = 80) -> tuple:
+    """Importance-based Sample Reweighting (ISR_P), positive part.
+
+    Args:
+        cls_score (Tensor): Predicted classification scores.
+        bbox_pred (Tensor): Predicted bbox deltas.
+        bbox_targets (tuple[Tensor]): A tuple of bbox targets, the are
+            labels, label_weights, bbox_targets, bbox_weights, respectively.
+        rois (Tensor): Anchors (single_stage) in shape (n, 4) or RoIs
+            (two_stage) in shape (n, 5).
+        sampling_results (:obj:`SamplingResult`): Sampling results.
+        loss_cls (:obj:`nn.Module`): Classification loss func of the head.
+        bbox_coder (:obj:`BaseBBoxCoder`): BBox coder of the head.
+        k (float): Power of the non-linear mapping. Defaults to 2.
+        bias (float): Shift of the non-linear mapping. Defaults to 0.
+        num_class (int): Number of classes, defaults to 80.
+
+    Return:
+        tuple([Tensor]): labels, imp_based_label_weights, bbox_targets,
+            bbox_target_weights
+    """
+
+    labels, label_weights, bbox_targets, bbox_weights = bbox_targets
+    pos_label_inds = ((labels >= 0) &
+                      (labels < num_class)).nonzero().reshape(-1)
+    pos_labels = labels[pos_label_inds]
+
+    # if no positive samples, return the original targets
+    num_pos = float(pos_label_inds.size(0))
+    if num_pos == 0:
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    # merge pos_assigned_gt_inds of per image to a single tensor
+    gts = list()
+    last_max_gt = 0
+    for i in range(len(sampling_results)):
+        gt_i = sampling_results[i].pos_assigned_gt_inds
+        gts.append(gt_i + last_max_gt)
+        if len(gt_i) != 0:
+            last_max_gt = gt_i.max() + 1
+    gts = torch.cat(gts)
+    assert len(gts) == num_pos
+
+    cls_score = cls_score.detach()
+    bbox_pred = bbox_pred.detach()
+
+    # For single stage detectors, rois here indicate anchors, in shape (N, 4)
+    # For two stage detectors, rois are in shape (N, 5)
+    if rois.size(-1) == 5:
+        pos_rois = rois[pos_label_inds][:, 1:]
+    else:
+        pos_rois = rois[pos_label_inds]
+
+    if bbox_pred.size(-1) > 4:
+        bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4)
+        pos_delta_pred = bbox_pred[pos_label_inds, pos_labels].view(-1, 4)
+    else:
+        pos_delta_pred = bbox_pred[pos_label_inds].view(-1, 4)
+
+    # compute iou of the predicted bbox and the corresponding GT
+    pos_delta_target = bbox_targets[pos_label_inds].view(-1, 4)
+    pos_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_pred)
+    target_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_target)
+    ious = bbox_overlaps(pos_bbox_pred, target_bbox_pred, is_aligned=True)
+
+    pos_imp_weights = label_weights[pos_label_inds]
+    # Two steps to compute IoU-HLR. Samples are first sorted by IoU locally,
+    # then sorted again within the same-rank group
+    max_l_num = pos_labels.bincount().max()
+    for label in pos_labels.unique():
+        l_inds = (pos_labels == label).nonzero().view(-1)
+        l_gts = gts[l_inds]
+        for t in l_gts.unique():
+            t_inds = l_inds[l_gts == t]
+            t_ious = ious[t_inds]
+            _, t_iou_rank_idx = t_ious.sort(descending=True)
+            _, t_iou_rank = t_iou_rank_idx.sort()
+            ious[t_inds] += max_l_num - t_iou_rank.float()
+        l_ious = ious[l_inds]
+        _, l_iou_rank_idx = l_ious.sort(descending=True)
+        _, l_iou_rank = l_iou_rank_idx.sort()  # IoU-HLR
+        # linearly map HLR to label weights
+        pos_imp_weights[l_inds] *= (max_l_num - l_iou_rank.float()) / max_l_num
+
+    pos_imp_weights = (bias + pos_imp_weights * (1 - bias)).pow(k)
+
+    # normalize to make the new weighted loss value equal to the original loss
+    pos_loss_cls = loss_cls(
+        cls_score[pos_label_inds], pos_labels, reduction_override='none')
+    if pos_loss_cls.dim() > 1:
+        ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds][:,
+                                                                        None]
+        new_pos_loss_cls = pos_loss_cls * pos_imp_weights[:, None]
+    else:
+        ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds]
+        new_pos_loss_cls = pos_loss_cls * pos_imp_weights
+    pos_loss_cls_ratio = ori_pos_loss_cls.sum() / new_pos_loss_cls.sum()
+    pos_imp_weights = pos_imp_weights * pos_loss_cls_ratio
+    label_weights[pos_label_inds] = pos_imp_weights
+
+    bbox_targets = labels, label_weights, bbox_targets, bbox_weights
+    return bbox_targets
+
+
+def carl_loss(cls_score: Tensor,
+              labels: Tensor,
+              bbox_pred: Tensor,
+              bbox_targets: Tensor,
+              loss_bbox: nn.Module,
+              k: float = 1,
+              bias: float = 0.2,
+              avg_factor: Optional[int] = None,
+              sigmoid: bool = False,
+              num_class: int = 80) -> dict:
+    """Classification-Aware Regression Loss (CARL).
+
+    Args:
+        cls_score (Tensor): Predicted classification scores.
+        labels (Tensor): Targets of classification.
+        bbox_pred (Tensor): Predicted bbox deltas.
+        bbox_targets (Tensor): Target of bbox regression.
+        loss_bbox (func): Regression loss func of the head.
+        bbox_coder (obj): BBox coder of the head.
+        k (float): Power of the non-linear mapping. Defaults to 1.
+        bias (float): Shift of the non-linear mapping. Defaults to 0.2.
+        avg_factor (int, optional): Average factor used in regression loss.
+        sigmoid (bool): Activation of the classification score.
+        num_class (int): Number of classes, defaults to 80.
+
+    Return:
+        dict: CARL loss dict.
+    """
+    pos_label_inds = ((labels >= 0) &
+                      (labels < num_class)).nonzero().reshape(-1)
+    if pos_label_inds.numel() == 0:
+        return dict(loss_carl=cls_score.sum()[None] * 0.)
+    pos_labels = labels[pos_label_inds]
+
+    # multiply pos_cls_score with the corresponding bbox weight
+    # and remain gradient
+    if sigmoid:
+        pos_cls_score = cls_score.sigmoid()[pos_label_inds, pos_labels]
+    else:
+        pos_cls_score = cls_score.softmax(-1)[pos_label_inds, pos_labels]
+    carl_loss_weights = (bias + (1 - bias) * pos_cls_score).pow(k)
+
+    # normalize carl_loss_weight to make its sum equal to num positive
+    num_pos = float(pos_cls_score.size(0))
+    weight_ratio = num_pos / carl_loss_weights.sum()
+    carl_loss_weights *= weight_ratio
+
+    if avg_factor is None:
+        avg_factor = bbox_targets.size(0)
+    # if is class agnostic, bbox pred is in shape (N, 4)
+    # otherwise, bbox pred is in shape (N, #classes, 4)
+    if bbox_pred.size(-1) > 4:
+        bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4)
+        pos_bbox_preds = bbox_pred[pos_label_inds, pos_labels]
+    else:
+        pos_bbox_preds = bbox_pred[pos_label_inds]
+    ori_loss_reg = loss_bbox(
+        pos_bbox_preds,
+        bbox_targets[pos_label_inds],
+        reduction_override='none') / avg_factor
+    loss_carl = (ori_loss_reg * carl_loss_weights[:, None]).sum()
+    return dict(loss_carl=loss_carl[None])
diff --git a/mmde/mmdet/models/losses/seesaw_loss.py b/mmde/mmdet/models/losses/seesaw_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..4dec62b0afdc01e848e0c7f53ba0b6b10b899ea4
--- /dev/null
+++ b/mmde/mmdet/models/losses/seesaw_loss.py
@@ -0,0 +1,278 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .accuracy import accuracy
+from .cross_entropy_loss import cross_entropy
+from .utils import weight_reduce_loss
+
+
+def seesaw_ce_loss(cls_score: Tensor,
+                   labels: Tensor,
+                   label_weights: Tensor,
+                   cum_samples: Tensor,
+                   num_classes: int,
+                   p: float,
+                   q: float,
+                   eps: float,
+                   reduction: str = 'mean',
+                   avg_factor: Optional[int] = None) -> Tensor:
+    """Calculate the Seesaw CrossEntropy loss.
+
+    Args:
+        cls_score (Tensor): The prediction with shape (N, C),
+             C is the number of classes.
+        labels (Tensor): The learning label of the prediction.
+        label_weights (Tensor): Sample-wise loss weight.
+        cum_samples (Tensor): Cumulative samples for each category.
+        num_classes (int): The number of classes.
+        p (float): The ``p`` in the mitigation factor.
+        q (float): The ``q`` in the compenstation factor.
+        eps (float): The minimal value of divisor to smooth
+             the computation of compensation factor
+        reduction (str, optional): The method used to reduce the loss.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+
+    Returns:
+        Tensor: The calculated loss
+    """
+    assert cls_score.size(-1) == num_classes
+    assert len(cum_samples) == num_classes
+
+    onehot_labels = F.one_hot(labels, num_classes)
+    seesaw_weights = cls_score.new_ones(onehot_labels.size())
+
+    # mitigation factor
+    if p > 0:
+        sample_ratio_matrix = cum_samples[None, :].clamp(
+            min=1) / cum_samples[:, None].clamp(min=1)
+        index = (sample_ratio_matrix < 1.0).float()
+        sample_weights = sample_ratio_matrix.pow(p) * index + (1 - index)
+        mitigation_factor = sample_weights[labels.long(), :]
+        seesaw_weights = seesaw_weights * mitigation_factor
+
+    # compensation factor
+    if q > 0:
+        scores = F.softmax(cls_score.detach(), dim=1)
+        self_scores = scores[
+            torch.arange(0, len(scores)).to(scores.device).long(),
+            labels.long()]
+        score_matrix = scores / self_scores[:, None].clamp(min=eps)
+        index = (score_matrix > 1.0).float()
+        compensation_factor = score_matrix.pow(q) * index + (1 - index)
+        seesaw_weights = seesaw_weights * compensation_factor
+
+    cls_score = cls_score + (seesaw_weights.log() * (1 - onehot_labels))
+
+    loss = F.cross_entropy(cls_score, labels, weight=None, reduction='none')
+
+    if label_weights is not None:
+        label_weights = label_weights.float()
+    loss = weight_reduce_loss(
+        loss, weight=label_weights, reduction=reduction, avg_factor=avg_factor)
+    return loss
+
+
+@MODELS.register_module()
+class SeesawLoss(nn.Module):
+    """
+    Seesaw Loss for Long-Tailed Instance Segmentation (CVPR 2021)
+    arXiv: https://arxiv.org/abs/2008.10032
+
+    Args:
+        use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+             of softmax. Only False is supported.
+        p (float, optional): The ``p`` in the mitigation factor.
+             Defaults to 0.8.
+        q (float, optional): The ``q`` in the compenstation factor.
+             Defaults to 2.0.
+        num_classes (int, optional): The number of classes.
+             Default to 1203 for LVIS v1 dataset.
+        eps (float, optional): The minimal value of divisor to smooth
+             the computation of compensation factor
+        reduction (str, optional): The method that reduces the loss to a
+             scalar. Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+        return_dict (bool, optional): Whether return the losses as a dict.
+             Default to True.
+    """
+
+    def __init__(self,
+                 use_sigmoid: bool = False,
+                 p: float = 0.8,
+                 q: float = 2.0,
+                 num_classes: int = 1203,
+                 eps: float = 1e-2,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0,
+                 return_dict: bool = True) -> None:
+        super().__init__()
+        assert not use_sigmoid
+        self.use_sigmoid = False
+        self.p = p
+        self.q = q
+        self.num_classes = num_classes
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.return_dict = return_dict
+
+        # 0 for pos, 1 for neg
+        self.cls_criterion = seesaw_ce_loss
+
+        # cumulative samples for each category
+        self.register_buffer(
+            'cum_samples',
+            torch.zeros(self.num_classes + 1, dtype=torch.float))
+
+        # custom output channels of the classifier
+        self.custom_cls_channels = True
+        # custom activation of cls_score
+        self.custom_activation = True
+        # custom accuracy of the classsifier
+        self.custom_accuracy = True
+
+    def _split_cls_score(self, cls_score: Tensor) -> Tuple[Tensor, Tensor]:
+        """split cls_score.
+
+        Args:
+            cls_score (Tensor): The prediction with shape (N, C + 2).
+
+        Returns:
+            Tuple[Tensor, Tensor]: The score for classes and objectness,
+                 respectively
+        """
+        # split cls_score to cls_score_classes and cls_score_objectness
+        assert cls_score.size(-1) == self.num_classes + 2
+        cls_score_classes = cls_score[..., :-2]
+        cls_score_objectness = cls_score[..., -2:]
+        return cls_score_classes, cls_score_objectness
+
+    def get_cls_channels(self, num_classes: int) -> int:
+        """Get custom classification channels.
+
+        Args:
+            num_classes (int): The number of classes.
+
+        Returns:
+            int: The custom classification channels.
+        """
+        assert num_classes == self.num_classes
+        return num_classes + 2
+
+    def get_activation(self, cls_score: Tensor) -> Tensor:
+        """Get custom activation of cls_score.
+
+        Args:
+            cls_score (Tensor): The prediction with shape (N, C + 2).
+
+        Returns:
+            Tensor: The custom activation of cls_score with shape
+                 (N, C + 1).
+        """
+        cls_score_classes, cls_score_objectness = self._split_cls_score(
+            cls_score)
+        score_classes = F.softmax(cls_score_classes, dim=-1)
+        score_objectness = F.softmax(cls_score_objectness, dim=-1)
+        score_pos = score_objectness[..., [0]]
+        score_neg = score_objectness[..., [1]]
+        score_classes = score_classes * score_pos
+        scores = torch.cat([score_classes, score_neg], dim=-1)
+        return scores
+
+    def get_accuracy(self, cls_score: Tensor,
+                     labels: Tensor) -> Dict[str, Tensor]:
+        """Get custom accuracy w.r.t. cls_score and labels.
+
+        Args:
+            cls_score (Tensor): The prediction with shape (N, C + 2).
+            labels (Tensor): The learning label of the prediction.
+
+        Returns:
+            Dict [str, Tensor]: The accuracy for objectness and classes,
+                 respectively.
+        """
+        pos_inds = labels < self.num_classes
+        obj_labels = (labels == self.num_classes).long()
+        cls_score_classes, cls_score_objectness = self._split_cls_score(
+            cls_score)
+        acc_objectness = accuracy(cls_score_objectness, obj_labels)
+        acc_classes = accuracy(cls_score_classes[pos_inds], labels[pos_inds])
+        acc = dict()
+        acc['acc_objectness'] = acc_objectness
+        acc['acc_classes'] = acc_classes
+        return acc
+
+    def forward(
+        self,
+        cls_score: Tensor,
+        labels: Tensor,
+        label_weights: Optional[Tensor] = None,
+        avg_factor: Optional[int] = None,
+        reduction_override: Optional[str] = None
+    ) -> Union[Tensor, Dict[str, Tensor]]:
+        """Forward function.
+
+        Args:
+            cls_score (Tensor): The prediction with shape (N, C + 2).
+            labels (Tensor): The learning label of the prediction.
+            label_weights (Tensor, optional): Sample-wise loss weight.
+            avg_factor (int, optional): Average factor that is used to average
+                 the loss. Defaults to None.
+            reduction (str, optional): The method used to reduce the loss.
+                 Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor | Dict [str, Tensor]:
+                 if return_dict == False: The calculated loss |
+                 if return_dict == True: The dict of calculated losses
+                 for objectness and classes, respectively.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        assert cls_score.size(-1) == self.num_classes + 2
+        pos_inds = labels < self.num_classes
+        # 0 for pos, 1 for neg
+        obj_labels = (labels == self.num_classes).long()
+
+        # accumulate the samples for each category
+        unique_labels = labels.unique()
+        for u_l in unique_labels:
+            inds_ = labels == u_l.item()
+            self.cum_samples[u_l] += inds_.sum()
+
+        if label_weights is not None:
+            label_weights = label_weights.float()
+        else:
+            label_weights = labels.new_ones(labels.size(), dtype=torch.float)
+
+        cls_score_classes, cls_score_objectness = self._split_cls_score(
+            cls_score)
+        # calculate loss_cls_classes (only need pos samples)
+        if pos_inds.sum() > 0:
+            loss_cls_classes = self.loss_weight * self.cls_criterion(
+                cls_score_classes[pos_inds], labels[pos_inds],
+                label_weights[pos_inds], self.cum_samples[:self.num_classes],
+                self.num_classes, self.p, self.q, self.eps, reduction,
+                avg_factor)
+        else:
+            loss_cls_classes = cls_score_classes[pos_inds].sum()
+        # calculate loss_cls_objectness
+        loss_cls_objectness = self.loss_weight * cross_entropy(
+            cls_score_objectness, obj_labels, label_weights, reduction,
+            avg_factor)
+
+        if self.return_dict:
+            loss_cls = dict()
+            loss_cls['loss_cls_objectness'] = loss_cls_objectness
+            loss_cls['loss_cls_classes'] = loss_cls_classes
+        else:
+            loss_cls = loss_cls_classes + loss_cls_objectness
+        return loss_cls
diff --git a/mmde/mmdet/models/losses/smooth_l1_loss.py b/mmde/mmdet/models/losses/smooth_l1_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..102f9780706172a44ade2ebe1709c7a1e847db7c
--- /dev/null
+++ b/mmde/mmdet/models/losses/smooth_l1_loss.py
@@ -0,0 +1,165 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .utils import weighted_loss
+
+
+@weighted_loss
+def smooth_l1_loss(pred: Tensor, target: Tensor, beta: float = 1.0) -> Tensor:
+    """Smooth L1 loss.
+
+    Args:
+        pred (Tensor): The prediction.
+        target (Tensor): The learning target of the prediction.
+        beta (float, optional): The threshold in the piecewise function.
+            Defaults to 1.0.
+
+    Returns:
+        Tensor: Calculated loss
+    """
+    assert beta > 0
+    if target.numel() == 0:
+        return pred.sum() * 0
+
+    assert pred.size() == target.size()
+    diff = torch.abs(pred - target)
+    loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
+                       diff - 0.5 * beta)
+    return loss
+
+
+@weighted_loss
+def l1_loss(pred: Tensor, target: Tensor) -> Tensor:
+    """L1 loss.
+
+    Args:
+        pred (Tensor): The prediction.
+        target (Tensor): The learning target of the prediction.
+
+    Returns:
+        Tensor: Calculated loss
+    """
+    if target.numel() == 0:
+        return pred.sum() * 0
+
+    assert pred.size() == target.size()
+    loss = torch.abs(pred - target)
+    return loss
+
+
+@MODELS.register_module()
+class SmoothL1Loss(nn.Module):
+    """Smooth L1 loss.
+
+    Args:
+        beta (float, optional): The threshold in the piecewise function.
+            Defaults to 1.0.
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum". Defaults to "mean".
+        loss_weight (float, optional): The weight of loss.
+    """
+
+    def __init__(self,
+                 beta: float = 1.0,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super().__init__()
+        self.beta = beta
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): The prediction.
+            target (Tensor): The learning target of the prediction.
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            Tensor: Calculated loss
+        """
+        if weight is not None and not torch.any(weight > 0):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * smooth_l1_loss(
+            pred,
+            target,
+            weight,
+            beta=self.beta,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss_bbox
+
+
+@MODELS.register_module()
+class L1Loss(nn.Module):
+    """L1 loss.
+
+    Args:
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of loss.
+    """
+
+    def __init__(self,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super().__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): The prediction.
+            target (Tensor): The learning target of the prediction.
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            Tensor: Calculated loss
+        """
+        if weight is not None and not torch.any(weight > 0):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * l1_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
diff --git a/mmde/mmdet/models/losses/triplet_loss.py b/mmde/mmdet/models/losses/triplet_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..4528239beb4bf122fa1a05ee2ce21cb1cb144bde
--- /dev/null
+++ b/mmde/mmdet/models/losses/triplet_loss.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class TripletLoss(BaseModule):
+    """Triplet loss with hard positive/negative mining.
+
+    Reference:
+        Hermans et al. In Defense of the Triplet Loss for
+            Person Re-Identification. arXiv:1703.07737.
+    Imported from `<https://github.com/KaiyangZhou/deep-person-reid/blob/
+        master/torchreid/losses/hard_mine_triplet_loss.py>`_.
+    Args:
+        margin (float, optional): Margin for triplet loss. Defaults to 0.3.
+        loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
+        hard_mining (bool, optional): Whether to perform hard mining.
+            Defaults to True.
+    """
+
+    def __init__(self,
+                 margin: float = 0.3,
+                 loss_weight: float = 1.0,
+                 hard_mining=True):
+        super(TripletLoss, self).__init__()
+        self.margin = margin
+        self.ranking_loss = nn.MarginRankingLoss(margin=margin)
+        self.loss_weight = loss_weight
+        self.hard_mining = hard_mining
+
+    def hard_mining_triplet_loss_forward(
+            self, inputs: torch.Tensor,
+            targets: torch.LongTensor) -> torch.Tensor:
+        """
+        Args:
+            inputs (torch.Tensor): feature matrix with shape
+                (batch_size, feat_dim).
+            targets (torch.LongTensor): ground truth labels with shape
+                (batch_size).
+
+        Returns:
+            torch.Tensor: triplet loss with hard mining.
+        """
+
+        batch_size = inputs.size(0)
+
+        # Compute Euclidean distance
+        dist = torch.pow(inputs, 2).sum(
+            dim=1, keepdim=True).expand(batch_size, batch_size)
+        dist = dist + dist.t()
+        dist.addmm_(inputs, inputs.t(), beta=1, alpha=-2)
+        dist = dist.clamp(min=1e-12).sqrt()  # for numerical stability
+
+        # For each anchor, find the furthest positive sample
+        # and nearest negative sample in the embedding space
+        mask = targets.expand(batch_size, batch_size).eq(
+            targets.expand(batch_size, batch_size).t())
+        dist_ap, dist_an = [], []
+        for i in range(batch_size):
+            dist_ap.append(dist[i][mask[i]].max().unsqueeze(0))
+            dist_an.append(dist[i][mask[i] == 0].min().unsqueeze(0))
+        dist_ap = torch.cat(dist_ap)
+        dist_an = torch.cat(dist_an)
+
+        # Compute ranking hinge loss
+        y = torch.ones_like(dist_an)
+        return self.loss_weight * self.ranking_loss(dist_an, dist_ap, y)
+
+    def forward(self, inputs: torch.Tensor,
+                targets: torch.LongTensor) -> torch.Tensor:
+        """
+        Args:
+            inputs (torch.Tensor): feature matrix with shape
+                (batch_size, feat_dim).
+            targets (torch.LongTensor): ground truth labels with shape
+                (num_classes).
+
+        Returns:
+            torch.Tensor: triplet loss.
+        """
+        if self.hard_mining:
+            return self.hard_mining_triplet_loss_forward(inputs, targets)
+        else:
+            raise NotImplementedError()
diff --git a/mmde/mmdet/models/losses/utils.py b/mmde/mmdet/models/losses/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e6e7859f353f3e5456f0cfc1f66b4b0ad535427
--- /dev/null
+++ b/mmde/mmdet/models/losses/utils.py
@@ -0,0 +1,125 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import functools
+from typing import Callable, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+
+def reduce_loss(loss: Tensor, reduction: str) -> Tensor:
+    """Reduce loss as specified.
+
+    Args:
+        loss (Tensor): Elementwise loss tensor.
+        reduction (str): Options are "none", "mean" and "sum".
+
+    Return:
+        Tensor: Reduced loss tensor.
+    """
+    reduction_enum = F._Reduction.get_enum(reduction)
+    # none: 0, elementwise_mean:1, sum: 2
+    if reduction_enum == 0:
+        return loss
+    elif reduction_enum == 1:
+        return loss.mean()
+    elif reduction_enum == 2:
+        return loss.sum()
+
+
+def weight_reduce_loss(loss: Tensor,
+                       weight: Optional[Tensor] = None,
+                       reduction: str = 'mean',
+                       avg_factor: Optional[float] = None) -> Tensor:
+    """Apply element-wise weight and reduce loss.
+
+    Args:
+        loss (Tensor): Element-wise loss.
+        weight (Optional[Tensor], optional): Element-wise weights.
+            Defaults to None.
+        reduction (str, optional): Same as built-in losses of PyTorch.
+            Defaults to 'mean'.
+        avg_factor (Optional[float], optional): Average factor when
+            computing the mean of losses. Defaults to None.
+
+    Returns:
+        Tensor: Processed loss values.
+    """
+    # if weight is specified, apply element-wise weight
+    if weight is not None:
+        loss = loss * weight
+
+    # if avg_factor is not specified, just reduce the loss
+    if avg_factor is None:
+        loss = reduce_loss(loss, reduction)
+    else:
+        # if reduction is mean, then average the loss by avg_factor
+        if reduction == 'mean':
+            # Avoid causing ZeroDivisionError when avg_factor is 0.0,
+            # i.e., all labels of an image belong to ignore index.
+            eps = torch.finfo(torch.float32).eps
+            loss = loss.sum() / (avg_factor + eps)
+        # if reduction is 'none', then do nothing, otherwise raise an error
+        elif reduction != 'none':
+            raise ValueError('avg_factor can not be used with reduction="sum"')
+    return loss
+
+
+def weighted_loss(loss_func: Callable) -> Callable:
+    """Create a weighted version of a given loss function.
+
+    To use this decorator, the loss function must have the signature like
+    `loss_func(pred, target, **kwargs)`. The function only needs to compute
+    element-wise loss without any reduction. This decorator will add weight
+    and reduction arguments to the function. The decorated function will have
+    the signature like `loss_func(pred, target, weight=None, reduction='mean',
+    avg_factor=None, **kwargs)`.
+
+    :Example:
+
+    >>> import torch
+    >>> @weighted_loss
+    >>> def l1_loss(pred, target):
+    >>>     return (pred - target).abs()
+
+    >>> pred = torch.Tensor([0, 2, 3])
+    >>> target = torch.Tensor([1, 1, 1])
+    >>> weight = torch.Tensor([1, 0, 1])
+
+    >>> l1_loss(pred, target)
+    tensor(1.3333)
+    >>> l1_loss(pred, target, weight)
+    tensor(1.)
+    >>> l1_loss(pred, target, reduction='none')
+    tensor([1., 1., 2.])
+    >>> l1_loss(pred, target, weight, avg_factor=2)
+    tensor(1.5000)
+    """
+
+    @functools.wraps(loss_func)
+    def wrapper(pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                reduction: str = 'mean',
+                avg_factor: Optional[int] = None,
+                **kwargs) -> Tensor:
+        """
+        Args:
+            pred (Tensor): The prediction.
+            target (Tensor): Target bboxes.
+            weight (Optional[Tensor], optional): The weight of loss for each
+                prediction. Defaults to None.
+            reduction (str, optional): Options are "none", "mean" and "sum".
+                Defaults to 'mean'.
+            avg_factor (Optional[int], optional): Average factor that is used
+                to average the loss. Defaults to None.
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        # get element-wise loss
+        loss = loss_func(pred, target, **kwargs)
+        loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+        return loss
+
+    return wrapper
diff --git a/mmde/mmdet/models/losses/varifocal_loss.py b/mmde/mmdet/models/losses/varifocal_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..58ab167352e1ae32566f5e731339966d5fd10759
--- /dev/null
+++ b/mmde/mmdet/models/losses/varifocal_loss.py
@@ -0,0 +1,141 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .utils import weight_reduce_loss
+
+
+def varifocal_loss(pred: Tensor,
+                   target: Tensor,
+                   weight: Optional[Tensor] = None,
+                   alpha: float = 0.75,
+                   gamma: float = 2.0,
+                   iou_weighted: bool = True,
+                   reduction: str = 'mean',
+                   avg_factor: Optional[int] = None) -> Tensor:
+    """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
+
+    Args:
+        pred (Tensor): The prediction with shape (N, C), C is the
+            number of classes.
+        target (Tensor): The learning target of the iou-aware
+            classification score with shape (N, C), C is the number of classes.
+        weight (Tensor, optional): The weight of loss for each
+            prediction. Defaults to None.
+        alpha (float, optional): A balance factor for the negative part of
+            Varifocal Loss, which is different from the alpha of Focal Loss.
+            Defaults to 0.75.
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 2.0.
+        iou_weighted (bool, optional): Whether to weight the loss of the
+            positive example with the iou target. Defaults to True.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'. Options are "none", "mean" and
+            "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+
+    Returns:
+        Tensor: Loss tensor.
+    """
+    # pred and target should be of the same size
+    assert pred.size() == target.size()
+    pred_sigmoid = pred.sigmoid()
+    target = target.type_as(pred)
+    if iou_weighted:
+        focal_weight = target * (target > 0.0).float() + \
+            alpha * (pred_sigmoid - target).abs().pow(gamma) * \
+            (target <= 0.0).float()
+    else:
+        focal_weight = (target > 0.0).float() + \
+            alpha * (pred_sigmoid - target).abs().pow(gamma) * \
+            (target <= 0.0).float()
+    loss = F.binary_cross_entropy_with_logits(
+        pred, target, reduction='none') * focal_weight
+    loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+    return loss
+
+
+@MODELS.register_module()
+class VarifocalLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid: bool = True,
+                 alpha: float = 0.75,
+                 gamma: float = 2.0,
+                 iou_weighted: bool = True,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction is
+                used for sigmoid or softmax. Defaults to True.
+            alpha (float, optional): A balance factor for the negative part of
+                Varifocal Loss, which is different from the alpha of Focal
+                Loss. Defaults to 0.75.
+            gamma (float, optional): The gamma for calculating the modulating
+                factor. Defaults to 2.0.
+            iou_weighted (bool, optional): Whether to weight the loss of the
+                positive examples with the iou target. Defaults to True.
+            reduction (str, optional): The method used to reduce the loss into
+                a scalar. Defaults to 'mean'. Options are "none", "mean" and
+                "sum".
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+        """
+        super().__init__()
+        assert use_sigmoid is True, \
+            'Only sigmoid varifocal loss supported now.'
+        assert alpha >= 0.0
+        self.use_sigmoid = use_sigmoid
+        self.alpha = alpha
+        self.gamma = gamma
+        self.iou_weighted = iou_weighted
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): The prediction with shape (N, C), C is the
+                number of classes.
+            target (Tensor): The learning target of the iou-aware
+                classification score with shape (N, C), C is
+                the number of classes.
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor: The calculated loss
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if self.use_sigmoid:
+            loss_cls = self.loss_weight * varifocal_loss(
+                pred,
+                target,
+                weight,
+                alpha=self.alpha,
+                gamma=self.gamma,
+                iou_weighted=self.iou_weighted,
+                reduction=reduction,
+                avg_factor=avg_factor)
+        else:
+            raise NotImplementedError
+        return loss_cls
diff --git a/mmde/mmdet/models/mot/__init__.py b/mmde/mmdet/models/mot/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bd3c8d3ba53daad736e05b5d29a6abb377fd595
--- /dev/null
+++ b/mmde/mmdet/models/mot/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import BaseMOTModel
+from .bytetrack import ByteTrack
+from .deep_sort import DeepSORT
+from .ocsort import OCSORT
+from .qdtrack import QDTrack
+from .strongsort import StrongSORT
+
+__all__ = [
+    'BaseMOTModel', 'ByteTrack', 'QDTrack', 'DeepSORT', 'StrongSORT', 'OCSORT'
+]
diff --git a/mmde/mmdet/models/mot/base.py b/mmde/mmdet/models/mot/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..9981417924af3970319b0cbe6a9cc8d8a1095451
--- /dev/null
+++ b/mmde/mmdet/models/mot/base.py
@@ -0,0 +1,147 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Tuple, Union
+
+from mmengine.model import BaseModel
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptTrackSampleList, TrackSampleList
+from mmdet.utils import OptConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class BaseMOTModel(BaseModel, metaclass=ABCMeta):
+    """Base class for multiple object tracking.
+
+    Args:
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+           config of :class:`TrackDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        init_cfg (dict or list[dict]): Initialization config dict.
+    """
+
+    def __init__(self,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+
+    def freeze_module(self, module: Union[List[str], Tuple[str], str]) -> None:
+        """Freeze module during training."""
+        if isinstance(module, str):
+            modules = [module]
+        else:
+            if not (isinstance(module, list) or isinstance(module, tuple)):
+                raise TypeError('module must be a str or a list.')
+            else:
+                modules = module
+        for module in modules:
+            m = getattr(self, module)
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    @property
+    def with_detector(self) -> bool:
+        """bool: whether the framework has a detector."""
+        return hasattr(self, 'detector') and self.detector is not None
+
+    @property
+    def with_reid(self) -> bool:
+        """bool: whether the framework has a reid model."""
+        return hasattr(self, 'reid') and self.reid is not None
+
+    @property
+    def with_motion(self) -> bool:
+        """bool: whether the framework has a motion model."""
+        return hasattr(self, 'motion') and self.motion is not None
+
+    @property
+    def with_track_head(self) -> bool:
+        """bool: whether the framework has a track_head."""
+        return hasattr(self, 'track_head') and self.track_head is not None
+
+    @property
+    def with_tracker(self) -> bool:
+        """bool: whether the framework has a tracker."""
+        return hasattr(self, 'tracker') and self.tracker is not None
+
+    def forward(self,
+                inputs: Dict[str, Tensor],
+                data_samples: OptTrackSampleList = None,
+                mode: str = 'predict',
+                **kwargs):
+        """The unified entry for a forward process in both training and test.
+
+        The method should accept three modes: "tensor", "predict" and "loss":
+
+        - "tensor": Forward the whole network and return tensor or tuple of
+        tensor without any post-processing, same as a common nn.Module.
+        - "predict": Forward and return the predictions, which are fully
+        processed to a list of :obj:`TrackDataSample`.
+        - "loss": Forward and return a dict of losses according to the given
+        inputs and data samples.
+
+        Note that this method doesn't handle neither back propagation nor
+        optimizer updating, which are done in the :meth:`train_step`.
+
+        Args:
+            inputs (Dict[str, Tensor]): of shape (N, T, C, H, W)
+                encoding input images. Typically these should be mean centered
+                and std scaled. The N denotes batch size. The T denotes the
+                number of key/reference frames.
+                - img (Tensor) : The key images.
+                - ref_img (Tensor): The reference images.
+            data_samples (list[:obj:`TrackDataSample`], optional): The
+                annotation data of every samples. Defaults to None.
+            mode (str): Return what kind of value. Defaults to 'predict'.
+
+        Returns:
+            The return type depends on ``mode``.
+
+            - If ``mode="tensor"``, return a tensor or a tuple of tensor.
+            - If ``mode="predict"``, return a list of :obj:`TrackDataSample`.
+            - If ``mode="loss"``, return a dict of tensor.
+        """
+        if mode == 'loss':
+            return self.loss(inputs, data_samples, **kwargs)
+        elif mode == 'predict':
+            return self.predict(inputs, data_samples, **kwargs)
+        elif mode == 'tensor':
+            return self._forward(inputs, data_samples, **kwargs)
+        else:
+            raise RuntimeError(f'Invalid mode "{mode}". '
+                               'Only supports loss, predict and tensor mode')
+
+    @abstractmethod
+    def loss(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList,
+             **kwargs) -> Union[dict, tuple]:
+        """Calculate losses from a batch of inputs and data samples."""
+        pass
+
+    @abstractmethod
+    def predict(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList,
+                **kwargs) -> TrackSampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing."""
+        pass
+
+    def _forward(self,
+                 inputs: Dict[str, Tensor],
+                 data_samples: OptTrackSampleList = None,
+                 **kwargs):
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+         Args:
+            inputs (Dict[str, Tensor]): of shape (N, T, C, H, W).
+            data_samples (List[:obj:`TrackDataSample`], optional): The
+                Data Samples. It usually includes information such as
+                `gt_instance`.
+
+        Returns:
+            tuple[list]: A tuple of features from ``head`` forward.
+        """
+        raise NotImplementedError(
+            "_forward function (namely 'tensor' mode) is not supported now")
diff --git a/mmde/mmdet/models/mot/bytetrack.py b/mmde/mmdet/models/mot/bytetrack.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a3bb867cb284aad9854de44b2942341a4a33be8
--- /dev/null
+++ b/mmde/mmdet/models/mot/bytetrack.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList, TrackSampleList
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .base import BaseMOTModel
+
+
+@MODELS.register_module()
+class ByteTrack(BaseMOTModel):
+    """ByteTrack: Multi-Object Tracking by Associating Every Detection Box.
+
+    This multi object tracker is the implementation of `ByteTrack
+    <https://arxiv.org/abs/2110.06864>`_.
+
+    Args:
+        detector (dict): Configuration of detector. Defaults to None.
+        tracker (dict): Configuration of tracker. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+           config of :class:`TrackDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        init_cfg (dict or list[dict]): Configuration of initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 detector: Optional[dict] = None,
+                 tracker: Optional[dict] = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(data_preprocessor, init_cfg)
+
+        if detector is not None:
+            self.detector = MODELS.build(detector)
+
+        if tracker is not None:
+            self.tracker = MODELS.build(tracker)
+
+    def loss(self, inputs: Tensor, data_samples: SampleList, **kwargs) -> dict:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            inputs (Tensor): of shape (N, C, H, W) encoding
+                input images. Typically these should be mean centered and std
+                scaled. The N denotes batch size
+            data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        return self.detector.loss(inputs, data_samples, **kwargs)
+
+    def predict(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList,
+                **kwargs) -> TrackSampleList:
+        """Predict results from a video and data samples with post-processing.
+
+        Args:
+            inputs (Tensor): of shape (N, T, C, H, W) encoding
+                input images. The N denotes batch size.
+                The T denotes the number of frames in a video.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `video_data_samples`.
+        Returns:
+            TrackSampleList: Tracking results of the inputs.
+        """
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+        assert inputs.size(0) == 1, \
+            'Bytetrack inference only support ' \
+            '1 batch size per gpu for now.'
+
+        assert len(data_samples) == 1, \
+            'Bytetrack inference only support 1 batch size per gpu for now.'
+
+        track_data_sample = data_samples[0]
+        video_len = len(track_data_sample)
+
+        for frame_id in range(video_len):
+            img_data_sample = track_data_sample[frame_id]
+            single_img = inputs[:, frame_id].contiguous()
+            # det_results List[DetDataSample]
+            det_results = self.detector.predict(single_img, [img_data_sample])
+            assert len(det_results) == 1, 'Batch inference is not supported.'
+
+            pred_track_instances = self.tracker.track(
+                data_sample=det_results[0], **kwargs)
+            img_data_sample.pred_track_instances = pred_track_instances
+
+        return [track_data_sample]
diff --git a/mmde/mmdet/models/mot/deep_sort.py b/mmde/mmdet/models/mot/deep_sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..70b30c7b07b2211fd0ad70767f479e57b6cd33f6
--- /dev/null
+++ b/mmde/mmdet/models/mot/deep_sort.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import TrackSampleList
+from mmdet.utils import OptConfigType
+from .base import BaseMOTModel
+
+
+@MODELS.register_module()
+class DeepSORT(BaseMOTModel):
+    """Simple online and realtime tracking with a deep association metric.
+
+    Details can be found at `DeepSORT<https://arxiv.org/abs/1703.07402>`_.
+
+    Args:
+        detector (dict): Configuration of detector. Defaults to None.
+        reid (dict): Configuration of reid. Defaults to None
+        tracker (dict): Configuration of tracker. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+           config of :class:`TrackDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        init_cfg (dict or list[dict]): Configuration of initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 detector: Optional[dict] = None,
+                 reid: Optional[dict] = None,
+                 tracker: Optional[dict] = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptConfigType = None):
+        super().__init__(data_preprocessor, init_cfg)
+
+        if detector is not None:
+            self.detector = MODELS.build(detector)
+
+        if reid is not None:
+            self.reid = MODELS.build(reid)
+
+        if tracker is not None:
+            self.tracker = MODELS.build(tracker)
+
+        self.preprocess_cfg = data_preprocessor
+
+    def loss(self, inputs: Tensor, data_samples: TrackSampleList,
+             **kwargs) -> dict:
+        """Calculate losses from a batch of inputs and data samples."""
+        raise NotImplementedError(
+            'Please train `detector` and `reid` models firstly, then \
+                inference with SORT/DeepSORT.')
+
+    def predict(self,
+                inputs: Tensor,
+                data_samples: TrackSampleList,
+                rescale: bool = True,
+                **kwargs) -> TrackSampleList:
+        """Predict results from a video and data samples with post- processing.
+
+        Args:
+            inputs (Tensor): of shape (N, T, C, H, W) encoding
+                input images. The N denotes batch size.
+                The T denotes the number of key frames
+                and reference frames.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance`.
+            rescale (bool, Optional): If False, then returned bboxes and masks
+                will fit the scale of img, otherwise, returned bboxes and masks
+                will fit the scale of original image shape. Defaults to True.
+
+        Returns:
+            TrackSampleList: List[TrackDataSample]
+            Tracking results of the input videos.
+            Each DetDataSample usually contains ``pred_track_instances``.
+        """
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+        assert inputs.size(0) == 1, \
+            'SORT/DeepSORT inference only support ' \
+            '1 batch size per gpu for now.'
+
+        assert len(data_samples) == 1, \
+            'SORT/DeepSORT inference only support ' \
+            '1 batch size per gpu for now.'
+
+        track_data_sample = data_samples[0]
+        video_len = len(track_data_sample)
+        if track_data_sample[0].frame_id == 0:
+            self.tracker.reset()
+
+        for frame_id in range(video_len):
+            img_data_sample = track_data_sample[frame_id]
+            single_img = inputs[:, frame_id].contiguous()
+            # det_results List[DetDataSample]
+            det_results = self.detector.predict(single_img, [img_data_sample])
+            assert len(det_results) == 1, 'Batch inference is not supported.'
+
+            pred_track_instances = self.tracker.track(
+                model=self,
+                img=single_img,
+                feats=None,
+                data_sample=det_results[0],
+                data_preprocessor=self.preprocess_cfg,
+                rescale=rescale,
+                **kwargs)
+            img_data_sample.pred_track_instances = pred_track_instances
+
+        return [track_data_sample]
diff --git a/mmde/mmdet/models/mot/ocsort.py b/mmde/mmdet/models/mot/ocsort.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf4eb3b06e2b1b223fe948f30dac877248377e3
--- /dev/null
+++ b/mmde/mmdet/models/mot/ocsort.py
@@ -0,0 +1,82 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from typing import Dict, Optional
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import TrackSampleList
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .base import BaseMOTModel
+
+
+@MODELS.register_module()
+class OCSORT(BaseMOTModel):
+    """OCOSRT: Observation-Centric SORT: Rethinking SORT for Robust
+    Multi-Object Tracking
+
+    This multi object tracker is the implementation of `OC-SORT
+    <https://arxiv.org/abs/2203.14360>`_.
+
+    Args:
+        detector (dict): Configuration of detector. Defaults to None.
+        tracker (dict): Configuration of tracker. Defaults to None.
+        motion (dict): Configuration of motion. Defaults to None.
+        init_cfg (dict): Configuration of initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 detector: Optional[dict] = None,
+                 tracker: Optional[dict] = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(data_preprocessor, init_cfg)
+
+        if detector is not None:
+            self.detector = MODELS.build(detector)
+
+        if tracker is not None:
+            self.tracker = MODELS.build(tracker)
+
+    def loss(self, inputs: Tensor, data_samples: TrackSampleList,
+             **kwargs) -> dict:
+        """Calculate losses from a batch of inputs and data samples."""
+        return self.detector.loss(inputs, data_samples, **kwargs)
+
+    def predict(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList,
+                **kwargs) -> TrackSampleList:
+        """Predict results from a video and data samples with post-processing.
+
+        Args:
+            inputs (Tensor): of shape (N, T, C, H, W) encoding
+                input images. The N denotes batch size.
+                The T denotes the number of frames in a video.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `video_data_samples`.
+        Returns:
+            TrackSampleList: Tracking results of the inputs.
+        """
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+        assert inputs.size(0) == 1, \
+            'OCSORT inference only support ' \
+            '1 batch size per gpu for now.'
+
+        assert len(data_samples) == 1, \
+            'OCSORT inference only support 1 batch size per gpu for now.'
+
+        track_data_sample = data_samples[0]
+        video_len = len(track_data_sample)
+
+        for frame_id in range(video_len):
+            img_data_sample = track_data_sample[frame_id]
+            single_img = inputs[:, frame_id].contiguous()
+            # det_results List[DetDataSample]
+            det_results = self.detector.predict(single_img, [img_data_sample])
+            assert len(det_results) == 1, 'Batch inference is not supported.'
+
+            pred_track_instances = self.tracker.track(
+                data_sample=det_results[0], **kwargs)
+            img_data_sample.pred_track_instances = pred_track_instances
+
+        return [track_data_sample]
diff --git a/mmde/mmdet/models/mot/qdtrack.py b/mmde/mmdet/models/mot/qdtrack.py
new file mode 100644
index 0000000000000000000000000000000000000000..43d5dd60b8af8a6200e21a196c47d00dd2812a46
--- /dev/null
+++ b/mmde/mmdet/models/mot/qdtrack.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import TrackSampleList
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .base import BaseMOTModel
+
+
+@MODELS.register_module()
+class QDTrack(BaseMOTModel):
+    """Quasi-Dense Similarity Learning for Multiple Object Tracking.
+
+    This multi object tracker is the implementation of `QDTrack
+    <https://arxiv.org/abs/2006.06664>`_.
+
+    Args:
+        detector (dict): Configuration of detector. Defaults to None.
+        track_head (dict): Configuration of track head. Defaults to None.
+        tracker (dict): Configuration of tracker. Defaults to None.
+        freeze_detector (bool): If True, freeze the detector weights.
+            Defaults to False.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+           config of :class:`TrackDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        init_cfg (dict or list[dict]): Configuration of initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 detector: Optional[dict] = None,
+                 track_head: Optional[dict] = None,
+                 tracker: Optional[dict] = None,
+                 freeze_detector: bool = False,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(data_preprocessor, init_cfg)
+        if detector is not None:
+            self.detector = MODELS.build(detector)
+
+        if track_head is not None:
+            self.track_head = MODELS.build(track_head)
+
+        if tracker is not None:
+            self.tracker = MODELS.build(tracker)
+
+        self.freeze_detector = freeze_detector
+        if self.freeze_detector:
+            self.freeze_module('detector')
+
+    def predict(self,
+                inputs: Tensor,
+                data_samples: TrackSampleList,
+                rescale: bool = True,
+                **kwargs) -> TrackSampleList:
+        """Predict results from a video and data samples with post- processing.
+
+        Args:
+            inputs (Tensor): of shape (N, T, C, H, W) encoding
+                input images. The N denotes batch size.
+                The T denotes the number of frames in a video.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `video_data_samples`.
+            rescale (bool, Optional): If False, then returned bboxes and masks
+                will fit the scale of img, otherwise, returned bboxes and masks
+                will fit the scale of original image shape. Defaults to True.
+
+        Returns:
+            TrackSampleList: Tracking results of the inputs.
+        """
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+        assert inputs.size(0) == 1, \
+            'QDTrack inference only support 1 batch size per gpu for now.'
+
+        assert len(data_samples) == 1, \
+            'QDTrack only support 1 batch size per gpu for now.'
+
+        track_data_sample = data_samples[0]
+        video_len = len(track_data_sample)
+        if track_data_sample[0].frame_id == 0:
+            self.tracker.reset()
+
+        for frame_id in range(video_len):
+            img_data_sample = track_data_sample[frame_id]
+            single_img = inputs[:, frame_id].contiguous()
+            x = self.detector.extract_feat(single_img)
+            rpn_results_list = self.detector.rpn_head.predict(
+                x, [img_data_sample])
+            # det_results List[InstanceData]
+            det_results = self.detector.roi_head.predict(
+                x, rpn_results_list, [img_data_sample], rescale=rescale)
+            assert len(det_results) == 1, 'Batch inference is not supported.'
+            img_data_sample.pred_instances = det_results[0]
+            frame_pred_track_instances = self.tracker.track(
+                model=self,
+                img=single_img,
+                feats=x,
+                data_sample=img_data_sample,
+                **kwargs)
+            img_data_sample.pred_track_instances = frame_pred_track_instances
+
+        return [track_data_sample]
+
+    def loss(self, inputs: Tensor, data_samples: TrackSampleList,
+             **kwargs) -> Union[dict, tuple]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding
+                input images. Typically these should be mean centered and std
+                scaled. The N denotes batch size. The T denotes the number of
+                frames.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `video_data_samples`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        # modify the inputs shape to fit mmdet
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+        assert inputs.size(1) == 2, \
+            'QDTrack can only have 1 key frame and 1 reference frame.'
+
+        # split the data_samples into two aspects: key frames and reference
+        # frames
+        ref_data_samples, key_data_samples = [], []
+        key_frame_inds, ref_frame_inds = [], []
+        # set cat_id of gt_labels to 0 in RPN
+        for track_data_sample in data_samples:
+            key_frame_inds.append(track_data_sample.key_frames_inds[0])
+            ref_frame_inds.append(track_data_sample.ref_frames_inds[0])
+            key_data_sample = track_data_sample.get_key_frames()[0]
+            key_data_sample.gt_instances.labels = \
+                torch.zeros_like(key_data_sample.gt_instances.labels)
+            key_data_samples.append(key_data_sample)
+            ref_data_sample = track_data_sample.get_ref_frames()[0]
+            ref_data_samples.append(ref_data_sample)
+
+        key_frame_inds = torch.tensor(key_frame_inds, dtype=torch.int64)
+        ref_frame_inds = torch.tensor(ref_frame_inds, dtype=torch.int64)
+        batch_inds = torch.arange(len(inputs))
+        key_imgs = inputs[batch_inds, key_frame_inds].contiguous()
+        ref_imgs = inputs[batch_inds, ref_frame_inds].contiguous()
+
+        x = self.detector.extract_feat(key_imgs)
+        ref_x = self.detector.extract_feat(ref_imgs)
+
+        losses = dict()
+        # RPN head forward and loss
+        assert self.detector.with_rpn, \
+            'QDTrack only support detector with RPN.'
+
+        proposal_cfg = self.detector.train_cfg.get('rpn_proposal',
+                                                   self.detector.test_cfg.rpn)
+        rpn_losses, rpn_results_list = self.detector.rpn_head. \
+            loss_and_predict(x,
+                             key_data_samples,
+                             proposal_cfg=proposal_cfg,
+                             **kwargs)
+        ref_rpn_results_list = self.detector.rpn_head.predict(
+            ref_x, ref_data_samples, **kwargs)
+
+        # avoid get same name with roi_head loss
+        keys = rpn_losses.keys()
+        for key in keys:
+            if 'loss' in key and 'rpn' not in key:
+                rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
+        losses.update(rpn_losses)
+
+        # roi_head loss
+        losses_detect = self.detector.roi_head.loss(x, rpn_results_list,
+                                                    key_data_samples, **kwargs)
+        losses.update(losses_detect)
+
+        # tracking head loss
+        losses_track = self.track_head.loss(x, ref_x, rpn_results_list,
+                                            ref_rpn_results_list, data_samples,
+                                            **kwargs)
+        losses.update(losses_track)
+
+        return losses
diff --git a/mmde/mmdet/models/mot/strongsort.py b/mmde/mmdet/models/mot/strongsort.py
new file mode 100644
index 0000000000000000000000000000000000000000..6129bf49972233206b3c05daa2174f99723d1b9d
--- /dev/null
+++ b/mmde/mmdet/models/mot/strongsort.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import numpy as np
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import TrackSampleList
+from mmdet.utils import OptConfigType
+from .deep_sort import DeepSORT
+
+
+@MODELS.register_module()
+class StrongSORT(DeepSORT):
+    """StrongSORT: Make DeepSORT Great Again.
+
+    Details can be found at `StrongSORT<https://arxiv.org/abs/2202.13514>`_.
+
+    Args:
+        detector (dict): Configuration of detector. Defaults to None.
+        reid (dict): Configuration of reid. Defaults to None
+        tracker (dict): Configuration of tracker. Defaults to None.
+        kalman (dict): Configuration of Kalman filter. Defaults to None.
+        cmc (dict): Configuration of camera model compensation.
+            Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+           config of :class:`TrackDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        init_cfg (dict or list[dict]): Configuration of initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 detector: Optional[dict] = None,
+                 reid: Optional[dict] = None,
+                 cmc: Optional[dict] = None,
+                 tracker: Optional[dict] = None,
+                 postprocess_model: Optional[dict] = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptConfigType = None):
+        super().__init__(detector, reid, tracker, data_preprocessor, init_cfg)
+
+        if cmc is not None:
+            self.cmc = TASK_UTILS.build(cmc)
+
+        if postprocess_model is not None:
+            self.postprocess_model = TASK_UTILS.build(postprocess_model)
+
+    @property
+    def with_cmc(self):
+        """bool: whether the framework has a camera model compensation
+                model.
+        """
+        return hasattr(self, 'cmc') and self.cmc is not None
+
+    def predict(self,
+                inputs: Tensor,
+                data_samples: TrackSampleList,
+                rescale: bool = True,
+                **kwargs) -> TrackSampleList:
+        """Predict results from a video and data samples with post- processing.
+
+        Args:
+            inputs (Tensor): of shape (N, T, C, H, W) encoding
+                input images. The N denotes batch size.
+                The T denotes the number of key frames
+                and reference frames.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance`.
+            rescale (bool, Optional): If False, then returned bboxes and masks
+                will fit the scale of img, otherwise, returned bboxes and masks
+                will fit the scale of original image shape. Defaults to True.
+
+        Returns:
+            TrackSampleList: List[TrackDataSample]
+            Tracking results of the input videos.
+            Each DetDataSample usually contains ``pred_track_instances``.
+        """
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+        assert inputs.size(0) == 1, \
+            'SORT/DeepSORT inference only support ' \
+            '1 batch size per gpu for now.'
+
+        assert len(data_samples) == 1, \
+            'SORT/DeepSORT inference only support ' \
+            '1 batch size per gpu for now.'
+
+        track_data_sample = data_samples[0]
+        video_len = len(track_data_sample)
+
+        video_track_instances = []
+        for frame_id in range(video_len):
+            img_data_sample = track_data_sample[frame_id]
+            single_img = inputs[:, frame_id].contiguous()
+            # det_results List[DetDataSample]
+            det_results = self.detector.predict(single_img, [img_data_sample])
+            assert len(det_results) == 1, 'Batch inference is not supported.'
+
+            pred_track_instances = self.tracker.track(
+                model=self,
+                img=single_img,
+                data_sample=det_results[0],
+                data_preprocessor=self.preprocess_cfg,
+                rescale=rescale,
+                **kwargs)
+            for i in range(len(pred_track_instances.instances_id)):
+                video_track_instances.append(
+                    np.array([
+                        frame_id + 1,
+                        pred_track_instances.instances_id[i].cpu(),
+                        pred_track_instances.bboxes[i][0].cpu(),
+                        pred_track_instances.bboxes[i][1].cpu(),
+                        (pred_track_instances.bboxes[i][2] -
+                         pred_track_instances.bboxes[i][0]).cpu(),
+                        (pred_track_instances.bboxes[i][3] -
+                         pred_track_instances.bboxes[i][1]).cpu(),
+                        pred_track_instances.scores[i].cpu()
+                    ]))
+        video_track_instances = np.array(video_track_instances).reshape(-1, 7)
+        video_track_instances = self.postprocess_model.forward(
+            video_track_instances)
+        for frame_id in range(video_len):
+            track_data_sample[frame_id].pred_track_instances = \
+                    InstanceData(bboxes=video_track_instances[
+                        video_track_instances[:, 0] == frame_id + 1, :])
+
+        return [track_data_sample]
diff --git a/mmde/mmdet/models/necks/__init__.py b/mmde/mmdet/models/necks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..343fbfefbd871d00e855d1c3cf4b531345e4dcf1
--- /dev/null
+++ b/mmde/mmdet/models/necks/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .bfp import BFP
+from .channel_mapper import ChannelMapper
+from .cspnext_pafpn import CSPNeXtPAFPN
+from .ct_resnet_neck import CTResNetNeck
+from .dilated_encoder import DilatedEncoder
+from .dyhead import DyHead
+from .fpg import FPG
+from .fpn import FPN
+from .fpn_carafe import FPN_CARAFE
+from .fpn_dropblock import FPN_DropBlock
+from .hrfpn import HRFPN
+from .nas_fpn import NASFPN
+from .nasfcos_fpn import NASFCOS_FPN
+from .pafpn import PAFPN
+from .rfp import RFP
+from .ssd_neck import SSDNeck
+from .ssh import SSH
+from .yolo_neck import YOLOV3Neck
+from .yolox_pafpn import YOLOXPAFPN
+
+__all__ = [
+    'FPN', 'BFP', 'ChannelMapper', 'HRFPN', 'NASFPN', 'FPN_CARAFE', 'PAFPN',
+    'NASFCOS_FPN', 'RFP', 'YOLOV3Neck', 'FPG', 'DilatedEncoder',
+    'CTResNetNeck', 'SSDNeck', 'YOLOXPAFPN', 'DyHead', 'CSPNeXtPAFPN', 'SSH',
+    'FPN_DropBlock'
+]
diff --git a/mmde/mmdet/models/necks/bfp.py b/mmde/mmdet/models/necks/bfp.py
new file mode 100644
index 0000000000000000000000000000000000000000..401cdb0f552b06c9e8eb185c3e8ae0ba7112a9d8
--- /dev/null
+++ b/mmde/mmdet/models/necks/bfp.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.cnn.bricks import NonLocal2d
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class BFP(BaseModule):
+    """BFP (Balanced Feature Pyramids)
+
+    BFP takes multi-level features as inputs and gather them into a single one,
+    then refine the gathered feature and scatter the refined results to
+    multi-level features. This module is used in Libra R-CNN (CVPR 2019), see
+    the paper `Libra R-CNN: Towards Balanced Learning for Object Detection
+    <https://arxiv.org/abs/1904.02701>`_ for details.
+
+    Args:
+        in_channels (int): Number of input channels (feature maps of all levels
+            should have the same channels).
+        num_levels (int): Number of input feature levels.
+        refine_level (int): Index of integration and refine level of BSF in
+            multi-level features from bottom to top.
+        refine_type (str): Type of the refine op, currently support
+            [None, 'conv', 'non_local'].
+        conv_cfg (:obj:`ConfigDict` or dict, optional): The config dict for
+            convolution layers.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): The config dict for
+            normalization layers.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or
+            dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        num_levels: int,
+        refine_level: int = 2,
+        refine_type: str = None,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: OptConfigType = None,
+        init_cfg: OptMultiConfig = dict(
+            type='Xavier', layer='Conv2d', distribution='uniform')
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert refine_type in [None, 'conv', 'non_local']
+
+        self.in_channels = in_channels
+        self.num_levels = num_levels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        self.refine_level = refine_level
+        self.refine_type = refine_type
+        assert 0 <= self.refine_level < self.num_levels
+
+        if self.refine_type == 'conv':
+            self.refine = ConvModule(
+                self.in_channels,
+                self.in_channels,
+                3,
+                padding=1,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+        elif self.refine_type == 'non_local':
+            self.refine = NonLocal2d(
+                self.in_channels,
+                reduction=1,
+                use_scale=False,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+
+    def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]:
+        """Forward function."""
+        assert len(inputs) == self.num_levels
+
+        # step 1: gather multi-level features by resize and average
+        feats = []
+        gather_size = inputs[self.refine_level].size()[2:]
+        for i in range(self.num_levels):
+            if i < self.refine_level:
+                gathered = F.adaptive_max_pool2d(
+                    inputs[i], output_size=gather_size)
+            else:
+                gathered = F.interpolate(
+                    inputs[i], size=gather_size, mode='nearest')
+            feats.append(gathered)
+
+        bsf = sum(feats) / len(feats)
+
+        # step 2: refine gathered features
+        if self.refine_type is not None:
+            bsf = self.refine(bsf)
+
+        # step 3: scatter refined features to multi-levels by a residual path
+        outs = []
+        for i in range(self.num_levels):
+            out_size = inputs[i].size()[2:]
+            if i < self.refine_level:
+                residual = F.interpolate(bsf, size=out_size, mode='nearest')
+            else:
+                residual = F.adaptive_max_pool2d(bsf, output_size=out_size)
+            outs.append(residual + inputs[i])
+
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/channel_mapper.py b/mmde/mmdet/models/necks/channel_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..74293618f2b8a649328ae4a5a0571809de9991dd
--- /dev/null
+++ b/mmde/mmdet/models/necks/channel_mapper.py
@@ -0,0 +1,112 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class ChannelMapper(BaseModule):
+    """Channel Mapper to reduce/increase channels of backbone features.
+
+    This is used to reduce/increase channels of backbone features.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        kernel_size (int, optional): kernel_size for reducing channels (used
+            at each scale). Default: 3.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Default: None.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            normalization layer. Default: None.
+        act_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            activation layer in ConvModule. Default: dict(type='ReLU').
+        bias (bool | str): If specified as `auto`, it will be decided by the
+            norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
+            False. Default: "auto".
+        num_outs (int, optional): Number of output feature maps. There would
+            be extra_convs when num_outs larger than the length of in_channels.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or dict],
+            optional): Initialization config dict.
+    Example:
+        >>> import torch
+        >>> in_channels = [2, 3, 5, 7]
+        >>> scales = [340, 170, 84, 43]
+        >>> inputs = [torch.rand(1, c, s, s)
+        ...           for c, s in zip(in_channels, scales)]
+        >>> self = ChannelMapper(in_channels, 11, 3).eval()
+        >>> outputs = self.forward(inputs)
+        >>> for i in range(len(outputs)):
+        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+        outputs[0].shape = torch.Size([1, 11, 340, 340])
+        outputs[1].shape = torch.Size([1, 11, 170, 170])
+        outputs[2].shape = torch.Size([1, 11, 84, 84])
+        outputs[3].shape = torch.Size([1, 11, 43, 43])
+    """
+
+    def __init__(
+        self,
+        in_channels: List[int],
+        out_channels: int,
+        kernel_size: int = 3,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: OptConfigType = None,
+        act_cfg: OptConfigType = dict(type='ReLU'),
+        bias: Union[bool, str] = 'auto',
+        num_outs: int = None,
+        init_cfg: OptMultiConfig = dict(
+            type='Xavier', layer='Conv2d', distribution='uniform')
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert isinstance(in_channels, list)
+        self.extra_convs = None
+        if num_outs is None:
+            num_outs = len(in_channels)
+        self.convs = nn.ModuleList()
+        for in_channel in in_channels:
+            self.convs.append(
+                ConvModule(
+                    in_channel,
+                    out_channels,
+                    kernel_size,
+                    padding=(kernel_size - 1) // 2,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    bias=bias))
+        if num_outs > len(in_channels):
+            self.extra_convs = nn.ModuleList()
+            for i in range(len(in_channels), num_outs):
+                if i == len(in_channels):
+                    in_channel = in_channels[-1]
+                else:
+                    in_channel = out_channels
+                self.extra_convs.append(
+                    ConvModule(
+                        in_channel,
+                        out_channels,
+                        3,
+                        stride=2,
+                        padding=1,
+                        conv_cfg=conv_cfg,
+                        norm_cfg=norm_cfg,
+                        act_cfg=act_cfg,
+                        bias=bias))
+
+    def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]:
+        """Forward function."""
+        assert len(inputs) == len(self.convs)
+        outs = [self.convs[i](inputs[i]) for i in range(len(inputs))]
+        if self.extra_convs:
+            for i in range(len(self.extra_convs)):
+                if i == 0:
+                    outs.append(self.extra_convs[0](inputs[-1]))
+                else:
+                    outs.append(self.extra_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/cspnext_pafpn.py b/mmde/mmdet/models/necks/cspnext_pafpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a52ba72d9b3e48c4866fb16507bc2118eb23010e
--- /dev/null
+++ b/mmde/mmdet/models/necks/cspnext_pafpn.py
@@ -0,0 +1,170 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptMultiConfig
+from ..layers import CSPLayer
+
+
+@MODELS.register_module()
+class CSPNeXtPAFPN(BaseModule):
+    """Path Aggregation Network with CSPNeXt blocks.
+
+    Args:
+        in_channels (Sequence[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_csp_blocks (int): Number of bottlenecks in CSPLayer.
+            Defaults to 3.
+        use_depthwise (bool): Whether to use depthwise separable convolution in
+            blocks. Defaults to False.
+        expand_ratio (float): Ratio to adjust the number of channels of the
+            hidden layer. Default: 0.5
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: `dict(scale_factor=2, mode='nearest')`
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN')
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish')
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(
+        self,
+        in_channels: Sequence[int],
+        out_channels: int,
+        num_csp_blocks: int = 3,
+        use_depthwise: bool = False,
+        expand_ratio: float = 0.5,
+        upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'),
+        conv_cfg: bool = None,
+        norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
+        act_cfg: ConfigType = dict(type='Swish'),
+        init_cfg: OptMultiConfig = dict(
+            type='Kaiming',
+            layer='Conv2d',
+            a=math.sqrt(5),
+            distribution='uniform',
+            mode='fan_in',
+            nonlinearity='leaky_relu')
+    ) -> None:
+        super().__init__(init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+
+        # build top-down blocks
+        self.upsample = nn.Upsample(**upsample_cfg)
+        self.reduce_layers = nn.ModuleList()
+        self.top_down_blocks = nn.ModuleList()
+        for idx in range(len(in_channels) - 1, 0, -1):
+            self.reduce_layers.append(
+                ConvModule(
+                    in_channels[idx],
+                    in_channels[idx - 1],
+                    1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+            self.top_down_blocks.append(
+                CSPLayer(
+                    in_channels[idx - 1] * 2,
+                    in_channels[idx - 1],
+                    num_blocks=num_csp_blocks,
+                    add_identity=False,
+                    use_depthwise=use_depthwise,
+                    use_cspnext_block=True,
+                    expand_ratio=expand_ratio,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+
+        # build bottom-up blocks
+        self.downsamples = nn.ModuleList()
+        self.bottom_up_blocks = nn.ModuleList()
+        for idx in range(len(in_channels) - 1):
+            self.downsamples.append(
+                conv(
+                    in_channels[idx],
+                    in_channels[idx],
+                    3,
+                    stride=2,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+            self.bottom_up_blocks.append(
+                CSPLayer(
+                    in_channels[idx] * 2,
+                    in_channels[idx + 1],
+                    num_blocks=num_csp_blocks,
+                    add_identity=False,
+                    use_depthwise=use_depthwise,
+                    use_cspnext_block=True,
+                    expand_ratio=expand_ratio,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+
+        self.out_convs = nn.ModuleList()
+        for i in range(len(in_channels)):
+            self.out_convs.append(
+                conv(
+                    in_channels[i],
+                    out_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+
+    def forward(self, inputs: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]:
+        """
+        Args:
+            inputs (tuple[Tensor]): input features.
+
+        Returns:
+            tuple[Tensor]: YOLOXPAFPN features.
+        """
+        assert len(inputs) == len(self.in_channels)
+
+        # top-down path
+        inner_outs = [inputs[-1]]
+        for idx in range(len(self.in_channels) - 1, 0, -1):
+            feat_heigh = inner_outs[0]
+            feat_low = inputs[idx - 1]
+            feat_heigh = self.reduce_layers[len(self.in_channels) - 1 - idx](
+                feat_heigh)
+            inner_outs[0] = feat_heigh
+
+            upsample_feat = self.upsample(feat_heigh)
+
+            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
+                torch.cat([upsample_feat, feat_low], 1))
+            inner_outs.insert(0, inner_out)
+
+        # bottom-up path
+        outs = [inner_outs[0]]
+        for idx in range(len(self.in_channels) - 1):
+            feat_low = outs[-1]
+            feat_height = inner_outs[idx + 1]
+            downsample_feat = self.downsamples[idx](feat_low)
+            out = self.bottom_up_blocks[idx](
+                torch.cat([downsample_feat, feat_height], 1))
+            outs.append(out)
+
+        # out convs
+        for idx, conv in enumerate(self.out_convs):
+            outs[idx] = conv(outs[idx])
+
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/ct_resnet_neck.py b/mmde/mmdet/models/necks/ct_resnet_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..9109fe79290fafecd954f223d5365ef619c0c301
--- /dev/null
+++ b/mmde/mmdet/models/necks/ct_resnet_neck.py
@@ -0,0 +1,102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptMultiConfig
+
+
+@MODELS.register_module()
+class CTResNetNeck(BaseModule):
+    """The neck used in `CenterNet <https://arxiv.org/abs/1904.07850>`_ for
+    object classification and box regression.
+
+    Args:
+         in_channels (int): Number of input channels.
+         num_deconv_filters (tuple[int]): Number of filters per stage.
+         num_deconv_kernels (tuple[int]): Number of kernels per stage.
+         use_dcn (bool): If True, use DCNv2. Defaults to True.
+         init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+             list[:obj:`ConfigDict`], optional): Initialization
+             config dict.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 num_deconv_filters: Tuple[int, ...],
+                 num_deconv_kernels: Tuple[int, ...],
+                 use_dcn: bool = True,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert len(num_deconv_filters) == len(num_deconv_kernels)
+        self.fp16_enabled = False
+        self.use_dcn = use_dcn
+        self.in_channels = in_channels
+        self.deconv_layers = self._make_deconv_layer(num_deconv_filters,
+                                                     num_deconv_kernels)
+
+    def _make_deconv_layer(
+            self, num_deconv_filters: Tuple[int, ...],
+            num_deconv_kernels: Tuple[int, ...]) -> nn.Sequential:
+        """use deconv layers to upsample backbone's output."""
+        layers = []
+        for i in range(len(num_deconv_filters)):
+            feat_channels = num_deconv_filters[i]
+            conv_module = ConvModule(
+                self.in_channels,
+                feat_channels,
+                3,
+                padding=1,
+                conv_cfg=dict(type='DCNv2') if self.use_dcn else None,
+                norm_cfg=dict(type='BN'))
+            layers.append(conv_module)
+            upsample_module = ConvModule(
+                feat_channels,
+                feat_channels,
+                num_deconv_kernels[i],
+                stride=2,
+                padding=1,
+                conv_cfg=dict(type='deconv'),
+                norm_cfg=dict(type='BN'))
+            layers.append(upsample_module)
+            self.in_channels = feat_channels
+
+        return nn.Sequential(*layers)
+
+    def init_weights(self) -> None:
+        """Initialize the parameters."""
+        for m in self.modules():
+            if isinstance(m, nn.ConvTranspose2d):
+                # In order to be consistent with the source code,
+                # reset the ConvTranspose2d initialization parameters
+                m.reset_parameters()
+                # Simulated bilinear upsampling kernel
+                w = m.weight.data
+                f = math.ceil(w.size(2) / 2)
+                c = (2 * f - 1 - f % 2) / (2. * f)
+                for i in range(w.size(2)):
+                    for j in range(w.size(3)):
+                        w[0, 0, i, j] = \
+                            (1 - math.fabs(i / f - c)) * (
+                                    1 - math.fabs(j / f - c))
+                for c in range(1, w.size(0)):
+                    w[c, 0, :, :] = w[0, 0, :, :]
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            # self.use_dcn is False
+            elif not self.use_dcn and isinstance(m, nn.Conv2d):
+                # In order to be consistent with the source code,
+                # reset the Conv2d initialization parameters
+                m.reset_parameters()
+
+    def forward(self, x: Sequence[torch.Tensor]) -> Tuple[torch.Tensor]:
+        """model forward."""
+        assert isinstance(x, (list, tuple))
+        outs = self.deconv_layers(x[-1])
+        return outs,
diff --git a/mmde/mmdet/models/necks/dilated_encoder.py b/mmde/mmdet/models/necks/dilated_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9beb3ea9b4289da8d0100ae7759927f045829bb
--- /dev/null
+++ b/mmde/mmdet/models/necks/dilated_encoder.py
@@ -0,0 +1,109 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule, is_norm
+from mmengine.model import caffe2_xavier_init, constant_init, normal_init
+from torch.nn import BatchNorm2d
+
+from mmdet.registry import MODELS
+
+
+class Bottleneck(nn.Module):
+    """Bottleneck block for DilatedEncoder used in `YOLOF.
+
+    <https://arxiv.org/abs/2103.09460>`.
+
+    The Bottleneck contains three ConvLayers and one residual connection.
+
+    Args:
+        in_channels (int): The number of input channels.
+        mid_channels (int): The number of middle output channels.
+        dilation (int): Dilation rate.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 dilation,
+                 norm_cfg=dict(type='BN', requires_grad=True)):
+        super(Bottleneck, self).__init__()
+        self.conv1 = ConvModule(
+            in_channels, mid_channels, 1, norm_cfg=norm_cfg)
+        self.conv2 = ConvModule(
+            mid_channels,
+            mid_channels,
+            3,
+            padding=dilation,
+            dilation=dilation,
+            norm_cfg=norm_cfg)
+        self.conv3 = ConvModule(
+            mid_channels, in_channels, 1, norm_cfg=norm_cfg)
+
+    def forward(self, x):
+        identity = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+        out = self.conv3(out)
+        out = out + identity
+        return out
+
+
+@MODELS.register_module()
+class DilatedEncoder(nn.Module):
+    """Dilated Encoder for YOLOF <https://arxiv.org/abs/2103.09460>`.
+
+    This module contains two types of components:
+        - the original FPN lateral convolution layer and fpn convolution layer,
+              which are 1x1 conv + 3x3 conv
+        - the dilated residual block
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        block_mid_channels (int): The number of middle block output channels
+        num_residual_blocks (int): The number of residual blocks.
+        block_dilations (list): The list of residual blocks dilation.
+    """
+
+    def __init__(self, in_channels, out_channels, block_mid_channels,
+                 num_residual_blocks, block_dilations):
+        super(DilatedEncoder, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.block_mid_channels = block_mid_channels
+        self.num_residual_blocks = num_residual_blocks
+        self.block_dilations = block_dilations
+        self._init_layers()
+
+    def _init_layers(self):
+        self.lateral_conv = nn.Conv2d(
+            self.in_channels, self.out_channels, kernel_size=1)
+        self.lateral_norm = BatchNorm2d(self.out_channels)
+        self.fpn_conv = nn.Conv2d(
+            self.out_channels, self.out_channels, kernel_size=3, padding=1)
+        self.fpn_norm = BatchNorm2d(self.out_channels)
+        encoder_blocks = []
+        for i in range(self.num_residual_blocks):
+            dilation = self.block_dilations[i]
+            encoder_blocks.append(
+                Bottleneck(
+                    self.out_channels,
+                    self.block_mid_channels,
+                    dilation=dilation))
+        self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks)
+
+    def init_weights(self):
+        caffe2_xavier_init(self.lateral_conv)
+        caffe2_xavier_init(self.fpn_conv)
+        for m in [self.lateral_norm, self.fpn_norm]:
+            constant_init(m, 1)
+        for m in self.dilated_encoder_blocks.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, mean=0, std=0.01)
+            if is_norm(m):
+                constant_init(m, 1)
+
+    def forward(self, feature):
+        out = self.lateral_norm(self.lateral_conv(feature[-1]))
+        out = self.fpn_norm(self.fpn_conv(out))
+        return self.dilated_encoder_blocks(out),
diff --git a/mmde/mmdet/models/necks/dyhead.py b/mmde/mmdet/models/necks/dyhead.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f5ae0b285c20558a0c7bcc59cbb7b214684eab2
--- /dev/null
+++ b/mmde/mmdet/models/necks/dyhead.py
@@ -0,0 +1,173 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmcv.ops.modulated_deform_conv import ModulatedDeformConv2d
+from mmengine.model import BaseModule, constant_init, normal_init
+
+from mmdet.registry import MODELS
+from ..layers import DyReLU
+
+# Reference:
+# https://github.com/microsoft/DynamicHead
+# https://github.com/jshilong/SEPC
+
+
+class DyDCNv2(nn.Module):
+    """ModulatedDeformConv2d with normalization layer used in DyHead.
+
+    This module cannot be configured with `conv_cfg=dict(type='DCNv2')`
+    because DyHead calculates offset and mask from middle-level feature.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        stride (int | tuple[int], optional): Stride of the convolution.
+            Default: 1.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: dict(type='GN', num_groups=16, requires_grad=True).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride=1,
+                 norm_cfg=dict(type='GN', num_groups=16, requires_grad=True)):
+        super().__init__()
+        self.with_norm = norm_cfg is not None
+        bias = not self.with_norm
+        self.conv = ModulatedDeformConv2d(
+            in_channels, out_channels, 3, stride=stride, padding=1, bias=bias)
+        if self.with_norm:
+            self.norm = build_norm_layer(norm_cfg, out_channels)[1]
+
+    def forward(self, x, offset, mask):
+        """Forward function."""
+        x = self.conv(x.contiguous(), offset, mask)
+        if self.with_norm:
+            x = self.norm(x)
+        return x
+
+
+class DyHeadBlock(nn.Module):
+    """DyHead Block with three types of attention.
+
+    HSigmoid arguments in default act_cfg follow official code, not paper.
+    https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        zero_init_offset (bool, optional): Whether to use zero init for
+            `spatial_conv_offset`. Default: True.
+        act_cfg (dict, optional): Config dict for the last activation layer of
+            scale-aware attention. Default: dict(type='HSigmoid', bias=3.0,
+            divisor=6.0).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 zero_init_offset=True,
+                 act_cfg=dict(type='HSigmoid', bias=3.0, divisor=6.0)):
+        super().__init__()
+        self.zero_init_offset = zero_init_offset
+        # (offset_x, offset_y, mask) * kernel_size_y * kernel_size_x
+        self.offset_and_mask_dim = 3 * 3 * 3
+        self.offset_dim = 2 * 3 * 3
+
+        self.spatial_conv_high = DyDCNv2(in_channels, out_channels)
+        self.spatial_conv_mid = DyDCNv2(in_channels, out_channels)
+        self.spatial_conv_low = DyDCNv2(in_channels, out_channels, stride=2)
+        self.spatial_conv_offset = nn.Conv2d(
+            in_channels, self.offset_and_mask_dim, 3, padding=1)
+        self.scale_attn_module = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1), nn.Conv2d(out_channels, 1, 1),
+            nn.ReLU(inplace=True), build_activation_layer(act_cfg))
+        self.task_attn_module = DyReLU(out_channels)
+        self._init_weights()
+
+    def _init_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, 0, 0.01)
+        if self.zero_init_offset:
+            constant_init(self.spatial_conv_offset, 0)
+
+    def forward(self, x):
+        """Forward function."""
+        outs = []
+        for level in range(len(x)):
+            # calculate offset and mask of DCNv2 from middle-level feature
+            offset_and_mask = self.spatial_conv_offset(x[level])
+            offset = offset_and_mask[:, :self.offset_dim, :, :]
+            mask = offset_and_mask[:, self.offset_dim:, :, :].sigmoid()
+
+            mid_feat = self.spatial_conv_mid(x[level], offset, mask)
+            sum_feat = mid_feat * self.scale_attn_module(mid_feat)
+            summed_levels = 1
+            if level > 0:
+                low_feat = self.spatial_conv_low(x[level - 1], offset, mask)
+                sum_feat += low_feat * self.scale_attn_module(low_feat)
+                summed_levels += 1
+            if level < len(x) - 1:
+                # this upsample order is weird, but faster than natural order
+                # https://github.com/microsoft/DynamicHead/issues/25
+                high_feat = F.interpolate(
+                    self.spatial_conv_high(x[level + 1], offset, mask),
+                    size=x[level].shape[-2:],
+                    mode='bilinear',
+                    align_corners=True)
+                sum_feat += high_feat * self.scale_attn_module(high_feat)
+                summed_levels += 1
+            outs.append(self.task_attn_module(sum_feat / summed_levels))
+
+        return outs
+
+
+@MODELS.register_module()
+class DyHead(BaseModule):
+    """DyHead neck consisting of multiple DyHead Blocks.
+
+    See `Dynamic Head: Unifying Object Detection Heads with Attentions
+    <https://arxiv.org/abs/2106.08322>`_ for details.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        num_blocks (int, optional): Number of DyHead Blocks. Default: 6.
+        zero_init_offset (bool, optional): Whether to use zero init for
+            `spatial_conv_offset`. Default: True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_blocks=6,
+                 zero_init_offset=True,
+                 init_cfg=None):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.zero_init_offset = zero_init_offset
+
+        dyhead_blocks = []
+        for i in range(num_blocks):
+            in_channels = self.in_channels if i == 0 else self.out_channels
+            dyhead_blocks.append(
+                DyHeadBlock(
+                    in_channels,
+                    self.out_channels,
+                    zero_init_offset=zero_init_offset))
+        self.dyhead_blocks = nn.Sequential(*dyhead_blocks)
+
+    def forward(self, inputs):
+        """Forward function."""
+        assert isinstance(inputs, (tuple, list))
+        outs = self.dyhead_blocks(inputs)
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/fpg.py b/mmde/mmdet/models/necks/fpg.py
new file mode 100644
index 0000000000000000000000000000000000000000..73ee799bb83645ab2556fe871dcd8b1c5bbff89e
--- /dev/null
+++ b/mmde/mmdet/models/necks/fpg.py
@@ -0,0 +1,406 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+
+
+class Transition(BaseModule):
+    """Base class for transition.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+    """
+
+    def __init__(self, in_channels, out_channels, init_cfg=None):
+        super().__init__(init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+    def forward(x):
+        pass
+
+
+class UpInterpolationConv(Transition):
+    """A transition used for up-sampling.
+
+    Up-sample the input by interpolation then refines the feature by
+    a convolution layer.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        scale_factor (int): Up-sampling factor. Default: 2.
+        mode (int): Interpolation mode. Default: nearest.
+        align_corners (bool): Whether align corners when interpolation.
+            Default: None.
+        kernel_size (int): Kernel size for the conv. Default: 3.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 scale_factor=2,
+                 mode='nearest',
+                 align_corners=None,
+                 kernel_size=3,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(in_channels, out_channels, init_cfg)
+        self.mode = mode
+        self.scale_factor = scale_factor
+        self.align_corners = align_corners
+        self.conv = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size,
+            padding=(kernel_size - 1) // 2,
+            **kwargs)
+
+    def forward(self, x):
+        x = F.interpolate(
+            x,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners)
+        x = self.conv(x)
+        return x
+
+
+class LastConv(Transition):
+    """A transition used for refining the output of the last stage.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        num_inputs (int): Number of inputs of the FPN features.
+        kernel_size (int): Kernel size for the conv. Default: 3.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_inputs,
+                 kernel_size=3,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(in_channels, out_channels, init_cfg)
+        self.num_inputs = num_inputs
+        self.conv_out = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size,
+            padding=(kernel_size - 1) // 2,
+            **kwargs)
+
+    def forward(self, inputs):
+        assert len(inputs) == self.num_inputs
+        return self.conv_out(inputs[-1])
+
+
+@MODELS.register_module()
+class FPG(BaseModule):
+    """FPG.
+
+    Implementation of `Feature Pyramid Grids (FPG)
+    <https://arxiv.org/abs/2004.03580>`_.
+    This implementation only gives the basic structure stated in the paper.
+    But users can implement different type of transitions to fully explore the
+    the potential power of the structure of FPG.
+
+    Args:
+        in_channels (int): Number of input channels (feature maps of all levels
+            should have the same channels).
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales.
+        stack_times (int): The number of times the pyramid architecture will
+            be stacked.
+        paths (list[str]): Specify the path order of each stack level.
+            Each element in the list should be either 'bu' (bottom-up) or
+            'td' (top-down).
+        inter_channels (int): Number of inter channels.
+        same_up_trans (dict): Transition that goes down at the same stage.
+        same_down_trans (dict): Transition that goes up at the same stage.
+        across_lateral_trans (dict): Across-pathway same-stage
+        across_down_trans (dict): Across-pathway bottom-up connection.
+        across_up_trans (dict): Across-pathway top-down connection.
+        across_skip_trans (dict): Across-pathway skip connection.
+        output_trans (dict): Transition that trans the output of the
+            last stage.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool): It decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, its actual mode is specified by `extra_convs_on_inputs`.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    transition_types = {
+        'conv': ConvModule,
+        'interpolation_conv': UpInterpolationConv,
+        'last_conv': LastConv,
+    }
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 stack_times,
+                 paths,
+                 inter_channels=None,
+                 same_down_trans=None,
+                 same_up_trans=dict(
+                     type='conv', kernel_size=3, stride=2, padding=1),
+                 across_lateral_trans=dict(type='conv', kernel_size=1),
+                 across_down_trans=dict(type='conv', kernel_size=3),
+                 across_up_trans=None,
+                 across_skip_trans=dict(type='identity'),
+                 output_trans=dict(type='last_conv', kernel_size=3),
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 norm_cfg=None,
+                 skip_inds=None,
+                 init_cfg=[
+                     dict(type='Caffe2Xavier', layer='Conv2d'),
+                     dict(
+                         type='Constant',
+                         layer=[
+                             '_BatchNorm', '_InstanceNorm', 'GroupNorm',
+                             'LayerNorm'
+                         ],
+                         val=1.0)
+                 ]):
+        super(FPG, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        if inter_channels is None:
+            self.inter_channels = [out_channels for _ in range(num_outs)]
+        elif isinstance(inter_channels, int):
+            self.inter_channels = [inter_channels for _ in range(num_outs)]
+        else:
+            assert isinstance(inter_channels, list)
+            assert len(inter_channels) == num_outs
+            self.inter_channels = inter_channels
+        self.stack_times = stack_times
+        self.paths = paths
+        assert isinstance(paths, list) and len(paths) == stack_times
+        for d in paths:
+            assert d in ('bu', 'td')
+
+        self.same_down_trans = same_down_trans
+        self.same_up_trans = same_up_trans
+        self.across_lateral_trans = across_lateral_trans
+        self.across_down_trans = across_down_trans
+        self.across_up_trans = across_up_trans
+        self.output_trans = output_trans
+        self.across_skip_trans = across_skip_trans
+
+        self.with_bias = norm_cfg is None
+        # skip inds must be specified if across skip trans is not None
+        if self.across_skip_trans is not None:
+            skip_inds is not None
+        self.skip_inds = skip_inds
+        assert len(self.skip_inds[0]) <= self.stack_times
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+
+        # build lateral 1x1 convs to reduce channels
+        self.lateral_convs = nn.ModuleList()
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = nn.Conv2d(self.in_channels[i],
+                               self.inter_channels[i - self.start_level], 1)
+            self.lateral_convs.append(l_conv)
+
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        self.extra_downsamples = nn.ModuleList()
+        for i in range(extra_levels):
+            if self.add_extra_convs:
+                fpn_idx = self.backbone_end_level - self.start_level + i
+                extra_conv = nn.Conv2d(
+                    self.inter_channels[fpn_idx - 1],
+                    self.inter_channels[fpn_idx],
+                    3,
+                    stride=2,
+                    padding=1)
+                self.extra_downsamples.append(extra_conv)
+            else:
+                self.extra_downsamples.append(nn.MaxPool2d(1, stride=2))
+
+        self.fpn_transitions = nn.ModuleList()  # stack times
+        for s in range(self.stack_times):
+            stage_trans = nn.ModuleList()  # num of feature levels
+            for i in range(self.num_outs):
+                # same, across_lateral, across_down, across_up
+                trans = nn.ModuleDict()
+                if s in self.skip_inds[i]:
+                    stage_trans.append(trans)
+                    continue
+                # build same-stage down trans (used in bottom-up paths)
+                if i == 0 or self.same_up_trans is None:
+                    same_up_trans = None
+                else:
+                    same_up_trans = self.build_trans(
+                        self.same_up_trans, self.inter_channels[i - 1],
+                        self.inter_channels[i])
+                trans['same_up'] = same_up_trans
+                # build same-stage up trans (used in top-down paths)
+                if i == self.num_outs - 1 or self.same_down_trans is None:
+                    same_down_trans = None
+                else:
+                    same_down_trans = self.build_trans(
+                        self.same_down_trans, self.inter_channels[i + 1],
+                        self.inter_channels[i])
+                trans['same_down'] = same_down_trans
+                # build across lateral trans
+                across_lateral_trans = self.build_trans(
+                    self.across_lateral_trans, self.inter_channels[i],
+                    self.inter_channels[i])
+                trans['across_lateral'] = across_lateral_trans
+                # build across down trans
+                if i == self.num_outs - 1 or self.across_down_trans is None:
+                    across_down_trans = None
+                else:
+                    across_down_trans = self.build_trans(
+                        self.across_down_trans, self.inter_channels[i + 1],
+                        self.inter_channels[i])
+                trans['across_down'] = across_down_trans
+                # build across up trans
+                if i == 0 or self.across_up_trans is None:
+                    across_up_trans = None
+                else:
+                    across_up_trans = self.build_trans(
+                        self.across_up_trans, self.inter_channels[i - 1],
+                        self.inter_channels[i])
+                trans['across_up'] = across_up_trans
+                if self.across_skip_trans is None:
+                    across_skip_trans = None
+                else:
+                    across_skip_trans = self.build_trans(
+                        self.across_skip_trans, self.inter_channels[i - 1],
+                        self.inter_channels[i])
+                trans['across_skip'] = across_skip_trans
+                # build across_skip trans
+                stage_trans.append(trans)
+            self.fpn_transitions.append(stage_trans)
+
+        self.output_transition = nn.ModuleList()  # output levels
+        for i in range(self.num_outs):
+            trans = self.build_trans(
+                self.output_trans,
+                self.inter_channels[i],
+                self.out_channels,
+                num_inputs=self.stack_times + 1)
+            self.output_transition.append(trans)
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def build_trans(self, cfg, in_channels, out_channels, **extra_args):
+        cfg_ = cfg.copy()
+        trans_type = cfg_.pop('type')
+        trans_cls = self.transition_types[trans_type]
+        return trans_cls(in_channels, out_channels, **cfg_, **extra_args)
+
+    def fuse(self, fuse_dict):
+        out = None
+        for item in fuse_dict.values():
+            if item is not None:
+                if out is None:
+                    out = item
+                else:
+                    out = out + item
+        return out
+
+    def forward(self, inputs):
+        assert len(inputs) == len(self.in_channels)
+
+        # build all levels from original feature maps
+        feats = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+        for downsample in self.extra_downsamples:
+            feats.append(downsample(feats[-1]))
+
+        outs = [feats]
+
+        for i in range(self.stack_times):
+            current_outs = outs[-1]
+            next_outs = []
+            direction = self.paths[i]
+            for j in range(self.num_outs):
+                if i in self.skip_inds[j]:
+                    next_outs.append(outs[-1][j])
+                    continue
+                # feature level
+                if direction == 'td':
+                    lvl = self.num_outs - j - 1
+                else:
+                    lvl = j
+                # get transitions
+                if direction == 'td':
+                    same_trans = self.fpn_transitions[i][lvl]['same_down']
+                else:
+                    same_trans = self.fpn_transitions[i][lvl]['same_up']
+                across_lateral_trans = self.fpn_transitions[i][lvl][
+                    'across_lateral']
+                across_down_trans = self.fpn_transitions[i][lvl]['across_down']
+                across_up_trans = self.fpn_transitions[i][lvl]['across_up']
+                across_skip_trans = self.fpn_transitions[i][lvl]['across_skip']
+                # init output
+                to_fuse = dict(
+                    same=None, lateral=None, across_up=None, across_down=None)
+                # same downsample/upsample
+                if same_trans is not None:
+                    to_fuse['same'] = same_trans(next_outs[-1])
+                # across lateral
+                if across_lateral_trans is not None:
+                    to_fuse['lateral'] = across_lateral_trans(
+                        current_outs[lvl])
+                # across downsample
+                if lvl > 0 and across_up_trans is not None:
+                    to_fuse['across_up'] = across_up_trans(current_outs[lvl -
+                                                                        1])
+                # across upsample
+                if (lvl < self.num_outs - 1 and across_down_trans is not None):
+                    to_fuse['across_down'] = across_down_trans(
+                        current_outs[lvl + 1])
+                if across_skip_trans is not None:
+                    to_fuse['across_skip'] = across_skip_trans(outs[0][lvl])
+                x = self.fuse(to_fuse)
+                next_outs.append(x)
+
+            if direction == 'td':
+                outs.append(next_outs[::-1])
+            else:
+                outs.append(next_outs)
+
+        # output trans
+        final_outs = []
+        for i in range(self.num_outs):
+            lvl_out_list = []
+            for s in range(len(outs)):
+                lvl_out_list.append(outs[s][i])
+            lvl_out = self.output_transition[i](lvl_out_list)
+            final_outs.append(lvl_out)
+
+        return final_outs
diff --git a/mmde/mmdet/models/necks/fpn.py b/mmde/mmdet/models/necks/fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..67bd8879641f8539f329e6ffb94f88d25e417244
--- /dev/null
+++ b/mmde/mmdet/models/necks/fpn.py
@@ -0,0 +1,221 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, MultiConfig, OptConfigType
+
+
+@MODELS.register_module()
+class FPN(BaseModule):
+    r"""Feature Pyramid Network.
+
+    This is an implementation of paper `Feature Pyramid Networks for Object
+    Detection <https://arxiv.org/abs/1612.03144>`_.
+
+    Args:
+        in_channels (list[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Defaults to 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Defaults to -1, which means the
+            last level.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Defaults to False.
+            If True, it is equivalent to `add_extra_convs='on_input'`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed
+
+            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+            - 'on_lateral': Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        relu_before_extra_convs (bool): Whether to apply relu before the extra
+            conv. Defaults to False.
+        no_norm_on_lateral (bool): Whether to apply norm on lateral.
+            Defaults to False.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            normalization layer. Defaults to None.
+        act_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            activation layer in ConvModule. Defaults to None.
+        upsample_cfg (:obj:`ConfigDict` or dict, optional): Config dict
+            for interpolate layer. Defaults to dict(mode='nearest').
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+
+    Example:
+        >>> import torch
+        >>> in_channels = [2, 3, 5, 7]
+        >>> scales = [340, 170, 84, 43]
+        >>> inputs = [torch.rand(1, c, s, s)
+        ...           for c, s in zip(in_channels, scales)]
+        >>> self = FPN(in_channels, 11, len(in_channels)).eval()
+        >>> outputs = self.forward(inputs)
+        >>> for i in range(len(outputs)):
+        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+        outputs[0].shape = torch.Size([1, 11, 340, 340])
+        outputs[1].shape = torch.Size([1, 11, 170, 170])
+        outputs[2].shape = torch.Size([1, 11, 84, 84])
+        outputs[3].shape = torch.Size([1, 11, 43, 43])
+    """
+
+    def __init__(
+        self,
+        in_channels: List[int],
+        out_channels: int,
+        num_outs: int,
+        start_level: int = 0,
+        end_level: int = -1,
+        add_extra_convs: Union[bool, str] = False,
+        relu_before_extra_convs: bool = False,
+        no_norm_on_lateral: bool = False,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: OptConfigType = None,
+        act_cfg: OptConfigType = None,
+        upsample_cfg: ConfigType = dict(mode='nearest'),
+        init_cfg: MultiConfig = dict(
+            type='Xavier', layer='Conv2d', distribution='uniform')
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.relu_before_extra_convs = relu_before_extra_convs
+        self.no_norm_on_lateral = no_norm_on_lateral
+        self.fp16_enabled = False
+        self.upsample_cfg = upsample_cfg.copy()
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+        assert isinstance(add_extra_convs, (str, bool))
+        if isinstance(add_extra_convs, str):
+            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
+            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
+        elif add_extra_convs:  # True
+            self.add_extra_convs = 'on_input'
+
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_convs = nn.ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
+                act_cfg=act_cfg,
+                inplace=False)
+            fpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False)
+
+            self.lateral_convs.append(l_conv)
+            self.fpn_convs.append(fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        if self.add_extra_convs and extra_levels >= 1:
+            for i in range(extra_levels):
+                if i == 0 and self.add_extra_convs == 'on_input':
+                    in_channels = self.in_channels[self.backbone_end_level - 1]
+                else:
+                    in_channels = out_channels
+                extra_fpn_conv = ConvModule(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+                self.fpn_convs.append(extra_fpn_conv)
+
+    def forward(self, inputs: Tuple[Tensor]) -> tuple:
+        """Forward function.
+
+        Args:
+            inputs (tuple[Tensor]): Features from the upstream network, each
+                is a 4D-tensor.
+
+        Returns:
+            tuple: Feature maps, each is a 4D-tensor.
+        """
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
+            #  it cannot co-exist with `size` in `F.interpolate`.
+            if 'scale_factor' in self.upsample_cfg:
+                # fix runtime error of "+=" inplace operation in PyTorch 1.10
+                laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                    laterals[i], **self.upsample_cfg)
+            else:
+                prev_shape = laterals[i - 1].shape[2:]
+                laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                    laterals[i], size=prev_shape, **self.upsample_cfg)
+
+        # build outputs
+        # part 1: from original levels
+        outs = [
+            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+        ]
+        # part 2: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    extra_source = inputs[self.backbone_end_level - 1]
+                elif self.add_extra_convs == 'on_lateral':
+                    extra_source = laterals[-1]
+                elif self.add_extra_convs == 'on_output':
+                    extra_source = outs[-1]
+                else:
+                    raise NotImplementedError
+                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/fpn_carafe.py b/mmde/mmdet/models/necks/fpn_carafe.py
new file mode 100644
index 0000000000000000000000000000000000000000..b393ff7c340c0c343fc4c91a4d87d341f66a3177
--- /dev/null
+++ b/mmde/mmdet/models/necks/fpn_carafe.py
@@ -0,0 +1,275 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule, build_upsample_layer
+from mmcv.ops.carafe import CARAFEPack
+from mmengine.model import BaseModule, ModuleList, xavier_init
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class FPN_CARAFE(BaseModule):
+    """FPN_CARAFE is a more flexible implementation of FPN. It allows more
+    choice for upsample methods during the top-down pathway.
+
+    It can reproduce the performance of ICCV 2019 paper
+    CARAFE: Content-Aware ReAssembly of FEatures
+    Please refer to https://arxiv.org/abs/1905.02188 for more details.
+
+    Args:
+        in_channels (list[int]): Number of channels for each input feature map.
+        out_channels (int): Output channels of feature pyramids.
+        num_outs (int): Number of output stages.
+        start_level (int): Start level of feature pyramids.
+            (Default: 0)
+        end_level (int): End level of feature pyramids.
+            (Default: -1 indicates the last level).
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        activate (str): Type of activation function in ConvModule
+            (Default: None indicates w/o activation).
+        order (dict): Order of components in ConvModule.
+        upsample (str): Type of upsample layer.
+        upsample_cfg (dict): Dictionary to construct and config upsample layer.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 order=('conv', 'norm', 'act'),
+                 upsample_cfg=dict(
+                     type='carafe',
+                     up_kernel=5,
+                     up_group=1,
+                     encoder_kernel=3,
+                     encoder_dilation=1),
+                 init_cfg=None):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super(FPN_CARAFE, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.with_bias = norm_cfg is None
+        self.upsample_cfg = upsample_cfg.copy()
+        self.upsample = self.upsample_cfg.get('type')
+        self.relu = nn.ReLU(inplace=False)
+
+        self.order = order
+        assert order in [('conv', 'norm', 'act'), ('act', 'conv', 'norm')]
+
+        assert self.upsample in [
+            'nearest', 'bilinear', 'deconv', 'pixel_shuffle', 'carafe', None
+        ]
+        if self.upsample in ['deconv', 'pixel_shuffle']:
+            assert hasattr(
+                self.upsample_cfg,
+                'upsample_kernel') and self.upsample_cfg.upsample_kernel > 0
+            self.upsample_kernel = self.upsample_cfg.pop('upsample_kernel')
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+
+        self.lateral_convs = ModuleList()
+        self.fpn_convs = ModuleList()
+        self.upsample_modules = ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                norm_cfg=norm_cfg,
+                bias=self.with_bias,
+                act_cfg=act_cfg,
+                inplace=False,
+                order=self.order)
+            fpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                norm_cfg=self.norm_cfg,
+                bias=self.with_bias,
+                act_cfg=act_cfg,
+                inplace=False,
+                order=self.order)
+            if i != self.backbone_end_level - 1:
+                upsample_cfg_ = self.upsample_cfg.copy()
+                if self.upsample == 'deconv':
+                    upsample_cfg_.update(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        kernel_size=self.upsample_kernel,
+                        stride=2,
+                        padding=(self.upsample_kernel - 1) // 2,
+                        output_padding=(self.upsample_kernel - 1) // 2)
+                elif self.upsample == 'pixel_shuffle':
+                    upsample_cfg_.update(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        scale_factor=2,
+                        upsample_kernel=self.upsample_kernel)
+                elif self.upsample == 'carafe':
+                    upsample_cfg_.update(channels=out_channels, scale_factor=2)
+                else:
+                    # suppress warnings
+                    align_corners = (None
+                                     if self.upsample == 'nearest' else False)
+                    upsample_cfg_.update(
+                        scale_factor=2,
+                        mode=self.upsample,
+                        align_corners=align_corners)
+                upsample_module = build_upsample_layer(upsample_cfg_)
+                self.upsample_modules.append(upsample_module)
+            self.lateral_convs.append(l_conv)
+            self.fpn_convs.append(fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_out_levels = (
+            num_outs - self.backbone_end_level + self.start_level)
+        if extra_out_levels >= 1:
+            for i in range(extra_out_levels):
+                in_channels = (
+                    self.in_channels[self.backbone_end_level -
+                                     1] if i == 0 else out_channels)
+                extra_l_conv = ConvModule(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    norm_cfg=norm_cfg,
+                    bias=self.with_bias,
+                    act_cfg=act_cfg,
+                    inplace=False,
+                    order=self.order)
+                if self.upsample == 'deconv':
+                    upsampler_cfg_ = dict(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        kernel_size=self.upsample_kernel,
+                        stride=2,
+                        padding=(self.upsample_kernel - 1) // 2,
+                        output_padding=(self.upsample_kernel - 1) // 2)
+                elif self.upsample == 'pixel_shuffle':
+                    upsampler_cfg_ = dict(
+                        in_channels=out_channels,
+                        out_channels=out_channels,
+                        scale_factor=2,
+                        upsample_kernel=self.upsample_kernel)
+                elif self.upsample == 'carafe':
+                    upsampler_cfg_ = dict(
+                        channels=out_channels,
+                        scale_factor=2,
+                        **self.upsample_cfg)
+                else:
+                    # suppress warnings
+                    align_corners = (None
+                                     if self.upsample == 'nearest' else False)
+                    upsampler_cfg_ = dict(
+                        scale_factor=2,
+                        mode=self.upsample,
+                        align_corners=align_corners)
+                upsampler_cfg_['type'] = self.upsample
+                upsample_module = build_upsample_layer(upsampler_cfg_)
+                extra_fpn_conv = ConvModule(
+                    out_channels,
+                    out_channels,
+                    3,
+                    padding=1,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.with_bias,
+                    act_cfg=act_cfg,
+                    inplace=False,
+                    order=self.order)
+                self.upsample_modules.append(upsample_module)
+                self.fpn_convs.append(extra_fpn_conv)
+                self.lateral_convs.append(extra_l_conv)
+
+    # default init_weights for conv(msra) and norm in ConvModule
+    def init_weights(self):
+        """Initialize the weights of module."""
+        super(FPN_CARAFE, self).init_weights()
+        for m in self.modules():
+            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
+                xavier_init(m, distribution='uniform')
+        for m in self.modules():
+            if isinstance(m, CARAFEPack):
+                m.init_weights()
+
+    def slice_as(self, src, dst):
+        """Slice ``src`` as ``dst``
+
+        Note:
+            ``src`` should have the same or larger size than ``dst``.
+
+        Args:
+            src (torch.Tensor): Tensors to be sliced.
+            dst (torch.Tensor): ``src`` will be sliced to have the same
+                size as ``dst``.
+
+        Returns:
+            torch.Tensor: Sliced tensor.
+        """
+        assert (src.size(2) >= dst.size(2)) and (src.size(3) >= dst.size(3))
+        if src.size(2) == dst.size(2) and src.size(3) == dst.size(3):
+            return src
+        else:
+            return src[:, :, :dst.size(2), :dst.size(3)]
+
+    def tensor_add(self, a, b):
+        """Add tensors ``a`` and ``b`` that might have different sizes."""
+        if a.size() == b.size():
+            c = a + b
+        else:
+            c = a + self.slice_as(b, a)
+        return c
+
+    def forward(self, inputs):
+        """Forward function."""
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = []
+        for i, lateral_conv in enumerate(self.lateral_convs):
+            if i <= self.backbone_end_level - self.start_level:
+                input = inputs[min(i + self.start_level, len(inputs) - 1)]
+            else:
+                input = laterals[-1]
+            lateral = lateral_conv(input)
+            laterals.append(lateral)
+
+        # build top-down path
+        for i in range(len(laterals) - 1, 0, -1):
+            if self.upsample is not None:
+                upsample_feat = self.upsample_modules[i - 1](laterals[i])
+            else:
+                upsample_feat = laterals[i]
+            laterals[i - 1] = self.tensor_add(laterals[i - 1], upsample_feat)
+
+        # build outputs
+        num_conv_outs = len(self.fpn_convs)
+        outs = []
+        for i in range(num_conv_outs):
+            out = self.fpn_convs[i](laterals[i])
+            outs.append(out)
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/fpn_dropblock.py b/mmde/mmdet/models/necks/fpn_dropblock.py
new file mode 100644
index 0000000000000000000000000000000000000000..473af924cdaaecf88aa4a0a6e1500511530b91a2
--- /dev/null
+++ b/mmde/mmdet/models/necks/fpn_dropblock.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .fpn import FPN
+
+
+@MODELS.register_module()
+class FPN_DropBlock(FPN):
+
+    def __init__(self,
+                 *args,
+                 plugin: Optional[dict] = dict(
+                     type='DropBlock',
+                     drop_prob=0.3,
+                     block_size=3,
+                     warmup_iters=0),
+                 **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.plugin = None
+        if plugin is not None:
+            self.plugin = MODELS.build(plugin)
+
+    def forward(self, inputs: Tuple[Tensor]) -> tuple:
+        """Forward function.
+
+        Args:
+            inputs (tuple[Tensor]): Features from the upstream network, each
+                is a 4D-tensor.
+
+        Returns:
+            tuple: Feature maps, each is a 4D-tensor.
+        """
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
+            #  it cannot co-exist with `size` in `F.interpolate`.
+            if 'scale_factor' in self.upsample_cfg:
+                # fix runtime error of "+=" inplace operation in PyTorch 1.10
+                laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                    laterals[i], **self.upsample_cfg)
+            else:
+                prev_shape = laterals[i - 1].shape[2:]
+                laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                    laterals[i], size=prev_shape, **self.upsample_cfg)
+
+            if self.plugin is not None:
+                laterals[i - 1] = self.plugin(laterals[i - 1])
+
+        # build outputs
+        # part 1: from original levels
+        outs = [
+            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+        ]
+        # part 2: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    extra_source = inputs[self.backbone_end_level - 1]
+                elif self.add_extra_convs == 'on_lateral':
+                    extra_source = laterals[-1]
+                elif self.add_extra_convs == 'on_output':
+                    extra_source = outs[-1]
+                else:
+                    raise NotImplementedError
+                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/hrfpn.py b/mmde/mmdet/models/necks/hrfpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2627549b4cb8acc6833bc40425e459c28aa5c20
--- /dev/null
+++ b/mmde/mmdet/models/necks/hrfpn.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch.utils.checkpoint import checkpoint
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class HRFPN(BaseModule):
+    """HRFPN (High Resolution Feature Pyramids)
+
+    paper: `High-Resolution Representations for Labeling Pixels and Regions
+    <https://arxiv.org/abs/1904.04514>`_.
+
+    Args:
+        in_channels (list): number of channels for each branch.
+        out_channels (int): output channels of feature pyramids.
+        num_outs (int): number of output stages.
+        pooling_type (str): pooling for generating feature pyramids
+            from {MAX, AVG}.
+        conv_cfg (dict): dictionary to construct and config conv layer.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+        with_cp  (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        stride (int): stride of 3x3 convolutional layers
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs=5,
+                 pooling_type='AVG',
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 with_cp=False,
+                 stride=1,
+                 init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')):
+        super(HRFPN, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.with_cp = with_cp
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        self.reduction_conv = ConvModule(
+            sum(in_channels),
+            out_channels,
+            kernel_size=1,
+            conv_cfg=self.conv_cfg,
+            act_cfg=None)
+
+        self.fpn_convs = nn.ModuleList()
+        for i in range(self.num_outs):
+            self.fpn_convs.append(
+                ConvModule(
+                    out_channels,
+                    out_channels,
+                    kernel_size=3,
+                    padding=1,
+                    stride=stride,
+                    conv_cfg=self.conv_cfg,
+                    act_cfg=None))
+
+        if pooling_type == 'MAX':
+            self.pooling = F.max_pool2d
+        else:
+            self.pooling = F.avg_pool2d
+
+    def forward(self, inputs):
+        """Forward function."""
+        assert len(inputs) == self.num_ins
+        outs = [inputs[0]]
+        for i in range(1, self.num_ins):
+            outs.append(
+                F.interpolate(inputs[i], scale_factor=2**i, mode='bilinear'))
+        out = torch.cat(outs, dim=1)
+        if out.requires_grad and self.with_cp:
+            out = checkpoint(self.reduction_conv, out)
+        else:
+            out = self.reduction_conv(out)
+        outs = [out]
+        for i in range(1, self.num_outs):
+            outs.append(self.pooling(out, kernel_size=2**i, stride=2**i))
+        outputs = []
+
+        for i in range(self.num_outs):
+            if outs[i].requires_grad and self.with_cp:
+                tmp_out = checkpoint(self.fpn_convs[i], outs[i])
+            else:
+                tmp_out = self.fpn_convs[i](outs[i])
+            outputs.append(tmp_out)
+        return tuple(outputs)
diff --git a/mmde/mmdet/models/necks/nas_fpn.py b/mmde/mmdet/models/necks/nas_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ec90cd6eed3aa65a3a192d332cbfd8c16d5bc36
--- /dev/null
+++ b/mmde/mmdet/models/necks/nas_fpn.py
@@ -0,0 +1,171 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops.merge_cells import GlobalPoolingCell, SumCell
+from mmengine.model import BaseModule, ModuleList
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import MultiConfig, OptConfigType
+
+
+@MODELS.register_module()
+class NASFPN(BaseModule):
+    """NAS-FPN.
+
+    Implementation of `NAS-FPN: Learning Scalable Feature Pyramid Architecture
+    for Object Detection <https://arxiv.org/abs/1904.07392>`_
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales.
+        stack_times (int): The number of times the pyramid architecture will
+            be stacked.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Defaults to 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Defaults to -1, which means the
+            last level.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            normalization layer. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        in_channels: List[int],
+        out_channels: int,
+        num_outs: int,
+        stack_times: int,
+        start_level: int = 0,
+        end_level: int = -1,
+        norm_cfg: OptConfigType = None,
+        init_cfg: MultiConfig = dict(type='Caffe2Xavier', layer='Conv2d')
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)  # num of input feature levels
+        self.num_outs = num_outs  # num of output feature levels
+        self.stack_times = stack_times
+        self.norm_cfg = norm_cfg
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+
+        # add lateral connections
+        self.lateral_convs = nn.ModuleList()
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                norm_cfg=norm_cfg,
+                act_cfg=None)
+            self.lateral_convs.append(l_conv)
+
+        # add extra downsample layers (stride-2 pooling or conv)
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        self.extra_downsamples = nn.ModuleList()
+        for i in range(extra_levels):
+            extra_conv = ConvModule(
+                out_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+            self.extra_downsamples.append(
+                nn.Sequential(extra_conv, nn.MaxPool2d(2, 2)))
+
+        # add NAS FPN connections
+        self.fpn_stages = ModuleList()
+        for _ in range(self.stack_times):
+            stage = nn.ModuleDict()
+            # gp(p6, p4) -> p4_1
+            stage['gp_64_4'] = GlobalPoolingCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p4_1, p4) -> p4_2
+            stage['sum_44_4'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p4_2, p3) -> p3_out
+            stage['sum_43_3'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p3_out, p4_2) -> p4_out
+            stage['sum_34_4'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p5, gp(p4_out, p3_out)) -> p5_out
+            stage['gp_43_5'] = GlobalPoolingCell(with_out_conv=False)
+            stage['sum_55_5'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # sum(p7, gp(p5_out, p4_2)) -> p7_out
+            stage['gp_54_7'] = GlobalPoolingCell(with_out_conv=False)
+            stage['sum_77_7'] = SumCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            # gp(p7_out, p5_out) -> p6_out
+            stage['gp_75_6'] = GlobalPoolingCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                out_norm_cfg=norm_cfg)
+            self.fpn_stages.append(stage)
+
+    def forward(self, inputs: Tuple[Tensor]) -> tuple:
+        """Forward function.
+
+         Args:
+            inputs (tuple[Tensor]): Features from the upstream network, each
+                is a 4D-tensor.
+
+        Returns:
+            tuple: Feature maps, each is a 4D-tensor.
+        """
+        # build P3-P5
+        feats = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+        # build P6-P7 on top of P5
+        for downsample in self.extra_downsamples:
+            feats.append(downsample(feats[-1]))
+
+        p3, p4, p5, p6, p7 = feats
+
+        for stage in self.fpn_stages:
+            # gp(p6, p4) -> p4_1
+            p4_1 = stage['gp_64_4'](p6, p4, out_size=p4.shape[-2:])
+            # sum(p4_1, p4) -> p4_2
+            p4_2 = stage['sum_44_4'](p4_1, p4, out_size=p4.shape[-2:])
+            # sum(p4_2, p3) -> p3_out
+            p3 = stage['sum_43_3'](p4_2, p3, out_size=p3.shape[-2:])
+            # sum(p3_out, p4_2) -> p4_out
+            p4 = stage['sum_34_4'](p3, p4_2, out_size=p4.shape[-2:])
+            # sum(p5, gp(p4_out, p3_out)) -> p5_out
+            p5_tmp = stage['gp_43_5'](p4, p3, out_size=p5.shape[-2:])
+            p5 = stage['sum_55_5'](p5, p5_tmp, out_size=p5.shape[-2:])
+            # sum(p7, gp(p5_out, p4_2)) -> p7_out
+            p7_tmp = stage['gp_54_7'](p5, p4_2, out_size=p7.shape[-2:])
+            p7 = stage['sum_77_7'](p7, p7_tmp, out_size=p7.shape[-2:])
+            # gp(p7_out, p5_out) -> p6_out
+            p6 = stage['gp_75_6'](p7, p5, out_size=p6.shape[-2:])
+
+        return p3, p4, p5, p6, p7
diff --git a/mmde/mmdet/models/necks/nasfcos_fpn.py b/mmde/mmdet/models/necks/nasfcos_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d0848f7634bb0113e0b5a16b5b65ba8b7ebb9c
--- /dev/null
+++ b/mmde/mmdet/models/necks/nasfcos_fpn.py
@@ -0,0 +1,170 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmcv.ops.merge_cells import ConcatCell
+from mmengine.model import BaseModule, caffe2_xavier_init
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class NASFCOS_FPN(BaseModule):
+    """FPN structure in NASFPN.
+
+    Implementation of paper `NAS-FCOS: Fast Neural Architecture Search for
+    Object Detection <https://arxiv.org/abs/1906.04423>`_
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool): It decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, its actual mode is specified by `extra_convs_on_inputs`.
+        conv_cfg (dict): dictionary to construct and config conv layer.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=1,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 init_cfg=None):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super(NASFCOS_FPN, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.norm_cfg = norm_cfg
+        self.conv_cfg = conv_cfg
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+
+        self.adapt_convs = nn.ModuleList()
+        for i in range(self.start_level, self.backbone_end_level):
+            adapt_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                stride=1,
+                padding=0,
+                bias=False,
+                norm_cfg=dict(type='BN'),
+                act_cfg=dict(type='ReLU', inplace=False))
+            self.adapt_convs.append(adapt_conv)
+
+        # C2 is omitted according to the paper
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+
+        def build_concat_cell(with_input1_conv, with_input2_conv):
+            cell_conv_cfg = dict(
+                kernel_size=1, padding=0, bias=False, groups=out_channels)
+            return ConcatCell(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                with_out_conv=True,
+                out_conv_cfg=cell_conv_cfg,
+                out_norm_cfg=dict(type='BN'),
+                out_conv_order=('norm', 'act', 'conv'),
+                with_input1_conv=with_input1_conv,
+                with_input2_conv=with_input2_conv,
+                input_conv_cfg=conv_cfg,
+                input_norm_cfg=norm_cfg,
+                upsample_mode='nearest')
+
+        # Denote c3=f0, c4=f1, c5=f2 for convince
+        self.fpn = nn.ModuleDict()
+        self.fpn['c22_1'] = build_concat_cell(True, True)
+        self.fpn['c22_2'] = build_concat_cell(True, True)
+        self.fpn['c32'] = build_concat_cell(True, False)
+        self.fpn['c02'] = build_concat_cell(True, False)
+        self.fpn['c42'] = build_concat_cell(True, True)
+        self.fpn['c36'] = build_concat_cell(True, True)
+        self.fpn['c61'] = build_concat_cell(True, True)  # f9
+        self.extra_downsamples = nn.ModuleList()
+        for i in range(extra_levels):
+            extra_act_cfg = None if i == 0 \
+                else dict(type='ReLU', inplace=False)
+            self.extra_downsamples.append(
+                ConvModule(
+                    out_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    act_cfg=extra_act_cfg,
+                    order=('act', 'norm', 'conv')))
+
+    def forward(self, inputs):
+        """Forward function."""
+        feats = [
+            adapt_conv(inputs[i + self.start_level])
+            for i, adapt_conv in enumerate(self.adapt_convs)
+        ]
+
+        for (i, module_name) in enumerate(self.fpn):
+            idx_1, idx_2 = int(module_name[1]), int(module_name[2])
+            res = self.fpn[module_name](feats[idx_1], feats[idx_2])
+            feats.append(res)
+
+        ret = []
+        for (idx, input_idx) in zip([9, 8, 7], [1, 2, 3]):  # add P3, P4, P5
+            feats1, feats2 = feats[idx], feats[5]
+            feats2_resize = F.interpolate(
+                feats2,
+                size=feats1.size()[2:],
+                mode='bilinear',
+                align_corners=False)
+
+            feats_sum = feats1 + feats2_resize
+            ret.append(
+                F.interpolate(
+                    feats_sum,
+                    size=inputs[input_idx].size()[2:],
+                    mode='bilinear',
+                    align_corners=False))
+
+        for submodule in self.extra_downsamples:
+            ret.append(submodule(ret[-1]))
+
+        return tuple(ret)
+
+    def init_weights(self):
+        """Initialize the weights of module."""
+        super(NASFCOS_FPN, self).init_weights()
+        for module in self.fpn.values():
+            if hasattr(module, 'conv_out'):
+                caffe2_xavier_init(module.out_conv.conv)
+
+        for modules in [
+                self.adapt_convs.modules(),
+                self.extra_downsamples.modules()
+        ]:
+            for module in modules:
+                if isinstance(module, nn.Conv2d):
+                    caffe2_xavier_init(module)
diff --git a/mmde/mmdet/models/necks/pafpn.py b/mmde/mmdet/models/necks/pafpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..557638f48a629691f780d3e1466e234bbe987518
--- /dev/null
+++ b/mmde/mmdet/models/necks/pafpn.py
@@ -0,0 +1,157 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+
+from mmdet.registry import MODELS
+from .fpn import FPN
+
+
+@MODELS.register_module()
+class PAFPN(FPN):
+    """Path Aggregation Network for Instance Segmentation.
+
+    This is an implementation of the `PAFPN in Path Aggregation Network
+    <https://arxiv.org/abs/1803.01534>`_.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, it is equivalent to `add_extra_convs='on_input'`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed
+
+            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+            - 'on_lateral':  Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        relu_before_extra_convs (bool): Whether to apply relu before the extra
+            conv. Default: False.
+        no_norm_on_lateral (bool): Whether to apply norm on lateral.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        act_cfg (str): Config dict for activation layer in ConvModule.
+            Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 relu_before_extra_convs=False,
+                 no_norm_on_lateral=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 init_cfg=dict(
+                     type='Xavier', layer='Conv2d', distribution='uniform')):
+        super(PAFPN, self).__init__(
+            in_channels,
+            out_channels,
+            num_outs,
+            start_level,
+            end_level,
+            add_extra_convs,
+            relu_before_extra_convs,
+            no_norm_on_lateral,
+            conv_cfg,
+            norm_cfg,
+            act_cfg,
+            init_cfg=init_cfg)
+        # add extra bottom up pathway
+        self.downsample_convs = nn.ModuleList()
+        self.pafpn_convs = nn.ModuleList()
+        for i in range(self.start_level + 1, self.backbone_end_level):
+            d_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                stride=2,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False)
+            pafpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False)
+            self.downsample_convs.append(d_conv)
+            self.pafpn_convs.append(pafpn_conv)
+
+    def forward(self, inputs):
+        """Forward function."""
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            prev_shape = laterals[i - 1].shape[2:]
+            laterals[i - 1] = laterals[i - 1] + F.interpolate(
+                laterals[i], size=prev_shape, mode='nearest')
+
+        # build outputs
+        # part 1: from original levels
+        inter_outs = [
+            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+        ]
+
+        # part 2: add bottom-up path
+        for i in range(0, used_backbone_levels - 1):
+            inter_outs[i + 1] = inter_outs[i + 1] + \
+                                self.downsample_convs[i](inter_outs[i])
+
+        outs = []
+        outs.append(inter_outs[0])
+        outs.extend([
+            self.pafpn_convs[i - 1](inter_outs[i])
+            for i in range(1, used_backbone_levels)
+        ])
+
+        # part 3: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    orig = inputs[self.backbone_end_level - 1]
+                    outs.append(self.fpn_convs[used_backbone_levels](orig))
+                elif self.add_extra_convs == 'on_lateral':
+                    outs.append(self.fpn_convs[used_backbone_levels](
+                        laterals[-1]))
+                elif self.add_extra_convs == 'on_output':
+                    outs.append(self.fpn_convs[used_backbone_levels](outs[-1]))
+                else:
+                    raise NotImplementedError
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/rfp.py b/mmde/mmdet/models/necks/rfp.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ec9b3753c5031bb12a2b4c88733f13bf27c44e2
--- /dev/null
+++ b/mmde/mmdet/models/necks/rfp.py
@@ -0,0 +1,134 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import BaseModule, ModuleList, constant_init, xavier_init
+
+from mmdet.registry import MODELS
+from .fpn import FPN
+
+
+class ASPP(BaseModule):
+    """ASPP (Atrous Spatial Pyramid Pooling)
+
+    This is an implementation of the ASPP module used in DetectoRS
+    (https://arxiv.org/pdf/2006.02334.pdf)
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of channels produced by this module
+        dilations (tuple[int]): Dilations of the four branches.
+            Default: (1, 3, 6, 1)
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 dilations=(1, 3, 6, 1),
+                 init_cfg=dict(type='Kaiming', layer='Conv2d')):
+        super().__init__(init_cfg)
+        assert dilations[-1] == 1
+        self.aspp = nn.ModuleList()
+        for dilation in dilations:
+            kernel_size = 3 if dilation > 1 else 1
+            padding = dilation if dilation > 1 else 0
+            conv = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=1,
+                dilation=dilation,
+                padding=padding,
+                bias=True)
+            self.aspp.append(conv)
+        self.gap = nn.AdaptiveAvgPool2d(1)
+
+    def forward(self, x):
+        avg_x = self.gap(x)
+        out = []
+        for aspp_idx in range(len(self.aspp)):
+            inp = avg_x if (aspp_idx == len(self.aspp) - 1) else x
+            out.append(F.relu_(self.aspp[aspp_idx](inp)))
+        out[-1] = out[-1].expand_as(out[-2])
+        out = torch.cat(out, dim=1)
+        return out
+
+
+@MODELS.register_module()
+class RFP(FPN):
+    """RFP (Recursive Feature Pyramid)
+
+    This is an implementation of RFP in `DetectoRS
+    <https://arxiv.org/pdf/2006.02334.pdf>`_. Different from standard FPN, the
+    input of RFP should be multi level features along with origin input image
+    of backbone.
+
+    Args:
+        rfp_steps (int): Number of unrolled steps of RFP.
+        rfp_backbone (dict): Configuration of the backbone for RFP.
+        aspp_out_channels (int): Number of output channels of ASPP module.
+        aspp_dilations (tuple[int]): Dilation rates of four branches.
+            Default: (1, 3, 6, 1)
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 rfp_steps,
+                 rfp_backbone,
+                 aspp_out_channels,
+                 aspp_dilations=(1, 3, 6, 1),
+                 init_cfg=None,
+                 **kwargs):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super().__init__(init_cfg=init_cfg, **kwargs)
+        self.rfp_steps = rfp_steps
+        # Be careful! Pretrained weights cannot be loaded when use
+        # nn.ModuleList
+        self.rfp_modules = ModuleList()
+        for rfp_idx in range(1, rfp_steps):
+            rfp_module = MODELS.build(rfp_backbone)
+            self.rfp_modules.append(rfp_module)
+        self.rfp_aspp = ASPP(self.out_channels, aspp_out_channels,
+                             aspp_dilations)
+        self.rfp_weight = nn.Conv2d(
+            self.out_channels,
+            1,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True)
+
+    def init_weights(self):
+        # Avoid using super().init_weights(), which may alter the default
+        # initialization of the modules in self.rfp_modules that have missing
+        # keys in the pretrained checkpoint.
+        for convs in [self.lateral_convs, self.fpn_convs]:
+            for m in convs.modules():
+                if isinstance(m, nn.Conv2d):
+                    xavier_init(m, distribution='uniform')
+        for rfp_idx in range(self.rfp_steps - 1):
+            self.rfp_modules[rfp_idx].init_weights()
+        constant_init(self.rfp_weight, 0)
+
+    def forward(self, inputs):
+        inputs = list(inputs)
+        assert len(inputs) == len(self.in_channels) + 1  # +1 for input image
+        img = inputs.pop(0)
+        # FPN forward
+        x = super().forward(tuple(inputs))
+        for rfp_idx in range(self.rfp_steps - 1):
+            rfp_feats = [x[0]] + list(
+                self.rfp_aspp(x[i]) for i in range(1, len(x)))
+            x_idx = self.rfp_modules[rfp_idx].rfp_forward(img, rfp_feats)
+            # FPN forward
+            x_idx = super().forward(x_idx)
+            x_new = []
+            for ft_idx in range(len(x_idx)):
+                add_weight = torch.sigmoid(self.rfp_weight(x_idx[ft_idx]))
+                x_new.append(add_weight * x_idx[ft_idx] +
+                             (1 - add_weight) * x[ft_idx])
+            x = x_new
+        return x
diff --git a/mmde/mmdet/models/necks/ssd_neck.py b/mmde/mmdet/models/necks/ssd_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ba319370b988b9c7e2d98c2f10607ff8f8b5c3
--- /dev/null
+++ b/mmde/mmdet/models/necks/ssd_neck.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class SSDNeck(BaseModule):
+    """Extra layers of SSD backbone to generate multi-scale feature maps.
+
+    Args:
+        in_channels (Sequence[int]): Number of input channels per scale.
+        out_channels (Sequence[int]): Number of output channels per scale.
+        level_strides (Sequence[int]): Stride of 3x3 conv per level.
+        level_paddings (Sequence[int]): Padding size of 3x3 conv per level.
+        l2_norm_scale (float|None): L2 normalization layer init scale.
+            If None, not use L2 normalization on the first input feature.
+        last_kernel_size (int): Kernel size of the last conv layer.
+            Default: 3.
+        use_depthwise (bool): Whether to use DepthwiseSeparableConv.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: None.
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU').
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 level_strides,
+                 level_paddings,
+                 l2_norm_scale=20.,
+                 last_kernel_size=3,
+                 use_depthwise=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=dict(type='ReLU'),
+                 init_cfg=[
+                     dict(
+                         type='Xavier', distribution='uniform',
+                         layer='Conv2d'),
+                     dict(type='Constant', val=1, layer='BatchNorm2d'),
+                 ]):
+        super(SSDNeck, self).__init__(init_cfg)
+        assert len(out_channels) > len(in_channels)
+        assert len(out_channels) - len(in_channels) == len(level_strides)
+        assert len(level_strides) == len(level_paddings)
+        assert in_channels == out_channels[:len(in_channels)]
+
+        if l2_norm_scale:
+            self.l2_norm = L2Norm(in_channels[0], l2_norm_scale)
+            self.init_cfg += [
+                dict(
+                    type='Constant',
+                    val=self.l2_norm.scale,
+                    override=dict(name='l2_norm'))
+            ]
+
+        self.extra_layers = nn.ModuleList()
+        extra_layer_channels = out_channels[len(in_channels):]
+        second_conv = DepthwiseSeparableConvModule if \
+            use_depthwise else ConvModule
+
+        for i, (out_channel, stride, padding) in enumerate(
+                zip(extra_layer_channels, level_strides, level_paddings)):
+            kernel_size = last_kernel_size \
+                if i == len(extra_layer_channels) - 1 else 3
+            per_lvl_convs = nn.Sequential(
+                ConvModule(
+                    out_channels[len(in_channels) - 1 + i],
+                    out_channel // 2,
+                    1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg),
+                second_conv(
+                    out_channel // 2,
+                    out_channel,
+                    kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+            self.extra_layers.append(per_lvl_convs)
+
+    def forward(self, inputs):
+        """Forward function."""
+        outs = [feat for feat in inputs]
+        if hasattr(self, 'l2_norm'):
+            outs[0] = self.l2_norm(outs[0])
+
+        feat = outs[-1]
+        for layer in self.extra_layers:
+            feat = layer(feat)
+            outs.append(feat)
+        return tuple(outs)
+
+
+class L2Norm(nn.Module):
+
+    def __init__(self, n_dims, scale=20., eps=1e-10):
+        """L2 normalization layer.
+
+        Args:
+            n_dims (int): Number of dimensions to be normalized
+            scale (float, optional): Defaults to 20..
+            eps (float, optional): Used to avoid division by zero.
+                Defaults to 1e-10.
+        """
+        super(L2Norm, self).__init__()
+        self.n_dims = n_dims
+        self.weight = nn.Parameter(torch.Tensor(self.n_dims))
+        self.eps = eps
+        self.scale = scale
+
+    def forward(self, x):
+        """Forward function."""
+        # normalization layer convert to FP32 in FP16 training
+        x_float = x.float()
+        norm = x_float.pow(2).sum(1, keepdim=True).sqrt() + self.eps
+        return (self.weight[None, :, None, None].float().expand_as(x_float) *
+                x_float / norm).type_as(x)
diff --git a/mmde/mmdet/models/necks/ssh.py b/mmde/mmdet/models/necks/ssh.py
new file mode 100644
index 0000000000000000000000000000000000000000..75a6561489d8d3634fc34829dafe819bbf066ed4
--- /dev/null
+++ b/mmde/mmdet/models/necks/ssh.py
@@ -0,0 +1,216 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+
+
+class SSHContextModule(BaseModule):
+    """This is an implementation of `SSH context module` described in `SSH:
+    Single Stage Headless Face Detector.
+
+    <https://arxiv.org/pdf/1708.03979.pdf>`_.
+
+    Args:
+        in_channels (int): Number of input channels used at each scale.
+        out_channels (int): Number of output channels used at each scale.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN').
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(init_cfg=init_cfg)
+        assert out_channels % 4 == 0
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.conv5x5_1 = ConvModule(
+            self.in_channels,
+            self.out_channels // 4,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+        )
+
+        self.conv5x5_2 = ConvModule(
+            self.out_channels // 4,
+            self.out_channels // 4,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+        self.conv7x7_2 = ConvModule(
+            self.out_channels // 4,
+            self.out_channels // 4,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+        )
+
+        self.conv7x7_3 = ConvModule(
+            self.out_channels // 4,
+            self.out_channels // 4,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None,
+        )
+
+    def forward(self, x: torch.Tensor) -> tuple:
+        conv5x5_1 = self.conv5x5_1(x)
+        conv5x5 = self.conv5x5_2(conv5x5_1)
+        conv7x7_2 = self.conv7x7_2(conv5x5_1)
+        conv7x7 = self.conv7x7_3(conv7x7_2)
+
+        return (conv5x5, conv7x7)
+
+
+class SSHDetModule(BaseModule):
+    """This is an implementation of `SSH detection module` described in `SSH:
+    Single Stage Headless Face Detector.
+
+    <https://arxiv.org/pdf/1708.03979.pdf>`_.
+
+    Args:
+        in_channels (int): Number of input channels used at each scale.
+        out_channels (int): Number of output channels used at each scale.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN').
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(init_cfg=init_cfg)
+        assert out_channels % 4 == 0
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.conv3x3 = ConvModule(
+            self.in_channels,
+            self.out_channels // 2,
+            3,
+            stride=1,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+        self.context_module = SSHContextModule(
+            in_channels=self.in_channels,
+            out_channels=self.out_channels,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        conv3x3 = self.conv3x3(x)
+        conv5x5, conv7x7 = self.context_module(x)
+        out = torch.cat([conv3x3, conv5x5, conv7x7], dim=1)
+        out = F.relu(out)
+
+        return out
+
+
+@MODELS.register_module()
+class SSH(BaseModule):
+    """`SSH Neck` used in `SSH: Single Stage Headless Face Detector.
+
+    <https://arxiv.org/pdf/1708.03979.pdf>`_.
+
+    Args:
+        num_scales (int): The number of scales / stages.
+        in_channels (list[int]): The number of input channels per scale.
+        out_channels (list[int]): The number of output channels  per scale.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN').
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+
+    Example:
+        >>> import torch
+        >>> in_channels = [8, 16, 32, 64]
+        >>> out_channels = [16, 32, 64, 128]
+        >>> scales = [340, 170, 84, 43]
+        >>> inputs = [torch.rand(1, c, s, s)
+        ...           for c, s in zip(in_channels, scales)]
+        >>> self = SSH(num_scales=4, in_channels=in_channels,
+        ...           out_channels=out_channels)
+        >>> outputs = self.forward(inputs)
+        >>> for i in range(len(outputs)):
+        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+        outputs[0].shape = torch.Size([1, 16, 340, 340])
+        outputs[1].shape = torch.Size([1, 32, 170, 170])
+        outputs[2].shape = torch.Size([1, 64, 84, 84])
+        outputs[3].shape = torch.Size([1, 128, 43, 43])
+    """
+
+    def __init__(self,
+                 num_scales: int,
+                 in_channels: List[int],
+                 out_channels: List[int],
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 init_cfg: OptMultiConfig = dict(
+                     type='Xavier', layer='Conv2d', distribution='uniform')):
+        super().__init__(init_cfg=init_cfg)
+        assert (num_scales == len(in_channels) == len(out_channels))
+        self.num_scales = num_scales
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        for idx in range(self.num_scales):
+            in_c, out_c = self.in_channels[idx], self.out_channels[idx]
+            self.add_module(
+                f'ssh_module{idx}',
+                SSHDetModule(
+                    in_channels=in_c,
+                    out_channels=out_c,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg))
+
+    def forward(self, inputs: Tuple[torch.Tensor]) -> tuple:
+        assert len(inputs) == self.num_scales
+
+        outs = []
+        for idx, x in enumerate(inputs):
+            ssh_module = getattr(self, f'ssh_module{idx}')
+            out = ssh_module(x)
+            outs.append(out)
+
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/yolo_neck.py b/mmde/mmdet/models/necks/yolo_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..48a6b1a4897c85083aa1e1e7d692263f66de67c3
--- /dev/null
+++ b/mmde/mmdet/models/necks/yolo_neck.py
@@ -0,0 +1,145 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+
+
+class DetectionBlock(BaseModule):
+    """Detection block in YOLO neck.
+
+    Let out_channels = n, the DetectionBlock contains:
+    Six ConvLayers, 1 Conv2D Layer and 1 YoloLayer.
+    The first 6 ConvLayers are formed the following way:
+        1x1xn, 3x3x2n, 1x1xn, 3x3x2n, 1x1xn, 3x3x2n.
+    The Conv2D layer is 1x1x255.
+    Some block will have branch after the fifth ConvLayer.
+    The input channel is arbitrary (in_channels)
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
+                 act_cfg: ConfigType = dict(
+                     type='LeakyReLU', negative_slope=0.1),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(DetectionBlock, self).__init__(init_cfg)
+        double_out_channels = out_channels * 2
+
+        # shortcut
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+        self.conv1 = ConvModule(in_channels, out_channels, 1, **cfg)
+        self.conv2 = ConvModule(
+            out_channels, double_out_channels, 3, padding=1, **cfg)
+        self.conv3 = ConvModule(double_out_channels, out_channels, 1, **cfg)
+        self.conv4 = ConvModule(
+            out_channels, double_out_channels, 3, padding=1, **cfg)
+        self.conv5 = ConvModule(double_out_channels, out_channels, 1, **cfg)
+
+    def forward(self, x: Tensor) -> Tensor:
+        tmp = self.conv1(x)
+        tmp = self.conv2(tmp)
+        tmp = self.conv3(tmp)
+        tmp = self.conv4(tmp)
+        out = self.conv5(tmp)
+        return out
+
+
+@MODELS.register_module()
+class YOLOV3Neck(BaseModule):
+    """The neck of YOLOV3.
+
+    It can be treated as a simplified version of FPN. It
+    will take the result from Darknet backbone and do some upsampling and
+    concatenation. It will finally output the detection result.
+
+    Note:
+        The input feats should be from top to bottom.
+            i.e., from high-lvl to low-lvl
+        But YOLOV3Neck will process them in reversed order.
+            i.e., from bottom (high-lvl) to top (low-lvl)
+
+    Args:
+        num_scales (int): The number of scales / stages.
+        in_channels (List[int]): The number of input channels per scale.
+        out_channels (List[int]): The number of output channels  per scale.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None.
+        norm_cfg (dict, optional): Dictionary to construct and config norm
+            layer. Default: dict(type='BN', requires_grad=True)
+        act_cfg (dict, optional): Config dict for activation layer.
+            Default: dict(type='LeakyReLU', negative_slope=0.1).
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 num_scales: int,
+                 in_channels: List[int],
+                 out_channels: List[int],
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
+                 act_cfg: ConfigType = dict(
+                     type='LeakyReLU', negative_slope=0.1),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(YOLOV3Neck, self).__init__(init_cfg)
+        assert (num_scales == len(in_channels) == len(out_channels))
+        self.num_scales = num_scales
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        # shortcut
+        cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
+
+        # To support arbitrary scales, the code looks awful, but it works.
+        # Better solution is welcomed.
+        self.detect1 = DetectionBlock(in_channels[0], out_channels[0], **cfg)
+        for i in range(1, self.num_scales):
+            in_c, out_c = self.in_channels[i], self.out_channels[i]
+            inter_c = out_channels[i - 1]
+            self.add_module(f'conv{i}', ConvModule(inter_c, out_c, 1, **cfg))
+            # in_c + out_c : High-lvl feats will be cat with low-lvl feats
+            self.add_module(f'detect{i+1}',
+                            DetectionBlock(in_c + out_c, out_c, **cfg))
+
+    def forward(self, feats=Tuple[Tensor]) -> Tuple[Tensor]:
+        assert len(feats) == self.num_scales
+
+        # processed from bottom (high-lvl) to top (low-lvl)
+        outs = []
+        out = self.detect1(feats[-1])
+        outs.append(out)
+
+        for i, x in enumerate(reversed(feats[:-1])):
+            conv = getattr(self, f'conv{i+1}')
+            tmp = conv(out)
+
+            # Cat with low-lvl feats
+            tmp = F.interpolate(tmp, scale_factor=2)
+            tmp = torch.cat((tmp, x), 1)
+
+            detect = getattr(self, f'detect{i+2}')
+            out = detect(tmp)
+            outs.append(out)
+
+        return tuple(outs)
diff --git a/mmde/mmdet/models/necks/yolox_pafpn.py b/mmde/mmdet/models/necks/yolox_pafpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ec3d12bfde8158c1a817fbf223a8eea94798667
--- /dev/null
+++ b/mmde/mmdet/models/necks/yolox_pafpn.py
@@ -0,0 +1,156 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from ..layers import CSPLayer
+
+
+@MODELS.register_module()
+class YOLOXPAFPN(BaseModule):
+    """Path Aggregation Network used in YOLOX.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 3
+        use_depthwise (bool): Whether to depthwise separable convolution in
+            blocks. Default: False
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: `dict(scale_factor=2, mode='nearest')`
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN')
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish')
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_csp_blocks=3,
+                 use_depthwise=False,
+                 upsample_cfg=dict(scale_factor=2, mode='nearest'),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 init_cfg=dict(
+                     type='Kaiming',
+                     layer='Conv2d',
+                     a=math.sqrt(5),
+                     distribution='uniform',
+                     mode='fan_in',
+                     nonlinearity='leaky_relu')):
+        super(YOLOXPAFPN, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
+
+        # build top-down blocks
+        self.upsample = nn.Upsample(**upsample_cfg)
+        self.reduce_layers = nn.ModuleList()
+        self.top_down_blocks = nn.ModuleList()
+        for idx in range(len(in_channels) - 1, 0, -1):
+            self.reduce_layers.append(
+                ConvModule(
+                    in_channels[idx],
+                    in_channels[idx - 1],
+                    1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+            self.top_down_blocks.append(
+                CSPLayer(
+                    in_channels[idx - 1] * 2,
+                    in_channels[idx - 1],
+                    num_blocks=num_csp_blocks,
+                    add_identity=False,
+                    use_depthwise=use_depthwise,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+
+        # build bottom-up blocks
+        self.downsamples = nn.ModuleList()
+        self.bottom_up_blocks = nn.ModuleList()
+        for idx in range(len(in_channels) - 1):
+            self.downsamples.append(
+                conv(
+                    in_channels[idx],
+                    in_channels[idx],
+                    3,
+                    stride=2,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+            self.bottom_up_blocks.append(
+                CSPLayer(
+                    in_channels[idx] * 2,
+                    in_channels[idx + 1],
+                    num_blocks=num_csp_blocks,
+                    add_identity=False,
+                    use_depthwise=use_depthwise,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+
+        self.out_convs = nn.ModuleList()
+        for i in range(len(in_channels)):
+            self.out_convs.append(
+                ConvModule(
+                    in_channels[i],
+                    out_channels,
+                    1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (tuple[Tensor]): input features.
+
+        Returns:
+            tuple[Tensor]: YOLOXPAFPN features.
+        """
+        assert len(inputs) == len(self.in_channels)
+
+        # top-down path
+        inner_outs = [inputs[-1]]
+        for idx in range(len(self.in_channels) - 1, 0, -1):
+            feat_heigh = inner_outs[0]
+            feat_low = inputs[idx - 1]
+            feat_heigh = self.reduce_layers[len(self.in_channels) - 1 - idx](
+                feat_heigh)
+            inner_outs[0] = feat_heigh
+
+            upsample_feat = self.upsample(feat_heigh)
+
+            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
+                torch.cat([upsample_feat, feat_low], 1))
+            inner_outs.insert(0, inner_out)
+
+        # bottom-up path
+        outs = [inner_outs[0]]
+        for idx in range(len(self.in_channels) - 1):
+            feat_low = outs[-1]
+            feat_height = inner_outs[idx + 1]
+            downsample_feat = self.downsamples[idx](feat_low)
+            out = self.bottom_up_blocks[idx](
+                torch.cat([downsample_feat, feat_height], 1))
+            outs.append(out)
+
+        # out convs
+        for idx, conv in enumerate(self.out_convs):
+            outs[idx] = conv(outs[idx])
+
+        return tuple(outs)
diff --git a/mmde/mmdet/models/reid/__init__.py b/mmde/mmdet/models/reid/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aca617f7dea0b8047891c666ddb684dbbd018c81
--- /dev/null
+++ b/mmde/mmdet/models/reid/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_reid import BaseReID
+from .fc_module import FcModule
+from .gap import GlobalAveragePooling
+from .linear_reid_head import LinearReIDHead
+
+__all__ = ['BaseReID', 'GlobalAveragePooling', 'LinearReIDHead', 'FcModule']
diff --git a/mmde/mmdet/models/reid/base_reid.py b/mmde/mmdet/models/reid/base_reid.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c45964394aa1651f846f2a7e63da3ee70b78909
--- /dev/null
+++ b/mmde/mmdet/models/reid/base_reid.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+import torch
+
+try:
+    import mmpretrain
+    from mmpretrain.models.classifiers import ImageClassifier
+except ImportError:
+    mmpretrain = None
+    ImageClassifier = object
+
+from mmdet.registry import MODELS
+from mmdet.structures import ReIDDataSample
+
+
+@MODELS.register_module()
+class BaseReID(ImageClassifier):
+    """Base model for re-identification."""
+
+    def __init__(self, *args, **kwargs):
+        if mmpretrain is None:
+            raise RuntimeError('Please run "pip install openmim" and '
+                               'run "mim install mmpretrain" to '
+                               'install mmpretrain first.')
+        super().__init__(*args, **kwargs)
+
+    def forward(self,
+                inputs: torch.Tensor,
+                data_samples: Optional[List[ReIDDataSample]] = None,
+                mode: str = 'tensor'):
+        """The unified entry for a forward process in both training and test.
+
+        The method should accept three modes: "tensor", "predict" and "loss":
+
+        - "tensor": Forward the whole network and return tensor or tuple of
+          tensor without any post-processing, same as a common nn.Module.
+        - "predict": Forward and return the predictions, which are fully
+          processed to a list of :obj:`ReIDDataSample`.
+        - "loss": Forward and return a dict of losses according to the given
+          inputs and data samples.
+
+        Note that this method doesn't handle neither back propagation nor
+        optimizer updating, which are done in the :meth:`train_step`.
+
+        Args:
+            inputs (torch.Tensor): The input tensor with shape
+                (N, C, H, W) or (N, T, C, H, W).
+            data_samples (List[ReIDDataSample], optional): The annotation
+                data of every sample. It's required if ``mode="loss"``.
+                Defaults to None.
+            mode (str): Return what kind of value. Defaults to 'tensor'.
+
+        Returns:
+            The return type depends on ``mode``.
+
+            - If ``mode="tensor"``, return a tensor or a tuple of tensor.
+            - If ``mode="predict"``, return a list of
+              :obj:`ReIDDataSample`.
+            - If ``mode="loss"``, return a dict of tensor.
+        """
+        if len(inputs.size()) == 5:
+            assert inputs.size(0) == 1
+            inputs = inputs[0]
+        return super().forward(inputs, data_samples, mode)
diff --git a/mmde/mmdet/models/reid/fc_module.py b/mmde/mmdet/models/reid/fc_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..76e7efd66e300a242bb250cc6ba5cc68ed722034
--- /dev/null
+++ b/mmde/mmdet/models/reid/fc_module.py
@@ -0,0 +1,71 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class FcModule(BaseModule):
+    """Fully-connected layer module.
+
+    Args:
+        in_channels (int): Input channels.
+        out_channels (int): Ourput channels.
+        norm_cfg (dict, optional): Configuration of normlization method
+            after fc. Defaults to None.
+        act_cfg (dict, optional): Configuration of activation method after fc.
+            Defaults to dict(type='ReLU').
+        inplace (bool, optional): Whether inplace the activatation module.
+            Defaults to True.
+        init_cfg (dict, optional): Initialization config dict.
+            Defaults to dict(type='Kaiming', layer='Linear').
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: dict = None,
+                 act_cfg: dict = dict(type='ReLU'),
+                 inplace: bool = True,
+                 init_cfg=dict(type='Kaiming', layer='Linear')):
+        super(FcModule, self).__init__(init_cfg)
+        assert norm_cfg is None or isinstance(norm_cfg, dict)
+        assert act_cfg is None or isinstance(act_cfg, dict)
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.inplace = inplace
+
+        self.with_norm = norm_cfg is not None
+        self.with_activation = act_cfg is not None
+
+        self.fc = nn.Linear(in_channels, out_channels)
+        # build normalization layers
+        if self.with_norm:
+            self.norm_name, norm = build_norm_layer(norm_cfg, out_channels)
+            self.add_module(self.norm_name, norm)
+
+        # build activation layer
+        if self.with_activation:
+            act_cfg_ = act_cfg.copy()
+            # nn.Tanh has no 'inplace' argument
+            if act_cfg_['type'] not in [
+                    'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish'
+            ]:
+                act_cfg_.setdefault('inplace', inplace)
+            self.activate = build_activation_layer(act_cfg_)
+
+    @property
+    def norm(self):
+        """Normalization."""
+        return getattr(self, self.norm_name)
+
+    def forward(self, x, activate=True, norm=True):
+        """Model forward."""
+        x = self.fc(x)
+        if norm and self.with_norm:
+            x = self.norm(x)
+        if activate and self.with_activation:
+            x = self.activate(x)
+        return x
diff --git a/mmde/mmdet/models/reid/gap.py b/mmde/mmdet/models/reid/gap.py
new file mode 100644
index 0000000000000000000000000000000000000000..aadc25e7144f2ca9efb66b496bf8ffa5504619ff
--- /dev/null
+++ b/mmde/mmdet/models/reid/gap.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class GlobalAveragePooling(BaseModule):
+    """Global Average Pooling neck.
+
+    Note that we use `view` to remove extra channel after pooling. We do not
+    use `squeeze` as it will also remove the batch dimension when the tensor
+    has a batch dimension of size 1, which can lead to unexpected errors.
+    """
+
+    def __init__(self, kernel_size=None, stride=None):
+        super(GlobalAveragePooling, self).__init__()
+        if kernel_size is None and stride is None:
+            self.gap = nn.AdaptiveAvgPool2d((1, 1))
+        else:
+            self.gap = nn.AvgPool2d(kernel_size, stride)
+
+    def forward(self, inputs):
+        if isinstance(inputs, tuple):
+            outs = tuple([self.gap(x) for x in inputs])
+            outs = tuple([
+                out.view(x.size(0),
+                         torch.tensor(out.size()[1:]).prod())
+                for out, x in zip(outs, inputs)
+            ])
+        elif isinstance(inputs, torch.Tensor):
+            outs = self.gap(inputs)
+            outs = outs.view(
+                inputs.size(0),
+                torch.tensor(outs.size()[1:]).prod())
+        else:
+            raise TypeError('neck inputs should be tuple or torch.tensor')
+        return outs
diff --git a/mmde/mmdet/models/reid/linear_reid_head.py b/mmde/mmdet/models/reid/linear_reid_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f35aaf6c2fc57b60e36017268e2a632df60ed342
--- /dev/null
+++ b/mmde/mmdet/models/reid/linear_reid_head.py
@@ -0,0 +1,202 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+try:
+    import mmpretrain
+    from mmpretrain.evaluation.metrics import Accuracy
+except ImportError:
+    mmpretrain = None
+
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from mmdet.structures import ReIDDataSample
+from .fc_module import FcModule
+
+
+@MODELS.register_module()
+class LinearReIDHead(BaseModule):
+    """Linear head for re-identification.
+
+    Args:
+        num_fcs (int): Number of fcs.
+        in_channels (int): Number of channels in the input.
+        fc_channels (int): Number of channels in the fcs.
+        out_channels (int): Number of channels in the output.
+        norm_cfg (dict, optional): Configuration of normlization method
+            after fc. Defaults to None.
+        act_cfg (dict, optional): Configuration of activation method after fc.
+            Defaults to None.
+        num_classes (int, optional): Number of the identities. Default to None.
+        loss_cls (dict, optional): Cross entropy loss to train the ReID module.
+            Defaults to None.
+        loss_triplet (dict, optional): Triplet loss to train the ReID module.
+            Defaults to None.
+        topk (int | Tuple[int]): Top-k accuracy. Defaults to ``(1, )``.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to dict(type='Normal',layer='Linear', mean=0, std=0.01,
+            bias=0).
+    """
+
+    def __init__(self,
+                 num_fcs: int,
+                 in_channels: int,
+                 fc_channels: int,
+                 out_channels: int,
+                 norm_cfg: Optional[dict] = None,
+                 act_cfg: Optional[dict] = None,
+                 num_classes: Optional[int] = None,
+                 loss_cls: Optional[dict] = None,
+                 loss_triplet: Optional[dict] = None,
+                 topk: Union[int, Tuple[int]] = (1, ),
+                 init_cfg: Union[dict, List[dict]] = dict(
+                     type='Normal', layer='Linear', mean=0, std=0.01, bias=0)):
+        if mmpretrain is None:
+            raise RuntimeError('Please run "pip install openmim" and '
+                               'run "mim install mmpretrain" to '
+                               'install mmpretrain first.')
+        super(LinearReIDHead, self).__init__(init_cfg=init_cfg)
+
+        assert isinstance(topk, (int, tuple))
+        if isinstance(topk, int):
+            topk = (topk, )
+        for _topk in topk:
+            assert _topk > 0, 'Top-k should be larger than 0'
+        self.topk = topk
+
+        if loss_cls is None:
+            if isinstance(num_classes, int):
+                warnings.warn('Since cross entropy is not set, '
+                              'the num_classes will be ignored.')
+            if loss_triplet is None:
+                raise ValueError('Please choose at least one loss in '
+                                 'triplet loss and cross entropy loss.')
+        elif not isinstance(num_classes, int):
+            raise TypeError('The num_classes must be a current number, '
+                            'if there is cross entropy loss.')
+        self.loss_cls = MODELS.build(loss_cls) if loss_cls else None
+        self.loss_triplet = MODELS.build(loss_triplet) \
+            if loss_triplet else None
+
+        self.num_fcs = num_fcs
+        self.in_channels = in_channels
+        self.fc_channels = fc_channels
+        self.out_channels = out_channels
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.num_classes = num_classes
+
+        self._init_layers()
+
+    def _init_layers(self):
+        """Initialize fc layers."""
+        self.fcs = nn.ModuleList()
+        for i in range(self.num_fcs):
+            in_channels = self.in_channels if i == 0 else self.fc_channels
+            self.fcs.append(
+                FcModule(in_channels, self.fc_channels, self.norm_cfg,
+                         self.act_cfg))
+        in_channels = self.in_channels if self.num_fcs == 0 else \
+            self.fc_channels
+        self.fc_out = nn.Linear(in_channels, self.out_channels)
+        if self.loss_cls:
+            self.bn = nn.BatchNorm1d(self.out_channels)
+            self.classifier = nn.Linear(self.out_channels, self.num_classes)
+
+    def forward(self, feats: Tuple[torch.Tensor]) -> torch.Tensor:
+        """The forward process."""
+        # Multiple stage inputs are acceptable
+        # but only the last stage will be used.
+        feats = feats[-1]
+
+        for m in self.fcs:
+            feats = m(feats)
+        feats = self.fc_out(feats)
+        return feats
+
+    def loss(self, feats: Tuple[torch.Tensor],
+             data_samples: List[ReIDDataSample]) -> dict:
+        """Calculate losses.
+
+        Args:
+            feats (tuple[Tensor]): The features extracted from the backbone.
+            data_samples (List[ReIDDataSample]): The annotation data of
+                every samples.
+
+        Returns:
+            dict: a dictionary of loss components
+        """
+        # The part can be traced by torch.fx
+        feats = self(feats)
+
+        # The part can not be traced by torch.fx
+        losses = self.loss_by_feat(feats, data_samples)
+        return losses
+
+    def loss_by_feat(self, feats: torch.Tensor,
+                     data_samples: List[ReIDDataSample]) -> dict:
+        """Unpack data samples and compute loss."""
+        losses = dict()
+        gt_label = torch.cat([i.gt_label.label for i in data_samples])
+        gt_label = gt_label.to(feats.device)
+
+        if self.loss_triplet:
+            losses['triplet_loss'] = self.loss_triplet(feats, gt_label)
+
+        if self.loss_cls:
+            feats_bn = self.bn(feats)
+            cls_score = self.classifier(feats_bn)
+            losses['ce_loss'] = self.loss_cls(cls_score, gt_label)
+            acc = Accuracy.calculate(cls_score, gt_label, topk=self.topk)
+            losses.update(
+                {f'accuracy_top-{k}': a
+                 for k, a in zip(self.topk, acc)})
+
+        return losses
+
+    def predict(
+            self,
+            feats: Tuple[torch.Tensor],
+            data_samples: List[ReIDDataSample] = None) -> List[ReIDDataSample]:
+        """Inference without augmentation.
+
+        Args:
+            feats (Tuple[Tensor]): The features extracted from the backbone.
+                Multiple stage inputs are acceptable but only the last stage
+                will be used.
+            data_samples (List[ReIDDataSample], optional): The annotation
+                data of every samples. If not None, set ``pred_label`` of
+                the input data samples. Defaults to None.
+
+        Returns:
+            List[ReIDDataSample]: A list of data samples which contains the
+            predicted results.
+        """
+        # The part can be traced by torch.fx
+        feats = self(feats)
+
+        # The part can not be traced by torch.fx
+        data_samples = self.predict_by_feat(feats, data_samples)
+
+        return data_samples
+
+    def predict_by_feat(
+            self,
+            feats: torch.Tensor,
+            data_samples: List[ReIDDataSample] = None) -> List[ReIDDataSample]:
+        """Add prediction features to data samples."""
+        if data_samples is not None:
+            for data_sample, feat in zip(data_samples, feats):
+                data_sample.pred_feature = feat
+        else:
+            data_samples = []
+            for feat in feats:
+                data_sample = ReIDDataSample()
+                data_sample.pred_feature = feat
+                data_samples.append(data_sample)
+
+        return data_samples
diff --git a/mmde/mmdet/models/roi_heads/__init__.py b/mmde/mmdet/models/roi_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bba5664cc5ae5229ddebcb42f7583364ca9f77d8
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/__init__.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_roi_head import BaseRoIHead
+from .bbox_heads import (BBoxHead, ConvFCBBoxHead, DIIHead,
+                         DoubleConvFCBBoxHead, SABLHead, SCNetBBoxHead,
+                         Shared2FCBBoxHead, Shared4Conv1FCBBoxHead)
+from .cascade_roi_head import CascadeRoIHead
+from .double_roi_head import DoubleHeadRoIHead
+from .dynamic_roi_head import DynamicRoIHead
+from .grid_roi_head import GridRoIHead
+from .htc_roi_head import HybridTaskCascadeRoIHead
+from .mask_heads import (CoarseMaskHead, FCNMaskHead, FeatureRelayHead,
+                         FusedSemanticHead, GlobalContextHead, GridHead,
+                         HTCMaskHead, MaskIoUHead, MaskPointHead,
+                         SCNetMaskHead, SCNetSemanticHead)
+from .mask_scoring_roi_head import MaskScoringRoIHead
+from .multi_instance_roi_head import MultiInstanceRoIHead
+from .pisa_roi_head import PISARoIHead
+from .point_rend_roi_head import PointRendRoIHead
+from .roi_extractors import (BaseRoIExtractor, GenericRoIExtractor,
+                             SingleRoIExtractor)
+from .scnet_roi_head import SCNetRoIHead
+from .shared_heads import ResLayer
+from .sparse_roi_head import SparseRoIHead
+from .standard_roi_head import StandardRoIHead
+from .trident_roi_head import TridentRoIHead
+
+__all__ = [
+    'BaseRoIHead', 'CascadeRoIHead', 'DoubleHeadRoIHead', 'MaskScoringRoIHead',
+    'HybridTaskCascadeRoIHead', 'GridRoIHead', 'ResLayer', 'BBoxHead',
+    'ConvFCBBoxHead', 'DIIHead', 'SABLHead', 'Shared2FCBBoxHead',
+    'StandardRoIHead', 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead',
+    'FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead', 'GridHead',
+    'MaskIoUHead', 'BaseRoIExtractor', 'GenericRoIExtractor',
+    'SingleRoIExtractor', 'PISARoIHead', 'PointRendRoIHead', 'MaskPointHead',
+    'CoarseMaskHead', 'DynamicRoIHead', 'SparseRoIHead', 'TridentRoIHead',
+    'SCNetRoIHead', 'SCNetMaskHead', 'SCNetSemanticHead', 'SCNetBBoxHead',
+    'FeatureRelayHead', 'GlobalContextHead', 'MultiInstanceRoIHead'
+]
diff --git a/mmde/mmdet/models/roi_heads/base_roi_head.py b/mmde/mmdet/models/roi_heads/base_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..405f80a73ecc5db7343d81ca55518160fcbc2b63
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/base_roi_head.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Tuple
+
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig
+
+
+class BaseRoIHead(BaseModule, metaclass=ABCMeta):
+    """Base class for RoIHeads."""
+
+    def __init__(self,
+                 bbox_roi_extractor: OptMultiConfig = None,
+                 bbox_head: OptMultiConfig = None,
+                 mask_roi_extractor: OptMultiConfig = None,
+                 mask_head: OptMultiConfig = None,
+                 shared_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        if shared_head is not None:
+            self.shared_head = MODELS.build(shared_head)
+
+        if bbox_head is not None:
+            self.init_bbox_head(bbox_roi_extractor, bbox_head)
+
+        if mask_head is not None:
+            self.init_mask_head(mask_roi_extractor, mask_head)
+
+        self.init_assigner_sampler()
+
+    @property
+    def with_bbox(self) -> bool:
+        """bool: whether the RoI head contains a `bbox_head`"""
+        return hasattr(self, 'bbox_head') and self.bbox_head is not None
+
+    @property
+    def with_mask(self) -> bool:
+        """bool: whether the RoI head contains a `mask_head`"""
+        return hasattr(self, 'mask_head') and self.mask_head is not None
+
+    @property
+    def with_shared_head(self) -> bool:
+        """bool: whether the RoI head contains a `shared_head`"""
+        return hasattr(self, 'shared_head') and self.shared_head is not None
+
+    @abstractmethod
+    def init_bbox_head(self, *args, **kwargs):
+        """Initialize ``bbox_head``"""
+        pass
+
+    @abstractmethod
+    def init_mask_head(self, *args, **kwargs):
+        """Initialize ``mask_head``"""
+        pass
+
+    @abstractmethod
+    def init_assigner_sampler(self, *args, **kwargs):
+        """Initialize assigner and sampler."""
+        pass
+
+    @abstractmethod
+    def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+             batch_data_samples: SampleList):
+        """Perform forward propagation and loss calculation of the roi head on
+        the features of the upstream network."""
+
+    def predict(self,
+                x: Tuple[Tensor],
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from upstream network. Each
+                has shape (N, C, H, W).
+            rpn_results_list (list[:obj:`InstanceData`]): list of region
+                proposals.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results to
+                the original image. Defaults to True.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        # TODO: nms_op in mmcv need be enhanced, the bbox result may get
+        #  difference when not rescale in bbox_head
+
+        # If it has the mask branch, the bbox branch does not need
+        # to be scaled to the original image scale, because the mask
+        # branch will scale both bbox and mask at the same time.
+        bbox_rescale = rescale if not self.with_mask else False
+        results_list = self.predict_bbox(
+            x,
+            batch_img_metas,
+            rpn_results_list,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=bbox_rescale)
+
+        if self.with_mask:
+            results_list = self.predict_mask(
+                x, batch_img_metas, results_list, rescale=rescale)
+
+        return results_list
diff --git a/mmde/mmdet/models/roi_heads/bbox_heads/__init__.py b/mmde/mmdet/models/roi_heads/bbox_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9e742abfecfc9dfe37b78822407fc92e9d64cc3
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/bbox_heads/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .bbox_head import BBoxHead
+from .convfc_bbox_head import (ConvFCBBoxHead, Shared2FCBBoxHead,
+                               Shared4Conv1FCBBoxHead)
+from .dii_head import DIIHead
+from .double_bbox_head import DoubleConvFCBBoxHead
+from .multi_instance_bbox_head import MultiInstanceBBoxHead
+from .sabl_head import SABLHead
+from .scnet_bbox_head import SCNetBBoxHead
+
+__all__ = [
+    'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead',
+    'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'SABLHead', 'DIIHead',
+    'SCNetBBoxHead', 'MultiInstanceBBoxHead'
+]
diff --git a/mmde/mmdet/models/roi_heads/bbox_heads/bbox_head.py b/mmde/mmdet/models/roi_heads/bbox_heads/bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b2e8aae0833ae0351b544099d79d296f082a76e
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/bbox_heads/bbox_head.py
@@ -0,0 +1,708 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.layers import multiclass_nms
+from mmdet.models.losses import accuracy
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.models.utils import empty_instances, multi_apply
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures.bbox import get_box_tensor, scale_boxes
+from mmdet.utils import ConfigType, InstanceList, OptMultiConfig
+
+
+@MODELS.register_module()
+class BBoxHead(BaseModule):
+    """Simplest RoI head, with only two fc layers for classification and
+    regression respectively."""
+
+    def __init__(self,
+                 with_avg_pool: bool = False,
+                 with_cls: bool = True,
+                 with_reg: bool = True,
+                 roi_feat_size: int = 7,
+                 in_channels: int = 256,
+                 num_classes: int = 80,
+                 bbox_coder: ConfigType = dict(
+                     type='DeltaXYWHBBoxCoder',
+                     clip_border=True,
+                     target_means=[0., 0., 0., 0.],
+                     target_stds=[0.1, 0.1, 0.2, 0.2]),
+                 predict_box_type: str = 'hbox',
+                 reg_class_agnostic: bool = False,
+                 reg_decoded_bbox: bool = False,
+                 reg_predictor_cfg: ConfigType = dict(type='Linear'),
+                 cls_predictor_cfg: ConfigType = dict(type='Linear'),
+                 loss_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 loss_bbox: ConfigType = dict(
+                     type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert with_cls or with_reg
+        self.with_avg_pool = with_avg_pool
+        self.with_cls = with_cls
+        self.with_reg = with_reg
+        self.roi_feat_size = _pair(roi_feat_size)
+        self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1]
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.predict_box_type = predict_box_type
+        self.reg_class_agnostic = reg_class_agnostic
+        self.reg_decoded_bbox = reg_decoded_bbox
+        self.reg_predictor_cfg = reg_predictor_cfg
+        self.cls_predictor_cfg = cls_predictor_cfg
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+
+        in_channels = self.in_channels
+        if self.with_avg_pool:
+            self.avg_pool = nn.AvgPool2d(self.roi_feat_size)
+        else:
+            in_channels *= self.roi_feat_area
+        if self.with_cls:
+            # need to add background class
+            if self.custom_cls_channels:
+                cls_channels = self.loss_cls.get_cls_channels(self.num_classes)
+            else:
+                cls_channels = num_classes + 1
+            cls_predictor_cfg_ = self.cls_predictor_cfg.copy()
+            cls_predictor_cfg_.update(
+                in_features=in_channels, out_features=cls_channels)
+            self.fc_cls = MODELS.build(cls_predictor_cfg_)
+        if self.with_reg:
+            box_dim = self.bbox_coder.encode_size
+            out_dim_reg = box_dim if reg_class_agnostic else \
+                box_dim * num_classes
+            reg_predictor_cfg_ = self.reg_predictor_cfg.copy()
+            if isinstance(reg_predictor_cfg_, (dict, ConfigDict)):
+                reg_predictor_cfg_.update(
+                    in_features=in_channels, out_features=out_dim_reg)
+            self.fc_reg = MODELS.build(reg_predictor_cfg_)
+        self.debug_imgs = None
+        if init_cfg is None:
+            self.init_cfg = []
+            if self.with_cls:
+                self.init_cfg += [
+                    dict(
+                        type='Normal', std=0.01, override=dict(name='fc_cls'))
+                ]
+            if self.with_reg:
+                self.init_cfg += [
+                    dict(
+                        type='Normal', std=0.001, override=dict(name='fc_reg'))
+                ]
+
+    # TODO: Create a SeasawBBoxHead to simplified logic in BBoxHead
+    @property
+    def custom_cls_channels(self) -> bool:
+        """get custom_cls_channels from loss_cls."""
+        return getattr(self.loss_cls, 'custom_cls_channels', False)
+
+    # TODO: Create a SeasawBBoxHead to simplified logic in BBoxHead
+    @property
+    def custom_activation(self) -> bool:
+        """get custom_activation from loss_cls."""
+        return getattr(self.loss_cls, 'custom_activation', False)
+
+    # TODO: Create a SeasawBBoxHead to simplified logic in BBoxHead
+    @property
+    def custom_accuracy(self) -> bool:
+        """get custom_accuracy from loss_cls."""
+        return getattr(self.loss_cls, 'custom_accuracy', False)
+
+    def forward(self, x: Tuple[Tensor]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores and bbox prediction.
+
+                - cls_score (Tensor): Classification scores for all
+                  scale levels, each is a 4D-tensor, the channels number
+                  is num_base_priors * num_classes.
+                - bbox_pred (Tensor): Box energies / deltas for all
+                  scale levels, each is a 4D-tensor, the channels number
+                  is num_base_priors * 4.
+        """
+        if self.with_avg_pool:
+            if x.numel() > 0:
+                x = self.avg_pool(x)
+                x = x.view(x.size(0), -1)
+            else:
+                # avg_pool does not support empty tensor,
+                # so use torch.mean instead it
+                x = torch.mean(x, dim=(-1, -2))
+        cls_score = self.fc_cls(x) if self.with_cls else None
+        bbox_pred = self.fc_reg(x) if self.with_reg else None
+        return cls_score, bbox_pred
+
+    def _get_targets_single(self, pos_priors: Tensor, neg_priors: Tensor,
+                            pos_gt_bboxes: Tensor, pos_gt_labels: Tensor,
+                            cfg: ConfigDict) -> tuple:
+        """Calculate the ground truth for proposals in the single image
+        according to the sampling results.
+
+        Args:
+            pos_priors (Tensor): Contains all the positive boxes,
+                has shape (num_pos, 4), the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            neg_priors (Tensor): Contains all the negative boxes,
+                has shape (num_neg, 4), the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            pos_gt_bboxes (Tensor): Contains gt_boxes for
+                all positive samples, has shape (num_pos, 4),
+                the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            pos_gt_labels (Tensor): Contains gt_labels for
+                all positive samples, has shape (num_pos, ).
+            cfg (obj:`ConfigDict`): `train_cfg` of R-CNN.
+
+        Returns:
+            Tuple[Tensor]: Ground truth for proposals
+            in a single image. Containing the following Tensors:
+
+                - labels(Tensor): Gt_labels for all proposals, has
+                  shape (num_proposals,).
+                - label_weights(Tensor): Labels_weights for all
+                  proposals, has shape (num_proposals,).
+                - bbox_targets(Tensor):Regression target for all
+                  proposals, has shape (num_proposals, 4), the
+                  last dimension 4 represents [tl_x, tl_y, br_x, br_y].
+                - bbox_weights(Tensor):Regression weights for all
+                  proposals, has shape (num_proposals, 4).
+        """
+        num_pos = pos_priors.size(0)
+        num_neg = neg_priors.size(0)
+        num_samples = num_pos + num_neg
+
+        # original implementation uses new_zeros since BG are set to be 0
+        # now use empty & fill because BG cat_id = num_classes,
+        # FG cat_id = [0, num_classes-1]
+        labels = pos_priors.new_full((num_samples, ),
+                                     self.num_classes,
+                                     dtype=torch.long)
+        reg_dim = pos_gt_bboxes.size(-1) if self.reg_decoded_bbox \
+            else self.bbox_coder.encode_size
+        label_weights = pos_priors.new_zeros(num_samples)
+        bbox_targets = pos_priors.new_zeros(num_samples, reg_dim)
+        bbox_weights = pos_priors.new_zeros(num_samples, reg_dim)
+        if num_pos > 0:
+            labels[:num_pos] = pos_gt_labels
+            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
+            label_weights[:num_pos] = pos_weight
+            if not self.reg_decoded_bbox:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    pos_priors, pos_gt_bboxes)
+            else:
+                # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
+                # is applied directly on the decoded bounding boxes, both
+                # the predicted boxes and regression targets should be with
+                # absolute coordinate format.
+                pos_bbox_targets = get_box_tensor(pos_gt_bboxes)
+            bbox_targets[:num_pos, :] = pos_bbox_targets
+            bbox_weights[:num_pos, :] = 1
+        if num_neg > 0:
+            label_weights[-num_neg:] = 1.0
+
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    def get_targets(self,
+                    sampling_results: List[SamplingResult],
+                    rcnn_train_cfg: ConfigDict,
+                    concat: bool = True) -> tuple:
+        """Calculate the ground truth for all samples in a batch according to
+        the sampling_results.
+
+        Almost the same as the implementation in bbox_head, we passed
+        additional parameters pos_inds_list and neg_inds_list to
+        `_get_targets_single` function.
+
+        Args:
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+            concat (bool): Whether to concatenate the results of all
+                the images in a single batch.
+
+        Returns:
+            Tuple[Tensor]: Ground truth for proposals in a single image.
+            Containing the following list of Tensors:
+
+            - labels (list[Tensor],Tensor): Gt_labels for all
+                proposals in a batch, each tensor in list has
+                shape (num_proposals,) when `concat=False`, otherwise
+                just a single tensor has shape (num_all_proposals,).
+            - label_weights (list[Tensor]): Labels_weights for
+                all proposals in a batch, each tensor in list has
+                shape (num_proposals,) when `concat=False`, otherwise
+                just a single tensor has shape (num_all_proposals,).
+            - bbox_targets (list[Tensor],Tensor): Regression target
+                for all proposals in a batch, each tensor in list
+                has shape (num_proposals, 4) when `concat=False`,
+                otherwise just a single tensor has shape
+                (num_all_proposals, 4), the last dimension 4 represents
+                [tl_x, tl_y, br_x, br_y].
+            - bbox_weights (list[tensor],Tensor): Regression weights for
+                all proposals in a batch, each tensor in list has shape
+                (num_proposals, 4) when `concat=False`, otherwise just a
+                single tensor has shape (num_all_proposals, 4).
+        """
+        pos_priors_list = [res.pos_priors for res in sampling_results]
+        neg_priors_list = [res.neg_priors for res in sampling_results]
+        pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
+        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
+        labels, label_weights, bbox_targets, bbox_weights = multi_apply(
+            self._get_targets_single,
+            pos_priors_list,
+            neg_priors_list,
+            pos_gt_bboxes_list,
+            pos_gt_labels_list,
+            cfg=rcnn_train_cfg)
+
+        if concat:
+            labels = torch.cat(labels, 0)
+            label_weights = torch.cat(label_weights, 0)
+            bbox_targets = torch.cat(bbox_targets, 0)
+            bbox_weights = torch.cat(bbox_weights, 0)
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    def loss_and_target(self,
+                        cls_score: Tensor,
+                        bbox_pred: Tensor,
+                        rois: Tensor,
+                        sampling_results: List[SamplingResult],
+                        rcnn_train_cfg: ConfigDict,
+                        concat: bool = True,
+                        reduction_override: Optional[str] = None) -> dict:
+        """Calculate the loss based on the features extracted by the bbox head.
+
+        Args:
+            cls_score (Tensor): Classification prediction
+                results of all class, has shape
+                (batch_size * num_proposals_single_image, num_classes)
+            bbox_pred (Tensor): Regression prediction results,
+                has shape
+                (batch_size * num_proposals_single_image, 4), the last
+                dimension 4 represents [tl_x, tl_y, br_x, br_y].
+            rois (Tensor): RoIs with the shape
+                (batch_size * num_proposals_single_image, 5) where the first
+                column indicates batch id of each RoI.
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+            concat (bool): Whether to concatenate the results of all
+                the images in a single batch. Defaults to True.
+            reduction_override (str, optional): The reduction
+                method used to override the original reduction
+                method of the loss. Options are "none",
+                "mean" and "sum". Defaults to None,
+
+        Returns:
+            dict: A dictionary of loss and targets components.
+                The targets are only used for cascade rcnn.
+        """
+
+        cls_reg_targets = self.get_targets(
+            sampling_results, rcnn_train_cfg, concat=concat)
+        losses = self.loss(
+            cls_score,
+            bbox_pred,
+            rois,
+            *cls_reg_targets,
+            reduction_override=reduction_override)
+
+        # cls_reg_targets is only for cascade rcnn
+        return dict(loss_bbox=losses, bbox_targets=cls_reg_targets)
+
+    def loss(self,
+             cls_score: Tensor,
+             bbox_pred: Tensor,
+             rois: Tensor,
+             labels: Tensor,
+             label_weights: Tensor,
+             bbox_targets: Tensor,
+             bbox_weights: Tensor,
+             reduction_override: Optional[str] = None) -> dict:
+        """Calculate the loss based on the network predictions and targets.
+
+        Args:
+            cls_score (Tensor): Classification prediction
+                results of all class, has shape
+                (batch_size * num_proposals_single_image, num_classes)
+            bbox_pred (Tensor): Regression prediction results,
+                has shape
+                (batch_size * num_proposals_single_image, 4), the last
+                dimension 4 represents [tl_x, tl_y, br_x, br_y].
+            rois (Tensor): RoIs with the shape
+                (batch_size * num_proposals_single_image, 5) where the first
+                column indicates batch id of each RoI.
+            labels (Tensor): Gt_labels for all proposals in a batch, has
+                shape (batch_size * num_proposals_single_image, ).
+            label_weights (Tensor): Labels_weights for all proposals in a
+                batch, has shape (batch_size * num_proposals_single_image, ).
+            bbox_targets (Tensor): Regression target for all proposals in a
+                batch, has shape (batch_size * num_proposals_single_image, 4),
+                the last dimension 4 represents [tl_x, tl_y, br_x, br_y].
+            bbox_weights (Tensor): Regression weights for all proposals in a
+                batch, has shape (batch_size * num_proposals_single_image, 4).
+            reduction_override (str, optional): The reduction
+                method used to override the original reduction
+                method of the loss. Options are "none",
+                "mean" and "sum". Defaults to None,
+
+        Returns:
+            dict: A dictionary of loss.
+        """
+
+        losses = dict()
+
+        if cls_score is not None:
+            avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.)
+            if cls_score.numel() > 0:
+                loss_cls_ = self.loss_cls(
+                    cls_score,
+                    labels,
+                    label_weights,
+                    avg_factor=avg_factor,
+                    reduction_override=reduction_override)
+                if isinstance(loss_cls_, dict):
+                    losses.update(loss_cls_)
+                else:
+                    losses['loss_cls'] = loss_cls_
+                if self.custom_activation:
+                    acc_ = self.loss_cls.get_accuracy(cls_score, labels)
+                    losses.update(acc_)
+                else:
+                    losses['acc'] = accuracy(cls_score, labels)
+        if bbox_pred is not None:
+            bg_class_ind = self.num_classes
+            # 0~self.num_classes-1 are FG, self.num_classes is BG
+            pos_inds = (labels >= 0) & (labels < bg_class_ind)
+            # do not perform bounding box regression for BG anymore.
+            if pos_inds.any():
+                if self.reg_decoded_bbox:
+                    # When the regression loss (e.g. `IouLoss`,
+                    # `GIouLoss`, `DIouLoss`) is applied directly on
+                    # the decoded bounding boxes, it decodes the
+                    # already encoded coordinates to absolute format.
+                    bbox_pred = self.bbox_coder.decode(rois[:, 1:], bbox_pred)
+                    bbox_pred = get_box_tensor(bbox_pred)
+                if self.reg_class_agnostic:
+                    pos_bbox_pred = bbox_pred.view(
+                        bbox_pred.size(0), -1)[pos_inds.type(torch.bool)]
+                else:
+                    pos_bbox_pred = bbox_pred.view(
+                        bbox_pred.size(0), self.num_classes,
+                        -1)[pos_inds.type(torch.bool),
+                            labels[pos_inds.type(torch.bool)]]
+                losses['loss_bbox'] = self.loss_bbox(
+                    pos_bbox_pred,
+                    bbox_targets[pos_inds.type(torch.bool)],
+                    bbox_weights[pos_inds.type(torch.bool)],
+                    avg_factor=bbox_targets.size(0),
+                    reduction_override=reduction_override)
+            else:
+                losses['loss_bbox'] = bbox_pred[pos_inds].sum()
+
+        return losses
+
+    def predict_by_feat(self,
+                        rois: Tuple[Tensor],
+                        cls_scores: Tuple[Tensor],
+                        bbox_preds: Tuple[Tensor],
+                        batch_img_metas: List[dict],
+                        rcnn_test_cfg: Optional[ConfigDict] = None,
+                        rescale: bool = False) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            rois (tuple[Tensor]): Tuple of boxes to be transformed.
+                Each has shape  (num_boxes, 5). last dimension 5 arrange as
+                (batch_index, x1, y1, x2, y2).
+            cls_scores (tuple[Tensor]): Tuple of box scores, each has shape
+                (num_boxes, num_classes + 1).
+            bbox_preds (tuple[Tensor]): Tuple of box energies / deltas, each
+                has shape (num_boxes, num_classes * 4).
+            batch_img_metas (list[dict]): List of image information.
+            rcnn_test_cfg (obj:`ConfigDict`, optional): `test_cfg` of R-CNN.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Instance segmentation
+            results of each image after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_scores) == len(bbox_preds)
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            results = self._predict_by_feat_single(
+                roi=rois[img_id],
+                cls_score=cls_scores[img_id],
+                bbox_pred=bbox_preds[img_id],
+                img_meta=img_meta,
+                rescale=rescale,
+                rcnn_test_cfg=rcnn_test_cfg)
+            result_list.append(results)
+
+        return result_list
+
+    def _predict_by_feat_single(
+            self,
+            roi: Tensor,
+            cls_score: Tensor,
+            bbox_pred: Tensor,
+            img_meta: dict,
+            rescale: bool = False,
+            rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5).
+                last dimension 5 arrange as (batch_index, x1, y1, x2, y2).
+            cls_score (Tensor): Box scores, has shape
+                (num_boxes, num_classes + 1).
+            bbox_pred (Tensor): Box energies / deltas.
+                has shape (num_boxes, num_classes * 4).
+            img_meta (dict): image information.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head.
+                Defaults to None
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image\
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        results = InstanceData()
+        if roi.shape[0] == 0:
+            return empty_instances([img_meta],
+                                   roi.device,
+                                   task_type='bbox',
+                                   instance_results=[results],
+                                   box_type=self.predict_box_type,
+                                   use_box_type=False,
+                                   num_classes=self.num_classes,
+                                   score_per_cls=rcnn_test_cfg is None)[0]
+
+        # some loss (Seesaw loss..) may have custom activation
+        if self.custom_cls_channels:
+            scores = self.loss_cls.get_activation(cls_score)
+        else:
+            scores = F.softmax(
+                cls_score, dim=-1) if cls_score is not None else None
+
+        img_shape = img_meta['img_shape']
+        num_rois = roi.size(0)
+        # bbox_pred would be None in some detector when with_reg is False,
+        # e.g. Grid R-CNN.
+        if bbox_pred is not None:
+            num_classes = 1 if self.reg_class_agnostic else self.num_classes
+            roi = roi.repeat_interleave(num_classes, dim=0)
+            bbox_pred = bbox_pred.view(-1, self.bbox_coder.encode_size)
+            bboxes = self.bbox_coder.decode(
+                roi[..., 1:], bbox_pred, max_shape=img_shape)
+        else:
+            bboxes = roi[:, 1:].clone()
+            if img_shape is not None and bboxes.size(-1) == 4:
+                bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1])
+                bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0])
+
+        if rescale and bboxes.size(0) > 0:
+            assert img_meta.get('scale_factor') is not None
+            scale_factor = [1 / s for s in img_meta['scale_factor']]
+            bboxes = scale_boxes(bboxes, scale_factor)
+
+        # Get the inside tensor when `bboxes` is a box type
+        bboxes = get_box_tensor(bboxes)
+        box_dim = bboxes.size(-1)
+        bboxes = bboxes.view(num_rois, -1)
+
+        if rcnn_test_cfg is None:
+            # This means that it is aug test.
+            # It needs to return the raw results without nms.
+            results.bboxes = bboxes
+            results.scores = scores
+        else:
+            det_bboxes, det_labels = multiclass_nms(
+                bboxes,
+                scores,
+                rcnn_test_cfg.score_thr,
+                rcnn_test_cfg.nms,
+                rcnn_test_cfg.max_per_img,
+                box_dim=box_dim)
+            results.bboxes = det_bboxes[:, :-1]
+            results.scores = det_bboxes[:, -1]
+            results.labels = det_labels
+        return results
+
+    def refine_bboxes(self, sampling_results: Union[List[SamplingResult],
+                                                    InstanceList],
+                      bbox_results: dict,
+                      batch_img_metas: List[dict]) -> InstanceList:
+        """Refine bboxes during training.
+
+        Args:
+            sampling_results (List[:obj:`SamplingResult`] or
+                List[:obj:`InstanceData`]): Sampling results.
+                :obj:`SamplingResult` is the real sampling results
+                calculate from bbox_head, while :obj:`InstanceData` is
+                fake sampling results, e.g., in Sparse R-CNN or QueryInst, etc.
+            bbox_results (dict): Usually is a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `rois` (Tensor): RoIs with the shape (n, 5) where the first
+                  column indicates batch id of each RoI.
+                - `bbox_targets` (tuple):  Ground truth for proposals in a
+                  single image. Containing the following list of Tensors:
+                  (labels, label_weights, bbox_targets, bbox_weights)
+            batch_img_metas (List[dict]): List of image information.
+
+        Returns:
+            list[:obj:`InstanceData`]: Refined bboxes of each image.
+
+        Example:
+            >>> # xdoctest: +REQUIRES(module:kwarray)
+            >>> import numpy as np
+            >>> from mmdet.models.task_modules.samplers.
+            ... sampling_result import random_boxes
+            >>> from mmdet.models.task_modules.samplers import SamplingResult
+            >>> self = BBoxHead(reg_class_agnostic=True)
+            >>> n_roi = 2
+            >>> n_img = 4
+            >>> scale = 512
+            >>> rng = np.random.RandomState(0)
+            ... batch_img_metas = [{'img_shape': (scale, scale)}
+            >>>                     for _ in range(n_img)]
+            >>> sampling_results = [SamplingResult.random(rng=10)
+            ...                     for _ in range(n_img)]
+            >>> # Create rois in the expected format
+            >>> roi_boxes = random_boxes(n_roi, scale=scale, rng=rng)
+            >>> img_ids = torch.randint(0, n_img, (n_roi,))
+            >>> img_ids = img_ids.float()
+            >>> rois = torch.cat([img_ids[:, None], roi_boxes], dim=1)
+            >>> # Create other args
+            >>> labels = torch.randint(0, 81, (scale,)).long()
+            >>> bbox_preds = random_boxes(n_roi, scale=scale, rng=rng)
+            >>> cls_score = torch.randn((scale, 81))
+            ... # For each image, pretend random positive boxes are gts
+            >>> bbox_targets = (labels, None, None, None)
+            ... bbox_results = dict(rois=rois, bbox_pred=bbox_preds,
+            ...                     cls_score=cls_score,
+            ...                     bbox_targets=bbox_targets)
+            >>> bboxes_list = self.refine_bboxes(sampling_results,
+            ...                                  bbox_results,
+            ...                                  batch_img_metas)
+            >>> print(bboxes_list)
+        """
+        pos_is_gts = [res.pos_is_gt for res in sampling_results]
+        # bbox_targets is a tuple
+        labels = bbox_results['bbox_targets'][0]
+        cls_scores = bbox_results['cls_score']
+        rois = bbox_results['rois']
+        bbox_preds = bbox_results['bbox_pred']
+        if self.custom_activation:
+            # TODO: Create a SeasawBBoxHead to simplified logic in BBoxHead
+            cls_scores = self.loss_cls.get_activation(cls_scores)
+        if cls_scores.numel() == 0:
+            return None
+        if cls_scores.shape[-1] == self.num_classes + 1:
+            # remove background class
+            cls_scores = cls_scores[:, :-1]
+        elif cls_scores.shape[-1] != self.num_classes:
+            raise ValueError('The last dim of `cls_scores` should equal to '
+                             '`num_classes` or `num_classes + 1`,'
+                             f'but got {cls_scores.shape[-1]}.')
+        labels = torch.where(labels == self.num_classes, cls_scores.argmax(1),
+                             labels)
+
+        img_ids = rois[:, 0].long().unique(sorted=True)
+        assert img_ids.numel() <= len(batch_img_metas)
+
+        results_list = []
+        for i in range(len(batch_img_metas)):
+            inds = torch.nonzero(
+                rois[:, 0] == i, as_tuple=False).squeeze(dim=1)
+            num_rois = inds.numel()
+
+            bboxes_ = rois[inds, 1:]
+            label_ = labels[inds]
+            bbox_pred_ = bbox_preds[inds]
+            img_meta_ = batch_img_metas[i]
+            pos_is_gts_ = pos_is_gts[i]
+
+            bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_,
+                                           img_meta_)
+            # filter gt bboxes
+            pos_keep = 1 - pos_is_gts_
+            keep_inds = pos_is_gts_.new_ones(num_rois)
+            keep_inds[:len(pos_is_gts_)] = pos_keep
+            results = InstanceData(bboxes=bboxes[keep_inds.type(torch.bool)])
+            results_list.append(results)
+
+        return results_list
+
+    def regress_by_class(self, priors: Tensor, label: Tensor,
+                         bbox_pred: Tensor, img_meta: dict) -> Tensor:
+        """Regress the bbox for the predicted class. Used in Cascade R-CNN.
+
+        Args:
+            priors (Tensor): Priors from `rpn_head` or last stage
+                `bbox_head`, has shape (num_proposals, 4).
+            label (Tensor): Only used when `self.reg_class_agnostic`
+                is False, has shape (num_proposals, ).
+            bbox_pred (Tensor): Regression prediction of
+                current stage `bbox_head`. When `self.reg_class_agnostic`
+                is False, it has shape (n, num_classes * 4), otherwise
+                it has shape (n, 4).
+            img_meta (dict): Image meta info.
+
+        Returns:
+            Tensor: Regressed bboxes, the same shape as input rois.
+        """
+        reg_dim = self.bbox_coder.encode_size
+        if not self.reg_class_agnostic:
+            label = label * reg_dim
+            inds = torch.stack([label + i for i in range(reg_dim)], 1)
+            bbox_pred = torch.gather(bbox_pred, 1, inds)
+        assert bbox_pred.size()[1] == reg_dim
+
+        max_shape = img_meta['img_shape']
+        regressed_bboxes = self.bbox_coder.decode(
+            priors, bbox_pred, max_shape=max_shape)
+        return regressed_bboxes
diff --git a/mmde/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py b/mmde/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb6aadd86d34af3605d432492931442026432cc8
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py
@@ -0,0 +1,249 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.config import ConfigDict
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .bbox_head import BBoxHead
+
+
+@MODELS.register_module()
+class ConvFCBBoxHead(BBoxHead):
+    r"""More general bbox head, with shared conv and fc layers and two optional
+    separated branches.
+
+    .. code-block:: none
+
+                                    /-> cls convs -> cls fcs -> cls
+        shared convs -> shared fcs
+                                    \-> reg convs -> reg fcs -> reg
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_shared_convs: int = 0,
+                 num_shared_fcs: int = 0,
+                 num_cls_convs: int = 0,
+                 num_cls_fcs: int = 0,
+                 num_reg_convs: int = 0,
+                 num_reg_fcs: int = 0,
+                 conv_out_channels: int = 256,
+                 fc_out_channels: int = 1024,
+                 conv_cfg: Optional[Union[dict, ConfigDict]] = None,
+                 norm_cfg: Optional[Union[dict, ConfigDict]] = None,
+                 init_cfg: Optional[Union[dict, ConfigDict]] = None,
+                 *args,
+                 **kwargs) -> None:
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+        assert (num_shared_convs + num_shared_fcs + num_cls_convs +
+                num_cls_fcs + num_reg_convs + num_reg_fcs > 0)
+        if num_cls_convs > 0 or num_reg_convs > 0:
+            assert num_shared_fcs == 0
+        if not self.with_cls:
+            assert num_cls_convs == 0 and num_cls_fcs == 0
+        if not self.with_reg:
+            assert num_reg_convs == 0 and num_reg_fcs == 0
+        self.num_shared_convs = num_shared_convs
+        self.num_shared_fcs = num_shared_fcs
+        self.num_cls_convs = num_cls_convs
+        self.num_cls_fcs = num_cls_fcs
+        self.num_reg_convs = num_reg_convs
+        self.num_reg_fcs = num_reg_fcs
+        self.conv_out_channels = conv_out_channels
+        self.fc_out_channels = fc_out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        # add shared convs and fcs
+        self.shared_convs, self.shared_fcs, last_layer_dim = \
+            self._add_conv_fc_branch(
+                self.num_shared_convs, self.num_shared_fcs, self.in_channels,
+                True)
+        self.shared_out_channels = last_layer_dim
+
+        # add cls specific branch
+        self.cls_convs, self.cls_fcs, self.cls_last_dim = \
+            self._add_conv_fc_branch(
+                self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels)
+
+        # add reg specific branch
+        self.reg_convs, self.reg_fcs, self.reg_last_dim = \
+            self._add_conv_fc_branch(
+                self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels)
+
+        if self.num_shared_fcs == 0 and not self.with_avg_pool:
+            if self.num_cls_fcs == 0:
+                self.cls_last_dim *= self.roi_feat_area
+            if self.num_reg_fcs == 0:
+                self.reg_last_dim *= self.roi_feat_area
+
+        self.relu = nn.ReLU(inplace=True)
+        # reconstruct fc_cls and fc_reg since input channels are changed
+        if self.with_cls:
+            if self.custom_cls_channels:
+                cls_channels = self.loss_cls.get_cls_channels(self.num_classes)
+            else:
+                cls_channels = self.num_classes + 1
+            cls_predictor_cfg_ = self.cls_predictor_cfg.copy()
+            cls_predictor_cfg_.update(
+                in_features=self.cls_last_dim, out_features=cls_channels)
+            self.fc_cls = MODELS.build(cls_predictor_cfg_)
+        if self.with_reg:
+            box_dim = self.bbox_coder.encode_size
+            out_dim_reg = box_dim if self.reg_class_agnostic else \
+                box_dim * self.num_classes
+            reg_predictor_cfg_ = self.reg_predictor_cfg.copy()
+            if isinstance(reg_predictor_cfg_, (dict, ConfigDict)):
+                reg_predictor_cfg_.update(
+                    in_features=self.reg_last_dim, out_features=out_dim_reg)
+            self.fc_reg = MODELS.build(reg_predictor_cfg_)
+
+        if init_cfg is None:
+            # when init_cfg is None,
+            # It has been set to
+            # [[dict(type='Normal', std=0.01, override=dict(name='fc_cls'))],
+            #  [dict(type='Normal', std=0.001, override=dict(name='fc_reg'))]
+            # after `super(ConvFCBBoxHead, self).__init__()`
+            # we only need to append additional configuration
+            # for `shared_fcs`, `cls_fcs` and `reg_fcs`
+            self.init_cfg += [
+                dict(
+                    type='Xavier',
+                    distribution='uniform',
+                    override=[
+                        dict(name='shared_fcs'),
+                        dict(name='cls_fcs'),
+                        dict(name='reg_fcs')
+                    ])
+            ]
+
+    def _add_conv_fc_branch(self,
+                            num_branch_convs: int,
+                            num_branch_fcs: int,
+                            in_channels: int,
+                            is_shared: bool = False) -> tuple:
+        """Add shared or separable branch.
+
+        convs -> avg pool (optional) -> fcs
+        """
+        last_layer_dim = in_channels
+        # add branch specific conv layers
+        branch_convs = nn.ModuleList()
+        if num_branch_convs > 0:
+            for i in range(num_branch_convs):
+                conv_in_channels = (
+                    last_layer_dim if i == 0 else self.conv_out_channels)
+                branch_convs.append(
+                    ConvModule(
+                        conv_in_channels,
+                        self.conv_out_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+            last_layer_dim = self.conv_out_channels
+        # add branch specific fc layers
+        branch_fcs = nn.ModuleList()
+        if num_branch_fcs > 0:
+            # for shared branch, only consider self.with_avg_pool
+            # for separated branches, also consider self.num_shared_fcs
+            if (is_shared
+                    or self.num_shared_fcs == 0) and not self.with_avg_pool:
+                last_layer_dim *= self.roi_feat_area
+            for i in range(num_branch_fcs):
+                fc_in_channels = (
+                    last_layer_dim if i == 0 else self.fc_out_channels)
+                branch_fcs.append(
+                    nn.Linear(fc_in_channels, self.fc_out_channels))
+            last_layer_dim = self.fc_out_channels
+        return branch_convs, branch_fcs, last_layer_dim
+
+    def forward(self, x: Tuple[Tensor]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores and bbox prediction.
+
+                - cls_score (Tensor): Classification scores for all \
+                    scale levels, each is a 4D-tensor, the channels number \
+                    is num_base_priors * num_classes.
+                - bbox_pred (Tensor): Box energies / deltas for all \
+                    scale levels, each is a 4D-tensor, the channels number \
+                    is num_base_priors * 4.
+        """
+        # shared part
+        if self.num_shared_convs > 0:
+            for conv in self.shared_convs:
+                x = conv(x)
+
+        if self.num_shared_fcs > 0:
+            if self.with_avg_pool:
+                x = self.avg_pool(x)
+
+            x = x.flatten(1)
+
+            for fc in self.shared_fcs:
+                x = self.relu(fc(x))
+        # separate branches
+        x_cls = x
+        x_reg = x
+
+        for conv in self.cls_convs:
+            x_cls = conv(x_cls)
+        if x_cls.dim() > 2:
+            if self.with_avg_pool:
+                x_cls = self.avg_pool(x_cls)
+            x_cls = x_cls.flatten(1)
+        for fc in self.cls_fcs:
+            x_cls = self.relu(fc(x_cls))
+
+        for conv in self.reg_convs:
+            x_reg = conv(x_reg)
+        if x_reg.dim() > 2:
+            if self.with_avg_pool:
+                x_reg = self.avg_pool(x_reg)
+            x_reg = x_reg.flatten(1)
+        for fc in self.reg_fcs:
+            x_reg = self.relu(fc(x_reg))
+
+        cls_score = self.fc_cls(x_cls) if self.with_cls else None
+        bbox_pred = self.fc_reg(x_reg) if self.with_reg else None
+        return cls_score, bbox_pred
+
+
+@MODELS.register_module()
+class Shared2FCBBoxHead(ConvFCBBoxHead):
+
+    def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None:
+        super().__init__(
+            num_shared_convs=0,
+            num_shared_fcs=2,
+            num_cls_convs=0,
+            num_cls_fcs=0,
+            num_reg_convs=0,
+            num_reg_fcs=0,
+            fc_out_channels=fc_out_channels,
+            *args,
+            **kwargs)
+
+
+@MODELS.register_module()
+class Shared4Conv1FCBBoxHead(ConvFCBBoxHead):
+
+    def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None:
+        super().__init__(
+            num_shared_convs=4,
+            num_shared_fcs=1,
+            num_cls_convs=0,
+            num_cls_fcs=0,
+            num_reg_convs=0,
+            num_reg_fcs=0,
+            fc_out_channels=fc_out_channels,
+            *args,
+            **kwargs)
diff --git a/mmde/mmdet/models/roi_heads/bbox_heads/dii_head.py b/mmde/mmdet/models/roi_heads/bbox_heads/dii_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae9a31bbeb2a8f1da62b457363fa05031d21925a
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/bbox_heads/dii_head.py
@@ -0,0 +1,422 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
+from mmengine.config import ConfigDict
+from mmengine.model import bias_init_with_prob
+from torch import Tensor
+
+from mmdet.models.losses import accuracy
+from mmdet.models.task_modules import SamplingResult
+from mmdet.models.utils import multi_apply
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, reduce_mean
+from .bbox_head import BBoxHead
+
+
+@MODELS.register_module()
+class DIIHead(BBoxHead):
+    r"""Dynamic Instance Interactive Head for `Sparse R-CNN: End-to-End Object
+    Detection with Learnable Proposals <https://arxiv.org/abs/2011.12450>`_
+
+    Args:
+        num_classes (int): Number of class in dataset.
+            Defaults to 80.
+        num_ffn_fcs (int): The number of fully-connected
+            layers in FFNs. Defaults to 2.
+        num_heads (int): The hidden dimension of FFNs.
+            Defaults to 8.
+        num_cls_fcs (int): The number of fully-connected
+            layers in classification subnet. Defaults to 1.
+        num_reg_fcs (int): The number of fully-connected
+            layers in regression subnet. Defaults to 3.
+        feedforward_channels (int): The hidden dimension
+            of FFNs. Defaults to 2048
+        in_channels (int): Hidden_channels of MultiheadAttention.
+            Defaults to 256.
+        dropout (float): Probability of drop the channel.
+            Defaults to 0.0
+        ffn_act_cfg (:obj:`ConfigDict` or dict): The activation config
+            for FFNs.
+        dynamic_conv_cfg (:obj:`ConfigDict` or dict): The convolution
+            config for DynamicConv.
+        loss_iou (:obj:`ConfigDict` or dict): The config for iou or
+            giou loss.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: int = 80,
+                 num_ffn_fcs: int = 2,
+                 num_heads: int = 8,
+                 num_cls_fcs: int = 1,
+                 num_reg_fcs: int = 3,
+                 feedforward_channels: int = 2048,
+                 in_channels: int = 256,
+                 dropout: float = 0.0,
+                 ffn_act_cfg: ConfigType = dict(type='ReLU', inplace=True),
+                 dynamic_conv_cfg: ConfigType = dict(
+                     type='DynamicConv',
+                     in_channels=256,
+                     feat_channels=64,
+                     out_channels=256,
+                     input_feat_shape=7,
+                     act_cfg=dict(type='ReLU', inplace=True),
+                     norm_cfg=dict(type='LN')),
+                 loss_iou: ConfigType = dict(type='GIoULoss', loss_weight=2.0),
+                 init_cfg: OptConfigType = None,
+                 **kwargs) -> None:
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super().__init__(
+            num_classes=num_classes,
+            reg_decoded_bbox=True,
+            reg_class_agnostic=True,
+            init_cfg=init_cfg,
+            **kwargs)
+        self.loss_iou = MODELS.build(loss_iou)
+        self.in_channels = in_channels
+        self.fp16_enabled = False
+        self.attention = MultiheadAttention(in_channels, num_heads, dropout)
+        self.attention_norm = build_norm_layer(dict(type='LN'), in_channels)[1]
+
+        self.instance_interactive_conv = MODELS.build(dynamic_conv_cfg)
+        self.instance_interactive_conv_dropout = nn.Dropout(dropout)
+        self.instance_interactive_conv_norm = build_norm_layer(
+            dict(type='LN'), in_channels)[1]
+
+        self.ffn = FFN(
+            in_channels,
+            feedforward_channels,
+            num_ffn_fcs,
+            act_cfg=ffn_act_cfg,
+            dropout=dropout)
+        self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]
+
+        self.cls_fcs = nn.ModuleList()
+        for _ in range(num_cls_fcs):
+            self.cls_fcs.append(
+                nn.Linear(in_channels, in_channels, bias=False))
+            self.cls_fcs.append(
+                build_norm_layer(dict(type='LN'), in_channels)[1])
+            self.cls_fcs.append(
+                build_activation_layer(dict(type='ReLU', inplace=True)))
+
+        # over load the self.fc_cls in BBoxHead
+        if self.loss_cls.use_sigmoid:
+            self.fc_cls = nn.Linear(in_channels, self.num_classes)
+        else:
+            self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)
+
+        self.reg_fcs = nn.ModuleList()
+        for _ in range(num_reg_fcs):
+            self.reg_fcs.append(
+                nn.Linear(in_channels, in_channels, bias=False))
+            self.reg_fcs.append(
+                build_norm_layer(dict(type='LN'), in_channels)[1])
+            self.reg_fcs.append(
+                build_activation_layer(dict(type='ReLU', inplace=True)))
+        # over load the self.fc_cls in BBoxHead
+        self.fc_reg = nn.Linear(in_channels, 4)
+
+        assert self.reg_class_agnostic, 'DIIHead only ' \
+            'suppport `reg_class_agnostic=True` '
+        assert self.reg_decoded_bbox, 'DIIHead only ' \
+            'suppport `reg_decoded_bbox=True`'
+
+    def init_weights(self) -> None:
+        """Use xavier initialization for all weight parameter and set
+        classification head bias as a specific value when use focal loss."""
+        super().init_weights()
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+            else:
+                # adopt the default initialization for
+                # the weight and bias of the layer norm
+                pass
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            nn.init.constant_(self.fc_cls.bias, bias_init)
+
+    def forward(self, roi_feat: Tensor, proposal_feat: Tensor) -> tuple:
+        """Forward function of Dynamic Instance Interactive Head.
+
+        Args:
+            roi_feat (Tensor): Roi-pooling features with shape
+                (batch_size*num_proposals, feature_dimensions,
+                pooling_h , pooling_w).
+            proposal_feat (Tensor): Intermediate feature get from
+                diihead in last stage, has shape
+                (batch_size, num_proposals, feature_dimensions)
+
+        Returns:
+            tuple[Tensor]: Usually a tuple of classification scores
+            and bbox prediction and a intermediate feature.
+
+            - cls_scores (Tensor): Classification scores for
+              all proposals, has shape
+              (batch_size, num_proposals, num_classes).
+            - bbox_preds (Tensor): Box energies / deltas for
+              all proposals, has shape
+              (batch_size, num_proposals, 4).
+            - obj_feat (Tensor): Object feature before classification
+              and regression subnet, has shape
+              (batch_size, num_proposal, feature_dimensions).
+            - attn_feats (Tensor): Intermediate feature.
+        """
+        N, num_proposals = proposal_feat.shape[:2]
+
+        # Self attention
+        proposal_feat = proposal_feat.permute(1, 0, 2)
+        proposal_feat = self.attention_norm(self.attention(proposal_feat))
+        attn_feats = proposal_feat.permute(1, 0, 2)
+
+        # instance interactive
+        proposal_feat = attn_feats.reshape(-1, self.in_channels)
+        proposal_feat_iic = self.instance_interactive_conv(
+            proposal_feat, roi_feat)
+        proposal_feat = proposal_feat + self.instance_interactive_conv_dropout(
+            proposal_feat_iic)
+        obj_feat = self.instance_interactive_conv_norm(proposal_feat)
+
+        # FFN
+        obj_feat = self.ffn_norm(self.ffn(obj_feat))
+
+        cls_feat = obj_feat
+        reg_feat = obj_feat
+
+        for cls_layer in self.cls_fcs:
+            cls_feat = cls_layer(cls_feat)
+        for reg_layer in self.reg_fcs:
+            reg_feat = reg_layer(reg_feat)
+
+        cls_score = self.fc_cls(cls_feat).view(
+            N, num_proposals, self.num_classes
+            if self.loss_cls.use_sigmoid else self.num_classes + 1)
+        bbox_delta = self.fc_reg(reg_feat).view(N, num_proposals, 4)
+
+        return cls_score, bbox_delta, obj_feat.view(
+            N, num_proposals, self.in_channels), attn_feats
+
+    def loss_and_target(self,
+                        cls_score: Tensor,
+                        bbox_pred: Tensor,
+                        sampling_results: List[SamplingResult],
+                        rcnn_train_cfg: ConfigType,
+                        imgs_whwh: Tensor,
+                        concat: bool = True,
+                        reduction_override: str = None) -> dict:
+        """Calculate the loss based on the features extracted by the DIIHead.
+
+        Args:
+            cls_score (Tensor): Classification prediction
+                results of all class, has shape
+                (batch_size * num_proposals_single_image, num_classes)
+            bbox_pred (Tensor): Regression prediction results, has shape
+                (batch_size * num_proposals_single_image, 4), the last
+                dimension 4 represents [tl_x, tl_y, br_x, br_y].
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+            imgs_whwh (Tensor): imgs_whwh (Tensor): Tensor with\
+                shape (batch_size, num_proposals, 4), the last
+                dimension means
+                [img_width,img_height, img_width, img_height].
+            concat (bool): Whether to concatenate the results of all
+                the images in a single batch. Defaults to True.
+            reduction_override (str, optional): The reduction
+                method used to override the original reduction
+                method of the loss. Options are "none",
+                "mean" and "sum". Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss and targets components.
+            The targets are only used for cascade rcnn.
+        """
+        cls_reg_targets = self.get_targets(
+            sampling_results=sampling_results,
+            rcnn_train_cfg=rcnn_train_cfg,
+            concat=concat)
+        (labels, label_weights, bbox_targets, bbox_weights) = cls_reg_targets
+
+        losses = dict()
+        bg_class_ind = self.num_classes
+        # note in spare rcnn num_gt == num_pos
+        pos_inds = (labels >= 0) & (labels < bg_class_ind)
+        num_pos = pos_inds.sum().float()
+        avg_factor = reduce_mean(num_pos)
+        if cls_score is not None:
+            if cls_score.numel() > 0:
+                losses['loss_cls'] = self.loss_cls(
+                    cls_score,
+                    labels,
+                    label_weights,
+                    avg_factor=avg_factor,
+                    reduction_override=reduction_override)
+                losses['pos_acc'] = accuracy(cls_score[pos_inds],
+                                             labels[pos_inds])
+        if bbox_pred is not None:
+            # 0~self.num_classes-1 are FG, self.num_classes is BG
+            # do not perform bounding box regression for BG anymore.
+            if pos_inds.any():
+                pos_bbox_pred = bbox_pred.reshape(bbox_pred.size(0),
+                                                  4)[pos_inds.type(torch.bool)]
+                imgs_whwh = imgs_whwh.reshape(bbox_pred.size(0),
+                                              4)[pos_inds.type(torch.bool)]
+                losses['loss_bbox'] = self.loss_bbox(
+                    pos_bbox_pred / imgs_whwh,
+                    bbox_targets[pos_inds.type(torch.bool)] / imgs_whwh,
+                    bbox_weights[pos_inds.type(torch.bool)],
+                    avg_factor=avg_factor)
+                losses['loss_iou'] = self.loss_iou(
+                    pos_bbox_pred,
+                    bbox_targets[pos_inds.type(torch.bool)],
+                    bbox_weights[pos_inds.type(torch.bool)],
+                    avg_factor=avg_factor)
+            else:
+                losses['loss_bbox'] = bbox_pred.sum() * 0
+                losses['loss_iou'] = bbox_pred.sum() * 0
+        return dict(loss_bbox=losses, bbox_targets=cls_reg_targets)
+
+    def _get_targets_single(self, pos_inds: Tensor, neg_inds: Tensor,
+                            pos_priors: Tensor, neg_priors: Tensor,
+                            pos_gt_bboxes: Tensor, pos_gt_labels: Tensor,
+                            cfg: ConfigDict) -> tuple:
+        """Calculate the ground truth for proposals in the single image
+        according to the sampling results.
+
+        Almost the same as the implementation in `bbox_head`,
+        we add pos_inds and neg_inds to select positive and
+        negative samples instead of selecting the first num_pos
+        as positive samples.
+
+        Args:
+            pos_inds (Tensor): The length is equal to the
+                positive sample numbers contain all index
+                of the positive sample in the origin proposal set.
+            neg_inds (Tensor): The length is equal to the
+                negative sample numbers contain all index
+                of the negative sample in the origin proposal set.
+            pos_priors (Tensor): Contains all the positive boxes,
+                has shape (num_pos, 4), the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            neg_priors (Tensor): Contains all the negative boxes,
+                has shape (num_neg, 4), the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            pos_gt_bboxes (Tensor): Contains gt_boxes for
+                all positive samples, has shape (num_pos, 4),
+                the last dimension 4
+                represents [tl_x, tl_y, br_x, br_y].
+            pos_gt_labels (Tensor): Contains gt_labels for
+                all positive samples, has shape (num_pos, ).
+            cfg (obj:`ConfigDict`): `train_cfg` of R-CNN.
+
+        Returns:
+            Tuple[Tensor]: Ground truth for proposals in a single image.
+            Containing the following Tensors:
+
+            - labels(Tensor): Gt_labels for all proposals, has
+              shape (num_proposals,).
+            - label_weights(Tensor): Labels_weights for all proposals, has
+              shape (num_proposals,).
+            - bbox_targets(Tensor):Regression target for all proposals, has
+              shape (num_proposals, 4), the last dimension 4
+              represents [tl_x, tl_y, br_x, br_y].
+            - bbox_weights(Tensor):Regression weights for all proposals,
+              has shape (num_proposals, 4).
+        """
+        num_pos = pos_priors.size(0)
+        num_neg = neg_priors.size(0)
+        num_samples = num_pos + num_neg
+
+        # original implementation uses new_zeros since BG are set to be 0
+        # now use empty & fill because BG cat_id = num_classes,
+        # FG cat_id = [0, num_classes-1]
+        labels = pos_priors.new_full((num_samples, ),
+                                     self.num_classes,
+                                     dtype=torch.long)
+        label_weights = pos_priors.new_zeros(num_samples)
+        bbox_targets = pos_priors.new_zeros(num_samples, 4)
+        bbox_weights = pos_priors.new_zeros(num_samples, 4)
+        if num_pos > 0:
+            labels[pos_inds] = pos_gt_labels
+            pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
+            label_weights[pos_inds] = pos_weight
+            if not self.reg_decoded_bbox:
+                pos_bbox_targets = self.bbox_coder.encode(
+                    pos_priors, pos_gt_bboxes)
+            else:
+                pos_bbox_targets = pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1
+        if num_neg > 0:
+            label_weights[neg_inds] = 1.0
+
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    def get_targets(self,
+                    sampling_results: List[SamplingResult],
+                    rcnn_train_cfg: ConfigDict,
+                    concat: bool = True) -> tuple:
+        """Calculate the ground truth for all samples in a batch according to
+        the sampling_results.
+
+        Almost the same as the implementation in bbox_head, we passed
+        additional parameters pos_inds_list and neg_inds_list to
+        `_get_targets_single` function.
+
+        Args:
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+            concat (bool): Whether to concatenate the results of all
+                the images in a single batch.
+
+        Returns:
+            Tuple[Tensor]: Ground truth for proposals in a single image.
+            Containing the following list of Tensors:
+
+            - labels (list[Tensor],Tensor): Gt_labels for all
+              proposals in a batch, each tensor in list has
+              shape (num_proposals,) when `concat=False`, otherwise just
+              a single tensor has shape (num_all_proposals,).
+            - label_weights (list[Tensor]): Labels_weights for
+              all proposals in a batch, each tensor in list has shape
+              (num_proposals,) when `concat=False`, otherwise just a
+              single tensor has shape (num_all_proposals,).
+            - bbox_targets (list[Tensor],Tensor): Regression target
+              for all proposals in a batch, each tensor in list has
+              shape (num_proposals, 4) when `concat=False`, otherwise
+              just a single tensor has shape (num_all_proposals, 4),
+              the last dimension 4 represents [tl_x, tl_y, br_x, br_y].
+            - bbox_weights (list[tensor],Tensor): Regression weights for
+              all proposals in a batch, each tensor in list has shape
+              (num_proposals, 4) when `concat=False`, otherwise just a
+              single tensor has shape (num_all_proposals, 4).
+        """
+        pos_inds_list = [res.pos_inds for res in sampling_results]
+        neg_inds_list = [res.neg_inds for res in sampling_results]
+        pos_priors_list = [res.pos_priors for res in sampling_results]
+        neg_priors_list = [res.neg_priors for res in sampling_results]
+        pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
+        pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results]
+        labels, label_weights, bbox_targets, bbox_weights = multi_apply(
+            self._get_targets_single,
+            pos_inds_list,
+            neg_inds_list,
+            pos_priors_list,
+            neg_priors_list,
+            pos_gt_bboxes_list,
+            pos_gt_labels_list,
+            cfg=rcnn_train_cfg)
+        if concat:
+            labels = torch.cat(labels, 0)
+            label_weights = torch.cat(label_weights, 0)
+            bbox_targets = torch.cat(bbox_targets, 0)
+            bbox_weights = torch.cat(bbox_weights, 0)
+        return labels, label_weights, bbox_targets, bbox_weights
diff --git a/mmde/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py b/mmde/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..076c35843375c7aef5e58786d55ebacd281d54a3
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py
@@ -0,0 +1,199 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule, ModuleList
+from torch import Tensor
+
+from mmdet.models.backbones.resnet import Bottleneck
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, MultiConfig, OptConfigType, OptMultiConfig
+from .bbox_head import BBoxHead
+
+
+class BasicResBlock(BaseModule):
+    """Basic residual block.
+
+    This block is a little different from the block in the ResNet backbone.
+    The kernel size of conv1 is 1 in this block while 3 in ResNet BasicBlock.
+
+    Args:
+        in_channels (int): Channels of the input feature map.
+        out_channels (int): Channels of the output feature map.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): The config dict
+            for convolution layers.
+        norm_cfg (:obj:`ConfigDict` or dict): The config dict for
+            normalization layers.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict. Defaults to None
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+
+        # main path
+        self.conv1 = ConvModule(
+            in_channels,
+            in_channels,
+            kernel_size=3,
+            padding=1,
+            bias=False,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg)
+        self.conv2 = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+        # identity path
+        self.conv_identity = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function."""
+        identity = x
+
+        x = self.conv1(x)
+        x = self.conv2(x)
+
+        identity = self.conv_identity(identity)
+        out = x + identity
+
+        out = self.relu(out)
+        return out
+
+
+@MODELS.register_module()
+class DoubleConvFCBBoxHead(BBoxHead):
+    r"""Bbox head used in Double-Head R-CNN
+
+    .. code-block:: none
+
+                                          /-> cls
+                      /-> shared convs ->
+                                          \-> reg
+        roi features
+                                          /-> cls
+                      \-> shared fc    ->
+                                          \-> reg
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_convs: int = 0,
+                 num_fcs: int = 0,
+                 conv_out_channels: int = 1024,
+                 fc_out_channels: int = 1024,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 init_cfg: MultiConfig = dict(
+                     type='Normal',
+                     override=[
+                         dict(type='Normal', name='fc_cls', std=0.01),
+                         dict(type='Normal', name='fc_reg', std=0.001),
+                         dict(
+                             type='Xavier',
+                             name='fc_branch',
+                             distribution='uniform')
+                     ]),
+                 **kwargs) -> None:
+        kwargs.setdefault('with_avg_pool', True)
+        super().__init__(init_cfg=init_cfg, **kwargs)
+        assert self.with_avg_pool
+        assert num_convs > 0
+        assert num_fcs > 0
+        self.num_convs = num_convs
+        self.num_fcs = num_fcs
+        self.conv_out_channels = conv_out_channels
+        self.fc_out_channels = fc_out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        # increase the channel of input features
+        self.res_block = BasicResBlock(self.in_channels,
+                                       self.conv_out_channels)
+
+        # add conv heads
+        self.conv_branch = self._add_conv_branch()
+        # add fc heads
+        self.fc_branch = self._add_fc_branch()
+
+        out_dim_reg = 4 if self.reg_class_agnostic else 4 * self.num_classes
+        self.fc_reg = nn.Linear(self.conv_out_channels, out_dim_reg)
+
+        self.fc_cls = nn.Linear(self.fc_out_channels, self.num_classes + 1)
+        self.relu = nn.ReLU()
+
+    def _add_conv_branch(self) -> None:
+        """Add the fc branch which consists of a sequential of conv layers."""
+        branch_convs = ModuleList()
+        for i in range(self.num_convs):
+            branch_convs.append(
+                Bottleneck(
+                    inplanes=self.conv_out_channels,
+                    planes=self.conv_out_channels // 4,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        return branch_convs
+
+    def _add_fc_branch(self) -> None:
+        """Add the fc branch which consists of a sequential of fc layers."""
+        branch_fcs = ModuleList()
+        for i in range(self.num_fcs):
+            fc_in_channels = (
+                self.in_channels *
+                self.roi_feat_area if i == 0 else self.fc_out_channels)
+            branch_fcs.append(nn.Linear(fc_in_channels, self.fc_out_channels))
+        return branch_fcs
+
+    def forward(self, x_cls: Tensor, x_reg: Tensor) -> Tuple[Tensor]:
+        """Forward features from the upstream network.
+
+        Args:
+            x_cls (Tensor): Classification features of rois
+            x_reg (Tensor): Regression features from the upstream network.
+
+        Returns:
+            tuple: A tuple of classification scores and bbox prediction.
+
+                - cls_score (Tensor): Classification score predictions of rois.
+                  each roi predicts num_classes + 1 channels.
+                - bbox_pred (Tensor): BBox deltas predictions of rois. each roi
+                  predicts 4 * num_classes channels.
+        """
+        # conv head
+        x_conv = self.res_block(x_reg)
+
+        for conv in self.conv_branch:
+            x_conv = conv(x_conv)
+
+        if self.with_avg_pool:
+            x_conv = self.avg_pool(x_conv)
+
+        x_conv = x_conv.view(x_conv.size(0), -1)
+        bbox_pred = self.fc_reg(x_conv)
+
+        # fc head
+        x_fc = x_cls.view(x_cls.size(0), -1)
+        for fc in self.fc_branch:
+            x_fc = self.relu(fc(x_fc))
+
+        cls_score = self.fc_cls(x_fc)
+
+        return cls_score, bbox_pred
diff --git a/mmde/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py b/mmde/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..38e57d2eddd580b13256da63c9bd8723be98e764
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py
@@ -0,0 +1,626 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor, nn
+
+from mmdet.models.roi_heads.bbox_heads.bbox_head import BBoxHead
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.models.utils import empty_instances
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox_overlaps
+
+
+@MODELS.register_module()
+class MultiInstanceBBoxHead(BBoxHead):
+    r"""Bbox head used in CrowdDet.
+
+    .. code-block:: none
+
+                                      /-> cls convs_1 -> cls fcs_1 -> cls_1
+                                   |--
+                                   |  \-> reg convs_1 -> reg fcs_1 -> reg_1
+                                   |
+                                   |  /-> cls convs_2 -> cls fcs_2 -> cls_2
+        shared convs -> shared fcs |--
+                                   |  \-> reg convs_2 -> reg fcs_2 -> reg_2
+                                   |
+                                   |                     ...
+                                   |
+                                   |  /-> cls convs_k -> cls fcs_k -> cls_k
+                                   |--
+                                      \-> reg convs_k -> reg fcs_k -> reg_k
+
+
+    Args:
+        num_instance (int): The number of branches after shared fcs.
+            Defaults to 2.
+        with_refine (bool): Whether to use refine module. Defaults to False.
+        num_shared_convs (int): The number of shared convs. Defaults to 0.
+        num_shared_fcs (int): The number of shared fcs. Defaults to 2.
+        num_cls_convs (int): The number of cls convs. Defaults to 0.
+        num_cls_fcs (int): The number of cls fcs. Defaults to 0.
+        num_reg_convs (int): The number of reg convs. Defaults to 0.
+        num_reg_fcs (int): The number of reg fcs. Defaults to 0.
+        conv_out_channels (int): The number of conv out channels.
+            Defaults to 256.
+        fc_out_channels (int): The number of fc out channels. Defaults to 1024.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """  # noqa: W605
+
+    def __init__(self,
+                 num_instance: int = 2,
+                 with_refine: bool = False,
+                 num_shared_convs: int = 0,
+                 num_shared_fcs: int = 2,
+                 num_cls_convs: int = 0,
+                 num_cls_fcs: int = 0,
+                 num_reg_convs: int = 0,
+                 num_reg_fcs: int = 0,
+                 conv_out_channels: int = 256,
+                 fc_out_channels: int = 1024,
+                 init_cfg: Optional[Union[dict, ConfigDict]] = None,
+                 *args,
+                 **kwargs) -> None:
+        super().__init__(*args, init_cfg=init_cfg, **kwargs)
+        assert (num_shared_convs + num_shared_fcs + num_cls_convs +
+                num_cls_fcs + num_reg_convs + num_reg_fcs > 0)
+        assert num_instance == 2, 'Currently only 2 instances are supported'
+        if num_cls_convs > 0 or num_reg_convs > 0:
+            assert num_shared_fcs == 0
+        if not self.with_cls:
+            assert num_cls_convs == 0 and num_cls_fcs == 0
+        if not self.with_reg:
+            assert num_reg_convs == 0 and num_reg_fcs == 0
+        self.num_instance = num_instance
+        self.num_shared_convs = num_shared_convs
+        self.num_shared_fcs = num_shared_fcs
+        self.num_cls_convs = num_cls_convs
+        self.num_cls_fcs = num_cls_fcs
+        self.num_reg_convs = num_reg_convs
+        self.num_reg_fcs = num_reg_fcs
+        self.conv_out_channels = conv_out_channels
+        self.fc_out_channels = fc_out_channels
+        self.with_refine = with_refine
+
+        # add shared convs and fcs
+        self.shared_convs, self.shared_fcs, last_layer_dim = \
+            self._add_conv_fc_branch(
+                self.num_shared_convs, self.num_shared_fcs, self.in_channels,
+                True)
+        self.shared_out_channels = last_layer_dim
+        self.relu = nn.ReLU(inplace=True)
+
+        if self.with_refine:
+            refine_model_cfg = {
+                'type': 'Linear',
+                'in_features': self.shared_out_channels + 20,
+                'out_features': self.shared_out_channels
+            }
+            self.shared_fcs_ref = MODELS.build(refine_model_cfg)
+            self.fc_cls_ref = nn.ModuleList()
+            self.fc_reg_ref = nn.ModuleList()
+
+        self.cls_convs = nn.ModuleList()
+        self.cls_fcs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        self.reg_fcs = nn.ModuleList()
+        self.cls_last_dim = list()
+        self.reg_last_dim = list()
+        self.fc_cls = nn.ModuleList()
+        self.fc_reg = nn.ModuleList()
+        for k in range(self.num_instance):
+            # add cls specific branch
+            cls_convs, cls_fcs, cls_last_dim = self._add_conv_fc_branch(
+                self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels)
+            self.cls_convs.append(cls_convs)
+            self.cls_fcs.append(cls_fcs)
+            self.cls_last_dim.append(cls_last_dim)
+
+            # add reg specific branch
+            reg_convs, reg_fcs, reg_last_dim = self._add_conv_fc_branch(
+                self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels)
+            self.reg_convs.append(reg_convs)
+            self.reg_fcs.append(reg_fcs)
+            self.reg_last_dim.append(reg_last_dim)
+
+            if self.num_shared_fcs == 0 and not self.with_avg_pool:
+                if self.num_cls_fcs == 0:
+                    self.cls_last_dim *= self.roi_feat_area
+                if self.num_reg_fcs == 0:
+                    self.reg_last_dim *= self.roi_feat_area
+
+            if self.with_cls:
+                if self.custom_cls_channels:
+                    cls_channels = self.loss_cls.get_cls_channels(
+                        self.num_classes)
+                else:
+                    cls_channels = self.num_classes + 1
+                cls_predictor_cfg_ = self.cls_predictor_cfg.copy()  # deepcopy
+                cls_predictor_cfg_.update(
+                    in_features=self.cls_last_dim[k],
+                    out_features=cls_channels)
+                self.fc_cls.append(MODELS.build(cls_predictor_cfg_))
+                if self.with_refine:
+                    self.fc_cls_ref.append(MODELS.build(cls_predictor_cfg_))
+
+            if self.with_reg:
+                out_dim_reg = (4 if self.reg_class_agnostic else 4 *
+                               self.num_classes)
+                reg_predictor_cfg_ = self.reg_predictor_cfg.copy()
+                reg_predictor_cfg_.update(
+                    in_features=self.reg_last_dim[k], out_features=out_dim_reg)
+                self.fc_reg.append(MODELS.build(reg_predictor_cfg_))
+                if self.with_refine:
+                    self.fc_reg_ref.append(MODELS.build(reg_predictor_cfg_))
+
+        if init_cfg is None:
+            # when init_cfg is None,
+            # It has been set to
+            # [[dict(type='Normal', std=0.01, override=dict(name='fc_cls'))],
+            #  [dict(type='Normal', std=0.001, override=dict(name='fc_reg'))]
+            # after `super(ConvFCBBoxHead, self).__init__()`
+            # we only need to append additional configuration
+            # for `shared_fcs`, `cls_fcs` and `reg_fcs`
+            self.init_cfg += [
+                dict(
+                    type='Xavier',
+                    distribution='uniform',
+                    override=[
+                        dict(name='shared_fcs'),
+                        dict(name='cls_fcs'),
+                        dict(name='reg_fcs')
+                    ])
+            ]
+
+    def _add_conv_fc_branch(self,
+                            num_branch_convs: int,
+                            num_branch_fcs: int,
+                            in_channels: int,
+                            is_shared: bool = False) -> tuple:
+        """Add shared or separable branch.
+
+        convs -> avg pool (optional) -> fcs
+        """
+        last_layer_dim = in_channels
+        # add branch specific conv layers
+        branch_convs = nn.ModuleList()
+        if num_branch_convs > 0:
+            for i in range(num_branch_convs):
+                conv_in_channels = (
+                    last_layer_dim if i == 0 else self.conv_out_channels)
+                branch_convs.append(
+                    ConvModule(
+                        conv_in_channels, self.conv_out_channels, 3,
+                        padding=1))
+            last_layer_dim = self.conv_out_channels
+        # add branch specific fc layers
+        branch_fcs = nn.ModuleList()
+        if num_branch_fcs > 0:
+            # for shared branch, only consider self.with_avg_pool
+            # for separated branches, also consider self.num_shared_fcs
+            if (is_shared
+                    or self.num_shared_fcs == 0) and not self.with_avg_pool:
+                last_layer_dim *= self.roi_feat_area
+            for i in range(num_branch_fcs):
+                fc_in_channels = (
+                    last_layer_dim if i == 0 else self.fc_out_channels)
+                branch_fcs.append(
+                    nn.Linear(fc_in_channels, self.fc_out_channels))
+            last_layer_dim = self.fc_out_channels
+        return branch_convs, branch_fcs, last_layer_dim
+
+    def forward(self, x: Tuple[Tensor]) -> tuple:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores and bbox prediction.
+
+                - cls_score (Tensor): Classification scores for all scale
+                  levels, each is a 4D-tensor, the channels number is
+                  num_base_priors * num_classes.
+                - bbox_pred (Tensor): Box energies / deltas for all scale
+                  levels, each is a 4D-tensor, the channels number is
+                  num_base_priors * 4.
+                - cls_score_ref (Tensor): The cls_score after refine model.
+                - bbox_pred_ref (Tensor): The bbox_pred after refine model.
+        """
+        # shared part
+        if self.num_shared_convs > 0:
+            for conv in self.shared_convs:
+                x = conv(x)
+
+        if self.num_shared_fcs > 0:
+            if self.with_avg_pool:
+                x = self.avg_pool(x)
+
+            x = x.flatten(1)
+            for fc in self.shared_fcs:
+                x = self.relu(fc(x))
+
+        x_cls = x
+        x_reg = x
+        # separate branches
+        cls_score = list()
+        bbox_pred = list()
+        for k in range(self.num_instance):
+            for conv in self.cls_convs[k]:
+                x_cls = conv(x_cls)
+            if x_cls.dim() > 2:
+                if self.with_avg_pool:
+                    x_cls = self.avg_pool(x_cls)
+                x_cls = x_cls.flatten(1)
+            for fc in self.cls_fcs[k]:
+                x_cls = self.relu(fc(x_cls))
+
+            for conv in self.reg_convs[k]:
+                x_reg = conv(x_reg)
+            if x_reg.dim() > 2:
+                if self.with_avg_pool:
+                    x_reg = self.avg_pool(x_reg)
+                x_reg = x_reg.flatten(1)
+            for fc in self.reg_fcs[k]:
+                x_reg = self.relu(fc(x_reg))
+
+            cls_score.append(self.fc_cls[k](x_cls) if self.with_cls else None)
+            bbox_pred.append(self.fc_reg[k](x_reg) if self.with_reg else None)
+
+        if self.with_refine:
+            x_ref = x
+            cls_score_ref = list()
+            bbox_pred_ref = list()
+            for k in range(self.num_instance):
+                feat_ref = cls_score[k].softmax(dim=-1)
+                feat_ref = torch.cat((bbox_pred[k], feat_ref[:, 1][:, None]),
+                                     dim=1).repeat(1, 4)
+                feat_ref = torch.cat((x_ref, feat_ref), dim=1)
+                feat_ref = F.relu_(self.shared_fcs_ref(feat_ref))
+
+                cls_score_ref.append(self.fc_cls_ref[k](feat_ref))
+                bbox_pred_ref.append(self.fc_reg_ref[k](feat_ref))
+
+            cls_score = torch.cat(cls_score, dim=1)
+            bbox_pred = torch.cat(bbox_pred, dim=1)
+            cls_score_ref = torch.cat(cls_score_ref, dim=1)
+            bbox_pred_ref = torch.cat(bbox_pred_ref, dim=1)
+            return cls_score, bbox_pred, cls_score_ref, bbox_pred_ref
+
+        cls_score = torch.cat(cls_score, dim=1)
+        bbox_pred = torch.cat(bbox_pred, dim=1)
+
+        return cls_score, bbox_pred
+
+    def get_targets(self,
+                    sampling_results: List[SamplingResult],
+                    rcnn_train_cfg: ConfigDict,
+                    concat: bool = True) -> tuple:
+        """Calculate the ground truth for all samples in a batch according to
+        the sampling_results.
+
+        Almost the same as the implementation in bbox_head, we passed
+        additional parameters pos_inds_list and neg_inds_list to
+        `_get_targets_single` function.
+
+        Args:
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+            concat (bool): Whether to concatenate the results of all
+                the images in a single batch.
+
+        Returns:
+            Tuple[Tensor]: Ground truth for proposals in a single image.
+            Containing the following list of Tensors:
+
+            - labels (list[Tensor],Tensor): Gt_labels for all proposals in a
+              batch, each tensor in list has shape (num_proposals,) when
+              `concat=False`, otherwise just a single tensor has shape
+              (num_all_proposals,).
+            - label_weights (list[Tensor]): Labels_weights for
+              all proposals in a batch, each tensor in list has shape
+              (num_proposals,) when `concat=False`, otherwise just a single
+              tensor has shape (num_all_proposals,).
+            - bbox_targets (list[Tensor],Tensor): Regression target for all
+              proposals in a batch, each tensor in list has shape
+              (num_proposals, 4) when `concat=False`, otherwise just a single
+              tensor has shape (num_all_proposals, 4), the last dimension 4
+              represents [tl_x, tl_y, br_x, br_y].
+            - bbox_weights (list[tensor],Tensor): Regression weights for
+              all proposals in a batch, each tensor in list has shape
+              (num_proposals, 4) when `concat=False`, otherwise just a
+              single tensor has shape (num_all_proposals, 4).
+        """
+        labels = []
+        bbox_targets = []
+        bbox_weights = []
+        label_weights = []
+        for i in range(len(sampling_results)):
+            sample_bboxes = torch.cat([
+                sampling_results[i].pos_gt_bboxes,
+                sampling_results[i].neg_gt_bboxes
+            ])
+            sample_priors = sampling_results[i].priors
+            sample_priors = sample_priors.repeat(1, self.num_instance).reshape(
+                -1, 4)
+            sample_bboxes = sample_bboxes.reshape(-1, 4)
+
+            if not self.reg_decoded_bbox:
+                _bbox_targets = self.bbox_coder.encode(sample_priors,
+                                                       sample_bboxes)
+            else:
+                _bbox_targets = sample_priors
+            _bbox_targets = _bbox_targets.reshape(-1, self.num_instance * 4)
+            _bbox_weights = torch.ones(_bbox_targets.shape)
+            _labels = torch.cat([
+                sampling_results[i].pos_gt_labels,
+                sampling_results[i].neg_gt_labels
+            ])
+            _labels_weights = torch.ones(_labels.shape)
+
+            bbox_targets.append(_bbox_targets)
+            bbox_weights.append(_bbox_weights)
+            labels.append(_labels)
+            label_weights.append(_labels_weights)
+
+        if concat:
+            labels = torch.cat(labels, 0)
+            label_weights = torch.cat(label_weights, 0)
+            bbox_targets = torch.cat(bbox_targets, 0)
+            bbox_weights = torch.cat(bbox_weights, 0)
+        return labels, label_weights, bbox_targets, bbox_weights
+
+    def loss(self, cls_score: Tensor, bbox_pred: Tensor, rois: Tensor,
+             labels: Tensor, label_weights: Tensor, bbox_targets: Tensor,
+             bbox_weights: Tensor, **kwargs) -> dict:
+        """Calculate the loss based on the network predictions and targets.
+
+        Args:
+            cls_score (Tensor): Classification prediction results of all class,
+                has shape (batch_size * num_proposals_single_image,
+                (num_classes + 1) * k), k represents the number of prediction
+                boxes generated by each proposal box.
+            bbox_pred (Tensor): Regression prediction results, has shape
+                (batch_size * num_proposals_single_image, 4 * k), the last
+                dimension 4 represents [tl_x, tl_y, br_x, br_y].
+            rois (Tensor): RoIs with the shape
+                (batch_size * num_proposals_single_image, 5) where the first
+                column indicates batch id of each RoI.
+            labels (Tensor): Gt_labels for all proposals in a batch, has
+                shape (batch_size * num_proposals_single_image, k).
+            label_weights (Tensor): Labels_weights for all proposals in a
+                batch, has shape (batch_size * num_proposals_single_image, k).
+            bbox_targets (Tensor): Regression target for all proposals in a
+                batch, has shape (batch_size * num_proposals_single_image,
+                4 * k), the last dimension 4 represents [tl_x, tl_y, br_x,
+                br_y].
+            bbox_weights (Tensor): Regression weights for all proposals in a
+                batch, has shape (batch_size * num_proposals_single_image,
+                4 * k).
+
+        Returns:
+            dict: A dictionary of loss.
+        """
+        losses = dict()
+        if bbox_pred.numel():
+            loss_0 = self.emd_loss(bbox_pred[:, 0:4], cls_score[:, 0:2],
+                                   bbox_pred[:, 4:8], cls_score[:, 2:4],
+                                   bbox_targets, labels)
+            loss_1 = self.emd_loss(bbox_pred[:, 4:8], cls_score[:, 2:4],
+                                   bbox_pred[:, 0:4], cls_score[:, 0:2],
+                                   bbox_targets, labels)
+            loss = torch.cat([loss_0, loss_1], dim=1)
+            _, min_indices = loss.min(dim=1)
+            loss_emd = loss[torch.arange(loss.shape[0]), min_indices]
+            loss_emd = loss_emd.mean()
+        else:
+            loss_emd = bbox_pred.sum()
+        losses['loss_rcnn_emd'] = loss_emd
+        return losses
+
+    def emd_loss(self, bbox_pred_0: Tensor, cls_score_0: Tensor,
+                 bbox_pred_1: Tensor, cls_score_1: Tensor, targets: Tensor,
+                 labels: Tensor) -> Tensor:
+        """Calculate the emd loss.
+
+        Note:
+            This implementation is modified from https://github.com/Purkialo/
+            CrowdDet/blob/master/lib/det_oprs/loss_opr.py
+
+        Args:
+            bbox_pred_0 (Tensor): Part of regression prediction results, has
+                shape (batch_size * num_proposals_single_image, 4), the last
+                dimension 4 represents [tl_x, tl_y, br_x, br_y].
+            cls_score_0 (Tensor): Part of classification prediction results,
+                has shape (batch_size * num_proposals_single_image,
+                (num_classes + 1)), where 1 represents the background.
+            bbox_pred_1 (Tensor): The other part of regression prediction
+                results, has shape (batch_size*num_proposals_single_image, 4).
+            cls_score_1 (Tensor):The other part of classification prediction
+                results, has shape (batch_size * num_proposals_single_image,
+                (num_classes + 1)).
+            targets (Tensor):Regression target for all proposals in a
+                batch, has shape (batch_size * num_proposals_single_image,
+                4 * k), the last dimension 4 represents [tl_x, tl_y, br_x,
+                br_y], k represents the number of prediction boxes generated
+                by each proposal box.
+            labels (Tensor): Gt_labels for all proposals in a batch, has
+                shape (batch_size * num_proposals_single_image, k).
+
+        Returns:
+            torch.Tensor: The calculated loss.
+        """
+
+        bbox_pred = torch.cat([bbox_pred_0, bbox_pred_1],
+                              dim=1).reshape(-1, bbox_pred_0.shape[-1])
+        cls_score = torch.cat([cls_score_0, cls_score_1],
+                              dim=1).reshape(-1, cls_score_0.shape[-1])
+        targets = targets.reshape(-1, 4)
+        labels = labels.long().flatten()
+
+        # masks
+        valid_masks = labels >= 0
+        fg_masks = labels > 0
+
+        # multiple class
+        bbox_pred = bbox_pred.reshape(-1, self.num_classes, 4)
+        fg_gt_classes = labels[fg_masks]
+        bbox_pred = bbox_pred[fg_masks, fg_gt_classes - 1, :]
+
+        # loss for regression
+        loss_bbox = self.loss_bbox(bbox_pred, targets[fg_masks])
+        loss_bbox = loss_bbox.sum(dim=1)
+
+        # loss for classification
+        labels = labels * valid_masks
+        loss_cls = self.loss_cls(cls_score, labels)
+
+        loss_cls[fg_masks] = loss_cls[fg_masks] + loss_bbox
+        loss = loss_cls.reshape(-1, 2).sum(dim=1)
+        return loss.reshape(-1, 1)
+
+    def _predict_by_feat_single(
+            self,
+            roi: Tensor,
+            cls_score: Tensor,
+            bbox_pred: Tensor,
+            img_meta: dict,
+            rescale: bool = False,
+            rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5).
+                last dimension 5 arrange as (batch_index, x1, y1, x2, y2).
+            cls_score (Tensor): Box scores, has shape
+                (num_boxes, num_classes + 1).
+            bbox_pred (Tensor): Box energies / deltas. has shape
+                (num_boxes, num_classes * 4).
+            img_meta (dict): image information.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head.
+                Defaults to None
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+
+        cls_score = cls_score.reshape(-1, self.num_classes + 1)
+        bbox_pred = bbox_pred.reshape(-1, 4)
+        roi = roi.repeat_interleave(self.num_instance, dim=0)
+
+        results = InstanceData()
+        if roi.shape[0] == 0:
+            return empty_instances([img_meta],
+                                   roi.device,
+                                   task_type='bbox',
+                                   instance_results=[results])[0]
+
+        scores = cls_score.softmax(dim=-1) if cls_score is not None else None
+        img_shape = img_meta['img_shape']
+        bboxes = self.bbox_coder.decode(
+            roi[..., 1:], bbox_pred, max_shape=img_shape)
+
+        if rescale and bboxes.size(0) > 0:
+            assert img_meta.get('scale_factor') is not None
+            scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat(
+                (1, 2))
+            bboxes = (bboxes.view(bboxes.size(0), -1, 4) / scale_factor).view(
+                bboxes.size()[0], -1)
+
+        if rcnn_test_cfg is None:
+            # This means that it is aug test.
+            # It needs to return the raw results without nms.
+            results.bboxes = bboxes
+            results.scores = scores
+        else:
+            roi_idx = np.tile(
+                np.arange(bboxes.shape[0] / self.num_instance)[:, None],
+                (1, self.num_instance)).reshape(-1, 1)[:, 0]
+            roi_idx = torch.from_numpy(roi_idx).to(bboxes.device).reshape(
+                -1, 1)
+            bboxes = torch.cat([bboxes, roi_idx], dim=1)
+            det_bboxes, det_scores = self.set_nms(
+                bboxes, scores[:, 1], rcnn_test_cfg.score_thr,
+                rcnn_test_cfg.nms['iou_threshold'], rcnn_test_cfg.max_per_img)
+
+            results.bboxes = det_bboxes[:, :-1]
+            results.scores = det_scores
+            results.labels = torch.zeros_like(det_scores)
+
+        return results
+
+    @staticmethod
+    def set_nms(bboxes: Tensor,
+                scores: Tensor,
+                score_thr: float,
+                iou_threshold: float,
+                max_num: int = -1) -> Tuple[Tensor, Tensor]:
+        """NMS for multi-instance prediction. Please refer to
+        https://github.com/Purkialo/CrowdDet for more details.
+
+        Args:
+            bboxes (Tensor): predict bboxes.
+            scores (Tensor): The score of each predict bbox.
+            score_thr (float): bbox threshold, bboxes with scores lower than it
+                will not be considered.
+            iou_threshold (float): IoU threshold to be considered as
+                conflicted.
+            max_num (int, optional): if there are more than max_num bboxes
+                after NMS, only top max_num will be kept. Default to -1.
+
+        Returns:
+            Tuple[Tensor, Tensor]: (bboxes, scores).
+        """
+
+        bboxes = bboxes[scores > score_thr]
+        scores = scores[scores > score_thr]
+
+        ordered_scores, order = scores.sort(descending=True)
+        ordered_bboxes = bboxes[order]
+        roi_idx = ordered_bboxes[:, -1]
+
+        keep = torch.ones(len(ordered_bboxes)) == 1
+        ruler = torch.arange(len(ordered_bboxes))
+
+        keep = keep.to(bboxes.device)
+        ruler = ruler.to(bboxes.device)
+
+        while ruler.shape[0] > 0:
+            basement = ruler[0]
+            ruler = ruler[1:]
+            idx = roi_idx[basement]
+            # calculate the body overlap
+            basement_bbox = ordered_bboxes[:, :4][basement].reshape(-1, 4)
+            ruler_bbox = ordered_bboxes[:, :4][ruler].reshape(-1, 4)
+            overlap = bbox_overlaps(basement_bbox, ruler_bbox)
+            indices = torch.where(overlap > iou_threshold)[1]
+            loc = torch.where(roi_idx[ruler][indices] == idx)
+            # the mask won't change in the step
+            mask = keep[ruler[indices][loc]]
+            keep[ruler[indices]] = False
+            keep[ruler[indices][loc][mask]] = True
+            ruler[~keep[ruler]] = -1
+            ruler = ruler[ruler > 0]
+
+        keep = keep[order.sort()[1]]
+        return bboxes[keep][:max_num, :], scores[keep][:max_num]
diff --git a/mmde/mmdet/models/roi_heads/bbox_heads/sabl_head.py b/mmde/mmdet/models/roi_heads/bbox_heads/sabl_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a9ee6aba9669514ec8ce7218e8c97e026830f6c
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/bbox_heads/sabl_head.py
@@ -0,0 +1,684 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Sequence, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.layers import multiclass_nms
+from mmdet.models.losses import accuracy
+from mmdet.models.task_modules import SamplingResult
+from mmdet.models.utils import multi_apply
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig
+from .bbox_head import BBoxHead
+
+
+@MODELS.register_module()
+class SABLHead(BBoxHead):
+    """Side-Aware Boundary Localization (SABL) for RoI-Head.
+
+    Side-Aware features are extracted by conv layers
+    with an attention mechanism.
+    Boundary Localization with Bucketing and Bucketing Guided Rescoring
+    are implemented in BucketingBBoxCoder.
+
+    Please refer to https://arxiv.org/abs/1912.04260 for more details.
+
+    Args:
+        cls_in_channels (int): Input channels of cls RoI feature. \
+            Defaults to 256.
+        reg_in_channels (int): Input channels of reg RoI feature. \
+            Defaults to 256.
+        roi_feat_size (int): Size of RoI features. Defaults to 7.
+        reg_feat_up_ratio (int): Upsample ratio of reg features. \
+            Defaults to 2.
+        reg_pre_kernel (int): Kernel of 2D conv layers before \
+            attention pooling. Defaults to 3.
+        reg_post_kernel (int): Kernel of 1D conv layers after \
+            attention pooling. Defaults to 3.
+        reg_pre_num (int): Number of pre convs. Defaults to 2.
+        reg_post_num (int): Number of post convs. Defaults to 1.
+        num_classes (int): Number of classes in dataset. Defaults to 80.
+        cls_out_channels (int): Hidden channels in cls fcs. Defaults to 1024.
+        reg_offset_out_channels (int): Hidden and output channel \
+            of reg offset branch. Defaults to 256.
+        reg_cls_out_channels (int): Hidden and output channel \
+            of reg cls branch. Defaults to 256.
+        num_cls_fcs (int): Number of fcs for cls branch. Defaults to 1.
+        num_reg_fcs (int): Number of fcs for reg branch.. Defaults to 0.
+        reg_class_agnostic (bool): Class agnostic regression or not. \
+            Defaults to True.
+        norm_cfg (dict): Config of norm layers. Defaults to None.
+        bbox_coder (dict): Config of bbox coder. Defaults 'BucketingBBoxCoder'.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox_cls (dict): Config of classification loss for bbox branch.
+        loss_bbox_reg (dict): Config of regression loss for bbox branch.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 cls_in_channels: int = 256,
+                 reg_in_channels: int = 256,
+                 roi_feat_size: int = 7,
+                 reg_feat_up_ratio: int = 2,
+                 reg_pre_kernel: int = 3,
+                 reg_post_kernel: int = 3,
+                 reg_pre_num: int = 2,
+                 reg_post_num: int = 1,
+                 cls_out_channels: int = 1024,
+                 reg_offset_out_channels: int = 256,
+                 reg_cls_out_channels: int = 256,
+                 num_cls_fcs: int = 1,
+                 num_reg_fcs: int = 0,
+                 reg_class_agnostic: bool = True,
+                 norm_cfg: OptConfigType = None,
+                 bbox_coder: ConfigType = dict(
+                     type='BucketingBBoxCoder',
+                     num_buckets=14,
+                     scale_factor=1.7),
+                 loss_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 loss_bbox_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_bbox_reg: ConfigType = dict(
+                     type='SmoothL1Loss', beta=0.1, loss_weight=1.0),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(BBoxHead, self).__init__(init_cfg=init_cfg)
+        self.cls_in_channels = cls_in_channels
+        self.reg_in_channels = reg_in_channels
+        self.roi_feat_size = roi_feat_size
+        self.reg_feat_up_ratio = int(reg_feat_up_ratio)
+        self.num_buckets = bbox_coder['num_buckets']
+        assert self.reg_feat_up_ratio // 2 >= 1
+        self.up_reg_feat_size = roi_feat_size * self.reg_feat_up_ratio
+        assert self.up_reg_feat_size == bbox_coder['num_buckets']
+        self.reg_pre_kernel = reg_pre_kernel
+        self.reg_post_kernel = reg_post_kernel
+        self.reg_pre_num = reg_pre_num
+        self.reg_post_num = reg_post_num
+        self.num_classes = num_classes
+        self.cls_out_channels = cls_out_channels
+        self.reg_offset_out_channels = reg_offset_out_channels
+        self.reg_cls_out_channels = reg_cls_out_channels
+        self.num_cls_fcs = num_cls_fcs
+        self.num_reg_fcs = num_reg_fcs
+        self.reg_class_agnostic = reg_class_agnostic
+        assert self.reg_class_agnostic
+        self.norm_cfg = norm_cfg
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox_cls = MODELS.build(loss_bbox_cls)
+        self.loss_bbox_reg = MODELS.build(loss_bbox_reg)
+
+        self.cls_fcs = self._add_fc_branch(self.num_cls_fcs,
+                                           self.cls_in_channels,
+                                           self.roi_feat_size,
+                                           self.cls_out_channels)
+
+        self.side_num = int(np.ceil(self.num_buckets / 2))
+
+        if self.reg_feat_up_ratio > 1:
+            self.upsample_x = nn.ConvTranspose1d(
+                reg_in_channels,
+                reg_in_channels,
+                self.reg_feat_up_ratio,
+                stride=self.reg_feat_up_ratio)
+            self.upsample_y = nn.ConvTranspose1d(
+                reg_in_channels,
+                reg_in_channels,
+                self.reg_feat_up_ratio,
+                stride=self.reg_feat_up_ratio)
+
+        self.reg_pre_convs = nn.ModuleList()
+        for i in range(self.reg_pre_num):
+            reg_pre_conv = ConvModule(
+                reg_in_channels,
+                reg_in_channels,
+                kernel_size=reg_pre_kernel,
+                padding=reg_pre_kernel // 2,
+                norm_cfg=norm_cfg,
+                act_cfg=dict(type='ReLU'))
+            self.reg_pre_convs.append(reg_pre_conv)
+
+        self.reg_post_conv_xs = nn.ModuleList()
+        for i in range(self.reg_post_num):
+            reg_post_conv_x = ConvModule(
+                reg_in_channels,
+                reg_in_channels,
+                kernel_size=(1, reg_post_kernel),
+                padding=(0, reg_post_kernel // 2),
+                norm_cfg=norm_cfg,
+                act_cfg=dict(type='ReLU'))
+            self.reg_post_conv_xs.append(reg_post_conv_x)
+        self.reg_post_conv_ys = nn.ModuleList()
+        for i in range(self.reg_post_num):
+            reg_post_conv_y = ConvModule(
+                reg_in_channels,
+                reg_in_channels,
+                kernel_size=(reg_post_kernel, 1),
+                padding=(reg_post_kernel // 2, 0),
+                norm_cfg=norm_cfg,
+                act_cfg=dict(type='ReLU'))
+            self.reg_post_conv_ys.append(reg_post_conv_y)
+
+        self.reg_conv_att_x = nn.Conv2d(reg_in_channels, 1, 1)
+        self.reg_conv_att_y = nn.Conv2d(reg_in_channels, 1, 1)
+
+        self.fc_cls = nn.Linear(self.cls_out_channels, self.num_classes + 1)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.reg_cls_fcs = self._add_fc_branch(self.num_reg_fcs,
+                                               self.reg_in_channels, 1,
+                                               self.reg_cls_out_channels)
+        self.reg_offset_fcs = self._add_fc_branch(self.num_reg_fcs,
+                                                  self.reg_in_channels, 1,
+                                                  self.reg_offset_out_channels)
+        self.fc_reg_cls = nn.Linear(self.reg_cls_out_channels, 1)
+        self.fc_reg_offset = nn.Linear(self.reg_offset_out_channels, 1)
+
+        if init_cfg is None:
+            self.init_cfg = [
+                dict(
+                    type='Xavier',
+                    layer='Linear',
+                    distribution='uniform',
+                    override=[
+                        dict(type='Normal', name='reg_conv_att_x', std=0.01),
+                        dict(type='Normal', name='reg_conv_att_y', std=0.01),
+                        dict(type='Normal', name='fc_reg_cls', std=0.01),
+                        dict(type='Normal', name='fc_cls', std=0.01),
+                        dict(type='Normal', name='fc_reg_offset', std=0.001)
+                    ])
+            ]
+            if self.reg_feat_up_ratio > 1:
+                self.init_cfg += [
+                    dict(
+                        type='Kaiming',
+                        distribution='normal',
+                        override=[
+                            dict(name='upsample_x'),
+                            dict(name='upsample_y')
+                        ])
+                ]
+
+    def _add_fc_branch(self, num_branch_fcs: int, in_channels: int,
+                       roi_feat_size: int,
+                       fc_out_channels: int) -> nn.ModuleList:
+        """build fc layers."""
+        in_channels = in_channels * roi_feat_size * roi_feat_size
+        branch_fcs = nn.ModuleList()
+        for i in range(num_branch_fcs):
+            fc_in_channels = (in_channels if i == 0 else fc_out_channels)
+            branch_fcs.append(nn.Linear(fc_in_channels, fc_out_channels))
+        return branch_fcs
+
+    def cls_forward(self, cls_x: Tensor) -> Tensor:
+        """forward of classification fc layers."""
+        cls_x = cls_x.view(cls_x.size(0), -1)
+        for fc in self.cls_fcs:
+            cls_x = self.relu(fc(cls_x))
+        cls_score = self.fc_cls(cls_x)
+        return cls_score
+
+    def attention_pool(self, reg_x: Tensor) -> tuple:
+        """Extract direction-specific features fx and fy with attention
+        methanism."""
+        reg_fx = reg_x
+        reg_fy = reg_x
+        reg_fx_att = self.reg_conv_att_x(reg_fx).sigmoid()
+        reg_fy_att = self.reg_conv_att_y(reg_fy).sigmoid()
+        reg_fx_att = reg_fx_att / reg_fx_att.sum(dim=2).unsqueeze(2)
+        reg_fy_att = reg_fy_att / reg_fy_att.sum(dim=3).unsqueeze(3)
+        reg_fx = (reg_fx * reg_fx_att).sum(dim=2)
+        reg_fy = (reg_fy * reg_fy_att).sum(dim=3)
+        return reg_fx, reg_fy
+
+    def side_aware_feature_extractor(self, reg_x: Tensor) -> tuple:
+        """Refine and extract side-aware features without split them."""
+        for reg_pre_conv in self.reg_pre_convs:
+            reg_x = reg_pre_conv(reg_x)
+        reg_fx, reg_fy = self.attention_pool(reg_x)
+
+        if self.reg_post_num > 0:
+            reg_fx = reg_fx.unsqueeze(2)
+            reg_fy = reg_fy.unsqueeze(3)
+            for i in range(self.reg_post_num):
+                reg_fx = self.reg_post_conv_xs[i](reg_fx)
+                reg_fy = self.reg_post_conv_ys[i](reg_fy)
+            reg_fx = reg_fx.squeeze(2)
+            reg_fy = reg_fy.squeeze(3)
+        if self.reg_feat_up_ratio > 1:
+            reg_fx = self.relu(self.upsample_x(reg_fx))
+            reg_fy = self.relu(self.upsample_y(reg_fy))
+        reg_fx = torch.transpose(reg_fx, 1, 2)
+        reg_fy = torch.transpose(reg_fy, 1, 2)
+        return reg_fx.contiguous(), reg_fy.contiguous()
+
+    def reg_pred(self, x: Tensor, offset_fcs: nn.ModuleList,
+                 cls_fcs: nn.ModuleList) -> tuple:
+        """Predict bucketing estimation (cls_pred) and fine regression (offset
+        pred) with side-aware features."""
+        x_offset = x.view(-1, self.reg_in_channels)
+        x_cls = x.view(-1, self.reg_in_channels)
+
+        for fc in offset_fcs:
+            x_offset = self.relu(fc(x_offset))
+        for fc in cls_fcs:
+            x_cls = self.relu(fc(x_cls))
+        offset_pred = self.fc_reg_offset(x_offset)
+        cls_pred = self.fc_reg_cls(x_cls)
+
+        offset_pred = offset_pred.view(x.size(0), -1)
+        cls_pred = cls_pred.view(x.size(0), -1)
+
+        return offset_pred, cls_pred
+
+    def side_aware_split(self, feat: Tensor) -> Tensor:
+        """Split side-aware features aligned with orders of bucketing
+        targets."""
+        l_end = int(np.ceil(self.up_reg_feat_size / 2))
+        r_start = int(np.floor(self.up_reg_feat_size / 2))
+        feat_fl = feat[:, :l_end]
+        feat_fr = feat[:, r_start:].flip(dims=(1, ))
+        feat_fl = feat_fl.contiguous()
+        feat_fr = feat_fr.contiguous()
+        feat = torch.cat([feat_fl, feat_fr], dim=-1)
+        return feat
+
+    def bbox_pred_split(self, bbox_pred: tuple,
+                        num_proposals_per_img: Sequence[int]) -> tuple:
+        """Split batch bbox prediction back to each image."""
+        bucket_cls_preds, bucket_offset_preds = bbox_pred
+        bucket_cls_preds = bucket_cls_preds.split(num_proposals_per_img, 0)
+        bucket_offset_preds = bucket_offset_preds.split(
+            num_proposals_per_img, 0)
+        bbox_pred = tuple(zip(bucket_cls_preds, bucket_offset_preds))
+        return bbox_pred
+
+    def reg_forward(self, reg_x: Tensor) -> tuple:
+        """forward of regression branch."""
+        outs = self.side_aware_feature_extractor(reg_x)
+        edge_offset_preds = []
+        edge_cls_preds = []
+        reg_fx = outs[0]
+        reg_fy = outs[1]
+        offset_pred_x, cls_pred_x = self.reg_pred(reg_fx, self.reg_offset_fcs,
+                                                  self.reg_cls_fcs)
+        offset_pred_y, cls_pred_y = self.reg_pred(reg_fy, self.reg_offset_fcs,
+                                                  self.reg_cls_fcs)
+        offset_pred_x = self.side_aware_split(offset_pred_x)
+        offset_pred_y = self.side_aware_split(offset_pred_y)
+        cls_pred_x = self.side_aware_split(cls_pred_x)
+        cls_pred_y = self.side_aware_split(cls_pred_y)
+        edge_offset_preds = torch.cat([offset_pred_x, offset_pred_y], dim=-1)
+        edge_cls_preds = torch.cat([cls_pred_x, cls_pred_y], dim=-1)
+
+        return edge_cls_preds, edge_offset_preds
+
+    def forward(self, x: Tensor) -> tuple:
+        """Forward features from the upstream network."""
+        bbox_pred = self.reg_forward(x)
+        cls_score = self.cls_forward(x)
+
+        return cls_score, bbox_pred
+
+    def get_targets(self,
+                    sampling_results: List[SamplingResult],
+                    rcnn_train_cfg: ConfigDict,
+                    concat: bool = True) -> tuple:
+        """Calculate the ground truth for all samples in a batch according to
+        the sampling_results."""
+        pos_proposals = [res.pos_bboxes for res in sampling_results]
+        neg_proposals = [res.neg_bboxes for res in sampling_results]
+        pos_gt_bboxes = [res.pos_gt_bboxes for res in sampling_results]
+        pos_gt_labels = [res.pos_gt_labels for res in sampling_results]
+        cls_reg_targets = self.bucket_target(
+            pos_proposals,
+            neg_proposals,
+            pos_gt_bboxes,
+            pos_gt_labels,
+            rcnn_train_cfg,
+            concat=concat)
+        (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+         bucket_offset_targets, bucket_offset_weights) = cls_reg_targets
+        return (labels, label_weights, (bucket_cls_targets,
+                                        bucket_offset_targets),
+                (bucket_cls_weights, bucket_offset_weights))
+
+    def bucket_target(self,
+                      pos_proposals_list: list,
+                      neg_proposals_list: list,
+                      pos_gt_bboxes_list: list,
+                      pos_gt_labels_list: list,
+                      rcnn_train_cfg: ConfigDict,
+                      concat: bool = True) -> tuple:
+        """Compute bucketing estimation targets and fine regression targets for
+        a batch of images."""
+        (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+         bucket_offset_targets, bucket_offset_weights) = multi_apply(
+             self._bucket_target_single,
+             pos_proposals_list,
+             neg_proposals_list,
+             pos_gt_bboxes_list,
+             pos_gt_labels_list,
+             cfg=rcnn_train_cfg)
+
+        if concat:
+            labels = torch.cat(labels, 0)
+            label_weights = torch.cat(label_weights, 0)
+            bucket_cls_targets = torch.cat(bucket_cls_targets, 0)
+            bucket_cls_weights = torch.cat(bucket_cls_weights, 0)
+            bucket_offset_targets = torch.cat(bucket_offset_targets, 0)
+            bucket_offset_weights = torch.cat(bucket_offset_weights, 0)
+        return (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+                bucket_offset_targets, bucket_offset_weights)
+
+    def _bucket_target_single(self, pos_proposals: Tensor,
+                              neg_proposals: Tensor, pos_gt_bboxes: Tensor,
+                              pos_gt_labels: Tensor, cfg: ConfigDict) -> tuple:
+        """Compute bucketing estimation targets and fine regression targets for
+        a single image.
+
+        Args:
+            pos_proposals (Tensor): positive proposals of a single image,
+                 Shape (n_pos, 4)
+            neg_proposals (Tensor): negative proposals of a single image,
+                 Shape (n_neg, 4).
+            pos_gt_bboxes (Tensor): gt bboxes assigned to positive proposals
+                 of a single image, Shape (n_pos, 4).
+            pos_gt_labels (Tensor): gt labels assigned to positive proposals
+                 of a single image, Shape (n_pos, ).
+            cfg (dict): Config of calculating targets
+
+        Returns:
+            tuple:
+
+            - labels (Tensor): Labels in a single image. Shape (n,).
+            - label_weights (Tensor): Label weights in a single image.
+                Shape (n,)
+            - bucket_cls_targets (Tensor): Bucket cls targets in
+                a single image. Shape (n, num_buckets*2).
+            - bucket_cls_weights (Tensor): Bucket cls weights in
+                a single image. Shape (n, num_buckets*2).
+            - bucket_offset_targets (Tensor): Bucket offset targets
+                in a single image. Shape (n, num_buckets*2).
+            - bucket_offset_targets (Tensor): Bucket offset weights
+                in a single image. Shape (n, num_buckets*2).
+        """
+        num_pos = pos_proposals.size(0)
+        num_neg = neg_proposals.size(0)
+        num_samples = num_pos + num_neg
+        labels = pos_gt_bboxes.new_full((num_samples, ),
+                                        self.num_classes,
+                                        dtype=torch.long)
+        label_weights = pos_proposals.new_zeros(num_samples)
+        bucket_cls_targets = pos_proposals.new_zeros(num_samples,
+                                                     4 * self.side_num)
+        bucket_cls_weights = pos_proposals.new_zeros(num_samples,
+                                                     4 * self.side_num)
+        bucket_offset_targets = pos_proposals.new_zeros(
+            num_samples, 4 * self.side_num)
+        bucket_offset_weights = pos_proposals.new_zeros(
+            num_samples, 4 * self.side_num)
+        if num_pos > 0:
+            labels[:num_pos] = pos_gt_labels
+            label_weights[:num_pos] = 1.0
+            (pos_bucket_offset_targets, pos_bucket_offset_weights,
+             pos_bucket_cls_targets,
+             pos_bucket_cls_weights) = self.bbox_coder.encode(
+                 pos_proposals, pos_gt_bboxes)
+            bucket_cls_targets[:num_pos, :] = pos_bucket_cls_targets
+            bucket_cls_weights[:num_pos, :] = pos_bucket_cls_weights
+            bucket_offset_targets[:num_pos, :] = pos_bucket_offset_targets
+            bucket_offset_weights[:num_pos, :] = pos_bucket_offset_weights
+        if num_neg > 0:
+            label_weights[-num_neg:] = 1.0
+        return (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+                bucket_offset_targets, bucket_offset_weights)
+
+    def loss(self,
+             cls_score: Tensor,
+             bbox_pred: Tuple[Tensor, Tensor],
+             rois: Tensor,
+             labels: Tensor,
+             label_weights: Tensor,
+             bbox_targets: Tuple[Tensor, Tensor],
+             bbox_weights: Tuple[Tensor, Tensor],
+             reduction_override: Optional[str] = None) -> dict:
+        """Calculate the loss based on the network predictions and targets.
+
+        Args:
+            cls_score (Tensor): Classification prediction
+                results of all class, has shape
+                (batch_size * num_proposals_single_image, num_classes)
+            bbox_pred (Tensor): A tuple of regression prediction results
+                containing `bucket_cls_preds and` `bucket_offset_preds`.
+            rois (Tensor): RoIs with the shape
+                (batch_size * num_proposals_single_image, 5) where the first
+                column indicates batch id of each RoI.
+            labels (Tensor): Gt_labels for all proposals in a batch, has
+                shape (batch_size * num_proposals_single_image, ).
+            label_weights (Tensor): Labels_weights for all proposals in a
+                batch, has shape (batch_size * num_proposals_single_image, ).
+            bbox_targets (Tuple[Tensor, Tensor]): A tuple of regression target
+                containing `bucket_cls_targets` and `bucket_offset_targets`.
+                the last dimension 4 represents [tl_x, tl_y, br_x, br_y].
+            bbox_weights (Tuple[Tensor, Tensor]): A tuple of regression
+                weights containing `bucket_cls_weights` and
+                `bucket_offset_weights`.
+            reduction_override (str, optional): The reduction
+                method used to override the original reduction
+                method of the loss. Options are "none",
+                "mean" and "sum". Defaults to None,
+
+        Returns:
+            dict: A dictionary of loss.
+        """
+        losses = dict()
+        if cls_score is not None:
+            avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.)
+            losses['loss_cls'] = self.loss_cls(
+                cls_score,
+                labels,
+                label_weights,
+                avg_factor=avg_factor,
+                reduction_override=reduction_override)
+            losses['acc'] = accuracy(cls_score, labels)
+
+        if bbox_pred is not None:
+            bucket_cls_preds, bucket_offset_preds = bbox_pred
+            bucket_cls_targets, bucket_offset_targets = bbox_targets
+            bucket_cls_weights, bucket_offset_weights = bbox_weights
+            # edge cls
+            bucket_cls_preds = bucket_cls_preds.view(-1, self.side_num)
+            bucket_cls_targets = bucket_cls_targets.view(-1, self.side_num)
+            bucket_cls_weights = bucket_cls_weights.view(-1, self.side_num)
+            losses['loss_bbox_cls'] = self.loss_bbox_cls(
+                bucket_cls_preds,
+                bucket_cls_targets,
+                bucket_cls_weights,
+                avg_factor=bucket_cls_targets.size(0),
+                reduction_override=reduction_override)
+
+            losses['loss_bbox_reg'] = self.loss_bbox_reg(
+                bucket_offset_preds,
+                bucket_offset_targets,
+                bucket_offset_weights,
+                avg_factor=bucket_offset_targets.size(0),
+                reduction_override=reduction_override)
+
+        return losses
+
+    def _predict_by_feat_single(
+            self,
+            roi: Tensor,
+            cls_score: Tensor,
+            bbox_pred: Tuple[Tensor, Tensor],
+            img_meta: dict,
+            rescale: bool = False,
+            rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5).
+                last dimension 5 arrange as (batch_index, x1, y1, x2, y2).
+            cls_score (Tensor): Box scores, has shape
+                (num_boxes, num_classes + 1).
+            bbox_pred (Tuple[Tensor, Tensor]): Box cls preds and offset preds.
+            img_meta (dict): image information.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head.
+                Defaults to None
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        results = InstanceData()
+        if isinstance(cls_score, list):
+            cls_score = sum(cls_score) / float(len(cls_score))
+        scores = F.softmax(cls_score, dim=1) if cls_score is not None else None
+        img_shape = img_meta['img_shape']
+        if bbox_pred is not None:
+            bboxes, confidences = self.bbox_coder.decode(
+                roi[:, 1:], bbox_pred, img_shape)
+        else:
+            bboxes = roi[:, 1:].clone()
+            confidences = None
+            if img_shape is not None:
+                bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1] - 1)
+                bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0] - 1)
+
+        if rescale and bboxes.size(0) > 0:
+            assert img_meta.get('scale_factor') is not None
+            scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat(
+                (1, 2))
+            bboxes = (bboxes.view(bboxes.size(0), -1, 4) / scale_factor).view(
+                bboxes.size()[0], -1)
+
+        if rcnn_test_cfg is None:
+            results.bboxes = bboxes
+            results.scores = scores
+        else:
+            det_bboxes, det_labels = multiclass_nms(
+                bboxes,
+                scores,
+                rcnn_test_cfg.score_thr,
+                rcnn_test_cfg.nms,
+                rcnn_test_cfg.max_per_img,
+                score_factors=confidences)
+            results.bboxes = det_bboxes[:, :4]
+            results.scores = det_bboxes[:, -1]
+            results.labels = det_labels
+        return results
+
+    def refine_bboxes(self, sampling_results: List[SamplingResult],
+                      bbox_results: dict,
+                      batch_img_metas: List[dict]) -> InstanceList:
+        """Refine bboxes during training.
+
+        Args:
+            sampling_results (List[:obj:`SamplingResult`]): Sampling results.
+            bbox_results (dict): Usually is a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `rois` (Tensor): RoIs with the shape (n, 5) where the first
+                  column indicates batch id of each RoI.
+                - `bbox_targets` (tuple):  Ground truth for proposals in a
+                  single image. Containing the following list of Tensors:
+                  (labels, label_weights, bbox_targets, bbox_weights)
+            batch_img_metas (List[dict]): List of image information.
+
+        Returns:
+            list[:obj:`InstanceData`]: Refined bboxes of each image.
+        """
+        pos_is_gts = [res.pos_is_gt for res in sampling_results]
+        # bbox_targets is a tuple
+        labels = bbox_results['bbox_targets'][0]
+        cls_scores = bbox_results['cls_score']
+        rois = bbox_results['rois']
+        bbox_preds = bbox_results['bbox_pred']
+
+        if cls_scores.numel() == 0:
+            return None
+
+        labels = torch.where(labels == self.num_classes,
+                             cls_scores[:, :-1].argmax(1), labels)
+
+        img_ids = rois[:, 0].long().unique(sorted=True)
+        assert img_ids.numel() <= len(batch_img_metas)
+
+        results_list = []
+        for i in range(len(batch_img_metas)):
+            inds = torch.nonzero(
+                rois[:, 0] == i, as_tuple=False).squeeze(dim=1)
+            num_rois = inds.numel()
+
+            bboxes_ = rois[inds, 1:]
+            label_ = labels[inds]
+            edge_cls_preds, edge_offset_preds = bbox_preds
+            edge_cls_preds_ = edge_cls_preds[inds]
+            edge_offset_preds_ = edge_offset_preds[inds]
+            bbox_pred_ = (edge_cls_preds_, edge_offset_preds_)
+            img_meta_ = batch_img_metas[i]
+            pos_is_gts_ = pos_is_gts[i]
+
+            bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_,
+                                           img_meta_)
+            # filter gt bboxes
+            pos_keep = 1 - pos_is_gts_
+            keep_inds = pos_is_gts_.new_ones(num_rois)
+            keep_inds[:len(pos_is_gts_)] = pos_keep
+            results = InstanceData(bboxes=bboxes[keep_inds.type(torch.bool)])
+            results_list.append(results)
+
+        return results_list
+
+    def regress_by_class(self, rois: Tensor, label: Tensor, bbox_pred: tuple,
+                         img_meta: dict) -> Tensor:
+        """Regress the bbox for the predicted class. Used in Cascade R-CNN.
+
+        Args:
+            rois (Tensor): shape (n, 4) or (n, 5)
+            label (Tensor): shape (n, )
+            bbox_pred (Tuple[Tensor]): shape [(n, num_buckets *2), \
+                (n, num_buckets *2)]
+            img_meta (dict): Image meta info.
+
+        Returns:
+            Tensor: Regressed bboxes, the same shape as input rois.
+        """
+        assert rois.size(1) == 4 or rois.size(1) == 5
+
+        if rois.size(1) == 4:
+            new_rois, _ = self.bbox_coder.decode(rois, bbox_pred,
+                                                 img_meta['img_shape'])
+        else:
+            bboxes, _ = self.bbox_coder.decode(rois[:, 1:], bbox_pred,
+                                               img_meta['img_shape'])
+            new_rois = torch.cat((rois[:, [0]], bboxes), dim=1)
+
+        return new_rois
diff --git a/mmde/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py b/mmde/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..790b08fb207970927c7925cb8b3fb365bc183dc4
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple, Union
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .convfc_bbox_head import ConvFCBBoxHead
+
+
+@MODELS.register_module()
+class SCNetBBoxHead(ConvFCBBoxHead):
+    """BBox head for `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    This inherits ``ConvFCBBoxHead`` with modified forward() function, allow us
+    to get intermediate shared feature.
+    """
+
+    def _forward_shared(self, x: Tensor) -> Tensor:
+        """Forward function for shared part.
+
+        Args:
+            x (Tensor): Input feature.
+
+        Returns:
+            Tensor: Shared feature.
+        """
+        if self.num_shared_convs > 0:
+            for conv in self.shared_convs:
+                x = conv(x)
+
+        if self.num_shared_fcs > 0:
+            if self.with_avg_pool:
+                x = self.avg_pool(x)
+
+            x = x.flatten(1)
+
+            for fc in self.shared_fcs:
+                x = self.relu(fc(x))
+
+        return x
+
+    def _forward_cls_reg(self, x: Tensor) -> Tuple[Tensor]:
+        """Forward function for classification and regression parts.
+
+        Args:
+            x (Tensor): Input feature.
+
+        Returns:
+            tuple[Tensor]:
+
+                - cls_score (Tensor): classification prediction.
+                - bbox_pred (Tensor): bbox prediction.
+        """
+        x_cls = x
+        x_reg = x
+
+        for conv in self.cls_convs:
+            x_cls = conv(x_cls)
+        if x_cls.dim() > 2:
+            if self.with_avg_pool:
+                x_cls = self.avg_pool(x_cls)
+            x_cls = x_cls.flatten(1)
+        for fc in self.cls_fcs:
+            x_cls = self.relu(fc(x_cls))
+
+        for conv in self.reg_convs:
+            x_reg = conv(x_reg)
+        if x_reg.dim() > 2:
+            if self.with_avg_pool:
+                x_reg = self.avg_pool(x_reg)
+            x_reg = x_reg.flatten(1)
+        for fc in self.reg_fcs:
+            x_reg = self.relu(fc(x_reg))
+
+        cls_score = self.fc_cls(x_cls) if self.with_cls else None
+        bbox_pred = self.fc_reg(x_reg) if self.with_reg else None
+
+        return cls_score, bbox_pred
+
+    def forward(
+            self,
+            x: Tensor,
+            return_shared_feat: bool = False) -> Union[Tensor, Tuple[Tensor]]:
+        """Forward function.
+
+        Args:
+            x (Tensor): input features
+            return_shared_feat (bool): If True, return cls-reg-shared feature.
+
+        Return:
+            out (tuple[Tensor]): contain ``cls_score`` and ``bbox_pred``,
+                if  ``return_shared_feat`` is True, append ``x_shared`` to the
+                returned tuple.
+        """
+        x_shared = self._forward_shared(x)
+        out = self._forward_cls_reg(x_shared)
+
+        if return_shared_feat:
+            out += (x_shared, )
+
+        return out
diff --git a/mmde/mmdet/models/roi_heads/cascade_roi_head.py b/mmde/mmdet/models/roi_heads/cascade_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..81db671113a63beb7849abdc0e432a738ee46f5e
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/cascade_roi_head.py
@@ -0,0 +1,568 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Sequence, Tuple, Union
+
+import torch
+import torch.nn as nn
+from mmengine.model import ModuleList
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.models.test_time_augs import merge_aug_masks
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi, get_box_tensor
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
+                         OptMultiConfig)
+from ..utils.misc import empty_instances, unpack_gt_instances
+from .base_roi_head import BaseRoIHead
+
+
+@MODELS.register_module()
+class CascadeRoIHead(BaseRoIHead):
+    """Cascade roi head including one bbox head and one mask head.
+
+    https://arxiv.org/abs/1712.00726
+    """
+
+    def __init__(self,
+                 num_stages: int,
+                 stage_loss_weights: Union[List[float], Tuple[float]],
+                 bbox_roi_extractor: OptMultiConfig = None,
+                 bbox_head: OptMultiConfig = None,
+                 mask_roi_extractor: OptMultiConfig = None,
+                 mask_head: OptMultiConfig = None,
+                 shared_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        assert bbox_roi_extractor is not None
+        assert bbox_head is not None
+        assert shared_head is None, \
+            'Shared head is not supported in Cascade RCNN anymore'
+
+        self.num_stages = num_stages
+        self.stage_loss_weights = stage_loss_weights
+        super().__init__(
+            bbox_roi_extractor=bbox_roi_extractor,
+            bbox_head=bbox_head,
+            mask_roi_extractor=mask_roi_extractor,
+            mask_head=mask_head,
+            shared_head=shared_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+
+    def init_bbox_head(self, bbox_roi_extractor: MultiConfig,
+                       bbox_head: MultiConfig) -> None:
+        """Initialize box head and box roi extractor.
+
+        Args:
+            bbox_roi_extractor (:obj:`ConfigDict`, dict or list):
+                Config of box roi extractor.
+            bbox_head (:obj:`ConfigDict`, dict or list): Config
+                of box in box head.
+        """
+        self.bbox_roi_extractor = ModuleList()
+        self.bbox_head = ModuleList()
+        if not isinstance(bbox_roi_extractor, list):
+            bbox_roi_extractor = [
+                bbox_roi_extractor for _ in range(self.num_stages)
+            ]
+        if not isinstance(bbox_head, list):
+            bbox_head = [bbox_head for _ in range(self.num_stages)]
+        assert len(bbox_roi_extractor) == len(bbox_head) == self.num_stages
+        for roi_extractor, head in zip(bbox_roi_extractor, bbox_head):
+            self.bbox_roi_extractor.append(MODELS.build(roi_extractor))
+            self.bbox_head.append(MODELS.build(head))
+
+    def init_mask_head(self, mask_roi_extractor: MultiConfig,
+                       mask_head: MultiConfig) -> None:
+        """Initialize mask head and mask roi extractor.
+
+        Args:
+            mask_head (dict): Config of mask in mask head.
+            mask_roi_extractor (:obj:`ConfigDict`, dict or list):
+                Config of mask roi extractor.
+        """
+        self.mask_head = nn.ModuleList()
+        if not isinstance(mask_head, list):
+            mask_head = [mask_head for _ in range(self.num_stages)]
+        assert len(mask_head) == self.num_stages
+        for head in mask_head:
+            self.mask_head.append(MODELS.build(head))
+        if mask_roi_extractor is not None:
+            self.share_roi_extractor = False
+            self.mask_roi_extractor = ModuleList()
+            if not isinstance(mask_roi_extractor, list):
+                mask_roi_extractor = [
+                    mask_roi_extractor for _ in range(self.num_stages)
+                ]
+            assert len(mask_roi_extractor) == self.num_stages
+            for roi_extractor in mask_roi_extractor:
+                self.mask_roi_extractor.append(MODELS.build(roi_extractor))
+        else:
+            self.share_roi_extractor = True
+            self.mask_roi_extractor = self.bbox_roi_extractor
+
+    def init_assigner_sampler(self) -> None:
+        """Initialize assigner and sampler for each stage."""
+        self.bbox_assigner = []
+        self.bbox_sampler = []
+        if self.train_cfg is not None:
+            for idx, rcnn_train_cfg in enumerate(self.train_cfg):
+                self.bbox_assigner.append(
+                    TASK_UTILS.build(rcnn_train_cfg.assigner))
+                self.current_stage = idx
+                self.bbox_sampler.append(
+                    TASK_UTILS.build(
+                        rcnn_train_cfg.sampler,
+                        default_args=dict(context=self)))
+
+    def _bbox_forward(self, stage: int, x: Tuple[Tensor],
+                      rois: Tensor) -> dict:
+        """Box head forward function used in both training and testing.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+
+        Returns:
+             dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+        """
+        bbox_roi_extractor = self.bbox_roi_extractor[stage]
+        bbox_head = self.bbox_head[stage]
+        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
+                                        rois)
+        # do not support caffe_c4 model anymore
+        cls_score, bbox_pred = bbox_head(bbox_feats)
+
+        bbox_results = dict(
+            cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats)
+        return bbox_results
+
+    def bbox_loss(self, stage: int, x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult]) -> dict:
+        """Run forward function and calculate loss for box head in training.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): List of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+                - `loss_bbox` (dict): A dictionary of bbox loss components.
+                - `rois` (Tensor): RoIs with the shape (n, 5) where the first
+                  column indicates batch id of each RoI.
+                - `bbox_targets` (tuple):  Ground truth for proposals in a
+                  single image. Containing the following list of Tensors:
+                  (labels, label_weights, bbox_targets, bbox_weights)
+        """
+        bbox_head = self.bbox_head[stage]
+        rois = bbox2roi([res.priors for res in sampling_results])
+        bbox_results = self._bbox_forward(stage, x, rois)
+        bbox_results.update(rois=rois)
+
+        bbox_loss_and_target = bbox_head.loss_and_target(
+            cls_score=bbox_results['cls_score'],
+            bbox_pred=bbox_results['bbox_pred'],
+            rois=rois,
+            sampling_results=sampling_results,
+            rcnn_train_cfg=self.train_cfg[stage])
+        bbox_results.update(bbox_loss_and_target)
+
+        return bbox_results
+
+    def _mask_forward(self, stage: int, x: Tuple[Tensor],
+                      rois: Tensor) -> dict:
+        """Mask head forward function used in both training and testing.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `mask_preds` (Tensor): Mask prediction.
+        """
+        mask_roi_extractor = self.mask_roi_extractor[stage]
+        mask_head = self.mask_head[stage]
+        mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs],
+                                        rois)
+        # do not support caffe_c4 model anymore
+        mask_preds = mask_head(mask_feats)
+
+        mask_results = dict(mask_preds=mask_preds)
+        return mask_results
+
+    def mask_loss(self, stage: int, x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult],
+                  batch_gt_instances: InstanceList) -> dict:
+        """Run forward function and calculate loss for mask head in training.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `mask_preds` (Tensor): Mask prediction.
+                - `loss_mask` (dict): A dictionary of mask loss components.
+        """
+        pos_rois = bbox2roi([res.pos_priors for res in sampling_results])
+        mask_results = self._mask_forward(stage, x, pos_rois)
+
+        mask_head = self.mask_head[stage]
+
+        mask_loss_and_target = mask_head.loss_and_target(
+            mask_preds=mask_results['mask_preds'],
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            rcnn_train_cfg=self.train_cfg[stage])
+        mask_results.update(mask_loss_and_target)
+
+        return mask_results
+
+    def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+             batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        # TODO: May add a new function in baseroihead
+        assert len(rpn_results_list) == len(batch_data_samples)
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \
+            = outputs
+
+        num_imgs = len(batch_data_samples)
+        losses = dict()
+        results_list = rpn_results_list
+        for stage in range(self.num_stages):
+            self.current_stage = stage
+
+            stage_loss_weight = self.stage_loss_weights[stage]
+
+            # assign gts and sample proposals
+            sampling_results = []
+            if self.with_bbox or self.with_mask:
+                bbox_assigner = self.bbox_assigner[stage]
+                bbox_sampler = self.bbox_sampler[stage]
+
+                for i in range(num_imgs):
+                    results = results_list[i]
+                    # rename rpn_results.bboxes to rpn_results.priors
+                    results.priors = results.pop('bboxes')
+
+                    assign_result = bbox_assigner.assign(
+                        results, batch_gt_instances[i],
+                        batch_gt_instances_ignore[i])
+
+                    sampling_result = bbox_sampler.sample(
+                        assign_result,
+                        results,
+                        batch_gt_instances[i],
+                        feats=[lvl_feat[i][None] for lvl_feat in x])
+                    sampling_results.append(sampling_result)
+
+            # bbox head forward and loss
+            bbox_results = self.bbox_loss(stage, x, sampling_results)
+
+            for name, value in bbox_results['loss_bbox'].items():
+                losses[f's{stage}.{name}'] = (
+                    value * stage_loss_weight if 'loss' in name else value)
+
+            # mask head forward and loss
+            if self.with_mask:
+                mask_results = self.mask_loss(stage, x, sampling_results,
+                                              batch_gt_instances)
+                for name, value in mask_results['loss_mask'].items():
+                    losses[f's{stage}.{name}'] = (
+                        value * stage_loss_weight if 'loss' in name else value)
+
+            # refine bboxes
+            if stage < self.num_stages - 1:
+                bbox_head = self.bbox_head[stage]
+                with torch.no_grad():
+                    results_list = bbox_head.refine_bboxes(
+                        sampling_results, bbox_results, batch_img_metas)
+                    # Empty proposal
+                    if results_list is None:
+                        break
+        return losses
+
+    def predict_bbox(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     rpn_results_list: InstanceList,
+                     rcnn_test_cfg: ConfigType,
+                     rescale: bool = False,
+                     **kwargs) -> InstanceList:
+        """Perform forward propagation of the bbox head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        proposals = [res.bboxes for res in rpn_results_list]
+        num_proposals_per_img = tuple(len(p) for p in proposals)
+        rois = bbox2roi(proposals)
+
+        if rois.shape[0] == 0:
+            return empty_instances(
+                batch_img_metas,
+                rois.device,
+                task_type='bbox',
+                box_type=self.bbox_head[-1].predict_box_type,
+                num_classes=self.bbox_head[-1].num_classes,
+                score_per_cls=rcnn_test_cfg is None)
+
+        rois, cls_scores, bbox_preds = self._refine_roi(
+            x=x,
+            rois=rois,
+            batch_img_metas=batch_img_metas,
+            num_proposals_per_img=num_proposals_per_img,
+            **kwargs)
+
+        results_list = self.bbox_head[-1].predict_by_feat(
+            rois=rois,
+            cls_scores=cls_scores,
+            bbox_preds=bbox_preds,
+            batch_img_metas=batch_img_metas,
+            rescale=rescale,
+            rcnn_test_cfg=rcnn_test_cfg)
+        return results_list
+
+    def predict_mask(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     results_list: List[InstanceData],
+                     rescale: bool = False) -> List[InstanceData]:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        bboxes = [res.bboxes for res in results_list]
+        mask_rois = bbox2roi(bboxes)
+        if mask_rois.shape[0] == 0:
+            results_list = empty_instances(
+                batch_img_metas,
+                mask_rois.device,
+                task_type='mask',
+                instance_results=results_list,
+                mask_thr_binary=self.test_cfg.mask_thr_binary)
+            return results_list
+
+        num_mask_rois_per_img = [len(res) for res in results_list]
+        aug_masks = []
+        for stage in range(self.num_stages):
+            mask_results = self._mask_forward(stage, x, mask_rois)
+            mask_preds = mask_results['mask_preds']
+            # split batch mask prediction back to each image
+            mask_preds = mask_preds.split(num_mask_rois_per_img, 0)
+            aug_masks.append([m.sigmoid().detach() for m in mask_preds])
+
+        merged_masks = []
+        for i in range(len(batch_img_metas)):
+            aug_mask = [mask[i] for mask in aug_masks]
+            merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i])
+            merged_masks.append(merged_mask)
+        results_list = self.mask_head[-1].predict_by_feat(
+            mask_preds=merged_masks,
+            results_list=results_list,
+            batch_img_metas=batch_img_metas,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=rescale,
+            activate_map=True)
+        return results_list
+
+    def _refine_roi(self, x: Tuple[Tensor], rois: Tensor,
+                    batch_img_metas: List[dict],
+                    num_proposals_per_img: Sequence[int], **kwargs) -> tuple:
+        """Multi-stage refinement of RoI.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): shape (n, 5), [batch_ind, x1, y1, x2, y2]
+            batch_img_metas (list[dict]): List of image information.
+            num_proposals_per_img (sequence[int]): number of proposals
+                in each image.
+
+        Returns:
+            tuple:
+
+               - rois (Tensor): Refined RoI.
+               - cls_scores (list[Tensor]): Average predicted
+                   cls score per image.
+               - bbox_preds (list[Tensor]): Bbox branch predictions
+                   for the last stage of per image.
+        """
+        # "ms" in variable names means multi-stage
+        ms_scores = []
+        for stage in range(self.num_stages):
+            bbox_results = self._bbox_forward(
+                stage=stage, x=x, rois=rois, **kwargs)
+
+            # split batch bbox prediction back to each image
+            cls_scores = bbox_results['cls_score']
+            bbox_preds = bbox_results['bbox_pred']
+
+            rois = rois.split(num_proposals_per_img, 0)
+            cls_scores = cls_scores.split(num_proposals_per_img, 0)
+            ms_scores.append(cls_scores)
+
+            # some detector with_reg is False, bbox_preds will be None
+            if bbox_preds is not None:
+                # TODO move this to a sabl_roi_head
+                # the bbox prediction of some detectors like SABL is not Tensor
+                if isinstance(bbox_preds, torch.Tensor):
+                    bbox_preds = bbox_preds.split(num_proposals_per_img, 0)
+                else:
+                    bbox_preds = self.bbox_head[stage].bbox_pred_split(
+                        bbox_preds, num_proposals_per_img)
+            else:
+                bbox_preds = (None, ) * len(batch_img_metas)
+
+            if stage < self.num_stages - 1:
+                bbox_head = self.bbox_head[stage]
+                if bbox_head.custom_activation:
+                    cls_scores = [
+                        bbox_head.loss_cls.get_activation(s)
+                        for s in cls_scores
+                    ]
+                refine_rois_list = []
+                for i in range(len(batch_img_metas)):
+                    if rois[i].shape[0] > 0:
+                        bbox_label = cls_scores[i][:, :-1].argmax(dim=1)
+                        # Refactor `bbox_head.regress_by_class` to only accept
+                        # box tensor without img_idx concatenated.
+                        refined_bboxes = bbox_head.regress_by_class(
+                            rois[i][:, 1:], bbox_label, bbox_preds[i],
+                            batch_img_metas[i])
+                        refined_bboxes = get_box_tensor(refined_bboxes)
+                        refined_rois = torch.cat(
+                            [rois[i][:, [0]], refined_bboxes], dim=1)
+                        refine_rois_list.append(refined_rois)
+                rois = torch.cat(refine_rois_list)
+
+        # average scores of each image by stages
+        cls_scores = [
+            sum([score[i] for score in ms_scores]) / float(len(ms_scores))
+            for i in range(len(batch_img_metas))
+        ]
+        return rois, cls_scores, bbox_preds
+
+    def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+                batch_data_samples: SampleList) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            x (List[Tensor]): Multi-level features that may have different
+                resolutions.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+
+        Returns
+            tuple: A tuple of features from ``bbox_head`` and ``mask_head``
+            forward.
+        """
+        results = ()
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        proposals = [rpn_results.bboxes for rpn_results in rpn_results_list]
+        num_proposals_per_img = tuple(len(p) for p in proposals)
+        rois = bbox2roi(proposals)
+        # bbox head
+        if self.with_bbox:
+            rois, cls_scores, bbox_preds = self._refine_roi(
+                x, rois, batch_img_metas, num_proposals_per_img)
+            results = results + (cls_scores, bbox_preds)
+        # mask head
+        if self.with_mask:
+            aug_masks = []
+            rois = torch.cat(rois)
+            for stage in range(self.num_stages):
+                mask_results = self._mask_forward(stage, x, rois)
+                mask_preds = mask_results['mask_preds']
+                mask_preds = mask_preds.split(num_proposals_per_img, 0)
+                aug_masks.append([m.sigmoid().detach() for m in mask_preds])
+
+            merged_masks = []
+            for i in range(len(batch_img_metas)):
+                aug_mask = [mask[i] for mask in aug_masks]
+                merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i])
+                merged_masks.append(merged_mask)
+            results = results + (merged_masks, )
+        return results
diff --git a/mmde/mmdet/models/roi_heads/double_roi_head.py b/mmde/mmdet/models/roi_heads/double_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9464ff55bafcca9f3545a3a72dde1eb3939cece
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/double_roi_head.py
@@ -0,0 +1,53 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .standard_roi_head import StandardRoIHead
+
+
+@MODELS.register_module()
+class DoubleHeadRoIHead(StandardRoIHead):
+    """RoI head for `Double Head RCNN <https://arxiv.org/abs/1904.06493>`_.
+
+    Args:
+        reg_roi_scale_factor (float): The scale factor to extend the rois
+            used to extract the regression features.
+    """
+
+    def __init__(self, reg_roi_scale_factor: float, **kwargs):
+        super().__init__(**kwargs)
+        self.reg_roi_scale_factor = reg_roi_scale_factor
+
+    def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict:
+        """Box head forward function used in both training and testing.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+
+        Returns:
+             dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+        """
+        bbox_cls_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs], rois)
+        bbox_reg_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs],
+            rois,
+            roi_scale_factor=self.reg_roi_scale_factor)
+        if self.with_shared_head:
+            bbox_cls_feats = self.shared_head(bbox_cls_feats)
+            bbox_reg_feats = self.shared_head(bbox_reg_feats)
+        cls_score, bbox_pred = self.bbox_head(bbox_cls_feats, bbox_reg_feats)
+
+        bbox_results = dict(
+            cls_score=cls_score,
+            bbox_pred=bbox_pred,
+            bbox_feats=bbox_cls_feats)
+        return bbox_results
diff --git a/mmde/mmdet/models/roi_heads/dynamic_roi_head.py b/mmde/mmdet/models/roi_heads/dynamic_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c7f7bd2f68cab0fcdec725501f74b65274eb30e
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/dynamic_roi_head.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet.models.losses import SmoothL1Loss
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import InstanceList
+from ..utils.misc import unpack_gt_instances
+from .standard_roi_head import StandardRoIHead
+
+EPS = 1e-15
+
+
+@MODELS.register_module()
+class DynamicRoIHead(StandardRoIHead):
+    """RoI head for `Dynamic R-CNN <https://arxiv.org/abs/2004.06002>`_."""
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        assert isinstance(self.bbox_head.loss_bbox, SmoothL1Loss)
+        # the IoU history of the past `update_iter_interval` iterations
+        self.iou_history = []
+        # the beta history of the past `update_iter_interval` iterations
+        self.beta_history = []
+
+    def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+             batch_data_samples: SampleList) -> dict:
+        """Forward function for training.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, _ = outputs
+
+        # assign gts and sample proposals
+        num_imgs = len(batch_data_samples)
+        sampling_results = []
+        cur_iou = []
+        for i in range(num_imgs):
+            # rename rpn_results.bboxes to rpn_results.priors
+            rpn_results = rpn_results_list[i]
+            rpn_results.priors = rpn_results.pop('bboxes')
+
+            assign_result = self.bbox_assigner.assign(
+                rpn_results, batch_gt_instances[i],
+                batch_gt_instances_ignore[i])
+            sampling_result = self.bbox_sampler.sample(
+                assign_result,
+                rpn_results,
+                batch_gt_instances[i],
+                feats=[lvl_feat[i][None] for lvl_feat in x])
+            # record the `iou_topk`-th largest IoU in an image
+            iou_topk = min(self.train_cfg.dynamic_rcnn.iou_topk,
+                           len(assign_result.max_overlaps))
+            ious, _ = torch.topk(assign_result.max_overlaps, iou_topk)
+            cur_iou.append(ious[-1].item())
+            sampling_results.append(sampling_result)
+        # average the current IoUs over images
+        cur_iou = np.mean(cur_iou)
+        self.iou_history.append(cur_iou)
+
+        losses = dict()
+        # bbox head forward and loss
+        if self.with_bbox:
+            bbox_results = self.bbox_loss(x, sampling_results)
+            losses.update(bbox_results['loss_bbox'])
+
+        # mask head forward and loss
+        if self.with_mask:
+            mask_results = self.mask_loss(x, sampling_results,
+                                          bbox_results['bbox_feats'],
+                                          batch_gt_instances)
+            losses.update(mask_results['loss_mask'])
+
+        # update IoU threshold and SmoothL1 beta
+        update_iter_interval = self.train_cfg.dynamic_rcnn.update_iter_interval
+        if len(self.iou_history) % update_iter_interval == 0:
+            new_iou_thr, new_beta = self.update_hyperparameters()
+
+        return losses
+
+    def bbox_loss(self, x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult]) -> dict:
+        """Perform forward propagation and loss calculation of the bbox head on
+        the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+
+        Returns:
+            dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+                - `loss_bbox` (dict): A dictionary of bbox loss components.
+        """
+        rois = bbox2roi([res.priors for res in sampling_results])
+        bbox_results = self._bbox_forward(x, rois)
+
+        bbox_loss_and_target = self.bbox_head.loss_and_target(
+            cls_score=bbox_results['cls_score'],
+            bbox_pred=bbox_results['bbox_pred'],
+            rois=rois,
+            sampling_results=sampling_results,
+            rcnn_train_cfg=self.train_cfg)
+        bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox'])
+
+        # record the `beta_topk`-th smallest target
+        # `bbox_targets[2]` and `bbox_targets[3]` stand for bbox_targets
+        # and bbox_weights, respectively
+        bbox_targets = bbox_loss_and_target['bbox_targets']
+        pos_inds = bbox_targets[3][:, 0].nonzero().squeeze(1)
+        num_pos = len(pos_inds)
+        num_imgs = len(sampling_results)
+        if num_pos > 0:
+            cur_target = bbox_targets[2][pos_inds, :2].abs().mean(dim=1)
+            beta_topk = min(self.train_cfg.dynamic_rcnn.beta_topk * num_imgs,
+                            num_pos)
+            cur_target = torch.kthvalue(cur_target, beta_topk)[0].item()
+            self.beta_history.append(cur_target)
+
+        return bbox_results
+
+    def update_hyperparameters(self):
+        """Update hyperparameters like IoU thresholds for assigner and beta for
+        SmoothL1 loss based on the training statistics.
+
+        Returns:
+            tuple[float]: the updated ``iou_thr`` and ``beta``.
+        """
+        new_iou_thr = max(self.train_cfg.dynamic_rcnn.initial_iou,
+                          np.mean(self.iou_history))
+        self.iou_history = []
+        self.bbox_assigner.pos_iou_thr = new_iou_thr
+        self.bbox_assigner.neg_iou_thr = new_iou_thr
+        self.bbox_assigner.min_pos_iou = new_iou_thr
+        if (not self.beta_history) or (np.median(self.beta_history) < EPS):
+            # avoid 0 or too small value for new_beta
+            new_beta = self.bbox_head.loss_bbox.beta
+        else:
+            new_beta = min(self.train_cfg.dynamic_rcnn.initial_beta,
+                           np.median(self.beta_history))
+        self.beta_history = []
+        self.bbox_head.loss_bbox.beta = new_beta
+        return new_iou_thr, new_beta
diff --git a/mmde/mmdet/models/roi_heads/grid_roi_head.py b/mmde/mmdet/models/roi_heads/grid_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eda7f01bcd4e44faca14b61ec4956ee2c372ad6
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/grid_roi_head.py
@@ -0,0 +1,280 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList
+from ..task_modules.samplers import SamplingResult
+from ..utils.misc import unpack_gt_instances
+from .standard_roi_head import StandardRoIHead
+
+
+@MODELS.register_module()
+class GridRoIHead(StandardRoIHead):
+    """Implementation of `Grid RoI Head <https://arxiv.org/abs/1811.12030>`_
+
+    Args:
+        grid_roi_extractor (:obj:`ConfigDict` or dict): Config of
+            roi extractor.
+        grid_head (:obj:`ConfigDict` or dict): Config of grid head
+    """
+
+    def __init__(self, grid_roi_extractor: ConfigType, grid_head: ConfigType,
+                 **kwargs) -> None:
+        assert grid_head is not None
+        super().__init__(**kwargs)
+        if grid_roi_extractor is not None:
+            self.grid_roi_extractor = MODELS.build(grid_roi_extractor)
+            self.share_roi_extractor = False
+        else:
+            self.share_roi_extractor = True
+            self.grid_roi_extractor = self.bbox_roi_extractor
+        self.grid_head = MODELS.build(grid_head)
+
+    def _random_jitter(self,
+                       sampling_results: List[SamplingResult],
+                       batch_img_metas: List[dict],
+                       amplitude: float = 0.15) -> List[SamplingResult]:
+        """Ramdom jitter positive proposals for training.
+
+        Args:
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            batch_img_metas (list[dict]): List of image information.
+            amplitude (float): Amplitude of random offset. Defaults to 0.15.
+
+        Returns:
+            list[obj:SamplingResult]: SamplingResults after random jittering.
+        """
+        for sampling_result, img_meta in zip(sampling_results,
+                                             batch_img_metas):
+            bboxes = sampling_result.pos_priors
+            random_offsets = bboxes.new_empty(bboxes.shape[0], 4).uniform_(
+                -amplitude, amplitude)
+            # before jittering
+            cxcy = (bboxes[:, 2:4] + bboxes[:, :2]) / 2
+            wh = (bboxes[:, 2:4] - bboxes[:, :2]).abs()
+            # after jittering
+            new_cxcy = cxcy + wh * random_offsets[:, :2]
+            new_wh = wh * (1 + random_offsets[:, 2:])
+            # xywh to xyxy
+            new_x1y1 = (new_cxcy - new_wh / 2)
+            new_x2y2 = (new_cxcy + new_wh / 2)
+            new_bboxes = torch.cat([new_x1y1, new_x2y2], dim=1)
+            # clip bboxes
+            max_shape = img_meta['img_shape']
+            if max_shape is not None:
+                new_bboxes[:, 0::2].clamp_(min=0, max=max_shape[1] - 1)
+                new_bboxes[:, 1::2].clamp_(min=0, max=max_shape[0] - 1)
+
+            sampling_result.pos_priors = new_bboxes
+        return sampling_results
+
+    # TODO: Forward is incorrect and need to refactor.
+    def forward(self,
+                x: Tuple[Tensor],
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList = None) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            x (Tuple[Tensor]): Multi-level features that may have different
+                resolutions.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+            the meta information of each image and corresponding
+            annotations.
+
+        Returns
+            tuple: A tuple of features from ``bbox_head`` and ``mask_head``
+            forward.
+        """
+        results = ()
+        proposals = [rpn_results.bboxes for rpn_results in rpn_results_list]
+        rois = bbox2roi(proposals)
+        # bbox head
+        if self.with_bbox:
+            bbox_results = self._bbox_forward(x, rois)
+            results = results + (bbox_results['cls_score'], )
+            if self.bbox_head.with_reg:
+                results = results + (bbox_results['bbox_pred'], )
+
+            # grid head
+            grid_rois = rois[:100]
+            grid_feats = self.grid_roi_extractor(
+                x[:len(self.grid_roi_extractor.featmap_strides)], grid_rois)
+            if self.with_shared_head:
+                grid_feats = self.shared_head(grid_feats)
+            self.grid_head.test_mode = True
+            grid_preds = self.grid_head(grid_feats)
+            results = results + (grid_preds, )
+
+        # mask head
+        if self.with_mask:
+            mask_rois = rois[:100]
+            mask_results = self._mask_forward(x, mask_rois)
+            results = results + (mask_results['mask_preds'], )
+        return results
+
+    def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+             batch_data_samples: SampleList, **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        outputs = unpack_gt_instances(batch_data_samples)
+        (batch_gt_instances, batch_gt_instances_ignore,
+         batch_img_metas) = outputs
+
+        # assign gts and sample proposals
+        num_imgs = len(batch_data_samples)
+        sampling_results = []
+        for i in range(num_imgs):
+            # rename rpn_results.bboxes to rpn_results.priors
+            rpn_results = rpn_results_list[i]
+            rpn_results.priors = rpn_results.pop('bboxes')
+
+            assign_result = self.bbox_assigner.assign(
+                rpn_results, batch_gt_instances[i],
+                batch_gt_instances_ignore[i])
+            sampling_result = self.bbox_sampler.sample(
+                assign_result,
+                rpn_results,
+                batch_gt_instances[i],
+                feats=[lvl_feat[i][None] for lvl_feat in x])
+            sampling_results.append(sampling_result)
+
+        losses = dict()
+        # bbox head loss
+        if self.with_bbox:
+            bbox_results = self.bbox_loss(x, sampling_results, batch_img_metas)
+            losses.update(bbox_results['loss_bbox'])
+
+        # mask head forward and loss
+        if self.with_mask:
+            mask_results = self.mask_loss(x, sampling_results,
+                                          bbox_results['bbox_feats'],
+                                          batch_gt_instances)
+            losses.update(mask_results['loss_mask'])
+
+        return losses
+
+    def bbox_loss(self,
+                  x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult],
+                  batch_img_metas: Optional[List[dict]] = None) -> dict:
+        """Perform forward propagation and loss calculation of the bbox head on
+        the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            sampling_results (list[:obj:`SamplingResult`]): Sampling results.
+            batch_img_metas (list[dict], optional): Meta information of each
+                image, e.g., image size, scaling factor, etc.
+
+        Returns:
+            dict[str, Tensor]: Usually returns a dictionary with keys:
+
+            - `cls_score` (Tensor): Classification scores.
+            - `bbox_pred` (Tensor): Box energies / deltas.
+            - `bbox_feats` (Tensor): Extract bbox RoI features.
+            - `loss_bbox` (dict): A dictionary of bbox loss components.
+        """
+        assert batch_img_metas is not None
+        bbox_results = super().bbox_loss(x, sampling_results)
+
+        # Grid head forward and loss
+        sampling_results = self._random_jitter(sampling_results,
+                                               batch_img_metas)
+        pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results])
+
+        # GN in head does not support zero shape input
+        if pos_rois.shape[0] == 0:
+            return bbox_results
+
+        grid_feats = self.grid_roi_extractor(
+            x[:self.grid_roi_extractor.num_inputs], pos_rois)
+        if self.with_shared_head:
+            grid_feats = self.shared_head(grid_feats)
+        # Accelerate training
+        max_sample_num_grid = self.train_cfg.get('max_num_grid', 192)
+        sample_idx = torch.randperm(
+            grid_feats.shape[0])[:min(grid_feats.shape[0], max_sample_num_grid
+                                      )]
+        grid_feats = grid_feats[sample_idx]
+        grid_pred = self.grid_head(grid_feats)
+
+        loss_grid = self.grid_head.loss(grid_pred, sample_idx,
+                                        sampling_results, self.train_cfg)
+
+        bbox_results['loss_bbox'].update(loss_grid)
+        return bbox_results
+
+    def predict_bbox(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     rpn_results_list: InstanceList,
+                     rcnn_test_cfg: ConfigType,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the bbox head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            rcnn_test_cfg (:obj:`ConfigDict`): `test_cfg` of R-CNN.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape \
+            (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4), the last \
+            dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        results_list = super().predict_bbox(
+            x,
+            batch_img_metas=batch_img_metas,
+            rpn_results_list=rpn_results_list,
+            rcnn_test_cfg=rcnn_test_cfg,
+            rescale=False)
+
+        grid_rois = bbox2roi([res.bboxes for res in results_list])
+        if grid_rois.shape[0] != 0:
+            grid_feats = self.grid_roi_extractor(
+                x[:len(self.grid_roi_extractor.featmap_strides)], grid_rois)
+            if self.with_shared_head:
+                grid_feats = self.shared_head(grid_feats)
+            self.grid_head.test_mode = True
+            grid_preds = self.grid_head(grid_feats)
+            results_list = self.grid_head.predict_by_feat(
+                grid_preds=grid_preds,
+                results_list=results_list,
+                batch_img_metas=batch_img_metas,
+                rescale=rescale)
+
+        return results_list
diff --git a/mmde/mmdet/models/roi_heads/htc_roi_head.py b/mmde/mmdet/models/roi_heads/htc_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fdd99ddd5ce4d9d42345d1f1d14ecbcae658124
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/htc_roi_head.py
@@ -0,0 +1,581 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmdet.models.test_time_augs import merge_aug_masks
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import InstanceList, OptConfigType
+from ..layers import adaptive_avg_pool2d
+from ..task_modules.samplers import SamplingResult
+from ..utils import empty_instances, unpack_gt_instances
+from .cascade_roi_head import CascadeRoIHead
+
+
+@MODELS.register_module()
+class HybridTaskCascadeRoIHead(CascadeRoIHead):
+    """Hybrid task cascade roi head including one bbox head and one mask head.
+
+    https://arxiv.org/abs/1901.07518
+
+    Args:
+        num_stages (int): Number of cascade stages.
+        stage_loss_weights (list[float]): Loss weight for every stage.
+        semantic_roi_extractor (:obj:`ConfigDict` or dict, optional):
+            Config of semantic roi extractor. Defaults to None.
+        Semantic_head (:obj:`ConfigDict` or dict, optional):
+            Config of semantic head. Defaults to None.
+        interleaved (bool): Whether to interleaves the box branch and mask
+            branch. If True, the mask branch can take the refined bounding
+            box predictions. Defaults to True.
+        mask_info_flow (bool): Whether to turn on the mask information flow,
+            which means that feeding the mask features of the preceding stage
+            to the current stage. Defaults to True.
+    """
+
+    def __init__(self,
+                 num_stages: int,
+                 stage_loss_weights: List[float],
+                 semantic_roi_extractor: OptConfigType = None,
+                 semantic_head: OptConfigType = None,
+                 semantic_fusion: Tuple[str] = ('bbox', 'mask'),
+                 interleaved: bool = True,
+                 mask_info_flow: bool = True,
+                 **kwargs) -> None:
+        super().__init__(
+            num_stages=num_stages,
+            stage_loss_weights=stage_loss_weights,
+            **kwargs)
+        assert self.with_bbox
+        assert not self.with_shared_head  # shared head is not supported
+
+        if semantic_head is not None:
+            self.semantic_roi_extractor = MODELS.build(semantic_roi_extractor)
+            self.semantic_head = MODELS.build(semantic_head)
+
+        self.semantic_fusion = semantic_fusion
+        self.interleaved = interleaved
+        self.mask_info_flow = mask_info_flow
+
+    # TODO move to base_roi_head later
+    @property
+    def with_semantic(self) -> bool:
+        """bool: whether the head has semantic head"""
+        return hasattr(self,
+                       'semantic_head') and self.semantic_head is not None
+
+    def _bbox_forward(
+            self,
+            stage: int,
+            x: Tuple[Tensor],
+            rois: Tensor,
+            semantic_feat: Optional[Tensor] = None) -> Dict[str, Tensor]:
+        """Box head forward function used in both training and testing.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            semantic_feat (Tensor, optional): Semantic feature. Defaults to
+                None.
+
+        Returns:
+             dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+        """
+        bbox_roi_extractor = self.bbox_roi_extractor[stage]
+        bbox_head = self.bbox_head[stage]
+        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
+                                        rois)
+        if self.with_semantic and 'bbox' in self.semantic_fusion:
+            bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat],
+                                                             rois)
+            if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]:
+                bbox_semantic_feat = adaptive_avg_pool2d(
+                    bbox_semantic_feat, bbox_feats.shape[-2:])
+            bbox_feats += bbox_semantic_feat
+        cls_score, bbox_pred = bbox_head(bbox_feats)
+
+        bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred)
+        return bbox_results
+
+    def bbox_loss(self,
+                  stage: int,
+                  x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult],
+                  semantic_feat: Optional[Tensor] = None) -> dict:
+        """Run forward function and calculate loss for box head in training.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): List of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+            semantic_feat (Tensor, optional): Semantic feature. Defaults to
+                None.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+                - `loss_bbox` (dict): A dictionary of bbox loss components.
+                - `rois` (Tensor): RoIs with the shape (n, 5) where the first
+                  column indicates batch id of each RoI.
+                - `bbox_targets` (tuple):  Ground truth for proposals in a
+                  single image. Containing the following list of Tensors:
+                  (labels, label_weights, bbox_targets, bbox_weights)
+        """
+        bbox_head = self.bbox_head[stage]
+        rois = bbox2roi([res.priors for res in sampling_results])
+        bbox_results = self._bbox_forward(
+            stage, x, rois, semantic_feat=semantic_feat)
+        bbox_results.update(rois=rois)
+
+        bbox_loss_and_target = bbox_head.loss_and_target(
+            cls_score=bbox_results['cls_score'],
+            bbox_pred=bbox_results['bbox_pred'],
+            rois=rois,
+            sampling_results=sampling_results,
+            rcnn_train_cfg=self.train_cfg[stage])
+        bbox_results.update(bbox_loss_and_target)
+        return bbox_results
+
+    def _mask_forward(self,
+                      stage: int,
+                      x: Tuple[Tensor],
+                      rois: Tensor,
+                      semantic_feat: Optional[Tensor] = None,
+                      training: bool = True) -> Dict[str, Tensor]:
+        """Mask head forward function used only in training.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            semantic_feat (Tensor, optional): Semantic feature. Defaults to
+                None.
+            training (bool): Mask Forward is different between training and
+                testing. If True, use the mask forward in training.
+                Defaults to True.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `mask_preds` (Tensor): Mask prediction.
+        """
+        mask_roi_extractor = self.mask_roi_extractor[stage]
+        mask_head = self.mask_head[stage]
+        mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs],
+                                        rois)
+
+        # semantic feature fusion
+        # element-wise sum for original features and pooled semantic features
+        if self.with_semantic and 'mask' in self.semantic_fusion:
+            mask_semantic_feat = self.semantic_roi_extractor([semantic_feat],
+                                                             rois)
+            if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]:
+                mask_semantic_feat = F.adaptive_avg_pool2d(
+                    mask_semantic_feat, mask_feats.shape[-2:])
+            mask_feats = mask_feats + mask_semantic_feat
+
+        # mask information flow
+        # forward all previous mask heads to obtain last_feat, and fuse it
+        # with the normal mask feature
+        if training:
+            if self.mask_info_flow:
+                last_feat = None
+                for i in range(stage):
+                    last_feat = self.mask_head[i](
+                        mask_feats, last_feat, return_logits=False)
+                mask_preds = mask_head(
+                    mask_feats, last_feat, return_feat=False)
+            else:
+                mask_preds = mask_head(mask_feats, return_feat=False)
+
+            mask_results = dict(mask_preds=mask_preds)
+        else:
+            aug_masks = []
+            last_feat = None
+            for i in range(self.num_stages):
+                mask_head = self.mask_head[i]
+                if self.mask_info_flow:
+                    mask_preds, last_feat = mask_head(mask_feats, last_feat)
+                else:
+                    mask_preds = mask_head(mask_feats)
+            aug_masks.append(mask_preds)
+
+            mask_results = dict(mask_preds=aug_masks)
+
+        return mask_results
+
+    def mask_loss(self,
+                  stage: int,
+                  x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult],
+                  batch_gt_instances: InstanceList,
+                  semantic_feat: Optional[Tensor] = None) -> dict:
+        """Run forward function and calculate loss for mask head in training.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            semantic_feat (Tensor, optional): Semantic feature. Defaults to
+                None.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `mask_preds` (Tensor): Mask prediction.
+                - `loss_mask` (dict): A dictionary of mask loss components.
+        """
+        pos_rois = bbox2roi([res.pos_priors for res in sampling_results])
+        mask_results = self._mask_forward(
+            stage=stage,
+            x=x,
+            rois=pos_rois,
+            semantic_feat=semantic_feat,
+            training=True)
+
+        mask_head = self.mask_head[stage]
+        mask_loss_and_target = mask_head.loss_and_target(
+            mask_preds=mask_results['mask_preds'],
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            rcnn_train_cfg=self.train_cfg[stage])
+        mask_results.update(mask_loss_and_target)
+
+        return mask_results
+
+    def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+             batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \
+            = outputs
+
+        # semantic segmentation part
+        # 2 outputs: segmentation prediction and embedded features
+        losses = dict()
+        if self.with_semantic:
+            gt_semantic_segs = [
+                data_sample.gt_sem_seg.sem_seg
+                for data_sample in batch_data_samples
+            ]
+            gt_semantic_segs = torch.stack(gt_semantic_segs)
+            semantic_pred, semantic_feat = self.semantic_head(x)
+            loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_segs)
+            losses['loss_semantic_seg'] = loss_seg
+        else:
+            semantic_feat = None
+
+        results_list = rpn_results_list
+        num_imgs = len(batch_img_metas)
+        for stage in range(self.num_stages):
+            self.current_stage = stage
+
+            stage_loss_weight = self.stage_loss_weights[stage]
+
+            # assign gts and sample proposals
+            sampling_results = []
+            bbox_assigner = self.bbox_assigner[stage]
+            bbox_sampler = self.bbox_sampler[stage]
+            for i in range(num_imgs):
+                results = results_list[i]
+                # rename rpn_results.bboxes to rpn_results.priors
+                if 'bboxes' in results:
+                    results.priors = results.pop('bboxes')
+
+                assign_result = bbox_assigner.assign(
+                    results, batch_gt_instances[i],
+                    batch_gt_instances_ignore[i])
+                sampling_result = bbox_sampler.sample(
+                    assign_result,
+                    results,
+                    batch_gt_instances[i],
+                    feats=[lvl_feat[i][None] for lvl_feat in x])
+                sampling_results.append(sampling_result)
+
+            # bbox head forward and loss
+            bbox_results = self.bbox_loss(
+                stage=stage,
+                x=x,
+                sampling_results=sampling_results,
+                semantic_feat=semantic_feat)
+
+            for name, value in bbox_results['loss_bbox'].items():
+                losses[f's{stage}.{name}'] = (
+                    value * stage_loss_weight if 'loss' in name else value)
+
+            # mask head forward and loss
+            if self.with_mask:
+                # interleaved execution: use regressed bboxes by the box branch
+                # to train the mask branch
+                if self.interleaved:
+                    bbox_head = self.bbox_head[stage]
+                    with torch.no_grad():
+                        results_list = bbox_head.refine_bboxes(
+                            sampling_results, bbox_results, batch_img_metas)
+                        # re-assign and sample 512 RoIs from 512 RoIs
+                        sampling_results = []
+                        for i in range(num_imgs):
+                            results = results_list[i]
+                            # rename rpn_results.bboxes to rpn_results.priors
+                            results.priors = results.pop('bboxes')
+                            assign_result = bbox_assigner.assign(
+                                results, batch_gt_instances[i],
+                                batch_gt_instances_ignore[i])
+                            sampling_result = bbox_sampler.sample(
+                                assign_result,
+                                results,
+                                batch_gt_instances[i],
+                                feats=[lvl_feat[i][None] for lvl_feat in x])
+                            sampling_results.append(sampling_result)
+                mask_results = self.mask_loss(
+                    stage=stage,
+                    x=x,
+                    sampling_results=sampling_results,
+                    batch_gt_instances=batch_gt_instances,
+                    semantic_feat=semantic_feat)
+                for name, value in mask_results['loss_mask'].items():
+                    losses[f's{stage}.{name}'] = (
+                        value * stage_loss_weight if 'loss' in name else value)
+
+            # refine bboxes (same as Cascade R-CNN)
+            if stage < self.num_stages - 1 and not self.interleaved:
+                bbox_head = self.bbox_head[stage]
+                with torch.no_grad():
+                    results_list = bbox_head.refine_bboxes(
+                        sampling_results=sampling_results,
+                        bbox_results=bbox_results,
+                        batch_img_metas=batch_img_metas)
+
+        return losses
+
+    def predict(self,
+                x: Tuple[Tensor],
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from upstream network. Each
+                has shape (N, C, H, W).
+            rpn_results_list (list[:obj:`InstanceData`]): list of region
+                proposals.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results to
+                the original image. Defaults to False.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        else:
+            semantic_feat = None
+
+        # TODO: nms_op in mmcv need be enhanced, the bbox result may get
+        #  difference when not rescale in bbox_head
+
+        # If it has the mask branch, the bbox branch does not need
+        # to be scaled to the original image scale, because the mask
+        # branch will scale both bbox and mask at the same time.
+        bbox_rescale = rescale if not self.with_mask else False
+        results_list = self.predict_bbox(
+            x=x,
+            semantic_feat=semantic_feat,
+            batch_img_metas=batch_img_metas,
+            rpn_results_list=rpn_results_list,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=bbox_rescale)
+
+        if self.with_mask:
+            results_list = self.predict_mask(
+                x=x,
+                semantic_heat=semantic_feat,
+                batch_img_metas=batch_img_metas,
+                results_list=results_list,
+                rescale=rescale)
+
+        return results_list
+
+    def predict_mask(self,
+                     x: Tuple[Tensor],
+                     semantic_heat: Tensor,
+                     batch_img_metas: List[dict],
+                     results_list: InstanceList,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            semantic_feat (Tensor): Semantic feature.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        num_imgs = len(batch_img_metas)
+        bboxes = [res.bboxes for res in results_list]
+        mask_rois = bbox2roi(bboxes)
+        if mask_rois.shape[0] == 0:
+            results_list = empty_instances(
+                batch_img_metas=batch_img_metas,
+                device=mask_rois.device,
+                task_type='mask',
+                instance_results=results_list,
+                mask_thr_binary=self.test_cfg.mask_thr_binary)
+            return results_list
+
+        num_mask_rois_per_img = [len(res) for res in results_list]
+        mask_results = self._mask_forward(
+            stage=-1,
+            x=x,
+            rois=mask_rois,
+            semantic_feat=semantic_heat,
+            training=False)
+        # split batch mask prediction back to each image
+        aug_masks = [[
+            mask.sigmoid().detach()
+            for mask in mask_preds.split(num_mask_rois_per_img, 0)
+        ] for mask_preds in mask_results['mask_preds']]
+
+        merged_masks = []
+        for i in range(num_imgs):
+            aug_mask = [mask[i] for mask in aug_masks]
+            merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i])
+            merged_masks.append(merged_mask)
+
+        results_list = self.mask_head[-1].predict_by_feat(
+            mask_preds=merged_masks,
+            results_list=results_list,
+            batch_img_metas=batch_img_metas,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=rescale,
+            activate_map=True)
+
+        return results_list
+
+    def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+                batch_data_samples: SampleList) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            x (List[Tensor]): Multi-level features that may have different
+                resolutions.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+
+        Returns
+            tuple: A tuple of features from ``bbox_head`` and ``mask_head``
+            forward.
+        """
+        results = ()
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        num_imgs = len(batch_img_metas)
+
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        else:
+            semantic_feat = None
+
+        proposals = [rpn_results.bboxes for rpn_results in rpn_results_list]
+        num_proposals_per_img = tuple(len(p) for p in proposals)
+        rois = bbox2roi(proposals)
+        # bbox head
+        if self.with_bbox:
+            rois, cls_scores, bbox_preds = self._refine_roi(
+                x=x,
+                rois=rois,
+                semantic_feat=semantic_feat,
+                batch_img_metas=batch_img_metas,
+                num_proposals_per_img=num_proposals_per_img)
+            results = results + (cls_scores, bbox_preds)
+        # mask head
+        if self.with_mask:
+            rois = torch.cat(rois)
+            mask_results = self._mask_forward(
+                stage=-1,
+                x=x,
+                rois=rois,
+                semantic_feat=semantic_feat,
+                training=False)
+            aug_masks = [[
+                mask.sigmoid().detach()
+                for mask in mask_preds.split(num_proposals_per_img, 0)
+            ] for mask_preds in mask_results['mask_preds']]
+
+            merged_masks = []
+            for i in range(num_imgs):
+                aug_mask = [mask[i] for mask in aug_masks]
+                merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i])
+                merged_masks.append(merged_mask)
+            results = results + (merged_masks, )
+        return results
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/__init__.py b/mmde/mmdet/models/roi_heads/mask_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..48a5d4227be41b8985403251e1803f78cf500636
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .coarse_mask_head import CoarseMaskHead
+from .dynamic_mask_head import DynamicMaskHead
+from .fcn_mask_head import FCNMaskHead
+from .feature_relay_head import FeatureRelayHead
+from .fused_semantic_head import FusedSemanticHead
+from .global_context_head import GlobalContextHead
+from .grid_head import GridHead
+from .htc_mask_head import HTCMaskHead
+from .mask_point_head import MaskPointHead
+from .maskiou_head import MaskIoUHead
+from .scnet_mask_head import SCNetMaskHead
+from .scnet_semantic_head import SCNetSemanticHead
+
+__all__ = [
+    'FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead', 'GridHead',
+    'MaskIoUHead', 'CoarseMaskHead', 'MaskPointHead', 'SCNetMaskHead',
+    'SCNetSemanticHead', 'GlobalContextHead', 'FeatureRelayHead',
+    'DynamicMaskHead'
+]
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py b/mmde/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1caa901228f2439492b82d1890eba468963eb28d
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn import ConvModule, Linear
+from mmengine.model import ModuleList
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import MultiConfig
+from .fcn_mask_head import FCNMaskHead
+
+
+@MODELS.register_module()
+class CoarseMaskHead(FCNMaskHead):
+    """Coarse mask head used in PointRend.
+
+    Compared with standard ``FCNMaskHead``, ``CoarseMaskHead`` will downsample
+    the input feature map instead of upsample it.
+
+    Args:
+        num_convs (int): Number of conv layers in the head. Defaults to 0.
+        num_fcs (int): Number of fc layers in the head. Defaults to 2.
+        fc_out_channels (int): Number of output channels of fc layer.
+            Defaults to 1024.
+        downsample_factor (int): The factor that feature map is downsampled by.
+            Defaults to 2.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_convs: int = 0,
+                 num_fcs: int = 2,
+                 fc_out_channels: int = 1024,
+                 downsample_factor: int = 2,
+                 init_cfg: MultiConfig = dict(
+                     type='Xavier',
+                     override=[
+                         dict(name='fcs'),
+                         dict(type='Constant', val=0.001, name='fc_logits')
+                     ]),
+                 *arg,
+                 **kwarg) -> None:
+        super().__init__(
+            *arg,
+            num_convs=num_convs,
+            upsample_cfg=dict(type=None),
+            init_cfg=None,
+            **kwarg)
+        self.init_cfg = init_cfg
+        self.num_fcs = num_fcs
+        assert self.num_fcs > 0
+        self.fc_out_channels = fc_out_channels
+        self.downsample_factor = downsample_factor
+        assert self.downsample_factor >= 1
+        # remove conv_logit
+        delattr(self, 'conv_logits')
+
+        if downsample_factor > 1:
+            downsample_in_channels = (
+                self.conv_out_channels
+                if self.num_convs > 0 else self.in_channels)
+            self.downsample_conv = ConvModule(
+                downsample_in_channels,
+                self.conv_out_channels,
+                kernel_size=downsample_factor,
+                stride=downsample_factor,
+                padding=0,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+        else:
+            self.downsample_conv = None
+
+        self.output_size = (self.roi_feat_size[0] // downsample_factor,
+                            self.roi_feat_size[1] // downsample_factor)
+        self.output_area = self.output_size[0] * self.output_size[1]
+
+        last_layer_dim = self.conv_out_channels * self.output_area
+
+        self.fcs = ModuleList()
+        for i in range(num_fcs):
+            fc_in_channels = (
+                last_layer_dim if i == 0 else self.fc_out_channels)
+            self.fcs.append(Linear(fc_in_channels, self.fc_out_channels))
+        last_layer_dim = self.fc_out_channels
+        output_channels = self.num_classes * self.output_area
+        self.fc_logits = Linear(last_layer_dim, output_channels)
+
+    def init_weights(self) -> None:
+        """Initialize weights."""
+        super(FCNMaskHead, self).init_weights()
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward features from the upstream network.
+
+        Args:
+            x (Tensor): Extract mask RoI features.
+
+        Returns:
+            Tensor: Predicted foreground masks.
+        """
+        for conv in self.convs:
+            x = conv(x)
+
+        if self.downsample_conv is not None:
+            x = self.downsample_conv(x)
+
+        x = x.flatten(1)
+        for fc in self.fcs:
+            x = self.relu(fc(x))
+        mask_preds = self.fc_logits(x).view(
+            x.size(0), self.num_classes, *self.output_size)
+        return mask_preds
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py b/mmde/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f33612b1b141668d0463435975c14a26fbe5a0cd
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py
@@ -0,0 +1,166 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn as nn
+from mmengine.config import ConfigDict
+from torch import Tensor
+
+from mmdet.models.task_modules import SamplingResult
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, OptConfigType, reduce_mean
+from .fcn_mask_head import FCNMaskHead
+
+
+@MODELS.register_module()
+class DynamicMaskHead(FCNMaskHead):
+    r"""Dynamic Mask Head for
+    `Instances as Queries <http://arxiv.org/abs/2105.01928>`_
+
+    Args:
+        num_convs (int): Number of convolution layer.
+            Defaults to 4.
+        roi_feat_size (int): The output size of RoI extractor,
+            Defaults to 14.
+        in_channels (int): Input feature channels.
+            Defaults to 256.
+        conv_kernel_size (int): Kernel size of convolution layers.
+            Defaults to 3.
+        conv_out_channels (int): Output channels of convolution layers.
+            Defaults to 256.
+        num_classes (int): Number of classes.
+            Defaults to 80
+        class_agnostic (int): Whether generate class agnostic prediction.
+            Defaults to False.
+        dropout (float): Probability of drop the channel.
+            Defaults to 0.0
+        upsample_cfg (:obj:`ConfigDict` or dict): The config for
+            upsample layer.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): The convolution
+            layer config.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): The norm layer config.
+        dynamic_conv_cfg (:obj:`ConfigDict` or dict): The dynamic convolution
+            layer config.
+        loss_mask (:obj:`ConfigDict` or dict): The config for mask loss.
+    """
+
+    def __init__(self,
+                 num_convs: int = 4,
+                 roi_feat_size: int = 14,
+                 in_channels: int = 256,
+                 conv_kernel_size: int = 3,
+                 conv_out_channels: int = 256,
+                 num_classes: int = 80,
+                 class_agnostic: bool = False,
+                 upsample_cfg: ConfigType = dict(
+                     type='deconv', scale_factor=2),
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None,
+                 dynamic_conv_cfg: ConfigType = dict(
+                     type='DynamicConv',
+                     in_channels=256,
+                     feat_channels=64,
+                     out_channels=256,
+                     input_feat_shape=14,
+                     with_proj=False,
+                     act_cfg=dict(type='ReLU', inplace=True),
+                     norm_cfg=dict(type='LN')),
+                 loss_mask: ConfigType = dict(
+                     type='DiceLoss', loss_weight=8.0),
+                 **kwargs) -> None:
+        super().__init__(
+            num_convs=num_convs,
+            roi_feat_size=roi_feat_size,
+            in_channels=in_channels,
+            conv_kernel_size=conv_kernel_size,
+            conv_out_channels=conv_out_channels,
+            num_classes=num_classes,
+            class_agnostic=class_agnostic,
+            upsample_cfg=upsample_cfg,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            loss_mask=loss_mask,
+            **kwargs)
+        assert class_agnostic is False, \
+            'DynamicMaskHead only support class_agnostic=False'
+        self.fp16_enabled = False
+
+        self.instance_interactive_conv = MODELS.build(dynamic_conv_cfg)
+
+    def init_weights(self) -> None:
+        """Use xavier initialization for all weight parameter and set
+        classification head bias as a specific value when use focal loss."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+            nn.init.constant_(self.conv_logits.bias, 0.)
+
+    def forward(self, roi_feat: Tensor, proposal_feat: Tensor) -> Tensor:
+        """Forward function of DynamicMaskHead.
+
+        Args:
+            roi_feat (Tensor): Roi-pooling features with shape
+                (batch_size*num_proposals, feature_dimensions,
+                pooling_h , pooling_w).
+            proposal_feat (Tensor): Intermediate feature get from
+                diihead in last stage, has shape
+                (batch_size*num_proposals, feature_dimensions)
+
+          Returns:
+            mask_preds (Tensor): Predicted foreground masks with shape
+            (batch_size*num_proposals, num_classes, pooling_h*2, pooling_w*2).
+        """
+
+        proposal_feat = proposal_feat.reshape(-1, self.in_channels)
+        proposal_feat_iic = self.instance_interactive_conv(
+            proposal_feat, roi_feat)
+
+        x = proposal_feat_iic.permute(0, 2, 1).reshape(roi_feat.size())
+
+        for conv in self.convs:
+            x = conv(x)
+        if self.upsample is not None:
+            x = self.upsample(x)
+            if self.upsample_method == 'deconv':
+                x = self.relu(x)
+        mask_preds = self.conv_logits(x)
+        return mask_preds
+
+    def loss_and_target(self, mask_preds: Tensor,
+                        sampling_results: List[SamplingResult],
+                        batch_gt_instances: InstanceList,
+                        rcnn_train_cfg: ConfigDict) -> dict:
+        """Calculate the loss based on the features extracted by the mask head.
+
+        Args:
+            mask_preds (Tensor): Predicted foreground masks, has shape
+                (num_pos, num_classes, h, w).
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+
+        Returns:
+            dict: A dictionary of loss and targets components.
+        """
+        mask_targets = self.get_targets(
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            rcnn_train_cfg=rcnn_train_cfg)
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+
+        num_pos = pos_labels.new_ones(pos_labels.size()).float().sum()
+        avg_factor = torch.clamp(reduce_mean(num_pos), min=1.).item()
+        loss = dict()
+        if mask_preds.size(0) == 0:
+            loss_mask = mask_preds.sum()
+        else:
+            loss_mask = self.loss_mask(
+                mask_preds[torch.arange(num_pos).long(), pos_labels,
+                           ...].sigmoid(),
+                mask_targets,
+                avg_factor=avg_factor)
+        loss['loss_mask'] = loss_mask
+        return dict(loss_mask=loss, mask_targets=mask_targets)
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py b/mmde/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a089dfafcb69784f2fc266f0945e6d56b0466d3
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py
@@ -0,0 +1,474 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, build_conv_layer, build_upsample_layer
+from mmcv.ops.carafe import CARAFEPack
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, ModuleList
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.models.utils import empty_instances
+from mmdet.registry import MODELS
+from mmdet.structures.mask import mask_target
+from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig
+
+BYTES_PER_FLOAT = 4
+# TODO: This memory limit may be too much or too little. It would be better to
+#  determine it based on available resources.
+GPU_MEM_LIMIT = 1024**3  # 1 GB memory limit
+
+
+@MODELS.register_module()
+class FCNMaskHead(BaseModule):
+
+    def __init__(self,
+                 num_convs: int = 4,
+                 roi_feat_size: int = 14,
+                 in_channels: int = 256,
+                 conv_kernel_size: int = 3,
+                 conv_out_channels: int = 256,
+                 num_classes: int = 80,
+                 class_agnostic: int = False,
+                 upsample_cfg: ConfigType = dict(
+                     type='deconv', scale_factor=2),
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None,
+                 predictor_cfg: ConfigType = dict(type='Conv'),
+                 loss_mask: ConfigType = dict(
+                     type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
+                 init_cfg: OptMultiConfig = None) -> None:
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+                                 'behavior, init_cfg is not allowed to be set'
+        super().__init__(init_cfg=init_cfg)
+        self.upsample_cfg = upsample_cfg.copy()
+        if self.upsample_cfg['type'] not in [
+                None, 'deconv', 'nearest', 'bilinear', 'carafe'
+        ]:
+            raise ValueError(
+                f'Invalid upsample method {self.upsample_cfg["type"]}, '
+                'accepted methods are "deconv", "nearest", "bilinear", '
+                '"carafe"')
+        self.num_convs = num_convs
+        # WARN: roi_feat_size is reserved and not used
+        self.roi_feat_size = _pair(roi_feat_size)
+        self.in_channels = in_channels
+        self.conv_kernel_size = conv_kernel_size
+        self.conv_out_channels = conv_out_channels
+        self.upsample_method = self.upsample_cfg.get('type')
+        self.scale_factor = self.upsample_cfg.pop('scale_factor', None)
+        self.num_classes = num_classes
+        self.class_agnostic = class_agnostic
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.predictor_cfg = predictor_cfg
+        self.loss_mask = MODELS.build(loss_mask)
+
+        self.convs = ModuleList()
+        for i in range(self.num_convs):
+            in_channels = (
+                self.in_channels if i == 0 else self.conv_out_channels)
+            padding = (self.conv_kernel_size - 1) // 2
+            self.convs.append(
+                ConvModule(
+                    in_channels,
+                    self.conv_out_channels,
+                    self.conv_kernel_size,
+                    padding=padding,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg))
+        upsample_in_channels = (
+            self.conv_out_channels if self.num_convs > 0 else in_channels)
+        upsample_cfg_ = self.upsample_cfg.copy()
+        if self.upsample_method is None:
+            self.upsample = None
+        elif self.upsample_method == 'deconv':
+            upsample_cfg_.update(
+                in_channels=upsample_in_channels,
+                out_channels=self.conv_out_channels,
+                kernel_size=self.scale_factor,
+                stride=self.scale_factor)
+            self.upsample = build_upsample_layer(upsample_cfg_)
+        elif self.upsample_method == 'carafe':
+            upsample_cfg_.update(
+                channels=upsample_in_channels, scale_factor=self.scale_factor)
+            self.upsample = build_upsample_layer(upsample_cfg_)
+        else:
+            # suppress warnings
+            align_corners = (None
+                             if self.upsample_method == 'nearest' else False)
+            upsample_cfg_.update(
+                scale_factor=self.scale_factor,
+                mode=self.upsample_method,
+                align_corners=align_corners)
+            self.upsample = build_upsample_layer(upsample_cfg_)
+
+        out_channels = 1 if self.class_agnostic else self.num_classes
+        logits_in_channel = (
+            self.conv_out_channels
+            if self.upsample_method == 'deconv' else upsample_in_channels)
+        self.conv_logits = build_conv_layer(self.predictor_cfg,
+                                            logits_in_channel, out_channels, 1)
+        self.relu = nn.ReLU(inplace=True)
+        self.debug_imgs = None
+
+    def init_weights(self) -> None:
+        """Initialize the weights."""
+        super().init_weights()
+        for m in [self.upsample, self.conv_logits]:
+            if m is None:
+                continue
+            elif isinstance(m, CARAFEPack):
+                m.init_weights()
+            elif hasattr(m, 'weight') and hasattr(m, 'bias'):
+                nn.init.kaiming_normal_(
+                    m.weight, mode='fan_out', nonlinearity='relu')
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward features from the upstream network.
+
+        Args:
+            x (Tensor): Extract mask RoI features.
+
+        Returns:
+            Tensor: Predicted foreground masks.
+        """
+        for conv in self.convs:
+            x = conv(x)
+        if self.upsample is not None:
+            x = self.upsample(x)
+            if self.upsample_method == 'deconv':
+                x = self.relu(x)
+        mask_preds = self.conv_logits(x)
+        return mask_preds
+
+    def get_targets(self, sampling_results: List[SamplingResult],
+                    batch_gt_instances: InstanceList,
+                    rcnn_train_cfg: ConfigDict) -> Tensor:
+        """Calculate the ground truth for all samples in a batch according to
+        the sampling_results.
+
+        Args:
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+
+        Returns:
+            Tensor: Mask target of each positive proposals in the image.
+        """
+        pos_proposals = [res.pos_priors for res in sampling_results]
+        pos_assigned_gt_inds = [
+            res.pos_assigned_gt_inds for res in sampling_results
+        ]
+        gt_masks = [res.masks for res in batch_gt_instances]
+        mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds,
+                                   gt_masks, rcnn_train_cfg)
+        return mask_targets
+
+    def loss_and_target(self, mask_preds: Tensor,
+                        sampling_results: List[SamplingResult],
+                        batch_gt_instances: InstanceList,
+                        rcnn_train_cfg: ConfigDict) -> dict:
+        """Calculate the loss based on the features extracted by the mask head.
+
+        Args:
+            mask_preds (Tensor): Predicted foreground masks, has shape
+                (num_pos, num_classes, h, w).
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+
+        Returns:
+            dict: A dictionary of loss and targets components.
+        """
+        mask_targets = self.get_targets(
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            rcnn_train_cfg=rcnn_train_cfg)
+
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+
+        loss = dict()
+        if mask_preds.size(0) == 0:
+            loss_mask = mask_preds.sum()
+        else:
+            if self.class_agnostic:
+                loss_mask = self.loss_mask(mask_preds, mask_targets,
+                                           torch.zeros_like(pos_labels))
+            else:
+                loss_mask = self.loss_mask(mask_preds, mask_targets,
+                                           pos_labels)
+        loss['loss_mask'] = loss_mask
+        # TODO: which algorithm requires mask_targets?
+        return dict(loss_mask=loss, mask_targets=mask_targets)
+
+    def predict_by_feat(self,
+                        mask_preds: Tuple[Tensor],
+                        results_list: List[InstanceData],
+                        batch_img_metas: List[dict],
+                        rcnn_test_cfg: ConfigDict,
+                        rescale: bool = False,
+                        activate_map: bool = False) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        mask results.
+
+        Args:
+            mask_preds (tuple[Tensor]): Tuple of predicted foreground masks,
+                each has shape (n, num_classes, h, w).
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            batch_img_metas (list[dict]): List of image information.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            activate_map (book): Whether get results with augmentations test.
+                If True, the `mask_preds` will not process with sigmoid.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        assert len(mask_preds) == len(results_list) == len(batch_img_metas)
+
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            results = results_list[img_id]
+            bboxes = results.bboxes
+            if bboxes.shape[0] == 0:
+                results_list[img_id] = empty_instances(
+                    [img_meta],
+                    bboxes.device,
+                    task_type='mask',
+                    instance_results=[results],
+                    mask_thr_binary=rcnn_test_cfg.mask_thr_binary)[0]
+            else:
+                im_mask = self._predict_by_feat_single(
+                    mask_preds=mask_preds[img_id],
+                    bboxes=bboxes,
+                    labels=results.labels,
+                    img_meta=img_meta,
+                    rcnn_test_cfg=rcnn_test_cfg,
+                    rescale=rescale,
+                    activate_map=activate_map)
+                results.masks = im_mask
+        return results_list
+
+    def _predict_by_feat_single(self,
+                                mask_preds: Tensor,
+                                bboxes: Tensor,
+                                labels: Tensor,
+                                img_meta: dict,
+                                rcnn_test_cfg: ConfigDict,
+                                rescale: bool = False,
+                                activate_map: bool = False) -> Tensor:
+        """Get segmentation masks from mask_preds and bboxes.
+
+        Args:
+            mask_preds (Tensor): Predicted foreground masks, has shape
+                (n, num_classes, h, w).
+            bboxes (Tensor): Predicted bboxes, has shape (n, 4)
+            labels (Tensor): Labels of bboxes, has shape (n, )
+            img_meta (dict): image information.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            activate_map (book): Whether get results with augmentations test.
+                If True, the `mask_preds` will not process with sigmoid.
+                Defaults to False.
+
+        Returns:
+            Tensor: Encoded masks, has shape (n, img_w, img_h)
+
+        Example:
+            >>> from mmengine.config import Config
+            >>> from mmdet.models.roi_heads.mask_heads.fcn_mask_head import *  # NOQA
+            >>> N = 7  # N = number of extracted ROIs
+            >>> C, H, W = 11, 32, 32
+            >>> # Create example instance of FCN Mask Head.
+            >>> self = FCNMaskHead(num_classes=C, num_convs=0)
+            >>> inputs = torch.rand(N, self.in_channels, H, W)
+            >>> mask_preds = self.forward(inputs)
+            >>> # Each input is associated with some bounding box
+            >>> bboxes = torch.Tensor([[1, 1, 42, 42 ]] * N)
+            >>> labels = torch.randint(0, C, size=(N,))
+            >>> rcnn_test_cfg = Config({'mask_thr_binary': 0, })
+            >>> ori_shape = (H * 4, W * 4)
+            >>> scale_factor = (1, 1)
+            >>> rescale = False
+            >>> img_meta = {'scale_factor': scale_factor,
+            ...             'ori_shape': ori_shape}
+            >>> # Encoded masks are a list for each category.
+            >>> encoded_masks = self._get_seg_masks_single(
+            ...     mask_preds, bboxes, labels,
+            ...     img_meta, rcnn_test_cfg, rescale)
+            >>> assert encoded_masks.size()[0] == N
+            >>> assert encoded_masks.size()[1:] == ori_shape
+        """
+        scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat(
+            (1, 2))
+        img_h, img_w = img_meta['ori_shape'][:2]
+        device = bboxes.device
+
+        if not activate_map:
+            mask_preds = mask_preds.sigmoid()
+        else:
+            # In AugTest, has been activated before
+            mask_preds = bboxes.new_tensor(mask_preds)
+
+        if rescale:  # in-placed rescale the bboxes
+            bboxes /= scale_factor
+        else:
+            w_scale, h_scale = scale_factor[0, 0], scale_factor[0, 1]
+            img_h = np.round(img_h * h_scale.item()).astype(np.int32)
+            img_w = np.round(img_w * w_scale.item()).astype(np.int32)
+
+        N = len(mask_preds)
+        # The actual implementation split the input into chunks,
+        # and paste them chunk by chunk.
+        if device.type == 'cpu':
+            # CPU is most efficient when they are pasted one by one with
+            # skip_empty=True, so that it performs minimal number of
+            # operations.
+            num_chunks = N
+        else:
+            # GPU benefits from parallelism for larger chunks,
+            # but may have memory issue
+            # the types of img_w and img_h are np.int32,
+            # when the image resolution is large,
+            # the calculation of num_chunks will overflow.
+            # so we need to change the types of img_w and img_h to int.
+            # See https://github.com/open-mmlab/mmdetection/pull/5191
+            num_chunks = int(
+                np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT /
+                        GPU_MEM_LIMIT))
+            assert (num_chunks <=
+                    N), 'Default GPU_MEM_LIMIT is too small; try increasing it'
+        chunks = torch.chunk(torch.arange(N, device=device), num_chunks)
+
+        threshold = rcnn_test_cfg.mask_thr_binary
+        im_mask = torch.zeros(
+            N,
+            img_h,
+            img_w,
+            device=device,
+            dtype=torch.bool if threshold >= 0 else torch.uint8)
+
+        if not self.class_agnostic:
+            mask_preds = mask_preds[range(N), labels][:, None]
+
+        for inds in chunks:
+            masks_chunk, spatial_inds = _do_paste_mask(
+                mask_preds[inds],
+                bboxes[inds],
+                img_h,
+                img_w,
+                skip_empty=device.type == 'cpu')
+
+            if threshold >= 0:
+                masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
+            else:
+                # for visualization and debugging
+                masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
+
+            im_mask[(inds, ) + spatial_inds] = masks_chunk
+        return im_mask
+
+
+def _do_paste_mask(masks: Tensor,
+                   boxes: Tensor,
+                   img_h: int,
+                   img_w: int,
+                   skip_empty: bool = True) -> tuple:
+    """Paste instance masks according to boxes.
+
+    This implementation is modified from
+    https://github.com/facebookresearch/detectron2/
+
+    Args:
+        masks (Tensor): N, 1, H, W
+        boxes (Tensor): N, 4
+        img_h (int): Height of the image to be pasted.
+        img_w (int): Width of the image to be pasted.
+        skip_empty (bool): Only paste masks within the region that
+            tightly bound all boxes, and returns the results this region only.
+            An important optimization for CPU.
+
+    Returns:
+        tuple: (Tensor, tuple). The first item is mask tensor, the second one
+        is the slice object.
+
+            If skip_empty == False, the whole image will be pasted. It will
+            return a mask of shape (N, img_h, img_w) and an empty tuple.
+
+            If skip_empty == True, only area around the mask will be pasted.
+            A mask of shape (N, h', w') and its start and end coordinates
+            in the original image will be returned.
+    """
+    # On GPU, paste all masks together (up to chunk size)
+    # by using the entire image to sample the masks
+    # Compared to pasting them one by one,
+    # this has more operations but is faster on COCO-scale dataset.
+    device = masks.device
+    if skip_empty:
+        x0_int, y0_int = torch.clamp(
+            boxes.min(dim=0).values.floor()[:2] - 1,
+            min=0).to(dtype=torch.int32)
+        x1_int = torch.clamp(
+            boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
+        y1_int = torch.clamp(
+            boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
+    else:
+        x0_int, y0_int = 0, 0
+        x1_int, y1_int = img_w, img_h
+    x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1
+
+    N = masks.shape[0]
+
+    img_y = torch.arange(y0_int, y1_int, device=device).to(torch.float32) + 0.5
+    img_x = torch.arange(x0_int, x1_int, device=device).to(torch.float32) + 0.5
+    img_y = (img_y - y0) / (y1 - y0) * 2 - 1
+    img_x = (img_x - x0) / (x1 - x0) * 2 - 1
+    # img_x, img_y have shapes (N, w), (N, h)
+    # IsInf op is not supported with ONNX<=1.7.0
+    if not torch.onnx.is_in_onnx_export():
+        if torch.isinf(img_x).any():
+            inds = torch.where(torch.isinf(img_x))
+            img_x[inds] = 0
+        if torch.isinf(img_y).any():
+            inds = torch.where(torch.isinf(img_y))
+            img_y[inds] = 0
+
+    gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
+    gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
+    grid = torch.stack([gx, gy], dim=3)
+
+    img_masks = F.grid_sample(
+        masks.to(dtype=torch.float32), grid, align_corners=False)
+
+    if skip_empty:
+        return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
+    else:
+        return img_masks[:, 0], ()
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/feature_relay_head.py b/mmde/mmdet/models/roi_heads/mask_heads/feature_relay_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c34561fa5fd749329eda164465ce9787278d357
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/feature_relay_head.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch.nn as nn
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import MultiConfig
+
+
+@MODELS.register_module()
+class FeatureRelayHead(BaseModule):
+    """Feature Relay Head used in `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        in_channels (int): number of input channels. Defaults to 256.
+        conv_out_channels (int): number of output channels before
+            classification layer. Defaults to 256.
+        roi_feat_size (int): roi feat size at box head. Default: 7.
+        scale_factor (int): scale factor to match roi feat size
+            at mask head. Defaults to 2.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`]): Initialization config dict. Defaults to
+            dict(type='Kaiming', layer='Linear').
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 1024,
+        out_conv_channels: int = 256,
+        roi_feat_size: int = 7,
+        scale_factor: int = 2,
+        init_cfg: MultiConfig = dict(type='Kaiming', layer='Linear')
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert isinstance(roi_feat_size, int)
+
+        self.in_channels = in_channels
+        self.out_conv_channels = out_conv_channels
+        self.roi_feat_size = roi_feat_size
+        self.out_channels = (roi_feat_size**2) * out_conv_channels
+        self.scale_factor = scale_factor
+        self.fp16_enabled = False
+
+        self.fc = nn.Linear(self.in_channels, self.out_channels)
+        self.upsample = nn.Upsample(
+            scale_factor=scale_factor, mode='bilinear', align_corners=True)
+
+    def forward(self, x: Tensor) -> Optional[Tensor]:
+        """Forward function.
+
+        Args:
+            x (Tensor): Input feature.
+
+        Returns:
+            Optional[Tensor]: Output feature. When the first dim of input is
+            0, None is returned.
+        """
+        N, _ = x.shape
+        if N > 0:
+            out_C = self.out_conv_channels
+            out_HW = self.roi_feat_size
+            x = self.fc(x)
+            x = x.reshape(N, out_C, out_HW, out_HW)
+            x = self.upsample(x)
+            return x
+        return None
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py b/mmde/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20beb2975a563f03e7b6b2afcef287cb41af05a
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py
@@ -0,0 +1,144 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Tuple
+
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import MultiConfig, OptConfigType
+
+
+@MODELS.register_module()
+class FusedSemanticHead(BaseModule):
+    r"""Multi-level fused semantic segmentation head.
+
+    .. code-block:: none
+
+        in_1 -> 1x1 conv ---
+                            |
+        in_2 -> 1x1 conv -- |
+                           ||
+        in_3 -> 1x1 conv - ||
+                          |||                  /-> 1x1 conv (mask prediction)
+        in_4 -> 1x1 conv -----> 3x3 convs (*4)
+                            |                  \-> 1x1 conv (feature)
+        in_5 -> 1x1 conv ---
+    """  # noqa: W605
+
+    def __init__(
+        self,
+        num_ins: int,
+        fusion_level: int,
+        seg_scale_factor=1 / 8,
+        num_convs: int = 4,
+        in_channels: int = 256,
+        conv_out_channels: int = 256,
+        num_classes: int = 183,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: OptConfigType = None,
+        ignore_label: int = None,
+        loss_weight: float = None,
+        loss_seg: ConfigDict = dict(
+            type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2),
+        init_cfg: MultiConfig = dict(
+            type='Kaiming', override=dict(name='conv_logits'))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_ins = num_ins
+        self.fusion_level = fusion_level
+        self.seg_scale_factor = seg_scale_factor
+        self.num_convs = num_convs
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.num_classes = num_classes
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.fp16_enabled = False
+
+        self.lateral_convs = nn.ModuleList()
+        for i in range(self.num_ins):
+            self.lateral_convs.append(
+                ConvModule(
+                    self.in_channels,
+                    self.in_channels,
+                    1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    inplace=False))
+
+        self.convs = nn.ModuleList()
+        for i in range(self.num_convs):
+            in_channels = self.in_channels if i == 0 else conv_out_channels
+            self.convs.append(
+                ConvModule(
+                    in_channels,
+                    conv_out_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.conv_embedding = ConvModule(
+            conv_out_channels,
+            conv_out_channels,
+            1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg)
+        self.conv_logits = nn.Conv2d(conv_out_channels, self.num_classes, 1)
+        if ignore_label:
+            loss_seg['ignore_index'] = ignore_label
+        if loss_weight:
+            loss_seg['loss_weight'] = loss_weight
+        if ignore_label or loss_weight:
+            warnings.warn('``ignore_label`` and ``loss_weight`` would be '
+                          'deprecated soon. Please set ``ingore_index`` and '
+                          '``loss_weight`` in ``loss_seg`` instead.')
+        self.criterion = MODELS.build(loss_seg)
+
+    def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]:
+        """Forward function.
+
+        Args:
+            feats (tuple[Tensor]): Multi scale feature maps.
+
+        Returns:
+            tuple[Tensor]:
+
+                - mask_preds (Tensor): Predicted mask logits.
+                - x (Tensor): Fused feature.
+        """
+        x = self.lateral_convs[self.fusion_level](feats[self.fusion_level])
+        fused_size = tuple(x.shape[-2:])
+        for i, feat in enumerate(feats):
+            if i != self.fusion_level:
+                feat = F.interpolate(
+                    feat, size=fused_size, mode='bilinear', align_corners=True)
+                # fix runtime error of "+=" inplace operation in PyTorch 1.10
+                x = x + self.lateral_convs[i](feat)
+
+        for i in range(self.num_convs):
+            x = self.convs[i](x)
+
+        mask_preds = self.conv_logits(x)
+        x = self.conv_embedding(x)
+        return mask_preds, x
+
+    def loss(self, mask_preds: Tensor, labels: Tensor) -> Tensor:
+        """Loss function.
+
+        Args:
+            mask_preds (Tensor): Predicted mask logits.
+            labels (Tensor): Ground truth.
+
+        Returns:
+            Tensor: Semantic segmentation loss.
+        """
+        labels = F.interpolate(
+            labels.float(), scale_factor=self.seg_scale_factor, mode='nearest')
+        labels = labels.squeeze(1).long()
+        loss_semantic_seg = self.criterion(mask_preds, labels)
+        return loss_semantic_seg
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/global_context_head.py b/mmde/mmdet/models/roi_heads/mask_heads/global_context_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb947ea582227d2b74112cbb930e1a3f85b77ff5
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/global_context_head.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.models.layers import ResLayer, SimplifiedBasicBlock
+from mmdet.registry import MODELS
+from mmdet.utils import MultiConfig, OptConfigType
+
+
+@MODELS.register_module()
+class GlobalContextHead(BaseModule):
+    """Global context head used in `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        num_convs (int, optional): number of convolutional layer in GlbCtxHead.
+            Defaults to 4.
+        in_channels (int, optional): number of input channels. Defaults to 256.
+        conv_out_channels (int, optional): number of output channels before
+            classification layer. Defaults to 256.
+        num_classes (int, optional): number of classes. Defaults to 80.
+        loss_weight (float, optional): global context loss weight.
+            Defaults to 1.
+        conv_cfg (dict, optional): config to init conv layer. Defaults to None.
+        norm_cfg (dict, optional): config to init norm layer. Defaults to None.
+        conv_to_res (bool, optional): if True, 2 convs will be grouped into
+            1 `SimplifiedBasicBlock` using a skip connection.
+            Defaults to False.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`]): Initialization config dict. Defaults to
+            dict(type='Normal', std=0.01, override=dict(name='fc')).
+    """
+
+    def __init__(
+        self,
+        num_convs: int = 4,
+        in_channels: int = 256,
+        conv_out_channels: int = 256,
+        num_classes: int = 80,
+        loss_weight: float = 1.0,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: OptConfigType = None,
+        conv_to_res: bool = False,
+        init_cfg: MultiConfig = dict(
+            type='Normal', std=0.01, override=dict(name='fc'))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_convs = num_convs
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.num_classes = num_classes
+        self.loss_weight = loss_weight
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.conv_to_res = conv_to_res
+        self.fp16_enabled = False
+
+        if self.conv_to_res:
+            num_res_blocks = num_convs // 2
+            self.convs = ResLayer(
+                SimplifiedBasicBlock,
+                in_channels,
+                self.conv_out_channels,
+                num_res_blocks,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+            self.num_convs = num_res_blocks
+        else:
+            self.convs = nn.ModuleList()
+            for i in range(self.num_convs):
+                in_channels = self.in_channels if i == 0 else conv_out_channels
+                self.convs.append(
+                    ConvModule(
+                        in_channels,
+                        conv_out_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+
+        self.pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Linear(conv_out_channels, num_classes)
+
+        self.criterion = nn.BCEWithLogitsLoss()
+
+    def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]:
+        """Forward function.
+
+        Args:
+            feats (Tuple[Tensor]): Multi-scale feature maps.
+
+        Returns:
+            Tuple[Tensor]:
+
+                - mc_pred (Tensor): Multi-class prediction.
+                - x (Tensor): Global context feature.
+        """
+        x = feats[-1]
+        for i in range(self.num_convs):
+            x = self.convs[i](x)
+        x = self.pool(x)
+
+        # multi-class prediction
+        mc_pred = x.reshape(x.size(0), -1)
+        mc_pred = self.fc(mc_pred)
+
+        return mc_pred, x
+
+    def loss(self, pred: Tensor, labels: List[Tensor]) -> Tensor:
+        """Loss function.
+
+        Args:
+            pred (Tensor): Logits.
+            labels (list[Tensor]): Grouth truths.
+
+        Returns:
+            Tensor: Loss.
+        """
+        labels = [lbl.unique() for lbl in labels]
+        targets = pred.new_zeros(pred.size())
+        for i, label in enumerate(labels):
+            targets[i, label] = 1.0
+        loss = self.loss_weight * self.criterion(pred, targets)
+        return loss
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/grid_head.py b/mmde/mmdet/models/roi_heads/mask_heads/grid_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9514ae7bcfc1b7d5613fa0107e9bd087e13dd46
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/grid_head.py
@@ -0,0 +1,490 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType
+
+
+@MODELS.register_module()
+class GridHead(BaseModule):
+    """Implementation of `Grid Head <https://arxiv.org/abs/1811.12030>`_
+
+    Args:
+        grid_points (int): The number of grid points. Defaults to 9.
+        num_convs (int): The number of convolution layers. Defaults to 8.
+        roi_feat_size (int): RoI feature size. Default to 14.
+        in_channels (int): The channel number of inputs features.
+            Defaults to 256.
+        conv_kernel_size (int): The kernel size of convolution layers.
+            Defaults to 3.
+        point_feat_channels (int): The number of channels of each point
+            features. Defaults to 64.
+        class_agnostic (bool): Whether use class agnostic classification.
+            If so, the output channels of logits will be 1. Defaults to False.
+        loss_grid (:obj:`ConfigDict` or dict): Config of grid loss.
+        conv_cfg (:obj:`ConfigDict` or dict, optional) dictionary to
+            construct and config conv layer.
+        norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and
+            config norm layer.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        grid_points: int = 9,
+        num_convs: int = 8,
+        roi_feat_size: int = 14,
+        in_channels: int = 256,
+        conv_kernel_size: int = 3,
+        point_feat_channels: int = 64,
+        deconv_kernel_size: int = 4,
+        class_agnostic: bool = False,
+        loss_grid: ConfigType = dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=15),
+        conv_cfg: OptConfigType = None,
+        norm_cfg: ConfigType = dict(type='GN', num_groups=36),
+        init_cfg: MultiConfig = [
+            dict(type='Kaiming', layer=['Conv2d', 'Linear']),
+            dict(
+                type='Normal',
+                layer='ConvTranspose2d',
+                std=0.001,
+                override=dict(
+                    type='Normal',
+                    name='deconv2',
+                    std=0.001,
+                    bias=-np.log(0.99 / 0.01)))
+        ]
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.grid_points = grid_points
+        self.num_convs = num_convs
+        self.roi_feat_size = roi_feat_size
+        self.in_channels = in_channels
+        self.conv_kernel_size = conv_kernel_size
+        self.point_feat_channels = point_feat_channels
+        self.conv_out_channels = self.point_feat_channels * self.grid_points
+        self.class_agnostic = class_agnostic
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        if isinstance(norm_cfg, dict) and norm_cfg['type'] == 'GN':
+            assert self.conv_out_channels % norm_cfg['num_groups'] == 0
+
+        assert self.grid_points >= 4
+        self.grid_size = int(np.sqrt(self.grid_points))
+        if self.grid_size * self.grid_size != self.grid_points:
+            raise ValueError('grid_points must be a square number')
+
+        # the predicted heatmap is half of whole_map_size
+        if not isinstance(self.roi_feat_size, int):
+            raise ValueError('Only square RoIs are supporeted in Grid R-CNN')
+        self.whole_map_size = self.roi_feat_size * 4
+
+        # compute point-wise sub-regions
+        self.sub_regions = self.calc_sub_regions()
+
+        self.convs = []
+        for i in range(self.num_convs):
+            in_channels = (
+                self.in_channels if i == 0 else self.conv_out_channels)
+            stride = 2 if i == 0 else 1
+            padding = (self.conv_kernel_size - 1) // 2
+            self.convs.append(
+                ConvModule(
+                    in_channels,
+                    self.conv_out_channels,
+                    self.conv_kernel_size,
+                    stride=stride,
+                    padding=padding,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=True))
+        self.convs = nn.Sequential(*self.convs)
+
+        self.deconv1 = nn.ConvTranspose2d(
+            self.conv_out_channels,
+            self.conv_out_channels,
+            kernel_size=deconv_kernel_size,
+            stride=2,
+            padding=(deconv_kernel_size - 2) // 2,
+            groups=grid_points)
+        self.norm1 = nn.GroupNorm(grid_points, self.conv_out_channels)
+        self.deconv2 = nn.ConvTranspose2d(
+            self.conv_out_channels,
+            grid_points,
+            kernel_size=deconv_kernel_size,
+            stride=2,
+            padding=(deconv_kernel_size - 2) // 2,
+            groups=grid_points)
+
+        # find the 4-neighbor of each grid point
+        self.neighbor_points = []
+        grid_size = self.grid_size
+        for i in range(grid_size):  # i-th column
+            for j in range(grid_size):  # j-th row
+                neighbors = []
+                if i > 0:  # left: (i - 1, j)
+                    neighbors.append((i - 1) * grid_size + j)
+                if j > 0:  # up: (i, j - 1)
+                    neighbors.append(i * grid_size + j - 1)
+                if j < grid_size - 1:  # down: (i, j + 1)
+                    neighbors.append(i * grid_size + j + 1)
+                if i < grid_size - 1:  # right: (i + 1, j)
+                    neighbors.append((i + 1) * grid_size + j)
+                self.neighbor_points.append(tuple(neighbors))
+        # total edges in the grid
+        self.num_edges = sum([len(p) for p in self.neighbor_points])
+
+        self.forder_trans = nn.ModuleList()  # first-order feature transition
+        self.sorder_trans = nn.ModuleList()  # second-order feature transition
+        for neighbors in self.neighbor_points:
+            fo_trans = nn.ModuleList()
+            so_trans = nn.ModuleList()
+            for _ in range(len(neighbors)):
+                # each transition module consists of a 5x5 depth-wise conv and
+                # 1x1 conv.
+                fo_trans.append(
+                    nn.Sequential(
+                        nn.Conv2d(
+                            self.point_feat_channels,
+                            self.point_feat_channels,
+                            5,
+                            stride=1,
+                            padding=2,
+                            groups=self.point_feat_channels),
+                        nn.Conv2d(self.point_feat_channels,
+                                  self.point_feat_channels, 1)))
+                so_trans.append(
+                    nn.Sequential(
+                        nn.Conv2d(
+                            self.point_feat_channels,
+                            self.point_feat_channels,
+                            5,
+                            1,
+                            2,
+                            groups=self.point_feat_channels),
+                        nn.Conv2d(self.point_feat_channels,
+                                  self.point_feat_channels, 1)))
+            self.forder_trans.append(fo_trans)
+            self.sorder_trans.append(so_trans)
+
+        self.loss_grid = MODELS.build(loss_grid)
+
+    def forward(self, x: Tensor) -> Dict[str, Tensor]:
+        """forward function of ``GridHead``.
+
+        Args:
+            x (Tensor): RoI features, has shape
+                (num_rois, num_channels, roi_feat_size, roi_feat_size).
+
+        Returns:
+            Dict[str, Tensor]: Return a dict including fused and unfused
+            heatmap.
+        """
+        assert x.shape[-1] == x.shape[-2] == self.roi_feat_size
+        # RoI feature transformation, downsample 2x
+        x = self.convs(x)
+
+        c = self.point_feat_channels
+        # first-order fusion
+        x_fo = [None for _ in range(self.grid_points)]
+        for i, points in enumerate(self.neighbor_points):
+            x_fo[i] = x[:, i * c:(i + 1) * c]
+            for j, point_idx in enumerate(points):
+                x_fo[i] = x_fo[i] + self.forder_trans[i][j](
+                    x[:, point_idx * c:(point_idx + 1) * c])
+
+        # second-order fusion
+        x_so = [None for _ in range(self.grid_points)]
+        for i, points in enumerate(self.neighbor_points):
+            x_so[i] = x[:, i * c:(i + 1) * c]
+            for j, point_idx in enumerate(points):
+                x_so[i] = x_so[i] + self.sorder_trans[i][j](x_fo[point_idx])
+
+        # predicted heatmap with fused features
+        x2 = torch.cat(x_so, dim=1)
+        x2 = self.deconv1(x2)
+        x2 = F.relu(self.norm1(x2), inplace=True)
+        heatmap = self.deconv2(x2)
+
+        # predicted heatmap with original features (applicable during training)
+        if self.training:
+            x1 = x
+            x1 = self.deconv1(x1)
+            x1 = F.relu(self.norm1(x1), inplace=True)
+            heatmap_unfused = self.deconv2(x1)
+        else:
+            heatmap_unfused = heatmap
+
+        return dict(fused=heatmap, unfused=heatmap_unfused)
+
+    def calc_sub_regions(self) -> List[Tuple[float]]:
+        """Compute point specific representation regions.
+
+        See `Grid R-CNN Plus <https://arxiv.org/abs/1906.05688>`_ for details.
+        """
+        # to make it consistent with the original implementation, half_size
+        # is computed as 2 * quarter_size, which is smaller
+        half_size = self.whole_map_size // 4 * 2
+        sub_regions = []
+        for i in range(self.grid_points):
+            x_idx = i // self.grid_size
+            y_idx = i % self.grid_size
+            if x_idx == 0:
+                sub_x1 = 0
+            elif x_idx == self.grid_size - 1:
+                sub_x1 = half_size
+            else:
+                ratio = x_idx / (self.grid_size - 1) - 0.25
+                sub_x1 = max(int(ratio * self.whole_map_size), 0)
+
+            if y_idx == 0:
+                sub_y1 = 0
+            elif y_idx == self.grid_size - 1:
+                sub_y1 = half_size
+            else:
+                ratio = y_idx / (self.grid_size - 1) - 0.25
+                sub_y1 = max(int(ratio * self.whole_map_size), 0)
+            sub_regions.append(
+                (sub_x1, sub_y1, sub_x1 + half_size, sub_y1 + half_size))
+        return sub_regions
+
+    def get_targets(self, sampling_results: List[SamplingResult],
+                    rcnn_train_cfg: ConfigDict) -> Tensor:
+        """Calculate the ground truth for all samples in a batch according to
+        the sampling_results.".
+
+        Args:
+            sampling_results (List[:obj:`SamplingResult`]): Assign results of
+                all images in a batch after sampling.
+            rcnn_train_cfg (:obj:`ConfigDict`): `train_cfg` of RCNN.
+
+        Returns:
+            Tensor: Grid heatmap targets.
+        """
+        # mix all samples (across images) together.
+        pos_bboxes = torch.cat([res.pos_bboxes for res in sampling_results],
+                               dim=0).cpu()
+        pos_gt_bboxes = torch.cat(
+            [res.pos_gt_bboxes for res in sampling_results], dim=0).cpu()
+        assert pos_bboxes.shape == pos_gt_bboxes.shape
+
+        # expand pos_bboxes to 2x of original size
+        x1 = pos_bboxes[:, 0] - (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2
+        y1 = pos_bboxes[:, 1] - (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2
+        x2 = pos_bboxes[:, 2] + (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2
+        y2 = pos_bboxes[:, 3] + (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2
+        pos_bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+        pos_bbox_ws = (pos_bboxes[:, 2] - pos_bboxes[:, 0]).unsqueeze(-1)
+        pos_bbox_hs = (pos_bboxes[:, 3] - pos_bboxes[:, 1]).unsqueeze(-1)
+
+        num_rois = pos_bboxes.shape[0]
+        map_size = self.whole_map_size
+        # this is not the final target shape
+        targets = torch.zeros((num_rois, self.grid_points, map_size, map_size),
+                              dtype=torch.float)
+
+        # pre-compute interpolation factors for all grid points.
+        # the first item is the factor of x-dim, and the second is y-dim.
+        # for a 9-point grid, factors are like (1, 0), (0.5, 0.5), (0, 1)
+        factors = []
+        for j in range(self.grid_points):
+            x_idx = j // self.grid_size
+            y_idx = j % self.grid_size
+            factors.append((1 - x_idx / (self.grid_size - 1),
+                            1 - y_idx / (self.grid_size - 1)))
+
+        radius = rcnn_train_cfg.pos_radius
+        radius2 = radius**2
+        for i in range(num_rois):
+            # ignore small bboxes
+            if (pos_bbox_ws[i] <= self.grid_size
+                    or pos_bbox_hs[i] <= self.grid_size):
+                continue
+            # for each grid point, mark a small circle as positive
+            for j in range(self.grid_points):
+                factor_x, factor_y = factors[j]
+                gridpoint_x = factor_x * pos_gt_bboxes[i, 0] + (
+                    1 - factor_x) * pos_gt_bboxes[i, 2]
+                gridpoint_y = factor_y * pos_gt_bboxes[i, 1] + (
+                    1 - factor_y) * pos_gt_bboxes[i, 3]
+
+                cx = int((gridpoint_x - pos_bboxes[i, 0]) / pos_bbox_ws[i] *
+                         map_size)
+                cy = int((gridpoint_y - pos_bboxes[i, 1]) / pos_bbox_hs[i] *
+                         map_size)
+
+                for x in range(cx - radius, cx + radius + 1):
+                    for y in range(cy - radius, cy + radius + 1):
+                        if x >= 0 and x < map_size and y >= 0 and y < map_size:
+                            if (x - cx)**2 + (y - cy)**2 <= radius2:
+                                targets[i, j, y, x] = 1
+        # reduce the target heatmap size by a half
+        # proposed in Grid R-CNN Plus (https://arxiv.org/abs/1906.05688).
+        sub_targets = []
+        for i in range(self.grid_points):
+            sub_x1, sub_y1, sub_x2, sub_y2 = self.sub_regions[i]
+            sub_targets.append(targets[:, [i], sub_y1:sub_y2, sub_x1:sub_x2])
+        sub_targets = torch.cat(sub_targets, dim=1)
+        sub_targets = sub_targets.to(sampling_results[0].pos_bboxes.device)
+        return sub_targets
+
+    def loss(self, grid_pred: Tensor, sample_idx: Tensor,
+             sampling_results: List[SamplingResult],
+             rcnn_train_cfg: ConfigDict) -> dict:
+        """Calculate the loss based on the features extracted by the grid head.
+
+        Args:
+            grid_pred (dict[str, Tensor]): Outputs of grid_head forward.
+            sample_idx (Tensor): The sampling index of ``grid_pred``.
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            rcnn_train_cfg (obj:`ConfigDict`): `train_cfg` of RCNN.
+
+        Returns:
+            dict: A dictionary of loss and targets components.
+        """
+        grid_targets = self.get_targets(sampling_results, rcnn_train_cfg)
+        grid_targets = grid_targets[sample_idx]
+
+        loss_fused = self.loss_grid(grid_pred['fused'], grid_targets)
+        loss_unfused = self.loss_grid(grid_pred['unfused'], grid_targets)
+        loss_grid = loss_fused + loss_unfused
+        return dict(loss_grid=loss_grid)
+
+    def predict_by_feat(self,
+                        grid_preds: Dict[str, Tensor],
+                        results_list: List[InstanceData],
+                        batch_img_metas: List[dict],
+                        rescale: bool = False) -> InstanceList:
+        """Adjust the predicted bboxes from bbox head.
+
+        Args:
+            grid_preds (dict[str, Tensor]): dictionary outputted by forward
+                function.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            batch_img_metas (list[dict]): List of image information.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process. Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape \
+            (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4), the last \
+            dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        num_roi_per_img = tuple(res.bboxes.size(0) for res in results_list)
+        grid_preds = {
+            k: v.split(num_roi_per_img, 0)
+            for k, v in grid_preds.items()
+        }
+
+        for i, results in enumerate(results_list):
+            if len(results) != 0:
+                bboxes = self._predict_by_feat_single(
+                    grid_pred=grid_preds['fused'][i],
+                    bboxes=results.bboxes,
+                    img_meta=batch_img_metas[i],
+                    rescale=rescale)
+                results.bboxes = bboxes
+        return results_list
+
+    def _predict_by_feat_single(self,
+                                grid_pred: Tensor,
+                                bboxes: Tensor,
+                                img_meta: dict,
+                                rescale: bool = False) -> Tensor:
+        """Adjust ``bboxes`` according to ``grid_pred``.
+
+        Args:
+            grid_pred (Tensor): Grid fused heatmap.
+            bboxes (Tensor): Predicted bboxes, has shape (n, 4)
+            img_meta (dict): image information.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            Tensor: adjusted bboxes.
+        """
+        assert bboxes.size(0) == grid_pred.size(0)
+        grid_pred = grid_pred.sigmoid()
+
+        R, c, h, w = grid_pred.shape
+        half_size = self.whole_map_size // 4 * 2
+        assert h == w == half_size
+        assert c == self.grid_points
+
+        # find the point with max scores in the half-sized heatmap
+        grid_pred = grid_pred.view(R * c, h * w)
+        pred_scores, pred_position = grid_pred.max(dim=1)
+        xs = pred_position % w
+        ys = pred_position // w
+
+        # get the position in the whole heatmap instead of half-sized heatmap
+        for i in range(self.grid_points):
+            xs[i::self.grid_points] += self.sub_regions[i][0]
+            ys[i::self.grid_points] += self.sub_regions[i][1]
+
+        # reshape to (num_rois, grid_points)
+        pred_scores, xs, ys = tuple(
+            map(lambda x: x.view(R, c), [pred_scores, xs, ys]))
+
+        # get expanded pos_bboxes
+        widths = (bboxes[:, 2] - bboxes[:, 0]).unsqueeze(-1)
+        heights = (bboxes[:, 3] - bboxes[:, 1]).unsqueeze(-1)
+        x1 = (bboxes[:, 0, None] - widths / 2)
+        y1 = (bboxes[:, 1, None] - heights / 2)
+        # map the grid point to the absolute coordinates
+        abs_xs = (xs.float() + 0.5) / w * widths + x1
+        abs_ys = (ys.float() + 0.5) / h * heights + y1
+
+        # get the grid points indices that fall on the bbox boundaries
+        x1_inds = [i for i in range(self.grid_size)]
+        y1_inds = [i * self.grid_size for i in range(self.grid_size)]
+        x2_inds = [
+            self.grid_points - self.grid_size + i
+            for i in range(self.grid_size)
+        ]
+        y2_inds = [(i + 1) * self.grid_size - 1 for i in range(self.grid_size)]
+
+        # voting of all grid points on some boundary
+        bboxes_x1 = (abs_xs[:, x1_inds] * pred_scores[:, x1_inds]).sum(
+            dim=1, keepdim=True) / (
+                pred_scores[:, x1_inds].sum(dim=1, keepdim=True))
+        bboxes_y1 = (abs_ys[:, y1_inds] * pred_scores[:, y1_inds]).sum(
+            dim=1, keepdim=True) / (
+                pred_scores[:, y1_inds].sum(dim=1, keepdim=True))
+        bboxes_x2 = (abs_xs[:, x2_inds] * pred_scores[:, x2_inds]).sum(
+            dim=1, keepdim=True) / (
+                pred_scores[:, x2_inds].sum(dim=1, keepdim=True))
+        bboxes_y2 = (abs_ys[:, y2_inds] * pred_scores[:, y2_inds]).sum(
+            dim=1, keepdim=True) / (
+                pred_scores[:, y2_inds].sum(dim=1, keepdim=True))
+
+        bboxes = torch.cat([bboxes_x1, bboxes_y1, bboxes_x2, bboxes_y2], dim=1)
+        bboxes[:, [0, 2]].clamp_(min=0, max=img_meta['img_shape'][1])
+        bboxes[:, [1, 3]].clamp_(min=0, max=img_meta['img_shape'][0])
+
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            bboxes /= bboxes.new_tensor(img_meta['scale_factor']).repeat(
+                (1, 2))
+
+        return bboxes
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/htc_mask_head.py b/mmde/mmdet/models/roi_heads/mask_heads/htc_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..73ac1e6e5f115927e1a2accdd693aae512cac753
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/htc_mask_head.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Union
+
+from mmcv.cnn import ConvModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .fcn_mask_head import FCNMaskHead
+
+
+@MODELS.register_module()
+class HTCMaskHead(FCNMaskHead):
+    """Mask head for HTC.
+
+    Args:
+        with_conv_res (bool): Whether add conv layer for ``res_feat``.
+            Defaults to True.
+    """
+
+    def __init__(self, with_conv_res: bool = True, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.with_conv_res = with_conv_res
+        if self.with_conv_res:
+            self.conv_res = ConvModule(
+                self.conv_out_channels,
+                self.conv_out_channels,
+                1,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+
+    def forward(self,
+                x: Tensor,
+                res_feat: Optional[Tensor] = None,
+                return_logits: bool = True,
+                return_feat: bool = True) -> Union[Tensor, List[Tensor]]:
+        """
+        Args:
+            x (Tensor): Feature map.
+            res_feat (Tensor, optional): Feature for residual connection.
+                Defaults to None.
+            return_logits (bool): Whether return mask logits. Defaults to True.
+            return_feat (bool): Whether return feature map. Defaults to True.
+
+        Returns:
+            Union[Tensor, List[Tensor]]: The return result is one of three
+                results: res_feat, logits, or [logits, res_feat].
+        """
+        assert not (not return_logits and not return_feat)
+        if res_feat is not None:
+            assert self.with_conv_res
+            res_feat = self.conv_res(res_feat)
+            x = x + res_feat
+        for conv in self.convs:
+            x = conv(x)
+        res_feat = x
+        outs = []
+        if return_logits:
+            x = self.upsample(x)
+            if self.upsample_method == 'deconv':
+                x = self.relu(x)
+            mask_preds = self.conv_logits(x)
+            outs.append(mask_preds)
+        if return_feat:
+            outs.append(res_feat)
+        return outs if len(outs) > 1 else outs[0]
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/mask_point_head.py b/mmde/mmdet/models/roi_heads/mask_heads/mask_point_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2084f59f07b48bf2e5b05bb7af61172df8737478
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/mask_point_head.py
@@ -0,0 +1,284 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py  # noqa
+
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.models.utils import (get_uncertain_point_coords_with_randomness,
+                                get_uncertainty)
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType
+
+
+@MODELS.register_module()
+class MaskPointHead(BaseModule):
+    """A mask point head use in PointRend.
+
+    ``MaskPointHead`` use shared multi-layer perceptron (equivalent to
+    nn.Conv1d) to predict the logit of input points. The fine-grained feature
+    and coarse feature will be concatenate together for predication.
+
+    Args:
+        num_fcs (int): Number of fc layers in the head. Defaults to 3.
+        in_channels (int): Number of input channels. Defaults to 256.
+        fc_channels (int): Number of fc channels. Defaults to 256.
+        num_classes (int): Number of classes for logits. Defaults to 80.
+        class_agnostic (bool): Whether use class agnostic classification.
+            If so, the output channels of logits will be 1. Defaults to False.
+        coarse_pred_each_layer (bool): Whether concatenate coarse feature with
+            the output of each fc layer. Defaults to True.
+        conv_cfg (:obj:`ConfigDict` or dict): Dictionary to construct
+            and config conv layer. Defaults to dict(type='Conv1d')).
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to construct
+            and config norm layer. Defaults to None.
+        loss_point (:obj:`ConfigDict` or dict): Dictionary to construct and
+            config loss layer of point head. Defaults to
+            dict(type='CrossEntropyLoss', use_mask=True, loss_weight=1.0).
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        num_fcs: int = 3,
+        in_channels: int = 256,
+        fc_channels: int = 256,
+        class_agnostic: bool = False,
+        coarse_pred_each_layer: bool = True,
+        conv_cfg: ConfigType = dict(type='Conv1d'),
+        norm_cfg: OptConfigType = None,
+        act_cfg: ConfigType = dict(type='ReLU'),
+        loss_point: ConfigType = dict(
+            type='CrossEntropyLoss', use_mask=True, loss_weight=1.0),
+        init_cfg: MultiConfig = dict(
+            type='Normal', std=0.001, override=dict(name='fc_logits'))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_fcs = num_fcs
+        self.in_channels = in_channels
+        self.fc_channels = fc_channels
+        self.num_classes = num_classes
+        self.class_agnostic = class_agnostic
+        self.coarse_pred_each_layer = coarse_pred_each_layer
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.loss_point = MODELS.build(loss_point)
+
+        fc_in_channels = in_channels + num_classes
+        self.fcs = nn.ModuleList()
+        for _ in range(num_fcs):
+            fc = ConvModule(
+                fc_in_channels,
+                fc_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            self.fcs.append(fc)
+            fc_in_channels = fc_channels
+            fc_in_channels += num_classes if self.coarse_pred_each_layer else 0
+
+        out_channels = 1 if self.class_agnostic else self.num_classes
+        self.fc_logits = nn.Conv1d(
+            fc_in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, fine_grained_feats: Tensor,
+                coarse_feats: Tensor) -> Tensor:
+        """Classify each point base on fine grained and coarse feats.
+
+        Args:
+            fine_grained_feats (Tensor): Fine grained feature sampled from FPN,
+                shape (num_rois, in_channels, num_points).
+            coarse_feats (Tensor): Coarse feature sampled from CoarseMaskHead,
+                shape (num_rois, num_classes, num_points).
+
+        Returns:
+            Tensor: Point classification results,
+            shape (num_rois, num_class, num_points).
+        """
+
+        x = torch.cat([fine_grained_feats, coarse_feats], dim=1)
+        for fc in self.fcs:
+            x = fc(x)
+            if self.coarse_pred_each_layer:
+                x = torch.cat((x, coarse_feats), dim=1)
+        return self.fc_logits(x)
+
+    def get_targets(self, rois: Tensor, rel_roi_points: Tensor,
+                    sampling_results: List[SamplingResult],
+                    batch_gt_instances: InstanceList,
+                    cfg: ConfigType) -> Tensor:
+        """Get training targets of MaskPointHead for all images.
+
+        Args:
+            rois (Tensor): Region of Interest, shape (num_rois, 5).
+            rel_roi_points (Tensor): Points coordinates relative to RoI, shape
+                (num_rois, num_points, 2).
+            sampling_results (:obj:`SamplingResult`): Sampling result after
+                sampling and assignment.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            cfg (obj:`ConfigDict` or dict): Training cfg.
+
+        Returns:
+            Tensor: Point target, shape (num_rois, num_points).
+        """
+
+        num_imgs = len(sampling_results)
+        rois_list = []
+        rel_roi_points_list = []
+        for batch_ind in range(num_imgs):
+            inds = (rois[:, 0] == batch_ind)
+            rois_list.append(rois[inds])
+            rel_roi_points_list.append(rel_roi_points[inds])
+        pos_assigned_gt_inds_list = [
+            res.pos_assigned_gt_inds for res in sampling_results
+        ]
+        cfg_list = [cfg for _ in range(num_imgs)]
+
+        point_targets = map(self._get_targets_single, rois_list,
+                            rel_roi_points_list, pos_assigned_gt_inds_list,
+                            batch_gt_instances, cfg_list)
+        point_targets = list(point_targets)
+
+        if len(point_targets) > 0:
+            point_targets = torch.cat(point_targets)
+
+        return point_targets
+
+    def _get_targets_single(self, rois: Tensor, rel_roi_points: Tensor,
+                            pos_assigned_gt_inds: Tensor,
+                            gt_instances: InstanceData,
+                            cfg: ConfigType) -> Tensor:
+        """Get training target of MaskPointHead for each image."""
+        num_pos = rois.size(0)
+        num_points = cfg.num_points
+        if num_pos > 0:
+            gt_masks_th = (
+                gt_instances.masks.to_tensor(rois.dtype,
+                                             rois.device).index_select(
+                                                 0, pos_assigned_gt_inds))
+            gt_masks_th = gt_masks_th.unsqueeze(1)
+            rel_img_points = rel_roi_point_to_rel_img_point(
+                rois, rel_roi_points, gt_masks_th)
+            point_targets = point_sample(gt_masks_th,
+                                         rel_img_points).squeeze(1)
+        else:
+            point_targets = rois.new_zeros((0, num_points))
+        return point_targets
+
+    def loss_and_target(self, point_pred: Tensor, rel_roi_points: Tensor,
+                        sampling_results: List[SamplingResult],
+                        batch_gt_instances: InstanceList,
+                        cfg: ConfigType) -> dict:
+        """Calculate loss for MaskPointHead.
+
+        Args:
+            point_pred (Tensor): Point predication result, shape
+                (num_rois, num_classes, num_points).
+            rel_roi_points (Tensor): Points coordinates relative to RoI, shape
+                (num_rois, num_points, 2).
+             sampling_results (:obj:`SamplingResult`): Sampling result after
+                sampling and assignment.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            cfg (obj:`ConfigDict` or dict): Training cfg.
+
+        Returns:
+            dict: a dictionary of point loss and point target.
+        """
+        rois = bbox2roi([res.pos_bboxes for res in sampling_results])
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+
+        point_target = self.get_targets(rois, rel_roi_points, sampling_results,
+                                        batch_gt_instances, cfg)
+        if self.class_agnostic:
+            loss_point = self.loss_point(point_pred, point_target,
+                                         torch.zeros_like(pos_labels))
+        else:
+            loss_point = self.loss_point(point_pred, point_target, pos_labels)
+
+        return dict(loss_point=loss_point, point_target=point_target)
+
+    def get_roi_rel_points_train(self, mask_preds: Tensor, labels: Tensor,
+                                 cfg: ConfigType) -> Tensor:
+        """Get ``num_points`` most uncertain points with random points during
+        train.
+
+        Sample points in [0, 1] x [0, 1] coordinate space based on their
+        uncertainty. The uncertainties are calculated for each point using
+        '_get_uncertainty()' function that takes point's logit prediction as
+        input.
+
+        Args:
+            mask_preds (Tensor): A tensor of shape (num_rois, num_classes,
+                mask_height, mask_width) for class-specific or class-agnostic
+                prediction.
+            labels (Tensor): The ground truth class for each instance.
+            cfg (:obj:`ConfigDict` or dict): Training config of point head.
+
+        Returns:
+            point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
+            that contains the coordinates sampled points.
+        """
+        point_coords = get_uncertain_point_coords_with_randomness(
+            mask_preds, labels, cfg.num_points, cfg.oversample_ratio,
+            cfg.importance_sample_ratio)
+        return point_coords
+
+    def get_roi_rel_points_test(self, mask_preds: Tensor, label_preds: Tensor,
+                                cfg: ConfigType) -> Tuple[Tensor, Tensor]:
+        """Get ``num_points`` most uncertain points during test.
+
+        Args:
+            mask_preds (Tensor): A tensor of shape (num_rois, num_classes,
+                mask_height, mask_width) for class-specific or class-agnostic
+                prediction.
+            label_preds (Tensor): The predication class for each instance.
+            cfg (:obj:`ConfigDict` or dict): Testing config of point head.
+
+        Returns:
+            tuple:
+
+            - point_indices (Tensor): A tensor of shape (num_rois, num_points)
+              that contains indices from [0, mask_height x mask_width) of the
+              most uncertain points.
+            - point_coords (Tensor): A tensor of shape (num_rois, num_points,
+              2) that contains [0, 1] x [0, 1] normalized coordinates of the
+              most uncertain points from the [mask_height, mask_width] grid.
+        """
+        num_points = cfg.subdivision_num_points
+        uncertainty_map = get_uncertainty(mask_preds, label_preds)
+        num_rois, _, mask_height, mask_width = uncertainty_map.shape
+
+        # During ONNX exporting, the type of each elements of 'shape' is
+        # `Tensor(float)`, while it is `float` during PyTorch inference.
+        if isinstance(mask_height, torch.Tensor):
+            h_step = 1.0 / mask_height.float()
+            w_step = 1.0 / mask_width.float()
+        else:
+            h_step = 1.0 / mask_height
+            w_step = 1.0 / mask_width
+        # cast to int to avoid dynamic K for TopK op in ONNX
+        mask_size = int(mask_height * mask_width)
+        uncertainty_map = uncertainty_map.view(num_rois, mask_size)
+        num_points = min(mask_size, num_points)
+        point_indices = uncertainty_map.topk(num_points, dim=1)[1]
+        xs = w_step / 2.0 + (point_indices % mask_width).float() * w_step
+        ys = h_step / 2.0 + (point_indices // mask_width).float() * h_step
+        point_coords = torch.stack([xs, ys], dim=2)
+        return point_indices, point_coords
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/maskiou_head.py b/mmde/mmdet/models/roi_heads/mask_heads/maskiou_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8901871e754c491f7bc94eb68a27fa1b50e29148
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/maskiou_head.py
@@ -0,0 +1,277 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import Conv2d, Linear, MaxPool2d
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, OptMultiConfig
+
+
+@MODELS.register_module()
+class MaskIoUHead(BaseModule):
+    """Mask IoU Head.
+
+    This head predicts the IoU of predicted masks and corresponding gt masks.
+
+    Args:
+        num_convs (int): The number of convolution layers. Defaults to 4.
+        num_fcs (int): The number of fully connected layers. Defaults to 2.
+        roi_feat_size (int): RoI feature size. Default to 14.
+        in_channels (int): The channel number of inputs features.
+            Defaults to 256.
+        conv_out_channels (int): The feature channels of convolution layers.
+            Defaults to 256.
+        fc_out_channels (int): The feature channels of fully connected layers.
+            Defaults to 1024.
+        num_classes (int): Number of categories excluding the background
+            category. Defaults to 80.
+        loss_iou (:obj:`ConfigDict` or dict): IoU loss.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        num_convs: int = 4,
+        num_fcs: int = 2,
+        roi_feat_size: int = 14,
+        in_channels: int = 256,
+        conv_out_channels: int = 256,
+        fc_out_channels: int = 1024,
+        num_classes: int = 80,
+        loss_iou: ConfigType = dict(type='MSELoss', loss_weight=0.5),
+        init_cfg: OptMultiConfig = [
+            dict(type='Kaiming', override=dict(name='convs')),
+            dict(type='Caffe2Xavier', override=dict(name='fcs')),
+            dict(type='Normal', std=0.01, override=dict(name='fc_mask_iou'))
+        ]
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.fc_out_channels = fc_out_channels
+        self.num_classes = num_classes
+
+        self.convs = nn.ModuleList()
+        for i in range(num_convs):
+            if i == 0:
+                # concatenation of mask feature and mask prediction
+                in_channels = self.in_channels + 1
+            else:
+                in_channels = self.conv_out_channels
+            stride = 2 if i == num_convs - 1 else 1
+            self.convs.append(
+                Conv2d(
+                    in_channels,
+                    self.conv_out_channels,
+                    3,
+                    stride=stride,
+                    padding=1))
+
+        roi_feat_size = _pair(roi_feat_size)
+        pooled_area = (roi_feat_size[0] // 2) * (roi_feat_size[1] // 2)
+        self.fcs = nn.ModuleList()
+        for i in range(num_fcs):
+            in_channels = (
+                self.conv_out_channels *
+                pooled_area if i == 0 else self.fc_out_channels)
+            self.fcs.append(Linear(in_channels, self.fc_out_channels))
+
+        self.fc_mask_iou = Linear(self.fc_out_channels, self.num_classes)
+        self.relu = nn.ReLU()
+        self.max_pool = MaxPool2d(2, 2)
+        self.loss_iou = MODELS.build(loss_iou)
+
+    def forward(self, mask_feat: Tensor, mask_preds: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            mask_feat (Tensor): Mask features from upstream models.
+            mask_preds (Tensor): Mask predictions from mask head.
+
+        Returns:
+            Tensor: Mask IoU predictions.
+        """
+        mask_preds = mask_preds.sigmoid()
+        mask_pred_pooled = self.max_pool(mask_preds.unsqueeze(1))
+
+        x = torch.cat((mask_feat, mask_pred_pooled), 1)
+
+        for conv in self.convs:
+            x = self.relu(conv(x))
+        x = x.flatten(1)
+        for fc in self.fcs:
+            x = self.relu(fc(x))
+        mask_iou = self.fc_mask_iou(x)
+        return mask_iou
+
+    def loss_and_target(self, mask_iou_pred: Tensor, mask_preds: Tensor,
+                        mask_targets: Tensor,
+                        sampling_results: List[SamplingResult],
+                        batch_gt_instances: InstanceList,
+                        rcnn_train_cfg: ConfigDict) -> dict:
+        """Calculate the loss and targets of MaskIoUHead.
+
+        Args:
+            mask_iou_pred (Tensor): Mask IoU predictions results, has shape
+                (num_pos, num_classes)
+            mask_preds (Tensor): Mask predictions from mask head, has shape
+                (num_pos, mask_size, mask_size).
+            mask_targets (Tensor): The ground truth masks assigned with
+                predictions, has shape
+                (num_pos, mask_size, mask_size).
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It includes ``masks`` inside.
+            rcnn_train_cfg (obj:`ConfigDict`): `train_cfg` of RCNN.
+
+        Returns:
+            dict: A dictionary of loss and targets components.
+                The targets are only used for cascade rcnn.
+        """
+        mask_iou_targets = self.get_targets(
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            mask_preds=mask_preds,
+            mask_targets=mask_targets,
+            rcnn_train_cfg=rcnn_train_cfg)
+
+        pos_inds = mask_iou_targets > 0
+        if pos_inds.sum() > 0:
+            loss_mask_iou = self.loss_iou(mask_iou_pred[pos_inds],
+                                          mask_iou_targets[pos_inds])
+        else:
+            loss_mask_iou = mask_iou_pred.sum() * 0
+        return dict(loss_mask_iou=loss_mask_iou)
+
+    def get_targets(self, sampling_results: List[SamplingResult],
+                    batch_gt_instances: InstanceList, mask_preds: Tensor,
+                    mask_targets: Tensor,
+                    rcnn_train_cfg: ConfigDict) -> Tensor:
+        """Compute target of mask IoU.
+
+        Mask IoU target is the IoU of the predicted mask (inside a bbox) and
+        the gt mask of corresponding gt mask (the whole instance).
+        The intersection area is computed inside the bbox, and the gt mask area
+        is computed with two steps, firstly we compute the gt area inside the
+        bbox, then divide it by the area ratio of gt area inside the bbox and
+        the gt area of the whole instance.
+
+        Args:
+            sampling_results (list[:obj:`SamplingResult`]): sampling results.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It includes ``masks`` inside.
+            mask_preds (Tensor): Predicted masks of each positive proposal,
+                shape (num_pos, h, w).
+            mask_targets (Tensor): Gt mask of each positive proposal,
+                binary map of the shape (num_pos, h, w).
+            rcnn_train_cfg (obj:`ConfigDict`): Training config for R-CNN part.
+
+        Returns:
+            Tensor: mask iou target (length == num positive).
+        """
+        pos_proposals = [res.pos_priors for res in sampling_results]
+        pos_assigned_gt_inds = [
+            res.pos_assigned_gt_inds for res in sampling_results
+        ]
+        gt_masks = [res.masks for res in batch_gt_instances]
+
+        # compute the area ratio of gt areas inside the proposals and
+        # the whole instance
+        area_ratios = map(self._get_area_ratio, pos_proposals,
+                          pos_assigned_gt_inds, gt_masks)
+        area_ratios = torch.cat(list(area_ratios))
+        assert mask_targets.size(0) == area_ratios.size(0)
+
+        mask_preds = (mask_preds > rcnn_train_cfg.mask_thr_binary).float()
+        mask_pred_areas = mask_preds.sum((-1, -2))
+
+        # mask_preds and mask_targets are binary maps
+        overlap_areas = (mask_preds * mask_targets).sum((-1, -2))
+
+        # compute the mask area of the whole instance
+        gt_full_areas = mask_targets.sum((-1, -2)) / (area_ratios + 1e-7)
+
+        mask_iou_targets = overlap_areas / (
+            mask_pred_areas + gt_full_areas - overlap_areas)
+        return mask_iou_targets
+
+    def _get_area_ratio(self, pos_proposals: Tensor,
+                        pos_assigned_gt_inds: Tensor,
+                        gt_masks: InstanceData) -> Tensor:
+        """Compute area ratio of the gt mask inside the proposal and the gt
+        mask of the corresponding instance.
+
+        Args:
+            pos_proposals (Tensor): Positive proposals, has shape (num_pos, 4).
+            pos_assigned_gt_inds (Tensor): positive proposals assigned ground
+                truth index.
+            gt_masks (BitmapMask or PolygonMask): Gt masks (the whole instance)
+                of each image, with the same shape of the input image.
+
+        Returns:
+            Tensor: The area ratio of the gt mask inside the proposal and the
+            gt mask of the corresponding instance.
+        """
+        num_pos = pos_proposals.size(0)
+        if num_pos > 0:
+            area_ratios = []
+            proposals_np = pos_proposals.cpu().numpy()
+            pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
+            # compute mask areas of gt instances (batch processing for speedup)
+            gt_instance_mask_area = gt_masks.areas
+            for i in range(num_pos):
+                gt_mask = gt_masks[pos_assigned_gt_inds[i]]
+
+                # crop the gt mask inside the proposal
+                bbox = proposals_np[i, :].astype(np.int32)
+                gt_mask_in_proposal = gt_mask.crop(bbox)
+
+                ratio = gt_mask_in_proposal.areas[0] / (
+                    gt_instance_mask_area[pos_assigned_gt_inds[i]] + 1e-7)
+                area_ratios.append(ratio)
+            area_ratios = torch.from_numpy(np.stack(area_ratios)).float().to(
+                pos_proposals.device)
+        else:
+            area_ratios = pos_proposals.new_zeros((0, ))
+        return area_ratios
+
+    def predict_by_feat(self, mask_iou_preds: Tuple[Tensor],
+                        results_list: InstanceList) -> InstanceList:
+        """Predict the mask iou and calculate it into ``results.scores``.
+
+        Args:
+            mask_iou_preds (Tensor): Mask IoU predictions results, has shape
+                (num_proposals, num_classes)
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        assert len(mask_iou_preds) == len(results_list)
+        for results, mask_iou_pred in zip(results_list, mask_iou_preds):
+            labels = results.labels
+            scores = results.scores
+            results.scores = scores * mask_iou_pred[range(labels.size(0)),
+                                                    labels]
+        return results_list
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py b/mmde/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffd30c337c37f4e280980e459c126df177fe7efa
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.layers import ResLayer, SimplifiedBasicBlock
+from mmdet.registry import MODELS
+from .fcn_mask_head import FCNMaskHead
+
+
+@MODELS.register_module()
+class SCNetMaskHead(FCNMaskHead):
+    """Mask head for `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        conv_to_res (bool, optional): if True, change the conv layers to
+            ``SimplifiedBasicBlock``.
+    """
+
+    def __init__(self, conv_to_res: bool = True, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.conv_to_res = conv_to_res
+        if conv_to_res:
+            assert self.conv_kernel_size == 3
+            self.num_res_blocks = self.num_convs // 2
+            self.convs = ResLayer(
+                SimplifiedBasicBlock,
+                self.in_channels,
+                self.conv_out_channels,
+                self.num_res_blocks,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
diff --git a/mmde/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py b/mmde/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..55c5c8e4fae7d4e941a770d985c7253fd70f2226
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.layers import ResLayer, SimplifiedBasicBlock
+from mmdet.registry import MODELS
+from .fused_semantic_head import FusedSemanticHead
+
+
+@MODELS.register_module()
+class SCNetSemanticHead(FusedSemanticHead):
+    """Mask head for `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        conv_to_res (bool, optional): if True, change the conv layers to
+            ``SimplifiedBasicBlock``.
+    """
+
+    def __init__(self, conv_to_res: bool = True, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.conv_to_res = conv_to_res
+        if self.conv_to_res:
+            num_res_blocks = self.num_convs // 2
+            self.convs = ResLayer(
+                SimplifiedBasicBlock,
+                self.in_channels,
+                self.conv_out_channels,
+                num_res_blocks,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg)
+            self.num_convs = num_res_blocks
diff --git a/mmde/mmdet/models/roi_heads/mask_scoring_roi_head.py b/mmde/mmdet/models/roi_heads/mask_scoring_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..6545c0ed41ee7ad17b5f1b841f8bc8d65a7b6391
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/mask_scoring_roi_head.py
@@ -0,0 +1,208 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList
+from ..task_modules.samplers import SamplingResult
+from ..utils.misc import empty_instances
+from .standard_roi_head import StandardRoIHead
+
+
+@MODELS.register_module()
+class MaskScoringRoIHead(StandardRoIHead):
+    """Mask Scoring RoIHead for `Mask Scoring RCNN.
+
+    <https://arxiv.org/abs/1903.00241>`_.
+
+    Args:
+        mask_iou_head (:obj`ConfigDict`, dict): The config of mask_iou_head.
+    """
+
+    def __init__(self, mask_iou_head: ConfigType, **kwargs):
+        assert mask_iou_head is not None
+        super().__init__(**kwargs)
+        self.mask_iou_head = MODELS.build(mask_iou_head)
+
+    def forward(self,
+                x: Tuple[Tensor],
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList = None) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            x (List[Tensor]): Multi-level features that may have different
+                resolutions.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+            the meta information of each image and corresponding
+            annotations.
+
+        Returns
+            tuple: A tuple of features from ``bbox_head`` and ``mask_head``
+            forward.
+        """
+        results = ()
+        proposals = [rpn_results.bboxes for rpn_results in rpn_results_list]
+        rois = bbox2roi(proposals)
+        # bbox head
+        if self.with_bbox:
+            bbox_results = self._bbox_forward(x, rois)
+            results = results + (bbox_results['cls_score'],
+                                 bbox_results['bbox_pred'])
+        # mask head
+        if self.with_mask:
+            mask_rois = rois[:100]
+            mask_results = self._mask_forward(x, mask_rois)
+            results = results + (mask_results['mask_preds'], )
+
+            # mask iou head
+            cls_score = bbox_results['cls_score'][:100]
+            mask_preds = mask_results['mask_preds']
+            mask_feats = mask_results['mask_feats']
+            _, labels = cls_score[:, :self.bbox_head.num_classes].max(dim=1)
+            mask_iou_preds = self.mask_iou_head(
+                mask_feats, mask_preds[range(labels.size(0)), labels])
+            results = results + (mask_iou_preds, )
+
+        return results
+
+    def mask_loss(self, x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult], bbox_feats,
+                  batch_gt_instances: InstanceList) -> dict:
+        """Perform forward propagation and loss calculation of the mask head on
+        the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+            bbox_feats (Tensor): Extract bbox RoI features.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `mask_preds` (Tensor): Mask prediction.
+                - `mask_feats` (Tensor): Extract mask RoI features.
+                - `mask_targets` (Tensor): Mask target of each positive\
+                    proposals in the image.
+                - `loss_mask` (dict): A dictionary of mask loss components.
+                - `loss_mask_iou` (Tensor): mask iou loss.
+        """
+        if not self.share_roi_extractor:
+            pos_rois = bbox2roi([res.pos_priors for res in sampling_results])
+            mask_results = self._mask_forward(x, pos_rois)
+        else:
+            pos_inds = []
+            device = bbox_feats.device
+            for res in sampling_results:
+                pos_inds.append(
+                    torch.ones(
+                        res.pos_priors.shape[0],
+                        device=device,
+                        dtype=torch.uint8))
+                pos_inds.append(
+                    torch.zeros(
+                        res.neg_priors.shape[0],
+                        device=device,
+                        dtype=torch.uint8))
+            pos_inds = torch.cat(pos_inds)
+
+            mask_results = self._mask_forward(
+                x, pos_inds=pos_inds, bbox_feats=bbox_feats)
+
+        mask_loss_and_target = self.mask_head.loss_and_target(
+            mask_preds=mask_results['mask_preds'],
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            rcnn_train_cfg=self.train_cfg)
+        mask_targets = mask_loss_and_target['mask_targets']
+        mask_results.update(loss_mask=mask_loss_and_target['loss_mask'])
+        if mask_results['loss_mask'] is None:
+            return mask_results
+
+        # mask iou head forward and loss
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+        pos_mask_pred = mask_results['mask_preds'][
+            range(mask_results['mask_preds'].size(0)), pos_labels]
+        mask_iou_pred = self.mask_iou_head(mask_results['mask_feats'],
+                                           pos_mask_pred)
+        pos_mask_iou_pred = mask_iou_pred[range(mask_iou_pred.size(0)),
+                                          pos_labels]
+
+        loss_mask_iou = self.mask_iou_head.loss_and_target(
+            pos_mask_iou_pred, pos_mask_pred, mask_targets, sampling_results,
+            batch_gt_instances, self.train_cfg)
+        mask_results['loss_mask'].update(loss_mask_iou)
+        return mask_results
+
+    def predict_mask(self,
+                     x: Tensor,
+                     batch_img_metas: List[dict],
+                     results_list: InstanceList,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        bboxes = [res.bboxes for res in results_list]
+        mask_rois = bbox2roi(bboxes)
+        if mask_rois.shape[0] == 0:
+            results_list = empty_instances(
+                batch_img_metas,
+                mask_rois.device,
+                task_type='mask',
+                instance_results=results_list,
+                mask_thr_binary=self.test_cfg.mask_thr_binary)
+            return results_list
+
+        mask_results = self._mask_forward(x, mask_rois)
+        mask_preds = mask_results['mask_preds']
+        mask_feats = mask_results['mask_feats']
+        # get mask scores with mask iou head
+        labels = torch.cat([res.labels for res in results_list])
+        mask_iou_preds = self.mask_iou_head(
+            mask_feats, mask_preds[range(labels.size(0)), labels])
+        # split batch mask prediction back to each image
+        num_mask_rois_per_img = [len(res) for res in results_list]
+        mask_preds = mask_preds.split(num_mask_rois_per_img, 0)
+        mask_iou_preds = mask_iou_preds.split(num_mask_rois_per_img, 0)
+
+        # TODO: Handle the case where rescale is false
+        results_list = self.mask_head.predict_by_feat(
+            mask_preds=mask_preds,
+            results_list=results_list,
+            batch_img_metas=batch_img_metas,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=rescale)
+        results_list = self.mask_iou_head.predict_by_feat(
+            mask_iou_preds=mask_iou_preds, results_list=results_list)
+        return results_list
diff --git a/mmde/mmdet/models/roi_heads/multi_instance_roi_head.py b/mmde/mmdet/models/roi_heads/multi_instance_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..fee55b0a5d341c03165649f59737fd34d85c207e
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/multi_instance_roi_head.py
@@ -0,0 +1,226 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import DetDataSample
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList
+from ..task_modules.samplers import SamplingResult
+from ..utils import empty_instances, unpack_gt_instances
+from .standard_roi_head import StandardRoIHead
+
+
+@MODELS.register_module()
+class MultiInstanceRoIHead(StandardRoIHead):
+    """The roi head for Multi-instance prediction."""
+
+    def __init__(self, num_instance: int = 2, *args, **kwargs) -> None:
+        self.num_instance = num_instance
+        super().__init__(*args, **kwargs)
+
+    def init_bbox_head(self, bbox_roi_extractor: ConfigType,
+                       bbox_head: ConfigType) -> None:
+        """Initialize box head and box roi extractor.
+
+        Args:
+            bbox_roi_extractor (dict or ConfigDict): Config of box
+                roi extractor.
+            bbox_head (dict or ConfigDict): Config of box in box head.
+        """
+        self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor)
+        self.bbox_head = MODELS.build(bbox_head)
+
+    def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict:
+        """Box head forward function used in both training and testing.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+
+        Returns:
+             dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `cls_score_ref` (Tensor): The cls_score after refine model.
+                - `bbox_pred_ref` (Tensor): The bbox_pred after refine model.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+        """
+        # TODO: a more flexible way to decide which feature maps to use
+        bbox_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs], rois)
+        bbox_results = self.bbox_head(bbox_feats)
+
+        if self.bbox_head.with_refine:
+            bbox_results = dict(
+                cls_score=bbox_results[0],
+                bbox_pred=bbox_results[1],
+                cls_score_ref=bbox_results[2],
+                bbox_pred_ref=bbox_results[3],
+                bbox_feats=bbox_feats)
+        else:
+            bbox_results = dict(
+                cls_score=bbox_results[0],
+                bbox_pred=bbox_results[1],
+                bbox_feats=bbox_feats)
+
+        return bbox_results
+
+    def bbox_loss(self, x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult]) -> dict:
+        """Perform forward propagation and loss calculation of the bbox head on
+        the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+
+        Returns:
+            dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+                - `loss_bbox` (dict): A dictionary of bbox loss components.
+        """
+        rois = bbox2roi([res.priors for res in sampling_results])
+        bbox_results = self._bbox_forward(x, rois)
+
+        # If there is a refining process, add refine loss.
+        if 'cls_score_ref' in bbox_results:
+            bbox_loss_and_target = self.bbox_head.loss_and_target(
+                cls_score=bbox_results['cls_score'],
+                bbox_pred=bbox_results['bbox_pred'],
+                rois=rois,
+                sampling_results=sampling_results,
+                rcnn_train_cfg=self.train_cfg)
+            bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox'])
+            bbox_loss_and_target_ref = self.bbox_head.loss_and_target(
+                cls_score=bbox_results['cls_score_ref'],
+                bbox_pred=bbox_results['bbox_pred_ref'],
+                rois=rois,
+                sampling_results=sampling_results,
+                rcnn_train_cfg=self.train_cfg)
+            bbox_results['loss_bbox']['loss_rcnn_emd_ref'] = \
+                bbox_loss_and_target_ref['loss_bbox']['loss_rcnn_emd']
+        else:
+            bbox_loss_and_target = self.bbox_head.loss_and_target(
+                cls_score=bbox_results['cls_score'],
+                bbox_pred=bbox_results['bbox_pred'],
+                rois=rois,
+                sampling_results=sampling_results,
+                rcnn_train_cfg=self.train_cfg)
+            bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox'])
+
+        return bbox_results
+
+    def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+             batch_data_samples: List[DetDataSample]) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, _ = outputs
+
+        sampling_results = []
+        for i in range(len(batch_data_samples)):
+            # rename rpn_results.bboxes to rpn_results.priors
+            rpn_results = rpn_results_list[i]
+            rpn_results.priors = rpn_results.pop('bboxes')
+
+            assign_result = self.bbox_assigner.assign(
+                rpn_results, batch_gt_instances[i],
+                batch_gt_instances_ignore[i])
+            sampling_result = self.bbox_sampler.sample(
+                assign_result,
+                rpn_results,
+                batch_gt_instances[i],
+                batch_gt_instances_ignore=batch_gt_instances_ignore[i])
+            sampling_results.append(sampling_result)
+
+        losses = dict()
+        # bbox head loss
+        if self.with_bbox:
+            bbox_results = self.bbox_loss(x, sampling_results)
+            losses.update(bbox_results['loss_bbox'])
+
+        return losses
+
+    def predict_bbox(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     rpn_results_list: InstanceList,
+                     rcnn_test_cfg: ConfigType,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the bbox head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        proposals = [res.bboxes for res in rpn_results_list]
+        rois = bbox2roi(proposals)
+
+        if rois.shape[0] == 0:
+            return empty_instances(
+                batch_img_metas, rois.device, task_type='bbox')
+
+        bbox_results = self._bbox_forward(x, rois)
+
+        # split batch bbox prediction back to each image
+        if 'cls_score_ref' in bbox_results:
+            cls_scores = bbox_results['cls_score_ref']
+            bbox_preds = bbox_results['bbox_pred_ref']
+        else:
+            cls_scores = bbox_results['cls_score']
+            bbox_preds = bbox_results['bbox_pred']
+        num_proposals_per_img = tuple(len(p) for p in proposals)
+        rois = rois.split(num_proposals_per_img, 0)
+        cls_scores = cls_scores.split(num_proposals_per_img, 0)
+
+        if bbox_preds is not None:
+            bbox_preds = bbox_preds.split(num_proposals_per_img, 0)
+        else:
+            bbox_preds = (None, ) * len(proposals)
+
+        result_list = self.bbox_head.predict_by_feat(
+            rois=rois,
+            cls_scores=cls_scores,
+            bbox_preds=bbox_preds,
+            batch_img_metas=batch_img_metas,
+            rcnn_test_cfg=rcnn_test_cfg,
+            rescale=rescale)
+        return result_list
diff --git a/mmde/mmdet/models/roi_heads/pisa_roi_head.py b/mmde/mmdet/models/roi_heads/pisa_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..45d59879da73b48df790c55d40a4a88f1d099111
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/pisa_roi_head.py
@@ -0,0 +1,148 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+from torch import Tensor
+
+from mmdet.models.task_modules import SamplingResult
+from mmdet.registry import MODELS
+from mmdet.structures import DetDataSample
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import InstanceList
+from ..losses.pisa_loss import carl_loss, isr_p
+from ..utils import unpack_gt_instances
+from .standard_roi_head import StandardRoIHead
+
+
+@MODELS.register_module()
+class PISARoIHead(StandardRoIHead):
+    r"""The RoI head for `Prime Sample Attention in Object Detection
+    <https://arxiv.org/abs/1904.04821>`_."""
+
+    def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+             batch_data_samples: List[DetDataSample]) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, _ = outputs
+
+        # assign gts and sample proposals
+        num_imgs = len(batch_data_samples)
+        sampling_results = []
+        neg_label_weights = []
+        for i in range(num_imgs):
+            # rename rpn_results.bboxes to rpn_results.priors
+            rpn_results = rpn_results_list[i]
+            rpn_results.priors = rpn_results.pop('bboxes')
+
+            assign_result = self.bbox_assigner.assign(
+                rpn_results, batch_gt_instances[i],
+                batch_gt_instances_ignore[i])
+            sampling_result = self.bbox_sampler.sample(
+                assign_result,
+                rpn_results,
+                batch_gt_instances[i],
+                feats=[lvl_feat[i][None] for lvl_feat in x])
+            if isinstance(sampling_result, tuple):
+                sampling_result, neg_label_weight = sampling_result
+            sampling_results.append(sampling_result)
+            neg_label_weights.append(neg_label_weight)
+
+        losses = dict()
+        # bbox head forward and loss
+        if self.with_bbox:
+            bbox_results = self.bbox_loss(
+                x, sampling_results, neg_label_weights=neg_label_weights)
+            losses.update(bbox_results['loss_bbox'])
+
+        # mask head forward and loss
+        if self.with_mask:
+            mask_results = self.mask_loss(x, sampling_results,
+                                          bbox_results['bbox_feats'],
+                                          batch_gt_instances)
+            losses.update(mask_results['loss_mask'])
+
+        return losses
+
+    def bbox_loss(self,
+                  x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult],
+                  neg_label_weights: List[Tensor] = None) -> dict:
+        """Perform forward propagation and loss calculation of the bbox head on
+        the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+
+        Returns:
+            dict[str, Tensor]: Usually returns a dictionary with keys:
+
+            - `cls_score` (Tensor): Classification scores.
+            - `bbox_pred` (Tensor): Box energies / deltas.
+            - `bbox_feats` (Tensor): Extract bbox RoI features.
+            - `loss_bbox` (dict): A dictionary of bbox loss components.
+        """
+        rois = bbox2roi([res.priors for res in sampling_results])
+        bbox_results = self._bbox_forward(x, rois)
+        bbox_targets = self.bbox_head.get_targets(sampling_results,
+                                                  self.train_cfg)
+
+        # neg_label_weights obtained by sampler is image-wise, mapping back to
+        # the corresponding location in label weights
+        if neg_label_weights[0] is not None:
+            label_weights = bbox_targets[1]
+            cur_num_rois = 0
+            for i in range(len(sampling_results)):
+                num_pos = sampling_results[i].pos_inds.size(0)
+                num_neg = sampling_results[i].neg_inds.size(0)
+                label_weights[cur_num_rois + num_pos:cur_num_rois + num_pos +
+                              num_neg] = neg_label_weights[i]
+                cur_num_rois += num_pos + num_neg
+
+        cls_score = bbox_results['cls_score']
+        bbox_pred = bbox_results['bbox_pred']
+
+        # Apply ISR-P
+        isr_cfg = self.train_cfg.get('isr', None)
+        if isr_cfg is not None:
+            bbox_targets = isr_p(
+                cls_score,
+                bbox_pred,
+                bbox_targets,
+                rois,
+                sampling_results,
+                self.bbox_head.loss_cls,
+                self.bbox_head.bbox_coder,
+                **isr_cfg,
+                num_class=self.bbox_head.num_classes)
+        loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, rois,
+                                        *bbox_targets)
+
+        # Add CARL Loss
+        carl_cfg = self.train_cfg.get('carl', None)
+        if carl_cfg is not None:
+            loss_carl = carl_loss(
+                cls_score,
+                bbox_targets[0],
+                bbox_pred,
+                bbox_targets[2],
+                self.bbox_head.loss_bbox,
+                **carl_cfg,
+                num_class=self.bbox_head.num_classes)
+            loss_bbox.update(loss_carl)
+
+        bbox_results.update(loss_bbox=loss_bbox)
+        return bbox_results
diff --git a/mmde/mmdet/models/roi_heads/point_rend_roi_head.py b/mmde/mmdet/models/roi_heads/point_rend_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a0641549631e243c3db25039b01fed64fb1e0d1
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/point_rend_roi_head.py
@@ -0,0 +1,236 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend  # noqa
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList
+from ..task_modules.samplers import SamplingResult
+from ..utils import empty_instances
+from .standard_roi_head import StandardRoIHead
+
+
+@MODELS.register_module()
+class PointRendRoIHead(StandardRoIHead):
+    """`PointRend <https://arxiv.org/abs/1912.08193>`_."""
+
+    def __init__(self, point_head: ConfigType, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        assert self.with_bbox and self.with_mask
+        self.init_point_head(point_head)
+
+    def init_point_head(self, point_head: ConfigType) -> None:
+        """Initialize ``point_head``"""
+        self.point_head = MODELS.build(point_head)
+
+    def mask_loss(self, x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult], bbox_feats: Tensor,
+                  batch_gt_instances: InstanceList) -> dict:
+        """Run forward function and calculate loss for mask head and point head
+        in training."""
+        mask_results = super().mask_loss(
+            x=x,
+            sampling_results=sampling_results,
+            bbox_feats=bbox_feats,
+            batch_gt_instances=batch_gt_instances)
+
+        mask_point_results = self._mask_point_loss(
+            x=x,
+            sampling_results=sampling_results,
+            mask_preds=mask_results['mask_preds'],
+            batch_gt_instances=batch_gt_instances)
+        mask_results['loss_mask'].update(
+            loss_point=mask_point_results['loss_point'])
+
+        return mask_results
+
+    def _mask_point_loss(self, x: Tuple[Tensor],
+                         sampling_results: List[SamplingResult],
+                         mask_preds: Tensor,
+                         batch_gt_instances: InstanceList) -> dict:
+        """Run forward function and calculate loss for point head in
+        training."""
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+        rel_roi_points = self.point_head.get_roi_rel_points_train(
+            mask_preds, pos_labels, cfg=self.train_cfg)
+        rois = bbox2roi([res.pos_bboxes for res in sampling_results])
+
+        fine_grained_point_feats = self._get_fine_grained_point_feats(
+            x, rois, rel_roi_points)
+        coarse_point_feats = point_sample(mask_preds, rel_roi_points)
+        mask_point_pred = self.point_head(fine_grained_point_feats,
+                                          coarse_point_feats)
+
+        loss_and_target = self.point_head.loss_and_target(
+            point_pred=mask_point_pred,
+            rel_roi_points=rel_roi_points,
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            cfg=self.train_cfg)
+
+        return loss_and_target
+
+    def _mask_point_forward_test(self, x: Tuple[Tensor], rois: Tensor,
+                                 label_preds: Tensor,
+                                 mask_preds: Tensor) -> Tensor:
+        """Mask refining process with point head in testing.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            rois (Tensor): shape (num_rois, 5).
+            label_preds (Tensor): The predication class for each rois.
+            mask_preds (Tensor): The predication coarse masks of
+                shape (num_rois, num_classes, small_size, small_size).
+
+        Returns:
+            Tensor: The refined masks of shape (num_rois, num_classes,
+            large_size, large_size).
+        """
+        refined_mask_pred = mask_preds.clone()
+        for subdivision_step in range(self.test_cfg.subdivision_steps):
+            refined_mask_pred = F.interpolate(
+                refined_mask_pred,
+                scale_factor=self.test_cfg.scale_factor,
+                mode='bilinear',
+                align_corners=False)
+            # If `subdivision_num_points` is larger or equal to the
+            # resolution of the next step, then we can skip this step
+            num_rois, channels, mask_height, mask_width = \
+                refined_mask_pred.shape
+            if (self.test_cfg.subdivision_num_points >=
+                    self.test_cfg.scale_factor**2 * mask_height * mask_width
+                    and
+                    subdivision_step < self.test_cfg.subdivision_steps - 1):
+                continue
+            point_indices, rel_roi_points = \
+                self.point_head.get_roi_rel_points_test(
+                    refined_mask_pred, label_preds, cfg=self.test_cfg)
+
+            fine_grained_point_feats = self._get_fine_grained_point_feats(
+                x=x, rois=rois, rel_roi_points=rel_roi_points)
+            coarse_point_feats = point_sample(mask_preds, rel_roi_points)
+            mask_point_pred = self.point_head(fine_grained_point_feats,
+                                              coarse_point_feats)
+
+            point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1)
+            refined_mask_pred = refined_mask_pred.reshape(
+                num_rois, channels, mask_height * mask_width)
+            refined_mask_pred = refined_mask_pred.scatter_(
+                2, point_indices, mask_point_pred)
+            refined_mask_pred = refined_mask_pred.view(num_rois, channels,
+                                                       mask_height, mask_width)
+
+        return refined_mask_pred
+
+    def _get_fine_grained_point_feats(self, x: Tuple[Tensor], rois: Tensor,
+                                      rel_roi_points: Tensor) -> Tensor:
+        """Sample fine grained feats from each level feature map and
+        concatenate them together.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            rois (Tensor): shape (num_rois, 5).
+            rel_roi_points (Tensor): A tensor of shape (num_rois, num_points,
+                2) that contains [0, 1] x [0, 1] normalized coordinates of the
+                most uncertain points from the [mask_height, mask_width] grid.
+
+        Returns:
+            Tensor: The fine grained features for each points,
+            has shape (num_rois, feats_channels, num_points).
+        """
+        assert rois.shape[0] > 0, 'RoI is a empty tensor.'
+        num_imgs = x[0].shape[0]
+        fine_grained_feats = []
+        for idx in range(self.mask_roi_extractor.num_inputs):
+            feats = x[idx]
+            spatial_scale = 1. / float(
+                self.mask_roi_extractor.featmap_strides[idx])
+            point_feats = []
+            for batch_ind in range(num_imgs):
+                # unravel batch dim
+                feat = feats[batch_ind].unsqueeze(0)
+                inds = (rois[:, 0].long() == batch_ind)
+                if inds.any():
+                    rel_img_points = rel_roi_point_to_rel_img_point(
+                        rois=rois[inds],
+                        rel_roi_points=rel_roi_points[inds],
+                        img=feat.shape[2:],
+                        spatial_scale=spatial_scale).unsqueeze(0)
+                    point_feat = point_sample(feat, rel_img_points)
+                    point_feat = point_feat.squeeze(0).transpose(0, 1)
+                    point_feats.append(point_feat)
+            fine_grained_feats.append(torch.cat(point_feats, dim=0))
+        return torch.cat(fine_grained_feats, dim=1)
+
+    def predict_mask(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     results_list: InstanceList,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        # don't need to consider aug_test.
+        bboxes = [res.bboxes for res in results_list]
+        mask_rois = bbox2roi(bboxes)
+        if mask_rois.shape[0] == 0:
+            results_list = empty_instances(
+                batch_img_metas,
+                mask_rois.device,
+                task_type='mask',
+                instance_results=results_list,
+                mask_thr_binary=self.test_cfg.mask_thr_binary)
+            return results_list
+
+        mask_results = self._mask_forward(x, mask_rois)
+        mask_preds = mask_results['mask_preds']
+        # split batch mask prediction back to each image
+        num_mask_rois_per_img = [len(res) for res in results_list]
+        mask_preds = mask_preds.split(num_mask_rois_per_img, 0)
+
+        # refine mask_preds
+        mask_rois = mask_rois.split(num_mask_rois_per_img, 0)
+        mask_preds_refined = []
+        for i in range(len(batch_img_metas)):
+            labels = results_list[i].labels
+            x_i = [xx[[i]] for xx in x]
+            mask_rois_i = mask_rois[i]
+            mask_rois_i[:, 0] = 0
+            mask_pred_i = self._mask_point_forward_test(
+                x_i, mask_rois_i, labels, mask_preds[i])
+            mask_preds_refined.append(mask_pred_i)
+
+        # TODO: Handle the case where rescale is false
+        results_list = self.mask_head.predict_by_feat(
+            mask_preds=mask_preds_refined,
+            results_list=results_list,
+            batch_img_metas=batch_img_metas,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=rescale)
+        return results_list
diff --git a/mmde/mmdet/models/roi_heads/roi_extractors/__init__.py b/mmde/mmdet/models/roi_heads/roi_extractors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f60214991b0ed14cdbc3964aee15356c6aaf2aa
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/roi_extractors/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_roi_extractor import BaseRoIExtractor
+from .generic_roi_extractor import GenericRoIExtractor
+from .single_level_roi_extractor import SingleRoIExtractor
+
+__all__ = ['BaseRoIExtractor', 'SingleRoIExtractor', 'GenericRoIExtractor']
diff --git a/mmde/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py b/mmde/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8de0518818aba8d9aac7b807e3215d0da6c9b99
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv import ops
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.utils import ConfigType, OptMultiConfig
+
+
+class BaseRoIExtractor(BaseModule, metaclass=ABCMeta):
+    """Base class for RoI extractor.
+
+    Args:
+        roi_layer (:obj:`ConfigDict` or dict): Specify RoI layer type and
+            arguments.
+        out_channels (int): Output channels of RoI layers.
+        featmap_strides (list[int]): Strides of input feature maps.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 roi_layer: ConfigType,
+                 out_channels: int,
+                 featmap_strides: List[int],
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides)
+        self.out_channels = out_channels
+        self.featmap_strides = featmap_strides
+
+    @property
+    def num_inputs(self) -> int:
+        """int: Number of input feature maps."""
+        return len(self.featmap_strides)
+
+    def build_roi_layers(self, layer_cfg: ConfigType,
+                         featmap_strides: List[int]) -> nn.ModuleList:
+        """Build RoI operator to extract feature from each level feature map.
+
+        Args:
+            layer_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
+                config RoI layer operation. Options are modules under
+                ``mmcv/ops`` such as ``RoIAlign``.
+            featmap_strides (list[int]): The stride of input feature map w.r.t
+                to the original image size, which would be used to scale RoI
+                coordinate (original image coordinate system) to feature
+                coordinate system.
+
+        Returns:
+            :obj:`nn.ModuleList`: The RoI extractor modules for each level
+                feature map.
+        """
+
+        cfg = layer_cfg.copy()
+        layer_type = cfg.pop('type')
+        if isinstance(layer_type, str):
+            assert hasattr(ops, layer_type)
+            layer_cls = getattr(ops, layer_type)
+        else:
+            layer_cls = layer_type
+        roi_layers = nn.ModuleList(
+            [layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides])
+        return roi_layers
+
+    def roi_rescale(self, rois: Tensor, scale_factor: float) -> Tensor:
+        """Scale RoI coordinates by scale factor.
+
+        Args:
+            rois (Tensor): RoI (Region of Interest), shape (n, 5)
+            scale_factor (float): Scale factor that RoI will be multiplied by.
+
+        Returns:
+            Tensor: Scaled RoI.
+        """
+
+        cx = (rois[:, 1] + rois[:, 3]) * 0.5
+        cy = (rois[:, 2] + rois[:, 4]) * 0.5
+        w = rois[:, 3] - rois[:, 1]
+        h = rois[:, 4] - rois[:, 2]
+        new_w = w * scale_factor
+        new_h = h * scale_factor
+        x1 = cx - new_w * 0.5
+        x2 = cx + new_w * 0.5
+        y1 = cy - new_h * 0.5
+        y2 = cy + new_h * 0.5
+        new_rois = torch.stack((rois[:, 0], x1, y1, x2, y2), dim=-1)
+        return new_rois
+
+    @abstractmethod
+    def forward(self,
+                feats: Tuple[Tensor],
+                rois: Tensor,
+                roi_scale_factor: Optional[float] = None) -> Tensor:
+        """Extractor ROI feats.
+
+        Args:
+            feats (Tuple[Tensor]): Multi-scale features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            roi_scale_factor (Optional[float]): RoI scale factor.
+                Defaults to None.
+
+        Returns:
+            Tensor: RoI feature.
+        """
+        pass
diff --git a/mmde/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py b/mmde/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d4c90135d853404d564391f029558841ac9cac
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py
@@ -0,0 +1,102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+from mmcv.cnn.bricks import build_plugin_layer
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType
+from .base_roi_extractor import BaseRoIExtractor
+
+
+@MODELS.register_module()
+class GenericRoIExtractor(BaseRoIExtractor):
+    """Extract RoI features from all level feature maps levels.
+
+    This is the implementation of `A novel Region of Interest Extraction Layer
+    for Instance Segmentation <https://arxiv.org/abs/2004.13665>`_.
+
+    Args:
+        aggregation (str): The method to aggregate multiple feature maps.
+            Options are 'sum', 'concat'. Defaults to 'sum'.
+        pre_cfg (:obj:`ConfigDict` or dict): Specify pre-processing modules.
+            Defaults to None.
+        post_cfg (:obj:`ConfigDict` or dict): Specify post-processing modules.
+            Defaults to None.
+        kwargs (keyword arguments): Arguments that are the same
+            as :class:`BaseRoIExtractor`.
+    """
+
+    def __init__(self,
+                 aggregation: str = 'sum',
+                 pre_cfg: OptConfigType = None,
+                 post_cfg: OptConfigType = None,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+
+        assert aggregation in ['sum', 'concat']
+
+        self.aggregation = aggregation
+        self.with_post = post_cfg is not None
+        self.with_pre = pre_cfg is not None
+        # build pre/post processing modules
+        if self.with_post:
+            self.post_module = build_plugin_layer(post_cfg, '_post_module')[1]
+        if self.with_pre:
+            self.pre_module = build_plugin_layer(pre_cfg, '_pre_module')[1]
+
+    def forward(self,
+                feats: Tuple[Tensor],
+                rois: Tensor,
+                roi_scale_factor: Optional[float] = None) -> Tensor:
+        """Extractor ROI feats.
+
+        Args:
+            feats (Tuple[Tensor]): Multi-scale features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            roi_scale_factor (Optional[float]): RoI scale factor.
+                Defaults to None.
+
+        Returns:
+            Tensor: RoI feature.
+        """
+        out_size = self.roi_layers[0].output_size
+        num_levels = len(feats)
+        roi_feats = feats[0].new_zeros(
+            rois.size(0), self.out_channels, *out_size)
+
+        # some times rois is an empty tensor
+        if roi_feats.shape[0] == 0:
+            return roi_feats
+
+        if num_levels == 1:
+            return self.roi_layers[0](feats[0], rois)
+
+        if roi_scale_factor is not None:
+            rois = self.roi_rescale(rois, roi_scale_factor)
+
+        # mark the starting channels for concat mode
+        start_channels = 0
+        for i in range(num_levels):
+            roi_feats_t = self.roi_layers[i](feats[i], rois)
+            end_channels = start_channels + roi_feats_t.size(1)
+            if self.with_pre:
+                # apply pre-processing to a RoI extracted from each layer
+                roi_feats_t = self.pre_module(roi_feats_t)
+            if self.aggregation == 'sum':
+                # and sum them all
+                roi_feats += roi_feats_t
+            else:
+                # and concat them along channel dimension
+                roi_feats[:, start_channels:end_channels] = roi_feats_t
+            # update channels starting position
+            start_channels = end_channels
+        # check if concat channels match at the end
+        if self.aggregation == 'concat':
+            assert start_channels == self.out_channels
+
+        if self.with_post:
+            # apply post-processing before return the result
+            roi_feats = self.post_module(roi_feats)
+        return roi_feats
diff --git a/mmde/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/mmde/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..59229e0b0b0a18dff81abca6f5c20cb50b0d542c
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py
@@ -0,0 +1,119 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptMultiConfig
+from .base_roi_extractor import BaseRoIExtractor
+
+
+@MODELS.register_module()
+class SingleRoIExtractor(BaseRoIExtractor):
+    """Extract RoI features from a single level feature map.
+
+    If there are multiple input feature levels, each RoI is mapped to a level
+    according to its scale. The mapping rule is proposed in
+    `FPN <https://arxiv.org/abs/1612.03144>`_.
+
+    Args:
+        roi_layer (:obj:`ConfigDict` or dict): Specify RoI layer type and
+            arguments.
+        out_channels (int): Output channels of RoI layers.
+        featmap_strides (List[int]): Strides of input feature maps.
+        finest_scale (int): Scale threshold of mapping to level 0.
+            Defaults to 56.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 roi_layer: ConfigType,
+                 out_channels: int,
+                 featmap_strides: List[int],
+                 finest_scale: int = 56,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            roi_layer=roi_layer,
+            out_channels=out_channels,
+            featmap_strides=featmap_strides,
+            init_cfg=init_cfg)
+        self.finest_scale = finest_scale
+
+    def map_roi_levels(self, rois: Tensor, num_levels: int) -> Tensor:
+        """Map rois to corresponding feature levels by scales.
+
+        - scale < finest_scale * 2: level 0
+        - finest_scale * 2 <= scale < finest_scale * 4: level 1
+        - finest_scale * 4 <= scale < finest_scale * 8: level 2
+        - scale >= finest_scale * 8: level 3
+
+        Args:
+            rois (Tensor): Input RoIs, shape (k, 5).
+            num_levels (int): Total level number.
+
+        Returns:
+            Tensor: Level index (0-based) of each RoI, shape (k, )
+        """
+        scale = torch.sqrt(
+            (rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2]))
+        target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6))
+        target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long()
+        return target_lvls
+
+    def forward(self,
+                feats: Tuple[Tensor],
+                rois: Tensor,
+                roi_scale_factor: Optional[float] = None):
+        """Extractor ROI feats.
+
+        Args:
+            feats (Tuple[Tensor]): Multi-scale features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            roi_scale_factor (Optional[float]): RoI scale factor.
+                Defaults to None.
+
+        Returns:
+            Tensor: RoI feature.
+        """
+        # convert fp32 to fp16 when amp is on
+        rois = rois.type_as(feats[0])
+        out_size = self.roi_layers[0].output_size
+        num_levels = len(feats)
+        roi_feats = feats[0].new_zeros(
+            rois.size(0), self.out_channels, *out_size)
+
+        # TODO: remove this when parrots supports
+        if torch.__version__ == 'parrots':
+            roi_feats.requires_grad = True
+
+        if num_levels == 1:
+            if len(rois) == 0:
+                return roi_feats
+            return self.roi_layers[0](feats[0], rois)
+
+        target_lvls = self.map_roi_levels(rois, num_levels)
+
+        if roi_scale_factor is not None:
+            rois = self.roi_rescale(rois, roi_scale_factor)
+
+        for i in range(num_levels):
+            mask = target_lvls == i
+            inds = mask.nonzero(as_tuple=False).squeeze(1)
+            if inds.numel() > 0:
+                rois_ = rois[inds]
+                roi_feats_t = self.roi_layers[i](feats[i], rois_)
+                roi_feats[inds] = roi_feats_t
+            else:
+                # Sometimes some pyramid levels will not be used for RoI
+                # feature extraction and this will cause an incomplete
+                # computation graph in one GPU, which is different from those
+                # in other GPUs and will cause a hanging error.
+                # Therefore, we add it to ensure each feature pyramid is
+                # included in the computation graph to avoid runtime bugs.
+                roi_feats += sum(
+                    x.view(-1)[0]
+                    for x in self.parameters()) * 0. + feats[i].sum() * 0.
+        return roi_feats
diff --git a/mmde/mmdet/models/roi_heads/scnet_roi_head.py b/mmde/mmdet/models/roi_heads/scnet_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6d2bc1915bae38011cc75a720e48ed53b51ddb5
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/scnet_roi_head.py
@@ -0,0 +1,677 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList, OptConfigType
+from ..layers import adaptive_avg_pool2d
+from ..task_modules.samplers import SamplingResult
+from ..utils import empty_instances, unpack_gt_instances
+from .cascade_roi_head import CascadeRoIHead
+
+
+@MODELS.register_module()
+class SCNetRoIHead(CascadeRoIHead):
+    """RoIHead for `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        num_stages (int): number of cascade stages.
+        stage_loss_weights (list): loss weight of cascade stages.
+        semantic_roi_extractor (dict): config to init semantic roi extractor.
+        semantic_head (dict): config to init semantic head.
+        feat_relay_head (dict): config to init feature_relay_head.
+        glbctx_head (dict): config to init global context head.
+    """
+
+    def __init__(self,
+                 num_stages: int,
+                 stage_loss_weights: List[float],
+                 semantic_roi_extractor: OptConfigType = None,
+                 semantic_head: OptConfigType = None,
+                 feat_relay_head: OptConfigType = None,
+                 glbctx_head: OptConfigType = None,
+                 **kwargs) -> None:
+        super().__init__(
+            num_stages=num_stages,
+            stage_loss_weights=stage_loss_weights,
+            **kwargs)
+        assert self.with_bbox and self.with_mask
+        assert not self.with_shared_head  # shared head is not supported
+
+        if semantic_head is not None:
+            self.semantic_roi_extractor = MODELS.build(semantic_roi_extractor)
+            self.semantic_head = MODELS.build(semantic_head)
+
+        if feat_relay_head is not None:
+            self.feat_relay_head = MODELS.build(feat_relay_head)
+
+        if glbctx_head is not None:
+            self.glbctx_head = MODELS.build(glbctx_head)
+
+    def init_mask_head(self, mask_roi_extractor: ConfigType,
+                       mask_head: ConfigType) -> None:
+        """Initialize ``mask_head``"""
+        if mask_roi_extractor is not None:
+            self.mask_roi_extractor = MODELS.build(mask_roi_extractor)
+            self.mask_head = MODELS.build(mask_head)
+
+    # TODO move to base_roi_head later
+    @property
+    def with_semantic(self) -> bool:
+        """bool: whether the head has semantic head"""
+        return hasattr(self,
+                       'semantic_head') and self.semantic_head is not None
+
+    @property
+    def with_feat_relay(self) -> bool:
+        """bool: whether the head has feature relay head"""
+        return (hasattr(self, 'feat_relay_head')
+                and self.feat_relay_head is not None)
+
+    @property
+    def with_glbctx(self) -> bool:
+        """bool: whether the head has global context head"""
+        return hasattr(self, 'glbctx_head') and self.glbctx_head is not None
+
+    def _fuse_glbctx(self, roi_feats: Tensor, glbctx_feat: Tensor,
+                     rois: Tensor) -> Tensor:
+        """Fuse global context feats with roi feats.
+
+        Args:
+            roi_feats (Tensor): RoI features.
+            glbctx_feat (Tensor): Global context feature..
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+
+        Returns:
+            Tensor: Fused feature.
+        """
+        assert roi_feats.size(0) == rois.size(0)
+        # RuntimeError: isDifferentiableType(variable.scalar_type())
+        # INTERNAL ASSERT FAILED if detach() is not used when calling
+        # roi_head.predict().
+        img_inds = torch.unique(rois[:, 0].detach().cpu(), sorted=True).long()
+        fused_feats = torch.zeros_like(roi_feats)
+        for img_id in img_inds:
+            inds = (rois[:, 0] == img_id.item())
+            fused_feats[inds] = roi_feats[inds] + glbctx_feat[img_id]
+        return fused_feats
+
+    def _slice_pos_feats(self, feats: Tensor,
+                         sampling_results: List[SamplingResult]) -> Tensor:
+        """Get features from pos rois.
+
+        Args:
+            feats (Tensor): Input features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+
+        Returns:
+            Tensor: Sliced features.
+        """
+        num_rois = [res.priors.size(0) for res in sampling_results]
+        num_pos_rois = [res.pos_priors.size(0) for res in sampling_results]
+        inds = torch.zeros(sum(num_rois), dtype=torch.bool)
+        start = 0
+        for i in range(len(num_rois)):
+            start = 0 if i == 0 else start + num_rois[i - 1]
+            stop = start + num_pos_rois[i]
+            inds[start:stop] = 1
+        sliced_feats = feats[inds]
+        return sliced_feats
+
+    def _bbox_forward(self,
+                      stage: int,
+                      x: Tuple[Tensor],
+                      rois: Tensor,
+                      semantic_feat: Optional[Tensor] = None,
+                      glbctx_feat: Optional[Tensor] = None) -> dict:
+        """Box head forward function used in both training and testing.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            semantic_feat (Tensor): Semantic feature. Defaults to None.
+            glbctx_feat (Tensor): Global context feature. Defaults to None.
+
+        Returns:
+             dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+        """
+        bbox_roi_extractor = self.bbox_roi_extractor[stage]
+        bbox_head = self.bbox_head[stage]
+        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
+                                        rois)
+        if self.with_semantic and semantic_feat is not None:
+            bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat],
+                                                             rois)
+            if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]:
+                bbox_semantic_feat = adaptive_avg_pool2d(
+                    bbox_semantic_feat, bbox_feats.shape[-2:])
+            bbox_feats += bbox_semantic_feat
+        if self.with_glbctx and glbctx_feat is not None:
+            bbox_feats = self._fuse_glbctx(bbox_feats, glbctx_feat, rois)
+        cls_score, bbox_pred, relayed_feat = bbox_head(
+            bbox_feats, return_shared_feat=True)
+
+        bbox_results = dict(
+            cls_score=cls_score,
+            bbox_pred=bbox_pred,
+            relayed_feat=relayed_feat)
+        return bbox_results
+
+    def _mask_forward(self,
+                      x: Tuple[Tensor],
+                      rois: Tensor,
+                      semantic_feat: Optional[Tensor] = None,
+                      glbctx_feat: Optional[Tensor] = None,
+                      relayed_feat: Optional[Tensor] = None) -> dict:
+        """Mask head forward function used in both training and testing.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            semantic_feat (Tensor): Semantic feature. Defaults to None.
+            glbctx_feat (Tensor): Global context feature. Defaults to None.
+            relayed_feat (Tensor): Relayed feature. Defaults to None.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `mask_preds` (Tensor): Mask prediction.
+        """
+        mask_feats = self.mask_roi_extractor(
+            x[:self.mask_roi_extractor.num_inputs], rois)
+        if self.with_semantic and semantic_feat is not None:
+            mask_semantic_feat = self.semantic_roi_extractor([semantic_feat],
+                                                             rois)
+            if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]:
+                mask_semantic_feat = F.adaptive_avg_pool2d(
+                    mask_semantic_feat, mask_feats.shape[-2:])
+            mask_feats += mask_semantic_feat
+        if self.with_glbctx and glbctx_feat is not None:
+            mask_feats = self._fuse_glbctx(mask_feats, glbctx_feat, rois)
+        if self.with_feat_relay and relayed_feat is not None:
+            mask_feats = mask_feats + relayed_feat
+        mask_preds = self.mask_head(mask_feats)
+        mask_results = dict(mask_preds=mask_preds)
+
+        return mask_results
+
+    def bbox_loss(self,
+                  stage: int,
+                  x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult],
+                  semantic_feat: Optional[Tensor] = None,
+                  glbctx_feat: Optional[Tensor] = None) -> dict:
+        """Run forward function and calculate loss for box head in training.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): List of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+            semantic_feat (Tensor): Semantic feature. Defaults to None.
+            glbctx_feat (Tensor): Global context feature. Defaults to None.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+                - `loss_bbox` (dict): A dictionary of bbox loss components.
+                - `rois` (Tensor): RoIs with the shape (n, 5) where the first
+                  column indicates batch id of each RoI.
+                - `bbox_targets` (tuple):  Ground truth for proposals in a
+                  single image. Containing the following list of Tensors:
+                  (labels, label_weights, bbox_targets, bbox_weights)
+        """
+        bbox_head = self.bbox_head[stage]
+        rois = bbox2roi([res.priors for res in sampling_results])
+        bbox_results = self._bbox_forward(
+            stage,
+            x,
+            rois,
+            semantic_feat=semantic_feat,
+            glbctx_feat=glbctx_feat)
+        bbox_results.update(rois=rois)
+
+        bbox_loss_and_target = bbox_head.loss_and_target(
+            cls_score=bbox_results['cls_score'],
+            bbox_pred=bbox_results['bbox_pred'],
+            rois=rois,
+            sampling_results=sampling_results,
+            rcnn_train_cfg=self.train_cfg[stage])
+
+        bbox_results.update(bbox_loss_and_target)
+        return bbox_results
+
+    def mask_loss(self,
+                  x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult],
+                  batch_gt_instances: InstanceList,
+                  semantic_feat: Optional[Tensor] = None,
+                  glbctx_feat: Optional[Tensor] = None,
+                  relayed_feat: Optional[Tensor] = None) -> dict:
+        """Run forward function and calculate loss for mask head in training.
+
+        Args:
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            semantic_feat (Tensor): Semantic feature. Defaults to None.
+            glbctx_feat (Tensor): Global context feature. Defaults to None.
+            relayed_feat (Tensor): Relayed feature. Defaults to None.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `mask_preds` (Tensor): Mask prediction.
+                - `loss_mask` (dict): A dictionary of mask loss components.
+        """
+        pos_rois = bbox2roi([res.pos_priors for res in sampling_results])
+        mask_results = self._mask_forward(
+            x,
+            pos_rois,
+            semantic_feat=semantic_feat,
+            glbctx_feat=glbctx_feat,
+            relayed_feat=relayed_feat)
+
+        mask_loss_and_target = self.mask_head.loss_and_target(
+            mask_preds=mask_results['mask_preds'],
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            rcnn_train_cfg=self.train_cfg[-1])
+        mask_results.update(mask_loss_and_target)
+
+        return mask_results
+
+    def semantic_loss(self, x: Tuple[Tensor],
+                      batch_data_samples: SampleList) -> dict:
+        """Semantic segmentation loss.
+
+        Args:
+            x (Tuple[Tensor]): Tuple of multi-level img features.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `semantic_feat` (Tensor): Semantic feature.
+                - `loss_seg` (dict): Semantic segmentation loss.
+        """
+        gt_semantic_segs = [
+            data_sample.gt_sem_seg.sem_seg
+            for data_sample in batch_data_samples
+        ]
+        gt_semantic_segs = torch.stack(gt_semantic_segs)
+        semantic_pred, semantic_feat = self.semantic_head(x)
+        loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_segs)
+
+        semantic_results = dict(loss_seg=loss_seg, semantic_feat=semantic_feat)
+
+        return semantic_results
+
+    def global_context_loss(self, x: Tuple[Tensor],
+                            batch_gt_instances: InstanceList) -> dict:
+        """Global context loss.
+
+        Args:
+            x (Tuple[Tensor]): Tuple of multi-level img features.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `glbctx_feat` (Tensor): Global context feature.
+                - `loss_glbctx` (dict): Global context loss.
+        """
+        gt_labels = [
+            gt_instances.labels for gt_instances in batch_gt_instances
+        ]
+        mc_pred, glbctx_feat = self.glbctx_head(x)
+        loss_glbctx = self.glbctx_head.loss(mc_pred, gt_labels)
+        global_context_results = dict(
+            loss_glbctx=loss_glbctx, glbctx_feat=glbctx_feat)
+
+        return global_context_results
+
+    def loss(self, x: Tensor, rpn_results_list: InstanceList,
+             batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \
+            = outputs
+
+        losses = dict()
+
+        # semantic segmentation branch
+        if self.with_semantic:
+            semantic_results = self.semantic_loss(
+                x=x, batch_data_samples=batch_data_samples)
+            losses['loss_semantic_seg'] = semantic_results['loss_seg']
+            semantic_feat = semantic_results['semantic_feat']
+        else:
+            semantic_feat = None
+
+        # global context branch
+        if self.with_glbctx:
+            global_context_results = self.global_context_loss(
+                x=x, batch_gt_instances=batch_gt_instances)
+            losses['loss_glbctx'] = global_context_results['loss_glbctx']
+            glbctx_feat = global_context_results['glbctx_feat']
+        else:
+            glbctx_feat = None
+
+        results_list = rpn_results_list
+        num_imgs = len(batch_img_metas)
+        for stage in range(self.num_stages):
+            stage_loss_weight = self.stage_loss_weights[stage]
+
+            # assign gts and sample proposals
+            sampling_results = []
+            bbox_assigner = self.bbox_assigner[stage]
+            bbox_sampler = self.bbox_sampler[stage]
+            for i in range(num_imgs):
+                results = results_list[i]
+                # rename rpn_results.bboxes to rpn_results.priors
+                results.priors = results.pop('bboxes')
+
+                assign_result = bbox_assigner.assign(
+                    results, batch_gt_instances[i],
+                    batch_gt_instances_ignore[i])
+                sampling_result = bbox_sampler.sample(
+                    assign_result,
+                    results,
+                    batch_gt_instances[i],
+                    feats=[lvl_feat[i][None] for lvl_feat in x])
+                sampling_results.append(sampling_result)
+
+            # bbox head forward and loss
+            bbox_results = self.bbox_loss(
+                stage=stage,
+                x=x,
+                sampling_results=sampling_results,
+                semantic_feat=semantic_feat,
+                glbctx_feat=glbctx_feat)
+
+            for name, value in bbox_results['loss_bbox'].items():
+                losses[f's{stage}.{name}'] = (
+                    value * stage_loss_weight if 'loss' in name else value)
+
+            # refine bboxes
+            if stage < self.num_stages - 1:
+                bbox_head = self.bbox_head[stage]
+                with torch.no_grad():
+                    results_list = bbox_head.refine_bboxes(
+                        sampling_results=sampling_results,
+                        bbox_results=bbox_results,
+                        batch_img_metas=batch_img_metas)
+
+        if self.with_feat_relay:
+            relayed_feat = self._slice_pos_feats(bbox_results['relayed_feat'],
+                                                 sampling_results)
+            relayed_feat = self.feat_relay_head(relayed_feat)
+        else:
+            relayed_feat = None
+
+        # mask head forward and loss
+        mask_results = self.mask_loss(
+            x=x,
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            semantic_feat=semantic_feat,
+            glbctx_feat=glbctx_feat,
+            relayed_feat=relayed_feat)
+        mask_stage_loss_weight = sum(self.stage_loss_weights)
+        losses['loss_mask'] = mask_stage_loss_weight * mask_results[
+            'loss_mask']['loss_mask']
+
+        return losses
+
+    def predict(self,
+                x: Tuple[Tensor],
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from upstream network. Each
+                has shape (N, C, H, W).
+            rpn_results_list (list[:obj:`InstanceData`]): list of region
+                proposals.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results to
+                the original image. Defaults to False.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        else:
+            semantic_feat = None
+
+        if self.with_glbctx:
+            _, glbctx_feat = self.glbctx_head(x)
+        else:
+            glbctx_feat = None
+
+        # TODO: nms_op in mmcv need be enhanced, the bbox result may get
+        #  difference when not rescale in bbox_head
+
+        # If it has the mask branch, the bbox branch does not need
+        # to be scaled to the original image scale, because the mask
+        # branch will scale both bbox and mask at the same time.
+        bbox_rescale = rescale if not self.with_mask else False
+        results_list = self.predict_bbox(
+            x=x,
+            semantic_feat=semantic_feat,
+            glbctx_feat=glbctx_feat,
+            batch_img_metas=batch_img_metas,
+            rpn_results_list=rpn_results_list,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=bbox_rescale)
+
+        if self.with_mask:
+            results_list = self.predict_mask(
+                x=x,
+                semantic_heat=semantic_feat,
+                glbctx_feat=glbctx_feat,
+                batch_img_metas=batch_img_metas,
+                results_list=results_list,
+                rescale=rescale)
+
+        return results_list
+
+    def predict_mask(self,
+                     x: Tuple[Tensor],
+                     semantic_heat: Tensor,
+                     glbctx_feat: Tensor,
+                     batch_img_metas: List[dict],
+                     results_list: List[InstanceData],
+                     rescale: bool = False) -> List[InstanceData]:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            semantic_feat (Tensor): Semantic feature.
+            glbctx_feat (Tensor): Global context feature.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        bboxes = [res.bboxes for res in results_list]
+        mask_rois = bbox2roi(bboxes)
+        if mask_rois.shape[0] == 0:
+            results_list = empty_instances(
+                batch_img_metas=batch_img_metas,
+                device=mask_rois.device,
+                task_type='mask',
+                instance_results=results_list,
+                mask_thr_binary=self.test_cfg.mask_thr_binary)
+            return results_list
+
+        bboxes_results = self._bbox_forward(
+            stage=-1,
+            x=x,
+            rois=mask_rois,
+            semantic_feat=semantic_heat,
+            glbctx_feat=glbctx_feat)
+        relayed_feat = bboxes_results['relayed_feat']
+        relayed_feat = self.feat_relay_head(relayed_feat)
+
+        mask_results = self._mask_forward(
+            x=x,
+            rois=mask_rois,
+            semantic_feat=semantic_heat,
+            glbctx_feat=glbctx_feat,
+            relayed_feat=relayed_feat)
+        mask_preds = mask_results['mask_preds']
+
+        # split batch mask prediction back to each image
+        num_bbox_per_img = tuple(len(_bbox) for _bbox in bboxes)
+        mask_preds = mask_preds.split(num_bbox_per_img, 0)
+
+        results_list = self.mask_head.predict_by_feat(
+            mask_preds=mask_preds,
+            results_list=results_list,
+            batch_img_metas=batch_img_metas,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=rescale)
+
+        return results_list
+
+    def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+                batch_data_samples: SampleList) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            x (List[Tensor]): Multi-level features that may have different
+                resolutions.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+
+        Returns
+            tuple: A tuple of features from ``bbox_head`` and ``mask_head``
+            forward.
+        """
+        results = ()
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        else:
+            semantic_feat = None
+
+        if self.with_glbctx:
+            _, glbctx_feat = self.glbctx_head(x)
+        else:
+            glbctx_feat = None
+
+        proposals = [rpn_results.bboxes for rpn_results in rpn_results_list]
+        num_proposals_per_img = tuple(len(p) for p in proposals)
+        rois = bbox2roi(proposals)
+        # bbox head
+        if self.with_bbox:
+            rois, cls_scores, bbox_preds = self._refine_roi(
+                x=x,
+                rois=rois,
+                semantic_feat=semantic_feat,
+                glbctx_feat=glbctx_feat,
+                batch_img_metas=batch_img_metas,
+                num_proposals_per_img=num_proposals_per_img)
+            results = results + (cls_scores, bbox_preds)
+        # mask head
+        if self.with_mask:
+            rois = torch.cat(rois)
+            bboxes_results = self._bbox_forward(
+                stage=-1,
+                x=x,
+                rois=rois,
+                semantic_feat=semantic_feat,
+                glbctx_feat=glbctx_feat)
+            relayed_feat = bboxes_results['relayed_feat']
+            relayed_feat = self.feat_relay_head(relayed_feat)
+            mask_results = self._mask_forward(
+                x=x,
+                rois=rois,
+                semantic_feat=semantic_feat,
+                glbctx_feat=glbctx_feat,
+                relayed_feat=relayed_feat)
+            mask_preds = mask_results['mask_preds']
+            mask_preds = mask_preds.split(num_proposals_per_img, 0)
+            results = results + (mask_preds, )
+        return results
diff --git a/mmde/mmdet/models/roi_heads/shared_heads/__init__.py b/mmde/mmdet/models/roi_heads/shared_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d56636ab34d1dd2592828238099bcdccf179d6d3
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/shared_heads/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .res_layer import ResLayer
+
+__all__ = ['ResLayer']
diff --git a/mmde/mmdet/models/roi_heads/shared_heads/res_layer.py b/mmde/mmdet/models/roi_heads/shared_heads/res_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9210cb928fec92135a195d44d13a8588382b947
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/shared_heads/res_layer.py
@@ -0,0 +1,79 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmdet.models.backbones import ResNet
+from mmdet.models.layers import ResLayer as _ResLayer
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class ResLayer(BaseModule):
+
+    def __init__(self,
+                 depth,
+                 stage=3,
+                 stride=2,
+                 dilation=1,
+                 style='pytorch',
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=True,
+                 with_cp=False,
+                 dcn=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(ResLayer, self).__init__(init_cfg)
+
+        self.norm_eval = norm_eval
+        self.norm_cfg = norm_cfg
+        self.stage = stage
+        self.fp16_enabled = False
+        block, stage_blocks = ResNet.arch_settings[depth]
+        stage_block = stage_blocks[stage]
+        planes = 64 * 2**stage
+        inplanes = 64 * 2**(stage - 1) * block.expansion
+
+        res_layer = _ResLayer(
+            block,
+            inplanes,
+            planes,
+            stage_block,
+            stride=stride,
+            dilation=dilation,
+            style=style,
+            with_cp=with_cp,
+            norm_cfg=self.norm_cfg,
+            dcn=dcn)
+        self.add_module(f'layer{stage + 1}', res_layer)
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is a deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        res_layer = getattr(self, f'layer{self.stage + 1}')
+        out = res_layer(x)
+        return out
+
+    def train(self, mode=True):
+        super(ResLayer, self).train(mode)
+        if self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eval()
diff --git a/mmde/mmdet/models/roi_heads/sparse_roi_head.py b/mmde/mmdet/models/roi_heads/sparse_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..19c3e1e335ca4e4a9d5befcbffcf4665b459cb5a
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/sparse_roi_head.py
@@ -0,0 +1,601 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.task_modules.samplers import PseudoSampler
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList, OptConfigType
+from ..utils.misc import empty_instances, unpack_gt_instances
+from .cascade_roi_head import CascadeRoIHead
+
+
+@MODELS.register_module()
+class SparseRoIHead(CascadeRoIHead):
+    r"""The RoIHead for `Sparse R-CNN: End-to-End Object Detection with
+    Learnable Proposals <https://arxiv.org/abs/2011.12450>`_
+    and `Instances as Queries <http://arxiv.org/abs/2105.01928>`_
+
+    Args:
+        num_stages (int): Number of stage whole iterative process.
+            Defaults to 6.
+        stage_loss_weights (Tuple[float]): The loss
+            weight of each stage. By default all stages have
+            the same weight 1.
+        bbox_roi_extractor (:obj:`ConfigDict` or dict): Config of box
+            roi extractor.
+        mask_roi_extractor (:obj:`ConfigDict` or dict): Config of mask
+            roi extractor.
+        bbox_head (:obj:`ConfigDict` or dict): Config of box head.
+        mask_head (:obj:`ConfigDict` or dict): Config of mask head.
+        train_cfg (:obj:`ConfigDict` or dict, Optional): Configuration
+            information in train stage. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, Optional): Configuration
+            information in test stage. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_stages: int = 6,
+                 stage_loss_weights: Tuple[float] = (1, 1, 1, 1, 1, 1),
+                 proposal_feature_channel: int = 256,
+                 bbox_roi_extractor: ConfigType = dict(
+                     type='SingleRoIExtractor',
+                     roi_layer=dict(
+                         type='RoIAlign', output_size=7, sampling_ratio=2),
+                     out_channels=256,
+                     featmap_strides=[4, 8, 16, 32]),
+                 mask_roi_extractor: OptConfigType = None,
+                 bbox_head: ConfigType = dict(
+                     type='DIIHead',
+                     num_classes=80,
+                     num_fcs=2,
+                     num_heads=8,
+                     num_cls_fcs=1,
+                     num_reg_fcs=3,
+                     feedforward_channels=2048,
+                     hidden_channels=256,
+                     dropout=0.0,
+                     roi_feat_size=7,
+                     ffn_act_cfg=dict(type='ReLU', inplace=True)),
+                 mask_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptConfigType = None) -> None:
+        assert bbox_roi_extractor is not None
+        assert bbox_head is not None
+        assert len(stage_loss_weights) == num_stages
+        self.num_stages = num_stages
+        self.stage_loss_weights = stage_loss_weights
+        self.proposal_feature_channel = proposal_feature_channel
+        super().__init__(
+            num_stages=num_stages,
+            stage_loss_weights=stage_loss_weights,
+            bbox_roi_extractor=bbox_roi_extractor,
+            mask_roi_extractor=mask_roi_extractor,
+            bbox_head=bbox_head,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        # train_cfg would be None when run the test.py
+        if train_cfg is not None:
+            for stage in range(num_stages):
+                assert isinstance(self.bbox_sampler[stage], PseudoSampler), \
+                    'Sparse R-CNN and QueryInst only support `PseudoSampler`'
+
+    def bbox_loss(self, stage: int, x: Tuple[Tensor],
+                  results_list: InstanceList, object_feats: Tensor,
+                  batch_img_metas: List[dict],
+                  batch_gt_instances: InstanceList) -> dict:
+        """Perform forward propagation and loss calculation of the bbox head on
+        the features of the upstream network.
+
+        Args:
+            stage (int): The current stage in iterative process.
+            x (tuple[Tensor]): List of multi-level img features.
+            results_list (List[:obj:`InstanceData`]) : List of region
+                proposals.
+            object_feats (Tensor): The object feature extracted from
+                the previous stage.
+            batch_img_metas (list[dict]): Meta information of each image.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+
+        Returns:
+            dict[str, Tensor]: Usually returns a dictionary with keys:
+
+            - `cls_score` (Tensor): Classification scores.
+            - `bbox_pred` (Tensor): Box energies / deltas.
+            - `bbox_feats` (Tensor): Extract bbox RoI features.
+            - `loss_bbox` (dict): A dictionary of bbox loss components.
+        """
+        proposal_list = [res.bboxes for res in results_list]
+        rois = bbox2roi(proposal_list)
+        bbox_results = self._bbox_forward(stage, x, rois, object_feats,
+                                          batch_img_metas)
+        imgs_whwh = torch.cat(
+            [res.imgs_whwh[None, ...] for res in results_list])
+        cls_pred_list = bbox_results['detached_cls_scores']
+        proposal_list = bbox_results['detached_proposals']
+
+        sampling_results = []
+        bbox_head = self.bbox_head[stage]
+        for i in range(len(batch_img_metas)):
+            pred_instances = InstanceData()
+            # TODO: Enhance the logic
+            pred_instances.bboxes = proposal_list[i]  # for assinger
+            pred_instances.scores = cls_pred_list[i]
+            pred_instances.priors = proposal_list[i]  # for sampler
+
+            assign_result = self.bbox_assigner[stage].assign(
+                pred_instances=pred_instances,
+                gt_instances=batch_gt_instances[i],
+                gt_instances_ignore=None,
+                img_meta=batch_img_metas[i])
+
+            sampling_result = self.bbox_sampler[stage].sample(
+                assign_result, pred_instances, batch_gt_instances[i])
+            sampling_results.append(sampling_result)
+
+        bbox_results.update(sampling_results=sampling_results)
+
+        cls_score = bbox_results['cls_score']
+        decoded_bboxes = bbox_results['decoded_bboxes']
+        cls_score = cls_score.view(-1, cls_score.size(-1))
+        decoded_bboxes = decoded_bboxes.view(-1, 4)
+        bbox_loss_and_target = bbox_head.loss_and_target(
+            cls_score,
+            decoded_bboxes,
+            sampling_results,
+            self.train_cfg[stage],
+            imgs_whwh=imgs_whwh,
+            concat=True)
+        bbox_results.update(bbox_loss_and_target)
+
+        # propose for the new proposal_list
+        proposal_list = []
+        for idx in range(len(batch_img_metas)):
+            results = InstanceData()
+            results.imgs_whwh = results_list[idx].imgs_whwh
+            results.bboxes = bbox_results['detached_proposals'][idx]
+            proposal_list.append(results)
+        bbox_results.update(results_list=proposal_list)
+        return bbox_results
+
+    def _bbox_forward(self, stage: int, x: Tuple[Tensor], rois: Tensor,
+                      object_feats: Tensor,
+                      batch_img_metas: List[dict]) -> dict:
+        """Box head forward function used in both training and testing. Returns
+        all regression, classification results and a intermediate feature.
+
+        Args:
+            stage (int): The current stage in iterative process.
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+                Each dimension means (img_index, x1, y1, x2, y2).
+            object_feats (Tensor): The object feature extracted from
+                the previous stage.
+            batch_img_metas (list[dict]): Meta information of each image.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of bbox head outputs,
+            Containing the following results:
+
+            - cls_score (Tensor): The score of each class, has
+              shape (batch_size, num_proposals, num_classes)
+              when use focal loss or
+              (batch_size, num_proposals, num_classes+1)
+              otherwise.
+            - decoded_bboxes (Tensor): The regression results
+              with shape (batch_size, num_proposal, 4).
+              The last dimension 4 represents
+              [tl_x, tl_y, br_x, br_y].
+            - object_feats (Tensor): The object feature extracted
+              from current stage
+            - detached_cls_scores (list[Tensor]): The detached
+              classification results, length is batch_size, and
+              each tensor has shape (num_proposal, num_classes).
+            - detached_proposals (list[tensor]): The detached
+              regression results, length is batch_size, and each
+              tensor has shape (num_proposal, 4). The last
+              dimension 4 represents [tl_x, tl_y, br_x, br_y].
+        """
+        num_imgs = len(batch_img_metas)
+        bbox_roi_extractor = self.bbox_roi_extractor[stage]
+        bbox_head = self.bbox_head[stage]
+        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
+                                        rois)
+        cls_score, bbox_pred, object_feats, attn_feats = bbox_head(
+            bbox_feats, object_feats)
+
+        fake_bbox_results = dict(
+            rois=rois,
+            bbox_targets=(rois.new_zeros(len(rois), dtype=torch.long), None),
+            bbox_pred=bbox_pred.view(-1, bbox_pred.size(-1)),
+            cls_score=cls_score.view(-1, cls_score.size(-1)))
+        fake_sampling_results = [
+            InstanceData(pos_is_gt=rois.new_zeros(object_feats.size(1)))
+            for _ in range(len(batch_img_metas))
+        ]
+
+        results_list = bbox_head.refine_bboxes(
+            sampling_results=fake_sampling_results,
+            bbox_results=fake_bbox_results,
+            batch_img_metas=batch_img_metas)
+        proposal_list = [res.bboxes for res in results_list]
+        bbox_results = dict(
+            cls_score=cls_score,
+            decoded_bboxes=torch.cat(proposal_list),
+            object_feats=object_feats,
+            attn_feats=attn_feats,
+            # detach then use it in label assign
+            detached_cls_scores=[
+                cls_score[i].detach() for i in range(num_imgs)
+            ],
+            detached_proposals=[item.detach() for item in proposal_list])
+
+        return bbox_results
+
+    def _mask_forward(self, stage: int, x: Tuple[Tensor], rois: Tensor,
+                      attn_feats) -> dict:
+        """Mask head forward function used in both training and testing.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            attn_feats (Tensot): Intermediate feature get from the last
+                diihead, has shape
+                (batch_size*num_proposals, feature_dimensions)
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+            - `mask_preds` (Tensor): Mask prediction.
+        """
+        mask_roi_extractor = self.mask_roi_extractor[stage]
+        mask_head = self.mask_head[stage]
+        mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs],
+                                        rois)
+        # do not support caffe_c4 model anymore
+        mask_preds = mask_head(mask_feats, attn_feats)
+
+        mask_results = dict(mask_preds=mask_preds)
+        return mask_results
+
+    def mask_loss(self, stage: int, x: Tuple[Tensor], bbox_results: dict,
+                  batch_gt_instances: InstanceList,
+                  rcnn_train_cfg: ConfigDict) -> dict:
+        """Run forward function and calculate loss for mask head in training.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            bbox_results (dict): Results obtained from `bbox_loss`.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+            - `mask_preds` (Tensor): Mask prediction.
+            - `loss_mask` (dict): A dictionary of mask loss components.
+        """
+        attn_feats = bbox_results['attn_feats']
+        sampling_results = bbox_results['sampling_results']
+
+        pos_rois = bbox2roi([res.pos_priors for res in sampling_results])
+
+        attn_feats = torch.cat([
+            feats[res.pos_inds]
+            for (feats, res) in zip(attn_feats, sampling_results)
+        ])
+        mask_results = self._mask_forward(stage, x, pos_rois, attn_feats)
+
+        mask_loss_and_target = self.mask_head[stage].loss_and_target(
+            mask_preds=mask_results['mask_preds'],
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            rcnn_train_cfg=rcnn_train_cfg)
+        mask_results.update(mask_loss_and_target)
+
+        return mask_results
+
+    def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+             batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rpn_results_list (List[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: a dictionary of loss components of all stage.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \
+            = outputs
+
+        object_feats = torch.cat(
+            [res.pop('features')[None, ...] for res in rpn_results_list])
+        results_list = rpn_results_list
+        losses = {}
+        for stage in range(self.num_stages):
+            stage_loss_weight = self.stage_loss_weights[stage]
+
+            # bbox head forward and loss
+            bbox_results = self.bbox_loss(
+                stage=stage,
+                x=x,
+                object_feats=object_feats,
+                results_list=results_list,
+                batch_img_metas=batch_img_metas,
+                batch_gt_instances=batch_gt_instances)
+
+            for name, value in bbox_results['loss_bbox'].items():
+                losses[f's{stage}.{name}'] = (
+                    value * stage_loss_weight if 'loss' in name else value)
+
+            if self.with_mask:
+                mask_results = self.mask_loss(
+                    stage=stage,
+                    x=x,
+                    bbox_results=bbox_results,
+                    batch_gt_instances=batch_gt_instances,
+                    rcnn_train_cfg=self.train_cfg[stage])
+
+                for name, value in mask_results['loss_mask'].items():
+                    losses[f's{stage}.{name}'] = (
+                        value * stage_loss_weight if 'loss' in name else value)
+
+            object_feats = bbox_results['object_feats']
+            results_list = bbox_results['results_list']
+        return losses
+
+    def predict_bbox(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     rpn_results_list: InstanceList,
+                     rcnn_test_cfg: ConfigType,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the bbox head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x(tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        proposal_list = [res.bboxes for res in rpn_results_list]
+        object_feats = torch.cat(
+            [res.pop('features')[None, ...] for res in rpn_results_list])
+        if all([proposal.shape[0] == 0 for proposal in proposal_list]):
+            # There is no proposal in the whole batch
+            return empty_instances(
+                batch_img_metas, x[0].device, task_type='bbox')
+
+        for stage in range(self.num_stages):
+            rois = bbox2roi(proposal_list)
+            bbox_results = self._bbox_forward(stage, x, rois, object_feats,
+                                              batch_img_metas)
+            object_feats = bbox_results['object_feats']
+            cls_score = bbox_results['cls_score']
+            proposal_list = bbox_results['detached_proposals']
+
+        num_classes = self.bbox_head[-1].num_classes
+
+        if self.bbox_head[-1].loss_cls.use_sigmoid:
+            cls_score = cls_score.sigmoid()
+        else:
+            cls_score = cls_score.softmax(-1)[..., :-1]
+
+        topk_inds_list = []
+        results_list = []
+        for img_id in range(len(batch_img_metas)):
+            cls_score_per_img = cls_score[img_id]
+            scores_per_img, topk_inds = cls_score_per_img.flatten(0, 1).topk(
+                self.test_cfg.max_per_img, sorted=False)
+            labels_per_img = topk_inds % num_classes
+            bboxes_per_img = proposal_list[img_id][topk_inds // num_classes]
+            topk_inds_list.append(topk_inds)
+            if rescale and bboxes_per_img.size(0) > 0:
+                assert batch_img_metas[img_id].get('scale_factor') is not None
+                scale_factor = bboxes_per_img.new_tensor(
+                    batch_img_metas[img_id]['scale_factor']).repeat((1, 2))
+                bboxes_per_img = (
+                    bboxes_per_img.view(bboxes_per_img.size(0), -1, 4) /
+                    scale_factor).view(bboxes_per_img.size()[0], -1)
+
+            results = InstanceData()
+            results.bboxes = bboxes_per_img
+            results.scores = scores_per_img
+            results.labels = labels_per_img
+            results_list.append(results)
+        if self.with_mask:
+            for img_id in range(len(batch_img_metas)):
+                # add positive information in InstanceData to predict
+                # mask results in `mask_head`.
+                proposals = bbox_results['detached_proposals'][img_id]
+                topk_inds = topk_inds_list[img_id]
+                attn_feats = bbox_results['attn_feats'][img_id]
+
+                results_list[img_id].proposals = proposals
+                results_list[img_id].topk_inds = topk_inds
+                results_list[img_id].attn_feats = attn_feats
+        return results_list
+
+    def predict_mask(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     results_list: InstanceList,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image. Each item usually contains following keys:
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - proposal (Tensor): Bboxes predicted from bbox_head,
+                  has a shape (num_instances, 4).
+                - topk_inds (Tensor): Topk indices of each image, has
+                  shape (num_instances, )
+                - attn_feats (Tensor): Intermediate feature get from the last
+                  diihead, has shape (num_instances, feature_dimensions)
+
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        proposal_list = [res.pop('proposals') for res in results_list]
+        topk_inds_list = [res.pop('topk_inds') for res in results_list]
+        attn_feats = torch.cat(
+            [res.pop('attn_feats')[None, ...] for res in results_list])
+
+        rois = bbox2roi(proposal_list)
+
+        if rois.shape[0] == 0:
+            results_list = empty_instances(
+                batch_img_metas,
+                rois.device,
+                task_type='mask',
+                instance_results=results_list,
+                mask_thr_binary=self.test_cfg.mask_thr_binary)
+            return results_list
+
+        last_stage = self.num_stages - 1
+        mask_results = self._mask_forward(last_stage, x, rois, attn_feats)
+
+        num_imgs = len(batch_img_metas)
+        mask_results['mask_preds'] = mask_results['mask_preds'].reshape(
+            num_imgs, -1, *mask_results['mask_preds'].size()[1:])
+        num_classes = self.bbox_head[-1].num_classes
+
+        mask_preds = []
+        for img_id in range(num_imgs):
+            topk_inds = topk_inds_list[img_id]
+            masks_per_img = mask_results['mask_preds'][img_id].flatten(
+                0, 1)[topk_inds]
+            masks_per_img = masks_per_img[:, None,
+                                          ...].repeat(1, num_classes, 1, 1)
+            mask_preds.append(masks_per_img)
+        results_list = self.mask_head[-1].predict_by_feat(
+            mask_preds,
+            results_list,
+            batch_img_metas,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=rescale)
+
+        return results_list
+
+    # TODO: Need to refactor later
+    def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+                batch_data_samples: SampleList) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            x (List[Tensor]): Multi-level features that may have different
+                resolutions.
+            rpn_results_list (List[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns
+            tuple: A tuple of features from ``bbox_head`` and ``mask_head``
+            forward.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        (batch_gt_instances, batch_gt_instances_ignore,
+         batch_img_metas) = outputs
+
+        all_stage_bbox_results = []
+        object_feats = torch.cat(
+            [res.pop('features')[None, ...] for res in rpn_results_list])
+        results_list = rpn_results_list
+        if self.with_bbox:
+            for stage in range(self.num_stages):
+                bbox_results = self.bbox_loss(
+                    stage=stage,
+                    x=x,
+                    results_list=results_list,
+                    object_feats=object_feats,
+                    batch_img_metas=batch_img_metas,
+                    batch_gt_instances=batch_gt_instances)
+                bbox_results.pop('loss_bbox')
+                # torch.jit does not support obj:SamplingResult
+                bbox_results.pop('results_list')
+                bbox_res = bbox_results.copy()
+                bbox_res.pop('sampling_results')
+                all_stage_bbox_results.append((bbox_res, ))
+
+                if self.with_mask:
+                    attn_feats = bbox_results['attn_feats']
+                    sampling_results = bbox_results['sampling_results']
+
+                    pos_rois = bbox2roi(
+                        [res.pos_priors for res in sampling_results])
+
+                    attn_feats = torch.cat([
+                        feats[res.pos_inds]
+                        for (feats, res) in zip(attn_feats, sampling_results)
+                    ])
+                    mask_results = self._mask_forward(stage, x, pos_rois,
+                                                      attn_feats)
+                    all_stage_bbox_results[-1] += (mask_results, )
+        return tuple(all_stage_bbox_results)
diff --git a/mmde/mmdet/models/roi_heads/standard_roi_head.py b/mmde/mmdet/models/roi_heads/standard_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d168eba0fb2ccf6aa89bde5c637160f10aea83a
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/standard_roi_head.py
@@ -0,0 +1,419 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import DetDataSample, SampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList
+from ..task_modules.samplers import SamplingResult
+from ..utils import empty_instances, unpack_gt_instances
+from .base_roi_head import BaseRoIHead
+
+
+@MODELS.register_module()
+class StandardRoIHead(BaseRoIHead):
+    """Simplest base roi head including one bbox head and one mask head."""
+
+    def init_assigner_sampler(self) -> None:
+        """Initialize assigner and sampler."""
+        self.bbox_assigner = None
+        self.bbox_sampler = None
+        if self.train_cfg:
+            self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
+            self.bbox_sampler = TASK_UTILS.build(
+                self.train_cfg.sampler, default_args=dict(context=self))
+
+    def init_bbox_head(self, bbox_roi_extractor: ConfigType,
+                       bbox_head: ConfigType) -> None:
+        """Initialize box head and box roi extractor.
+
+        Args:
+            bbox_roi_extractor (dict or ConfigDict): Config of box
+                roi extractor.
+            bbox_head (dict or ConfigDict): Config of box in box head.
+        """
+        self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor)
+        self.bbox_head = MODELS.build(bbox_head)
+
+    def init_mask_head(self, mask_roi_extractor: ConfigType,
+                       mask_head: ConfigType) -> None:
+        """Initialize mask head and mask roi extractor.
+
+        Args:
+            mask_roi_extractor (dict or ConfigDict): Config of mask roi
+                extractor.
+            mask_head (dict or ConfigDict): Config of mask in mask head.
+        """
+        if mask_roi_extractor is not None:
+            self.mask_roi_extractor = MODELS.build(mask_roi_extractor)
+            self.share_roi_extractor = False
+        else:
+            self.share_roi_extractor = True
+            self.mask_roi_extractor = self.bbox_roi_extractor
+        self.mask_head = MODELS.build(mask_head)
+
+    # TODO: Need to refactor later
+    def forward(self,
+                x: Tuple[Tensor],
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList = None) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            x (List[Tensor]): Multi-level features that may have different
+                resolutions.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+            the meta information of each image and corresponding
+            annotations.
+
+        Returns
+            tuple: A tuple of features from ``bbox_head`` and ``mask_head``
+            forward.
+        """
+        results = ()
+        proposals = [rpn_results.bboxes for rpn_results in rpn_results_list]
+        rois = bbox2roi(proposals)
+        # bbox head
+        if self.with_bbox:
+            bbox_results = self._bbox_forward(x, rois)
+            results = results + (bbox_results['cls_score'],
+                                 bbox_results['bbox_pred'])
+        # mask head
+        if self.with_mask:
+            mask_rois = rois[:100]
+            mask_results = self._mask_forward(x, mask_rois)
+            results = results + (mask_results['mask_preds'], )
+        return results
+
+    def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+             batch_data_samples: List[DetDataSample]) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, _ = outputs
+
+        # assign gts and sample proposals
+        num_imgs = len(batch_data_samples)
+        sampling_results = []
+        for i in range(num_imgs):
+            # rename rpn_results.bboxes to rpn_results.priors
+            rpn_results = rpn_results_list[i]
+            rpn_results.priors = rpn_results.pop('bboxes')
+
+            assign_result = self.bbox_assigner.assign(
+                rpn_results, batch_gt_instances[i],
+                batch_gt_instances_ignore[i])
+            sampling_result = self.bbox_sampler.sample(
+                assign_result,
+                rpn_results,
+                batch_gt_instances[i],
+                feats=[lvl_feat[i][None] for lvl_feat in x])
+            sampling_results.append(sampling_result)
+
+        losses = dict()
+        # bbox head loss
+        if self.with_bbox:
+            bbox_results = self.bbox_loss(x, sampling_results)
+            losses.update(bbox_results['loss_bbox'])
+
+        # mask head forward and loss
+        if self.with_mask:
+            mask_results = self.mask_loss(x, sampling_results,
+                                          bbox_results['bbox_feats'],
+                                          batch_gt_instances)
+            losses.update(mask_results['loss_mask'])
+
+        return losses
+
+    def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict:
+        """Box head forward function used in both training and testing.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+
+        Returns:
+             dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+        """
+        # TODO: a more flexible way to decide which feature maps to use
+        bbox_feats = self.bbox_roi_extractor(
+            x[:self.bbox_roi_extractor.num_inputs], rois)
+        if self.with_shared_head:
+            bbox_feats = self.shared_head(bbox_feats)
+        cls_score, bbox_pred = self.bbox_head(bbox_feats)
+
+        bbox_results = dict(
+            cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats)
+        return bbox_results
+
+    def bbox_loss(self, x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult]) -> dict:
+        """Perform forward propagation and loss calculation of the bbox head on
+        the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): List of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+
+        Returns:
+            dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `bbox_feats` (Tensor): Extract bbox RoI features.
+                - `loss_bbox` (dict): A dictionary of bbox loss components.
+        """
+        rois = bbox2roi([res.priors for res in sampling_results])
+        bbox_results = self._bbox_forward(x, rois)
+
+        bbox_loss_and_target = self.bbox_head.loss_and_target(
+            cls_score=bbox_results['cls_score'],
+            bbox_pred=bbox_results['bbox_pred'],
+            rois=rois,
+            sampling_results=sampling_results,
+            rcnn_train_cfg=self.train_cfg)
+
+        bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox'])
+        return bbox_results
+
+    def mask_loss(self, x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult], bbox_feats: Tensor,
+                  batch_gt_instances: InstanceList) -> dict:
+        """Perform forward propagation and loss calculation of the mask head on
+        the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            sampling_results (list["obj:`SamplingResult`]): Sampling results.
+            bbox_feats (Tensor): Extract bbox RoI features.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+                - `mask_preds` (Tensor): Mask prediction.
+                - `mask_feats` (Tensor): Extract mask RoI features.
+                - `mask_targets` (Tensor): Mask target of each positive\
+                    proposals in the image.
+                - `loss_mask` (dict): A dictionary of mask loss components.
+        """
+        if not self.share_roi_extractor:
+            pos_rois = bbox2roi([res.pos_priors for res in sampling_results])
+            mask_results = self._mask_forward(x, pos_rois)
+        else:
+            pos_inds = []
+            device = bbox_feats.device
+            for res in sampling_results:
+                pos_inds.append(
+                    torch.ones(
+                        res.pos_priors.shape[0],
+                        device=device,
+                        dtype=torch.uint8))
+                pos_inds.append(
+                    torch.zeros(
+                        res.neg_priors.shape[0],
+                        device=device,
+                        dtype=torch.uint8))
+            pos_inds = torch.cat(pos_inds)
+
+            mask_results = self._mask_forward(
+                x, pos_inds=pos_inds, bbox_feats=bbox_feats)
+
+        mask_loss_and_target = self.mask_head.loss_and_target(
+            mask_preds=mask_results['mask_preds'],
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            rcnn_train_cfg=self.train_cfg)
+
+        mask_results.update(loss_mask=mask_loss_and_target['loss_mask'])
+        return mask_results
+
+    def _mask_forward(self,
+                      x: Tuple[Tensor],
+                      rois: Tensor = None,
+                      pos_inds: Optional[Tensor] = None,
+                      bbox_feats: Optional[Tensor] = None) -> dict:
+        """Mask head forward function used in both training and testing.
+
+        Args:
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            pos_inds (Tensor, optional): Indices of positive samples.
+                Defaults to None.
+            bbox_feats (Tensor): Extract bbox RoI features. Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: Usually returns a dictionary with keys:
+
+                - `mask_preds` (Tensor): Mask prediction.
+                - `mask_feats` (Tensor): Extract mask RoI features.
+        """
+        assert ((rois is not None) ^
+                (pos_inds is not None and bbox_feats is not None))
+        if rois is not None:
+            mask_feats = self.mask_roi_extractor(
+                x[:self.mask_roi_extractor.num_inputs], rois)
+            if self.with_shared_head:
+                mask_feats = self.shared_head(mask_feats)
+        else:
+            assert bbox_feats is not None
+            mask_feats = bbox_feats[pos_inds]
+
+        mask_preds = self.mask_head(mask_feats)
+        mask_results = dict(mask_preds=mask_preds, mask_feats=mask_feats)
+        return mask_results
+
+    def predict_bbox(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     rpn_results_list: InstanceList,
+                     rcnn_test_cfg: ConfigType,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the bbox head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        proposals = [res.bboxes for res in rpn_results_list]
+        rois = bbox2roi(proposals)
+
+        if rois.shape[0] == 0:
+            return empty_instances(
+                batch_img_metas,
+                rois.device,
+                task_type='bbox',
+                box_type=self.bbox_head.predict_box_type,
+                num_classes=self.bbox_head.num_classes,
+                score_per_cls=rcnn_test_cfg is None)
+
+        bbox_results = self._bbox_forward(x, rois)
+
+        # split batch bbox prediction back to each image
+        cls_scores = bbox_results['cls_score']
+        bbox_preds = bbox_results['bbox_pred']
+        num_proposals_per_img = tuple(len(p) for p in proposals)
+        rois = rois.split(num_proposals_per_img, 0)
+        cls_scores = cls_scores.split(num_proposals_per_img, 0)
+
+        # some detector with_reg is False, bbox_preds will be None
+        if bbox_preds is not None:
+            # TODO move this to a sabl_roi_head
+            # the bbox prediction of some detectors like SABL is not Tensor
+            if isinstance(bbox_preds, torch.Tensor):
+                bbox_preds = bbox_preds.split(num_proposals_per_img, 0)
+            else:
+                bbox_preds = self.bbox_head.bbox_pred_split(
+                    bbox_preds, num_proposals_per_img)
+        else:
+            bbox_preds = (None, ) * len(proposals)
+
+        result_list = self.bbox_head.predict_by_feat(
+            rois=rois,
+            cls_scores=cls_scores,
+            bbox_preds=bbox_preds,
+            batch_img_metas=batch_img_metas,
+            rcnn_test_cfg=rcnn_test_cfg,
+            rescale=rescale)
+        return result_list
+
+    def predict_mask(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     results_list: InstanceList,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        # don't need to consider aug_test.
+        bboxes = [res.bboxes for res in results_list]
+        mask_rois = bbox2roi(bboxes)
+        if mask_rois.shape[0] == 0:
+            results_list = empty_instances(
+                batch_img_metas,
+                mask_rois.device,
+                task_type='mask',
+                instance_results=results_list,
+                mask_thr_binary=self.test_cfg.mask_thr_binary)
+            return results_list
+
+        mask_results = self._mask_forward(x, mask_rois)
+        mask_preds = mask_results['mask_preds']
+        # split batch mask prediction back to each image
+        num_mask_rois_per_img = [len(res) for res in results_list]
+        mask_preds = mask_preds.split(num_mask_rois_per_img, 0)
+
+        # TODO: Handle the case where rescale is false
+        results_list = self.mask_head.predict_by_feat(
+            mask_preds=mask_preds,
+            results_list=results_list,
+            batch_img_metas=batch_img_metas,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=rescale)
+        return results_list
diff --git a/mmde/mmdet/models/roi_heads/test_mixins.py b/mmde/mmdet/models/roi_heads/test_mixins.py
new file mode 100644
index 0000000000000000000000000000000000000000..940490454d9cf1fde4d69c1f890c173b92d522a1
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/test_mixins.py
@@ -0,0 +1,171 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# TODO: delete this file after refactor
+import sys
+
+import torch
+
+from mmdet.models.layers import multiclass_nms
+from mmdet.models.test_time_augs import merge_aug_bboxes, merge_aug_masks
+from mmdet.structures.bbox import bbox2roi, bbox_mapping
+
+if sys.version_info >= (3, 7):
+    from mmdet.utils.contextmanagers import completed
+
+
+class BBoxTestMixin:
+
+    if sys.version_info >= (3, 7):
+        # TODO: Currently not supported
+        async def async_test_bboxes(self,
+                                    x,
+                                    img_metas,
+                                    proposals,
+                                    rcnn_test_cfg,
+                                    rescale=False,
+                                    **kwargs):
+            """Asynchronized test for box head without augmentation."""
+            rois = bbox2roi(proposals)
+            roi_feats = self.bbox_roi_extractor(
+                x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
+            if self.with_shared_head:
+                roi_feats = self.shared_head(roi_feats)
+            sleep_interval = rcnn_test_cfg.get('async_sleep_interval', 0.017)
+
+            async with completed(
+                    __name__, 'bbox_head_forward',
+                    sleep_interval=sleep_interval):
+                cls_score, bbox_pred = self.bbox_head(roi_feats)
+
+            img_shape = img_metas[0]['img_shape']
+            scale_factor = img_metas[0]['scale_factor']
+            det_bboxes, det_labels = self.bbox_head.get_bboxes(
+                rois,
+                cls_score,
+                bbox_pred,
+                img_shape,
+                scale_factor,
+                rescale=rescale,
+                cfg=rcnn_test_cfg)
+            return det_bboxes, det_labels
+
+    # TODO: Currently not supported
+    def aug_test_bboxes(self, feats, img_metas, rpn_results_list,
+                        rcnn_test_cfg):
+        """Test det bboxes with test time augmentation."""
+        aug_bboxes = []
+        aug_scores = []
+        for x, img_meta in zip(feats, img_metas):
+            # only one image in the batch
+            img_shape = img_meta[0]['img_shape']
+            scale_factor = img_meta[0]['scale_factor']
+            flip = img_meta[0]['flip']
+            flip_direction = img_meta[0]['flip_direction']
+            # TODO more flexible
+            proposals = bbox_mapping(rpn_results_list[0][:, :4], img_shape,
+                                     scale_factor, flip, flip_direction)
+            rois = bbox2roi([proposals])
+            bbox_results = self.bbox_forward(x, rois)
+            bboxes, scores = self.bbox_head.get_bboxes(
+                rois,
+                bbox_results['cls_score'],
+                bbox_results['bbox_pred'],
+                img_shape,
+                scale_factor,
+                rescale=False,
+                cfg=None)
+            aug_bboxes.append(bboxes)
+            aug_scores.append(scores)
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas, rcnn_test_cfg)
+        if merged_bboxes.shape[0] == 0:
+            # There is no proposal in the single image
+            det_bboxes = merged_bboxes.new_zeros(0, 5)
+            det_labels = merged_bboxes.new_zeros((0, ), dtype=torch.long)
+        else:
+            det_bboxes, det_labels = multiclass_nms(merged_bboxes,
+                                                    merged_scores,
+                                                    rcnn_test_cfg.score_thr,
+                                                    rcnn_test_cfg.nms,
+                                                    rcnn_test_cfg.max_per_img)
+        return det_bboxes, det_labels
+
+
+class MaskTestMixin:
+
+    if sys.version_info >= (3, 7):
+        # TODO: Currently not supported
+        async def async_test_mask(self,
+                                  x,
+                                  img_metas,
+                                  det_bboxes,
+                                  det_labels,
+                                  rescale=False,
+                                  mask_test_cfg=None):
+            """Asynchronized test for mask head without augmentation."""
+            # image shape of the first image in the batch (only one)
+            ori_shape = img_metas[0]['ori_shape']
+            scale_factor = img_metas[0]['scale_factor']
+            if det_bboxes.shape[0] == 0:
+                segm_result = [[] for _ in range(self.mask_head.num_classes)]
+            else:
+                if rescale and not isinstance(scale_factor,
+                                              (float, torch.Tensor)):
+                    scale_factor = det_bboxes.new_tensor(scale_factor)
+                _bboxes = (
+                    det_bboxes[:, :4] *
+                    scale_factor if rescale else det_bboxes)
+                mask_rois = bbox2roi([_bboxes])
+                mask_feats = self.mask_roi_extractor(
+                    x[:len(self.mask_roi_extractor.featmap_strides)],
+                    mask_rois)
+
+                if self.with_shared_head:
+                    mask_feats = self.shared_head(mask_feats)
+                if mask_test_cfg and \
+                        mask_test_cfg.get('async_sleep_interval'):
+                    sleep_interval = mask_test_cfg['async_sleep_interval']
+                else:
+                    sleep_interval = 0.035
+                async with completed(
+                        __name__,
+                        'mask_head_forward',
+                        sleep_interval=sleep_interval):
+                    mask_pred = self.mask_head(mask_feats)
+                segm_result = self.mask_head.get_results(
+                    mask_pred, _bboxes, det_labels, self.test_cfg, ori_shape,
+                    scale_factor, rescale)
+            return segm_result
+
+    # TODO: Currently not supported
+    def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels):
+        """Test for mask head with test time augmentation."""
+        if det_bboxes.shape[0] == 0:
+            segm_result = [[] for _ in range(self.mask_head.num_classes)]
+        else:
+            aug_masks = []
+            for x, img_meta in zip(feats, img_metas):
+                img_shape = img_meta[0]['img_shape']
+                scale_factor = img_meta[0]['scale_factor']
+                flip = img_meta[0]['flip']
+                flip_direction = img_meta[0]['flip_direction']
+                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
+                                       scale_factor, flip, flip_direction)
+                mask_rois = bbox2roi([_bboxes])
+                mask_results = self._mask_forward(x, mask_rois)
+                # convert to numpy array to save memory
+                aug_masks.append(
+                    mask_results['mask_pred'].sigmoid().cpu().numpy())
+            merged_masks = merge_aug_masks(aug_masks, img_metas, self.test_cfg)
+
+            ori_shape = img_metas[0][0]['ori_shape']
+            scale_factor = det_bboxes.new_ones(4)
+            segm_result = self.mask_head.get_results(
+                merged_masks,
+                det_bboxes,
+                det_labels,
+                self.test_cfg,
+                ori_shape,
+                scale_factor=scale_factor,
+                rescale=False)
+        return segm_result
diff --git a/mmde/mmdet/models/roi_heads/trident_roi_head.py b/mmde/mmdet/models/roi_heads/trident_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..5215327296282a8e7ca502f3321aced8a4f840b7
--- /dev/null
+++ b/mmde/mmdet/models/roi_heads/trident_roi_head.py
@@ -0,0 +1,112 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+from mmcv.ops import batched_nms
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import InstanceList
+from .standard_roi_head import StandardRoIHead
+
+
+@MODELS.register_module()
+class TridentRoIHead(StandardRoIHead):
+    """Trident roi head.
+
+    Args:
+        num_branch (int): Number of branches in TridentNet.
+        test_branch_idx (int): In inference, all 3 branches will be used
+            if `test_branch_idx==-1`, otherwise only branch with index
+            `test_branch_idx` will be used.
+    """
+
+    def __init__(self, num_branch: int, test_branch_idx: int,
+                 **kwargs) -> None:
+        self.num_branch = num_branch
+        self.test_branch_idx = test_branch_idx
+        super().__init__(**kwargs)
+
+    def merge_trident_bboxes(self,
+                             trident_results: InstanceList) -> InstanceData:
+        """Merge bbox predictions of each branch.
+
+        Args:
+            trident_results (List[:obj:`InstanceData`]): A list of InstanceData
+                predicted from every branch.
+
+        Returns:
+            :obj:`InstanceData`: merged InstanceData.
+        """
+        bboxes = torch.cat([res.bboxes for res in trident_results])
+        scores = torch.cat([res.scores for res in trident_results])
+        labels = torch.cat([res.labels for res in trident_results])
+
+        nms_cfg = self.test_cfg['nms']
+        results = InstanceData()
+        if bboxes.numel() == 0:
+            results.bboxes = bboxes
+            results.scores = scores
+            results.labels = labels
+        else:
+            det_bboxes, keep = batched_nms(bboxes, scores, labels, nms_cfg)
+            results.bboxes = det_bboxes[:, :-1]
+            results.scores = det_bboxes[:, -1]
+            results.labels = labels[keep]
+
+        if self.test_cfg['max_per_img'] > 0:
+            results = results[:self.test_cfg['max_per_img']]
+        return results
+
+    def predict(self,
+                x: Tuple[Tensor],
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        - Compute prediction bbox and label per branch.
+        - Merge predictions of each branch according to scores of
+          bboxes, i.e., bboxes with higher score are kept to give
+          top-k prediction.
+
+        Args:
+            x (tuple[Tensor]): Features from upstream network. Each
+                has shape (N, C, H, W).
+            rpn_results_list (list[:obj:`InstanceData`]): list of region
+                proposals.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results to
+                the original image. Defaults to True.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        results_list = super().predict(
+            x=x,
+            rpn_results_list=rpn_results_list,
+            batch_data_samples=batch_data_samples,
+            rescale=rescale)
+
+        num_branch = self.num_branch \
+            if self.training or self.test_branch_idx == -1 else 1
+
+        merged_results_list = []
+        for i in range(len(batch_data_samples) // num_branch):
+            merged_results_list.append(
+                self.merge_trident_bboxes(results_list[i * num_branch:(i + 1) *
+                                                       num_branch]))
+        return merged_results_list
diff --git a/mmde/mmdet/models/seg_heads/__init__.py b/mmde/mmdet/models/seg_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b489a905b1e9b6cef2e8b9575600990563128e4e
--- /dev/null
+++ b/mmde/mmdet/models/seg_heads/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .panoptic_fpn_head import PanopticFPNHead  # noqa: F401,F403
+from .panoptic_fusion_heads import *  # noqa: F401,F403
diff --git a/mmde/mmdet/models/seg_heads/base_semantic_head.py b/mmde/mmdet/models/seg_heads/base_semantic_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1db71549d89766c45012517c20cef443f4760419
--- /dev/null
+++ b/mmde/mmdet/models/seg_heads/base_semantic_head.py
@@ -0,0 +1,113 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Tuple, Union
+
+import torch.nn.functional as F
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class BaseSemanticHead(BaseModule, metaclass=ABCMeta):
+    """Base module of Semantic Head.
+
+    Args:
+        num_classes (int): the number of classes.
+        seg_rescale_factor (float): the rescale factor for ``gt_sem_seg``,
+            which equals to ``1 / output_strides``. The output_strides is
+            for ``seg_preds``. Defaults to  1 / 4.
+        init_cfg (Optional[Union[:obj:`ConfigDict`, dict]]): the initialization
+            config.
+        loss_seg (Union[:obj:`ConfigDict`, dict]): the loss of the semantic
+            head.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 seg_rescale_factor: float = 1 / 4.,
+                 loss_seg: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     ignore_index=255,
+                     loss_weight=1.0),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.loss_seg = MODELS.build(loss_seg)
+        self.num_classes = num_classes
+        self.seg_rescale_factor = seg_rescale_factor
+
+    @abstractmethod
+    def forward(self, x: Union[Tensor, Tuple[Tensor]]) -> Dict[str, Tensor]:
+        """Placeholder of forward function.
+
+        Args:
+            x (Tensor): Feature maps.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary, including features
+                and predicted scores. Required keys: 'seg_preds'
+                and 'feats'.
+        """
+        pass
+
+    @abstractmethod
+    def loss(self, x: Union[Tensor, Tuple[Tensor]],
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """
+        Args:
+            x (Union[Tensor, Tuple[Tensor]]): Feature maps.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Args:
+            x (Tensor): Feature maps.
+
+        Returns:
+            Dict[str, Tensor]: The loss of semantic head.
+        """
+        pass
+
+    def predict(self,
+                x: Union[Tensor, Tuple[Tensor]],
+                batch_img_metas: List[dict],
+                rescale: bool = False) -> List[Tensor]:
+        """Test without Augmentation.
+
+        Args:
+            x (Union[Tensor, Tuple[Tensor]]): Feature maps.
+            batch_img_metas (List[dict]): List of image information.
+            rescale (bool): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[Tensor]: semantic segmentation logits.
+        """
+        seg_preds = self.forward(x)['seg_preds']
+        seg_preds = F.interpolate(
+            seg_preds,
+            size=batch_img_metas[0]['batch_input_shape'],
+            mode='bilinear',
+            align_corners=False)
+        seg_preds = [seg_preds[i] for i in range(len(batch_img_metas))]
+
+        if rescale:
+            seg_pred_list = []
+            for i in range(len(batch_img_metas)):
+                h, w = batch_img_metas[i]['img_shape']
+                seg_pred = seg_preds[i][:, :h, :w]
+
+                h, w = batch_img_metas[i]['ori_shape']
+                seg_pred = F.interpolate(
+                    seg_pred[None],
+                    size=(h, w),
+                    mode='bilinear',
+                    align_corners=False)[0]
+                seg_pred_list.append(seg_pred)
+        else:
+            seg_pred_list = seg_preds
+
+        return seg_pred_list
diff --git a/mmde/mmdet/models/seg_heads/panoptic_fpn_head.py b/mmde/mmdet/models/seg_heads/panoptic_fpn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d8b901360922f6cdb9f8d15b60dac8d7514ee75
--- /dev/null
+++ b/mmde/mmdet/models/seg_heads/panoptic_fpn_head.py
@@ -0,0 +1,174 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import ModuleList
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from ..layers import ConvUpsample
+from ..utils import interpolate_as
+from .base_semantic_head import BaseSemanticHead
+
+
+@MODELS.register_module()
+class PanopticFPNHead(BaseSemanticHead):
+    """PanopticFPNHead used in Panoptic FPN.
+
+    In this head, the number of output channels is ``num_stuff_classes
+    + 1``, including all stuff classes and one thing class. The stuff
+    classes will be reset from ``0`` to ``num_stuff_classes - 1``, the
+    thing classes will be merged to ``num_stuff_classes``-th channel.
+
+    Arg:
+        num_things_classes (int): Number of thing classes. Default: 80.
+        num_stuff_classes (int): Number of stuff classes. Default: 53.
+        in_channels (int): Number of channels in the input feature
+            map.
+        inner_channels (int): Number of channels in inner features.
+        start_level (int): The start level of the input features
+            used in PanopticFPN.
+        end_level (int): The end level of the used features, the
+            ``end_level``-th layer will not be used.
+        conv_cfg (Optional[Union[ConfigDict, dict]]): Dictionary to construct
+            and config conv layer.
+        norm_cfg (Union[ConfigDict, dict]): Dictionary to construct and config
+            norm layer. Use ``GN`` by default.
+        init_cfg (Optional[Union[ConfigDict, dict]]): Initialization config
+            dict.
+        loss_seg (Union[ConfigDict, dict]): the loss of the semantic head.
+    """
+
+    def __init__(self,
+                 num_things_classes: int = 80,
+                 num_stuff_classes: int = 53,
+                 in_channels: int = 256,
+                 inner_channels: int = 128,
+                 start_level: int = 0,
+                 end_level: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(
+                     type='GN', num_groups=32, requires_grad=True),
+                 loss_seg: ConfigType = dict(
+                     type='CrossEntropyLoss', ignore_index=-1,
+                     loss_weight=1.0),
+                 init_cfg: OptMultiConfig = None) -> None:
+        seg_rescale_factor = 1 / 2**(start_level + 2)
+        super().__init__(
+            num_classes=num_stuff_classes + 1,
+            seg_rescale_factor=seg_rescale_factor,
+            loss_seg=loss_seg,
+            init_cfg=init_cfg)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        # Used feature layers are [start_level, end_level)
+        self.start_level = start_level
+        self.end_level = end_level
+        self.num_stages = end_level - start_level
+        self.inner_channels = inner_channels
+
+        self.conv_upsample_layers = ModuleList()
+        for i in range(start_level, end_level):
+            self.conv_upsample_layers.append(
+                ConvUpsample(
+                    in_channels,
+                    inner_channels,
+                    num_layers=i if i > 0 else 1,
+                    num_upsample=i if i > 0 else 0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                ))
+        self.conv_logits = nn.Conv2d(inner_channels, self.num_classes, 1)
+
+    def _set_things_to_void(self, gt_semantic_seg: Tensor) -> Tensor:
+        """Merge thing classes to one class.
+
+        In PanopticFPN, the background labels will be reset from `0` to
+        `self.num_stuff_classes-1`, the foreground labels will be merged to
+        `self.num_stuff_classes`-th channel.
+        """
+        gt_semantic_seg = gt_semantic_seg.int()
+        fg_mask = gt_semantic_seg < self.num_things_classes
+        bg_mask = (gt_semantic_seg >= self.num_things_classes) * (
+            gt_semantic_seg < self.num_things_classes + self.num_stuff_classes)
+
+        new_gt_seg = torch.clone(gt_semantic_seg)
+        new_gt_seg = torch.where(bg_mask,
+                                 gt_semantic_seg - self.num_things_classes,
+                                 new_gt_seg)
+        new_gt_seg = torch.where(fg_mask,
+                                 fg_mask.int() * self.num_stuff_classes,
+                                 new_gt_seg)
+        return new_gt_seg
+
+    def loss(self, x: Union[Tensor, Tuple[Tensor]],
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """
+        Args:
+            x (Union[Tensor, Tuple[Tensor]]): Feature maps.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            Dict[str, Tensor]: The loss of semantic head.
+        """
+        seg_preds = self(x)['seg_preds']
+        gt_semantic_segs = [
+            data_sample.gt_sem_seg.sem_seg
+            for data_sample in batch_data_samples
+        ]
+
+        gt_semantic_segs = torch.stack(gt_semantic_segs)
+        if self.seg_rescale_factor != 1.0:
+            gt_semantic_segs = F.interpolate(
+                gt_semantic_segs.float(),
+                scale_factor=self.seg_rescale_factor,
+                mode='nearest').squeeze(1)
+
+        # Things classes will be merged to one class in PanopticFPN.
+        gt_semantic_segs = self._set_things_to_void(gt_semantic_segs)
+
+        if seg_preds.shape[-2:] != gt_semantic_segs.shape[-2:]:
+            seg_preds = interpolate_as(seg_preds, gt_semantic_segs)
+        seg_preds = seg_preds.permute((0, 2, 3, 1))
+
+        loss_seg = self.loss_seg(
+            seg_preds.reshape(-1, self.num_classes),  # => [NxHxW, C]
+            gt_semantic_segs.reshape(-1).long())
+
+        return dict(loss_seg=loss_seg)
+
+    def init_weights(self) -> None:
+        """Initialize weights."""
+        super().init_weights()
+        nn.init.normal_(self.conv_logits.weight.data, 0, 0.01)
+        self.conv_logits.bias.data.zero_()
+
+    def forward(self, x: Tuple[Tensor]) -> Dict[str, Tensor]:
+        """Forward.
+
+        Args:
+            x (Tuple[Tensor]): Multi scale Feature maps.
+
+        Returns:
+            dict[str, Tensor]: semantic segmentation predictions and
+                feature maps.
+        """
+        # the number of subnets must be not more than
+        # the length of features.
+        assert self.num_stages <= len(x)
+
+        feats = []
+        for i, layer in enumerate(self.conv_upsample_layers):
+            f = layer(x[self.start_level + i])
+            feats.append(f)
+
+        seg_feats = torch.sum(torch.stack(feats, dim=0), dim=0)
+        seg_preds = self.conv_logits(seg_feats)
+        out = dict(seg_preds=seg_preds, seg_feats=seg_feats)
+        return out
diff --git a/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py b/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..41625a61d6d1c38c633062c24b1e3455bd3ae2df
--- /dev/null
+++ b/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_panoptic_fusion_head import \
+    BasePanopticFusionHead  # noqa: F401,F403
+from .heuristic_fusion_head import HeuristicFusionHead  # noqa: F401,F403
+from .maskformer_fusion_head import MaskFormerFusionHead  # noqa: F401,F403
diff --git a/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py b/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6b20e1cd144eaebd042b8017f143c0a643adde1
--- /dev/null
+++ b/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta):
+    """Base class for panoptic heads."""
+
+    def __init__(self,
+                 num_things_classes: int = 80,
+                 num_stuff_classes: int = 53,
+                 test_cfg: OptConfigType = None,
+                 loss_panoptic: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_things_classes = num_things_classes
+        self.num_stuff_classes = num_stuff_classes
+        self.num_classes = num_things_classes + num_stuff_classes
+        self.test_cfg = test_cfg
+
+        if loss_panoptic:
+            self.loss_panoptic = MODELS.build(loss_panoptic)
+        else:
+            self.loss_panoptic = None
+
+    @property
+    def with_loss(self) -> bool:
+        """bool: whether the panoptic head contains loss function."""
+        return self.loss_panoptic is not None
+
+    @abstractmethod
+    def loss(self, **kwargs):
+        """Loss function."""
+
+    @abstractmethod
+    def predict(self, **kwargs):
+        """Predict function."""
diff --git a/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py b/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a4a4200edd97f42e9a138e14a1d07328ad9b139
--- /dev/null
+++ b/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py
@@ -0,0 +1,159 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+from mmengine.structures import InstanceData, PixelData
+from torch import Tensor
+
+from mmdet.evaluation.functional import INSTANCE_OFFSET
+from mmdet.registry import MODELS
+from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig, PixelList
+from .base_panoptic_fusion_head import BasePanopticFusionHead
+
+
+@MODELS.register_module()
+class HeuristicFusionHead(BasePanopticFusionHead):
+    """Fusion Head with Heuristic method."""
+
+    def __init__(self,
+                 num_things_classes: int = 80,
+                 num_stuff_classes: int = 53,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        super().__init__(
+            num_things_classes=num_things_classes,
+            num_stuff_classes=num_stuff_classes,
+            test_cfg=test_cfg,
+            loss_panoptic=None,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def loss(self, **kwargs) -> dict:
+        """HeuristicFusionHead has no training loss."""
+        return dict()
+
+    def _lay_masks(self,
+                   mask_results: InstanceData,
+                   overlap_thr: float = 0.5) -> Tensor:
+        """Lay instance masks to a result map.
+
+        Args:
+            mask_results (:obj:`InstanceData`): Instance segmentation results,
+                each contains ``bboxes``, ``labels``, ``scores`` and ``masks``.
+            overlap_thr (float): Threshold to determine whether two masks
+                overlap. default: 0.5.
+
+        Returns:
+            Tensor: The result map, (H, W).
+        """
+        bboxes = mask_results.bboxes
+        scores = mask_results.scores
+        labels = mask_results.labels
+        masks = mask_results.masks
+
+        num_insts = bboxes.shape[0]
+        id_map = torch.zeros(
+            masks.shape[-2:], device=bboxes.device, dtype=torch.long)
+        if num_insts == 0:
+            return id_map, labels
+
+        # Sort by score to use heuristic fusion
+        order = torch.argsort(-scores)
+        bboxes = bboxes[order]
+        labels = labels[order]
+        segm_masks = masks[order]
+
+        instance_id = 1
+        left_labels = []
+        for idx in range(bboxes.shape[0]):
+            _cls = labels[idx]
+            _mask = segm_masks[idx]
+            instance_id_map = torch.ones_like(
+                _mask, dtype=torch.long) * instance_id
+            area = _mask.sum()
+            if area == 0:
+                continue
+
+            pasted = id_map > 0
+            intersect = (_mask * pasted).sum()
+            if (intersect / (area + 1e-5)) > overlap_thr:
+                continue
+
+            _part = _mask * (~pasted)
+            id_map = torch.where(_part, instance_id_map, id_map)
+            left_labels.append(_cls)
+            instance_id += 1
+
+        if len(left_labels) > 0:
+            instance_labels = torch.stack(left_labels)
+        else:
+            instance_labels = bboxes.new_zeros((0, ), dtype=torch.long)
+        assert instance_id == (len(instance_labels) + 1)
+        return id_map, instance_labels
+
+    def _predict_single(self, mask_results: InstanceData, seg_preds: Tensor,
+                        **kwargs) -> PixelData:
+        """Fuse the results of instance and semantic segmentations.
+
+        Args:
+            mask_results (:obj:`InstanceData`): Instance segmentation results,
+                each contains ``bboxes``, ``labels``, ``scores`` and ``masks``.
+            seg_preds (Tensor): The semantic segmentation results,
+                (num_stuff + 1, H, W).
+
+        Returns:
+            Tensor: The panoptic segmentation result, (H, W).
+        """
+        id_map, labels = self._lay_masks(mask_results,
+                                         self.test_cfg.mask_overlap)
+
+        seg_results = seg_preds.argmax(dim=0)
+        seg_results = seg_results + self.num_things_classes
+
+        pan_results = seg_results
+        instance_id = 1
+        for idx in range(len(mask_results)):
+            _mask = id_map == (idx + 1)
+            if _mask.sum() == 0:
+                continue
+            _cls = labels[idx]
+            # simply trust detection
+            segment_id = _cls + instance_id * INSTANCE_OFFSET
+            pan_results[_mask] = segment_id
+            instance_id += 1
+
+        ids, counts = torch.unique(
+            pan_results % INSTANCE_OFFSET, return_counts=True)
+        stuff_ids = ids[ids >= self.num_things_classes]
+        stuff_counts = counts[ids >= self.num_things_classes]
+        ignore_stuff_ids = stuff_ids[
+            stuff_counts < self.test_cfg.stuff_area_limit]
+
+        assert pan_results.ndim == 2
+        pan_results[(pan_results.unsqueeze(2) == ignore_stuff_ids.reshape(
+            1, 1, -1)).any(dim=2)] = self.num_classes
+
+        pan_results = PixelData(sem_seg=pan_results[None].int())
+        return pan_results
+
+    def predict(self, mask_results_list: InstanceList,
+                seg_preds_list: List[Tensor], **kwargs) -> PixelList:
+        """Predict results by fusing the results of instance and semantic
+        segmentations.
+
+        Args:
+            mask_results_list (list[:obj:`InstanceData`]): Instance
+                segmentation results, each contains ``bboxes``, ``labels``,
+                ``scores`` and ``masks``.
+            seg_preds_list (Tensor): List of semantic segmentation results.
+
+        Returns:
+            List[PixelData]: Panoptic segmentation result.
+        """
+        results_list = [
+            self._predict_single(mask_results_list[i], seg_preds_list[i])
+            for i in range(len(mask_results_list))
+        ]
+
+        return results_list
diff --git a/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py b/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b76e6b45bb9be2584f8b3eca2e5e1c0809249fa
--- /dev/null
+++ b/mmde/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py
@@ -0,0 +1,266 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData, PixelData
+from torch import Tensor
+
+from mmdet.evaluation.functional import INSTANCE_OFFSET
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.mask import mask2bbox
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .base_panoptic_fusion_head import BasePanopticFusionHead
+
+
+@MODELS.register_module()
+class MaskFormerFusionHead(BasePanopticFusionHead):
+    """MaskFormer fusion head which postprocesses results for panoptic
+    segmentation, instance segmentation and semantic segmentation."""
+
+    def __init__(self,
+                 num_things_classes: int = 80,
+                 num_stuff_classes: int = 53,
+                 test_cfg: OptConfigType = None,
+                 loss_panoptic: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs):
+        super().__init__(
+            num_things_classes=num_things_classes,
+            num_stuff_classes=num_stuff_classes,
+            test_cfg=test_cfg,
+            loss_panoptic=loss_panoptic,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def loss(self, **kwargs):
+        """MaskFormerFusionHead has no training loss."""
+        return dict()
+
+    def panoptic_postprocess(self, mask_cls: Tensor,
+                             mask_pred: Tensor) -> PixelData:
+        """Panoptic segmengation inference.
+
+        Args:
+            mask_cls (Tensor): Classfication outputs of shape
+                (num_queries, cls_out_channels) for a image.
+                Note `cls_out_channels` should includes
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for a image.
+
+        Returns:
+            :obj:`PixelData`: Panoptic segment result of shape \
+                (h, w), each element in Tensor means: \
+                ``segment_id = _cls + instance_id * INSTANCE_OFFSET``.
+        """
+        object_mask_thr = self.test_cfg.get('object_mask_thr', 0.8)
+        iou_thr = self.test_cfg.get('iou_thr', 0.8)
+        filter_low_score = self.test_cfg.get('filter_low_score', False)
+
+        scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
+        mask_pred = mask_pred.sigmoid()
+
+        keep = labels.ne(self.num_classes) & (scores > object_mask_thr)
+        cur_scores = scores[keep]
+        cur_classes = labels[keep]
+        cur_masks = mask_pred[keep]
+
+        cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
+
+        h, w = cur_masks.shape[-2:]
+        panoptic_seg = torch.full((h, w),
+                                  self.num_classes,
+                                  dtype=torch.int32,
+                                  device=cur_masks.device)
+        if cur_masks.shape[0] == 0:
+            # We didn't detect any mask :(
+            pass
+        else:
+            cur_mask_ids = cur_prob_masks.argmax(0)
+            instance_id = 1
+            for k in range(cur_classes.shape[0]):
+                pred_class = int(cur_classes[k].item())
+                isthing = pred_class < self.num_things_classes
+                mask = cur_mask_ids == k
+                mask_area = mask.sum().item()
+                original_area = (cur_masks[k] >= 0.5).sum().item()
+
+                if filter_low_score:
+                    mask = mask & (cur_masks[k] >= 0.5)
+
+                if mask_area > 0 and original_area > 0:
+                    if mask_area / original_area < iou_thr:
+                        continue
+
+                    if not isthing:
+                        # different stuff regions of same class will be
+                        # merged here, and stuff share the instance_id 0.
+                        panoptic_seg[mask] = pred_class
+                    else:
+                        panoptic_seg[mask] = (
+                            pred_class + instance_id * INSTANCE_OFFSET)
+                        instance_id += 1
+
+        return PixelData(sem_seg=panoptic_seg[None])
+
+    def semantic_postprocess(self, mask_cls: Tensor,
+                             mask_pred: Tensor) -> PixelData:
+        """Semantic segmengation postprocess.
+
+        Args:
+            mask_cls (Tensor): Classfication outputs of shape
+                (num_queries, cls_out_channels) for a image.
+                Note `cls_out_channels` should includes
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for a image.
+
+        Returns:
+            :obj:`PixelData`: Semantic segment result.
+        """
+        # TODO add semantic segmentation result
+        raise NotImplementedError
+
+    def instance_postprocess(self, mask_cls: Tensor,
+                             mask_pred: Tensor) -> InstanceData:
+        """Instance segmengation postprocess.
+
+        Args:
+            mask_cls (Tensor): Classfication outputs of shape
+                (num_queries, cls_out_channels) for a image.
+                Note `cls_out_channels` should includes
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for a image.
+
+        Returns:
+            :obj:`InstanceData`: Instance segmentation results.
+
+                - scores (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                    the last dimension 4 arrange as (x1, y1, x2, y2).
+                - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        max_per_image = self.test_cfg.get('max_per_image', 100)
+        num_queries = mask_cls.shape[0]
+        # shape (num_queries, num_class)
+        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
+        # shape (num_queries * num_class, )
+        labels = torch.arange(self.num_classes, device=mask_cls.device).\
+            unsqueeze(0).repeat(num_queries, 1).flatten(0, 1)
+        scores_per_image, top_indices = scores.flatten(0, 1).topk(
+            max_per_image, sorted=False)
+        labels_per_image = labels[top_indices]
+
+        query_indices = top_indices // self.num_classes
+        mask_pred = mask_pred[query_indices]
+
+        # extract things
+        is_thing = labels_per_image < self.num_things_classes
+        scores_per_image = scores_per_image[is_thing]
+        labels_per_image = labels_per_image[is_thing]
+        mask_pred = mask_pred[is_thing]
+
+        mask_pred_binary = (mask_pred > 0).float()
+        mask_scores_per_image = (mask_pred.sigmoid() *
+                                 mask_pred_binary).flatten(1).sum(1) / (
+                                     mask_pred_binary.flatten(1).sum(1) + 1e-6)
+        det_scores = scores_per_image * mask_scores_per_image
+        mask_pred_binary = mask_pred_binary.bool()
+        bboxes = mask2bbox(mask_pred_binary)
+
+        results = InstanceData()
+        results.bboxes = bboxes
+        results.labels = labels_per_image
+        results.scores = det_scores
+        results.masks = mask_pred_binary
+        return results
+
+    def predict(self,
+                mask_cls_results: Tensor,
+                mask_pred_results: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = False,
+                **kwargs) -> List[dict]:
+        """Test segment without test-time aumengtation.
+
+        Only the output of last decoder layers was used.
+
+        Args:
+            mask_cls_results (Tensor): Mask classification logits,
+                shape (batch_size, num_queries, cls_out_channels).
+                Note `cls_out_channels` should includes background.
+            mask_pred_results (Tensor): Mask logits, shape
+                (batch_size, num_queries, h, w).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): If True, return boxes in
+                original image space. Default False.
+
+        Returns:
+            list[dict]: Instance segmentation \
+                results and panoptic segmentation results for each \
+                image.
+
+            .. code-block:: none
+
+                [
+                    {
+                        'pan_results': PixelData,
+                        'ins_results': InstanceData,
+                        # semantic segmentation results are not supported yet
+                        'sem_results': PixelData
+                    },
+                    ...
+                ]
+        """
+        batch_img_metas = [
+            data_sample.metainfo for data_sample in batch_data_samples
+        ]
+        panoptic_on = self.test_cfg.get('panoptic_on', True)
+        semantic_on = self.test_cfg.get('semantic_on', False)
+        instance_on = self.test_cfg.get('instance_on', False)
+        assert not semantic_on, 'segmantic segmentation '\
+            'results are not supported yet.'
+
+        results = []
+        for mask_cls_result, mask_pred_result, meta in zip(
+                mask_cls_results, mask_pred_results, batch_img_metas):
+            # remove padding
+            img_height, img_width = meta['img_shape'][:2]
+            mask_pred_result = mask_pred_result[:, :img_height, :img_width]
+
+            if rescale:
+                # return result in original resolution
+                ori_height, ori_width = meta['ori_shape'][:2]
+                mask_pred_result = F.interpolate(
+                    mask_pred_result[:, None],
+                    size=(ori_height, ori_width),
+                    mode='bilinear',
+                    align_corners=False)[:, 0]
+
+            result = dict()
+            if panoptic_on:
+                pan_results = self.panoptic_postprocess(
+                    mask_cls_result, mask_pred_result)
+                result['pan_results'] = pan_results
+
+            if instance_on:
+                ins_results = self.instance_postprocess(
+                    mask_cls_result, mask_pred_result)
+                result['ins_results'] = ins_results
+
+            if semantic_on:
+                sem_results = self.semantic_postprocess(
+                    mask_cls_result, mask_pred_result)
+                result['sem_results'] = sem_results
+
+            results.append(result)
+
+        return results
diff --git a/mmde/mmdet/models/task_modules/__init__.py b/mmde/mmdet/models/task_modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bfd8f058ed656760e0b1a3fd6118f31a799cb11
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .assigners import *  # noqa: F401,F403
+from .builder import (ANCHOR_GENERATORS, BBOX_ASSIGNERS, BBOX_CODERS,
+                      BBOX_SAMPLERS, IOU_CALCULATORS, MATCH_COSTS,
+                      PRIOR_GENERATORS, build_anchor_generator, build_assigner,
+                      build_bbox_coder, build_iou_calculator, build_match_cost,
+                      build_prior_generator, build_sampler)
+from .coders import *  # noqa: F401,F403
+from .prior_generators import *  # noqa: F401,F403
+from .samplers import *  # noqa: F401,F403
+from .tracking import *  # noqa: F401,F403
+
+__all__ = [
+    'ANCHOR_GENERATORS', 'PRIOR_GENERATORS', 'BBOX_ASSIGNERS', 'BBOX_SAMPLERS',
+    'MATCH_COSTS', 'BBOX_CODERS', 'IOU_CALCULATORS', 'build_anchor_generator',
+    'build_prior_generator', 'build_assigner', 'build_sampler',
+    'build_iou_calculator', 'build_match_cost', 'build_bbox_coder'
+]
diff --git a/mmde/mmdet/models/task_modules/assigners/__init__.py b/mmde/mmdet/models/task_modules/assigners/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e564f24c95b1cc6be8a35a1a309ebf10e582032
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .approx_max_iou_assigner import ApproxMaxIoUAssigner
+from .assign_result import AssignResult
+from .atss_assigner import ATSSAssigner
+from .base_assigner import BaseAssigner
+from .center_region_assigner import CenterRegionAssigner
+from .dynamic_soft_label_assigner import DynamicSoftLabelAssigner
+from .grid_assigner import GridAssigner
+from .hungarian_assigner import HungarianAssigner
+from .iou2d_calculator import BboxOverlaps2D, BboxOverlaps2D_GLIP
+from .match_cost import (BBoxL1Cost, BinaryFocalLossCost, ClassificationCost,
+                         CrossEntropyLossCost, DiceCost, FocalLossCost,
+                         IoUCost)
+from .max_iou_assigner import MaxIoUAssigner
+from .multi_instance_assigner import MultiInstanceAssigner
+from .point_assigner import PointAssigner
+from .region_assigner import RegionAssigner
+from .sim_ota_assigner import SimOTAAssigner
+from .task_aligned_assigner import TaskAlignedAssigner
+from .topk_hungarian_assigner import TopkHungarianAssigner
+from .uniform_assigner import UniformAssigner
+
+__all__ = [
+    'BaseAssigner', 'BinaryFocalLossCost', 'MaxIoUAssigner',
+    'ApproxMaxIoUAssigner', 'AssignResult', 'PointAssigner', 'ATSSAssigner',
+    'CenterRegionAssigner', 'GridAssigner', 'HungarianAssigner',
+    'RegionAssigner', 'UniformAssigner', 'SimOTAAssigner',
+    'TaskAlignedAssigner', 'TopkHungarianAssigner', 'BBoxL1Cost',
+    'ClassificationCost', 'CrossEntropyLossCost', 'DiceCost', 'FocalLossCost',
+    'IoUCost', 'BboxOverlaps2D', 'DynamicSoftLabelAssigner',
+    'MultiInstanceAssigner', 'BboxOverlaps2D_GLIP'
+]
diff --git a/mmde/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py b/mmde/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..471d54e578d640da242355b54cebe05658309ca2
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py
@@ -0,0 +1,162 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Union
+
+import torch
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+
+from mmdet.registry import TASK_UTILS
+from .assign_result import AssignResult
+from .max_iou_assigner import MaxIoUAssigner
+
+
+@TASK_UTILS.register_module()
+class ApproxMaxIoUAssigner(MaxIoUAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposals will be assigned with an integer indicating the ground truth
+     index. (semi-positive index: gt label (0-based), -1: background)
+
+    - -1: negative sample, no assigned gt
+    - semi-positive integer: positive sample, index (0-based) of assigned gt
+
+    Args:
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum iou for a bbox to be considered as a
+            positive bbox. Positive samples can have smaller IoU than
+            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
+        gt_max_assign_all (bool): Whether to assign all bboxes with the same
+            highest overlap with some gt to that gt.
+        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
+            `gt_bboxes_ignore` is specified). Negative values mean not
+            ignoring any bboxes.
+        ignore_wrt_candidates (bool): Whether to compute the iof between
+            `bboxes` and `gt_bboxes_ignore`, or the contrary.
+        match_low_quality (bool): Whether to allow quality matches. This is
+            usually allowed for RPN and single stage detectors, but not allowed
+            in the second stage.
+        gpu_assign_thr (int): The upper bound of the number of GT for GPU
+            assign. When the number of gt is above this threshold, will assign
+            on CPU device. Negative values mean not assign on CPU.
+        iou_calculator (:obj:`ConfigDict` or dict): Config of overlaps
+            Calculator.
+    """
+
+    def __init__(
+        self,
+        pos_iou_thr: float,
+        neg_iou_thr: Union[float, tuple],
+        min_pos_iou: float = .0,
+        gt_max_assign_all: bool = True,
+        ignore_iof_thr: float = -1,
+        ignore_wrt_candidates: bool = True,
+        match_low_quality: bool = True,
+        gpu_assign_thr: int = -1,
+        iou_calculator: Union[ConfigDict, dict] = dict(type='BboxOverlaps2D')
+    ) -> None:
+        self.pos_iou_thr = pos_iou_thr
+        self.neg_iou_thr = neg_iou_thr
+        self.min_pos_iou = min_pos_iou
+        self.gt_max_assign_all = gt_max_assign_all
+        self.ignore_iof_thr = ignore_iof_thr
+        self.ignore_wrt_candidates = ignore_wrt_candidates
+        self.gpu_assign_thr = gpu_assign_thr
+        self.match_low_quality = match_low_quality
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs) -> AssignResult:
+        """Assign gt to approxs.
+
+        This method assign a gt bbox to each group of approxs (bboxes),
+        each group of approxs is represent by a base approx (bbox) and
+        will be assigned with -1, or a semi-positive number.
+        background_label (-1) means negative sample,
+        semi-positive number is the index (0-based) of assigned gt.
+        The assignment is done in following steps, the order matters.
+
+        1. assign every bbox to background_label (-1)
+        2. use the max IoU of each group of approxs to assign
+        2. assign proposals whose iou with all gts < neg_iou_thr to background
+        3. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
+           assign it to that bbox
+        4. for each gt bbox, assign its nearest proposals (may be more than
+           one) to itself
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). ``approxs`` means the
+                group of approxs aligned with ``priors``, has shape
+                (n, num_approxs, 4).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        squares = pred_instances.priors
+        approxs = pred_instances.approxs
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        gt_bboxes_ignore = None if gt_instances_ignore is None else \
+            gt_instances_ignore.get('bboxes', None)
+        approxs_per_octave = approxs.size(1)
+
+        num_squares = squares.size(0)
+        num_gts = gt_bboxes.size(0)
+
+        if num_squares == 0 or num_gts == 0:
+            # No predictions and/or truth, return empty assignment
+            overlaps = approxs.new(num_gts, num_squares)
+            assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
+            return assign_result
+
+        # re-organize anchors by approxs_per_octave x num_squares
+        approxs = torch.transpose(approxs, 0, 1).contiguous().view(-1, 4)
+        assign_on_cpu = True if (self.gpu_assign_thr > 0) and (
+            num_gts > self.gpu_assign_thr) else False
+        # compute overlap and assign gt on CPU when number of GT is large
+        if assign_on_cpu:
+            device = approxs.device
+            approxs = approxs.cpu()
+            gt_bboxes = gt_bboxes.cpu()
+            if gt_bboxes_ignore is not None:
+                gt_bboxes_ignore = gt_bboxes_ignore.cpu()
+            if gt_labels is not None:
+                gt_labels = gt_labels.cpu()
+        all_overlaps = self.iou_calculator(approxs, gt_bboxes)
+
+        overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares,
+                                        num_gts).max(dim=0)
+        overlaps = torch.transpose(overlaps, 0, 1)
+
+        if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
+                and gt_bboxes_ignore.numel() > 0 and squares.numel() > 0):
+            if self.ignore_wrt_candidates:
+                ignore_overlaps = self.iou_calculator(
+                    squares, gt_bboxes_ignore, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
+            else:
+                ignore_overlaps = self.iou_calculator(
+                    gt_bboxes_ignore, squares, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=0)
+            overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1
+
+        assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
+        if assign_on_cpu:
+            assign_result.gt_inds = assign_result.gt_inds.to(device)
+            assign_result.max_overlaps = assign_result.max_overlaps.to(device)
+            if assign_result.labels is not None:
+                assign_result.labels = assign_result.labels.to(device)
+        return assign_result
diff --git a/mmde/mmdet/models/task_modules/assigners/assign_result.py b/mmde/mmdet/models/task_modules/assigners/assign_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..56ca2c3c18fee94cc4a039b769e42521bd14907d
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/assign_result.py
@@ -0,0 +1,198 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from torch import Tensor
+
+from mmdet.utils import util_mixins
+
+
+class AssignResult(util_mixins.NiceRepr):
+    """Stores assignments between predicted and truth boxes.
+
+    Attributes:
+        num_gts (int): the number of truth boxes considered when computing this
+            assignment
+        gt_inds (Tensor): for each predicted box indicates the 1-based
+            index of the assigned truth box. 0 means unassigned and -1 means
+            ignore.
+        max_overlaps (Tensor): the iou between the predicted box and its
+            assigned truth box.
+        labels (Tensor): If specified, for each predicted box
+            indicates the category label of the assigned truth box.
+
+    Example:
+        >>> # An assign result between 4 predicted boxes and 9 true boxes
+        >>> # where only two boxes were assigned.
+        >>> num_gts = 9
+        >>> max_overlaps = torch.LongTensor([0, .5, .9, 0])
+        >>> gt_inds = torch.LongTensor([-1, 1, 2, 0])
+        >>> labels = torch.LongTensor([0, 3, 4, 0])
+        >>> self = AssignResult(num_gts, gt_inds, max_overlaps, labels)
+        >>> print(str(self))  # xdoctest: +IGNORE_WANT
+        <AssignResult(num_gts=9, gt_inds.shape=(4,), max_overlaps.shape=(4,),
+                      labels.shape=(4,))>
+        >>> # Force addition of gt labels (when adding gt as proposals)
+        >>> new_labels = torch.LongTensor([3, 4, 5])
+        >>> self.add_gt_(new_labels)
+        >>> print(str(self))  # xdoctest: +IGNORE_WANT
+        <AssignResult(num_gts=9, gt_inds.shape=(7,), max_overlaps.shape=(7,),
+                      labels.shape=(7,))>
+    """
+
+    def __init__(self, num_gts: int, gt_inds: Tensor, max_overlaps: Tensor,
+                 labels: Tensor) -> None:
+        self.num_gts = num_gts
+        self.gt_inds = gt_inds
+        self.max_overlaps = max_overlaps
+        self.labels = labels
+        # Interface for possible user-defined properties
+        self._extra_properties = {}
+
+    @property
+    def num_preds(self):
+        """int: the number of predictions in this assignment"""
+        return len(self.gt_inds)
+
+    def set_extra_property(self, key, value):
+        """Set user-defined new property."""
+        assert key not in self.info
+        self._extra_properties[key] = value
+
+    def get_extra_property(self, key):
+        """Get user-defined property."""
+        return self._extra_properties.get(key, None)
+
+    @property
+    def info(self):
+        """dict: a dictionary of info about the object"""
+        basic_info = {
+            'num_gts': self.num_gts,
+            'num_preds': self.num_preds,
+            'gt_inds': self.gt_inds,
+            'max_overlaps': self.max_overlaps,
+            'labels': self.labels,
+        }
+        basic_info.update(self._extra_properties)
+        return basic_info
+
+    def __nice__(self):
+        """str: a "nice" summary string describing this assign result"""
+        parts = []
+        parts.append(f'num_gts={self.num_gts!r}')
+        if self.gt_inds is None:
+            parts.append(f'gt_inds={self.gt_inds!r}')
+        else:
+            parts.append(f'gt_inds.shape={tuple(self.gt_inds.shape)!r}')
+        if self.max_overlaps is None:
+            parts.append(f'max_overlaps={self.max_overlaps!r}')
+        else:
+            parts.append('max_overlaps.shape='
+                         f'{tuple(self.max_overlaps.shape)!r}')
+        if self.labels is None:
+            parts.append(f'labels={self.labels!r}')
+        else:
+            parts.append(f'labels.shape={tuple(self.labels.shape)!r}')
+        return ', '.join(parts)
+
+    @classmethod
+    def random(cls, **kwargs):
+        """Create random AssignResult for tests or debugging.
+
+        Args:
+            num_preds: number of predicted boxes
+            num_gts: number of true boxes
+            p_ignore (float): probability of a predicted box assigned to an
+                ignored truth
+            p_assigned (float): probability of a predicted box not being
+                assigned
+            p_use_label (float | bool): with labels or not
+            rng (None | int | numpy.random.RandomState): seed or state
+
+        Returns:
+            :obj:`AssignResult`: Randomly generated assign results.
+
+        Example:
+            >>> from mmdet.models.task_modules.assigners.assign_result import *  # NOQA
+            >>> self = AssignResult.random()
+            >>> print(self.info)
+        """
+        from ..samplers.sampling_result import ensure_rng
+        rng = ensure_rng(kwargs.get('rng', None))
+
+        num_gts = kwargs.get('num_gts', None)
+        num_preds = kwargs.get('num_preds', None)
+        p_ignore = kwargs.get('p_ignore', 0.3)
+        p_assigned = kwargs.get('p_assigned', 0.7)
+        num_classes = kwargs.get('num_classes', 3)
+
+        if num_gts is None:
+            num_gts = rng.randint(0, 8)
+        if num_preds is None:
+            num_preds = rng.randint(0, 16)
+
+        if num_gts == 0:
+            max_overlaps = torch.zeros(num_preds, dtype=torch.float32)
+            gt_inds = torch.zeros(num_preds, dtype=torch.int64)
+            labels = torch.zeros(num_preds, dtype=torch.int64)
+
+        else:
+            import numpy as np
+
+            # Create an overlap for each predicted box
+            max_overlaps = torch.from_numpy(rng.rand(num_preds))
+
+            # Construct gt_inds for each predicted box
+            is_assigned = torch.from_numpy(rng.rand(num_preds) < p_assigned)
+            # maximum number of assignments constraints
+            n_assigned = min(num_preds, min(num_gts, is_assigned.sum()))
+
+            assigned_idxs = np.where(is_assigned)[0]
+            rng.shuffle(assigned_idxs)
+            assigned_idxs = assigned_idxs[0:n_assigned]
+            assigned_idxs.sort()
+
+            is_assigned[:] = 0
+            is_assigned[assigned_idxs] = True
+
+            is_ignore = torch.from_numpy(
+                rng.rand(num_preds) < p_ignore) & is_assigned
+
+            gt_inds = torch.zeros(num_preds, dtype=torch.int64)
+
+            true_idxs = np.arange(num_gts)
+            rng.shuffle(true_idxs)
+            true_idxs = torch.from_numpy(true_idxs)
+            gt_inds[is_assigned] = true_idxs[:n_assigned].long()
+
+            gt_inds = torch.from_numpy(
+                rng.randint(1, num_gts + 1, size=num_preds))
+            gt_inds[is_ignore] = -1
+            gt_inds[~is_assigned] = 0
+            max_overlaps[~is_assigned] = 0
+
+            if num_classes == 0:
+                labels = torch.zeros(num_preds, dtype=torch.int64)
+            else:
+                labels = torch.from_numpy(
+                    # remind that we set FG labels to [0, num_class-1]
+                    # since mmdet v2.0
+                    # BG cat_id: num_class
+                    rng.randint(0, num_classes, size=num_preds))
+                labels[~is_assigned] = 0
+
+        self = cls(num_gts, gt_inds, max_overlaps, labels)
+        return self
+
+    def add_gt_(self, gt_labels):
+        """Add ground truth as assigned results.
+
+        Args:
+            gt_labels (torch.Tensor): Labels of gt boxes
+        """
+        self_inds = torch.arange(
+            1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device)
+        self.gt_inds = torch.cat([self_inds, self.gt_inds])
+
+        self.max_overlaps = torch.cat(
+            [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps])
+
+        self.labels = torch.cat([gt_labels, self.labels])
diff --git a/mmde/mmdet/models/task_modules/assigners/atss_assigner.py b/mmde/mmdet/models/task_modules/assigners/atss_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..2796b990c5ae4c56bcf314e1342671d950232ae6
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/atss_assigner.py
@@ -0,0 +1,254 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import List, Optional
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.utils import ConfigType
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+def bbox_center_distance(bboxes: Tensor, priors: Tensor) -> Tensor:
+    """Compute the center distance between bboxes and priors.
+
+    Args:
+        bboxes (Tensor): Shape (n, 4) for , "xyxy" format.
+        priors (Tensor): Shape (n, 4) for priors, "xyxy" format.
+
+    Returns:
+        Tensor: Center distances between bboxes and priors.
+    """
+    bbox_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0
+    bbox_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0
+    bbox_points = torch.stack((bbox_cx, bbox_cy), dim=1)
+
+    priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0
+    priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0
+    priors_points = torch.stack((priors_cx, priors_cy), dim=1)
+
+    distances = (priors_points[:, None, :] -
+                 bbox_points[None, :, :]).pow(2).sum(-1).sqrt()
+
+    return distances
+
+
+@TASK_UTILS.register_module()
+class ATSSAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each prior.
+
+    Each proposals will be assigned with `0` or a positive integer
+    indicating the ground truth index.
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    If ``alpha`` is not None, it means that the dynamic cost
+    ATSSAssigner is adopted, which is currently only used in the DDOD.
+
+    Args:
+        topk (int): number of priors selected in each level
+        alpha (float, optional): param of cost rate for each proposal only
+            in DDOD. Defaults to None.
+        iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou
+            calculator. Defaults to ``dict(type='BboxOverlaps2D')``
+        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
+            `gt_bboxes_ignore` is specified). Negative values mean not
+            ignoring any bboxes. Defaults to -1.
+    """
+
+    def __init__(self,
+                 topk: int,
+                 alpha: Optional[float] = None,
+                 iou_calculator: ConfigType = dict(type='BboxOverlaps2D'),
+                 ignore_iof_thr: float = -1) -> None:
+        self.topk = topk
+        self.alpha = alpha
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+        self.ignore_iof_thr = ignore_iof_thr
+
+    # https://github.com/sfzhang15/ATSS/blob/master/atss_core/modeling/rpn/atss/loss.py
+    def assign(
+            self,
+            pred_instances: InstanceData,
+            num_level_priors: List[int],
+            gt_instances: InstanceData,
+            gt_instances_ignore: Optional[InstanceData] = None
+    ) -> AssignResult:
+        """Assign gt to priors.
+
+        The assignment is done in following steps
+
+        1. compute iou between all prior (prior of all pyramid levels) and gt
+        2. compute center distance between all prior and gt
+        3. on each pyramid level, for each gt, select k prior whose center
+           are closest to the gt center, so we total select k*l prior as
+           candidates for each gt
+        4. get corresponding iou for the these candidates, and compute the
+           mean and std, set mean + std as the iou threshold
+        5. select these candidates whose iou are greater than or equal to
+           the threshold as positive
+        6. limit the positive sample's center in gt
+
+        If ``alpha`` is not None, and ``cls_scores`` and `bbox_preds`
+        are not None, the overlaps calculation in the first step
+        will also include dynamic cost, which is currently only used in
+        the DDOD.
+
+        Args:
+            pred_instances (:obj:`InstaceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors, points, or bboxes predicted by the model,
+                shape(n, 4).
+            num_level_priors (List): Number of bboxes in each level
+            gt_instances (:obj:`InstaceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            gt_instances_ignore (:obj:`InstaceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        gt_bboxes = gt_instances.bboxes
+        priors = pred_instances.priors
+        gt_labels = gt_instances.labels
+        if gt_instances_ignore is not None:
+            gt_bboxes_ignore = gt_instances_ignore.bboxes
+        else:
+            gt_bboxes_ignore = None
+
+        INF = 100000000
+        priors = priors[:, :4]
+        num_gt, num_priors = gt_bboxes.size(0), priors.size(0)
+
+        message = 'Invalid alpha parameter because cls_scores or ' \
+                  'bbox_preds are None. If you want to use the ' \
+                  'cost-based ATSSAssigner,  please set cls_scores, ' \
+                  'bbox_preds and self.alpha at the same time. '
+
+        # compute iou between all bbox and gt
+        if self.alpha is None:
+            # ATSSAssigner
+            overlaps = self.iou_calculator(priors, gt_bboxes)
+            if ('scores' in pred_instances or 'bboxes' in pred_instances):
+                warnings.warn(message)
+
+        else:
+            # Dynamic cost ATSSAssigner in DDOD
+            assert ('scores' in pred_instances
+                    and 'bboxes' in pred_instances), message
+            cls_scores = pred_instances.scores
+            bbox_preds = pred_instances.bboxes
+
+            # compute cls cost for bbox and GT
+            cls_cost = torch.sigmoid(cls_scores[:, gt_labels])
+
+            # compute iou between all bbox and gt
+            overlaps = self.iou_calculator(bbox_preds, gt_bboxes)
+
+            # make sure that we are in element-wise multiplication
+            assert cls_cost.shape == overlaps.shape
+
+            # overlaps is actually a cost matrix
+            overlaps = cls_cost**(1 - self.alpha) * overlaps**self.alpha
+
+        # assign 0 by default
+        assigned_gt_inds = overlaps.new_full((num_priors, ),
+                                             0,
+                                             dtype=torch.long)
+
+        if num_gt == 0 or num_priors == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = overlaps.new_zeros((num_priors, ))
+            if num_gt == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            assigned_labels = overlaps.new_full((num_priors, ),
+                                                -1,
+                                                dtype=torch.long)
+            return AssignResult(
+                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+        # compute center distance between all bbox and gt
+        distances = bbox_center_distance(gt_bboxes, priors)
+
+        if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
+                and gt_bboxes_ignore.numel() > 0 and priors.numel() > 0):
+            ignore_overlaps = self.iou_calculator(
+                priors, gt_bboxes_ignore, mode='iof')
+            ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
+            ignore_idxs = ignore_max_overlaps > self.ignore_iof_thr
+            distances[ignore_idxs, :] = INF
+            assigned_gt_inds[ignore_idxs] = -1
+
+        # Selecting candidates based on the center distance
+        candidate_idxs = []
+        start_idx = 0
+        for level, priors_per_level in enumerate(num_level_priors):
+            # on each pyramid level, for each gt,
+            # select k bbox whose center are closest to the gt center
+            end_idx = start_idx + priors_per_level
+            distances_per_level = distances[start_idx:end_idx, :]
+            selectable_k = min(self.topk, priors_per_level)
+            _, topk_idxs_per_level = distances_per_level.topk(
+                selectable_k, dim=0, largest=False)
+            candidate_idxs.append(topk_idxs_per_level + start_idx)
+            start_idx = end_idx
+        candidate_idxs = torch.cat(candidate_idxs, dim=0)
+
+        # get corresponding iou for the these candidates, and compute the
+        # mean and std, set mean + std as the iou threshold
+        candidate_overlaps = overlaps[candidate_idxs, torch.arange(num_gt)]
+        overlaps_mean_per_gt = candidate_overlaps.mean(0)
+        overlaps_std_per_gt = candidate_overlaps.std(0)
+        overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
+
+        is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :]
+
+        # limit the positive sample's center in gt
+        for gt_idx in range(num_gt):
+            candidate_idxs[:, gt_idx] += gt_idx * num_priors
+        priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0
+        priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0
+        ep_priors_cx = priors_cx.view(1, -1).expand(
+            num_gt, num_priors).contiguous().view(-1)
+        ep_priors_cy = priors_cy.view(1, -1).expand(
+            num_gt, num_priors).contiguous().view(-1)
+        candidate_idxs = candidate_idxs.view(-1)
+
+        # calculate the left, top, right, bottom distance between positive
+        # prior center and gt side
+        l_ = ep_priors_cx[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 0]
+        t_ = ep_priors_cy[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 1]
+        r_ = gt_bboxes[:, 2] - ep_priors_cx[candidate_idxs].view(-1, num_gt)
+        b_ = gt_bboxes[:, 3] - ep_priors_cy[candidate_idxs].view(-1, num_gt)
+        is_in_gts = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] > 0.01
+
+        is_pos = is_pos & is_in_gts
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest IoU will be selected.
+        overlaps_inf = torch.full_like(overlaps,
+                                       -INF).t().contiguous().view(-1)
+        index = candidate_idxs.view(-1)[is_pos.view(-1)]
+        overlaps_inf[index] = overlaps.t().contiguous().view(-1)[index]
+        overlaps_inf = overlaps_inf.view(num_gt, -1).t()
+
+        max_overlaps, argmax_overlaps = overlaps_inf.max(dim=1)
+        assigned_gt_inds[
+            max_overlaps != -INF] = argmax_overlaps[max_overlaps != -INF] + 1
+
+        assigned_labels = assigned_gt_inds.new_full((num_priors, ), -1)
+        pos_inds = torch.nonzero(
+            assigned_gt_inds > 0, as_tuple=False).squeeze()
+        if pos_inds.numel() > 0:
+            assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] -
+                                                  1]
+        return AssignResult(
+            num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
diff --git a/mmde/mmdet/models/task_modules/assigners/base_assigner.py b/mmde/mmdet/models/task_modules/assigners/base_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..b12280ad746c7557008313dd936a62a99e8c78d5
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/base_assigner.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Optional
+
+from mmengine.structures import InstanceData
+
+
+class BaseAssigner(metaclass=ABCMeta):
+    """Base assigner that assigns boxes to ground truth boxes."""
+
+    @abstractmethod
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs):
+        """Assign boxes to either a ground truth boxes or a negative boxes."""
diff --git a/mmde/mmdet/models/task_modules/assigners/center_region_assigner.py b/mmde/mmdet/models/task_modules/assigners/center_region_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..11c8055c67cdf46c1ae0f877e88192db33795581
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/center_region_assigner.py
@@ -0,0 +1,366 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.utils import ConfigType
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+def scale_boxes(bboxes: Tensor, scale: float) -> Tensor:
+    """Expand an array of boxes by a given scale.
+
+    Args:
+        bboxes (Tensor): Shape (m, 4)
+        scale (float): The scale factor of bboxes
+
+    Returns:
+        Tensor: Shape (m, 4). Scaled bboxes
+    """
+    assert bboxes.size(1) == 4
+    w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5
+    h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5
+    x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5
+    y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5
+
+    w_half *= scale
+    h_half *= scale
+
+    boxes_scaled = torch.zeros_like(bboxes)
+    boxes_scaled[:, 0] = x_c - w_half
+    boxes_scaled[:, 2] = x_c + w_half
+    boxes_scaled[:, 1] = y_c - h_half
+    boxes_scaled[:, 3] = y_c + h_half
+    return boxes_scaled
+
+
+def is_located_in(points: Tensor, bboxes: Tensor) -> Tensor:
+    """Are points located in bboxes.
+
+    Args:
+        points (Tensor): Points, shape: (m, 2).
+        bboxes (Tensor): Bounding boxes, shape: (n, 4).
+
+    Return:
+        Tensor: Flags indicating if points are located in bboxes,
+        shape: (m, n).
+    """
+    assert points.size(1) == 2
+    assert bboxes.size(1) == 4
+    return (points[:, 0].unsqueeze(1) > bboxes[:, 0].unsqueeze(0)) & \
+           (points[:, 0].unsqueeze(1) < bboxes[:, 2].unsqueeze(0)) & \
+           (points[:, 1].unsqueeze(1) > bboxes[:, 1].unsqueeze(0)) & \
+           (points[:, 1].unsqueeze(1) < bboxes[:, 3].unsqueeze(0))
+
+
+def bboxes_area(bboxes: Tensor) -> Tensor:
+    """Compute the area of an array of bboxes.
+
+    Args:
+        bboxes (Tensor): The coordinates ox bboxes. Shape: (m, 4)
+
+    Returns:
+        Tensor: Area of the bboxes. Shape: (m, )
+    """
+    assert bboxes.size(1) == 4
+    w = (bboxes[:, 2] - bboxes[:, 0])
+    h = (bboxes[:, 3] - bboxes[:, 1])
+    areas = w * h
+    return areas
+
+
+@TASK_UTILS.register_module()
+class CenterRegionAssigner(BaseAssigner):
+    """Assign pixels at the center region of a bbox as positive.
+
+    Each proposals will be assigned with `-1`, `0`, or a positive integer
+    indicating the ground truth index.
+    - -1: negative samples
+    - semi-positive numbers: positive sample, index (0-based) of assigned gt
+
+    Args:
+        pos_scale (float): Threshold within which pixels are
+            labelled as positive.
+        neg_scale (float): Threshold above which pixels are
+            labelled as positive.
+        min_pos_iof (float): Minimum iof of a pixel with a gt to be
+            labelled as positive. Default: 1e-2
+        ignore_gt_scale (float): Threshold within which the pixels
+            are ignored when the gt is labelled as shadowed. Default: 0.5
+        foreground_dominate (bool): If True, the bbox will be assigned as
+            positive when a gt's kernel region overlaps with another's shadowed
+            (ignored) region, otherwise it is set as ignored. Default to False.
+        iou_calculator (:obj:`ConfigDict` or dict): Config of overlaps
+            Calculator.
+    """
+
+    def __init__(
+        self,
+        pos_scale: float,
+        neg_scale: float,
+        min_pos_iof: float = 1e-2,
+        ignore_gt_scale: float = 0.5,
+        foreground_dominate: bool = False,
+        iou_calculator: ConfigType = dict(type='BboxOverlaps2D')
+    ) -> None:
+        self.pos_scale = pos_scale
+        self.neg_scale = neg_scale
+        self.min_pos_iof = min_pos_iof
+        self.ignore_gt_scale = ignore_gt_scale
+        self.foreground_dominate = foreground_dominate
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def get_gt_priorities(self, gt_bboxes: Tensor) -> Tensor:
+        """Get gt priorities according to their areas.
+
+        Smaller gt has higher priority.
+
+        Args:
+            gt_bboxes (Tensor): Ground truth boxes, shape (k, 4).
+
+        Returns:
+            Tensor: The priority of gts so that gts with larger priority is
+            more likely to be assigned. Shape (k, )
+        """
+        gt_areas = bboxes_area(gt_bboxes)
+        # Rank all gt bbox areas. Smaller objects has larger priority
+        _, sort_idx = gt_areas.sort(descending=True)
+        sort_idx = sort_idx.argsort()
+        return sort_idx
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs) -> AssignResult:
+        """Assign gt to bboxes.
+
+        This method assigns gts to every prior (proposal/anchor), each prior
+        will be assigned with -1, or a semi-positive number. -1 means
+        negative sample, semi-positive number is the index (0-based) of
+        assigned gt.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            :obj:`AssignResult`: The assigned result. Note that shadowed_labels
+            of shape (N, 2) is also added as an `assign_result` attribute.
+            `shadowed_labels` is a tensor composed of N pairs of anchor_ind,
+            class_label], where N is the number of anchors that lie in the
+            outer region of a gt, anchor_ind is the shadowed anchor index
+            and class_label is the shadowed class label.
+
+        Example:
+            >>> from mmengine.structures import InstanceData
+            >>> self = CenterRegionAssigner(0.2, 0.2)
+            >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10],
+            ...                                      [10, 10, 20, 20]])
+            >>> gt_instances = InstanceData()
+            >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 10]])
+            >>> gt_instances.labels = torch.Tensor([0])
+            >>> assign_result = self.assign(pred_instances, gt_instances)
+            >>> expected_gt_inds = torch.LongTensor([1, 0])
+            >>> assert torch.all(assign_result.gt_inds == expected_gt_inds)
+        """
+        # There are in total 5 steps in the pixel assignment
+        # 1. Find core (the center region, say inner 0.2)
+        #     and shadow (the relatively ourter part, say inner 0.2-0.5)
+        #     regions of every gt.
+        # 2. Find all prior bboxes that lie in gt_core and gt_shadow regions
+        # 3. Assign prior bboxes in gt_core with a one-hot id of the gt in
+        #      the image.
+        #    3.1. For overlapping objects, the prior bboxes in gt_core is
+        #           assigned with the object with smallest area
+        # 4. Assign prior bboxes with class label according to its gt id.
+        #    4.1. Assign -1 to prior bboxes lying in shadowed gts
+        #    4.2. Assign positive prior boxes with the corresponding label
+        # 5. Find pixels lying in the shadow of an object and assign them with
+        #      background label, but set the loss weight of its corresponding
+        #      gt to zero.
+
+        # TODO not extract bboxes in assign.
+        gt_bboxes = gt_instances.bboxes
+        priors = pred_instances.priors
+        gt_labels = gt_instances.labels
+
+        assert priors.size(1) == 4, 'priors must have size of 4'
+        # 1. Find core positive and shadow region of every gt
+        gt_core = scale_boxes(gt_bboxes, self.pos_scale)
+        gt_shadow = scale_boxes(gt_bboxes, self.neg_scale)
+
+        # 2. Find prior bboxes that lie in gt_core and gt_shadow regions
+        prior_centers = (priors[:, 2:4] + priors[:, 0:2]) / 2
+        # The center points lie within the gt boxes
+        is_prior_in_gt = is_located_in(prior_centers, gt_bboxes)
+        # Only calculate prior and gt_core IoF. This enables small prior bboxes
+        #   to match large gts
+        prior_and_gt_core_overlaps = self.iou_calculator(
+            priors, gt_core, mode='iof')
+        # The center point of effective priors should be within the gt box
+        is_prior_in_gt_core = is_prior_in_gt & (
+            prior_and_gt_core_overlaps > self.min_pos_iof)  # shape (n, k)
+
+        is_prior_in_gt_shadow = (
+            self.iou_calculator(priors, gt_shadow, mode='iof') >
+            self.min_pos_iof)
+        # Rule out center effective positive pixels
+        is_prior_in_gt_shadow &= (~is_prior_in_gt_core)
+
+        num_gts, num_priors = gt_bboxes.size(0), priors.size(0)
+        if num_gts == 0 or num_priors == 0:
+            # If no gts exist, assign all pixels to negative
+            assigned_gt_ids = \
+                is_prior_in_gt_core.new_zeros((num_priors,),
+                                              dtype=torch.long)
+            pixels_in_gt_shadow = assigned_gt_ids.new_empty((0, 2))
+        else:
+            # Step 3: assign a one-hot gt id to each pixel, and smaller objects
+            #    have high priority to assign the pixel.
+            sort_idx = self.get_gt_priorities(gt_bboxes)
+            assigned_gt_ids, pixels_in_gt_shadow = \
+                self.assign_one_hot_gt_indices(is_prior_in_gt_core,
+                                               is_prior_in_gt_shadow,
+                                               gt_priority=sort_idx)
+
+        if (gt_instances_ignore is not None
+                and gt_instances_ignore.bboxes.numel() > 0):
+            # No ground truth or boxes, return empty assignment
+            gt_bboxes_ignore = gt_instances_ignore.bboxes
+            gt_bboxes_ignore = scale_boxes(
+                gt_bboxes_ignore, scale=self.ignore_gt_scale)
+            is_prior_in_ignored_gts = is_located_in(prior_centers,
+                                                    gt_bboxes_ignore)
+            is_prior_in_ignored_gts = is_prior_in_ignored_gts.any(dim=1)
+            assigned_gt_ids[is_prior_in_ignored_gts] = -1
+
+        # 4. Assign prior bboxes with class label according to its gt id.
+        # Default assigned label is the background (-1)
+        assigned_labels = assigned_gt_ids.new_full((num_priors, ), -1)
+        pos_inds = torch.nonzero(assigned_gt_ids > 0, as_tuple=False).squeeze()
+        if pos_inds.numel() > 0:
+            assigned_labels[pos_inds] = gt_labels[assigned_gt_ids[pos_inds] -
+                                                  1]
+        # 5. Find pixels lying in the shadow of an object
+        shadowed_pixel_labels = pixels_in_gt_shadow.clone()
+        if pixels_in_gt_shadow.numel() > 0:
+            pixel_idx, gt_idx =\
+                pixels_in_gt_shadow[:, 0], pixels_in_gt_shadow[:, 1]
+            assert (assigned_gt_ids[pixel_idx] != gt_idx).all(), \
+                'Some pixels are dually assigned to ignore and gt!'
+            shadowed_pixel_labels[:, 1] = gt_labels[gt_idx - 1]
+            override = (
+                assigned_labels[pixel_idx] == shadowed_pixel_labels[:, 1])
+            if self.foreground_dominate:
+                # When a pixel is both positive and shadowed, set it as pos
+                shadowed_pixel_labels = shadowed_pixel_labels[~override]
+            else:
+                # When a pixel is both pos and shadowed, set it as shadowed
+                assigned_labels[pixel_idx[override]] = -1
+                assigned_gt_ids[pixel_idx[override]] = 0
+
+        assign_result = AssignResult(
+            num_gts, assigned_gt_ids, None, labels=assigned_labels)
+        # Add shadowed_labels as assign_result property. Shape: (num_shadow, 2)
+        assign_result.set_extra_property('shadowed_labels',
+                                         shadowed_pixel_labels)
+        return assign_result
+
+    def assign_one_hot_gt_indices(
+            self,
+            is_prior_in_gt_core: Tensor,
+            is_prior_in_gt_shadow: Tensor,
+            gt_priority: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
+        """Assign only one gt index to each prior box.
+
+        Gts with large gt_priority are more likely to be assigned.
+
+        Args:
+            is_prior_in_gt_core (Tensor): Bool tensor indicating the prior
+                center is in the core area of a gt (e.g. 0-0.2).
+                Shape: (num_prior, num_gt).
+            is_prior_in_gt_shadow (Tensor): Bool tensor indicating the prior
+                center is in the shadowed area of a gt (e.g. 0.2-0.5).
+                Shape: (num_prior, num_gt).
+            gt_priority (Tensor): Priorities of gts. The gt with a higher
+                priority is more likely to be assigned to the bbox when the
+                bbox match with multiple gts. Shape: (num_gt, ).
+
+        Returns:
+            tuple: Returns (assigned_gt_inds, shadowed_gt_inds).
+
+            - assigned_gt_inds: The assigned gt index of each prior bbox \
+            (i.e. index from 1 to num_gts). Shape: (num_prior, ).
+            - shadowed_gt_inds: shadowed gt indices. It is a tensor of \
+            shape (num_ignore, 2) with first column being the shadowed prior \
+            bbox indices and the second column the shadowed gt \
+            indices (1-based).
+        """
+        num_bboxes, num_gts = is_prior_in_gt_core.shape
+
+        if gt_priority is None:
+            gt_priority = torch.arange(
+                num_gts, device=is_prior_in_gt_core.device)
+        assert gt_priority.size(0) == num_gts
+        # The bigger gt_priority, the more preferable to be assigned
+        # The assigned inds are by default 0 (background)
+        assigned_gt_inds = is_prior_in_gt_core.new_zeros((num_bboxes, ),
+                                                         dtype=torch.long)
+        # Shadowed bboxes are assigned to be background. But the corresponding
+        #   label is ignored during loss calculation, which is done through
+        #   shadowed_gt_inds
+        shadowed_gt_inds = torch.nonzero(is_prior_in_gt_shadow, as_tuple=False)
+        if is_prior_in_gt_core.sum() == 0:  # No gt match
+            shadowed_gt_inds[:, 1] += 1  # 1-based. For consistency issue
+            return assigned_gt_inds, shadowed_gt_inds
+
+        # The priority of each prior box and gt pair. If one prior box is
+        #  matched bo multiple gts. Only the pair with the highest priority
+        #  is saved
+        pair_priority = is_prior_in_gt_core.new_full((num_bboxes, num_gts),
+                                                     -1,
+                                                     dtype=torch.long)
+
+        # Each bbox could match with multiple gts.
+        # The following codes deal with this situation
+        # Matched  bboxes (to any gt). Shape: (num_pos_anchor, )
+        inds_of_match = torch.any(is_prior_in_gt_core, dim=1)
+        # The matched gt index of each positive bbox. Length >= num_pos_anchor
+        #   , since one bbox could match multiple gts
+        matched_bbox_gt_inds = torch.nonzero(
+            is_prior_in_gt_core, as_tuple=False)[:, 1]
+        # Assign priority to each bbox-gt pair.
+        pair_priority[is_prior_in_gt_core] = gt_priority[matched_bbox_gt_inds]
+        _, argmax_priority = pair_priority[inds_of_match].max(dim=1)
+        assigned_gt_inds[inds_of_match] = argmax_priority + 1  # 1-based
+        # Zero-out the assigned anchor box to filter the shadowed gt indices
+        is_prior_in_gt_core[inds_of_match, argmax_priority] = 0
+        # Concat the shadowed indices due to overlapping with that out side of
+        #   effective scale. shape: (total_num_ignore, 2)
+        shadowed_gt_inds = torch.cat(
+            (shadowed_gt_inds,
+             torch.nonzero(is_prior_in_gt_core, as_tuple=False)),
+            dim=0)
+        # Change `is_prior_in_gt_core` back to keep arguments intact.
+        is_prior_in_gt_core[inds_of_match, argmax_priority] = 1
+        # 1-based shadowed gt indices, to be consistent with `assigned_gt_inds`
+        if shadowed_gt_inds.numel() > 0:
+            shadowed_gt_inds[:, 1] += 1
+        return assigned_gt_inds, shadowed_gt_inds
diff --git a/mmde/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py b/mmde/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fc7af39b22cd6dc00248e330547176787c23963
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py
@@ -0,0 +1,227 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes
+from mmdet.utils import ConfigType
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+INF = 100000000
+EPS = 1.0e-7
+
+
+def center_of_mass(masks: Tensor, eps: float = 1e-7) -> Tensor:
+    """Compute the masks center of mass.
+
+    Args:
+        masks: Mask tensor, has shape (num_masks, H, W).
+        eps: a small number to avoid normalizer to be zero.
+            Defaults to 1e-7.
+    Returns:
+        Tensor: The masks center of mass. Has shape (num_masks, 2).
+    """
+    n, h, w = masks.shape
+    grid_h = torch.arange(h, device=masks.device)[:, None]
+    grid_w = torch.arange(w, device=masks.device)
+    normalizer = masks.sum(dim=(1, 2)).float().clamp(min=eps)
+    center_y = (masks * grid_h).sum(dim=(1, 2)) / normalizer
+    center_x = (masks * grid_w).sum(dim=(1, 2)) / normalizer
+    center = torch.cat([center_x[:, None], center_y[:, None]], dim=1)
+    return center
+
+
+@TASK_UTILS.register_module()
+class DynamicSoftLabelAssigner(BaseAssigner):
+    """Computes matching between predictions and ground truth with dynamic soft
+    label assignment.
+
+    Args:
+        soft_center_radius (float): Radius of the soft center prior.
+            Defaults to 3.0.
+        topk (int): Select top-k predictions to calculate dynamic k
+            best matches for each gt. Defaults to 13.
+        iou_weight (float): The scale factor of iou cost. Defaults to 3.0.
+        iou_calculator (ConfigType): Config of overlaps Calculator.
+            Defaults to dict(type='BboxOverlaps2D').
+    """
+
+    def __init__(
+        self,
+        soft_center_radius: float = 3.0,
+        topk: int = 13,
+        iou_weight: float = 3.0,
+        iou_calculator: ConfigType = dict(type='BboxOverlaps2D')
+    ) -> None:
+        self.soft_center_radius = soft_center_radius
+        self.topk = topk
+        self.iou_weight = iou_weight
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs) -> AssignResult:
+        """Assign gt to priors.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+        Returns:
+            obj:`AssignResult`: The assigned result.
+        """
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        num_gt = gt_bboxes.size(0)
+
+        decoded_bboxes = pred_instances.bboxes
+        pred_scores = pred_instances.scores
+        priors = pred_instances.priors
+        num_bboxes = decoded_bboxes.size(0)
+
+        # assign 0 by default
+        assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ),
+                                                   0,
+                                                   dtype=torch.long)
+        if num_gt == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = decoded_bboxes.new_zeros((num_bboxes, ))
+            if num_gt == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            assigned_labels = decoded_bboxes.new_full((num_bboxes, ),
+                                                      -1,
+                                                      dtype=torch.long)
+            return AssignResult(
+                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+        prior_center = priors[:, :2]
+        if isinstance(gt_bboxes, BaseBoxes):
+            is_in_gts = gt_bboxes.find_inside_points(prior_center)
+        else:
+            # Tensor boxes will be treated as horizontal boxes by defaults
+            lt_ = prior_center[:, None] - gt_bboxes[:, :2]
+            rb_ = gt_bboxes[:, 2:] - prior_center[:, None]
+
+            deltas = torch.cat([lt_, rb_], dim=-1)
+            is_in_gts = deltas.min(dim=-1).values > 0
+
+        valid_mask = is_in_gts.sum(dim=1) > 0
+
+        valid_decoded_bbox = decoded_bboxes[valid_mask]
+        valid_pred_scores = pred_scores[valid_mask]
+        num_valid = valid_decoded_bbox.size(0)
+
+        if num_valid == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = decoded_bboxes.new_zeros((num_bboxes, ))
+            assigned_labels = decoded_bboxes.new_full((num_bboxes, ),
+                                                      -1,
+                                                      dtype=torch.long)
+            return AssignResult(
+                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+        if hasattr(gt_instances, 'masks'):
+            gt_center = center_of_mass(gt_instances.masks, eps=EPS)
+        elif isinstance(gt_bboxes, BaseBoxes):
+            gt_center = gt_bboxes.centers
+        else:
+            # Tensor boxes will be treated as horizontal boxes by defaults
+            gt_center = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2.0
+        valid_prior = priors[valid_mask]
+        strides = valid_prior[:, 2]
+        distance = (valid_prior[:, None, :2] - gt_center[None, :, :]
+                    ).pow(2).sum(-1).sqrt() / strides[:, None]
+        soft_center_prior = torch.pow(10, distance - self.soft_center_radius)
+
+        pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes)
+        iou_cost = -torch.log(pairwise_ious + EPS) * self.iou_weight
+
+        gt_onehot_label = (
+            F.one_hot(gt_labels.to(torch.int64),
+                      pred_scores.shape[-1]).float().unsqueeze(0).repeat(
+                          num_valid, 1, 1))
+        valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1)
+
+        soft_label = gt_onehot_label * pairwise_ious[..., None]
+        scale_factor = soft_label - valid_pred_scores.sigmoid()
+        soft_cls_cost = F.binary_cross_entropy_with_logits(
+            valid_pred_scores, soft_label,
+            reduction='none') * scale_factor.abs().pow(2.0)
+        soft_cls_cost = soft_cls_cost.sum(dim=-1)
+
+        cost_matrix = soft_cls_cost + iou_cost + soft_center_prior
+
+        matched_pred_ious, matched_gt_inds = self.dynamic_k_matching(
+            cost_matrix, pairwise_ious, num_gt, valid_mask)
+
+        # convert to AssignResult format
+        assigned_gt_inds[valid_mask] = matched_gt_inds + 1
+        assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+        assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long()
+        max_overlaps = assigned_gt_inds.new_full((num_bboxes, ),
+                                                 -INF,
+                                                 dtype=torch.float32)
+        max_overlaps[valid_mask] = matched_pred_ious
+        return AssignResult(
+            num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+    def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor,
+                           num_gt: int,
+                           valid_mask: Tensor) -> Tuple[Tensor, Tensor]:
+        """Use IoU and matching cost to calculate the dynamic top-k positive
+        targets. Same as SimOTA.
+
+        Args:
+            cost (Tensor): Cost matrix.
+            pairwise_ious (Tensor): Pairwise iou matrix.
+            num_gt (int): Number of gt.
+            valid_mask (Tensor): Mask for valid bboxes.
+
+        Returns:
+            tuple: matched ious and gt indexes.
+        """
+        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)
+        # select candidate topk ious for dynamic-k calculation
+        candidate_topk = min(self.topk, pairwise_ious.size(0))
+        topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0)
+        # calculate dynamic k for each gt
+        dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1)
+        for gt_idx in range(num_gt):
+            _, pos_idx = torch.topk(
+                cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False)
+            matching_matrix[:, gt_idx][pos_idx] = 1
+
+        del topk_ious, dynamic_ks, pos_idx
+
+        prior_match_gt_mask = matching_matrix.sum(1) > 1
+        if prior_match_gt_mask.sum() > 0:
+            cost_min, cost_argmin = torch.min(
+                cost[prior_match_gt_mask, :], dim=1)
+            matching_matrix[prior_match_gt_mask, :] *= 0
+            matching_matrix[prior_match_gt_mask, cost_argmin] = 1
+        # get foreground mask inside box and center prior
+        fg_mask_inboxes = matching_matrix.sum(1) > 0
+        valid_mask[valid_mask.clone()] = fg_mask_inboxes
+
+        matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1)
+        matched_pred_ious = (matching_matrix *
+                             pairwise_ious).sum(1)[fg_mask_inboxes]
+        return matched_pred_ious, matched_gt_inds
diff --git a/mmde/mmdet/models/task_modules/assigners/grid_assigner.py b/mmde/mmdet/models/task_modules/assigners/grid_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8935d2df2937f90c71599e5b45ed9a3dff8cd7e
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/grid_assigner.py
@@ -0,0 +1,177 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.registry import TASK_UTILS
+from mmdet.utils import ConfigType
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@TASK_UTILS.register_module()
+class GridAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposals will be assigned with `-1`, `0`, or a positive integer
+    indicating the ground truth index.
+
+    - -1: don't care
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple[float, float]): IoU threshold for negative
+        bboxes.
+        min_pos_iou (float): Minimum iou for a bbox to be considered as a
+            positive bbox. Positive samples can have smaller IoU than
+            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
+            Defaults to 0.
+        gt_max_assign_all (bool): Whether to assign all bboxes with the same
+            highest overlap with some gt to that gt.
+        iou_calculator (:obj:`ConfigDict` or dict): Config of overlaps
+            Calculator.
+    """
+
+    def __init__(
+        self,
+        pos_iou_thr: float,
+        neg_iou_thr: Union[float, Tuple[float, float]],
+        min_pos_iou: float = .0,
+        gt_max_assign_all: bool = True,
+        iou_calculator: ConfigType = dict(type='BboxOverlaps2D')
+    ) -> None:
+        self.pos_iou_thr = pos_iou_thr
+        self.neg_iou_thr = neg_iou_thr
+        self.min_pos_iou = min_pos_iou
+        self.gt_max_assign_all = gt_max_assign_all
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs) -> AssignResult:
+        """Assign gt to bboxes. The process is very much like the max iou
+        assigner, except that positive samples are constrained within the cell
+        that the gt boxes fell in.
+
+        This method assign a gt bbox to every bbox (proposal/anchor), each bbox
+        will be assigned with -1, 0, or a positive number. -1 means don't care,
+        0 means negative sample, positive number is the index (1-based) of
+        assigned gt.
+        The assignment is done in following steps, the order matters.
+
+        1. assign every bbox to -1
+        2. assign proposals whose iou with all gts <= neg_iou_thr to 0
+        3. for each bbox within a cell, if the iou with its nearest gt >
+            pos_iou_thr and the center of that gt falls inside the cell,
+            assign it to that bbox
+        4. for each gt bbox, assign its nearest proposals within the cell the
+            gt bbox falls in to itself.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+
+        priors = pred_instances.priors
+        responsible_flags = pred_instances.responsible_flags
+
+        num_gts, num_priors = gt_bboxes.size(0), priors.size(0)
+
+        # compute iou between all gt and priors
+        overlaps = self.iou_calculator(gt_bboxes, priors)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = overlaps.new_full((num_priors, ),
+                                             -1,
+                                             dtype=torch.long)
+
+        if num_gts == 0 or num_priors == 0:
+            # No ground truth or priors, return empty assignment
+            max_overlaps = overlaps.new_zeros((num_priors, ))
+            if num_gts == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            assigned_labels = overlaps.new_full((num_priors, ),
+                                                -1,
+                                                dtype=torch.long)
+            return AssignResult(
+                num_gts,
+                assigned_gt_inds,
+                max_overlaps,
+                labels=assigned_labels)
+
+        # 2. assign negative: below
+        # for each anchor, which gt best overlaps with it
+        # for each anchor, the max iou of all gts
+        # shape of max_overlaps == argmax_overlaps == num_priors
+        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+        if isinstance(self.neg_iou_thr, float):
+            assigned_gt_inds[(max_overlaps >= 0)
+                             & (max_overlaps <= self.neg_iou_thr)] = 0
+        elif isinstance(self.neg_iou_thr, (tuple, list)):
+            assert len(self.neg_iou_thr) == 2
+            assigned_gt_inds[(max_overlaps > self.neg_iou_thr[0])
+                             & (max_overlaps <= self.neg_iou_thr[1])] = 0
+
+        # 3. assign positive: falls into responsible cell and above
+        # positive IOU threshold, the order matters.
+        # the prior condition of comparison is to filter out all
+        # unrelated anchors, i.e. not responsible_flags
+        overlaps[:, ~responsible_flags.type(torch.bool)] = -1.
+
+        # calculate max_overlaps again, but this time we only consider IOUs
+        # for anchors responsible for prediction
+        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+        # for each gt, which anchor best overlaps with it
+        # for each gt, the max iou of all proposals
+        # shape of gt_max_overlaps == gt_argmax_overlaps == num_gts
+        gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)
+
+        pos_inds = (max_overlaps > self.pos_iou_thr) & responsible_flags.type(
+            torch.bool)
+        assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1
+
+        # 4. assign positive to max overlapped anchors within responsible cell
+        for i in range(num_gts):
+            if gt_max_overlaps[i] > self.min_pos_iou:
+                if self.gt_max_assign_all:
+                    max_iou_inds = (overlaps[i, :] == gt_max_overlaps[i]) & \
+                         responsible_flags.type(torch.bool)
+                    assigned_gt_inds[max_iou_inds] = i + 1
+                elif responsible_flags[gt_argmax_overlaps[i]]:
+                    assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1
+
+        # assign labels of positive anchors
+        assigned_labels = assigned_gt_inds.new_full((num_priors, ), -1)
+        pos_inds = torch.nonzero(
+            assigned_gt_inds > 0, as_tuple=False).squeeze()
+        if pos_inds.numel() > 0:
+            assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] -
+                                                  1]
+
+        return AssignResult(
+            num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
diff --git a/mmde/mmdet/models/task_modules/assigners/hungarian_assigner.py b/mmde/mmdet/models/task_modules/assigners/hungarian_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6745a36cdc713c74f801f62dae0d8fe3d03828f
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/hungarian_assigner.py
@@ -0,0 +1,145 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Union
+
+import torch
+from mmengine import ConfigDict
+from mmengine.structures import InstanceData
+from scipy.optimize import linear_sum_assignment
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@TASK_UTILS.register_module()
+class HungarianAssigner(BaseAssigner):
+    """Computes one-to-one matching between predictions and ground truth.
+
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of some components.
+    For DETR the costs are weighted sum of classification cost, regression L1
+    cost and regression iou cost. The targets don't include the no_object, so
+    generally there are more predictions than targets. After the one-to-one
+    matching, the un-matched are treated as backgrounds. Thus each query
+    prediction will be assigned with `0` or a positive integer indicating the
+    ground truth index:
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        match_costs (:obj:`ConfigDict` or dict or \
+            List[Union[:obj:`ConfigDict`, dict]]): Match cost configs.
+    """
+
+    def __init__(
+        self, match_costs: Union[List[Union[dict, ConfigDict]], dict,
+                                 ConfigDict]
+    ) -> None:
+
+        if isinstance(match_costs, dict):
+            match_costs = [match_costs]
+        elif isinstance(match_costs, list):
+            assert len(match_costs) > 0, \
+                'match_costs must not be a empty list.'
+
+        self.match_costs = [
+            TASK_UTILS.build(match_cost) for match_cost in match_costs
+        ]
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               img_meta: Optional[dict] = None,
+               **kwargs) -> AssignResult:
+        """Computes one-to-one matching based on the weighted costs.
+
+        This method assign each query prediction to a ground truth or
+        background. The `assigned_gt_inds` with -1 means don't care,
+        0 means negative sample, and positive number is the index (1-based)
+        of assigned gt.
+        The assignment is done in the following steps, the order matters.
+
+        1. assign every prediction to -1
+        2. compute the weighted costs
+        3. do Hungarian matching on CPU based on the costs
+        4. assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places. It may includes ``masks``, with shape
+                (n, h, w) or (n, l).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                ``labels``, with shape (k, ) and ``masks``, with shape
+                (k, h, w) or (k, l).
+            img_meta (dict): Image information.
+
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        assert isinstance(gt_instances.labels, Tensor)
+        num_gts, num_preds = len(gt_instances), len(pred_instances)
+        gt_labels = gt_instances.labels
+        device = gt_labels.device
+
+        # 1. assign -1 by default
+        assigned_gt_inds = torch.full((num_preds, ),
+                                      -1,
+                                      dtype=torch.long,
+                                      device=device)
+        assigned_labels = torch.full((num_preds, ),
+                                     -1,
+                                     dtype=torch.long,
+                                     device=device)
+
+        if num_gts == 0 or num_preds == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts=num_gts,
+                gt_inds=assigned_gt_inds,
+                max_overlaps=None,
+                labels=assigned_labels)
+
+        # 2. compute weighted cost
+        cost_list = []
+        for match_cost in self.match_costs:
+            cost = match_cost(
+                pred_instances=pred_instances,
+                gt_instances=gt_instances,
+                img_meta=img_meta)
+            cost_list.append(cost)
+        cost = torch.stack(cost_list).sum(dim=0)
+
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = torch.from_numpy(matched_row_inds).to(device)
+        matched_col_inds = torch.from_numpy(matched_col_inds).to(device)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+        return AssignResult(
+            num_gts=num_gts,
+            gt_inds=assigned_gt_inds,
+            max_overlaps=None,
+            labels=assigned_labels)
diff --git a/mmde/mmdet/models/task_modules/assigners/iou2d_calculator.py b/mmde/mmdet/models/task_modules/assigners/iou2d_calculator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6daa94feb46ac2f188df41c7be59ffdc3905e58
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/iou2d_calculator.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import bbox_overlaps, get_box_tensor
+
+
+def cast_tensor_type(x, scale=1., dtype=None):
+    if dtype == 'fp16':
+        # scale is for preventing overflows
+        x = (x / scale).half()
+    return x
+
+
+@TASK_UTILS.register_module()
+class BboxOverlaps2D:
+    """2D Overlaps (e.g. IoUs, GIoUs) Calculator."""
+
+    def __init__(self, scale=1., dtype=None):
+        self.scale = scale
+        self.dtype = dtype
+
+    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
+        """Calculate IoU between 2D bboxes.
+
+        Args:
+            bboxes1 (Tensor or :obj:`BaseBoxes`): bboxes have shape (m, 4)
+                in <x1, y1, x2, y2> format, or shape (m, 5) in <x1, y1, x2,
+                y2, score> format.
+            bboxes2 (Tensor or :obj:`BaseBoxes`): bboxes have shape (m, 4)
+                in <x1, y1, x2, y2> format, shape (m, 5) in <x1, y1, x2, y2,
+                score> format, or be empty. If ``is_aligned `` is ``True``,
+                then m and n must be equal.
+            mode (str): "iou" (intersection over union), "iof" (intersection
+                over foreground), or "giou" (generalized intersection over
+                union).
+            is_aligned (bool, optional): If True, then m and n must be equal.
+                Default False.
+
+        Returns:
+            Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
+        """
+        bboxes1 = get_box_tensor(bboxes1)
+        bboxes2 = get_box_tensor(bboxes2)
+        assert bboxes1.size(-1) in [0, 4, 5]
+        assert bboxes2.size(-1) in [0, 4, 5]
+        if bboxes2.size(-1) == 5:
+            bboxes2 = bboxes2[..., :4]
+        if bboxes1.size(-1) == 5:
+            bboxes1 = bboxes1[..., :4]
+
+        if self.dtype == 'fp16':
+            # change tensor type to save cpu and cuda memory and keep speed
+            bboxes1 = cast_tensor_type(bboxes1, self.scale, self.dtype)
+            bboxes2 = cast_tensor_type(bboxes2, self.scale, self.dtype)
+            overlaps = bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+            if not overlaps.is_cuda and overlaps.dtype == torch.float16:
+                # resume cpu float32
+                overlaps = overlaps.float()
+            return overlaps
+
+        return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+
+    def __repr__(self):
+        """str: a string describing the module"""
+        repr_str = self.__class__.__name__ + f'(' \
+            f'scale={self.scale}, dtype={self.dtype})'
+        return repr_str
+
+
+@TASK_UTILS.register_module()
+class BboxOverlaps2D_GLIP(BboxOverlaps2D):
+
+    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
+        TO_REMOVE = 1
+        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + TO_REMOVE) * (
+            bboxes1[:, 3] - bboxes1[:, 1] + TO_REMOVE)
+        area2 = (bboxes2[:, 2] - bboxes2[:, 0] + TO_REMOVE) * (
+            bboxes2[:, 3] - bboxes2[:, 1] + TO_REMOVE)
+
+        lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [N,M,2]
+        rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [N,M,2]
+
+        wh = (rb - lt + TO_REMOVE).clamp(min=0)  # [N,M,2]
+        inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
+
+        iou = inter / (area1[:, None] + area2 - inter)
+        return iou
diff --git a/mmde/mmdet/models/task_modules/assigners/match_cost.py b/mmde/mmdet/models/task_modules/assigners/match_cost.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fc62f01f29138cba31ef2b41254f497351fe0d0
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/match_cost.py
@@ -0,0 +1,525 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import abstractmethod
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import bbox_overlaps, bbox_xyxy_to_cxcywh
+
+
+class BaseMatchCost:
+    """Base match cost class.
+
+    Args:
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self, weight: Union[float, int] = 1.) -> None:
+        self.weight = weight
+
+    @abstractmethod
+    def __call__(self,
+                 pred_instances: InstanceData,
+                 gt_instances: InstanceData,
+                 img_meta: Optional[dict] = None,
+                 **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            img_meta (dict, optional): Image information.
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        pass
+
+
+@TASK_UTILS.register_module()
+class BBoxL1Cost(BaseMatchCost):
+    """BBoxL1Cost.
+
+    Note: ``bboxes`` in ``InstanceData`` passed in is of format 'xyxy'
+    and its coordinates are unnormalized.
+
+    Args:
+        box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN.
+            Defaults to 'xyxy'.
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+
+    Examples:
+        >>> from mmdet.models.task_modules.assigners.
+        ... match_costs.match_cost import BBoxL1Cost
+        >>> import torch
+        >>> self = BBoxL1Cost()
+        >>> bbox_pred = torch.rand(1, 4)
+        >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+        >>> factor = torch.tensor([10, 8, 10, 8])
+        >>> self(bbox_pred, gt_bboxes, factor)
+        tensor([[1.6172, 1.6422]])
+    """
+
+    def __init__(self,
+                 box_format: str = 'xyxy',
+                 weight: Union[float, int] = 1.) -> None:
+        super().__init__(weight=weight)
+        assert box_format in ['xyxy', 'xywh']
+        self.box_format = box_format
+
+    def __call__(self,
+                 pred_instances: InstanceData,
+                 gt_instances: InstanceData,
+                 img_meta: Optional[dict] = None,
+                 **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): ``bboxes`` inside is
+                predicted boxes with unnormalized coordinate
+                (x, y, x, y).
+            gt_instances (:obj:`InstanceData`): ``bboxes`` inside is gt
+                bboxes with unnormalized coordinate (x, y, x, y).
+            img_meta (Optional[dict]): Image information. Defaults to None.
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        pred_bboxes = pred_instances.bboxes
+        gt_bboxes = gt_instances.bboxes
+
+        # convert box format
+        if self.box_format == 'xywh':
+            gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes)
+            pred_bboxes = bbox_xyxy_to_cxcywh(pred_bboxes)
+
+        # normalized
+        img_h, img_w = img_meta['img_shape']
+        factor = gt_bboxes.new_tensor([img_w, img_h, img_w,
+                                       img_h]).unsqueeze(0)
+        gt_bboxes = gt_bboxes / factor
+        pred_bboxes = pred_bboxes / factor
+
+        bbox_cost = torch.cdist(pred_bboxes, gt_bboxes, p=1)
+        return bbox_cost * self.weight
+
+
+@TASK_UTILS.register_module()
+class IoUCost(BaseMatchCost):
+    """IoUCost.
+
+    Note: ``bboxes`` in ``InstanceData`` passed in is of format 'xyxy'
+    and its coordinates are unnormalized.
+
+    Args:
+        iou_mode (str): iou mode such as 'iou', 'giou'. Defaults to 'giou'.
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+
+    Examples:
+        >>> from mmdet.models.task_modules.assigners.
+        ... match_costs.match_cost import IoUCost
+        >>> import torch
+        >>> self = IoUCost()
+        >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]])
+        >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+        >>> self(bboxes, gt_bboxes)
+        tensor([[-0.1250,  0.1667],
+            [ 0.1667, -0.5000]])
+    """
+
+    def __init__(self, iou_mode: str = 'giou', weight: Union[float, int] = 1.):
+        super().__init__(weight=weight)
+        self.iou_mode = iou_mode
+
+    def __call__(self,
+                 pred_instances: InstanceData,
+                 gt_instances: InstanceData,
+                 img_meta: Optional[dict] = None,
+                 **kwargs):
+        """Compute match cost.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): ``bboxes`` inside is
+                predicted boxes with unnormalized coordinate
+                (x, y, x, y).
+            gt_instances (:obj:`InstanceData`): ``bboxes`` inside is gt
+                bboxes with unnormalized coordinate (x, y, x, y).
+            img_meta (Optional[dict]): Image information. Defaults to None.
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        pred_bboxes = pred_instances.bboxes
+        gt_bboxes = gt_instances.bboxes
+
+        # avoid fp16 overflow
+        if pred_bboxes.dtype == torch.float16:
+            fp16 = True
+            pred_bboxes = pred_bboxes.to(torch.float32)
+        else:
+            fp16 = False
+
+        overlaps = bbox_overlaps(
+            pred_bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False)
+
+        if fp16:
+            overlaps = overlaps.to(torch.float16)
+
+        # The 1 is a constant that doesn't change the matching, so omitted.
+        iou_cost = -overlaps
+        return iou_cost * self.weight
+
+
+@TASK_UTILS.register_module()
+class ClassificationCost(BaseMatchCost):
+    """ClsSoftmaxCost.
+
+    Args:
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+
+    Examples:
+        >>> from mmdet.models.task_modules.assigners.
+        ...  match_costs.match_cost import ClassificationCost
+        >>> import torch
+        >>> self = ClassificationCost()
+        >>> cls_pred = torch.rand(4, 3)
+        >>> gt_labels = torch.tensor([0, 1, 2])
+        >>> factor = torch.tensor([10, 8, 10, 8])
+        >>> self(cls_pred, gt_labels)
+        tensor([[-0.3430, -0.3525, -0.3045],
+            [-0.3077, -0.2931, -0.3992],
+            [-0.3664, -0.3455, -0.2881],
+            [-0.3343, -0.2701, -0.3956]])
+    """
+
+    def __init__(self, weight: Union[float, int] = 1) -> None:
+        super().__init__(weight=weight)
+
+    def __call__(self,
+                 pred_instances: InstanceData,
+                 gt_instances: InstanceData,
+                 img_meta: Optional[dict] = None,
+                 **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): ``scores`` inside is
+                predicted classification logits, of shape
+                (num_queries, num_class).
+            gt_instances (:obj:`InstanceData`): ``labels`` inside should have
+                shape (num_gt, ).
+            img_meta (Optional[dict]): _description_. Defaults to None.
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        pred_scores = pred_instances.scores
+        gt_labels = gt_instances.labels
+
+        pred_scores = pred_scores.softmax(-1)
+        cls_cost = -pred_scores[:, gt_labels]
+
+        return cls_cost * self.weight
+
+
+@TASK_UTILS.register_module()
+class FocalLossCost(BaseMatchCost):
+    """FocalLossCost.
+
+    Args:
+        alpha (Union[float, int]): focal_loss alpha. Defaults to 0.25.
+        gamma (Union[float, int]): focal_loss gamma. Defaults to 2.
+        eps (float): Defaults to 1e-12.
+        binary_input (bool): Whether the input is binary. Currently,
+            binary_input = True is for masks input, binary_input = False
+            is for label input. Defaults to False.
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self,
+                 alpha: Union[float, int] = 0.25,
+                 gamma: Union[float, int] = 2,
+                 eps: float = 1e-12,
+                 binary_input: bool = False,
+                 weight: Union[float, int] = 1.) -> None:
+        super().__init__(weight=weight)
+        self.alpha = alpha
+        self.gamma = gamma
+        self.eps = eps
+        self.binary_input = binary_input
+
+    def _focal_loss_cost(self, cls_pred: Tensor, gt_labels: Tensor) -> Tensor:
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                (num_queries, num_class).
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+        Returns:
+            torch.Tensor: cls_cost value with weight
+        """
+        cls_pred = cls_pred.sigmoid()
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+
+        cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels]
+        return cls_cost * self.weight
+
+    def _mask_focal_loss_cost(self, cls_pred, gt_labels) -> Tensor:
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits.
+                in shape (num_queries, d1, ..., dn), dtype=torch.float32.
+            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
+                dtype=torch.long. Labels should be binary.
+
+        Returns:
+            Tensor: Focal cost matrix with weight in shape\
+                (num_queries, num_gt).
+        """
+        cls_pred = cls_pred.flatten(1)
+        gt_labels = gt_labels.flatten(1).float()
+        n = cls_pred.shape[1]
+        cls_pred = cls_pred.sigmoid()
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+
+        cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \
+            torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
+        return cls_cost / n * self.weight
+
+    def __call__(self,
+                 pred_instances: InstanceData,
+                 gt_instances: InstanceData,
+                 img_meta: Optional[dict] = None,
+                 **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Predicted instances which
+                must contain ``scores`` or ``masks``.
+            gt_instances (:obj:`InstanceData`): Ground truth which must contain
+                ``labels`` or ``mask``.
+            img_meta (Optional[dict]): Image information. Defaults to None.
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        if self.binary_input:
+            pred_masks = pred_instances.masks
+            gt_masks = gt_instances.masks
+            return self._mask_focal_loss_cost(pred_masks, gt_masks)
+        else:
+            pred_scores = pred_instances.scores
+            gt_labels = gt_instances.labels
+            return self._focal_loss_cost(pred_scores, gt_labels)
+
+
+@TASK_UTILS.register_module()
+class BinaryFocalLossCost(FocalLossCost):
+
+    def _focal_loss_cost(self, cls_pred: Tensor, gt_labels: Tensor) -> Tensor:
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                (num_queries, num_class).
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+        Returns:
+            torch.Tensor: cls_cost value with weight
+        """
+        cls_pred = cls_pred.flatten(1)
+        gt_labels = gt_labels.flatten(1).float()
+        cls_pred = cls_pred.sigmoid()
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+
+        cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \
+            torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
+        return cls_cost * self.weight
+
+    def __call__(self,
+                 pred_instances: InstanceData,
+                 gt_instances: InstanceData,
+                 img_meta: Optional[dict] = None,
+                 **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Predicted instances which
+                must contain ``scores`` or ``masks``.
+            gt_instances (:obj:`InstanceData`): Ground truth which must contain
+                ``labels`` or ``mask``.
+            img_meta (Optional[dict]): Image information. Defaults to None.
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        # gt_instances.text_token_mask is a repeated tensor of the same length
+        # of instances. Only gt_instances.text_token_mask[0] is useful
+        text_token_mask = torch.nonzero(
+            gt_instances.text_token_mask[0]).squeeze(-1)
+        pred_scores = pred_instances.scores[:, text_token_mask]
+        gt_labels = gt_instances.positive_maps[:, text_token_mask]
+        return self._focal_loss_cost(pred_scores, gt_labels)
+
+
+@TASK_UTILS.register_module()
+class DiceCost(BaseMatchCost):
+    """Cost of mask assignments based on dice losses.
+
+    Args:
+        pred_act (bool): Whether to apply sigmoid to mask_pred.
+            Defaults to False.
+        eps (float): Defaults to 1e-3.
+        naive_dice (bool): If True, use the naive dice loss
+            in which the power of the number in the denominator is
+            the first power. If False, use the second power that
+            is adopted by K-Net and SOLO. Defaults to True.
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self,
+                 pred_act: bool = False,
+                 eps: float = 1e-3,
+                 naive_dice: bool = True,
+                 weight: Union[float, int] = 1.) -> None:
+        super().__init__(weight=weight)
+        self.pred_act = pred_act
+        self.eps = eps
+        self.naive_dice = naive_dice
+
+    def _binary_mask_dice_loss(self, mask_preds: Tensor,
+                               gt_masks: Tensor) -> Tensor:
+        """
+        Args:
+            mask_preds (Tensor): Mask prediction in shape (num_queries, *).
+            gt_masks (Tensor): Ground truth in shape (num_gt, *)
+                store 0 or 1, 0 for negative class and 1 for
+                positive class.
+
+        Returns:
+            Tensor: Dice cost matrix in shape (num_queries, num_gt).
+        """
+        mask_preds = mask_preds.flatten(1)
+        gt_masks = gt_masks.flatten(1).float()
+        numerator = 2 * torch.einsum('nc,mc->nm', mask_preds, gt_masks)
+        if self.naive_dice:
+            denominator = mask_preds.sum(-1)[:, None] + \
+                          gt_masks.sum(-1)[None, :]
+        else:
+            denominator = mask_preds.pow(2).sum(1)[:, None] + \
+                          gt_masks.pow(2).sum(1)[None, :]
+        loss = 1 - (numerator + self.eps) / (denominator + self.eps)
+        return loss
+
+    def __call__(self,
+                 pred_instances: InstanceData,
+                 gt_instances: InstanceData,
+                 img_meta: Optional[dict] = None,
+                 **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Predicted instances which
+                must contain ``masks``.
+            gt_instances (:obj:`InstanceData`): Ground truth which must contain
+                ``mask``.
+            img_meta (Optional[dict]): Image information. Defaults to None.
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        pred_masks = pred_instances.masks
+        gt_masks = gt_instances.masks
+
+        if self.pred_act:
+            pred_masks = pred_masks.sigmoid()
+        dice_cost = self._binary_mask_dice_loss(pred_masks, gt_masks)
+        return dice_cost * self.weight
+
+
+@TASK_UTILS.register_module()
+class CrossEntropyLossCost(BaseMatchCost):
+    """CrossEntropyLossCost.
+
+    Args:
+        use_sigmoid (bool): Whether the prediction uses sigmoid
+                of softmax. Defaults to True.
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self,
+                 use_sigmoid: bool = True,
+                 weight: Union[float, int] = 1.) -> None:
+        super().__init__(weight=weight)
+        self.use_sigmoid = use_sigmoid
+
+    def _binary_cross_entropy(self, cls_pred: Tensor,
+                              gt_labels: Tensor) -> Tensor:
+        """
+        Args:
+            cls_pred (Tensor): The prediction with shape (num_queries, 1, *) or
+                (num_queries, *).
+            gt_labels (Tensor): The learning label of prediction with
+                shape (num_gt, *).
+
+        Returns:
+            Tensor: Cross entropy cost matrix in shape (num_queries, num_gt).
+        """
+        cls_pred = cls_pred.flatten(1).float()
+        gt_labels = gt_labels.flatten(1).float()
+        n = cls_pred.shape[1]
+        pos = F.binary_cross_entropy_with_logits(
+            cls_pred, torch.ones_like(cls_pred), reduction='none')
+        neg = F.binary_cross_entropy_with_logits(
+            cls_pred, torch.zeros_like(cls_pred), reduction='none')
+        cls_cost = torch.einsum('nc,mc->nm', pos, gt_labels) + \
+            torch.einsum('nc,mc->nm', neg, 1 - gt_labels)
+        cls_cost = cls_cost / n
+
+        return cls_cost
+
+    def __call__(self,
+                 pred_instances: InstanceData,
+                 gt_instances: InstanceData,
+                 img_meta: Optional[dict] = None,
+                 **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Predicted instances which
+                must contain ``scores`` or ``masks``.
+            gt_instances (:obj:`InstanceData`): Ground truth which must contain
+                ``labels`` or ``masks``.
+            img_meta (Optional[dict]): Image information. Defaults to None.
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        pred_masks = pred_instances.masks
+        gt_masks = gt_instances.masks
+        if self.use_sigmoid:
+            cls_cost = self._binary_cross_entropy(pred_masks, gt_masks)
+        else:
+            raise NotImplementedError
+
+        return cls_cost * self.weight
diff --git a/mmde/mmdet/models/task_modules/assigners/max_iou_assigner.py b/mmde/mmdet/models/task_modules/assigners/max_iou_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..71da54429ae0526bf52277bc3b1d24630acceaed
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/max_iou_assigner.py
@@ -0,0 +1,325 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Optional, Union
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+def _perm_box(bboxes,
+              iou_calculator,
+              iou_thr=0.97,
+              perm_range=0.01,
+              counter=0,
+              max_iter=5):
+    """Compute the permuted bboxes.
+
+    Args:
+        bboxes (Tensor): Shape (n, 4) for , "xyxy" format.
+        iou_calculator (obj): Overlaps Calculator.
+        iou_thr (float): The permuted bboxes should have IoU > iou_thr.
+        perm_range (float): The scale of permutation.
+        counter (int): Counter of permutation iteration.
+        max_iter (int): The max iterations of permutation.
+    Returns:
+        Tensor: The permuted bboxes.
+    """
+    ori_bboxes = copy.deepcopy(bboxes)
+    is_valid = True
+    N = bboxes.size(0)
+    perm_factor = bboxes.new_empty(N, 4).uniform_(1 - perm_range,
+                                                  1 + perm_range)
+    bboxes *= perm_factor
+    new_wh = bboxes[:, 2:] - bboxes[:, :2]
+    if (new_wh <= 0).any():
+        is_valid = False
+    iou = iou_calculator(ori_bboxes.unique(dim=0), bboxes)
+    if (iou < iou_thr).any():
+        is_valid = False
+    if not is_valid and counter < max_iter:
+        return _perm_box(
+            ori_bboxes,
+            iou_calculator,
+            perm_range=max(perm_range - counter * 0.001, 1e-3),
+            counter=counter + 1)
+    return bboxes
+
+
+def perm_repeat_bboxes(bboxes, iou_calculator=None, perm_repeat_cfg=None):
+    """Permute the repeated bboxes.
+
+    Args:
+        bboxes (Tensor): Shape (n, 4) for , "xyxy" format.
+        iou_calculator (obj): Overlaps Calculator.
+        perm_repeat_cfg (Dict): Config of permutation.
+    Returns:
+        Tensor: Bboxes after permuted repeated bboxes.
+    """
+    assert isinstance(bboxes, torch.Tensor)
+    if iou_calculator is None:
+        import torchvision
+        iou_calculator = torchvision.ops.box_iou
+    bboxes = copy.deepcopy(bboxes)
+    unique_bboxes = bboxes.unique(dim=0)
+    iou_thr = perm_repeat_cfg.get('iou_thr', 0.97)
+    perm_range = perm_repeat_cfg.get('perm_range', 0.01)
+    for box in unique_bboxes:
+        inds = (bboxes == box).sum(-1).float() == 4
+        if inds.float().sum().item() == 1:
+            continue
+        bboxes[inds] = _perm_box(
+            bboxes[inds],
+            iou_calculator,
+            iou_thr=iou_thr,
+            perm_range=perm_range,
+            counter=0)
+    return bboxes
+
+
+@TASK_UTILS.register_module()
+class MaxIoUAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposals will be assigned with `-1`, or a semi-positive integer
+    indicating the ground truth index.
+
+    - -1: negative sample, no assigned gt
+    - semi-positive integer: positive sample, index (0-based) of assigned gt
+
+    Args:
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum iou for a bbox to be considered as a
+            positive bbox. Positive samples can have smaller IoU than
+            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
+            `min_pos_iou` is set to avoid assigning bboxes that have extremely
+            small iou with GT as positive samples. It brings about 0.3 mAP
+            improvements in 1x schedule but does not affect the performance of
+            3x schedule. More comparisons can be found in
+            `PR #7464 <https://github.com/open-mmlab/mmdetection/pull/7464>`_.
+        gt_max_assign_all (bool): Whether to assign all bboxes with the same
+            highest overlap with some gt to that gt.
+        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
+            `gt_bboxes_ignore` is specified). Negative values mean not
+            ignoring any bboxes.
+        ignore_wrt_candidates (bool): Whether to compute the iof between
+            `bboxes` and `gt_bboxes_ignore`, or the contrary.
+        match_low_quality (bool): Whether to allow low quality matches. This is
+            usually allowed for RPN and single stage detectors, but not allowed
+            in the second stage. Details are demonstrated in Step 4.
+        gpu_assign_thr (int): The upper bound of the number of GT for GPU
+            assign. When the number of gt is above this threshold, will assign
+            on CPU device. Negative values mean not assign on CPU.
+        iou_calculator (dict): Config of overlaps Calculator.
+        perm_repeat_gt_cfg (dict): Config of permute repeated gt bboxes.
+    """
+
+    def __init__(self,
+                 pos_iou_thr: float,
+                 neg_iou_thr: Union[float, tuple],
+                 min_pos_iou: float = .0,
+                 gt_max_assign_all: bool = True,
+                 ignore_iof_thr: float = -1,
+                 ignore_wrt_candidates: bool = True,
+                 match_low_quality: bool = True,
+                 gpu_assign_thr: float = -1,
+                 iou_calculator: dict = dict(type='BboxOverlaps2D'),
+                 perm_repeat_gt_cfg=None):
+        self.pos_iou_thr = pos_iou_thr
+        self.neg_iou_thr = neg_iou_thr
+        self.min_pos_iou = min_pos_iou
+        self.gt_max_assign_all = gt_max_assign_all
+        self.ignore_iof_thr = ignore_iof_thr
+        self.ignore_wrt_candidates = ignore_wrt_candidates
+        self.gpu_assign_thr = gpu_assign_thr
+        self.match_low_quality = match_low_quality
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+        self.perm_repeat_gt_cfg = perm_repeat_gt_cfg
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs) -> AssignResult:
+        """Assign gt to bboxes.
+
+        This method assign a gt bbox to every bbox (proposal/anchor), each bbox
+        will be assigned with -1, or a semi-positive number. -1 means negative
+        sample, semi-positive number is the index (0-based) of assigned gt.
+        The assignment is done in following steps, the order matters.
+
+        1. assign every bbox to the background
+        2. assign proposals whose iou with all gts < neg_iou_thr to 0
+        3. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
+           assign it to that bbox
+        4. for each gt bbox, assign its nearest proposals (may be more than
+           one) to itself
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+
+        Example:
+            >>> from mmengine.structures import InstanceData
+            >>> self = MaxIoUAssigner(0.5, 0.5)
+            >>> pred_instances = InstanceData()
+            >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10],
+            ...                                      [10, 10, 20, 20]])
+            >>> gt_instances = InstanceData()
+            >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 9]])
+            >>> gt_instances.labels = torch.Tensor([0])
+            >>> assign_result = self.assign(pred_instances, gt_instances)
+            >>> expected_gt_inds = torch.LongTensor([1, 0])
+            >>> assert torch.all(assign_result.gt_inds == expected_gt_inds)
+        """
+        gt_bboxes = gt_instances.bboxes
+        priors = pred_instances.priors
+        gt_labels = gt_instances.labels
+        if gt_instances_ignore is not None:
+            gt_bboxes_ignore = gt_instances_ignore.bboxes
+        else:
+            gt_bboxes_ignore = None
+
+        assign_on_cpu = True if (self.gpu_assign_thr > 0) and (
+            gt_bboxes.shape[0] > self.gpu_assign_thr) else False
+        # compute overlap and assign gt on CPU when number of GT is large
+        if assign_on_cpu:
+            device = priors.device
+            priors = priors.cpu()
+            gt_bboxes = gt_bboxes.cpu()
+            gt_labels = gt_labels.cpu()
+            if gt_bboxes_ignore is not None:
+                gt_bboxes_ignore = gt_bboxes_ignore.cpu()
+
+        if self.perm_repeat_gt_cfg is not None and priors.numel() > 0:
+            gt_bboxes_unique = perm_repeat_bboxes(gt_bboxes,
+                                                  self.iou_calculator,
+                                                  self.perm_repeat_gt_cfg)
+        else:
+            gt_bboxes_unique = gt_bboxes
+        overlaps = self.iou_calculator(gt_bboxes_unique, priors)
+
+        if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
+                and gt_bboxes_ignore.numel() > 0 and priors.numel() > 0):
+            if self.ignore_wrt_candidates:
+                ignore_overlaps = self.iou_calculator(
+                    priors, gt_bboxes_ignore, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
+            else:
+                ignore_overlaps = self.iou_calculator(
+                    gt_bboxes_ignore, priors, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=0)
+            overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1
+
+        assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
+        if assign_on_cpu:
+            assign_result.gt_inds = assign_result.gt_inds.to(device)
+            assign_result.max_overlaps = assign_result.max_overlaps.to(device)
+            if assign_result.labels is not None:
+                assign_result.labels = assign_result.labels.to(device)
+        return assign_result
+
+    def assign_wrt_overlaps(self, overlaps: Tensor,
+                            gt_labels: Tensor) -> AssignResult:
+        """Assign w.r.t. the overlaps of priors with gts.
+
+        Args:
+            overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes,
+                shape(k, n).
+            gt_labels (Tensor): Labels of k gt_bboxes, shape (k, ).
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        num_gts, num_bboxes = overlaps.size(0), overlaps.size(1)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = overlaps.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = overlaps.new_zeros((num_bboxes, ))
+            assigned_labels = overlaps.new_full((num_bboxes, ),
+                                                -1,
+                                                dtype=torch.long)
+            if num_gts == 0:
+                # No truth, assign everything to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts=num_gts,
+                gt_inds=assigned_gt_inds,
+                max_overlaps=max_overlaps,
+                labels=assigned_labels)
+
+        # for each anchor, which gt best overlaps with it
+        # for each anchor, the max iou of all gts
+        max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+        # for each gt, which anchor best overlaps with it
+        # for each gt, the max iou of all proposals
+        gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)
+
+        # 2. assign negative: below
+        # the negative inds are set to be 0
+        if isinstance(self.neg_iou_thr, float):
+            assigned_gt_inds[(max_overlaps >= 0)
+                             & (max_overlaps < self.neg_iou_thr)] = 0
+        elif isinstance(self.neg_iou_thr, tuple):
+            assert len(self.neg_iou_thr) == 2
+            assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0])
+                             & (max_overlaps < self.neg_iou_thr[1])] = 0
+
+        # 3. assign positive: above positive IoU threshold
+        pos_inds = max_overlaps >= self.pos_iou_thr
+        assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1
+
+        if self.match_low_quality:
+            # Low-quality matching will overwrite the assigned_gt_inds assigned
+            # in Step 3. Thus, the assigned gt might not be the best one for
+            # prediction.
+            # For example, if bbox A has 0.9 and 0.8 iou with GT bbox 1 & 2,
+            # bbox 1 will be assigned as the best target for bbox A in step 3.
+            # However, if GT bbox 2's gt_argmax_overlaps = A, bbox A's
+            # assigned_gt_inds will be overwritten to be bbox 2.
+            # This might be the reason that it is not used in ROI Heads.
+            for i in range(num_gts):
+                if gt_max_overlaps[i] >= self.min_pos_iou:
+                    if self.gt_max_assign_all:
+                        max_iou_inds = overlaps[i, :] == gt_max_overlaps[i]
+                        assigned_gt_inds[max_iou_inds] = i + 1
+                    else:
+                        assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1
+
+        assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+        pos_inds = torch.nonzero(
+            assigned_gt_inds > 0, as_tuple=False).squeeze()
+        if pos_inds.numel() > 0:
+            assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] -
+                                                  1]
+
+        return AssignResult(
+            num_gts=num_gts,
+            gt_inds=assigned_gt_inds,
+            max_overlaps=max_overlaps,
+            labels=assigned_labels)
diff --git a/mmde/mmdet/models/task_modules/assigners/multi_instance_assigner.py b/mmde/mmdet/models/task_modules/assigners/multi_instance_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ba32afe856b3c2ad03ed89562d080f15b6ccf30
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/multi_instance_assigner.py
@@ -0,0 +1,140 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.registry import TASK_UTILS
+from .assign_result import AssignResult
+from .max_iou_assigner import MaxIoUAssigner
+
+
+@TASK_UTILS.register_module()
+class MultiInstanceAssigner(MaxIoUAssigner):
+    """Assign a corresponding gt bbox or background to each proposal bbox. If
+    we need to use a proposal box to generate multiple predict boxes,
+    `MultiInstanceAssigner` can assign multiple gt to each proposal box.
+
+    Args:
+        num_instance (int): How many bboxes are predicted by each proposal box.
+    """
+
+    def __init__(self, num_instance: int = 2, **kwargs):
+        super().__init__(**kwargs)
+        self.num_instance = num_instance
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs) -> AssignResult:
+        """Assign gt to bboxes.
+
+        This method assign gt bboxes to every bbox (proposal/anchor), each bbox
+        is assigned a set of gts, and the number of gts in this set is defined
+        by `self.num_instance`.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        gt_bboxes = gt_instances.bboxes
+        priors = pred_instances.priors
+        # Set the FG label to 1 and add ignored annotations
+        gt_labels = gt_instances.labels + 1
+        if gt_instances_ignore is not None:
+            gt_bboxes_ignore = gt_instances_ignore.bboxes
+            if hasattr(gt_instances_ignore, 'labels'):
+                gt_labels_ignore = gt_instances_ignore.labels
+            else:
+                gt_labels_ignore = torch.ones_like(gt_bboxes_ignore)[:, 0] * -1
+        else:
+            gt_bboxes_ignore = None
+            gt_labels_ignore = None
+
+        assign_on_cpu = True if (self.gpu_assign_thr > 0) and (
+            gt_bboxes.shape[0] > self.gpu_assign_thr) else False
+        # compute overlap and assign gt on CPU when number of GT is large
+        if assign_on_cpu:
+            device = priors.device
+            priors = priors.cpu()
+            gt_bboxes = gt_bboxes.cpu()
+            gt_labels = gt_labels.cpu()
+            if gt_bboxes_ignore is not None:
+                gt_bboxes_ignore = gt_bboxes_ignore.cpu()
+                gt_labels_ignore = gt_labels_ignore.cpu()
+
+        if gt_bboxes_ignore is not None:
+            all_bboxes = torch.cat([gt_bboxes, gt_bboxes_ignore], dim=0)
+            all_labels = torch.cat([gt_labels, gt_labels_ignore], dim=0)
+        else:
+            all_bboxes = gt_bboxes
+            all_labels = gt_labels
+        all_priors = torch.cat([priors, all_bboxes], dim=0)
+
+        overlaps_normal = self.iou_calculator(
+            all_priors, all_bboxes, mode='iou')
+        overlaps_ignore = self.iou_calculator(
+            all_priors, all_bboxes, mode='iof')
+        gt_ignore_mask = all_labels.eq(-1).repeat(all_priors.shape[0], 1)
+        overlaps_normal = overlaps_normal * ~gt_ignore_mask
+        overlaps_ignore = overlaps_ignore * gt_ignore_mask
+
+        overlaps_normal, overlaps_normal_indices = overlaps_normal.sort(
+            descending=True, dim=1)
+        overlaps_ignore, overlaps_ignore_indices = overlaps_ignore.sort(
+            descending=True, dim=1)
+
+        # select the roi with the higher score
+        max_overlaps_normal = overlaps_normal[:, :self.num_instance].flatten()
+        gt_assignment_normal = overlaps_normal_indices[:, :self.
+                                                       num_instance].flatten()
+        max_overlaps_ignore = overlaps_ignore[:, :self.num_instance].flatten()
+        gt_assignment_ignore = overlaps_ignore_indices[:, :self.
+                                                       num_instance].flatten()
+
+        # ignore or not
+        ignore_assign_mask = (max_overlaps_normal < self.pos_iou_thr) * (
+            max_overlaps_ignore > max_overlaps_normal)
+        overlaps = (max_overlaps_normal * ~ignore_assign_mask) + (
+            max_overlaps_ignore * ignore_assign_mask)
+        gt_assignment = (gt_assignment_normal * ~ignore_assign_mask) + (
+            gt_assignment_ignore * ignore_assign_mask)
+
+        assigned_labels = all_labels[gt_assignment]
+        fg_mask = (overlaps >= self.pos_iou_thr) * (assigned_labels != -1)
+        bg_mask = (overlaps < self.neg_iou_thr) * (overlaps >= 0)
+        assigned_labels[fg_mask] = 1
+        assigned_labels[bg_mask] = 0
+
+        overlaps = overlaps.reshape(-1, self.num_instance)
+        gt_assignment = gt_assignment.reshape(-1, self.num_instance)
+        assigned_labels = assigned_labels.reshape(-1, self.num_instance)
+
+        assign_result = AssignResult(
+            num_gts=all_bboxes.size(0),
+            gt_inds=gt_assignment,
+            max_overlaps=overlaps,
+            labels=assigned_labels)
+
+        if assign_on_cpu:
+            assign_result.gt_inds = assign_result.gt_inds.to(device)
+            assign_result.max_overlaps = assign_result.max_overlaps.to(device)
+            if assign_result.labels is not None:
+                assign_result.labels = assign_result.labels.to(device)
+        return assign_result
diff --git a/mmde/mmdet/models/task_modules/assigners/point_assigner.py b/mmde/mmdet/models/task_modules/assigners/point_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..4da60a490b0022ac76c46db8a34f814bc9da8e2e
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/point_assigner.py
@@ -0,0 +1,155 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.registry import TASK_UTILS
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@TASK_UTILS.register_module()
+class PointAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each point.
+
+    Each proposals will be assigned with `0`, or a positive integer
+    indicating the ground truth index.
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+    """
+
+    def __init__(self, scale: int = 4, pos_num: int = 3) -> None:
+        self.scale = scale
+        self.pos_num = pos_num
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs) -> AssignResult:
+        """Assign gt to points.
+
+        This method assign a gt bbox to every points set, each points set
+        will be assigned with  the background_label (-1), or a label number.
+        -1 is background, and semi-positive number is the index (0-based) of
+        assigned gt.
+        The assignment is done in following steps, the order matters.
+
+        1. assign every points to the background_label (-1)
+        2. A point is assigned to some gt bbox if
+            (i) the point is within the k closest points to the gt bbox
+            (ii) the distance between this point and the gt is smaller than
+                other gt bboxes
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+
+
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        # points to be assigned, shape(n, 3) while last
+        # dimension stands for (x, y, stride).
+        points = pred_instances.priors
+
+        num_points = points.shape[0]
+        num_gts = gt_bboxes.shape[0]
+
+        if num_gts == 0 or num_points == 0:
+            # If no truth assign everything to the background
+            assigned_gt_inds = points.new_full((num_points, ),
+                                               0,
+                                               dtype=torch.long)
+            assigned_labels = points.new_full((num_points, ),
+                                              -1,
+                                              dtype=torch.long)
+            return AssignResult(
+                num_gts=num_gts,
+                gt_inds=assigned_gt_inds,
+                max_overlaps=None,
+                labels=assigned_labels)
+
+        points_xy = points[:, :2]
+        points_stride = points[:, 2]
+        points_lvl = torch.log2(
+            points_stride).int()  # [3...,4...,5...,6...,7...]
+        lvl_min, lvl_max = points_lvl.min(), points_lvl.max()
+
+        # assign gt box
+        gt_bboxes_xy = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2
+        gt_bboxes_wh = (gt_bboxes[:, 2:] - gt_bboxes[:, :2]).clamp(min=1e-6)
+        scale = self.scale
+        gt_bboxes_lvl = ((torch.log2(gt_bboxes_wh[:, 0] / scale) +
+                          torch.log2(gt_bboxes_wh[:, 1] / scale)) / 2).int()
+        gt_bboxes_lvl = torch.clamp(gt_bboxes_lvl, min=lvl_min, max=lvl_max)
+
+        # stores the assigned gt index of each point
+        assigned_gt_inds = points.new_zeros((num_points, ), dtype=torch.long)
+        # stores the assigned gt dist (to this point) of each point
+        assigned_gt_dist = points.new_full((num_points, ), float('inf'))
+        points_range = torch.arange(points.shape[0])
+
+        for idx in range(num_gts):
+            gt_lvl = gt_bboxes_lvl[idx]
+            # get the index of points in this level
+            lvl_idx = gt_lvl == points_lvl
+            points_index = points_range[lvl_idx]
+            # get the points in this level
+            lvl_points = points_xy[lvl_idx, :]
+            # get the center point of gt
+            gt_point = gt_bboxes_xy[[idx], :]
+            # get width and height of gt
+            gt_wh = gt_bboxes_wh[[idx], :]
+            # compute the distance between gt center and
+            #   all points in this level
+            points_gt_dist = ((lvl_points - gt_point) / gt_wh).norm(dim=1)
+            # find the nearest k points to gt center in this level
+            min_dist, min_dist_index = torch.topk(
+                points_gt_dist, self.pos_num, largest=False)
+            # the index of nearest k points to gt center in this level
+            min_dist_points_index = points_index[min_dist_index]
+            # The less_than_recorded_index stores the index
+            #   of min_dist that is less then the assigned_gt_dist. Where
+            #   assigned_gt_dist stores the dist from previous assigned gt
+            #   (if exist) to each point.
+            less_than_recorded_index = min_dist < assigned_gt_dist[
+                min_dist_points_index]
+            # The min_dist_points_index stores the index of points satisfy:
+            #   (1) it is k nearest to current gt center in this level.
+            #   (2) it is closer to current gt center than other gt center.
+            min_dist_points_index = min_dist_points_index[
+                less_than_recorded_index]
+            # assign the result
+            assigned_gt_inds[min_dist_points_index] = idx + 1
+            assigned_gt_dist[min_dist_points_index] = min_dist[
+                less_than_recorded_index]
+
+        assigned_labels = assigned_gt_inds.new_full((num_points, ), -1)
+        pos_inds = torch.nonzero(
+            assigned_gt_inds > 0, as_tuple=False).squeeze()
+        if pos_inds.numel() > 0:
+            assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] -
+                                                  1]
+
+        return AssignResult(
+            num_gts=num_gts,
+            gt_inds=assigned_gt_inds,
+            max_overlaps=None,
+            labels=assigned_labels)
diff --git a/mmde/mmdet/models/task_modules/assigners/region_assigner.py b/mmde/mmdet/models/task_modules/assigners/region_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..df549143086c1195efaf12a2f3e81259da0e6c97
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/region_assigner.py
@@ -0,0 +1,239 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from ..prior_generators import anchor_inside_flags
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+def calc_region(
+        bbox: Tensor,
+        ratio: float,
+        stride: int,
+        featmap_size: Optional[Tuple[int, int]] = None) -> Tuple[Tensor]:
+    """Calculate region of the box defined by the ratio, the ratio is from the
+    center of the box to every edge."""
+    # project bbox on the feature
+    f_bbox = bbox / stride
+    x1 = torch.round((1 - ratio) * f_bbox[0] + ratio * f_bbox[2])
+    y1 = torch.round((1 - ratio) * f_bbox[1] + ratio * f_bbox[3])
+    x2 = torch.round(ratio * f_bbox[0] + (1 - ratio) * f_bbox[2])
+    y2 = torch.round(ratio * f_bbox[1] + (1 - ratio) * f_bbox[3])
+    if featmap_size is not None:
+        x1 = x1.clamp(min=0, max=featmap_size[1])
+        y1 = y1.clamp(min=0, max=featmap_size[0])
+        x2 = x2.clamp(min=0, max=featmap_size[1])
+        y2 = y2.clamp(min=0, max=featmap_size[0])
+    return (x1, y1, x2, y2)
+
+
+def anchor_ctr_inside_region_flags(anchors: Tensor, stride: int,
+                                   region: Tuple[Tensor]) -> Tensor:
+    """Get the flag indicate whether anchor centers are inside regions."""
+    x1, y1, x2, y2 = region
+    f_anchors = anchors / stride
+    x = (f_anchors[:, 0] + f_anchors[:, 2]) * 0.5
+    y = (f_anchors[:, 1] + f_anchors[:, 3]) * 0.5
+    flags = (x >= x1) & (x <= x2) & (y >= y1) & (y <= y2)
+    return flags
+
+
+@TASK_UTILS.register_module()
+class RegionAssigner(BaseAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposals will be assigned with `-1`, `0`, or a positive integer
+    indicating the ground truth index.
+
+    - -1: don't care
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        center_ratio (float): ratio of the region in the center of the bbox to
+            define positive sample.
+        ignore_ratio (float): ratio of the region to define ignore samples.
+    """
+
+    def __init__(self,
+                 center_ratio: float = 0.2,
+                 ignore_ratio: float = 0.5) -> None:
+        self.center_ratio = center_ratio
+        self.ignore_ratio = ignore_ratio
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               img_meta: dict,
+               featmap_sizes: List[Tuple[int, int]],
+               num_level_anchors: List[int],
+               anchor_scale: int,
+               anchor_strides: List[int],
+               gt_instances_ignore: Optional[InstanceData] = None,
+               allowed_border: int = 0) -> AssignResult:
+        """Assign gt to anchors.
+
+        This method assign a gt bbox to every bbox (proposal/anchor), each bbox
+        will be assigned with -1, 0, or a positive number. -1 means don't care,
+        0 means negative sample, positive number is the index (1-based) of
+        assigned gt.
+
+        The assignment is done in following steps, and the order matters.
+
+        1. Assign every anchor to 0 (negative)
+        2. (For each gt_bboxes) Compute ignore flags based on ignore_region
+           then assign -1 to anchors w.r.t. ignore flags
+        3. (For each gt_bboxes) Compute pos flags based on center_region then
+           assign gt_bboxes to anchors w.r.t. pos flags
+        4. (For each gt_bboxes) Compute ignore flags based on adjacent anchor
+           level then assign -1 to anchors w.r.t. ignore flags
+        5. Assign anchor outside of image to -1
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            img_meta (dict): Meta info of image.
+            featmap_sizes (list[tuple[int, int]]): Feature map size each level.
+            num_level_anchors (list[int]): The number of anchors in each level.
+            anchor_scale (int): Scale of the anchor.
+            anchor_strides (list[int]): Stride of the anchor.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+            allowed_border (int, optional): The border to allow the valid
+                anchor. Defaults to 0.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+        if gt_instances_ignore is not None:
+            raise NotImplementedError
+
+        num_gts = len(gt_instances)
+        num_bboxes = len(pred_instances)
+
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        flat_anchors = pred_instances.priors
+        flat_valid_flags = pred_instances.valid_flags
+        mlvl_anchors = torch.split(flat_anchors, num_level_anchors)
+
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = gt_bboxes.new_zeros((num_bboxes, ))
+            assigned_gt_inds = gt_bboxes.new_zeros((num_bboxes, ),
+                                                   dtype=torch.long)
+            assigned_labels = gt_bboxes.new_full((num_bboxes, ),
+                                                 -1,
+                                                 dtype=torch.long)
+            return AssignResult(
+                num_gts=num_gts,
+                gt_inds=assigned_gt_inds,
+                max_overlaps=max_overlaps,
+                labels=assigned_labels)
+
+        num_lvls = len(mlvl_anchors)
+        r1 = (1 - self.center_ratio) / 2
+        r2 = (1 - self.ignore_ratio) / 2
+
+        scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
+                           (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
+        min_anchor_size = scale.new_full(
+            (1, ), float(anchor_scale * anchor_strides[0]))
+        target_lvls = torch.floor(
+            torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
+        target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
+
+        # 1. assign 0 (negative) by default
+        mlvl_assigned_gt_inds = []
+        mlvl_ignore_flags = []
+        for lvl in range(num_lvls):
+            assigned_gt_inds = gt_bboxes.new_full((num_level_anchors[lvl], ),
+                                                  0,
+                                                  dtype=torch.long)
+            ignore_flags = torch.zeros_like(assigned_gt_inds)
+            mlvl_assigned_gt_inds.append(assigned_gt_inds)
+            mlvl_ignore_flags.append(ignore_flags)
+
+        for gt_id in range(num_gts):
+            lvl = target_lvls[gt_id].item()
+            featmap_size = featmap_sizes[lvl]
+            stride = anchor_strides[lvl]
+            anchors = mlvl_anchors[lvl]
+            gt_bbox = gt_bboxes[gt_id, :4]
+
+            # Compute regions
+            ignore_region = calc_region(gt_bbox, r2, stride, featmap_size)
+            ctr_region = calc_region(gt_bbox, r1, stride, featmap_size)
+
+            # 2. Assign -1 to ignore flags
+            ignore_flags = anchor_ctr_inside_region_flags(
+                anchors, stride, ignore_region)
+            mlvl_assigned_gt_inds[lvl][ignore_flags] = -1
+
+            # 3. Assign gt_bboxes to pos flags
+            pos_flags = anchor_ctr_inside_region_flags(anchors, stride,
+                                                       ctr_region)
+            mlvl_assigned_gt_inds[lvl][pos_flags] = gt_id + 1
+
+            # 4. Assign -1 to ignore adjacent lvl
+            if lvl > 0:
+                d_lvl = lvl - 1
+                d_anchors = mlvl_anchors[d_lvl]
+                d_featmap_size = featmap_sizes[d_lvl]
+                d_stride = anchor_strides[d_lvl]
+                d_ignore_region = calc_region(gt_bbox, r2, d_stride,
+                                              d_featmap_size)
+                ignore_flags = anchor_ctr_inside_region_flags(
+                    d_anchors, d_stride, d_ignore_region)
+                mlvl_ignore_flags[d_lvl][ignore_flags] = 1
+            if lvl < num_lvls - 1:
+                u_lvl = lvl + 1
+                u_anchors = mlvl_anchors[u_lvl]
+                u_featmap_size = featmap_sizes[u_lvl]
+                u_stride = anchor_strides[u_lvl]
+                u_ignore_region = calc_region(gt_bbox, r2, u_stride,
+                                              u_featmap_size)
+                ignore_flags = anchor_ctr_inside_region_flags(
+                    u_anchors, u_stride, u_ignore_region)
+                mlvl_ignore_flags[u_lvl][ignore_flags] = 1
+
+        # 4. (cont.) Assign -1 to ignore adjacent lvl
+        for lvl in range(num_lvls):
+            ignore_flags = mlvl_ignore_flags[lvl]
+            mlvl_assigned_gt_inds[lvl][ignore_flags == 1] = -1
+
+        # 5. Assign -1 to anchor outside of image
+        flat_assigned_gt_inds = torch.cat(mlvl_assigned_gt_inds)
+        assert (flat_assigned_gt_inds.shape[0] == flat_anchors.shape[0] ==
+                flat_valid_flags.shape[0])
+        inside_flags = anchor_inside_flags(flat_anchors, flat_valid_flags,
+                                           img_meta['img_shape'],
+                                           allowed_border)
+        outside_flags = ~inside_flags
+        flat_assigned_gt_inds[outside_flags] = -1
+
+        assigned_labels = torch.zeros_like(flat_assigned_gt_inds)
+        pos_flags = flat_assigned_gt_inds > 0
+        assigned_labels[pos_flags] = gt_labels[flat_assigned_gt_inds[pos_flags]
+                                               - 1]
+
+        return AssignResult(
+            num_gts=num_gts,
+            gt_inds=flat_assigned_gt_inds,
+            max_overlaps=None,
+            labels=assigned_labels)
diff --git a/mmde/mmdet/models/task_modules/assigners/sim_ota_assigner.py b/mmde/mmdet/models/task_modules/assigners/sim_ota_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..d54a8b91d132d9bf661267de666bfed7e915a65a
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/sim_ota_assigner.py
@@ -0,0 +1,223 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.utils import ConfigType
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+INF = 100000.0
+EPS = 1.0e-7
+
+
+@TASK_UTILS.register_module()
+class SimOTAAssigner(BaseAssigner):
+    """Computes matching between predictions and ground truth.
+
+    Args:
+        center_radius (float): Ground truth center size
+            to judge whether a prior is in center. Defaults to 2.5.
+        candidate_topk (int): The candidate top-k which used to
+            get top-k ious to calculate dynamic-k. Defaults to 10.
+        iou_weight (float): The scale factor for regression
+            iou cost. Defaults to 3.0.
+        cls_weight (float): The scale factor for classification
+            cost. Defaults to 1.0.
+        iou_calculator (ConfigType): Config of overlaps Calculator.
+            Defaults to dict(type='BboxOverlaps2D').
+    """
+
+    def __init__(self,
+                 center_radius: float = 2.5,
+                 candidate_topk: int = 10,
+                 iou_weight: float = 3.0,
+                 cls_weight: float = 1.0,
+                 iou_calculator: ConfigType = dict(type='BboxOverlaps2D')):
+        self.center_radius = center_radius
+        self.candidate_topk = candidate_topk
+        self.iou_weight = iou_weight
+        self.cls_weight = cls_weight
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs) -> AssignResult:
+        """Assign gt to priors using SimOTA.
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+        Returns:
+            obj:`AssignResult`: The assigned result.
+        """
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        num_gt = gt_bboxes.size(0)
+
+        decoded_bboxes = pred_instances.bboxes
+        pred_scores = pred_instances.scores
+        priors = pred_instances.priors
+        num_bboxes = decoded_bboxes.size(0)
+
+        # assign 0 by default
+        assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ),
+                                                   0,
+                                                   dtype=torch.long)
+        if num_gt == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = decoded_bboxes.new_zeros((num_bboxes, ))
+            assigned_labels = decoded_bboxes.new_full((num_bboxes, ),
+                                                      -1,
+                                                      dtype=torch.long)
+            return AssignResult(
+                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+        valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info(
+            priors, gt_bboxes)
+        valid_decoded_bbox = decoded_bboxes[valid_mask]
+        valid_pred_scores = pred_scores[valid_mask]
+        num_valid = valid_decoded_bbox.size(0)
+        if num_valid == 0:
+            # No valid bboxes, return empty assignment
+            max_overlaps = decoded_bboxes.new_zeros((num_bboxes, ))
+            assigned_labels = decoded_bboxes.new_full((num_bboxes, ),
+                                                      -1,
+                                                      dtype=torch.long)
+            return AssignResult(
+                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+        pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes)
+        iou_cost = -torch.log(pairwise_ious + EPS)
+
+        gt_onehot_label = (
+            F.one_hot(gt_labels.to(torch.int64),
+                      pred_scores.shape[-1]).float().unsqueeze(0).repeat(
+                          num_valid, 1, 1))
+
+        valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1)
+        # disable AMP autocast and calculate BCE with FP32 to avoid overflow
+        with torch.cuda.amp.autocast(enabled=False):
+            cls_cost = (
+                F.binary_cross_entropy(
+                    valid_pred_scores.to(dtype=torch.float32),
+                    gt_onehot_label,
+                    reduction='none',
+                ).sum(-1).to(dtype=valid_pred_scores.dtype))
+
+        cost_matrix = (
+            cls_cost * self.cls_weight + iou_cost * self.iou_weight +
+            (~is_in_boxes_and_center) * INF)
+
+        matched_pred_ious, matched_gt_inds = \
+            self.dynamic_k_matching(
+                cost_matrix, pairwise_ious, num_gt, valid_mask)
+
+        # convert to AssignResult format
+        assigned_gt_inds[valid_mask] = matched_gt_inds + 1
+        assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+        assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long()
+        max_overlaps = assigned_gt_inds.new_full((num_bboxes, ),
+                                                 -INF,
+                                                 dtype=torch.float32)
+        max_overlaps[valid_mask] = matched_pred_ious
+        return AssignResult(
+            num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+    def get_in_gt_and_in_center_info(
+            self, priors: Tensor, gt_bboxes: Tensor) -> Tuple[Tensor, Tensor]:
+        """Get the information of which prior is in gt bboxes and gt center
+        priors."""
+        num_gt = gt_bboxes.size(0)
+
+        repeated_x = priors[:, 0].unsqueeze(1).repeat(1, num_gt)
+        repeated_y = priors[:, 1].unsqueeze(1).repeat(1, num_gt)
+        repeated_stride_x = priors[:, 2].unsqueeze(1).repeat(1, num_gt)
+        repeated_stride_y = priors[:, 3].unsqueeze(1).repeat(1, num_gt)
+
+        # is prior centers in gt bboxes, shape: [n_prior, n_gt]
+        l_ = repeated_x - gt_bboxes[:, 0]
+        t_ = repeated_y - gt_bboxes[:, 1]
+        r_ = gt_bboxes[:, 2] - repeated_x
+        b_ = gt_bboxes[:, 3] - repeated_y
+
+        deltas = torch.stack([l_, t_, r_, b_], dim=1)
+        is_in_gts = deltas.min(dim=1).values > 0
+        is_in_gts_all = is_in_gts.sum(dim=1) > 0
+
+        # is prior centers in gt centers
+        gt_cxs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
+        gt_cys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
+        ct_box_l = gt_cxs - self.center_radius * repeated_stride_x
+        ct_box_t = gt_cys - self.center_radius * repeated_stride_y
+        ct_box_r = gt_cxs + self.center_radius * repeated_stride_x
+        ct_box_b = gt_cys + self.center_radius * repeated_stride_y
+
+        cl_ = repeated_x - ct_box_l
+        ct_ = repeated_y - ct_box_t
+        cr_ = ct_box_r - repeated_x
+        cb_ = ct_box_b - repeated_y
+
+        ct_deltas = torch.stack([cl_, ct_, cr_, cb_], dim=1)
+        is_in_cts = ct_deltas.min(dim=1).values > 0
+        is_in_cts_all = is_in_cts.sum(dim=1) > 0
+
+        # in boxes or in centers, shape: [num_priors]
+        is_in_gts_or_centers = is_in_gts_all | is_in_cts_all
+
+        # both in boxes and centers, shape: [num_fg, num_gt]
+        is_in_boxes_and_centers = (
+            is_in_gts[is_in_gts_or_centers, :]
+            & is_in_cts[is_in_gts_or_centers, :])
+        return is_in_gts_or_centers, is_in_boxes_and_centers
+
+    def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor,
+                           num_gt: int,
+                           valid_mask: Tensor) -> Tuple[Tensor, Tensor]:
+        """Use IoU and matching cost to calculate the dynamic top-k positive
+        targets."""
+        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)
+        # select candidate topk ious for dynamic-k calculation
+        candidate_topk = min(self.candidate_topk, pairwise_ious.size(0))
+        topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0)
+        # calculate dynamic k for each gt
+        dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1)
+        for gt_idx in range(num_gt):
+            _, pos_idx = torch.topk(
+                cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False)
+            matching_matrix[:, gt_idx][pos_idx] = 1
+
+        del topk_ious, dynamic_ks, pos_idx
+
+        prior_match_gt_mask = matching_matrix.sum(1) > 1
+        if prior_match_gt_mask.sum() > 0:
+            cost_min, cost_argmin = torch.min(
+                cost[prior_match_gt_mask, :], dim=1)
+            matching_matrix[prior_match_gt_mask, :] *= 0
+            matching_matrix[prior_match_gt_mask, cost_argmin] = 1
+        # get foreground mask inside box and center prior
+        fg_mask_inboxes = matching_matrix.sum(1) > 0
+        valid_mask[valid_mask.clone()] = fg_mask_inboxes
+
+        matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1)
+        matched_pred_ious = (matching_matrix *
+                             pairwise_ious).sum(1)[fg_mask_inboxes]
+        return matched_pred_ious, matched_gt_inds
diff --git a/mmde/mmdet/models/task_modules/assigners/task_aligned_assigner.py b/mmde/mmdet/models/task_modules/assigners/task_aligned_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..220ea8485933ab3243f6c1e205dbf1b973df08d7
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/task_aligned_assigner.py
@@ -0,0 +1,158 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.registry import TASK_UTILS
+from mmdet.utils import ConfigType
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+INF = 100000000
+
+
+@TASK_UTILS.register_module()
+class TaskAlignedAssigner(BaseAssigner):
+    """Task aligned assigner used in the paper:
+    `TOOD: Task-aligned One-stage Object Detection.
+    <https://arxiv.org/abs/2108.07755>`_.
+
+    Assign a corresponding gt bbox or background to each predicted bbox.
+    Each bbox will be assigned with `0` or a positive integer
+    indicating the ground truth index.
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        topk (int): number of bbox selected in each level
+        iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou
+            calculator. Defaults to ``dict(type='BboxOverlaps2D')``
+    """
+
+    def __init__(self,
+                 topk: int,
+                 iou_calculator: ConfigType = dict(type='BboxOverlaps2D')):
+        assert topk >= 1
+        self.topk = topk
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               alpha: int = 1,
+               beta: int = 6) -> AssignResult:
+        """Assign gt to bboxes.
+
+        The assignment is done in following steps
+
+        1. compute alignment metric between all bbox (bbox of all pyramid
+           levels) and gt
+        2. select top-k bbox as candidates for each gt
+        3. limit the positive sample's center in gt (because the anchor-free
+           detector only can predict positive distance)
+
+
+        Args:
+            pred_instances (:obj:`InstaceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors, points, or bboxes predicted by the model,
+                shape(n, 4).
+            gt_instances (:obj:`InstaceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            gt_instances_ignore (:obj:`InstaceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+            alpha (int): Hyper-parameters related to alignment_metrics.
+                Defaults to 1.
+            beta (int): Hyper-parameters related to alignment_metrics.
+                Defaults to 6.
+
+        Returns:
+            :obj:`TaskAlignedAssignResult`: The assign result.
+        """
+        priors = pred_instances.priors
+        decode_bboxes = pred_instances.bboxes
+        pred_scores = pred_instances.scores
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+
+        priors = priors[:, :4]
+        num_gt, num_bboxes = gt_bboxes.size(0), priors.size(0)
+        # compute alignment metric between all bbox and gt
+        overlaps = self.iou_calculator(decode_bboxes, gt_bboxes).detach()
+        bbox_scores = pred_scores[:, gt_labels].detach()
+        # assign 0 by default
+        assigned_gt_inds = priors.new_full((num_bboxes, ), 0, dtype=torch.long)
+        assign_metrics = priors.new_zeros((num_bboxes, ))
+
+        if num_gt == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            max_overlaps = priors.new_zeros((num_bboxes, ))
+            if num_gt == 0:
+                # No gt boxes, assign everything to background
+                assigned_gt_inds[:] = 0
+            assigned_labels = priors.new_full((num_bboxes, ),
+                                              -1,
+                                              dtype=torch.long)
+            assign_result = AssignResult(
+                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+            assign_result.assign_metrics = assign_metrics
+            return assign_result
+
+        # select top-k bboxes as candidates for each gt
+        alignment_metrics = bbox_scores**alpha * overlaps**beta
+        topk = min(self.topk, alignment_metrics.size(0))
+        _, candidate_idxs = alignment_metrics.topk(topk, dim=0, largest=True)
+        candidate_metrics = alignment_metrics[candidate_idxs,
+                                              torch.arange(num_gt)]
+        is_pos = candidate_metrics > 0
+
+        # limit the positive sample's center in gt
+        priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0
+        priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0
+        for gt_idx in range(num_gt):
+            candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
+        ep_priors_cx = priors_cx.view(1, -1).expand(
+            num_gt, num_bboxes).contiguous().view(-1)
+        ep_priors_cy = priors_cy.view(1, -1).expand(
+            num_gt, num_bboxes).contiguous().view(-1)
+        candidate_idxs = candidate_idxs.view(-1)
+
+        # calculate the left, top, right, bottom distance between positive
+        # bbox center and gt side
+        l_ = ep_priors_cx[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 0]
+        t_ = ep_priors_cy[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 1]
+        r_ = gt_bboxes[:, 2] - ep_priors_cx[candidate_idxs].view(-1, num_gt)
+        b_ = gt_bboxes[:, 3] - ep_priors_cy[candidate_idxs].view(-1, num_gt)
+        is_in_gts = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] > 0.01
+        is_pos = is_pos & is_in_gts
+
+        # if an anchor box is assigned to multiple gts,
+        # the one with the highest iou will be selected.
+        overlaps_inf = torch.full_like(overlaps,
+                                       -INF).t().contiguous().view(-1)
+        index = candidate_idxs.view(-1)[is_pos.view(-1)]
+        overlaps_inf[index] = overlaps.t().contiguous().view(-1)[index]
+        overlaps_inf = overlaps_inf.view(num_gt, -1).t()
+
+        max_overlaps, argmax_overlaps = overlaps_inf.max(dim=1)
+        assigned_gt_inds[
+            max_overlaps != -INF] = argmax_overlaps[max_overlaps != -INF] + 1
+        assign_metrics[max_overlaps != -INF] = alignment_metrics[
+            max_overlaps != -INF, argmax_overlaps[max_overlaps != -INF]]
+
+        assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+        pos_inds = torch.nonzero(
+            assigned_gt_inds > 0, as_tuple=False).squeeze()
+        if pos_inds.numel() > 0:
+            assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] -
+                                                  1]
+        assign_result = AssignResult(
+            num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+        assign_result.assign_metrics = assign_metrics
+        return assign_result
diff --git a/mmde/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py b/mmde/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..e48f092ac1ae99eadfdf7502b591b57c782e6354
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py
@@ -0,0 +1,182 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmengine.structures import BaseDataElement
+from scipy.optimize import linear_sum_assignment
+
+from mmdet.registry import TASK_UTILS
+from .assign_result import AssignResult
+from .task_aligned_assigner import TaskAlignedAssigner
+
+
+@TASK_UTILS.register_module()
+class TopkHungarianAssigner(TaskAlignedAssigner):
+    """Computes 1-to-k matching between ground truth and predictions.
+
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of some components.
+    For DETR the costs are weighted sum of classification cost, regression L1
+    cost and regression iou cost. The targets don't include the no_object, so
+    generally there are more predictions than targets. After the 1-to-k
+    gt-pred matching, the un-matched are treated as backgrounds. Thus each
+    query prediction will be assigned with `0` or a positive integer
+    indicating the ground truth index:
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        cls_cost (dict): Classification cost configuration.
+        reg_cost (dict): Regression L1  cost configuration.
+        iou_cost (dict): Regression iou cost configuration.
+    """
+
+    def __init__(self,
+                 *args,
+                 cls_cost=dict(type='FocalLossCost', weight=2.0),
+                 reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+                 iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+                 **kwargs):
+        super(TopkHungarianAssigner, self).__init__(*args, **kwargs)
+
+        self.cls_cost = TASK_UTILS.build(cls_cost)
+        self.reg_cost = TASK_UTILS.build(reg_cost)
+        self.iou_cost = TASK_UTILS.build(iou_cost)
+
+    def assign(self,
+               pred_scores,
+               decode_bboxes,
+               gt_bboxes,
+               gt_labels,
+               img_meta,
+               alpha=1,
+               beta=6,
+               **kwargs):
+        """Computes 1-to-k gt-pred matching based on the weighted costs.
+
+        This method assign each query prediction to a ground truth or
+        background. The `assigned_gt_inds` with -1 means don't care,
+        0 means negative sample, and positive number is the index (1-based)
+        of assigned gt.
+        The assignment is done in the following steps, the order matters.
+
+        1. Assign every prediction to -1.
+        2. Compute the weighted costs, each cost has shape (num_pred, num_gt).
+        3. Update topk to be min(topk, int(num_pred / num_gt)), then repeat
+            costs topk times to shape: (num_pred, num_gt * topk), so that each
+            gt will match topk predictions.
+        3. Do Hungarian matching on CPU based on the costs.
+        4. Assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+        5. Calculate alignment metrics and overlaps of each matched pred-gt
+            pair.
+
+        Args:
+            pred_scores (Tensor): Predicted normalized classification
+                scores for one image, has shape (num_dense_queries,
+                cls_out_channels).
+            decode_bboxes (Tensor): Predicted unnormalized bbox coordinates
+                for one image, has shape (num_dense_queries, 4) with the
+                last dimension arranged as (x1, y1, x2, y2).
+            gt_bboxes (Tensor): Unnormalized ground truth
+                bboxes for one image, has shape (num_gt, 4) with the
+                last dimension arranged as (x1, y1, x2, y2).
+                NOTE: num_gt is dynamic for each image.
+            gt_labels (Tensor): Ground truth classification
+                    index for the image, has shape (num_gt,).
+                    NOTE: num_gt is dynamic for each image.
+            img_meta (dict): Meta information for one image.
+            alpha (int): Hyper-parameters related to alignment_metrics.
+                Defaults to 1.
+            beta (int): Hyper-parameters related to alignment_metrics.
+                Defaults to 6.
+
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        pred_scores = pred_scores.detach()
+        decode_bboxes = decode_bboxes.detach()
+        temp_overlaps = self.iou_calculator(decode_bboxes, gt_bboxes).detach()
+        bbox_scores = pred_scores[:, gt_labels].detach()
+        alignment_metrics = bbox_scores**alpha * temp_overlaps**beta
+
+        pred_instances = BaseDataElement()
+        gt_instances = BaseDataElement()
+
+        pred_instances.bboxes = decode_bboxes
+        gt_instances.bboxes = gt_bboxes
+
+        pred_instances.scores = pred_scores
+        gt_instances.labels = gt_labels
+
+        reg_cost = self.reg_cost(pred_instances, gt_instances, img_meta)
+        iou_cost = self.iou_cost(pred_instances, gt_instances, img_meta)
+        cls_cost = self.cls_cost(pred_instances, gt_instances, img_meta)
+        all_cost = cls_cost + reg_cost + iou_cost
+
+        num_gt, num_bboxes = gt_bboxes.size(0), pred_scores.size(0)
+        if num_gt > 0:
+            # assign 0 by default
+            assigned_gt_inds = pred_scores.new_full((num_bboxes, ),
+                                                    0,
+                                                    dtype=torch.long)
+            select_cost = all_cost
+
+            topk = min(self.topk, int(len(select_cost) / num_gt))
+
+            # Repeat the ground truth `topk` times to perform 1-to-k gt-pred
+            #   matching. For example, if `num_pred` = 900, `num_gt` = 3, then
+            #   there are only 3 gt-pred pairs in sum for 1-1 matching.
+            #   However, for 1-k gt-pred matching, if `topk` = 4, then each
+            #   gt is assigned 4 unique predictions, so there would be 12
+            #   gt-pred pairs in sum.
+            repeat_select_cost = select_cost[...,
+                                             None].repeat(1, 1, topk).view(
+                                                 select_cost.size(0), -1)
+            # anchor index and gt index
+            matched_row_inds, matched_col_inds = linear_sum_assignment(
+                repeat_select_cost.detach().cpu().numpy())
+            matched_row_inds = torch.from_numpy(matched_row_inds).to(
+                pred_scores.device)
+            matched_col_inds = torch.from_numpy(matched_col_inds).to(
+                pred_scores.device)
+
+            match_gt_ids = matched_col_inds // topk
+            candidate_idxs = matched_row_inds
+
+            assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+
+            if candidate_idxs.numel() > 0:
+                assigned_labels[candidate_idxs] = gt_labels[match_gt_ids]
+            else:
+                assigned_labels = None
+
+            assigned_gt_inds[candidate_idxs] = match_gt_ids + 1
+
+            overlaps = self.iou_calculator(
+                decode_bboxes[candidate_idxs],
+                gt_bboxes[match_gt_ids],
+                is_aligned=True).detach()
+
+            temp_pos_alignment_metrics = alignment_metrics[candidate_idxs]
+            pos_alignment_metrics = torch.gather(temp_pos_alignment_metrics, 1,
+                                                 match_gt_ids[:,
+                                                              None]).view(-1)
+            assign_result = AssignResult(
+                num_gt, assigned_gt_inds, overlaps, labels=assigned_labels)
+
+            assign_result.assign_metrics = pos_alignment_metrics
+            return assign_result
+        else:
+
+            assigned_gt_inds = pred_scores.new_full((num_bboxes, ),
+                                                    -1,
+                                                    dtype=torch.long)
+
+            assigned_labels = pred_scores.new_full((num_bboxes, ),
+                                                   -1,
+                                                   dtype=torch.long)
+
+            assigned_gt_inds[:] = 0
+            return AssignResult(
+                0, assigned_gt_inds, None, labels=assigned_labels)
diff --git a/mmde/mmdet/models/task_modules/assigners/uniform_assigner.py b/mmde/mmdet/models/task_modules/assigners/uniform_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a83bfd0b46a3690dce9cf0adf2c1e676f304d06
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/assigners/uniform_assigner.py
@@ -0,0 +1,173 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import bbox_xyxy_to_cxcywh
+from mmdet.utils import ConfigType
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+
+@TASK_UTILS.register_module()
+class UniformAssigner(BaseAssigner):
+    """Uniform Matching between the priors and gt boxes, which can achieve
+    balance in positive priors, and gt_bboxes_ignore was not considered for
+    now.
+
+    Args:
+        pos_ignore_thr (float): the threshold to ignore positive priors
+        neg_ignore_thr (float): the threshold to ignore negative priors
+        match_times(int): Number of positive priors for each gt box.
+           Defaults to 4.
+        iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou
+            calculator. Defaults to ``dict(type='BboxOverlaps2D')``
+    """
+
+    def __init__(self,
+                 pos_ignore_thr: float,
+                 neg_ignore_thr: float,
+                 match_times: int = 4,
+                 iou_calculator: ConfigType = dict(type='BboxOverlaps2D')):
+        self.match_times = match_times
+        self.pos_ignore_thr = pos_ignore_thr
+        self.neg_ignore_thr = neg_ignore_thr
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def assign(
+            self,
+            pred_instances: InstanceData,
+            gt_instances: InstanceData,
+            gt_instances_ignore: Optional[InstanceData] = None
+    ) -> AssignResult:
+        """Assign gt to priors.
+
+        The assignment is done in following steps
+
+        1. assign -1 by default
+        2. compute the L1 cost between boxes. Note that we use priors and
+           predict boxes both
+        3. compute the ignore indexes use gt_bboxes and predict boxes
+        4. compute the ignore indexes of positive sample use priors and
+           predict boxes
+
+
+        Args:
+            pred_instances (:obj:`InstaceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be priors, points, or bboxes predicted by the model,
+                shape(n, 4).
+            gt_instances (:obj:`InstaceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            gt_instances_ignore (:obj:`InstaceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        priors = pred_instances.priors
+        bbox_pred = pred_instances.decoder_priors
+
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+                                              0,
+                                              dtype=torch.long)
+        assigned_labels = bbox_pred.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            assign_result = AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+            assign_result.set_extra_property(
+                'pos_idx', bbox_pred.new_empty(0, dtype=torch.bool))
+            assign_result.set_extra_property('pos_predicted_boxes',
+                                             bbox_pred.new_empty((0, 4)))
+            assign_result.set_extra_property('target_boxes',
+                                             bbox_pred.new_empty((0, 4)))
+            return assign_result
+
+        # 2. Compute the L1 cost between boxes
+        # Note that we use priors and predict boxes both
+        cost_bbox = torch.cdist(
+            bbox_xyxy_to_cxcywh(bbox_pred),
+            bbox_xyxy_to_cxcywh(gt_bboxes),
+            p=1)
+        cost_bbox_priors = torch.cdist(
+            bbox_xyxy_to_cxcywh(priors), bbox_xyxy_to_cxcywh(gt_bboxes), p=1)
+
+        # We found that topk function has different results in cpu and
+        # cuda mode. In order to ensure consistency with the source code,
+        # we also use cpu mode.
+        # TODO: Check whether the performance of cpu and cuda are the same.
+        C = cost_bbox.cpu()
+        C1 = cost_bbox_priors.cpu()
+
+        # self.match_times x n
+        index = torch.topk(
+            C,  # c=b,n,x c[i]=n,x
+            k=self.match_times,
+            dim=0,
+            largest=False)[1]
+
+        # self.match_times x n
+        index1 = torch.topk(C1, k=self.match_times, dim=0, largest=False)[1]
+        # (self.match_times*2) x n
+        indexes = torch.cat((index, index1),
+                            dim=1).reshape(-1).to(bbox_pred.device)
+
+        pred_overlaps = self.iou_calculator(bbox_pred, gt_bboxes)
+        anchor_overlaps = self.iou_calculator(priors, gt_bboxes)
+        pred_max_overlaps, _ = pred_overlaps.max(dim=1)
+        anchor_max_overlaps, _ = anchor_overlaps.max(dim=0)
+
+        # 3. Compute the ignore indexes use gt_bboxes and predict boxes
+        ignore_idx = pred_max_overlaps > self.neg_ignore_thr
+        assigned_gt_inds[ignore_idx] = -1
+
+        # 4. Compute the ignore indexes of positive sample use priors
+        # and predict boxes
+        pos_gt_index = torch.arange(
+            0, C1.size(1),
+            device=bbox_pred.device).repeat(self.match_times * 2)
+        pos_ious = anchor_overlaps[indexes, pos_gt_index]
+        pos_ignore_idx = pos_ious < self.pos_ignore_thr
+
+        pos_gt_index_with_ignore = pos_gt_index + 1
+        pos_gt_index_with_ignore[pos_ignore_idx] = -1
+        assigned_gt_inds[indexes] = pos_gt_index_with_ignore
+
+        if gt_labels is not None:
+            assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
+            pos_inds = torch.nonzero(
+                assigned_gt_inds > 0, as_tuple=False).squeeze()
+            if pos_inds.numel() > 0:
+                assigned_labels[pos_inds] = gt_labels[
+                    assigned_gt_inds[pos_inds] - 1]
+        else:
+            assigned_labels = None
+
+        assign_result = AssignResult(
+            num_gts,
+            assigned_gt_inds,
+            anchor_max_overlaps,
+            labels=assigned_labels)
+        assign_result.set_extra_property('pos_idx', ~pos_ignore_idx)
+        assign_result.set_extra_property('pos_predicted_boxes',
+                                         bbox_pred[indexes])
+        assign_result.set_extra_property('target_boxes',
+                                         gt_bboxes[pos_gt_index])
+        return assign_result
diff --git a/mmde/mmdet/models/task_modules/builder.py b/mmde/mmdet/models/task_modules/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6736049fef688e0d663d6195c79ec9688dc4c5d7
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/builder.py
@@ -0,0 +1,62 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+from mmdet.registry import TASK_UTILS
+
+PRIOR_GENERATORS = TASK_UTILS
+ANCHOR_GENERATORS = TASK_UTILS
+BBOX_ASSIGNERS = TASK_UTILS
+BBOX_SAMPLERS = TASK_UTILS
+BBOX_CODERS = TASK_UTILS
+MATCH_COSTS = TASK_UTILS
+IOU_CALCULATORS = TASK_UTILS
+
+
+def build_bbox_coder(cfg, **default_args):
+    """Builder of box coder."""
+    warnings.warn('``build_sampler`` would be deprecated soon, please use '
+                  '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_iou_calculator(cfg, default_args=None):
+    """Builder of IoU calculator."""
+    warnings.warn(
+        '``build_iou_calculator`` would be deprecated soon, please use '
+        '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_match_cost(cfg, default_args=None):
+    """Builder of IoU calculator."""
+    warnings.warn('``build_match_cost`` would be deprecated soon, please use '
+                  '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_assigner(cfg, **default_args):
+    """Builder of box assigner."""
+    warnings.warn('``build_assigner`` would be deprecated soon, please use '
+                  '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_sampler(cfg, **default_args):
+    """Builder of box sampler."""
+    warnings.warn('``build_sampler`` would be deprecated soon, please use '
+                  '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_prior_generator(cfg, default_args=None):
+    warnings.warn(
+        '``build_prior_generator`` would be deprecated soon, please use '
+        '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_anchor_generator(cfg, default_args=None):
+    warnings.warn(
+        '``build_anchor_generator`` would be deprecated soon, please use '
+        '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
diff --git a/mmde/mmdet/models/task_modules/coders/__init__.py b/mmde/mmdet/models/task_modules/coders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97c3982140021958dabdd03f8040519f946250ff
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/coders/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_bbox_coder import BaseBBoxCoder
+from .bucketing_bbox_coder import BucketingBBoxCoder
+from .delta_xywh_bbox_coder import (DeltaXYWHBBoxCoder,
+                                    DeltaXYWHBBoxCoderForGLIP)
+from .distance_point_bbox_coder import DistancePointBBoxCoder
+from .legacy_delta_xywh_bbox_coder import LegacyDeltaXYWHBBoxCoder
+from .pseudo_bbox_coder import PseudoBBoxCoder
+from .tblr_bbox_coder import TBLRBBoxCoder
+from .yolo_bbox_coder import YOLOBBoxCoder
+
+__all__ = [
+    'BaseBBoxCoder', 'PseudoBBoxCoder', 'DeltaXYWHBBoxCoder',
+    'LegacyDeltaXYWHBBoxCoder', 'TBLRBBoxCoder', 'YOLOBBoxCoder',
+    'BucketingBBoxCoder', 'DistancePointBBoxCoder', 'DeltaXYWHBBoxCoderForGLIP'
+]
diff --git a/mmde/mmdet/models/task_modules/coders/base_bbox_coder.py b/mmde/mmdet/models/task_modules/coders/base_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..806d2651869e02173578c9eb331758743a068dd9
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/coders/base_bbox_coder.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+
+class BaseBBoxCoder(metaclass=ABCMeta):
+    """Base bounding box coder.
+
+    Args:
+        use_box_type (bool): Whether to warp decoded boxes with the
+            box type data structure. Defaults to False.
+    """
+
+    # The size of the last of dimension of the encoded tensor.
+    encode_size = 4
+
+    def __init__(self, use_box_type: bool = False, **kwargs):
+        self.use_box_type = use_box_type
+
+    @abstractmethod
+    def encode(self, bboxes, gt_bboxes):
+        """Encode deltas between bboxes and ground truth boxes."""
+
+    @abstractmethod
+    def decode(self, bboxes, bboxes_pred):
+        """Decode the predicted bboxes according to prediction and base
+        boxes."""
diff --git a/mmde/mmdet/models/task_modules/coders/bucketing_bbox_coder.py b/mmde/mmdet/models/task_modules/coders/bucketing_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..4044e1cd91d619521606f3c03032a40a9fc27130
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/coders/bucketing_bbox_coder.py
@@ -0,0 +1,366 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import (BaseBoxes, HorizontalBoxes, bbox_rescale,
+                                   get_box_tensor)
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class BucketingBBoxCoder(BaseBBoxCoder):
+    """Bucketing BBox Coder for Side-Aware Boundary Localization (SABL).
+
+    Boundary Localization with Bucketing and Bucketing Guided Rescoring
+    are implemented here.
+
+    Please refer to https://arxiv.org/abs/1912.04260 for more details.
+
+    Args:
+        num_buckets (int): Number of buckets.
+        scale_factor (int): Scale factor of proposals to generate buckets.
+        offset_topk (int): Topk buckets are used to generate
+             bucket fine regression targets. Defaults to 2.
+        offset_upperbound (float): Offset upperbound to generate
+             bucket fine regression targets.
+             To avoid too large offset displacements. Defaults to 1.0.
+        cls_ignore_neighbor (bool): Ignore second nearest bucket or Not.
+             Defaults to True.
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 num_buckets: int,
+                 scale_factor: int,
+                 offset_topk: int = 2,
+                 offset_upperbound: float = 1.0,
+                 cls_ignore_neighbor: bool = True,
+                 clip_border: bool = True,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.num_buckets = num_buckets
+        self.scale_factor = scale_factor
+        self.offset_topk = offset_topk
+        self.offset_upperbound = offset_upperbound
+        self.cls_ignore_neighbor = cls_ignore_neighbor
+        self.clip_border = clip_border
+
+    def encode(self, bboxes: Union[Tensor, BaseBoxes],
+               gt_bboxes: Union[Tensor, BaseBoxes]) -> Tuple[Tensor]:
+        """Get bucketing estimation and fine regression targets during
+        training.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): source boxes,
+                e.g., object proposals.
+            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): target of the
+                transformation, e.g., ground truth boxes.
+
+        Returns:
+           encoded_bboxes(tuple[Tensor]): bucketing estimation
+            and fine regression targets and weights
+        """
+        bboxes = get_box_tensor(bboxes)
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = bbox2bucket(bboxes, gt_bboxes, self.num_buckets,
+                                     self.scale_factor, self.offset_topk,
+                                     self.offset_upperbound,
+                                     self.cls_ignore_neighbor)
+        return encoded_bboxes
+
+    def decode(
+        self,
+        bboxes: Union[Tensor, BaseBoxes],
+        pred_bboxes: Tensor,
+        max_shape: Optional[Tuple[int]] = None
+    ) -> Tuple[Union[Tensor, BaseBoxes], Tensor]:
+        """Apply transformation `pred_bboxes` to `boxes`.
+        Args:
+            boxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes.
+            pred_bboxes (torch.Tensor): Predictions for bucketing estimation
+                and fine regression
+            max_shape (tuple[int], optional): Maximum shape of boxes.
+                Defaults to None.
+
+        Returns:
+            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
+        """
+        bboxes = get_box_tensor(bboxes)
+        assert len(pred_bboxes) == 2
+        cls_preds, offset_preds = pred_bboxes
+        assert cls_preds.size(0) == bboxes.size(0) and offset_preds.size(
+            0) == bboxes.size(0)
+        bboxes, loc_confidence = bucket2bbox(bboxes, cls_preds, offset_preds,
+                                             self.num_buckets,
+                                             self.scale_factor, max_shape,
+                                             self.clip_border)
+        if self.use_box_type:
+            bboxes = HorizontalBoxes(bboxes, clone=False)
+        return bboxes, loc_confidence
+
+
+def generat_buckets(proposals: Tensor,
+                    num_buckets: int,
+                    scale_factor: float = 1.0) -> Tuple[Tensor]:
+    """Generate buckets w.r.t bucket number and scale factor of proposals.
+
+    Args:
+        proposals (Tensor): Shape (n, 4)
+        num_buckets (int): Number of buckets.
+        scale_factor (float): Scale factor to rescale proposals.
+
+    Returns:
+        tuple[Tensor]: (bucket_w, bucket_h, l_buckets, r_buckets,
+         t_buckets, d_buckets)
+
+            - bucket_w: Width of buckets on x-axis. Shape (n, ).
+            - bucket_h: Height of buckets on y-axis. Shape (n, ).
+            - l_buckets: Left buckets. Shape (n, ceil(side_num/2)).
+            - r_buckets: Right buckets. Shape (n, ceil(side_num/2)).
+            - t_buckets: Top buckets. Shape (n, ceil(side_num/2)).
+            - d_buckets: Down buckets. Shape (n, ceil(side_num/2)).
+    """
+    proposals = bbox_rescale(proposals, scale_factor)
+
+    # number of buckets in each side
+    side_num = int(np.ceil(num_buckets / 2.0))
+    pw = proposals[..., 2] - proposals[..., 0]
+    ph = proposals[..., 3] - proposals[..., 1]
+    px1 = proposals[..., 0]
+    py1 = proposals[..., 1]
+    px2 = proposals[..., 2]
+    py2 = proposals[..., 3]
+
+    bucket_w = pw / num_buckets
+    bucket_h = ph / num_buckets
+
+    # left buckets
+    l_buckets = px1[:, None] + (0.5 + torch.arange(
+        0, side_num).to(proposals).float())[None, :] * bucket_w[:, None]
+    # right buckets
+    r_buckets = px2[:, None] - (0.5 + torch.arange(
+        0, side_num).to(proposals).float())[None, :] * bucket_w[:, None]
+    # top buckets
+    t_buckets = py1[:, None] + (0.5 + torch.arange(
+        0, side_num).to(proposals).float())[None, :] * bucket_h[:, None]
+    # down buckets
+    d_buckets = py2[:, None] - (0.5 + torch.arange(
+        0, side_num).to(proposals).float())[None, :] * bucket_h[:, None]
+    return bucket_w, bucket_h, l_buckets, r_buckets, t_buckets, d_buckets
+
+
+def bbox2bucket(proposals: Tensor,
+                gt: Tensor,
+                num_buckets: int,
+                scale_factor: float,
+                offset_topk: int = 2,
+                offset_upperbound: float = 1.0,
+                cls_ignore_neighbor: bool = True) -> Tuple[Tensor]:
+    """Generate buckets estimation and fine regression targets.
+
+    Args:
+        proposals (Tensor): Shape (n, 4)
+        gt (Tensor): Shape (n, 4)
+        num_buckets (int): Number of buckets.
+        scale_factor (float): Scale factor to rescale proposals.
+        offset_topk (int): Topk buckets are used to generate
+             bucket fine regression targets. Defaults to 2.
+        offset_upperbound (float): Offset allowance to generate
+             bucket fine regression targets.
+             To avoid too large offset displacements. Defaults to 1.0.
+        cls_ignore_neighbor (bool): Ignore second nearest bucket or Not.
+             Defaults to True.
+
+    Returns:
+        tuple[Tensor]: (offsets, offsets_weights, bucket_labels, cls_weights).
+
+            - offsets: Fine regression targets. \
+                Shape (n, num_buckets*2).
+            - offsets_weights: Fine regression weights. \
+                Shape (n, num_buckets*2).
+            - bucket_labels: Bucketing estimation labels. \
+                Shape (n, num_buckets*2).
+            - cls_weights: Bucketing estimation weights. \
+                Shape (n, num_buckets*2).
+    """
+    assert proposals.size() == gt.size()
+
+    # generate buckets
+    proposals = proposals.float()
+    gt = gt.float()
+    (bucket_w, bucket_h, l_buckets, r_buckets, t_buckets,
+     d_buckets) = generat_buckets(proposals, num_buckets, scale_factor)
+
+    gx1 = gt[..., 0]
+    gy1 = gt[..., 1]
+    gx2 = gt[..., 2]
+    gy2 = gt[..., 3]
+
+    # generate offset targets and weights
+    # offsets from buckets to gts
+    l_offsets = (l_buckets - gx1[:, None]) / bucket_w[:, None]
+    r_offsets = (r_buckets - gx2[:, None]) / bucket_w[:, None]
+    t_offsets = (t_buckets - gy1[:, None]) / bucket_h[:, None]
+    d_offsets = (d_buckets - gy2[:, None]) / bucket_h[:, None]
+
+    # select top-k nearest buckets
+    l_topk, l_label = l_offsets.abs().topk(
+        offset_topk, dim=1, largest=False, sorted=True)
+    r_topk, r_label = r_offsets.abs().topk(
+        offset_topk, dim=1, largest=False, sorted=True)
+    t_topk, t_label = t_offsets.abs().topk(
+        offset_topk, dim=1, largest=False, sorted=True)
+    d_topk, d_label = d_offsets.abs().topk(
+        offset_topk, dim=1, largest=False, sorted=True)
+
+    offset_l_weights = l_offsets.new_zeros(l_offsets.size())
+    offset_r_weights = r_offsets.new_zeros(r_offsets.size())
+    offset_t_weights = t_offsets.new_zeros(t_offsets.size())
+    offset_d_weights = d_offsets.new_zeros(d_offsets.size())
+    inds = torch.arange(0, proposals.size(0)).to(proposals).long()
+
+    # generate offset weights of top-k nearest buckets
+    for k in range(offset_topk):
+        if k >= 1:
+            offset_l_weights[inds, l_label[:,
+                                           k]] = (l_topk[:, k] <
+                                                  offset_upperbound).float()
+            offset_r_weights[inds, r_label[:,
+                                           k]] = (r_topk[:, k] <
+                                                  offset_upperbound).float()
+            offset_t_weights[inds, t_label[:,
+                                           k]] = (t_topk[:, k] <
+                                                  offset_upperbound).float()
+            offset_d_weights[inds, d_label[:,
+                                           k]] = (d_topk[:, k] <
+                                                  offset_upperbound).float()
+        else:
+            offset_l_weights[inds, l_label[:, k]] = 1.0
+            offset_r_weights[inds, r_label[:, k]] = 1.0
+            offset_t_weights[inds, t_label[:, k]] = 1.0
+            offset_d_weights[inds, d_label[:, k]] = 1.0
+
+    offsets = torch.cat([l_offsets, r_offsets, t_offsets, d_offsets], dim=-1)
+    offsets_weights = torch.cat([
+        offset_l_weights, offset_r_weights, offset_t_weights, offset_d_weights
+    ],
+                                dim=-1)
+
+    # generate bucket labels and weight
+    side_num = int(np.ceil(num_buckets / 2.0))
+    labels = torch.stack(
+        [l_label[:, 0], r_label[:, 0], t_label[:, 0], d_label[:, 0]], dim=-1)
+
+    batch_size = labels.size(0)
+    bucket_labels = F.one_hot(labels.view(-1), side_num).view(batch_size,
+                                                              -1).float()
+    bucket_cls_l_weights = (l_offsets.abs() < 1).float()
+    bucket_cls_r_weights = (r_offsets.abs() < 1).float()
+    bucket_cls_t_weights = (t_offsets.abs() < 1).float()
+    bucket_cls_d_weights = (d_offsets.abs() < 1).float()
+    bucket_cls_weights = torch.cat([
+        bucket_cls_l_weights, bucket_cls_r_weights, bucket_cls_t_weights,
+        bucket_cls_d_weights
+    ],
+                                   dim=-1)
+    # ignore second nearest buckets for cls if necessary
+    if cls_ignore_neighbor:
+        bucket_cls_weights = (~((bucket_cls_weights == 1) &
+                                (bucket_labels == 0))).float()
+    else:
+        bucket_cls_weights[:] = 1.0
+    return offsets, offsets_weights, bucket_labels, bucket_cls_weights
+
+
+def bucket2bbox(proposals: Tensor,
+                cls_preds: Tensor,
+                offset_preds: Tensor,
+                num_buckets: int,
+                scale_factor: float = 1.0,
+                max_shape: Optional[Union[Sequence[int], Tensor,
+                                          Sequence[Sequence[int]]]] = None,
+                clip_border: bool = True) -> Tuple[Tensor]:
+    """Apply bucketing estimation (cls preds) and fine regression (offset
+    preds) to generate det bboxes.
+
+    Args:
+        proposals (Tensor): Boxes to be transformed. Shape (n, 4)
+        cls_preds (Tensor): bucketing estimation. Shape (n, num_buckets*2).
+        offset_preds (Tensor): fine regression. Shape (n, num_buckets*2).
+        num_buckets (int): Number of buckets.
+        scale_factor (float): Scale factor to rescale proposals.
+        max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W)
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+
+    Returns:
+        tuple[Tensor]: (bboxes, loc_confidence).
+
+            - bboxes: predicted bboxes. Shape (n, 4)
+            - loc_confidence: localization confidence of predicted bboxes.
+                Shape (n,).
+    """
+
+    side_num = int(np.ceil(num_buckets / 2.0))
+    cls_preds = cls_preds.view(-1, side_num)
+    offset_preds = offset_preds.view(-1, side_num)
+
+    scores = F.softmax(cls_preds, dim=1)
+    score_topk, score_label = scores.topk(2, dim=1, largest=True, sorted=True)
+
+    rescaled_proposals = bbox_rescale(proposals, scale_factor)
+
+    pw = rescaled_proposals[..., 2] - rescaled_proposals[..., 0]
+    ph = rescaled_proposals[..., 3] - rescaled_proposals[..., 1]
+    px1 = rescaled_proposals[..., 0]
+    py1 = rescaled_proposals[..., 1]
+    px2 = rescaled_proposals[..., 2]
+    py2 = rescaled_proposals[..., 3]
+
+    bucket_w = pw / num_buckets
+    bucket_h = ph / num_buckets
+
+    score_inds_l = score_label[0::4, 0]
+    score_inds_r = score_label[1::4, 0]
+    score_inds_t = score_label[2::4, 0]
+    score_inds_d = score_label[3::4, 0]
+    l_buckets = px1 + (0.5 + score_inds_l.float()) * bucket_w
+    r_buckets = px2 - (0.5 + score_inds_r.float()) * bucket_w
+    t_buckets = py1 + (0.5 + score_inds_t.float()) * bucket_h
+    d_buckets = py2 - (0.5 + score_inds_d.float()) * bucket_h
+
+    offsets = offset_preds.view(-1, 4, side_num)
+    inds = torch.arange(proposals.size(0)).to(proposals).long()
+    l_offsets = offsets[:, 0, :][inds, score_inds_l]
+    r_offsets = offsets[:, 1, :][inds, score_inds_r]
+    t_offsets = offsets[:, 2, :][inds, score_inds_t]
+    d_offsets = offsets[:, 3, :][inds, score_inds_d]
+
+    x1 = l_buckets - l_offsets * bucket_w
+    x2 = r_buckets - r_offsets * bucket_w
+    y1 = t_buckets - t_offsets * bucket_h
+    y2 = d_buckets - d_offsets * bucket_h
+
+    if clip_border and max_shape is not None:
+        x1 = x1.clamp(min=0, max=max_shape[1] - 1)
+        y1 = y1.clamp(min=0, max=max_shape[0] - 1)
+        x2 = x2.clamp(min=0, max=max_shape[1] - 1)
+        y2 = y2.clamp(min=0, max=max_shape[0] - 1)
+    bboxes = torch.cat([x1[:, None], y1[:, None], x2[:, None], y2[:, None]],
+                       dim=-1)
+
+    # bucketing guided rescoring
+    loc_confidence = score_topk[:, 0]
+    top2_neighbor_inds = (score_label[:, 0] - score_label[:, 1]).abs() == 1
+    loc_confidence += score_topk[:, 1] * top2_neighbor_inds.float()
+    loc_confidence = loc_confidence.view(-1, 4).mean(dim=1)
+
+    return bboxes, loc_confidence
diff --git a/mmde/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py b/mmde/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2b60b5ee791e05ce4f5f8d8e1876f7f61e964ed
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py
@@ -0,0 +1,579 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Optional, Sequence, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class DeltaXYWHBBoxCoder(BaseBBoxCoder):
+    """Delta XYWH BBox coder.
+
+    Following the practice in `R-CNN <https://arxiv.org/abs/1311.2524>`_,
+    this coder encodes bbox (x1, y1, x2, y2) into delta (dx, dy, dw, dh) and
+    decodes delta (dx, dy, dw, dh) back to original bbox (x1, y1, x2, y2).
+
+    Args:
+        target_means (Sequence[float]): Denormalizing means of target for
+            delta coordinates
+        target_stds (Sequence[float]): Denormalizing standard deviation of
+            target for delta coordinates
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+        add_ctr_clamp (bool): Whether to add center clamp, when added, the
+            predicted box is clamped is its center is too far away from
+            the original anchor's center. Only used by YOLOF. Default False.
+        ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF.
+            Default 32.
+    """
+
+    def __init__(self,
+                 target_means: Sequence[float] = (0., 0., 0., 0.),
+                 target_stds: Sequence[float] = (1., 1., 1., 1.),
+                 clip_border: bool = True,
+                 add_ctr_clamp: bool = False,
+                 ctr_clamp: int = 32,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.means = target_means
+        self.stds = target_stds
+        self.clip_border = clip_border
+        self.add_ctr_clamp = add_ctr_clamp
+        self.ctr_clamp = ctr_clamp
+
+    def encode(self, bboxes: Union[Tensor, BaseBoxes],
+               gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor:
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes``.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes,
+                e.g., object proposals.
+            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the
+                transformation, e.g., ground-truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas
+        """
+        bboxes = get_box_tensor(bboxes)
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = bbox2delta(bboxes, gt_bboxes, self.means, self.stds)
+        return encoded_bboxes
+
+    def decode(
+        self,
+        bboxes: Union[Tensor, BaseBoxes],
+        pred_bboxes: Tensor,
+        max_shape: Optional[Union[Sequence[int], Tensor,
+                                  Sequence[Sequence[int]]]] = None,
+        wh_ratio_clip: Optional[float] = 16 / 1000
+    ) -> Union[Tensor, BaseBoxes]:
+        """Apply transformation `pred_bboxes` to `boxes`.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape
+                (B, N, 4) or (N, 4)
+            pred_bboxes (Tensor): Encoded offsets with respect to each roi.
+               Has shape (B, N, num_classes * 4) or (B, N, 4) or
+               (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H
+               when rois is a grid of anchors.Offset encoding follows [1]_.
+            max_shape (Sequence[int] or torch.Tensor or Sequence[
+               Sequence[int]],optional): Maximum bounds for boxes, specifies
+               (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then
+               the max_shape should be a Sequence[Sequence[int]]
+               and the length of max_shape should also be B.
+            wh_ratio_clip (float, optional): The allowed ratio between
+                width and height.
+
+        Returns:
+            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
+        """
+        bboxes = get_box_tensor(bboxes)
+        assert pred_bboxes.size(0) == bboxes.size(0)
+        if pred_bboxes.ndim == 3:
+            assert pred_bboxes.size(1) == bboxes.size(1)
+
+        if pred_bboxes.ndim == 2 and not torch.onnx.is_in_onnx_export():
+            # single image decode
+            decoded_bboxes = delta2bbox(bboxes, pred_bboxes, self.means,
+                                        self.stds, max_shape, wh_ratio_clip,
+                                        self.clip_border, self.add_ctr_clamp,
+                                        self.ctr_clamp)
+        else:
+            if pred_bboxes.ndim == 3 and not torch.onnx.is_in_onnx_export():
+                warnings.warn(
+                    'DeprecationWarning: onnx_delta2bbox is deprecated '
+                    'in the case of batch decoding and non-ONNX, '
+                    'please use “delta2bbox” instead. In order to improve '
+                    'the decoding speed, the batch function will no '
+                    'longer be supported. ')
+            decoded_bboxes = onnx_delta2bbox(bboxes, pred_bboxes, self.means,
+                                             self.stds, max_shape,
+                                             wh_ratio_clip, self.clip_border,
+                                             self.add_ctr_clamp,
+                                             self.ctr_clamp)
+
+        if self.use_box_type:
+            assert decoded_bboxes.size(-1) == 4, \
+                ('Cannot warp decoded boxes with box type when decoded boxes'
+                 'have shape of (N, num_classes * 4)')
+            decoded_bboxes = HorizontalBoxes(decoded_bboxes)
+        return decoded_bboxes
+
+
+@TASK_UTILS.register_module()
+class DeltaXYWHBBoxCoderForGLIP(DeltaXYWHBBoxCoder):
+    """This is designed specifically for the GLIP algorithm.
+
+    In order to completely match the official performance, we need to perform
+    special calculations in the encoding and decoding processes, such as
+    additional +1 and -1 calculations. However, this is not a user-friendly
+    design.
+    """
+
+    def encode(self, bboxes: Union[Tensor, BaseBoxes],
+               gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor:
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes``.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes,
+                e.g., object proposals.
+            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the
+                transformation, e.g., ground-truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas
+        """
+        bboxes = get_box_tensor(bboxes)
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = bbox2delta(bboxes, gt_bboxes, self.means, self.stds)
+        return encoded_bboxes
+
+    def decode(
+        self,
+        bboxes: Union[Tensor, BaseBoxes],
+        pred_bboxes: Tensor,
+        max_shape: Optional[Union[Sequence[int], Tensor,
+                                  Sequence[Sequence[int]]]] = None,
+        wh_ratio_clip: Optional[float] = 16 / 1000
+    ) -> Union[Tensor, BaseBoxes]:
+        """Apply transformation `pred_bboxes` to `boxes`.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape
+                (B, N, 4) or (N, 4)
+            pred_bboxes (Tensor): Encoded offsets with respect to each roi.
+               Has shape (B, N, num_classes * 4) or (B, N, 4) or
+               (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H
+               when rois is a grid of anchors.Offset encoding follows [1]_.
+            max_shape (Sequence[int] or torch.Tensor or Sequence[
+               Sequence[int]],optional): Maximum bounds for boxes, specifies
+               (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then
+               the max_shape should be a Sequence[Sequence[int]]
+               and the length of max_shape should also be B.
+            wh_ratio_clip (float, optional): The allowed ratio between
+                width and height.
+
+        Returns:
+            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
+        """
+        bboxes = get_box_tensor(bboxes)
+        assert pred_bboxes.size(0) == bboxes.size(0)
+        if pred_bboxes.ndim == 3:
+            assert pred_bboxes.size(1) == bboxes.size(1)
+
+        if pred_bboxes.ndim == 2 and not torch.onnx.is_in_onnx_export():
+            # single image decode
+            decoded_bboxes = delta2bbox_glip(bboxes, pred_bboxes, self.means,
+                                             self.stds, max_shape,
+                                             wh_ratio_clip, self.clip_border,
+                                             self.add_ctr_clamp,
+                                             self.ctr_clamp)
+        else:
+            raise NotImplementedError()
+
+        if self.use_box_type:
+            assert decoded_bboxes.size(-1) == 4, \
+                ('Cannot warp decoded boxes with box type when decoded boxes'
+                 'have shape of (N, num_classes * 4)')
+            decoded_bboxes = HorizontalBoxes(decoded_bboxes)
+        return decoded_bboxes
+
+
+def bbox2delta(
+    proposals: Tensor,
+    gt: Tensor,
+    means: Sequence[float] = (0., 0., 0., 0.),
+    stds: Sequence[float] = (1., 1., 1., 1.)
+) -> Tensor:
+    """Compute deltas of proposals w.r.t. gt.
+
+    We usually compute the deltas of x, y, w, h of proposals w.r.t ground
+    truth bboxes to get regression target.
+    This is the inverse function of :func:`delta2bbox`.
+
+    Args:
+        proposals (Tensor): Boxes to be transformed, shape (N, ..., 4)
+        gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4)
+        means (Sequence[float]): Denormalizing means for delta coordinates
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates
+
+    Returns:
+        Tensor: deltas with shape (N, 4), where columns represent dx, dy,
+            dw, dh.
+    """
+    assert proposals.size() == gt.size()
+
+    proposals = proposals.float()
+    gt = gt.float()
+    px = (proposals[..., 0] + proposals[..., 2]) * 0.5
+    py = (proposals[..., 1] + proposals[..., 3]) * 0.5
+    pw = proposals[..., 2] - proposals[..., 0]
+    ph = proposals[..., 3] - proposals[..., 1]
+
+    gx = (gt[..., 0] + gt[..., 2]) * 0.5
+    gy = (gt[..., 1] + gt[..., 3]) * 0.5
+    gw = gt[..., 2] - gt[..., 0]
+    gh = gt[..., 3] - gt[..., 1]
+
+    dx = (gx - px) / pw
+    dy = (gy - py) / ph
+    dw = torch.log(gw / pw)
+    dh = torch.log(gh / ph)
+    deltas = torch.stack([dx, dy, dw, dh], dim=-1)
+
+    means = deltas.new_tensor(means).unsqueeze(0)
+    stds = deltas.new_tensor(stds).unsqueeze(0)
+    deltas = deltas.sub_(means).div_(stds)
+
+    return deltas
+
+
+def delta2bbox(rois: Tensor,
+               deltas: Tensor,
+               means: Sequence[float] = (0., 0., 0., 0.),
+               stds: Sequence[float] = (1., 1., 1., 1.),
+               max_shape: Optional[Union[Sequence[int], Tensor,
+                                         Sequence[Sequence[int]]]] = None,
+               wh_ratio_clip: float = 16 / 1000,
+               clip_border: bool = True,
+               add_ctr_clamp: bool = False,
+               ctr_clamp: int = 32) -> Tensor:
+    """Apply deltas to shift/scale base boxes.
+
+    Typically the rois are anchor or proposed bounding boxes and the deltas are
+    network outputs used to shift/scale those boxes.
+    This is the inverse function of :func:`bbox2delta`.
+
+    Args:
+        rois (Tensor): Boxes to be transformed. Has shape (N, 4).
+        deltas (Tensor): Encoded offsets relative to each roi.
+            Has shape (N, num_classes * 4) or (N, 4). Note
+            N = num_base_anchors * W * H, when rois is a grid of
+            anchors. Offset encoding follows [1]_.
+        means (Sequence[float]): Denormalizing means for delta coordinates.
+            Default (0., 0., 0., 0.).
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates. Default (1., 1., 1., 1.).
+        max_shape (tuple[int, int]): Maximum bounds for boxes, specifies
+           (H, W). Default None.
+        wh_ratio_clip (float): Maximum aspect ratio for boxes. Default
+            16 / 1000.
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Default True.
+        add_ctr_clamp (bool): Whether to add center clamp. When set to True,
+            the center of the prediction bounding box will be clamped to
+            avoid being too far away from the center of the anchor.
+            Only used by YOLOF. Default False.
+        ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF.
+            Default 32.
+
+    Returns:
+        Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4
+           represent tl_x, tl_y, br_x, br_y.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Example:
+        >>> rois = torch.Tensor([[ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 5.,  5.,  5.,  5.]])
+        >>> deltas = torch.Tensor([[  0.,   0.,   0.,   0.],
+        >>>                        [  1.,   1.,   1.,   1.],
+        >>>                        [  0.,   0.,   2.,  -1.],
+        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
+        >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3))
+        tensor([[0.0000, 0.0000, 1.0000, 1.0000],
+                [0.1409, 0.1409, 2.8591, 2.8591],
+                [0.0000, 0.3161, 4.1945, 0.6839],
+                [5.0000, 5.0000, 5.0000, 5.0000]])
+    """
+    num_bboxes, num_classes = deltas.size(0), deltas.size(1) // 4
+    if num_bboxes == 0:
+        return deltas
+
+    deltas = deltas.reshape(-1, 4)
+
+    means = deltas.new_tensor(means).view(1, -1)
+    stds = deltas.new_tensor(stds).view(1, -1)
+    denorm_deltas = deltas * stds + means
+
+    dxy = denorm_deltas[:, :2]
+    dwh = denorm_deltas[:, 2:]
+
+    # Compute width/height of each roi
+    rois_ = rois.repeat(1, num_classes).reshape(-1, 4)
+    pxy = ((rois_[:, :2] + rois_[:, 2:]) * 0.5)
+    pwh = (rois_[:, 2:] - rois_[:, :2])
+
+    dxy_wh = pwh * dxy
+
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    if add_ctr_clamp:
+        dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
+        dwh = torch.clamp(dwh, max=max_ratio)
+    else:
+        dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
+
+    gxy = pxy + dxy_wh
+    gwh = pwh * dwh.exp()
+    x1y1 = gxy - (gwh * 0.5)
+    x2y2 = gxy + (gwh * 0.5)
+    bboxes = torch.cat([x1y1, x2y2], dim=-1)
+    if clip_border and max_shape is not None:
+        bboxes[..., 0::2].clamp_(min=0, max=max_shape[1])
+        bboxes[..., 1::2].clamp_(min=0, max=max_shape[0])
+    bboxes = bboxes.reshape(num_bboxes, -1)
+    return bboxes
+
+
+def onnx_delta2bbox(rois: Tensor,
+                    deltas: Tensor,
+                    means: Sequence[float] = (0., 0., 0., 0.),
+                    stds: Sequence[float] = (1., 1., 1., 1.),
+                    max_shape: Optional[Union[Sequence[int], Tensor,
+                                              Sequence[Sequence[int]]]] = None,
+                    wh_ratio_clip: float = 16 / 1000,
+                    clip_border: Optional[bool] = True,
+                    add_ctr_clamp: bool = False,
+                    ctr_clamp: int = 32) -> Tensor:
+    """Apply deltas to shift/scale base boxes.
+
+    Typically the rois are anchor or proposed bounding boxes and the deltas are
+    network outputs used to shift/scale those boxes.
+    This is the inverse function of :func:`bbox2delta`.
+
+    Args:
+        rois (Tensor): Boxes to be transformed. Has shape (N, 4) or (B, N, 4)
+        deltas (Tensor): Encoded offsets with respect to each roi.
+            Has shape (B, N, num_classes * 4) or (B, N, 4) or
+            (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H
+            when rois is a grid of anchors.Offset encoding follows [1]_.
+        means (Sequence[float]): Denormalizing means for delta coordinates.
+            Default (0., 0., 0., 0.).
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates. Default (1., 1., 1., 1.).
+        max_shape (Sequence[int] or torch.Tensor or Sequence[
+            Sequence[int]],optional): Maximum bounds for boxes, specifies
+            (H, W, C) or (H, W). If rois shape is (B, N, 4), then
+            the max_shape should be a Sequence[Sequence[int]]
+            and the length of max_shape should also be B. Default None.
+        wh_ratio_clip (float): Maximum aspect ratio for boxes.
+            Default 16 / 1000.
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Default True.
+        add_ctr_clamp (bool): Whether to add center clamp, when added, the
+            predicted box is clamped is its center is too far away from
+            the original anchor's center. Only used by YOLOF. Default False.
+        ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF.
+            Default 32.
+
+    Returns:
+        Tensor: Boxes with shape (B, N, num_classes * 4) or (B, N, 4) or
+           (N, num_classes * 4) or (N, 4), where 4 represent
+           tl_x, tl_y, br_x, br_y.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Example:
+        >>> rois = torch.Tensor([[ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 5.,  5.,  5.,  5.]])
+        >>> deltas = torch.Tensor([[  0.,   0.,   0.,   0.],
+        >>>                        [  1.,   1.,   1.,   1.],
+        >>>                        [  0.,   0.,   2.,  -1.],
+        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
+        >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3))
+        tensor([[0.0000, 0.0000, 1.0000, 1.0000],
+                [0.1409, 0.1409, 2.8591, 2.8591],
+                [0.0000, 0.3161, 4.1945, 0.6839],
+                [5.0000, 5.0000, 5.0000, 5.0000]])
+    """
+    means = deltas.new_tensor(means).view(1,
+                                          -1).repeat(1,
+                                                     deltas.size(-1) // 4)
+    stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(-1) // 4)
+    denorm_deltas = deltas * stds + means
+    dx = denorm_deltas[..., 0::4]
+    dy = denorm_deltas[..., 1::4]
+    dw = denorm_deltas[..., 2::4]
+    dh = denorm_deltas[..., 3::4]
+
+    x1, y1 = rois[..., 0], rois[..., 1]
+    x2, y2 = rois[..., 2], rois[..., 3]
+    # Compute center of each roi
+    px = ((x1 + x2) * 0.5).unsqueeze(-1).expand_as(dx)
+    py = ((y1 + y2) * 0.5).unsqueeze(-1).expand_as(dy)
+    # Compute width/height of each roi
+    pw = (x2 - x1).unsqueeze(-1).expand_as(dw)
+    ph = (y2 - y1).unsqueeze(-1).expand_as(dh)
+
+    dx_width = pw * dx
+    dy_height = ph * dy
+
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    if add_ctr_clamp:
+        dx_width = torch.clamp(dx_width, max=ctr_clamp, min=-ctr_clamp)
+        dy_height = torch.clamp(dy_height, max=ctr_clamp, min=-ctr_clamp)
+        dw = torch.clamp(dw, max=max_ratio)
+        dh = torch.clamp(dh, max=max_ratio)
+    else:
+        dw = dw.clamp(min=-max_ratio, max=max_ratio)
+        dh = dh.clamp(min=-max_ratio, max=max_ratio)
+    # Use exp(network energy) to enlarge/shrink each roi
+    gw = pw * dw.exp()
+    gh = ph * dh.exp()
+    # Use network energy to shift the center of each roi
+    gx = px + dx_width
+    gy = py + dy_height
+    # Convert center-xy/width/height to top-left, bottom-right
+    x1 = gx - gw * 0.5
+    y1 = gy - gh * 0.5
+    x2 = gx + gw * 0.5
+    y2 = gy + gh * 0.5
+
+    bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size())
+
+    if clip_border and max_shape is not None:
+        # clip bboxes with dynamic `min` and `max` for onnx
+        if torch.onnx.is_in_onnx_export():
+            from mmdet.core.export import dynamic_clip_for_onnx
+            x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape)
+            bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size())
+            return bboxes
+        if not isinstance(max_shape, torch.Tensor):
+            max_shape = x1.new_tensor(max_shape)
+        max_shape = max_shape[..., :2].type_as(x1)
+        if max_shape.ndim == 2:
+            assert bboxes.ndim == 3
+            assert max_shape.size(0) == bboxes.size(0)
+
+        min_xy = x1.new_tensor(0)
+        max_xy = torch.cat(
+            [max_shape] * (deltas.size(-1) // 2),
+            dim=-1).flip(-1).unsqueeze(-2)
+        bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+        bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+    return bboxes
+
+
+def delta2bbox_glip(rois: Tensor,
+                    deltas: Tensor,
+                    means: Sequence[float] = (0., 0., 0., 0.),
+                    stds: Sequence[float] = (1., 1., 1., 1.),
+                    max_shape: Optional[Union[Sequence[int], Tensor,
+                                              Sequence[Sequence[int]]]] = None,
+                    wh_ratio_clip: float = 16 / 1000,
+                    clip_border: bool = True,
+                    add_ctr_clamp: bool = False,
+                    ctr_clamp: int = 32) -> Tensor:
+    """Apply deltas to shift/scale base boxes.
+
+    Typically the rois are anchor or proposed bounding boxes and the deltas are
+    network outputs used to shift/scale those boxes.
+    This is the inverse function of :func:`bbox2delta`.
+
+    Args:
+        rois (Tensor): Boxes to be transformed. Has shape (N, 4).
+        deltas (Tensor): Encoded offsets relative to each roi.
+            Has shape (N, num_classes * 4) or (N, 4). Note
+            N = num_base_anchors * W * H, when rois is a grid of
+            anchors. Offset encoding follows [1]_.
+        means (Sequence[float]): Denormalizing means for delta coordinates.
+            Default (0., 0., 0., 0.).
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates. Default (1., 1., 1., 1.).
+        max_shape (tuple[int, int]): Maximum bounds for boxes, specifies
+           (H, W). Default None.
+        wh_ratio_clip (float): Maximum aspect ratio for boxes. Default
+            16 / 1000.
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Default True.
+        add_ctr_clamp (bool): Whether to add center clamp. When set to True,
+            the center of the prediction bounding box will be clamped to
+            avoid being too far away from the center of the anchor.
+            Only used by YOLOF. Default False.
+        ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF.
+            Default 32.
+
+    Returns:
+        Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4
+           represent tl_x, tl_y, br_x, br_y.
+    """
+    num_bboxes, num_classes = deltas.size(0), deltas.size(1) // 4
+    if num_bboxes == 0:
+        return deltas
+
+    deltas = deltas.reshape(-1, 4)
+
+    means = deltas.new_tensor(means).view(1, -1)
+    stds = deltas.new_tensor(stds).view(1, -1)
+    denorm_deltas = deltas * stds + means
+
+    dxy = denorm_deltas[:, :2]
+    dwh = denorm_deltas[:, 2:]
+
+    # Compute width/height of each roi
+    rois_ = rois.repeat(1, num_classes).reshape(-1, 4)
+    pxy = ((rois_[:, :2] + rois_[:, 2:] - 1) * 0.5)  # note
+    pwh = (rois_[:, 2:] - rois_[:, :2])
+
+    dxy_wh = pwh * dxy
+
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    if add_ctr_clamp:
+        dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
+        dwh = torch.clamp(dwh, max=max_ratio)
+    else:
+        dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
+
+    gxy = pxy + dxy_wh
+    gwh = pwh * dwh.exp()
+
+    x1y1 = gxy - (gwh - 1) * 0.5  # Note
+    x2y2 = gxy + (gwh - 1) * 0.5  # Note
+
+    bboxes = torch.cat([x1y1, x2y2], dim=-1)
+
+    if clip_border and max_shape is not None:
+        bboxes[..., 0::2].clamp_(min=0, max=max_shape[1] - 1)  # Note
+        bboxes[..., 1::2].clamp_(min=0, max=max_shape[0] - 1)  # Note
+    bboxes = bboxes.reshape(num_bboxes, -1)
+    return bboxes
diff --git a/mmde/mmdet/models/task_modules/coders/distance_point_bbox_coder.py b/mmde/mmdet/models/task_modules/coders/distance_point_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab26bf4b96c48df689da3722c23aa65e646348db
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/coders/distance_point_bbox_coder.py
@@ -0,0 +1,85 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import (BaseBoxes, HorizontalBoxes, bbox2distance,
+                                   distance2bbox, get_box_tensor)
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class DistancePointBBoxCoder(BaseBBoxCoder):
+    """Distance Point BBox coder.
+
+    This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left,
+    right) and decode it back to the original.
+
+    Args:
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+    """
+
+    def __init__(self, clip_border: Optional[bool] = True, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.clip_border = clip_border
+
+    def encode(self,
+               points: Tensor,
+               gt_bboxes: Union[Tensor, BaseBoxes],
+               max_dis: Optional[float] = None,
+               eps: float = 0.1) -> Tensor:
+        """Encode bounding box to distances.
+
+        Args:
+            points (Tensor): Shape (N, 2), The format is [x, y].
+            gt_bboxes (Tensor or :obj:`BaseBoxes`): Shape (N, 4), The format
+                is "xyxy"
+            max_dis (float): Upper bound of the distance. Default None.
+            eps (float): a small value to ensure target < max_dis, instead <=.
+                Default 0.1.
+
+        Returns:
+            Tensor: Box transformation deltas. The shape is (N, 4).
+        """
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert points.size(0) == gt_bboxes.size(0)
+        assert points.size(-1) == 2
+        assert gt_bboxes.size(-1) == 4
+        return bbox2distance(points, gt_bboxes, max_dis, eps)
+
+    def decode(
+        self,
+        points: Tensor,
+        pred_bboxes: Tensor,
+        max_shape: Optional[Union[Sequence[int], Tensor,
+                                  Sequence[Sequence[int]]]] = None
+    ) -> Union[Tensor, BaseBoxes]:
+        """Decode distance prediction to bounding box.
+
+        Args:
+            points (Tensor): Shape (B, N, 2) or (N, 2).
+            pred_bboxes (Tensor): Distance from the given point to 4
+                boundaries (left, top, right, bottom). Shape (B, N, 4)
+                or (N, 4)
+            max_shape (Sequence[int] or torch.Tensor or Sequence[
+                Sequence[int]],optional): Maximum bounds for boxes, specifies
+                (H, W, C) or (H, W). If priors shape is (B, N, 4), then
+                the max_shape should be a Sequence[Sequence[int]],
+                and the length of max_shape should also be B.
+                Default None.
+        Returns:
+            Union[Tensor, :obj:`BaseBoxes`]: Boxes with shape (N, 4) or
+            (B, N, 4)
+        """
+        assert points.size(0) == pred_bboxes.size(0)
+        assert points.size(-1) == 2
+        assert pred_bboxes.size(-1) == 4
+        if self.clip_border is False:
+            max_shape = None
+        bboxes = distance2bbox(points, pred_bboxes, max_shape)
+
+        if self.use_box_type:
+            bboxes = HorizontalBoxes(bboxes)
+        return bboxes
diff --git a/mmde/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py b/mmde/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eb1bedb3fbe19433c8bdb37f80891efa2cb72fc
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py
@@ -0,0 +1,235 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class LegacyDeltaXYWHBBoxCoder(BaseBBoxCoder):
+    """Legacy Delta XYWH BBox coder used in MMDet V1.x.
+
+    Following the practice in R-CNN [1]_, this coder encodes bbox (x1, y1, x2,
+    y2) into delta (dx, dy, dw, dh) and decodes delta (dx, dy, dw, dh)
+    back to original bbox (x1, y1, x2, y2).
+
+    Note:
+        The main difference between :class`LegacyDeltaXYWHBBoxCoder` and
+        :class:`DeltaXYWHBBoxCoder` is whether ``+ 1`` is used during width and
+        height calculation. We suggest to only use this coder when testing with
+        MMDet V1.x models.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Args:
+        target_means (Sequence[float]): denormalizing means of target for
+            delta coordinates
+        target_stds (Sequence[float]): denormalizing standard deviation of
+            target for delta coordinates
+    """
+
+    def __init__(self,
+                 target_means: Sequence[float] = (0., 0., 0., 0.),
+                 target_stds: Sequence[float] = (1., 1., 1., 1.),
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.means = target_means
+        self.stds = target_stds
+
+    def encode(self, bboxes: Union[Tensor, BaseBoxes],
+               gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor:
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes``.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): source boxes,
+                e.g., object proposals.
+            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): target of the
+                transformation, e.g., ground-truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas
+        """
+        bboxes = get_box_tensor(bboxes)
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = legacy_bbox2delta(bboxes, gt_bboxes, self.means,
+                                           self.stds)
+        return encoded_bboxes
+
+    def decode(
+        self,
+        bboxes: Union[Tensor, BaseBoxes],
+        pred_bboxes: Tensor,
+        max_shape: Optional[Union[Sequence[int], Tensor,
+                                  Sequence[Sequence[int]]]] = None,
+        wh_ratio_clip: Optional[float] = 16 / 1000
+    ) -> Union[Tensor, BaseBoxes]:
+        """Apply transformation `pred_bboxes` to `boxes`.
+
+        Args:
+            boxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes.
+            pred_bboxes (torch.Tensor): Encoded boxes with shape
+            max_shape (tuple[int], optional): Maximum shape of boxes.
+                Defaults to None.
+            wh_ratio_clip (float, optional): The allowed ratio between
+                width and height.
+
+        Returns:
+            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
+        """
+        bboxes = get_box_tensor(bboxes)
+        assert pred_bboxes.size(0) == bboxes.size(0)
+        decoded_bboxes = legacy_delta2bbox(bboxes, pred_bboxes, self.means,
+                                           self.stds, max_shape, wh_ratio_clip)
+
+        if self.use_box_type:
+            assert decoded_bboxes.size(-1) == 4, \
+                ('Cannot warp decoded boxes with box type when decoded boxes'
+                 'have shape of (N, num_classes * 4)')
+            decoded_bboxes = HorizontalBoxes(decoded_bboxes)
+        return decoded_bboxes
+
+
+def legacy_bbox2delta(
+    proposals: Tensor,
+    gt: Tensor,
+    means: Sequence[float] = (0., 0., 0., 0.),
+    stds: Sequence[float] = (1., 1., 1., 1.)
+) -> Tensor:
+    """Compute deltas of proposals w.r.t. gt in the MMDet V1.x manner.
+
+    We usually compute the deltas of x, y, w, h of proposals w.r.t ground
+    truth bboxes to get regression target.
+    This is the inverse function of `delta2bbox()`
+
+    Args:
+        proposals (Tensor): Boxes to be transformed, shape (N, ..., 4)
+        gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4)
+        means (Sequence[float]): Denormalizing means for delta coordinates
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates
+
+    Returns:
+        Tensor: deltas with shape (N, 4), where columns represent dx, dy,
+            dw, dh.
+    """
+    assert proposals.size() == gt.size()
+
+    proposals = proposals.float()
+    gt = gt.float()
+    px = (proposals[..., 0] + proposals[..., 2]) * 0.5
+    py = (proposals[..., 1] + proposals[..., 3]) * 0.5
+    pw = proposals[..., 2] - proposals[..., 0] + 1.0
+    ph = proposals[..., 3] - proposals[..., 1] + 1.0
+
+    gx = (gt[..., 0] + gt[..., 2]) * 0.5
+    gy = (gt[..., 1] + gt[..., 3]) * 0.5
+    gw = gt[..., 2] - gt[..., 0] + 1.0
+    gh = gt[..., 3] - gt[..., 1] + 1.0
+
+    dx = (gx - px) / pw
+    dy = (gy - py) / ph
+    dw = torch.log(gw / pw)
+    dh = torch.log(gh / ph)
+    deltas = torch.stack([dx, dy, dw, dh], dim=-1)
+
+    means = deltas.new_tensor(means).unsqueeze(0)
+    stds = deltas.new_tensor(stds).unsqueeze(0)
+    deltas = deltas.sub_(means).div_(stds)
+
+    return deltas
+
+
+def legacy_delta2bbox(rois: Tensor,
+                      deltas: Tensor,
+                      means: Sequence[float] = (0., 0., 0., 0.),
+                      stds: Sequence[float] = (1., 1., 1., 1.),
+                      max_shape: Optional[
+                          Union[Sequence[int], Tensor,
+                                Sequence[Sequence[int]]]] = None,
+                      wh_ratio_clip: float = 16 / 1000) -> Tensor:
+    """Apply deltas to shift/scale base boxes in the MMDet V1.x manner.
+
+    Typically the rois are anchor or proposed bounding boxes and the deltas are
+    network outputs used to shift/scale those boxes.
+    This is the inverse function of `bbox2delta()`
+
+    Args:
+        rois (Tensor): Boxes to be transformed. Has shape (N, 4)
+        deltas (Tensor): Encoded offsets with respect to each roi.
+            Has shape (N, 4 * num_classes). Note N = num_anchors * W * H when
+            rois is a grid of anchors. Offset encoding follows [1]_.
+        means (Sequence[float]): Denormalizing means for delta coordinates
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates
+        max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W)
+        wh_ratio_clip (float): Maximum aspect ratio for boxes.
+
+    Returns:
+        Tensor: Boxes with shape (N, 4), where columns represent
+            tl_x, tl_y, br_x, br_y.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Example:
+        >>> rois = torch.Tensor([[ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 0.,  0.,  1.,  1.],
+        >>>                      [ 5.,  5.,  5.,  5.]])
+        >>> deltas = torch.Tensor([[  0.,   0.,   0.,   0.],
+        >>>                        [  1.,   1.,   1.,   1.],
+        >>>                        [  0.,   0.,   2.,  -1.],
+        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
+        >>> legacy_delta2bbox(rois, deltas, max_shape=(32, 32))
+        tensor([[0.0000, 0.0000, 1.5000, 1.5000],
+                [0.0000, 0.0000, 5.2183, 5.2183],
+                [0.0000, 0.1321, 7.8891, 0.8679],
+                [5.3967, 2.4251, 6.0033, 3.7749]])
+    """
+    means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4)
+    stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4)
+    denorm_deltas = deltas * stds + means
+    dx = denorm_deltas[:, 0::4]
+    dy = denorm_deltas[:, 1::4]
+    dw = denorm_deltas[:, 2::4]
+    dh = denorm_deltas[:, 3::4]
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    dw = dw.clamp(min=-max_ratio, max=max_ratio)
+    dh = dh.clamp(min=-max_ratio, max=max_ratio)
+    # Compute center of each roi
+    px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
+    py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
+    # Compute width/height of each roi
+    pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw)
+    ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh)
+    # Use exp(network energy) to enlarge/shrink each roi
+    gw = pw * dw.exp()
+    gh = ph * dh.exp()
+    # Use network energy to shift the center of each roi
+    gx = px + pw * dx
+    gy = py + ph * dy
+    # Convert center-xy/width/height to top-left, bottom-right
+
+    # The true legacy box coder should +- 0.5 here.
+    # However, current implementation improves the performance when testing
+    # the models trained in MMDetection 1.X (~0.5 bbox AP, 0.2 mask AP)
+    x1 = gx - gw * 0.5
+    y1 = gy - gh * 0.5
+    x2 = gx + gw * 0.5
+    y2 = gy + gh * 0.5
+    if max_shape is not None:
+        x1 = x1.clamp(min=0, max=max_shape[1] - 1)
+        y1 = y1.clamp(min=0, max=max_shape[0] - 1)
+        x2 = x2.clamp(min=0, max=max_shape[1] - 1)
+        y2 = y2.clamp(min=0, max=max_shape[0] - 1)
+    bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas)
+    return bboxes
diff --git a/mmde/mmdet/models/task_modules/coders/pseudo_bbox_coder.py b/mmde/mmdet/models/task_modules/coders/pseudo_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ee74311f6d12bde49d0c678edb60540a8c95c8b
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/coders/pseudo_bbox_coder.py
@@ -0,0 +1,29 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class PseudoBBoxCoder(BaseBBoxCoder):
+    """Pseudo bounding box coder."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def encode(self, bboxes: Tensor, gt_bboxes: Union[Tensor,
+                                                      BaseBoxes]) -> Tensor:
+        """torch.Tensor: return the given ``bboxes``"""
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        return gt_bboxes
+
+    def decode(self, bboxes: Tensor, pred_bboxes: Union[Tensor,
+                                                        BaseBoxes]) -> Tensor:
+        """torch.Tensor: return the given ``pred_bboxes``"""
+        if self.use_box_type:
+            pred_bboxes = HorizontalBoxes(pred_bboxes)
+        return pred_bboxes
diff --git a/mmde/mmdet/models/task_modules/coders/tblr_bbox_coder.py b/mmde/mmdet/models/task_modules/coders/tblr_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..74b388f7bad6ebc1911cee5b0b7d73bbd04de17a
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/coders/tblr_bbox_coder.py
@@ -0,0 +1,228 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class TBLRBBoxCoder(BaseBBoxCoder):
+    """TBLR BBox coder.
+
+    Following the practice in `FSAF <https://arxiv.org/abs/1903.00621>`_,
+    this coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left,
+    right) and decode it back to the original.
+
+    Args:
+        normalizer (list | float): Normalization factor to be
+          divided with when coding the coordinates. If it is a list, it should
+          have length of 4 indicating normalization factor in tblr dims.
+          Otherwise it is a unified float factor for all dims. Default: 4.0
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 normalizer: Union[Sequence[float], float] = 4.0,
+                 clip_border: bool = True,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.normalizer = normalizer
+        self.clip_border = clip_border
+
+    def encode(self, bboxes: Union[Tensor, BaseBoxes],
+               gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor:
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes`` in the (top, left,
+        bottom, right) order.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): source boxes,
+                e.g., object proposals.
+            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): target of the
+                transformation, e.g., ground truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas
+        """
+        bboxes = get_box_tensor(bboxes)
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = bboxes2tblr(
+            bboxes, gt_bboxes, normalizer=self.normalizer)
+        return encoded_bboxes
+
+    def decode(
+        self,
+        bboxes: Union[Tensor, BaseBoxes],
+        pred_bboxes: Tensor,
+        max_shape: Optional[Union[Sequence[int], Tensor,
+                                  Sequence[Sequence[int]]]] = None
+    ) -> Union[Tensor, BaseBoxes]:
+        """Apply transformation `pred_bboxes` to `boxes`.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes.Shape
+                (B, N, 4) or (N, 4)
+            pred_bboxes (torch.Tensor): Encoded boxes with shape
+               (B, N, 4) or (N, 4)
+            max_shape (Sequence[int] or torch.Tensor or Sequence[
+               Sequence[int]],optional): Maximum bounds for boxes, specifies
+               (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then
+               the max_shape should be a Sequence[Sequence[int]]
+               and the length of max_shape should also be B.
+
+        Returns:
+            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
+        """
+        bboxes = get_box_tensor(bboxes)
+        decoded_bboxes = tblr2bboxes(
+            bboxes,
+            pred_bboxes,
+            normalizer=self.normalizer,
+            max_shape=max_shape,
+            clip_border=self.clip_border)
+
+        if self.use_box_type:
+            decoded_bboxes = HorizontalBoxes(decoded_bboxes)
+        return decoded_bboxes
+
+
+def bboxes2tblr(priors: Tensor,
+                gts: Tensor,
+                normalizer: Union[Sequence[float], float] = 4.0,
+                normalize_by_wh: bool = True) -> Tensor:
+    """Encode ground truth boxes to tblr coordinate.
+
+    It first convert the gt coordinate to tblr format,
+     (top, bottom, left, right), relative to prior box centers.
+     The tblr coordinate may be normalized by the side length of prior bboxes
+     if `normalize_by_wh` is specified as True, and it is then normalized by
+     the `normalizer` factor.
+
+    Args:
+        priors (Tensor): Prior boxes in point form
+            Shape: (num_proposals,4).
+        gts (Tensor): Coords of ground truth for each prior in point-form
+            Shape: (num_proposals, 4).
+        normalizer (Sequence[float] | float): normalization parameter of
+            encoded boxes. If it is a list, it has to have length = 4.
+            Default: 4.0
+        normalize_by_wh (bool): Whether to normalize tblr coordinate by the
+            side length (wh) of prior bboxes.
+
+    Return:
+        encoded boxes (Tensor), Shape: (num_proposals, 4)
+    """
+
+    # dist b/t match center and prior's center
+    if not isinstance(normalizer, float):
+        normalizer = torch.tensor(normalizer, device=priors.device)
+        assert len(normalizer) == 4, 'Normalizer must have length = 4'
+    assert priors.size(0) == gts.size(0)
+    prior_centers = (priors[:, 0:2] + priors[:, 2:4]) / 2
+    xmin, ymin, xmax, ymax = gts.split(1, dim=1)
+    top = prior_centers[:, 1].unsqueeze(1) - ymin
+    bottom = ymax - prior_centers[:, 1].unsqueeze(1)
+    left = prior_centers[:, 0].unsqueeze(1) - xmin
+    right = xmax - prior_centers[:, 0].unsqueeze(1)
+    loc = torch.cat((top, bottom, left, right), dim=1)
+    if normalize_by_wh:
+        # Normalize tblr by anchor width and height
+        wh = priors[:, 2:4] - priors[:, 0:2]
+        w, h = torch.split(wh, 1, dim=1)
+        loc[:, :2] /= h  # tb is normalized by h
+        loc[:, 2:] /= w  # lr is normalized by w
+    # Normalize tblr by the given normalization factor
+    return loc / normalizer
+
+
+def tblr2bboxes(priors: Tensor,
+                tblr: Tensor,
+                normalizer: Union[Sequence[float], float] = 4.0,
+                normalize_by_wh: bool = True,
+                max_shape: Optional[Union[Sequence[int], Tensor,
+                                          Sequence[Sequence[int]]]] = None,
+                clip_border: bool = True) -> Tensor:
+    """Decode tblr outputs to prediction boxes.
+
+    The process includes 3 steps: 1) De-normalize tblr coordinates by
+    multiplying it with `normalizer`; 2) De-normalize tblr coordinates by the
+    prior bbox width and height if `normalize_by_wh` is `True`; 3) Convert
+    tblr (top, bottom, left, right) pair relative to the center of priors back
+    to (xmin, ymin, xmax, ymax) coordinate.
+
+    Args:
+        priors (Tensor): Prior boxes in point form (x0, y0, x1, y1)
+          Shape: (N,4) or (B, N, 4).
+        tblr (Tensor): Coords of network output in tblr form
+          Shape: (N, 4) or (B, N, 4).
+        normalizer (Sequence[float] | float): Normalization parameter of
+          encoded boxes. By list, it represents the normalization factors at
+          tblr dims. By float, it is the unified normalization factor at all
+          dims. Default: 4.0
+        normalize_by_wh (bool): Whether the tblr coordinates have been
+          normalized by the side length (wh) of prior bboxes.
+        max_shape (Sequence[int] or torch.Tensor or Sequence[
+            Sequence[int]],optional): Maximum bounds for boxes, specifies
+            (H, W, C) or (H, W). If priors shape is (B, N, 4), then
+            the max_shape should be a Sequence[Sequence[int]]
+            and the length of max_shape should also be B.
+        clip_border (bool, optional): Whether clip the objects outside the
+            border of the image. Defaults to True.
+
+    Return:
+        encoded boxes (Tensor): Boxes with shape (N, 4) or (B, N, 4)
+    """
+    if not isinstance(normalizer, float):
+        normalizer = torch.tensor(normalizer, device=priors.device)
+        assert len(normalizer) == 4, 'Normalizer must have length = 4'
+    assert priors.size(0) == tblr.size(0)
+    if priors.ndim == 3:
+        assert priors.size(1) == tblr.size(1)
+
+    loc_decode = tblr * normalizer
+    prior_centers = (priors[..., 0:2] + priors[..., 2:4]) / 2
+    if normalize_by_wh:
+        wh = priors[..., 2:4] - priors[..., 0:2]
+        w, h = torch.split(wh, 1, dim=-1)
+        # Inplace operation with slice would failed for exporting to ONNX
+        th = h * loc_decode[..., :2]  # tb
+        tw = w * loc_decode[..., 2:]  # lr
+        loc_decode = torch.cat([th, tw], dim=-1)
+    # Cannot be exported using onnx when loc_decode.split(1, dim=-1)
+    top, bottom, left, right = loc_decode.split((1, 1, 1, 1), dim=-1)
+    xmin = prior_centers[..., 0].unsqueeze(-1) - left
+    xmax = prior_centers[..., 0].unsqueeze(-1) + right
+    ymin = prior_centers[..., 1].unsqueeze(-1) - top
+    ymax = prior_centers[..., 1].unsqueeze(-1) + bottom
+
+    bboxes = torch.cat((xmin, ymin, xmax, ymax), dim=-1)
+
+    if clip_border and max_shape is not None:
+        # clip bboxes with dynamic `min` and `max` for onnx
+        if torch.onnx.is_in_onnx_export():
+            from mmdet.core.export import dynamic_clip_for_onnx
+            xmin, ymin, xmax, ymax = dynamic_clip_for_onnx(
+                xmin, ymin, xmax, ymax, max_shape)
+            bboxes = torch.cat([xmin, ymin, xmax, ymax], dim=-1)
+            return bboxes
+        if not isinstance(max_shape, torch.Tensor):
+            max_shape = priors.new_tensor(max_shape)
+        max_shape = max_shape[..., :2].type_as(priors)
+        if max_shape.ndim == 2:
+            assert bboxes.ndim == 3
+            assert max_shape.size(0) == bboxes.size(0)
+
+        min_xy = priors.new_tensor(0)
+        max_xy = torch.cat([max_shape, max_shape],
+                           dim=-1).flip(-1).unsqueeze(-2)
+        bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+        bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+    return bboxes
diff --git a/mmde/mmdet/models/task_modules/coders/yolo_bbox_coder.py b/mmde/mmdet/models/task_modules/coders/yolo_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e1c766789bec844ff359e225435bc3b2f5dd736
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/coders/yolo_bbox_coder.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class YOLOBBoxCoder(BaseBBoxCoder):
+    """YOLO BBox coder.
+
+    Following `YOLO <https://arxiv.org/abs/1506.02640>`_, this coder divide
+    image into grids, and encode bbox (x1, y1, x2, y2) into (cx, cy, dw, dh).
+    cx, cy in [0., 1.], denotes relative center position w.r.t the center of
+    bboxes. dw, dh are the same as :obj:`DeltaXYWHBBoxCoder`.
+
+    Args:
+        eps (float): Min value of cx, cy when encoding.
+    """
+
+    def __init__(self, eps: float = 1e-6, **kwargs):
+        super().__init__(**kwargs)
+        self.eps = eps
+
+    def encode(self, bboxes: Union[Tensor, BaseBoxes],
+               gt_bboxes: Union[Tensor, BaseBoxes],
+               stride: Union[Tensor, int]) -> Tensor:
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes``.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes,
+                e.g., anchors.
+            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the
+                transformation, e.g., ground-truth boxes.
+            stride (torch.Tensor | int): Stride of bboxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas
+        """
+        bboxes = get_box_tensor(bboxes)
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        x_center_gt = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) * 0.5
+        y_center_gt = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) * 0.5
+        w_gt = gt_bboxes[..., 2] - gt_bboxes[..., 0]
+        h_gt = gt_bboxes[..., 3] - gt_bboxes[..., 1]
+        x_center = (bboxes[..., 0] + bboxes[..., 2]) * 0.5
+        y_center = (bboxes[..., 1] + bboxes[..., 3]) * 0.5
+        w = bboxes[..., 2] - bboxes[..., 0]
+        h = bboxes[..., 3] - bboxes[..., 1]
+        w_target = torch.log((w_gt / w).clamp(min=self.eps))
+        h_target = torch.log((h_gt / h).clamp(min=self.eps))
+        x_center_target = ((x_center_gt - x_center) / stride + 0.5).clamp(
+            self.eps, 1 - self.eps)
+        y_center_target = ((y_center_gt - y_center) / stride + 0.5).clamp(
+            self.eps, 1 - self.eps)
+        encoded_bboxes = torch.stack(
+            [x_center_target, y_center_target, w_target, h_target], dim=-1)
+        return encoded_bboxes
+
+    def decode(self, bboxes: Union[Tensor, BaseBoxes], pred_bboxes: Tensor,
+               stride: Union[Tensor, int]) -> Union[Tensor, BaseBoxes]:
+        """Apply transformation `pred_bboxes` to `boxes`.
+
+        Args:
+            boxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes,
+                e.g. anchors.
+            pred_bboxes (torch.Tensor): Encoded boxes with shape
+            stride (torch.Tensor | int): Strides of bboxes.
+
+        Returns:
+            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
+        """
+        bboxes = get_box_tensor(bboxes)
+        assert pred_bboxes.size(-1) == bboxes.size(-1) == 4
+        xy_centers = (bboxes[..., :2] + bboxes[..., 2:]) * 0.5 + (
+            pred_bboxes[..., :2] - 0.5) * stride
+        whs = (bboxes[..., 2:] -
+               bboxes[..., :2]) * 0.5 * pred_bboxes[..., 2:].exp()
+        decoded_bboxes = torch.stack(
+            (xy_centers[..., 0] - whs[..., 0], xy_centers[..., 1] -
+             whs[..., 1], xy_centers[..., 0] + whs[..., 0],
+             xy_centers[..., 1] + whs[..., 1]),
+            dim=-1)
+
+        if self.use_box_type:
+            decoded_bboxes = HorizontalBoxes(decoded_bboxes)
+        return decoded_bboxes
diff --git a/mmde/mmdet/models/task_modules/prior_generators/__init__.py b/mmde/mmdet/models/task_modules/prior_generators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7795e98ca77bb5ffc77ff1da848130717d8f85a6
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/prior_generators/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .anchor_generator import (AnchorGenerator, LegacyAnchorGenerator,
+                               SSDAnchorGenerator, YOLOAnchorGenerator)
+from .point_generator import MlvlPointGenerator, PointGenerator
+from .utils import anchor_inside_flags, calc_region
+
+__all__ = [
+    'AnchorGenerator', 'LegacyAnchorGenerator', 'anchor_inside_flags',
+    'PointGenerator', 'calc_region', 'YOLOAnchorGenerator',
+    'MlvlPointGenerator', 'SSDAnchorGenerator'
+]
diff --git a/mmde/mmdet/models/task_modules/prior_generators/anchor_generator.py b/mmde/mmdet/models/task_modules/prior_generators/anchor_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..2757697ce2283ec8b46ba89325e63fad0be4a7e8
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/prior_generators/anchor_generator.py
@@ -0,0 +1,848 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from mmengine.utils import is_tuple_of
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import HorizontalBoxes
+
+DeviceType = Union[str, torch.device]
+
+
+@TASK_UTILS.register_module()
+class AnchorGenerator:
+    """Standard anchor generator for 2D anchor-based detectors.
+
+    Args:
+        strides (list[int] | list[tuple[int, int]]): Strides of anchors
+            in multiple feature levels in order (w, h).
+        ratios (list[float]): The list of ratios between the height and width
+            of anchors in a single level.
+        scales (list[int], Optional): Anchor scales for anchors
+            in a single level. It cannot be set at the same time
+            if `octave_base_scale` and `scales_per_octave` are set.
+        base_sizes (list[int], Optional): The basic sizes
+            of anchors in multiple levels.
+            If None is given, strides will be used as base_sizes.
+            (If strides are non square, the shortest stride is taken.)
+        scale_major (bool): Whether to multiply scales first when generating
+            base anchors. If true, the anchors in the same row will have the
+            same scales. By default it is True in V2.0
+        octave_base_scale (int, Optional): The base scale of octave.
+        scales_per_octave (int, Optional): Number of scales for each octave.
+            `octave_base_scale` and `scales_per_octave` are usually used in
+            retinanet and the `scales` should be None when they are set.
+        centers (list[tuple[float]], Optional): The centers of the anchor
+            relative to the feature grid center in multiple feature levels.
+            By default it is set to be None and not used. If a list of tuple of
+            float is given, they will be used to shift the centers of anchors.
+        center_offset (float): The offset of center in proportion to anchors'
+            width and height. By default it is 0 in V2.0.
+        use_box_type (bool): Whether to warp anchors with the box type data
+            structure. Defaults to False.
+
+    Examples:
+        >>> from mmdet.models.task_modules.
+        ... prior_generators import AnchorGenerator
+        >>> self = AnchorGenerator([16], [1.], [1.], [9])
+        >>> all_anchors = self.grid_priors([(2, 2)], device='cpu')
+        >>> print(all_anchors)
+        [tensor([[-4.5000, -4.5000,  4.5000,  4.5000],
+                [11.5000, -4.5000, 20.5000,  4.5000],
+                [-4.5000, 11.5000,  4.5000, 20.5000],
+                [11.5000, 11.5000, 20.5000, 20.5000]])]
+        >>> self = AnchorGenerator([16, 32], [1.], [1.], [9, 18])
+        >>> all_anchors = self.grid_priors([(2, 2), (1, 1)], device='cpu')
+        >>> print(all_anchors)
+        [tensor([[-4.5000, -4.5000,  4.5000,  4.5000],
+                [11.5000, -4.5000, 20.5000,  4.5000],
+                [-4.5000, 11.5000,  4.5000, 20.5000],
+                [11.5000, 11.5000, 20.5000, 20.5000]]), \
+        tensor([[-9., -9., 9., 9.]])]
+    """
+
+    def __init__(self,
+                 strides: Union[List[int], List[Tuple[int, int]]],
+                 ratios: List[float],
+                 scales: Optional[List[int]] = None,
+                 base_sizes: Optional[List[int]] = None,
+                 scale_major: bool = True,
+                 octave_base_scale: Optional[int] = None,
+                 scales_per_octave: Optional[int] = None,
+                 centers: Optional[List[Tuple[float, float]]] = None,
+                 center_offset: float = 0.,
+                 use_box_type: bool = False) -> None:
+        # check center and center_offset
+        if center_offset != 0:
+            assert centers is None, 'center cannot be set when center_offset' \
+                                    f'!=0, {centers} is given.'
+        if not (0 <= center_offset <= 1):
+            raise ValueError('center_offset should be in range [0, 1], '
+                             f'{center_offset} is given.')
+        if centers is not None:
+            assert len(centers) == len(strides), \
+                'The number of strides should be the same as centers, got ' \
+                f'{strides} and {centers}'
+
+        # calculate base sizes of anchors
+        self.strides = [_pair(stride) for stride in strides]
+        self.base_sizes = [min(stride) for stride in self.strides
+                           ] if base_sizes is None else base_sizes
+        assert len(self.base_sizes) == len(self.strides), \
+            'The number of strides should be the same as base sizes, got ' \
+            f'{self.strides} and {self.base_sizes}'
+
+        # calculate scales of anchors
+        assert ((octave_base_scale is not None
+                 and scales_per_octave is not None) ^ (scales is not None)), \
+            'scales and octave_base_scale with scales_per_octave cannot' \
+            ' be set at the same time'
+        if scales is not None:
+            self.scales = torch.Tensor(scales)
+        elif octave_base_scale is not None and scales_per_octave is not None:
+            octave_scales = np.array(
+                [2**(i / scales_per_octave) for i in range(scales_per_octave)])
+            scales = octave_scales * octave_base_scale
+            self.scales = torch.Tensor(scales)
+        else:
+            raise ValueError('Either scales or octave_base_scale with '
+                             'scales_per_octave should be set')
+
+        self.octave_base_scale = octave_base_scale
+        self.scales_per_octave = scales_per_octave
+        self.ratios = torch.Tensor(ratios)
+        self.scale_major = scale_major
+        self.centers = centers
+        self.center_offset = center_offset
+        self.base_anchors = self.gen_base_anchors()
+        self.use_box_type = use_box_type
+
+    @property
+    def num_base_anchors(self) -> List[int]:
+        """list[int]: total number of base anchors in a feature grid"""
+        return self.num_base_priors
+
+    @property
+    def num_base_priors(self) -> List[int]:
+        """list[int]: The number of priors (anchors) at a point
+        on the feature grid"""
+        return [base_anchors.size(0) for base_anchors in self.base_anchors]
+
+    @property
+    def num_levels(self) -> int:
+        """int: number of feature levels that the generator will be applied"""
+        return len(self.strides)
+
+    def gen_base_anchors(self) -> List[Tensor]:
+        """Generate base anchors.
+
+        Returns:
+            list(torch.Tensor): Base anchors of a feature grid in multiple \
+                feature levels.
+        """
+        multi_level_base_anchors = []
+        for i, base_size in enumerate(self.base_sizes):
+            center = None
+            if self.centers is not None:
+                center = self.centers[i]
+            multi_level_base_anchors.append(
+                self.gen_single_level_base_anchors(
+                    base_size,
+                    scales=self.scales,
+                    ratios=self.ratios,
+                    center=center))
+        return multi_level_base_anchors
+
+    def gen_single_level_base_anchors(self,
+                                      base_size: Union[int, float],
+                                      scales: Tensor,
+                                      ratios: Tensor,
+                                      center: Optional[Tuple[float]] = None) \
+            -> Tensor:
+        """Generate base anchors of a single level.
+
+        Args:
+            base_size (int | float): Basic size of an anchor.
+            scales (torch.Tensor): Scales of the anchor.
+            ratios (torch.Tensor): The ratio between the height
+                and width of anchors in a single level.
+            center (tuple[float], optional): The center of the base anchor
+                related to a single feature grid. Defaults to None.
+
+        Returns:
+            torch.Tensor: Anchors in a single-level feature maps.
+        """
+        w = base_size
+        h = base_size
+        if center is None:
+            x_center = self.center_offset * w
+            y_center = self.center_offset * h
+        else:
+            x_center, y_center = center
+
+        h_ratios = torch.sqrt(ratios)
+        w_ratios = 1 / h_ratios
+        if self.scale_major:
+            ws = (w * w_ratios[:, None] * scales[None, :]).view(-1)
+            hs = (h * h_ratios[:, None] * scales[None, :]).view(-1)
+        else:
+            ws = (w * scales[:, None] * w_ratios[None, :]).view(-1)
+            hs = (h * scales[:, None] * h_ratios[None, :]).view(-1)
+
+        # use float anchor and the anchor's center is aligned with the
+        # pixel center
+        base_anchors = [
+            x_center - 0.5 * ws, y_center - 0.5 * hs, x_center + 0.5 * ws,
+            y_center + 0.5 * hs
+        ]
+        base_anchors = torch.stack(base_anchors, dim=-1)
+
+        return base_anchors
+
+    def _meshgrid(self,
+                  x: Tensor,
+                  y: Tensor,
+                  row_major: bool = True) -> Tuple[Tensor]:
+        """Generate mesh grid of x and y.
+
+        Args:
+            x (torch.Tensor): Grids of x dimension.
+            y (torch.Tensor): Grids of y dimension.
+            row_major (bool): Whether to return y grids first.
+                Defaults to True.
+
+        Returns:
+            tuple[torch.Tensor]: The mesh grids of x and y.
+        """
+        # use shape instead of len to keep tracing while exporting to onnx
+        xx = x.repeat(y.shape[0])
+        yy = y.view(-1, 1).repeat(1, x.shape[0]).view(-1)
+        if row_major:
+            return xx, yy
+        else:
+            return yy, xx
+
+    def grid_priors(self,
+                    featmap_sizes: List[Tuple],
+                    dtype: torch.dtype = torch.float32,
+                    device: DeviceType = 'cuda') -> List[Tensor]:
+        """Generate grid anchors in multiple feature levels.
+
+        Args:
+            featmap_sizes (list[tuple]): List of feature map sizes in
+                multiple feature levels.
+            dtype (:obj:`torch.dtype`): Dtype of priors.
+                Defaults to torch.float32.
+            device (str | torch.device): The device where the anchors
+                will be put on.
+
+        Return:
+            list[torch.Tensor]: Anchors in multiple feature levels. \
+                The sizes of each tensor should be [N, 4], where \
+                N = width * height * num_base_anchors, width and height \
+                are the sizes of the corresponding feature level, \
+                num_base_anchors is the number of anchors for that level.
+        """
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_anchors = []
+        for i in range(self.num_levels):
+            anchors = self.single_level_grid_priors(
+                featmap_sizes[i], level_idx=i, dtype=dtype, device=device)
+            multi_level_anchors.append(anchors)
+        return multi_level_anchors
+
+    def single_level_grid_priors(self,
+                                 featmap_size: Tuple[int, int],
+                                 level_idx: int,
+                                 dtype: torch.dtype = torch.float32,
+                                 device: DeviceType = 'cuda') -> Tensor:
+        """Generate grid anchors of a single level.
+
+        Note:
+            This function is usually called by method ``self.grid_priors``.
+
+        Args:
+            featmap_size (tuple[int, int]): Size of the feature maps.
+            level_idx (int): The index of corresponding feature map level.
+            dtype (obj:`torch.dtype`): Date type of points.Defaults to
+                ``torch.float32``.
+            device (str | torch.device): The device the tensor will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: Anchors in the overall feature maps.
+        """
+
+        base_anchors = self.base_anchors[level_idx].to(device).to(dtype)
+        feat_h, feat_w = featmap_size
+        stride_w, stride_h = self.strides[level_idx]
+        # First create Range with the default dtype, than convert to
+        # target `dtype` for onnx exporting.
+        shift_x = torch.arange(0, feat_w, device=device).to(dtype) * stride_w
+        shift_y = torch.arange(0, feat_h, device=device).to(dtype) * stride_h
+
+        shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+        shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1)
+        # first feat_w elements correspond to the first row of shifts
+        # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get
+        # shifted anchors (K, A, 4), reshape to (K*A, 4)
+
+        all_anchors = base_anchors[None, :, :] + shifts[:, None, :]
+        all_anchors = all_anchors.view(-1, 4)
+        # first A rows correspond to A anchors of (0, 0) in feature map,
+        # then (0, 1), (0, 2), ...
+        if self.use_box_type:
+            all_anchors = HorizontalBoxes(all_anchors)
+        return all_anchors
+
+    def sparse_priors(self,
+                      prior_idxs: Tensor,
+                      featmap_size: Tuple[int, int],
+                      level_idx: int,
+                      dtype: torch.dtype = torch.float32,
+                      device: DeviceType = 'cuda') -> Tensor:
+        """Generate sparse anchors according to the ``prior_idxs``.
+
+        Args:
+            prior_idxs (Tensor): The index of corresponding anchors
+                in the feature map.
+            featmap_size (tuple[int, int]): feature map size arrange as (h, w).
+            level_idx (int): The level index of corresponding feature
+                map.
+            dtype (obj:`torch.dtype`): Date type of points.Defaults to
+                ``torch.float32``.
+            device (str | torch.device): The device where the points is
+                located.
+        Returns:
+            Tensor: Anchor with shape (N, 4), N should be equal to
+                the length of ``prior_idxs``.
+        """
+
+        height, width = featmap_size
+        num_base_anchors = self.num_base_anchors[level_idx]
+        base_anchor_id = prior_idxs % num_base_anchors
+        x = (prior_idxs //
+             num_base_anchors) % width * self.strides[level_idx][0]
+        y = (prior_idxs // width //
+             num_base_anchors) % height * self.strides[level_idx][1]
+        priors = torch.stack([x, y, x, y], 1).to(dtype).to(device) + \
+            self.base_anchors[level_idx][base_anchor_id, :].to(device)
+
+        return priors
+
+    def grid_anchors(self,
+                     featmap_sizes: List[Tuple],
+                     device: DeviceType = 'cuda') -> List[Tensor]:
+        """Generate grid anchors in multiple feature levels.
+
+        Args:
+            featmap_sizes (list[tuple]): List of feature map sizes in
+                multiple feature levels.
+            device (str | torch.device): Device where the anchors will be
+                put on.
+
+        Return:
+            list[torch.Tensor]: Anchors in multiple feature levels. \
+                The sizes of each tensor should be [N, 4], where \
+                N = width * height * num_base_anchors, width and height \
+                are the sizes of the corresponding feature level, \
+                num_base_anchors is the number of anchors for that level.
+        """
+        warnings.warn('``grid_anchors`` would be deprecated soon. '
+                      'Please use ``grid_priors`` ')
+
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_anchors = []
+        for i in range(self.num_levels):
+            anchors = self.single_level_grid_anchors(
+                self.base_anchors[i].to(device),
+                featmap_sizes[i],
+                self.strides[i],
+                device=device)
+            multi_level_anchors.append(anchors)
+        return multi_level_anchors
+
+    def single_level_grid_anchors(self,
+                                  base_anchors: Tensor,
+                                  featmap_size: Tuple[int, int],
+                                  stride: Tuple[int, int] = (16, 16),
+                                  device: DeviceType = 'cuda') -> Tensor:
+        """Generate grid anchors of a single level.
+
+        Note:
+            This function is usually called by method ``self.grid_anchors``.
+
+        Args:
+            base_anchors (torch.Tensor): The base anchors of a feature grid.
+            featmap_size (tuple[int]): Size of the feature maps.
+            stride (tuple[int, int]): Stride of the feature map in order
+                (w, h). Defaults to (16, 16).
+            device (str | torch.device): Device the tensor will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: Anchors in the overall feature maps.
+        """
+
+        warnings.warn(
+            '``single_level_grid_anchors`` would be deprecated soon. '
+            'Please use ``single_level_grid_priors`` ')
+
+        # keep featmap_size as Tensor instead of int, so that we
+        # can convert to ONNX correctly
+        feat_h, feat_w = featmap_size
+        shift_x = torch.arange(0, feat_w, device=device) * stride[0]
+        shift_y = torch.arange(0, feat_h, device=device) * stride[1]
+
+        shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+        shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1)
+        shifts = shifts.type_as(base_anchors)
+        # first feat_w elements correspond to the first row of shifts
+        # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get
+        # shifted anchors (K, A, 4), reshape to (K*A, 4)
+
+        all_anchors = base_anchors[None, :, :] + shifts[:, None, :]
+        all_anchors = all_anchors.view(-1, 4)
+        # first A rows correspond to A anchors of (0, 0) in feature map,
+        # then (0, 1), (0, 2), ...
+        return all_anchors
+
+    def valid_flags(self,
+                    featmap_sizes: List[Tuple[int, int]],
+                    pad_shape: Tuple,
+                    device: DeviceType = 'cuda') -> List[Tensor]:
+        """Generate valid flags of anchors in multiple feature levels.
+
+        Args:
+            featmap_sizes (list(tuple[int, int])): List of feature map sizes in
+                multiple feature levels.
+            pad_shape (tuple): The padded shape of the image.
+            device (str | torch.device): Device where the anchors will be
+                put on.
+
+        Return:
+            list(torch.Tensor): Valid flags of anchors in multiple levels.
+        """
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_flags = []
+        for i in range(self.num_levels):
+            anchor_stride = self.strides[i]
+            feat_h, feat_w = featmap_sizes[i]
+            h, w = pad_shape[:2]
+            valid_feat_h = min(int(np.ceil(h / anchor_stride[1])), feat_h)
+            valid_feat_w = min(int(np.ceil(w / anchor_stride[0])), feat_w)
+            flags = self.single_level_valid_flags((feat_h, feat_w),
+                                                  (valid_feat_h, valid_feat_w),
+                                                  self.num_base_anchors[i],
+                                                  device=device)
+            multi_level_flags.append(flags)
+        return multi_level_flags
+
+    def single_level_valid_flags(self,
+                                 featmap_size: Tuple[int, int],
+                                 valid_size: Tuple[int, int],
+                                 num_base_anchors: int,
+                                 device: DeviceType = 'cuda') -> Tensor:
+        """Generate the valid flags of anchor in a single feature map.
+
+        Args:
+            featmap_size (tuple[int]): The size of feature maps, arrange
+                as (h, w).
+            valid_size (tuple[int]): The valid size of the feature maps.
+            num_base_anchors (int): The number of base anchors.
+            device (str | torch.device): Device where the flags will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: The valid flags of each anchor in a single level \
+                feature map.
+        """
+        feat_h, feat_w = featmap_size
+        valid_h, valid_w = valid_size
+        assert valid_h <= feat_h and valid_w <= feat_w
+        valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device)
+        valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device)
+        valid_x[:valid_w] = 1
+        valid_y[:valid_h] = 1
+        valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+        valid = valid_xx & valid_yy
+        valid = valid[:, None].expand(valid.size(0),
+                                      num_base_anchors).contiguous().view(-1)
+        return valid
+
+    def __repr__(self) -> str:
+        """str: a string that describes the module"""
+        indent_str = '    '
+        repr_str = self.__class__.__name__ + '(\n'
+        repr_str += f'{indent_str}strides={self.strides},\n'
+        repr_str += f'{indent_str}ratios={self.ratios},\n'
+        repr_str += f'{indent_str}scales={self.scales},\n'
+        repr_str += f'{indent_str}base_sizes={self.base_sizes},\n'
+        repr_str += f'{indent_str}scale_major={self.scale_major},\n'
+        repr_str += f'{indent_str}octave_base_scale='
+        repr_str += f'{self.octave_base_scale},\n'
+        repr_str += f'{indent_str}scales_per_octave='
+        repr_str += f'{self.scales_per_octave},\n'
+        repr_str += f'{indent_str}num_levels={self.num_levels}\n'
+        repr_str += f'{indent_str}centers={self.centers},\n'
+        repr_str += f'{indent_str}center_offset={self.center_offset})'
+        return repr_str
+
+
+@TASK_UTILS.register_module()
+class SSDAnchorGenerator(AnchorGenerator):
+    """Anchor generator for SSD.
+
+    Args:
+        strides (list[int]  | list[tuple[int, int]]): Strides of anchors
+            in multiple feature levels.
+        ratios (list[float]): The list of ratios between the height and width
+            of anchors in a single level.
+        min_sizes (list[float]): The list of minimum anchor sizes on each
+            level.
+        max_sizes (list[float]): The list of maximum anchor sizes on each
+            level.
+        basesize_ratio_range (tuple(float)): Ratio range of anchors. Being
+            used when not setting min_sizes and max_sizes.
+        input_size (int): Size of feature map, 300 for SSD300, 512 for
+            SSD512. Being used when not setting min_sizes and max_sizes.
+        scale_major (bool): Whether to multiply scales first when generating
+            base anchors. If true, the anchors in the same row will have the
+            same scales. It is always set to be False in SSD.
+        use_box_type (bool): Whether to warp anchors with the box type data
+            structure. Defaults to False.
+    """
+
+    def __init__(self,
+                 strides: Union[List[int], List[Tuple[int, int]]],
+                 ratios: List[float],
+                 min_sizes: Optional[List[float]] = None,
+                 max_sizes: Optional[List[float]] = None,
+                 basesize_ratio_range: Tuple[float] = (0.15, 0.9),
+                 input_size: int = 300,
+                 scale_major: bool = True,
+                 use_box_type: bool = False) -> None:
+        assert len(strides) == len(ratios)
+        assert not (min_sizes is None) ^ (max_sizes is None)
+        self.strides = [_pair(stride) for stride in strides]
+        self.centers = [(stride[0] / 2., stride[1] / 2.)
+                        for stride in self.strides]
+
+        if min_sizes is None and max_sizes is None:
+            # use hard code to generate SSD anchors
+            self.input_size = input_size
+            assert is_tuple_of(basesize_ratio_range, float)
+            self.basesize_ratio_range = basesize_ratio_range
+            # calculate anchor ratios and sizes
+            min_ratio, max_ratio = basesize_ratio_range
+            min_ratio = int(min_ratio * 100)
+            max_ratio = int(max_ratio * 100)
+            step = int(np.floor(max_ratio - min_ratio) / (self.num_levels - 2))
+            min_sizes = []
+            max_sizes = []
+            for ratio in range(int(min_ratio), int(max_ratio) + 1, step):
+                min_sizes.append(int(self.input_size * ratio / 100))
+                max_sizes.append(int(self.input_size * (ratio + step) / 100))
+            if self.input_size == 300:
+                if basesize_ratio_range[0] == 0.15:  # SSD300 COCO
+                    min_sizes.insert(0, int(self.input_size * 7 / 100))
+                    max_sizes.insert(0, int(self.input_size * 15 / 100))
+                elif basesize_ratio_range[0] == 0.2:  # SSD300 VOC
+                    min_sizes.insert(0, int(self.input_size * 10 / 100))
+                    max_sizes.insert(0, int(self.input_size * 20 / 100))
+                else:
+                    raise ValueError(
+                        'basesize_ratio_range[0] should be either 0.15'
+                        'or 0.2 when input_size is 300, got '
+                        f'{basesize_ratio_range[0]}.')
+            elif self.input_size == 512:
+                if basesize_ratio_range[0] == 0.1:  # SSD512 COCO
+                    min_sizes.insert(0, int(self.input_size * 4 / 100))
+                    max_sizes.insert(0, int(self.input_size * 10 / 100))
+                elif basesize_ratio_range[0] == 0.15:  # SSD512 VOC
+                    min_sizes.insert(0, int(self.input_size * 7 / 100))
+                    max_sizes.insert(0, int(self.input_size * 15 / 100))
+                else:
+                    raise ValueError(
+                        'When not setting min_sizes and max_sizes,'
+                        'basesize_ratio_range[0] should be either 0.1'
+                        'or 0.15 when input_size is 512, got'
+                        f' {basesize_ratio_range[0]}.')
+            else:
+                raise ValueError(
+                    'Only support 300 or 512 in SSDAnchorGenerator when '
+                    'not setting min_sizes and max_sizes, '
+                    f'got {self.input_size}.')
+
+        assert len(min_sizes) == len(max_sizes) == len(strides)
+
+        anchor_ratios = []
+        anchor_scales = []
+        for k in range(len(self.strides)):
+            scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])]
+            anchor_ratio = [1.]
+            for r in ratios[k]:
+                anchor_ratio += [1 / r, r]  # 4 or 6 ratio
+            anchor_ratios.append(torch.Tensor(anchor_ratio))
+            anchor_scales.append(torch.Tensor(scales))
+
+        self.base_sizes = min_sizes
+        self.scales = anchor_scales
+        self.ratios = anchor_ratios
+        self.scale_major = scale_major
+        self.center_offset = 0
+        self.base_anchors = self.gen_base_anchors()
+        self.use_box_type = use_box_type
+
+    def gen_base_anchors(self) -> List[Tensor]:
+        """Generate base anchors.
+
+        Returns:
+            list(torch.Tensor): Base anchors of a feature grid in multiple \
+                feature levels.
+        """
+        multi_level_base_anchors = []
+        for i, base_size in enumerate(self.base_sizes):
+            base_anchors = self.gen_single_level_base_anchors(
+                base_size,
+                scales=self.scales[i],
+                ratios=self.ratios[i],
+                center=self.centers[i])
+            indices = list(range(len(self.ratios[i])))
+            indices.insert(1, len(indices))
+            base_anchors = torch.index_select(base_anchors, 0,
+                                              torch.LongTensor(indices))
+            multi_level_base_anchors.append(base_anchors)
+        return multi_level_base_anchors
+
+    def __repr__(self) -> str:
+        """str: a string that describes the module"""
+        indent_str = '    '
+        repr_str = self.__class__.__name__ + '(\n'
+        repr_str += f'{indent_str}strides={self.strides},\n'
+        repr_str += f'{indent_str}scales={self.scales},\n'
+        repr_str += f'{indent_str}scale_major={self.scale_major},\n'
+        repr_str += f'{indent_str}input_size={self.input_size},\n'
+        repr_str += f'{indent_str}scales={self.scales},\n'
+        repr_str += f'{indent_str}ratios={self.ratios},\n'
+        repr_str += f'{indent_str}num_levels={self.num_levels},\n'
+        repr_str += f'{indent_str}base_sizes={self.base_sizes},\n'
+        repr_str += f'{indent_str}basesize_ratio_range='
+        repr_str += f'{self.basesize_ratio_range})'
+        return repr_str
+
+
+@TASK_UTILS.register_module()
+class LegacyAnchorGenerator(AnchorGenerator):
+    """Legacy anchor generator used in MMDetection V1.x.
+
+    Note:
+        Difference to the V2.0 anchor generator:
+
+        1. The center offset of V1.x anchors are set to be 0.5 rather than 0.
+        2. The width/height are minused by 1 when calculating the anchors' \
+            centers and corners to meet the V1.x coordinate system.
+        3. The anchors' corners are quantized.
+
+    Args:
+        strides (list[int] | list[tuple[int]]): Strides of anchors
+            in multiple feature levels.
+        ratios (list[float]): The list of ratios between the height and width
+            of anchors in a single level.
+        scales (list[int] | None): Anchor scales for anchors in a single level.
+            It cannot be set at the same time if `octave_base_scale` and
+            `scales_per_octave` are set.
+        base_sizes (list[int]): The basic sizes of anchors in multiple levels.
+            If None is given, strides will be used to generate base_sizes.
+        scale_major (bool): Whether to multiply scales first when generating
+            base anchors. If true, the anchors in the same row will have the
+            same scales. By default it is True in V2.0
+        octave_base_scale (int): The base scale of octave.
+        scales_per_octave (int): Number of scales for each octave.
+            `octave_base_scale` and `scales_per_octave` are usually used in
+            retinanet and the `scales` should be None when they are set.
+        centers (list[tuple[float, float]] | None): The centers of the anchor
+            relative to the feature grid center in multiple feature levels.
+            By default it is set to be None and not used. It a list of float
+            is given, this list will be used to shift the centers of anchors.
+        center_offset (float): The offset of center in proportion to anchors'
+            width and height. By default it is 0.5 in V2.0 but it should be 0.5
+            in v1.x models.
+        use_box_type (bool): Whether to warp anchors with the box type data
+            structure. Defaults to False.
+
+    Examples:
+        >>> from mmdet.models.task_modules.
+        ... prior_generators import LegacyAnchorGenerator
+        >>> self = LegacyAnchorGenerator(
+        >>>     [16], [1.], [1.], [9], center_offset=0.5)
+        >>> all_anchors = self.grid_anchors(((2, 2),), device='cpu')
+        >>> print(all_anchors)
+        [tensor([[ 0.,  0.,  8.,  8.],
+                [16.,  0., 24.,  8.],
+                [ 0., 16.,  8., 24.],
+                [16., 16., 24., 24.]])]
+    """
+
+    def gen_single_level_base_anchors(self,
+                                      base_size: Union[int, float],
+                                      scales: Tensor,
+                                      ratios: Tensor,
+                                      center: Optional[Tuple[float]] = None) \
+            -> Tensor:
+        """Generate base anchors of a single level.
+
+        Note:
+            The width/height of anchors are minused by 1 when calculating \
+                the centers and corners to meet the V1.x coordinate system.
+
+        Args:
+            base_size (int | float): Basic size of an anchor.
+            scales (torch.Tensor): Scales of the anchor.
+            ratios (torch.Tensor): The ratio between the height.
+                and width of anchors in a single level.
+            center (tuple[float], optional): The center of the base anchor
+                related to a single feature grid. Defaults to None.
+
+        Returns:
+            torch.Tensor: Anchors in a single-level feature map.
+        """
+        w = base_size
+        h = base_size
+        if center is None:
+            x_center = self.center_offset * (w - 1)
+            y_center = self.center_offset * (h - 1)
+        else:
+            x_center, y_center = center
+
+        h_ratios = torch.sqrt(ratios)
+        w_ratios = 1 / h_ratios
+        if self.scale_major:
+            ws = (w * w_ratios[:, None] * scales[None, :]).view(-1)
+            hs = (h * h_ratios[:, None] * scales[None, :]).view(-1)
+        else:
+            ws = (w * scales[:, None] * w_ratios[None, :]).view(-1)
+            hs = (h * scales[:, None] * h_ratios[None, :]).view(-1)
+
+        # use float anchor and the anchor's center is aligned with the
+        # pixel center
+        base_anchors = [
+            x_center - 0.5 * (ws - 1), y_center - 0.5 * (hs - 1),
+            x_center + 0.5 * (ws - 1), y_center + 0.5 * (hs - 1)
+        ]
+        base_anchors = torch.stack(base_anchors, dim=-1).round()
+
+        return base_anchors
+
+
+@TASK_UTILS.register_module()
+class LegacySSDAnchorGenerator(SSDAnchorGenerator, LegacyAnchorGenerator):
+    """Legacy anchor generator used in MMDetection V1.x.
+
+    The difference between `LegacySSDAnchorGenerator` and `SSDAnchorGenerator`
+    can be found in `LegacyAnchorGenerator`.
+    """
+
+    def __init__(self,
+                 strides: Union[List[int], List[Tuple[int, int]]],
+                 ratios: List[float],
+                 basesize_ratio_range: Tuple[float],
+                 input_size: int = 300,
+                 scale_major: bool = True,
+                 use_box_type: bool = False) -> None:
+        super(LegacySSDAnchorGenerator, self).__init__(
+            strides=strides,
+            ratios=ratios,
+            basesize_ratio_range=basesize_ratio_range,
+            input_size=input_size,
+            scale_major=scale_major,
+            use_box_type=use_box_type)
+        self.centers = [((stride - 1) / 2., (stride - 1) / 2.)
+                        for stride in strides]
+        self.base_anchors = self.gen_base_anchors()
+
+
+@TASK_UTILS.register_module()
+class YOLOAnchorGenerator(AnchorGenerator):
+    """Anchor generator for YOLO.
+
+    Args:
+        strides (list[int] | list[tuple[int, int]]): Strides of anchors
+            in multiple feature levels.
+        base_sizes (list[list[tuple[int, int]]]): The basic sizes
+            of anchors in multiple levels.
+    """
+
+    def __init__(self,
+                 strides: Union[List[int], List[Tuple[int, int]]],
+                 base_sizes: List[List[Tuple[int, int]]],
+                 use_box_type: bool = False) -> None:
+        self.strides = [_pair(stride) for stride in strides]
+        self.centers = [(stride[0] / 2., stride[1] / 2.)
+                        for stride in self.strides]
+        self.base_sizes = []
+        num_anchor_per_level = len(base_sizes[0])
+        for base_sizes_per_level in base_sizes:
+            assert num_anchor_per_level == len(base_sizes_per_level)
+            self.base_sizes.append(
+                [_pair(base_size) for base_size in base_sizes_per_level])
+        self.base_anchors = self.gen_base_anchors()
+        self.use_box_type = use_box_type
+
+    @property
+    def num_levels(self) -> int:
+        """int: number of feature levels that the generator will be applied"""
+        return len(self.base_sizes)
+
+    def gen_base_anchors(self) -> List[Tensor]:
+        """Generate base anchors.
+
+        Returns:
+            list(torch.Tensor): Base anchors of a feature grid in multiple \
+                feature levels.
+        """
+        multi_level_base_anchors = []
+        for i, base_sizes_per_level in enumerate(self.base_sizes):
+            center = None
+            if self.centers is not None:
+                center = self.centers[i]
+            multi_level_base_anchors.append(
+                self.gen_single_level_base_anchors(base_sizes_per_level,
+                                                   center))
+        return multi_level_base_anchors
+
+    def gen_single_level_base_anchors(self,
+                                      base_sizes_per_level: List[Tuple[int]],
+                                      center: Optional[Tuple[float]] = None) \
+            -> Tensor:
+        """Generate base anchors of a single level.
+
+        Args:
+            base_sizes_per_level (list[tuple[int]]): Basic sizes of
+                anchors.
+            center (tuple[float], optional): The center of the base anchor
+                related to a single feature grid. Defaults to None.
+
+        Returns:
+            torch.Tensor: Anchors in a single-level feature maps.
+        """
+        x_center, y_center = center
+        base_anchors = []
+        for base_size in base_sizes_per_level:
+            w, h = base_size
+
+            # use float anchor and the anchor's center is aligned with the
+            # pixel center
+            base_anchor = torch.Tensor([
+                x_center - 0.5 * w, y_center - 0.5 * h, x_center + 0.5 * w,
+                y_center + 0.5 * h
+            ])
+            base_anchors.append(base_anchor)
+        base_anchors = torch.stack(base_anchors, dim=0)
+
+        return base_anchors
diff --git a/mmde/mmdet/models/task_modules/prior_generators/point_generator.py b/mmde/mmdet/models/task_modules/prior_generators/point_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..c87ad656c61cb251bfdfcbd23b1cc5263c68bf5f
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/prior_generators/point_generator.py
@@ -0,0 +1,321 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.registry import TASK_UTILS
+
+DeviceType = Union[str, torch.device]
+
+
+@TASK_UTILS.register_module()
+class PointGenerator:
+
+    def _meshgrid(self,
+                  x: Tensor,
+                  y: Tensor,
+                  row_major: bool = True) -> Tuple[Tensor, Tensor]:
+        """Generate mesh grid of x and y.
+
+        Args:
+            x (torch.Tensor): Grids of x dimension.
+            y (torch.Tensor): Grids of y dimension.
+            row_major (bool): Whether to return y grids first.
+                Defaults to True.
+
+        Returns:
+            tuple[torch.Tensor]: The mesh grids of x and y.
+        """
+        xx = x.repeat(len(y))
+        yy = y.view(-1, 1).repeat(1, len(x)).view(-1)
+        if row_major:
+            return xx, yy
+        else:
+            return yy, xx
+
+    def grid_points(self,
+                    featmap_size: Tuple[int, int],
+                    stride=16,
+                    device: DeviceType = 'cuda') -> Tensor:
+        """Generate grid points of a single level.
+
+        Args:
+            featmap_size (tuple[int, int]): Size of the feature maps.
+            stride (int): The stride of corresponding feature map.
+            device (str | torch.device): The device the tensor will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: grid point in a feature map.
+        """
+        feat_h, feat_w = featmap_size
+        shift_x = torch.arange(0., feat_w, device=device) * stride
+        shift_y = torch.arange(0., feat_h, device=device) * stride
+        shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+        stride = shift_x.new_full((shift_xx.shape[0], ), stride)
+        shifts = torch.stack([shift_xx, shift_yy, stride], dim=-1)
+        all_points = shifts.to(device)
+        return all_points
+
+    def valid_flags(self,
+                    featmap_size: Tuple[int, int],
+                    valid_size: Tuple[int, int],
+                    device: DeviceType = 'cuda') -> Tensor:
+        """Generate valid flags of anchors in a feature map.
+
+        Args:
+            featmap_sizes (list(tuple[int, int])): List of feature map sizes in
+                multiple feature levels.
+            valid_shape (tuple[int, int]): The valid shape of the image.
+            device (str | torch.device): Device where the anchors will be
+                put on.
+
+        Return:
+            torch.Tensor: Valid flags of anchors in a level.
+        """
+        feat_h, feat_w = featmap_size
+        valid_h, valid_w = valid_size
+        assert valid_h <= feat_h and valid_w <= feat_w
+        valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device)
+        valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device)
+        valid_x[:valid_w] = 1
+        valid_y[:valid_h] = 1
+        valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+        valid = valid_xx & valid_yy
+        return valid
+
+
+@TASK_UTILS.register_module()
+class MlvlPointGenerator:
+    """Standard points generator for multi-level (Mlvl) feature maps in 2D
+    points-based detectors.
+
+    Args:
+        strides (list[int] | list[tuple[int, int]]): Strides of anchors
+            in multiple feature levels in order (w, h).
+        offset (float): The offset of points, the value is normalized with
+            corresponding stride. Defaults to 0.5.
+    """
+
+    def __init__(self,
+                 strides: Union[List[int], List[Tuple[int, int]]],
+                 offset: float = 0.5) -> None:
+        self.strides = [_pair(stride) for stride in strides]
+        self.offset = offset
+
+    @property
+    def num_levels(self) -> int:
+        """int: number of feature levels that the generator will be applied"""
+        return len(self.strides)
+
+    @property
+    def num_base_priors(self) -> List[int]:
+        """list[int]: The number of priors (points) at a point
+        on the feature grid"""
+        return [1 for _ in range(len(self.strides))]
+
+    def _meshgrid(self,
+                  x: Tensor,
+                  y: Tensor,
+                  row_major: bool = True) -> Tuple[Tensor, Tensor]:
+        yy, xx = torch.meshgrid(y, x)
+        if row_major:
+            # warning .flatten() would cause error in ONNX exporting
+            # have to use reshape here
+            return xx.reshape(-1), yy.reshape(-1)
+
+        else:
+            return yy.reshape(-1), xx.reshape(-1)
+
+    def grid_priors(self,
+                    featmap_sizes: List[Tuple],
+                    dtype: torch.dtype = torch.float32,
+                    device: DeviceType = 'cuda',
+                    with_stride: bool = False) -> List[Tensor]:
+        """Generate grid points of multiple feature levels.
+
+        Args:
+            featmap_sizes (list[tuple]): List of feature map sizes in
+                multiple feature levels, each size arrange as
+                as (h, w).
+            dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32.
+            device (str | torch.device): The device where the anchors will be
+                put on.
+            with_stride (bool): Whether to concatenate the stride to
+                the last dimension of points.
+
+        Return:
+            list[torch.Tensor]: Points of  multiple feature levels.
+            The sizes of each tensor should be (N, 2) when with stride is
+            ``False``, where N = width * height, width and height
+            are the sizes of the corresponding feature level,
+            and the last dimension 2 represent (coord_x, coord_y),
+            otherwise the shape should be (N, 4),
+            and the last dimension 4 represent
+            (coord_x, coord_y, stride_w, stride_h).
+        """
+
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_priors = []
+        for i in range(self.num_levels):
+            priors = self.single_level_grid_priors(
+                featmap_sizes[i],
+                level_idx=i,
+                dtype=dtype,
+                device=device,
+                with_stride=with_stride)
+            multi_level_priors.append(priors)
+        return multi_level_priors
+
+    def single_level_grid_priors(self,
+                                 featmap_size: Tuple[int],
+                                 level_idx: int,
+                                 dtype: torch.dtype = torch.float32,
+                                 device: DeviceType = 'cuda',
+                                 with_stride: bool = False) -> Tensor:
+        """Generate grid Points of a single level.
+
+        Note:
+            This function is usually called by method ``self.grid_priors``.
+
+        Args:
+            featmap_size (tuple[int]): Size of the feature maps, arrange as
+                (h, w).
+            level_idx (int): The index of corresponding feature map level.
+            dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32.
+            device (str | torch.device): The device the tensor will be put on.
+                Defaults to 'cuda'.
+            with_stride (bool): Concatenate the stride to the last dimension
+                of points.
+
+        Return:
+            Tensor: Points of single feature levels.
+            The shape of tensor should be (N, 2) when with stride is
+            ``False``, where N = width * height, width and height
+            are the sizes of the corresponding feature level,
+            and the last dimension 2 represent (coord_x, coord_y),
+            otherwise the shape should be (N, 4),
+            and the last dimension 4 represent
+            (coord_x, coord_y, stride_w, stride_h).
+        """
+        feat_h, feat_w = featmap_size
+        stride_w, stride_h = self.strides[level_idx]
+        shift_x = (torch.arange(0, feat_w, device=device) +
+                   self.offset) * stride_w
+        # keep featmap_size as Tensor instead of int, so that we
+        # can convert to ONNX correctly
+        shift_x = shift_x.to(dtype)
+
+        shift_y = (torch.arange(0, feat_h, device=device) +
+                   self.offset) * stride_h
+        # keep featmap_size as Tensor instead of int, so that we
+        # can convert to ONNX correctly
+        shift_y = shift_y.to(dtype)
+        shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+        if not with_stride:
+            shifts = torch.stack([shift_xx, shift_yy], dim=-1)
+        else:
+            # use `shape[0]` instead of `len(shift_xx)` for ONNX export
+            stride_w = shift_xx.new_full((shift_xx.shape[0], ),
+                                         stride_w).to(dtype)
+            stride_h = shift_xx.new_full((shift_yy.shape[0], ),
+                                         stride_h).to(dtype)
+            shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h],
+                                 dim=-1)
+        all_points = shifts.to(device)
+        return all_points
+
+    def valid_flags(self,
+                    featmap_sizes: List[Tuple[int, int]],
+                    pad_shape: Tuple[int],
+                    device: DeviceType = 'cuda') -> List[Tensor]:
+        """Generate valid flags of points of multiple feature levels.
+
+        Args:
+            featmap_sizes (list(tuple)): List of feature map sizes in
+                multiple feature levels, each size arrange as
+                as (h, w).
+            pad_shape (tuple(int)): The padded shape of the image,
+                arrange as (h, w).
+            device (str | torch.device): The device where the anchors will be
+                put on.
+
+        Return:
+            list(torch.Tensor): Valid flags of points of multiple levels.
+        """
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_flags = []
+        for i in range(self.num_levels):
+            point_stride = self.strides[i]
+            feat_h, feat_w = featmap_sizes[i]
+            h, w = pad_shape[:2]
+            valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h)
+            valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w)
+            flags = self.single_level_valid_flags((feat_h, feat_w),
+                                                  (valid_feat_h, valid_feat_w),
+                                                  device=device)
+            multi_level_flags.append(flags)
+        return multi_level_flags
+
+    def single_level_valid_flags(self,
+                                 featmap_size: Tuple[int, int],
+                                 valid_size: Tuple[int, int],
+                                 device: DeviceType = 'cuda') -> Tensor:
+        """Generate the valid flags of points of a single feature map.
+
+        Args:
+            featmap_size (tuple[int]): The size of feature maps, arrange as
+                as (h, w).
+            valid_size (tuple[int]): The valid size of the feature maps.
+                The size arrange as as (h, w).
+            device (str | torch.device): The device where the flags will be
+            put on. Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: The valid flags of each points in a single level \
+                feature map.
+        """
+        feat_h, feat_w = featmap_size
+        valid_h, valid_w = valid_size
+        assert valid_h <= feat_h and valid_w <= feat_w
+        valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device)
+        valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device)
+        valid_x[:valid_w] = 1
+        valid_y[:valid_h] = 1
+        valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+        valid = valid_xx & valid_yy
+        return valid
+
+    def sparse_priors(self,
+                      prior_idxs: Tensor,
+                      featmap_size: Tuple[int],
+                      level_idx: int,
+                      dtype: torch.dtype = torch.float32,
+                      device: DeviceType = 'cuda') -> Tensor:
+        """Generate sparse points according to the ``prior_idxs``.
+
+        Args:
+            prior_idxs (Tensor): The index of corresponding anchors
+                in the feature map.
+            featmap_size (tuple[int]): feature map size arrange as (w, h).
+            level_idx (int): The level index of corresponding feature
+                map.
+            dtype (obj:`torch.dtype`): Date type of points. Defaults to
+                ``torch.float32``.
+            device (str | torch.device): The device where the points is
+                located.
+        Returns:
+            Tensor: Anchor with shape (N, 2), N should be equal to
+            the length of ``prior_idxs``. And last dimension
+            2 represent (coord_x, coord_y).
+        """
+        height, width = featmap_size
+        x = (prior_idxs % width + self.offset) * self.strides[level_idx][0]
+        y = ((prior_idxs // width) % height +
+             self.offset) * self.strides[level_idx][1]
+        prioris = torch.stack([x, y], 1).to(dtype)
+        prioris = prioris.to(device)
+        return prioris
diff --git a/mmde/mmdet/models/task_modules/prior_generators/utils.py b/mmde/mmdet/models/task_modules/prior_generators/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aa2dfd49669ba931d20ad9482cb841698cceb8a
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/prior_generators/utils.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet.structures.bbox import BaseBoxes
+
+
+def anchor_inside_flags(flat_anchors: Tensor,
+                        valid_flags: Tensor,
+                        img_shape: Tuple[int],
+                        allowed_border: int = 0) -> Tensor:
+    """Check whether the anchors are inside the border.
+
+    Args:
+        flat_anchors (torch.Tensor): Flatten anchors, shape (n, 4).
+        valid_flags (torch.Tensor): An existing valid flags of anchors.
+        img_shape (tuple(int)): Shape of current image.
+        allowed_border (int): The border to allow the valid anchor.
+            Defaults to 0.
+
+    Returns:
+        torch.Tensor: Flags indicating whether the anchors are inside a \
+            valid range.
+    """
+    img_h, img_w = img_shape[:2]
+    if allowed_border >= 0:
+        if isinstance(flat_anchors, BaseBoxes):
+            inside_flags = valid_flags & \
+                flat_anchors.is_inside([img_h, img_w],
+                                       all_inside=True,
+                                       allowed_border=allowed_border)
+        else:
+            inside_flags = valid_flags & \
+                (flat_anchors[:, 0] >= -allowed_border) & \
+                (flat_anchors[:, 1] >= -allowed_border) & \
+                (flat_anchors[:, 2] < img_w + allowed_border) & \
+                (flat_anchors[:, 3] < img_h + allowed_border)
+    else:
+        inside_flags = valid_flags
+    return inside_flags
+
+
+def calc_region(bbox: Tensor,
+                ratio: float,
+                featmap_size: Optional[Tuple] = None) -> Tuple[int]:
+    """Calculate a proportional bbox region.
+
+    The bbox center are fixed and the new h' and w' is h * ratio and w * ratio.
+
+    Args:
+        bbox (Tensor): Bboxes to calculate regions, shape (n, 4).
+        ratio (float): Ratio of the output region.
+        featmap_size (tuple, Optional): Feature map size in (height, width)
+            order used for clipping the boundary. Defaults to None.
+
+    Returns:
+        tuple: x1, y1, x2, y2
+    """
+    x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long()
+    y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long()
+    x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long()
+    y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long()
+    if featmap_size is not None:
+        x1 = x1.clamp(min=0, max=featmap_size[1])
+        y1 = y1.clamp(min=0, max=featmap_size[0])
+        x2 = x2.clamp(min=0, max=featmap_size[1])
+        y2 = y2.clamp(min=0, max=featmap_size[0])
+    return (x1, y1, x2, y2)
diff --git a/mmde/mmdet/models/task_modules/samplers/__init__.py b/mmde/mmdet/models/task_modules/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3782eb898cf8acace63b4f16204cae6c07eb6e30
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_sampler import BaseSampler
+from .combined_sampler import CombinedSampler
+from .instance_balanced_pos_sampler import InstanceBalancedPosSampler
+from .iou_balanced_neg_sampler import IoUBalancedNegSampler
+from .mask_pseudo_sampler import MaskPseudoSampler
+from .mask_sampling_result import MaskSamplingResult
+from .multi_instance_random_sampler import MultiInsRandomSampler
+from .multi_instance_sampling_result import MultiInstanceSamplingResult
+from .ohem_sampler import OHEMSampler
+from .pseudo_sampler import PseudoSampler
+from .random_sampler import RandomSampler
+from .sampling_result import SamplingResult
+from .score_hlr_sampler import ScoreHLRSampler
+
+__all__ = [
+    'BaseSampler', 'PseudoSampler', 'RandomSampler',
+    'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
+    'OHEMSampler', 'SamplingResult', 'ScoreHLRSampler', 'MaskPseudoSampler',
+    'MaskSamplingResult', 'MultiInstanceSamplingResult',
+    'MultiInsRandomSampler'
+]
diff --git a/mmde/mmdet/models/task_modules/samplers/base_sampler.py b/mmde/mmdet/models/task_modules/samplers/base_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..be8a9a5ee3ec4e70b19aeea21b7998cf2b131d59
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/base_sampler.py
@@ -0,0 +1,136 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.structures.bbox import BaseBoxes, cat_boxes
+from ..assigners import AssignResult
+from .sampling_result import SamplingResult
+
+
+class BaseSampler(metaclass=ABCMeta):
+    """Base class of samplers.
+
+    Args:
+        num (int): Number of samples
+        pos_fraction (float): Fraction of positive samples
+        neg_pos_up (int): Upper bound number of negative and
+            positive samples. Defaults to -1.
+        add_gt_as_proposals (bool): Whether to add ground truth
+            boxes as proposals. Defaults to True.
+    """
+
+    def __init__(self,
+                 num: int,
+                 pos_fraction: float,
+                 neg_pos_ub: int = -1,
+                 add_gt_as_proposals: bool = True,
+                 **kwargs) -> None:
+        self.num = num
+        self.pos_fraction = pos_fraction
+        self.neg_pos_ub = neg_pos_ub
+        self.add_gt_as_proposals = add_gt_as_proposals
+        self.pos_sampler = self
+        self.neg_sampler = self
+
+    @abstractmethod
+    def _sample_pos(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs):
+        """Sample positive samples."""
+        pass
+
+    @abstractmethod
+    def _sample_neg(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs):
+        """Sample negative samples."""
+        pass
+
+    def sample(self, assign_result: AssignResult, pred_instances: InstanceData,
+               gt_instances: InstanceData, **kwargs) -> SamplingResult:
+        """Sample positive and negative bboxes.
+
+        This is a simple implementation of bbox sampling given candidates,
+        assigning results and ground truth bboxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigning results.
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+
+        Returns:
+            :obj:`SamplingResult`: Sampling result.
+
+        Example:
+            >>> from mmengine.structures import InstanceData
+            >>> from mmdet.models.task_modules.samplers import RandomSampler,
+            >>> from mmdet.models.task_modules.assigners import AssignResult
+            >>> from mmdet.models.task_modules.samplers.
+            ... sampling_result import ensure_rng, random_boxes
+            >>> rng = ensure_rng(None)
+            >>> assign_result = AssignResult.random(rng=rng)
+            >>> pred_instances = InstanceData()
+            >>> pred_instances.priors = random_boxes(assign_result.num_preds,
+            ...                                      rng=rng)
+            >>> gt_instances = InstanceData()
+            >>> gt_instances.bboxes = random_boxes(assign_result.num_gts,
+            ...                                    rng=rng)
+            >>> gt_instances.labels = torch.randint(
+            ...     0, 5, (assign_result.num_gts,), dtype=torch.long)
+            >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1,
+            >>>                      add_gt_as_proposals=False)
+            >>> self = self.sample(assign_result, pred_instances, gt_instances)
+        """
+        gt_bboxes = gt_instances.bboxes
+        priors = pred_instances.priors
+        gt_labels = gt_instances.labels
+        if len(priors.shape) < 2:
+            priors = priors[None, :]
+
+        gt_flags = priors.new_zeros((priors.shape[0], ), dtype=torch.uint8)
+        if self.add_gt_as_proposals and len(gt_bboxes) > 0:
+            # When `gt_bboxes` and `priors` are all box type, convert
+            # `gt_bboxes` type to `priors` type.
+            if (isinstance(gt_bboxes, BaseBoxes)
+                    and isinstance(priors, BaseBoxes)):
+                gt_bboxes_ = gt_bboxes.convert_to(type(priors))
+            else:
+                gt_bboxes_ = gt_bboxes
+            priors = cat_boxes([gt_bboxes_, priors], dim=0)
+            assign_result.add_gt_(gt_labels)
+            gt_ones = priors.new_ones(gt_bboxes_.shape[0], dtype=torch.uint8)
+            gt_flags = torch.cat([gt_ones, gt_flags])
+
+        num_expected_pos = int(self.num * self.pos_fraction)
+        pos_inds = self.pos_sampler._sample_pos(
+            assign_result, num_expected_pos, bboxes=priors, **kwargs)
+        # We found that sampled indices have duplicated items occasionally.
+        # (may be a bug of PyTorch)
+        pos_inds = pos_inds.unique()
+        num_sampled_pos = pos_inds.numel()
+        num_expected_neg = self.num - num_sampled_pos
+        if self.neg_pos_ub >= 0:
+            _pos = max(1, num_sampled_pos)
+            neg_upper_bound = int(self.neg_pos_ub * _pos)
+            if num_expected_neg > neg_upper_bound:
+                num_expected_neg = neg_upper_bound
+        neg_inds = self.neg_sampler._sample_neg(
+            assign_result, num_expected_neg, bboxes=priors, **kwargs)
+        neg_inds = neg_inds.unique()
+
+        sampling_result = SamplingResult(
+            pos_inds=pos_inds,
+            neg_inds=neg_inds,
+            priors=priors,
+            gt_bboxes=gt_bboxes,
+            assign_result=assign_result,
+            gt_flags=gt_flags)
+        return sampling_result
diff --git a/mmde/mmdet/models/task_modules/samplers/combined_sampler.py b/mmde/mmdet/models/task_modules/samplers/combined_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e0560e372efffe865fa32028d823280a8bd5d87
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/combined_sampler.py
@@ -0,0 +1,21 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import TASK_UTILS
+from .base_sampler import BaseSampler
+
+
+@TASK_UTILS.register_module()
+class CombinedSampler(BaseSampler):
+    """A sampler that combines positive sampler and negative sampler."""
+
+    def __init__(self, pos_sampler, neg_sampler, **kwargs):
+        super(CombinedSampler, self).__init__(**kwargs)
+        self.pos_sampler = TASK_UTILS.build(pos_sampler, default_args=kwargs)
+        self.neg_sampler = TASK_UTILS.build(neg_sampler, default_args=kwargs)
+
+    def _sample_pos(self, **kwargs):
+        """Sample positive samples."""
+        raise NotImplementedError
+
+    def _sample_neg(self, **kwargs):
+        """Sample negative samples."""
+        raise NotImplementedError
diff --git a/mmde/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py b/mmde/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..e48d8e9158e8dabf0bb4072b8e421de9b6410d00
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py
@@ -0,0 +1,56 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet.registry import TASK_UTILS
+from .random_sampler import RandomSampler
+
+
+@TASK_UTILS.register_module()
+class InstanceBalancedPosSampler(RandomSampler):
+    """Instance balanced sampler that samples equal number of positive samples
+    for each instance."""
+
+    def _sample_pos(self, assign_result, num_expected, **kwargs):
+        """Sample positive boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): The assigned results of boxes.
+            num_expected (int): The number of expected positive samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            unique_gt_inds = assign_result.gt_inds[pos_inds].unique()
+            num_gts = len(unique_gt_inds)
+            num_per_gt = int(round(num_expected / float(num_gts)) + 1)
+            sampled_inds = []
+            for i in unique_gt_inds:
+                inds = torch.nonzero(
+                    assign_result.gt_inds == i.item(), as_tuple=False)
+                if inds.numel() != 0:
+                    inds = inds.squeeze(1)
+                else:
+                    continue
+                if len(inds) > num_per_gt:
+                    inds = self.random_choice(inds, num_per_gt)
+                sampled_inds.append(inds)
+            sampled_inds = torch.cat(sampled_inds)
+            if len(sampled_inds) < num_expected:
+                num_extra = num_expected - len(sampled_inds)
+                extra_inds = np.array(
+                    list(set(pos_inds.cpu()) - set(sampled_inds.cpu())))
+                if len(extra_inds) > num_extra:
+                    extra_inds = self.random_choice(extra_inds, num_extra)
+                extra_inds = torch.from_numpy(extra_inds).to(
+                    assign_result.gt_inds.device).long()
+                sampled_inds = torch.cat([sampled_inds, extra_inds])
+            elif len(sampled_inds) > num_expected:
+                sampled_inds = self.random_choice(sampled_inds, num_expected)
+            return sampled_inds
diff --git a/mmde/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py b/mmde/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc1f46413c99d115f31ef190b4fb198b588a156e
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py
@@ -0,0 +1,158 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet.registry import TASK_UTILS
+from .random_sampler import RandomSampler
+
+
+@TASK_UTILS.register_module()
+class IoUBalancedNegSampler(RandomSampler):
+    """IoU Balanced Sampling.
+
+    arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019)
+
+    Sampling proposals according to their IoU. `floor_fraction` of needed RoIs
+    are sampled from proposals whose IoU are lower than `floor_thr` randomly.
+    The others are sampled from proposals whose IoU are higher than
+    `floor_thr`. These proposals are sampled from some bins evenly, which are
+    split by `num_bins` via IoU evenly.
+
+    Args:
+        num (int): number of proposals.
+        pos_fraction (float): fraction of positive proposals.
+        floor_thr (float): threshold (minimum) IoU for IoU balanced sampling,
+            set to -1 if all using IoU balanced sampling.
+        floor_fraction (float): sampling fraction of proposals under floor_thr.
+        num_bins (int): number of bins in IoU balanced sampling.
+    """
+
+    def __init__(self,
+                 num,
+                 pos_fraction,
+                 floor_thr=-1,
+                 floor_fraction=0,
+                 num_bins=3,
+                 **kwargs):
+        super(IoUBalancedNegSampler, self).__init__(num, pos_fraction,
+                                                    **kwargs)
+        assert floor_thr >= 0 or floor_thr == -1
+        assert 0 <= floor_fraction <= 1
+        assert num_bins >= 1
+
+        self.floor_thr = floor_thr
+        self.floor_fraction = floor_fraction
+        self.num_bins = num_bins
+
+    def sample_via_interval(self, max_overlaps, full_set, num_expected):
+        """Sample according to the iou interval.
+
+        Args:
+            max_overlaps (torch.Tensor): IoU between bounding boxes and ground
+                truth boxes.
+            full_set (set(int)): A full set of indices of boxes。
+            num_expected (int): Number of expected samples。
+
+        Returns:
+            np.ndarray: Indices  of samples
+        """
+        max_iou = max_overlaps.max()
+        iou_interval = (max_iou - self.floor_thr) / self.num_bins
+        per_num_expected = int(num_expected / self.num_bins)
+
+        sampled_inds = []
+        for i in range(self.num_bins):
+            start_iou = self.floor_thr + i * iou_interval
+            end_iou = self.floor_thr + (i + 1) * iou_interval
+            tmp_set = set(
+                np.where(
+                    np.logical_and(max_overlaps >= start_iou,
+                                   max_overlaps < end_iou))[0])
+            tmp_inds = list(tmp_set & full_set)
+            if len(tmp_inds) > per_num_expected:
+                tmp_sampled_set = self.random_choice(tmp_inds,
+                                                     per_num_expected)
+            else:
+                tmp_sampled_set = np.array(tmp_inds, dtype=np.int64)
+            sampled_inds.append(tmp_sampled_set)
+
+        sampled_inds = np.concatenate(sampled_inds)
+        if len(sampled_inds) < num_expected:
+            num_extra = num_expected - len(sampled_inds)
+            extra_inds = np.array(list(full_set - set(sampled_inds)))
+            if len(extra_inds) > num_extra:
+                extra_inds = self.random_choice(extra_inds, num_extra)
+            sampled_inds = np.concatenate([sampled_inds, extra_inds])
+
+        return sampled_inds
+
+    def _sample_neg(self, assign_result, num_expected, **kwargs):
+        """Sample negative boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): The assigned results of boxes.
+            num_expected (int): The number of expected negative samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
+        if neg_inds.numel() != 0:
+            neg_inds = neg_inds.squeeze(1)
+        if len(neg_inds) <= num_expected:
+            return neg_inds
+        else:
+            max_overlaps = assign_result.max_overlaps.cpu().numpy()
+            # balance sampling for negative samples
+            neg_set = set(neg_inds.cpu().numpy())
+
+            if self.floor_thr > 0:
+                floor_set = set(
+                    np.where(
+                        np.logical_and(max_overlaps >= 0,
+                                       max_overlaps < self.floor_thr))[0])
+                iou_sampling_set = set(
+                    np.where(max_overlaps >= self.floor_thr)[0])
+            elif self.floor_thr == 0:
+                floor_set = set(np.where(max_overlaps == 0)[0])
+                iou_sampling_set = set(
+                    np.where(max_overlaps > self.floor_thr)[0])
+            else:
+                floor_set = set()
+                iou_sampling_set = set(
+                    np.where(max_overlaps > self.floor_thr)[0])
+                # for sampling interval calculation
+                self.floor_thr = 0
+
+            floor_neg_inds = list(floor_set & neg_set)
+            iou_sampling_neg_inds = list(iou_sampling_set & neg_set)
+            num_expected_iou_sampling = int(num_expected *
+                                            (1 - self.floor_fraction))
+            if len(iou_sampling_neg_inds) > num_expected_iou_sampling:
+                if self.num_bins >= 2:
+                    iou_sampled_inds = self.sample_via_interval(
+                        max_overlaps, set(iou_sampling_neg_inds),
+                        num_expected_iou_sampling)
+                else:
+                    iou_sampled_inds = self.random_choice(
+                        iou_sampling_neg_inds, num_expected_iou_sampling)
+            else:
+                iou_sampled_inds = np.array(
+                    iou_sampling_neg_inds, dtype=np.int64)
+            num_expected_floor = num_expected - len(iou_sampled_inds)
+            if len(floor_neg_inds) > num_expected_floor:
+                sampled_floor_inds = self.random_choice(
+                    floor_neg_inds, num_expected_floor)
+            else:
+                sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int64)
+            sampled_inds = np.concatenate(
+                (sampled_floor_inds, iou_sampled_inds))
+            if len(sampled_inds) < num_expected:
+                num_extra = num_expected - len(sampled_inds)
+                extra_inds = np.array(list(neg_set - set(sampled_inds)))
+                if len(extra_inds) > num_extra:
+                    extra_inds = self.random_choice(extra_inds, num_extra)
+                sampled_inds = np.concatenate((sampled_inds, extra_inds))
+            sampled_inds = torch.from_numpy(sampled_inds).long().to(
+                assign_result.gt_inds.device)
+            return sampled_inds
diff --git a/mmde/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py b/mmde/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..307dd5d15c962b97dc60b899e60170d0bfed90a7
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py
@@ -0,0 +1,60 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""copy from
+https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py."""
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.registry import TASK_UTILS
+from ..assigners import AssignResult
+from .base_sampler import BaseSampler
+from .mask_sampling_result import MaskSamplingResult
+
+
+@TASK_UTILS.register_module()
+class MaskPseudoSampler(BaseSampler):
+    """A pseudo sampler that does not do sampling actually."""
+
+    def __init__(self, **kwargs):
+        pass
+
+    def _sample_pos(self, **kwargs):
+        """Sample positive samples."""
+        raise NotImplementedError
+
+    def _sample_neg(self, **kwargs):
+        """Sample negative samples."""
+        raise NotImplementedError
+
+    def sample(self, assign_result: AssignResult, pred_instances: InstanceData,
+               gt_instances: InstanceData, *args, **kwargs):
+        """Directly returns the positive and negative indices  of samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Mask assigning results.
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``scores`` and ``masks`` predicted
+                by the model.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``labels`` and ``masks``
+                attributes.
+
+        Returns:
+            :obj:`SamplingResult`: sampler results
+        """
+        pred_masks = pred_instances.masks
+        gt_masks = gt_instances.masks
+        pos_inds = torch.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+        neg_inds = torch.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+        gt_flags = pred_masks.new_zeros(pred_masks.shape[0], dtype=torch.uint8)
+        sampling_result = MaskSamplingResult(
+            pos_inds=pos_inds,
+            neg_inds=neg_inds,
+            masks=pred_masks,
+            gt_masks=gt_masks,
+            assign_result=assign_result,
+            gt_flags=gt_flags,
+            avg_factor_with_neg=False)
+        return sampling_result
diff --git a/mmde/mmdet/models/task_modules/samplers/mask_sampling_result.py b/mmde/mmdet/models/task_modules/samplers/mask_sampling_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..adaa62e8a0af28bb004a34b961f672ec03988d2c
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/mask_sampling_result.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""copy from
+https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py."""
+
+import torch
+from torch import Tensor
+
+from ..assigners import AssignResult
+from .sampling_result import SamplingResult
+
+
+class MaskSamplingResult(SamplingResult):
+    """Mask sampling result."""
+
+    def __init__(self,
+                 pos_inds: Tensor,
+                 neg_inds: Tensor,
+                 masks: Tensor,
+                 gt_masks: Tensor,
+                 assign_result: AssignResult,
+                 gt_flags: Tensor,
+                 avg_factor_with_neg: bool = True) -> None:
+        self.pos_inds = pos_inds
+        self.neg_inds = neg_inds
+        self.num_pos = max(pos_inds.numel(), 1)
+        self.num_neg = max(neg_inds.numel(), 1)
+        self.avg_factor = self.num_pos + self.num_neg \
+            if avg_factor_with_neg else self.num_pos
+
+        self.pos_masks = masks[pos_inds]
+        self.neg_masks = masks[neg_inds]
+        self.pos_is_gt = gt_flags[pos_inds]
+
+        self.num_gts = gt_masks.shape[0]
+        self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+
+        if gt_masks.numel() == 0:
+            # hack for index error case
+            assert self.pos_assigned_gt_inds.numel() == 0
+            self.pos_gt_masks = torch.empty_like(gt_masks)
+        else:
+            self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :]
+
+    @property
+    def masks(self) -> Tensor:
+        """torch.Tensor: concatenated positive and negative masks."""
+        return torch.cat([self.pos_masks, self.neg_masks])
+
+    def __nice__(self) -> str:
+        data = self.info.copy()
+        data['pos_masks'] = data.pop('pos_masks').shape
+        data['neg_masks'] = data.pop('neg_masks').shape
+        parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
+        body = '    ' + ',\n    '.join(parts)
+        return '{\n' + body + '\n}'
+
+    @property
+    def info(self) -> dict:
+        """Returns a dictionary of info about the object."""
+        return {
+            'pos_inds': self.pos_inds,
+            'neg_inds': self.neg_inds,
+            'pos_masks': self.pos_masks,
+            'neg_masks': self.neg_masks,
+            'pos_is_gt': self.pos_is_gt,
+            'num_gts': self.num_gts,
+            'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
+        }
diff --git a/mmde/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py b/mmde/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b74054e3a11ed6025e98e90bd0addb131a1dc02
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import torch
+from mmengine.structures import InstanceData
+from numpy import ndarray
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from ..assigners import AssignResult
+from .multi_instance_sampling_result import MultiInstanceSamplingResult
+from .random_sampler import RandomSampler
+
+
+@TASK_UTILS.register_module()
+class MultiInsRandomSampler(RandomSampler):
+    """Random sampler for multi instance.
+
+    Note:
+        Multi-instance means to predict multiple detection boxes with
+        one proposal box. `AssignResult` may assign multiple gt boxes
+        to each proposal box, in this case `RandomSampler` should be
+        replaced by `MultiInsRandomSampler`
+    """
+
+    def _sample_pos(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs) -> Union[Tensor, ndarray]:
+        """Randomly sample some positive samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            num_expected (int): The number of expected positive samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        pos_inds = torch.nonzero(
+            assign_result.labels[:, 0] > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            return self.random_choice(pos_inds, num_expected)
+
+    def _sample_neg(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs) -> Union[Tensor, ndarray]:
+        """Randomly sample some negative samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            num_expected (int): The number of expected positive samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        neg_inds = torch.nonzero(
+            assign_result.labels[:, 0] == 0, as_tuple=False)
+        if neg_inds.numel() != 0:
+            neg_inds = neg_inds.squeeze(1)
+        if len(neg_inds) <= num_expected:
+            return neg_inds
+        else:
+            return self.random_choice(neg_inds, num_expected)
+
+    def sample(self, assign_result: AssignResult, pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               **kwargs) -> MultiInstanceSamplingResult:
+        """Sample positive and negative bboxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigning results from
+                MultiInstanceAssigner.
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+
+        Returns:
+            :obj:`MultiInstanceSamplingResult`: Sampling result.
+        """
+
+        assert 'batch_gt_instances_ignore' in kwargs, \
+            'batch_gt_instances_ignore is necessary for MultiInsRandomSampler'
+
+        gt_bboxes = gt_instances.bboxes
+        ignore_bboxes = kwargs['batch_gt_instances_ignore'].bboxes
+        gt_and_ignore_bboxes = torch.cat([gt_bboxes, ignore_bboxes], dim=0)
+        priors = pred_instances.priors
+        if len(priors.shape) < 2:
+            priors = priors[None, :]
+        priors = priors[:, :4]
+
+        gt_flags = priors.new_zeros((priors.shape[0], ), dtype=torch.uint8)
+        priors = torch.cat([priors, gt_and_ignore_bboxes], dim=0)
+        gt_ones = priors.new_ones(
+            gt_and_ignore_bboxes.shape[0], dtype=torch.uint8)
+        gt_flags = torch.cat([gt_flags, gt_ones])
+
+        num_expected_pos = int(self.num * self.pos_fraction)
+        pos_inds = self.pos_sampler._sample_pos(assign_result,
+                                                num_expected_pos)
+        # We found that sampled indices have duplicated items occasionally.
+        # (may be a bug of PyTorch)
+        pos_inds = pos_inds.unique()
+        num_sampled_pos = pos_inds.numel()
+        num_expected_neg = self.num - num_sampled_pos
+        if self.neg_pos_ub >= 0:
+            _pos = max(1, num_sampled_pos)
+            neg_upper_bound = int(self.neg_pos_ub * _pos)
+            if num_expected_neg > neg_upper_bound:
+                num_expected_neg = neg_upper_bound
+        neg_inds = self.neg_sampler._sample_neg(assign_result,
+                                                num_expected_neg)
+        neg_inds = neg_inds.unique()
+
+        sampling_result = MultiInstanceSamplingResult(
+            pos_inds=pos_inds,
+            neg_inds=neg_inds,
+            priors=priors,
+            gt_and_ignore_bboxes=gt_and_ignore_bboxes,
+            assign_result=assign_result,
+            gt_flags=gt_flags)
+        return sampling_result
diff --git a/mmde/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py b/mmde/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..438a0aa91c0cc8904f6d8bba7139408dd99b98cf
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py
@@ -0,0 +1,56 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from torch import Tensor
+
+from ..assigners import AssignResult
+from .sampling_result import SamplingResult
+
+
+class MultiInstanceSamplingResult(SamplingResult):
+    """Bbox sampling result. Further encapsulation of SamplingResult. Three
+    attributes neg_assigned_gt_inds, neg_gt_labels, and neg_gt_bboxes have been
+    added for SamplingResult.
+
+    Args:
+        pos_inds (Tensor): Indices of positive samples.
+        neg_inds (Tensor): Indices of negative samples.
+        priors (Tensor): The priors can be anchors or points,
+            or the bboxes predicted by the previous stage.
+        gt_and_ignore_bboxes (Tensor): Ground truth and ignore bboxes.
+        assign_result (:obj:`AssignResult`): Assigning results.
+        gt_flags (Tensor): The Ground truth flags.
+        avg_factor_with_neg (bool):  If True, ``avg_factor`` equal to
+            the number of total priors; Otherwise, it is the number of
+            positive priors. Defaults to True.
+    """
+
+    def __init__(self,
+                 pos_inds: Tensor,
+                 neg_inds: Tensor,
+                 priors: Tensor,
+                 gt_and_ignore_bboxes: Tensor,
+                 assign_result: AssignResult,
+                 gt_flags: Tensor,
+                 avg_factor_with_neg: bool = True) -> None:
+        self.neg_assigned_gt_inds = assign_result.gt_inds[neg_inds]
+        self.neg_gt_labels = assign_result.labels[neg_inds]
+
+        if gt_and_ignore_bboxes.numel() == 0:
+            self.neg_gt_bboxes = torch.empty_like(gt_and_ignore_bboxes).view(
+                -1, 4)
+        else:
+            if len(gt_and_ignore_bboxes.shape) < 2:
+                gt_and_ignore_bboxes = gt_and_ignore_bboxes.view(-1, 4)
+            self.neg_gt_bboxes = gt_and_ignore_bboxes[
+                self.neg_assigned_gt_inds.long(), :]
+
+        # To resist the minus 1 operation in `SamplingResult.init()`.
+        assign_result.gt_inds += 1
+        super().__init__(
+            pos_inds=pos_inds,
+            neg_inds=neg_inds,
+            priors=priors,
+            gt_bboxes=gt_and_ignore_bboxes,
+            assign_result=assign_result,
+            gt_flags=gt_flags,
+            avg_factor_with_neg=avg_factor_with_neg)
diff --git a/mmde/mmdet/models/task_modules/samplers/ohem_sampler.py b/mmde/mmdet/models/task_modules/samplers/ohem_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..f478a448cde00d64caeba1d0ba613d2497a7fb12
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/ohem_sampler.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import bbox2roi
+from .base_sampler import BaseSampler
+
+
+@TASK_UTILS.register_module()
+class OHEMSampler(BaseSampler):
+    r"""Online Hard Example Mining Sampler described in `Training Region-based
+    Object Detectors with Online Hard Example Mining
+    <https://arxiv.org/abs/1604.03540>`_.
+    """
+
+    def __init__(self,
+                 num,
+                 pos_fraction,
+                 context,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=True,
+                 loss_key='loss_cls',
+                 **kwargs):
+        super(OHEMSampler, self).__init__(num, pos_fraction, neg_pos_ub,
+                                          add_gt_as_proposals)
+        self.context = context
+        if not hasattr(self.context, 'num_stages'):
+            self.bbox_head = self.context.bbox_head
+        else:
+            self.bbox_head = self.context.bbox_head[self.context.current_stage]
+
+        self.loss_key = loss_key
+
+    def hard_mining(self, inds, num_expected, bboxes, labels, feats):
+        with torch.no_grad():
+            rois = bbox2roi([bboxes])
+            if not hasattr(self.context, 'num_stages'):
+                bbox_results = self.context._bbox_forward(feats, rois)
+            else:
+                bbox_results = self.context._bbox_forward(
+                    self.context.current_stage, feats, rois)
+            cls_score = bbox_results['cls_score']
+            loss = self.bbox_head.loss(
+                cls_score=cls_score,
+                bbox_pred=None,
+                rois=rois,
+                labels=labels,
+                label_weights=cls_score.new_ones(cls_score.size(0)),
+                bbox_targets=None,
+                bbox_weights=None,
+                reduction_override='none')[self.loss_key]
+            _, topk_loss_inds = loss.topk(num_expected)
+        return inds[topk_loss_inds]
+
+    def _sample_pos(self,
+                    assign_result,
+                    num_expected,
+                    bboxes=None,
+                    feats=None,
+                    **kwargs):
+        """Sample positive boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            num_expected (int): Number of expected positive samples
+            bboxes (torch.Tensor, optional): Boxes. Defaults to None.
+            feats (list[torch.Tensor], optional): Multi-level features.
+                Defaults to None.
+
+        Returns:
+            torch.Tensor: Indices  of positive samples
+        """
+        # Sample some hard positive samples
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds],
+                                    assign_result.labels[pos_inds], feats)
+
+    def _sample_neg(self,
+                    assign_result,
+                    num_expected,
+                    bboxes=None,
+                    feats=None,
+                    **kwargs):
+        """Sample negative boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            num_expected (int): Number of expected negative samples
+            bboxes (torch.Tensor, optional): Boxes. Defaults to None.
+            feats (list[torch.Tensor], optional): Multi-level features.
+                Defaults to None.
+
+        Returns:
+            torch.Tensor: Indices  of negative samples
+        """
+        # Sample some hard negative samples
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
+        if neg_inds.numel() != 0:
+            neg_inds = neg_inds.squeeze(1)
+        if len(neg_inds) <= num_expected:
+            return neg_inds
+        else:
+            neg_labels = assign_result.labels.new_empty(
+                neg_inds.size(0)).fill_(self.bbox_head.num_classes)
+            return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds],
+                                    neg_labels, feats)
diff --git a/mmde/mmdet/models/task_modules/samplers/pseudo_sampler.py b/mmde/mmdet/models/task_modules/samplers/pseudo_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8186cc3364516f34abe1c293017db6e2042d92a
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/pseudo_sampler.py
@@ -0,0 +1,60 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.registry import TASK_UTILS
+from ..assigners import AssignResult
+from .base_sampler import BaseSampler
+from .sampling_result import SamplingResult
+
+
+@TASK_UTILS.register_module()
+class PseudoSampler(BaseSampler):
+    """A pseudo sampler that does not do sampling actually."""
+
+    def __init__(self, **kwargs):
+        pass
+
+    def _sample_pos(self, **kwargs):
+        """Sample positive samples."""
+        raise NotImplementedError
+
+    def _sample_neg(self, **kwargs):
+        """Sample negative samples."""
+        raise NotImplementedError
+
+    def sample(self, assign_result: AssignResult, pred_instances: InstanceData,
+               gt_instances: InstanceData, *args, **kwargs):
+        """Directly returns the positive and negative indices  of samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors, points, or bboxes predicted by the model,
+                shape(n, 4).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            :obj:`SamplingResult`: sampler results
+        """
+        gt_bboxes = gt_instances.bboxes
+        priors = pred_instances.priors
+
+        pos_inds = torch.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+        neg_inds = torch.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+
+        gt_flags = priors.new_zeros(priors.shape[0], dtype=torch.uint8)
+        sampling_result = SamplingResult(
+            pos_inds=pos_inds,
+            neg_inds=neg_inds,
+            priors=priors,
+            gt_bboxes=gt_bboxes,
+            assign_result=assign_result,
+            gt_flags=gt_flags,
+            avg_factor_with_neg=False)
+        return sampling_result
diff --git a/mmde/mmdet/models/task_modules/samplers/random_sampler.py b/mmde/mmdet/models/task_modules/samplers/random_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa03665fc36cc6a0084431324b16727b2dc8993e
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/random_sampler.py
@@ -0,0 +1,109 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import torch
+from numpy import ndarray
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from ..assigners import AssignResult
+from .base_sampler import BaseSampler
+
+
+@TASK_UTILS.register_module()
+class RandomSampler(BaseSampler):
+    """Random sampler.
+
+    Args:
+        num (int): Number of samples
+        pos_fraction (float): Fraction of positive samples
+        neg_pos_up (int): Upper bound number of negative and
+            positive samples. Defaults to -1.
+        add_gt_as_proposals (bool): Whether to add ground truth
+            boxes as proposals. Defaults to True.
+    """
+
+    def __init__(self,
+                 num: int,
+                 pos_fraction: float,
+                 neg_pos_ub: int = -1,
+                 add_gt_as_proposals: bool = True,
+                 **kwargs):
+        from .sampling_result import ensure_rng
+        super().__init__(
+            num=num,
+            pos_fraction=pos_fraction,
+            neg_pos_ub=neg_pos_ub,
+            add_gt_as_proposals=add_gt_as_proposals)
+        self.rng = ensure_rng(kwargs.get('rng', None))
+
+    def random_choice(self, gallery: Union[Tensor, ndarray, list],
+                      num: int) -> Union[Tensor, ndarray]:
+        """Random select some elements from the gallery.
+
+        If `gallery` is a Tensor, the returned indices will be a Tensor;
+        If `gallery` is a ndarray or list, the returned indices will be a
+        ndarray.
+
+        Args:
+            gallery (Tensor | ndarray | list): indices pool.
+            num (int): expected sample num.
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        assert len(gallery) >= num
+
+        is_tensor = isinstance(gallery, torch.Tensor)
+        if not is_tensor:
+            if torch.cuda.is_available():
+                device = torch.cuda.current_device()
+            else:
+                device = 'cpu'
+            gallery = torch.tensor(gallery, dtype=torch.long, device=device)
+        # This is a temporary fix. We can revert the following code
+        # when PyTorch fixes the abnormal return of torch.randperm.
+        # See: https://github.com/open-mmlab/mmdetection/pull/5014
+        perm = torch.randperm(gallery.numel())[:num].to(device=gallery.device)
+        rand_inds = gallery[perm]
+        if not is_tensor:
+            rand_inds = rand_inds.cpu().numpy()
+        return rand_inds
+
+    def _sample_pos(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs) -> Union[Tensor, ndarray]:
+        """Randomly sample some positive samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            num_expected (int): The number of expected positive samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            return self.random_choice(pos_inds, num_expected)
+
+    def _sample_neg(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs) -> Union[Tensor, ndarray]:
+        """Randomly sample some negative samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            num_expected (int): The number of expected positive samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
+        if neg_inds.numel() != 0:
+            neg_inds = neg_inds.squeeze(1)
+        if len(neg_inds) <= num_expected:
+            return neg_inds
+        else:
+            return self.random_choice(neg_inds, num_expected)
diff --git a/mmde/mmdet/models/task_modules/samplers/sampling_result.py b/mmde/mmdet/models/task_modules/samplers/sampling_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb510ee68f24b8c444b6ed447016bfc785b825c2
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/sampling_result.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet.structures.bbox import BaseBoxes, cat_boxes
+from mmdet.utils import util_mixins
+from mmdet.utils.util_random import ensure_rng
+from ..assigners import AssignResult
+
+
+def random_boxes(num=1, scale=1, rng=None):
+    """Simple version of ``kwimage.Boxes.random``
+
+    Returns:
+        Tensor: shape (n, 4) in x1, y1, x2, y2 format.
+
+    References:
+        https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390
+
+    Example:
+        >>> num = 3
+        >>> scale = 512
+        >>> rng = 0
+        >>> boxes = random_boxes(num, scale, rng)
+        >>> print(boxes)
+        tensor([[280.9925, 278.9802, 308.6148, 366.1769],
+                [216.9113, 330.6978, 224.0446, 456.5878],
+                [405.3632, 196.3221, 493.3953, 270.7942]])
+    """
+    rng = ensure_rng(rng)
+
+    tlbr = rng.rand(num, 4).astype(np.float32)
+
+    tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2])
+    tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3])
+    br_x = np.maximum(tlbr[:, 0], tlbr[:, 2])
+    br_y = np.maximum(tlbr[:, 1], tlbr[:, 3])
+
+    tlbr[:, 0] = tl_x * scale
+    tlbr[:, 1] = tl_y * scale
+    tlbr[:, 2] = br_x * scale
+    tlbr[:, 3] = br_y * scale
+
+    boxes = torch.from_numpy(tlbr)
+    return boxes
+
+
+class SamplingResult(util_mixins.NiceRepr):
+    """Bbox sampling result.
+
+    Args:
+        pos_inds (Tensor): Indices of positive samples.
+        neg_inds (Tensor): Indices of negative samples.
+        priors (Tensor): The priors can be anchors or points,
+            or the bboxes predicted by the previous stage.
+        gt_bboxes (Tensor): Ground truth of bboxes.
+        assign_result (:obj:`AssignResult`): Assigning results.
+        gt_flags (Tensor): The Ground truth flags.
+        avg_factor_with_neg (bool):  If True, ``avg_factor`` equal to
+            the number of total priors; Otherwise, it is the number of
+            positive priors. Defaults to True.
+
+    Example:
+        >>> # xdoctest: +IGNORE_WANT
+        >>> from mmdet.models.task_modules.samplers.sampling_result import *  # NOQA
+        >>> self = SamplingResult.random(rng=10)
+        >>> print(f'self = {self}')
+        self = <SamplingResult({
+            'neg_inds': tensor([1,  2,  3,  5,  6,  7,  8,
+                                9, 10, 11, 12, 13]),
+            'neg_priors': torch.Size([12, 4]),
+            'num_gts': 1,
+            'num_neg': 12,
+            'num_pos': 1,
+            'avg_factor': 13,
+            'pos_assigned_gt_inds': tensor([0]),
+            'pos_inds': tensor([0]),
+            'pos_is_gt': tensor([1], dtype=torch.uint8),
+            'pos_priors': torch.Size([1, 4])
+        })>
+    """
+
+    def __init__(self,
+                 pos_inds: Tensor,
+                 neg_inds: Tensor,
+                 priors: Tensor,
+                 gt_bboxes: Tensor,
+                 assign_result: AssignResult,
+                 gt_flags: Tensor,
+                 avg_factor_with_neg: bool = True) -> None:
+        self.pos_inds = pos_inds
+        self.neg_inds = neg_inds
+        self.num_pos = max(pos_inds.numel(), 1)
+        self.num_neg = max(neg_inds.numel(), 1)
+        self.avg_factor_with_neg = avg_factor_with_neg
+        self.avg_factor = self.num_pos + self.num_neg \
+            if avg_factor_with_neg else self.num_pos
+        self.pos_priors = priors[pos_inds]
+        self.neg_priors = priors[neg_inds]
+        self.pos_is_gt = gt_flags[pos_inds]
+
+        self.num_gts = gt_bboxes.shape[0]
+        self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+        self.pos_gt_labels = assign_result.labels[pos_inds]
+        box_dim = gt_bboxes.box_dim if isinstance(gt_bboxes, BaseBoxes) else 4
+        if gt_bboxes.numel() == 0:
+            # hack for index error case
+            assert self.pos_assigned_gt_inds.numel() == 0
+            self.pos_gt_bboxes = gt_bboxes.view(-1, box_dim)
+        else:
+            if len(gt_bboxes.shape) < 2:
+                gt_bboxes = gt_bboxes.view(-1, box_dim)
+            self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds.long()]
+
+    @property
+    def priors(self):
+        """torch.Tensor: concatenated positive and negative priors"""
+        return cat_boxes([self.pos_priors, self.neg_priors])
+
+    @property
+    def bboxes(self):
+        """torch.Tensor: concatenated positive and negative boxes"""
+        warnings.warn('DeprecationWarning: bboxes is deprecated, '
+                      'please use "priors" instead')
+        return self.priors
+
+    @property
+    def pos_bboxes(self):
+        warnings.warn('DeprecationWarning: pos_bboxes is deprecated, '
+                      'please use "pos_priors" instead')
+        return self.pos_priors
+
+    @property
+    def neg_bboxes(self):
+        warnings.warn('DeprecationWarning: neg_bboxes is deprecated, '
+                      'please use "neg_priors" instead')
+        return self.neg_priors
+
+    def to(self, device):
+        """Change the device of the data inplace.
+
+        Example:
+            >>> self = SamplingResult.random()
+            >>> print(f'self = {self.to(None)}')
+            >>> # xdoctest: +REQUIRES(--gpu)
+            >>> print(f'self = {self.to(0)}')
+        """
+        _dict = self.__dict__
+        for key, value in _dict.items():
+            if isinstance(value, (torch.Tensor, BaseBoxes)):
+                _dict[key] = value.to(device)
+        return self
+
+    def __nice__(self):
+        data = self.info.copy()
+        data['pos_priors'] = data.pop('pos_priors').shape
+        data['neg_priors'] = data.pop('neg_priors').shape
+        parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
+        body = '    ' + ',\n    '.join(parts)
+        return '{\n' + body + '\n}'
+
+    @property
+    def info(self):
+        """Returns a dictionary of info about the object."""
+        return {
+            'pos_inds': self.pos_inds,
+            'neg_inds': self.neg_inds,
+            'pos_priors': self.pos_priors,
+            'neg_priors': self.neg_priors,
+            'pos_is_gt': self.pos_is_gt,
+            'num_gts': self.num_gts,
+            'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
+            'num_pos': self.num_pos,
+            'num_neg': self.num_neg,
+            'avg_factor': self.avg_factor
+        }
+
+    @classmethod
+    def random(cls, rng=None, **kwargs):
+        """
+        Args:
+            rng (None | int | numpy.random.RandomState): seed or state.
+            kwargs (keyword arguments):
+                - num_preds: Number of predicted boxes.
+                - num_gts: Number of true boxes.
+                - p_ignore (float): Probability of a predicted box assigned to
+                    an ignored truth.
+                - p_assigned (float): probability of a predicted box not being
+                    assigned.
+
+        Returns:
+            :obj:`SamplingResult`: Randomly generated sampling result.
+
+        Example:
+            >>> from mmdet.models.task_modules.samplers.sampling_result import *  # NOQA
+            >>> self = SamplingResult.random()
+            >>> print(self.__dict__)
+        """
+        from mmengine.structures import InstanceData
+
+        from mmdet.models.task_modules.assigners import AssignResult
+        from mmdet.models.task_modules.samplers import RandomSampler
+        rng = ensure_rng(rng)
+
+        # make probabilistic?
+        num = 32
+        pos_fraction = 0.5
+        neg_pos_ub = -1
+
+        assign_result = AssignResult.random(rng=rng, **kwargs)
+
+        # Note we could just compute an assignment
+        priors = random_boxes(assign_result.num_preds, rng=rng)
+        gt_bboxes = random_boxes(assign_result.num_gts, rng=rng)
+        gt_labels = torch.randint(
+            0, 5, (assign_result.num_gts, ), dtype=torch.long)
+
+        pred_instances = InstanceData()
+        pred_instances.priors = priors
+
+        gt_instances = InstanceData()
+        gt_instances.bboxes = gt_bboxes
+        gt_instances.labels = gt_labels
+
+        add_gt_as_proposals = True
+
+        sampler = RandomSampler(
+            num,
+            pos_fraction,
+            neg_pos_ub=neg_pos_ub,
+            add_gt_as_proposals=add_gt_as_proposals,
+            rng=rng)
+        self = sampler.sample(
+            assign_result=assign_result,
+            pred_instances=pred_instances,
+            gt_instances=gt_instances)
+        return self
diff --git a/mmde/mmdet/models/task_modules/samplers/score_hlr_sampler.py b/mmde/mmdet/models/task_modules/samplers/score_hlr_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..0227585b92329625d053f1e9f8c161fd02af8aef
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/samplers/score_hlr_sampler.py
@@ -0,0 +1,290 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import torch
+from mmcv.ops import nms_match
+from mmengine.structures import InstanceData
+from numpy import ndarray
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import bbox2roi
+from ..assigners import AssignResult
+from .base_sampler import BaseSampler
+from .sampling_result import SamplingResult
+
+
+@TASK_UTILS.register_module()
+class ScoreHLRSampler(BaseSampler):
+    r"""Importance-based Sample Reweighting (ISR_N), described in `Prime Sample
+    Attention in Object Detection <https://arxiv.org/abs/1904.04821>`_.
+
+    Score hierarchical local rank (HLR) differentiates with RandomSampler in
+    negative part. It firstly computes Score-HLR in a two-step way,
+    then linearly maps score hlr to the loss weights.
+
+    Args:
+        num (int): Total number of sampled RoIs.
+        pos_fraction (float): Fraction of positive samples.
+        context (:obj:`BaseRoIHead`): RoI head that the sampler belongs to.
+        neg_pos_ub (int): Upper bound of the ratio of num negative to num
+            positive, -1 means no upper bound. Defaults to -1.
+        add_gt_as_proposals (bool): Whether to add ground truth as proposals.
+            Defaults to True.
+        k (float): Power of the non-linear mapping. Defaults to 0.5
+        bias (float): Shift of the non-linear mapping. Defaults to 0.
+        score_thr (float): Minimum score that a negative sample is to be
+            considered as valid bbox. Defaults to 0.05.
+        iou_thr (float): IoU threshold for NMS match. Defaults to 0.5.
+    """
+
+    def __init__(self,
+                 num: int,
+                 pos_fraction: float,
+                 context,
+                 neg_pos_ub: int = -1,
+                 add_gt_as_proposals: bool = True,
+                 k: float = 0.5,
+                 bias: float = 0,
+                 score_thr: float = 0.05,
+                 iou_thr: float = 0.5,
+                 **kwargs) -> None:
+        super().__init__(
+            num=num,
+            pos_fraction=pos_fraction,
+            neg_pos_ub=neg_pos_ub,
+            add_gt_as_proposals=add_gt_as_proposals)
+        self.k = k
+        self.bias = bias
+        self.score_thr = score_thr
+        self.iou_thr = iou_thr
+        self.context = context
+        # context of cascade detectors is a list, so distinguish them here.
+        if not hasattr(context, 'num_stages'):
+            self.bbox_roi_extractor = context.bbox_roi_extractor
+            self.bbox_head = context.bbox_head
+            self.with_shared_head = context.with_shared_head
+            if self.with_shared_head:
+                self.shared_head = context.shared_head
+        else:
+            self.bbox_roi_extractor = context.bbox_roi_extractor[
+                context.current_stage]
+            self.bbox_head = context.bbox_head[context.current_stage]
+
+    @staticmethod
+    def random_choice(gallery: Union[Tensor, ndarray, list],
+                      num: int) -> Union[Tensor, ndarray]:
+        """Randomly select some elements from the gallery.
+
+        If `gallery` is a Tensor, the returned indices will be a Tensor;
+        If `gallery` is a ndarray or list, the returned indices will be a
+        ndarray.
+
+        Args:
+            gallery (Tensor or ndarray or list): indices pool.
+            num (int): expected sample num.
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        assert len(gallery) >= num
+
+        is_tensor = isinstance(gallery, torch.Tensor)
+        if not is_tensor:
+            if torch.cuda.is_available():
+                device = torch.cuda.current_device()
+            else:
+                device = 'cpu'
+            gallery = torch.tensor(gallery, dtype=torch.long, device=device)
+        perm = torch.randperm(gallery.numel(), device=gallery.device)[:num]
+        rand_inds = gallery[perm]
+        if not is_tensor:
+            rand_inds = rand_inds.cpu().numpy()
+        return rand_inds
+
+    def _sample_pos(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs) -> Union[Tensor, ndarray]:
+        """Randomly sample some positive samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            num_expected (int): The number of expected positive samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0).flatten()
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            return self.random_choice(pos_inds, num_expected)
+
+    def _sample_neg(self, assign_result: AssignResult, num_expected: int,
+                    bboxes: Tensor, feats: Tensor,
+                    **kwargs) -> Union[Tensor, ndarray]:
+        """Sample negative samples.
+
+        Score-HLR sampler is done in the following steps:
+        1. Take the maximum positive score prediction of each negative samples
+            as s_i.
+        2. Filter out negative samples whose s_i <= score_thr, the left samples
+            are called valid samples.
+        3. Use NMS-Match to divide valid samples into different groups,
+            samples in the same group will greatly overlap with each other
+        4. Rank the matched samples in two-steps to get Score-HLR.
+            (1) In the same group, rank samples with their scores.
+            (2) In the same score rank across different groups,
+                rank samples with their scores again.
+        5. Linearly map Score-HLR to the final label weights.
+
+        Args:
+            assign_result (:obj:`AssignResult`): result of assigner.
+            num_expected (int): Expected number of samples.
+            bboxes (Tensor): bbox to be sampled.
+            feats (Tensor): Features come from FPN.
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0).flatten()
+        num_neg = neg_inds.size(0)
+        if num_neg == 0:
+            return neg_inds, None
+        with torch.no_grad():
+            neg_bboxes = bboxes[neg_inds]
+            neg_rois = bbox2roi([neg_bboxes])
+            bbox_result = self.context._bbox_forward(feats, neg_rois)
+            cls_score, bbox_pred = bbox_result['cls_score'], bbox_result[
+                'bbox_pred']
+
+            ori_loss = self.bbox_head.loss(
+                cls_score=cls_score,
+                bbox_pred=None,
+                rois=None,
+                labels=neg_inds.new_full((num_neg, ),
+                                         self.bbox_head.num_classes),
+                label_weights=cls_score.new_ones(num_neg),
+                bbox_targets=None,
+                bbox_weights=None,
+                reduction_override='none')['loss_cls']
+
+            # filter out samples with the max score lower than score_thr
+            max_score, argmax_score = cls_score.softmax(-1)[:, :-1].max(-1)
+            valid_inds = (max_score > self.score_thr).nonzero().view(-1)
+            invalid_inds = (max_score <= self.score_thr).nonzero().view(-1)
+            num_valid = valid_inds.size(0)
+            num_invalid = invalid_inds.size(0)
+
+            num_expected = min(num_neg, num_expected)
+            num_hlr = min(num_valid, num_expected)
+            num_rand = num_expected - num_hlr
+            if num_valid > 0:
+                valid_rois = neg_rois[valid_inds]
+                valid_max_score = max_score[valid_inds]
+                valid_argmax_score = argmax_score[valid_inds]
+                valid_bbox_pred = bbox_pred[valid_inds]
+
+                # valid_bbox_pred shape: [num_valid, #num_classes, 4]
+                valid_bbox_pred = valid_bbox_pred.view(
+                    valid_bbox_pred.size(0), -1, 4)
+                selected_bbox_pred = valid_bbox_pred[range(num_valid),
+                                                     valid_argmax_score]
+                pred_bboxes = self.bbox_head.bbox_coder.decode(
+                    valid_rois[:, 1:], selected_bbox_pred)
+                pred_bboxes_with_score = torch.cat(
+                    [pred_bboxes, valid_max_score[:, None]], -1)
+                group = nms_match(pred_bboxes_with_score, self.iou_thr)
+
+                # imp: importance
+                imp = cls_score.new_zeros(num_valid)
+                for g in group:
+                    g_score = valid_max_score[g]
+                    # g_score has already sorted
+                    rank = g_score.new_tensor(range(g_score.size(0)))
+                    imp[g] = num_valid - rank + g_score
+                _, imp_rank_inds = imp.sort(descending=True)
+                _, imp_rank = imp_rank_inds.sort()
+                hlr_inds = imp_rank_inds[:num_expected]
+
+                if num_rand > 0:
+                    rand_inds = torch.randperm(num_invalid)[:num_rand]
+                    select_inds = torch.cat(
+                        [valid_inds[hlr_inds], invalid_inds[rand_inds]])
+                else:
+                    select_inds = valid_inds[hlr_inds]
+
+                neg_label_weights = cls_score.new_ones(num_expected)
+
+                up_bound = max(num_expected, num_valid)
+                imp_weights = (up_bound -
+                               imp_rank[hlr_inds].float()) / up_bound
+                neg_label_weights[:num_hlr] = imp_weights
+                neg_label_weights[num_hlr:] = imp_weights.min()
+                neg_label_weights = (self.bias +
+                                     (1 - self.bias) * neg_label_weights).pow(
+                                         self.k)
+                ori_selected_loss = ori_loss[select_inds]
+                new_loss = ori_selected_loss * neg_label_weights
+                norm_ratio = ori_selected_loss.sum() / new_loss.sum()
+                neg_label_weights *= norm_ratio
+            else:
+                neg_label_weights = cls_score.new_ones(num_expected)
+                select_inds = torch.randperm(num_neg)[:num_expected]
+
+            return neg_inds[select_inds], neg_label_weights
+
+    def sample(self, assign_result: AssignResult, pred_instances: InstanceData,
+               gt_instances: InstanceData, **kwargs) -> SamplingResult:
+        """Sample positive and negative bboxes.
+
+        This is a simple implementation of bbox sampling given candidates,
+        assigning results and ground truth bboxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigning results.
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+
+        Returns:
+            :obj:`SamplingResult`: Sampling result.
+        """
+        gt_bboxes = gt_instances.bboxes
+        priors = pred_instances.priors
+        gt_labels = gt_instances.labels
+
+        gt_flags = priors.new_zeros((priors.shape[0], ), dtype=torch.uint8)
+        if self.add_gt_as_proposals and len(gt_bboxes) > 0:
+            priors = torch.cat([gt_bboxes, priors], dim=0)
+            assign_result.add_gt_(gt_labels)
+            gt_ones = priors.new_ones(gt_bboxes.shape[0], dtype=torch.uint8)
+            gt_flags = torch.cat([gt_ones, gt_flags])
+
+        num_expected_pos = int(self.num * self.pos_fraction)
+        pos_inds = self.pos_sampler._sample_pos(
+            assign_result, num_expected_pos, bboxes=priors, **kwargs)
+        num_sampled_pos = pos_inds.numel()
+        num_expected_neg = self.num - num_sampled_pos
+        if self.neg_pos_ub >= 0:
+            _pos = max(1, num_sampled_pos)
+            neg_upper_bound = int(self.neg_pos_ub * _pos)
+            if num_expected_neg > neg_upper_bound:
+                num_expected_neg = neg_upper_bound
+        neg_inds, neg_label_weights = self.neg_sampler._sample_neg(
+            assign_result, num_expected_neg, bboxes=priors, **kwargs)
+
+        sampling_result = SamplingResult(
+            pos_inds=pos_inds,
+            neg_inds=neg_inds,
+            priors=priors,
+            gt_bboxes=gt_bboxes,
+            assign_result=assign_result,
+            gt_flags=gt_flags)
+        return sampling_result, neg_label_weights
diff --git a/mmde/mmdet/models/task_modules/tracking/__init__.py b/mmde/mmdet/models/task_modules/tracking/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..57a86d739d586e47e007d26de4542d6bdeced755
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/tracking/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .aflink import AppearanceFreeLink
+from .camera_motion_compensation import CameraMotionCompensation
+from .interpolation import InterpolateTracklets
+from .kalman_filter import KalmanFilter
+from .similarity import embed_similarity
+
+__all__ = [
+    'KalmanFilter', 'InterpolateTracklets', 'embed_similarity',
+    'AppearanceFreeLink', 'CameraMotionCompensation'
+]
diff --git a/mmde/mmdet/models/task_modules/tracking/aflink.py b/mmde/mmdet/models/task_modules/tracking/aflink.py
new file mode 100644
index 0000000000000000000000000000000000000000..52461067e372b30bbd28325ead00f5381c546326
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/tracking/aflink.py
@@ -0,0 +1,281 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import defaultdict
+from typing import Tuple
+
+import numpy as np
+import torch
+from mmengine.model import BaseModule
+from mmengine.runner.checkpoint import load_checkpoint
+from scipy.optimize import linear_sum_assignment
+from torch import Tensor, nn
+
+from mmdet.registry import TASK_UTILS
+
+INFINITY = 1e5
+
+
+class TemporalBlock(BaseModule):
+    """The temporal block of AFLink model.
+
+    Args:
+        in_channel (int): the dimension of the input channels.
+        out_channel (int): the dimension of the output channels.
+    """
+
+    def __init__(self,
+                 in_channel: int,
+                 out_channel: int,
+                 kernel_size: tuple = (7, 1)):
+        super(TemporalBlock, self).__init__()
+        self.conv = nn.Conv2d(in_channel, out_channel, kernel_size, bias=False)
+        self.relu = nn.ReLU(inplace=True)
+        self.bnf = nn.BatchNorm1d(out_channel)
+        self.bnx = nn.BatchNorm1d(out_channel)
+        self.bny = nn.BatchNorm1d(out_channel)
+
+    def bn(self, x: Tensor) -> Tensor:
+        x[:, :, :, 0] = self.bnf(x[:, :, :, 0])
+        x[:, :, :, 1] = self.bnx(x[:, :, :, 1])
+        x[:, :, :, 2] = self.bny(x[:, :, :, 2])
+        return x
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+
+class FusionBlock(BaseModule):
+    """The fusion block of AFLink model.
+
+    Args:
+        in_channel (int): the dimension of the input channels.
+        out_channel (int): the dimension of the output channels.
+    """
+
+    def __init__(self, in_channel: int, out_channel: int):
+        super(FusionBlock, self).__init__()
+        self.conv = nn.Conv2d(in_channel, out_channel, (1, 3), bias=False)
+        self.bn = nn.BatchNorm2d(out_channel)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+
+class Classifier(BaseModule):
+    """The classifier of AFLink model.
+
+    Args:
+        in_channel (int): the dimension of the input channels.
+    """
+
+    def __init__(self, in_channel: int, out_channel: int):
+        super(Classifier, self).__init__()
+        self.fc1 = nn.Linear(in_channel * 2, in_channel // 2)
+        self.relu = nn.ReLU(inplace=True)
+        self.fc2 = nn.Linear(in_channel // 2, out_channel)
+
+    def forward(self, x1: Tensor, x2: Tensor) -> Tensor:
+        x = torch.cat((x1, x2), dim=1)
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.fc2(x)
+        return x
+
+
+class AFLinkModel(BaseModule):
+    """Appearance-Free Link Model."""
+
+    def __init__(self,
+                 temporal_module_channels: list = [1, 32, 64, 128, 256],
+                 fusion_module_channels: list = [256, 256],
+                 classifier_channels: list = [256, 2]):
+        super(AFLinkModel, self).__init__()
+        self.TemporalModule_1 = nn.Sequential(*[
+            TemporalBlock(temporal_module_channels[i],
+                          temporal_module_channels[i + 1])
+            for i in range(len(temporal_module_channels) - 1)
+        ])
+
+        self.TemporalModule_2 = nn.Sequential(*[
+            TemporalBlock(temporal_module_channels[i],
+                          temporal_module_channels[i + 1])
+            for i in range(len(temporal_module_channels) - 1)
+        ])
+
+        self.FusionBlock_1 = FusionBlock(*fusion_module_channels)
+        self.FusionBlock_2 = FusionBlock(*fusion_module_channels)
+
+        self.pooling = nn.AdaptiveAvgPool2d((1, 1))
+        self.classifier = Classifier(*classifier_channels)
+
+    def forward(self, x1: Tensor, x2: Tensor) -> Tensor:
+        assert not self.training, 'Only testing is supported for AFLink.'
+        x1 = x1[:, :, :, :3]
+        x2 = x2[:, :, :, :3]
+        x1 = self.TemporalModule_1(x1)  # [B,1,30,3] -> [B,256,6,3]
+        x2 = self.TemporalModule_2(x2)
+        x1 = self.FusionBlock_1(x1)
+        x2 = self.FusionBlock_2(x2)
+        x1 = self.pooling(x1).squeeze(-1).squeeze(-1)
+        x2 = self.pooling(x2).squeeze(-1).squeeze(-1)
+        y = self.classifier(x1, x2)
+        y = torch.softmax(y, dim=1)[0, 1]
+        return y
+
+
+@TASK_UTILS.register_module()
+class AppearanceFreeLink(BaseModule):
+    """Appearance-Free Link method.
+
+    This method is proposed in
+    "StrongSORT: Make DeepSORT Great Again"
+    `StrongSORT<https://arxiv.org/abs/2202.13514>`_.
+
+    Args:
+        checkpoint (str): Checkpoint path.
+        temporal_threshold (tuple, optional): The temporal constraint
+            for tracklets association. Defaults to (0, 30).
+        spatial_threshold (int, optional): The spatial constraint for
+            tracklets association. Defaults to 75.
+        confidence_threshold (float, optional): The minimum confidence
+            threshold for tracklets association. Defaults to 0.95.
+    """
+
+    def __init__(self,
+                 checkpoint: str,
+                 temporal_threshold: tuple = (0, 30),
+                 spatial_threshold: int = 75,
+                 confidence_threshold: float = 0.95):
+        super(AppearanceFreeLink, self).__init__()
+        self.temporal_threshold = temporal_threshold
+        self.spatial_threshold = spatial_threshold
+        self.confidence_threshold = confidence_threshold
+
+        self.model = AFLinkModel()
+        if checkpoint:
+            load_checkpoint(self.model, checkpoint)
+        if torch.cuda.is_available():
+            self.model.cuda()
+        self.model.eval()
+
+        self.device = next(self.model.parameters()).device
+        self.fn_l2 = lambda x, y: np.sqrt(x**2 + y**2)
+
+    def data_transform(self,
+                       track1: np.ndarray,
+                       track2: np.ndarray,
+                       length: int = 30) -> Tuple[np.ndarray]:
+        """Data Transformation. This is used to standardize the length of
+        tracks to a unified length. Then perform min-max normalization to the
+        motion embeddings.
+
+        Args:
+            track1 (ndarray): the first track with shape (N,C).
+            track2 (ndarray): the second track with shape (M,C).
+            length (int): the unified length of tracks. Defaults to 30.
+
+        Returns:
+            Tuple[ndarray]: the transformed track1 and track2.
+        """
+        # fill or cut track1
+        length_1 = track1.shape[0]
+        track1 = track1[-length:] if length_1 >= length else \
+            np.pad(track1, ((length - length_1, 0), (0, 0)))
+
+        # fill or cut track1
+        length_2 = track2.shape[0]
+        track2 = track2[:length] if length_2 >= length else \
+            np.pad(track2, ((0, length - length_2), (0, 0)))
+
+        # min-max normalization
+        min_ = np.concatenate((track1, track2), axis=0).min(axis=0)
+        max_ = np.concatenate((track1, track2), axis=0).max(axis=0)
+        subtractor = (max_ + min_) / 2
+        divisor = (max_ - min_) / 2 + 1e-5
+        track1 = (track1 - subtractor) / divisor
+        track2 = (track2 - subtractor) / divisor
+
+        return track1, track2
+
+    def forward(self, pred_tracks: np.ndarray) -> np.ndarray:
+        """Forward function.
+
+        pred_tracks (ndarray): With shape (N, 7). Each row denotes
+            (frame_id, track_id, x1, y1, x2, y2, score).
+
+        Returns:
+            ndarray: The linked tracks with shape (N, 7). Each row denotes
+                (frame_id, track_id, x1, y1, x2, y2, score)
+        """
+        # sort tracks by the frame id
+        pred_tracks = pred_tracks[np.argsort(pred_tracks[:, 0])]
+
+        # gather tracks information
+        id2info = defaultdict(list)
+        for row in pred_tracks:
+            frame_id, track_id, x1, y1, x2, y2 = row[:6]
+            id2info[track_id].append([frame_id, x1, y1, x2 - x1, y2 - y1])
+        id2info = {k: np.array(v) for k, v in id2info.items()}
+        num_track = len(id2info)
+        track_ids = np.array(list(id2info))
+        cost_matrix = np.full((num_track, num_track), INFINITY)
+
+        # compute the cost matrix
+        for i, id_i in enumerate(track_ids):
+            for j, id_j in enumerate(track_ids):
+                if id_i == id_j:
+                    continue
+                info_i, info_j = id2info[id_i], id2info[id_j]
+                frame_i, box_i = info_i[-1][0], info_i[-1][1:3]
+                frame_j, box_j = info_j[0][0], info_j[0][1:3]
+                # temporal constraint
+                if not self.temporal_threshold[0] <= \
+                        frame_j - frame_i <= self.temporal_threshold[1]:
+                    continue
+                # spatial constraint
+                if self.fn_l2(box_i[0] - box_j[0], box_i[1] - box_j[1]) \
+                        > self.spatial_threshold:
+                    continue
+                # confidence constraint
+                track_i, track_j = self.data_transform(info_i, info_j)
+
+                # numpy to torch
+                track_i = torch.tensor(
+                    track_i, dtype=torch.float).to(self.device)
+                track_j = torch.tensor(
+                    track_j, dtype=torch.float).to(self.device)
+                track_i = track_i.unsqueeze(0).unsqueeze(0)
+                track_j = track_j.unsqueeze(0).unsqueeze(0)
+
+                confidence = self.model(track_i,
+                                        track_j).detach().cpu().numpy()
+                if confidence >= self.confidence_threshold:
+                    cost_matrix[i, j] = 1 - confidence
+
+        # linear assignment
+        indices = linear_sum_assignment(cost_matrix)
+        _id2id = dict()  # the temporary assignment results
+        id2id = dict()  # the final assignment results
+        for i, j in zip(indices[0], indices[1]):
+            if cost_matrix[i, j] < INFINITY:
+                _id2id[i] = j
+        for k, v in _id2id.items():
+            if k in id2id:
+                id2id[v] = id2id[k]
+            else:
+                id2id[v] = k
+
+        # link
+        for k, v in id2id.items():
+            pred_tracks[pred_tracks[:, 1] == k, 1] = v
+
+        # deduplicate
+        _, index = np.unique(pred_tracks[:, :2], return_index=True, axis=0)
+
+        return pred_tracks[index]
diff --git a/mmde/mmdet/models/task_modules/tracking/camera_motion_compensation.py b/mmde/mmdet/models/task_modules/tracking/camera_motion_compensation.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a6298494fd1c24e0e7bba457dd50864725f98c8
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/tracking/camera_motion_compensation.py
@@ -0,0 +1,104 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import cv2
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import bbox_cxcyah_to_xyxy, bbox_xyxy_to_cxcyah
+
+
+@TASK_UTILS.register_module()
+class CameraMotionCompensation:
+    """Camera motion compensation.
+
+    Args:
+        warp_mode (str): Warp mode in opencv.
+            Defaults to 'cv2.MOTION_EUCLIDEAN'.
+        num_iters (int): Number of the iterations. Defaults to 50.
+        stop_eps (float): Terminate threshold. Defaults to 0.001.
+    """
+
+    def __init__(self,
+                 warp_mode: str = 'cv2.MOTION_EUCLIDEAN',
+                 num_iters: int = 50,
+                 stop_eps: float = 0.001):
+        self.warp_mode = eval(warp_mode)
+        self.num_iters = num_iters
+        self.stop_eps = stop_eps
+
+    def get_warp_matrix(self, img: np.ndarray, ref_img: np.ndarray) -> Tensor:
+        """Calculate warping matrix between two images."""
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        ref_img = cv2.cvtColor(ref_img, cv2.COLOR_BGR2GRAY)
+
+        warp_matrix = np.eye(2, 3, dtype=np.float32)
+        criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT,
+                    self.num_iters, self.stop_eps)
+        cc, warp_matrix = cv2.findTransformECC(img, ref_img, warp_matrix,
+                                               self.warp_mode, criteria, None,
+                                               1)
+        warp_matrix = torch.from_numpy(warp_matrix)
+        return warp_matrix
+
+    def warp_bboxes(self, bboxes: Tensor, warp_matrix: Tensor) -> Tensor:
+        """Warp bounding boxes according to the warping matrix."""
+        tl, br = bboxes[:, :2], bboxes[:, 2:]
+        tl = torch.cat((tl, torch.ones(tl.shape[0], 1).to(bboxes.device)),
+                       dim=1)
+        br = torch.cat((br, torch.ones(tl.shape[0], 1).to(bboxes.device)),
+                       dim=1)
+        trans_tl = torch.mm(warp_matrix, tl.t()).t()
+        trans_br = torch.mm(warp_matrix, br.t()).t()
+        trans_bboxes = torch.cat((trans_tl, trans_br), dim=1)
+        return trans_bboxes.to(bboxes.device)
+
+    def warp_means(self, means: np.ndarray, warp_matrix: Tensor) -> np.ndarray:
+        """Warp track.mean according to the warping matrix."""
+        cxcyah = torch.from_numpy(means[:, :4]).float()
+        xyxy = bbox_cxcyah_to_xyxy(cxcyah)
+        warped_xyxy = self.warp_bboxes(xyxy, warp_matrix)
+        warped_cxcyah = bbox_xyxy_to_cxcyah(warped_xyxy).numpy()
+        means[:, :4] = warped_cxcyah
+        return means
+
+    def track(self, img: Tensor, ref_img: Tensor, tracks: dict,
+              num_samples: int, frame_id: int, metainfo: dict) -> dict:
+        """Tracking forward."""
+        img = img.squeeze(0).cpu().numpy().transpose((1, 2, 0))
+        ref_img = ref_img.squeeze(0).cpu().numpy().transpose((1, 2, 0))
+        warp_matrix = self.get_warp_matrix(img, ref_img)
+
+        # rescale the warp_matrix due to the `resize` in pipeline
+        scale_factor_h, scale_factor_w = metainfo['scale_factor']
+        warp_matrix[0, 2] = warp_matrix[0, 2] / scale_factor_w
+        warp_matrix[1, 2] = warp_matrix[1, 2] / scale_factor_h
+
+        bboxes = []
+        num_bboxes = []
+        means = []
+        for k, v in tracks.items():
+            if int(v['frame_ids'][-1]) < frame_id - 1:
+                _num = 1
+            else:
+                _num = min(num_samples, len(v.bboxes))
+            num_bboxes.append(_num)
+            bboxes.extend(v.bboxes[-_num:])
+            if len(v.mean) > 0:
+                means.append(v.mean)
+        bboxes = torch.cat(bboxes, dim=0)
+        warped_bboxes = self.warp_bboxes(bboxes, warp_matrix.to(bboxes.device))
+
+        warped_bboxes = torch.split(warped_bboxes, num_bboxes)
+        for b, (k, v) in zip(warped_bboxes, tracks.items()):
+            _num = b.shape[0]
+            b = torch.split(b, [1] * _num)
+            tracks[k].bboxes[-_num:] = b
+
+        if means:
+            means = np.asarray(means)
+            warped_means = self.warp_means(means, warp_matrix)
+            for m, (k, v) in zip(warped_means, tracks.items()):
+                tracks[k].mean = m
+
+        return tracks
diff --git a/mmde/mmdet/models/task_modules/tracking/interpolation.py b/mmde/mmdet/models/task_modules/tracking/interpolation.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb6a25af4f253e3ec6b9781831ff43c6bafe50e1
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/tracking/interpolation.py
@@ -0,0 +1,168 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+try:
+    from sklearn.gaussian_process import GaussianProcessRegressor as GPR
+    from sklearn.gaussian_process.kernels import RBF
+    HAS_SKIKIT_LEARN = True
+except ImportError:
+    HAS_SKIKIT_LEARN = False
+
+from mmdet.registry import TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class InterpolateTracklets:
+    """Interpolate tracks to make tracks more complete.
+
+    Args:
+        min_num_frames (int, optional): The minimum length of a track that will
+            be interpolated. Defaults to 5.
+        max_num_frames (int, optional): The maximum disconnected length in
+            a track. Defaults to 20.
+        use_gsi (bool, optional): Whether to use the GSI (Gaussian-smoothed
+            interpolation) method. Defaults to False.
+        smooth_tau (int, optional): smoothing parameter in GSI. Defaults to 10.
+    """
+
+    def __init__(self,
+                 min_num_frames: int = 5,
+                 max_num_frames: int = 20,
+                 use_gsi: bool = False,
+                 smooth_tau: int = 10):
+        if not HAS_SKIKIT_LEARN:
+            raise RuntimeError('sscikit-learn is not installed,\
+                 please install it by: pip install scikit-learn')
+        self.min_num_frames = min_num_frames
+        self.max_num_frames = max_num_frames
+        self.use_gsi = use_gsi
+        self.smooth_tau = smooth_tau
+
+    def _interpolate_track(self,
+                           track: np.ndarray,
+                           track_id: int,
+                           max_num_frames: int = 20) -> np.ndarray:
+        """Interpolate a track linearly to make the track more complete.
+
+        This function is proposed in
+        "ByteTrack: Multi-Object Tracking by Associating Every Detection Box."
+        `ByteTrack<https://arxiv.org/abs/2110.06864>`_.
+
+        Args:
+            track (ndarray): With shape (N, 7). Each row denotes
+                (frame_id, track_id, x1, y1, x2, y2, score).
+            max_num_frames (int, optional): The maximum disconnected length in
+                the track. Defaults to 20.
+
+        Returns:
+            ndarray: The interpolated track with shape (N, 7). Each row denotes
+                (frame_id, track_id, x1, y1, x2, y2, score)
+        """
+        assert (track[:, 1] == track_id).all(), \
+            'The track id should not changed when interpolate a track.'
+
+        frame_ids = track[:, 0]
+        interpolated_track = np.zeros((0, 7))
+        # perform interpolation for the disconnected frames in the track.
+        for i in np.where(np.diff(frame_ids) > 1)[0]:
+            left_frame_id = frame_ids[i]
+            right_frame_id = frame_ids[i + 1]
+            num_disconnected_frames = int(right_frame_id - left_frame_id)
+
+            if 1 < num_disconnected_frames < max_num_frames:
+                left_bbox = track[i, 2:6]
+                right_bbox = track[i + 1, 2:6]
+
+                # perform interpolation for two adjacent tracklets.
+                for j in range(1, num_disconnected_frames):
+                    cur_bbox = j / (num_disconnected_frames) * (
+                        right_bbox - left_bbox) + left_bbox
+                    cur_result = np.ones((7, ))
+                    cur_result[0] = j + left_frame_id
+                    cur_result[1] = track_id
+                    cur_result[2:6] = cur_bbox
+
+                    interpolated_track = np.concatenate(
+                        (interpolated_track, cur_result[None]), axis=0)
+
+        interpolated_track = np.concatenate((track, interpolated_track),
+                                            axis=0)
+        return interpolated_track
+
+    def gaussian_smoothed_interpolation(self,
+                                        track: np.ndarray,
+                                        smooth_tau: int = 10) -> np.ndarray:
+        """Gaussian-Smoothed Interpolation.
+
+        This function is proposed in
+        "StrongSORT: Make DeepSORT Great Again"
+        `StrongSORT<https://arxiv.org/abs/2202.13514>`_.
+
+        Args:
+            track (ndarray): With shape (N, 7). Each row denotes
+                (frame_id, track_id, x1, y1, x2, y2, score).
+            smooth_tau (int, optional): smoothing parameter in GSI.
+                Defaults to 10.
+
+        Returns:
+            ndarray: The interpolated tracks with shape (N, 7). Each row
+                denotes (frame_id, track_id, x1, y1, x2, y2, score)
+        """
+        len_scale = np.clip(smooth_tau * np.log(smooth_tau**3 / len(track)),
+                            smooth_tau**-1, smooth_tau**2)
+        gpr = GPR(RBF(len_scale, 'fixed'))
+        t = track[:, 0].reshape(-1, 1)
+        x1 = track[:, 2].reshape(-1, 1)
+        y1 = track[:, 3].reshape(-1, 1)
+        x2 = track[:, 4].reshape(-1, 1)
+        y2 = track[:, 5].reshape(-1, 1)
+        gpr.fit(t, x1)
+        x1_gpr = gpr.predict(t)
+        gpr.fit(t, y1)
+        y1_gpr = gpr.predict(t)
+        gpr.fit(t, x2)
+        x2_gpr = gpr.predict(t)
+        gpr.fit(t, y2)
+        y2_gpr = gpr.predict(t)
+        gsi_track = [[
+            t[i, 0], track[i, 1], x1_gpr[i], y1_gpr[i], x2_gpr[i], y2_gpr[i],
+            track[i, 6]
+        ] for i in range(len(t))]
+        return np.array(gsi_track)
+
+    def forward(self, pred_tracks: np.ndarray) -> np.ndarray:
+        """Forward function.
+
+        pred_tracks (ndarray): With shape (N, 7). Each row denotes
+            (frame_id, track_id, x1, y1, x2, y2, score).
+
+        Returns:
+            ndarray: The interpolated tracks with shape (N, 7). Each row
+            denotes (frame_id, track_id, x1, y1, x2, y2, score).
+        """
+        max_track_id = int(np.max(pred_tracks[:, 1]))
+        min_track_id = int(np.min(pred_tracks[:, 1]))
+
+        # perform interpolation for each track
+        interpolated_tracks = []
+        for track_id in range(min_track_id, max_track_id + 1):
+            inds = pred_tracks[:, 1] == track_id
+            track = pred_tracks[inds]
+            num_frames = len(track)
+            if num_frames <= 2:
+                continue
+
+            if num_frames > self.min_num_frames:
+                interpolated_track = self._interpolate_track(
+                    track, track_id, self.max_num_frames)
+            else:
+                interpolated_track = track
+
+            if self.use_gsi:
+                interpolated_track = self.gaussian_smoothed_interpolation(
+                    interpolated_track, self.smooth_tau)
+
+            interpolated_tracks.append(interpolated_track)
+
+        interpolated_tracks = np.concatenate(interpolated_tracks)
+        return interpolated_tracks[interpolated_tracks[:, 0].argsort()]
diff --git a/mmde/mmdet/models/task_modules/tracking/kalman_filter.py b/mmde/mmdet/models/task_modules/tracking/kalman_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8ae1416af69bce17fd20dd5231eba2f12f7ed64
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/tracking/kalman_filter.py
@@ -0,0 +1,267 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import numpy as np
+import torch
+
+try:
+    import scipy.linalg
+    HAS_SCIPY = True
+except ImportError:
+    HAS_SCIPY = False
+
+from mmdet.registry import TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class KalmanFilter:
+    """A simple Kalman filter for tracking bounding boxes in image space.
+
+    The implementation is referred to https://github.com/nwojke/deep_sort.
+
+    Args:
+        center_only (bool): If True, distance computation is done with
+            respect to the bounding box center position only.
+            Defaults to False.
+        use_nsa (bool): Whether to use the NSA (Noise Scale Adaptive) Kalman
+            Filter, which adaptively modulates the noise scale according to
+            the quality of detections. More details in
+            https://arxiv.org/abs/2202.11983. Defaults to False.
+    """
+    chi2inv95 = {
+        1: 3.8415,
+        2: 5.9915,
+        3: 7.8147,
+        4: 9.4877,
+        5: 11.070,
+        6: 12.592,
+        7: 14.067,
+        8: 15.507,
+        9: 16.919
+    }
+
+    def __init__(self, center_only: bool = False, use_nsa: bool = False):
+        if not HAS_SCIPY:
+            raise RuntimeError('sscikit-learn is not installed,\
+                 please install it by: pip install scikit-learn')
+        self.center_only = center_only
+        if self.center_only:
+            self.gating_threshold = self.chi2inv95[2]
+        else:
+            self.gating_threshold = self.chi2inv95[4]
+
+        self.use_nsa = use_nsa
+        ndim, dt = 4, 1.
+
+        # Create Kalman filter model matrices.
+        self._motion_mat = np.eye(2 * ndim, 2 * ndim)
+        for i in range(ndim):
+            self._motion_mat[i, ndim + i] = dt
+        self._update_mat = np.eye(ndim, 2 * ndim)
+
+        # Motion and observation uncertainty are chosen relative to the current
+        # state estimate. These weights control the amount of uncertainty in
+        # the model. This is a bit hacky.
+        self._std_weight_position = 1. / 20
+        self._std_weight_velocity = 1. / 160
+
+    def initiate(self, measurement: np.array) -> Tuple[np.array, np.array]:
+        """Create track from unassociated measurement.
+
+        Args:
+            measurement (ndarray):  Bounding box coordinates (x, y, a, h) with
+            center position (x, y), aspect ratio a, and height h.
+
+        Returns:
+             (ndarray, ndarray): Returns the mean vector (8 dimensional) and
+                covariance matrix (8x8 dimensional) of the new track.
+                Unobserved velocities are initialized to 0 mean.
+        """
+        mean_pos = measurement
+        mean_vel = np.zeros_like(mean_pos)
+        mean = np.r_[mean_pos, mean_vel]
+
+        std = [
+            2 * self._std_weight_position * measurement[3],
+            2 * self._std_weight_position * measurement[3], 1e-2,
+            2 * self._std_weight_position * measurement[3],
+            10 * self._std_weight_velocity * measurement[3],
+            10 * self._std_weight_velocity * measurement[3], 1e-5,
+            10 * self._std_weight_velocity * measurement[3]
+        ]
+        covariance = np.diag(np.square(std))
+        return mean, covariance
+
+    def predict(self, mean: np.array,
+                covariance: np.array) -> Tuple[np.array, np.array]:
+        """Run Kalman filter prediction step.
+
+        Args:
+            mean (ndarray): The 8 dimensional mean vector of the object
+                state at the previous time step.
+
+            covariance (ndarray): The 8x8 dimensional covariance matrix
+                of the object state at the previous time step.
+
+        Returns:
+            (ndarray, ndarray): Returns the mean vector and covariance
+                matrix of the predicted state. Unobserved velocities are
+                initialized to 0 mean.
+        """
+        std_pos = [
+            self._std_weight_position * mean[3],
+            self._std_weight_position * mean[3], 1e-2,
+            self._std_weight_position * mean[3]
+        ]
+        std_vel = [
+            self._std_weight_velocity * mean[3],
+            self._std_weight_velocity * mean[3], 1e-5,
+            self._std_weight_velocity * mean[3]
+        ]
+        motion_cov = np.diag(np.square(np.r_[std_pos, std_vel]))
+
+        mean = np.dot(self._motion_mat, mean)
+        covariance = np.linalg.multi_dot(
+            (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov
+
+        return mean, covariance
+
+    def project(self,
+                mean: np.array,
+                covariance: np.array,
+                bbox_score: float = 0.) -> Tuple[np.array, np.array]:
+        """Project state distribution to measurement space.
+
+        Args:
+            mean (ndarray): The state's mean vector (8 dimensional array).
+            covariance (ndarray): The state's covariance matrix (8x8
+                dimensional).
+            bbox_score (float): The confidence score of the bbox.
+                Defaults to 0.
+
+        Returns:
+            (ndarray, ndarray):  Returns the projected mean and covariance
+            matrix of the given state estimate.
+        """
+        std = [
+            self._std_weight_position * mean[3],
+            self._std_weight_position * mean[3], 1e-1,
+            self._std_weight_position * mean[3]
+        ]
+
+        if self.use_nsa:
+            std = [(1 - bbox_score) * x for x in std]
+
+        innovation_cov = np.diag(np.square(std))
+
+        mean = np.dot(self._update_mat, mean)
+        covariance = np.linalg.multi_dot(
+            (self._update_mat, covariance, self._update_mat.T))
+        return mean, covariance + innovation_cov
+
+    def update(self,
+               mean: np.array,
+               covariance: np.array,
+               measurement: np.array,
+               bbox_score: float = 0.) -> Tuple[np.array, np.array]:
+        """Run Kalman filter correction step.
+
+        Args:
+            mean (ndarray): The predicted state's mean vector (8 dimensional).
+            covariance (ndarray): The state's covariance matrix (8x8
+                dimensional).
+            measurement (ndarray): The 4 dimensional measurement vector
+                (x, y, a, h), where (x, y) is the center position, a the
+                aspect ratio, and h the height of the bounding box.
+            bbox_score (float): The confidence score of the bbox.
+                Defaults to 0.
+
+        Returns:
+             (ndarray, ndarray): Returns the measurement-corrected state
+             distribution.
+        """
+        projected_mean, projected_cov = \
+            self.project(mean, covariance, bbox_score)
+
+        chol_factor, lower = scipy.linalg.cho_factor(
+            projected_cov, lower=True, check_finite=False)
+        kalman_gain = scipy.linalg.cho_solve((chol_factor, lower),
+                                             np.dot(covariance,
+                                                    self._update_mat.T).T,
+                                             check_finite=False).T
+        innovation = measurement - projected_mean
+
+        new_mean = mean + np.dot(innovation, kalman_gain.T)
+        new_covariance = covariance - np.linalg.multi_dot(
+            (kalman_gain, projected_cov, kalman_gain.T))
+        return new_mean, new_covariance
+
+    def gating_distance(self,
+                        mean: np.array,
+                        covariance: np.array,
+                        measurements: np.array,
+                        only_position: bool = False) -> np.array:
+        """Compute gating distance between state distribution and measurements.
+
+        A suitable distance threshold can be obtained from `chi2inv95`. If
+        `only_position` is False, the chi-square distribution has 4 degrees of
+        freedom, otherwise 2.
+
+        Args:
+            mean (ndarray): Mean vector over the state distribution (8
+                dimensional).
+            covariance (ndarray): Covariance of the state distribution (8x8
+                dimensional).
+            measurements (ndarray): An Nx4 dimensional matrix of N
+                measurements, each in format (x, y, a, h) where (x, y) is the
+                bounding box center position, a the aspect ratio, and h the
+                height.
+            only_position (bool, optional): If True, distance computation is
+                done with respect to the bounding box center position only.
+                Defaults to False.
+
+        Returns:
+            ndarray: Returns an array of length N, where the i-th element
+            contains the squared Mahalanobis distance between
+            (mean, covariance) and `measurements[i]`.
+        """
+        mean, covariance = self.project(mean, covariance)
+        if only_position:
+            mean, covariance = mean[:2], covariance[:2, :2]
+            measurements = measurements[:, :2]
+
+        cholesky_factor = np.linalg.cholesky(covariance)
+        d = measurements - mean
+        z = scipy.linalg.solve_triangular(
+            cholesky_factor,
+            d.T,
+            lower=True,
+            check_finite=False,
+            overwrite_b=True)
+        squared_maha = np.sum(z * z, axis=0)
+        return squared_maha
+
+    def track(self, tracks: dict,
+              bboxes: torch.Tensor) -> Tuple[dict, np.array]:
+        """Track forward.
+
+        Args:
+            tracks (dict[int:dict]): Track buffer.
+            bboxes (Tensor): Detected bounding boxes.
+
+        Returns:
+            (dict[int:dict], ndarray): Updated tracks and bboxes.
+        """
+        costs = []
+        for id, track in tracks.items():
+            track.mean, track.covariance = self.predict(
+                track.mean, track.covariance)
+            gating_distance = self.gating_distance(track.mean,
+                                                   track.covariance,
+                                                   bboxes.cpu().numpy(),
+                                                   self.center_only)
+            costs.append(gating_distance)
+
+        costs = np.stack(costs, 0)
+        costs[costs > self.gating_threshold] = np.nan
+        return tracks, costs
diff --git a/mmde/mmdet/models/task_modules/tracking/similarity.py b/mmde/mmdet/models/task_modules/tracking/similarity.py
new file mode 100644
index 0000000000000000000000000000000000000000..730e43b86214ae92ffdcab8ae39e6f9261075caa
--- /dev/null
+++ b/mmde/mmdet/models/task_modules/tracking/similarity.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+
+def embed_similarity(key_embeds: Tensor,
+                     ref_embeds: Tensor,
+                     method: str = 'dot_product',
+                     temperature: int = -1) -> Tensor:
+    """Calculate feature similarity from embeddings.
+
+    Args:
+        key_embeds (Tensor): Shape (N1, C).
+        ref_embeds (Tensor): Shape (N2, C).
+        method (str, optional): Method to calculate the similarity,
+            options are 'dot_product' and 'cosine'. Defaults to
+            'dot_product'.
+        temperature (int, optional): Softmax temperature. Defaults to -1.
+
+    Returns:
+        Tensor: Similarity matrix of shape (N1, N2).
+    """
+    assert method in ['dot_product', 'cosine']
+
+    if method == 'cosine':
+        key_embeds = F.normalize(key_embeds, p=2, dim=1)
+        ref_embeds = F.normalize(ref_embeds, p=2, dim=1)
+
+    similarity = torch.mm(key_embeds, ref_embeds.T)
+
+    if temperature > 0:
+        similarity /= float(temperature)
+    return similarity
diff --git a/mmde/mmdet/models/test_time_augs/__init__.py b/mmde/mmdet/models/test_time_augs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e4926efb011b45b3ab7d3d303fb2d105aaa192
--- /dev/null
+++ b/mmde/mmdet/models/test_time_augs/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .det_tta import DetTTAModel
+from .merge_augs import (merge_aug_bboxes, merge_aug_masks,
+                         merge_aug_proposals, merge_aug_results,
+                         merge_aug_scores)
+
+__all__ = [
+    'merge_aug_bboxes', 'merge_aug_masks', 'merge_aug_proposals',
+    'merge_aug_scores', 'merge_aug_results', 'DetTTAModel'
+]
diff --git a/mmde/mmdet/models/test_time_augs/det_tta.py b/mmde/mmdet/models/test_time_augs/det_tta.py
new file mode 100644
index 0000000000000000000000000000000000000000..95f91db9e1250358db0e1a572cf4c37cc7fe6e6f
--- /dev/null
+++ b/mmde/mmdet/models/test_time_augs/det_tta.py
@@ -0,0 +1,144 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from mmcv.ops import batched_nms
+from mmengine.model import BaseTTAModel
+from mmengine.registry import MODELS
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.structures import DetDataSample
+from mmdet.structures.bbox import bbox_flip
+
+
+@MODELS.register_module()
+class DetTTAModel(BaseTTAModel):
+    """Merge augmented detection results, only bboxes corresponding score under
+    flipping and multi-scale resizing can be processed now.
+
+    Examples:
+        >>> tta_model = dict(
+        >>>     type='DetTTAModel',
+        >>>     tta_cfg=dict(nms=dict(
+        >>>                     type='nms',
+        >>>                     iou_threshold=0.5),
+        >>>                     max_per_img=100))
+        >>>
+        >>> tta_pipeline = [
+        >>>     dict(type='LoadImageFromFile',
+        >>>          backend_args=None),
+        >>>     dict(
+        >>>         type='TestTimeAug',
+        >>>         transforms=[[
+        >>>             dict(type='Resize',
+        >>>                  scale=(1333, 800),
+        >>>                  keep_ratio=True),
+        >>>         ], [
+        >>>             dict(type='RandomFlip', prob=1.),
+        >>>             dict(type='RandomFlip', prob=0.)
+        >>>         ], [
+        >>>             dict(
+        >>>                 type='PackDetInputs',
+        >>>                 meta_keys=('img_id', 'img_path', 'ori_shape',
+        >>>                         'img_shape', 'scale_factor', 'flip',
+        >>>                         'flip_direction'))
+        >>>         ]])]
+    """
+
+    def __init__(self, tta_cfg=None, **kwargs):
+        super().__init__(**kwargs)
+        self.tta_cfg = tta_cfg
+
+    def merge_aug_bboxes(self, aug_bboxes: List[Tensor],
+                         aug_scores: List[Tensor],
+                         img_metas: List[str]) -> Tuple[Tensor, Tensor]:
+        """Merge augmented detection bboxes and scores.
+
+        Args:
+            aug_bboxes (list[Tensor]): shape (n, 4*#class)
+            aug_scores (list[Tensor] or None): shape (n, #class)
+        Returns:
+            tuple[Tensor]: ``bboxes`` with shape (n,4), where
+            4 represent (tl_x, tl_y, br_x, br_y)
+            and ``scores`` with shape (n,).
+        """
+        recovered_bboxes = []
+        for bboxes, img_info in zip(aug_bboxes, img_metas):
+            ori_shape = img_info['ori_shape']
+            flip = img_info['flip']
+            flip_direction = img_info['flip_direction']
+            if flip:
+                bboxes = bbox_flip(
+                    bboxes=bboxes,
+                    img_shape=ori_shape,
+                    direction=flip_direction)
+            recovered_bboxes.append(bboxes)
+        bboxes = torch.cat(recovered_bboxes, dim=0)
+        if aug_scores is None:
+            return bboxes
+        else:
+            scores = torch.cat(aug_scores, dim=0)
+            return bboxes, scores
+
+    def merge_preds(self, data_samples_list: List[List[DetDataSample]]):
+        """Merge batch predictions of enhanced data.
+
+        Args:
+            data_samples_list (List[List[DetDataSample]]): List of predictions
+                of all enhanced data. The outer list indicates images, and the
+                inner list corresponds to the different views of one image.
+                Each element of the inner list is a ``DetDataSample``.
+        Returns:
+            List[DetDataSample]: Merged batch prediction.
+        """
+        merged_data_samples = []
+        for data_samples in data_samples_list:
+            merged_data_samples.append(self._merge_single_sample(data_samples))
+        return merged_data_samples
+
+    def _merge_single_sample(
+            self, data_samples: List[DetDataSample]) -> DetDataSample:
+        """Merge predictions which come form the different views of one image
+        to one prediction.
+
+        Args:
+            data_samples (List[DetDataSample]): List of predictions
+            of enhanced data which come form one image.
+        Returns:
+            List[DetDataSample]: Merged prediction.
+        """
+        aug_bboxes = []
+        aug_scores = []
+        aug_labels = []
+        img_metas = []
+        # TODO: support instance segmentation TTA
+        assert data_samples[0].pred_instances.get('masks', None) is None, \
+            'TTA of instance segmentation does not support now.'
+        for data_sample in data_samples:
+            aug_bboxes.append(data_sample.pred_instances.bboxes)
+            aug_scores.append(data_sample.pred_instances.scores)
+            aug_labels.append(data_sample.pred_instances.labels)
+            img_metas.append(data_sample.metainfo)
+
+        merged_bboxes, merged_scores = self.merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas)
+        merged_labels = torch.cat(aug_labels, dim=0)
+
+        if merged_bboxes.numel() == 0:
+            return data_samples[0]
+
+        det_bboxes, keep_idxs = batched_nms(merged_bboxes, merged_scores,
+                                            merged_labels, self.tta_cfg.nms)
+
+        det_bboxes = det_bboxes[:self.tta_cfg.max_per_img]
+        det_labels = merged_labels[keep_idxs][:self.tta_cfg.max_per_img]
+
+        results = InstanceData()
+        _det_bboxes = det_bboxes.clone()
+        results.bboxes = _det_bboxes[:, :-1]
+        results.scores = _det_bboxes[:, -1]
+        results.labels = det_labels
+        det_results = data_samples[0]
+        det_results.pred_instances = results
+        return det_results
diff --git a/mmde/mmdet/models/test_time_augs/merge_augs.py b/mmde/mmdet/models/test_time_augs/merge_augs.py
new file mode 100644
index 0000000000000000000000000000000000000000..5935a8614c39d70253a09a339f51c144661c64fb
--- /dev/null
+++ b/mmde/mmdet/models/test_time_augs/merge_augs.py
@@ -0,0 +1,219 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+import torch
+from mmcv.ops import nms
+from mmengine.config import ConfigDict
+from torch import Tensor
+
+from mmdet.structures.bbox import bbox_mapping_back
+
+
+# TODO remove this, never be used in mmdet
+def merge_aug_proposals(aug_proposals, img_metas, cfg):
+    """Merge augmented proposals (multiscale, flip, etc.)
+
+    Args:
+        aug_proposals (list[Tensor]): proposals from different testing
+            schemes, shape (n, 5). Note that they are not rescaled to the
+            original image size.
+
+        img_metas (list[dict]): list of image info dict where each dict has:
+            'img_shape', 'scale_factor', 'flip', and may also contain
+            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+            For details on the values of these keys see
+            `mmdet/datasets/pipelines/formatting.py:Collect`.
+
+        cfg (dict): rpn test config.
+
+    Returns:
+        Tensor: shape (n, 4), proposals corresponding to original image scale.
+    """
+
+    cfg = copy.deepcopy(cfg)
+
+    # deprecate arguments warning
+    if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
+        warnings.warn(
+            'In rpn_proposal or test_cfg, '
+            'nms_thr has been moved to a dict named nms as '
+            'iou_threshold, max_num has been renamed as max_per_img, '
+            'name of original arguments and the way to specify '
+            'iou_threshold of NMS will be deprecated.')
+    if 'nms' not in cfg:
+        cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
+    if 'max_num' in cfg:
+        if 'max_per_img' in cfg:
+            assert cfg.max_num == cfg.max_per_img, f'You set max_num and ' \
+                f'max_per_img at the same time, but get {cfg.max_num} ' \
+                f'and {cfg.max_per_img} respectively' \
+                f'Please delete max_num which will be deprecated.'
+        else:
+            cfg.max_per_img = cfg.max_num
+    if 'nms_thr' in cfg:
+        assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \
+            f'iou_threshold in nms and ' \
+            f'nms_thr at the same time, but get ' \
+            f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \
+            f' respectively. Please delete the nms_thr ' \
+            f'which will be deprecated.'
+
+    recovered_proposals = []
+    for proposals, img_info in zip(aug_proposals, img_metas):
+        img_shape = img_info['img_shape']
+        scale_factor = img_info['scale_factor']
+        flip = img_info['flip']
+        flip_direction = img_info['flip_direction']
+        _proposals = proposals.clone()
+        _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
+                                              scale_factor, flip,
+                                              flip_direction)
+        recovered_proposals.append(_proposals)
+    aug_proposals = torch.cat(recovered_proposals, dim=0)
+    merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(),
+                              aug_proposals[:, -1].contiguous(),
+                              cfg.nms.iou_threshold)
+    scores = merged_proposals[:, 4]
+    _, order = scores.sort(0, descending=True)
+    num = min(cfg.max_per_img, merged_proposals.shape[0])
+    order = order[:num]
+    merged_proposals = merged_proposals[order, :]
+    return merged_proposals
+
+
+# TODO remove this, never be used in mmdet
+def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg):
+    """Merge augmented detection bboxes and scores.
+
+    Args:
+        aug_bboxes (list[Tensor]): shape (n, 4*#class)
+        aug_scores (list[Tensor] or None): shape (n, #class)
+        img_shapes (list[Tensor]): shape (3, ).
+        rcnn_test_cfg (dict): rcnn test config.
+
+    Returns:
+        tuple: (bboxes, scores)
+    """
+    recovered_bboxes = []
+    for bboxes, img_info in zip(aug_bboxes, img_metas):
+        img_shape = img_info[0]['img_shape']
+        scale_factor = img_info[0]['scale_factor']
+        flip = img_info[0]['flip']
+        flip_direction = img_info[0]['flip_direction']
+        bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip,
+                                   flip_direction)
+        recovered_bboxes.append(bboxes)
+    bboxes = torch.stack(recovered_bboxes).mean(dim=0)
+    if aug_scores is None:
+        return bboxes
+    else:
+        scores = torch.stack(aug_scores).mean(dim=0)
+        return bboxes, scores
+
+
+def merge_aug_results(aug_batch_results, aug_batch_img_metas):
+    """Merge augmented detection results, only bboxes corresponding score under
+    flipping and multi-scale resizing can be processed now.
+
+    Args:
+        aug_batch_results (list[list[[obj:`InstanceData`]]):
+            Detection results of multiple images with
+            different augmentations.
+            The outer list indicate the augmentation . The inter
+            list indicate the batch dimension.
+            Each item usually contains the following keys.
+
+            - scores (Tensor): Classification scores, in shape
+              (num_instance,)
+            - labels (Tensor): Labels of bboxes, in shape
+              (num_instances,).
+            - bboxes (Tensor): In shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        aug_batch_img_metas (list[list[dict]]): The outer list
+            indicates test-time augs (multiscale, flip, etc.)
+            and the inner list indicates
+            images in a batch. Each dict in the list contains
+            information of an image in the batch.
+
+    Returns:
+        batch_results (list[obj:`InstanceData`]): Same with
+        the input `aug_results` except that all bboxes have
+        been mapped to the original scale.
+    """
+    num_augs = len(aug_batch_results)
+    num_imgs = len(aug_batch_results[0])
+
+    batch_results = []
+    aug_batch_results = copy.deepcopy(aug_batch_results)
+    for img_id in range(num_imgs):
+        aug_results = []
+        for aug_id in range(num_augs):
+            img_metas = aug_batch_img_metas[aug_id][img_id]
+            results = aug_batch_results[aug_id][img_id]
+
+            img_shape = img_metas['img_shape']
+            scale_factor = img_metas['scale_factor']
+            flip = img_metas['flip']
+            flip_direction = img_metas['flip_direction']
+            bboxes = bbox_mapping_back(results.bboxes, img_shape, scale_factor,
+                                       flip, flip_direction)
+            results.bboxes = bboxes
+            aug_results.append(results)
+        merged_aug_results = results.cat(aug_results)
+        batch_results.append(merged_aug_results)
+
+    return batch_results
+
+
+def merge_aug_scores(aug_scores):
+    """Merge augmented bbox scores."""
+    if isinstance(aug_scores[0], torch.Tensor):
+        return torch.mean(torch.stack(aug_scores), dim=0)
+    else:
+        return np.mean(aug_scores, axis=0)
+
+
+def merge_aug_masks(aug_masks: List[Tensor],
+                    img_metas: dict,
+                    weights: Optional[Union[list, Tensor]] = None) -> Tensor:
+    """Merge augmented mask prediction.
+
+    Args:
+        aug_masks (list[Tensor]): each has shape
+            (n, c, h, w).
+        img_metas (dict): Image information.
+        weights (list or Tensor): Weight of each aug_masks,
+            the length should be n.
+
+    Returns:
+        Tensor: has shape (n, c, h, w)
+    """
+    recovered_masks = []
+    for i, mask in enumerate(aug_masks):
+        if weights is not None:
+            assert len(weights) == len(aug_masks)
+            weight = weights[i]
+        else:
+            weight = 1
+        flip = img_metas.get('flip', False)
+        if flip:
+            flip_direction = img_metas['flip_direction']
+            if flip_direction == 'horizontal':
+                mask = mask[:, :, :, ::-1]
+            elif flip_direction == 'vertical':
+                mask = mask[:, :, ::-1, :]
+            elif flip_direction == 'diagonal':
+                mask = mask[:, :, :, ::-1]
+                mask = mask[:, :, ::-1, :]
+            else:
+                raise ValueError(
+                    f"Invalid flipping direction '{flip_direction}'")
+        recovered_masks.append(mask[None, :] * weight)
+
+    merged_masks = torch.cat(recovered_masks, 0).mean(dim=0)
+    if weights is not None:
+        merged_masks = merged_masks * len(weights) / sum(weights)
+    return merged_masks
diff --git a/mmde/mmdet/models/trackers/__init__.py b/mmde/mmdet/models/trackers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..00284bb7b40dd007c28b6cc9175ac26a52c6c528
--- /dev/null
+++ b/mmde/mmdet/models/trackers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_tracker import BaseTracker
+from .byte_tracker import ByteTracker
+from .masktrack_rcnn_tracker import MaskTrackRCNNTracker
+from .ocsort_tracker import OCSORTTracker
+from .quasi_dense_tracker import QuasiDenseTracker
+from .sort_tracker import SORTTracker
+from .strongsort_tracker import StrongSORTTracker
+
+__all__ = [
+    'BaseTracker', 'ByteTracker', 'QuasiDenseTracker', 'SORTTracker',
+    'StrongSORTTracker', 'OCSORTTracker', 'MaskTrackRCNNTracker'
+]
diff --git a/mmde/mmdet/models/trackers/base_tracker.py b/mmde/mmdet/models/trackers/base_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cf188653cd9adda59decd45f65fc4ede63fe3a7
--- /dev/null
+++ b/mmde/mmdet/models/trackers/base_tracker.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from addict import Dict
+
+
+class BaseTracker(metaclass=ABCMeta):
+    """Base tracker model.
+
+    Args:
+        momentums (dict[str:float], optional): Momentums to update the buffers.
+            The `str` indicates the name of the buffer while the `float`
+            indicates the momentum. Defaults to None.
+        num_frames_retain (int, optional). If a track is disappeared more than
+            `num_frames_retain` frames, it will be deleted in the memo.
+             Defaults to 10.
+    """
+
+    def __init__(self,
+                 momentums: Optional[dict] = None,
+                 num_frames_retain: int = 10) -> None:
+        super().__init__()
+        if momentums is not None:
+            assert isinstance(momentums, dict), 'momentums must be a dict'
+        self.momentums = momentums
+        self.num_frames_retain = num_frames_retain
+
+        self.reset()
+
+    def reset(self) -> None:
+        """Reset the buffer of the tracker."""
+        self.num_tracks = 0
+        self.tracks = dict()
+
+    @property
+    def empty(self) -> bool:
+        """Whether the buffer is empty or not."""
+        return False if self.tracks else True
+
+    @property
+    def ids(self) -> List[dict]:
+        """All ids in the tracker."""
+        return list(self.tracks.keys())
+
+    @property
+    def with_reid(self) -> bool:
+        """bool: whether the framework has a reid model"""
+        return hasattr(self, 'reid') and self.reid is not None
+
+    def update(self, **kwargs) -> None:
+        """Update the tracker.
+
+        Args:
+            kwargs (dict[str: Tensor | int]): The `str` indicates the
+                name of the input variable. `ids` and `frame_ids` are
+                obligatory in the keys.
+        """
+        memo_items = [k for k, v in kwargs.items() if v is not None]
+        rm_items = [k for k in kwargs.keys() if k not in memo_items]
+        for item in rm_items:
+            kwargs.pop(item)
+        if not hasattr(self, 'memo_items'):
+            self.memo_items = memo_items
+        else:
+            assert memo_items == self.memo_items
+
+        assert 'ids' in memo_items
+        num_objs = len(kwargs['ids'])
+        id_indice = memo_items.index('ids')
+        assert 'frame_ids' in memo_items
+        frame_id = int(kwargs['frame_ids'])
+        if isinstance(kwargs['frame_ids'], int):
+            kwargs['frame_ids'] = torch.tensor([kwargs['frame_ids']] *
+                                               num_objs)
+        # cur_frame_id = int(kwargs['frame_ids'][0])
+        for k, v in kwargs.items():
+            if len(v) != num_objs:
+                raise ValueError('kwargs value must both equal')
+
+        for obj in zip(*kwargs.values()):
+            id = int(obj[id_indice])
+            if id in self.tracks:
+                self.update_track(id, obj)
+            else:
+                self.init_track(id, obj)
+
+        self.pop_invalid_tracks(frame_id)
+
+    def pop_invalid_tracks(self, frame_id: int) -> None:
+        """Pop out invalid tracks."""
+        invalid_ids = []
+        for k, v in self.tracks.items():
+            if frame_id - v['frame_ids'][-1] >= self.num_frames_retain:
+                invalid_ids.append(k)
+        for invalid_id in invalid_ids:
+            self.tracks.pop(invalid_id)
+
+    def update_track(self, id: int, obj: Tuple[torch.Tensor]):
+        """Update a track."""
+        for k, v in zip(self.memo_items, obj):
+            v = v[None]
+            if self.momentums is not None and k in self.momentums:
+                m = self.momentums[k]
+                self.tracks[id][k] = (1 - m) * self.tracks[id][k] + m * v
+            else:
+                self.tracks[id][k].append(v)
+
+    def init_track(self, id: int, obj: Tuple[torch.Tensor]):
+        """Initialize a track."""
+        self.tracks[id] = Dict()
+        for k, v in zip(self.memo_items, obj):
+            v = v[None]
+            if self.momentums is not None and k in self.momentums:
+                self.tracks[id][k] = v
+            else:
+                self.tracks[id][k] = [v]
+
+    @property
+    def memo(self) -> dict:
+        """Return all buffers in the tracker."""
+        outs = Dict()
+        for k in self.memo_items:
+            outs[k] = []
+
+        for id, objs in self.tracks.items():
+            for k, v in objs.items():
+                if k not in outs:
+                    continue
+                if self.momentums is not None and k in self.momentums:
+                    v = v
+                else:
+                    v = v[-1]
+                outs[k].append(v)
+
+        for k, v in outs.items():
+            outs[k] = torch.cat(v, dim=0)
+        return outs
+
+    def get(self,
+            item: str,
+            ids: Optional[list] = None,
+            num_samples: Optional[int] = None,
+            behavior: Optional[str] = None) -> torch.Tensor:
+        """Get the buffer of a specific item.
+
+        Args:
+            item (str): The demanded item.
+            ids (list[int], optional): The demanded ids. Defaults to None.
+            num_samples (int, optional): Number of samples to calculate the
+                results. Defaults to None.
+            behavior (str, optional): Behavior to calculate the results.
+                Options are `mean` | None. Defaults to None.
+
+        Returns:
+            Tensor: The results of the demanded item.
+        """
+        if ids is None:
+            ids = self.ids
+
+        outs = []
+        for id in ids:
+            out = self.tracks[id][item]
+            if isinstance(out, list):
+                if num_samples is not None:
+                    out = out[-num_samples:]
+                    out = torch.cat(out, dim=0)
+                    if behavior == 'mean':
+                        out = out.mean(dim=0, keepdim=True)
+                    elif behavior is None:
+                        out = out[None]
+                    else:
+                        raise NotImplementedError()
+                else:
+                    out = out[-1]
+            outs.append(out)
+        return torch.cat(outs, dim=0)
+
+    @abstractmethod
+    def track(self, *args, **kwargs):
+        """Tracking forward function."""
+        pass
+
+    def crop_imgs(self,
+                  img: torch.Tensor,
+                  meta_info: dict,
+                  bboxes: torch.Tensor,
+                  rescale: bool = False) -> torch.Tensor:
+        """Crop the images according to some bounding boxes. Typically for re-
+        identification sub-module.
+
+        Args:
+            img (Tensor): of shape (T, C, H, W) encoding input image.
+                Typically these should be mean centered and std scaled.
+            meta_info (dict): image information dict where each dict
+                has: 'img_shape', 'scale_factor', 'flip', and may also contain
+                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+            bboxes (Tensor): of shape (N, 4) or (N, 5).
+            rescale (bool, optional): If True, the bounding boxes should be
+                rescaled to fit the scale of the image. Defaults to False.
+
+        Returns:
+            Tensor: Image tensor of shape (T, C, H, W).
+        """
+        h, w = meta_info['img_shape']
+        img = img[:, :, :h, :w]
+        if rescale:
+            factor_x, factor_y = meta_info['scale_factor']
+            bboxes[:, :4] *= torch.tensor(
+                [factor_x, factor_y, factor_x, factor_y]).to(bboxes.device)
+        bboxes[:, 0] = torch.clamp(bboxes[:, 0], min=0, max=w - 1)
+        bboxes[:, 1] = torch.clamp(bboxes[:, 1], min=0, max=h - 1)
+        bboxes[:, 2] = torch.clamp(bboxes[:, 2], min=1, max=w)
+        bboxes[:, 3] = torch.clamp(bboxes[:, 3], min=1, max=h)
+
+        crop_imgs = []
+        for bbox in bboxes:
+            x1, y1, x2, y2 = map(int, bbox)
+            if x2 <= x1:
+                x2 = x1 + 1
+            if y2 <= y1:
+                y2 = y1 + 1
+            crop_img = img[:, :, y1:y2, x1:x2]
+            if self.reid.get('img_scale', False):
+                crop_img = F.interpolate(
+                    crop_img,
+                    size=self.reid['img_scale'],
+                    mode='bilinear',
+                    align_corners=False)
+            crop_imgs.append(crop_img)
+
+        if len(crop_imgs) > 0:
+            return torch.cat(crop_imgs, dim=0)
+        elif self.reid.get('img_scale', False):
+            _h, _w = self.reid['img_scale']
+            return img.new_zeros((0, 3, _h, _w))
+        else:
+            return img.new_zeros((0, 3, h, w))
diff --git a/mmde/mmdet/models/trackers/byte_tracker.py b/mmde/mmdet/models/trackers/byte_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..11f3adc53c58339f6289cbfa77aed738259fc98c
--- /dev/null
+++ b/mmde/mmdet/models/trackers/byte_tracker.py
@@ -0,0 +1,334 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+try:
+    import lap
+except ImportError:
+    lap = None
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import DetDataSample
+from mmdet.structures.bbox import (bbox_cxcyah_to_xyxy, bbox_overlaps,
+                                   bbox_xyxy_to_cxcyah)
+from .base_tracker import BaseTracker
+
+
+@MODELS.register_module()
+class ByteTracker(BaseTracker):
+    """Tracker for ByteTrack.
+
+    Args:
+        motion (dict): Configuration of motion. Defaults to None.
+        obj_score_thrs (dict): Detection score threshold for matching objects.
+            - high (float): Threshold of the first matching. Defaults to 0.6.
+            - low (float): Threshold of the second matching. Defaults to 0.1.
+        init_track_thr (float): Detection score threshold for initializing a
+            new tracklet. Defaults to 0.7.
+        weight_iou_with_det_scores (bool): Whether using detection scores to
+            weight IOU which is used for matching. Defaults to True.
+        match_iou_thrs (dict): IOU distance threshold for matching between two
+            frames.
+            - high (float): Threshold of the first matching. Defaults to 0.1.
+            - low (float): Threshold of the second matching. Defaults to 0.5.
+            - tentative (float): Threshold of the matching for tentative
+                tracklets. Defaults to 0.3.
+        num_tentatives (int, optional): Number of continuous frames to confirm
+            a track. Defaults to 3.
+    """
+
+    def __init__(self,
+                 motion: Optional[dict] = None,
+                 obj_score_thrs: dict = dict(high=0.6, low=0.1),
+                 init_track_thr: float = 0.7,
+                 weight_iou_with_det_scores: bool = True,
+                 match_iou_thrs: dict = dict(high=0.1, low=0.5, tentative=0.3),
+                 num_tentatives: int = 3,
+                 **kwargs):
+        super().__init__(**kwargs)
+
+        if lap is None:
+            raise RuntimeError('lap is not installed,\
+                 please install it by: pip install lap')
+        if motion is not None:
+            self.motion = TASK_UTILS.build(motion)
+
+        self.obj_score_thrs = obj_score_thrs
+        self.init_track_thr = init_track_thr
+
+        self.weight_iou_with_det_scores = weight_iou_with_det_scores
+        self.match_iou_thrs = match_iou_thrs
+
+        self.num_tentatives = num_tentatives
+
+    @property
+    def confirmed_ids(self) -> List:
+        """Confirmed ids in the tracker."""
+        ids = [id for id, track in self.tracks.items() if not track.tentative]
+        return ids
+
+    @property
+    def unconfirmed_ids(self) -> List:
+        """Unconfirmed ids in the tracker."""
+        ids = [id for id, track in self.tracks.items() if track.tentative]
+        return ids
+
+    def init_track(self, id: int, obj: Tuple[torch.Tensor]) -> None:
+        """Initialize a track."""
+        super().init_track(id, obj)
+        if self.tracks[id].frame_ids[-1] == 0:
+            self.tracks[id].tentative = False
+        else:
+            self.tracks[id].tentative = True
+        bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1])  # size = (1, 4)
+        assert bbox.ndim == 2 and bbox.shape[0] == 1
+        bbox = bbox.squeeze(0).cpu().numpy()
+        self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate(
+            bbox)
+
+    def update_track(self, id: int, obj: Tuple[torch.Tensor]) -> None:
+        """Update a track."""
+        super().update_track(id, obj)
+        if self.tracks[id].tentative:
+            if len(self.tracks[id]['bboxes']) >= self.num_tentatives:
+                self.tracks[id].tentative = False
+        bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1])  # size = (1, 4)
+        assert bbox.ndim == 2 and bbox.shape[0] == 1
+        bbox = bbox.squeeze(0).cpu().numpy()
+        track_label = self.tracks[id]['labels'][-1]
+        label_idx = self.memo_items.index('labels')
+        obj_label = obj[label_idx]
+        assert obj_label == track_label
+        self.tracks[id].mean, self.tracks[id].covariance = self.kf.update(
+            self.tracks[id].mean, self.tracks[id].covariance, bbox)
+
+    def pop_invalid_tracks(self, frame_id: int) -> None:
+        """Pop out invalid tracks."""
+        invalid_ids = []
+        for k, v in self.tracks.items():
+            # case1: disappeared frames >= self.num_frames_retrain
+            case1 = frame_id - v['frame_ids'][-1] >= self.num_frames_retain
+            # case2: tentative tracks but not matched in this frame
+            case2 = v.tentative and v['frame_ids'][-1] != frame_id
+            if case1 or case2:
+                invalid_ids.append(k)
+        for invalid_id in invalid_ids:
+            self.tracks.pop(invalid_id)
+
+    def assign_ids(
+            self,
+            ids: List[int],
+            det_bboxes: torch.Tensor,
+            det_labels: torch.Tensor,
+            det_scores: torch.Tensor,
+            weight_iou_with_det_scores: Optional[bool] = False,
+            match_iou_thr: Optional[float] = 0.5
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """Assign ids.
+
+        Args:
+            ids (list[int]): Tracking ids.
+            det_bboxes (Tensor): of shape (N, 4)
+            det_labels (Tensor): of shape (N,)
+            det_scores (Tensor): of shape (N,)
+            weight_iou_with_det_scores (bool, optional): Whether using
+                detection scores to weight IOU which is used for matching.
+                Defaults to False.
+            match_iou_thr (float, optional): Matching threshold.
+                Defaults to 0.5.
+
+        Returns:
+            tuple(np.ndarray, np.ndarray): The assigning ids.
+        """
+        # get track_bboxes
+        track_bboxes = np.zeros((0, 4))
+        for id in ids:
+            track_bboxes = np.concatenate(
+                (track_bboxes, self.tracks[id].mean[:4][None]), axis=0)
+        track_bboxes = torch.from_numpy(track_bboxes).to(det_bboxes)
+        track_bboxes = bbox_cxcyah_to_xyxy(track_bboxes)
+
+        # compute distance
+        ious = bbox_overlaps(track_bboxes, det_bboxes)
+        if weight_iou_with_det_scores:
+            ious *= det_scores
+        # support multi-class association
+        track_labels = torch.tensor([
+            self.tracks[id]['labels'][-1] for id in ids
+        ]).to(det_bboxes.device)
+
+        cate_match = det_labels[None, :] == track_labels[:, None]
+        # to avoid det and track of different categories are matched
+        cate_cost = (1 - cate_match.int()) * 1e6
+
+        dists = (1 - ious + cate_cost).cpu().numpy()
+
+        # bipartite match
+        if dists.size > 0:
+            cost, row, col = lap.lapjv(
+                dists, extend_cost=True, cost_limit=1 - match_iou_thr)
+        else:
+            row = np.zeros(len(ids)).astype(np.int32) - 1
+            col = np.zeros(len(det_bboxes)).astype(np.int32) - 1
+        return row, col
+
+    def track(self, data_sample: DetDataSample, **kwargs) -> InstanceData:
+        """Tracking forward function.
+
+        Args:
+            data_sample (:obj:`DetDataSample`): The data sample.
+                It includes information such as `pred_instances`.
+
+        Returns:
+            :obj:`InstanceData`: Tracking results of the input images.
+            Each InstanceData usually contains ``bboxes``, ``labels``,
+            ``scores`` and ``instances_id``.
+        """
+        metainfo = data_sample.metainfo
+        bboxes = data_sample.pred_instances.bboxes
+        labels = data_sample.pred_instances.labels
+        scores = data_sample.pred_instances.scores
+
+        frame_id = metainfo.get('frame_id', -1)
+        if frame_id == 0:
+            self.reset()
+        if not hasattr(self, 'kf'):
+            self.kf = self.motion
+
+        if self.empty or bboxes.size(0) == 0:
+            valid_inds = scores > self.init_track_thr
+            scores = scores[valid_inds]
+            bboxes = bboxes[valid_inds]
+            labels = labels[valid_inds]
+            num_new_tracks = bboxes.size(0)
+            ids = torch.arange(self.num_tracks,
+                               self.num_tracks + num_new_tracks).to(labels)
+            self.num_tracks += num_new_tracks
+
+        else:
+            # 0. init
+            ids = torch.full((bboxes.size(0), ),
+                             -1,
+                             dtype=labels.dtype,
+                             device=labels.device)
+
+            # get the detection bboxes for the first association
+            first_det_inds = scores > self.obj_score_thrs['high']
+            first_det_bboxes = bboxes[first_det_inds]
+            first_det_labels = labels[first_det_inds]
+            first_det_scores = scores[first_det_inds]
+            first_det_ids = ids[first_det_inds]
+
+            # get the detection bboxes for the second association
+            second_det_inds = (~first_det_inds) & (
+                scores > self.obj_score_thrs['low'])
+            second_det_bboxes = bboxes[second_det_inds]
+            second_det_labels = labels[second_det_inds]
+            second_det_scores = scores[second_det_inds]
+            second_det_ids = ids[second_det_inds]
+
+            # 1. use Kalman Filter to predict current location
+            for id in self.confirmed_ids:
+                # track is lost in previous frame
+                if self.tracks[id].frame_ids[-1] != frame_id - 1:
+                    self.tracks[id].mean[7] = 0
+                (self.tracks[id].mean,
+                 self.tracks[id].covariance) = self.kf.predict(
+                     self.tracks[id].mean, self.tracks[id].covariance)
+
+            # 2. first match
+            first_match_track_inds, first_match_det_inds = self.assign_ids(
+                self.confirmed_ids, first_det_bboxes, first_det_labels,
+                first_det_scores, self.weight_iou_with_det_scores,
+                self.match_iou_thrs['high'])
+            # '-1' mean a detection box is not matched with tracklets in
+            # previous frame
+            valid = first_match_det_inds > -1
+            first_det_ids[valid] = torch.tensor(
+                self.confirmed_ids)[first_match_det_inds[valid]].to(labels)
+
+            first_match_det_bboxes = first_det_bboxes[valid]
+            first_match_det_labels = first_det_labels[valid]
+            first_match_det_scores = first_det_scores[valid]
+            first_match_det_ids = first_det_ids[valid]
+            assert (first_match_det_ids > -1).all()
+
+            first_unmatch_det_bboxes = first_det_bboxes[~valid]
+            first_unmatch_det_labels = first_det_labels[~valid]
+            first_unmatch_det_scores = first_det_scores[~valid]
+            first_unmatch_det_ids = first_det_ids[~valid]
+            assert (first_unmatch_det_ids == -1).all()
+
+            # 3. use unmatched detection bboxes from the first match to match
+            # the unconfirmed tracks
+            (tentative_match_track_inds,
+             tentative_match_det_inds) = self.assign_ids(
+                 self.unconfirmed_ids, first_unmatch_det_bboxes,
+                 first_unmatch_det_labels, first_unmatch_det_scores,
+                 self.weight_iou_with_det_scores,
+                 self.match_iou_thrs['tentative'])
+            valid = tentative_match_det_inds > -1
+            first_unmatch_det_ids[valid] = torch.tensor(self.unconfirmed_ids)[
+                tentative_match_det_inds[valid]].to(labels)
+
+            # 4. second match for unmatched tracks from the first match
+            first_unmatch_track_ids = []
+            for i, id in enumerate(self.confirmed_ids):
+                # tracklet is not matched in the first match
+                case_1 = first_match_track_inds[i] == -1
+                # tracklet is not lost in the previous frame
+                case_2 = self.tracks[id].frame_ids[-1] == frame_id - 1
+                if case_1 and case_2:
+                    first_unmatch_track_ids.append(id)
+
+            second_match_track_inds, second_match_det_inds = self.assign_ids(
+                first_unmatch_track_ids, second_det_bboxes, second_det_labels,
+                second_det_scores, False, self.match_iou_thrs['low'])
+            valid = second_match_det_inds > -1
+            second_det_ids[valid] = torch.tensor(first_unmatch_track_ids)[
+                second_match_det_inds[valid]].to(ids)
+
+            # 5. gather all matched detection bboxes from step 2-4
+            # we only keep matched detection bboxes in second match, which
+            # means the id != -1
+            valid = second_det_ids > -1
+            bboxes = torch.cat(
+                (first_match_det_bboxes, first_unmatch_det_bboxes), dim=0)
+            bboxes = torch.cat((bboxes, second_det_bboxes[valid]), dim=0)
+
+            labels = torch.cat(
+                (first_match_det_labels, first_unmatch_det_labels), dim=0)
+            labels = torch.cat((labels, second_det_labels[valid]), dim=0)
+
+            scores = torch.cat(
+                (first_match_det_scores, first_unmatch_det_scores), dim=0)
+            scores = torch.cat((scores, second_det_scores[valid]), dim=0)
+
+            ids = torch.cat((first_match_det_ids, first_unmatch_det_ids),
+                            dim=0)
+            ids = torch.cat((ids, second_det_ids[valid]), dim=0)
+
+            # 6. assign new ids
+            new_track_inds = ids == -1
+            ids[new_track_inds] = torch.arange(
+                self.num_tracks,
+                self.num_tracks + new_track_inds.sum()).to(labels)
+            self.num_tracks += new_track_inds.sum()
+
+        self.update(
+            ids=ids,
+            bboxes=bboxes,
+            scores=scores,
+            labels=labels,
+            frame_ids=frame_id)
+
+        # update pred_track_instances
+        pred_track_instances = InstanceData()
+        pred_track_instances.bboxes = bboxes
+        pred_track_instances.labels = labels
+        pred_track_instances.scores = scores
+        pred_track_instances.instances_id = ids
+
+        return pred_track_instances
diff --git a/mmde/mmdet/models/trackers/masktrack_rcnn_tracker.py b/mmde/mmdet/models/trackers/masktrack_rcnn_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc167786b8b412629885a4f134a1bf79f3dfaa93
--- /dev/null
+++ b/mmde/mmdet/models/trackers/masktrack_rcnn_tracker.py
@@ -0,0 +1,189 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import DetDataSample
+from mmdet.structures.bbox import bbox_overlaps
+from .base_tracker import BaseTracker
+
+
+@MODELS.register_module()
+class MaskTrackRCNNTracker(BaseTracker):
+    """Tracker for MaskTrack R-CNN.
+
+    Args:
+        match_weights (dict[str : float]): The Weighting factor when computing
+        the match score. It contains keys as follows:
+
+            - det_score (float): The coefficient of `det_score` when computing
+                match score.
+            - iou (float): The coefficient of `ious` when computing match
+                score.
+            - det_label (float): The coefficient of `label_deltas` when
+                computing match score.
+    """
+
+    def __init__(self,
+                 match_weights: dict = dict(
+                     det_score=1.0, iou=2.0, det_label=10.0),
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.match_weights = match_weights
+
+    def get_match_score(self, bboxes: Tensor, labels: Tensor, scores: Tensor,
+                        prev_bboxes: Tensor, prev_labels: Tensor,
+                        similarity_logits: Tensor) -> Tensor:
+        """Get the match score.
+
+        Args:
+            bboxes (torch.Tensor): of shape (num_current_bboxes, 4) in
+                [tl_x, tl_y, br_x, br_y] format. Denoting the detection
+                bboxes of current frame.
+            labels (torch.Tensor): of shape (num_current_bboxes, )
+            scores (torch.Tensor): of shape (num_current_bboxes, )
+            prev_bboxes (torch.Tensor): of shape (num_previous_bboxes, 4) in
+                [tl_x, tl_y, br_x, br_y] format.  Denoting the detection bboxes
+                of previous frame.
+            prev_labels (torch.Tensor): of shape (num_previous_bboxes, )
+            similarity_logits (torch.Tensor): of shape (num_current_bboxes,
+                num_previous_bboxes + 1). Denoting the similarity logits from
+                track head.
+
+        Returns:
+            torch.Tensor: The matching score of shape (num_current_bboxes,
+            num_previous_bboxes + 1)
+        """
+        similarity_scores = similarity_logits.softmax(dim=1)
+
+        ious = bbox_overlaps(bboxes, prev_bboxes)
+        iou_dummy = ious.new_zeros(ious.shape[0], 1)
+        ious = torch.cat((iou_dummy, ious), dim=1)
+
+        label_deltas = (labels.view(-1, 1) == prev_labels).float()
+        label_deltas_dummy = label_deltas.new_ones(label_deltas.shape[0], 1)
+        label_deltas = torch.cat((label_deltas_dummy, label_deltas), dim=1)
+
+        match_score = similarity_scores.log()
+        match_score += self.match_weights['det_score'] * \
+            scores.view(-1, 1).log()
+        match_score += self.match_weights['iou'] * ious
+        match_score += self.match_weights['det_label'] * label_deltas
+
+        return match_score
+
+    def assign_ids(self, match_scores: Tensor):
+        num_prev_bboxes = match_scores.shape[1] - 1
+        _, match_ids = match_scores.max(dim=1)
+
+        ids = match_ids.new_zeros(match_ids.shape[0]) - 1
+        best_match_scores = match_scores.new_zeros(num_prev_bboxes) - 1e6
+        for idx, match_id in enumerate(match_ids):
+            if match_id == 0:
+                ids[idx] = self.num_tracks
+                self.num_tracks += 1
+            else:
+                match_score = match_scores[idx, match_id]
+                # TODO: fix the bug where multiple candidate might match
+                # with the same previous object.
+                if match_score > best_match_scores[match_id - 1]:
+                    ids[idx] = self.ids[match_id - 1]
+                    best_match_scores[match_id - 1] = match_score
+        return ids, best_match_scores
+
+    def track(self,
+              model: torch.nn.Module,
+              feats: List[torch.Tensor],
+              data_sample: DetDataSample,
+              rescale=True,
+              **kwargs) -> InstanceData:
+        """Tracking forward function.
+
+        Args:
+            model (nn.Module): VIS model.
+            img (Tensor): of shape (T, C, H, W) encoding input image.
+                Typically these should be mean centered and std scaled.
+                The T denotes the number of key images and usually is 1 in
+                MaskTrackRCNN method.
+            feats (list[Tensor]): Multi level feature maps of `img`.
+            data_sample (:obj:`TrackDataSample`): The data sample.
+                It includes information such as `pred_det_instances`.
+            rescale (bool, optional): If True, the bounding boxes should be
+                rescaled to fit the original scale of the image. Defaults to
+                True.
+
+        Returns:
+            :obj:`InstanceData`: Tracking results of the input images.
+            Each InstanceData usually contains ``bboxes``, ``labels``,
+            ``scores`` and ``instances_id``.
+        """
+        metainfo = data_sample.metainfo
+        bboxes = data_sample.pred_instances.bboxes
+        masks = data_sample.pred_instances.masks
+        labels = data_sample.pred_instances.labels
+        scores = data_sample.pred_instances.scores
+
+        frame_id = metainfo.get('frame_id', -1)
+        # create pred_track_instances
+        pred_track_instances = InstanceData()
+
+        if bboxes.shape[0] == 0:
+            ids = torch.zeros_like(labels)
+            pred_track_instances = data_sample.pred_instances.clone()
+            pred_track_instances.instances_id = ids
+            return pred_track_instances
+
+        rescaled_bboxes = bboxes.clone()
+        if rescale:
+            scale_factor = rescaled_bboxes.new_tensor(
+                metainfo['scale_factor']).repeat((1, 2))
+            rescaled_bboxes = rescaled_bboxes * scale_factor
+        roi_feats, _ = model.track_head.extract_roi_feats(
+            feats, [rescaled_bboxes])
+
+        if self.empty:
+            num_new_tracks = bboxes.size(0)
+            ids = torch.arange(
+                self.num_tracks,
+                self.num_tracks + num_new_tracks,
+                dtype=torch.long)
+            self.num_tracks += num_new_tracks
+        else:
+            prev_bboxes = self.get('bboxes')
+            prev_labels = self.get('labels')
+            prev_roi_feats = self.get('roi_feats')
+
+            similarity_logits = model.track_head.predict(
+                roi_feats, prev_roi_feats)
+            match_scores = self.get_match_score(bboxes, labels, scores,
+                                                prev_bboxes, prev_labels,
+                                                similarity_logits)
+            ids, _ = self.assign_ids(match_scores)
+
+        valid_inds = ids > -1
+        ids = ids[valid_inds]
+        bboxes = bboxes[valid_inds]
+        labels = labels[valid_inds]
+        scores = scores[valid_inds]
+        masks = masks[valid_inds]
+        roi_feats = roi_feats[valid_inds]
+
+        self.update(
+            ids=ids,
+            bboxes=bboxes,
+            labels=labels,
+            scores=scores,
+            masks=masks,
+            roi_feats=roi_feats,
+            frame_ids=frame_id)
+        # update pred_track_instances
+        pred_track_instances.bboxes = bboxes
+        pred_track_instances.masks = masks
+        pred_track_instances.labels = labels
+        pred_track_instances.scores = scores
+        pred_track_instances.instances_id = ids
+
+        return pred_track_instances
diff --git a/mmde/mmdet/models/trackers/ocsort_tracker.py b/mmde/mmdet/models/trackers/ocsort_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e09990c603aee8ced3bf3a65ceb530142e6e873
--- /dev/null
+++ b/mmde/mmdet/models/trackers/ocsort_tracker.py
@@ -0,0 +1,531 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+try:
+    import lap
+except ImportError:
+    lap = None
+import numpy as np
+import torch
+from addict import Dict
+from mmengine.structures import InstanceData
+
+from mmdet.registry import MODELS
+from mmdet.structures import DetDataSample
+from mmdet.structures.bbox import (bbox_cxcyah_to_xyxy, bbox_overlaps,
+                                   bbox_xyxy_to_cxcyah)
+from .sort_tracker import SORTTracker
+
+
+@MODELS.register_module()
+class OCSORTTracker(SORTTracker):
+    """Tracker for OC-SORT.
+
+    Args:
+        motion (dict): Configuration of motion. Defaults to None.
+        obj_score_thrs (float): Detection score threshold for matching objects.
+            Defaults to 0.3.
+        init_track_thr (float): Detection score threshold for initializing a
+            new tracklet. Defaults to 0.7.
+        weight_iou_with_det_scores (bool): Whether using detection scores to
+            weight IOU which is used for matching. Defaults to True.
+        match_iou_thr (float): IOU distance threshold for matching between two
+            frames. Defaults to 0.3.
+        num_tentatives (int, optional): Number of continuous frames to confirm
+            a track. Defaults to 3.
+        vel_consist_weight (float): Weight of the velocity consistency term in
+            association (OCM term in the paper).
+        vel_delta_t (int): The difference of time step for calculating of the
+            velocity direction of tracklets.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 motion: Optional[dict] = None,
+                 obj_score_thr: float = 0.3,
+                 init_track_thr: float = 0.7,
+                 weight_iou_with_det_scores: bool = True,
+                 match_iou_thr: float = 0.3,
+                 num_tentatives: int = 3,
+                 vel_consist_weight: float = 0.2,
+                 vel_delta_t: int = 3,
+                 **kwargs):
+        if lap is None:
+            raise RuntimeError('lap is not installed,\
+                 please install it by: pip install lap')
+        super().__init__(motion=motion, **kwargs)
+        self.obj_score_thr = obj_score_thr
+        self.init_track_thr = init_track_thr
+
+        self.weight_iou_with_det_scores = weight_iou_with_det_scores
+        self.match_iou_thr = match_iou_thr
+        self.vel_consist_weight = vel_consist_weight
+        self.vel_delta_t = vel_delta_t
+
+        self.num_tentatives = num_tentatives
+
+    @property
+    def unconfirmed_ids(self):
+        """Unconfirmed ids in the tracker."""
+        ids = [id for id, track in self.tracks.items() if track.tentative]
+        return ids
+
+    def init_track(self, id: int, obj: Tuple[torch.Tensor]):
+        """Initialize a track."""
+        super().init_track(id, obj)
+        if self.tracks[id].frame_ids[-1] == 0:
+            self.tracks[id].tentative = False
+        else:
+            self.tracks[id].tentative = True
+        bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1])  # size = (1, 4)
+        assert bbox.ndim == 2 and bbox.shape[0] == 1
+        bbox = bbox.squeeze(0).cpu().numpy()
+        self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate(
+            bbox)
+        # track.obs maintains the history associated detections to this track
+        self.tracks[id].obs = []
+        bbox_id = self.memo_items.index('bboxes')
+        self.tracks[id].obs.append(obj[bbox_id])
+        # a placefolder to save mean/covariance before losing tracking it
+        # parameters to save: mean, covariance, measurement
+        self.tracks[id].tracked = True
+        self.tracks[id].saved_attr = Dict()
+        self.tracks[id].velocity = torch.tensor(
+            (-1, -1)).to(obj[bbox_id].device)  # placeholder
+
+    def update_track(self, id: int, obj: Tuple[torch.Tensor]):
+        """Update a track."""
+        super().update_track(id, obj)
+        if self.tracks[id].tentative:
+            if len(self.tracks[id]['bboxes']) >= self.num_tentatives:
+                self.tracks[id].tentative = False
+        bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1])  # size = (1, 4)
+        assert bbox.ndim == 2 and bbox.shape[0] == 1
+        bbox = bbox.squeeze(0).cpu().numpy()
+        self.tracks[id].mean, self.tracks[id].covariance = self.kf.update(
+            self.tracks[id].mean, self.tracks[id].covariance, bbox)
+        self.tracks[id].tracked = True
+        bbox_id = self.memo_items.index('bboxes')
+        self.tracks[id].obs.append(obj[bbox_id])
+
+        bbox1 = self.k_step_observation(self.tracks[id])
+        bbox2 = obj[bbox_id]
+        self.tracks[id].velocity = self.vel_direction(bbox1, bbox2).to(
+            obj[bbox_id].device)
+
+    def vel_direction(self, bbox1: torch.Tensor, bbox2: torch.Tensor):
+        """Estimate the direction vector between two boxes."""
+        if bbox1.sum() < 0 or bbox2.sum() < 0:
+            return torch.tensor((-1, -1))
+        cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0
+        cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0
+        speed = torch.tensor([cy2 - cy1, cx2 - cx1])
+        norm = torch.sqrt((speed[0])**2 + (speed[1])**2) + 1e-6
+        return speed / norm
+
+    def vel_direction_batch(self, bboxes1: torch.Tensor,
+                            bboxes2: torch.Tensor):
+        """Estimate the direction vector given two batches of boxes."""
+        cx1, cy1 = (bboxes1[:, 0] + bboxes1[:, 2]) / 2.0, (bboxes1[:, 1] +
+                                                           bboxes1[:, 3]) / 2.0
+        cx2, cy2 = (bboxes2[:, 0] + bboxes2[:, 2]) / 2.0, (bboxes2[:, 1] +
+                                                           bboxes2[:, 3]) / 2.0
+        speed_diff_y = cy2[None, :] - cy1[:, None]
+        speed_diff_x = cx2[None, :] - cx1[:, None]
+        speed = torch.cat((speed_diff_y[..., None], speed_diff_x[..., None]),
+                          dim=-1)
+        norm = torch.sqrt((speed[:, :, 0])**2 + (speed[:, :, 1])**2) + 1e-6
+        speed[:, :, 0] /= norm
+        speed[:, :, 1] /= norm
+        return speed
+
+    def k_step_observation(self, track: Dict):
+        """return the observation k step away before."""
+        obs_seqs = track.obs
+        num_obs = len(obs_seqs)
+        if num_obs == 0:
+            return torch.tensor((-1, -1, -1, -1)).to(track.obs[0].device)
+        elif num_obs > self.vel_delta_t:
+            if obs_seqs[num_obs - 1 - self.vel_delta_t] is not None:
+                return obs_seqs[num_obs - 1 - self.vel_delta_t]
+            else:
+                return self.last_obs(track)
+        else:
+            return self.last_obs(track)
+
+    def ocm_assign_ids(self,
+                       ids: List[int],
+                       det_bboxes: torch.Tensor,
+                       det_labels: torch.Tensor,
+                       det_scores: torch.Tensor,
+                       weight_iou_with_det_scores: Optional[bool] = False,
+                       match_iou_thr: Optional[float] = 0.5):
+        """Apply Observation-Centric Momentum (OCM) to assign ids.
+
+        OCM adds movement direction consistency into the association cost
+        matrix. This term requires no additional assumption but from the
+        same linear motion assumption as the canonical Kalman Filter in SORT.
+
+        Args:
+            ids (list[int]): Tracking ids.
+            det_bboxes (Tensor): of shape (N, 4)
+            det_labels (Tensor): of shape (N,)
+            det_scores (Tensor): of shape (N,)
+            weight_iou_with_det_scores (bool, optional): Whether using
+                detection scores to weight IOU which is used for matching.
+                Defaults to False.
+            match_iou_thr (float, optional): Matching threshold.
+                Defaults to 0.5.
+
+        Returns:
+            tuple(int): The assigning ids.
+
+        OC-SORT uses velocity consistency besides IoU for association
+        """
+        # get track_bboxes
+        track_bboxes = np.zeros((0, 4))
+        for id in ids:
+            track_bboxes = np.concatenate(
+                (track_bboxes, self.tracks[id].mean[:4][None]), axis=0)
+        track_bboxes = torch.from_numpy(track_bboxes).to(det_bboxes)
+        track_bboxes = bbox_cxcyah_to_xyxy(track_bboxes)
+
+        # compute distance
+        ious = bbox_overlaps(track_bboxes, det_bboxes)
+        if weight_iou_with_det_scores:
+            ious *= det_scores
+
+        # support multi-class association
+        track_labels = torch.tensor([
+            self.tracks[id]['labels'][-1] for id in ids
+        ]).to(det_bboxes.device)
+        cate_match = det_labels[None, :] == track_labels[:, None]
+        # to avoid det and track of different categories are matched
+        cate_cost = (1 - cate_match.int()) * 1e6
+
+        dists = (1 - ious + cate_cost).cpu().numpy()
+
+        if len(ids) > 0 and len(det_bboxes) > 0:
+            track_velocities = torch.stack(
+                [self.tracks[id].velocity for id in ids]).to(det_bboxes.device)
+            k_step_observations = torch.stack([
+                self.k_step_observation(self.tracks[id]) for id in ids
+            ]).to(det_bboxes.device)
+            # valid1: if the track has previous observations to estimate speed
+            # valid2: if the associated observation k steps ago is a detection
+            valid1 = track_velocities.sum(dim=1) != -2
+            valid2 = k_step_observations.sum(dim=1) != -4
+            valid = valid1 & valid2
+
+            vel_to_match = self.vel_direction_batch(k_step_observations,
+                                                    det_bboxes)
+            track_velocities = track_velocities[:, None, :].repeat(
+                1, det_bboxes.shape[0], 1)
+
+            angle_cos = (vel_to_match * track_velocities).sum(dim=-1)
+            angle_cos = torch.clamp(angle_cos, min=-1, max=1)
+            angle = torch.acos(angle_cos)  # [0, pi]
+            norm_angle = (angle - np.pi / 2.) / np.pi  # [-0.5, 0.5]
+            valid_matrix = valid[:, None].int().repeat(1, det_bboxes.shape[0])
+            # set non-valid entries 0
+            valid_norm_angle = norm_angle * valid_matrix
+
+            dists += valid_norm_angle.cpu().numpy() * self.vel_consist_weight
+
+        # bipartite match
+        if dists.size > 0:
+            cost, row, col = lap.lapjv(
+                dists, extend_cost=True, cost_limit=1 - match_iou_thr)
+        else:
+            row = np.zeros(len(ids)).astype(np.int32) - 1
+            col = np.zeros(len(det_bboxes)).astype(np.int32) - 1
+        return row, col
+
+    def last_obs(self, track: Dict):
+        """extract the last associated observation."""
+        for bbox in track.obs[::-1]:
+            if bbox is not None:
+                return bbox
+
+    def ocr_assign_ids(self,
+                       track_obs: torch.Tensor,
+                       last_track_labels: torch.Tensor,
+                       det_bboxes: torch.Tensor,
+                       det_labels: torch.Tensor,
+                       det_scores: torch.Tensor,
+                       weight_iou_with_det_scores: Optional[bool] = False,
+                       match_iou_thr: Optional[float] = 0.5):
+        """association for Observation-Centric Recovery.
+
+        As try to recover tracks from being lost whose estimated velocity is
+        out- to-date, we use IoU-only matching strategy.
+
+        Args:
+            track_obs (Tensor): the list of historical associated
+                detections of tracks
+            det_bboxes (Tensor): of shape (N, 5), unmatched detections
+            det_labels (Tensor): of shape (N,)
+            det_scores (Tensor): of shape (N,)
+            weight_iou_with_det_scores (bool, optional): Whether using
+                detection scores to weight IOU which is used for matching.
+                Defaults to False.
+            match_iou_thr (float, optional): Matching threshold.
+                Defaults to 0.5.
+
+        Returns:
+            tuple(int): The assigning ids.
+        """
+        # compute distance
+        ious = bbox_overlaps(track_obs, det_bboxes)
+        if weight_iou_with_det_scores:
+            ious *= det_scores
+
+        # support multi-class association
+        cate_match = det_labels[None, :] == last_track_labels[:, None]
+        # to avoid det and track of different categories are matched
+        cate_cost = (1 - cate_match.int()) * 1e6
+
+        dists = (1 - ious + cate_cost).cpu().numpy()
+
+        # bipartite match
+        if dists.size > 0:
+            cost, row, col = lap.lapjv(
+                dists, extend_cost=True, cost_limit=1 - match_iou_thr)
+        else:
+            row = np.zeros(len(track_obs)).astype(np.int32) - 1
+            col = np.zeros(len(det_bboxes)).astype(np.int32) - 1
+        return row, col
+
+    def online_smooth(self, track: Dict, obj: torch.Tensor):
+        """Once a track is recovered from being lost, online smooth its
+        parameters to fix the error accumulated during being lost.
+
+        NOTE: you can use different virtual trajectory generation
+        strategies, we adopt the naive linear interpolation as default
+        """
+        last_match_bbox = self.last_obs(track)
+        new_match_bbox = obj
+        unmatch_len = 0
+        for bbox in track.obs[::-1]:
+            if bbox is None:
+                unmatch_len += 1
+            else:
+                break
+        bbox_shift_per_step = (new_match_bbox - last_match_bbox) / (
+            unmatch_len + 1)
+        track.mean = track.saved_attr.mean
+        track.covariance = track.saved_attr.covariance
+        for i in range(unmatch_len):
+            virtual_bbox = last_match_bbox + (i + 1) * bbox_shift_per_step
+            virtual_bbox = bbox_xyxy_to_cxcyah(virtual_bbox[None, :])
+            virtual_bbox = virtual_bbox.squeeze(0).cpu().numpy()
+            track.mean, track.covariance = self.kf.update(
+                track.mean, track.covariance, virtual_bbox)
+
+    def track(self, data_sample: DetDataSample, **kwargs) -> InstanceData:
+        """Tracking forward function.
+        NOTE: this implementation is slightly different from the original
+        OC-SORT implementation (https://github.com/noahcao/OC_SORT)that we
+        do association between detections and tentative/non-tentative tracks
+        independently while the original implementation combines them together.
+
+        Args:
+            data_sample (:obj:`DetDataSample`): The data sample.
+                It includes information such as `pred_instances`.
+
+        Returns:
+            :obj:`InstanceData`: Tracking results of the input images.
+            Each InstanceData usually contains ``bboxes``, ``labels``,
+            ``scores`` and ``instances_id``.
+        """
+        metainfo = data_sample.metainfo
+        bboxes = data_sample.pred_instances.bboxes
+        labels = data_sample.pred_instances.labels
+        scores = data_sample.pred_instances.scores
+        frame_id = metainfo.get('frame_id', -1)
+        if frame_id == 0:
+            self.reset()
+        if not hasattr(self, 'kf'):
+            self.kf = self.motion
+
+        if self.empty or bboxes.size(0) == 0:
+            valid_inds = scores > self.init_track_thr
+            scores = scores[valid_inds]
+            bboxes = bboxes[valid_inds]
+            labels = labels[valid_inds]
+            num_new_tracks = bboxes.size(0)
+            ids = torch.arange(self.num_tracks,
+                               self.num_tracks + num_new_tracks).to(labels)
+            self.num_tracks += num_new_tracks
+        else:
+            # 0. init
+            ids = torch.full((bboxes.size(0), ),
+                             -1,
+                             dtype=labels.dtype,
+                             device=labels.device)
+
+            # get the detection bboxes for the first association
+            det_inds = scores > self.obj_score_thr
+            det_bboxes = bboxes[det_inds]
+            det_labels = labels[det_inds]
+            det_scores = scores[det_inds]
+            det_ids = ids[det_inds]
+
+            # 1. predict by Kalman Filter
+            for id in self.confirmed_ids:
+                # track is lost in previous frame
+                if self.tracks[id].frame_ids[-1] != frame_id - 1:
+                    self.tracks[id].mean[7] = 0
+                if self.tracks[id].tracked:
+                    self.tracks[id].saved_attr.mean = self.tracks[id].mean
+                    self.tracks[id].saved_attr.covariance = self.tracks[
+                        id].covariance
+                (self.tracks[id].mean,
+                 self.tracks[id].covariance) = self.kf.predict(
+                     self.tracks[id].mean, self.tracks[id].covariance)
+
+            # 2. match detections and tracks' predicted locations
+            match_track_inds, raw_match_det_inds = self.ocm_assign_ids(
+                self.confirmed_ids, det_bboxes, det_labels, det_scores,
+                self.weight_iou_with_det_scores, self.match_iou_thr)
+            # '-1' mean a detection box is not matched with tracklets in
+            # previous frame
+            valid = raw_match_det_inds > -1
+            det_ids[valid] = torch.tensor(
+                self.confirmed_ids)[raw_match_det_inds[valid]].to(labels)
+
+            match_det_bboxes = det_bboxes[valid]
+            match_det_labels = det_labels[valid]
+            match_det_scores = det_scores[valid]
+            match_det_ids = det_ids[valid]
+            assert (match_det_ids > -1).all()
+
+            # unmatched tracks and detections
+            unmatch_det_bboxes = det_bboxes[~valid]
+            unmatch_det_labels = det_labels[~valid]
+            unmatch_det_scores = det_scores[~valid]
+            unmatch_det_ids = det_ids[~valid]
+            assert (unmatch_det_ids == -1).all()
+
+            # 3. use unmatched detection bboxes from the first match to match
+            # the unconfirmed tracks
+            (tentative_match_track_inds,
+             tentative_match_det_inds) = self.ocm_assign_ids(
+                 self.unconfirmed_ids, unmatch_det_bboxes, unmatch_det_labels,
+                 unmatch_det_scores, self.weight_iou_with_det_scores,
+                 self.match_iou_thr)
+            valid = tentative_match_det_inds > -1
+            unmatch_det_ids[valid] = torch.tensor(self.unconfirmed_ids)[
+                tentative_match_det_inds[valid]].to(labels)
+
+            match_det_bboxes = torch.cat(
+                (match_det_bboxes, unmatch_det_bboxes[valid]), dim=0)
+            match_det_labels = torch.cat(
+                (match_det_labels, unmatch_det_labels[valid]), dim=0)
+            match_det_scores = torch.cat(
+                (match_det_scores, unmatch_det_scores[valid]), dim=0)
+            match_det_ids = torch.cat((match_det_ids, unmatch_det_ids[valid]),
+                                      dim=0)
+            assert (match_det_ids > -1).all()
+
+            unmatch_det_bboxes = unmatch_det_bboxes[~valid]
+            unmatch_det_labels = unmatch_det_labels[~valid]
+            unmatch_det_scores = unmatch_det_scores[~valid]
+            unmatch_det_ids = unmatch_det_ids[~valid]
+            assert (unmatch_det_ids == -1).all()
+
+            all_track_ids = [id for id, _ in self.tracks.items()]
+            unmatched_track_inds = torch.tensor(
+                [ind for ind in all_track_ids if ind not in match_det_ids])
+
+            if len(unmatched_track_inds) > 0:
+                # 4. still some tracks not associated yet, perform OCR
+                last_observations = []
+                for id in unmatched_track_inds:
+                    last_box = self.last_obs(self.tracks[id.item()])
+                    last_observations.append(last_box)
+                last_observations = torch.stack(last_observations)
+                last_track_labels = torch.tensor([
+                    self.tracks[id.item()]['labels'][-1]
+                    for id in unmatched_track_inds
+                ]).to(det_bboxes.device)
+
+                remain_det_ids = torch.full((unmatch_det_bboxes.size(0), ),
+                                            -1,
+                                            dtype=labels.dtype,
+                                            device=labels.device)
+
+                _, ocr_match_det_inds = self.ocr_assign_ids(
+                    last_observations, last_track_labels, unmatch_det_bboxes,
+                    unmatch_det_labels, unmatch_det_scores,
+                    self.weight_iou_with_det_scores, self.match_iou_thr)
+
+                valid = ocr_match_det_inds > -1
+                remain_det_ids[valid] = unmatched_track_inds.clone()[
+                    ocr_match_det_inds[valid]].to(labels)
+
+                ocr_match_det_bboxes = unmatch_det_bboxes[valid]
+                ocr_match_det_labels = unmatch_det_labels[valid]
+                ocr_match_det_scores = unmatch_det_scores[valid]
+                ocr_match_det_ids = remain_det_ids[valid]
+                assert (ocr_match_det_ids > -1).all()
+
+                ocr_unmatch_det_bboxes = unmatch_det_bboxes[~valid]
+                ocr_unmatch_det_labels = unmatch_det_labels[~valid]
+                ocr_unmatch_det_scores = unmatch_det_scores[~valid]
+                ocr_unmatch_det_ids = remain_det_ids[~valid]
+                assert (ocr_unmatch_det_ids == -1).all()
+
+                unmatch_det_bboxes = ocr_unmatch_det_bboxes
+                unmatch_det_labels = ocr_unmatch_det_labels
+                unmatch_det_scores = ocr_unmatch_det_scores
+                unmatch_det_ids = ocr_unmatch_det_ids
+                match_det_bboxes = torch.cat(
+                    (match_det_bboxes, ocr_match_det_bboxes), dim=0)
+                match_det_labels = torch.cat(
+                    (match_det_labels, ocr_match_det_labels), dim=0)
+                match_det_scores = torch.cat(
+                    (match_det_scores, ocr_match_det_scores), dim=0)
+                match_det_ids = torch.cat((match_det_ids, ocr_match_det_ids),
+                                          dim=0)
+
+            # 5. summarize the track results
+            for i in range(len(match_det_ids)):
+                det_bbox = match_det_bboxes[i]
+                track_id = match_det_ids[i].item()
+                if not self.tracks[track_id].tracked:
+                    # the track is lost before this step
+                    self.online_smooth(self.tracks[track_id], det_bbox)
+
+            for track_id in all_track_ids:
+                if track_id not in match_det_ids:
+                    self.tracks[track_id].tracked = False
+                    self.tracks[track_id].obs.append(None)
+
+            bboxes = torch.cat((match_det_bboxes, unmatch_det_bboxes), dim=0)
+            labels = torch.cat((match_det_labels, unmatch_det_labels), dim=0)
+            scores = torch.cat((match_det_scores, unmatch_det_scores), dim=0)
+            ids = torch.cat((match_det_ids, unmatch_det_ids), dim=0)
+            # 6. assign new ids
+            new_track_inds = ids == -1
+
+            ids[new_track_inds] = torch.arange(
+                self.num_tracks,
+                self.num_tracks + new_track_inds.sum()).to(labels)
+            self.num_tracks += new_track_inds.sum()
+
+        self.update(
+            ids=ids,
+            bboxes=bboxes,
+            labels=labels,
+            scores=scores,
+            frame_ids=frame_id)
+
+        # update pred_track_instances
+        pred_track_instances = InstanceData()
+        pred_track_instances.bboxes = bboxes
+        pred_track_instances.labels = labels
+        pred_track_instances.scores = scores
+        pred_track_instances.instances_id = ids
+        return pred_track_instances
diff --git a/mmde/mmdet/models/trackers/quasi_dense_tracker.py b/mmde/mmdet/models/trackers/quasi_dense_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..c93c3c4c3bd5c8939e77195f30a7eb2f0314e225
--- /dev/null
+++ b/mmde/mmdet/models/trackers/quasi_dense_tracker.py
@@ -0,0 +1,316 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import TrackDataSample
+from mmdet.structures.bbox import bbox_overlaps
+from .base_tracker import BaseTracker
+
+
+@MODELS.register_module()
+class QuasiDenseTracker(BaseTracker):
+    """Tracker for Quasi-Dense Tracking.
+
+    Args:
+        init_score_thr (float): The cls_score threshold to
+            initialize a new tracklet. Defaults to 0.8.
+        obj_score_thr (float): The cls_score threshold to
+            update a tracked tracklet. Defaults to 0.5.
+        match_score_thr (float): The match threshold. Defaults to 0.5.
+        memo_tracklet_frames (int): The most frames in a tracklet memory.
+            Defaults to 10.
+        memo_backdrop_frames (int): The most frames in the backdrops.
+            Defaults to 1.
+        memo_momentum (float): The momentum value for embeds updating.
+            Defaults to 0.8.
+        nms_conf_thr (float): The nms threshold for confidence.
+            Defaults to 0.5.
+        nms_backdrop_iou_thr (float): The nms threshold for backdrop IoU.
+            Defaults to 0.3.
+        nms_class_iou_thr (float): The nms threshold for class IoU.
+            Defaults to 0.7.
+        with_cats (bool): Whether to track with the same category.
+            Defaults to True.
+        match_metric (str): The match metric. Defaults to 'bisoftmax'.
+    """
+
+    def __init__(self,
+                 init_score_thr: float = 0.8,
+                 obj_score_thr: float = 0.5,
+                 match_score_thr: float = 0.5,
+                 memo_tracklet_frames: int = 10,
+                 memo_backdrop_frames: int = 1,
+                 memo_momentum: float = 0.8,
+                 nms_conf_thr: float = 0.5,
+                 nms_backdrop_iou_thr: float = 0.3,
+                 nms_class_iou_thr: float = 0.7,
+                 with_cats: bool = True,
+                 match_metric: str = 'bisoftmax',
+                 **kwargs):
+        super().__init__(**kwargs)
+        assert 0 <= memo_momentum <= 1.0
+        assert memo_tracklet_frames >= 0
+        assert memo_backdrop_frames >= 0
+        self.init_score_thr = init_score_thr
+        self.obj_score_thr = obj_score_thr
+        self.match_score_thr = match_score_thr
+        self.memo_tracklet_frames = memo_tracklet_frames
+        self.memo_backdrop_frames = memo_backdrop_frames
+        self.memo_momentum = memo_momentum
+        self.nms_conf_thr = nms_conf_thr
+        self.nms_backdrop_iou_thr = nms_backdrop_iou_thr
+        self.nms_class_iou_thr = nms_class_iou_thr
+        self.with_cats = with_cats
+        assert match_metric in ['bisoftmax', 'softmax', 'cosine']
+        self.match_metric = match_metric
+
+        self.num_tracks = 0
+        self.tracks = dict()
+        self.backdrops = []
+
+    def reset(self):
+        """Reset the buffer of the tracker."""
+        self.num_tracks = 0
+        self.tracks = dict()
+        self.backdrops = []
+
+    def update(self, ids: Tensor, bboxes: Tensor, embeds: Tensor,
+               labels: Tensor, scores: Tensor, frame_id: int) -> None:
+        """Tracking forward function.
+
+        Args:
+            ids (Tensor): of shape(N, ).
+            bboxes (Tensor): of shape (N, 5).
+            embeds (Tensor): of shape (N, 256).
+            labels (Tensor): of shape (N, ).
+            scores (Tensor): of shape (N, ).
+            frame_id (int): The id of current frame, 0-index.
+        """
+        tracklet_inds = ids > -1
+
+        for id, bbox, embed, label, score in zip(ids[tracklet_inds],
+                                                 bboxes[tracklet_inds],
+                                                 embeds[tracklet_inds],
+                                                 labels[tracklet_inds],
+                                                 scores[tracklet_inds]):
+            id = int(id)
+            # update the tracked ones and initialize new tracks
+            if id in self.tracks.keys():
+                velocity = (bbox - self.tracks[id]['bbox']) / (
+                    frame_id - self.tracks[id]['last_frame'])
+                self.tracks[id]['bbox'] = bbox
+                self.tracks[id]['embed'] = (
+                    1 - self.memo_momentum
+                ) * self.tracks[id]['embed'] + self.memo_momentum * embed
+                self.tracks[id]['last_frame'] = frame_id
+                self.tracks[id]['label'] = label
+                self.tracks[id]['score'] = score
+                self.tracks[id]['velocity'] = (
+                    self.tracks[id]['velocity'] * self.tracks[id]['acc_frame']
+                    + velocity) / (
+                        self.tracks[id]['acc_frame'] + 1)
+                self.tracks[id]['acc_frame'] += 1
+            else:
+                self.tracks[id] = dict(
+                    bbox=bbox,
+                    embed=embed,
+                    label=label,
+                    score=score,
+                    last_frame=frame_id,
+                    velocity=torch.zeros_like(bbox),
+                    acc_frame=0)
+        # backdrop update according to IoU
+        backdrop_inds = torch.nonzero(ids == -1, as_tuple=False).squeeze(1)
+        ious = bbox_overlaps(bboxes[backdrop_inds], bboxes)
+        for i, ind in enumerate(backdrop_inds):
+            if (ious[i, :ind] > self.nms_backdrop_iou_thr).any():
+                backdrop_inds[i] = -1
+        backdrop_inds = backdrop_inds[backdrop_inds > -1]
+        # old backdrops would be removed at first
+        self.backdrops.insert(
+            0,
+            dict(
+                bboxes=bboxes[backdrop_inds],
+                embeds=embeds[backdrop_inds],
+                labels=labels[backdrop_inds]))
+
+        # pop memo
+        invalid_ids = []
+        for k, v in self.tracks.items():
+            if frame_id - v['last_frame'] >= self.memo_tracklet_frames:
+                invalid_ids.append(k)
+        for invalid_id in invalid_ids:
+            self.tracks.pop(invalid_id)
+
+        if len(self.backdrops) > self.memo_backdrop_frames:
+            self.backdrops.pop()
+
+    @property
+    def memo(self) -> Tuple[Tensor, ...]:
+        """Get tracks memory."""
+        memo_embeds = []
+        memo_ids = []
+        memo_bboxes = []
+        memo_labels = []
+        # velocity of tracks
+        memo_vs = []
+        # get tracks
+        for k, v in self.tracks.items():
+            memo_bboxes.append(v['bbox'][None, :])
+            memo_embeds.append(v['embed'][None, :])
+            memo_ids.append(k)
+            memo_labels.append(v['label'].view(1, 1))
+            memo_vs.append(v['velocity'][None, :])
+        memo_ids = torch.tensor(memo_ids, dtype=torch.long).view(1, -1)
+        # get backdrops
+        for backdrop in self.backdrops:
+            backdrop_ids = torch.full((1, backdrop['embeds'].size(0)),
+                                      -1,
+                                      dtype=torch.long)
+            backdrop_vs = torch.zeros_like(backdrop['bboxes'])
+            memo_bboxes.append(backdrop['bboxes'])
+            memo_embeds.append(backdrop['embeds'])
+            memo_ids = torch.cat([memo_ids, backdrop_ids], dim=1)
+            memo_labels.append(backdrop['labels'][:, None])
+            memo_vs.append(backdrop_vs)
+
+        memo_bboxes = torch.cat(memo_bboxes, dim=0)
+        memo_embeds = torch.cat(memo_embeds, dim=0)
+        memo_labels = torch.cat(memo_labels, dim=0).squeeze(1)
+        memo_vs = torch.cat(memo_vs, dim=0)
+        return memo_bboxes, memo_labels, memo_embeds, memo_ids.squeeze(
+            0), memo_vs
+
+    def track(self,
+              model: torch.nn.Module,
+              img: torch.Tensor,
+              feats: List[torch.Tensor],
+              data_sample: TrackDataSample,
+              rescale=True,
+              **kwargs) -> InstanceData:
+        """Tracking forward function.
+
+        Args:
+            model (nn.Module): MOT model.
+            img (Tensor): of shape (T, C, H, W) encoding input image.
+                Typically these should be mean centered and std scaled.
+                The T denotes the number of key images and usually is 1 in
+                QDTrack method.
+            feats (list[Tensor]): Multi level feature maps of `img`.
+            data_sample (:obj:`TrackDataSample`): The data sample.
+                It includes information such as `pred_instances`.
+            rescale (bool, optional): If True, the bounding boxes should be
+                rescaled to fit the original scale of the image. Defaults to
+                True.
+
+        Returns:
+            :obj:`InstanceData`: Tracking results of the input images.
+            Each InstanceData usually contains ``bboxes``, ``labels``,
+            ``scores`` and ``instances_id``.
+        """
+        metainfo = data_sample.metainfo
+        bboxes = data_sample.pred_instances.bboxes
+        labels = data_sample.pred_instances.labels
+        scores = data_sample.pred_instances.scores
+
+        frame_id = metainfo.get('frame_id', -1)
+        # create pred_track_instances
+        pred_track_instances = InstanceData()
+
+        # return zero bboxes if there is no track targets
+        if bboxes.shape[0] == 0:
+            ids = torch.zeros_like(labels)
+            pred_track_instances = data_sample.pred_instances.clone()
+            pred_track_instances.instances_id = ids
+            return pred_track_instances
+
+        # get track feats
+        rescaled_bboxes = bboxes.clone()
+        if rescale:
+            scale_factor = rescaled_bboxes.new_tensor(
+                metainfo['scale_factor']).repeat((1, 2))
+            rescaled_bboxes = rescaled_bboxes * scale_factor
+        track_feats = model.track_head.predict(feats, [rescaled_bboxes])
+        # sort according to the object_score
+        _, inds = scores.sort(descending=True)
+        bboxes = bboxes[inds]
+        scores = scores[inds]
+        labels = labels[inds]
+        embeds = track_feats[inds, :]
+
+        # duplicate removal for potential backdrops and cross classes
+        valids = bboxes.new_ones((bboxes.size(0)))
+        ious = bbox_overlaps(bboxes, bboxes)
+        for i in range(1, bboxes.size(0)):
+            thr = self.nms_backdrop_iou_thr if scores[
+                i] < self.obj_score_thr else self.nms_class_iou_thr
+            if (ious[i, :i] > thr).any():
+                valids[i] = 0
+        valids = valids == 1
+        bboxes = bboxes[valids]
+        scores = scores[valids]
+        labels = labels[valids]
+        embeds = embeds[valids, :]
+
+        # init ids container
+        ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long)
+
+        # match if buffer is not empty
+        if bboxes.size(0) > 0 and not self.empty:
+            (memo_bboxes, memo_labels, memo_embeds, memo_ids,
+             memo_vs) = self.memo
+
+            if self.match_metric == 'bisoftmax':
+                feats = torch.mm(embeds, memo_embeds.t())
+                d2t_scores = feats.softmax(dim=1)
+                t2d_scores = feats.softmax(dim=0)
+                match_scores = (d2t_scores + t2d_scores) / 2
+            elif self.match_metric == 'softmax':
+                feats = torch.mm(embeds, memo_embeds.t())
+                match_scores = feats.softmax(dim=1)
+            elif self.match_metric == 'cosine':
+                match_scores = torch.mm(
+                    F.normalize(embeds, p=2, dim=1),
+                    F.normalize(memo_embeds, p=2, dim=1).t())
+            else:
+                raise NotImplementedError
+            # track with the same category
+            if self.with_cats:
+                cat_same = labels.view(-1, 1) == memo_labels.view(1, -1)
+                match_scores *= cat_same.float().to(match_scores.device)
+            # track according to match_scores
+            for i in range(bboxes.size(0)):
+                conf, memo_ind = torch.max(match_scores[i, :], dim=0)
+                id = memo_ids[memo_ind]
+                if conf > self.match_score_thr:
+                    if id > -1:
+                        # keep bboxes with high object score
+                        # and remove background bboxes
+                        if scores[i] > self.obj_score_thr:
+                            ids[i] = id
+                            match_scores[:i, memo_ind] = 0
+                            match_scores[i + 1:, memo_ind] = 0
+                        else:
+                            if conf > self.nms_conf_thr:
+                                ids[i] = -2
+        # initialize new tracks
+        new_inds = (ids == -1) & (scores > self.init_score_thr).cpu()
+        num_news = new_inds.sum()
+        ids[new_inds] = torch.arange(
+            self.num_tracks, self.num_tracks + num_news, dtype=torch.long)
+        self.num_tracks += num_news
+
+        self.update(ids, bboxes, embeds, labels, scores, frame_id)
+        tracklet_inds = ids > -1
+        # update pred_track_instances
+        pred_track_instances.bboxes = bboxes[tracklet_inds]
+        pred_track_instances.labels = labels[tracklet_inds]
+        pred_track_instances.scores = scores[tracklet_inds]
+        pred_track_instances.instances_id = ids[tracklet_inds]
+
+        return pred_track_instances
diff --git a/mmde/mmdet/models/trackers/sort_tracker.py b/mmde/mmdet/models/trackers/sort_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4a4fed92702f7d1ea66917a7157fcf5d0773a30
--- /dev/null
+++ b/mmde/mmdet/models/trackers/sort_tracker.py
@@ -0,0 +1,268 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+
+try:
+    import motmetrics
+    from motmetrics.lap import linear_sum_assignment
+except ImportError:
+    motmetrics = None
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import DetDataSample
+from mmdet.structures.bbox import bbox_overlaps, bbox_xyxy_to_cxcyah
+from mmdet.utils import OptConfigType
+from ..utils import imrenormalize
+from .base_tracker import BaseTracker
+
+
+@MODELS.register_module()
+class SORTTracker(BaseTracker):
+    """Tracker for SORT/DeepSORT.
+
+    Args:
+        obj_score_thr (float, optional): Threshold to filter the objects.
+            Defaults to 0.3.
+        motion (dict): Configuration of motion. Defaults to None.
+        reid (dict, optional): Configuration for the ReID model.
+            - num_samples (int, optional): Number of samples to calculate the
+                feature embeddings of a track. Default to 10.
+            - image_scale (tuple, optional): Input scale of the ReID model.
+                Default to (256, 128).
+            - img_norm_cfg (dict, optional): Configuration to normalize the
+                input. Default to None.
+            - match_score_thr (float, optional): Similarity threshold for the
+                matching process. Default to 2.0.
+        match_iou_thr (float, optional): Threshold of the IoU matching process.
+            Defaults to 0.7.
+        num_tentatives (int, optional): Number of continuous frames to confirm
+            a track. Defaults to 3.
+    """
+
+    def __init__(self,
+                 motion: Optional[dict] = None,
+                 obj_score_thr: float = 0.3,
+                 reid: dict = dict(
+                     num_samples=10,
+                     img_scale=(256, 128),
+                     img_norm_cfg=None,
+                     match_score_thr=2.0),
+                 match_iou_thr: float = 0.7,
+                 num_tentatives: int = 3,
+                 **kwargs):
+        if motmetrics is None:
+            raise RuntimeError('motmetrics is not installed,\
+                 please install it by: pip install motmetrics')
+        super().__init__(**kwargs)
+        if motion is not None:
+            self.motion = TASK_UTILS.build(motion)
+            assert self.motion is not None, 'SORT/Deep SORT need KalmanFilter'
+        self.obj_score_thr = obj_score_thr
+        self.reid = reid
+        self.match_iou_thr = match_iou_thr
+        self.num_tentatives = num_tentatives
+
+    @property
+    def confirmed_ids(self) -> List:
+        """Confirmed ids in the tracker."""
+        ids = [id for id, track in self.tracks.items() if not track.tentative]
+        return ids
+
+    def init_track(self, id: int, obj: Tuple[Tensor]) -> None:
+        """Initialize a track."""
+        super().init_track(id, obj)
+        self.tracks[id].tentative = True
+        bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1])  # size = (1, 4)
+        assert bbox.ndim == 2 and bbox.shape[0] == 1
+        bbox = bbox.squeeze(0).cpu().numpy()
+        self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate(
+            bbox)
+
+    def update_track(self, id: int, obj: Tuple[Tensor]) -> None:
+        """Update a track."""
+        super().update_track(id, obj)
+        if self.tracks[id].tentative:
+            if len(self.tracks[id]['bboxes']) >= self.num_tentatives:
+                self.tracks[id].tentative = False
+        bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1])  # size = (1, 4)
+        assert bbox.ndim == 2 and bbox.shape[0] == 1
+        bbox = bbox.squeeze(0).cpu().numpy()
+        self.tracks[id].mean, self.tracks[id].covariance = self.kf.update(
+            self.tracks[id].mean, self.tracks[id].covariance, bbox)
+
+    def pop_invalid_tracks(self, frame_id: int) -> None:
+        """Pop out invalid tracks."""
+        invalid_ids = []
+        for k, v in self.tracks.items():
+            # case1: disappeared frames >= self.num_frames_retrain
+            case1 = frame_id - v['frame_ids'][-1] >= self.num_frames_retain
+            # case2: tentative tracks but not matched in this frame
+            case2 = v.tentative and v['frame_ids'][-1] != frame_id
+            if case1 or case2:
+                invalid_ids.append(k)
+        for invalid_id in invalid_ids:
+            self.tracks.pop(invalid_id)
+
+    def track(self,
+              model: torch.nn.Module,
+              img: Tensor,
+              data_sample: DetDataSample,
+              data_preprocessor: OptConfigType = None,
+              rescale: bool = False,
+              **kwargs) -> InstanceData:
+        """Tracking forward function.
+
+        Args:
+            model (nn.Module): MOT model.
+            img (Tensor): of shape (T, C, H, W) encoding input image.
+                Typically these should be mean centered and std scaled.
+                The T denotes the number of key images and usually is 1 in
+                SORT method.
+            data_sample (:obj:`TrackDataSample`): The data sample.
+                It includes information such as `pred_det_instances`.
+            data_preprocessor (dict or ConfigDict, optional): The pre-process
+               config of :class:`TrackDataPreprocessor`.  it usually includes,
+                ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+            rescale (bool, optional): If True, the bounding boxes should be
+                rescaled to fit the original scale of the image. Defaults to
+                False.
+
+        Returns:
+            :obj:`InstanceData`: Tracking results of the input images.
+            Each InstanceData usually contains ``bboxes``, ``labels``,
+            ``scores`` and ``instances_id``.
+        """
+        metainfo = data_sample.metainfo
+        bboxes = data_sample.pred_instances.bboxes
+        labels = data_sample.pred_instances.labels
+        scores = data_sample.pred_instances.scores
+
+        frame_id = metainfo.get('frame_id', -1)
+        if frame_id == 0:
+            self.reset()
+        if not hasattr(self, 'kf'):
+            self.kf = self.motion
+
+        if self.with_reid:
+            if self.reid.get('img_norm_cfg', False):
+                img_norm_cfg = dict(
+                    mean=data_preprocessor['mean'],
+                    std=data_preprocessor['std'],
+                    to_bgr=data_preprocessor['rgb_to_bgr'])
+                reid_img = imrenormalize(img, img_norm_cfg,
+                                         self.reid['img_norm_cfg'])
+            else:
+                reid_img = img.clone()
+
+        valid_inds = scores > self.obj_score_thr
+        bboxes = bboxes[valid_inds]
+        labels = labels[valid_inds]
+        scores = scores[valid_inds]
+
+        if self.empty or bboxes.size(0) == 0:
+            num_new_tracks = bboxes.size(0)
+            ids = torch.arange(
+                self.num_tracks,
+                self.num_tracks + num_new_tracks,
+                dtype=torch.long).to(bboxes.device)
+            self.num_tracks += num_new_tracks
+            if self.with_reid:
+                crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(),
+                                       rescale)
+                if crops.size(0) > 0:
+                    embeds = model.reid(crops, mode='tensor')
+                else:
+                    embeds = crops.new_zeros((0, model.reid.head.out_channels))
+        else:
+            ids = torch.full((bboxes.size(0), ), -1,
+                             dtype=torch.long).to(bboxes.device)
+
+            # motion
+            self.tracks, costs = self.motion.track(self.tracks,
+                                                   bbox_xyxy_to_cxcyah(bboxes))
+
+            active_ids = self.confirmed_ids
+            if self.with_reid:
+                crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(),
+                                       rescale)
+                embeds = model.reid(crops, mode='tensor')
+
+                # reid
+                if len(active_ids) > 0:
+                    track_embeds = self.get(
+                        'embeds',
+                        active_ids,
+                        self.reid.get('num_samples', None),
+                        behavior='mean')
+                    reid_dists = torch.cdist(track_embeds, embeds)
+
+                    # support multi-class association
+                    track_labels = torch.tensor([
+                        self.tracks[id]['labels'][-1] for id in active_ids
+                    ]).to(bboxes.device)
+                    cate_match = labels[None, :] == track_labels[:, None]
+                    cate_cost = (1 - cate_match.int()) * 1e6
+                    reid_dists = (reid_dists + cate_cost).cpu().numpy()
+
+                    valid_inds = [list(self.ids).index(_) for _ in active_ids]
+                    reid_dists[~np.isfinite(costs[valid_inds, :])] = np.nan
+
+                    row, col = linear_sum_assignment(reid_dists)
+                    for r, c in zip(row, col):
+                        dist = reid_dists[r, c]
+                        if not np.isfinite(dist):
+                            continue
+                        if dist <= self.reid['match_score_thr']:
+                            ids[c] = active_ids[r]
+
+            active_ids = [
+                id for id in self.ids if id not in ids
+                and self.tracks[id].frame_ids[-1] == frame_id - 1
+            ]
+            if len(active_ids) > 0:
+                active_dets = torch.nonzero(ids == -1).squeeze(1)
+                track_bboxes = self.get('bboxes', active_ids)
+                ious = bbox_overlaps(track_bboxes, bboxes[active_dets])
+
+                # support multi-class association
+                track_labels = torch.tensor([
+                    self.tracks[id]['labels'][-1] for id in active_ids
+                ]).to(bboxes.device)
+                cate_match = labels[None, active_dets] == track_labels[:, None]
+                cate_cost = (1 - cate_match.int()) * 1e6
+
+                dists = (1 - ious + cate_cost).cpu().numpy()
+
+                row, col = linear_sum_assignment(dists)
+                for r, c in zip(row, col):
+                    dist = dists[r, c]
+                    if dist < 1 - self.match_iou_thr:
+                        ids[active_dets[c]] = active_ids[r]
+
+            new_track_inds = ids == -1
+            ids[new_track_inds] = torch.arange(
+                self.num_tracks,
+                self.num_tracks + new_track_inds.sum(),
+                dtype=torch.long).to(bboxes.device)
+            self.num_tracks += new_track_inds.sum()
+
+        self.update(
+            ids=ids,
+            bboxes=bboxes,
+            scores=scores,
+            labels=labels,
+            embeds=embeds if self.with_reid else None,
+            frame_ids=frame_id)
+
+        # update pred_track_instances
+        pred_track_instances = InstanceData()
+        pred_track_instances.bboxes = bboxes
+        pred_track_instances.labels = labels
+        pred_track_instances.scores = scores
+        pred_track_instances.instances_id = ids
+
+        return pred_track_instances
diff --git a/mmde/mmdet/models/trackers/strongsort_tracker.py b/mmde/mmdet/models/trackers/strongsort_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d7075701bc3205b9ea30f03790cfa1c42a97822
--- /dev/null
+++ b/mmde/mmdet/models/trackers/strongsort_tracker.py
@@ -0,0 +1,273 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+
+try:
+    import motmetrics
+    from motmetrics.lap import linear_sum_assignment
+except ImportError:
+    motmetrics = None
+from torch import Tensor
+
+from mmdet.models.utils import imrenormalize
+from mmdet.registry import MODELS
+from mmdet.structures import TrackDataSample
+from mmdet.structures.bbox import bbox_overlaps, bbox_xyxy_to_cxcyah
+from mmdet.utils import OptConfigType
+from .sort_tracker import SORTTracker
+
+
+def cosine_distance(x: Tensor, y: Tensor) -> np.ndarray:
+    """compute the cosine distance.
+
+    Args:
+        x (Tensor): embeddings with shape (N,C).
+        y (Tensor): embeddings with shape (M,C).
+
+    Returns:
+        ndarray: cosine distance with shape (N,M).
+    """
+    x = x.cpu().numpy()
+    y = y.cpu().numpy()
+    x = x / np.linalg.norm(x, axis=1, keepdims=True)
+    y = y / np.linalg.norm(y, axis=1, keepdims=True)
+    dists = 1. - np.dot(x, y.T)
+    return dists
+
+
+@MODELS.register_module()
+class StrongSORTTracker(SORTTracker):
+    """Tracker for StrongSORT.
+
+    Args:
+        obj_score_thr (float, optional): Threshold to filter the objects.
+            Defaults to 0.6.
+        motion (dict): Configuration of motion. Defaults to None.
+        reid (dict, optional): Configuration for the ReID model.
+            - num_samples (int, optional): Number of samples to calculate the
+                feature embeddings of a track. Default to None.
+            - image_scale (tuple, optional): Input scale of the ReID model.
+                Default to (256, 128).
+            - img_norm_cfg (dict, optional): Configuration to normalize the
+                input. Default to None.
+            - match_score_thr (float, optional): Similarity threshold for the
+                matching process. Default to 0.3.
+            - motion_weight (float, optional): the weight of the motion cost.
+                Defaults to 0.02.
+        match_iou_thr (float, optional): Threshold of the IoU matching process.
+            Defaults to 0.7.
+        num_tentatives (int, optional): Number of continuous frames to confirm
+            a track. Defaults to 2.
+    """
+
+    def __init__(self,
+                 motion: Optional[dict] = None,
+                 obj_score_thr: float = 0.6,
+                 reid: dict = dict(
+                     num_samples=None,
+                     img_scale=(256, 128),
+                     img_norm_cfg=None,
+                     match_score_thr=0.3,
+                     motion_weight=0.02),
+                 match_iou_thr: float = 0.7,
+                 num_tentatives: int = 2,
+                 **kwargs):
+        if motmetrics is None:
+            raise RuntimeError('motmetrics is not installed,\
+                 please install it by: pip install motmetrics')
+        super().__init__(motion, obj_score_thr, reid, match_iou_thr,
+                         num_tentatives, **kwargs)
+
+    def update_track(self, id: int, obj: Tuple[Tensor]) -> None:
+        """Update a track."""
+        for k, v in zip(self.memo_items, obj):
+            v = v[None]
+            if self.momentums is not None and k in self.momentums:
+                m = self.momentums[k]
+                self.tracks[id][k] = (1 - m) * self.tracks[id][k] + m * v
+            else:
+                self.tracks[id][k].append(v)
+
+        if self.tracks[id].tentative:
+            if len(self.tracks[id]['bboxes']) >= self.num_tentatives:
+                self.tracks[id].tentative = False
+        bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1])  # size = (1, 4)
+        assert bbox.ndim == 2 and bbox.shape[0] == 1
+        bbox = bbox.squeeze(0).cpu().numpy()
+        score = float(self.tracks[id].scores[-1].cpu())
+        self.tracks[id].mean, self.tracks[id].covariance = self.kf.update(
+            self.tracks[id].mean, self.tracks[id].covariance, bbox, score)
+
+    def track(self,
+              model: torch.nn.Module,
+              img: Tensor,
+              data_sample: TrackDataSample,
+              data_preprocessor: OptConfigType = None,
+              rescale: bool = False,
+              **kwargs) -> InstanceData:
+        """Tracking forward function.
+
+        Args:
+            model (nn.Module): MOT model.
+            img (Tensor): of shape (T, C, H, W) encoding input image.
+                Typically these should be mean centered and std scaled.
+                The T denotes the number of key images and usually is 1 in
+                SORT method.
+            feats (list[Tensor]): Multi level feature maps of `img`.
+            data_sample (:obj:`TrackDataSample`): The data sample.
+                It includes information such as `pred_det_instances`.
+            data_preprocessor (dict or ConfigDict, optional): The pre-process
+               config of :class:`TrackDataPreprocessor`.  it usually includes,
+                ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+            rescale (bool, optional): If True, the bounding boxes should be
+                rescaled to fit the original scale of the image. Defaults to
+                False.
+
+        Returns:
+            :obj:`InstanceData`: Tracking results of the input images.
+            Each InstanceData usually contains ``bboxes``, ``labels``,
+            ``scores`` and ``instances_id``.
+        """
+        metainfo = data_sample.metainfo
+        bboxes = data_sample.pred_instances.bboxes
+        labels = data_sample.pred_instances.labels
+        scores = data_sample.pred_instances.scores
+
+        frame_id = metainfo.get('frame_id', -1)
+        if frame_id == 0:
+            self.reset()
+        if not hasattr(self, 'kf'):
+            self.kf = self.motion
+
+        if self.with_reid:
+            if self.reid.get('img_norm_cfg', False):
+                img_norm_cfg = dict(
+                    mean=data_preprocessor.get('mean', [0, 0, 0]),
+                    std=data_preprocessor.get('std', [1, 1, 1]),
+                    to_bgr=data_preprocessor.get('rgb_to_bgr', False))
+                reid_img = imrenormalize(img, img_norm_cfg,
+                                         self.reid['img_norm_cfg'])
+            else:
+                reid_img = img.clone()
+
+        valid_inds = scores > self.obj_score_thr
+        bboxes = bboxes[valid_inds]
+        labels = labels[valid_inds]
+        scores = scores[valid_inds]
+
+        if self.empty or bboxes.size(0) == 0:
+            num_new_tracks = bboxes.size(0)
+            ids = torch.arange(
+                self.num_tracks,
+                self.num_tracks + num_new_tracks,
+                dtype=torch.long).to(bboxes.device)
+            self.num_tracks += num_new_tracks
+            if self.with_reid:
+                crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(),
+                                       rescale)
+                if crops.size(0) > 0:
+                    embeds = model.reid(crops, mode='tensor')
+                else:
+                    embeds = crops.new_zeros((0, model.reid.head.out_channels))
+        else:
+            ids = torch.full((bboxes.size(0), ), -1,
+                             dtype=torch.long).to(bboxes.device)
+
+            # motion
+            if model.with_cmc:
+                num_samples = 1
+                self.tracks = model.cmc.track(self.last_img, img, self.tracks,
+                                              num_samples, frame_id, metainfo)
+
+            self.tracks, motion_dists = self.motion.track(
+                self.tracks, bbox_xyxy_to_cxcyah(bboxes))
+
+            active_ids = self.confirmed_ids
+            if self.with_reid:
+                crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(),
+                                       rescale)
+                embeds = model.reid(crops, mode='tensor')
+
+                # reid
+                if len(active_ids) > 0:
+                    track_embeds = self.get(
+                        'embeds',
+                        active_ids,
+                        self.reid.get('num_samples', None),
+                        behavior='mean')
+                    reid_dists = cosine_distance(track_embeds, embeds)
+                    valid_inds = [list(self.ids).index(_) for _ in active_ids]
+                    reid_dists[~np.isfinite(motion_dists[
+                        valid_inds, :])] = np.nan
+
+                    weight_motion = self.reid.get('motion_weight')
+                    match_dists = (1 - weight_motion) * reid_dists + \
+                        weight_motion * motion_dists[valid_inds]
+
+                    # support multi-class association
+                    track_labels = torch.tensor([
+                        self.tracks[id]['labels'][-1] for id in active_ids
+                    ]).to(bboxes.device)
+                    cate_match = labels[None, :] == track_labels[:, None]
+                    cate_cost = ((1 - cate_match.int()) * 1e6).cpu().numpy()
+                    match_dists = match_dists + cate_cost
+
+                    row, col = linear_sum_assignment(match_dists)
+                    for r, c in zip(row, col):
+                        dist = match_dists[r, c]
+                        if not np.isfinite(dist):
+                            continue
+                        if dist <= self.reid['match_score_thr']:
+                            ids[c] = active_ids[r]
+
+            active_ids = [
+                id for id in self.ids if id not in ids
+                and self.tracks[id].frame_ids[-1] == frame_id - 1
+            ]
+            if len(active_ids) > 0:
+                active_dets = torch.nonzero(ids == -1).squeeze(1)
+                track_bboxes = self.get('bboxes', active_ids)
+                ious = bbox_overlaps(track_bboxes, bboxes[active_dets])
+
+                # support multi-class association
+                track_labels = torch.tensor([
+                    self.tracks[id]['labels'][-1] for id in active_ids
+                ]).to(bboxes.device)
+                cate_match = labels[None, active_dets] == track_labels[:, None]
+                cate_cost = (1 - cate_match.int()) * 1e6
+
+                dists = (1 - ious + cate_cost).cpu().numpy()
+
+                row, col = linear_sum_assignment(dists)
+                for r, c in zip(row, col):
+                    dist = dists[r, c]
+                    if dist < 1 - self.match_iou_thr:
+                        ids[active_dets[c]] = active_ids[r]
+
+            new_track_inds = ids == -1
+            ids[new_track_inds] = torch.arange(
+                self.num_tracks,
+                self.num_tracks + new_track_inds.sum(),
+                dtype=torch.long).to(bboxes.device)
+            self.num_tracks += new_track_inds.sum()
+
+        self.update(
+            ids=ids,
+            bboxes=bboxes,
+            scores=scores,
+            labels=labels,
+            embeds=embeds if self.with_reid else None,
+            frame_ids=frame_id)
+        self.last_img = img
+
+        # update pred_track_instances
+        pred_track_instances = InstanceData()
+        pred_track_instances.bboxes = bboxes
+        pred_track_instances.labels = labels
+        pred_track_instances.scores = scores
+        pred_track_instances.instances_id = ids
+
+        return pred_track_instances
diff --git a/mmde/mmdet/models/tracking_heads/__init__.py b/mmde/mmdet/models/tracking_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd1f0561cc076f2a603a64eb479cc6de0372a438
--- /dev/null
+++ b/mmde/mmdet/models/tracking_heads/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .mask2former_track_head import Mask2FormerTrackHead
+from .quasi_dense_embed_head import QuasiDenseEmbedHead
+from .quasi_dense_track_head import QuasiDenseTrackHead
+from .roi_embed_head import RoIEmbedHead
+from .roi_track_head import RoITrackHead
+
+__all__ = [
+    'QuasiDenseEmbedHead', 'QuasiDenseTrackHead', 'Mask2FormerTrackHead',
+    'RoIEmbedHead', 'RoITrackHead'
+]
diff --git a/mmde/mmdet/models/tracking_heads/mask2former_track_head.py b/mmde/mmdet/models/tracking_heads/mask2former_track_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0877241bc33fcd1ef8f7ed154d503d9dbd8ab938
--- /dev/null
+++ b/mmde/mmdet/models/tracking_heads/mask2former_track_head.py
@@ -0,0 +1,729 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from collections import defaultdict
+from typing import Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d
+from mmcv.ops import point_sample
+from mmengine.model import ModuleList
+from mmengine.model.weight_init import caffe2_xavier_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.dense_heads import AnchorFreeHead, MaskFormerHead
+from mmdet.models.utils import get_uncertain_point_coords_with_randomness
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import TrackDataSample, TrackSampleList
+from mmdet.structures.mask import mask2bbox
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptMultiConfig, reduce_mean)
+from ..layers import Mask2FormerTransformerDecoder
+
+
+@MODELS.register_module()
+class Mask2FormerTrackHead(MaskFormerHead):
+    """Implements the Mask2Former head.
+
+    See `Masked-attention Mask Transformer for Universal Image
+    Segmentation <https://arxiv.org/pdf/2112.01527>`_ for details.
+
+    Args:
+        in_channels (list[int]): Number of channels in the input feature map.
+        feat_channels (int): Number of channels for features.
+        out_channels (int): Number of channels for output.
+        num_classes (int): Number of VIS classes.
+        num_queries (int): Number of query in Transformer decoder.
+            Defaults to 100.
+        num_transformer_feat_level (int): Number of feats levels.
+            Defaults to 3.
+        pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel
+            decoder.
+        enforce_decoder_input_project (bool, optional): Whether to add
+            a layer to change the embed_dim of transformer encoder in
+            pixel decoder to the embed_dim of transformer decoder.
+            Defaults to False.
+        transformer_decoder (:obj:`ConfigDict` or dict): Config for
+            transformer decoder.
+        positional_encoding (:obj:`ConfigDict` or dict): Config for
+            transformer decoder position encoding.
+            Defaults to `SinePositionalEncoding3D`.
+        loss_cls (:obj:`ConfigDict` or dict): Config of the classification
+            loss. Defaults to `CrossEntropyLoss`.
+        loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss.
+            Defaults to 'CrossEntropyLoss'.
+        loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss.
+            Defaults to 'DiceLoss'.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
+            Mask2Former head. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            Mask2Former head. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: List[int],
+                 feat_channels: int,
+                 out_channels: int,
+                 num_classes: int,
+                 num_frames: int = 2,
+                 num_queries: int = 100,
+                 num_transformer_feat_level: int = 3,
+                 pixel_decoder: ConfigType = ...,
+                 enforce_decoder_input_project: bool = False,
+                 transformer_decoder: ConfigType = ...,
+                 positional_encoding: ConfigType = dict(
+                     num_feats=128, normalize=True),
+                 loss_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=2.0,
+                     reduction='mean',
+                     class_weight=[1.0] * 133 + [0.1]),
+                 loss_mask: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     reduction='mean',
+                     loss_weight=5.0),
+                 loss_dice: ConfigType = dict(
+                     type='DiceLoss',
+                     use_sigmoid=True,
+                     activate=True,
+                     reduction='mean',
+                     naive_dice=True,
+                     eps=1.0,
+                     loss_weight=5.0),
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        super(AnchorFreeHead, self).__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.num_frames = num_frames
+        self.num_queries = num_queries
+        self.num_transformer_feat_level = num_transformer_feat_level
+        self.num_transformer_feat_level = num_transformer_feat_level
+        self.num_heads = transformer_decoder.layer_cfg.cross_attn_cfg.num_heads
+        self.num_transformer_decoder_layers = transformer_decoder.num_layers
+        assert pixel_decoder.encoder.layer_cfg. \
+            self_attn_cfg.num_levels == num_transformer_feat_level
+        pixel_decoder_ = copy.deepcopy(pixel_decoder)
+        pixel_decoder_.update(
+            in_channels=in_channels,
+            feat_channels=feat_channels,
+            out_channels=out_channels)
+        self.pixel_decoder = MODELS.build(pixel_decoder_)
+        self.transformer_decoder = Mask2FormerTransformerDecoder(
+            **transformer_decoder)
+        self.decoder_embed_dims = self.transformer_decoder.embed_dims
+
+        self.decoder_input_projs = ModuleList()
+        # from low resolution to high resolution
+        for _ in range(num_transformer_feat_level):
+            if (self.decoder_embed_dims != feat_channels
+                    or enforce_decoder_input_project):
+                self.decoder_input_projs.append(
+                    Conv2d(
+                        feat_channels, self.decoder_embed_dims, kernel_size=1))
+            else:
+                self.decoder_input_projs.append(nn.Identity())
+        self.decoder_positional_encoding = MODELS.build(positional_encoding)
+        self.query_embed = nn.Embedding(self.num_queries, feat_channels)
+        self.query_feat = nn.Embedding(self.num_queries, feat_channels)
+        # from low resolution to high resolution
+        self.level_embed = nn.Embedding(self.num_transformer_feat_level,
+                                        feat_channels)
+
+        self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
+        self.mask_embed = nn.Sequential(
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
+            nn.Linear(feat_channels, out_channels))
+
+        self.test_cfg = test_cfg
+        self.train_cfg = train_cfg
+        if train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg.assigner)
+            self.sampler = TASK_UTILS.build(
+                # self.train_cfg.sampler, default_args=dict(context=self))
+                self.train_cfg['sampler'],
+                default_args=dict(context=self))
+            self.num_points = self.train_cfg.get('num_points', 12544)
+            self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0)
+            self.importance_sample_ratio = self.train_cfg.get(
+                'importance_sample_ratio', 0.75)
+
+        self.class_weight = loss_cls.class_weight
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_mask = MODELS.build(loss_mask)
+        self.loss_dice = MODELS.build(loss_dice)
+
+    def init_weights(self) -> None:
+        for m in self.decoder_input_projs:
+            if isinstance(m, Conv2d):
+                caffe2_xavier_init(m, bias=0)
+
+        self.pixel_decoder.init_weights()
+
+        for p in self.transformer_decoder.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_normal_(p)
+
+    def preprocess_gt(self, batch_gt_instances: InstanceList) -> InstanceList:
+        """Preprocess the ground truth for all images.
+
+        It aims to reorganize the `gt`. For example, in the
+        `batch_data_sample.gt_instances.mask`, its shape is
+        `(all_num_gts, h, w)`, but we don't know each gt belongs to which `img`
+        (assume `num_frames` is 2). So, this func used to reshape the `gt_mask`
+        to `(num_gts_per_img, num_frames, h, w)`. In addition, we can't
+        guarantee that the number of instances in these two images is equal,
+        so `-1` refers to nonexistent instances.
+
+        Args:
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``labels``, each is
+                ground truth labels of each bbox, with shape (num_gts, )
+                and ``masks``, each is ground truth masks of each instances
+                of an image, shape (num_gts, h, w).
+
+        Returns:
+            list[obj:`InstanceData`]: each contains the following keys
+
+                - labels (Tensor): Ground truth class indices\
+                    for an image, with shape (n, ), n is the sum of\
+                    number of stuff type and number of instance in an image.
+                - masks (Tensor): Ground truth mask for a\
+                    image, with shape (n, t, h, w).
+        """
+        final_batch_gt_instances = []
+        batch_size = len(batch_gt_instances) // self.num_frames
+        for batch_idx in range(batch_size):
+            pair_gt_insatences = batch_gt_instances[batch_idx *
+                                                    self.num_frames:batch_idx *
+                                                    self.num_frames +
+                                                    self.num_frames]
+
+            assert len(
+                pair_gt_insatences
+            ) > 1, f'mask2former for vis need multi frames to train, \
+                but you only use {len(pair_gt_insatences)} frames'
+
+            _device = pair_gt_insatences[0].labels.device
+
+            for gt_instances in pair_gt_insatences:
+                gt_instances.masks = gt_instances.masks.to_tensor(
+                    dtype=torch.bool, device=_device)
+            all_ins_id = torch.cat([
+                gt_instances.instances_ids
+                for gt_instances in pair_gt_insatences
+            ])
+            all_ins_id = all_ins_id.unique().tolist()
+            map_ins_id = dict()
+            for i, ins_id in enumerate(all_ins_id):
+                map_ins_id[ins_id] = i
+
+            num_instances = len(all_ins_id)
+            mask_shape = [
+                num_instances, self.num_frames,
+                pair_gt_insatences[0].masks.shape[1],
+                pair_gt_insatences[0].masks.shape[2]
+            ]
+            gt_masks_per_video = torch.zeros(
+                mask_shape, dtype=torch.bool, device=_device)
+            gt_ids_per_video = torch.full((num_instances, self.num_frames),
+                                          -1,
+                                          dtype=torch.long,
+                                          device=_device)
+            gt_labels_per_video = torch.full((num_instances, ),
+                                             -1,
+                                             dtype=torch.long,
+                                             device=_device)
+
+            for frame_id in range(self.num_frames):
+                cur_frame_gts = pair_gt_insatences[frame_id]
+                ins_ids = cur_frame_gts.instances_ids.tolist()
+                for i, id in enumerate(ins_ids):
+                    gt_masks_per_video[map_ins_id[id],
+                                       frame_id, :, :] = cur_frame_gts.masks[i]
+                    gt_ids_per_video[map_ins_id[id],
+                                     frame_id] = cur_frame_gts.instances_ids[i]
+                    gt_labels_per_video[
+                        map_ins_id[id]] = cur_frame_gts.labels[i]
+
+            tmp_instances = InstanceData(
+                labels=gt_labels_per_video,
+                masks=gt_masks_per_video.long(),
+                instances_id=gt_ids_per_video)
+            final_batch_gt_instances.append(tmp_instances)
+
+        return final_batch_gt_instances
+
+    def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict) -> Tuple[Tensor]:
+        """Compute classification and mask targets for one image.
+
+        Args:
+            cls_score (Tensor): Mask score logits from a single decoder layer
+                for one image. Shape (num_queries, cls_out_channels).
+            mask_pred (Tensor): Mask logits for a single decoder layer for one
+                image. Shape (num_queries, num_frames, h, w).
+            gt_instances (:obj:`InstanceData`): It contains ``labels`` and
+                ``masks``.
+            img_meta (dict): Image informtation.
+
+        Returns:
+            tuple[Tensor]: A tuple containing the following for one image.
+
+                - labels (Tensor): Labels of each image. \
+                    shape (num_queries, ).
+                - label_weights (Tensor): Label weights of each image. \
+                    shape (num_queries, ).
+                - mask_targets (Tensor): Mask targets of each image. \
+                    shape (num_queries, num_frames, h, w).
+                - mask_weights (Tensor): Mask weights of each image. \
+                    shape (num_queries, ).
+                - pos_inds (Tensor): Sampled positive indices for each \
+                    image.
+                - neg_inds (Tensor): Sampled negative indices for each \
+                    image.
+                - sampling_result (:obj:`SamplingResult`): Sampling results.
+        """
+        # (num_gts, )
+        gt_labels = gt_instances.labels
+        # (num_gts, num_frames, h, w)
+        gt_masks = gt_instances.masks
+        # sample points
+        num_queries = cls_score.shape[0]
+        num_gts = gt_labels.shape[0]
+
+        point_coords = torch.rand((1, self.num_points, 2),
+                                  device=cls_score.device)
+
+        # shape (num_queries, num_points)
+        mask_points_pred = point_sample(mask_pred,
+                                        point_coords.repeat(num_queries, 1,
+                                                            1)).flatten(1)
+        # shape (num_gts, num_points)
+        gt_points_masks = point_sample(gt_masks.float(),
+                                       point_coords.repeat(num_gts, 1,
+                                                           1)).flatten(1)
+
+        sampled_gt_instances = InstanceData(
+            labels=gt_labels, masks=gt_points_masks)
+        sampled_pred_instances = InstanceData(
+            scores=cls_score, masks=mask_points_pred)
+        # assign and sample
+        assign_result = self.assigner.assign(
+            pred_instances=sampled_pred_instances,
+            gt_instances=sampled_gt_instances,
+            img_meta=img_meta)
+        pred_instances = InstanceData(scores=cls_score, masks=mask_pred)
+        sampling_result = self.sampler.sample(
+            assign_result=assign_result,
+            pred_instances=pred_instances,
+            gt_instances=gt_instances)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label target
+        labels = gt_labels.new_full((self.num_queries, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_labels.new_ones((self.num_queries, ))
+
+        # mask target
+        mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds]
+        mask_weights = mask_pred.new_zeros((self.num_queries, ))
+        mask_weights[pos_inds] = 1.0
+
+        return (labels, label_weights, mask_targets, mask_weights, pos_inds,
+                neg_inds, sampling_result)
+
+    def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor,
+                             batch_gt_instances: List[InstanceData],
+                             batch_img_metas: List[dict]) -> Tuple[Tensor]:
+        """Loss function for outputs from a single decoder layer.
+
+        Args:
+            cls_scores (Tensor): Mask score logits from a single decoder layer
+                for all images. Shape (batch_size, num_queries,
+                cls_out_channels). Note `cls_out_channels` should include
+                background.
+            mask_preds (Tensor): Mask logits for a pixel decoder for all
+                images. Shape (batch_size, num_queries, num_frames,h, w).
+            batch_gt_instances (list[obj:`InstanceData`]): each contains
+                ``labels`` and ``masks``.
+            batch_img_metas (list[dict]): List of image meta information.
+
+        Returns:
+            tuple[Tensor]: Loss components for outputs from a single \
+                decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
+        (labels_list, label_weights_list, mask_targets_list, mask_weights_list,
+         avg_factor) = self.get_targets(cls_scores_list, mask_preds_list,
+                                        batch_gt_instances, batch_img_metas)
+        # shape (batch_size, num_queries)
+        labels = torch.stack(labels_list, dim=0)
+        # shape (batch_size, num_queries)
+        label_weights = torch.stack(label_weights_list, dim=0)
+        # shape (num_total_gts, num_frames, h, w)
+        mask_targets = torch.cat(mask_targets_list, dim=0)
+        # shape (batch_size, num_queries)
+        mask_weights = torch.stack(mask_weights_list, dim=0)
+
+        # classfication loss
+        # shape (batch_size * num_queries, )
+        cls_scores = cls_scores.flatten(0, 1)
+        labels = labels.flatten(0, 1)
+        label_weights = label_weights.flatten(0, 1)
+
+        class_weight = cls_scores.new_tensor(self.class_weight)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            label_weights,
+            avg_factor=class_weight[labels].sum())
+
+        num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor]))
+        num_total_masks = max(num_total_masks, 1)
+
+        # extract positive ones
+        # shape (batch_size, num_queries, num_frames, h, w)
+        # -> (num_total_gts, num_frames, h, w)
+        mask_preds = mask_preds[mask_weights > 0]
+
+        if mask_targets.shape[0] == 0:
+            # zero match
+            loss_dice = mask_preds.sum()
+            loss_mask = mask_preds.sum()
+            return loss_cls, loss_mask, loss_dice
+
+        with torch.no_grad():
+            points_coords = get_uncertain_point_coords_with_randomness(
+                mask_preds.flatten(0, 1).unsqueeze(1), None, self.num_points,
+                self.oversample_ratio, self.importance_sample_ratio)
+            # shape (num_total_gts * num_frames, h, w) ->
+            # (num_total_gts, num_points)
+            mask_point_targets = point_sample(
+                mask_targets.flatten(0, 1).unsqueeze(1).float(),
+                points_coords).squeeze(1)
+        # shape (num_total_gts * num_frames, num_points)
+        mask_point_preds = point_sample(
+            mask_preds.flatten(0, 1).unsqueeze(1), points_coords).squeeze(1)
+
+        # dice loss
+        loss_dice = self.loss_dice(
+            mask_point_preds, mask_point_targets, avg_factor=num_total_masks)
+
+        # mask loss
+        # shape (num_total_gts * num_frames, num_points) ->
+        # (num_total_gts * num_frames * num_points, )
+        mask_point_preds = mask_point_preds.reshape(-1)
+        # shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
+        mask_point_targets = mask_point_targets.reshape(-1)
+        loss_mask = self.loss_mask(
+            mask_point_preds,
+            mask_point_targets,
+            avg_factor=num_total_masks * self.num_points / self.num_frames)
+
+        return loss_cls, loss_mask, loss_dice
+
+    def _forward_head(
+        self, decoder_out: Tensor, mask_feature: Tensor,
+        attn_mask_target_size: Tuple[int,
+                                     int]) -> Tuple[Tensor, Tensor, Tensor]:
+        """Forward for head part which is called after every decoder layer.
+
+        Args:
+            decoder_out (Tensor): in shape (num_queries, batch_size, c).
+            mask_feature (Tensor): in shape (batch_size, t, c, h, w).
+            attn_mask_target_size (tuple[int, int]): target attention
+                mask size.
+
+        Returns:
+            tuple: A tuple contain three elements.
+
+                - cls_pred (Tensor): Classification scores in shape \
+                    (batch_size, num_queries, cls_out_channels). \
+                    Note `cls_out_channels` should include background.
+                - mask_pred (Tensor): Mask scores in shape \
+                    (batch_size, num_queries,h, w).
+                - attn_mask (Tensor): Attention mask in shape \
+                    (batch_size * num_heads, num_queries, h, w).
+        """
+        decoder_out = self.transformer_decoder.post_norm(decoder_out)
+        cls_pred = self.cls_embed(decoder_out)
+        mask_embed = self.mask_embed(decoder_out)
+
+        # shape (batch_size, num_queries, t, h, w)
+        mask_pred = torch.einsum('bqc,btchw->bqthw', mask_embed, mask_feature)
+        b, q, t, _, _ = mask_pred.shape
+
+        attn_mask = F.interpolate(
+            mask_pred.flatten(0, 1),
+            attn_mask_target_size,
+            mode='bilinear',
+            align_corners=False).view(b, q, t, attn_mask_target_size[0],
+                                      attn_mask_target_size[1])
+
+        # shape (batch_size, num_queries, t, h, w) ->
+        # (batch_size, num_queries, t*h*w) ->
+        # (batch_size, num_head, num_queries, t*h*w) ->
+        # (batch_size*num_head, num_queries, t*h*w)
+        attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat(
+            (1, self.num_heads, 1, 1)).flatten(0, 1)
+        attn_mask = attn_mask.sigmoid() < 0.5
+        attn_mask = attn_mask.detach()
+
+        return cls_pred, mask_pred, attn_mask
+
+    def forward(
+            self, x: List[Tensor], data_samples: TrackDataSample
+    ) -> Tuple[List[Tensor], List[Tensor]]:
+        """Forward function.
+
+        Args:
+            x (list[Tensor]): Multi scale Features from the
+                upstream network, each is a 4D-tensor.
+            data_samples (List[:obj:`TrackDataSample`]): The Data
+                Samples. It usually includes information such as `gt_instance`.
+
+        Returns:
+            tuple[list[Tensor]]: A tuple contains two elements.
+
+                - cls_pred_list (list[Tensor)]: Classification logits \
+                    for each decoder layer. Each is a 3D-tensor with shape \
+                    (batch_size, num_queries, cls_out_channels). \
+                    Note `cls_out_channels` should include background.
+                - mask_pred_list (list[Tensor]): Mask logits for each \
+                    decoder layer. Each with shape (batch_size, num_queries, \
+                    h, w).
+        """
+        mask_features, multi_scale_memorys = self.pixel_decoder(x)
+        bt, c_m, h_m, w_m = mask_features.shape
+        batch_size = bt // self.num_frames if self.training else 1
+        t = bt // batch_size
+        mask_features = mask_features.view(batch_size, t, c_m, h_m, w_m)
+        # multi_scale_memorys (from low resolution to high resolution)
+        decoder_inputs = []
+        decoder_positional_encodings = []
+        for i in range(self.num_transformer_feat_level):
+            decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i])
+            decoder_input = decoder_input.flatten(2)
+            level_embed = self.level_embed.weight[i][None, :, None]
+            decoder_input = decoder_input + level_embed
+            _, c, hw = decoder_input.shape
+            # shape (batch_size*t, c, h, w) ->
+            # (batch_size, t, c, hw) ->
+            # (batch_size, t*h*w, c)
+            decoder_input = decoder_input.view(batch_size, t, c,
+                                               hw).permute(0, 1, 3,
+                                                           2).flatten(1, 2)
+            # shape (batch_size, c, h, w) -> (h*w, batch_size, c)
+            mask = decoder_input.new_zeros(
+                (batch_size, t) + multi_scale_memorys[i].shape[-2:],
+                dtype=torch.bool)
+            decoder_positional_encoding = self.decoder_positional_encoding(
+                mask)
+            decoder_positional_encoding = decoder_positional_encoding.flatten(
+                3).permute(0, 1, 3, 2).flatten(1, 2)
+            decoder_inputs.append(decoder_input)
+            decoder_positional_encodings.append(decoder_positional_encoding)
+        # shape (num_queries, c) -> (batch_size, num_queries, c)
+        query_feat = self.query_feat.weight.unsqueeze(0).repeat(
+            (batch_size, 1, 1))
+        query_embed = self.query_embed.weight.unsqueeze(0).repeat(
+            (batch_size, 1, 1))
+
+        cls_pred_list = []
+        mask_pred_list = []
+        cls_pred, mask_pred, attn_mask = self._forward_head(
+            query_feat, mask_features, multi_scale_memorys[0].shape[-2:])
+        cls_pred_list.append(cls_pred)
+        mask_pred_list.append(mask_pred)
+
+        for i in range(self.num_transformer_decoder_layers):
+            level_idx = i % self.num_transformer_feat_level
+            # if a mask is all True(all background), then set it all False.
+            attn_mask[torch.where(
+                attn_mask.sum(-1) == attn_mask.shape[-1])] = False
+
+            # cross_attn + self_attn
+            layer = self.transformer_decoder.layers[i]
+            query_feat = layer(
+                query=query_feat,
+                key=decoder_inputs[level_idx],
+                value=decoder_inputs[level_idx],
+                query_pos=query_embed,
+                key_pos=decoder_positional_encodings[level_idx],
+                cross_attn_mask=attn_mask,
+                query_key_padding_mask=None,
+                # here we do not apply masking on padded region
+                key_padding_mask=None)
+            cls_pred, mask_pred, attn_mask = self._forward_head(
+                query_feat, mask_features, multi_scale_memorys[
+                    (i + 1) % self.num_transformer_feat_level].shape[-2:])
+
+            cls_pred_list.append(cls_pred)
+            mask_pred_list.append(mask_pred)
+
+        return cls_pred_list, mask_pred_list
+
+    def loss(
+        self,
+        x: Tuple[Tensor],
+        data_samples: TrackSampleList,
+    ) -> Dict[str, Tensor]:
+        """Perform forward propagation and loss calculation of the track head
+        on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the upstream
+                network, each is a 4D-tensor.
+            data_samples (List[:obj:`TrackDataSample`]): The Data
+                Samples. It usually includes information such as `gt_instance`.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        batch_img_metas = []
+        batch_gt_instances = []
+
+        for data_sample in data_samples:
+            video_img_metas = defaultdict(list)
+            for image_idx in range(len(data_sample)):
+                batch_gt_instances.append(data_sample[image_idx].gt_instances)
+                for key, value in data_sample[image_idx].metainfo.items():
+                    video_img_metas[key].append(value)
+            batch_img_metas.append(video_img_metas)
+
+        # forward
+        all_cls_scores, all_mask_preds = self(x, data_samples)
+
+        # preprocess ground truth
+        batch_gt_instances = self.preprocess_gt(batch_gt_instances)
+        # loss
+        losses = self.loss_by_feat(all_cls_scores, all_mask_preds,
+                                   batch_gt_instances, batch_img_metas)
+
+        return losses
+
+    def predict(self,
+                x: Tuple[Tensor],
+                data_samples: TrackDataSample,
+                rescale: bool = True) -> InstanceList:
+        """Test without augmentation.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            data_samples (List[:obj:`TrackDataSample`]): The Data
+                Samples. It usually includes information such as `gt_instance`.
+            rescale (bool, Optional): If False, then returned bboxes and masks
+                will fit the scale of img, otherwise, returned bboxes and masks
+                will fit the scale of original image shape. Defaults to True.
+
+        Returns:
+            list[obj:`InstanceData`]: each contains the following keys
+                - labels (Tensor): Prediction class indices\
+                    for an image, with shape (n, ), n is the sum of\
+                    number of stuff type and number of instance in an image.
+                - masks (Tensor): Prediction mask for a\
+                    image, with shape (n, t, h, w).
+        """
+
+        batch_img_metas = [
+            data_samples[img_idx].metainfo
+            for img_idx in range(len(data_samples))
+        ]
+        all_cls_scores, all_mask_preds = self(x, data_samples)
+        mask_cls_results = all_cls_scores[-1]
+        mask_pred_results = all_mask_preds[-1]
+
+        mask_cls_results = mask_cls_results[0]
+        # upsample masks
+        img_shape = batch_img_metas[0]['batch_input_shape']
+        mask_pred_results = F.interpolate(
+            mask_pred_results[0],
+            size=(img_shape[0], img_shape[1]),
+            mode='bilinear',
+            align_corners=False)
+
+        results = self.predict_by_feat(mask_cls_results, mask_pred_results,
+                                       batch_img_metas)
+        return results
+
+    def predict_by_feat(self,
+                        mask_cls_results: List[Tensor],
+                        mask_pred_results: List[Tensor],
+                        batch_img_metas: List[dict],
+                        rescale: bool = True) -> InstanceList:
+        """Get top-10 predictions.
+
+        Args:
+            mask_cls_results (Tensor): Mask classification logits,\
+                shape (batch_size, num_queries, cls_out_channels).
+                Note `cls_out_channels` should include background.
+            mask_pred_results (Tensor): Mask logits, shape \
+                (batch_size, num_queries, h, w).
+            batch_img_metas (list[dict]): List of image meta information.
+            rescale (bool, Optional): If False, then returned bboxes and masks
+                will fit the scale of img, otherwise, returned bboxes and masks
+                will fit the scale of original image shape. Defaults to True.
+
+        Returns:
+            list[obj:`InstanceData`]: each contains the following keys
+                - labels (Tensor): Prediction class indices\
+                    for an image, with shape (n, ), n is the sum of\
+                    number of stuff type and number of instance in an image.
+                - masks (Tensor): Prediction mask for a\
+                    image, with shape (n, t, h, w).
+        """
+        results = []
+        if len(mask_cls_results) > 0:
+            scores = F.softmax(mask_cls_results, dim=-1)[:, :-1]
+            labels = torch.arange(self.num_classes).unsqueeze(0).repeat(
+                self.num_queries, 1).flatten(0, 1).to(scores.device)
+            # keep top-10 predictions
+            scores_per_image, topk_indices = scores.flatten(0, 1).topk(
+                10, sorted=False)
+            labels_per_image = labels[topk_indices]
+            topk_indices = topk_indices // self.num_classes
+            mask_pred_results = mask_pred_results[topk_indices]
+
+            img_shape = batch_img_metas[0]['img_shape']
+            mask_pred_results = \
+                mask_pred_results[:, :, :img_shape[0], :img_shape[1]]
+            if rescale:
+                # return result in original resolution
+                ori_height, ori_width = batch_img_metas[0]['ori_shape'][:2]
+                mask_pred_results = F.interpolate(
+                    mask_pred_results,
+                    size=(ori_height, ori_width),
+                    mode='bilinear',
+                    align_corners=False)
+
+            masks = mask_pred_results > 0.
+
+            # format top-10 predictions
+            for img_idx in range(len(batch_img_metas)):
+                pred_track_instances = InstanceData()
+
+                pred_track_instances.masks = masks[:, img_idx]
+                pred_track_instances.bboxes = mask2bbox(masks[:, img_idx])
+                pred_track_instances.labels = labels_per_image
+                pred_track_instances.scores = scores_per_image
+                pred_track_instances.instances_id = torch.arange(10)
+
+                results.append(pred_track_instances)
+
+            return results
diff --git a/mmde/mmdet/models/tracking_heads/quasi_dense_embed_head.py b/mmde/mmdet/models/tracking_heads/quasi_dense_embed_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..55e3c05b7aba188608f7dd2fdda54e0759cee03c
--- /dev/null
+++ b/mmde/mmdet/models/tracking_heads/quasi_dense_embed_head.py
@@ -0,0 +1,347 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.task_modules import SamplingResult
+from mmdet.registry import MODELS
+from ..task_modules.tracking import embed_similarity
+
+
+@MODELS.register_module()
+class QuasiDenseEmbedHead(BaseModule):
+    """The quasi-dense roi embed head.
+
+    Args:
+        embed_channels (int): The input channel of embed features.
+            Defaults to 256.
+        softmax_temp (int): Softmax temperature. Defaults to -1.
+        loss_track (dict): The loss function for tracking. Defaults to
+            MultiPosCrossEntropyLoss.
+        loss_track_aux (dict): The auxiliary loss function for tracking.
+            Defaults to MarginL2Loss.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_convs: int = 0,
+                 num_fcs: int = 0,
+                 roi_feat_size: int = 7,
+                 in_channels: int = 256,
+                 conv_out_channels: int = 256,
+                 with_avg_pool: bool = False,
+                 fc_out_channels: int = 1024,
+                 conv_cfg: Optional[dict] = None,
+                 norm_cfg: Optional[dict] = None,
+                 embed_channels: int = 256,
+                 softmax_temp: int = -1,
+                 loss_track: Optional[dict] = None,
+                 loss_track_aux: dict = dict(
+                     type='MarginL2Loss',
+                     sample_ratio=3,
+                     margin=0.3,
+                     loss_weight=1.0,
+                     hard_mining=True),
+                 init_cfg: dict = dict(
+                     type='Xavier',
+                     layer='Linear',
+                     distribution='uniform',
+                     bias=0,
+                     override=dict(
+                         type='Normal',
+                         name='fc_embed',
+                         mean=0,
+                         std=0.01,
+                         bias=0))):
+        super(QuasiDenseEmbedHead, self).__init__(init_cfg=init_cfg)
+        self.num_convs = num_convs
+        self.num_fcs = num_fcs
+        self.roi_feat_size = _pair(roi_feat_size)
+        self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1]
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.with_avg_pool = with_avg_pool
+        self.fc_out_channels = fc_out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        if self.with_avg_pool:
+            self.avg_pool = nn.AvgPool2d(self.roi_feat_size)
+        # add convs and fcs
+        self.convs, self.fcs, self.last_layer_dim = self._add_conv_fc_branch(
+            self.num_convs, self.num_fcs, self.in_channels)
+        self.relu = nn.ReLU(inplace=True)
+
+        if loss_track is None:
+            loss_track = dict(
+                type='MultiPosCrossEntropyLoss', loss_weight=0.25)
+
+        self.fc_embed = nn.Linear(self.last_layer_dim, embed_channels)
+        self.softmax_temp = softmax_temp
+        self.loss_track = MODELS.build(loss_track)
+        if loss_track_aux is not None:
+            self.loss_track_aux = MODELS.build(loss_track_aux)
+        else:
+            self.loss_track_aux = None
+
+    def _add_conv_fc_branch(
+            self, num_branch_convs: int, num_branch_fcs: int,
+            in_channels: int) -> Tuple[nn.ModuleList, nn.ModuleList, int]:
+        """Add shared or separable branch. convs -> avg pool (optional) -> fcs.
+
+        Args:
+            num_branch_convs (int): The number of convoluational layers.
+            num_branch_fcs (int): The number of fully connection layers.
+            in_channels (int): The input channel of roi features.
+
+        Returns:
+            Tuple[nn.ModuleList, nn.ModuleList, int]: The convs, fcs and the
+                last layer dimension.
+        """
+        last_layer_dim = in_channels
+        # add branch specific conv layers
+        branch_convs = nn.ModuleList()
+        if num_branch_convs > 0:
+            for i in range(num_branch_convs):
+                conv_in_channels = (
+                    last_layer_dim if i == 0 else self.conv_out_channels)
+                branch_convs.append(
+                    ConvModule(
+                        conv_in_channels,
+                        self.conv_out_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+            last_layer_dim = self.conv_out_channels
+
+        # add branch specific fc layers
+        branch_fcs = nn.ModuleList()
+        if num_branch_fcs > 0:
+            if not self.with_avg_pool:
+                last_layer_dim *= self.roi_feat_area
+            for i in range(num_branch_fcs):
+                fc_in_channels = (
+                    last_layer_dim if i == 0 else self.fc_out_channels)
+                branch_fcs.append(
+                    nn.Linear(fc_in_channels, self.fc_out_channels))
+            last_layer_dim = self.fc_out_channels
+
+        return branch_convs, branch_fcs, last_layer_dim
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            x (Tensor): The input features from ROI head.
+
+        Returns:
+            Tensor: The embedding feature map.
+        """
+
+        if self.num_convs > 0:
+            for conv in self.convs:
+                x = conv(x)
+        x = x.flatten(1)
+        if self.num_fcs > 0:
+            for fc in self.fcs:
+                x = self.relu(fc(x))
+        x = self.fc_embed(x)
+        return x
+
+    def get_targets(
+            self, gt_match_indices: List[Tensor],
+            key_sampling_results: List[SamplingResult],
+            ref_sampling_results: List[SamplingResult]) -> Tuple[List, List]:
+        """Calculate the track targets and track weights for all samples in a
+        batch according to the sampling_results.
+
+        Args:
+            gt_match_indices (list(Tensor)): Mapping from gt_instance_ids to
+                ref_gt_instance_ids of the same tracklet in a pair of images.
+            key_sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            ref_sampling_results (List[obj:SamplingResult]): Assign results of
+                all reference images in a batch after sampling.
+
+        Returns:
+            Tuple[list[Tensor]]: Association results.
+            Containing the following list of Tensors:
+
+                - track_targets (list[Tensor]): The mapping instance ids from
+                    all positive proposals in the key image to all proposals
+                    in the reference image, each tensor in list has
+                    shape (len(key_pos_bboxes), len(ref_bboxes)).
+                - track_weights (list[Tensor]): Loss weights for all positive
+                    proposals in a batch, each tensor in list has
+                    shape (len(key_pos_bboxes),).
+        """
+
+        track_targets = []
+        track_weights = []
+        for _gt_match_indices, key_res, ref_res in zip(gt_match_indices,
+                                                       key_sampling_results,
+                                                       ref_sampling_results):
+            targets = _gt_match_indices.new_zeros(
+                (key_res.pos_bboxes.size(0), ref_res.bboxes.size(0)),
+                dtype=torch.int)
+            _match_indices = _gt_match_indices[key_res.pos_assigned_gt_inds]
+            pos2pos = (_match_indices.view(
+                -1, 1) == ref_res.pos_assigned_gt_inds.view(1, -1)).int()
+            targets[:, :pos2pos.size(1)] = pos2pos
+            weights = (targets.sum(dim=1) > 0).float()
+            track_targets.append(targets)
+            track_weights.append(weights)
+        return track_targets, track_weights
+
+    def match(
+        self, key_embeds: Tensor, ref_embeds: Tensor,
+        key_sampling_results: List[SamplingResult],
+        ref_sampling_results: List[SamplingResult]
+    ) -> Tuple[List[Tensor], List[Tensor]]:
+        """Calculate the dist matrixes for loss measurement.
+
+        Args:
+            key_embeds (Tensor): Embeds of positive bboxes in sampling results
+                of key image.
+            ref_embeds (Tensor): Embeds of all bboxes in sampling results
+                of the reference image.
+            key_sampling_results (List[obj:SamplingResults]): Assign results of
+                all images in a batch after sampling.
+            ref_sampling_results (List[obj:SamplingResults]): Assign results of
+                all reference images in a batch after sampling.
+
+        Returns:
+            Tuple[list[Tensor]]: Calculation results.
+            Containing the following list of Tensors:
+
+                - dists (list[Tensor]): Dot-product dists between
+                    key_embeds and ref_embeds, each tensor in list has
+                    shape (len(key_pos_bboxes), len(ref_bboxes)).
+                - cos_dists (list[Tensor]): Cosine dists between
+                    key_embeds and ref_embeds, each tensor in list has
+                    shape (len(key_pos_bboxes), len(ref_bboxes)).
+        """
+
+        num_key_rois = [res.pos_bboxes.size(0) for res in key_sampling_results]
+        key_embeds = torch.split(key_embeds, num_key_rois)
+        num_ref_rois = [res.bboxes.size(0) for res in ref_sampling_results]
+        ref_embeds = torch.split(ref_embeds, num_ref_rois)
+
+        dists, cos_dists = [], []
+        for key_embed, ref_embed in zip(key_embeds, ref_embeds):
+            dist = embed_similarity(
+                key_embed,
+                ref_embed,
+                method='dot_product',
+                temperature=self.softmax_temp)
+            dists.append(dist)
+            if self.loss_track_aux is not None:
+                cos_dist = embed_similarity(
+                    key_embed, ref_embed, method='cosine')
+                cos_dists.append(cos_dist)
+            else:
+                cos_dists.append(None)
+        return dists, cos_dists
+
+    def loss(self, key_roi_feats: Tensor, ref_roi_feats: Tensor,
+             key_sampling_results: List[SamplingResult],
+             ref_sampling_results: List[SamplingResult],
+             gt_match_indices_list: List[Tensor]) -> dict:
+        """Calculate the track loss and the auxiliary track loss.
+
+        Args:
+            key_roi_feats (Tensor): Embeds of positive bboxes in sampling
+                results of key image.
+            ref_roi_feats (Tensor): Embeds of all bboxes in sampling results
+                of the reference image.
+            key_sampling_results (List[obj:SamplingResults]): Assign results of
+                all images in a batch after sampling.
+            ref_sampling_results (List[obj:SamplingResults]): Assign results of
+                all reference images in a batch after sampling.
+            gt_match_indices_list (list(Tensor)): Mapping from gt_instances_ids
+                to ref_gt_instances_ids of the same tracklet in a pair of
+                images.
+
+        Returns:
+            Dict [str: Tensor]: Calculation results.
+            Containing the following list of Tensors:
+
+                - loss_track (Tensor): Results of loss_track function.
+                - loss_track_aux (Tensor): Results of loss_track_aux function.
+        """
+        key_track_feats = self(key_roi_feats)
+        ref_track_feats = self(ref_roi_feats)
+
+        losses = self.loss_by_feat(key_track_feats, ref_track_feats,
+                                   key_sampling_results, ref_sampling_results,
+                                   gt_match_indices_list)
+        return losses
+
+    def loss_by_feat(self, key_track_feats: Tensor, ref_track_feats: Tensor,
+                     key_sampling_results: List[SamplingResult],
+                     ref_sampling_results: List[SamplingResult],
+                     gt_match_indices_list: List[Tensor]) -> dict:
+        """Calculate the track loss and the auxiliary track loss.
+
+        Args:
+            key_track_feats (Tensor): Embeds of positive bboxes in sampling
+                results of key image.
+            ref_track_feats (Tensor): Embeds of all bboxes in sampling results
+                of the reference image.
+            key_sampling_results (List[obj:SamplingResults]): Assign results of
+                all images in a batch after sampling.
+            ref_sampling_results (List[obj:SamplingResults]): Assign results of
+                all reference images in a batch after sampling.
+            gt_match_indices_list (list(Tensor)): Mapping from instances_ids
+                from key image to reference image of the same tracklet in a
+                pair of images.
+
+        Returns:
+            Dict [str: Tensor]: Calculation results.
+            Containing the following list of Tensors:
+
+                - loss_track (Tensor): Results of loss_track function.
+                - loss_track_aux (Tensor): Results of loss_track_aux function.
+        """
+        dists, cos_dists = self.match(key_track_feats, ref_track_feats,
+                                      key_sampling_results,
+                                      ref_sampling_results)
+        targets, weights = self.get_targets(gt_match_indices_list,
+                                            key_sampling_results,
+                                            ref_sampling_results)
+        losses = dict()
+
+        loss_track = 0.
+        loss_track_aux = 0.
+        for _dists, _cos_dists, _targets, _weights in zip(
+                dists, cos_dists, targets, weights):
+            loss_track += self.loss_track(
+                _dists, _targets, _weights, avg_factor=_weights.sum())
+            if self.loss_track_aux is not None:
+                loss_track_aux += self.loss_track_aux(_cos_dists, _targets)
+        losses['loss_track'] = loss_track / len(dists)
+
+        if self.loss_track_aux is not None:
+            losses['loss_track_aux'] = loss_track_aux / len(dists)
+
+        return losses
+
+    def predict(self, bbox_feats: Tensor) -> Tensor:
+        """Perform forward propagation of the tracking head and predict
+        tracking results on the features of the upstream network.
+
+        Args:
+            bbox_feats: The extracted roi features.
+
+        Returns:
+            Tensor: The extracted track features.
+        """
+        track_feats = self(bbox_feats)
+        return track_feats
diff --git a/mmde/mmdet/models/tracking_heads/quasi_dense_track_head.py b/mmde/mmdet/models/tracking_heads/quasi_dense_track_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd078dac827e35c7514330870cf884001985156b
--- /dev/null
+++ b/mmde/mmdet/models/tracking_heads/quasi_dense_track_head.py
@@ -0,0 +1,178 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import TrackSampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import InstanceList
+
+
+@MODELS.register_module()
+class QuasiDenseTrackHead(BaseModule):
+    """The quasi-dense track head."""
+
+    def __init__(self,
+                 roi_extractor: Optional[dict] = None,
+                 embed_head: Optional[dict] = None,
+                 regress_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 **kwargs):
+        super().__init__(init_cfg=init_cfg)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        if embed_head is not None:
+            self.init_embed_head(roi_extractor, embed_head)
+
+        if regress_head is not None:
+            raise NotImplementedError('Regression head is not supported yet.')
+
+        self.init_assigner_sampler()
+
+    def init_embed_head(self, roi_extractor, embed_head) -> None:
+        """Initialize ``embed_head``
+
+        Args:
+            roi_extractor (dict, optional): Configuration of roi extractor.
+                Defaults to None.
+            embed_head (dict, optional): Configuration of embed head. Defaults
+                to None.
+        """
+        self.roi_extractor = MODELS.build(roi_extractor)
+        self.embed_head = MODELS.build(embed_head)
+
+    def init_assigner_sampler(self) -> None:
+        """Initialize assigner and sampler."""
+        self.bbox_assigner = None
+        self.bbox_sampler = None
+        if self.train_cfg:
+            self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
+            self.bbox_sampler = TASK_UTILS.build(
+                self.train_cfg.sampler, default_args=dict(context=self))
+
+    @property
+    def with_track(self) -> bool:
+        """bool: whether the multi-object tracker has an embed head"""
+        return hasattr(self, 'embed_head') and self.embed_head is not None
+
+    def extract_roi_feats(self, feats: List[Tensor],
+                          bboxes: List[Tensor]) -> Tensor:
+        """Extract roi features.
+
+        Args:
+            feats (list[Tensor]): list of multi-level image features.
+            bboxes (list[Tensor]): list of bboxes in sampling result.
+
+        Returns:
+            Tensor: The extracted roi features.
+        """
+        rois = bbox2roi(bboxes)
+        bbox_feats = self.roi_extractor(feats[:self.roi_extractor.num_inputs],
+                                        rois)
+        return bbox_feats
+
+    def loss(self, key_feats: List[Tensor], ref_feats: List[Tensor],
+             rpn_results_list: InstanceList,
+             ref_rpn_results_list: InstanceList, data_samples: TrackSampleList,
+             **kwargs) -> dict:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            key_feats (list[Tensor]): list of multi-level image features.
+            ref_feats (list[Tensor]): list of multi-level ref_img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals of key img.
+            ref_rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals of ref img.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        assert self.with_track
+        num_imgs = len(data_samples)
+        batch_gt_instances = []
+        ref_batch_gt_instances = []
+        batch_gt_instances_ignore = []
+        gt_match_indices_list = []
+        for track_data_sample in data_samples:
+            key_data_sample = track_data_sample.get_key_frames()[0]
+            ref_data_sample = track_data_sample.get_ref_frames()[0]
+            batch_gt_instances.append(key_data_sample.gt_instances)
+            ref_batch_gt_instances.append(ref_data_sample.gt_instances)
+            if 'ignored_instances' in key_data_sample:
+                batch_gt_instances_ignore.append(
+                    key_data_sample.ignored_instances)
+            else:
+                batch_gt_instances_ignore.append(None)
+            # get gt_match_indices
+            ins_ids = key_data_sample.gt_instances.instances_ids.tolist()
+            ref_ins_ids = ref_data_sample.gt_instances.instances_ids.tolist()
+            match_indices = Tensor([
+                ref_ins_ids.index(i) if (i in ref_ins_ids and i > 0) else -1
+                for i in ins_ids
+            ]).to(key_feats[0].device)
+            gt_match_indices_list.append(match_indices)
+
+        key_sampling_results, ref_sampling_results = [], []
+        for i in range(num_imgs):
+            rpn_results = rpn_results_list[i]
+            ref_rpn_results = ref_rpn_results_list[i]
+            # rename ref_rpn_results.bboxes to ref_rpn_results.priors
+            ref_rpn_results.priors = ref_rpn_results.pop('bboxes')
+
+            assign_result = self.bbox_assigner.assign(
+                rpn_results, batch_gt_instances[i],
+                batch_gt_instances_ignore[i])
+            sampling_result = self.bbox_sampler.sample(
+                assign_result,
+                rpn_results,
+                batch_gt_instances[i],
+                feats=[lvl_feat[i][None] for lvl_feat in key_feats])
+            key_sampling_results.append(sampling_result)
+
+            ref_assign_result = self.bbox_assigner.assign(
+                ref_rpn_results, ref_batch_gt_instances[i],
+                batch_gt_instances_ignore[i])
+            ref_sampling_result = self.bbox_sampler.sample(
+                ref_assign_result,
+                ref_rpn_results,
+                ref_batch_gt_instances[i],
+                feats=[lvl_feat[i][None] for lvl_feat in ref_feats])
+            ref_sampling_results.append(ref_sampling_result)
+
+        key_bboxes = [res.pos_bboxes for res in key_sampling_results]
+        key_roi_feats = self.extract_roi_feats(key_feats, key_bboxes)
+        ref_bboxes = [res.bboxes for res in ref_sampling_results]
+        ref_roi_feats = self.extract_roi_feats(ref_feats, ref_bboxes)
+
+        loss_track = self.embed_head.loss(key_roi_feats, ref_roi_feats,
+                                          key_sampling_results,
+                                          ref_sampling_results,
+                                          gt_match_indices_list)
+
+        return loss_track
+
+    def predict(self, feats: List[Tensor],
+                rescaled_bboxes: List[Tensor]) -> Tensor:
+        """Perform forward propagation of the tracking head and predict
+        tracking results on the features of the upstream network.
+
+        Args:
+            feats (list[Tensor]): Multi level feature maps of `img`.
+            rescaled_bboxes (list[Tensor]): list of rescaled bboxes in sampling
+                result.
+
+        Returns:
+            Tensor: The extracted track features.
+        """
+        bbox_feats = self.extract_roi_feats(feats, rescaled_bboxes)
+        track_feats = self.embed_head.predict(bbox_feats)
+        return track_feats
diff --git a/mmde/mmdet/models/tracking_heads/roi_embed_head.py b/mmde/mmdet/models/tracking_heads/roi_embed_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..e18b81fbe52e109e7afb3e6d5e8e6624ef48242f
--- /dev/null
+++ b/mmde/mmdet/models/tracking_heads/roi_embed_head.py
@@ -0,0 +1,391 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import defaultdict
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.losses import accuracy
+from mmdet.models.task_modules import SamplingResult
+from mmdet.models.task_modules.tracking import embed_similarity
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class RoIEmbedHead(BaseModule):
+    """The roi embed head.
+
+    This module is used in multi-object tracking methods, such as MaskTrack
+    R-CNN.
+
+    Args:
+        num_convs (int): The number of convoluational layers to embed roi
+            features. Defaults to 0.
+        num_fcs (int): The number of fully connection layers to embed roi
+            features. Defaults to 0.
+        roi_feat_size (int|tuple(int)): The spatial size of roi features.
+            Defaults to 7.
+        in_channels (int): The input channel of roi features. Defaults to 256.
+        conv_out_channels (int): The output channel of roi features after
+            forwarding convoluational layers. Defaults to 256.
+        with_avg_pool (bool): Whether use average pooling before passing roi
+            features into fully connection layers. Defaults to False.
+        fc_out_channels (int): The output channel of roi features after
+            forwarding fully connection layers. Defaults to 1024.
+        conv_cfg (dict): Config dict for convolution layer. Defaults to None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer. Defaults to None.
+        loss_match (dict): The loss function. Defaults to
+            dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)
+        init_cfg (dict): Configuration of initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_convs: int = 0,
+                 num_fcs: int = 0,
+                 roi_feat_size: int = 7,
+                 in_channels: int = 256,
+                 conv_out_channels: int = 256,
+                 with_avg_pool: bool = False,
+                 fc_out_channels: int = 1024,
+                 conv_cfg: Optional[dict] = None,
+                 norm_cfg: Optional[dict] = None,
+                 loss_match: dict = dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 init_cfg: Optional[dict] = None,
+                 **kwargs):
+        super(RoIEmbedHead, self).__init__(init_cfg=init_cfg)
+        self.num_convs = num_convs
+        self.num_fcs = num_fcs
+        self.roi_feat_size = _pair(roi_feat_size)
+        self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1]
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.with_avg_pool = with_avg_pool
+        self.fc_out_channels = fc_out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.loss_match = MODELS.build(loss_match)
+        self.fp16_enabled = False
+
+        if self.with_avg_pool:
+            self.avg_pool = nn.AvgPool2d(self.roi_feat_size)
+        # add convs and fcs
+        self.convs, self.fcs, self.last_layer_dim = self._add_conv_fc_branch(
+            self.num_convs, self.num_fcs, self.in_channels)
+        self.relu = nn.ReLU(inplace=True)
+
+    def _add_conv_fc_branch(
+            self, num_branch_convs: int, num_branch_fcs: int,
+            in_channels: int) -> Tuple[nn.ModuleList, nn.ModuleList, int]:
+        """Add shared or separable branch.
+
+        convs -> avg pool (optional) -> fcs
+        """
+        last_layer_dim = in_channels
+        # add branch specific conv layers
+        branch_convs = nn.ModuleList()
+        if num_branch_convs > 0:
+            for i in range(num_branch_convs):
+                conv_in_channels = (
+                    last_layer_dim if i == 0 else self.conv_out_channels)
+                branch_convs.append(
+                    ConvModule(
+                        conv_in_channels,
+                        self.conv_out_channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+            last_layer_dim = self.conv_out_channels
+
+        # add branch specific fc layers
+        branch_fcs = nn.ModuleList()
+        if num_branch_fcs > 0:
+            if not self.with_avg_pool:
+                last_layer_dim *= self.roi_feat_area
+            for i in range(num_branch_fcs):
+                fc_in_channels = (
+                    last_layer_dim if i == 0 else self.fc_out_channels)
+                branch_fcs.append(
+                    nn.Linear(fc_in_channels, self.fc_out_channels))
+            last_layer_dim = self.fc_out_channels
+
+        return branch_convs, branch_fcs, last_layer_dim
+
+    @property
+    def custom_activation(self):
+        return getattr(self.loss_match, 'custom_activation', False)
+
+    def extract_feat(self, x: Tensor,
+                     num_x_per_img: List[int]) -> Tuple[Tensor]:
+        """Extract feature from the input `x`, and split the output to a list.
+
+        Args:
+            x (Tensor): of shape [N, C, H, W]. N is the number of proposals.
+            num_x_per_img (list[int]): The `x` contains proposals of
+                multi-images. `num_x_per_img` denotes the number of proposals
+                for each image.
+
+        Returns:
+            list[Tensor]: Each Tensor denotes the embed features belonging to
+            an image in a batch.
+        """
+        if self.num_convs > 0:
+            for conv in self.convs:
+                x = conv(x)
+
+        if self.num_fcs > 0:
+            if self.with_avg_pool:
+                x = self.avg_pool(x)
+            x = x.flatten(1)
+            for fc in self.fcs:
+                x = self.relu(fc(x))
+        else:
+            x = x.flatten(1)
+
+        x_split = torch.split(x, num_x_per_img, dim=0)
+        return x_split
+
+    def forward(
+            self, x: Tensor, ref_x: Tensor, num_x_per_img: List[int],
+            num_x_per_ref_img: List[int]
+    ) -> Tuple[Tuple[Tensor], Tuple[Tensor]]:
+        """Computing the similarity scores between `x` and `ref_x`.
+
+        Args:
+            x (Tensor): of shape [N, C, H, W]. N is the number of key frame
+                proposals.
+            ref_x (Tensor): of shape [M, C, H, W]. M is the number of reference
+                frame proposals.
+            num_x_per_img (list[int]): The `x` contains proposals of
+                multi-images. `num_x_per_img` denotes the number of proposals
+                for each key image.
+            num_x_per_ref_img (list[int]): The `ref_x` contains proposals of
+                multi-images. `num_x_per_ref_img` denotes the number of
+                proposals for each reference image.
+
+        Returns:
+            tuple[tuple[Tensor], tuple[Tensor]]: Each tuple of tensor denotes
+            the embed features belonging to an image in a batch.
+        """
+        x_split = self.extract_feat(x, num_x_per_img)
+        ref_x_split = self.extract_feat(ref_x, num_x_per_ref_img)
+
+        return x_split, ref_x_split
+
+    def get_targets(self, sampling_results: List[SamplingResult],
+                    gt_instance_ids: List[Tensor],
+                    ref_gt_instance_ids: List[Tensor]) -> Tuple[List, List]:
+        """Calculate the ground truth for all samples in a batch according to
+        the sampling_results.
+
+        Args:
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of
+                all images in a batch, each tensor has shape (num_gt, ).
+            ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes
+                of all reference images in a batch, each tensor has shape
+                (num_gt, ).
+
+        Returns:
+            Tuple[list[Tensor]]: Ground truth for proposals in a batch.
+            Containing the following list of Tensors:
+
+                - track_id_targets (list[Tensor]): The instance ids of
+                  Gt_labels for all proposals in a batch, each tensor in list
+                  has shape (num_proposals,).
+                - track_id_weights (list[Tensor]): Labels_weights for
+                  all proposals in a batch, each tensor in list has
+                  shape (num_proposals,).
+        """
+        track_id_targets = []
+        track_id_weights = []
+
+        for res, gt_instance_id, ref_gt_instance_id in zip(
+                sampling_results, gt_instance_ids, ref_gt_instance_ids):
+            pos_instance_ids = gt_instance_id[res.pos_assigned_gt_inds]
+            pos_match_id = gt_instance_id.new_zeros(len(pos_instance_ids))
+            for i, id in enumerate(pos_instance_ids):
+                if id in ref_gt_instance_id:
+                    pos_match_id[i] = ref_gt_instance_id.tolist().index(id) + 1
+
+            track_id_target = gt_instance_id.new_zeros(
+                len(res.bboxes), dtype=torch.int64)
+            track_id_target[:len(res.pos_bboxes)] = pos_match_id
+            track_id_weight = res.bboxes.new_zeros(len(res.bboxes))
+            track_id_weight[:len(res.pos_bboxes)] = 1.0
+
+            track_id_targets.append(track_id_target)
+            track_id_weights.append(track_id_weight)
+
+        return track_id_targets, track_id_weights
+
+    def loss(
+        self,
+        bbox_feats: Tensor,
+        ref_bbox_feats: Tensor,
+        num_bbox_per_img: int,
+        num_bbox_per_ref_img: int,
+        sampling_results: List[SamplingResult],
+        gt_instance_ids: List[Tensor],
+        ref_gt_instance_ids: List[Tensor],
+        reduction_override: Optional[str] = None,
+    ) -> dict:
+        """Calculate the loss in a batch.
+
+        Args:
+            bbox_feats (Tensor): of shape [N, C, H, W]. N is the number of
+                bboxes.
+            ref_bbox_feats (Tensor): of shape [M, C, H, W]. M is the number of
+                reference bboxes.
+            num_bbox_per_img (list[int]): The `bbox_feats` contains proposals
+                of multi-images. `num_bbox_per_img` denotes the number of
+                proposals for each key image.
+            num_bbox_per_ref_img (list[int]): The `ref_bbox_feats` contains
+                proposals of multi-images. `num_bbox_per_ref_img` denotes the
+                number of proposals for each reference image.
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of
+                all images in a batch, each tensor has shape (num_gt, ).
+            ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes
+                of all reference images in a batch, each tensor has shape
+                (num_gt, ).
+            reduction_override (str, optional): The method used to reduce the
+                loss. Options are "none", "mean" and "sum".
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components.
+        """
+        x_split, ref_x_split = self(bbox_feats, ref_bbox_feats,
+                                    num_bbox_per_img, num_bbox_per_ref_img)
+
+        losses = self.loss_by_feat(x_split, ref_x_split, sampling_results,
+                                   gt_instance_ids, ref_gt_instance_ids,
+                                   reduction_override)
+        return losses
+
+    def loss_by_feat(self,
+                     x_split: Tuple[Tensor],
+                     ref_x_split: Tuple[Tensor],
+                     sampling_results: List[SamplingResult],
+                     gt_instance_ids: List[Tensor],
+                     ref_gt_instance_ids: List[Tensor],
+                     reduction_override: Optional[str] = None) -> dict:
+        """Calculate losses.
+
+        Args:
+            x_split (Tensor): The embed features belonging to key image.
+            ref_x_split (Tensor): The embed features belonging to ref image.
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of
+                all images in a batch, each tensor has shape (num_gt, ).
+            ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes
+                of all reference images in a batch, each tensor has shape
+                (num_gt, ).
+            reduction_override (str, optional): The method used to reduce the
+                loss. Options are "none", "mean" and "sum".
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components.
+        """
+        track_id_targets, track_id_weights = self.get_targets(
+            sampling_results, gt_instance_ids, ref_gt_instance_ids)
+        assert isinstance(track_id_targets, list)
+        assert isinstance(track_id_weights, list)
+        assert len(track_id_weights) == len(track_id_targets)
+
+        losses = defaultdict(list)
+        similarity_logits = []
+        for one_x, one_ref_x in zip(x_split, ref_x_split):
+            similarity_logit = embed_similarity(
+                one_x, one_ref_x, method='dot_product')
+            dummy = similarity_logit.new_zeros(one_x.shape[0], 1)
+            similarity_logit = torch.cat((dummy, similarity_logit), dim=1)
+            similarity_logits.append(similarity_logit)
+        assert isinstance(similarity_logits, list)
+        assert len(similarity_logits) == len(track_id_targets)
+
+        for similarity_logit, track_id_target, track_id_weight in zip(
+                similarity_logits, track_id_targets, track_id_weights):
+            avg_factor = max(torch.sum(track_id_target > 0).float().item(), 1.)
+            if similarity_logit.numel() > 0:
+                loss_match = self.loss_match(
+                    similarity_logit,
+                    track_id_target,
+                    track_id_weight,
+                    avg_factor=avg_factor,
+                    reduction_override=reduction_override)
+                if isinstance(loss_match, dict):
+                    for key, value in loss_match.items():
+                        losses[key].append(value)
+                else:
+                    losses['loss_match'].append(loss_match)
+
+                valid_index = track_id_weight > 0
+                valid_similarity_logit = similarity_logit[valid_index]
+                valid_track_id_target = track_id_target[valid_index]
+                if self.custom_activation:
+                    match_accuracy = self.loss_match.get_accuracy(
+                        valid_similarity_logit, valid_track_id_target)
+                    for key, value in match_accuracy.items():
+                        losses[key].append(value)
+                else:
+                    losses['match_accuracy'].append(
+                        accuracy(valid_similarity_logit,
+                                 valid_track_id_target))
+
+        for key, value in losses.items():
+            losses[key] = sum(losses[key]) / len(similarity_logits)
+        return losses
+
+    def predict(self, roi_feats: Tensor,
+                prev_roi_feats: Tensor) -> List[Tensor]:
+        """Perform forward propagation of the tracking head and predict
+        tracking results on the features of the upstream network.
+
+        Args:
+            roi_feats (Tensor): Feature map of current images rois.
+            prev_roi_feats (Tensor): Feature map of previous images rois.
+
+        Returns:
+            list[Tensor]: The predicted similarity_logits of each pair of key
+            image and reference image.
+        """
+        x_split, ref_x_split = self(roi_feats, prev_roi_feats,
+                                    [roi_feats.shape[0]],
+                                    [prev_roi_feats.shape[0]])
+
+        similarity_logits = self.predict_by_feat(x_split, ref_x_split)
+
+        return similarity_logits
+
+    def predict_by_feat(self, x_split: Tuple[Tensor],
+                        ref_x_split: Tuple[Tensor]) -> List[Tensor]:
+        """Get similarity_logits.
+
+        Args:
+            x_split (Tensor): The embed features belonging to key image.
+            ref_x_split (Tensor): The embed features belonging to ref image.
+
+        Returns:
+            list[Tensor]: The predicted similarity_logits of each pair of key
+            image and reference image.
+        """
+        similarity_logits = []
+        for one_x, one_ref_x in zip(x_split, ref_x_split):
+            similarity_logit = embed_similarity(
+                one_x, one_ref_x, method='dot_product')
+            dummy = similarity_logit.new_zeros(one_x.shape[0], 1)
+            similarity_logit = torch.cat((dummy, similarity_logit), dim=1)
+            similarity_logits.append(similarity_logit)
+        return similarity_logits
diff --git a/mmde/mmdet/models/tracking_heads/roi_track_head.py b/mmde/mmdet/models/tracking_heads/roi_track_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c51c810022cc856411e1de83278e38fdc2b670c8
--- /dev/null
+++ b/mmde/mmdet/models/tracking_heads/roi_track_head.py
@@ -0,0 +1,178 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta
+from typing import List, Optional, Tuple
+
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import TrackSampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import InstanceList
+
+
+@MODELS.register_module()
+class RoITrackHead(BaseModule, metaclass=ABCMeta):
+    """The roi track head.
+
+    This module is used in multi-object tracking methods, such as MaskTrack
+    R-CNN.
+
+    Args:
+        roi_extractor (dict): Configuration of roi extractor. Defaults to None.
+        embed_head (dict): Configuration of embed head. Defaults to None.
+        train_cfg (dict): Configuration when training. Defaults to None.
+        test_cfg (dict): Configuration when testing. Defaults to None.
+        init_cfg (dict): Configuration of initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 roi_extractor: Optional[dict] = None,
+                 embed_head: Optional[dict] = None,
+                 regress_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 *args,
+                 **kwargs):
+        super().__init__(init_cfg=init_cfg)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        if embed_head is not None:
+            self.init_embed_head(roi_extractor, embed_head)
+
+        if regress_head is not None:
+            raise NotImplementedError('Regression head is not supported yet.')
+
+        self.init_assigner_sampler()
+
+    def init_embed_head(self, roi_extractor, embed_head) -> None:
+        """Initialize ``embed_head``"""
+        self.roi_extractor = MODELS.build(roi_extractor)
+        self.embed_head = MODELS.build(embed_head)
+
+    def init_assigner_sampler(self) -> None:
+        """Initialize assigner and sampler."""
+        self.bbox_assigner = None
+        self.bbox_sampler = None
+        if self.train_cfg:
+            self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
+            self.bbox_sampler = TASK_UTILS.build(
+                self.train_cfg.sampler, default_args=dict(context=self))
+
+    @property
+    def with_track(self) -> bool:
+        """bool: whether the multi-object tracker has an embed head"""
+        return hasattr(self, 'embed_head') and self.embed_head is not None
+
+    def extract_roi_feats(
+            self, feats: List[Tensor],
+            bboxes: List[Tensor]) -> Tuple[Tuple[Tensor], List[int]]:
+        """Extract roi features.
+
+        Args:
+            feats (list[Tensor]): list of multi-level image features.
+            bboxes (list[Tensor]): list of bboxes in sampling result.
+
+        Returns:
+            tuple[tuple[Tensor], list[int]]: The extracted roi features and
+            the number of bboxes in each image.
+        """
+        rois = bbox2roi(bboxes)
+        bbox_feats = self.roi_extractor(feats[:self.roi_extractor.num_inputs],
+                                        rois)
+        num_bbox_per_img = [len(bbox) for bbox in bboxes]
+        return bbox_feats, num_bbox_per_img
+
+    def loss(self, key_feats: List[Tensor], ref_feats: List[Tensor],
+             rpn_results_list: InstanceList, data_samples: TrackSampleList,
+             **kwargs) -> dict:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            key_feats (list[Tensor]): list of multi-level image features.
+            ref_feats (list[Tensor]): list of multi-level ref_img features.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        assert self.with_track
+        batch_gt_instances = []
+        ref_batch_gt_instances = []
+        batch_gt_instances_ignore = []
+        gt_instance_ids = []
+        ref_gt_instance_ids = []
+        for track_data_sample in data_samples:
+            key_data_sample = track_data_sample.get_key_frames()[0]
+            ref_data_sample = track_data_sample.get_ref_frames()[0]
+            batch_gt_instances.append(key_data_sample.gt_instances)
+            ref_batch_gt_instances.append(ref_data_sample.gt_instances)
+            if 'ignored_instances' in key_data_sample:
+                batch_gt_instances_ignore.append(
+                    key_data_sample.ignored_instances)
+            else:
+                batch_gt_instances_ignore.append(None)
+
+            gt_instance_ids.append(key_data_sample.gt_instances.instances_ids)
+            ref_gt_instance_ids.append(
+                ref_data_sample.gt_instances.instances_ids)
+
+        losses = dict()
+        num_imgs = len(data_samples)
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+        sampling_results = []
+        for i in range(num_imgs):
+            rpn_results = rpn_results_list[i]
+
+            assign_result = self.bbox_assigner.assign(
+                rpn_results, batch_gt_instances[i],
+                batch_gt_instances_ignore[i])
+            sampling_result = self.bbox_sampler.sample(
+                assign_result,
+                rpn_results,
+                batch_gt_instances[i],
+                feats=[lvl_feat[i][None] for lvl_feat in key_feats])
+            sampling_results.append(sampling_result)
+
+        bboxes = [res.bboxes for res in sampling_results]
+        bbox_feats, num_bbox_per_img = self.extract_roi_feats(
+            key_feats, bboxes)
+
+        # batch_size is 1
+        ref_gt_bboxes = [
+            ref_batch_gt_instance.bboxes
+            for ref_batch_gt_instance in ref_batch_gt_instances
+        ]
+        ref_bbox_feats, num_bbox_per_ref_img = self.extract_roi_feats(
+            ref_feats, ref_gt_bboxes)
+
+        loss_track = self.embed_head.loss(bbox_feats, ref_bbox_feats,
+                                          num_bbox_per_img,
+                                          num_bbox_per_ref_img,
+                                          sampling_results, gt_instance_ids,
+                                          ref_gt_instance_ids)
+        losses.update(loss_track)
+
+        return losses
+
+    def predict(self, roi_feats: Tensor,
+                prev_roi_feats: Tensor) -> List[Tensor]:
+        """Perform forward propagation of the tracking head and predict
+        tracking results on the features of the upstream network.
+
+        Args:
+            roi_feats (Tensor): Feature map of current images rois.
+            prev_roi_feats (Tensor): Feature map of previous images rois.
+
+        Returns:
+            list[Tensor]: The predicted similarity_logits of each pair of key
+            image and reference image.
+        """
+        return self.embed_head.predict(roi_feats, prev_roi_feats)[0]
diff --git a/mmde/mmdet/models/utils/__init__.py b/mmde/mmdet/models/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00d9a37f33169dc1c523c68db55f823dd0424fa
--- /dev/null
+++ b/mmde/mmdet/models/utils/__init__.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .gaussian_target import (gather_feat, gaussian_radius,
+                              gen_gaussian_target, get_local_maximum,
+                              get_topk_from_heatmap, transpose_and_gather_feat)
+from .image import imrenormalize
+from .make_divisible import make_divisible
+# Disable yapf because it conflicts with isort.
+# yapf: disable
+from .misc import (align_tensor, aligned_bilinear, center_of_mass,
+                   empty_instances, filter_gt_instances,
+                   filter_scores_and_topk, flip_tensor, generate_coordinate,
+                   images_to_levels, interpolate_as, levels_to_images,
+                   mask2ndarray, multi_apply, relative_coordinate_maps,
+                   rename_loss_dict, reweight_loss_dict,
+                   samplelist_boxtype2tensor, select_single_mlvl,
+                   sigmoid_geometric_mean, unfold_wo_center, unmap,
+                   unpack_gt_instances)
+from .panoptic_gt_processing import preprocess_panoptic_gt
+from .point_sample import (get_uncertain_point_coords_with_randomness,
+                           get_uncertainty)
+from .vlfuse_helper import BertEncoderLayer, VLFuse, permute_and_flatten
+from .wbf import weighted_boxes_fusion
+
+__all__ = [
+    'gaussian_radius', 'gen_gaussian_target', 'make_divisible',
+    'get_local_maximum', 'get_topk_from_heatmap', 'transpose_and_gather_feat',
+    'interpolate_as', 'sigmoid_geometric_mean', 'gather_feat',
+    'preprocess_panoptic_gt', 'get_uncertain_point_coords_with_randomness',
+    'get_uncertainty', 'unpack_gt_instances', 'empty_instances',
+    'center_of_mass', 'filter_scores_and_topk', 'flip_tensor',
+    'generate_coordinate', 'levels_to_images', 'mask2ndarray', 'multi_apply',
+    'select_single_mlvl', 'unmap', 'images_to_levels',
+    'samplelist_boxtype2tensor', 'filter_gt_instances', 'rename_loss_dict',
+    'reweight_loss_dict', 'relative_coordinate_maps', 'aligned_bilinear',
+    'unfold_wo_center', 'imrenormalize', 'VLFuse', 'permute_and_flatten',
+    'BertEncoderLayer', 'align_tensor', 'weighted_boxes_fusion'
+]
diff --git a/mmde/mmdet/models/utils/gaussian_target.py b/mmde/mmdet/models/utils/gaussian_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bf4d558ce05c4f953e1c3fcf75016e5874afce1
--- /dev/null
+++ b/mmde/mmdet/models/utils/gaussian_target.py
@@ -0,0 +1,268 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from math import sqrt
+
+import torch
+import torch.nn.functional as F
+
+
+def gaussian2D(radius, sigma=1, dtype=torch.float32, device='cpu'):
+    """Generate 2D gaussian kernel.
+
+    Args:
+        radius (int): Radius of gaussian kernel.
+        sigma (int): Sigma of gaussian function. Default: 1.
+        dtype (torch.dtype): Dtype of gaussian tensor. Default: torch.float32.
+        device (str): Device of gaussian tensor. Default: 'cpu'.
+
+    Returns:
+        h (Tensor): Gaussian kernel with a
+            ``(2 * radius + 1) * (2 * radius + 1)`` shape.
+    """
+    x = torch.arange(
+        -radius, radius + 1, dtype=dtype, device=device).view(1, -1)
+    y = torch.arange(
+        -radius, radius + 1, dtype=dtype, device=device).view(-1, 1)
+
+    h = (-(x * x + y * y) / (2 * sigma * sigma)).exp()
+
+    h[h < torch.finfo(h.dtype).eps * h.max()] = 0
+    return h
+
+
+def gen_gaussian_target(heatmap, center, radius, k=1):
+    """Generate 2D gaussian heatmap.
+
+    Args:
+        heatmap (Tensor): Input heatmap, the gaussian kernel will cover on
+            it and maintain the max value.
+        center (list[int]): Coord of gaussian kernel's center.
+        radius (int): Radius of gaussian kernel.
+        k (int): Coefficient of gaussian kernel. Default: 1.
+
+    Returns:
+        out_heatmap (Tensor): Updated heatmap covered by gaussian kernel.
+    """
+    diameter = 2 * radius + 1
+    gaussian_kernel = gaussian2D(
+        radius, sigma=diameter / 6, dtype=heatmap.dtype, device=heatmap.device)
+
+    x, y = center
+
+    height, width = heatmap.shape[:2]
+
+    left, right = min(x, radius), min(width - x, radius + 1)
+    top, bottom = min(y, radius), min(height - y, radius + 1)
+
+    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+    masked_gaussian = gaussian_kernel[radius - top:radius + bottom,
+                                      radius - left:radius + right]
+    out_heatmap = heatmap
+    torch.max(
+        masked_heatmap,
+        masked_gaussian * k,
+        out=out_heatmap[y - top:y + bottom, x - left:x + right])
+
+    return out_heatmap
+
+
+def gaussian_radius(det_size, min_overlap):
+    r"""Generate 2D gaussian radius.
+
+    This function is modified from the `official github repo
+    <https://github.com/princeton-vl/CornerNet-Lite/blob/master/core/sample/
+    utils.py#L65>`_.
+
+    Given ``min_overlap``, radius could computed by a quadratic equation
+    according to Vieta's formulas.
+
+    There are 3 cases for computing gaussian radius, details are following:
+
+    - Explanation of figure: ``lt`` and ``br`` indicates the left-top and
+      bottom-right corner of ground truth box. ``x`` indicates the
+      generated corner at the limited position when ``radius=r``.
+
+    - Case1: one corner is inside the gt box and the other is outside.
+
+    .. code:: text
+
+        |<   width   >|
+
+        lt-+----------+         -
+        |  |          |         ^
+        +--x----------+--+
+        |  |          |  |
+        |  |          |  |    height
+        |  | overlap  |  |
+        |  |          |  |
+        |  |          |  |      v
+        +--+---------br--+      -
+           |          |  |
+           +----------+--x
+
+    To ensure IoU of generated box and gt box is larger than ``min_overlap``:
+
+    .. math::
+        \cfrac{(w-r)*(h-r)}{w*h+(w+h)r-r^2} \ge {iou} \quad\Rightarrow\quad
+        {r^2-(w+h)r+\cfrac{1-iou}{1+iou}*w*h} \ge 0 \\
+        {a} = 1,\quad{b} = {-(w+h)},\quad{c} = {\cfrac{1-iou}{1+iou}*w*h}
+        {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a}
+
+    - Case2: both two corners are inside the gt box.
+
+    .. code:: text
+
+        |<   width   >|
+
+        lt-+----------+         -
+        |  |          |         ^
+        +--x-------+  |
+        |  |       |  |
+        |  |overlap|  |       height
+        |  |       |  |
+        |  +-------x--+
+        |          |  |         v
+        +----------+-br         -
+
+    To ensure IoU of generated box and gt box is larger than ``min_overlap``:
+
+    .. math::
+        \cfrac{(w-2*r)*(h-2*r)}{w*h} \ge {iou} \quad\Rightarrow\quad
+        {4r^2-2(w+h)r+(1-iou)*w*h} \ge 0 \\
+        {a} = 4,\quad {b} = {-2(w+h)},\quad {c} = {(1-iou)*w*h}
+        {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a}
+
+    - Case3: both two corners are outside the gt box.
+
+    .. code:: text
+
+           |<   width   >|
+
+        x--+----------------+
+        |  |                |
+        +-lt-------------+  |   -
+        |  |             |  |   ^
+        |  |             |  |
+        |  |   overlap   |  | height
+        |  |             |  |
+        |  |             |  |   v
+        |  +------------br--+   -
+        |                |  |
+        +----------------+--x
+
+    To ensure IoU of generated box and gt box is larger than ``min_overlap``:
+
+    .. math::
+        \cfrac{w*h}{(w+2*r)*(h+2*r)} \ge {iou} \quad\Rightarrow\quad
+        {4*iou*r^2+2*iou*(w+h)r+(iou-1)*w*h} \le 0 \\
+        {a} = {4*iou},\quad {b} = {2*iou*(w+h)},\quad {c} = {(iou-1)*w*h} \\
+        {r} \le \cfrac{-b+\sqrt{b^2-4*a*c}}{2*a}
+
+    Args:
+        det_size (list[int]): Shape of object.
+        min_overlap (float): Min IoU with ground truth for boxes generated by
+            keypoints inside the gaussian kernel.
+
+    Returns:
+        radius (int): Radius of gaussian kernel.
+    """
+    height, width = det_size
+
+    a1 = 1
+    b1 = (height + width)
+    c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
+    sq1 = sqrt(b1**2 - 4 * a1 * c1)
+    r1 = (b1 - sq1) / (2 * a1)
+
+    a2 = 4
+    b2 = 2 * (height + width)
+    c2 = (1 - min_overlap) * width * height
+    sq2 = sqrt(b2**2 - 4 * a2 * c2)
+    r2 = (b2 - sq2) / (2 * a2)
+
+    a3 = 4 * min_overlap
+    b3 = -2 * min_overlap * (height + width)
+    c3 = (min_overlap - 1) * width * height
+    sq3 = sqrt(b3**2 - 4 * a3 * c3)
+    r3 = (b3 + sq3) / (2 * a3)
+    return min(r1, r2, r3)
+
+
+def get_local_maximum(heat, kernel=3):
+    """Extract local maximum pixel with given kernel.
+
+    Args:
+        heat (Tensor): Target heatmap.
+        kernel (int): Kernel size of max pooling. Default: 3.
+
+    Returns:
+        heat (Tensor): A heatmap where local maximum pixels maintain its
+            own value and other positions are 0.
+    """
+    pad = (kernel - 1) // 2
+    hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad)
+    keep = (hmax == heat).float()
+    return heat * keep
+
+
+def get_topk_from_heatmap(scores, k=20):
+    """Get top k positions from heatmap.
+
+    Args:
+        scores (Tensor): Target heatmap with shape
+            [batch, num_classes, height, width].
+        k (int): Target number. Default: 20.
+
+    Returns:
+        tuple[torch.Tensor]: Scores, indexes, categories and coords of
+            topk keypoint. Containing following Tensors:
+
+        - topk_scores (Tensor): Max scores of each topk keypoint.
+        - topk_inds (Tensor): Indexes of each topk keypoint.
+        - topk_clses (Tensor): Categories of each topk keypoint.
+        - topk_ys (Tensor): Y-coord of each topk keypoint.
+        - topk_xs (Tensor): X-coord of each topk keypoint.
+    """
+    batch, _, height, width = scores.size()
+    topk_scores, topk_inds = torch.topk(scores.view(batch, -1), k)
+    topk_clses = topk_inds // (height * width)
+    topk_inds = topk_inds % (height * width)
+    topk_ys = topk_inds // width
+    topk_xs = (topk_inds % width).int().float()
+    return topk_scores, topk_inds, topk_clses, topk_ys, topk_xs
+
+
+def gather_feat(feat, ind, mask=None):
+    """Gather feature according to index.
+
+    Args:
+        feat (Tensor): Target feature map.
+        ind (Tensor): Target coord index.
+        mask (Tensor | None): Mask of feature map. Default: None.
+
+    Returns:
+        feat (Tensor): Gathered feature.
+    """
+    dim = feat.size(2)
+    ind = ind.unsqueeze(2).repeat(1, 1, dim)
+    feat = feat.gather(1, ind)
+    if mask is not None:
+        mask = mask.unsqueeze(2).expand_as(feat)
+        feat = feat[mask]
+        feat = feat.view(-1, dim)
+    return feat
+
+
+def transpose_and_gather_feat(feat, ind):
+    """Transpose and gather feature according to index.
+
+    Args:
+        feat (Tensor): Target feature map.
+        ind (Tensor): Target coord index.
+
+    Returns:
+        feat (Tensor): Transposed and gathered feature.
+    """
+    feat = feat.permute(0, 2, 3, 1).contiguous()
+    feat = feat.view(feat.size(0), -1, feat.size(3))
+    feat = gather_feat(feat, ind)
+    return feat
diff --git a/mmde/mmdet/models/utils/image.py b/mmde/mmdet/models/utils/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..16b5787a78232e46f47585c99526ca2b4ca9d1a1
--- /dev/null
+++ b/mmde/mmdet/models/utils/image.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import mmcv
+import numpy as np
+import torch
+from torch import Tensor
+
+
+def imrenormalize(img: Union[Tensor, np.ndarray], img_norm_cfg: dict,
+                  new_img_norm_cfg: dict) -> Union[Tensor, np.ndarray]:
+    """Re-normalize the image.
+
+    Args:
+        img (Tensor | ndarray): Input image. If the input is a Tensor, the
+            shape is (1, C, H, W). If the input is a ndarray, the shape
+            is (H, W, C).
+        img_norm_cfg (dict): Original configuration for the normalization.
+        new_img_norm_cfg (dict): New configuration for the normalization.
+
+    Returns:
+        Tensor | ndarray: Output image with the same type and shape of
+        the input.
+    """
+    if isinstance(img, torch.Tensor):
+        assert img.ndim == 4 and img.shape[0] == 1
+        new_img = img.squeeze(0).cpu().numpy().transpose(1, 2, 0)
+        new_img = _imrenormalize(new_img, img_norm_cfg, new_img_norm_cfg)
+        new_img = new_img.transpose(2, 0, 1)[None]
+        return torch.from_numpy(new_img).to(img)
+    else:
+        return _imrenormalize(img, img_norm_cfg, new_img_norm_cfg)
+
+
+def _imrenormalize(img: Union[Tensor, np.ndarray], img_norm_cfg: dict,
+                   new_img_norm_cfg: dict) -> Union[Tensor, np.ndarray]:
+    """Re-normalize the image."""
+    img_norm_cfg = img_norm_cfg.copy()
+    new_img_norm_cfg = new_img_norm_cfg.copy()
+    for k, v in img_norm_cfg.items():
+        if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray):
+            img_norm_cfg[k] = np.array(v, dtype=img.dtype)
+    # reverse cfg
+    if 'bgr_to_rgb' in img_norm_cfg:
+        img_norm_cfg['rgb_to_bgr'] = img_norm_cfg['bgr_to_rgb']
+        img_norm_cfg.pop('bgr_to_rgb')
+    for k, v in new_img_norm_cfg.items():
+        if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray):
+            new_img_norm_cfg[k] = np.array(v, dtype=img.dtype)
+    img = mmcv.imdenormalize(img, **img_norm_cfg)
+    img = mmcv.imnormalize(img, **new_img_norm_cfg)
+    return img
diff --git a/mmde/mmdet/models/utils/make_divisible.py b/mmde/mmdet/models/utils/make_divisible.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed42c2eeea2a6aed03a0be5516b8d1ef1139e486
--- /dev/null
+++ b/mmde/mmdet/models/utils/make_divisible.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
+    """Make divisible function.
+
+    This function rounds the channel number to the nearest value that can be
+    divisible by the divisor. It is taken from the original tf repo. It ensures
+    that all layers have a channel number that is divisible by divisor. It can
+    be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py  # noqa
+
+    Args:
+        value (int): The original channel number.
+        divisor (int): The divisor to fully divide the channel number.
+        min_value (int): The minimum value of the output channel.
+            Default: None, means that the minimum value equal to the divisor.
+        min_ratio (float): The minimum ratio of the rounded channel number to
+            the original channel number. Default: 0.9.
+
+    Returns:
+        int: The modified output channel number.
+    """
+
+    if min_value is None:
+        min_value = divisor
+    new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than (1-min_ratio).
+    if new_value < min_ratio * value:
+        new_value += divisor
+    return new_value
diff --git a/mmde/mmdet/models/utils/misc.py b/mmde/mmdet/models/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cf429153ba7e0be025396b069aef8212144e34d
--- /dev/null
+++ b/mmde/mmdet/models/utils/misc.py
@@ -0,0 +1,697 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from functools import partial
+from typing import List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+from mmengine.utils import digit_version
+from six.moves import map, zip
+from torch import Tensor
+from torch.autograd import Function
+from torch.nn import functional as F
+
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import BaseBoxes, get_box_type, stack_boxes
+from mmdet.structures.mask import BitmapMasks, PolygonMasks
+from mmdet.utils import OptInstanceList
+
+
+class SigmoidGeometricMean(Function):
+    """Forward and backward function of geometric mean of two sigmoid
+    functions.
+
+    This implementation with analytical gradient function substitutes
+    the autograd function of (x.sigmoid() * y.sigmoid()).sqrt(). The
+    original implementation incurs none during gradient backprapagation
+    if both x and y are very small values.
+    """
+
+    @staticmethod
+    def forward(ctx, x, y):
+        x_sigmoid = x.sigmoid()
+        y_sigmoid = y.sigmoid()
+        z = (x_sigmoid * y_sigmoid).sqrt()
+        ctx.save_for_backward(x_sigmoid, y_sigmoid, z)
+        return z
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        x_sigmoid, y_sigmoid, z = ctx.saved_tensors
+        grad_x = grad_output * z * (1 - x_sigmoid) / 2
+        grad_y = grad_output * z * (1 - y_sigmoid) / 2
+        return grad_x, grad_y
+
+
+sigmoid_geometric_mean = SigmoidGeometricMean.apply
+
+
+def interpolate_as(source, target, mode='bilinear', align_corners=False):
+    """Interpolate the `source` to the shape of the `target`.
+
+    The `source` must be a Tensor, but the `target` can be a Tensor or a
+    np.ndarray with the shape (..., target_h, target_w).
+
+    Args:
+        source (Tensor): A 3D/4D Tensor with the shape (N, H, W) or
+            (N, C, H, W).
+        target (Tensor | np.ndarray): The interpolation target with the shape
+            (..., target_h, target_w).
+        mode (str): Algorithm used for interpolation. The options are the
+            same as those in F.interpolate(). Default: ``'bilinear'``.
+        align_corners (bool): The same as the argument in F.interpolate().
+
+    Returns:
+        Tensor: The interpolated source Tensor.
+    """
+    assert len(target.shape) >= 2
+
+    def _interpolate_as(source, target, mode='bilinear', align_corners=False):
+        """Interpolate the `source` (4D) to the shape of the `target`."""
+        target_h, target_w = target.shape[-2:]
+        source_h, source_w = source.shape[-2:]
+        if target_h != source_h or target_w != source_w:
+            source = F.interpolate(
+                source,
+                size=(target_h, target_w),
+                mode=mode,
+                align_corners=align_corners)
+        return source
+
+    if len(source.shape) == 3:
+        source = source[:, None, :, :]
+        source = _interpolate_as(source, target, mode, align_corners)
+        return source[:, 0, :, :]
+    else:
+        return _interpolate_as(source, target, mode, align_corners)
+
+
+def unpack_gt_instances(batch_data_samples: SampleList) -> tuple:
+    """Unpack ``gt_instances``, ``gt_instances_ignore`` and ``img_metas`` based
+    on ``batch_data_samples``
+
+    Args:
+        batch_data_samples (List[:obj:`DetDataSample`]): The Data
+            Samples. It usually includes information such as
+            `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+    Returns:
+        tuple:
+
+            - batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            - batch_gt_instances_ignore (list[:obj:`InstanceData`]):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            - batch_img_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+    """
+    batch_gt_instances = []
+    batch_gt_instances_ignore = []
+    batch_img_metas = []
+    for data_sample in batch_data_samples:
+        batch_img_metas.append(data_sample.metainfo)
+        batch_gt_instances.append(data_sample.gt_instances)
+        if 'ignored_instances' in data_sample:
+            batch_gt_instances_ignore.append(data_sample.ignored_instances)
+        else:
+            batch_gt_instances_ignore.append(None)
+
+    return batch_gt_instances, batch_gt_instances_ignore, batch_img_metas
+
+
+def empty_instances(batch_img_metas: List[dict],
+                    device: torch.device,
+                    task_type: str,
+                    instance_results: OptInstanceList = None,
+                    mask_thr_binary: Union[int, float] = 0,
+                    box_type: Union[str, type] = 'hbox',
+                    use_box_type: bool = False,
+                    num_classes: int = 80,
+                    score_per_cls: bool = False) -> List[InstanceData]:
+    """Handle predicted instances when RoI is empty.
+
+    Note: If ``instance_results`` is not None, it will be modified
+    in place internally, and then return ``instance_results``
+
+    Args:
+        batch_img_metas (list[dict]): List of image information.
+        device (torch.device): Device of tensor.
+        task_type (str): Expected returned task type. it currently
+            supports bbox and mask.
+        instance_results (list[:obj:`InstanceData`]): List of instance
+            results.
+        mask_thr_binary (int, float): mask binarization threshold.
+            Defaults to 0.
+        box_type (str or type): The empty box type. Defaults to `hbox`.
+        use_box_type (bool): Whether to warp boxes with the box type.
+            Defaults to False.
+        num_classes (int): num_classes of bbox_head. Defaults to 80.
+        score_per_cls (bool):  Whether to generate classwise score for
+            the empty instance. ``score_per_cls`` will be True when the model
+            needs to produce raw results without nms. Defaults to False.
+
+    Returns:
+        list[:obj:`InstanceData`]: Detection results of each image
+    """
+    assert task_type in ('bbox', 'mask'), 'Only support bbox and mask,' \
+                                          f' but got {task_type}'
+
+    if instance_results is not None:
+        assert len(instance_results) == len(batch_img_metas)
+
+    results_list = []
+    for img_id in range(len(batch_img_metas)):
+        if instance_results is not None:
+            results = instance_results[img_id]
+            assert isinstance(results, InstanceData)
+        else:
+            results = InstanceData()
+
+        if task_type == 'bbox':
+            _, box_type = get_box_type(box_type)
+            bboxes = torch.zeros(0, box_type.box_dim, device=device)
+            if use_box_type:
+                bboxes = box_type(bboxes, clone=False)
+            results.bboxes = bboxes
+            score_shape = (0, num_classes + 1) if score_per_cls else (0, )
+            results.scores = torch.zeros(score_shape, device=device)
+            results.labels = torch.zeros((0, ),
+                                         device=device,
+                                         dtype=torch.long)
+        else:
+            # TODO: Handle the case where rescale is false
+            img_h, img_w = batch_img_metas[img_id]['ori_shape'][:2]
+            # the type of `im_mask` will be torch.bool or torch.uint8,
+            # where uint8 if for visualization and debugging.
+            im_mask = torch.zeros(
+                0,
+                img_h,
+                img_w,
+                device=device,
+                dtype=torch.bool if mask_thr_binary >= 0 else torch.uint8)
+            results.masks = im_mask
+        results_list.append(results)
+    return results_list
+
+
+def multi_apply(func, *args, **kwargs):
+    """Apply function to a list of arguments.
+
+    Note:
+        This function applies the ``func`` to multiple inputs and
+        map the multiple outputs of the ``func`` into different
+        list. Each list contains the same type of outputs corresponding
+        to different inputs.
+
+    Args:
+        func (Function): A function that will be applied to a list of
+            arguments
+
+    Returns:
+        tuple(list): A tuple containing multiple list, each list contains \
+            a kind of returned results by the function
+    """
+    pfunc = partial(func, **kwargs) if kwargs else func
+    map_results = map(pfunc, *args)
+    return tuple(map(list, zip(*map_results)))
+
+
+def unmap(data, count, inds, fill=0):
+    """Unmap a subset of item (data) back to the original set of items (of size
+    count)"""
+    if data.dim() == 1:
+        ret = data.new_full((count, ), fill)
+        ret[inds.type(torch.bool)] = data
+    else:
+        new_size = (count, ) + data.size()[1:]
+        ret = data.new_full(new_size, fill)
+        ret[inds.type(torch.bool), :] = data
+    return ret
+
+
+def mask2ndarray(mask):
+    """Convert Mask to ndarray..
+
+    Args:
+        mask (:obj:`BitmapMasks` or :obj:`PolygonMasks` or
+        torch.Tensor or np.ndarray): The mask to be converted.
+
+    Returns:
+        np.ndarray: Ndarray mask of shape (n, h, w) that has been converted
+    """
+    if isinstance(mask, (BitmapMasks, PolygonMasks)):
+        mask = mask.to_ndarray()
+    elif isinstance(mask, torch.Tensor):
+        mask = mask.detach().cpu().numpy()
+    elif not isinstance(mask, np.ndarray):
+        raise TypeError(f'Unsupported {type(mask)} data type')
+    return mask
+
+
+def flip_tensor(src_tensor, flip_direction):
+    """flip tensor base on flip_direction.
+
+    Args:
+        src_tensor (Tensor): input feature map, shape (B, C, H, W).
+        flip_direction (str): The flipping direction. Options are
+          'horizontal', 'vertical', 'diagonal'.
+
+    Returns:
+        out_tensor (Tensor): Flipped tensor.
+    """
+    assert src_tensor.ndim == 4
+    valid_directions = ['horizontal', 'vertical', 'diagonal']
+    assert flip_direction in valid_directions
+    if flip_direction == 'horizontal':
+        out_tensor = torch.flip(src_tensor, [3])
+    elif flip_direction == 'vertical':
+        out_tensor = torch.flip(src_tensor, [2])
+    else:
+        out_tensor = torch.flip(src_tensor, [2, 3])
+    return out_tensor
+
+
+def select_single_mlvl(mlvl_tensors, batch_id, detach=True):
+    """Extract a multi-scale single image tensor from a multi-scale batch
+    tensor based on batch index.
+
+    Note: The default value of detach is True, because the proposal gradient
+    needs to be detached during the training of the two-stage model. E.g
+    Cascade Mask R-CNN.
+
+    Args:
+        mlvl_tensors (list[Tensor]): Batch tensor for all scale levels,
+           each is a 4D-tensor.
+        batch_id (int): Batch index.
+        detach (bool): Whether detach gradient. Default True.
+
+    Returns:
+        list[Tensor]: Multi-scale single image tensor.
+    """
+    assert isinstance(mlvl_tensors, (list, tuple))
+    num_levels = len(mlvl_tensors)
+
+    if detach:
+        mlvl_tensor_list = [
+            mlvl_tensors[i][batch_id].detach() for i in range(num_levels)
+        ]
+    else:
+        mlvl_tensor_list = [
+            mlvl_tensors[i][batch_id] for i in range(num_levels)
+        ]
+    return mlvl_tensor_list
+
+
+def filter_scores_and_topk(scores, score_thr, topk, results=None):
+    """Filter results using score threshold and topk candidates.
+
+    Args:
+        scores (Tensor): The scores, shape (num_bboxes, K).
+        score_thr (float): The score filter threshold.
+        topk (int): The number of topk candidates.
+        results (dict or list or Tensor, Optional): The results to
+           which the filtering rule is to be applied. The shape
+           of each item is (num_bboxes, N).
+
+    Returns:
+        tuple: Filtered results
+
+            - scores (Tensor): The scores after being filtered, \
+                shape (num_bboxes_filtered, ).
+            - labels (Tensor): The class labels, shape \
+                (num_bboxes_filtered, ).
+            - anchor_idxs (Tensor): The anchor indexes, shape \
+                (num_bboxes_filtered, ).
+            - filtered_results (dict or list or Tensor, Optional): \
+                The filtered results. The shape of each item is \
+                (num_bboxes_filtered, N).
+    """
+    valid_mask = scores > score_thr
+    scores = scores[valid_mask]
+    valid_idxs = torch.nonzero(valid_mask)
+
+    num_topk = min(topk, valid_idxs.size(0))
+    # torch.sort is actually faster than .topk (at least on GPUs)
+    scores, idxs = scores.sort(descending=True)
+    scores = scores[:num_topk]
+    topk_idxs = valid_idxs[idxs[:num_topk]]
+    keep_idxs, labels = topk_idxs.unbind(dim=1)
+
+    filtered_results = None
+    if results is not None:
+        if isinstance(results, dict):
+            filtered_results = {k: v[keep_idxs] for k, v in results.items()}
+        elif isinstance(results, list):
+            filtered_results = [result[keep_idxs] for result in results]
+        elif isinstance(results, torch.Tensor):
+            filtered_results = results[keep_idxs]
+        else:
+            raise NotImplementedError(f'Only supports dict or list or Tensor, '
+                                      f'but get {type(results)}.')
+    return scores, labels, keep_idxs, filtered_results
+
+
+def center_of_mass(mask, esp=1e-6):
+    """Calculate the centroid coordinates of the mask.
+
+    Args:
+        mask (Tensor): The mask to be calculated, shape (h, w).
+        esp (float): Avoid dividing by zero. Default: 1e-6.
+
+    Returns:
+        tuple[Tensor]: the coordinates of the center point of the mask.
+
+            - center_h (Tensor): the center point of the height.
+            - center_w (Tensor): the center point of the width.
+    """
+    h, w = mask.shape
+    grid_h = torch.arange(h, device=mask.device)[:, None]
+    grid_w = torch.arange(w, device=mask.device)
+    normalizer = mask.sum().float().clamp(min=esp)
+    center_h = (mask * grid_h).sum() / normalizer
+    center_w = (mask * grid_w).sum() / normalizer
+    return center_h, center_w
+
+
+def generate_coordinate(featmap_sizes, device='cuda'):
+    """Generate the coordinate.
+
+    Args:
+        featmap_sizes (tuple): The feature to be calculated,
+            of shape (N, C, W, H).
+        device (str): The device where the feature will be put on.
+    Returns:
+        coord_feat (Tensor): The coordinate feature, of shape (N, 2, W, H).
+    """
+
+    x_range = torch.linspace(-1, 1, featmap_sizes[-1], device=device)
+    y_range = torch.linspace(-1, 1, featmap_sizes[-2], device=device)
+    y, x = torch.meshgrid(y_range, x_range)
+    y = y.expand([featmap_sizes[0], 1, -1, -1])
+    x = x.expand([featmap_sizes[0], 1, -1, -1])
+    coord_feat = torch.cat([x, y], 1)
+
+    return coord_feat
+
+
+def levels_to_images(mlvl_tensor: List[torch.Tensor]) -> List[torch.Tensor]:
+    """Concat multi-level feature maps by image.
+
+    [feature_level0, feature_level1...] -> [feature_image0, feature_image1...]
+    Convert the shape of each element in mlvl_tensor from (N, C, H, W) to
+    (N, H*W , C), then split the element to N elements with shape (H*W, C), and
+    concat elements in same image of all level along first dimension.
+
+    Args:
+        mlvl_tensor (list[Tensor]): list of Tensor which collect from
+            corresponding level. Each element is of shape (N, C, H, W)
+
+    Returns:
+        list[Tensor]: A list that contains N tensors and each tensor is
+            of shape (num_elements, C)
+    """
+    batch_size = mlvl_tensor[0].size(0)
+    batch_list = [[] for _ in range(batch_size)]
+    channels = mlvl_tensor[0].size(1)
+    for t in mlvl_tensor:
+        t = t.permute(0, 2, 3, 1)
+        t = t.view(batch_size, -1, channels).contiguous()
+        for img in range(batch_size):
+            batch_list[img].append(t[img])
+    return [torch.cat(item, 0) for item in batch_list]
+
+
+def images_to_levels(target, num_levels):
+    """Convert targets by image to targets by feature level.
+
+    [target_img0, target_img1] -> [target_level0, target_level1, ...]
+    """
+    target = stack_boxes(target, 0)
+    level_targets = []
+    start = 0
+    for n in num_levels:
+        end = start + n
+        # level_targets.append(target[:, start:end].squeeze(0))
+        level_targets.append(target[:, start:end])
+        start = end
+    return level_targets
+
+
+def samplelist_boxtype2tensor(batch_data_samples: SampleList) -> SampleList:
+    for data_samples in batch_data_samples:
+        if 'gt_instances' in data_samples:
+            bboxes = data_samples.gt_instances.get('bboxes', None)
+            if isinstance(bboxes, BaseBoxes):
+                data_samples.gt_instances.bboxes = bboxes.tensor
+        if 'pred_instances' in data_samples:
+            bboxes = data_samples.pred_instances.get('bboxes', None)
+            if isinstance(bboxes, BaseBoxes):
+                data_samples.pred_instances.bboxes = bboxes.tensor
+        if 'ignored_instances' in data_samples:
+            bboxes = data_samples.ignored_instances.get('bboxes', None)
+            if isinstance(bboxes, BaseBoxes):
+                data_samples.ignored_instances.bboxes = bboxes.tensor
+
+
+_torch_version_div_indexing = (
+    'parrots' not in torch.__version__
+    and digit_version(torch.__version__) >= digit_version('1.8'))
+
+
+def floordiv(dividend, divisor, rounding_mode='trunc'):
+    if _torch_version_div_indexing:
+        return torch.div(dividend, divisor, rounding_mode=rounding_mode)
+    else:
+        return dividend // divisor
+
+
+def _filter_gt_instances_by_score(batch_data_samples: SampleList,
+                                  score_thr: float) -> SampleList:
+    """Filter ground truth (GT) instances by score.
+
+    Args:
+        batch_data_samples (SampleList): The Data
+            Samples. It usually includes information such as
+            `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+        score_thr (float): The score filter threshold.
+
+    Returns:
+        SampleList: The Data Samples filtered by score.
+    """
+    for data_samples in batch_data_samples:
+        assert 'scores' in data_samples.gt_instances, \
+            'there does not exit scores in instances'
+        if data_samples.gt_instances.bboxes.shape[0] > 0:
+            data_samples.gt_instances = data_samples.gt_instances[
+                data_samples.gt_instances.scores > score_thr]
+    return batch_data_samples
+
+
+def _filter_gt_instances_by_size(batch_data_samples: SampleList,
+                                 wh_thr: tuple) -> SampleList:
+    """Filter ground truth (GT) instances by size.
+
+    Args:
+        batch_data_samples (SampleList): The Data
+            Samples. It usually includes information such as
+            `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+        wh_thr (tuple):  Minimum width and height of bbox.
+
+    Returns:
+        SampleList: The Data Samples filtered by score.
+    """
+    for data_samples in batch_data_samples:
+        bboxes = data_samples.gt_instances.bboxes
+        if bboxes.shape[0] > 0:
+            w = bboxes[:, 2] - bboxes[:, 0]
+            h = bboxes[:, 3] - bboxes[:, 1]
+            data_samples.gt_instances = data_samples.gt_instances[
+                (w > wh_thr[0]) & (h > wh_thr[1])]
+    return batch_data_samples
+
+
+def filter_gt_instances(batch_data_samples: SampleList,
+                        score_thr: float = None,
+                        wh_thr: tuple = None):
+    """Filter ground truth (GT) instances by score and/or size.
+
+    Args:
+        batch_data_samples (SampleList): The Data
+            Samples. It usually includes information such as
+            `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+        score_thr (float): The score filter threshold.
+        wh_thr (tuple):  Minimum width and height of bbox.
+
+    Returns:
+        SampleList: The Data Samples filtered by score and/or size.
+    """
+
+    if score_thr is not None:
+        batch_data_samples = _filter_gt_instances_by_score(
+            batch_data_samples, score_thr)
+    if wh_thr is not None:
+        batch_data_samples = _filter_gt_instances_by_size(
+            batch_data_samples, wh_thr)
+    return batch_data_samples
+
+
+def rename_loss_dict(prefix: str, losses: dict) -> dict:
+    """Rename the key names in loss dict by adding a prefix.
+
+    Args:
+        prefix (str): The prefix for loss components.
+        losses (dict):  A dictionary of loss components.
+
+    Returns:
+            dict: A dictionary of loss components with prefix.
+    """
+    return {prefix + k: v for k, v in losses.items()}
+
+
+def reweight_loss_dict(losses: dict, weight: float) -> dict:
+    """Reweight losses in the dict by weight.
+
+    Args:
+        losses (dict):  A dictionary of loss components.
+        weight (float): Weight for loss components.
+
+    Returns:
+            dict: A dictionary of weighted loss components.
+    """
+    for name, loss in losses.items():
+        if 'loss' in name:
+            if isinstance(loss, Sequence):
+                losses[name] = [item * weight for item in loss]
+            else:
+                losses[name] = loss * weight
+    return losses
+
+
+def relative_coordinate_maps(
+    locations: Tensor,
+    centers: Tensor,
+    strides: Tensor,
+    size_of_interest: int,
+    feat_sizes: Tuple[int],
+) -> Tensor:
+    """Generate the relative coordinate maps with feat_stride.
+
+    Args:
+        locations (Tensor): The prior location of mask feature map.
+            It has shape (num_priors, 2).
+        centers (Tensor): The prior points of a object in
+            all feature pyramid. It has shape (num_pos, 2)
+        strides (Tensor): The prior strides of a object in
+            all feature pyramid. It has shape (num_pos, 1)
+        size_of_interest (int): The size of the region used in rel coord.
+        feat_sizes (Tuple[int]): The feature size H and W, which has 2 dims.
+    Returns:
+        rel_coord_feat (Tensor): The coordinate feature
+            of shape (num_pos, 2, H, W).
+    """
+
+    H, W = feat_sizes
+    rel_coordinates = centers.reshape(-1, 1, 2) - locations.reshape(1, -1, 2)
+    rel_coordinates = rel_coordinates.permute(0, 2, 1).float()
+    rel_coordinates = rel_coordinates / (
+        strides[:, None, None] * size_of_interest)
+    return rel_coordinates.reshape(-1, 2, H, W)
+
+
+def aligned_bilinear(tensor: Tensor, factor: int) -> Tensor:
+    """aligned bilinear, used in original implement in CondInst:
+
+    https://github.com/aim-uofa/AdelaiDet/blob/\
+    c0b2092ce72442b0f40972f7c6dda8bb52c46d16/adet/utils/comm.py#L23
+    """
+
+    assert tensor.dim() == 4
+    assert factor >= 1
+    assert int(factor) == factor
+
+    if factor == 1:
+        return tensor
+
+    h, w = tensor.size()[2:]
+    tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode='replicate')
+    oh = factor * h + 1
+    ow = factor * w + 1
+    tensor = F.interpolate(
+        tensor, size=(oh, ow), mode='bilinear', align_corners=True)
+    tensor = F.pad(
+        tensor, pad=(factor // 2, 0, factor // 2, 0), mode='replicate')
+
+    return tensor[:, :, :oh - 1, :ow - 1]
+
+
+def unfold_wo_center(x, kernel_size: int, dilation: int) -> Tensor:
+    """unfold_wo_center, used in original implement in BoxInst:
+
+    https://github.com/aim-uofa/AdelaiDet/blob/\
+    4a3a1f7372c35b48ebf5f6adc59f135a0fa28d60/\
+    adet/modeling/condinst/condinst.py#L53
+    """
+    assert x.dim() == 4
+    assert kernel_size % 2 == 1
+
+    # using SAME padding
+    padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2
+    unfolded_x = F.unfold(
+        x, kernel_size=kernel_size, padding=padding, dilation=dilation)
+    unfolded_x = unfolded_x.reshape(
+        x.size(0), x.size(1), -1, x.size(2), x.size(3))
+    # remove the center pixels
+    size = kernel_size**2
+    unfolded_x = torch.cat(
+        (unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:]),
+        dim=2)
+
+    return unfolded_x
+
+
+def padding_to(input_tensor: Tensor, max_len: int = 300) -> Tensor:
+    """Pad the first dimension of `input_tensor` to `max_len`.
+
+    Args:
+        input_tensor (Tensor): The tensor to be padded,
+        max_len (int): Padding target size in the first dimension.
+            Default: 300
+    https://github.com/jshilong/DDQ/blob/ddq_detr/projects/models/utils.py#L19
+    Returns:
+        Tensor: The tensor padded with the first dimension size `max_len`.
+    """
+    if max_len is None:
+        return input_tensor
+    num_padding = max_len - len(input_tensor)
+    if input_tensor.dim() > 1:
+        padding = input_tensor.new_zeros(
+            num_padding, *input_tensor.size()[1:], dtype=input_tensor.dtype)
+    else:
+        padding = input_tensor.new_zeros(num_padding, dtype=input_tensor.dtype)
+    output_tensor = torch.cat([input_tensor, padding], dim=0)
+    return output_tensor
+
+
+def align_tensor(inputs: List[Tensor],
+                 max_len: Optional[int] = None) -> Tensor:
+    """Pad each input to `max_len`, then stack them. If `max_len` is None, then
+    it is the max size of the first dimension of each input.
+
+        https://github.com/jshilong/DDQ/blob/ddq_detr/projects/models/\
+        utils.py#L12
+
+    Args:
+        inputs (list[Tensor]): The tensors to be padded,
+            Each input should have the same shape except the first dimension.
+        max_len (int): Padding target size in the first dimension.
+            Default: None
+    Returns:
+        Tensor: Stacked inputs after padding in the first dimension.
+    """
+    if max_len is None:
+        max_len = max([len(item) for item in inputs])
+
+    return torch.stack([padding_to(item, max_len) for item in inputs])
diff --git a/mmde/mmdet/models/utils/panoptic_gt_processing.py b/mmde/mmdet/models/utils/panoptic_gt_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3bc95fc04040b4a2a13fa63f2d02f092f725e6
--- /dev/null
+++ b/mmde/mmdet/models/utils/panoptic_gt_processing.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+from torch import Tensor
+
+
+def preprocess_panoptic_gt(gt_labels: Tensor, gt_masks: Tensor,
+                           gt_semantic_seg: Tensor, num_things: int,
+                           num_stuff: int) -> Tuple[Tensor, Tensor]:
+    """Preprocess the ground truth for a image.
+
+    Args:
+        gt_labels (Tensor): Ground truth labels of each bbox,
+            with shape (num_gts, ).
+        gt_masks (BitmapMasks): Ground truth masks of each instances
+            of a image, shape (num_gts, h, w).
+        gt_semantic_seg (Tensor | None): Ground truth of semantic
+            segmentation with the shape (1, h, w).
+            [0, num_thing_class - 1] means things,
+            [num_thing_class, num_class-1] means stuff,
+            255 means VOID. It's None when training instance segmentation.
+
+    Returns:
+        tuple[Tensor, Tensor]: a tuple containing the following targets.
+
+            - labels (Tensor): Ground truth class indices for a
+                image, with shape (n, ), n is the sum of number
+                of stuff type and number of instance in a image.
+            - masks (Tensor): Ground truth mask for a image, with
+                shape (n, h, w). Contains stuff and things when training
+                panoptic segmentation, and things only when training
+                instance segmentation.
+    """
+    num_classes = num_things + num_stuff
+    things_masks = gt_masks.to_tensor(
+        dtype=torch.bool, device=gt_labels.device)
+
+    if gt_semantic_seg is None:
+        masks = things_masks.long()
+        return gt_labels, masks
+
+    things_labels = gt_labels
+    gt_semantic_seg = gt_semantic_seg.squeeze(0)
+
+    semantic_labels = torch.unique(
+        gt_semantic_seg,
+        sorted=False,
+        return_inverse=False,
+        return_counts=False)
+    stuff_masks_list = []
+    stuff_labels_list = []
+    for label in semantic_labels:
+        if label < num_things or label >= num_classes:
+            continue
+        stuff_mask = gt_semantic_seg == label
+        stuff_masks_list.append(stuff_mask)
+        stuff_labels_list.append(label)
+
+    if len(stuff_masks_list) > 0:
+        stuff_masks = torch.stack(stuff_masks_list, dim=0)
+        stuff_labels = torch.stack(stuff_labels_list, dim=0)
+        labels = torch.cat([things_labels, stuff_labels], dim=0)
+        masks = torch.cat([things_masks, stuff_masks], dim=0)
+    else:
+        labels = things_labels
+        masks = things_masks
+
+    masks = masks.long()
+    return labels, masks
diff --git a/mmde/mmdet/models/utils/point_sample.py b/mmde/mmdet/models/utils/point_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..1afc957f3da7d1dc030c21d40311c768c6952ea4
--- /dev/null
+++ b/mmde/mmdet/models/utils/point_sample.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import point_sample
+from torch import Tensor
+
+
+def get_uncertainty(mask_preds: Tensor, labels: Tensor) -> Tensor:
+    """Estimate uncertainty based on pred logits.
+
+    We estimate uncertainty as L1 distance between 0.0 and the logits
+    prediction in 'mask_preds' for the foreground class in `classes`.
+
+    Args:
+        mask_preds (Tensor): mask predication logits, shape (num_rois,
+            num_classes, mask_height, mask_width).
+
+        labels (Tensor): Either predicted or ground truth label for
+            each predicted mask, of length num_rois.
+
+    Returns:
+        scores (Tensor): Uncertainty scores with the most uncertain
+            locations having the highest uncertainty score,
+            shape (num_rois, 1, mask_height, mask_width)
+    """
+    if mask_preds.shape[1] == 1:
+        gt_class_logits = mask_preds.clone()
+    else:
+        inds = torch.arange(mask_preds.shape[0], device=mask_preds.device)
+        gt_class_logits = mask_preds[inds, labels].unsqueeze(1)
+    return -torch.abs(gt_class_logits)
+
+
+def get_uncertain_point_coords_with_randomness(
+        mask_preds: Tensor, labels: Tensor, num_points: int,
+        oversample_ratio: float, importance_sample_ratio: float) -> Tensor:
+    """Get ``num_points`` most uncertain points with random points during
+    train.
+
+    Sample points in [0, 1] x [0, 1] coordinate space based on their
+    uncertainty. The uncertainties are calculated for each point using
+    'get_uncertainty()' function that takes point's logit prediction as
+    input.
+
+    Args:
+        mask_preds (Tensor): A tensor of shape (num_rois, num_classes,
+            mask_height, mask_width) for class-specific or class-agnostic
+            prediction.
+        labels (Tensor): The ground truth class for each instance.
+        num_points (int): The number of points to sample.
+        oversample_ratio (float): Oversampling parameter.
+        importance_sample_ratio (float): Ratio of points that are sampled
+            via importnace sampling.
+
+    Returns:
+        point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
+            that contains the coordinates sampled points.
+    """
+    assert oversample_ratio >= 1
+    assert 0 <= importance_sample_ratio <= 1
+    batch_size = mask_preds.shape[0]
+    num_sampled = int(num_points * oversample_ratio)
+    point_coords = torch.rand(
+        batch_size, num_sampled, 2, device=mask_preds.device)
+    point_logits = point_sample(mask_preds, point_coords)
+    # It is crucial to calculate uncertainty based on the sampled
+    # prediction value for the points. Calculating uncertainties of the
+    # coarse predictions first and sampling them for points leads to
+    # incorrect results.  To illustrate this: assume uncertainty func(
+    # logits)=-abs(logits), a sampled point between two coarse
+    # predictions with -1 and 1 logits has 0 logits, and therefore 0
+    # uncertainty value. However, if we calculate uncertainties for the
+    # coarse predictions first, both will have -1 uncertainty,
+    # and sampled point will get -1 uncertainty.
+    point_uncertainties = get_uncertainty(point_logits, labels)
+    num_uncertain_points = int(importance_sample_ratio * num_points)
+    num_random_points = num_points - num_uncertain_points
+    idx = torch.topk(
+        point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
+    shift = num_sampled * torch.arange(
+        batch_size, dtype=torch.long, device=mask_preds.device)
+    idx += shift[:, None]
+    point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(
+        batch_size, num_uncertain_points, 2)
+    if num_random_points > 0:
+        rand_roi_coords = torch.rand(
+            batch_size, num_random_points, 2, device=mask_preds.device)
+        point_coords = torch.cat((point_coords, rand_roi_coords), dim=1)
+    return point_coords
diff --git a/mmde/mmdet/models/utils/vlfuse_helper.py b/mmde/mmdet/models/utils/vlfuse_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..76b54de317c1f24d7cb40573954f988fd94fef42
--- /dev/null
+++ b/mmde/mmdet/models/utils/vlfuse_helper.py
@@ -0,0 +1,773 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Modified from https://github.com/microsoft/GLIP/blob/main/maskrcnn_benchmark/utils/fuse_helper.py  # noqa
+# and https://github.com/microsoft/GLIP/blob/main/maskrcnn_benchmark/modeling/rpn/modeling_bert.py  # noqa
+import math
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from mmcv.cnn.bricks import DropPath
+from torch import Tensor
+
+try:
+    from transformers import BertConfig, BertPreTrainedModel
+    from transformers.modeling_utils import apply_chunking_to_forward
+    from transformers.models.bert.modeling_bert import \
+        BertAttention as HFBertAttention
+    from transformers.models.bert.modeling_bert import \
+        BertIntermediate as HFBertIntermediate
+    from transformers.models.bert.modeling_bert import \
+        BertOutput as HFBertOutput
+except ImportError:
+    BertConfig = None
+    BertPreTrainedModel = object
+    apply_chunking_to_forward = None
+    HFBertAttention = object
+    HFBertIntermediate = object
+    HFBertOutput = object
+
+MAX_CLAMP_VALUE = 50000
+
+
+def permute_and_flatten(layer: Tensor, N: int, A: int, C: int, H: int,
+                        W: int) -> Tensor:
+    """Permute and then flatten a tensor,
+
+       from size (N, A, C, H, W) to (N, H * W * A, C).
+
+    Args:
+        layer (Tensor): Tensor of shape (N, C, H, W).
+        N (int): Batch size.
+        A (int): Number of attention heads.
+        C (int): Number of channels.
+        H (int): Height of feature map.
+        W (int): Width of feature map.
+
+    Returns:
+        Tensor: A Tensor of shape (N, H * W * A, C).
+    """
+    layer = layer.view(N, A, C, H, W)
+    layer = layer.permute(0, 3, 4, 1, 2)
+    layer = layer.reshape(N, -1, C)
+    return layer
+
+
+def clamp_values(vector: Tensor) -> Tensor:
+    """Clamp the values of a vector to the range [-MAX_CLAMP_VALUE,
+    MAX_CLAMP_VALUE].
+
+    Args:
+        vector (Tensor): Tensor of shape (N, C, H, W).
+
+    Returns:
+        Tensor: A Tensor of shape (N, C, H, W) with clamped values.
+    """
+    vector = torch.clamp(vector, min=-MAX_CLAMP_VALUE, max=MAX_CLAMP_VALUE)
+    return vector
+
+
+class BiMultiHeadAttention(nn.Module):
+    """Bidirectional fusion Multi-Head Attention layer.
+
+    Args:
+        v_dim (int): The dimension of the vision input.
+        l_dim (int): The dimension of the language input.
+        embed_dim (int): The embedding dimension for the attention operation.
+        num_heads (int): The number of attention heads.
+        dropout (float, optional): The dropout probability. Defaults to 0.1.
+    """
+
+    def __init__(self,
+                 v_dim: int,
+                 l_dim: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 dropout: float = 0.1):
+        super(BiMultiHeadAttention, self).__init__()
+
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.v_dim = v_dim
+        self.l_dim = l_dim
+
+        assert (
+            self.head_dim * self.num_heads == self.embed_dim
+        ), 'embed_dim must be divisible by num_heads ' \
+           f'(got `embed_dim`: {self.embed_dim} ' \
+           f'and `num_heads`: {self.num_heads}).'
+        self.scale = self.head_dim**(-0.5)
+        self.dropout = dropout
+
+        self.v_proj = nn.Linear(self.v_dim, self.embed_dim)
+        self.l_proj = nn.Linear(self.l_dim, self.embed_dim)
+        self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim)
+        self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim)
+
+        self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim)
+        self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim)
+
+        self.stable_softmax_2d = False
+        self.clamp_min_for_underflow = True
+        self.clamp_max_for_overflow = True
+
+        self._reset_parameters()
+
+    def _shape(self, tensor: Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads,
+                           self.head_dim).transpose(1, 2).contiguous()
+
+    def _reset_parameters(self):
+        nn.init.xavier_uniform_(self.v_proj.weight)
+        self.v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.l_proj.weight)
+        self.l_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.values_v_proj.weight)
+        self.values_v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.values_l_proj.weight)
+        self.values_l_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.out_v_proj.weight)
+        self.out_v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.out_l_proj.weight)
+        self.out_l_proj.bias.data.fill_(0)
+
+    def forward(
+        self,
+        vision: Tensor,
+        lang: Tensor,
+        attention_mask_v: Optional[Tensor] = None,
+        attention_mask_l: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Tensor]:
+        bsz, tgt_len, _ = vision.size()
+
+        query_states = self.v_proj(vision) * self.scale
+        key_states = self._shape(self.l_proj(lang), -1, bsz)
+        value_v_states = self._shape(self.values_v_proj(vision), -1, bsz)
+        value_l_states = self._shape(self.values_l_proj(lang), -1, bsz)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len,
+                                   bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_v_states = value_v_states.view(*proj_shape)
+        value_l_states = value_l_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f'Attention weights should be of '
+                f'size {(bsz * self.num_heads, tgt_len, src_len)}, '
+                f'but is {attn_weights.size()}')
+
+        if self.stable_softmax_2d:
+            attn_weights = attn_weights - attn_weights.max()
+
+        if self.clamp_min_for_underflow:
+            # Do not increase -50000, data type half has quite limited range
+            attn_weights = torch.clamp(attn_weights, min=-MAX_CLAMP_VALUE)
+        if self.clamp_max_for_overflow:
+            # Do not increase 50000, data type half has quite limited range
+            attn_weights = torch.clamp(attn_weights, max=MAX_CLAMP_VALUE)
+
+        attn_weights_T = attn_weights.transpose(1, 2)
+        attn_weights_l = (
+            attn_weights_T -
+            torch.max(attn_weights_T, dim=-1, keepdim=True)[0])
+        if self.clamp_min_for_underflow:
+            # Do not increase -50000, data type half has quite limited range
+            attn_weights_l = torch.clamp(attn_weights_l, min=-MAX_CLAMP_VALUE)
+        if self.clamp_max_for_overflow:
+            # Do not increase 50000, data type half has quite limited range
+            attn_weights_l = torch.clamp(attn_weights_l, max=MAX_CLAMP_VALUE)
+
+        if attention_mask_v is not None:
+            attention_mask_v = (
+                attention_mask_v[:, None,
+                                 None, :].repeat(1, self.num_heads, 1,
+                                                 1).flatten(0, 1))
+            attn_weights_l.masked_fill_(attention_mask_v, float('-inf'))
+
+        attn_weights_l = attn_weights_l.softmax(dim=-1)
+
+        if attention_mask_l is not None:
+            assert (attention_mask_l.dim() == 2)
+            attention_mask = attention_mask_l.unsqueeze(1).unsqueeze(1)
+            attention_mask = attention_mask.expand(bsz, 1, tgt_len, src_len)
+            attention_mask = attention_mask.masked_fill(
+                attention_mask == 0, -9e15)
+
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError('Attention mask should be of '
+                                 f'size {(bsz, 1, tgt_len, src_len)}')
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
+                                             src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
+                                             src_len)
+
+        attn_weights_v = nn.functional.softmax(attn_weights, dim=-1)
+
+        attn_probs_v = F.dropout(
+            attn_weights_v, p=self.dropout, training=self.training)
+        attn_probs_l = F.dropout(
+            attn_weights_l, p=self.dropout, training=self.training)
+
+        attn_output_v = torch.bmm(attn_probs_v, value_l_states)
+        attn_output_l = torch.bmm(attn_probs_l, value_v_states)
+
+        if attn_output_v.size() != (bsz * self.num_heads, tgt_len,
+                                    self.head_dim):
+            raise ValueError(
+                '`attn_output_v` should be of '
+                f'size {(bsz, self.num_heads, tgt_len, self.head_dim)}, '
+                f'but is {attn_output_v.size()}')
+
+        if attn_output_l.size() != (bsz * self.num_heads, src_len,
+                                    self.head_dim):
+            raise ValueError(
+                '`attn_output_l` should be of size '
+                f'{(bsz, self.num_heads, src_len, self.head_dim)}, '
+                f'but is {attn_output_l.size()}')
+
+        attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len,
+                                           self.head_dim)
+        attn_output_v = attn_output_v.transpose(1, 2)
+        attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len,
+                                           self.head_dim)
+        attn_output_l = attn_output_l.transpose(1, 2)
+        attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim)
+
+        attn_output_v = self.out_v_proj(attn_output_v)
+        attn_output_l = self.out_l_proj(attn_output_l)
+
+        return attn_output_v, attn_output_l
+
+
+class BiAttentionBlock(nn.Module):
+    """BiAttentionBlock Module:
+
+    First, multi-level visual features are concat; Then the concat visual
+    feature and lang feature are fused by attention; Finally the newly visual
+    feature are split into multi levels.
+
+    Args:
+        v_dim (int): The dimension of the visual features.
+        l_dim (int): The dimension of the language feature.
+        embed_dim (int): The embedding dimension for the attention operation.
+        num_heads (int): The number of attention heads.
+        dropout (float, optional): The dropout probability. Defaults to 0.1.
+        drop_path (float, optional): The drop path probability.
+            Defaults to 0.0.
+        init_values (float, optional):
+            The initial value for the scaling parameter.
+            Defaults to 1e-4.
+    """
+
+    def __init__(self,
+                 v_dim: int,
+                 l_dim: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 dropout: float = 0.1,
+                 drop_path: float = .0,
+                 init_values: float = 1e-4):
+        super().__init__()
+
+        # pre layer norm
+        self.layer_norm_v = nn.LayerNorm(v_dim)
+        self.layer_norm_l = nn.LayerNorm(l_dim)
+        self.attn = BiMultiHeadAttention(
+            v_dim=v_dim,
+            l_dim=l_dim,
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            dropout=dropout)
+
+        # add layer scale for training stability
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.gamma_v = nn.Parameter(
+            init_values * torch.ones(v_dim), requires_grad=True)
+        self.gamma_l = nn.Parameter(
+            init_values * torch.ones(l_dim), requires_grad=True)
+
+    def forward(self,
+                vf0: Tensor,
+                vf1: Tensor,
+                vf2: Tensor,
+                vf3: Tensor,
+                vf4: Tensor,
+                lang_feature: Tensor,
+                attention_mask_l=None):
+        visual_features = [vf0, vf1, vf2, vf3, vf4]
+        size_per_level, visual_features_flatten = [], []
+        for i, feat_per_level in enumerate(visual_features):
+            bs, c, h, w = feat_per_level.shape
+            size_per_level.append([h, w])
+            feat = permute_and_flatten(feat_per_level, bs, -1, c, h, w)
+            visual_features_flatten.append(feat)
+        visual_features_flatten = torch.cat(visual_features_flatten, dim=1)
+        new_v, new_lang_feature = self.single_attention_call(
+            visual_features_flatten,
+            lang_feature,
+            attention_mask_l=attention_mask_l)
+        # [bs, N, C] -> [bs, C, N]
+        new_v = new_v.transpose(1, 2).contiguous()
+
+        start = 0
+        # fvfs is mean fusion_visual_features
+        fvfs = []
+        for (h, w) in size_per_level:
+            new_v_per_level = new_v[:, :,
+                                    start:start + h * w].view(bs, -1, h,
+                                                              w).contiguous()
+            fvfs.append(new_v_per_level)
+            start += h * w
+
+        return fvfs[0], fvfs[1], fvfs[2], fvfs[3], fvfs[4], new_lang_feature
+
+    def single_attention_call(
+        self,
+        visual: Tensor,
+        lang: Tensor,
+        attention_mask_v: Optional[Tensor] = None,
+        attention_mask_l: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Tensor]:
+        """Perform a single attention call between the visual and language
+        inputs.
+
+        Args:
+        visual (Tensor): The visual input tensor.
+        lang (Tensor): The language input tensor.
+        attention_mask_v (Optional[Tensor]):
+            An optional attention mask tensor for the visual input.
+        attention_mask_l (Optional[Tensor]):
+            An optional attention mask tensor for the language input.
+
+        Returns:
+            Tuple[Tensor, Tensor]: A tuple containing the updated
+                visual and language tensors after the attention call.
+        """
+        visual = self.layer_norm_v(visual)
+        lang = self.layer_norm_l(lang)
+        delta_v, delta_l = self.attn(
+            visual,
+            lang,
+            attention_mask_v=attention_mask_v,
+            attention_mask_l=attention_mask_l)
+        # visual, lang = visual + delta_v, l + delta_l
+        visual = visual + self.drop_path(self.gamma_v * delta_v)
+        lang = lang + self.drop_path(self.gamma_l * delta_l)
+        return visual, lang
+
+
+class SingleScaleBiAttentionBlock(BiAttentionBlock):
+    """This is a single-scale implementation of `BiAttentionBlock`.
+
+    The only differenece between it and `BiAttentionBlock` is that the
+    `forward` function of `SingleScaleBiAttentionBlock` only accepts a single
+    flatten visual feature map, while the `forward` function in
+    `BiAttentionBlock` accepts multiple visual feature maps.
+    """
+
+    def forward(self,
+                visual_feature: Tensor,
+                lang_feature: Tensor,
+                attention_mask_v=None,
+                attention_mask_l=None):
+        """Single-scale forward pass.
+
+        Args:
+            visual_feature (Tensor): The visual input tensor. Tensor of
+                shape (bs, patch_len, ch).
+            lang_feature (Tensor): The language input tensor. Tensor of
+                shape (bs, text_len, ch).
+            attention_mask_v (_type_, optional): Visual feature attention
+                mask. Defaults to None.
+            attention_mask_l (_type_, optional): Language feature attention
+                mask.Defaults to None.
+        """
+        new_v, new_lang_feature = self.single_attention_call(
+            visual_feature,
+            lang_feature,
+            attention_mask_v=attention_mask_v,
+            attention_mask_l=attention_mask_l)
+        return new_v, new_lang_feature
+
+
+class VLFuse(nn.Module):
+    """Early Fusion Module.
+
+    Args:
+        v_dim (int): Dimension of visual features.
+        l_dim (int): Dimension of language features.
+        embed_dim (int): The embedding dimension for the attention operation.
+        num_heads (int): Number of attention heads.
+        dropout (float): Dropout probability.
+        drop_path (float): Drop path probability.
+        use_checkpoint (bool): Whether to use PyTorch's checkpoint function.
+    """
+
+    def __init__(self,
+                 v_dim: int = 256,
+                 l_dim: int = 768,
+                 embed_dim: int = 2048,
+                 num_heads: int = 8,
+                 dropout: float = 0.1,
+                 drop_path: float = 0.0,
+                 use_checkpoint: bool = False):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.b_attn = BiAttentionBlock(
+            v_dim=v_dim,
+            l_dim=l_dim,
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            dropout=dropout,
+            drop_path=drop_path,
+            init_values=1.0 / 6.0)
+
+    def forward(self, x: dict) -> dict:
+        """Forward pass of the VLFuse module."""
+        visual_features = x['visual']
+        language_dict_features = x['lang']
+
+        if self.use_checkpoint:
+            # vf is mean visual_features
+            # checkpoint does not allow complex data structures as input,
+            # such as list, so we must split them.
+            vf0, vf1, vf2, vf3, vf4, language_features = checkpoint.checkpoint(
+                self.b_attn, *visual_features,
+                language_dict_features['hidden'],
+                language_dict_features['masks'])
+        else:
+            vf0, vf1, vf2, vf3, vf4, language_features = self.b_attn(
+                *visual_features, language_dict_features['hidden'],
+                language_dict_features['masks'])
+
+        language_dict_features['hidden'] = language_features
+        fused_language_dict_features = language_dict_features
+
+        features_dict = {
+            'visual': [vf0, vf1, vf2, vf3, vf4],
+            'lang': fused_language_dict_features
+        }
+
+        return features_dict
+
+
+class BertEncoderLayer(BertPreTrainedModel):
+    """A modified version of the `BertLayer` class from the
+    `transformers.models.bert.modeling_bert` module.
+
+    Args:
+        config (:class:`~transformers.BertConfig`):
+            The configuration object that
+            contains various parameters for the model.
+        clamp_min_for_underflow (bool, optional):
+            Whether to clamp the minimum value of the hidden states
+             to prevent underflow. Defaults to `False`.
+        clamp_max_for_overflow (bool, optional):
+            Whether to clamp the maximum value of the hidden states
+            to prevent overflow. Defaults to `False`.
+    """
+
+    def __init__(self,
+                 config: BertConfig,
+                 clamp_min_for_underflow: bool = False,
+                 clamp_max_for_overflow: bool = False):
+        super().__init__(config)
+        self.config = config
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+
+        self.attention = BertAttention(config, clamp_min_for_underflow,
+                                       clamp_max_for_overflow)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+
+    def forward(
+        self, inputs: Dict[str, Dict[str, torch.Tensor]]
+    ) -> Dict[str, Dict[str, torch.Tensor]]:
+        """Applies the BertEncoderLayer to the input features."""
+        language_dict_features = inputs['lang']
+        hidden_states = language_dict_features['hidden']
+        attention_mask = language_dict_features['masks']
+
+        device = hidden_states.device
+        input_shape = hidden_states.size()[:-1]
+        extended_attention_mask = self.get_extended_attention_mask(
+            attention_mask, input_shape, device)
+
+        self_attention_outputs = self.attention(
+            hidden_states,
+            extended_attention_mask,
+            None,
+            output_attentions=False,
+            past_key_value=None)
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]
+        layer_output = apply_chunking_to_forward(self.feed_forward_chunk,
+                                                 self.chunk_size_feed_forward,
+                                                 self.seq_len_dim,
+                                                 attention_output)
+        outputs = (layer_output, ) + outputs
+        hidden_states = outputs[0]
+
+        language_dict_features['hidden'] = hidden_states
+
+        features_dict = {
+            'visual': inputs['visual'],
+            'lang': language_dict_features
+        }
+
+        return features_dict
+
+    def feed_forward_chunk(self, attention_output: Tensor) -> Tensor:
+        """Applies the intermediate and output layers of the BertEncoderLayer
+        to a chunk of the input sequence."""
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+# The following code is the same as the Huggingface code,
+# with the only difference being the additional clamp operation.
+class BertSelfAttention(nn.Module):
+    """BERT self-attention layer from Huggingface transformers.
+
+    Compared to the BertSelfAttention of Huggingface, only add the clamp.
+
+    Args:
+        config (:class:`~transformers.BertConfig`):
+            The configuration object that
+            contains various parameters for the model.
+        clamp_min_for_underflow (bool, optional):
+            Whether to clamp the minimum value of the hidden states
+             to prevent underflow. Defaults to `False`.
+        clamp_max_for_overflow (bool, optional):
+            Whether to clamp the maximum value of the hidden states
+            to prevent overflow. Defaults to `False`.
+    """
+
+    def __init__(self,
+                 config: BertConfig,
+                 clamp_min_for_underflow: bool = False,
+                 clamp_max_for_overflow: bool = False):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and \
+                not hasattr(config, 'embedding_size'):
+            raise ValueError(f'The hidden size ({config.hidden_size}) is '
+                             'not a multiple of the number of attention '
+                             f'heads ({config.num_attention_heads})')
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size /
+                                       config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * \
+            self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config,
+                                               'position_embedding_type',
+                                               'absolute')
+        if self.position_embedding_type == 'relative_key' or \
+                self.position_embedding_type == 'relative_key_query':
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(
+                2 * config.max_position_embeddings - 1,
+                self.attention_head_size)
+        self.clamp_min_for_underflow = clamp_min_for_underflow
+        self.clamp_max_for_overflow = clamp_max_for_overflow
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x: Tensor) -> Tensor:
+        """Transpose the dimensions of `x`."""
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
+                                       self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states: Tensor,
+        attention_mask: Optional[Tensor] = None,
+        head_mask: Optional[Tensor] = None,
+        encoder_hidden_states: Optional[Tensor] = None,
+        encoder_attention_mask: Optional[Tensor] = None,
+        past_key_value: Optional[Tuple[Tensor, Tensor]] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[Tensor, ...]:
+        """Perform a forward pass through the BERT self-attention layer."""
+
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(
+                self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(
+                self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        if self.is_decoder:
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key"
+        # to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer,
+                                        key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == 'relative_key' or \
+                self.position_embedding_type == 'relative_key_query':
+            seq_length = hidden_states.size()[1]
+            position_ids_l = torch.arange(
+                seq_length, dtype=torch.long,
+                device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(
+                seq_length, dtype=torch.long,
+                device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(
+                distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(
+                dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == 'relative_key':
+                relative_position_scores = torch.einsum(
+                    'bhld,lrd->bhlr', query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == 'relative_key_query':
+                relative_position_scores_query = torch.einsum(
+                    'bhld,lrd->bhlr', query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum(
+                    'bhrd,lrd->bhlr', key_layer, positional_embedding)
+                attention_scores = attention_scores + \
+                    relative_position_scores_query + \
+                    relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(
+            self.attention_head_size)
+
+        if self.clamp_min_for_underflow:
+            attention_scores = torch.clamp(
+                attention_scores, min=-MAX_CLAMP_VALUE
+            )  # Do not increase -50000, data type half has quite limited range
+        if self.clamp_max_for_overflow:
+            attention_scores = torch.clamp(
+                attention_scores, max=MAX_CLAMP_VALUE
+            )  # Do not increase 50000, data type half has quite limited range
+
+        if attention_mask is not None:
+            # Apply the attention mask is
+            # (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (
+            self.all_head_size, )
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer,
+                   attention_probs) if output_attentions else (context_layer, )
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value, )
+        return outputs
+
+
+class BertAttention(HFBertAttention):
+    """BertAttention is made up of self-attention and intermediate+output.
+
+    Compared to the BertAttention of Huggingface, only add the clamp.
+
+    Args:
+        config (:class:`~transformers.BertConfig`):
+            The configuration object that
+            contains various parameters for the model.
+        clamp_min_for_underflow (bool, optional):
+            Whether to clamp the minimum value of the hidden states
+             to prevent underflow. Defaults to `False`.
+        clamp_max_for_overflow (bool, optional):
+            Whether to clamp the maximum value of the hidden states
+            to prevent overflow. Defaults to `False`.
+    """
+
+    def __init__(self,
+                 config: BertConfig,
+                 clamp_min_for_underflow: bool = False,
+                 clamp_max_for_overflow: bool = False):
+        super().__init__(config)
+        self.self = BertSelfAttention(config, clamp_min_for_underflow,
+                                      clamp_max_for_overflow)
+
+
+class BertIntermediate(HFBertIntermediate):
+    """Modified from transformers.models.bert.modeling_bert.BertIntermediate.
+
+    Compared to the BertIntermediate of Huggingface, only add the clamp.
+    """
+
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = clamp_values(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = clamp_values(hidden_states)
+        return hidden_states
+
+
+class BertOutput(HFBertOutput):
+    """Modified from transformers.models.bert.modeling_bert.BertOutput.
+
+    Compared to the BertOutput of Huggingface, only add the clamp.
+    """
+
+    def forward(self, hidden_states: Tensor, input_tensor: Tensor) -> Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = clamp_values(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = clamp_values(hidden_states)
+        return hidden_states
diff --git a/mmde/mmdet/models/utils/wbf.py b/mmde/mmdet/models/utils/wbf.py
new file mode 100644
index 0000000000000000000000000000000000000000..b26a2c669a520467c6fcf52d0eec53a69834a16a
--- /dev/null
+++ b/mmde/mmdet/models/utils/wbf.py
@@ -0,0 +1,250 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import warnings
+from typing import Tuple
+
+import numpy as np
+import torch
+from torch import Tensor
+
+
+# References: https://github.com/ZFTurbo/Weighted-Boxes-Fusion
+def weighted_boxes_fusion(
+        bboxes_list: list,
+        scores_list: list,
+        labels_list: list,
+        weights: list = None,
+        iou_thr: float = 0.55,
+        skip_box_thr: float = 0.0,
+        conf_type: str = 'avg',
+        allows_overflow: bool = False) -> Tuple[Tensor, Tensor, Tensor]:
+    """weighted boxes fusion <https://arxiv.org/abs/1910.13302> is a method for
+    fusing predictions from different object detection models, which utilizes
+    confidence scores of all proposed bounding boxes to construct averaged
+    boxes.
+
+    Args:
+        bboxes_list(list): list of boxes predictions from each model,
+                                    each box is 4 numbers.
+        scores_list(list): list of scores for each model
+        labels_list(list): list of labels for each model
+        weights: list of weights for each model.
+                Default: None, which means weight == 1 for each model
+        iou_thr: IoU value for boxes to be a match
+        skip_box_thr: exclude boxes with score lower than this variable.
+        conf_type: how to calculate confidence in weighted boxes.
+            'avg': average value,
+            'max': maximum value,
+            'box_and_model_avg': box and model wise hybrid weighted average,
+            'absent_model_aware_avg': weighted average that takes into
+                            account the absent model.
+        allows_overflow: false if we want confidence score not exceed 1.0.
+
+    Returns:
+        bboxes(Tensor): boxes coordinates (Order of boxes: x1, y1, x2, y2).
+        scores(Tensor): confidence scores
+        labels(Tensor): boxes labels
+    """
+
+    if weights is None:
+        weights = np.ones(len(bboxes_list))
+    if len(weights) != len(bboxes_list):
+        print('Warning: incorrect number of weights {}. Must be: '
+              '{}. Set weights equal to 1.'.format(
+                  len(weights), len(bboxes_list)))
+        weights = np.ones(len(bboxes_list))
+    weights = np.array(weights)
+
+    if conf_type not in [
+            'avg', 'max', 'box_and_model_avg', 'absent_model_aware_avg'
+    ]:
+        print('Unknown conf_type: {}. Must be "avg", '
+              '"max" or "box_and_model_avg", '
+              'or "absent_model_aware_avg"'.format(conf_type))
+        exit()
+
+    filtered_boxes = prefilter_boxes(bboxes_list, scores_list, labels_list,
+                                     weights, skip_box_thr)
+    if len(filtered_boxes) == 0:
+        return torch.Tensor(), torch.Tensor(), torch.Tensor()
+
+    overall_boxes = []
+
+    for label in filtered_boxes:
+        boxes = filtered_boxes[label]
+        new_boxes = []
+        weighted_boxes = np.empty((0, 8))
+
+        # Clusterize boxes
+        for j in range(0, len(boxes)):
+            index, best_iou = find_matching_box_fast(weighted_boxes, boxes[j],
+                                                     iou_thr)
+
+            if index != -1:
+                new_boxes[index].append(boxes[j])
+                weighted_boxes[index] = get_weighted_box(
+                    new_boxes[index], conf_type)
+            else:
+                new_boxes.append([boxes[j].copy()])
+                weighted_boxes = np.vstack((weighted_boxes, boxes[j].copy()))
+
+        # Rescale confidence based on number of models and boxes
+        for i in range(len(new_boxes)):
+            clustered_boxes = new_boxes[i]
+            if conf_type == 'box_and_model_avg':
+                clustered_boxes = np.array(clustered_boxes)
+                # weighted average for boxes
+                weighted_boxes[i, 1] = weighted_boxes[i, 1] * len(
+                    clustered_boxes) / weighted_boxes[i, 2]
+                # identify unique model index by model index column
+                _, idx = np.unique(clustered_boxes[:, 3], return_index=True)
+                # rescale by unique model weights
+                weighted_boxes[i, 1] = weighted_boxes[i, 1] * clustered_boxes[
+                    idx, 2].sum() / weights.sum()
+            elif conf_type == 'absent_model_aware_avg':
+                clustered_boxes = np.array(clustered_boxes)
+                # get unique model index in the cluster
+                models = np.unique(clustered_boxes[:, 3]).astype(int)
+                # create a mask to get unused model weights
+                mask = np.ones(len(weights), dtype=bool)
+                mask[models] = False
+                # absent model aware weighted average
+                weighted_boxes[
+                    i, 1] = weighted_boxes[i, 1] * len(clustered_boxes) / (
+                        weighted_boxes[i, 2] + weights[mask].sum())
+            elif conf_type == 'max':
+                weighted_boxes[i, 1] = weighted_boxes[i, 1] / weights.max()
+            elif not allows_overflow:
+                weighted_boxes[i, 1] = weighted_boxes[i, 1] * min(
+                    len(weights), len(clustered_boxes)) / weights.sum()
+            else:
+                weighted_boxes[i, 1] = weighted_boxes[i, 1] * len(
+                    clustered_boxes) / weights.sum()
+        overall_boxes.append(weighted_boxes)
+    overall_boxes = np.concatenate(overall_boxes, axis=0)
+    overall_boxes = overall_boxes[overall_boxes[:, 1].argsort()[::-1]]
+
+    bboxes = torch.Tensor(overall_boxes[:, 4:])
+    scores = torch.Tensor(overall_boxes[:, 1])
+    labels = torch.Tensor(overall_boxes[:, 0]).int()
+
+    return bboxes, scores, labels
+
+
+def prefilter_boxes(boxes, scores, labels, weights, thr):
+
+    new_boxes = dict()
+
+    for t in range(len(boxes)):
+
+        if len(boxes[t]) != len(scores[t]):
+            print('Error. Length of boxes arrays not equal to '
+                  'length of scores array: {} != {}'.format(
+                      len(boxes[t]), len(scores[t])))
+            exit()
+
+        if len(boxes[t]) != len(labels[t]):
+            print('Error. Length of boxes arrays not equal to '
+                  'length of labels array: {} != {}'.format(
+                      len(boxes[t]), len(labels[t])))
+            exit()
+
+        for j in range(len(boxes[t])):
+            score = scores[t][j]
+            if score < thr:
+                continue
+            label = int(labels[t][j])
+            box_part = boxes[t][j]
+            x1 = float(box_part[0])
+            y1 = float(box_part[1])
+            x2 = float(box_part[2])
+            y2 = float(box_part[3])
+
+            # Box data checks
+            if x2 < x1:
+                warnings.warn('X2 < X1 value in box. Swap them.')
+                x1, x2 = x2, x1
+            if y2 < y1:
+                warnings.warn('Y2 < Y1 value in box. Swap them.')
+                y1, y2 = y2, y1
+            if (x2 - x1) * (y2 - y1) == 0.0:
+                warnings.warn('Zero area box skipped: {}.'.format(box_part))
+                continue
+
+            # [label, score, weight, model index, x1, y1, x2, y2]
+            b = [
+                int(label),
+                float(score) * weights[t], weights[t], t, x1, y1, x2, y2
+            ]
+
+            if label not in new_boxes:
+                new_boxes[label] = []
+            new_boxes[label].append(b)
+
+    # Sort each list in dict by score and transform it to numpy array
+    for k in new_boxes:
+        current_boxes = np.array(new_boxes[k])
+        new_boxes[k] = current_boxes[current_boxes[:, 1].argsort()[::-1]]
+
+    return new_boxes
+
+
+def get_weighted_box(boxes, conf_type='avg'):
+
+    box = np.zeros(8, dtype=np.float32)
+    conf = 0
+    conf_list = []
+    w = 0
+    for b in boxes:
+        box[4:] += (b[1] * b[4:])
+        conf += b[1]
+        conf_list.append(b[1])
+        w += b[2]
+    box[0] = boxes[0][0]
+    if conf_type in ('avg', 'box_and_model_avg', 'absent_model_aware_avg'):
+        box[1] = conf / len(boxes)
+    elif conf_type == 'max':
+        box[1] = np.array(conf_list).max()
+    box[2] = w
+    box[3] = -1
+    box[4:] /= conf
+
+    return box
+
+
+def find_matching_box_fast(boxes_list, new_box, match_iou):
+
+    def bb_iou_array(boxes, new_box):
+        # bb intersection over union
+        xA = np.maximum(boxes[:, 0], new_box[0])
+        yA = np.maximum(boxes[:, 1], new_box[1])
+        xB = np.minimum(boxes[:, 2], new_box[2])
+        yB = np.minimum(boxes[:, 3], new_box[3])
+
+        interArea = np.maximum(xB - xA, 0) * np.maximum(yB - yA, 0)
+
+        # compute the area of both the prediction and ground-truth rectangles
+        boxAArea = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+        boxBArea = (new_box[2] - new_box[0]) * (new_box[3] - new_box[1])
+
+        iou = interArea / (boxAArea + boxBArea - interArea)
+
+        return iou
+
+    if boxes_list.shape[0] == 0:
+        return -1, match_iou
+
+    boxes = boxes_list
+
+    ious = bb_iou_array(boxes[:, 4:], new_box[4:])
+
+    ious[boxes[:, 0] != new_box[0]] = -1
+
+    best_idx = np.argmax(ious)
+    best_iou = ious[best_idx]
+
+    if best_iou <= match_iou:
+        best_iou = match_iou
+        best_idx = -1
+
+    return best_idx, best_iou
diff --git a/mmde/mmdet/models/vis/__init__.py b/mmde/mmdet/models/vis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab63a9066bcf6cd25d7c9063cc66d9b0390b3d42
--- /dev/null
+++ b/mmde/mmdet/models/vis/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .mask2former_vis import Mask2FormerVideo
+from .masktrack_rcnn import MaskTrackRCNN
+
+__all__ = ['Mask2FormerVideo', 'MaskTrackRCNN']
diff --git a/mmde/mmdet/models/vis/mask2former_vis.py b/mmde/mmdet/models/vis/mask2former_vis.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ab04296e120622f4b5e28739f4c3323d253f7d5
--- /dev/null
+++ b/mmde/mmdet/models/vis/mask2former_vis.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Union
+
+from torch import Tensor
+
+from mmdet.models.mot import BaseMOTModel
+from mmdet.registry import MODELS
+from mmdet.structures import TrackDataSample, TrackSampleList
+from mmdet.utils import OptConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class Mask2FormerVideo(BaseMOTModel):
+    r"""Implementation of `Masked-attention Mask
+    Transformer for Universal Image Segmentation
+    <https://arxiv.org/pdf/2112.01527>`_.
+
+    Args:
+        backbone (dict): Configuration of backbone. Defaults to None.
+        track_head (dict): Configuration of track head. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+           config of :class:`TrackDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+            Defaults to None.
+        init_cfg (dict or list[dict]): Configuration of initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: Optional[dict] = None,
+                 track_head: Optional[dict] = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super(BaseMOTModel, self).__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+
+        if backbone is not None:
+            self.backbone = MODELS.build(backbone)
+
+        if track_head is not None:
+            self.track_head = MODELS.build(track_head)
+
+        self.num_classes = self.track_head.num_classes
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        """Overload in order to load mmdet pretrained ckpt."""
+        for key in list(state_dict):
+            if key.startswith('panoptic_head'):
+                state_dict[key.replace('panoptic',
+                                       'track')] = state_dict.pop(key)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata,
+                                      strict, missing_keys, unexpected_keys,
+                                      error_msgs)
+
+    def loss(self, inputs: Tensor, data_samples: TrackSampleList,
+             **kwargs) -> Union[dict, tuple]:
+        """
+        Args:
+            inputs (Tensor): Input images of shape (N, T, C, H, W).
+                These should usually be mean centered and std scaled.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance`.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+        # shape (N * T, C, H, W)
+        img = inputs.flatten(0, 1)
+
+        x = self.backbone(img)
+        losses = self.track_head.loss(x, data_samples)
+
+        return losses
+
+    def predict(self,
+                inputs: Tensor,
+                data_samples: TrackSampleList,
+                rescale: bool = True) -> TrackSampleList:
+        """Predict results from a batch of inputs and data samples with
+        postprocessing.
+
+        Args:
+            inputs (Tensor): of shape (N, T, C, H, W) encoding
+                input images. The N denotes batch size.
+                The T denotes the number of frames in a video.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `video_data_samples`.
+            rescale (bool, Optional): If False, then returned bboxes and masks
+                will fit the scale of img, otherwise, returned bboxes and masks
+                will fit the scale of original image shape. Defaults to True.
+
+        Returns:
+            TrackSampleList: Tracking results of the inputs.
+        """
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+
+        assert len(data_samples) == 1, \
+            'Mask2former only support 1 batch size per gpu for now.'
+
+        # [T, C, H, W]
+        img = inputs[0]
+        track_data_sample = data_samples[0]
+        feats = self.backbone(img)
+        pred_track_ins_list = self.track_head.predict(feats, track_data_sample,
+                                                      rescale)
+
+        det_data_samples_list = []
+        for idx, pred_track_ins in enumerate(pred_track_ins_list):
+            img_data_sample = track_data_sample[idx]
+            img_data_sample.pred_track_instances = pred_track_ins
+            det_data_samples_list.append(img_data_sample)
+
+        results = TrackDataSample()
+        results.video_data_samples = det_data_samples_list
+        return [results]
diff --git a/mmde/mmdet/models/vis/masktrack_rcnn.py b/mmde/mmdet/models/vis/masktrack_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c28e7b8529d3d53d5a59ecff0ea46662d035f23
--- /dev/null
+++ b/mmde/mmdet/models/vis/masktrack_rcnn.py
@@ -0,0 +1,181 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from torch import Tensor
+
+from mmdet.models.mot import BaseMOTModel
+from mmdet.registry import MODELS
+from mmdet.structures import TrackSampleList
+from mmdet.utils import OptConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class MaskTrackRCNN(BaseMOTModel):
+    """Video Instance Segmentation.
+
+    This video instance segmentor is the implementation of`MaskTrack R-CNN
+    <https://arxiv.org/abs/1905.04804>`_.
+
+    Args:
+        detector (dict): Configuration of detector. Defaults to None.
+        track_head (dict): Configuration of track head. Defaults to None.
+        tracker (dict): Configuration of tracker. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+           config of :class:`TrackDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        init_cfg (dict or list[dict]): Configuration of initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 detector: Optional[dict] = None,
+                 track_head: Optional[dict] = None,
+                 tracker: Optional[dict] = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(data_preprocessor, init_cfg)
+
+        if detector is not None:
+            self.detector = MODELS.build(detector)
+        assert hasattr(self.detector, 'roi_head'), \
+            'MaskTrack R-CNN only supports two stage detectors.'
+
+        if track_head is not None:
+            self.track_head = MODELS.build(track_head)
+        if tracker is not None:
+            self.tracker = MODELS.build(tracker)
+
+    def loss(self, inputs: Tensor, data_samples: TrackSampleList,
+             **kwargs) -> dict:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding
+                input images. Typically these should be mean centered and std
+                scaled. The N denotes batch size. The T denotes the number of
+                frames.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+        assert inputs.size(1) == 2, \
+            'MaskTrackRCNN can only have 1 key frame and 1 reference frame.'
+
+        # split the data_samples into two aspects: key frames and reference
+        # frames
+        ref_data_samples, key_data_samples = [], []
+        key_frame_inds, ref_frame_inds = [], []
+
+        # set cat_id of gt_labels to 0 in RPN
+        for track_data_sample in data_samples:
+            key_data_sample = track_data_sample.get_key_frames()[0]
+            key_data_samples.append(key_data_sample)
+            ref_data_sample = track_data_sample.get_ref_frames()[0]
+            ref_data_samples.append(ref_data_sample)
+            key_frame_inds.append(track_data_sample.key_frames_inds[0])
+            ref_frame_inds.append(track_data_sample.ref_frames_inds[0])
+
+        key_frame_inds = torch.tensor(key_frame_inds, dtype=torch.int64)
+        ref_frame_inds = torch.tensor(ref_frame_inds, dtype=torch.int64)
+        batch_inds = torch.arange(len(inputs))
+        key_imgs = inputs[batch_inds, key_frame_inds].contiguous()
+        ref_imgs = inputs[batch_inds, ref_frame_inds].contiguous()
+
+        x = self.detector.extract_feat(key_imgs)
+        ref_x = self.detector.extract_feat(ref_imgs)
+
+        losses = dict()
+
+        # RPN forward and loss
+        if self.detector.with_rpn:
+            proposal_cfg = self.detector.train_cfg.get(
+                'rpn_proposal', self.detector.test_cfg.rpn)
+
+            rpn_losses, rpn_results_list = self.detector.rpn_head. \
+                loss_and_predict(x,
+                                 key_data_samples,
+                                 proposal_cfg=proposal_cfg,
+                                 **kwargs)
+
+            # avoid get same name with roi_head loss
+            keys = rpn_losses.keys()
+            for key in keys:
+                if 'loss' in key and 'rpn' not in key:
+                    rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
+            losses.update(rpn_losses)
+        else:
+            # TODO: Not support currently, should have a check at Fast R-CNN
+            assert key_data_samples[0].get('proposals', None) is not None
+            # use pre-defined proposals in InstanceData for the second stage
+            # to extract ROI features.
+            rpn_results_list = [
+                key_data_sample.proposals
+                for key_data_sample in key_data_samples
+            ]
+
+        losses_detect = self.detector.roi_head.loss(x, rpn_results_list,
+                                                    key_data_samples, **kwargs)
+        losses.update(losses_detect)
+
+        losses_track = self.track_head.loss(x, ref_x, rpn_results_list,
+                                            data_samples, **kwargs)
+        losses.update(losses_track)
+
+        return losses
+
+    def predict(self,
+                inputs: Tensor,
+                data_samples: TrackSampleList,
+                rescale: bool = True,
+                **kwargs) -> TrackSampleList:
+        """Test without augmentation.
+
+        Args:
+            inputs (Tensor): of shape (N, T, C, H, W) encoding
+                input images. The N denotes batch size.
+                The T denotes the number of frames in a video.
+            data_samples (list[:obj:`TrackDataSample`]): The batch
+                data samples. It usually includes information such
+                as `video_data_samples`.
+            rescale (bool, Optional): If False, then returned bboxes and masks
+                will fit the scale of img, otherwise, returned bboxes and masks
+                will fit the scale of original image shape. Defaults to True.
+
+        Returns:
+            TrackSampleList: Tracking results of the inputs.
+        """
+        assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).'
+
+        assert len(data_samples) == 1, \
+            'MaskTrackRCNN only support 1 batch size per gpu for now.'
+
+        track_data_sample = data_samples[0]
+        video_len = len(track_data_sample)
+        if track_data_sample[0].frame_id == 0:
+            self.tracker.reset()
+
+        for frame_id in range(video_len):
+            img_data_sample = track_data_sample[frame_id]
+            single_img = inputs[:, frame_id].contiguous()
+            x = self.detector.extract_feat(single_img)
+
+            rpn_results_list = self.detector.rpn_head.predict(
+                x, [img_data_sample])
+            # det_results List[InstanceData]
+            det_results = self.detector.roi_head.predict(
+                x, rpn_results_list, [img_data_sample], rescale=rescale)
+            assert len(det_results) == 1, 'Batch inference is not supported.'
+            assert 'masks' in det_results[0], 'There are no mask results.'
+
+            img_data_sample.pred_instances = det_results[0]
+            frame_pred_track_instances = self.tracker.track(
+                model=self, feats=x, data_sample=img_data_sample, **kwargs)
+            img_data_sample.pred_track_instances = frame_pred_track_instances
+
+        return [track_data_sample]
diff --git a/mmde/mmdet/registry.py b/mmde/mmdet/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a5b2b28a4f80a488994b48a99043a20c604e55e
--- /dev/null
+++ b/mmde/mmdet/registry.py
@@ -0,0 +1,121 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""MMDetection provides 17 registry nodes to support using modules across
+projects. Each node is a child of the root registry in MMEngine.
+
+More details can be found at
+https://mmengine.readthedocs.io/en/latest/tutorials/registry.html.
+"""
+
+from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS
+from mmengine.registry import DATASETS as MMENGINE_DATASETS
+from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR
+from mmengine.registry import HOOKS as MMENGINE_HOOKS
+from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS
+from mmengine.registry import LOOPS as MMENGINE_LOOPS
+from mmengine.registry import METRICS as MMENGINE_METRICS
+from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS
+from mmengine.registry import MODELS as MMENGINE_MODELS
+from mmengine.registry import \
+    OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS
+from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS
+from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS
+from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS
+from mmengine.registry import \
+    RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS
+from mmengine.registry import RUNNERS as MMENGINE_RUNNERS
+from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS
+from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS
+from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS
+from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS
+from mmengine.registry import \
+    WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS
+from mmengine.registry import Registry
+
+# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner`
+RUNNERS = Registry(
+    'runner', parent=MMENGINE_RUNNERS, locations=['mmdet.engine.runner'])
+# manage runner constructors that define how to initialize runners
+RUNNER_CONSTRUCTORS = Registry(
+    'runner constructor',
+    parent=MMENGINE_RUNNER_CONSTRUCTORS,
+    locations=['mmdet.engine.runner'])
+# manage all kinds of loops like `EpochBasedTrainLoop`
+LOOPS = Registry(
+    'loop', parent=MMENGINE_LOOPS, locations=['mmdet.engine.runner'])
+# manage all kinds of hooks like `CheckpointHook`
+HOOKS = Registry(
+    'hook', parent=MMENGINE_HOOKS, locations=['mmdet.engine.hooks'])
+
+# manage data-related modules
+DATASETS = Registry(
+    'dataset', parent=MMENGINE_DATASETS, locations=['mmdet.datasets'])
+DATA_SAMPLERS = Registry(
+    'data sampler',
+    parent=MMENGINE_DATA_SAMPLERS,
+    locations=['mmdet.datasets.samplers'])
+TRANSFORMS = Registry(
+    'transform',
+    parent=MMENGINE_TRANSFORMS,
+    locations=['mmdet.datasets.transforms'])
+
+# manage all kinds of modules inheriting `nn.Module`
+MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmdet.models'])
+# manage all kinds of model wrappers like 'MMDistributedDataParallel'
+MODEL_WRAPPERS = Registry(
+    'model_wrapper',
+    parent=MMENGINE_MODEL_WRAPPERS,
+    locations=['mmdet.models'])
+# manage all kinds of weight initialization modules like `Uniform`
+WEIGHT_INITIALIZERS = Registry(
+    'weight initializer',
+    parent=MMENGINE_WEIGHT_INITIALIZERS,
+    locations=['mmdet.models'])
+
+# manage all kinds of optimizers like `SGD` and `Adam`
+OPTIMIZERS = Registry(
+    'optimizer',
+    parent=MMENGINE_OPTIMIZERS,
+    locations=['mmdet.engine.optimizers'])
+# manage optimizer wrapper
+OPTIM_WRAPPERS = Registry(
+    'optim_wrapper',
+    parent=MMENGINE_OPTIM_WRAPPERS,
+    locations=['mmdet.engine.optimizers'])
+# manage constructors that customize the optimization hyperparameters.
+OPTIM_WRAPPER_CONSTRUCTORS = Registry(
+    'optimizer constructor',
+    parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS,
+    locations=['mmdet.engine.optimizers'])
+# manage all kinds of parameter schedulers like `MultiStepLR`
+PARAM_SCHEDULERS = Registry(
+    'parameter scheduler',
+    parent=MMENGINE_PARAM_SCHEDULERS,
+    locations=['mmdet.engine.schedulers'])
+# manage all kinds of metrics
+METRICS = Registry(
+    'metric', parent=MMENGINE_METRICS, locations=['mmdet.evaluation'])
+# manage evaluator
+EVALUATOR = Registry(
+    'evaluator', parent=MMENGINE_EVALUATOR, locations=['mmdet.evaluation'])
+
+# manage task-specific modules like anchor generators and box coders
+TASK_UTILS = Registry(
+    'task util', parent=MMENGINE_TASK_UTILS, locations=['mmdet.models'])
+
+# manage visualizer
+VISUALIZERS = Registry(
+    'visualizer',
+    parent=MMENGINE_VISUALIZERS,
+    locations=['mmdet.visualization'])
+# manage visualizer backend
+VISBACKENDS = Registry(
+    'vis_backend',
+    parent=MMENGINE_VISBACKENDS,
+    locations=['mmdet.visualization'])
+
+# manage logprocessor
+LOG_PROCESSORS = Registry(
+    'log_processor',
+    parent=MMENGINE_LOG_PROCESSORS,
+    # TODO: update the location when mmdet has its own log processor
+    locations=['mmdet.engine'])
diff --git a/mmde/mmdet/structures/__init__.py b/mmde/mmdet/structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..381c6a4f4549c2c4395d994cbd860a3e52eb9994
--- /dev/null
+++ b/mmde/mmdet/structures/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .det_data_sample import DetDataSample, OptSampleList, SampleList
+from .reid_data_sample import ReIDDataSample
+from .track_data_sample import (OptTrackSampleList, TrackDataSample,
+                                TrackSampleList)
+
+__all__ = [
+    'DetDataSample', 'SampleList', 'OptSampleList', 'TrackDataSample',
+    'TrackSampleList', 'OptTrackSampleList', 'ReIDDataSample'
+]
diff --git a/mmde/mmdet/structures/bbox/__init__.py b/mmde/mmdet/structures/bbox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d531986509ad1b2141118449aab39343bbde82c
--- /dev/null
+++ b/mmde/mmdet/structures/bbox/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_boxes import BaseBoxes
+from .bbox_overlaps import bbox_overlaps
+from .box_type import (autocast_box_type, convert_box_type, get_box_type,
+                       register_box, register_box_converter)
+from .horizontal_boxes import HorizontalBoxes
+from .transforms import bbox_cxcyah_to_xyxy  # noqa: E501
+from .transforms import (bbox2corner, bbox2distance, bbox2result, bbox2roi,
+                         bbox_cxcywh_to_xyxy, bbox_flip, bbox_mapping,
+                         bbox_mapping_back, bbox_project, bbox_rescale,
+                         bbox_xyxy_to_cxcyah, bbox_xyxy_to_cxcywh, cat_boxes,
+                         corner2bbox, distance2bbox, empty_box_as,
+                         find_inside_bboxes, get_box_tensor, get_box_wh,
+                         roi2bbox, scale_boxes, stack_boxes)
+
+__all__ = [
+    'bbox_overlaps', 'bbox_flip', 'bbox_mapping', 'bbox_mapping_back',
+    'bbox2roi', 'roi2bbox', 'bbox2result', 'distance2bbox', 'bbox2distance',
+    'bbox_rescale', 'bbox_cxcywh_to_xyxy', 'bbox_xyxy_to_cxcywh',
+    'find_inside_bboxes', 'bbox2corner', 'corner2bbox', 'bbox_project',
+    'BaseBoxes', 'convert_box_type', 'get_box_type', 'register_box',
+    'register_box_converter', 'HorizontalBoxes', 'autocast_box_type',
+    'cat_boxes', 'stack_boxes', 'scale_boxes', 'get_box_wh', 'get_box_tensor',
+    'empty_box_as', 'bbox_xyxy_to_cxcyah', 'bbox_cxcyah_to_xyxy'
+]
diff --git a/mmde/mmdet/structures/bbox/base_boxes.py b/mmde/mmdet/structures/bbox/base_boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ed667664a8a57a1b9b7e422af03d41274882747
--- /dev/null
+++ b/mmde/mmdet/structures/bbox/base_boxes.py
@@ -0,0 +1,549 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod, abstractproperty, abstractstaticmethod
+from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Union
+
+import numpy as np
+import torch
+from torch import BoolTensor, Tensor
+
+from mmdet.structures.mask.structures import BitmapMasks, PolygonMasks
+
+T = TypeVar('T')
+DeviceType = Union[str, torch.device]
+IndexType = Union[slice, int, list, torch.LongTensor, torch.cuda.LongTensor,
+                  torch.BoolTensor, torch.cuda.BoolTensor, np.ndarray]
+MaskType = Union[BitmapMasks, PolygonMasks]
+
+
+class BaseBoxes(metaclass=ABCMeta):
+    """The base class for 2D box types.
+
+    The functions of ``BaseBoxes`` lie in three fields:
+
+    - Verify the boxes shape.
+    - Support tensor-like operations.
+    - Define abstract functions for 2D boxes.
+
+    In ``__init__`` , ``BaseBoxes`` verifies the validity of the data shape
+    w.r.t ``box_dim``. The tensor with the dimension >= 2 and the length
+    of the last dimension being ``box_dim`` will be regarded as valid.
+    ``BaseBoxes`` will restore them at the field ``tensor``. It's necessary
+    to override ``box_dim`` in subclass to guarantee the data shape is
+    correct.
+
+    There are many basic tensor-like functions implemented in ``BaseBoxes``.
+    In most cases, users can operate ``BaseBoxes`` instance like a normal
+    tensor. To protect the validity of data shape, All tensor-like functions
+    cannot modify the last dimension of ``self.tensor``.
+
+    When creating a new box type, users need to inherit from ``BaseBoxes``
+    and override abstract methods and specify the ``box_dim``. Then, register
+    the new box type by using the decorator ``register_box_type``.
+
+    Args:
+        data (Tensor or np.ndarray or Sequence): The box data with shape
+            (..., box_dim).
+        dtype (torch.dtype, Optional): data type of boxes. Defaults to None.
+        device (str or torch.device, Optional): device of boxes.
+            Default to None.
+        clone (bool): Whether clone ``boxes`` or not. Defaults to True.
+    """
+
+    # Used to verify the last dimension length
+    # Should override it in subclass.
+    box_dim: int = 0
+
+    def __init__(self,
+                 data: Union[Tensor, np.ndarray, Sequence],
+                 dtype: Optional[torch.dtype] = None,
+                 device: Optional[DeviceType] = None,
+                 clone: bool = True) -> None:
+        if isinstance(data, (np.ndarray, Tensor, Sequence)):
+            data = torch.as_tensor(data)
+        else:
+            raise TypeError('boxes should be Tensor, ndarray, or Sequence, ',
+                            f'but got {type(data)}')
+
+        if device is not None or dtype is not None:
+            data = data.to(dtype=dtype, device=device)
+        # Clone the data to avoid potential bugs
+        if clone:
+            data = data.clone()
+        # handle the empty input like []
+        if data.numel() == 0:
+            data = data.reshape((-1, self.box_dim))
+
+        assert data.dim() >= 2 and data.size(-1) == self.box_dim, \
+            ('The boxes dimension must >= 2 and the length of the last '
+             f'dimension must be {self.box_dim}, but got boxes with '
+             f'shape {data.shape}.')
+        self.tensor = data
+
+    def convert_to(self, dst_type: Union[str, type]) -> 'BaseBoxes':
+        """Convert self to another box type.
+
+        Args:
+            dst_type (str or type): destination box type.
+
+        Returns:
+            :obj:`BaseBoxes`: destination box type object .
+        """
+        from .box_type import convert_box_type
+        return convert_box_type(self, dst_type=dst_type)
+
+    def empty_boxes(self: T,
+                    dtype: Optional[torch.dtype] = None,
+                    device: Optional[DeviceType] = None) -> T:
+        """Create empty box.
+
+        Args:
+            dtype (torch.dtype, Optional): data type of boxes.
+            device (str or torch.device, Optional): device of boxes.
+
+        Returns:
+            T: empty boxes with shape of (0, box_dim).
+        """
+        empty_box = self.tensor.new_zeros(
+            0, self.box_dim, dtype=dtype, device=device)
+        return type(self)(empty_box, clone=False)
+
+    def fake_boxes(self: T,
+                   sizes: Tuple[int],
+                   fill: float = 0,
+                   dtype: Optional[torch.dtype] = None,
+                   device: Optional[DeviceType] = None) -> T:
+        """Create fake boxes with specific sizes and fill values.
+
+        Args:
+            sizes (Tuple[int]): The size of fake boxes. The last value must
+                be equal with ``self.box_dim``.
+            fill (float): filling value. Defaults to 0.
+            dtype (torch.dtype, Optional): data type of boxes.
+            device (str or torch.device, Optional): device of boxes.
+
+        Returns:
+            T: Fake boxes with shape of ``sizes``.
+        """
+        fake_boxes = self.tensor.new_full(
+            sizes, fill, dtype=dtype, device=device)
+        return type(self)(fake_boxes, clone=False)
+
+    def __getitem__(self: T, index: IndexType) -> T:
+        """Rewrite getitem to protect the last dimension shape."""
+        boxes = self.tensor
+        if isinstance(index, np.ndarray):
+            index = torch.as_tensor(index, device=self.device)
+        if isinstance(index, Tensor) and index.dtype == torch.bool:
+            assert index.dim() < boxes.dim()
+        elif isinstance(index, tuple):
+            assert len(index) < boxes.dim()
+            # `Ellipsis`(...) is commonly used in index like [None, ...].
+            # When `Ellipsis` is in index, it must be the last item.
+            if Ellipsis in index:
+                assert index[-1] is Ellipsis
+
+        boxes = boxes[index]
+        if boxes.dim() == 1:
+            boxes = boxes.reshape(1, -1)
+        return type(self)(boxes, clone=False)
+
+    def __setitem__(self: T, index: IndexType, values: Union[Tensor, T]) -> T:
+        """Rewrite setitem to protect the last dimension shape."""
+        assert type(values) is type(self), \
+            'The value to be set must be the same box type as self'
+        values = values.tensor
+
+        if isinstance(index, np.ndarray):
+            index = torch.as_tensor(index, device=self.device)
+        if isinstance(index, Tensor) and index.dtype == torch.bool:
+            assert index.dim() < self.tensor.dim()
+        elif isinstance(index, tuple):
+            assert len(index) < self.tensor.dim()
+            # `Ellipsis`(...) is commonly used in index like [None, ...].
+            # When `Ellipsis` is in index, it must be the last item.
+            if Ellipsis in index:
+                assert index[-1] is Ellipsis
+
+        self.tensor[index] = values
+
+    def __len__(self) -> int:
+        """Return the length of self.tensor first dimension."""
+        return self.tensor.size(0)
+
+    def __deepcopy__(self, memo):
+        """Only clone the ``self.tensor`` when applying deepcopy."""
+        cls = self.__class__
+        other = cls.__new__(cls)
+        memo[id(self)] = other
+        other.tensor = self.tensor.clone()
+        return other
+
+    def __repr__(self) -> str:
+        """Return a strings that describes the object."""
+        return self.__class__.__name__ + '(\n' + str(self.tensor) + ')'
+
+    def new_tensor(self, *args, **kwargs) -> Tensor:
+        """Reload ``new_tensor`` from self.tensor."""
+        return self.tensor.new_tensor(*args, **kwargs)
+
+    def new_full(self, *args, **kwargs) -> Tensor:
+        """Reload ``new_full`` from self.tensor."""
+        return self.tensor.new_full(*args, **kwargs)
+
+    def new_empty(self, *args, **kwargs) -> Tensor:
+        """Reload ``new_empty`` from self.tensor."""
+        return self.tensor.new_empty(*args, **kwargs)
+
+    def new_ones(self, *args, **kwargs) -> Tensor:
+        """Reload ``new_ones`` from self.tensor."""
+        return self.tensor.new_ones(*args, **kwargs)
+
+    def new_zeros(self, *args, **kwargs) -> Tensor:
+        """Reload ``new_zeros`` from self.tensor."""
+        return self.tensor.new_zeros(*args, **kwargs)
+
+    def size(self, dim: Optional[int] = None) -> Union[int, torch.Size]:
+        """Reload new_zeros from self.tensor."""
+        # self.tensor.size(dim) cannot work when dim=None.
+        return self.tensor.size() if dim is None else self.tensor.size(dim)
+
+    def dim(self) -> int:
+        """Reload ``dim`` from self.tensor."""
+        return self.tensor.dim()
+
+    @property
+    def device(self) -> torch.device:
+        """Reload ``device`` from self.tensor."""
+        return self.tensor.device
+
+    @property
+    def dtype(self) -> torch.dtype:
+        """Reload ``dtype`` from self.tensor."""
+        return self.tensor.dtype
+
+    @property
+    def shape(self) -> torch.Size:
+        return self.tensor.shape
+
+    def numel(self) -> int:
+        """Reload ``numel`` from self.tensor."""
+        return self.tensor.numel()
+
+    def numpy(self) -> np.ndarray:
+        """Reload ``numpy`` from self.tensor."""
+        return self.tensor.numpy()
+
+    def to(self: T, *args, **kwargs) -> T:
+        """Reload ``to`` from self.tensor."""
+        return type(self)(self.tensor.to(*args, **kwargs), clone=False)
+
+    def cpu(self: T) -> T:
+        """Reload ``cpu`` from self.tensor."""
+        return type(self)(self.tensor.cpu(), clone=False)
+
+    def cuda(self: T, *args, **kwargs) -> T:
+        """Reload ``cuda`` from self.tensor."""
+        return type(self)(self.tensor.cuda(*args, **kwargs), clone=False)
+
+    def clone(self: T) -> T:
+        """Reload ``clone`` from self.tensor."""
+        return type(self)(self.tensor)
+
+    def detach(self: T) -> T:
+        """Reload ``detach`` from self.tensor."""
+        return type(self)(self.tensor.detach(), clone=False)
+
+    def view(self: T, *shape: Tuple[int]) -> T:
+        """Reload ``view`` from self.tensor."""
+        return type(self)(self.tensor.view(shape), clone=False)
+
+    def reshape(self: T, *shape: Tuple[int]) -> T:
+        """Reload ``reshape`` from self.tensor."""
+        return type(self)(self.tensor.reshape(shape), clone=False)
+
+    def expand(self: T, *sizes: Tuple[int]) -> T:
+        """Reload ``expand`` from self.tensor."""
+        return type(self)(self.tensor.expand(sizes), clone=False)
+
+    def repeat(self: T, *sizes: Tuple[int]) -> T:
+        """Reload ``repeat`` from self.tensor."""
+        return type(self)(self.tensor.repeat(sizes), clone=False)
+
+    def transpose(self: T, dim0: int, dim1: int) -> T:
+        """Reload ``transpose`` from self.tensor."""
+        ndim = self.tensor.dim()
+        assert dim0 != -1 and dim0 != ndim - 1
+        assert dim1 != -1 and dim1 != ndim - 1
+        return type(self)(self.tensor.transpose(dim0, dim1), clone=False)
+
+    def permute(self: T, *dims: Tuple[int]) -> T:
+        """Reload ``permute`` from self.tensor."""
+        assert dims[-1] == -1 or dims[-1] == self.tensor.dim() - 1
+        return type(self)(self.tensor.permute(dims), clone=False)
+
+    def split(self: T,
+              split_size_or_sections: Union[int, Sequence[int]],
+              dim: int = 0) -> List[T]:
+        """Reload ``split`` from self.tensor."""
+        assert dim != -1 and dim != self.tensor.dim() - 1
+        boxes_list = self.tensor.split(split_size_or_sections, dim=dim)
+        return [type(self)(boxes, clone=False) for boxes in boxes_list]
+
+    def chunk(self: T, chunks: int, dim: int = 0) -> List[T]:
+        """Reload ``chunk`` from self.tensor."""
+        assert dim != -1 and dim != self.tensor.dim() - 1
+        boxes_list = self.tensor.chunk(chunks, dim=dim)
+        return [type(self)(boxes, clone=False) for boxes in boxes_list]
+
+    def unbind(self: T, dim: int = 0) -> T:
+        """Reload ``unbind`` from self.tensor."""
+        assert dim != -1 and dim != self.tensor.dim() - 1
+        boxes_list = self.tensor.unbind(dim=dim)
+        return [type(self)(boxes, clone=False) for boxes in boxes_list]
+
+    def flatten(self: T, start_dim: int = 0, end_dim: int = -2) -> T:
+        """Reload ``flatten`` from self.tensor."""
+        assert end_dim != -1 and end_dim != self.tensor.dim() - 1
+        return type(self)(self.tensor.flatten(start_dim, end_dim), clone=False)
+
+    def squeeze(self: T, dim: Optional[int] = None) -> T:
+        """Reload ``squeeze`` from self.tensor."""
+        boxes = self.tensor.squeeze() if dim is None else \
+            self.tensor.squeeze(dim)
+        return type(self)(boxes, clone=False)
+
+    def unsqueeze(self: T, dim: int) -> T:
+        """Reload ``unsqueeze`` from self.tensor."""
+        assert dim != -1 and dim != self.tensor.dim()
+        return type(self)(self.tensor.unsqueeze(dim), clone=False)
+
+    @classmethod
+    def cat(cls: Type[T], box_list: Sequence[T], dim: int = 0) -> T:
+        """Cancatenates a box instance list into one single box instance.
+        Similar to ``torch.cat``.
+
+        Args:
+            box_list (Sequence[T]): A sequence of box instances.
+            dim (int): The dimension over which the box are concatenated.
+                Defaults to 0.
+
+        Returns:
+            T: Concatenated box instance.
+        """
+        assert isinstance(box_list, Sequence)
+        if len(box_list) == 0:
+            raise ValueError('box_list should not be a empty list.')
+
+        assert dim != -1 and dim != box_list[0].dim() - 1
+        assert all(isinstance(boxes, cls) for boxes in box_list)
+
+        th_box_list = [boxes.tensor for boxes in box_list]
+        return cls(torch.cat(th_box_list, dim=dim), clone=False)
+
+    @classmethod
+    def stack(cls: Type[T], box_list: Sequence[T], dim: int = 0) -> T:
+        """Concatenates a sequence of tensors along a new dimension. Similar to
+        ``torch.stack``.
+
+        Args:
+            box_list (Sequence[T]): A sequence of box instances.
+            dim (int): Dimension to insert. Defaults to 0.
+
+        Returns:
+            T: Concatenated box instance.
+        """
+        assert isinstance(box_list, Sequence)
+        if len(box_list) == 0:
+            raise ValueError('box_list should not be a empty list.')
+
+        assert dim != -1 and dim != box_list[0].dim()
+        assert all(isinstance(boxes, cls) for boxes in box_list)
+
+        th_box_list = [boxes.tensor for boxes in box_list]
+        return cls(torch.stack(th_box_list, dim=dim), clone=False)
+
+    @abstractproperty
+    def centers(self) -> Tensor:
+        """Return a tensor representing the centers of boxes."""
+        pass
+
+    @abstractproperty
+    def areas(self) -> Tensor:
+        """Return a tensor representing the areas of boxes."""
+        pass
+
+    @abstractproperty
+    def widths(self) -> Tensor:
+        """Return a tensor representing the widths of boxes."""
+        pass
+
+    @abstractproperty
+    def heights(self) -> Tensor:
+        """Return a tensor representing the heights of boxes."""
+        pass
+
+    @abstractmethod
+    def flip_(self,
+              img_shape: Tuple[int, int],
+              direction: str = 'horizontal') -> None:
+        """Flip boxes horizontally or vertically in-place.
+
+        Args:
+            img_shape (Tuple[int, int]): A tuple of image height and width.
+            direction (str): Flip direction, options are "horizontal",
+                "vertical" and "diagonal". Defaults to "horizontal"
+        """
+        pass
+
+    @abstractmethod
+    def translate_(self, distances: Tuple[float, float]) -> None:
+        """Translate boxes in-place.
+
+        Args:
+            distances (Tuple[float, float]): translate distances. The first
+                is horizontal distance and the second is vertical distance.
+        """
+        pass
+
+    @abstractmethod
+    def clip_(self, img_shape: Tuple[int, int]) -> None:
+        """Clip boxes according to the image shape in-place.
+
+        Args:
+            img_shape (Tuple[int, int]): A tuple of image height and width.
+        """
+        pass
+
+    @abstractmethod
+    def rotate_(self, center: Tuple[float, float], angle: float) -> None:
+        """Rotate all boxes in-place.
+
+        Args:
+            center (Tuple[float, float]): Rotation origin.
+            angle (float): Rotation angle represented in degrees. Positive
+                values mean clockwise rotation.
+        """
+        pass
+
+    @abstractmethod
+    def project_(self, homography_matrix: Union[Tensor, np.ndarray]) -> None:
+        """Geometric transformat boxes in-place.
+
+        Args:
+            homography_matrix (Tensor or np.ndarray]):
+                Shape (3, 3) for geometric transformation.
+        """
+        pass
+
+    @abstractmethod
+    def rescale_(self, scale_factor: Tuple[float, float]) -> None:
+        """Rescale boxes w.r.t. rescale_factor in-place.
+
+        Note:
+            Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes
+            w.r.t ``scale_facotr``. The difference is that ``resize_`` only
+            changes the width and the height of boxes, but ``rescale_`` also
+            rescales the box centers simultaneously.
+
+        Args:
+            scale_factor (Tuple[float, float]): factors for scaling boxes.
+                The length should be 2.
+        """
+        pass
+
+    @abstractmethod
+    def resize_(self, scale_factor: Tuple[float, float]) -> None:
+        """Resize the box width and height w.r.t scale_factor in-place.
+
+        Note:
+            Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes
+            w.r.t ``scale_facotr``. The difference is that ``resize_`` only
+            changes the width and the height of boxes, but ``rescale_`` also
+            rescales the box centers simultaneously.
+
+        Args:
+            scale_factor (Tuple[float, float]): factors for scaling box
+                shapes. The length should be 2.
+        """
+        pass
+
+    @abstractmethod
+    def is_inside(self,
+                  img_shape: Tuple[int, int],
+                  all_inside: bool = False,
+                  allowed_border: int = 0) -> BoolTensor:
+        """Find boxes inside the image.
+
+        Args:
+            img_shape (Tuple[int, int]): A tuple of image height and width.
+            all_inside (bool): Whether the boxes are all inside the image or
+                part inside the image. Defaults to False.
+            allowed_border (int): Boxes that extend beyond the image shape
+                boundary by more than ``allowed_border`` are considered
+                "outside" Defaults to 0.
+        Returns:
+            BoolTensor: A BoolTensor indicating whether the box is inside
+            the image. Assuming the original boxes have shape (m, n, box_dim),
+            the output has shape (m, n).
+        """
+        pass
+
+    @abstractmethod
+    def find_inside_points(self,
+                           points: Tensor,
+                           is_aligned: bool = False) -> BoolTensor:
+        """Find inside box points. Boxes dimension must be 2.
+
+        Args:
+            points (Tensor): Points coordinates. Has shape of (m, 2).
+            is_aligned (bool): Whether ``points`` has been aligned with boxes
+                or not. If True, the length of boxes and ``points`` should be
+                the same. Defaults to False.
+
+        Returns:
+            BoolTensor: A BoolTensor indicating whether a point is inside
+            boxes. Assuming the boxes has shape of (n, box_dim), if
+            ``is_aligned`` is False. The index has shape of (m, n). If
+            ``is_aligned`` is True, m should be equal to n and the index has
+            shape of (m, ).
+        """
+        pass
+
+    @abstractstaticmethod
+    def overlaps(boxes1: 'BaseBoxes',
+                 boxes2: 'BaseBoxes',
+                 mode: str = 'iou',
+                 is_aligned: bool = False,
+                 eps: float = 1e-6) -> Tensor:
+        """Calculate overlap between two set of boxes with their types
+        converted to the present box type.
+
+        Args:
+            boxes1 (:obj:`BaseBoxes`): BaseBoxes with shape of (m, box_dim)
+                or empty.
+            boxes2 (:obj:`BaseBoxes`): BaseBoxes with shape of (n, box_dim)
+                or empty.
+            mode (str): "iou" (intersection over union), "iof" (intersection
+                over foreground). Defaults to "iou".
+            is_aligned (bool): If True, then m and n must be equal. Defaults
+                to False.
+            eps (float): A value added to the denominator for numerical
+                stability. Defaults to 1e-6.
+
+        Returns:
+            Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+        """
+        pass
+
+    @abstractstaticmethod
+    def from_instance_masks(masks: MaskType) -> 'BaseBoxes':
+        """Create boxes from instance masks.
+
+        Args:
+            masks (:obj:`BitmapMasks` or :obj:`PolygonMasks`): BitmapMasks or
+                PolygonMasks instance with length of n.
+
+        Returns:
+            :obj:`BaseBoxes`: Converted boxes with shape of (n, box_dim).
+        """
+        pass
diff --git a/mmde/mmdet/structures/bbox/bbox_overlaps.py b/mmde/mmdet/structures/bbox/bbox_overlaps.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e3435d28b38a5479a6c791f52a76d8ba293a6eb
--- /dev/null
+++ b/mmde/mmdet/structures/bbox/bbox_overlaps.py
@@ -0,0 +1,199 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def fp16_clamp(x, min=None, max=None):
+    if not x.is_cuda and x.dtype == torch.float16:
+        # clamp for cpu float16, tensor fp16 has no clamp implementation
+        return x.float().clamp(min, max).half()
+
+    return x.clamp(min, max)
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
+    """Calculate overlap between two set of bboxes.
+
+    FP16 Contributed by https://github.com/open-mmlab/mmdetection/pull/4889
+    Note:
+        Assume bboxes1 is M x 4, bboxes2 is N x 4, when mode is 'iou',
+        there are some new generated variable when calculating IOU
+        using bbox_overlaps function:
+
+        1) is_aligned is False
+            area1: M x 1
+            area2: N x 1
+            lt: M x N x 2
+            rb: M x N x 2
+            wh: M x N x 2
+            overlap: M x N x 1
+            union: M x N x 1
+            ious: M x N x 1
+
+            Total memory:
+                S = (9 x N x M + N + M) * 4 Byte,
+
+            When using FP16, we can reduce:
+                R = (9 x N x M + N + M) * 4 / 2 Byte
+                R large than (N + M) * 4 * 2 is always true when N and M >= 1.
+                Obviously, N + M <= N * M < 3 * N * M, when N >=2 and M >=2,
+                           N + 1 < 3 * N, when N or M is 1.
+
+            Given M = 40 (ground truth), N = 400000 (three anchor boxes
+            in per grid, FPN, R-CNNs),
+                R = 275 MB (one times)
+
+            A special case (dense detection), M = 512 (ground truth),
+                R = 3516 MB = 3.43 GB
+
+            When the batch size is B, reduce:
+                B x R
+
+            Therefore, CUDA memory runs out frequently.
+
+            Experiments on GeForce RTX 2080Ti (11019 MiB):
+
+            |   dtype   |   M   |   N   |   Use    |   Real   |   Ideal   |
+            |:----:|:----:|:----:|:----:|:----:|:----:|
+            |   FP32   |   512 | 400000 | 8020 MiB |   --   |   --   |
+            |   FP16   |   512 | 400000 |   4504 MiB | 3516 MiB | 3516 MiB |
+            |   FP32   |   40 | 400000 |   1540 MiB |   --   |   --   |
+            |   FP16   |   40 | 400000 |   1264 MiB |   276MiB   | 275 MiB |
+
+        2) is_aligned is True
+            area1: N x 1
+            area2: N x 1
+            lt: N x 2
+            rb: N x 2
+            wh: N x 2
+            overlap: N x 1
+            union: N x 1
+            ious: N x 1
+
+            Total memory:
+                S = 11 x N * 4 Byte
+
+            When using FP16, we can reduce:
+                R = 11 x N * 4 / 2 Byte
+
+        So do the 'giou' (large than 'iou').
+
+        Time-wise, FP16 is generally faster than FP32.
+
+        When gpu_assign_thr is not -1, it takes more time on cpu
+        but not reduce memory.
+        There, we can reduce half the memory and keep the speed.
+
+    If ``is_aligned`` is ``False``, then calculate the overlaps between each
+    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
+    pair of bboxes1 and bboxes2.
+
+    Args:
+        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
+        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
+            B indicates the batch dim, in shape (B1, B2, ..., Bn).
+            If ``is_aligned`` is ``True``, then m and n must be equal.
+        mode (str): "iou" (intersection over union), "iof" (intersection over
+            foreground) or "giou" (generalized intersection over union).
+            Default "iou".
+        is_aligned (bool, optional): If True, then m and n must be equal.
+            Default False.
+        eps (float, optional): A value added to the denominator for numerical
+            stability. Default 1e-6.
+
+    Returns:
+        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+
+    Example:
+        >>> bboxes1 = torch.FloatTensor([
+        >>>     [0, 0, 10, 10],
+        >>>     [10, 10, 20, 20],
+        >>>     [32, 32, 38, 42],
+        >>> ])
+        >>> bboxes2 = torch.FloatTensor([
+        >>>     [0, 0, 10, 20],
+        >>>     [0, 10, 10, 19],
+        >>>     [10, 10, 20, 20],
+        >>> ])
+        >>> overlaps = bbox_overlaps(bboxes1, bboxes2)
+        >>> assert overlaps.shape == (3, 3)
+        >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
+        >>> assert overlaps.shape == (3, )
+
+    Example:
+        >>> empty = torch.empty(0, 4)
+        >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
+        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
+        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
+        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
+    """
+
+    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
+    # Either the boxes are empty or the length of boxes' last dimension is 4
+    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
+    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
+
+    # Batch dim must be the same
+    # Batch dim: (B1, B2, ... Bn)
+    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
+    batch_shape = bboxes1.shape[:-2]
+
+    rows = bboxes1.size(-2)
+    cols = bboxes2.size(-2)
+    if is_aligned:
+        assert rows == cols
+
+    if rows * cols == 0:
+        if is_aligned:
+            return bboxes1.new(batch_shape + (rows, ))
+        else:
+            return bboxes1.new(batch_shape + (rows, cols))
+
+    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
+        bboxes1[..., 3] - bboxes1[..., 1])
+    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
+        bboxes2[..., 3] - bboxes2[..., 1])
+
+    if is_aligned:
+        lt = torch.max(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
+        rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]
+
+        wh = fp16_clamp(rb - lt, min=0)
+        overlap = wh[..., 0] * wh[..., 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1 + area2 - overlap
+        else:
+            union = area1
+        if mode == 'giou':
+            enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
+            enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
+    else:
+        lt = torch.max(bboxes1[..., :, None, :2],
+                       bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
+        rb = torch.min(bboxes1[..., :, None, 2:],
+                       bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]
+
+        wh = fp16_clamp(rb - lt, min=0)
+        overlap = wh[..., 0] * wh[..., 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1[..., None] + area2[..., None, :] - overlap
+        else:
+            union = area1[..., None]
+        if mode == 'giou':
+            enclosed_lt = torch.min(bboxes1[..., :, None, :2],
+                                    bboxes2[..., None, :, :2])
+            enclosed_rb = torch.max(bboxes1[..., :, None, 2:],
+                                    bboxes2[..., None, :, 2:])
+
+    eps = union.new_tensor([eps])
+    union = torch.max(union, eps)
+    ious = overlap / union
+    if mode in ['iou', 'iof']:
+        return ious
+    # calculate gious
+    enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0)
+    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+    enclose_area = torch.max(enclose_area, eps)
+    gious = ious - (enclose_area - union) / enclose_area
+    return gious
diff --git a/mmde/mmdet/structures/bbox/box_type.py b/mmde/mmdet/structures/bbox/box_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7eb5494c36c8efcbb414897f7c2532a6d3a1ddb
--- /dev/null
+++ b/mmde/mmdet/structures/bbox/box_type.py
@@ -0,0 +1,296 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Callable, Optional, Tuple, Type, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from .base_boxes import BaseBoxes
+
+BoxType = Union[np.ndarray, Tensor, BaseBoxes]
+
+box_types: dict = {}
+_box_type_to_name: dict = {}
+box_converters: dict = {}
+
+
+def _register_box(name: str, box_type: Type, force: bool = False) -> None:
+    """Register a box type.
+
+    Args:
+        name (str): The name of box type.
+        box_type (type): Box mode class to be registered.
+        force (bool): Whether to override an existing class with the same
+            name. Defaults to False.
+    """
+    assert issubclass(box_type, BaseBoxes)
+    name = name.lower()
+
+    if not force and (name in box_types or box_type in _box_type_to_name):
+        raise KeyError(f'box type {name} has been registered')
+    elif name in box_types:
+        _box_type = box_types.pop(name)
+        _box_type_to_name.pop(_box_type)
+    elif box_type in _box_type_to_name:
+        _name = _box_type_to_name.pop(box_type)
+        box_types.pop(_name)
+
+    box_types[name] = box_type
+    _box_type_to_name[box_type] = name
+
+
+def register_box(name: str,
+                 box_type: Type = None,
+                 force: bool = False) -> Union[Type, Callable]:
+    """Register a box type.
+
+    A record will be added to ``bbox_types``, whose key is the box type name
+    and value is the box type itself. Simultaneously, a reverse dictionary
+    ``_box_type_to_name`` will be updated. It can be used as a decorator or
+    a normal function.
+
+    Args:
+        name (str): The name of box type.
+        bbox_type (type, Optional): Box type class to be registered.
+            Defaults to None.
+        force (bool): Whether to override the existing box type with the same
+            name. Defaults to False.
+
+    Examples:
+        >>> from mmdet.structures.bbox import register_box
+        >>> from mmdet.structures.bbox import BaseBoxes
+
+        >>> # as a decorator
+        >>> @register_box('hbox')
+        >>> class HorizontalBoxes(BaseBoxes):
+        >>>     pass
+
+        >>> # as a normal function
+        >>> class RotatedBoxes(BaseBoxes):
+        >>>     pass
+        >>> register_box('rbox', RotatedBoxes)
+    """
+    if not isinstance(force, bool):
+        raise TypeError(f'force must be a boolean, but got {type(force)}')
+
+    # use it as a normal method: register_box(name, box_type=BoxCls)
+    if box_type is not None:
+        _register_box(name=name, box_type=box_type, force=force)
+        return box_type
+
+    # use it as a decorator: @register_box(name)
+    def _register(cls):
+        _register_box(name=name, box_type=cls, force=force)
+        return cls
+
+    return _register
+
+
+def _register_box_converter(src_type: Union[str, type],
+                            dst_type: Union[str, type],
+                            converter: Callable,
+                            force: bool = False) -> None:
+    """Register a box converter.
+
+    Args:
+        src_type (str or type): source box type name or class.
+        dst_type (str or type): destination box type name or class.
+        converter (Callable): Convert function.
+        force (bool): Whether to override the existing box type with the same
+            name. Defaults to False.
+    """
+    assert callable(converter)
+    src_type_name, _ = get_box_type(src_type)
+    dst_type_name, _ = get_box_type(dst_type)
+
+    converter_name = src_type_name + '2' + dst_type_name
+    if not force and converter_name in box_converters:
+        raise KeyError(f'The box converter from {src_type_name} to '
+                       f'{dst_type_name} has been registered.')
+
+    box_converters[converter_name] = converter
+
+
+def register_box_converter(src_type: Union[str, type],
+                           dst_type: Union[str, type],
+                           converter: Optional[Callable] = None,
+                           force: bool = False) -> Callable:
+    """Register a box converter.
+
+    A record will be added to ``box_converter``, whose key is
+    '{src_type_name}2{dst_type_name}' and value is the convert function.
+    It can be used as a decorator or a normal function.
+
+    Args:
+        src_type (str or type): source box type name or class.
+        dst_type (str or type): destination box type name or class.
+        converter (Callable): Convert function. Defaults to None.
+        force (bool): Whether to override the existing box type with the same
+            name. Defaults to False.
+
+    Examples:
+        >>> from mmdet.structures.bbox import register_box_converter
+        >>> # as a decorator
+        >>> @register_box_converter('hbox', 'rbox')
+        >>> def converter_A(boxes):
+        >>>     pass
+
+        >>> # as a normal function
+        >>> def converter_B(boxes):
+        >>>     pass
+        >>> register_box_converter('rbox', 'hbox', converter_B)
+    """
+    if not isinstance(force, bool):
+        raise TypeError(f'force must be a boolean, but got {type(force)}')
+
+    # use it as a normal method:
+    # register_box_converter(src_type, dst_type, converter=Func)
+    if converter is not None:
+        _register_box_converter(
+            src_type=src_type,
+            dst_type=dst_type,
+            converter=converter,
+            force=force)
+        return converter
+
+    # use it as a decorator: @register_box_converter(name)
+    def _register(func):
+        _register_box_converter(
+            src_type=src_type, dst_type=dst_type, converter=func, force=force)
+        return func
+
+    return _register
+
+
+def get_box_type(box_type: Union[str, type]) -> Tuple[str, type]:
+    """get both box type name and class.
+
+    Args:
+        box_type (str or type): Single box type name or class.
+
+    Returns:
+        Tuple[str, type]: A tuple of box type name and class.
+    """
+    if isinstance(box_type, str):
+        type_name = box_type.lower()
+        assert type_name in box_types, \
+            f"Box type {type_name} hasn't been registered in box_types."
+        type_cls = box_types[type_name]
+    elif issubclass(box_type, BaseBoxes):
+        assert box_type in _box_type_to_name, \
+            f"Box type {box_type} hasn't been registered in box_types."
+        type_name = _box_type_to_name[box_type]
+        type_cls = box_type
+    else:
+        raise KeyError('box_type must be a str or class inheriting from '
+                       f'BaseBoxes, but got {type(box_type)}.')
+    return type_name, type_cls
+
+
+def convert_box_type(boxes: BoxType,
+                     *,
+                     src_type: Union[str, type] = None,
+                     dst_type: Union[str, type] = None) -> BoxType:
+    """Convert boxes from source type to destination type.
+
+    If ``boxes`` is a instance of BaseBoxes, the ``src_type`` will be set
+    as the type of ``boxes``.
+
+    Args:
+        boxes (np.ndarray or Tensor or :obj:`BaseBoxes`): boxes need to
+            convert.
+        src_type (str or type, Optional): source box type. Defaults to None.
+        dst_type (str or type, Optional): destination box type. Defaults to
+            None.
+
+    Returns:
+        Union[np.ndarray, Tensor, :obj:`BaseBoxes`]: Converted boxes. It's type
+        is consistent with the input's type.
+    """
+    assert dst_type is not None
+    dst_type_name, dst_type_cls = get_box_type(dst_type)
+
+    is_box_cls = False
+    is_numpy = False
+    if isinstance(boxes, BaseBoxes):
+        src_type_name, _ = get_box_type(type(boxes))
+        is_box_cls = True
+    elif isinstance(boxes, (Tensor, np.ndarray)):
+        assert src_type is not None
+        src_type_name, _ = get_box_type(src_type)
+        if isinstance(boxes, np.ndarray):
+            is_numpy = True
+    else:
+        raise TypeError('boxes must be a instance of BaseBoxes, Tensor or '
+                        f'ndarray, but get {type(boxes)}.')
+
+    if src_type_name == dst_type_name:
+        return boxes
+
+    converter_name = src_type_name + '2' + dst_type_name
+    assert converter_name in box_converters, \
+        "Convert function hasn't been registered in box_converters."
+    converter = box_converters[converter_name]
+
+    if is_box_cls:
+        boxes = converter(boxes.tensor)
+        return dst_type_cls(boxes)
+    elif is_numpy:
+        boxes = converter(torch.from_numpy(boxes))
+        return boxes.numpy()
+    else:
+        return converter(boxes)
+
+
+def autocast_box_type(dst_box_type='hbox') -> Callable:
+    """A decorator which automatically casts results['gt_bboxes'] to the
+    destination box type.
+
+    It commenly used in mmdet.datasets.transforms to make the transforms up-
+    compatible with the np.ndarray type of results['gt_bboxes'].
+
+    The speed of processing of np.ndarray and BaseBoxes data are the same:
+
+    - np.ndarray: 0.0509 img/s
+    - BaseBoxes: 0.0551 img/s
+
+    Args:
+        dst_box_type (str): Destination box type.
+    """
+    _, box_type_cls = get_box_type(dst_box_type)
+
+    def decorator(func: Callable) -> Callable:
+
+        def wrapper(self, results: dict, *args, **kwargs) -> dict:
+            if ('gt_bboxes' not in results
+                    or isinstance(results['gt_bboxes'], BaseBoxes)):
+                return func(self, results)
+            elif isinstance(results['gt_bboxes'], np.ndarray):
+                results['gt_bboxes'] = box_type_cls(
+                    results['gt_bboxes'], clone=False)
+                if 'mix_results' in results:
+                    for res in results['mix_results']:
+                        if isinstance(res['gt_bboxes'], np.ndarray):
+                            res['gt_bboxes'] = box_type_cls(
+                                res['gt_bboxes'], clone=False)
+
+                _results = func(self, results, *args, **kwargs)
+
+                # In some cases, the function will process gt_bboxes in-place
+                # Simultaneously convert inputting and outputting gt_bboxes
+                # back to np.ndarray
+                if isinstance(_results, dict) and 'gt_bboxes' in _results:
+                    if isinstance(_results['gt_bboxes'], BaseBoxes):
+                        _results['gt_bboxes'] = _results['gt_bboxes'].numpy()
+                if isinstance(results['gt_bboxes'], BaseBoxes):
+                    results['gt_bboxes'] = results['gt_bboxes'].numpy()
+                return _results
+            else:
+                raise TypeError(
+                    "auto_box_type requires results['gt_bboxes'] to "
+                    'be BaseBoxes or np.ndarray, but got '
+                    f"{type(results['gt_bboxes'])}")
+
+        return wrapper
+
+    return decorator
diff --git a/mmde/mmdet/structures/bbox/horizontal_boxes.py b/mmde/mmdet/structures/bbox/horizontal_boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3a78518105fda02cef2d3a2bcaceb410759165c
--- /dev/null
+++ b/mmde/mmdet/structures/bbox/horizontal_boxes.py
@@ -0,0 +1,432 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, TypeVar, Union
+
+import cv2
+import numpy as np
+import torch
+from torch import BoolTensor, Tensor
+
+from mmdet.structures.mask.structures import BitmapMasks, PolygonMasks
+from .base_boxes import BaseBoxes
+from .bbox_overlaps import bbox_overlaps
+from .box_type import register_box
+
+T = TypeVar('T')
+DeviceType = Union[str, torch.device]
+MaskType = Union[BitmapMasks, PolygonMasks]
+
+
+@register_box(name='hbox')
+class HorizontalBoxes(BaseBoxes):
+    """The horizontal box class used in MMDetection by default.
+
+    The ``box_dim`` of ``HorizontalBoxes`` is 4, which means the length of
+    the last dimension of the data should be 4. Two modes of box data are
+    supported in ``HorizontalBoxes``:
+
+    - 'xyxy': Each row of data indicates (x1, y1, x2, y2), which are the
+      coordinates of the left-top and right-bottom points.
+    - 'cxcywh': Each row of data indicates (x, y, w, h), where (x, y) are the
+      coordinates of the box centers and (w, h) are the width and height.
+
+    ``HorizontalBoxes`` only restores 'xyxy' mode of data. If the the data is
+    in 'cxcywh' mode, users need to input ``in_mode='cxcywh'`` and The code
+    will convert the 'cxcywh' data to 'xyxy' automatically.
+
+    Args:
+        data (Tensor or np.ndarray or Sequence): The box data with shape of
+            (..., 4).
+        dtype (torch.dtype, Optional): data type of boxes. Defaults to None.
+        device (str or torch.device, Optional): device of boxes.
+            Default to None.
+        clone (bool): Whether clone ``boxes`` or not. Defaults to True.
+        mode (str, Optional): the mode of boxes. If it is 'cxcywh', the
+            `data` will be converted to 'xyxy' mode. Defaults to None.
+    """
+
+    box_dim: int = 4
+
+    def __init__(self,
+                 data: Union[Tensor, np.ndarray],
+                 dtype: torch.dtype = None,
+                 device: DeviceType = None,
+                 clone: bool = True,
+                 in_mode: Optional[str] = None) -> None:
+        super().__init__(data=data, dtype=dtype, device=device, clone=clone)
+        if isinstance(in_mode, str):
+            if in_mode not in ('xyxy', 'cxcywh'):
+                raise ValueError(f'Get invalid mode {in_mode}.')
+            if in_mode == 'cxcywh':
+                self.tensor = self.cxcywh_to_xyxy(self.tensor)
+
+    @staticmethod
+    def cxcywh_to_xyxy(boxes: Tensor) -> Tensor:
+        """Convert box coordinates from (cx, cy, w, h) to (x1, y1, x2, y2).
+
+        Args:
+            boxes (Tensor): cxcywh boxes tensor with shape of (..., 4).
+
+        Returns:
+            Tensor: xyxy boxes tensor with shape of (..., 4).
+        """
+        ctr, wh = boxes.split((2, 2), dim=-1)
+        return torch.cat([(ctr - wh / 2), (ctr + wh / 2)], dim=-1)
+
+    @staticmethod
+    def xyxy_to_cxcywh(boxes: Tensor) -> Tensor:
+        """Convert box coordinates from (x1, y1, x2, y2) to (cx, cy, w, h).
+
+        Args:
+            boxes (Tensor): xyxy boxes tensor with shape of (..., 4).
+
+        Returns:
+            Tensor: cxcywh boxes tensor with shape of (..., 4).
+        """
+        xy1, xy2 = boxes.split((2, 2), dim=-1)
+        return torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1)
+
+    @property
+    def cxcywh(self) -> Tensor:
+        """Return a tensor representing the cxcywh boxes."""
+        return self.xyxy_to_cxcywh(self.tensor)
+
+    @property
+    def centers(self) -> Tensor:
+        """Return a tensor representing the centers of boxes."""
+        boxes = self.tensor
+        return (boxes[..., :2] + boxes[..., 2:]) / 2
+
+    @property
+    def areas(self) -> Tensor:
+        """Return a tensor representing the areas of boxes."""
+        boxes = self.tensor
+        return (boxes[..., 2] - boxes[..., 0]) * (
+            boxes[..., 3] - boxes[..., 1])
+
+    @property
+    def widths(self) -> Tensor:
+        """Return a tensor representing the widths of boxes."""
+        boxes = self.tensor
+        return boxes[..., 2] - boxes[..., 0]
+
+    @property
+    def heights(self) -> Tensor:
+        """Return a tensor representing the heights of boxes."""
+        boxes = self.tensor
+        return boxes[..., 3] - boxes[..., 1]
+
+    def flip_(self,
+              img_shape: Tuple[int, int],
+              direction: str = 'horizontal') -> None:
+        """Flip boxes horizontally or vertically in-place.
+
+        Args:
+            img_shape (Tuple[int, int]): A tuple of image height and width.
+            direction (str): Flip direction, options are "horizontal",
+                "vertical" and "diagonal". Defaults to "horizontal"
+        """
+        assert direction in ['horizontal', 'vertical', 'diagonal']
+        flipped = self.tensor
+        boxes = flipped.clone()
+        if direction == 'horizontal':
+            flipped[..., 0] = img_shape[1] - boxes[..., 2]
+            flipped[..., 2] = img_shape[1] - boxes[..., 0]
+        elif direction == 'vertical':
+            flipped[..., 1] = img_shape[0] - boxes[..., 3]
+            flipped[..., 3] = img_shape[0] - boxes[..., 1]
+        else:
+            flipped[..., 0] = img_shape[1] - boxes[..., 2]
+            flipped[..., 1] = img_shape[0] - boxes[..., 3]
+            flipped[..., 2] = img_shape[1] - boxes[..., 0]
+            flipped[..., 3] = img_shape[0] - boxes[..., 1]
+
+    def translate_(self, distances: Tuple[float, float]) -> None:
+        """Translate boxes in-place.
+
+        Args:
+            distances (Tuple[float, float]): translate distances. The first
+                is horizontal distance and the second is vertical distance.
+        """
+        boxes = self.tensor
+        assert len(distances) == 2
+        self.tensor = boxes + boxes.new_tensor(distances).repeat(2)
+
+    def clip_(self, img_shape: Tuple[int, int]) -> None:
+        """Clip boxes according to the image shape in-place.
+
+        Args:
+            img_shape (Tuple[int, int]): A tuple of image height and width.
+        """
+        boxes = self.tensor
+        boxes[..., 0::2] = boxes[..., 0::2].clamp(0, img_shape[1])
+        boxes[..., 1::2] = boxes[..., 1::2].clamp(0, img_shape[0])
+
+    def rotate_(self, center: Tuple[float, float], angle: float) -> None:
+        """Rotate all boxes in-place.
+
+        Args:
+            center (Tuple[float, float]): Rotation origin.
+            angle (float): Rotation angle represented in degrees. Positive
+                values mean clockwise rotation.
+        """
+        boxes = self.tensor
+        rotation_matrix = boxes.new_tensor(
+            cv2.getRotationMatrix2D(center, -angle, 1))
+
+        corners = self.hbox2corner(boxes)
+        corners = torch.cat(
+            [corners, corners.new_ones(*corners.shape[:-1], 1)], dim=-1)
+        corners_T = torch.transpose(corners, -1, -2)
+        corners_T = torch.matmul(rotation_matrix, corners_T)
+        corners = torch.transpose(corners_T, -1, -2)
+        self.tensor = self.corner2hbox(corners)
+
+    def project_(self, homography_matrix: Union[Tensor, np.ndarray]) -> None:
+        """Geometric transformat boxes in-place.
+
+        Args:
+            homography_matrix (Tensor or np.ndarray]):
+                Shape (3, 3) for geometric transformation.
+        """
+        boxes = self.tensor
+        if isinstance(homography_matrix, np.ndarray):
+            homography_matrix = boxes.new_tensor(homography_matrix)
+        corners = self.hbox2corner(boxes)
+        corners = torch.cat(
+            [corners, corners.new_ones(*corners.shape[:-1], 1)], dim=-1)
+        corners_T = torch.transpose(corners, -1, -2)
+        corners_T = torch.matmul(homography_matrix, corners_T)
+        corners = torch.transpose(corners_T, -1, -2)
+        # Convert to homogeneous coordinates by normalization
+        corners = corners[..., :2] / corners[..., 2:3]
+        self.tensor = self.corner2hbox(corners)
+
+    @staticmethod
+    def hbox2corner(boxes: Tensor) -> Tensor:
+        """Convert box coordinates from (x1, y1, x2, y2) to corners ((x1, y1),
+        (x2, y1), (x1, y2), (x2, y2)).
+
+        Args:
+            boxes (Tensor): Horizontal box tensor with shape of (..., 4).
+
+        Returns:
+            Tensor: Corner tensor with shape of (..., 4, 2).
+        """
+        x1, y1, x2, y2 = torch.split(boxes, 1, dim=-1)
+        corners = torch.cat([x1, y1, x2, y1, x1, y2, x2, y2], dim=-1)
+        return corners.reshape(*corners.shape[:-1], 4, 2)
+
+    @staticmethod
+    def corner2hbox(corners: Tensor) -> Tensor:
+        """Convert box coordinates from corners ((x1, y1), (x2, y1), (x1, y2),
+        (x2, y2)) to (x1, y1, x2, y2).
+
+        Args:
+            corners (Tensor): Corner tensor with shape of (..., 4, 2).
+
+        Returns:
+            Tensor: Horizontal box tensor with shape of (..., 4).
+        """
+        if corners.numel() == 0:
+            return corners.new_zeros((0, 4))
+        min_xy = corners.min(dim=-2)[0]
+        max_xy = corners.max(dim=-2)[0]
+        return torch.cat([min_xy, max_xy], dim=-1)
+
+    def rescale_(self, scale_factor: Tuple[float, float]) -> None:
+        """Rescale boxes w.r.t. rescale_factor in-place.
+
+        Note:
+            Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes
+            w.r.t ``scale_facotr``. The difference is that ``resize_`` only
+            changes the width and the height of boxes, but ``rescale_`` also
+            rescales the box centers simultaneously.
+
+        Args:
+            scale_factor (Tuple[float, float]): factors for scaling boxes.
+                The length should be 2.
+        """
+        boxes = self.tensor
+        assert len(scale_factor) == 2
+        scale_factor = boxes.new_tensor(scale_factor).repeat(2)
+        self.tensor = boxes * scale_factor
+
+    def resize_(self, scale_factor: Tuple[float, float]) -> None:
+        """Resize the box width and height w.r.t scale_factor in-place.
+
+        Note:
+            Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes
+            w.r.t ``scale_facotr``. The difference is that ``resize_`` only
+            changes the width and the height of boxes, but ``rescale_`` also
+            rescales the box centers simultaneously.
+
+        Args:
+            scale_factor (Tuple[float, float]): factors for scaling box
+                shapes. The length should be 2.
+        """
+        boxes = self.tensor
+        assert len(scale_factor) == 2
+        ctrs = (boxes[..., 2:] + boxes[..., :2]) / 2
+        wh = boxes[..., 2:] - boxes[..., :2]
+        scale_factor = boxes.new_tensor(scale_factor)
+        wh = wh * scale_factor
+        xy1 = ctrs - 0.5 * wh
+        xy2 = ctrs + 0.5 * wh
+        self.tensor = torch.cat([xy1, xy2], dim=-1)
+
+    def is_inside(self,
+                  img_shape: Tuple[int, int],
+                  all_inside: bool = False,
+                  allowed_border: int = 0) -> BoolTensor:
+        """Find boxes inside the image.
+
+        Args:
+            img_shape (Tuple[int, int]): A tuple of image height and width.
+            all_inside (bool): Whether the boxes are all inside the image or
+                part inside the image. Defaults to False.
+            allowed_border (int): Boxes that extend beyond the image shape
+                boundary by more than ``allowed_border`` are considered
+                "outside" Defaults to 0.
+        Returns:
+            BoolTensor: A BoolTensor indicating whether the box is inside
+            the image. Assuming the original boxes have shape (m, n, 4),
+            the output has shape (m, n).
+        """
+        img_h, img_w = img_shape
+        boxes = self.tensor
+        if all_inside:
+            return (boxes[:, 0] >= -allowed_border) & \
+                (boxes[:, 1] >= -allowed_border) & \
+                (boxes[:, 2] < img_w + allowed_border) & \
+                (boxes[:, 3] < img_h + allowed_border)
+        else:
+            return (boxes[..., 0] < img_w + allowed_border) & \
+                (boxes[..., 1] < img_h + allowed_border) & \
+                (boxes[..., 2] > -allowed_border) & \
+                (boxes[..., 3] > -allowed_border)
+
+    def find_inside_points(self,
+                           points: Tensor,
+                           is_aligned: bool = False) -> BoolTensor:
+        """Find inside box points. Boxes dimension must be 2.
+
+        Args:
+            points (Tensor): Points coordinates. Has shape of (m, 2).
+            is_aligned (bool): Whether ``points`` has been aligned with boxes
+                or not. If True, the length of boxes and ``points`` should be
+                the same. Defaults to False.
+
+        Returns:
+            BoolTensor: A BoolTensor indicating whether a point is inside
+            boxes. Assuming the boxes has shape of (n, 4), if ``is_aligned``
+            is False. The index has shape of (m, n). If ``is_aligned`` is
+            True, m should be equal to n and the index has shape of (m, ).
+        """
+        boxes = self.tensor
+        assert boxes.dim() == 2, 'boxes dimension must be 2.'
+
+        if not is_aligned:
+            boxes = boxes[None, :, :]
+            points = points[:, None, :]
+        else:
+            assert boxes.size(0) == points.size(0)
+
+        x_min, y_min, x_max, y_max = boxes.unbind(dim=-1)
+        return (points[..., 0] >= x_min) & (points[..., 0] <= x_max) & \
+            (points[..., 1] >= y_min) & (points[..., 1] <= y_max)
+
+    def create_masks(self, img_shape: Tuple[int, int]) -> BitmapMasks:
+        """
+        Args:
+            img_shape (Tuple[int, int]): A tuple of image height and width.
+
+        Returns:
+            :obj:`BitmapMasks`: Converted masks
+        """
+        img_h, img_w = img_shape
+        boxes = self.tensor
+
+        xmin, ymin = boxes[:, 0:1], boxes[:, 1:2]
+        xmax, ymax = boxes[:, 2:3], boxes[:, 3:4]
+        gt_masks = np.zeros((len(boxes), img_h, img_w), dtype=np.uint8)
+        for i in range(len(boxes)):
+            gt_masks[i,
+                     int(ymin[i]):int(ymax[i]),
+                     int(xmin[i]):int(xmax[i])] = 1
+        return BitmapMasks(gt_masks, img_h, img_w)
+
+    @staticmethod
+    def overlaps(boxes1: BaseBoxes,
+                 boxes2: BaseBoxes,
+                 mode: str = 'iou',
+                 is_aligned: bool = False,
+                 eps: float = 1e-6) -> Tensor:
+        """Calculate overlap between two set of boxes with their types
+        converted to ``HorizontalBoxes``.
+
+        Args:
+            boxes1 (:obj:`BaseBoxes`): BaseBoxes with shape of (m, box_dim)
+                or empty.
+            boxes2 (:obj:`BaseBoxes`): BaseBoxes with shape of (n, box_dim)
+                or empty.
+            mode (str): "iou" (intersection over union), "iof" (intersection
+                over foreground). Defaults to "iou".
+            is_aligned (bool): If True, then m and n must be equal. Defaults
+                to False.
+            eps (float): A value added to the denominator for numerical
+                stability. Defaults to 1e-6.
+
+        Returns:
+            Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+        """
+        boxes1 = boxes1.convert_to('hbox')
+        boxes2 = boxes2.convert_to('hbox')
+        return bbox_overlaps(
+            boxes1.tensor,
+            boxes2.tensor,
+            mode=mode,
+            is_aligned=is_aligned,
+            eps=eps)
+
+    @staticmethod
+    def from_instance_masks(masks: MaskType) -> 'HorizontalBoxes':
+        """Create horizontal boxes from instance masks.
+
+        Args:
+            masks (:obj:`BitmapMasks` or :obj:`PolygonMasks`): BitmapMasks or
+                PolygonMasks instance with length of n.
+
+        Returns:
+            :obj:`HorizontalBoxes`: Converted boxes with shape of (n, 4).
+        """
+        num_masks = len(masks)
+        boxes = np.zeros((num_masks, 4), dtype=np.float32)
+        if isinstance(masks, BitmapMasks):
+            x_any = masks.masks.any(axis=1)
+            y_any = masks.masks.any(axis=2)
+            for idx in range(num_masks):
+                x = np.where(x_any[idx, :])[0]
+                y = np.where(y_any[idx, :])[0]
+                if len(x) > 0 and len(y) > 0:
+                    # use +1 for x_max and y_max so that the right and bottom
+                    # boundary of instance masks are fully included by the box
+                    boxes[idx, :] = np.array(
+                        [x[0], y[0], x[-1] + 1, y[-1] + 1], dtype=np.float32)
+        elif isinstance(masks, PolygonMasks):
+            for idx, poly_per_obj in enumerate(masks.masks):
+                # simply use a number that is big enough for comparison with
+                # coordinates
+                xy_min = np.array([masks.width * 2, masks.height * 2],
+                                  dtype=np.float32)
+                xy_max = np.zeros(2, dtype=np.float32)
+                for p in poly_per_obj:
+                    xy = np.array(p).reshape(-1, 2).astype(np.float32)
+                    xy_min = np.minimum(xy_min, np.min(xy, axis=0))
+                    xy_max = np.maximum(xy_max, np.max(xy, axis=0))
+                boxes[idx, :2] = xy_min
+                boxes[idx, 2:] = xy_max
+        else:
+            raise TypeError(
+                '`masks` must be `BitmapMasks`  or `PolygonMasks`, '
+                f'but got {type(masks)}.')
+        return HorizontalBoxes(boxes)
diff --git a/mmde/mmdet/structures/bbox/transforms.py b/mmde/mmdet/structures/bbox/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..287e6aa6fcaeaf09a8a2838a04a97157cd02a00c
--- /dev/null
+++ b/mmde/mmdet/structures/bbox/transforms.py
@@ -0,0 +1,498 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet.structures.bbox import BaseBoxes
+
+
+def find_inside_bboxes(bboxes: Tensor, img_h: int, img_w: int) -> Tensor:
+    """Find bboxes as long as a part of bboxes is inside the image.
+
+    Args:
+        bboxes (Tensor): Shape (N, 4).
+        img_h (int): Image height.
+        img_w (int): Image width.
+
+    Returns:
+        Tensor: Index of the remaining bboxes.
+    """
+    inside_inds = (bboxes[:, 0] < img_w) & (bboxes[:, 2] > 0) \
+        & (bboxes[:, 1] < img_h) & (bboxes[:, 3] > 0)
+    return inside_inds
+
+
+def bbox_flip(bboxes: Tensor,
+              img_shape: Tuple[int],
+              direction: str = 'horizontal') -> Tensor:
+    """Flip bboxes horizontally or vertically.
+
+    Args:
+        bboxes (Tensor): Shape (..., 4*k)
+        img_shape (Tuple[int]): Image shape.
+        direction (str): Flip direction, options are "horizontal", "vertical",
+            "diagonal". Default: "horizontal"
+
+    Returns:
+        Tensor: Flipped bboxes.
+    """
+    assert bboxes.shape[-1] % 4 == 0
+    assert direction in ['horizontal', 'vertical', 'diagonal']
+    flipped = bboxes.clone()
+    if direction == 'horizontal':
+        flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4]
+        flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4]
+    elif direction == 'vertical':
+        flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4]
+        flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4]
+    else:
+        flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4]
+        flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4]
+        flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4]
+        flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4]
+    return flipped
+
+
+def bbox_mapping(bboxes: Tensor,
+                 img_shape: Tuple[int],
+                 scale_factor: Union[float, Tuple[float]],
+                 flip: bool,
+                 flip_direction: str = 'horizontal') -> Tensor:
+    """Map bboxes from the original image scale to testing scale."""
+    new_bboxes = bboxes * bboxes.new_tensor(scale_factor)
+    if flip:
+        new_bboxes = bbox_flip(new_bboxes, img_shape, flip_direction)
+    return new_bboxes
+
+
+def bbox_mapping_back(bboxes: Tensor,
+                      img_shape: Tuple[int],
+                      scale_factor: Union[float, Tuple[float]],
+                      flip: bool,
+                      flip_direction: str = 'horizontal') -> Tensor:
+    """Map bboxes from testing scale to original image scale."""
+    new_bboxes = bbox_flip(bboxes, img_shape,
+                           flip_direction) if flip else bboxes
+    new_bboxes = new_bboxes.view(-1, 4) / new_bboxes.new_tensor(scale_factor)
+    return new_bboxes.view(bboxes.shape)
+
+
+def bbox2roi(bbox_list: List[Union[Tensor, BaseBoxes]]) -> Tensor:
+    """Convert a list of bboxes to roi format.
+
+    Args:
+        bbox_list (List[Union[Tensor, :obj:`BaseBoxes`]): a list of bboxes
+            corresponding to a batch of images.
+
+    Returns:
+        Tensor: shape (n, box_dim + 1), where ``box_dim`` depends on the
+        different box types. For example, If the box type in ``bbox_list``
+        is HorizontalBoxes, the output shape is (n, 5). Each row of data
+        indicates [batch_ind, x1, y1, x2, y2].
+    """
+    rois_list = []
+    for img_id, bboxes in enumerate(bbox_list):
+        bboxes = get_box_tensor(bboxes)
+        img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
+        rois = torch.cat([img_inds, bboxes], dim=-1)
+        rois_list.append(rois)
+    rois = torch.cat(rois_list, 0)
+    return rois
+
+
+def roi2bbox(rois: Tensor) -> List[Tensor]:
+    """Convert rois to bounding box format.
+
+    Args:
+        rois (Tensor): RoIs with the shape (n, 5) where the first
+            column indicates batch id of each RoI.
+
+    Returns:
+        List[Tensor]: Converted boxes of corresponding rois.
+    """
+    bbox_list = []
+    img_ids = torch.unique(rois[:, 0].cpu(), sorted=True)
+    for img_id in img_ids:
+        inds = (rois[:, 0] == img_id.item())
+        bbox = rois[inds, 1:]
+        bbox_list.append(bbox)
+    return bbox_list
+
+
+# TODO remove later
+def bbox2result(bboxes: Union[Tensor, np.ndarray], labels: Union[Tensor,
+                                                                 np.ndarray],
+                num_classes: int) -> List[np.ndarray]:
+    """Convert detection results to a list of numpy arrays.
+
+    Args:
+        bboxes (Tensor | np.ndarray): shape (n, 5)
+        labels (Tensor | np.ndarray): shape (n, )
+        num_classes (int): class number, including background class
+
+    Returns:
+        List(np.ndarray]): bbox results of each class
+    """
+    if bboxes.shape[0] == 0:
+        return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)]
+    else:
+        if isinstance(bboxes, torch.Tensor):
+            bboxes = bboxes.detach().cpu().numpy()
+            labels = labels.detach().cpu().numpy()
+        return [bboxes[labels == i, :] for i in range(num_classes)]
+
+
+def distance2bbox(
+    points: Tensor,
+    distance: Tensor,
+    max_shape: Optional[Union[Sequence[int], Tensor,
+                              Sequence[Sequence[int]]]] = None
+) -> Tensor:
+    """Decode distance prediction to bounding box.
+
+    Args:
+        points (Tensor): Shape (B, N, 2) or (N, 2).
+        distance (Tensor): Distance from the given point to 4
+            boundaries (left, top, right, bottom). Shape (B, N, 4) or (N, 4)
+        max_shape (Union[Sequence[int], Tensor, Sequence[Sequence[int]]],
+            optional): Maximum bounds for boxes, specifies
+            (H, W, C) or (H, W). If priors shape is (B, N, 4), then
+            the max_shape should be a Sequence[Sequence[int]]
+            and the length of max_shape should also be B.
+
+    Returns:
+        Tensor: Boxes with shape (N, 4) or (B, N, 4)
+    """
+
+    x1 = points[..., 0] - distance[..., 0]
+    y1 = points[..., 1] - distance[..., 1]
+    x2 = points[..., 0] + distance[..., 2]
+    y2 = points[..., 1] + distance[..., 3]
+
+    bboxes = torch.stack([x1, y1, x2, y2], -1)
+
+    if max_shape is not None:
+        if bboxes.dim() == 2 and not torch.onnx.is_in_onnx_export():
+            # speed up
+            bboxes[:, 0::2].clamp_(min=0, max=max_shape[1])
+            bboxes[:, 1::2].clamp_(min=0, max=max_shape[0])
+            return bboxes
+
+        # clip bboxes with dynamic `min` and `max` for onnx
+        if torch.onnx.is_in_onnx_export():
+            # TODO: delete
+            from mmdet.core.export import dynamic_clip_for_onnx
+            x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape)
+            bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+            return bboxes
+        if not isinstance(max_shape, torch.Tensor):
+            max_shape = x1.new_tensor(max_shape)
+        max_shape = max_shape[..., :2].type_as(x1)
+        if max_shape.ndim == 2:
+            assert bboxes.ndim == 3
+            assert max_shape.size(0) == bboxes.size(0)
+
+        min_xy = x1.new_tensor(0)
+        max_xy = torch.cat([max_shape, max_shape],
+                           dim=-1).flip(-1).unsqueeze(-2)
+        bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+        bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+    return bboxes
+
+
+def bbox2distance(points: Tensor,
+                  bbox: Tensor,
+                  max_dis: Optional[float] = None,
+                  eps: float = 0.1) -> Tensor:
+    """Decode bounding box based on distances.
+
+    Args:
+        points (Tensor): Shape (n, 2) or (b, n, 2), [x, y].
+        bbox (Tensor): Shape (n, 4) or (b, n, 4), "xyxy" format
+        max_dis (float, optional): Upper bound of the distance.
+        eps (float): a small value to ensure target < max_dis, instead <=
+
+    Returns:
+        Tensor: Decoded distances.
+    """
+    left = points[..., 0] - bbox[..., 0]
+    top = points[..., 1] - bbox[..., 1]
+    right = bbox[..., 2] - points[..., 0]
+    bottom = bbox[..., 3] - points[..., 1]
+    if max_dis is not None:
+        left = left.clamp(min=0, max=max_dis - eps)
+        top = top.clamp(min=0, max=max_dis - eps)
+        right = right.clamp(min=0, max=max_dis - eps)
+        bottom = bottom.clamp(min=0, max=max_dis - eps)
+    return torch.stack([left, top, right, bottom], -1)
+
+
+def bbox_rescale(bboxes: Tensor, scale_factor: float = 1.0) -> Tensor:
+    """Rescale bounding box w.r.t. scale_factor.
+
+    Args:
+        bboxes (Tensor): Shape (n, 4) for bboxes or (n, 5) for rois
+        scale_factor (float): rescale factor
+
+    Returns:
+        Tensor: Rescaled bboxes.
+    """
+    if bboxes.size(1) == 5:
+        bboxes_ = bboxes[:, 1:]
+        inds_ = bboxes[:, 0]
+    else:
+        bboxes_ = bboxes
+    cx = (bboxes_[:, 0] + bboxes_[:, 2]) * 0.5
+    cy = (bboxes_[:, 1] + bboxes_[:, 3]) * 0.5
+    w = bboxes_[:, 2] - bboxes_[:, 0]
+    h = bboxes_[:, 3] - bboxes_[:, 1]
+    w = w * scale_factor
+    h = h * scale_factor
+    x1 = cx - 0.5 * w
+    x2 = cx + 0.5 * w
+    y1 = cy - 0.5 * h
+    y2 = cy + 0.5 * h
+    if bboxes.size(1) == 5:
+        rescaled_bboxes = torch.stack([inds_, x1, y1, x2, y2], dim=-1)
+    else:
+        rescaled_bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+    return rescaled_bboxes
+
+
+def bbox_cxcywh_to_xyxy(bbox: Tensor) -> Tensor:
+    """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, x2, y2).
+
+    Args:
+        bbox (Tensor): Shape (n, 4) for bboxes.
+
+    Returns:
+        Tensor: Converted bboxes.
+    """
+    cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1)
+    bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), (cx + 0.5 * w), (cy + 0.5 * h)]
+    return torch.cat(bbox_new, dim=-1)
+
+
+def bbox_xyxy_to_cxcywh(bbox: Tensor) -> Tensor:
+    """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, w, h).
+
+    Args:
+        bbox (Tensor): Shape (n, 4) for bboxes.
+
+    Returns:
+        Tensor: Converted bboxes.
+    """
+    x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1)
+    bbox_new = [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)]
+    return torch.cat(bbox_new, dim=-1)
+
+
+def bbox2corner(bboxes: torch.Tensor) -> torch.Tensor:
+    """Convert bbox coordinates from (x1, y1, x2, y2) to corners ((x1, y1),
+    (x2, y1), (x1, y2), (x2, y2)).
+
+    Args:
+        bboxes (Tensor): Shape (n, 4) for bboxes.
+    Returns:
+        Tensor: Shape (n*4, 2) for corners.
+    """
+    x1, y1, x2, y2 = torch.split(bboxes, 1, dim=1)
+    return torch.cat([x1, y1, x2, y1, x1, y2, x2, y2], dim=1).reshape(-1, 2)
+
+
+def corner2bbox(corners: torch.Tensor) -> torch.Tensor:
+    """Convert bbox coordinates from corners ((x1, y1), (x2, y1), (x1, y2),
+    (x2, y2)) to (x1, y1, x2, y2).
+
+    Args:
+        corners (Tensor): Shape (n*4, 2) for corners.
+    Returns:
+        Tensor: Shape (n, 4) for bboxes.
+    """
+    corners = corners.reshape(-1, 4, 2)
+    min_xy = corners.min(dim=1)[0]
+    max_xy = corners.max(dim=1)[0]
+    return torch.cat([min_xy, max_xy], dim=1)
+
+
+def bbox_project(
+    bboxes: Union[torch.Tensor, np.ndarray],
+    homography_matrix: Union[torch.Tensor, np.ndarray],
+    img_shape: Optional[Tuple[int, int]] = None
+) -> Union[torch.Tensor, np.ndarray]:
+    """Geometric transformation for bbox.
+
+    Args:
+        bboxes (Union[torch.Tensor, np.ndarray]): Shape (n, 4) for bboxes.
+        homography_matrix (Union[torch.Tensor, np.ndarray]):
+            Shape (3, 3) for geometric transformation.
+        img_shape (Tuple[int, int], optional): Image shape. Defaults to None.
+    Returns:
+        Union[torch.Tensor, np.ndarray]: Converted bboxes.
+    """
+    bboxes_type = type(bboxes)
+    if bboxes_type is np.ndarray:
+        bboxes = torch.from_numpy(bboxes)
+    if isinstance(homography_matrix, np.ndarray):
+        homography_matrix = torch.from_numpy(homography_matrix)
+    corners = bbox2corner(bboxes)
+    corners = torch.cat(
+        [corners, corners.new_ones(corners.shape[0], 1)], dim=1)
+    corners = torch.matmul(homography_matrix, corners.t()).t()
+    # Convert to homogeneous coordinates by normalization
+    corners = corners[:, :2] / corners[:, 2:3]
+    bboxes = corner2bbox(corners)
+    if img_shape is not None:
+        bboxes[:, 0::2] = bboxes[:, 0::2].clamp(0, img_shape[1])
+        bboxes[:, 1::2] = bboxes[:, 1::2].clamp(0, img_shape[0])
+    if bboxes_type is np.ndarray:
+        bboxes = bboxes.numpy()
+    return bboxes
+
+
+def cat_boxes(data_list: List[Union[Tensor, BaseBoxes]],
+              dim: int = 0) -> Union[Tensor, BaseBoxes]:
+    """Concatenate boxes with type of tensor or box type.
+
+    Args:
+        data_list (List[Union[Tensor, :obj:`BaseBoxes`]]): A list of tensors
+            or box types need to be concatenated.
+            dim (int): The dimension over which the box are concatenated.
+                Defaults to 0.
+
+    Returns:
+        Union[Tensor, :obj`BaseBoxes`]: Concatenated results.
+    """
+    if data_list and isinstance(data_list[0], BaseBoxes):
+        return data_list[0].cat(data_list, dim=dim)
+    else:
+        return torch.cat(data_list, dim=dim)
+
+
+def stack_boxes(data_list: List[Union[Tensor, BaseBoxes]],
+                dim: int = 0) -> Union[Tensor, BaseBoxes]:
+    """Stack boxes with type of tensor or box type.
+
+    Args:
+        data_list (List[Union[Tensor, :obj:`BaseBoxes`]]): A list of tensors
+            or box types need to be stacked.
+            dim (int): The dimension over which the box are stacked.
+                Defaults to 0.
+
+    Returns:
+        Union[Tensor, :obj`BaseBoxes`]: Stacked results.
+    """
+    if data_list and isinstance(data_list[0], BaseBoxes):
+        return data_list[0].stack(data_list, dim=dim)
+    else:
+        return torch.stack(data_list, dim=dim)
+
+
+def scale_boxes(boxes: Union[Tensor, BaseBoxes],
+                scale_factor: Tuple[float, float]) -> Union[Tensor, BaseBoxes]:
+    """Scale boxes with type of tensor or box type.
+
+    Args:
+        boxes (Tensor or :obj:`BaseBoxes`): boxes need to be scaled. Its type
+            can be a tensor or a box type.
+        scale_factor (Tuple[float, float]): factors for scaling boxes.
+            The length should be 2.
+
+    Returns:
+        Union[Tensor, :obj:`BaseBoxes`]: Scaled boxes.
+    """
+    if isinstance(boxes, BaseBoxes):
+        boxes.rescale_(scale_factor)
+        return boxes
+    else:
+        # Tensor boxes will be treated as horizontal boxes
+        repeat_num = int(boxes.size(-1) / 2)
+        scale_factor = boxes.new_tensor(scale_factor).repeat((1, repeat_num))
+        return boxes * scale_factor
+
+
+def get_box_wh(boxes: Union[Tensor, BaseBoxes]) -> Tuple[Tensor, Tensor]:
+    """Get the width and height of boxes with type of tensor or box type.
+
+    Args:
+        boxes (Tensor or :obj:`BaseBoxes`): boxes with type of tensor
+            or box type.
+
+    Returns:
+        Tuple[Tensor, Tensor]: the width and height of boxes.
+    """
+    if isinstance(boxes, BaseBoxes):
+        w = boxes.widths
+        h = boxes.heights
+    else:
+        # Tensor boxes will be treated as horizontal boxes by defaults
+        w = boxes[:, 2] - boxes[:, 0]
+        h = boxes[:, 3] - boxes[:, 1]
+    return w, h
+
+
+def get_box_tensor(boxes: Union[Tensor, BaseBoxes]) -> Tensor:
+    """Get tensor data from box type boxes.
+
+    Args:
+        boxes (Tensor or BaseBoxes): boxes with type of tensor or box type.
+            If its type is a tensor, the boxes will be directly returned.
+            If its type is a box type, the `boxes.tensor` will be returned.
+
+    Returns:
+        Tensor: boxes tensor.
+    """
+    if isinstance(boxes, BaseBoxes):
+        boxes = boxes.tensor
+    return boxes
+
+
+def empty_box_as(boxes: Union[Tensor, BaseBoxes]) -> Union[Tensor, BaseBoxes]:
+    """Generate empty box according to input ``boxes` type and device.
+
+    Args:
+        boxes (Tensor or :obj:`BaseBoxes`): boxes with type of tensor
+            or box type.
+
+    Returns:
+        Union[Tensor, BaseBoxes]: Generated empty box.
+    """
+    if isinstance(boxes, BaseBoxes):
+        return boxes.empty_boxes()
+    else:
+        # Tensor boxes will be treated as horizontal boxes by defaults
+        return boxes.new_zeros(0, 4)
+
+
+def bbox_xyxy_to_cxcyah(bboxes: torch.Tensor) -> torch.Tensor:
+    """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, ratio, h).
+
+    Args:
+        bbox (Tensor): Shape (n, 4) for bboxes.
+
+    Returns:
+        Tensor: Converted bboxes.
+    """
+    cx = (bboxes[:, 2] + bboxes[:, 0]) / 2
+    cy = (bboxes[:, 3] + bboxes[:, 1]) / 2
+    w = bboxes[:, 2] - bboxes[:, 0]
+    h = bboxes[:, 3] - bboxes[:, 1]
+    xyah = torch.stack([cx, cy, w / h, h], -1)
+    return xyah
+
+
+def bbox_cxcyah_to_xyxy(bboxes: torch.Tensor) -> torch.Tensor:
+    """Convert bbox coordinates from (cx, cy, ratio, h) to (x1, y1, x2, y2).
+
+    Args:
+        bbox (Tensor): Shape (n, 4) for bboxes.
+    Returns:
+        Tensor: Converted bboxes.
+    """
+    cx, cy, ratio, h = bboxes.split((1, 1, 1, 1), dim=-1)
+    w = ratio * h
+    x1y1x2y2 = [cx - w / 2.0, cy - h / 2.0, cx + w / 2.0, cy + h / 2.0]
+    return torch.cat(x1y1x2y2, dim=-1)
diff --git a/mmde/mmdet/structures/det_data_sample.py b/mmde/mmdet/structures/det_data_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..37dd74725ed2ff5eb8a088c9d23a9ac5469b07a3
--- /dev/null
+++ b/mmde/mmdet/structures/det_data_sample.py
@@ -0,0 +1,237 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+from mmengine.structures import BaseDataElement, InstanceData, PixelData
+
+
+class DetDataSample(BaseDataElement):
+    """A data structure interface of MMDetection. They are used as interfaces
+    between different components.
+
+    The attributes in ``DetDataSample`` are divided into several parts:
+
+        - ``proposals``(InstanceData): Region proposals used in two-stage
+            detectors.
+        - ``gt_instances``(InstanceData): Ground truth of instance annotations.
+        - ``pred_instances``(InstanceData): Instances of detection predictions.
+        - ``pred_track_instances``(InstanceData): Instances of tracking
+            predictions.
+        - ``ignored_instances``(InstanceData): Instances to be ignored during
+            training/testing.
+        - ``gt_panoptic_seg``(PixelData): Ground truth of panoptic
+            segmentation.
+        - ``pred_panoptic_seg``(PixelData): Prediction of panoptic
+           segmentation.
+        - ``gt_sem_seg``(PixelData): Ground truth of semantic segmentation.
+        - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation.
+
+    Examples:
+         >>> import torch
+         >>> import numpy as np
+         >>> from mmengine.structures import InstanceData
+         >>> from mmdet.structures import DetDataSample
+
+         >>> data_sample = DetDataSample()
+         >>> img_meta = dict(img_shape=(800, 1196),
+         ...                 pad_shape=(800, 1216))
+         >>> gt_instances = InstanceData(metainfo=img_meta)
+         >>> gt_instances.bboxes = torch.rand((5, 4))
+         >>> gt_instances.labels = torch.rand((5,))
+         >>> data_sample.gt_instances = gt_instances
+         >>> assert 'img_shape' in data_sample.gt_instances.metainfo_keys()
+         >>> len(data_sample.gt_instances)
+         5
+         >>> print(data_sample)
+        <DetDataSample(
+
+            META INFORMATION
+
+            DATA FIELDS
+            gt_instances: <InstanceData(
+
+                    META INFORMATION
+                    pad_shape: (800, 1216)
+                    img_shape: (800, 1196)
+
+                    DATA FIELDS
+                    labels: tensor([0.8533, 0.1550, 0.5433, 0.7294, 0.5098])
+                    bboxes:
+                    tensor([[9.7725e-01, 5.8417e-01, 1.7269e-01, 6.5694e-01],
+                            [1.7894e-01, 5.1780e-01, 7.0590e-01, 4.8589e-01],
+                            [7.0392e-01, 6.6770e-01, 1.7520e-01, 1.4267e-01],
+                            [2.2411e-01, 5.1962e-01, 9.6953e-01, 6.6994e-01],
+                            [4.1338e-01, 2.1165e-01, 2.7239e-04, 6.8477e-01]])
+                ) at 0x7f21fb1b9190>
+        ) at 0x7f21fb1b9880>
+         >>> pred_instances = InstanceData(metainfo=img_meta)
+         >>> pred_instances.bboxes = torch.rand((5, 4))
+         >>> pred_instances.scores = torch.rand((5,))
+         >>> data_sample = DetDataSample(pred_instances=pred_instances)
+         >>> assert 'pred_instances' in data_sample
+
+         >>> pred_track_instances = InstanceData(metainfo=img_meta)
+         >>> pred_track_instances.bboxes = torch.rand((5, 4))
+         >>> pred_track_instances.scores = torch.rand((5,))
+         >>> data_sample = DetDataSample(
+         ...    pred_track_instances=pred_track_instances)
+         >>> assert 'pred_track_instances' in data_sample
+
+         >>> data_sample = DetDataSample()
+         >>> gt_instances_data = dict(
+         ...                        bboxes=torch.rand(2, 4),
+         ...                        labels=torch.rand(2),
+         ...                        masks=np.random.rand(2, 2, 2))
+         >>> gt_instances = InstanceData(**gt_instances_data)
+         >>> data_sample.gt_instances = gt_instances
+         >>> assert 'gt_instances' in data_sample
+         >>> assert 'masks' in data_sample.gt_instances
+
+         >>> data_sample = DetDataSample()
+         >>> gt_panoptic_seg_data = dict(panoptic_seg=torch.rand(2, 4))
+         >>> gt_panoptic_seg = PixelData(**gt_panoptic_seg_data)
+         >>> data_sample.gt_panoptic_seg = gt_panoptic_seg
+         >>> print(data_sample)
+        <DetDataSample(
+
+            META INFORMATION
+
+            DATA FIELDS
+            _gt_panoptic_seg: <BaseDataElement(
+
+                    META INFORMATION
+
+                    DATA FIELDS
+                    panoptic_seg: tensor([[0.7586, 0.1262, 0.2892, 0.9341],
+                                [0.3200, 0.7448, 0.1052, 0.5371]])
+                ) at 0x7f66c2bb7730>
+            gt_panoptic_seg: <BaseDataElement(
+
+                    META INFORMATION
+
+                    DATA FIELDS
+                    panoptic_seg: tensor([[0.7586, 0.1262, 0.2892, 0.9341],
+                                [0.3200, 0.7448, 0.1052, 0.5371]])
+                ) at 0x7f66c2bb7730>
+        ) at 0x7f66c2bb7280>
+        >>> data_sample = DetDataSample()
+        >>> gt_segm_seg_data = dict(segm_seg=torch.rand(2, 2, 2))
+        >>> gt_segm_seg = PixelData(**gt_segm_seg_data)
+        >>> data_sample.gt_segm_seg = gt_segm_seg
+        >>> assert 'gt_segm_seg' in data_sample
+        >>> assert 'segm_seg' in data_sample.gt_segm_seg
+    """
+
+    @property
+    def proposals(self) -> InstanceData:
+        return self._proposals
+
+    @proposals.setter
+    def proposals(self, value: InstanceData):
+        self.set_field(value, '_proposals', dtype=InstanceData)
+
+    @proposals.deleter
+    def proposals(self):
+        del self._proposals
+
+    @property
+    def gt_instances(self) -> InstanceData:
+        return self._gt_instances
+
+    @gt_instances.setter
+    def gt_instances(self, value: InstanceData):
+        self.set_field(value, '_gt_instances', dtype=InstanceData)
+
+    @gt_instances.deleter
+    def gt_instances(self):
+        del self._gt_instances
+
+    @property
+    def pred_instances(self) -> InstanceData:
+        return self._pred_instances
+
+    @pred_instances.setter
+    def pred_instances(self, value: InstanceData):
+        self.set_field(value, '_pred_instances', dtype=InstanceData)
+
+    @pred_instances.deleter
+    def pred_instances(self):
+        del self._pred_instances
+
+    # directly add ``pred_track_instances`` in ``DetDataSample``
+    # so that the ``TrackDataSample`` does not bother to access the
+    # instance-level information.
+    @property
+    def pred_track_instances(self) -> InstanceData:
+        return self._pred_track_instances
+
+    @pred_track_instances.setter
+    def pred_track_instances(self, value: InstanceData):
+        self.set_field(value, '_pred_track_instances', dtype=InstanceData)
+
+    @pred_track_instances.deleter
+    def pred_track_instances(self):
+        del self._pred_track_instances
+
+    @property
+    def ignored_instances(self) -> InstanceData:
+        return self._ignored_instances
+
+    @ignored_instances.setter
+    def ignored_instances(self, value: InstanceData):
+        self.set_field(value, '_ignored_instances', dtype=InstanceData)
+
+    @ignored_instances.deleter
+    def ignored_instances(self):
+        del self._ignored_instances
+
+    @property
+    def gt_panoptic_seg(self) -> PixelData:
+        return self._gt_panoptic_seg
+
+    @gt_panoptic_seg.setter
+    def gt_panoptic_seg(self, value: PixelData):
+        self.set_field(value, '_gt_panoptic_seg', dtype=PixelData)
+
+    @gt_panoptic_seg.deleter
+    def gt_panoptic_seg(self):
+        del self._gt_panoptic_seg
+
+    @property
+    def pred_panoptic_seg(self) -> PixelData:
+        return self._pred_panoptic_seg
+
+    @pred_panoptic_seg.setter
+    def pred_panoptic_seg(self, value: PixelData):
+        self.set_field(value, '_pred_panoptic_seg', dtype=PixelData)
+
+    @pred_panoptic_seg.deleter
+    def pred_panoptic_seg(self):
+        del self._pred_panoptic_seg
+
+    @property
+    def gt_sem_seg(self) -> PixelData:
+        return self._gt_sem_seg
+
+    @gt_sem_seg.setter
+    def gt_sem_seg(self, value: PixelData):
+        self.set_field(value, '_gt_sem_seg', dtype=PixelData)
+
+    @gt_sem_seg.deleter
+    def gt_sem_seg(self):
+        del self._gt_sem_seg
+
+    @property
+    def pred_sem_seg(self) -> PixelData:
+        return self._pred_sem_seg
+
+    @pred_sem_seg.setter
+    def pred_sem_seg(self, value: PixelData):
+        self.set_field(value, '_pred_sem_seg', dtype=PixelData)
+
+    @pred_sem_seg.deleter
+    def pred_sem_seg(self):
+        del self._pred_sem_seg
+
+
+SampleList = List[DetDataSample]
+OptSampleList = Optional[SampleList]
diff --git a/mmde/mmdet/structures/mask/__init__.py b/mmde/mmdet/structures/mask/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f78394701df1b493259c4c23a79aea5c5cb8be95
--- /dev/null
+++ b/mmde/mmdet/structures/mask/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .mask_target import mask_target
+from .structures import (BaseInstanceMasks, BitmapMasks, PolygonMasks,
+                         bitmap_to_polygon, polygon_to_bitmap)
+from .utils import encode_mask_results, mask2bbox, split_combined_polys
+
+__all__ = [
+    'split_combined_polys', 'mask_target', 'BaseInstanceMasks', 'BitmapMasks',
+    'PolygonMasks', 'encode_mask_results', 'mask2bbox', 'polygon_to_bitmap',
+    'bitmap_to_polygon'
+]
diff --git a/mmde/mmdet/structures/mask/mask_target.py b/mmde/mmdet/structures/mask/mask_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2fc5f1878300446b114c9f57c6a885fea8c927c
--- /dev/null
+++ b/mmde/mmdet/structures/mask/mask_target.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from torch.nn.modules.utils import _pair
+
+
+def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list,
+                cfg):
+    """Compute mask target for positive proposals in multiple images.
+
+    Args:
+        pos_proposals_list (list[Tensor]): Positive proposals in multiple
+            images, each has shape (num_pos, 4).
+        pos_assigned_gt_inds_list (list[Tensor]): Assigned GT indices for each
+            positive proposals, each has shape (num_pos,).
+        gt_masks_list (list[:obj:`BaseInstanceMasks`]): Ground truth masks of
+            each image.
+        cfg (dict): Config dict that specifies the mask size.
+
+    Returns:
+        Tensor: Mask target of each image, has shape (num_pos, w, h).
+
+    Example:
+        >>> from mmengine.config import Config
+        >>> import mmdet
+        >>> from mmdet.data_elements.mask import BitmapMasks
+        >>> from mmdet.data_elements.mask.mask_target import *
+        >>> H, W = 17, 18
+        >>> cfg = Config({'mask_size': (13, 14)})
+        >>> rng = np.random.RandomState(0)
+        >>> # Positive proposals (tl_x, tl_y, br_x, br_y) for each image
+        >>> pos_proposals_list = [
+        >>>     torch.Tensor([
+        >>>         [ 7.2425,  5.5929, 13.9414, 14.9541],
+        >>>         [ 7.3241,  3.6170, 16.3850, 15.3102],
+        >>>     ]),
+        >>>     torch.Tensor([
+        >>>         [ 4.8448, 6.4010, 7.0314, 9.7681],
+        >>>         [ 5.9790, 2.6989, 7.4416, 4.8580],
+        >>>         [ 0.0000, 0.0000, 0.1398, 9.8232],
+        >>>     ]),
+        >>> ]
+        >>> # Corresponding class index for each proposal for each image
+        >>> pos_assigned_gt_inds_list = [
+        >>>     torch.LongTensor([7, 0]),
+        >>>     torch.LongTensor([5, 4, 1]),
+        >>> ]
+        >>> # Ground truth mask for each true object for each image
+        >>> gt_masks_list = [
+        >>>     BitmapMasks(rng.rand(8, H, W), height=H, width=W),
+        >>>     BitmapMasks(rng.rand(6, H, W), height=H, width=W),
+        >>> ]
+        >>> mask_targets = mask_target(
+        >>>     pos_proposals_list, pos_assigned_gt_inds_list,
+        >>>     gt_masks_list, cfg)
+        >>> assert mask_targets.shape == (5,) + cfg['mask_size']
+    """
+    cfg_list = [cfg for _ in range(len(pos_proposals_list))]
+    mask_targets = map(mask_target_single, pos_proposals_list,
+                       pos_assigned_gt_inds_list, gt_masks_list, cfg_list)
+    mask_targets = list(mask_targets)
+    if len(mask_targets) > 0:
+        mask_targets = torch.cat(mask_targets)
+    return mask_targets
+
+
+def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg):
+    """Compute mask target for each positive proposal in the image.
+
+    Args:
+        pos_proposals (Tensor): Positive proposals.
+        pos_assigned_gt_inds (Tensor): Assigned GT inds of positive proposals.
+        gt_masks (:obj:`BaseInstanceMasks`): GT masks in the format of Bitmap
+            or Polygon.
+        cfg (dict): Config dict that indicate the mask size.
+
+    Returns:
+        Tensor: Mask target of each positive proposals in the image.
+
+    Example:
+        >>> from mmengine.config import Config
+        >>> import mmdet
+        >>> from mmdet.data_elements.mask import BitmapMasks
+        >>> from mmdet.data_elements.mask.mask_target import *  # NOQA
+        >>> H, W = 32, 32
+        >>> cfg = Config({'mask_size': (7, 11)})
+        >>> rng = np.random.RandomState(0)
+        >>> # Masks for each ground truth box (relative to the image)
+        >>> gt_masks_data = rng.rand(3, H, W)
+        >>> gt_masks = BitmapMasks(gt_masks_data, height=H, width=W)
+        >>> # Predicted positive boxes in one image
+        >>> pos_proposals = torch.FloatTensor([
+        >>>     [ 16.2,   5.5, 19.9, 20.9],
+        >>>     [ 17.3,  13.6, 19.3, 19.3],
+        >>>     [ 14.8,  16.4, 17.0, 23.7],
+        >>>     [  0.0,   0.0, 16.0, 16.0],
+        >>>     [  4.0,   0.0, 20.0, 16.0],
+        >>> ])
+        >>> # For each predicted proposal, its assignment to a gt mask
+        >>> pos_assigned_gt_inds = torch.LongTensor([0, 1, 2, 1, 1])
+        >>> mask_targets = mask_target_single(
+        >>>     pos_proposals, pos_assigned_gt_inds, gt_masks, cfg)
+        >>> assert mask_targets.shape == (5,) + cfg['mask_size']
+    """
+    device = pos_proposals.device
+    mask_size = _pair(cfg.mask_size)
+    binarize = not cfg.get('soft_mask_target', False)
+    num_pos = pos_proposals.size(0)
+    if num_pos > 0:
+        proposals_np = pos_proposals.cpu().numpy()
+        maxh, maxw = gt_masks.height, gt_masks.width
+        proposals_np[:, [0, 2]] = np.clip(proposals_np[:, [0, 2]], 0, maxw)
+        proposals_np[:, [1, 3]] = np.clip(proposals_np[:, [1, 3]], 0, maxh)
+        pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
+
+        mask_targets = gt_masks.crop_and_resize(
+            proposals_np,
+            mask_size,
+            device=device,
+            inds=pos_assigned_gt_inds,
+            binarize=binarize).to_ndarray()
+
+        mask_targets = torch.from_numpy(mask_targets).float().to(device)
+    else:
+        mask_targets = pos_proposals.new_zeros((0, ) + mask_size)
+
+    return mask_targets
diff --git a/mmde/mmdet/structures/mask/structures.py b/mmde/mmdet/structures/mask/structures.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4fdd27570b0d11d92eba4e8f854e153750135a4
--- /dev/null
+++ b/mmde/mmdet/structures/mask/structures.py
@@ -0,0 +1,1193 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+from abc import ABCMeta, abstractmethod
+from typing import Sequence, Type, TypeVar
+
+import cv2
+import mmcv
+import numpy as np
+import pycocotools.mask as maskUtils
+import shapely.geometry as geometry
+import torch
+from mmcv.ops.roi_align import roi_align
+
+T = TypeVar('T')
+
+
+class BaseInstanceMasks(metaclass=ABCMeta):
+    """Base class for instance masks."""
+
+    @abstractmethod
+    def rescale(self, scale, interpolation='nearest'):
+        """Rescale masks as large as possible while keeping the aspect ratio.
+        For details can refer to `mmcv.imrescale`.
+
+        Args:
+            scale (tuple[int]): The maximum size (h, w) of rescaled mask.
+            interpolation (str): Same as :func:`mmcv.imrescale`.
+
+        Returns:
+            BaseInstanceMasks: The rescaled masks.
+        """
+
+    @abstractmethod
+    def resize(self, out_shape, interpolation='nearest'):
+        """Resize masks to the given out_shape.
+
+        Args:
+            out_shape: Target (h, w) of resized mask.
+            interpolation (str): See :func:`mmcv.imresize`.
+
+        Returns:
+            BaseInstanceMasks: The resized masks.
+        """
+
+    @abstractmethod
+    def flip(self, flip_direction='horizontal'):
+        """Flip masks alone the given direction.
+
+        Args:
+            flip_direction (str): Either 'horizontal' or 'vertical'.
+
+        Returns:
+            BaseInstanceMasks: The flipped masks.
+        """
+
+    @abstractmethod
+    def pad(self, out_shape, pad_val):
+        """Pad masks to the given size of (h, w).
+
+        Args:
+            out_shape (tuple[int]): Target (h, w) of padded mask.
+            pad_val (int): The padded value.
+
+        Returns:
+            BaseInstanceMasks: The padded masks.
+        """
+
+    @abstractmethod
+    def crop(self, bbox):
+        """Crop each mask by the given bbox.
+
+        Args:
+            bbox (ndarray): Bbox in format [x1, y1, x2, y2], shape (4, ).
+
+        Return:
+            BaseInstanceMasks: The cropped masks.
+        """
+
+    @abstractmethod
+    def crop_and_resize(self,
+                        bboxes,
+                        out_shape,
+                        inds,
+                        device,
+                        interpolation='bilinear',
+                        binarize=True):
+        """Crop and resize masks by the given bboxes.
+
+        This function is mainly used in mask targets computation.
+        It firstly align mask to bboxes by assigned_inds, then crop mask by the
+        assigned bbox and resize to the size of (mask_h, mask_w)
+
+        Args:
+            bboxes (Tensor): Bboxes in format [x1, y1, x2, y2], shape (N, 4)
+            out_shape (tuple[int]): Target (h, w) of resized mask
+            inds (ndarray): Indexes to assign masks to each bbox,
+                shape (N,) and values should be between [0, num_masks - 1].
+            device (str): Device of bboxes
+            interpolation (str): See `mmcv.imresize`
+            binarize (bool): if True fractional values are rounded to 0 or 1
+                after the resize operation. if False and unsupported an error
+                will be raised. Defaults to True.
+
+        Return:
+            BaseInstanceMasks: the cropped and resized masks.
+        """
+
+    @abstractmethod
+    def expand(self, expanded_h, expanded_w, top, left):
+        """see :class:`Expand`."""
+
+    @property
+    @abstractmethod
+    def areas(self):
+        """ndarray: areas of each instance."""
+
+    @abstractmethod
+    def to_ndarray(self):
+        """Convert masks to the format of ndarray.
+
+        Return:
+            ndarray: Converted masks in the format of ndarray.
+        """
+
+    @abstractmethod
+    def to_tensor(self, dtype, device):
+        """Convert masks to the format of Tensor.
+
+        Args:
+            dtype (str): Dtype of converted mask.
+            device (torch.device): Device of converted masks.
+
+        Returns:
+            Tensor: Converted masks in the format of Tensor.
+        """
+
+    @abstractmethod
+    def translate(self,
+                  out_shape,
+                  offset,
+                  direction='horizontal',
+                  border_value=0,
+                  interpolation='bilinear'):
+        """Translate the masks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            offset (int | float): The offset for translate.
+            direction (str): The translate direction, either "horizontal"
+                or "vertical".
+            border_value (int | float): Border value. Default 0.
+            interpolation (str): Same as :func:`mmcv.imtranslate`.
+
+        Returns:
+            Translated masks.
+        """
+
+    def shear(self,
+              out_shape,
+              magnitude,
+              direction='horizontal',
+              border_value=0,
+              interpolation='bilinear'):
+        """Shear the masks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            magnitude (int | float): The magnitude used for shear.
+            direction (str): The shear direction, either "horizontal"
+                or "vertical".
+            border_value (int | tuple[int]): Value used in case of a
+                constant border. Default 0.
+            interpolation (str): Same as in :func:`mmcv.imshear`.
+
+        Returns:
+            ndarray: Sheared masks.
+        """
+
+    @abstractmethod
+    def rotate(self, out_shape, angle, center=None, scale=1.0, border_value=0):
+        """Rotate the masks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            angle (int | float): Rotation angle in degrees. Positive values
+                mean counter-clockwise rotation.
+            center (tuple[float], optional): Center point (w, h) of the
+                rotation in source image. If not specified, the center of
+                the image will be used.
+            scale (int | float): Isotropic scale factor.
+            border_value (int | float): Border value. Default 0 for masks.
+
+        Returns:
+            Rotated masks.
+        """
+
+    def get_bboxes(self, dst_type='hbb'):
+        """Get the certain type boxes from masks.
+
+        Please refer to ``mmdet.structures.bbox.box_type`` for more details of
+        the box type.
+
+        Args:
+            dst_type: Destination box type.
+
+        Returns:
+            :obj:`BaseBoxes`: Certain type boxes.
+        """
+        from ..bbox import get_box_type
+        _, box_type_cls = get_box_type(dst_type)
+        return box_type_cls.from_instance_masks(self)
+
+    @classmethod
+    @abstractmethod
+    def cat(cls: Type[T], masks: Sequence[T]) -> T:
+        """Concatenate a sequence of masks into one single mask instance.
+
+        Args:
+            masks (Sequence[T]): A sequence of mask instances.
+
+        Returns:
+            T: Concatenated mask instance.
+        """
+
+
+class BitmapMasks(BaseInstanceMasks):
+    """This class represents masks in the form of bitmaps.
+
+    Args:
+        masks (ndarray): ndarray of masks in shape (N, H, W), where N is
+            the number of objects.
+        height (int): height of masks
+        width (int): width of masks
+
+    Example:
+        >>> from mmdet.data_elements.mask.structures import *  # NOQA
+        >>> num_masks, H, W = 3, 32, 32
+        >>> rng = np.random.RandomState(0)
+        >>> masks = (rng.rand(num_masks, H, W) > 0.1).astype(np.int64)
+        >>> self = BitmapMasks(masks, height=H, width=W)
+
+        >>> # demo crop_and_resize
+        >>> num_boxes = 5
+        >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes)
+        >>> out_shape = (14, 14)
+        >>> inds = torch.randint(0, len(self), size=(num_boxes,))
+        >>> device = 'cpu'
+        >>> interpolation = 'bilinear'
+        >>> new = self.crop_and_resize(
+        ...     bboxes, out_shape, inds, device, interpolation)
+        >>> assert len(new) == num_boxes
+        >>> assert new.height, new.width == out_shape
+    """
+
+    def __init__(self, masks, height, width):
+        self.height = height
+        self.width = width
+        if len(masks) == 0:
+            self.masks = np.empty((0, self.height, self.width), dtype=np.uint8)
+        else:
+            assert isinstance(masks, (list, np.ndarray))
+            if isinstance(masks, list):
+                assert isinstance(masks[0], np.ndarray)
+                assert masks[0].ndim == 2  # (H, W)
+            else:
+                assert masks.ndim == 3  # (N, H, W)
+
+            self.masks = np.stack(masks).reshape(-1, height, width)
+            assert self.masks.shape[1] == self.height
+            assert self.masks.shape[2] == self.width
+
+    def __getitem__(self, index):
+        """Index the BitmapMask.
+
+        Args:
+            index (int | ndarray): Indices in the format of integer or ndarray.
+
+        Returns:
+            :obj:`BitmapMasks`: Indexed bitmap masks.
+        """
+        masks = self.masks[index].reshape(-1, self.height, self.width)
+        return BitmapMasks(masks, self.height, self.width)
+
+    def __iter__(self):
+        return iter(self.masks)
+
+    def __repr__(self):
+        s = self.__class__.__name__ + '('
+        s += f'num_masks={len(self.masks)}, '
+        s += f'height={self.height}, '
+        s += f'width={self.width})'
+        return s
+
+    def __len__(self):
+        """Number of masks."""
+        return len(self.masks)
+
+    def rescale(self, scale, interpolation='nearest'):
+        """See :func:`BaseInstanceMasks.rescale`."""
+        if len(self.masks) == 0:
+            new_w, new_h = mmcv.rescale_size((self.width, self.height), scale)
+            rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8)
+        else:
+            rescaled_masks = np.stack([
+                mmcv.imrescale(mask, scale, interpolation=interpolation)
+                for mask in self.masks
+            ])
+        height, width = rescaled_masks.shape[1:]
+        return BitmapMasks(rescaled_masks, height, width)
+
+    def resize(self, out_shape, interpolation='nearest'):
+        """See :func:`BaseInstanceMasks.resize`."""
+        if len(self.masks) == 0:
+            resized_masks = np.empty((0, *out_shape), dtype=np.uint8)
+        else:
+            resized_masks = np.stack([
+                mmcv.imresize(
+                    mask, out_shape[::-1], interpolation=interpolation)
+                for mask in self.masks
+            ])
+        return BitmapMasks(resized_masks, *out_shape)
+
+    def flip(self, flip_direction='horizontal'):
+        """See :func:`BaseInstanceMasks.flip`."""
+        assert flip_direction in ('horizontal', 'vertical', 'diagonal')
+
+        if len(self.masks) == 0:
+            flipped_masks = self.masks
+        else:
+            flipped_masks = np.stack([
+                mmcv.imflip(mask, direction=flip_direction)
+                for mask in self.masks
+            ])
+        return BitmapMasks(flipped_masks, self.height, self.width)
+
+    def pad(self, out_shape, pad_val=0):
+        """See :func:`BaseInstanceMasks.pad`."""
+        if len(self.masks) == 0:
+            padded_masks = np.empty((0, *out_shape), dtype=np.uint8)
+        else:
+            padded_masks = np.stack([
+                mmcv.impad(mask, shape=out_shape, pad_val=pad_val)
+                for mask in self.masks
+            ])
+        return BitmapMasks(padded_masks, *out_shape)
+
+    def crop(self, bbox):
+        """See :func:`BaseInstanceMasks.crop`."""
+        assert isinstance(bbox, np.ndarray)
+        assert bbox.ndim == 1
+
+        # clip the boundary
+        bbox = bbox.copy()
+        bbox[0::2] = np.clip(bbox[0::2], 0, self.width)
+        bbox[1::2] = np.clip(bbox[1::2], 0, self.height)
+        x1, y1, x2, y2 = bbox
+        w = np.maximum(x2 - x1, 1)
+        h = np.maximum(y2 - y1, 1)
+
+        if len(self.masks) == 0:
+            cropped_masks = np.empty((0, h, w), dtype=np.uint8)
+        else:
+            cropped_masks = self.masks[:, y1:y1 + h, x1:x1 + w]
+        return BitmapMasks(cropped_masks, h, w)
+
+    def crop_and_resize(self,
+                        bboxes,
+                        out_shape,
+                        inds,
+                        device='cpu',
+                        interpolation='bilinear',
+                        binarize=True):
+        """See :func:`BaseInstanceMasks.crop_and_resize`."""
+        if len(self.masks) == 0:
+            empty_masks = np.empty((0, *out_shape), dtype=np.uint8)
+            return BitmapMasks(empty_masks, *out_shape)
+
+        # convert bboxes to tensor
+        if isinstance(bboxes, np.ndarray):
+            bboxes = torch.from_numpy(bboxes).to(device=device)
+        if isinstance(inds, np.ndarray):
+            inds = torch.from_numpy(inds).to(device=device)
+
+        num_bbox = bboxes.shape[0]
+        fake_inds = torch.arange(
+            num_bbox, device=device).to(dtype=bboxes.dtype)[:, None]
+        rois = torch.cat([fake_inds, bboxes], dim=1)  # Nx5
+        rois = rois.to(device=device)
+        if num_bbox > 0:
+            gt_masks_th = torch.from_numpy(self.masks).to(device).index_select(
+                0, inds).to(dtype=rois.dtype)
+            targets = roi_align(gt_masks_th[:, None, :, :], rois, out_shape,
+                                1.0, 0, 'avg', True).squeeze(1)
+            if binarize:
+                resized_masks = (targets >= 0.5).cpu().numpy()
+            else:
+                resized_masks = targets.cpu().numpy()
+        else:
+            resized_masks = []
+        return BitmapMasks(resized_masks, *out_shape)
+
+    def expand(self, expanded_h, expanded_w, top, left):
+        """See :func:`BaseInstanceMasks.expand`."""
+        if len(self.masks) == 0:
+            expanded_mask = np.empty((0, expanded_h, expanded_w),
+                                     dtype=np.uint8)
+        else:
+            expanded_mask = np.zeros((len(self), expanded_h, expanded_w),
+                                     dtype=np.uint8)
+            expanded_mask[:, top:top + self.height,
+                          left:left + self.width] = self.masks
+        return BitmapMasks(expanded_mask, expanded_h, expanded_w)
+
+    def translate(self,
+                  out_shape,
+                  offset,
+                  direction='horizontal',
+                  border_value=0,
+                  interpolation='bilinear'):
+        """Translate the BitmapMasks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            offset (int | float): The offset for translate.
+            direction (str): The translate direction, either "horizontal"
+                or "vertical".
+            border_value (int | float): Border value. Default 0 for masks.
+            interpolation (str): Same as :func:`mmcv.imtranslate`.
+
+        Returns:
+            BitmapMasks: Translated BitmapMasks.
+
+        Example:
+            >>> from mmdet.data_elements.mask.structures import BitmapMasks
+            >>> self = BitmapMasks.random(dtype=np.uint8)
+            >>> out_shape = (32, 32)
+            >>> offset = 4
+            >>> direction = 'horizontal'
+            >>> border_value = 0
+            >>> interpolation = 'bilinear'
+            >>> # Note, There seem to be issues when:
+            >>> # * the mask dtype is not supported by cv2.AffineWarp
+            >>> new = self.translate(out_shape, offset, direction,
+            >>>                      border_value, interpolation)
+            >>> assert len(new) == len(self)
+            >>> assert new.height, new.width == out_shape
+        """
+        if len(self.masks) == 0:
+            translated_masks = np.empty((0, *out_shape), dtype=np.uint8)
+        else:
+            masks = self.masks
+            if masks.shape[-2:] != out_shape:
+                empty_masks = np.zeros((masks.shape[0], *out_shape),
+                                       dtype=masks.dtype)
+                min_h = min(out_shape[0], masks.shape[1])
+                min_w = min(out_shape[1], masks.shape[2])
+                empty_masks[:, :min_h, :min_w] = masks[:, :min_h, :min_w]
+                masks = empty_masks
+            translated_masks = mmcv.imtranslate(
+                masks.transpose((1, 2, 0)),
+                offset,
+                direction,
+                border_value=border_value,
+                interpolation=interpolation)
+            if translated_masks.ndim == 2:
+                translated_masks = translated_masks[:, :, None]
+            translated_masks = translated_masks.transpose(
+                (2, 0, 1)).astype(self.masks.dtype)
+        return BitmapMasks(translated_masks, *out_shape)
+
+    def shear(self,
+              out_shape,
+              magnitude,
+              direction='horizontal',
+              border_value=0,
+              interpolation='bilinear'):
+        """Shear the BitmapMasks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            magnitude (int | float): The magnitude used for shear.
+            direction (str): The shear direction, either "horizontal"
+                or "vertical".
+            border_value (int | tuple[int]): Value used in case of a
+                constant border.
+            interpolation (str): Same as in :func:`mmcv.imshear`.
+
+        Returns:
+            BitmapMasks: The sheared masks.
+        """
+        if len(self.masks) == 0:
+            sheared_masks = np.empty((0, *out_shape), dtype=np.uint8)
+        else:
+            sheared_masks = mmcv.imshear(
+                self.masks.transpose((1, 2, 0)),
+                magnitude,
+                direction,
+                border_value=border_value,
+                interpolation=interpolation)
+            if sheared_masks.ndim == 2:
+                sheared_masks = sheared_masks[:, :, None]
+            sheared_masks = sheared_masks.transpose(
+                (2, 0, 1)).astype(self.masks.dtype)
+        return BitmapMasks(sheared_masks, *out_shape)
+
+    def rotate(self,
+               out_shape,
+               angle,
+               center=None,
+               scale=1.0,
+               border_value=0,
+               interpolation='bilinear'):
+        """Rotate the BitmapMasks.
+
+        Args:
+            out_shape (tuple[int]): Shape for output mask, format (h, w).
+            angle (int | float): Rotation angle in degrees. Positive values
+                mean counter-clockwise rotation.
+            center (tuple[float], optional): Center point (w, h) of the
+                rotation in source image. If not specified, the center of
+                the image will be used.
+            scale (int | float): Isotropic scale factor.
+            border_value (int | float): Border value. Default 0 for masks.
+            interpolation (str): Same as in :func:`mmcv.imrotate`.
+
+        Returns:
+            BitmapMasks: Rotated BitmapMasks.
+        """
+        if len(self.masks) == 0:
+            rotated_masks = np.empty((0, *out_shape), dtype=self.masks.dtype)
+        else:
+            rotated_masks = mmcv.imrotate(
+                self.masks.transpose((1, 2, 0)),
+                angle,
+                center=center,
+                scale=scale,
+                border_value=border_value,
+                interpolation=interpolation)
+            if rotated_masks.ndim == 2:
+                # case when only one mask, (h, w)
+                rotated_masks = rotated_masks[:, :, None]  # (h, w, 1)
+            rotated_masks = rotated_masks.transpose(
+                (2, 0, 1)).astype(self.masks.dtype)
+        return BitmapMasks(rotated_masks, *out_shape)
+
+    @property
+    def areas(self):
+        """See :py:attr:`BaseInstanceMasks.areas`."""
+        return self.masks.sum((1, 2))
+
+    def to_ndarray(self):
+        """See :func:`BaseInstanceMasks.to_ndarray`."""
+        return self.masks
+
+    def to_tensor(self, dtype, device):
+        """See :func:`BaseInstanceMasks.to_tensor`."""
+        return torch.tensor(self.masks, dtype=dtype, device=device)
+
+    @classmethod
+    def random(cls,
+               num_masks=3,
+               height=32,
+               width=32,
+               dtype=np.uint8,
+               rng=None):
+        """Generate random bitmap masks for demo / testing purposes.
+
+        Example:
+            >>> from mmdet.data_elements.mask.structures import BitmapMasks
+            >>> self = BitmapMasks.random()
+            >>> print('self = {}'.format(self))
+            self = BitmapMasks(num_masks=3, height=32, width=32)
+        """
+        from mmdet.utils.util_random import ensure_rng
+        rng = ensure_rng(rng)
+        masks = (rng.rand(num_masks, height, width) > 0.1).astype(dtype)
+        self = cls(masks, height=height, width=width)
+        return self
+
+    @classmethod
+    def cat(cls: Type[T], masks: Sequence[T]) -> T:
+        """Concatenate a sequence of masks into one single mask instance.
+
+        Args:
+            masks (Sequence[BitmapMasks]): A sequence of mask instances.
+
+        Returns:
+            BitmapMasks: Concatenated mask instance.
+        """
+        assert isinstance(masks, Sequence)
+        if len(masks) == 0:
+            raise ValueError('masks should not be an empty list.')
+        assert all(isinstance(m, cls) for m in masks)
+
+        mask_array = np.concatenate([m.masks for m in masks], axis=0)
+        return cls(mask_array, *mask_array.shape[1:])
+
+
+class PolygonMasks(BaseInstanceMasks):
+    """This class represents masks in the form of polygons.
+
+    Polygons is a list of three levels. The first level of the list
+    corresponds to objects, the second level to the polys that compose the
+    object, the third level to the poly coordinates
+
+    Args:
+        masks (list[list[ndarray]]): The first level of the list
+            corresponds to objects, the second level to the polys that
+            compose the object, the third level to the poly coordinates
+        height (int): height of masks
+        width (int): width of masks
+
+    Example:
+        >>> from mmdet.data_elements.mask.structures import *  # NOQA
+        >>> masks = [
+        >>>     [ np.array([0, 0, 10, 0, 10, 10., 0, 10, 0, 0]) ]
+        >>> ]
+        >>> height, width = 16, 16
+        >>> self = PolygonMasks(masks, height, width)
+
+        >>> # demo translate
+        >>> new = self.translate((16, 16), 4., direction='horizontal')
+        >>> assert np.all(new.masks[0][0][1::2] == masks[0][0][1::2])
+        >>> assert np.all(new.masks[0][0][0::2] == masks[0][0][0::2] + 4)
+
+        >>> # demo crop_and_resize
+        >>> num_boxes = 3
+        >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes)
+        >>> out_shape = (16, 16)
+        >>> inds = torch.randint(0, len(self), size=(num_boxes,))
+        >>> device = 'cpu'
+        >>> interpolation = 'bilinear'
+        >>> new = self.crop_and_resize(
+        ...     bboxes, out_shape, inds, device, interpolation)
+        >>> assert len(new) == num_boxes
+        >>> assert new.height, new.width == out_shape
+    """
+
+    def __init__(self, masks, height, width):
+        assert isinstance(masks, list)
+        if len(masks) > 0:
+            assert isinstance(masks[0], list)
+            assert isinstance(masks[0][0], np.ndarray)
+
+        self.height = height
+        self.width = width
+        self.masks = masks
+
+    def __getitem__(self, index):
+        """Index the polygon masks.
+
+        Args:
+            index (ndarray | List): The indices.
+
+        Returns:
+            :obj:`PolygonMasks`: The indexed polygon masks.
+        """
+        if isinstance(index, np.ndarray):
+            if index.dtype == bool:
+                index = np.where(index)[0].tolist()
+            else:
+                index = index.tolist()
+        if isinstance(index, list):
+            masks = [self.masks[i] for i in index]
+        else:
+            try:
+                masks = self.masks[index]
+            except Exception:
+                raise ValueError(
+                    f'Unsupported input of type {type(index)} for indexing!')
+        if len(masks) and isinstance(masks[0], np.ndarray):
+            masks = [masks]  # ensure a list of three levels
+        return PolygonMasks(masks, self.height, self.width)
+
+    def __iter__(self):
+        return iter(self.masks)
+
+    def __repr__(self):
+        s = self.__class__.__name__ + '('
+        s += f'num_masks={len(self.masks)}, '
+        s += f'height={self.height}, '
+        s += f'width={self.width})'
+        return s
+
+    def __len__(self):
+        """Number of masks."""
+        return len(self.masks)
+
+    def rescale(self, scale, interpolation=None):
+        """see :func:`BaseInstanceMasks.rescale`"""
+        new_w, new_h = mmcv.rescale_size((self.width, self.height), scale)
+        if len(self.masks) == 0:
+            rescaled_masks = PolygonMasks([], new_h, new_w)
+        else:
+            rescaled_masks = self.resize((new_h, new_w))
+        return rescaled_masks
+
+    def resize(self, out_shape, interpolation=None):
+        """see :func:`BaseInstanceMasks.resize`"""
+        if len(self.masks) == 0:
+            resized_masks = PolygonMasks([], *out_shape)
+        else:
+            h_scale = out_shape[0] / self.height
+            w_scale = out_shape[1] / self.width
+            resized_masks = []
+            for poly_per_obj in self.masks:
+                resized_poly = []
+                for p in poly_per_obj:
+                    p = p.copy()
+                    p[0::2] = p[0::2] * w_scale
+                    p[1::2] = p[1::2] * h_scale
+                    resized_poly.append(p)
+                resized_masks.append(resized_poly)
+            resized_masks = PolygonMasks(resized_masks, *out_shape)
+        return resized_masks
+
+    def flip(self, flip_direction='horizontal'):
+        """see :func:`BaseInstanceMasks.flip`"""
+        assert flip_direction in ('horizontal', 'vertical', 'diagonal')
+        if len(self.masks) == 0:
+            flipped_masks = PolygonMasks([], self.height, self.width)
+        else:
+            flipped_masks = []
+            for poly_per_obj in self.masks:
+                flipped_poly_per_obj = []
+                for p in poly_per_obj:
+                    p = p.copy()
+                    if flip_direction == 'horizontal':
+                        p[0::2] = self.width - p[0::2]
+                    elif flip_direction == 'vertical':
+                        p[1::2] = self.height - p[1::2]
+                    else:
+                        p[0::2] = self.width - p[0::2]
+                        p[1::2] = self.height - p[1::2]
+                    flipped_poly_per_obj.append(p)
+                flipped_masks.append(flipped_poly_per_obj)
+            flipped_masks = PolygonMasks(flipped_masks, self.height,
+                                         self.width)
+        return flipped_masks
+
+    def crop(self, bbox):
+        """see :func:`BaseInstanceMasks.crop`"""
+        assert isinstance(bbox, np.ndarray)
+        assert bbox.ndim == 1
+
+        # clip the boundary
+        bbox = bbox.copy()
+        bbox[0::2] = np.clip(bbox[0::2], 0, self.width)
+        bbox[1::2] = np.clip(bbox[1::2], 0, self.height)
+        x1, y1, x2, y2 = bbox
+        w = np.maximum(x2 - x1, 1)
+        h = np.maximum(y2 - y1, 1)
+
+        if len(self.masks) == 0:
+            cropped_masks = PolygonMasks([], h, w)
+        else:
+            # reference: https://github.com/facebookresearch/fvcore/blob/main/fvcore/transforms/transform.py  # noqa
+            crop_box = geometry.box(x1, y1, x2, y2).buffer(0.0)
+            cropped_masks = []
+            # suppress shapely warnings util it incorporates GEOS>=3.11.2
+            # reference: https://github.com/shapely/shapely/issues/1345
+            initial_settings = np.seterr()
+            np.seterr(invalid='ignore')
+            for poly_per_obj in self.masks:
+                cropped_poly_per_obj = []
+                for p in poly_per_obj:
+                    p = p.copy()
+                    p = geometry.Polygon(p.reshape(-1, 2)).buffer(0.0)
+                    # polygon must be valid to perform intersection.
+                    if not p.is_valid:
+                        continue
+                    cropped = p.intersection(crop_box)
+                    if cropped.is_empty:
+                        continue
+                    if isinstance(cropped,
+                                  geometry.collection.BaseMultipartGeometry):
+                        cropped = cropped.geoms
+                    else:
+                        cropped = [cropped]
+                    # one polygon may be cropped to multiple ones
+                    for poly in cropped:
+                        # ignore lines or points
+                        if not isinstance(
+                                poly, geometry.Polygon) or not poly.is_valid:
+                            continue
+                        coords = np.asarray(poly.exterior.coords)
+                        # remove an extra identical vertex at the end
+                        coords = coords[:-1]
+                        coords[:, 0] -= x1
+                        coords[:, 1] -= y1
+                        cropped_poly_per_obj.append(coords.reshape(-1))
+                # a dummy polygon to avoid misalignment between masks and boxes
+                if len(cropped_poly_per_obj) == 0:
+                    cropped_poly_per_obj = [np.array([0, 0, 0, 0, 0, 0])]
+                cropped_masks.append(cropped_poly_per_obj)
+            np.seterr(**initial_settings)
+            cropped_masks = PolygonMasks(cropped_masks, h, w)
+        return cropped_masks
+
+    def pad(self, out_shape, pad_val=0):
+        """padding has no effect on polygons`"""
+        return PolygonMasks(self.masks, *out_shape)
+
+    def expand(self, *args, **kwargs):
+        """TODO: Add expand for polygon"""
+        raise NotImplementedError
+
+    def crop_and_resize(self,
+                        bboxes,
+                        out_shape,
+                        inds,
+                        device='cpu',
+                        interpolation='bilinear',
+                        binarize=True):
+        """see :func:`BaseInstanceMasks.crop_and_resize`"""
+        out_h, out_w = out_shape
+        if len(self.masks) == 0:
+            return PolygonMasks([], out_h, out_w)
+
+        if not binarize:
+            raise ValueError('Polygons are always binary, '
+                             'setting binarize=False is unsupported')
+
+        resized_masks = []
+        for i in range(len(bboxes)):
+            mask = self.masks[inds[i]]
+            bbox = bboxes[i, :]
+            x1, y1, x2, y2 = bbox
+            w = np.maximum(x2 - x1, 1)
+            h = np.maximum(y2 - y1, 1)
+            h_scale = out_h / max(h, 0.1)  # avoid too large scale
+            w_scale = out_w / max(w, 0.1)
+
+            resized_mask = []
+            for p in mask:
+                p = p.copy()
+                # crop
+                # pycocotools will clip the boundary
+                p[0::2] = p[0::2] - bbox[0]
+                p[1::2] = p[1::2] - bbox[1]
+
+                # resize
+                p[0::2] = p[0::2] * w_scale
+                p[1::2] = p[1::2] * h_scale
+                resized_mask.append(p)
+            resized_masks.append(resized_mask)
+        return PolygonMasks(resized_masks, *out_shape)
+
+    def translate(self,
+                  out_shape,
+                  offset,
+                  direction='horizontal',
+                  border_value=None,
+                  interpolation=None):
+        """Translate the PolygonMasks.
+
+        Example:
+            >>> self = PolygonMasks.random(dtype=np.int64)
+            >>> out_shape = (self.height, self.width)
+            >>> new = self.translate(out_shape, 4., direction='horizontal')
+            >>> assert np.all(new.masks[0][0][1::2] == self.masks[0][0][1::2])
+            >>> assert np.all(new.masks[0][0][0::2] == self.masks[0][0][0::2] + 4)  # noqa: E501
+        """
+        assert border_value is None or border_value == 0, \
+            'Here border_value is not '\
+            f'used, and defaultly should be None or 0. got {border_value}.'
+        if len(self.masks) == 0:
+            translated_masks = PolygonMasks([], *out_shape)
+        else:
+            translated_masks = []
+            for poly_per_obj in self.masks:
+                translated_poly_per_obj = []
+                for p in poly_per_obj:
+                    p = p.copy()
+                    if direction == 'horizontal':
+                        p[0::2] = np.clip(p[0::2] + offset, 0, out_shape[1])
+                    elif direction == 'vertical':
+                        p[1::2] = np.clip(p[1::2] + offset, 0, out_shape[0])
+                    translated_poly_per_obj.append(p)
+                translated_masks.append(translated_poly_per_obj)
+            translated_masks = PolygonMasks(translated_masks, *out_shape)
+        return translated_masks
+
+    def shear(self,
+              out_shape,
+              magnitude,
+              direction='horizontal',
+              border_value=0,
+              interpolation='bilinear'):
+        """See :func:`BaseInstanceMasks.shear`."""
+        if len(self.masks) == 0:
+            sheared_masks = PolygonMasks([], *out_shape)
+        else:
+            sheared_masks = []
+            if direction == 'horizontal':
+                shear_matrix = np.stack([[1, magnitude],
+                                         [0, 1]]).astype(np.float32)
+            elif direction == 'vertical':
+                shear_matrix = np.stack([[1, 0], [magnitude,
+                                                  1]]).astype(np.float32)
+            for poly_per_obj in self.masks:
+                sheared_poly = []
+                for p in poly_per_obj:
+                    p = np.stack([p[0::2], p[1::2]], axis=0)  # [2, n]
+                    new_coords = np.matmul(shear_matrix, p)  # [2, n]
+                    new_coords[0, :] = np.clip(new_coords[0, :], 0,
+                                               out_shape[1])
+                    new_coords[1, :] = np.clip(new_coords[1, :], 0,
+                                               out_shape[0])
+                    sheared_poly.append(
+                        new_coords.transpose((1, 0)).reshape(-1))
+                sheared_masks.append(sheared_poly)
+            sheared_masks = PolygonMasks(sheared_masks, *out_shape)
+        return sheared_masks
+
+    def rotate(self,
+               out_shape,
+               angle,
+               center=None,
+               scale=1.0,
+               border_value=0,
+               interpolation='bilinear'):
+        """See :func:`BaseInstanceMasks.rotate`."""
+        if len(self.masks) == 0:
+            rotated_masks = PolygonMasks([], *out_shape)
+        else:
+            rotated_masks = []
+            rotate_matrix = cv2.getRotationMatrix2D(center, -angle, scale)
+            for poly_per_obj in self.masks:
+                rotated_poly = []
+                for p in poly_per_obj:
+                    p = p.copy()
+                    coords = np.stack([p[0::2], p[1::2]], axis=1)  # [n, 2]
+                    # pad 1 to convert from format [x, y] to homogeneous
+                    # coordinates format [x, y, 1]
+                    coords = np.concatenate(
+                        (coords, np.ones((coords.shape[0], 1), coords.dtype)),
+                        axis=1)  # [n, 3]
+                    rotated_coords = np.matmul(
+                        rotate_matrix[None, :, :],
+                        coords[:, :, None])[..., 0]  # [n, 2, 1] -> [n, 2]
+                    rotated_coords[:, 0] = np.clip(rotated_coords[:, 0], 0,
+                                                   out_shape[1])
+                    rotated_coords[:, 1] = np.clip(rotated_coords[:, 1], 0,
+                                                   out_shape[0])
+                    rotated_poly.append(rotated_coords.reshape(-1))
+                rotated_masks.append(rotated_poly)
+            rotated_masks = PolygonMasks(rotated_masks, *out_shape)
+        return rotated_masks
+
+    def to_bitmap(self):
+        """convert polygon masks to bitmap masks."""
+        bitmap_masks = self.to_ndarray()
+        return BitmapMasks(bitmap_masks, self.height, self.width)
+
+    @property
+    def areas(self):
+        """Compute areas of masks.
+
+        This func is modified from `detectron2
+        <https://github.com/facebookresearch/detectron2/blob/ffff8acc35ea88ad1cb1806ab0f00b4c1c5dbfd9/detectron2/structures/masks.py#L387>`_.
+        The function only works with Polygons using the shoelace formula.
+
+        Return:
+            ndarray: areas of each instance
+        """  # noqa: W501
+        area = []
+        for polygons_per_obj in self.masks:
+            area_per_obj = 0
+            for p in polygons_per_obj:
+                area_per_obj += self._polygon_area(p[0::2], p[1::2])
+            area.append(area_per_obj)
+        return np.asarray(area)
+
+    def _polygon_area(self, x, y):
+        """Compute the area of a component of a polygon.
+
+        Using the shoelace formula:
+        https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
+
+        Args:
+            x (ndarray): x coordinates of the component
+            y (ndarray): y coordinates of the component
+
+        Return:
+            float: the are of the component
+        """  # noqa: 501
+        return 0.5 * np.abs(
+            np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
+
+    def to_ndarray(self):
+        """Convert masks to the format of ndarray."""
+        if len(self.masks) == 0:
+            return np.empty((0, self.height, self.width), dtype=np.uint8)
+        bitmap_masks = []
+        for poly_per_obj in self.masks:
+            bitmap_masks.append(
+                polygon_to_bitmap(poly_per_obj, self.height, self.width))
+        return np.stack(bitmap_masks)
+
+    def to_tensor(self, dtype, device):
+        """See :func:`BaseInstanceMasks.to_tensor`."""
+        if len(self.masks) == 0:
+            return torch.empty((0, self.height, self.width),
+                               dtype=dtype,
+                               device=device)
+        ndarray_masks = self.to_ndarray()
+        return torch.tensor(ndarray_masks, dtype=dtype, device=device)
+
+    @classmethod
+    def random(cls,
+               num_masks=3,
+               height=32,
+               width=32,
+               n_verts=5,
+               dtype=np.float32,
+               rng=None):
+        """Generate random polygon masks for demo / testing purposes.
+
+        Adapted from [1]_
+
+        References:
+            .. [1] https://gitlab.kitware.com/computer-vision/kwimage/-/blob/928cae35ca8/kwimage/structs/polygon.py#L379  # noqa: E501
+
+        Example:
+            >>> from mmdet.data_elements.mask.structures import PolygonMasks
+            >>> self = PolygonMasks.random()
+            >>> print('self = {}'.format(self))
+        """
+        from mmdet.utils.util_random import ensure_rng
+        rng = ensure_rng(rng)
+
+        def _gen_polygon(n, irregularity, spikeyness):
+            """Creates the polygon by sampling points on a circle around the
+            centre.  Random noise is added by varying the angular spacing
+            between sequential points, and by varying the radial distance of
+            each point from the centre.
+
+            Based on original code by Mike Ounsworth
+
+            Args:
+                n (int): number of vertices
+                irregularity (float): [0,1] indicating how much variance there
+                    is in the angular spacing of vertices. [0,1] will map to
+                    [0, 2pi/numberOfVerts]
+                spikeyness (float): [0,1] indicating how much variance there is
+                    in each vertex from the circle of radius aveRadius. [0,1]
+                    will map to [0, aveRadius]
+
+            Returns:
+                a list of vertices, in CCW order.
+            """
+            from scipy.stats import truncnorm
+
+            # Generate around the unit circle
+            cx, cy = (0.0, 0.0)
+            radius = 1
+
+            tau = np.pi * 2
+
+            irregularity = np.clip(irregularity, 0, 1) * 2 * np.pi / n
+            spikeyness = np.clip(spikeyness, 1e-9, 1)
+
+            # generate n angle steps
+            lower = (tau / n) - irregularity
+            upper = (tau / n) + irregularity
+            angle_steps = rng.uniform(lower, upper, n)
+
+            # normalize the steps so that point 0 and point n+1 are the same
+            k = angle_steps.sum() / (2 * np.pi)
+            angles = (angle_steps / k).cumsum() + rng.uniform(0, tau)
+
+            # Convert high and low values to be wrt the standard normal range
+            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.truncnorm.html
+            low = 0
+            high = 2 * radius
+            mean = radius
+            std = spikeyness
+            a = (low - mean) / std
+            b = (high - mean) / std
+            tnorm = truncnorm(a=a, b=b, loc=mean, scale=std)
+
+            # now generate the points
+            radii = tnorm.rvs(n, random_state=rng)
+            x_pts = cx + radii * np.cos(angles)
+            y_pts = cy + radii * np.sin(angles)
+
+            points = np.hstack([x_pts[:, None], y_pts[:, None]])
+
+            # Scale to 0-1 space
+            points = points - points.min(axis=0)
+            points = points / points.max(axis=0)
+
+            # Randomly place within 0-1 space
+            points = points * (rng.rand() * .8 + .2)
+            min_pt = points.min(axis=0)
+            max_pt = points.max(axis=0)
+
+            high = (1 - max_pt)
+            low = (0 - min_pt)
+            offset = (rng.rand(2) * (high - low)) + low
+            points = points + offset
+            return points
+
+        def _order_vertices(verts):
+            """
+            References:
+                https://stackoverflow.com/questions/1709283/how-can-i-sort-a-coordinate-list-for-a-rectangle-counterclockwise
+            """
+            mlat = verts.T[0].sum() / len(verts)
+            mlng = verts.T[1].sum() / len(verts)
+
+            tau = np.pi * 2
+            angle = (np.arctan2(mlat - verts.T[0], verts.T[1] - mlng) +
+                     tau) % tau
+            sortx = angle.argsort()
+            verts = verts.take(sortx, axis=0)
+            return verts
+
+        # Generate a random exterior for each requested mask
+        masks = []
+        for _ in range(num_masks):
+            exterior = _order_vertices(_gen_polygon(n_verts, 0.9, 0.9))
+            exterior = (exterior * [(width, height)]).astype(dtype)
+            masks.append([exterior.ravel()])
+
+        self = cls(masks, height, width)
+        return self
+
+    @classmethod
+    def cat(cls: Type[T], masks: Sequence[T]) -> T:
+        """Concatenate a sequence of masks into one single mask instance.
+
+        Args:
+            masks (Sequence[PolygonMasks]): A sequence of mask instances.
+
+        Returns:
+            PolygonMasks: Concatenated mask instance.
+        """
+        assert isinstance(masks, Sequence)
+        if len(masks) == 0:
+            raise ValueError('masks should not be an empty list.')
+        assert all(isinstance(m, cls) for m in masks)
+
+        mask_list = list(itertools.chain(*[m.masks for m in masks]))
+        return cls(mask_list, masks[0].height, masks[0].width)
+
+
+def polygon_to_bitmap(polygons, height, width):
+    """Convert masks from the form of polygons to bitmaps.
+
+    Args:
+        polygons (list[ndarray]): masks in polygon representation
+        height (int): mask height
+        width (int): mask width
+
+    Return:
+        ndarray: the converted masks in bitmap representation
+    """
+    rles = maskUtils.frPyObjects(polygons, height, width)
+    rle = maskUtils.merge(rles)
+    bitmap_mask = maskUtils.decode(rle).astype(bool)
+    return bitmap_mask
+
+
+def bitmap_to_polygon(bitmap):
+    """Convert masks from the form of bitmaps to polygons.
+
+    Args:
+        bitmap (ndarray): masks in bitmap representation.
+
+    Return:
+        list[ndarray]: the converted mask in polygon representation.
+        bool: whether the mask has holes.
+    """
+    bitmap = np.ascontiguousarray(bitmap).astype(np.uint8)
+    # cv2.RETR_CCOMP: retrieves all of the contours and organizes them
+    #   into a two-level hierarchy. At the top level, there are external
+    #   boundaries of the components. At the second level, there are
+    #   boundaries of the holes. If there is another contour inside a hole
+    #   of a connected component, it is still put at the top level.
+    # cv2.CHAIN_APPROX_NONE: stores absolutely all the contour points.
+    outs = cv2.findContours(bitmap, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
+    contours = outs[-2]
+    hierarchy = outs[-1]
+    if hierarchy is None:
+        return [], False
+    # hierarchy[i]: 4 elements, for the indexes of next, previous,
+    # parent, or nested contours. If there is no corresponding contour,
+    # it will be -1.
+    with_hole = (hierarchy.reshape(-1, 4)[:, 3] >= 0).any()
+    contours = [c.reshape(-1, 2) for c in contours]
+    return contours, with_hole
diff --git a/mmde/mmdet/structures/mask/utils.py b/mmde/mmdet/structures/mask/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bd445e4fce1a312949f222d54d230a1a622d726
--- /dev/null
+++ b/mmde/mmdet/structures/mask/utils.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pycocotools.mask as mask_util
+import torch
+from mmengine.utils import slice_list
+
+
+def split_combined_polys(polys, poly_lens, polys_per_mask):
+    """Split the combined 1-D polys into masks.
+
+    A mask is represented as a list of polys, and a poly is represented as
+    a 1-D array. In dataset, all masks are concatenated into a single 1-D
+    tensor. Here we need to split the tensor into original representations.
+
+    Args:
+        polys (list): a list (length = image num) of 1-D tensors
+        poly_lens (list): a list (length = image num) of poly length
+        polys_per_mask (list): a list (length = image num) of poly number
+            of each mask
+
+    Returns:
+        list: a list (length = image num) of list (length = mask num) of \
+            list (length = poly num) of numpy array.
+    """
+    mask_polys_list = []
+    for img_id in range(len(polys)):
+        polys_single = polys[img_id]
+        polys_lens_single = poly_lens[img_id].tolist()
+        polys_per_mask_single = polys_per_mask[img_id].tolist()
+
+        split_polys = slice_list(polys_single, polys_lens_single)
+        mask_polys = slice_list(split_polys, polys_per_mask_single)
+        mask_polys_list.append(mask_polys)
+    return mask_polys_list
+
+
+# TODO: move this function to more proper place
+def encode_mask_results(mask_results):
+    """Encode bitmap mask to RLE code.
+
+    Args:
+        mask_results (list): bitmap mask results.
+
+    Returns:
+        list | tuple: RLE encoded mask.
+    """
+    encoded_mask_results = []
+    for mask in mask_results:
+        encoded_mask_results.append(
+            mask_util.encode(
+                np.array(mask[:, :, np.newaxis], order='F',
+                         dtype='uint8'))[0])  # encoded with RLE
+    return encoded_mask_results
+
+
+def mask2bbox(masks):
+    """Obtain tight bounding boxes of binary masks.
+
+    Args:
+        masks (Tensor): Binary mask of shape (n, h, w).
+
+    Returns:
+        Tensor: Bboxe with shape (n, 4) of \
+            positive region in binary mask.
+    """
+    N = masks.shape[0]
+    bboxes = masks.new_zeros((N, 4), dtype=torch.float32)
+    x_any = torch.any(masks, dim=1)
+    y_any = torch.any(masks, dim=2)
+    for i in range(N):
+        x = torch.where(x_any[i, :])[0]
+        y = torch.where(y_any[i, :])[0]
+        if len(x) > 0 and len(y) > 0:
+            bboxes[i, :] = bboxes.new_tensor(
+                [x[0], y[0], x[-1] + 1, y[-1] + 1])
+
+    return bboxes
diff --git a/mmde/mmdet/structures/reid_data_sample.py b/mmde/mmdet/structures/reid_data_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..69958eece3671c9040c1f5561e724ca2d5f8e155
--- /dev/null
+++ b/mmde/mmdet/structures/reid_data_sample.py
@@ -0,0 +1,123 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from numbers import Number
+from typing import Sequence, Union
+
+import mmengine
+import numpy as np
+import torch
+from mmengine.structures import BaseDataElement, LabelData
+
+
+def format_label(value: Union[torch.Tensor, np.ndarray, Sequence, int],
+                 num_classes: int = None) -> LabelData:
+    """Convert label of various python types to :obj:`mmengine.LabelData`.
+
+    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+    :class:`Sequence`, :class:`int`.
+
+    Args:
+        value (torch.Tensor | numpy.ndarray | Sequence | int): Label value.
+        num_classes (int, optional): The number of classes. If not None, set
+            it to the metainfo. Defaults to None.
+
+    Returns:
+        :obj:`mmengine.LabelData`: The foramtted label data.
+    """
+
+    # Handle single number
+    if isinstance(value, (torch.Tensor, np.ndarray)) and value.ndim == 0:
+        value = int(value.item())
+
+    if isinstance(value, np.ndarray):
+        value = torch.from_numpy(value)
+    elif isinstance(value, Sequence) and not mmengine.utils.is_str(value):
+        value = torch.tensor(value)
+    elif isinstance(value, int):
+        value = torch.LongTensor([value])
+    elif not isinstance(value, torch.Tensor):
+        raise TypeError(f'Type {type(value)} is not an available label type.')
+
+    metainfo = {}
+    if num_classes is not None:
+        metainfo['num_classes'] = num_classes
+        if value.max() >= num_classes:
+            raise ValueError(f'The label data ({value}) should not '
+                             f'exceed num_classes ({num_classes}).')
+    label = LabelData(label=value, metainfo=metainfo)
+    return label
+
+
+class ReIDDataSample(BaseDataElement):
+    """A data structure interface of ReID task.
+
+    It's used as interfaces between different components.
+
+    Meta field:
+        img_shape (Tuple): The shape of the corresponding input image.
+            Used for visualization.
+        ori_shape (Tuple): The original shape of the corresponding image.
+            Used for visualization.
+        num_classes (int): The number of all categories.
+            Used for label format conversion.
+
+    Data field:
+        gt_label (LabelData): The ground truth label.
+        pred_label (LabelData): The predicted label.
+        scores (torch.Tensor): The outputs of model.
+    """
+
+    @property
+    def gt_label(self):
+        return self._gt_label
+
+    @gt_label.setter
+    def gt_label(self, value: LabelData):
+        self.set_field(value, '_gt_label', dtype=LabelData)
+
+    @gt_label.deleter
+    def gt_label(self):
+        del self._gt_label
+
+    def set_gt_label(
+        self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number]
+    ) -> 'ReIDDataSample':
+        """Set label of ``gt_label``."""
+        label = format_label(value, self.get('num_classes'))
+        if 'gt_label' in self:  # setting for the second time
+            self.gt_label.label = label.label
+        else:  # setting for the first time
+            self.gt_label = label
+        return self
+
+    def set_gt_score(self, value: torch.Tensor) -> 'ReIDDataSample':
+        """Set score of ``gt_label``."""
+        assert isinstance(value, torch.Tensor), \
+            f'The value should be a torch.Tensor but got {type(value)}.'
+        assert value.ndim == 1, \
+            f'The dims of value should be 1, but got {value.ndim}.'
+
+        if 'num_classes' in self:
+            assert value.size(0) == self.num_classes, \
+                f"The length of value ({value.size(0)}) doesn't "\
+                f'match the num_classes ({self.num_classes}).'
+            metainfo = {'num_classes': self.num_classes}
+        else:
+            metainfo = {'num_classes': value.size(0)}
+
+        if 'gt_label' in self:  # setting for the second time
+            self.gt_label.score = value
+        else:  # setting for the first time
+            self.gt_label = LabelData(score=value, metainfo=metainfo)
+        return self
+
+    @property
+    def pred_feature(self):
+        return self._pred_feature
+
+    @pred_feature.setter
+    def pred_feature(self, value: torch.Tensor):
+        self.set_field(value, '_pred_feature', dtype=torch.Tensor)
+
+    @pred_feature.deleter
+    def pred_feature(self):
+        del self._pred_feature
diff --git a/mmde/mmdet/structures/track_data_sample.py b/mmde/mmdet/structures/track_data_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..d005a5a42f57682d0b76d60d3dae463c4b4dc727
--- /dev/null
+++ b/mmde/mmdet/structures/track_data_sample.py
@@ -0,0 +1,273 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Sequence
+
+import numpy as np
+import torch
+from mmengine.structures import BaseDataElement
+
+from .det_data_sample import DetDataSample
+
+
+class TrackDataSample(BaseDataElement):
+    """A data structure interface of tracking task in MMDetection. It is used
+    as interfaces between different components.
+
+    This data structure can be viewd as a wrapper of multiple DetDataSample to
+    some extent. Specifically, it only contains a property:
+    ``video_data_samples`` which is a list of DetDataSample, each of which
+    corresponds to a single frame. If you want to get the property of a single
+    frame, you must first get the corresponding ``DetDataSample`` by indexing
+    and then get the property of the frame, such as ``gt_instances``,
+    ``pred_instances`` and so on. As for metainfo, it differs from
+    ``DetDataSample`` in that each value corresponds to the metainfo key is a
+    list where each element corresponds to information of a single frame.
+
+    Examples:
+        >>> import torch
+        >>> from mmengine.structures import InstanceData
+        >>> from mmdet.structures import DetDataSample, TrackDataSample
+        >>> track_data_sample = TrackDataSample()
+        >>> # set the 1st frame
+        >>> frame1_data_sample = DetDataSample(metainfo=dict(
+        ...         img_shape=(100, 100), frame_id=0))
+        >>> frame1_gt_instances = InstanceData()
+        >>> frame1_gt_instances.bbox = torch.zeros([2, 4])
+        >>> frame1_data_sample.gt_instances = frame1_gt_instances
+        >>> # set the 2nd frame
+        >>> frame2_data_sample = DetDataSample(metainfo=dict(
+        ...         img_shape=(100, 100), frame_id=1))
+        >>> frame2_gt_instances = InstanceData()
+        >>> frame2_gt_instances.bbox = torch.ones([3, 4])
+        >>> frame2_data_sample.gt_instances = frame2_gt_instances
+        >>> track_data_sample.video_data_samples = [frame1_data_sample,
+        ...                                         frame2_data_sample]
+        >>> # set metainfo for track_data_sample
+        >>> track_data_sample.set_metainfo(dict(key_frames_inds=[0]))
+        >>> track_data_sample.set_metainfo(dict(ref_frames_inds=[1]))
+        >>> print(track_data_sample)
+        <TrackDataSample(
+
+            META INFORMATION
+            key_frames_inds: [0]
+            ref_frames_inds: [1]
+
+            DATA FIELDS
+            video_data_samples: [<DetDataSample(
+
+                    META INFORMATION
+                    img_shape: (100, 100)
+
+                    DATA FIELDS
+                    gt_instances: <InstanceData(
+
+                            META INFORMATION
+
+                            DATA FIELDS
+                            bbox: tensor([[0., 0., 0., 0.],
+                                        [0., 0., 0., 0.]])
+                        ) at 0x7f639320dcd0>
+                ) at 0x7f64bd223340>, <DetDataSample(
+
+                    META INFORMATION
+                    img_shape: (100, 100)
+
+                    DATA FIELDS
+                    gt_instances: <InstanceData(
+
+                            META INFORMATION
+
+                            DATA FIELDS
+                            bbox: tensor([[1., 1., 1., 1.],
+                                        [1., 1., 1., 1.],
+                                        [1., 1., 1., 1.]])
+                        ) at 0x7f64bd128b20>
+                ) at 0x7f64bd1346d0>]
+        ) at 0x7f64bd2237f0>
+        >>> print(len(track_data_sample))
+        2
+        >>> key_data_sample = track_data_sample.get_key_frames()
+        >>> print(key_data_sample[0].frame_id)
+        0
+        >>> ref_data_sample = track_data_sample.get_ref_frames()
+        >>> print(ref_data_sample[0].frame_id)
+        1
+        >>> frame1_data_sample = track_data_sample[0]
+        >>> print(frame1_data_sample.gt_instances.bbox)
+        tensor([[0., 0., 0., 0.],
+                [0., 0., 0., 0.]])
+        >>> # Tensor-like methods
+        >>> cuda_track_data_sample = track_data_sample.to('cuda')
+        >>> cuda_track_data_sample = track_data_sample.cuda()
+        >>> cpu_track_data_sample = track_data_sample.cpu()
+        >>> cpu_track_data_sample = track_data_sample.to('cpu')
+        >>> fp16_instances = cuda_track_data_sample.to(
+        ...     device=None, dtype=torch.float16, non_blocking=False,
+        ...     copy=False, memory_format=torch.preserve_format)
+    """
+
+    @property
+    def video_data_samples(self) -> List[DetDataSample]:
+        return self._video_data_samples
+
+    @video_data_samples.setter
+    def video_data_samples(self, value: List[DetDataSample]):
+        if isinstance(value, DetDataSample):
+            value = [value]
+        assert isinstance(value, list), 'video_data_samples must be a list'
+        assert isinstance(
+            value[0], DetDataSample
+        ), 'video_data_samples must be a list of DetDataSample, but got '
+        f'{value[0]}'
+        self.set_field(value, '_video_data_samples', dtype=list)
+
+    @video_data_samples.deleter
+    def video_data_samples(self):
+        del self._video_data_samples
+
+    def __getitem__(self, index):
+        assert hasattr(self,
+                       '_video_data_samples'), 'video_data_samples not set'
+        return self._video_data_samples[index]
+
+    def get_key_frames(self):
+        assert hasattr(self, 'key_frames_inds'), \
+            'key_frames_inds not set'
+        assert isinstance(self.key_frames_inds, Sequence)
+        key_frames_info = []
+        for index in self.key_frames_inds:
+            key_frames_info.append(self[index])
+        return key_frames_info
+
+    def get_ref_frames(self):
+        assert hasattr(self, 'ref_frames_inds'), \
+            'ref_frames_inds not set'
+        ref_frames_info = []
+        assert isinstance(self.ref_frames_inds, Sequence)
+        for index in self.ref_frames_inds:
+            ref_frames_info.append(self[index])
+        return ref_frames_info
+
+    def __len__(self):
+        return len(self._video_data_samples) if hasattr(
+            self, '_video_data_samples') else 0
+
+    # TODO: add UT for this Tensor-like method
+    # Tensor-like methods
+    def to(self, *args, **kwargs) -> 'BaseDataElement':
+        """Apply same name function to all tensors in data_fields."""
+        new_data = self.new()
+        for k, v_list in self.items():
+            data_list = []
+            for v in v_list:
+                if hasattr(v, 'to'):
+                    v = v.to(*args, **kwargs)
+                    data_list.append(v)
+            if len(data_list) > 0:
+                new_data.set_data({f'{k}': data_list})
+        return new_data
+
+    # Tensor-like methods
+    def cpu(self) -> 'BaseDataElement':
+        """Convert all tensors to CPU in data."""
+        new_data = self.new()
+        for k, v_list in self.items():
+            data_list = []
+            for v in v_list:
+                if isinstance(v, (torch.Tensor, BaseDataElement)):
+                    v = v.cpu()
+                    data_list.append(v)
+            if len(data_list) > 0:
+                new_data.set_data({f'{k}': data_list})
+        return new_data
+
+    # Tensor-like methods
+    def cuda(self) -> 'BaseDataElement':
+        """Convert all tensors to GPU in data."""
+        new_data = self.new()
+        for k, v_list in self.items():
+            data_list = []
+            for v in v_list:
+                if isinstance(v, (torch.Tensor, BaseDataElement)):
+                    v = v.cuda()
+                    data_list.append(v)
+            if len(data_list) > 0:
+                new_data.set_data({f'{k}': data_list})
+        return new_data
+
+    # Tensor-like methods
+    def npu(self) -> 'BaseDataElement':
+        """Convert all tensors to NPU in data."""
+        new_data = self.new()
+        for k, v_list in self.items():
+            data_list = []
+            for v in v_list:
+                if isinstance(v, (torch.Tensor, BaseDataElement)):
+                    v = v.npu()
+                    data_list.append(v)
+            if len(data_list) > 0:
+                new_data.set_data({f'{k}': data_list})
+        return new_data
+
+    # Tensor-like methods
+    def detach(self) -> 'BaseDataElement':
+        """Detach all tensors in data."""
+        new_data = self.new()
+        for k, v_list in self.items():
+            data_list = []
+            for v in v_list:
+                if isinstance(v, (torch.Tensor, BaseDataElement)):
+                    v = v.detach()
+                    data_list.append(v)
+            if len(data_list) > 0:
+                new_data.set_data({f'{k}': data_list})
+        return new_data
+
+    # Tensor-like methods
+    def numpy(self) -> 'BaseDataElement':
+        """Convert all tensors to np.ndarray in data."""
+        new_data = self.new()
+        for k, v_list in self.items():
+            data_list = []
+            for v in v_list:
+                if isinstance(v, (torch.Tensor, BaseDataElement)):
+                    v = v.detach().cpu().numpy()
+                    data_list.append(v)
+            if len(data_list) > 0:
+                new_data.set_data({f'{k}': data_list})
+        return new_data
+
+    def to_tensor(self) -> 'BaseDataElement':
+        """Convert all np.ndarray to tensor in data."""
+        new_data = self.new()
+        for k, v_list in self.items():
+            data_list = []
+            for v in v_list:
+                if isinstance(v, np.ndarray):
+                    v = torch.from_numpy(v)
+                elif isinstance(v, BaseDataElement):
+                    v = v.to_tensor()
+                data_list.append(v)
+            if len(data_list) > 0:
+                new_data.set_data({f'{k}': data_list})
+        return new_data
+
+    # Tensor-like methods
+    def clone(self) -> 'BaseDataElement':
+        """Deep copy the current data element.
+
+        Returns:
+            BaseDataElement: The copy of current data element.
+        """
+        clone_data = self.__class__()
+        clone_data.set_metainfo(dict(self.metainfo_items()))
+
+        for k, v_list in self.items():
+            clone_item_list = []
+            for v in v_list:
+                clone_item_list.append(v.clone())
+            clone_data.set_data({k: clone_item_list})
+        return clone_data
+
+
+TrackSampleList = List[TrackDataSample]
+OptTrackSampleList = Optional[TrackSampleList]
diff --git a/mmde/mmdet/testing/__init__.py b/mmde/mmdet/testing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..766fb471022ee6f2e4e1ff13a52040ae57772e53
--- /dev/null
+++ b/mmde/mmdet/testing/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ._fast_stop_training_hook import FastStopTrainingHook  # noqa: F401,F403
+from ._utils import (demo_mm_inputs, demo_mm_proposals,
+                     demo_mm_sampling_results, demo_track_inputs,
+                     get_detector_cfg, get_roi_head_cfg, random_boxes,
+                     replace_to_ceph)
+
+__all__ = [
+    'demo_mm_inputs', 'get_detector_cfg', 'get_roi_head_cfg',
+    'demo_mm_proposals', 'demo_mm_sampling_results', 'replace_to_ceph',
+    'demo_track_inputs', 'VideoDataSampleFeeder', 'random_boxes'
+]
diff --git a/mmde/mmdet/testing/_fast_stop_training_hook.py b/mmde/mmdet/testing/_fast_stop_training_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8e3d11439f875d2c9a6ce6b8a0b33acc832c2c5
--- /dev/null
+++ b/mmde/mmdet/testing/_fast_stop_training_hook.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.hooks import Hook
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class FastStopTrainingHook(Hook):
+    """Set runner's epoch information to the model."""
+
+    def __init__(self, by_epoch, save_ckpt=False, stop_iter_or_epoch=5):
+        self.by_epoch = by_epoch
+        self.save_ckpt = save_ckpt
+        self.stop_iter_or_epoch = stop_iter_or_epoch
+
+    def after_train_iter(self, runner, batch_idx: int, data_batch: None,
+                         outputs: None) -> None:
+        if self.save_ckpt and self.by_epoch:
+            # If it is epoch-based and want to save weights,
+            # we must run at least 1 epoch.
+            return
+        if runner.iter >= self.stop_iter_or_epoch:
+            raise RuntimeError('quick exit')
+
+    def after_train_epoch(self, runner) -> None:
+        if runner.epoch >= self.stop_iter_or_epoch - 1:
+            raise RuntimeError('quick exit')
diff --git a/mmde/mmdet/testing/_utils.py b/mmde/mmdet/testing/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4d3a86deab17e9c5acd1b1fe7f42e0bfa78943d
--- /dev/null
+++ b/mmde/mmdet/testing/_utils.py
@@ -0,0 +1,469 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from os.path import dirname, exists, join
+
+import numpy as np
+import torch
+from mmengine.config import Config
+from mmengine.dataset import pseudo_collate
+from mmengine.structures import InstanceData, PixelData
+
+from mmdet.utils.util_random import ensure_rng
+from ..registry import TASK_UTILS
+from ..structures import DetDataSample, TrackDataSample
+from ..structures.bbox import HorizontalBoxes
+
+
+def _get_config_directory():
+    """Find the predefined detector config directory."""
+    try:
+        # Assume we are running in the source mmdetection repo
+        repo_dpath = dirname(dirname(dirname(__file__)))
+    except NameError:
+        # For IPython development when this __file__ is not defined
+        import mmdet
+        repo_dpath = dirname(dirname(mmdet.__file__))
+    config_dpath = join(repo_dpath, 'configs')
+    if not exists(config_dpath):
+        raise Exception('Cannot find config path')
+    return config_dpath
+
+
+def _get_config_module(fname):
+    """Load a configuration as a python module."""
+    config_dpath = _get_config_directory()
+    config_fpath = join(config_dpath, fname)
+    config_mod = Config.fromfile(config_fpath)
+    return config_mod
+
+
+def get_detector_cfg(fname):
+    """Grab configs necessary to create a detector.
+
+    These are deep copied to allow for safe modification of parameters without
+    influencing other tests.
+    """
+    config = _get_config_module(fname)
+    model = copy.deepcopy(config.model)
+    return model
+
+
+def get_roi_head_cfg(fname):
+    """Grab configs necessary to create a roi_head.
+
+    These are deep copied to allow for safe modification of parameters without
+    influencing other tests.
+    """
+    config = _get_config_module(fname)
+    model = copy.deepcopy(config.model)
+
+    roi_head = model.roi_head
+    train_cfg = None if model.train_cfg is None else model.train_cfg.rcnn
+    test_cfg = None if model.test_cfg is None else model.test_cfg.rcnn
+    roi_head.update(dict(train_cfg=train_cfg, test_cfg=test_cfg))
+    return roi_head
+
+
+def _rand_bboxes(rng, num_boxes, w, h):
+    cx, cy, bw, bh = rng.rand(num_boxes, 4).T
+
+    tl_x = ((cx * w) - (w * bw / 2)).clip(0, w)
+    tl_y = ((cy * h) - (h * bh / 2)).clip(0, h)
+    br_x = ((cx * w) + (w * bw / 2)).clip(0, w)
+    br_y = ((cy * h) + (h * bh / 2)).clip(0, h)
+
+    bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T
+    return bboxes
+
+
+def _rand_masks(rng, num_boxes, bboxes, img_w, img_h):
+    from mmdet.structures.mask import BitmapMasks
+    masks = np.zeros((num_boxes, img_h, img_w))
+    for i, bbox in enumerate(bboxes):
+        bbox = bbox.astype(np.int32)
+        mask = (rng.rand(1, bbox[3] - bbox[1], bbox[2] - bbox[0]) >
+                0.3).astype(np.int64)
+        masks[i:i + 1, bbox[1]:bbox[3], bbox[0]:bbox[2]] = mask
+    return BitmapMasks(masks, height=img_h, width=img_w)
+
+
+def demo_mm_inputs(batch_size=2,
+                   image_shapes=(3, 128, 128),
+                   num_items=None,
+                   num_classes=10,
+                   sem_seg_output_strides=1,
+                   with_mask=False,
+                   with_semantic=False,
+                   use_box_type=False,
+                   device='cpu',
+                   texts=None,
+                   custom_entities=False):
+    """Create a superset of inputs needed to run test or train batches.
+
+    Args:
+        batch_size (int): batch size. Defaults to 2.
+        image_shapes (List[tuple], Optional): image shape.
+            Defaults to (3, 128, 128)
+        num_items (None | List[int]): specifies the number
+            of boxes in each batch item. Default to None.
+        num_classes (int): number of different labels a
+            box might have. Defaults to 10.
+        with_mask (bool): Whether to return mask annotation.
+            Defaults to False.
+        with_semantic (bool): whether to return semantic.
+            Defaults to False.
+        device (str): Destination device type. Defaults to cpu.
+    """
+    rng = np.random.RandomState(0)
+
+    if isinstance(image_shapes, list):
+        assert len(image_shapes) == batch_size
+    else:
+        image_shapes = [image_shapes] * batch_size
+
+    if isinstance(num_items, list):
+        assert len(num_items) == batch_size
+
+    if texts is not None:
+        assert batch_size == len(texts)
+
+    packed_inputs = []
+    for idx in range(batch_size):
+        image_shape = image_shapes[idx]
+        c, h, w = image_shape
+
+        image = rng.randint(0, 255, size=image_shape, dtype=np.uint8)
+
+        mm_inputs = dict()
+        mm_inputs['inputs'] = torch.from_numpy(image).to(device)
+
+        img_meta = {
+            'img_id': idx,
+            'img_shape': image_shape[1:],
+            'ori_shape': image_shape[1:],
+            'filename': '<demo>.png',
+            'scale_factor': np.array([1.1, 1.2]),
+            'flip': False,
+            'flip_direction': None,
+            'border': [1, 1, 1, 1]  # Only used by CenterNet
+        }
+
+        if texts:
+            img_meta['text'] = texts[idx]
+            img_meta['custom_entities'] = custom_entities
+
+        data_sample = DetDataSample()
+        data_sample.set_metainfo(img_meta)
+
+        # gt_instances
+        gt_instances = InstanceData()
+        if num_items is None:
+            num_boxes = rng.randint(1, 10)
+        else:
+            num_boxes = num_items[idx]
+
+        bboxes = _rand_bboxes(rng, num_boxes, w, h)
+        labels = rng.randint(1, num_classes, size=num_boxes)
+        # TODO: remove this part when all model adapted with BaseBoxes
+        if use_box_type:
+            gt_instances.bboxes = HorizontalBoxes(bboxes, dtype=torch.float32)
+        else:
+            gt_instances.bboxes = torch.FloatTensor(bboxes)
+        gt_instances.labels = torch.LongTensor(labels)
+
+        if with_mask:
+            masks = _rand_masks(rng, num_boxes, bboxes, w, h)
+            gt_instances.masks = masks
+
+        # TODO: waiting for ci to be fixed
+        # masks = np.random.randint(0, 2, (len(bboxes), h, w), dtype=np.uint8)
+        # gt_instances.mask = BitmapMasks(masks, h, w)
+
+        data_sample.gt_instances = gt_instances
+
+        # ignore_instances
+        ignore_instances = InstanceData()
+        bboxes = _rand_bboxes(rng, num_boxes, w, h)
+        if use_box_type:
+            ignore_instances.bboxes = HorizontalBoxes(
+                bboxes, dtype=torch.float32)
+        else:
+            ignore_instances.bboxes = torch.FloatTensor(bboxes)
+        data_sample.ignored_instances = ignore_instances
+
+        # gt_sem_seg
+        if with_semantic:
+            # assume gt_semantic_seg using scale 1/8 of the img
+            gt_semantic_seg = torch.from_numpy(
+                np.random.randint(
+                    0,
+                    num_classes, (1, h // sem_seg_output_strides,
+                                  w // sem_seg_output_strides),
+                    dtype=np.uint8))
+            gt_sem_seg_data = dict(sem_seg=gt_semantic_seg)
+            data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data)
+
+        mm_inputs['data_samples'] = data_sample.to(device)
+
+        # TODO: gt_ignore
+
+        packed_inputs.append(mm_inputs)
+    data = pseudo_collate(packed_inputs)
+    return data
+
+
+def demo_mm_proposals(image_shapes, num_proposals, device='cpu'):
+    """Create a list of fake porposals.
+
+    Args:
+        image_shapes (list[tuple[int]]): Batch image shapes.
+        num_proposals (int): The number of fake proposals.
+    """
+    rng = np.random.RandomState(0)
+
+    results = []
+    for img_shape in image_shapes:
+        result = InstanceData()
+        w, h = img_shape[1:]
+        proposals = _rand_bboxes(rng, num_proposals, w, h)
+        result.bboxes = torch.from_numpy(proposals).float()
+        result.scores = torch.from_numpy(rng.rand(num_proposals)).float()
+        result.labels = torch.zeros(num_proposals).long()
+        results.append(result.to(device))
+    return results
+
+
+def demo_mm_sampling_results(proposals_list,
+                             batch_gt_instances,
+                             batch_gt_instances_ignore=None,
+                             assigner_cfg=None,
+                             sampler_cfg=None,
+                             feats=None):
+    """Create sample results that can be passed to BBoxHead.get_targets."""
+    assert len(proposals_list) == len(batch_gt_instances)
+    if batch_gt_instances_ignore is None:
+        batch_gt_instances_ignore = [None for _ in batch_gt_instances]
+    else:
+        assert len(batch_gt_instances_ignore) == len(batch_gt_instances)
+
+    default_assigner_cfg = dict(
+        type='MaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        min_pos_iou=0.5,
+        ignore_iof_thr=-1)
+    assigner_cfg = assigner_cfg if assigner_cfg is not None \
+        else default_assigner_cfg
+    default_sampler_cfg = dict(
+        type='RandomSampler',
+        num=512,
+        pos_fraction=0.25,
+        neg_pos_ub=-1,
+        add_gt_as_proposals=True)
+    sampler_cfg = sampler_cfg if sampler_cfg is not None \
+        else default_sampler_cfg
+    bbox_assigner = TASK_UTILS.build(assigner_cfg)
+    bbox_sampler = TASK_UTILS.build(sampler_cfg)
+
+    sampling_results = []
+    for i in range(len(batch_gt_instances)):
+        if feats is not None:
+            feats = [lvl_feat[i][None] for lvl_feat in feats]
+        # rename proposals.bboxes to proposals.priors
+        proposals = proposals_list[i]
+        proposals.priors = proposals.pop('bboxes')
+
+        assign_result = bbox_assigner.assign(proposals, batch_gt_instances[i],
+                                             batch_gt_instances_ignore[i])
+        sampling_result = bbox_sampler.sample(
+            assign_result, proposals, batch_gt_instances[i], feats=feats)
+        sampling_results.append(sampling_result)
+
+    return sampling_results
+
+
+def demo_track_inputs(batch_size=1,
+                      num_frames=2,
+                      key_frames_inds=None,
+                      image_shapes=(3, 128, 128),
+                      num_items=None,
+                      num_classes=1,
+                      with_mask=False,
+                      with_semantic=False):
+    """Create a superset of inputs needed to run test or train batches.
+
+    Args:
+        batch_size (int): batch size. Default to 1.
+        num_frames (int): The number of frames.
+        key_frames_inds (List): The indices of key frames.
+        image_shapes (List[tuple], Optional): image shape.
+            Default to (3, 128, 128)
+        num_items (None | List[int]): specifies the number
+            of boxes in each batch item. Default to None.
+        num_classes (int): number of different labels a
+            box might have. Default to 1.
+        with_mask (bool): Whether to return mask annotation.
+            Defaults to False.
+        with_semantic (bool): whether to return semantic.
+            Default to False.
+    """
+    rng = np.random.RandomState(0)
+
+    # Make sure the length of image_shapes is equal to ``batch_size``
+    if isinstance(image_shapes, list):
+        assert len(image_shapes) == batch_size
+    else:
+        image_shapes = [image_shapes] * batch_size
+
+    packed_inputs = []
+    for idx in range(batch_size):
+        mm_inputs = dict(inputs=dict())
+        _, h, w = image_shapes[idx]
+
+        imgs = rng.randint(
+            0, 255, size=(num_frames, *image_shapes[idx]), dtype=np.uint8)
+        mm_inputs['inputs'] = torch.from_numpy(imgs)
+
+        img_meta = {
+            'img_id': idx,
+            'img_shape': image_shapes[idx][-2:],
+            'ori_shape': image_shapes[idx][-2:],
+            'filename': '<demo>.png',
+            'scale_factor': np.array([1.1, 1.2]),
+            'flip': False,
+            'flip_direction': None,
+            'is_video_data': True,
+        }
+
+        video_data_samples = []
+        for i in range(num_frames):
+            data_sample = DetDataSample()
+            img_meta['frame_id'] = i
+            data_sample.set_metainfo(img_meta)
+
+            # gt_instances
+            gt_instances = InstanceData()
+            if num_items is None:
+                num_boxes = rng.randint(1, 10)
+            else:
+                num_boxes = num_items[idx]
+
+            bboxes = _rand_bboxes(rng, num_boxes, w, h)
+            labels = rng.randint(0, num_classes, size=num_boxes)
+            instances_id = rng.randint(100, num_classes + 100, size=num_boxes)
+            gt_instances.bboxes = torch.FloatTensor(bboxes)
+            gt_instances.labels = torch.LongTensor(labels)
+            gt_instances.instances_ids = torch.LongTensor(instances_id)
+
+            if with_mask:
+                masks = _rand_masks(rng, num_boxes, bboxes, w, h)
+                gt_instances.masks = masks
+
+            data_sample.gt_instances = gt_instances
+            # ignore_instances
+            ignore_instances = InstanceData()
+            bboxes = _rand_bboxes(rng, num_boxes, w, h)
+            ignore_instances.bboxes = bboxes
+            data_sample.ignored_instances = ignore_instances
+
+            video_data_samples.append(data_sample)
+
+        track_data_sample = TrackDataSample()
+        track_data_sample.video_data_samples = video_data_samples
+        if key_frames_inds is not None:
+            assert isinstance(
+                key_frames_inds,
+                list) and len(key_frames_inds) < num_frames and max(
+                    key_frames_inds) < num_frames
+            ref_frames_inds = [
+                i for i in range(num_frames) if i not in key_frames_inds
+            ]
+            track_data_sample.set_metainfo(
+                dict(key_frames_inds=key_frames_inds))
+            track_data_sample.set_metainfo(
+                dict(ref_frames_inds=ref_frames_inds))
+        mm_inputs['data_samples'] = track_data_sample
+
+        # TODO: gt_ignore
+        packed_inputs.append(mm_inputs)
+    data = pseudo_collate(packed_inputs)
+    return data
+
+
+def random_boxes(num=1, scale=1, rng=None):
+    """Simple version of ``kwimage.Boxes.random``
+    Returns:
+        Tensor: shape (n, 4) in x1, y1, x2, y2 format.
+    References:
+        https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390 # noqa: E501
+    Example:
+        >>> num = 3
+        >>> scale = 512
+        >>> rng = 0
+        >>> boxes = random_boxes(num, scale, rng)
+        >>> print(boxes)
+        tensor([[280.9925, 278.9802, 308.6148, 366.1769],
+                [216.9113, 330.6978, 224.0446, 456.5878],
+                [405.3632, 196.3221, 493.3953, 270.7942]])
+    """
+    rng = ensure_rng(rng)
+
+    tlbr = rng.rand(num, 4).astype(np.float32)
+
+    tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2])
+    tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3])
+    br_x = np.maximum(tlbr[:, 0], tlbr[:, 2])
+    br_y = np.maximum(tlbr[:, 1], tlbr[:, 3])
+
+    tlbr[:, 0] = tl_x * scale
+    tlbr[:, 1] = tl_y * scale
+    tlbr[:, 2] = br_x * scale
+    tlbr[:, 3] = br_y * scale
+
+    boxes = torch.from_numpy(tlbr)
+    return boxes
+
+
+# TODO: Support full ceph
+def replace_to_ceph(cfg):
+    backend_args = dict(
+        backend='petrel',
+        path_mapping=dict({
+            './data/': 's3://openmmlab/datasets/detection/',
+            'data/': 's3://openmmlab/datasets/detection/'
+        }))
+
+    # TODO: name is a reserved interface, which will be used later.
+    def _process_pipeline(dataset, name):
+
+        def replace_img(pipeline):
+            if pipeline['type'] == 'LoadImageFromFile':
+                pipeline['backend_args'] = backend_args
+
+        def replace_ann(pipeline):
+            if pipeline['type'] == 'LoadAnnotations' or pipeline[
+                    'type'] == 'LoadPanopticAnnotations':
+                pipeline['backend_args'] = backend_args
+
+        if 'pipeline' in dataset:
+            replace_img(dataset.pipeline[0])
+            replace_ann(dataset.pipeline[1])
+            if 'dataset' in dataset:
+                # dataset wrapper
+                replace_img(dataset.dataset.pipeline[0])
+                replace_ann(dataset.dataset.pipeline[1])
+        else:
+            # dataset wrapper
+            replace_img(dataset.dataset.pipeline[0])
+            replace_ann(dataset.dataset.pipeline[1])
+
+    def _process_evaluator(evaluator, name):
+        if evaluator['type'] == 'CocoPanopticMetric':
+            evaluator['backend_args'] = backend_args
+
+    # half ceph
+    _process_pipeline(cfg.train_dataloader.dataset, cfg.filename)
+    _process_pipeline(cfg.val_dataloader.dataset, cfg.filename)
+    _process_pipeline(cfg.test_dataloader.dataset, cfg.filename)
+    _process_evaluator(cfg.val_evaluator, cfg.filename)
+    _process_evaluator(cfg.test_evaluator, cfg.filename)
diff --git a/mmde/mmdet/utils/__init__.py b/mmde/mmdet/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..449a890bac411f84790eb3d014175e3a48757847
--- /dev/null
+++ b/mmde/mmdet/utils/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .collect_env import collect_env
+from .compat_config import compat_cfg
+from .dist_utils import (all_reduce_dict, allreduce_grads, reduce_mean,
+                         sync_random_seed)
+from .logger import get_caller_name, log_img_scale
+from .memory import AvoidCUDAOOM, AvoidOOM
+from .misc import (find_latest_checkpoint, get_test_pipeline_cfg,
+                   update_data_root)
+from .mot_error_visualize import imshow_mot_errors
+from .replace_cfg_vals import replace_cfg_vals
+from .setup_env import (register_all_modules, setup_cache_size_limit_of_dynamo,
+                        setup_multi_processes)
+from .split_batch import split_batch
+from .typing_utils import (ConfigType, InstanceList, MultiConfig,
+                           OptConfigType, OptInstanceList, OptMultiConfig,
+                           OptPixelList, PixelList, RangeType)
+
+__all__ = [
+    'collect_env', 'find_latest_checkpoint', 'update_data_root',
+    'setup_multi_processes', 'get_caller_name', 'log_img_scale', 'compat_cfg',
+    'split_batch', 'register_all_modules', 'replace_cfg_vals', 'AvoidOOM',
+    'AvoidCUDAOOM', 'all_reduce_dict', 'allreduce_grads', 'reduce_mean',
+    'sync_random_seed', 'ConfigType', 'InstanceList', 'MultiConfig',
+    'OptConfigType', 'OptInstanceList', 'OptMultiConfig', 'OptPixelList',
+    'PixelList', 'RangeType', 'get_test_pipeline_cfg',
+    'setup_cache_size_limit_of_dynamo', 'imshow_mot_errors'
+]
diff --git a/mmde/mmdet/utils/benchmark.py b/mmde/mmdet/utils/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..5419b2d175e3c48c063a39ae28758b386f9ab597
--- /dev/null
+++ b/mmde/mmdet/utils/benchmark.py
@@ -0,0 +1,529 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import time
+from functools import partial
+from typing import List, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import fuse_conv_bn
+# TODO need update
+# from mmcv.runner import wrap_fp16_model
+from mmengine import MMLogger
+from mmengine.config import Config
+from mmengine.device import get_max_cuda_memory
+from mmengine.dist import get_world_size
+from mmengine.runner import Runner, load_checkpoint
+from mmengine.utils.dl_utils import set_multi_processing
+from torch.nn.parallel import DistributedDataParallel
+
+from mmdet.registry import DATASETS, MODELS
+
+try:
+    import psutil
+except ImportError:
+    psutil = None
+
+
+def custom_round(value: Union[int, float],
+                 factor: Union[int, float],
+                 precision: int = 2) -> float:
+    """Custom round function."""
+    return round(value / factor, precision)
+
+
+gb_round = partial(custom_round, factor=1024**3)
+
+
+def print_log(msg: str, logger: Optional[MMLogger] = None) -> None:
+    """Print a log message."""
+    if logger is None:
+        print(msg, flush=True)
+    else:
+        logger.info(msg)
+
+
+def print_process_memory(p: psutil.Process,
+                         logger: Optional[MMLogger] = None) -> None:
+    """print process memory info."""
+    mem_used = gb_round(psutil.virtual_memory().used)
+    memory_full_info = p.memory_full_info()
+    uss_mem = gb_round(memory_full_info.uss)
+    if hasattr(memory_full_info, 'pss'):
+        pss_mem = gb_round(memory_full_info.pss)
+
+    for children in p.children():
+        child_mem_info = children.memory_full_info()
+        uss_mem += gb_round(child_mem_info.uss)
+        if hasattr(child_mem_info, 'pss'):
+            pss_mem += gb_round(child_mem_info.pss)
+
+    process_count = 1 + len(p.children())
+
+    log_msg = f'(GB) mem_used: {mem_used:.2f} | uss: {uss_mem:.2f} | '
+    if hasattr(memory_full_info, 'pss'):
+        log_msg += f'pss: {pss_mem:.2f} | '
+    log_msg += f'total_proc: {process_count}'
+    print_log(log_msg, logger)
+
+
+class BaseBenchmark:
+    """The benchmark base class.
+
+    The ``run`` method is an external calling interface, and it will
+    call the ``run_once`` method ``repeat_num`` times for benchmarking.
+    Finally, call the ``average_multiple_runs`` method to further process
+    the results of multiple runs.
+
+    Args:
+        max_iter (int): maximum iterations of benchmark.
+        log_interval (int): interval of logging.
+        num_warmup (int): Number of Warmup.
+        logger (MMLogger, optional): Formatted logger used to record messages.
+    """
+
+    def __init__(self,
+                 max_iter: int,
+                 log_interval: int,
+                 num_warmup: int,
+                 logger: Optional[MMLogger] = None):
+        self.max_iter = max_iter
+        self.log_interval = log_interval
+        self.num_warmup = num_warmup
+        self.logger = logger
+
+    def run(self, repeat_num: int = 1) -> dict:
+        """benchmark entry method.
+
+        Args:
+            repeat_num (int): Number of repeat benchmark.
+                Defaults to 1.
+        """
+        assert repeat_num >= 1
+
+        results = []
+        for _ in range(repeat_num):
+            results.append(self.run_once())
+
+        results = self.average_multiple_runs(results)
+        return results
+
+    def run_once(self) -> dict:
+        """Executes the benchmark once."""
+        raise NotImplementedError()
+
+    def average_multiple_runs(self, results: List[dict]) -> dict:
+        """Average the results of multiple runs."""
+        raise NotImplementedError()
+
+
+class InferenceBenchmark(BaseBenchmark):
+    """The inference benchmark class. It will be statistical inference FPS,
+    CUDA memory and CPU memory information.
+
+    Args:
+        cfg (mmengine.Config): config.
+        checkpoint (str): Accept local filepath, URL, ``torchvision://xxx``,
+            ``open-mmlab://xxx``.
+        distributed (bool): distributed testing flag.
+        is_fuse_conv_bn (bool): Whether to fuse conv and bn, this will
+            slightly increase the inference speed.
+        max_iter (int): maximum iterations of benchmark. Defaults to 2000.
+        log_interval (int): interval of logging. Defaults to 50.
+        num_warmup (int): Number of Warmup. Defaults to 5.
+        logger (MMLogger, optional): Formatted logger used to record messages.
+    """
+
+    def __init__(self,
+                 cfg: Config,
+                 checkpoint: str,
+                 distributed: bool,
+                 is_fuse_conv_bn: bool,
+                 max_iter: int = 2000,
+                 log_interval: int = 50,
+                 num_warmup: int = 5,
+                 logger: Optional[MMLogger] = None):
+        super().__init__(max_iter, log_interval, num_warmup, logger)
+
+        assert get_world_size(
+        ) == 1, 'Inference benchmark does not allow distributed multi-GPU'
+
+        self.cfg = copy.deepcopy(cfg)
+        self.distributed = distributed
+
+        if psutil is None:
+            raise ImportError('psutil is not installed, please install it by: '
+                              'pip install psutil')
+
+        self._process = psutil.Process()
+        env_cfg = self.cfg.get('env_cfg')
+        if env_cfg.get('cudnn_benchmark'):
+            torch.backends.cudnn.benchmark = True
+
+        mp_cfg: dict = env_cfg.get('mp_cfg', {})
+        set_multi_processing(**mp_cfg, distributed=self.distributed)
+
+        print_log('before build: ', self.logger)
+        print_process_memory(self._process, self.logger)
+
+        self.model = self._init_model(checkpoint, is_fuse_conv_bn)
+
+        # Because multiple processes will occupy additional CPU resources,
+        # FPS statistics will be more unstable when num_workers is not 0.
+        # It is reasonable to set num_workers to 0.
+        dataloader_cfg = cfg.test_dataloader
+        dataloader_cfg['num_workers'] = 0
+        dataloader_cfg['batch_size'] = 1
+        dataloader_cfg['persistent_workers'] = False
+        self.data_loader = Runner.build_dataloader(dataloader_cfg)
+
+        print_log('after build: ', self.logger)
+        print_process_memory(self._process, self.logger)
+
+    def _init_model(self, checkpoint: str, is_fuse_conv_bn: bool) -> nn.Module:
+        """Initialize the model."""
+        model = MODELS.build(self.cfg.model)
+        # TODO need update
+        # fp16_cfg = self.cfg.get('fp16', None)
+        # if fp16_cfg is not None:
+        #     wrap_fp16_model(model)
+
+        load_checkpoint(model, checkpoint, map_location='cpu')
+        if is_fuse_conv_bn:
+            model = fuse_conv_bn(model)
+
+        model = model.cuda()
+
+        if self.distributed:
+            model = DistributedDataParallel(
+                model,
+                device_ids=[torch.cuda.current_device()],
+                broadcast_buffers=False,
+                find_unused_parameters=False)
+
+        model.eval()
+        return model
+
+    def run_once(self) -> dict:
+        """Executes the benchmark once."""
+        pure_inf_time = 0
+        fps = 0
+
+        for i, data in enumerate(self.data_loader):
+
+            if (i + 1) % self.log_interval == 0:
+                print_log('==================================', self.logger)
+
+            torch.cuda.synchronize()
+            start_time = time.perf_counter()
+
+            with torch.no_grad():
+                self.model.test_step(data)
+
+            torch.cuda.synchronize()
+            elapsed = time.perf_counter() - start_time
+
+            if i >= self.num_warmup:
+                pure_inf_time += elapsed
+                if (i + 1) % self.log_interval == 0:
+                    fps = (i + 1 - self.num_warmup) / pure_inf_time
+                    cuda_memory = get_max_cuda_memory()
+
+                    print_log(
+                        f'Done image [{i + 1:<3}/{self.max_iter}], '
+                        f'fps: {fps:.1f} img/s, '
+                        f'times per image: {1000 / fps:.1f} ms/img, '
+                        f'cuda memory: {cuda_memory} MB', self.logger)
+                    print_process_memory(self._process, self.logger)
+
+            if (i + 1) == self.max_iter:
+                fps = (i + 1 - self.num_warmup) / pure_inf_time
+                break
+
+        return {'fps': fps}
+
+    def average_multiple_runs(self, results: List[dict]) -> dict:
+        """Average the results of multiple runs."""
+        print_log('============== Done ==================', self.logger)
+
+        fps_list_ = [round(result['fps'], 1) for result in results]
+        avg_fps_ = sum(fps_list_) / len(fps_list_)
+        outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_}
+
+        if len(fps_list_) > 1:
+            times_pre_image_list_ = [
+                round(1000 / result['fps'], 1) for result in results
+            ]
+            avg_times_pre_image_ = sum(times_pre_image_list_) / len(
+                times_pre_image_list_)
+
+            print_log(
+                f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, '
+                'times per image: '
+                f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] '
+                'ms/img', self.logger)
+        else:
+            print_log(
+                f'Overall fps: {fps_list_[0]:.1f} img/s, '
+                f'times per image: {1000 / fps_list_[0]:.1f} ms/img',
+                self.logger)
+
+        print_log(f'cuda memory: {get_max_cuda_memory()} MB', self.logger)
+        print_process_memory(self._process, self.logger)
+
+        return outputs
+
+
+class DataLoaderBenchmark(BaseBenchmark):
+    """The dataloader benchmark class. It will be statistical inference FPS and
+    CPU memory information.
+
+    Args:
+        cfg (mmengine.Config): config.
+        distributed (bool): distributed testing flag.
+        dataset_type (str): benchmark data type, only supports ``train``,
+            ``val`` and ``test``.
+        max_iter (int): maximum iterations of benchmark. Defaults to 2000.
+        log_interval (int): interval of logging. Defaults to 50.
+        num_warmup (int): Number of Warmup. Defaults to 5.
+        logger (MMLogger, optional): Formatted logger used to record messages.
+    """
+
+    def __init__(self,
+                 cfg: Config,
+                 distributed: bool,
+                 dataset_type: str,
+                 max_iter: int = 2000,
+                 log_interval: int = 50,
+                 num_warmup: int = 5,
+                 logger: Optional[MMLogger] = None):
+        super().__init__(max_iter, log_interval, num_warmup, logger)
+
+        assert dataset_type in ['train', 'val', 'test'], \
+            'dataset_type only supports train,' \
+            f' val and test, but got {dataset_type}'
+        assert get_world_size(
+        ) == 1, 'Dataloader benchmark does not allow distributed multi-GPU'
+
+        self.cfg = copy.deepcopy(cfg)
+        self.distributed = distributed
+
+        if psutil is None:
+            raise ImportError('psutil is not installed, please install it by: '
+                              'pip install psutil')
+        self._process = psutil.Process()
+
+        mp_cfg = self.cfg.get('env_cfg', {}).get('mp_cfg')
+        if mp_cfg is not None:
+            set_multi_processing(distributed=self.distributed, **mp_cfg)
+        else:
+            set_multi_processing(distributed=self.distributed)
+
+        print_log('before build: ', self.logger)
+        print_process_memory(self._process, self.logger)
+
+        if dataset_type == 'train':
+            self.data_loader = Runner.build_dataloader(cfg.train_dataloader)
+        elif dataset_type == 'test':
+            self.data_loader = Runner.build_dataloader(cfg.test_dataloader)
+        else:
+            self.data_loader = Runner.build_dataloader(cfg.val_dataloader)
+
+        self.batch_size = self.data_loader.batch_size
+        self.num_workers = self.data_loader.num_workers
+
+        print_log('after build: ', self.logger)
+        print_process_memory(self._process, self.logger)
+
+    def run_once(self) -> dict:
+        """Executes the benchmark once."""
+        pure_inf_time = 0
+        fps = 0
+
+        # benchmark with 2000 image and take the average
+        start_time = time.perf_counter()
+        for i, data in enumerate(self.data_loader):
+            elapsed = time.perf_counter() - start_time
+
+            if (i + 1) % self.log_interval == 0:
+                print_log('==================================', self.logger)
+
+            if i >= self.num_warmup:
+                pure_inf_time += elapsed
+                if (i + 1) % self.log_interval == 0:
+                    fps = (i + 1 - self.num_warmup) / pure_inf_time
+
+                    print_log(
+                        f'Done batch [{i + 1:<3}/{self.max_iter}], '
+                        f'fps: {fps:.1f} batch/s, '
+                        f'times per batch: {1000 / fps:.1f} ms/batch, '
+                        f'batch size: {self.batch_size}, num_workers: '
+                        f'{self.num_workers}', self.logger)
+                    print_process_memory(self._process, self.logger)
+
+            if (i + 1) == self.max_iter:
+                fps = (i + 1 - self.num_warmup) / pure_inf_time
+                break
+
+            start_time = time.perf_counter()
+
+        return {'fps': fps}
+
+    def average_multiple_runs(self, results: List[dict]) -> dict:
+        """Average the results of multiple runs."""
+        print_log('============== Done ==================', self.logger)
+
+        fps_list_ = [round(result['fps'], 1) for result in results]
+        avg_fps_ = sum(fps_list_) / len(fps_list_)
+        outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_}
+
+        if len(fps_list_) > 1:
+            times_pre_image_list_ = [
+                round(1000 / result['fps'], 1) for result in results
+            ]
+            avg_times_pre_image_ = sum(times_pre_image_list_) / len(
+                times_pre_image_list_)
+
+            print_log(
+                f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, '
+                'times per batch: '
+                f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] '
+                f'ms/batch, batch size: {self.batch_size}, num_workers: '
+                f'{self.num_workers}', self.logger)
+        else:
+            print_log(
+                f'Overall fps: {fps_list_[0]:.1f} batch/s, '
+                f'times per batch: {1000 / fps_list_[0]:.1f} ms/batch, '
+                f'batch size: {self.batch_size}, num_workers: '
+                f'{self.num_workers}', self.logger)
+
+        print_process_memory(self._process, self.logger)
+
+        return outputs
+
+
+class DatasetBenchmark(BaseBenchmark):
+    """The dataset benchmark class. It will be statistical inference FPS, FPS
+    pre transform and CPU memory information.
+
+    Args:
+        cfg (mmengine.Config): config.
+        dataset_type (str): benchmark data type, only supports ``train``,
+            ``val`` and ``test``.
+        max_iter (int): maximum iterations of benchmark. Defaults to 2000.
+        log_interval (int): interval of logging. Defaults to 50.
+        num_warmup (int): Number of Warmup. Defaults to 5.
+        logger (MMLogger, optional): Formatted logger used to record messages.
+    """
+
+    def __init__(self,
+                 cfg: Config,
+                 dataset_type: str,
+                 max_iter: int = 2000,
+                 log_interval: int = 50,
+                 num_warmup: int = 5,
+                 logger: Optional[MMLogger] = None):
+        super().__init__(max_iter, log_interval, num_warmup, logger)
+        assert dataset_type in ['train', 'val', 'test'], \
+            'dataset_type only supports train,' \
+            f' val and test, but got {dataset_type}'
+        assert get_world_size(
+        ) == 1, 'Dataset benchmark does not allow distributed multi-GPU'
+        self.cfg = copy.deepcopy(cfg)
+
+        if dataset_type == 'train':
+            dataloader_cfg = copy.deepcopy(cfg.train_dataloader)
+        elif dataset_type == 'test':
+            dataloader_cfg = copy.deepcopy(cfg.test_dataloader)
+        else:
+            dataloader_cfg = copy.deepcopy(cfg.val_dataloader)
+
+        dataset_cfg = dataloader_cfg.pop('dataset')
+        dataset = DATASETS.build(dataset_cfg)
+        if hasattr(dataset, 'full_init'):
+            dataset.full_init()
+        self.dataset = dataset
+
+    def run_once(self) -> dict:
+        """Executes the benchmark once."""
+        pure_inf_time = 0
+        fps = 0
+
+        total_index = list(range(len(self.dataset)))
+        np.random.shuffle(total_index)
+
+        start_time = time.perf_counter()
+        for i, idx in enumerate(total_index):
+            if (i + 1) % self.log_interval == 0:
+                print_log('==================================', self.logger)
+
+            get_data_info_start_time = time.perf_counter()
+            data_info = self.dataset.get_data_info(idx)
+            get_data_info_elapsed = time.perf_counter(
+            ) - get_data_info_start_time
+
+            if (i + 1) % self.log_interval == 0:
+                print_log(f'get_data_info - {get_data_info_elapsed * 1000} ms',
+                          self.logger)
+
+            for t in self.dataset.pipeline.transforms:
+                transform_start_time = time.perf_counter()
+                data_info = t(data_info)
+                transform_elapsed = time.perf_counter() - transform_start_time
+
+                if (i + 1) % self.log_interval == 0:
+                    print_log(
+                        f'{t.__class__.__name__} - '
+                        f'{transform_elapsed * 1000} ms', self.logger)
+
+                if data_info is None:
+                    break
+
+            elapsed = time.perf_counter() - start_time
+
+            if i >= self.num_warmup:
+                pure_inf_time += elapsed
+                if (i + 1) % self.log_interval == 0:
+                    fps = (i + 1 - self.num_warmup) / pure_inf_time
+
+                    print_log(
+                        f'Done img [{i + 1:<3}/{self.max_iter}], '
+                        f'fps: {fps:.1f} img/s, '
+                        f'times per img: {1000 / fps:.1f} ms/img', self.logger)
+
+            if (i + 1) == self.max_iter:
+                fps = (i + 1 - self.num_warmup) / pure_inf_time
+                break
+
+            start_time = time.perf_counter()
+
+        return {'fps': fps}
+
+    def average_multiple_runs(self, results: List[dict]) -> dict:
+        """Average the results of multiple runs."""
+        print_log('============== Done ==================', self.logger)
+
+        fps_list_ = [round(result['fps'], 1) for result in results]
+        avg_fps_ = sum(fps_list_) / len(fps_list_)
+        outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_}
+
+        if len(fps_list_) > 1:
+            times_pre_image_list_ = [
+                round(1000 / result['fps'], 1) for result in results
+            ]
+            avg_times_pre_image_ = sum(times_pre_image_list_) / len(
+                times_pre_image_list_)
+
+            print_log(
+                f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, '
+                'times per img: '
+                f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] '
+                'ms/img', self.logger)
+        else:
+            print_log(
+                f'Overall fps: {fps_list_[0]:.1f} img/s, '
+                f'times per img: {1000 / fps_list_[0]:.1f} ms/img',
+                self.logger)
+
+        return outputs
diff --git a/mmde/mmdet/utils/collect_env.py b/mmde/mmdet/utils/collect_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0eed80fe2e4630b78ea3b13fde6046914e47e8b
--- /dev/null
+++ b/mmde/mmdet/utils/collect_env.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.utils import get_git_hash
+from mmengine.utils.dl_utils import collect_env as collect_base_env
+
+import mmdet
+
+
+def collect_env():
+    """Collect the information of the running environments."""
+    env_info = collect_base_env()
+    env_info['MMDetection'] = mmdet.__version__ + '+' + get_git_hash()[:7]
+    return env_info
+
+
+if __name__ == '__main__':
+    for name, val in collect_env().items():
+        print(f'{name}: {val}')
diff --git a/mmde/mmdet/utils/compat_config.py b/mmde/mmdet/utils/compat_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..133adb65c2276401eca947e223e5b7c1760de418
--- /dev/null
+++ b/mmde/mmdet/utils/compat_config.py
@@ -0,0 +1,139 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+
+from mmengine.config import ConfigDict
+
+
+def compat_cfg(cfg):
+    """This function would modify some filed to keep the compatibility of
+    config.
+
+    For example, it will move some args which will be deprecated to the correct
+    fields.
+    """
+    cfg = copy.deepcopy(cfg)
+    cfg = compat_imgs_per_gpu(cfg)
+    cfg = compat_loader_args(cfg)
+    cfg = compat_runner_args(cfg)
+    return cfg
+
+
+def compat_runner_args(cfg):
+    if 'runner' not in cfg:
+        cfg.runner = ConfigDict({
+            'type': 'EpochBasedRunner',
+            'max_epochs': cfg.total_epochs
+        })
+        warnings.warn(
+            'config is now expected to have a `runner` section, '
+            'please set `runner` in your config.', UserWarning)
+    else:
+        if 'total_epochs' in cfg:
+            assert cfg.total_epochs == cfg.runner.max_epochs
+    return cfg
+
+
+def compat_imgs_per_gpu(cfg):
+    cfg = copy.deepcopy(cfg)
+    if 'imgs_per_gpu' in cfg.data:
+        warnings.warn('"imgs_per_gpu" is deprecated in MMDet V2.0. '
+                      'Please use "samples_per_gpu" instead')
+        if 'samples_per_gpu' in cfg.data:
+            warnings.warn(
+                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
+                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
+                f'={cfg.data.imgs_per_gpu} is used in this experiments')
+        else:
+            warnings.warn('Automatically set "samples_per_gpu"="imgs_per_gpu"='
+                          f'{cfg.data.imgs_per_gpu} in this experiments')
+        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
+    return cfg
+
+
+def compat_loader_args(cfg):
+    """Deprecated sample_per_gpu in cfg.data."""
+
+    cfg = copy.deepcopy(cfg)
+    if 'train_dataloader' not in cfg.data:
+        cfg.data['train_dataloader'] = ConfigDict()
+    if 'val_dataloader' not in cfg.data:
+        cfg.data['val_dataloader'] = ConfigDict()
+    if 'test_dataloader' not in cfg.data:
+        cfg.data['test_dataloader'] = ConfigDict()
+
+    # special process for train_dataloader
+    if 'samples_per_gpu' in cfg.data:
+
+        samples_per_gpu = cfg.data.pop('samples_per_gpu')
+        assert 'samples_per_gpu' not in \
+               cfg.data.train_dataloader, ('`samples_per_gpu` are set '
+                                           'in `data` field and ` '
+                                           'data.train_dataloader` '
+                                           'at the same time. '
+                                           'Please only set it in '
+                                           '`data.train_dataloader`. ')
+        cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu
+
+    if 'persistent_workers' in cfg.data:
+
+        persistent_workers = cfg.data.pop('persistent_workers')
+        assert 'persistent_workers' not in \
+               cfg.data.train_dataloader, ('`persistent_workers` are set '
+                                           'in `data` field and ` '
+                                           'data.train_dataloader` '
+                                           'at the same time. '
+                                           'Please only set it in '
+                                           '`data.train_dataloader`. ')
+        cfg.data.train_dataloader['persistent_workers'] = persistent_workers
+
+    if 'workers_per_gpu' in cfg.data:
+
+        workers_per_gpu = cfg.data.pop('workers_per_gpu')
+        cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu
+        cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu
+        cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu
+
+    # special process for val_dataloader
+    if 'samples_per_gpu' in cfg.data.val:
+        # keep default value of `sample_per_gpu` is 1
+        assert 'samples_per_gpu' not in \
+               cfg.data.val_dataloader, ('`samples_per_gpu` are set '
+                                         'in `data.val` field and ` '
+                                         'data.val_dataloader` at '
+                                         'the same time. '
+                                         'Please only set it in '
+                                         '`data.val_dataloader`. ')
+        cfg.data.val_dataloader['samples_per_gpu'] = \
+            cfg.data.val.pop('samples_per_gpu')
+    # special process for val_dataloader
+
+    # in case the test dataset is concatenated
+    if isinstance(cfg.data.test, dict):
+        if 'samples_per_gpu' in cfg.data.test:
+            assert 'samples_per_gpu' not in \
+                   cfg.data.test_dataloader, ('`samples_per_gpu` are set '
+                                              'in `data.test` field and ` '
+                                              'data.test_dataloader` '
+                                              'at the same time. '
+                                              'Please only set it in '
+                                              '`data.test_dataloader`. ')
+
+            cfg.data.test_dataloader['samples_per_gpu'] = \
+                cfg.data.test.pop('samples_per_gpu')
+
+    elif isinstance(cfg.data.test, list):
+        for ds_cfg in cfg.data.test:
+            if 'samples_per_gpu' in ds_cfg:
+                assert 'samples_per_gpu' not in \
+                       cfg.data.test_dataloader, ('`samples_per_gpu` are set '
+                                                  'in `data.test` field and ` '
+                                                  'data.test_dataloader` at'
+                                                  ' the same time. '
+                                                  'Please only set it in '
+                                                  '`data.test_dataloader`. ')
+        samples_per_gpu = max(
+            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+        cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu
+
+    return cfg
diff --git a/mmde/mmdet/utils/contextmanagers.py b/mmde/mmdet/utils/contextmanagers.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa12bfcaff1e781b0a8cc7d7c8b839c2f2955a05
--- /dev/null
+++ b/mmde/mmdet/utils/contextmanagers.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import asyncio
+import contextlib
+import logging
+import os
+import time
+from typing import List
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False))
+
+
+@contextlib.asynccontextmanager
+async def completed(trace_name='',
+                    name='',
+                    sleep_interval=0.05,
+                    streams: List[torch.cuda.Stream] = None):
+    """Async context manager that waits for work to complete on given CUDA
+    streams."""
+    if not torch.cuda.is_available():
+        yield
+        return
+
+    stream_before_context_switch = torch.cuda.current_stream()
+    if not streams:
+        streams = [stream_before_context_switch]
+    else:
+        streams = [s if s else stream_before_context_switch for s in streams]
+
+    end_events = [
+        torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams
+    ]
+
+    if DEBUG_COMPLETED_TIME:
+        start = torch.cuda.Event(enable_timing=True)
+        stream_before_context_switch.record_event(start)
+
+        cpu_start = time.monotonic()
+    logger.debug('%s %s starting, streams: %s', trace_name, name, streams)
+    grad_enabled_before = torch.is_grad_enabled()
+    try:
+        yield
+    finally:
+        current_stream = torch.cuda.current_stream()
+        assert current_stream == stream_before_context_switch
+
+        if DEBUG_COMPLETED_TIME:
+            cpu_end = time.monotonic()
+        for i, stream in enumerate(streams):
+            event = end_events[i]
+            stream.record_event(event)
+
+        grad_enabled_after = torch.is_grad_enabled()
+
+        # observed change of torch.is_grad_enabled() during concurrent run of
+        # async_test_bboxes code
+        assert (grad_enabled_before == grad_enabled_after
+                ), 'Unexpected is_grad_enabled() value change'
+
+        are_done = [e.query() for e in end_events]
+        logger.debug('%s %s completed: %s streams: %s', trace_name, name,
+                     are_done, streams)
+        with torch.cuda.stream(stream_before_context_switch):
+            while not all(are_done):
+                await asyncio.sleep(sleep_interval)
+                are_done = [e.query() for e in end_events]
+                logger.debug(
+                    '%s %s completed: %s streams: %s',
+                    trace_name,
+                    name,
+                    are_done,
+                    streams,
+                )
+
+        current_stream = torch.cuda.current_stream()
+        assert current_stream == stream_before_context_switch
+
+        if DEBUG_COMPLETED_TIME:
+            cpu_time = (cpu_end - cpu_start) * 1000
+            stream_times_ms = ''
+            for i, stream in enumerate(streams):
+                elapsed_time = start.elapsed_time(end_events[i])
+                stream_times_ms += f' {stream} {elapsed_time:.2f} ms'
+            logger.info('%s %s %.2f ms %s', trace_name, name, cpu_time,
+                        stream_times_ms)
+
+
+@contextlib.asynccontextmanager
+async def concurrent(streamqueue: asyncio.Queue,
+                     trace_name='concurrent',
+                     name='stream'):
+    """Run code concurrently in different streams.
+
+    :param streamqueue: asyncio.Queue instance.
+
+    Queue tasks define the pool of streams used for concurrent execution.
+    """
+    if not torch.cuda.is_available():
+        yield
+        return
+
+    initial_stream = torch.cuda.current_stream()
+
+    with torch.cuda.stream(initial_stream):
+        stream = await streamqueue.get()
+        assert isinstance(stream, torch.cuda.Stream)
+
+        try:
+            with torch.cuda.stream(stream):
+                logger.debug('%s %s is starting, stream: %s', trace_name, name,
+                             stream)
+                yield
+                current = torch.cuda.current_stream()
+                assert current == stream
+                logger.debug('%s %s has finished, stream: %s', trace_name,
+                             name, stream)
+        finally:
+            streamqueue.task_done()
+            streamqueue.put_nowait(stream)
diff --git a/mmde/mmdet/utils/dist_utils.py b/mmde/mmdet/utils/dist_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f2c8614a181ec0594ba157002a2760737e2c6e3
--- /dev/null
+++ b/mmde/mmdet/utils/dist_utils.py
@@ -0,0 +1,184 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import functools
+import pickle
+import warnings
+from collections import OrderedDict
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmengine.dist import get_dist_info
+from torch._utils import (_flatten_dense_tensors, _take_tensors,
+                          _unflatten_dense_tensors)
+
+
+def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
+    if bucket_size_mb > 0:
+        bucket_size_bytes = bucket_size_mb * 1024 * 1024
+        buckets = _take_tensors(tensors, bucket_size_bytes)
+    else:
+        buckets = OrderedDict()
+        for tensor in tensors:
+            tp = tensor.type()
+            if tp not in buckets:
+                buckets[tp] = []
+            buckets[tp].append(tensor)
+        buckets = buckets.values()
+
+    for bucket in buckets:
+        flat_tensors = _flatten_dense_tensors(bucket)
+        dist.all_reduce(flat_tensors)
+        flat_tensors.div_(world_size)
+        for tensor, synced in zip(
+                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
+            tensor.copy_(synced)
+
+
+def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
+    """Allreduce gradients.
+
+    Args:
+        params (list[torch.Parameters]): List of parameters of a model
+        coalesce (bool, optional): Whether allreduce parameters as a whole.
+            Defaults to True.
+        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
+            Defaults to -1.
+    """
+    grads = [
+        param.grad.data for param in params
+        if param.requires_grad and param.grad is not None
+    ]
+    world_size = dist.get_world_size()
+    if coalesce:
+        _allreduce_coalesced(grads, world_size, bucket_size_mb)
+    else:
+        for tensor in grads:
+            dist.all_reduce(tensor.div_(world_size))
+
+
+def reduce_mean(tensor):
+    """"Obtain the mean of tensor on different GPUs."""
+    if not (dist.is_available() and dist.is_initialized()):
+        return tensor
+    tensor = tensor.clone()
+    dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
+    return tensor
+
+
+def obj2tensor(pyobj, device='cuda'):
+    """Serialize picklable python object to tensor."""
+    storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj))
+    return torch.ByteTensor(storage).to(device=device)
+
+
+def tensor2obj(tensor):
+    """Deserialize tensor to picklable python object."""
+    return pickle.loads(tensor.cpu().numpy().tobytes())
+
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+    """Return a process group based on gloo backend, containing all the ranks
+    The result is cached."""
+    if dist.get_backend() == 'nccl':
+        return dist.new_group(backend='gloo')
+    else:
+        return dist.group.WORLD
+
+
+def all_reduce_dict(py_dict, op='sum', group=None, to_float=True):
+    """Apply all reduce function for python dict object.
+
+    The code is modified from https://github.com/Megvii-
+    BaseDetection/YOLOX/blob/main/yolox/utils/allreduce_norm.py.
+
+    NOTE: make sure that py_dict in different ranks has the same keys and
+    the values should be in the same shape. Currently only supports
+    nccl backend.
+
+    Args:
+        py_dict (dict): Dict to be applied all reduce op.
+        op (str): Operator, could be 'sum' or 'mean'. Default: 'sum'
+        group (:obj:`torch.distributed.group`, optional): Distributed group,
+            Default: None.
+        to_float (bool): Whether to convert all values of dict to float.
+            Default: True.
+
+    Returns:
+        OrderedDict: reduced python dict object.
+    """
+    warnings.warn(
+        'group` is deprecated. Currently only supports NCCL backend.')
+    _, world_size = get_dist_info()
+    if world_size == 1:
+        return py_dict
+
+    # all reduce logic across different devices.
+    py_key = list(py_dict.keys())
+    if not isinstance(py_dict, OrderedDict):
+        py_key_tensor = obj2tensor(py_key)
+        dist.broadcast(py_key_tensor, src=0)
+        py_key = tensor2obj(py_key_tensor)
+
+    tensor_shapes = [py_dict[k].shape for k in py_key]
+    tensor_numels = [py_dict[k].numel() for k in py_key]
+
+    if to_float:
+        warnings.warn('Note: the "to_float" is True, you need to '
+                      'ensure that the behavior is reasonable.')
+        flatten_tensor = torch.cat(
+            [py_dict[k].flatten().float() for k in py_key])
+    else:
+        flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key])
+
+    dist.all_reduce(flatten_tensor, op=dist.ReduceOp.SUM)
+    if op == 'mean':
+        flatten_tensor /= world_size
+
+    split_tensors = [
+        x.reshape(shape) for x, shape in zip(
+            torch.split(flatten_tensor, tensor_numels), tensor_shapes)
+    ]
+    out_dict = {k: v for k, v in zip(py_key, split_tensors)}
+    if isinstance(py_dict, OrderedDict):
+        out_dict = OrderedDict(out_dict)
+    return out_dict
+
+
+def sync_random_seed(seed=None, device='cuda'):
+    """Make sure different ranks share the same seed.
+
+    All workers must call this function, otherwise it will deadlock.
+    This method is generally used in `DistributedSampler`,
+    because the seed should be identical across all processes
+    in the distributed group.
+
+    In distributed sampling, different ranks should sample non-overlapped
+    data in the dataset. Therefore, this function is used to make sure that
+    each rank shuffles the data indices in the same order based
+    on the same seed. Then different ranks could use different indices
+    to select non-overlapped data from the same data list.
+
+    Args:
+        seed (int, Optional): The seed. Default to None.
+        device (str): The device where the seed will be put on.
+            Default to 'cuda'.
+
+    Returns:
+        int: Seed to be used.
+    """
+    if seed is None:
+        seed = np.random.randint(2**31)
+    assert isinstance(seed, int)
+
+    rank, world_size = get_dist_info()
+
+    if world_size == 1:
+        return seed
+
+    if rank == 0:
+        random_num = torch.tensor(seed, dtype=torch.int32, device=device)
+    else:
+        random_num = torch.tensor(0, dtype=torch.int32, device=device)
+    dist.broadcast(random_num, src=0)
+    return random_num.item()
diff --git a/mmde/mmdet/utils/large_image.py b/mmde/mmdet/utils/large_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f07c2bdc6958f2b3bdd69da0a639276252a91e
--- /dev/null
+++ b/mmde/mmdet/utils/large_image.py
@@ -0,0 +1,104 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence, Tuple
+
+import torch
+from mmcv.ops import batched_nms
+from mmengine.structures import InstanceData
+
+from mmdet.structures import DetDataSample, SampleList
+
+
+def shift_rbboxes(bboxes: torch.Tensor, offset: Sequence[int]):
+    """Shift rotated bboxes with offset.
+
+    Args:
+        bboxes (Tensor): The rotated bboxes need to be translated.
+            With shape (n, 5), which means (x, y, w, h, a).
+        offset (Sequence[int]): The translation offsets with shape of (2, ).
+    Returns:
+        Tensor: Shifted rotated bboxes.
+    """
+    offset_tensor = bboxes.new_tensor(offset)
+    shifted_bboxes = bboxes.clone()
+    shifted_bboxes[:, 0:2] = shifted_bboxes[:, 0:2] + offset_tensor
+    return shifted_bboxes
+
+
+def shift_predictions(det_data_samples: SampleList,
+                      offsets: Sequence[Tuple[int, int]],
+                      src_image_shape: Tuple[int, int]) -> SampleList:
+    """Shift predictions to the original image.
+
+    Args:
+        det_data_samples (List[:obj:`DetDataSample`]): A list of patch results.
+        offsets (Sequence[Tuple[int, int]]): Positions of the left top points
+            of patches.
+        src_image_shape (Tuple[int, int]): A (height, width) tuple of the large
+            image's width and height.
+    Returns:
+        (List[:obj:`DetDataSample`]): shifted results.
+    """
+    try:
+        from sahi.slicing import shift_bboxes, shift_masks
+    except ImportError:
+        raise ImportError('Please run "pip install -U sahi" '
+                          'to install sahi first for large image inference.')
+
+    assert len(det_data_samples) == len(
+        offsets), 'The `results` should has the ' 'same length with `offsets`.'
+    shifted_predictions = []
+    for det_data_sample, offset in zip(det_data_samples, offsets):
+        pred_inst = det_data_sample.pred_instances.clone()
+
+        # Check bbox type
+        if pred_inst.bboxes.size(-1) == 4:
+            # Horizontal bboxes
+            shifted_bboxes = shift_bboxes(pred_inst.bboxes, offset)
+        elif pred_inst.bboxes.size(-1) == 5:
+            # Rotated bboxes
+            shifted_bboxes = shift_rbboxes(pred_inst.bboxes, offset)
+        else:
+            raise NotImplementedError
+
+        # shift bboxes and masks
+        pred_inst.bboxes = shifted_bboxes
+        if 'masks' in det_data_sample:
+            pred_inst.masks = shift_masks(pred_inst.masks, offset,
+                                          src_image_shape)
+
+        shifted_predictions.append(pred_inst.clone())
+
+    shifted_predictions = InstanceData.cat(shifted_predictions)
+
+    return shifted_predictions
+
+
+def merge_results_by_nms(results: SampleList, offsets: Sequence[Tuple[int,
+                                                                      int]],
+                         src_image_shape: Tuple[int, int],
+                         nms_cfg: dict) -> DetDataSample:
+    """Merge patch results by nms.
+
+    Args:
+        results (List[:obj:`DetDataSample`]): A list of patch results.
+        offsets (Sequence[Tuple[int, int]]): Positions of the left top points
+            of patches.
+        src_image_shape (Tuple[int, int]): A (height, width) tuple of the large
+            image's width and height.
+        nms_cfg (dict): it should specify nms type and other parameters
+            like `iou_threshold`.
+    Returns:
+        :obj:`DetDataSample`: merged results.
+    """
+    shifted_instances = shift_predictions(results, offsets, src_image_shape)
+
+    _, keeps = batched_nms(
+        boxes=shifted_instances.bboxes,
+        scores=shifted_instances.scores,
+        idxs=shifted_instances.labels,
+        nms_cfg=nms_cfg)
+    merged_instances = shifted_instances[keeps]
+
+    merged_result = results[0].clone()
+    merged_result.pred_instances = merged_instances
+    return merged_result
diff --git a/mmde/mmdet/utils/logger.py b/mmde/mmdet/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fec08bbad5517c9169eedb15b4768e7d88d39c7
--- /dev/null
+++ b/mmde/mmdet/utils/logger.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import inspect
+
+from mmengine.logging import print_log
+
+
+def get_caller_name():
+    """Get name of caller method."""
+    # this_func_frame = inspect.stack()[0][0]  # i.e., get_caller_name
+    # callee_frame = inspect.stack()[1][0]  # e.g., log_img_scale
+    caller_frame = inspect.stack()[2][0]  # e.g., caller of log_img_scale
+    caller_method = caller_frame.f_code.co_name
+    try:
+        caller_class = caller_frame.f_locals['self'].__class__.__name__
+        return f'{caller_class}.{caller_method}'
+    except KeyError:  # caller is a function
+        return caller_method
+
+
+def log_img_scale(img_scale, shape_order='hw', skip_square=False):
+    """Log image size.
+
+    Args:
+        img_scale (tuple): Image size to be logged.
+        shape_order (str, optional): The order of image shape.
+            'hw' for (height, width) and 'wh' for (width, height).
+            Defaults to 'hw'.
+        skip_square (bool, optional): Whether to skip logging for square
+            img_scale. Defaults to False.
+
+    Returns:
+        bool: Whether to have done logging.
+    """
+    if shape_order == 'hw':
+        height, width = img_scale
+    elif shape_order == 'wh':
+        width, height = img_scale
+    else:
+        raise ValueError(f'Invalid shape_order {shape_order}.')
+
+    if skip_square and (height == width):
+        return False
+
+    caller = get_caller_name()
+    print_log(
+        f'image shape: height={height}, width={width} in {caller}',
+        logger='current')
+
+    return True
diff --git a/mmde/mmdet/utils/memory.py b/mmde/mmdet/utils/memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6f9cbc7f9e5f54e2cc429e5e655b2a27d38d61f
--- /dev/null
+++ b/mmde/mmdet/utils/memory.py
@@ -0,0 +1,212 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from collections import abc
+from contextlib import contextmanager
+from functools import wraps
+
+import torch
+from mmengine.logging import MMLogger
+
+
+def cast_tensor_type(inputs, src_type=None, dst_type=None):
+    """Recursively convert Tensor in inputs from ``src_type`` to ``dst_type``.
+
+    Args:
+        inputs: Inputs that to be casted.
+        src_type (torch.dtype | torch.device): Source type.
+        src_type (torch.dtype | torch.device): Destination type.
+
+    Returns:
+        The same type with inputs, but all contained Tensors have been cast.
+    """
+    assert dst_type is not None
+    if isinstance(inputs, torch.Tensor):
+        if isinstance(dst_type, torch.device):
+            # convert Tensor to dst_device
+            if hasattr(inputs, 'to') and \
+                    hasattr(inputs, 'device') and \
+                    (inputs.device == src_type or src_type is None):
+                return inputs.to(dst_type)
+            else:
+                return inputs
+        else:
+            # convert Tensor to dst_dtype
+            if hasattr(inputs, 'to') and \
+                    hasattr(inputs, 'dtype') and \
+                    (inputs.dtype == src_type or src_type is None):
+                return inputs.to(dst_type)
+            else:
+                return inputs
+        # we need to ensure that the type of inputs to be casted are the same
+        # as the argument `src_type`.
+    elif isinstance(inputs, abc.Mapping):
+        return type(inputs)({
+            k: cast_tensor_type(v, src_type=src_type, dst_type=dst_type)
+            for k, v in inputs.items()
+        })
+    elif isinstance(inputs, abc.Iterable):
+        return type(inputs)(
+            cast_tensor_type(item, src_type=src_type, dst_type=dst_type)
+            for item in inputs)
+    # TODO: Currently not supported
+    # elif isinstance(inputs, InstanceData):
+    #     for key, value in inputs.items():
+    #         inputs[key] = cast_tensor_type(
+    #             value, src_type=src_type, dst_type=dst_type)
+    #     return inputs
+    else:
+        return inputs
+
+
+@contextmanager
+def _ignore_torch_cuda_oom():
+    """A context which ignores CUDA OOM exception from pytorch.
+
+    Code is modified from
+    <https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/memory.py>  # noqa: E501
+    """
+    try:
+        yield
+    except RuntimeError as e:
+        # NOTE: the string may change?
+        if 'CUDA out of memory. ' in str(e):
+            pass
+        else:
+            raise
+
+
+class AvoidOOM:
+    """Try to convert inputs to FP16 and CPU if got a PyTorch's CUDA Out of
+    Memory error. It will do the following steps:
+
+        1. First retry after calling `torch.cuda.empty_cache()`.
+        2. If that still fails, it will then retry by converting inputs
+          to FP16.
+        3. If that still fails trying to convert inputs to CPUs.
+          In this case, it expects the function to dispatch to
+          CPU implementation.
+
+    Args:
+        to_cpu (bool): Whether to convert outputs to CPU if get an OOM
+            error. This will slow down the code significantly.
+            Defaults to True.
+        test (bool): Skip `_ignore_torch_cuda_oom` operate that can use
+            lightweight data in unit test, only used in
+            test unit. Defaults to False.
+
+    Examples:
+        >>> from mmdet.utils.memory import AvoidOOM
+        >>> AvoidCUDAOOM = AvoidOOM()
+        >>> output = AvoidOOM.retry_if_cuda_oom(
+        >>>     some_torch_function)(input1, input2)
+        >>> # To use as a decorator
+        >>> # from mmdet.utils import AvoidCUDAOOM
+        >>> @AvoidCUDAOOM.retry_if_cuda_oom
+        >>> def function(*args, **kwargs):
+        >>>     return None
+    ```
+
+    Note:
+        1. The output may be on CPU even if inputs are on GPU. Processing
+            on CPU will slow down the code significantly.
+        2. When converting inputs to CPU, it will only look at each argument
+            and check if it has `.device` and `.to` for conversion. Nested
+            structures of tensors are not supported.
+        3. Since the function might be called more than once, it has to be
+            stateless.
+    """
+
+    def __init__(self, to_cpu=True, test=False):
+        self.to_cpu = to_cpu
+        self.test = test
+
+    def retry_if_cuda_oom(self, func):
+        """Makes a function retry itself after encountering pytorch's CUDA OOM
+        error.
+
+        The implementation logic is referred to
+        https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/memory.py
+
+        Args:
+            func: a stateless callable that takes tensor-like objects
+                as arguments.
+        Returns:
+            func: a callable which retries `func` if OOM is encountered.
+        """  # noqa: W605
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+
+            # raw function
+            if not self.test:
+                with _ignore_torch_cuda_oom():
+                    return func(*args, **kwargs)
+
+                # Clear cache and retry
+                torch.cuda.empty_cache()
+                with _ignore_torch_cuda_oom():
+                    return func(*args, **kwargs)
+
+            # get the type and device of first tensor
+            dtype, device = None, None
+            values = args + tuple(kwargs.values())
+            for value in values:
+                if isinstance(value, torch.Tensor):
+                    dtype = value.dtype
+                    device = value.device
+                    break
+            if dtype is None or device is None:
+                raise ValueError('There is no tensor in the inputs, '
+                                 'cannot get dtype and device.')
+
+            # Convert to FP16
+            fp16_args = cast_tensor_type(args, dst_type=torch.half)
+            fp16_kwargs = cast_tensor_type(kwargs, dst_type=torch.half)
+            logger = MMLogger.get_current_instance()
+            logger.warning(f'Attempting to copy inputs of {str(func)} '
+                           'to FP16 due to CUDA OOM')
+
+            # get input tensor type, the output type will same as
+            # the first parameter type.
+            with _ignore_torch_cuda_oom():
+                output = func(*fp16_args, **fp16_kwargs)
+                output = cast_tensor_type(
+                    output, src_type=torch.half, dst_type=dtype)
+                if not self.test:
+                    return output
+            logger.warning('Using FP16 still meet CUDA OOM')
+
+            # Try on CPU. This will slow down the code significantly,
+            # therefore print a notice.
+            if self.to_cpu:
+                logger.warning(f'Attempting to copy inputs of {str(func)} '
+                               'to CPU due to CUDA OOM')
+                cpu_device = torch.empty(0).device
+                cpu_args = cast_tensor_type(args, dst_type=cpu_device)
+                cpu_kwargs = cast_tensor_type(kwargs, dst_type=cpu_device)
+
+                # convert outputs to GPU
+                with _ignore_torch_cuda_oom():
+                    logger.warning(f'Convert outputs to GPU (device={device})')
+                    output = func(*cpu_args, **cpu_kwargs)
+                    output = cast_tensor_type(
+                        output, src_type=cpu_device, dst_type=device)
+                    return output
+
+                warnings.warn('Cannot convert output to GPU due to CUDA OOM, '
+                              'the output is now on CPU, which might cause '
+                              'errors if the output need to interact with GPU '
+                              'data in subsequent operations')
+                logger.warning('Cannot convert output to GPU due to '
+                               'CUDA OOM, the output is on CPU now.')
+
+                return func(*cpu_args, **cpu_kwargs)
+            else:
+                # may still get CUDA OOM error
+                return func(*args, **kwargs)
+
+        return wrapped
+
+
+# To use AvoidOOM as a decorator
+AvoidCUDAOOM = AvoidOOM()
diff --git a/mmde/mmdet/utils/misc.py b/mmde/mmdet/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dfb394465196cbd1e60c96f5be3aaee416d59cf
--- /dev/null
+++ b/mmde/mmdet/utils/misc.py
@@ -0,0 +1,149 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import glob
+import os
+import os.path as osp
+import urllib
+import warnings
+from typing import Union
+
+import torch
+from mmengine.config import Config, ConfigDict
+from mmengine.logging import print_log
+from mmengine.utils import scandir
+
+IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
+                  '.tiff', '.webp')
+
+
+def find_latest_checkpoint(path, suffix='pth'):
+    """Find the latest checkpoint from the working directory.
+
+    Args:
+        path(str): The path to find checkpoints.
+        suffix(str): File extension.
+            Defaults to pth.
+
+    Returns:
+        latest_path(str | None): File path of the latest checkpoint.
+    References:
+        .. [1] https://github.com/microsoft/SoftTeacher
+                  /blob/main/ssod/utils/patch.py
+    """
+    if not osp.exists(path):
+        warnings.warn('The path of checkpoints does not exist.')
+        return None
+    if osp.exists(osp.join(path, f'latest.{suffix}')):
+        return osp.join(path, f'latest.{suffix}')
+
+    checkpoints = glob.glob(osp.join(path, f'*.{suffix}'))
+    if len(checkpoints) == 0:
+        warnings.warn('There are no checkpoints in the path.')
+        return None
+    latest = -1
+    latest_path = None
+    for checkpoint in checkpoints:
+        count = int(osp.basename(checkpoint).split('_')[-1].split('.')[0])
+        if count > latest:
+            latest = count
+            latest_path = checkpoint
+    return latest_path
+
+
+def update_data_root(cfg, logger=None):
+    """Update data root according to env MMDET_DATASETS.
+
+    If set env MMDET_DATASETS, update cfg.data_root according to
+    MMDET_DATASETS. Otherwise, using cfg.data_root as default.
+
+    Args:
+        cfg (:obj:`Config`): The model config need to modify
+        logger (logging.Logger | str | None): the way to print msg
+    """
+    assert isinstance(cfg, Config), \
+        f'cfg got wrong type: {type(cfg)}, expected mmengine.Config'
+
+    if 'MMDET_DATASETS' in os.environ:
+        dst_root = os.environ['MMDET_DATASETS']
+        print_log(f'MMDET_DATASETS has been set to be {dst_root}.'
+                  f'Using {dst_root} as data root.')
+    else:
+        return
+
+    assert isinstance(cfg, Config), \
+        f'cfg got wrong type: {type(cfg)}, expected mmengine.Config'
+
+    def update(cfg, src_str, dst_str):
+        for k, v in cfg.items():
+            if isinstance(v, ConfigDict):
+                update(cfg[k], src_str, dst_str)
+            if isinstance(v, str) and src_str in v:
+                cfg[k] = v.replace(src_str, dst_str)
+
+    update(cfg.data, cfg.data_root, dst_root)
+    cfg.data_root = dst_root
+
+
+def get_test_pipeline_cfg(cfg: Union[str, ConfigDict]) -> ConfigDict:
+    """Get the test dataset pipeline from entire config.
+
+    Args:
+        cfg (str or :obj:`ConfigDict`): the entire config. Can be a config
+            file or a ``ConfigDict``.
+
+    Returns:
+        :obj:`ConfigDict`: the config of test dataset.
+    """
+    if isinstance(cfg, str):
+        cfg = Config.fromfile(cfg)
+
+    def _get_test_pipeline_cfg(dataset_cfg):
+        if 'pipeline' in dataset_cfg:
+            return dataset_cfg.pipeline
+        # handle dataset wrapper
+        elif 'dataset' in dataset_cfg:
+            return _get_test_pipeline_cfg(dataset_cfg.dataset)
+        # handle dataset wrappers like ConcatDataset
+        elif 'datasets' in dataset_cfg:
+            return _get_test_pipeline_cfg(dataset_cfg.datasets[0])
+
+        raise RuntimeError('Cannot find `pipeline` in `test_dataloader`')
+
+    return _get_test_pipeline_cfg(cfg.test_dataloader.dataset)
+
+
+def get_file_list(source_root: str) -> [list, dict]:
+    """Get file list.
+
+    Args:
+        source_root (str): image or video source path
+
+    Return:
+        source_file_path_list (list): A list for all source file.
+        source_type (dict): Source type: file or url or dir.
+    """
+    is_dir = os.path.isdir(source_root)
+    is_url = source_root.startswith(('http:/', 'https:/'))
+    is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS
+
+    source_file_path_list = []
+    if is_dir:
+        # when input source is dir
+        for file in scandir(source_root, IMG_EXTENSIONS, recursive=True):
+            source_file_path_list.append(os.path.join(source_root, file))
+    elif is_url:
+        # when input source is url
+        filename = os.path.basename(
+            urllib.parse.unquote(source_root).split('?')[0])
+        file_save_path = os.path.join(os.getcwd(), filename)
+        print(f'Downloading source file to {file_save_path}')
+        torch.hub.download_url_to_file(source_root, file_save_path)
+        source_file_path_list = [file_save_path]
+    elif is_file:
+        # when input source is single image
+        source_file_path_list = [source_root]
+    else:
+        print('Cannot find image file.')
+
+    source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file)
+
+    return source_file_path_list, source_type
diff --git a/mmde/mmdet/utils/mot_error_visualize.py b/mmde/mmdet/utils/mot_error_visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..01bf8645d340aa1f5ab8251211a719f2de9845b1
--- /dev/null
+++ b/mmde/mmdet/utils/mot_error_visualize.py
@@ -0,0 +1,273 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Union
+
+try:
+    import seaborn as sns
+except ImportError:
+    sns = None
+import cv2
+import matplotlib.pyplot as plt
+import mmcv
+import numpy as np
+from matplotlib.patches import Rectangle
+from mmengine.utils import mkdir_or_exist
+
+
+def imshow_mot_errors(*args, backend: str = 'cv2', **kwargs):
+    """Show the wrong tracks on the input image.
+
+    Args:
+        backend (str, optional): Backend of visualization.
+            Defaults to 'cv2'.
+    """
+    if backend == 'cv2':
+        return _cv2_show_wrong_tracks(*args, **kwargs)
+    elif backend == 'plt':
+        return _plt_show_wrong_tracks(*args, **kwargs)
+    else:
+        raise NotImplementedError()
+
+
+def _cv2_show_wrong_tracks(img: Union[str, np.ndarray],
+                           bboxes: np.ndarray,
+                           ids: np.ndarray,
+                           error_types: np.ndarray,
+                           thickness: int = 2,
+                           font_scale: float = 0.4,
+                           text_width: int = 10,
+                           text_height: int = 15,
+                           show: bool = False,
+                           wait_time: int = 100,
+                           out_file: str = None) -> np.ndarray:
+    """Show the wrong tracks with opencv.
+
+    Args:
+        img (str or ndarray): The image to be displayed.
+        bboxes (ndarray): A ndarray of shape (k, 5).
+        ids (ndarray): A ndarray of shape (k, ).
+        error_types (ndarray): A ndarray of shape (k, ), where 0 denotes
+            false positives, 1 denotes false negative and 2 denotes ID switch.
+        thickness (int, optional): Thickness of lines.
+            Defaults to 2.
+        font_scale (float, optional): Font scale to draw id and score.
+            Defaults to 0.4.
+        text_width (int, optional): Width to draw id and score.
+            Defaults to 10.
+        text_height (int, optional): Height to draw id and score.
+            Defaults to 15.
+        show (bool, optional): Whether to show the image on the fly.
+            Defaults to False.
+        wait_time (int, optional): Value of waitKey param.
+            Defaults to 100.
+        out_file (str, optional): The filename to write the image.
+            Defaults to None.
+
+    Returns:
+        ndarray: Visualized image.
+    """
+    if sns is None:
+        raise ImportError('please run pip install seaborn')
+    assert bboxes.ndim == 2, \
+        f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.'
+    assert ids.ndim == 1, \
+        f' ids ndim should be 1, but its ndim is {ids.ndim}.'
+    assert error_types.ndim == 1, \
+        f' error_types ndim should be 1, but its ndim is {error_types.ndim}.'
+    assert bboxes.shape[0] == ids.shape[0], \
+        'bboxes.shape[0] and ids.shape[0] should have the same length.'
+    assert bboxes.shape[1] == 5, \
+        f' bboxes.shape[1] should be 5, but its {bboxes.shape[1]}.'
+
+    bbox_colors = sns.color_palette()
+    # red, yellow, blue
+    bbox_colors = [bbox_colors[3], bbox_colors[1], bbox_colors[0]]
+    bbox_colors = [[int(255 * _c) for _c in bbox_color][::-1]
+                   for bbox_color in bbox_colors]
+
+    if isinstance(img, str):
+        img = mmcv.imread(img)
+    else:
+        assert img.ndim == 3
+
+    img_shape = img.shape
+    bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+    bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+
+    for bbox, error_type, id in zip(bboxes, error_types, ids):
+        x1, y1, x2, y2 = bbox[:4].astype(np.int32)
+        score = float(bbox[-1])
+
+        # bbox
+        bbox_color = bbox_colors[error_type]
+        cv2.rectangle(img, (x1, y1), (x2, y2), bbox_color, thickness=thickness)
+
+        # FN does not have id and score
+        if error_type == 1:
+            continue
+
+        # score
+        text = '{:.02f}'.format(score)
+        width = (len(text) - 1) * text_width
+        img[y1:y1 + text_height, x1:x1 + width, :] = bbox_color
+        cv2.putText(
+            img,
+            text, (x1, y1 + text_height - 2),
+            cv2.FONT_HERSHEY_COMPLEX,
+            font_scale,
+            color=(0, 0, 0))
+
+        # id
+        text = str(id)
+        width = len(text) * text_width
+        img[y1 + text_height:y1 + text_height * 2,
+            x1:x1 + width, :] = bbox_color
+        cv2.putText(
+            img,
+            str(id), (x1, y1 + text_height * 2 - 2),
+            cv2.FONT_HERSHEY_COMPLEX,
+            font_scale,
+            color=(0, 0, 0))
+
+    if show:
+        mmcv.imshow(img, wait_time=wait_time)
+    if out_file is not None:
+        mmcv.imwrite(img, out_file)
+
+    return img
+
+
+def _plt_show_wrong_tracks(img: Union[str, np.ndarray],
+                           bboxes: np.ndarray,
+                           ids: np.ndarray,
+                           error_types: np.ndarray,
+                           thickness: float = 0.1,
+                           font_scale: float = 3.0,
+                           text_width: int = 8,
+                           text_height: int = 13,
+                           show: bool = False,
+                           wait_time: int = 100,
+                           out_file: str = None) -> np.ndarray:
+    """Show the wrong tracks with matplotlib.
+
+    Args:
+        img (str or ndarray): The image to be displayed.
+        bboxes (ndarray): A ndarray of shape (k, 5).
+        ids (ndarray): A ndarray of shape (k, ).
+        error_types (ndarray): A ndarray of shape (k, ), where 0 denotes
+            false positives, 1 denotes false negative and 2 denotes ID switch.
+        thickness (float, optional): Thickness of lines.
+            Defaults to 0.1.
+        font_scale (float, optional): Font scale to draw id and score.
+            Defaults to 3.0.
+        text_width (int, optional): Width to draw id and score.
+            Defaults to 8.
+        text_height (int, optional): Height to draw id and score.
+            Defaults to 13.
+        show (bool, optional): Whether to show the image on the fly.
+            Defaults to False.
+        wait_time (int, optional): Value of waitKey param.
+            Defaults to 100.
+        out_file (str, optional): The filename to write the image.
+            Defaults to None.
+
+    Returns:
+        ndarray: Original image.
+    """
+    assert bboxes.ndim == 2, \
+        f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.'
+    assert ids.ndim == 1, \
+        f' ids ndim should be 1, but its ndim is {ids.ndim}.'
+    assert error_types.ndim == 1, \
+        f' error_types ndim should be 1, but its ndim is {error_types.ndim}.'
+    assert bboxes.shape[0] == ids.shape[0], \
+        'bboxes.shape[0] and ids.shape[0] should have the same length.'
+    assert bboxes.shape[1] == 5, \
+        f' bboxes.shape[1] should be 5, but its {bboxes.shape[1]}.'
+
+    bbox_colors = sns.color_palette()
+    # red, yellow, blue
+    bbox_colors = [bbox_colors[3], bbox_colors[1], bbox_colors[0]]
+
+    if isinstance(img, str):
+        img = plt.imread(img)
+    else:
+        assert img.ndim == 3
+        img = mmcv.bgr2rgb(img)
+
+    img_shape = img.shape
+    bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+    bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+
+    plt.imshow(img)
+    plt.gca().set_axis_off()
+    plt.autoscale(False)
+    plt.subplots_adjust(
+        top=1, bottom=0, right=1, left=0, hspace=None, wspace=None)
+    plt.margins(0, 0)
+    plt.gca().xaxis.set_major_locator(plt.NullLocator())
+    plt.gca().yaxis.set_major_locator(plt.NullLocator())
+    plt.rcParams['figure.figsize'] = img_shape[1], img_shape[0]
+
+    for bbox, error_type, id in zip(bboxes, error_types, ids):
+        x1, y1, x2, y2, score = bbox
+        w, h = int(x2 - x1), int(y2 - y1)
+        left_top = (int(x1), int(y1))
+
+        # bbox
+        plt.gca().add_patch(
+            Rectangle(
+                left_top,
+                w,
+                h,
+                thickness,
+                edgecolor=bbox_colors[error_type],
+                facecolor='none'))
+
+        # FN does not have id and score
+        if error_type == 1:
+            continue
+
+        # score
+        text = '{:.02f}'.format(score)
+        width = len(text) * text_width
+        plt.gca().add_patch(
+            Rectangle((left_top[0], left_top[1]),
+                      width,
+                      text_height,
+                      thickness,
+                      edgecolor=bbox_colors[error_type],
+                      facecolor=bbox_colors[error_type]))
+
+        plt.text(
+            left_top[0],
+            left_top[1] + text_height + 2,
+            text,
+            fontsize=font_scale)
+
+        # id
+        text = str(id)
+        width = len(text) * text_width
+        plt.gca().add_patch(
+            Rectangle((left_top[0], left_top[1] + text_height + 1),
+                      width,
+                      text_height,
+                      thickness,
+                      edgecolor=bbox_colors[error_type],
+                      facecolor=bbox_colors[error_type]))
+        plt.text(
+            left_top[0],
+            left_top[1] + 2 * (text_height + 1),
+            text,
+            fontsize=font_scale)
+
+    if out_file is not None:
+        mkdir_or_exist(osp.abspath(osp.dirname(out_file)))
+        plt.savefig(out_file, dpi=300, bbox_inches='tight', pad_inches=0.0)
+
+    if show:
+        plt.draw()
+        plt.pause(wait_time / 1000.)
+
+    plt.clf()
+    return img
diff --git a/mmde/mmdet/utils/profiling.py b/mmde/mmdet/utils/profiling.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f53f456c72db57bfa69a8d022c92d153580209e
--- /dev/null
+++ b/mmde/mmdet/utils/profiling.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import contextlib
+import sys
+import time
+
+import torch
+
+if sys.version_info >= (3, 7):
+
+    @contextlib.contextmanager
+    def profile_time(trace_name,
+                     name,
+                     enabled=True,
+                     stream=None,
+                     end_stream=None):
+        """Print time spent by CPU and GPU.
+
+        Useful as a temporary context manager to find sweet spots of code
+        suitable for async implementation.
+        """
+        if (not enabled) or not torch.cuda.is_available():
+            yield
+            return
+        stream = stream if stream else torch.cuda.current_stream()
+        end_stream = end_stream if end_stream else stream
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        stream.record_event(start)
+        try:
+            cpu_start = time.monotonic()
+            yield
+        finally:
+            cpu_end = time.monotonic()
+            end_stream.record_event(end)
+            end.synchronize()
+            cpu_time = (cpu_end - cpu_start) * 1000
+            gpu_time = start.elapsed_time(end)
+            msg = f'{trace_name} {name} cpu_time {cpu_time:.2f} ms '
+            msg += f'gpu_time {gpu_time:.2f} ms stream {stream}'
+            print(msg, end_stream)
diff --git a/mmde/mmdet/utils/replace_cfg_vals.py b/mmde/mmdet/utils/replace_cfg_vals.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3331a36ce5a22fcc4d4a955d757f5e8b6bfc6bb
--- /dev/null
+++ b/mmde/mmdet/utils/replace_cfg_vals.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import re
+
+from mmengine.config import Config
+
+
+def replace_cfg_vals(ori_cfg):
+    """Replace the string "${key}" with the corresponding value.
+
+    Replace the "${key}" with the value of ori_cfg.key in the config. And
+    support replacing the chained ${key}. Such as, replace "${key0.key1}"
+    with the value of cfg.key0.key1. Code is modified from `vars.py
+    < https://github.com/microsoft/SoftTeacher/blob/main/ssod/utils/vars.py>`_  # noqa: E501
+
+    Args:
+        ori_cfg (mmengine.config.Config):
+            The origin config with "${key}" generated from a file.
+
+    Returns:
+        updated_cfg [mmengine.config.Config]:
+            The config with "${key}" replaced by the corresponding value.
+    """
+
+    def get_value(cfg, key):
+        for k in key.split('.'):
+            cfg = cfg[k]
+        return cfg
+
+    def replace_value(cfg):
+        if isinstance(cfg, dict):
+            return {key: replace_value(value) for key, value in cfg.items()}
+        elif isinstance(cfg, list):
+            return [replace_value(item) for item in cfg]
+        elif isinstance(cfg, tuple):
+            return tuple([replace_value(item) for item in cfg])
+        elif isinstance(cfg, str):
+            # the format of string cfg may be:
+            # 1) "${key}", which will be replaced with cfg.key directly
+            # 2) "xxx${key}xxx" or "xxx${key1}xxx${key2}xxx",
+            # which will be replaced with the string of the cfg.key
+            keys = pattern_key.findall(cfg)
+            values = [get_value(ori_cfg, key[2:-1]) for key in keys]
+            if len(keys) == 1 and keys[0] == cfg:
+                # the format of string cfg is "${key}"
+                cfg = values[0]
+            else:
+                for key, value in zip(keys, values):
+                    # the format of string cfg is
+                    # "xxx${key}xxx" or "xxx${key1}xxx${key2}xxx"
+                    assert not isinstance(value, (dict, list, tuple)), \
+                        f'for the format of string cfg is ' \
+                        f"'xxxxx${key}xxxxx' or 'xxx${key}xxx${key}xxx', " \
+                        f"the type of the value of '${key}' " \
+                        f'can not be dict, list, or tuple' \
+                        f'but you input {type(value)} in {cfg}'
+                    cfg = cfg.replace(key, str(value))
+            return cfg
+        else:
+            return cfg
+
+    # the pattern of string "${key}"
+    pattern_key = re.compile(r'\$\{[a-zA-Z\d_.]*\}')
+    # the type of ori_cfg._cfg_dict is mmengine.config.ConfigDict
+    updated_cfg = Config(
+        replace_value(ori_cfg._cfg_dict), filename=ori_cfg.filename)
+    # replace the model with model_wrapper
+    if updated_cfg.get('model_wrapper', None) is not None:
+        updated_cfg.model = updated_cfg.model_wrapper
+        updated_cfg.pop('model_wrapper')
+    return updated_cfg
diff --git a/mmde/mmdet/utils/setup_env.py b/mmde/mmdet/utils/setup_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7b37845a883752a1659fabf62c7404cff971191
--- /dev/null
+++ b/mmde/mmdet/utils/setup_env.py
@@ -0,0 +1,118 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import datetime
+import logging
+import os
+import platform
+import warnings
+
+import cv2
+import torch.multiprocessing as mp
+from mmengine import DefaultScope
+from mmengine.logging import print_log
+from mmengine.utils import digit_version
+
+
+def setup_cache_size_limit_of_dynamo():
+    """Setup cache size limit of dynamo.
+
+    Note: Due to the dynamic shape of the loss calculation and
+    post-processing parts in the object detection algorithm, these
+    functions must be compiled every time they are run.
+    Setting a large value for torch._dynamo.config.cache_size_limit
+    may result in repeated compilation, which can slow down training
+    and testing speed. Therefore, we need to set the default value of
+    cache_size_limit smaller. An empirical value is 4.
+    """
+
+    import torch
+    if digit_version(torch.__version__) >= digit_version('2.0.0'):
+        if 'DYNAMO_CACHE_SIZE_LIMIT' in os.environ:
+            import torch._dynamo
+            cache_size_limit = int(os.environ['DYNAMO_CACHE_SIZE_LIMIT'])
+            torch._dynamo.config.cache_size_limit = cache_size_limit
+            print_log(
+                f'torch._dynamo.config.cache_size_limit is force '
+                f'set to {cache_size_limit}.',
+                logger='current',
+                level=logging.WARNING)
+
+
+def setup_multi_processes(cfg):
+    """Setup multi-processing environment variables."""
+    # set multi-process start method as `fork` to speed up the training
+    if platform.system() != 'Windows':
+        mp_start_method = cfg.get('mp_start_method', 'fork')
+        current_method = mp.get_start_method(allow_none=True)
+        if current_method is not None and current_method != mp_start_method:
+            warnings.warn(
+                f'Multi-processing start method `{mp_start_method}` is '
+                f'different from the previous setting `{current_method}`.'
+                f'It will be force set to `{mp_start_method}`. You can change '
+                f'this behavior by changing `mp_start_method` in your config.')
+        mp.set_start_method(mp_start_method, force=True)
+
+    # disable opencv multithreading to avoid system being overloaded
+    opencv_num_threads = cfg.get('opencv_num_threads', 0)
+    cv2.setNumThreads(opencv_num_threads)
+
+    # setup OMP threads
+    # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py  # noqa
+    workers_per_gpu = cfg.data.get('workers_per_gpu', 1)
+    if 'train_dataloader' in cfg.data:
+        workers_per_gpu = \
+            max(cfg.data.train_dataloader.get('workers_per_gpu', 1),
+                workers_per_gpu)
+
+    if 'OMP_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
+        omp_num_threads = 1
+        warnings.warn(
+            f'Setting OMP_NUM_THREADS environment variable for each process '
+            f'to be {omp_num_threads} in default, to avoid your system being '
+            f'overloaded, please further tune the variable for optimal '
+            f'performance in your application as needed.')
+        os.environ['OMP_NUM_THREADS'] = str(omp_num_threads)
+
+    # setup MKL threads
+    if 'MKL_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
+        mkl_num_threads = 1
+        warnings.warn(
+            f'Setting MKL_NUM_THREADS environment variable for each process '
+            f'to be {mkl_num_threads} in default, to avoid your system being '
+            f'overloaded, please further tune the variable for optimal '
+            f'performance in your application as needed.')
+        os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads)
+
+
+def register_all_modules(init_default_scope: bool = True) -> None:
+    """Register all modules in mmdet into the registries.
+
+    Args:
+        init_default_scope (bool): Whether initialize the mmdet default scope.
+            When `init_default_scope=True`, the global default scope will be
+            set to `mmdet`, and all registries will build modules from mmdet's
+            registry node. To understand more about the registry, please refer
+            to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md
+            Defaults to True.
+    """  # noqa
+    import mmdet.datasets  # noqa: F401,F403
+    import mmdet.engine  # noqa: F401,F403
+    import mmdet.evaluation  # noqa: F401,F403
+    import mmdet.models  # noqa: F401,F403
+    import mmdet.visualization  # noqa: F401,F403
+
+    if init_default_scope:
+        never_created = DefaultScope.get_current_instance() is None \
+                        or not DefaultScope.check_instance_created('mmdet')
+        if never_created:
+            DefaultScope.get_instance('mmdet', scope_name='mmdet')
+            return
+        current_scope = DefaultScope.get_current_instance()
+        if current_scope.scope_name != 'mmdet':
+            warnings.warn('The current default scope '
+                          f'"{current_scope.scope_name}" is not "mmdet", '
+                          '`register_all_modules` will force the current'
+                          'default scope to be "mmdet". If this is not '
+                          'expected, please set `init_default_scope=False`.')
+            # avoid name conflict
+            new_instance_name = f'mmdet-{datetime.datetime.now()}'
+            DefaultScope.get_instance(new_instance_name, scope_name='mmdet')
diff --git a/mmde/mmdet/utils/split_batch.py b/mmde/mmdet/utils/split_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..0276fb331f23c1a7f7451faf2a8f768e616d45fd
--- /dev/null
+++ b/mmde/mmdet/utils/split_batch.py
@@ -0,0 +1,45 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def split_batch(img, img_metas, kwargs):
+    """Split data_batch by tags.
+
+    Code is modified from
+    <https://github.com/microsoft/SoftTeacher/blob/main/ssod/utils/structure_utils.py> # noqa: E501
+
+    Args:
+        img (Tensor): of shape (N, C, H, W) encoding input images.
+            Typically these should be mean centered and std scaled.
+        img_metas (list[dict]): List of image info dict where each dict
+            has: 'img_shape', 'scale_factor', 'flip', and may also contain
+            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+            For details on the values of these keys, see
+            :class:`mmdet.datasets.pipelines.Collect`.
+        kwargs (dict): Specific to concrete implementation.
+
+    Returns:
+        data_groups (dict): a dict that data_batch splited by tags,
+            such as 'sup', 'unsup_teacher', and 'unsup_student'.
+    """
+
+    # only stack img in the batch
+    def fuse_list(obj_list, obj):
+        return torch.stack(obj_list) if isinstance(obj,
+                                                   torch.Tensor) else obj_list
+
+    # select data with tag from data_batch
+    def select_group(data_batch, current_tag):
+        group_flag = [tag == current_tag for tag in data_batch['tag']]
+        return {
+            k: fuse_list([vv for vv, gf in zip(v, group_flag) if gf], v)
+            for k, v in data_batch.items()
+        }
+
+    kwargs.update({'img': img, 'img_metas': img_metas})
+    kwargs.update({'tag': [meta['tag'] for meta in img_metas]})
+    tags = list(set(kwargs['tag']))
+    data_groups = {tag: select_group(kwargs, tag) for tag in tags}
+    for tag, group in data_groups.items():
+        group.pop('tag')
+    return data_groups
diff --git a/mmde/mmdet/utils/typing_utils.py b/mmde/mmdet/utils/typing_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6caf6de53274594e139dbe7c1973c747229bf010
--- /dev/null
+++ b/mmde/mmdet/utils/typing_utils.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Collecting some commonly used type hint in mmdetection."""
+from typing import List, Optional, Sequence, Tuple, Union
+
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData, PixelData
+
+# TODO: Need to avoid circular import with assigner and sampler
+# Type hint of config data
+ConfigType = Union[ConfigDict, dict]
+OptConfigType = Optional[ConfigType]
+# Type hint of one or more config data
+MultiConfig = Union[ConfigType, List[ConfigType]]
+OptMultiConfig = Optional[MultiConfig]
+
+InstanceList = List[InstanceData]
+OptInstanceList = Optional[InstanceList]
+
+PixelList = List[PixelData]
+OptPixelList = Optional[PixelList]
+
+RangeType = Sequence[Tuple[int, int]]
diff --git a/mmde/mmdet/utils/util_mixins.py b/mmde/mmdet/utils/util_mixins.py
new file mode 100644
index 0000000000000000000000000000000000000000..b83b6617f5e4a202067e1659bf448962a2a2bc72
--- /dev/null
+++ b/mmde/mmdet/utils/util_mixins.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""This module defines the :class:`NiceRepr` mixin class, which defines a
+``__repr__`` and ``__str__`` method that only depend on a custom ``__nice__``
+method, which you must define. This means you only have to overload one
+function instead of two.  Furthermore, if the object defines a ``__len__``
+method, then the ``__nice__`` method defaults to something sensible, otherwise
+it is treated as abstract and raises ``NotImplementedError``.
+
+To use simply have your object inherit from :class:`NiceRepr`
+(multi-inheritance should be ok).
+
+This code was copied from the ubelt library: https://github.com/Erotemic/ubelt
+
+Example:
+    >>> # Objects that define __nice__ have a default __str__ and __repr__
+    >>> class Student(NiceRepr):
+    ...    def __init__(self, name):
+    ...        self.name = name
+    ...    def __nice__(self):
+    ...        return self.name
+    >>> s1 = Student('Alice')
+    >>> s2 = Student('Bob')
+    >>> print(f's1 = {s1}')
+    >>> print(f's2 = {s2}')
+    s1 = <Student(Alice)>
+    s2 = <Student(Bob)>
+
+Example:
+    >>> # Objects that define __len__ have a default __nice__
+    >>> class Group(NiceRepr):
+    ...    def __init__(self, data):
+    ...        self.data = data
+    ...    def __len__(self):
+    ...        return len(self.data)
+    >>> g = Group([1, 2, 3])
+    >>> print(f'g = {g}')
+    g = <Group(3)>
+"""
+import warnings
+
+
+class NiceRepr:
+    """Inherit from this class and define ``__nice__`` to "nicely" print your
+    objects.
+
+    Defines ``__str__`` and ``__repr__`` in terms of ``__nice__`` function
+    Classes that inherit from :class:`NiceRepr` should redefine ``__nice__``.
+    If the inheriting class has a ``__len__``, method then the default
+    ``__nice__`` method will return its length.
+
+    Example:
+        >>> class Foo(NiceRepr):
+        ...    def __nice__(self):
+        ...        return 'info'
+        >>> foo = Foo()
+        >>> assert str(foo) == '<Foo(info)>'
+        >>> assert repr(foo).startswith('<Foo(info) at ')
+
+    Example:
+        >>> class Bar(NiceRepr):
+        ...    pass
+        >>> bar = Bar()
+        >>> import pytest
+        >>> with pytest.warns(None) as record:
+        >>>     assert 'object at' in str(bar)
+        >>>     assert 'object at' in repr(bar)
+
+    Example:
+        >>> class Baz(NiceRepr):
+        ...    def __len__(self):
+        ...        return 5
+        >>> baz = Baz()
+        >>> assert str(baz) == '<Baz(5)>'
+    """
+
+    def __nice__(self):
+        """str: a "nice" summary string describing this module"""
+        if hasattr(self, '__len__'):
+            # It is a common pattern for objects to use __len__ in __nice__
+            # As a convenience we define a default __nice__ for these objects
+            return str(len(self))
+        else:
+            # In all other cases force the subclass to overload __nice__
+            raise NotImplementedError(
+                f'Define the __nice__ method for {self.__class__!r}')
+
+    def __repr__(self):
+        """str: the string of the module"""
+        try:
+            nice = self.__nice__()
+            classname = self.__class__.__name__
+            return f'<{classname}({nice}) at {hex(id(self))}>'
+        except NotImplementedError as ex:
+            warnings.warn(str(ex), category=RuntimeWarning)
+            return object.__repr__(self)
+
+    def __str__(self):
+        """str: the string of the module"""
+        try:
+            classname = self.__class__.__name__
+            nice = self.__nice__()
+            return f'<{classname}({nice})>'
+        except NotImplementedError as ex:
+            warnings.warn(str(ex), category=RuntimeWarning)
+            return object.__repr__(self)
diff --git a/mmde/mmdet/utils/util_random.py b/mmde/mmdet/utils/util_random.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc1ecb6c03b026156c9947cb6d356a822448be0f
--- /dev/null
+++ b/mmde/mmdet/utils/util_random.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Helpers for random number generators."""
+import numpy as np
+
+
+def ensure_rng(rng=None):
+    """Coerces input into a random number generator.
+
+    If the input is None, then a global random state is returned.
+
+    If the input is a numeric value, then that is used as a seed to construct a
+    random state. Otherwise the input is returned as-is.
+
+    Adapted from [1]_.
+
+    Args:
+        rng (int | numpy.random.RandomState | None):
+            if None, then defaults to the global rng. Otherwise this can be an
+            integer or a RandomState class
+    Returns:
+        (numpy.random.RandomState) : rng -
+            a numpy random number generator
+
+    References:
+        .. [1] https://gitlab.kitware.com/computer-vision/kwarray/blob/master/kwarray/util_random.py#L270  # noqa: E501
+    """
+
+    if rng is None:
+        rng = np.random.mtrand._rand
+    elif isinstance(rng, int):
+        rng = np.random.RandomState(rng)
+    else:
+        rng = rng
+    return rng
diff --git a/mmde/mmdet/version.py b/mmde/mmdet/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..47989fc0a31f8d8eaa3adff72ab83db61b25b529
--- /dev/null
+++ b/mmde/mmdet/version.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+__version__ = '3.3.0'
+short_version = __version__
+
+
+def parse_version_info(version_str):
+    """Parse a version string into a tuple.
+
+    Args:
+        version_str (str): The version string.
+    Returns:
+        tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
+            (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
+    """
+    version_info = []
+    for x in version_str.split('.'):
+        if x.isdigit():
+            version_info.append(int(x))
+        elif x.find('rc') != -1:
+            patch_version = x.split('rc')
+            version_info.append(int(patch_version[0]))
+            version_info.append(f'rc{patch_version[1]}')
+    return tuple(version_info)
+
+
+version_info = parse_version_info(__version__)
diff --git a/mmde/mmdet/visualization/__init__.py b/mmde/mmdet/visualization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7edaed9d8701b1be72ff2f7ca646b865007e2eb
--- /dev/null
+++ b/mmde/mmdet/visualization/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .local_visualizer import DetLocalVisualizer, TrackLocalVisualizer
+from .palette import get_palette, jitter_color, palette_val
+
+__all__ = [
+    'palette_val', 'get_palette', 'DetLocalVisualizer', 'jitter_color',
+    'TrackLocalVisualizer'
+]
diff --git a/mmde/mmdet/visualization/local_visualizer.py b/mmde/mmdet/visualization/local_visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc6521c56eb167c2c94a3f058594d9e832fb15ad
--- /dev/null
+++ b/mmde/mmdet/visualization/local_visualizer.py
@@ -0,0 +1,699 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple, Union
+
+import cv2
+import mmcv
+import numpy as np
+
+try:
+    import seaborn as sns
+except ImportError:
+    sns = None
+import torch
+from mmengine.dist import master_only
+from mmengine.structures import InstanceData, PixelData
+from mmengine.visualization import Visualizer
+
+from ..evaluation import INSTANCE_OFFSET
+from ..registry import VISUALIZERS
+from ..structures import DetDataSample
+from ..structures.mask import BitmapMasks, PolygonMasks, bitmap_to_polygon
+from .palette import _get_adaptive_scales, get_palette, jitter_color
+
+
+@VISUALIZERS.register_module()
+class DetLocalVisualizer(Visualizer):
+    """MMDetection Local Visualizer.
+
+    Args:
+        name (str): Name of the instance. Defaults to 'visualizer'.
+        image (np.ndarray, optional): the origin image to draw. The format
+            should be RGB. Defaults to None.
+        vis_backends (list, optional): Visual backend config list.
+            Defaults to None.
+        save_dir (str, optional): Save file dir for all storage backends.
+            If it is None, the backend storage will not save any data.
+        bbox_color (str, tuple(int), optional): Color of bbox lines.
+            The tuple of color should be in BGR order. Defaults to None.
+        text_color (str, tuple(int), optional): Color of texts.
+            The tuple of color should be in BGR order.
+            Defaults to (200, 200, 200).
+        mask_color (str, tuple(int), optional): Color of masks.
+            The tuple of color should be in BGR order.
+            Defaults to None.
+        line_width (int, float): The linewidth of lines.
+            Defaults to 3.
+        alpha (int, float): The transparency of bboxes or mask.
+            Defaults to 0.8.
+
+    Examples:
+        >>> import numpy as np
+        >>> import torch
+        >>> from mmengine.structures import InstanceData
+        >>> from mmdet.structures import DetDataSample
+        >>> from mmdet.visualization import DetLocalVisualizer
+
+        >>> det_local_visualizer = DetLocalVisualizer()
+        >>> image = np.random.randint(0, 256,
+        ...                     size=(10, 12, 3)).astype('uint8')
+        >>> gt_instances = InstanceData()
+        >>> gt_instances.bboxes = torch.Tensor([[1, 2, 2, 5]])
+        >>> gt_instances.labels = torch.randint(0, 2, (1,))
+        >>> gt_det_data_sample = DetDataSample()
+        >>> gt_det_data_sample.gt_instances = gt_instances
+        >>> det_local_visualizer.add_datasample('image', image,
+        ...                         gt_det_data_sample)
+        >>> det_local_visualizer.add_datasample(
+        ...                       'image', image, gt_det_data_sample,
+        ...                        out_file='out_file.jpg')
+        >>> det_local_visualizer.add_datasample(
+        ...                        'image', image, gt_det_data_sample,
+        ...                         show=True)
+        >>> pred_instances = InstanceData()
+        >>> pred_instances.bboxes = torch.Tensor([[2, 4, 4, 8]])
+        >>> pred_instances.labels = torch.randint(0, 2, (1,))
+        >>> pred_det_data_sample = DetDataSample()
+        >>> pred_det_data_sample.pred_instances = pred_instances
+        >>> det_local_visualizer.add_datasample('image', image,
+        ...                         gt_det_data_sample,
+        ...                         pred_det_data_sample)
+    """
+
+    def __init__(self,
+                 name: str = 'visualizer',
+                 image: Optional[np.ndarray] = None,
+                 vis_backends: Optional[Dict] = None,
+                 save_dir: Optional[str] = None,
+                 bbox_color: Optional[Union[str, Tuple[int]]] = None,
+                 text_color: Optional[Union[str,
+                                            Tuple[int]]] = (200, 200, 200),
+                 mask_color: Optional[Union[str, Tuple[int]]] = None,
+                 line_width: Union[int, float] = 3,
+                 alpha: float = 0.8) -> None:
+        super().__init__(
+            name=name,
+            image=image,
+            vis_backends=vis_backends,
+            save_dir=save_dir)
+        self.bbox_color = bbox_color
+        self.text_color = text_color
+        self.mask_color = mask_color
+        self.line_width = line_width
+        self.alpha = alpha
+        # Set default value. When calling
+        # `DetLocalVisualizer().dataset_meta=xxx`,
+        # it will override the default value.
+        self.dataset_meta = {}
+
+    def _draw_instances(self, image: np.ndarray, instances: ['InstanceData'],
+                        classes: Optional[List[str]],
+                        palette: Optional[List[tuple]]) -> np.ndarray:
+        """Draw instances of GT or prediction.
+
+        Args:
+            image (np.ndarray): The image to draw.
+            instances (:obj:`InstanceData`): Data structure for
+                instance-level annotations or predictions.
+            classes (List[str], optional): Category information.
+            palette (List[tuple], optional): Palette information
+                corresponding to the category.
+
+        Returns:
+            np.ndarray: the drawn image which channel is RGB.
+        """
+        self.set_image(image)
+
+        if 'bboxes' in instances and instances.bboxes.sum() > 0:
+            bboxes = instances.bboxes
+            labels = instances.labels
+
+            max_label = int(max(labels) if len(labels) > 0 else 0)
+            text_palette = get_palette(self.text_color, max_label + 1)
+            text_colors = [text_palette[label] for label in labels]
+
+            bbox_color = palette if self.bbox_color is None \
+                else self.bbox_color
+            bbox_palette = get_palette(bbox_color, max_label + 1)
+            colors = [bbox_palette[label] for label in labels]
+            self.draw_bboxes(
+                bboxes,
+                edge_colors=colors,
+                alpha=self.alpha,
+                line_widths=self.line_width)
+
+            positions = bboxes[:, :2] + self.line_width
+            areas = (bboxes[:, 3] - bboxes[:, 1]) * (
+                bboxes[:, 2] - bboxes[:, 0])
+            scales = _get_adaptive_scales(areas)
+
+            for i, (pos, label) in enumerate(zip(positions, labels)):
+                if 'label_names' in instances:
+                    label_text = instances.label_names[i]
+                else:
+                    label_text = classes[
+                        label] if classes is not None else f'class {label}'
+                if 'scores' in instances:
+                    score = round(float(instances.scores[i]) * 100, 1)
+                    label_text += f': {score}'
+
+                self.draw_texts(
+                    label_text,
+                    pos,
+                    colors=text_colors[i],
+                    font_sizes=int(13 * scales[i]),
+                    bboxes=[{
+                        'facecolor': 'black',
+                        'alpha': 0.8,
+                        'pad': 0.7,
+                        'edgecolor': 'none'
+                    }])
+
+        if 'masks' in instances:
+            labels = instances.labels
+            masks = instances.masks
+            if isinstance(masks, torch.Tensor):
+                masks = masks.numpy()
+            elif isinstance(masks, (PolygonMasks, BitmapMasks)):
+                masks = masks.to_ndarray()
+
+            masks = masks.astype(bool)
+
+            max_label = int(max(labels) if len(labels) > 0 else 0)
+            mask_color = palette if self.mask_color is None \
+                else self.mask_color
+            mask_palette = get_palette(mask_color, max_label + 1)
+            colors = [jitter_color(mask_palette[label]) for label in labels]
+            text_palette = get_palette(self.text_color, max_label + 1)
+            text_colors = [text_palette[label] for label in labels]
+
+            polygons = []
+            for i, mask in enumerate(masks):
+                contours, _ = bitmap_to_polygon(mask)
+                polygons.extend(contours)
+            self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha)
+            self.draw_binary_masks(masks, colors=colors, alphas=self.alpha)
+
+            if len(labels) > 0 and \
+                    ('bboxes' not in instances or
+                     instances.bboxes.sum() == 0):
+                # instances.bboxes.sum()==0 represent dummy bboxes.
+                # A typical example of SOLO does not exist bbox branch.
+                areas = []
+                positions = []
+                for mask in masks:
+                    _, _, stats, centroids = cv2.connectedComponentsWithStats(
+                        mask.astype(np.uint8), connectivity=8)
+                    if stats.shape[0] > 1:
+                        largest_id = np.argmax(stats[1:, -1]) + 1
+                        positions.append(centroids[largest_id])
+                        areas.append(stats[largest_id, -1])
+                areas = np.stack(areas, axis=0)
+                scales = _get_adaptive_scales(areas)
+
+                for i, (pos, label) in enumerate(zip(positions, labels)):
+                    if 'label_names' in instances:
+                        label_text = instances.label_names[i]
+                    else:
+                        label_text = classes[
+                            label] if classes is not None else f'class {label}'
+                    if 'scores' in instances:
+                        score = round(float(instances.scores[i]) * 100, 1)
+                        label_text += f': {score}'
+
+                    self.draw_texts(
+                        label_text,
+                        pos,
+                        colors=text_colors[i],
+                        font_sizes=int(13 * scales[i]),
+                        horizontal_alignments='center',
+                        bboxes=[{
+                            'facecolor': 'black',
+                            'alpha': 0.8,
+                            'pad': 0.7,
+                            'edgecolor': 'none'
+                        }])
+        return self.get_image()
+
+    def _draw_panoptic_seg(self, image: np.ndarray,
+                           panoptic_seg: ['PixelData'],
+                           classes: Optional[List[str]],
+                           palette: Optional[List]) -> np.ndarray:
+        """Draw panoptic seg of GT or prediction.
+
+        Args:
+            image (np.ndarray): The image to draw.
+            panoptic_seg (:obj:`PixelData`): Data structure for
+                pixel-level annotations or predictions.
+            classes (List[str], optional): Category information.
+
+        Returns:
+            np.ndarray: the drawn image which channel is RGB.
+        """
+        # TODO: Is there a way to bypass？
+        num_classes = len(classes)
+
+        panoptic_seg_data = panoptic_seg.sem_seg[0]
+
+        ids = np.unique(panoptic_seg_data)[::-1]
+
+        if 'label_names' in panoptic_seg:
+            # open set panoptic segmentation
+            classes = panoptic_seg.metainfo['label_names']
+            ignore_index = panoptic_seg.metainfo.get('ignore_index',
+                                                     len(classes))
+            ids = ids[ids != ignore_index]
+        else:
+            # for VOID label
+            ids = ids[ids != num_classes]
+
+        labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64)
+        segms = (panoptic_seg_data[None] == ids[:, None, None])
+
+        max_label = int(max(labels) if len(labels) > 0 else 0)
+
+        mask_color = palette if self.mask_color is None \
+            else self.mask_color
+        mask_palette = get_palette(mask_color, max_label + 1)
+        colors = [mask_palette[label] for label in labels]
+
+        self.set_image(image)
+
+        # draw segm
+        polygons = []
+        for i, mask in enumerate(segms):
+            contours, _ = bitmap_to_polygon(mask)
+            polygons.extend(contours)
+        self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha)
+        self.draw_binary_masks(segms, colors=colors, alphas=self.alpha)
+
+        # draw label
+        areas = []
+        positions = []
+        for mask in segms:
+            _, _, stats, centroids = cv2.connectedComponentsWithStats(
+                mask.astype(np.uint8), connectivity=8)
+            max_id = np.argmax(stats[1:, -1]) + 1
+            positions.append(centroids[max_id])
+            areas.append(stats[max_id, -1])
+        areas = np.stack(areas, axis=0)
+        scales = _get_adaptive_scales(areas)
+
+        text_palette = get_palette(self.text_color, max_label + 1)
+        text_colors = [text_palette[label] for label in labels]
+
+        for i, (pos, label) in enumerate(zip(positions, labels)):
+            label_text = classes[label]
+
+            self.draw_texts(
+                label_text,
+                pos,
+                colors=text_colors[i],
+                font_sizes=int(13 * scales[i]),
+                bboxes=[{
+                    'facecolor': 'black',
+                    'alpha': 0.8,
+                    'pad': 0.7,
+                    'edgecolor': 'none'
+                }],
+                horizontal_alignments='center')
+        return self.get_image()
+
+    def _draw_sem_seg(self, image: np.ndarray, sem_seg: PixelData,
+                      classes: Optional[List],
+                      palette: Optional[List]) -> np.ndarray:
+        """Draw semantic seg of GT or prediction.
+
+        Args:
+            image (np.ndarray): The image to draw.
+            sem_seg (:obj:`PixelData`): Data structure for pixel-level
+                annotations or predictions.
+            classes (list, optional): Input classes for result rendering, as
+                the prediction of segmentation model is a segment map with
+                label indices, `classes` is a list which includes items
+                responding to the label indices. If classes is not defined,
+                visualizer will take `cityscapes` classes by default.
+                Defaults to None.
+            palette (list, optional): Input palette for result rendering, which
+                is a list of color palette responding to the classes.
+                Defaults to None.
+
+        Returns:
+            np.ndarray: the drawn image which channel is RGB.
+        """
+        sem_seg_data = sem_seg.sem_seg
+        if isinstance(sem_seg_data, torch.Tensor):
+            sem_seg_data = sem_seg_data.numpy()
+
+        # 0 ~ num_class, the value 0 means background
+        ids = np.unique(sem_seg_data)
+        ignore_index = sem_seg.metainfo.get('ignore_index', 255)
+        ids = ids[ids != ignore_index]
+
+        if 'label_names' in sem_seg:
+            # open set semseg
+            label_names = sem_seg.metainfo['label_names']
+        else:
+            label_names = classes
+
+        labels = np.array(ids, dtype=np.int64)
+        colors = [palette[label] for label in labels]
+
+        self.set_image(image)
+
+        # draw semantic masks
+        for i, (label, color) in enumerate(zip(labels, colors)):
+            masks = sem_seg_data == label
+            self.draw_binary_masks(masks, colors=[color], alphas=self.alpha)
+            label_text = label_names[label]
+            _, _, stats, centroids = cv2.connectedComponentsWithStats(
+                masks[0].astype(np.uint8), connectivity=8)
+            if stats.shape[0] > 1:
+                largest_id = np.argmax(stats[1:, -1]) + 1
+                centroids = centroids[largest_id]
+
+                areas = stats[largest_id, -1]
+                scales = _get_adaptive_scales(areas)
+
+                self.draw_texts(
+                    label_text,
+                    centroids,
+                    colors=(255, 255, 255),
+                    font_sizes=int(13 * scales),
+                    horizontal_alignments='center',
+                    bboxes=[{
+                        'facecolor': 'black',
+                        'alpha': 0.8,
+                        'pad': 0.7,
+                        'edgecolor': 'none'
+                    }])
+
+        return self.get_image()
+
+    @master_only
+    def add_datasample(
+            self,
+            name: str,
+            image: np.ndarray,
+            data_sample: Optional['DetDataSample'] = None,
+            draw_gt: bool = True,
+            draw_pred: bool = True,
+            show: bool = False,
+            wait_time: float = 0,
+            # TODO: Supported in mmengine's Viusalizer.
+            out_file: Optional[str] = None,
+            pred_score_thr: float = 0.3,
+            step: int = 0) -> None:
+        """Draw datasample and save to all backends.
+
+        - If GT and prediction are plotted at the same time, they are
+        displayed in a stitched image where the left image is the
+        ground truth and the right image is the prediction.
+        - If ``show`` is True, all storage backends are ignored, and
+        the images will be displayed in a local window.
+        - If ``out_file`` is specified, the drawn image will be
+        saved to ``out_file``. t is usually used when the display
+        is not available.
+
+        Args:
+            name (str): The image identifier.
+            image (np.ndarray): The image to draw.
+            data_sample (:obj:`DetDataSample`, optional): A data
+                sample that contain annotations and predictions.
+                Defaults to None.
+            draw_gt (bool): Whether to draw GT DetDataSample. Default to True.
+            draw_pred (bool): Whether to draw Prediction DetDataSample.
+                Defaults to True.
+            show (bool): Whether to display the drawn image. Default to False.
+            wait_time (float): The interval of show (s). Defaults to 0.
+            out_file (str): Path to output file. Defaults to None.
+            pred_score_thr (float): The threshold to visualize the bboxes
+                and masks. Defaults to 0.3.
+            step (int): Global step value to record. Defaults to 0.
+        """
+        image = image.clip(0, 255).astype(np.uint8)
+        classes = self.dataset_meta.get('classes', None)
+        palette = self.dataset_meta.get('palette', None)
+
+        gt_img_data = None
+        pred_img_data = None
+
+        if data_sample is not None:
+            data_sample = data_sample.cpu()
+
+        if draw_gt and data_sample is not None:
+            gt_img_data = image
+            if 'gt_instances' in data_sample:
+                gt_img_data = self._draw_instances(image,
+                                                   data_sample.gt_instances,
+                                                   classes, palette)
+            if 'gt_sem_seg' in data_sample:
+                gt_img_data = self._draw_sem_seg(gt_img_data,
+                                                 data_sample.gt_sem_seg,
+                                                 classes, palette)
+
+            if 'gt_panoptic_seg' in data_sample:
+                assert classes is not None, 'class information is ' \
+                                            'not provided when ' \
+                                            'visualizing panoptic ' \
+                                            'segmentation results.'
+                gt_img_data = self._draw_panoptic_seg(
+                    gt_img_data, data_sample.gt_panoptic_seg, classes, palette)
+
+        if draw_pred and data_sample is not None:
+            pred_img_data = image
+            if 'pred_instances' in data_sample:
+                pred_instances = data_sample.pred_instances
+                pred_instances = pred_instances[
+                    pred_instances.scores > pred_score_thr]
+                pred_img_data = self._draw_instances(image, pred_instances,
+                                                     classes, palette)
+
+            if 'pred_sem_seg' in data_sample:
+                pred_img_data = self._draw_sem_seg(pred_img_data,
+                                                   data_sample.pred_sem_seg,
+                                                   classes, palette)
+
+            if 'pred_panoptic_seg' in data_sample:
+                assert classes is not None, 'class information is ' \
+                                            'not provided when ' \
+                                            'visualizing panoptic ' \
+                                            'segmentation results.'
+                pred_img_data = self._draw_panoptic_seg(
+                    pred_img_data, data_sample.pred_panoptic_seg.numpy(),
+                    classes, palette)
+
+        if gt_img_data is not None and pred_img_data is not None:
+            drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1)
+        elif gt_img_data is not None:
+            drawn_img = gt_img_data
+        elif pred_img_data is not None:
+            drawn_img = pred_img_data
+        else:
+            # Display the original image directly if nothing is drawn.
+            drawn_img = image
+
+        # It is convenient for users to obtain the drawn image.
+        # For example, the user wants to obtain the drawn image and
+        # save it as a video during video inference.
+        self.set_image(drawn_img)
+
+        if show:
+            self.show(drawn_img, win_name=name, wait_time=wait_time)
+
+        if out_file is not None:
+            mmcv.imwrite(drawn_img[..., ::-1], out_file)
+        else:
+            self.add_image(name, drawn_img, step)
+
+
+def random_color(seed):
+    """Random a color according to the input seed."""
+    if sns is None:
+        raise RuntimeError('motmetrics is not installed,\
+                 please install it by: pip install seaborn')
+    np.random.seed(seed)
+    colors = sns.color_palette()
+    color = colors[np.random.choice(range(len(colors)))]
+    color = tuple([int(255 * c) for c in color])
+    return color
+
+
+@VISUALIZERS.register_module()
+class TrackLocalVisualizer(Visualizer):
+    """Tracking Local Visualizer for the MOT, VIS tasks.
+
+    Args:
+        name (str): Name of the instance. Defaults to 'visualizer'.
+        image (np.ndarray, optional): the origin image to draw. The format
+            should be RGB. Defaults to None.
+        vis_backends (list, optional): Visual backend config list.
+            Defaults to None.
+        save_dir (str, optional): Save file dir for all storage backends.
+            If it is None, the backend storage will not save any data.
+        line_width (int, float): The linewidth of lines.
+            Defaults to 3.
+        alpha (int, float): The transparency of bboxes or mask.
+                Defaults to 0.8.
+    """
+
+    def __init__(self,
+                 name: str = 'visualizer',
+                 image: Optional[np.ndarray] = None,
+                 vis_backends: Optional[Dict] = None,
+                 save_dir: Optional[str] = None,
+                 line_width: Union[int, float] = 3,
+                 alpha: float = 0.8) -> None:
+        super().__init__(name, image, vis_backends, save_dir)
+        self.line_width = line_width
+        self.alpha = alpha
+        # Set default value. When calling
+        # `TrackLocalVisualizer().dataset_meta=xxx`,
+        # it will override the default value.
+        self.dataset_meta = {}
+
+    def _draw_instances(self, image: np.ndarray,
+                        instances: InstanceData) -> np.ndarray:
+        """Draw instances of GT or prediction.
+
+        Args:
+            image (np.ndarray): The image to draw.
+            instances (:obj:`InstanceData`): Data structure for
+                instance-level annotations or predictions.
+        Returns:
+            np.ndarray: the drawn image which channel is RGB.
+        """
+        self.set_image(image)
+        classes = self.dataset_meta.get('classes', None)
+
+        # get colors and texts
+        # for the MOT and VIS tasks
+        colors = [random_color(_id) for _id in instances.instances_id]
+        categories = [
+            classes[label] if classes is not None else f'cls{label}'
+            for label in instances.labels
+        ]
+        if 'scores' in instances:
+            texts = [
+                f'{category_name}\n{instance_id} | {score:.2f}'
+                for category_name, instance_id, score in zip(
+                    categories, instances.instances_id, instances.scores)
+            ]
+        else:
+            texts = [
+                f'{category_name}\n{instance_id}' for category_name,
+                instance_id in zip(categories, instances.instances_id)
+            ]
+
+        # draw bboxes and texts
+        if 'bboxes' in instances:
+            # draw bboxes
+            bboxes = instances.bboxes.clone()
+            self.draw_bboxes(
+                bboxes,
+                edge_colors=colors,
+                alpha=self.alpha,
+                line_widths=self.line_width)
+            # draw texts
+            if texts is not None:
+                positions = bboxes[:, :2] + self.line_width
+                areas = (bboxes[:, 3] - bboxes[:, 1]) * (
+                    bboxes[:, 2] - bboxes[:, 0])
+                scales = _get_adaptive_scales(areas.cpu().numpy())
+                for i, pos in enumerate(positions):
+                    self.draw_texts(
+                        texts[i],
+                        pos,
+                        colors='black',
+                        font_sizes=int(13 * scales[i]),
+                        bboxes=[{
+                            'facecolor': [c / 255 for c in colors[i]],
+                            'alpha': 0.8,
+                            'pad': 0.7,
+                            'edgecolor': 'none'
+                        }])
+
+        # draw masks
+        if 'masks' in instances:
+            masks = instances.masks
+            polygons = []
+            for i, mask in enumerate(masks):
+                contours, _ = bitmap_to_polygon(mask)
+                polygons.extend(contours)
+            self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha)
+            self.draw_binary_masks(masks, colors=colors, alphas=self.alpha)
+
+        return self.get_image()
+
+    @master_only
+    def add_datasample(
+            self,
+            name: str,
+            image: np.ndarray,
+            data_sample: DetDataSample = None,
+            draw_gt: bool = True,
+            draw_pred: bool = True,
+            show: bool = False,
+            wait_time: int = 0,
+            # TODO: Supported in mmengine's Viusalizer.
+            out_file: Optional[str] = None,
+            pred_score_thr: float = 0.3,
+            step: int = 0) -> None:
+        """Draw datasample and save to all backends.
+
+        - If GT and prediction are plotted at the same time, they are
+        displayed in a stitched image where the left image is the
+        ground truth and the right image is the prediction.
+        - If ``show`` is True, all storage backends are ignored, and
+        the images will be displayed in a local window.
+        - If ``out_file`` is specified, the drawn image will be
+        saved to ``out_file``. t is usually used when the display
+        is not available.
+        Args:
+            name (str): The image identifier.
+            image (np.ndarray): The image to draw.
+            data_sample (OptTrackSampleList): A data
+                sample that contain annotations and predictions.
+                Defaults to None.
+            draw_gt (bool): Whether to draw GT TrackDataSample.
+                Default to True.
+            draw_pred (bool): Whether to draw Prediction TrackDataSample.
+                Defaults to True.
+            show (bool): Whether to display the drawn image. Default to False.
+            wait_time (int): The interval of show (s). Defaults to 0.
+            out_file (str): Path to output file. Defaults to None.
+            pred_score_thr (float): The threshold to visualize the bboxes
+                and masks. Defaults to 0.3.
+            step (int): Global step value to record. Defaults to 0.
+        """
+        gt_img_data = None
+        pred_img_data = None
+
+        if data_sample is not None:
+            data_sample = data_sample.cpu()
+
+        if draw_gt and data_sample is not None:
+            assert 'gt_instances' in data_sample
+            gt_img_data = self._draw_instances(image, data_sample.gt_instances)
+
+        if draw_pred and data_sample is not None:
+            assert 'pred_track_instances' in data_sample
+            pred_instances = data_sample.pred_track_instances
+            if 'scores' in pred_instances:
+                pred_instances = pred_instances[
+                    pred_instances.scores > pred_score_thr].cpu()
+            pred_img_data = self._draw_instances(image, pred_instances)
+
+        if gt_img_data is not None and pred_img_data is not None:
+            drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1)
+        elif gt_img_data is not None:
+            drawn_img = gt_img_data
+        else:
+            drawn_img = pred_img_data
+
+        if show:
+            self.show(drawn_img, win_name=name, wait_time=wait_time)
+
+        if out_file is not None:
+            mmcv.imwrite(drawn_img[..., ::-1], out_file)
+        else:
+            self.add_image(name, drawn_img, step)
diff --git a/mmde/mmdet/visualization/palette.py b/mmde/mmdet/visualization/palette.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c402c08823a60759c984093ba7f05f1e310dbd9
--- /dev/null
+++ b/mmde/mmdet/visualization/palette.py
@@ -0,0 +1,108 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import mmcv
+import numpy as np
+from mmengine.utils import is_str
+
+
+def palette_val(palette: List[tuple]) -> List[tuple]:
+    """Convert palette to matplotlib palette.
+
+    Args:
+        palette (List[tuple]): A list of color tuples.
+
+    Returns:
+        List[tuple[float]]: A list of RGB matplotlib color tuples.
+    """
+    new_palette = []
+    for color in palette:
+        color = [c / 255 for c in color]
+        new_palette.append(tuple(color))
+    return new_palette
+
+
+def get_palette(palette: Union[List[tuple], str, tuple],
+                num_classes: int) -> List[Tuple[int]]:
+    """Get palette from various inputs.
+
+    Args:
+        palette (list[tuple] | str | tuple): palette inputs.
+        num_classes (int): the number of classes.
+
+    Returns:
+        list[tuple[int]]: A list of color tuples.
+    """
+    assert isinstance(num_classes, int)
+
+    if isinstance(palette, list):
+        dataset_palette = palette
+    elif isinstance(palette, tuple):
+        dataset_palette = [palette] * num_classes
+    elif palette == 'random' or palette is None:
+        state = np.random.get_state()
+        # random color
+        np.random.seed(42)
+        palette = np.random.randint(0, 256, size=(num_classes, 3))
+        np.random.set_state(state)
+        dataset_palette = [tuple(c) for c in palette]
+    elif palette == 'coco':
+        from mmdet.datasets import CocoDataset, CocoPanopticDataset
+        dataset_palette = CocoDataset.METAINFO['palette']
+        if len(dataset_palette) < num_classes:
+            dataset_palette = CocoPanopticDataset.METAINFO['palette']
+    elif palette == 'citys':
+        from mmdet.datasets import CityscapesDataset
+        dataset_palette = CityscapesDataset.METAINFO['palette']
+    elif palette == 'voc':
+        from mmdet.datasets import VOCDataset
+        dataset_palette = VOCDataset.METAINFO['palette']
+    elif is_str(palette):
+        dataset_palette = [mmcv.color_val(palette)[::-1]] * num_classes
+    else:
+        raise TypeError(f'Invalid type for palette: {type(palette)}')
+
+    assert len(dataset_palette) >= num_classes, \
+        'The length of palette should not be less than `num_classes`.'
+    return dataset_palette
+
+
+def _get_adaptive_scales(areas: np.ndarray,
+                         min_area: int = 800,
+                         max_area: int = 30000) -> np.ndarray:
+    """Get adaptive scales according to areas.
+
+    The scale range is [0.5, 1.0]. When the area is less than
+    ``min_area``, the scale is 0.5 while the area is larger than
+    ``max_area``, the scale is 1.0.
+
+    Args:
+        areas (ndarray): The areas of bboxes or masks with the
+            shape of (n, ).
+        min_area (int): Lower bound areas for adaptive scales.
+            Defaults to 800.
+        max_area (int): Upper bound areas for adaptive scales.
+            Defaults to 30000.
+
+    Returns:
+        ndarray: The adaotive scales with the shape of (n, ).
+    """
+    scales = 0.5 + (areas - min_area) // (max_area - min_area)
+    scales = np.clip(scales, 0.5, 1.0)
+    return scales
+
+
+def jitter_color(color: tuple) -> tuple:
+    """Randomly jitter the given color in order to better distinguish instances
+    with the same class.
+
+    Args:
+        color (tuple): The RGB color tuple. Each value is between [0, 255].
+
+    Returns:
+        tuple: The jittered color tuple.
+    """
+    jitter = np.random.rand(3)
+    jitter = (jitter / np.linalg.norm(jitter) - 0.5) * 0.5 * 255
+    color = np.clip(jitter + color, 0, 255).astype(np.uint8)
+    return tuple(color)
diff --git a/mmde/mmdet3d/__init__.py b/mmde/mmdet3d/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f34db2492545e1db9311dbfc9928867659ba42bc
--- /dev/null
+++ b/mmde/mmdet3d/__init__.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import mmdet
+import mmengine
+from mmengine.utils import digit_version
+
+from .version import __version__, version_info
+
+mmcv_minimum_version = '2.0.0rc4'
+mmcv_maximum_version = '2.2.0'
+mmcv_version = digit_version(mmcv.__version__)
+
+mmengine_minimum_version = '0.8.0'
+mmengine_maximum_version = '1.0.0'
+mmengine_version = digit_version(mmengine.__version__)
+
+mmdet_minimum_version = '3.0.0rc5'
+mmdet_maximum_version = '3.4.0'
+mmdet_version = digit_version(mmdet.__version__)
+
+# assert (mmcv_version >= digit_version(mmcv_minimum_version)
+#         and mmcv_version < digit_version(mmcv_maximum_version)), \
+#     f'MMCV=={mmcv.__version__} is used but incompatible. ' \
+#     f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.'
+
+# assert (mmengine_version >= digit_version(mmengine_minimum_version)
+#         and mmengine_version < digit_version(mmengine_maximum_version)), \
+#     f'MMEngine=={mmengine.__version__} is used but incompatible. ' \
+#     f'Please install mmengine>={mmengine_minimum_version}, ' \
+#     f'<{mmengine_maximum_version}.'
+
+# assert (mmdet_version >= digit_version(mmdet_minimum_version)
+#         and mmdet_version < digit_version(mmdet_maximum_version)), \
+#     f'MMDET=={mmdet.__version__} is used but incompatible. ' \
+#     f'Please install mmdet>={mmdet_minimum_version}, ' \
+#     f'<{mmdet_maximum_version}.'
+
+__all__ = ['__version__', 'version_info', 'digit_version']
diff --git a/mmde/mmdet3d/apis/__init__.py b/mmde/mmdet3d/apis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f732f16646f0d80ebff3c29879d891fdabceec
--- /dev/null
+++ b/mmde/mmdet3d/apis/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .inference import (convert_SyncBN, inference_detector,
+                        inference_mono_3d_detector,
+                        inference_multi_modality_detector, inference_segmentor,
+                        init_model)
+from .inferencers import (Base3DInferencer, LidarDet3DInferencer,
+                          LidarSeg3DInferencer, MonoDet3DInferencer,
+                          MultiModalityDet3DInferencer)
+
+__all__ = [
+    'inference_detector', 'init_model', 'inference_mono_3d_detector',
+    'convert_SyncBN', 'inference_multi_modality_detector',
+    'inference_segmentor', 'Base3DInferencer', 'MonoDet3DInferencer',
+    'LidarDet3DInferencer', 'LidarSeg3DInferencer',
+    'MultiModalityDet3DInferencer'
+]
diff --git a/mmde/mmdet3d/apis/inference.py b/mmde/mmdet3d/apis/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..e355b958eefe691e1d15ea5945e834e7e6294a52
--- /dev/null
+++ b/mmde/mmdet3d/apis/inference.py
@@ -0,0 +1,416 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from copy import deepcopy
+from os import path as osp
+from pathlib import Path
+from typing import Optional, Sequence, Union
+
+import mmengine
+import numpy as np
+import torch
+import torch.nn as nn
+from mmengine.config import Config
+from mmengine.dataset import Compose, pseudo_collate
+from mmengine.registry import init_default_scope
+from mmengine.runner import load_checkpoint
+
+from mmdet3d.registry import DATASETS, MODELS
+from mmdet3d.structures import Box3DMode, Det3DDataSample, get_box_type
+from mmdet3d.structures.det3d_data_sample import SampleList
+
+
+def convert_SyncBN(config):
+    """Convert config's naiveSyncBN to BN.
+
+    Args:
+         config (str or :obj:`mmengine.Config`): Config file path or the config
+            object.
+    """
+    if isinstance(config, dict):
+        for item in config:
+            if item == 'norm_cfg':
+                config[item]['type'] = config[item]['type']. \
+                                    replace('naiveSyncBN', 'BN')
+            else:
+                convert_SyncBN(config[item])
+
+
+def init_model(config: Union[str, Path, Config],
+               checkpoint: Optional[str] = None,
+               device: str = 'cuda:0',
+               palette: str = 'none',
+               cfg_options: Optional[dict] = None):
+    """Initialize a model from config file, which could be a 3D detector or a
+    3D segmentor.
+
+    Args:
+        config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path,
+            :obj:`Path`, or the config object.
+        checkpoint (str, optional): Checkpoint path. If left as None, the model
+            will not load any weights.
+        device (str): Device to use.
+        cfg_options (dict, optional): Options to override some settings in
+            the used config.
+
+    Returns:
+        nn.Module: The constructed detector.
+    """
+    if isinstance(config, (str, Path)):
+        config = Config.fromfile(config)
+    elif not isinstance(config, Config):
+        raise TypeError('config must be a filename or Config object, '
+                        f'but got {type(config)}')
+    if cfg_options is not None:
+        config.merge_from_dict(cfg_options)
+
+    convert_SyncBN(config.model)
+    config.model.train_cfg = None
+    init_default_scope(config.get('default_scope', 'mmdet3d'))
+    model = MODELS.build(config.model)
+
+    if checkpoint is not None:
+        checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
+        # save the dataset_meta in the model for convenience
+        if 'dataset_meta' in checkpoint.get('meta', {}):
+            # mmdet3d 1.x
+            model.dataset_meta = checkpoint['meta']['dataset_meta']
+        elif 'CLASSES' in checkpoint.get('meta', {}):
+            # < mmdet3d 1.x
+            classes = checkpoint['meta']['CLASSES']
+            model.dataset_meta = {'classes': classes}
+
+            if 'PALETTE' in checkpoint.get('meta', {}):  # 3D Segmentor
+                model.dataset_meta['palette'] = checkpoint['meta']['PALETTE']
+        else:
+            # < mmdet3d 1.x
+            model.dataset_meta = {'classes': config.class_names}
+
+            if 'PALETTE' in checkpoint.get('meta', {}):  # 3D Segmentor
+                model.dataset_meta['palette'] = checkpoint['meta']['PALETTE']
+
+        test_dataset_cfg = deepcopy(config.test_dataloader.dataset)
+        # lazy init. We only need the metainfo.
+        test_dataset_cfg['lazy_init'] = True
+        metainfo = DATASETS.build(test_dataset_cfg).metainfo
+        cfg_palette = metainfo.get('palette', None)
+        if cfg_palette is not None:
+            model.dataset_meta['palette'] = cfg_palette
+        else:
+            if 'palette' not in model.dataset_meta:
+                warnings.warn(
+                    'palette does not exist, random is used by default. '
+                    'You can also set the palette to customize.')
+                model.dataset_meta['palette'] = 'random'
+
+    model.cfg = config  # save the config in the model for convenience
+    if device != 'cpu':
+        torch.cuda.set_device(device)
+    else:
+        warnings.warn('Don\'t suggest using CPU device. '
+                      'Some functions are not supported for now.')
+
+    model.to(device)
+    model.eval()
+    return model
+
+
+PointsType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]]
+ImagesType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]]
+
+
+def inference_detector(model: nn.Module,
+                       pcds: PointsType) -> Union[Det3DDataSample, SampleList]:
+    """Inference point cloud with the detector.
+
+    Args:
+        model (nn.Module): The loaded detector.
+        pcds (str, ndarray, Sequence[str/ndarray]):
+            Either point cloud files or loaded point cloud.
+
+    Returns:
+        :obj:`Det3DDataSample` or list[:obj:`Det3DDataSample`]:
+        If pcds is a list or tuple, the same length list type results
+        will be returned, otherwise return the detection results directly.
+    """
+    if isinstance(pcds, (list, tuple)):
+        is_batch = True
+    else:
+        pcds = [pcds]
+        is_batch = False
+
+    cfg = model.cfg
+
+    if not isinstance(pcds[0], str):
+        cfg = cfg.copy()
+        # set loading pipeline type
+        cfg.test_dataloader.dataset.pipeline[0].type = 'LoadPointsFromDict'
+
+    # build the data pipeline
+    test_pipeline = deepcopy(cfg.test_dataloader.dataset.pipeline)
+    test_pipeline = Compose(test_pipeline)
+    box_type_3d, box_mode_3d = \
+        get_box_type(cfg.test_dataloader.dataset.box_type_3d)
+
+    data = []
+    for pcd in pcds:
+        # prepare data
+        if isinstance(pcd, str):
+            # load from point cloud file
+            data_ = dict(
+                lidar_points=dict(lidar_path=pcd),
+                timestamp=1,
+                # for ScanNet demo we need axis_align_matrix
+                axis_align_matrix=np.eye(4),
+                box_type_3d=box_type_3d,
+                box_mode_3d=box_mode_3d)
+        else:
+            # directly use loaded point cloud
+            data_ = dict(
+                points=pcd,
+                timestamp=1,
+                # for ScanNet demo we need axis_align_matrix
+                axis_align_matrix=np.eye(4),
+                box_type_3d=box_type_3d,
+                box_mode_3d=box_mode_3d)
+        data_ = test_pipeline(data_)
+        data.append(data_)
+
+    collate_data = pseudo_collate(data)
+
+    # forward the model
+    with torch.no_grad():
+        results = model.test_step(collate_data)
+
+    if not is_batch:
+        return results[0], data[0]
+    else:
+        return results, data
+
+
+def inference_multi_modality_detector(model: nn.Module,
+                                      pcds: Union[str, Sequence[str]],
+                                      imgs: Union[str, Sequence[str]],
+                                      ann_file: Union[str, Sequence[str]],
+                                      cam_type: str = 'CAM2'):
+    """Inference point cloud with the multi-modality detector. Now we only
+    support multi-modality detector for KITTI and SUNRGBD datasets since the
+    multi-view image loading is not supported yet in this inference function.
+
+    Args:
+        model (nn.Module): The loaded detector.
+        pcds (str, Sequence[str]):
+            Either point cloud files or loaded point cloud.
+        imgs (str, Sequence[str]):
+           Either image files or loaded images.
+        ann_file (str, Sequence[str]): Annotation files.
+        cam_type (str): Image of Camera chose to infer. When detector only uses
+            single-view image, we need to specify a camera view. For kitti
+            dataset, it should be 'CAM2'. For sunrgbd, it should be 'CAM0'.
+            When detector uses multi-view images, we should set it to 'all'.
+
+    Returns:
+        :obj:`Det3DDataSample` or list[:obj:`Det3DDataSample`]:
+        If pcds is a list or tuple, the same length list type results
+        will be returned, otherwise return the detection results directly.
+    """
+    if isinstance(pcds, (list, tuple)):
+        is_batch = True
+        assert isinstance(imgs, (list, tuple))
+        assert len(pcds) == len(imgs)
+    else:
+        pcds = [pcds]
+        imgs = [imgs]
+        is_batch = False
+
+    cfg = model.cfg
+
+    # build the data pipeline
+    test_pipeline = deepcopy(cfg.test_dataloader.dataset.pipeline)
+    test_pipeline = Compose(test_pipeline)
+    box_type_3d, box_mode_3d = \
+        get_box_type(cfg.test_dataloader.dataset.box_type_3d)
+
+    data_list = mmengine.load(ann_file)['data_list']
+
+    data = []
+    for index, pcd in enumerate(pcds):
+        # get data info containing calib
+        data_info = data_list[index]
+        img = imgs[index]
+
+        if cam_type != 'all':
+            assert osp.isfile(img), f'{img} must be a file.'
+            img_path = data_info['images'][cam_type]['img_path']
+            if osp.basename(img_path) != osp.basename(img):
+                raise ValueError(
+                    f'the info file of {img_path} is not provided.')
+            data_ = dict(
+                lidar_points=dict(lidar_path=pcd),
+                img_path=img,
+                box_type_3d=box_type_3d,
+                box_mode_3d=box_mode_3d)
+            data_info['images'][cam_type]['img_path'] = img
+            if 'cam2img' in data_info['images'][cam_type]:
+                # The data annotation in SRUNRGBD dataset does not contain
+                # `cam2img`
+                data_['cam2img'] = np.array(
+                    data_info['images'][cam_type]['cam2img'])
+
+            # LiDAR to image conversion for KITTI dataset
+            if box_mode_3d == Box3DMode.LIDAR:
+                if 'lidar2img' in data_info['images'][cam_type]:
+                    data_['lidar2img'] = np.array(
+                        data_info['images'][cam_type]['lidar2img'])
+            # Depth to image conversion for SUNRGBD dataset
+            elif box_mode_3d == Box3DMode.DEPTH:
+                data_['depth2img'] = np.array(
+                    data_info['images'][cam_type]['depth2img'])
+        else:
+            assert osp.isdir(img), f'{img} must be a file directory'
+            for _, img_info in data_info['images'].items():
+                img_info['img_path'] = osp.join(img, img_info['img_path'])
+                assert osp.isfile(img_info['img_path']
+                                  ), f'{img_info["img_path"]} does not exist.'
+            data_ = dict(
+                lidar_points=dict(lidar_path=pcd),
+                images=data_info['images'],
+                box_type_3d=box_type_3d,
+                box_mode_3d=box_mode_3d)
+
+        if 'timestamp' in data_info:
+            # Using multi-sweeps need `timestamp`
+            data_['timestamp'] = data_info['timestamp']
+
+        data_ = test_pipeline(data_)
+        data.append(data_)
+
+    collate_data = pseudo_collate(data)
+
+    # forward the model
+    with torch.no_grad():
+        results = model.test_step(collate_data)
+
+    if not is_batch:
+        return results[0], data[0]
+    else:
+        return results, data
+
+
+def inference_mono_3d_detector(model: nn.Module,
+                               imgs: ImagesType,
+                               ann_file: Union[str, Sequence[str]],
+                               cam_type: str = 'CAM_FRONT'):
+    """Inference image with the monocular 3D detector.
+
+    Args:
+        model (nn.Module): The loaded detector.
+        imgs (str, Sequence[str]):
+           Either image files or loaded images.
+        ann_files (str, Sequence[str]): Annotation files.
+        cam_type (str): Image of Camera chose to infer.
+            For kitti dataset, it should be 'CAM_2',
+            and for nuscenes dataset, it should be
+            'CAM_FRONT'. Defaults to 'CAM_FRONT'.
+
+    Returns:
+        :obj:`Det3DDataSample` or list[:obj:`Det3DDataSample`]:
+        If pcds is a list or tuple, the same length list type results
+        will be returned, otherwise return the detection results directly.
+    """
+    if isinstance(imgs, (list, tuple)):
+        is_batch = True
+    else:
+        imgs = [imgs]
+        is_batch = False
+
+    cfg = model.cfg
+
+    # build the data pipeline
+    test_pipeline = deepcopy(cfg.test_dataloader.dataset.pipeline)
+    test_pipeline = Compose(test_pipeline)
+    box_type_3d, box_mode_3d = \
+        get_box_type(cfg.test_dataloader.dataset.box_type_3d)
+
+    data_list = mmengine.load(ann_file)['data_list']
+    assert len(imgs) == len(data_list)
+
+    data = []
+    for index, img in enumerate(imgs):
+        # get data info containing calib
+        data_info = data_list[index]
+        img_path = data_info['images'][cam_type]['img_path']
+        if osp.basename(img_path) != osp.basename(img):
+            raise ValueError(f'the info file of {img_path} is not provided.')
+
+        # replace the img_path in data_info with img
+        data_info['images'][cam_type]['img_path'] = img
+        # avoid data_info['images'] has multiple keys anout camera views.
+        mono_img_info = {f'{cam_type}': data_info['images'][cam_type]}
+        data_ = dict(
+            images=mono_img_info,
+            box_type_3d=box_type_3d,
+            box_mode_3d=box_mode_3d)
+
+        data_ = test_pipeline(data_)
+        data.append(data_)
+
+    collate_data = pseudo_collate(data)
+
+    # forward the model
+    with torch.no_grad():
+        results = model.test_step(collate_data)
+
+    if not is_batch:
+        return results[0]
+    else:
+        return results
+
+
+def inference_segmentor(model: nn.Module, pcds: PointsType):
+    """Inference point cloud with the segmentor.
+
+    Args:
+        model (nn.Module): The loaded segmentor.
+        pcds (str, Sequence[str]):
+            Either point cloud files or loaded point cloud.
+
+    Returns:
+        :obj:`Det3DDataSample` or list[:obj:`Det3DDataSample`]:
+        If pcds is a list or tuple, the same length list type results
+        will be returned, otherwise return the detection results directly.
+    """
+    if isinstance(pcds, (list, tuple)):
+        is_batch = True
+    else:
+        pcds = [pcds]
+        is_batch = False
+
+    cfg = model.cfg
+
+    # build the data pipeline
+    test_pipeline = deepcopy(cfg.test_dataloader.dataset.pipeline)
+
+    new_test_pipeline = []
+    for pipeline in test_pipeline:
+        if pipeline['type'] != 'LoadAnnotations3D' and pipeline[
+                'type'] != 'PointSegClassMapping':
+            new_test_pipeline.append(pipeline)
+    test_pipeline = Compose(new_test_pipeline)
+
+    data = []
+    # TODO: support load points array
+    for pcd in pcds:
+        data_ = dict(lidar_points=dict(lidar_path=pcd))
+        data_ = test_pipeline(data_)
+        data.append(data_)
+
+    collate_data = pseudo_collate(data)
+
+    # forward the model
+    with torch.no_grad():
+        results = model.test_step(collate_data)
+
+    if not is_batch:
+        return results[0], data[0]
+    else:
+        return results, data
diff --git a/mmde/mmdet3d/apis/inferencers/__init__.py b/mmde/mmdet3d/apis/inferencers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0da7b52a3e5d83e68b4bd032cd69ae1c97275bd2
--- /dev/null
+++ b/mmde/mmdet3d/apis/inferencers/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_3d_inferencer import Base3DInferencer
+from .lidar_det3d_inferencer import LidarDet3DInferencer
+from .lidar_seg3d_inferencer import LidarSeg3DInferencer
+from .mono_det3d_inferencer import MonoDet3DInferencer
+from .multi_modality_det3d_inferencer import MultiModalityDet3DInferencer
+
+__all__ = [
+    'Base3DInferencer', 'MonoDet3DInferencer', 'LidarDet3DInferencer',
+    'LidarSeg3DInferencer', 'MultiModalityDet3DInferencer'
+]
diff --git a/mmde/mmdet3d/apis/inferencers/base_3d_inferencer.py b/mmde/mmdet3d/apis/inferencers/base_3d_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6564325e8cc1ebcd6a79e61cf3704ce62be89e49
--- /dev/null
+++ b/mmde/mmdet3d/apis/inferencers/base_3d_inferencer.py
@@ -0,0 +1,346 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os.path as osp
+from copy import deepcopy
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch.nn as nn
+from mmengine import dump, print_log
+from mmengine.infer.infer import BaseInferencer, ModelType
+from mmengine.model.utils import revert_sync_batchnorm
+from mmengine.registry import init_default_scope
+from mmengine.runner import load_checkpoint
+from mmengine.structures import InstanceData
+from mmengine.visualization import Visualizer
+from rich.progress import track
+
+from mmdet3d.registry import DATASETS, MODELS
+from mmdet3d.structures import Box3DMode, Det3DDataSample
+from mmdet3d.utils import ConfigType
+
+InstanceList = List[InstanceData]
+InputType = Union[str, np.ndarray]
+InputsType = Union[InputType, Sequence[InputType]]
+PredType = Union[InstanceData, InstanceList]
+ImgType = Union[np.ndarray, Sequence[np.ndarray]]
+ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
+
+
+class Base3DInferencer(BaseInferencer):
+    """Base 3D model inferencer.
+
+    Args:
+        model (str, optional): Path to the config file or the model name
+            defined in metafile. For example, it could be
+            "pgd-kitti" or
+            "configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py".
+            If model is not specified, user must provide the
+            `weights` saved by MMEngine which contains the config string.
+            Defaults to None.
+        weights (str, optional): Path to the checkpoint. If it is not specified
+            and model is a model name of metafile, the weights will be loaded
+            from metafile. Defaults to None.
+        device (str, optional): Device to run inference. If None, the available
+            device will be automatically used. Defaults to None.
+        scope (str): The scope of the model. Defaults to 'mmdet3d'.
+        palette (str): Color palette used for visualization. The order of
+            priority is palette -> config -> checkpoint. Defaults to 'none'.
+    """
+
+    preprocess_kwargs: set = {'cam_type'}
+    forward_kwargs: set = set()
+    visualize_kwargs: set = {
+        'return_vis', 'show', 'wait_time', 'draw_pred', 'pred_score_thr',
+        'img_out_dir', 'no_save_vis', 'cam_type_dir'
+    }
+    postprocess_kwargs: set = {
+        'print_result', 'pred_out_dir', 'return_datasample', 'no_save_pred'
+    }
+
+    def __init__(self,
+                 model: Union[ModelType, str, None] = None,
+                 weights: Optional[str] = None,
+                 device: Optional[str] = None,
+                 scope: str = 'mmdet3d',
+                 palette: str = 'none') -> None:
+        # A global counter tracking the number of frames processed, for
+        # naming of the output results
+        self.num_predicted_frames = 0
+        self.palette = palette
+        init_default_scope(scope)
+        super().__init__(
+            model=model, weights=weights, device=device, scope=scope)
+        self.model = revert_sync_batchnorm(self.model)
+
+    def _convert_syncbn(self, cfg: ConfigType):
+        """Convert config's naiveSyncBN to BN.
+
+        Args:
+            config (str or :obj:`mmengine.Config`): Config file path
+                or the config object.
+        """
+        if isinstance(cfg, dict):
+            for item in cfg:
+                if item == 'norm_cfg':
+                    cfg[item]['type'] = cfg[item]['type']. \
+                                        replace('naiveSyncBN', 'BN')
+                else:
+                    self._convert_syncbn(cfg[item])
+
+    def _init_model(
+        self,
+        cfg: ConfigType,
+        weights: str,
+        device: str = 'cpu',
+    ) -> nn.Module:
+        self._convert_syncbn(cfg.model)
+        cfg.model.train_cfg = None
+        model = MODELS.build(cfg.model)
+
+        checkpoint = load_checkpoint(model, weights, map_location='cpu')
+        if 'dataset_meta' in checkpoint.get('meta', {}):
+            # mmdet3d 1.x
+            model.dataset_meta = checkpoint['meta']['dataset_meta']
+        elif 'CLASSES' in checkpoint.get('meta', {}):
+            # < mmdet3d 1.x
+            classes = checkpoint['meta']['CLASSES']
+            model.dataset_meta = {'classes': classes}
+
+            if 'PALETTE' in checkpoint.get('meta', {}):  # 3D Segmentor
+                model.dataset_meta['palette'] = checkpoint['meta']['PALETTE']
+        else:
+            # < mmdet3d 1.x
+            model.dataset_meta = {'classes': cfg.class_names}
+
+            if 'PALETTE' in checkpoint.get('meta', {}):  # 3D Segmentor
+                model.dataset_meta['palette'] = checkpoint['meta']['PALETTE']
+
+        test_dataset_cfg = deepcopy(cfg.test_dataloader.dataset)
+        # lazy init. We only need the metainfo.
+        test_dataset_cfg['lazy_init'] = True
+        metainfo = DATASETS.build(test_dataset_cfg).metainfo
+        cfg_palette = metainfo.get('palette', None)
+        if cfg_palette is not None:
+            model.dataset_meta['palette'] = cfg_palette
+
+        model.cfg = cfg  # save the config in the model for convenience
+        model.to(device)
+        model.eval()
+        return model
+
+    def _get_transform_idx(self, pipeline_cfg: ConfigType, name: str) -> int:
+        """Returns the index of the transform in a pipeline.
+
+        If the transform is not found, returns -1.
+        """
+        for i, transform in enumerate(pipeline_cfg):
+            if transform['type'] == name:
+                return i
+        return -1
+
+    def _init_visualizer(self, cfg: ConfigType) -> Optional[Visualizer]:
+        visualizer = super()._init_visualizer(cfg)
+        visualizer.dataset_meta = self.model.dataset_meta
+        return visualizer
+
+    def _dispatch_kwargs(self,
+                         out_dir: str = '',
+                         cam_type: str = '',
+                         **kwargs) -> Tuple[Dict, Dict, Dict, Dict]:
+        """Dispatch kwargs to preprocess(), forward(), visualize() and
+        postprocess() according to the actual demands.
+
+        Args:
+            out_dir (str): Dir to save the inference results.
+            cam_type (str): Camera type. Defaults to ''.
+            **kwargs (dict): Key words arguments passed to :meth:`preprocess`,
+                :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
+                Each key in kwargs should be in the corresponding set of
+                ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs``
+                and ``postprocess_kwargs``.
+
+        Returns:
+            Tuple[Dict, Dict, Dict, Dict]: kwargs passed to preprocess,
+            forward, visualize and postprocess respectively.
+        """
+        kwargs['img_out_dir'] = out_dir
+        kwargs['pred_out_dir'] = out_dir
+        if cam_type != '':
+            kwargs['cam_type_dir'] = cam_type
+        return super()._dispatch_kwargs(**kwargs)
+
+    def __call__(self,
+                 inputs: InputsType,
+                 batch_size: int = 1,
+                 return_datasamples: bool = False,
+                 **kwargs) -> Optional[dict]:
+        """Call the inferencer.
+
+        Args:
+            inputs (InputsType): Inputs for the inferencer.
+            batch_size (int): Batch size. Defaults to 1.
+            return_datasamples (bool): Whether to return results as
+                :obj:`BaseDataElement`. Defaults to False.
+            **kwargs: Key words arguments passed to :meth:`preprocess`,
+                :meth:`forward`, :meth:`visualize` and :meth:`postprocess`.
+                Each key in kwargs should be in the corresponding set of
+                ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs``
+                and ``postprocess_kwargs``.
+
+
+        Returns:
+            dict: Inference and visualization results.
+        """
+
+        (
+            preprocess_kwargs,
+            forward_kwargs,
+            visualize_kwargs,
+            postprocess_kwargs,
+        ) = self._dispatch_kwargs(**kwargs)
+
+        cam_type = preprocess_kwargs.pop('cam_type', 'CAM2')
+        ori_inputs = self._inputs_to_list(inputs, cam_type=cam_type)
+        inputs = self.preprocess(
+            ori_inputs, batch_size=batch_size, **preprocess_kwargs)
+        preds = []
+
+        results_dict = {'predictions': [], 'visualization': []}
+        for data in (track(inputs, description='Inference')
+                     if self.show_progress else inputs):
+            preds.extend(self.forward(data, **forward_kwargs))
+            visualization = self.visualize(ori_inputs, preds,
+                                           **visualize_kwargs)
+            results = self.postprocess(preds, visualization,
+                                       return_datasamples,
+                                       **postprocess_kwargs)
+            results_dict['predictions'].extend(results['predictions'])
+            if results['visualization'] is not None:
+                results_dict['visualization'].extend(results['visualization'])
+        return results_dict
+
+    def postprocess(
+        self,
+        preds: PredType,
+        visualization: Optional[List[np.ndarray]] = None,
+        return_datasample: bool = False,
+        print_result: bool = False,
+        no_save_pred: bool = False,
+        pred_out_dir: str = '',
+    ) -> Union[ResType, Tuple[ResType, np.ndarray]]:
+        """Process the predictions and visualization results from ``forward``
+        and ``visualize``.
+
+        This method should be responsible for the following tasks:
+
+        1. Convert datasamples into a json-serializable dict if needed.
+        2. Pack the predictions and visualization results and return them.
+        3. Dump or log the predictions.
+
+        Args:
+            preds (List[Dict]): Predictions of the model.
+            visualization (np.ndarray, optional): Visualized predictions.
+                Defaults to None.
+            return_datasample (bool): Whether to use Datasample to store
+                inference results. If False, dict will be used.
+                Defaults to False.
+            print_result (bool): Whether to print the inference result w/o
+                visualization to the console. Defaults to False.
+            pred_out_dir (str): Directory to save the inference results w/o
+                visualization. If left as empty, no file will be saved.
+                Defaults to ''.
+
+        Returns:
+            dict: Inference and visualization results with key ``predictions``
+            and ``visualization``.
+
+            - ``visualization`` (Any): Returned by :meth:`visualize`.
+            - ``predictions`` (dict or DataSample): Returned by
+              :meth:`forward` and processed in :meth:`postprocess`.
+              If ``return_datasample=False``, it usually should be a
+              json-serializable dict containing only basic data elements such
+              as strings and numbers.
+        """
+        if no_save_pred is True:
+            pred_out_dir = ''
+
+        result_dict = {}
+        results = preds
+        if not return_datasample:
+            results = []
+            for pred in preds:
+                result = self.pred2dict(pred, pred_out_dir)
+                results.append(result)
+        elif pred_out_dir != '':
+            print_log(
+                'Currently does not support saving datasample '
+                'when return_datasample is set to True. '
+                'Prediction results are not saved!',
+                level=logging.WARNING)
+        # Add img to the results after printing and dumping
+        result_dict['predictions'] = results
+        if print_result:
+            print(result_dict)
+        result_dict['visualization'] = visualization
+        return result_dict
+
+    # TODO: The data format and fields saved in json need further discussion.
+    #  Maybe should include model name, timestamp, filename, image info etc.
+    def pred2dict(self,
+                  data_sample: Det3DDataSample,
+                  pred_out_dir: str = '') -> Dict:
+        """Extract elements necessary to represent a prediction into a
+        dictionary.
+
+        It's better to contain only basic data elements such as strings and
+        numbers in order to guarantee it's json-serializable.
+
+        Args:
+            data_sample (:obj:`DetDataSample`): Predictions of the model.
+            pred_out_dir: Dir to save the inference results w/o
+                visualization. If left as empty, no file will be saved.
+                Defaults to ''.
+
+        Returns:
+            dict: Prediction results.
+        """
+        result = {}
+        if 'pred_instances_3d' in data_sample:
+            pred_instances_3d = data_sample.pred_instances_3d.numpy()
+            result = {
+                'labels_3d': pred_instances_3d.labels_3d.tolist(),
+                'scores_3d': pred_instances_3d.scores_3d.tolist(),
+                'bboxes_3d': pred_instances_3d.bboxes_3d.tensor.cpu().tolist()
+            }
+
+        if 'pred_pts_seg' in data_sample:
+            pred_pts_seg = data_sample.pred_pts_seg.numpy()
+            result['pts_semantic_mask'] = \
+                pred_pts_seg.pts_semantic_mask.tolist()
+
+        if data_sample.box_mode_3d == Box3DMode.LIDAR:
+            result['box_type_3d'] = 'LiDAR'
+        elif data_sample.box_mode_3d == Box3DMode.CAM:
+            result['box_type_3d'] = 'Camera'
+        elif data_sample.box_mode_3d == Box3DMode.DEPTH:
+            result['box_type_3d'] = 'Depth'
+
+        if pred_out_dir != '':
+            if 'lidar_path' in data_sample:
+                lidar_path = osp.basename(data_sample.lidar_path)
+                lidar_path = osp.splitext(lidar_path)[0]
+                out_json_path = osp.join(pred_out_dir, 'preds',
+                                         lidar_path + '.json')
+            elif 'img_path' in data_sample:
+                img_path = osp.basename(data_sample.img_path)
+                img_path = osp.splitext(img_path)[0]
+                out_json_path = osp.join(pred_out_dir, 'preds',
+                                         img_path + '.json')
+            else:
+                out_json_path = osp.join(
+                    pred_out_dir, 'preds',
+                    f'{str(self.num_visualized_imgs).zfill(8)}.json')
+            dump(result, out_json_path)
+
+        return result
diff --git a/mmde/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py b/mmde/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc513e85d8c561b177f3a6e5fd413562d7fdb460
--- /dev/null
+++ b/mmde/mmdet3d/apis/inferencers/lidar_det3d_inferencer.py
@@ -0,0 +1,242 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Dict, List, Optional, Sequence, Union
+
+import mmengine
+import numpy as np
+import torch
+from mmengine.dataset import Compose
+from mmengine.fileio import (get_file_backend, isdir, join_path,
+                             list_dir_or_file)
+from mmengine.infer.infer import ModelType
+from mmengine.structures import InstanceData
+
+from mmdet3d.registry import INFERENCERS
+from mmdet3d.structures import (CameraInstance3DBoxes, DepthInstance3DBoxes,
+                                Det3DDataSample, LiDARInstance3DBoxes)
+from mmdet3d.utils import ConfigType
+from .base_3d_inferencer import Base3DInferencer
+
+InstanceList = List[InstanceData]
+InputType = Union[str, np.ndarray]
+InputsType = Union[InputType, Sequence[InputType]]
+PredType = Union[InstanceData, InstanceList]
+ImgType = Union[np.ndarray, Sequence[np.ndarray]]
+ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
+
+
+@INFERENCERS.register_module(name='det3d-lidar')
+@INFERENCERS.register_module()
+class LidarDet3DInferencer(Base3DInferencer):
+    """The inferencer of LiDAR-based detection.
+
+    Args:
+        model (str, optional): Path to the config file or the model name
+            defined in metafile. For example, it could be
+            "pointpillars_kitti-3class" or
+            "configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py". # noqa: E501
+            If model is not specified, user must provide the
+            `weights` saved by MMEngine which contains the config string.
+            Defaults to None.
+        weights (str, optional): Path to the checkpoint. If it is not specified
+            and model is a model name of metafile, the weights will be loaded
+            from metafile. Defaults to None.
+        device (str, optional): Device to run inference. If None, the available
+            device will be automatically used. Defaults to None.
+        scope (str): The scope of the model. Defaults to 'mmdet3d'.
+        palette (str): Color palette used for visualization. The order of
+            priority is palette -> config -> checkpoint. Defaults to 'none'.
+    """
+
+    def __init__(self,
+                 model: Union[ModelType, str, None] = None,
+                 weights: Optional[str] = None,
+                 device: Optional[str] = None,
+                 scope: str = 'mmdet3d',
+                 palette: str = 'none') -> None:
+        # A global counter tracking the number of frames processed, for
+        # naming of the output results
+        self.num_visualized_frames = 0
+        super(LidarDet3DInferencer, self).__init__(
+            model=model,
+            weights=weights,
+            device=device,
+            scope=scope,
+            palette=palette)
+
+    def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list:
+        """Preprocess the inputs to a list.
+
+        Preprocess inputs to a list according to its type:
+
+        - list or tuple: return inputs
+        - dict: the value with key 'points' is
+            - Directory path: return all files in the directory
+            - other cases: return a list containing the string. The string
+              could be a path to file, a url or other types of string according
+              to the task.
+
+        Args:
+            inputs (Union[dict, list]): Inputs for the inferencer.
+
+        Returns:
+            list: List of input for the :meth:`preprocess`.
+        """
+        if isinstance(inputs, dict) and isinstance(inputs['points'], str):
+            pcd = inputs['points']
+            backend = get_file_backend(pcd)
+            if hasattr(backend, 'isdir') and isdir(pcd):
+                # Backends like HttpsBackend do not implement `isdir`, so
+                # only those backends that implement `isdir` could accept
+                # the inputs as a directory
+                filename_list = list_dir_or_file(pcd, list_dir=False)
+                inputs = [{
+                    'points': join_path(pcd, filename)
+                } for filename in filename_list]
+
+        if not isinstance(inputs, (list, tuple)):
+            inputs = [inputs]
+
+        return list(inputs)
+
+    def _init_pipeline(self, cfg: ConfigType) -> Compose:
+        """Initialize the test pipeline."""
+        pipeline_cfg = cfg.test_dataloader.dataset.pipeline
+
+        load_point_idx = self._get_transform_idx(pipeline_cfg,
+                                                 'LoadPointsFromFile')
+        if load_point_idx == -1:
+            raise ValueError(
+                'LoadPointsFromFile is not found in the test pipeline')
+
+        load_cfg = pipeline_cfg[load_point_idx]
+        self.coord_type, self.load_dim = load_cfg['coord_type'], load_cfg[
+            'load_dim']
+        self.use_dim = list(range(load_cfg['use_dim'])) if isinstance(
+            load_cfg['use_dim'], int) else load_cfg['use_dim']
+
+        pipeline_cfg[load_point_idx]['type'] = 'LidarDet3DInferencerLoader'
+        return Compose(pipeline_cfg)
+
+    def visualize(self,
+                  inputs: InputsType,
+                  preds: PredType,
+                  return_vis: bool = False,
+                  show: bool = False,
+                  wait_time: int = -1,
+                  draw_pred: bool = True,
+                  pred_score_thr: float = 0.3,
+                  no_save_vis: bool = False,
+                  img_out_dir: str = '') -> Union[List[np.ndarray], None]:
+        """Visualize predictions.
+
+        Args:
+            inputs (InputsType): Inputs for the inferencer.
+            preds (PredType): Predictions of the model.
+            return_vis (bool): Whether to return the visualization result.
+                Defaults to False.
+            show (bool): Whether to display the image in a popup window.
+                Defaults to False.
+            wait_time (float): The interval of show (s). Defaults to -1.
+            draw_pred (bool): Whether to draw predicted bounding boxes.
+                Defaults to True.
+            pred_score_thr (float): Minimum score of bboxes to draw.
+                Defaults to 0.3.
+            no_save_vis (bool): Whether to force not to save prediction
+                vis results. Defaults to False.
+            img_out_dir (str): Output directory of visualization results.
+                If left as empty, no file will be saved. Defaults to ''.
+
+        Returns:
+            List[np.ndarray] or None: Returns visualization results only if
+            applicable.
+        """
+        if no_save_vis is True:
+            img_out_dir = ''
+
+        if not show and img_out_dir == '' and not return_vis:
+            return None
+
+        if getattr(self, 'visualizer') is None:
+            raise ValueError('Visualization needs the "visualizer" term'
+                             'defined in the config, but got None.')
+
+        results = []
+
+        for single_input, pred in zip(inputs, preds):
+            single_input = single_input['points']
+            if isinstance(single_input, str):
+                pts_bytes = mmengine.fileio.get(single_input)
+                points = np.frombuffer(pts_bytes, dtype=np.float32)
+                points = points.reshape(-1, self.load_dim)
+                points = points[:, self.use_dim]
+                pc_name = osp.basename(single_input).split('.bin')[0]
+                pc_name = f'{pc_name}.png'
+            elif isinstance(single_input, np.ndarray):
+                points = single_input.copy()
+                pc_num = str(self.num_visualized_frames).zfill(8)
+                pc_name = f'{pc_num}.png'
+            else:
+                raise ValueError('Unsupported input type: '
+                                 f'{type(single_input)}')
+
+            if img_out_dir != '' and show:
+                o3d_save_path = osp.join(img_out_dir, 'vis_lidar', pc_name)
+                mmengine.mkdir_or_exist(osp.dirname(o3d_save_path))
+            else:
+                o3d_save_path = None
+
+            data_input = dict(points=points)
+            self.visualizer.add_datasample(
+                pc_name,
+                data_input,
+                pred,
+                show=show,
+                wait_time=wait_time,
+                draw_gt=False,
+                draw_pred=draw_pred,
+                pred_score_thr=pred_score_thr,
+                o3d_save_path=o3d_save_path,
+                vis_task='lidar_det',
+            )
+            results.append(points)
+            self.num_visualized_frames += 1
+
+        return results
+
+    def visualize_preds_fromfile(self, inputs: InputsType, preds: PredType,
+                                 **kwargs) -> Union[List[np.ndarray], None]:
+        """Visualize predictions from `*.json` files.
+
+        Args:
+            inputs (InputsType): Inputs for the inferencer.
+            preds (PredType): Predictions of the model.
+
+        Returns:
+            List[np.ndarray] or None: Returns visualization results only if
+            applicable.
+        """
+        data_samples = []
+        for pred in preds:
+            pred = mmengine.load(pred)
+            data_sample = Det3DDataSample()
+            data_sample.pred_instances_3d = InstanceData()
+
+            data_sample.pred_instances_3d.labels_3d = torch.tensor(
+                pred['labels_3d'])
+            data_sample.pred_instances_3d.scores_3d = torch.tensor(
+                pred['scores_3d'])
+            if pred['box_type_3d'] == 'LiDAR':
+                data_sample.pred_instances_3d.bboxes_3d = \
+                    LiDARInstance3DBoxes(pred['bboxes_3d'])
+            elif pred['box_type_3d'] == 'Camera':
+                data_sample.pred_instances_3d.bboxes_3d = \
+                    CameraInstance3DBoxes(pred['bboxes_3d'])
+            elif pred['box_type_3d'] == 'Depth':
+                data_sample.pred_instances_3d.bboxes_3d = \
+                    DepthInstance3DBoxes(pred['bboxes_3d'])
+            else:
+                raise ValueError('Unsupported box type: '
+                                 f'{pred["box_type_3d"]}')
+            data_samples.append(data_sample)
+        return self.visualize(inputs=inputs, preds=data_samples, **kwargs)
diff --git a/mmde/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py b/mmde/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..33ab95c1041d80949bbe3789366a463758e04a56
--- /dev/null
+++ b/mmde/mmdet3d/apis/inferencers/lidar_seg3d_inferencer.py
@@ -0,0 +1,209 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Dict, List, Optional, Sequence, Union
+
+import mmengine
+import numpy as np
+from mmengine.dataset import Compose
+from mmengine.fileio import (get_file_backend, isdir, join_path,
+                             list_dir_or_file)
+from mmengine.infer.infer import ModelType
+from mmengine.structures import InstanceData
+
+from mmdet3d.registry import INFERENCERS
+from mmdet3d.utils import ConfigType
+from .base_3d_inferencer import Base3DInferencer
+
+InstanceList = List[InstanceData]
+InputType = Union[str, np.ndarray]
+InputsType = Union[InputType, Sequence[InputType]]
+PredType = Union[InstanceData, InstanceList]
+ImgType = Union[np.ndarray, Sequence[np.ndarray]]
+ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
+
+
+@INFERENCERS.register_module(name='seg3d-lidar')
+@INFERENCERS.register_module()
+class LidarSeg3DInferencer(Base3DInferencer):
+    """The inferencer of LiDAR-based segmentation.
+
+    Args:
+        model (str, optional): Path to the config file or the model name
+            defined in metafile. For example, it could be
+            "pointnet2-ssg_s3dis-seg" or
+            "configs/pointnet2/pointnet2_ssg_2xb16-cosine-50e_s3dis-seg.py".
+            If model is not specified, user must provide the
+            `weights` saved by MMEngine which contains the config string.
+            Defaults to None.
+        weights (str, optional): Path to the checkpoint. If it is not specified
+            and model is a model name of metafile, the weights will be loaded
+            from metafile. Defaults to None.
+        device (str, optional): Device to run inference. If None, the available
+            device will be automatically used. Defaults to None.
+        scope (str): The scope of the model. Defaults to 'mmdet3d'.
+        palette (str): Color palette used for visualization. The order of
+            priority is palette -> config -> checkpoint. Defaults to 'none'.
+    """
+
+    def __init__(self,
+                 model: Union[ModelType, str, None] = None,
+                 weights: Optional[str] = None,
+                 device: Optional[str] = None,
+                 scope: str = 'mmdet3d',
+                 palette: str = 'none') -> None:
+        # A global counter tracking the number of frames processed, for
+        # naming of the output results
+        self.num_visualized_frames = 0
+        super(LidarSeg3DInferencer, self).__init__(
+            model=model,
+            weights=weights,
+            device=device,
+            scope=scope,
+            palette=palette)
+
+    def _inputs_to_list(self, inputs: Union[dict, list], **kwargs) -> list:
+        """Preprocess the inputs to a list.
+
+        Preprocess inputs to a list according to its type:
+
+        - list or tuple: return inputs
+        - dict: the value with key 'points' is
+            - Directory path: return all files in the directory
+            - other cases: return a list containing the string. The string
+              could be a path to file, a url or other types of string according
+              to the task.
+
+        Args:
+            inputs (Union[dict, list]): Inputs for the inferencer.
+
+        Returns:
+            list: List of input for the :meth:`preprocess`.
+        """
+        if isinstance(inputs, dict) and isinstance(inputs['points'], str):
+            pcd = inputs['points']
+            backend = get_file_backend(pcd)
+            if hasattr(backend, 'isdir') and isdir(pcd):
+                # Backends like HttpsBackend do not implement `isdir`, so
+                # only those backends that implement `isdir` could accept
+                # the inputs as a directory
+                filename_list = list_dir_or_file(pcd, list_dir=False)
+                inputs = [{
+                    'points': join_path(pcd, filename)
+                } for filename in filename_list]
+
+        if not isinstance(inputs, (list, tuple)):
+            inputs = [inputs]
+
+        return list(inputs)
+
+    def _init_pipeline(self, cfg: ConfigType) -> Compose:
+        """Initialize the test pipeline."""
+        pipeline_cfg = cfg.test_dataloader.dataset.pipeline
+        # Load annotation is also not applicable
+        idx = self._get_transform_idx(pipeline_cfg, 'LoadAnnotations3D')
+        if idx != -1:
+            del pipeline_cfg[idx]
+
+        idx = self._get_transform_idx(pipeline_cfg, 'PointSegClassMapping')
+        if idx != -1:
+            del pipeline_cfg[idx]
+
+        load_point_idx = self._get_transform_idx(pipeline_cfg,
+                                                 'LoadPointsFromFile')
+        if load_point_idx == -1:
+            raise ValueError(
+                'LoadPointsFromFile is not found in the test pipeline')
+
+        load_cfg = pipeline_cfg[load_point_idx]
+        self.coord_type, self.load_dim = load_cfg['coord_type'], load_cfg[
+            'load_dim']
+        self.use_dim = list(range(load_cfg['use_dim'])) if isinstance(
+            load_cfg['use_dim'], int) else load_cfg['use_dim']
+
+        pipeline_cfg[load_point_idx]['type'] = 'LidarDet3DInferencerLoader'
+        return Compose(pipeline_cfg)
+
+    def visualize(self,
+                  inputs: InputsType,
+                  preds: PredType,
+                  return_vis: bool = False,
+                  show: bool = False,
+                  wait_time: int = 0,
+                  draw_pred: bool = True,
+                  pred_score_thr: float = 0.3,
+                  no_save_vis: bool = False,
+                  img_out_dir: str = '') -> Union[List[np.ndarray], None]:
+        """Visualize predictions.
+
+        Args:
+            inputs (InputsType): Inputs for the inferencer.
+            preds (PredType): Predictions of the model.
+            return_vis (bool): Whether to return the visualization result.
+                Defaults to False.
+            show (bool): Whether to display the image in a popup window.
+                Defaults to False.
+            wait_time (float): The interval of show (s). Defaults to 0.
+            draw_pred (bool): Whether to draw predicted bounding boxes.
+                Defaults to True.
+            pred_score_thr (float): Minimum score of bboxes to draw.
+                Defaults to 0.3.
+            no_save_vis (bool): Whether to save visualization results.
+            img_out_dir (str): Output directory of visualization results.
+                If left as empty, no file will be saved. Defaults to ''.
+
+        Returns:
+            List[np.ndarray] or None: Returns visualization results only if
+            applicable.
+        """
+        if no_save_vis is True:
+            img_out_dir = ''
+
+        if not show and img_out_dir == '' and not return_vis:
+            return None
+
+        if getattr(self, 'visualizer') is None:
+            raise ValueError('Visualization needs the "visualizer" term'
+                             'defined in the config, but got None.')
+
+        results = []
+
+        for single_input, pred in zip(inputs, preds):
+            single_input = single_input['points']
+            if isinstance(single_input, str):
+                pts_bytes = mmengine.fileio.get(single_input)
+                points = np.frombuffer(pts_bytes, dtype=np.float32)
+                points = points.reshape(-1, self.load_dim)
+                points = points[:, self.use_dim]
+                pc_name = osp.basename(single_input).split('.bin')[0]
+                pc_name = f'{pc_name}.png'
+            elif isinstance(single_input, np.ndarray):
+                points = single_input.copy()
+                pc_num = str(self.num_visualized_frames).zfill(8)
+                pc_name = f'{pc_num}.png'
+            else:
+                raise ValueError('Unsupported input type: '
+                                 f'{type(single_input)}')
+
+            if img_out_dir != '' and show:
+                o3d_save_path = osp.join(img_out_dir, 'vis_lidar', pc_name)
+                mmengine.mkdir_or_exist(osp.dirname(o3d_save_path))
+            else:
+                o3d_save_path = None
+
+            data_input = dict(points=points)
+            self.visualizer.add_datasample(
+                pc_name,
+                data_input,
+                pred,
+                show=show,
+                wait_time=wait_time,
+                draw_gt=False,
+                draw_pred=draw_pred,
+                pred_score_thr=pred_score_thr,
+                o3d_save_path=o3d_save_path,
+                vis_task='lidar_seg',
+            )
+            results.append(points)
+            self.num_visualized_frames += 1
+
+        return results
diff --git a/mmde/mmdet3d/apis/inferencers/mono_det3d_inferencer.py b/mmde/mmdet3d/apis/inferencers/mono_det3d_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..22863ae166e089211c91f1db858a4610995a0e60
--- /dev/null
+++ b/mmde/mmdet3d/apis/inferencers/mono_det3d_inferencer.py
@@ -0,0 +1,251 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Dict, List, Optional, Sequence, Union
+
+import mmcv
+import mmengine
+import numpy as np
+from mmengine.dataset import Compose
+from mmengine.fileio import (get_file_backend, isdir, join_path,
+                             list_dir_or_file)
+from mmengine.infer.infer import ModelType
+from mmengine.structures import InstanceData
+
+from mmdet3d.registry import INFERENCERS
+from mmdet3d.utils import ConfigType
+from .base_3d_inferencer import Base3DInferencer
+
+InstanceList = List[InstanceData]
+InputType = Union[str, np.ndarray]
+InputsType = Union[InputType, Sequence[InputType]]
+PredType = Union[InstanceData, InstanceList]
+ImgType = Union[np.ndarray, Sequence[np.ndarray]]
+ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
+
+
+@INFERENCERS.register_module(name='det3d-mono')
+@INFERENCERS.register_module()
+class MonoDet3DInferencer(Base3DInferencer):
+    """MMDet3D Monocular 3D object detection inferencer.
+
+    Args:
+        model (str, optional): Path to the config file or the model name
+            defined in metafile. For example, it could be
+            "pgd_kitti" or
+            "configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py".
+            If model is not specified, user must provide the
+            `weights` saved by MMEngine which contains the config string.
+            Defaults to None.
+        weights (str, optional): Path to the checkpoint. If it is not specified
+            and model is a model name of metafile, the weights will be loaded
+            from metafile. Defaults to None.
+        device (str, optional): Device to run inference. If None, the available
+            device will be automatically used. Defaults to None.
+        scope (str): The scope of the model. Defaults to 'mmdet3d'.
+        palette (str): Color palette used for visualization. The order of
+            priority is palette -> config -> checkpoint. Defaults to 'none'.
+    """
+
+    def __init__(self,
+                 model: Union[ModelType, str, None] = None,
+                 weights: Optional[str] = None,
+                 device: Optional[str] = None,
+                 scope: str = 'mmdet3d',
+                 palette: str = 'none') -> None:
+        # A global counter tracking the number of images processed, for
+        # naming of the output images
+        self.num_visualized_imgs = 0
+        super(MonoDet3DInferencer, self).__init__(
+            model=model,
+            weights=weights,
+            device=device,
+            scope=scope,
+            palette=palette)
+
+    def _inputs_to_list(self,
+                        inputs: Union[dict, list],
+                        cam_type='CAM2',
+                        **kwargs) -> list:
+        """Preprocess the inputs to a list.
+
+        Preprocess inputs to a list according to its type:
+
+        - list or tuple: return inputs
+        - dict: the value with key 'img' is
+            - Directory path: return all files in the directory
+            - other cases: return a list containing the string. The string
+              could be a path to file, a url or other types of string according
+              to the task.
+
+        Args:
+            inputs (Union[dict, list]): Inputs for the inferencer.
+
+        Returns:
+            list: List of input for the :meth:`preprocess`.
+        """
+        if isinstance(inputs, dict):
+            assert 'infos' in inputs
+            infos = inputs.pop('infos')
+
+            if isinstance(inputs['img'], str):
+                img = inputs['img']
+                backend = get_file_backend(img)
+                if hasattr(backend, 'isdir') and isdir(img):
+                    # Backends like HttpsBackend do not implement `isdir`, so
+                    # only those backends that implement `isdir` could accept
+                    # the inputs as a directory
+                    filename_list = list_dir_or_file(img, list_dir=False)
+                    inputs = [{
+                        'img': join_path(img, filename)
+                    } for filename in filename_list]
+
+            if not isinstance(inputs, (list, tuple)):
+                inputs = [inputs]
+
+            # get cam2img, lidar2cam and lidar2img from infos
+            info_list = mmengine.load(infos)['data_list']
+            assert len(info_list) == len(inputs)
+            for index, input in enumerate(inputs):
+                data_info = info_list[index]
+                img_path = data_info['images'][cam_type]['img_path']
+                if isinstance(input['img'], str) and \
+                        osp.basename(img_path) != osp.basename(input['img']):
+                    raise ValueError(
+                        f'the info file of {img_path} is not provided.')
+                cam2img = np.asarray(
+                    data_info['images'][cam_type]['cam2img'], dtype=np.float32)
+                lidar2cam = np.asarray(
+                    data_info['images'][cam_type]['lidar2cam'],
+                    dtype=np.float32)
+                if 'lidar2img' in data_info['images'][cam_type]:
+                    lidar2img = np.asarray(
+                        data_info['images'][cam_type]['lidar2img'],
+                        dtype=np.float32)
+                else:
+                    lidar2img = cam2img @ lidar2cam
+                input['cam2img'] = cam2img
+                input['lidar2cam'] = lidar2cam
+                input['lidar2img'] = lidar2img
+        elif isinstance(inputs, (list, tuple)):
+            # get cam2img, lidar2cam and lidar2img from infos
+            for input in inputs:
+                assert 'infos' in input
+                infos = input.pop('infos')
+                info_list = mmengine.load(infos)['data_list']
+                assert len(info_list) == 1, 'Only support single sample info' \
+                    'in `.pkl`, when inputs is a list.'
+                data_info = info_list[0]
+                img_path = data_info['images'][cam_type]['img_path']
+                if isinstance(input['img'], str) and \
+                        osp.basename(img_path) != osp.basename(input['img']):
+                    raise ValueError(
+                        f'the info file of {img_path} is not provided.')
+                cam2img = np.asarray(
+                    data_info['images'][cam_type]['cam2img'], dtype=np.float32)
+                lidar2cam = np.asarray(
+                    data_info['images'][cam_type]['lidar2cam'],
+                    dtype=np.float32)
+                if 'lidar2img' in data_info['images'][cam_type]:
+                    lidar2img = np.asarray(
+                        data_info['images'][cam_type]['lidar2img'],
+                        dtype=np.float32)
+                else:
+                    lidar2img = cam2img @ lidar2cam
+                input['cam2img'] = cam2img
+                input['lidar2cam'] = lidar2cam
+                input['lidar2img'] = lidar2img
+
+        return list(inputs)
+
+    def _init_pipeline(self, cfg: ConfigType) -> Compose:
+        """Initialize the test pipeline."""
+        pipeline_cfg = cfg.test_dataloader.dataset.pipeline
+
+        load_img_idx = self._get_transform_idx(pipeline_cfg,
+                                               'LoadImageFromFileMono3D')
+        if load_img_idx == -1:
+            raise ValueError(
+                'LoadImageFromFileMono3D is not found in the test pipeline')
+        pipeline_cfg[load_img_idx]['type'] = 'MonoDet3DInferencerLoader'
+        return Compose(pipeline_cfg)
+
+    def visualize(self,
+                  inputs: InputsType,
+                  preds: PredType,
+                  return_vis: bool = False,
+                  show: bool = False,
+                  wait_time: int = 0,
+                  draw_pred: bool = True,
+                  pred_score_thr: float = 0.3,
+                  no_save_vis: bool = False,
+                  img_out_dir: str = '',
+                  cam_type_dir: str = 'CAM2') -> Union[List[np.ndarray], None]:
+        """Visualize predictions.
+
+        Args:
+            inputs (List[Dict]): Inputs for the inferencer.
+            preds (List[Dict]): Predictions of the model.
+            return_vis (bool): Whether to return the visualization result.
+                Defaults to False.
+            show (bool): Whether to display the image in a popup window.
+                Defaults to False.
+            wait_time (float): The interval of show (s). Defaults to 0.
+            draw_pred (bool): Whether to draw predicted bounding boxes.
+                Defaults to True.
+            pred_score_thr (float): Minimum score of bboxes to draw.
+                Defaults to 0.3.
+            no_save_vis (bool): Whether to save visualization results.
+            img_out_dir (str): Output directory of visualization results.
+                If left as empty, no file will be saved. Defaults to ''.
+            cam_type_dir (str): Camera type directory. Defaults to 'CAM2'.
+
+        Returns:
+            List[np.ndarray] or None: Returns visualization results only if
+            applicable.
+        """
+        if no_save_vis is True:
+            img_out_dir = ''
+
+        if not show and img_out_dir == '' and not return_vis:
+            return None
+
+        if getattr(self, 'visualizer') is None:
+            raise ValueError('Visualization needs the "visualizer" term'
+                             'defined in the config, but got None.')
+
+        results = []
+
+        for single_input, pred in zip(inputs, preds):
+            if isinstance(single_input['img'], str):
+                img_bytes = mmengine.fileio.get(single_input['img'])
+                img = mmcv.imfrombytes(img_bytes)
+                img = img[:, :, ::-1]
+                img_name = osp.basename(single_input['img'])
+            elif isinstance(single_input['img'], np.ndarray):
+                img = single_input['img'].copy()
+                img_num = str(self.num_visualized_imgs).zfill(8)
+                img_name = f'{img_num}.jpg'
+            else:
+                raise ValueError('Unsupported input type: '
+                                 f"{type(single_input['img'])}")
+
+            out_file = osp.join(img_out_dir, 'vis_camera', cam_type_dir,
+                                img_name) if img_out_dir != '' else None
+
+            data_input = dict(img=img)
+            self.visualizer.add_datasample(
+                img_name,
+                data_input,
+                pred,
+                show=show,
+                wait_time=wait_time,
+                draw_gt=False,
+                draw_pred=draw_pred,
+                pred_score_thr=pred_score_thr,
+                out_file=out_file,
+                vis_task='mono_det',
+            )
+            results.append(img)
+            self.num_visualized_imgs += 1
+
+        return results
diff --git a/mmde/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py b/mmde/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6717bb18c8209aab634f7dfe844f730e6f6c495f
--- /dev/null
+++ b/mmde/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py
@@ -0,0 +1,315 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import warnings
+from typing import Dict, List, Optional, Sequence, Union
+
+import mmcv
+import mmengine
+import numpy as np
+from mmengine.dataset import Compose
+from mmengine.fileio import (get_file_backend, isdir, join_path,
+                             list_dir_or_file)
+from mmengine.infer.infer import ModelType
+from mmengine.structures import InstanceData
+
+from mmdet3d.registry import INFERENCERS
+from mmdet3d.utils import ConfigType
+from .base_3d_inferencer import Base3DInferencer
+
+InstanceList = List[InstanceData]
+InputType = Union[str, np.ndarray]
+InputsType = Union[InputType, Sequence[InputType]]
+PredType = Union[InstanceData, InstanceList]
+ImgType = Union[np.ndarray, Sequence[np.ndarray]]
+ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]]
+
+
+@INFERENCERS.register_module(name='det3d-multi_modality')
+@INFERENCERS.register_module()
+class MultiModalityDet3DInferencer(Base3DInferencer):
+    """The inferencer of multi-modality detection.
+
+    Args:
+        model (str, optional): Path to the config file or the model name
+            defined in metafile. For example, it could be
+            "pointpillars_kitti-3class" or
+            "configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py". # noqa: E501
+            If model is not specified, user must provide the
+            `weights` saved by MMEngine which contains the config string.
+            Defaults to None.
+        weights (str, optional): Path to the checkpoint. If it is not specified
+            and model is a model name of metafile, the weights will be loaded
+            from metafile. Defaults to None.
+        device (str, optional): Device to run inference. If None, the available
+            device will be automatically used. Defaults to None.
+        scope (str): The scope of registry. Defaults to 'mmdet3d'.
+        palette (str): The palette of visualization. Defaults to 'none'.
+    """
+
+    def __init__(self,
+                 model: Union[ModelType, str, None] = None,
+                 weights: Optional[str] = None,
+                 device: Optional[str] = None,
+                 scope: str = 'mmdet3d',
+                 palette: str = 'none') -> None:
+        # A global counter tracking the number of frames processed, for
+        # naming of the output results
+        self.num_visualized_frames = 0
+        super(MultiModalityDet3DInferencer, self).__init__(
+            model=model,
+            weights=weights,
+            device=device,
+            scope=scope,
+            palette=palette)
+
+    def _inputs_to_list(self,
+                        inputs: Union[dict, list],
+                        cam_type: str = 'CAM2',
+                        **kwargs) -> list:
+        """Preprocess the inputs to a list.
+
+        Preprocess inputs to a list according to its type:
+
+        - list or tuple: return inputs
+        - dict: the value with key 'points' is
+            - Directory path: return all files in the directory
+            - other cases: return a list containing the string. The string
+              could be a path to file, a url or other types of string according
+              to the task.
+
+        Args:
+            inputs (Union[dict, list]): Inputs for the inferencer.
+
+        Returns:
+            list: List of input for the :meth:`preprocess`.
+        """
+        if isinstance(inputs, dict):
+            assert 'infos' in inputs
+            infos = inputs.pop('infos')
+
+            if isinstance(inputs['img'], str):
+                img, pcd = inputs['img'], inputs['points']
+                backend = get_file_backend(img)
+                if hasattr(backend, 'isdir') and isdir(img) and isdir(pcd):
+                    # Backends like HttpsBackend do not implement `isdir`, so
+                    # only those backends that implement `isdir` could accept
+                    # the inputs as a directory
+                    img_filename_list = list_dir_or_file(
+                        img, list_dir=False, suffix=['.png', '.jpg'])
+                    pcd_filename_list = list_dir_or_file(
+                        pcd, list_dir=False, suffix='.bin')
+                    assert len(img_filename_list) == len(pcd_filename_list)
+
+                    inputs = [{
+                        'img': join_path(img, img_filename),
+                        'points': join_path(pcd, pcd_filename)
+                    } for pcd_filename, img_filename in zip(
+                        pcd_filename_list, img_filename_list)]
+
+            if not isinstance(inputs, (list, tuple)):
+                inputs = [inputs]
+
+            # get cam2img, lidar2cam and lidar2img from infos
+            info_list = mmengine.load(infos)['data_list']
+            assert len(info_list) == len(inputs)
+            for index, input in enumerate(inputs):
+                data_info = info_list[index]
+                img_path = data_info['images'][cam_type]['img_path']
+                if isinstance(input['img'], str) and \
+                        osp.basename(img_path) != osp.basename(input['img']):
+                    raise ValueError(
+                        f'the info file of {img_path} is not provided.')
+                cam2img = np.asarray(
+                    data_info['images'][cam_type]['cam2img'], dtype=np.float32)
+                lidar2cam = np.asarray(
+                    data_info['images'][cam_type]['lidar2cam'],
+                    dtype=np.float32)
+                if 'lidar2img' in data_info['images'][cam_type]:
+                    lidar2img = np.asarray(
+                        data_info['images'][cam_type]['lidar2img'],
+                        dtype=np.float32)
+                else:
+                    lidar2img = cam2img @ lidar2cam
+                input['cam2img'] = cam2img
+                input['lidar2cam'] = lidar2cam
+                input['lidar2img'] = lidar2img
+        elif isinstance(inputs, (list, tuple)):
+            # get cam2img, lidar2cam and lidar2img from infos
+            for input in inputs:
+                assert 'infos' in input
+                infos = input.pop('infos')
+                info_list = mmengine.load(infos)['data_list']
+                assert len(info_list) == 1, 'Only support single sample' \
+                    'info in `.pkl`, when input is a list.'
+                data_info = info_list[0]
+                img_path = data_info['images'][cam_type]['img_path']
+                if isinstance(input['img'], str) and \
+                        osp.basename(img_path) != osp.basename(input['img']):
+                    raise ValueError(
+                        f'the info file of {img_path} is not provided.')
+                cam2img = np.asarray(
+                    data_info['images'][cam_type]['cam2img'], dtype=np.float32)
+                lidar2cam = np.asarray(
+                    data_info['images'][cam_type]['lidar2cam'],
+                    dtype=np.float32)
+                if 'lidar2img' in data_info['images'][cam_type]:
+                    lidar2img = np.asarray(
+                        data_info['images'][cam_type]['lidar2img'],
+                        dtype=np.float32)
+                else:
+                    lidar2img = cam2img @ lidar2cam
+                input['cam2img'] = cam2img
+                input['lidar2cam'] = lidar2cam
+                input['lidar2img'] = lidar2img
+
+        return list(inputs)
+
+    def _init_pipeline(self, cfg: ConfigType) -> Compose:
+        """Initialize the test pipeline."""
+        pipeline_cfg = cfg.test_dataloader.dataset.pipeline
+
+        load_point_idx = self._get_transform_idx(pipeline_cfg,
+                                                 'LoadPointsFromFile')
+        load_mv_img_idx = self._get_transform_idx(
+            pipeline_cfg, 'LoadMultiViewImageFromFiles')
+        if load_mv_img_idx != -1:
+            warnings.warn(
+                'LoadMultiViewImageFromFiles is not supported yet in the '
+                'multi-modality inferencer. Please remove it')
+        # Now, we only support ``LoadImageFromFile`` as the image loader in the
+        # original piepline. `LoadMultiViewImageFromFiles` is not supported
+        # yet.
+        load_img_idx = self._get_transform_idx(pipeline_cfg,
+                                               'LoadImageFromFile')
+
+        if load_point_idx == -1 or load_img_idx == -1:
+            raise ValueError(
+                'Both LoadPointsFromFile and LoadImageFromFile must '
+                'be specified the pipeline, but LoadPointsFromFile is '
+                f'{load_point_idx == -1} and LoadImageFromFile is '
+                f'{load_img_idx}')
+
+        load_cfg = pipeline_cfg[load_point_idx]
+        self.coord_type, self.load_dim = load_cfg['coord_type'], load_cfg[
+            'load_dim']
+        self.use_dim = list(range(load_cfg['use_dim'])) if isinstance(
+            load_cfg['use_dim'], int) else load_cfg['use_dim']
+
+        load_point_args = pipeline_cfg[load_point_idx]
+        load_point_args.pop('type')
+        load_img_args = pipeline_cfg[load_img_idx]
+        load_img_args.pop('type')
+
+        load_idx = min(load_point_idx, load_img_idx)
+        pipeline_cfg.pop(max(load_point_idx, load_img_idx))
+
+        pipeline_cfg[load_idx] = dict(
+            type='MultiModalityDet3DInferencerLoader',
+            load_point_args=load_point_args,
+            load_img_args=load_img_args)
+
+        return Compose(pipeline_cfg)
+
+    def visualize(self,
+                  inputs: InputsType,
+                  preds: PredType,
+                  return_vis: bool = False,
+                  show: bool = False,
+                  wait_time: int = 0,
+                  draw_pred: bool = True,
+                  pred_score_thr: float = 0.3,
+                  no_save_vis: bool = False,
+                  img_out_dir: str = '',
+                  cam_type_dir: str = 'CAM2') -> Union[List[np.ndarray], None]:
+        """Visualize predictions.
+
+        Args:
+            inputs (InputsType): Inputs for the inferencer.
+            preds (PredType): Predictions of the model.
+            return_vis (bool): Whether to return the visualization result.
+                Defaults to False.
+            show (bool): Whether to display the image in a popup window.
+                Defaults to False.
+            wait_time (float): The interval of show (s). Defaults to 0.
+            draw_pred (bool): Whether to draw predicted bounding boxes.
+                Defaults to True.
+            no_save_vis (bool): Whether to save visualization results.
+            pred_score_thr (float): Minimum score of bboxes to draw.
+                Defaults to 0.3.
+            img_out_dir (str): Output directory of visualization results.
+                If left as empty, no file will be saved. Defaults to ''.
+
+        Returns:
+            List[np.ndarray] or None: Returns visualization results only if
+            applicable.
+        """
+        if no_save_vis is True:
+            img_out_dir = ''
+
+        if not show and img_out_dir == '' and not return_vis:
+            return None
+
+        if getattr(self, 'visualizer') is None:
+            raise ValueError('Visualization needs the "visualizer" term'
+                             'defined in the config, but got None.')
+
+        results = []
+
+        for single_input, pred in zip(inputs, preds):
+            points_input = single_input['points']
+            if isinstance(points_input, str):
+                pts_bytes = mmengine.fileio.get(points_input)
+                points = np.frombuffer(pts_bytes, dtype=np.float32)
+                points = points.reshape(-1, self.load_dim)
+                points = points[:, self.use_dim]
+                pc_name = osp.basename(points_input).split('.bin')[0]
+                pc_name = f'{pc_name}.png'
+            elif isinstance(points_input, np.ndarray):
+                points = points_input.copy()
+                pc_num = str(self.num_visualized_frames).zfill(8)
+                pc_name = f'{pc_num}.png'
+            else:
+                raise ValueError('Unsupported input type: '
+                                 f'{type(points_input)}')
+
+            if img_out_dir != '' and show:
+                o3d_save_path = osp.join(img_out_dir, 'vis_lidar', pc_name)
+                mmengine.mkdir_or_exist(osp.dirname(o3d_save_path))
+            else:
+                o3d_save_path = None
+
+            img_input = single_input['img']
+            if isinstance(single_input['img'], str):
+                img_bytes = mmengine.fileio.get(img_input)
+                img = mmcv.imfrombytes(img_bytes)
+                img = img[:, :, ::-1]
+                img_name = osp.basename(img_input)
+            elif isinstance(img_input, np.ndarray):
+                img = img_input.copy()
+                img_num = str(self.num_visualized_frames).zfill(8)
+                img_name = f'{img_num}.jpg'
+            else:
+                raise ValueError('Unsupported input type: '
+                                 f'{type(img_input)}')
+
+            out_file = osp.join(img_out_dir, 'vis_camera', cam_type_dir,
+                                img_name) if img_out_dir != '' else None
+
+            data_input = dict(points=points, img=img)
+            self.visualizer.add_datasample(
+                pc_name,
+                data_input,
+                pred,
+                show=show,
+                wait_time=wait_time,
+                draw_gt=False,
+                draw_pred=draw_pred,
+                pred_score_thr=pred_score_thr,
+                o3d_save_path=o3d_save_path,
+                out_file=out_file,
+                vis_task='multi-modality_det',
+            )
+            results.append(points)
+            self.num_visualized_frames += 1
+
+        return results
diff --git a/mmde/mmdet3d/configs/_base_/datasets/kitti_3d_3class.py b/mmde/mmdet3d/configs/_base_/datasets/kitti_3d_3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..085cab4cb4b76959465bef909cbcd77c2d46cfc7
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/kitti_3d_3class.py
@@ -0,0 +1,181 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.dataset_wrapper import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.kitti_dataset import KittiDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (  # noqa
+    GlobalRotScaleTrans, ObjectNoise, ObjectRangeFilter, ObjectSample,
+    PointShuffle, PointsRangeFilter, RandomFlip3D)
+from mmdet3d.evaluation.metrics.kitti_metric import KittiMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+metainfo = dict(classes=class_names)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/kitti/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+    classes=class_names,
+    sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6),
+    points_loader=dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,  # x, y, z, intensity
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(type=ObjectSample, db_sampler=db_sampler),
+    dict(
+        type=ObjectNoise,
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D),
+            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range)
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=RepeatDataset,
+        times=2,
+        dataset=dict(
+            type=KittiDataset,
+            data_root=data_root,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(pts='training/velodyne_reduced'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=KittiDataset,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=KittiDataset,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_evaluator = dict(
+    type=KittiMetric,
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/kitti_3d_car.py b/mmde/mmdet3d/configs/_base_/datasets/kitti_3d_car.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ffe20d44e699b9db4c53054801cfc004ab1f674
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/kitti_3d_car.py
@@ -0,0 +1,179 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.dataset_wrapper import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.kitti_dataset import KittiDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (  # noqa
+    GlobalRotScaleTrans, ObjectNoise, ObjectRangeFilter, ObjectSample,
+    PointShuffle, PointsRangeFilter, RandomFlip3D)
+from mmdet3d.evaluation.metrics.kitti_metric import KittiMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+metainfo = dict(classes=class_names)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/kitti/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15),
+    points_loader=dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,  # x, y, z, intensity
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(type=ObjectSample, db_sampler=db_sampler),
+    dict(
+        type=ObjectNoise,
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D),
+            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range)
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=6,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=RepeatDataset,
+        times=2,
+        dataset=dict(
+            type=KittiDataset,
+            data_root=data_root,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(pts='training/velodyne_reduced'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=KittiDataset,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=KittiDataset,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne_reduced'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_evaluator = dict(
+    type=KittiMetric,
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/kitti_mono3d.py b/mmde/mmdet3d/configs/_base_/datasets/kitti_mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..4043ae1e6f75b6c904f6b3cdadbc2220182c5b80
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/kitti_mono3d.py
@@ -0,0 +1,113 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.transforms.processing import Resize
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.kitti_dataset import KittiDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadImageFromFileMono3D)
+from mmdet3d.datasets.transforms.transforms_3d import RandomFlip3D
+from mmdet3d.evaluation.metrics.kitti_metric import KittiMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+input_modality = dict(use_lidar=False, use_camera=True)
+metainfo = dict(classes=class_names)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/kitti/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type=Resize, scale=(1242, 375), keep_ratio=True),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(
+        type=Pack3DDetInputs,
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+test_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(type=Resize, scale=(1242, 375), keep_ratio=True),
+    dict(type=Pack3DDetInputs, keys=['img'])
+]
+eval_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(type=Pack3DDetInputs, keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=KittiDataset,
+        data_root=data_root,
+        ann_file='kitti_infos_train.pkl',
+        data_prefix=dict(img='training/image_2'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        load_type='fov_image_based',
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='Camera' in monocular 3d
+        # detection task
+        box_type_3d='Camera',
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=KittiDataset,
+        data_root=data_root,
+        data_prefix=dict(img='training/image_2'),
+        ann_file='kitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        load_type='fov_image_based',
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Camera',
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type=KittiMetric,
+    ann_file=data_root + 'kitti_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/lyft_3d.py b/mmde/mmdet3d/configs/_base_/datasets/lyft_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdab42dd99818a758198daa4d377273cc6125363
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/lyft_3d.py
@@ -0,0 +1,176 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.lyft_dataset import LyftDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 LoadPointsFromMultiSweeps)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (GlobalRotScaleTrans,
+                                                       ObjectRangeFilter,
+                                                       PointShuffle,
+                                                       PointsRangeFilter,
+                                                       RandomFlip3D)
+from mmdet3d.evaluation.metrics.lyft_metric import LyftMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-80, -80, -5, 80, 80, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+    'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+    'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+# Input modality for Lyft dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(use_lidar=True, use_camera=False)
+data_prefix = dict(pts='v1.01-train/lidar', img='', sweeps='v1.01-train/lidar')
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/lyft/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D),
+            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range)
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=LyftDataset,
+        data_root=data_root,
+        ann_file='lyft_infos_train.pkl',
+        pipeline=train_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        data_prefix=data_prefix,
+        test_mode=False,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=LyftDataset,
+        data_root=data_root,
+        ann_file='lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        data_prefix=data_prefix,
+        test_mode=True,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=LyftDataset,
+        data_root=data_root,
+        ann_file='lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        test_mode=True,
+        data_prefix=data_prefix,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type=LyftMetric,
+    data_root=data_root,
+    ann_file='lyft_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/lyft_3d_range100.py b/mmde/mmdet3d/configs/_base_/datasets/lyft_3d_range100.py
new file mode 100644
index 0000000000000000000000000000000000000000..eab6aea5982501c72fbe45d5c842aa7cfb55a0c0
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/lyft_3d_range100.py
@@ -0,0 +1,166 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.lyft_dataset import LyftDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 LoadPointsFromMultiSweeps)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (GlobalRotScaleTrans,
+                                                       ObjectRangeFilter,
+                                                       PointShuffle,
+                                                       PointsRangeFilter,
+                                                       RandomFlip3D)
+from mmdet3d.evaluation.metrics.lyft_metric import LyftMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-100, -100, -5, 100, 100, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+    'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+    'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+data_prefix = dict(pts='v1.01-train/lidar', img='', sweeps='v1.01-train/lidar')
+# Input modality for Lyft dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+    use_lidar=True,
+    use_camera=False,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/lyft/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D),
+            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=LyftDataset,
+        data_root=data_root,
+        ann_file='lyft_infos_train.pkl',
+        pipeline=train_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        data_prefix=data_prefix,
+        test_mode=False,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=LyftDataset,
+        data_root=data_root,
+        ann_file='lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=dict(classes=class_names),
+        modality=input_modality,
+        test_mode=True,
+        data_prefix=data_prefix,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type=LyftMetric,
+    data_root=data_root,
+    ann_file='lyft_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/nuim_instance.py b/mmde/mmdet3d/configs/_base_/datasets/nuim_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..44323d8365f802c482c3d5aa269796546e1077c1
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/nuim_instance.py
@@ -0,0 +1,74 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.transforms.loading import LoadAnnotations, LoadImageFromFile
+from mmcv.transforms.processing import MultiScaleFlipAug, RandomFlip, Resize
+
+dataset_type = 'CocoDataset'
+data_root = 'data/nuimages/'
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/nuimages/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type=LoadImageFromFile, backend_args=backend_args),
+    dict(type=LoadAnnotations, with_bbox=True, with_mask=True),
+    dict(
+        type=Resize,
+        img_scale=[(1280, 720), (1920, 1080)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type=RandomFlip, flip_ratio=0.5),
+    dict(type='PackDetInputs'),
+]
+test_pipeline = [
+    dict(type=LoadImageFromFile, backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug,
+        img_scale=(1600, 900),
+        flip=False,
+        transforms=[
+            dict(type=Resize, keep_ratio=True),
+            dict(type=RandomFlip),
+        ]),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor')),
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/nuimages_v1.0-train.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/mmde/mmdet3d/configs/_base_/datasets/nus_3d.py b/mmde/mmdet3d/configs/_base_/datasets/nus_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..feebc0f2bd3210b80d9c01b1cd18e1c206491c2f
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/nus_3d.py
@@ -0,0 +1,183 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.nuscenes_dataset import NuScenesDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 LoadPointsFromMultiSweeps)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (  # noqa
+    GlobalRotScaleTrans, ObjectNameFilter, ObjectRangeFilter, PointShuffle,
+    PointsRangeFilter, RandomFlip3D)
+from mmdet3d.evaluation.metrics.nuscenes_metric import NuScenesMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-50, -50.8, -5, 50, 49.2, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+metainfo = dict(classes=class_names)
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+# Input modality for nuScenes dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(use_lidar=True, use_camera=False)
+data_prefix = dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP')
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/nuscenes/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectNameFilter, classes=class_names),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=10,
+        test_mode=True,
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D),
+            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range)
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=10,
+        test_mode=True,
+        backend_args=backend_args),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=NuScenesDataset,
+        data_root=data_root,
+        ann_file='nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        test_mode=False,
+        data_prefix=data_prefix,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=NuScenesDataset,
+        data_root=data_root,
+        ann_file='nuscenes_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        data_prefix=data_prefix,
+        test_mode=True,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=NuScenesDataset,
+        data_root=data_root,
+        ann_file='nuscenes_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        test_mode=True,
+        data_prefix=data_prefix,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type=NuScenesMetric,
+    data_root=data_root,
+    ann_file=data_root + 'nuscenes_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/nus_mono3d.py b/mmde/mmdet3d/configs/_base_/datasets/nus_mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c81226634584bd7a28496e8897c44bcd1e463e7
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/nus_mono3d.py
@@ -0,0 +1,132 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.transforms.processing import Resize
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.nuscenes_dataset import NuScenesDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadImageFromFileMono3D)
+from mmdet3d.datasets.transforms.transforms_3d import RandomFlip3D
+from mmdet3d.evaluation.metrics.nuscenes_metric import NuScenesMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+metainfo = dict(classes=class_names)
+# Input modality for nuScenes dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(use_lidar=False, use_camera=True)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/nuscenes/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type=Resize, scale=(1600, 900), keep_ratio=True),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(
+        type=Pack3DDetInputs,
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels',
+            'gt_bboxes_3d', 'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+
+test_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(type=Resize, scale=(1600, 900), keep_ratio=True),
+    dict(type=Pack3DDetInputs, keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=NuScenesDataset,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='',
+            CAM_FRONT='samples/CAM_FRONT',
+            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+            CAM_BACK='samples/CAM_BACK',
+            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
+        ann_file='nuscenes_infos_train.pkl',
+        load_type='mv_image_based',
+        pipeline=train_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        test_mode=False,
+        # we use box_type_3d='Camera' in monocular 3d
+        # detection task
+        box_type_3d='Camera',
+        use_valid_flag=True,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=NuScenesDataset,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='',
+            CAM_FRONT='samples/CAM_FRONT',
+            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+            CAM_BACK='samples/CAM_BACK',
+            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
+        ann_file='nuscenes_infos_val.pkl',
+        load_type='mv_image_based',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Camera',
+        use_valid_flag=True,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type=NuScenesMetric,
+    data_root=data_root,
+    ann_file=data_root + 'nuscenes_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/s3dis_3d.py b/mmde/mmdet3d/configs/_base_/datasets/s3dis_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..da7b7aa6bc6fd15ae8406a00571dc117e6686b29
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/s3dis_3d.py
@@ -0,0 +1,150 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.dataset_wrapper import ConcatDataset, RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.s3dis_dataset import S3DISDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 NormalizePointsColor)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (GlobalRotScaleTrans,
+                                                       PointSample,
+                                                       RandomFlip3D)
+from mmdet3d.evaluation.metrics.indoor_metric import IndoorMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# dataset settings
+dataset_type = 'S3DISDataset'
+data_root = 'data/s3dis/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/s3dis/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+metainfo = dict(classes=('table', 'chair', 'sofa', 'bookcase', 'board'))
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(type=PointSample, num_points=100000),
+    dict(
+        type=RandomFlip3D,
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[0.9, 1.1],
+        translation_std=[.1, .1, .1],
+        shift_height=False),
+    dict(type=NormalizePointsColor, color_mean=None),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type=RandomFlip3D,
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type=PointSample, num_points=100000),
+            dict(type=NormalizePointsColor, color_mean=None),
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=RepeatDataset,
+        times=13,
+        dataset=dict(
+            type=ConcatDataset,
+            datasets=[
+                dict(
+                    type=S3DISDataset,
+                    data_root=data_root,
+                    ann_file=f's3dis_infos_Area_{i}.pkl',
+                    pipeline=train_pipeline,
+                    filter_empty_gt=True,
+                    metainfo=metainfo,
+                    box_type_3d='Depth',
+                    backend_args=backend_args) for i in train_area
+            ])))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=S3DISDataset,
+        data_root=data_root,
+        ann_file=f's3dis_infos_Area_{test_area}.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=S3DISDataset,
+        data_root=data_root,
+        ann_file=f's3dis_infos_Area_{test_area}.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type=IndoorMetric)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/s3dis_seg.py b/mmde/mmdet3d/configs/_base_/datasets/s3dis_seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..be1353a7e87bb4033a56c8bd79d73c09e3993f1c
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/s3dis_seg.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.transforms.processing import TestTimeAug
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.s3dis_dataset import S3DISSegDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 NormalizePointsColor,
+                                                 PointSegClassMapping)
+from mmdet3d.datasets.transforms.transforms_3d import (IndoorPatchPointSample,
+                                                       RandomFlip3D)
+from mmdet3d.evaluation.metrics.seg_metric import SegMetric
+from mmdet3d.models.segmentors.seg3d_tta import Seg3DTTAModel
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# For S3DIS seg we usually do 13-class segmentation
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+metainfo = dict(classes=class_names)
+dataset_type = 'S3DISSegDataset'
+data_root = 'data/s3dis/'
+input_modality = dict(use_lidar=True, use_camera=False)
+data_prefix = dict(
+    pts='points',
+    pts_instance_mask='instance_mask',
+    pts_semantic_mask='semantic_mask')
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/s3dis/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+num_points = 4096
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type=PointSegClassMapping),
+    dict(
+        type=IndoorPatchPointSample,
+        num_points=num_points,
+        block_size=1.0,
+        ignore_index=len(class_names),
+        use_normalized_coord=True,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type=NormalizePointsColor, color_mean=None),
+    dict(type=Pack3DDetInputs, keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type=NormalizePointsColor, color_mean=None),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(type=NormalizePointsColor, color_mean=None),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+tta_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type=NormalizePointsColor, color_mean=None),
+    dict(
+        type=TestTimeAug,
+        transforms=[[
+            dict(
+                type=RandomFlip3D,
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.,
+                flip_ratio_bev_vertical=0.)
+        ], [dict(type=Pack3DDetInputs, keys=['points'])]])
+]
+
+# train on area 1, 2, 3, 4, 6
+# test on area 5
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=S3DISSegDataset,
+        data_root=data_root,
+        ann_files=[f's3dis_infos_Area_{i}.pkl' for i in train_area],
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=train_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=[
+            f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area
+        ],
+        test_mode=False,
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=S3DISSegDataset,
+        data_root=data_root,
+        ann_files=f's3dis_infos_Area_{test_area}.pkl',
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=f'seg_info/Area_{test_area}_resampled_scene_idxs.npy',
+        test_mode=True,
+        backend_args=backend_args))
+val_dataloader = test_dataloader
+
+val_evaluator = dict(type=SegMetric)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
+
+tta_model = dict(type=Seg3DTTAModel)
diff --git a/mmde/mmdet3d/configs/_base_/datasets/scannet_3d.py b/mmde/mmdet3d/configs/_base_/datasets/scannet_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..b90f9d0839ea19df4504a94c76e829607ecea347
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/scannet_3d.py
@@ -0,0 +1,159 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.dataset_wrapper import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.scannet_dataset import ScanNetDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 PointSegClassMapping)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (GlobalAlignment,
+                                                       GlobalRotScaleTrans,
+                                                       PointSample,
+                                                       RandomFlip3D)
+from mmdet3d.evaluation.metrics.indoor_metric import IndoorMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = 'data/scannet/'
+
+metainfo = dict(
+    classes=('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+             'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+             'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+             'garbagebin'))
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/scannet/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type=GlobalAlignment, rotation_axis=2),
+    dict(type=PointSegClassMapping),
+    dict(type=PointSample, num_points=40000),
+    dict(
+        type=RandomFlip3D,
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0],
+        shift_height=True),
+    dict(
+        type=Pack3DDetInputs,
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type=GlobalAlignment, rotation_axis=2),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type=RandomFlip3D,
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type=PointSample, num_points=40000),
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=RepeatDataset,
+        times=5,
+        dataset=dict(
+            type=ScanNetDataset,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=ScanNetDataset,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=ScanNetDataset,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type=IndoorMetric)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/scannet_seg.py b/mmde/mmdet3d/configs/_base_/datasets/scannet_seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2ec8cf417ce7fb8d97f069c0b7a71e2603f9796
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/scannet_seg.py
@@ -0,0 +1,181 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.transforms.processing import TestTimeAug
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.scannet_dataset import ScanNetSegDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 NormalizePointsColor,
+                                                 PointSegClassMapping)
+from mmdet3d.datasets.transforms.transforms_3d import (IndoorPatchPointSample,
+                                                       RandomFlip3D)
+from mmdet3d.evaluation.metrics.seg_metric import SegMetric
+from mmdet3d.models.segmentors.seg3d_tta import Seg3DTTAModel
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# For ScanNet seg we usually do 20-class segmentation
+class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
+               'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
+               'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink',
+               'bathtub', 'otherfurniture')
+metainfo = dict(classes=class_names)
+dataset_type = 'ScanNetSegDataset'
+data_root = 'data/scannet/'
+input_modality = dict(use_lidar=True, use_camera=False)
+data_prefix = dict(
+    pts='points',
+    pts_instance_mask='instance_mask',
+    pts_semantic_mask='semantic_mask')
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/scannet/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+num_points = 8192
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type=PointSegClassMapping),
+    dict(
+        type=IndoorPatchPointSample,
+        num_points=num_points,
+        block_size=1.5,
+        ignore_index=len(class_names),
+        use_normalized_coord=False,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type=NormalizePointsColor, color_mean=None),
+    dict(type=Pack3DDetInputs, keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type=NormalizePointsColor, color_mean=None),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(type=NormalizePointsColor, color_mean=None),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+tta_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5],
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type=NormalizePointsColor, color_mean=None),
+    dict(
+        type=TestTimeAug,
+        transforms=[[
+            dict(
+                type=RandomFlip3D,
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.,
+                flip_ratio_bev_vertical=0.)
+        ], [dict(type=Pack3DDetInputs, keys=['points'])]])
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=ScanNetSegDataset,
+        data_root=data_root,
+        ann_file='scannet_infos_train.pkl',
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=train_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy',
+        test_mode=False,
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=ScanNetSegDataset,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        metainfo=metainfo,
+        data_prefix=data_prefix,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        ignore_index=len(class_names),
+        test_mode=True,
+        backend_args=backend_args))
+val_dataloader = test_dataloader
+
+val_evaluator = dict(type=SegMetric)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
+
+tta_model = dict(type=Seg3DTTAModel)
diff --git a/mmde/mmdet3d/configs/_base_/datasets/semantickitti.py b/mmde/mmdet3d/configs/_base_/datasets/semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e756ab455dc3473138b5960e0d1f976855eb220
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/semantickitti.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.transforms.processing import TestTimeAug
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.semantickitti_dataset import SemanticKittiDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 PointSegClassMapping)
+from mmdet3d.datasets.transforms.transforms_3d import (GlobalRotScaleTrans,
+                                                       RandomFlip3D)
+from mmdet3d.evaluation.metrics.seg_metric import SegMetric
+from mmdet3d.models.segmentors.seg3d_tta import Seg3DTTAModel
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# For SemanticKitti we usually do 19-class segmentation.
+# For labels_map we follow the uniform format of MMDetection & MMSegmentation
+# i.e. we consider the unlabeled class as the last one, which is different
+# from the original implementation of some methods e.g. Cylinder3D.
+dataset_type = 'SemanticKittiDataset'
+data_root = 'data/semantickitti/'
+class_names = [
+    'car', 'bicycle', 'motorcycle', 'truck', 'bus', 'person', 'bicyclist',
+    'motorcyclist', 'road', 'parking', 'sidewalk', 'other-ground', 'building',
+    'fence', 'vegetation', 'trunck', 'terrian', 'pole', 'traffic-sign'
+]
+labels_map = {
+    0: 19,  # "unlabeled"
+    1: 19,  # "outlier" mapped to "unlabeled" --------------mapped
+    10: 0,  # "car"
+    11: 1,  # "bicycle"
+    13: 4,  # "bus" mapped to "other-vehicle" --------------mapped
+    15: 2,  # "motorcycle"
+    16: 4,  # "on-rails" mapped to "other-vehicle" ---------mapped
+    18: 3,  # "truck"
+    20: 4,  # "other-vehicle"
+    30: 5,  # "person"
+    31: 6,  # "bicyclist"
+    32: 7,  # "motorcyclist"
+    40: 8,  # "road"
+    44: 9,  # "parking"
+    48: 10,  # "sidewalk"
+    49: 11,  # "other-ground"
+    50: 12,  # "building"
+    51: 13,  # "fence"
+    52: 19,  # "other-structure" mapped to "unlabeled" ------mapped
+    60: 8,  # "lane-marking" to "road" ---------------------mapped
+    70: 14,  # "vegetation"
+    71: 15,  # "trunk"
+    72: 16,  # "terrain"
+    80: 17,  # "pole"
+    81: 18,  # "traffic-sign"
+    99: 19,  # "other-object" to "unlabeled" ----------------mapped
+    252: 0,  # "moving-car" to "car" ------------------------mapped
+    253: 6,  # "moving-bicyclist" to "bicyclist" ------------mapped
+    254: 5,  # "moving-person" to "person" ------------------mapped
+    255: 7,  # "moving-motorcyclist" to "motorcyclist" ------mapped
+    256: 4,  # "moving-on-rails" mapped to "other-vehic------mapped
+    257: 4,  # "moving-bus" mapped to "other-vehicle" -------mapped
+    258: 3,  # "moving-truck" to "truck" --------------------mapped
+    259: 4  # "moving-other"-vehicle to "other-vehicle"-----mapped
+}
+
+metainfo = dict(
+    classes=class_names, seg_label_mapping=labels_map, max_label=259)
+
+input_modality = dict(use_lidar=True, use_camera=False)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/semantickitti/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type=PointSegClassMapping),
+    dict(
+        type=RandomFlip3D,
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.1, 0.1, 0.1],
+    ),
+    dict(type=Pack3DDetInputs, keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type=PointSegClassMapping),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+tta_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type=PointSegClassMapping),
+    dict(
+        type=TestTimeAug,
+        transforms=[[
+            dict(
+                type=RandomFlip3D,
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.,
+                flip_ratio_bev_vertical=0.),
+            dict(
+                type=RandomFlip3D,
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.,
+                flip_ratio_bev_vertical=1.),
+            dict(
+                type=RandomFlip3D,
+                sync_2d=False,
+                flip_ratio_bev_horizontal=1.,
+                flip_ratio_bev_vertical=0.),
+            dict(
+                type=RandomFlip3D,
+                sync_2d=False,
+                flip_ratio_bev_horizontal=1.,
+                flip_ratio_bev_vertical=1.)
+        ],
+                    [
+                        dict(
+                            type=GlobalRotScaleTrans,
+                            rot_range=[pcd_rotate_range, pcd_rotate_range],
+                            scale_ratio_range=[
+                                pcd_scale_factor, pcd_scale_factor
+                            ],
+                            translation_std=[0, 0, 0])
+                        for pcd_rotate_range in [-0.78539816, 0.0, 0.78539816]
+                        for pcd_scale_factor in [0.95, 1.0, 1.05]
+                    ], [dict(type=Pack3DDetInputs, keys=['points'])]])
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=SemanticKittiDataset,
+        data_root=data_root,
+        ann_file='semantickitti_infos_train.pkl',
+        pipeline=train_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        ignore_index=19,
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=SemanticKittiDataset,
+        data_root=data_root,
+        ann_file='semantickitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        ignore_index=19,
+        test_mode=True,
+        backend_args=backend_args))
+
+val_dataloader = test_dataloader
+
+val_evaluator = dict(type=SegMetric)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
+
+tta_model = dict(type=Seg3DTTAModel)
diff --git a/mmde/mmdet3d/configs/_base_/datasets/sunrgbd_3d.py b/mmde/mmdet3d/configs/_base_/datasets/sunrgbd_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc847b27a12cabc9f08df77cb40925fcb0aef14
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/sunrgbd_3d.py
@@ -0,0 +1,141 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.dataset_wrapper import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.sunrgbd_dataset import SUNRGBDDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (GlobalRotScaleTrans,
+                                                       PointSample,
+                                                       RandomFlip3D)
+from mmdet3d.evaluation.metrics.indoor_metric import IndoorMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+dataset_type = 'SUNRGBDDataset'
+data_root = 'data/sunrgbd/'
+class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+               'night_stand', 'bookshelf', 'bathtub')
+
+metainfo = dict(classes=class_names)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/sunrgbd/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D),
+    dict(
+        type=RandomFlip3D,
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+    ),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        shift_height=True),
+    dict(type=PointSample, num_points=20000),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type=RandomFlip3D,
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+            ),
+            dict(type=PointSample, num_points=20000)
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=4,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=RepeatDataset,
+        times=5,
+        dataset=dict(
+            type=SUNRGBDDataset,
+            data_root=data_root,
+            ann_file='sunrgbd_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=SUNRGBDDataset,
+        data_root=data_root,
+        ann_file='sunrgbd_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=SUNRGBDDataset,
+        data_root=data_root,
+        ann_file='sunrgbd_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type=IndoorMetric)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/waymoD5_3d_3class.py b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_3d_3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b6337210dcaf4f2a66d37c8d324fc34c828b439
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_3d_3class.py
@@ -0,0 +1,191 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.dataset_wrapper import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (  # noqa
+    GlobalRotScaleTrans, ObjectRangeFilter, ObjectSample, PointShuffle,
+    PointsRangeFilter, RandomFlip3D)
+from mmdet3d.datasets.waymo_dataset import WaymoDataset
+from mmdet3d.evaluation.metrics.waymo_metric import WaymoMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+data_root = 'data/waymo/kitti_format/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+metainfo = dict(classes=class_names)
+
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+    points_loader=dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    # dict(type=ObjectSample, db_sampler=db_sampler),
+    dict(
+        type=RandomFlip3D,
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D),
+            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range)
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(type=Pack3DDetInputs, keys=['points']),
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=RepeatDataset,
+        times=2,
+        dataset=dict(
+            type=WaymoDataset,
+            data_root=data_root,
+            ann_file='waymo_infos_train.pkl',
+            data_prefix=dict(
+                pts='training/velodyne', sweeps='training/velodyne'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            # load one frame every five frames
+            load_interval=5,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type=WaymoMetric,
+    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
+    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
+    data_root='./data/waymo/waymo_format',
+    backend_args=backend_args,
+    convert_kitti_format=False)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/waymoD5_3d_car.py b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_3d_car.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcc72fc06a6a27e6200c802364b06386f96f823c
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_3d_car.py
@@ -0,0 +1,188 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.dataset_wrapper import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (  # noqa
+    GlobalRotScaleTrans, ObjectRangeFilter, ObjectSample, PointShuffle,
+    PointsRangeFilter, RandomFlip3D)
+from mmdet3d.datasets.waymo_dataset import WaymoDataset
+from mmdet3d.evaluation.metrics.waymo_metric import WaymoMetric
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+class_names = ['Car']
+metainfo = dict(classes=class_names)
+
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15),
+    points_loader=dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(type=ObjectSample, db_sampler=db_sampler),
+    dict(
+        type=RandomFlip3D,
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D),
+            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range)
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(type=Pack3DDetInputs, keys=['points']),
+]
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=RepeatDataset,
+        times=2,
+        dataset=dict(
+            type=WaymoDataset,
+            data_root=data_root,
+            ann_file='waymo_infos_train.pkl',
+            data_prefix=dict(
+                pts='training/velodyne', sweeps='training/velodyne'),
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            # load one frame every five frames
+            load_interval=5,
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type=WaymoMetric,
+    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
+    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
+    data_root='./data/waymo/waymo_format',
+    convert_kitti_format=False,
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/mmdet3d/configs/_base_/datasets/waymoD5_fov_mono3d_3class.py b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_fov_mono3d_3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9109df20c7095ac20b0e7c6da892de42626bf26
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_fov_mono3d_3class.py
@@ -0,0 +1,174 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.sampler import DefaultSampler
+
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadImageFromFileMono3D)
+from mmdet3d.datasets.transforms.transforms_3d import (RandomFlip3D,
+                                                       RandomResize3D)
+from mmdet3d.datasets.waymo_dataset import WaymoDataset
+from mmdet3d.evaluation.metrics.waymo_metric import WaymoMetric
+
+# dataset settings
+# D3 in the config name means the whole dataset is divided into 3 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+input_modality = dict(use_lidar=False, use_camera=True)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    # base shape (1248, 832), scale (0.95, 1.05)
+    dict(
+        type=RandomResize3D,
+        scale=(1284, 832),
+        ratio_range=(0.95, 1.05),
+        keep_ratio=True,
+    ),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(
+        type=Pack3DDetInputs,
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+
+test_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(
+        type=RandomResize3D,
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True),
+    dict(type=Pack3DDetInputs, keys=['img']),
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(
+        type=RandomResize3D,
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True),
+    dict(type=Pack3DDetInputs, keys=['img']),
+]
+
+metainfo = dict(CLASSES=class_names)
+
+train_dataloader = dict(
+    batch_size=3,
+    num_workers=3,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='fov_image_based',
+        # load one frame every three frames
+        load_interval=5,
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='fov_image_based',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='fov_image_based',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type=WaymoMetric,
+    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
+    waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin',
+    data_root='./data/waymo/waymo_format',
+    metric='LET_mAP',
+    load_type='fov_image_based',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet3d/configs/_base_/datasets/waymoD5_mv3d_3class.py b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_mv3d_3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..38d3e7ef33abe899a8f42ec1c732b0e5009495ed
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_mv3d_3class.py
@@ -0,0 +1,177 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.sampler import DefaultSampler
+
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadMultiViewImageFromFiles)
+from mmdet3d.datasets.transforms.transforms_3d import (  # noqa
+    MultiViewWrapper, ObjectNameFilter, ObjectRangeFilter,
+    PhotoMetricDistortion3D, RandomCrop3D, RandomFlip3D, RandomResize3D)
+from mmdet3d.datasets.waymo_dataset import WaymoDataset
+from mmdet3d.evaluation.metrics.waymo_metric import WaymoMetric
+
+# dataset settings
+# D3 in the config name means the whole dataset is divided into 3 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+input_modality = dict(use_lidar=False, use_camera=True)
+point_cloud_range = [-35.0, -75.0, -2, 75.0, 75.0, 4]
+
+train_transforms = [
+    dict(type=PhotoMetricDistortion3D),
+    dict(
+        type=RandomResize3D,
+        scale=(1248, 832),
+        ratio_range=(0.95, 1.05),
+        keep_ratio=True),
+    dict(type=RandomCrop3D, crop_size=(720, 1080)),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5, flip_box3d=False),
+]
+
+train_pipeline = [
+    dict(
+        type=LoadMultiViewImageFromFiles,
+        to_float32=True,
+        backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type=MultiViewWrapper, transforms=train_transforms),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectNameFilter, classes=class_names),
+    dict(type=Pack3DDetInputs, keys=[
+        'img',
+        'gt_bboxes_3d',
+        'gt_labels_3d',
+    ]),
+]
+test_transforms = [
+    dict(
+        type=RandomResize3D,
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True)
+]
+test_pipeline = [
+    dict(
+        type=LoadMultiViewImageFromFiles,
+        to_float32=True,
+        backend_args=backend_args),
+    dict(type=MultiViewWrapper, transforms=test_transforms),
+    dict(type=Pack3DDetInputs, keys=['img'])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type=LoadMultiViewImageFromFiles,
+        to_float32=True,
+        backend_args=backend_args),
+    dict(type=MultiViewWrapper, transforms=test_transforms),
+    dict(type=Pack3DDetInputs, keys=['img'])
+]
+metainfo = dict(classes=class_names)
+
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        box_type_3d='Lidar',
+        load_interval=5,
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        ann_file='waymo_infos_val.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='Lidar',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        ann_file='waymo_infos_val.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='Lidar',
+        backend_args=backend_args))
+val_evaluator = dict(
+    type=WaymoMetric,
+    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
+    waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
+    data_root='./data/waymo/waymo_format',
+    metric='LET_mAP',
+    backend_args=backend_args)
+
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet3d/configs/_base_/datasets/waymoD5_mv_mono3d_3class.py b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_mv_mono3d_3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..025a1511dbc1a25fa2d2a1bbe7391ef775d287e4
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/datasets/waymoD5_mv_mono3d_3class.py
@@ -0,0 +1,174 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset.sampler import DefaultSampler
+
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadImageFromFileMono3D)
+from mmdet3d.datasets.transforms.transforms_3d import (RandomFlip3D,
+                                                       RandomResize3D)
+from mmdet3d.datasets.waymo_dataset import WaymoDataset
+from mmdet3d.evaluation.metrics.waymo_metric import WaymoMetric
+
+# dataset settings
+# D3 in the config name means the whole dataset is divided into 3 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+input_modality = dict(use_lidar=False, use_camera=True)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    # base shape (1248, 832), scale (0.95, 1.05)
+    dict(
+        type=RandomResize3D,
+        scale=(1284, 832),
+        ratio_range=(0.95, 1.05),
+        keep_ratio=True,
+    ),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(
+        type=Pack3DDetInputs,
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+
+test_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(
+        type=RandomResize3D,
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True),
+    dict(type=Pack3DDetInputs, keys=['img']),
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(
+        type=RandomResize3D,
+        scale=(1248, 832),
+        ratio_range=(1., 1.),
+        keep_ratio=True),
+    dict(type=Pack3DDetInputs, keys=['img']),
+]
+
+metainfo = dict(classes=class_names)
+
+train_dataloader = dict(
+    batch_size=3,
+    num_workers=3,
+    persistent_workers=True,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='mv_image_based',
+        # load one frame every three frames
+        load_interval=5,
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='mv_image_based',
+        backend_args=backend_args))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=WaymoDataset,
+        data_root=data_root,
+        data_prefix=dict(
+            pts='training/velodyne',
+            CAM_FRONT='training/image_0',
+            CAM_FRONT_LEFT='training/image_1',
+            CAM_FRONT_RIGHT='training/image_2',
+            CAM_SIDE_LEFT='training/image_3',
+            CAM_SIDE_RIGHT='training/image_4'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=eval_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='Camera',
+        load_type='mv_image_based',
+        backend_args=backend_args))
+
+val_evaluator = dict(
+    type=WaymoMetric,
+    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
+    waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
+    data_root='./data/waymo/waymo_format',
+    metric='LET_mAP',
+    load_type='mv_image_based',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
diff --git a/mmde/mmdet3d/configs/_base_/default_runtime.py b/mmde/mmdet3d/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..c13d0e11ed709ca384ffb1a94c98f631cd95901e
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/default_runtime.py
@@ -0,0 +1,33 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.hooks.checkpoint_hook import CheckpointHook
+from mmengine.hooks.iter_timer_hook import IterTimerHook
+from mmengine.hooks.logger_hook import LoggerHook
+from mmengine.hooks.param_scheduler_hook import ParamSchedulerHook
+from mmengine.hooks.sampler_seed_hook import DistSamplerSeedHook
+from mmengine.runner.log_processor import LogProcessor
+
+from mmdet3d.engine.hooks.visualization_hook import Det3DVisualizationHook
+
+default_scope = 'mmdet3d'
+
+default_hooks = dict(
+    timer=dict(type=IterTimerHook),
+    logger=dict(type=LoggerHook, interval=50),
+    param_scheduler=dict(type=ParamSchedulerHook),
+    checkpoint=dict(type=CheckpointHook, interval=-1),
+    sampler_seed=dict(type=DistSamplerSeedHook),
+    visualization=dict(type=Det3DVisualizationHook))
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+
+log_processor = dict(type=LogProcessor, window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+
+# TODO: support auto scaling lr
diff --git a/mmde/mmdet3d/configs/_base_/models/centerpoint_pillar02_second_secfpn_nus.py b/mmde/mmdet3d/configs/_base_/models/centerpoint_pillar02_second_secfpn_nus.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc1ebaaead49ef1ea6e284613d20dad516a54519
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/models/centerpoint_pillar02_second_secfpn_nus.py
@@ -0,0 +1,103 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from torch.nn.modules.conv import Conv2d
+
+from mmdet3d.models.backbones.second import SECOND
+from mmdet3d.models.data_preprocessors.data_preprocessor import \
+    Det3DDataPreprocessor
+from mmdet3d.models.dense_heads.centerpoint_head import (CenterHead,
+                                                         SeparateHead)
+from mmdet3d.models.detectors.centerpoint import CenterPoint
+from mmdet3d.models.middle_encoders.pillar_scatter import PointPillarsScatter
+from mmdet3d.models.necks.second_fpn import SECONDFPN
+from mmdet3d.models.task_modules.coders.centerpoint_bbox_coders import \
+    CenterPointBBoxCoder
+from mmdet3d.models.voxel_encoders.pillar_encoder import PillarFeatureNet
+
+voxel_size = [0.2, 0.2, 8]
+model = dict(
+    type=CenterPoint,
+    data_preprocessor=dict(
+        type=Det3DDataPreprocessor,
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=20,
+            voxel_size=voxel_size,
+            max_voxels=(30000, 40000))),
+    pts_voxel_encoder=dict(
+        type=PillarFeatureNet,
+        in_channels=5,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=(0.2, 0.2, 8),
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+        legacy=False),
+    pts_middle_encoder=dict(
+        type=PointPillarsScatter, in_channels=64, output_shape=(512, 512)),
+    pts_backbone=dict(
+        type=SECOND,
+        in_channels=64,
+        out_channels=[64, 128, 256],
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        conv_cfg=dict(type=Conv2d, bias=False)),
+    pts_neck=dict(
+        type=SECONDFPN,
+        in_channels=[64, 128, 256],
+        out_channels=[128, 128, 128],
+        upsample_strides=[0.5, 1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        upsample_cfg=dict(type='deconv', bias=False),
+        use_conv_for_no_stride=True),
+    pts_bbox_head=dict(
+        type=CenterHead,
+        in_channels=sum([128, 128, 128]),
+        tasks=[
+            dict(num_class=1, class_names=['car']),
+            dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+            dict(num_class=2, class_names=['bus', 'trailer']),
+            dict(num_class=1, class_names=['barrier']),
+            dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+            dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+        ],
+        common_heads=dict(
+            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+        share_conv_channel=64,
+        bbox_coder=dict(
+            type=CenterPointBBoxCoder,
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_num=500,
+            score_threshold=0.1,
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            code_size=9),
+        separate_head=dict(type=SeparateHead, init_bias=-2.19, final_kernel=3),
+        loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(
+            type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
+        norm_bbox=True),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            grid_size=[512, 512, 1],
+            voxel_size=voxel_size,
+            out_size_factor=4,
+            dense_reg=1,
+            gaussian_overlap=0.1,
+            max_objs=500,
+            min_radius=2,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+    test_cfg=dict(
+        pts=dict(
+            post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_per_img=500,
+            max_pool_nms=False,
+            min_radius=[4, 12, 10, 1, 0.85, 0.175],
+            score_threshold=0.1,
+            pc_range=[-51.2, -51.2],
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            nms_type='rotate',
+            pre_max_size=1000,
+            post_max_size=83,
+            nms_thr=0.2)))
diff --git a/mmde/mmdet3d/configs/_base_/models/centerpoint_voxel01_second_secfpn_nus.py b/mmde/mmdet3d/configs/_base_/models/centerpoint_voxel01_second_secfpn_nus.py
new file mode 100644
index 0000000000000000000000000000000000000000..af85ec979d0cc120268a9c1d3addf130c602af0a
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/models/centerpoint_voxel01_second_secfpn_nus.py
@@ -0,0 +1,103 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from torch.nn.modules.conv import Conv2d
+
+from mmdet3d.models.backbones.second import SECOND
+from mmdet3d.models.data_preprocessors.data_preprocessor import \
+    Det3DDataPreprocessor
+from mmdet3d.models.dense_heads.centerpoint_head import (CenterHead,
+                                                         SeparateHead)
+from mmdet3d.models.detectors.centerpoint import CenterPoint
+from mmdet3d.models.middle_encoders.sparse_encoder import SparseEncoder
+from mmdet3d.models.necks.second_fpn import SECONDFPN
+from mmdet3d.models.task_modules.coders.centerpoint_bbox_coders import \
+    CenterPointBBoxCoder
+from mmdet3d.models.voxel_encoders.voxel_encoder import HardSimpleVFE
+
+voxel_size = [0.1, 0.1, 0.2]
+model = dict(
+    type=CenterPoint,
+    data_preprocessor=dict(
+        type=Det3DDataPreprocessor,
+        voxel=True,
+        voxel_layer=dict(
+            max_num_points=10,
+            voxel_size=voxel_size,
+            max_voxels=(90000, 120000))),
+    pts_voxel_encoder=dict(type=HardSimpleVFE, num_features=5),
+    pts_middle_encoder=dict(
+        type=SparseEncoder,
+        in_channels=5,
+        sparse_shape=[41, 1024, 1024],
+        output_channels=128,
+        order=('conv', 'norm', 'act'),
+        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+                                                                      128)),
+        encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
+        block_type='basicblock'),
+    pts_backbone=dict(
+        type=SECOND,
+        in_channels=256,
+        out_channels=[128, 256],
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        conv_cfg=dict(type=Conv2d, bias=False)),
+    pts_neck=dict(
+        type=SECONDFPN,
+        in_channels=[128, 256],
+        out_channels=[256, 256],
+        upsample_strides=[1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        upsample_cfg=dict(type='deconv', bias=False),
+        use_conv_for_no_stride=True),
+    pts_bbox_head=dict(
+        type=CenterHead,
+        in_channels=sum([256, 256]),
+        tasks=[
+            dict(num_class=1, class_names=['car']),
+            dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+            dict(num_class=2, class_names=['bus', 'trailer']),
+            dict(num_class=1, class_names=['barrier']),
+            dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+            dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+        ],
+        common_heads=dict(
+            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+        share_conv_channel=64,
+        bbox_coder=dict(
+            type=CenterPointBBoxCoder,
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_num=500,
+            score_threshold=0.1,
+            out_size_factor=8,
+            voxel_size=voxel_size[:2],
+            code_size=9),
+        separate_head=dict(type=SeparateHead, init_bias=-2.19, final_kernel=3),
+        loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(
+            type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
+        norm_bbox=True),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            grid_size=[1024, 1024, 40],
+            voxel_size=voxel_size,
+            out_size_factor=8,
+            dense_reg=1,
+            gaussian_overlap=0.1,
+            max_objs=500,
+            min_radius=2,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+    test_cfg=dict(
+        pts=dict(
+            post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_per_img=500,
+            max_pool_nms=False,
+            min_radius=[4, 12, 10, 1, 0.85, 0.175],
+            score_threshold=0.1,
+            out_size_factor=8,
+            voxel_size=voxel_size[:2],
+            nms_type='rotate',
+            pre_max_size=1000,
+            post_max_size=83,
+            nms_thr=0.2)))
diff --git a/mmde/mmdet3d/configs/_base_/models/cylinder3d.py b/mmde/mmdet3d/configs/_base_/models/cylinder3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..381921786dc0def8c3c046e8e498f111163ac216
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/models/cylinder3d.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet3d.models import Cylinder3D
+from mmdet3d.models.backbones import Asymm3DSpconv
+from mmdet3d.models.data_preprocessors import Det3DDataPreprocessor
+from mmdet3d.models.decode_heads.cylinder3d_head import Cylinder3DHead
+from mmdet3d.models.losses import LovaszLoss
+from mmdet3d.models.voxel_encoders import SegVFE
+
+grid_shape = [480, 360, 32]
+model = dict(
+    type=Cylinder3D,
+    data_preprocessor=dict(
+        type=Det3DDataPreprocessor,
+        voxel=True,
+        voxel_type='cylindrical',
+        voxel_layer=dict(
+            grid_shape=grid_shape,
+            point_cloud_range=[0, -3.14159265359, -4, 50, 3.14159265359, 2],
+            max_num_points=-1,
+            max_voxels=-1,
+        ),
+    ),
+    voxel_encoder=dict(
+        type=SegVFE,
+        feat_channels=[64, 128, 256, 256],
+        in_channels=6,
+        with_voxel_center=True,
+        feat_compression=16,
+        return_point_feats=False),
+    backbone=dict(
+        type=Asymm3DSpconv,
+        grid_size=grid_shape,
+        input_channels=16,
+        base_channels=32,
+        norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.1)),
+    decode_head=dict(
+        type=Cylinder3DHead,
+        channels=128,
+        num_classes=20,
+        loss_ce=dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=False,
+            class_weight=None,
+            loss_weight=1.0),
+        loss_lovasz=dict(type=LovaszLoss, loss_weight=1.0, reduction='none'),
+    ),
+    train_cfg=None,
+    test_cfg=dict(mode='whole'),
+)
diff --git a/mmde/mmdet3d/configs/_base_/models/fcos3d.py b/mmde/mmdet3d/configs/_base_/models/fcos3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3a3d6a8f9bc30658a1f76a2a4bc72e2837f5828
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/models/fcos3d.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet3d.models.data_preprocessors.data_preprocessor import \
+    Det3DDataPreprocessor
+from mmdet3d.models.dense_heads.fcos_mono3d_head import FCOSMono3DHead
+from mmdet3d.models.detectors.fcos_mono3d import FCOSMono3D
+from mmdet3d.models.task_modules.coders.fcos3d_bbox_coder import \
+    FCOS3DBBoxCoder
+
+# model settings
+model = dict(
+    type=FCOSMono3D,
+    data_preprocessor=dict(
+        type=Det3DDataPreprocessor,
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='open-mmlab://detectron2/resnet101_caffe')),
+    neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type=FCOSMono3DHead,
+        num_classes=10,
+        in_channels=256,
+        stacked_convs=2,
+        feat_channels=256,
+        use_direction_classifier=True,
+        diff_rad_by_sin=True,
+        pred_attrs=True,
+        pred_velo=True,
+        dir_offset=0.7854,  # pi/4
+        dir_limit_offset=0,
+        strides=[8, 16, 32, 64, 128],
+        group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo
+        cls_branch=(256, ),
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            ()  # velo
+        ),
+        dir_branch=(256, ),
+        attr_branch=(256, ),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_attr=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_centerness=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        bbox_coder=dict(type=FCOS3DBBoxCoder, code_size=9),
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        center_sampling=True,
+        conv_bias=True,
+        dcn_on_last_conv=True),
+    train_cfg=dict(
+        allowed_border=0,
+        code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05],
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_pre=1000,
+        nms_thr=0.8,
+        score_thr=0.05,
+        min_bbox_size=0,
+        max_per_img=200))
diff --git a/mmde/mmdet3d/configs/_base_/models/minkunet.py b/mmde/mmdet3d/configs/_base_/models/minkunet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac862ed224248679e8e7f944fb85cfebb8cda344
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/models/minkunet.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet3d.models.backbones.minkunet_backbone import MinkUNetBackbone
+from mmdet3d.models.data_preprocessors.data_preprocessor import \
+    Det3DDataPreprocessor
+from mmdet3d.models.decode_heads.minkunet_head import MinkUNetHead
+from mmdet3d.models.segmentors.minkunet import MinkUNet
+
+model = dict(
+    type=MinkUNet,
+    data_preprocessor=dict(
+        type=Det3DDataPreprocessor,
+        voxel=True,
+        voxel_type='minkunet',
+        batch_first=False,
+        max_voxels=80000,
+        voxel_layer=dict(
+            max_num_points=-1,
+            point_cloud_range=[-100, -100, -20, 100, 100, 20],
+            voxel_size=[0.05, 0.05, 0.05],
+            max_voxels=(-1, -1))),
+    backbone=dict(
+        type=MinkUNetBackbone,
+        in_channels=4,
+        num_stages=4,
+        base_channels=32,
+        encoder_channels=[32, 64, 128, 256],
+        encoder_blocks=[2, 2, 2, 2],
+        decoder_channels=[256, 128, 96, 96],
+        decoder_blocks=[2, 2, 2, 2],
+        block_type='basic',
+        sparseconv_backend='torchsparse'),
+    decode_head=dict(
+        type=MinkUNetHead,
+        channels=96,
+        num_classes=19,
+        dropout_ratio=0,
+        loss_decode=dict(type='mmdet.CrossEntropyLoss', avg_non_ignore=True),
+        ignore_index=19),
+    train_cfg=dict(),
+    test_cfg=dict())
diff --git a/mmde/mmdet3d/configs/_base_/models/pgd.py b/mmde/mmdet3d/configs/_base_/models/pgd.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfaaf383918b6883e6ed6e15e916c7e86d6fb02a
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/models/pgd.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+if '_base_':
+    from .fcos3d import *
+
+from mmdet3d.models.dense_heads.pgd_head import PGDHead
+from mmdet3d.models.task_modules.coders.pgd_bbox_coder import PGDBBoxCoder
+
+# model settings
+model.merge(
+    dict(
+        bbox_head=dict(
+            _delete_=True,
+            type=PGDHead,
+            num_classes=10,
+            in_channels=256,
+            stacked_convs=2,
+            feat_channels=256,
+            use_direction_classifier=True,
+            diff_rad_by_sin=True,
+            pred_attrs=True,
+            pred_velo=True,
+            pred_bbox2d=True,
+            pred_keypoints=False,
+            dir_offset=0.7854,  # pi/4
+            strides=[8, 16, 32, 64, 128],
+            group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo
+            cls_branch=(256, ),
+            reg_branch=(
+                (256, ),  # offset
+                (256, ),  # depth
+                (256, ),  # size
+                (256, ),  # rot
+                ()  # velo
+            ),
+            dir_branch=(256, ),
+            attr_branch=(256, ),
+            loss_cls=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+            loss_dir=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            loss_attr=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            loss_centerness=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                loss_weight=1.0),
+            norm_on_bbox=True,
+            centerness_on_reg=True,
+            center_sampling=True,
+            conv_bias=True,
+            dcn_on_last_conv=True,
+            use_depth_classifier=True,
+            depth_branch=(256, ),
+            depth_range=(0, 50),
+            depth_unit=10,
+            division='uniform',
+            depth_bins=6,
+            bbox_coder=dict(type=PGDBBoxCoder, code_size=9)),
+        test_cfg=dict(
+            nms_pre=1000, nms_thr=0.8, score_thr=0.01, max_per_img=200)))
diff --git a/mmde/mmdet3d/configs/_base_/models/votenet.py b/mmde/mmdet3d/configs/_base_/models/votenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..77b56aa3743aef3fa9baccddd91aaee4e98cf158
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/models/votenet.py
@@ -0,0 +1,83 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from torch.nn.modules.conv import Conv1d
+
+from mmdet3d.models.backbones.pointnet2_sa_ssg import PointNet2SASSG
+from mmdet3d.models.data_preprocessors.data_preprocessor import \
+    Det3DDataPreprocessor
+from mmdet3d.models.dense_heads.vote_head import VoteHead
+from mmdet3d.models.detectors.votenet import VoteNet
+from mmdet3d.models.losses.chamfer_distance import ChamferDistance
+
+model = dict(
+    type=VoteNet,
+    data_preprocessor=dict(type=Det3DDataPreprocessor),
+    backbone=dict(
+        type=PointNet2SASSG,
+        in_channels=4,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                     (128, 128, 256)),
+        fp_channels=((256, 256), (256, 256)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    bbox_head=dict(
+        type=VoteHead,
+        vote_module_cfg=dict(
+            in_channels=256,
+            vote_per_seed=1,
+            gt_per_seed=3,
+            conv_channels=(256, 256),
+            conv_cfg=dict(type=Conv1d),
+            norm_cfg=dict(type='BN1d'),
+            norm_feats=True,
+            vote_loss=dict(
+                type=ChamferDistance,
+                mode='l1',
+                reduction='none',
+                loss_dst_weight=10.0)),
+        vote_aggregation_cfg=dict(
+            type='PointSAModule',
+            num_point=256,
+            radius=0.3,
+            num_sample=16,
+            mlp_channels=[256, 128, 128, 128],
+            use_xyz=True,
+            normalize_xyz=True),
+        pred_layer_cfg=dict(
+            in_channels=128, shared_conv_channels=(128, 128), bias=True),
+        objectness_loss=dict(
+            type='mmdet.CrossEntropyLoss',
+            class_weight=[0.2, 0.8],
+            reduction='sum',
+            loss_weight=5.0),
+        center_loss=dict(
+            type=ChamferDistance,
+            mode='l2',
+            reduction='sum',
+            loss_src_weight=10.0,
+            loss_dst_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum',
+            loss_weight=10.0 / 3.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote'),
+    test_cfg=dict(
+        sample_mode='seed',
+        nms_thr=0.25,
+        score_thr=0.05,
+        per_class_proposal=True))
diff --git a/mmde/mmdet3d/configs/_base_/schedules/cosine.py b/mmde/mmdet3d/configs/_base_/schedules/cosine.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cd3ce05a2803d91d2b4925f234edcfabe85c1cc
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/cosine.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR, LinearLR
+from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim.adamw import AdamW
+
+# This schedule is mainly used by models with dynamic voxelization
+# optimizer
+lr = 0.003  # max learning rate
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=AdamW, lr=lr, weight_decay=0.001, betas=(0.95, 0.99)),
+    clip_grad=dict(max_norm=10, norm_type=2),
+)
+
+param_scheduler = [
+    dict(type=LinearLR, start_factor=0.1, by_epoch=False, begin=0, end=1000),
+    dict(
+        type=CosineAnnealingLR,
+        begin=0,
+        T_max=40,
+        end=40,
+        by_epoch=True,
+        eta_min=1e-5)
+]
+# training schedule for 1x
+train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=40, val_interval=1)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet3d/configs/_base_/schedules/cyclic_20e.py b/mmde/mmdet3d/configs/_base_/schedules/cyclic_20e.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae185e4c3199c4c9a26b352e0f6c1f33357ac8e8
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/cyclic_20e.py
@@ -0,0 +1,71 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR
+from mmengine.optim.scheduler.momentum_scheduler import CosineAnnealingMomentum
+from torch.optim.adamw import AdamW
+
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 20. Please change the interval accordingly if you do not
+# use a default schedule.
+# optimizer
+lr = 1e-4
+# This schedule is mainly used by models on nuScenes dataset
+# max_norm=10 is better for SECOND
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=AdamW, lr=lr, weight_decay=0.01),
+    clip_grad=dict(max_norm=35, norm_type=2))
+# learning rate
+param_scheduler = [
+    # learning rate scheduler
+    # During the first 8 epochs, learning rate increases from 0 to lr * 10
+    # during the next 12 epochs, learning rate decreases from lr * 10 to
+    # lr * 1e-4
+    dict(
+        type=CosineAnnealingLR,
+        T_max=8,
+        eta_min=lr * 10,
+        begin=0,
+        end=8,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type=CosineAnnealingLR,
+        T_max=12,
+        eta_min=lr * 1e-4,
+        begin=8,
+        end=20,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type=CosineAnnealingMomentum,
+        T_max=8,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=8,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type=CosineAnnealingMomentum,
+        T_max=12,
+        eta_min=1,
+        begin=8,
+        end=20,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=20)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/mmdet3d/configs/_base_/schedules/cyclic_40e.py b/mmde/mmdet3d/configs/_base_/schedules/cyclic_40e.py
new file mode 100644
index 0000000000000000000000000000000000000000..b67efae458fa2f53a7d3a7bf806fd9b8eef0021d
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/cyclic_40e.py
@@ -0,0 +1,73 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR
+from mmengine.optim.scheduler.momentum_scheduler import CosineAnnealingMomentum
+from torch.optim.adamw import AdamW
+
+# The schedule is usually used by models trained on KITTI dataset
+# The learning rate set in the cyclic schedule is the initial learning rate
+# rather than the max learning rate. Since the target_ratio is (10, 1e-4),
+# the learning rate will change from 0.0018 to 0.018, than go to 0.0018*1e-4
+lr = 0.0018
+# The optimizer follows the setting in SECOND.Pytorch, but here we use
+# the official AdamW optimizer implemented by PyTorch.
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=AdamW, lr=lr, betas=(0.95, 0.99), weight_decay=0.01),
+    clip_grad=dict(max_norm=10, norm_type=2))
+# learning rate
+param_scheduler = [
+    # learning rate scheduler
+    # During the first 16 epochs, learning rate increases from 0 to lr * 10
+    # during the next 24 epochs, learning rate decreases from lr * 10 to
+    # lr * 1e-4
+    dict(
+        type=CosineAnnealingLR,
+        T_max=16,
+        eta_min=lr * 10,
+        begin=0,
+        end=16,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type=CosineAnnealingLR,
+        T_max=24,
+        eta_min=lr * 1e-4,
+        begin=16,
+        end=40,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 16 epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next 24 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type=CosineAnnealingMomentum,
+        T_max=16,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=16,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type=CosineAnnealingMomentum,
+        T_max=24,
+        eta_min=1,
+        begin=16,
+        end=40,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
+
+# Runtime settings，training schedule for 40e
+# Although the max_epochs is 40, this schedule is usually used we
+# RepeatDataset with repeat ratio N, thus the actual max epoch
+# number could be Nx40
+train_cfg = dict(by_epoch=True, max_epochs=40, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (6 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=48)
diff --git a/mmde/mmdet3d/configs/_base_/schedules/mmdet_schedule_1x.py b/mmde/mmdet3d/configs/_base_/schedules/mmdet_schedule_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..47d1fa6a4852c40f3f9962a47ec90e365671c61c
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/mmdet_schedule_1x.py
@@ -0,0 +1,33 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR
+from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim.sgd import SGD
+
+# training schedule for 1x
+train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=12, val_interval=1)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+# learning rate
+param_scheduler = [
+    dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/mmde/mmdet3d/configs/_base_/schedules/schedule_2x.py b/mmde/mmdet3d/configs/_base_/schedules/schedule_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..576e1303877c1079be6dd8687cb3d08829006a15
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/schedule_2x.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR
+from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim.adamw import AdamW
+
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+lr = 0.001
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=AdamW, lr=lr, weight_decay=0.01),
+    # max_norm=10 is better for SECOND
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# training schedule for 2x
+train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=24, val_interval=24)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type=LinearLR,
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[20, 23],
+        gamma=0.1)
+]
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/mmdet3d/configs/_base_/schedules/schedule_3x.py b/mmde/mmdet3d/configs/_base_/schedules/schedule_3x.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec0724fb617079f508f969e1ddf8fb544fe6246f
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/schedule_3x.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import MultiStepLR
+from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim.adamw import AdamW
+
+# optimizer
+# This schedule is mainly used by models on indoor dataset,
+# e.g., VoteNet on SUNRGBD and ScanNet
+lr = 0.008  # max learning rate
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=AdamW, lr=lr, weight_decay=0.01),
+    clip_grad=dict(max_norm=10, norm_type=2),
+)
+
+# training schedule for 3x
+train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=36, val_interval=1)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+# learning rate
+param_scheduler = [
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[24, 32],
+        gamma=0.1)
+]
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (4 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_100e.py b/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_100e.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7ab6ba25ade0ab436f6276bf7f3d28f0d970a61
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_100e.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR
+from torch.optim.sgd import SGD
+
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=0.001),
+    clip_grad=None)
+
+param_scheduler = [
+    dict(
+        type=CosineAnnealingLR,
+        T_max=100,
+        eta_min=1e-5,
+        by_epoch=True,
+        begin=0,
+        end=100)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (4 GPUs) x (32 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_150e.py b/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_150e.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2c8f29b986aabadb70007e3525c0a443c7a4429
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_150e.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR
+from torch.optim.sgd import SGD
+
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=SGD, lr=0.2, momentum=0.9, weight_decay=0.0001),
+    clip_grad=None)
+
+param_scheduler = [
+    dict(
+        type=CosineAnnealingLR,
+        T_max=150,
+        eta_min=0.002,
+        by_epoch=True,
+        begin=0,
+        end=150)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=150, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=64)
diff --git a/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_200e.py b/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_200e.py
new file mode 100644
index 0000000000000000000000000000000000000000..689586f74ffc1339f7034e4304223dadd57b0edd
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_200e.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR
+from torch.optim.adam import Adam
+
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=Adam, lr=0.001, weight_decay=0.01),
+    clip_grad=None)
+
+param_scheduler = [
+    dict(
+        type=CosineAnnealingLR,
+        T_max=200,
+        eta_min=1e-5,
+        by_epoch=True,
+        begin=0,
+        end=200)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=200, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (2 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_50e.py b/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_50e.py
new file mode 100644
index 0000000000000000000000000000000000000000..1054402f2e9abd01d85bf833ebad02e633a25f55
--- /dev/null
+++ b/mmde/mmdet3d/configs/_base_/schedules/seg_cosine_50e.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR
+from torch.optim.adam import Adam
+
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=Adam, lr=0.001, weight_decay=0.001),
+    clip_grad=None)
+
+param_scheduler = [
+    dict(
+        type=CosineAnnealingLR,
+        T_max=50,
+        eta_min=1e-5,
+        by_epoch=True,
+        begin=0,
+        end=50)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=50, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (2 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
diff --git a/mmde/mmdet3d/configs/centerpoint/centerpoint_pillar02_second_secfpn_8xb4_cyclic_20e_nus_3d.py b/mmde/mmdet3d/configs/centerpoint/centerpoint_pillar02_second_secfpn_8xb4_cyclic_20e_nus_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ca53f0130ab6ae95209249f49289a92dd239b45
--- /dev/null
+++ b/mmde/mmdet3d/configs/centerpoint/centerpoint_pillar02_second_secfpn_8xb4_cyclic_20e_nus_3d.py
@@ -0,0 +1,181 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.config import read_base
+
+with read_base():
+    from .._base_.datasets.nus_3d import *
+    from .._base_.models.centerpoint_pillar02_second_secfpn_nus import *
+    from .._base_.schedules.cyclic_20e import *
+    from .._base_.default_runtime import *
+
+from mmengine.dataset.sampler import DefaultSampler
+
+from mmdet3d.datasets.dataset_wrappers import CBGSDataset
+from mmdet3d.datasets.nuscenes_dataset import NuScenesDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 LoadPointsFromMultiSweeps)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (  # noqa
+    GlobalRotScaleTrans, ObjectNameFilter, ObjectRangeFilter, ObjectSample,
+    PointShuffle, PointsRangeFilter, RandomFlip3D)
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_prefix.update(
+    dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP'))
+model.update(
+    dict(
+        data_preprocessor=dict(
+            voxel_layer=dict(point_cloud_range=point_cloud_range)),
+        pts_voxel_encoder=dict(point_cloud_range=point_cloud_range),
+        pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])),
+        # model training and testing settings
+        train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)),
+        test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))))
+
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(
+            car=5,
+            truck=5,
+            bus=5,
+            trailer=5,
+            construction_vehicle=5,
+            traffic_cone=5,
+            barrier=5,
+            motorcycle=5,
+            bicycle=5,
+            pedestrian=5)),
+    classes=class_names,
+    sample_groups=dict(
+        car=2,
+        truck=3,
+        construction_vehicle=7,
+        bus=4,
+        trailer=6,
+        barrier=2,
+        motorcycle=6,
+        bicycle=6,
+        pedestrian=2,
+        traffic_cone=2),
+    points_loader=dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(type=ObjectSample, db_sampler=db_sampler),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(
+        type=RandomFlip3D,
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectNameFilter, classes=class_names),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D)
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+
+train_dataloader.merge(
+    dict(
+        _delete_=True,
+        batch_size=4,
+        num_workers=4,
+        persistent_workers=True,
+        sampler=dict(type=DefaultSampler, shuffle=True),
+        dataset=dict(
+            type=CBGSDataset,
+            dataset=dict(
+                type=NuScenesDataset,
+                data_root=data_root,
+                ann_file='nuscenes_infos_train.pkl',
+                pipeline=train_pipeline,
+                metainfo=dict(classes=class_names),
+                test_mode=False,
+                data_prefix=data_prefix,
+                use_valid_flag=True,
+                # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+                # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+                box_type_3d='LiDAR',
+                backend_args=backend_args))))
+test_dataloader.update(
+    dict(
+        dataset=dict(
+            pipeline=test_pipeline, metainfo=dict(classes=class_names))))
+val_dataloader.update(
+    dict(
+        dataset=dict(
+            pipeline=test_pipeline, metainfo=dict(classes=class_names))))
+
+train_cfg.update(dict(val_interval=20))
diff --git a/mmde/mmdet3d/configs/centerpoint/centerpoint_voxel01_second_secfpn_8xb4_cyclic_20e_nus_3d.py b/mmde/mmdet3d/configs/centerpoint/centerpoint_voxel01_second_secfpn_8xb4_cyclic_20e_nus_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c3a3ef64170028c8791addec15bab723c0b6186
--- /dev/null
+++ b/mmde/mmdet3d/configs/centerpoint/centerpoint_voxel01_second_secfpn_8xb4_cyclic_20e_nus_3d.py
@@ -0,0 +1,181 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine import read_base
+
+with read_base():
+    from .._base_.datasets.nus_3d import *
+    from .._base_.models.centerpoint_voxel01_second_secfpn_nus import *
+    from .._base_.schedules.cyclic_20e import *
+    from .._base_.default_runtime import *
+
+from mmengine.dataset.sampler import DefaultSampler
+
+from mmdet3d.datasets.dataset_wrappers import CBGSDataset
+from mmdet3d.datasets.nuscenes_dataset import NuScenesDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 LoadPointsFromMultiSweeps)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (  # noqa
+    GlobalRotScaleTrans, ObjectNameFilter, ObjectRangeFilter, ObjectSample,
+    PointShuffle, PointsRangeFilter, RandomFlip3D)
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# Using calibration info convert the Lidar-coordinate point cloud range to the
+# ego-coordinate point cloud range could bring a little promotion in nuScenes.
+# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_prefix.update(
+    dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP'))
+model.update(
+    dict(
+        data_preprocessor=dict(
+            voxel_layer=dict(point_cloud_range=point_cloud_range)),
+        pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])),
+        # model training and testing settings
+        train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)),
+        test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))))
+
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(
+            car=5,
+            truck=5,
+            bus=5,
+            trailer=5,
+            construction_vehicle=5,
+            traffic_cone=5,
+            barrier=5,
+            motorcycle=5,
+            bicycle=5,
+            pedestrian=5)),
+    classes=class_names,
+    sample_groups=dict(
+        car=2,
+        truck=3,
+        construction_vehicle=7,
+        bus=4,
+        trailer=6,
+        barrier=2,
+        motorcycle=6,
+        bicycle=6,
+        pedestrian=2,
+        traffic_cone=2),
+    points_loader=dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(type=ObjectSample, db_sampler=db_sampler),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(
+        type=RandomFlip3D,
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectNameFilter, classes=class_names),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs, keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type=LoadPointsFromMultiSweeps,
+        sweeps_num=9,
+        use_dim=[0, 1, 2, 3, 4],
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D),
+            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range)
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points'])
+]
+
+train_dataloader.merge(
+    dict(
+        _delete_=True,
+        batch_size=4,
+        num_workers=4,
+        persistent_workers=True,
+        sampler=dict(type=DefaultSampler, shuffle=True),
+        dataset=dict(
+            type=CBGSDataset,
+            dataset=dict(
+                type=NuScenesDataset,
+                data_root=data_root,
+                ann_file='nuscenes_infos_train.pkl',
+                pipeline=train_pipeline,
+                metainfo=dict(classes=class_names),
+                test_mode=False,
+                data_prefix=data_prefix,
+                use_valid_flag=True,
+                # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+                # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+                box_type_3d='LiDAR',
+                backend_args=backend_args))))
+test_dataloader.update(
+    dict(
+        dataset=dict(
+            pipeline=test_pipeline, metainfo=dict(classes=class_names))))
+val_dataloader.update(
+    dict(
+        dataset=dict(
+            pipeline=test_pipeline, metainfo=dict(classes=class_names))))
+
+train_cfg.update(dict(val_interval=20))
diff --git a/mmde/mmdet3d/configs/cylinder3d/cylinder3d_4xb4-3x_semantickitti.py b/mmde/mmdet3d/configs/cylinder3d/cylinder3d_4xb4-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..11e18eee753132fae3990ec9ead3d1724f9332c3
--- /dev/null
+++ b/mmde/mmdet3d/configs/cylinder3d/cylinder3d_4xb4-3x_semantickitti.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine import read_base
+
+with read_base():
+    from .._base_.datasets.semantickitti import *
+    from .._base_.models.cylinder3d import *
+    from .._base_.default_runtime import *
+
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR
+from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim import AdamW
+
+# optimizer
+lr = 0.001
+optim_wrapper = dict(
+    type=OptimWrapper, optimizer=dict(type=AdamW, lr=lr, weight_decay=0.01))
+
+train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=36, val_interval=1)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+# learning rate
+param_scheduler = [
+    dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=1000),
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[30],
+        gamma=0.1)
+]
+
+train_dataloader.update(dict(batch_size=4, ))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
+# auto_scale_lr = dict(enable=False, base_batch_size=32)
+
+default_hooks.update(dict(checkpoint=dict(type=CheckpointHook, interval=5)))
diff --git a/mmde/mmdet3d/configs/cylinder3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py b/mmde/mmdet3d/configs/cylinder3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a44d4aa4e876a3315b4618788d50b956bd8f939
--- /dev/null
+++ b/mmde/mmdet3d/configs/cylinder3d/cylinder3d_8xb2-laser-polar-mix-3x_semantickitti.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine import read_base
+
+with read_base():
+    from .._base_.datasets.semantickitti import *
+    from .._base_.default_runtime import *
+    from .._base_.models.cylinder3d import *
+    from .._base_.schedules.schedule_3x import *
+
+from mmcv.transforms.wrappers import RandomChoice
+
+from mmdet3d.datasets.transforms.transforms_3d import LaserMix, PolarMix
+
+train_pipeline = [
+    dict(type=LoadPointsFromFile, coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti'),
+    dict(type=PointSegClassMapping),
+    dict(
+        type=RandomChoice,
+        transforms=[
+            [
+                dict(
+                    type=LaserMix,
+                    num_areas=[3, 4, 5, 6],
+                    pitch_angles=[-25, 3],
+                    pre_transform=[
+                        dict(
+                            type=LoadPointsFromFile,
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type=LoadAnnotations3D,
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type=PointSegClassMapping)
+                    ],
+                    prob=1)
+            ],
+            [
+                dict(
+                    type=PolarMix,
+                    instance_classes=[0, 1, 2, 3, 4, 5, 6, 7],
+                    swap_ratio=0.5,
+                    rotate_paste_ratio=1.0,
+                    pre_transform=[
+                        dict(
+                            type=LoadPointsFromFile,
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type=LoadAnnotations3D,
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type=PointSegClassMapping)
+                    ],
+                    prob=1)
+            ],
+        ],
+        prob=[0.5, 0.5]),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[0., 6.28318531],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0],
+    ),
+    dict(type=Pack3DDetInputs, keys=['points', 'pts_semantic_mask'])
+]
+
+train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline)))
+
+default_hooks.update(dict(checkpoint=dict(type=CheckpointHook, interval=1)))
diff --git a/mmde/mmdet3d/configs/minkunet/minkunet34_w32_torchsparse_8xb2_laser_polar_mix_3x_semantickitti.py b/mmde/mmdet3d/configs/minkunet/minkunet34_w32_torchsparse_8xb2_laser_polar_mix_3x_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f7ef88bccf0d3815cfe2d47478a9c194c5d3436
--- /dev/null
+++ b/mmde/mmdet3d/configs/minkunet/minkunet34_w32_torchsparse_8xb2_laser_polar_mix_3x_semantickitti.py
@@ -0,0 +1,99 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine import read_base
+
+with read_base():
+    from .._base_.datasets.semantickitti import *
+    from .._base_.models.minkunet import *
+    from .._base_.schedules.schedule_3x import *
+    from .._base_.default_runtime import *
+
+from mmcv.transforms.wrappers import RandomChoice
+from mmengine.hooks.checkpoint_hook import CheckpointHook
+
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile,
+                                                 PointSegClassMapping)
+from mmdet3d.datasets.transforms.transforms_3d import (GlobalRotScaleTrans,
+                                                       LaserMix, PolarMix)
+
+model.update(
+    dict(
+        data_preprocessor=dict(max_voxels=None),
+        backbone=dict(encoder_blocks=[2, 3, 4, 6])))
+
+train_pipeline = [
+    dict(type=LoadPointsFromFile, coord_type='LIDAR', load_dim=4, use_dim=4),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti'),
+    dict(type=PointSegClassMapping),
+    dict(
+        type=RandomChoice,
+        transforms=[
+            [
+                dict(
+                    type=LaserMix,
+                    num_areas=[3, 4, 5, 6],
+                    pitch_angles=[-25, 3],
+                    pre_transform=[
+                        dict(
+                            type=LoadPointsFromFile,
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type=LoadAnnotations3D,
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type=PointSegClassMapping)
+                    ],
+                    prob=1)
+            ],
+            [
+                dict(
+                    type=PolarMix,
+                    instance_classes=[0, 1, 2, 3, 4, 5, 6, 7],
+                    swap_ratio=0.5,
+                    rotate_paste_ratio=1.0,
+                    pre_transform=[
+                        dict(
+                            type=LoadPointsFromFile,
+                            coord_type='LIDAR',
+                            load_dim=4,
+                            use_dim=4),
+                        dict(
+                            type=LoadAnnotations3D,
+                            with_bbox_3d=False,
+                            with_label_3d=False,
+                            with_seg_3d=True,
+                            seg_3d_dtype='np.int32',
+                            seg_offset=2**16,
+                            dataset_type='semantickitti'),
+                        dict(type=PointSegClassMapping)
+                    ],
+                    prob=1)
+            ],
+        ],
+        prob=[0.5, 0.5]),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[0., 6.28318531],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0],
+    ),
+    dict(type=Pack3DDetInputs, keys=['points', 'pts_semantic_mask'])
+]
+
+train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline)))
+
+default_hooks.update(dict(checkpoint=dict(type=CheckpointHook, interval=1)))
diff --git a/mmde/mmdet3d/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2_80e_kitti_3d_3class.py b/mmde/mmdet3d/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2_80e_kitti_3d_3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b255894b00bf5bf0539b34444737357f63d2fae
--- /dev/null
+++ b/mmde/mmdet3d/configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2_80e_kitti_3d_3class.py
@@ -0,0 +1,312 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine import read_base
+
+with read_base():
+    from .._base_.schedules.cosine import *
+    from .._base_.default_runtime import *
+
+from mmcv.transforms.loading import LoadImageFromFile
+from mmcv.transforms.processing import RandomResize, Resize
+from mmengine.dataset.dataset_wrapper import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler
+from mmengine.visualization.vis_backend import LocalVisBackend
+
+from mmdet3d.datasets.kitti_dataset import KittiDataset
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile)
+from mmdet3d.datasets.transforms.test_time_aug import MultiScaleFlipAug3D
+from mmdet3d.datasets.transforms.transforms_3d import (GlobalRotScaleTrans,
+                                                       ObjectRangeFilter,
+                                                       PointShuffle,
+                                                       PointsRangeFilter,
+                                                       RandomFlip3D)
+from mmdet3d.evaluation.metrics.kitti_metric import KittiMetric
+from mmdet3d.models.backbones.second import SECOND
+from mmdet3d.models.data_preprocessors.data_preprocessor import \
+    Det3DDataPreprocessor
+from mmdet3d.models.dense_heads.anchor3d_head import Anchor3DHead
+from mmdet3d.models.detectors.mvx_faster_rcnn import DynamicMVXFasterRCNN
+from mmdet3d.models.layers.fusion_layers.point_fusion import PointFusion
+from mmdet3d.models.middle_encoders.sparse_encoder import SparseEncoder
+from mmdet3d.models.necks.second_fpn import SECONDFPN
+from mmdet3d.models.task_modules.anchor.anchor_3d_generator import \
+    Anchor3DRangeGenerator
+from mmdet3d.models.task_modules.assigners.max_3d_iou_assigner import \
+    Max3DIoUAssigner
+from mmdet3d.models.task_modules.coders.delta_xyzwhlr_bbox_coder import \
+    DeltaXYZWLHRBBoxCoder
+from mmdet3d.models.voxel_encoders.voxel_encoder import DynamicVFE
+from mmdet3d.structures.ops.iou3d_calculator import BboxOverlapsNearest3D
+from mmdet3d.visualization.local_visualizer import Det3DLocalVisualizer
+
+# model settings
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+model = dict(
+    type=DynamicMVXFasterRCNN,
+    data_preprocessor=dict(
+        type=Det3DDataPreprocessor,
+        voxel=True,
+        voxel_type='dynamic',
+        voxel_layer=dict(
+            max_num_points=-1,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(-1, -1)),
+        mean=[102.9801, 115.9465, 122.7717],
+        std=[1.0, 1.0, 1.0],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    img_backbone=dict(
+        type='mmdet.ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    img_neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        # make the image features more stable numerically to avoid loss nan
+        norm_cfg=dict(type='BN', requires_grad=False),
+        num_outs=5),
+    pts_voxel_encoder=dict(
+        type=DynamicVFE,
+        in_channels=4,
+        feat_channels=[64, 64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        with_cluster_center=True,
+        with_voxel_center=True,
+        point_cloud_range=point_cloud_range,
+        fusion_layer=dict(
+            type=PointFusion,
+            img_channels=256,
+            pts_channels=64,
+            mid_channels=128,
+            out_channels=128,
+            img_levels=[0, 1, 2, 3, 4],
+            align_corners=False,
+            activate_out=True,
+            fuse_out=False)),
+    pts_middle_encoder=dict(
+        type=SparseEncoder,
+        in_channels=128,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act')),
+    pts_backbone=dict(
+        type=SECOND,
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    pts_neck=dict(
+        type=SECONDFPN,
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    pts_bbox_head=dict(
+        type=Anchor3DHead,
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type=Anchor3DRangeGenerator,
+            ranges=[
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+            ],
+            sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        assigner_per_size=True,
+        diff_rad_by_sin=True,
+        assign_per_class=True,
+        bbox_coder=dict(type=DeltaXYZWLHRBBoxCoder),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type=Max3DIoUAssigner,
+                    iou_calculator=dict(type=BboxOverlapsNearest3D),
+                    pos_iou_thr=0.35,
+                    neg_iou_thr=0.2,
+                    min_pos_iou=0.2,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type=Max3DIoUAssigner,
+                    iou_calculator=dict(type=BboxOverlapsNearest3D),
+                    pos_iou_thr=0.35,
+                    neg_iou_thr=0.2,
+                    min_pos_iou=0.2,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type=Max3DIoUAssigner,
+                    iou_calculator=dict(type=BboxOverlapsNearest3D),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1),
+            ],
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        pts=dict(
+            use_rotate_nms=True,
+            nms_across_levels=False,
+            nms_thr=0.01,
+            score_thr=0.1,
+            min_bbox_size=0,
+            nms_pre=100,
+            max_num=50)))
+
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+metainfo = dict(classes=class_names)
+input_modality = dict(use_lidar=True, use_camera=True)
+backend_args = None
+train_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type=LoadImageFromFile, backend_args=backend_args),
+    dict(type=LoadAnnotations3D, with_bbox_3d=True, with_label_3d=True),
+    dict(type=RandomResize, scale=[(640, 192), (2560, 768)], keep_ratio=True),
+    dict(
+        type=GlobalRotScaleTrans,
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.2, 0.2, 0.2]),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=ObjectRangeFilter, point_cloud_range=point_cloud_range),
+    dict(type=PointShuffle),
+    dict(
+        type=Pack3DDetInputs,
+        keys=[
+            'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
+            'gt_labels'
+        ])
+]
+test_pipeline = [
+    dict(
+        type=LoadPointsFromFile,
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(type=LoadImageFromFile, backend_args=backend_args),
+    dict(
+        type=MultiScaleFlipAug3D,
+        img_scale=(1280, 384),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            # Temporary solution, fix this after refactor the augtest
+            dict(type=Resize, scale=0, keep_ratio=True),
+            dict(
+                type=GlobalRotScaleTrans,
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type=RandomFlip3D),
+            dict(type=PointsRangeFilter, point_cloud_range=point_cloud_range),
+        ]),
+    dict(type=Pack3DDetInputs, keys=['points', 'img'])
+]
+modality = dict(use_lidar=True, use_camera=True)
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    sampler=dict(type=DefaultSampler, shuffle=True),
+    dataset=dict(
+        type=RepeatDataset,
+        times=2,
+        dataset=dict(
+            type=KittiDataset,
+            data_root=data_root,
+            modality=modality,
+            ann_file='kitti_infos_train.pkl',
+            data_prefix=dict(
+                pts='training/velodyne_reduced', img='training/image_2'),
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=KittiDataset,
+        data_root=data_root,
+        modality=modality,
+        ann_file='kitti_infos_val.pkl',
+        data_prefix=dict(
+            pts='training/velodyne_reduced', img='training/image_2'),
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=KittiDataset,
+        data_root=data_root,
+        ann_file='kitti_infos_val.pkl',
+        modality=modality,
+        data_prefix=dict(
+            pts='training/velodyne_reduced', img='training/image_2'),
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+optim_wrapper.update(
+    dict(
+        optimizer=dict(weight_decay=0.01),
+        clip_grad=dict(max_norm=35, norm_type=2),
+    ))
+val_evaluator = dict(
+    type=KittiMetric, ann_file='data/kitti/kitti_infos_val.pkl')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type=LocalVisBackend)]
+visualizer = dict(
+    type=Det3DLocalVisualizer, vis_backends=vis_backends, name='visualizer')
+
+# You may need to download the model first is the network is unstable
+load_from = 'https://download.openmmlab.com/mmdetection3d/pretrain_models/mvx_faster_rcnn_detectron2-caffe_20e_coco-pretrain_gt-sample_kitti-3-class_moderate-79.3_20200207-a4a6a3c7.pth'  # noqa
diff --git a/mmde/mmdet3d/configs/pgd/pgd_r101_caffe_fpn_head_gn_4xb3_4x_kitti_mono3d.py b/mmde/mmdet3d/configs/pgd/pgd_r101_caffe_fpn_head_gn_4xb3_4x_kitti_mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..45568956074643b7cd609402bf04ef3692a2340b
--- /dev/null
+++ b/mmde/mmdet3d/configs/pgd/pgd_r101_caffe_fpn_head_gn_4xb3_4x_kitti_mono3d.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine import read_base
+
+with read_base():
+    from .._base_.datasets.kitti_mono3d import *
+    from .._base_.models.pgd import *
+    from .._base_.schedules.mmdet_schedule_1x import *
+    from .._base_.default_runtime import *
+
+from mmcv.transforms.processing import Resize
+from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR
+
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadImageFromFileMono3D)
+from mmdet3d.datasets.transforms.transforms_3d import RandomFlip3D
+from mmdet3d.models.data_preprocessors.data_preprocessor import \
+    Det3DDataPreprocessor
+from mmdet3d.models.losses.uncertain_smooth_l1_loss import \
+    UncertainSmoothL1Loss
+from mmdet3d.models.task_modules.coders.pgd_bbox_coder import PGDBBoxCoder
+
+# model settings
+model.update(
+    dict(
+        data_preprocessor=dict(
+            type=Det3DDataPreprocessor,
+            mean=[103.530, 116.280, 123.675],
+            std=[1.0, 1.0, 1.0],
+            bgr_to_rgb=False,
+            pad_size_divisor=32),
+        backbone=dict(frozen_stages=0),
+        neck=dict(start_level=0, num_outs=4),
+        bbox_head=dict(
+            num_classes=3,
+            bbox_code_size=7,
+            pred_attrs=False,
+            pred_velo=False,
+            pred_bbox2d=True,
+            use_onlyreg_proj=True,
+            strides=(4, 8, 16, 32),
+            regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 1e8)),
+            group_reg_dims=(2, 1, 3, 1, 16,
+                            4),  # offset, depth, size, rot, kpts, bbox2d
+            reg_branch=(
+                (256, ),  # offset
+                (256, ),  # depth
+                (256, ),  # size
+                (256, ),  # rot
+                (256, ),  # kpts
+                (256, )  # bbox2d
+            ),
+            centerness_branch=(256, ),
+            loss_cls=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+            loss_dir=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            loss_centerness=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                loss_weight=1.0),
+            use_depth_classifier=True,
+            depth_branch=(256, ),
+            depth_range=(0, 70),
+            depth_unit=10,
+            division='uniform',
+            depth_bins=8,
+            pred_keypoints=True,
+            weight_dim=1,
+            loss_depth=dict(
+                type=UncertainSmoothL1Loss,
+                alpha=1.0,
+                beta=3.0,
+                loss_weight=1.0),
+            bbox_coder=dict(
+                type=PGDBBoxCoder,
+                base_depths=((28.01, 16.32), ),
+                base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56,
+                                                                 1.6)),
+                code_size=7)),
+        # set weight 1.0 for base 7 dims (offset, depth, size, rot)
+        # 0.2 for 16-dim keypoint offsets and 1.0 for 4-dim 2D distance targets
+        train_cfg=dict(code_weight=[
+            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
+            0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0,
+            1.0
+        ]),
+        test_cfg=dict(
+            nms_pre=100, nms_thr=0.05, score_thr=0.001, max_per_img=20)))
+
+backend_args = None
+
+train_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(
+        type=LoadAnnotations3D,
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=False,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type=Resize, scale=(1242, 375), keep_ratio=True),
+    dict(type=RandomFlip3D, flip_ratio_bev_horizontal=0.5),
+    dict(
+        type=Pack3DDetInputs,
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers_2d', 'depths'
+        ]),
+]
+test_pipeline = [
+    dict(type=LoadImageFromFileMono3D, backend_args=backend_args),
+    dict(type=Resize, scale_factor=1.0),
+    dict(type=Pack3DDetInputs, keys=['img'])
+]
+
+train_dataloader.update(
+    dict(batch_size=3, num_workers=3, dataset=dict(pipeline=train_pipeline)))
+test_dataloader.update(dict(dataset=dict(pipeline=test_pipeline)))
+val_dataloader.update(dict(dataset=dict(pipeline=test_pipeline)))
+
+# optimizer
+optim_wrapper.update(
+    dict(
+        optimizer=dict(lr=0.001),
+        paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.),
+        clip_grad=dict(max_norm=35, norm_type=2)))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type=LinearLR, start_factor=1.0 / 3, by_epoch=False, begin=0, end=500),
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=48,
+        by_epoch=True,
+        milestones=[32, 44],
+        gamma=0.1)
+]
+
+train_cfg.update(dict(max_epochs=48, val_interval=2))
+auto_scale_lr.update(dict(base_batch_size=12))
diff --git a/mmde/mmdet3d/configs/votenet/__init__.py b/mmde/mmdet3d/configs/votenet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d
--- /dev/null
+++ b/mmde/mmdet3d/configs/votenet/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmde/mmdet3d/configs/votenet/votenet_8xb8_scannet_3d.py b/mmde/mmdet3d/configs/votenet/votenet_8xb8_scannet_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..68d83cab900ce6ab4be716ddb93c433df81b6742
--- /dev/null
+++ b/mmde/mmdet3d/configs/votenet/votenet_8xb8_scannet_3d.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine import read_base
+
+with read_base():
+    from .._base_.datasets.scannet_3d import *
+    from .._base_.models.votenet import *
+    from .._base_.schedules.schedule_3x import *
+    from .._base_.default_runtime import *
+
+from mmengine.hooks.logger_hook import LoggerHook
+
+from mmdet3d.models.task_modules.coders.partial_bin_based_bbox_coder import \
+    PartialBinBasedBBoxCoder
+
+# model settings
+model.update(
+    dict(
+        bbox_head=dict(
+            num_classes=18,
+            bbox_coder=dict(
+                type=PartialBinBasedBBoxCoder,
+                num_sizes=18,
+                num_dir_bins=1,
+                with_rot=False,
+                mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                            [1.876858, 1.8425595, 1.1931566],
+                            [0.61328, 0.6148609, 0.7182701],
+                            [1.3955007, 1.5121545, 0.83443564],
+                            [0.97949594, 1.0675149, 0.6329687],
+                            [0.531663, 0.5955577, 1.7500148],
+                            [0.9624706, 0.72462326, 1.1481868],
+                            [0.83221924, 1.0490936, 1.6875663],
+                            [0.21132214, 0.4206159, 0.5372846],
+                            [1.4440073, 1.8970833, 0.26985747],
+                            [1.0294262, 1.4040797, 0.87554324],
+                            [1.3766412, 0.65521795, 1.6813129],
+                            [0.6650819, 0.71111923, 1.298853],
+                            [0.41999173, 0.37906948, 1.7513971],
+                            [0.59359556, 0.5912492, 0.73919016],
+                            [0.50867593, 0.50656086, 0.30136237],
+                            [1.1511526, 1.0546296, 0.49706793],
+                            [0.47535285, 0.49249494, 0.5802117]]))))
+
+default_hooks.update(dict(logger=dict(type=LoggerHook, interval=30)))
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr.update(dict(enable=False, base_batch_size=64))
diff --git a/mmde/mmdet3d/datasets/__init__.py b/mmde/mmdet3d/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d573ca4ed9eaf9f6453f062204c184d7b8e18244
--- /dev/null
+++ b/mmde/mmdet3d/datasets/__init__.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .dataset_wrappers import CBGSDataset
+from .det3d_dataset import Det3DDataset
+from .kitti_dataset import KittiDataset
+from .lyft_dataset import LyftDataset
+from .nuscenes_dataset import NuScenesDataset
+# yapf: enable
+from .s3dis_dataset import S3DISDataset, S3DISSegDataset
+from .scannet_dataset import (ScanNetDataset, ScanNetInstanceSegDataset,
+                              ScanNetSegDataset)
+from .seg3d_dataset import Seg3DDataset
+from .semantickitti_dataset import SemanticKittiDataset
+from .sunrgbd_dataset import SUNRGBDDataset
+# yapf: disable
+from .transforms import (AffineResize, BackgroundPointsFilter, GlobalAlignment,
+                         GlobalRotScaleTrans, IndoorPatchPointSample,
+                         IndoorPointSample, LoadAnnotations3D,
+                         LoadPointsFromDict, LoadPointsFromFile,
+                         LoadPointsFromMultiSweeps, NormalizePointsColor,
+                         ObjectNameFilter, ObjectNoise, ObjectRangeFilter,
+                         ObjectSample, PointSample, PointShuffle,
+                         PointsRangeFilter, RandomDropPointsColor,
+                         RandomFlip3D, RandomJitterPoints, RandomResize3D,
+                         RandomShiftScale, Resize3D, VoxelBasedPointSampler)
+from .utils import get_loading_pipeline
+from .waymo_dataset import WaymoDataset
+
+__all__ = [
+    'KittiDataset', 'CBGSDataset', 'NuScenesDataset', 'LyftDataset',
+    'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
+    'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter',
+    'LoadPointsFromFile', 'S3DISSegDataset', 'S3DISDataset',
+    'NormalizePointsColor', 'IndoorPatchPointSample', 'IndoorPointSample',
+    'PointSample', 'LoadAnnotations3D', 'GlobalAlignment', 'SUNRGBDDataset',
+    'ScanNetDataset', 'ScanNetSegDataset', 'ScanNetInstanceSegDataset',
+    'SemanticKittiDataset', 'Det3DDataset', 'Seg3DDataset',
+    'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter',
+    'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor',
+    'RandomJitterPoints', 'ObjectNameFilter', 'AffineResize',
+    'RandomShiftScale', 'LoadPointsFromDict', 'Resize3D', 'RandomResize3D',
+]
diff --git a/mmde/mmdet3d/datasets/convert_utils.py b/mmde/mmdet3d/datasets/convert_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb4d97e1370e235497b9f46ba4a0f70ea5050061
--- /dev/null
+++ b/mmde/mmdet3d/datasets/convert_utils.py
@@ -0,0 +1,425 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+from nuscenes import NuScenes
+from nuscenes.utils.geometry_utils import view_points
+from pyquaternion import Quaternion
+from shapely.geometry import MultiPoint, box
+from shapely.geometry.polygon import Polygon
+
+from mmdet3d.structures import Box3DMode, CameraInstance3DBoxes, points_cam2img
+from mmdet3d.structures.ops import box_np_ops
+
+kitti_categories = ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
+                    'Person_sitting', 'Tram', 'Misc')
+
+waymo_categories = ('Car', 'Pedestrian', 'Cyclist')
+
+nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+                  'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+                  'barrier')
+
+nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
+                  'pedestrian.moving', 'pedestrian.standing',
+                  'pedestrian.sitting_lying_down', 'vehicle.moving',
+                  'vehicle.parked', 'vehicle.stopped', 'None')
+NuScenesNameMapping = {
+    'movable_object.barrier': 'barrier',
+    'vehicle.bicycle': 'bicycle',
+    'vehicle.bus.bendy': 'bus',
+    'vehicle.bus.rigid': 'bus',
+    'vehicle.car': 'car',
+    'vehicle.construction': 'construction_vehicle',
+    'vehicle.motorcycle': 'motorcycle',
+    'human.pedestrian.adult': 'pedestrian',
+    'human.pedestrian.child': 'pedestrian',
+    'human.pedestrian.construction_worker': 'pedestrian',
+    'human.pedestrian.police_officer': 'pedestrian',
+    'movable_object.trafficcone': 'traffic_cone',
+    'vehicle.trailer': 'trailer',
+    'vehicle.truck': 'truck'
+}
+LyftNameMapping = {
+    'bicycle': 'bicycle',
+    'bus': 'bus',
+    'car': 'car',
+    'emergency_vehicle': 'emergency_vehicle',
+    'motorcycle': 'motorcycle',
+    'other_vehicle': 'other_vehicle',
+    'pedestrian': 'pedestrian',
+    'truck': 'truck',
+    'animal': 'animal'
+}
+
+
+def get_nuscenes_2d_boxes(nusc: NuScenes, sample_data_token: str,
+                          visibilities: List[str]) -> List[dict]:
+    """Get the 2d / mono3d annotation records for a given `sample_data_token`
+    of nuscenes dataset.
+
+    Args:
+        nusc (:obj:`NuScenes`): NuScenes class.
+        sample_data_token (str): Sample data token belonging to a camera
+            keyframe.
+        visibilities (List[str]): Visibility filter.
+
+    Return:
+        List[dict]: List of 2d annotation record that belongs to the input
+        `sample_data_token`.
+    """
+
+    # Get the sample data and the sample corresponding to that sample data.
+    sd_rec = nusc.get('sample_data', sample_data_token)
+
+    assert sd_rec[
+        'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
+        ' for camera sample_data!'
+    if not sd_rec['is_key_frame']:
+        raise ValueError(
+            'The 2D re-projections are available only for keyframes.')
+
+    s_rec = nusc.get('sample', sd_rec['sample_token'])
+
+    # Get the calibrated sensor and ego pose
+    # record to get the transformation matrices.
+    cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
+    pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+    camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
+
+    # Get all the annotation with the specified visibilties.
+    ann_recs = [
+        nusc.get('sample_annotation', token) for token in s_rec['anns']
+    ]
+    ann_recs = [
+        ann_rec for ann_rec in ann_recs
+        if (ann_rec['visibility_token'] in visibilities)
+    ]
+
+    repro_recs = []
+
+    for ann_rec in ann_recs:
+        # Augment sample_annotation with token information.
+        ann_rec['sample_annotation_token'] = ann_rec['token']
+        ann_rec['sample_data_token'] = sample_data_token
+
+        # Get the box in global coordinates.
+        box = nusc.get_box(ann_rec['token'])
+
+        # Move them to the ego-pose frame.
+        box.translate(-np.array(pose_rec['translation']))
+        box.rotate(Quaternion(pose_rec['rotation']).inverse)
+
+        # Move them to the calibrated sensor frame.
+        box.translate(-np.array(cs_rec['translation']))
+        box.rotate(Quaternion(cs_rec['rotation']).inverse)
+
+        # Filter out the corners that are not in front of the calibrated
+        # sensor.
+        corners_3d = box.corners()
+        in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+        corners_3d = corners_3d[:, in_front]
+
+        # Project 3d box to 2d.
+        corner_coords = view_points(corners_3d, camera_intrinsic,
+                                    True).T[:, :2].tolist()
+
+        # Keep only corners that fall within the image.
+        final_coords = post_process_coords(corner_coords)
+
+        # Skip if the convex hull of the re-projected corners
+        # does not intersect the image canvas.
+        if final_coords is None:
+            continue
+        else:
+            min_x, min_y, max_x, max_y = final_coords
+
+        # Generate dictionary record to be included in the .json file.
+        repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+                                    'nuscenes')
+
+        # if repro_rec is None, we do not append it into repre_recs
+        if repro_rec is not None:
+            loc = box.center.tolist()
+
+            dim = box.wlh
+            dim[[0, 1, 2]] = dim[[1, 2, 0]]  # convert wlh to our lhw
+            dim = dim.tolist()
+
+            rot = box.orientation.yaw_pitch_roll[0]
+            rot = [-rot]  # convert the rot to our cam coordinate
+
+            global_velo2d = nusc.box_velocity(box.token)[:2]
+            global_velo3d = np.array([*global_velo2d, 0.0])
+            e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
+            c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
+            cam_velo3d = global_velo3d @ np.linalg.inv(
+                e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
+            velo = cam_velo3d[0::2].tolist()
+
+            repro_rec['bbox_3d'] = loc + dim + rot
+            repro_rec['velocity'] = velo
+
+            center_3d = np.array(loc).reshape([1, 3])
+            center_2d_with_depth = points_cam2img(
+                center_3d, camera_intrinsic, with_depth=True)
+            center_2d_with_depth = center_2d_with_depth.squeeze().tolist()
+            repro_rec['center_2d'] = center_2d_with_depth[:2]
+            repro_rec['depth'] = center_2d_with_depth[2]
+            # normalized center2D + depth
+            # if samples with depth < 0 will be removed
+            if repro_rec['depth'] <= 0:
+                continue
+
+            ann_token = nusc.get('sample_annotation',
+                                 box.token)['attribute_tokens']
+            if len(ann_token) == 0:
+                attr_name = 'None'
+            else:
+                attr_name = nusc.get('attribute', ann_token[0])['name']
+            attr_id = nus_attributes.index(attr_name)
+            # repro_rec['attribute_name'] = attr_name
+            repro_rec['attr_label'] = attr_id
+
+            repro_recs.append(repro_rec)
+
+    return repro_recs
+
+
+def get_kitti_style_2d_boxes(info: dict,
+                             cam_idx: int = 2,
+                             occluded: Tuple[int] = (0, 1, 2, 3),
+                             annos: Optional[dict] = None,
+                             mono3d: bool = True,
+                             dataset: str = 'kitti') -> List[dict]:
+    """Get the 2d / mono3d annotation records for a given info.
+
+    This function is used to get 2D/Mono3D annotations when loading annotations
+    from a kitti-style dataset class, such as KITTI and Waymo dataset.
+
+    Args:
+        info (dict): Information of the given sample data.
+        cam_idx (int): Camera id which the 2d / mono3d annotations to obtain
+            belong to. In KITTI, typically only CAM 2 will be used,
+            and in Waymo, multi cameras could be used.
+            Defaults to 2.
+        occluded (Tuple[int]): Integer (0, 1, 2, 3) indicating occlusion state:
+            0 = fully visible, 1 = partly occluded, 2 = largely occluded,
+            3 = unknown, -1 = DontCare.
+            Defaults to (0, 1, 2, 3).
+        annos (dict, optional): Original annotations. Defaults to None.
+        mono3d (bool): Whether to get boxes with mono3d annotation.
+            Defaults to True.
+        dataset (str): Dataset name of getting 2d bboxes.
+            Defaults to 'kitti'.
+
+    Return:
+        List[dict]: List of 2d / mono3d annotation record that
+        belongs to the input camera id.
+    """
+    # Get calibration information
+    camera_intrinsic = info['calib'][f'P{cam_idx}']
+
+    repro_recs = []
+    # if no annotations in info (test dataset), then return
+    if annos is None:
+        return repro_recs
+
+    # Get all the annotation with the specified visibilties.
+    # filter the annotation bboxes by occluded attributes
+    ann_dicts = annos
+    mask = [(ocld in occluded) for ocld in ann_dicts['occluded']]
+    for k in ann_dicts.keys():
+        ann_dicts[k] = ann_dicts[k][mask]
+
+    # convert dict of list to list of dict
+    ann_recs = []
+    for i in range(len(ann_dicts['occluded'])):
+        ann_rec = {}
+        for k in ann_dicts.keys():
+            ann_rec[k] = ann_dicts[k][i]
+        ann_recs.append(ann_rec)
+
+    for ann_idx, ann_rec in enumerate(ann_recs):
+        # Augment sample_annotation with token information.
+        ann_rec['sample_annotation_token'] = \
+            f"{info['image']['image_idx']}.{ann_idx}"
+        ann_rec['sample_data_token'] = info['image']['image_idx']
+
+        loc = ann_rec['location'][np.newaxis, :]
+        dim = ann_rec['dimensions'][np.newaxis, :]
+        rot = ann_rec['rotation_y'][np.newaxis, np.newaxis]
+
+        # transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5]
+        dst = np.array([0.5, 0.5, 0.5])
+        src = np.array([0.5, 1.0, 0.5])
+        # gravity center
+        loc_center = loc + dim * (dst - src)
+        gt_bbox_3d = np.concatenate([loc_center, dim, rot],
+                                    axis=1).astype(np.float32)
+
+        # Filter out the corners that are not in front of the calibrated
+        # sensor.
+        corners_3d = box_np_ops.center_to_corner_box3d(
+            gt_bbox_3d[:, :3],
+            gt_bbox_3d[:, 3:6],
+            gt_bbox_3d[:, 6], (0.5, 0.5, 0.5),
+            axis=1)
+        corners_3d = corners_3d[0].T  # (1, 8, 3) -> (3, 8)
+        in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+        corners_3d = corners_3d[:, in_front]
+
+        # Project 3d box to 2d.
+        corner_coords = view_points(corners_3d, camera_intrinsic,
+                                    True).T[:, :2].tolist()
+
+        # Keep only corners that fall within the image.
+        final_coords = post_process_coords(
+            corner_coords,
+            imsize=(info['image']['image_shape'][1],
+                    info['image']['image_shape'][0]))
+
+        # Skip if the convex hull of the re-projected corners
+        # does not intersect the image canvas.
+        if final_coords is None:
+            continue
+        else:
+            min_x, min_y, max_x, max_y = final_coords
+
+        # Generate dictionary record to be included in the .json file.
+        repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+                                    dataset)
+
+        # If mono3d=True, add 3D annotations in camera coordinates
+        if mono3d and (repro_rec is not None):
+            # use bottom center to represent the bbox_3d
+            repro_rec['bbox_3d'] = np.concatenate(
+                [loc, dim, rot], axis=1).astype(np.float32).squeeze().tolist()
+            repro_rec['velocity'] = -1  # no velocity in KITTI
+
+            center_3d = np.array(loc_center).reshape([1, 3])
+            center_2d_with_depth = points_cam2img(
+                center_3d, camera_intrinsic, with_depth=True)
+            center_2d_with_depth = center_2d_with_depth.squeeze().tolist()
+
+            repro_rec['center_2d'] = center_2d_with_depth[:2]
+            repro_rec['depth'] = center_2d_with_depth[2]
+            # normalized center2D + depth
+            # samples with depth < 0 will be removed
+            if repro_rec['depth'] <= 0:
+                continue
+            repro_recs.append(repro_rec)
+
+    return repro_recs
+
+
+def convert_annos(info: dict, cam_idx: int) -> dict:
+    """Convert front-cam anns to i-th camera (KITTI-style info)."""
+    rect = info['calib']['R0_rect'].astype(np.float32)
+    lidar2cam0 = info['calib']['Tr_velo_to_cam'].astype(np.float32)
+    lidar2cami = info['calib'][f'Tr_velo_to_cam{cam_idx}'].astype(np.float32)
+    annos = info['annos']
+    converted_annos = copy.deepcopy(annos)
+    loc = annos['location']
+    dims = annos['dimensions']
+    rots = annos['rotation_y']
+    gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]],
+                                  axis=1).astype(np.float32)
+    # convert gt_bboxes_3d to velodyne coordinates
+    gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to(
+        Box3DMode.LIDAR, np.linalg.inv(rect @ lidar2cam0), correct_yaw=True)
+    # convert gt_bboxes_3d to cam coordinates
+    gt_bboxes_3d = gt_bboxes_3d.convert_to(
+        Box3DMode.CAM, rect @ lidar2cami, correct_yaw=True).numpy()
+    converted_annos['location'] = gt_bboxes_3d[:, :3]
+    converted_annos['dimensions'] = gt_bboxes_3d[:, 3:6]
+    converted_annos['rotation_y'] = gt_bboxes_3d[:, 6]
+    return converted_annos
+
+
+def post_process_coords(
+    corner_coords: List[int], imsize: Tuple[int] = (1600, 900)
+) -> Union[Tuple[float], None]:
+    """Get the intersection of the convex hull of the reprojected bbox corners
+    and the image canvas, return None if no intersection.
+
+    Args:
+        corner_coords (List[int]): Corner coordinates of reprojected
+            bounding box.
+        imsize (Tuple[int]): Size of the image canvas.
+            Defaults to (1600, 900).
+
+    Return:
+        Tuple[float] or None: Intersection of the convex hull of the 2D box
+        corners and the image canvas.
+    """
+    polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
+    img_canvas = box(0, 0, imsize[0], imsize[1])
+
+    if polygon_from_2d_box.intersects(img_canvas):
+        img_intersection = polygon_from_2d_box.intersection(img_canvas)
+        if isinstance(img_intersection, Polygon):
+            intersection_coords = np.array(
+                [coord for coord in img_intersection.exterior.coords])
+            min_x = min(intersection_coords[:, 0])
+            min_y = min(intersection_coords[:, 1])
+            max_x = max(intersection_coords[:, 0])
+            max_y = max(intersection_coords[:, 1])
+            return min_x, min_y, max_x, max_y
+        else:
+            warnings.warn('img_intersection is not an object of Polygon.')
+            return None
+    else:
+        return None
+
+
+def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
+                    dataset: str) -> Union[dict, None]:
+    """Generate one 2D annotation record given various information on top of
+    the 2D bounding box coordinates.
+
+    Args:
+        ann_rec (dict): Original 3d annotation record.
+        x1 (float): Minimum value of the x coordinate.
+        y1 (float): Minimum value of the y coordinate.
+        x2 (float): Maximum value of the x coordinate.
+        y2 (float): Maximum value of the y coordinate.
+        dataset (str): Name of dataset.
+
+    Returns:
+        dict or None: A sample 2d annotation record.
+
+            - bbox_label (int): 2d box label id
+            - bbox_label_3d (int): 3d box label id
+            - bbox (List[float]): left x, top y, right x, bottom y of 2d box
+            - bbox_3d_isvalid (bool): whether the box is valid
+    """
+
+    if dataset == 'nuscenes':
+        cat_name = ann_rec['category_name']
+        if cat_name not in NuScenesNameMapping:
+            return None
+        else:
+            cat_name = NuScenesNameMapping[cat_name]
+            categories = nus_categories
+    else:
+        if dataset == 'kitti':
+            categories = kitti_categories
+        elif dataset == 'waymo':
+            categories = waymo_categories
+        else:
+            raise NotImplementedError('Unsupported dataset!')
+
+        cat_name = ann_rec['name']
+        if cat_name not in categories:
+            return None
+
+    rec = dict()
+    rec['bbox_label'] = categories.index(cat_name)
+    rec['bbox_label_3d'] = rec['bbox_label']
+    rec['bbox'] = [x1, y1, x2, y2]
+    rec['bbox_3d_isvalid'] = True
+
+    return rec
diff --git a/mmde/mmdet3d/datasets/dataset_wrappers.py b/mmde/mmdet3d/datasets/dataset_wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..398f8543a8002337df14d32c14a9caf8feb411ff
--- /dev/null
+++ b/mmde/mmdet3d/datasets/dataset_wrappers.py
@@ -0,0 +1,182 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+from typing import List, Set, Union
+
+import numpy as np
+from mmengine.dataset import BaseDataset, force_full_init
+
+from mmdet3d.registry import DATASETS
+
+
+@DATASETS.register_module()
+class CBGSDataset:
+    """A wrapper of class sampled dataset with ann_file path. Implementation of
+    paper `Class-balanced Grouping and Sampling for Point Cloud 3D Object
+    Detection <https://arxiv.org/abs/1908.09492>`_.
+
+    Balance the number of scenes under different classes.
+
+    Args:
+        dataset (:obj:`BaseDataset` or dict): The dataset to be class sampled.
+        lazy_init (bool): Whether to load annotation during instantiation.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 dataset: Union[BaseDataset, dict],
+                 lazy_init: bool = False) -> None:
+        self.dataset: BaseDataset
+        if isinstance(dataset, dict):
+            self.dataset = DATASETS.build(dataset)
+        elif isinstance(dataset, BaseDataset):
+            self.dataset = dataset
+        else:
+            raise TypeError(
+                'elements in datasets sequence should be config or '
+                f'`BaseDataset` instance, but got {type(dataset)}')
+        self._metainfo = self.dataset.metainfo
+
+        self._fully_initialized = False
+        if not lazy_init:
+            self.full_init()
+
+    @property
+    def metainfo(self) -> dict:
+        """Get the meta information of the repeated dataset.
+
+        Returns:
+            dict: The meta information of repeated dataset.
+        """
+        return copy.deepcopy(self._metainfo)
+
+    def full_init(self) -> None:
+        """Loop to ``full_init`` each dataset."""
+        if self._fully_initialized:
+            return
+
+        self.dataset.full_init()
+        # Get sample_indices
+        self.sample_indices = self._get_sample_indices(self.dataset)
+
+        self._fully_initialized = True
+
+    def _get_sample_indices(self, dataset: BaseDataset) -> List[int]:
+        """Load sample indices according to ann_file.
+
+        Args:
+            dataset (:obj:`BaseDataset`): The dataset.
+
+        Returns:
+            List[dict]: List of indices after class sampling.
+        """
+        classes = self.metainfo['classes']
+        cat2id = {name: i for i, name in enumerate(classes)}
+        class_sample_idxs = {cat_id: [] for cat_id in cat2id.values()}
+        for idx in range(len(dataset)):
+            sample_cat_ids = dataset.get_cat_ids(idx)
+            for cat_id in sample_cat_ids:
+                if cat_id != -1:
+                    # Filter categories that do not need to be cared.
+                    # -1 indicates dontcare in MMDet3D.
+                    class_sample_idxs[cat_id].append(idx)
+        duplicated_samples = sum(
+            [len(v) for _, v in class_sample_idxs.items()])
+        class_distribution = {
+            k: len(v) / duplicated_samples
+            for k, v in class_sample_idxs.items()
+        }
+
+        sample_indices = []
+
+        frac = 1.0 / len(classes)
+        ratios = [frac / v for v in class_distribution.values()]
+        for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios):
+            sample_indices += np.random.choice(cls_inds,
+                                               int(len(cls_inds) *
+                                                   ratio)).tolist()
+        return sample_indices
+
+    @force_full_init
+    def _get_ori_dataset_idx(self, idx: int) -> int:
+        """Convert global index to local index.
+
+        Args:
+            idx (int): Global index of ``CBGSDataset``.
+
+        Returns:
+            int: Local index of data.
+        """
+        return self.sample_indices[idx]
+
+    @force_full_init
+    def get_cat_ids(self, idx: int) -> Set[int]:
+        """Get category ids of class balanced dataset by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            Set[int]: All categories in the sample of specified index.
+        """
+        sample_idx = self._get_ori_dataset_idx(idx)
+        return self.dataset.get_cat_ids(sample_idx)
+
+    @force_full_init
+    def get_data_info(self, idx: int) -> dict:
+        """Get annotation by index.
+
+        Args:
+            idx (int): Global index of ``CBGSDataset``.
+
+        Returns:
+            dict: The idx-th annotation of the dataset.
+        """
+        sample_idx = self._get_ori_dataset_idx(idx)
+        return self.dataset.get_data_info(sample_idx)
+
+    def __getitem__(self, idx: int) -> dict:
+        """Get item from infos according to the given index.
+
+        Args:
+            idx (int): The index of self.sample_indices.
+
+        Returns:
+            dict: Data dictionary of the corresponding index.
+        """
+        if not self._fully_initialized:
+            warnings.warn('Please call `full_init` method manually to '
+                          'accelerate the speed.')
+            self.full_init()
+
+        ori_index = self._get_ori_dataset_idx(idx)
+        return self.dataset[ori_index]
+
+    @force_full_init
+    def __len__(self) -> int:
+        """Return the length of data infos.
+
+        Returns:
+            int: Length of data infos.
+        """
+        return len(self.sample_indices)
+
+    def get_subset_(self, indices: Union[List[int], int]) -> None:
+        """Not supported in ``CBGSDataset`` for the ambiguous meaning of sub-
+        dataset."""
+        raise NotImplementedError(
+            '`CBGSDataset` does not support `get_subset` and '
+            '`get_subset_` interfaces because this will lead to ambiguous '
+            'implementation of some methods. If you want to use `get_subset` '
+            'or `get_subset_` interfaces, please use them in the wrapped '
+            'dataset first and then use `CBGSDataset`.')
+
+    def get_subset(self, indices: Union[List[int], int]) -> BaseDataset:
+        """Not supported in ``CBGSDataset`` for the ambiguous meaning of sub-
+        dataset."""
+        raise NotImplementedError(
+            '`CBGSDataset` does not support `get_subset` and '
+            '`get_subset_` interfaces because this will lead to ambiguous '
+            'implementation of some methods. If you want to use `get_subset` '
+            'or `get_subset_` interfaces, please use them in the wrapped '
+            'dataset first and then use `CBGSDataset`.')
diff --git a/mmde/mmdet3d/datasets/det3d_dataset.py b/mmde/mmdet3d/datasets/det3d_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..c701a893fd6bf453d0a305c958cdc771c7975dfa
--- /dev/null
+++ b/mmde/mmdet3d/datasets/det3d_dataset.py
@@ -0,0 +1,423 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os
+from os import path as osp
+from typing import Callable, List, Optional, Set, Union
+
+import numpy as np
+import torch
+from mmengine.dataset import BaseDataset
+from mmengine.logging import print_log
+from terminaltables import AsciiTable
+
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures import get_box_type
+
+
+@DATASETS.register_module()
+class Det3DDataset(BaseDataset):
+    """Base Class of 3D dataset.
+
+    This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI
+    dataset.
+    # TODO: doc link here for the standard data format
+
+    Args:
+        data_root (str, optional): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to None.
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(pts='velodyne', img='').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input,
+            it usually has following keys:
+
+                - use_camera: bool
+                - use_lidar: bool
+            Defaults to dict(use_lidar=True, use_camera=False).
+        default_cam_key (str, optional): The default camera name adopted.
+            Defaults to None.
+        box_type_3d (str): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'LiDAR' in this dataset. Available options includes:
+
+            - 'LiDAR': Box in LiDAR coordinates, usually for
+              outdoor point cloud 3d detection.
+            - 'Depth': Box in depth coordinates, usually for
+              indoor point cloud 3d detection.
+            - 'Camera': Box in camera coordinates, usually
+              for vision-based 3d detection.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+        load_eval_anns (bool): Whether to load annotations in test_mode,
+            the annotation will be save in `eval_ann_infos`, which can be
+            used in Evaluator. Defaults to True.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        show_ins_var (bool): For debug purpose. Whether to show variation
+            of the number of instances before and after through pipeline.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 data_root: Optional[str] = None,
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(pts='velodyne', img=''),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_lidar=True, use_camera=False),
+                 default_cam_key: str = None,
+                 box_type_3d: dict = 'LiDAR',
+                 filter_empty_gt: bool = True,
+                 test_mode: bool = False,
+                 load_eval_anns: bool = True,
+                 backend_args: Optional[dict] = None,
+                 show_ins_var: bool = False,
+                 **kwargs) -> None:
+        self.backend_args = backend_args
+        self.filter_empty_gt = filter_empty_gt
+        self.load_eval_anns = load_eval_anns
+        _default_modality_keys = ('use_lidar', 'use_camera')
+        if modality is None:
+            modality = dict()
+
+        # Defaults to False if not specify
+        for key in _default_modality_keys:
+            if key not in modality:
+                modality[key] = False
+        self.modality = modality
+        self.default_cam_key = default_cam_key
+        assert self.modality['use_lidar'] or self.modality['use_camera'], (
+            'Please specify the `modality` (`use_lidar` '
+            f', `use_camera`) for {self.__class__.__name__}')
+
+        self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
+
+        if metainfo is not None and 'classes' in metainfo:
+            # we allow to train on subset of self.METAINFO['classes']
+            # map unselected labels to -1
+            self.label_mapping = {
+                i: -1
+                for i in range(len(self.METAINFO['classes']))
+            }
+            self.label_mapping[-1] = -1
+            for label_idx, name in enumerate(metainfo['classes']):
+                ori_label = self.METAINFO['classes'].index(name)
+                self.label_mapping[ori_label] = label_idx
+
+            self.num_ins_per_cat = [0] * len(metainfo['classes'])
+        else:
+            self.label_mapping = {
+                i: i
+                for i in range(len(self.METAINFO['classes']))
+            }
+            self.label_mapping[-1] = -1
+
+            self.num_ins_per_cat = [0] * len(self.METAINFO['classes'])
+
+        super().__init__(
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_root=data_root,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            test_mode=test_mode,
+            **kwargs)
+
+        # can be accessed by other component in runner
+        self.metainfo['box_type_3d'] = box_type_3d
+        self.metainfo['label_mapping'] = self.label_mapping
+
+        if not kwargs.get('lazy_init', False):
+            # used for showing variation of the number of instances before and
+            # after through the pipeline
+            self.show_ins_var = show_ins_var
+
+            # show statistics of this dataset
+            print_log('-' * 30, 'current')
+            print_log(
+                f'The length of {"test" if self.test_mode else "training"} dataset: {len(self)}',  # noqa: E501
+                'current')
+            content_show = [['category', 'number']]
+            for label, num in enumerate(self.num_ins_per_cat):
+                cat_name = self.metainfo['classes'][label]
+                content_show.append([cat_name, num])
+            table = AsciiTable(content_show)
+            print_log(
+                f'The number of instances per category in the dataset:\n{table.table}',  # noqa: E501
+                'current')
+
+    def _remove_dontcare(self, ann_info: dict) -> dict:
+        """Remove annotations that do not need to be cared.
+
+        -1 indicates dontcare in MMDet3d.
+
+        Args:
+            ann_info (dict): Dict of annotation infos. The
+                instance with label `-1` will be removed.
+
+        Returns:
+            dict: Annotations after filtering.
+        """
+        img_filtered_annotations = {}
+        filter_mask = ann_info['gt_labels_3d'] > -1
+        for key in ann_info.keys():
+            if key != 'instances':
+                img_filtered_annotations[key] = (ann_info[key][filter_mask])
+            else:
+                img_filtered_annotations[key] = ann_info[key]
+        return img_filtered_annotations
+
+    def get_ann_info(self, index: int) -> dict:
+        """Get annotation info according to the given index.
+
+        Use index to get the corresponding annotations, thus the
+        evalhook could use this api.
+
+        Args:
+            index (int): Index of the annotation data to get.
+
+        Returns:
+            dict: Annotation information.
+        """
+        data_info = self.get_data_info(index)
+        # test model
+        if 'ann_info' not in data_info:
+            ann_info = self.parse_ann_info(data_info)
+        else:
+            ann_info = data_info['ann_info']
+
+        return ann_info
+
+    def parse_ann_info(self, info: dict) -> Union[dict, None]:
+        """Process the `instances` in data info to `ann_info`.
+
+        In `Custom3DDataset`, we simply concatenate all the field
+        in `instances` to `np.ndarray`, you can do the specific
+        process in subclass. You have to convert `gt_bboxes_3d`
+        to different coordinates according to the task.
+
+        Args:
+            info (dict): Info dict.
+
+        Returns:
+            dict or None: Processed `ann_info`.
+        """
+        # add s or gt prefix for most keys after concat
+        # we only process 3d annotations here, the corresponding
+        # 2d annotation process is in the `LoadAnnotations3D`
+        # in `transforms`
+        name_mapping = {
+            'bbox_label_3d': 'gt_labels_3d',
+            'bbox_label': 'gt_bboxes_labels',
+            'bbox': 'gt_bboxes',
+            'bbox_3d': 'gt_bboxes_3d',
+            'depth': 'depths',
+            'center_2d': 'centers_2d',
+            'attr_label': 'attr_labels',
+            'velocity': 'velocities',
+        }
+        instances = info['instances']
+        # empty gt
+        if len(instances) == 0:
+            return None
+        else:
+            keys = list(instances[0].keys())
+            ann_info = dict()
+            for ann_name in keys:
+                temp_anns = [item[ann_name] for item in instances]
+                # map the original dataset label to training label
+                if 'label' in ann_name and ann_name != 'attr_label':
+                    temp_anns = [
+                        self.label_mapping[item] for item in temp_anns
+                    ]
+                if ann_name in name_mapping:
+                    mapped_ann_name = name_mapping[ann_name]
+                else:
+                    mapped_ann_name = ann_name
+
+                if 'label' in ann_name:
+                    temp_anns = np.array(temp_anns).astype(np.int64)
+                elif ann_name in name_mapping:
+                    temp_anns = np.array(temp_anns).astype(np.float32)
+                else:
+                    temp_anns = np.array(temp_anns)
+
+                ann_info[mapped_ann_name] = temp_anns
+            ann_info['instances'] = info['instances']
+
+            for label in ann_info['gt_labels_3d']:
+                if label != -1:
+                    self.num_ins_per_cat[label] += 1
+
+        return ann_info
+
+    def parse_data_info(self, info: dict) -> dict:
+        """Process the raw data info.
+
+        Convert all relative path of needed modality data file to
+        the absolute path. And process the `instances` field to
+        `ann_info` in training stage.
+
+        Args:
+            info (dict): Raw info dict.
+
+        Returns:
+            dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
+        """
+
+        if self.modality['use_lidar']:
+            info['lidar_points']['lidar_path'] = \
+                osp.join(
+                    self.data_prefix.get('pts', ''),
+                    info['lidar_points']['lidar_path'])
+
+            info['num_pts_feats'] = info['lidar_points']['num_pts_feats']
+            info['lidar_path'] = info['lidar_points']['lidar_path']
+            if 'lidar_sweeps' in info:
+                for sweep in info['lidar_sweeps']:
+                    file_suffix = sweep['lidar_points']['lidar_path'].split(
+                        os.sep)[-1]
+                    if 'samples' in sweep['lidar_points']['lidar_path']:
+                        sweep['lidar_points']['lidar_path'] = osp.join(
+                            self.data_prefix['pts'], file_suffix)
+                    else:
+                        sweep['lidar_points']['lidar_path'] = osp.join(
+                            self.data_prefix['sweeps'], file_suffix)
+
+        if self.modality['use_camera']:
+            for cam_id, img_info in info['images'].items():
+                if 'img_path' in img_info:
+                    if cam_id in self.data_prefix:
+                        cam_prefix = self.data_prefix[cam_id]
+                    else:
+                        cam_prefix = self.data_prefix.get('img', '')
+                    img_info['img_path'] = osp.join(cam_prefix,
+                                                    img_info['img_path'])
+            if self.default_cam_key is not None:
+                info['img_path'] = info['images'][
+                    self.default_cam_key]['img_path']
+                if 'lidar2cam' in info['images'][self.default_cam_key]:
+                    info['lidar2cam'] = np.array(
+                        info['images'][self.default_cam_key]['lidar2cam'])
+                if 'cam2img' in info['images'][self.default_cam_key]:
+                    info['cam2img'] = np.array(
+                        info['images'][self.default_cam_key]['cam2img'])
+                if 'lidar2img' in info['images'][self.default_cam_key]:
+                    info['lidar2img'] = np.array(
+                        info['images'][self.default_cam_key]['lidar2img'])
+                else:
+                    info['lidar2img'] = info['cam2img'] @ info['lidar2cam']
+
+        if not self.test_mode:
+            # used in training
+            info['ann_info'] = self.parse_ann_info(info)
+        if self.test_mode and self.load_eval_anns:
+            info['eval_ann_info'] = self.parse_ann_info(info)
+
+        return info
+
+    def _show_ins_var(self, old_labels: np.ndarray,
+                      new_labels: torch.Tensor) -> None:
+        """Show variation of the number of instances before and after through
+        the pipeline.
+
+        Args:
+            old_labels (np.ndarray): The labels before through the pipeline.
+            new_labels (torch.Tensor): The labels after through the pipeline.
+        """
+        ori_num_per_cat = dict()
+        for label in old_labels:
+            if label != -1:
+                cat_name = self.metainfo['classes'][label]
+                ori_num_per_cat[cat_name] = ori_num_per_cat.get(cat_name,
+                                                                0) + 1
+        new_num_per_cat = dict()
+        for label in new_labels:
+            if label != -1:
+                cat_name = self.metainfo['classes'][label]
+                new_num_per_cat[cat_name] = new_num_per_cat.get(cat_name,
+                                                                0) + 1
+        content_show = [['category', 'new number', 'ori number']]
+        for cat_name, num in ori_num_per_cat.items():
+            new_num = new_num_per_cat.get(cat_name, 0)
+            content_show.append([cat_name, new_num, num])
+        table = AsciiTable(content_show)
+        print_log(
+            'The number of instances per category after and before '
+            f'through pipeline:\n{table.table}', 'current')
+
+    def prepare_data(self, index: int) -> Union[dict, None]:
+        """Data preparation for both training and testing stage.
+
+        Called by `__getitem__`  of dataset.
+
+        Args:
+            index (int): Index for accessing the target data.
+
+        Returns:
+            dict or None: Data dict of the corresponding index.
+        """
+        ori_input_dict = self.get_data_info(index)
+
+        # deepcopy here to avoid inplace modification in pipeline.
+        input_dict = copy.deepcopy(ori_input_dict)
+
+        # box_type_3d (str): 3D box type.
+        input_dict['box_type_3d'] = self.box_type_3d
+        # box_mode_3d (str): 3D box mode.
+        input_dict['box_mode_3d'] = self.box_mode_3d
+
+        # pre-pipline return None to random another in `__getitem__`
+        if not self.test_mode and self.filter_empty_gt:
+            if len(input_dict['ann_info']['gt_labels_3d']) == 0:
+                return None
+
+        example = self.pipeline(input_dict)
+
+        if not self.test_mode and self.filter_empty_gt:
+            # after pipeline drop the example with empty annotations
+            # return None to random another in `__getitem__`
+            if example is None or len(
+                    example['data_samples'].gt_instances_3d.labels_3d) == 0:
+                return None
+
+        if self.show_ins_var:
+            if 'ann_info' in ori_input_dict:
+                self._show_ins_var(
+                    ori_input_dict['ann_info']['gt_labels_3d'],
+                    example['data_samples'].gt_instances_3d.labels_3d)
+            else:
+                print_log(
+                    "'ann_info' is not in the input dict. It's probably that "
+                    'the data is not in training mode',
+                    'current',
+                    level=30)
+
+        return example
+
+    def get_cat_ids(self, idx: int) -> Set[int]:
+        """Get category ids by index. Dataset wrapped by ClassBalancedDataset
+        must implement this method.
+
+        The ``CBGSDataset`` or ``ClassBalancedDataset``requires a subclass
+        which implements this method.
+
+        Args:
+            idx (int): The index of data.
+
+        Returns:
+            set[int]: All categories in the sample of specified index.
+        """
+        info = self.get_data_info(idx)
+        gt_labels = info['ann_info']['gt_labels_3d'].tolist()
+        return set(gt_labels)
diff --git a/mmde/mmdet3d/datasets/kitti2d_dataset.py b/mmde/mmdet3d/datasets/kitti2d_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..780ecbfdeb35179f25bcc40f47bb98f706dccbc1
--- /dev/null
+++ b/mmde/mmdet3d/datasets/kitti2d_dataset.py
@@ -0,0 +1,241 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmengine
+import numpy as np
+
+from mmdet3d.datasets import Det3DDataset
+from mmdet3d.registry import DATASETS
+
+
+@DATASETS.register_module()
+class Kitti2DDataset(Det3DDataset):
+    r"""KITTI 2D Dataset.
+
+    This class serves as the API for experiments on the `KITTI Dataset
+    <http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d>`_.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        pipeline (list[dict], optional): Pipeline used for data processing.
+            Defaults to None.
+        classes (tuple[str], optional): Classes used in the dataset.
+            Defaults to None.
+        modality (dict, optional): Modality to specify the sensor data used
+            as input. Defaults to None.
+        box_type_3d (str, optional): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'LiDAR'. Available options includes
+
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        filter_empty_gt (bool, optional): Whether to filter empty GT.
+            Defaults to True.
+        test_mode (bool, optional): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+
+    classes = ('car', 'pedestrian', 'cyclist')
+    """
+    Annotation format:
+    [
+        {
+            'image': {
+                'image_idx': 0,
+                'image_path': 'training/image_2/000000.png',
+                'image_shape': array([ 370, 1224], dtype=int32)
+            },
+            'point_cloud': {
+                 'num_features': 4,
+                 'velodyne_path': 'training/velodyne/000000.bin'
+             },
+             'calib': {
+                 'P0': <np.ndarray> (4, 4),
+                 'P1': <np.ndarray> (4, 4),
+                 'P2': <np.ndarray> (4, 4),
+                 'P3': <np.ndarray> (4, 4),
+                 'R0_rect':4x4 np.array,
+                 'Tr_velo_to_cam': 4x4 np.array,
+                 'Tr_imu_to_velo': 4x4 np.array
+             },
+             'annos': {
+                 'name': <np.ndarray> (n),
+                 'truncated': <np.ndarray> (n),
+                 'occluded': <np.ndarray> (n),
+                 'alpha': <np.ndarray> (n),
+                 'bbox': <np.ndarray> (n, 4),
+                 'dimensions': <np.ndarray> (n, 3),
+                 'location': <np.ndarray> (n, 3),
+                 'rotation_y': <np.ndarray> (n),
+                 'score': <np.ndarray> (n),
+                 'index': array([0], dtype=int32),
+                 'group_ids': array([0], dtype=int32),
+                 'difficulty': array([0], dtype=int32),
+                 'num_points_in_gt': <np.ndarray> (n),
+             }
+        }
+    ]
+    """
+
+    def load_annotations(self, ann_file):
+        """Load annotations from ann_file.
+
+        Args:
+            ann_file (str): Path of the annotation file.
+
+        Returns:
+            list[dict]: List of annotations.
+        """
+        self.data_infos = mmengine.load(ann_file)
+        self.cat2label = {
+            cat_name: i
+            for i, cat_name in enumerate(self.classes)
+        }
+        return self.data_infos
+
+    def _filter_imgs(self, min_size=32):
+        """Filter images without ground truths."""
+        valid_inds = []
+        for i, img_info in enumerate(self.data_infos):
+            if len(img_info['annos']['name']) > 0:
+                valid_inds.append(i)
+        return valid_inds
+
+    def get_ann_info(self, index):
+        """Get annotation info according to the given index.
+
+        Args:
+            index (int): Index of the annotation data to get.
+
+        Returns:
+            dict: Annotation information consists of the following keys:
+
+                - bboxes (np.ndarray): Ground truth bboxes.
+                - labels (np.ndarray): Labels of ground truths.
+        """
+        # Use index to get the annos, thus the evalhook could also use this api
+        info = self.data_infos[index]
+        annos = info['annos']
+        gt_names = annos['name']
+        gt_bboxes = annos['bbox']
+        difficulty = annos['difficulty']
+
+        # remove classes that is not needed
+        selected = self.keep_arrays_by_name(gt_names, self.classes)
+        gt_bboxes = gt_bboxes[selected]
+        gt_names = gt_names[selected]
+        difficulty = difficulty[selected]
+        gt_labels = np.array([self.cat2label[n] for n in gt_names])
+
+        anns_results = dict(
+            bboxes=gt_bboxes.astype(np.float32),
+            labels=gt_labels,
+        )
+        return anns_results
+
+    def prepare_train_img(self, idx):
+        """Training image preparation.
+
+        Args:
+            index (int): Index for accessing the target image data.
+
+        Returns:
+            dict: Training image data dict after preprocessing
+                corresponding to the index.
+        """
+        img_raw_info = self.data_infos[idx]['image']
+        img_info = dict(filename=img_raw_info['image_path'])
+        ann_info = self.get_ann_info(idx)
+        if len(ann_info['bboxes']) == 0:
+            return None
+        results = dict(img_info=img_info, ann_info=ann_info)
+        if self.proposals is not None:
+            results['proposals'] = self.proposals[idx]
+        self.pre_pipeline(results)
+        return self.pipeline(results)
+
+    def prepare_test_img(self, idx):
+        """Prepare data for testing.
+
+        Args:
+            index (int): Index for accessing the target image data.
+
+        Returns:
+            dict: Testing image data dict after preprocessing
+                corresponding to the index.
+        """
+        img_raw_info = self.data_infos[idx]['image']
+        img_info = dict(filename=img_raw_info['image_path'])
+        results = dict(img_info=img_info)
+        if self.proposals is not None:
+            results['proposals'] = self.proposals[idx]
+        self.pre_pipeline(results)
+        return self.pipeline(results)
+
+    def drop_arrays_by_name(self, gt_names, used_classes):
+        """Drop irrelevant ground truths by name.
+
+        Args:
+            gt_names (list[str]): Names of ground truths.
+            used_classes (list[str]): Classes of interest.
+
+        Returns:
+            np.ndarray: Indices of ground truths that will be dropped.
+        """
+        inds = [i for i, x in enumerate(gt_names) if x not in used_classes]
+        inds = np.array(inds, dtype=np.int64)
+        return inds
+
+    def keep_arrays_by_name(self, gt_names, used_classes):
+        """Keep useful ground truths by name.
+
+        Args:
+            gt_names (list[str]): Names of ground truths.
+            used_classes (list[str]): Classes of interest.
+
+        Returns:
+            np.ndarray: Indices of ground truths that will be keeped.
+        """
+        inds = [i for i, x in enumerate(gt_names) if x in used_classes]
+        inds = np.array(inds, dtype=np.int64)
+        return inds
+
+    def reformat_bbox(self, outputs, out=None):
+        """Reformat bounding boxes to KITTI 2D styles.
+
+        Args:
+            outputs (list[np.ndarray]): List of arrays storing the inferenced
+                bounding boxes and scores.
+            out (str, optional): The prefix of output file.
+                Default: None.
+
+        Returns:
+            list[dict]: A list of dictionaries with the kitti 2D format.
+        """
+        from mmdet3d.structures.ops.transforms import bbox2result_kitti2d
+        sample_idx = [info['image']['image_idx'] for info in self.data_infos]
+        result_files = bbox2result_kitti2d(outputs, self.classes, sample_idx,
+                                           out)
+        return result_files
+
+    def evaluate(self, result_files, eval_types=None):
+        """Evaluation in KITTI protocol.
+
+        Args:
+            result_files (str): Path of result files.
+            eval_types (str, optional): Types of evaluation. Default: None.
+                KITTI dataset only support 'bbox' evaluation type.
+
+        Returns:
+            tuple (str, dict): Average precision results in str format
+                and average precision results in dict format.
+        """
+        from mmdet3d.evaluation import kitti_eval
+        eval_types = ['bbox'] if not eval_types else eval_types
+        assert eval_types in ('bbox', ['bbox'
+                                       ]), 'KITTI data set only evaluate bbox'
+        gt_annos = [info['annos'] for info in self.data_infos]
+        ap_result_str, ap_dict = kitti_eval(
+            gt_annos, result_files, self.classes, eval_types=['bbox'])
+        return ap_result_str, ap_dict
diff --git a/mmde/mmdet3d/datasets/kitti_dataset.py b/mmde/mmdet3d/datasets/kitti_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba77443462eb85ac981297c6e9696468d8ad96f
--- /dev/null
+++ b/mmde/mmdet3d/datasets/kitti_dataset.py
@@ -0,0 +1,173 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Callable, List, Union
+
+import numpy as np
+
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures import CameraInstance3DBoxes
+from .det3d_dataset import Det3DDataset
+
+
+@DATASETS.register_module()
+class KittiDataset(Det3DDataset):
+    r"""KITTI Dataset.
+
+    This class serves as the API for experiments on the `KITTI Dataset
+    <http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d>`_.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_lidar=True).
+        default_cam_key (str): The default camera name adopted.
+            Defaults to 'CAM2'.
+        load_type (str): Type of loading mode. Defaults to 'frame_based'.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and need
+              to convert to the FOV-based data type to support image-based
+              detector.
+            - 'fov_image_based': Only load the instances inside the default
+              cam, and need to convert to the FOV-based data type to support
+              image-based detector.
+        box_type_3d (str): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'LiDAR' in this dataset. Available options includes:
+
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+        pcd_limit_range (List[float]): The range of point cloud used to filter
+            invalid predicted boxes.
+            Defaults to [0, -40, -3, 70.4, 40, 0.0].
+    """
+    # TODO: use full classes of kitti
+    METAINFO = {
+        'classes': ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
+                    'Person_sitting', 'Tram', 'Misc'),
+        'palette': [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192),
+                    (197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255)]
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_lidar=True),
+                 default_cam_key: str = 'CAM2',
+                 load_type: str = 'frame_based',
+                 box_type_3d: str = 'LiDAR',
+                 filter_empty_gt: bool = True,
+                 test_mode: bool = False,
+                 pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
+                 **kwargs) -> None:
+
+        self.pcd_limit_range = pcd_limit_range
+        assert load_type in ('frame_based', 'mv_image_based',
+                             'fov_image_based')
+        self.load_type = load_type
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            pipeline=pipeline,
+            modality=modality,
+            default_cam_key=default_cam_key,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode,
+            **kwargs)
+        assert self.modality is not None
+        assert box_type_3d.lower() in ('lidar', 'camera')
+
+    def parse_data_info(self, info: dict) -> dict:
+        """Process the raw data info.
+
+        The only difference with it in `Det3DDataset`
+        is the specific process for `plane`.
+
+        Args:
+            info (dict): Raw info dict.
+
+        Returns:
+            dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
+        """
+        if self.modality['use_lidar']:
+            if 'plane' in info:
+                # convert ground plane to velodyne coordinates
+                plane = np.array(info['plane'])
+                lidar2cam = np.array(
+                    info['images']['CAM2']['lidar2cam'], dtype=np.float32)
+                reverse = np.linalg.inv(lidar2cam)
+
+                (plane_norm_cam, plane_off_cam) = (plane[:3],
+                                                   -plane[:3] * plane[3])
+                plane_norm_lidar = \
+                    (reverse[:3, :3] @ plane_norm_cam[:, None])[:, 0]
+                plane_off_lidar = (
+                    reverse[:3, :3] @ plane_off_cam[:, None][:, 0] +
+                    reverse[:3, 3])
+                plane_lidar = np.zeros_like(plane_norm_lidar, shape=(4, ))
+                plane_lidar[:3] = plane_norm_lidar
+                plane_lidar[3] = -plane_norm_lidar.T @ plane_off_lidar
+            else:
+                plane_lidar = None
+
+            info['plane'] = plane_lidar
+
+        if self.load_type == 'fov_image_based' and self.load_eval_anns:
+            info['instances'] = info['cam_instances'][self.default_cam_key]
+
+        info = super().parse_data_info(info)
+
+        return info
+
+    def parse_ann_info(self, info: dict) -> dict:
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Data information of single data sample.
+
+        Returns:
+            dict: Annotation information consists of the following keys:
+
+                - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
+                  3D ground truth bboxes.
+                - bbox_labels_3d (np.ndarray): Labels of ground truths.
+                - gt_bboxes (np.ndarray): 2D ground truth bboxes.
+                - gt_labels (np.ndarray): Labels of ground truths.
+                - difficulty (int): Difficulty defined by KITTI.
+                  0, 1, 2 represent xxxxx respectively.
+        """
+        ann_info = super().parse_ann_info(info)
+        if ann_info is None:
+            ann_info = dict()
+            # empty instance
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
+
+            if self.load_type in ['fov_image_based', 'mv_image_based']:
+                ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
+                ann_info['gt_bboxes_labels'] = np.array(0, dtype=np.int64)
+                ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
+                ann_info['depths'] = np.zeros((0), dtype=np.float32)
+
+        ann_info = self._remove_dontcare(ann_info)
+        # in kitti, lidar2cam = R0_rect @ Tr_velo_to_cam
+        lidar2cam = np.array(info['images']['CAM2']['lidar2cam'])
+        # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam`
+        gt_bboxes_3d = CameraInstance3DBoxes(
+            ann_info['gt_bboxes_3d']).convert_to(self.box_mode_3d,
+                                                 np.linalg.inv(lidar2cam))
+        ann_info['gt_bboxes_3d'] = gt_bboxes_3d
+        return ann_info
diff --git a/mmde/mmdet3d/datasets/lyft_dataset.py b/mmde/mmdet3d/datasets/lyft_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e878acc0a3e0506e74cb901a5b1a24feace89dd8
--- /dev/null
+++ b/mmde/mmdet3d/datasets/lyft_dataset.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Callable, List, Union
+
+import numpy as np
+
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures import LiDARInstance3DBoxes
+from .det3d_dataset import Det3DDataset
+
+
+@DATASETS.register_module()
+class LyftDataset(Det3DDataset):
+    r"""Lyft Dataset.
+
+    This class serves as the API for experiments on the Lyft Dataset.
+
+    Please refer to
+    `<https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data>`_
+    for data downloading.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=False, use_lidar=True).
+        box_type_3d (str): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'LiDAR' in this dataset. Available options includes:
+
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+
+    METAINFO = {
+        'classes':
+        ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
+         'motorcycle', 'bicycle', 'pedestrian', 'animal'),
+        'palette': [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192),
+                    (197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255),
+                    (153, 69, 1)]
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_camera=False, use_lidar=True),
+                 box_type_3d: str = 'LiDAR',
+                 filter_empty_gt: bool = True,
+                 test_mode: bool = False,
+                 **kwargs):
+        assert box_type_3d.lower() in ['lidar']
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            pipeline=pipeline,
+            modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode,
+            **kwargs)
+
+    def parse_ann_info(self, info: dict) -> dict:
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Data information of single data sample.
+
+        Returns:
+            dict: Annotation information consists of the following keys:
+
+                - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
+                  3D ground truth bboxes.
+                - gt_labels_3d (np.ndarray): Labels of 3D ground truths.
+        """
+        ann_info = super().parse_ann_info(info)
+        if ann_info is None:
+            # empty instance
+            anns_results = dict()
+            anns_results['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
+            anns_results['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
+            return anns_results
+        gt_bboxes_3d = ann_info['gt_bboxes_3d']
+        gt_labels_3d = ann_info['gt_labels_3d']
+
+        # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
+        # the same as KITTI (0.5, 0.5, 0)
+        gt_bboxes_3d = LiDARInstance3DBoxes(
+            gt_bboxes_3d,
+            box_dim=gt_bboxes_3d.shape[-1],
+            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+        anns_results = dict(
+            gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d)
+        return anns_results
diff --git a/mmde/mmdet3d/datasets/nuscenes_dataset.py b/mmde/mmdet3d/datasets/nuscenes_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..553480a588920b4885e60803aa2b7556f8f28d83
--- /dev/null
+++ b/mmde/mmdet3d/datasets/nuscenes_dataset.py
@@ -0,0 +1,248 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from os import path as osp
+from typing import Callable, List, Union
+
+import numpy as np
+
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures import LiDARInstance3DBoxes
+from mmdet3d.structures.bbox_3d.cam_box3d import CameraInstance3DBoxes
+from .det3d_dataset import Det3DDataset
+
+
+@DATASETS.register_module()
+class NuScenesDataset(Det3DDataset):
+    r"""NuScenes Dataset.
+
+    This class serves as the API for experiments on the NuScenes Dataset.
+
+    Please refer to `NuScenes Dataset <https://www.nuscenes.org/download>`_
+    for data downloading.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        pipeline (list[dict]): Pipeline used for data processing.
+            Defaults to [].
+        box_type_3d (str): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'LiDAR' in this dataset. Available options includes:
+
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        load_type (str): Type of loading mode. Defaults to 'frame_based'.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and need
+                to convert to the FOV-based data type to support image-based
+                detector.
+            - 'fov_image_based': Only load the instances inside the default
+                cam, and need to convert to the FOV-based data type to support
+                image-based detector.
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=False, use_lidar=True).
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+        with_velocity (bool): Whether to include velocity prediction
+            into the experiments. Defaults to True.
+        use_valid_flag (bool): Whether to use `use_valid_flag` key
+            in the info file as mask to filter gt_boxes and gt_names.
+            Defaults to False.
+    """
+    METAINFO = {
+        'classes':
+        ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+         'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'),
+        'version':
+        'v1.0-trainval',
+        'palette': [
+            (255, 158, 0),  # Orange
+            (255, 99, 71),  # Tomato
+            (255, 140, 0),  # Darkorange
+            (255, 127, 80),  # Coral
+            (233, 150, 70),  # Darksalmon
+            (220, 20, 60),  # Crimson
+            (255, 61, 99),  # Red
+            (0, 0, 230),  # Blue
+            (47, 79, 79),  # Darkslategrey
+            (112, 128, 144),  # Slategrey
+        ]
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 box_type_3d: str = 'LiDAR',
+                 load_type: str = 'frame_based',
+                 modality: dict = dict(
+                     use_camera=False,
+                     use_lidar=True,
+                 ),
+                 filter_empty_gt: bool = True,
+                 test_mode: bool = False,
+                 with_velocity: bool = True,
+                 use_valid_flag: bool = False,
+                 **kwargs) -> None:
+        self.use_valid_flag = use_valid_flag
+        self.with_velocity = with_velocity
+
+        # TODO: Redesign multi-view data process in the future
+        assert load_type in ('frame_based', 'mv_image_based',
+                             'fov_image_based')
+        self.load_type = load_type
+
+        assert box_type_3d.lower() in ('lidar', 'camera')
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            modality=modality,
+            pipeline=pipeline,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode,
+            **kwargs)
+
+    def _filter_with_mask(self, ann_info: dict) -> dict:
+        """Remove annotations that do not need to be cared.
+
+        Args:
+            ann_info (dict): Dict of annotation infos.
+
+        Returns:
+            dict: Annotations after filtering.
+        """
+        filtered_annotations = {}
+        if self.use_valid_flag:
+            filter_mask = ann_info['bbox_3d_isvalid']
+        else:
+            filter_mask = ann_info['num_lidar_pts'] > 0
+        for key in ann_info.keys():
+            if key != 'instances':
+                filtered_annotations[key] = (ann_info[key][filter_mask])
+            else:
+                filtered_annotations[key] = ann_info[key]
+        return filtered_annotations
+
+    def parse_ann_info(self, info: dict) -> dict:
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Data information of single data sample.
+
+        Returns:
+            dict: Annotation information consists of the following keys:
+
+                - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`):
+                  3D ground truth bboxes.
+                - gt_labels_3d (np.ndarray): Labels of ground truths.
+        """
+        ann_info = super().parse_ann_info(info)
+        if ann_info is not None:
+
+            ann_info = self._filter_with_mask(ann_info)
+
+            if self.with_velocity:
+                gt_bboxes_3d = ann_info['gt_bboxes_3d']
+                gt_velocities = ann_info['velocities']
+                nan_mask = np.isnan(gt_velocities[:, 0])
+                gt_velocities[nan_mask] = [0.0, 0.0]
+                gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocities],
+                                              axis=-1)
+                ann_info['gt_bboxes_3d'] = gt_bboxes_3d
+        else:
+            # empty instance
+            ann_info = dict()
+            if self.with_velocity:
+                ann_info['gt_bboxes_3d'] = np.zeros((0, 9), dtype=np.float32)
+            else:
+                ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
+
+            if self.load_type in ['fov_image_based', 'mv_image_based']:
+                ann_info['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32)
+                ann_info['gt_bboxes_labels'] = np.array(0, dtype=np.int64)
+                ann_info['attr_labels'] = np.array(0, dtype=np.int64)
+                ann_info['centers_2d'] = np.zeros((0, 2), dtype=np.float32)
+                ann_info['depths'] = np.zeros((0), dtype=np.float32)
+
+        # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
+        # the same as KITTI (0.5, 0.5, 0)
+        # TODO: Unify the coordinates
+        if self.load_type in ['fov_image_based', 'mv_image_based']:
+            gt_bboxes_3d = CameraInstance3DBoxes(
+                ann_info['gt_bboxes_3d'],
+                box_dim=ann_info['gt_bboxes_3d'].shape[-1],
+                origin=(0.5, 0.5, 0.5))
+        else:
+            gt_bboxes_3d = LiDARInstance3DBoxes(
+                ann_info['gt_bboxes_3d'],
+                box_dim=ann_info['gt_bboxes_3d'].shape[-1],
+                origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+        ann_info['gt_bboxes_3d'] = gt_bboxes_3d
+
+        return ann_info
+
+    def parse_data_info(self, info: dict) -> Union[List[dict], dict]:
+        """Process the raw data info.
+
+        The only difference with it in `Det3DDataset`
+        is the specific process for `plane`.
+
+        Args:
+            info (dict): Raw info dict.
+
+        Returns:
+            List[dict] or dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
+        """
+        if self.load_type == 'mv_image_based':
+            data_list = []
+            if self.modality['use_lidar']:
+                info['lidar_points']['lidar_path'] = \
+                    osp.join(
+                        self.data_prefix.get('pts', ''),
+                        info['lidar_points']['lidar_path'])
+
+            if self.modality['use_camera']:
+                for cam_id, img_info in info['images'].items():
+                    if 'img_path' in img_info:
+                        if cam_id in self.data_prefix:
+                            cam_prefix = self.data_prefix[cam_id]
+                        else:
+                            cam_prefix = self.data_prefix.get('img', '')
+                        img_info['img_path'] = osp.join(
+                            cam_prefix, img_info['img_path'])
+
+            for idx, (cam_id, img_info) in enumerate(info['images'].items()):
+                camera_info = dict()
+                camera_info['images'] = dict()
+                camera_info['images'][cam_id] = img_info
+                if 'cam_instances' in info and cam_id in info['cam_instances']:
+                    camera_info['instances'] = info['cam_instances'][cam_id]
+                else:
+                    camera_info['instances'] = []
+                # TODO: check whether to change sample_idx for 6 cameras
+                #  in one frame
+                camera_info['sample_idx'] = info['sample_idx'] * 6 + idx
+                camera_info['token'] = info['token']
+                camera_info['ego2global'] = info['ego2global']
+
+                if not self.test_mode:
+                    # used in traing
+                    camera_info['ann_info'] = self.parse_ann_info(camera_info)
+                if self.test_mode and self.load_eval_anns:
+                    camera_info['eval_ann_info'] = \
+                        self.parse_ann_info(camera_info)
+                data_list.append(camera_info)
+            return data_list
+        else:
+            data_info = super().parse_data_info(info)
+            return data_info
diff --git a/mmde/mmdet3d/datasets/s3dis_dataset.py b/mmde/mmdet3d/datasets/s3dis_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed289cab10f31475851f979c0d70d4780d8cb185
--- /dev/null
+++ b/mmde/mmdet3d/datasets/s3dis_dataset.py
@@ -0,0 +1,364 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from os import path as osp
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import numpy as np
+
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures import DepthInstance3DBoxes
+from .det3d_dataset import Det3DDataset
+from .seg3d_dataset import Seg3DDataset
+
+
+@DATASETS.register_module()
+class S3DISDataset(Det3DDataset):
+    r"""S3DIS Dataset for Detection Task.
+
+    This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we
+    often train on 5 of them and test on the remaining one. The one for
+    test is Area_5 as suggested in `GSDN <https://arxiv.org/abs/2006.12356>`_.
+    To concatenate 5 areas during training
+    `mmengine.datasets.dataset_wrappers.ConcatDataset` should be used.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for data. Defaults to
+            dict(pts='points',
+                 pts_instance_mask='instance_mask',
+                 pts_semantic_mask='semantic_mask').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=False, use_lidar=True).
+        box_type_3d (str): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'Depth' in this dataset. Available options includes:
+
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+    METAINFO = {
+        'classes': ('table', 'chair', 'sofa', 'bookcase', 'board'),
+        # the valid ids of segmentation annotations
+        'seg_valid_class_ids': (7, 8, 9, 10, 11),
+        'seg_all_class_ids':
+        tuple(range(1, 14)),  # possibly with 'stair' class
+        'palette': [(170, 120, 200), (255, 0, 0), (200, 100, 100),
+                    (10, 200, 100), (200, 200, 200)]
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='points',
+                     pts_instance_mask='instance_mask',
+                     pts_semantic_mask='semantic_mask'),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_camera=False, use_lidar=True),
+                 box_type_3d: str = 'Depth',
+                 filter_empty_gt: bool = True,
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+
+        # construct seg_label_mapping for semantic mask
+        seg_max_cat_id = len(self.METAINFO['seg_all_class_ids'])
+        seg_valid_cat_ids = self.METAINFO['seg_valid_class_ids']
+        neg_label = len(seg_valid_cat_ids)
+        seg_label_mapping = np.ones(
+            seg_max_cat_id + 1, dtype=np.int64) * neg_label
+        for cls_idx, cat_id in enumerate(seg_valid_cat_ids):
+            seg_label_mapping[cat_id] = cls_idx
+        self.seg_label_mapping = seg_label_mapping
+
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode,
+            **kwargs)
+
+        self.metainfo['seg_label_mapping'] = self.seg_label_mapping
+        assert 'use_camera' in self.modality and \
+               'use_lidar' in self.modality
+        assert self.modality['use_camera'] or self.modality['use_lidar']
+
+    def parse_data_info(self, info: dict) -> dict:
+        """Process the raw data info.
+
+        Args:
+            info (dict): Raw info dict.
+
+        Returns:
+            dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
+        """
+        info['pts_instance_mask_path'] = osp.join(
+            self.data_prefix.get('pts_instance_mask', ''),
+            info['pts_instance_mask_path'])
+        info['pts_semantic_mask_path'] = osp.join(
+            self.data_prefix.get('pts_semantic_mask', ''),
+            info['pts_semantic_mask_path'])
+
+        info = super().parse_data_info(info)
+        # only be used in `PointSegClassMapping` in pipeline
+        # to map original semantic class to valid category ids.
+        info['seg_label_mapping'] = self.seg_label_mapping
+        return info
+
+    def parse_ann_info(self, info: dict) -> dict:
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Info dict.
+
+        Returns:
+            dict: Processed `ann_info`.
+        """
+        ann_info = super().parse_ann_info(info)
+        # empty gt
+        if ann_info is None:
+            ann_info = dict()
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
+        # to target box structure
+
+        ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
+            ann_info['gt_bboxes_3d'],
+            box_dim=ann_info['gt_bboxes_3d'].shape[-1],
+            with_yaw=False,
+            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+        return ann_info
+
+
+class _S3DISSegDataset(Seg3DDataset):
+    r"""S3DIS Dataset for Semantic Segmentation Task.
+
+    This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we
+    often train on 5 of them and test on the remaining one.
+    However, there is not a fixed train-test split of S3DIS. People often test
+    on Area_5 as suggested by `SEGCloud <https://arxiv.org/abs/1710.07563>`_.
+    But many papers also report the average results of 6-fold cross validation
+    over the 6 areas (e.g. `DGCNN <https://arxiv.org/abs/1801.07829>`_).
+    Therefore, we use an inner dataset for one area, and further use a dataset
+    wrapper to concat all the provided data in different areas.
+
+    Args:
+        data_root (str, optional): Path of dataset root, Defaults to None.
+        ann_file (str): Path of annotation file. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(pts='points', pts_instance_mask='', pts_semantic_mask='').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_lidar=True, use_camera=False).
+        ignore_index (int, optional): The label index to be ignored, e.g.
+            unannotated points. If None is given, set to len(self.classes) to
+            be consistent with PointSegClassMapping function in pipeline.
+            Defaults to None.
+        scene_idxs (np.ndarray or str, optional): Precomputed index to load
+            data. For scenes with many points, we may sample it several times.
+            Defaults to None.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+    METAINFO = {
+        'classes':
+        ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+         'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter'),
+        'palette': [[0, 255, 0], [0, 0, 255], [0, 255, 255], [255, 255, 0],
+                    [255, 0, 255], [100, 100, 255], [200, 200, 100],
+                    [170, 120, 200], [255, 0, 0], [200, 100, 100],
+                    [10, 200, 100], [200, 200, 200], [50, 50, 50]],
+        'seg_valid_class_ids':
+        tuple(range(13)),
+        'seg_all_class_ids':
+        tuple(range(14))  # possibly with 'stair' class
+    }
+
+    def __init__(self,
+                 data_root: Optional[str] = None,
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='points', pts_instance_mask='', pts_semantic_mask=''),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_lidar=True, use_camera=False),
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[np.ndarray, str]] = None,
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            modality=modality,
+            ignore_index=ignore_index,
+            scene_idxs=scene_idxs,
+            test_mode=test_mode,
+            **kwargs)
+
+    def get_scene_idxs(self, scene_idxs: Union[np.ndarray, str,
+                                               None]) -> np.ndarray:
+        """Compute scene_idxs for data sampling.
+
+        We sample more times for scenes with more points.
+        """
+        # when testing, we load one whole scene every time
+        if not self.test_mode and scene_idxs is None:
+            raise NotImplementedError(
+                'please provide re-sampled scene indexes for training')
+
+        return super().get_scene_idxs(scene_idxs)
+
+
+@DATASETS.register_module()
+class S3DISSegDataset(_S3DISSegDataset):
+    r"""S3DIS Dataset for Semantic Segmentation Task.
+
+    This class serves as the API for experiments on the S3DIS Dataset.
+    It wraps the provided datasets of different areas.
+    We don't use `mmdet.datasets.dataset_wrappers.ConcatDataset` because we
+    need to concat the `scene_idxs` of different areas.
+
+    Please refer to the `google form <https://docs.google.com/forms/d/e/1FAIpQL
+    ScDimvNMCGhy_rmBA2gHfDu3naktRm6A8BPwAWWDv-Uhm6Shw/viewform?c=0&w=1>`_ for
+    data downloading.
+
+    Args:
+        data_root (str, optional): Path of dataset root. Defaults to None.
+        ann_files (List[str]): Path of several annotation files.
+            Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(pts='points', pts_instance_mask='', pts_semantic_mask='').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_lidar=True, use_camera=False).
+        ignore_index (int, optional): The label index to be ignored, e.g.
+            unannotated points. If None is given, set to len(self.classes) to
+            be consistent with PointSegClassMapping function in pipeline.
+            Defaults to None.
+        scene_idxs (List[np.ndarray] | List[str], optional): Precomputed index
+            to load data. For scenes with many points, we may sample it
+            several times. Defaults to None.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 data_root: Optional[str] = None,
+                 ann_files: List[str] = '',
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='points', pts_instance_mask='', pts_semantic_mask=''),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_lidar=True, use_camera=False),
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[List[np.ndarray],
+                                            List[str]]] = None,
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+
+        # make sure that ann_files and scene_idxs have same length
+        ann_files = self._check_ann_files(ann_files)
+        scene_idxs = self._check_scene_idxs(scene_idxs, len(ann_files))
+
+        # initialize some attributes as datasets[0]
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_files[0],
+            metainfo=metainfo,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            modality=modality,
+            ignore_index=ignore_index,
+            scene_idxs=scene_idxs[0],
+            test_mode=test_mode,
+            **kwargs)
+
+        datasets = [
+            _S3DISSegDataset(
+                data_root=data_root,
+                ann_file=ann_files[i],
+                metainfo=metainfo,
+                data_prefix=data_prefix,
+                pipeline=pipeline,
+                modality=modality,
+                ignore_index=ignore_index,
+                scene_idxs=scene_idxs[i],
+                test_mode=test_mode,
+                **kwargs) for i in range(len(ann_files))
+        ]
+
+        # data_list and scene_idxs need to be concat
+        self.concat_data_list([dst.data_list for dst in datasets])
+
+        # set group flag for the sampler
+        if not self.test_mode:
+            self._set_group_flag()
+
+    def concat_data_list(self, data_lists: List[List[dict]]) -> None:
+        """Concat data_list from several datasets to form self.data_list.
+
+        Args:
+            data_lists (List[List[dict]]): List of dict containing
+                annotation information.
+        """
+        self.data_list = [
+            data for data_list in data_lists for data in data_list
+        ]
+
+    @staticmethod
+    def _duplicate_to_list(x: Any, num: int) -> list:
+        """Repeat x `num` times to form a list."""
+        return [x for _ in range(num)]
+
+    def _check_ann_files(
+            self, ann_file: Union[List[str], Tuple[str], str]) -> List[str]:
+        """Make ann_files as list/tuple."""
+        # ann_file could be str
+        if not isinstance(ann_file, (list, tuple)):
+            ann_file = self._duplicate_to_list(ann_file, 1)
+        return ann_file
+
+    def _check_scene_idxs(self, scene_idx: Union[str, List[Union[list, tuple,
+                                                                 np.ndarray]],
+                                                 List[str], None],
+                          num: int) -> List[np.ndarray]:
+        """Make scene_idxs as list/tuple."""
+        if scene_idx is None:
+            return self._duplicate_to_list(scene_idx, num)
+        # scene_idx could be str, np.ndarray, list or tuple
+        if isinstance(scene_idx, str):  # str
+            return self._duplicate_to_list(scene_idx, num)
+        if isinstance(scene_idx[0], str):  # list of str
+            return scene_idx
+        if isinstance(scene_idx[0], (list, tuple, np.ndarray)):  # list of idx
+            return scene_idx
+        # single idx
+        return self._duplicate_to_list(scene_idx, num)
diff --git a/mmde/mmdet3d/datasets/scannet_dataset.py b/mmde/mmdet3d/datasets/scannet_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..069c837857c7fb76a7a05771456ae1a3cc59b17c
--- /dev/null
+++ b/mmde/mmdet3d/datasets/scannet_dataset.py
@@ -0,0 +1,353 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from os import path as osp
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures import DepthInstance3DBoxes
+from .det3d_dataset import Det3DDataset
+from .seg3d_dataset import Seg3DDataset
+
+
+@DATASETS.register_module()
+class ScanNetDataset(Det3DDataset):
+    r"""ScanNet Dataset for Detection Task.
+
+    This class serves as the API for experiments on the ScanNet Dataset.
+
+    Please refer to the `github repo <https://github.com/ScanNet/ScanNet>`_
+    for data downloading.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for data. Defaults to
+            dict(pts='points',
+                 pts_instance_mask='instance_mask',
+                 pts_semantic_mask='semantic_mask').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=False, use_lidar=True).
+        box_type_3d (str): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'Depth' in this dataset. Available options includes:
+
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+    METAINFO = {
+        'classes':
+        ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+         'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator',
+         'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin'),
+        # the valid ids of segmentation annotations
+        'seg_valid_class_ids':
+        (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39),
+        'seg_all_class_ids':
+        tuple(range(1, 41)),
+        'palette': [(31, 119, 180), (255, 187, 120), (188, 189, 34),
+                    (140, 86, 75), (255, 152, 150), (214, 39, 40),
+                    (197, 176, 213), (148, 103, 189), (196, 156, 148),
+                    (23, 190, 207), (247, 182, 210), (219, 219, 141),
+                    (255, 127, 14), (158, 218, 229), (44, 160, 44),
+                    (112, 128, 144), (227, 119, 194), (82, 84, 163)]
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='points',
+                     pts_instance_mask='instance_mask',
+                     pts_semantic_mask='semantic_mask'),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_camera=False, use_lidar=True),
+                 box_type_3d: str = 'Depth',
+                 filter_empty_gt: bool = True,
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+
+        # construct seg_label_mapping for semantic mask
+        seg_max_cat_id = len(self.METAINFO['seg_all_class_ids'])
+        seg_valid_cat_ids = self.METAINFO['seg_valid_class_ids']
+        neg_label = len(seg_valid_cat_ids)
+        seg_label_mapping = np.ones(
+            seg_max_cat_id + 1, dtype=np.int64) * neg_label
+        for cls_idx, cat_id in enumerate(seg_valid_cat_ids):
+            seg_label_mapping[cat_id] = cls_idx
+        self.seg_label_mapping = seg_label_mapping
+
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode,
+            **kwargs)
+
+        self.metainfo['seg_label_mapping'] = self.seg_label_mapping
+        assert 'use_camera' in self.modality and \
+               'use_lidar' in self.modality
+        assert self.modality['use_camera'] or self.modality['use_lidar']
+
+    @staticmethod
+    def _get_axis_align_matrix(info: dict) -> np.ndarray:
+        """Get axis_align_matrix from info. If not exist, return identity mat.
+
+        Args:
+            info (dict): Info of a single sample data.
+
+        Returns:
+            np.ndarray: 4x4 transformation matrix.
+        """
+        if 'axis_align_matrix' in info:
+            return np.array(info['axis_align_matrix'])
+        else:
+            warnings.warn(
+                'axis_align_matrix is not found in ScanNet data info, please '
+                'use new pre-process scripts to re-generate ScanNet data')
+            return np.eye(4).astype(np.float32)
+
+    def parse_data_info(self, info: dict) -> dict:
+        """Process the raw data info.
+
+        The only difference with it in `Det3DDataset`
+        is the specific process for `axis_align_matrix'.
+
+        Args:
+            info (dict): Raw info dict.
+
+        Returns:
+            dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
+        """
+        info['axis_align_matrix'] = self._get_axis_align_matrix(info)
+        info['pts_instance_mask_path'] = osp.join(
+            self.data_prefix.get('pts_instance_mask', ''),
+            info['pts_instance_mask_path'])
+        info['pts_semantic_mask_path'] = osp.join(
+            self.data_prefix.get('pts_semantic_mask', ''),
+            info['pts_semantic_mask_path'])
+
+        info = super().parse_data_info(info)
+        # only be used in `PointSegClassMapping` in pipeline
+        # to map original semantic class to valid category ids.
+        info['seg_label_mapping'] = self.seg_label_mapping
+        return info
+
+    def parse_ann_info(self, info: dict) -> dict:
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Info dict.
+
+        Returns:
+            dict: Processed `ann_info`.
+        """
+        ann_info = super().parse_ann_info(info)
+        # empty gt
+        if ann_info is None:
+            ann_info = dict()
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
+        # to target box structure
+
+        ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
+            ann_info['gt_bboxes_3d'],
+            box_dim=ann_info['gt_bboxes_3d'].shape[-1],
+            with_yaw=False,
+            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+        return ann_info
+
+
+@DATASETS.register_module()
+class ScanNetSegDataset(Seg3DDataset):
+    r"""ScanNet Dataset for Semantic Segmentation Task.
+
+    This class serves as the API for experiments on the ScanNet Dataset.
+
+    Please refer to the `github repo <https://github.com/ScanNet/ScanNet>`_
+    for data downloading.
+
+    Args:
+        data_root (str, optional): Path of dataset root. Defaults to None.
+        ann_file (str): Path of annotation file. Defaults to ''.
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(pts='points',
+                 img='',
+                 pts_instance_mask='',
+                 pts_semantic_mask='').
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_lidar=True, use_camera=False).
+        ignore_index (int, optional): The label index to be ignored, e.g.
+            unannotated points. If None is given, set to len(self.classes) to
+            be consistent with PointSegClassMapping function in pipeline.
+            Defaults to None.
+        scene_idxs (np.ndarray or str, optional): Precomputed index to load
+            data. For scenes with many points, we may sample it several times.
+            Defaults to None.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+    METAINFO = {
+        'classes':
+        ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door',
+         'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+         'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+         'otherfurniture'),
+        'palette': [
+            [174, 199, 232],
+            [152, 223, 138],
+            [31, 119, 180],
+            [255, 187, 120],
+            [188, 189, 34],
+            [140, 86, 75],
+            [255, 152, 150],
+            [214, 39, 40],
+            [197, 176, 213],
+            [148, 103, 189],
+            [196, 156, 148],
+            [23, 190, 207],
+            [247, 182, 210],
+            [219, 219, 141],
+            [255, 127, 14],
+            [158, 218, 229],
+            [44, 160, 44],
+            [112, 128, 144],
+            [227, 119, 194],
+            [82, 84, 163],
+        ],
+        'seg_valid_class_ids': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16,
+                                24, 28, 33, 34, 36, 39),
+        'seg_all_class_ids':
+        tuple(range(41)),
+    }
+
+    def __init__(self,
+                 data_root: Optional[str] = None,
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='points',
+                     img='',
+                     pts_instance_mask='',
+                     pts_semantic_mask=''),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_lidar=True, use_camera=False),
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[np.ndarray, str]] = None,
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            modality=modality,
+            ignore_index=ignore_index,
+            scene_idxs=scene_idxs,
+            test_mode=test_mode,
+            **kwargs)
+
+    def get_scene_idxs(self, scene_idxs: Union[np.ndarray, str,
+                                               None]) -> np.ndarray:
+        """Compute scene_idxs for data sampling.
+
+        We sample more times for scenes with more points.
+        """
+        # when testing, we load one whole scene every time
+        if not self.test_mode and scene_idxs is None:
+            raise NotImplementedError(
+                'please provide re-sampled scene indexes for training')
+
+        return super().get_scene_idxs(scene_idxs)
+
+
+@DATASETS.register_module()
+class ScanNetInstanceSegDataset(Seg3DDataset):
+
+    METAINFO = {
+        'classes':
+        ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+         'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator',
+         'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin'),
+        'palette': [
+            [174, 199, 232],
+            [152, 223, 138],
+            [31, 119, 180],
+            [255, 187, 120],
+            [188, 189, 34],
+            [140, 86, 75],
+            [255, 152, 150],
+            [214, 39, 40],
+            [197, 176, 213],
+            [148, 103, 189],
+            [196, 156, 148],
+            [23, 190, 207],
+            [247, 182, 210],
+            [219, 219, 141],
+            [255, 127, 14],
+            [158, 218, 229],
+            [44, 160, 44],
+            [112, 128, 144],
+            [227, 119, 194],
+            [82, 84, 163],
+        ],
+        'seg_valid_class_ids':
+        (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39),
+        'seg_all_class_ids':
+        tuple(range(41))
+    }
+
+    def __init__(self,
+                 data_root: Optional[str] = None,
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='points',
+                     img='',
+                     pts_instance_mask='',
+                     pts_semantic_mask=''),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_lidar=True, use_camera=False),
+                 test_mode: bool = False,
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[np.ndarray, str]] = None,
+                 backend_args: Optional[dict] = None,
+                 **kwargs) -> None:
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            metainfo=metainfo,
+            pipeline=pipeline,
+            data_prefix=data_prefix,
+            modality=modality,
+            test_mode=test_mode,
+            ignore_index=ignore_index,
+            scene_idxs=scene_idxs,
+            backend_args=backend_args,
+            **kwargs)
diff --git a/mmde/mmdet3d/datasets/seg3d_dataset.py b/mmde/mmdet3d/datasets/seg3d_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c4de10d779eca3c06f8a7eb12d45aa6545d300c
--- /dev/null
+++ b/mmde/mmdet3d/datasets/seg3d_dataset.py
@@ -0,0 +1,338 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from os import path as osp
+from typing import Callable, List, Optional, Sequence, Union
+
+import numpy as np
+from mmengine.dataset import BaseDataset
+from mmengine.fileio import get_local_path
+
+from mmdet3d.registry import DATASETS
+
+
+@DATASETS.register_module()
+class Seg3DDataset(BaseDataset):
+    """Base Class for 3D semantic segmentation dataset.
+
+    This is the base dataset of ScanNet, S3DIS and SemanticKITTI dataset.
+
+    Args:
+        data_root (str, optional): Path of dataset root. Defaults to None.
+        ann_file (str): Path of annotation file. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(pts='points',
+                 img='',
+                 pts_instance_mask='',
+                 pts_semantic_mask='').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used
+            as input, it usually has following keys:
+
+                - use_camera: bool
+                - use_lidar: bool
+            Defaults to dict(use_lidar=True, use_camera=False).
+        ignore_index (int, optional): The label index to be ignored, e.g.
+            unannotated points. If None is given, set to len(self.classes) to
+            be consistent with PointSegClassMapping function in pipeline.
+            Defaults to None.
+        scene_idxs (np.ndarray or str, optional): Precomputed index to load
+            data. For scenes with many points, we may sample it several times.
+            Defaults to None.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+        serialize_data (bool): Whether to hold memory using serialized objects,
+            when enabled, data loader workers can use shared RAM from master
+            process instead of making a copy.
+            Defaults to False for 3D Segmentation datasets.
+        load_eval_anns (bool): Whether to load annotations in test_mode,
+            the annotation will be save in `eval_ann_infos`, which can be used
+            in Evaluator. Defaults to True.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+    METAINFO = {
+        'classes': None,  # names of all classes data used for the task
+        'palette': None,  # official color for visualization
+        'seg_valid_class_ids': None,  # class_ids used for training
+        'seg_all_class_ids': None,  # all possible class_ids in loaded seg mask
+    }
+
+    def __init__(self,
+                 data_root: Optional[str] = None,
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='points',
+                     img='',
+                     pts_instance_mask='',
+                     pts_semantic_mask=''),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_lidar=True, use_camera=False),
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[str, np.ndarray]] = None,
+                 test_mode: bool = False,
+                 serialize_data: bool = False,
+                 load_eval_anns: bool = True,
+                 backend_args: Optional[dict] = None,
+                 **kwargs) -> None:
+        self.backend_args = backend_args
+        self.modality = modality
+        self.load_eval_anns = load_eval_anns
+
+        # TODO: We maintain the ignore_index attributes,
+        # but we may consider to remove it in the future.
+        self.ignore_index = len(self.METAINFO['classes']) if \
+            ignore_index is None else ignore_index
+
+        # Get label mapping for custom classes
+        new_classes = metainfo.get('classes', None)
+
+        self.label_mapping, self.label2cat, seg_valid_class_ids = \
+            self.get_label_mapping(new_classes)
+
+        metainfo['label_mapping'] = self.label_mapping
+        metainfo['label2cat'] = self.label2cat
+        metainfo['ignore_index'] = self.ignore_index
+        metainfo['seg_valid_class_ids'] = seg_valid_class_ids
+
+        # generate palette if it is not defined based on
+        # label mapping, otherwise directly use palette
+        # defined in dataset config.
+        palette = metainfo.get('palette', None)
+        updated_palette = self._update_palette(new_classes, palette)
+
+        metainfo['palette'] = updated_palette
+
+        # construct seg_label_mapping for semantic mask
+        self.seg_label_mapping = self.get_seg_label_mapping(metainfo)
+
+        super().__init__(
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_root=data_root,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            test_mode=test_mode,
+            serialize_data=serialize_data,
+            **kwargs)
+
+        self.metainfo['seg_label_mapping'] = self.seg_label_mapping
+        if not kwargs.get('lazy_init', False):
+            self.scene_idxs = self.get_scene_idxs(scene_idxs)
+            self.data_list = [self.data_list[i] for i in self.scene_idxs]
+
+            # set group flag for the sampler
+            if not self.test_mode:
+                self._set_group_flag()
+
+    def get_label_mapping(self,
+                          new_classes: Optional[Sequence] = None) -> tuple:
+        """Get label mapping.
+
+        The ``label_mapping`` is a dictionary, its keys are the old label ids
+        and its values are the new label ids, and is used for changing pixel
+        labels in load_annotations. If and only if old classes in cls.METAINFO
+        is not equal to new classes in self._metainfo and nether of them is not
+        None, `label_mapping` is not None.
+
+        Args:
+            new_classes (list or tuple, optional): The new classes name from
+                metainfo. Defaults to None.
+
+        Returns:
+            tuple: The mapping from old classes in cls.METAINFO to
+            new classes in metainfo
+        """
+        old_classes = self.METAINFO.get('classes', None)
+        if (new_classes is not None and old_classes is not None
+                and list(new_classes) != list(old_classes)):
+            if not set(new_classes).issubset(old_classes):
+                raise ValueError(
+                    f'new classes {new_classes} is not a '
+                    f'subset of classes {old_classes} in METAINFO.')
+
+            # obtain true id from valid_class_ids
+            valid_class_ids = [
+                self.METAINFO['seg_valid_class_ids'][old_classes.index(
+                    cls_name)] for cls_name in new_classes
+            ]
+            label_mapping = {
+                cls_id: self.ignore_index
+                for cls_id in self.METAINFO['seg_all_class_ids']
+            }
+            label_mapping.update(
+                {cls_id: i
+                 for i, cls_id in enumerate(valid_class_ids)})
+            label2cat = {i: cat_name for i, cat_name in enumerate(new_classes)}
+        else:
+            label_mapping = {
+                cls_id: self.ignore_index
+                for cls_id in self.METAINFO['seg_all_class_ids']
+            }
+            label_mapping.update({
+                cls_id: i
+                for i, cls_id in enumerate(
+                    self.METAINFO['seg_valid_class_ids'])
+            })
+            # map label to category name
+            label2cat = {
+                i: cat_name
+                for i, cat_name in enumerate(self.METAINFO['classes'])
+            }
+            valid_class_ids = self.METAINFO['seg_valid_class_ids']
+
+        return label_mapping, label2cat, valid_class_ids
+
+    def get_seg_label_mapping(self, metainfo=None):
+        """Get segmentation label mapping.
+
+        The ``seg_label_mapping`` is an array, its indices are the old label
+        ids and its values are the new label ids, and is specifically used
+        for changing point labels in PointSegClassMapping.
+
+        Args:
+            metainfo (dict, optional): Meta information to set
+            seg_label_mapping. Defaults to None.
+
+        Returns:
+            tuple: The mapping from old classes to new classes.
+        """
+        seg_max_cat_id = len(self.METAINFO['seg_all_class_ids'])
+        seg_valid_cat_ids = self.METAINFO['seg_valid_class_ids']
+        neg_label = len(seg_valid_cat_ids)
+        seg_label_mapping = np.ones(
+            seg_max_cat_id + 1, dtype=np.int64) * neg_label
+        for cls_idx, cat_id in enumerate(seg_valid_cat_ids):
+            seg_label_mapping[cat_id] = cls_idx
+        return seg_label_mapping
+
+    def _update_palette(self, new_classes: list, palette: Union[None,
+                                                                list]) -> list:
+        """Update palette according to metainfo.
+
+        If length of palette is equal to classes, just return the palette.
+        If palette is not defined, it will randomly generate a palette.
+        If classes is updated by customer, it will return the subset of
+        palette.
+
+        Returns:
+            Sequence: Palette for current dataset.
+        """
+        if palette is None:
+            # If palette is not defined, it generate a palette according
+            # to the original palette and classes.
+            old_classes = self.METAINFO.get('classes', None)
+            palette = [
+                self.METAINFO['palette'][old_classes.index(cls_name)]
+                for cls_name in new_classes
+            ]
+            return palette
+
+        # palette does match classes
+        if len(palette) == len(new_classes):
+            return palette
+        else:
+            raise ValueError('Once palette in set in metainfo, it should'
+                             'match classes in metainfo')
+
+    def parse_data_info(self, info: dict) -> dict:
+        """Process the raw data info.
+
+        Convert all relative path of needed modality data file to
+        the absolute path. And process
+        the `instances` field to `ann_info` in training stage.
+
+        Args:
+            info (dict): Raw info dict.
+
+        Returns:
+            dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
+        """
+        if self.modality['use_lidar']:
+            info['lidar_points']['lidar_path'] = \
+                osp.join(
+                    self.data_prefix.get('pts', ''),
+                    info['lidar_points']['lidar_path'])
+            if 'num_pts_feats' in info['lidar_points']:
+                info['num_pts_feats'] = info['lidar_points']['num_pts_feats']
+            info['lidar_path'] = info['lidar_points']['lidar_path']
+
+        if self.modality['use_camera']:
+            for cam_id, img_info in info['images'].items():
+                if 'img_path' in img_info:
+                    img_info['img_path'] = osp.join(
+                        self.data_prefix.get('img', ''), img_info['img_path'])
+
+        if 'pts_instance_mask_path' in info:
+            info['pts_instance_mask_path'] = \
+                osp.join(self.data_prefix.get('pts_instance_mask', ''),
+                         info['pts_instance_mask_path'])
+
+        if 'pts_semantic_mask_path' in info:
+            info['pts_semantic_mask_path'] = \
+                osp.join(self.data_prefix.get('pts_semantic_mask', ''),
+                         info['pts_semantic_mask_path'])
+
+        # only be used in `PointSegClassMapping` in pipeline
+        # to map original semantic class to valid category ids.
+        info['seg_label_mapping'] = self.seg_label_mapping
+
+        # 'eval_ann_info' will be updated in loading transforms
+        if self.test_mode and self.load_eval_anns:
+            info['eval_ann_info'] = dict()
+
+        return info
+
+    def prepare_data(self, idx: int) -> dict:
+        """Get data processed by ``self.pipeline``.
+
+        Args:
+            idx (int): The index of ``data_info``.
+
+        Returns:
+            dict: Results passed through ``self.pipeline``.
+        """
+        if not self.test_mode:
+            data_info = self.get_data_info(idx)
+            # Pass the dataset to the pipeline during training to support mixed
+            # data augmentation, such as polarmix and lasermix.
+            data_info['dataset'] = self
+            return self.pipeline(data_info)
+        else:
+            return super().prepare_data(idx)
+
+    def get_scene_idxs(self, scene_idxs: Union[None, str,
+                                               np.ndarray]) -> np.ndarray:
+        """Compute scene_idxs for data sampling.
+
+        We sample more times for scenes with more points.
+        """
+        if self.test_mode:
+            # when testing, we load one whole scene every time
+            return np.arange(len(self)).astype(np.int32)
+
+        # we may need to re-sample different scenes according to scene_idxs
+        # this is necessary for indoor scene segmentation such as ScanNet
+        if scene_idxs is None:
+            scene_idxs = np.arange(len(self))
+        if isinstance(scene_idxs, str):
+            scene_idxs = osp.join(self.data_root, scene_idxs)
+            with get_local_path(
+                    scene_idxs, backend_args=self.backend_args) as local_path:
+                scene_idxs = np.load(local_path)
+        else:
+            scene_idxs = np.array(scene_idxs)
+
+        return scene_idxs.astype(np.int32)
+
+    def _set_group_flag(self) -> None:
+        """Set flag according to image aspect ratio.
+
+        Images with aspect ratio greater than 1 will be set as group 1,
+        otherwise group 0. In 3D datasets, they are all the same, thus are all
+        zeros.
+        """
+        self.flag = np.zeros(len(self), dtype=np.uint8)
diff --git a/mmde/mmdet3d/datasets/semantickitti_dataset.py b/mmde/mmdet3d/datasets/semantickitti_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8a57ce182980ac7e3a022872d1e662ea1bb0e61
--- /dev/null
+++ b/mmde/mmdet3d/datasets/semantickitti_dataset.py
@@ -0,0 +1,95 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+
+from mmdet3d.registry import DATASETS
+from .seg3d_dataset import Seg3DDataset
+
+
+@DATASETS.register_module()
+class SemanticKittiDataset(Seg3DDataset):
+    r"""SemanticKitti Dataset.
+
+    This class serves as the API for experiments on the SemanticKITTI Dataset
+    Please refer to <http://www.semantic-kitti.org/dataset.html>`_
+    for data downloading
+
+    Args:
+        data_root (str, optional): Path of dataset root. Defaults to None.
+        ann_file (str): Path of annotation file. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(pts='',
+                 img='',
+                 pts_instance_mask='',
+                 pts_semantic_mask='').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input,
+            it usually has following keys:
+
+                - use_camera: bool
+                - use_lidar: bool
+            Defaults to dict(use_lidar=True, use_camera=False).
+        ignore_index (int, optional): The label index to be ignored, e.g.
+            unannotated points. If None is given, set to len(self.classes) to
+            be consistent with PointSegClassMapping function in pipeline.
+            Defaults to None.
+        scene_idxs (np.ndarray or str, optional): Precomputed index to load
+            data. For scenes with many points, we may sample it several times.
+            Defaults to None.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+    METAINFO = {
+        'classes': ('car', 'bicycle', 'motorcycle', 'truck', 'bus', 'person',
+                    'bicyclist', 'motorcyclist', 'road', 'parking', 'sidewalk',
+                    'other-ground', 'building', 'fence', 'vegetation',
+                    'trunck', 'terrian', 'pole', 'traffic-sign'),
+        'palette': [[100, 150, 245], [100, 230, 245], [30, 60, 150],
+                    [80, 30, 180], [100, 80, 250], [155, 30, 30],
+                    [255, 40, 200], [150, 30, 90], [255, 0, 255],
+                    [255, 150, 255], [75, 0, 75], [175, 0, 75], [255, 200, 0],
+                    [255, 120, 50], [0, 175, 0], [135, 60, 0], [150, 240, 80],
+                    [255, 240, 150], [255, 0, 0]],
+        'seg_valid_class_ids':
+        tuple(range(19)),
+        'seg_all_class_ids':
+        tuple(range(19)),
+    }
+
+    def __init__(self,
+                 data_root: Optional[str] = None,
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='',
+                     img='',
+                     pts_instance_mask='',
+                     pts_semantic_mask=''),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_lidar=True, use_camera=False),
+                 ignore_index: Optional[int] = None,
+                 scene_idxs: Optional[Union[str, np.ndarray]] = None,
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            modality=modality,
+            ignore_index=ignore_index,
+            scene_idxs=scene_idxs,
+            test_mode=test_mode,
+            **kwargs)
+
+    def get_seg_label_mapping(self, metainfo):
+        seg_label_mapping = np.zeros(metainfo['max_label'] + 1, dtype=np.int64)
+        for idx in metainfo['seg_label_mapping']:
+            seg_label_mapping[idx] = metainfo['seg_label_mapping'][idx]
+        return seg_label_mapping
diff --git a/mmde/mmdet3d/datasets/sunrgbd_dataset.py b/mmde/mmdet3d/datasets/sunrgbd_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8aa62c425207d2644e0d0fac952396310f8021a
--- /dev/null
+++ b/mmde/mmdet3d/datasets/sunrgbd_dataset.py
@@ -0,0 +1,147 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import os.path as osp
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures import DepthInstance3DBoxes
+from .det3d_dataset import Det3DDataset
+
+
+@DATASETS.register_module()
+class SUNRGBDDataset(Det3DDataset):
+    r"""SUNRGBD Dataset.
+
+    This class serves as the API for experiments on the SUNRGBD Dataset.
+
+    See the `download page <http://rgbd.cs.princeton.edu/challenge.html>`_
+    for data downloading.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_prefix (dict): Prefix for data. Defaults to
+            dict(pts='points',img='sunrgbd_trainval').
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=True, use_lidar=True).
+        default_cam_key (str): The default camera name adopted.
+            Defaults to 'CAM0'.
+        box_type_3d (str): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'Depth' in this dataset. Available options includes:
+
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        filter_empty_gt (bool): Whether to filter empty GT.
+            Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+    METAINFO = {
+        'classes': ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk',
+                    'dresser', 'night_stand', 'bookshelf', 'bathtub'),
+        'palette': [(255, 187, 120), (255, 152, 150), (140, 86, 75),
+                    (188, 189, 34), (44, 160, 44), (247, 182, 210),
+                    (196, 156, 148), (23, 190, 207), (148, 103, 189),
+                    (227, 119, 194)]
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 metainfo: Optional[dict] = None,
+                 data_prefix: dict = dict(
+                     pts='points', img='sunrgbd_trainval/image'),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 default_cam_key: str = 'CAM0',
+                 modality: dict = dict(use_camera=True, use_lidar=True),
+                 box_type_3d: str = 'Depth',
+                 filter_empty_gt: bool = True,
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            default_cam_key=default_cam_key,
+            modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode,
+            **kwargs)
+        assert 'use_camera' in self.modality and \
+            'use_lidar' in self.modality
+        assert self.modality['use_camera'] or self.modality['use_lidar']
+
+    def parse_data_info(self, info: dict) -> dict:
+        """Process the raw data info.
+
+        Convert all relative path of needed modality data file to
+        the absolute path. And process
+        the `instances` field to `ann_info` in training stage.
+
+        Args:
+            info (dict): Raw info dict.
+
+        Returns:
+            dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
+        """
+
+        if self.modality['use_lidar']:
+            info['lidar_points']['lidar_path'] = \
+                osp.join(
+                    self.data_prefix.get('pts', ''),
+                    info['lidar_points']['lidar_path'])
+
+        if self.modality['use_camera']:
+            for cam_id, img_info in info['images'].items():
+                if 'img_path' in img_info:
+                    img_info['img_path'] = osp.join(
+                        self.data_prefix.get('img', ''), img_info['img_path'])
+            if self.default_cam_key is not None:
+                info['img_path'] = info['images'][
+                    self.default_cam_key]['img_path']
+                info['depth2img'] = np.array(
+                    info['images'][self.default_cam_key]['depth2img'],
+                    dtype=np.float32)
+
+        if not self.test_mode:
+            # used in traing
+            info['ann_info'] = self.parse_ann_info(info)
+        if self.test_mode and self.load_eval_anns:
+            info['eval_ann_info'] = self.parse_ann_info(info)
+
+        return info
+
+    def parse_ann_info(self, info: dict) -> dict:
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Info dict.
+
+        Returns:
+            dict: Processed `ann_info`
+        """
+        ann_info = super().parse_ann_info(info)
+        # process data without any annotations
+        if ann_info is None:
+            ann_info = dict()
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
+        # to target box structure
+        ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
+            ann_info['gt_bboxes_3d'],
+            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+        return ann_info
diff --git a/mmde/mmdet3d/datasets/transforms/__init__.py b/mmde/mmdet3d/datasets/transforms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf91ba2352ca149cee3e3f2f977deb7c64e923e3
--- /dev/null
+++ b/mmde/mmdet3d/datasets/transforms/__init__.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .dbsampler import DataBaseSampler
+from .formating import Pack3DDetInputs
+from .loading import (LidarDet3DInferencerLoader, LoadAnnotations3D,
+                      LoadImageFromFileMono3D, LoadMultiViewImageFromFiles,
+                      LoadPointsFromDict, LoadPointsFromFile,
+                      LoadPointsFromMultiSweeps, MonoDet3DInferencerLoader,
+                      MultiModalityDet3DInferencerLoader, NormalizePointsColor,
+                      PointSegClassMapping)
+from .test_time_aug import MultiScaleFlipAug3D
+# yapf: disable
+from .transforms_3d import (AffineResize, BackgroundPointsFilter,
+                            GlobalAlignment, GlobalRotScaleTrans,
+                            IndoorPatchPointSample, IndoorPointSample,
+                            LaserMix, MultiViewWrapper, ObjectNameFilter,
+                            ObjectNoise, ObjectRangeFilter, ObjectSample,
+                            PhotoMetricDistortion3D, PointSample, PointShuffle,
+                            PointsRangeFilter, PolarMix, RandomDropPointsColor,
+                            RandomFlip3D, RandomJitterPoints, RandomResize3D,
+                            RandomShiftScale, Resize3D, VoxelBasedPointSampler)
+
+__all__ = [
+    'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
+    'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter',
+    'Pack3DDetInputs', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile',
+    'DataBaseSampler', 'NormalizePointsColor', 'LoadAnnotations3D',
+    'IndoorPointSample', 'PointSample', 'PointSegClassMapping',
+    'MultiScaleFlipAug3D', 'LoadPointsFromMultiSweeps',
+    'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'GlobalAlignment',
+    'IndoorPatchPointSample', 'LoadImageFromFileMono3D', 'ObjectNameFilter',
+    'RandomDropPointsColor', 'RandomJitterPoints', 'AffineResize',
+    'RandomShiftScale', 'LoadPointsFromDict', 'Resize3D', 'RandomResize3D',
+    'MultiViewWrapper', 'PhotoMetricDistortion3D', 'MonoDet3DInferencerLoader',
+    'LidarDet3DInferencerLoader', 'PolarMix', 'LaserMix',
+    'MultiModalityDet3DInferencerLoader'
+]
diff --git a/mmde/mmdet3d/datasets/transforms/data_augment_utils.py b/mmde/mmdet3d/datasets/transforms/data_augment_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5744267b78089a30956d5a09426f4e6a5dd281be
--- /dev/null
+++ b/mmde/mmdet3d/datasets/transforms/data_augment_utils.py
@@ -0,0 +1,411 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import numba
+import numpy as np
+from numba.core.errors import NumbaPerformanceWarning
+
+from mmdet3d.structures.ops import box_np_ops
+
+warnings.filterwarnings('ignore', category=NumbaPerformanceWarning)
+
+
+@numba.njit
+def _rotation_box2d_jit_(corners, angle, rot_mat_T):
+    """Rotate 2D boxes.
+
+    Args:
+        corners (np.ndarray): Corners of boxes.
+        angle (float): Rotation angle.
+        rot_mat_T (np.ndarray): Transposed rotation matrix.
+    """
+    rot_sin = np.sin(angle)
+    rot_cos = np.cos(angle)
+    rot_mat_T[0, 0] = rot_cos
+    rot_mat_T[0, 1] = rot_sin
+    rot_mat_T[1, 0] = -rot_sin
+    rot_mat_T[1, 1] = rot_cos
+    corners[:] = corners @ rot_mat_T
+
+
+@numba.jit(nopython=True)
+def box_collision_test(boxes, qboxes, clockwise=True):
+    """Box collision test.
+
+    Args:
+        boxes (np.ndarray): Corners of current boxes.
+        qboxes (np.ndarray): Boxes to be avoid colliding.
+        clockwise (bool, optional): Whether the corners are in
+            clockwise order. Default: True.
+    """
+    N = boxes.shape[0]
+    K = qboxes.shape[0]
+    ret = np.zeros((N, K), dtype=np.bool_)
+    slices = np.array([1, 2, 3, 0])
+    lines_boxes = np.stack((boxes, boxes[:, slices, :]),
+                           axis=2)  # [N, 4, 2(line), 2(xy)]
+    lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2)
+    # vec = np.zeros((2,), dtype=boxes.dtype)
+    boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes)
+    qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes)
+    for i in range(N):
+        for j in range(K):
+            # calculate standup first
+            iw = (
+                min(boxes_standup[i, 2], qboxes_standup[j, 2]) -
+                max(boxes_standup[i, 0], qboxes_standup[j, 0]))
+            if iw > 0:
+                ih = (
+                    min(boxes_standup[i, 3], qboxes_standup[j, 3]) -
+                    max(boxes_standup[i, 1], qboxes_standup[j, 1]))
+                if ih > 0:
+                    for k in range(4):
+                        for box_l in range(4):
+                            A = lines_boxes[i, k, 0]
+                            B = lines_boxes[i, k, 1]
+                            C = lines_qboxes[j, box_l, 0]
+                            D = lines_qboxes[j, box_l, 1]
+                            acd = (D[1] - A[1]) * (C[0] -
+                                                   A[0]) > (C[1] - A[1]) * (
+                                                       D[0] - A[0])
+                            bcd = (D[1] - B[1]) * (C[0] -
+                                                   B[0]) > (C[1] - B[1]) * (
+                                                       D[0] - B[0])
+                            if acd != bcd:
+                                abc = (C[1] - A[1]) * (B[0] - A[0]) > (
+                                    B[1] - A[1]) * (
+                                        C[0] - A[0])
+                                abd = (D[1] - A[1]) * (B[0] - A[0]) > (
+                                    B[1] - A[1]) * (
+                                        D[0] - A[0])
+                                if abc != abd:
+                                    ret[i, j] = True  # collision.
+                                    break
+                        if ret[i, j] is True:
+                            break
+                    if ret[i, j] is False:
+                        # now check complete overlap.
+                        # box overlap qbox:
+                        box_overlap_qbox = True
+                        for box_l in range(4):  # point l in qboxes
+                            for k in range(4):  # corner k in boxes
+                                vec = boxes[i, k] - boxes[i, (k + 1) % 4]
+                                if clockwise:
+                                    vec = -vec
+                                cross = vec[1] * (
+                                    boxes[i, k, 0] - qboxes[j, box_l, 0])
+                                cross -= vec[0] * (
+                                    boxes[i, k, 1] - qboxes[j, box_l, 1])
+                                if cross >= 0:
+                                    box_overlap_qbox = False
+                                    break
+                            if box_overlap_qbox is False:
+                                break
+
+                        if box_overlap_qbox is False:
+                            qbox_overlap_box = True
+                            for box_l in range(4):  # point box_l in boxes
+                                for k in range(4):  # corner k in qboxes
+                                    vec = qboxes[j, k] - qboxes[j, (k + 1) % 4]
+                                    if clockwise:
+                                        vec = -vec
+                                    cross = vec[1] * (
+                                        qboxes[j, k, 0] - boxes[i, box_l, 0])
+                                    cross -= vec[0] * (
+                                        qboxes[j, k, 1] - boxes[i, box_l, 1])
+                                    if cross >= 0:  #
+                                        qbox_overlap_box = False
+                                        break
+                                if qbox_overlap_box is False:
+                                    break
+                            if qbox_overlap_box:
+                                ret[i, j] = True  # collision.
+                        else:
+                            ret[i, j] = True  # collision.
+    return ret
+
+
+@numba.njit
+def noise_per_box(boxes, valid_mask, loc_noises, rot_noises):
+    """Add noise to every box (only on the horizontal plane).
+
+    Args:
+        boxes (np.ndarray): Input boxes with shape (N, 5).
+        valid_mask (np.ndarray): Mask to indicate which boxes are valid
+            with shape (N).
+        loc_noises (np.ndarray): Location noises with shape (N, M, 3).
+        rot_noises (np.ndarray): Rotation noises with shape (N, M).
+
+    Returns:
+        np.ndarray: Mask to indicate whether the noise is
+            added successfully (pass the collision test).
+    """
+    num_boxes = boxes.shape[0]
+    num_tests = loc_noises.shape[1]
+    box_corners = box_np_ops.box2d_to_corner_jit(boxes)
+    current_corners = np.zeros((4, 2), dtype=boxes.dtype)
+    rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
+    success_mask = -np.ones((num_boxes, ), dtype=np.int64)
+    # print(valid_mask)
+    for i in range(num_boxes):
+        if valid_mask[i]:
+            for j in range(num_tests):
+                current_corners[:] = box_corners[i]
+                current_corners -= boxes[i, :2]
+                _rotation_box2d_jit_(current_corners, rot_noises[i, j],
+                                     rot_mat_T)
+                current_corners += boxes[i, :2] + loc_noises[i, j, :2]
+                coll_mat = box_collision_test(
+                    current_corners.reshape(1, 4, 2), box_corners)
+                coll_mat[0, i] = False
+                # print(coll_mat)
+                if not coll_mat.any():
+                    success_mask[i] = j
+                    box_corners[i] = current_corners
+                    break
+    return success_mask
+
+
+@numba.njit
+def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises,
+                      global_rot_noises):
+    """Add noise to every box (only on the horizontal plane). Version 2 used
+    when enable global rotations.
+
+    Args:
+        boxes (np.ndarray): Input boxes with shape (N, 5).
+        valid_mask (np.ndarray): Mask to indicate which boxes are valid
+            with shape (N).
+        loc_noises (np.ndarray): Location noises with shape (N, M, 3).
+        rot_noises (np.ndarray): Rotation noises with shape (N, M).
+
+    Returns:
+        np.ndarray: Mask to indicate whether the noise is
+            added successfully (pass the collision test).
+    """
+    num_boxes = boxes.shape[0]
+    num_tests = loc_noises.shape[1]
+    box_corners = box_np_ops.box2d_to_corner_jit(boxes)
+    current_corners = np.zeros((4, 2), dtype=boxes.dtype)
+    current_box = np.zeros((1, 5), dtype=boxes.dtype)
+    rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
+    dst_pos = np.zeros((2, ), dtype=boxes.dtype)
+    success_mask = -np.ones((num_boxes, ), dtype=np.int64)
+    corners_norm = np.zeros((4, 2), dtype=boxes.dtype)
+    corners_norm[1, 1] = 1.0
+    corners_norm[2] = 1.0
+    corners_norm[3, 0] = 1.0
+    corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype)
+    corners_norm = corners_norm.reshape(4, 2)
+    for i in range(num_boxes):
+        if valid_mask[i]:
+            for j in range(num_tests):
+                current_box[0, :] = boxes[i]
+                current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2)
+                current_grot = np.arctan2(boxes[i, 0], boxes[i, 1])
+                dst_grot = current_grot + global_rot_noises[i, j]
+                dst_pos[0] = current_radius * np.sin(dst_grot)
+                dst_pos[1] = current_radius * np.cos(dst_grot)
+                current_box[0, :2] = dst_pos
+                current_box[0, -1] += (dst_grot - current_grot)
+
+                rot_sin = np.sin(current_box[0, -1])
+                rot_cos = np.cos(current_box[0, -1])
+                rot_mat_T[0, 0] = rot_cos
+                rot_mat_T[0, 1] = rot_sin
+                rot_mat_T[1, 0] = -rot_sin
+                rot_mat_T[1, 1] = rot_cos
+                current_corners[:] = current_box[
+                    0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2]
+                current_corners -= current_box[0, :2]
+                _rotation_box2d_jit_(current_corners, rot_noises[i, j],
+                                     rot_mat_T)
+                current_corners += current_box[0, :2] + loc_noises[i, j, :2]
+                coll_mat = box_collision_test(
+                    current_corners.reshape(1, 4, 2), box_corners)
+                coll_mat[0, i] = False
+                if not coll_mat.any():
+                    success_mask[i] = j
+                    box_corners[i] = current_corners
+                    loc_noises[i, j, :2] += (dst_pos - boxes[i, :2])
+                    rot_noises[i, j] += (dst_grot - current_grot)
+                    break
+    return success_mask
+
+
+def _select_transform(transform, indices):
+    """Select transform.
+
+    Args:
+        transform (np.ndarray): Transforms to select from.
+        indices (np.ndarray): Mask to indicate which transform to select.
+
+    Returns:
+        np.ndarray: Selected transforms.
+    """
+    result = np.zeros((transform.shape[0], *transform.shape[2:]),
+                      dtype=transform.dtype)
+    for i in range(transform.shape[0]):
+        if indices[i] != -1:
+            result[i] = transform[i, indices[i]]
+    return result
+
+
+@numba.njit
+def _rotation_matrix_3d_(rot_mat_T, angle, axis):
+    """Get the 3D rotation matrix.
+
+    Args:
+        rot_mat_T (np.ndarray): Transposed rotation matrix.
+        angle (float): Rotation angle.
+        axis (int): Rotation axis.
+    """
+    rot_sin = np.sin(angle)
+    rot_cos = np.cos(angle)
+    rot_mat_T[:] = np.eye(3)
+    if axis == 1:
+        rot_mat_T[0, 0] = rot_cos
+        rot_mat_T[0, 2] = rot_sin
+        rot_mat_T[2, 0] = -rot_sin
+        rot_mat_T[2, 2] = rot_cos
+    elif axis == 2 or axis == -1:
+        rot_mat_T[0, 0] = rot_cos
+        rot_mat_T[0, 1] = rot_sin
+        rot_mat_T[1, 0] = -rot_sin
+        rot_mat_T[1, 1] = rot_cos
+    elif axis == 0:
+        rot_mat_T[1, 1] = rot_cos
+        rot_mat_T[1, 2] = rot_sin
+        rot_mat_T[2, 1] = -rot_sin
+        rot_mat_T[2, 2] = rot_cos
+
+
+@numba.njit
+def points_transform_(points, centers, point_masks, loc_transform,
+                      rot_transform, valid_mask):
+    """Apply transforms to points and box centers.
+
+    Args:
+        points (np.ndarray): Input points.
+        centers (np.ndarray): Input box centers.
+        point_masks (np.ndarray): Mask to indicate which points need
+            to be transformed.
+        loc_transform (np.ndarray): Location transform to be applied.
+        rot_transform (np.ndarray): Rotation transform to be applied.
+        valid_mask (np.ndarray): Mask to indicate which boxes are valid.
+    """
+    num_box = centers.shape[0]
+    num_points = points.shape[0]
+    rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype)
+    for i in range(num_box):
+        _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2)
+    for i in range(num_points):
+        for j in range(num_box):
+            if valid_mask[j]:
+                if point_masks[i, j] == 1:
+                    points[i, :3] -= centers[j, :3]
+                    points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j]
+                    points[i, :3] += centers[j, :3]
+                    points[i, :3] += loc_transform[j]
+                    break  # only apply first box's transform
+
+
+@numba.njit
+def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask):
+    """Transform 3D boxes.
+
+    Args:
+        boxes (np.ndarray): 3D boxes to be transformed.
+        loc_transform (np.ndarray): Location transform to be applied.
+        rot_transform (np.ndarray): Rotation transform to be applied.
+        valid_mask (np.ndarray): Mask to indicate which boxes are valid.
+    """
+    num_box = boxes.shape[0]
+    for i in range(num_box):
+        if valid_mask[i]:
+            boxes[i, :3] += loc_transform[i]
+            boxes[i, 6] += rot_transform[i]
+
+
+def noise_per_object_v3_(gt_boxes,
+                         points=None,
+                         valid_mask=None,
+                         rotation_perturb=np.pi / 4,
+                         center_noise_std=1.0,
+                         global_random_rot_range=np.pi / 4,
+                         num_try=100):
+    """Random rotate or remove each groundtruth independently. use kitti viewer
+    to test this function points_transform_
+
+    Args:
+        gt_boxes (np.ndarray): Ground truth boxes with shape (N, 7).
+        points (np.ndarray, optional): Input point cloud with
+            shape (M, 4). Default: None.
+        valid_mask (np.ndarray, optional): Mask to indicate which
+            boxes are valid. Default: None.
+        rotation_perturb (float, optional): Rotation perturbation.
+            Default: pi / 4.
+        center_noise_std (float, optional): Center noise standard deviation.
+            Default: 1.0.
+        global_random_rot_range (float, optional): Global random rotation
+            range. Default: pi/4.
+        num_try (int, optional): Number of try. Default: 100.
+    """
+    num_boxes = gt_boxes.shape[0]
+    if not isinstance(rotation_perturb, (list, tuple, np.ndarray)):
+        rotation_perturb = [-rotation_perturb, rotation_perturb]
+    if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)):
+        global_random_rot_range = [
+            -global_random_rot_range, global_random_rot_range
+        ]
+    enable_grot = np.abs(global_random_rot_range[0] -
+                         global_random_rot_range[1]) >= 1e-3
+
+    if not isinstance(center_noise_std, (list, tuple, np.ndarray)):
+        center_noise_std = [
+            center_noise_std, center_noise_std, center_noise_std
+        ]
+    if valid_mask is None:
+        valid_mask = np.ones((num_boxes, ), dtype=np.bool_)
+    center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype)
+
+    loc_noises = np.random.normal(
+        scale=center_noise_std, size=[num_boxes, num_try, 3])
+    rot_noises = np.random.uniform(
+        rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try])
+    gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1])
+    grot_lowers = global_random_rot_range[0] - gt_grots
+    grot_uppers = global_random_rot_range[1] - gt_grots
+    global_rot_noises = np.random.uniform(
+        grot_lowers[..., np.newaxis],
+        grot_uppers[..., np.newaxis],
+        size=[num_boxes, num_try])
+
+    origin = (0.5, 0.5, 0)
+    gt_box_corners = box_np_ops.center_to_corner_box3d(
+        gt_boxes[:, :3],
+        gt_boxes[:, 3:6],
+        gt_boxes[:, 6],
+        origin=origin,
+        axis=2)
+
+    # TODO: rewrite this noise box function?
+    if not enable_grot:
+        selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]],
+                                       valid_mask, loc_noises, rot_noises)
+    else:
+        selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]],
+                                           valid_mask, loc_noises, rot_noises,
+                                           global_rot_noises)
+
+    loc_transforms = _select_transform(loc_noises, selected_noise)
+    rot_transforms = _select_transform(rot_noises, selected_noise)
+    surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners)
+    if points is not None:
+        # TODO: replace this points_in_convex function by my tools?
+        point_masks = box_np_ops.points_in_convex_polygon_3d_jit(
+            points[:, :3], surfaces)
+        points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms,
+                          rot_transforms, valid_mask)
+
+    box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask)
diff --git a/mmde/mmdet3d/datasets/transforms/dbsampler.py b/mmde/mmdet3d/datasets/transforms/dbsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..56e8440b7473b453fede40b2a8803dfd43f20eb9
--- /dev/null
+++ b/mmde/mmdet3d/datasets/transforms/dbsampler.py
@@ -0,0 +1,345 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os
+from typing import List, Optional
+
+import mmengine
+import numpy as np
+from mmengine.fileio import get_local_path
+
+from mmdet3d.datasets.transforms import data_augment_utils
+from mmdet3d.registry import TRANSFORMS
+from mmdet3d.structures.ops import box_np_ops
+
+
+class BatchSampler:
+    """Class for sampling specific category of ground truths.
+
+    Args:
+        sample_list (list[dict]): List of samples.
+        name (str, optional): The category of samples. Defaults to None.
+        epoch (int, optional): Sampling epoch. Defaults to None.
+        shuffle (bool): Whether to shuffle indices. Defaults to False.
+        drop_reminder (bool): Drop reminder. Defaults to False.
+    """
+
+    def __init__(self,
+                 sampled_list: List[dict],
+                 name: Optional[str] = None,
+                 epoch: Optional[int] = None,
+                 shuffle: bool = True,
+                 drop_reminder: bool = False) -> None:
+        self._sampled_list = sampled_list
+        self._indices = np.arange(len(sampled_list))
+        if shuffle:
+            np.random.shuffle(self._indices)
+        self._idx = 0
+        self._example_num = len(sampled_list)
+        self._name = name
+        self._shuffle = shuffle
+        self._epoch = epoch
+        self._epoch_counter = 0
+        self._drop_reminder = drop_reminder
+
+    def _sample(self, num: int) -> List[int]:
+        """Sample specific number of ground truths and return indices.
+
+        Args:
+            num (int): Sampled number.
+
+        Returns:
+            list[int]: Indices of sampled ground truths.
+        """
+        if self._idx + num >= self._example_num:
+            ret = self._indices[self._idx:].copy()
+            self._reset()
+        else:
+            ret = self._indices[self._idx:self._idx + num]
+            self._idx += num
+        return ret
+
+    def _reset(self) -> None:
+        """Reset the index of batchsampler to zero."""
+        assert self._name is not None
+        # print("reset", self._name)
+        if self._shuffle:
+            np.random.shuffle(self._indices)
+        self._idx = 0
+
+    def sample(self, num: int) -> List[dict]:
+        """Sample specific number of ground truths.
+
+        Args:
+            num (int): Sampled number.
+
+        Returns:
+            list[dict]: Sampled ground truths.
+        """
+        indices = self._sample(num)
+        return [self._sampled_list[i] for i in indices]
+
+
+@TRANSFORMS.register_module()
+class DataBaseSampler(object):
+    """Class for sampling data from the ground truth database.
+
+    Args:
+        info_path (str): Path of groundtruth database info.
+        data_root (str): Path of groundtruth database.
+        rate (float): Rate of actual sampled over maximum sampled number.
+        prepare (dict): Name of preparation functions and the input value.
+        sample_groups (dict): Sampled classes and numbers.
+        classes (list[str], optional): List of classes. Defaults to None.
+        points_loader (dict): Config of points loader. Defaults to
+            dict(type='LoadPointsFromFile', load_dim=4, use_dim=[0, 1, 2, 3]).
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(self,
+                 info_path: str,
+                 data_root: str,
+                 rate: float,
+                 prepare: dict,
+                 sample_groups: dict,
+                 classes: Optional[List[str]] = None,
+                 points_loader: dict = dict(
+                     type='LoadPointsFromFile',
+                     coord_type='LIDAR',
+                     load_dim=4,
+                     use_dim=[0, 1, 2, 3],
+                     backend_args=None),
+                 backend_args: Optional[dict] = None) -> None:
+        super().__init__()
+        self.data_root = data_root
+        self.info_path = info_path
+        self.rate = rate
+        self.prepare = prepare
+        self.classes = classes
+        self.cat2label = {name: i for i, name in enumerate(classes)}
+        self.label2cat = {i: name for i, name in enumerate(classes)}
+        self.points_loader = TRANSFORMS.build(points_loader)
+        self.backend_args = backend_args
+
+        # load data base infos
+        with get_local_path(
+                info_path, backend_args=self.backend_args) as local_path:
+            # loading data from a file-like object needs file format
+            db_infos = mmengine.load(open(local_path, 'rb'), file_format='pkl')
+
+        # filter database infos
+        from mmengine.logging import MMLogger
+        logger: MMLogger = MMLogger.get_current_instance()
+        for k, v in db_infos.items():
+            logger.info(f'load {len(v)} {k} database infos in DataBaseSampler')
+        for prep_func, val in prepare.items():
+            db_infos = getattr(self, prep_func)(db_infos, val)
+        logger.info('After filter database:')
+        for k, v in db_infos.items():
+            logger.info(f'load {len(v)} {k} database infos in DataBaseSampler')
+
+        self.db_infos = db_infos
+
+        # load sample groups
+        # TODO: more elegant way to load sample groups
+        self.sample_groups = []
+        for name, num in sample_groups.items():
+            self.sample_groups.append({name: int(num)})
+
+        self.group_db_infos = self.db_infos  # just use db_infos
+        self.sample_classes = []
+        self.sample_max_nums = []
+        for group_info in self.sample_groups:
+            self.sample_classes += list(group_info.keys())
+            self.sample_max_nums += list(group_info.values())
+
+        self.sampler_dict = {}
+        for k, v in self.group_db_infos.items():
+            self.sampler_dict[k] = BatchSampler(v, k, shuffle=True)
+        # TODO: No group_sampling currently
+
+    @staticmethod
+    def filter_by_difficulty(db_infos: dict, removed_difficulty: list) -> dict:
+        """Filter ground truths by difficulties.
+
+        Args:
+            db_infos (dict): Info of groundtruth database.
+            removed_difficulty (list): Difficulties that are not qualified.
+
+        Returns:
+            dict: Info of database after filtering.
+        """
+        new_db_infos = {}
+        for key, dinfos in db_infos.items():
+            new_db_infos[key] = [
+                info for info in dinfos
+                if info['difficulty'] not in removed_difficulty
+            ]
+        return new_db_infos
+
+    @staticmethod
+    def filter_by_min_points(db_infos: dict, min_gt_points_dict: dict) -> dict:
+        """Filter ground truths by number of points in the bbox.
+
+        Args:
+            db_infos (dict): Info of groundtruth database.
+            min_gt_points_dict (dict): Different number of minimum points
+                needed for different categories of ground truths.
+
+        Returns:
+            dict: Info of database after filtering.
+        """
+        for name, min_num in min_gt_points_dict.items():
+            min_num = int(min_num)
+            if min_num > 0:
+                filtered_infos = []
+                for info in db_infos[name]:
+                    if info['num_points_in_gt'] >= min_num:
+                        filtered_infos.append(info)
+                db_infos[name] = filtered_infos
+        return db_infos
+
+    def sample_all(self,
+                   gt_bboxes: np.ndarray,
+                   gt_labels: np.ndarray,
+                   img: Optional[np.ndarray] = None,
+                   ground_plane: Optional[np.ndarray] = None) -> dict:
+        """Sampling all categories of bboxes.
+
+        Args:
+            gt_bboxes (np.ndarray): Ground truth bounding boxes.
+            gt_labels (np.ndarray): Ground truth labels of boxes.
+            img (np.ndarray, optional): Image array. Defaults to None.
+            ground_plane (np.ndarray, optional): Ground plane information.
+                Defaults to None.
+
+        Returns:
+            dict: Dict of sampled 'pseudo ground truths'.
+
+                - gt_labels_3d (np.ndarray): ground truths labels
+                  of sampled objects.
+                - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`):
+                  sampled ground truth 3D bounding boxes
+                - points (np.ndarray): sampled points
+                - group_ids (np.ndarray): ids of sampled ground truths
+        """
+        sampled_num_dict = {}
+        sample_num_per_class = []
+        for class_name, max_sample_num in zip(self.sample_classes,
+                                              self.sample_max_nums):
+            class_label = self.cat2label[class_name]
+            # sampled_num = int(max_sample_num -
+            #                   np.sum([n == class_name for n in gt_names]))
+            sampled_num = int(max_sample_num -
+                              np.sum([n == class_label for n in gt_labels]))
+            sampled_num = np.round(self.rate * sampled_num).astype(np.int64)
+            sampled_num_dict[class_name] = sampled_num
+            sample_num_per_class.append(sampled_num)
+
+        sampled = []
+        sampled_gt_bboxes = []
+        avoid_coll_boxes = gt_bboxes
+
+        for class_name, sampled_num in zip(self.sample_classes,
+                                           sample_num_per_class):
+            if sampled_num > 0:
+                sampled_cls = self.sample_class_v2(class_name, sampled_num,
+                                                   avoid_coll_boxes)
+
+                sampled += sampled_cls
+                if len(sampled_cls) > 0:
+                    if len(sampled_cls) == 1:
+                        sampled_gt_box = sampled_cls[0]['box3d_lidar'][
+                            np.newaxis, ...]
+                    else:
+                        sampled_gt_box = np.stack(
+                            [s['box3d_lidar'] for s in sampled_cls], axis=0)
+
+                    sampled_gt_bboxes += [sampled_gt_box]
+                    avoid_coll_boxes = np.concatenate(
+                        [avoid_coll_boxes, sampled_gt_box], axis=0)
+
+        ret = None
+        if len(sampled) > 0:
+            sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0)
+            # center = sampled_gt_bboxes[:, 0:3]
+
+            # num_sampled = len(sampled)
+            s_points_list = []
+            count = 0
+            for info in sampled:
+                file_path = os.path.join(
+                    self.data_root,
+                    info['path']) if self.data_root else info['path']
+                results = dict(lidar_points=dict(lidar_path=file_path))
+                s_points = self.points_loader(results)['points']
+                s_points.translate(info['box3d_lidar'][:3])
+
+                count += 1
+
+                s_points_list.append(s_points)
+
+            gt_labels = np.array([self.cat2label[s['name']] for s in sampled],
+                                 dtype=np.long)
+
+            if ground_plane is not None:
+                xyz = sampled_gt_bboxes[:, :3]
+                dz = (ground_plane[:3][None, :] *
+                      xyz).sum(-1) + ground_plane[3]
+                sampled_gt_bboxes[:, 2] -= dz
+                for i, s_points in enumerate(s_points_list):
+                    s_points.tensor[:, 2].sub_(dz[i])
+
+            ret = {
+                'gt_labels_3d':
+                gt_labels,
+                'gt_bboxes_3d':
+                sampled_gt_bboxes,
+                'points':
+                s_points_list[0].cat(s_points_list),
+                'group_ids':
+                np.arange(gt_bboxes.shape[0],
+                          gt_bboxes.shape[0] + len(sampled))
+            }
+
+        return ret
+
+    def sample_class_v2(self, name: str, num: int,
+                        gt_bboxes: np.ndarray) -> List[dict]:
+        """Sampling specific categories of bounding boxes.
+
+        Args:
+            name (str): Class of objects to be sampled.
+            num (int): Number of sampled bboxes.
+            gt_bboxes (np.ndarray): Ground truth boxes.
+
+        Returns:
+            list[dict]: Valid samples after collision test.
+        """
+        sampled = self.sampler_dict[name].sample(num)
+        sampled = copy.deepcopy(sampled)
+        num_gt = gt_bboxes.shape[0]
+        num_sampled = len(sampled)
+        gt_bboxes_bv = box_np_ops.center_to_corner_box2d(
+            gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6])
+
+        sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0)
+        boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy()
+
+        sp_boxes_new = boxes[gt_bboxes.shape[0]:]
+        sp_boxes_bv = box_np_ops.center_to_corner_box2d(
+            sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6])
+
+        total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0)
+        coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv)
+        diag = np.arange(total_bv.shape[0])
+        coll_mat[diag, diag] = False
+
+        valid_samples = []
+        for i in range(num_gt, num_gt + num_sampled):
+            if coll_mat[i].any():
+                coll_mat[i] = False
+                coll_mat[:, i] = False
+            else:
+                valid_samples.append(sampled[i - num_gt])
+        return valid_samples
diff --git a/mmde/mmdet3d/datasets/transforms/formating.py b/mmde/mmdet3d/datasets/transforms/formating.py
new file mode 100644
index 0000000000000000000000000000000000000000..97b590b0d807efa676ad90cdca18a5cca0c187ac
--- /dev/null
+++ b/mmde/mmdet3d/datasets/transforms/formating.py
@@ -0,0 +1,262 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Sequence, Union
+
+import mmengine
+import numpy as np
+import torch
+from mmcv import BaseTransform
+from mmengine.structures import InstanceData
+from numpy import dtype
+
+from mmdet3d.registry import TRANSFORMS
+from mmdet3d.structures import BaseInstance3DBoxes, Det3DDataSample, PointData
+from mmdet3d.structures.points import BasePoints
+
+
+def to_tensor(
+    data: Union[torch.Tensor, np.ndarray, Sequence, int,
+                float]) -> torch.Tensor:
+    """Convert objects of various python types to :obj:`torch.Tensor`.
+
+    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+    :class:`Sequence`, :class:`int` and :class:`float`.
+
+    Args:
+        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
+            be converted.
+
+    Returns:
+        torch.Tensor: the converted data.
+    """
+
+    if isinstance(data, torch.Tensor):
+        return data
+    elif isinstance(data, np.ndarray):
+        if data.dtype is dtype('float64'):
+            data = data.astype(np.float32)
+        return torch.from_numpy(data)
+    elif isinstance(data, Sequence) and not mmengine.is_str(data):
+        return torch.tensor(data)
+    elif isinstance(data, int):
+        return torch.LongTensor([data])
+    elif isinstance(data, float):
+        return torch.FloatTensor([data])
+    else:
+        raise TypeError(f'type {type(data)} cannot be converted to tensor.')
+
+
+@TRANSFORMS.register_module()
+class Pack3DDetInputs(BaseTransform):
+    INPUTS_KEYS = ['points', 'img']
+    INSTANCEDATA_3D_KEYS = [
+        'gt_bboxes_3d', 'gt_labels_3d', 'attr_labels', 'depths', 'centers_2d'
+    ]
+    INSTANCEDATA_2D_KEYS = [
+        'gt_bboxes',
+        'gt_bboxes_labels',
+    ]
+
+    SEG_KEYS = [
+        'gt_seg_map', 'pts_instance_mask', 'pts_semantic_mask',
+        'gt_semantic_seg'
+    ]
+
+    def __init__(
+        self,
+        keys: tuple,
+        meta_keys: tuple = ('img_path', 'ori_shape', 'img_shape', 'lidar2img',
+                            'depth2img', 'cam2img', 'pad_shape',
+                            'scale_factor', 'flip', 'pcd_horizontal_flip',
+                            'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
+                            'img_norm_cfg', 'num_pts_feats', 'pcd_trans',
+                            'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
+                            'pcd_rotation_angle', 'lidar_path',
+                            'transformation_3d_flow', 'trans_mat',
+                            'affine_aug', 'sweep_img_metas', 'ori_cam2img',
+                            'cam2global', 'crop_offset', 'img_crop_offset',
+                            'resize_img_shape', 'lidar2cam', 'ori_lidar2img',
+                            'num_ref_frames', 'num_views', 'ego2global',
+                            'axis_align_matrix')
+    ) -> None:
+        self.keys = keys
+        self.meta_keys = meta_keys
+
+    def _remove_prefix(self, key: str) -> str:
+        if key.startswith('gt_'):
+            key = key[3:]
+        return key
+
+    def transform(self, results: Union[dict,
+                                       List[dict]]) -> Union[dict, List[dict]]:
+        """Method to pack the input data. when the value in this dict is a
+        list, it usually is in Augmentations Testing.
+
+        Args:
+            results (dict | list[dict]): Result dict from the data pipeline.
+
+        Returns:
+            dict | List[dict]:
+
+            - 'inputs' (dict): The forward data of models. It usually contains
+              following keys:
+
+                - points
+                - img
+
+            - 'data_samples' (:obj:`Det3DDataSample`): The annotation info of
+              the sample.
+        """
+        # augtest
+        if isinstance(results, list):
+            if len(results) == 1:
+                # simple test
+                return self.pack_single_results(results[0])
+            pack_results = []
+            for single_result in results:
+                pack_results.append(self.pack_single_results(single_result))
+            return pack_results
+        # norm training and simple testing
+        elif isinstance(results, dict):
+            return self.pack_single_results(results)
+        else:
+            raise NotImplementedError
+
+    def pack_single_results(self, results: dict) -> dict:
+        """Method to pack the single input data. when the value in this dict is
+        a list, it usually is in Augmentations Testing.
+
+        Args:
+            results (dict): Result dict from the data pipeline.
+
+        Returns:
+            dict: A dict contains
+
+            - 'inputs' (dict): The forward data of models. It usually contains
+              following keys:
+
+                - points
+                - img
+
+            - 'data_samples' (:obj:`Det3DDataSample`): The annotation info
+              of the sample.
+        """
+        # Format 3D data
+        if 'points' in results:
+            if isinstance(results['points'], BasePoints):
+                results['points'] = results['points'].tensor
+
+        if 'img' in results:
+            if isinstance(results['img'], list):
+                # process multiple imgs in single frame
+                imgs = np.stack(results['img'], axis=0)
+                if imgs.flags.c_contiguous:
+                    imgs = to_tensor(imgs).permute(0, 3, 1, 2).contiguous()
+                else:
+                    imgs = to_tensor(
+                        np.ascontiguousarray(imgs.transpose(0, 3, 1, 2)))
+                results['img'] = imgs
+            else:
+                img = results['img']
+                if len(img.shape) < 3:
+                    img = np.expand_dims(img, -1)
+                # To improve the computational speed by by 3-5 times, apply:
+                # `torch.permute()` rather than `np.transpose()`.
+                # Refer to https://github.com/open-mmlab/mmdetection/pull/9533
+                # for more details
+                if img.flags.c_contiguous:
+                    img = to_tensor(img).permute(2, 0, 1).contiguous()
+                else:
+                    img = to_tensor(
+                        np.ascontiguousarray(img.transpose(2, 0, 1)))
+                results['img'] = img
+
+        for key in [
+                'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
+                'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
+                'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
+        ]:
+            if key not in results:
+                continue
+            if isinstance(results[key], list):
+                results[key] = [to_tensor(res) for res in results[key]]
+            else:
+                results[key] = to_tensor(results[key])
+        if 'gt_bboxes_3d' in results:
+            if not isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
+                results['gt_bboxes_3d'] = to_tensor(results['gt_bboxes_3d'])
+
+        if 'gt_semantic_seg' in results:
+            results['gt_semantic_seg'] = to_tensor(
+                results['gt_semantic_seg'][None])
+        if 'gt_seg_map' in results:
+            results['gt_seg_map'] = results['gt_seg_map'][None, ...]
+
+        data_sample = Det3DDataSample()
+        gt_instances_3d = InstanceData()
+        gt_instances = InstanceData()
+        gt_pts_seg = PointData()
+
+        data_metas = {}
+        for key in self.meta_keys:
+            if key in results:
+                data_metas[key] = results[key]
+            elif 'images' in results:
+                if len(results['images'].keys()) == 1:
+                    cam_type = list(results['images'].keys())[0]
+                    # single-view image
+                    if key in results['images'][cam_type]:
+                        data_metas[key] = results['images'][cam_type][key]
+                else:
+                    # multi-view image
+                    img_metas = []
+                    cam_types = list(results['images'].keys())
+                    for cam_type in cam_types:
+                        if key in results['images'][cam_type]:
+                            img_metas.append(results['images'][cam_type][key])
+                    if len(img_metas) > 0:
+                        data_metas[key] = img_metas
+            elif 'lidar_points' in results:
+                if key in results['lidar_points']:
+                    data_metas[key] = results['lidar_points'][key]
+        data_sample.set_metainfo(data_metas)
+
+        inputs = {}
+        for key in self.keys:
+            if key in results:
+                if key in self.INPUTS_KEYS:
+                    inputs[key] = results[key]
+                elif key in self.INSTANCEDATA_3D_KEYS:
+                    gt_instances_3d[self._remove_prefix(key)] = results[key]
+                elif key in self.INSTANCEDATA_2D_KEYS:
+                    if key == 'gt_bboxes_labels':
+                        gt_instances['labels'] = results[key]
+                    else:
+                        gt_instances[self._remove_prefix(key)] = results[key]
+                elif key in self.SEG_KEYS:
+                    gt_pts_seg[self._remove_prefix(key)] = results[key]
+                else:
+                    raise NotImplementedError(f'Please modified '
+                                              f'`Pack3DDetInputs` '
+                                              f'to put {key} to '
+                                              f'corresponding field')
+
+        data_sample.gt_instances_3d = gt_instances_3d
+        data_sample.gt_instances = gt_instances
+        data_sample.gt_pts_seg = gt_pts_seg
+        if 'eval_ann_info' in results:
+            data_sample.eval_ann_info = results['eval_ann_info']
+        else:
+            data_sample.eval_ann_info = None
+
+        packed_results = dict()
+        packed_results['data_samples'] = data_sample
+        packed_results['inputs'] = inputs
+
+        return packed_results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(keys={self.keys})'
+        repr_str += f'(meta_keys={self.meta_keys})'
+        return repr_str
diff --git a/mmde/mmdet3d/datasets/transforms/loading.py b/mmde/mmdet3d/datasets/transforms/loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..383c44536f2b94ee32d5ee7ddd11d6ac4c202645
--- /dev/null
+++ b/mmde/mmdet3d/datasets/transforms/loading.py
@@ -0,0 +1,1300 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import List, Optional, Union
+
+import mmcv
+import mmengine
+import numpy as np
+from mmcv.transforms import LoadImageFromFile
+from mmcv.transforms.base import BaseTransform
+from mmdet.datasets.transforms import LoadAnnotations
+from mmengine.fileio import get
+
+from mmdet3d.registry import TRANSFORMS
+from mmdet3d.structures.bbox_3d import get_box_type
+from mmdet3d.structures.points import BasePoints, get_points_type
+
+
+@TRANSFORMS.register_module()
+class LoadMultiViewImageFromFiles(BaseTransform):
+    """Load multi channel images from a list of separate channel files.
+
+    Expects results['img_filename'] to be a list of filenames.
+
+    Args:
+        to_float32 (bool): Whether to convert the img to float32.
+            Defaults to False.
+        color_type (str): Color type of the file. Defaults to 'unchanged'.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        num_views (int): Number of view in a frame. Defaults to 5.
+        num_ref_frames (int): Number of frame in loading. Defaults to -1.
+        test_mode (bool): Whether is test mode in loading. Defaults to False.
+        set_default_scale (bool): Whether to set default scale.
+            Defaults to True.
+    """
+
+    def __init__(self,
+                 to_float32: bool = False,
+                 color_type: str = 'unchanged',
+                 backend_args: Optional[dict] = None,
+                 num_views: int = 5,
+                 num_ref_frames: int = -1,
+                 test_mode: bool = False,
+                 set_default_scale: bool = True) -> None:
+        self.to_float32 = to_float32
+        self.color_type = color_type
+        self.backend_args = backend_args
+        self.num_views = num_views
+        # num_ref_frames is used for multi-sweep loading
+        self.num_ref_frames = num_ref_frames
+        # when test_mode=False, we randomly select previous frames
+        # otherwise, select the earliest one
+        self.test_mode = test_mode
+        self.set_default_scale = set_default_scale
+
+    def transform(self, results: dict) -> Optional[dict]:
+        """Call function to load multi-view image from files.
+
+        Args:
+            results (dict): Result dict containing multi-view image filenames.
+
+        Returns:
+            dict: The result dict containing the multi-view image data.
+            Added keys and values are described below.
+
+                - filename (str): Multi-view image filenames.
+                - img (np.ndarray): Multi-view image arrays.
+                - img_shape (tuple[int]): Shape of multi-view image arrays.
+                - ori_shape (tuple[int]): Shape of original image arrays.
+                - pad_shape (tuple[int]): Shape of padded image arrays.
+                - scale_factor (float): Scale factor.
+                - img_norm_cfg (dict): Normalization configuration of images.
+        """
+        # TODO: consider split the multi-sweep part out of this pipeline
+        # Derive the mask and transform for loading of multi-sweep data
+        if self.num_ref_frames > 0:
+            # init choice with the current frame
+            init_choice = np.array([0], dtype=np.int64)
+            num_frames = len(results['img_filename']) // self.num_views - 1
+            if num_frames == 0:  # no previous frame, then copy cur frames
+                choices = np.random.choice(
+                    1, self.num_ref_frames, replace=True)
+            elif num_frames >= self.num_ref_frames:
+                # NOTE: suppose the info is saved following the order
+                # from latest to earlier frames
+                if self.test_mode:
+                    choices = np.arange(num_frames - self.num_ref_frames,
+                                        num_frames) + 1
+                # NOTE: +1 is for selecting previous frames
+                else:
+                    choices = np.random.choice(
+                        num_frames, self.num_ref_frames, replace=False) + 1
+            elif num_frames > 0 and num_frames < self.num_ref_frames:
+                if self.test_mode:
+                    base_choices = np.arange(num_frames) + 1
+                    random_choices = np.random.choice(
+                        num_frames,
+                        self.num_ref_frames - num_frames,
+                        replace=True) + 1
+                    choices = np.concatenate([base_choices, random_choices])
+                else:
+                    choices = np.random.choice(
+                        num_frames, self.num_ref_frames, replace=True) + 1
+            else:
+                raise NotImplementedError
+            choices = np.concatenate([init_choice, choices])
+            select_filename = []
+            for choice in choices:
+                select_filename += results['img_filename'][choice *
+                                                           self.num_views:
+                                                           (choice + 1) *
+                                                           self.num_views]
+            results['img_filename'] = select_filename
+            for key in ['cam2img', 'lidar2cam']:
+                if key in results:
+                    select_results = []
+                    for choice in choices:
+                        select_results += results[key][choice *
+                                                       self.num_views:(choice +
+                                                                       1) *
+                                                       self.num_views]
+                    results[key] = select_results
+            for key in ['ego2global']:
+                if key in results:
+                    select_results = []
+                    for choice in choices:
+                        select_results += [results[key][choice]]
+                    results[key] = select_results
+            # Transform lidar2cam to
+            # [cur_lidar]2[prev_img] and [cur_lidar]2[prev_cam]
+            for key in ['lidar2cam']:
+                if key in results:
+                    # only change matrices of previous frames
+                    for choice_idx in range(1, len(choices)):
+                        pad_prev_ego2global = np.eye(4)
+                        prev_ego2global = results['ego2global'][choice_idx]
+                        pad_prev_ego2global[:prev_ego2global.
+                                            shape[0], :prev_ego2global.
+                                            shape[1]] = prev_ego2global
+                        pad_cur_ego2global = np.eye(4)
+                        cur_ego2global = results['ego2global'][0]
+                        pad_cur_ego2global[:cur_ego2global.
+                                           shape[0], :cur_ego2global.
+                                           shape[1]] = cur_ego2global
+                        cur2prev = np.linalg.inv(pad_prev_ego2global).dot(
+                            pad_cur_ego2global)
+                        for result_idx in range(choice_idx * self.num_views,
+                                                (choice_idx + 1) *
+                                                self.num_views):
+                            results[key][result_idx] = \
+                                results[key][result_idx].dot(cur2prev)
+        # Support multi-view images with different shapes
+        # TODO: record the origin shape and padded shape
+        filename, cam2img, lidar2cam = [], [], []
+        for _, cam_item in results['images'].items():
+            filename.append(cam_item['img_path'])
+            cam2img.append(cam_item['cam2img'])
+            lidar2cam.append(cam_item['lidar2cam'])
+        results['filename'] = filename
+        results['cam2img'] = cam2img
+        results['lidar2cam'] = lidar2cam
+
+        results['ori_cam2img'] = copy.deepcopy(results['cam2img'])
+
+        # img is of shape (h, w, c, num_views)
+        # h and w can be different for different views
+        img_bytes = [
+            get(name, backend_args=self.backend_args) for name in filename
+        ]
+        imgs = [
+            mmcv.imfrombytes(img_byte, flag=self.color_type)
+            for img_byte in img_bytes
+        ]
+        # handle the image with different shape
+        img_shapes = np.stack([img.shape for img in imgs], axis=0)
+        img_shape_max = np.max(img_shapes, axis=0)
+        img_shape_min = np.min(img_shapes, axis=0)
+        assert img_shape_min[-1] == img_shape_max[-1]
+        if not np.all(img_shape_max == img_shape_min):
+            pad_shape = img_shape_max[:2]
+        else:
+            pad_shape = None
+        if pad_shape is not None:
+            imgs = [
+                mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs
+            ]
+        img = np.stack(imgs, axis=-1)
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        results['filename'] = filename
+        # unravel to list, see `DefaultFormatBundle` in formating.py
+        # which will transpose each image separately and then stack into array
+        results['img'] = [img[..., i] for i in range(img.shape[-1])]
+        results['img_shape'] = img.shape[:2]
+        results['ori_shape'] = img.shape[:2]
+        # Set initial values for default meta_keys
+        results['pad_shape'] = img.shape[:2]
+        if self.set_default_scale:
+            results['scale_factor'] = 1.0
+        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+        results['img_norm_cfg'] = dict(
+            mean=np.zeros(num_channels, dtype=np.float32),
+            std=np.ones(num_channels, dtype=np.float32),
+            to_rgb=False)
+        results['num_views'] = self.num_views
+        results['num_ref_frames'] = self.num_ref_frames
+        return results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(to_float32={self.to_float32}, '
+        repr_str += f"color_type='{self.color_type}', "
+        repr_str += f'num_views={self.num_views}, '
+        repr_str += f'num_ref_frames={self.num_ref_frames}, '
+        repr_str += f'test_mode={self.test_mode})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class LoadImageFromFileMono3D(LoadImageFromFile):
+    """Load an image from file in monocular 3D object detection. Compared to 2D
+    detection, additional camera parameters need to be loaded.
+
+    Args:
+        kwargs (dict): Arguments are the same as those in
+            :class:`LoadImageFromFile`.
+    """
+
+    def transform(self, results: dict) -> dict:
+        """Call functions to load image and get image meta information.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet.CustomDataset`.
+
+        Returns:
+            dict: The dict contains loaded image and meta information.
+        """
+        # TODO: load different camera image from data info,
+        # for kitti dataset, we load 'CAM2' image.
+        # for nuscenes dataset, we load 'CAM_FRONT' image.
+
+        if 'CAM2' in results['images']:
+            filename = results['images']['CAM2']['img_path']
+            results['cam2img'] = results['images']['CAM2']['cam2img']
+        elif len(list(results['images'].keys())) == 1:
+            camera_type = list(results['images'].keys())[0]
+            filename = results['images'][camera_type]['img_path']
+            results['cam2img'] = results['images'][camera_type]['cam2img']
+        else:
+            raise NotImplementedError(
+                'Currently we only support load image from kitti and '
+                'nuscenes datasets')
+
+        try:
+            img_bytes = get(filename, backend_args=self.backend_args)
+            img = mmcv.imfrombytes(
+                img_bytes, flag=self.color_type, backend=self.imdecode_backend)
+        except Exception as e:
+            if self.ignore_empty:
+                return None
+            else:
+                raise e
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        results['img'] = img
+        results['img_shape'] = img.shape[:2]
+        results['ori_shape'] = img.shape[:2]
+
+        return results
+
+
+@TRANSFORMS.register_module()
+class LoadImageFromNDArray(LoadImageFromFile):
+    """Load an image from ``results['img']``.
+    Similar with :obj:`LoadImageFromFile`, but the image has been loaded as
+    :obj:`np.ndarray` in ``results['img']``. Can be used when loading image
+    from webcam.
+    Required Keys:
+    - img
+    Modified Keys:
+    - img
+    - img_path
+    - img_shape
+    - ori_shape
+    Args:
+        to_float32 (bool): Whether to convert the loaded image to a float32
+            numpy array. If set to False, the loaded image is an uint8 array.
+            Defaults to False.
+    """
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to add image meta information.
+
+        Args:
+            results (dict): Result dict with Webcam read image in
+                ``results['img']``.
+        Returns:
+            dict: The dict contains loaded image and meta information.
+        """
+
+        img = results['img']
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        results['img_path'] = None
+        results['img'] = img
+        results['img_shape'] = img.shape[:2]
+        results['ori_shape'] = img.shape[:2]
+        return results
+
+
+@TRANSFORMS.register_module()
+class LoadPointsFromMultiSweeps(BaseTransform):
+    """Load points from multiple sweeps.
+
+    This is usually used for nuScenes dataset to utilize previous sweeps.
+
+    Args:
+        sweeps_num (int): Number of sweeps. Defaults to 10.
+        load_dim (int): Dimension number of the loaded points. Defaults to 5.
+        use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4].
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        pad_empty_sweeps (bool): Whether to repeat keyframe when
+            sweeps is empty. Defaults to False.
+        remove_close (bool): Whether to remove close points. Defaults to False.
+        test_mode (bool): If `test_mode=True`, it will not randomly sample
+            sweeps but select the nearest N frames. Defaults to False.
+    """
+
+    def __init__(self,
+                 sweeps_num: int = 10,
+                 load_dim: int = 5,
+                 use_dim: List[int] = [0, 1, 2, 4],
+                 backend_args: Optional[dict] = None,
+                 pad_empty_sweeps: bool = False,
+                 remove_close: bool = False,
+                 test_mode: bool = False) -> None:
+        self.load_dim = load_dim
+        self.sweeps_num = sweeps_num
+        if isinstance(use_dim, int):
+            use_dim = list(range(use_dim))
+        assert max(use_dim) < load_dim, \
+            f'Expect all used dimensions < {load_dim}, got {use_dim}'
+        self.use_dim = use_dim
+        self.backend_args = backend_args
+        self.pad_empty_sweeps = pad_empty_sweeps
+        self.remove_close = remove_close
+        self.test_mode = test_mode
+
+    def _load_points(self, pts_filename: str) -> np.ndarray:
+        """Private function to load point clouds data.
+
+        Args:
+            pts_filename (str): Filename of point clouds data.
+
+        Returns:
+            np.ndarray: An array containing point clouds data.
+        """
+        try:
+            pts_bytes = get(pts_filename, backend_args=self.backend_args)
+            points = np.frombuffer(pts_bytes, dtype=np.float32)
+        except ConnectionError:
+            mmengine.check_file_exist(pts_filename)
+            if pts_filename.endswith('.npy'):
+                points = np.load(pts_filename)
+            else:
+                points = np.fromfile(pts_filename, dtype=np.float32)
+        return points
+
+    def _remove_close(self,
+                      points: Union[np.ndarray, BasePoints],
+                      radius: float = 1.0) -> Union[np.ndarray, BasePoints]:
+        """Remove point too close within a certain radius from origin.
+
+        Args:
+            points (np.ndarray | :obj:`BasePoints`): Sweep points.
+            radius (float): Radius below which points are removed.
+                Defaults to 1.0.
+
+        Returns:
+            np.ndarray | :obj:`BasePoints`: Points after removing.
+        """
+        if isinstance(points, np.ndarray):
+            points_numpy = points
+        elif isinstance(points, BasePoints):
+            points_numpy = points.numpy()
+        else:
+            raise NotImplementedError
+        x_filt = np.abs(points_numpy[:, 0]) < radius
+        y_filt = np.abs(points_numpy[:, 1]) < radius
+        not_close = np.logical_not(np.logical_and(x_filt, y_filt))
+        return points[not_close]
+
+    def transform(self, results: dict) -> dict:
+        """Call function to load multi-sweep point clouds from files.
+
+        Args:
+            results (dict): Result dict containing multi-sweep point cloud
+                filenames.
+
+        Returns:
+            dict: The result dict containing the multi-sweep points data.
+            Updated key and value are described below.
+
+                - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point
+                  cloud arrays.
+        """
+        points = results['points']
+        points.tensor[:, 4] = 0
+        sweep_points_list = [points]
+        ts = results['timestamp']
+        if 'lidar_sweeps' not in results:
+            if self.pad_empty_sweeps:
+                for i in range(self.sweeps_num):
+                    if self.remove_close:
+                        sweep_points_list.append(self._remove_close(points))
+                    else:
+                        sweep_points_list.append(points)
+        else:
+            if len(results['lidar_sweeps']) <= self.sweeps_num:
+                choices = np.arange(len(results['lidar_sweeps']))
+            elif self.test_mode:
+                choices = np.arange(self.sweeps_num)
+            else:
+                choices = np.random.choice(
+                    len(results['lidar_sweeps']),
+                    self.sweeps_num,
+                    replace=False)
+            for idx in choices:
+                sweep = results['lidar_sweeps'][idx]
+                points_sweep = self._load_points(
+                    sweep['lidar_points']['lidar_path'])
+                points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim)
+                if self.remove_close:
+                    points_sweep = self._remove_close(points_sweep)
+                # bc-breaking: Timestamp has divided 1e6 in pkl infos.
+                sweep_ts = sweep['timestamp']
+                lidar2sensor = np.array(sweep['lidar_points']['lidar2sensor'])
+                points_sweep[:, :
+                             3] = points_sweep[:, :3] @ lidar2sensor[:3, :3]
+                points_sweep[:, :3] -= lidar2sensor[:3, 3]
+                points_sweep[:, 4] = ts - sweep_ts
+                points_sweep = points.new_point(points_sweep)
+                sweep_points_list.append(points_sweep)
+
+        points = points.cat(sweep_points_list)
+        points = points[:, self.use_dim]
+        results['points'] = points
+        return results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        return f'{self.__class__.__name__}(sweeps_num={self.sweeps_num})'
+
+
+@TRANSFORMS.register_module()
+class PointSegClassMapping(BaseTransform):
+    """Map original semantic class to valid category ids.
+
+    Required Keys:
+
+    - seg_label_mapping (np.ndarray)
+    - pts_semantic_mask (np.ndarray)
+
+    Added Keys:
+
+    - points (np.float32)
+
+    Map valid classes as 0~len(valid_cat_ids)-1 and
+    others as len(valid_cat_ids).
+    """
+
+    def transform(self, results: dict) -> dict:
+        """Call function to map original semantic class to valid category ids.
+
+        Args:
+            results (dict): Result dict containing point semantic masks.
+
+        Returns:
+            dict: The result dict containing the mapped category ids.
+            Updated key and value are described below.
+
+                - pts_semantic_mask (np.ndarray): Mapped semantic masks.
+        """
+        assert 'pts_semantic_mask' in results
+        pts_semantic_mask = results['pts_semantic_mask']
+
+        assert 'seg_label_mapping' in results
+        label_mapping = results['seg_label_mapping']
+        converted_pts_sem_mask = label_mapping[pts_semantic_mask]
+
+        results['pts_semantic_mask'] = converted_pts_sem_mask
+
+        # 'eval_ann_info' will be passed to evaluator
+        if 'eval_ann_info' in results:
+            assert 'pts_semantic_mask' in results['eval_ann_info']
+            results['eval_ann_info']['pts_semantic_mask'] = \
+                converted_pts_sem_mask
+
+        return results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class NormalizePointsColor(BaseTransform):
+    """Normalize color of points.
+
+    Args:
+        color_mean (list[float]): Mean color of the point cloud.
+    """
+
+    def __init__(self, color_mean: List[float]) -> None:
+        self.color_mean = color_mean
+
+    def transform(self, input_dict: dict) -> dict:
+        """Call function to normalize color of points.
+
+        Args:
+            results (dict): Result dict containing point clouds data.
+
+        Returns:
+            dict: The result dict containing the normalized points.
+            Updated key and value are described below.
+
+                - points (:obj:`BasePoints`): Points after color normalization.
+        """
+        points = input_dict['points']
+        assert points.attribute_dims is not None and \
+               'color' in points.attribute_dims.keys(), \
+               'Expect points have color attribute'
+        if self.color_mean is not None:
+            points.color = points.color - \
+                           points.color.new_tensor(self.color_mean)
+        points.color = points.color / 255.0
+        input_dict['points'] = points
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(color_mean={self.color_mean})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class LoadPointsFromFile(BaseTransform):
+    """Load Points From File.
+
+    Required Keys:
+
+    - lidar_points (dict)
+
+        - lidar_path (str)
+
+    Added Keys:
+
+    - points (np.float32)
+
+    Args:
+        coord_type (str): The type of coordinates of points cloud.
+            Available options includes:
+
+            - 'LIDAR': Points in LiDAR coordinates.
+            - 'DEPTH': Points in depth coordinates, usually for indoor dataset.
+            - 'CAMERA': Points in camera coordinates.
+        load_dim (int): The dimension of the loaded points. Defaults to 6.
+        use_dim (list[int] | int): Which dimensions of the points to use.
+            Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4
+            or use_dim=[0, 1, 2, 3] to use the intensity dimension.
+        shift_height (bool): Whether to use shifted height. Defaults to False.
+        use_color (bool): Whether to use color features. Defaults to False.
+        norm_intensity (bool): Whether to normlize the intensity. Defaults to
+            False.
+        norm_elongation (bool): Whether to normlize the elongation. This is
+            usually used in Waymo dataset.Defaults to False.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(self,
+                 coord_type: str,
+                 load_dim: int = 6,
+                 use_dim: Union[int, List[int]] = [0, 1, 2],
+                 shift_height: bool = False,
+                 use_color: bool = False,
+                 norm_intensity: bool = False,
+                 norm_elongation: bool = False,
+                 backend_args: Optional[dict] = None) -> None:
+        self.shift_height = shift_height
+        self.use_color = use_color
+        if isinstance(use_dim, int):
+            use_dim = list(range(use_dim))
+        assert max(use_dim) < load_dim, \
+            f'Expect all used dimensions < {load_dim}, got {use_dim}'
+        assert coord_type in ['CAMERA', 'LIDAR', 'DEPTH']
+
+        self.coord_type = coord_type
+        self.load_dim = load_dim
+        self.use_dim = use_dim
+        self.norm_intensity = norm_intensity
+        self.norm_elongation = norm_elongation
+        self.backend_args = backend_args
+
+    def _load_points(self, pts_filename: str) -> np.ndarray:
+        """Private function to load point clouds data.
+
+        Args:
+            pts_filename (str): Filename of point clouds data.
+
+        Returns:
+            np.ndarray: An array containing point clouds data.
+        """
+        try:
+            pts_bytes = get(pts_filename, backend_args=self.backend_args)
+            points = np.frombuffer(pts_bytes, dtype=np.float32)
+        except ConnectionError:
+            mmengine.check_file_exist(pts_filename)
+            if pts_filename.endswith('.npy'):
+                points = np.load(pts_filename)
+            else:
+                points = np.fromfile(pts_filename, dtype=np.float32)
+
+        return points
+
+    def transform(self, results: dict) -> dict:
+        """Method to load points data from file.
+
+        Args:
+            results (dict): Result dict containing point clouds data.
+
+        Returns:
+            dict: The result dict containing the point clouds data.
+            Added key and value are described below.
+
+                - points (:obj:`BasePoints`): Point clouds data.
+        """
+        pts_file_path = results['lidar_points']['lidar_path']
+        points = self._load_points(pts_file_path)
+        points = points.reshape(-1, self.load_dim)
+        points = points[:, self.use_dim]
+        if self.norm_intensity:
+            assert len(self.use_dim) >= 4, \
+                f'When using intensity norm, expect used dimensions >= 4, got {len(self.use_dim)}'  # noqa: E501
+            points[:, 3] = np.tanh(points[:, 3])
+        if self.norm_elongation:
+            assert len(self.use_dim) >= 5, \
+                f'When using elongation norm, expect used dimensions >= 5, got {len(self.use_dim)}'  # noqa: E501
+            points[:, 4] = np.tanh(points[:, 4])
+        attribute_dims = None
+
+        if self.shift_height:
+            floor_height = np.percentile(points[:, 2], 0.99)
+            height = points[:, 2] - floor_height
+            points = np.concatenate(
+                [points[:, :3],
+                 np.expand_dims(height, 1), points[:, 3:]], 1)
+            attribute_dims = dict(height=3)
+
+        if self.use_color:
+            assert len(self.use_dim) >= 6
+            if attribute_dims is None:
+                attribute_dims = dict()
+            attribute_dims.update(
+                dict(color=[
+                    points.shape[1] - 3,
+                    points.shape[1] - 2,
+                    points.shape[1] - 1,
+                ]))
+
+        points_class = get_points_type(self.coord_type)
+        points = points_class(
+            points, points_dim=points.shape[-1], attribute_dims=attribute_dims)
+        results['points'] = points
+
+        return results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__ + '('
+        repr_str += f'shift_height={self.shift_height}, '
+        repr_str += f'use_color={self.use_color}, '
+        repr_str += f'backend_args={self.backend_args}, '
+        repr_str += f'load_dim={self.load_dim}, '
+        repr_str += f'use_dim={self.use_dim})'
+        repr_str += f'norm_intensity={self.norm_intensity})'
+        repr_str += f'norm_elongation={self.norm_elongation})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class LoadPointsFromDict(LoadPointsFromFile):
+    """Load Points From Dict."""
+
+    def transform(self, results: dict) -> dict:
+        """Convert the type of points from ndarray to corresponding
+        `point_class`.
+
+        Args:
+            results (dict): input result. The value of key `points` is a
+                numpy array.
+
+        Returns:
+            dict: The processed results.
+        """
+        assert 'points' in results
+        points = results['points']
+
+        if self.norm_intensity:
+            assert len(self.use_dim) >= 4, \
+                f'When using intensity norm, expect used dimensions >= 4, got {len(self.use_dim)}'  # noqa: E501
+            points[:, 3] = np.tanh(points[:, 3])
+        attribute_dims = None
+
+        if self.shift_height:
+            floor_height = np.percentile(points[:, 2], 0.99)
+            height = points[:, 2] - floor_height
+            points = np.concatenate(
+                [points[:, :3],
+                 np.expand_dims(height, 1), points[:, 3:]], 1)
+            attribute_dims = dict(height=3)
+
+        if self.use_color:
+            assert len(self.use_dim) >= 6
+            if attribute_dims is None:
+                attribute_dims = dict()
+            attribute_dims.update(
+                dict(color=[
+                    points.shape[1] - 3,
+                    points.shape[1] - 2,
+                    points.shape[1] - 1,
+                ]))
+
+        points_class = get_points_type(self.coord_type)
+        points = points_class(
+            points, points_dim=points.shape[-1], attribute_dims=attribute_dims)
+        results['points'] = points
+        return results
+
+
+@TRANSFORMS.register_module()
+class LoadAnnotations3D(LoadAnnotations):
+    """Load Annotations3D.
+
+    Load instance mask and semantic mask of points and
+    encapsulate the items into related fields.
+
+    Required Keys:
+
+    - ann_info (dict)
+
+        - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes` |
+          :obj:`DepthInstance3DBoxes` | :obj:`CameraInstance3DBoxes`):
+          3D ground truth bboxes. Only when `with_bbox_3d` is True
+        - gt_labels_3d (np.int64): Labels of ground truths.
+          Only when `with_label_3d` is True.
+        - gt_bboxes (np.float32): 2D ground truth bboxes.
+          Only when `with_bbox` is True.
+        - gt_labels (np.ndarray): Labels of ground truths.
+          Only when `with_label` is True.
+        - depths (np.ndarray): Only when
+          `with_bbox_depth` is True.
+        - centers_2d (np.ndarray): Only when
+          `with_bbox_depth` is True.
+        - attr_labels (np.ndarray): Attribute labels of instances.
+          Only when `with_attr_label` is True.
+
+    - pts_instance_mask_path (str): Path of instance mask file.
+      Only when `with_mask_3d` is True.
+    - pts_semantic_mask_path (str): Path of semantic mask file.
+      Only when `with_seg_3d` is True.
+    - pts_panoptic_mask_path (str): Path of panoptic mask file.
+      Only when both `with_panoptic_3d` is True.
+
+    Added Keys:
+
+    - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes` |
+      :obj:`DepthInstance3DBoxes` | :obj:`CameraInstance3DBoxes`):
+      3D ground truth bboxes. Only when `with_bbox_3d` is True
+    - gt_labels_3d (np.int64): Labels of ground truths.
+      Only when `with_label_3d` is True.
+    - gt_bboxes (np.float32): 2D ground truth bboxes.
+      Only when `with_bbox` is True.
+    - gt_labels (np.int64): Labels of ground truths.
+      Only when `with_label` is True.
+    - depths (np.float32): Only when
+      `with_bbox_depth` is True.
+    - centers_2d (np.ndarray): Only when
+      `with_bbox_depth` is True.
+    - attr_labels (np.int64): Attribute labels of instances.
+      Only when `with_attr_label` is True.
+    - pts_instance_mask (np.int64): Instance mask of each point.
+      Only when `with_mask_3d` is True.
+    - pts_semantic_mask (np.int64): Semantic mask of each point.
+      Only when `with_seg_3d` is True.
+
+    Args:
+        with_bbox_3d (bool): Whether to load 3D boxes. Defaults to True.
+        with_label_3d (bool): Whether to load 3D labels. Defaults to True.
+        with_attr_label (bool): Whether to load attribute label.
+            Defaults to False.
+        with_mask_3d (bool): Whether to load 3D instance masks for points.
+            Defaults to False.
+        with_seg_3d (bool): Whether to load 3D semantic masks for points.
+            Defaults to False.
+        with_bbox (bool): Whether to load 2D boxes. Defaults to False.
+        with_label (bool): Whether to load 2D labels. Defaults to False.
+        with_mask (bool): Whether to load 2D instance masks. Defaults to False.
+        with_seg (bool): Whether to load 2D semantic masks. Defaults to False.
+        with_bbox_depth (bool): Whether to load 2.5D boxes. Defaults to False.
+        with_panoptic_3d (bool): Whether to load 3D panoptic masks for points.
+            Defaults to False.
+        poly2mask (bool): Whether to convert polygon annotations to bitmasks.
+            Defaults to True.
+        seg_3d_dtype (str): String of dtype of 3D semantic masks.
+            Defaults to 'np.int64'.
+        seg_offset (int): The offset to split semantic and instance labels from
+            panoptic labels. Defaults to None.
+        dataset_type (str): Type of dataset used for splitting semantic and
+            instance labels. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(self,
+                 with_bbox_3d: bool = True,
+                 with_label_3d: bool = True,
+                 with_attr_label: bool = False,
+                 with_mask_3d: bool = False,
+                 with_seg_3d: bool = False,
+                 with_bbox: bool = False,
+                 with_label: bool = False,
+                 with_mask: bool = False,
+                 with_seg: bool = False,
+                 with_bbox_depth: bool = False,
+                 with_panoptic_3d: bool = False,
+                 poly2mask: bool = True,
+                 seg_3d_dtype: str = 'np.int64',
+                 seg_offset: int = None,
+                 dataset_type: str = None,
+                 backend_args: Optional[dict] = None) -> None:
+        super().__init__(
+            with_bbox=with_bbox,
+            with_label=with_label,
+            with_mask=with_mask,
+            with_seg=with_seg,
+            poly2mask=poly2mask,
+            backend_args=backend_args)
+        self.with_bbox_3d = with_bbox_3d
+        self.with_bbox_depth = with_bbox_depth
+        self.with_label_3d = with_label_3d
+        self.with_attr_label = with_attr_label
+        self.with_mask_3d = with_mask_3d
+        self.with_seg_3d = with_seg_3d
+        self.with_panoptic_3d = with_panoptic_3d
+        self.seg_3d_dtype = eval(seg_3d_dtype)
+        self.seg_offset = seg_offset
+        self.dataset_type = dataset_type
+
+    def _load_bboxes_3d(self, results: dict) -> dict:
+        """Private function to move the 3D bounding box annotation from
+        `ann_info` field to the root of `results`.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
+
+        Returns:
+            dict: The dict containing loaded 3D bounding box annotations.
+        """
+
+        results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d']
+        return results
+
+    def _load_bboxes_depth(self, results: dict) -> dict:
+        """Private function to load 2.5D bounding box annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
+
+        Returns:
+            dict: The dict containing loaded 2.5D bounding box annotations.
+        """
+
+        results['depths'] = results['ann_info']['depths']
+        results['centers_2d'] = results['ann_info']['centers_2d']
+        return results
+
+    def _load_labels_3d(self, results: dict) -> dict:
+        """Private function to load label annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
+
+        Returns:
+            dict: The dict containing loaded label annotations.
+        """
+
+        results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
+        return results
+
+    def _load_attr_labels(self, results: dict) -> dict:
+        """Private function to load label annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
+
+        Returns:
+            dict: The dict containing loaded label annotations.
+        """
+        results['attr_labels'] = results['ann_info']['attr_labels']
+        return results
+
+    def _load_masks_3d(self, results: dict) -> dict:
+        """Private function to load 3D mask annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
+
+        Returns:
+            dict: The dict containing loaded 3D mask annotations.
+        """
+        pts_instance_mask_path = results['pts_instance_mask_path']
+
+        try:
+            mask_bytes = get(
+                pts_instance_mask_path, backend_args=self.backend_args)
+            pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int64)
+        except ConnectionError:
+            mmengine.check_file_exist(pts_instance_mask_path)
+            pts_instance_mask = np.fromfile(
+                pts_instance_mask_path, dtype=np.int64)
+
+        results['pts_instance_mask'] = pts_instance_mask
+        # 'eval_ann_info' will be passed to evaluator
+        if 'eval_ann_info' in results:
+            results['eval_ann_info']['pts_instance_mask'] = pts_instance_mask
+        return results
+
+    def _load_semantic_seg_3d(self, results: dict) -> dict:
+        """Private function to load 3D semantic segmentation annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
+
+        Returns:
+            dict: The dict containing the semantic segmentation annotations.
+        """
+        pts_semantic_mask_path = results['pts_semantic_mask_path']
+
+        try:
+            mask_bytes = get(
+                pts_semantic_mask_path, backend_args=self.backend_args)
+            # add .copy() to fix read-only bug
+            pts_semantic_mask = np.frombuffer(
+                mask_bytes, dtype=self.seg_3d_dtype).copy()
+        except ConnectionError:
+            mmengine.check_file_exist(pts_semantic_mask_path)
+            pts_semantic_mask = np.fromfile(
+                pts_semantic_mask_path, dtype=np.int64)
+
+        if self.dataset_type == 'semantickitti':
+            pts_semantic_mask = pts_semantic_mask.astype(np.int64)
+            pts_semantic_mask = pts_semantic_mask % self.seg_offset
+        # nuScenes loads semantic and panoptic labels from different files.
+
+        results['pts_semantic_mask'] = pts_semantic_mask
+
+        # 'eval_ann_info' will be passed to evaluator
+        if 'eval_ann_info' in results:
+            results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
+        return results
+
+    def _load_panoptic_3d(self, results: dict) -> dict:
+        """Private function to load 3D panoptic segmentation annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
+
+        Returns:
+            dict: The dict containing the panoptic segmentation annotations.
+        """
+        pts_panoptic_mask_path = results['pts_panoptic_mask_path']
+
+        try:
+            mask_bytes = get(
+                pts_panoptic_mask_path, backend_args=self.backend_args)
+            # add .copy() to fix read-only bug
+            pts_panoptic_mask = np.frombuffer(
+                mask_bytes, dtype=self.seg_3d_dtype).copy()
+        except ConnectionError:
+            mmengine.check_file_exist(pts_panoptic_mask_path)
+            pts_panoptic_mask = np.fromfile(
+                pts_panoptic_mask_path, dtype=np.int64)
+
+        if self.dataset_type == 'semantickitti':
+            pts_semantic_mask = pts_panoptic_mask.astype(np.int64)
+            pts_semantic_mask = pts_semantic_mask % self.seg_offset
+        elif self.dataset_type == 'nuscenes':
+            pts_semantic_mask = pts_semantic_mask // self.seg_offset
+
+        results['pts_semantic_mask'] = pts_semantic_mask
+
+        # We can directly take panoptic labels as instance ids.
+        pts_instance_mask = pts_panoptic_mask.astype(np.int64)
+        results['pts_instance_mask'] = pts_instance_mask
+
+        # 'eval_ann_info' will be passed to evaluator
+        if 'eval_ann_info' in results:
+            results['eval_ann_info']['pts_semantic_mask'] = pts_semantic_mask
+            results['eval_ann_info']['pts_instance_mask'] = pts_instance_mask
+        return results
+
+    def _load_bboxes(self, results: dict) -> None:
+        """Private function to load bounding box annotations.
+
+        The only difference is it remove the proceess for
+        `ignore_flag`
+
+        Args:
+            results (dict): Result dict from :obj:`mmcv.BaseDataset`.
+
+        Returns:
+            dict: The dict contains loaded bounding box annotations.
+        """
+
+        results['gt_bboxes'] = results['ann_info']['gt_bboxes']
+
+    def _load_labels(self, results: dict) -> None:
+        """Private function to load label annotations.
+
+        Args:
+            results (dict): Result dict from :obj :obj:`mmcv.BaseDataset`.
+
+        Returns:
+            dict: The dict contains loaded label annotations.
+        """
+        results['gt_bboxes_labels'] = results['ann_info']['gt_bboxes_labels']
+
+    def transform(self, results: dict) -> dict:
+        """Function to load multiple types annotations.
+
+        Args:
+            results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
+
+        Returns:
+            dict: The dict containing loaded 3D bounding box, label, mask and
+            semantic segmentation annotations.
+        """
+        results = super().transform(results)
+        if self.with_bbox_3d:
+            results = self._load_bboxes_3d(results)
+        if self.with_bbox_depth:
+            results = self._load_bboxes_depth(results)
+        if self.with_label_3d:
+            results = self._load_labels_3d(results)
+        if self.with_attr_label:
+            results = self._load_attr_labels(results)
+        if self.with_panoptic_3d:
+            results = self._load_panoptic_3d(results)
+        if self.with_mask_3d:
+            results = self._load_masks_3d(results)
+        if self.with_seg_3d:
+            results = self._load_semantic_seg_3d(results)
+        return results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        indent_str = '    '
+        repr_str = self.__class__.__name__ + '(\n'
+        repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, '
+        repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, '
+        repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, '
+        repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, '
+        repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, '
+        repr_str += f'{indent_str}with_panoptic_3d={self.with_panoptic_3d}, '
+        repr_str += f'{indent_str}with_bbox={self.with_bbox}, '
+        repr_str += f'{indent_str}with_label={self.with_label}, '
+        repr_str += f'{indent_str}with_mask={self.with_mask}, '
+        repr_str += f'{indent_str}with_seg={self.with_seg}, '
+        repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, '
+        repr_str += f'{indent_str}poly2mask={self.poly2mask})'
+        repr_str += f'{indent_str}seg_offset={self.seg_offset})'
+
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class LidarDet3DInferencerLoader(BaseTransform):
+    """Load point cloud in the Inferencer's pipeline.
+
+    Added keys:
+      - points
+      - timestamp
+      - axis_align_matrix
+      - box_type_3d
+      - box_mode_3d
+    """
+
+    def __init__(self, coord_type='LIDAR', **kwargs) -> None:
+        super().__init__()
+        self.from_file = TRANSFORMS.build(
+            dict(type='LoadPointsFromFile', coord_type=coord_type, **kwargs))
+        self.from_ndarray = TRANSFORMS.build(
+            dict(type='LoadPointsFromDict', coord_type=coord_type, **kwargs))
+        self.box_type_3d, self.box_mode_3d = get_box_type(coord_type)
+
+    def transform(self, single_input: dict) -> dict:
+        """Transform function to add image meta information.
+        Args:
+            single_input (dict): Single input.
+
+        Returns:
+            dict: The dict contains loaded image and meta information.
+        """
+        assert 'points' in single_input, "key 'points' must be in input dict"
+        if isinstance(single_input['points'], str):
+            inputs = dict(
+                lidar_points=dict(lidar_path=single_input['points']),
+                timestamp=1,
+                # for ScanNet demo we need axis_align_matrix
+                axis_align_matrix=np.eye(4),
+                box_type_3d=self.box_type_3d,
+                box_mode_3d=self.box_mode_3d)
+        elif isinstance(single_input['points'], np.ndarray):
+            inputs = dict(
+                points=single_input['points'],
+                timestamp=1,
+                # for ScanNet demo we need axis_align_matrix
+                axis_align_matrix=np.eye(4),
+                box_type_3d=self.box_type_3d,
+                box_mode_3d=self.box_mode_3d)
+        else:
+            raise ValueError('Unsupported input points type: '
+                             f"{type(single_input['points'])}")
+
+        if 'points' in inputs:
+            return self.from_ndarray(inputs)
+        return self.from_file(inputs)
+
+
+@TRANSFORMS.register_module()
+class MonoDet3DInferencerLoader(BaseTransform):
+    """Load an image from ``results['images']['CAMX']['img']``. Similar with
+    :obj:`LoadImageFromFileMono3D`, but the image has been loaded as
+    :obj:`np.ndarray` in ``results['images']['CAMX']['img']``.
+
+    Added keys:
+      - img
+      - box_type_3d
+      - box_mode_3d
+
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__()
+        self.from_file = TRANSFORMS.build(
+            dict(type='LoadImageFromFileMono3D', **kwargs))
+        self.from_ndarray = TRANSFORMS.build(
+            dict(type='LoadImageFromNDArray', **kwargs))
+
+    def transform(self, single_input: dict) -> dict:
+        """Transform function to add image meta information.
+
+        Args:
+            single_input (dict): Result dict with Webcam read image in
+                ``results['images']['CAMX']['img']``.
+        Returns:
+            dict: The dict contains loaded image and meta information.
+        """
+        box_type_3d, box_mode_3d = get_box_type('camera')
+
+        if isinstance(single_input['img'], str):
+            inputs = dict(
+                images=dict(
+                    CAM_FRONT=dict(
+                        img_path=single_input['img'],
+                        cam2img=single_input['cam2img'])),
+                box_mode_3d=box_mode_3d,
+                box_type_3d=box_type_3d)
+        elif isinstance(single_input['img'], np.ndarray):
+            inputs = dict(
+                img=single_input['img'],
+                cam2img=single_input['cam2img'],
+                box_type_3d=box_type_3d,
+                box_mode_3d=box_mode_3d)
+        else:
+            raise ValueError('Unsupported input image type: '
+                             f"{type(single_input['img'])}")
+
+        if 'img' in inputs:
+            return self.from_ndarray(inputs)
+        return self.from_file(inputs)
+
+
+@TRANSFORMS.register_module()
+class MultiModalityDet3DInferencerLoader(BaseTransform):
+    """Load point cloud and image in the Inferencer's pipeline.
+
+    Added keys:
+      - points
+      - img
+      - cam2img
+      - lidar2cam
+      - lidar2img
+      - timestamp
+      - axis_align_matrix
+      - box_type_3d
+      - box_mode_3d
+    """
+
+    def __init__(self, load_point_args: dict, load_img_args: dict) -> None:
+        super().__init__()
+        self.points_from_file = TRANSFORMS.build(
+            dict(type='LoadPointsFromFile', **load_point_args))
+        self.points_from_ndarray = TRANSFORMS.build(
+            dict(type='LoadPointsFromDict', **load_point_args))
+        coord_type = load_point_args['coord_type']
+        self.box_type_3d, self.box_mode_3d = get_box_type(coord_type)
+
+        self.imgs_from_file = TRANSFORMS.build(
+            dict(type='LoadImageFromFile', **load_img_args))
+        self.imgs_from_ndarray = TRANSFORMS.build(
+            dict(type='LoadImageFromNDArray', **load_img_args))
+
+    def transform(self, single_input: dict) -> dict:
+        """Transform function to add image meta information.
+        Args:
+            single_input (dict): Single input.
+
+        Returns:
+            dict: The dict contains loaded image, point cloud and meta
+            information.
+        """
+        assert 'points' in single_input and 'img' in single_input, \
+            "key 'points', 'img' and must be in input dict," \
+            f'but got {single_input}'
+        if isinstance(single_input['points'], str):
+            inputs = dict(
+                lidar_points=dict(lidar_path=single_input['points']),
+                timestamp=1,
+                # for ScanNet demo we need axis_align_matrix
+                axis_align_matrix=np.eye(4),
+                box_type_3d=self.box_type_3d,
+                box_mode_3d=self.box_mode_3d)
+        elif isinstance(single_input['points'], np.ndarray):
+            inputs = dict(
+                points=single_input['points'],
+                timestamp=1,
+                # for ScanNet demo we need axis_align_matrix
+                axis_align_matrix=np.eye(4),
+                box_type_3d=self.box_type_3d,
+                box_mode_3d=self.box_mode_3d)
+        else:
+            raise ValueError('Unsupported input points type: '
+                             f"{type(single_input['points'])}")
+
+        if 'points' in inputs:
+            points_inputs = self.points_from_ndarray(inputs)
+        else:
+            points_inputs = self.points_from_file(inputs)
+
+        multi_modality_inputs = points_inputs
+
+        box_type_3d, box_mode_3d = get_box_type('lidar')
+
+        if isinstance(single_input['img'], str):
+            inputs = dict(
+                img_path=single_input['img'],
+                cam2img=single_input['cam2img'],
+                lidar2img=single_input['lidar2img'],
+                lidar2cam=single_input['lidar2cam'],
+                box_mode_3d=box_mode_3d,
+                box_type_3d=box_type_3d)
+        elif isinstance(single_input['img'], np.ndarray):
+            inputs = dict(
+                img=single_input['img'],
+                cam2img=single_input['cam2img'],
+                lidar2img=single_input['lidar2img'],
+                lidar2cam=single_input['lidar2cam'],
+                box_type_3d=box_type_3d,
+                box_mode_3d=box_mode_3d)
+        else:
+            raise ValueError('Unsupported input image type: '
+                             f"{type(single_input['img'])}")
+
+        if isinstance(single_input['img'], np.ndarray):
+            imgs_inputs = self.imgs_from_ndarray(inputs)
+        else:
+            imgs_inputs = self.imgs_from_file(inputs)
+
+        multi_modality_inputs.update(imgs_inputs)
+
+        return multi_modality_inputs
diff --git a/mmde/mmdet3d/datasets/transforms/test_time_aug.py b/mmde/mmdet3d/datasets/transforms/test_time_aug.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aea7a86e9e19a42d29d861e4b43b167fd402d2c
--- /dev/null
+++ b/mmde/mmdet3d/datasets/transforms/test_time_aug.py
@@ -0,0 +1,121 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from copy import deepcopy
+from typing import Dict, List, Optional, Tuple, Union
+
+import mmengine
+from mmcv import BaseTransform
+from mmengine.dataset import Compose
+
+from mmdet3d.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class MultiScaleFlipAug3D(BaseTransform):
+    """Test-time augmentation with multiple scales and flipping.
+
+    Args:
+        transforms (list[dict]): Transforms to apply in each augmentation.
+        img_scale (tuple | list[tuple]): Images scales for resizing.
+        pts_scale_ratio (float | list[float]): Points scale ratios for
+            resizing.
+        flip (bool): Whether apply flip augmentation. Defaults to False.
+        flip_direction (str | list[str]): Flip augmentation directions
+            for images, options are "horizontal" and "vertical".
+            If flip_direction is list, multiple flip augmentations will
+            be applied. It has no effect when ``flip == False``.
+            Defaults to 'horizontal'.
+        pcd_horizontal_flip (bool): Whether to apply horizontal flip
+            augmentation to point cloud. Defaults to False.
+            Note that it works only when 'flip' is turned on.
+        pcd_vertical_flip (bool): Whether to apply vertical flip
+            augmentation to point cloud. Defaults to False.
+            Note that it works only when 'flip' is turned on.
+    """
+
+    def __init__(self,
+                 transforms: List[dict],
+                 img_scale: Optional[Union[Tuple[int], List[Tuple[int]]]],
+                 pts_scale_ratio: Union[float, List[float]],
+                 flip: bool = False,
+                 flip_direction: str = 'horizontal',
+                 pcd_horizontal_flip: bool = False,
+                 pcd_vertical_flip: bool = False) -> None:
+        self.transforms = Compose(transforms)
+        self.img_scale = img_scale if isinstance(img_scale,
+                                                 list) else [img_scale]
+        self.pts_scale_ratio = pts_scale_ratio \
+            if isinstance(pts_scale_ratio, list) else [float(pts_scale_ratio)]
+
+        assert mmengine.is_list_of(self.img_scale, tuple)
+        assert mmengine.is_list_of(self.pts_scale_ratio, float)
+
+        self.flip = flip
+        self.pcd_horizontal_flip = pcd_horizontal_flip
+        self.pcd_vertical_flip = pcd_vertical_flip
+
+        self.flip_direction = flip_direction if isinstance(
+            flip_direction, list) else [flip_direction]
+        assert mmengine.is_list_of(self.flip_direction, str)
+        if not self.flip and self.flip_direction != ['horizontal']:
+            warnings.warn(
+                'flip_direction has no effect when flip is set to False')
+        if (self.flip and not any([(t['type'] == 'RandomFlip3D'
+                                    or t['type'] == 'RandomFlip')
+                                   for t in transforms])):
+            warnings.warn(
+                'flip has no effect when RandomFlip is not in transforms')
+
+    def transform(self, results: Dict) -> List[Dict]:
+        """Call function to augment common fields in results.
+
+        Args:
+            results (dict): Result dict contains the data to augment.
+
+        Returns:
+            List[dict]: The list contains the data that is augmented with
+            different scales and flips.
+        """
+        aug_data_list = []
+
+        # modified from `flip_aug = [False, True] if self.flip else [False]`
+        # to reduce unnecessary scenes when using double flip augmentation
+        # during test time
+        flip_aug = [True] if self.flip else [False]
+        pcd_horizontal_flip_aug = [False, True] \
+            if self.flip and self.pcd_horizontal_flip else [False]
+        pcd_vertical_flip_aug = [False, True] \
+            if self.flip and self.pcd_vertical_flip else [False]
+        for scale in self.img_scale:
+            # TODO refactor according to augtest docs
+            self.transforms.transforms[0].scale = scale
+            for pts_scale_ratio in self.pts_scale_ratio:
+                for flip in flip_aug:
+                    for pcd_horizontal_flip in pcd_horizontal_flip_aug:
+                        for pcd_vertical_flip in pcd_vertical_flip_aug:
+                            for direction in self.flip_direction:
+                                # results.copy will cause bug
+                                # since it is shallow copy
+                                _results = deepcopy(results)
+                                _results['scale'] = scale
+                                _results['flip'] = flip
+                                _results['pcd_scale_factor'] = \
+                                    pts_scale_ratio
+                                _results['flip_direction'] = direction
+                                _results['pcd_horizontal_flip'] = \
+                                    pcd_horizontal_flip
+                                _results['pcd_vertical_flip'] = \
+                                    pcd_vertical_flip
+                                data = self.transforms(_results)
+                                aug_data_list.append(data)
+
+        return aug_data_list
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(transforms={self.transforms}, '
+        repr_str += f'img_scale={self.img_scale}, flip={self.flip}, '
+        repr_str += f'pts_scale_ratio={self.pts_scale_ratio}, '
+        repr_str += f'flip_direction={self.flip_direction})'
+        return repr_str
diff --git a/mmde/mmdet3d/datasets/transforms/transforms_3d.py b/mmde/mmdet3d/datasets/transforms/transforms_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..94275d42deca28dd24b95aa9ffb22485be7e4006
--- /dev/null
+++ b/mmde/mmdet3d/datasets/transforms/transforms_3d.py
@@ -0,0 +1,2685 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+import warnings
+from typing import List, Optional, Sequence, Tuple, Union
+
+import cv2
+import mmcv
+import numpy as np
+import torch
+from mmcv.transforms import BaseTransform, Compose, RandomResize, Resize
+from mmdet.datasets.transforms import (PhotoMetricDistortion, RandomCrop,
+                                       RandomFlip)
+from mmengine import is_list_of, is_tuple_of
+
+from mmdet3d.models.task_modules import VoxelGenerator
+from mmdet3d.registry import TRANSFORMS
+from mmdet3d.structures import (CameraInstance3DBoxes, DepthInstance3DBoxes,
+                                LiDARInstance3DBoxes)
+from mmdet3d.structures.ops import box_np_ops
+from mmdet3d.structures.points import BasePoints
+from .data_augment_utils import noise_per_object_v3_
+
+
+@TRANSFORMS.register_module()
+class RandomDropPointsColor(BaseTransform):
+    r"""Randomly set the color of points to all zeros.
+
+    Once this transform is executed, all the points' color will be dropped.
+    Refer to `PAConv <https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/
+    util/transform.py#L223>`_ for more details.
+
+    Args:
+        drop_ratio (float): The probability of dropping point colors.
+            Defaults to 0.2.
+    """
+
+    def __init__(self, drop_ratio: float = 0.2) -> None:
+        assert isinstance(drop_ratio, (int, float)) and 0 <= drop_ratio <= 1, \
+            f'invalid drop_ratio value {drop_ratio}'
+        self.drop_ratio = drop_ratio
+
+    def transform(self, input_dict: dict) -> dict:
+        """Call function to drop point colors.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after color dropping, 'points' key is updated
+            in the result dict.
+        """
+        points = input_dict['points']
+        assert points.attribute_dims is not None and \
+            'color' in points.attribute_dims, \
+            'Expect points have color attribute'
+
+        # this if-expression is a bit strange
+        # `RandomDropPointsColor` is used in training 3D segmentor PAConv
+        # we discovered in our experiments that, using
+        # `if np.random.rand() > 1.0 - self.drop_ratio` consistently leads to
+        # better results than using `if np.random.rand() < self.drop_ratio`
+        # so we keep this hack in our codebase
+        if np.random.rand() > 1.0 - self.drop_ratio:
+            points.color = points.color * 0.0
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(drop_ratio={self.drop_ratio})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomFlip3D(RandomFlip):
+    """Flip the points & bbox.
+
+    If the input dict contains the key "flip", then the flag will be used,
+    otherwise it will be randomly decided by a ratio specified in the init
+    method.
+
+    Required Keys:
+
+    - points (np.float32)
+    - gt_bboxes_3d (np.float32)
+
+    Modified Keys:
+
+    - points (np.float32)
+    - gt_bboxes_3d (np.float32)
+
+    Added Keys:
+
+    - points (np.float32)
+    - pcd_trans (np.float32)
+    - pcd_rotation (np.float32)
+    - pcd_rotation_angle (np.float32)
+    - pcd_scale_factor (np.float32)
+
+    Args:
+        sync_2d (bool): Whether to apply flip according to the 2D
+            images. If True, it will apply the same flip as that to 2D images.
+            If False, it will decide whether to flip randomly and independently
+            to that of 2D images. Defaults to True.
+        flip_ratio_bev_horizontal (float): The flipping probability
+            in horizontal direction. Defaults to 0.0.
+        flip_ratio_bev_vertical (float): The flipping probability
+            in vertical direction. Defaults to 0.0.
+        flip_box3d (bool): Whether to flip bounding box. In most of the case,
+            the box should be fliped. In cam-based bev detection, this is set
+            to False, since the flip of 2D images does not influence the 3D
+            box. Defaults to True.
+    """
+
+    def __init__(self,
+                 sync_2d: bool = True,
+                 flip_ratio_bev_horizontal: float = 0.0,
+                 flip_ratio_bev_vertical: float = 0.0,
+                 flip_box3d: bool = True,
+                 **kwargs) -> None:
+        # `flip_ratio_bev_horizontal` is equal to
+        # for flip prob of 2d image when
+        # `sync_2d` is True
+        super(RandomFlip3D, self).__init__(
+            prob=flip_ratio_bev_horizontal, direction='horizontal', **kwargs)
+        self.sync_2d = sync_2d
+        self.flip_ratio_bev_horizontal = flip_ratio_bev_horizontal
+        self.flip_ratio_bev_vertical = flip_ratio_bev_vertical
+        self.flip_box3d = flip_box3d
+        if flip_ratio_bev_horizontal is not None:
+            assert isinstance(
+                flip_ratio_bev_horizontal,
+                (int, float)) and 0 <= flip_ratio_bev_horizontal <= 1
+        if flip_ratio_bev_vertical is not None:
+            assert isinstance(
+                flip_ratio_bev_vertical,
+                (int, float)) and 0 <= flip_ratio_bev_vertical <= 1
+
+    def random_flip_data_3d(self,
+                            input_dict: dict,
+                            direction: str = 'horizontal') -> None:
+        """Flip 3D data randomly.
+
+        `random_flip_data_3d` should take these situations into consideration:
+
+        - 1. LIDAR-based 3d detection
+        - 2. LIDAR-based 3d segmentation
+        - 3. vision-only detection
+        - 4. multi-modality 3d detection.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+            direction (str): Flip direction. Defaults to 'horizontal'.
+
+        Returns:
+            dict: Flipped results, 'points', 'bbox3d_fields' keys are
+            updated in the result dict.
+        """
+        assert direction in ['horizontal', 'vertical']
+        if self.flip_box3d:
+            if 'gt_bboxes_3d' in input_dict:
+                if 'points' in input_dict:
+                    input_dict['points'] = input_dict['gt_bboxes_3d'].flip(
+                        direction, points=input_dict['points'])
+                else:
+                    # vision-only detection
+                    input_dict['gt_bboxes_3d'].flip(direction)
+            else:
+                input_dict['points'].flip(direction)
+
+        if 'centers_2d' in input_dict:
+            assert self.sync_2d is True and direction == 'horizontal', \
+                'Only support sync_2d=True and horizontal flip with images'
+            w = input_dict['img_shape'][1]
+            input_dict['centers_2d'][..., 0] = \
+                w - input_dict['centers_2d'][..., 0]
+            # need to modify the horizontal position of camera center
+            # along u-axis in the image (flip like centers2d)
+            # ['cam2img'][0][2] = c_u
+            # see more details and examples at
+            # https://github.com/open-mmlab/mmdetection3d/pull/744
+            input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2]
+
+    def _flip_on_direction(self, results: dict) -> None:
+        """Function to flip images, bounding boxes, semantic segmentation map
+        and keypoints.
+
+        Add the override feature that if 'flip' is already in results, use it
+        to do the augmentation.
+        """
+        if 'flip' not in results:
+            cur_dir = self._choose_direction()
+        else:
+            # `flip_direction` works only when `flip` is True.
+            # For example, in `MultiScaleFlipAug3D`, `flip_direction` is
+            # 'horizontal' but `flip` is False.
+            if results['flip']:
+                assert 'flip_direction' in results, 'flip and flip_direction '
+                'must exist simultaneously'
+                cur_dir = results['flip_direction']
+            else:
+                cur_dir = None
+        if cur_dir is None:
+            results['flip'] = False
+            results['flip_direction'] = None
+        else:
+            results['flip'] = True
+            results['flip_direction'] = cur_dir
+            self._flip(results)
+
+    def transform(self, input_dict: dict) -> dict:
+        """Call function to flip points, values in the ``bbox3d_fields`` and
+        also flip 2D image and its annotations.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Flipped results, 'flip', 'flip_direction',
+            'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added
+            into result dict.
+        """
+        # flip 2D image and its annotations
+        if 'img' in input_dict:
+            super(RandomFlip3D, self).transform(input_dict)
+
+        if self.sync_2d and 'img' in input_dict:
+            input_dict['pcd_horizontal_flip'] = input_dict['flip']
+            input_dict['pcd_vertical_flip'] = False
+        else:
+            if 'pcd_horizontal_flip' not in input_dict:
+                flip_horizontal = True if np.random.rand(
+                ) < self.flip_ratio_bev_horizontal else False
+                input_dict['pcd_horizontal_flip'] = flip_horizontal
+            if 'pcd_vertical_flip' not in input_dict:
+                flip_vertical = True if np.random.rand(
+                ) < self.flip_ratio_bev_vertical else False
+                input_dict['pcd_vertical_flip'] = flip_vertical
+
+        if 'transformation_3d_flow' not in input_dict:
+            input_dict['transformation_3d_flow'] = []
+
+        if input_dict['pcd_horizontal_flip']:
+            self.random_flip_data_3d(input_dict, 'horizontal')
+            input_dict['transformation_3d_flow'].extend(['HF'])
+        if input_dict['pcd_vertical_flip']:
+            self.random_flip_data_3d(input_dict, 'vertical')
+            input_dict['transformation_3d_flow'].extend(['VF'])
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(sync_2d={self.sync_2d},'
+        repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomJitterPoints(BaseTransform):
+    """Randomly jitter point coordinates.
+
+    Different from the global translation in ``GlobalRotScaleTrans``, here we
+    apply different noises to each point in a scene.
+
+    Args:
+        jitter_std (list[float]): The standard deviation of jittering noise.
+            This applies random noise to all points in a 3D scene, which is
+            sampled from a gaussian distribution whose standard deviation is
+            set by ``jitter_std``. Defaults to [0.01, 0.01, 0.01]
+        clip_range (list[float]): Clip the randomly generated jitter
+            noise into this range. If None is given, don't perform clipping.
+            Defaults to [-0.05, 0.05]
+
+    Note:
+        This transform should only be used in point cloud segmentation tasks
+        because we don't transform ground-truth bboxes accordingly.
+        For similar transform in detection task, please refer to `ObjectNoise`.
+    """
+
+    def __init__(self,
+                 jitter_std: List[float] = [0.01, 0.01, 0.01],
+                 clip_range: List[float] = [-0.05, 0.05]) -> None:
+        seq_types = (list, tuple, np.ndarray)
+        if not isinstance(jitter_std, seq_types):
+            assert isinstance(jitter_std, (int, float)), \
+                f'unsupported jitter_std type {type(jitter_std)}'
+            jitter_std = [jitter_std, jitter_std, jitter_std]
+        self.jitter_std = jitter_std
+
+        if clip_range is not None:
+            if not isinstance(clip_range, seq_types):
+                assert isinstance(clip_range, (int, float)), \
+                    f'unsupported clip_range type {type(clip_range)}'
+                clip_range = [-clip_range, clip_range]
+        self.clip_range = clip_range
+
+    def transform(self, input_dict: dict) -> dict:
+        """Call function to jitter all the points in the scene.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after adding noise to each point,
+            'points' key is updated in the result dict.
+        """
+        points = input_dict['points']
+        jitter_std = np.array(self.jitter_std, dtype=np.float32)
+        jitter_noise = \
+            np.random.randn(points.shape[0], 3) * jitter_std[None, :]
+        if self.clip_range is not None:
+            jitter_noise = np.clip(jitter_noise, self.clip_range[0],
+                                   self.clip_range[1])
+
+        points.translate(jitter_noise)
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(jitter_std={self.jitter_std},'
+        repr_str += f' clip_range={self.clip_range})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ObjectSample(BaseTransform):
+    """Sample GT objects to the data.
+
+    Required Keys:
+
+    - points
+    - ann_info
+    - gt_bboxes_3d
+    - gt_labels_3d
+    - img (optional)
+    - gt_bboxes (optional)
+
+    Modified Keys:
+
+    - points
+    - gt_bboxes_3d
+    - gt_labels_3d
+    - img (optional)
+    - gt_bboxes (optional)
+
+    Added Keys:
+
+    - plane (optional)
+
+    Args:
+        db_sampler (dict): Config dict of the database sampler.
+        sample_2d (bool): Whether to also paste 2D image patch to the images.
+            This should be true when applying multi-modality cut-and-paste.
+            Defaults to False.
+        use_ground_plane (bool): Whether to use ground plane to adjust the
+            3D labels. Defaults to False.
+    """
+
+    def __init__(self,
+                 db_sampler: dict,
+                 sample_2d: bool = False,
+                 use_ground_plane: bool = False) -> None:
+        self.sampler_cfg = db_sampler
+        self.sample_2d = sample_2d
+        if 'type' not in db_sampler.keys():
+            db_sampler['type'] = 'DataBaseSampler'
+        self.db_sampler = TRANSFORMS.build(db_sampler)
+        self.use_ground_plane = use_ground_plane
+        self.disabled = False
+
+    @staticmethod
+    def remove_points_in_boxes(points: BasePoints,
+                               boxes: np.ndarray) -> np.ndarray:
+        """Remove the points in the sampled bounding boxes.
+
+        Args:
+            points (:obj:`BasePoints`): Input point cloud array.
+            boxes (np.ndarray): Sampled ground truth boxes.
+
+        Returns:
+            np.ndarray: Points with those in the boxes removed.
+        """
+        masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes)
+        points = points[np.logical_not(masks.any(-1))]
+        return points
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to sample ground truth objects to the data.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after object sampling augmentation,
+            'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated
+            in the result dict.
+        """
+        if self.disabled:
+            return input_dict
+
+        gt_bboxes_3d = input_dict['gt_bboxes_3d']
+        gt_labels_3d = input_dict['gt_labels_3d']
+
+        if self.use_ground_plane:
+            ground_plane = input_dict.get('plane', None)
+            assert ground_plane is not None, '`use_ground_plane` is True ' \
+                                             'but find plane is None'
+        else:
+            ground_plane = None
+        # change to float for blending operation
+        points = input_dict['points']
+        if self.sample_2d:
+            img = input_dict['img']
+            gt_bboxes_2d = input_dict['gt_bboxes']
+            # Assume for now 3D & 2D bboxes are the same
+            sampled_dict = self.db_sampler.sample_all(
+                gt_bboxes_3d.numpy(),
+                gt_labels_3d,
+                gt_bboxes_2d=gt_bboxes_2d,
+                img=img)
+        else:
+            sampled_dict = self.db_sampler.sample_all(
+                gt_bboxes_3d.numpy(),
+                gt_labels_3d,
+                img=None,
+                ground_plane=ground_plane)
+
+        if sampled_dict is not None:
+            sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d']
+            sampled_points = sampled_dict['points']
+            sampled_gt_labels = sampled_dict['gt_labels_3d']
+
+            gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels],
+                                          axis=0)
+            gt_bboxes_3d = gt_bboxes_3d.new_box(
+                np.concatenate([gt_bboxes_3d.numpy(), sampled_gt_bboxes_3d]))
+
+            points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d)
+            # check the points dimension
+            points = points.cat([sampled_points, points])
+
+            if self.sample_2d:
+                sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d']
+                gt_bboxes_2d = np.concatenate(
+                    [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32)
+
+                input_dict['gt_bboxes'] = gt_bboxes_2d
+                input_dict['img'] = sampled_dict['img']
+
+        input_dict['gt_bboxes_3d'] = gt_bboxes_3d
+        input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.int64)
+        input_dict['points'] = points
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(db_sampler={self.db_sampler},'
+        repr_str += f' sample_2d={self.sample_2d},'
+        repr_str += f' use_ground_plane={self.use_ground_plane})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ObjectNoise(BaseTransform):
+    """Apply noise to each GT objects in the scene.
+
+    Required Keys:
+
+    - points
+    - gt_bboxes_3d
+
+    Modified Keys:
+
+    - points
+    - gt_bboxes_3d
+
+    Args:
+        translation_std (list[float]): Standard deviation of the
+            distribution where translation noise are sampled from.
+            Defaults to [0.25, 0.25, 0.25].
+        global_rot_range (list[float]): Global rotation to the scene.
+            Defaults to [0.0, 0.0].
+        rot_range (list[float]): Object rotation range.
+            Defaults to [-0.15707963267, 0.15707963267].
+        num_try (int): Number of times to try if the noise applied is invalid.
+            Defaults to 100.
+    """
+
+    def __init__(self,
+                 translation_std: List[float] = [0.25, 0.25, 0.25],
+                 global_rot_range: List[float] = [0.0, 0.0],
+                 rot_range: List[float] = [-0.15707963267, 0.15707963267],
+                 num_try: int = 100) -> None:
+        self.translation_std = translation_std
+        self.global_rot_range = global_rot_range
+        self.rot_range = rot_range
+        self.num_try = num_try
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to apply noise to each ground truth in the scene.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after adding noise to each object,
+            'points', 'gt_bboxes_3d' keys are updated in the result dict.
+        """
+        gt_bboxes_3d = input_dict['gt_bboxes_3d']
+        points = input_dict['points']
+
+        # TODO: this is inplace operation
+        numpy_box = gt_bboxes_3d.numpy()
+        numpy_points = points.numpy()
+
+        noise_per_object_v3_(
+            numpy_box,
+            numpy_points,
+            rotation_perturb=self.rot_range,
+            center_noise_std=self.translation_std,
+            global_random_rot_range=self.global_rot_range,
+            num_try=self.num_try)
+
+        input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box)
+        input_dict['points'] = points.new_point(numpy_points)
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_try={self.num_try},'
+        repr_str += f' translation_std={self.translation_std},'
+        repr_str += f' global_rot_range={self.global_rot_range},'
+        repr_str += f' rot_range={self.rot_range})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class GlobalAlignment(BaseTransform):
+    """Apply global alignment to 3D scene points by rotation and translation.
+
+    Args:
+        rotation_axis (int): Rotation axis for points and bboxes rotation.
+
+    Note:
+        We do not record the applied rotation and translation as in
+        GlobalRotScaleTrans. Because usually, we do not need to reverse
+        the alignment step.
+        For example, ScanNet 3D detection task uses aligned ground-truth
+        bounding boxes for evaluation.
+    """
+
+    def __init__(self, rotation_axis: int) -> None:
+        self.rotation_axis = rotation_axis
+
+    def _trans_points(self, results: dict, trans_factor: np.ndarray) -> None:
+        """Private function to translate points.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+            trans_factor (np.ndarray): Translation vector to be applied.
+
+        Returns:
+            dict: Results after translation, 'points' is updated in the dict.
+        """
+        results['points'].translate(trans_factor)
+
+    def _rot_points(self, results: dict, rot_mat: np.ndarray) -> None:
+        """Private function to rotate bounding boxes and points.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+            rot_mat (np.ndarray): Rotation matrix to be applied.
+
+        Returns:
+            dict: Results after rotation, 'points' is updated in the dict.
+        """
+        # input should be rot_mat_T so I transpose it here
+        results['points'].rotate(rot_mat.T)
+
+    def _check_rot_mat(self, rot_mat: np.ndarray) -> None:
+        """Check if rotation matrix is valid for self.rotation_axis.
+
+        Args:
+            rot_mat (np.ndarray): Rotation matrix to be applied.
+        """
+        is_valid = np.allclose(np.linalg.det(rot_mat), 1.0)
+        valid_array = np.zeros(3)
+        valid_array[self.rotation_axis] = 1.0
+        is_valid &= (rot_mat[self.rotation_axis, :] == valid_array).all()
+        is_valid &= (rot_mat[:, self.rotation_axis] == valid_array).all()
+        assert is_valid, f'invalid rotation matrix {rot_mat}'
+
+    def transform(self, results: dict) -> dict:
+        """Call function to shuffle points.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after global alignment, 'points' and keys in
+            input_dict['bbox3d_fields'] are updated in the result dict.
+        """
+        assert 'axis_align_matrix' in results, \
+            'axis_align_matrix is not provided in GlobalAlignment'
+
+        axis_align_matrix = results['axis_align_matrix']
+        assert axis_align_matrix.shape == (4, 4), \
+            f'invalid shape {axis_align_matrix.shape} for axis_align_matrix'
+        rot_mat = axis_align_matrix[:3, :3]
+        trans_vec = axis_align_matrix[:3, -1]
+
+        self._check_rot_mat(rot_mat)
+        self._rot_points(results, rot_mat)
+        self._trans_points(results, trans_vec)
+
+        return results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(rotation_axis={self.rotation_axis})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class GlobalRotScaleTrans(BaseTransform):
+    """Apply global rotation, scaling and translation to a 3D scene.
+
+    Required Keys:
+
+    - points (np.float32)
+    - gt_bboxes_3d (np.float32)
+
+    Modified Keys:
+
+    - points (np.float32)
+    - gt_bboxes_3d (np.float32)
+
+    Added Keys:
+
+    - points (np.float32)
+    - pcd_trans (np.float32)
+    - pcd_rotation (np.float32)
+    - pcd_rotation_angle (np.float32)
+    - pcd_scale_factor (np.float32)
+
+    Args:
+        rot_range (list[float]): Range of rotation angle.
+            Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
+        scale_ratio_range (list[float]): Range of scale ratio.
+            Defaults to [0.95, 1.05].
+        translation_std (list[float]): The standard deviation of
+            translation noise applied to a scene, which
+            is sampled from a gaussian distribution whose standard deviation
+            is set by ``translation_std``. Defaults to [0, 0, 0].
+        shift_height (bool): Whether to shift height.
+            (the fourth dimension of indoor points) when scaling.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 rot_range: List[float] = [-0.78539816, 0.78539816],
+                 scale_ratio_range: List[float] = [0.95, 1.05],
+                 translation_std: List[int] = [0, 0, 0],
+                 shift_height: bool = False) -> None:
+        seq_types = (list, tuple, np.ndarray)
+        if not isinstance(rot_range, seq_types):
+            assert isinstance(rot_range, (int, float)), \
+                f'unsupported rot_range type {type(rot_range)}'
+            rot_range = [-rot_range, rot_range]
+        self.rot_range = rot_range
+
+        assert isinstance(scale_ratio_range, seq_types), \
+            f'unsupported scale_ratio_range type {type(scale_ratio_range)}'
+
+        self.scale_ratio_range = scale_ratio_range
+
+        if not isinstance(translation_std, seq_types):
+            assert isinstance(translation_std, (int, float)), \
+                f'unsupported translation_std type {type(translation_std)}'
+            translation_std = [
+                translation_std, translation_std, translation_std
+            ]
+        assert all([std >= 0 for std in translation_std]), \
+            'translation_std should be positive'
+        self.translation_std = translation_std
+        self.shift_height = shift_height
+
+    def _trans_bbox_points(self, input_dict: dict) -> None:
+        """Private function to translate bounding boxes and points.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after translation, 'points', 'pcd_trans'
+            and `gt_bboxes_3d` is updated in the result dict.
+        """
+        translation_std = np.array(self.translation_std, dtype=np.float32)
+        trans_factor = np.random.normal(scale=translation_std, size=3).T
+
+        input_dict['points'].translate(trans_factor)
+        input_dict['pcd_trans'] = trans_factor
+        if 'gt_bboxes_3d' in input_dict:
+            input_dict['gt_bboxes_3d'].translate(trans_factor)
+
+    def _rot_bbox_points(self, input_dict: dict) -> None:
+        """Private function to rotate bounding boxes and points.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after rotation, 'points', 'pcd_rotation'
+            and `gt_bboxes_3d` is updated in the result dict.
+        """
+        rotation = self.rot_range
+        noise_rotation = np.random.uniform(rotation[0], rotation[1])
+
+        if 'gt_bboxes_3d' in input_dict and \
+                len(input_dict['gt_bboxes_3d'].tensor) != 0:
+            # rotate points with bboxes
+            points, rot_mat_T = input_dict['gt_bboxes_3d'].rotate(
+                noise_rotation, input_dict['points'])
+            input_dict['points'] = points
+        else:
+            # if no bbox in input_dict, only rotate points
+            rot_mat_T = input_dict['points'].rotate(noise_rotation)
+
+        input_dict['pcd_rotation'] = rot_mat_T
+        input_dict['pcd_rotation_angle'] = noise_rotation
+
+    def _scale_bbox_points(self, input_dict: dict) -> None:
+        """Private function to scale bounding boxes and points.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after scaling, 'points' and
+            `gt_bboxes_3d` is updated in the result dict.
+        """
+        scale = input_dict['pcd_scale_factor']
+        points = input_dict['points']
+        points.scale(scale)
+        if self.shift_height:
+            assert 'height' in points.attribute_dims.keys(), \
+                'setting shift_height=True but points have no height attribute'
+            points.tensor[:, points.attribute_dims['height']] *= scale
+        input_dict['points'] = points
+
+        if 'gt_bboxes_3d' in input_dict and \
+                len(input_dict['gt_bboxes_3d'].tensor) != 0:
+            input_dict['gt_bboxes_3d'].scale(scale)
+
+    def _random_scale(self, input_dict: dict) -> None:
+        """Private function to randomly set the scale factor.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after scaling, 'pcd_scale_factor'
+            are updated in the result dict.
+        """
+        scale_factor = np.random.uniform(self.scale_ratio_range[0],
+                                         self.scale_ratio_range[1])
+        input_dict['pcd_scale_factor'] = scale_factor
+
+    def transform(self, input_dict: dict) -> dict:
+        """Private function to rotate, scale and translate bounding boxes and
+        points.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after scaling, 'points', 'pcd_rotation',
+            'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` are updated
+            in the result dict.
+        """
+        if 'transformation_3d_flow' not in input_dict:
+            input_dict['transformation_3d_flow'] = []
+
+        self._rot_bbox_points(input_dict)
+
+        if 'pcd_scale_factor' not in input_dict:
+            self._random_scale(input_dict)
+        self._scale_bbox_points(input_dict)
+
+        self._trans_bbox_points(input_dict)
+
+        input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(rot_range={self.rot_range},'
+        repr_str += f' scale_ratio_range={self.scale_ratio_range},'
+        repr_str += f' translation_std={self.translation_std},'
+        repr_str += f' shift_height={self.shift_height})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class PointShuffle(BaseTransform):
+    """Shuffle input points."""
+
+    def transform(self, input_dict: dict) -> dict:
+        """Call function to shuffle points.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after filtering, 'points', 'pts_instance_mask'
+            and 'pts_semantic_mask' keys are updated in the result dict.
+        """
+        idx = input_dict['points'].shuffle()
+        idx = idx.numpy()
+
+        pts_instance_mask = input_dict.get('pts_instance_mask', None)
+        pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
+
+        if pts_instance_mask is not None:
+            input_dict['pts_instance_mask'] = pts_instance_mask[idx]
+
+        if pts_semantic_mask is not None:
+            input_dict['pts_semantic_mask'] = pts_semantic_mask[idx]
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        return self.__class__.__name__
+
+
+@TRANSFORMS.register_module()
+class ObjectRangeFilter(BaseTransform):
+    """Filter objects by the range.
+
+    Required Keys:
+
+    - gt_bboxes_3d
+
+    Modified Keys:
+
+    - gt_bboxes_3d
+
+    Args:
+        point_cloud_range (list[float]): Point cloud range.
+    """
+
+    def __init__(self, point_cloud_range: List[float]) -> None:
+        self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to filter objects by the range.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d'
+            keys are updated in the result dict.
+        """
+        # Check points instance type and initialise bev_range
+        if isinstance(input_dict['gt_bboxes_3d'],
+                      (LiDARInstance3DBoxes, DepthInstance3DBoxes)):
+            bev_range = self.pcd_range[[0, 1, 3, 4]]
+        elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes):
+            bev_range = self.pcd_range[[0, 2, 3, 5]]
+
+        gt_bboxes_3d = input_dict['gt_bboxes_3d']
+        gt_labels_3d = input_dict['gt_labels_3d']
+        mask = gt_bboxes_3d.in_range_bev(bev_range)
+        gt_bboxes_3d = gt_bboxes_3d[mask]
+        # mask is a torch tensor but gt_labels_3d is still numpy array
+        # using mask to index gt_labels_3d will cause bug when
+        # len(gt_labels_3d) == 1, where mask=1 will be interpreted
+        # as gt_labels_3d[1] and cause out of index error
+        gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)]
+
+        # limit rad to [-pi, pi]
+        gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
+        input_dict['gt_bboxes_3d'] = gt_bboxes_3d
+        input_dict['gt_labels_3d'] = gt_labels_3d
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class PointsRangeFilter(BaseTransform):
+    """Filter points by the range.
+
+    Required Keys:
+
+    - points
+    - pts_instance_mask (optional)
+
+    Modified Keys:
+
+    - points
+    - pts_instance_mask (optional)
+
+    Args:
+        point_cloud_range (list[float]): Point cloud range.
+    """
+
+    def __init__(self, point_cloud_range: List[float]) -> None:
+        self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to filter points by the range.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after filtering, 'points', 'pts_instance_mask'
+            and 'pts_semantic_mask' keys are updated in the result dict.
+        """
+        points = input_dict['points']
+        points_mask = points.in_range_3d(self.pcd_range)
+        clean_points = points[points_mask]
+        input_dict['points'] = clean_points
+        points_mask = points_mask.numpy()
+
+        pts_instance_mask = input_dict.get('pts_instance_mask', None)
+        pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
+
+        if pts_instance_mask is not None:
+            input_dict['pts_instance_mask'] = pts_instance_mask[points_mask]
+
+        if pts_semantic_mask is not None:
+            input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask]
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ObjectNameFilter(BaseTransform):
+    """Filter GT objects by their names.
+
+    Required Keys:
+
+    - gt_labels_3d
+
+    Modified Keys:
+
+    - gt_labels_3d
+
+    Args:
+        classes (list[str]): List of class names to be kept for training.
+    """
+
+    def __init__(self, classes: List[str]) -> None:
+        self.classes = classes
+        self.labels = list(range(len(self.classes)))
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to filter objects by their names.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d'
+            keys are updated in the result dict.
+        """
+        gt_labels_3d = input_dict['gt_labels_3d']
+        gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d],
+                                  dtype=bool)
+        input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask]
+        input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask]
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(classes={self.classes})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class PointSample(BaseTransform):
+    """Point sample.
+
+    Sampling data to a certain number.
+
+    Required Keys:
+
+    - points
+    - pts_instance_mask (optional)
+    - pts_semantic_mask (optional)
+
+    Modified Keys:
+
+    - points
+    - pts_instance_mask (optional)
+    - pts_semantic_mask (optional)
+
+    Args:
+        num_points (int): Number of points to be sampled.
+        sample_range (float, optional): The range where to sample points.
+            If not None, the points with depth larger than `sample_range` are
+            prior to be sampled. Defaults to None.
+        replace (bool): Whether the sampling is with or without replacement.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 num_points: int,
+                 sample_range: Optional[float] = None,
+                 replace: bool = False) -> None:
+        self.num_points = num_points
+        self.sample_range = sample_range
+        self.replace = replace
+
+    def _points_random_sampling(
+        self,
+        points: BasePoints,
+        num_samples: Union[int, float],
+        sample_range: Optional[float] = None,
+        replace: bool = False,
+        return_choices: bool = False
+    ) -> Union[Tuple[BasePoints, np.ndarray], BasePoints]:
+        """Points random sampling.
+
+        Sample points to a certain number.
+
+        Args:
+            points (:obj:`BasePoints`): 3D Points.
+            num_samples (int, float): Number of samples to be sampled. If
+                float, we sample random fraction of points from num_points
+                to 100%.
+            sample_range (float, optional): Indicating the range where the
+                points will be sampled. Defaults to None.
+            replace (bool): Sampling with or without replacement.
+                Defaults to False.
+            return_choices (bool): Whether return choice. Defaults to False.
+
+        Returns:
+            tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`:
+
+                - points (:obj:`BasePoints`): 3D Points.
+                - choices (np.ndarray, optional): The generated random samples.
+        """
+        if isinstance(num_samples, float):
+            assert num_samples < 1
+            num_samples = int(
+                np.random.uniform(self.num_points, 1.) * points.shape[0])
+
+        if not replace:
+            replace = (points.shape[0] < num_samples)
+        point_range = range(len(points))
+        if sample_range is not None and not replace:
+            # Only sampling the near points when len(points) >= num_samples
+            dist = np.linalg.norm(points.coord.numpy(), axis=1)
+            far_inds = np.where(dist >= sample_range)[0]
+            near_inds = np.where(dist < sample_range)[0]
+            # in case there are too many far points
+            if len(far_inds) > num_samples:
+                far_inds = np.random.choice(
+                    far_inds, num_samples, replace=False)
+            point_range = near_inds
+            num_samples -= len(far_inds)
+        choices = np.random.choice(point_range, num_samples, replace=replace)
+        if sample_range is not None and not replace:
+            choices = np.concatenate((far_inds, choices))
+            # Shuffle points after sampling
+            np.random.shuffle(choices)
+        if return_choices:
+            return points[choices], choices
+        else:
+            return points[choices]
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to sample points to in indoor scenes.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after sampling, 'points', 'pts_instance_mask'
+            and 'pts_semantic_mask' keys are updated in the result dict.
+        """
+        points = input_dict['points']
+        points, choices = self._points_random_sampling(
+            points,
+            self.num_points,
+            self.sample_range,
+            self.replace,
+            return_choices=True)
+        input_dict['points'] = points
+
+        pts_instance_mask = input_dict.get('pts_instance_mask', None)
+        pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
+
+        if pts_instance_mask is not None:
+            pts_instance_mask = pts_instance_mask[choices]
+            input_dict['pts_instance_mask'] = pts_instance_mask
+
+        if pts_semantic_mask is not None:
+            pts_semantic_mask = pts_semantic_mask[choices]
+            input_dict['pts_semantic_mask'] = pts_semantic_mask
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_points={self.num_points},'
+        repr_str += f' sample_range={self.sample_range},'
+        repr_str += f' replace={self.replace})'
+
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class IndoorPointSample(PointSample):
+    """Indoor point sample.
+
+    Sampling data to a certain number.
+    NOTE: IndoorPointSample is deprecated in favor of PointSample
+
+    Args:
+        num_points (int): Number of points to be sampled.
+    """
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            'IndoorPointSample is deprecated in favor of PointSample')
+        super(IndoorPointSample, self).__init__(*args, **kwargs)
+
+
+@TRANSFORMS.register_module()
+class IndoorPatchPointSample(BaseTransform):
+    r"""Indoor point sample within a patch. Modified from `PointNet++ <https://
+    github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py>`_.
+
+    Sampling data to a certain number for semantic segmentation.
+
+    Args:
+        num_points (int): Number of points to be sampled.
+        block_size (float): Size of a block to sample points from.
+            Defaults to 1.5.
+        sample_rate (float, optional): Stride used in sliding patch generation.
+            This parameter is unused in `IndoorPatchPointSample` and thus has
+            been deprecated. We plan to remove it in the future.
+            Defaults to None.
+        ignore_index (int, optional): Label index that won't be used for the
+            segmentation task. This is set in PointSegClassMapping as neg_cls.
+            If not None, will be used as a patch selection criterion.
+            Defaults to None.
+        use_normalized_coord (bool): Whether to use normalized xyz as
+            additional features. Defaults to False.
+        num_try (int): Number of times to try if the patch selected is invalid.
+            Defaults to 10.
+        enlarge_size (float): Enlarge the sampled patch to
+            [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as
+            an augmentation. If None, set it as 0. Defaults to 0.2.
+        min_unique_num (int, optional): Minimum number of unique points
+            the sampled patch should contain. If None, use PointNet++'s method
+            to judge uniqueness. Defaults to None.
+        eps (float): A value added to patch boundary to guarantee
+            points coverage. Defaults to 1e-2.
+
+    Note:
+        This transform should only be used in the training process of point
+        cloud segmentation tasks. For the sliding patch generation and
+        inference process in testing, please refer to the `slide_inference`
+        function of `EncoderDecoder3D` class.
+    """
+
+    def __init__(self,
+                 num_points: int,
+                 block_size: float = 1.5,
+                 sample_rate: Optional[float] = None,
+                 ignore_index: Optional[int] = None,
+                 use_normalized_coord: bool = False,
+                 num_try: int = 10,
+                 enlarge_size: float = 0.2,
+                 min_unique_num: Optional[int] = None,
+                 eps: float = 1e-2) -> None:
+        self.num_points = num_points
+        self.block_size = block_size
+        self.ignore_index = ignore_index
+        self.use_normalized_coord = use_normalized_coord
+        self.num_try = num_try
+        self.enlarge_size = enlarge_size if enlarge_size is not None else 0.0
+        self.min_unique_num = min_unique_num
+        self.eps = eps
+
+        if sample_rate is not None:
+            warnings.warn(
+                "'sample_rate' has been deprecated and will be removed in "
+                'the future. Please remove them from your code.')
+
+    def _input_generation(self, coords: np.ndarray, patch_center: np.ndarray,
+                          coord_max: np.ndarray, attributes: np.ndarray,
+                          attribute_dims: dict,
+                          point_type: type) -> BasePoints:
+        """Generating model input.
+
+        Generate input by subtracting patch center and adding additional
+            features. Currently support colors and normalized xyz as features.
+
+        Args:
+            coords (np.ndarray): Sampled 3D Points.
+            patch_center (np.ndarray): Center coordinate of the selected patch.
+            coord_max (np.ndarray): Max coordinate of all 3D Points.
+            attributes (np.ndarray): features of input points.
+            attribute_dims (dict): Dictionary to indicate the meaning of extra
+                dimension.
+            point_type (type): class of input points inherited from BasePoints.
+
+        Returns:
+            :obj:`BasePoints`: The generated input data.
+        """
+        # subtract patch center, the z dimension is not centered
+        centered_coords = coords.copy()
+        centered_coords[:, 0] -= patch_center[0]
+        centered_coords[:, 1] -= patch_center[1]
+
+        if self.use_normalized_coord:
+            normalized_coord = coords / coord_max
+            attributes = np.concatenate([attributes, normalized_coord], axis=1)
+            if attribute_dims is None:
+                attribute_dims = dict()
+            attribute_dims.update(
+                dict(normalized_coord=[
+                    attributes.shape[1], attributes.shape[1] +
+                    1, attributes.shape[1] + 2
+                ]))
+
+        points = np.concatenate([centered_coords, attributes], axis=1)
+        points = point_type(
+            points, points_dim=points.shape[1], attribute_dims=attribute_dims)
+
+        return points
+
+    def _patch_points_sampling(
+            self, points: BasePoints,
+            sem_mask: np.ndarray) -> Tuple[BasePoints, np.ndarray]:
+        """Patch points sampling.
+
+        First sample a valid patch.
+        Then sample points within that patch to a certain number.
+
+        Args:
+            points (:obj:`BasePoints`): 3D Points.
+            sem_mask (np.ndarray): semantic segmentation mask for input points.
+
+        Returns:
+            tuple[:obj:`BasePoints`, np.ndarray]:
+
+                - points (:obj:`BasePoints`): 3D Points.
+                - choices (np.ndarray): The generated random samples.
+        """
+        coords = points.coord.numpy()
+        attributes = points.tensor[:, 3:].numpy()
+        attribute_dims = points.attribute_dims
+        point_type = type(points)
+
+        coord_max = np.amax(coords, axis=0)
+        coord_min = np.amin(coords, axis=0)
+
+        for _ in range(self.num_try):
+            # random sample a point as patch center
+            cur_center = coords[np.random.choice(coords.shape[0])]
+
+            # boundary of a patch, which would be enlarged by
+            # `self.enlarge_size` as an augmentation
+            cur_max = cur_center + np.array(
+                [self.block_size / 2.0, self.block_size / 2.0, 0.0])
+            cur_min = cur_center - np.array(
+                [self.block_size / 2.0, self.block_size / 2.0, 0.0])
+            cur_max[2] = coord_max[2]
+            cur_min[2] = coord_min[2]
+            cur_choice = np.sum(
+                (coords >= (cur_min - self.enlarge_size)) *
+                (coords <= (cur_max + self.enlarge_size)),
+                axis=1) == 3
+
+            if not cur_choice.any():  # no points in this patch
+                continue
+
+            cur_coords = coords[cur_choice, :]
+            cur_sem_mask = sem_mask[cur_choice]
+            point_idxs = np.where(cur_choice)[0]
+            mask = np.sum(
+                (cur_coords >= (cur_min - self.eps)) * (cur_coords <=
+                                                        (cur_max + self.eps)),
+                axis=1) == 3
+
+            # two criteria for patch sampling, adopted from PointNet++
+            # 1. selected patch should contain enough unique points
+            if self.min_unique_num is None:
+                # use PointNet++'s method as default
+                # [31, 31, 62] are just some big values used to transform
+                # coords from 3d array to 1d and then check their uniqueness
+                # this is used in all the ScanNet code following PointNet++
+                vidx = np.ceil(
+                    (cur_coords[mask, :] - cur_min) / (cur_max - cur_min) *
+                    np.array([31.0, 31.0, 62.0]))
+                vidx = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 +
+                                 vidx[:, 2])
+                flag1 = len(vidx) / 31.0 / 31.0 / 62.0 >= 0.02
+            else:
+                # if `min_unique_num` is provided, directly compare with it
+                flag1 = mask.sum() >= self.min_unique_num
+
+            # 2. selected patch should contain enough annotated points
+            if self.ignore_index is None:
+                flag2 = True
+            else:
+                flag2 = np.sum(cur_sem_mask != self.ignore_index) / \
+                               len(cur_sem_mask) >= 0.7
+
+            if flag1 and flag2:
+                break
+
+        # sample idx to `self.num_points`
+        if point_idxs.size >= self.num_points:
+            # no duplicate in sub-sampling
+            choices = np.random.choice(
+                point_idxs, self.num_points, replace=False)
+        else:
+            # do not use random choice here to avoid some points not counted
+            dup = np.random.choice(point_idxs.size,
+                                   self.num_points - point_idxs.size)
+            idx_dup = np.concatenate(
+                [np.arange(point_idxs.size),
+                 np.array(dup)], 0)
+            choices = point_idxs[idx_dup]
+
+        # construct model input
+        points = self._input_generation(coords[choices], cur_center, coord_max,
+                                        attributes[choices], attribute_dims,
+                                        point_type)
+
+        return points, choices
+
+    def transform(self, input_dict: dict) -> dict:
+        """Call function to sample points to in indoor scenes.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after sampling, 'points', 'pts_instance_mask'
+            and 'pts_semantic_mask' keys are updated in the result dict.
+        """
+        points = input_dict['points']
+
+        assert 'pts_semantic_mask' in input_dict.keys(), \
+            'semantic mask should be provided in training and evaluation'
+        pts_semantic_mask = input_dict['pts_semantic_mask']
+
+        points, choices = self._patch_points_sampling(points,
+                                                      pts_semantic_mask)
+
+        input_dict['points'] = points
+        input_dict['pts_semantic_mask'] = pts_semantic_mask[choices]
+
+        # 'eval_ann_info' will be passed to evaluator
+        if 'eval_ann_info' in input_dict:
+            input_dict['eval_ann_info']['pts_semantic_mask'] = \
+                pts_semantic_mask[choices]
+
+        pts_instance_mask = input_dict.get('pts_instance_mask', None)
+
+        if pts_instance_mask is not None:
+            input_dict['pts_instance_mask'] = pts_instance_mask[choices]
+            # 'eval_ann_info' will be passed to evaluator
+            if 'eval_ann_info' in input_dict:
+                input_dict['eval_ann_info']['pts_instance_mask'] = \
+                    pts_instance_mask[choices]
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_points={self.num_points},'
+        repr_str += f' block_size={self.block_size},'
+        repr_str += f' ignore_index={self.ignore_index},'
+        repr_str += f' use_normalized_coord={self.use_normalized_coord},'
+        repr_str += f' num_try={self.num_try},'
+        repr_str += f' enlarge_size={self.enlarge_size},'
+        repr_str += f' min_unique_num={self.min_unique_num},'
+        repr_str += f' eps={self.eps})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class BackgroundPointsFilter(BaseTransform):
+    """Filter background points near the bounding box.
+
+    Args:
+        bbox_enlarge_range (tuple[float] | float): Bbox enlarge range.
+    """
+
+    def __init__(self, bbox_enlarge_range: Union[Tuple[float], float]) -> None:
+        assert (is_tuple_of(bbox_enlarge_range, float)
+                and len(bbox_enlarge_range) == 3) \
+            or isinstance(bbox_enlarge_range, float), \
+            f'Invalid arguments bbox_enlarge_range {bbox_enlarge_range}'
+
+        if isinstance(bbox_enlarge_range, float):
+            bbox_enlarge_range = [bbox_enlarge_range] * 3
+        self.bbox_enlarge_range = np.array(
+            bbox_enlarge_range, dtype=np.float32)[np.newaxis, :]
+
+    def transform(self, input_dict: dict) -> dict:
+        """Call function to filter points by the range.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after filtering, 'points', 'pts_instance_mask'
+            and 'pts_semantic_mask' keys are updated in the result dict.
+        """
+        points = input_dict['points']
+        gt_bboxes_3d = input_dict['gt_bboxes_3d']
+
+        # avoid groundtruth being modified
+        gt_bboxes_3d_np = gt_bboxes_3d.tensor.clone().numpy()
+        gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.clone().numpy()
+
+        enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy()
+        enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range
+        points_numpy = points.tensor.clone().numpy()
+        foreground_masks = box_np_ops.points_in_rbbox(
+            points_numpy, gt_bboxes_3d_np, origin=(0.5, 0.5, 0.5))
+        enlarge_foreground_masks = box_np_ops.points_in_rbbox(
+            points_numpy, enlarged_gt_bboxes_3d, origin=(0.5, 0.5, 0.5))
+        foreground_masks = foreground_masks.max(1)
+        enlarge_foreground_masks = enlarge_foreground_masks.max(1)
+        valid_masks = ~np.logical_and(~foreground_masks,
+                                      enlarge_foreground_masks)
+
+        input_dict['points'] = points[valid_masks]
+        pts_instance_mask = input_dict.get('pts_instance_mask', None)
+        if pts_instance_mask is not None:
+            input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks]
+
+        pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
+        if pts_semantic_mask is not None:
+            input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks]
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(bbox_enlarge_range={self.bbox_enlarge_range.tolist()})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class VoxelBasedPointSampler(BaseTransform):
+    """Voxel based point sampler.
+
+    Apply voxel sampling to multiple sweep points.
+
+    Args:
+        cur_sweep_cfg (dict): Config for sampling current points.
+        prev_sweep_cfg (dict, optional): Config for sampling previous points.
+            Defaults to None.
+        time_dim (int): Index that indicate the time dimension
+            for input points. Defaults to 3.
+    """
+
+    def __init__(self,
+                 cur_sweep_cfg: dict,
+                 prev_sweep_cfg: Optional[dict] = None,
+                 time_dim: int = 3) -> None:
+        self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg)
+        self.cur_voxel_num = self.cur_voxel_generator._max_voxels
+        self.time_dim = time_dim
+        if prev_sweep_cfg is not None:
+            assert prev_sweep_cfg['max_num_points'] == \
+                cur_sweep_cfg['max_num_points']
+            self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg)
+            self.prev_voxel_num = self.prev_voxel_generator._max_voxels
+        else:
+            self.prev_voxel_generator = None
+            self.prev_voxel_num = 0
+
+    def _sample_points(self, points: np.ndarray, sampler: VoxelGenerator,
+                       point_dim: int) -> np.ndarray:
+        """Sample points for each points subset.
+
+        Args:
+            points (np.ndarray): Points subset to be sampled.
+            sampler (VoxelGenerator): Voxel based sampler for
+                each points subset.
+            point_dim (int): The dimension of each points.
+
+        Returns:
+            np.ndarray: Sampled points.
+        """
+        voxels, coors, num_points_per_voxel = sampler.generate(points)
+        if voxels.shape[0] < sampler._max_voxels:
+            padding_points = np.zeros([
+                sampler._max_voxels - voxels.shape[0], sampler._max_num_points,
+                point_dim
+            ],
+                                      dtype=points.dtype)
+            padding_points[:] = voxels[0]
+            sample_points = np.concatenate([voxels, padding_points], axis=0)
+        else:
+            sample_points = voxels
+
+        return sample_points
+
+    def transform(self, results: dict) -> dict:
+        """Call function to sample points from multiple sweeps.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after sampling, 'points', 'pts_instance_mask'
+            and 'pts_semantic_mask' keys are updated in the result dict.
+        """
+        points = results['points']
+        original_dim = points.shape[1]
+
+        # TODO: process instance and semantic mask while _max_num_points
+        # is larger than 1
+        # Extend points with seg and mask fields
+        map_fields2dim = []
+        start_dim = original_dim
+        points_numpy = points.numpy()
+        extra_channel = [points_numpy]
+        for idx, key in enumerate(results['pts_mask_fields']):
+            map_fields2dim.append((key, idx + start_dim))
+            extra_channel.append(results[key][..., None])
+
+        start_dim += len(results['pts_mask_fields'])
+        for idx, key in enumerate(results['pts_seg_fields']):
+            map_fields2dim.append((key, idx + start_dim))
+            extra_channel.append(results[key][..., None])
+
+        points_numpy = np.concatenate(extra_channel, axis=-1)
+
+        # Split points into two part, current sweep points and
+        # previous sweeps points.
+        # TODO: support different sampling methods for next sweeps points
+        # and previous sweeps points.
+        cur_points_flag = (points_numpy[:, self.time_dim] == 0)
+        cur_sweep_points = points_numpy[cur_points_flag]
+        prev_sweeps_points = points_numpy[~cur_points_flag]
+        if prev_sweeps_points.shape[0] == 0:
+            prev_sweeps_points = cur_sweep_points
+
+        # Shuffle points before sampling
+        np.random.shuffle(cur_sweep_points)
+        np.random.shuffle(prev_sweeps_points)
+
+        cur_sweep_points = self._sample_points(cur_sweep_points,
+                                               self.cur_voxel_generator,
+                                               points_numpy.shape[1])
+        if self.prev_voxel_generator is not None:
+            prev_sweeps_points = self._sample_points(prev_sweeps_points,
+                                                     self.prev_voxel_generator,
+                                                     points_numpy.shape[1])
+
+            points_numpy = np.concatenate(
+                [cur_sweep_points, prev_sweeps_points], 0)
+        else:
+            points_numpy = cur_sweep_points
+
+        if self.cur_voxel_generator._max_num_points == 1:
+            points_numpy = points_numpy.squeeze(1)
+        results['points'] = points.new_point(points_numpy[..., :original_dim])
+
+        # Restore the corresponding seg and mask fields
+        for key, dim_index in map_fields2dim:
+            results[key] = points_numpy[..., dim_index]
+
+        return results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+
+        def _auto_indent(repr_str, indent):
+            repr_str = repr_str.split('\n')
+            repr_str = [' ' * indent + t + '\n' for t in repr_str]
+            repr_str = ''.join(repr_str)[:-1]
+            return repr_str
+
+        repr_str = self.__class__.__name__
+        indent = 4
+        repr_str += '(\n'
+        repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n'
+        repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n'
+        repr_str += ' ' * indent + f'time_dim={self.time_dim},\n'
+        repr_str += ' ' * indent + 'cur_voxel_generator=\n'
+        repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n'
+        repr_str += ' ' * indent + 'prev_voxel_generator=\n'
+        repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class AffineResize(BaseTransform):
+    """Get the affine transform matrices to the target size.
+
+    Different from :class:`RandomAffine` in MMDetection, this class can
+    calculate the affine transform matrices while resizing the input image
+    to a fixed size. The affine transform matrices include: 1) matrix
+    transforming original image to the network input image size. 2) matrix
+    transforming original image to the network output feature map size.
+
+    Args:
+        img_scale (tuple): Images scales for resizing.
+        down_ratio (int): The down ratio of feature map.
+            Actually the arg should be >= 1.
+        bbox_clip_border (bool): Whether clip the objects
+            outside the border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 img_scale: Tuple,
+                 down_ratio: int,
+                 bbox_clip_border: bool = True) -> None:
+
+        self.img_scale = img_scale
+        self.down_ratio = down_ratio
+        self.bbox_clip_border = bbox_clip_border
+
+    def transform(self, results: dict) -> dict:
+        """Call function to do affine transform to input image and labels.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after affine resize, 'affine_aug', 'trans_mat'
+            keys are added in the result dict.
+        """
+        # The results have gone through RandomShiftScale before AffineResize
+        if 'center' not in results:
+            img = results['img']
+            height, width = img.shape[:2]
+            center = np.array([width / 2, height / 2], dtype=np.float32)
+            size = np.array([width, height], dtype=np.float32)
+            results['affine_aug'] = False
+        else:
+            # The results did not go through RandomShiftScale before
+            # AffineResize
+            img = results['img']
+            center = results['center']
+            size = results['size']
+
+        trans_affine = self._get_transform_matrix(center, size, self.img_scale)
+
+        img = cv2.warpAffine(img, trans_affine[:2, :], self.img_scale)
+
+        if isinstance(self.down_ratio, tuple):
+            trans_mat = [
+                self._get_transform_matrix(
+                    center, size,
+                    (self.img_scale[0] // ratio, self.img_scale[1] // ratio))
+                for ratio in self.down_ratio
+            ]  # (3, 3)
+        else:
+            trans_mat = self._get_transform_matrix(
+                center, size, (self.img_scale[0] // self.down_ratio,
+                               self.img_scale[1] // self.down_ratio))
+
+        results['img'] = img
+        results['img_shape'] = img.shape
+        results['pad_shape'] = img.shape
+        results['trans_mat'] = trans_mat
+
+        if 'gt_bboxes' in results:
+            self._affine_bboxes(results, trans_affine)
+
+        if 'centers_2d' in results:
+            centers2d = self._affine_transform(results['centers_2d'],
+                                               trans_affine)
+            valid_index = (centers2d[:, 0] >
+                           0) & (centers2d[:, 0] <
+                                 self.img_scale[0]) & (centers2d[:, 1] > 0) & (
+                                     centers2d[:, 1] < self.img_scale[1])
+            results['centers_2d'] = centers2d[valid_index]
+
+            if 'gt_bboxes' in results:
+                results['gt_bboxes'] = results['gt_bboxes'][valid_index]
+                if 'gt_bboxes_labels' in results:
+                    results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
+                        valid_index]
+                if 'gt_masks' in results:
+                    raise NotImplementedError(
+                        'AffineResize only supports bbox.')
+
+            if 'gt_bboxes_3d' in results:
+                results['gt_bboxes_3d'].tensor = results[
+                    'gt_bboxes_3d'].tensor[valid_index]
+                if 'gt_labels_3d' in results:
+                    results['gt_labels_3d'] = results['gt_labels_3d'][
+                        valid_index]
+
+            results['depths'] = results['depths'][valid_index]
+
+        return results
+
+    def _affine_bboxes(self, results: dict, matrix: np.ndarray) -> None:
+        """Affine transform bboxes to input image.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+            matrix (np.ndarray): Matrix transforming original
+                image to the network input image size.
+                shape: (3, 3)
+        """
+
+        bboxes = results['gt_bboxes']
+        bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix)
+        bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix)
+        if self.bbox_clip_border:
+            bboxes[:, [0, 2]] = bboxes[:, [0, 2]].clip(0,
+                                                       self.img_scale[0] - 1)
+            bboxes[:, [1, 3]] = bboxes[:, [1, 3]].clip(0,
+                                                       self.img_scale[1] - 1)
+        results['gt_bboxes'] = bboxes
+
+    def _affine_transform(self, points: np.ndarray,
+                          matrix: np.ndarray) -> np.ndarray:
+        """Affine transform bbox points to input image.
+
+        Args:
+            points (np.ndarray): Points to be transformed.
+                shape: (N, 2)
+            matrix (np.ndarray): Affine transform matrix.
+                shape: (3, 3)
+
+        Returns:
+            np.ndarray: Transformed points.
+        """
+        num_points = points.shape[0]
+        hom_points_2d = np.concatenate((points, np.ones((num_points, 1))),
+                                       axis=1)
+        hom_points_2d = hom_points_2d.T
+        affined_points = np.matmul(matrix, hom_points_2d).T
+        return affined_points[:, :2]
+
+    def _get_transform_matrix(self, center: Tuple, scale: Tuple,
+                              output_scale: Tuple[float]) -> np.ndarray:
+        """Get affine transform matrix.
+
+        Args:
+            center (tuple): Center of current image.
+            scale (tuple): Scale of current image.
+            output_scale (tuple[float]): The transform target image scales.
+
+        Returns:
+            np.ndarray: Affine transform matrix.
+        """
+        # TODO: further add rot and shift here.
+        src_w = scale[0]
+        dst_w = output_scale[0]
+        dst_h = output_scale[1]
+
+        src_dir = np.array([0, src_w * -0.5])
+        dst_dir = np.array([0, dst_w * -0.5])
+
+        src = np.zeros((3, 2), dtype=np.float32)
+        dst = np.zeros((3, 2), dtype=np.float32)
+        src[0, :] = center
+        src[1, :] = center + src_dir
+        dst[0, :] = np.array([dst_w * 0.5, dst_h * 0.5])
+        dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
+
+        src[2, :] = self._get_ref_point(src[0, :], src[1, :])
+        dst[2, :] = self._get_ref_point(dst[0, :], dst[1, :])
+
+        get_matrix = cv2.getAffineTransform(src, dst)
+
+        matrix = np.concatenate((get_matrix, [[0., 0., 1.]]))
+
+        return matrix.astype(np.float32)
+
+    def _get_ref_point(self, ref_point1: np.ndarray,
+                       ref_point2: np.ndarray) -> np.ndarray:
+        """Get reference point to calculate affine transform matrix.
+
+        While using opencv to calculate the affine matrix, we need at least
+        three corresponding points separately on original image and target
+        image. Here we use two points to get the the third reference point.
+        """
+        d = ref_point1 - ref_point2
+        ref_point3 = ref_point2 + np.array([-d[1], d[0]])
+        return ref_point3
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(img_scale={self.img_scale}, '
+        repr_str += f'down_ratio={self.down_ratio}) '
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomShiftScale(BaseTransform):
+    """Random shift scale.
+
+    Different from the normal shift and scale function, it doesn't
+    directly shift or scale image. It can record the shift and scale
+    infos into loading TRANSFORMS. It's designed to be used with
+    AffineResize together.
+
+    Args:
+        shift_scale (tuple[float]): Shift and scale range.
+        aug_prob (float): The shifting and scaling probability.
+    """
+
+    def __init__(self, shift_scale: Tuple[float], aug_prob: float) -> None:
+
+        self.shift_scale = shift_scale
+        self.aug_prob = aug_prob
+
+    def transform(self, results: dict) -> dict:
+        """Call function to record random shift and scale infos.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after random shift and scale, 'center', 'size'
+            and 'affine_aug' keys are added in the result dict.
+        """
+        img = results['img']
+
+        height, width = img.shape[:2]
+
+        center = np.array([width / 2, height / 2], dtype=np.float32)
+        size = np.array([width, height], dtype=np.float32)
+
+        if random.random() < self.aug_prob:
+            shift, scale = self.shift_scale[0], self.shift_scale[1]
+            shift_ranges = np.arange(-shift, shift + 0.1, 0.1)
+            center[0] += size[0] * random.choice(shift_ranges)
+            center[1] += size[1] * random.choice(shift_ranges)
+            scale_ranges = np.arange(1 - scale, 1 + scale + 0.1, 0.1)
+            size *= random.choice(scale_ranges)
+            results['affine_aug'] = True
+        else:
+            results['affine_aug'] = False
+
+        results['center'] = center
+        results['size'] = size
+
+        return results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(shift_scale={self.shift_scale}, '
+        repr_str += f'aug_prob={self.aug_prob}) '
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Resize3D(Resize):
+
+    def _resize_3d(self, results: dict) -> None:
+        """Resize centers_2d and modify camera intrinisc with
+        ``results['scale']``."""
+        if 'centers_2d' in results:
+            results['centers_2d'] *= results['scale_factor'][:2]
+        results['cam2img'][0] *= np.array(results['scale_factor'][0])
+        results['cam2img'][1] *= np.array(results['scale_factor'][1])
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to resize images, bounding boxes, semantic
+        segmentation map and keypoints.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
+            'gt_keypoints', 'scale', 'scale_factor', 'img_shape',
+            and 'keep_ratio' keys are updated in result dict.
+        """
+
+        super(Resize3D, self).transform(results)
+        self._resize_3d(results)
+        return results
+
+
+@TRANSFORMS.register_module()
+class RandomResize3D(RandomResize):
+    """The difference between RandomResize3D and RandomResize:
+
+    1. Compared to RandomResize, this class would further
+        check if scale is already set in results.
+    2. During resizing, this class would modify the centers_2d
+        and cam2img with ``results['scale']``.
+    """
+
+    def _resize_3d(self, results: dict) -> None:
+        """Resize centers_2d and modify camera intrinisc with
+        ``results['scale']``."""
+        if 'centers_2d' in results:
+            results['centers_2d'] *= results['scale_factor'][:2]
+        results['cam2img'][0] *= np.array(results['scale_factor'][0])
+        results['cam2img'][1] *= np.array(results['scale_factor'][1])
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to resize images, bounding boxes, masks, semantic
+        segmentation map. Compared to RandomResize, this function would further
+        check if scale is already set in results.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor',
+            'keep_ratio' keys are added into result dict.
+        """
+        if 'scale' not in results:
+            results['scale'] = self._random_scale()
+        self.resize.scale = results['scale']
+        results = self.resize(results)
+        self._resize_3d(results)
+
+        return results
+
+
+@TRANSFORMS.register_module()
+class RandomCrop3D(RandomCrop):
+    """3D version of RandomCrop. RamdomCrop3D supports the modifications of
+    camera intrinsic matrix and using predefined randomness variable to do the
+    augmentation.
+
+    The absolute ``crop_size`` is sampled based on ``crop_type`` and
+    ``image_size``, then the cropped results are generated.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (np.float32) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_masks (optional)
+    - gt_ignore_flags (optional)
+    - gt_seg_map (optional)
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        crop_size (tuple): The relative ratio or absolute pixels of
+            height and width.
+        crop_type (str): One of "relative_range", "relative",
+            "absolute", "absolute_range". "relative" randomly crops
+            (h * crop_size[0], w * crop_size[1]) part from an input of size
+            (h, w). "relative_range" uniformly samples relative crop size from
+            range [crop_size[0], 1] and [crop_size[1], 1] for height and width
+            respectively. "absolute" crops from an input with absolute size
+            (crop_size[0], crop_size[1]). "absolute_range" uniformly samples
+            crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w
+            in range [crop_size[0], min(w, crop_size[1])].
+            Defaults to "absolute".
+        allow_negative_crop (bool): Whether to allow a crop that does
+            not contain any bbox area. Defaults to False.
+        recompute_bbox (bool): Whether to re-compute the boxes based
+            on cropped instance masks. Defaults to False.
+        bbox_clip_border (bool): Whether clip the objects outside
+            the border of the image. Defaults to True.
+        rel_offset_h (tuple): The cropping interval of image height. Defaults
+            to (0., 1.).
+        rel_offset_w (tuple): The cropping interval of image width. Defaults
+            to (0., 1.).
+
+    Note:
+        - If the image is smaller than the absolute crop size, return the
+          original image.
+        - The keys for bboxes, labels and masks must be aligned. That is,
+          ``gt_bboxes`` corresponds to ``gt_labels`` and ``gt_masks``, and
+          ``gt_bboxes_ignore`` corresponds to ``gt_labels_ignore`` and
+          ``gt_masks_ignore``.
+        - If the crop does not contain any gt-bbox region and
+          ``allow_negative_crop`` is set to False, skip this image.
+    """
+
+    def __init__(
+        self,
+        crop_size: tuple,
+        crop_type: str = 'absolute',
+        allow_negative_crop: bool = False,
+        recompute_bbox: bool = False,
+        bbox_clip_border: bool = True,
+        rel_offset_h: tuple = (0., 1.),
+        rel_offset_w: tuple = (0., 1.)
+    ) -> None:
+        super().__init__(
+            crop_size=crop_size,
+            crop_type=crop_type,
+            allow_negative_crop=allow_negative_crop,
+            recompute_bbox=recompute_bbox,
+            bbox_clip_border=bbox_clip_border)
+        # rel_offset specifies the relative offset range of cropping origin
+        # [0., 1.] means starting from 0*margin to 1*margin + 1
+        self.rel_offset_h = rel_offset_h
+        self.rel_offset_w = rel_offset_w
+
+    def _crop_data(self,
+                   results: dict,
+                   crop_size: tuple,
+                   allow_negative_crop: bool = False) -> dict:
+        """Function to randomly crop images, bounding boxes, masks, semantic
+        segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+            crop_size (tuple): Expected absolute size after cropping, (h, w).
+            allow_negative_crop (bool): Whether to allow a crop that does not
+                contain any bbox area. Defaults to False.
+
+        Returns:
+            dict: Randomly cropped results, 'img_shape' key in result dict is
+            updated according to crop size.
+        """
+        assert crop_size[0] > 0 and crop_size[1] > 0
+        for key in results.get('img_fields', ['img']):
+            img = results[key]
+            if 'img_crop_offset' not in results:
+                margin_h = max(img.shape[0] - crop_size[0], 0)
+                margin_w = max(img.shape[1] - crop_size[1], 0)
+                # TOCHECK: a little different from LIGA implementation
+                offset_h = np.random.randint(
+                    self.rel_offset_h[0] * margin_h,
+                    self.rel_offset_h[1] * margin_h + 1)
+                offset_w = np.random.randint(
+                    self.rel_offset_w[0] * margin_w,
+                    self.rel_offset_w[1] * margin_w + 1)
+            else:
+                offset_w, offset_h = results['img_crop_offset']
+
+            crop_h = min(crop_size[0], img.shape[0])
+            crop_w = min(crop_size[1], img.shape[1])
+            crop_y1, crop_y2 = offset_h, offset_h + crop_h
+            crop_x1, crop_x2 = offset_w, offset_w + crop_w
+
+            # crop the image
+            img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
+            img_shape = img.shape
+            results[key] = img
+        results['img_shape'] = img_shape
+
+        # crop bboxes accordingly and clip to the image boundary
+        for key in results.get('bbox_fields', []):
+            # e.g. gt_bboxes and gt_bboxes_ignore
+            bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
+                                   dtype=np.float32)
+            bboxes = results[key] - bbox_offset
+            if self.bbox_clip_border:
+                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+            valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & (
+                bboxes[:, 3] > bboxes[:, 1])
+            # If the crop does not contain any gt-bbox area and
+            # allow_negative_crop is False, skip this image.
+            if (key == 'gt_bboxes' and not valid_inds.any()
+                    and not allow_negative_crop):
+                return None
+            results[key] = bboxes[valid_inds, :]
+            # label fields. e.g. gt_labels and gt_labels_ignore
+            label_key = self.bbox2label.get(key)
+            if label_key in results:
+                results[label_key] = results[label_key][valid_inds]
+
+            # mask fields, e.g. gt_masks and gt_masks_ignore
+            mask_key = self.bbox2mask.get(key)
+            if mask_key in results:
+                results[mask_key] = results[mask_key][
+                    valid_inds.nonzero()[0]].crop(
+                        np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))
+                if self.recompute_bbox:
+                    results[key] = results[mask_key].get_bboxes()
+
+        # crop semantic seg
+        for key in results.get('seg_fields', []):
+            results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]
+
+        # manipulate camera intrinsic matrix
+        # needs to apply offset to K instead of P2 (on KITTI)
+        if isinstance(results['cam2img'], list):
+            # TODO ignore this, but should handle it in the future
+            pass
+        else:
+            K = results['cam2img'][:3, :3].copy()
+            inv_K = np.linalg.inv(K)
+            T = np.matmul(inv_K, results['cam2img'][:3])
+            K[0, 2] -= crop_x1
+            K[1, 2] -= crop_y1
+            offset_cam2img = np.matmul(K, T)
+            results['cam2img'][:offset_cam2img.shape[0], :offset_cam2img.
+                               shape[1]] = offset_cam2img
+
+        results['img_crop_offset'] = [offset_w, offset_h]
+
+        return results
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to randomly crop images, bounding boxes, masks,
+        semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Randomly cropped results, 'img_shape' key in result dict is
+            updated according to crop size.
+        """
+        image_size = results['img'].shape[:2]
+        if 'crop_size' not in results:
+            crop_size = self._get_crop_size(image_size)
+            results['crop_size'] = crop_size
+        else:
+            crop_size = results['crop_size']
+        results = self._crop_data(results, crop_size, self.allow_negative_crop)
+        return results
+
+    def __repr__(self) -> dict:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(crop_size={self.crop_size}, '
+        repr_str += f'crop_type={self.crop_type}, '
+        repr_str += f'allow_negative_crop={self.allow_negative_crop}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border}), '
+        repr_str += f'rel_offset_h={self.rel_offset_h}), '
+        repr_str += f'rel_offset_w={self.rel_offset_w})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class PhotoMetricDistortion3D(PhotoMetricDistortion):
+    """Apply photometric distortion to image sequentially, every transformation
+    is applied with a probability of 0.5. The position of random contrast is in
+    second or second to last.
+
+    PhotoMetricDistortion3D further support using predefined randomness
+    variable to do the augmentation.
+
+    1. random brightness
+    2. random contrast (mode 0)
+    3. convert color from BGR to HSV
+    4. random saturation
+    5. random hue
+    6. convert color from HSV to BGR
+    7. random contrast (mode 1)
+    8. randomly swap channels
+
+    Required Keys:
+
+    - img (np.uint8)
+
+    Modified Keys:
+
+    - img (np.float32)
+
+    Args:
+        brightness_delta (int): delta of brightness.
+        contrast_range (sequence): range of contrast.
+        saturation_range (sequence): range of saturation.
+        hue_delta (int): delta of hue.
+    """
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to perform photometric distortion on images.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images distorted.
+        """
+        assert 'img' in results, '`img` is not found in results'
+        img = results['img']
+        img = img.astype(np.float32)
+        if 'photometric_param' not in results:
+            photometric_param = self._random_flags()
+            results['photometric_param'] = photometric_param
+        else:
+            photometric_param = results['photometric_param']
+
+        (mode, brightness_flag, contrast_flag, saturation_flag, hue_flag,
+         swap_flag, delta_value, alpha_value, saturation_value, hue_value,
+         swap_value) = photometric_param
+
+        # random brightness
+        if brightness_flag:
+            img += delta_value
+
+        # mode == 0 --> do random contrast first
+        # mode == 1 --> do random contrast last
+        if mode == 1:
+            if contrast_flag:
+                img *= alpha_value
+
+        # convert color from BGR to HSV
+        img = mmcv.bgr2hsv(img)
+
+        # random saturation
+        if saturation_flag:
+            img[..., 1] *= saturation_value
+
+        # random hue
+        if hue_flag:
+            img[..., 0] += hue_value
+            img[..., 0][img[..., 0] > 360] -= 360
+            img[..., 0][img[..., 0] < 0] += 360
+
+        # convert color from HSV to BGR
+        img = mmcv.hsv2bgr(img)
+
+        # random contrast
+        if mode == 0:
+            if contrast_flag:
+                img *= alpha_value
+
+        # randomly swap channels
+        if swap_flag:
+            img = img[..., swap_value]
+
+        results['img'] = img
+        return results
+
+
+@TRANSFORMS.register_module()
+class MultiViewWrapper(BaseTransform):
+    """Wrap transformation from single-view into multi-view.
+
+    The wrapper processes the images from multi-view one by one. For each
+    image, it constructs a pseudo dict according to the keys specified by the
+    'process_fields' parameter. After the transformation is finished, desired
+    information can be collected by specifying the keys in the 'collected_keys'
+    parameter. Multi-view images share the same transformation parameters
+    but do not share the same magnitude when a random transformation is
+    conducted.
+
+    Args:
+        transforms (list[dict]): A list of dict specifying the transformations
+            for the monocular situation.
+        override_aug_config (bool): flag of whether to use the same aug config
+            for multiview image. Defaults to True.
+        process_fields (list): Desired keys that the transformations should
+            be conducted on. Defaults to ['img', 'cam2img', 'lidar2cam'].
+        collected_keys (list): Collect information in transformation
+            like rotate angles, crop roi, and flip state. Defaults to
+                ['scale', 'scale_factor', 'crop',
+                 'crop_offset', 'ori_shape',
+                 'pad_shape', 'img_shape',
+                 'pad_fixed_size', 'pad_size_divisor',
+                 'flip', 'flip_direction', 'rotate'].
+        randomness_keys (list): The keys that related to the randomness
+            in transformation. Defaults to
+                    ['scale', 'scale_factor', 'crop_size', 'flip',
+                     'flip_direction', 'photometric_param']
+    """
+
+    def __init__(
+        self,
+        transforms: dict,
+        override_aug_config: bool = True,
+        process_fields: list = ['img', 'cam2img', 'lidar2cam'],
+        collected_keys: list = [
+            'scale', 'scale_factor', 'crop', 'img_crop_offset', 'ori_shape',
+            'pad_shape', 'img_shape', 'pad_fixed_size', 'pad_size_divisor',
+            'flip', 'flip_direction', 'rotate'
+        ],
+        randomness_keys: list = [
+            'scale', 'scale_factor', 'crop_size', 'img_crop_offset', 'flip',
+            'flip_direction', 'photometric_param'
+        ]
+    ) -> None:
+        self.transforms = Compose(transforms)
+        self.override_aug_config = override_aug_config
+        self.collected_keys = collected_keys
+        self.process_fields = process_fields
+        self.randomness_keys = randomness_keys
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to do the transform for multiview image.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: output dict after transformtaion
+        """
+        # store the augmentation related keys for each image.
+        for key in self.collected_keys:
+            if key not in input_dict or \
+                    not isinstance(input_dict[key], list):
+                input_dict[key] = []
+        prev_process_dict = {}
+        for img_id in range(len(input_dict['img'])):
+            process_dict = {}
+
+            # override the process dict (e.g. scale in random scale,
+            # crop_size in random crop, flip, flip_direction in
+            # random flip)
+            if img_id != 0 and self.override_aug_config:
+                for key in self.randomness_keys:
+                    if key in prev_process_dict:
+                        process_dict[key] = prev_process_dict[key]
+
+            for key in self.process_fields:
+                if key in input_dict:
+                    process_dict[key] = input_dict[key][img_id]
+            process_dict = self.transforms(process_dict)
+            # store the randomness variable in transformation.
+            prev_process_dict = process_dict
+
+            # store the related results to results_dict
+            for key in self.process_fields:
+                if key in process_dict:
+                    input_dict[key][img_id] = process_dict[key]
+            # update the keys
+            for key in self.collected_keys:
+                if key in process_dict:
+                    if len(input_dict[key]) == img_id + 1:
+                        input_dict[key][img_id] = process_dict[key]
+                    else:
+                        input_dict[key].append(process_dict[key])
+
+        for key in self.collected_keys:
+            if len(input_dict[key]) == 0:
+                input_dict.pop(key)
+        return input_dict
+
+
+@TRANSFORMS.register_module()
+class PolarMix(BaseTransform):
+    """PolarMix data augmentation.
+
+    The polarmix transform steps are as follows:
+
+        1. Another random point cloud is picked by dataset.
+        2. Exchange sectors of two point clouds that are cut with certain
+           azimuth angles.
+        3. Cut point instances from picked point cloud, rotate them by multiple
+           azimuth angles, and paste the cut and rotated instances.
+
+    Required Keys:
+
+    - points (:obj:`BasePoints`)
+    - pts_semantic_mask (np.int64)
+    - dataset (:obj:`BaseDataset`)
+
+    Modified Keys:
+
+    - points (:obj:`BasePoints`)
+    - pts_semantic_mask (np.int64)
+
+    Args:
+        instance_classes (List[int]): Semantic masks which represent the
+            instance.
+        swap_ratio (float): Swap ratio of two point cloud. Defaults to 0.5.
+        rotate_paste_ratio (float): Rotate paste ratio. Defaults to 1.0.
+        pre_transform (Sequence[dict], optional): Sequence of transform object
+            or config dict to be composed. Defaults to None.
+        prob (float): The transformation probability. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 instance_classes: List[int],
+                 swap_ratio: float = 0.5,
+                 rotate_paste_ratio: float = 1.0,
+                 pre_transform: Optional[Sequence[dict]] = None,
+                 prob: float = 1.0) -> None:
+        assert is_list_of(instance_classes, int), \
+            'instance_classes should be a list of int'
+        self.instance_classes = instance_classes
+        self.swap_ratio = swap_ratio
+        self.rotate_paste_ratio = rotate_paste_ratio
+
+        self.prob = prob
+        if pre_transform is None:
+            self.pre_transform = None
+        else:
+            self.pre_transform = Compose(pre_transform)
+
+    def polar_mix_transform(self, input_dict: dict, mix_results: dict) -> dict:
+        """PolarMix transform function.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+            mix_results (dict): Mixed dict picked from dataset.
+
+        Returns:
+            dict: output dict after transformation.
+        """
+        mix_points = mix_results['points']
+        mix_pts_semantic_mask = mix_results['pts_semantic_mask']
+
+        points = input_dict['points']
+        pts_semantic_mask = input_dict['pts_semantic_mask']
+
+        # 1. swap point cloud
+        if np.random.random() < self.swap_ratio:
+            start_angle = (np.random.random() - 1) * np.pi  # -pi~0
+            end_angle = start_angle + np.pi
+            # calculate horizontal angle for each point
+            yaw = -torch.atan2(points.coord[:, 1], points.coord[:, 0])
+            mix_yaw = -torch.atan2(mix_points.coord[:, 1], mix_points.coord[:,
+                                                                            0])
+
+            # select points in sector
+            idx = (yaw <= start_angle) | (yaw >= end_angle)
+            mix_idx = (mix_yaw > start_angle) & (mix_yaw < end_angle)
+
+            # swap
+            points = points.cat([points[idx], mix_points[mix_idx]])
+            pts_semantic_mask = np.concatenate(
+                (pts_semantic_mask[idx.numpy()],
+                 mix_pts_semantic_mask[mix_idx.numpy()]),
+                axis=0)
+
+        # 2. rotate-pasting
+        if np.random.random() < self.rotate_paste_ratio:
+            # extract instance points
+            instance_points, instance_pts_semantic_mask = [], []
+            for instance_class in self.instance_classes:
+                mix_idx = mix_pts_semantic_mask == instance_class
+                instance_points.append(mix_points[mix_idx])
+                instance_pts_semantic_mask.append(
+                    mix_pts_semantic_mask[mix_idx])
+            instance_points = mix_points.cat(instance_points)
+            instance_pts_semantic_mask = np.concatenate(
+                instance_pts_semantic_mask, axis=0)
+
+            # rotate-copy
+            copy_points = [instance_points]
+            copy_pts_semantic_mask = [instance_pts_semantic_mask]
+            angle_list = [
+                np.random.random() * np.pi * 2 / 3,
+                (np.random.random() + 1) * np.pi * 2 / 3
+            ]
+            for angle in angle_list:
+                new_points = instance_points.clone()
+                new_points.rotate(angle)
+                copy_points.append(new_points)
+                copy_pts_semantic_mask.append(instance_pts_semantic_mask)
+            copy_points = instance_points.cat(copy_points)
+            copy_pts_semantic_mask = np.concatenate(
+                copy_pts_semantic_mask, axis=0)
+
+            points = points.cat([points, copy_points])
+            pts_semantic_mask = np.concatenate(
+                (pts_semantic_mask, copy_pts_semantic_mask), axis=0)
+
+        input_dict['points'] = points
+        input_dict['pts_semantic_mask'] = pts_semantic_mask
+        return input_dict
+
+    def transform(self, input_dict: dict) -> dict:
+        """PolarMix transform function.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: output dict after transformation.
+        """
+        if np.random.rand() > self.prob:
+            return input_dict
+
+        assert 'dataset' in input_dict, \
+            '`dataset` is needed to pass through PolarMix, while not found.'
+        dataset = input_dict['dataset']
+
+        # get index of other point cloud
+        index = np.random.randint(0, len(dataset))
+
+        mix_results = dataset.get_data_info(index)
+
+        if self.pre_transform is not None:
+            # pre_transform may also require dataset
+            mix_results.update({'dataset': dataset})
+            # before polarmix need to go through
+            # the necessary pre_transform
+            mix_results = self.pre_transform(mix_results)
+            mix_results.pop('dataset')
+
+        input_dict = self.polar_mix_transform(input_dict, mix_results)
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(instance_classes={self.instance_classes}, '
+        repr_str += f'swap_ratio={self.swap_ratio}, '
+        repr_str += f'rotate_paste_ratio={self.rotate_paste_ratio}, '
+        repr_str += f'pre_transform={self.pre_transform}, '
+        repr_str += f'prob={self.prob})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class LaserMix(BaseTransform):
+    """LaserMix data augmentation.
+
+    The lasermix transform steps are as follows:
+
+        1. Another random point cloud is picked by dataset.
+        2. Divide the point cloud into several regions according to pitch
+           angles and combine the areas crossly.
+
+    Required Keys:
+
+    - points (:obj:`BasePoints`)
+    - pts_semantic_mask (np.int64)
+    - dataset (:obj:`BaseDataset`)
+
+    Modified Keys:
+
+    - points (:obj:`BasePoints`)
+    - pts_semantic_mask (np.int64)
+
+    Args:
+        num_areas (List[int]): A list of area numbers will be divided into.
+        pitch_angles (Sequence[float]): Pitch angles used to divide areas.
+        pre_transform (Sequence[dict], optional): Sequence of transform object
+            or config dict to be composed. Defaults to None.
+        prob (float): The transformation probability. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 num_areas: List[int],
+                 pitch_angles: Sequence[float],
+                 pre_transform: Optional[Sequence[dict]] = None,
+                 prob: float = 1.0) -> None:
+        assert is_list_of(num_areas, int), \
+            'num_areas should be a list of int.'
+        self.num_areas = num_areas
+
+        assert len(pitch_angles) == 2, \
+            'The length of pitch_angles should be 2, ' \
+            f'but got {len(pitch_angles)}.'
+        assert pitch_angles[1] > pitch_angles[0], \
+            'pitch_angles[1] should be larger than pitch_angles[0].'
+        self.pitch_angles = pitch_angles
+
+        self.prob = prob
+        if pre_transform is None:
+            self.pre_transform = None
+        else:
+            self.pre_transform = Compose(pre_transform)
+
+    def laser_mix_transform(self, input_dict: dict, mix_results: dict) -> dict:
+        """LaserMix transform function.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+            mix_results (dict): Mixed dict picked from dataset.
+
+        Returns:
+            dict: output dict after transformation.
+        """
+        mix_points = mix_results['points']
+        mix_pts_semantic_mask = mix_results['pts_semantic_mask']
+
+        points = input_dict['points']
+        pts_semantic_mask = input_dict['pts_semantic_mask']
+
+        # convert angle to radian
+        pitch_angle_down = self.pitch_angles[0] / 180 * np.pi
+        pitch_angle_up = self.pitch_angles[1] / 180 * np.pi
+
+        rho = torch.sqrt(points.coord[:, 0]**2 + points.coord[:, 1]**2)
+        pitch = torch.atan2(points.coord[:, 2], rho)
+        pitch = torch.clamp(pitch, pitch_angle_down + 1e-5,
+                            pitch_angle_up - 1e-5)
+
+        mix_rho = torch.sqrt(mix_points.coord[:, 0]**2 +
+                             mix_points.coord[:, 1]**2)
+        mix_pitch = torch.atan2(mix_points.coord[:, 2], mix_rho)
+        mix_pitch = torch.clamp(mix_pitch, pitch_angle_down + 1e-5,
+                                pitch_angle_up - 1e-5)
+
+        num_areas = np.random.choice(self.num_areas, size=1)[0]
+        angle_list = np.linspace(pitch_angle_up, pitch_angle_down,
+                                 num_areas + 1)
+        out_points = []
+        out_pts_semantic_mask = []
+        for i in range(num_areas):
+            start_angle = angle_list[i + 1]
+            end_angle = angle_list[i]
+            if i % 2 == 0:  # pick from original point cloud
+                idx = (pitch > start_angle) & (pitch <= end_angle)
+                out_points.append(points[idx])
+                out_pts_semantic_mask.append(pts_semantic_mask[idx.numpy()])
+            else:  # pickle from mixed point cloud
+                idx = (mix_pitch > start_angle) & (mix_pitch <= end_angle)
+                out_points.append(mix_points[idx])
+                out_pts_semantic_mask.append(
+                    mix_pts_semantic_mask[idx.numpy()])
+        out_points = points.cat(out_points)
+        out_pts_semantic_mask = np.concatenate(out_pts_semantic_mask, axis=0)
+        input_dict['points'] = out_points
+        input_dict['pts_semantic_mask'] = out_pts_semantic_mask
+        return input_dict
+
+    def transform(self, input_dict: dict) -> dict:
+        """LaserMix transform function.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: output dict after transformation.
+        """
+        if np.random.rand() > self.prob:
+            return input_dict
+
+        assert 'dataset' in input_dict, \
+            '`dataset` is needed to pass through LaserMix, while not found.'
+        dataset = input_dict['dataset']
+
+        # get index of other point cloud
+        index = np.random.randint(0, len(dataset))
+
+        mix_results = dataset.get_data_info(index)
+
+        if self.pre_transform is not None:
+            # pre_transform may also require dataset
+            mix_results.update({'dataset': dataset})
+            # before lasermix need to go through
+            # the necessary pre_transform
+            mix_results = self.pre_transform(mix_results)
+            mix_results.pop('dataset')
+
+        input_dict = self.laser_mix_transform(input_dict, mix_results)
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_areas={self.num_areas}, '
+        repr_str += f'pitch_angles={self.pitch_angles}, '
+        repr_str += f'pre_transform={self.pre_transform}, '
+        repr_str += f'prob={self.prob})'
+        return repr_str
diff --git a/mmde/mmdet3d/datasets/utils.py b/mmde/mmdet3d/datasets/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4ea9b3f033efd706236a85cd4dd018de7a8aa6d
--- /dev/null
+++ b/mmde/mmdet3d/datasets/utils.py
@@ -0,0 +1,123 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmcv.transforms import LoadImageFromFile
+from pyquaternion import Quaternion
+
+# yapf: disable
+from mmdet3d.datasets.transforms import (LoadAnnotations3D,
+                                         LoadImageFromFileMono3D,
+                                         LoadMultiViewImageFromFiles,
+                                         LoadPointsFromFile,
+                                         LoadPointsFromMultiSweeps,
+                                         MultiScaleFlipAug3D, Pack3DDetInputs,
+                                         PointSegClassMapping)
+# yapf: enable
+from mmdet3d.registry import TRANSFORMS
+
+
+def is_loading_function(transform):
+    """Judge whether a transform function is a loading function.
+
+    Note: `MultiScaleFlipAug3D` is a wrapper for multiple pipeline functions,
+    so we need to search if its inner transforms contain any loading function.
+
+    Args:
+        transform (dict | :obj:`Pipeline`): A transform config or a function.
+
+    Returns:
+        bool: Whether it is a loading function. None means can't judge.
+            When transform is `MultiScaleFlipAug3D`, we return None.
+    """
+    # TODO: use more elegant way to distinguish loading modules
+    loading_functions = (LoadImageFromFile, LoadPointsFromFile,
+                         LoadAnnotations3D, LoadMultiViewImageFromFiles,
+                         LoadPointsFromMultiSweeps, Pack3DDetInputs,
+                         LoadImageFromFileMono3D, PointSegClassMapping)
+    if isinstance(transform, dict):
+        obj_cls = TRANSFORMS.get(transform['type'])
+        if obj_cls is None:
+            return False
+        if obj_cls in loading_functions:
+            return True
+        if obj_cls in (MultiScaleFlipAug3D, ):
+            return None
+    elif callable(transform):
+        if isinstance(transform, loading_functions):
+            return True
+        if isinstance(transform, (MultiScaleFlipAug3D)):
+            return None
+    return False
+
+
+def get_loading_pipeline(pipeline):
+    """Only keep loading image, points and annotations related configuration.
+
+    Args:
+        pipeline (list[dict] | list[:obj:`Pipeline`]):
+            Data pipeline configs or list of pipeline functions.
+
+    Returns:
+        list[dict] | list[:obj:`Pipeline`]): The new pipeline list with only
+            keep loading image, points and annotations related configuration.
+
+    Examples:
+        >>> transforms = [
+        ...    dict(type='LoadPointsFromFile',
+        ...         coord_type='LIDAR', load_dim=4, use_dim=4),
+        ...    dict(type='LoadImageFromFile'),
+        ...    dict(type='LoadAnnotations3D',
+        ...         with_bbox=True, with_label_3d=True),
+        ...    dict(type='Resize',
+        ...         img_scale=[(640, 192), (2560, 768)], keep_ratio=True),
+        ...    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+        ...    dict(type='PointsRangeFilter',
+        ...         point_cloud_range=point_cloud_range),
+        ...    dict(type='ObjectRangeFilter',
+        ...         point_cloud_range=point_cloud_range),
+        ...    dict(type='PointShuffle'),
+        ...    dict(type='Normalize', **img_norm_cfg),
+        ...    dict(type='Pad', size_divisor=32),
+        ...    dict(type='DefaultFormatBundle3D', class_names=class_names),
+        ...    dict(type='Collect3D',
+        ...         keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'])
+        ...    ]
+        >>> expected_pipelines = [
+        ...    dict(type='LoadPointsFromFile',
+        ...         coord_type='LIDAR', load_dim=4, use_dim=4),
+        ...    dict(type='LoadImageFromFile'),
+        ...    dict(type='LoadAnnotations3D',
+        ...         with_bbox=True, with_label_3d=True),
+        ...    dict(type='DefaultFormatBundle3D', class_names=class_names),
+        ...    dict(type='Collect3D',
+        ...         keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'])
+        ...    ]
+        >>> assert expected_pipelines == \
+        ...        get_loading_pipeline(transforms)
+    """
+    loading_pipeline = []
+    for transform in pipeline:
+        is_loading = is_loading_function(transform)
+        if is_loading is None:  # MultiScaleFlipAug3D
+            # extract its inner pipeline
+            if isinstance(transform, dict):
+                inner_pipeline = transform.get('transforms', [])
+            else:
+                inner_pipeline = transform.transforms.transforms
+            loading_pipeline.extend(get_loading_pipeline(inner_pipeline))
+        elif is_loading:
+            loading_pipeline.append(transform)
+    assert len(loading_pipeline) > 0, \
+        'The data pipeline in your config file must include ' \
+        'loading step.'
+    return loading_pipeline
+
+
+def convert_quaternion_to_matrix(quaternion: list,
+                                 translation: list = None) -> list:
+    """Compute a transform matrix by given quaternion and translation
+    vector."""
+    result = np.eye(4)
+    result[:3, :3] = Quaternion(quaternion).rotation_matrix
+    if translation is not None:
+        result[:3, 3] = np.array(translation)
+    return result.astype(np.float32).tolist()
diff --git a/mmde/mmdet3d/datasets/waymo_dataset.py b/mmde/mmdet3d/datasets/waymo_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..cda27e42e55bdef8b528f0f1eb08f73caf300e9d
--- /dev/null
+++ b/mmde/mmdet3d/datasets/waymo_dataset.py
@@ -0,0 +1,289 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Callable, List, Union
+
+import numpy as np
+from mmengine import print_log
+from mmengine.fileio import load
+
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures import CameraInstance3DBoxes, LiDARInstance3DBoxes
+from .det3d_dataset import Det3DDataset
+from .kitti_dataset import KittiDataset
+
+
+@DATASETS.register_module()
+class WaymoDataset(KittiDataset):
+    """Waymo Dataset.
+
+    This class serves as the API for experiments on the Waymo Dataset.
+
+    Please refer to `<https://waymo.com/open/download/>`_for data downloading.
+    It is recommended to symlink the dataset root to $MMDETECTION3D/data and
+    organize them as the doc shows.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        data_prefix (dict): data prefix for point cloud and
+            camera data dict. Defaults to dict(
+                                    pts='velodyne',
+                                    CAM_FRONT='image_0',
+                                    CAM_FRONT_LEFT='image_1',
+                                    CAM_FRONT_RIGHT='image_2',
+                                    CAM_SIDE_LEFT='image_3',
+                                    CAM_SIDE_RIGHT='image_4')
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used
+            as input. Defaults to dict(use_lidar=True).
+        default_cam_key (str): Default camera key for lidar2img
+            association. Defaults to 'CAM_FRONT'.
+        box_type_3d (str): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'LiDAR' in this dataset. Available options includes:
+
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        load_type (str): Type of loading mode. Defaults to 'frame_based'.
+
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and need
+                to convert to the FOV-based data type to support image-based
+                detector.
+            - 'fov_image_based': Only load the instances inside the default
+                cam, and need to convert to the FOV-based data type to support
+                image-based detector.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+        pcd_limit_range (List[float]): The range of point cloud
+            used to filter invalid predicted boxes.
+            Defaults to [-85, -85, -5, 85, 85, 5].
+        cam_sync_instances (bool): If use the camera sync label
+            supported from waymo version 1.3.1. Defaults to False.
+        load_interval (int): load frame interval. Defaults to 1.
+        max_sweeps (int): max sweep for each frame. Defaults to 0.
+    """
+    METAINFO = {
+        'classes': ('Car', 'Pedestrian', 'Cyclist'),
+        'palette': [
+            (0, 120, 255),  # Waymo Blue
+            (0, 232, 157),  # Waymo Green
+            (255, 205, 85)  # Amber
+        ]
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 data_prefix: dict = dict(
+                     pts='velodyne',
+                     CAM_FRONT='image_0',
+                     CAM_FRONT_LEFT='image_1',
+                     CAM_FRONT_RIGHT='image_2',
+                     CAM_SIDE_LEFT='image_3',
+                     CAM_SIDE_RIGHT='image_4'),
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_lidar=True),
+                 default_cam_key: str = 'CAM_FRONT',
+                 box_type_3d: str = 'LiDAR',
+                 load_type: str = 'frame_based',
+                 filter_empty_gt: bool = True,
+                 test_mode: bool = False,
+                 pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
+                 cam_sync_instances: bool = False,
+                 load_interval: int = 1,
+                 max_sweeps: int = 0,
+                 **kwargs) -> None:
+        self.load_interval = load_interval
+        # set loading mode for different task settings
+        self.cam_sync_instances = cam_sync_instances
+        # construct self.cat_ids for vision-only anns parsing
+        self.cat_ids = range(len(self.METAINFO['classes']))
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.max_sweeps = max_sweeps
+        # we do not provide backend_args to custom_3d init
+        # because we want disk loading for info
+        # while ceph loading for Prediction2Waymo
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            pipeline=pipeline,
+            modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            pcd_limit_range=pcd_limit_range,
+            default_cam_key=default_cam_key,
+            data_prefix=data_prefix,
+            test_mode=test_mode,
+            load_type=load_type,
+            **kwargs)
+
+    def parse_ann_info(self, info: dict) -> dict:
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Data information of single data sample.
+
+        Returns:
+            dict: Annotation information consists of the following keys:
+
+                - bboxes_3d (:obj:`LiDARInstance3DBoxes`):
+                  3D ground truth bboxes.
+                - bbox_labels_3d (np.ndarray): Labels of ground truths.
+                - gt_bboxes (np.ndarray): 2D ground truth bboxes.
+                - gt_labels (np.ndarray): Labels of ground truths.
+                - difficulty (int): Difficulty defined by KITTI.
+                  0, 1, 2 represent xxxxx respectively.
+        """
+        ann_info = Det3DDataset.parse_ann_info(self, info)
+        if ann_info is None:
+            # empty instance
+            ann_info = {}
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 7), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros(0, dtype=np.int64)
+
+        ann_info = self._remove_dontcare(ann_info)
+        # in kitti, lidar2cam = R0_rect @ Tr_velo_to_cam
+        # convert gt_bboxes_3d to velodyne coordinates with `lidar2cam`
+        if 'gt_bboxes' in ann_info:
+            gt_bboxes = ann_info['gt_bboxes']
+            gt_bboxes_labels = ann_info['gt_bboxes_labels']
+        else:
+            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+            gt_bboxes_labels = np.zeros(0, dtype=np.int64)
+        if 'centers_2d' in ann_info:
+            centers_2d = ann_info['centers_2d']
+            depths = ann_info['depths']
+        else:
+            centers_2d = np.zeros((0, 2), dtype=np.float32)
+            depths = np.zeros((0), dtype=np.float32)
+
+        if self.load_type == 'frame_based':
+            gt_bboxes_3d = LiDARInstance3DBoxes(ann_info['gt_bboxes_3d'])
+        else:
+            gt_bboxes_3d = CameraInstance3DBoxes(ann_info['gt_bboxes_3d'])
+
+        anns_results = dict(
+            gt_bboxes_3d=gt_bboxes_3d,
+            gt_labels_3d=ann_info['gt_labels_3d'],
+            gt_bboxes=gt_bboxes,
+            gt_bboxes_labels=gt_bboxes_labels,
+            centers_2d=centers_2d,
+            depths=depths)
+
+        return anns_results
+
+    def load_data_list(self) -> List[dict]:
+        """Add the load interval.
+
+        Returns:
+            list[dict]: A list of annotation.
+        """  # noqa: E501
+        # `self.ann_file` denotes the absolute annotation file path if
+        # `self.root=None` or relative path if `self.root=/path/to/data/`.
+        annotations = load(self.ann_file)
+        if not isinstance(annotations, dict):
+            raise TypeError(f'The annotations loaded from annotation file '
+                            f'should be a dict, but got {type(annotations)}!')
+        if 'data_list' not in annotations or 'metainfo' not in annotations:
+            raise ValueError('Annotation must have data_list and metainfo '
+                             'keys')
+        metainfo = annotations['metainfo']
+        raw_data_list = annotations['data_list']
+        raw_data_list = raw_data_list[::self.load_interval]
+        if self.load_interval > 1:
+            print_log(
+                f'Sample size will be reduced to 1/{self.load_interval} of'
+                ' the original data sample',
+                logger='current')
+
+        # Meta information load from annotation file will not influence the
+        # existed meta information load from `BaseDataset.METAINFO` and
+        # `metainfo` arguments defined in constructor.
+        for k, v in metainfo.items():
+            self._metainfo.setdefault(k, v)
+
+        # load and parse data_infos.
+        data_list = []
+        for raw_data_info in raw_data_list:
+            # parse raw data information to target format
+            data_info = self.parse_data_info(raw_data_info)
+            if isinstance(data_info, dict):
+                # For image tasks, `data_info` should information if single
+                # image, such as dict(img_path='xxx', width=360, ...)
+                data_list.append(data_info)
+            elif isinstance(data_info, list):
+                # For video tasks, `data_info` could contain image
+                # information of multiple frames, such as
+                # [dict(video_path='xxx', timestamps=...),
+                #  dict(video_path='xxx', timestamps=...)]
+                for item in data_info:
+                    if not isinstance(item, dict):
+                        raise TypeError('data_info must be list of dict, but '
+                                        f'got {type(item)}')
+                data_list.extend(data_info)
+            else:
+                raise TypeError('data_info should be a dict or list of dict, '
+                                f'but got {type(data_info)}')
+
+        return data_list
+
+    def parse_data_info(self, info: dict) -> Union[dict, List[dict]]:
+        """if task is lidar or multiview det, use super() method elif task is
+        mono3d, split the info from frame-wise to img-wise."""
+
+        if self.cam_sync_instances:
+            info['instances'] = info['cam_sync_instances']
+
+        if self.load_type == 'frame_based':
+            return super().parse_data_info(info)
+        elif self.load_type == 'fov_image_based':
+            # only loading the fov image and the fov instance
+            new_image_info = {}
+            new_image_info[self.default_cam_key] = \
+                info['images'][self.default_cam_key]
+            info['images'] = new_image_info
+            info['instances'] = info['cam_instances'][self.default_cam_key]
+            return Det3DDataset.parse_data_info(self, info)
+        else:
+            # in the mono3d, the instances is from cam sync.
+            # Convert frame-based infos to multi-view image-based
+            data_list = []
+            for (cam_key, img_info) in info['images'].items():
+                camera_info = dict()
+                camera_info['sample_idx'] = info['sample_idx']
+                camera_info['timestamp'] = info['timestamp']
+                camera_info['context_name'] = info['context_name']
+                camera_info['images'] = dict()
+                camera_info['images'][cam_key] = img_info
+                if 'img_path' in img_info:
+                    cam_prefix = self.data_prefix.get(cam_key, '')
+                    camera_info['images'][cam_key]['img_path'] = osp.join(
+                        cam_prefix, img_info['img_path'])
+                if 'lidar2cam' in img_info:
+                    camera_info['lidar2cam'] = np.array(img_info['lidar2cam'])
+                if 'cam2img' in img_info:
+                    camera_info['cam2img'] = np.array(img_info['cam2img'])
+                if 'lidar2img' in img_info:
+                    camera_info['lidar2img'] = np.array(img_info['lidar2img'])
+                else:
+                    camera_info['lidar2img'] = camera_info[
+                        'cam2img'] @ camera_info['lidar2cam']
+
+                if not self.test_mode:
+                    # used in training
+                    camera_info['instances'] = info['cam_instances'][cam_key]
+                    camera_info['ann_info'] = self.parse_ann_info(camera_info)
+                if self.test_mode and self.load_eval_anns:
+                    camera_info['instances'] = info['cam_instances'][cam_key]
+                    camera_info['eval_ann_info'] = self.parse_ann_info(
+                        camera_info)
+                data_list.append(camera_info)
+            return data_list
diff --git a/mmde/mmdet3d/engine/__init__.py b/mmde/mmdet3d/engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..34909821eadc8f5f998d84a13b37bf9ae835e02d
--- /dev/null
+++ b/mmde/mmdet3d/engine/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .hooks import BenchmarkHook, Det3DVisualizationHook
+
+__all__ = ['Det3DVisualizationHook', 'BenchmarkHook']
diff --git a/mmde/mmdet3d/engine/hooks/__init__.py b/mmde/mmdet3d/engine/hooks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..578f173d41841c155902ec3b676a8d4cd2d502b9
--- /dev/null
+++ b/mmde/mmdet3d/engine/hooks/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .benchmark_hook import BenchmarkHook
+from .disable_object_sample_hook import DisableObjectSampleHook
+from .visualization_hook import Det3DVisualizationHook
+
+__all__ = [
+    'Det3DVisualizationHook', 'BenchmarkHook', 'DisableObjectSampleHook'
+]
diff --git a/mmde/mmdet3d/engine/hooks/benchmark_hook.py b/mmde/mmdet3d/engine/hooks/benchmark_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..65e613393377a369333ec18a82badea63d84a640
--- /dev/null
+++ b/mmde/mmdet3d/engine/hooks/benchmark_hook.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmengine.hooks import Hook
+
+from mmdet3d.registry import HOOKS
+
+
+@HOOKS.register_module()
+class BenchmarkHook(Hook):
+    """A hook that logs the training speed of each epch."""
+
+    priority = 'NORMAL'
+
+    def after_train_epoch(self, runner) -> None:
+        """We use the average throughput in iterations of the entire training
+        run and skip the first 50 iterations of each epoch to skip GPU warmup
+        time.
+
+        Args:
+            runner (Runner): The runner of the training process.
+        """
+        message_hub = runner.message_hub
+        max_iter_num = len(runner.train_dataloader)
+        speed = message_hub.get_scalar('train/time').mean(max_iter_num - 50)
+        message_hub.update_scalar('train/speed', speed)
+        runner.logger.info(
+            f'Training speed of epoch {runner.epoch + 1} is {speed} s/iter')
+
+    def after_train(self, runner) -> None:
+        """Log average training speed of entire training process.
+
+        Args:
+            runner (Runner): The runner of the training process.
+        """
+        message_hub = runner.message_hub
+        avg_speed = message_hub.get_scalar('train/speed').mean()
+        runner.logger.info('Average training speed of entire training process'
+                           f'is {avg_speed} s/iter')
diff --git a/mmde/mmdet3d/engine/hooks/disable_object_sample_hook.py b/mmde/mmdet3d/engine/hooks/disable_object_sample_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..07d12762be8ad8da563729380658f47354f77ce7
--- /dev/null
+++ b/mmde/mmdet3d/engine/hooks/disable_object_sample_hook.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.dataset import BaseDataset
+from mmengine.hooks import Hook
+from mmengine.model import is_model_wrapper
+from mmengine.runner import Runner
+
+from mmdet3d.datasets.transforms import ObjectSample
+from mmdet3d.registry import HOOKS
+
+
+@HOOKS.register_module()
+class DisableObjectSampleHook(Hook):
+    """The hook of disabling augmentations during training.
+
+    Args:
+        disable_after_epoch (int): The number of epochs after which
+            the ``ObjectSample`` will be closed in the training.
+            Defaults to 15.
+    """
+
+    def __init__(self, disable_after_epoch: int = 15):
+        self.disable_after_epoch = disable_after_epoch
+        self._restart_dataloader = False
+
+    def before_train_epoch(self, runner: Runner):
+        """Close augmentation.
+
+        Args:
+            runner (Runner): The runner.
+        """
+        epoch = runner.epoch
+        train_loader = runner.train_dataloader
+        model = runner.model
+        # TODO: refactor after mmengine using model wrapper
+        if is_model_wrapper(model):
+            model = model.module
+        if epoch == self.disable_after_epoch:
+            runner.logger.info('Disable ObjectSample')
+            dataset = runner.train_dataloader.dataset
+            # handle dataset wrapper
+            if not isinstance(dataset, BaseDataset):
+                dataset = dataset.dataset
+            for transform in dataset.pipeline.transforms:  # noqa: E501
+                if isinstance(transform, ObjectSample):
+                    assert hasattr(transform, 'disabled')
+                    transform.disabled = True
+            # The dataset pipeline cannot be updated when persistent_workers
+            # is True, so we need to force the dataloader's multi-process
+            # restart. This is a very hacky approach.
+            if hasattr(train_loader, 'persistent_workers'
+                       ) and train_loader.persistent_workers is True:
+                train_loader._DataLoader__initialized = False
+                train_loader._iterator = None
+                self._restart_dataloader = True
+        else:
+            # Once the restart is complete, we need to restore
+            # the initialization flag.
+            if self._restart_dataloader:
+                train_loader._DataLoader__initialized = True
diff --git a/mmde/mmdet3d/engine/hooks/visualization_hook.py b/mmde/mmdet3d/engine/hooks/visualization_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..9de46d9692cbdb850d4f79f7316fd00cb422c045
--- /dev/null
+++ b/mmde/mmdet3d/engine/hooks/visualization_hook.py
@@ -0,0 +1,241 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import warnings
+from typing import Optional, Sequence
+
+import mmcv
+import numpy as np
+from mmengine.fileio import get
+from mmengine.hooks import Hook
+from mmengine.logging import print_log
+from mmengine.runner import Runner
+from mmengine.utils import mkdir_or_exist
+from mmengine.visualization import Visualizer
+
+from mmdet3d.registry import HOOKS
+from mmdet3d.structures import Det3DDataSample
+
+
+@HOOKS.register_module()
+class Det3DVisualizationHook(Hook):
+    """Detection Visualization Hook. Used to visualize validation and testing
+    process prediction results.
+
+    In the testing phase:
+
+    1. If ``show`` is True, it means that only the prediction results are
+        visualized without storing data, so ``vis_backends`` needs to
+        be excluded.
+    2. If ``test_out_dir`` is specified, it means that the prediction results
+        need to be saved to ``test_out_dir``. In order to avoid vis_backends
+        also storing data, so ``vis_backends`` needs to be excluded.
+    3. ``vis_backends`` takes effect if the user does not specify ``show``
+        and `test_out_dir``. You can set ``vis_backends`` to WandbVisBackend or
+        TensorboardVisBackend to store the prediction result in Wandb or
+        Tensorboard.
+
+    Args:
+        draw (bool): whether to draw prediction results. If it is False,
+            it means that no drawing will be done. Defaults to False.
+        interval (int): The interval of visualization. Defaults to 50.
+        score_thr (float): The threshold to visualize the bboxes
+            and masks. Defaults to 0.3.
+        show (bool): Whether to display the drawn image. Default to False.
+        vis_task (str): Visualization task. Defaults to 'mono_det'.
+        wait_time (float): The interval of show (s). Defaults to 0.
+        draw_gt (bool): Whether to draw ground truth. Defaults to True.
+        draw_pred (bool): Whether to draw prediction. Defaults to True.
+        show_pcd_rgb (bool): Whether to show RGB point cloud. Defaults to
+            False.
+        test_out_dir (str, optional): directory where painted images
+            will be saved in testing process.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(self,
+                 draw: bool = False,
+                 interval: int = 50,
+                 score_thr: float = 0.3,
+                 show: bool = False,
+                 vis_task: str = 'mono_det',
+                 wait_time: float = 0.,
+                 test_out_dir: Optional[str] = None,
+                 draw_gt: bool = False,
+                 draw_pred: bool = True,
+                 show_pcd_rgb: bool = False,
+                 backend_args: Optional[dict] = None):
+        self._visualizer: Visualizer = Visualizer.get_current_instance()
+        self.interval = interval
+        self.score_thr = score_thr
+        self.show = show
+        if self.show:
+            # No need to think about vis backends.
+            self._visualizer._vis_backends = {}
+            warnings.warn('The show is True, it means that only '
+                          'the prediction results are visualized '
+                          'without storing data, so vis_backends '
+                          'needs to be excluded.')
+        self.vis_task = vis_task
+
+        if show and wait_time == -1:
+            print_log(
+                'Manual control mode, press [Right] to next sample.',
+                logger='current')
+        elif show:
+            print_log(
+                'Autoplay mode, press [SPACE] to pause.', logger='current')
+        self.wait_time = wait_time
+        self.backend_args = backend_args
+        self.draw = draw
+        self.test_out_dir = test_out_dir
+        self._test_index = 0
+        self.draw_gt = draw_gt
+        self.draw_pred = draw_pred
+        self.show_pcd_rgb = show_pcd_rgb
+
+    def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
+                       outputs: Sequence[Det3DDataSample]) -> None:
+        """Run after every ``self.interval`` validation iterations.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the validation process.
+            batch_idx (int): The index of the current batch in the val loop.
+            data_batch (dict): Data from dataloader.
+            outputs (Sequence[:obj:`DetDataSample`]]): A batch of data samples
+                that contain annotations and predictions.
+        """
+        if self.draw is False:
+            return
+
+        # There is no guarantee that the same batch of images
+        # is visualized for each evaluation.
+        total_curr_iter = runner.iter + batch_idx
+
+        data_input = dict()
+
+        # Visualize only the first data
+        if self.vis_task in [
+                'mono_det', 'multi-view_det', 'multi-modality_det'
+        ]:
+            assert 'img_path' in outputs[0], 'img_path is not in outputs[0]'
+            img_path = outputs[0].img_path
+            if isinstance(img_path, list):
+                img = []
+                for single_img_path in img_path:
+                    img_bytes = get(
+                        single_img_path, backend_args=self.backend_args)
+                    single_img = mmcv.imfrombytes(
+                        img_bytes, channel_order='rgb')
+                    img.append(single_img)
+            else:
+                img_bytes = get(img_path, backend_args=self.backend_args)
+                img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
+            data_input['img'] = img
+
+        if self.vis_task in ['lidar_det', 'multi-modality_det', 'lidar_seg']:
+            assert 'lidar_path' in outputs[
+                0], 'lidar_path is not in outputs[0]'
+            lidar_path = outputs[0].lidar_path
+            num_pts_feats = outputs[0].num_pts_feats
+            pts_bytes = get(lidar_path, backend_args=self.backend_args)
+            points = np.frombuffer(pts_bytes, dtype=np.float32)
+            points = points.reshape(-1, num_pts_feats)
+            data_input['points'] = points
+
+        if total_curr_iter % self.interval == 0:
+            self._visualizer.add_datasample(
+                'val sample',
+                data_input,
+                data_sample=outputs[0],
+                draw_gt=self.draw_gt,
+                draw_pred=self.draw_pred,
+                show=self.show,
+                vis_task=self.vis_task,
+                wait_time=self.wait_time,
+                pred_score_thr=self.score_thr,
+                step=total_curr_iter,
+                show_pcd_rgb=self.show_pcd_rgb)
+
+    def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
+                        outputs: Sequence[Det3DDataSample]) -> None:
+        """Run after every testing iterations.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the testing process.
+            batch_idx (int): The index of the current batch in the val loop.
+            data_batch (dict): Data from dataloader.
+            outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples
+                that contain annotations and predictions.
+        """
+        if self.draw is False:
+            return
+
+        if self.test_out_dir is not None:
+            self.test_out_dir = osp.join(runner.work_dir, runner.timestamp,
+                                         self.test_out_dir)
+            mkdir_or_exist(self.test_out_dir)
+
+        for data_sample in outputs:
+            self._test_index += 1
+
+            data_input = dict()
+            assert 'img_path' in data_sample or 'lidar_path' in data_sample, \
+                "'data_sample' must contain 'img_path' or 'lidar_path'"
+
+            out_file = o3d_save_path = None
+
+            if self.vis_task in [
+                    'mono_det', 'multi-view_det', 'multi-modality_det'
+            ]:
+                assert 'img_path' in data_sample, \
+                    'img_path is not in data_sample'
+                img_path = data_sample.img_path
+                if isinstance(img_path, list):
+                    img = []
+                    for single_img_path in img_path:
+                        img_bytes = get(
+                            single_img_path, backend_args=self.backend_args)
+                        single_img = mmcv.imfrombytes(
+                            img_bytes, channel_order='rgb')
+                        img.append(single_img)
+                else:
+                    img_bytes = get(img_path, backend_args=self.backend_args)
+                    img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
+                data_input['img'] = img
+                if self.test_out_dir is not None:
+                    if isinstance(img_path, list):
+                        img_path = img_path[0]
+                    out_file = osp.basename(img_path)
+                    out_file = osp.join(self.test_out_dir, out_file)
+
+            if self.vis_task in [
+                    'lidar_det', 'multi-modality_det', 'lidar_seg'
+            ]:
+                assert 'lidar_path' in data_sample, \
+                    'lidar_path is not in data_sample'
+                lidar_path = data_sample.lidar_path
+                num_pts_feats = data_sample.num_pts_feats
+                pts_bytes = get(lidar_path, backend_args=self.backend_args)
+                points = np.frombuffer(pts_bytes, dtype=np.float32)
+                points = points.reshape(-1, num_pts_feats)
+                data_input['points'] = points
+                if self.test_out_dir is not None:
+                    o3d_save_path = osp.basename(lidar_path).split(
+                        '.')[0] + '.png'
+                    o3d_save_path = osp.join(self.test_out_dir, o3d_save_path)
+
+            self._visualizer.add_datasample(
+                'test sample',
+                data_input,
+                data_sample=data_sample,
+                draw_gt=self.draw_gt,
+                draw_pred=self.draw_pred,
+                show=self.show,
+                vis_task=self.vis_task,
+                wait_time=self.wait_time,
+                pred_score_thr=self.score_thr,
+                out_file=out_file,
+                o3d_save_path=o3d_save_path,
+                step=self._test_index,
+                show_pcd_rgb=self.show_pcd_rgb)
diff --git a/mmde/mmdet3d/evaluation/__init__.py b/mmde/mmdet3d/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c23cc7d73d5f8dc2935b8eb39caf969d962e851
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet3d.evaluation.functional.kitti_utils import (do_eval, eval_class,
+                                                       kitti_eval,
+                                                       kitti_eval_coco_style)
+from .functional import (aggregate_predictions, average_precision,
+                         eval_det_cls, eval_map_recall, fast_hist, get_acc,
+                         get_acc_cls, get_classwise_aps, get_single_class_aps,
+                         indoor_eval, instance_seg_eval, load_lyft_gts,
+                         load_lyft_predictions, lyft_eval, panoptic_seg_eval,
+                         per_class_iou, rename_gt, seg_eval)
+from .metrics import (IndoorMetric, InstanceSegMetric, KittiMetric, LyftMetric,
+                      NuScenesMetric, PanopticSegMetric, SegMetric,
+                      WaymoMetric)
+
+__all__ = [
+    'kitti_eval_coco_style', 'kitti_eval', 'indoor_eval', 'lyft_eval',
+    'seg_eval', 'instance_seg_eval', 'average_precision', 'eval_det_cls',
+    'eval_map_recall', 'indoor_eval', 'aggregate_predictions', 'rename_gt',
+    'instance_seg_eval', 'load_lyft_gts', 'load_lyft_predictions', 'lyft_eval',
+    'get_classwise_aps', 'get_single_class_aps', 'fast_hist', 'per_class_iou',
+    'get_acc', 'get_acc_cls', 'seg_eval', 'KittiMetric', 'NuScenesMetric',
+    'IndoorMetric', 'LyftMetric', 'SegMetric', 'InstanceSegMetric',
+    'WaymoMetric', 'eval_class', 'do_eval', 'PanopticSegMetric',
+    'panoptic_seg_eval'
+]
diff --git a/mmde/mmdet3d/evaluation/functional/__init__.py b/mmde/mmdet3d/evaluation/functional/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e570e05796e9f21a7efd34382d2c75149c67301
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .indoor_eval import (average_precision, eval_det_cls, eval_map_recall,
+                          indoor_eval)
+from .instance_seg_eval import (aggregate_predictions, instance_seg_eval,
+                                rename_gt)
+from .kitti_utils import do_eval, kitti_eval, kitti_eval_coco_style
+from .lyft_eval import (get_classwise_aps, get_single_class_aps, load_lyft_gts,
+                        load_lyft_predictions, lyft_eval)
+from .panoptic_seg_eval import panoptic_seg_eval
+from .scannet_utils import evaluate_matches, scannet_eval
+from .seg_eval import fast_hist, get_acc, get_acc_cls, per_class_iou, seg_eval
+
+__all__ = [
+    'average_precision', 'eval_det_cls', 'eval_map_recall', 'indoor_eval',
+    'aggregate_predictions', 'rename_gt', 'instance_seg_eval', 'load_lyft_gts',
+    'load_lyft_predictions', 'lyft_eval', 'get_classwise_aps',
+    'get_single_class_aps', 'fast_hist', 'per_class_iou', 'get_acc',
+    'get_acc_cls', 'seg_eval', 'kitti_eval', 'kitti_eval_coco_style',
+    'scannet_eval', 'evaluate_matches', 'do_eval', 'panoptic_seg_eval'
+]
diff --git a/mmde/mmdet3d/evaluation/functional/indoor_eval.py b/mmde/mmdet3d/evaluation/functional/indoor_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..7742a66e35cc2449d251f6efb28685cf7fad2c97
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/indoor_eval.py
@@ -0,0 +1,302 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmengine.logging import print_log
+from terminaltables import AsciiTable
+
+
+def average_precision(recalls, precisions, mode='area'):
+    """Calculate average precision (for single or multiple scales).
+
+    Args:
+        recalls (np.ndarray): Recalls with shape of (num_scales, num_dets)
+            or (num_dets, ).
+        precisions (np.ndarray): Precisions with shape of
+            (num_scales, num_dets) or (num_dets, ).
+        mode (str): 'area' or '11points', 'area' means calculating the area
+            under precision-recall curve, '11points' means calculating
+            the average precision of recalls at [0, 0.1, ..., 1]
+
+    Returns:
+        float or np.ndarray: Calculated average precision.
+    """
+    if recalls.ndim == 1:
+        recalls = recalls[np.newaxis, :]
+        precisions = precisions[np.newaxis, :]
+
+    assert recalls.shape == precisions.shape
+    assert recalls.ndim == 2
+
+    num_scales = recalls.shape[0]
+    ap = np.zeros(num_scales, dtype=np.float32)
+    if mode == 'area':
+        zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
+        ones = np.ones((num_scales, 1), dtype=recalls.dtype)
+        mrec = np.hstack((zeros, recalls, ones))
+        mpre = np.hstack((zeros, precisions, zeros))
+        for i in range(mpre.shape[1] - 1, 0, -1):
+            mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
+        for i in range(num_scales):
+            ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
+            ap[i] = np.sum(
+                (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
+    elif mode == '11points':
+        for i in range(num_scales):
+            for thr in np.arange(0, 1 + 1e-3, 0.1):
+                precs = precisions[i, recalls[i, :] >= thr]
+                prec = precs.max() if precs.size > 0 else 0
+                ap[i] += prec
+            ap /= 11
+    else:
+        raise ValueError(
+            'Unrecognized mode, only "area" and "11points" are supported')
+    return ap
+
+
+def eval_det_cls(pred, gt, iou_thr=None):
+    """Generic functions to compute precision/recall for object detection for a
+    single class.
+
+    Args:
+        pred (dict): Predictions mapping from image id to bounding boxes
+            and scores.
+        gt (dict): Ground truths mapping from image id to bounding boxes.
+        iou_thr (list[float]): A list of iou thresholds.
+
+    Return:
+        tuple (np.ndarray, np.ndarray, float): Recalls, precisions and
+            average precision.
+    """
+
+    # {img_id: {'bbox': box structure, 'det': matched list}}
+    class_recs = {}
+    npos = 0
+    for img_id in gt.keys():
+        cur_gt_num = len(gt[img_id])
+        if cur_gt_num != 0:
+            gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32)
+            for i in range(cur_gt_num):
+                gt_cur[i] = gt[img_id][i].tensor
+            bbox = gt[img_id][0].new_box(gt_cur)
+        else:
+            bbox = gt[img_id]
+        det = [[False] * len(bbox) for i in iou_thr]
+        npos += len(bbox)
+        class_recs[img_id] = {'bbox': bbox, 'det': det}
+
+    # construct dets
+    image_ids = []
+    confidence = []
+    ious = []
+    for img_id in pred.keys():
+        cur_num = len(pred[img_id])
+        if cur_num == 0:
+            continue
+        pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32)
+        box_idx = 0
+        for box, score in pred[img_id]:
+            image_ids.append(img_id)
+            confidence.append(score)
+            pred_cur[box_idx] = box.tensor
+            box_idx += 1
+        pred_cur = box.new_box(pred_cur)
+        gt_cur = class_recs[img_id]['bbox']
+        if len(gt_cur) > 0:
+            # calculate iou in each image
+            iou_cur = pred_cur.overlaps(pred_cur, gt_cur)
+            for i in range(cur_num):
+                ious.append(iou_cur[i])
+        else:
+            for i in range(cur_num):
+                ious.append(np.zeros(1))
+
+    confidence = np.array(confidence)
+
+    # sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    image_ids = [image_ids[x] for x in sorted_ind]
+    ious = [ious[x] for x in sorted_ind]
+
+    # go down dets and mark TPs and FPs
+    nd = len(image_ids)
+    tp_thr = [np.zeros(nd) for i in iou_thr]
+    fp_thr = [np.zeros(nd) for i in iou_thr]
+    for d in range(nd):
+        R = class_recs[image_ids[d]]
+        iou_max = -np.inf
+        BBGT = R['bbox']
+        cur_iou = ious[d]
+
+        if len(BBGT) > 0:
+            # compute overlaps
+            for j in range(len(BBGT)):
+                # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...]))
+                iou = cur_iou[j]
+                if iou > iou_max:
+                    iou_max = iou
+                    jmax = j
+
+        for iou_idx, thresh in enumerate(iou_thr):
+            if iou_max > thresh:
+                if not R['det'][iou_idx][jmax]:
+                    tp_thr[iou_idx][d] = 1.
+                    R['det'][iou_idx][jmax] = 1
+                else:
+                    fp_thr[iou_idx][d] = 1.
+            else:
+                fp_thr[iou_idx][d] = 1.
+
+    ret = []
+    for iou_idx, thresh in enumerate(iou_thr):
+        # compute precision recall
+        fp = np.cumsum(fp_thr[iou_idx])
+        tp = np.cumsum(tp_thr[iou_idx])
+        recall = tp / float(npos)
+        # avoid divide by zero in case the first detection matches a difficult
+        # ground truth
+        precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+        ap = average_precision(recall, precision)
+        ret.append((recall, precision, ap))
+
+    return ret
+
+
+def eval_map_recall(pred, gt, ovthresh=None):
+    """Evaluate mAP and recall.
+
+    Generic functions to compute precision/recall for object detection
+        for multiple classes.
+
+    Args:
+        pred (dict): Information of detection results,
+            which maps class_id and predictions.
+        gt (dict): Information of ground truths, which maps class_id and
+            ground truths.
+        ovthresh (list[float], optional): iou threshold. Default: None.
+
+    Return:
+        tuple[dict]: dict results of recall, AP, and precision for all classes.
+    """
+
+    ret_values = {}
+    for classname in gt.keys():
+        if classname in pred:
+            ret_values[classname] = eval_det_cls(pred[classname],
+                                                 gt[classname], ovthresh)
+    recall = [{} for i in ovthresh]
+    precision = [{} for i in ovthresh]
+    ap = [{} for i in ovthresh]
+
+    for label in gt.keys():
+        for iou_idx, thresh in enumerate(ovthresh):
+            if label in pred:
+                recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][
+                    label] = ret_values[label][iou_idx]
+            else:
+                recall[iou_idx][label] = np.zeros(1)
+                precision[iou_idx][label] = np.zeros(1)
+                ap[iou_idx][label] = np.zeros(1)
+
+    return recall, precision, ap
+
+
+def indoor_eval(gt_annos,
+                dt_annos,
+                metric,
+                label2cat,
+                logger=None,
+                box_mode_3d=None):
+    """Indoor Evaluation.
+
+    Evaluate the result of the detection.
+
+    Args:
+        gt_annos (list[dict]): Ground truth annotations.
+        dt_annos (list[dict]): Detection annotations. the dict
+            includes the following keys
+
+            - labels_3d (torch.Tensor): Labels of boxes.
+            - bboxes_3d (:obj:`BaseInstance3DBoxes`):
+                3D bounding boxes in Depth coordinate.
+            - scores_3d (torch.Tensor): Scores of boxes.
+        metric (list[float]): IoU thresholds for computing average precisions.
+        label2cat (tuple): Map from label to category.
+        logger (logging.Logger | str, optional): The way to print the mAP
+            summary. See `mmdet.utils.print_log()` for details. Default: None.
+
+    Return:
+        dict[str, float]: Dict of results.
+    """
+    assert len(dt_annos) == len(gt_annos)
+    pred = {}  # map {class_id: pred}
+    gt = {}  # map {class_id: gt}
+    for img_id in range(len(dt_annos)):
+        # parse detected annotations
+        det_anno = dt_annos[img_id]
+        for i in range(len(det_anno['labels_3d'])):
+            label = det_anno['labels_3d'].numpy()[i]
+            bbox = det_anno['bboxes_3d'].convert_to(box_mode_3d)[i]
+            score = det_anno['scores_3d'].numpy()[i]
+            if label not in pred:
+                pred[int(label)] = {}
+            if img_id not in pred[label]:
+                pred[int(label)][img_id] = []
+            if label not in gt:
+                gt[int(label)] = {}
+            if img_id not in gt[label]:
+                gt[int(label)][img_id] = []
+            pred[int(label)][img_id].append((bbox, score))
+
+        # parse gt annotations
+        gt_anno = gt_annos[img_id]
+
+        gt_boxes = gt_anno['gt_bboxes_3d']
+        labels_3d = gt_anno['gt_labels_3d']
+
+        for i in range(len(labels_3d)):
+            label = labels_3d[i]
+            bbox = gt_boxes[i]
+            if label not in gt:
+                gt[label] = {}
+            if img_id not in gt[label]:
+                gt[label][img_id] = []
+            gt[label][img_id].append(bbox)
+
+    rec, prec, ap = eval_map_recall(pred, gt, metric)
+    ret_dict = dict()
+    header = ['classes']
+    table_columns = [[label2cat[label]
+                      for label in ap[0].keys()] + ['Overall']]
+
+    for i, iou_thresh in enumerate(metric):
+        header.append(f'AP_{iou_thresh:.2f}')
+        header.append(f'AR_{iou_thresh:.2f}')
+        rec_list = []
+        for label in ap[i].keys():
+            ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
+                ap[i][label][0])
+        ret_dict[f'mAP_{iou_thresh:.2f}'] = float(
+            np.mean(list(ap[i].values())))
+
+        table_columns.append(list(map(float, list(ap[i].values()))))
+        table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']]
+        table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
+
+        for label in rec[i].keys():
+            ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
+                rec[i][label][-1])
+            rec_list.append(rec[i][label][-1])
+        ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))
+
+        table_columns.append(list(map(float, rec_list)))
+        table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']]
+        table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
+
+    table_data = [header]
+    table_rows = list(zip(*table_columns))
+    table_data += table_rows
+    table = AsciiTable(table_data)
+    table.inner_footing_row_border = True
+    print_log('\n' + table.table, logger=logger)
+
+    return ret_dict
diff --git a/mmde/mmdet3d/evaluation/functional/instance_seg_eval.py b/mmde/mmdet3d/evaluation/functional/instance_seg_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0717196a2700c0d9434c041d824b1fad5ee42c
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/instance_seg_eval.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmengine.logging import print_log
+from terminaltables import AsciiTable
+
+from .scannet_utils.evaluate_semantic_instance import scannet_eval
+
+
+def aggregate_predictions(masks, labels, scores, valid_class_ids):
+    """Maps predictions to ScanNet evaluator format.
+
+    Args:
+        masks (list[torch.Tensor]): Per scene predicted instance masks.
+        labels (list[torch.Tensor]): Per scene predicted instance labels.
+        scores (list[torch.Tensor]): Per scene predicted instance scores.
+        valid_class_ids (tuple[int]): Ids of valid categories.
+
+    Returns:
+        list[dict]: Per scene aggregated predictions.
+    """
+    infos = []
+    for id, (mask, label, score) in enumerate(zip(masks, labels, scores)):
+        mask = mask.clone().numpy()
+        label = label.clone().numpy()
+        score = score.clone().numpy()
+        info = dict()
+        n_instances = mask.max() + 1
+        for i in range(n_instances):
+            # match pred_instance['filename'] from assign_instances_for_scan
+            file_name = f'{id}_{i}'
+            info[file_name] = dict()
+            info[file_name]['mask'] = (mask == i).astype(np.int64)
+            info[file_name]['label_id'] = valid_class_ids[label[i]]
+            info[file_name]['conf'] = score[i]
+        infos.append(info)
+    return infos
+
+
+def rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids):
+    """Maps gt instance and semantic masks to instance masks for ScanNet
+    evaluator.
+
+    Args:
+        gt_semantic_masks (list[torch.Tensor]): Per scene gt semantic masks.
+        gt_instance_masks (list[torch.Tensor]): Per scene gt instance masks.
+        valid_class_ids (tuple[int]): Ids of valid categories.
+
+    Returns:
+        list[np.array]: Per scene instance masks.
+    """
+    renamed_instance_masks = []
+    for semantic_mask, instance_mask in zip(gt_semantic_masks,
+                                            gt_instance_masks):
+        semantic_mask = semantic_mask.clone().numpy()
+        instance_mask = instance_mask.clone().numpy()
+        unique = np.unique(instance_mask)
+        assert len(unique) < 1000
+        for i in unique:
+            semantic_instance = semantic_mask[instance_mask == i]
+            semantic_unique = np.unique(semantic_instance)
+            assert len(semantic_unique) == 1
+            if semantic_unique[0] < len(valid_class_ids):
+                instance_mask[
+                    instance_mask ==
+                    i] = 1000 * valid_class_ids[semantic_unique[0]] + i
+        renamed_instance_masks.append(instance_mask)
+    return renamed_instance_masks
+
+
+def instance_seg_eval(gt_semantic_masks,
+                      gt_instance_masks,
+                      pred_instance_masks,
+                      pred_instance_labels,
+                      pred_instance_scores,
+                      valid_class_ids,
+                      class_labels,
+                      options=None,
+                      logger=None):
+    """Instance Segmentation Evaluation.
+
+    Evaluate the result of the instance segmentation.
+
+    Args:
+        gt_semantic_masks (list[torch.Tensor]): Ground truth semantic masks.
+        gt_instance_masks (list[torch.Tensor]): Ground truth instance masks.
+        pred_instance_masks (list[torch.Tensor]): Predicted instance masks.
+        pred_instance_labels (list[torch.Tensor]): Predicted instance labels.
+        pred_instance_scores (list[torch.Tensor]): Predicted instance labels.
+        valid_class_ids (tuple[int]): Ids of valid categories.
+        class_labels (tuple[str]): Names of valid categories.
+        options (dict, optional): Additional options. Keys may contain:
+            `overlaps`, `min_region_sizes`, `distance_threshes`,
+            `distance_confs`. Default: None.
+        logger (logging.Logger | str, optional): The way to print the mAP
+            summary. See `mmdet.utils.print_log()` for details. Default: None.
+
+    Returns:
+        dict[str, float]: Dict of results.
+    """
+    assert len(valid_class_ids) == len(class_labels)
+    id_to_label = {
+        valid_class_ids[i]: class_labels[i]
+        for i in range(len(valid_class_ids))
+    }
+    preds = aggregate_predictions(
+        masks=pred_instance_masks,
+        labels=pred_instance_labels,
+        scores=pred_instance_scores,
+        valid_class_ids=valid_class_ids)
+    gts = rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids)
+    metrics = scannet_eval(
+        preds=preds,
+        gts=gts,
+        options=options,
+        valid_class_ids=valid_class_ids,
+        class_labels=class_labels,
+        id_to_label=id_to_label)
+    header = ['classes', 'AP_0.25', 'AP_0.50', 'AP']
+    rows = []
+    for label, data in metrics['classes'].items():
+        aps = [data['ap25%'], data['ap50%'], data['ap']]
+        rows.append([label] + [f'{ap:.4f}' for ap in aps])
+    aps = metrics['all_ap_25%'], metrics['all_ap_50%'], metrics['all_ap']
+    footer = ['Overall'] + [f'{ap:.4f}' for ap in aps]
+    table = AsciiTable([header] + rows + [footer])
+    table.inner_footing_row_border = True
+    print_log('\n' + table.table, logger=logger)
+    return metrics
diff --git a/mmde/mmdet3d/evaluation/functional/kitti_utils/__init__.py b/mmde/mmdet3d/evaluation/functional/kitti_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf46c1b0362702d620fd4bba6517344fe77082e9
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/kitti_utils/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .eval import do_eval, eval_class, kitti_eval, kitti_eval_coco_style
+
+__all__ = ['kitti_eval', 'kitti_eval_coco_style', 'do_eval', 'eval_class']
diff --git a/mmde/mmdet3d/evaluation/functional/kitti_utils/eval.py b/mmde/mmdet3d/evaluation/functional/kitti_utils/eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e559529f8935f43d344c5a60c24bfd2ebecc07a
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/kitti_utils/eval.py
@@ -0,0 +1,950 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import gc
+import io as sysio
+
+import numba
+import numpy as np
+
+
+@numba.jit
+def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41):
+    scores.sort()
+    scores = scores[::-1]
+    current_recall = 0
+    thresholds = []
+    for i, score in enumerate(scores):
+        l_recall = (i + 1) / num_gt
+        if i < (len(scores) - 1):
+            r_recall = (i + 2) / num_gt
+        else:
+            r_recall = l_recall
+        if (((r_recall - current_recall) < (current_recall - l_recall))
+                and (i < (len(scores) - 1))):
+            continue
+        # recall = l_recall
+        thresholds.append(score)
+        current_recall += 1 / (num_sample_pts - 1.0)
+    return thresholds
+
+
+def clean_data(gt_anno, dt_anno, current_class, difficulty):
+    CLASS_NAMES = ['car', 'pedestrian', 'cyclist']
+    MIN_HEIGHT = [40, 25, 25]
+    MAX_OCCLUSION = [0, 1, 2]
+    MAX_TRUNCATION = [0.15, 0.3, 0.5]
+    dc_bboxes, ignored_gt, ignored_dt = [], [], []
+    current_cls_name = CLASS_NAMES[current_class].lower()
+    num_gt = len(gt_anno['name'])
+    num_dt = len(dt_anno['name'])
+    num_valid_gt = 0
+    for i in range(num_gt):
+        bbox = gt_anno['bbox'][i]
+        gt_name = gt_anno['name'][i].lower()
+        height = bbox[3] - bbox[1]
+        valid_class = -1
+        if (gt_name == current_cls_name):
+            valid_class = 1
+        elif (current_cls_name == 'Pedestrian'.lower()
+              and 'Person_sitting'.lower() == gt_name):
+            valid_class = 0
+        elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name):
+            valid_class = 0
+        else:
+            valid_class = -1
+        ignore = False
+        if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty])
+                or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty])
+                or (height <= MIN_HEIGHT[difficulty])):
+            ignore = True
+        if valid_class == 1 and not ignore:
+            ignored_gt.append(0)
+            num_valid_gt += 1
+        elif (valid_class == 0 or (ignore and (valid_class == 1))):
+            ignored_gt.append(1)
+        else:
+            ignored_gt.append(-1)
+    # for i in range(num_gt):
+        if gt_anno['name'][i] == 'DontCare':
+            dc_bboxes.append(gt_anno['bbox'][i])
+    for i in range(num_dt):
+        if (dt_anno['name'][i].lower() == current_cls_name):
+            valid_class = 1
+        else:
+            valid_class = -1
+        height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1])
+        if height < MIN_HEIGHT[difficulty]:
+            ignored_dt.append(1)
+        elif valid_class == 1:
+            ignored_dt.append(0)
+        else:
+            ignored_dt.append(-1)
+
+    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
+
+
+@numba.jit(nopython=True)
+def image_box_overlap(boxes, query_boxes, criterion=-1):
+    N = boxes.shape[0]
+    K = query_boxes.shape[0]
+    overlaps = np.zeros((N, K), dtype=boxes.dtype)
+    for k in range(K):
+        qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) *
+                     (query_boxes[k, 3] - query_boxes[k, 1]))
+        for n in range(N):
+            iw = (
+                min(boxes[n, 2], query_boxes[k, 2]) -
+                max(boxes[n, 0], query_boxes[k, 0]))
+            if iw > 0:
+                ih = (
+                    min(boxes[n, 3], query_boxes[k, 3]) -
+                    max(boxes[n, 1], query_boxes[k, 1]))
+                if ih > 0:
+                    if criterion == -1:
+                        ua = ((boxes[n, 2] - boxes[n, 0]) *
+                              (boxes[n, 3] - boxes[n, 1]) + qbox_area -
+                              iw * ih)
+                    elif criterion == 0:
+                        ua = ((boxes[n, 2] - boxes[n, 0]) *
+                              (boxes[n, 3] - boxes[n, 1]))
+                    elif criterion == 1:
+                        ua = qbox_area
+                    else:
+                        ua = 1.0
+                    overlaps[n, k] = iw * ih / ua
+    return overlaps
+
+
+def bev_box_overlap(boxes, qboxes, criterion=-1):
+    from .rotate_iou import rotate_iou_gpu_eval
+    riou = rotate_iou_gpu_eval(boxes, qboxes, criterion)
+    return riou
+
+
+@numba.jit(nopython=True, parallel=True)
+def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1):
+    # ONLY support overlap in CAMERA, not lidar.
+    # TODO: change to use prange for parallel mode, should check the difference
+    N, K = boxes.shape[0], qboxes.shape[0]
+    for i in numba.prange(N):
+        for j in numba.prange(K):
+            if rinc[i, j] > 0:
+                # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] +
+                #         qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1]))
+                iw = (
+                    min(boxes[i, 1], qboxes[j, 1]) -
+                    max(boxes[i, 1] - boxes[i, 4],
+                        qboxes[j, 1] - qboxes[j, 4]))
+
+                if iw > 0:
+                    area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5]
+                    area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5]
+                    inc = iw * rinc[i, j]
+                    if criterion == -1:
+                        ua = (area1 + area2 - inc)
+                    elif criterion == 0:
+                        ua = area1
+                    elif criterion == 1:
+                        ua = area2
+                    else:
+                        ua = inc
+                    rinc[i, j] = inc / ua
+                else:
+                    rinc[i, j] = 0.0
+
+
+def d3_box_overlap(boxes, qboxes, criterion=-1):
+    from .rotate_iou import rotate_iou_gpu_eval
+    rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]],
+                               qboxes[:, [0, 2, 3, 5, 6]], 2)
+    d3_box_overlap_kernel(boxes, qboxes, rinc, criterion)
+    return rinc
+
+
+@numba.jit(nopython=True)
+def compute_statistics_jit(overlaps,
+                           gt_datas,
+                           dt_datas,
+                           ignored_gt,
+                           ignored_det,
+                           dc_bboxes,
+                           metric,
+                           min_overlap,
+                           thresh=0,
+                           compute_fp=False,
+                           compute_aos=False):
+
+    det_size = dt_datas.shape[0]
+    gt_size = gt_datas.shape[0]
+    dt_scores = dt_datas[:, -1]
+    dt_alphas = dt_datas[:, 4]
+    gt_alphas = gt_datas[:, 4]
+    dt_bboxes = dt_datas[:, :4]
+    # gt_bboxes = gt_datas[:, :4]
+
+    assigned_detection = [False] * det_size
+    ignored_threshold = [False] * det_size
+    if compute_fp:
+        for i in range(det_size):
+            if (dt_scores[i] < thresh):
+                ignored_threshold[i] = True
+    NO_DETECTION = -10000000
+    tp, fp, fn, similarity = 0, 0, 0, 0
+    # thresholds = [0.0]
+    # delta = [0.0]
+    thresholds = np.zeros((gt_size, ))
+    thresh_idx = 0
+    delta = np.zeros((gt_size, ))
+    delta_idx = 0
+    for i in range(gt_size):
+        if ignored_gt[i] == -1:
+            continue
+        det_idx = -1
+        valid_detection = NO_DETECTION
+        max_overlap = 0
+        assigned_ignored_det = False
+
+        for j in range(det_size):
+            if (ignored_det[j] == -1):
+                continue
+            if (assigned_detection[j]):
+                continue
+            if (ignored_threshold[j]):
+                continue
+            overlap = overlaps[j, i]
+            dt_score = dt_scores[j]
+            if (not compute_fp and (overlap > min_overlap)
+                    and dt_score > valid_detection):
+                det_idx = j
+                valid_detection = dt_score
+            elif (compute_fp and (overlap > min_overlap)
+                  and (overlap > max_overlap or assigned_ignored_det)
+                  and ignored_det[j] == 0):
+                max_overlap = overlap
+                det_idx = j
+                valid_detection = 1
+                assigned_ignored_det = False
+            elif (compute_fp and (overlap > min_overlap)
+                  and (valid_detection == NO_DETECTION)
+                  and ignored_det[j] == 1):
+                det_idx = j
+                valid_detection = 1
+                assigned_ignored_det = True
+
+        if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0:
+            fn += 1
+        elif ((valid_detection != NO_DETECTION)
+              and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)):
+            assigned_detection[det_idx] = True
+        elif valid_detection != NO_DETECTION:
+            tp += 1
+            # thresholds.append(dt_scores[det_idx])
+            thresholds[thresh_idx] = dt_scores[det_idx]
+            thresh_idx += 1
+            if compute_aos:
+                # delta.append(gt_alphas[i] - dt_alphas[det_idx])
+                delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx]
+                delta_idx += 1
+
+            assigned_detection[det_idx] = True
+    if compute_fp:
+        for i in range(det_size):
+            if (not (assigned_detection[i] or ignored_det[i] == -1
+                     or ignored_det[i] == 1 or ignored_threshold[i])):
+                fp += 1
+        nstuff = 0
+        if metric == 0:
+            overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0)
+            for i in range(dc_bboxes.shape[0]):
+                for j in range(det_size):
+                    if (assigned_detection[j]):
+                        continue
+                    if (ignored_det[j] == -1 or ignored_det[j] == 1):
+                        continue
+                    if (ignored_threshold[j]):
+                        continue
+                    if overlaps_dt_dc[j, i] > min_overlap:
+                        assigned_detection[j] = True
+                        nstuff += 1
+        fp -= nstuff
+        if compute_aos:
+            tmp = np.zeros((fp + delta_idx, ))
+            # tmp = [0] * fp
+            for i in range(delta_idx):
+                tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0
+                # tmp.append((1.0 + np.cos(delta[i])) / 2.0)
+            # assert len(tmp) == fp + tp
+            # assert len(delta) == tp
+            if tp > 0 or fp > 0:
+                similarity = np.sum(tmp)
+            else:
+                similarity = -1
+    return tp, fp, fn, similarity, thresholds[:thresh_idx]
+
+
+def get_split_parts(num, num_part):
+    if num % num_part == 0:
+        same_part = num // num_part
+        return [same_part] * num_part
+    else:
+        same_part = num // (num_part - 1)
+        remain_num = num % (num_part - 1)
+        return [same_part] * (num_part - 1) + [remain_num]
+
+
+@numba.jit(nopython=True)
+def fused_compute_statistics(overlaps,
+                             pr,
+                             gt_nums,
+                             dt_nums,
+                             dc_nums,
+                             gt_datas,
+                             dt_datas,
+                             dontcares,
+                             ignored_gts,
+                             ignored_dets,
+                             metric,
+                             min_overlap,
+                             thresholds,
+                             compute_aos=False):
+    gt_num = 0
+    dt_num = 0
+    dc_num = 0
+    for i in range(gt_nums.shape[0]):
+        for t, thresh in enumerate(thresholds):
+            overlap = overlaps[dt_num:dt_num + dt_nums[i],
+                               gt_num:gt_num + gt_nums[i]]
+
+            gt_data = gt_datas[gt_num:gt_num + gt_nums[i]]
+            dt_data = dt_datas[dt_num:dt_num + dt_nums[i]]
+            ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]]
+            ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]]
+            dontcare = dontcares[dc_num:dc_num + dc_nums[i]]
+            tp, fp, fn, similarity, _ = compute_statistics_jit(
+                overlap,
+                gt_data,
+                dt_data,
+                ignored_gt,
+                ignored_det,
+                dontcare,
+                metric,
+                min_overlap=min_overlap,
+                thresh=thresh,
+                compute_fp=True,
+                compute_aos=compute_aos)
+            pr[t, 0] += tp
+            pr[t, 1] += fp
+            pr[t, 2] += fn
+            if similarity != -1:
+                pr[t, 3] += similarity
+        gt_num += gt_nums[i]
+        dt_num += dt_nums[i]
+        dc_num += dc_nums[i]
+
+
+def calculate_iou_partly(dt_annos, gt_annos, metric, num_parts=50):
+    """Fast iou algorithm. this function can be used independently to do result
+    analysis. Must be used in CAMERA coordinate system.
+
+    Args:
+        dt_annos (dict): Must from get_label_annos() in kitti_common.py.
+        gt_annos (dict): Must from get_label_annos() in kitti_common.py.
+        metric (int): Eval type. 0: bbox, 1: bev, 2: 3d.
+        num_parts (int): A parameter for fast calculate algorithm.
+    """
+    assert len(dt_annos) == len(gt_annos)
+    total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0)
+    total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0)
+    num_examples = len(dt_annos)
+    split_parts = get_split_parts(num_examples, num_parts)
+    parted_overlaps = []
+    example_idx = 0
+
+    for num_part in split_parts:
+        dt_annos_part = dt_annos[example_idx:example_idx + num_part]
+        gt_annos_part = gt_annos[example_idx:example_idx + num_part]
+        if metric == 0:
+            dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0)
+            gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0)
+            overlap_part = image_box_overlap(dt_boxes, gt_boxes)
+        elif metric == 1:
+            loc = np.concatenate(
+                [a['location'][:, [0, 2]] for a in dt_annos_part], 0)
+            dims = np.concatenate(
+                [a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0)
+            rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0)
+            dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
+                                      axis=1)
+            loc = np.concatenate(
+                [a['location'][:, [0, 2]] for a in gt_annos_part], 0)
+            dims = np.concatenate(
+                [a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0)
+            rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0)
+            gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
+                                      axis=1)
+            overlap_part = bev_box_overlap(dt_boxes,
+                                           gt_boxes).astype(np.float64)
+        elif metric == 2:
+            loc = np.concatenate([a['location'] for a in dt_annos_part], 0)
+            dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0)
+            rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0)
+            dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
+                                      axis=1)
+            loc = np.concatenate([a['location'] for a in gt_annos_part], 0)
+            dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0)
+            rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0)
+            gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
+                                      axis=1)
+            overlap_part = d3_box_overlap(dt_boxes,
+                                          gt_boxes).astype(np.float64)
+        else:
+            raise ValueError('unknown metric')
+        parted_overlaps.append(overlap_part)
+        example_idx += num_part
+    overlaps = []
+    example_idx = 0
+    for j, num_part in enumerate(split_parts):
+        gt_num_idx, dt_num_idx = 0, 0
+        for i in range(num_part):
+            gt_box_num = total_gt_num[example_idx + i]
+            dt_box_num = total_dt_num[example_idx + i]
+            overlaps.append(
+                parted_overlaps[j][dt_num_idx:dt_num_idx + dt_box_num,
+                                   gt_num_idx:gt_num_idx + gt_box_num])
+            gt_num_idx += gt_box_num
+            dt_num_idx += dt_box_num
+        example_idx += num_part
+
+    return overlaps, parted_overlaps, total_dt_num, total_gt_num
+
+
+def _prepare_data(gt_annos, dt_annos, current_class, difficulty):
+    gt_datas_list = []
+    dt_datas_list = []
+    total_dc_num = []
+    ignored_gts, ignored_dets, dontcares = [], [], []
+    total_num_valid_gt = 0
+    for i in range(len(gt_annos)):
+        rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty)
+        num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets
+        ignored_gts.append(np.array(ignored_gt, dtype=np.int64))
+        ignored_dets.append(np.array(ignored_det, dtype=np.int64))
+        if len(dc_bboxes) == 0:
+            dc_bboxes = np.zeros((0, 4)).astype(np.float64)
+        else:
+            dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64)
+        total_dc_num.append(dc_bboxes.shape[0])
+        dontcares.append(dc_bboxes)
+        total_num_valid_gt += num_valid_gt
+        gt_datas = np.concatenate(
+            [gt_annos[i]['bbox'], gt_annos[i]['alpha'][..., np.newaxis]], 1)
+        dt_datas = np.concatenate([
+            dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis],
+            dt_annos[i]['score'][..., np.newaxis]
+        ], 1)
+        gt_datas_list.append(gt_datas)
+        dt_datas_list.append(dt_datas)
+    total_dc_num = np.stack(total_dc_num, axis=0)
+    return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares,
+            total_dc_num, total_num_valid_gt)
+
+
+def eval_class(gt_annos,
+               dt_annos,
+               current_classes,
+               difficultys,
+               metric,
+               min_overlaps,
+               compute_aos=False,
+               num_parts=200):
+    """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP.
+
+    Args:
+        gt_annos (dict): Must from get_label_annos() in kitti_common.py.
+        dt_annos (dict): Must from get_label_annos() in kitti_common.py.
+        current_classes (list[int]): 0: car, 1: pedestrian, 2: cyclist.
+        difficultys (list[int]): Eval difficulty, 0: easy, 1: normal, 2: hard
+        metric (int): Eval type. 0: bbox, 1: bev, 2: 3d
+        min_overlaps (float): Min overlap. format:
+            [num_overlap, metric, class].
+        num_parts (int): A parameter for fast calculate algorithm
+
+    Returns:
+        dict[str, np.ndarray]: recall, precision and aos
+    """
+    assert len(gt_annos) == len(dt_annos)
+    num_examples = len(gt_annos)
+    if num_examples < num_parts:
+        num_parts = num_examples
+    split_parts = get_split_parts(num_examples, num_parts)
+
+    rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts)
+    overlaps, parted_overlaps, total_dt_num, total_gt_num = rets
+
+    N_SAMPLE_PTS = 41
+    num_minoverlap = len(min_overlaps)
+    num_class = len(current_classes)
+    num_difficulty = len(difficultys)
+    precision = np.zeros(
+        [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
+    recall = np.zeros(
+        [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
+    aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
+    for m, current_class in enumerate(current_classes):
+        for idx_l, difficulty in enumerate(difficultys):
+            rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty)
+            (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets,
+             dontcares, total_dc_num, total_num_valid_gt) = rets
+            for k, min_overlap in enumerate(min_overlaps[:, metric, m]):
+                thresholdss = []
+                for i in range(len(gt_annos)):
+                    rets = compute_statistics_jit(
+                        overlaps[i],
+                        gt_datas_list[i],
+                        dt_datas_list[i],
+                        ignored_gts[i],
+                        ignored_dets[i],
+                        dontcares[i],
+                        metric,
+                        min_overlap=min_overlap,
+                        thresh=0.0,
+                        compute_fp=False)
+                    tp, fp, fn, similarity, thresholds = rets
+                    thresholdss += thresholds.tolist()
+                thresholdss = np.array(thresholdss)
+                thresholds = get_thresholds(thresholdss, total_num_valid_gt)
+                thresholds = np.array(thresholds)
+                pr = np.zeros([len(thresholds), 4])
+                idx = 0
+                for j, num_part in enumerate(split_parts):
+                    gt_datas_part = np.concatenate(
+                        gt_datas_list[idx:idx + num_part], 0)
+                    dt_datas_part = np.concatenate(
+                        dt_datas_list[idx:idx + num_part], 0)
+                    dc_datas_part = np.concatenate(
+                        dontcares[idx:idx + num_part], 0)
+                    ignored_dets_part = np.concatenate(
+                        ignored_dets[idx:idx + num_part], 0)
+                    ignored_gts_part = np.concatenate(
+                        ignored_gts[idx:idx + num_part], 0)
+                    fused_compute_statistics(
+                        parted_overlaps[j],
+                        pr,
+                        total_gt_num[idx:idx + num_part],
+                        total_dt_num[idx:idx + num_part],
+                        total_dc_num[idx:idx + num_part],
+                        gt_datas_part,
+                        dt_datas_part,
+                        dc_datas_part,
+                        ignored_gts_part,
+                        ignored_dets_part,
+                        metric,
+                        min_overlap=min_overlap,
+                        thresholds=thresholds,
+                        compute_aos=compute_aos)
+                    idx += num_part
+                for i in range(len(thresholds)):
+                    recall[m, idx_l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2])
+                    precision[m, idx_l, k, i] = pr[i, 0] / (
+                        pr[i, 0] + pr[i, 1])
+                    if compute_aos:
+                        aos[m, idx_l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1])
+                for i in range(len(thresholds)):
+                    precision[m, idx_l, k, i] = np.max(
+                        precision[m, idx_l, k, i:], axis=-1)
+                    recall[m, idx_l, k, i] = np.max(
+                        recall[m, idx_l, k, i:], axis=-1)
+                    if compute_aos:
+                        aos[m, idx_l, k, i] = np.max(
+                            aos[m, idx_l, k, i:], axis=-1)
+    ret_dict = {
+        'recall': recall,
+        'precision': precision,
+        'orientation': aos,
+    }
+
+    # clean temp variables
+    del overlaps
+    del parted_overlaps
+
+    gc.collect()
+    return ret_dict
+
+
+def get_mAP11(prec):
+    sums = 0
+    for i in range(0, prec.shape[-1], 4):
+        sums = sums + prec[..., i]
+    return sums / 11 * 100
+
+
+def get_mAP40(prec):
+    sums = 0
+    for i in range(1, prec.shape[-1]):
+        sums = sums + prec[..., i]
+    return sums / 40 * 100
+
+
+def print_str(value, *arg, sstream=None):
+    if sstream is None:
+        sstream = sysio.StringIO()
+    sstream.truncate(0)
+    sstream.seek(0)
+    print(value, *arg, file=sstream)
+    return sstream.getvalue()
+
+
+def do_eval(gt_annos,
+            dt_annos,
+            current_classes,
+            min_overlaps,
+            eval_types=['bbox', 'bev', '3d']):
+    # min_overlaps: [num_minoverlap, metric, num_class]
+    difficultys = [0, 1, 2]
+    mAP11_bbox = None
+    mAP11_aos = None
+    mAP40_bbox = None
+    mAP40_aos = None
+    if 'bbox' in eval_types:
+        ret = eval_class(
+            gt_annos,
+            dt_annos,
+            current_classes,
+            difficultys,
+            0,
+            min_overlaps,
+            compute_aos=('aos' in eval_types))
+        # ret: [num_class, num_diff, num_minoverlap, num_sample_points]
+        mAP11_bbox = get_mAP11(ret['precision'])
+        mAP40_bbox = get_mAP40(ret['precision'])
+        if 'aos' in eval_types:
+            mAP11_aos = get_mAP11(ret['orientation'])
+            mAP40_aos = get_mAP40(ret['orientation'])
+
+    mAP11_bev = None
+    mAP40_bev = None
+    if 'bev' in eval_types:
+        ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1,
+                         min_overlaps)
+        mAP11_bev = get_mAP11(ret['precision'])
+        mAP40_bev = get_mAP40(ret['precision'])
+
+    mAP11_3d = None
+    mAP40_3d = None
+    if '3d' in eval_types:
+        ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2,
+                         min_overlaps)
+        mAP11_3d = get_mAP11(ret['precision'])
+        mAP40_3d = get_mAP40(ret['precision'])
+    return (mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev,
+            mAP40_3d, mAP40_aos)
+
+
+def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges,
+                       compute_aos):
+    # overlap_ranges: [range, metric, num_class]
+    min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]])
+    for i in range(overlap_ranges.shape[1]):
+        for j in range(overlap_ranges.shape[2]):
+            min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j])
+    mAP_bbox, mAP_bev, mAP_3d, mAP_aos, _, _, \
+        _, _ = do_eval(gt_annos, dt_annos,
+                       current_classes, min_overlaps,
+                       compute_aos)
+    # ret: [num_class, num_diff, num_minoverlap]
+    mAP_bbox = mAP_bbox.mean(-1)
+    mAP_bev = mAP_bev.mean(-1)
+    mAP_3d = mAP_3d.mean(-1)
+    if mAP_aos is not None:
+        mAP_aos = mAP_aos.mean(-1)
+    return mAP_bbox, mAP_bev, mAP_3d, mAP_aos
+
+
+def kitti_eval(gt_annos,
+               dt_annos,
+               current_classes,
+               eval_types=['bbox', 'bev', '3d']):
+    """KITTI evaluation.
+
+    Args:
+        gt_annos (list[dict]): Contain gt information of each sample.
+        dt_annos (list[dict]): Contain detected information of each sample.
+        current_classes (list[str]): Classes to evaluation.
+        eval_types (list[str], optional): Types to eval.
+            Defaults to ['bbox', 'bev', '3d'].
+
+    Returns:
+        tuple: String and dict of evaluation results.
+    """
+    assert len(eval_types) > 0, 'must contain at least one evaluation type'
+    if 'aos' in eval_types:
+        assert 'bbox' in eval_types, 'must evaluate bbox when evaluating aos'
+    overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7,
+                             0.5], [0.7, 0.5, 0.5, 0.7, 0.5],
+                            [0.7, 0.5, 0.5, 0.7, 0.5]])
+    overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5],
+                            [0.5, 0.25, 0.25, 0.5, 0.25],
+                            [0.5, 0.25, 0.25, 0.5, 0.25]])
+    min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0)  # [2, 3, 5]
+    class_to_name = {
+        0: 'Car',
+        1: 'Pedestrian',
+        2: 'Cyclist',
+        3: 'Van',
+        4: 'Person_sitting',
+    }
+    name_to_class = {v: n for n, v in class_to_name.items()}
+    if not isinstance(current_classes, (list, tuple)):
+        current_classes = [current_classes]
+    current_classes_int = []
+    for curcls in current_classes:
+        if isinstance(curcls, str):
+            current_classes_int.append(name_to_class[curcls])
+        else:
+            current_classes_int.append(curcls)
+    current_classes = current_classes_int
+    min_overlaps = min_overlaps[:, :, current_classes]
+    result = ''
+    # check whether alpha is valid
+    compute_aos = False
+    pred_alpha = False
+    valid_alpha_gt = False
+    for anno in dt_annos:
+        mask = (anno['alpha'] != -10)
+        if anno['alpha'][mask].shape[0] != 0:
+            pred_alpha = True
+            break
+    for anno in gt_annos:
+        if anno['alpha'][0] != -10:
+            valid_alpha_gt = True
+            break
+    compute_aos = (pred_alpha and valid_alpha_gt)
+    if compute_aos:
+        eval_types.append('aos')
+
+    mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev, \
+        mAP40_3d, mAP40_aos = do_eval(gt_annos, dt_annos,
+                                      current_classes, min_overlaps,
+                                      eval_types)
+
+    ret_dict = {}
+    difficulty = ['easy', 'moderate', 'hard']
+
+    # calculate AP11
+    result += '\n----------- AP11 Results ------------\n\n'
+    for j, curcls in enumerate(current_classes):
+        # mAP threshold array: [num_minoverlap, metric, class]
+        # mAP result: [num_class, num_diff, num_minoverlap]
+        curcls_name = class_to_name[curcls]
+        for i in range(min_overlaps.shape[0]):
+            # prepare results for print
+            result += ('{} AP11@{:.2f}, {:.2f}, {:.2f}:\n'.format(
+                curcls_name, *min_overlaps[i, :, j]))
+            if mAP11_bbox is not None:
+                result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                    *mAP11_bbox[j, :, i])
+            if mAP11_bev is not None:
+                result += 'bev  AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                    *mAP11_bev[j, :, i])
+            if mAP11_3d is not None:
+                result += '3d   AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                    *mAP11_3d[j, :, i])
+            if compute_aos:
+                result += 'aos  AP11:{:.2f}, {:.2f}, {:.2f}\n'.format(
+                    *mAP11_aos[j, :, i])
+
+            # prepare results for logger
+            for idx in range(3):
+                if i == 0:
+                    postfix = f'{difficulty[idx]}_strict'
+                else:
+                    postfix = f'{difficulty[idx]}_loose'
+                prefix = f'KITTI/{curcls_name}'
+                if mAP11_3d is not None:
+                    ret_dict[f'{prefix}_3D_AP11_{postfix}'] =\
+                        mAP11_3d[j, idx, i]
+                if mAP11_bev is not None:
+                    ret_dict[f'{prefix}_BEV_AP11_{postfix}'] =\
+                        mAP11_bev[j, idx, i]
+                if mAP11_bbox is not None:
+                    ret_dict[f'{prefix}_2D_AP11_{postfix}'] =\
+                        mAP11_bbox[j, idx, i]
+
+    # calculate mAP11 over all classes if there are multiple classes
+    if len(current_classes) > 1:
+        # prepare results for print
+        result += ('\nOverall AP11@{}, {}, {}:\n'.format(*difficulty))
+        if mAP11_bbox is not None:
+            mAP11_bbox = mAP11_bbox.mean(axis=0)
+            result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                *mAP11_bbox[:, 0])
+        if mAP11_bev is not None:
+            mAP11_bev = mAP11_bev.mean(axis=0)
+            result += 'bev  AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                *mAP11_bev[:, 0])
+        if mAP11_3d is not None:
+            mAP11_3d = mAP11_3d.mean(axis=0)
+            result += '3d   AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP11_3d[:,
+                                                                            0])
+        if compute_aos:
+            mAP11_aos = mAP11_aos.mean(axis=0)
+            result += 'aos  AP11:{:.2f}, {:.2f}, {:.2f}\n'.format(
+                *mAP11_aos[:, 0])
+
+        # prepare results for logger
+        for idx in range(3):
+            postfix = f'{difficulty[idx]}'
+            if mAP11_3d is not None:
+                ret_dict[f'KITTI/Overall_3D_AP11_{postfix}'] = mAP11_3d[idx, 0]
+            if mAP11_bev is not None:
+                ret_dict[f'KITTI/Overall_BEV_AP11_{postfix}'] =\
+                    mAP11_bev[idx, 0]
+            if mAP11_bbox is not None:
+                ret_dict[f'KITTI/Overall_2D_AP11_{postfix}'] =\
+                    mAP11_bbox[idx, 0]
+
+    # Calculate AP40
+    result += '\n----------- AP40 Results ------------\n\n'
+    for j, curcls in enumerate(current_classes):
+        # mAP threshold array: [num_minoverlap, metric, class]
+        # mAP result: [num_class, num_diff, num_minoverlap]
+        curcls_name = class_to_name[curcls]
+        for i in range(min_overlaps.shape[0]):
+            # prepare results for print
+            result += ('{} AP40@{:.2f}, {:.2f}, {:.2f}:\n'.format(
+                curcls_name, *min_overlaps[i, :, j]))
+            if mAP40_bbox is not None:
+                result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                    *mAP40_bbox[j, :, i])
+            if mAP40_bev is not None:
+                result += 'bev  AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                    *mAP40_bev[j, :, i])
+            if mAP40_3d is not None:
+                result += '3d   AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                    *mAP40_3d[j, :, i])
+            if compute_aos:
+                result += 'aos  AP40:{:.2f}, {:.2f}, {:.2f}\n'.format(
+                    *mAP40_aos[j, :, i])
+
+            # prepare results for logger
+            for idx in range(3):
+                if i == 0:
+                    postfix = f'{difficulty[idx]}_strict'
+                else:
+                    postfix = f'{difficulty[idx]}_loose'
+                prefix = f'KITTI/{curcls_name}'
+                if mAP40_3d is not None:
+                    ret_dict[f'{prefix}_3D_AP40_{postfix}'] =\
+                        mAP40_3d[j, idx, i]
+                if mAP40_bev is not None:
+                    ret_dict[f'{prefix}_BEV_AP40_{postfix}'] =\
+                        mAP40_bev[j, idx, i]
+                if mAP40_bbox is not None:
+                    ret_dict[f'{prefix}_2D_AP40_{postfix}'] =\
+                        mAP40_bbox[j, idx, i]
+
+    # calculate mAP40 over all classes if there are multiple classes
+    if len(current_classes) > 1:
+        # prepare results for print
+        result += ('\nOverall AP40@{}, {}, {}:\n'.format(*difficulty))
+        if mAP40_bbox is not None:
+            mAP40_bbox = mAP40_bbox.mean(axis=0)
+            result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                *mAP40_bbox[:, 0])
+        if mAP40_bev is not None:
+            mAP40_bev = mAP40_bev.mean(axis=0)
+            result += 'bev  AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(
+                *mAP40_bev[:, 0])
+        if mAP40_3d is not None:
+            mAP40_3d = mAP40_3d.mean(axis=0)
+            result += '3d   AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP40_3d[:,
+                                                                            0])
+        if compute_aos:
+            mAP40_aos = mAP40_aos.mean(axis=0)
+            result += 'aos  AP40:{:.2f}, {:.2f}, {:.2f}\n'.format(
+                *mAP40_aos[:, 0])
+
+        # prepare results for logger
+        for idx in range(3):
+            postfix = f'{difficulty[idx]}'
+            if mAP40_3d is not None:
+                ret_dict[f'KITTI/Overall_3D_AP40_{postfix}'] = mAP40_3d[idx, 0]
+            if mAP40_bev is not None:
+                ret_dict[f'KITTI/Overall_BEV_AP40_{postfix}'] =\
+                    mAP40_bev[idx, 0]
+            if mAP40_bbox is not None:
+                ret_dict[f'KITTI/Overall_2D_AP40_{postfix}'] =\
+                    mAP40_bbox[idx, 0]
+
+    return result, ret_dict
+
+
+def kitti_eval_coco_style(gt_annos, dt_annos, current_classes):
+    """coco style evaluation of kitti.
+
+    Args:
+        gt_annos (list[dict]): Contain gt information of each sample.
+        dt_annos (list[dict]): Contain detected information of each sample.
+        current_classes (list[str]): Classes to evaluation.
+
+    Returns:
+        string: Evaluation results.
+    """
+    class_to_name = {
+        0: 'Car',
+        1: 'Pedestrian',
+        2: 'Cyclist',
+        3: 'Van',
+        4: 'Person_sitting',
+    }
+    class_to_range = {
+        0: [0.5, 0.95, 10],
+        1: [0.25, 0.7, 10],
+        2: [0.25, 0.7, 10],
+        3: [0.5, 0.95, 10],
+        4: [0.25, 0.7, 10],
+    }
+    name_to_class = {v: n for n, v in class_to_name.items()}
+    if not isinstance(current_classes, (list, tuple)):
+        current_classes = [current_classes]
+    current_classes_int = []
+    for curcls in current_classes:
+        if isinstance(curcls, str):
+            current_classes_int.append(name_to_class[curcls])
+        else:
+            current_classes_int.append(curcls)
+    current_classes = current_classes_int
+    overlap_ranges = np.zeros([3, 3, len(current_classes)])
+    for i, curcls in enumerate(current_classes):
+        overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:,
+                                                                   np.newaxis]
+    result = ''
+    # check whether alpha is valid
+    compute_aos = False
+    for anno in dt_annos:
+        if anno['alpha'].shape[0] != 0:
+            if anno['alpha'][0] != -10:
+                compute_aos = True
+            break
+    mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval(
+        gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos)
+    for j, curcls in enumerate(current_classes):
+        # mAP threshold array: [num_minoverlap, metric, class]
+        # mAP result: [num_class, num_diff, num_minoverlap]
+        o_range = np.array(class_to_range[curcls])[[0, 2, 1]]
+        o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1)
+        result += print_str((f'{class_to_name[curcls]} '
+                             'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range)))
+        result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, '
+                             f'{mAPbbox[j, 1]:.2f}, '
+                             f'{mAPbbox[j, 2]:.2f}'))
+        result += print_str((f'bev  AP:{mAPbev[j, 0]:.2f}, '
+                             f'{mAPbev[j, 1]:.2f}, '
+                             f'{mAPbev[j, 2]:.2f}'))
+        result += print_str((f'3d   AP:{mAP3d[j, 0]:.2f}, '
+                             f'{mAP3d[j, 1]:.2f}, '
+                             f'{mAP3d[j, 2]:.2f}'))
+        if compute_aos:
+            result += print_str((f'aos  AP:{mAPaos[j, 0]:.2f}, '
+                                 f'{mAPaos[j, 1]:.2f}, '
+                                 f'{mAPaos[j, 2]:.2f}'))
+    return result
diff --git a/mmde/mmdet3d/evaluation/functional/kitti_utils/rotate_iou.py b/mmde/mmdet3d/evaluation/functional/kitti_utils/rotate_iou.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ed75bf08d5868652a47e0fd63828c27b36f5188
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/kitti_utils/rotate_iou.py
@@ -0,0 +1,379 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+#####################
+# Based on https://github.com/hongzhenwang/RRPN-revise
+# Licensed under The MIT License
+# Author: yanyan, scrin@foxmail.com
+#####################
+import math
+
+import numba
+import numpy as np
+from numba import cuda
+
+
+@numba.jit(nopython=True)
+def div_up(m, n):
+    return m // n + (m % n > 0)
+
+
+@cuda.jit(device=True, inline=True)
+def trangle_area(a, b, c):
+    return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) *
+            (b[0] - c[0])) / 2.0
+
+
+@cuda.jit(device=True, inline=True)
+def area(int_pts, num_of_inter):
+    area_val = 0.0
+    for i in range(num_of_inter - 2):
+        area_val += abs(
+            trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4],
+                         int_pts[2 * i + 4:2 * i + 6]))
+    return area_val
+
+
+@cuda.jit(device=True, inline=True)
+def sort_vertex_in_convex_polygon(int_pts, num_of_inter):
+    if num_of_inter > 0:
+        center = cuda.local.array((2, ), dtype=numba.float32)
+        center[:] = 0.0
+        for i in range(num_of_inter):
+            center[0] += int_pts[2 * i]
+            center[1] += int_pts[2 * i + 1]
+        center[0] /= num_of_inter
+        center[1] /= num_of_inter
+        v = cuda.local.array((2, ), dtype=numba.float32)
+        vs = cuda.local.array((16, ), dtype=numba.float32)
+        for i in range(num_of_inter):
+            v[0] = int_pts[2 * i] - center[0]
+            v[1] = int_pts[2 * i + 1] - center[1]
+            d = math.sqrt(v[0] * v[0] + v[1] * v[1])
+            v[0] = v[0] / d
+            v[1] = v[1] / d
+            if v[1] < 0:
+                v[0] = -2 - v[0]
+            vs[i] = v[0]
+        j = 0
+        temp = 0
+        for i in range(1, num_of_inter):
+            if vs[i - 1] > vs[i]:
+                temp = vs[i]
+                tx = int_pts[2 * i]
+                ty = int_pts[2 * i + 1]
+                j = i
+                while j > 0 and vs[j - 1] > temp:
+                    vs[j] = vs[j - 1]
+                    int_pts[j * 2] = int_pts[j * 2 - 2]
+                    int_pts[j * 2 + 1] = int_pts[j * 2 - 1]
+                    j -= 1
+
+                vs[j] = temp
+                int_pts[j * 2] = tx
+                int_pts[j * 2 + 1] = ty
+
+
+@cuda.jit(device=True, inline=True)
+def line_segment_intersection(pts1, pts2, i, j, temp_pts):
+    A = cuda.local.array((2, ), dtype=numba.float32)
+    B = cuda.local.array((2, ), dtype=numba.float32)
+    C = cuda.local.array((2, ), dtype=numba.float32)
+    D = cuda.local.array((2, ), dtype=numba.float32)
+
+    A[0] = pts1[2 * i]
+    A[1] = pts1[2 * i + 1]
+
+    B[0] = pts1[2 * ((i + 1) % 4)]
+    B[1] = pts1[2 * ((i + 1) % 4) + 1]
+
+    C[0] = pts2[2 * j]
+    C[1] = pts2[2 * j + 1]
+
+    D[0] = pts2[2 * ((j + 1) % 4)]
+    D[1] = pts2[2 * ((j + 1) % 4) + 1]
+    BA0 = B[0] - A[0]
+    BA1 = B[1] - A[1]
+    DA0 = D[0] - A[0]
+    CA0 = C[0] - A[0]
+    DA1 = D[1] - A[1]
+    CA1 = C[1] - A[1]
+    acd = DA1 * CA0 > CA1 * DA0
+    bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0])
+    if acd != bcd:
+        abc = CA1 * BA0 > BA1 * CA0
+        abd = DA1 * BA0 > BA1 * DA0
+        if abc != abd:
+            DC0 = D[0] - C[0]
+            DC1 = D[1] - C[1]
+            ABBA = A[0] * B[1] - B[0] * A[1]
+            CDDC = C[0] * D[1] - D[0] * C[1]
+            DH = BA1 * DC0 - BA0 * DC1
+            Dx = ABBA * DC0 - BA0 * CDDC
+            Dy = ABBA * DC1 - BA1 * CDDC
+            temp_pts[0] = Dx / DH
+            temp_pts[1] = Dy / DH
+            return True
+    return False
+
+
+@cuda.jit(device=True, inline=True)
+def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts):
+    a = cuda.local.array((2, ), dtype=numba.float32)
+    b = cuda.local.array((2, ), dtype=numba.float32)
+    c = cuda.local.array((2, ), dtype=numba.float32)
+    d = cuda.local.array((2, ), dtype=numba.float32)
+
+    a[0] = pts1[2 * i]
+    a[1] = pts1[2 * i + 1]
+
+    b[0] = pts1[2 * ((i + 1) % 4)]
+    b[1] = pts1[2 * ((i + 1) % 4) + 1]
+
+    c[0] = pts2[2 * j]
+    c[1] = pts2[2 * j + 1]
+
+    d[0] = pts2[2 * ((j + 1) % 4)]
+    d[1] = pts2[2 * ((j + 1) % 4) + 1]
+
+    area_abc = trangle_area(a, b, c)
+    area_abd = trangle_area(a, b, d)
+
+    if area_abc * area_abd >= 0:
+        return False
+
+    area_cda = trangle_area(c, d, a)
+    area_cdb = area_cda + area_abc - area_abd
+
+    if area_cda * area_cdb >= 0:
+        return False
+    t = area_cda / (area_abd - area_abc)
+
+    dx = t * (b[0] - a[0])
+    dy = t * (b[1] - a[1])
+    temp_pts[0] = a[0] + dx
+    temp_pts[1] = a[1] + dy
+    return True
+
+
+@cuda.jit(device=True, inline=True)
+def point_in_quadrilateral(pt_x, pt_y, corners):
+    ab0 = corners[2] - corners[0]
+    ab1 = corners[3] - corners[1]
+
+    ad0 = corners[6] - corners[0]
+    ad1 = corners[7] - corners[1]
+
+    ap0 = pt_x - corners[0]
+    ap1 = pt_y - corners[1]
+
+    abab = ab0 * ab0 + ab1 * ab1
+    abap = ab0 * ap0 + ab1 * ap1
+    adad = ad0 * ad0 + ad1 * ad1
+    adap = ad0 * ap0 + ad1 * ap1
+
+    return abab >= abap and abap >= 0 and adad >= adap and adap >= 0
+
+
+@cuda.jit(device=True, inline=True)
+def quadrilateral_intersection(pts1, pts2, int_pts):
+    num_of_inter = 0
+    for i in range(4):
+        if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2):
+            int_pts[num_of_inter * 2] = pts1[2 * i]
+            int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1]
+            num_of_inter += 1
+        if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1):
+            int_pts[num_of_inter * 2] = pts2[2 * i]
+            int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1]
+            num_of_inter += 1
+    temp_pts = cuda.local.array((2, ), dtype=numba.float32)
+    for i in range(4):
+        for j in range(4):
+            has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts)
+            if has_pts:
+                int_pts[num_of_inter * 2] = temp_pts[0]
+                int_pts[num_of_inter * 2 + 1] = temp_pts[1]
+                num_of_inter += 1
+
+    return num_of_inter
+
+
+@cuda.jit(device=True, inline=True)
+def rbbox_to_corners(corners, rbbox):
+    # generate clockwise corners and rotate it clockwise
+    angle = rbbox[4]
+    a_cos = math.cos(angle)
+    a_sin = math.sin(angle)
+    center_x = rbbox[0]
+    center_y = rbbox[1]
+    x_d = rbbox[2]
+    y_d = rbbox[3]
+    corners_x = cuda.local.array((4, ), dtype=numba.float32)
+    corners_y = cuda.local.array((4, ), dtype=numba.float32)
+    corners_x[0] = -x_d / 2
+    corners_x[1] = -x_d / 2
+    corners_x[2] = x_d / 2
+    corners_x[3] = x_d / 2
+    corners_y[0] = -y_d / 2
+    corners_y[1] = y_d / 2
+    corners_y[2] = y_d / 2
+    corners_y[3] = -y_d / 2
+    for i in range(4):
+        corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x
+        corners[2 * i +
+                1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y
+
+
+@cuda.jit(device=True, inline=True)
+def inter(rbbox1, rbbox2):
+    """Compute intersection of two rotated boxes.
+
+    Args:
+        rbox1 (np.ndarray, shape=[5]): Rotated 2d box.
+        rbox2 (np.ndarray, shape=[5]): Rotated 2d box.
+
+    Returns:
+        float: Intersection of two rotated boxes.
+    """
+    corners1 = cuda.local.array((8, ), dtype=numba.float32)
+    corners2 = cuda.local.array((8, ), dtype=numba.float32)
+    intersection_corners = cuda.local.array((16, ), dtype=numba.float32)
+
+    rbbox_to_corners(corners1, rbbox1)
+    rbbox_to_corners(corners2, rbbox2)
+
+    num_intersection = quadrilateral_intersection(corners1, corners2,
+                                                  intersection_corners)
+    sort_vertex_in_convex_polygon(intersection_corners, num_intersection)
+    # print(intersection_corners.reshape([-1, 2])[:num_intersection])
+
+    return area(intersection_corners, num_intersection)
+
+
+@cuda.jit(device=True, inline=True)
+def devRotateIoUEval(rbox1, rbox2, criterion=-1):
+    """Compute rotated iou on device.
+
+    Args:
+        rbox1 (np.ndarray, shape=[5]): Rotated 2d box.
+        rbox2 (np.ndarray, shape=[5]): Rotated 2d box.
+        criterion (int, optional): Indicate different type of iou.
+            -1 indicate `area_inter / (area1 + area2 - area_inter)`,
+            0 indicate `area_inter / area1`,
+            1 indicate `area_inter / area2`.
+
+    Returns:
+        float: iou between two input boxes.
+    """
+    area1 = rbox1[2] * rbox1[3]
+    area2 = rbox2[2] * rbox2[3]
+    area_inter = inter(rbox1, rbox2)
+    if criterion == -1:
+        return area_inter / (area1 + area2 - area_inter)
+    elif criterion == 0:
+        return area_inter / area1
+    elif criterion == 1:
+        return area_inter / area2
+    else:
+        return area_inter
+
+
+@cuda.jit(
+    '(int64, int64, float32[:], float32[:], float32[:], int32)',
+    fastmath=False)
+def rotate_iou_kernel_eval(N,
+                           K,
+                           dev_boxes,
+                           dev_query_boxes,
+                           dev_iou,
+                           criterion=-1):
+    """Kernel of computing rotated IoU. This function is for bev boxes in
+    camera coordinate system ONLY (the rotation is clockwise).
+
+    Args:
+        N (int): The number of boxes.
+        K (int): The number of query boxes.
+        dev_boxes (np.ndarray): Boxes on device.
+        dev_query_boxes (np.ndarray): Query boxes on device.
+        dev_iou (np.ndarray): Computed iou to return.
+        criterion (int, optional): Indicate different type of iou.
+            -1 indicate `area_inter / (area1 + area2 - area_inter)`,
+            0 indicate `area_inter / area1`,
+            1 indicate `area_inter / area2`.
+    """
+    threadsPerBlock = 8 * 8
+    row_start = cuda.blockIdx.x
+    col_start = cuda.blockIdx.y
+    tx = cuda.threadIdx.x
+    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
+    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
+    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
+    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
+
+    dev_query_box_idx = threadsPerBlock * col_start + tx
+    dev_box_idx = threadsPerBlock * row_start + tx
+    if (tx < col_size):
+        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
+        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
+        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
+        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
+        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
+    if (tx < row_size):
+        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
+        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
+        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
+        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
+        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
+    cuda.syncthreads()
+    if tx < row_size:
+        for i in range(col_size):
+            offset = (
+                row_start * threadsPerBlock * K + col_start * threadsPerBlock +
+                tx * K + i)
+            dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
+                                               block_boxes[tx * 5:tx * 5 + 5],
+                                               criterion)
+
+
+def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
+    """Rotated box iou running in gpu. 500x faster than cpu version (take 5ms
+    in one example with numba.cuda code). convert from [this project](
+    https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
+
+    This function is for bev boxes in camera coordinate system ONLY
+    (the rotation is clockwise).
+
+    Args:
+        boxes (torch.Tensor): rbboxes. format: centers, dims,
+            angles(clockwise when positive) with the shape of [N, 5].
+        query_boxes (torch.FloatTensor, shape=(K, 5)):
+            rbboxes to compute iou with boxes.
+        device_id (int, optional): Defaults to 0. Device to use.
+        criterion (int, optional): Indicate different type of iou.
+            -1 indicate `area_inter / (area1 + area2 - area_inter)`,
+            0 indicate `area_inter / area1`,
+            1 indicate `area_inter / area2`.
+
+    Returns:
+        np.ndarray: IoU results.
+    """
+    boxes = boxes.astype(np.float32)
+    query_boxes = query_boxes.astype(np.float32)
+    N = boxes.shape[0]
+    K = query_boxes.shape[0]
+    iou = np.zeros((N, K), dtype=np.float32)
+    if N == 0 or K == 0:
+        return iou
+    threadsPerBlock = 8 * 8
+    cuda.select_device(device_id)
+    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))
+
+    stream = cuda.stream()
+    with stream.auto_synchronize():
+        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
+        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
+        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
+        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock,
+                               stream](N, K, boxes_dev, query_boxes_dev,
+                                       iou_dev, criterion)
+        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
+    return iou.astype(boxes.dtype)
diff --git a/mmde/mmdet3d/evaluation/functional/lyft_eval.py b/mmde/mmdet3d/evaluation/functional/lyft_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..72a1156092a88cd2c51601d907d27a5838dc99cf
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/lyft_eval.py
@@ -0,0 +1,285 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from os import path as osp
+
+import mmengine
+import numpy as np
+from lyft_dataset_sdk.eval.detection.mAP_evaluation import (Box3D, get_ap,
+                                                            get_class_names,
+                                                            get_ious,
+                                                            group_by_key,
+                                                            wrap_in_box)
+from mmengine.logging import print_log
+from terminaltables import AsciiTable
+
+
+def load_lyft_gts(lyft, data_root, eval_split, logger=None):
+    """Loads ground truth boxes from database.
+
+    Args:
+        lyft (:obj:`LyftDataset`): Lyft class in the sdk.
+        data_root (str): Root of data for reading splits.
+        eval_split (str): Name of the split for evaluation.
+        logger (logging.Logger | str, optional): Logger used for printing
+        related information during evaluation. Default: None.
+
+    Returns:
+        list[dict]: List of annotation dictionaries.
+    """
+    split_scenes = mmengine.list_from_file(
+        osp.join(data_root, f'{eval_split}.txt'))
+
+    # Read out all sample_tokens in DB.
+    sample_tokens_all = [s['token'] for s in lyft.sample]
+    assert len(sample_tokens_all) > 0, 'Error: Database has no samples!'
+
+    if eval_split == 'test':
+        # Check that you aren't trying to cheat :)
+        assert len(lyft.sample_annotation) > 0, \
+            'Error: You are trying to evaluate on the test set \
+             but you do not have the annotations!'
+
+    sample_tokens = []
+    for sample_token in sample_tokens_all:
+        scene_token = lyft.get('sample', sample_token)['scene_token']
+        scene_record = lyft.get('scene', scene_token)
+        if scene_record['name'] in split_scenes:
+            sample_tokens.append(sample_token)
+
+    all_annotations = []
+
+    print_log('Loading ground truth annotations...', logger=logger)
+    # Load annotations and filter predictions and annotations.
+    for sample_token in mmengine.track_iter_progress(sample_tokens):
+        sample = lyft.get('sample', sample_token)
+        sample_annotation_tokens = sample['anns']
+        for sample_annotation_token in sample_annotation_tokens:
+            # Get label name in detection task and filter unused labels.
+            sample_annotation = \
+                lyft.get('sample_annotation', sample_annotation_token)
+            detection_name = sample_annotation['category_name']
+            if detection_name is None:
+                continue
+            annotation = {
+                'sample_token': sample_token,
+                'translation': sample_annotation['translation'],
+                'size': sample_annotation['size'],
+                'rotation': sample_annotation['rotation'],
+                'name': detection_name,
+            }
+            all_annotations.append(annotation)
+
+    return all_annotations
+
+
+def load_lyft_predictions(res_path):
+    """Load Lyft predictions from json file.
+
+    Args:
+        res_path (str): Path of result json file recording detections.
+
+    Returns:
+        list[dict]: List of prediction dictionaries.
+    """
+    predictions = mmengine.load(res_path)
+    predictions = predictions['results']
+    all_preds = []
+    for sample_token in predictions.keys():
+        all_preds.extend(predictions[sample_token])
+    return all_preds
+
+
+def lyft_eval(lyft, data_root, res_path, eval_set, output_dir, logger=None):
+    """Evaluation API for Lyft dataset.
+
+    Args:
+        lyft (:obj:`LyftDataset`): Lyft class in the sdk.
+        data_root (str): Root of data for reading splits.
+        res_path (str): Path of result json file recording detections.
+        eval_set (str): Name of the split for evaluation.
+        output_dir (str): Output directory for output json files.
+        logger (logging.Logger | str, optional): Logger used for printing
+                related information during evaluation. Default: None.
+
+    Returns:
+        dict[str, float]: The evaluation results.
+    """
+    # evaluate by lyft metrics
+    gts = load_lyft_gts(lyft, data_root, eval_set, logger)
+    predictions = load_lyft_predictions(res_path)
+
+    class_names = get_class_names(gts)
+    print('Calculating mAP@0.5:0.95...')
+
+    iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
+    metrics = {}
+    average_precisions = \
+        get_classwise_aps(gts, predictions, class_names, iou_thresholds)
+    APs_data = [['IOU', 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]]
+
+    mAPs = np.mean(average_precisions, axis=0)
+    mAPs_cate = np.mean(average_precisions, axis=1)
+    final_mAP = np.mean(mAPs)
+
+    metrics['average_precisions'] = average_precisions.tolist()
+    metrics['mAPs'] = mAPs.tolist()
+    metrics['Final mAP'] = float(final_mAP)
+    metrics['class_names'] = class_names
+    metrics['mAPs_cate'] = mAPs_cate.tolist()
+
+    APs_data = [['class', 'mAP@0.5:0.95']]
+    for i in range(len(class_names)):
+        row = [class_names[i], round(mAPs_cate[i], 3)]
+        APs_data.append(row)
+    APs_data.append(['Overall', round(final_mAP, 3)])
+    APs_table = AsciiTable(APs_data, title='mAPs@0.5:0.95')
+    APs_table.inner_footing_row_border = True
+    print_log(APs_table.table, logger=logger)
+
+    res_path = osp.join(output_dir, 'lyft_metrics.json')
+    mmengine.dump(metrics, res_path)
+    return metrics
+
+
+def get_classwise_aps(gt, predictions, class_names, iou_thresholds):
+    """Returns an array with an average precision per class.
+
+    Note: Ground truth and predictions should have the following format.
+
+    .. code-block::
+
+    gt = [{
+        'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207
+                         fbb039a550991a5149214f98cec136ac',
+        'translation': [974.2811881299899, 1714.6815014457964,
+                        -23.689857123368846],
+        'size': [1.796, 4.488, 1.664],
+        'rotation': [0.14882026466054782, 0, 0, 0.9888642620837121],
+        'name': 'car'
+    }]
+
+    predictions = [{
+        'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207
+                         fbb039a550991a5149214f98cec136ac',
+        'translation': [971.8343488872263, 1713.6816097857359,
+                        -25.82534357061308],
+        'size': [2.519726579986132, 7.810161372666739, 3.483438286096803],
+        'rotation': [0.10913582721095375, 0.04099572636992043,
+                     0.01927712319721745, 1.029328402625659],
+        'name': 'car',
+        'score': 0.3077029437237213
+    }]
+
+    Args:
+        gt (list[dict]): list of dictionaries in the format described below.
+        predictions (list[dict]): list of dictionaries in the format
+            described below.
+        class_names (list[str]): list of the class names.
+        iou_thresholds (list[float]): IOU thresholds used to calculate
+            TP / FN
+
+    Returns:
+        np.ndarray: an array with an average precision per class.
+    """
+    assert all([0 <= iou_th <= 1 for iou_th in iou_thresholds])
+
+    gt_by_class_name = group_by_key(gt, 'name')
+    pred_by_class_name = group_by_key(predictions, 'name')
+
+    average_precisions = np.zeros((len(class_names), len(iou_thresholds)))
+
+    for class_id, class_name in enumerate(class_names):
+        if class_name in pred_by_class_name:
+            recalls, precisions, average_precision = get_single_class_aps(
+                gt_by_class_name[class_name], pred_by_class_name[class_name],
+                iou_thresholds)
+            average_precisions[class_id, :] = average_precision
+
+    return average_precisions
+
+
+def get_single_class_aps(gt, predictions, iou_thresholds):
+    """Compute recall and precision for all iou thresholds. Adapted from
+    LyftDatasetDevkit.
+
+    Args:
+        gt (list[dict]): list of dictionaries in the format described above.
+        predictions (list[dict]): list of dictionaries in the format
+            described below.
+        iou_thresholds (list[float]): IOU thresholds used to calculate
+            TP / FN
+
+    Returns:
+        tuple[np.ndarray]: Returns (recalls, precisions, average precisions)
+            for each class.
+    """
+    num_gts = len(gt)
+    image_gts = group_by_key(gt, 'sample_token')
+    image_gts = wrap_in_box(image_gts)
+
+    sample_gt_checked = {
+        sample_token: np.zeros((len(boxes), len(iou_thresholds)))
+        for sample_token, boxes in image_gts.items()
+    }
+
+    predictions = sorted(predictions, key=lambda x: x['score'], reverse=True)
+
+    # go down dets and mark TPs and FPs
+    num_predictions = len(predictions)
+    tps = np.zeros((num_predictions, len(iou_thresholds)))
+    fps = np.zeros((num_predictions, len(iou_thresholds)))
+
+    for prediction_index, prediction in enumerate(predictions):
+        predicted_box = Box3D(**prediction)
+
+        sample_token = prediction['sample_token']
+
+        max_overlap = -np.inf
+        jmax = -1
+
+        if sample_token in image_gts:
+            gt_boxes = image_gts[sample_token]
+            # gt_boxes per sample
+            gt_checked = sample_gt_checked[sample_token]
+            # gt flags per sample
+        else:
+            gt_boxes = []
+            gt_checked = None
+
+        if len(gt_boxes) > 0:
+            overlaps = get_ious(gt_boxes, predicted_box)
+
+            max_overlap = np.max(overlaps)
+
+            jmax = np.argmax(overlaps)
+
+        for i, iou_threshold in enumerate(iou_thresholds):
+            if max_overlap > iou_threshold:
+                if gt_checked[jmax, i] == 0:
+                    tps[prediction_index, i] = 1.0
+                    gt_checked[jmax, i] = 1
+                else:
+                    fps[prediction_index, i] = 1.0
+            else:
+                fps[prediction_index, i] = 1.0
+
+    # compute precision recall
+    fps = np.cumsum(fps, axis=0)
+    tps = np.cumsum(tps, axis=0)
+
+    recalls = tps / float(num_gts)
+    # avoid divide by zero in case the first detection
+    # matches a difficult ground truth
+    precisions = tps / np.maximum(tps + fps, np.finfo(np.float64).eps)
+
+    aps = []
+    for i in range(len(iou_thresholds)):
+        recall = recalls[:, i]
+        precision = precisions[:, i]
+        assert np.all(0 <= recall) & np.all(recall <= 1)
+        assert np.all(0 <= precision) & np.all(precision <= 1)
+        ap = get_ap(recall, precision)
+        aps.append(ap)
+
+    aps = np.array(aps)
+
+    return recalls, precisions, aps
diff --git a/mmde/mmdet3d/evaluation/functional/panoptic_seg_eval.py b/mmde/mmdet3d/evaluation/functional/panoptic_seg_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..6029b73e09efaffffd2b9dcf2dd0890e9d01bfa8
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/panoptic_seg_eval.py
@@ -0,0 +1,387 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+import numpy as np
+from mmengine.logging import MMLogger, print_log
+
+PQReturnsType = Tuple[np.double, np.double, np.ndarray, np.ndarray, np.ndarray]
+
+
+class EvalPanoptic:
+    r"""Evaluate panoptic results for Semantickitti and NuScenes.
+    Please refer to the `semantic kitti api
+    <https://github.com/PRBonn/semantic-kitti-api/>`_ for more details
+
+    Args:
+        classes (list): Classes used in the dataset.
+        thing_classes (list): Thing classes used in the dataset.
+        stuff_classes (list): Stuff classes used in the dataset.
+        min_num_points (int): Minimum number of points of an object to be
+            counted as ground truth in evaluation.
+        id_offset (int): Offset for instance ids to concat with
+            semantic labels.
+        label2cat (dict[str]): Mapping from label to category.
+        ignore_index (list[int]): Indices of ignored classes in evaluation.
+        logger (logging.Logger | str, optional): Logger used for printing.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 classes: List[str],
+                 thing_classes: List[str],
+                 stuff_classes: List[str],
+                 min_num_points: int,
+                 id_offset: int,
+                 label2cat: Dict[str, str],
+                 ignore_index: List[str],
+                 logger: MMLogger = None):
+        self.classes = classes
+        self.thing_classes = thing_classes
+        self.stuff_classes = stuff_classes
+        self.ignore_index = np.array(ignore_index, dtype=int)
+        self.num_classes = len(classes)
+        self.label2cat = label2cat
+        self.logger = logger
+        self.include = np.array(
+            [n for n in range(self.num_classes) if n not in self.ignore_index],
+            dtype=int)
+        self.id_offset = id_offset
+        self.eps = 1e-15
+        self.min_num_points = min_num_points
+        self.reset()
+
+    def reset(self):
+        """Reset class variables."""
+        # general things
+        # iou stuff
+        self.confusion_matrix = np.zeros((self.num_classes, self.num_classes),
+                                         dtype=int)
+        # panoptic stuff
+        self.pan_tp = np.zeros(self.num_classes, dtype=int)
+        self.pan_iou = np.zeros(self.num_classes, dtype=np.double)
+        self.pan_fp = np.zeros(self.num_classes, dtype=int)
+        self.pan_fn = np.zeros(self.num_classes, dtype=int)
+
+        self.evaluated_fnames = []
+
+    def evaluate(self, gt_labels: List[Dict[str, np.ndarray]],
+                 seg_preds: List[Dict[str, np.ndarray]]) -> Dict[str, float]:
+        """Evaluate the predictions.
+
+        Args:
+            gt_labels (list[dict[np.ndarray]]): Ground Truth.
+            seg_preds (list[dict[np.ndarray]]): Predictions.
+
+        Returns:
+            dict[float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        assert len(seg_preds) == len(gt_labels)
+        for f in range(len(seg_preds)):
+            gt_semantic_seg = gt_labels[f]['pts_semantic_mask'].astype(int)
+            gt_instance_seg = gt_labels[f]['pts_instance_mask'].astype(int)
+            pred_semantic_seg = seg_preds[f]['pts_semantic_mask'].astype(int)
+            pred_instance_seg = seg_preds[f]['pts_instance_mask'].astype(int)
+
+            self.add_semantic_sample(pred_semantic_seg, gt_semantic_seg)
+            self.add_panoptic_sample(pred_semantic_seg, gt_semantic_seg,
+                                     pred_instance_seg, gt_instance_seg)
+
+        result_dicts = self.print_results()
+
+        return result_dicts
+
+    def print_results(self) -> Dict[str, float]:
+        """Print results.
+
+        Returns:
+            dict[float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        pq, sq, rq, all_pq, all_sq, all_rq = self.get_pq()
+        miou, iou = self.get_iou()
+
+        # now make a nice dictionary
+        output_dict = {}
+
+        # make python variables
+        pq = pq.item()
+        sq = sq.item()
+        rq = rq.item()
+        all_pq = all_pq.flatten().tolist()
+        all_sq = all_sq.flatten().tolist()
+        all_rq = all_rq.flatten().tolist()
+        miou = miou.item()
+        iou = iou.flatten().tolist()
+
+        output_dict['all'] = {}
+        output_dict['all']['pq'] = pq
+        output_dict['all']['sq'] = sq
+        output_dict['all']['rq'] = rq
+        output_dict['all']['miou'] = miou
+        for idx, (_pq, _sq, _rq,
+                  _iou) in enumerate(zip(all_pq, all_sq, all_rq, iou)):
+            class_str = self.classes[idx]
+            output_dict[class_str] = {}
+            output_dict[class_str]['pq'] = _pq
+            output_dict[class_str]['sq'] = _sq
+            output_dict[class_str]['rq'] = _rq
+            output_dict[class_str]['miou'] = _iou
+
+        pq_dagger = np.mean(
+            [float(output_dict[c]['pq']) for c in self.thing_classes] +
+            [float(output_dict[c]['miou']) for c in self.stuff_classes])
+
+        pq_things = np.mean(
+            [float(output_dict[c]['pq']) for c in self.thing_classes])
+        rq_things = np.mean(
+            [float(output_dict[c]['rq']) for c in self.thing_classes])
+        sq_things = np.mean(
+            [float(output_dict[c]['sq']) for c in self.thing_classes])
+
+        pq_stuff = np.mean(
+            [float(output_dict[c]['pq']) for c in self.stuff_classes])
+        rq_stuff = np.mean(
+            [float(output_dict[c]['rq']) for c in self.stuff_classes])
+        sq_stuff = np.mean(
+            [float(output_dict[c]['sq']) for c in self.stuff_classes])
+
+        result_dicts = {}
+        result_dicts['pq'] = float(pq)
+        result_dicts['pq_dagger'] = float(pq_dagger)
+        result_dicts['sq_mean'] = float(sq)
+        result_dicts['rq_mean'] = float(rq)
+        result_dicts['miou'] = float(miou)
+        result_dicts['pq_stuff'] = float(pq_stuff)
+        result_dicts['rq_stuff'] = float(rq_stuff)
+        result_dicts['sq_stuff'] = float(sq_stuff)
+        result_dicts['pq_things'] = float(pq_things)
+        result_dicts['rq_things'] = float(rq_things)
+        result_dicts['sq_things'] = float(sq_things)
+
+        if self.logger is not None:
+            print_log('|        |   IoU   |   PQ   |   RQ   |  SQ   |',
+                      self.logger)
+            for k, v in output_dict.items():
+                print_log(
+                    '|{}| {:.4f} | {:.4f} | {:.4f} | {:.4f} |'.format(
+                        k.ljust(8)[-8:], v['miou'], v['pq'], v['rq'], v['sq']),
+                    self.logger)
+            print_log('True Positive: ', self.logger)
+            print_log('\t|\t'.join([str(x) for x in self.pan_tp]), self.logger)
+            print_log('False Positive: ')
+            print_log('\t|\t'.join([str(x) for x in self.pan_fp]), self.logger)
+            print_log('False Negative: ')
+            print_log('\t|\t'.join([str(x) for x in self.pan_fn]), self.logger)
+
+        else:
+            print('|        |   IoU   |   PQ   |   RQ   |  SQ   |')
+            for k, v in output_dict.items():
+                print('|{}| {:.4f} | {:.4f} | {:.4f} | {:.4f} |'.format(
+                    k.ljust(8)[-8:], v['miou'], v['pq'], v['rq'], v['sq']))
+            print('True Positive: ')
+            print('\t|\t'.join([str(x) for x in self.pan_tp]))
+            print('False Positive: ')
+            print('\t|\t'.join([str(x) for x in self.pan_fp]))
+            print('False Negative: ')
+            print('\t|\t'.join([str(x) for x in self.pan_fn]))
+
+        return result_dicts
+
+    def get_pq(self) -> PQReturnsType:
+        """Get results of PQ metric.
+
+        Returns:
+            tuple(np.ndarray): PQ, SQ, RQ of each class and all class.
+        """
+        # get PQ and first calculate for all classes
+        sq_all = self.pan_iou.astype(np.double) / np.maximum(
+            self.pan_tp.astype(np.double), self.eps)
+        rq_all = self.pan_tp.astype(np.double) / np.maximum(
+            self.pan_tp.astype(np.double) + 0.5 * self.pan_fp.astype(np.double)
+            + 0.5 * self.pan_fn.astype(np.double), self.eps)
+        pq_all = sq_all * rq_all
+
+        # then do the REAL mean (no ignored classes)
+        sq = sq_all[self.include].mean()
+        rq = rq_all[self.include].mean()
+        pq = pq_all[self.include].mean()
+
+        return (pq, sq, rq, pq_all, sq_all, rq_all)
+
+    def get_iou(self) -> Tuple[np.double, np.ndarray]:
+        """Get results of IOU metric.
+
+        Returns:
+            tuple(np.ndarray): iou of all class and each class.
+        """
+        tp, fp, fn = self.get_iou_stats()
+        intersection = tp
+        union = tp + fp + fn
+        union = np.maximum(union, self.eps)
+        iou = intersection.astype(np.double) / union.astype(np.double)
+        iou_mean = (intersection[self.include].astype(np.double) /
+                    union[self.include].astype(np.double)).mean()
+
+        return iou_mean, iou
+
+    def get_iou_stats(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """Get IOU statistics of TP, FP and FN.
+
+        Returns:
+            tuple(np.ndarray): TP, FP, FN of all class.
+        """
+        # copy to avoid modifying the real deal
+        conf = self.confusion_matrix.copy().astype(np.double)
+        # remove fp from confusion on the ignore classes predictions
+        # points that were predicted of another class, but were ignore
+        # (corresponds to zeroing the cols of those classes,
+        # since the predictions go on the rows)
+        conf[:, self.ignore_index] = 0
+
+        # get the clean stats
+        tp = conf.diagonal()
+        fp = conf.sum(axis=1) - tp
+        fn = conf.sum(axis=0) - tp
+        return tp, fp, fn
+
+    def add_semantic_sample(self, semantic_preds: np.ndarray,
+                            gt_semantics: np.ndarray):
+        """Add one batch of semantic predictions and ground truths.
+
+        Args:
+            semantic_preds (np.ndarray): Semantic predictions.
+            gt_semantics (np.ndarray): Semantic ground truths.
+        """
+        idxs = np.stack([semantic_preds, gt_semantics], axis=0)
+        # make confusion matrix (cols = gt, rows = pred)
+        np.add.at(self.confusion_matrix, tuple(idxs), 1)
+
+    def add_panoptic_sample(self, semantic_preds: np.ndarray,
+                            gt_semantics: np.ndarray,
+                            instance_preds: np.ndarray,
+                            gt_instances: np.ndarray):
+        """Add one sample of panoptic predictions and ground truths for
+        evaluation.
+
+        Args:
+            semantic_preds (np.ndarray): Semantic predictions.
+            gt_semantics (np.ndarray): Semantic ground truths.
+            instance_preds (np.ndarray): Instance predictions.
+            gt_instances (np.ndarray): Instance ground truths.
+        """
+        # avoid zero (ignored label)
+        instance_preds = instance_preds + 1
+        gt_instances = gt_instances + 1
+
+        # only interested in points that are
+        # outside the void area (not in excluded classes)
+        for cl in self.ignore_index:
+            # make a mask for this class
+            gt_not_in_excl_mask = gt_semantics != cl
+            # remove all other points
+            semantic_preds = semantic_preds[gt_not_in_excl_mask]
+            gt_semantics = gt_semantics[gt_not_in_excl_mask]
+            instance_preds = instance_preds[gt_not_in_excl_mask]
+            gt_instances = gt_instances[gt_not_in_excl_mask]
+
+        # first step is to count intersections > 0.5 IoU
+        # for each class (except the ignored ones)
+        for cl in self.include:
+            # get a class mask
+            pred_inst_in_cl_mask = semantic_preds == cl
+            gt_inst_in_cl_mask = gt_semantics == cl
+
+            # get instance points in class (makes outside stuff 0)
+            pred_inst_in_cl = instance_preds * pred_inst_in_cl_mask.astype(int)
+            gt_inst_in_cl = gt_instances * gt_inst_in_cl_mask.astype(int)
+
+            # generate the areas for each unique instance prediction
+            unique_pred, counts_pred = np.unique(
+                pred_inst_in_cl[pred_inst_in_cl > 0], return_counts=True)
+            id2idx_pred = {id: idx for idx, id in enumerate(unique_pred)}
+            matched_pred = np.array([False] * unique_pred.shape[0])
+
+            # generate the areas for each unique instance gt_np
+            unique_gt, counts_gt = np.unique(
+                gt_inst_in_cl[gt_inst_in_cl > 0], return_counts=True)
+            id2idx_gt = {id: idx for idx, id in enumerate(unique_gt)}
+            matched_gt = np.array([False] * unique_gt.shape[0])
+
+            # generate intersection using offset
+            valid_combos = np.logical_and(pred_inst_in_cl > 0,
+                                          gt_inst_in_cl > 0)
+            id_offset_combo = pred_inst_in_cl[
+                valid_combos] + self.id_offset * gt_inst_in_cl[valid_combos]
+            unique_combo, counts_combo = np.unique(
+                id_offset_combo, return_counts=True)
+
+            # generate an intersection map
+            # count the intersections with over 0.5 IoU as TP
+            gt_labels = unique_combo // self.id_offset
+            pred_labels = unique_combo % self.id_offset
+            gt_areas = np.array([counts_gt[id2idx_gt[id]] for id in gt_labels])
+            pred_areas = np.array(
+                [counts_pred[id2idx_pred[id]] for id in pred_labels])
+            intersections = counts_combo
+            unions = gt_areas + pred_areas - intersections
+            ious = intersections.astype(float) / unions.astype(float)
+
+            tp_indexes = ious > 0.5
+            self.pan_tp[cl] += np.sum(tp_indexes)
+            self.pan_iou[cl] += np.sum(ious[tp_indexes])
+
+            matched_gt[[id2idx_gt[id] for id in gt_labels[tp_indexes]]] = True
+            matched_pred[[id2idx_pred[id]
+                          for id in pred_labels[tp_indexes]]] = True
+
+            # count the FN
+            if len(counts_gt) > 0:
+                self.pan_fn[cl] += np.sum(
+                    np.logical_and(counts_gt >= self.min_num_points,
+                                   ~matched_gt))
+
+            # count the FP
+            if len(matched_pred) > 0:
+                self.pan_fp[cl] += np.sum(
+                    np.logical_and(counts_pred >= self.min_num_points,
+                                   ~matched_pred))
+
+
+def panoptic_seg_eval(gt_labels: List[np.ndarray],
+                      seg_preds: List[np.ndarray],
+                      classes: List[str],
+                      thing_classes: List[str],
+                      stuff_classes: List[str],
+                      min_num_points: int,
+                      id_offset: int,
+                      label2cat: Dict[str, str],
+                      ignore_index: List[int],
+                      logger: MMLogger = None) -> Dict[str, float]:
+    """Panoptic Segmentation Evaluation.
+
+    Evaluate the result of the panoptic segmentation.
+
+    Args:
+        gt_labels (list[dict[np.ndarray]]): Ground Truth.
+        seg_preds (list[dict[np.ndarray]]): Predictions.
+        classes (list[str]): Classes used in the dataset.
+        thing_classes (list[str]): Thing classes used in the dataset.
+        stuff_classes (list[str]): Stuff classes used in the dataset.
+        min_num_points (int): Minimum point number of object to be
+            counted as ground truth in evaluation.
+        id_offset (int): Offset for instance ids to concat with
+            semantic labels.
+        label2cat (dict[str]): Mapping from label to category.
+        ignore_index (list[int]): Indices of ignored classes in evaluation.
+        logger (logging.Logger | str, optional): Logger used for printing.
+            Defaults to None.
+
+    Returns:
+        dict[float]: Dict of results.
+    """
+    panoptic_seg_eval = EvalPanoptic(classes, thing_classes, stuff_classes,
+                                     min_num_points, id_offset, label2cat,
+                                     ignore_index, logger)
+    ret_dict = panoptic_seg_eval.evaluate(gt_labels, seg_preds)
+    return ret_dict
diff --git a/mmde/mmdet3d/evaluation/functional/scannet_utils/__init__.py b/mmde/mmdet3d/evaluation/functional/scannet_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c98ea835b6f4213b499ecbd54cb43bced1aea06a
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/scannet_utils/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .evaluate_semantic_instance import evaluate_matches, scannet_eval
+
+__all__ = ['scannet_eval', 'evaluate_matches']
diff --git a/mmde/mmdet3d/evaluation/functional/scannet_utils/evaluate_semantic_instance.py b/mmde/mmdet3d/evaluation/functional/scannet_utils/evaluate_semantic_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b931c530b163e4f7150b94f03271571fff7730b
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/scannet_utils/evaluate_semantic_instance.py
@@ -0,0 +1,346 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# adapted from https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/3d_evaluation/evaluate_semantic_instance.py # noqa
+from copy import deepcopy
+
+import numpy as np
+
+from . import util_3d
+
+
+def evaluate_matches(matches, class_labels, options):
+    """Evaluate instance segmentation from matched gt and predicted instances
+    for all scenes.
+
+    Args:
+        matches (dict): Contains gt2pred and pred2gt infos for every scene.
+        class_labels (tuple[str]): Class names.
+        options (dict): ScanNet evaluator options. See get_options.
+
+    Returns:
+        np.array: Average precision scores for all thresholds and categories.
+    """
+    overlaps = options['overlaps']
+    min_region_sizes = [options['min_region_sizes'][0]]
+    dist_threshes = [options['distance_threshes'][0]]
+    dist_confs = [options['distance_confs'][0]]
+
+    # results: class x overlap
+    ap = np.zeros((len(dist_threshes), len(class_labels), len(overlaps)))
+    for di, (min_region_size, distance_thresh, distance_conf) in enumerate(
+            zip(min_region_sizes, dist_threshes, dist_confs)):
+        for oi, overlap_th in enumerate(overlaps):
+            pred_visited = {}
+            for m in matches:
+                for label_name in class_labels:
+                    for p in matches[m]['pred'][label_name]:
+                        if 'filename' in p:
+                            pred_visited[p['filename']] = False
+            for li, label_name in enumerate(class_labels):
+                y_true = np.empty(0)
+                y_score = np.empty(0)
+                hard_false_negatives = 0
+                has_gt = False
+                has_pred = False
+                for m in matches:
+                    pred_instances = matches[m]['pred'][label_name]
+                    gt_instances = matches[m]['gt'][label_name]
+                    # filter groups in ground truth
+                    gt_instances = [
+                        gt for gt in gt_instances
+                        if gt['instance_id'] >= 1000 and gt['vert_count'] >=
+                        min_region_size and gt['med_dist'] <= distance_thresh
+                        and gt['dist_conf'] >= distance_conf
+                    ]
+                    if gt_instances:
+                        has_gt = True
+                    if pred_instances:
+                        has_pred = True
+
+                    cur_true = np.ones(len(gt_instances))
+                    cur_score = np.ones(len(gt_instances)) * (-float('inf'))
+                    cur_match = np.zeros(len(gt_instances), dtype=bool)
+                    # collect matches
+                    for (gti, gt) in enumerate(gt_instances):
+                        found_match = False
+                        for pred in gt['matched_pred']:
+                            # greedy assignments
+                            if pred_visited[pred['filename']]:
+                                continue
+                            overlap = float(pred['intersection']) / (
+                                gt['vert_count'] + pred['vert_count'] -
+                                pred['intersection'])
+                            if overlap > overlap_th:
+                                confidence = pred['confidence']
+                                # if already have a prediction for this gt,
+                                # the prediction with the lower score is automatically a false positive # noqa
+                                if cur_match[gti]:
+                                    max_score = max(cur_score[gti], confidence)
+                                    min_score = min(cur_score[gti], confidence)
+                                    cur_score[gti] = max_score
+                                    # append false positive
+                                    cur_true = np.append(cur_true, 0)
+                                    cur_score = np.append(cur_score, min_score)
+                                    cur_match = np.append(cur_match, True)
+                                # otherwise set score
+                                else:
+                                    found_match = True
+                                    cur_match[gti] = True
+                                    cur_score[gti] = confidence
+                                    pred_visited[pred['filename']] = True
+                        if not found_match:
+                            hard_false_negatives += 1
+                    # remove non-matched ground truth instances
+                    cur_true = cur_true[cur_match]
+                    cur_score = cur_score[cur_match]
+
+                    # collect non-matched predictions as false positive
+                    for pred in pred_instances:
+                        found_gt = False
+                        for gt in pred['matched_gt']:
+                            overlap = float(gt['intersection']) / (
+                                gt['vert_count'] + pred['vert_count'] -
+                                gt['intersection'])
+                            if overlap > overlap_th:
+                                found_gt = True
+                                break
+                        if not found_gt:
+                            num_ignore = pred['void_intersection']
+                            for gt in pred['matched_gt']:
+                                # group?
+                                if gt['instance_id'] < 1000:
+                                    num_ignore += gt['intersection']
+                                # small ground truth instances
+                                if gt['vert_count'] < min_region_size or gt[
+                                        'med_dist'] > distance_thresh or gt[
+                                            'dist_conf'] < distance_conf:
+                                    num_ignore += gt['intersection']
+                            proportion_ignore = float(
+                                num_ignore) / pred['vert_count']
+                            # if not ignored append false positive
+                            if proportion_ignore <= overlap_th:
+                                cur_true = np.append(cur_true, 0)
+                                confidence = pred['confidence']
+                                cur_score = np.append(cur_score, confidence)
+
+                    # append to overall results
+                    y_true = np.append(y_true, cur_true)
+                    y_score = np.append(y_score, cur_score)
+
+                # compute average precision
+                if has_gt and has_pred:
+                    # compute precision recall curve first
+
+                    # sorting and cumsum
+                    score_arg_sort = np.argsort(y_score)
+                    y_score_sorted = y_score[score_arg_sort]
+                    y_true_sorted = y_true[score_arg_sort]
+                    y_true_sorted_cumsum = np.cumsum(y_true_sorted)
+
+                    # unique thresholds
+                    (thresholds, unique_indices) = np.unique(
+                        y_score_sorted, return_index=True)
+                    num_prec_recall = len(unique_indices) + 1
+
+                    # prepare precision recall
+                    num_examples = len(y_score_sorted)
+                    # follow https://github.com/ScanNet/ScanNet/pull/26 ? # noqa
+                    num_true_examples = y_true_sorted_cumsum[-1] if len(
+                        y_true_sorted_cumsum) > 0 else 0
+                    precision = np.zeros(num_prec_recall)
+                    recall = np.zeros(num_prec_recall)
+
+                    # deal with the first point
+                    y_true_sorted_cumsum = np.append(y_true_sorted_cumsum, 0)
+                    # deal with remaining
+                    for idx_res, idx_scores in enumerate(unique_indices):
+                        cumsum = y_true_sorted_cumsum[idx_scores - 1]
+                        tp = num_true_examples - cumsum
+                        fp = num_examples - idx_scores - tp
+                        fn = cumsum + hard_false_negatives
+                        p = float(tp) / (tp + fp)
+                        r = float(tp) / (tp + fn)
+                        precision[idx_res] = p
+                        recall[idx_res] = r
+
+                    # first point in curve is artificial
+                    precision[-1] = 1.
+                    recall[-1] = 0.
+
+                    # compute average of precision-recall curve
+                    recall_for_conv = np.copy(recall)
+                    recall_for_conv = np.append(recall_for_conv[0],
+                                                recall_for_conv)
+                    recall_for_conv = np.append(recall_for_conv, 0.)
+
+                    stepWidths = np.convolve(recall_for_conv, [-0.5, 0, 0.5],
+                                             'valid')
+                    # integrate is now simply a dot product
+                    ap_current = np.dot(precision, stepWidths)
+
+                elif has_gt:
+                    ap_current = 0.0
+                else:
+                    ap_current = float('nan')
+                ap[di, li, oi] = ap_current
+    return ap
+
+
+def compute_averages(aps, options, class_labels):
+    """Averages AP scores for all categories.
+
+    Args:
+        aps (np.array): AP scores for all thresholds and categories.
+        options (dict): ScanNet evaluator options. See get_options.
+        class_labels (tuple[str]): Class names.
+
+    Returns:
+        dict: Overall and per-category AP scores.
+    """
+    d_inf = 0
+    o50 = np.where(np.isclose(options['overlaps'], 0.5))
+    o25 = np.where(np.isclose(options['overlaps'], 0.25))
+    o_all_but25 = np.where(
+        np.logical_not(np.isclose(options['overlaps'], 0.25)))
+    avg_dict = {}
+    avg_dict['all_ap'] = np.nanmean(aps[d_inf, :, o_all_but25])
+    avg_dict['all_ap_50%'] = np.nanmean(aps[d_inf, :, o50])
+    avg_dict['all_ap_25%'] = np.nanmean(aps[d_inf, :, o25])
+    avg_dict['classes'] = {}
+    for (li, label_name) in enumerate(class_labels):
+        avg_dict['classes'][label_name] = {}
+        avg_dict['classes'][label_name]['ap'] = np.average(aps[d_inf, li,
+                                                               o_all_but25])
+        avg_dict['classes'][label_name]['ap50%'] = np.average(aps[d_inf, li,
+                                                                  o50])
+        avg_dict['classes'][label_name]['ap25%'] = np.average(aps[d_inf, li,
+                                                                  o25])
+    return avg_dict
+
+
+def assign_instances_for_scan(pred_info, gt_ids, options, valid_class_ids,
+                              class_labels, id_to_label):
+    """Assign gt and predicted instances for a single scene.
+
+    Args:
+        pred_info (dict): Predicted masks, labels and scores.
+        gt_ids (np.array): Ground truth instance masks.
+        options (dict): ScanNet evaluator options. See get_options.
+        valid_class_ids (tuple[int]): Ids of valid categories.
+        class_labels (tuple[str]): Class names.
+        id_to_label (dict[int, str]): Mapping of valid class id to class label.
+
+    Returns:
+        dict: Per class assigned gt to predicted instances.
+        dict: Per class assigned predicted to gt instances.
+    """
+    # get gt instances
+    gt_instances = util_3d.get_instances(gt_ids, valid_class_ids, class_labels,
+                                         id_to_label)
+    # associate
+    gt2pred = deepcopy(gt_instances)
+    for label in gt2pred:
+        for gt in gt2pred[label]:
+            gt['matched_pred'] = []
+    pred2gt = {}
+    for label in class_labels:
+        pred2gt[label] = []
+    num_pred_instances = 0
+    # mask of void labels in the ground truth
+    bool_void = np.logical_not(np.in1d(gt_ids // 1000, valid_class_ids))
+    # go through all prediction masks
+    for pred_mask_file in pred_info:
+        label_id = int(pred_info[pred_mask_file]['label_id'])
+        conf = pred_info[pred_mask_file]['conf']
+        if not label_id in id_to_label:  # noqa E713
+            continue
+        label_name = id_to_label[label_id]
+        # read the mask
+        pred_mask = pred_info[pred_mask_file]['mask']
+        if len(pred_mask) != len(gt_ids):
+            raise ValueError('len(pred_mask) != len(gt_ids)')
+        # convert to binary
+        pred_mask = np.not_equal(pred_mask, 0)
+        num = np.count_nonzero(pred_mask)
+        if num < options['min_region_sizes'][0]:
+            continue  # skip if empty
+
+        pred_instance = {}
+        pred_instance['filename'] = pred_mask_file
+        pred_instance['pred_id'] = num_pred_instances
+        pred_instance['label_id'] = label_id
+        pred_instance['vert_count'] = num
+        pred_instance['confidence'] = conf
+        pred_instance['void_intersection'] = np.count_nonzero(
+            np.logical_and(bool_void, pred_mask))
+
+        # matched gt instances
+        matched_gt = []
+        # go through all gt instances with matching label
+        for (gt_num, gt_inst) in enumerate(gt2pred[label_name]):
+            intersection = np.count_nonzero(
+                np.logical_and(gt_ids == gt_inst['instance_id'], pred_mask))
+            if intersection > 0:
+                gt_copy = gt_inst.copy()
+                pred_copy = pred_instance.copy()
+                gt_copy['intersection'] = intersection
+                pred_copy['intersection'] = intersection
+                matched_gt.append(gt_copy)
+                gt2pred[label_name][gt_num]['matched_pred'].append(pred_copy)
+        pred_instance['matched_gt'] = matched_gt
+        num_pred_instances += 1
+        pred2gt[label_name].append(pred_instance)
+
+    return gt2pred, pred2gt
+
+
+def scannet_eval(preds, gts, options, valid_class_ids, class_labels,
+                 id_to_label):
+    """Evaluate instance segmentation in ScanNet protocol.
+
+    Args:
+        preds (list[dict]): Per scene predictions of mask, label and
+            confidence.
+        gts (list[np.array]): Per scene ground truth instance masks.
+        options (dict): ScanNet evaluator options. See get_options.
+        valid_class_ids (tuple[int]): Ids of valid categories.
+        class_labels (tuple[str]): Class names.
+        id_to_label (dict[int, str]): Mapping of valid class id to class label.
+
+    Returns:
+        dict: Overall and per-category AP scores.
+    """
+    options = get_options(options)
+    matches = {}
+    for i, (pred, gt) in enumerate(zip(preds, gts)):
+        matches_key = i
+        # assign gt to predictions
+        gt2pred, pred2gt = assign_instances_for_scan(pred, gt, options,
+                                                     valid_class_ids,
+                                                     class_labels, id_to_label)
+        matches[matches_key] = {}
+        matches[matches_key]['gt'] = gt2pred
+        matches[matches_key]['pred'] = pred2gt
+
+    ap_scores = evaluate_matches(matches, class_labels, options)
+    avgs = compute_averages(ap_scores, options, class_labels)
+    return avgs
+
+
+def get_options(options=None):
+    """Set ScanNet evaluator options.
+
+    Args:
+        options (dict, optional): Not default options. Default: None.
+
+    Returns:
+        dict: Updated options with all 4 keys.
+    """
+    assert options is None or isinstance(options, dict)
+    _options = dict(
+        overlaps=np.append(np.arange(0.5, 0.95, 0.05), 0.25),
+        min_region_sizes=np.array([100]),
+        distance_threshes=np.array([float('inf')]),
+        distance_confs=np.array([-float('inf')]))
+    if options is not None:
+        _options.update(options)
+    return _options
diff --git a/mmde/mmdet3d/evaluation/functional/scannet_utils/util_3d.py b/mmde/mmdet3d/evaluation/functional/scannet_utils/util_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..527d341266b29d436c87ef969411974a254aeeb3
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/scannet_utils/util_3d.py
@@ -0,0 +1,84 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# adapted from https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/util_3d.py # noqa
+import json
+
+import numpy as np
+
+
+class Instance:
+    """Single instance for ScanNet evaluator.
+
+    Args:
+        mesh_vert_instances (np.array): Instance ids for each point.
+        instance_id: Id of single instance.
+    """
+    instance_id = 0
+    label_id = 0
+    vert_count = 0
+    med_dist = -1
+    dist_conf = 0.0
+
+    def __init__(self, mesh_vert_instances, instance_id):
+        if instance_id == -1:
+            return
+        self.instance_id = int(instance_id)
+        self.label_id = int(self.get_label_id(instance_id))
+        self.vert_count = int(
+            self.get_instance_verts(mesh_vert_instances, instance_id))
+
+    @staticmethod
+    def get_label_id(instance_id):
+        return int(instance_id // 1000)
+
+    @staticmethod
+    def get_instance_verts(mesh_vert_instances, instance_id):
+        return (mesh_vert_instances == instance_id).sum()
+
+    def to_json(self):
+        return json.dumps(
+            self, default=lambda o: o.__dict__, sort_keys=True, indent=4)
+
+    def to_dict(self):
+        dict = {}
+        dict['instance_id'] = self.instance_id
+        dict['label_id'] = self.label_id
+        dict['vert_count'] = self.vert_count
+        dict['med_dist'] = self.med_dist
+        dict['dist_conf'] = self.dist_conf
+        return dict
+
+    def from_json(self, data):
+        self.instance_id = int(data['instance_id'])
+        self.label_id = int(data['label_id'])
+        self.vert_count = int(data['vert_count'])
+        if 'med_dist' in data:
+            self.med_dist = float(data['med_dist'])
+            self.dist_conf = float(data['dist_conf'])
+
+    def __str__(self):
+        return '(' + str(self.instance_id) + ')'
+
+
+def get_instances(ids, class_ids, class_labels, id2label):
+    """Transform gt instance mask to Instance objects.
+
+    Args:
+        ids (np.array): Instance ids for each point.
+        class_ids: (tuple[int]): Ids of valid categories.
+        class_labels (tuple[str]): Class names.
+        id2label: (dict[int, str]): Mapping of valid class id to class label.
+
+    Returns:
+        dict [str, list]: Instance objects grouped by class label.
+    """
+    instances = {}
+    for label in class_labels:
+        instances[label] = []
+    instance_ids = np.unique(ids)
+    for id in instance_ids:
+        if id == 0:
+            continue
+        inst = Instance(ids, id)
+        if inst.label_id in class_ids:
+            instances[id2label[inst.label_id]].append(inst.to_dict())
+    return instances
diff --git a/mmde/mmdet3d/evaluation/functional/seg_eval.py b/mmde/mmdet3d/evaluation/functional/seg_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78df1203435c72a419423ac813c3f994c348976
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/seg_eval.py
@@ -0,0 +1,134 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmengine.logging import print_log
+from terminaltables import AsciiTable
+
+
+def fast_hist(preds, labels, num_classes):
+    """Compute the confusion matrix for every batch.
+
+    Args:
+        preds (np.ndarray):  Prediction labels of points with shape of
+        (num_points, ).
+        labels (np.ndarray): Ground truth labels of points with shape of
+        (num_points, ).
+        num_classes (int): number of classes
+
+    Returns:
+        np.ndarray: Calculated confusion matrix.
+    """
+
+    k = (labels >= 0) & (labels < num_classes)
+    bin_count = np.bincount(
+        num_classes * labels[k].astype(int) + preds[k],
+        minlength=num_classes**2)
+    return bin_count[:num_classes**2].reshape(num_classes, num_classes)
+
+
+def per_class_iou(hist):
+    """Compute the per class iou.
+
+    Args:
+        hist(np.ndarray):  Overall confusion martix
+        (num_classes, num_classes ).
+
+    Returns:
+        np.ndarray: Calculated per class iou
+    """
+
+    return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
+
+
+def get_acc(hist):
+    """Compute the overall accuracy.
+
+    Args:
+        hist(np.ndarray):  Overall confusion martix
+        (num_classes, num_classes ).
+
+    Returns:
+        float: Calculated overall acc
+    """
+
+    return np.diag(hist).sum() / hist.sum()
+
+
+def get_acc_cls(hist):
+    """Compute the class average accuracy.
+
+    Args:
+        hist(np.ndarray):  Overall confusion martix
+        (num_classes, num_classes ).
+
+    Returns:
+        float: Calculated class average acc
+    """
+
+    return np.nanmean(np.diag(hist) / hist.sum(axis=1))
+
+
+def seg_eval(gt_labels, seg_preds, label2cat, ignore_index, logger=None):
+    """Semantic Segmentation  Evaluation.
+
+    Evaluate the result of the Semantic Segmentation.
+
+    Args:
+        gt_labels (list[torch.Tensor]): Ground truth labels.
+        seg_preds  (list[torch.Tensor]): Predictions.
+        label2cat (dict): Map from label to category name.
+        ignore_index (int): Index that will be ignored in evaluation.
+        logger (logging.Logger | str, optional): The way to print the mAP
+            summary. See `mmdet.utils.print_log()` for details. Default: None.
+
+    Returns:
+        dict[str, float]: Dict of results.
+    """
+    assert len(seg_preds) == len(gt_labels)
+    num_classes = len(label2cat)
+
+    hist_list = []
+    for i in range(len(gt_labels)):
+        gt_seg = gt_labels[i].astype(np.int64)
+        pred_seg = seg_preds[i].astype(np.int64)
+
+        # filter out ignored points
+        pred_seg[gt_seg == ignore_index] = -1
+        gt_seg[gt_seg == ignore_index] = -1
+
+        # calculate one instance result
+        hist_list.append(fast_hist(pred_seg, gt_seg, num_classes))
+
+    iou = per_class_iou(sum(hist_list))
+    # if ignore_index is in iou, replace it with nan
+    if ignore_index < len(iou):
+        iou[ignore_index] = np.nan
+    miou = np.nanmean(iou)
+    acc = get_acc(sum(hist_list))
+    acc_cls = get_acc_cls(sum(hist_list))
+
+    header = ['classes']
+    for i in range(len(label2cat)):
+        header.append(label2cat[i])
+    header.extend(['miou', 'acc', 'acc_cls'])
+
+    ret_dict = dict()
+    table_columns = [['results']]
+    for i in range(len(label2cat)):
+        ret_dict[label2cat[i]] = float(iou[i])
+        table_columns.append([f'{iou[i]:.4f}'])
+    ret_dict['miou'] = float(miou)
+    ret_dict['acc'] = float(acc)
+    ret_dict['acc_cls'] = float(acc_cls)
+
+    table_columns.append([f'{miou:.4f}'])
+    table_columns.append([f'{acc:.4f}'])
+    table_columns.append([f'{acc_cls:.4f}'])
+
+    table_data = [header]
+    table_rows = list(zip(*table_columns))
+    table_data += table_rows
+    table = AsciiTable(table_data)
+    table.inner_footing_row_border = True
+    print_log('\n' + table.table, logger=logger)
+
+    return ret_dict
diff --git a/mmde/mmdet3d/evaluation/functional/waymo_utils/__init__.py b/mmde/mmdet3d/evaluation/functional/waymo_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..722fdc406e6c4e3a0080356147520573b155fc41
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/waymo_utils/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from .prediction_to_waymo import Prediction2Waymo
+
+__all__ = ['Prediction2Waymo']
diff --git a/mmde/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py b/mmde/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c79d6f6cb063c9fbdcc28edcc8fdf6b52f583bb
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/functional/waymo_utils/prediction_to_waymo.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+r"""Adapted from `Waymo to KITTI converter
+    <https://github.com/caizhongang/waymo_kitti_converter>`_.
+"""
+
+try:
+    from waymo_open_dataset import label_pb2
+    from waymo_open_dataset.protos import metrics_pb2
+    from waymo_open_dataset.protos.metrics_pb2 import Objects
+except ImportError:
+    Objects = None
+    raise ImportError(
+        'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" '
+        'to install the official devkit first.')
+
+from typing import List
+
+import mmengine
+from mmengine import print_log
+
+
+class Prediction2Waymo(object):
+    """Predictions to Waymo converter. The format of prediction results could
+    be original format or kitti-format.
+
+    This class serves as the converter to change predictions from KITTI to
+    Waymo format.
+
+    Args:
+        results (list[dict]): Prediction results.
+        waymo_results_save_dir (str): Directory to save converted predictions
+            in waymo format (.bin files).
+        waymo_results_final_path (str): Path to save combined
+            predictions in waymo format (.bin file), like 'a/b/c.bin'.
+        num_workers (str): Number of parallel processes. Defaults to 4.
+    """
+
+    def __init__(self,
+                 results: List[dict],
+                 waymo_results_final_path: str,
+                 classes: dict,
+                 num_workers: int = 4):
+        self.results = results
+        self.waymo_results_final_path = waymo_results_final_path
+        self.classes = classes
+        self.num_workers = num_workers
+
+        self.k2w_cls_map = {
+            'Car': label_pb2.Label.TYPE_VEHICLE,
+            'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN,
+            'Sign': label_pb2.Label.TYPE_SIGN,
+            'Cyclist': label_pb2.Label.TYPE_CYCLIST,
+        }
+
+    def convert_one(self, res_idx: int):
+        """Convert action for single file. It read the metainfo from the
+        preprocessed file offline and will be faster.
+
+        Args:
+            res_idx (int): The indices of the results.
+        """
+        sample_idx = self.results[res_idx]['sample_idx']
+        if len(self.results[res_idx]['labels_3d']) > 0:
+            objects = self.parse_objects_from_origin(
+                self.results[res_idx], self.results[res_idx]['context_name'],
+                self.results[res_idx]['timestamp'])
+        else:
+            print(sample_idx, 'not found.')
+            objects = metrics_pb2.Objects()
+
+        return objects
+
+    def parse_objects_from_origin(self, result: dict, contextname: str,
+                                  timestamp: str) -> Objects:
+        """Parse obejcts from the original prediction results.
+
+        Args:
+            result (dict): The original prediction results.
+            contextname (str): The ``contextname`` of sample in waymo.
+            timestamp (str): The ``timestamp`` of sample in waymo.
+
+        Returns:
+            metrics_pb2.Objects: The parsed object.
+        """
+        lidar_boxes = result['bboxes_3d']
+        scores = result['scores_3d']
+        labels = result['labels_3d']
+
+        objects = metrics_pb2.Objects()
+        for lidar_box, score, label in zip(lidar_boxes, scores, labels):
+            # Parse one object
+            box = label_pb2.Label.Box()
+            height = lidar_box[5]
+            heading = lidar_box[6]
+
+            box.center_x = lidar_box[0]
+            box.center_y = lidar_box[1]
+            box.center_z = lidar_box[2] + height / 2
+            box.length = lidar_box[3]
+            box.width = lidar_box[4]
+            box.height = height
+            box.heading = heading
+
+            object = metrics_pb2.Object()
+            object.object.box.CopyFrom(box)
+
+            class_name = self.classes[label]
+            object.object.type = self.k2w_cls_map[class_name]
+            object.score = score
+            object.context_name = contextname
+            object.frame_timestamp_micros = timestamp
+            objects.objects.append(object)
+
+        return objects
+
+    def convert(self):
+        """Convert action."""
+        print_log('Start converting ...', logger='current')
+
+        # TODO: use parallel processes.
+        # objects_list = mmengine.track_parallel_progress(
+        #     self.convert_one, range(len(self)), self.num_workers)
+
+        objects_list = mmengine.track_progress(self.convert_one,
+                                               range(len(self)))
+
+        combined = metrics_pb2.Objects()
+        for objects in objects_list:
+            for o in objects.objects:
+                combined.objects.append(o)
+
+        with open(self.waymo_results_final_path, 'wb') as f:
+            f.write(combined.SerializeToString())
+
+    def __len__(self):
+        """Length of the filename list."""
+        return len(self.results)
diff --git a/mmde/mmdet3d/evaluation/metrics/__init__.py b/mmde/mmdet3d/evaluation/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4dae15fe54193fde1ed7ebf057cdd069afff1d5
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/metrics/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .indoor_metric import IndoorMetric  # noqa: F401,F403
+from .instance_seg_metric import InstanceSegMetric  # noqa: F401,F403
+from .kitti_metric import KittiMetric  # noqa: F401,F403
+from .lyft_metric import LyftMetric  # noqa: F401,F403
+from .nuscenes_metric import NuScenesMetric  # noqa: F401,F403
+from .panoptic_seg_metric import PanopticSegMetric  # noqa: F401,F403
+from .seg_metric import SegMetric  # noqa: F401,F403
+from .waymo_metric import WaymoMetric  # noqa: F401,F403
+
+__all__ = [
+    'KittiMetric', 'NuScenesMetric', 'IndoorMetric', 'LyftMetric', 'SegMetric',
+    'InstanceSegMetric', 'WaymoMetric', 'PanopticSegMetric'
+]
diff --git a/mmde/mmdet3d/evaluation/metrics/indoor_metric.py b/mmde/mmdet3d/evaluation/metrics/indoor_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..987e641a333410d2b01b026e8dd242e896e60d44
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/metrics/indoor_metric.py
@@ -0,0 +1,169 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+from typing import Dict, List, Optional, Sequence, Union
+
+import numpy as np
+from mmdet.evaluation import eval_map
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger
+
+from mmdet3d.evaluation import indoor_eval
+from mmdet3d.registry import METRICS
+from mmdet3d.structures import get_box_type
+
+
+@METRICS.register_module()
+class IndoorMetric(BaseMetric):
+    """Indoor scene evaluation metric.
+
+    Args:
+        iou_thr (float or List[float]): List of iou threshold when calculate
+            the metric. Defaults to [0.25, 0.5].
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix will
+            be used instead. Defaults to None.
+    """
+
+    def __init__(self,
+                 iou_thr: List[float] = [0.25, 0.5],
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super(IndoorMetric, self).__init__(
+            prefix=prefix, collect_device=collect_device)
+        self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        for data_sample in data_samples:
+            pred_3d = data_sample['pred_instances_3d']
+            eval_ann_info = data_sample['eval_ann_info']
+            cpu_pred_3d = dict()
+            for k, v in pred_3d.items():
+                if hasattr(v, 'to'):
+                    cpu_pred_3d[k] = v.to('cpu')
+                else:
+                    cpu_pred_3d[k] = v
+            self.results.append((eval_ann_info, cpu_pred_3d))
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        ann_infos = []
+        pred_results = []
+
+        for eval_ann, sinlge_pred_results in results:
+            ann_infos.append(eval_ann)
+            pred_results.append(sinlge_pred_results)
+
+        # some checkpoints may not record the key "box_type_3d"
+        box_type_3d, box_mode_3d = get_box_type(
+            self.dataset_meta.get('box_type_3d', 'depth'))
+
+        ret_dict = indoor_eval(
+            ann_infos,
+            pred_results,
+            self.iou_thr,
+            self.dataset_meta['classes'],
+            logger=logger,
+            box_mode_3d=box_mode_3d)
+
+        return ret_dict
+
+
+@METRICS.register_module()
+class Indoor2DMetric(BaseMetric):
+    """indoor 2d predictions evaluation metric.
+
+    Args:
+        iou_thr (float or List[float]): List of iou threshold when calculate
+            the metric. Defaults to [0.5].
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix will
+            be used instead. Defaults to None.
+    """
+
+    def __init__(self,
+                 iou_thr: Union[float, List[float]] = [0.5],
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None):
+        super(Indoor2DMetric, self).__init__(
+            prefix=prefix, collect_device=collect_device)
+        self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        for data_sample in data_samples:
+            pred = data_sample['pred_instances']
+            eval_ann_info = data_sample['eval_ann_info']
+            ann = dict(
+                labels=eval_ann_info['gt_bboxes_labels'],
+                bboxes=eval_ann_info['gt_bboxes'])
+
+            pred_bboxes = pred['bboxes'].cpu().numpy()
+            pred_scores = pred['scores'].cpu().numpy()
+            pred_labels = pred['labels'].cpu().numpy()
+
+            dets = []
+            for label in range(len(self.dataset_meta['classes'])):
+                index = np.where(pred_labels == label)[0]
+                pred_bbox_scores = np.hstack(
+                    [pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
+                dets.append(pred_bbox_scores)
+
+            self.results.append((ann, dets))
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        annotations, preds = zip(*results)
+        eval_results = OrderedDict()
+        for iou_thr_2d_single in self.iou_thr:
+            mean_ap, _ = eval_map(
+                preds,
+                annotations,
+                scale_ranges=None,
+                iou_thr=iou_thr_2d_single,
+                dataset=self.dataset_meta['classes'],
+                logger=logger)
+            eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
+        return eval_results
diff --git a/mmde/mmdet3d/evaluation/metrics/instance_seg_metric.py b/mmde/mmdet3d/evaluation/metrics/instance_seg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..95b1e51c42f90ac43edd9f76a96c32c052c02b48
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/metrics/instance_seg_metric.py
@@ -0,0 +1,91 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Sequence
+
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger
+
+from mmdet3d.evaluation import instance_seg_eval
+from mmdet3d.registry import METRICS
+
+
+@METRICS.register_module()
+class InstanceSegMetric(BaseMetric):
+    """3D instance segmentation evaluation metric.
+
+    Args:
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix will
+            be used instead. Defaults to None.
+    """
+
+    def __init__(self,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None):
+        super(InstanceSegMetric, self).__init__(
+            prefix=prefix, collect_device=collect_device)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        for data_sample in data_samples:
+            pred_3d = data_sample['pred_pts_seg']
+            eval_ann_info = data_sample['eval_ann_info']
+            cpu_pred_3d = dict()
+            for k, v in pred_3d.items():
+                if hasattr(v, 'to'):
+                    cpu_pred_3d[k] = v.to('cpu')
+                else:
+                    cpu_pred_3d[k] = v
+            self.results.append((eval_ann_info, cpu_pred_3d))
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        self.classes = self.dataset_meta['classes']
+        self.valid_class_ids = self.dataset_meta['seg_valid_class_ids']
+
+        gt_semantic_masks = []
+        gt_instance_masks = []
+        pred_instance_masks = []
+        pred_instance_labels = []
+        pred_instance_scores = []
+
+        for eval_ann, sinlge_pred_results in results:
+            gt_semantic_masks.append(eval_ann['pts_semantic_mask'])
+            gt_instance_masks.append(eval_ann['pts_instance_mask'])
+            pred_instance_masks.append(
+                sinlge_pred_results['pts_instance_mask'])
+            pred_instance_labels.append(sinlge_pred_results['instance_labels'])
+            pred_instance_scores.append(sinlge_pred_results['instance_scores'])
+
+        ret_dict = instance_seg_eval(
+            gt_semantic_masks,
+            gt_instance_masks,
+            pred_instance_masks,
+            pred_instance_labels,
+            pred_instance_scores,
+            valid_class_ids=self.valid_class_ids,
+            class_labels=self.classes,
+            logger=logger)
+
+        return ret_dict
diff --git a/mmde/mmdet3d/evaluation/metrics/kitti_metric.py b/mmde/mmdet3d/evaluation/metrics/kitti_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0188042784f6520ef3e5a4119d3b57847374a94
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/metrics/kitti_metric.py
@@ -0,0 +1,650 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import tempfile
+from os import path as osp
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import mmengine
+import numpy as np
+import torch
+from mmengine import load
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger, print_log
+
+from mmdet3d.evaluation import kitti_eval
+from mmdet3d.registry import METRICS
+from mmdet3d.structures import (Box3DMode, CameraInstance3DBoxes,
+                                LiDARInstance3DBoxes, points_cam2img)
+
+
+@METRICS.register_module()
+class KittiMetric(BaseMetric):
+    """Kitti evaluation metric.
+
+    Args:
+        ann_file (str): Annotation file path.
+        metric (str or List[str]): Metrics to be evaluated. Defaults to 'bbox'.
+        pcd_limit_range (List[float]): The range of point cloud used to filter
+            invalid predicted boxes. Defaults to [0, -40, -3, 70.4, 40, 0.0].
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix will
+            be used instead. Defaults to None.
+        pklfile_prefix (str, optional): The prefix of pkl files, including the
+            file path and the prefix of filename, e.g., "a/b/prefix". If not
+            specified, a temp file will be created. Defaults to None.
+        default_cam_key (str): The default camera for lidar to camera
+            conversion. By default, KITTI: 'CAM2', Waymo: 'CAM_FRONT'.
+            Defaults to 'CAM2'.
+        format_only (bool): Format the output results without perform
+            evaluation. It is useful when you want to format the result to a
+            specific format and submit it to the test server.
+            Defaults to False.
+        submission_prefix (str, optional): The prefix of submission data. If
+            not specified, the submission data will not be generated.
+            Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(self,
+                 ann_file: str,
+                 metric: Union[str, List[str]] = 'bbox',
+                 pcd_limit_range: List[float] = [0, -40, -3, 70.4, 40, 0.0],
+                 prefix: Optional[str] = None,
+                 pklfile_prefix: Optional[str] = None,
+                 default_cam_key: str = 'CAM2',
+                 format_only: bool = False,
+                 submission_prefix: Optional[str] = None,
+                 collect_device: str = 'cpu',
+                 backend_args: Optional[dict] = None) -> None:
+        self.default_prefix = 'Kitti metric'
+        super(KittiMetric, self).__init__(
+            collect_device=collect_device, prefix=prefix)
+        self.pcd_limit_range = pcd_limit_range
+        self.ann_file = ann_file
+        self.pklfile_prefix = pklfile_prefix
+        self.format_only = format_only
+        if self.format_only:
+            assert submission_prefix is not None, 'submission_prefix must be '
+            'not None when format_only is True, otherwise the result files '
+            'will be saved to a temp directory which will be cleaned up at '
+            'the end.'
+
+        self.submission_prefix = submission_prefix
+        self.default_cam_key = default_cam_key
+        self.backend_args = backend_args
+
+        allowed_metrics = ['bbox', 'img_bbox', 'mAP', 'LET_mAP']
+        self.metrics = metric if isinstance(metric, list) else [metric]
+        for metric in self.metrics:
+            if metric not in allowed_metrics:
+                raise KeyError("metric should be one of 'bbox', 'img_bbox', "
+                               f'but got {metric}.')
+
+    def convert_annos_to_kitti_annos(self, data_infos: dict) -> List[dict]:
+        """Convert loading annotations to Kitti annotations.
+
+        Args:
+            data_infos (dict): Data infos including metainfo and annotations
+                loaded from ann_file.
+
+        Returns:
+            List[dict]: List of Kitti annotations.
+        """
+        data_annos = data_infos['data_list']
+        if not self.format_only:
+            cat2label = data_infos['metainfo']['categories']
+            label2cat = dict((v, k) for (k, v) in cat2label.items())
+            assert 'instances' in data_annos[0]
+            for i, annos in enumerate(data_annos):
+                if len(annos['instances']) == 0:
+                    kitti_annos = {
+                        'name': np.array([]),
+                        'truncated': np.array([]),
+                        'occluded': np.array([]),
+                        'alpha': np.array([]),
+                        'bbox': np.zeros([0, 4]),
+                        'dimensions': np.zeros([0, 3]),
+                        'location': np.zeros([0, 3]),
+                        'rotation_y': np.array([]),
+                        'score': np.array([]),
+                    }
+                else:
+                    kitti_annos = {
+                        'name': [],
+                        'truncated': [],
+                        'occluded': [],
+                        'alpha': [],
+                        'bbox': [],
+                        'location': [],
+                        'dimensions': [],
+                        'rotation_y': [],
+                        'score': []
+                    }
+                    for instance in annos['instances']:
+                        label = instance['bbox_label']
+                        kitti_annos['name'].append(label2cat[label])
+                        kitti_annos['truncated'].append(instance['truncated'])
+                        kitti_annos['occluded'].append(instance['occluded'])
+                        kitti_annos['alpha'].append(instance['alpha'])
+                        kitti_annos['bbox'].append(instance['bbox'])
+                        kitti_annos['location'].append(instance['bbox_3d'][:3])
+                        kitti_annos['dimensions'].append(
+                            instance['bbox_3d'][3:6])
+                        kitti_annos['rotation_y'].append(
+                            instance['bbox_3d'][6])
+                        kitti_annos['score'].append(instance['score'])
+                    for name in kitti_annos:
+                        kitti_annos[name] = np.array(kitti_annos[name])
+                data_annos[i]['kitti_annos'] = kitti_annos
+        return data_annos
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+
+        for data_sample in data_samples:
+            result = dict()
+            pred_3d = data_sample['pred_instances_3d']
+            pred_2d = data_sample['pred_instances']
+            for attr_name in pred_3d:
+                pred_3d[attr_name] = pred_3d[attr_name].to('cpu')
+            result['pred_instances_3d'] = pred_3d
+            for attr_name in pred_2d:
+                pred_2d[attr_name] = pred_2d[attr_name].to('cpu')
+            result['pred_instances'] = pred_2d
+            sample_idx = data_sample['sample_idx']
+            result['sample_idx'] = sample_idx
+            self.results.append(result)
+
+    def compute_metrics(self, results: List[dict]) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (List[dict]): The processed results of the whole dataset.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        self.classes = self.dataset_meta['classes']
+
+        # load annotations
+        pkl_infos = load(self.ann_file, backend_args=self.backend_args)
+        self.data_infos = self.convert_annos_to_kitti_annos(pkl_infos)
+        result_dict, tmp_dir = self.format_results(
+            results,
+            pklfile_prefix=self.pklfile_prefix,
+            submission_prefix=self.submission_prefix,
+            classes=self.classes)
+
+        metric_dict = {}
+
+        if self.format_only:
+            logger.info(
+                f'results are saved in {osp.dirname(self.submission_prefix)}')
+            return metric_dict
+
+        gt_annos = [
+            self.data_infos[result['sample_idx']]['kitti_annos']
+            for result in results
+        ]
+
+        for metric in self.metrics:
+            ap_dict = self.kitti_evaluate(
+                result_dict,
+                gt_annos,
+                metric=metric,
+                logger=logger,
+                classes=self.classes)
+            for result in ap_dict:
+                metric_dict[result] = ap_dict[result]
+
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+        return metric_dict
+
+    def kitti_evaluate(self,
+                       results_dict: dict,
+                       gt_annos: List[dict],
+                       metric: Optional[str] = None,
+                       classes: Optional[List[str]] = None,
+                       logger: Optional[MMLogger] = None) -> Dict[str, float]:
+        """Evaluation in KITTI protocol.
+
+        Args:
+            results_dict (dict): Formatted results of the dataset.
+            gt_annos (List[dict]): Contain gt information of each sample.
+            metric (str, optional): Metrics to be evaluated. Defaults to None.
+            classes (List[str], optional): A list of class name.
+                Defaults to None.
+            logger (MMLogger, optional): Logger used for printing related
+                information during evaluation. Defaults to None.
+
+        Returns:
+            Dict[str, float]: Results of each evaluation metric.
+        """
+        ap_dict = dict()
+        for name in results_dict:
+            if name == 'pred_instances' or metric == 'img_bbox':
+                eval_types = ['bbox']
+            else:
+                eval_types = ['bbox', 'bev', '3d']
+            ap_result_str, ap_dict_ = kitti_eval(
+                gt_annos, results_dict[name], classes, eval_types=eval_types)
+            for ap_type, ap in ap_dict_.items():
+                ap_dict[f'{name}/{ap_type}'] = float(f'{ap:.4f}')
+
+            print_log(f'Results of {name}:\n' + ap_result_str, logger=logger)
+
+        return ap_dict
+
+    def format_results(
+        self,
+        results: List[dict],
+        pklfile_prefix: Optional[str] = None,
+        submission_prefix: Optional[str] = None,
+        classes: Optional[List[str]] = None
+    ) -> Tuple[dict, Union[tempfile.TemporaryDirectory, None]]:
+        """Format the results to pkl file.
+
+        Args:
+            results (List[dict]): Testing results of the dataset.
+            pklfile_prefix (str, optional): The prefix of pkl files. It
+                includes the file path and the prefix of filename, e.g.,
+                "a/b/prefix". If not specified, a temp file will be created.
+                Defaults to None.
+            submission_prefix (str, optional): The prefix of submitted files.
+                It includes the file path and the prefix of filename, e.g.,
+                "a/b/prefix". If not specified, a temp file will be created.
+                Defaults to None.
+            classes (List[str], optional): A list of class name.
+                Defaults to None.
+
+        Returns:
+            tuple: (result_dict, tmp_dir), result_dict is a dict containing the
+            formatted result, tmp_dir is the temporal directory created for
+            saving json files when jsonfile_prefix is not specified.
+        """
+        if pklfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            pklfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            tmp_dir = None
+        result_dict = dict()
+        sample_idx_list = [result['sample_idx'] for result in results]
+        for name in results[0]:
+            if submission_prefix is not None:
+                submission_prefix_ = osp.join(submission_prefix, name)
+            else:
+                submission_prefix_ = None
+            if pklfile_prefix is not None:
+                pklfile_prefix_ = osp.join(pklfile_prefix, name) + '.pkl'
+            else:
+                pklfile_prefix_ = None
+            if 'pred_instances' in name and '3d' in name and name[
+                    0] != '_' and results[0][name]:
+                net_outputs = [result[name] for result in results]
+                result_list_ = self.bbox2result_kitti(net_outputs,
+                                                      sample_idx_list, classes,
+                                                      pklfile_prefix_,
+                                                      submission_prefix_)
+                result_dict[name] = result_list_
+            elif name == 'pred_instances' and name[0] != '_' and results[0][
+                    name]:
+                net_outputs = [result[name] for result in results]
+                result_list_ = self.bbox2result_kitti2d(
+                    net_outputs, sample_idx_list, classes, pklfile_prefix_,
+                    submission_prefix_)
+                result_dict[name] = result_list_
+        return result_dict, tmp_dir
+
+    def bbox2result_kitti(
+            self,
+            net_outputs: List[dict],
+            sample_idx_list: List[int],
+            class_names: List[str],
+            pklfile_prefix: Optional[str] = None,
+            submission_prefix: Optional[str] = None) -> List[dict]:
+        """Convert 3D detection results to kitti format for evaluation and test
+        submission.
+
+        Args:
+            net_outputs (List[dict]): List of dict storing the inferenced
+                bounding boxes and scores.
+            sample_idx_list (List[int]): List of input sample idx.
+            class_names (List[str]): A list of class names.
+            pklfile_prefix (str, optional): The prefix of pkl file.
+                Defaults to None.
+            submission_prefix (str, optional): The prefix of submission file.
+                Defaults to None.
+
+        Returns:
+            List[dict]: A list of dictionaries with the kitti format.
+        """
+        assert len(net_outputs) == len(self.data_infos), \
+            'invalid list length of network outputs'
+        if submission_prefix is not None:
+            mmengine.mkdir_or_exist(submission_prefix)
+
+        det_annos = []
+        print('\nConverting 3D prediction to KITTI format')
+        for idx, pred_dicts in enumerate(
+                mmengine.track_iter_progress(net_outputs)):
+            sample_idx = sample_idx_list[idx]
+            info = self.data_infos[sample_idx]
+            # Here default used 'CAM2' to compute metric. If you want to
+            # use another camera, please modify it.
+            image_shape = (info['images'][self.default_cam_key]['height'],
+                           info['images'][self.default_cam_key]['width'])
+            box_dict = self.convert_valid_bboxes(pred_dicts, info)
+            anno = {
+                'name': [],
+                'truncated': [],
+                'occluded': [],
+                'alpha': [],
+                'bbox': [],
+                'dimensions': [],
+                'location': [],
+                'rotation_y': [],
+                'score': []
+            }
+            if len(box_dict['bbox']) > 0:
+                box_2d_preds = box_dict['bbox']
+                box_preds = box_dict['box3d_camera']
+                scores = box_dict['scores']
+                box_preds_lidar = box_dict['box3d_lidar']
+                label_preds = box_dict['label_preds']
+                pred_box_type_3d = box_dict['pred_box_type_3d']
+
+                for box, box_lidar, bbox, score, label in zip(
+                        box_preds, box_preds_lidar, box_2d_preds, scores,
+                        label_preds):
+                    bbox[2:] = np.minimum(bbox[2:], image_shape[::-1])
+                    bbox[:2] = np.maximum(bbox[:2], [0, 0])
+                    anno['name'].append(class_names[int(label)])
+                    anno['truncated'].append(0.0)
+                    anno['occluded'].append(0)
+                    if pred_box_type_3d == CameraInstance3DBoxes:
+                        anno['alpha'].append(-np.arctan2(box[0], box[2]) +
+                                             box[6])
+                    elif pred_box_type_3d == LiDARInstance3DBoxes:
+                        anno['alpha'].append(
+                            -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6])
+                    anno['bbox'].append(bbox)
+                    anno['dimensions'].append(box[3:6])
+                    anno['location'].append(box[:3])
+                    anno['rotation_y'].append(box[6])
+                    anno['score'].append(score)
+
+                anno = {k: np.stack(v) for k, v in anno.items()}
+            else:
+                anno = {
+                    'name': np.array([]),
+                    'truncated': np.array([]),
+                    'occluded': np.array([]),
+                    'alpha': np.array([]),
+                    'bbox': np.zeros([0, 4]),
+                    'dimensions': np.zeros([0, 3]),
+                    'location': np.zeros([0, 3]),
+                    'rotation_y': np.array([]),
+                    'score': np.array([]),
+                }
+
+            if submission_prefix is not None:
+                curr_file = f'{submission_prefix}/{sample_idx:06d}.txt'
+                with open(curr_file, 'w') as f:
+                    bbox = anno['bbox']
+                    loc = anno['location']
+                    dims = anno['dimensions']  # lhw -> hwl
+
+                    for idx in range(len(bbox)):
+                        print(
+                            '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} '
+                            '{:.4f} {:.4f} {:.4f} '
+                            '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(
+                                anno['name'][idx], anno['alpha'][idx],
+                                bbox[idx][0], bbox[idx][1], bbox[idx][2],
+                                bbox[idx][3], dims[idx][1], dims[idx][2],
+                                dims[idx][0], loc[idx][0], loc[idx][1],
+                                loc[idx][2], anno['rotation_y'][idx],
+                                anno['score'][idx]),
+                            file=f)
+
+            anno['sample_idx'] = np.array(
+                [sample_idx] * len(anno['score']), dtype=np.int64)
+
+            det_annos.append(anno)
+
+        if pklfile_prefix is not None:
+            if not pklfile_prefix.endswith(('.pkl', '.pickle')):
+                out = f'{pklfile_prefix}.pkl'
+            else:
+                out = pklfile_prefix
+            mmengine.dump(det_annos, out)
+            print(f'Result is saved to {out}.')
+
+        return det_annos
+
+    def bbox2result_kitti2d(
+            self,
+            net_outputs: List[dict],
+            sample_idx_list: List[int],
+            class_names: List[str],
+            pklfile_prefix: Optional[str] = None,
+            submission_prefix: Optional[str] = None) -> List[dict]:
+        """Convert 2D detection results to kitti format for evaluation and test
+        submission.
+
+        Args:
+            net_outputs (List[dict]): List of dict storing the inferenced
+                bounding boxes and scores.
+            sample_idx_list (List[int]): List of input sample idx.
+            class_names (List[str]): A list of class names.
+            pklfile_prefix (str, optional): The prefix of pkl file.
+                Defaults to None.
+            submission_prefix (str, optional): The prefix of submission file.
+                Defaults to None.
+
+        Returns:
+            List[dict]: A list of dictionaries with the kitti format.
+        """
+        assert len(net_outputs) == len(self.data_infos), \
+            'invalid list length of network outputs'
+        det_annos = []
+        print('\nConverting 2D prediction to KITTI format')
+        for i, bboxes_per_sample in enumerate(
+                mmengine.track_iter_progress(net_outputs)):
+            anno = dict(
+                name=[],
+                truncated=[],
+                occluded=[],
+                alpha=[],
+                bbox=[],
+                dimensions=[],
+                location=[],
+                rotation_y=[],
+                score=[])
+            sample_idx = sample_idx_list[i]
+
+            num_example = 0
+            bbox = bboxes_per_sample['bboxes']
+            for i in range(bbox.shape[0]):
+                anno['name'].append(class_names[int(
+                    bboxes_per_sample['labels'][i])])
+                anno['truncated'].append(0.0)
+                anno['occluded'].append(0)
+                anno['alpha'].append(0.0)
+                anno['bbox'].append(bbox[i, :4])
+                # set dimensions (height, width, length) to zero
+                anno['dimensions'].append(
+                    np.zeros(shape=[3], dtype=np.float32))
+                # set the 3D translation to (-1000, -1000, -1000)
+                anno['location'].append(
+                    np.ones(shape=[3], dtype=np.float32) * (-1000.0))
+                anno['rotation_y'].append(0.0)
+                anno['score'].append(bboxes_per_sample['scores'][i])
+                num_example += 1
+
+            if num_example == 0:
+                anno = dict(
+                    name=np.array([]),
+                    truncated=np.array([]),
+                    occluded=np.array([]),
+                    alpha=np.array([]),
+                    bbox=np.zeros([0, 4]),
+                    dimensions=np.zeros([0, 3]),
+                    location=np.zeros([0, 3]),
+                    rotation_y=np.array([]),
+                    score=np.array([]),
+                )
+            else:
+                anno = {k: np.stack(v) for k, v in anno.items()}
+
+            anno['sample_idx'] = np.array(
+                [sample_idx] * num_example, dtype=np.int64)
+            det_annos.append(anno)
+
+        if pklfile_prefix is not None:
+            if not pklfile_prefix.endswith(('.pkl', '.pickle')):
+                out = f'{pklfile_prefix}.pkl'
+            else:
+                out = pklfile_prefix
+            mmengine.dump(det_annos, out)
+            print(f'Result is saved to {out}.')
+
+        if submission_prefix is not None:
+            # save file in submission format
+            mmengine.mkdir_or_exist(submission_prefix)
+            print(f'Saving KITTI submission to {submission_prefix}')
+            for i, anno in enumerate(det_annos):
+                sample_idx = sample_idx_list[i]
+                cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt'
+                with open(cur_det_file, 'w') as f:
+                    bbox = anno['bbox']
+                    loc = anno['location']
+                    dims = anno['dimensions'][::-1]  # lhw -> hwl
+                    for idx in range(len(bbox)):
+                        print(
+                            '{} -1 -1 {:4f} {:4f} {:4f} {:4f} {:4f} {:4f} '
+                            '{:4f} {:4f} {:4f} {:4f} {:4f} {:4f} {:4f}'.format(
+                                anno['name'][idx],
+                                anno['alpha'][idx],
+                                *bbox[idx],  # 4 float
+                                *dims[idx],  # 3 float
+                                *loc[idx],  # 3 float
+                                anno['rotation_y'][idx],
+                                anno['score'][idx]),
+                            file=f,
+                        )
+            print(f'Result is saved to {submission_prefix}')
+
+        return det_annos
+
+    def convert_valid_bboxes(self, box_dict: dict, info: dict) -> dict:
+        """Convert the predicted boxes into valid ones.
+
+        Args:
+            box_dict (dict): Box dictionaries to be converted.
+
+                - bboxes_3d (:obj:`BaseInstance3DBoxes`): 3D bounding boxes.
+                - scores_3d (Tensor): Scores of boxes.
+                - labels_3d (Tensor): Class labels of boxes.
+            info (dict): Data info.
+
+        Returns:
+            dict: Valid predicted boxes.
+
+            - bbox (np.ndarray): 2D bounding boxes.
+            - box3d_camera (np.ndarray): 3D bounding boxes in
+              camera coordinate.
+            - box3d_lidar (np.ndarray): 3D bounding boxes in
+              LiDAR coordinate.
+            - scores (np.ndarray): Scores of boxes.
+            - label_preds (np.ndarray): Class label predictions.
+            - sample_idx (int): Sample index.
+        """
+        # TODO: refactor this function
+        box_preds = box_dict['bboxes_3d']
+        scores = box_dict['scores_3d']
+        labels = box_dict['labels_3d']
+        sample_idx = info['sample_idx']
+        box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
+
+        if len(box_preds) == 0:
+            return dict(
+                bbox=np.zeros([0, 4]),
+                box3d_camera=np.zeros([0, 7]),
+                box3d_lidar=np.zeros([0, 7]),
+                scores=np.zeros([0]),
+                label_preds=np.zeros([0, 4]),
+                sample_idx=sample_idx)
+        # Here default used 'CAM2' to compute metric. If you want to
+        # use another camera, please modify it.
+        lidar2cam = np.array(
+            info['images'][self.default_cam_key]['lidar2cam']).astype(
+                np.float32)
+        P2 = np.array(info['images'][self.default_cam_key]['cam2img']).astype(
+            np.float32)
+        img_shape = (info['images'][self.default_cam_key]['height'],
+                     info['images'][self.default_cam_key]['width'])
+        P2 = box_preds.tensor.new_tensor(P2)
+
+        if isinstance(box_preds, LiDARInstance3DBoxes):
+            box_preds_camera = box_preds.convert_to(Box3DMode.CAM, lidar2cam)
+            box_preds_lidar = box_preds
+        elif isinstance(box_preds, CameraInstance3DBoxes):
+            box_preds_camera = box_preds
+            box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR,
+                                                   np.linalg.inv(lidar2cam))
+
+        box_corners = box_preds_camera.corners
+        box_corners_in_image = points_cam2img(box_corners, P2)
+        # box_corners_in_image: [N, 8, 2]
+        minxy = torch.min(box_corners_in_image, dim=1)[0]
+        maxxy = torch.max(box_corners_in_image, dim=1)[0]
+        box_2d_preds = torch.cat([minxy, maxxy], dim=1)
+        # Post-processing
+        # check box_preds_camera
+        image_shape = box_preds.tensor.new_tensor(img_shape)
+        valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) &
+                          (box_2d_preds[:, 1] < image_shape[0]) &
+                          (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
+        # check box_preds_lidar
+        if isinstance(box_preds, LiDARInstance3DBoxes):
+            limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
+            valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) &
+                              (box_preds_lidar.center < limit_range[3:]))
+            valid_inds = valid_cam_inds & valid_pcd_inds.all(-1)
+        else:
+            valid_inds = valid_cam_inds
+
+        if valid_inds.sum() > 0:
+            return dict(
+                bbox=box_2d_preds[valid_inds, :].numpy(),
+                pred_box_type_3d=type(box_preds),
+                box3d_camera=box_preds_camera[valid_inds].numpy(),
+                box3d_lidar=box_preds_lidar[valid_inds].numpy(),
+                scores=scores[valid_inds].numpy(),
+                label_preds=labels[valid_inds].numpy(),
+                sample_idx=sample_idx)
+        else:
+            return dict(
+                bbox=np.zeros([0, 4]),
+                pred_box_type_3d=type(box_preds),
+                box3d_camera=np.zeros([0, 7]),
+                box3d_lidar=np.zeros([0, 7]),
+                scores=np.zeros([0]),
+                label_preds=np.zeros([0]),
+                sample_idx=sample_idx)
diff --git a/mmde/mmdet3d/evaluation/metrics/lyft_metric.py b/mmde/mmdet3d/evaluation/metrics/lyft_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..176e5aac539b4d535577dda9c87b477389d93d12
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/metrics/lyft_metric.py
@@ -0,0 +1,412 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import tempfile
+from os import path as osp
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import mmengine
+import numpy as np
+import pandas as pd
+from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft
+from lyft_dataset_sdk.utils.data_classes import Box as LyftBox
+from mmengine import load
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger
+from pyquaternion import Quaternion
+
+from mmdet3d.evaluation import lyft_eval
+from mmdet3d.registry import METRICS
+
+
+@METRICS.register_module()
+class LyftMetric(BaseMetric):
+    """Lyft evaluation metric.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        metric (str or List[str]): Metrics to be evaluated. Defaults to 'bbox'.
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=False, use_lidar=True).
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix will
+            be used instead. Defaults to None.
+        jsonfile_prefix (str, optional): The prefix of json files including the
+            file path and the prefix of filename, e.g., "a/b/prefix". If not
+            specified, a temp file will be created. Defaults to None.
+        format_only (bool): Format the output results without perform
+            evaluation. It is useful when you want to format the result to a
+            specific format and submit it to the test server.
+            Defaults to False.
+        csv_savepath (str, optional): The path for saving csv files. It
+            includes the file path and the csv filename, e.g.,
+            "a/b/filename.csv". If not specified, the result will not be
+            converted to csv file. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 metric: Union[str, List[str]] = 'bbox',
+                 modality=dict(
+                     use_camera=False,
+                     use_lidar=True,
+                 ),
+                 prefix: Optional[str] = None,
+                 jsonfile_prefix: str = None,
+                 format_only: bool = False,
+                 csv_savepath: str = None,
+                 collect_device: str = 'cpu',
+                 backend_args: Optional[dict] = None) -> None:
+        self.default_prefix = 'Lyft metric'
+        super(LyftMetric, self).__init__(
+            collect_device=collect_device, prefix=prefix)
+        self.ann_file = ann_file
+        self.data_root = data_root
+        self.modality = modality
+        self.jsonfile_prefix = jsonfile_prefix
+        self.format_only = format_only
+        if self.format_only:
+            assert csv_savepath is not None, 'csv_savepath must be not None '
+            'when format_only is True, otherwise the result files will be '
+            'saved to a temp directory which will be cleaned up at the end.'
+
+        self.backend_args = backend_args
+        self.csv_savepath = csv_savepath
+        self.metrics = metric if isinstance(metric, list) else [metric]
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and data_samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        for data_sample in data_samples:
+            result = dict()
+            pred_3d = data_sample['pred_instances_3d']
+            pred_2d = data_sample['pred_instances']
+            for attr_name in pred_3d:
+                pred_3d[attr_name] = pred_3d[attr_name].to('cpu')
+            result['pred_instances_3d'] = pred_3d
+            for attr_name in pred_2d:
+                pred_2d[attr_name] = pred_2d[attr_name].to('cpu')
+            result['pred_instances'] = pred_2d
+            sample_idx = data_sample['sample_idx']
+            result['sample_idx'] = sample_idx
+            self.results.append(result)
+
+    def compute_metrics(self, results: List[dict]) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (List[dict]): The processed results of the whole dataset.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        classes = self.dataset_meta['classes']
+        self.version = self.dataset_meta['version']
+
+        # load annotations
+        self.data_infos = load(
+            osp.join(self.data_root, self.ann_file),
+            backend_args=self.backend_args)['data_list']
+        result_dict, tmp_dir = self.format_results(results, classes,
+                                                   self.jsonfile_prefix,
+                                                   self.csv_savepath)
+
+        metric_dict = {}
+
+        if self.format_only:
+            logger.info(
+                f'results are saved in {osp.dirname(self.csv_savepath)}')
+            return metric_dict
+
+        for metric in self.metrics:
+            ap_dict = self.lyft_evaluate(
+                result_dict, metric=metric, logger=logger)
+            for result in ap_dict:
+                metric_dict[result] = ap_dict[result]
+
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+        return metric_dict
+
+    def format_results(
+        self,
+        results: List[dict],
+        classes: Optional[List[str]] = None,
+        jsonfile_prefix: Optional[str] = None,
+        csv_savepath: Optional[str] = None
+    ) -> Tuple[dict, Union[tempfile.TemporaryDirectory, None]]:
+        """Format the results to json (standard format for COCO evaluation).
+
+        Args:
+            results (List[dict]): Testing results of the dataset.
+            classes (List[str], optional): A list of class name.
+                Defaults to None.
+            jsonfile_prefix (str, optional): The prefix of json files. It
+                includes the file path and the prefix of filename, e.g.,
+                "a/b/prefix". If not specified, a temp file will be created.
+                Defaults to None.
+            csv_savepath (str, optional): The path for saving csv files. It
+                includes the file path and the csv filename, e.g.,
+                "a/b/filename.csv". If not specified, the result will not be
+                converted to csv file. Defaults to None.
+
+        Returns:
+            tuple: Returns (result_dict, tmp_dir), where ``result_dict`` is a
+            dict containing the json filepaths, ``tmp_dir`` is the temporal
+            directory created for saving json files when ``jsonfile_prefix`` is
+            not specified.
+        """
+        assert isinstance(results, list), 'results must be a list'
+
+        if jsonfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            tmp_dir = None
+        result_dict = dict()
+        sample_idx_list = [result['sample_idx'] for result in results]
+
+        for name in results[0]:
+            if 'pred' in name and '3d' in name and name[0] != '_':
+                print(f'\nFormating bboxes of {name}')
+                # format result of model output in Det3dDataSample,
+                # include 'pred_instances_3d','pts_pred_instances_3d',
+                # 'img_pred_instances_3d'
+                results_ = [out[name] for out in results]
+                tmp_file_ = osp.join(jsonfile_prefix, name)
+                result_dict[name] = self._format_bbox(results_,
+                                                      sample_idx_list, classes,
+                                                      tmp_file_)
+        if csv_savepath is not None:
+            if 'pred_instances_3d' in result_dict:
+                self.json2csv(result_dict['pred_instances_3d'], csv_savepath)
+            elif 'pts_pred_instances_3d' in result_dict:
+                self.json2csv(result_dict['pts_pred_instances_3d'],
+                              csv_savepath)
+        return result_dict, tmp_dir
+
+    def json2csv(self, json_path: str, csv_savepath: str) -> None:
+        """Convert the json file to csv format for submission.
+
+        Args:
+            json_path (str): Path of the result json file.
+            csv_savepath (str): Path to save the csv file.
+        """
+        results = mmengine.load(json_path)['results']
+        sample_list_path = osp.join(self.data_root, 'sample_submission.csv')
+        data = pd.read_csv(sample_list_path)
+        Id_list = list(data['Id'])
+        pred_list = list(data['PredictionString'])
+        cnt = 0
+        print('Converting the json to csv...')
+        for token in results.keys():
+            cnt += 1
+            predictions = results[token]
+            prediction_str = ''
+            for i in range(len(predictions)):
+                prediction_str += \
+                    str(predictions[i]['score']) + ' ' + \
+                    str(predictions[i]['translation'][0]) + ' ' + \
+                    str(predictions[i]['translation'][1]) + ' ' + \
+                    str(predictions[i]['translation'][2]) + ' ' + \
+                    str(predictions[i]['size'][0]) + ' ' + \
+                    str(predictions[i]['size'][1]) + ' ' + \
+                    str(predictions[i]['size'][2]) + ' ' + \
+                    str(Quaternion(list(predictions[i]['rotation']))
+                        .yaw_pitch_roll[0]) + ' ' + \
+                    predictions[i]['name'] + ' '
+            prediction_str = prediction_str[:-1]
+            idx = Id_list.index(token)
+            pred_list[idx] = prediction_str
+        df = pd.DataFrame({'Id': Id_list, 'PredictionString': pred_list})
+        mmengine.mkdir_or_exist(os.path.dirname(csv_savepath))
+        df.to_csv(csv_savepath, index=False)
+
+    def _format_bbox(self,
+                     results: List[dict],
+                     sample_idx_list: List[int],
+                     classes: Optional[List[str]] = None,
+                     jsonfile_prefix: Optional[str] = None) -> str:
+        """Convert the results to the standard format.
+
+        Args:
+            results (List[dict]): Testing results of the dataset.
+            sample_idx_list (List[int]): List of result sample idx.
+            classes (List[str], optional): A list of class name.
+                Defaults to None.
+            jsonfile_prefix (str, optional): The prefix of the output jsonfile.
+                You can specify the output directory/filename by modifying the
+                jsonfile_prefix. Defaults to None.
+
+        Returns:
+            str: Path of the output json file.
+        """
+        lyft_annos = {}
+
+        print('Start to convert detection format...')
+        for i, det in enumerate(mmengine.track_iter_progress(results)):
+            annos = []
+            boxes = output_to_lyft_box(det)
+            sample_idx = sample_idx_list[i]
+            sample_token = self.data_infos[sample_idx]['token']
+            boxes = lidar_lyft_box_to_global(self.data_infos[sample_idx],
+                                             boxes)
+            for i, box in enumerate(boxes):
+                name = classes[box.label]
+                lyft_anno = dict(
+                    sample_token=sample_token,
+                    translation=box.center.tolist(),
+                    size=box.wlh.tolist(),
+                    rotation=box.orientation.elements.tolist(),
+                    name=name,
+                    score=box.score)
+                annos.append(lyft_anno)
+            lyft_annos[sample_token] = annos
+        lyft_submissions = {
+            'meta': self.modality,
+            'results': lyft_annos,
+        }
+
+        mmengine.mkdir_or_exist(jsonfile_prefix)
+        res_path = osp.join(jsonfile_prefix, 'results_lyft.json')
+        print('Results writes to', res_path)
+        mmengine.dump(lyft_submissions, res_path)
+        return res_path
+
+    def lyft_evaluate(self,
+                      result_dict: dict,
+                      metric: str = 'bbox',
+                      logger: Optional[MMLogger] = None) -> Dict[str, float]:
+        """Evaluation in Lyft protocol.
+
+        Args:
+            result_dict (dict): Formatted results of the dataset.
+            metric (str): Metrics to be evaluated. Defaults to 'bbox'.
+            logger (MMLogger, optional): Logger used for printing related
+                information during evaluation. Defaults to None.
+
+        Returns:
+            Dict[str, float]: Evaluation results.
+        """
+        metric_dict = dict()
+        for name in result_dict:
+            print(f'Evaluating bboxes of {name}')
+            ret_dict = self._evaluate_single(
+                result_dict[name], logger=logger, result_name=name)
+            metric_dict.update(ret_dict)
+        return metric_dict
+
+    def _evaluate_single(self,
+                         result_path: str,
+                         logger: MMLogger = None,
+                         result_name: str = 'pts_bbox') -> dict:
+        """Evaluation for a single model in Lyft protocol.
+
+        Args:
+            result_path (str): Path of the result file.
+            logger (MMLogger, optional): Logger used for printing related
+                information during evaluation. Defaults to None.
+            result_name (str): Result name in the metric prefix.
+                Defaults to 'pts_bbox'.
+
+        Returns:
+            Dict[str, float]: Dictionary of evaluation details.
+        """
+        output_dir = osp.join(*osp.split(result_path)[:-1])
+        lyft = Lyft(
+            data_path=osp.join(self.data_root, self.version),
+            json_path=osp.join(self.data_root, self.version, self.version),
+            verbose=True)
+        eval_set_map = {
+            'v1.01-train': 'val',
+        }
+        metrics = lyft_eval(lyft, self.data_root, result_path,
+                            eval_set_map[self.version], output_dir, logger)
+
+        # record metrics
+        detail = dict()
+        metric_prefix = f'{result_name}_Lyft'
+
+        for i, name in enumerate(metrics['class_names']):
+            AP = float(metrics['mAPs_cate'][i])
+            detail[f'{metric_prefix}/{name}_AP'] = AP
+
+        detail[f'{metric_prefix}/mAP'] = metrics['Final mAP']
+        return detail
+
+
+def output_to_lyft_box(detection: dict) -> List[LyftBox]:
+    """Convert the output to the box class in the Lyft.
+
+    Args:
+        detection (dict): Detection results.
+
+    Returns:
+        List[:obj:`LyftBox`]: List of standard LyftBoxes.
+    """
+    bbox3d = detection['bboxes_3d']
+    scores = detection['scores_3d'].numpy()
+    labels = detection['labels_3d'].numpy()
+
+    box_gravity_center = bbox3d.gravity_center.numpy()
+    box_dims = bbox3d.dims.numpy()
+    box_yaw = bbox3d.yaw.numpy()
+
+    # our LiDAR coordinate system -> Lyft box coordinate system
+    lyft_box_dims = box_dims[:, [1, 0, 2]]
+
+    box_list = []
+    for i in range(len(bbox3d)):
+        quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+        box = LyftBox(
+            box_gravity_center[i],
+            lyft_box_dims[i],
+            quat,
+            label=labels[i],
+            score=scores[i])
+        box_list.append(box)
+    return box_list
+
+
+def lidar_lyft_box_to_global(info: dict,
+                             boxes: List[LyftBox]) -> List[LyftBox]:
+    """Convert the box from ego to global coordinate.
+
+    Args:
+        info (dict): Info for a specific sample data, including the calibration
+            information.
+        boxes (List[:obj:`LyftBox`]): List of predicted LyftBoxes.
+
+    Returns:
+        List[:obj:`LyftBox`]: List of standard LyftBoxes in the global
+        coordinate.
+    """
+    box_list = []
+    for box in boxes:
+        # Move box to ego vehicle coord system
+        lidar2ego = np.array(info['lidar_points']['lidar2ego'])
+        box.rotate(Quaternion(matrix=lidar2ego, rtol=1e-05, atol=1e-07))
+        box.translate(lidar2ego[:3, 3])
+        # Move box to global coord system
+        ego2global = np.array(info['ego2global'])
+        box.rotate(Quaternion(matrix=ego2global, rtol=1e-05, atol=1e-07))
+        box.translate(ego2global[:3, 3])
+        box_list.append(box)
+    return box_list
diff --git a/mmde/mmdet3d/evaluation/metrics/nuscenes_metric.py b/mmde/mmdet3d/evaluation/metrics/nuscenes_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..e30c0bcd9609702543b3255e28cc077ce254eac8
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/metrics/nuscenes_metric.py
@@ -0,0 +1,788 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import tempfile
+from os import path as osp
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import mmengine
+import numpy as np
+import pyquaternion
+import torch
+from mmengine import Config, load
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger
+from nuscenes.eval.detection.config import config_factory
+from nuscenes.eval.detection.data_classes import DetectionConfig
+from nuscenes.utils.data_classes import Box as NuScenesBox
+
+from mmdet3d.models.layers import box3d_multiclass_nms
+from mmdet3d.registry import METRICS
+from mmdet3d.structures import (CameraInstance3DBoxes, LiDARInstance3DBoxes,
+                                bbox3d2result, xywhr2xyxyr)
+
+
+@METRICS.register_module()
+class NuScenesMetric(BaseMetric):
+    """Nuscenes evaluation metric.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        metric (str or List[str]): Metrics to be evaluated. Defaults to 'bbox'.
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=False, use_lidar=True).
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix will
+            be used instead. Defaults to None.
+        format_only (bool): Format the output results without perform
+            evaluation. It is useful when you want to format the result to a
+            specific format and submit it to the test server.
+            Defaults to False.
+        jsonfile_prefix (str, optional): The prefix of json files including the
+            file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Defaults to None.
+        eval_version (str): Configuration version of evaluation.
+            Defaults to 'detection_cvpr_2019'.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+    NameMapping = {
+        'movable_object.barrier': 'barrier',
+        'vehicle.bicycle': 'bicycle',
+        'vehicle.bus.bendy': 'bus',
+        'vehicle.bus.rigid': 'bus',
+        'vehicle.car': 'car',
+        'vehicle.construction': 'construction_vehicle',
+        'vehicle.motorcycle': 'motorcycle',
+        'human.pedestrian.adult': 'pedestrian',
+        'human.pedestrian.child': 'pedestrian',
+        'human.pedestrian.construction_worker': 'pedestrian',
+        'human.pedestrian.police_officer': 'pedestrian',
+        'movable_object.trafficcone': 'traffic_cone',
+        'vehicle.trailer': 'trailer',
+        'vehicle.truck': 'truck'
+    }
+    DefaultAttribute = {
+        'car': 'vehicle.parked',
+        'pedestrian': 'pedestrian.moving',
+        'trailer': 'vehicle.parked',
+        'truck': 'vehicle.parked',
+        'bus': 'vehicle.moving',
+        'motorcycle': 'cycle.without_rider',
+        'construction_vehicle': 'vehicle.parked',
+        'bicycle': 'cycle.without_rider',
+        'barrier': '',
+        'traffic_cone': '',
+    }
+    # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa
+    ErrNameMapping = {
+        'trans_err': 'mATE',
+        'scale_err': 'mASE',
+        'orient_err': 'mAOE',
+        'vel_err': 'mAVE',
+        'attr_err': 'mAAE'
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 metric: Union[str, List[str]] = 'bbox',
+                 modality: dict = dict(use_camera=False, use_lidar=True),
+                 prefix: Optional[str] = None,
+                 format_only: bool = False,
+                 jsonfile_prefix: Optional[str] = None,
+                 eval_version: str = 'detection_cvpr_2019',
+                 collect_device: str = 'cpu',
+                 backend_args: Optional[dict] = None) -> None:
+        self.default_prefix = 'NuScenes metric'
+        super(NuScenesMetric, self).__init__(
+            collect_device=collect_device, prefix=prefix)
+        if modality is None:
+            modality = dict(
+                use_camera=False,
+                use_lidar=True,
+            )
+        self.ann_file = ann_file
+        self.data_root = data_root
+        self.modality = modality
+        self.format_only = format_only
+        if self.format_only:
+            assert jsonfile_prefix is not None, 'jsonfile_prefix must be not '
+            'None when format_only is True, otherwise the result files will '
+            'be saved to a temp directory which will be cleanup at the end.'
+
+        self.jsonfile_prefix = jsonfile_prefix
+        self.backend_args = backend_args
+
+        self.metrics = metric if isinstance(metric, list) else [metric]
+
+        self.eval_version = eval_version
+        self.eval_detection_configs = config_factory(self.eval_version)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        for data_sample in data_samples:
+            result = dict()
+            pred_3d = data_sample['pred_instances_3d']
+            pred_2d = data_sample['pred_instances']
+            for attr_name in pred_3d:
+                pred_3d[attr_name] = pred_3d[attr_name].to('cpu')
+            result['pred_instances_3d'] = pred_3d
+            for attr_name in pred_2d:
+                pred_2d[attr_name] = pred_2d[attr_name].to('cpu')
+            result['pred_instances'] = pred_2d
+            sample_idx = data_sample['sample_idx']
+            result['sample_idx'] = sample_idx
+            self.results.append(result)
+
+    def compute_metrics(self, results: List[dict]) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (List[dict]): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        classes = self.dataset_meta['classes']
+        self.version = self.dataset_meta['version']
+        # load annotations
+        self.data_infos = load(
+            self.ann_file, backend_args=self.backend_args)['data_list']
+        result_dict, tmp_dir = self.format_results(results, classes,
+                                                   self.jsonfile_prefix)
+
+        metric_dict = {}
+
+        if self.format_only:
+            logger.info(
+                f'results are saved in {osp.basename(self.jsonfile_prefix)}')
+            return metric_dict
+
+        for metric in self.metrics:
+            ap_dict = self.nus_evaluate(
+                result_dict, classes=classes, metric=metric, logger=logger)
+            for result in ap_dict:
+                metric_dict[result] = ap_dict[result]
+
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+        return metric_dict
+
+    def nus_evaluate(self,
+                     result_dict: dict,
+                     metric: str = 'bbox',
+                     classes: Optional[List[str]] = None,
+                     logger: Optional[MMLogger] = None) -> Dict[str, float]:
+        """Evaluation in Nuscenes protocol.
+
+        Args:
+            result_dict (dict): Formatted results of the dataset.
+            metric (str): Metrics to be evaluated. Defaults to 'bbox'.
+            classes (List[str], optional): A list of class name.
+                Defaults to None.
+            logger (MMLogger, optional): Logger used for printing related
+                information during evaluation. Defaults to None.
+
+        Returns:
+            Dict[str, float]: Results of each evaluation metric.
+        """
+        metric_dict = dict()
+        for name in result_dict:
+            print(f'Evaluating bboxes of {name}')
+            ret_dict = self._evaluate_single(
+                result_dict[name], classes=classes, result_name=name)
+            metric_dict.update(ret_dict)
+        return metric_dict
+
+    def _evaluate_single(
+            self,
+            result_path: str,
+            classes: Optional[List[str]] = None,
+            result_name: str = 'pred_instances_3d') -> Dict[str, float]:
+        """Evaluation for a single model in nuScenes protocol.
+
+        Args:
+            result_path (str): Path of the result file.
+            classes (List[str], optional): A list of class name.
+                Defaults to None.
+            result_name (str): Result name in the metric prefix.
+                Defaults to 'pred_instances_3d'.
+
+        Returns:
+            Dict[str, float]: Dictionary of evaluation details.
+        """
+        from nuscenes import NuScenes
+        from nuscenes.eval.detection.evaluate import NuScenesEval
+
+        output_dir = osp.join(*osp.split(result_path)[:-1])
+        nusc = NuScenes(
+            version=self.version, dataroot=self.data_root, verbose=False)
+        eval_set_map = {
+            'v1.0-mini': 'mini_val',
+            'v1.0-trainval': 'val',
+        }
+        nusc_eval = NuScenesEval(
+            nusc,
+            config=self.eval_detection_configs,
+            result_path=result_path,
+            eval_set=eval_set_map[self.version],
+            output_dir=output_dir,
+            verbose=False)
+        nusc_eval.main(render_curves=False)
+
+        # record metrics
+        metrics = mmengine.load(osp.join(output_dir, 'metrics_summary.json'))
+        detail = dict()
+        metric_prefix = f'{result_name}_NuScenes'
+        for name in classes:
+            for k, v in metrics['label_aps'][name].items():
+                val = float(f'{v:.4f}')
+                detail[f'{metric_prefix}/{name}_AP_dist_{k}'] = val
+            for k, v in metrics['label_tp_errors'][name].items():
+                val = float(f'{v:.4f}')
+                detail[f'{metric_prefix}/{name}_{k}'] = val
+            for k, v in metrics['tp_errors'].items():
+                val = float(f'{v:.4f}')
+                detail[f'{metric_prefix}/{self.ErrNameMapping[k]}'] = val
+
+        detail[f'{metric_prefix}/NDS'] = metrics['nd_score']
+        detail[f'{metric_prefix}/mAP'] = metrics['mean_ap']
+        return detail
+
+    def format_results(
+        self,
+        results: List[dict],
+        classes: Optional[List[str]] = None,
+        jsonfile_prefix: Optional[str] = None
+    ) -> Tuple[dict, Union[tempfile.TemporaryDirectory, None]]:
+        """Format the mmdet3d results to standard NuScenes json file.
+
+        Args:
+            results (List[dict]): Testing results of the dataset.
+            classes (List[str], optional): A list of class name.
+                Defaults to None.
+            jsonfile_prefix (str, optional): The prefix of json files. It
+                includes the file path and the prefix of filename, e.g.,
+                "a/b/prefix". If not specified, a temp file will be created.
+                Defaults to None.
+
+        Returns:
+            tuple: Returns (result_dict, tmp_dir), where ``result_dict`` is a
+            dict containing the json filepaths, ``tmp_dir`` is the temporal
+            directory created for saving json files when ``jsonfile_prefix`` is
+            not specified.
+        """
+        assert isinstance(results, list), 'results must be a list'
+
+        if jsonfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            tmp_dir = None
+        result_dict = dict()
+        sample_idx_list = [result['sample_idx'] for result in results]
+
+        for name in results[0]:
+            if 'pred' in name and '3d' in name and name[0] != '_':
+                print(f'\nFormating bboxes of {name}')
+                results_ = [out[name] for out in results]
+                tmp_file_ = osp.join(jsonfile_prefix, name)
+                box_type_3d = type(results_[0]['bboxes_3d'])
+                if box_type_3d == LiDARInstance3DBoxes:
+                    result_dict[name] = self._format_lidar_bbox(
+                        results_, sample_idx_list, classes, tmp_file_)
+                elif box_type_3d == CameraInstance3DBoxes:
+                    result_dict[name] = self._format_camera_bbox(
+                        results_, sample_idx_list, classes, tmp_file_)
+
+        return result_dict, tmp_dir
+
+    def get_attr_name(self, attr_idx: int, label_name: str) -> str:
+        """Get attribute from predicted index.
+
+        This is a workaround to predict attribute when the predicted velocity
+        is not reliable. We map the predicted attribute index to the one in the
+        attribute set. If it is consistent with the category, we will keep it.
+        Otherwise, we will use the default attribute.
+
+        Args:
+            attr_idx (int): Attribute index.
+            label_name (str): Predicted category name.
+
+        Returns:
+            str: Predicted attribute name.
+        """
+        # TODO: Simplify the variable name
+        AttrMapping_rev2 = [
+            'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
+            'pedestrian.standing', 'pedestrian.sitting_lying_down',
+            'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None'
+        ]
+        if label_name == 'car' or label_name == 'bus' \
+            or label_name == 'truck' or label_name == 'trailer' \
+                or label_name == 'construction_vehicle':
+            if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \
+                AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \
+                    AttrMapping_rev2[attr_idx] == 'vehicle.stopped':
+                return AttrMapping_rev2[attr_idx]
+            else:
+                return self.DefaultAttribute[label_name]
+        elif label_name == 'pedestrian':
+            if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \
+                AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \
+                    AttrMapping_rev2[attr_idx] == \
+                    'pedestrian.sitting_lying_down':
+                return AttrMapping_rev2[attr_idx]
+            else:
+                return self.DefaultAttribute[label_name]
+        elif label_name == 'bicycle' or label_name == 'motorcycle':
+            if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \
+                    AttrMapping_rev2[attr_idx] == 'cycle.without_rider':
+                return AttrMapping_rev2[attr_idx]
+            else:
+                return self.DefaultAttribute[label_name]
+        else:
+            return self.DefaultAttribute[label_name]
+
+    def _format_camera_bbox(self,
+                            results: List[dict],
+                            sample_idx_list: List[int],
+                            classes: Optional[List[str]] = None,
+                            jsonfile_prefix: Optional[str] = None) -> str:
+        """Convert the results to the standard format.
+
+        Args:
+            results (List[dict]): Testing results of the dataset.
+            sample_idx_list (List[int]): List of result sample idx.
+            classes (List[str], optional): A list of class name.
+                Defaults to None.
+            jsonfile_prefix (str, optional): The prefix of the output jsonfile.
+                You can specify the output directory/filename by modifying the
+                jsonfile_prefix. Defaults to None.
+
+        Returns:
+            str: Path of the output json file.
+        """
+        nusc_annos = {}
+
+        print('Start to convert detection format...')
+
+        # Camera types in Nuscenes datasets
+        camera_types = [
+            'CAM_FRONT',
+            'CAM_FRONT_RIGHT',
+            'CAM_FRONT_LEFT',
+            'CAM_BACK',
+            'CAM_BACK_LEFT',
+            'CAM_BACK_RIGHT',
+        ]
+
+        CAM_NUM = 6
+
+        for i, det in enumerate(mmengine.track_iter_progress(results)):
+
+            sample_idx = sample_idx_list[i]
+
+            frame_sample_idx = sample_idx // CAM_NUM
+            camera_type_id = sample_idx % CAM_NUM
+
+            if camera_type_id == 0:
+                boxes_per_frame = []
+                attrs_per_frame = []
+
+            # need to merge results from images of the same sample
+            annos = []
+            boxes, attrs = output_to_nusc_box(det)
+            sample_token = self.data_infos[frame_sample_idx]['token']
+            camera_type = camera_types[camera_type_id]
+            boxes, attrs = cam_nusc_box_to_global(
+                self.data_infos[frame_sample_idx], boxes, attrs, classes,
+                self.eval_detection_configs, camera_type)
+            boxes_per_frame.extend(boxes)
+            attrs_per_frame.extend(attrs)
+            # Remove redundant predictions caused by overlap of images
+            if (sample_idx + 1) % CAM_NUM != 0:
+                continue
+            boxes = global_nusc_box_to_cam(self.data_infos[frame_sample_idx],
+                                           boxes_per_frame, classes,
+                                           self.eval_detection_configs)
+            cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes)
+            # box nms 3d over 6 images in a frame
+            # TODO: move this global setting into config
+            nms_cfg = dict(
+                use_rotate_nms=True,
+                nms_across_levels=False,
+                nms_pre=4096,
+                nms_thr=0.05,
+                score_thr=0.01,
+                min_bbox_size=0,
+                max_per_frame=500)
+            nms_cfg = Config(nms_cfg)
+            cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev)
+            boxes3d = cam_boxes3d.tensor
+            # generate attr scores from attr labels
+            attrs = labels.new_tensor([attr for attr in attrs_per_frame])
+            boxes3d, scores, labels, attrs = box3d_multiclass_nms(
+                boxes3d,
+                cam_boxes3d_for_nms,
+                scores,
+                nms_cfg.score_thr,
+                nms_cfg.max_per_frame,
+                nms_cfg,
+                mlvl_attr_scores=attrs)
+            cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9)
+            det = bbox3d2result(cam_boxes3d, scores, labels, attrs)
+            boxes, attrs = output_to_nusc_box(det)
+            boxes, attrs = cam_nusc_box_to_global(
+                self.data_infos[frame_sample_idx], boxes, attrs, classes,
+                self.eval_detection_configs)
+
+            for i, box in enumerate(boxes):
+                name = classes[box.label]
+                attr = self.get_attr_name(attrs[i], name)
+                nusc_anno = dict(
+                    sample_token=sample_token,
+                    translation=box.center.tolist(),
+                    size=box.wlh.tolist(),
+                    rotation=box.orientation.elements.tolist(),
+                    velocity=box.velocity[:2].tolist(),
+                    detection_name=name,
+                    detection_score=box.score,
+                    attribute_name=attr)
+                annos.append(nusc_anno)
+            # other views results of the same frame should be concatenated
+            if sample_token in nusc_annos:
+                nusc_annos[sample_token].extend(annos)
+            else:
+                nusc_annos[sample_token] = annos
+
+        nusc_submissions = {
+            'meta': self.modality,
+            'results': nusc_annos,
+        }
+
+        mmengine.mkdir_or_exist(jsonfile_prefix)
+        res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+        print(f'Results writes to {res_path}')
+        mmengine.dump(nusc_submissions, res_path)
+        return res_path
+
+    def _format_lidar_bbox(self,
+                           results: List[dict],
+                           sample_idx_list: List[int],
+                           classes: Optional[List[str]] = None,
+                           jsonfile_prefix: Optional[str] = None) -> str:
+        """Convert the results to the standard format.
+
+        Args:
+            results (List[dict]): Testing results of the dataset.
+            sample_idx_list (List[int]): List of result sample idx.
+            classes (List[str], optional): A list of class name.
+                Defaults to None.
+            jsonfile_prefix (str, optional): The prefix of the output jsonfile.
+                You can specify the output directory/filename by modifying the
+                jsonfile_prefix. Defaults to None.
+
+        Returns:
+            str: Path of the output json file.
+        """
+        nusc_annos = {}
+
+        print('Start to convert detection format...')
+        for i, det in enumerate(mmengine.track_iter_progress(results)):
+            annos = []
+            boxes, attrs = output_to_nusc_box(det)
+            sample_idx = sample_idx_list[i]
+            sample_token = self.data_infos[sample_idx]['token']
+            boxes = lidar_nusc_box_to_global(self.data_infos[sample_idx],
+                                             boxes, classes,
+                                             self.eval_detection_configs)
+            for i, box in enumerate(boxes):
+                name = classes[box.label]
+                if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2:
+                    if name in [
+                            'car',
+                            'construction_vehicle',
+                            'bus',
+                            'truck',
+                            'trailer',
+                    ]:
+                        attr = 'vehicle.moving'
+                    elif name in ['bicycle', 'motorcycle']:
+                        attr = 'cycle.with_rider'
+                    else:
+                        attr = self.DefaultAttribute[name]
+                else:
+                    if name in ['pedestrian']:
+                        attr = 'pedestrian.standing'
+                    elif name in ['bus']:
+                        attr = 'vehicle.stopped'
+                    else:
+                        attr = self.DefaultAttribute[name]
+
+                nusc_anno = dict(
+                    sample_token=sample_token,
+                    translation=box.center.tolist(),
+                    size=box.wlh.tolist(),
+                    rotation=box.orientation.elements.tolist(),
+                    velocity=box.velocity[:2].tolist(),
+                    detection_name=name,
+                    detection_score=box.score,
+                    attribute_name=attr)
+                annos.append(nusc_anno)
+            nusc_annos[sample_token] = annos
+        nusc_submissions = {
+            'meta': self.modality,
+            'results': nusc_annos,
+        }
+        mmengine.mkdir_or_exist(jsonfile_prefix)
+        res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+        print(f'Results writes to {res_path}')
+        mmengine.dump(nusc_submissions, res_path)
+        return res_path
+
+
+def output_to_nusc_box(
+        detection: dict) -> Tuple[List[NuScenesBox], Union[np.ndarray, None]]:
+    """Convert the output to the box class in the nuScenes.
+
+    Args:
+        detection (dict): Detection results.
+
+            - bboxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+            - scores_3d (torch.Tensor): Detection scores.
+            - labels_3d (torch.Tensor): Predicted box labels.
+
+    Returns:
+        Tuple[List[:obj:`NuScenesBox`], np.ndarray or None]: List of standard
+        NuScenesBoxes and attribute labels.
+    """
+    bbox3d = detection['bboxes_3d']
+    scores = detection['scores_3d'].numpy()
+    labels = detection['labels_3d'].numpy()
+    attrs = None
+    if 'attr_labels' in detection:
+        attrs = detection['attr_labels'].numpy()
+
+    box_gravity_center = bbox3d.gravity_center.numpy()
+    box_dims = bbox3d.dims.numpy()
+    box_yaw = bbox3d.yaw.numpy()
+
+    box_list = []
+
+    if isinstance(bbox3d, LiDARInstance3DBoxes):
+        # our LiDAR coordinate system -> nuScenes box coordinate system
+        nus_box_dims = box_dims[:, [1, 0, 2]]
+        for i in range(len(bbox3d)):
+            quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+            velocity = (*bbox3d.tensor[i, 7:9], 0.0)
+            # velo_val = np.linalg.norm(box3d[i, 7:9])
+            # velo_ori = box3d[i, 6]
+            # velocity = (
+            # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
+            box = NuScenesBox(
+                box_gravity_center[i],
+                nus_box_dims[i],
+                quat,
+                label=labels[i],
+                score=scores[i],
+                velocity=velocity)
+            box_list.append(box)
+    elif isinstance(bbox3d, CameraInstance3DBoxes):
+        # our Camera coordinate system -> nuScenes box coordinate system
+        # convert the dim/rot to nuscbox convention
+        nus_box_dims = box_dims[:, [2, 0, 1]]
+        nus_box_yaw = -box_yaw
+        for i in range(len(bbox3d)):
+            q1 = pyquaternion.Quaternion(
+                axis=[0, 0, 1], radians=nus_box_yaw[i])
+            q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
+            quat = q2 * q1
+            velocity = (bbox3d.tensor[i, 7], 0.0, bbox3d.tensor[i, 8])
+            box = NuScenesBox(
+                box_gravity_center[i],
+                nus_box_dims[i],
+                quat,
+                label=labels[i],
+                score=scores[i],
+                velocity=velocity)
+            box_list.append(box)
+    else:
+        raise NotImplementedError(
+            f'Do not support convert {type(bbox3d)} bboxes '
+            'to standard NuScenesBoxes.')
+
+    return box_list, attrs
+
+
+def lidar_nusc_box_to_global(
+        info: dict, boxes: List[NuScenesBox], classes: List[str],
+        eval_configs: DetectionConfig) -> List[NuScenesBox]:
+    """Convert the box from ego to global coordinate.
+
+    Args:
+        info (dict): Info for a specific sample data, including the calibration
+            information.
+        boxes (List[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+        classes (List[str]): Mapped classes in the evaluation.
+        eval_configs (:obj:`DetectionConfig`): Evaluation configuration object.
+
+    Returns:
+        List[:obj:`DetectionConfig`]: List of standard NuScenesBoxes in the
+        global coordinate.
+    """
+    box_list = []
+    for box in boxes:
+        # Move box to ego vehicle coord system
+        lidar2ego = np.array(info['lidar_points']['lidar2ego'])
+        box.rotate(
+            pyquaternion.Quaternion(matrix=lidar2ego, rtol=1e-05, atol=1e-07))
+        box.translate(lidar2ego[:3, 3])
+        # filter det in ego.
+        cls_range_map = eval_configs.class_range
+        radius = np.linalg.norm(box.center[:2], 2)
+        det_range = cls_range_map[classes[box.label]]
+        if radius > det_range:
+            continue
+        # Move box to global coord system
+        ego2global = np.array(info['ego2global'])
+        box.rotate(
+            pyquaternion.Quaternion(matrix=ego2global, rtol=1e-05, atol=1e-07))
+        box.translate(ego2global[:3, 3])
+        box_list.append(box)
+    return box_list
+
+
+def cam_nusc_box_to_global(
+    info: dict,
+    boxes: List[NuScenesBox],
+    attrs: np.ndarray,
+    classes: List[str],
+    eval_configs: DetectionConfig,
+    camera_type: str = 'CAM_FRONT',
+) -> Tuple[List[NuScenesBox], List[int]]:
+    """Convert the box from camera to global coordinate.
+
+    Args:
+        info (dict): Info for a specific sample data, including the calibration
+            information.
+        boxes (List[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+        attrs (np.ndarray): Predicted attributes.
+        classes (List[str]): Mapped classes in the evaluation.
+        eval_configs (:obj:`DetectionConfig`): Evaluation configuration object.
+        camera_type (str): Type of camera. Defaults to 'CAM_FRONT'.
+
+    Returns:
+        Tuple[List[:obj:`NuScenesBox`], List[int]]: List of standard
+        NuScenesBoxes in the global coordinate and attribute label.
+    """
+    box_list = []
+    attr_list = []
+    for (box, attr) in zip(boxes, attrs):
+        # Move box to ego vehicle coord system
+        cam2ego = np.array(info['images'][camera_type]['cam2ego'])
+        box.rotate(
+            pyquaternion.Quaternion(matrix=cam2ego, rtol=1e-05, atol=1e-07))
+        box.translate(cam2ego[:3, 3])
+        # filter det in ego.
+        cls_range_map = eval_configs.class_range
+        radius = np.linalg.norm(box.center[:2], 2)
+        det_range = cls_range_map[classes[box.label]]
+        if radius > det_range:
+            continue
+        # Move box to global coord system
+        ego2global = np.array(info['ego2global'])
+        box.rotate(
+            pyquaternion.Quaternion(matrix=ego2global, rtol=1e-05, atol=1e-07))
+        box.translate(ego2global[:3, 3])
+        box_list.append(box)
+        attr_list.append(attr)
+    return box_list, attr_list
+
+
+def global_nusc_box_to_cam(info: dict, boxes: List[NuScenesBox],
+                           classes: List[str],
+                           eval_configs: DetectionConfig) -> List[NuScenesBox]:
+    """Convert the box from global to camera coordinate.
+
+    Args:
+        info (dict): Info for a specific sample data, including the calibration
+            information.
+        boxes (List[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+        classes (List[str]): Mapped classes in the evaluation.
+        eval_configs (:obj:`DetectionConfig`): Evaluation configuration object.
+
+    Returns:
+        List[:obj:`NuScenesBox`]: List of standard NuScenesBoxes in camera
+        coordinate.
+    """
+    box_list = []
+    for box in boxes:
+        # Move box to ego vehicle coord system
+        ego2global = np.array(info['ego2global'])
+        box.translate(-ego2global[:3, 3])
+        box.rotate(
+            pyquaternion.Quaternion(matrix=ego2global, rtol=1e-05,
+                                    atol=1e-07).inverse)
+        # filter det in ego.
+        cls_range_map = eval_configs.class_range
+        radius = np.linalg.norm(box.center[:2], 2)
+        det_range = cls_range_map[classes[box.label]]
+        if radius > det_range:
+            continue
+        # Move box to camera coord system
+        cam2ego = np.array(info['images']['CAM_FRONT']['cam2ego'])
+        box.translate(-cam2ego[:3, 3])
+        box.rotate(
+            pyquaternion.Quaternion(matrix=cam2ego, rtol=1e-05,
+                                    atol=1e-07).inverse)
+        box_list.append(box)
+    return box_list
+
+
+def nusc_box_to_cam_box3d(
+    boxes: List[NuScenesBox]
+) -> Tuple[CameraInstance3DBoxes, torch.Tensor, torch.Tensor]:
+    """Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`.
+
+    Args:
+        boxes (:obj:`List[NuScenesBox]`): List of predicted NuScenesBoxes.
+
+    Returns:
+        Tuple[:obj:`CameraInstance3DBoxes`, torch.Tensor, torch.Tensor]:
+        Converted 3D bounding boxes, scores and labels.
+    """
+    locs = torch.Tensor([b.center for b in boxes]).view(-1, 3)
+    dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3)
+    rots = torch.Tensor([b.orientation.yaw_pitch_roll[0]
+                         for b in boxes]).view(-1, 1)
+    velocity = torch.Tensor([b.velocity[0::2] for b in boxes]).view(-1, 2)
+
+    # convert nusbox to cambox convention
+    dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]]
+    rots = -rots
+
+    boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda()
+    cam_boxes3d = CameraInstance3DBoxes(
+        boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5))
+    scores = torch.Tensor([b.score for b in boxes]).cuda()
+    labels = torch.LongTensor([b.label for b in boxes]).cuda()
+    nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)
+    indices = labels.new_tensor(list(range(scores.shape[0])))
+    nms_scores[indices, labels] = scores
+    return cam_boxes3d, nms_scores, labels
diff --git a/mmde/mmdet3d/evaluation/metrics/panoptic_seg_metric.py b/mmde/mmdet3d/evaluation/metrics/panoptic_seg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..02e4d0902a744563d9d6f4625d3dad34fa37631e
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/metrics/panoptic_seg_metric.py
@@ -0,0 +1,96 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from typing import Dict, List, Optional
+
+from mmengine.logging import MMLogger
+
+from mmdet3d.evaluation import panoptic_seg_eval
+from mmdet3d.registry import METRICS
+from .seg_metric import SegMetric
+
+
+@METRICS.register_module()
+class PanopticSegMetric(SegMetric):
+    """3D Panoptic segmentation evaluation metric.
+
+    Args:
+        thing_class_inds (list[int]): Indices of thing classes.
+        stuff_class_inds (list[int]): Indices of stuff classes.
+        min_num_points (int): Minimum number of points of an object to be
+            counted as ground truth in evaluation.
+        id_offset (int): Offset for instance ids to concat with
+            semantic labels.
+        collect_device (str, optional): Device name used for collecting
+            results from different ranks during distributed training.
+            Must be 'cpu' or 'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Default to None.
+        pklfile_prefix (str, optional): The prefix of pkl files, including
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Default to None.
+        submission_prefix (str, optional): The prefix of submission data.
+            If not specified, the submission data will not be generated.
+            Default to None.
+    """
+
+    def __init__(self,
+                 thing_class_inds: List[int],
+                 stuff_class_inds: List[int],
+                 min_num_points: int,
+                 id_offset: int,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 pklfile_prefix: str = None,
+                 submission_prefix: str = None,
+                 **kwargs):
+        self.thing_class_inds = thing_class_inds
+        self.stuff_class_inds = stuff_class_inds
+        self.min_num_points = min_num_points
+        self.id_offset = id_offset
+
+        super(PanopticSegMetric, self).__init__(
+            pklfile_prefix=pklfile_prefix,
+            submission_prefix=submission_prefix,
+            prefix=prefix,
+            collect_device=collect_device,
+            **kwargs)
+
+    # TODO modify format_result for panoptic segmentation evaluation, \
+    # different datasets have different needs.
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        if self.submission_prefix:
+            self.format_results(results)
+            return None
+
+        label2cat = self.dataset_meta['label2cat']
+        ignore_index = self.dataset_meta['ignore_index']
+        classes = self.dataset_meta['classes']
+        thing_classes = [classes[i] for i in self.thing_class_inds]
+        stuff_classes = [classes[i] for i in self.stuff_class_inds]
+
+        gt_labels = []
+        seg_preds = []
+        for eval_ann, sinlge_pred_results in results:
+            gt_labels.append(eval_ann)
+            seg_preds.append(sinlge_pred_results)
+
+        ret_dict = panoptic_seg_eval(gt_labels, seg_preds, classes,
+                                     thing_classes, stuff_classes,
+                                     self.min_num_points, self.id_offset,
+                                     label2cat, [ignore_index], logger)
+
+        return ret_dict
diff --git a/mmde/mmdet3d/evaluation/metrics/seg_metric.py b/mmde/mmdet3d/evaluation/metrics/seg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bd81e71e83f559c6c45707a3725d5675eb671de
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/metrics/seg_metric.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+from typing import Dict, Optional, Sequence
+
+import mmcv
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger
+
+from mmdet3d.evaluation import seg_eval
+from mmdet3d.registry import METRICS
+
+
+@METRICS.register_module()
+class SegMetric(BaseMetric):
+    """3D semantic segmentation evaluation metric.
+
+    Args:
+        collect_device (str, optional): Device name used for collecting
+            results from different ranks during distributed training.
+            Must be 'cpu' or 'gpu'. Defaults to 'cpu'.
+        prefix (str): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Default: None.
+        pklfile_prefix (str, optional): The prefix of pkl files, including
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Default: None.
+        submission_prefix (str, optional): The prefix of submission data.
+            If not specified, the submission data will not be generated.
+            Default: None.
+    """
+
+    def __init__(self,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 pklfile_prefix: str = None,
+                 submission_prefix: str = None,
+                 **kwargs):
+        self.pklfile_prefix = pklfile_prefix
+        self.submission_prefix = submission_prefix
+        super(SegMetric, self).__init__(
+            prefix=prefix, collect_device=collect_device)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``,
+        which will be used to compute the metrics when all batches
+        have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from
+                the model.
+        """
+        for data_sample in data_samples:
+            pred_3d = data_sample['pred_pts_seg']
+            eval_ann_info = data_sample['eval_ann_info']
+            cpu_pred_3d = dict()
+            for k, v in pred_3d.items():
+                if hasattr(v, 'to'):
+                    cpu_pred_3d[k] = v.to('cpu').numpy()
+                else:
+                    cpu_pred_3d[k] = v
+            self.results.append((eval_ann_info, cpu_pred_3d))
+
+    def format_results(self, results):
+        r"""Format the results to txt file. Refer to `ScanNet documentation
+        <http://kaldir.vc.in.tum.de/scannet_benchmark/documentation>`_.
+
+        Args:
+            outputs (list[dict]): Testing results of the dataset.
+
+        Returns:
+            tuple: (outputs, tmp_dir), outputs is the detection results,
+                tmp_dir is the temporal directory created for saving submission
+                files when ``submission_prefix`` is not specified.
+        """
+
+        submission_prefix = self.submission_prefix
+        if submission_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            submission_prefix = osp.join(tmp_dir.name, 'results')
+        mmcv.mkdir_or_exist(submission_prefix)
+        ignore_index = self.dataset_meta['ignore_index']
+        # need to map network output to original label idx
+        cat2label = np.zeros(len(self.dataset_meta['label2cat'])).astype(
+            np.int64)
+        for original_label, output_idx in self.dataset_meta['label2cat'].items(
+        ):
+            if output_idx != ignore_index:
+                cat2label[output_idx] = original_label
+
+        for i, (eval_ann, result) in enumerate(results):
+            sample_idx = eval_ann['point_cloud']['lidar_idx']
+            pred_sem_mask = result['semantic_mask'].numpy().astype(np.int64)
+            pred_label = cat2label[pred_sem_mask]
+            curr_file = f'{submission_prefix}/{sample_idx}.txt'
+            np.savetxt(curr_file, pred_label, fmt='%d')
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        if self.submission_prefix:
+            self.format_results(results)
+            return None
+
+        label2cat = self.dataset_meta['label2cat']
+        ignore_index = self.dataset_meta['ignore_index']
+
+        gt_semantic_masks = []
+        pred_semantic_masks = []
+
+        for eval_ann, sinlge_pred_results in results:
+            gt_semantic_masks.append(eval_ann['pts_semantic_mask'])
+            pred_semantic_masks.append(
+                sinlge_pred_results['pts_semantic_mask'])
+
+        ret_dict = seg_eval(
+            gt_semantic_masks,
+            pred_semantic_masks,
+            label2cat,
+            ignore_index,
+            logger=logger)
+
+        return ret_dict
diff --git a/mmde/mmdet3d/evaluation/metrics/waymo_metric.py b/mmde/mmdet3d/evaluation/metrics/waymo_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdbc4a58dba00872a2531392591dd5b0a007addc
--- /dev/null
+++ b/mmde/mmdet3d/evaluation/metrics/waymo_metric.py
@@ -0,0 +1,365 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import tempfile
+from os import path as osp
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+from mmengine import Config
+from mmengine.device import get_device
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger, print_log
+
+from mmdet3d.models.layers import box3d_multiclass_nms
+from mmdet3d.registry import METRICS
+from mmdet3d.structures import (Box3DMode, CameraInstance3DBoxes,
+                                LiDARInstance3DBoxes, points_cam2img,
+                                xywhr2xyxyr)
+
+
+@METRICS.register_module()
+class WaymoMetric(BaseMetric):
+    """Waymo evaluation metric.
+
+    Args:
+        waymo_bin_file (str): The path of the annotation file in waymo format.
+        metric (str or List[str]): Metrics to be evaluated. Defaults to 'mAP'.
+        load_type (str): Type of loading mode during training.
+            - 'frame_based': Load all of the instances in the frame.
+            - 'mv_image_based': Load all of the instances in the frame and need
+              to convert to the FOV-based data type to support image-based
+              detector.
+            - 'fov_image_based': Only load the instances inside the default cam
+              and need to convert to the FOV-based data type to support image-
+              based detector.
+        result_prefix (str, optional): The prefix of result '*.bin' file,
+            including the file path and the prefix of filename, e.g.,
+            "a/b/prefix". If not specified, a temp file will be created.
+            Defaults to None.
+        format_only (bool): Format the output results without perform
+            evaluation. It is useful when you want to format the result to a
+            specific format and submit it to the test server.
+            Defaults to False.
+        nms_cfg (dict): The configuration of non-maximum suppression for
+            the mergence of multi-image predicted bboxes, only use when
+            load_type == 'mv_image_based'. Defaults to None.
+    """
+    num_cams = 5
+    default_prefix = 'Waymo metric'
+
+    def __init__(self,
+                 waymo_bin_file: str,
+                 metric: Union[str, List[str]] = 'mAP',
+                 load_type: str = 'frame_based',
+                 result_prefix: Optional[str] = None,
+                 format_only: bool = False,
+                 nms_cfg=None,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.waymo_bin_file = waymo_bin_file
+        self.metrics = metric if isinstance(metric, list) else [metric]
+        self.load_type = load_type
+        self.result_prefix = result_prefix
+        self.format_only = format_only
+        if self.format_only:
+            assert result_prefix is not None, 'result_prefix must be not '
+            'None when format_only is True, otherwise the result files will '
+            'be saved to a temp directory which will be cleaned up at the end.'
+        if nms_cfg is not None:
+            assert load_type == 'mv_image_based', 'nms_cfg in WaymoMetric '
+            'only use when load_type == \'mv_image_based\'.'
+            self.nms_cfg = Config(nms_cfg)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+
+        for data_sample in data_samples:
+            result = dict()
+            bboxes_3d = data_sample['pred_instances_3d']['bboxes_3d']
+            bboxes_3d.limit_yaw(offset=0.5, period=np.pi * 2)
+            scores_3d = data_sample['pred_instances_3d']['scores_3d']
+            labels_3d = data_sample['pred_instances_3d']['labels_3d']
+            # TODO: check lidar post-processing
+            if isinstance(bboxes_3d, CameraInstance3DBoxes):
+                box_corners = bboxes_3d.corners
+                cam2img = box_corners.new_tensor(
+                    np.array(data_sample['cam2img']))
+                box_corners_in_image = points_cam2img(box_corners, cam2img)
+                # box_corners_in_image: [N, 8, 2]
+                minxy = torch.min(box_corners_in_image, dim=1)[0]
+                maxxy = torch.max(box_corners_in_image, dim=1)[0]
+                # check minxy & maxxy
+                # if the projected 2d bbox has intersection
+                # with the image, we keep it, otherwise, we omit it.
+                img_shape = data_sample['img_shape']
+                valid_inds = ((minxy[:, 0] < img_shape[1]) &
+                              (minxy[:, 1] < img_shape[0]) & (maxxy[:, 0] > 0)
+                              & (maxxy[:, 1] > 0))
+
+                if valid_inds.sum() > 0:
+                    lidar2cam = data_sample['lidar2cam']
+                    bboxes_3d = bboxes_3d.convert_to(
+                        Box3DMode.LIDAR,
+                        np.linalg.inv(lidar2cam),
+                        correct_yaw=True)
+                    bboxes_3d = bboxes_3d[valid_inds]
+                    scores_3d = scores_3d[valid_inds]
+                    labels_3d = labels_3d[valid_inds]
+                else:
+                    bboxes_3d = torch.zeros([0, 7])
+                    scores_3d = torch.zeros([0])
+                    labels_3d = torch.zeros([0])
+            result['bboxes_3d'] = bboxes_3d.tensor.cpu().numpy()
+            result['scores_3d'] = scores_3d.cpu().numpy()
+            result['labels_3d'] = labels_3d.cpu().numpy()
+            result['sample_idx'] = data_sample['sample_idx']
+            result['context_name'] = data_sample['context_name']
+            result['timestamp'] = data_sample['timestamp']
+            self.results.append(result)
+
+    def compute_metrics(self, results: List[dict]) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (List[dict]): The processed results of the whole dataset.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        self.classes = self.dataset_meta['classes']
+
+        # different from kitti, waymo do not need to convert the ann file
+        # handle the mv_image_based load_mode
+        if self.load_type == 'mv_image_based':
+            assert len(results) % 5 == 0, 'The multi-view image-based results'
+            ' must be 5 times as large as the original frame-based results.'
+            frame_results = [
+                results[i:i + 5] for i in range(0, len(results), 5)
+            ]
+            results = self.merge_multi_view_boxes(frame_results)
+
+        if self.result_prefix is None:
+            eval_tmp_dir = tempfile.TemporaryDirectory()
+            result_prefix = osp.join(eval_tmp_dir.name, 'results')
+        else:
+            eval_tmp_dir = None
+            result_prefix = self.result_prefix
+
+        self.format_results(results, result_prefix=result_prefix)
+
+        metric_dict = {}
+
+        if self.format_only:
+            logger.info('results are saved in '
+                        f'{osp.dirname(self.result_prefix)}')
+            return metric_dict
+
+        for metric in self.metrics:
+            ap_dict = self.waymo_evaluate(
+                result_prefix, metric=metric, logger=logger)
+            metric_dict.update(ap_dict)
+        if eval_tmp_dir is not None:
+            eval_tmp_dir.cleanup()
+
+        return metric_dict
+
+    def waymo_evaluate(self,
+                       result_prefix: str,
+                       metric: Optional[str] = None,
+                       logger: Optional[MMLogger] = None) -> Dict[str, float]:
+        """Evaluation in Waymo protocol.
+
+        Args:
+            result_prefix (str): The location that stored the prediction
+                results.
+            metric (str, optional): Metric to be evaluated. Defaults to None.
+            logger (MMLogger, optional): Logger used for printing related
+                information during evaluation. Defaults to None.
+
+        Returns:
+            Dict[str, float]: Results of each evaluation metric.
+        """
+
+        import subprocess
+
+        if metric == 'mAP':
+            eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
+                f'compute_detection_metrics_main {result_prefix}.bin ' + \
+                f'{self.waymo_bin_file}'
+            print(eval_str)
+            ret_bytes = subprocess.check_output(eval_str, shell=True)
+            ret_texts = ret_bytes.decode('utf-8')
+            print_log(ret_texts, logger=logger)
+
+            ap_dict = {
+                'Vehicle/L1 mAP': 0,
+                'Vehicle/L1 mAPH': 0,
+                'Vehicle/L2 mAP': 0,
+                'Vehicle/L2 mAPH': 0,
+                'Pedestrian/L1 mAP': 0,
+                'Pedestrian/L1 mAPH': 0,
+                'Pedestrian/L2 mAP': 0,
+                'Pedestrian/L2 mAPH': 0,
+                'Sign/L1 mAP': 0,
+                'Sign/L1 mAPH': 0,
+                'Sign/L2 mAP': 0,
+                'Sign/L2 mAPH': 0,
+                'Cyclist/L1 mAP': 0,
+                'Cyclist/L1 mAPH': 0,
+                'Cyclist/L2 mAP': 0,
+                'Cyclist/L2 mAPH': 0,
+                'Overall/L1 mAP': 0,
+                'Overall/L1 mAPH': 0,
+                'Overall/L2 mAP': 0,
+                'Overall/L2 mAPH': 0
+            }
+            mAP_splits = ret_texts.split('mAP ')
+            mAPH_splits = ret_texts.split('mAPH ')
+            for idx, key in enumerate(ap_dict.keys()):
+                split_idx = int(idx / 2) + 1
+                if idx % 2 == 0:  # mAP
+                    ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
+                else:  # mAPH
+                    ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
+            ap_dict['Overall/L1 mAP'] = \
+                (ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] +
+                    ap_dict['Cyclist/L1 mAP']) / 3
+            ap_dict['Overall/L1 mAPH'] = \
+                (ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] +
+                    ap_dict['Cyclist/L1 mAPH']) / 3
+            ap_dict['Overall/L2 mAP'] = \
+                (ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] +
+                    ap_dict['Cyclist/L2 mAP']) / 3
+            ap_dict['Overall/L2 mAPH'] = \
+                (ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] +
+                    ap_dict['Cyclist/L2 mAPH']) / 3
+        elif metric == 'LET_mAP':
+            eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
+                f'compute_detection_let_metrics_main {result_prefix}.bin ' + \
+                f'{self.waymo_bin_file}'
+
+            print(eval_str)
+            ret_bytes = subprocess.check_output(eval_str, shell=True)
+            ret_texts = ret_bytes.decode('utf-8')
+
+            print_log(ret_texts, logger=logger)
+            ap_dict = {
+                'Vehicle mAPL': 0,
+                'Vehicle mAP': 0,
+                'Vehicle mAPH': 0,
+                'Pedestrian mAPL': 0,
+                'Pedestrian mAP': 0,
+                'Pedestrian mAPH': 0,
+                'Sign mAPL': 0,
+                'Sign mAP': 0,
+                'Sign mAPH': 0,
+                'Cyclist mAPL': 0,
+                'Cyclist mAP': 0,
+                'Cyclist mAPH': 0,
+                'Overall mAPL': 0,
+                'Overall mAP': 0,
+                'Overall mAPH': 0
+            }
+            mAPL_splits = ret_texts.split('mAPL ')
+            mAP_splits = ret_texts.split('mAP ')
+            mAPH_splits = ret_texts.split('mAPH ')
+            for idx, key in enumerate(ap_dict.keys()):
+                split_idx = int(idx / 3) + 1
+                if idx % 3 == 0:  # mAPL
+                    ap_dict[key] = float(mAPL_splits[split_idx].split(']')[0])
+                elif idx % 3 == 1:  # mAP
+                    ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
+                else:  # mAPH
+                    ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
+            ap_dict['Overall mAPL'] = \
+                (ap_dict['Vehicle mAPL'] + ap_dict['Pedestrian mAPL'] +
+                    ap_dict['Cyclist mAPL']) / 3
+            ap_dict['Overall mAP'] = \
+                (ap_dict['Vehicle mAP'] + ap_dict['Pedestrian mAP'] +
+                    ap_dict['Cyclist mAP']) / 3
+            ap_dict['Overall mAPH'] = \
+                (ap_dict['Vehicle mAPH'] + ap_dict['Pedestrian mAPH'] +
+                    ap_dict['Cyclist mAPH']) / 3
+        return ap_dict
+
+    def format_results(
+        self,
+        results: List[dict],
+        result_prefix: Optional[str] = None
+    ) -> Tuple[dict, Union[tempfile.TemporaryDirectory, None]]:
+        """Format the results to bin file.
+
+        Args:
+            results (List[dict]): Testing results of the dataset.
+            result_prefix (str, optional): The prefix of result file. It
+                includes the file path and the prefix of filename, e.g.,
+                "a/b/prefix". If not specified, a temp file will be created.
+                Defaults to None.
+        """
+        waymo_results_final_path = f'{result_prefix}.bin'
+
+        from ..functional.waymo_utils.prediction_to_waymo import \
+            Prediction2Waymo
+        converter = Prediction2Waymo(results, waymo_results_final_path,
+                                     self.classes)
+        converter.convert()
+
+    def merge_multi_view_boxes(self, frame_results: List[dict]) -> dict:
+        """Merge bounding boxes predicted from multi-view images.
+
+        Args:
+            box_dict_per_frame (List[dict]): The results of prediction for each
+                camera.
+            cam0_info (dict): Store the sample idx for the given frame.
+
+        Returns:
+            Dict: Merged results.
+        """
+        merged_results = []
+        for frame_result in frame_results:
+            merged_result = dict()
+            merged_result['sample_idx'] = frame_result[0]['sample_idx'] // 5
+            merged_result['context_name'] = frame_result[0]['context_name']
+            merged_result['timestamp'] = frame_result[0]['timestamp']
+            bboxes_3d, scores_3d, labels_3d = [], [], []
+            for result in frame_result:
+                assert result['timestamp'] == merged_result['timestamp']
+                bboxes_3d.append(result['bboxes_3d'])
+                scores_3d.append(result['scores_3d'])
+                labels_3d.append(result['labels_3d'])
+
+            bboxes_3d = np.concatenate(bboxes_3d)
+            scores_3d = np.concatenate(scores_3d)
+            labels_3d = np.concatenate(labels_3d)
+
+            device = get_device()
+            lidar_boxes3d = LiDARInstance3DBoxes(
+                torch.from_numpy(bboxes_3d).to(device))
+            scores = torch.from_numpy(scores_3d).to(device)
+            labels = torch.from_numpy(labels_3d).long().to(device)
+            nms_scores = scores.new_zeros(scores.shape[0],
+                                          len(self.classes) + 1)
+            indices = labels.new_tensor(list(range(scores.shape[0])))
+            nms_scores[indices, labels] = scores
+            lidar_boxes3d_for_nms = xywhr2xyxyr(lidar_boxes3d.bev)
+            boxes3d = lidar_boxes3d.tensor
+            bboxes_3d, scores_3d, labels_3d = box3d_multiclass_nms(
+                boxes3d, lidar_boxes3d_for_nms, nms_scores,
+                self.nms_cfg.score_thr, self.nms_cfg.max_per_frame,
+                self.nms_cfg)
+
+            merged_result['bboxes_3d'] = bboxes_3d.cpu().numpy()
+            merged_result['scores_3d'] = scores_3d.cpu().numpy()
+            merged_result['labels_3d'] = labels_3d.cpu().numpy()
+            merged_results.append(merged_result)
+        return merged_results
diff --git a/mmde/mmdet3d/models/__init__.py b/mmde/mmdet3d/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6d4b86391db581b5931872bd71bfa8c3698f259
--- /dev/null
+++ b/mmde/mmdet3d/models/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet3d.models.layers.fusion_layers import *  # noqa: F401,F403
+from .backbones import *  # noqa: F401,F403
+from .data_preprocessors import *  # noqa: F401,F403
+from .decode_heads import *  # noqa: F401,F403
+from .dense_heads import *  # noqa: F401,F403
+from .detectors import *  # noqa: F401,F403
+from .layers import *  # noqa: F401,F403
+from .losses import *  # noqa: F401,F403
+from .middle_encoders import *  # noqa: F401,F403
+from .necks import *  # noqa: F401,F403
+from .roi_heads import *  # noqa: F401,F403
+from .segmentors import *  # noqa: F401,F403
+from .test_time_augs import *  # noqa: F401,F403
+from .utils import *  # noqa: F401,F403
+from .voxel_encoders import *  # noqa: F401,F403
diff --git a/mmde/mmdet3d/models/backbones/__init__.py b/mmde/mmdet3d/models/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..64102bec1f3ea46cdd17fba2f3aa4a0cb447e622
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.backbones import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt
+
+from .cylinder3d import Asymm3DSpconv
+from .dgcnn import DGCNNBackbone
+from .dla import DLANet
+from .mink_resnet import MinkResNet
+from .minkunet_backbone import MinkUNetBackbone
+from .multi_backbone import MultiBackbone
+from .nostem_regnet import NoStemRegNet
+from .pointnet2_sa_msg import PointNet2SAMSG
+from .pointnet2_sa_ssg import PointNet2SASSG
+from .second import SECOND
+from .spvcnn_backone import MinkUNetBackboneV2, SPVCNNBackbone
+
+__all__ = [
+    'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'NoStemRegNet',
+    'SECOND', 'DGCNNBackbone', 'PointNet2SASSG', 'PointNet2SAMSG',
+    'MultiBackbone', 'DLANet', 'MinkResNet', 'Asymm3DSpconv',
+    'MinkUNetBackbone', 'SPVCNNBackbone', 'MinkUNetBackboneV2'
+]
diff --git a/mmde/mmdet3d/models/backbones/base_pointnet.py b/mmde/mmdet3d/models/backbones/base_pointnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..82342de7bacc991217859a232fea6d3a7bb92882
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/base_pointnet.py
@@ -0,0 +1,44 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from abc import ABCMeta
+from typing import Optional, Tuple
+
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet3d.utils import OptMultiConfig
+
+
+class BasePointNet(BaseModule, metaclass=ABCMeta):
+    """Base class for PointNet."""
+
+    def __init__(self,
+                 init_cfg: OptMultiConfig = None,
+                 pretrained: Optional[str] = None):
+        super(BasePointNet, self).__init__(init_cfg)
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be setting at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is a deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+
+    @staticmethod
+    def _split_point_feats(points: Tensor) -> Tuple[Tensor, Optional[Tensor]]:
+        """Split coordinates and features of input points.
+
+        Args:
+            points (torch.Tensor): Point coordinates with features,
+                with shape (B, N, 3 + input_feature_dim).
+
+        Returns:
+            torch.Tensor: Coordinates of input points.
+            torch.Tensor: Features of input points.
+        """
+        xyz = points[..., 0:3].contiguous()
+        if points.size(-1) > 3:
+            features = points[..., 3:].transpose(1, 2).contiguous()
+        else:
+            features = None
+
+        return xyz, features
diff --git a/mmde/mmdet3d/models/backbones/cylinder3d.py b/mmde/mmdet3d/models/backbones/cylinder3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..738aafa67c052b603ee306dbd972cfbc6c6b400f
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/cylinder3d.py
@@ -0,0 +1,481 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+r"""Modified from Cylinder3D.
+
+Please refer to `Cylinder3D github page
+<https://github.com/xinge008/Cylinder3D>`_ for details
+"""
+
+from typing import List, Optional
+
+import numpy as np
+import torch
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmcv.ops import (SparseConv3d, SparseConvTensor, SparseInverseConv3d,
+                      SubMConv3d)
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType
+
+
+class AsymmResBlock(BaseModule):
+    """Asymmetrical Residual Block.
+
+    Args:
+        in_channels (int): Input channels of the block.
+        out_channels (int): Output channels of the block.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for
+            normalization layer.
+        act_cfg (:obj:`ConfigDict` or dict): Config dict of activation layers.
+            Defaults to dict(type='LeakyReLU').
+        indice_key (str, optional): Name of indice tables. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: ConfigType,
+                 act_cfg: ConfigType = dict(type='LeakyReLU'),
+                 indice_key: Optional[str] = None):
+        super().__init__()
+
+        self.conv0_0 = SubMConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(1, 3, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key + 'bef')
+        self.act0_0 = build_activation_layer(act_cfg)
+        self.bn0_0 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.conv0_1 = SubMConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=(3, 1, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key + 'bef')
+        self.act0_1 = build_activation_layer(act_cfg)
+        self.bn0_1 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.conv1_0 = SubMConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(3, 1, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key + 'bef')
+        self.act1_0 = build_activation_layer(act_cfg)
+        self.bn1_0 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.conv1_1 = SubMConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=(1, 3, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key + 'bef')
+        self.act1_1 = build_activation_layer(act_cfg)
+        self.bn1_1 = build_norm_layer(norm_cfg, out_channels)[1]
+
+    def forward(self, x: SparseConvTensor) -> SparseConvTensor:
+        """Forward pass."""
+        shortcut = self.conv0_0(x)
+
+        shortcut.features = self.act0_0(shortcut.features)
+        shortcut.features = self.bn0_0(shortcut.features)
+
+        shortcut = self.conv0_1(shortcut)
+        shortcut.features = self.act0_1(shortcut.features)
+        shortcut.features = self.bn0_1(shortcut.features)
+
+        res = self.conv1_0(x)
+        res.features = self.act1_0(res.features)
+        res.features = self.bn1_0(res.features)
+
+        res = self.conv1_1(res)
+        res.features = self.act1_1(res.features)
+        res.features = self.bn1_1(res.features)
+
+        res.features = res.features + shortcut.features
+
+        return res
+
+
+class AsymmeDownBlock(BaseModule):
+    """Asymmetrical DownSample Block.
+
+    Args:
+       in_channels (int): Input channels of the block.
+       out_channels (int): Output channels of the block.
+       norm_cfg (:obj:`ConfigDict` or dict): Config dict for
+            normalization layer.
+       act_cfg (:obj:`ConfigDict` or dict): Config dict of activation layers.
+            Defaults to dict(type='LeakyReLU').
+       pooling (bool): Whether pooling features at the end of
+           block. Defaults: True.
+       height_pooling (bool): Whether pooling features at
+           the height dimension. Defaults: False.
+       indice_key (str, optional): Name of indice tables. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: ConfigType,
+                 act_cfg: ConfigType = dict(type='LeakyReLU'),
+                 pooling: bool = True,
+                 height_pooling: bool = False,
+                 indice_key: Optional[str] = None):
+        super().__init__()
+        self.pooling = pooling
+
+        self.conv0_0 = SubMConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(3, 1, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key + 'bef')
+        self.act0_0 = build_activation_layer(act_cfg)
+        self.bn0_0 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.conv0_1 = SubMConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=(1, 3, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key + 'bef')
+        self.act0_1 = build_activation_layer(act_cfg)
+        self.bn0_1 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.conv1_0 = SubMConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(1, 3, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key + 'bef')
+        self.act1_0 = build_activation_layer(act_cfg)
+        self.bn1_0 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.conv1_1 = SubMConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=(3, 1, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key + 'bef')
+        self.act1_1 = build_activation_layer(act_cfg)
+        self.bn1_1 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        if pooling:
+            if height_pooling:
+                self.pool = SparseConv3d(
+                    out_channels,
+                    out_channels,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    indice_key=indice_key,
+                    bias=False)
+            else:
+                self.pool = SparseConv3d(
+                    out_channels,
+                    out_channels,
+                    kernel_size=3,
+                    stride=(2, 2, 1),
+                    padding=1,
+                    indice_key=indice_key,
+                    bias=False)
+
+    def forward(self, x: SparseConvTensor) -> SparseConvTensor:
+        """Forward pass."""
+        shortcut = self.conv0_0(x)
+        shortcut.features = self.act0_0(shortcut.features)
+        shortcut.features = self.bn0_0(shortcut.features)
+
+        shortcut = self.conv0_1(shortcut)
+        shortcut.features = self.act0_1(shortcut.features)
+        shortcut.features = self.bn0_1(shortcut.features)
+
+        res = self.conv1_0(x)
+        res.features = self.act1_0(res.features)
+        res.features = self.bn1_0(res.features)
+
+        res = self.conv1_1(res)
+        res.features = self.act1_1(res.features)
+        res.features = self.bn1_1(res.features)
+
+        res.features = res.features + shortcut.features
+
+        if self.pooling:
+            pooled_res = self.pool(res)
+            return pooled_res, res
+        else:
+            return res
+
+
+class AsymmeUpBlock(BaseModule):
+    """Asymmetrical UpSample Block.
+
+    Args:
+        in_channels (int): Input channels of the block.
+        out_channels (int): Output channels of the block.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for
+                normalization layer.
+        act_cfg (:obj:`ConfigDict` or dict): Config dict of activation layers.
+                Defaults to dict(type='LeakyReLU').
+        indice_key (str, optional): Name of indice tables. Defaults to None.
+        up_key (str, optional): Name of indice tables used in
+            SparseInverseConv3d. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: ConfigType,
+                 act_cfg: ConfigType = dict(type='LeakyReLU'),
+                 indice_key: Optional[str] = None,
+                 up_key: Optional[str] = None):
+        super().__init__()
+
+        self.trans_conv = SubMConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(3, 3, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key + 'new_up')
+        self.trans_act = build_activation_layer(act_cfg)
+        self.trans_bn = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.conv1 = SubMConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=(1, 3, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key)
+        self.act1 = build_activation_layer(act_cfg)
+        self.bn1 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.conv2 = SubMConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=(3, 1, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key)
+        self.act2 = build_activation_layer(act_cfg)
+        self.bn2 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.conv3 = SubMConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=(3, 3, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key)
+        self.act3 = build_activation_layer(act_cfg)
+        self.bn3 = build_norm_layer(norm_cfg, out_channels)[1]
+
+        self.up_subm = SparseInverseConv3d(
+            out_channels,
+            out_channels,
+            kernel_size=3,
+            indice_key=up_key,
+            bias=False)
+
+    def forward(self, x: SparseConvTensor,
+                skip: SparseConvTensor) -> SparseConvTensor:
+        """Forward pass."""
+        x_trans = self.trans_conv(x)
+        x_trans.features = self.trans_act(x_trans.features)
+        x_trans.features = self.trans_bn(x_trans.features)
+
+        # upsample
+        up = self.up_subm(x_trans)
+
+        up.features = up.features + skip.features
+
+        up = self.conv1(up)
+        up.features = self.act1(up.features)
+        up.features = self.bn1(up.features)
+
+        up = self.conv2(up)
+        up.features = self.act2(up.features)
+        up.features = self.bn2(up.features)
+
+        up = self.conv3(up)
+        up.features = self.act3(up.features)
+        up.features = self.bn3(up.features)
+
+        return up
+
+
+class DDCMBlock(BaseModule):
+    """Dimension-Decomposition based Context Modeling.
+
+    Args:
+        in_channels (int): Input channels of the block.
+        out_channels (int): Output channels of the block.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for
+            normalization layer.
+        act_cfg (:obj:`ConfigDict` or dict): Config dict of activation layers.
+            Defaults to dict(type='Sigmoid').
+        indice_key (str, optional): Name of indice tables. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: ConfigType,
+                 act_cfg: ConfigType = dict(type='Sigmoid'),
+                 indice_key: Optional[str] = None):
+        super().__init__()
+
+        self.conv1 = SubMConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(3, 1, 1),
+            padding=1,
+            bias=False,
+            indice_key=indice_key)
+        self.bn1 = build_norm_layer(norm_cfg, out_channels)[1]
+        self.act1 = build_activation_layer(act_cfg)
+
+        self.conv2 = SubMConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(1, 3, 1),
+            padding=1,
+            bias=False,
+            indice_key=indice_key)
+        self.bn2 = build_norm_layer(norm_cfg, out_channels)[1]
+        self.act2 = build_activation_layer(act_cfg)
+
+        self.conv3 = SubMConv3d(
+            in_channels,
+            out_channels,
+            kernel_size=(1, 1, 3),
+            padding=1,
+            bias=False,
+            indice_key=indice_key)
+        self.bn3 = build_norm_layer(norm_cfg, out_channels)[1]
+        self.act3 = build_activation_layer(act_cfg)
+
+    def forward(self, x: SparseConvTensor) -> SparseConvTensor:
+        """Forward pass."""
+        shortcut = self.conv1(x)
+        shortcut.features = self.bn1(shortcut.features)
+        shortcut.features = self.act1(shortcut.features)
+
+        shortcut2 = self.conv2(x)
+        shortcut2.features = self.bn2(shortcut2.features)
+        shortcut2.features = self.act2(shortcut2.features)
+
+        shortcut3 = self.conv3(x)
+        shortcut3.features = self.bn3(shortcut3.features)
+        shortcut3.features = self.act3(shortcut3.features)
+        shortcut.features = shortcut.features + \
+            shortcut2.features + shortcut3.features
+
+        shortcut.features = shortcut.features * x.features
+
+        return shortcut
+
+
+@MODELS.register_module()
+class Asymm3DSpconv(BaseModule):
+    """Asymmetrical 3D convolution networks.
+
+    Args:
+        grid_size (int): Size of voxel grids.
+        input_channels (int): Input channels of the block.
+        base_channels (int): Initial size of feature channels before
+            feeding into Encoder-Decoder structure. Defaults to 16.
+        backbone_depth (int): The depth of backbone. The backbone contains
+            downblocks and upblocks with the number of backbone_depth.
+        height_pooing (List[bool]): List indicating which downblocks perform
+            height pooling.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01)).
+        init_cfg (dict, optional): Initialization config.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 grid_size: int,
+                 input_channels: int,
+                 base_channels: int = 16,
+                 backbone_depth: int = 4,
+                 height_pooing: List[bool] = [True, True, False, False],
+                 norm_cfg: ConfigType = dict(
+                     type='BN1d', eps=1e-3, momentum=0.01),
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        self.grid_size = grid_size
+        self.backbone_depth = backbone_depth
+        self.down_context = AsymmResBlock(
+            input_channels, base_channels, indice_key='pre', norm_cfg=norm_cfg)
+
+        self.down_block_list = torch.nn.ModuleList()
+        self.up_block_list = torch.nn.ModuleList()
+        for i in range(self.backbone_depth):
+            self.down_block_list.append(
+                AsymmeDownBlock(
+                    2**i * base_channels,
+                    2**(i + 1) * base_channels,
+                    height_pooling=height_pooing[i],
+                    indice_key='down' + str(i),
+                    norm_cfg=norm_cfg))
+            if i == self.backbone_depth - 1:
+                self.up_block_list.append(
+                    AsymmeUpBlock(
+                        2**(i + 1) * base_channels,
+                        2**(i + 1) * base_channels,
+                        up_key='down' + str(i),
+                        indice_key='up' + str(self.backbone_depth - 1 - i),
+                        norm_cfg=norm_cfg))
+            else:
+                self.up_block_list.append(
+                    AsymmeUpBlock(
+                        2**(i + 2) * base_channels,
+                        2**(i + 1) * base_channels,
+                        up_key='down' + str(i),
+                        indice_key='up' + str(self.backbone_depth - 1 - i),
+                        norm_cfg=norm_cfg))
+
+        self.ddcm = DDCMBlock(
+            2 * base_channels,
+            2 * base_channels,
+            indice_key='ddcm',
+            norm_cfg=norm_cfg)
+
+    def forward(self, voxel_features: Tensor, coors: Tensor,
+                batch_size: int) -> SparseConvTensor:
+        """Forward pass."""
+        coors = coors.int()
+        ret = SparseConvTensor(voxel_features, coors, np.array(self.grid_size),
+                               batch_size)
+        ret = self.down_context(ret)
+
+        down_skip_list = []
+        down_pool = ret
+        for i in range(self.backbone_depth):
+            down_pool, down_skip = self.down_block_list[i](down_pool)
+            down_skip_list.append(down_skip)
+
+        up = down_pool
+        for i in range(self.backbone_depth - 1, -1, -1):
+            up = self.up_block_list[i](up, down_skip_list[i])
+
+        ddcm = self.ddcm(up)
+        ddcm.features = torch.cat((ddcm.features, up.features), 1)
+
+        return ddcm
diff --git a/mmde/mmdet3d/models/backbones/dgcnn.py b/mmde/mmdet3d/models/backbones/dgcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a94625147a383717dc5c0f0c26625bbd63063d43
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/dgcnn.py
@@ -0,0 +1,102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence, Union
+
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models.layers import DGCNNFAModule, DGCNNGFModule
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class DGCNNBackbone(BaseModule):
+    """Backbone network for DGCNN.
+
+    Args:
+        in_channels (int): Input channels of point cloud.
+        num_samples (tuple[int], optional): The number of samples for knn or
+            ball query in each graph feature (GF) module.
+            Defaults to (20, 20, 20).
+        knn_modes (tuple[str], optional): Mode of KNN of each knn module.
+            Defaults to ('D-KNN', 'F-KNN', 'F-KNN').
+        radius (tuple[float], optional): Sampling radii of each GF module.
+            Defaults to (None, None, None).
+        gf_channels (tuple[tuple[int]], optional): Out channels of each mlp in
+            GF module. Defaults to ((64, 64), (64, 64), (64, )).
+        fa_channels (tuple[int], optional): Out channels of each mlp in FA
+            module. Defaults to (1024, ).
+        act_cfg (dict, optional): Config of activation layer.
+            Defaults to dict(type='ReLU').
+        init_cfg (dict, optional): Initialization config.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 num_samples: Sequence[int] = (20, 20, 20),
+                 knn_modes: Sequence[str] = ('D-KNN', 'F-KNN', 'F-KNN'),
+                 radius: Sequence[Union[float, None]] = (None, None, None),
+                 gf_channels: Sequence[Sequence[int]] = ((64, 64), (64, 64),
+                                                         (64, )),
+                 fa_channels: Sequence[int] = (1024, ),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(init_cfg=init_cfg)
+        self.num_gf = len(gf_channels)
+
+        assert len(num_samples) == len(knn_modes) == len(radius) == len(
+            gf_channels), 'Num_samples, knn_modes, radius and gf_channels \
+            should have the same length.'
+
+        self.GF_modules = nn.ModuleList()
+        gf_in_channel = in_channels * 2
+        skip_channel_list = [gf_in_channel]  # input channel list
+
+        for gf_index in range(self.num_gf):
+            cur_gf_mlps = list(gf_channels[gf_index])
+            cur_gf_mlps = [gf_in_channel] + cur_gf_mlps
+            gf_out_channel = cur_gf_mlps[-1]
+
+            self.GF_modules.append(
+                DGCNNGFModule(
+                    mlp_channels=cur_gf_mlps,
+                    num_sample=num_samples[gf_index],
+                    knn_mode=knn_modes[gf_index],
+                    radius=radius[gf_index],
+                    act_cfg=act_cfg))
+            skip_channel_list.append(gf_out_channel)
+            gf_in_channel = gf_out_channel * 2
+
+        fa_in_channel = sum(skip_channel_list[1:])
+        cur_fa_mlps = list(fa_channels)
+        cur_fa_mlps = [fa_in_channel] + cur_fa_mlps
+
+        self.FA_module = DGCNNFAModule(
+            mlp_channels=cur_fa_mlps, act_cfg=act_cfg)
+
+    def forward(self, points: Tensor) -> dict:
+        """Forward pass.
+
+        Args:
+            points (torch.Tensor): point coordinates with features,
+                with shape (B, N, in_channels).
+
+        Returns:
+            dict[str, list[torch.Tensor]]: Outputs after graph feature (GF) and
+                feature aggregation (FA) modules.
+
+                - gf_points (list[torch.Tensor]): Outputs after each GF module.
+                - fa_points (torch.Tensor): Outputs after FA module.
+        """
+        gf_points = [points]
+
+        for i in range(self.num_gf):
+            cur_points = self.GF_modules[i](gf_points[i])
+            gf_points.append(cur_points)
+
+        fa_points = self.FA_module(gf_points)
+
+        out = dict(gf_points=gf_points, fa_points=fa_points)
+        return out
diff --git a/mmde/mmdet3d/models/backbones/dla.py b/mmde/mmdet3d/models/backbones/dla.py
new file mode 100644
index 0000000000000000000000000000000000000000..effc6bf36c64910049c943a1cb344c5d20d6e63b
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/dla.py
@@ -0,0 +1,453 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule
+from torch import Tensor, nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+
+
+def dla_build_norm_layer(cfg: ConfigType,
+                         num_features: int) -> Tuple[str, nn.Module]:
+    """Build normalization layer specially designed for DLANet.
+
+    Args:
+        cfg (dict): The norm layer config, which should contain:
+
+            - type (str): Layer type.
+            - layer args: Args needed to instantiate a norm layer.
+            - requires_grad (bool, optional): Whether stop gradient updates.
+        num_features (int): Number of input channels.
+
+
+    Returns:
+        Function: Build normalization layer in mmcv.
+    """
+    cfg_ = cfg.copy()
+    if cfg_['type'] == 'GN':
+        if num_features % 32 == 0:
+            return build_norm_layer(cfg_, num_features)
+        else:
+            assert 'num_groups' in cfg_
+            cfg_['num_groups'] = cfg_['num_groups'] // 2
+            return build_norm_layer(cfg_, num_features)
+    else:
+        return build_norm_layer(cfg_, num_features)
+
+
+class BasicBlock(BaseModule):
+    """BasicBlock in DLANet.
+
+    Args:
+        in_channels (int): Input feature channel.
+        out_channels (int): Output feature channel.
+        norm_cfg (dict): Dictionary to construct and config
+            norm layer.
+        conv_cfg (dict): Dictionary to construct and config
+            conv layer.
+        stride (int, optional): Conv stride. Default: 1.
+        dilation (int, optional): Conv dilation. Default: 1.
+        init_cfg (dict, optional): Initialization config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: ConfigType,
+                 conv_cfg: ConfigType,
+                 stride: int = 1,
+                 dilation: int = 1,
+                 init_cfg: OptMultiConfig = None):
+        super(BasicBlock, self).__init__(init_cfg)
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            in_channels,
+            out_channels,
+            3,
+            stride=stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+        self.norm1 = dla_build_norm_layer(norm_cfg, out_channels)[1]
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = build_conv_layer(
+            conv_cfg,
+            out_channels,
+            out_channels,
+            3,
+            stride=1,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+        self.norm2 = dla_build_norm_layer(norm_cfg, out_channels)[1]
+        self.stride = stride
+
+    def forward(self, x: Tensor, identity: Optional[Tensor] = None) -> Tensor:
+        """Forward function."""
+
+        if identity is None:
+            identity = x
+        out = self.conv1(x)
+        out = self.norm1(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        out = self.norm2(out)
+        out += identity
+        out = self.relu(out)
+
+        return out
+
+
+class Root(BaseModule):
+    """Root in DLANet.
+
+    Args:
+        in_channels (int): Input feature channel.
+        out_channels (int): Output feature channel.
+        norm_cfg (dict): Dictionary to construct and config
+            norm layer.
+        conv_cfg (dict): Dictionary to construct and config
+            conv layer.
+        kernel_size (int): Size of convolution kernel.
+        add_identity (bool): Whether to add identity in root.
+        init_cfg (dict, optional): Initialization config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: ConfigType,
+                 conv_cfg: ConfigType,
+                 kernel_size: int,
+                 add_identity: bool,
+                 init_cfg: OptMultiConfig = None):
+        super(Root, self).__init__(init_cfg)
+        self.conv = build_conv_layer(
+            conv_cfg,
+            in_channels,
+            out_channels,
+            1,
+            stride=1,
+            padding=(kernel_size - 1) // 2,
+            bias=False)
+        self.norm = dla_build_norm_layer(norm_cfg, out_channels)[1]
+        self.relu = nn.ReLU(inplace=True)
+        self.add_identity = add_identity
+
+    def forward(self, feat_list: List[Tensor]) -> Tensor:
+        """Forward function.
+
+        Args:
+            feat_list (list[torch.Tensor]): Output features from
+                multiple layers.
+        """
+        children = feat_list
+        x = self.conv(torch.cat(feat_list, 1))
+        x = self.norm(x)
+        if self.add_identity:
+            x += children[0]
+        x = self.relu(x)
+
+        return x
+
+
+class Tree(BaseModule):
+    """Tree in DLANet.
+
+    Args:
+        levels (int): The level of the tree.
+        block (nn.Module): The block module in tree.
+        in_channels: Input feature channel.
+        out_channels: Output feature channel.
+        norm_cfg (dict): Dictionary to construct and config
+            norm layer.
+        conv_cfg (dict): Dictionary to construct and config
+            conv layer.
+        stride (int, optional): Convolution stride.
+            Default: 1.
+        level_root (bool, optional): whether belongs to the
+            root layer.
+        root_dim (int, optional): Root input feature channel.
+        root_kernel_size (int, optional): Size of root
+            convolution kernel. Default: 1.
+        dilation (int, optional): Conv dilation. Default: 1.
+        add_identity (bool, optional): Whether to add
+            identity in root. Default: False.
+        init_cfg (dict, optional): Initialization config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 levels: int,
+                 block: nn.Module,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: ConfigType,
+                 conv_cfg: ConfigType,
+                 stride: int = 1,
+                 level_root: bool = False,
+                 root_dim: Optional[int] = None,
+                 root_kernel_size: int = 1,
+                 dilation: int = 1,
+                 add_identity: bool = False,
+                 init_cfg: OptMultiConfig = None):
+        super(Tree, self).__init__(init_cfg)
+        if root_dim is None:
+            root_dim = 2 * out_channels
+        if level_root:
+            root_dim += in_channels
+        if levels == 1:
+            self.root = Root(root_dim, out_channels, norm_cfg, conv_cfg,
+                             root_kernel_size, add_identity)
+            self.tree1 = block(
+                in_channels,
+                out_channels,
+                norm_cfg,
+                conv_cfg,
+                stride,
+                dilation=dilation)
+            self.tree2 = block(
+                out_channels,
+                out_channels,
+                norm_cfg,
+                conv_cfg,
+                1,
+                dilation=dilation)
+        else:
+            self.tree1 = Tree(
+                levels - 1,
+                block,
+                in_channels,
+                out_channels,
+                norm_cfg,
+                conv_cfg,
+                stride,
+                root_dim=None,
+                root_kernel_size=root_kernel_size,
+                dilation=dilation,
+                add_identity=add_identity)
+            self.tree2 = Tree(
+                levels - 1,
+                block,
+                out_channels,
+                out_channels,
+                norm_cfg,
+                conv_cfg,
+                root_dim=root_dim + out_channels,
+                root_kernel_size=root_kernel_size,
+                dilation=dilation,
+                add_identity=add_identity)
+        self.level_root = level_root
+        self.root_dim = root_dim
+        self.downsample = None
+        self.project = None
+        self.levels = levels
+        if stride > 1:
+            self.downsample = nn.MaxPool2d(stride, stride=stride)
+        if in_channels != out_channels:
+            self.project = nn.Sequential(
+                build_conv_layer(
+                    conv_cfg,
+                    in_channels,
+                    out_channels,
+                    1,
+                    stride=1,
+                    bias=False),
+                dla_build_norm_layer(norm_cfg, out_channels)[1])
+
+    def forward(self,
+                x: Tensor,
+                identity: Optional[Tensor] = None,
+                children: Optional[List[Tensor]] = None) -> Tensor:
+        children = [] if children is None else children
+        bottom = self.downsample(x) if self.downsample else x
+        identity = self.project(bottom) if self.project else bottom
+        if self.level_root:
+            children.append(bottom)
+        x1 = self.tree1(x, identity)
+        if self.levels == 1:
+            x2 = self.tree2(x1)
+            feat_list = [x2, x1] + children
+            x = self.root(feat_list)
+        else:
+            children.append(x1)
+            x = self.tree2(x1, children=children)
+        return x
+
+
+@MODELS.register_module()
+class DLANet(BaseModule):
+    r"""`DLA backbone <https://arxiv.org/abs/1707.06484>`_.
+
+    Args:
+        depth (int): Depth of DLA. Default: 34.
+        in_channels (int, optional): Number of input image channels.
+            Default: 3.
+        norm_cfg (dict, optional): Dictionary to construct and config
+            norm layer. Default: None.
+        conv_cfg (dict, optional): Dictionary to construct and config
+            conv layer. Default: None.
+        layer_with_level_root (list[bool], optional): Whether to apply
+            level_root in each DLA layer, this is only used for
+            tree levels. Default: (False, True, True, True).
+        with_identity_root (bool, optional): Whether to add identity
+            in root layer. Default: False.
+        pretrained (str, optional): model pretrained path.
+            Default: None.
+        init_cfg (dict or list[dict], optional): Initialization
+            config dict. Default: None
+    """
+    arch_settings = {
+        34: (BasicBlock, (1, 1, 1, 2, 2, 1), (16, 32, 64, 128, 256, 512)),
+    }
+
+    def __init__(self,
+                 depth: int,
+                 in_channels: int = 3,
+                 out_indices: Sequence[int] = (0, 1, 2, 3, 4, 5),
+                 frozen_stages: int = -1,
+                 norm_cfg: OptConfigType = None,
+                 conv_cfg: OptConfigType = None,
+                 layer_with_level_root: Sequence[bool] = (False, True, True,
+                                                          True),
+                 with_identity_root: bool = False,
+                 pretrained: Optional[str] = None,
+                 init_cfg: OptMultiConfig = None):
+        super(DLANet, self).__init__(init_cfg)
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalida depth {depth} for DLA')
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be setting at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is a deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+
+        block, levels, channels = self.arch_settings[depth]
+        self.channels = channels
+        self.num_levels = len(levels)
+        self.frozen_stages = frozen_stages
+        self.out_indices = out_indices
+        assert max(out_indices) < self.num_levels
+        self.base_layer = nn.Sequential(
+            build_conv_layer(
+                conv_cfg,
+                in_channels,
+                channels[0],
+                7,
+                stride=1,
+                padding=3,
+                bias=False),
+            dla_build_norm_layer(norm_cfg, channels[0])[1],
+            nn.ReLU(inplace=True))
+
+        # DLANet first uses two conv layers then uses several
+        # Tree layers
+        for i in range(2):
+            level_layer = self._make_conv_level(
+                channels[0],
+                channels[i],
+                levels[i],
+                norm_cfg,
+                conv_cfg,
+                stride=i + 1)
+            layer_name = f'level{i}'
+            self.add_module(layer_name, level_layer)
+
+        for i in range(2, self.num_levels):
+            dla_layer = Tree(
+                levels[i],
+                block,
+                channels[i - 1],
+                channels[i],
+                norm_cfg,
+                conv_cfg,
+                2,
+                level_root=layer_with_level_root[i - 2],
+                add_identity=with_identity_root)
+            layer_name = f'level{i}'
+            self.add_module(layer_name, dla_layer)
+
+        self._freeze_stages()
+
+    def _make_conv_level(self,
+                         in_channels: int,
+                         out_channels: int,
+                         num_convs: int,
+                         norm_cfg: ConfigType,
+                         conv_cfg: ConfigType,
+                         stride: int = 1,
+                         dilation: int = 1) -> nn.Sequential:
+        """Conv modules.
+
+        Args:
+            in_channels (int): Input feature channel.
+            out_channels (int): Output feature channel.
+            num_convs (int): Number of Conv module.
+            norm_cfg (dict): Dictionary to construct and config
+                norm layer.
+            conv_cfg (dict): Dictionary to construct and config
+                conv layer.
+            stride (int, optional): Conv stride. Default: 1.
+            dilation (int, optional): Conv dilation. Default: 1.
+        """
+        modules = []
+        for i in range(num_convs):
+            modules.extend([
+                build_conv_layer(
+                    conv_cfg,
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=stride if i == 0 else 1,
+                    padding=dilation,
+                    bias=False,
+                    dilation=dilation),
+                dla_build_norm_layer(norm_cfg, out_channels)[1],
+                nn.ReLU(inplace=True)
+            ])
+            in_channels = out_channels
+        return nn.Sequential(*modules)
+
+    def _freeze_stages(self) -> None:
+        if self.frozen_stages >= 0:
+            self.base_layer.eval()
+            for param in self.base_layer.parameters():
+                param.requires_grad = False
+
+            for i in range(2):
+                m = getattr(self, f'level{i}')
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'level{i+1}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def forward(self, x: Tensor) -> Tuple[Tensor, ...]:
+        outs = []
+        x = self.base_layer(x)
+        for i in range(self.num_levels):
+            x = getattr(self, 'level{}'.format(i))(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
diff --git a/mmde/mmdet3d/models/backbones/mink_resnet.py b/mmde/mmdet3d/models/backbones/mink_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..91ff10c314acee03c58e46ea5f022b2deac7edff
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/mink_resnet.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Follow https://github.com/NVIDIA/MinkowskiEngine/blob/master/examples/resnet.py # noqa
+# and mmcv.cnn.ResNet
+from typing import List, Union
+
+try:
+    import MinkowskiEngine as ME
+    from MinkowskiEngine import SparseTensor
+    from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck
+except ImportError:
+    # blocks are used in the static part of MinkResNet
+    ME = BasicBlock = Bottleneck = SparseTensor = None
+
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class MinkResNet(BaseModule):
+    r"""Minkowski ResNet backbone. See `4D Spatio-Temporal ConvNets
+    <https://arxiv.org/abs/1904.08755>`_ for more details.
+
+    Args:
+        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+        in_channels (int): Number of input channels, 3 for RGB.
+        num_stages (int): Resnet stages. Defaults to 4.
+        pool (bool): Whether to add max pooling after first conv.
+            Defaults to True.
+    """
+    arch_settings = {
+        18: (BasicBlock, (2, 2, 2, 2)),
+        34: (BasicBlock, (3, 4, 6, 3)),
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 depth: int,
+                 in_channels: int,
+                 num_stages: int = 4,
+                 pool: bool = True):
+        super(MinkResNet, self).__init__()
+        if ME is None:
+            raise ImportError(
+                'Please follow `get_started.md` to install MinkowskiEngine.`')
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalid depth {depth} for resnet')
+        assert 4 >= num_stages >= 1
+        block, stage_blocks = self.arch_settings[depth]
+        stage_blocks = stage_blocks[:num_stages]
+        self.num_stages = num_stages
+        self.pool = pool
+
+        self.inplanes = 64
+        self.conv1 = ME.MinkowskiConvolution(
+            in_channels, self.inplanes, kernel_size=3, stride=2, dimension=3)
+        # May be BatchNorm is better, but we follow original implementation.
+        self.norm1 = ME.MinkowskiInstanceNorm(self.inplanes)
+        self.relu = ME.MinkowskiReLU(inplace=True)
+        if self.pool:
+            self.maxpool = ME.MinkowskiMaxPooling(
+                kernel_size=2, stride=2, dimension=3)
+
+        for i in range(len(stage_blocks)):
+            setattr(
+                self, f'layer{i + 1}',
+                self._make_layer(block, 64 * 2**i, stage_blocks[i], stride=2))
+
+    def init_weights(self):
+        """Initialize weights."""
+        for m in self.modules():
+            if isinstance(m, ME.MinkowskiConvolution):
+                ME.utils.kaiming_normal_(
+                    m.kernel, mode='fan_out', nonlinearity='relu')
+
+            if isinstance(m, ME.MinkowskiBatchNorm):
+                nn.init.constant_(m.bn.weight, 1)
+                nn.init.constant_(m.bn.bias, 0)
+
+    def _make_layer(self, block: Union[BasicBlock, Bottleneck], planes: int,
+                    blocks: int, stride: int) -> nn.Module:
+        """Make single level of residual blocks.
+
+        Args:
+            block (BasicBlock | Bottleneck): Residual block class.
+            planes (int): Number of convolution filters.
+            blocks (int): Number of blocks in the layers.
+            stride (int): Stride of the first convolutional layer.
+
+        Returns:
+            nn.Module: With residual blocks.
+        """
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                ME.MinkowskiConvolution(
+                    self.inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    dimension=3),
+                ME.MinkowskiBatchNorm(planes * block.expansion))
+        layers = []
+        layers.append(
+            block(
+                self.inplanes,
+                planes,
+                stride=stride,
+                downsample=downsample,
+                dimension=3))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes, stride=1, dimension=3))
+        return nn.Sequential(*layers)
+
+    def forward(self, x: SparseTensor) -> List[SparseTensor]:
+        """Forward pass of ResNet.
+
+        Args:
+            x (ME.SparseTensor): Input sparse tensor.
+
+        Returns:
+            list[ME.SparseTensor]: Output sparse tensors.
+        """
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu(x)
+        if self.pool:
+            x = self.maxpool(x)
+        outs = []
+        for i in range(self.num_stages):
+            x = getattr(self, f'layer{i + 1}')(x)
+            outs.append(x)
+        return outs
diff --git a/mmde/mmdet3d/models/backbones/minkunet_backbone.py b/mmde/mmdet3d/models/backbones/minkunet_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae1782231844ee4d6880b1ea68910d9dce0cc9b0
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/minkunet_backbone.py
@@ -0,0 +1,245 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from functools import partial
+from typing import List
+
+import torch
+from mmengine.model import BaseModule
+from mmengine.registry import MODELS
+from torch import Tensor, nn
+
+from mmdet3d.models.layers.minkowski_engine_block import (
+    IS_MINKOWSKI_ENGINE_AVAILABLE, MinkowskiBasicBlock, MinkowskiBottleneck,
+    MinkowskiConvModule)
+from mmdet3d.models.layers.sparse_block import (SparseBasicBlock,
+                                                SparseBottleneck,
+                                                make_sparse_convmodule,
+                                                replace_feature)
+from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+from mmdet3d.models.layers.torchsparse import IS_TORCHSPARSE_AVAILABLE
+from mmdet3d.models.layers.torchsparse_block import (TorchSparseBasicBlock,
+                                                     TorchSparseBottleneck,
+                                                     TorchSparseConvModule)
+from mmdet3d.utils import OptMultiConfig
+
+if IS_SPCONV2_AVAILABLE:
+    from spconv.pytorch import SparseConvTensor
+else:
+    from mmcv.ops import SparseConvTensor
+
+if IS_TORCHSPARSE_AVAILABLE:
+    import torchsparse
+
+if IS_MINKOWSKI_ENGINE_AVAILABLE:
+    import MinkowskiEngine as ME
+
+
+@MODELS.register_module()
+class MinkUNetBackbone(BaseModule):
+    r"""MinkUNet backbone with TorchSparse backend.
+
+    Refer to `implementation code <https://github.com/mit-han-lab/spvnas>`_.
+
+    Args:
+        in_channels (int): Number of input voxel feature channels.
+            Defaults to 4.
+        base_channels (int): The input channels for first encoder layer.
+            Defaults to 32.
+        num_stages (int): Number of stages in encoder and decoder.
+            Defaults to 4.
+        encoder_channels (List[int]): Convolutional channels of each encode
+            layer. Defaults to [32, 64, 128, 256].
+        encoder_blocks (List[int]): Number of blocks in each encode layer.
+        decoder_channels (List[int]): Convolutional channels of each decode
+            layer. Defaults to [256, 128, 96, 96].
+        decoder_blocks (List[int]): Number of blocks in each decode layer.
+        block_type (str): Type of block in encoder and decoder.
+        sparseconv_backend (str): Sparse convolutional backend.
+        init_cfg (dict or :obj:`ConfigDict` or List[dict or :obj:`ConfigDict`]
+            , optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels: int = 4,
+                 base_channels: int = 32,
+                 num_stages: int = 4,
+                 encoder_channels: List[int] = [32, 64, 128, 256],
+                 encoder_blocks: List[int] = [2, 2, 2, 2],
+                 decoder_channels: List[int] = [256, 128, 96, 96],
+                 decoder_blocks: List[int] = [2, 2, 2, 2],
+                 block_type: str = 'basic',
+                 sparseconv_backend: str = 'torchsparse',
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg)
+        assert num_stages == len(encoder_channels) == len(decoder_channels)
+        assert sparseconv_backend in [
+            'torchsparse', 'spconv', 'minkowski'
+        ], f'sparseconv backend: {sparseconv_backend} not supported.'
+        self.num_stages = num_stages
+        self.sparseconv_backend = sparseconv_backend
+        if sparseconv_backend == 'torchsparse':
+            assert IS_TORCHSPARSE_AVAILABLE, \
+                'Please follow `get_started.md` to install Torchsparse.`'
+            input_conv = TorchSparseConvModule
+            encoder_conv = TorchSparseConvModule
+            decoder_conv = TorchSparseConvModule
+            residual_block = TorchSparseBasicBlock if block_type == 'basic' \
+                else TorchSparseBottleneck
+            # for torchsparse, residual branch will be implemented internally
+            residual_branch = None
+        elif sparseconv_backend == 'spconv':
+            if not IS_SPCONV2_AVAILABLE:
+                warnings.warn('Spconv 2.x is not available,'
+                              'turn to use spconv 1.x in mmcv.')
+            input_conv = partial(
+                make_sparse_convmodule, conv_type='SubMConv3d')
+            encoder_conv = partial(
+                make_sparse_convmodule, conv_type='SparseConv3d')
+            decoder_conv = partial(
+                make_sparse_convmodule, conv_type='SparseInverseConv3d')
+            residual_block = SparseBasicBlock if block_type == 'basic' \
+                else SparseBottleneck
+            residual_branch = partial(
+                make_sparse_convmodule,
+                conv_type='SubMConv3d',
+                order=('conv', 'norm'))
+        elif sparseconv_backend == 'minkowski':
+            assert IS_MINKOWSKI_ENGINE_AVAILABLE, \
+                'Please follow `get_started.md` to install Minkowski Engine.`'
+            input_conv = MinkowskiConvModule
+            encoder_conv = MinkowskiConvModule
+            decoder_conv = partial(
+                MinkowskiConvModule,
+                conv_cfg=dict(type='MinkowskiConvNdTranspose'))
+            residual_block = MinkowskiBasicBlock if block_type == 'basic' \
+                else MinkowskiBottleneck
+            residual_branch = partial(MinkowskiConvModule, act_cfg=None)
+
+        self.conv_input = nn.Sequential(
+            input_conv(
+                in_channels,
+                base_channels,
+                kernel_size=3,
+                padding=1,
+                indice_key='subm0'),
+            input_conv(
+                base_channels,
+                base_channels,
+                kernel_size=3,
+                padding=1,
+                indice_key='subm0'))
+
+        self.encoder = nn.ModuleList()
+        self.decoder = nn.ModuleList()
+        encoder_channels.insert(0, base_channels)
+        decoder_channels.insert(0, encoder_channels[-1])
+
+        for i in range(num_stages):
+            encoder_layer = [
+                encoder_conv(
+                    encoder_channels[i],
+                    encoder_channels[i],
+                    kernel_size=2,
+                    stride=2,
+                    indice_key=f'spconv{i+1}')
+            ]
+            for j in range(encoder_blocks[i]):
+                if j == 0 and encoder_channels[i] != encoder_channels[i + 1]:
+                    encoder_layer.append(
+                        residual_block(
+                            encoder_channels[i],
+                            encoder_channels[i + 1],
+                            downsample=residual_branch(
+                                encoder_channels[i],
+                                encoder_channels[i + 1],
+                                kernel_size=1)
+                            if residual_branch is not None else None,
+                            indice_key=f'subm{i+1}'))
+                else:
+                    encoder_layer.append(
+                        residual_block(
+                            encoder_channels[i + 1],
+                            encoder_channels[i + 1],
+                            indice_key=f'subm{i+1}'))
+            self.encoder.append(nn.Sequential(*encoder_layer))
+
+            decoder_layer = [
+                decoder_conv(
+                    decoder_channels[i],
+                    decoder_channels[i + 1],
+                    kernel_size=2,
+                    stride=2,
+                    transposed=True,
+                    indice_key=f'spconv{num_stages-i}')
+            ]
+            for j in range(decoder_blocks[i]):
+                if j == 0:
+                    decoder_layer.append(
+                        residual_block(
+                            decoder_channels[i + 1] + encoder_channels[-2 - i],
+                            decoder_channels[i + 1],
+                            downsample=residual_branch(
+                                decoder_channels[i + 1] +
+                                encoder_channels[-2 - i],
+                                decoder_channels[i + 1],
+                                kernel_size=1)
+                            if residual_branch is not None else None,
+                            indice_key=f'subm{num_stages-i-1}'))
+                else:
+                    decoder_layer.append(
+                        residual_block(
+                            decoder_channels[i + 1],
+                            decoder_channels[i + 1],
+                            indice_key=f'subm{num_stages-i-1}'))
+            self.decoder.append(
+                nn.ModuleList(
+                    [decoder_layer[0],
+                     nn.Sequential(*decoder_layer[1:])]))
+
+    def forward(self, voxel_features: Tensor, coors: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            voxel_features (Tensor): Voxel features in shape (N, C).
+            coors (Tensor): Coordinates in shape (N, 4),
+                the columns in the order of (x_idx, y_idx, z_idx, batch_idx).
+
+        Returns:
+            Tensor: Backbone features.
+        """
+        if self.sparseconv_backend == 'torchsparse':
+            x = torchsparse.SparseTensor(voxel_features, coors)
+        elif self.sparseconv_backend == 'spconv':
+            spatial_shape = coors.max(0)[0][1:] + 1
+            batch_size = int(coors[-1, 0]) + 1
+            x = SparseConvTensor(voxel_features, coors, spatial_shape,
+                                 batch_size)
+        elif self.sparseconv_backend == 'minkowski':
+            x = ME.SparseTensor(voxel_features, coors)
+
+        x = self.conv_input(x)
+        laterals = [x]
+        for encoder_layer in self.encoder:
+            x = encoder_layer(x)
+            laterals.append(x)
+        laterals = laterals[:-1][::-1]
+
+        decoder_outs = []
+        for i, decoder_layer in enumerate(self.decoder):
+            x = decoder_layer[0](x)
+
+            if self.sparseconv_backend == 'torchsparse':
+                x = torchsparse.cat((x, laterals[i]))
+            elif self.sparseconv_backend == 'spconv':
+                x = replace_feature(
+                    x, torch.cat((x.features, laterals[i].features), dim=1))
+            elif self.sparseconv_backend == 'minkowski':
+                x = ME.cat(x, laterals[i])
+
+            x = decoder_layer[1](x)
+            decoder_outs.append(x)
+
+        if self.sparseconv_backend == 'spconv':
+            return decoder_outs[-1].features
+        else:
+            return decoder_outs[-1].F
diff --git a/mmde/mmdet3d/models/backbones/multi_backbone.py b/mmde/mmdet3d/models/backbones/multi_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca0ccea36464a728d360ab1ca66c7c10f1aebe86
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/multi_backbone.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import torch
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor, nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class MultiBackbone(BaseModule):
+    """MultiBackbone with different configs.
+
+    Args:
+        num_streams (int): The number of backbones.
+        backbones (list or dict): A list of backbone configs.
+        aggregation_mlp_channels (list[int]): Specify the mlp layers
+            for feature aggregation.
+        conv_cfg (dict): Config dict of convolutional layers.
+        norm_cfg (dict): Config dict of normalization layers.
+        act_cfg (dict): Config dict of activation layers.
+        suffixes (list): A list of suffixes to rename the return dict
+            for each backbone.
+    """
+
+    def __init__(self,
+                 num_streams: int,
+                 backbones: Union[List[dict], Dict],
+                 aggregation_mlp_channels: Optional[Sequence[int]] = None,
+                 conv_cfg: ConfigType = dict(type='Conv1d'),
+                 norm_cfg: ConfigType = dict(
+                     type='BN1d', eps=1e-5, momentum=0.01),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 suffixes: Tuple[str] = ('net0', 'net1'),
+                 init_cfg: OptMultiConfig = None,
+                 pretrained: Optional[str] = None,
+                 **kwargs) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert isinstance(backbones, dict) or isinstance(backbones, list)
+        if isinstance(backbones, dict):
+            backbones_list = []
+            for ind in range(num_streams):
+                backbones_list.append(copy.deepcopy(backbones))
+            backbones = backbones_list
+
+        assert len(backbones) == num_streams
+        assert len(suffixes) == num_streams
+
+        self.backbone_list = nn.ModuleList()
+        # Rename the ret_dict with different suffixs.
+        self.suffixes = suffixes
+
+        out_channels = 0
+
+        for backbone_cfg in backbones:
+            out_channels += backbone_cfg['fp_channels'][-1][-1]
+            self.backbone_list.append(MODELS.build(backbone_cfg))
+
+        # Feature aggregation layers
+        if aggregation_mlp_channels is None:
+            aggregation_mlp_channels = [
+                out_channels, out_channels // 2,
+                out_channels // len(self.backbone_list)
+            ]
+        else:
+            aggregation_mlp_channels.insert(0, out_channels)
+
+        self.aggregation_layers = nn.Sequential()
+        for i in range(len(aggregation_mlp_channels) - 1):
+            self.aggregation_layers.add_module(
+                f'layer{i}',
+                ConvModule(
+                    aggregation_mlp_channels[i],
+                    aggregation_mlp_channels[i + 1],
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    bias=True,
+                    inplace=True))
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be setting at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is a deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+
+    def forward(self, points: Tensor) -> dict:
+        """Forward pass.
+
+        Args:
+            points (torch.Tensor): point coordinates with features,
+                with shape (B, N, 3 + input_feature_dim).
+
+        Returns:
+            dict[str, list[torch.Tensor]]: Outputs from multiple backbones.
+
+                - fp_xyz[suffix] (list[torch.Tensor]): The coordinates of
+                  each fp features.
+                - fp_features[suffix] (list[torch.Tensor]): The features
+                  from each Feature Propagate Layers.
+                - fp_indices[suffix] (list[torch.Tensor]): Indices of the
+                  input points.
+                - hd_feature (torch.Tensor): The aggregation feature
+                  from multiple backbones.
+        """
+        ret = {}
+        fp_features = []
+        for ind in range(len(self.backbone_list)):
+            cur_ret = self.backbone_list[ind](points)
+            cur_suffix = self.suffixes[ind]
+            fp_features.append(cur_ret['fp_features'][-1])
+            cur_ret_new = dict()
+            if cur_suffix != '':
+                for k in cur_ret.keys():
+                    cur_ret_new[k + '_' + cur_suffix] = cur_ret[k]
+            ret.update(cur_ret_new)
+
+        # Combine the features here
+        hd_feature = torch.cat(fp_features, dim=1)
+        hd_feature = self.aggregation_layers(hd_feature)
+        ret['hd_feature'] = hd_feature
+        return ret
diff --git a/mmde/mmdet3d/models/backbones/nostem_regnet.py b/mmde/mmdet3d/models/backbones/nostem_regnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..36cf0b5c27ee95a4fd15c7e98a8981d01b4c1f5b
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/nostem_regnet.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch.nn as nn
+from mmdet.models.backbones import RegNet
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import OptMultiConfig
+
+
+@MODELS.register_module()
+class NoStemRegNet(RegNet):
+    """RegNet backbone without Stem for 3D detection.
+
+    More details can be found in `paper <https://arxiv.org/abs/2003.13678>`_ .
+
+    Args:
+        arch (dict): The parameter of RegNets.
+            - w0 (int): Initial width.
+            - wa (float): Slope of width.
+            - wm (float): Quantization parameter to quantize the width.
+            - depth (int): Depth of the backbone.
+            - group_w (int): Width of group.
+            - bot_mul (float): Bottleneck ratio, i.e. expansion of bottleneck.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        base_channels (int): Base channels after stem layer.
+        in_channels (int): Number of input image channels. Normally 3.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+            not freezing any parameters.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+
+    Example:
+        >>> from mmdet3d.models import NoStemRegNet
+        >>> import torch
+        >>> self = NoStemRegNet(
+                arch=dict(
+                    w0=88,
+                    wa=26.31,
+                    wm=2.25,
+                    group_w=48,
+                    depth=25,
+                    bot_mul=1.0))
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 64, 16, 16)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 96, 8, 8)
+        (1, 192, 4, 4)
+        (1, 432, 2, 2)
+        (1, 1008, 1, 1)
+    """
+
+    def __init__(self,
+                 arch: dict,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        super(NoStemRegNet, self).__init__(arch, init_cfg=init_cfg, **kwargs)
+
+    def _make_stem_layer(self, in_channels: int,
+                         base_channels: int) -> nn.Module:
+        """Override the original function that do not initialize a stem layer
+        since 3D detector's voxel encoder works like a stem layer."""
+        return
+
+    def forward(self, x: Tensor) -> Tuple[Tensor, ...]:
+        """Forward function of backbone.
+
+        Args:
+            x (torch.Tensor): Features in shape (N, C, H, W).
+
+        Returns:
+            tuple[torch.Tensor]: Multi-scale features.
+        """
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
diff --git a/mmde/mmdet3d/models/backbones/pointnet2_sa_msg.py b/mmde/mmdet3d/models/backbones/pointnet2_sa_msg.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ca52ba388b8b5f17d0692ebc030fee79fd56708
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/pointnet2_sa_msg.py
@@ -0,0 +1,191 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+from mmcv.cnn import ConvModule
+from torch import Tensor, nn
+
+from mmdet3d.models.layers.pointnet_modules import build_sa_module
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import OptConfigType
+from .base_pointnet import BasePointNet
+
+ThreeTupleIntType = Tuple[Tuple[Tuple[int, int, int]]]
+TwoTupleIntType = Tuple[Tuple[int, int, int]]
+TwoTupleStrType = Tuple[Tuple[str]]
+
+
+@MODELS.register_module()
+class PointNet2SAMSG(BasePointNet):
+    """PointNet2 with Multi-scale grouping.
+
+    Args:
+        in_channels (int): Input channels of point cloud.
+        num_points (tuple[int]): The number of points which each SA
+            module samples.
+        radii (tuple[float]): Sampling radii of each SA module.
+        num_samples (tuple[int]): The number of samples for ball
+            query in each SA module.
+        sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module.
+        aggregation_channels (tuple[int]): Out channels of aggregation
+            multi-scale grouping features.
+        fps_mods Sequence[Tuple[str]]: Mod of FPS for each SA module.
+        fps_sample_range_lists (tuple[tuple[int]]): The number of sampling
+            points which each SA module samples.
+        dilated_group (tuple[bool]): Whether to use dilated ball query for
+        out_indices (Sequence[int]): Output from which stages.
+        norm_cfg (dict): Config of normalization layer.
+        sa_cfg (dict): Config of set abstraction module, which may contain
+            the following keys and values:
+
+            - pool_mod (str): Pool method ('max' or 'avg') for SA modules.
+            - use_xyz (bool): Whether to use xyz as a part of features.
+            - normalize_xyz (bool): Whether to normalize xyz with radii in
+              each SA module.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 num_points: Tuple[int] = (2048, 1024, 512, 256),
+                 radii: Tuple[Tuple[float, float, float]] = (
+                     (0.2, 0.4, 0.8),
+                     (0.4, 0.8, 1.6),
+                     (1.6, 3.2, 4.8),
+                 ),
+                 num_samples: TwoTupleIntType = ((32, 32, 64), (32, 32, 64),
+                                                 (32, 32, 32)),
+                 sa_channels: ThreeTupleIntType = (((16, 16, 32), (16, 16, 32),
+                                                    (32, 32, 64)),
+                                                   ((64, 64, 128),
+                                                    (64, 64, 128), (64, 96,
+                                                                    128)),
+                                                   ((128, 128, 256),
+                                                    (128, 192, 256), (128, 256,
+                                                                      256))),
+                 aggregation_channels: Tuple[int] = (64, 128, 256),
+                 fps_mods: TwoTupleStrType = (('D-FPS'), ('FS'), ('F-FPS',
+                                                                  'D-FPS')),
+                 fps_sample_range_lists: TwoTupleIntType = ((-1), (-1), (512,
+                                                                         -1)),
+                 dilated_group: Tuple[bool] = (True, True, True),
+                 out_indices: Tuple[int] = (2, ),
+                 norm_cfg: dict = dict(type='BN2d'),
+                 sa_cfg: dict = dict(
+                     type='PointSAModuleMSG',
+                     pool_mod='max',
+                     use_xyz=True,
+                     normalize_xyz=False),
+                 init_cfg: OptConfigType = None):
+        super().__init__(init_cfg=init_cfg)
+        self.num_sa = len(sa_channels)
+        self.out_indices = out_indices
+        assert max(out_indices) < self.num_sa
+        assert len(num_points) == len(radii) == len(num_samples) == len(
+            sa_channels)
+        if aggregation_channels is not None:
+            assert len(sa_channels) == len(aggregation_channels)
+        else:
+            aggregation_channels = [None] * len(sa_channels)
+
+        self.SA_modules = nn.ModuleList()
+        self.aggregation_mlps = nn.ModuleList()
+        sa_in_channel = in_channels - 3  # number of channels without xyz
+        skip_channel_list = [sa_in_channel]
+
+        for sa_index in range(self.num_sa):
+            cur_sa_mlps = list(sa_channels[sa_index])
+            sa_out_channel = 0
+            for radius_index in range(len(radii[sa_index])):
+                cur_sa_mlps[radius_index] = [sa_in_channel] + list(
+                    cur_sa_mlps[radius_index])
+                sa_out_channel += cur_sa_mlps[radius_index][-1]
+
+            if isinstance(fps_mods[sa_index], tuple):
+                cur_fps_mod = list(fps_mods[sa_index])
+            else:
+                cur_fps_mod = list([fps_mods[sa_index]])
+
+            if isinstance(fps_sample_range_lists[sa_index], tuple):
+                cur_fps_sample_range_list = list(
+                    fps_sample_range_lists[sa_index])
+            else:
+                cur_fps_sample_range_list = list(
+                    [fps_sample_range_lists[sa_index]])
+
+            self.SA_modules.append(
+                build_sa_module(
+                    num_point=num_points[sa_index],
+                    radii=radii[sa_index],
+                    sample_nums=num_samples[sa_index],
+                    mlp_channels=cur_sa_mlps,
+                    fps_mod=cur_fps_mod,
+                    fps_sample_range_list=cur_fps_sample_range_list,
+                    dilated_group=dilated_group[sa_index],
+                    norm_cfg=norm_cfg,
+                    cfg=sa_cfg,
+                    bias=True))
+            skip_channel_list.append(sa_out_channel)
+
+            cur_aggregation_channel = aggregation_channels[sa_index]
+            if cur_aggregation_channel is None:
+                self.aggregation_mlps.append(None)
+                sa_in_channel = sa_out_channel
+            else:
+                self.aggregation_mlps.append(
+                    ConvModule(
+                        sa_out_channel,
+                        cur_aggregation_channel,
+                        conv_cfg=dict(type='Conv1d'),
+                        norm_cfg=dict(type='BN1d'),
+                        kernel_size=1,
+                        bias=True))
+                sa_in_channel = cur_aggregation_channel
+
+    def forward(self, points: Tensor):
+        """Forward pass.
+
+        Args:
+            points (torch.Tensor): point coordinates with features,
+                with shape (B, N, 3 + input_feature_dim).
+
+        Returns:
+            dict[str, torch.Tensor]: Outputs of the last SA module.
+
+                - sa_xyz (torch.Tensor): The coordinates of sa features.
+                - sa_features (torch.Tensor): The features from the
+                    last Set Aggregation Layers.
+                - sa_indices (torch.Tensor): Indices of the
+                    input points.
+        """
+        xyz, features = self._split_point_feats(points)
+
+        batch, num_points = xyz.shape[:2]
+        indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat(
+            batch, 1).long()
+
+        sa_xyz = [xyz]
+        sa_features = [features]
+        sa_indices = [indices]
+
+        out_sa_xyz = [xyz]
+        out_sa_features = [features]
+        out_sa_indices = [indices]
+
+        for i in range(self.num_sa):
+            cur_xyz, cur_features, cur_indices = self.SA_modules[i](
+                sa_xyz[i], sa_features[i])
+            if self.aggregation_mlps[i] is not None:
+                cur_features = self.aggregation_mlps[i](cur_features)
+            sa_xyz.append(cur_xyz)
+            sa_features.append(cur_features)
+            sa_indices.append(
+                torch.gather(sa_indices[-1], 1, cur_indices.long()))
+            if i in self.out_indices:
+                out_sa_xyz.append(sa_xyz[-1])
+                out_sa_features.append(sa_features[-1])
+                out_sa_indices.append(sa_indices[-1])
+
+        return dict(
+            sa_xyz=out_sa_xyz,
+            sa_features=out_sa_features,
+            sa_indices=out_sa_indices)
diff --git a/mmde/mmdet3d/models/backbones/pointnet2_sa_ssg.py b/mmde/mmdet3d/models/backbones/pointnet2_sa_ssg.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b278fbb9c2edb8a638977966ff93b1b391d5de5
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/pointnet2_sa_ssg.py
@@ -0,0 +1,147 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Sequence
+
+import torch
+from torch import Tensor, nn
+
+from mmdet3d.models.layers import PointFPModule, build_sa_module
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptMultiConfig
+from .base_pointnet import BasePointNet
+
+
+@MODELS.register_module()
+class PointNet2SASSG(BasePointNet):
+    """PointNet2 with Single-scale grouping.
+
+    Args:
+        in_channels (int): Input channels of point cloud.
+        num_points (tuple[int]): The number of points which each SA
+            module samples.
+        radius (tuple[float]): Sampling radii of each SA module.
+        num_samples (tuple[int]): The number of samples for ball
+            query in each SA module.
+        sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module.
+        fp_channels (tuple[tuple[int]]): Out channels of each mlp in FP module.
+        norm_cfg (dict): Config of normalization layer.
+        sa_cfg (dict): Config of set abstraction module, which may contain
+            the following keys and values:
+
+            - pool_mod (str): Pool method ('max' or 'avg') for SA modules.
+            - use_xyz (bool): Whether to use xyz as a part of features.
+            - normalize_xyz (bool): Whether to normalize xyz with radii in
+              each SA module.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 num_points: Sequence[int] = (2048, 1024, 512, 256),
+                 radius: Sequence[float] = (0.2, 0.4, 0.8, 1.2),
+                 num_samples: Sequence[int] = (64, 32, 16, 16),
+                 sa_channels: Sequence[Sequence[int]] = ((64, 64, 128),
+                                                         (128, 128, 256),
+                                                         (128, 128, 256),
+                                                         (128, 128, 256)),
+                 fp_channels: Sequence[Sequence[int]] = ((256, 256), (256,
+                                                                      256)),
+                 norm_cfg: ConfigType = dict(type='BN2d'),
+                 sa_cfg: ConfigType = dict(
+                     type='PointSAModule',
+                     pool_mod='max',
+                     use_xyz=True,
+                     normalize_xyz=True),
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(init_cfg=init_cfg)
+        self.num_sa = len(sa_channels)
+        self.num_fp = len(fp_channels)
+
+        assert len(num_points) == len(radius) == len(num_samples) == len(
+            sa_channels)
+        assert len(sa_channels) >= len(fp_channels)
+
+        self.SA_modules = nn.ModuleList()
+        sa_in_channel = in_channels - 3  # number of channels without xyz
+        skip_channel_list = [sa_in_channel]
+
+        for sa_index in range(self.num_sa):
+            cur_sa_mlps = list(sa_channels[sa_index])
+            cur_sa_mlps = [sa_in_channel] + cur_sa_mlps
+            sa_out_channel = cur_sa_mlps[-1]
+
+            self.SA_modules.append(
+                build_sa_module(
+                    num_point=num_points[sa_index],
+                    radius=radius[sa_index],
+                    num_sample=num_samples[sa_index],
+                    mlp_channels=cur_sa_mlps,
+                    norm_cfg=norm_cfg,
+                    cfg=sa_cfg))
+            skip_channel_list.append(sa_out_channel)
+            sa_in_channel = sa_out_channel
+
+        self.FP_modules = nn.ModuleList()
+
+        fp_source_channel = skip_channel_list.pop()
+        fp_target_channel = skip_channel_list.pop()
+        for fp_index in range(len(fp_channels)):
+            cur_fp_mlps = list(fp_channels[fp_index])
+            cur_fp_mlps = [fp_source_channel + fp_target_channel] + cur_fp_mlps
+            self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps))
+            if fp_index != len(fp_channels) - 1:
+                fp_source_channel = cur_fp_mlps[-1]
+                fp_target_channel = skip_channel_list.pop()
+
+    def forward(self, points: Tensor) -> Dict[str, List[Tensor]]:
+        """Forward pass.
+
+        Args:
+            points (torch.Tensor): point coordinates with features,
+                with shape (B, N, 3 + input_feature_dim).
+
+        Returns:
+            dict[str, list[torch.Tensor]]: Outputs after SA and FP modules.
+
+                - fp_xyz (list[torch.Tensor]): The coordinates of
+                    each fp features.
+                - fp_features (list[torch.Tensor]): The features
+                    from each Feature Propagate Layers.
+                - fp_indices (list[torch.Tensor]): Indices of the
+                    input points.
+        """
+        xyz, features = self._split_point_feats(points)
+
+        batch, num_points = xyz.shape[:2]
+        indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat(
+            batch, 1).long()
+
+        sa_xyz = [xyz]
+        sa_features = [features]
+        sa_indices = [indices]
+
+        for i in range(self.num_sa):
+            cur_xyz, cur_features, cur_indices = self.SA_modules[i](
+                sa_xyz[i], sa_features[i])
+            sa_xyz.append(cur_xyz)
+            sa_features.append(cur_features)
+            sa_indices.append(
+                torch.gather(sa_indices[-1], 1, cur_indices.long()))
+
+        fp_xyz = [sa_xyz[-1]]
+        fp_features = [sa_features[-1]]
+        fp_indices = [sa_indices[-1]]
+
+        for i in range(self.num_fp):
+            fp_features.append(self.FP_modules[i](
+                sa_xyz[self.num_sa - i - 1], sa_xyz[self.num_sa - i],
+                sa_features[self.num_sa - i - 1], fp_features[-1]))
+            fp_xyz.append(sa_xyz[self.num_sa - i - 1])
+            fp_indices.append(sa_indices[self.num_sa - i - 1])
+
+        ret = dict(
+            fp_xyz=fp_xyz,
+            fp_features=fp_features,
+            fp_indices=fp_indices,
+            sa_xyz=sa_xyz,
+            sa_features=sa_features,
+            sa_indices=sa_indices)
+        return ret
diff --git a/mmde/mmdet3d/models/backbones/second.py b/mmde/mmdet3d/models/backbones/second.py
new file mode 100644
index 0000000000000000000000000000000000000000..83e8e3b4b5b0b4fc40b8098b186b74fa9df73a59
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/second.py
@@ -0,0 +1,95 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Optional, Sequence, Tuple
+
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class SECOND(BaseModule):
+    """Backbone network for SECOND/PointPillars/PartA2/MVXNet.
+
+    Args:
+        in_channels (int): Input channels.
+        out_channels (list[int]): Output channels for multi-scale feature maps.
+        layer_nums (list[int]): Number of layers in each stage.
+        layer_strides (list[int]): Strides of each stage.
+        norm_cfg (dict): Config dict of normalization layers.
+        conv_cfg (dict): Config dict of convolutional layers.
+    """
+
+    def __init__(self,
+                 in_channels: int = 128,
+                 out_channels: Sequence[int] = [128, 128, 256],
+                 layer_nums: Sequence[int] = [3, 5, 5],
+                 layer_strides: Sequence[int] = [2, 2, 2],
+                 norm_cfg: ConfigType = dict(
+                     type='BN', eps=1e-3, momentum=0.01),
+                 conv_cfg: ConfigType = dict(type='Conv2d', bias=False),
+                 init_cfg: OptMultiConfig = None,
+                 pretrained: Optional[str] = None) -> None:
+        super(SECOND, self).__init__(init_cfg=init_cfg)
+        assert len(layer_strides) == len(layer_nums)
+        assert len(out_channels) == len(layer_nums)
+
+        in_filters = [in_channels, *out_channels[:-1]]
+        # note that when stride > 1, conv2d with same padding isn't
+        # equal to pad-conv2d. we should use pad-conv2d.
+        blocks = []
+        for i, layer_num in enumerate(layer_nums):
+            block = [
+                build_conv_layer(
+                    conv_cfg,
+                    in_filters[i],
+                    out_channels[i],
+                    3,
+                    stride=layer_strides[i],
+                    padding=1),
+                build_norm_layer(norm_cfg, out_channels[i])[1],
+                nn.ReLU(inplace=True),
+            ]
+            for j in range(layer_num):
+                block.append(
+                    build_conv_layer(
+                        conv_cfg,
+                        out_channels[i],
+                        out_channels[i],
+                        3,
+                        padding=1))
+                block.append(build_norm_layer(norm_cfg, out_channels[i])[1])
+                block.append(nn.ReLU(inplace=True))
+
+            block = nn.Sequential(*block)
+            blocks.append(block)
+
+        self.blocks = nn.ModuleList(blocks)
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be setting at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is a deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        else:
+            self.init_cfg = dict(type='Kaiming', layer='Conv2d')
+
+    def forward(self, x: Tensor) -> Tuple[Tensor, ...]:
+        """Forward function.
+
+        Args:
+            x (torch.Tensor): Input with shape (N, C, H, W).
+
+        Returns:
+            tuple[torch.Tensor]: Multi-scale features.
+        """
+        outs = []
+        for i in range(len(self.blocks)):
+            x = self.blocks[i](x)
+            outs.append(x)
+        return tuple(outs)
diff --git a/mmde/mmdet3d/models/backbones/spvcnn_backone.py b/mmde/mmdet3d/models/backbones/spvcnn_backone.py
new file mode 100644
index 0000000000000000000000000000000000000000..535d343b861d62426b9f705a8c56606bf87ebab9
--- /dev/null
+++ b/mmde/mmdet3d/models/backbones/spvcnn_backone.py
@@ -0,0 +1,297 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence
+
+import torch
+from mmengine.registry import MODELS
+from torch import Tensor, nn
+
+from mmdet3d.models.layers.torchsparse import IS_TORCHSPARSE_AVAILABLE
+from .minkunet_backbone import MinkUNetBackbone
+
+if IS_TORCHSPARSE_AVAILABLE:
+    import torchsparse
+    import torchsparse.nn.functional as F
+    from torchsparse.nn.utils import get_kernel_offsets
+    from torchsparse.tensor import PointTensor, SparseTensor
+else:
+    PointTensor = SparseTensor = None
+
+
+@MODELS.register_module()
+class SPVCNNBackbone(MinkUNetBackbone):
+    """SPVCNN backbone with torchsparse backend.
+
+    More details can be found in `paper <https://arxiv.org/abs/2007.16100>`_ .
+
+    Args:
+        in_channels (int): Number of input voxel feature channels.
+            Defaults to 4.
+        base_channels (int): The input channels for first encoder layer.
+            Defaults to 32.
+        num_stages (int): Number of stages in encoder and decoder.
+            Defaults to 4.
+        encoder_channels (List[int]): Convolutional channels of each encode
+            layer. Defaults to [32, 64, 128, 256].
+        decoder_channels (List[int]): Convolutional channels of each decode
+            layer. Defaults to [256, 128, 96, 96].
+        drop_ratio (float): Dropout ratio of voxel features. Defaults to 0.3.
+        sparseconv_backend (str): Sparse convolution backend.
+        init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`]
+            , optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int = 4,
+                 base_channels: int = 32,
+                 num_stages: int = 4,
+                 encoder_channels: Sequence[int] = [32, 64, 128, 256],
+                 decoder_channels: Sequence[int] = [256, 128, 96, 96],
+                 drop_ratio: float = 0.3,
+                 sparseconv_backend: str = 'torchsparse',
+                 **kwargs) -> None:
+        assert num_stages == 4, 'SPVCNN backbone only supports 4 stages.'
+        assert sparseconv_backend == 'torchsparse', \
+            f'SPVCNN backbone only supports torchsparse backend, but got ' \
+            f'sparseconv backend: {sparseconv_backend}.'
+        super().__init__(
+            in_channels=in_channels,
+            base_channels=base_channels,
+            num_stages=num_stages,
+            encoder_channels=encoder_channels,
+            decoder_channels=decoder_channels,
+            sparseconv_backend=sparseconv_backend,
+            **kwargs)
+
+        self.point_transforms = nn.ModuleList([
+            nn.Sequential(
+                nn.Linear(base_channels, encoder_channels[-1]),
+                nn.BatchNorm1d(encoder_channels[-1]), nn.ReLU(True)),
+            nn.Sequential(
+                nn.Linear(encoder_channels[-1], decoder_channels[2]),
+                nn.BatchNorm1d(decoder_channels[2]), nn.ReLU(True)),
+            nn.Sequential(
+                nn.Linear(decoder_channels[2], decoder_channels[4]),
+                nn.BatchNorm1d(decoder_channels[4]), nn.ReLU(True))
+        ])
+        self.dropout = nn.Dropout(drop_ratio, True)
+
+    def forward(self, voxel_features: Tensor, coors: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            voxel_features (Tensor): Voxel features in shape (N, C).
+            coors (Tensor): Coordinates in shape (N, 4),
+                the columns in the order of (x_idx, y_idx, z_idx, batch_idx).
+
+        Returns:
+            PointTensor: Backbone features.
+        """
+        voxels = SparseTensor(voxel_features, coors)
+        points = PointTensor(voxels.F, voxels.C.float())
+        voxels = initial_voxelize(points)
+
+        voxels = self.conv_input(voxels)
+        points = voxel_to_point(voxels, points)
+        voxels = point_to_voxel(voxels, points)
+        laterals = [voxels]
+        for encoder in self.encoder:
+            voxels = encoder(voxels)
+            laterals.append(voxels)
+        laterals = laterals[:-1][::-1]
+
+        points = voxel_to_point(voxels, points, self.point_transforms[0])
+        voxels = point_to_voxel(voxels, points)
+        voxels.F = self.dropout(voxels.F)
+
+        decoder_outs = []
+        for i, decoder in enumerate(self.decoder):
+            voxels = decoder[0](voxels)
+            voxels = torchsparse.cat((voxels, laterals[i]))
+            voxels = decoder[1](voxels)
+            decoder_outs.append(voxels)
+            if i == 1:
+                points = voxel_to_point(voxels, points,
+                                        self.point_transforms[1])
+                voxels = point_to_voxel(voxels, points)
+                voxels.F = self.dropout(voxels.F)
+
+        points = voxel_to_point(voxels, points, self.point_transforms[2])
+        return points.F
+
+
+@MODELS.register_module()
+class MinkUNetBackboneV2(MinkUNetBackbone):
+    r"""MinkUNet backbone V2.
+
+    refer to https://github.com/PJLab-ADG/PCSeg/blob/master/pcseg/model/segmentor/voxel/minkunet/minkunet.py
+
+    Args:
+        sparseconv_backend (str): Sparse convolution backend.
+    """  # noqa: E501
+
+    def __init__(self,
+                 sparseconv_backend: str = 'torchsparse',
+                 **kwargs) -> None:
+        assert sparseconv_backend == 'torchsparse', \
+            f'SPVCNN backbone only supports torchsparse backend, but got ' \
+            f'sparseconv backend: {sparseconv_backend}.'
+        super().__init__(sparseconv_backend=sparseconv_backend, **kwargs)
+
+    def forward(self, voxel_features: Tensor, coors: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            voxel_features (Tensor): Voxel features in shape (N, C).
+            coors (Tensor): Coordinates in shape (N, 4),
+                the columns in the order of (x_idx, y_idx, z_idx, batch_idx).
+
+        Returns:
+            SparseTensor: Backbone features.
+        """
+        voxels = SparseTensor(voxel_features, coors)
+        points = PointTensor(voxels.F, voxels.C.float())
+
+        voxels = initial_voxelize(points)
+        voxels = self.conv_input(voxels)
+        points = voxel_to_point(voxels, points)
+
+        laterals = [voxels]
+        for encoder_layer in self.encoder:
+            voxels = encoder_layer(voxels)
+            laterals.append(voxels)
+        laterals = laterals[:-1][::-1]
+        points = voxel_to_point(voxels, points)
+        output_features = [points.F]
+
+        for i, decoder_layer in enumerate(self.decoder):
+            voxels = decoder_layer[0](voxels)
+            voxels = torchsparse.cat((voxels, laterals[i]))
+            voxels = decoder_layer[1](voxels)
+            if i % 2 == 1:
+                points = voxel_to_point(voxels, points)
+                output_features.append(points.F)
+
+        points.F = torch.cat(output_features, dim=1)
+        return points.F
+
+
+def initial_voxelize(points: PointTensor) -> SparseTensor:
+    """Voxelization again based on input PointTensor.
+
+    Args:
+        points (PointTensor): Input points after voxelization.
+
+    Returns:
+        SparseTensor: New voxels.
+    """
+    pc_hash = F.sphash(torch.floor(points.C).int())
+    sparse_hash = torch.unique(pc_hash)
+    idx_query = F.sphashquery(pc_hash, sparse_hash)
+    counts = F.spcount(idx_query.int(), len(sparse_hash))
+
+    inserted_coords = F.spvoxelize(torch.floor(points.C), idx_query, counts)
+    inserted_coords = torch.round(inserted_coords).int()
+    inserted_feat = F.spvoxelize(points.F, idx_query, counts)
+
+    new_tensor = SparseTensor(inserted_feat, inserted_coords, 1)
+    new_tensor.cmaps.setdefault(new_tensor.stride, new_tensor.coords)
+    points.additional_features['idx_query'][1] = idx_query
+    points.additional_features['counts'][1] = counts
+    return new_tensor
+
+
+def voxel_to_point(voxels: SparseTensor,
+                   points: PointTensor,
+                   point_transform: Optional[nn.Module] = None,
+                   nearest: bool = False) -> PointTensor:
+    """Feed voxel features to points.
+
+    Args:
+        voxels (SparseTensor): Input voxels.
+        points (PointTensor): Input points.
+        point_transform (nn.Module, optional): Point transform module
+            for input point features. Defaults to None.
+        nearest (bool): Whether to use nearest neighbor interpolation.
+            Defaults to False.
+
+    Returns:
+        PointTensor: Points with new features.
+    """
+    if points.idx_query is None or points.weights is None or \
+            points.idx_query.get(voxels.s) is None or \
+            points.weights.get(voxels.s) is None:
+        offsets = get_kernel_offsets(2, voxels.s, 1, device=points.F.device)
+        old_hash = F.sphash(
+            torch.cat([
+                torch.floor(points.C[:, :3] / voxels.s[0]).int() * voxels.s[0],
+                points.C[:, -1].int().view(-1, 1)
+            ], 1), offsets)
+        pc_hash = F.sphash(voxels.C.to(points.F.device))
+        idx_query = F.sphashquery(old_hash, pc_hash)
+        weights = F.calc_ti_weights(
+            points.C, idx_query, scale=voxels.s[0]).transpose(0,
+                                                              1).contiguous()
+        idx_query = idx_query.transpose(0, 1).contiguous()
+        if nearest:
+            weights[:, 1:] = 0.
+            idx_query[:, 1:] = -1
+        new_features = F.spdevoxelize(voxels.F, idx_query, weights)
+        new_tensor = PointTensor(
+            new_features,
+            points.C,
+            idx_query=points.idx_query,
+            weights=points.weights)
+        new_tensor.additional_features = points.additional_features
+        new_tensor.idx_query[voxels.s] = idx_query
+        new_tensor.weights[voxels.s] = weights
+        points.idx_query[voxels.s] = idx_query
+        points.weights[voxels.s] = weights
+    else:
+        new_features = F.spdevoxelize(voxels.F, points.idx_query.get(voxels.s),
+                                      points.weights.get(voxels.s))
+        new_tensor = PointTensor(
+            new_features,
+            points.C,
+            idx_query=points.idx_query,
+            weights=points.weights)
+        new_tensor.additional_features = points.additional_features
+
+    if point_transform is not None:
+        new_tensor.F = new_tensor.F + point_transform(points.F)
+
+    return new_tensor
+
+
+def point_to_voxel(voxels: SparseTensor, points: PointTensor) -> SparseTensor:
+    """Feed point features to voxels.
+
+    Args:
+        voxels (SparseTensor): Input voxels.
+        points (PointTensor): Input points.
+
+    Returns:
+        SparseTensor: Voxels with new features.
+    """
+    if points.additional_features is None or \
+            points.additional_features.get('idx_query') is None or \
+            points.additional_features['idx_query'].get(voxels.s) is None:
+        pc_hash = F.sphash(
+            torch.cat([
+                torch.floor(points.C[:, :3] / voxels.s[0]).int() * voxels.s[0],
+                points.C[:, -1].int().view(-1, 1)
+            ], 1))
+        sparse_hash = F.sphash(voxels.C)
+        idx_query = F.sphashquery(pc_hash, sparse_hash)
+        counts = F.spcount(idx_query.int(), voxels.C.shape[0])
+        points.additional_features['idx_query'][voxels.s] = idx_query
+        points.additional_features['counts'][voxels.s] = counts
+    else:
+        idx_query = points.additional_features['idx_query'][voxels.s]
+        counts = points.additional_features['counts'][voxels.s]
+
+    inserted_features = F.spvoxelize(points.F, idx_query, counts)
+    new_tensor = SparseTensor(inserted_features, voxels.C, voxels.s)
+    new_tensor.cmaps = voxels.cmaps
+    new_tensor.kmaps = voxels.kmaps
+
+    return new_tensor
diff --git a/mmde/mmdet3d/models/data_preprocessors/__init__.py b/mmde/mmdet3d/models/data_preprocessors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bf69f2b7483cdd00e56cf39a0b9b4ac728ab31f
--- /dev/null
+++ b/mmde/mmdet3d/models/data_preprocessors/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .data_preprocessor import Det3DDataPreprocessor
+
+__all__ = ['Det3DDataPreprocessor']
diff --git a/mmde/mmdet3d/models/data_preprocessors/data_preprocessor.py b/mmde/mmdet3d/models/data_preprocessors/data_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..05c8452a0ccff2d4c3bbc78e11e996ee85fb34c9
--- /dev/null
+++ b/mmde/mmdet3d/models/data_preprocessors/data_preprocessor.py
@@ -0,0 +1,542 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from numbers import Number
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+from mmdet.models import DetDataPreprocessor
+from mmdet.models.utils.misc import samplelist_boxtype2tensor
+from mmengine.model import stack_batch
+from mmengine.utils import is_seq_of
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import OptConfigType
+from .utils import multiview_img_stack_batch
+from .voxelize import VoxelizationByGridShape, dynamic_scatter_3d
+
+
+@MODELS.register_module()
+class Det3DDataPreprocessor(DetDataPreprocessor):
+    """Points / Image pre-processor for point clouds / vision-only / multi-
+    modality 3D detection tasks.
+
+    It provides the data pre-processing as follows
+
+    - Collate and move image and point cloud data to the target device.
+
+    - 1) For image data:
+
+      - Pad images in inputs to the maximum size of current batch with defined
+        ``pad_value``. The padding size can be divisible by a defined
+        ``pad_size_divisor``.
+      - Stack images in inputs to batch_imgs.
+      - Convert images in inputs from bgr to rgb if the shape of input is
+        (3, H, W).
+      - Normalize images in inputs with defined std and mean.
+      - Do batch augmentations during training.
+
+    - 2) For point cloud data:
+
+      - If no voxelization, directly return list of point cloud data.
+      - If voxelization is applied, voxelize point cloud according to
+        ``voxel_type`` and obtain ``voxels``.
+
+    Args:
+        voxel (bool): Whether to apply voxelization to point cloud.
+            Defaults to False.
+        voxel_type (str): Voxelization type. Two voxelization types are
+            provided: 'hard' and 'dynamic', respectively for hard voxelization
+            and dynamic voxelization. Defaults to 'hard'.
+        voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization layer
+            config. Defaults to None.
+        batch_first (bool): Whether to put the batch dimension to the first
+            dimension when getting voxel coordinates. Defaults to True.
+        max_voxels (int, optional): Maximum number of voxels in each voxel
+            grid. Defaults to None.
+        mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
+            Defaults to None.
+        std (Sequence[Number], optional): The pixel standard deviation of
+            R, G, B channels. Defaults to None.
+        pad_size_divisor (int): The size of padded image should be divisible by
+            ``pad_size_divisor``. Defaults to 1.
+        pad_value (float or int): The padded pixel value. Defaults to 0.
+        pad_mask (bool): Whether to pad instance masks. Defaults to False.
+        mask_pad_value (int): The padded pixel value for instance masks.
+            Defaults to 0.
+        pad_seg (bool): Whether to pad semantic segmentation maps.
+            Defaults to False.
+        seg_pad_value (int): The padded pixel value for semantic segmentation
+            maps. Defaults to 255.
+        bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
+            Defaults to False.
+        rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
+            Defaults to False.
+        boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
+            bboxes data to ``Tensor`` type. Defaults to True.
+        non_blocking (bool): Whether to block current process when transferring
+            data to device. Defaults to False.
+        batch_augments (List[dict], optional): Batch-level augmentations.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 voxel: bool = False,
+                 voxel_type: str = 'hard',
+                 voxel_layer: OptConfigType = None,
+                 batch_first: bool = True,
+                 max_voxels: Optional[int] = None,
+                 mean: Sequence[Number] = None,
+                 std: Sequence[Number] = None,
+                 pad_size_divisor: int = 1,
+                 pad_value: Union[float, int] = 0,
+                 pad_mask: bool = False,
+                 mask_pad_value: int = 0,
+                 pad_seg: bool = False,
+                 seg_pad_value: int = 255,
+                 bgr_to_rgb: bool = False,
+                 rgb_to_bgr: bool = False,
+                 boxtype2tensor: bool = True,
+                 non_blocking: bool = False,
+                 batch_augments: Optional[List[dict]] = None) -> None:
+        super(Det3DDataPreprocessor, self).__init__(
+            mean=mean,
+            std=std,
+            pad_size_divisor=pad_size_divisor,
+            pad_value=pad_value,
+            pad_mask=pad_mask,
+            mask_pad_value=mask_pad_value,
+            pad_seg=pad_seg,
+            seg_pad_value=seg_pad_value,
+            bgr_to_rgb=bgr_to_rgb,
+            rgb_to_bgr=rgb_to_bgr,
+            boxtype2tensor=boxtype2tensor,
+            non_blocking=non_blocking,
+            batch_augments=batch_augments)
+        self.voxel = voxel
+        self.voxel_type = voxel_type
+        self.batch_first = batch_first
+        self.max_voxels = max_voxels
+        if voxel:
+            self.voxel_layer = VoxelizationByGridShape(**voxel_layer)
+
+    def forward(self,
+                data: Union[dict, List[dict]],
+                training: bool = False) -> Union[dict, List[dict]]:
+        """Perform normalization, padding and bgr2rgb conversion based on
+        ``BaseDataPreprocessor``.
+
+        Args:
+            data (dict or List[dict]): Data from dataloader. The dict contains
+                the whole batch data, when it is a list[dict], the list
+                indicates test time augmentation.
+            training (bool): Whether to enable training time augmentation.
+                Defaults to False.
+
+        Returns:
+            dict or List[dict]: Data in the same format as the model input.
+        """
+        if isinstance(data, list):
+            num_augs = len(data)
+            aug_batch_data = []
+            for aug_id in range(num_augs):
+                single_aug_batch_data = self.simple_process(
+                    data[aug_id], training)
+                aug_batch_data.append(single_aug_batch_data)
+            return aug_batch_data
+
+        else:
+            return self.simple_process(data, training)
+
+    def simple_process(self, data: dict, training: bool = False) -> dict:
+        """Perform normalization, padding and bgr2rgb conversion for img data
+        based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel`
+        is set to be True.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+                Defaults to False.
+
+        Returns:
+            dict: Data in the same format as the model input.
+        """
+        if 'img' in data['inputs']:
+            batch_pad_shape = self._get_pad_shape(data)
+
+        data = self.collate_data(data)
+        inputs, data_samples = data['inputs'], data['data_samples']
+        batch_inputs = dict()
+
+        if 'points' in inputs:
+            batch_inputs['points'] = inputs['points']
+
+            if self.voxel:
+                voxel_dict = self.voxelize(inputs['points'], data_samples)
+                batch_inputs['voxels'] = voxel_dict
+
+        if 'imgs' in inputs:
+            imgs = inputs['imgs']
+
+            if data_samples is not None:
+                # NOTE the batched image size information may be useful, e.g.
+                # in DETR, this is needed for the construction of masks, which
+                # is then used for the transformer_head.
+                batch_input_shape = tuple(imgs[0].size()[-2:])
+                for data_sample, pad_shape in zip(data_samples,
+                                                  batch_pad_shape):
+                    data_sample.set_metainfo({
+                        'batch_input_shape': batch_input_shape,
+                        'pad_shape': pad_shape
+                    })
+
+                if self.boxtype2tensor:
+                    samplelist_boxtype2tensor(data_samples)
+                if self.pad_mask:
+                    self.pad_gt_masks(data_samples)
+                if self.pad_seg:
+                    self.pad_gt_sem_seg(data_samples)
+
+            if training and self.batch_augments is not None:
+                for batch_aug in self.batch_augments:
+                    imgs, data_samples = batch_aug(imgs, data_samples)
+            batch_inputs['imgs'] = imgs
+
+        return {'inputs': batch_inputs, 'data_samples': data_samples}
+
+    def preprocess_img(self, _batch_img: Tensor) -> Tensor:
+        # channel transform
+        if self._channel_conversion:
+            _batch_img = _batch_img[[2, 1, 0], ...]
+        # Convert to float after channel conversion to ensure
+        # efficiency
+        _batch_img = _batch_img.float()
+        # Normalization.
+        if self._enable_normalize:
+            if self.mean.shape[0] == 3:
+                assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
+                    'If the mean has 3 values, the input tensor '
+                    'should in shape of (3, H, W), but got the '
+                    f'tensor with shape {_batch_img.shape}')
+            _batch_img = (_batch_img - self.mean) / self.std
+        return _batch_img
+
+    def collate_data(self, data: dict) -> dict:
+        """Copy data to the target device and perform normalization, padding
+        and bgr2rgb conversion and stack based on ``BaseDataPreprocessor``.
+
+        Collates the data sampled from dataloader into a list of dict and list
+        of labels, and then copies tensor to the target device.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+
+        Returns:
+            dict: Data in the same format as the model input.
+        """
+        data = self.cast_data(data)  # type: ignore
+
+        if 'img' in data['inputs']:
+            _batch_imgs = data['inputs']['img']
+            # Process data with `pseudo_collate`.
+            if is_seq_of(_batch_imgs, torch.Tensor):
+                batch_imgs = []
+                img_dim = _batch_imgs[0].dim()
+                for _batch_img in _batch_imgs:
+                    if img_dim == 3:  # standard img
+                        _batch_img = self.preprocess_img(_batch_img)
+                    elif img_dim == 4:
+                        _batch_img = [
+                            self.preprocess_img(_img) for _img in _batch_img
+                        ]
+
+                        _batch_img = torch.stack(_batch_img, dim=0)
+
+                    batch_imgs.append(_batch_img)
+
+                # Pad and stack Tensor.
+                if img_dim == 3:
+                    batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
+                                             self.pad_value)
+                elif img_dim == 4:
+                    batch_imgs = multiview_img_stack_batch(
+                        batch_imgs, self.pad_size_divisor, self.pad_value)
+
+            # Process data with `default_collate`.
+            elif isinstance(_batch_imgs, torch.Tensor):
+                assert _batch_imgs.dim() == 4, (
+                    'The input of `ImgDataPreprocessor` should be a NCHW '
+                    'tensor or a list of tensor, but got a tensor with '
+                    f'shape: {_batch_imgs.shape}')
+                if self._channel_conversion:
+                    _batch_imgs = _batch_imgs[:, [2, 1, 0], ...]
+                # Convert to float after channel conversion to ensure
+                # efficiency
+                _batch_imgs = _batch_imgs.float()
+                if self._enable_normalize:
+                    _batch_imgs = (_batch_imgs - self.mean) / self.std
+                h, w = _batch_imgs.shape[2:]
+                target_h = math.ceil(
+                    h / self.pad_size_divisor) * self.pad_size_divisor
+                target_w = math.ceil(
+                    w / self.pad_size_divisor) * self.pad_size_divisor
+                pad_h = target_h - h
+                pad_w = target_w - w
+                batch_imgs = F.pad(_batch_imgs, (0, pad_w, 0, pad_h),
+                                   'constant', self.pad_value)
+            else:
+                raise TypeError(
+                    'Output of `cast_data` should be a list of dict '
+                    'or a tuple with inputs and data_samples, but got '
+                    f'{type(data)}: {data}')
+
+            data['inputs']['imgs'] = batch_imgs
+
+        data.setdefault('data_samples', None)
+
+        return data
+
+    def _get_pad_shape(self, data: dict) -> List[Tuple[int, int]]:
+        """Get the pad_shape of each image based on data and
+        pad_size_divisor."""
+        # rewrite `_get_pad_shape` for obtaining image inputs.
+        _batch_inputs = data['inputs']['img']
+        # Process data with `pseudo_collate`.
+        if is_seq_of(_batch_inputs, torch.Tensor):
+            batch_pad_shape = []
+            for ori_input in _batch_inputs:
+                if ori_input.dim() == 4:
+                    # mean multiview input, select one of the
+                    # image to calculate the pad shape
+                    ori_input = ori_input[0]
+                pad_h = int(
+                    np.ceil(ori_input.shape[1] /
+                            self.pad_size_divisor)) * self.pad_size_divisor
+                pad_w = int(
+                    np.ceil(ori_input.shape[2] /
+                            self.pad_size_divisor)) * self.pad_size_divisor
+                batch_pad_shape.append((pad_h, pad_w))
+        # Process data with `default_collate`.
+        elif isinstance(_batch_inputs, torch.Tensor):
+            assert _batch_inputs.dim() == 4, (
+                'The input of `ImgDataPreprocessor` should be a NCHW tensor '
+                'or a list of tensor, but got a tensor with shape: '
+                f'{_batch_inputs.shape}')
+            pad_h = int(
+                np.ceil(_batch_inputs.shape[1] /
+                        self.pad_size_divisor)) * self.pad_size_divisor
+            pad_w = int(
+                np.ceil(_batch_inputs.shape[2] /
+                        self.pad_size_divisor)) * self.pad_size_divisor
+            batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0]
+        else:
+            raise TypeError('Output of `cast_data` should be a list of dict '
+                            'or a tuple with inputs and data_samples, but got '
+                            f'{type(data)}: {data}')
+        return batch_pad_shape
+
+    @torch.no_grad()
+    def voxelize(self, points: List[Tensor],
+                 data_samples: SampleList) -> Dict[str, Tensor]:
+        """Apply voxelization to point cloud.
+
+        Args:
+            points (List[Tensor]): Point cloud in one data batch.
+            data_samples: (list[:obj:`Det3DDataSample`]): The annotation data
+                of every samples. Add voxel-wise annotation for segmentation.
+
+        Returns:
+            Dict[str, Tensor]: Voxelization information.
+
+            - voxels (Tensor): Features of voxels, shape is MxNxC for hard
+              voxelization, NxC for dynamic voxelization.
+            - coors (Tensor): Coordinates of voxels, shape is Nx(1+NDim),
+              where 1 represents the batch index.
+            - num_points (Tensor, optional): Number of points in each voxel.
+            - voxel_centers (Tensor, optional): Centers of voxels.
+        """
+
+        voxel_dict = dict()
+
+        if self.voxel_type == 'hard':
+            voxels, coors, num_points, voxel_centers = [], [], [], []
+            for i, res in enumerate(points):
+                res_voxels, res_coors, res_num_points = self.voxel_layer(res)
+                res_voxel_centers = (
+                    res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor(
+                        self.voxel_layer.voxel_size) + res_voxels.new_tensor(
+                            self.voxel_layer.point_cloud_range[0:3])
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
+                voxels.append(res_voxels)
+                coors.append(res_coors)
+                num_points.append(res_num_points)
+                voxel_centers.append(res_voxel_centers)
+
+            voxels = torch.cat(voxels, dim=0)
+            coors = torch.cat(coors, dim=0)
+            num_points = torch.cat(num_points, dim=0)
+            voxel_centers = torch.cat(voxel_centers, dim=0)
+
+            voxel_dict['num_points'] = num_points
+            voxel_dict['voxel_centers'] = voxel_centers
+        elif self.voxel_type == 'dynamic':
+            coors = []
+            # dynamic voxelization only provide a coors mapping
+            for i, res in enumerate(points):
+                res_coors = self.voxel_layer(res)
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
+                coors.append(res_coors)
+            voxels = torch.cat(points, dim=0)
+            coors = torch.cat(coors, dim=0)
+        elif self.voxel_type == 'cylindrical':
+            voxels, coors = [], []
+            for i, (res, data_sample) in enumerate(zip(points, data_samples)):
+                rho = torch.sqrt(res[:, 0]**2 + res[:, 1]**2)
+                phi = torch.atan2(res[:, 1], res[:, 0])
+                polar_res = torch.stack((rho, phi, res[:, 2]), dim=-1)
+                min_bound = polar_res.new_tensor(
+                    self.voxel_layer.point_cloud_range[:3])
+                max_bound = polar_res.new_tensor(
+                    self.voxel_layer.point_cloud_range[3:])
+                try:  # only support PyTorch >= 1.9.0
+                    polar_res_clamp = torch.clamp(polar_res, min_bound,
+                                                  max_bound)
+                except TypeError:
+                    polar_res_clamp = polar_res.clone()
+                    for coor_idx in range(3):
+                        polar_res_clamp[:, coor_idx][
+                            polar_res[:, coor_idx] >
+                            max_bound[coor_idx]] = max_bound[coor_idx]
+                        polar_res_clamp[:, coor_idx][
+                            polar_res[:, coor_idx] <
+                            min_bound[coor_idx]] = min_bound[coor_idx]
+                res_coors = torch.floor(
+                    (polar_res_clamp - min_bound) / polar_res_clamp.new_tensor(
+                        self.voxel_layer.voxel_size)).int()
+                self.get_voxel_seg(res_coors, data_sample)
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
+                res_voxels = torch.cat((polar_res, res[:, :2], res[:, 3:]),
+                                       dim=-1)
+                voxels.append(res_voxels)
+                coors.append(res_coors)
+            voxels = torch.cat(voxels, dim=0)
+            coors = torch.cat(coors, dim=0)
+        elif self.voxel_type == 'minkunet':
+            voxels, coors = [], []
+            voxel_size = points[0].new_tensor(self.voxel_layer.voxel_size)
+            for i, (res, data_sample) in enumerate(zip(points, data_samples)):
+                res_coors = torch.round(res[:, :3] / voxel_size).int()
+                res_coors -= res_coors.min(0)[0]
+
+                res_coors_numpy = res_coors.cpu().numpy()
+                inds, point2voxel_map = self.sparse_quantize(
+                    res_coors_numpy, return_index=True, return_inverse=True)
+                point2voxel_map = torch.from_numpy(point2voxel_map).cuda()
+                if self.training and self.max_voxels is not None:
+                    if len(inds) > self.max_voxels:
+                        inds = np.random.choice(
+                            inds, self.max_voxels, replace=False)
+                inds = torch.from_numpy(inds).cuda()
+                if hasattr(data_sample.gt_pts_seg, 'pts_semantic_mask'):
+                    data_sample.gt_pts_seg.voxel_semantic_mask \
+                        = data_sample.gt_pts_seg.pts_semantic_mask[inds]
+                res_voxel_coors = res_coors[inds]
+                res_voxels = res[inds]
+                if self.batch_first:
+                    res_voxel_coors = F.pad(
+                        res_voxel_coors, (1, 0), mode='constant', value=i)
+                    data_sample.batch_idx = res_voxel_coors[:, 0]
+                else:
+                    res_voxel_coors = F.pad(
+                        res_voxel_coors, (0, 1), mode='constant', value=i)
+                    data_sample.batch_idx = res_voxel_coors[:, -1]
+                data_sample.point2voxel_map = point2voxel_map.long()
+                voxels.append(res_voxels)
+                coors.append(res_voxel_coors)
+            voxels = torch.cat(voxels, dim=0)
+            coors = torch.cat(coors, dim=0)
+
+        else:
+            raise ValueError(f'Invalid voxelization type {self.voxel_type}')
+
+        voxel_dict['voxels'] = voxels
+        voxel_dict['coors'] = coors
+
+        return voxel_dict
+
+    def get_voxel_seg(self, res_coors: Tensor,
+                      data_sample: SampleList) -> None:
+        """Get voxel-wise segmentation label and point2voxel map.
+
+        Args:
+            res_coors (Tensor): The voxel coordinates of points, Nx3.
+            data_sample: (:obj:`Det3DDataSample`): The annotation data of
+                every samples. Add voxel-wise annotation forsegmentation.
+        """
+
+        if self.training:
+            pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask
+            voxel_semantic_mask, _, point2voxel_map = dynamic_scatter_3d(
+                F.one_hot(pts_semantic_mask.long()).float(), res_coors, 'mean',
+                True)
+            voxel_semantic_mask = torch.argmax(voxel_semantic_mask, dim=-1)
+            data_sample.gt_pts_seg.voxel_semantic_mask = voxel_semantic_mask
+            data_sample.point2voxel_map = point2voxel_map
+        else:
+            pseudo_tensor = res_coors.new_ones([res_coors.shape[0], 1]).float()
+            _, _, point2voxel_map = dynamic_scatter_3d(pseudo_tensor,
+                                                       res_coors, 'mean', True)
+            data_sample.point2voxel_map = point2voxel_map
+
+    def ravel_hash(self, x: np.ndarray) -> np.ndarray:
+        """Get voxel coordinates hash for np.unique.
+
+        Args:
+            x (np.ndarray): The voxel coordinates of points, Nx3.
+
+        Returns:
+            np.ndarray: Voxels coordinates hash.
+        """
+        assert x.ndim == 2, x.shape
+
+        x = x - np.min(x, axis=0)
+        x = x.astype(np.uint64, copy=False)
+        xmax = np.max(x, axis=0).astype(np.uint64) + 1
+
+        h = np.zeros(x.shape[0], dtype=np.uint64)
+        for k in range(x.shape[1] - 1):
+            h += x[:, k]
+            h *= xmax[k + 1]
+        h += x[:, -1]
+        return h
+
+    def sparse_quantize(self,
+                        coords: np.ndarray,
+                        return_index: bool = False,
+                        return_inverse: bool = False) -> List[np.ndarray]:
+        """Sparse Quantization for voxel coordinates used in Minkunet.
+
+        Args:
+            coords (np.ndarray): The voxel coordinates of points, Nx3.
+            return_index (bool): Whether to return the indices of the unique
+                coords, shape (M,).
+            return_inverse (bool): Whether to return the indices of the
+                original coords, shape (N,).
+
+        Returns:
+            List[np.ndarray]: Return index and inverse map if return_index and
+            return_inverse is True.
+        """
+        _, indices, inverse_indices = np.unique(
+            self.ravel_hash(coords), return_index=True, return_inverse=True)
+        coords = coords[indices]
+
+        outputs = []
+        if return_index:
+            outputs += [indices]
+        if return_inverse:
+            outputs += [inverse_indices]
+        return outputs
diff --git a/mmde/mmdet3d/models/data_preprocessors/utils.py b/mmde/mmdet3d/models/data_preprocessors/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef9cfe7b5862062a2d8a57ad6756a430b7c0f2a1
--- /dev/null
+++ b/mmde/mmdet3d/models/data_preprocessors/utils.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+
+def multiview_img_stack_batch(tensor_list: List[Tensor],
+                              pad_size_divisor: int = 1,
+                              pad_value: Union[int, float] = 0) -> Tensor:
+    """Compared to the ``stack_batch`` in `mmengine.model.utils`,
+    multiview_img_stack_batch further handle the multiview images.
+
+    See diff of padded_sizes[:, :-2] = 0 vs padded_sizes[:, 0] = 0 in line 47.
+
+    Stack multiple tensors to form a batch and pad the tensor to the max shape
+    use the right bottom padding mode in these images. If
+    ``pad_size_divisor > 0``, add padding to ensure the shape of each dim is
+    divisible by ``pad_size_divisor``.
+
+    Args:
+        tensor_list (List[Tensor]): A list of tensors with the same dim.
+        pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding to
+            ensure the shape of each dim is divisible by ``pad_size_divisor``.
+            This depends on the model, and many models need to be divisible by
+            32. Defaults to 1.
+        pad_value (int or float): The padding value. Defaults to 0.
+
+    Returns:
+        Tensor: The n dim tensor.
+    """
+    assert isinstance(tensor_list, list), \
+        f'Expected input type to be list, but got {type(tensor_list)}'
+    assert tensor_list, '`tensor_list` could not be an empty list'
+    assert len({tensor.ndim for tensor in tensor_list}) == 1, \
+        'Expected the dimensions of all tensors must be the same, ' \
+        f'but got {[tensor.ndim for tensor in tensor_list]}'
+
+    dim = tensor_list[0].dim()
+    num_img = len(tensor_list)
+    all_sizes: torch.Tensor = torch.Tensor(
+        [tensor.shape for tensor in tensor_list])
+    max_sizes = torch.ceil(
+        torch.max(all_sizes, dim=0)[0] / pad_size_divisor) * pad_size_divisor
+    padded_sizes = max_sizes - all_sizes
+    # The first dim normally means channel, which should not be padded.
+    padded_sizes[:, :-2] = 0
+    if padded_sizes.sum() == 0:
+        return torch.stack(tensor_list)
+    # `pad` is the second arguments of `F.pad`. If pad is (1, 2, 3, 4),
+    # it means that padding the last dim with 1(left) 2(right), padding the
+    # penultimate dim to 3(top) 4(bottom). The order of `pad` is opposite of
+    # the `padded_sizes`. Therefore, the `padded_sizes` needs to be reversed,
+    # and only odd index of pad should be assigned to keep padding "right" and
+    # "bottom".
+    pad = torch.zeros(num_img, 2 * dim, dtype=torch.int)
+    pad[:, 1::2] = padded_sizes[:, range(dim - 1, -1, -1)]
+    batch_tensor = []
+    for idx, tensor in enumerate(tensor_list):
+        batch_tensor.append(
+            F.pad(tensor, tuple(pad[idx].tolist()), value=pad_value))
+    return torch.stack(batch_tensor)
diff --git a/mmde/mmdet3d/models/data_preprocessors/voxelize.py b/mmde/mmdet3d/models/data_preprocessors/voxelize.py
new file mode 100644
index 0000000000000000000000000000000000000000..25cd5bf586a24e5b09767dde62c3f9eea8680ea9
--- /dev/null
+++ b/mmde/mmdet3d/models/data_preprocessors/voxelize.py
@@ -0,0 +1,326 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any, List, Optional, Tuple, Union
+
+import torch
+from mmcv.utils import ext_loader
+from torch import nn
+from torch.autograd import Function
+from torch.nn import functional as F
+from torch.nn.modules.utils import _pair
+
+ext_module = ext_loader.load_ext('_ext', [
+    'dynamic_voxelize_forward', 'hard_voxelize_forward',
+    'dynamic_point_to_voxel_forward', 'dynamic_point_to_voxel_backward'
+])
+
+
+class _Voxelization(Function):
+
+    @staticmethod
+    def forward(
+            ctx: Any,
+            points: torch.Tensor,
+            voxel_size: Union[tuple, float],
+            coors_range: Union[tuple, float],
+            max_points: int = 35,
+            max_voxels: int = 20000,
+            deterministic: bool = True) -> Union[Tuple[torch.Tensor], Tuple]:
+        """Convert kitti points(N, >=3) to voxels.
+
+        Args:
+            points (torch.Tensor): [N, ndim]. Points[:, :3] contain xyz points
+                and points[:, 3:] contain other information like reflectivity.
+            voxel_size (tuple or float): The size of voxel with the shape of
+                [3].
+            coors_range (tuple or float): The coordinate range of voxel with
+                the shape of [6].
+            max_points (int, optional): maximum points contained in a voxel. if
+                max_points=-1, it means using dynamic_voxelize. Default: 35.
+            max_voxels (int, optional): maximum voxels this function create.
+                for second, 20000 is a good choice. Users should shuffle points
+                before call this function because max_voxels may drop points.
+                Default: 20000.
+            deterministic: bool. whether to invoke the non-deterministic
+                version of hard-voxelization implementations. non-deterministic
+                version is considerablly fast but is not deterministic. only
+                affects hard voxelization. default True. for more information
+                of this argument and the implementation insights, please refer
+                to the following links:
+                https://github.com/open-mmlab/mmdetection3d/issues/894
+                https://github.com/open-mmlab/mmdetection3d/pull/904
+                it is an experimental feature and we will appreciate it if
+                you could share with us the failing cases.
+
+        Returns:
+            tuple[torch.Tensor]: tuple[torch.Tensor]: A tuple contains three
+            elements. The first one is the output voxels with the shape of
+            [M, max_points, n_dim], which only contain points and returned
+            when max_points != -1. The second is the voxel coordinates with
+            shape of [M, 3]. The last is number of point per voxel with the
+            shape of [M], which only returned when max_points != -1.
+        """
+        if max_points == -1 or max_voxels == -1:
+            coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)
+            ext_module.dynamic_voxelize_forward(
+                points,
+                torch.tensor(voxel_size, dtype=torch.float),
+                torch.tensor(coors_range, dtype=torch.float),
+                coors,
+                NDim=3)
+            return coors
+        else:
+            voxels = points.new_zeros(
+                size=(max_voxels, max_points, points.size(1)))
+            coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)
+            num_points_per_voxel = points.new_zeros(
+                size=(max_voxels, ), dtype=torch.int)
+            voxel_num = torch.zeros(size=(), dtype=torch.long)
+            ext_module.hard_voxelize_forward(
+                points,
+                torch.tensor(voxel_size, dtype=torch.float),
+                torch.tensor(coors_range, dtype=torch.float),
+                voxels,
+                coors,
+                num_points_per_voxel,
+                voxel_num,
+                max_points=max_points,
+                max_voxels=max_voxels,
+                NDim=3,
+                deterministic=deterministic)
+            # select the valid voxels
+            voxels_out = voxels[:voxel_num]
+            coors_out = coors[:voxel_num]
+            num_points_per_voxel_out = num_points_per_voxel[:voxel_num]
+            return voxels_out, coors_out, num_points_per_voxel_out
+
+
+voxelization = _Voxelization.apply
+
+
+class VoxelizationByGridShape(nn.Module):
+    """Voxelization that allows inferring voxel size automatically based on
+    grid shape.
+
+    Please refer to `Point-Voxel CNN for Efficient 3D Deep Learning
+    <https://arxiv.org/abs/1907.03739>`_ for more details.
+
+    Args:
+        point_cloud_range (list):
+            [x_min, y_min, z_min, x_max, y_max, z_max]
+        max_num_points (int): max number of points per voxel
+        voxel_size (list): list [x, y, z] or [rho, phi, z]
+            size of single voxel.
+        grid_shape (list): [L, W, H], grid shape of voxelization.
+        max_voxels (tuple or int): max number of voxels in
+            (training, testing) time
+        deterministic: bool. whether to invoke the non-deterministic
+            version of hard-voxelization implementations. non-deterministic
+            version is considerablly fast but is not deterministic. only
+            affects hard voxelization. default True. for more information
+            of this argument and the implementation insights, please refer
+            to the following links:
+            https://github.com/open-mmlab/mmdetection3d/issues/894
+            https://github.com/open-mmlab/mmdetection3d/pull/904
+            it is an experimental feature and we will appreciate it if
+            you could share with us the failing cases.
+    """
+
+    def __init__(self,
+                 point_cloud_range: List,
+                 max_num_points: int,
+                 voxel_size: List = [],
+                 grid_shape: List[int] = [],
+                 max_voxels: Union[tuple, int] = 20000,
+                 deterministic: bool = True):
+        super().__init__()
+        if voxel_size and grid_shape:
+            raise ValueError('voxel_size is mutually exclusive grid_shape')
+        self.point_cloud_range = point_cloud_range
+        self.max_num_points = max_num_points
+        if isinstance(max_voxels, tuple):
+            self.max_voxels = max_voxels
+        else:
+            self.max_voxels = _pair(max_voxels)
+        self.deterministic = deterministic
+
+        point_cloud_range = torch.tensor(
+            point_cloud_range, dtype=torch.float32)
+        if voxel_size:
+            self.voxel_size = voxel_size
+            voxel_size = torch.tensor(voxel_size, dtype=torch.float32)
+            grid_shape = (point_cloud_range[3:] -
+                          point_cloud_range[:3]) / voxel_size
+            grid_shape = torch.round(grid_shape).long().tolist()
+            self.grid_shape = grid_shape
+        elif grid_shape:
+            grid_shape = torch.tensor(grid_shape, dtype=torch.float32)
+            voxel_size = (point_cloud_range[3:] - point_cloud_range[:3]) / (
+                grid_shape - 1)
+            voxel_size = voxel_size.tolist()
+            self.voxel_size = voxel_size
+        else:
+            raise ValueError('must assign a value to voxel_size or grid_shape')
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if self.training:
+            max_voxels = self.max_voxels[0]
+        else:
+            max_voxels = self.max_voxels[1]
+
+        return voxelization(input, self.voxel_size, self.point_cloud_range,
+                            self.max_num_points, max_voxels,
+                            self.deterministic)
+
+    def __repr__(self):
+        s = self.__class__.__name__ + '('
+        s += 'voxel_size=' + str(self.voxel_size)
+        s += ', grid_shape=' + str(self.grid_shape)
+        s += ', point_cloud_range=' + str(self.point_cloud_range)
+        s += ', max_num_points=' + str(self.max_num_points)
+        s += ', max_voxels=' + str(self.max_voxels)
+        s += ', deterministic=' + str(self.deterministic)
+        s += ')'
+        return s
+
+
+class _DynamicScatter(Function):
+    """Different from the mmcv implementation, here it is allowed to return
+    point2voxel_map."""
+
+    @staticmethod
+    def forward(ctx: Any,
+                feats: torch.Tensor,
+                coors: torch.Tensor,
+                reduce_type: str = 'max',
+                return_map: str = False) -> Tuple[torch.Tensor, torch.Tensor]:
+        """convert kitti points(N, >=3) to voxels.
+
+        Args:
+            feats (torch.Tensor): [N, C]. Points features to be reduced
+                into voxels.
+            coors (torch.Tensor): [N, ndim]. Corresponding voxel coordinates
+                (specifically multi-dim voxel index) of each points.
+            reduce_type (str, optional): Reduce op. support 'max', 'sum' and
+                'mean'. Default: 'max'.
+            return_map (str, optional): Whether to return point2voxel_map.
+
+        Returns:
+            tuple[torch.Tensor]: A tuple contains two elements. The first one
+            is the voxel features with shape [M, C] which are respectively
+            reduced from input features that share the same voxel coordinates.
+            The second is voxel coordinates with shape [M, ndim].
+        """
+        results = ext_module.dynamic_point_to_voxel_forward(
+            feats, coors, reduce_type)
+        (voxel_feats, voxel_coors, point2voxel_map,
+         voxel_points_count) = results
+        ctx.reduce_type = reduce_type
+        ctx.save_for_backward(feats, voxel_feats, point2voxel_map,
+                              voxel_points_count)
+        ctx.mark_non_differentiable(voxel_coors)
+        if return_map:
+            return voxel_feats, voxel_coors, point2voxel_map
+        else:
+            return voxel_feats, voxel_coors
+
+    @staticmethod
+    def backward(ctx: Any,
+                 grad_voxel_feats: torch.Tensor,
+                 grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple:
+        (feats, voxel_feats, point2voxel_map,
+         voxel_points_count) = ctx.saved_tensors
+        grad_feats = torch.zeros_like(feats)
+        # TODO: whether to use index put or use cuda_backward
+        # To use index put, need point to voxel index
+        ext_module.dynamic_point_to_voxel_backward(
+            grad_feats, grad_voxel_feats.contiguous(), feats, voxel_feats,
+            point2voxel_map, voxel_points_count, ctx.reduce_type)
+        return grad_feats, None, None
+
+
+dynamic_scatter_3d = _DynamicScatter.apply
+
+
+class DynamicScatter3D(nn.Module):
+    """Scatters points into voxels, used in the voxel encoder with dynamic
+    voxelization.
+
+    Note:
+        The CPU and GPU implementation get the same output, but have numerical
+        difference after summation and division (e.g., 5e-7).
+
+    Args:
+        voxel_size (list): list [x, y, z] size of three dimension.
+        point_cloud_range (list): The coordinate range of points, [x_min,
+            y_min, z_min, x_max, y_max, z_max].
+        average_points (bool): whether to use avg pooling to scatter points
+            into voxel.
+    """
+
+    def __init__(self, voxel_size: List, point_cloud_range: List,
+                 average_points: bool):
+        super().__init__()
+
+        self.voxel_size = voxel_size
+        self.point_cloud_range = point_cloud_range
+        self.average_points = average_points
+
+    def forward_single(
+            self, points: torch.Tensor,
+            coors: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Scatters points into voxels.
+
+        Args:
+            points (torch.Tensor): Points to be reduced into voxels.
+            coors (torch.Tensor): Corresponding voxel coordinates (specifically
+                multi-dim voxel index) of each points.
+
+        Returns:
+            tuple[torch.Tensor]: A tuple contains two elements. The first one
+            is the voxel features with shape [M, C] which are respectively
+            reduced from input features that share the same voxel coordinates.
+            The second is voxel coordinates with shape [M, ndim].
+        """
+        reduce = 'mean' if self.average_points else 'max'
+        return dynamic_scatter_3d(points.contiguous(), coors.contiguous(),
+                                  reduce)
+
+    def forward(self, points: torch.Tensor,
+                coors: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Scatters points/features into voxels.
+
+        Args:
+            points (torch.Tensor): Points to be reduced into voxels.
+            coors (torch.Tensor): Corresponding voxel coordinates (specifically
+                multi-dim voxel index) of each points.
+
+        Returns:
+            tuple[torch.Tensor]: A tuple contains two elements. The first one
+            is the voxel features with shape [M, C] which are respectively
+            reduced from input features that share the same voxel coordinates.
+            The second is voxel coordinates with shape [M, ndim].
+        """
+        if coors.size(-1) == 3:
+            return self.forward_single(points, coors)
+        else:
+            batch_size = coors[-1, 0] + 1
+            voxels, voxel_coors = [], []
+            for i in range(batch_size):
+                inds = torch.where(coors[:, 0] == i)
+                voxel, voxel_coor = self.forward_single(
+                    points[inds], coors[inds][:, 1:])
+                coor_pad = F.pad(voxel_coor, (1, 0), mode='constant', value=i)
+                voxel_coors.append(coor_pad)
+                voxels.append(voxel)
+            features = torch.cat(voxels, dim=0)
+            feature_coors = torch.cat(voxel_coors, dim=0)
+
+            return features, feature_coors
+
+    def __repr__(self):
+        s = self.__class__.__name__ + '('
+        s += 'voxel_size=' + str(self.voxel_size)
+        s += ', point_cloud_range=' + str(self.point_cloud_range)
+        s += ', average_points=' + str(self.average_points)
+        s += ')'
+        return s
diff --git a/mmde/mmdet3d/models/decode_heads/__init__.py b/mmde/mmdet3d/models/decode_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6265875bba2e019ff1c0e40d04162d91e57fc5eb
--- /dev/null
+++ b/mmde/mmdet3d/models/decode_heads/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .cylinder3d_head import Cylinder3DHead
+from .decode_head import Base3DDecodeHead
+from .dgcnn_head import DGCNNHead
+from .minkunet_head import MinkUNetHead
+from .paconv_head import PAConvHead
+from .pointnet2_head import PointNet2Head
+
+__all__ = [
+    'PointNet2Head', 'DGCNNHead', 'PAConvHead', 'Cylinder3DHead',
+    'Base3DDecodeHead', 'MinkUNetHead'
+]
diff --git a/mmde/mmdet3d/models/decode_heads/cylinder3d_head.py b/mmde/mmdet3d/models/decode_heads/cylinder3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1672a0f20941e3cd156effefa681687019dc80cb
--- /dev/null
+++ b/mmde/mmdet3d/models/decode_heads/cylinder3d_head.py
@@ -0,0 +1,158 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+from mmcv.ops import SparseConvTensor, SparseModule, SubMConv3d
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import OptMultiConfig
+from mmdet3d.utils.typing_utils import ConfigType
+from .decode_head import Base3DDecodeHead
+
+
+@MODELS.register_module()
+class Cylinder3DHead(Base3DDecodeHead):
+    """Cylinder3D decoder head.
+
+    Decoder head used in `Cylinder3D <https://arxiv.org/abs/2011.10033>`_.
+    Refer to the
+    `official code <https://https://github.com/xinge008/Cylinder3D>`_.
+
+    Args:
+        channels (int): Channels after modules, before conv_seg.
+        num_classes (int): Number of classes.
+        dropout_ratio (float): Ratio of dropout layer. Defaults to 0.
+        conv_cfg (dict or :obj:`ConfigDict`): Config of conv layers.
+            Defaults to dict(type='Conv1d').
+        norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers.
+            Defaults to dict(type='BN1d').
+        act_cfg (dict or :obj:`ConfigDict`): Config of activation layers.
+            Defaults to dict(type='ReLU').
+        loss_ce (dict or :obj:`ConfigDict`): Config of CrossEntropy loss.
+            Defaults to dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=False,
+                     class_weight=None,
+                     loss_weight=1.0).
+        loss_lovasz (dict or :obj:`ConfigDict`): Config of Lovasz loss.
+            Defaults to dict(type='LovaszLoss', loss_weight=1.0).
+        conv_seg_kernel_size (int): The kernel size used in conv_seg.
+            Defaults to 3.
+        ignore_index (int): The label index to be ignored. When using masked
+            BCE loss, ignore_index should be set to None. Defaults to 19.
+        init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`],
+            optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 channels: int,
+                 num_classes: int,
+                 dropout_ratio: float = 0,
+                 conv_cfg: ConfigType = dict(type='Conv1d'),
+                 norm_cfg: ConfigType = dict(type='BN1d'),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 loss_ce: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=False,
+                     class_weight=None,
+                     loss_weight=1.0),
+                 loss_lovasz: ConfigType = dict(
+                     type='LovaszLoss', loss_weight=1.0),
+                 conv_seg_kernel_size: int = 3,
+                 ignore_index: int = 19,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(Cylinder3DHead, self).__init__(
+            channels=channels,
+            num_classes=num_classes,
+            dropout_ratio=dropout_ratio,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            conv_seg_kernel_size=conv_seg_kernel_size,
+            init_cfg=init_cfg)
+
+        self.loss_lovasz = MODELS.build(loss_lovasz)
+        self.loss_ce = MODELS.build(loss_ce)
+        self.ignore_index = ignore_index
+
+    def build_conv_seg(self, channels: int, num_classes: int,
+                       kernel_size: int) -> SparseModule:
+        return SubMConv3d(
+            channels,
+            num_classes,
+            indice_key='logit',
+            kernel_size=kernel_size,
+            stride=1,
+            padding=1,
+            bias=True)
+
+    def forward(self, sparse_voxels: SparseConvTensor) -> SparseConvTensor:
+        """Forward function."""
+        sparse_logits = self.cls_seg(sparse_voxels)
+        return sparse_logits
+
+    def loss_by_feat(self, seg_logit: SparseConvTensor,
+                     batch_data_samples: SampleList) -> dict:
+        """Compute semantic segmentation loss.
+
+        Args:
+            seg_logit (SparseConvTensor): Predicted per-voxel
+                segmentation logits of shape [num_voxels, num_classes]
+                stored in SparseConvTensor.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
+                data samples. It usually includes information such
+                as `metainfo` and `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        gt_semantic_segs = [
+            data_sample.gt_pts_seg.voxel_semantic_mask
+            for data_sample in batch_data_samples
+        ]
+        seg_label = torch.cat(gt_semantic_segs)
+        seg_logit_feat = seg_logit.features
+        loss = dict()
+        loss['loss_ce'] = self.loss_ce(
+            seg_logit_feat, seg_label, ignore_index=self.ignore_index)
+        loss['loss_lovasz'] = self.loss_lovasz(
+            seg_logit_feat, seg_label, ignore_index=self.ignore_index)
+
+        return loss
+
+    def predict(
+        self,
+        inputs: SparseConvTensor,
+        batch_inputs_dict: dict,
+        batch_data_samples: SampleList,
+    ) -> torch.Tensor:
+        """Forward function for testing.
+
+        Args:
+            inputs (SparseConvTensor): Feature from backbone.
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
+                and 'voxels' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - voxels (dict): Dict of voxelized voxels and the corresponding
+                coordinates.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`. We use `point2voxel_map` in this function.
+
+        Returns:
+            List[torch.Tensor]: List of point-wise segmentation logits.
+        """
+        seg_logits = self.forward(inputs).features
+
+        seg_pred_list = []
+        coors = batch_inputs_dict['voxels']['voxel_coors']
+        for batch_idx in range(len(batch_data_samples)):
+            seg_logits_sample = seg_logits[coors[:, 0] == batch_idx]
+            point2voxel_map = batch_data_samples[
+                batch_idx].point2voxel_map.long()
+            point_seg_predicts = seg_logits_sample[point2voxel_map]
+            seg_pred_list.append(point_seg_predicts)
+
+        return seg_pred_list
diff --git a/mmde/mmdet3d/models/decode_heads/decode_head.py b/mmde/mmdet3d/models/decode_heads/decode_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..58688d8df5abb3784331402133beaaf275eff181
--- /dev/null
+++ b/mmde/mmdet3d/models/decode_heads/decode_head.py
@@ -0,0 +1,178 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List
+
+import torch
+from mmengine.model import BaseModule, normal_init
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils.typing_utils import ConfigType, OptMultiConfig
+
+
+class Base3DDecodeHead(BaseModule, metaclass=ABCMeta):
+    """Base class for BaseDecodeHead.
+
+    1. The ``init_weights`` method is used to initialize decode_head's
+    model parameters. After segmentor initialization, ``init_weights``
+    is triggered when ``segmentor.init_weights()`` is called externally.
+
+    2. The ``loss`` method is used to calculate the loss of decode_head,
+    which includes two steps: (1) the decode_head model performs forward
+    propagation to obtain the feature maps (2) The ``loss_by_feat`` method
+    is called based on the feature maps to calculate the loss.
+
+    .. code:: text
+
+    loss(): forward() -> loss_by_feat()
+
+    3. The ``predict`` method is used to predict segmentation results,
+    which includes two steps: (1) the decode_head model performs forward
+    propagation to obtain the feature maps (2) The ``predict_by_feat`` method
+    is called based on the feature maps to predict segmentation results
+    including post-processing.
+
+    .. code:: text
+
+    predict(): forward() -> predict_by_feat()
+
+    Args:
+        channels (int): Channels after modules, before conv_seg.
+        num_classes (int): Number of classes.
+        dropout_ratio (float): Ratio of dropout layer. Defaults to 0.5.
+        conv_cfg (dict or :obj:`ConfigDict`): Config of conv layers.
+            Defaults to dict(type='Conv1d').
+        norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers.
+            Defaults to dict(type='BN1d').
+        act_cfg (dict or :obj:`ConfigDict`): Config of activation layers.
+            Defaults to dict(type='ReLU').
+        loss_decode (dict or :obj:`ConfigDict`): Config of decode loss.
+            Defaults to dict(type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            class_weight=None, loss_weight=1.0).
+        conv_seg_kernel_size (int): The kernel size used in conv_seg.
+            Defaults to 1.
+        ignore_index (int): The label index to be ignored. When using masked
+            BCE loss, ignore_index should be set to None. Defaults to 255.
+        init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`],
+            optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 channels: int,
+                 num_classes: int,
+                 dropout_ratio: float = 0.5,
+                 conv_cfg: ConfigType = dict(type='Conv1d'),
+                 norm_cfg: ConfigType = dict(type='BN1d'),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 loss_decode: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=False,
+                     class_weight=None,
+                     loss_weight=1.0),
+                 conv_seg_kernel_size: int = 1,
+                 ignore_index: int = 255,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(Base3DDecodeHead, self).__init__(init_cfg=init_cfg)
+        self.channels = channels
+        self.num_classes = num_classes
+        self.dropout_ratio = dropout_ratio
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.loss_decode = MODELS.build(loss_decode)
+        self.ignore_index = ignore_index
+
+        self.conv_seg = self.build_conv_seg(
+            channels=channels,
+            num_classes=num_classes,
+            kernel_size=conv_seg_kernel_size)
+        if dropout_ratio > 0:
+            self.dropout = nn.Dropout(dropout_ratio)
+        else:
+            self.dropout = None
+
+    def init_weights(self) -> None:
+        """Initialize weights of classification layer."""
+        super().init_weights()
+        normal_init(self.conv_seg, mean=0, std=0.01)
+
+    @abstractmethod
+    def forward(self, feats_dict: dict) -> Tensor:
+        """Placeholder of forward function."""
+        pass
+
+    def build_conv_seg(self, channels: int, num_classes: int,
+                       kernel_size: int) -> nn.Module:
+        """Build Convolutional Segmentation Layers."""
+        return nn.Conv1d(channels, num_classes, kernel_size=kernel_size)
+
+    def cls_seg(self, feat: Tensor) -> Tensor:
+        """Classify each points."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.conv_seg(feat)
+        return output
+
+    def loss(self, inputs: dict, batch_data_samples: SampleList,
+             train_cfg: ConfigType) -> Dict[str, Tensor]:
+        """Forward function for training.
+
+        Args:
+            inputs (dict): Feature dict from backbone.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+            train_cfg (dict or :obj:`ConfigDict`): The training config.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
+        """
+        seg_logits = self.forward(inputs)
+        losses = self.loss_by_feat(seg_logits, batch_data_samples)
+        return losses
+
+    def predict(self, inputs: dict, batch_input_metas: List[dict],
+                test_cfg: ConfigType) -> Tensor:
+        """Forward function for testing.
+
+        Args:
+            inputs (dict): Feature dict from backbone.
+            batch_input_metas (List[dict]): Meta information of a batch of
+                samples.
+            test_cfg (dict or :obj:`ConfigDict`): The testing config.
+
+        Returns:
+            Tensor: Output segmentation map.
+        """
+        seg_logits = self.forward(inputs)
+
+        return seg_logits
+
+    def _stack_batch_gt(self, batch_data_samples: SampleList) -> Tensor:
+        gt_semantic_segs = [
+            data_sample.gt_pts_seg.pts_semantic_mask
+            for data_sample in batch_data_samples
+        ]
+        return torch.stack(gt_semantic_segs, dim=0)
+
+    def loss_by_feat(self, seg_logit: Tensor,
+                     batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """Compute semantic segmentation loss.
+
+        Args:
+            seg_logit (Tensor): Predicted per-point segmentation logits of
+                shape [B, num_classes, N].
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
+        """
+        seg_label = self._stack_batch_gt(batch_data_samples)
+        loss = dict()
+        loss['loss_sem_seg'] = self.loss_decode(
+            seg_logit, seg_label, ignore_index=self.ignore_index)
+        return loss
diff --git a/mmde/mmdet3d/models/decode_heads/dgcnn_head.py b/mmde/mmdet3d/models/decode_heads/dgcnn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..b64d2b8bf98d8b420c8ba5458a930c7f55376082
--- /dev/null
+++ b/mmde/mmdet3d/models/decode_heads/dgcnn_head.py
@@ -0,0 +1,71 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+from mmcv.cnn.bricks import ConvModule
+from torch import Tensor
+
+from mmdet3d.models.layers import DGCNNFPModule
+from mmdet3d.registry import MODELS
+from .decode_head import Base3DDecodeHead
+
+
+@MODELS.register_module()
+class DGCNNHead(Base3DDecodeHead):
+    r"""DGCNN decoder head.
+
+    Decoder head used in `DGCNN <https://arxiv.org/abs/1801.07829>`_.
+    Refer to the
+    `reimplementation code <https://github.com/AnTao97/dgcnn.pytorch>`_.
+
+    Args:
+        fp_channels (Sequence[int]): Tuple of mlp channels in feature
+            propagation (FP) modules. Defaults to (1216, 512).
+    """
+
+    def __init__(self, fp_channels: Sequence[int] = (1216, 512),
+                 **kwargs) -> None:
+        super(DGCNNHead, self).__init__(**kwargs)
+
+        self.FP_module = DGCNNFPModule(
+            mlp_channels=fp_channels, act_cfg=self.act_cfg)
+
+        # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40
+        self.pre_seg_conv = ConvModule(
+            fp_channels[-1],
+            self.channels,
+            kernel_size=1,
+            bias=False,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+    def _extract_input(self, feat_dict: dict) -> Tensor:
+        """Extract inputs from features dictionary.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            torch.Tensor: Points for decoder.
+        """
+        fa_points = feat_dict['fa_points']
+
+        return fa_points
+
+    def forward(self, feat_dict: dict) -> Tensor:
+        """Forward pass.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            Tensor: Segmentation map of shape [B, num_classes, N].
+        """
+        fa_points = self._extract_input(feat_dict)
+
+        fp_points = self.FP_module(fa_points)
+        fp_points = fp_points.transpose(1, 2).contiguous()
+        output = self.pre_seg_conv(fp_points)
+        output = self.cls_seg(output)
+
+        return output
diff --git a/mmde/mmdet3d/models/decode_heads/minkunet_head.py b/mmde/mmdet3d/models/decode_heads/minkunet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a2d1a455e12a23fe71b99f5cce69cd8a57f6072
--- /dev/null
+++ b/mmde/mmdet3d/models/decode_heads/minkunet_head.py
@@ -0,0 +1,74 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+from .decode_head import Base3DDecodeHead
+
+
+@MODELS.register_module()
+class MinkUNetHead(Base3DDecodeHead):
+    r"""MinkUNet decoder head with TorchSparse backend.
+
+    Refer to `implementation code <https://github.com/mit-han-lab/spvnas>`_.
+
+    Args:
+        channels (int): The input channel of conv_seg.
+        num_classes (int): Number of classes.
+    """
+
+    def __init__(self, channels: int, num_classes: int, **kwargs) -> None:
+        super().__init__(channels, num_classes, **kwargs)
+
+    def build_conv_seg(self, channels: int, num_classes: int,
+                       kernel_size: int) -> nn.Module:
+        """Build Convolutional Segmentation Layers."""
+        return nn.Linear(channels, num_classes)
+
+    def _stack_batch_gt(self, batch_data_samples: SampleList) -> Tensor:
+        """Concat voxel-wise Groud Truth."""
+        gt_semantic_segs = [
+            data_sample.gt_pts_seg.voxel_semantic_mask
+            for data_sample in batch_data_samples
+        ]
+        return torch.cat(gt_semantic_segs)
+
+    def predict(self, inputs: Tensor,
+                batch_data_samples: SampleList) -> List[Tensor]:
+        """Forward function for testing.
+
+        Args:
+            inputs (Tensor): Features from backone.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
+                data samples.
+
+        Returns:
+            List[Tensor]: The segmentation prediction mask of each batch.
+        """
+        seg_logits = self.forward(inputs)
+
+        batch_idx = torch.cat(
+            [data_samples.batch_idx for data_samples in batch_data_samples])
+        seg_logit_list = []
+        for i, data_sample in enumerate(batch_data_samples):
+            seg_logit = seg_logits[batch_idx == i]
+            seg_logit = seg_logit[data_sample.point2voxel_map]
+            seg_logit_list.append(seg_logit)
+
+        return seg_logit_list
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            x (Tensor): Features from backbone.
+
+        Returns:
+            Tensor: Segmentation map of shape [N, C].
+                Note that output contains all points from each batch.
+        """
+        return self.cls_seg(x)
diff --git a/mmde/mmdet3d/models/decode_heads/paconv_head.py b/mmde/mmdet3d/models/decode_heads/paconv_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ae20b8d6d8157683061f1e452a067883dafda08
--- /dev/null
+++ b/mmde/mmdet3d/models/decode_heads/paconv_head.py
@@ -0,0 +1,73 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+from mmcv.cnn.bricks import ConvModule
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils.typing_utils import ConfigType
+from .pointnet2_head import PointNet2Head
+
+
+@MODELS.register_module()
+class PAConvHead(PointNet2Head):
+    r"""PAConv decoder head.
+
+    Decoder head used in `PAConv <https://arxiv.org/abs/2103.14635>`_.
+    Refer to the `official code <https://github.com/CVMI-Lab/PAConv>`_.
+
+    Args:
+        fp_channels (Sequence[Sequence[int]]): Tuple of mlp channels in FP
+            modules. Defaults to ((768, 256, 256), (384, 256, 256),
+            (320, 256, 128), (128 + 6, 128, 128, 128)).
+        fp_norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers used in
+            FP modules. Defaults to dict(type='BN2d').
+    """
+
+    def __init__(self,
+                 fp_channels: Sequence[Sequence[int]] = ((768, 256, 256),
+                                                         (384, 256, 256),
+                                                         (320, 256,
+                                                          128), (128 + 6, 128,
+                                                                 128, 128)),
+                 fp_norm_cfg: ConfigType = dict(type='BN2d'),
+                 **kwargs) -> None:
+        super(PAConvHead, self).__init__(
+            fp_channels=fp_channels, fp_norm_cfg=fp_norm_cfg, **kwargs)
+
+        # https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/pointnet2/pointnet2_paconv_seg.py#L53
+        # PointNet++'s decoder conv has bias while PAConv's doesn't have
+        # so we need to rebuild it here
+        self.pre_seg_conv = ConvModule(
+            fp_channels[-1][-1],
+            self.channels,
+            kernel_size=1,
+            bias=False,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+    def forward(self, feat_dict: dict) -> Tensor:
+        """Forward pass.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            torch.Tensor: Segmentation map of shape [B, num_classes, N].
+        """
+        sa_xyz, sa_features = self._extract_input(feat_dict)
+
+        # PointNet++ doesn't use the first level of `sa_features` as input
+        # while PAConv inputs it through skip-connection
+        fp_feature = sa_features[-1]
+
+        for i in range(self.num_fp):
+            # consume the points in a bottom-up manner
+            fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)],
+                                            sa_features[-(i + 2)], fp_feature)
+
+        output = self.pre_seg_conv(fp_feature)
+        output = self.cls_seg(output)
+
+        return output
diff --git a/mmde/mmdet3d/models/decode_heads/pointnet2_head.py b/mmde/mmdet3d/models/decode_heads/pointnet2_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a762cdede6bc298e0daaf35fcd58d21e20e6338
--- /dev/null
+++ b/mmde/mmdet3d/models/decode_heads/pointnet2_head.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Sequence, Tuple
+
+from mmcv.cnn.bricks import ConvModule
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models.layers import PointFPModule
+from mmdet3d.registry import MODELS
+from mmdet3d.utils.typing_utils import ConfigType
+from .decode_head import Base3DDecodeHead
+
+
+@MODELS.register_module()
+class PointNet2Head(Base3DDecodeHead):
+    r"""PointNet2 decoder head.
+
+    Decoder head used in `PointNet++ <https://arxiv.org/abs/1706.02413>`_.
+    Refer to the `official code <https://github.com/charlesq34/pointnet2>`_.
+
+    Args:
+        fp_channels (Sequence[Sequence[int]]): Tuple of mlp channels in FP
+            modules. Defaults to ((768, 256, 256), (384, 256, 256),
+            (320, 256, 128), (128, 128, 128, 128)).
+        fp_norm_cfg (dict or :obj:`ConfigDict`): Config of norm layers used
+            in FP modules. Defaults to dict(type='BN2d').
+    """
+
+    def __init__(self,
+                 fp_channels: Sequence[Sequence[int]] = ((768, 256, 256),
+                                                         (384, 256, 256),
+                                                         (320, 256, 128),
+                                                         (128, 128, 128, 128)),
+                 fp_norm_cfg: ConfigType = dict(type='BN2d'),
+                 **kwargs) -> None:
+        super(PointNet2Head, self).__init__(**kwargs)
+
+        self.num_fp = len(fp_channels)
+        self.FP_modules = nn.ModuleList()
+        for cur_fp_mlps in fp_channels:
+            self.FP_modules.append(
+                PointFPModule(mlp_channels=cur_fp_mlps, norm_cfg=fp_norm_cfg))
+
+        # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40
+        self.pre_seg_conv = ConvModule(
+            fp_channels[-1][-1],
+            self.channels,
+            kernel_size=1,
+            bias=True,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+    def _extract_input(self,
+                       feat_dict: dict) -> Tuple[List[Tensor], List[Tensor]]:
+        """Extract inputs from features dictionary.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            Tuple[List[Tensor], List[Tensor]]: Coordinates and features of
+            multiple levels of points.
+        """
+        sa_xyz = feat_dict['sa_xyz']
+        sa_features = feat_dict['sa_features']
+        assert len(sa_xyz) == len(sa_features)
+
+        return sa_xyz, sa_features
+
+    def forward(self, feat_dict: dict) -> Tensor:
+        """Forward pass.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            Tensor: Segmentation map of shape [B, num_classes, N].
+        """
+        sa_xyz, sa_features = self._extract_input(feat_dict)
+
+        # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L24
+        sa_features[0] = None
+
+        fp_feature = sa_features[-1]
+
+        for i in range(self.num_fp):
+            # consume the points in a bottom-up manner
+            fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)],
+                                            sa_features[-(i + 2)], fp_feature)
+        output = self.pre_seg_conv(fp_feature)
+        output = self.cls_seg(output)
+
+        return output
diff --git a/mmde/mmdet3d/models/dense_heads/__init__.py b/mmde/mmdet3d/models/dense_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2503ee8c607d8f04e7b82995b2c7c9a3cb13e13a
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .anchor3d_head import Anchor3DHead
+from .anchor_free_mono3d_head import AnchorFreeMono3DHead
+from .base_3d_dense_head import Base3DDenseHead
+from .base_conv_bbox_head import BaseConvBboxHead
+from .base_mono3d_dense_head import BaseMono3DDenseHead
+from .centerpoint_head import CenterHead
+from .fcaf3d_head import FCAF3DHead
+from .fcos_mono3d_head import FCOSMono3DHead
+from .free_anchor3d_head import FreeAnchor3DHead
+from .groupfree3d_head import GroupFree3DHead
+from .imvoxel_head import ImVoxelHead
+from .monoflex_head import MonoFlexHead
+from .parta2_rpn_head import PartA2RPNHead
+from .pgd_head import PGDHead
+from .point_rpn_head import PointRPNHead
+from .shape_aware_head import ShapeAwareHead
+from .smoke_mono3d_head import SMOKEMono3DHead
+from .ssd_3d_head import SSD3DHead
+from .vote_head import VoteHead
+
+__all__ = [
+    'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
+    'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
+    'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
+    'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
+    'MonoFlexHead', 'Base3DDenseHead', 'FCAF3DHead', 'ImVoxelHead'
+]
diff --git a/mmde/mmdet3d/models/dense_heads/anchor3d_head.py b/mmde/mmdet3d/models/dense_heads/anchor3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..053a8ca46e50d37b89a844c0b3d787f4cdff2ee8
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/anchor3d_head.py
@@ -0,0 +1,428 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import List, Tuple
+
+import numpy as np
+import torch
+from mmdet.models.utils import multi_apply
+from mmdet.utils.memory import cast_tensor_type
+from mmengine.runner import amp
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models.task_modules import PseudoSampler
+from mmdet3d.models.test_time_augs import merge_aug_bboxes_3d
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.utils.typing_utils import (ConfigType, InstanceList,
+                                        OptConfigType, OptInstanceList)
+from .base_3d_dense_head import Base3DDenseHead
+from .train_mixins import AnchorTrainMixin
+
+
+@MODELS.register_module()
+class Anchor3DHead(Base3DDenseHead, AnchorTrainMixin):
+    """Anchor-based head for SECOND/PointPillars/MVXNet/PartA2.
+
+    Args:
+        num_classes (int): Number of classes.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of channels of the feature map.
+        use_direction_classifier (bool): Whether to add a direction classifier.
+        anchor_generator(dict): Config dict of anchor generator.
+        assigner_per_size (bool): Whether to do assignment for each separate
+            anchor size.
+        assign_per_class (bool): Whether to do assignment for each class.
+        diff_rad_by_sin (bool): Whether to change the difference into sin
+            difference for box regression loss.
+        dir_offset (float | int): The offset of BEV rotation angles.
+            (TODO: may be moved into box coder)
+        dir_limit_offset (float | int): The limited range of BEV
+            rotation angles. (TODO: may be moved into box coder)
+        bbox_coder (dict): Config dict of box coders.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of localization loss.
+        loss_dir (dict): Config of direction classifier loss.
+        train_cfg (dict): Train configs.
+        test_cfg (dict): Test configs.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 feat_channels: int = 256,
+                 use_direction_classifier: bool = True,
+                 anchor_generator: ConfigType = dict(
+                     type='Anchor3DRangeGenerator',
+                     range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
+                     strides=[2],
+                     sizes=[[3.9, 1.6, 1.56]],
+                     rotations=[0, 1.57],
+                     custom_values=[],
+                     reshape_out=False),
+                 assigner_per_size: bool = False,
+                 assign_per_class: bool = False,
+                 diff_rad_by_sin: bool = True,
+                 dir_offset: float = -np.pi / 2,
+                 dir_limit_offset: int = 0,
+                 bbox_coder: ConfigType = dict(type='DeltaXYZWLHRBBoxCoder'),
+                 loss_cls: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_bbox: ConfigType = dict(
+                     type='mmdet.SmoothL1Loss',
+                     beta=1.0 / 9.0,
+                     loss_weight=2.0),
+                 loss_dir: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss', loss_weight=0.2),
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptConfigType = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.feat_channels = feat_channels
+        self.diff_rad_by_sin = diff_rad_by_sin
+        self.use_direction_classifier = use_direction_classifier
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.assigner_per_size = assigner_per_size
+        self.assign_per_class = assign_per_class
+        self.dir_offset = dir_offset
+        self.dir_limit_offset = dir_limit_offset
+        warnings.warn(
+            'dir_offset and dir_limit_offset will be depressed and be '
+            'incorporated into box coder in the future')
+
+        # build anchor generator
+        self.prior_generator = TASK_UTILS.build(anchor_generator)
+        # In 3D detection, the anchor stride is connected with anchor size
+        self.num_anchors = self.prior_generator.num_base_anchors
+        # build box coder
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.box_code_size = self.bbox_coder.code_size
+
+        # build loss function
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        self.sampling = loss_cls['type'] not in [
+            'mmdet.FocalLoss', 'mmdet.GHMC'
+        ]
+        if not self.use_sigmoid_cls:
+            self.num_classes += 1
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.loss_dir = MODELS.build(loss_dir)
+
+        self._init_layers()
+        self._init_assigner_sampler()
+
+        if init_cfg is None:
+            self.init_cfg = dict(
+                type='Normal',
+                layer='Conv2d',
+                std=0.01,
+                override=dict(
+                    type='Normal', name='conv_cls', std=0.01, bias_prob=0.01))
+
+    def _init_assigner_sampler(self):
+        """Initialize the target assigner and sampler of the head."""
+        if self.train_cfg is None:
+            return
+
+        if self.sampling:
+            self.bbox_sampler = TASK_UTILS.build(self.train_cfg.sampler)
+        else:
+            self.bbox_sampler = PseudoSampler()
+        if isinstance(self.train_cfg.assigner, dict):
+            self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
+        elif isinstance(self.train_cfg.assigner, list):
+            self.bbox_assigner = [
+                TASK_UTILS.build(res) for res in self.train_cfg.assigner
+            ]
+
+    def _init_layers(self):
+        """Initialize neural network layers of the head."""
+        self.cls_out_channels = self.num_anchors * self.num_classes
+        self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1)
+        self.conv_reg = nn.Conv2d(self.feat_channels,
+                                  self.num_anchors * self.box_code_size, 1)
+        if self.use_direction_classifier:
+            self.conv_dir_cls = nn.Conv2d(self.feat_channels,
+                                          self.num_anchors * 2, 1)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
+        """Forward function on a single-scale feature map.
+
+        Args:
+            x (Tensor): Features of a single scale level.
+
+        Returns:
+            tuple:
+                cls_score (Tensor): Cls scores for a single scale level
+                    the channels number is num_base_priors * num_classes.
+                bbox_pred (Tensor): Box energies / deltas for a single scale
+                    level, the channels number is num_base_priors * C.
+                dir_cls_pred (Tensor | None): Direction classification
+                    prediction for a single scale level, the channels
+                    number is num_base_priors * 2.
+        """
+        cls_score = self.conv_cls(x)
+        bbox_pred = self.conv_reg(x)
+        dir_cls_pred = None
+        if self.use_direction_classifier:
+            dir_cls_pred = self.conv_dir_cls(x)
+        return cls_score, bbox_pred, dir_cls_pred
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward pass.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network,
+                each is a 4D-tensor.
+
+        Returns:
+            tuple: A tuple of classification scores, bbox and direction
+                classification prediction.
+
+                - cls_scores (list[Tensor]): Classification scores for all
+                    scale levels, each is a 4D-tensor, the channels number
+                    is num_base_priors * num_classes.
+                - bbox_preds (list[Tensor]): Box energies / deltas for all
+                    scale levels, each is a 4D-tensor, the channels number
+                    is num_base_priors * C.
+                - dir_cls_preds (list[Tensor|None]): Direction classification
+                    predictions for all scale levels, each is a 4D-tensor,
+                    the channels number is num_base_priors * 2.
+        """
+        return multi_apply(self.forward_single, x)
+
+    # TODO: Support augmentation test
+    def aug_test(self,
+                 aug_batch_feats,
+                 aug_batch_input_metas,
+                 rescale=False,
+                 **kwargs):
+        aug_bboxes = []
+        # only support aug_test for one sample
+        for x, input_meta in zip(aug_batch_feats, aug_batch_input_metas):
+            outs = self.forward(x)
+            bbox_list = self.get_results(*outs, [input_meta], rescale=rescale)
+            bbox_dict = dict(
+                bboxes_3d=bbox_list[0].bboxes_3d,
+                scores_3d=bbox_list[0].scores_3d,
+                labels_3d=bbox_list[0].labels_3d)
+            aug_bboxes.append(bbox_dict)
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, aug_batch_input_metas,
+                                            self.test_cfg)
+        return [merged_bboxes]
+
+    def get_anchors(self,
+                    featmap_sizes: List[tuple],
+                    input_metas: List[dict],
+                    device: str = 'cuda') -> list:
+        """Get anchors according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            input_metas (list[dict]): contain pcd and img's meta info.
+            device (str): device of current module.
+
+        Returns:
+            list[list[torch.Tensor]]: Anchors of each image, valid flags
+                of each image.
+        """
+        num_imgs = len(input_metas)
+        # since feature map sizes of all images are the same, we only compute
+        # anchors for one time
+        multi_level_anchors = self.prior_generator.grid_anchors(
+            featmap_sizes, device=device)
+        anchor_list = [multi_level_anchors for _ in range(num_imgs)]
+        return anchor_list
+
+    def _loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                             dir_cls_pred: Tensor, labels: Tensor,
+                             label_weights: Tensor, bbox_targets: Tensor,
+                             bbox_weights: Tensor, dir_targets: Tensor,
+                             dir_weights: Tensor, num_total_samples: int):
+        """Calculate loss of Single-level results.
+
+        Args:
+            cls_score (Tensor): Class score in single-level.
+            bbox_pred (Tensor): Bbox prediction in single-level.
+            dir_cls_pred (Tensor): Predictions of direction class
+                in single-level.
+            labels (Tensor): Labels of class.
+            label_weights (Tensor): Weights of class loss.
+            bbox_targets (Tensor): Targets of bbox predictions.
+            bbox_weights (Tensor): Weights of bbox loss.
+            dir_targets (Tensor): Targets of direction predictions.
+            dir_weights (Tensor): Weights of direction loss.
+            num_total_samples (int): The number of valid samples.
+
+        Returns:
+            tuple[torch.Tensor]: Losses of class, bbox
+                and direction, respectively.
+        """
+        # classification loss
+        if num_total_samples is None:
+            num_total_samples = int(cls_score.shape[0])
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.num_classes)
+        assert labels.max().item() <= self.num_classes
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=num_total_samples)
+
+        # regression loss
+        bbox_pred = bbox_pred.permute(0, 2, 3,
+                                      1).reshape(-1, self.box_code_size)
+        bbox_targets = bbox_targets.reshape(-1, self.box_code_size)
+        bbox_weights = bbox_weights.reshape(-1, self.box_code_size)
+
+        bg_class_ind = self.num_classes
+        pos_inds = ((labels >= 0)
+                    & (labels < bg_class_ind)).nonzero(
+                        as_tuple=False).reshape(-1)
+        num_pos = len(pos_inds)
+
+        pos_bbox_pred = bbox_pred[pos_inds]
+        pos_bbox_targets = bbox_targets[pos_inds]
+        pos_bbox_weights = bbox_weights[pos_inds]
+
+        # dir loss
+        if self.use_direction_classifier:
+            dir_cls_pred = dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2)
+            dir_targets = dir_targets.reshape(-1)
+            dir_weights = dir_weights.reshape(-1)
+            pos_dir_cls_pred = dir_cls_pred[pos_inds]
+            pos_dir_targets = dir_targets[pos_inds]
+            pos_dir_weights = dir_weights[pos_inds]
+
+        if num_pos > 0:
+            code_weight = self.train_cfg.get('code_weight', None)
+            if code_weight:
+                pos_bbox_weights = pos_bbox_weights * bbox_weights.new_tensor(
+                    code_weight)
+            if self.diff_rad_by_sin:
+                pos_bbox_pred, pos_bbox_targets = self.add_sin_difference(
+                    pos_bbox_pred, pos_bbox_targets)
+            loss_bbox = self.loss_bbox(
+                pos_bbox_pred,
+                pos_bbox_targets,
+                pos_bbox_weights,
+                avg_factor=num_total_samples)
+
+            # direction classification loss
+            loss_dir = None
+            if self.use_direction_classifier:
+                loss_dir = self.loss_dir(
+                    pos_dir_cls_pred,
+                    pos_dir_targets,
+                    pos_dir_weights,
+                    avg_factor=num_total_samples)
+        else:
+            loss_bbox = pos_bbox_pred.sum()
+            if self.use_direction_classifier:
+                loss_dir = pos_dir_cls_pred.sum()
+
+        return loss_cls, loss_bbox, loss_dir
+
+    @staticmethod
+    def add_sin_difference(boxes1: Tensor, boxes2: Tensor) -> tuple:
+        """Convert the rotation difference to difference in sine function.
+
+        Args:
+            boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7
+                and the 7th dimension is rotation dimension.
+            boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and
+                the 7th dimension is rotation dimension.
+
+        Returns:
+            tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th
+                dimensions are changed.
+        """
+        rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos(
+            boxes2[..., 6:7])
+        rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[...,
+                                                                         6:7])
+        boxes1 = torch.cat(
+            [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1)
+        boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]],
+                           dim=-1)
+        return boxes1, boxes2
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            dir_cls_preds: List[Tensor],
+            batch_gt_instances_3d: InstanceList,
+            batch_input_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[torch.Tensor]): Multi-level class scores.
+            bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
+            dir_cls_preds (list[torch.Tensor]): Multi-level direction
+                class predictions.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d``
+                and ``labels_3d`` attributes.
+            batch_input_metas (list[dict]): Contain pcd and img's meta info.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, list[torch.Tensor]]: Classification, bbox, and
+                direction losses of each level.
+
+                - loss_cls (list[torch.Tensor]): Classification losses.
+                - loss_bbox (list[torch.Tensor]): Box regression losses.
+                - loss_dir (list[torch.Tensor]): Direction classification
+                    losses.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+        device = cls_scores[0].device
+        anchor_list = self.get_anchors(
+            featmap_sizes, batch_input_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.anchor_target_3d(
+            anchor_list,
+            batch_gt_instances_3d,
+            batch_input_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            num_classes=self.num_classes,
+            label_channels=label_channels,
+            sampling=self.sampling)
+
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         dir_targets_list, dir_weights_list, num_total_pos,
+         num_total_neg) = cls_reg_targets
+        num_total_samples = (
+            num_total_pos + num_total_neg if self.sampling else num_total_pos)
+
+        # num_total_samples = None
+        with amp.autocast(enabled=False):
+            losses_cls, losses_bbox, losses_dir = multi_apply(
+                self._loss_by_feat_single,
+                cast_tensor_type(cls_scores, dst_type=torch.float32),
+                cast_tensor_type(bbox_preds, dst_type=torch.float32),
+                cast_tensor_type(dir_cls_preds, dst_type=torch.float32),
+                labels_list,
+                label_weights_list,
+                bbox_targets_list,
+                bbox_weights_list,
+                dir_targets_list,
+                dir_weights_list,
+                num_total_samples=num_total_samples)
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir)
diff --git a/mmde/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py b/mmde/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..247d41cf598fb04bc111282a0d307b2575ec4c16
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
@@ -0,0 +1,479 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import abstractmethod
+from typing import Any, List, Sequence, Tuple, Union
+
+import torch
+from mmcv.cnn import ConvModule
+from mmdet.models.utils import multi_apply
+from mmengine.model import bias_init_with_prob, normal_init
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, InstanceList, OptConfigType
+from .base_mono3d_dense_head import BaseMono3DDenseHead
+
+
+@MODELS.register_module()
+class AnchorFreeMono3DHead(BaseMono3DDenseHead):
+    """Anchor-free head for monocular 3D object detection.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels.
+            Used in child classes. Defaults to 256.
+        stacked_convs (int): Number of stacking convs of the head.
+        strides (Sequence[int] or Sequence[Tuple[int, int]]): Downsample
+            factor of each feature map.
+        dcn_on_last_conv (bool): If true, use dcn in the last
+            layer of towers. Default: False.
+        conv_bias (bool or str): If specified as `auto`, it will be
+            decided by the norm_cfg. Bias of conv will be set as True
+            if `norm_cfg` is None, otherwise False. Default: 'auto'.
+        background_label (bool, Optional): Label ID of background,
+            set as 0 for RPN and num_classes for other heads.
+            It will automatically set as `num_classes` if None is given.
+        use_direction_classifier (bool):
+            Whether to add a direction classifier.
+        diff_rad_by_sin (bool): Whether to change the difference
+            into sin difference for box regression loss. Defaults to True.
+        dir_offset (float): Parameter used in direction
+            classification. Defaults to 0.
+        dir_limit_offset (float): Parameter used in direction
+            classification. Defaults to 0.
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+        loss_dir (:obj:`ConfigDict` or dict): Config of direction classifier
+            loss.
+        loss_attr (:obj:`ConfigDict` or dict): Config of attribute classifier
+            loss, which is only active when `pred_attrs=True`.
+        bbox_code_size (int): Dimensions of predicted bounding boxes.
+        pred_attrs (bool): Whether to predict attributes.
+            Defaults to False.
+        num_attrs (int): The number of attributes to be predicted.
+            Default: 9.
+        pred_velo (bool): Whether to predict velocity.
+            Defaults to False.
+        pred_bbox2d (bool): Whether to predict 2D boxes.
+            Defaults to False.
+        group_reg_dims (tuple[int], optional): The dimension of each regression
+            target group. Default: (2, 1, 3, 1, 2).
+        cls_branch (tuple[int], optional): Channels for classification branch.
+            Default: (128, 64).
+        reg_branch (tuple[tuple], optional): Channels for regression branch.
+            Default: (
+                (128, 64),  # offset
+                (128, 64),  # depth
+                (64, ),  # size
+                (64, ),  # rot
+                ()  # velo
+            ),
+        dir_branch (Sequence[int]): Channels for direction
+            classification branch. Default: (64, ).
+        attr_branch (Sequence[int]): Channels for classification branch.
+            Default: (64, ).
+        conv_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
+            convolution layer. Default: None.
+        norm_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for
+            normalization layer. Default: None.
+        train_cfg (:obj:`ConfigDict` or dict, Optional): Training config
+            of anchor head.
+        test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of
+            anchor head.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+    """  # noqa: W605
+
+    _version = 1
+
+    def __init__(
+            self,
+            num_classes: int,
+            in_channels: int,
+            feat_channels: int = 256,
+            stacked_convs: int = 4,
+            strides: Sequence[int] = (4, 8, 16, 32, 64),
+            dcn_on_last_conv: bool = False,
+            conv_bias: Union[bool, str] = 'auto',
+            background_label: bool = None,
+            use_direction_classifier: bool = True,
+            diff_rad_by_sin: bool = True,
+            dir_offset: int = 0,
+            dir_limit_offset: int = 0,
+            loss_cls: ConfigType = dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_bbox: ConfigType = dict(
+                type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+            loss_dir: ConfigType = dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            loss_attr: ConfigType = dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            bbox_code_size: int = 9,  # For nuscenes
+            pred_attrs: bool = False,
+            num_attrs: int = 9,  # For nuscenes
+            pred_velo: bool = False,
+            pred_bbox2d: bool = False,
+            group_reg_dims: Sequence[int] = (
+                2, 1, 3, 1, 2),  # offset, depth, size, rot, velo,
+            cls_branch: Sequence[int] = (128, 64),
+            reg_branch: Sequence[Tuple[int, int]] = (
+                (128, 64),  # offset
+                (128, 64),  # depth
+                (64, ),  # size
+                (64, ),  # rot
+                ()  # velo
+            ),
+            dir_branch: Sequence[int] = (64, ),
+            attr_branch: Sequence[int] = (64, ),
+            conv_cfg: OptConfigType = None,
+            norm_cfg: OptConfigType = None,
+            train_cfg: OptConfigType = None,
+            test_cfg: OptConfigType = None,
+            init_cfg: OptConfigType = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.cls_out_channels = num_classes
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.strides = strides
+        self.dcn_on_last_conv = dcn_on_last_conv
+        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
+        self.conv_bias = conv_bias
+        self.use_direction_classifier = use_direction_classifier
+        self.diff_rad_by_sin = diff_rad_by_sin
+        self.dir_offset = dir_offset
+        self.dir_limit_offset = dir_limit_offset
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.loss_dir = MODELS.build(loss_dir)
+        self.bbox_code_size = bbox_code_size
+        self.group_reg_dims = list(group_reg_dims)
+        self.cls_branch = cls_branch
+        self.reg_branch = reg_branch
+        assert len(reg_branch) == len(group_reg_dims), 'The number of '\
+            'element in reg_branch and group_reg_dims should be the same.'
+        self.pred_velo = pred_velo
+        self.pred_bbox2d = pred_bbox2d
+        self.out_channels = []
+        for reg_branch_channels in reg_branch:
+            if len(reg_branch_channels) > 0:
+                self.out_channels.append(reg_branch_channels[-1])
+            else:
+                self.out_channels.append(-1)
+        self.dir_branch = dir_branch
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.background_label = (
+            num_classes if background_label is None else background_label)
+        # background_label should be either 0 or num_classes
+        assert (self.background_label == 0
+                or self.background_label == num_classes)
+        self.pred_attrs = pred_attrs
+        self.attr_background_label = -1
+        self.num_attrs = num_attrs
+        if self.pred_attrs:
+            self.attr_background_label = num_attrs
+            self.loss_attr = MODELS.build(loss_attr)
+            self.attr_branch = attr_branch
+
+        self._init_layers()
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        self._init_cls_convs()
+        self._init_reg_convs()
+        self._init_predictor()
+
+    def _init_cls_convs(self):
+        """Initialize classification conv layers of the head."""
+        self.cls_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
+                conv_cfg = dict(type='DCNv2')
+            else:
+                conv_cfg = self.conv_cfg
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.conv_bias))
+
+    def _init_reg_convs(self):
+        """Initialize bbox regression conv layers of the head."""
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
+                conv_cfg = dict(type='DCNv2')
+            else:
+                conv_cfg = self.conv_cfg
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.conv_bias))
+
+    def _init_branch(self, conv_channels=(64), conv_strides=(1)):
+        """Initialize conv layers as a prediction branch."""
+        conv_before_pred = nn.ModuleList()
+        if isinstance(conv_channels, int):
+            conv_channels = [self.feat_channels] + [conv_channels]
+            conv_strides = [conv_strides]
+        else:
+            conv_channels = [self.feat_channels] + list(conv_channels)
+            conv_strides = list(conv_strides)
+        for i in range(len(conv_strides)):
+            conv_before_pred.append(
+                ConvModule(
+                    conv_channels[i],
+                    conv_channels[i + 1],
+                    3,
+                    stride=conv_strides[i],
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    bias=self.conv_bias))
+
+        return conv_before_pred
+
+    def _init_predictor(self):
+        """Initialize predictor layers of the head."""
+        self.conv_cls_prev = self._init_branch(
+            conv_channels=self.cls_branch,
+            conv_strides=(1, ) * len(self.cls_branch))
+        self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
+                                  1)
+        self.conv_reg_prevs = nn.ModuleList()
+        self.conv_regs = nn.ModuleList()
+        for i in range(len(self.group_reg_dims)):
+            reg_dim = self.group_reg_dims[i]
+            reg_branch_channels = self.reg_branch[i]
+            out_channel = self.out_channels[i]
+            if len(reg_branch_channels) > 0:
+                self.conv_reg_prevs.append(
+                    self._init_branch(
+                        conv_channels=reg_branch_channels,
+                        conv_strides=(1, ) * len(reg_branch_channels)))
+                self.conv_regs.append(nn.Conv2d(out_channel, reg_dim, 1))
+            else:
+                self.conv_reg_prevs.append(None)
+                self.conv_regs.append(
+                    nn.Conv2d(self.feat_channels, reg_dim, 1))
+        if self.use_direction_classifier:
+            self.conv_dir_cls_prev = self._init_branch(
+                conv_channels=self.dir_branch,
+                conv_strides=(1, ) * len(self.dir_branch))
+            self.conv_dir_cls = nn.Conv2d(self.dir_branch[-1], 2, 1)
+        if self.pred_attrs:
+            self.conv_attr_prev = self._init_branch(
+                conv_channels=self.attr_branch,
+                conv_strides=(1, ) * len(self.attr_branch))
+            self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1)
+
+    def init_weights(self):
+        """Initialize weights of the head.
+
+        We currently still use the customized defined init_weights because the
+        default init of DCN triggered by the init_cfg will init
+        conv_offset.weight, which mistakenly affects the training stability.
+        """
+        for modules in [self.cls_convs, self.reg_convs, self.conv_cls_prev]:
+            for m in modules:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        for conv_reg_prev in self.conv_reg_prevs:
+            if conv_reg_prev is None:
+                continue
+            for m in conv_reg_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        if self.use_direction_classifier:
+            for m in self.conv_dir_cls_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        if self.pred_attrs:
+            for m in self.conv_attr_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.conv_cls, std=0.01, bias=bias_cls)
+        for conv_reg in self.conv_regs:
+            normal_init(conv_reg, std=0.01)
+        if self.use_direction_classifier:
+            normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls)
+        if self.pred_attrs:
+            normal_init(self.conv_attr, std=0.01, bias=bias_cls)
+
+    def forward(
+        self, x: Tuple[Tensor]
+    ) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually contain classification scores, bbox predictions,
+                and direction class predictions.
+                cls_scores (list[Tensor]): Box scores for each scale level,
+                    each is a 4D-tensor, the channel number is
+                    num_points * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * bbox_code_size.
+                dir_cls_preds (list[Tensor]): Box scores for direction class
+                    predictions on each scale level, each is a 4D-tensor,
+                    the channel number is num_points * 2. (bin = 2)
+                attr_preds (list[Tensor]): Attribute scores for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * num_attrs.
+        """
+        return multi_apply(self.forward_single, x)[:5]
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+
+        Returns:
+            tuple: Scores for each class, bbox predictions, direction class,
+                and attributes, features after classification and regression
+                conv layers, some models needs these features like FCOS.
+        """
+        cls_feat = x
+        reg_feat = x
+
+        for cls_layer in self.cls_convs:
+            cls_feat = cls_layer(cls_feat)
+        # clone the cls_feat for reusing the feature map afterwards
+        clone_cls_feat = cls_feat.clone()
+        for conv_cls_prev_layer in self.conv_cls_prev:
+            clone_cls_feat = conv_cls_prev_layer(clone_cls_feat)
+        cls_score = self.conv_cls(clone_cls_feat)
+
+        for reg_layer in self.reg_convs:
+            reg_feat = reg_layer(reg_feat)
+        bbox_pred = []
+        for i in range(len(self.group_reg_dims)):
+            # clone the reg_feat for reusing the feature map afterwards
+            clone_reg_feat = reg_feat.clone()
+            if len(self.reg_branch[i]) > 0:
+                for conv_reg_prev_layer in self.conv_reg_prevs[i]:
+                    clone_reg_feat = conv_reg_prev_layer(clone_reg_feat)
+            bbox_pred.append(self.conv_regs[i](clone_reg_feat))
+        bbox_pred = torch.cat(bbox_pred, dim=1)
+
+        dir_cls_pred = None
+        if self.use_direction_classifier:
+            clone_reg_feat = reg_feat.clone()
+            for conv_dir_cls_prev_layer in self.conv_dir_cls_prev:
+                clone_reg_feat = conv_dir_cls_prev_layer(clone_reg_feat)
+            dir_cls_pred = self.conv_dir_cls(clone_reg_feat)
+
+        attr_pred = None
+        if self.pred_attrs:
+            # clone the cls_feat for reusing the feature map afterwards
+            clone_cls_feat = cls_feat.clone()
+            for conv_attr_prev_layer in self.conv_attr_prev:
+                clone_cls_feat = conv_attr_prev_layer(clone_cls_feat)
+            attr_pred = self.conv_attr(clone_cls_feat)
+
+        return cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, \
+            reg_feat
+
+    @abstractmethod
+    def get_targets(self, points: List[Tensor],
+                    batch_gt_instances: InstanceList) -> Any:
+        """Compute regression, classification and centerss targets for points
+        in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes``、``labels``
+                、``bboxes_3d``、``labels_3d``、``depths``、``centers_2d``
+                and attributes.
+        """
+        raise NotImplementedError
+
+    # TODO: Refactor using MlvlPointGenerator in MMDet.
+    def _get_points_single(self,
+                           featmap_size: Tuple[int],
+                           stride: int,
+                           dtype: torch.dtype,
+                           device: torch.device,
+                           flatten: bool = False) -> Tuple[Tensor, Tensor]:
+        """Get points of a single scale level.
+
+        Args:
+            featmap_size (tuple[int]): Single scale level feature map
+                size.
+            stride (int): Downsample factor of the feature map.
+            dtype (torch.dtype): Type of points.
+            device (torch.device): Device of points.
+            flatten (bool): Whether to flatten the tensor.
+                Defaults to False.
+
+        Returns:
+            tuple: points of each image.
+        """
+        h, w = featmap_size
+        x_range = torch.arange(w, dtype=dtype, device=device)
+        y_range = torch.arange(h, dtype=dtype, device=device)
+        y, x = torch.meshgrid(y_range, x_range)
+        if flatten:
+            y = y.flatten()
+            x = x.flatten()
+        return y, x
+
+    # TODO: Refactor using MlvlPointGenerator in MMDet.
+    def get_points(self,
+                   featmap_sizes: List[Tuple[int]],
+                   dtype: torch.dtype,
+                   device: torch.device,
+                   flatten: bool = False) -> List[Tuple[Tensor, Tensor]]:
+        """Get points according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            dtype (torch.dtype): Type of points.
+            device (torch.device): Device of points.
+            flatten (bool): Whether to flatten the tensor.
+                Defaults to False.
+
+        Returns:
+            list[tuple]: points of each image.
+        """
+        mlvl_points = []
+        for i in range(len(featmap_sizes)):
+            mlvl_points.append(
+                self._get_points_single(featmap_sizes[i], self.strides[i],
+                                        dtype, device, flatten))
+        return mlvl_points
diff --git a/mmde/mmdet3d/models/dense_heads/base_3d_dense_head.py b/mmde/mmdet3d/models/dense_heads/base_3d_dense_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..a38695ad83adbf6207026a599efb1bb22d6918a0
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/base_3d_dense_head.py
@@ -0,0 +1,381 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmdet.models.utils import select_single_mlvl
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, constant_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.models.layers import box3d_multiclass_nms
+from mmdet3d.structures import limit_period, xywhr2xyxyr
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils.typing_utils import InstanceList, OptMultiConfig
+
+
+class Base3DDenseHead(BaseModule, metaclass=ABCMeta):
+    """Base class for 3D DenseHeads.
+
+    1. The ``init_weights`` method is used to initialize densehead's
+    model parameters. After detector initialization, ``init_weights``
+    is triggered when ``detector.init_weights()`` is called externally.
+
+    2. The ``loss`` method is used to calculate the loss of densehead,
+    which includes two steps: (1) the densehead model performs forward
+    propagation to obtain the feature maps (2) The ``loss_by_feat`` method
+    is called based on the feature maps to calculate the loss.
+
+    .. code:: text
+
+    loss(): forward() -> loss_by_feat()
+
+    3. The ``predict`` method is used to predict detection results,
+    which includes two steps: (1) the densehead model performs forward
+    propagation to obtain the feature maps (2) The ``predict_by_feat`` method
+    is called based on the feature maps to predict detection results including
+    post-processing.
+
+    .. code:: text
+
+    predict(): forward() -> predict_by_feat()
+
+    4. The ``loss_and_predict`` method is used to return loss and detection
+    results at the same time. It will call densehead's ``forward``,
+    ``loss_by_feat`` and ``predict_by_feat`` methods in order.  If one-stage is
+    used as RPN, the densehead needs to return both losses and predictions.
+    This predictions is used as the proposal of roihead.
+
+    .. code:: text
+
+    loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat()
+    """
+
+    def __init__(self, init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+
+    def init_weights(self) -> None:
+        """Initialize the weights."""
+        super().init_weights()
+        # avoid init_cfg overwrite the initialization of `conv_offset`
+        for m in self.modules():
+            # DeformConv2dPack, ModulatedDeformConv2dPack
+            if hasattr(m, 'conv_offset'):
+                constant_init(m.conv_offset, 0)
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
+             **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        outs = self(x)
+
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+
+        loss_inputs = outs + (batch_gt_instances_3d, batch_input_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    @abstractmethod
+    def loss_by_feat(self, **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head."""
+        pass
+
+    def loss_and_predict(self,
+                         x: Tuple[Tensor],
+                         batch_data_samples: SampleList,
+                         proposal_cfg: Optional[ConfigDict] = None,
+                         **kwargs) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            x (tuple[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each image and
+                corresponding annotations.
+            proposal_cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+
+        Returns:
+            tuple: the return value is a tuple contains:
+
+                - losses: (dict[str, Tensor]): A dictionary of loss components.
+                - predictions (list[:obj:`InstanceData`]): Detection
+                  results of each image after the post process.
+        """
+        batch_gt_instances = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+
+        outs = self(x)
+
+        loss_inputs = outs + (batch_gt_instances, batch_input_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_input_metas=batch_input_metas, cfg=proposal_cfg)
+        return losses, predictions
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the 3D detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_pts_panoptic_seg` and
+                `gt_pts_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        outs = self(x)
+        predictions = self.predict_by_feat(
+            *outs, batch_input_metas=batch_input_metas, rescale=rescale)
+        return predictions
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        dir_cls_preds: List[Tensor],
+                        batch_input_metas: Optional[List[dict]] = None,
+                        cfg: Optional[ConfigDict] = None,
+                        rescale: bool = False,
+                        **kwargs) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            score_factors (list[Tensor], optional): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 1, H, W). Defaults to None.
+            batch_input_metas (list[dict], Optional): Batch inputs meta info.
+                Defaults to None.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        assert len(cls_scores) == len(bbox_preds)
+        assert len(cls_scores) == len(dir_cls_preds)
+        num_levels = len(cls_scores)
+        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
+        mlvl_priors = self.prior_generator.grid_anchors(
+            featmap_sizes, device=cls_scores[0].device)
+        mlvl_priors = [
+            prior.reshape(-1, self.box_code_size) for prior in mlvl_priors
+        ]
+
+        result_list = []
+
+        for input_id in range(len(batch_input_metas)):
+
+            input_meta = batch_input_metas[input_id]
+            cls_score_list = select_single_mlvl(cls_scores, input_id)
+            bbox_pred_list = select_single_mlvl(bbox_preds, input_id)
+            dir_cls_pred_list = select_single_mlvl(dir_cls_preds, input_id)
+
+            results = self._predict_by_feat_single(
+                cls_score_list=cls_score_list,
+                bbox_pred_list=bbox_pred_list,
+                dir_cls_pred_list=dir_cls_pred_list,
+                mlvl_priors=mlvl_priors,
+                input_meta=input_meta,
+                cfg=cfg,
+                rescale=rescale,
+                **kwargs)
+            result_list.append(results)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                dir_cls_pred_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                input_meta: dict,
+                                cfg: ConfigDict,
+                                rescale: bool = False,
+                                **kwargs) -> InstanceData:
+        """Transform a single points sample's features extracted from the head
+        into bbox results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single point cloud sample, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single point cloud sample, each item
+                has shape (num_priors * C, H, W).
+            dir_cls_pred_list (list[Tensor]): Predictions of direction class
+                from all scale levels of a single point cloud sample, each
+                item has shape (num_priors * 2, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid. In all
+                anchor-based methods, it has shape (num_priors, 4). In
+                all anchor-free methods, it has shape (num_priors, 2)
+                when `with_stride=True`, otherwise it still has shape
+                (num_priors, 4).
+            input_meta (dict): Contain point clouds and image meta info.
+            cfg (:obj:`ConfigDict`): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (Tensor): Has a shape (num_instances, 4),
+                  the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_priors)
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_dir_scores = []
+        for cls_score, bbox_pred, dir_cls_pred, priors in zip(
+                cls_score_list, bbox_pred_list, dir_cls_pred_list,
+                mlvl_priors):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:]
+            dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
+            dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
+
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.num_classes)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)
+            bbox_pred = bbox_pred.permute(1, 2,
+                                          0).reshape(-1, self.box_code_size)
+
+            nms_pre = cfg.get('nms_pre', -1)
+            if nms_pre > 0 and scores.shape[0] > nms_pre:
+                if self.use_sigmoid_cls:
+                    max_scores, _ = scores.max(dim=1)
+                else:
+                    max_scores, _ = scores[:, :-1].max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                priors = priors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+                dir_cls_score = dir_cls_score[topk_inds]
+
+            bboxes = self.bbox_coder.decode(priors, bbox_pred)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_dir_scores.append(dir_cls_score)
+
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
+            mlvl_bboxes, box_dim=self.box_code_size).bev)
+        mlvl_scores = torch.cat(mlvl_scores)
+        mlvl_dir_scores = torch.cat(mlvl_dir_scores)
+
+        if self.use_sigmoid_cls:
+            # Add a dummy background class to the front when using sigmoid
+            padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+            mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
+
+        score_thr = cfg.get('score_thr', 0)
+        results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
+                                       mlvl_scores, score_thr, cfg.max_num,
+                                       cfg, mlvl_dir_scores)
+        bboxes, scores, labels, dir_scores = results
+        if bboxes.shape[0] > 0:
+            dir_rot = limit_period(bboxes[..., 6] - self.dir_offset,
+                                   self.dir_limit_offset, np.pi)
+            bboxes[..., 6] = (
+                dir_rot + self.dir_offset +
+                np.pi * dir_scores.to(bboxes.dtype))
+        bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size)
+        results = InstanceData()
+        results.bboxes_3d = bboxes
+        results.scores_3d = scores
+        results.labels_3d = labels
+
+        return results
+
+    # TODO: Support augmentation test
+    def aug_test(self,
+                 aug_batch_feats,
+                 aug_batch_input_metas,
+                 rescale=False,
+                 with_ori_nms=False,
+                 **kwargs):
+        pass
diff --git a/mmde/mmdet3d/models/dense_heads/base_conv_bbox_head.py b/mmde/mmdet3d/models/dense_heads/base_conv_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f4b875c5e58f37fd1249dbb41dadf87512851f7
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/base_conv_bbox_head.py
@@ -0,0 +1,131 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn import ConvModule
+from mmcv.cnn.bricks import build_conv_layer
+from mmengine.model import BaseModule
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class BaseConvBboxHead(BaseModule):
+    r"""More general bbox head, with shared conv layers and two optional
+    separated branches.
+
+    .. code-block:: none
+
+                     /-> cls convs -> cls_score
+        shared convs
+                     \-> reg convs -> bbox_pred
+    """
+
+    def __init__(self,
+                 in_channels=0,
+                 shared_conv_channels=(),
+                 cls_conv_channels=(),
+                 num_cls_out_channels=0,
+                 reg_conv_channels=(),
+                 num_reg_out_channels=0,
+                 conv_cfg=dict(type='Conv1d'),
+                 norm_cfg=dict(type='BN1d'),
+                 act_cfg=dict(type='ReLU'),
+                 bias='auto',
+                 init_cfg=None,
+                 *args,
+                 **kwargs):
+        super(BaseConvBboxHead, self).__init__(
+            init_cfg=init_cfg, *args, **kwargs)
+        assert in_channels > 0
+        assert num_cls_out_channels > 0
+        assert num_reg_out_channels > 0
+        self.in_channels = in_channels
+        self.shared_conv_channels = shared_conv_channels
+        self.cls_conv_channels = cls_conv_channels
+        self.num_cls_out_channels = num_cls_out_channels
+        self.reg_conv_channels = reg_conv_channels
+        self.num_reg_out_channels = num_reg_out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.bias = bias
+
+        # add shared convs
+        if len(self.shared_conv_channels) > 0:
+            self.shared_convs = self._add_conv_branch(
+                self.in_channels, self.shared_conv_channels)
+            out_channels = self.shared_conv_channels[-1]
+        else:
+            out_channels = self.in_channels
+
+        # add cls specific branch
+        prev_channel = out_channels
+        if len(self.cls_conv_channels) > 0:
+            self.cls_convs = self._add_conv_branch(prev_channel,
+                                                   self.cls_conv_channels)
+            prev_channel = self.cls_conv_channels[-1]
+
+        self.conv_cls = build_conv_layer(
+            conv_cfg,
+            in_channels=prev_channel,
+            out_channels=num_cls_out_channels,
+            kernel_size=1)
+        # add reg specific branch
+        prev_channel = out_channels
+        if len(self.reg_conv_channels) > 0:
+            self.reg_convs = self._add_conv_branch(prev_channel,
+                                                   self.reg_conv_channels)
+            prev_channel = self.reg_conv_channels[-1]
+
+        self.conv_reg = build_conv_layer(
+            conv_cfg,
+            in_channels=prev_channel,
+            out_channels=num_reg_out_channels,
+            kernel_size=1)
+
+    def _add_conv_branch(self, in_channels, conv_channels):
+        """Add shared or separable branch."""
+        conv_spec = [in_channels] + list(conv_channels)
+        # add branch specific conv layers
+        conv_layers = nn.Sequential()
+        for i in range(len(conv_spec) - 1):
+            conv_layers.add_module(
+                f'layer{i}',
+                ConvModule(
+                    conv_spec[i],
+                    conv_spec[i + 1],
+                    kernel_size=1,
+                    padding=0,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    bias=self.bias,
+                    inplace=True))
+        return conv_layers
+
+    def forward(self, feats):
+        """Forward.
+
+        Args:
+            feats (Tensor): Input features
+
+        Returns:
+            Tensor: Class scores predictions
+            Tensor: Regression predictions
+        """
+        # shared part
+        if len(self.shared_conv_channels) > 0:
+            x = self.shared_convs(feats)
+
+        # separate branches
+        x_cls = x
+        x_reg = x
+
+        if len(self.cls_conv_channels) > 0:
+            x_cls = self.cls_convs(x_cls)
+        cls_score = self.conv_cls(x_cls)
+
+        if len(self.reg_conv_channels) > 0:
+            x_reg = self.reg_convs(x_reg)
+        bbox_pred = self.conv_reg(x_reg)
+
+        return cls_score, bbox_pred
diff --git a/mmde/mmdet3d/models/dense_heads/base_mono3d_dense_head.py b/mmde/mmdet3d/models/dense_heads/base_mono3d_dense_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..5627ce17e459bcc787afda8eec9897273b569bb8
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/base_mono3d_dense_head.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Optional, Tuple
+
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import InstanceList, OptMultiConfig
+
+
+class BaseMono3DDenseHead(BaseModule, metaclass=ABCMeta):
+    """Base class for Monocular 3D DenseHeads.
+
+    1. The ``loss`` method is used to calculate the loss of densehead,
+    which includes two steps: (1) the densehead model performs forward
+    propagation to obtain the feature maps (2) The ``loss_by_feat`` method
+    is called based on the feature maps to calculate the loss.
+
+    .. code:: text
+
+    loss(): forward() -> loss_by_feat()
+
+    2. The ``predict`` method is used to predict detection results,
+    which includes two steps: (1) the densehead model performs forward
+    propagation to obtain the feature maps (2) The ``predict_by_feat`` method
+    is called based on the feature maps to predict detection results including
+    post-processing.
+
+    .. code:: text
+
+    predict(): forward() -> predict_by_feat()
+
+    3. The ``loss_and_predict`` method is used to return loss and detection
+    results at the same time. It will call densehead's ``forward``,
+    ``loss_by_feat`` and ``predict_by_feat`` methods in order.  If one-stage is
+    used as RPN, the densehead needs to return both losses and predictions.
+    This predictions is used as the proposal of roihead.
+
+    .. code:: text
+
+    loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat()
+    """
+
+    def __init__(self, init_cfg: OptMultiConfig = None) -> None:
+        super(BaseMono3DDenseHead, self).__init__(init_cfg=init_cfg)
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
+             **kwargs) -> dict:
+        """
+        Args:
+            x (list[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each image and corresponding
+                annotations.
+
+        Returns:
+            tuple or Tensor: When `proposal_cfg` is None, the detector is a \
+            normal one-stage detector, The return value is the losses.
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+
+            When the `proposal_cfg` is not None, the head is used as a
+            `rpn_head`, the return value is a tuple contains:
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+            - results_list (list[:obj:`InstanceData`]): Detection
+              results of each image after the post process.
+              Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (:obj:`BaseInstance3DBoxes`): Contains a tensor
+                  with shape (num_instances, C), the last dimension C of a
+                  3D box is (x, y, z, x_size, y_size, z_size, yaw, ...), where
+                  C >= 7. C = 7 for kitti and C = 9 for nuscenes with extra 2
+                  dims of velocity.
+        """
+
+        outs = self(x)
+        batch_gt_instances_3d = []
+        batch_gt_instances = []
+        batch_gt_instances_ignore = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances.append(data_sample.gt_instances)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+
+        loss_inputs = outs + (batch_gt_instances_3d, batch_gt_instances,
+                              batch_img_metas, batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        return losses
+
+    @abstractmethod
+    def loss_by_feat(self, **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head."""
+        pass
+
+    def loss_and_predict(self,
+                         x: Tuple[Tensor],
+                         batch_data_samples: SampleList,
+                         proposal_cfg: Optional[ConfigDict] = None,
+                         **kwargs) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            x (tuple[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each image and
+                corresponding annotations.
+            proposal_cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+
+        Returns:
+            tuple: the return value is a tuple contains:
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+            - predictions (list[:obj:`InstanceData`]): Detection
+                results of each image after the post process.
+        """
+        batch_gt_instances_3d = []
+        batch_gt_instances = []
+        batch_gt_instances_ignore = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances.append(data_sample.gt_instances)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+
+        outs = self(x)
+
+        loss_inputs = outs + (batch_gt_instances_3d, batch_gt_instances,
+                              batch_img_metas, batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg)
+
+        return losses, predictions
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_pts_panoptic_seg` and `gt_pts_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        outs = self(x)
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, rescale=rescale)
+
+        return predictions
+
+    @abstractmethod
+    def predict_by_feat(self, **kwargs) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results."""
+        pass
diff --git a/mmde/mmdet3d/models/dense_heads/centerpoint_head.py b/mmde/mmdet3d/models/dense_heads/centerpoint_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3fc1879646b645ffbe233ad2618d8cf99de59c0
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/centerpoint_head.py
@@ -0,0 +1,925 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from mmcv.cnn import ConvModule, build_conv_layer
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor, nn
+
+from mmdet3d.models.utils import (clip_sigmoid, draw_heatmap_gaussian,
+                                  gaussian_radius)
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr
+from ..layers import circle_nms, nms_bev
+
+
+@MODELS.register_module()
+class SeparateHead(BaseModule):
+    """SeparateHead for CenterHead.
+
+    Args:
+        in_channels (int): Input channels for conv_layer.
+        heads (dict): Conv information.
+        head_conv (int, optional): Output channels.
+            Default: 64.
+        final_kernel (int, optional): Kernel size for the last conv layer.
+            Default: 1.
+        init_bias (float, optional): Initial bias. Default: -2.19.
+        conv_cfg (dict, optional): Config of conv layer.
+            Default: dict(type='Conv2d')
+        norm_cfg (dict, optional): Config of norm layer.
+            Default: dict(type='BN2d').
+        bias (str, optional): Type of bias. Default: 'auto'.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 heads,
+                 head_conv=64,
+                 final_kernel=1,
+                 init_bias=-2.19,
+                 conv_cfg=dict(type='Conv2d'),
+                 norm_cfg=dict(type='BN2d'),
+                 bias='auto',
+                 init_cfg=None,
+                 **kwargs):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+            'behavior, init_cfg is not allowed to be set'
+        super(SeparateHead, self).__init__(init_cfg=init_cfg)
+        self.heads = heads
+        self.init_bias = init_bias
+        for head in self.heads:
+            classes, num_conv = self.heads[head]
+            conv_layers = []
+            c_in = in_channels
+            for i in range(num_conv - 1):
+                conv_layers.append(
+                    ConvModule(
+                        c_in,
+                        head_conv,
+                        kernel_size=final_kernel,
+                        stride=1,
+                        padding=final_kernel // 2,
+                        bias=bias,
+                        conv_cfg=conv_cfg,
+                        norm_cfg=norm_cfg))
+                c_in = head_conv
+
+            conv_layers.append(
+                build_conv_layer(
+                    conv_cfg,
+                    head_conv,
+                    classes,
+                    kernel_size=final_kernel,
+                    stride=1,
+                    padding=final_kernel // 2,
+                    bias=True))
+            conv_layers = nn.Sequential(*conv_layers)
+
+            self.__setattr__(head, conv_layers)
+
+            if init_cfg is None:
+                self.init_cfg = dict(type='Kaiming', layer='Conv2d')
+
+    def init_weights(self):
+        """Initialize weights."""
+        super().init_weights()
+        for head in self.heads:
+            if head == 'heatmap':
+                self.__getattr__(head)[-1].bias.data.fill_(self.init_bias)
+
+    def forward(self, x):
+        """Forward function for SepHead.
+
+        Args:
+            x (torch.Tensor): Input feature map with the shape of
+                [B, 512, 128, 128].
+
+        Returns:
+            dict[str: torch.Tensor]: contains the following keys:
+
+                -reg (torch.Tensor): 2D regression value with the
+                    shape of [B, 2, H, W].
+                -height (torch.Tensor): Height value with the
+                    shape of [B, 1, H, W].
+                -dim (torch.Tensor): Size value with the shape
+                    of [B, 3, H, W].
+                -rot (torch.Tensor): Rotation value with the
+                    shape of [B, 2, H, W].
+                -vel (torch.Tensor): Velocity value with the
+                    shape of [B, 2, H, W].
+                -heatmap (torch.Tensor): Heatmap with the shape of
+                    [B, N, H, W].
+        """
+        ret_dict = dict()
+        for head in self.heads:
+            ret_dict[head] = self.__getattr__(head)(x)
+
+        return ret_dict
+
+
+@MODELS.register_module()
+class DCNSeparateHead(BaseModule):
+    r"""DCNSeparateHead for CenterHead.
+
+    .. code-block:: none
+            /-----> DCN for heatmap task -----> heatmap task.
+    feature
+            \-----> DCN for regression tasks -----> regression tasks
+
+    Args:
+        in_channels (int): Input channels for conv_layer.
+        num_cls (int): Number of classes.
+        heads (dict): Conv information.
+        dcn_config (dict): Config of dcn layer.
+        head_conv (int, optional): Output channels.
+            Default: 64.
+        final_kernel (int, optional): Kernel size for the last conv
+            layer. Default: 1.
+        init_bias (float, optional): Initial bias. Default: -2.19.
+        conv_cfg (dict, optional): Config of conv layer.
+            Default: dict(type='Conv2d')
+        norm_cfg (dict, optional): Config of norm layer.
+            Default: dict(type='BN2d').
+        bias (str, optional): Type of bias. Default: 'auto'.
+    """  # noqa: W605
+
+    def __init__(self,
+                 in_channels,
+                 num_cls,
+                 heads,
+                 dcn_config,
+                 head_conv=64,
+                 final_kernel=1,
+                 init_bias=-2.19,
+                 conv_cfg=dict(type='Conv2d'),
+                 norm_cfg=dict(type='BN2d'),
+                 bias='auto',
+                 init_cfg=None,
+                 **kwargs):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+            'behavior, init_cfg is not allowed to be set'
+        super(DCNSeparateHead, self).__init__(init_cfg=init_cfg)
+        if 'heatmap' in heads:
+            heads.pop('heatmap')
+        # feature adaptation with dcn
+        # use separate features for classification / regression
+        self.feature_adapt_cls = build_conv_layer(dcn_config)
+
+        self.feature_adapt_reg = build_conv_layer(dcn_config)
+
+        # heatmap prediction head
+        cls_head = [
+            ConvModule(
+                in_channels,
+                head_conv,
+                kernel_size=3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                bias=bias,
+                norm_cfg=norm_cfg),
+            build_conv_layer(
+                conv_cfg,
+                head_conv,
+                num_cls,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=bias)
+        ]
+        self.cls_head = nn.Sequential(*cls_head)
+        self.init_bias = init_bias
+        # other regression target
+        self.task_head = SeparateHead(
+            in_channels,
+            heads,
+            head_conv=head_conv,
+            final_kernel=final_kernel,
+            bias=bias)
+        if init_cfg is None:
+            self.init_cfg = dict(type='Kaiming', layer='Conv2d')
+
+    def init_weights(self):
+        """Initialize weights."""
+        super().init_weights()
+        self.cls_head[-1].bias.data.fill_(self.init_bias)
+
+    def forward(self, x):
+        """Forward function for DCNSepHead.
+
+        Args:
+            x (torch.Tensor): Input feature map with the shape of
+                [B, 512, 128, 128].
+
+        Returns:
+            dict[str: torch.Tensor]: contains the following keys:
+
+                -reg (torch.Tensor): 2D regression value with the
+                    shape of [B, 2, H, W].
+                -height (torch.Tensor): Height value with the
+                    shape of [B, 1, H, W].
+                -dim (torch.Tensor): Size value with the shape
+                    of [B, 3, H, W].
+                -rot (torch.Tensor): Rotation value with the
+                    shape of [B, 2, H, W].
+                -vel (torch.Tensor): Velocity value with the
+                    shape of [B, 2, H, W].
+                -heatmap (torch.Tensor): Heatmap with the shape of
+                    [B, N, H, W].
+        """
+        center_feat = self.feature_adapt_cls(x)
+        reg_feat = self.feature_adapt_reg(x)
+
+        cls_score = self.cls_head(center_feat)
+        ret = self.task_head(reg_feat)
+        ret['heatmap'] = cls_score
+
+        return ret
+
+
+@MODELS.register_module()
+class CenterHead(BaseModule):
+    """CenterHead for CenterPoint.
+
+    Args:
+        in_channels (list[int] | int, optional): Channels of the input
+            feature map. Default: [128].
+        tasks (list[dict], optional): Task information including class number
+            and class names. Default: None.
+        bbox_coder (dict, optional): Bbox coder configs. Default: None.
+        common_heads (dict, optional): Conv information for common heads.
+            Default: dict().
+        loss_cls (dict, optional): Config of classification loss function.
+            Default: dict(type='GaussianFocalLoss', reduction='mean').
+        loss_bbox (dict, optional): Config of regression loss function.
+            Default: dict(type='L1Loss', reduction='none').
+        separate_head (dict, optional): Config of separate head. Default: dict(
+            type='SeparateHead', init_bias=-2.19, final_kernel=3)
+        share_conv_channel (int, optional): Output channels for share_conv
+            layer. Default: 64.
+        num_heatmap_convs (int, optional): Number of conv layers for heatmap
+            conv layer. Default: 2.
+        conv_cfg (dict, optional): Config of conv layer.
+            Default: dict(type='Conv2d')
+        norm_cfg (dict, optional): Config of norm layer.
+            Default: dict(type='BN2d').
+        bias (str): Type of bias. Default: 'auto'.
+        norm_bbox (bool): Whether normalize the bbox predictions.
+            Defaults to True.
+        train_cfg (dict, optional): Train-time configs. Default: None.
+        test_cfg (dict, optional): Test-time configs. Default: None.
+        init_cfg (dict, optional): Config for initialization.
+    """
+
+    def __init__(self,
+                 in_channels: Union[List[int], int] = [128],
+                 tasks: Optional[List[dict]] = None,
+                 bbox_coder: Optional[dict] = None,
+                 common_heads: dict = dict(),
+                 loss_cls: dict = dict(
+                     type='mmdet.GaussianFocalLoss', reduction='mean'),
+                 loss_bbox: dict = dict(
+                     type='mmdet.L1Loss', reduction='none', loss_weight=0.25),
+                 separate_head: dict = dict(
+                     type='mmdet.SeparateHead',
+                     init_bias=-2.19,
+                     final_kernel=3),
+                 share_conv_channel: int = 64,
+                 num_heatmap_convs: int = 2,
+                 conv_cfg: dict = dict(type='Conv2d'),
+                 norm_cfg: dict = dict(type='BN2d'),
+                 bias: str = 'auto',
+                 norm_bbox: bool = True,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 **kwargs):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+            'behavior, init_cfg is not allowed to be set'
+        super(CenterHead, self).__init__(init_cfg=init_cfg, **kwargs)
+
+        # TODO we should rename this variable,
+        # for example num_classes_per_task ?
+        # {'num_class': 2, 'class_names': ['pedestrian', 'traffic_cone']}]
+        # TODO seems num_classes is useless
+        num_classes = [len(t['class_names']) for t in tasks]
+        self.class_names = [t['class_names'] for t in tasks]
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.norm_bbox = norm_bbox
+
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.num_anchor_per_locs = [n for n in num_classes]
+
+        # a shared convolution
+        self.shared_conv = ConvModule(
+            in_channels,
+            share_conv_channel,
+            kernel_size=3,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            bias=bias)
+
+        self.task_heads = nn.ModuleList()
+
+        for num_cls in num_classes:
+            heads = copy.deepcopy(common_heads)
+            heads.update(dict(heatmap=(num_cls, num_heatmap_convs)))
+            separate_head.update(
+                in_channels=share_conv_channel, heads=heads, num_cls=num_cls)
+            self.task_heads.append(MODELS.build(separate_head))
+
+    def forward_single(self, x: Tensor) -> dict:
+        """Forward function for CenterPoint.
+
+        Args:
+            x (torch.Tensor): Input feature map with the shape of
+                [B, 512, 128, 128].
+
+        Returns:
+            list[dict]: Output results for tasks.
+        """
+        ret_dicts = []
+
+        x = self.shared_conv(x)
+
+        for task in self.task_heads:
+            ret_dicts.append(task(x))
+
+        return ret_dicts
+
+    def forward(self, feats: List[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward pass.
+
+        Args:
+            feats (list[torch.Tensor]): Multi-level features, e.g.,
+                features produced by FPN.
+
+        Returns:
+            tuple(list[dict]): Output results for tasks.
+        """
+        return multi_apply(self.forward_single, feats)
+
+    def _gather_feat(self, feat, ind, mask=None):
+        """Gather feature map.
+
+        Given feature map and index, return indexed feature map.
+
+        Args:
+            feat (torch.tensor): Feature map with the shape of [B, H*W, 10].
+            ind (torch.Tensor): Index of the ground truth boxes with the
+                shape of [B, max_obj].
+            mask (torch.Tensor, optional): Mask of the feature map with the
+                shape of [B, max_obj]. Default: None.
+
+        Returns:
+            torch.Tensor: Feature map after gathering with the shape
+                of [B, max_obj, 10].
+        """
+        dim = feat.size(2)
+        ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
+        feat = feat.gather(1, ind)
+        if mask is not None:
+            mask = mask.unsqueeze(2).expand_as(feat)
+            feat = feat[mask]
+            feat = feat.view(-1, dim)
+        return feat
+
+    def get_targets(
+        self,
+        batch_gt_instances_3d: List[InstanceData],
+    ) -> Tuple[List[Tensor]]:
+        """Generate targets.
+
+        How each output is transformed:
+
+            Each nested list is transposed so that all same-index elements in
+            each sub-list (1, ..., N) become the new sub-lists.
+                [ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ]
+                ==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ]
+
+            The new transposed nested list is converted into a list of N
+            tensors generated by concatenating tensors in the new sub-lists.
+                [ tensor0, tensor1, tensor2, ... ]
+
+        Args:
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and\
+                ``labels_3d`` attributes.
+
+        Returns:
+            Returns:
+                tuple[list[torch.Tensor]]: Tuple of target including
+                    the following results in order.
+
+                - list[torch.Tensor]: Heatmap scores.
+                - list[torch.Tensor]: Ground truth boxes.
+                - list[torch.Tensor]: Indexes indicating the
+                    position of the valid boxes.
+                - list[torch.Tensor]: Masks indicating which
+                    boxes are valid.
+        """
+        heatmaps, anno_boxes, inds, masks = multi_apply(
+            self.get_targets_single, batch_gt_instances_3d)
+        # Transpose heatmaps
+        heatmaps = list(map(list, zip(*heatmaps)))
+        heatmaps = [torch.stack(hms_) for hms_ in heatmaps]
+        # Transpose anno_boxes
+        anno_boxes = list(map(list, zip(*anno_boxes)))
+        anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes]
+        # Transpose inds
+        inds = list(map(list, zip(*inds)))
+        inds = [torch.stack(inds_) for inds_ in inds]
+        # Transpose inds
+        masks = list(map(list, zip(*masks)))
+        masks = [torch.stack(masks_) for masks_ in masks]
+        return heatmaps, anno_boxes, inds, masks
+
+    def get_targets_single(self,
+                           gt_instances_3d: InstanceData) -> Tuple[Tensor]:
+        """Generate training targets for a single sample.
+
+        Args:
+            gt_instances_3d (:obj:`InstanceData`): Gt_instances of
+                single data sample. It usually includes
+                ``bboxes_3d`` and ``labels_3d`` attributes.
+
+        Returns:
+            tuple[list[torch.Tensor]]: Tuple of target including
+                the following results in order.
+
+                - list[torch.Tensor]: Heatmap scores.
+                - list[torch.Tensor]: Ground truth boxes.
+                - list[torch.Tensor]: Indexes indicating the position
+                    of the valid boxes.
+                - list[torch.Tensor]: Masks indicating which boxes
+                    are valid.
+        """
+        gt_labels_3d = gt_instances_3d.labels_3d
+        gt_bboxes_3d = gt_instances_3d.bboxes_3d
+        device = gt_labels_3d.device
+        gt_bboxes_3d = torch.cat(
+            (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]),
+            dim=1).to(device)
+        max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg']
+        grid_size = torch.tensor(self.train_cfg['grid_size']).to(device)
+        pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
+        voxel_size = torch.tensor(self.train_cfg['voxel_size'])
+
+        feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']
+
+        # reorganize the gt_dict by tasks
+        task_masks = []
+        flag = 0
+        for class_name in self.class_names:
+            task_masks.append([
+                torch.where(gt_labels_3d == class_name.index(i) + flag)
+                for i in class_name
+            ])
+            flag += len(class_name)
+
+        task_boxes = []
+        task_classes = []
+        flag2 = 0
+        for idx, mask in enumerate(task_masks):
+            task_box = []
+            task_class = []
+            for m in mask:
+                task_box.append(gt_bboxes_3d[m])
+                # 0 is background for each task, so we need to add 1 here.
+                task_class.append(gt_labels_3d[m] + 1 - flag2)
+            task_boxes.append(torch.cat(task_box, axis=0).to(device))
+            task_classes.append(torch.cat(task_class).long().to(device))
+            flag2 += len(mask)
+        draw_gaussian = draw_heatmap_gaussian
+        heatmaps, anno_boxes, inds, masks = [], [], [], []
+
+        for idx, task_head in enumerate(self.task_heads):
+            heatmap = gt_bboxes_3d.new_zeros(
+                (len(self.class_names[idx]), feature_map_size[1],
+                 feature_map_size[0]))
+
+            anno_box = gt_bboxes_3d.new_zeros((max_objs, 10),
+                                              dtype=torch.float32)
+
+            ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64)
+            mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8)
+
+            num_objs = min(task_boxes[idx].shape[0], max_objs)
+
+            for k in range(num_objs):
+                cls_id = task_classes[idx][k] - 1
+
+                length = task_boxes[idx][k][3]
+                width = task_boxes[idx][k][4]
+                length = length / voxel_size[0] / self.train_cfg[
+                    'out_size_factor']
+                width = width / voxel_size[1] / self.train_cfg[
+                    'out_size_factor']
+
+                if width > 0 and length > 0:
+                    radius = gaussian_radius(
+                        (width, length),
+                        min_overlap=self.train_cfg['gaussian_overlap'])
+                    radius = max(self.train_cfg['min_radius'], int(radius))
+
+                    # be really careful for the coordinate system of
+                    # your box annotation.
+                    x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][
+                        1], task_boxes[idx][k][2]
+
+                    coor_x = (
+                        x - pc_range[0]
+                    ) / voxel_size[0] / self.train_cfg['out_size_factor']
+                    coor_y = (
+                        y - pc_range[1]
+                    ) / voxel_size[1] / self.train_cfg['out_size_factor']
+
+                    center = torch.tensor([coor_x, coor_y],
+                                          dtype=torch.float32,
+                                          device=device)
+                    center_int = center.to(torch.int32)
+
+                    # throw out not in range objects to avoid out of array
+                    # area when creating the heatmap
+                    if not (0 <= center_int[0] < feature_map_size[0]
+                            and 0 <= center_int[1] < feature_map_size[1]):
+                        continue
+
+                    draw_gaussian(heatmap[cls_id], center_int, radius)
+
+                    new_idx = k
+                    x, y = center_int[0], center_int[1]
+
+                    assert (y * feature_map_size[0] + x <
+                            feature_map_size[0] * feature_map_size[1])
+
+                    ind[new_idx] = y * feature_map_size[0] + x
+                    mask[new_idx] = 1
+                    # TODO: support other outdoor dataset
+                    vx, vy = task_boxes[idx][k][7:]
+                    rot = task_boxes[idx][k][6]
+                    box_dim = task_boxes[idx][k][3:6]
+                    if self.norm_bbox:
+                        box_dim = box_dim.log()
+                    anno_box[new_idx] = torch.cat([
+                        center - torch.tensor([x, y], device=device),
+                        z.unsqueeze(0), box_dim,
+                        torch.sin(rot).unsqueeze(0),
+                        torch.cos(rot).unsqueeze(0),
+                        vx.unsqueeze(0),
+                        vy.unsqueeze(0)
+                    ])
+
+            heatmaps.append(heatmap)
+            anno_boxes.append(anno_box)
+            masks.append(mask)
+            inds.append(ind)
+        return heatmaps, anno_boxes, inds, masks
+
+    def loss(self, pts_feats: List[Tensor],
+             batch_data_samples: List[Det3DDataSample], *args,
+             **kwargs) -> Dict[str, Tensor]:
+        """Forward function for point cloud branch.
+
+        Args:
+            pts_feats (list[torch.Tensor]): Features of point cloud branch
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, .
+
+        Returns:
+            dict: Losses of each branch.
+        """
+        outs = self(pts_feats)
+        batch_gt_instance_3d = []
+        for data_sample in batch_data_samples:
+            batch_gt_instance_3d.append(data_sample.gt_instances_3d)
+        losses = self.loss_by_feat(outs, batch_gt_instance_3d)
+        return losses
+
+    def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
+                     batch_gt_instances_3d: List[InstanceData], *args,
+                     **kwargs):
+        """Loss function for CenterHead.
+
+        Args:
+            preds_dicts (tuple[list[dict]]): Prediction results of
+                multiple tasks. The outer tuple indicate  different
+                tasks head, and the internal list indicate different
+                FPN level.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and\
+                ``labels_3d`` attributes.
+
+        Returns:
+            dict[str,torch.Tensor]: Loss of heatmap and bbox of each task.
+        """
+
+        heatmaps, anno_boxes, inds, masks = self.get_targets(
+            batch_gt_instances_3d)
+        loss_dict = dict()
+        for task_id, preds_dict in enumerate(preds_dicts):
+            # heatmap focal loss
+            preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
+            num_pos = heatmaps[task_id].eq(1).float().sum().item()
+            loss_heatmap = self.loss_cls(
+                preds_dict[0]['heatmap'],
+                heatmaps[task_id],
+                avg_factor=max(num_pos, 1))
+            target_box = anno_boxes[task_id]
+            # reconstruct the anno_box from multiple reg heads
+            preds_dict[0]['anno_box'] = torch.cat(
+                (preds_dict[0]['reg'], preds_dict[0]['height'],
+                 preds_dict[0]['dim'], preds_dict[0]['rot'],
+                 preds_dict[0]['vel']),
+                dim=1)
+
+            # Regression loss for dimension, offset, height, rotation
+            ind = inds[task_id]
+            num = masks[task_id].float().sum()
+            pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
+            pred = pred.view(pred.size(0), -1, pred.size(3))
+            pred = self._gather_feat(pred, ind)
+            mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
+            isnotnan = (~torch.isnan(target_box)).float()
+            mask *= isnotnan
+
+            code_weights = self.train_cfg.get('code_weights', None)
+            bbox_weights = mask * mask.new_tensor(code_weights)
+            loss_bbox = self.loss_bbox(
+                pred, target_box, bbox_weights, avg_factor=(num + 1e-4))
+            loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap
+            loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox
+        return loss_dict
+
+    def predict(self,
+                pts_feats: Dict[str, torch.Tensor],
+                batch_data_samples: List[Det3DDataSample],
+                rescale=True,
+                **kwargs) -> List[InstanceData]:
+        """
+        Args:
+            pts_feats (dict): Point features..
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes meta information of data.
+            rescale (bool): Whether rescale the resutls to
+                the original scale.
+
+        Returns:
+            list[:obj:`InstanceData`]: List of processed predictions. Each
+            InstanceData contains 3d Bounding boxes and corresponding
+            scores and labels.
+        """
+        preds_dict = self(pts_feats)
+        batch_size = len(batch_data_samples)
+        batch_input_metas = []
+        for batch_index in range(batch_size):
+            metainfo = batch_data_samples[batch_index].metainfo
+            batch_input_metas.append(metainfo)
+
+        results_list = self.predict_by_feat(
+            preds_dict, batch_input_metas, rescale=rescale, **kwargs)
+        return results_list
+
+    def predict_by_feat(self, preds_dicts: Tuple[List[dict]],
+                        batch_input_metas: List[dict], *args,
+                        **kwargs) -> List[InstanceData]:
+        """Generate bboxes from bbox head predictions.
+
+        Args:
+            preds_dicts (tuple[list[dict]]): Prediction results of
+                multiple tasks. The outer tuple indicate  different
+                tasks head, and the internal list indicate different
+                FPN level.
+            batch_input_metas (list[dict]): Meta info of multiple
+                inputs.
+
+        Returns:
+            list[:obj:`InstanceData`]: Instance prediction
+            results of each sample after the post process.
+            Each item usually contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes_3d (:obj:`LiDARInstance3DBoxes`): Prediction
+                  of bboxes, contains a tensor with shape
+                  (num_instances, 7) or (num_instances, 9), and
+                  the last 2 dimensions of 9 is
+                  velocity.
+        """
+        rets = []
+        for task_id, preds_dict in enumerate(preds_dicts):
+            num_class_with_bg = self.num_classes[task_id]
+            batch_size = preds_dict[0]['heatmap'].shape[0]
+            batch_heatmap = preds_dict[0]['heatmap'].sigmoid()
+
+            batch_reg = preds_dict[0]['reg']
+            batch_hei = preds_dict[0]['height']
+
+            if self.norm_bbox:
+                batch_dim = torch.exp(preds_dict[0]['dim'])
+            else:
+                batch_dim = preds_dict[0]['dim']
+
+            batch_rots = preds_dict[0]['rot'][:, 0].unsqueeze(1)
+            batch_rotc = preds_dict[0]['rot'][:, 1].unsqueeze(1)
+
+            if 'vel' in preds_dict[0]:
+                batch_vel = preds_dict[0]['vel']
+            else:
+                batch_vel = None
+            temp = self.bbox_coder.decode(
+                batch_heatmap,
+                batch_rots,
+                batch_rotc,
+                batch_hei,
+                batch_dim,
+                batch_vel,
+                reg=batch_reg,
+                task_id=task_id)
+            assert self.test_cfg['nms_type'] in ['circle', 'rotate']
+            batch_reg_preds = [box['bboxes'] for box in temp]
+            batch_cls_preds = [box['scores'] for box in temp]
+            batch_cls_labels = [box['labels'] for box in temp]
+            if self.test_cfg['nms_type'] == 'circle':
+                ret_task = []
+                for i in range(batch_size):
+                    boxes3d = temp[i]['bboxes']
+                    scores = temp[i]['scores']
+                    labels = temp[i]['labels']
+                    centers = boxes3d[:, [0, 1]]
+                    boxes = torch.cat([centers, scores.view(-1, 1)], dim=1)
+                    keep = torch.tensor(
+                        circle_nms(
+                            boxes.detach().cpu().numpy(),
+                            self.test_cfg['min_radius'][task_id],
+                            post_max_size=self.test_cfg['post_max_size']),
+                        dtype=torch.long,
+                        device=boxes.device)
+
+                    boxes3d = boxes3d[keep]
+                    scores = scores[keep]
+                    labels = labels[keep]
+                    ret = dict(bboxes=boxes3d, scores=scores, labels=labels)
+                    ret_task.append(ret)
+                rets.append(ret_task)
+            else:
+                rets.append(
+                    self.get_task_detections(num_class_with_bg,
+                                             batch_cls_preds, batch_reg_preds,
+                                             batch_cls_labels,
+                                             batch_input_metas))
+
+        # Merge branches results
+        num_samples = len(rets[0])
+
+        ret_list = []
+        for i in range(num_samples):
+            temp_instances = InstanceData()
+            for k in rets[0][i].keys():
+                if k == 'bboxes':
+                    bboxes = torch.cat([ret[i][k] for ret in rets])
+                    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+                    bboxes = batch_input_metas[i]['box_type_3d'](
+                        bboxes, self.bbox_coder.code_size)
+                elif k == 'scores':
+                    scores = torch.cat([ret[i][k] for ret in rets])
+                elif k == 'labels':
+                    flag = 0
+                    for j, num_class in enumerate(self.num_classes):
+                        rets[j][i][k] += flag
+                        flag += num_class
+                    labels = torch.cat([ret[i][k].int() for ret in rets])
+            temp_instances.bboxes_3d = bboxes
+            temp_instances.scores_3d = scores
+            temp_instances.labels_3d = labels
+            ret_list.append(temp_instances)
+        return ret_list
+
+    def get_task_detections(self, num_class_with_bg, batch_cls_preds,
+                            batch_reg_preds, batch_cls_labels, img_metas):
+        """Rotate nms for each task.
+
+        Args:
+            num_class_with_bg (int): Number of classes for the current task.
+            batch_cls_preds (list[torch.Tensor]): Prediction score with the
+                shape of [N].
+            batch_reg_preds (list[torch.Tensor]): Prediction bbox with the
+                shape of [N, 9].
+            batch_cls_labels (list[torch.Tensor]): Prediction label with the
+                shape of [N].
+            img_metas (list[dict]): Meta information of each sample.
+
+        Returns:
+            list[dict[str: torch.Tensor]]: contains the following keys:
+
+                -bboxes (torch.Tensor): Prediction bboxes after nms with the
+                    shape of [N, 9].
+                -scores (torch.Tensor): Prediction scores after nms with the
+                    shape of [N].
+                -labels (torch.Tensor): Prediction labels after nms with the
+                    shape of [N].
+        """
+        predictions_dicts = []
+        post_center_range = self.test_cfg['post_center_limit_range']
+        if len(post_center_range) > 0:
+            post_center_range = torch.tensor(
+                post_center_range,
+                dtype=batch_reg_preds[0].dtype,
+                device=batch_reg_preds[0].device)
+
+        for i, (box_preds, cls_preds, cls_labels) in enumerate(
+                zip(batch_reg_preds, batch_cls_preds, batch_cls_labels)):
+
+            # Apply NMS in bird eye view
+
+            # get the highest score per prediction, then apply nms
+            # to remove overlapped box.
+            if num_class_with_bg == 1:
+                top_scores = cls_preds.squeeze(-1)
+                top_labels = torch.zeros(
+                    cls_preds.shape[0],
+                    device=cls_preds.device,
+                    dtype=torch.long)
+
+            else:
+                top_labels = cls_labels.long()
+                top_scores = cls_preds.squeeze(-1)
+
+            if self.test_cfg['score_threshold'] > 0.0:
+                thresh = torch.tensor(
+                    [self.test_cfg['score_threshold']],
+                    device=cls_preds.device).type_as(cls_preds)
+                top_scores_keep = top_scores >= thresh
+                top_scores = top_scores.masked_select(top_scores_keep)
+
+            if top_scores.shape[0] != 0:
+                if self.test_cfg['score_threshold'] > 0.0:
+                    box_preds = box_preds[top_scores_keep]
+                    top_labels = top_labels[top_scores_keep]
+
+                boxes_for_nms = xywhr2xyxyr(img_metas[i]['box_type_3d'](
+                    box_preds[:, :], self.bbox_coder.code_size).bev)
+                # the nms in 3d detection just remove overlap boxes.
+
+                selected = nms_bev(
+                    boxes_for_nms,
+                    top_scores,
+                    thresh=self.test_cfg['nms_thr'],
+                    pre_max_size=self.test_cfg['pre_max_size'],
+                    post_max_size=self.test_cfg['post_max_size'])
+            else:
+                selected = []
+
+            # if selected is not None:
+            selected_boxes = box_preds[selected]
+            selected_labels = top_labels[selected]
+            selected_scores = top_scores[selected]
+
+            # finally generate predictions.
+            if selected_boxes.shape[0] != 0:
+                box_preds = selected_boxes
+                scores = selected_scores
+                label_preds = selected_labels
+                final_box_preds = box_preds
+                final_scores = scores
+                final_labels = label_preds
+                if post_center_range is not None:
+                    mask = (final_box_preds[:, :3] >=
+                            post_center_range[:3]).all(1)
+                    mask &= (final_box_preds[:, :3] <=
+                             post_center_range[3:]).all(1)
+                    predictions_dict = dict(
+                        bboxes=final_box_preds[mask],
+                        scores=final_scores[mask],
+                        labels=final_labels[mask])
+                else:
+                    predictions_dict = dict(
+                        bboxes=final_box_preds,
+                        scores=final_scores,
+                        labels=final_labels)
+            else:
+                dtype = batch_reg_preds[0].dtype
+                device = batch_reg_preds[0].device
+                predictions_dict = dict(
+                    bboxes=torch.zeros([0, self.bbox_coder.code_size],
+                                       dtype=dtype,
+                                       device=device),
+                    scores=torch.zeros([0], dtype=dtype, device=device),
+                    labels=torch.zeros([0],
+                                       dtype=top_labels.dtype,
+                                       device=device))
+
+            predictions_dicts.append(predictions_dict)
+        return predictions_dicts
diff --git a/mmde/mmdet3d/models/dense_heads/fcaf3d_head.py b/mmde/mmdet3d/models/dense_heads/fcaf3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed53b7dbe39253adb14220e8475c5b8d331bc6b3
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/fcaf3d_head.py
@@ -0,0 +1,696 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapted from https://github.com/SamsungLabs/fcaf3d/blob/master/mmdet3d/models/dense_heads/fcaf3d_neck_with_head.py # noqa
+from typing import List, Optional, Tuple
+
+try:
+    import MinkowskiEngine as ME
+    from MinkowskiEngine import SparseTensor
+except ImportError:
+    # Please follow get_started.md to install MinkowskiEngine.
+    ME = SparseTensor = None
+    pass
+
+import torch
+from mmcv.cnn import Scale
+from mmcv.ops import nms3d, nms3d_normal
+from mmdet.utils import reduce_mean
+from mmengine.model import bias_init_with_prob
+from mmengine.structures import InstanceData
+from torch import Tensor, nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import BaseInstance3DBoxes, rotation_3d_in_axis
+from mmdet3d.utils import InstanceList, OptInstanceList
+from .base_3d_dense_head import Base3DDenseHead
+
+
+@MODELS.register_module()
+class FCAF3DHead(Base3DDenseHead):
+    r"""Bbox head of `FCAF3D <https://arxiv.org/abs/2112.00322>`_.
+
+    Actually here we store both the sparse 3D FPN and a head. The neck and
+    the head can not be simply separated as pruning score on the i-th level
+    of FPN requires classification scores from i+1-th level of the head.
+
+    Args:
+        num_classes (int): Number of classes.
+        in_channels (tuple(int)): Number of channels in input tensors.
+        out_channels (int): Number of channels in the neck output tensors.
+        num_reg_outs (int): Number of regression layer channels.
+        voxel_size (float): Voxel size in meters.
+        pts_prune_threshold (int): Pruning threshold on each feature level.
+        pts_assign_threshold (int): Box to location assigner parameter.
+            Assigner selects the maximum feature level with more locations
+            inside the box than pts_assign_threshold.
+        pts_center_threshold (int): Box to location assigner parameter.
+            After feature level for the box is determined, assigner selects
+            pts_center_threshold locations closest to the box center.
+        center_loss (dict): Config of centerness loss. Defaults to
+            dict(type='mmdet.CrossEntropyLoss', use_sigmoid=True).
+        bbox_loss (dict): Config of bbox loss. Defaults to
+            dict(type='AxisAlignedIoULoss').
+        cls_loss (dict): Config of classification loss. Defaults to
+            dict = dict(type='mmdet.FocalLoss').
+        train_cfg (dict, optional): Config for train stage. Defaults to None.
+        test_cfg (dict, optional): Config for test stage. Defaults to None.
+        init_cfg (dict, optional): Config for weight initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: Tuple[int],
+                 out_channels: int,
+                 num_reg_outs: int,
+                 voxel_size: float,
+                 pts_prune_threshold: int,
+                 pts_assign_threshold: int,
+                 pts_center_threshold: int,
+                 center_loss: dict = dict(
+                     type='mmdet.CrossEntropyLoss', use_sigmoid=True),
+                 bbox_loss: dict = dict(type='AxisAlignedIoULoss'),
+                 cls_loss: dict = dict(type='mmdet.FocalLoss'),
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None):
+        super(FCAF3DHead, self).__init__(init_cfg)
+        if ME is None:
+            raise ImportError(
+                'Please follow `get_started.md` to install MinkowskiEngine.`')
+        self.voxel_size = voxel_size
+        self.pts_prune_threshold = pts_prune_threshold
+        self.pts_assign_threshold = pts_assign_threshold
+        self.pts_center_threshold = pts_center_threshold
+        self.center_loss = MODELS.build(center_loss)
+        self.bbox_loss = MODELS.build(bbox_loss)
+        self.cls_loss = MODELS.build(cls_loss)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self._init_layers(in_channels, out_channels, num_reg_outs, num_classes)
+
+    @staticmethod
+    def _make_block(in_channels: int, out_channels: int) -> nn.Module:
+        """Construct Conv-Norm-Act block.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+
+        Returns:
+            torch.nn.Module: With corresponding layers.
+        """
+        return nn.Sequential(
+            ME.MinkowskiConvolution(
+                in_channels, out_channels, kernel_size=3, dimension=3),
+            ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU())
+
+    @staticmethod
+    def _make_up_block(in_channels: int, out_channels: int) -> nn.Module:
+        """Construct DeConv-Norm-Act-Conv-Norm-Act block.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+
+        Returns:
+            torch.nn.Module: With corresponding layers.
+        """
+        return nn.Sequential(
+            ME.MinkowskiGenerativeConvolutionTranspose(
+                in_channels,
+                out_channels,
+                kernel_size=2,
+                stride=2,
+                dimension=3), ME.MinkowskiBatchNorm(out_channels),
+            ME.MinkowskiELU(),
+            ME.MinkowskiConvolution(
+                out_channels, out_channels, kernel_size=3, dimension=3),
+            ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU())
+
+    def _init_layers(self, in_channels: Tuple[int], out_channels: int,
+                     num_reg_outs: int, num_classes: int):
+        """Initialize layers.
+
+        Args:
+            in_channels (tuple[int]): Number of channels in input tensors.
+            out_channels (int): Number of channels in the neck output tensors.
+            num_reg_outs (int): Number of regression layer channels.
+            num_classes (int): Number of classes.
+        """
+        # neck layers
+        self.pruning = ME.MinkowskiPruning()
+        for i in range(len(in_channels)):
+            if i > 0:
+                self.__setattr__(
+                    f'up_block_{i}',
+                    self._make_up_block(in_channels[i], in_channels[i - 1]))
+            self.__setattr__(f'out_block_{i}',
+                             self._make_block(in_channels[i], out_channels))
+
+        # head layers
+        self.conv_center = ME.MinkowskiConvolution(
+            out_channels, 1, kernel_size=1, dimension=3)
+        self.conv_reg = ME.MinkowskiConvolution(
+            out_channels, num_reg_outs, kernel_size=1, dimension=3)
+        self.conv_cls = ME.MinkowskiConvolution(
+            out_channels, num_classes, kernel_size=1, bias=True, dimension=3)
+        self.scales = nn.ModuleList(
+            [Scale(1.) for _ in range(len(in_channels))])
+
+    def init_weights(self):
+        """Initialize weights."""
+        nn.init.normal_(self.conv_center.kernel, std=.01)
+        nn.init.normal_(self.conv_reg.kernel, std=.01)
+        nn.init.normal_(self.conv_cls.kernel, std=.01)
+        nn.init.constant_(self.conv_cls.bias, bias_init_with_prob(.01))
+
+    def forward(self, x: List[Tensor]) -> Tuple[List[Tensor], ...]:
+        """Forward pass.
+
+        Args:
+            x (list[Tensor]): Features from the backbone.
+
+        Returns:
+            Tuple[List[Tensor], ...]: Predictions of the head.
+        """
+        center_preds, bbox_preds, cls_preds, points = [], [], [], []
+        inputs = x
+        x = inputs[-1]
+        prune_score = None
+        for i in range(len(inputs) - 1, -1, -1):
+            if i < len(inputs) - 1:
+                x = self.__getattr__(f'up_block_{i + 1}')(x)
+                x = inputs[i] + x
+                x = self._prune(x, prune_score)
+
+            out = self.__getattr__(f'out_block_{i}')(x)
+            center_pred, bbox_pred, cls_pred, point, prune_score = \
+                self._forward_single(out, self.scales[i])
+            center_preds.append(center_pred)
+            bbox_preds.append(bbox_pred)
+            cls_preds.append(cls_pred)
+            points.append(point)
+        return center_preds[::-1], bbox_preds[::-1], cls_preds[::-1], \
+            points[::-1]
+
+    def _prune(self, x: SparseTensor, scores: SparseTensor) -> SparseTensor:
+        """Prunes the tensor by score thresholding.
+
+        Args:
+            x (SparseTensor): Tensor to be pruned.
+            scores (SparseTensor): Scores for thresholding.
+
+        Returns:
+            SparseTensor: Pruned tensor.
+        """
+        with torch.no_grad():
+            coordinates = x.C.float()
+            interpolated_scores = scores.features_at_coordinates(coordinates)
+            prune_mask = interpolated_scores.new_zeros(
+                (len(interpolated_scores)), dtype=torch.bool)
+            for permutation in x.decomposition_permutations:
+                score = interpolated_scores[permutation]
+                mask = score.new_zeros((len(score)), dtype=torch.bool)
+                topk = min(len(score), self.pts_prune_threshold)
+                ids = torch.topk(score.squeeze(1), topk, sorted=False).indices
+                mask[ids] = True
+                prune_mask[permutation[mask]] = True
+        x = self.pruning(x, prune_mask)
+        return x
+
+    def _forward_single(self, x: SparseTensor,
+                        scale: Scale) -> Tuple[Tensor, ...]:
+        """Forward pass per level.
+
+        Args:
+            x (SparseTensor): Per level neck output tensor.
+            scale (mmcv.cnn.Scale): Per level multiplication weight.
+
+        Returns:
+            tuple[Tensor]: Per level head predictions.
+        """
+        center_pred = self.conv_center(x).features
+        scores = self.conv_cls(x)
+        cls_pred = scores.features
+        prune_scores = ME.SparseTensor(
+            scores.features.max(dim=1, keepdim=True).values,
+            coordinate_map_key=scores.coordinate_map_key,
+            coordinate_manager=scores.coordinate_manager)
+        reg_final = self.conv_reg(x).features
+        reg_distance = torch.exp(scale(reg_final[:, :6]))
+        reg_angle = reg_final[:, 6:]
+        bbox_pred = torch.cat((reg_distance, reg_angle), dim=1)
+
+        center_preds, bbox_preds, cls_preds, points = [], [], [], []
+        for permutation in x.decomposition_permutations:
+            center_preds.append(center_pred[permutation])
+            bbox_preds.append(bbox_pred[permutation])
+            cls_preds.append(cls_pred[permutation])
+
+        points = x.decomposed_coordinates
+        for i in range(len(points)):
+            points[i] = points[i] * self.voxel_size
+
+        return center_preds, bbox_preds, cls_preds, points, prune_scores
+
+    def _loss_by_feat_single(self, center_preds: List[Tensor],
+                             bbox_preds: List[Tensor], cls_preds: List[Tensor],
+                             points: List[Tensor],
+                             gt_bboxes: BaseInstance3DBoxes, gt_labels: Tensor,
+                             input_meta: dict) -> Tuple[Tensor, ...]:
+        """Loss function of single sample.
+
+        Args:
+            center_preds (list[Tensor]): Centerness predictions for all levels.
+            bbox_preds (list[Tensor]): Bbox predictions for all levels.
+            cls_preds (list[Tensor]): Classification predictions for all
+                levels.
+            points (list[Tensor]): Final location coordinates for all levels.
+            gt_bboxes (:obj:`BaseInstance3DBoxes`): Ground truth boxes.
+            gt_labels (Tensor): Ground truth labels.
+            input_meta (dict): Scene meta info.
+
+        Returns:
+            tuple[Tensor, ...]: Centerness, bbox, and classification loss
+            values.
+        """
+        center_targets, bbox_targets, cls_targets = self.get_targets(
+            points, gt_bboxes, gt_labels)
+
+        center_preds = torch.cat(center_preds)
+        bbox_preds = torch.cat(bbox_preds)
+        cls_preds = torch.cat(cls_preds)
+        points = torch.cat(points)
+
+        # cls loss
+        pos_inds = torch.nonzero(cls_targets >= 0).squeeze(1)
+        n_pos = points.new_tensor(len(pos_inds))
+        n_pos = max(reduce_mean(n_pos), 1.)
+        cls_loss = self.cls_loss(cls_preds, cls_targets, avg_factor=n_pos)
+
+        # bbox and centerness losses
+        pos_center_preds = center_preds[pos_inds]
+        pos_bbox_preds = bbox_preds[pos_inds]
+        pos_center_targets = center_targets[pos_inds].unsqueeze(1)
+        pos_bbox_targets = bbox_targets[pos_inds]
+        # reduce_mean is outside if / else block to prevent deadlock
+        center_denorm = max(
+            reduce_mean(pos_center_targets.sum().detach()), 1e-6)
+        if len(pos_inds) > 0:
+            pos_points = points[pos_inds]
+            center_loss = self.center_loss(
+                pos_center_preds, pos_center_targets, avg_factor=n_pos)
+
+            bbox_loss = self.bbox_loss(
+                self._bbox_to_loss(
+                    self._bbox_pred_to_bbox(pos_points, pos_bbox_preds)),
+                self._bbox_to_loss(pos_bbox_targets),
+                weight=pos_center_targets.squeeze(1),
+                avg_factor=center_denorm)
+        else:
+            center_loss = pos_center_preds.sum()
+            bbox_loss = pos_bbox_preds.sum()
+        return center_loss, bbox_loss, cls_loss
+
+    def loss_by_feat(self,
+                     center_preds: List[List[Tensor]],
+                     bbox_preds: List[List[Tensor]],
+                     cls_preds: List[List[Tensor]],
+                     points: List[List[Tensor]],
+                     batch_gt_instances_3d: InstanceList,
+                     batch_input_metas: List[dict],
+                     batch_gt_instances_ignore: OptInstanceList = None,
+                     **kwargs) -> dict:
+        """Loss function about feature.
+
+        Args:
+            center_preds (list[list[Tensor]]): Centerness predictions for
+                all scenes. The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
+                The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            cls_preds (list[list[Tensor]]): Classification predictions for all
+                scenes. The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            points (list[list[Tensor]]): Final location coordinates for all
+                scenes. The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、`
+                `labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_input_metas (list[dict]): Meta information of each input,
+                e.g., image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: Centerness, bbox, and classification losses.
+        """
+        center_losses, bbox_losses, cls_losses = [], [], []
+        for i in range(len(batch_input_metas)):
+            center_loss, bbox_loss, cls_loss = self._loss_by_feat_single(
+                center_preds=[x[i] for x in center_preds],
+                bbox_preds=[x[i] for x in bbox_preds],
+                cls_preds=[x[i] for x in cls_preds],
+                points=[x[i] for x in points],
+                input_meta=batch_input_metas[i],
+                gt_bboxes=batch_gt_instances_3d[i].bboxes_3d,
+                gt_labels=batch_gt_instances_3d[i].labels_3d)
+            center_losses.append(center_loss)
+            bbox_losses.append(bbox_loss)
+            cls_losses.append(cls_loss)
+        return dict(
+            center_loss=torch.mean(torch.stack(center_losses)),
+            bbox_loss=torch.mean(torch.stack(bbox_losses)),
+            cls_loss=torch.mean(torch.stack(cls_losses)))
+
+    def _predict_by_feat_single(self, center_preds: List[Tensor],
+                                bbox_preds: List[Tensor],
+                                cls_preds: List[Tensor], points: List[Tensor],
+                                input_meta: dict) -> InstanceData:
+        """Generate boxes for single sample.
+
+        Args:
+            center_preds (list[Tensor]): Centerness predictions for all levels.
+            bbox_preds (list[Tensor]): Bbox predictions for all levels.
+            cls_preds (list[Tensor]): Classification predictions for all
+                levels.
+            points (list[Tensor]): Final location coordinates for all levels.
+            input_meta (dict): Scene meta info.
+
+        Returns:
+            InstanceData: Predicted bounding boxes, scores and labels.
+        """
+        mlvl_bboxes, mlvl_scores = [], []
+        for center_pred, bbox_pred, cls_pred, point in zip(
+                center_preds, bbox_preds, cls_preds, points):
+            scores = cls_pred.sigmoid() * center_pred.sigmoid()
+            max_scores, _ = scores.max(dim=1)
+
+            if len(scores) > self.test_cfg.nms_pre > 0:
+                _, ids = max_scores.topk(self.test_cfg.nms_pre)
+                bbox_pred = bbox_pred[ids]
+                scores = scores[ids]
+                point = point[ids]
+
+            bboxes = self._bbox_pred_to_bbox(point, bbox_pred)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+
+        bboxes = torch.cat(mlvl_bboxes)
+        scores = torch.cat(mlvl_scores)
+        bboxes, scores, labels = self._single_scene_multiclass_nms(
+            bboxes, scores, input_meta)
+
+        bboxes = input_meta['box_type_3d'](
+            bboxes,
+            box_dim=bboxes.shape[1],
+            with_yaw=bboxes.shape[1] == 7,
+            origin=(.5, .5, .5))
+
+        results = InstanceData()
+        results.bboxes_3d = bboxes
+        results.scores_3d = scores
+        results.labels_3d = labels
+        return results
+
+    def predict_by_feat(self, center_preds: List[List[Tensor]],
+                        bbox_preds: List[List[Tensor]], cls_preds,
+                        points: List[List[Tensor]],
+                        batch_input_metas: List[dict],
+                        **kwargs) -> List[InstanceData]:
+        """Generate boxes for all scenes.
+
+        Args:
+            center_preds (list[list[Tensor]]): Centerness predictions for
+                all scenes.
+            bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
+            cls_preds (list[list[Tensor]]): Classification predictions for all
+                scenes.
+            points (list[list[Tensor]]): Final location coordinates for all
+                scenes.
+            batch_input_metas (list[dict]): Meta infos for all scenes.
+
+        Returns:
+            list[InstanceData]: Predicted bboxes, scores, and labels for
+            all scenes.
+        """
+        results = []
+        for i in range(len(batch_input_metas)):
+            result = self._predict_by_feat_single(
+                center_preds=[x[i] for x in center_preds],
+                bbox_preds=[x[i] for x in bbox_preds],
+                cls_preds=[x[i] for x in cls_preds],
+                points=[x[i] for x in points],
+                input_meta=batch_input_metas[i])
+            results.append(result)
+        return results
+
+    @staticmethod
+    def _bbox_to_loss(bbox: Tensor) -> Tensor:
+        """Transform box to the axis-aligned or rotated iou loss format.
+
+        Args:
+            bbox (Tensor): 3D box of shape (N, 6) or (N, 7).
+
+        Returns:
+            Tensor: Transformed 3D box of shape (N, 6) or (N, 7).
+        """
+        # rotated iou loss accepts (x, y, z, w, h, l, heading)
+        if bbox.shape[-1] != 6:
+            return bbox
+
+        # axis-aligned case: x, y, z, w, h, l -> x1, y1, z1, x2, y2, z2
+        return torch.stack(
+            (bbox[..., 0] - bbox[..., 3] / 2, bbox[..., 1] - bbox[..., 4] / 2,
+             bbox[..., 2] - bbox[..., 5] / 2, bbox[..., 0] + bbox[..., 3] / 2,
+             bbox[..., 1] + bbox[..., 4] / 2, bbox[..., 2] + bbox[..., 5] / 2),
+            dim=-1)
+
+    @staticmethod
+    def _bbox_pred_to_bbox(points: Tensor, bbox_pred: Tensor) -> Tensor:
+        """Transform predicted bbox parameters to bbox.
+
+        Args:
+            points (Tensor): Final locations of shape (N, 3)
+            bbox_pred (Tensor): Predicted bbox parameters of shape (N, 6)
+                or (N, 8).
+
+        Returns:
+            Tensor: Transformed 3D box of shape (N, 6) or (N, 7).
+        """
+        if bbox_pred.shape[0] == 0:
+            return bbox_pred
+
+        x_center = points[:, 0] + (bbox_pred[:, 1] - bbox_pred[:, 0]) / 2
+        y_center = points[:, 1] + (bbox_pred[:, 3] - bbox_pred[:, 2]) / 2
+        z_center = points[:, 2] + (bbox_pred[:, 5] - bbox_pred[:, 4]) / 2
+
+        # dx_min, dx_max, dy_min, dy_max, dz_min, dz_max -> x, y, z, w, l, h
+        base_bbox = torch.stack([
+            x_center,
+            y_center,
+            z_center,
+            bbox_pred[:, 0] + bbox_pred[:, 1],
+            bbox_pred[:, 2] + bbox_pred[:, 3],
+            bbox_pred[:, 4] + bbox_pred[:, 5],
+        ], -1)
+
+        # axis-aligned case
+        if bbox_pred.shape[1] == 6:
+            return base_bbox
+
+        # rotated case: ..., sin(2a)ln(q), cos(2a)ln(q)
+        scale = bbox_pred[:, 0] + bbox_pred[:, 1] + \
+            bbox_pred[:, 2] + bbox_pred[:, 3]
+        q = torch.exp(
+            torch.sqrt(
+                torch.pow(bbox_pred[:, 6], 2) + torch.pow(bbox_pred[:, 7], 2)))
+        alpha = 0.5 * torch.atan2(bbox_pred[:, 6], bbox_pred[:, 7])
+        return torch.stack(
+            (x_center, y_center, z_center, scale / (1 + q), scale /
+             (1 + q) * q, bbox_pred[:, 5] + bbox_pred[:, 4], alpha),
+            dim=-1)
+
+    @staticmethod
+    def _get_face_distances(points: Tensor, boxes: Tensor) -> Tensor:
+        """Calculate distances from point to box faces.
+
+        Args:
+            points (Tensor): Final locations of shape (N_points, N_boxes, 3).
+            boxes (Tensor): 3D boxes of shape (N_points, N_boxes, 7)
+
+        Returns:
+            Tensor: Face distances of shape (N_points, N_boxes, 6),
+            (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max).
+        """
+        shift = torch.stack(
+            (points[..., 0] - boxes[..., 0], points[..., 1] - boxes[..., 1],
+             points[..., 2] - boxes[..., 2]),
+            dim=-1).permute(1, 0, 2)
+        shift = rotation_3d_in_axis(
+            shift, -boxes[0, :, 6], axis=2).permute(1, 0, 2)
+        centers = boxes[..., :3] + shift
+        dx_min = centers[..., 0] - boxes[..., 0] + boxes[..., 3] / 2
+        dx_max = boxes[..., 0] + boxes[..., 3] / 2 - centers[..., 0]
+        dy_min = centers[..., 1] - boxes[..., 1] + boxes[..., 4] / 2
+        dy_max = boxes[..., 1] + boxes[..., 4] / 2 - centers[..., 1]
+        dz_min = centers[..., 2] - boxes[..., 2] + boxes[..., 5] / 2
+        dz_max = boxes[..., 2] + boxes[..., 5] / 2 - centers[..., 2]
+        return torch.stack((dx_min, dx_max, dy_min, dy_max, dz_min, dz_max),
+                           dim=-1)
+
+    @staticmethod
+    def _get_centerness(face_distances: Tensor) -> Tensor:
+        """Compute point centerness w.r.t containing box.
+
+        Args:
+            face_distances (Tensor): Face distances of shape (B, N, 6),
+                (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max).
+
+        Returns:
+            Tensor: Centerness of shape (B, N).
+        """
+        x_dims = face_distances[..., [0, 1]]
+        y_dims = face_distances[..., [2, 3]]
+        z_dims = face_distances[..., [4, 5]]
+        centerness_targets = x_dims.min(dim=-1)[0] / x_dims.max(dim=-1)[0] * \
+            y_dims.min(dim=-1)[0] / y_dims.max(dim=-1)[0] * \
+            z_dims.min(dim=-1)[0] / z_dims.max(dim=-1)[0]
+        return torch.sqrt(centerness_targets)
+
+    @torch.no_grad()
+    def get_targets(self, points: Tensor, gt_bboxes: BaseInstance3DBoxes,
+                    gt_labels: Tensor) -> Tuple[Tensor, ...]:
+        """Compute targets for final locations for a single scene.
+
+        Args:
+            points (list[Tensor]): Final locations for all levels.
+            gt_bboxes (BaseInstance3DBoxes): Ground truth boxes.
+            gt_labels (Tensor): Ground truth labels.
+
+        Returns:
+            tuple[Tensor, ...]: Centerness, bbox and classification
+            targets for all locations.
+        """
+        float_max = points[0].new_tensor(1e8)
+        n_levels = len(points)
+        levels = torch.cat([
+            points[i].new_tensor(i).expand(len(points[i]))
+            for i in range(len(points))
+        ])
+        points = torch.cat(points)
+        gt_bboxes = gt_bboxes.to(points.device)
+        n_points = len(points)
+        n_boxes = len(gt_bboxes)
+        volumes = gt_bboxes.volume.unsqueeze(0).expand(n_points, n_boxes)
+
+        # condition 1: point inside box
+        boxes = torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
+                          dim=1)
+        boxes = boxes.expand(n_points, n_boxes, 7)
+        points = points.unsqueeze(1).expand(n_points, n_boxes, 3)
+        face_distances = self._get_face_distances(points, boxes)
+        inside_box_condition = face_distances.min(dim=-1).values > 0
+
+        # condition 2: positive points per level >= limit
+        # calculate positive points per scale
+        n_pos_points_per_level = []
+        for i in range(n_levels):
+            n_pos_points_per_level.append(
+                torch.sum(inside_box_condition[levels == i], dim=0))
+        # find best level
+        n_pos_points_per_level = torch.stack(n_pos_points_per_level, dim=0)
+        lower_limit_mask = n_pos_points_per_level < self.pts_assign_threshold
+        lower_index = torch.argmax(lower_limit_mask.int(), dim=0) - 1
+        lower_index = torch.where(lower_index < 0, 0, lower_index)
+        all_upper_limit_mask = torch.all(
+            torch.logical_not(lower_limit_mask), dim=0)
+        best_level = torch.where(all_upper_limit_mask, n_levels - 1,
+                                 lower_index)
+        # keep only points with best level
+        best_level = best_level.expand(n_points, n_boxes)
+        levels = torch.unsqueeze(levels, 1).expand(n_points, n_boxes)
+        level_condition = best_level == levels
+
+        # condition 3: limit topk points per box by centerness
+        centerness = self._get_centerness(face_distances)
+        centerness = torch.where(inside_box_condition, centerness,
+                                 torch.ones_like(centerness) * -1)
+        centerness = torch.where(level_condition, centerness,
+                                 torch.ones_like(centerness) * -1)
+        top_centerness = torch.topk(
+            centerness,
+            min(self.pts_center_threshold + 1, len(centerness)),
+            dim=0).values[-1]
+        topk_condition = centerness > top_centerness.unsqueeze(0)
+
+        # condition 4: min volume box per point
+        volumes = torch.where(inside_box_condition, volumes, float_max)
+        volumes = torch.where(level_condition, volumes, float_max)
+        volumes = torch.where(topk_condition, volumes, float_max)
+        min_volumes, min_inds = volumes.min(dim=1)
+
+        center_targets = centerness[torch.arange(n_points), min_inds]
+        bbox_targets = boxes[torch.arange(n_points), min_inds]
+        if not gt_bboxes.with_yaw:
+            bbox_targets = bbox_targets[:, :-1]
+        cls_targets = gt_labels[min_inds]
+        cls_targets = torch.where(min_volumes == float_max, -1, cls_targets)
+        return center_targets, bbox_targets, cls_targets
+
+    def _single_scene_multiclass_nms(self, bboxes: Tensor, scores: Tensor,
+                                     input_meta: dict) -> Tuple[Tensor, ...]:
+        """Multi-class nms for a single scene.
+
+        Args:
+            bboxes (Tensor): Predicted boxes of shape (N_boxes, 6) or
+                (N_boxes, 7).
+            scores (Tensor): Predicted scores of shape (N_boxes, N_classes).
+            input_meta (dict): Scene meta data.
+
+        Returns:
+            tuple[Tensor, ...]: Predicted bboxes, scores and labels.
+        """
+        num_classes = scores.shape[1]
+        with_yaw = bboxes.shape[1] == 7
+        nms_bboxes, nms_scores, nms_labels = [], [], []
+        for i in range(num_classes):
+            ids = scores[:, i] > self.test_cfg.score_thr
+            if not ids.any():
+                continue
+
+            class_scores = scores[ids, i]
+            class_bboxes = bboxes[ids]
+            if with_yaw:
+                nms_function = nms3d
+            else:
+                class_bboxes = torch.cat(
+                    (class_bboxes, torch.zeros_like(class_bboxes[:, :1])),
+                    dim=1)
+                nms_function = nms3d_normal
+
+            nms_ids = nms_function(class_bboxes, class_scores,
+                                   self.test_cfg.iou_thr)
+            nms_bboxes.append(class_bboxes[nms_ids])
+            nms_scores.append(class_scores[nms_ids])
+            nms_labels.append(
+                bboxes.new_full(
+                    class_scores[nms_ids].shape, i, dtype=torch.long))
+
+        if len(nms_bboxes):
+            nms_bboxes = torch.cat(nms_bboxes, dim=0)
+            nms_scores = torch.cat(nms_scores, dim=0)
+            nms_labels = torch.cat(nms_labels, dim=0)
+        else:
+            nms_bboxes = bboxes.new_zeros((0, bboxes.shape[1]))
+            nms_scores = bboxes.new_zeros((0, ))
+            nms_labels = bboxes.new_zeros((0, ))
+
+        if not with_yaw:
+            nms_bboxes = nms_bboxes[:, :6]
+
+        return nms_bboxes, nms_scores, nms_labels
diff --git a/mmde/mmdet3d/models/dense_heads/fcos_mono3d_head.py b/mmde/mmdet3d/models/dense_heads/fcos_mono3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1816431566e0636a8400e88412b754f9b98c2fb5
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/fcos_mono3d_head.py
@@ -0,0 +1,958 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Sequence, Tuple
+
+import numpy as np
+import torch
+from mmcv.cnn import Scale
+from mmdet.models.utils import multi_apply, select_single_mlvl
+from mmengine.model import normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models.layers import box3d_multiclass_nms
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures import limit_period, points_img2cam, xywhr2xyxyr
+from mmdet3d.utils import (ConfigType, InstanceList, OptConfigType,
+                           OptInstanceList)
+from .anchor_free_mono3d_head import AnchorFreeMono3DHead
+
+RangeType = Sequence[Tuple[int, int]]
+
+INF = 1e8
+
+
+@MODELS.register_module()
+class FCOSMono3DHead(AnchorFreeMono3DHead):
+    """Anchor-free head used in FCOS3D.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple
+            level points.
+        center_sampling (bool): If true, use center sampling. Default: True.
+        center_sample_radius (float): Radius of center sampling. Default: 1.5.
+        norm_on_bbox (bool): If true, normalize the regression targets
+            with FPN strides. Default: True.
+        centerness_on_reg (bool): If true, position centerness on the
+            regress branch. Please refer to
+            https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
+            Default: True.
+        centerness_alpha (float): Parameter used to adjust the intensity
+            attenuation from the center to the periphery. Default: 2.5.
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+        loss_dir (:obj:`ConfigDict` or dict): Config of direction classification loss.
+        loss_attr (:obj:`ConfigDict` or dict): Config of attribute classification loss.
+        loss_centerness (:obj:`ConfigDict` or dict): Config of centerness loss.
+        norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and config norm layer.
+            Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
+        centerness_branch (tuple[int]): Channels for centerness branch.
+            Default: (64, ).
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+    """  # noqa: E501
+
+    def __init__(self,
+                 regress_ranges: RangeType = ((-1, 48), (48, 96), (96, 192),
+                                              (192, 384), (384, INF)),
+                 center_sampling: bool = True,
+                 center_sample_radius: float = 1.5,
+                 norm_on_bbox: bool = True,
+                 centerness_on_reg: bool = True,
+                 centerness_alpha: float = 2.5,
+                 loss_cls: ConfigType = dict(
+                     type='mmdet.FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 loss_bbox: ConfigType = dict(
+                     type='mmdet.SmoothL1Loss',
+                     beta=1.0 / 9.0,
+                     loss_weight=1.0),
+                 loss_dir: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 loss_attr: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 loss_centerness: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 bbox_coder: ConfigType = dict(
+                     type='FCOS3DBBoxCoder', code_size=9),
+                 norm_cfg: ConfigType = dict(
+                     type='GN', num_groups=32, requires_grad=True),
+                 centerness_branch: Tuple[int] = (64, ),
+                 init_cfg: OptConfigType = None,
+                 **kwargs) -> None:
+        self.regress_ranges = regress_ranges
+        self.center_sampling = center_sampling
+        self.center_sample_radius = center_sample_radius
+        self.norm_on_bbox = norm_on_bbox
+        self.centerness_on_reg = centerness_on_reg
+        self.centerness_alpha = centerness_alpha
+        self.centerness_branch = centerness_branch
+        super().__init__(
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_dir=loss_dir,
+            loss_attr=loss_attr,
+            norm_cfg=norm_cfg,
+            init_cfg=init_cfg,
+            **kwargs)
+        self.loss_centerness = MODELS.build(loss_centerness)
+        bbox_coder['code_size'] = self.bbox_code_size
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        super()._init_layers()
+        self.conv_centerness_prev = self._init_branch(
+            conv_channels=self.centerness_branch,
+            conv_strides=(1, ) * len(self.centerness_branch))
+        self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1)
+        self.scale_dim = 3  # only for offset, depth and size regression
+        self.scales = nn.ModuleList([
+            nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)])
+            for _ in self.strides
+        ])
+
+    def init_weights(self):
+        """Initialize weights of the head.
+
+        We currently still use the customized init_weights because the default
+        init of DCN triggered by the init_cfg will init conv_offset.weight,
+        which mistakenly affects the training stability.
+        """
+        super().init_weights()
+        for m in self.conv_centerness_prev:
+            if isinstance(m.conv, nn.Conv2d):
+                normal_init(m.conv, std=0.01)
+        normal_init(self.conv_centerness, std=0.01)
+
+    def forward(
+        self, x: Tuple[Tensor]
+    ) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor],
+               List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Box scores for each scale level,
+                    each is a 4D-tensor, the channel number is
+                    num_points * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * bbox_code_size.
+                dir_cls_preds (list[Tensor]): Box scores for direction class
+                    predictions on each scale level, each is a 4D-tensor,
+                    the channel number is num_points * 2. (bin = 2).
+                attr_preds (list[Tensor]): Attribute scores for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * num_attrs.
+                centernesses (list[Tensor]): Centerness for each scale level,
+                    each is a 4D-tensor, the channel number is num_points * 1.
+        """
+        # Note: we use [:5] to filter feats and only return predictions
+        return multi_apply(self.forward_single, x, self.scales,
+                           self.strides)[:5]
+
+    def forward_single(self, x: Tensor, scale: Scale,
+                       stride: int) -> Tuple[Tensor, ...]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            stride (int): The corresponding stride for feature maps, only
+                used to normalize the bbox prediction when self.norm_on_bbox
+                is True.
+
+        Returns:
+            tuple: scores for each class, bbox and direction class
+                predictions, centerness predictions of input feature maps.
+        """
+        cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \
+            super().forward_single(x)
+
+        if self.centerness_on_reg:
+            clone_reg_feat = reg_feat.clone()
+            for conv_centerness_prev_layer in self.conv_centerness_prev:
+                clone_reg_feat = conv_centerness_prev_layer(clone_reg_feat)
+            centerness = self.conv_centerness(clone_reg_feat)
+        else:
+            clone_cls_feat = cls_feat.clone()
+            for conv_centerness_prev_layer in self.conv_centerness_prev:
+                clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat)
+            centerness = self.conv_centerness(clone_cls_feat)
+
+        bbox_pred = self.bbox_coder.decode(bbox_pred, scale, stride,
+                                           self.training, cls_score)
+
+        return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \
+            cls_feat, reg_feat
+
+    @staticmethod
+    def add_sin_difference(boxes1: Tensor,
+                           boxes2: Tensor) -> Tuple[Tensor, Tensor]:
+        """Convert the rotation difference to difference in sine function.
+
+        Args:
+            boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7
+                and the 7th dimension is rotation dimension.
+            boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and
+                the 7th dimension is rotation dimension.
+
+        Returns:
+            tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th
+                dimensions are changed.
+        """
+        rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos(
+            boxes2[..., 6:7])
+        rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[...,
+                                                                         6:7])
+        boxes1 = torch.cat(
+            [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1)
+        boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]],
+                           dim=-1)
+        return boxes1, boxes2
+
+    @staticmethod
+    def get_direction_target(reg_targets: Tensor,
+                             dir_offset: int = 0,
+                             dir_limit_offset: float = 0.0,
+                             num_bins: int = 2,
+                             one_hot: bool = True) -> Tensor:
+        """Encode direction to 0 ~ num_bins-1.
+
+        Args:
+            reg_targets (torch.Tensor): Bbox regression targets.
+            dir_offset (int, optional): Direction offset. Default to 0.
+            dir_limit_offset (float, optional): Offset to set the direction
+                range. Default to 0.0.
+            num_bins (int, optional): Number of bins to divide 2*PI.
+                Default to 2.
+            one_hot (bool, optional): Whether to encode as one hot.
+                Default to True.
+
+        Returns:
+            torch.Tensor: Encoded direction targets.
+        """
+        rot_gt = reg_targets[..., 6]
+        offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset,
+                                  2 * np.pi)
+        dir_cls_targets = torch.floor(offset_rot /
+                                      (2 * np.pi / num_bins)).long()
+        dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)
+        if one_hot:
+            dir_targets = torch.zeros(
+                *list(dir_cls_targets.shape),
+                num_bins,
+                dtype=reg_targets.dtype,
+                device=dir_cls_targets.device)
+            dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0)
+            dir_cls_targets = dir_targets
+        return dir_cls_targets
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            dir_cls_preds: List[Tensor],
+            attr_preds: List[Tensor],
+            centernesses: List[Tensor],
+            batch_gt_instances_3d: InstanceList,
+            batch_gt_instacnes: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * bbox_code_size.
+            dir_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * 2. (bin = 2)
+            attr_preds (list[Tensor]): Attribute scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_attrs.
+            centernesses (list[Tensor]): Centerness for each scale level, each
+                is a 4D-tensor, the channel number is num_points * 1.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、`
+                `labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes``、``labels``.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(centernesses) == len(
+            attr_preds)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
+                                           bbox_preds[0].device)
+        labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \
+            self.get_targets(all_level_points, batch_gt_instances_3d,
+                             batch_gt_instacnes)
+
+        num_imgs = cls_scores[0].size(0)
+        # flatten cls_scores, bbox_preds, dir_cls_preds and centerness
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims))
+            for bbox_pred in bbox_preds
+        ]
+        flatten_dir_cls_preds = [
+            dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2)
+            for dir_cls_pred in dir_cls_preds
+        ]
+        flatten_centerness = [
+            centerness.permute(0, 2, 3, 1).reshape(-1)
+            for centerness in centernesses
+        ]
+        flatten_cls_scores = torch.cat(flatten_cls_scores)
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
+        flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds)
+        flatten_centerness = torch.cat(flatten_centerness)
+        flatten_labels_3d = torch.cat(labels_3d)
+        flatten_bbox_targets_3d = torch.cat(bbox_targets_3d)
+        flatten_centerness_targets = torch.cat(centerness_targets)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((flatten_labels_3d >= 0)
+                    & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1)
+        num_pos = len(pos_inds)
+
+        loss_cls = self.loss_cls(
+            flatten_cls_scores,
+            flatten_labels_3d,
+            avg_factor=num_pos + num_imgs)  # avoid num_pos is 0
+
+        pos_bbox_preds = flatten_bbox_preds[pos_inds]
+        pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds]
+        pos_centerness = flatten_centerness[pos_inds]
+
+        if self.pred_attrs:
+            flatten_attr_preds = [
+                attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs)
+                for attr_pred in attr_preds
+            ]
+            flatten_attr_preds = torch.cat(flatten_attr_preds)
+            flatten_attr_targets = torch.cat(attr_targets)
+            pos_attr_preds = flatten_attr_preds[pos_inds]
+
+        if num_pos > 0:
+            pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds]
+            pos_centerness_targets = flatten_centerness_targets[pos_inds]
+            if self.pred_attrs:
+                pos_attr_targets = flatten_attr_targets[pos_inds]
+            bbox_weights = pos_centerness_targets.new_ones(
+                len(pos_centerness_targets), sum(self.group_reg_dims))
+            equal_weights = pos_centerness_targets.new_ones(
+                pos_centerness_targets.shape)
+
+            code_weight = self.train_cfg.get('code_weight', None)
+            if code_weight:
+                assert len(code_weight) == sum(self.group_reg_dims)
+                bbox_weights = bbox_weights * bbox_weights.new_tensor(
+                    code_weight)
+
+            if self.use_direction_classifier:
+                pos_dir_cls_targets = self.get_direction_target(
+                    pos_bbox_targets_3d,
+                    self.dir_offset,
+                    self.dir_limit_offset,
+                    one_hot=False)
+
+            if self.diff_rad_by_sin:
+                pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference(
+                    pos_bbox_preds, pos_bbox_targets_3d)
+
+            loss_offset = self.loss_bbox(
+                pos_bbox_preds[:, :2],
+                pos_bbox_targets_3d[:, :2],
+                weight=bbox_weights[:, :2],
+                avg_factor=equal_weights.sum())
+            loss_depth = self.loss_bbox(
+                pos_bbox_preds[:, 2],
+                pos_bbox_targets_3d[:, 2],
+                weight=bbox_weights[:, 2],
+                avg_factor=equal_weights.sum())
+            loss_size = self.loss_bbox(
+                pos_bbox_preds[:, 3:6],
+                pos_bbox_targets_3d[:, 3:6],
+                weight=bbox_weights[:, 3:6],
+                avg_factor=equal_weights.sum())
+            loss_rotsin = self.loss_bbox(
+                pos_bbox_preds[:, 6],
+                pos_bbox_targets_3d[:, 6],
+                weight=bbox_weights[:, 6],
+                avg_factor=equal_weights.sum())
+            loss_velo = None
+            if self.pred_velo:
+                loss_velo = self.loss_bbox(
+                    pos_bbox_preds[:, 7:9],
+                    pos_bbox_targets_3d[:, 7:9],
+                    weight=bbox_weights[:, 7:9],
+                    avg_factor=equal_weights.sum())
+
+            loss_centerness = self.loss_centerness(pos_centerness,
+                                                   pos_centerness_targets)
+
+            # direction classification loss
+            loss_dir = None
+            # TODO: add more check for use_direction_classifier
+            if self.use_direction_classifier:
+                loss_dir = self.loss_dir(
+                    pos_dir_cls_preds,
+                    pos_dir_cls_targets,
+                    equal_weights,
+                    avg_factor=equal_weights.sum())
+
+            # attribute classification loss
+            loss_attr = None
+            if self.pred_attrs:
+                loss_attr = self.loss_attr(
+                    pos_attr_preds,
+                    pos_attr_targets,
+                    pos_centerness_targets,
+                    avg_factor=pos_centerness_targets.sum())
+
+        else:
+            # need absolute due to possible negative delta x/y
+            loss_offset = pos_bbox_preds[:, :2].sum()
+            loss_depth = pos_bbox_preds[:, 2].sum()
+            loss_size = pos_bbox_preds[:, 3:6].sum()
+            loss_rotsin = pos_bbox_preds[:, 6].sum()
+            loss_velo = None
+            if self.pred_velo:
+                loss_velo = pos_bbox_preds[:, 7:9].sum()
+            loss_centerness = pos_centerness.sum()
+            loss_dir = None
+            if self.use_direction_classifier:
+                loss_dir = pos_dir_cls_preds.sum()
+            loss_attr = None
+            if self.pred_attrs:
+                loss_attr = pos_attr_preds.sum()
+
+        loss_dict = dict(
+            loss_cls=loss_cls,
+            loss_offset=loss_offset,
+            loss_depth=loss_depth,
+            loss_size=loss_size,
+            loss_rotsin=loss_rotsin,
+            loss_centerness=loss_centerness)
+
+        if loss_velo is not None:
+            loss_dict['loss_velo'] = loss_velo
+
+        if loss_dir is not None:
+            loss_dict['loss_dir'] = loss_dir
+
+        if loss_attr is not None:
+            loss_dict['loss_attr'] = loss_attr
+
+        return loss_dict
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        dir_cls_preds: List[Tensor],
+                        attr_preds: List[Tensor],
+                        centernesses: List[Tensor],
+                        batch_img_metas: Optional[List[dict]] = None,
+                        cfg: OptConfigType = None,
+                        rescale: bool = False) -> InstanceList:
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_points * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_points * 4, H, W)
+            dir_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * 2. (bin = 2)
+            attr_preds (list[Tensor]): Attribute scores for each scale level
+                Has shape (N, num_points * num_attrs, H, W)
+            centernesses (list[Tensor]): Centerness for each scale level with
+                shape (N, num_points * 1, H, W)
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes_3d (Tensor): Contains a tensor with shape
+                  (num_instances, C), where C >= 7.
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \
+            len(centernesses) == len(attr_preds)
+        num_levels = len(cls_scores)
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        # TODO: refactor using prior_generator
+        mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
+                                      bbox_preds[0].device)
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            cls_score_list = select_single_mlvl(cls_scores, img_id)
+            bbox_pred_list = select_single_mlvl(bbox_preds, img_id)
+
+            if self.use_direction_classifier:
+                dir_cls_pred_list = select_single_mlvl(dir_cls_preds, img_id)
+            else:
+                dir_cls_pred_list = [
+                    cls_scores[i][img_id].new_full(
+                        [2, *cls_scores[i][img_id].shape[1:]], 0).detach()
+                    for i in range(num_levels)
+                ]
+
+            if self.pred_attrs:
+                attr_pred_list = select_single_mlvl(attr_preds, img_id)
+            else:
+                attr_pred_list = [
+                    cls_scores[i][img_id].new_full(
+                        [self.num_attrs, *cls_scores[i][img_id].shape[1:]],
+                        self.attr_background_label).detach()
+                    for i in range(num_levels)
+                ]
+
+            centerness_pred_list = select_single_mlvl(centernesses, img_id)
+            results = self._predict_by_feat_single(
+                cls_score_list=cls_score_list,
+                bbox_pred_list=bbox_pred_list,
+                dir_cls_pred_list=dir_cls_pred_list,
+                attr_pred_list=attr_pred_list,
+                centerness_pred_list=centerness_pred_list,
+                mlvl_points=mlvl_points,
+                img_meta=img_meta,
+                cfg=cfg,
+                rescale=rescale)
+            result_list.append(results)
+        result_list_2d = None
+        return result_list, result_list_2d
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                dir_cls_pred_list: List[Tensor],
+                                attr_pred_list: List[Tensor],
+                                centerness_pred_list: List[Tensor],
+                                mlvl_points: Tensor,
+                                img_meta: dict,
+                                cfg: ConfigType,
+                                rescale: bool = False) -> InstanceData:
+        """Transform outputs for a single batch item into bbox predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for a single scale level
+                Has shape (num_points * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for a single scale
+                level with shape (num_points * bbox_code_size, H, W).
+            dir_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on a single scale level with shape
+                (num_points * 2, H, W)
+            attr_preds (list[Tensor]): Attribute scores for each scale level
+                Has shape (N, num_points * num_attrs, H, W)
+            centernesses (list[Tensor]): Centerness for a single scale level
+                with shape (num_points, H, W).
+            mlvl_points (list[Tensor]): Box reference for a single scale level
+                with shape (num_total_points, 2).
+            img_meta (dict): Metadata of input image.
+            cfg (mmengine.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+
+        Returns:
+            :obj:`InstanceData`: 3D Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes_3d (Tensor): Contains a tensor with shape
+                  (num_instances, C), where C >= 7.
+        """
+        view = np.array(img_meta['cam2img'])
+        scale_factor = img_meta['scale_factor']
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_points)
+        mlvl_centers_2d = []
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_dir_scores = []
+        mlvl_attr_scores = []
+        mlvl_centerness = []
+
+        for cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \
+                points in zip(cls_score_list, bbox_pred_list,
+                              dir_cls_pred_list, attr_pred_list,
+                              centerness_pred_list, mlvl_points):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            scores = cls_score.permute(1, 2, 0).reshape(
+                -1, self.cls_out_channels).sigmoid()
+            dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
+            dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
+            attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs)
+            attr_score = torch.max(attr_pred, dim=-1)[1]
+            centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()
+
+            bbox_pred = bbox_pred.permute(1, 2,
+                                          0).reshape(-1,
+                                                     sum(self.group_reg_dims))
+            bbox_pred = bbox_pred[:, :self.bbox_code_size]
+            nms_pre = cfg.get('nms_pre', -1)
+            if nms_pre > 0 and scores.shape[0] > nms_pre:
+                max_scores, _ = (scores * centerness[:, None]).max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                points = points[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+                dir_cls_pred = dir_cls_pred[topk_inds, :]
+                centerness = centerness[topk_inds]
+                dir_cls_score = dir_cls_score[topk_inds]
+                attr_score = attr_score[topk_inds]
+            # change the offset to actual center predictions
+            bbox_pred[:, :2] = points - bbox_pred[:, :2]
+            if rescale:
+                bbox_pred[:, :2] /= bbox_pred[:, :2].new_tensor(scale_factor)
+            pred_center2d = bbox_pred[:, :3].clone()
+            bbox_pred[:, :3] = points_img2cam(bbox_pred[:, :3], view)
+            mlvl_centers_2d.append(pred_center2d)
+            mlvl_bboxes.append(bbox_pred)
+            mlvl_scores.append(scores)
+            mlvl_dir_scores.append(dir_cls_score)
+            mlvl_attr_scores.append(attr_score)
+            mlvl_centerness.append(centerness)
+
+        mlvl_centers_2d = torch.cat(mlvl_centers_2d)
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        mlvl_dir_scores = torch.cat(mlvl_dir_scores)
+
+        # change local yaw to global yaw for 3D nms
+        cam2img = mlvl_centers_2d.new_zeros((4, 4))
+        cam2img[:view.shape[0], :view.shape[1]] = \
+            mlvl_centers_2d.new_tensor(view)
+        mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers_2d,
+                                                 mlvl_dir_scores,
+                                                 self.dir_offset, cam2img)
+
+        mlvl_bboxes_for_nms = xywhr2xyxyr(img_meta['box_type_3d'](
+            mlvl_bboxes, box_dim=self.bbox_code_size,
+            origin=(0.5, 0.5, 0.5)).bev)
+
+        mlvl_scores = torch.cat(mlvl_scores)
+        padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+        # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
+        # BG cat_id: num_class
+        mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
+        mlvl_attr_scores = torch.cat(mlvl_attr_scores)
+        mlvl_centerness = torch.cat(mlvl_centerness)
+        # no scale_factors in box3d_multiclass_nms
+        # Then we multiply it from outside
+        mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None]
+        results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
+                                       mlvl_nms_scores, cfg.score_thr,
+                                       cfg.max_per_img, cfg, mlvl_dir_scores,
+                                       mlvl_attr_scores)
+        bboxes, scores, labels, dir_scores, attrs = results
+        attrs = attrs.to(labels.dtype)  # change data type to int
+        bboxes = img_meta['box_type_3d'](
+            bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
+        # Note that the predictions use origin (0.5, 0.5, 0.5)
+        # Due to the ground truth centers_2d are the gravity center of objects
+        # v0.10.0 fix inplace operation to the input tensor of cam_box3d
+        # So here we also need to add origin=(0.5, 0.5, 0.5)
+
+        results = InstanceData()
+        results.bboxes_3d = bboxes
+        results.scores_3d = scores
+        results.labels_3d = labels
+        if self.pred_attrs and attrs is not None:
+            results.attr_labels = attrs
+
+        return results
+
+    def _get_points_single(self,
+                           featmap_size: Tuple[int],
+                           stride: int,
+                           dtype: torch.dtype,
+                           device: torch.device,
+                           flatten: bool = False) -> Tensor:
+        """Get points of a single scale level.
+
+        Args:
+            featmap_size (tuple[int]): Single scale level feature map size.
+            stride (int): Downsample factor of the feature map.
+            dtype (torch.dtype): Type of points.
+            device (torch.device): Device of points.
+            flatten (bool): Whether to flatten the tensor.
+                Defaults to False.
+
+        Returns:
+            Tensor: points of each image.
+        """
+        y, x = super()._get_points_single(featmap_size, stride, dtype, device)
+        points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride),
+                             dim=-1) + stride // 2
+        return points
+
+    def get_targets(
+        self,
+        points: List[Tensor],
+        batch_gt_instances_3d: InstanceList,
+        batch_gt_instances: InstanceList,
+    ) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]:
+        """Compute regression, classification and centerss targets for points
+        in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、
+                ``labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes``、``labels``.
+
+        Returns:
+            tuple: Targets of each level.
+
+            - concat_lvl_labels_3d (list[Tensor]): 3D Labels of each level.
+            - concat_lvl_bbox_targets_3d (list[Tensor]): 3D BBox targets of
+                each level.
+            - concat_lvl_centerness_targets (list[Tensor]): Centerness targets
+                of each level.
+            - concat_lvl_attr_targets (list[Tensor]): Attribute targets of
+                each level.
+        """
+        assert len(points) == len(self.regress_ranges)
+        num_levels = len(points)
+        # expand regress ranges to align with points
+        expanded_regress_ranges = [
+            points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
+                points[i]) for i in range(num_levels)
+        ]
+        # concat all levels points and regress ranges
+        concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
+        concat_points = torch.cat(points, dim=0)
+
+        # the number of points per img, per lvl
+        num_points = [center.size(0) for center in points]
+
+        if 'attr_labels' not in batch_gt_instances_3d[0]:
+            for gt_instances_3d in batch_gt_instances_3d:
+                gt_instances_3d.attr_labels = \
+                    gt_instances_3d.labels_3d.new_full(
+                        gt_instances_3d.labels_3d.shape,
+                        self.attr_background_label
+                    )
+
+        # get labels and bbox_targets of each image
+        _, _, labels_3d_list, bbox_targets_3d_list, centerness_targets_list, \
+            attr_targets_list = multi_apply(
+                self._get_target_single,
+                batch_gt_instances_3d,
+                batch_gt_instances,
+                points=concat_points,
+                regress_ranges=concat_regress_ranges,
+                num_points_per_lvl=num_points)
+
+        # split to per img, per level
+        labels_3d_list = [
+            labels_3d.split(num_points, 0) for labels_3d in labels_3d_list
+        ]
+        bbox_targets_3d_list = [
+            bbox_targets_3d.split(num_points, 0)
+            for bbox_targets_3d in bbox_targets_3d_list
+        ]
+        centerness_targets_list = [
+            centerness_targets.split(num_points, 0)
+            for centerness_targets in centerness_targets_list
+        ]
+        attr_targets_list = [
+            attr_targets.split(num_points, 0)
+            for attr_targets in attr_targets_list
+        ]
+
+        # concat per level image
+        concat_lvl_labels_3d = []
+        concat_lvl_bbox_targets_3d = []
+        concat_lvl_centerness_targets = []
+        concat_lvl_attr_targets = []
+        for i in range(num_levels):
+            concat_lvl_labels_3d.append(
+                torch.cat([labels[i] for labels in labels_3d_list]))
+            concat_lvl_centerness_targets.append(
+                torch.cat([
+                    centerness_targets[i]
+                    for centerness_targets in centerness_targets_list
+                ]))
+            bbox_targets_3d = torch.cat([
+                bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list
+            ])
+            concat_lvl_attr_targets.append(
+                torch.cat(
+                    [attr_targets[i] for attr_targets in attr_targets_list]))
+            if self.norm_on_bbox:
+                bbox_targets_3d[:, :
+                                2] = bbox_targets_3d[:, :2] / self.strides[i]
+            concat_lvl_bbox_targets_3d.append(bbox_targets_3d)
+        return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \
+            concat_lvl_centerness_targets, concat_lvl_attr_targets
+
+    def _get_target_single(
+            self, gt_instances_3d: InstanceData, gt_instances: InstanceData,
+            points: Tensor, regress_ranges: Tensor,
+            num_points_per_lvl: List[int]) -> Tuple[Tensor, ...]:
+        """Compute regression and classification targets for a single image."""
+        num_points = points.size(0)
+        num_gts = len(gt_instances_3d)
+        gt_bboxes = gt_instances.bboxes
+        gt_labels = gt_instances.labels
+        gt_bboxes_3d = gt_instances_3d.bboxes_3d
+        gt_labels_3d = gt_instances_3d.labels_3d
+        centers_2d = gt_instances_3d.centers_2d
+        depths = gt_instances_3d.depths
+        attr_labels = gt_instances_3d.attr_labels
+
+        if not isinstance(gt_bboxes_3d, torch.Tensor):
+            gt_bboxes_3d = gt_bboxes_3d.tensor.to(gt_bboxes.device)
+        if num_gts == 0:
+            return gt_labels.new_full((num_points,), self.background_label), \
+                   gt_bboxes.new_zeros((num_points, 4)), \
+                   gt_labels_3d.new_full(
+                       (num_points,), self.background_label), \
+                   gt_bboxes_3d.new_zeros((num_points, self.bbox_code_size)), \
+                   gt_bboxes_3d.new_zeros((num_points,)), \
+                   attr_labels.new_full(
+                       (num_points,), self.attr_background_label)
+
+        # change orientation to local yaw
+        gt_bboxes_3d[..., 6] = -torch.atan2(
+            gt_bboxes_3d[..., 0], gt_bboxes_3d[..., 2]) + gt_bboxes_3d[..., 6]
+
+        areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
+            gt_bboxes[:, 3] - gt_bboxes[:, 1])
+        areas = areas[None].repeat(num_points, 1)
+        regress_ranges = regress_ranges[:, None, :].expand(
+            num_points, num_gts, 2)
+        gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
+        centers_2d = centers_2d[None].expand(num_points, num_gts, 2)
+        gt_bboxes_3d = gt_bboxes_3d[None].expand(num_points, num_gts,
+                                                 self.bbox_code_size)
+        depths = depths[None, :, None].expand(num_points, num_gts, 1)
+        xs, ys = points[:, 0], points[:, 1]
+        xs = xs[:, None].expand(num_points, num_gts)
+        ys = ys[:, None].expand(num_points, num_gts)
+
+        delta_xs = (xs - centers_2d[..., 0])[..., None]
+        delta_ys = (ys - centers_2d[..., 1])[..., None]
+        bbox_targets_3d = torch.cat(
+            (delta_xs, delta_ys, depths, gt_bboxes_3d[..., 3:]), dim=-1)
+
+        left = xs - gt_bboxes[..., 0]
+        right = gt_bboxes[..., 2] - xs
+        top = ys - gt_bboxes[..., 1]
+        bottom = gt_bboxes[..., 3] - ys
+        bbox_targets = torch.stack((left, top, right, bottom), -1)
+
+        assert self.center_sampling is True, 'Setting center_sampling to '\
+            'False has not been implemented for FCOS3D.'
+        # condition1: inside a `center bbox`
+        radius = self.center_sample_radius
+        center_xs = centers_2d[..., 0]
+        center_ys = centers_2d[..., 1]
+        center_gts = torch.zeros_like(gt_bboxes)
+        stride = center_xs.new_zeros(center_xs.shape)
+
+        # project the points on current lvl back to the `original` sizes
+        lvl_begin = 0
+        for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl):
+            lvl_end = lvl_begin + num_points_lvl
+            stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius
+            lvl_begin = lvl_end
+
+        center_gts[..., 0] = center_xs - stride
+        center_gts[..., 1] = center_ys - stride
+        center_gts[..., 2] = center_xs + stride
+        center_gts[..., 3] = center_ys + stride
+
+        cb_dist_left = xs - center_gts[..., 0]
+        cb_dist_right = center_gts[..., 2] - xs
+        cb_dist_top = ys - center_gts[..., 1]
+        cb_dist_bottom = center_gts[..., 3] - ys
+        center_bbox = torch.stack(
+            (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1)
+        inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
+
+        # condition2: limit the regression range for each location
+        max_regress_distance = bbox_targets.max(-1)[0]
+        inside_regress_range = (
+            (max_regress_distance >= regress_ranges[..., 0])
+            & (max_regress_distance <= regress_ranges[..., 1]))
+
+        # center-based criterion to deal with ambiguity
+        dists = torch.sqrt(torch.sum(bbox_targets_3d[..., :2]**2, dim=-1))
+        dists[inside_gt_bbox_mask == 0] = INF
+        dists[inside_regress_range == 0] = INF
+        min_dist, min_dist_inds = dists.min(dim=1)
+
+        labels = gt_labels[min_dist_inds]
+        labels_3d = gt_labels_3d[min_dist_inds]
+        attr_labels = attr_labels[min_dist_inds]
+        labels[min_dist == INF] = self.background_label  # set as BG
+        labels_3d[min_dist == INF] = self.background_label  # set as BG
+        attr_labels[min_dist == INF] = self.attr_background_label
+
+        bbox_targets = bbox_targets[range(num_points), min_dist_inds]
+        bbox_targets_3d = bbox_targets_3d[range(num_points), min_dist_inds]
+        relative_dists = torch.sqrt(
+            torch.sum(bbox_targets_3d[..., :2]**2,
+                      dim=-1)) / (1.414 * stride[:, 0])
+        # [N, 1] / [N, 1]
+        centerness_targets = torch.exp(-self.centerness_alpha * relative_dists)
+
+        return labels, bbox_targets, labels_3d, bbox_targets_3d, \
+            centerness_targets, attr_labels
diff --git a/mmde/mmdet3d/models/dense_heads/free_anchor3d_head.py b/mmde/mmdet3d/models/dense_heads/free_anchor3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..21ac1e5aa6333770d5263c35ab38f1f02df91a60
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/free_anchor3d_head.py
@@ -0,0 +1,294 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List
+
+import torch
+from mmengine.device import get_device
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import bbox_overlaps_nearest_3d
+from mmdet3d.utils import InstanceList, OptInstanceList
+from .anchor3d_head import Anchor3DHead
+from .train_mixins import get_direction_target
+
+
+@MODELS.register_module()
+class FreeAnchor3DHead(Anchor3DHead):
+    r"""`FreeAnchor <https://arxiv.org/abs/1909.02466>`_ head for 3D detection.
+
+    Note:
+        This implementation is directly modified from the `mmdet implementation
+        <https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/free_anchor_retina_head.py>`_.
+        We find it also works on 3D detection with minor modification, i.e.,
+        different hyper-parameters and a additional direction classifier.
+
+    Args:
+        pre_anchor_topk (int): Number of boxes that be token in each bag.
+        bbox_thr (float): The threshold of the saturated linear function. It is
+            usually the same with the IoU threshold used in NMS.
+        gamma (float): Gamma parameter in focal loss.
+        alpha (float): Alpha parameter in focal loss.
+        kwargs (dict): Other arguments are the same as those in :class:`Anchor3DHead`.
+    """  # noqa: E501
+
+    def __init__(self,
+                 pre_anchor_topk: int = 50,
+                 bbox_thr: float = 0.6,
+                 gamma: float = 2.0,
+                 alpha: float = 0.5,
+                 init_cfg: dict = None,
+                 **kwargs) -> None:
+        super().__init__(init_cfg=init_cfg, **kwargs)
+        self.pre_anchor_topk = pre_anchor_topk
+        self.bbox_thr = bbox_thr
+        self.gamma = gamma
+        self.alpha = alpha
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            dir_cls_preds: List[Tensor],
+            batch_gt_instances_3d: InstanceList,
+            batch_input_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> Dict:
+        """Calculate loss of FreeAnchor head.
+
+        Args:
+            cls_scores (list[torch.Tensor]): Classification scores of
+                different samples.
+            bbox_preds (list[torch.Tensor]): Box predictions of
+                different samples
+            dir_cls_preds (list[torch.Tensor]): Direction predictions of
+                different samples
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_input_metas (list[dict]): Contain pcd and img's meta info.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, torch.Tensor]: Loss items.
+
+                - positive_bag_loss (torch.Tensor): Loss of positive samples.
+                - negative_bag_loss (torch.Tensor): Loss of negative samples.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = get_device()
+        anchor_list = self.get_anchors(featmap_sizes, batch_input_metas,
+                                       device)
+        mlvl_anchors = [torch.cat(anchor) for anchor in anchor_list]
+
+        # concatenate each level
+        cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(
+                cls_score.size(0), -1, self.num_classes)
+            for cls_score in cls_scores
+        ]
+        bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(
+                bbox_pred.size(0), -1, self.box_code_size)
+            for bbox_pred in bbox_preds
+        ]
+        dir_cls_preds = [
+            dir_cls_pred.permute(0, 2, 3,
+                                 1).reshape(dir_cls_pred.size(0), -1, 2)
+            for dir_cls_pred in dir_cls_preds
+        ]
+
+        cls_scores = torch.cat(cls_scores, dim=1)
+        bbox_preds = torch.cat(bbox_preds, dim=1)
+        dir_cls_preds = torch.cat(dir_cls_preds, dim=1)
+
+        cls_probs = torch.sigmoid(cls_scores)
+        box_prob = []
+        num_pos = 0
+        positive_losses = []
+        for _, (anchors, gt_instance_3d, cls_prob, bbox_pred,
+                dir_cls_pred) in enumerate(
+                    zip(mlvl_anchors, batch_gt_instances_3d, cls_probs,
+                        bbox_preds, dir_cls_preds)):
+
+            gt_bboxes = gt_instance_3d.bboxes_3d.tensor.to(anchors.device)
+            gt_labels = gt_instance_3d.labels_3d.to(anchors.device)
+            with torch.no_grad():
+                # box_localization: a_{j}^{loc}, shape: [j, 4]
+                pred_boxes = self.bbox_coder.decode(anchors, bbox_pred)
+
+                # object_box_iou: IoU_{ij}^{loc}, shape: [i, j]
+                object_box_iou = bbox_overlaps_nearest_3d(
+                    gt_bboxes, pred_boxes)
+
+                # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j]
+                t1 = self.bbox_thr
+                t2 = object_box_iou.max(
+                    dim=1, keepdim=True).values.clamp(min=t1 + 1e-6)
+                object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp(
+                    min=0, max=1)
+
+                # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j]
+                num_obj = gt_labels.size(0)
+                indices = torch.stack(
+                    [torch.arange(num_obj).type_as(gt_labels), gt_labels],
+                    dim=0)
+
+                object_cls_box_prob = torch.sparse_coo_tensor(
+                    indices, object_box_prob)
+
+                # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j]
+                """
+                from "start" to "end" implement:
+                image_box_iou = torch.sparse.max(object_cls_box_prob,
+                                                 dim=0).t()
+
+                """
+                # start
+                box_cls_prob = torch.sparse.sum(
+                    object_cls_box_prob, dim=0).to_dense()
+
+                indices = torch.nonzero(box_cls_prob, as_tuple=False).t_()
+                if indices.numel() == 0:
+                    image_box_prob = torch.zeros(
+                        anchors.size(0),
+                        self.num_classes).type_as(object_box_prob)
+                else:
+                    nonzero_box_prob = torch.where(
+                        (gt_labels.unsqueeze(dim=-1) == indices[0]),
+                        object_box_prob[:, indices[1]],
+                        torch.tensor(
+                            [0]).type_as(object_box_prob)).max(dim=0).values
+
+                    # upmap to shape [j, c]
+                    image_box_prob = torch.sparse_coo_tensor(
+                        indices.flip([0]),
+                        nonzero_box_prob,
+                        size=(anchors.size(0), self.num_classes)).to_dense()
+                # end
+
+                box_prob.append(image_box_prob)
+
+            # construct bags for objects
+            match_quality_matrix = bbox_overlaps_nearest_3d(gt_bboxes, anchors)
+            _, matched = torch.topk(
+                match_quality_matrix,
+                self.pre_anchor_topk,
+                dim=1,
+                sorted=False)
+            del match_quality_matrix
+
+            # matched_cls_prob: P_{ij}^{cls}
+            matched_cls_prob = torch.gather(
+                cls_prob[matched], 2,
+                gt_labels.view(-1, 1, 1).repeat(1, self.pre_anchor_topk,
+                                                1)).squeeze(2)
+
+            # matched_box_prob: P_{ij}^{loc}
+            matched_anchors = anchors[matched]
+            matched_object_targets = self.bbox_coder.encode(
+                matched_anchors,
+                gt_bboxes.unsqueeze(dim=1).expand_as(matched_anchors))
+
+            # direction classification loss
+            loss_dir = None
+            if self.use_direction_classifier:
+                # also calculate direction prob: P_{ij}^{dir}
+                matched_dir_targets = get_direction_target(
+                    matched_anchors,
+                    matched_object_targets,
+                    self.dir_offset,
+                    self.dir_limit_offset,
+                    one_hot=False)
+                loss_dir = self.loss_dir(
+                    dir_cls_pred[matched].transpose(-2, -1),
+                    matched_dir_targets,
+                    reduction_override='none')
+
+            # generate bbox weights
+            if self.diff_rad_by_sin:
+                bbox_preds_clone = bbox_pred.clone()
+                bbox_preds_clone[matched], matched_object_targets = \
+                    self.add_sin_difference(
+                        bbox_preds_clone[matched], matched_object_targets)
+            bbox_weights = matched_anchors.new_ones(matched_anchors.size())
+            # Use pop is not right, check performance
+            code_weight = self.train_cfg.get('code_weight', None)
+            if code_weight:
+                bbox_weights = bbox_weights * bbox_weights.new_tensor(
+                    code_weight)
+            loss_bbox = self.loss_bbox(
+                bbox_preds_clone[matched],
+                matched_object_targets,
+                bbox_weights,
+                reduction_override='none').sum(-1)
+
+            if loss_dir is not None:
+                loss_bbox += loss_dir
+            matched_box_prob = torch.exp(-loss_bbox)
+
+            # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )}
+            num_pos += len(gt_bboxes)
+            positive_losses.append(
+                self.positive_bag_loss(matched_cls_prob, matched_box_prob))
+
+        positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos)
+
+        # box_prob: P{a_{j} \in A_{+}}
+        box_prob = torch.stack(box_prob, dim=0)
+
+        # negative_loss:
+        # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B||
+        negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max(
+            1, num_pos * self.pre_anchor_topk)
+
+        losses = {
+            'positive_bag_loss': positive_loss,
+            'negative_bag_loss': negative_loss
+        }
+        return losses
+
+    def positive_bag_loss(self, matched_cls_prob: Tensor,
+                          matched_box_prob: Tensor) -> Tensor:
+        """Generate positive bag loss.
+
+        Args:
+            matched_cls_prob (torch.Tensor): Classification probability
+                of matched positive samples.
+            matched_box_prob (torch.Tensor): Bounding box probability
+                of matched positive samples.
+
+        Returns:
+            torch.Tensor: Loss of positive samples.
+        """
+        # bag_prob = Mean-max(matched_prob)
+        matched_prob = matched_cls_prob * matched_box_prob
+        weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None)
+        weight /= weight.sum(dim=1).unsqueeze(dim=-1)
+        bag_prob = (weight * matched_prob).sum(dim=1)
+        # positive_bag_loss = -self.alpha * log(bag_prob)
+        bag_prob = bag_prob.clamp(0, 1)  # to avoid bug of BCE, check
+        return self.alpha * F.binary_cross_entropy(
+            bag_prob, torch.ones_like(bag_prob), reduction='none')
+
+    def negative_bag_loss(self, cls_prob: Tensor, box_prob: Tensor) -> Tensor:
+        """Generate negative bag loss.
+
+        Args:
+            cls_prob (torch.Tensor): Classification probability
+                of negative samples.
+            box_prob (torch.Tensor): Bounding box probability
+                of negative samples.
+
+        Returns:
+            torch.Tensor: Loss of negative samples.
+        """
+        prob = cls_prob * (1 - box_prob)
+        prob = prob.clamp(0, 1)  # to avoid bug of BCE, check
+        negative_bag_loss = prob**self.gamma * F.binary_cross_entropy(
+            prob, torch.zeros_like(prob), reduction='none')
+        return (1 - self.alpha) * negative_bag_loss
diff --git a/mmde/mmdet3d/models/dense_heads/groupfree3d_head.py b/mmde/mmdet3d/models/dense_heads/groupfree3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..06b6688535d313bc96525822e38349caf079483e
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/groupfree3d_head.py
@@ -0,0 +1,1108 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmcv.cnn import ConvModule
+from mmcv.cnn.bricks.transformer import (build_positional_encoding,
+                                         build_transformer_layer)
+from mmcv.ops import PointsSampler as Points_Sampler
+from mmcv.ops import gather_points
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule, xavier_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.models.layers import aligned_3d_nms
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures import BaseInstance3DBoxes, Det3DDataSample
+from mmdet3d.structures.det3d_data_sample import SampleList
+from .base_conv_bbox_head import BaseConvBboxHead
+
+EPS = 1e-6
+
+
+class PointsObjClsModule(BaseModule):
+    """object candidate point prediction from seed point features.
+
+    Args:
+        in_channel (int): number of channels of seed point features.
+        num_convs (int, optional): number of conv layers.
+            Default: 3.
+        conv_cfg (dict, optional): Config of convolution.
+            Default: dict(type='Conv1d').
+        norm_cfg (dict, optional): Config of normalization.
+            Default: dict(type='BN1d').
+        act_cfg (dict, optional): Config of activation.
+            Default: dict(type='ReLU').
+    """
+
+    def __init__(self,
+                 in_channel: int,
+                 num_convs: int = 3,
+                 conv_cfg: dict = dict(type='Conv1d'),
+                 norm_cfg: dict = dict(type='BN1d'),
+                 act_cfg: dict = dict(type='ReLU'),
+                 init_cfg: Optional[dict] = None):
+        super().__init__(init_cfg=init_cfg)
+        conv_channels = [in_channel for _ in range(num_convs - 1)]
+        conv_channels.append(1)
+
+        self.mlp = nn.Sequential()
+        prev_channels = in_channel
+        for i in range(num_convs):
+            self.mlp.add_module(
+                f'layer{i}',
+                ConvModule(
+                    prev_channels,
+                    conv_channels[i],
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg if i < num_convs - 1 else None,
+                    act_cfg=act_cfg if i < num_convs - 1 else None,
+                    bias=True,
+                    inplace=True))
+            prev_channels = conv_channels[i]
+
+    def forward(self, seed_features):
+        """Forward pass.
+
+        Args:
+            seed_features (torch.Tensor): seed features, dims:
+                (batch_size, feature_dim, num_seed)
+
+        Returns:
+            torch.Tensor: objectness logits, dim:
+                (batch_size, 1, num_seed)
+        """
+        return self.mlp(seed_features)
+
+
+class GeneralSamplingModule(nn.Module):
+    """Sampling Points.
+
+    Sampling points with given index.
+    """
+
+    def forward(self, xyz: Tensor, features: Tensor,
+                sample_inds: Tensor) -> Tuple[Tensor]:
+        """Forward pass.
+
+        Args:
+            xyz (Tensor)： (B, N, 3) the coordinates of the features.
+            features (Tensor): (B, C, N) features to sample.
+            sample_inds (Tensor): (B, M) the given index,
+                where M is the number of points.
+
+        Returns:
+            Tensor: (B, M, 3) coordinates of sampled features
+            Tensor: (B, C, M) the sampled features.
+            Tensor: (B, M) the given index.
+        """
+        xyz_t = xyz.transpose(1, 2).contiguous()
+        new_xyz = gather_points(xyz_t, sample_inds).transpose(1,
+                                                              2).contiguous()
+        new_features = gather_points(features, sample_inds).contiguous()
+
+        return new_xyz, new_features, sample_inds
+
+
+@MODELS.register_module()
+class GroupFree3DHead(BaseModule):
+    r"""Bbox head of `Group-Free 3D <https://arxiv.org/abs/2104.00678>`_.
+
+    Args:
+        num_classes (int): The number of class.
+        in_channels (int): The dims of input features from backbone.
+        bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
+            decoding boxes.
+        num_decoder_layers (int): The number of transformer decoder layers.
+        transformerlayers (dict): Config for transformer decoder.
+        train_cfg (dict, optional): Config for training.
+        test_cfg (dict, optional): Config for testing.
+        num_proposal (int): The number of initial sampling candidates.
+        pred_layer_cfg (dict, optional): Config of classfication and regression
+            prediction layers.
+        size_cls_agnostic (bool): Whether the predicted size is class-agnostic.
+        gt_per_seed (int): the number of candidate instance each point belongs
+            to.
+        sampling_objectness_loss (dict, optional): Config of initial sampling
+            objectness loss.
+        objectness_loss (dict, optional): Config of objectness loss.
+        center_loss (dict, optional): Config of center loss.
+        dir_class_loss (dict, optional): Config of direction classification
+            loss.
+        dir_res_loss (dict, optional): Config of direction residual
+            regression loss.
+        size_class_loss (dict, optional): Config of size classification loss.
+        size_res_loss (dict, optional): Config of size residual
+            regression loss.
+        size_reg_loss (dict, optional): Config of class-agnostic size
+            regression loss.
+        semantic_loss (dict, optional): Config of point-wise semantic
+            segmentation loss.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 bbox_coder: dict,
+                 num_decoder_layers: int,
+                 transformerlayers: dict,
+                 decoder_self_posembeds: dict = dict(
+                     type='ConvBNPositionalEncoding',
+                     input_channel=6,
+                     num_pos_feats=288),
+                 decoder_cross_posembeds: dict = dict(
+                     type='ConvBNPositionalEncoding',
+                     input_channel=3,
+                     num_pos_feats=288),
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 num_proposal: int = 128,
+                 pred_layer_cfg: Optional[dict] = None,
+                 size_cls_agnostic: bool = True,
+                 gt_per_seed: int = 3,
+                 sampling_objectness_loss: Optional[dict] = None,
+                 objectness_loss: Optional[dict] = None,
+                 center_loss: Optional[dict] = None,
+                 dir_class_loss: Optional[dict] = None,
+                 dir_res_loss: Optional[dict] = None,
+                 size_class_loss: Optional[dict] = None,
+                 size_res_loss: Optional[dict] = None,
+                 size_reg_loss: Optional[dict] = None,
+                 semantic_loss: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None):
+        super(GroupFree3DHead, self).__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.num_proposal = num_proposal
+        self.in_channels = in_channels
+        self.num_decoder_layers = num_decoder_layers
+        self.size_cls_agnostic = size_cls_agnostic
+        self.gt_per_seed = gt_per_seed
+
+        # Transformer decoder layers
+        if isinstance(transformerlayers, dict):
+            transformerlayers = [
+                copy.deepcopy(transformerlayers)
+                for _ in range(num_decoder_layers)
+            ]
+        else:
+            assert isinstance(transformerlayers, list) and \
+                   len(transformerlayers) == num_decoder_layers
+        self.decoder_layers = nn.ModuleList()
+        for i in range(self.num_decoder_layers):
+            self.decoder_layers.append(
+                build_transformer_layer(transformerlayers[i]))
+        self.embed_dims = self.decoder_layers[0].embed_dims
+        assert self.embed_dims == decoder_self_posembeds['num_pos_feats']
+        assert self.embed_dims == decoder_cross_posembeds['num_pos_feats']
+
+        # bbox_coder
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.num_sizes = self.bbox_coder.num_sizes
+        self.num_dir_bins = self.bbox_coder.num_dir_bins
+
+        # Initial object candidate sampling
+        self.gsample_module = GeneralSamplingModule()
+        self.fps_module = Points_Sampler([self.num_proposal])
+        self.points_obj_cls = PointsObjClsModule(self.in_channels)
+
+        # initial candidate prediction
+        self.conv_pred = BaseConvBboxHead(
+            **pred_layer_cfg,
+            num_cls_out_channels=self._get_cls_out_channels(),
+            num_reg_out_channels=self._get_reg_out_channels())
+
+        # query proj and key proj
+        self.decoder_query_proj = nn.Conv1d(
+            self.embed_dims, self.embed_dims, kernel_size=1)
+        self.decoder_key_proj = nn.Conv1d(
+            self.embed_dims, self.embed_dims, kernel_size=1)
+
+        # query position embed
+        self.decoder_self_posembeds = nn.ModuleList()
+        for _ in range(self.num_decoder_layers):
+            self.decoder_self_posembeds.append(
+                build_positional_encoding(decoder_self_posembeds))
+        # key position embed
+        self.decoder_cross_posembeds = nn.ModuleList()
+        for _ in range(self.num_decoder_layers):
+            self.decoder_cross_posembeds.append(
+                build_positional_encoding(decoder_cross_posembeds))
+
+        # Prediction Head
+        self.prediction_heads = nn.ModuleList()
+        for i in range(self.num_decoder_layers):
+            self.prediction_heads.append(
+                BaseConvBboxHead(
+                    **pred_layer_cfg,
+                    num_cls_out_channels=self._get_cls_out_channels(),
+                    num_reg_out_channels=self._get_reg_out_channels()))
+
+        self.loss_sampling_objectness = MODELS.build(sampling_objectness_loss)
+        self.loss_objectness = MODELS.build(objectness_loss)
+        self.loss_center = MODELS.build(center_loss)
+        self.loss_dir_res = MODELS.build(dir_res_loss)
+        self.loss_dir_class = MODELS.build(dir_class_loss)
+        self.loss_semantic = MODELS.build(semantic_loss)
+        if self.size_cls_agnostic:
+            self.loss_size_reg = MODELS.build(size_reg_loss)
+        else:
+            self.loss_size_res = MODELS.build(size_res_loss)
+            self.loss_size_class = MODELS.build(size_class_loss)
+
+    def init_weights(self):
+        """Initialize weights of transformer decoder in GroupFree3DHead."""
+        # initialize transformer
+        for m in self.decoder_layers.parameters():
+            if m.dim() > 1:
+                xavier_init(m, distribution='uniform')
+        for m in self.decoder_self_posembeds.parameters():
+            if m.dim() > 1:
+                xavier_init(m, distribution='uniform')
+        for m in self.decoder_cross_posembeds.parameters():
+            if m.dim() > 1:
+                xavier_init(m, distribution='uniform')
+
+    def _get_cls_out_channels(self):
+        """Return the channel number of classification outputs."""
+        # Class numbers (k) + objectness (1)
+        return self.num_classes + 1
+
+    def _get_reg_out_channels(self):
+        """Return the channel number of regression outputs."""
+        # center residual (3),
+        # heading class+residual (num_dir_bins*2),
+        # size class+residual(num_sizes*4 or 3)
+        if self.size_cls_agnostic:
+            return 6 + self.num_dir_bins * 2
+        else:
+            return 3 + self.num_dir_bins * 2 + self.num_sizes * 4
+
+    def _extract_input(self, feat_dict: dict) -> Tuple[Tensor]:
+        """Extract inputs from features dictionary.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            Tuple[Tensor]:
+
+            - seed_points (Tensor): Coordinates of input points.
+            - seed_features (Tensor): Features of input points.
+            - seed_indices (Tensor): Indices of input points.
+        """
+
+        seed_points = feat_dict['fp_xyz'][-1]
+        seed_features = feat_dict['fp_features'][-1]
+        seed_indices = feat_dict['fp_indices'][-1]
+
+        return seed_points, seed_features, seed_indices
+
+    @property
+    def sample_mode(self):
+        """
+        Returns:
+            str: Sample mode for initial candidates sampling.
+        """
+        if self.training:
+            sample_mode = self.train_cfg.sample_mode
+        else:
+            sample_mode = self.test_cfg.sample_mode
+        assert sample_mode in ['fps', 'kps']
+        return sample_mode
+
+    def forward(self, feat_dict: dict) -> dict:
+        """Forward pass.
+
+        Note:
+            The forward of GroupFree3DHead is divided into 2 steps:
+
+                1. Initial object candidates sampling.
+                2. Iterative object box prediction by transformer decoder.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+
+        Returns:
+            results (dict): Predictions of GroupFree3D head.
+        """
+        sample_mode = self.sample_mode
+
+        seed_xyz, seed_features, seed_indices = self._extract_input(feat_dict)
+
+        results = dict(
+            seed_points=seed_xyz,
+            seed_features=seed_features,
+            seed_indices=seed_indices)
+
+        # 1. Initial object candidates sampling.
+        if sample_mode == 'fps':
+            sample_inds = self.fps_module(seed_xyz, seed_features)
+        elif sample_mode == 'kps':
+            points_obj_cls_logits = self.points_obj_cls(
+                seed_features)  # (batch_size, 1, num_seed)
+            points_obj_cls_scores = points_obj_cls_logits.sigmoid().squeeze(1)
+            sample_inds = torch.topk(points_obj_cls_scores,
+                                     self.num_proposal)[1].int()
+            results['seeds_obj_cls_logits'] = points_obj_cls_logits
+        else:
+            raise NotImplementedError(
+                f'Sample mode {sample_mode} is not supported!')
+
+        candidate_xyz, candidate_features, sample_inds = self.gsample_module(
+            seed_xyz, seed_features, sample_inds)
+
+        results['query_points_xyz'] = candidate_xyz  # (B, M, 3)
+        results['query_points_feature'] = candidate_features  # (B, C, M)
+        results['query_points_sample_inds'] = sample_inds.long()  # (B, M)
+
+        prefix = 'proposal.'
+        cls_predictions, reg_predictions = self.conv_pred(candidate_features)
+        decode_res = self.bbox_coder.split_pred(cls_predictions,
+                                                reg_predictions, candidate_xyz,
+                                                prefix)
+
+        results.update(decode_res)
+        bbox3d = self.bbox_coder.decode(results, prefix)
+
+        # 2. Iterative object box prediction by transformer decoder.
+        base_bbox3d = bbox3d[:, :, :6].detach().clone()
+
+        query = self.decoder_query_proj(candidate_features).permute(2, 0, 1)
+        key = self.decoder_key_proj(seed_features).permute(2, 0, 1)
+        value = key
+
+        # transformer decoder
+        results['num_decoder_layers'] = 0
+        for i in range(self.num_decoder_layers):
+            prefix = f's{i}.'
+
+            query_pos = self.decoder_self_posembeds[i](base_bbox3d).permute(
+                2, 0, 1)
+            key_pos = self.decoder_cross_posembeds[i](seed_xyz).permute(
+                2, 0, 1)
+
+            query = self.decoder_layers[i](
+                query, key, value, query_pos=query_pos,
+                key_pos=key_pos).permute(1, 2, 0)
+
+            results[f'{prefix}query'] = query
+
+            cls_predictions, reg_predictions = self.prediction_heads[i](query)
+            decode_res = self.bbox_coder.split_pred(cls_predictions,
+                                                    reg_predictions,
+                                                    candidate_xyz, prefix)
+            # TODO: should save bbox3d instead of decode_res?
+            results.update(decode_res)
+
+            bbox3d = self.bbox_coder.decode(results, prefix)
+            results[f'{prefix}bbox3d'] = bbox3d
+            base_bbox3d = bbox3d[:, :, :6].detach().clone()
+            query = query.permute(2, 0, 1)
+
+            results['num_decoder_layers'] += 1
+
+        return results
+
+    def loss(self, points: List[torch.Tensor], feats_dict: Dict[str,
+                                                                torch.Tensor],
+             batch_data_samples: SampleList, **kwargs) -> dict:
+        """
+        Args:
+            points (list[tensor]): Points cloud of multiple samples.
+            feats_dict (dict): Predictions from backbone or FPN.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each sample and
+                corresponding annotations.
+
+        Returns:
+            dict:  A dictionary of loss components.
+        """
+        preds_dict = self.forward(feats_dict)
+        batch_gt_instance_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        batch_pts_semantic_mask = []
+        batch_pts_instance_mask = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instance_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+            batch_pts_semantic_mask.append(
+                data_sample.gt_pts_seg.get('pts_semantic_mask', None))
+            batch_pts_instance_mask.append(
+                data_sample.gt_pts_seg.get('pts_instance_mask', None))
+
+        loss_inputs = (points, preds_dict, batch_gt_instance_3d)
+        losses = self.loss_by_feat(
+            *loss_inputs,
+            batch_pts_semantic_mask=batch_pts_semantic_mask,
+            batch_pts_instance_mask=batch_pts_instance_mask,
+            batch_input_metas=batch_input_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        return losses
+
+    def loss_by_feat(
+            self,
+            points: List[torch.Tensor],
+            feats_dict: dict,
+            batch_gt_instances_3d: List[InstanceData],
+            batch_pts_semantic_mask: Optional[List[torch.Tensor]] = None,
+            batch_pts_instance_mask: Optional[List[torch.Tensor]] = None,
+            ret_target: bool = False,
+            **kwargs) -> dict:
+        """Compute loss.
+
+        Args:
+            points (list[torch.Tensor]): Input points.
+            feats_dict (dict): Predictions from previous component.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_pts_semantic_mask (list[tensor]): Semantic mask
+                of points cloud. Defaults to None.
+            batch_pts_semantic_mask (list[tensor]): Instance mask
+                of points cloud. Defaults to None.
+            ret_target (bool): Return targets or not. Defaults to False.
+
+        Returns:
+            dict: Losses of `GroupFree3D`.
+        """
+        targets = self.get_targets(points, feats_dict, batch_gt_instances_3d,
+                                   batch_pts_semantic_mask,
+                                   batch_pts_instance_mask)
+        (sampling_targets, sampling_weights, assigned_size_targets,
+         size_class_targets, size_res_targets, dir_class_targets,
+         dir_res_targets, center_targets, assigned_center_targets,
+         mask_targets, valid_gt_masks, objectness_targets, objectness_weights,
+         box_loss_weights, valid_gt_weights) = targets
+
+        batch_size, proposal_num = size_class_targets.shape[:2]
+
+        losses = dict()
+
+        # calculate objectness classification loss
+        sampling_obj_score = feats_dict['seeds_obj_cls_logits'].reshape(-1, 1)
+        sampling_objectness_loss = self.loss_sampling_objectness(
+            sampling_obj_score,
+            1 - sampling_targets.reshape(-1),
+            sampling_weights.reshape(-1),
+            avg_factor=batch_size)
+        losses['sampling_objectness_loss'] = sampling_objectness_loss
+
+        prefixes = ['proposal.'] + [
+            f's{i}.' for i in range(feats_dict['num_decoder_layers'])
+        ]
+        num_stages = len(prefixes)
+        for prefix in prefixes:
+
+            # calculate objectness loss
+            obj_score = feats_dict[f'{prefix}obj_scores'].transpose(2, 1)
+            objectness_loss = self.loss_objectness(
+                obj_score.reshape(-1, 1),
+                1 - objectness_targets.reshape(-1),
+                objectness_weights.reshape(-1),
+                avg_factor=batch_size)
+            losses[f'{prefix}objectness_loss'] = objectness_loss / num_stages
+
+            # calculate center loss
+            box_loss_weights_expand = box_loss_weights.unsqueeze(-1).expand(
+                -1, -1, 3)
+            center_loss = self.loss_center(
+                feats_dict[f'{prefix}center'],
+                assigned_center_targets,
+                weight=box_loss_weights_expand)
+            losses[f'{prefix}center_loss'] = center_loss / num_stages
+
+            # calculate direction class loss
+            dir_class_loss = self.loss_dir_class(
+                feats_dict[f'{prefix}dir_class'].transpose(2, 1),
+                dir_class_targets,
+                weight=box_loss_weights)
+            losses[f'{prefix}dir_class_loss'] = dir_class_loss / num_stages
+
+            # calculate direction residual loss
+            heading_label_one_hot = size_class_targets.new_zeros(
+                (batch_size, proposal_num, self.num_dir_bins))
+            heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1),
+                                           1)
+            dir_res_norm = torch.sum(
+                feats_dict[f'{prefix}dir_res_norm'] * heading_label_one_hot,
+                -1)
+            dir_res_loss = self.loss_dir_res(
+                dir_res_norm, dir_res_targets, weight=box_loss_weights)
+            losses[f'{prefix}dir_res_loss'] = dir_res_loss / num_stages
+
+            if self.size_cls_agnostic:
+                # calculate class-agnostic size loss
+                size_reg_loss = self.loss_size_reg(
+                    feats_dict[f'{prefix}size'],
+                    assigned_size_targets,
+                    weight=box_loss_weights_expand)
+                losses[f'{prefix}size_reg_loss'] = size_reg_loss / num_stages
+
+            else:
+                # calculate size class loss
+                size_class_loss = self.loss_size_class(
+                    feats_dict[f'{prefix}size_class'].transpose(2, 1),
+                    size_class_targets,
+                    weight=box_loss_weights)
+                losses[
+                    f'{prefix}size_class_loss'] = size_class_loss / num_stages
+
+                # calculate size residual loss
+                one_hot_size_targets = size_class_targets.new_zeros(
+                    (batch_size, proposal_num, self.num_sizes))
+                one_hot_size_targets.scatter_(2,
+                                              size_class_targets.unsqueeze(-1),
+                                              1)
+                one_hot_size_targets_expand = one_hot_size_targets.unsqueeze(
+                    -1).expand(-1, -1, -1, 3).contiguous()
+                size_residual_norm = torch.sum(
+                    feats_dict[f'{prefix}size_res_norm'] *
+                    one_hot_size_targets_expand, 2)
+                box_loss_weights_expand = box_loss_weights.unsqueeze(
+                    -1).expand(-1, -1, 3)
+                size_res_loss = self.loss_size_res(
+                    size_residual_norm,
+                    size_res_targets,
+                    weight=box_loss_weights_expand)
+                losses[f'{prefix}size_res_loss'] = size_res_loss / num_stages
+
+            # calculate semantic loss
+            semantic_loss = self.loss_semantic(
+                feats_dict[f'{prefix}sem_scores'].transpose(2, 1),
+                mask_targets,
+                weight=box_loss_weights)
+            losses[f'{prefix}semantic_loss'] = semantic_loss / num_stages
+
+        if ret_target:
+            losses['targets'] = targets
+
+        return losses
+
+    def get_targets(
+        self,
+        points: List[Tensor],
+        feats_dict: dict = None,
+        batch_gt_instances_3d: List[InstanceData] = None,
+        batch_pts_semantic_mask: List[torch.Tensor] = None,
+        batch_pts_instance_mask: List[torch.Tensor] = None,
+        max_gt_num: int = 64,
+    ):
+        """Generate targets of GroupFree3D head.
+
+        Args:
+            points (list[torch.Tensor]): Points of each batch.
+            feats_dict (torch.Tensor): Predictions of previous component.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_pts_semantic_mask (list[tensor]): Semantic gt mask for
+                 point clouds. Defaults to None.
+            batch_pts_instance_mask (list[tensor]): Instance gt mask for
+                 point clouds. Defaults to None.
+            max_gt_num (int): Max number of GTs for single batch. Defaults
+                to 64.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of GroupFree3D head.
+        """
+        # find empty example
+        valid_gt_masks = list()
+        gt_num = list()
+        batch_gt_labels_3d = [
+            gt_instances_3d.labels_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        batch_gt_bboxes_3d = [
+            gt_instances_3d.bboxes_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+
+        for index in range(len(batch_gt_labels_3d)):
+            if len(batch_gt_labels_3d[index]) == 0:
+                fake_box = batch_gt_bboxes_3d[index].tensor.new_zeros(
+                    1, batch_gt_bboxes_3d[index].tensor.shape[-1])
+                batch_gt_bboxes_3d[index] = batch_gt_bboxes_3d[index].new_box(
+                    fake_box)
+                batch_gt_labels_3d[index] = batch_gt_labels_3d[
+                    index].new_zeros(1)
+                valid_gt_masks.append(batch_gt_labels_3d[index].new_zeros(1))
+                gt_num.append(1)
+            else:
+                valid_gt_masks.append(batch_gt_labels_3d[index].new_ones(
+                    batch_gt_labels_3d[index].shape))
+                gt_num.append(batch_gt_labels_3d[index].shape[0])
+
+        max_gt_nums = [max_gt_num for _ in range(len(batch_gt_labels_3d))]
+
+        if batch_pts_semantic_mask is None:
+            batch_pts_semantic_mask = [
+                None for i in range(len(batch_gt_labels_3d))
+            ]
+            batch_pts_instance_mask = [
+                None for i in range(len(batch_gt_labels_3d))
+            ]
+
+        seed_points = [
+            feats_dict['seed_points'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        seed_indices = [
+            feats_dict['seed_indices'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        candidate_indices = [
+            feats_dict['query_points_sample_inds'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        (sampling_targets, assigned_size_targets, size_class_targets,
+         size_res_targets, dir_class_targets, dir_res_targets, center_targets,
+         assigned_center_targets, mask_targets,
+         objectness_targets, objectness_masks) = multi_apply(
+             self._get_targets_single, points, batch_gt_bboxes_3d,
+             batch_gt_labels_3d, batch_pts_semantic_mask,
+             batch_pts_instance_mask, max_gt_nums, seed_points, seed_indices,
+             candidate_indices)
+
+        # pad targets as original code of GroupFree3D.
+        for index in range(len(batch_gt_labels_3d)):
+            pad_num = max_gt_num - batch_gt_labels_3d[index].shape[0]
+            valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num))
+
+        sampling_targets = torch.stack(sampling_targets)
+        sampling_weights = (sampling_targets >= 0).float()
+        sampling_normalizer = sampling_weights.sum(dim=1, keepdim=True).float()
+        sampling_weights /= sampling_normalizer.clamp(min=1.0)
+
+        assigned_size_targets = torch.stack(assigned_size_targets)
+        center_targets = torch.stack(center_targets)
+        valid_gt_masks = torch.stack(valid_gt_masks)
+
+        assigned_center_targets = torch.stack(assigned_center_targets)
+        objectness_targets = torch.stack(objectness_targets)
+
+        objectness_weights = torch.stack(objectness_masks)
+        cls_normalizer = objectness_weights.sum(dim=1, keepdim=True).float()
+        objectness_weights /= cls_normalizer.clamp(min=1.0)
+
+        box_loss_weights = objectness_targets.float() / (
+            objectness_targets.sum().float() + EPS)
+
+        valid_gt_weights = valid_gt_masks.float() / (
+            valid_gt_masks.sum().float() + EPS)
+
+        dir_class_targets = torch.stack(dir_class_targets)
+        dir_res_targets = torch.stack(dir_res_targets)
+        size_class_targets = torch.stack(size_class_targets)
+        size_res_targets = torch.stack(size_res_targets)
+        mask_targets = torch.stack(mask_targets)
+
+        return (sampling_targets, sampling_weights, assigned_size_targets,
+                size_class_targets, size_res_targets, dir_class_targets,
+                dir_res_targets, center_targets, assigned_center_targets,
+                mask_targets, valid_gt_masks, objectness_targets,
+                objectness_weights, box_loss_weights, valid_gt_weights)
+
+    def _get_targets_single(self,
+                            points: Tensor,
+                            gt_bboxes_3d: BaseInstance3DBoxes,
+                            gt_labels_3d: Tensor,
+                            pts_semantic_mask: Optional[Tensor] = None,
+                            pts_instance_mask: Optional[Tensor] = None,
+                            max_gt_nums: Optional[int] = None,
+                            seed_points: Optional[Tensor] = None,
+                            seed_indices: Optional[Tensor] = None,
+                            candidate_indices: Optional[Tensor] = None,
+                            seed_points_obj_topk: int = 4):
+        """Generate targets of GroupFree3D head for single batch.
+
+        Args:
+            points (torch.Tensor): Points of each batch.
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
+                boxes of each batch.
+            gt_labels_3d (torch.Tensor): Labels of each batch.
+            pts_semantic_mask (torch.Tensor, optional): Point-wise semantic
+                label of each batch. Defaults to None.
+            pts_instance_mask (torch.Tensor, optional): Point-wise instance
+                label of each batch. Defaults to None.
+            max_gt_nums (int, optional): Max number of GTs for single batch.
+                Defaults to None.
+            seed_points (torch.Tensor,optional): Coordinates of seed points.
+                Defaults to None.
+            seed_indices (torch.Tensor,optional): Indices of seed points.
+                Defaults to None.
+            candidate_indices (torch.Tensor,optional): Indices of object
+                candidates. Defaults to None.
+            seed_points_obj_topk (int): k value of k-Closest Points Sampling.
+                Defaults to 4.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of GroupFree3D head.
+        """
+
+        assert self.bbox_coder.with_rot or pts_semantic_mask is not None
+
+        gt_bboxes_3d = gt_bboxes_3d.to(points.device)
+
+        # generate center, dir, size target
+        (center_targets, size_targets, size_class_targets, size_res_targets,
+         dir_class_targets,
+         dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d)
+
+        # pad targets as original code of GroupFree3D
+        pad_num = max_gt_nums - gt_labels_3d.shape[0]
+        box_label_mask = points.new_zeros([max_gt_nums])
+        box_label_mask[:gt_labels_3d.shape[0]] = 1
+
+        gt_bboxes_pad = F.pad(gt_bboxes_3d.tensor, (0, 0, 0, pad_num))
+        gt_bboxes_pad[gt_labels_3d.shape[0]:, 0:3] += 1000
+        gt_bboxes_3d = gt_bboxes_3d.new_box(gt_bboxes_pad)
+
+        gt_labels_3d = F.pad(gt_labels_3d, (0, pad_num))
+
+        center_targets = F.pad(center_targets, (0, 0, 0, pad_num), value=1000)
+        size_targets = F.pad(size_targets, (0, 0, 0, pad_num))
+        size_class_targets = F.pad(size_class_targets, (0, pad_num))
+        size_res_targets = F.pad(size_res_targets, (0, 0, 0, pad_num))
+        dir_class_targets = F.pad(dir_class_targets, (0, pad_num))
+        dir_res_targets = F.pad(dir_res_targets, (0, pad_num))
+
+        # 0. generate pts_instance_label and pts_obj_mask
+        num_points = points.shape[0]
+        pts_obj_mask = points.new_zeros([num_points], dtype=torch.long)
+        pts_instance_label = points.new_zeros([num_points],
+                                              dtype=torch.long) - 1
+
+        if self.bbox_coder.with_rot:
+            vote_targets = points.new_zeros([num_points, 4 * self.gt_per_seed])
+            vote_target_idx = points.new_zeros([num_points], dtype=torch.long)
+            box_indices_all = gt_bboxes_3d.points_in_boxes_part(points)
+            for i in range(gt_labels_3d.shape[0]):
+                box_indices = box_indices_all[:, i]
+                indices = torch.nonzero(
+                    box_indices, as_tuple=False).squeeze(-1)
+                selected_points = points[indices]
+                pts_obj_mask[indices] = 1
+                vote_targets_tmp = vote_targets[indices]
+                votes = gt_bboxes_3d.gravity_center[i].unsqueeze(
+                    0) - selected_points[:, :3]
+
+                for j in range(self.gt_per_seed):
+                    column_indices = torch.nonzero(
+                        vote_target_idx[indices] == j,
+                        as_tuple=False).squeeze(-1)
+                    vote_targets_tmp[column_indices,
+                                     int(j * 3):int(j * 3 +
+                                                    3)] = votes[column_indices]
+                    vote_targets_tmp[column_indices,
+                                     j + 3 * self.gt_per_seed] = i
+                    if j == 0:
+                        vote_targets_tmp[
+                            column_indices, :3 *
+                            self.gt_per_seed] = votes[column_indices].repeat(
+                                1, self.gt_per_seed)
+                        vote_targets_tmp[column_indices,
+                                         3 * self.gt_per_seed:] = i
+
+                vote_targets[indices] = vote_targets_tmp
+                vote_target_idx[indices] = torch.clamp(
+                    vote_target_idx[indices] + 1, max=2)
+
+            dist = points.new_zeros([num_points, self.gt_per_seed]) + 1000
+            for j in range(self.gt_per_seed):
+                dist[:, j] = (vote_targets[:, 3 * j:3 * j + 3]**2).sum(-1)
+
+            instance_indices = torch.argmin(
+                dist, dim=-1).unsqueeze(-1) + 3 * self.gt_per_seed
+            instance_lable = torch.gather(vote_targets, 1,
+                                          instance_indices).squeeze(-1)
+            pts_instance_label = instance_lable.long()
+            pts_instance_label[pts_obj_mask == 0] = -1
+
+        elif pts_instance_mask is not None and pts_semantic_mask is not None:
+            for i in torch.unique(pts_instance_mask):
+                indices = torch.nonzero(
+                    pts_instance_mask == i, as_tuple=False).squeeze(-1)
+
+                if pts_semantic_mask[indices[0]] < self.num_classes:
+                    selected_points = points[indices, :3]
+                    center = 0.5 * (
+                        selected_points.min(0)[0] + selected_points.max(0)[0])
+
+                    delta_xyz = center - center_targets
+                    instance_lable = torch.argmin((delta_xyz**2).sum(-1))
+                    pts_instance_label[indices] = instance_lable
+                    pts_obj_mask[indices] = 1
+
+        else:
+            raise NotImplementedError
+
+        # 1. generate objectness targets in sampling head
+        gt_num = gt_labels_3d.shape[0]
+        num_seed = seed_points.shape[0]
+        num_candidate = candidate_indices.shape[0]
+
+        object_assignment = torch.gather(pts_instance_label, 0, seed_indices)
+        # set background points to the last gt bbox as original code
+        object_assignment[object_assignment < 0] = gt_num - 1
+        object_assignment_one_hot = gt_bboxes_3d.tensor.new_zeros(
+            (num_seed, gt_num))
+        object_assignment_one_hot.scatter_(1, object_assignment.unsqueeze(-1),
+                                           1)  # (num_seed, gt_num)
+
+        delta_xyz = seed_points.unsqueeze(
+            1) - gt_bboxes_3d.gravity_center.unsqueeze(
+                0)  # (num_seed, gt_num, 3)
+        delta_xyz = delta_xyz / (gt_bboxes_3d.dims.unsqueeze(0) + EPS)
+
+        new_dist = torch.sum(delta_xyz**2, dim=-1)
+        euclidean_dist1 = torch.sqrt(new_dist + EPS)
+        euclidean_dist1 = euclidean_dist1 * object_assignment_one_hot + 100 * (
+            1 - object_assignment_one_hot)
+        # (gt_num, num_seed)
+        euclidean_dist1 = euclidean_dist1.permute(1, 0)
+
+        # gt_num x topk
+        topk_inds = torch.topk(
+            euclidean_dist1,
+            seed_points_obj_topk,
+            largest=False)[1] * box_label_mask[:, None] + \
+            (box_label_mask[:, None] - 1)
+        topk_inds = topk_inds.long()
+        topk_inds = topk_inds.view(-1).contiguous()
+
+        sampling_targets = torch.zeros(
+            num_seed + 1, dtype=torch.long).to(points.device)
+        sampling_targets[topk_inds] = 1
+        sampling_targets = sampling_targets[:num_seed]
+        # pts_instance_label
+        objectness_label_mask = torch.gather(pts_instance_label, 0,
+                                             seed_indices)  # num_seed
+        sampling_targets[objectness_label_mask < 0] = 0
+
+        # 2. objectness target
+        seed_obj_gt = torch.gather(pts_obj_mask, 0, seed_indices)  # num_seed
+        objectness_targets = torch.gather(seed_obj_gt, 0,
+                                          candidate_indices)  # num_candidate
+
+        # 3. box target
+        seed_instance_label = torch.gather(pts_instance_label, 0,
+                                           seed_indices)  # num_seed
+        query_points_instance_label = torch.gather(
+            seed_instance_label, 0, candidate_indices)  # num_candidate
+
+        # Set assignment
+        # (num_candidate, ) with values in 0,1,...,gt_num-1
+        assignment = query_points_instance_label
+        # set background points to the last gt bbox as original code
+        assignment[assignment < 0] = gt_num - 1
+        assignment_expand = assignment.unsqueeze(1).expand(-1, 3)
+
+        assigned_center_targets = center_targets[assignment]
+        assigned_size_targets = size_targets[assignment]
+
+        dir_class_targets = dir_class_targets[assignment]
+        dir_res_targets = dir_res_targets[assignment]
+        dir_res_targets /= (np.pi / self.num_dir_bins)
+
+        size_class_targets = size_class_targets[assignment]
+        size_res_targets = \
+            torch.gather(size_res_targets, 0, assignment_expand)
+        one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros(
+            (num_candidate, self.num_sizes))
+        one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1)
+        one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).expand(
+            -1, -1, 3)  # (num_candidate,num_size_cluster,3)
+        mean_sizes = size_res_targets.new_tensor(
+            self.bbox_coder.mean_sizes).unsqueeze(0)
+        pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1)
+        size_res_targets /= pos_mean_sizes
+
+        mask_targets = gt_labels_3d[assignment].long()
+
+        objectness_masks = points.new_ones((num_candidate))
+
+        return (sampling_targets, assigned_size_targets, size_class_targets,
+                size_res_targets, dir_class_targets, dir_res_targets,
+                center_targets, assigned_center_targets, mask_targets,
+                objectness_targets, objectness_masks)
+
+    def predict(self, points: List[torch.Tensor],
+                feats_dict: Dict[str, torch.Tensor],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[InstanceData]:
+        """
+        Args:
+            points (list[tensor]): Point clouds of multiple samples.
+            feats_dict (dict): Features from FPN or backbone.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes meta information of data.
+
+        Returns:
+            list[:obj:`InstanceData`]: List of processed predictions. Each
+            InstanceData contains 3d Bounding boxes and corresponding
+            scores and labels.
+        """
+        preds_dict = self(feats_dict)
+        batch_size = len(batch_data_samples)
+        batch_input_metas = []
+        for batch_index in range(batch_size):
+            metainfo = batch_data_samples[batch_index].metainfo
+            batch_input_metas.append(metainfo)
+
+        results_list = self.predict_by_feat(points, preds_dict,
+                                            batch_input_metas, **kwargs)
+        return results_list
+
+    def predict_by_feat(self,
+                        points: List[torch.Tensor],
+                        bbox_preds_dict: dict,
+                        batch_input_metas: List[dict],
+                        use_nms: bool = True,
+                        **kwargs) -> List[InstanceData]:
+        """Generate bboxes from vote head predictions.
+
+        Args:
+            points (List[torch.Tensor]): Input points of multiple samples.
+            bbox_preds_dict (dict): Predictions from groupfree3d head.
+            batch_input_metas (list[dict]): Each item
+                contains the meta information of each sample.
+            use_nms (bool): Whether to apply NMS, skip nms postprocessing
+                while using vote head in rpn stage.
+
+        Returns:
+            list[:obj:`InstanceData`]: List of processed predictions. Each
+            InstanceData cantains 3d Bounding boxes and corresponding
+            scores and labels.
+        """
+        # support multi-stage predictions
+        assert self.test_cfg['prediction_stages'] in \
+            ['last', 'all', 'last_three']
+
+        if self.test_cfg['prediction_stages'] == 'last':
+            prefixes = [f's{self.num_decoder_layers - 1}.']
+        elif self.test_cfg['prediction_stages'] == 'all':
+            prefixes = ['proposal.'] + \
+                [f's{i}.' for i in range(self.num_decoder_layers)]
+        elif self.test_cfg['prediction_stages'] == 'last_three':
+            prefixes = [
+                f's{i}.' for i in range(self.num_decoder_layers -
+                                        3, self.num_decoder_layers)
+            ]
+        else:
+            raise NotImplementedError
+
+        obj_scores = list()
+        sem_scores = list()
+        bbox3d = list()
+        for prefix in prefixes:
+            # decode boxes
+            obj_score = bbox_preds_dict[f'{prefix}obj_scores'][...,
+                                                               -1].sigmoid()
+            sem_score = bbox_preds_dict[f'{prefix}sem_scores'].softmax(-1)
+            bbox = self.bbox_coder.decode(bbox_preds_dict, prefix)
+            obj_scores.append(obj_score)
+            sem_scores.append(sem_score)
+            bbox3d.append(bbox)
+
+        obj_scores = torch.cat(obj_scores, dim=1)
+        sem_scores = torch.cat(sem_scores, dim=1)
+        bbox3d = torch.cat(bbox3d, dim=1)
+        stack_points = torch.stack(points)
+        results_list = list()
+        if use_nms:
+            batch_size = bbox3d.shape[0]
+            temp_results = InstanceData()
+            for b in range(batch_size):
+                bbox_selected, score_selected, labels = \
+                    self.multiclass_nms_single(obj_scores[b],
+                                               sem_scores[b],
+                                               bbox3d[b],
+                                               stack_points[b, ..., :3],
+                                               batch_input_metas[b])
+                bbox = batch_input_metas[b]['box_type_3d'](
+                    bbox_selected,
+                    box_dim=bbox_selected.shape[-1],
+                    with_yaw=self.bbox_coder.with_rot)
+                temp_results.bboxes_3d = bbox
+                temp_results.scores_3d = score_selected
+                temp_results.labels_3d = labels
+                results_list.append(temp_results)
+            return results_list
+        else:
+            return bbox3d
+
+    def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points,
+                              input_meta):
+        """Multi-class nms in single batch.
+
+        Args:
+            obj_scores (torch.Tensor): Objectness score of bounding boxes.
+            sem_scores (torch.Tensor): semantic class score of bounding boxes.
+            bbox (torch.Tensor): Predicted bounding boxes.
+            points (torch.Tensor): Input points.
+            input_meta (dict): Point cloud and image's meta info.
+
+        Returns:
+            tuple[torch.Tensor]: Bounding boxes, scores and labels.
+        """
+        bbox = input_meta['box_type_3d'](
+            bbox,
+            box_dim=bbox.shape[-1],
+            with_yaw=self.bbox_coder.with_rot,
+            origin=(0.5, 0.5, 0.5))
+        box_indices = bbox.points_in_boxes_all(points)
+
+        corner3d = bbox.corners
+        minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
+        minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
+        minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
+
+        nonempty_box_mask = box_indices.T.sum(1) > 5
+
+        bbox_classes = torch.argmax(sem_scores, -1)
+        nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask],
+                                      obj_scores[nonempty_box_mask],
+                                      bbox_classes[nonempty_box_mask],
+                                      self.test_cfg.nms_thr)
+
+        # filter empty boxes and boxes with low score
+        scores_mask = (obj_scores > self.test_cfg.score_thr)
+        nonempty_box_inds = torch.nonzero(
+            nonempty_box_mask, as_tuple=False).flatten()
+        nonempty_mask = torch.zeros_like(bbox_classes).scatter(
+            0, nonempty_box_inds[nms_selected], 1)
+        selected = (nonempty_mask.bool() & scores_mask.bool())
+
+        if self.test_cfg.per_class_proposal:
+            bbox_selected, score_selected, labels = [], [], []
+            for k in range(sem_scores.shape[-1]):
+                bbox_selected.append(bbox[selected].tensor)
+                score_selected.append(obj_scores[selected] *
+                                      sem_scores[selected][:, k])
+                labels.append(
+                    torch.zeros_like(bbox_classes[selected]).fill_(k))
+            bbox_selected = torch.cat(bbox_selected, 0)
+            score_selected = torch.cat(score_selected, 0)
+            labels = torch.cat(labels, 0)
+        else:
+            bbox_selected = bbox[selected].tensor
+            score_selected = obj_scores[selected]
+            labels = bbox_classes[selected]
+
+        return bbox_selected, score_selected, labels
diff --git a/mmde/mmdet3d/models/dense_heads/imvoxel_head.py b/mmde/mmdet3d/models/dense_heads/imvoxel_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..948cb8aed776f27b9efc64746df05341a9f47dc2
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/imvoxel_head.py
@@ -0,0 +1,696 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+from mmcv.cnn import Scale
+from mmcv.ops import nms3d, nms3d_normal
+from mmdet.models.utils import multi_apply
+from mmdet.utils import reduce_mean
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, bias_init_with_prob, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor, nn
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures.bbox_3d.utils import rotation_3d_in_axis
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils.typing_utils import (ConfigType, InstanceList,
+                                        OptConfigType, OptInstanceList)
+
+
+@MODELS.register_module()
+class ImVoxelHead(BaseModule):
+    r"""`ImVoxelNet<https://arxiv.org/abs/2106.01178>`_ head for indoor
+    datasets.
+
+    Args:
+        n_classes (int): Number of classes.
+        n_levels (int): Number of feature levels.
+        n_channels (int): Number of channels in input tensors.
+        n_reg_outs (int): Number of regression layer channels.
+        pts_assign_threshold (int): Min number of location per box to
+            be assigned with.
+        pts_center_threshold (int): Max number of locations per box to
+            be assigned with.
+        center_loss (dict, optional): Config of centerness loss.
+            Default: dict(type='CrossEntropyLoss', use_sigmoid=True).
+        bbox_loss (dict, optional): Config of bbox loss.
+            Default: dict(type='RotatedIoU3DLoss').
+        cls_loss (dict, optional): Config of classification loss.
+            Default: dict(type='FocalLoss').
+        train_cfg (dict, optional): Config for train stage. Defaults to None.
+        test_cfg (dict, optional): Config for test stage. Defaults to None.
+        init_cfg (dict, optional): Config for weight initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 n_classes: int,
+                 n_levels: int,
+                 n_channels: int,
+                 n_reg_outs: int,
+                 pts_assign_threshold: int,
+                 pts_center_threshold: int,
+                 prior_generator: ConfigType,
+                 center_loss: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss', use_sigmoid=True),
+                 bbox_loss: ConfigType = dict(type='RotatedIoU3DLoss'),
+                 cls_loss: ConfigType = dict(type='mmdet.FocalLoss'),
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptConfigType = None):
+        super(ImVoxelHead, self).__init__(init_cfg)
+        self.pts_assign_threshold = pts_assign_threshold
+        self.pts_center_threshold = pts_center_threshold
+        self.prior_generator = TASK_UTILS.build(prior_generator)
+        self.center_loss = MODELS.build(center_loss)
+        self.bbox_loss = MODELS.build(bbox_loss)
+        self.cls_loss = MODELS.build(cls_loss)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self._init_layers(n_channels, n_reg_outs, n_classes, n_levels)
+
+    def _init_layers(self, n_channels, n_reg_outs, n_classes, n_levels):
+        """Initialize neural network layers of the head."""
+        self.conv_center = nn.Conv3d(n_channels, 1, 3, padding=1, bias=False)
+        self.conv_reg = nn.Conv3d(
+            n_channels, n_reg_outs, 3, padding=1, bias=False)
+        self.conv_cls = nn.Conv3d(n_channels, n_classes, 3, padding=1)
+        self.scales = nn.ModuleList([Scale(1.) for _ in range(n_levels)])
+
+    def init_weights(self):
+        """Initialize all layer weights."""
+        normal_init(self.conv_center, std=.01)
+        normal_init(self.conv_reg, std=.01)
+        normal_init(self.conv_cls, std=.01, bias=bias_init_with_prob(.01))
+
+    def _forward_single(self, x: Tensor, scale: Scale):
+        """Forward pass per level.
+
+        Args:
+            x (Tensor): Per level 3d neck output tensor.
+            scale (mmcv.cnn.Scale): Per level multiplication weight.
+
+        Returns:
+            tuple[Tensor]: Centerness, bbox and classification predictions.
+        """
+        reg_final = self.conv_reg(x)
+        reg_distance = torch.exp(scale(reg_final[:, :6]))
+        reg_angle = reg_final[:, 6:]
+        bbox_pred = torch.cat((reg_distance, reg_angle), dim=1)
+        return self.conv_center(x), bbox_pred, self.conv_cls(x)
+
+    def forward(self, x: Tensor):
+        """Forward function.
+
+        Args:
+            x (list[Tensor]): Features from 3d neck.
+
+        Returns:
+            tuple[Tensor]: Centerness, bbox and classification predictions.
+        """
+        return multi_apply(self._forward_single, x, self.scales)
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
+             **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        valid_pred = x[-1]
+        outs = self(x[:-1])
+
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+
+        loss_inputs = outs + (valid_pred, batch_gt_instances_3d,
+                              batch_input_metas, batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    def loss_and_predict(self,
+                         x: Tuple[Tensor],
+                         batch_data_samples: SampleList,
+                         proposal_cfg: Optional[ConfigDict] = None,
+                         **kwargs) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            x (tuple[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each image and
+                corresponding annotations.
+            proposal_cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+
+        Returns:
+            tuple: the return value is a tuple contains:
+
+                - losses: (dict[str, Tensor]): A dictionary of loss components.
+                - predictions (list[:obj:`InstanceData`]): Detection
+                  results of each image after the post process.
+        """
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+
+        valid_pred = x[-1]
+        outs = self(x[:-1])
+
+        loss_inputs = outs + (valid_pred, batch_gt_instances_3d,
+                              batch_input_metas, batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        predictions = self.predict_by_feat(
+            *outs,
+            valid_pred=valid_pred,
+            batch_input_metas=batch_input_metas,
+            cfg=proposal_cfg)
+        return losses, predictions
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the 3D detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_pts_panoptic_seg` and
+                `gt_pts_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        valid_pred = x[-1]
+        outs = self(x[:-1])
+        predictions = self.predict_by_feat(
+            *outs,
+            valid_pred=valid_pred,
+            batch_input_metas=batch_input_metas,
+            rescale=rescale)
+        return predictions
+
+    def _loss_by_feat_single(self, center_preds, bbox_preds, cls_preds,
+                             valid_preds, input_meta, gt_bboxes, gt_labels):
+        """Per scene loss function.
+
+        Args:
+            center_preds (list[Tensor]): Centerness predictions for all levels.
+            bbox_preds (list[Tensor]): Bbox predictions for all levels.
+            cls_preds (list[Tensor]): Classification predictions for all
+                levels.
+            valid_preds (list[Tensor]): Valid mask predictions for all levels.
+            input_meta (dict): Scene meta info.
+            gt_bboxes (BaseInstance3DBoxes): Ground truth boxes.
+            gt_labels (Tensor): Ground truth labels.
+
+        Returns:
+            tuple[Tensor]: Centerness, bbox, and classification loss values.
+        """
+        points = self._get_points(center_preds)
+        center_targets, bbox_targets, cls_targets = self._get_targets(
+            points, gt_bboxes, gt_labels)
+
+        center_preds = torch.cat(
+            [x.permute(1, 2, 3, 0).reshape(-1) for x in center_preds])
+        bbox_preds = torch.cat([
+            x.permute(1, 2, 3, 0).reshape(-1, x.shape[0]) for x in bbox_preds
+        ])
+        cls_preds = torch.cat(
+            [x.permute(1, 2, 3, 0).reshape(-1, x.shape[0]) for x in cls_preds])
+        valid_preds = torch.cat(
+            [x.permute(1, 2, 3, 0).reshape(-1) for x in valid_preds])
+        points = torch.cat(points)
+
+        # cls loss
+        pos_inds = torch.nonzero(
+            torch.logical_and(cls_targets >= 0, valid_preds)).squeeze(1)
+        n_pos = points.new_tensor(len(pos_inds))
+        n_pos = max(reduce_mean(n_pos), 1.)
+        if torch.any(valid_preds):
+            cls_loss = self.cls_loss(
+                cls_preds[valid_preds],
+                cls_targets[valid_preds],
+                avg_factor=n_pos)
+        else:
+            cls_loss = cls_preds[valid_preds].sum()
+
+        # bbox and centerness losses
+        pos_center_preds = center_preds[pos_inds]
+        pos_bbox_preds = bbox_preds[pos_inds]
+        if len(pos_inds) > 0:
+            pos_center_targets = center_targets[pos_inds]
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_points = points[pos_inds]
+            center_loss = self.center_loss(
+                pos_center_preds, pos_center_targets, avg_factor=n_pos)
+            bbox_loss = self.bbox_loss(
+                self._bbox_pred_to_bbox(pos_points, pos_bbox_preds),
+                pos_bbox_targets,
+                weight=pos_center_targets,
+                avg_factor=pos_center_targets.sum())
+        else:
+            center_loss = pos_center_preds.sum()
+            bbox_loss = pos_bbox_preds.sum()
+        return center_loss, bbox_loss, cls_loss
+
+    def loss_by_feat(self,
+                     center_preds: List[List[Tensor]],
+                     bbox_preds: List[List[Tensor]],
+                     cls_preds: List[List[Tensor]],
+                     valid_pred: Tensor,
+                     batch_gt_instances_3d: InstanceList,
+                     batch_input_metas: List[dict],
+                     batch_gt_instances_ignore: OptInstanceList = None,
+                     **kwargs) -> dict:
+        """Per scene loss function.
+
+        Args:
+            center_preds (list[list[Tensor]]): Centerness predictions for
+                all scenes. The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
+                The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            cls_preds (list[list[Tensor]]): Classification predictions for all
+                scenes. The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            valid_pred (Tensor): Valid mask prediction for all scenes.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、`
+                `labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_input_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: Centerness, bbox, and classification loss values.
+        """
+        valid_preds = self._upsample_valid_preds(valid_pred, center_preds)
+        center_losses, bbox_losses, cls_losses = [], [], []
+        for i in range(len(batch_input_metas)):
+            center_loss, bbox_loss, cls_loss = self._loss_by_feat_single(
+                center_preds=[x[i] for x in center_preds],
+                bbox_preds=[x[i] for x in bbox_preds],
+                cls_preds=[x[i] for x in cls_preds],
+                valid_preds=[x[i] for x in valid_preds],
+                input_meta=batch_input_metas[i],
+                gt_bboxes=batch_gt_instances_3d[i].bboxes_3d,
+                gt_labels=batch_gt_instances_3d[i].labels_3d)
+            center_losses.append(center_loss)
+            bbox_losses.append(bbox_loss)
+            cls_losses.append(cls_loss)
+        return dict(
+            center_loss=torch.mean(torch.stack(center_losses)),
+            bbox_loss=torch.mean(torch.stack(bbox_losses)),
+            cls_loss=torch.mean(torch.stack(cls_losses)))
+
+    def _predict_by_feat_single(self, center_preds: List[Tensor],
+                                bbox_preds: List[Tensor],
+                                cls_preds: List[Tensor],
+                                valid_preds: List[Tensor],
+                                input_meta: dict) -> InstanceData:
+        """Generate boxes for single sample.
+
+        Args:
+            center_preds (list[Tensor]): Centerness predictions for all levels.
+            bbox_preds (list[Tensor]): Bbox predictions for all levels.
+            cls_preds (list[Tensor]): Classification predictions for all
+                levels.
+            valid_preds (tuple[Tensor]): Upsampled valid masks for all feature
+                levels.
+            input_meta (dict): Scene meta info.
+
+        Returns:
+            tuple[Tensor]: Predicted bounding boxes, scores and labels.
+        """
+        points = self._get_points(center_preds)
+        mlvl_bboxes, mlvl_scores = [], []
+        for center_pred, bbox_pred, cls_pred, valid_pred, point in zip(
+                center_preds, bbox_preds, cls_preds, valid_preds, points):
+            center_pred = center_pred.permute(1, 2, 3, 0).reshape(-1, 1)
+            bbox_pred = bbox_pred.permute(1, 2, 3,
+                                          0).reshape(-1, bbox_pred.shape[0])
+            cls_pred = cls_pred.permute(1, 2, 3,
+                                        0).reshape(-1, cls_pred.shape[0])
+            valid_pred = valid_pred.permute(1, 2, 3, 0).reshape(-1, 1)
+
+            scores = cls_pred.sigmoid() * center_pred.sigmoid() * valid_pred
+            max_scores, _ = scores.max(dim=1)
+
+            if len(scores) > self.test_cfg.nms_pre > 0:
+                _, ids = max_scores.topk(self.test_cfg.nms_pre)
+                bbox_pred = bbox_pred[ids]
+                scores = scores[ids]
+                point = point[ids]
+
+            bboxes = self._bbox_pred_to_bbox(point, bbox_pred)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+
+        bboxes = torch.cat(mlvl_bboxes)
+        scores = torch.cat(mlvl_scores)
+        bboxes, scores, labels = self._single_scene_multiclass_nms(
+            bboxes, scores, input_meta)
+
+        bboxes = input_meta['box_type_3d'](
+            bboxes,
+            box_dim=bboxes.shape[1],
+            with_yaw=bboxes.shape[1] == 7,
+            origin=(.5, .5, .5))
+
+        results = InstanceData()
+        results.bboxes_3d = bboxes
+        results.scores_3d = scores
+        results.labels_3d = labels
+        return results
+
+    def predict_by_feat(self, center_preds: List[List[Tensor]],
+                        bbox_preds: List[List[Tensor]],
+                        cls_preds: List[List[Tensor]], valid_pred: Tensor,
+                        batch_input_metas: List[dict],
+                        **kwargs) -> List[InstanceData]:
+        """Generate boxes for all scenes.
+
+        Args:
+            center_preds (list[list[Tensor]]): Centerness predictions for
+                all scenes.
+            bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
+            cls_preds (list[list[Tensor]]): Classification predictions for all
+                scenes.
+            valid_pred (Tensor): Valid mask prediction for all scenes.
+            batch_input_metas (list[dict]): Meta infos for all scenes.
+
+        Returns:
+            list[tuple[Tensor]]: Predicted bboxes, scores, and labels for
+                all scenes.
+        """
+        valid_preds = self._upsample_valid_preds(valid_pred, center_preds)
+        results = []
+        for i in range(len(batch_input_metas)):
+            results.append(
+                self._predict_by_feat_single(
+                    center_preds=[x[i] for x in center_preds],
+                    bbox_preds=[x[i] for x in bbox_preds],
+                    cls_preds=[x[i] for x in cls_preds],
+                    valid_preds=[x[i] for x in valid_preds],
+                    input_meta=batch_input_metas[i]))
+        return results
+
+    @staticmethod
+    def _upsample_valid_preds(valid_pred, features):
+        """Upsample valid mask predictions.
+
+        Args:
+            valid_pred (Tensor): Valid mask prediction.
+            features (Tensor): Feature tensor.
+
+        Returns:
+            tuple[Tensor]: Upsampled valid masks for all feature levels.
+        """
+        return [
+            nn.Upsample(size=x.shape[-3:],
+                        mode='trilinear')(valid_pred).round().bool()
+            for x in features
+        ]
+
+    def _get_points(self, features):
+        """Generate final locations.
+
+        Args:
+            features (list[Tensor]): Feature tensors for all feature levels.
+
+        Returns:
+            list(Tensor): Final locations for all feature levels.
+        """
+        points = []
+        for x in features:
+            n_voxels = x.size()[-3:][::-1]
+            points.append(
+                self.prior_generator.grid_anchors(
+                    [n_voxels],
+                    device=x.device)[0][:, :3].reshape(n_voxels +
+                                                       (3, )).permute(
+                                                           2, 1, 0,
+                                                           3).reshape(-1, 3))
+        return points
+
+    @staticmethod
+    def _bbox_pred_to_bbox(points, bbox_pred):
+        """Transform predicted bbox parameters to bbox.
+
+        Args:
+            points (Tensor): Final locations of shape (N, 3).
+            bbox_pred (Tensor): Predicted bbox parameters of shape (N, 7).
+
+        Returns:
+            Tensor: Transformed 3D box of shape (N, 7).
+        """
+        if bbox_pred.shape[0] == 0:
+            return bbox_pred
+
+        # dx_min, dx_max, dy_min, dy_max, dz_min, dz_max, alpha ->
+        # x_center, y_center, z_center, w, l, h, alpha
+        shift = torch.stack(((bbox_pred[:, 1] - bbox_pred[:, 0]) / 2,
+                             (bbox_pred[:, 3] - bbox_pred[:, 2]) / 2,
+                             (bbox_pred[:, 5] - bbox_pred[:, 4]) / 2),
+                            dim=-1).view(-1, 1, 3)
+        shift = rotation_3d_in_axis(shift, bbox_pred[:, 6], axis=2)[:, 0, :]
+        center = points + shift
+        size = torch.stack(
+            (bbox_pred[:, 0] + bbox_pred[:, 1], bbox_pred[:, 2] +
+             bbox_pred[:, 3], bbox_pred[:, 4] + bbox_pred[:, 5]),
+            dim=-1)
+        return torch.cat((center, size, bbox_pred[:, 6:7]), dim=-1)
+
+    # The function is directly copied from FCAF3DHead.
+    @staticmethod
+    def _get_face_distances(points, boxes):
+        """Calculate distances from point to box faces.
+
+        Args:
+            points (Tensor): Final locations of shape (N_points, N_boxes, 3).
+            boxes (Tensor): 3D boxes of shape (N_points, N_boxes, 7)
+
+        Returns:
+            Tensor: Face distances of shape (N_points, N_boxes, 6),
+                (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max).
+        """
+        shift = torch.stack(
+            (points[..., 0] - boxes[..., 0], points[..., 1] - boxes[..., 1],
+             points[..., 2] - boxes[..., 2]),
+            dim=-1).permute(1, 0, 2)
+        shift = rotation_3d_in_axis(
+            shift, -boxes[0, :, 6], axis=2).permute(1, 0, 2)
+        centers = boxes[..., :3] + shift
+        dx_min = centers[..., 0] - boxes[..., 0] + boxes[..., 3] / 2
+        dx_max = boxes[..., 0] + boxes[..., 3] / 2 - centers[..., 0]
+        dy_min = centers[..., 1] - boxes[..., 1] + boxes[..., 4] / 2
+        dy_max = boxes[..., 1] + boxes[..., 4] / 2 - centers[..., 1]
+        dz_min = centers[..., 2] - boxes[..., 2] + boxes[..., 5] / 2
+        dz_max = boxes[..., 2] + boxes[..., 5] / 2 - centers[..., 2]
+        return torch.stack((dx_min, dx_max, dy_min, dy_max, dz_min, dz_max),
+                           dim=-1)
+
+    # The function is directly copied from FCAF3DHead.
+    @staticmethod
+    def _get_centerness(face_distances):
+        """Compute point centerness w.r.t containing box.
+
+        Args:
+            face_distances (Tensor): Face distances of shape (B, N, 6),
+                (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max).
+
+        Returns:
+            Tensor: Centerness of shape (B, N).
+        """
+        x_dims = face_distances[..., [0, 1]]
+        y_dims = face_distances[..., [2, 3]]
+        z_dims = face_distances[..., [4, 5]]
+        centerness_targets = x_dims.min(dim=-1)[0] / x_dims.max(dim=-1)[0] * \
+            y_dims.min(dim=-1)[0] / y_dims.max(dim=-1)[0] * \
+            z_dims.min(dim=-1)[0] / z_dims.max(dim=-1)[0]
+        return torch.sqrt(centerness_targets)
+
+    # The function is directly copied from FCAF3DHead.
+    @torch.no_grad()
+    def _get_targets(self, points, gt_bboxes, gt_labels):
+        """Compute targets for final locations for a single scene.
+
+        Args:
+            points (list[Tensor]): Final locations for all levels.
+            gt_bboxes (BaseInstance3DBoxes): Ground truth boxes.
+            gt_labels (Tensor): Ground truth labels.
+
+        Returns:
+            tuple[Tensor]: Centerness, bbox and classification
+                targets for all locations.
+        """
+        float_max = points[0].new_tensor(1e8)
+        n_levels = len(points)
+        levels = torch.cat([
+            points[i].new_tensor(i).expand(len(points[i]))
+            for i in range(len(points))
+        ])
+        points = torch.cat(points)
+        gt_bboxes = gt_bboxes.to(points.device)
+        n_points = len(points)
+        n_boxes = len(gt_bboxes)
+        volumes = gt_bboxes.volume.unsqueeze(0).expand(n_points, n_boxes)
+
+        # condition 1: point inside box
+        boxes = torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
+                          dim=1)
+        boxes = boxes.expand(n_points, n_boxes, 7)
+        points = points.unsqueeze(1).expand(n_points, n_boxes, 3)
+        face_distances = self._get_face_distances(points, boxes)
+        inside_box_condition = face_distances.min(dim=-1).values > 0
+
+        # condition 2: positive points per level >= limit
+        # calculate positive points per scale
+        n_pos_points_per_level = []
+        for i in range(n_levels):
+            n_pos_points_per_level.append(
+                torch.sum(inside_box_condition[levels == i], dim=0))
+        # find best level
+        n_pos_points_per_level = torch.stack(n_pos_points_per_level, dim=0)
+        lower_limit_mask = n_pos_points_per_level < self.pts_assign_threshold
+        lower_index = torch.argmax(lower_limit_mask.int(), dim=0) - 1
+        lower_index = torch.where(lower_index < 0, 0, lower_index)
+        all_upper_limit_mask = torch.all(
+            torch.logical_not(lower_limit_mask), dim=0)
+        best_level = torch.where(all_upper_limit_mask, n_levels - 1,
+                                 lower_index)
+        # keep only points with best level
+        best_level = best_level.expand(n_points, n_boxes)
+        levels = torch.unsqueeze(levels, 1).expand(n_points, n_boxes)
+        level_condition = best_level == levels
+
+        # condition 3: limit topk points per box by centerness
+        centerness = self._get_centerness(face_distances)
+        centerness = torch.where(inside_box_condition, centerness,
+                                 torch.ones_like(centerness) * -1)
+        centerness = torch.where(level_condition, centerness,
+                                 torch.ones_like(centerness) * -1)
+        top_centerness = torch.topk(
+            centerness,
+            min(self.pts_center_threshold + 1, len(centerness)),
+            dim=0).values[-1]
+        topk_condition = centerness > top_centerness.unsqueeze(0)
+
+        # condition 4: min volume box per point
+        volumes = torch.where(inside_box_condition, volumes, float_max)
+        volumes = torch.where(level_condition, volumes, float_max)
+        volumes = torch.where(topk_condition, volumes, float_max)
+        min_volumes, min_inds = volumes.min(dim=1)
+
+        center_targets = centerness[torch.arange(n_points), min_inds]
+        bbox_targets = boxes[torch.arange(n_points), min_inds]
+        if not gt_bboxes.with_yaw:
+            bbox_targets = bbox_targets[:, :-1]
+        cls_targets = gt_labels[min_inds]
+        cls_targets = torch.where(min_volumes == float_max, -1, cls_targets)
+        return center_targets, bbox_targets, cls_targets
+
+    # Originally ImVoxelNet utilizes 2d nms as mmdetection3d didn't
+    # support 3d nms. But since mmcv==1.5.2 we simply use nms3d here.
+    # The function is directly copied from FCAF3DHead.
+    def _single_scene_multiclass_nms(self, bboxes, scores, input_meta):
+        """Multi-class nms for a single scene.
+
+        Args:
+            bboxes (Tensor): Predicted boxes of shape (N_boxes, 6) or
+                (N_boxes, 7).
+            scores (Tensor): Predicted scores of shape (N_boxes, N_classes).
+            input_meta (dict): Scene meta data.
+
+        Returns:
+            tuple[Tensor]: Predicted bboxes, scores and labels.
+        """
+        n_classes = scores.shape[1]
+        with_yaw = bboxes.shape[1] == 7
+        nms_bboxes, nms_scores, nms_labels = [], [], []
+        for i in range(n_classes):
+            ids = scores[:, i] > self.test_cfg.score_thr
+            if not ids.any():
+                continue
+
+            class_scores = scores[ids, i]
+            class_bboxes = bboxes[ids]
+            if with_yaw:
+                nms_function = nms3d
+            else:
+                class_bboxes = torch.cat(
+                    (class_bboxes, torch.zeros_like(class_bboxes[:, :1])),
+                    dim=1)
+                nms_function = nms3d_normal
+
+            nms_ids = nms_function(class_bboxes, class_scores,
+                                   self.test_cfg.iou_thr)
+            nms_bboxes.append(class_bboxes[nms_ids])
+            nms_scores.append(class_scores[nms_ids])
+            nms_labels.append(
+                bboxes.new_full(
+                    class_scores[nms_ids].shape, i, dtype=torch.long))
+
+        if len(nms_bboxes):
+            nms_bboxes = torch.cat(nms_bboxes, dim=0)
+            nms_scores = torch.cat(nms_scores, dim=0)
+            nms_labels = torch.cat(nms_labels, dim=0)
+        else:
+            nms_bboxes = bboxes.new_zeros((0, bboxes.shape[1]))
+            nms_scores = bboxes.new_zeros((0, ))
+            nms_labels = bboxes.new_zeros((0, ))
+
+        if with_yaw:
+            box_dim = 7
+        else:
+            box_dim = 6
+            nms_bboxes = nms_bboxes[:, :box_dim]
+
+        return nms_bboxes, nms_scores, nms_labels
diff --git a/mmde/mmdet3d/models/dense_heads/monoflex_head.py b/mmde/mmdet3d/models/dense_heads/monoflex_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..72f0257161279fe34ff40cac2e390ded94b56282
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/monoflex_head.py
@@ -0,0 +1,804 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple, Union
+
+import torch
+from mmdet.models.utils import (gaussian_radius, gen_gaussian_target,
+                                multi_apply)
+from mmdet.models.utils.gaussian_target import (get_local_maximum,
+                                                get_topk_from_heatmap,
+                                                transpose_and_gather_feat)
+from mmengine.config import ConfigDict
+from mmengine.model import xavier_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models.layers import EdgeFusionModule
+from mmdet3d.models.task_modules.builder import build_bbox_coder
+from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices,
+                                  get_ellip_gaussian_2D, get_keypoints,
+                                  handle_proj_objs)
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from .anchor_free_mono3d_head import AnchorFreeMono3DHead
+
+
+@MODELS.register_module()
+class MonoFlexHead(AnchorFreeMono3DHead):
+    r"""MonoFlex head used in `MonoFlex <https://arxiv.org/abs/2104.02323>`_
+
+    .. code-block:: none
+
+                / --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls
+                |
+                | --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox
+                |
+                | --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets
+                |
+                | --> 3 x 3 conv --> 1 x 1 conv -->  keypoints offsets
+                |
+                | --> 3 x 3 conv --> 1 x 1 conv -->  keypoints uncertainty
+        feature
+                | --> 3 x 3 conv --> 1 x 1 conv -->  keypoints uncertainty
+                |
+                | --> 3 x 3 conv --> 1 x 1 conv -->   3d dimensions
+                |
+                |                  |--- 1 x 1 conv -->  ori cls
+                | --> 3 x 3 conv --|
+                |                  |--- 1 x 1 conv -->  ori offsets
+                |
+                | --> 3 x 3 conv --> 1 x 1 conv -->  depth
+                |
+                \ --> 3 x 3 conv --> 1 x 1 conv -->  depth uncertainty
+
+    Args:
+        use_edge_fusion (bool): Whether to use edge fusion module while
+            feature extraction.
+        edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion.
+        edge_heatmap_ratio (float): Ratio of generating target heatmap.
+        filter_outside_objs (bool, optional): Whether to filter the
+            outside objects. Default: True.
+        loss_cls (dict, optional): Config of classification loss.
+            Default: loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0).
+        loss_bbox (dict, optional): Config of localization loss.
+            Default: loss_bbox=dict(type='IOULoss', loss_weight=10.0).
+        loss_dir (dict, optional): Config of direction classification loss.
+            Default: dict(type='MultibinLoss', loss_weight=0.1).
+        loss_keypoints (dict, optional): Config of keypoints loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_dims: (dict, optional): Config of dimensions loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_offsets_2d: (dict, optional): Config of offsets_2d loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_direct_depth: (dict, optional): Config of directly regression depth loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_keypoints_depth: (dict, optional): Config of keypoints decoded depth loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_combined_depth: (dict, optional): Config of combined depth loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_attr (dict, optional): Config of attribute classification loss.
+            In MonoFlex, Default: None.
+        bbox_coder (dict, optional): Bbox coder for encoding and decoding boxes.
+            Default: dict(type='MonoFlexCoder', code_size=7).
+        norm_cfg (dict, optional): Dictionary to construct and config norm layer.
+            Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
+        init_cfg (dict): Initialization config dict. Default: None.
+    """  # noqa: E501
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 use_edge_fusion: bool,
+                 edge_fusion_inds: List[Tuple],
+                 edge_heatmap_ratio: float,
+                 filter_outside_objs: bool = True,
+                 loss_cls: dict = dict(
+                     type='mmdet.GaussianFocalLoss', loss_weight=1.0),
+                 loss_bbox: dict = dict(type='mmdet.IoULoss', loss_weight=0.1),
+                 loss_dir: dict = dict(type='MultiBinLoss', loss_weight=0.1),
+                 loss_keypoints: dict = dict(
+                     type='mmdet.L1Loss', loss_weight=0.1),
+                 loss_dims: dict = dict(type='mmdet.L1Loss', loss_weight=0.1),
+                 loss_offsets_2d: dict = dict(
+                     type='mmdet.L1Loss', loss_weight=0.1),
+                 loss_direct_depth: dict = dict(
+                     type='mmdet.L1Loss', loss_weight=0.1),
+                 loss_keypoints_depth: dict = dict(
+                     type='mmdet.L1Loss', loss_weight=0.1),
+                 loss_combined_depth: dict = dict(
+                     type='mmdet.L1Loss', loss_weight=0.1),
+                 loss_attr: Optional[dict] = None,
+                 bbox_coder: dict = dict(type='MonoFlexCoder', code_size=7),
+                 norm_cfg: Union[ConfigDict, dict] = dict(type='BN'),
+                 init_cfg: Optional[Union[ConfigDict, dict]] = None,
+                 init_bias: float = -2.19,
+                 **kwargs) -> None:
+        self.use_edge_fusion = use_edge_fusion
+        self.edge_fusion_inds = edge_fusion_inds
+        super().__init__(
+            num_classes,
+            in_channels,
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_dir=loss_dir,
+            loss_attr=loss_attr,
+            norm_cfg=norm_cfg,
+            init_cfg=init_cfg,
+            **kwargs)
+        self.filter_outside_objs = filter_outside_objs
+        self.edge_heatmap_ratio = edge_heatmap_ratio
+        self.init_bias = init_bias
+        self.loss_dir = MODELS.build(loss_dir)
+        self.loss_keypoints = MODELS.build(loss_keypoints)
+        self.loss_dims = MODELS.build(loss_dims)
+        self.loss_offsets_2d = MODELS.build(loss_offsets_2d)
+        self.loss_direct_depth = MODELS.build(loss_direct_depth)
+        self.loss_keypoints_depth = MODELS.build(loss_keypoints_depth)
+        self.loss_combined_depth = MODELS.build(loss_combined_depth)
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+
+    def _init_edge_module(self):
+        """Initialize edge fusion module for feature extraction."""
+        self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256)
+        for i in range(len(self.edge_fusion_inds)):
+            reg_inds, out_inds = self.edge_fusion_inds[i]
+            out_channels = self.group_reg_dims[reg_inds][out_inds]
+            fusion_layer = EdgeFusionModule(out_channels, 256)
+            layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}'
+            self.add_module(layer_name, fusion_layer)
+
+    def init_weights(self):
+        """Initialize weights."""
+        super().init_weights()
+        self.conv_cls.bias.data.fill_(self.init_bias)
+        xavier_init(self.conv_regs[4][0], gain=0.01)
+        xavier_init(self.conv_regs[7][0], gain=0.01)
+        for m in self.conv_regs.modules():
+            if isinstance(m, nn.Conv2d):
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+    def _init_predictor(self):
+        """Initialize predictor layers of the head."""
+        self.conv_cls_prev = self._init_branch(
+            conv_channels=self.cls_branch,
+            conv_strides=(1, ) * len(self.cls_branch))
+        self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels,
+                                  1)
+        # init regression head
+        self.conv_reg_prevs = nn.ModuleList()
+        # init output head
+        self.conv_regs = nn.ModuleList()
+        # group_reg_dims:
+        # ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, ))
+        for i in range(len(self.group_reg_dims)):
+            reg_dims = self.group_reg_dims[i]
+            reg_branch_channels = self.reg_branch[i]
+            out_channel = self.out_channels[i]
+            reg_list = nn.ModuleList()
+            if len(reg_branch_channels) > 0:
+                self.conv_reg_prevs.append(
+                    self._init_branch(
+                        conv_channels=reg_branch_channels,
+                        conv_strides=(1, ) * len(reg_branch_channels)))
+                for reg_dim in reg_dims:
+                    reg_list.append(nn.Conv2d(out_channel, reg_dim, 1))
+                self.conv_regs.append(reg_list)
+            else:
+                self.conv_reg_prevs.append(None)
+                for reg_dim in reg_dims:
+                    reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1))
+                self.conv_regs.append(reg_list)
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        self._init_predictor()
+        if self.use_edge_fusion:
+            self._init_edge_module()
+
+    def loss(self, x: List[Tensor], batch_data_samples: List[Det3DDataSample],
+             **kwargs):
+        """
+        Args:
+            x (list[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each image and corresponding
+                annotations.
+            proposal_cfg (mmengine.Config, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+
+        Returns:
+            tuple or Tensor: When `proposal_cfg` is None, the detector is a \
+            normal one-stage detector, The return value is the losses.
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+
+            When the `proposal_cfg` is not None, the head is used as a
+            `rpn_head`, the return value is a tuple contains:
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+            - results_list (list[:obj:`InstanceData`]): Detection
+              results of each image after the post process.
+              Each item usually contains following keys.
+
+                - scores (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes (:obj:`BaseInstance3DBoxes`): Contains a tensor
+                  with shape (num_instances, C), the last dimension C of a
+                  3D box is (x, y, z, x_size, y_size, z_size, yaw, ...), where
+                  C >= 7. C = 7 for kitti and C = 9 for nuscenes with extra 2
+                  dims of velocity.
+        """
+
+        batch_gt_instances_3d = []
+        batch_gt_instances = []
+        batch_gt_instances_ignore = []
+        batch_img_metas = []
+        for data_sample in batch_data_samples:
+            batch_img_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances.append(data_sample.gt_instances)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+
+        # monoflex head needs img_metas for feature extraction
+        outs = self(x, batch_img_metas)
+        loss_inputs = outs + (batch_gt_instances_3d, batch_img_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss(*loss_inputs)
+
+        return losses
+
+    def forward(self, feats: List[Tensor], batch_img_metas: List[dict]):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (list[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Box scores for each scale level,
+                    each is a 4D-tensor, the channel number is
+                    num_points * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * bbox_code_size.
+        """
+        mlvl_batch_img_metas = [batch_img_metas for i in range(len(feats))]
+        return multi_apply(self.forward_single, feats, mlvl_batch_img_metas)
+
+    def forward_single(self, x: Tensor, batch_img_metas: List[dict]):
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): Feature maps from a specific FPN feature level.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+        Returns:
+            tuple: Scores for each class, bbox predictions.
+        """
+        img_h, img_w = batch_img_metas[0]['pad_shape'][:2]
+        batch_size, _, feat_h, feat_w = x.shape
+        downsample_ratio = img_h / feat_h
+
+        for conv_cls_prev_layer in self.conv_cls_prev:
+            cls_feat = conv_cls_prev_layer(x)
+        out_cls = self.conv_cls(cls_feat)
+
+        if self.use_edge_fusion:
+            # calculate the edge indices for the batch data
+            edge_indices_list = get_edge_indices(
+                batch_img_metas, downsample_ratio, device=x.device)
+            edge_lens = [
+                edge_indices.shape[0] for edge_indices in edge_indices_list
+            ]
+            max_edge_len = max(edge_lens)
+            edge_indices = x.new_zeros((batch_size, max_edge_len, 2),
+                                       dtype=torch.long)
+            for i in range(batch_size):
+                edge_indices[i, :edge_lens[i]] = edge_indices_list[i]
+            # cls feature map edge fusion
+            out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices,
+                                         edge_lens, feat_h, feat_w)
+
+        bbox_pred = []
+
+        for i in range(len(self.group_reg_dims)):
+            reg_feat = x.clone()
+            # feature regression head
+            if len(self.reg_branch[i]) > 0:
+                for conv_reg_prev_layer in self.conv_reg_prevs[i]:
+                    reg_feat = conv_reg_prev_layer(reg_feat)
+
+            for j, conv_reg in enumerate(self.conv_regs[i]):
+                out_reg = conv_reg(reg_feat)
+                #  Use Edge Fusion Module
+                if self.use_edge_fusion and (i, j) in self.edge_fusion_inds:
+                    # reg feature map edge fusion
+                    out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format(
+                        i, j))(reg_feat, out_reg, edge_indices, edge_lens,
+                               feat_h, feat_w)
+                bbox_pred.append(out_reg)
+
+        bbox_pred = torch.cat(bbox_pred, dim=1)
+        cls_score = out_cls.sigmoid()  # turn to 0-1
+        cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4)
+
+        return cls_score, bbox_pred
+
+    def predict_by_feat(self, cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor], batch_img_metas: List[dict]):
+        """Generate bboxes from bbox head predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+            bbox_preds (list[Tensor]): Box regression for each scale.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+        Returns:
+            list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]:
+                Each item in result_list is 4-tuple.
+        """
+        assert len(cls_scores) == len(bbox_preds) == 1
+        cam2imgs = torch.stack([
+            cls_scores[0].new_tensor(input_meta['cam2img'])
+            for input_meta in batch_img_metas
+        ])
+        batch_bboxes, batch_scores, batch_topk_labels = self._decode_heatmap(
+            cls_scores[0],
+            bbox_preds[0],
+            batch_img_metas,
+            cam2imgs=cam2imgs,
+            topk=100,
+            kernel=3)
+
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+
+            bboxes = batch_bboxes[img_id]
+            scores = batch_scores[img_id]
+            labels = batch_topk_labels[img_id]
+
+            keep_idx = scores > 0.25
+            bboxes = bboxes[keep_idx]
+            scores = scores[keep_idx]
+            labels = labels[keep_idx]
+
+            bboxes = batch_img_metas[img_id]['box_type_3d'](
+                bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
+            attrs = None
+
+            results = InstanceData()
+            results.bboxes_3d = bboxes
+            results.scores_3d = scores
+            results.labels_3d = labels
+
+            if attrs is not None:
+                results.attr_labels = attrs
+
+            result_list.append(results)
+
+        return result_list
+
+    def _decode_heatmap(self,
+                        cls_score: Tensor,
+                        reg_pred: Tensor,
+                        batch_img_metas: List[dict],
+                        cam2imgs: Tensor,
+                        topk: int = 100,
+                        kernel: int = 3):
+        """Transform outputs into detections raw bbox predictions.
+
+        Args:
+            class_score (Tensor): Center predict heatmap,
+                shape (B, num_classes, H, W).
+            reg_pred (Tensor): Box regression map.
+                shape (B, channel, H , W).
+            batch_img_metas (List[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            cam2imgs (Tensor): Camera intrinsic matrix.
+                shape (N, 4, 4)
+            topk (int, optional): Get top k center keypoints from heatmap.
+                Default 100.
+            kernel (int, optional): Max pooling kernel for extract local
+                maximum pixels. Default 3.
+
+        Returns:
+            tuple[torch.Tensor]: Decoded output of SMOKEHead, containing
+               the following Tensors:
+              - batch_bboxes (Tensor): Coords of each 3D box.
+                    shape (B, k, 7)
+              - batch_scores (Tensor): Scores of each 3D box.
+                    shape (B, k)
+              - batch_topk_labels (Tensor): Categories of each 3D box.
+                    shape (B, k)
+        """
+        img_h, img_w = batch_img_metas[0]['pad_shape'][:2]
+        batch_size, _, feat_h, feat_w = cls_score.shape
+
+        downsample_ratio = img_h / feat_h
+        center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)
+
+        *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
+            center_heatmap_pred, k=topk)
+        batch_scores, batch_index, batch_topk_labels = batch_dets
+
+        regression = transpose_and_gather_feat(reg_pred, batch_index)
+        regression = regression.view(-1, 8)
+
+        pred_base_centers_2d = torch.cat(
+            [topk_xs.view(-1, 1),
+             topk_ys.view(-1, 1).float()], dim=1)
+        preds = self.bbox_coder.decode(regression, batch_topk_labels,
+                                       downsample_ratio, cam2imgs)
+        pred_locations = self.bbox_coder.decode_location(
+            pred_base_centers_2d, preds['offsets_2d'], preds['combined_depth'],
+            cam2imgs, downsample_ratio)
+        pred_yaws = self.bbox_coder.decode_orientation(
+            preds['orientations']).unsqueeze(-1)
+        pred_dims = preds['dimensions']
+        batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws), dim=1)
+        batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size)
+        return batch_bboxes, batch_scores, batch_topk_labels
+
+    def get_predictions(self, pred_reg, labels3d, centers_2d, reg_mask,
+                        batch_indices, batch_img_metas, downsample_ratio):
+        """Prepare predictions for computing loss.
+
+        Args:
+            pred_reg (Tensor): Box regression map.
+                shape (B, channel, H , W).
+            labels3d (Tensor): Labels of each 3D box.
+                shape (B * max_objs, )
+            centers_2d (Tensor): Coords of each projected 3D box
+                center on image. shape (N, 2)
+            reg_mask (Tensor): Indexes of the existence of the 3D box.
+                shape (B * max_objs, )
+            batch_indices (Tenosr): Batch indices of the 3D box.
+                shape (N, 3)
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            downsample_ratio (int): The stride of feature map.
+
+        Returns:
+            dict: The predictions for computing loss.
+        """
+        batch, channel = pred_reg.shape[0], pred_reg.shape[1]
+        w = pred_reg.shape[3]
+        cam2imgs = torch.stack([
+            centers_2d.new_tensor(img_meta['cam2img'])
+            for img_meta in batch_img_metas
+        ])
+        # (batch_size, 4, 4) -> (N, 4, 4)
+        cam2imgs = cam2imgs[batch_indices, :, :]
+        centers_2d_inds = centers_2d[:, 1] * w + centers_2d[:, 0]
+        centers_2d_inds = centers_2d_inds.view(batch, -1)
+        pred_regression = transpose_and_gather_feat(pred_reg, centers_2d_inds)
+        pred_regression_pois = pred_regression.view(-1, channel)[reg_mask]
+        preds = self.bbox_coder.decode(pred_regression_pois, labels3d,
+                                       downsample_ratio, cam2imgs)
+
+        return preds
+
+    def get_targets(self, batch_gt_instances_3d: List[InstanceData],
+                    batch_gt_instances: List[InstanceData],
+                    feat_shape: Tuple[int], batch_img_metas: List[dict]):
+        """Get training targets for batch images.
+``
+        Args:
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、
+                ``labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes``、``labels``.
+            feat_shape (tuple[int]): Feature map shape with value,
+                shape (B, _, H, W).
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+
+        Returns:
+            tuple[Tensor, dict]: The Tensor value is the targets of
+                center heatmap, the dict has components below:
+              - base_centers_2d_target (Tensor): Coords of each projected
+                    3D box center on image. shape (B * max_objs, 2),
+                    [dtype: int]
+              - labels3d (Tensor): Labels of each 3D box.
+                    shape (N, )
+              - reg_mask (Tensor): Mask of the existence of the 3D box.
+                    shape (B * max_objs, )
+              - batch_indices (Tensor): Batch id of the 3D box.
+                    shape (N, )
+              - depth_target (Tensor): Depth target of each 3D box.
+                    shape (N, )
+              - keypoints2d_target (Tensor): Keypoints of each projected 3D box
+                    on image. shape (N, 10, 2)
+              - keypoints_mask (Tensor): Keypoints mask of each projected 3D
+                    box on image. shape (N, 10)
+              - keypoints_depth_mask (Tensor): Depths decoded from keypoints
+                    of each 3D box. shape (N, 3)
+              - orientations_target (Tensor): Orientation (encoded local yaw)
+                    target of each 3D box. shape (N, )
+              - offsets_2d_target (Tensor): Offsets target of each projected
+                    3D box. shape (N, 2)
+              - dimensions_target (Tensor): Dimensions target of each 3D box.
+                    shape (N, 3)
+              - downsample_ratio (int): The stride of feature map.
+        """
+
+        gt_bboxes_list = [
+            gt_instances.bboxes for gt_instances in batch_gt_instances
+        ]
+        gt_labels_list = [
+            gt_instances.labels for gt_instances in batch_gt_instances
+        ]
+        gt_bboxes_3d_list = [
+            gt_instances_3d.bboxes_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        gt_labels_3d_list = [
+            gt_instances_3d.labels_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        centers_2d_list = [
+            gt_instances_3d.centers_2d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        depths_list = [
+            gt_instances_3d.depths for gt_instances_3d in batch_gt_instances_3d
+        ]
+
+        img_h, img_w = batch_img_metas[0]['pad_shape'][:2]
+        batch_size, _, feat_h, feat_w = feat_shape
+
+        width_ratio = float(feat_w / img_w)  # 1/4
+        height_ratio = float(feat_h / img_h)  # 1/4
+
+        assert width_ratio == height_ratio
+
+        # Whether to filter the objects which are not in FOV.
+        if self.filter_outside_objs:
+            filter_outside_objs(gt_bboxes_list, gt_labels_list,
+                                gt_bboxes_3d_list, gt_labels_3d_list,
+                                centers_2d_list, batch_img_metas)
+
+        # transform centers_2d to base centers_2d for regression and
+        # heatmap generation.
+        # centers_2d = int(base_centers_2d) + offsets_2d
+        base_centers_2d_list, offsets_2d_list, trunc_mask_list = \
+            handle_proj_objs(centers_2d_list, gt_bboxes_list, batch_img_metas)
+
+        keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \
+            get_keypoints(gt_bboxes_3d_list, centers_2d_list, batch_img_metas)
+
+        center_heatmap_target = gt_bboxes_list[-1].new_zeros(
+            [batch_size, self.num_classes, feat_h, feat_w])
+
+        for batch_id in range(batch_size):
+            # project gt_bboxes from input image to feat map
+            gt_bboxes = gt_bboxes_list[batch_id] * width_ratio
+            gt_labels = gt_labels_list[batch_id]
+
+            # project base centers_2d from input image to feat map
+            gt_base_centers_2d = base_centers_2d_list[batch_id] * width_ratio
+            trunc_masks = trunc_mask_list[batch_id]
+
+            for j, base_center2d in enumerate(gt_base_centers_2d):
+                if trunc_masks[j]:
+                    # for outside objects, generate ellipse heatmap
+                    base_center2d_x_int, base_center2d_y_int = \
+                        base_center2d.int()
+                    scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0],
+                                      gt_bboxes[j][2] - base_center2d_x_int)
+                    scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1],
+                                      gt_bboxes[j][3] - base_center2d_y_int)
+                    radius_x = scale_box_w * self.edge_heatmap_ratio
+                    radius_y = scale_box_h * self.edge_heatmap_ratio
+                    radius_x, radius_y = max(0, int(radius_x)), max(
+                        0, int(radius_y))
+                    assert min(radius_x, radius_y) == 0
+                    ind = gt_labels[j]
+                    get_ellip_gaussian_2D(
+                        center_heatmap_target[batch_id, ind],
+                        [base_center2d_x_int, base_center2d_y_int], radius_x,
+                        radius_y)
+                else:
+                    base_center2d_x_int, base_center2d_y_int = \
+                        base_center2d.int()
+                    scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1])
+                    scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0])
+                    radius = gaussian_radius([scale_box_h, scale_box_w],
+                                             min_overlap=0.7)
+                    radius = max(0, int(radius))
+                    ind = gt_labels[j]
+                    gen_gaussian_target(
+                        center_heatmap_target[batch_id, ind],
+                        [base_center2d_x_int, base_center2d_y_int], radius)
+
+        avg_factor = max(1, center_heatmap_target.eq(1).sum())
+        num_ctrs = [centers_2d.shape[0] for centers_2d in centers_2d_list]
+        max_objs = max(num_ctrs)
+        batch_indices = [
+            centers_2d_list[0].new_full((num_ctrs[i], ), i)
+            for i in range(batch_size)
+        ]
+        batch_indices = torch.cat(batch_indices, dim=0)
+        reg_mask = torch.zeros(
+            (batch_size, max_objs),
+            dtype=torch.bool).to(base_centers_2d_list[0].device)
+        gt_bboxes_3d = batch_img_metas[0]['box_type_3d'].cat(gt_bboxes_3d_list)
+        gt_bboxes_3d = gt_bboxes_3d.to(base_centers_2d_list[0].device)
+
+        # encode original local yaw to multibin format
+        orienations_target = self.bbox_coder.encode(gt_bboxes_3d)
+
+        batch_base_centers_2d = base_centers_2d_list[0].new_zeros(
+            (batch_size, max_objs, 2))
+
+        for i in range(batch_size):
+            reg_mask[i, :num_ctrs[i]] = 1
+            batch_base_centers_2d[i, :num_ctrs[i]] = base_centers_2d_list[i]
+
+        flatten_reg_mask = reg_mask.flatten()
+
+        # transform base centers_2d from input scale to output scale
+        batch_base_centers_2d = batch_base_centers_2d.view(-1, 2) * width_ratio
+
+        dimensions_target = gt_bboxes_3d.tensor[:, 3:6]
+        labels_3d = torch.cat(gt_labels_3d_list)
+        keypoints2d_target = torch.cat(keypoints2d_list)
+        keypoints_mask = torch.cat(keypoints_mask_list)
+        keypoints_depth_mask = torch.cat(keypoints_depth_mask_list)
+        offsets_2d_target = torch.cat(offsets_2d_list)
+        bboxes2d = torch.cat(gt_bboxes_list)
+
+        # transform FCOS style bbox into [x1, y1, x2, y2] format.
+        bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]],
+                                    dim=-1)
+        depths = torch.cat(depths_list)
+
+        target_labels = dict(
+            base_centers_2d_target=batch_base_centers_2d.int(),
+            labels3d=labels_3d,
+            reg_mask=flatten_reg_mask,
+            batch_indices=batch_indices,
+            bboxes2d_target=bboxes2d_target,
+            depth_target=depths,
+            keypoints2d_target=keypoints2d_target,
+            keypoints_mask=keypoints_mask,
+            keypoints_depth_mask=keypoints_depth_mask,
+            orienations_target=orienations_target,
+            offsets_2d_target=offsets_2d_target,
+            dimensions_target=dimensions_target,
+            downsample_ratio=1 / width_ratio)
+
+        return center_heatmap_target, avg_factor, target_labels
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances_3d: List[InstanceData],
+            batch_gt_instances: List[InstanceData],
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: Optional[List[InstanceData]] = None):
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+                shape (num_gt, 4).
+            bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel
+                number is bbox_code_size.
+                shape (B, 7, H, W).
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、
+                ``labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes``、``labels``.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds) == 1
+        assert batch_gt_instances_ignore is None
+        center2d_heatmap = cls_scores[0]
+        pred_reg = bbox_preds[0]
+
+        center2d_heatmap_target, avg_factor, target_labels = \
+            self.get_targets(batch_gt_instances_3d,
+                             batch_gt_instances,
+                             center2d_heatmap.shape,
+                             batch_img_metas)
+
+        preds = self.get_predictions(
+            pred_reg=pred_reg,
+            labels3d=target_labels['labels3d'],
+            centers_2d=target_labels['base_centers_2d_target'],
+            reg_mask=target_labels['reg_mask'],
+            batch_indices=target_labels['batch_indices'],
+            batch_img_metas=batch_img_metas,
+            downsample_ratio=target_labels['downsample_ratio'])
+
+        # heatmap loss
+        loss_cls = self.loss_cls(
+            center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)
+
+        # bbox2d regression loss
+        loss_bbox = self.loss_bbox(preds['bboxes2d'],
+                                   target_labels['bboxes2d_target'])
+
+        # keypoints loss, the keypoints in predictions and target are all
+        # local coordinates. Check the mask dtype should be bool, not int
+        # or float to ensure the indexing is bool index
+        keypoints2d_mask = target_labels['keypoints2d_mask']
+        loss_keypoints = self.loss_keypoints(
+            preds['keypoints2d'][keypoints2d_mask],
+            target_labels['keypoints2d_target'][keypoints2d_mask])
+
+        # orientations loss
+        loss_dir = self.loss_dir(preds['orientations'],
+                                 target_labels['orientations_target'])
+
+        # dimensions loss
+        loss_dims = self.loss_dims(preds['dimensions'],
+                                   target_labels['dimensions_target'])
+
+        # offsets for center heatmap
+        loss_offsets_2d = self.loss_offsets_2d(
+            preds['offsets_2d'], target_labels['offsets_2d_target'])
+
+        # directly regressed depth loss with direct depth uncertainty loss
+        direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty'])
+        loss_weight_1 = self.loss_direct_depth.loss_weight
+        loss_direct_depth = self.loss_direct_depth(
+            preds['direct_depth'], target_labels['depth_target'],
+            direct_depth_weights)
+        loss_uncertainty_1 =\
+            preds['direct_depth_uncertainty'] * loss_weight_1
+        loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean()
+
+        # keypoints decoded depth loss with keypoints depth uncertainty loss
+        depth_mask = target_labels['keypoints_depth_mask']
+        depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(1, 3)
+        valid_keypoints_depth_uncertainty = preds[
+            'keypoints_depth_uncertainty'][depth_mask]
+        valid_keypoints_depth_weights = torch.exp(
+            -valid_keypoints_depth_uncertainty)
+        loss_keypoints_depth = self.loss_keypoint_depth(
+            preds['keypoints_depth'][depth_mask], depth_target[depth_mask],
+            valid_keypoints_depth_weights)
+        loss_weight_2 = self.loss_keypoints_depth.loss_weight
+        loss_uncertainty_2 =\
+            valid_keypoints_depth_uncertainty * loss_weight_2
+        loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean()
+
+        # combined depth loss for optimiaze the uncertainty
+        loss_combined_depth = self.loss_combined_depth(
+            preds['combined_depth'], target_labels['depth_target'])
+
+        loss_dict = dict(
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_keypoints=loss_keypoints,
+            loss_dir=loss_dir,
+            loss_dims=loss_dims,
+            loss_offsets_2d=loss_offsets_2d,
+            loss_direct_depth=loss_direct_depth,
+            loss_keypoints_depth=loss_keypoints_depth,
+            loss_combined_depth=loss_combined_depth)
+
+        return loss_dict
diff --git a/mmde/mmdet3d/models/dense_heads/parta2_rpn_head.py b/mmde/mmdet3d/models/dense_heads/parta2_rpn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3a31aed21bbfb7eb2d8256cefdbf1a43eeadb50
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/parta2_rpn_head.py
@@ -0,0 +1,398 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+from mmengine import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.models.layers import nms_bev, nms_normal_bev
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import limit_period, xywhr2xyxyr
+from mmdet3d.utils.typing_utils import InstanceList
+from ...structures.det3d_data_sample import SampleList
+from .anchor3d_head import Anchor3DHead
+
+
+@MODELS.register_module()
+class PartA2RPNHead(Anchor3DHead):
+    """RPN head for PartA2.
+
+    Note:
+        The main difference between the PartA2 RPN head and the Anchor3DHead
+        lies in their output during inference. PartA2 RPN head further returns
+        the original classification score for the second stage since the bbox
+        head in RoI head does not do classification task.
+
+        Different from RPN heads in 2D detectors, this RPN head does
+        multi-class classification task and uses FocalLoss like the SECOND and
+        PointPillars do. But this head uses class agnostic nms rather than
+        multi-class nms.
+
+    Args:
+        num_classes (int): Number of classes.
+        in_channels (int): Number of channels in the input feature map.
+        train_cfg (dict): Train configs.
+        test_cfg (dict): Test configs.
+        feat_channels (int): Number of channels of the feature map.
+        use_direction_classifier (bool): Whether to add a direction classifier.
+        anchor_generator(dict): Config dict of anchor generator.
+        assigner_per_size (bool): Whether to do assignment for each separate
+            anchor size.
+        assign_per_class (bool): Whether to do assignment for each class.
+        diff_rad_by_sin (bool): Whether to change the difference into sin
+            difference for box regression loss.
+        dir_offset (float | int): The offset of BEV rotation angles
+            (TODO: may be moved into box coder)
+        dir_limit_offset (float | int): The limited range of BEV
+            rotation angles. (TODO: may be moved into box coder)
+        bbox_coder (dict): Config dict of box coders.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of localization loss.
+        loss_dir (dict): Config of direction classifier loss.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 train_cfg: ConfigDict,
+                 test_cfg: ConfigDict,
+                 feat_channels: int = 256,
+                 use_direction_classifier: bool = True,
+                 anchor_generator: Dict = dict(
+                     type='Anchor3DRangeGenerator',
+                     range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
+                     strides=[2],
+                     sizes=[[3.9, 1.6, 1.56]],
+                     rotations=[0, 1.57],
+                     custom_values=[],
+                     reshape_out=False),
+                 assigner_per_size: bool = False,
+                 assign_per_class: bool = False,
+                 diff_rad_by_sin: bool = True,
+                 dir_offset: float = -np.pi / 2,
+                 dir_limit_offset: float = 0,
+                 bbox_coder: Dict = dict(type='DeltaXYZWLHRBBoxCoder'),
+                 loss_cls: Dict = dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_bbox: Dict = dict(
+                     type='mmdet.SmoothL1Loss',
+                     beta=1.0 / 9.0,
+                     loss_weight=2.0),
+                 loss_dir: Dict = dict(
+                     type='mmdet.CrossEntropyLoss', loss_weight=0.2),
+                 init_cfg: Dict = None) -> None:
+        super().__init__(num_classes, in_channels, feat_channels,
+                         use_direction_classifier, anchor_generator,
+                         assigner_per_size, assign_per_class, diff_rad_by_sin,
+                         dir_offset, dir_limit_offset, bbox_coder, loss_cls,
+                         loss_bbox, loss_dir, train_cfg, test_cfg, init_cfg)
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                dir_cls_pred_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                input_meta: List[dict],
+                                cfg: ConfigDict,
+                                rescale: List[Tensor] = False):
+        """Get bboxes of single branch.
+
+        Args:
+            cls_score_list (torch.Tensor): Class score in single batch.
+            bbox_pred_list (torch.Tensor): Bbox prediction in single batch.
+            dir_cls_pred_list (torch.Tensor): Predictions of direction class
+                in single batch.
+            mlvl_priors (List[torch.Tensor]): Multi-level anchors
+                in single batch.
+            input_meta (list[dict]): Contain pcd and img's meta info.
+            cfg (:obj:`ConfigDict`): Training or testing config.
+            rescale (list[torch.Tensor]): whether th rescale bbox.
+
+        Returns:
+            dict: Predictions of single batch containing the following keys:
+
+            - boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes.
+            - scores_3d (torch.Tensor): Score of each bbox.
+            - labels_3d (torch.Tensor): Label of each bbox.
+            - cls_preds (torch.Tensor): Class score of each bbox.
+        """
+        assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_priors)
+        mlvl_bboxes = []
+        mlvl_max_scores = []
+        mlvl_label_pred = []
+        mlvl_dir_scores = []
+        mlvl_cls_score = []
+        for cls_score, bbox_pred, dir_cls_pred, anchors in zip(
+                cls_score_list, bbox_pred_list, dir_cls_pred_list,
+                mlvl_priors):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:]
+            dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
+            dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
+
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.num_classes)
+
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)
+            bbox_pred = bbox_pred.permute(1, 2,
+                                          0).reshape(-1, self.box_code_size)
+
+            nms_pre = cfg.get('nms_pre', -1)
+            if self.use_sigmoid_cls:
+                max_scores, pred_labels = scores.max(dim=1)
+            else:
+                max_scores, pred_labels = scores[:, :-1].max(dim=1)
+            # get topk
+            if nms_pre > 0 and scores.shape[0] > nms_pre:
+                topk_scores, topk_inds = max_scores.topk(nms_pre)
+                anchors = anchors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                max_scores = topk_scores
+                cls_score = scores[topk_inds, :]
+                dir_cls_score = dir_cls_score[topk_inds]
+                pred_labels = pred_labels[topk_inds]
+
+            bboxes = self.bbox_coder.decode(anchors, bbox_pred)
+            mlvl_bboxes.append(bboxes)
+            mlvl_max_scores.append(max_scores)
+            mlvl_cls_score.append(cls_score)
+            mlvl_label_pred.append(pred_labels)
+            mlvl_dir_scores.append(dir_cls_score)
+
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
+            mlvl_bboxes, box_dim=self.box_code_size).bev)
+        mlvl_max_scores = torch.cat(mlvl_max_scores)
+        mlvl_label_pred = torch.cat(mlvl_label_pred)
+        mlvl_dir_scores = torch.cat(mlvl_dir_scores)
+        # shape [k, num_class] before sigmoid
+        # PartA2 need to keep raw classification score
+        # because the bbox head in the second stage does not have
+        # classification branch,
+        # roi head need this score as classification score
+        mlvl_cls_score = torch.cat(mlvl_cls_score)
+
+        score_thr = cfg.get('score_thr', 0)
+        result = self.class_agnostic_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
+                                         mlvl_max_scores, mlvl_label_pred,
+                                         mlvl_cls_score, mlvl_dir_scores,
+                                         score_thr, cfg, input_meta)
+        return result
+
+    def loss_and_predict(self,
+                         feats_dict: Dict,
+                         batch_data_samples: SampleList,
+                         proposal_cfg: ConfigDict = None,
+                         **kwargs) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+            proposal_cfg (ConfigDict, optional): Proposal config.
+
+        Returns:
+            tuple: the return value is a tuple contains:
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+            - predictions (list[:obj:`InstanceData`]): Detection
+              results of each sample after the post process.
+        """
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+
+        outs = self(feats_dict['neck_feats'])
+
+        loss_inputs = outs + (batch_gt_instances_3d, batch_input_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_input_metas=batch_input_metas, cfg=proposal_cfg)
+        return losses, predictions
+
+    def loss_by_feat(self,
+                     cls_scores: List[Tensor],
+                     bbox_preds: List[Tensor],
+                     dir_cls_preds: List[Tensor],
+                     batch_gt_instances_3d: InstanceList,
+                     batch_input_metas: List[dict],
+                     batch_gt_instances_ignore: InstanceList = None) -> Dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[torch.Tensor]): Multi-level class scores.
+            bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
+            dir_cls_preds (list[torch.Tensor]): Multi-level direction
+                class predictions.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_input_metas (list[dict]): Contain pcd and img's meta info.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, list[torch.Tensor]]: Classification, bbox, and
+                direction losses of each level.
+
+            - loss_rpn_cls (list[torch.Tensor]): Classification losses.
+            - loss_rpn_bbox (list[torch.Tensor]): Box regression losses.
+            - loss_rpn_dir (list[torch.Tensor]): Direction classification
+                losses.
+        """
+        loss_dict = super().loss_by_feat(cls_scores, bbox_preds, dir_cls_preds,
+                                         batch_gt_instances_3d,
+                                         batch_input_metas,
+                                         batch_gt_instances_ignore)
+        # change the loss key names to avoid conflict
+        return dict(
+            loss_rpn_cls=loss_dict['loss_cls'],
+            loss_rpn_bbox=loss_dict['loss_bbox'],
+            loss_rpn_dir=loss_dict['loss_dir'])
+
+    def class_agnostic_nms(self, mlvl_bboxes: Tensor,
+                           mlvl_bboxes_for_nms: Tensor,
+                           mlvl_max_scores: Tensor, mlvl_label_pred: Tensor,
+                           mlvl_cls_score: Tensor, mlvl_dir_scores: Tensor,
+                           score_thr: int, cfg: ConfigDict,
+                           input_meta: dict) -> Dict:
+        """Class agnostic nms for single batch.
+
+        Args:
+            mlvl_bboxes (torch.Tensor): Bboxes from Multi-level.
+            mlvl_bboxes_for_nms (torch.Tensor): Bboxes for nms
+                (bev or minmax boxes) from Multi-level.
+            mlvl_max_scores (torch.Tensor): Max scores of Multi-level bbox.
+            mlvl_label_pred (torch.Tensor): Class predictions
+                of Multi-level bbox.
+            mlvl_cls_score (torch.Tensor): Class scores of
+                Multi-level bbox.
+            mlvl_dir_scores (torch.Tensor): Direction scores of
+                Multi-level bbox.
+            score_thr (int): Score threshold.
+            cfg (:obj:`ConfigDict`): Training or testing config.
+            input_meta (dict): Contain pcd and img's meta info.
+
+        Returns:
+            dict: Predictions of single batch. Contain the keys:
+
+            - boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes.
+            - scores_3d (torch.Tensor): Score of each bbox.
+            - labels_3d (torch.Tensor): Label of each bbox.
+            - cls_preds (torch.Tensor): Class score of each bbox.
+        """
+        bboxes = []
+        scores = []
+        labels = []
+        dir_scores = []
+        cls_scores = []
+        score_thr_inds = mlvl_max_scores > score_thr
+        _scores = mlvl_max_scores[score_thr_inds]
+        _bboxes_for_nms = mlvl_bboxes_for_nms[score_thr_inds, :]
+        if cfg.use_rotate_nms:
+            nms_func = nms_bev
+        else:
+            nms_func = nms_normal_bev
+        selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr)
+
+        _mlvl_bboxes = mlvl_bboxes[score_thr_inds, :]
+        _mlvl_dir_scores = mlvl_dir_scores[score_thr_inds]
+        _mlvl_label_pred = mlvl_label_pred[score_thr_inds]
+        _mlvl_cls_score = mlvl_cls_score[score_thr_inds]
+
+        if len(selected) > 0:
+            bboxes.append(_mlvl_bboxes[selected])
+            scores.append(_scores[selected])
+            labels.append(_mlvl_label_pred[selected])
+            cls_scores.append(_mlvl_cls_score[selected])
+            dir_scores.append(_mlvl_dir_scores[selected])
+            dir_rot = limit_period(bboxes[-1][..., 6] - self.dir_offset,
+                                   self.dir_limit_offset, np.pi)
+            bboxes[-1][..., 6] = (
+                dir_rot + self.dir_offset +
+                np.pi * dir_scores[-1].to(bboxes[-1].dtype))
+
+        if bboxes:
+            bboxes = torch.cat(bboxes, dim=0)
+            scores = torch.cat(scores, dim=0)
+            cls_scores = torch.cat(cls_scores, dim=0)
+            labels = torch.cat(labels, dim=0)
+            if bboxes.shape[0] > cfg.nms_post:
+                _, inds = scores.sort(descending=True)
+                inds = inds[:cfg.nms_post]
+                bboxes = bboxes[inds, :]
+                labels = labels[inds]
+                scores = scores[inds]
+                cls_scores = cls_scores[inds]
+            bboxes = input_meta['box_type_3d'](
+                bboxes, box_dim=self.box_code_size)
+            result = InstanceData()
+            result.bboxes_3d = bboxes
+            result.scores_3d = scores
+            result.labels_3d = labels
+            result.cls_preds = cls_scores
+            return result
+        else:
+            result = InstanceData()
+            result.bboxes_3d = input_meta['box_type_3d'](
+                mlvl_bboxes.new_zeros([0, self.box_code_size]),
+                box_dim=self.box_code_size)
+            result.scores_3d = mlvl_bboxes.new_zeros([0])
+            result.labels_3d = mlvl_bboxes.new_zeros([0])
+            result.cls_preds = mlvl_bboxes.new_zeros(
+                [0, mlvl_cls_score.shape[-1]])
+            return result
+
+    def predict(self, feats_dict: Dict,
+                batch_data_samples: SampleList) -> InstanceList:
+        """Perform forward propagation of the 3D detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        rpn_outs = self(feats_dict['neck_feats'])
+        proposal_cfg = self.test_cfg
+
+        proposal_list = self.predict_by_feat(
+            *rpn_outs, cfg=proposal_cfg, batch_input_metas=batch_input_metas)
+        return proposal_list
diff --git a/mmde/mmdet3d/models/dense_heads/pgd_head.py b/mmde/mmdet3d/models/dense_heads/pgd_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a733b86919c70ad61c97091e8fa2f42ac4ecae7
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/pgd_head.py
@@ -0,0 +1,1239 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmcv.cnn import Scale
+from mmdet.models.utils import multi_apply
+from mmdet.structures.bbox import distance2bbox
+from mmengine.model import bias_init_with_prob, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.models.layers import box3d_multiclass_nms
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import points_cam2img, points_img2cam, xywhr2xyxyr
+from mmdet3d.utils.typing_utils import (ConfigType, InstanceList,
+                                        OptConfigType, OptInstanceList)
+from .fcos_mono3d_head import FCOSMono3DHead
+
+
+@MODELS.register_module()
+class PGDHead(FCOSMono3DHead):
+    r"""Anchor-free head used in `PGD <https://arxiv.org/abs/2107.14160>`_.
+
+    Args:
+        use_depth_classifer (bool, optional): Whether to use depth classifier.
+            Defaults to True.
+        use_only_reg_proj (bool, optional): Whether to use only direct
+            regressed depth in the re-projection (to make the network easier
+            to learn). Defaults to False.
+        weight_dim (int, optional): Dimension of the location-aware weight
+            map. Defaults to -1.
+        weight_branch (tuple[tuple[int]], optional): Feature map channels of
+            the convolutional branch for weight map. Defaults to ((256, ), ).
+        depth_branch (tuple[int], optional): Feature map channels of the
+            branch for probabilistic depth estimation. Defaults to (64, ),
+        depth_range (tuple[float], optional): Range of depth estimation.
+            Defaults to (0, 70),
+        depth_unit (int, optional): Unit of depth range division. Defaults to
+            10.
+        division (str, optional): Depth division method. Options include
+            'uniform', 'linear', 'log', 'loguniform'. Defaults to 'uniform'.
+        depth_bins (int, optional): Discrete bins of depth division. Defaults
+            to 8.
+        loss_depth (dict, optional): Depth loss. Defaults to dict(
+            type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0).
+        loss_bbox2d (dict, optional): Loss for 2D box estimation. Defaults to
+            dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0).
+        loss_consistency (dict, optional): Consistency loss. Defaults to
+            dict(type='GIoULoss', loss_weight=1.0),
+        pred_velo (bool, optional): Whether to predict velocity. Defaults to
+            False.
+        pred_bbox2d (bool, optional): Whether to predict 2D bounding boxes.
+            Defaults to True.
+        pred_keypoints (bool, optional): Whether to predict keypoints.
+            Defaults to False,
+        bbox_coder (dict, optional): Bounding box coder. Defaults to
+            dict(type='PGDBBoxCoder', base_depths=((28.01, 16.32), ),
+            base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56, 1.6)),
+            code_size=7).
+    """
+
+    def __init__(self,
+                 use_depth_classifier: bool = True,
+                 use_onlyreg_proj: bool = False,
+                 weight_dim: int = -1,
+                 weight_branch: Tuple[Tuple] = ((256, ), ),
+                 depth_branch: Tuple = (64, ),
+                 depth_range: Tuple = (0, 70),
+                 depth_unit: int = 10,
+                 division: str = 'uniform',
+                 depth_bins: int = 8,
+                 loss_depth: dict = dict(
+                     type='mmdet.SmoothL1Loss',
+                     beta=1.0 / 9.0,
+                     loss_weight=1.0),
+                 loss_bbox2d: dict = dict(
+                     type='mmdet.SmoothL1Loss',
+                     beta=1.0 / 9.0,
+                     loss_weight=1.0),
+                 loss_consistency: dict = dict(
+                     type='mmdet.GIoULoss', loss_weight=1.0),
+                 pred_bbox2d: bool = True,
+                 pred_keypoints: bool = False,
+                 bbox_coder: dict = dict(
+                     type='PGDBBoxCoder',
+                     base_depths=((28.01, 16.32), ),
+                     base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6),
+                                (3.9, 1.56, 1.6)),
+                     code_size=7),
+                 **kwargs) -> None:
+        self.use_depth_classifier = use_depth_classifier
+        self.use_onlyreg_proj = use_onlyreg_proj
+        self.depth_branch = depth_branch
+        self.pred_keypoints = pred_keypoints
+        self.weight_dim = weight_dim
+        self.weight_branch = weight_branch
+        self.weight_out_channels = []
+        for weight_branch_channels in weight_branch:
+            if len(weight_branch_channels) > 0:
+                self.weight_out_channels.append(weight_branch_channels[-1])
+            else:
+                self.weight_out_channels.append(-1)
+        self.depth_range = depth_range
+        self.depth_unit = depth_unit
+        self.division = division
+        if self.division == 'uniform':
+            self.num_depth_cls = int(
+                (depth_range[1] - depth_range[0]) / depth_unit) + 1
+            if self.num_depth_cls != depth_bins:
+                print('Warning: The number of bins computed from ' +
+                      'depth_unit is different from given parameter! ' +
+                      'Depth_unit will be considered with priority in ' +
+                      'Uniform Division.')
+        else:
+            self.num_depth_cls = depth_bins
+        super().__init__(
+            pred_bbox2d=pred_bbox2d, bbox_coder=bbox_coder, **kwargs)
+        self.loss_depth = MODELS.build(loss_depth)
+        if self.pred_bbox2d:
+            self.loss_bbox2d = MODELS.build(loss_bbox2d)
+            self.loss_consistency = MODELS.build(loss_consistency)
+        if self.pred_keypoints:
+            self.kpts_start = 9 if self.pred_velo else 7
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        super()._init_layers()
+        if self.pred_bbox2d:
+            self.scale_dim += 1
+        if self.pred_keypoints:
+            self.scale_dim += 1
+        self.scales = nn.ModuleList([
+            nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)])
+            for _ in self.strides
+        ])
+
+    def _init_predictor(self):
+        """Initialize predictor layers of the head."""
+        super()._init_predictor()
+
+        if self.use_depth_classifier:
+            self.conv_depth_cls_prev = self._init_branch(
+                conv_channels=self.depth_branch,
+                conv_strides=(1, ) * len(self.depth_branch))
+            self.conv_depth_cls = nn.Conv2d(self.depth_branch[-1],
+                                            self.num_depth_cls, 1)
+            # Data-agnostic single param lambda for local depth fusion
+            self.fuse_lambda = nn.Parameter(torch.tensor(10e-5))
+
+        if self.weight_dim != -1:
+            self.conv_weight_prevs = nn.ModuleList()
+            self.conv_weights = nn.ModuleList()
+            for i in range(self.weight_dim):
+                weight_branch_channels = self.weight_branch[i]
+                weight_out_channel = self.weight_out_channels[i]
+                if len(weight_branch_channels) > 0:
+                    self.conv_weight_prevs.append(
+                        self._init_branch(
+                            conv_channels=weight_branch_channels,
+                            conv_strides=(1, ) * len(weight_branch_channels)))
+                    self.conv_weights.append(
+                        nn.Conv2d(weight_out_channel, 1, 1))
+                else:
+                    self.conv_weight_prevs.append(None)
+                    self.conv_weights.append(
+                        nn.Conv2d(self.feat_channels, 1, 1))
+
+    def init_weights(self):
+        """Initialize weights of the head.
+
+        We currently still use the customized defined init_weights because the
+        default init of DCN triggered by the init_cfg will init
+        conv_offset.weight, which mistakenly affects the training stability.
+        """
+        super().init_weights()
+
+        bias_cls = bias_init_with_prob(0.01)
+        if self.use_depth_classifier:
+            for m in self.conv_depth_cls_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+            normal_init(self.conv_depth_cls, std=0.01, bias=bias_cls)
+
+        if self.weight_dim != -1:
+            for conv_weight_prev in self.conv_weight_prevs:
+                if conv_weight_prev is None:
+                    continue
+                for m in conv_weight_prev:
+                    if isinstance(m.conv, nn.Conv2d):
+                        normal_init(m.conv, std=0.01)
+            for conv_weight in self.conv_weights:
+                normal_init(conv_weight, std=0.01)
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[Tensor, ...]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Box scores for each scale level,
+                    each is a 4D-tensor, the channel number is
+                    num_points * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * bbox_code_size.
+                dir_cls_preds (list[Tensor]): Box scores for direction class
+                    predictions on each scale level, each is a 4D-tensor,
+                    the channel number is num_points * 2. (bin = 2).
+                weight (list[Tensor]): Location-aware weight maps on each
+                    scale level, each is a 4D-tensor, the channel number is
+                    num_points * 1.
+                depth_cls_preds (list[Tensor]): Box scores for depth class
+                    predictions on each scale level, each is a 4D-tensor,
+                    the channel number is num_points * self.num_depth_cls.
+                attr_preds (list[Tensor]): Attribute scores for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * num_attrs.
+                centernesses (list[Tensor]): Centerness for each scale level,
+                    each is a 4D-tensor, the channel number is num_points * 1.
+        """
+        return multi_apply(self.forward_single, x, self.scales, self.strides)
+
+    def forward_single(self, x: Tensor, scale: Scale,
+                       stride: int) -> Tuple[Tensor, ...]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+            stride (int): The corresponding stride for feature maps, only
+                used to normalize the bbox prediction when self.norm_on_bbox
+                is True.
+
+        Returns:
+            tuple: scores for each class, bbox and direction class
+                predictions, depth class predictions, location-aware weights,
+                attribute and centerness predictions of input feature maps.
+        """
+        cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, cls_feat, \
+            reg_feat = super().forward_single(x, scale, stride)
+
+        max_regress_range = stride * self.regress_ranges[0][1] / \
+            self.strides[0]
+        bbox_pred = self.bbox_coder.decode_2d(bbox_pred, scale, stride,
+                                              max_regress_range, self.training,
+                                              self.pred_keypoints,
+                                              self.pred_bbox2d)
+
+        depth_cls_pred = None
+        if self.use_depth_classifier:
+            clone_reg_feat = reg_feat.clone()
+            for conv_depth_cls_prev_layer in self.conv_depth_cls_prev:
+                clone_reg_feat = conv_depth_cls_prev_layer(clone_reg_feat)
+            depth_cls_pred = self.conv_depth_cls(clone_reg_feat)
+
+        weight = None
+        if self.weight_dim != -1:
+            weight = []
+            for i in range(self.weight_dim):
+                clone_reg_feat = reg_feat.clone()
+                if len(self.weight_branch[i]) > 0:
+                    for conv_weight_prev_layer in self.conv_weight_prevs[i]:
+                        clone_reg_feat = conv_weight_prev_layer(clone_reg_feat)
+                weight.append(self.conv_weights[i](clone_reg_feat))
+            weight = torch.cat(weight, dim=1)
+
+        return cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \
+            attr_pred, centerness
+
+    def get_proj_bbox2d(self,
+                        bbox_preds: List[Tensor],
+                        pos_dir_cls_preds: List[Tensor],
+                        labels_3d: List[Tensor],
+                        bbox_targets_3d: List[Tensor],
+                        pos_points: Tensor,
+                        pos_inds: Tensor,
+                        batch_img_metas: List[dict],
+                        pos_depth_cls_preds: Optional[Tensor] = None,
+                        pos_weights: Optional[Tensor] = None,
+                        pos_cls_scores: Optional[Tensor] = None,
+                        with_kpts: bool = False) -> Tuple[Tensor]:
+        """Decode box predictions and get projected 2D attributes.
+
+        Args:
+            bbox_preds (list[Tensor]): Box predictions for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * bbox_code_size.
+            pos_dir_cls_preds (Tensor): Box scores for direction class
+                predictions of positive boxes on all the scale levels in shape
+                (num_pos_points, 2).
+            labels_3d (list[Tensor]): 3D box category labels for each scale
+                level, each is a 4D-tensor.
+            bbox_targets_3d (list[Tensor]): 3D box targets for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * bbox_code_size.
+            pos_points (Tensor): Foreground points.
+            pos_inds (Tensor): Index of foreground points from flattened
+                tensors.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            pos_depth_cls_preds (Tensor, optional): Probabilistic depth map of
+                positive boxes on all the scale levels in shape
+                (num_pos_points, self.num_depth_cls). Defaults to None.
+            pos_weights (Tensor, optional): Location-aware weights of positive
+                boxes in shape (num_pos_points, self.weight_dim). Defaults to
+                None.
+            pos_cls_scores (Tensor, optional): Classification scores of
+                positive boxes in shape (num_pos_points, self.num_classes).
+                Defaults to None.
+            with_kpts (bool, optional): Whether to output keypoints targets.
+                Defaults to False.
+
+        Returns:
+            tuple[Tensor]: Exterior 2D boxes from projected 3D boxes,
+                predicted 2D boxes and keypoint targets (if necessary).
+        """
+        views = [np.array(img_meta['cam2img']) for img_meta in batch_img_metas]
+        num_imgs = len(batch_img_metas)
+        img_idx = []
+        for label in labels_3d:
+            for idx in range(num_imgs):
+                img_idx.append(
+                    labels_3d[0].new_ones(int(len(label) / num_imgs)) * idx)
+        img_idx = torch.cat(img_idx)
+        pos_img_idx = img_idx[pos_inds]
+
+        flatten_strided_bbox_preds = []
+        flatten_strided_bbox2d_preds = []
+        flatten_bbox_targets_3d = []
+        flatten_strides = []
+
+        for stride_idx, bbox_pred in enumerate(bbox_preds):
+            flatten_bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(
+                -1, sum(self.group_reg_dims))
+            flatten_bbox_pred[:, :2] *= self.strides[stride_idx]
+            flatten_bbox_pred[:, -4:] *= self.strides[stride_idx]
+            flatten_strided_bbox_preds.append(
+                flatten_bbox_pred[:, :self.bbox_coder.bbox_code_size])
+            flatten_strided_bbox2d_preds.append(flatten_bbox_pred[:, -4:])
+
+            bbox_target_3d = bbox_targets_3d[stride_idx].clone()
+            bbox_target_3d[:, :2] *= self.strides[stride_idx]
+            bbox_target_3d[:, -4:] *= self.strides[stride_idx]
+            flatten_bbox_targets_3d.append(bbox_target_3d)
+
+            flatten_stride = flatten_bbox_pred.new_ones(
+                *flatten_bbox_pred.shape[:-1], 1) * self.strides[stride_idx]
+            flatten_strides.append(flatten_stride)
+
+        flatten_strided_bbox_preds = torch.cat(flatten_strided_bbox_preds)
+        flatten_strided_bbox2d_preds = torch.cat(flatten_strided_bbox2d_preds)
+        flatten_bbox_targets_3d = torch.cat(flatten_bbox_targets_3d)
+        flatten_strides = torch.cat(flatten_strides)
+        pos_strided_bbox_preds = flatten_strided_bbox_preds[pos_inds]
+        pos_strided_bbox2d_preds = flatten_strided_bbox2d_preds[pos_inds]
+        pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds]
+        pos_strides = flatten_strides[pos_inds]
+
+        pos_decoded_bbox2d_preds = distance2bbox(pos_points,
+                                                 pos_strided_bbox2d_preds)
+
+        pos_strided_bbox_preds[:, :2] = \
+            pos_points - pos_strided_bbox_preds[:, :2]
+        pos_bbox_targets_3d[:, :2] = \
+            pos_points - pos_bbox_targets_3d[:, :2]
+
+        if self.use_depth_classifier and (not self.use_onlyreg_proj):
+            pos_prob_depth_preds = self.bbox_coder.decode_prob_depth(
+                pos_depth_cls_preds, self.depth_range, self.depth_unit,
+                self.division, self.num_depth_cls)
+            sig_alpha = torch.sigmoid(self.fuse_lambda)
+            pos_strided_bbox_preds[:, 2] = \
+                sig_alpha * pos_strided_bbox_preds.clone()[:, 2] + \
+                (1 - sig_alpha) * pos_prob_depth_preds
+
+        box_corners_in_image = pos_strided_bbox_preds.new_zeros(
+            (*pos_strided_bbox_preds.shape[:-1], 8, 2))
+        box_corners_in_image_gt = pos_strided_bbox_preds.new_zeros(
+            (*pos_strided_bbox_preds.shape[:-1], 8, 2))
+
+        for idx in range(num_imgs):
+            mask = (pos_img_idx == idx)
+            if pos_strided_bbox_preds[mask].shape[0] == 0:
+                continue
+            cam2img = torch.eye(
+                4,
+                dtype=pos_strided_bbox_preds.dtype,
+                device=pos_strided_bbox_preds.device)
+            view_shape = views[idx].shape
+            cam2img[:view_shape[0], :view_shape[1]] = \
+                pos_strided_bbox_preds.new_tensor(views[idx])
+
+            centers2d_preds = pos_strided_bbox_preds.clone()[mask, :2]
+            centers2d_targets = pos_bbox_targets_3d.clone()[mask, :2]
+            centers3d_targets = points_img2cam(pos_bbox_targets_3d[mask, :3],
+                                               views[idx])
+
+            # use predicted depth to re-project the 2.5D centers
+            pos_strided_bbox_preds[mask, :3] = points_img2cam(
+                pos_strided_bbox_preds[mask, :3], views[idx])
+            pos_bbox_targets_3d[mask, :3] = centers3d_targets
+
+            # depth fixed when computing re-project 3D bboxes
+            pos_strided_bbox_preds[mask, 2] = \
+                pos_bbox_targets_3d.clone()[mask, 2]
+
+            # decode yaws
+            if self.use_direction_classifier:
+                pos_dir_cls_scores = torch.max(
+                    pos_dir_cls_preds[mask], dim=-1)[1]
+                pos_strided_bbox_preds[mask] = self.bbox_coder.decode_yaw(
+                    pos_strided_bbox_preds[mask], centers2d_preds,
+                    pos_dir_cls_scores, self.dir_offset, cam2img)
+            pos_bbox_targets_3d[mask, 6] = torch.atan2(
+                centers2d_targets[:, 0] - cam2img[0, 2],
+                cam2img[0, 0]) + pos_bbox_targets_3d[mask, 6]
+
+            corners = batch_img_metas[0]['box_type_3d'](
+                pos_strided_bbox_preds[mask],
+                box_dim=self.bbox_coder.bbox_code_size,
+                origin=(0.5, 0.5, 0.5)).corners
+            box_corners_in_image[mask] = points_cam2img(corners, cam2img)
+
+            corners_gt = batch_img_metas[0]['box_type_3d'](
+                pos_bbox_targets_3d[mask, :self.bbox_code_size],
+                box_dim=self.bbox_coder.bbox_code_size,
+                origin=(0.5, 0.5, 0.5)).corners
+            box_corners_in_image_gt[mask] = points_cam2img(corners_gt, cam2img)
+
+        minxy = torch.min(box_corners_in_image, dim=1)[0]
+        maxxy = torch.max(box_corners_in_image, dim=1)[0]
+        proj_bbox2d_preds = torch.cat([minxy, maxxy], dim=1)
+
+        outputs = (proj_bbox2d_preds, pos_decoded_bbox2d_preds)
+
+        if with_kpts:
+            norm_strides = pos_strides * self.regress_ranges[0][1] / \
+                self.strides[0]
+            kpts_targets = box_corners_in_image_gt - pos_points[..., None, :]
+            kpts_targets = kpts_targets.view(
+                (*pos_strided_bbox_preds.shape[:-1], 16))
+            kpts_targets /= norm_strides
+
+            outputs += (kpts_targets, )
+
+        return outputs
+
+    def get_pos_predictions(self, bbox_preds: List[Tensor],
+                            dir_cls_preds: List[Tensor],
+                            depth_cls_preds: List[Tensor],
+                            weights: List[Tensor], attr_preds: List[Tensor],
+                            centernesses: List[Tensor], pos_inds: Tensor,
+                            batch_img_metas: List[dict]) -> Tuple[Tensor]:
+        """Flatten predictions and get positive ones.
+
+        Args:
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * bbox_code_size.
+            dir_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * 2. (bin = 2)
+            depth_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * self.num_depth_cls.
+            attr_preds (list[Tensor]): Attribute scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_attrs.
+            centernesses (list[Tensor]): Centerness for each scale level, each
+                is a 4D-tensor, the channel number is num_points * 1.
+            pos_inds (Tensor): Index of foreground points from flattened
+                tensors.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+        Returns:
+            tuple[Tensor]: Box predictions, direction classes, probabilistic
+                depth maps, location-aware weight maps, attributes and
+                centerness predictions.
+        """
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims))
+            for bbox_pred in bbox_preds
+        ]
+        flatten_dir_cls_preds = [
+            dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2)
+            for dir_cls_pred in dir_cls_preds
+        ]
+        flatten_centerness = [
+            centerness.permute(0, 2, 3, 1).reshape(-1)
+            for centerness in centernesses
+        ]
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
+        flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds)
+        flatten_centerness = torch.cat(flatten_centerness)
+        pos_bbox_preds = flatten_bbox_preds[pos_inds]
+        pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds]
+        pos_centerness = flatten_centerness[pos_inds]
+
+        pos_depth_cls_preds = None
+        if self.use_depth_classifier:
+            flatten_depth_cls_preds = [
+                depth_cls_pred.permute(0, 2, 3,
+                                       1).reshape(-1, self.num_depth_cls)
+                for depth_cls_pred in depth_cls_preds
+            ]
+            flatten_depth_cls_preds = torch.cat(flatten_depth_cls_preds)
+            pos_depth_cls_preds = flatten_depth_cls_preds[pos_inds]
+
+        pos_weights = None
+        if self.weight_dim != -1:
+            flatten_weights = [
+                weight.permute(0, 2, 3, 1).reshape(-1, self.weight_dim)
+                for weight in weights
+            ]
+            flatten_weights = torch.cat(flatten_weights)
+            pos_weights = flatten_weights[pos_inds]
+
+        pos_attr_preds = None
+        if self.pred_attrs:
+            flatten_attr_preds = [
+                attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs)
+                for attr_pred in attr_preds
+            ]
+            flatten_attr_preds = torch.cat(flatten_attr_preds)
+            pos_attr_preds = flatten_attr_preds[pos_inds]
+
+        return pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, \
+            pos_weights, pos_attr_preds, pos_centerness
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            dir_cls_preds: List[Tensor],
+            depth_cls_preds: List[Tensor],
+            weights: List[Tensor],
+            attr_preds: List[Tensor],
+            centernesses: List[Tensor],
+            batch_gt_instances_3d: InstanceList,
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_classes.
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * bbox_code_size.
+            dir_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * 2. (bin = 2)
+            depth_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * self.num_depth_cls.
+            weights (list[Tensor]): Location-aware weights for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * self.weight_dim.
+            attr_preds (list[Tensor]): Attribute scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_points * num_attrs.
+            centernesses (list[Tensor]): Centerness for each scale level, each
+                is a 4D-tensor, the channel number is num_points * 1.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes``、``labels``
+                、``bboxes_3d``、``labels_3d``、``depths``、``centers_2d`` and
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \
+            len(depth_cls_preds) == len(weights) == len(centernesses) == \
+            len(attr_preds), 'The length of cls_scores, bbox_preds, ' \
+            'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \
+            f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \
+            f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \
+            f'{len(centernesses)}, {len(attr_preds)} are inconsistent.'
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
+                                           bbox_preds[0].device)
+        labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \
+            self.get_targets(
+                all_level_points, batch_gt_instances_3d, batch_gt_instances)
+
+        num_imgs = cls_scores[0].size(0)
+        # flatten cls_scores and targets
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_cls_scores = torch.cat(flatten_cls_scores)
+        flatten_labels_3d = torch.cat(labels_3d)
+        flatten_bbox_targets_3d = torch.cat(bbox_targets_3d)
+        flatten_centerness_targets = torch.cat(centerness_targets)
+        flatten_points = torch.cat(
+            [points.repeat(num_imgs, 1) for points in all_level_points])
+        if self.pred_attrs:
+            flatten_attr_targets = torch.cat(attr_targets)
+
+        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+        bg_class_ind = self.num_classes
+        pos_inds = ((flatten_labels_3d >= 0)
+                    & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1)
+        num_pos = len(pos_inds)
+
+        loss_dict = dict()
+
+        loss_dict['loss_cls'] = self.loss_cls(
+            flatten_cls_scores,
+            flatten_labels_3d,
+            avg_factor=num_pos + num_imgs)  # avoid num_pos is 0
+
+        pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, pos_weights, \
+            pos_attr_preds, pos_centerness = self.get_pos_predictions(
+                bbox_preds, dir_cls_preds, depth_cls_preds, weights,
+                attr_preds, centernesses, pos_inds, batch_img_metas)
+
+        if num_pos > 0:
+            pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds]
+            pos_centerness_targets = flatten_centerness_targets[pos_inds]
+            pos_points = flatten_points[pos_inds]
+            if self.pred_attrs:
+                pos_attr_targets = flatten_attr_targets[pos_inds]
+            if self.use_direction_classifier:
+                pos_dir_cls_targets = self.get_direction_target(
+                    pos_bbox_targets_3d, self.dir_offset, one_hot=False)
+
+            bbox_weights = pos_centerness_targets.new_ones(
+                len(pos_centerness_targets), sum(self.group_reg_dims))
+            equal_weights = pos_centerness_targets.new_ones(
+                pos_centerness_targets.shape)
+            code_weight = self.train_cfg.get('code_weight', None)
+            if code_weight:
+                assert len(code_weight) == sum(self.group_reg_dims)
+                bbox_weights = bbox_weights * bbox_weights.new_tensor(
+                    code_weight)
+
+            if self.diff_rad_by_sin:
+                pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference(
+                    pos_bbox_preds, pos_bbox_targets_3d)
+
+            loss_dict['loss_offset'] = self.loss_bbox(
+                pos_bbox_preds[:, :2],
+                pos_bbox_targets_3d[:, :2],
+                weight=bbox_weights[:, :2],
+                avg_factor=equal_weights.sum())
+            loss_dict['loss_size'] = self.loss_bbox(
+                pos_bbox_preds[:, 3:6],
+                pos_bbox_targets_3d[:, 3:6],
+                weight=bbox_weights[:, 3:6],
+                avg_factor=equal_weights.sum())
+            loss_dict['loss_rotsin'] = self.loss_bbox(
+                pos_bbox_preds[:, 6],
+                pos_bbox_targets_3d[:, 6],
+                weight=bbox_weights[:, 6],
+                avg_factor=equal_weights.sum())
+            if self.pred_velo:
+                loss_dict['loss_velo'] = self.loss_bbox(
+                    pos_bbox_preds[:, 7:9],
+                    pos_bbox_targets_3d[:, 7:9],
+                    weight=bbox_weights[:, 7:9],
+                    avg_factor=equal_weights.sum())
+
+            proj_bbox2d_inputs = (bbox_preds, pos_dir_cls_preds, labels_3d,
+                                  bbox_targets_3d, pos_points, pos_inds,
+                                  batch_img_metas)
+
+            # direction classification loss
+            # TODO: add more check for use_direction_classifier
+            if self.use_direction_classifier:
+                loss_dict['loss_dir'] = self.loss_dir(
+                    pos_dir_cls_preds,
+                    pos_dir_cls_targets,
+                    equal_weights,
+                    avg_factor=equal_weights.sum())
+
+            # init depth loss with the one computed from direct regression
+            loss_dict['loss_depth'] = self.loss_bbox(
+                pos_bbox_preds[:, 2],
+                pos_bbox_targets_3d[:, 2],
+                weight=bbox_weights[:, 2],
+                avg_factor=equal_weights.sum())
+            # depth classification loss
+            if self.use_depth_classifier:
+                pos_prob_depth_preds = self.bbox_coder.decode_prob_depth(
+                    pos_depth_cls_preds, self.depth_range, self.depth_unit,
+                    self.division, self.num_depth_cls)
+                sig_alpha = torch.sigmoid(self.fuse_lambda)
+                if self.weight_dim != -1:
+                    loss_fuse_depth = self.loss_depth(
+                        sig_alpha * pos_bbox_preds[:, 2] +
+                        (1 - sig_alpha) * pos_prob_depth_preds,
+                        pos_bbox_targets_3d[:, 2],
+                        sigma=pos_weights[:, 0],
+                        weight=bbox_weights[:, 2],
+                        avg_factor=equal_weights.sum())
+                else:
+                    loss_fuse_depth = self.loss_depth(
+                        sig_alpha * pos_bbox_preds[:, 2] +
+                        (1 - sig_alpha) * pos_prob_depth_preds,
+                        pos_bbox_targets_3d[:, 2],
+                        weight=bbox_weights[:, 2],
+                        avg_factor=equal_weights.sum())
+                loss_dict['loss_depth'] = loss_fuse_depth
+
+                proj_bbox2d_inputs += (pos_depth_cls_preds, )
+
+            if self.pred_keypoints:
+                # use smoothL1 to compute consistency loss for keypoints
+                # normalize the offsets with strides
+                proj_bbox2d_preds, pos_decoded_bbox2d_preds, kpts_targets = \
+                    self.get_proj_bbox2d(*proj_bbox2d_inputs, with_kpts=True)
+                loss_dict['loss_kpts'] = self.loss_bbox(
+                    pos_bbox_preds[:, self.kpts_start:self.kpts_start + 16],
+                    kpts_targets,
+                    weight=bbox_weights[:,
+                                        self.kpts_start:self.kpts_start + 16],
+                    avg_factor=equal_weights.sum())
+
+            if self.pred_bbox2d:
+                loss_dict['loss_bbox2d'] = self.loss_bbox2d(
+                    pos_bbox_preds[:, -4:],
+                    pos_bbox_targets_3d[:, -4:],
+                    weight=bbox_weights[:, -4:],
+                    avg_factor=equal_weights.sum())
+                if not self.pred_keypoints:
+                    proj_bbox2d_preds, pos_decoded_bbox2d_preds = \
+                        self.get_proj_bbox2d(*proj_bbox2d_inputs)
+                loss_dict['loss_consistency'] = self.loss_consistency(
+                    proj_bbox2d_preds,
+                    pos_decoded_bbox2d_preds,
+                    weight=bbox_weights[:, -4:],
+                    avg_factor=equal_weights.sum())
+
+            loss_dict['loss_centerness'] = self.loss_centerness(
+                pos_centerness, pos_centerness_targets)
+
+            # attribute classification loss
+            if self.pred_attrs:
+                loss_dict['loss_attr'] = self.loss_attr(
+                    pos_attr_preds,
+                    pos_attr_targets,
+                    pos_centerness_targets,
+                    avg_factor=pos_centerness_targets.sum())
+
+        else:
+            # need absolute due to possible negative delta x/y
+            loss_dict['loss_offset'] = pos_bbox_preds[:, :2].sum()
+            loss_dict['loss_size'] = pos_bbox_preds[:, 3:6].sum()
+            loss_dict['loss_rotsin'] = pos_bbox_preds[:, 6].sum()
+            loss_dict['loss_depth'] = pos_bbox_preds[:, 2].sum()
+            if self.pred_velo:
+                loss_dict['loss_velo'] = pos_bbox_preds[:, 7:9].sum()
+            if self.pred_keypoints:
+                loss_dict['loss_kpts'] = pos_bbox_preds[:,
+                                                        self.kpts_start:self.
+                                                        kpts_start + 16].sum()
+            if self.pred_bbox2d:
+                loss_dict['loss_bbox2d'] = pos_bbox_preds[:, -4:].sum()
+                loss_dict['loss_consistency'] = pos_bbox_preds[:, -4:].sum()
+            loss_dict['loss_centerness'] = pos_centerness.sum()
+            if self.use_direction_classifier:
+                loss_dict['loss_dir'] = pos_dir_cls_preds.sum()
+            if self.use_depth_classifier:
+                sig_alpha = torch.sigmoid(self.fuse_lambda)
+                loss_fuse_depth = \
+                    sig_alpha * pos_bbox_preds[:, 2].sum() + \
+                    (1 - sig_alpha) * pos_depth_cls_preds.sum()
+                if self.weight_dim != -1:
+                    loss_fuse_depth *= torch.exp(-pos_weights[:, 0].sum())
+                loss_dict['loss_depth'] = loss_fuse_depth
+            if self.pred_attrs:
+                loss_dict['loss_attr'] = pos_attr_preds.sum()
+
+        return loss_dict
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        dir_cls_preds: List[Tensor],
+                        depth_cls_preds: List[Tensor],
+                        weights: List[Tensor],
+                        attr_preds: List[Tensor],
+                        centernesses: List[Tensor],
+                        batch_img_metas: Optional[List[dict]] = None,
+                        cfg: OptConfigType = None,
+                        rescale: bool = False) -> InstanceList:
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                Has shape (N, num_points * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_points * 4, H, W)
+            dir_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * 2. (bin = 2)
+            depth_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * self.num_depth_cls.
+            weights (list[Tensor]): Location-aware weights for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_points * self.weight_dim.
+            attr_preds (list[Tensor]): Attribute scores for each scale level
+                Has shape (N, num_points * num_attrs, H, W)
+            centernesses (list[Tensor]): Centerness for each scale level with
+                shape (N, num_points * 1, H, W)
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            cfg (mmengine.Config, optional): Test / postprocessing config,
+                if None, test_cfg would be used. Defaults to None.
+            rescale (bool, optional): If True, return boxes in original image
+                space. Defaults to False.
+
+        Returns:
+            list[tuple[Tensor]]: Each item in result_list is a tuple, which
+                consists of predicted 3D boxes, scores, labels, attributes and
+                2D boxes (if necessary).
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \
+            len(depth_cls_preds) == len(weights) == len(centernesses) == \
+            len(attr_preds), 'The length of cls_scores, bbox_preds, ' \
+            'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \
+            f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \
+            f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \
+            f'{len(centernesses)}, {len(attr_preds)} are inconsistent.'
+        num_levels = len(cls_scores)
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
+                                      bbox_preds[0].device)
+        result_list = []
+        result_list_2d = []
+
+        for img_id in range(len(batch_img_metas)):
+            cls_score_list = [
+                cls_scores[i][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_pred_list = [
+                bbox_preds[i][img_id].detach() for i in range(num_levels)
+            ]
+            if self.use_direction_classifier:
+                dir_cls_pred_list = [
+                    dir_cls_preds[i][img_id].detach()
+                    for i in range(num_levels)
+                ]
+            else:
+                dir_cls_pred_list = [
+                    cls_scores[i][img_id].new_full(
+                        [2, *cls_scores[i][img_id].shape[1:]], 0).detach()
+                    for i in range(num_levels)
+                ]
+            if self.use_depth_classifier:
+                depth_cls_pred_list = [
+                    depth_cls_preds[i][img_id].detach()
+                    for i in range(num_levels)
+                ]
+            else:
+                depth_cls_pred_list = [
+                    cls_scores[i][img_id].new_full(
+                        [self.num_depth_cls, *cls_scores[i][img_id].shape[1:]],
+                        0).detach() for i in range(num_levels)
+                ]
+            if self.weight_dim != -1:
+                weight_list = [
+                    weights[i][img_id].detach() for i in range(num_levels)
+                ]
+            else:
+                weight_list = [
+                    cls_scores[i][img_id].new_full(
+                        [1, *cls_scores[i][img_id].shape[1:]], 0).detach()
+                    for i in range(num_levels)
+                ]
+            if self.pred_attrs:
+                attr_pred_list = [
+                    attr_preds[i][img_id].detach() for i in range(num_levels)
+                ]
+            else:
+                attr_pred_list = [
+                    cls_scores[i][img_id].new_full(
+                        [self.num_attrs, *cls_scores[i][img_id].shape[1:]],
+                        self.attr_background_label).detach()
+                    for i in range(num_levels)
+                ]
+            centerness_pred_list = [
+                centernesses[i][img_id].detach() for i in range(num_levels)
+            ]
+            img_meta = batch_img_metas[img_id]
+            results, results_2d = self._predict_by_feat_single(
+                cls_score_list=cls_score_list,
+                bbox_pred_list=bbox_pred_list,
+                dir_cls_pred_list=dir_cls_pred_list,
+                depth_cls_pred_list=depth_cls_pred_list,
+                weight_list=weight_list,
+                attr_pred_list=attr_pred_list,
+                centerness_pred_list=centerness_pred_list,
+                mlvl_points=mlvl_points,
+                img_meta=img_meta,
+                cfg=cfg,
+                rescale=rescale)
+            result_list.append(results)
+            result_list_2d.append(results_2d)
+        return result_list, result_list_2d
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                dir_cls_pred_list: List[Tensor],
+                                depth_cls_pred_list: List[Tensor],
+                                weight_list: List[Tensor],
+                                attr_pred_list: List[Tensor],
+                                centerness_pred_list: List[Tensor],
+                                mlvl_points: Tensor,
+                                img_meta: dict,
+                                cfg: ConfigType,
+                                rescale: bool = False) -> InstanceData:
+        """Transform outputs for a single batch item into bbox predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for a single scale level
+                Has shape (num_points * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for a single scale
+                level with shape (num_points * bbox_code_size, H, W).
+            dir_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on a single scale level with shape
+                (num_points * 2, H, W)
+            depth_cls_preds (list[Tensor]): Box scores for probabilistic depth
+                predictions on a single scale level with shape
+                (num_points * self.num_depth_cls, H, W)
+            weights (list[Tensor]): Location-aware weight maps on a single
+                scale level with shape (num_points * self.weight_dim, H, W).
+            attr_preds (list[Tensor]): Attribute scores for each scale level
+                Has shape (N, num_points * num_attrs, H, W)
+            centernesses (list[Tensor]): Centerness for a single scale level
+                with shape (num_points, H, W).
+            mlvl_points (list[Tensor]): Box reference for a single scale level
+                with shape (num_total_points, 2).
+            img_meta (dict): Metadata of input image.
+            cfg (mmengine.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool, optional): If True, return boxes in original image
+                space. Defaults to False.
+
+        Returns:
+            tuples[Tensor]: Predicted 3D boxes, scores, labels, attributes and
+                2D boxes (if necessary).
+        """
+        view = np.array(img_meta['cam2img'])
+        scale_factor = img_meta['scale_factor']
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_points)
+        mlvl_centers2d = []
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_dir_scores = []
+        mlvl_attr_scores = []
+        mlvl_centerness = []
+        mlvl_depth_cls_scores = []
+        mlvl_depth_uncertainty = []
+        mlvl_bboxes2d = None
+        if self.pred_bbox2d:
+            mlvl_bboxes2d = []
+
+        for cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \
+                attr_pred, centerness, points in zip(
+                    cls_score_list, bbox_pred_list, dir_cls_pred_list,
+                    depth_cls_pred_list, weight_list, attr_pred_list,
+                    centerness_pred_list, mlvl_points):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            scores = cls_score.permute(1, 2, 0).reshape(
+                -1, self.cls_out_channels).sigmoid()
+            dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
+            dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
+            depth_cls_pred = depth_cls_pred.permute(1, 2, 0).reshape(
+                -1, self.num_depth_cls)
+            depth_cls_score = F.softmax(
+                depth_cls_pred, dim=-1).topk(
+                    k=2, dim=-1)[0].mean(dim=-1)
+            if self.weight_dim != -1:
+                weight = weight.permute(1, 2, 0).reshape(-1, self.weight_dim)
+            else:
+                weight = weight.permute(1, 2, 0).reshape(-1, 1)
+            depth_uncertainty = torch.exp(-weight[:, -1])
+            attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs)
+            attr_score = torch.max(attr_pred, dim=-1)[1]
+            centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()
+
+            bbox_pred = bbox_pred.permute(1, 2,
+                                          0).reshape(-1,
+                                                     sum(self.group_reg_dims))
+            bbox_pred3d = bbox_pred[:, :self.bbox_coder.bbox_code_size]
+            if self.pred_bbox2d:
+                bbox_pred2d = bbox_pred[:, -4:]
+            nms_pre = cfg.get('nms_pre', -1)
+            if nms_pre > 0 and scores.shape[0] > nms_pre:
+                merged_scores = scores * centerness[:, None]
+                if self.use_depth_classifier:
+                    merged_scores *= depth_cls_score[:, None]
+                    if self.weight_dim != -1:
+                        merged_scores *= depth_uncertainty[:, None]
+                max_scores, _ = merged_scores.max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                points = points[topk_inds, :]
+                bbox_pred3d = bbox_pred3d[topk_inds, :]
+                scores = scores[topk_inds, :]
+                dir_cls_pred = dir_cls_pred[topk_inds, :]
+                depth_cls_pred = depth_cls_pred[topk_inds, :]
+                centerness = centerness[topk_inds]
+                dir_cls_score = dir_cls_score[topk_inds]
+                depth_cls_score = depth_cls_score[topk_inds]
+                depth_uncertainty = depth_uncertainty[topk_inds]
+                attr_score = attr_score[topk_inds]
+                if self.pred_bbox2d:
+                    bbox_pred2d = bbox_pred2d[topk_inds, :]
+            # change the offset to actual center predictions
+            bbox_pred3d[:, :2] = points - bbox_pred3d[:, :2]
+            if rescale:
+                if self.pred_bbox2d:
+                    bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor[0])
+            if self.use_depth_classifier:
+                prob_depth_pred = self.bbox_coder.decode_prob_depth(
+                    depth_cls_pred, self.depth_range, self.depth_unit,
+                    self.division, self.num_depth_cls)
+                sig_alpha = torch.sigmoid(self.fuse_lambda)
+                bbox_pred3d[:, 2] = sig_alpha * bbox_pred3d[:, 2] + \
+                    (1 - sig_alpha) * prob_depth_pred
+            pred_center2d = bbox_pred3d[:, :3].clone()
+            bbox_pred3d[:, :3] = points_img2cam(bbox_pred3d[:, :3], view)
+            mlvl_centers2d.append(pred_center2d)
+            mlvl_bboxes.append(bbox_pred3d)
+            mlvl_scores.append(scores)
+            mlvl_dir_scores.append(dir_cls_score)
+            mlvl_depth_cls_scores.append(depth_cls_score)
+            mlvl_attr_scores.append(attr_score)
+            mlvl_centerness.append(centerness)
+            mlvl_depth_uncertainty.append(depth_uncertainty)
+            if self.pred_bbox2d:
+                bbox_pred2d = distance2bbox(
+                    points, bbox_pred2d, max_shape=img_meta['img_shape'])
+                mlvl_bboxes2d.append(bbox_pred2d)
+
+        mlvl_centers2d = torch.cat(mlvl_centers2d)
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        mlvl_dir_scores = torch.cat(mlvl_dir_scores)
+        if self.pred_bbox2d:
+            mlvl_bboxes2d = torch.cat(mlvl_bboxes2d)
+
+        # change local yaw to global yaw for 3D nms
+        cam2img = torch.eye(
+            4, dtype=mlvl_centers2d.dtype, device=mlvl_centers2d.device)
+        cam2img[:view.shape[0], :view.shape[1]] = \
+            mlvl_centers2d.new_tensor(view)
+        mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d,
+                                                 mlvl_dir_scores,
+                                                 self.dir_offset, cam2img)
+
+        mlvl_bboxes_for_nms = xywhr2xyxyr(img_meta['box_type_3d'](
+            mlvl_bboxes,
+            box_dim=self.bbox_coder.bbox_code_size,
+            origin=(0.5, 0.5, 0.5)).bev)
+
+        mlvl_scores = torch.cat(mlvl_scores)
+        padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+        # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
+        # BG cat_id: num_class
+        mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
+        mlvl_attr_scores = torch.cat(mlvl_attr_scores)
+        mlvl_centerness = torch.cat(mlvl_centerness)
+        # no scale_factors in box3d_multiclass_nms
+        # Then we multiply it from outside
+        mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None]
+        if self.use_depth_classifier:  # multiply the depth confidence
+            mlvl_depth_cls_scores = torch.cat(mlvl_depth_cls_scores)
+            mlvl_nms_scores *= mlvl_depth_cls_scores[:, None]
+            if self.weight_dim != -1:
+                mlvl_depth_uncertainty = torch.cat(mlvl_depth_uncertainty)
+                mlvl_nms_scores *= mlvl_depth_uncertainty[:, None]
+        nms_results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
+                                           mlvl_nms_scores, cfg.score_thr,
+                                           cfg.max_per_img, cfg,
+                                           mlvl_dir_scores, mlvl_attr_scores,
+                                           mlvl_bboxes2d)
+        bboxes, scores, labels, dir_scores, attrs = nms_results[0:5]
+        attrs = attrs.to(labels.dtype)  # change data type to int
+        bboxes = img_meta['box_type_3d'](
+            bboxes,
+            box_dim=self.bbox_coder.bbox_code_size,
+            origin=(0.5, 0.5, 0.5))
+        # Note that the predictions use origin (0.5, 0.5, 0.5)
+        # Due to the ground truth centers2d are the gravity center of objects
+        # v0.10.0 fix inplace operation to the input tensor of cam_box3d
+        # So here we also need to add origin=(0.5, 0.5, 0.5)
+        if not self.pred_attrs:
+            attrs = None
+
+        results = InstanceData()
+        results.bboxes_3d = bboxes
+        results.scores_3d = scores
+        results.labels_3d = labels
+
+        if attrs is not None:
+            results.attr_labels = attrs
+
+        results_2d = InstanceData()
+
+        if self.pred_bbox2d:
+            bboxes2d = nms_results[-1]
+            results_2d.bboxes = bboxes2d
+            results_2d.scores = scores
+            results_2d.labels = labels
+
+        return results, results_2d
+
+    def get_targets(
+        self,
+        points: List[Tensor],
+        batch_gt_instances_3d: InstanceList,
+        batch_gt_instances: InstanceList,
+    ) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]:
+        """Compute regression, classification and centerss targets for points
+        in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、
+                ``labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes``、``labels``.
+
+        Returns:
+            tuple:
+                concat_lvl_labels (list[Tensor]): Labels of each level. \
+                concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \
+                    level.
+        """
+        assert len(points) == len(self.regress_ranges)
+        num_levels = len(points)
+        # expand regress ranges to align with points
+        expanded_regress_ranges = [
+            points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
+                points[i]) for i in range(num_levels)
+        ]
+        # concat all levels points and regress ranges
+        concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
+        concat_points = torch.cat(points, dim=0)
+
+        # the number of points per img, per lvl
+        num_points = [center.size(0) for center in points]
+
+        if 'attr_labels' not in batch_gt_instances_3d[0]:
+            for gt_instances_3d in batch_gt_instances_3d:
+                gt_instances_3d.attr_labels = \
+                    gt_instances_3d.labels_3d.new_full(
+                        gt_instances_3d.labels_3d.shape,
+                        self.attr_background_label)
+
+        # get labels and bbox_targets of each image
+        _, bbox_targets_list, labels_3d_list, bbox_targets_3d_list, \
+            centerness_targets_list, attr_targets_list = multi_apply(
+                self._get_target_single,
+                batch_gt_instances_3d,
+                batch_gt_instances,
+                points=concat_points,
+                regress_ranges=concat_regress_ranges,
+                num_points_per_lvl=num_points)
+
+        # split to per img, per level
+        bbox_targets_list = [
+            bbox_targets.split(num_points, 0)
+            for bbox_targets in bbox_targets_list
+        ]
+        labels_3d_list = [
+            labels_3d.split(num_points, 0) for labels_3d in labels_3d_list
+        ]
+        bbox_targets_3d_list = [
+            bbox_targets_3d.split(num_points, 0)
+            for bbox_targets_3d in bbox_targets_3d_list
+        ]
+        centerness_targets_list = [
+            centerness_targets.split(num_points, 0)
+            for centerness_targets in centerness_targets_list
+        ]
+        attr_targets_list = [
+            attr_targets.split(num_points, 0)
+            for attr_targets in attr_targets_list
+        ]
+
+        # concat per level image
+        concat_lvl_labels_3d = []
+        concat_lvl_bbox_targets_3d = []
+        concat_lvl_centerness_targets = []
+        concat_lvl_attr_targets = []
+        for i in range(num_levels):
+            concat_lvl_labels_3d.append(
+                torch.cat([labels[i] for labels in labels_3d_list]))
+            concat_lvl_centerness_targets.append(
+                torch.cat([
+                    centerness_targets[i]
+                    for centerness_targets in centerness_targets_list
+                ]))
+            bbox_targets_3d = torch.cat([
+                bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list
+            ])
+            if self.pred_bbox2d:
+                bbox_targets = torch.cat(
+                    [bbox_targets[i] for bbox_targets in bbox_targets_list])
+                bbox_targets_3d = torch.cat([bbox_targets_3d, bbox_targets],
+                                            dim=1)
+            concat_lvl_attr_targets.append(
+                torch.cat(
+                    [attr_targets[i] for attr_targets in attr_targets_list]))
+            if self.norm_on_bbox:
+                bbox_targets_3d[:, :2] = \
+                    bbox_targets_3d[:, :2] / self.strides[i]
+                if self.pred_bbox2d:
+                    bbox_targets_3d[:, -4:] = \
+                        bbox_targets_3d[:, -4:] / self.strides[i]
+            concat_lvl_bbox_targets_3d.append(bbox_targets_3d)
+        return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \
+            concat_lvl_centerness_targets, concat_lvl_attr_targets
diff --git a/mmde/mmdet3d/models/dense_heads/point_rpn_head.py b/mmde/mmdet3d/models/dense_heads/point_rpn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..a57516211457f43d4764ed3b5a94b8f229b32d30
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/point_rpn_head.py
@@ -0,0 +1,511 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models.layers import nms_bev, nms_normal_bev
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures import xywhr2xyxyr
+from mmdet3d.structures.bbox_3d import (BaseInstance3DBoxes,
+                                        DepthInstance3DBoxes,
+                                        LiDARInstance3DBoxes)
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils.typing_utils import InstanceList
+
+
+@MODELS.register_module()
+class PointRPNHead(BaseModule):
+    """RPN module for PointRCNN.
+
+    Args:
+        num_classes (int): Number of classes.
+        train_cfg (dict): Train configs.
+        test_cfg (dict): Test configs.
+        pred_layer_cfg (dict, optional): Config of classification and
+            regression prediction layers. Defaults to None.
+        enlarge_width (float, optional): Enlarge bbox for each side to ignore
+            close points. Defaults to 0.1.
+        cls_loss (dict, optional): Config of direction classification loss.
+            Defaults to None.
+        bbox_loss (dict, optional): Config of localization loss.
+            Defaults to None.
+        bbox_coder (dict, optional): Config dict of box coders.
+            Defaults to None.
+        init_cfg (dict, optional): Config of initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 train_cfg: dict,
+                 test_cfg: dict,
+                 pred_layer_cfg: Optional[dict] = None,
+                 enlarge_width: float = 0.1,
+                 cls_loss: Optional[dict] = None,
+                 bbox_loss: Optional[dict] = None,
+                 bbox_coder: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.enlarge_width = enlarge_width
+
+        # build loss function
+        self.bbox_loss = MODELS.build(bbox_loss)
+        self.cls_loss = MODELS.build(cls_loss)
+
+        # build box coder
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+
+        # build pred conv
+        self.cls_layers = self._make_fc_layers(
+            fc_cfg=pred_layer_cfg.cls_linear_channels,
+            input_channels=pred_layer_cfg.in_channels,
+            output_channels=self._get_cls_out_channels())
+
+        self.reg_layers = self._make_fc_layers(
+            fc_cfg=pred_layer_cfg.reg_linear_channels,
+            input_channels=pred_layer_cfg.in_channels,
+            output_channels=self._get_reg_out_channels())
+
+    def _make_fc_layers(self, fc_cfg: dict, input_channels: int,
+                        output_channels: int) -> nn.Sequential:
+        """Make fully connect layers.
+
+        Args:
+            fc_cfg (dict): Config of fully connect.
+            input_channels (int): Input channels for fc_layers.
+            output_channels (int): Input channels for fc_layers.
+
+        Returns:
+            nn.Sequential: Fully connect layers.
+        """
+        fc_layers = []
+        c_in = input_channels
+        for k in range(0, fc_cfg.__len__()):
+            fc_layers.extend([
+                nn.Linear(c_in, fc_cfg[k], bias=False),
+                nn.BatchNorm1d(fc_cfg[k]),
+                nn.ReLU(),
+            ])
+            c_in = fc_cfg[k]
+        fc_layers.append(nn.Linear(c_in, output_channels, bias=True))
+        return nn.Sequential(*fc_layers)
+
+    def _get_cls_out_channels(self):
+        """Return the channel number of classification outputs."""
+        # Class numbers (k) + objectness (1)
+        return self.num_classes
+
+    def _get_reg_out_channels(self):
+        """Return the channel number of regression outputs."""
+        # Bbox classification and regression
+        # (center residual (3), size regression (3)
+        # torch.cos(yaw) (1), torch.sin(yaw) (1)
+        return self.bbox_coder.code_size
+
+    def forward(self, feat_dict: dict) -> Tuple[List[Tensor]]:
+        """Forward pass.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            tuple[list[torch.Tensor]]: Predicted boxes and classification
+                scores.
+        """
+        point_features = feat_dict['fp_features']
+        point_features = point_features.permute(0, 2, 1).contiguous()
+        batch_size = point_features.shape[0]
+        feat_cls = point_features.view(-1, point_features.shape[-1])
+        feat_reg = point_features.view(-1, point_features.shape[-1])
+
+        point_cls_preds = self.cls_layers(feat_cls).reshape(
+            batch_size, -1, self._get_cls_out_channels())
+        point_box_preds = self.reg_layers(feat_reg).reshape(
+            batch_size, -1, self._get_reg_out_channels())
+        return point_box_preds, point_cls_preds
+
+    def loss_by_feat(
+            self,
+            bbox_preds: List[Tensor],
+            cls_preds: List[Tensor],
+            points: List[Tensor],
+            batch_gt_instances_3d: InstanceList,
+            batch_input_metas: Optional[List[dict]] = None,
+            batch_gt_instances_ignore: Optional[InstanceList] = None) -> Dict:
+        """Compute loss.
+
+        Args:
+            bbox_preds (list[torch.Tensor]): Predictions from forward of
+                PointRCNN RPN_Head.
+            cls_preds (list[torch.Tensor]): Classification from forward of
+                PointRCNN RPN_Head.
+            points (list[torch.Tensor]): Input points.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances_3d. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_input_metas (list[dict]): Contain pcd and img's meta info.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: Losses of PointRCNN RPN module.
+        """
+        targets = self.get_targets(points, batch_gt_instances_3d)
+        (bbox_targets, mask_targets, positive_mask, negative_mask,
+         box_loss_weights, point_targets) = targets
+
+        # bbox loss
+        bbox_loss = self.bbox_loss(bbox_preds, bbox_targets,
+                                   box_loss_weights.unsqueeze(-1))
+        # calculate semantic loss
+        semantic_points = cls_preds.reshape(-1, self.num_classes)
+        semantic_targets = mask_targets
+        semantic_targets[negative_mask] = self.num_classes
+        semantic_points_label = semantic_targets
+        # for ignore, but now we do not have ignored label
+        semantic_loss_weight = negative_mask.float() + positive_mask.float()
+        semantic_loss = self.cls_loss(semantic_points,
+                                      semantic_points_label.reshape(-1),
+                                      semantic_loss_weight.reshape(-1))
+        semantic_loss /= positive_mask.float().sum()
+        losses = dict(bbox_loss=bbox_loss, semantic_loss=semantic_loss)
+
+        return losses
+
+    def get_targets(self, points: List[Tensor],
+                    batch_gt_instances_3d: InstanceList) -> Tuple[Tensor]:
+        """Generate targets of PointRCNN RPN head.
+
+        Args:
+            points (list[torch.Tensor]): Points in one batch.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances_3d. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of PointRCNN RPN head.
+        """
+        gt_labels_3d = [
+            instances.labels_3d for instances in batch_gt_instances_3d
+        ]
+        gt_bboxes_3d = [
+            instances.bboxes_3d for instances in batch_gt_instances_3d
+        ]
+
+        (bbox_targets, mask_targets, positive_mask, negative_mask,
+         point_targets) = multi_apply(self.get_targets_single, points,
+                                      gt_bboxes_3d, gt_labels_3d)
+
+        bbox_targets = torch.stack(bbox_targets)
+        mask_targets = torch.stack(mask_targets)
+        positive_mask = torch.stack(positive_mask)
+        negative_mask = torch.stack(negative_mask)
+        box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6)
+
+        return (bbox_targets, mask_targets, positive_mask, negative_mask,
+                box_loss_weights, point_targets)
+
+    def get_targets_single(self, points: Tensor,
+                           gt_bboxes_3d: BaseInstance3DBoxes,
+                           gt_labels_3d: Tensor) -> Tuple[Tensor]:
+        """Generate targets of PointRCNN RPN head for single batch.
+
+        Args:
+            points (torch.Tensor): Points of each batch.
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
+                boxes of each batch.
+            gt_labels_3d (torch.Tensor): Labels of each batch.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of ssd3d head.
+        """
+        gt_bboxes_3d = gt_bboxes_3d.to(points.device)
+
+        valid_gt = gt_labels_3d != -1
+        gt_bboxes_3d = gt_bboxes_3d[valid_gt]
+        gt_labels_3d = gt_labels_3d[valid_gt]
+
+        # transform the bbox coordinate to the point cloud coordinate
+        gt_bboxes_3d_tensor = gt_bboxes_3d.tensor.clone()
+        gt_bboxes_3d_tensor[..., 2] += gt_bboxes_3d_tensor[..., 5] / 2
+
+        points_mask, assignment = self._assign_targets_by_points_inside(
+            gt_bboxes_3d, points)
+        gt_bboxes_3d_tensor = gt_bboxes_3d_tensor[assignment]
+        mask_targets = gt_labels_3d[assignment]
+
+        bbox_targets = self.bbox_coder.encode(gt_bboxes_3d_tensor,
+                                              points[..., 0:3], mask_targets)
+
+        positive_mask = (points_mask.max(1)[0] > 0)
+        # add ignore_mask
+        extend_gt_bboxes_3d = gt_bboxes_3d.enlarged_box(self.enlarge_width)
+        points_mask, _ = self._assign_targets_by_points_inside(
+            extend_gt_bboxes_3d, points)
+        negative_mask = (points_mask.max(1)[0] == 0)
+
+        point_targets = points[..., 0:3]
+        return (bbox_targets, mask_targets, positive_mask, negative_mask,
+                point_targets)
+
+    def predict_by_feat(self, points: Tensor, bbox_preds: List[Tensor],
+                        cls_preds: List[Tensor], batch_input_metas: List[dict],
+                        cfg: Optional[dict]) -> InstanceList:
+        """Generate bboxes from RPN head predictions.
+
+        Args:
+            points (torch.Tensor): Input points.
+            bbox_preds (list[tensor]): Regression predictions from PointRCNN
+                head.
+            cls_preds (list[tensor]): Class scores predictions from PointRCNN
+                head.
+            batch_input_metas (list[dict]): Batch inputs meta info.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+            - cls_preds (torch.Tensor): Class score of each bbox.
+        """
+        sem_scores = cls_preds.sigmoid()
+        obj_scores = sem_scores.max(-1)[0]
+        object_class = sem_scores.argmax(dim=-1)
+
+        batch_size = sem_scores.shape[0]
+        results = list()
+        for b in range(batch_size):
+            bbox3d = self.bbox_coder.decode(bbox_preds[b], points[b, ..., :3],
+                                            object_class[b])
+            mask = ~bbox3d.sum(dim=1).isinf()
+            bbox_selected, score_selected, labels, cls_preds_selected = \
+                self.class_agnostic_nms(obj_scores[b][mask],
+                                        sem_scores[b][mask, :],
+                                        bbox3d[mask, :],
+                                        points[b, ..., :3][mask, :],
+                                        batch_input_metas[b],
+                                        cfg.nms_cfg)
+            bbox_selected = batch_input_metas[b]['box_type_3d'](
+                bbox_selected, box_dim=bbox_selected.shape[-1])
+            result = InstanceData()
+            result.bboxes_3d = bbox_selected
+            result.scores_3d = score_selected
+            result.labels_3d = labels
+            result.cls_preds = cls_preds_selected
+            results.append(result)
+        return results
+
+    def class_agnostic_nms(self, obj_scores: Tensor, sem_scores: Tensor,
+                           bbox: Tensor, points: Tensor, input_meta: Dict,
+                           nms_cfg: Dict) -> Tuple[Tensor]:
+        """Class agnostic nms.
+
+        Args:
+            obj_scores (torch.Tensor): Objectness score of bounding boxes.
+            sem_scores (torch.Tensor): Semantic class score of bounding boxes.
+            bbox (torch.Tensor): Predicted bounding boxes.
+            points (torch.Tensor): Input points.
+            input_meta (dict): Contain pcd and img's meta info.
+            nms_cfg (dict): NMS config dict.
+
+        Returns:
+            tuple[torch.Tensor]: Bounding boxes, scores and labels.
+        """
+        if nms_cfg.use_rotate_nms:
+            nms_func = nms_bev
+        else:
+            nms_func = nms_normal_bev
+
+        num_bbox = bbox.shape[0]
+        bbox = input_meta['box_type_3d'](
+            bbox.clone(),
+            box_dim=bbox.shape[-1],
+            with_yaw=True,
+            origin=(0.5, 0.5, 0.5))
+
+        if isinstance(bbox, LiDARInstance3DBoxes):
+            box_idx = bbox.points_in_boxes(points)
+            box_indices = box_idx.new_zeros([num_bbox + 1])
+            box_idx[box_idx == -1] = num_bbox
+            box_indices.scatter_add_(0, box_idx.long(),
+                                     box_idx.new_ones(box_idx.shape))
+            box_indices = box_indices[:-1]
+            nonempty_box_mask = box_indices >= 0
+        elif isinstance(bbox, DepthInstance3DBoxes):
+            box_indices = bbox.points_in_boxes(points)
+            nonempty_box_mask = box_indices.T.sum(1) >= 0
+        else:
+            raise NotImplementedError('Unsupported bbox type!')
+
+        bbox = bbox[nonempty_box_mask]
+
+        if nms_cfg.score_thr is not None:
+            score_thr = nms_cfg.score_thr
+            keep = (obj_scores >= score_thr)
+            obj_scores = obj_scores[keep]
+            sem_scores = sem_scores[keep]
+            bbox = bbox.tensor[keep]
+
+        if bbox.tensor.shape[0] > 0:
+            topk = min(nms_cfg.nms_pre, obj_scores.shape[0])
+            obj_scores_nms, indices = torch.topk(obj_scores, k=topk)
+            bbox_for_nms = xywhr2xyxyr(bbox[indices].bev)
+            sem_scores_nms = sem_scores[indices]
+
+            keep = nms_func(bbox_for_nms, obj_scores_nms, nms_cfg.iou_thr)
+            keep = keep[:nms_cfg.nms_post]
+
+            bbox_selected = bbox.tensor[indices][keep]
+            score_selected = obj_scores_nms[keep]
+            cls_preds = sem_scores_nms[keep]
+            labels = torch.argmax(cls_preds, -1)
+            if bbox_selected.shape[0] > nms_cfg.nms_post:
+                _, inds = score_selected.sort(descending=True)
+                inds = inds[:score_selected.nms_post]
+                bbox_selected = bbox_selected[inds, :]
+                labels = labels[inds]
+                score_selected = score_selected[inds]
+                cls_preds = cls_preds[inds, :]
+        else:
+            bbox_selected = bbox.tensor
+            score_selected = obj_scores.new_zeros([0])
+            labels = obj_scores.new_zeros([0])
+            cls_preds = obj_scores.new_zeros([0, sem_scores.shape[-1]])
+        return bbox_selected, score_selected, labels, cls_preds
+
+    def _assign_targets_by_points_inside(self, bboxes_3d: BaseInstance3DBoxes,
+                                         points: Tensor) -> Tuple[Tensor]:
+        """Compute assignment by checking whether point is inside bbox.
+
+        Args:
+            bboxes_3d (:obj:`BaseInstance3DBoxes`): Instance of bounding boxes.
+            points (torch.Tensor): Points of a batch.
+
+        Returns:
+            tuple[torch.Tensor]: Flags indicating whether each point is
+                inside bbox and the index of box where each point are in.
+        """
+        # TODO: align points_in_boxes function in each box_structures
+        num_bbox = bboxes_3d.tensor.shape[0]
+        if isinstance(bboxes_3d, LiDARInstance3DBoxes):
+            assignment = bboxes_3d.points_in_boxes(points[:, 0:3]).long()
+            points_mask = assignment.new_zeros(
+                [assignment.shape[0], num_bbox + 1])
+            assignment[assignment == -1] = num_bbox
+            points_mask.scatter_(1, assignment.unsqueeze(1), 1)
+            points_mask = points_mask[:, :-1]
+            assignment[assignment == num_bbox] = num_bbox - 1
+        elif isinstance(bboxes_3d, DepthInstance3DBoxes):
+            points_mask = bboxes_3d.points_in_boxes(points)
+            assignment = points_mask.argmax(dim=-1)
+        else:
+            raise NotImplementedError('Unsupported bbox type!')
+
+        return points_mask, assignment
+
+    def predict(self, feats_dict: Dict,
+                batch_data_samples: SampleList) -> InstanceList:
+        """Perform forward propagation of the 3D detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        raw_points = feats_dict.pop('raw_points')
+        bbox_preds, cls_preds = self(feats_dict)
+        proposal_cfg = self.test_cfg
+
+        proposal_list = self.predict_by_feat(
+            raw_points,
+            bbox_preds,
+            cls_preds,
+            cfg=proposal_cfg,
+            batch_input_metas=batch_input_metas)
+        feats_dict['points_cls_preds'] = cls_preds
+        return proposal_list
+
+    def loss_and_predict(self,
+                         feats_dict: Dict,
+                         batch_data_samples: SampleList,
+                         proposal_cfg: Optional[dict] = None,
+                         **kwargs) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+            proposal_cfg (ConfigDict, optional): Proposal config.
+
+        Returns:
+            tuple: the return value is a tuple contains:
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+            - predictions (list[:obj:`InstanceData`]): Detection
+              results of each sample after the post process.
+        """
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+        raw_points = feats_dict.pop('raw_points')
+        bbox_preds, cls_preds = self(feats_dict)
+
+        loss_inputs = (bbox_preds, cls_preds,
+                       raw_points) + (batch_gt_instances_3d, batch_input_metas,
+                                      batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        predictions = self.predict_by_feat(
+            raw_points,
+            bbox_preds,
+            cls_preds,
+            batch_input_metas=batch_input_metas,
+            cfg=proposal_cfg)
+        feats_dict['points_cls_preds'] = cls_preds
+        if predictions[0].bboxes_3d.tensor.isinf().any():
+            print(predictions)
+        return losses, predictions
diff --git a/mmde/mmdet3d/models/dense_heads/shape_aware_head.py b/mmde/mmdet3d/models/dense_heads/shape_aware_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c192c695127a7f417520093c84ce38548dcedc4
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/shape_aware_head.py
@@ -0,0 +1,537 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmcv.cnn import ConvModule
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models.layers import box3d_multiclass_nms
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import limit_period, xywhr2xyxyr
+from mmdet3d.utils import InstanceList, OptInstanceList
+from .anchor3d_head import Anchor3DHead
+
+
+@MODELS.register_module()
+class BaseShapeHead(BaseModule):
+    """Base Shape-aware Head in Shape Signature Network.
+
+    Note:
+        This base shape-aware grouping head uses default settings for small
+        objects. For large and huge objects, it is recommended to use
+        heavier heads, like (64, 64, 64) and (128, 128, 64, 64, 64) in
+        shared conv channels, (2, 1, 1) and (2, 1, 2, 1, 1) in shared
+        conv strides. For tiny objects, we can use smaller heads, like
+        (32, 32) channels and (1, 1) strides.
+
+    Args:
+        num_cls (int): Number of classes.
+        num_base_anchors (int): Number of anchors per location.
+        box_code_size (int): The dimension of boxes to be encoded.
+        in_channels (int): Input channels for convolutional layers.
+        shared_conv_channels (tuple, optional): Channels for shared
+            convolutional layers. Default: (64, 64).
+        shared_conv_strides (tuple): Strides for shared
+            convolutional layers. Default: (1, 1).
+        use_direction_classifier (bool): Whether to use direction
+            classifier. Default: True.
+        conv_cfg (dict): Config of conv layer.
+            Default: dict(type='Conv2d')
+        norm_cfg (dict): Config of norm layer.
+            Default: dict(type='BN2d').
+        bias (bool | str): Type of bias. Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_cls: int,
+                 num_base_anchors: int,
+                 box_code_size: int,
+                 in_channels: int,
+                 shared_conv_channels: Tuple = (64, 64),
+                 shared_conv_strides: Tuple = (1, 1),
+                 use_direction_classifier: bool = True,
+                 conv_cfg: Dict = dict(type='Conv2d'),
+                 norm_cfg: Dict = dict(type='BN2d'),
+                 bias: bool = False,
+                 init_cfg: Optional[dict] = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_cls = num_cls
+        self.num_base_anchors = num_base_anchors
+        self.use_direction_classifier = use_direction_classifier
+        self.box_code_size = box_code_size
+
+        assert len(shared_conv_channels) == len(shared_conv_strides), \
+            'Lengths of channels and strides list should be equal.'
+
+        self.shared_conv_channels = [in_channels] + list(shared_conv_channels)
+        self.shared_conv_strides = list(shared_conv_strides)
+
+        shared_conv = []
+        for i in range(len(self.shared_conv_strides)):
+            shared_conv.append(
+                ConvModule(
+                    self.shared_conv_channels[i],
+                    self.shared_conv_channels[i + 1],
+                    kernel_size=3,
+                    stride=self.shared_conv_strides[i],
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    bias=bias,
+                    norm_cfg=norm_cfg))
+
+        self.shared_conv = nn.Sequential(*shared_conv)
+
+        out_channels = self.shared_conv_channels[-1]
+        self.conv_cls = nn.Conv2d(out_channels, num_base_anchors * num_cls, 1)
+        self.conv_reg = nn.Conv2d(out_channels,
+                                  num_base_anchors * box_code_size, 1)
+
+        if use_direction_classifier:
+            self.conv_dir_cls = nn.Conv2d(out_channels, num_base_anchors * 2,
+                                          1)
+        if init_cfg is None:
+            if use_direction_classifier:
+                self.init_cfg = dict(
+                    type='Kaiming',
+                    layer='Conv2d',
+                    override=[
+                        dict(type='Normal', name='conv_reg', std=0.01),
+                        dict(
+                            type='Normal',
+                            name='conv_cls',
+                            std=0.01,
+                            bias_prob=0.01),
+                        dict(
+                            type='Normal',
+                            name='conv_dir_cls',
+                            std=0.01,
+                            bias_prob=0.01)
+                    ])
+            else:
+                self.init_cfg = dict(
+                    type='Kaiming',
+                    layer='Conv2d',
+                    override=[
+                        dict(type='Normal', name='conv_reg', std=0.01),
+                        dict(
+                            type='Normal',
+                            name='conv_cls',
+                            std=0.01,
+                            bias_prob=0.01)
+                    ])
+
+    def forward(self, x: Tensor) -> Dict:
+        """Forward function for SmallHead.
+
+        Args:
+            x (torch.Tensor): Input feature map with the shape of
+                [B, C, H, W].
+
+        Returns:
+            dict[torch.Tensor]: Contain score of each class, bbox
+                regression and direction classification predictions.
+                Note that all the returned tensors are reshaped as
+                [bs*num_base_anchors*H*W, num_cls/box_code_size/dir_bins].
+                It is more convenient to concat anchors for different
+                classes even though they have different feature map sizes.
+        """
+        x = self.shared_conv(x)
+        cls_score = self.conv_cls(x)
+        bbox_pred = self.conv_reg(x)
+        featmap_size = bbox_pred.shape[-2:]
+        H, W = featmap_size
+        B = bbox_pred.shape[0]
+        cls_score = cls_score.view(-1, self.num_base_anchors, self.num_cls, H,
+                                   W).permute(0, 1, 3, 4,
+                                              2).reshape(B, -1, self.num_cls)
+        bbox_pred = bbox_pred.view(-1, self.num_base_anchors,
+                                   self.box_code_size, H, W).permute(
+                                       0, 1, 3, 4,
+                                       2).reshape(B, -1, self.box_code_size)
+
+        dir_cls_preds = None
+        if self.use_direction_classifier:
+            dir_cls_preds = self.conv_dir_cls(x)
+            dir_cls_preds = dir_cls_preds.view(-1, self.num_base_anchors, 2, H,
+                                               W).permute(0, 1, 3, 4,
+                                                          2).reshape(B, -1, 2)
+        ret = dict(
+            cls_score=cls_score,
+            bbox_pred=bbox_pred,
+            dir_cls_preds=dir_cls_preds,
+            featmap_size=featmap_size)
+        return ret
+
+
+@MODELS.register_module()
+class ShapeAwareHead(Anchor3DHead):
+    """Shape-aware grouping head for SSN.
+
+    Args:
+        tasks (dict): Shape-aware groups of multi-class objects.
+        assign_per_class (bool): Whether to do assignment for each
+            class. Default: True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 tasks: Dict,
+                 assign_per_class: bool = True,
+                 init_cfg: Optional[dict] = None,
+                 **kwargs) -> Dict:
+        self.tasks = tasks
+        self.featmap_sizes = []
+        super().__init__(
+            assign_per_class=assign_per_class, init_cfg=init_cfg, **kwargs)
+
+    def init_weights(self):
+        if not self._is_init:
+            for m in self.heads:
+                if hasattr(m, 'init_weights'):
+                    m.init_weights()
+            self._is_init = True
+        else:
+            warnings.warn(f'init_weights of {self.__class__.__name__} has '
+                          f'been called more than once.')
+
+    def _init_layers(self):
+        """Initialize neural network layers of the head."""
+        self.heads = nn.ModuleList()
+        cls_ptr = 0
+        for task in self.tasks:
+            sizes = self.prior_generator.sizes[cls_ptr:cls_ptr +
+                                               task['num_class']]
+            num_size = torch.tensor(sizes).reshape(-1, 3).size(0)
+            num_rot = len(self.prior_generator.rotations)
+            num_base_anchors = num_rot * num_size
+            branch = dict(
+                type='BaseShapeHead',
+                num_cls=self.num_classes,
+                num_base_anchors=num_base_anchors,
+                box_code_size=self.box_code_size,
+                in_channels=self.in_channels,
+                shared_conv_channels=task['shared_conv_channels'],
+                shared_conv_strides=task['shared_conv_strides'])
+            self.heads.append(MODELS.build(branch))
+            cls_ptr += task['num_class']
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor]:
+        """Forward function on a single-scale feature map.
+
+        Args:
+            x (torch.Tensor): Input features.
+        Returns:
+            tuple[torch.Tensor]: Contain score of each class, bbox
+                regression and direction classification predictions.
+        """
+        results = []
+
+        for head in self.heads:
+            results.append(head(x))
+
+        cls_score = torch.cat([result['cls_score'] for result in results],
+                              dim=1)
+        bbox_pred = torch.cat([result['bbox_pred'] for result in results],
+                              dim=1)
+        dir_cls_preds = None
+        if self.use_direction_classifier:
+            dir_cls_preds = torch.cat(
+                [result['dir_cls_preds'] for result in results], dim=1)
+
+        self.featmap_sizes = []
+        for i, task in enumerate(self.tasks):
+            for _ in range(task['num_class']):
+                self.featmap_sizes.append(results[i]['featmap_size'])
+        assert len(self.featmap_sizes) == len(self.prior_generator.ranges), \
+            'Length of feature map sizes must be equal to length of ' + \
+            'different ranges of anchor generator.'
+
+        return cls_score, bbox_pred, dir_cls_preds
+
+    def loss_single(self, cls_score: Tensor, bbox_pred: Tensor,
+                    dir_cls_preds: Tensor, labels: Tensor,
+                    label_weights: Tensor, bbox_targets: Tensor,
+                    bbox_weights: Tensor, dir_targets: Tensor,
+                    dir_weights: Tensor,
+                    num_total_samples: int) -> Tuple[Tensor]:
+        """Calculate loss of Single-level results.
+
+        Args:
+            cls_score (torch.Tensor): Class score in single-level.
+            bbox_pred (torch.Tensor): Bbox prediction in single-level.
+            dir_cls_preds (torch.Tensor): Predictions of direction class
+                in single-level.
+            labels (torch.Tensor): Labels of class.
+            label_weights (torch.Tensor): Weights of class loss.
+            bbox_targets (torch.Tensor): Targets of bbox predictions.
+            bbox_weights (torch.Tensor): Weights of bbox loss.
+            dir_targets (torch.Tensor): Targets of direction predictions.
+            dir_weights (torch.Tensor): Weights of direction loss.
+            num_total_samples (int): The number of valid samples.
+
+        Returns:
+            tuple[torch.Tensor]: Losses of class, bbox
+                and direction, respectively.
+        """
+        # classification loss
+        if num_total_samples is None:
+            num_total_samples = int(cls_score.shape[0])
+        labels = labels.reshape(-1)
+        label_weights = label_weights.reshape(-1)
+        cls_score = cls_score.reshape(-1, self.num_classes)
+        loss_cls = self.loss_cls(
+            cls_score, labels, label_weights, avg_factor=num_total_samples)
+
+        # regression loss
+        bbox_targets = bbox_targets.reshape(-1, self.box_code_size)
+        bbox_weights = bbox_weights.reshape(-1, self.box_code_size)
+        code_weight = self.train_cfg.get('code_weight', None)
+
+        if code_weight:
+            bbox_weights = bbox_weights * bbox_weights.new_tensor(code_weight)
+        bbox_pred = bbox_pred.reshape(-1, self.box_code_size)
+        if self.diff_rad_by_sin:
+            bbox_pred, bbox_targets = self.add_sin_difference(
+                bbox_pred, bbox_targets)
+        loss_bbox = self.loss_bbox(
+            bbox_pred,
+            bbox_targets,
+            bbox_weights,
+            avg_factor=num_total_samples)
+
+        # direction classification loss
+        loss_dir = None
+        if self.use_direction_classifier:
+            dir_cls_preds = dir_cls_preds.reshape(-1, 2)
+            dir_targets = dir_targets.reshape(-1)
+            dir_weights = dir_weights.reshape(-1)
+            loss_dir = self.loss_dir(
+                dir_cls_preds,
+                dir_targets,
+                dir_weights,
+                avg_factor=num_total_samples)
+
+        return loss_cls, loss_bbox, loss_dir
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            dir_cls_preds: List[Tensor],
+            batch_gt_instances_3d: InstanceList,
+            batch_input_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> Dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[torch.Tensor]): Multi-level class scores.
+            bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
+            dir_cls_preds (list[torch.Tensor]): Multi-level direction
+                class predictions.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_input_metas (list[dict]): Contain pcd and sample's meta info.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, list[torch.Tensor]]: Classification, bbox, and
+                direction losses of each level.
+
+                - loss_cls (list[torch.Tensor]): Classification losses.
+                - loss_bbox (list[torch.Tensor]): Box regression losses.
+                - loss_dir (list[torch.Tensor]): Direction classification
+                    losses.
+        """
+        device = cls_scores[0].device
+        anchor_list = self.get_anchors(
+            self.featmap_sizes, batch_input_metas, device=device)
+        cls_reg_targets = self.anchor_target_3d(
+            anchor_list,
+            batch_gt_instances_3d,
+            batch_input_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            num_classes=self.num_classes,
+            sampling=self.sampling)
+
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         dir_targets_list, dir_weights_list, num_total_pos,
+         num_total_neg) = cls_reg_targets
+        num_total_samples = (
+            num_total_pos + num_total_neg if self.sampling else num_total_pos)
+
+        # num_total_samples = None
+        losses_cls, losses_bbox, losses_dir = multi_apply(
+            self.loss_single,
+            cls_scores,
+            bbox_preds,
+            dir_cls_preds,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            dir_targets_list,
+            dir_weights_list,
+            num_total_samples=num_total_samples)
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir)
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        dir_cls_preds: List[Tensor],
+                        batch_input_metas: List[dict],
+                        cfg: Optional[dict] = None,
+                        rescale: List[Tensor] = False) -> List[tuple]:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_scores (list[torch.Tensor]): Multi-level class scores.
+            bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
+            dir_cls_preds (list[torch.Tensor]): Multi-level direction
+                class predictions.
+            batch_input_metas (list[dict]): Contain pcd and img's meta info.
+            cfg (:obj:`ConfigDict`, optional): Training or testing config.
+                Default: None.
+            rescale (list[torch.Tensor], optional): Whether to rescale bbox.
+                Default: False.
+
+        Returns:
+            list[tuple]: Prediction resultes of batches.
+        """
+        assert len(cls_scores) == len(bbox_preds)
+        assert len(cls_scores) == len(dir_cls_preds)
+        num_levels = len(cls_scores)
+        assert num_levels == 1, 'Only support single level inference.'
+        device = cls_scores[0].device
+        mlvl_anchors = self.prior_generator.grid_anchors(
+            self.featmap_sizes, device=device)
+        # `anchor` is a list of anchors for different classes
+        mlvl_anchors = [torch.cat(anchor, dim=0) for anchor in mlvl_anchors]
+
+        result_list = []
+        for img_id in range(len(batch_input_metas)):
+            cls_score_list = [
+                cls_scores[i][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_pred_list = [
+                bbox_preds[i][img_id].detach() for i in range(num_levels)
+            ]
+            dir_cls_pred_list = [
+                dir_cls_preds[i][img_id].detach() for i in range(num_levels)
+            ]
+
+            input_meta = batch_input_metas[img_id]
+            proposals = self._predict_by_feat_single(cls_score_list,
+                                                     bbox_pred_list,
+                                                     dir_cls_pred_list,
+                                                     mlvl_anchors, input_meta,
+                                                     cfg, rescale)
+            result_list.append(proposals)
+        return result_list
+
+    def _predict_by_feat_single(self,
+                                cls_scores: Tensor,
+                                bbox_preds: Tensor,
+                                dir_cls_preds: Tensor,
+                                mlvl_anchors: List[Tensor],
+                                input_meta: List[dict],
+                                cfg: Dict = None,
+                                rescale: List[Tensor] = False):
+        """Transform a single point's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_scores (torch.Tensor): Class score in single batch.
+            bbox_preds (torch.Tensor): Bbox prediction in single batch.
+            dir_cls_preds (torch.Tensor): Predictions of direction class
+                in single batch.
+            mlvl_anchors (List[torch.Tensor]): Multi-level anchors
+                in single batch.
+            input_meta (list[dict]): Contain pcd and img's meta info.
+            cfg (:obj:`ConfigDict`): Training or testing config.
+            rescale (list[torch.Tensor]): whether to rescale bbox.
+                Default: False.
+
+        Returns:
+            tuple: Contain predictions of single batch.
+
+                - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes.
+                - scores (torch.Tensor): Class score of each bbox.
+                - labels (torch.Tensor): Label of each bbox.
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_dir_scores = []
+        for cls_score, bbox_pred, dir_cls_pred, anchors in zip(
+                cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors):
+            assert cls_score.size()[-2] == bbox_pred.size()[-2]
+            assert cls_score.size()[-2] == dir_cls_pred.size()[-2]
+            dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
+
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)
+
+            nms_pre = cfg.get('nms_pre', -1)
+            if nms_pre > 0 and scores.shape[0] > nms_pre:
+                if self.use_sigmoid_cls:
+                    max_scores, _ = scores.max(dim=1)
+                else:
+                    max_scores, _ = scores[:, :-1].max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                anchors = anchors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+                dir_cls_score = dir_cls_score[topk_inds]
+
+            bboxes = self.bbox_coder.decode(anchors, bbox_pred)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_dir_scores.append(dir_cls_score)
+
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
+            mlvl_bboxes, box_dim=self.box_code_size).bev)
+        mlvl_scores = torch.cat(mlvl_scores)
+        mlvl_dir_scores = torch.cat(mlvl_dir_scores)
+
+        if self.use_sigmoid_cls:
+            # Add a dummy background class to the front when using sigmoid
+            padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+            mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
+
+        score_thr = cfg.get('score_thr', 0)
+        results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
+                                       mlvl_scores, score_thr, cfg.max_num,
+                                       cfg, mlvl_dir_scores)
+        bboxes, scores, labels, dir_scores = results
+        if bboxes.shape[0] > 0:
+            dir_rot = limit_period(bboxes[..., 6] - self.dir_offset,
+                                   self.dir_limit_offset, np.pi)
+            bboxes[..., 6] = (
+                dir_rot + self.dir_offset +
+                np.pi * dir_scores.to(bboxes.dtype))
+        bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size)
+        results = InstanceData()
+        results.bboxes_3d = bboxes
+        results.scores_3d = scores
+        results.labels_3d = labels
+        return results
diff --git a/mmde/mmdet3d/models/dense_heads/smoke_mono3d_head.py b/mmde/mmdet3d/models/dense_heads/smoke_mono3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..23f3ad2d27f37d962be51f9f209a87dadebce2b8
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/smoke_mono3d_head.py
@@ -0,0 +1,554 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+from mmdet.models.utils import (gaussian_radius, gen_gaussian_target,
+                                multi_apply)
+from mmdet.models.utils.gaussian_target import (get_local_maximum,
+                                                get_topk_from_heatmap,
+                                                transpose_and_gather_feat)
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.utils import (ConfigType, InstanceList, OptConfigType,
+                           OptInstanceList, OptMultiConfig)
+from .anchor_free_mono3d_head import AnchorFreeMono3DHead
+
+
+@MODELS.register_module()
+class SMOKEMono3DHead(AnchorFreeMono3DHead):
+    r"""Anchor-free head used in `SMOKE <https://arxiv.org/abs/2002.10111>`_
+
+    .. code-block:: none
+
+                /-----> 3*3 conv -----> 1*1 conv -----> cls
+        feature
+                \-----> 3*3 conv -----> 1*1 conv -----> reg
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        dim_channel (list[int]): indices of dimension offset preds in
+            regression heatmap channels.
+        ori_channel (list[int]): indices of orientation offset pred in
+            regression heatmap channels.
+        bbox_coder (:obj:`ConfigDict` or dict): Bbox coder for encoding
+            and decoding boxes.
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+            Default: loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0).
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+            Default: loss_bbox=dict(type='L1Loss', loss_weight=10.0).
+        loss_dir (:obj:`ConfigDict` or dict, Optional): Config of direction
+            classification loss. In SMOKE, Default: None.
+        loss_attr (:obj:`ConfigDict` or dict, Optional): Config of attribute
+            classification loss. In SMOKE, Default: None.
+        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and config norm layer.
+            Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict. Defaults to None.
+    """  # noqa: E501
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 dim_channel: List[int],
+                 ori_channel: List[int],
+                 bbox_coder: ConfigType,
+                 loss_cls: ConfigType = dict(
+                     type='mmdet.GaussionFocalLoss', loss_weight=1.0),
+                 loss_bbox: ConfigType = dict(
+                     type='mmdet.L1Loss', loss_weight=0.1),
+                 loss_dir: OptConfigType = None,
+                 loss_attr: OptConfigType = None,
+                 norm_cfg: OptConfigType = dict(
+                     type='GN', num_groups=32, requires_grad=True),
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        super().__init__(
+            num_classes,
+            in_channels,
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_dir=loss_dir,
+            loss_attr=loss_attr,
+            norm_cfg=norm_cfg,
+            init_cfg=init_cfg,
+            **kwargs)
+        self.dim_channel = dim_channel
+        self.ori_channel = ori_channel
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+
+    def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward features from the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Box scores for each scale level,
+                    each is a 4D-tensor, the channel number is
+                    num_points * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * bbox_code_size.
+        """
+        return multi_apply(self.forward_single, x)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): Input feature map.
+
+        Returns:
+            tuple: Scores for each class, bbox of input feature maps.
+        """
+        cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \
+            super().forward_single(x)
+        cls_score = cls_score.sigmoid()  # turn to 0-1
+        cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4)
+        # (N, C, H, W)
+        offset_dims = bbox_pred[:, self.dim_channel, ...]
+        bbox_pred[:, self.dim_channel, ...] = offset_dims.sigmoid() - 0.5
+        # (N, C, H, W)
+        vector_ori = bbox_pred[:, self.ori_channel, ...]
+        bbox_pred[:, self.ori_channel, ...] = F.normalize(vector_ori)
+        return cls_score, bbox_pred
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        batch_img_metas: Optional[List[dict]] = None,
+                        rescale: bool = None) -> InstanceList:
+        """Generate bboxes from bbox head predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+            bbox_preds (list[Tensor]): Box regression for each scale.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+
+        Returns:
+            list[:obj:`InstanceData`]: 3D Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instance, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+                (num_instances, 7).
+        """
+        assert len(cls_scores) == len(bbox_preds) == 1
+        cam2imgs = torch.stack([
+            cls_scores[0].new_tensor(img_meta['cam2img'])
+            for img_meta in batch_img_metas
+        ])
+        trans_mats = torch.stack([
+            cls_scores[0].new_tensor(img_meta['trans_mat'])
+            for img_meta in batch_img_metas
+        ])
+        batch_bboxes, batch_scores, batch_topk_labels = self._decode_heatmap(
+            cls_scores[0],
+            bbox_preds[0],
+            batch_img_metas,
+            cam2imgs=cam2imgs,
+            trans_mats=trans_mats,
+            topk=100,
+            kernel=3)
+
+        result_list = []
+        for img_id in range(len(batch_img_metas)):
+
+            bboxes = batch_bboxes[img_id]
+            scores = batch_scores[img_id]
+            labels = batch_topk_labels[img_id]
+
+            keep_idx = scores > 0.25
+            bboxes = bboxes[keep_idx]
+            scores = scores[keep_idx]
+            labels = labels[keep_idx]
+
+            bboxes = batch_img_metas[img_id]['box_type_3d'](
+                bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5))
+            attrs = None
+
+            results = InstanceData()
+            results.bboxes_3d = bboxes
+            results.labels_3d = labels
+            results.scores_3d = scores
+
+            if attrs is not None:
+                results.attr_labels = attrs
+
+            result_list.append(results)
+
+        return result_list
+
+    def _decode_heatmap(self,
+                        cls_score: Tensor,
+                        reg_pred: Tensor,
+                        batch_img_metas: List[dict],
+                        cam2imgs: Tensor,
+                        trans_mats: Tensor,
+                        topk: int = 100,
+                        kernel: int = 3) -> Tuple[Tensor, Tensor, Tensor]:
+        """Transform outputs into detections raw bbox predictions.
+
+        Args:
+            class_score (Tensor): Center predict heatmap,
+                shape (B, num_classes, H, W).
+            reg_pred (Tensor): Box regression map.
+                shape (B, channel, H , W).
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            cam2imgs (Tensor): Camera intrinsic matrixs.
+                shape (B, 4, 4)
+            trans_mats (Tensor): Transformation matrix from original image
+                to feature map.
+                shape: (batch, 3, 3)
+            topk (int): Get top k center keypoints from heatmap. Default 100.
+            kernel (int): Max pooling kernel for extract local maximum pixels.
+               Default 3.
+
+        Returns:
+            tuple[torch.Tensor]: Decoded output of SMOKEHead, containing
+               the following Tensors:
+
+              - batch_bboxes (Tensor): Coords of each 3D box.
+                    shape (B, k, 7)
+              - batch_scores (Tensor): Scores of each 3D box.
+                    shape (B, k)
+              - batch_topk_labels (Tensor): Categories of each 3D box.
+                    shape (B, k)
+        """
+        img_h, img_w = batch_img_metas[0]['pad_shape'][:2]
+        bs, _, feat_h, feat_w = cls_score.shape
+
+        center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)
+
+        *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
+            center_heatmap_pred, k=topk)
+        batch_scores, batch_index, batch_topk_labels = batch_dets
+
+        regression = transpose_and_gather_feat(reg_pred, batch_index)
+        regression = regression.view(-1, 8)
+
+        points = torch.cat([topk_xs.view(-1, 1),
+                            topk_ys.view(-1, 1).float()],
+                           dim=1)
+        locations, dimensions, orientations = self.bbox_coder.decode(
+            regression, points, batch_topk_labels, cam2imgs, trans_mats)
+
+        batch_bboxes = torch.cat((locations, dimensions, orientations), dim=1)
+        batch_bboxes = batch_bboxes.view(bs, -1, self.bbox_code_size)
+        return batch_bboxes, batch_scores, batch_topk_labels
+
+    def get_predictions(self, labels_3d: Tensor, centers_2d: Tensor,
+                        gt_locations: Tensor, gt_dimensions: Tensor,
+                        gt_orientations: Tensor, indices: Tensor,
+                        batch_img_metas: List[dict], pred_reg: Tensor) -> dict:
+        """Prepare predictions for computing loss.
+
+        Args:
+            labels_3d (Tensor): Labels of each 3D box.
+                shape (B, max_objs, )
+            centers_2d (Tensor): Coords of each projected 3D box
+                center on image. shape (B * max_objs, 2)
+            gt_locations (Tensor): Coords of each 3D box's location.
+                shape (B * max_objs, 3)
+            gt_dimensions (Tensor): Dimensions of each 3D box.
+                shape (N, 3)
+            gt_orientations (Tensor): Orientation(yaw) of each 3D box.
+                shape (N, 1)
+            indices (Tensor): Indices of the existence of the 3D box.
+                shape (B * max_objs, )
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            pre_reg (Tensor): Box regression map.
+                shape (B, channel, H , W).
+
+        Returns:
+            dict: the dict has components below:
+
+            - bbox3d_yaws (:obj:`CameraInstance3DBoxes`):
+                bbox calculated using pred orientations.
+            - bbox3d_dims (:obj:`CameraInstance3DBoxes`):
+                bbox calculated using pred dimensions.
+            - bbox3d_locs (:obj:`CameraInstance3DBoxes`):
+                bbox calculated using pred locations.
+        """
+        batch, channel = pred_reg.shape[0], pred_reg.shape[1]
+        w = pred_reg.shape[3]
+        cam2imgs = torch.stack([
+            gt_locations.new_tensor(img_meta['cam2img'])
+            for img_meta in batch_img_metas
+        ])
+        trans_mats = torch.stack([
+            gt_locations.new_tensor(img_meta['trans_mat'])
+            for img_meta in batch_img_metas
+        ])
+        centers_2d_inds = centers_2d[:, 1] * w + centers_2d[:, 0]
+        centers_2d_inds = centers_2d_inds.view(batch, -1)
+        pred_regression = transpose_and_gather_feat(pred_reg, centers_2d_inds)
+        pred_regression_pois = pred_regression.view(-1, channel)
+        locations, dimensions, orientations = self.bbox_coder.decode(
+            pred_regression_pois, centers_2d, labels_3d, cam2imgs, trans_mats,
+            gt_locations)
+
+        locations, dimensions, orientations = locations[indices], dimensions[
+            indices], orientations[indices]
+
+        locations[:, 1] += dimensions[:, 1] / 2
+
+        gt_locations = gt_locations[indices]
+
+        assert len(locations) == len(gt_locations)
+        assert len(dimensions) == len(gt_dimensions)
+        assert len(orientations) == len(gt_orientations)
+        bbox3d_yaws = self.bbox_coder.encode(gt_locations, gt_dimensions,
+                                             orientations, batch_img_metas)
+        bbox3d_dims = self.bbox_coder.encode(gt_locations, dimensions,
+                                             gt_orientations, batch_img_metas)
+        bbox3d_locs = self.bbox_coder.encode(locations, gt_dimensions,
+                                             gt_orientations, batch_img_metas)
+
+        pred_bboxes = dict(ori=bbox3d_yaws, dim=bbox3d_dims, loc=bbox3d_locs)
+
+        return pred_bboxes
+
+    def get_targets(self, batch_gt_instances_3d: InstanceList,
+                    batch_gt_instances: InstanceList, feat_shape: Tuple[int],
+                    batch_img_metas: List[dict]) -> Tuple[Tensor, int, dict]:
+        """Get training targets for batch images.
+
+        Args:
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、
+                ``labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes``、``labels``.
+            feat_shape (tuple[int]): Feature map shape with value,
+                shape (B, _, H, W).
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+        Returns:
+            tuple[Tensor, int, dict]: The Tensor value is the targets of
+                center heatmap, the dict has components below:
+
+              - gt_centers_2d (Tensor): Coords of each projected 3D box
+                    center on image. shape (B * max_objs, 2)
+              - gt_labels_3d (Tensor): Labels of each 3D box.
+                    shape (B, max_objs, )
+              - indices (Tensor): Indices of the existence of the 3D box.
+                    shape (B * max_objs, )
+              - affine_indices (Tensor): Indices of the affine of the 3D box.
+                    shape (N, )
+              - gt_locs (Tensor): Coords of each 3D box's location.
+                    shape (N, 3)
+              - gt_dims (Tensor): Dimensions of each 3D box.
+                    shape (N, 3)
+              - gt_yaws (Tensor): Orientation(yaw) of each 3D box.
+                    shape (N, 1)
+              - gt_cors (Tensor): Coords of the corners of each 3D box.
+                    shape (N, 8, 3)
+        """
+
+        gt_bboxes = [
+            gt_instances.bboxes for gt_instances in batch_gt_instances
+        ]
+        gt_labels = [
+            gt_instances.labels for gt_instances in batch_gt_instances
+        ]
+        gt_bboxes_3d = [
+            gt_instances_3d.bboxes_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        gt_labels_3d = [
+            gt_instances_3d.labels_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        centers_2d = [
+            gt_instances_3d.centers_2d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        img_shape = batch_img_metas[0]['pad_shape']
+
+        reg_mask = torch.stack([
+            gt_bboxes[0].new_tensor(
+                not img_meta['affine_aug'], dtype=torch.bool)
+            for img_meta in batch_img_metas
+        ])
+
+        img_h, img_w = img_shape[:2]
+        bs, _, feat_h, feat_w = feat_shape
+
+        width_ratio = float(feat_w / img_w)  # 1/4
+        height_ratio = float(feat_h / img_h)  # 1/4
+
+        assert width_ratio == height_ratio
+
+        center_heatmap_target = gt_bboxes[-1].new_zeros(
+            [bs, self.num_classes, feat_h, feat_w])
+
+        gt_centers_2d = centers_2d.copy()
+
+        for batch_id in range(bs):
+            gt_bbox = gt_bboxes[batch_id]
+            gt_label = gt_labels[batch_id]
+            # project centers_2d from input image to feat map
+            gt_center_2d = gt_centers_2d[batch_id] * width_ratio
+
+            for j, center in enumerate(gt_center_2d):
+                center_x_int, center_y_int = center.int()
+                scale_box_h = (gt_bbox[j][3] - gt_bbox[j][1]) * height_ratio
+                scale_box_w = (gt_bbox[j][2] - gt_bbox[j][0]) * width_ratio
+                radius = gaussian_radius([scale_box_h, scale_box_w],
+                                         min_overlap=0.7)
+                radius = max(0, int(radius))
+                ind = gt_label[j]
+                gen_gaussian_target(center_heatmap_target[batch_id, ind],
+                                    [center_x_int, center_y_int], radius)
+
+        avg_factor = max(1, center_heatmap_target.eq(1).sum())
+        num_ctrs = [center_2d.shape[0] for center_2d in centers_2d]
+        max_objs = max(num_ctrs)
+
+        reg_inds = torch.cat(
+            [reg_mask[i].repeat(num_ctrs[i]) for i in range(bs)])
+
+        inds = torch.zeros((bs, max_objs),
+                           dtype=torch.bool).to(centers_2d[0].device)
+
+        # put gt 3d bboxes to gpu
+        gt_bboxes_3d = [
+            gt_bbox_3d.to(centers_2d[0].device) for gt_bbox_3d in gt_bboxes_3d
+        ]
+
+        batch_centers_2d = centers_2d[0].new_zeros((bs, max_objs, 2))
+        batch_labels_3d = gt_labels_3d[0].new_zeros((bs, max_objs))
+        batch_gt_locations = \
+            gt_bboxes_3d[0].tensor.new_zeros((bs, max_objs, 3))
+        for i in range(bs):
+            inds[i, :num_ctrs[i]] = 1
+            batch_centers_2d[i, :num_ctrs[i]] = centers_2d[i]
+            batch_labels_3d[i, :num_ctrs[i]] = gt_labels_3d[i]
+            batch_gt_locations[i, :num_ctrs[i]] = \
+                gt_bboxes_3d[i].tensor[:, :3]
+
+        inds = inds.flatten()
+        batch_centers_2d = batch_centers_2d.view(-1, 2) * width_ratio
+        batch_gt_locations = batch_gt_locations.view(-1, 3)
+
+        # filter the empty image, without gt_bboxes_3d
+        gt_bboxes_3d = [
+            gt_bbox_3d for gt_bbox_3d in gt_bboxes_3d
+            if gt_bbox_3d.tensor.shape[0] > 0
+        ]
+
+        gt_dimensions = torch.cat(
+            [gt_bbox_3d.tensor[:, 3:6] for gt_bbox_3d in gt_bboxes_3d])
+        gt_orientations = torch.cat([
+            gt_bbox_3d.tensor[:, 6].unsqueeze(-1)
+            for gt_bbox_3d in gt_bboxes_3d
+        ])
+        gt_corners = torch.cat(
+            [gt_bbox_3d.corners for gt_bbox_3d in gt_bboxes_3d])
+
+        target_labels = dict(
+            gt_centers_2d=batch_centers_2d.long(),
+            gt_labels_3d=batch_labels_3d,
+            indices=inds,
+            reg_indices=reg_inds,
+            gt_locs=batch_gt_locations,
+            gt_dims=gt_dimensions,
+            gt_yaws=gt_orientations,
+            gt_cors=gt_corners)
+
+        return center_heatmap_target, avg_factor, target_labels
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            batch_gt_instances_3d: InstanceList,
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+                shape (num_gt, 4).
+            bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel
+                number is bbox_code_size.
+                shape (B, 7, H, W).
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、
+                ``labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes``、``labels``.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components, which has
+                components below:
+
+            - loss_cls (Tensor): loss of cls heatmap.
+            - loss_bbox (Tensor): loss of bbox heatmap.
+        """
+        assert len(cls_scores) == len(bbox_preds) == 1
+        center_2d_heatmap = cls_scores[0]
+        pred_reg = bbox_preds[0]
+
+        center_2d_heatmap_target, avg_factor, target_labels = \
+            self.get_targets(batch_gt_instances_3d,
+                             batch_gt_instances,
+                             center_2d_heatmap.shape,
+                             batch_img_metas)
+
+        pred_bboxes = self.get_predictions(
+            labels_3d=target_labels['gt_labels_3d'],
+            centers_2d=target_labels['gt_centers_2d'],
+            gt_locations=target_labels['gt_locs'],
+            gt_dimensions=target_labels['gt_dims'],
+            gt_orientations=target_labels['gt_yaws'],
+            indices=target_labels['indices'],
+            batch_img_metas=batch_img_metas,
+            pred_reg=pred_reg)
+
+        loss_cls = self.loss_cls(
+            center_2d_heatmap, center_2d_heatmap_target, avg_factor=avg_factor)
+
+        reg_inds = target_labels['reg_indices']
+
+        loss_bbox_oris = self.loss_bbox(
+            pred_bboxes['ori'].corners[reg_inds, ...],
+            target_labels['gt_cors'][reg_inds, ...])
+
+        loss_bbox_dims = self.loss_bbox(
+            pred_bboxes['dim'].corners[reg_inds, ...],
+            target_labels['gt_cors'][reg_inds, ...])
+
+        loss_bbox_locs = self.loss_bbox(
+            pred_bboxes['loc'].corners[reg_inds, ...],
+            target_labels['gt_cors'][reg_inds, ...])
+
+        loss_bbox = loss_bbox_dims + loss_bbox_locs + loss_bbox_oris
+
+        loss_dict = dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
+
+        return loss_dict
diff --git a/mmde/mmdet3d/models/dense_heads/ssd_3d_head.py b/mmde/mmdet3d/models/dense_heads/ssd_3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8f42f8c3fb8e361580ffc2a0fbc7b08bf23dfa3
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/ssd_3d_head.py
@@ -0,0 +1,583 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple, Union
+
+import torch
+from mmcv.ops.nms import batched_nms
+from mmdet.models.utils import multi_apply
+from mmengine import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import BaseInstance3DBoxes
+from mmdet3d.structures.bbox_3d import (DepthInstance3DBoxes,
+                                        LiDARInstance3DBoxes,
+                                        rotation_3d_in_axis)
+from .vote_head import VoteHead
+
+
+@MODELS.register_module()
+class SSD3DHead(VoteHead):
+    r"""Bbox head of `3DSSD <https://arxiv.org/abs/2002.10187>`_.
+
+    Args:
+        num_classes (int): The number of class.
+        bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
+            decoding boxes.
+        train_cfg (dict): Config for training.
+        test_cfg (dict): Config for testing.
+        vote_module_cfg (dict): Config of VoteModule for point-wise votes.
+        vote_aggregation_cfg (dict): Config of vote aggregation layer.
+        pred_layer_cfg (dict): Config of classfication and regression
+            prediction layers.
+        conv_cfg (dict): Config of convolution in prediction layer.
+        norm_cfg (dict): Config of BN in prediction layer.
+        act_cfg (dict): Config of activation in prediction layer.
+        objectness_loss (dict): Config of objectness loss.
+        center_loss (dict): Config of center loss.
+        dir_class_loss (dict): Config of direction classification loss.
+        dir_res_loss (dict): Config of direction residual regression loss.
+        size_res_loss (dict): Config of size residual regression loss.
+        corner_loss (dict): Config of bbox corners regression loss.
+        vote_loss (dict): Config of candidate points regression loss.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 bbox_coder: Union[ConfigDict, dict],
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 vote_module_cfg: Optional[dict] = None,
+                 vote_aggregation_cfg: Optional[dict] = None,
+                 pred_layer_cfg: Optional[dict] = None,
+                 objectness_loss: Optional[dict] = None,
+                 center_loss: Optional[dict] = None,
+                 dir_class_loss: Optional[dict] = None,
+                 dir_res_loss: Optional[dict] = None,
+                 size_res_loss: Optional[dict] = None,
+                 corner_loss: Optional[dict] = None,
+                 vote_loss: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None) -> None:
+        super(SSD3DHead, self).__init__(
+            num_classes,
+            bbox_coder,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            vote_module_cfg=vote_module_cfg,
+            vote_aggregation_cfg=vote_aggregation_cfg,
+            pred_layer_cfg=pred_layer_cfg,
+            objectness_loss=objectness_loss,
+            center_loss=center_loss,
+            dir_class_loss=dir_class_loss,
+            dir_res_loss=dir_res_loss,
+            size_class_loss=None,
+            size_res_loss=size_res_loss,
+            semantic_loss=None,
+            init_cfg=init_cfg)
+        self.corner_loss = MODELS.build(corner_loss)
+        self.vote_loss = MODELS.build(vote_loss)
+        self.num_candidates = vote_module_cfg['num_points']
+
+    def _get_cls_out_channels(self) -> int:
+        """Return the channel number of classification outputs."""
+        # Class numbers (k) + objectness (1)
+        return self.num_classes
+
+    def _get_reg_out_channels(self) -> int:
+        """Return the channel number of regression outputs."""
+        # Bbox classification and regression
+        # (center residual (3), size regression (3)
+        # heading class+residual (num_dir_bins*2)),
+        return 3 + 3 + self.num_dir_bins * 2
+
+    def _extract_input(self, feat_dict: dict) -> Tuple:
+        """Extract inputs from features dictionary.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            torch.Tensor: Coordinates of input points.
+            torch.Tensor: Features of input points.
+            torch.Tensor: Indices of input points.
+        """
+        seed_points = feat_dict['sa_xyz'][-1]
+        seed_features = feat_dict['sa_features'][-1]
+        seed_indices = feat_dict['sa_indices'][-1]
+
+        return seed_points, seed_features, seed_indices
+
+    def loss_by_feat(
+            self,
+            points: List[torch.Tensor],
+            bbox_preds_dict: dict,
+            batch_gt_instances_3d: List[InstanceData],
+            batch_pts_semantic_mask: Optional[List[torch.Tensor]] = None,
+            batch_pts_instance_mask: Optional[List[torch.Tensor]] = None,
+            batch_input_metas: List[dict] = None,
+            ret_target: bool = False,
+            **kwargs) -> dict:
+        """Compute loss.
+
+        Args:
+            points (list[torch.Tensor]): Input points.
+            bbox_preds_dict (dict): Predictions from forward of vote head.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_pts_semantic_mask (list[tensor]): Semantic mask
+                of points cloud. Defaults to None. Defaults to None.
+            batch_pts_semantic_mask (list[tensor]): Instance mask
+                of points cloud. Defaults to None. Defaults to None.
+            batch_input_metas (list[dict]): Contain pcd and img's meta info.
+            ret_target (bool): Return targets or not.  Defaults to False.
+
+        Returns:
+            dict: Losses of 3DSSD.
+        """
+
+        targets = self.get_targets(points, bbox_preds_dict,
+                                   batch_gt_instances_3d,
+                                   batch_pts_semantic_mask,
+                                   batch_pts_instance_mask)
+        (vote_targets, center_targets, size_res_targets, dir_class_targets,
+         dir_res_targets, mask_targets, centerness_targets, corner3d_targets,
+         vote_mask, positive_mask, negative_mask, centerness_weights,
+         box_loss_weights, heading_res_loss_weight) = targets
+
+        # calculate centerness loss
+        centerness_loss = self.loss_objectness(
+            bbox_preds_dict['obj_scores'].transpose(2, 1),
+            centerness_targets,
+            weight=centerness_weights)
+
+        # calculate center loss
+        center_loss = self.loss_center(
+            bbox_preds_dict['center_offset'],
+            center_targets,
+            weight=box_loss_weights.unsqueeze(-1))
+
+        # calculate direction class loss
+        dir_class_loss = self.loss_dir_class(
+            bbox_preds_dict['dir_class'].transpose(1, 2),
+            dir_class_targets,
+            weight=box_loss_weights)
+
+        # calculate direction residual loss
+        dir_res_loss = self.loss_dir_res(
+            bbox_preds_dict['dir_res_norm'],
+            dir_res_targets.unsqueeze(-1).repeat(1, 1, self.num_dir_bins),
+            weight=heading_res_loss_weight)
+
+        # calculate size residual loss
+        size_loss = self.loss_size_res(
+            bbox_preds_dict['size'],
+            size_res_targets,
+            weight=box_loss_weights.unsqueeze(-1))
+
+        # calculate corner loss
+        one_hot_dir_class_targets = dir_class_targets.new_zeros(
+            bbox_preds_dict['dir_class'].shape)
+        one_hot_dir_class_targets.scatter_(2, dir_class_targets.unsqueeze(-1),
+                                           1)
+        pred_bbox3d = self.bbox_coder.decode(
+            dict(
+                center=bbox_preds_dict['center'],
+                dir_res=bbox_preds_dict['dir_res'],
+                dir_class=one_hot_dir_class_targets,
+                size=bbox_preds_dict['size']))
+        pred_bbox3d = pred_bbox3d.reshape(-1, pred_bbox3d.shape[-1])
+        pred_bbox3d = batch_input_metas[0]['box_type_3d'](
+            pred_bbox3d.clone(),
+            box_dim=pred_bbox3d.shape[-1],
+            with_yaw=self.bbox_coder.with_rot,
+            origin=(0.5, 0.5, 0.5))
+        pred_corners3d = pred_bbox3d.corners.reshape(-1, 8, 3)
+        corner_loss = self.corner_loss(
+            pred_corners3d,
+            corner3d_targets.reshape(-1, 8, 3),
+            weight=box_loss_weights.view(-1, 1, 1))
+
+        # calculate vote loss
+        vote_loss = self.vote_loss(
+            bbox_preds_dict['vote_offset'].transpose(1, 2),
+            vote_targets,
+            weight=vote_mask.unsqueeze(-1))
+
+        losses = dict(
+            centerness_loss=centerness_loss,
+            center_loss=center_loss,
+            dir_class_loss=dir_class_loss,
+            dir_res_loss=dir_res_loss,
+            size_res_loss=size_loss,
+            corner_loss=corner_loss,
+            vote_loss=vote_loss)
+
+        return losses
+
+    def get_targets(
+        self,
+        points: List[Tensor],
+        bbox_preds_dict: dict = None,
+        batch_gt_instances_3d: List[InstanceData] = None,
+        batch_pts_semantic_mask: List[torch.Tensor] = None,
+        batch_pts_instance_mask: List[torch.Tensor] = None,
+    ) -> Tuple[Tensor]:
+        """Generate targets of 3DSSD head.
+
+        Args:
+            points (list[torch.Tensor]): Points of each batch.
+            bbox_preds_dict (dict): Bounding box predictions of
+                vote head.  Defaults to None.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes`` and ``labels``
+                attributes.  Defaults to None.
+            batch_pts_semantic_mask (list[tensor]): Semantic gt mask for
+                point clouds.  Defaults to None.
+            batch_pts_instance_mask (list[tensor]): Instance gt mask for
+                point clouds. Defaults to None.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of 3DSSD head.
+        """
+        batch_gt_labels_3d = [
+            gt_instances_3d.labels_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        batch_gt_bboxes_3d = [
+            gt_instances_3d.bboxes_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+
+        # find empty example
+        for index in range(len(batch_gt_labels_3d)):
+            if len(batch_gt_labels_3d[index]) == 0:
+                fake_box = batch_gt_bboxes_3d[index].tensor.new_zeros(
+                    1, batch_gt_bboxes_3d[index].tensor.shape[-1])
+                batch_gt_bboxes_3d[index] = batch_gt_bboxes_3d[index].new_box(
+                    fake_box)
+                batch_gt_labels_3d[index] = batch_gt_labels_3d[
+                    index].new_zeros(1)
+
+        if batch_pts_semantic_mask is None:
+            batch_pts_semantic_mask = [
+                None for _ in range(len(batch_gt_labels_3d))
+            ]
+            batch_pts_instance_mask = [
+                None for _ in range(len(batch_gt_labels_3d))
+            ]
+
+        aggregated_points = [
+            bbox_preds_dict['aggregated_points'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        seed_points = [
+            bbox_preds_dict['seed_points'][i, :self.num_candidates].detach()
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        (vote_targets, center_targets, size_res_targets, dir_class_targets,
+         dir_res_targets, mask_targets, centerness_targets, corner3d_targets,
+         vote_mask, positive_mask, negative_mask) = multi_apply(
+             self.get_targets_single, points, batch_gt_bboxes_3d,
+             batch_gt_labels_3d, batch_pts_semantic_mask,
+             batch_pts_instance_mask, aggregated_points, seed_points)
+
+        center_targets = torch.stack(center_targets)
+        positive_mask = torch.stack(positive_mask)
+        negative_mask = torch.stack(negative_mask)
+        dir_class_targets = torch.stack(dir_class_targets)
+        dir_res_targets = torch.stack(dir_res_targets)
+        size_res_targets = torch.stack(size_res_targets)
+        mask_targets = torch.stack(mask_targets)
+        centerness_targets = torch.stack(centerness_targets).detach()
+        corner3d_targets = torch.stack(corner3d_targets)
+        vote_targets = torch.stack(vote_targets)
+        vote_mask = torch.stack(vote_mask)
+
+        center_targets -= bbox_preds_dict['aggregated_points']
+
+        centerness_weights = (positive_mask +
+                              negative_mask).unsqueeze(-1).repeat(
+                                  1, 1, self.num_classes).float()
+        centerness_weights = centerness_weights / \
+            (centerness_weights.sum() + 1e-6)
+        vote_mask = vote_mask / (vote_mask.sum() + 1e-6)
+
+        box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6)
+
+        batch_size, proposal_num = dir_class_targets.shape[:2]
+        heading_label_one_hot = dir_class_targets.new_zeros(
+            (batch_size, proposal_num, self.num_dir_bins))
+        heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1)
+        heading_res_loss_weight = heading_label_one_hot * \
+            box_loss_weights.unsqueeze(-1)
+
+        return (vote_targets, center_targets, size_res_targets,
+                dir_class_targets, dir_res_targets, mask_targets,
+                centerness_targets, corner3d_targets, vote_mask, positive_mask,
+                negative_mask, centerness_weights, box_loss_weights,
+                heading_res_loss_weight)
+
+    def get_targets_single(self,
+                           points: Tensor,
+                           gt_bboxes_3d: BaseInstance3DBoxes,
+                           gt_labels_3d: Tensor,
+                           pts_semantic_mask: Optional[Tensor] = None,
+                           pts_instance_mask: Optional[Tensor] = None,
+                           aggregated_points: Optional[Tensor] = None,
+                           seed_points: Optional[Tensor] = None,
+                           **kwargs):
+        """Generate targets of ssd3d head for single batch.
+
+        Args:
+            points (torch.Tensor): Points of each batch.
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
+                boxes of each batch.
+            gt_labels_3d (torch.Tensor): Labels of each batch.
+            pts_semantic_mask (torch.Tensor): Point-wise semantic
+                label of each batch.
+            pts_instance_mask (torch.Tensor): Point-wise instance
+                label of each batch.
+            aggregated_points (torch.Tensor): Aggregated points from
+                candidate points layer.
+            seed_points (torch.Tensor): Seed points of candidate points.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of ssd3d head.
+        """
+        assert self.bbox_coder.with_rot or pts_semantic_mask is not None
+        gt_bboxes_3d = gt_bboxes_3d.to(points.device)
+        valid_gt = gt_labels_3d != -1
+        gt_bboxes_3d = gt_bboxes_3d[valid_gt]
+        gt_labels_3d = gt_labels_3d[valid_gt]
+
+        # Generate fake GT for empty scene
+        if valid_gt.sum() == 0:
+            vote_targets = points.new_zeros(self.num_candidates, 3)
+            center_targets = points.new_zeros(self.num_candidates, 3)
+            size_res_targets = points.new_zeros(self.num_candidates, 3)
+            dir_class_targets = points.new_zeros(
+                self.num_candidates, dtype=torch.int64)
+            dir_res_targets = points.new_zeros(self.num_candidates)
+            mask_targets = points.new_zeros(
+                self.num_candidates, dtype=torch.int64)
+            centerness_targets = points.new_zeros(self.num_candidates,
+                                                  self.num_classes)
+            corner3d_targets = points.new_zeros(self.num_candidates, 8, 3)
+            vote_mask = points.new_zeros(self.num_candidates, dtype=torch.bool)
+            positive_mask = points.new_zeros(
+                self.num_candidates, dtype=torch.bool)
+            negative_mask = points.new_ones(
+                self.num_candidates, dtype=torch.bool)
+            return (vote_targets, center_targets, size_res_targets,
+                    dir_class_targets, dir_res_targets, mask_targets,
+                    centerness_targets, corner3d_targets, vote_mask,
+                    positive_mask, negative_mask)
+
+        gt_corner3d = gt_bboxes_3d.corners
+
+        (center_targets, size_targets, dir_class_targets,
+         dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d)
+
+        points_mask, assignment = self._assign_targets_by_points_inside(
+            gt_bboxes_3d, aggregated_points)
+
+        center_targets = center_targets[assignment]
+        size_res_targets = size_targets[assignment]
+        mask_targets = gt_labels_3d[assignment]
+        dir_class_targets = dir_class_targets[assignment]
+        dir_res_targets = dir_res_targets[assignment]
+        corner3d_targets = gt_corner3d[assignment]
+
+        top_center_targets = center_targets.clone()
+        top_center_targets[:, 2] += size_res_targets[:, 2]
+        dist = torch.norm(aggregated_points - top_center_targets, dim=1)
+        dist_mask = dist < self.train_cfg.pos_distance_thr
+        positive_mask = (points_mask.max(1)[0] > 0) * dist_mask
+        negative_mask = (points_mask.max(1)[0] == 0)
+
+        # Centerness loss targets
+        canonical_xyz = aggregated_points - center_targets
+        if self.bbox_coder.with_rot:
+            # TODO: Align points rotation implementation of
+            # LiDARInstance3DBoxes and DepthInstance3DBoxes
+            canonical_xyz = rotation_3d_in_axis(
+                canonical_xyz.unsqueeze(0).transpose(0, 1),
+                -gt_bboxes_3d.yaw[assignment],
+                axis=2).squeeze(1)
+        distance_front = torch.clamp(
+            size_res_targets[:, 0] - canonical_xyz[:, 0], min=0)
+        distance_back = torch.clamp(
+            size_res_targets[:, 0] + canonical_xyz[:, 0], min=0)
+        distance_left = torch.clamp(
+            size_res_targets[:, 1] - canonical_xyz[:, 1], min=0)
+        distance_right = torch.clamp(
+            size_res_targets[:, 1] + canonical_xyz[:, 1], min=0)
+        distance_top = torch.clamp(
+            size_res_targets[:, 2] - canonical_xyz[:, 2], min=0)
+        distance_bottom = torch.clamp(
+            size_res_targets[:, 2] + canonical_xyz[:, 2], min=0)
+
+        centerness_l = torch.min(distance_front, distance_back) / torch.max(
+            distance_front, distance_back)
+        centerness_w = torch.min(distance_left, distance_right) / torch.max(
+            distance_left, distance_right)
+        centerness_h = torch.min(distance_bottom, distance_top) / torch.max(
+            distance_bottom, distance_top)
+        centerness_targets = torch.clamp(
+            centerness_l * centerness_w * centerness_h, min=0)
+        centerness_targets = centerness_targets.pow(1 / 3.0)
+        centerness_targets = torch.clamp(centerness_targets, min=0, max=1)
+
+        proposal_num = centerness_targets.shape[0]
+        one_hot_centerness_targets = centerness_targets.new_zeros(
+            (proposal_num, self.num_classes))
+        one_hot_centerness_targets.scatter_(1, mask_targets.unsqueeze(-1), 1)
+        centerness_targets = centerness_targets.unsqueeze(
+            1) * one_hot_centerness_targets
+
+        # Vote loss targets
+        enlarged_gt_bboxes_3d = gt_bboxes_3d.enlarged_box(
+            self.train_cfg.expand_dims_length)
+        enlarged_gt_bboxes_3d.tensor[:, 2] -= self.train_cfg.expand_dims_length
+        vote_mask, vote_assignment = self._assign_targets_by_points_inside(
+            enlarged_gt_bboxes_3d, seed_points)
+
+        vote_targets = gt_bboxes_3d.gravity_center
+        vote_targets = vote_targets[vote_assignment] - seed_points
+        vote_mask = vote_mask.max(1)[0] > 0
+
+        return (vote_targets, center_targets, size_res_targets,
+                dir_class_targets, dir_res_targets, mask_targets,
+                centerness_targets, corner3d_targets, vote_mask, positive_mask,
+                negative_mask)
+
+    def predict_by_feat(self, points: List[torch.Tensor],
+                        bbox_preds_dict: dict, batch_input_metas: List[dict],
+                        **kwargs) -> List[InstanceData]:
+        """Generate bboxes from vote head predictions.
+
+        Args:
+            points (List[torch.Tensor]): Input points of multiple samples.
+            bbox_preds_dict (dict): Predictions from vote head.
+            batch_input_metas (list[dict]): Each item
+                contains the meta information of each sample.
+
+        Returns:
+            list[:obj:`InstanceData`]: List of processed predictions. Each
+            InstanceData cantains 3d Bounding boxes and corresponding
+            scores and labels.
+        """
+        # decode boxes
+        sem_scores = F.sigmoid(bbox_preds_dict['obj_scores']).transpose(1, 2)
+        obj_scores = sem_scores.max(-1)[0]
+        bbox3d = self.bbox_coder.decode(bbox_preds_dict)
+        batch_size = bbox3d.shape[0]
+        points = torch.stack(points)
+        results_list = []
+        for b in range(batch_size):
+            temp_results = InstanceData()
+            bbox_selected, score_selected, labels = self.multiclass_nms_single(
+                obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3],
+                batch_input_metas[b])
+
+            bbox = batch_input_metas[b]['box_type_3d'](
+                bbox_selected.clone(),
+                box_dim=bbox_selected.shape[-1],
+                with_yaw=self.bbox_coder.with_rot)
+
+            temp_results.bboxes_3d = bbox
+            temp_results.scores_3d = score_selected
+            temp_results.labels_3d = labels
+            results_list.append(temp_results)
+
+        return results_list
+
+    def multiclass_nms_single(self, obj_scores: Tensor, sem_scores: Tensor,
+                              bbox: Tensor, points: Tensor,
+                              input_meta: dict) -> Tuple[Tensor]:
+        """Multi-class nms in single batch.
+
+        Args:
+            obj_scores (torch.Tensor): Objectness score of bounding boxes.
+            sem_scores (torch.Tensor): Semantic class score of bounding boxes.
+            bbox (torch.Tensor): Predicted bounding boxes.
+            points (torch.Tensor): Input points.
+            input_meta (dict): Point cloud and image's meta info.
+
+        Returns:
+            tuple[torch.Tensor]: Bounding boxes, scores and labels.
+        """
+        bbox = input_meta['box_type_3d'](
+            bbox.clone(),
+            box_dim=bbox.shape[-1],
+            with_yaw=self.bbox_coder.with_rot,
+            origin=(0.5, 0.5, 0.5))
+
+        if isinstance(bbox, (LiDARInstance3DBoxes, DepthInstance3DBoxes)):
+            box_indices = bbox.points_in_boxes_all(points)
+            nonempty_box_mask = box_indices.T.sum(1) >= 0
+        else:
+            raise NotImplementedError('Unsupported bbox type!')
+
+        corner3d = bbox.corners
+        minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
+        minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
+        minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
+
+        bbox_classes = torch.argmax(sem_scores, -1)
+        nms_keep = batched_nms(
+            minmax_box3d[nonempty_box_mask][:, [0, 1, 3, 4]],
+            obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask],
+            self.test_cfg.nms_cfg)[1]
+
+        if nms_keep.shape[0] > self.test_cfg.max_output_num:
+            nms_keep = nms_keep[:self.test_cfg.max_output_num]
+
+        # filter empty boxes and boxes with low score
+        scores_mask = (obj_scores >= self.test_cfg.score_thr)
+        nonempty_box_inds = torch.nonzero(
+            nonempty_box_mask, as_tuple=False).flatten()
+        nonempty_mask = torch.zeros_like(bbox_classes).scatter(
+            0, nonempty_box_inds[nms_keep], 1)
+        selected = (nonempty_mask.bool() & scores_mask.bool())
+
+        if self.test_cfg.per_class_proposal:
+            bbox_selected, score_selected, labels = [], [], []
+            for k in range(sem_scores.shape[-1]):
+                bbox_selected.append(bbox[selected].tensor)
+                score_selected.append(obj_scores[selected])
+                labels.append(
+                    torch.zeros_like(bbox_classes[selected]).fill_(k))
+            bbox_selected = torch.cat(bbox_selected, 0)
+            score_selected = torch.cat(score_selected, 0)
+            labels = torch.cat(labels, 0)
+        else:
+            bbox_selected = bbox[selected].tensor
+            score_selected = obj_scores[selected]
+            labels = bbox_classes[selected]
+
+        return bbox_selected, score_selected, labels
+
+    def _assign_targets_by_points_inside(self, bboxes_3d: BaseInstance3DBoxes,
+                                         points: Tensor) -> Tuple:
+        """Compute assignment by checking whether point is inside bbox.
+
+        Args:
+            bboxes_3d (BaseInstance3DBoxes): Instance of bounding boxes.
+            points (torch.Tensor): Points of a batch.
+
+        Returns:
+            tuple[torch.Tensor]: Flags indicating whether each point is
+                inside bbox and the index of box where each point are in.
+        """
+        if isinstance(bboxes_3d, (LiDARInstance3DBoxes, DepthInstance3DBoxes)):
+            points_mask = bboxes_3d.points_in_boxes_all(points)
+            assignment = points_mask.argmax(dim=-1)
+        else:
+            raise NotImplementedError('Unsupported bbox type!')
+
+        return points_mask, assignment
diff --git a/mmde/mmdet3d/models/dense_heads/train_mixins.py b/mmde/mmdet3d/models/dense_heads/train_mixins.py
new file mode 100644
index 0000000000000000000000000000000000000000..01c7eb79e53e5b17022f2350d703dcd83ab15025
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/train_mixins.py
@@ -0,0 +1,353 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmdet.models.utils import images_to_levels, multi_apply
+from mmengine.structures import InstanceData
+
+from mmdet3d.structures import limit_period
+
+
+class AnchorTrainMixin(object):
+    """Mixin class for target assigning of dense heads."""
+
+    def anchor_target_3d(self,
+                         anchor_list,
+                         batch_gt_instances_3d,
+                         batch_input_metas,
+                         batch_gt_instances_ignore=None,
+                         label_channels=1,
+                         num_classes=1,
+                         sampling=True):
+        """Compute regression and classification targets for anchors.
+
+        Args:
+            anchor_list (list[list]): Multi level anchors of each image.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Ground truth
+                bboxes of each image.
+            batch_input_metas (list[dict]): Meta info of each image.
+            batch_gt_instances_ignore (list): Ignore list of gt bboxes.
+            label_channels (int): The channel of labels.
+            num_classes (int): The number of classes.
+            sampling (bool): Whether to sample anchors.
+
+        Returns:
+            tuple (list, list, list, list, list, list, int, int):
+                Anchor targets, including labels, label weights,
+                bbox targets, bbox weights, direction targets,
+                direction weights, number of positive anchors and
+                number of negative anchors.
+        """
+        num_inputs = len(batch_input_metas)
+        assert len(anchor_list) == num_inputs
+
+        if isinstance(anchor_list[0][0], list):
+            # sizes of anchors are different
+            # anchor number of a single level
+            num_level_anchors = [
+                sum([anchor.size(0) for anchor in anchors])
+                for anchors in anchor_list[0]
+            ]
+            for i in range(num_inputs):
+                anchor_list[i] = anchor_list[i][0]
+        else:
+            # anchor number of multi levels
+            num_level_anchors = [
+                anchors.view(-1, self.box_code_size).size(0)
+                for anchors in anchor_list[0]
+            ]
+            # concat all level anchors and flags to a single tensor
+            for i in range(num_inputs):
+                anchor_list[i] = torch.cat(anchor_list[i])
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None for _ in range(num_inputs)]
+
+        (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights,
+         all_dir_targets, all_dir_weights, pos_inds_list,
+         neg_inds_list) = multi_apply(
+             self.anchor_target_3d_single,
+             anchor_list,
+             batch_gt_instances_3d,
+             batch_gt_instances_ignore,
+             batch_input_metas,
+             label_channels=label_channels,
+             num_classes=num_classes,
+             sampling=sampling)
+
+        # no valid anchors
+        if any([labels is None for labels in all_labels]):
+            return None
+        # sampled anchors of all images
+        num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
+        num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
+        # split targets to a list w.r.t. multiple levels
+        labels_list = images_to_levels(all_labels, num_level_anchors)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_anchors)
+        bbox_targets_list = images_to_levels(all_bbox_targets,
+                                             num_level_anchors)
+        bbox_weights_list = images_to_levels(all_bbox_weights,
+                                             num_level_anchors)
+        dir_targets_list = images_to_levels(all_dir_targets, num_level_anchors)
+        dir_weights_list = images_to_levels(all_dir_weights, num_level_anchors)
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, dir_targets_list, dir_weights_list,
+                num_total_pos, num_total_neg)
+
+    def anchor_target_3d_single(self,
+                                anchors,
+                                gt_instance_3d,
+                                gt_instance_ignore,
+                                input_meta,
+                                label_channels=1,
+                                num_classes=1,
+                                sampling=True):
+        """Compute targets of anchors in single batch.
+
+        Args:
+            anchors (torch.Tensor): Concatenated multi-level anchor.
+            gt_instance_3d (:obj:`InstanceData`): Gt bboxes.
+            gt_instance_ignore (:obj:`InstanceData`): Ignored gt bboxes.
+            input_meta (dict): Meta info of each image.
+            label_channels (int): The channel of labels.
+            num_classes (int): The number of classes.
+            sampling (bool): Whether to sample anchors.
+
+        Returns:
+            tuple[torch.Tensor]: Anchor targets.
+        """
+        if isinstance(self.bbox_assigner,
+                      list) and (not isinstance(anchors, list)):
+            feat_size = anchors.size(0) * anchors.size(1) * anchors.size(2)
+            rot_angles = anchors.size(-2)
+            assert len(self.bbox_assigner) == anchors.size(-3)
+            (total_labels, total_label_weights, total_bbox_targets,
+             total_bbox_weights, total_dir_targets, total_dir_weights,
+             total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], []
+            current_anchor_num = 0
+            for i, assigner in enumerate(self.bbox_assigner):
+                current_anchors = anchors[..., i, :, :].reshape(
+                    -1, self.box_code_size)
+                current_anchor_num += current_anchors.size(0)
+                if self.assign_per_class:
+                    gt_per_cls = (gt_instance_3d.labels_3d == i)
+                    gt_per_cls_instance = InstanceData()
+                    gt_per_cls_instance.labels_3d = gt_instance_3d.labels_3d[
+                        gt_per_cls]
+                    gt_per_cls_instance.bboxes_3d = gt_instance_3d.bboxes_3d[
+                        gt_per_cls, :]
+                    anchor_targets = self.anchor_target_single_assigner(
+                        assigner, current_anchors, gt_per_cls_instance,
+                        gt_instance_ignore, input_meta, num_classes, sampling)
+                else:
+                    anchor_targets = self.anchor_target_single_assigner(
+                        assigner, current_anchors, gt_instance_3d,
+                        gt_instance_ignore, input_meta, num_classes, sampling)
+
+                (labels, label_weights, bbox_targets, bbox_weights,
+                 dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets
+                total_labels.append(labels.reshape(feat_size, 1, rot_angles))
+                total_label_weights.append(
+                    label_weights.reshape(feat_size, 1, rot_angles))
+                total_bbox_targets.append(
+                    bbox_targets.reshape(feat_size, 1, rot_angles,
+                                         anchors.size(-1)))
+                total_bbox_weights.append(
+                    bbox_weights.reshape(feat_size, 1, rot_angles,
+                                         anchors.size(-1)))
+                total_dir_targets.append(
+                    dir_targets.reshape(feat_size, 1, rot_angles))
+                total_dir_weights.append(
+                    dir_weights.reshape(feat_size, 1, rot_angles))
+                total_pos_inds.append(pos_inds)
+                total_neg_inds.append(neg_inds)
+
+            total_labels = torch.cat(total_labels, dim=-2).reshape(-1)
+            total_label_weights = torch.cat(
+                total_label_weights, dim=-2).reshape(-1)
+            total_bbox_targets = torch.cat(
+                total_bbox_targets, dim=-3).reshape(-1, anchors.size(-1))
+            total_bbox_weights = torch.cat(
+                total_bbox_weights, dim=-3).reshape(-1, anchors.size(-1))
+            total_dir_targets = torch.cat(
+                total_dir_targets, dim=-2).reshape(-1)
+            total_dir_weights = torch.cat(
+                total_dir_weights, dim=-2).reshape(-1)
+            total_pos_inds = torch.cat(total_pos_inds, dim=0).reshape(-1)
+            total_neg_inds = torch.cat(total_neg_inds, dim=0).reshape(-1)
+            return (total_labels, total_label_weights, total_bbox_targets,
+                    total_bbox_weights, total_dir_targets, total_dir_weights,
+                    total_pos_inds, total_neg_inds)
+        elif isinstance(self.bbox_assigner, list) and isinstance(
+                anchors, list):
+            # class-aware anchors with different feature map sizes
+            assert len(self.bbox_assigner) == len(anchors), \
+                'The number of bbox assigners and anchors should be the same.'
+            (total_labels, total_label_weights, total_bbox_targets,
+             total_bbox_weights, total_dir_targets, total_dir_weights,
+             total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], []
+            current_anchor_num = 0
+            for i, assigner in enumerate(self.bbox_assigner):
+                current_anchors = anchors[i]
+                current_anchor_num += current_anchors.size(0)
+                if self.assign_per_class:
+                    gt_per_cls = (gt_instance_3d.labels_3d == i)
+                    gt_per_cls_instance = InstanceData()
+                    gt_per_cls_instance.labels_3d = gt_instance_3d.labels_3d[
+                        gt_per_cls]
+                    gt_per_cls_instance.bboxes_3d = gt_instance_3d.bboxes_3d[
+                        gt_per_cls, :]
+                    anchor_targets = self.anchor_target_single_assigner(
+                        assigner, current_anchors, gt_per_cls_instance,
+                        gt_instance_ignore, input_meta, num_classes, sampling)
+                else:
+                    anchor_targets = self.anchor_target_single_assigner(
+                        assigner, current_anchors, gt_instance_3d,
+                        gt_instance_ignore, input_meta, num_classes, sampling)
+
+                (labels, label_weights, bbox_targets, bbox_weights,
+                 dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets
+                total_labels.append(labels)
+                total_label_weights.append(label_weights)
+                total_bbox_targets.append(
+                    bbox_targets.reshape(-1, anchors[i].size(-1)))
+                total_bbox_weights.append(
+                    bbox_weights.reshape(-1, anchors[i].size(-1)))
+                total_dir_targets.append(dir_targets)
+                total_dir_weights.append(dir_weights)
+                total_pos_inds.append(pos_inds)
+                total_neg_inds.append(neg_inds)
+
+            total_labels = torch.cat(total_labels, dim=0)
+            total_label_weights = torch.cat(total_label_weights, dim=0)
+            total_bbox_targets = torch.cat(total_bbox_targets, dim=0)
+            total_bbox_weights = torch.cat(total_bbox_weights, dim=0)
+            total_dir_targets = torch.cat(total_dir_targets, dim=0)
+            total_dir_weights = torch.cat(total_dir_weights, dim=0)
+            total_pos_inds = torch.cat(total_pos_inds, dim=0)
+            total_neg_inds = torch.cat(total_neg_inds, dim=0)
+            return (total_labels, total_label_weights, total_bbox_targets,
+                    total_bbox_weights, total_dir_targets, total_dir_weights,
+                    total_pos_inds, total_neg_inds)
+        else:
+            return self.anchor_target_single_assigner(self.bbox_assigner,
+                                                      anchors, gt_instance_3d,
+                                                      gt_instance_ignore,
+                                                      input_meta, num_classes,
+                                                      sampling)
+
+    def anchor_target_single_assigner(self,
+                                      bbox_assigner,
+                                      anchors,
+                                      gt_instance_3d,
+                                      gt_instance_ignore,
+                                      input_meta,
+                                      num_classes=1,
+                                      sampling=True):
+        """Assign anchors and encode positive anchors.
+
+        Args:
+            bbox_assigner (BaseAssigner): assign positive and negative boxes.
+            anchors (torch.Tensor): Concatenated multi-level anchor.
+            gt_instance_3d (:obj:`InstanceData`): Gt bboxes.
+            gt_instance_ignore (torch.Tensor): Ignored gt bboxes.
+            input_meta (dict): Meta info of each image.
+            num_classes (int): The number of classes.
+            sampling (bool): Whether to sample anchors.
+
+        Returns:
+            tuple[torch.Tensor]: Anchor targets.
+        """
+        anchors = anchors.reshape(-1, anchors.size(-1))
+        num_valid_anchors = anchors.shape[0]
+        bbox_targets = torch.zeros_like(anchors)
+        bbox_weights = torch.zeros_like(anchors)
+        dir_targets = anchors.new_zeros((anchors.shape[0]), dtype=torch.long)
+        dir_weights = anchors.new_zeros((anchors.shape[0]), dtype=torch.float)
+        labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long)
+        label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
+        if len(gt_instance_3d.bboxes_3d) > 0:
+            if not isinstance(gt_instance_3d.bboxes_3d, torch.Tensor):
+                gt_instance_3d.bboxes_3d = gt_instance_3d.bboxes_3d.tensor.to(
+                    anchors.device)
+            pred_instance_3d = InstanceData(priors=anchors)
+            assign_result = bbox_assigner.assign(pred_instance_3d,
+                                                 gt_instance_3d,
+                                                 gt_instance_ignore)
+            sampling_result = self.bbox_sampler.sample(assign_result,
+                                                       pred_instance_3d,
+                                                       gt_instance_3d)
+            pos_inds = sampling_result.pos_inds
+            neg_inds = sampling_result.neg_inds
+        else:
+            pos_inds = torch.nonzero(
+                anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) > 0,
+                as_tuple=False).squeeze(-1).unique()
+            neg_inds = torch.nonzero(
+                anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) == 0,
+                as_tuple=False).squeeze(-1).unique()
+
+        if gt_instance_3d.labels_3d is not None:
+            labels += num_classes
+        if len(pos_inds) > 0:
+            pos_bbox_targets = self.bbox_coder.encode(
+                sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+            pos_dir_targets = get_direction_target(
+                sampling_result.pos_bboxes,
+                pos_bbox_targets,
+                self.dir_offset,
+                self.dir_limit_offset,
+                one_hot=False)
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+            dir_targets[pos_inds] = pos_dir_targets
+            dir_weights[pos_inds] = 1.0
+
+            if gt_instance_3d.labels_3d is None:
+                labels[pos_inds] = 1
+            else:
+                labels[pos_inds] = gt_instance_3d.labels_3d[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+        return (labels, label_weights, bbox_targets, bbox_weights, dir_targets,
+                dir_weights, pos_inds, neg_inds)
+
+
+def get_direction_target(anchors,
+                         reg_targets,
+                         dir_offset=0,
+                         dir_limit_offset=0,
+                         num_bins=2,
+                         one_hot=True):
+    """Encode direction to 0 ~ num_bins-1.
+
+    Args:
+        anchors (torch.Tensor): Concatenated multi-level anchor.
+        reg_targets (torch.Tensor): Bbox regression targets.
+        dir_offset (int): Direction offset.
+        num_bins (int): Number of bins to divide 2*PI.
+        one_hot (bool): Whether to encode as one hot.
+
+    Returns:
+        torch.Tensor: Encoded direction targets.
+    """
+    rot_gt = reg_targets[..., 6] + anchors[..., 6]
+    offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, 2 * np.pi)
+    dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long()
+    dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)
+    if one_hot:
+        dir_targets = torch.zeros(
+            *list(dir_cls_targets.shape),
+            num_bins,
+            dtype=anchors.dtype,
+            device=dir_cls_targets.device)
+        dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0)
+        dir_cls_targets = dir_targets
+    return dir_cls_targets
diff --git a/mmde/mmdet3d/models/dense_heads/vote_head.py b/mmde/mmdet3d/models/dense_heads/vote_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..853caae31398f08c0082d454ea4e4cc03d944f1c
--- /dev/null
+++ b/mmde/mmdet3d/models/dense_heads/vote_head.py
@@ -0,0 +1,837 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from mmcv.ops import furthest_point_sample
+from mmdet.models.utils import multi_apply
+from mmengine import ConfigDict
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.models.layers import VoteModule, aligned_3d_nms, build_sa_module
+from mmdet3d.models.losses import chamfer_distance
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures import Det3DDataSample
+from .base_conv_bbox_head import BaseConvBboxHead
+
+
+@MODELS.register_module()
+class VoteHead(BaseModule):
+    r"""Bbox head of `Votenet <https://arxiv.org/abs/1904.09664>`_.
+
+    Args:
+        num_classes (int): The number of class.
+        bbox_coder (ConfigDict, dict): Bbox coder for encoding and
+            decoding boxes. Defaults to None.
+        train_cfg (dict, optional): Config for training. Defaults to None.
+        test_cfg (dict, optional): Config for testing. Defaults to None.
+        vote_module_cfg (dict, optional): Config of VoteModule for
+            point-wise votes. Defaults to None.
+        vote_aggregation_cfg (dict, optional): Config of vote
+            aggregation layer. Defaults to None.
+        pred_layer_cfg (dict, optional): Config of classification
+            and regression prediction layers. Defaults to None.
+        objectness_loss (dict, optional): Config of objectness loss.
+            Defaults to None.
+        center_loss (dict, optional): Config of center loss.
+            Defaults to None.
+        dir_class_loss (dict, optional): Config of direction
+            classification loss. Defaults to None.
+        dir_res_loss (dict, optional): Config of direction
+            residual regression loss. Defaults to None.
+        size_class_loss (dict, optional): Config of size
+            classification loss. Defaults to None.
+        size_res_loss (dict, optional): Config of size
+            residual regression loss. Defaults to None.
+        semantic_loss (dict, optional): Config of point-wise
+            semantic segmentation loss. Defaults to None.
+        iou_loss (dict, optional): Config of IOU loss for
+            regression. Defaults to None.
+        init_cfg (dict, optional): Config of model weight
+            initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 bbox_coder: Union[ConfigDict, dict],
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 vote_module_cfg: Optional[dict] = None,
+                 vote_aggregation_cfg: Optional[dict] = None,
+                 pred_layer_cfg: Optional[dict] = None,
+                 objectness_loss: Optional[dict] = None,
+                 center_loss: Optional[dict] = None,
+                 dir_class_loss: Optional[dict] = None,
+                 dir_res_loss: Optional[dict] = None,
+                 size_class_loss: Optional[dict] = None,
+                 size_res_loss: Optional[dict] = None,
+                 semantic_loss: Optional[dict] = None,
+                 iou_loss: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None):
+        super(VoteHead, self).__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        self.gt_per_seed = vote_module_cfg['gt_per_seed']
+        self.num_proposal = vote_aggregation_cfg['num_point']
+
+        self.loss_objectness = MODELS.build(objectness_loss)
+        self.loss_center = MODELS.build(center_loss)
+        self.loss_dir_res = MODELS.build(dir_res_loss)
+        self.loss_dir_class = MODELS.build(dir_class_loss)
+        self.loss_size_res = MODELS.build(size_res_loss)
+        if size_class_loss is not None:
+            self.size_class_loss = MODELS.build(size_class_loss)
+        if semantic_loss is not None:
+            self.semantic_loss = MODELS.build(semantic_loss)
+        if iou_loss is not None:
+            self.iou_loss = MODELS.build(iou_loss)
+        else:
+            self.iou_loss = None
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.num_sizes = self.bbox_coder.num_sizes
+        self.num_dir_bins = self.bbox_coder.num_dir_bins
+
+        self.vote_module = VoteModule(**vote_module_cfg)
+        self.vote_aggregation = build_sa_module(vote_aggregation_cfg)
+
+        # Bbox classification and regression
+        self.conv_pred = BaseConvBboxHead(
+            **pred_layer_cfg,
+            num_cls_out_channels=self._get_cls_out_channels(),
+            num_reg_out_channels=self._get_reg_out_channels())
+
+    @property
+    def sample_mode(self):
+        if self.training:
+            sample_mode = self.train_cfg.sample_mode
+        else:
+            sample_mode = self.test_cfg.sample_mode
+        assert sample_mode in ['vote', 'seed', 'random', 'spec']
+        return sample_mode
+
+    def _get_cls_out_channels(self):
+        """Return the channel number of classification outputs."""
+        # Class numbers (k) + objectness (2)
+        return self.num_classes + 2
+
+    def _get_reg_out_channels(self):
+        """Return the channel number of regression outputs."""
+        # Objectness scores (2), center residual (3),
+        # heading class+residual (num_dir_bins*2),
+        # size class+residual(num_sizes*4)
+        return 3 + self.num_dir_bins * 2 + self.num_sizes * 4
+
+    def _extract_input(self, feat_dict: dict) -> tuple:
+        """Extract inputs from features dictionary.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            tuple[Tensor]: Arrage as following three tensor.
+
+                - Coordinates of input points.
+                - Features of input points.
+                - Indices of input points.
+        """
+
+        # for imvotenet
+        if 'seed_points' in feat_dict and \
+           'seed_features' in feat_dict and \
+           'seed_indices' in feat_dict:
+            seed_points = feat_dict['seed_points']
+            seed_features = feat_dict['seed_features']
+            seed_indices = feat_dict['seed_indices']
+        # for votenet
+        else:
+            seed_points = feat_dict['fp_xyz'][-1]
+            seed_features = feat_dict['fp_features'][-1]
+            seed_indices = feat_dict['fp_indices'][-1]
+
+        return seed_points, seed_features, seed_indices
+
+    def predict(self,
+                points: List[torch.Tensor],
+                feats_dict: Dict[str, torch.Tensor],
+                batch_data_samples: List[Det3DDataSample],
+                use_nms: bool = True,
+                **kwargs) -> List[InstanceData]:
+        """
+        Args:
+            points (list[tensor]): Point clouds of multiple samples.
+            feats_dict (dict): Features from FPN or backbone..
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes meta information of data.
+            use_nms (bool): Whether do the nms for predictions.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: List of processed predictions. Each
+            InstanceData contains 3d Bounding boxes and corresponding
+            scores and labels.
+        """
+        preds_dict = self(feats_dict)
+        # `preds_dict` can be used in H3DNET
+        feats_dict.update(preds_dict)
+
+        batch_size = len(batch_data_samples)
+        batch_input_metas = []
+        for batch_index in range(batch_size):
+            metainfo = batch_data_samples[batch_index].metainfo
+            batch_input_metas.append(metainfo)
+
+        results_list = self.predict_by_feat(
+            points, preds_dict, batch_input_metas, use_nms=use_nms, **kwargs)
+        return results_list
+
+    def loss_and_predict(self,
+                         points: List[torch.Tensor],
+                         feats_dict: Dict[str, torch.Tensor],
+                         batch_data_samples: List[Det3DDataSample],
+                         ret_target: bool = False,
+                         proposal_cfg: dict = None,
+                         **kwargs) -> Tuple:
+        """
+        Args:
+            points (list[tensor]): Points cloud of multiple samples.
+            feats_dict (dict): Predictions from backbone or FPN.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each sample and
+                corresponding annotations.
+            ret_target (bool): Whether return the assigned target.
+                Defaults to False.
+            proposal_cfg (dict): Configure for proposal process.
+                Defaults to True.
+
+        Returns:
+            tuple:  Contains loss and predictions after post-process.
+        """
+        preds_dict = self.forward(feats_dict)
+        feats_dict.update(preds_dict)
+        batch_gt_instance_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        batch_pts_semantic_mask = []
+        batch_pts_instance_mask = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instance_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+            batch_pts_semantic_mask.append(
+                data_sample.gt_pts_seg.get('pts_semantic_mask', None))
+            batch_pts_instance_mask.append(
+                data_sample.gt_pts_seg.get('pts_instance_mask', None))
+
+        loss_inputs = (points, preds_dict, batch_gt_instance_3d)
+        losses = self.loss_by_feat(
+            *loss_inputs,
+            batch_pts_semantic_mask=batch_pts_semantic_mask,
+            batch_pts_instance_mask=batch_pts_instance_mask,
+            batch_input_metas=batch_input_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            ret_target=ret_target,
+            **kwargs)
+
+        results_list = self.predict_by_feat(
+            points,
+            preds_dict,
+            batch_input_metas,
+            use_nms=proposal_cfg.use_nms,
+            **kwargs)
+
+        return losses, results_list
+
+    def loss(self,
+             points: List[torch.Tensor],
+             feats_dict: Dict[str, torch.Tensor],
+             batch_data_samples: List[Det3DDataSample],
+             ret_target: bool = False,
+             **kwargs) -> dict:
+        """
+        Args:
+            points (list[tensor]): Points cloud of multiple samples.
+            feats_dict (dict): Predictions from backbone or FPN.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each sample and
+                corresponding annotations.
+            ret_target (bool): Whether return the assigned target.
+                Defaults to False.
+
+        Returns:
+            dict:  A dictionary of loss components.
+        """
+        preds_dict = self.forward(feats_dict)
+        batch_gt_instance_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        batch_pts_semantic_mask = []
+        batch_pts_instance_mask = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instance_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+            batch_pts_semantic_mask.append(
+                data_sample.gt_pts_seg.get('pts_semantic_mask', None))
+            batch_pts_instance_mask.append(
+                data_sample.gt_pts_seg.get('pts_instance_mask', None))
+
+        loss_inputs = (points, preds_dict, batch_gt_instance_3d)
+        losses = self.loss_by_feat(
+            *loss_inputs,
+            batch_pts_semantic_mask=batch_pts_semantic_mask,
+            batch_pts_instance_mask=batch_pts_instance_mask,
+            batch_input_metas=batch_input_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            ret_target=ret_target,
+            **kwargs)
+        return losses
+
+    def forward(self, feat_dict: dict) -> dict:
+        """Forward pass.
+
+        Note:
+            The forward of VoteHead is divided into 4 steps:
+
+                1. Generate vote_points from seed_points.
+                2. Aggregate vote_points.
+                3. Predict bbox and score.
+                4. Decode predictions.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            dict: Predictions of vote head.
+        """
+
+        seed_points, seed_features, seed_indices = self._extract_input(
+            feat_dict)
+
+        # 1. generate vote_points from seed_points
+        vote_points, vote_features, vote_offset = self.vote_module(
+            seed_points, seed_features)
+        results = dict(
+            seed_points=seed_points,
+            seed_indices=seed_indices,
+            vote_points=vote_points,
+            vote_features=vote_features,
+            vote_offset=vote_offset)
+
+        # 2. aggregate vote_points
+        if self.sample_mode == 'vote':
+            # use fps in vote_aggregation
+            aggregation_inputs = dict(
+                points_xyz=vote_points, features=vote_features)
+        elif self.sample_mode == 'seed':
+            # FPS on seed and choose the votes corresponding to the seeds
+            sample_indices = furthest_point_sample(seed_points,
+                                                   self.num_proposal)
+            aggregation_inputs = dict(
+                points_xyz=vote_points,
+                features=vote_features,
+                indices=sample_indices)
+        elif self.sample_mode == 'random':
+            # Random sampling from the votes
+            batch_size, num_seed = seed_points.shape[:2]
+            sample_indices = seed_points.new_tensor(
+                torch.randint(0, num_seed, (batch_size, self.num_proposal)),
+                dtype=torch.int32)
+            aggregation_inputs = dict(
+                points_xyz=vote_points,
+                features=vote_features,
+                indices=sample_indices)
+        elif self.sample_mode == 'spec':
+            # Specify the new center in vote_aggregation
+            aggregation_inputs = dict(
+                points_xyz=seed_points,
+                features=seed_features,
+                target_xyz=vote_points)
+        else:
+            raise NotImplementedError(
+                f'Sample mode {self.sample_mode} is not supported!')
+
+        vote_aggregation_ret = self.vote_aggregation(**aggregation_inputs)
+        aggregated_points, features, aggregated_indices = vote_aggregation_ret
+
+        results['aggregated_points'] = aggregated_points
+        results['aggregated_features'] = features
+        results['aggregated_indices'] = aggregated_indices
+
+        # 3. predict bbox and score
+        cls_predictions, reg_predictions = self.conv_pred(features)
+
+        # 4. decode predictions
+        decode_res = self.bbox_coder.split_pred(cls_predictions,
+                                                reg_predictions,
+                                                aggregated_points)
+        results.update(decode_res)
+        return results
+
+    def loss_by_feat(
+            self,
+            points: List[torch.Tensor],
+            bbox_preds_dict: dict,
+            batch_gt_instances_3d: List[InstanceData],
+            batch_pts_semantic_mask: Optional[List[torch.Tensor]] = None,
+            batch_pts_instance_mask: Optional[List[torch.Tensor]] = None,
+            ret_target: bool = False,
+            **kwargs) -> dict:
+        """Compute loss.
+
+        Args:
+            points (list[torch.Tensor]): Input points.
+            bbox_preds_dict (dict): Predictions from forward of vote head.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_pts_semantic_mask (list[tensor]): Semantic mask
+                of points cloud. Defaults to None.
+            batch_pts_semantic_mask (list[tensor]): Instance mask
+                of points cloud. Defaults to None.
+            batch_input_metas (list[dict]): Contain pcd and img's meta info.
+            ret_target (bool): Return targets or not. Defaults to False.
+
+        Returns:
+            dict: Losses of Votenet.
+        """
+
+        targets = self.get_targets(points, bbox_preds_dict,
+                                   batch_gt_instances_3d,
+                                   batch_pts_semantic_mask,
+                                   batch_pts_instance_mask)
+        (vote_targets, vote_target_masks, size_class_targets, size_res_targets,
+         dir_class_targets, dir_res_targets, center_targets,
+         assigned_center_targets, mask_targets, valid_gt_masks,
+         objectness_targets, objectness_weights, box_loss_weights,
+         valid_gt_weights) = targets
+
+        # calculate vote loss
+        vote_loss = self.vote_module.get_loss(bbox_preds_dict['seed_points'],
+                                              bbox_preds_dict['vote_points'],
+                                              bbox_preds_dict['seed_indices'],
+                                              vote_target_masks, vote_targets)
+
+        # calculate objectness loss
+        objectness_loss = self.loss_objectness(
+            bbox_preds_dict['obj_scores'].transpose(2, 1),
+            objectness_targets,
+            weight=objectness_weights)
+
+        # calculate center loss
+        source2target_loss, target2source_loss = self.loss_center(
+            bbox_preds_dict['center'],
+            center_targets,
+            src_weight=box_loss_weights,
+            dst_weight=valid_gt_weights)
+        center_loss = source2target_loss + target2source_loss
+
+        # calculate direction class loss
+        dir_class_loss = self.loss_dir_class(
+            bbox_preds_dict['dir_class'].transpose(2, 1),
+            dir_class_targets,
+            weight=box_loss_weights)
+
+        # calculate direction residual loss
+        batch_size, proposal_num = size_class_targets.shape[:2]
+        heading_label_one_hot = vote_targets.new_zeros(
+            (batch_size, proposal_num, self.num_dir_bins))
+        heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1)
+        dir_res_norm = torch.sum(
+            bbox_preds_dict['dir_res_norm'] * heading_label_one_hot, -1)
+        dir_res_loss = self.loss_dir_res(
+            dir_res_norm, dir_res_targets, weight=box_loss_weights)
+
+        # calculate size class loss
+        size_class_loss = self.size_class_loss(
+            bbox_preds_dict['size_class'].transpose(2, 1),
+            size_class_targets,
+            weight=box_loss_weights)
+
+        # calculate size residual loss
+        one_hot_size_targets = vote_targets.new_zeros(
+            (batch_size, proposal_num, self.num_sizes))
+        one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1)
+        one_hot_size_targets_expand = one_hot_size_targets.unsqueeze(
+            -1).repeat(1, 1, 1, 3).contiguous()
+        size_residual_norm = torch.sum(
+            bbox_preds_dict['size_res_norm'] * one_hot_size_targets_expand, 2)
+        box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat(
+            1, 1, 3)
+        size_res_loss = self.loss_size_res(
+            size_residual_norm,
+            size_res_targets,
+            weight=box_loss_weights_expand)
+
+        # calculate semantic loss
+        semantic_loss = self.semantic_loss(
+            bbox_preds_dict['sem_scores'].transpose(2, 1),
+            mask_targets,
+            weight=box_loss_weights)
+
+        losses = dict(
+            vote_loss=vote_loss,
+            objectness_loss=objectness_loss,
+            semantic_loss=semantic_loss,
+            center_loss=center_loss,
+            dir_class_loss=dir_class_loss,
+            dir_res_loss=dir_res_loss,
+            size_class_loss=size_class_loss,
+            size_res_loss=size_res_loss)
+
+        if self.iou_loss:
+            corners_pred = self.bbox_coder.decode_corners(
+                bbox_preds_dict['center'], size_residual_norm,
+                one_hot_size_targets_expand)
+            corners_target = self.bbox_coder.decode_corners(
+                assigned_center_targets, size_res_targets,
+                one_hot_size_targets_expand)
+            iou_loss = self.iou_loss(
+                corners_pred, corners_target, weight=box_loss_weights)
+            losses['iou_loss'] = iou_loss
+
+        if ret_target:
+            losses['targets'] = targets
+
+        return losses
+
+    def get_targets(
+        self,
+        points,
+        bbox_preds: dict = None,
+        batch_gt_instances_3d: List[InstanceData] = None,
+        batch_pts_semantic_mask: List[torch.Tensor] = None,
+        batch_pts_instance_mask: List[torch.Tensor] = None,
+    ):
+        """Generate targets of vote head.
+
+        Args:
+            points (list[torch.Tensor]): Points of each batch.
+            bbox_preds (torch.Tensor): Bounding box predictions of vote head.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_pts_semantic_mask (list[tensor]): Semantic gt mask for
+                point clouds. Defaults to None.
+            batch_pts_instance_mask (list[tensor]): Instance gt mask for
+                point clouds. Defaults to None.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of vote head.
+        """
+        # find empty example
+        valid_gt_masks = list()
+        gt_num = list()
+        batch_gt_labels_3d = [
+            gt_instances_3d.labels_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        batch_gt_bboxes_3d = [
+            gt_instances_3d.bboxes_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        for index in range(len(batch_gt_labels_3d)):
+            if len(batch_gt_labels_3d[index]) == 0:
+                fake_box = batch_gt_bboxes_3d[index].tensor.new_zeros(
+                    1, batch_gt_bboxes_3d[index].tensor.shape[-1])
+                batch_gt_bboxes_3d[index] = batch_gt_bboxes_3d[index].new_box(
+                    fake_box)
+                batch_gt_labels_3d[index] = batch_gt_labels_3d[
+                    index].new_zeros(1)
+                valid_gt_masks.append(batch_gt_labels_3d[index].new_zeros(1))
+                gt_num.append(1)
+            else:
+                valid_gt_masks.append(batch_gt_labels_3d[index].new_ones(
+                    batch_gt_labels_3d[index].shape))
+                gt_num.append(batch_gt_labels_3d[index].shape[0])
+        max_gt_num = max(gt_num)
+
+        aggregated_points = [
+            bbox_preds['aggregated_points'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        (vote_targets, vote_target_masks, size_class_targets, size_res_targets,
+         dir_class_targets, dir_res_targets, center_targets,
+         assigned_center_targets, mask_targets,
+         objectness_targets, objectness_masks) = multi_apply(
+             self._get_targets_single, points, batch_gt_bboxes_3d,
+             batch_gt_labels_3d, batch_pts_semantic_mask,
+             batch_pts_instance_mask, aggregated_points)
+
+        # pad targets as original code of votenet.
+        for index in range(len(batch_gt_labels_3d)):
+            pad_num = max_gt_num - batch_gt_labels_3d[index].shape[0]
+            center_targets[index] = F.pad(center_targets[index],
+                                          (0, 0, 0, pad_num))
+            valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num))
+
+        vote_targets = torch.stack(vote_targets)
+        vote_target_masks = torch.stack(vote_target_masks)
+        center_targets = torch.stack(center_targets)
+        valid_gt_masks = torch.stack(valid_gt_masks)
+
+        assigned_center_targets = torch.stack(assigned_center_targets)
+        objectness_targets = torch.stack(objectness_targets)
+        objectness_weights = torch.stack(objectness_masks)
+        objectness_weights /= (torch.sum(objectness_weights) + 1e-6)
+        box_loss_weights = objectness_targets.float() / (
+            torch.sum(objectness_targets).float() + 1e-6)
+        valid_gt_weights = valid_gt_masks.float() / (
+            torch.sum(valid_gt_masks.float()) + 1e-6)
+        dir_class_targets = torch.stack(dir_class_targets)
+        dir_res_targets = torch.stack(dir_res_targets)
+        size_class_targets = torch.stack(size_class_targets)
+        size_res_targets = torch.stack(size_res_targets)
+        mask_targets = torch.stack(mask_targets)
+
+        return (vote_targets, vote_target_masks, size_class_targets,
+                size_res_targets, dir_class_targets, dir_res_targets,
+                center_targets, assigned_center_targets, mask_targets,
+                valid_gt_masks, objectness_targets, objectness_weights,
+                box_loss_weights, valid_gt_weights)
+
+    def _get_targets_single(self,
+                            points,
+                            gt_bboxes_3d,
+                            gt_labels_3d,
+                            pts_semantic_mask=None,
+                            pts_instance_mask=None,
+                            aggregated_points=None):
+        """Generate targets of vote head for single batch.
+
+        Args:
+            points (torch.Tensor): Points of each batch.
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
+                boxes of each batch.
+            gt_labels_3d (torch.Tensor): Labels of each batch.
+            pts_semantic_mask (torch.Tensor): Point-wise semantic
+                label of each batch.
+            pts_instance_mask (torch.Tensor): Point-wise instance
+                label of each batch.
+            aggregated_points (torch.Tensor): Aggregated points from
+                vote aggregation layer.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of vote head.
+        """
+        assert self.bbox_coder.with_rot or pts_semantic_mask is not None
+
+        gt_bboxes_3d = gt_bboxes_3d.to(points.device)
+
+        # generate votes target
+        num_points = points.shape[0]
+        if self.bbox_coder.with_rot:
+            vote_targets = points.new_zeros([num_points, 3 * self.gt_per_seed])
+            vote_target_masks = points.new_zeros([num_points],
+                                                 dtype=torch.long)
+            vote_target_idx = points.new_zeros([num_points], dtype=torch.long)
+            box_indices_all = gt_bboxes_3d.points_in_boxes_all(points)
+            for i in range(gt_labels_3d.shape[0]):
+                box_indices = box_indices_all[:, i]
+                indices = torch.nonzero(
+                    box_indices, as_tuple=False).squeeze(-1)
+                selected_points = points[indices]
+                vote_target_masks[indices] = 1
+                vote_targets_tmp = vote_targets[indices]
+                votes = gt_bboxes_3d.gravity_center[i].unsqueeze(
+                    0) - selected_points[:, :3]
+
+                for j in range(self.gt_per_seed):
+                    column_indices = torch.nonzero(
+                        vote_target_idx[indices] == j,
+                        as_tuple=False).squeeze(-1)
+                    vote_targets_tmp[column_indices,
+                                     int(j * 3):int(j * 3 +
+                                                    3)] = votes[column_indices]
+                    if j == 0:
+                        vote_targets_tmp[column_indices] = votes[
+                            column_indices].repeat(1, self.gt_per_seed)
+
+                vote_targets[indices] = vote_targets_tmp
+                vote_target_idx[indices] = torch.clamp(
+                    vote_target_idx[indices] + 1, max=2)
+        elif pts_semantic_mask is not None:
+            vote_targets = points.new_zeros([num_points, 3])
+            vote_target_masks = points.new_zeros([num_points],
+                                                 dtype=torch.long)
+            for i in torch.unique(pts_instance_mask):
+                indices = torch.nonzero(
+                    pts_instance_mask == i, as_tuple=False).squeeze(-1)
+                if pts_semantic_mask[indices[0]] < self.num_classes:
+                    selected_points = points[indices, :3]
+                    center = 0.5 * (
+                        selected_points.min(0)[0] + selected_points.max(0)[0])
+                    vote_targets[indices, :] = center - selected_points
+                    vote_target_masks[indices] = 1
+            vote_targets = vote_targets.repeat((1, self.gt_per_seed))
+        else:
+            raise NotImplementedError
+
+        (center_targets, size_class_targets, size_res_targets,
+         dir_class_targets,
+         dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d)
+
+        proposal_num = aggregated_points.shape[0]
+        distance1, _, assignment, _ = chamfer_distance(
+            aggregated_points.unsqueeze(0),
+            center_targets.unsqueeze(0),
+            reduction='none')
+        assignment = assignment.squeeze(0)
+        euclidean_distance1 = torch.sqrt(distance1.squeeze(0) + 1e-6)
+
+        objectness_targets = points.new_zeros((proposal_num), dtype=torch.long)
+        objectness_targets[
+            euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1
+
+        objectness_masks = points.new_zeros((proposal_num))
+        objectness_masks[
+            euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1.0
+        objectness_masks[
+            euclidean_distance1 > self.train_cfg['neg_distance_thr']] = 1.0
+
+        dir_class_targets = dir_class_targets[assignment]
+        dir_res_targets = dir_res_targets[assignment]
+        dir_res_targets /= (np.pi / self.num_dir_bins)
+        size_class_targets = size_class_targets[assignment]
+        size_res_targets = size_res_targets[assignment]
+
+        one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros(
+            (proposal_num, self.num_sizes))
+        one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1)
+        one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).repeat(
+            1, 1, 3)
+        mean_sizes = size_res_targets.new_tensor(
+            self.bbox_coder.mean_sizes).unsqueeze(0)
+        pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1)
+        size_res_targets /= pos_mean_sizes
+
+        mask_targets = gt_labels_3d[assignment]
+        assigned_center_targets = center_targets[assignment]
+
+        return (vote_targets, vote_target_masks, size_class_targets,
+                size_res_targets, dir_class_targets,
+                dir_res_targets, center_targets, assigned_center_targets,
+                mask_targets.long(), objectness_targets, objectness_masks)
+
+    def predict_by_feat(self,
+                        points: List[torch.Tensor],
+                        bbox_preds_dict: dict,
+                        batch_input_metas: List[dict],
+                        use_nms: bool = True,
+                        **kwargs) -> List[InstanceData]:
+        """Generate bboxes from vote head predictions.
+
+        Args:
+            points (List[torch.Tensor]): Input points of multiple samples.
+            bbox_preds_dict (dict): Predictions from vote head.
+            batch_input_metas (list[dict]): Each item
+                contains the meta information of each sample.
+            use_nms (bool): Whether to apply NMS, skip nms postprocessing
+                while using vote head in rpn stage.
+
+        Returns:
+            list[:obj:`InstanceData`] or Tensor: Return list of processed
+            predictions when `use_nms` is True. Each InstanceData cantains
+            3d Bounding boxes and corresponding scores and labels.
+            Return raw bboxes when `use_nms` is False.
+        """
+        # decode boxes
+        stack_points = torch.stack(points)
+        obj_scores = F.softmax(bbox_preds_dict['obj_scores'], dim=-1)[..., -1]
+        sem_scores = F.softmax(bbox_preds_dict['sem_scores'], dim=-1)
+        bbox3d = self.bbox_coder.decode(bbox_preds_dict)
+
+        batch_size = bbox3d.shape[0]
+        results_list = list()
+        if use_nms:
+            for batch_index in range(batch_size):
+                temp_results = InstanceData()
+                bbox_selected, score_selected, labels = \
+                    self.multiclass_nms_single(
+                        obj_scores[batch_index],
+                        sem_scores[batch_index],
+                        bbox3d[batch_index],
+                        stack_points[batch_index, ..., :3],
+                        batch_input_metas[batch_index])
+                bbox = batch_input_metas[batch_index]['box_type_3d'](
+                    bbox_selected,
+                    box_dim=bbox_selected.shape[-1],
+                    with_yaw=self.bbox_coder.with_rot)
+                temp_results.bboxes_3d = bbox
+                temp_results.scores_3d = score_selected
+                temp_results.labels_3d = labels
+                results_list.append(temp_results)
+
+            return results_list
+        else:
+            # TODO unify it when refactor the Augtest
+            return bbox3d
+
+    def multiclass_nms_single(self, obj_scores: Tensor, sem_scores: Tensor,
+                              bbox: Tensor, points: Tensor,
+                              input_meta: dict) -> Tuple:
+        """Multi-class nms in single batch.
+
+        Args:
+            obj_scores (torch.Tensor): Objectness score of bounding boxes.
+            sem_scores (torch.Tensor): semantic class score of bounding boxes.
+            bbox (torch.Tensor): Predicted bounding boxes.
+            points (torch.Tensor): Input points.
+            input_meta (dict): Point cloud and image's meta info.
+
+        Returns:
+            tuple[torch.Tensor]: Bounding boxes, scores and labels.
+        """
+        bbox = input_meta['box_type_3d'](
+            bbox,
+            box_dim=bbox.shape[-1],
+            with_yaw=self.bbox_coder.with_rot,
+            origin=(0.5, 0.5, 0.5))
+        box_indices = bbox.points_in_boxes_all(points)
+
+        corner3d = bbox.corners
+        minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
+        minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
+        minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
+
+        nonempty_box_mask = box_indices.T.sum(1) > 5
+
+        bbox_classes = torch.argmax(sem_scores, -1)
+        nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask],
+                                      obj_scores[nonempty_box_mask],
+                                      bbox_classes[nonempty_box_mask],
+                                      self.test_cfg.nms_thr)
+
+        # filter empty boxes and boxes with low score
+        scores_mask = (obj_scores > self.test_cfg.score_thr)
+        nonempty_box_inds = torch.nonzero(
+            nonempty_box_mask, as_tuple=False).flatten()
+        nonempty_mask = torch.zeros_like(bbox_classes).scatter(
+            0, nonempty_box_inds[nms_selected], 1)
+        selected = (nonempty_mask.bool() & scores_mask.bool())
+
+        if self.test_cfg.per_class_proposal:
+            bbox_selected, score_selected, labels = [], [], []
+            for k in range(sem_scores.shape[-1]):
+                bbox_selected.append(bbox[selected].tensor)
+                score_selected.append(obj_scores[selected] *
+                                      sem_scores[selected][:, k])
+                labels.append(
+                    torch.zeros_like(bbox_classes[selected]).fill_(k))
+            bbox_selected = torch.cat(bbox_selected, 0)
+            score_selected = torch.cat(score_selected, 0)
+            labels = torch.cat(labels, 0)
+        else:
+            bbox_selected = bbox[selected].tensor
+            score_selected = obj_scores[selected]
+            labels = bbox_classes[selected]
+
+        return bbox_selected, score_selected, labels
diff --git a/mmde/mmdet3d/models/detectors/__init__.py b/mmde/mmdet3d/models/detectors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c95e00ca0df5ccf190c0508c2df693627ef0d533
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import Base3DDetector
+from .centerpoint import CenterPoint
+from .dfm import DfM
+from .dynamic_voxelnet import DynamicVoxelNet
+from .fcos_mono3d import FCOSMono3D
+from .groupfree3dnet import GroupFree3DNet
+from .h3dnet import H3DNet
+from .imvotenet import ImVoteNet
+from .imvoxelnet import ImVoxelNet
+from .mink_single_stage import MinkSingleStage3DDetector
+from .multiview_dfm import MultiViewDfM
+from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN
+from .mvx_two_stage import MVXTwoStageDetector
+from .parta2 import PartA2
+from .point_rcnn import PointRCNN
+from .pv_rcnn import PointVoxelRCNN
+from .sassd import SASSD
+from .single_stage_mono3d import SingleStageMono3DDetector
+from .smoke_mono3d import SMOKEMono3D
+from .ssd3dnet import SSD3DNet
+from .votenet import VoteNet
+from .voxelnet import VoxelNet
+
+__all__ = [
+    'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector',
+    'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet',
+    'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector',
+    'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet', 'PointRCNN', 'SMOKEMono3D',
+    'SASSD', 'MinkSingleStage3DDetector', 'MultiViewDfM', 'DfM',
+    'PointVoxelRCNN'
+]
diff --git a/mmde/mmdet3d/models/detectors/base.py b/mmde/mmdet3d/models/detectors/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f7bb5d99f90097e5cb962c470755566bfb52c52
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/base.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Union
+
+from mmdet.models import BaseDetector
+from mmengine.structures import InstanceData
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import (ForwardResults,
+                                                  OptSampleList, SampleList)
+from mmdet3d.utils.typing_utils import (OptConfigType, OptInstanceList,
+                                        OptMultiConfig)
+
+
+@MODELS.register_module()
+class Base3DDetector(BaseDetector):
+    """Base class for 3D detectors.
+
+    Args:
+       data_preprocessor (dict or ConfigDict, optional): The pre-process
+           config of :class:`BaseDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+       init_cfg (dict or ConfigDict, optional): the config to control the
+           initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+
+    def forward(self,
+                inputs: Union[dict, List[dict]],
+                data_samples: OptSampleList = None,
+                mode: str = 'tensor',
+                **kwargs) -> ForwardResults:
+        """The unified entry for a forward process in both training and test.
+
+        The method should accept three modes: "tensor", "predict" and "loss":
+
+        - "tensor": Forward the whole network and return tensor or tuple of
+        tensor without any post-processing, same as a common nn.Module.
+        - "predict": Forward and return the predictions, which are fully
+        processed to a list of :obj:`Det3DDataSample`.
+        - "loss": Forward and return a dict of losses according to the given
+        inputs and data samples.
+
+        Note that this method doesn't handle neither back propagation nor
+        optimizer updating, which are done in the :meth:`train_step`.
+
+        Args:
+            inputs  (dict | list[dict]): When it is a list[dict], the
+                outer list indicate the test time augmentation. Each
+                dict contains batch inputs
+                which include 'points' and 'imgs' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor): Image tensor has shape (B, C, H, W).
+            data_samples (list[:obj:`Det3DDataSample`],
+                list[list[:obj:`Det3DDataSample`]], optional): The
+                annotation data of every samples. When it is a list[list], the
+                outer list indicate the test time augmentation, and the
+                inter list indicate the batch. Otherwise, the list simply
+                indicate the batch. Defaults to None.
+            mode (str): Return what kind of value. Defaults to 'tensor'.
+
+        Returns:
+            The return type depends on ``mode``.
+
+            - If ``mode="tensor"``, return a tensor or a tuple of tensor.
+            - If ``mode="predict"``, return a list of :obj:`Det3DDataSample`.
+            - If ``mode="loss"``, return a dict of tensor.
+        """
+        if mode == 'loss':
+            return self.loss(inputs, data_samples, **kwargs)
+        elif mode == 'predict':
+            if isinstance(data_samples[0], list):
+                # aug test
+                assert len(data_samples[0]) == 1, 'Only support ' \
+                                                  'batch_size 1 ' \
+                                                  'in mmdet3d when ' \
+                                                  'do the test' \
+                                                  'time augmentation.'
+                return self.aug_test(inputs, data_samples, **kwargs)
+            else:
+                return self.predict(inputs, data_samples, **kwargs)
+        elif mode == 'tensor':
+            return self._forward(inputs, data_samples, **kwargs)
+        else:
+            raise RuntimeError(f'Invalid mode "{mode}". '
+                               'Only supports loss, predict and tensor mode')
+
+    def add_pred_to_datasample(
+        self,
+        data_samples: SampleList,
+        data_instances_3d: OptInstanceList = None,
+        data_instances_2d: OptInstanceList = None,
+    ) -> SampleList:
+        """Convert results list to `Det3DDataSample`.
+
+        Subclasses could override it to be compatible for some multi-modality
+        3D detectors.
+
+        Args:
+            data_samples (list[:obj:`Det3DDataSample`]): The input data.
+            data_instances_3d (list[:obj:`InstanceData`], optional): 3D
+                Detection results of each sample.
+            data_instances_2d (list[:obj:`InstanceData`], optional): 2D
+                Detection results of each sample.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` normally
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels_3d (Tensor): Labels of 3D bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+              (num_instances, C) where C >=7.
+
+            When there are image prediction in some models, it should
+            contains  `pred_instances`, And the ``pred_instances`` normally
+            contains following keys.
+
+            - scores (Tensor): Classification scores of image, has a shape
+              (num_instance, )
+            - labels (Tensor): Predict Labels of 2D bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Contains a tensor with shape
+              (num_instances, 4).
+        """
+
+        assert (data_instances_2d is not None) or \
+               (data_instances_3d is not None),\
+               'please pass at least one type of data_samples'
+
+        if data_instances_2d is None:
+            data_instances_2d = [
+                InstanceData() for _ in range(len(data_instances_3d))
+            ]
+        if data_instances_3d is None:
+            data_instances_3d = [
+                InstanceData() for _ in range(len(data_instances_2d))
+            ]
+
+        for i, data_sample in enumerate(data_samples):
+            data_sample.pred_instances_3d = data_instances_3d[i]
+            data_sample.pred_instances = data_instances_2d[i]
+        return data_samples
diff --git a/mmde/mmdet3d/models/detectors/centerpoint.py b/mmde/mmdet3d/models/detectors/centerpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..e628f0179ab75730eef21ddafb2fabc1983cf452
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/centerpoint.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+from mmdet3d.registry import MODELS
+from .mvx_two_stage import MVXTwoStageDetector
+
+
+@MODELS.register_module()
+class CenterPoint(MVXTwoStageDetector):
+    """Base class of Multi-modality VoxelNet.
+
+    Args:
+        pts_voxel_encoder (dict, optional): Point voxelization
+            encoder layer. Defaults to None.
+        pts_middle_encoder (dict, optional): Middle encoder layer
+            of points cloud modality. Defaults to None.
+        pts_fusion_layer (dict, optional): Fusion layer.
+            Defaults to None.
+        img_backbone (dict, optional): Backbone of extracting
+            images feature. Defaults to None.
+        pts_backbone (dict, optional): Backbone of extracting
+            points features. Defaults to None.
+        img_neck (dict, optional): Neck of extracting
+            image features. Defaults to None.
+        pts_neck (dict, optional): Neck of extracting
+            points features. Defaults to None.
+        pts_bbox_head (dict, optional): Bboxes head of
+            point cloud modality. Defaults to None.
+        img_roi_head (dict, optional): RoI head of image
+            modality. Defaults to None.
+        img_rpn_head (dict, optional): RPN head of image
+            modality. Defaults to None.
+        train_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        test_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`Det3DDataPreprocessor`. Defaults to None.
+    """
+
+    def __init__(self,
+                 pts_voxel_encoder: Optional[dict] = None,
+                 pts_middle_encoder: Optional[dict] = None,
+                 pts_fusion_layer: Optional[dict] = None,
+                 img_backbone: Optional[dict] = None,
+                 pts_backbone: Optional[dict] = None,
+                 img_neck: Optional[dict] = None,
+                 pts_neck: Optional[dict] = None,
+                 pts_bbox_head: Optional[dict] = None,
+                 img_roi_head: Optional[dict] = None,
+                 img_rpn_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 data_preprocessor: Optional[dict] = None,
+                 **kwargs):
+
+        super(CenterPoint,
+              self).__init__(pts_voxel_encoder, pts_middle_encoder,
+                             pts_fusion_layer, img_backbone, pts_backbone,
+                             img_neck, pts_neck, pts_bbox_head, img_roi_head,
+                             img_rpn_head, train_cfg, test_cfg, init_cfg,
+                             data_preprocessor, **kwargs)
diff --git a/mmde/mmdet3d/models/detectors/dfm.py b/mmde/mmdet3d/models/detectors/dfm.py
new file mode 100644
index 0000000000000000000000000000000000000000..7494c8546f7e499b2b74ab8f9a41a984e7fe1c19
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/dfm.py
@@ -0,0 +1,239 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmdet.models.detectors import BaseDetector
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.ops import bbox3d2result
+from mmdet3d.utils import ConfigType
+
+
+@MODELS.register_module()
+class DfM(BaseDetector):
+    r"""`Monocular 3D Object Detection with Depth from Motion.
+        <https://arxiv.org/abs/2207.12988>`_.
+
+    Args:
+        data_preprocessor (:obj:`ConfigDict` or dict): The pre-process
+            config of :class:`BaseDataPreprocessor`. it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone
+        config.
+        backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config.
+        neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
+        bbox_head_3d (:obj:`ConfigDict` or dict): The 3d bbox head config.
+        neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config
+            for 2D object detection. Defaults to None.
+        bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox
+            head config for 2D object detection. Defaults to None.
+        depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth
+            head config for depth estimation in fov space. Defaults to None.
+        depth_head (:obj:`ConfigDict` or dict, optional): The depth head
+            config for depth estimation in 3D voxel projected to fov space .
+        train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
+            training hyper-parameters. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
+            hyper-parameters. Defaults to None.
+        pretrained (:obj: `ConfigDict` or dict optional): The pretrained
+            config.
+        init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
+            config. Defaults to None.
+    """
+
+    def __init__(self,
+                 data_preprocessor: ConfigType,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 backbone_stereo: ConfigType,
+                 backbone_3d: ConfigType,
+                 neck_3d: ConfigType,
+                 bbox_head_3d: ConfigType,
+                 neck_2d=None,
+                 bbox_head_2d=None,
+                 depth_head_2d=None,
+                 depth_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        self.neck = MODELS.build(neck)
+        if backbone_stereo is not None:
+            backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature)
+            backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1])
+            self.backbone_stereo = MODELS.build(backbone_stereo)
+            assert self.neck.cat_img_feature == \
+                self.backbone_stereo.cat_img_feature
+            assert self.neck.sem_channels[
+                -1] == self.backbone_stereo.in_sem_channels
+        if backbone_3d is not None:
+            self.backbone_3d = MODELS.build(backbone_3d)
+        if neck_3d is not None:
+            self.neck_3d = MODELS.build(neck_3d)
+        if neck_2d is not None:
+            self.neck_2d = MODELS.build(neck_2d)
+        if bbox_head_2d is not None:
+            self.bbox_head_2d = MODELS.build(bbox_head_2d)
+        if depth_head_2d is not None:
+            self.depth_head_2d = MODELS.build(depth_head_2d)
+        if depth_head is not None:
+            self.depth_head = MODELS.build(depth_head)
+            self.depth_samples = self.depth_head.depth_samples
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        bbox_head_3d.update(train_cfg=train_cfg)
+        bbox_head_3d.update(test_cfg=test_cfg)
+        self.bbox_head_3d = MODELS.build(bbox_head_3d)
+
+    @property
+    def with_backbone_3d(self):
+        """Whether the detector has a 3D backbone."""
+        return hasattr(self, 'backbone_3d') and self.backbone_3d is not None
+
+    @property
+    def with_neck_3d(self):
+        """Whether the detector has a 3D neck."""
+        return hasattr(self, 'neck_3d') and self.neck_3d is not None
+
+    @property
+    def with_neck_2d(self):
+        """Whether the detector has a 2D neck."""
+        return hasattr(self, 'neck_2d') and self.neck_2d is not None
+
+    @property
+    def with_bbox_head_2d(self):
+        """Whether the detector has a 2D detection head."""
+        return hasattr(self, 'bbox_head_2d') and self.bbox_head_2d is not None
+
+    @property
+    def with_depth_head_2d(self):
+        """Whether the detector has a image-based depth head."""
+        return hasattr(self,
+                       'depth_head_2d') and self.depth_head_2d is not None
+
+    @property
+    def with_depth_head(self):
+        """Whether the detector has a frustum-based depth head."""
+        return hasattr(self, 'depth_head') and self.depth_head is not None
+
+    def extract_feat(self, img, img_metas):
+        """Feature extraction for perspective-view images.
+
+        Args:
+            img (torch.Tensor): Images of shape [B, N, C_in, H, W].
+            img_metas (list): Image meta information. Each element corresponds
+                to a group of images. len(img_metas) == B.
+
+        Returns:
+            torch.Tensor: bev feature with shape [B, C_out, N_y, N_x].
+        """
+        # split input img into current and previous ones
+        batch_size, N, C_in, H, W = img.shape
+        cur_imgs = img[:, 0]
+        prev_imgs = img[:, 1]  # TODO: to support multiple prev imgs
+        # 2D backbone for feature extraction
+        cur_feats = self.backbone(cur_imgs)
+        cur_feats = [cur_imgs] + list(cur_feats)
+        prev_feats = self.backbone(prev_imgs)
+        prev_feats = [prev_imgs] + list(prev_feats)
+        # SPP module as the feature neck
+        cur_stereo_feat, cur_sem_feat = self.neck(cur_feats)
+        prev_stereo_feat, prev_sem_feat = self.neck(prev_feats)
+        # derive cur2prevs
+        cur_pose = torch.tensor(
+            [img_meta['cam2global'] for img_meta in img_metas],
+            device=img.device)[:, None, :, :]  # (B, 1, 4, 4)
+        prev_poses = []
+        for img_meta in img_metas:
+            sweep_img_metas = img_meta['sweep_img_metas']
+            prev_poses.append([
+                sweep_img_meta['cam2global']
+                for sweep_img_meta in sweep_img_metas
+            ])
+        prev_poses = torch.tensor(prev_poses, device=img.device)
+        pad_prev_cam2global = torch.eye(4)[None, None].expand(
+            batch_size, N - 1, 4, 4).to(img.device)
+        pad_prev_cam2global[:, :, :prev_poses.shape[-2], :prev_poses.
+                            shape[-1]] = prev_poses
+        pad_cur_cam2global = torch.eye(4)[None,
+                                          None].expand(batch_size, 1, 4,
+                                                       4).to(img.device)
+        pad_cur_cam2global[:, :, :cur_pose.shape[-2], :cur_pose.
+                           shape[-1]] = cur_pose
+        # (B, N-1, 4, 4) * (B, 1, 4, 4) -> (B, N-1, 4, 4)
+        # torch.linalg.solve is faster and more numerically stable
+        # than torch.matmul(torch.linalg.inv(A), B)
+        # empirical results show that torch.linalg.solve can derive
+        # almost the same result with np.linalg.inv
+        # while torch.linalg.inv can not
+        cur2prevs = torch.linalg.solve(pad_prev_cam2global, pad_cur_cam2global)
+        for meta_idx, img_meta in enumerate(img_metas):
+            img_meta['cur2prevs'] = cur2prevs[meta_idx]
+        # stereo backbone for depth estimation
+        # volume_feat: (batch_size, Cv, Nz, Ny, Nx)
+        volume_feat = self.backbone_stereo(cur_stereo_feat, prev_stereo_feat,
+                                           img_metas, cur_sem_feat)
+        # height compression
+        _, Cv, Nz, Ny, Nx = volume_feat.shape
+        bev_feat = volume_feat.view(batch_size, Cv * Nz, Ny, Nx)
+        bev_feat_prehg, bev_feat = self.neck_3d(bev_feat)
+        return bev_feat
+
+    def forward_train(self,
+                      img,
+                      img_metas,
+                      gt_bboxes_3d,
+                      gt_labels_3d,
+                      depth_img=None,
+                      **kwargs):
+        """Forward function for training."""
+        bev_feat = self.extract_feat(img, img_metas)
+        outs = self.bbox_head_3d([bev_feat])
+        losses = self.bbox_head_3d.loss(*outs, gt_bboxes_3d, gt_labels_3d,
+                                        img_metas)
+        # TODO: loss_dense_depth, loss_2d, loss_imitation
+        return losses
+
+    def forward_test(self, img, img_metas, **kwargs):
+        """Forward of testing.
+
+        Args:
+            img (torch.Tensor): Input images of shape (N, C_in, H, W).
+            img_metas (list): Image metas.
+
+        Returns:
+            list[dict]: Predicted 3d boxes.
+        """
+        # not supporting aug_test for now
+        return self.simple_test(img, img_metas)
+
+    def simple_test(self, img, img_metas):
+        """Simple inference forward without test time augmentation."""
+        bev_feat = self.extract_feat(img, img_metas)
+        # bbox_head takes a list of feature from different levels as input
+        # so need [bev_feat]
+        outs = self.bbox_head_3d([bev_feat])
+        bbox_list = self.bbox_head_3d.get_bboxes(*outs, img_metas)
+        bbox_results = [
+            bbox3d2result(det_bboxes, det_scores, det_labels)
+            for det_bboxes, det_scores, det_labels in bbox_list
+        ]
+        # add pseudo-lidar label to each pred_dict for post-processing
+        for bbox_result in bbox_results:
+            bbox_result['pseudo_lidar'] = True
+        return bbox_results
+
+    def aug_test(self, imgs, img_metas, **kwargs):
+        """Test with augmentations.
+
+        Args:
+            imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W).
+            img_metas (list): Image metas.
+
+        Returns:
+            list[dict]: Predicted 3d boxes.
+        """
+        raise NotImplementedError
diff --git a/mmde/mmdet3d/models/detectors/dynamic_voxelnet.py b/mmde/mmdet3d/models/detectors/dynamic_voxelnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab4e27bfde99027aef4ee602ccb4c1f9275d823b
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/dynamic_voxelnet.py
@@ -0,0 +1,48 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from .voxelnet import VoxelNet
+
+
+@MODELS.register_module()
+class DynamicVoxelNet(VoxelNet):
+    r"""VoxelNet using `dynamic voxelization
+    <https://arxiv.org/abs/1910.06528>`_.
+    """
+
+    def __init__(self,
+                 voxel_encoder: ConfigType,
+                 middle_encoder: ConfigType,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            voxel_encoder=voxel_encoder,
+            middle_encoder=middle_encoder,
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+
+    def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
+        """Extract features from points."""
+        voxel_dict = batch_inputs_dict['voxels']
+        voxel_features, feature_coors = self.voxel_encoder(
+            voxel_dict['voxels'], voxel_dict['coors'])
+        batch_size = voxel_dict['coors'][-1, 0].item() + 1
+        x = self.middle_encoder(voxel_features, feature_coors, batch_size)
+        x = self.backbone(x)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
diff --git a/mmde/mmdet3d/models/detectors/fcos_mono3d.py b/mmde/mmdet3d/models/detectors/fcos_mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..c425ae89932fa3d184d997f1e29563c6301774af
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/fcos_mono3d.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from ...structures.det3d_data_sample import SampleList
+from .single_stage_mono3d import SingleStageMono3DDetector
+
+
+@MODELS.register_module()
+class FCOSMono3D(SingleStageMono3DDetector):
+    r"""`FCOS3D <https://arxiv.org/abs/2104.10956>`_ for monocular 3D object detection.
+
+    Currently please refer to our entry on the
+    `leaderboard <https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Camera>`_.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of FCOS. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of FCOS. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """  # noqa: E501
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+
+    def predict(self,
+                batch_inputs_dict: Dict[str, Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'imgs' keys
+
+                - imgs (torch.Tensor: Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` normally
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels_3d (Tensor): Labels of 3D bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+              (num_instances, C) where C >=7.
+
+            When there are 2D prediction in models, it should
+            contains  `pred_instances`, And the ``pred_instances`` normally
+            contains following keys.
+
+            - scores (Tensor): Classification scores of image, has a shape
+              (num_instance, )
+            - labels (Tensor): Predict Labels of 2D bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Contains a tensor with shape
+              (num_instances, 4).
+        """
+        x = self.extract_feat(batch_inputs_dict)
+        results_list, results_list_2d = self.bbox_head.predict(
+            x, batch_data_samples, rescale=rescale)
+        predictions = self.add_pred_to_datasample(batch_data_samples,
+                                                  results_list,
+                                                  results_list_2d)
+        return predictions
diff --git a/mmde/mmdet3d/models/detectors/groupfree3dnet.py b/mmde/mmdet3d/models/detectors/groupfree3dnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..935f3cef3b88e9535341e75b0d520257e4724164
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/groupfree3dnet.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmdet3d.registry import MODELS
+from ...structures.det3d_data_sample import SampleList
+from .single_stage import SingleStage3DDetector
+
+
+@MODELS.register_module()
+class GroupFree3DNet(SingleStage3DDetector):
+    """`Group-Free 3D <https://arxiv.org/abs/2104.00678>`_."""
+
+    def __init__(self,
+                 backbone,
+                 bbox_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(GroupFree3DNet, self).__init__(
+            backbone=backbone,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+             **kwargs) -> dict:
+        """Calculate losses from a batch of inputs dict and data samples.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'imgs' keys.
+
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_pts_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        x = self.extract_feat(batch_inputs_dict)
+        points = batch_inputs_dict['points']
+        losses = self.bbox_head.loss(points, x, batch_data_samples, **kwargs)
+        return losses
+
+    def predict(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+                **kwargs) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'imgs' keys.
+
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_pts_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input images. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instance, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+                (num_instances, C) where C >=7.
+        """
+        x = self.extract_feat(batch_inputs_dict)
+        points = batch_inputs_dict['points']
+        results_list = self.bbox_head.predict(points, x, batch_data_samples,
+                                              **kwargs)
+        predictions = self.add_pred_to_datasample(batch_data_samples,
+                                                  results_list)
+        return predictions
diff --git a/mmde/mmdet3d/models/detectors/h3dnet.py b/mmde/mmdet3d/models/detectors/h3dnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ce6e92eeaab7c9772b13a1af5c75d0cb3e8d73e
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/h3dnet.py
@@ -0,0 +1,157 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Union
+
+import torch
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from .two_stage import TwoStage3DDetector
+
+
+@MODELS.register_module()
+class H3DNet(TwoStage3DDetector):
+    r"""H3DNet model.
+
+    Please refer to the `paper <https://arxiv.org/abs/2006.05682>`_
+
+    Args:
+        backbone (dict): Config dict of detector's backbone.
+        neck (dict, optional): Config dict of neck. Defaults to None.
+        rpn_head (dict, optional): Config dict of rpn head. Defaults to None.
+        roi_head (dict, optional): Config dict of roi head. Defaults to None.
+        train_cfg (dict, optional): Config dict of training hyper-parameters.
+            Defaults to None.
+        test_cfg (dict, optional): Config dict of test hyper-parameters.
+            Defaults to None.
+        init_cfg (dict, optional): the config to control the
+           initialization. Default to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`BaseDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+    """
+
+    def __init__(self,
+                 backbone: dict,
+                 neck: Optional[dict] = None,
+                 rpn_head: Optional[dict] = None,
+                 roi_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 data_preprocessor: Optional[dict] = None,
+                 **kwargs) -> None:
+        super(H3DNet, self).__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor,
+            **kwargs)
+
+    def extract_feat(self, batch_inputs_dict: dict) -> None:
+        """Directly extract features from the backbone+neck.
+
+        Args:
+
+            batch_inputs_dict (dict): The model input dict which include
+                'points'.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+
+        Returns:
+            dict: Dict of feature.
+        """
+        stack_points = torch.stack(batch_inputs_dict['points'])
+        x = self.backbone(stack_points)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def loss(self, batch_inputs_dict: Dict[str, Union[List, Tensor]],
+             batch_data_samples: List[Det3DDataSample], **kwargs) -> dict:
+        """
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        feats_dict = self.extract_feat(batch_inputs_dict)
+
+        feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]]
+        feats_dict['fp_features'] = [feats_dict['hd_feature']]
+        feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]]
+
+        losses = dict()
+        if self.with_rpn:
+            proposal_cfg = self.train_cfg.get('rpn_proposal',
+                                              self.test_cfg.rpn)
+            # note, the feats_dict would be added new key & value in rpn_head
+            rpn_losses, rpn_proposals = self.rpn_head.loss_and_predict(
+                batch_inputs_dict['points'],
+                feats_dict,
+                batch_data_samples,
+                ret_target=True,
+                proposal_cfg=proposal_cfg)
+            feats_dict['targets'] = rpn_losses.pop('targets')
+            losses.update(rpn_losses)
+            feats_dict['rpn_proposals'] = rpn_proposals
+        else:
+            raise NotImplementedError
+
+        roi_losses = self.roi_head.loss(batch_inputs_dict['points'],
+                                        feats_dict, batch_data_samples,
+                                        **kwargs)
+        losses.update(roi_losses)
+
+        return losses
+
+    def predict(
+            self, batch_input_dict: Dict,
+            batch_data_samples: List[Det3DDataSample]
+    ) -> List[Det3DDataSample]:
+        """Get model predictions.
+
+        Args:
+            points (list[torch.Tensor]): Points of each sample.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each sample and
+                corresponding annotations.
+
+        Returns:
+            list: Predicted 3d boxes.
+        """
+
+        feats_dict = self.extract_feat(batch_input_dict)
+        feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]]
+        feats_dict['fp_features'] = [feats_dict['hd_feature']]
+        feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]]
+
+        if self.with_rpn:
+            proposal_cfg = self.test_cfg.rpn
+            rpn_proposals = self.rpn_head.predict(
+                batch_input_dict['points'],
+                feats_dict,
+                batch_data_samples,
+                use_nms=proposal_cfg.use_nms)
+            feats_dict['rpn_proposals'] = rpn_proposals
+        else:
+            raise NotImplementedError
+
+        results_list = self.roi_head.predict(
+            batch_input_dict['points'],
+            feats_dict,
+            batch_data_samples,
+            suffix='_optimized')
+        return self.add_pred_to_datasample(batch_data_samples, results_list)
diff --git a/mmde/mmdet3d/models/detectors/imvotenet.py b/mmde/mmdet3d/models/detectors/imvotenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f02a599ae6fe3f27437b3177d477ddc11b7f3dc
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/imvotenet.py
@@ -0,0 +1,537 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from ..layers import MLP
+from .base import Base3DDetector
+
+
+def sample_valid_seeds(mask: Tensor, num_sampled_seed: int = 1024) -> Tensor:
+    r"""Randomly sample seeds from all imvotes.
+
+    Modified from `<https://github.com/facebookresearch/imvotenet/blob/a8856345146bacf29a57266a2f0b874406fd8823/models/imvotenet.py#L26>`_
+
+    Args:
+        mask (torch.Tensor): Bool tensor in shape (
+            seed_num*max_imvote_per_pixel), indicates
+            whether this imvote corresponds to a 2D bbox.
+        num_sampled_seed (int): How many to sample from all imvotes.
+
+    Returns:
+        torch.Tensor: Indices with shape (num_sampled_seed).
+    """  # noqa: E501
+    device = mask.device
+    batch_size = mask.shape[0]
+    sample_inds = mask.new_zeros((batch_size, num_sampled_seed),
+                                 dtype=torch.int64)
+    for bidx in range(batch_size):
+        # return index of non zero elements
+        valid_inds = torch.nonzero(mask[bidx, :]).squeeze(-1)
+        if len(valid_inds) < num_sampled_seed:
+            # compute set t1 - t2
+            t1 = torch.arange(num_sampled_seed, device=device)
+            t2 = valid_inds % num_sampled_seed
+            combined = torch.cat((t1, t2))
+            uniques, counts = combined.unique(return_counts=True)
+            difference = uniques[counts == 1]
+
+            rand_inds = torch.randperm(
+                len(difference),
+                device=device)[:num_sampled_seed - len(valid_inds)]
+            cur_sample_inds = difference[rand_inds]
+            cur_sample_inds = torch.cat((valid_inds, cur_sample_inds))
+        else:
+            rand_inds = torch.randperm(
+                len(valid_inds), device=device)[:num_sampled_seed]
+            cur_sample_inds = valid_inds[rand_inds]
+        sample_inds[bidx, :] = cur_sample_inds
+    return sample_inds
+
+
+@MODELS.register_module()
+class ImVoteNet(Base3DDetector):
+    r"""`ImVoteNet <https://arxiv.org/abs/2001.10692>`_ for 3D detection.
+
+    ImVoteNet is based on fusing 2D votes in images and 3D votes in point
+    clouds, which explicitly extract both geometric and semantic features
+    from the 2D images. It leverage camera parameters to lift these
+    features to 3D. A multi-tower training scheme also improve the synergy
+    of 2D-3D feature fusion.
+
+    """
+
+    def __init__(self,
+                 pts_backbone: Optional[dict] = None,
+                 pts_bbox_heads: Optional[dict] = None,
+                 pts_neck: Optional[dict] = None,
+                 img_backbone: Optional[dict] = None,
+                 img_neck: Optional[dict] = None,
+                 img_roi_head: Optional[dict] = None,
+                 img_rpn_head: Optional[dict] = None,
+                 img_mlp: Optional[dict] = None,
+                 freeze_img_branch: bool = False,
+                 fusion_layer: Optional[dict] = None,
+                 num_sampled_seed: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 **kwargs) -> None:
+
+        super(ImVoteNet, self).__init__(init_cfg=init_cfg, **kwargs)
+
+        # point branch
+        if pts_backbone is not None:
+            self.pts_backbone = MODELS.build(pts_backbone)
+        if pts_neck is not None:
+            self.pts_neck = MODELS.build(pts_neck)
+        if pts_bbox_heads is not None:
+            pts_bbox_head_common = pts_bbox_heads.common
+            pts_bbox_head_common.update(
+                train_cfg=train_cfg.pts if train_cfg is not None else None)
+            pts_bbox_head_common.update(test_cfg=test_cfg.pts)
+            pts_bbox_head_joint = pts_bbox_head_common.copy()
+            pts_bbox_head_joint.update(pts_bbox_heads.joint)
+            pts_bbox_head_pts = pts_bbox_head_common.copy()
+            pts_bbox_head_pts.update(pts_bbox_heads.pts)
+            pts_bbox_head_img = pts_bbox_head_common.copy()
+            pts_bbox_head_img.update(pts_bbox_heads.img)
+
+            self.pts_bbox_head_joint = MODELS.build(pts_bbox_head_joint)
+            self.pts_bbox_head_pts = MODELS.build(pts_bbox_head_pts)
+            self.pts_bbox_head_img = MODELS.build(pts_bbox_head_img)
+            self.pts_bbox_heads = [
+                self.pts_bbox_head_joint, self.pts_bbox_head_pts,
+                self.pts_bbox_head_img
+            ]
+            self.loss_weights = pts_bbox_heads.loss_weights
+
+        # image branch
+        if img_backbone:
+            self.img_backbone = MODELS.build(img_backbone)
+        if img_neck is not None:
+            self.img_neck = MODELS.build(img_neck)
+        if img_rpn_head is not None:
+            rpn_train_cfg = train_cfg.img_rpn if train_cfg \
+                is not None else None
+            img_rpn_head_ = img_rpn_head.copy()
+            img_rpn_head_.update(
+                train_cfg=rpn_train_cfg, test_cfg=test_cfg.img_rpn)
+            self.img_rpn_head = MODELS.build(img_rpn_head_)
+        if img_roi_head is not None:
+            rcnn_train_cfg = train_cfg.img_rcnn if train_cfg \
+                is not None else None
+            img_roi_head.update(
+                train_cfg=rcnn_train_cfg, test_cfg=test_cfg.img_rcnn)
+            self.img_roi_head = MODELS.build(img_roi_head)
+
+        # fusion
+        if fusion_layer is not None:
+            self.fusion_layer = MODELS.build(fusion_layer)
+            self.max_imvote_per_pixel = fusion_layer.max_imvote_per_pixel
+
+        self.freeze_img_branch = freeze_img_branch
+        if freeze_img_branch:
+            self.freeze_img_branch_params()
+
+        if img_mlp is not None:
+            self.img_mlp = MLP(**img_mlp)
+
+        self.num_sampled_seed = num_sampled_seed
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def _forward(self):
+        raise NotImplementedError
+
+    def freeze_img_branch_params(self):
+        """Freeze all image branch parameters."""
+        if self.with_img_bbox_head:
+            for param in self.img_bbox_head.parameters():
+                param.requires_grad = False
+        if self.with_img_backbone:
+            for param in self.img_backbone.parameters():
+                param.requires_grad = False
+        if self.with_img_neck:
+            for param in self.img_neck.parameters():
+                param.requires_grad = False
+        if self.with_img_rpn:
+            for param in self.img_rpn_head.parameters():
+                param.requires_grad = False
+        if self.with_img_roi_head:
+            for param in self.img_roi_head.parameters():
+                param.requires_grad = False
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        """Overload in order to load img network ckpts into img branch."""
+        module_names = ['backbone', 'neck', 'roi_head', 'rpn_head']
+        for key in list(state_dict):
+            for module_name in module_names:
+                if key.startswith(module_name) and ('img_' +
+                                                    key) not in state_dict:
+                    state_dict['img_' + key] = state_dict.pop(key)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata,
+                                      strict, missing_keys, unexpected_keys,
+                                      error_msgs)
+
+    def train(self, mode=True):
+        """Overload in order to keep image branch modules in eval mode."""
+        super(ImVoteNet, self).train(mode)
+        if self.freeze_img_branch:
+            if self.with_img_bbox_head:
+                self.img_bbox_head.eval()
+            if self.with_img_backbone:
+                self.img_backbone.eval()
+            if self.with_img_neck:
+                self.img_neck.eval()
+            if self.with_img_rpn:
+                self.img_rpn_head.eval()
+            if self.with_img_roi_head:
+                self.img_roi_head.eval()
+
+    @property
+    def with_img_bbox(self):
+        """bool: Whether the detector has a 2D image box head."""
+        return ((hasattr(self, 'img_roi_head') and self.img_roi_head.with_bbox)
+                or (hasattr(self, 'img_bbox_head')
+                    and self.img_bbox_head is not None))
+
+    @property
+    def with_img_bbox_head(self):
+        """bool: Whether the detector has a 2D image box head (not roi)."""
+        return hasattr(self,
+                       'img_bbox_head') and self.img_bbox_head is not None
+
+    @property
+    def with_img_backbone(self):
+        """bool: Whether the detector has a 2D image backbone."""
+        return hasattr(self, 'img_backbone') and self.img_backbone is not None
+
+    @property
+    def with_img_neck(self):
+        """bool: Whether the detector has a neck in image branch."""
+        return hasattr(self, 'img_neck') and self.img_neck is not None
+
+    @property
+    def with_img_rpn(self):
+        """bool: Whether the detector has a 2D RPN in image detector branch."""
+        return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None
+
+    @property
+    def with_img_roi_head(self):
+        """bool: Whether the detector has a RoI Head in image branch."""
+        return hasattr(self, 'img_roi_head') and self.img_roi_head is not None
+
+    @property
+    def with_pts_bbox(self):
+        """bool: Whether the detector has a 3D box head."""
+        return hasattr(self,
+                       'pts_bbox_head') and self.pts_bbox_head is not None
+
+    @property
+    def with_pts_backbone(self):
+        """bool: Whether the detector has a 3D backbone."""
+        return hasattr(self, 'pts_backbone') and self.pts_backbone is not None
+
+    @property
+    def with_pts_neck(self):
+        """bool: Whether the detector has a neck in 3D detector branch."""
+        return hasattr(self, 'pts_neck') and self.pts_neck is not None
+
+    def extract_feat(self, imgs):
+        """Just to inherit from abstract method."""
+        pass
+
+    def extract_img_feat(self, img: Tensor) -> Sequence[Tensor]:
+        """Directly extract features from the img backbone+neck."""
+        x = self.img_backbone(img)
+        if self.with_img_neck:
+            x = self.img_neck(x)
+        return x
+
+    def extract_pts_feat(self, pts: Tensor) -> Tuple[Tensor]:
+        """Extract features of points."""
+        x = self.pts_backbone(pts)
+        if self.with_pts_neck:
+            x = self.pts_neck(x)
+
+        seed_points = x['fp_xyz'][-1]
+        seed_features = x['fp_features'][-1]
+        seed_indices = x['fp_indices'][-1]
+
+        return (seed_points, seed_features, seed_indices)
+
+    def loss(self, batch_inputs_dict: Dict[str, Union[List, Tensor]],
+             batch_data_samples: List[Det3DDataSample],
+             **kwargs) -> List[Det3DDataSample]:
+        """
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'imgs` keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (list[torch.Tensor]): Image tensor with shape
+                  (N, C, H ,W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        imgs = batch_inputs_dict.get('imgs', None)
+        points = batch_inputs_dict.get('points', None)
+        if points is None:
+            x = self.extract_img_feat(imgs)
+            losses = dict()
+            # RPN forward and loss
+            if self.with_img_rpn:
+                proposal_cfg = self.train_cfg.get('img_rpn_proposal',
+                                                  self.test_cfg.img_rpn)
+                rpn_data_samples = copy.deepcopy(batch_data_samples)
+                # set cat_id of gt_labels to 0 in RPN
+                for data_sample in rpn_data_samples:
+                    data_sample.gt_instances.labels = \
+                        torch.zeros_like(data_sample.gt_instances.labels)
+
+                rpn_losses, rpn_results_list = \
+                    self.img_rpn_head.loss_and_predict(
+                        x, rpn_data_samples,
+                        proposal_cfg=proposal_cfg, **kwargs)
+                # avoid get same name with roi_head loss
+                keys = rpn_losses.keys()
+                for key in keys:
+                    if 'loss' in key and 'rpn' not in key:
+                        rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
+                losses.update(rpn_losses)
+            else:
+                assert batch_data_samples[0].get('proposals', None) is not None
+                # use pre-defined proposals in InstanceData for
+                # the second stage
+                # to extract ROI features.
+                rpn_results_list = [
+                    data_sample.proposals for data_sample in batch_data_samples
+                ]
+
+            roi_losses = self.img_roi_head.loss(x, rpn_results_list,
+                                                batch_data_samples, **kwargs)
+            losses.update(roi_losses)
+            return losses
+        else:
+            with torch.no_grad():
+                results_2d = self.predict_img_only(
+                    batch_inputs_dict['imgs'],
+                    batch_data_samples,
+                    rescale=False)
+            # tensor with shape (n, 6), the 6 arrange
+            # as [x1, x2, y1, y2, score, label]
+            pred_bboxes_with_label_list = []
+            for single_results in results_2d:
+                cat_preds = torch.cat(
+                    (single_results.bboxes, single_results.scores[:, None],
+                     single_results.labels[:, None]),
+                    dim=-1)
+                cat_preds = cat_preds[torch.argsort(
+                    cat_preds[:, 4], descending=True)]
+                # drop half bboxes during training for better generalization
+                if self.training:
+                    rand_drop = torch.randperm(
+                        len(cat_preds))[:(len(cat_preds) + 1) // 2]
+                    rand_drop = torch.sort(rand_drop)[0]
+                    cat_preds = cat_preds[rand_drop]
+
+                pred_bboxes_with_label_list.append(cat_preds)
+
+            stack_points = torch.stack(points)
+            seeds_3d, seed_3d_features, seed_indices = \
+                self.extract_pts_feat(stack_points)
+            img_metas = [item.metainfo for item in batch_data_samples]
+            img_features, masks = self.fusion_layer(
+                imgs, pred_bboxes_with_label_list, seeds_3d, img_metas)
+
+            inds = sample_valid_seeds(masks, self.num_sampled_seed)
+            batch_size, img_feat_size = img_features.shape[:2]
+            pts_feat_size = seed_3d_features.shape[1]
+            inds_img = inds.view(batch_size, 1,
+                                 -1).expand(-1, img_feat_size, -1)
+            img_features = img_features.gather(-1, inds_img)
+            inds = inds % inds.shape[1]
+            inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
+            seeds_3d = seeds_3d.gather(1, inds_seed_xyz)
+            inds_seed_feats = inds.view(batch_size, 1,
+                                        -1).expand(-1, pts_feat_size, -1)
+            seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats)
+            seed_indices = seed_indices.gather(1, inds)
+
+            img_features = self.img_mlp(img_features)
+            fused_features = torch.cat([seed_3d_features, img_features], dim=1)
+
+            feat_dict_joint = dict(
+                seed_points=seeds_3d,
+                seed_features=fused_features,
+                seed_indices=seed_indices)
+            feat_dict_pts = dict(
+                seed_points=seeds_3d,
+                seed_features=seed_3d_features,
+                seed_indices=seed_indices)
+            feat_dict_img = dict(
+                seed_points=seeds_3d,
+                seed_features=img_features,
+                seed_indices=seed_indices)
+
+            losses_towers = []
+            losses_joint = self.pts_bbox_head_joint.loss(
+                points, feat_dict_joint, batch_data_samples)
+            losses_pts = self.pts_bbox_head_pts.loss(points, feat_dict_pts,
+                                                     batch_data_samples)
+            losses_img = self.pts_bbox_head_img.loss(points, feat_dict_img,
+                                                     batch_data_samples)
+            losses_towers.append(losses_joint)
+            losses_towers.append(losses_pts)
+            losses_towers.append(losses_img)
+            combined_losses = dict()
+            for loss_term in losses_joint:
+                if 'loss' in loss_term:
+                    combined_losses[loss_term] = 0
+                    for i in range(len(losses_towers)):
+                        combined_losses[loss_term] += \
+                            losses_towers[i][loss_term] * \
+                            self.loss_weights[i]
+                else:
+                    # only save the metric of the joint head
+                    # if it is not a loss
+                    combined_losses[loss_term] = \
+                        losses_towers[0][loss_term]
+
+            return combined_losses
+
+    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[Det3DDataSample]:
+        """Forward of testing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' and 'imgs keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (list[torch.Tensor]): Tensor of Images.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+        """
+        points = batch_inputs_dict.get('points', None)
+        imgs = batch_inputs_dict.get('imgs', None)
+        if points is None:
+            assert imgs is not None
+            results_2d = self.predict_img_only(imgs, batch_data_samples)
+            return self.add_pred_to_datasample(
+                batch_data_samples, data_instances_2d=results_2d)
+
+        else:
+            results_2d = self.predict_img_only(
+                batch_inputs_dict['imgs'], batch_data_samples, rescale=False)
+            # tensor with shape (n, 6), the 6 arrange
+            # as [x1, x2, y1, y2, score, label]
+            pred_bboxes_with_label_list = []
+            for single_results in results_2d:
+                cat_preds = torch.cat(
+                    (single_results.bboxes, single_results.scores[:, None],
+                     single_results.labels[:, None]),
+                    dim=-1)
+                cat_preds = cat_preds[torch.argsort(
+                    cat_preds[:, 4], descending=True)]
+                pred_bboxes_with_label_list.append(cat_preds)
+
+            stack_points = torch.stack(points)
+            seeds_3d, seed_3d_features, seed_indices = \
+                self.extract_pts_feat(stack_points)
+
+            img_features, masks = self.fusion_layer(
+                imgs, pred_bboxes_with_label_list, seeds_3d,
+                [item.metainfo for item in batch_data_samples])
+
+            inds = sample_valid_seeds(masks, self.num_sampled_seed)
+            batch_size, img_feat_size = img_features.shape[:2]
+            pts_feat_size = seed_3d_features.shape[1]
+            inds_img = inds.view(batch_size, 1,
+                                 -1).expand(-1, img_feat_size, -1)
+            img_features = img_features.gather(-1, inds_img)
+            inds = inds % inds.shape[1]
+            inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
+            seeds_3d = seeds_3d.gather(1, inds_seed_xyz)
+            inds_seed_feats = inds.view(batch_size, 1,
+                                        -1).expand(-1, pts_feat_size, -1)
+            seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats)
+            seed_indices = seed_indices.gather(1, inds)
+
+            img_features = self.img_mlp(img_features)
+
+            fused_features = torch.cat([seed_3d_features, img_features], dim=1)
+
+            feat_dict = dict(
+                seed_points=seeds_3d,
+                seed_features=fused_features,
+                seed_indices=seed_indices)
+
+            results_3d = self.pts_bbox_head_joint.predict(
+                batch_inputs_dict['points'],
+                feat_dict,
+                batch_data_samples,
+                rescale=True)
+
+            return self.add_pred_to_datasample(batch_data_samples, results_3d)
+
+    def predict_img_only(self,
+                         imgs: Tensor,
+                         batch_data_samples: List[Det3DDataSample],
+                         rescale: bool = True) -> List[InstanceData]:
+        """Predict results from a batch of imgs with post- processing.
+
+        Args:
+            imgs (Tensor): Inputs images with shape (N, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Return the list of detection
+            results of the input images, usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+                (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+                the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+
+        assert self.with_img_bbox, 'Img bbox head must be implemented.'
+        assert self.with_img_backbone, 'Img backbone must be implemented.'
+        assert self.with_img_rpn, 'Img rpn must be implemented.'
+        assert self.with_img_roi_head, 'Img roi head must be implemented.'
+        x = self.extract_img_feat(imgs)
+
+        # If there are no pre-defined proposals, use RPN to get proposals
+        if batch_data_samples[0].get('proposals', None) is None:
+            rpn_results_list = self.img_rpn_head.predict(
+                x, batch_data_samples, rescale=False)
+        else:
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        results_list = self.img_roi_head.predict(
+            x, rpn_results_list, batch_data_samples, rescale=rescale)
+
+        return results_list
diff --git a/mmde/mmdet3d/models/detectors/imvoxelnet.py b/mmde/mmdet3d/models/detectors/imvoxelnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..e97c3284edcc4f1db6306f7e75b2d70fdb1ed10b
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/imvoxelnet.py
@@ -0,0 +1,275 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.models.detectors import Base3DDetector
+from mmdet3d.models.layers.fusion_layers.point_fusion import point_sample
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures.bbox_3d import get_proj_mat_by_coord_type
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import ConfigType, OptConfigType, OptInstanceList
+
+
+@MODELS.register_module()
+class ImVoxelNet(Base3DDetector):
+    r"""`ImVoxelNet <https://arxiv.org/abs/2106.01178>`_.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        prior_generator (:obj:`ConfigDict` or dict): The prior points
+            generator config.
+        n_voxels (list): Number of voxels along x, y, z axis.
+        coord_type (str): The type of coordinates of points cloud:
+            'DEPTH', 'LIDAR', or 'CAMERA'.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
+            training hyper-parameters. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
+            hyper-parameters. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`BaseDataPreprocessor`.  it usually includes,
+                ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
+            config. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 neck_3d: ConfigType,
+                 bbox_head: ConfigType,
+                 prior_generator: ConfigType,
+                 n_voxels: List,
+                 coord_type: str,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptConfigType = None):
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        self.neck = MODELS.build(neck)
+        self.neck_3d = MODELS.build(neck_3d)
+        bbox_head.update(train_cfg=train_cfg)
+        bbox_head.update(test_cfg=test_cfg)
+        self.bbox_head = MODELS.build(bbox_head)
+        self.prior_generator = TASK_UTILS.build(prior_generator)
+        self.n_voxels = n_voxels
+        self.coord_type = coord_type
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def extract_feat(self, batch_inputs_dict: dict,
+                     batch_data_samples: SampleList):
+        """Extract 3d features from the backbone -> fpn -> 3d projection.
+
+        -> 3d neck -> bbox_head.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+
+                    - imgs (torch.Tensor, optional): Image of each sample.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            Tuple:
+            - torch.Tensor: Features of shape (N, C_out, N_x, N_y, N_z).
+            - torch.Tensor: Valid mask of shape (N, 1, N_x, N_y, N_z).
+        """
+        img = batch_inputs_dict['imgs']
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        x = self.backbone(img)
+        x = self.neck(x)[0]
+        points = self.prior_generator.grid_anchors([self.n_voxels[::-1]],
+                                                   device=img.device)[0][:, :3]
+        volumes, valid_preds = [], []
+        for feature, img_meta in zip(x, batch_img_metas):
+            img_scale_factor = (
+                points.new_tensor(img_meta['scale_factor'][:2])
+                if 'scale_factor' in img_meta.keys() else 1)
+            img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False
+            img_crop_offset = (
+                points.new_tensor(img_meta['img_crop_offset'])
+                if 'img_crop_offset' in img_meta.keys() else 0)
+            proj_mat = points.new_tensor(
+                get_proj_mat_by_coord_type(img_meta, self.coord_type))
+            volume = point_sample(
+                img_meta,
+                img_features=feature[None, ...],
+                points=points,
+                proj_mat=points.new_tensor(proj_mat),
+                coord_type=self.coord_type,
+                img_scale_factor=img_scale_factor,
+                img_crop_offset=img_crop_offset,
+                img_flip=img_flip,
+                img_pad_shape=img.shape[-2:],
+                img_shape=img_meta['img_shape'][:2],
+                aligned=False)
+            volumes.append(
+                volume.reshape(self.n_voxels[::-1] + [-1]).permute(3, 2, 1, 0))
+            valid_preds.append(
+                ~torch.all(volumes[-1] == 0, dim=0, keepdim=True))
+        x = torch.stack(volumes)
+        x = self.neck_3d(x)
+        return x, torch.stack(valid_preds).float()
+
+    def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+             **kwargs) -> Union[dict, list]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+
+                    - imgs (torch.Tensor, optional): Image of each sample.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        x, valid_preds = self.extract_feat(batch_inputs_dict,
+                                           batch_data_samples)
+        # For indoor datasets ImVoxelNet uses ImVoxelHead that handles
+        # mask of visible voxels.
+        if self.coord_type == 'DEPTH':
+            x += (valid_preds, )
+        losses = self.bbox_head.loss(x, batch_data_samples, **kwargs)
+        return losses
+
+    def predict(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+                **kwargs) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input images. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes_3d (Tensor): Contains a tensor with shape
+                    (num_instances, C) where C >=7.
+        """
+        x, valid_preds = self.extract_feat(batch_inputs_dict,
+                                           batch_data_samples)
+        # For indoor datasets ImVoxelNet uses ImVoxelHead that handles
+        # mask of visible voxels.
+        if self.coord_type == 'DEPTH':
+            x += (valid_preds, )
+        results_list = \
+            self.bbox_head.predict(x, batch_data_samples, **kwargs)
+        predictions = self.add_pred_to_datasample(batch_data_samples,
+                                                  results_list)
+        return predictions
+
+    def _forward(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+                 *args, **kwargs) -> Tuple[List[torch.Tensor]]:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+
+                    - imgs (torch.Tensor, optional): Image of each sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            tuple[list]: A tuple of features from ``bbox_head`` forward.
+        """
+        x, valid_preds = self.extract_feat(batch_inputs_dict,
+                                           batch_data_samples)
+        # For indoor datasets ImVoxelNet uses ImVoxelHead that handles
+        # mask of visible voxels.
+        if self.coord_type == 'DEPTH':
+            x += (valid_preds, )
+        results = self.bbox_head.forward(x)
+        return results
+
+    def convert_to_datasample(
+        self,
+        data_samples: SampleList,
+        data_instances_3d: OptInstanceList = None,
+        data_instances_2d: OptInstanceList = None,
+    ) -> SampleList:
+        """Convert results list to `Det3DDataSample`.
+
+        Subclasses could override it to be compatible for some multi-modality
+        3D detectors.
+
+        Args:
+            data_samples (list[:obj:`Det3DDataSample`]): The input data.
+            data_instances_3d (list[:obj:`InstanceData`], optional): 3D
+                Detection results of each sample.
+            data_instances_2d (list[:obj:`InstanceData`], optional): 2D
+                Detection results of each sample.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` normally
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels_3d (Tensor): Labels of 3D bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+              (num_instances, C) where C >=7.
+
+            When there are image prediction in some models, it should
+            contains  `pred_instances`, And the ``pred_instances`` normally
+            contains following keys.
+
+            - scores (Tensor): Classification scores of image, has a shape
+              (num_instance, )
+            - labels (Tensor): Predict Labels of 2D bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Contains a tensor with shape
+              (num_instances, 4).
+        """
+
+        assert (data_instances_2d is not None) or \
+               (data_instances_3d is not None),\
+               'please pass at least one type of data_samples'
+
+        if data_instances_2d is None:
+            data_instances_2d = [
+                InstanceData() for _ in range(len(data_instances_3d))
+            ]
+        if data_instances_3d is None:
+            data_instances_3d = [
+                InstanceData() for _ in range(len(data_instances_2d))
+            ]
+
+        for i, data_sample in enumerate(data_samples):
+            data_sample.pred_instances_3d = data_instances_3d[i]
+            data_sample.pred_instances = data_instances_2d[i]
+        return data_samples
diff --git a/mmde/mmdet3d/models/detectors/mink_single_stage.py b/mmde/mmdet3d/models/detectors/mink_single_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3c9f57aec415d4a0762b34c9391da47087dd1fd
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/mink_single_stage.py
@@ -0,0 +1,136 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapted from https://github.com/SamsungLabs/fcaf3d/blob/master/mmdet3d/models/detectors/single_stage_sparse.py # noqa
+from typing import Dict, List, OrderedDict, Tuple, Union
+
+import torch
+from torch import Tensor
+
+try:
+    import MinkowskiEngine as ME
+except ImportError:
+    # Please follow get_started.md to install MinkowskiEngine.
+    ME = None
+    pass
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStage3DDetector
+
+
+@MODELS.register_module()
+class MinkSingleStage3DDetector(SingleStage3DDetector):
+    r"""MinkSingleStage3DDetector.
+
+    This class serves as a base class for single-stage 3D detectors based on
+    MinkowskiEngine `GSDN <https://arxiv.org/abs/2006.12356>`_.
+
+
+    Args:
+        backbone (dict): Config dict of detector's backbone.
+        neck (dict, optional): Config dict of neck. Defaults to None.
+        bbox_head (dict, optional): Config dict of box head. Defaults to None.
+        train_cfg (dict, optional): Config dict of training hyper-parameters.
+            Defaults to None.
+        test_cfg (dict, optional): Config dict of test hyper-parameters.
+            Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`BaseDataPreprocessor`.  it usually includes,
+                ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        init_cfg (dict or ConfigDict, optional): the config to control the
+            initialization. Defaults to None.
+    """
+    _version = 2
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+        if ME is None:
+            raise ImportError(
+                'Please follow `get_started.md` to install MinkowskiEngine.`')
+        self.voxel_size = bbox_head['voxel_size']
+
+    def extract_feat(
+        self, batch_inputs_dict: Dict[str, Tensor]
+    ) -> Union[Tuple[torch.Tensor], Dict[str, Tensor]]:
+        """Directly extract features from the backbone+neck.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which includes
+                'points' keys.
+
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+
+        Returns:
+            tuple[Tensor] | dict:  For outside 3D object detection, we
+                typically obtain a tuple of features from the backbone + neck,
+                and for inside 3D object detection, usually a dict containing
+                features will be obtained.
+        """
+        points = batch_inputs_dict['points']
+
+        coordinates, features = ME.utils.batch_sparse_collate(
+            [(p[:, :3] / self.voxel_size, p[:, 3:]) for p in points],
+            device=points[0].device)
+        x = ME.SparseTensor(coordinates=coordinates, features=features)
+
+        x = self.backbone(x)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str,
+                              local_metadata: Dict, strict: bool,
+                              missing_keys: List[str],
+                              unexpected_keys: List[str],
+                              error_msgs: List[str]) -> None:
+        """Load checkpoint.
+
+        Args:
+            state_dict (dict): a dict containing parameters and
+                persistent buffers.
+            prefix (str): the prefix for parameters and buffers used in this
+                module
+            local_metadata (dict): a dict containing the metadata for this
+                module.
+            strict (bool): whether to strictly enforce that the keys in
+                :attr:`state_dict` with :attr:`prefix` match the names of
+                parameters and buffers in this module
+            missing_keys (list of str): if ``strict=True``, add missing keys to
+                this list
+            unexpected_keys (list of str): if ``strict=True``, add unexpected
+                keys to this list
+            error_msgs (list of str): error messages should be added to this
+                list, and will be reported together in
+                :meth:`~torch.nn.Module.load_state_dict`
+        """
+        # The names of some parameters in FCAF3D has been changed
+        # since 2022.10.
+        version = local_metadata.get('version', None)
+        if (version is None or
+                version < 2) and self.__class__ is MinkSingleStage3DDetector:
+            convert_dict = {'head.': 'bbox_head.'}
+            state_dict_keys = list(state_dict.keys())
+            for k in state_dict_keys:
+                for ori_key, convert_key in convert_dict.items():
+                    if ori_key in k:
+                        convert_key = k.replace(ori_key, convert_key)
+                        state_dict[convert_key] = state_dict[k]
+                        del state_dict[k]
+
+        super(MinkSingleStage3DDetector,
+              self)._load_from_state_dict(state_dict, prefix, local_metadata,
+                                          strict, missing_keys,
+                                          unexpected_keys, error_msgs)
diff --git a/mmde/mmdet3d/models/detectors/multiview_dfm.py b/mmde/mmdet3d/models/detectors/multiview_dfm.py
new file mode 100644
index 0000000000000000000000000000000000000000..81446d30f2803fca76710cf7062033f896e24f06
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/multiview_dfm.py
@@ -0,0 +1,502 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.models.layers.fusion_layers.point_fusion import (point_sample,
+                                                              voxel_sample)
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures.bbox_3d.utils import get_lidar2img
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import ConfigType, OptConfigType, OptInstanceList
+from .dfm import DfM
+
+
+@MODELS.register_module()
+class MultiViewDfM(DfM):
+    r"""Waymo challenge solution of `MV-FCOS3D++
+    <https://arxiv.org/abs/2207.12716>`_.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone
+        config.
+        backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config.
+        neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
+        bbox_head_3d (:obj:`ConfigDict` or dict): The bbox head config.
+        voxel_size (:obj:`ConfigDict` or dict): The voxel size.
+        anchor_generator (:obj:`ConfigDict` or dict): The anchor generator
+            config.
+        neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config
+            for 2D object detection. Defaults to None.
+        bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox
+            head config for 2D object detection. Defaults to None.
+        depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth
+            head config for depth estimation in fov space. Defaults to None.
+        depth_head (:obj:`ConfigDict` or dict, optional): The depth head
+            config for depth estimation in 3D voxel projected to fov space .
+        train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
+            training hyper-parameters. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
+            hyper-parameters. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`BaseDataPreprocessor`.  it usually includes,
+                ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        valid_sample (bool): Whether to filter invalid points in view
+            transformation. Defaults to True.
+        temporal_aggregate (str): Key to determine the aggregation way in
+            temporal fusion. Defaults to 'concat'.
+        transform_depth (bool): Key to determine the transformation of depth.
+            Defaults to True.
+        init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
+            config. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 backbone_stereo: ConfigType,
+                 backbone_3d: ConfigType,
+                 neck_3d: ConfigType,
+                 bbox_head_3d: ConfigType,
+                 voxel_size: ConfigType,
+                 anchor_generator: ConfigType,
+                 neck_2d: ConfigType = None,
+                 bbox_head_2d: ConfigType = None,
+                 depth_head_2d: ConfigType = None,
+                 depth_head: ConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 valid_sample: bool = True,
+                 temporal_aggregate: str = 'mean',
+                 transform_depth: bool = True,
+                 init_cfg: OptConfigType = None):
+        super().__init__(
+            data_preprocessor=data_preprocessor,
+            backbone=backbone,
+            neck=neck,
+            backbone_stereo=backbone_stereo,
+            backbone_3d=backbone_3d,
+            neck_3d=neck_3d,
+            bbox_head_3d=bbox_head_3d,
+            neck_2d=neck_2d,
+            bbox_head_2d=bbox_head_2d,
+            depth_head_2d=depth_head_2d,
+            depth_head=depth_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        self.voxel_size = voxel_size
+        self.voxel_range = anchor_generator['ranges'][0]
+        self.n_voxels = [
+            round((self.voxel_range[3] - self.voxel_range[0]) /
+                  self.voxel_size[0]),
+            round((self.voxel_range[4] - self.voxel_range[1]) /
+                  self.voxel_size[1]),
+            round((self.voxel_range[5] - self.voxel_range[2]) /
+                  self.voxel_size[2])
+        ]
+        self.anchor_generator = TASK_UTILS.build(anchor_generator)
+        self.valid_sample = valid_sample
+        self.temporal_aggregate = temporal_aggregate
+        self.transform_depth = transform_depth
+
+    def extract_feat(self, batch_inputs_dict: dict,
+                     batch_data_samples: SampleList):
+        """Extract 3d features from the backbone -> fpn -> 3d projection.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+
+                    - imgs (torch.Tensor, optional): Image of each sample.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            torch.Tensor: of shape (N, C_out, N_x, N_y, N_z)
+        """
+        # TODO: Nt means the number of frames temporally
+        # num_views means the number of views of a frame
+        img = batch_inputs_dict['imgs']
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        batch_size, _, C_in, H, W = img.shape
+        num_views = batch_img_metas[0]['num_views']
+        num_ref_frames = batch_img_metas[0]['num_ref_frames']
+        if num_ref_frames > 0:
+            num_frames = num_ref_frames + 1
+        else:
+            num_frames = 1
+        input_shape = img.shape[-2:]
+        # NOTE: input_shape is the largest pad_shape of the batch of images
+        for img_meta in batch_img_metas:
+            img_meta.update(input_shape=input_shape)
+        if num_ref_frames > 0:
+            cur_imgs = img[:, :num_views].reshape(-1, C_in, H, W)
+            prev_imgs = img[:, num_views:].reshape(-1, C_in, H, W)
+            cur_feats = self.backbone(cur_imgs)
+            cur_feats = self.neck(cur_feats)[0]
+            with torch.no_grad():
+                prev_feats = self.backbone(prev_imgs)
+                prev_feats = self.neck(prev_feats)[0]
+            _, C_feat, H_feat, W_feat = cur_feats.shape
+            cur_feats = cur_feats.view(batch_size, -1, C_feat, H_feat, W_feat)
+            prev_feats = prev_feats.view(batch_size, -1, C_feat, H_feat,
+                                         W_feat)
+            batch_feats = torch.cat([cur_feats, prev_feats], dim=1)
+        else:
+            batch_imgs = img.view(-1, C_in, H, W)
+            batch_feats = self.backbone(batch_imgs)
+            # TODO: support SPP module neck
+            batch_feats = self.neck(batch_feats)[0]
+            _, C_feat, H_feat, W_feat = batch_feats.shape
+            batch_feats = batch_feats.view(batch_size, -1, C_feat, H_feat,
+                                           W_feat)
+        # transform the feature to voxel & stereo space
+        transform_feats = self.feature_transformation(batch_feats,
+                                                      batch_img_metas,
+                                                      num_views, num_frames)
+        if self.with_depth_head_2d:
+            transform_feats += (batch_feats[:, :num_views], )
+        return transform_feats
+
+    def feature_transformation(self, batch_feats, batch_img_metas, num_views,
+                               num_frames):
+        """Feature transformation from perspective view to BEV.
+
+        Args:
+            batch_feats (torch.Tensor): Perspective view features of shape
+                (batch_size, num_views, C, H, W).
+            batch_img_metas (list[dict]): Image meta information. Each element
+                corresponds to a group of images. len(img_metas) == B.
+            num_views (int): Number of views.
+            num_frames (int): Number of consecutive frames.
+
+        Returns:
+            tuple[torch.Tensor]: Volume features and (optionally) stereo \
+            features.
+        """
+        # TODO: support more complicated 2D feature sampling
+        points = self.anchor_generator.grid_anchors(
+            [self.n_voxels[::-1]], device=batch_feats.device)[0][:, :3]
+        volumes = []
+        img_scale_factors = []
+        img_flips = []
+        img_crop_offsets = []
+        for feature, img_meta in zip(batch_feats, batch_img_metas):
+
+            # TODO: remove feature sampling from back
+            # TODO: support different scale_factors/flip/crop_offset for
+            # different views
+            frame_volume = []
+            frame_valid_nums = []
+            for frame_idx in range(num_frames):
+                volume = []
+                valid_flags = []
+                if isinstance(img_meta['img_shape'], list):
+                    img_shape = img_meta['img_shape'][frame_idx][:2]
+                else:
+                    img_shape = img_meta['img_shape'][:2]
+
+                for view_idx in range(num_views):
+
+                    sample_idx = frame_idx * num_views + view_idx
+
+                    if 'scale_factor' in img_meta:
+                        img_scale_factor = img_meta['scale_factor'][sample_idx]
+                        if isinstance(img_scale_factor, np.ndarray) and \
+                                len(img_meta['scale_factor']) >= 2:
+                            img_scale_factor = (
+                                points.new_tensor(img_scale_factor[:2]))
+                        else:
+                            img_scale_factor = (
+                                points.new_tensor(img_scale_factor))
+                    else:
+                        img_scale_factor = (1)
+                    img_flip = img_meta['flip'][sample_idx] \
+                        if 'flip' in img_meta.keys() else False
+                    img_crop_offset = (
+                        points.new_tensor(
+                            img_meta['img_crop_offset'][sample_idx])
+                        if 'img_crop_offset' in img_meta.keys() else 0)
+                    lidar2cam = points.new_tensor(
+                        img_meta['lidar2cam'][sample_idx])
+                    cam2img = points.new_tensor(
+                        img_meta['ori_cam2img'][sample_idx])
+                    # align the precision, the tensor is converted to float32
+                    lidar2img = get_lidar2img(cam2img.double(),
+                                              lidar2cam.double())
+                    lidar2img = lidar2img.float()
+
+                    sample_results = point_sample(
+                        img_meta,
+                        img_features=feature[sample_idx][None, ...],
+                        points=points,
+                        proj_mat=lidar2img,
+                        coord_type='LIDAR',
+                        img_scale_factor=img_scale_factor,
+                        img_crop_offset=img_crop_offset,
+                        img_flip=img_flip,
+                        img_pad_shape=img_meta['input_shape'],
+                        img_shape=img_shape,
+                        aligned=False,
+                        valid_flag=self.valid_sample)
+                    if self.valid_sample:
+                        volume.append(sample_results[0])
+                        valid_flags.append(sample_results[1])
+                    else:
+                        volume.append(sample_results)
+                    # TODO: save valid flags, more reasonable feat fusion
+                if self.valid_sample:
+                    valid_nums = torch.stack(
+                        valid_flags, dim=0).sum(0)  # (N, )
+                    volume = torch.stack(volume, dim=0).sum(0)
+                    valid_mask = valid_nums > 0
+                    volume[~valid_mask] = 0
+                    frame_valid_nums.append(valid_nums)
+                else:
+                    volume = torch.stack(volume, dim=0).mean(0)
+                frame_volume.append(volume)
+
+            img_scale_factors.append(img_scale_factor)
+            img_flips.append(img_flip)
+            img_crop_offsets.append(img_crop_offset)
+
+            if self.valid_sample:
+                if self.temporal_aggregate == 'mean':
+                    frame_volume = torch.stack(frame_volume, dim=0).sum(0)
+                    frame_valid_nums = torch.stack(
+                        frame_valid_nums, dim=0).sum(0)
+                    frame_valid_mask = frame_valid_nums > 0
+                    frame_volume[~frame_valid_mask] = 0
+                    frame_volume = frame_volume / torch.clamp(
+                        frame_valid_nums[:, None], min=1)
+                elif self.temporal_aggregate == 'concat':
+                    frame_valid_nums = torch.stack(frame_valid_nums, dim=1)
+                    frame_volume = torch.stack(frame_volume, dim=1)
+                    frame_valid_mask = frame_valid_nums > 0
+                    frame_volume[~frame_valid_mask] = 0
+                    frame_volume = (frame_volume / torch.clamp(
+                        frame_valid_nums[:, :, None], min=1)).flatten(
+                            start_dim=1, end_dim=2)
+            else:
+                frame_volume = torch.stack(frame_volume, dim=0).mean(0)
+            volumes.append(
+                frame_volume.reshape(self.n_voxels[::-1] + [-1]).permute(
+                    3, 2, 1, 0))
+        volume_feat = torch.stack(volumes)  # (B, C, N_x, N_y, N_z)
+        if self.with_backbone_3d:
+            outputs = self.backbone_3d(volume_feat)
+            volume_feat = outputs[0]
+            if self.backbone_3d.output_bev:
+                # use outputs[0] if len(outputs) == 1
+                # use outputs[1] if len(outputs) == 2
+                # TODO: unify the output formats
+                bev_feat = outputs[-1]
+        # grid_sample stereo features from the volume feature
+        # TODO: also support temporal modeling for depth head
+        if self.with_depth_head:
+            batch_stereo_feats = []
+            for batch_idx in range(volume_feat.shape[0]):
+                stereo_feat = []
+                for view_idx in range(num_views):
+                    img_scale_factor = img_scale_factors[batch_idx] \
+                        if self.transform_depth else points.new_tensor(
+                            [1., 1.])
+                    img_crop_offset = img_crop_offsets[batch_idx] \
+                        if self.transform_depth else points.new_tensor(
+                            [0., 0.])
+                    img_flip = img_flips[batch_idx] if self.transform_depth \
+                        else False
+                    img_pad_shape = img_meta['input_shape'] \
+                        if self.transform_depth else img_meta['ori_shape'][:2]
+                    lidar2cam = points.new_tensor(
+                        batch_img_metas[batch_idx]['lidar2cam'][view_idx])
+                    cam2img = points.new_tensor(
+                        img_meta[batch_idx]['lidar2cam'][view_idx])
+                    proj_mat = torch.matmul(cam2img, lidar2cam)
+                    stereo_feat.append(
+                        voxel_sample(
+                            volume_feat[batch_idx][None],
+                            voxel_range=self.voxel_range,
+                            voxel_size=self.voxel_size,
+                            depth_samples=volume_feat.new_tensor(
+                                self.depth_samples),
+                            proj_mat=proj_mat,
+                            downsample_factor=self.depth_head.
+                            downsample_factor,
+                            img_scale_factor=img_scale_factor,
+                            img_crop_offset=img_crop_offset,
+                            img_flip=img_flip,
+                            img_pad_shape=img_pad_shape,
+                            img_shape=batch_img_metas[batch_idx]['img_shape']
+                            [view_idx][:2],
+                            aligned=True))  # TODO: study the aligned setting
+                batch_stereo_feats.append(torch.cat(stereo_feat))
+            # cat (N, C, D, H, W) -> (B*N, C, D, H, W)
+            batch_stereo_feats = torch.cat(batch_stereo_feats)
+        if self.with_neck_3d:
+            if self.with_backbone_3d and self.backbone_3d.output_bev:
+                spatial_features = self.neck_3d(bev_feat)
+                # TODO: unify the outputs of neck_3d
+                volume_feat = spatial_features[1]
+            else:
+                volume_feat = self.neck_3d(volume_feat)[0]
+        # TODO: unify the output format of neck_3d
+        transform_feats = (volume_feat, )
+        if self.with_depth_head:
+            transform_feats += (batch_stereo_feats, )
+        return transform_feats
+
+    def loss(self, batch_inputs: Tensor,
+             batch_data_samples: SampleList) -> Union[dict, tuple]:
+        """Calculate losses from a batch of inputs dict and data samples.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'img' keys.
+
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        feats = self.extract_feat(batch_inputs, batch_data_samples)
+        bev_feat = feats[0]
+        losses = self.bbox_head_3d.loss([bev_feat], batch_data_samples)
+        return losses
+
+    def predict(self, batch_inputs: Tensor,
+                batch_data_samples: SampleList) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'imgs' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input samples. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instance, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+                (num_instances, C) where C >=7.
+        """
+        feats = self.extract_feat(batch_inputs, batch_data_samples)
+        bev_feat = feats[0]
+        results_list = self.bbox_head_3d.predict([bev_feat],
+                                                 batch_data_samples)
+        predictions = self.add_pred_to_datasample(batch_data_samples,
+                                                  results_list)
+        return predictions
+
+    def _forward(self,
+                 batch_inputs: Tensor,
+                 batch_data_samples: SampleList = None):
+        """Network forward process.
+
+        Usually includes backbone, neck and head forward without any post-
+        processing.
+        """
+        feats = self.extract_feat(batch_inputs, batch_data_samples)
+        bev_feat = feats[0]
+        self.bbox_head.forward(bev_feat, batch_data_samples)
+
+    def add_pred_to_datasample(
+        self,
+        data_samples: SampleList,
+        data_instances_3d: OptInstanceList = None,
+        data_instances_2d: OptInstanceList = None,
+    ) -> SampleList:
+        """Convert results list to `Det3DDataSample`.
+
+        Subclasses could override it to be compatible for some multi-modality
+        3D detectors.
+
+        Args:
+            data_samples (list[:obj:`Det3DDataSample`]): The input data.
+            data_instances_3d (list[:obj:`InstanceData`], optional): 3D
+                Detection results of each sample.
+            data_instances_2d (list[:obj:`InstanceData`], optional): 2D
+                Detection results of each sample.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` normally
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels_3d (Tensor): Labels of 3D bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+              (num_instances, C) where C >=7.
+
+            When there are image prediction in some models, it should
+            contains  `pred_instances`, And the ``pred_instances`` normally
+            contains following keys.
+
+            - scores (Tensor): Classification scores of image, has a shape
+              (num_instance, )
+            - labels (Tensor): Predict Labels of 2D bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Contains a tensor with shape
+              (num_instances, 4).
+        """
+
+        assert (data_instances_2d is not None) or \
+               (data_instances_3d is not None),\
+               'please pass at least one type of data_samples'
+
+        if data_instances_2d is None:
+            data_instances_2d = [
+                InstanceData() for _ in range(len(data_instances_3d))
+            ]
+        if data_instances_3d is None:
+            data_instances_3d = [
+                InstanceData() for _ in range(len(data_instances_2d))
+            ]
+
+        for i, data_sample in enumerate(data_samples):
+            data_sample.pred_instances_3d = data_instances_3d[i]
+            data_sample.pred_instances = data_instances_2d[i]
+        return data_samples
+
+    def aug_test(self, imgs, img_metas, **kwargs):
+        """Test with augmentations.
+
+        Args:
+            imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W).
+            img_metas (list): Image metas.
+
+        Returns:
+            list[dict]: Predicted 3d boxes.
+        """
+        raise NotImplementedError
diff --git a/mmde/mmdet3d/models/detectors/mvx_faster_rcnn.py b/mmde/mmdet3d/models/detectors/mvx_faster_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..de858d07f2e1c929c25d8a70ad14167f90431e24
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/mvx_faster_rcnn.py
@@ -0,0 +1,56 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Sequence
+
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from .mvx_two_stage import MVXTwoStageDetector
+
+
+@MODELS.register_module()
+class MVXFasterRCNN(MVXTwoStageDetector):
+    """Multi-modality VoxelNet using Faster R-CNN."""
+
+    def __init__(self, **kwargs):
+        super(MVXFasterRCNN, self).__init__(**kwargs)
+
+
+@MODELS.register_module()
+class DynamicMVXFasterRCNN(MVXTwoStageDetector):
+    """Multi-modality VoxelNet using Faster R-CNN and dynamic voxelization."""
+
+    def __init__(self, **kwargs):
+        super(DynamicMVXFasterRCNN, self).__init__(**kwargs)
+
+    def extract_pts_feat(
+            self,
+            voxel_dict: Dict[str, Tensor],
+            points: Optional[List[Tensor]] = None,
+            img_feats: Optional[Sequence[Tensor]] = None,
+            batch_input_metas: Optional[List[dict]] = None
+    ) -> Sequence[Tensor]:
+        """Extract features of points.
+
+        Args:
+            voxel_dict(Dict[str, Tensor]): Dict of voxelization infos.
+            points (List[tensor], optional):  Point cloud of multiple inputs.
+            img_feats (list[Tensor], tuple[tensor], optional): Features from
+                image backbone.
+            batch_input_metas (list[dict], optional): The meta information
+                of multiple samples. Defaults to True.
+
+        Returns:
+            Sequence[tensor]: points features of multiple inputs
+            from backbone or neck.
+        """
+        if not self.with_pts_bbox:
+            return None
+        voxel_features, feature_coors = self.pts_voxel_encoder(
+            voxel_dict['voxels'], voxel_dict['coors'], points, img_feats,
+            batch_input_metas)
+        batch_size = voxel_dict['coors'][-1, 0] + 1
+        x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size)
+        x = self.pts_backbone(x)
+        if self.with_pts_neck:
+            x = self.pts_neck(x)
+        return x
diff --git a/mmde/mmdet3d/models/detectors/mvx_two_stage.py b/mmde/mmdet3d/models/detectors/mvx_two_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..537d82144bb11255e0be7d22e8b570142f662d33
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/mvx_two_stage.py
@@ -0,0 +1,407 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Dict, List, Optional, Sequence
+
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from .base import Base3DDetector
+
+
+@MODELS.register_module()
+class MVXTwoStageDetector(Base3DDetector):
+    """Base class of Multi-modality VoxelNet.
+
+    Args:
+        pts_voxel_encoder (dict, optional): Point voxelization
+            encoder layer. Defaults to None.
+        pts_middle_encoder (dict, optional): Middle encoder layer
+            of points cloud modality. Defaults to None.
+        pts_fusion_layer (dict, optional): Fusion layer.
+            Defaults to None.
+        img_backbone (dict, optional): Backbone of extracting
+            images feature. Defaults to None.
+        pts_backbone (dict, optional): Backbone of extracting
+            points features. Defaults to None.
+        img_neck (dict, optional): Neck of extracting
+            image features. Defaults to None.
+        pts_neck (dict, optional): Neck of extracting
+            points features. Defaults to None.
+        pts_bbox_head (dict, optional): Bboxes head of
+            point cloud modality. Defaults to None.
+        img_roi_head (dict, optional): RoI head of image
+            modality. Defaults to None.
+        img_rpn_head (dict, optional): RPN head of image
+            modality. Defaults to None.
+        train_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        test_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`Det3DDataPreprocessor`. Defaults to None.
+    """
+
+    def __init__(self,
+                 pts_voxel_encoder: Optional[dict] = None,
+                 pts_middle_encoder: Optional[dict] = None,
+                 pts_fusion_layer: Optional[dict] = None,
+                 img_backbone: Optional[dict] = None,
+                 pts_backbone: Optional[dict] = None,
+                 img_neck: Optional[dict] = None,
+                 pts_neck: Optional[dict] = None,
+                 pts_bbox_head: Optional[dict] = None,
+                 img_roi_head: Optional[dict] = None,
+                 img_rpn_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 data_preprocessor: Optional[dict] = None,
+                 **kwargs):
+        super(MVXTwoStageDetector, self).__init__(
+            init_cfg=init_cfg, data_preprocessor=data_preprocessor, **kwargs)
+
+        if pts_voxel_encoder:
+            self.pts_voxel_encoder = MODELS.build(pts_voxel_encoder)
+        if pts_middle_encoder:
+            self.pts_middle_encoder = MODELS.build(pts_middle_encoder)
+        if pts_backbone:
+            self.pts_backbone = MODELS.build(pts_backbone)
+        if pts_fusion_layer:
+            self.pts_fusion_layer = MODELS.build(pts_fusion_layer)
+        if pts_neck is not None:
+            self.pts_neck = MODELS.build(pts_neck)
+        if pts_bbox_head:
+            pts_train_cfg = train_cfg.pts if train_cfg else None
+            pts_bbox_head.update(train_cfg=pts_train_cfg)
+            pts_test_cfg = test_cfg.pts if test_cfg else None
+            pts_bbox_head.update(test_cfg=pts_test_cfg)
+            self.pts_bbox_head = MODELS.build(pts_bbox_head)
+
+        if img_backbone:
+            self.img_backbone = MODELS.build(img_backbone)
+        if img_neck is not None:
+            self.img_neck = MODELS.build(img_neck)
+        if img_rpn_head is not None:
+            self.img_rpn_head = MODELS.build(img_rpn_head)
+        if img_roi_head is not None:
+            self.img_roi_head = MODELS.build(img_roi_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    @property
+    def with_img_shared_head(self):
+        """bool: Whether the detector has a shared head in image branch."""
+        return hasattr(self,
+                       'img_shared_head') and self.img_shared_head is not None
+
+    @property
+    def with_pts_bbox(self):
+        """bool: Whether the detector has a 3D box head."""
+        return hasattr(self,
+                       'pts_bbox_head') and self.pts_bbox_head is not None
+
+    @property
+    def with_img_bbox(self):
+        """bool: Whether the detector has a 2D image box head."""
+        return hasattr(self,
+                       'img_bbox_head') and self.img_bbox_head is not None
+
+    @property
+    def with_img_backbone(self):
+        """bool: Whether the detector has a 2D image backbone."""
+        return hasattr(self, 'img_backbone') and self.img_backbone is not None
+
+    @property
+    def with_pts_backbone(self):
+        """bool: Whether the detector has a 3D backbone."""
+        return hasattr(self, 'pts_backbone') and self.pts_backbone is not None
+
+    @property
+    def with_fusion(self):
+        """bool: Whether the detector has a fusion layer."""
+        return hasattr(self,
+                       'pts_fusion_layer') and self.fusion_layer is not None
+
+    @property
+    def with_img_neck(self):
+        """bool: Whether the detector has a neck in image branch."""
+        return hasattr(self, 'img_neck') and self.img_neck is not None
+
+    @property
+    def with_pts_neck(self):
+        """bool: Whether the detector has a neck in 3D detector branch."""
+        return hasattr(self, 'pts_neck') and self.pts_neck is not None
+
+    @property
+    def with_img_rpn(self):
+        """bool: Whether the detector has a 2D RPN in image detector branch."""
+        return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None
+
+    @property
+    def with_img_roi_head(self):
+        """bool: Whether the detector has a RoI Head in image branch."""
+        return hasattr(self, 'img_roi_head') and self.img_roi_head is not None
+
+    @property
+    def with_voxel_encoder(self):
+        """bool: Whether the detector has a voxel encoder."""
+        return hasattr(self,
+                       'voxel_encoder') and self.voxel_encoder is not None
+
+    @property
+    def with_middle_encoder(self):
+        """bool: Whether the detector has a middle encoder."""
+        return hasattr(self,
+                       'middle_encoder') and self.middle_encoder is not None
+
+    def _forward(self):
+        pass
+
+    def extract_img_feat(self, img: Tensor, input_metas: List[dict]) -> dict:
+        """Extract features of images."""
+        if self.with_img_backbone and img is not None:
+            input_shape = img.shape[-2:]
+            # update real input shape of each single img
+            for img_meta in input_metas:
+                img_meta.update(input_shape=input_shape)
+
+            if img.dim() == 5 and img.size(0) == 1:
+                img.squeeze_()
+            elif img.dim() == 5 and img.size(0) > 1:
+                B, N, C, H, W = img.size()
+                img = img.view(B * N, C, H, W)
+            img_feats = self.img_backbone(img)
+        else:
+            return None
+        if self.with_img_neck:
+            img_feats = self.img_neck(img_feats)
+        return img_feats
+
+    def extract_pts_feat(
+            self,
+            voxel_dict: Dict[str, Tensor],
+            points: Optional[List[Tensor]] = None,
+            img_feats: Optional[Sequence[Tensor]] = None,
+            batch_input_metas: Optional[List[dict]] = None
+    ) -> Sequence[Tensor]:
+        """Extract features of points.
+
+        Args:
+            voxel_dict(Dict[str, Tensor]): Dict of voxelization infos.
+            points (List[tensor], optional):  Point cloud of multiple inputs.
+            img_feats (list[Tensor], tuple[tensor], optional): Features from
+                image backbone.
+            batch_input_metas (list[dict], optional): The meta information
+                of multiple samples. Defaults to True.
+
+        Returns:
+            Sequence[tensor]: points features of multiple inputs
+            from backbone or neck.
+        """
+        if not self.with_pts_bbox:
+            return None
+        voxel_features = self.pts_voxel_encoder(voxel_dict['voxels'],
+                                                voxel_dict['num_points'],
+                                                voxel_dict['coors'], img_feats,
+                                                batch_input_metas)
+        batch_size = voxel_dict['coors'][-1, 0] + 1
+        x = self.pts_middle_encoder(voxel_features, voxel_dict['coors'],
+                                    batch_size)
+        x = self.pts_backbone(x)
+        if self.with_pts_neck:
+            x = self.pts_neck(x)
+        return x
+
+    def extract_feat(self, batch_inputs_dict: dict,
+                     batch_input_metas: List[dict]) -> tuple:
+        """Extract features from images and points.
+
+        Args:
+            batch_inputs_dict (dict): Dict of batch inputs. It
+                contains
+
+                - points (List[tensor]):  Point cloud of multiple inputs.
+                - imgs (tensor): Image tensor with shape (B, C, H, W).
+            batch_input_metas (list[dict]): Meta information of multiple inputs
+                in a batch.
+
+        Returns:
+             tuple: Two elements in tuple arrange as
+             image features and point cloud features.
+        """
+        voxel_dict = batch_inputs_dict.get('voxels', None)
+        imgs = batch_inputs_dict.get('imgs', None)
+        points = batch_inputs_dict.get('points', None)
+        img_feats = self.extract_img_feat(imgs, batch_input_metas)
+        pts_feats = self.extract_pts_feat(
+            voxel_dict,
+            points=points,
+            img_feats=img_feats,
+            batch_input_metas=batch_input_metas)
+        return (img_feats, pts_feats)
+
+    def loss(self, batch_inputs_dict: Dict[List, torch.Tensor],
+             batch_data_samples: List[Det3DDataSample],
+             **kwargs) -> List[Det3DDataSample]:
+        """
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' and `imgs` keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor): Tensor of batch images, has shape
+                  (B, C, H ,W)
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, .
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+
+        """
+
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        img_feats, pts_feats = self.extract_feat(batch_inputs_dict,
+                                                 batch_input_metas)
+        losses = dict()
+        if pts_feats:
+            losses_pts = self.pts_bbox_head.loss(pts_feats, batch_data_samples,
+                                                 **kwargs)
+            losses.update(losses_pts)
+        if img_feats:
+            losses_img = self.loss_imgs(img_feats, batch_data_samples)
+            losses.update(losses_img)
+        return losses
+
+    def loss_imgs(self, x: List[Tensor],
+                  batch_data_samples: List[Det3DDataSample], **kwargs):
+        """Forward function for image branch.
+
+        This function works similar to the forward function of Faster R-CNN.
+
+        Args:
+            x (list[torch.Tensor]): Image features of shape (B, C, H, W)
+                of multiple levels.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, .
+
+        Returns:
+            dict: Losses of each branch.
+        """
+        losses = dict()
+        # RPN forward and loss
+        if self.with_img_rpn:
+            proposal_cfg = self.test_cfg.rpn
+            rpn_data_samples = copy.deepcopy(batch_data_samples)
+            # set cat_id of gt_labels to 0 in RPN
+            for data_sample in rpn_data_samples:
+                data_sample.gt_instances.labels = \
+                    torch.zeros_like(data_sample.gt_instances.labels)
+            rpn_losses, rpn_results_list = self.img_rpn_head.loss_and_predict(
+                x, rpn_data_samples, proposal_cfg=proposal_cfg, **kwargs)
+            # avoid get same name with roi_head loss
+            keys = rpn_losses.keys()
+            for key in keys:
+                if 'loss' in key and 'rpn' not in key:
+                    rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
+            losses.update(rpn_losses)
+
+        else:
+            if 'proposals' in batch_data_samples[0]:
+                # use pre-defined proposals in InstanceData
+                # for the second stage
+                # to extract ROI features.
+                rpn_results_list = [
+                    data_sample.proposals for data_sample in batch_data_samples
+                ]
+            else:
+                rpn_results_list = None
+        # bbox head forward and loss
+        if self.with_img_bbox:
+            roi_losses = self.img_roi_head.loss(x, rpn_results_list,
+                                                batch_data_samples, **kwargs)
+            losses.update(roi_losses)
+        return losses
+
+    def predict_imgs(self,
+                     x: List[Tensor],
+                     batch_data_samples: List[Det3DDataSample],
+                     rescale: bool = True,
+                     **kwargs) -> InstanceData:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            x (List[Tensor]): Image features from FPN.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+        """
+
+        if batch_data_samples[0].get('proposals', None) is None:
+            rpn_results_list = self.img_rpn_head.predict(
+                x, batch_data_samples, rescale=False)
+        else:
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+        results_list = self.img_roi_head.predict(
+            x, rpn_results_list, batch_data_samples, rescale=rescale, **kwargs)
+        return results_list
+
+    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[Det3DDataSample]:
+        """Forward of testing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input sample. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
+                contains a tensor with shape (num_instances, 7).
+        """
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        img_feats, pts_feats = self.extract_feat(batch_inputs_dict,
+                                                 batch_input_metas)
+        if pts_feats and self.with_pts_bbox:
+            results_list_3d = self.pts_bbox_head.predict(
+                pts_feats, batch_data_samples, **kwargs)
+        else:
+            results_list_3d = None
+
+        if img_feats and self.with_img_bbox:
+            # TODO check this for camera modality
+            results_list_2d = self.predict_imgs(img_feats, batch_data_samples,
+                                                **kwargs)
+        else:
+            results_list_2d = None
+
+        detsamples = self.add_pred_to_datasample(batch_data_samples,
+                                                 results_list_3d,
+                                                 results_list_2d)
+        return detsamples
diff --git a/mmde/mmdet3d/models/detectors/parta2.py b/mmde/mmdet3d/models/detectors/parta2.py
new file mode 100644
index 0000000000000000000000000000000000000000..9011abd232ecfd641a47f636cf603520c7733f7e
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/parta2.py
@@ -0,0 +1,66 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional
+
+from mmdet3d.registry import MODELS
+from .two_stage import TwoStage3DDetector
+
+
+@MODELS.register_module()
+class PartA2(TwoStage3DDetector):
+    r"""Part-A2 detector.
+
+    Please refer to the `paper <https://arxiv.org/abs/1907.03670>`_
+    """
+
+    def __init__(self,
+                 voxel_encoder: dict,
+                 middle_encoder: dict,
+                 backbone: dict,
+                 neck: dict = None,
+                 rpn_head: dict = None,
+                 roi_head: dict = None,
+                 train_cfg: dict = None,
+                 test_cfg: dict = None,
+                 init_cfg: dict = None,
+                 data_preprocessor: Optional[dict] = None):
+        super(PartA2, self).__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
+        self.voxel_encoder = MODELS.build(voxel_encoder)
+        self.middle_encoder = MODELS.build(middle_encoder)
+
+    def extract_feat(self, batch_inputs_dict: Dict) -> Dict:
+        """Directly extract features from the backbone+neck.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'imgs' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor, optional): Image of each sample.
+
+        Returns:
+            tuple[Tensor] | dict:  For outside 3D object detection, we
+                typically obtain a tuple of features from the backbone + neck,
+                and for inside 3D object detection, usually a dict containing
+                features will be obtained.
+        """
+        voxel_dict = batch_inputs_dict['voxels']
+        voxel_features = self.voxel_encoder(voxel_dict['voxels'],
+                                            voxel_dict['num_points'],
+                                            voxel_dict['coors'])
+        batch_size = voxel_dict['coors'][-1, 0].item() + 1
+        feats_dict = self.middle_encoder(voxel_features, voxel_dict['coors'],
+                                         batch_size)
+        x = self.backbone(feats_dict['spatial_features'])
+        if self.with_neck:
+            neck_feats = self.neck(x)
+            feats_dict.update({'neck_feats': neck_feats})
+        feats_dict['voxels_dict'] = voxel_dict
+        return feats_dict
diff --git a/mmde/mmdet3d/models/detectors/point_rcnn.py b/mmde/mmdet3d/models/detectors/point_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..398d351e7782b389addf171eef4879bdcbbdb633
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/point_rcnn.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional
+
+import torch
+
+from mmdet3d.registry import MODELS
+from .two_stage import TwoStage3DDetector
+
+
+@MODELS.register_module()
+class PointRCNN(TwoStage3DDetector):
+    r"""PointRCNN detector.
+
+    Please refer to the `PointRCNN <https://arxiv.org/abs/1812.04244>`_
+
+    Args:
+        backbone (dict): Config dict of detector's backbone.
+        neck (dict, optional): Config dict of neck. Defaults to None.
+        rpn_head (dict, optional): Config of RPN head. Defaults to None.
+        roi_head (dict, optional): Config of ROI head. Defaults to None.
+        train_cfg (dict, optional): Train configs. Defaults to None.
+        test_cfg (dict, optional): Test configs. Defaults to None.
+        pretrained (str, optional): Model pretrained path. Defaults to None.
+        init_cfg (dict, optional): Config of initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: dict,
+                 neck: Optional[dict] = None,
+                 rpn_head: Optional[dict] = None,
+                 roi_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 data_preprocessor: Optional[dict] = None) -> None:
+        super(PointRCNN, self).__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
+
+    def extract_feat(self, batch_inputs_dict: Dict) -> Dict:
+        """Directly extract features from the backbone+neck.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'imgs' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor, optional): Image of each sample.
+
+        Returns:
+            dict: Features from the backbone+neck and raw points.
+        """
+        points = torch.stack(batch_inputs_dict['points'])
+        x = self.backbone(points)
+
+        if self.with_neck:
+            x = self.neck(x)
+        return dict(
+            fp_features=x['fp_features'].clone(),
+            fp_points=x['fp_xyz'].clone(),
+            raw_points=points)
diff --git a/mmde/mmdet3d/models/detectors/pv_rcnn.py b/mmde/mmdet3d/models/detectors/pv_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac03a6193472ae4a21375a1b4af023925f680e10
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/pv_rcnn.py
@@ -0,0 +1,232 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Optional
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import InstanceList
+from .two_stage import TwoStage3DDetector
+
+
+@MODELS.register_module()
+class PointVoxelRCNN(TwoStage3DDetector):
+    r"""PointVoxelRCNN detector.
+
+    Please refer to the `PointVoxelRCNN <https://arxiv.org/abs/1912.13192>`_.
+
+    Args:
+        voxel_encoder (dict): Point voxelization encoder layer.
+        middle_encoder (dict): Middle encoder layer
+            of points cloud modality.
+        backbone (dict): Backbone of extracting points features.
+        neck (dict, optional): Neck of extracting points features.
+            Defaults to None.
+        rpn_head (dict, optional): Config of RPN head. Defaults to None.
+        points_encoder (dict, optional): Points encoder to extract point-wise
+            features. Defaults to None.
+        roi_head (dict, optional): Config of ROI head. Defaults to None.
+        train_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        test_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`Det3DDataPreprocessor`. Defaults to None.
+    """
+
+    def __init__(self,
+                 voxel_encoder: dict,
+                 middle_encoder: dict,
+                 backbone: dict,
+                 neck: Optional[dict] = None,
+                 rpn_head: Optional[dict] = None,
+                 points_encoder: Optional[dict] = None,
+                 roi_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 data_preprocessor: Optional[dict] = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
+        self.voxel_encoder = MODELS.build(voxel_encoder)
+        self.middle_encoder = MODELS.build(middle_encoder)
+        self.points_encoder = MODELS.build(points_encoder)
+
+    def predict(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+                **kwargs) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'voxels' keys.
+
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+                    - voxels (dict[torch.Tensor]): Voxels of the batch sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input samples. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes_3d (Tensor): Contains a tensor with shape
+                    (num_instances, C) where C >=7.
+        """
+        feats_dict = self.extract_feat(batch_inputs_dict)
+        if self.with_rpn:
+            rpn_results_list = self.rpn_head.predict(feats_dict,
+                                                     batch_data_samples)
+        else:
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        # extrack points feats by points_encoder
+        points_feats_dict = self.extract_points_feat(batch_inputs_dict,
+                                                     feats_dict,
+                                                     rpn_results_list)
+
+        results_list_3d = self.roi_head.predict(points_feats_dict,
+                                                rpn_results_list,
+                                                batch_data_samples)
+
+        # connvert to Det3DDataSample
+        results_list = self.add_pred_to_datasample(batch_data_samples,
+                                                   results_list_3d)
+
+        return results_list
+
+    def extract_feat(self, batch_inputs_dict: dict) -> dict:
+        """Extract features from the input voxels.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'voxels' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - voxels (dict[torch.Tensor]): Voxels of the batch sample.
+
+        Returns:
+            dict: We typically obtain a dict of features from the backbone +
+                neck, it includes:
+
+                - spatial_feats (torch.Tensor): Spatial feats from middle
+                    encoder.
+                - multi_scale_3d_feats (list[torch.Tensor]): Multi scale
+                    middle feats from middle encoder.
+                - neck_feats (torch.Tensor): Neck feats from neck.
+        """
+        feats_dict = dict()
+        voxel_dict = batch_inputs_dict['voxels']
+        voxel_features = self.voxel_encoder(voxel_dict['voxels'],
+                                            voxel_dict['num_points'],
+                                            voxel_dict['coors'])
+        batch_size = voxel_dict['coors'][-1, 0].item() + 1
+        feats_dict['spatial_feats'], feats_dict[
+            'multi_scale_3d_feats'] = self.middle_encoder(
+                voxel_features, voxel_dict['coors'], batch_size)
+        x = self.backbone(feats_dict['spatial_feats'])
+        if self.with_neck:
+            neck_feats = self.neck(x)
+            feats_dict['neck_feats'] = neck_feats
+        return feats_dict
+
+    def extract_points_feat(self, batch_inputs_dict: dict, feats_dict: dict,
+                            rpn_results_list: InstanceList) -> dict:
+        """Extract point-wise features from the raw points and voxel features.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'voxels' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - voxels (dict[torch.Tensor]): Voxels of the batch sample.
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+
+        Returns:
+            dict: Contain Point-wise features, include:
+                - keypoints (torch.Tensor): Sampled key points.
+                - keypoint_features (torch.Tensor): Gather key points features
+                    from multi input.
+                - fusion_keypoint_features (torch.Tensor): Fusion
+                    keypoint_features by point_feature_fusion_layer.
+        """
+        return self.points_encoder(batch_inputs_dict, feats_dict,
+                                   rpn_results_list)
+
+    def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+             **kwargs):
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'voxels' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - voxels (dict[torch.Tensor]): Voxels of the batch sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        feats_dict = self.extract_feat(batch_inputs_dict)
+
+        losses = dict()
+
+        # RPN forward and loss
+        if self.with_rpn:
+            proposal_cfg = self.train_cfg.get('rpn_proposal',
+                                              self.test_cfg.rpn)
+            rpn_data_samples = copy.deepcopy(batch_data_samples)
+
+            rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict(
+                feats_dict,
+                rpn_data_samples,
+                proposal_cfg=proposal_cfg,
+                **kwargs)
+            # avoid get same name with roi_head loss
+            keys = rpn_losses.keys()
+            for key in keys:
+                if 'loss' in key and 'rpn' not in key:
+                    rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
+            losses.update(rpn_losses)
+        else:
+            # TODO: Not support currently, should have a check at Fast R-CNN
+            assert batch_data_samples[0].get('proposals', None) is not None
+            # use pre-defined proposals in InstanceData for the second stage
+            # to extract ROI features.
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        points_feats_dict = self.extract_points_feat(batch_inputs_dict,
+                                                     feats_dict,
+                                                     rpn_results_list)
+
+        roi_losses = self.roi_head.loss(points_feats_dict, rpn_results_list,
+                                        batch_data_samples)
+        losses.update(roi_losses)
+
+        return losses
diff --git a/mmde/mmdet3d/models/detectors/sassd.py b/mmde/mmdet3d/models/detectors/sassd.py
new file mode 100644
index 0000000000000000000000000000000000000000..76f130c77f503673b37900358200e9456b39d8c3
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/sassd.py
@@ -0,0 +1,98 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple, Union
+
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from ...structures.det3d_data_sample import SampleList
+from .single_stage import SingleStage3DDetector
+
+
+@MODELS.register_module()
+class SASSD(SingleStage3DDetector):
+    r"""`SASSD <https://github.com/skyhehe123/SA-SSD>` _ for 3D detection."""
+
+    def __init__(self,
+                 voxel_encoder: ConfigType,
+                 middle_encoder: ConfigType,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super(SASSD, self).__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+
+        self.voxel_encoder = MODELS.build(voxel_encoder)
+        self.middle_encoder = MODELS.build(middle_encoder)
+
+    def extract_feat(
+        self,
+        batch_inputs_dict: dict,
+        test_mode: bool = True
+    ) -> Union[Tuple[Tuple[Tensor], Tuple], Tuple[Tensor]]:
+        """Extract features from points.
+
+        Args:
+            batch_inputs_dict (dict): The batch inputs.
+            test_mode (bool, optional): Whether test mode. Defaults to True.
+
+        Returns:
+            Union[Tuple[Tuple[Tensor], Tuple], Tuple[Tensor]]: In test mode, it
+            returns the features of points from multiple levels. In training
+            mode, it returns the features of points from multiple levels and a
+            tuple containing the mean features of points and the targets of
+            clssification and regression.
+        """
+        voxel_dict = batch_inputs_dict['voxels']
+        voxel_features = self.voxel_encoder(voxel_dict['voxels'],
+                                            voxel_dict['num_points'],
+                                            voxel_dict['coors'])
+        batch_size = voxel_dict['coors'][-1, 0].item() + 1
+        # `point_misc` is a tuple containing the mean features of points and
+        # the targets of clssification and regression. It's only used for
+        # calculating auxiliary loss in training mode.
+        x, point_misc = self.middle_encoder(voxel_features,
+                                            voxel_dict['coors'], batch_size,
+                                            test_mode)
+        x = self.backbone(x)
+        if self.with_neck:
+            x = self.neck(x)
+
+        return (x, point_misc) if not test_mode else x
+
+    def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+             **kwargs) -> dict:
+        """Calculate losses from a batch of inputs dict and data samples.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' keys.
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        x, point_misc = self.extract_feat(batch_inputs_dict, test_mode=False)
+        batch_gt_bboxes_3d = [
+            data_sample.gt_instances_3d.bboxes_3d
+            for data_sample in batch_data_samples
+        ]
+        aux_loss = self.middle_encoder.aux_loss(*point_misc,
+                                                batch_gt_bboxes_3d)
+        losses = self.bbox_head.loss(x, batch_data_samples)
+        losses.update(aux_loss)
+        return losses
diff --git a/mmde/mmdet3d/models/detectors/single_stage.py b/mmde/mmdet3d/models/detectors/single_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..7719944286752bb45b823a2171f1d056772e51e2
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/single_stage.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple, Union
+
+import torch
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from ...structures.det3d_data_sample import OptSampleList, SampleList
+from .base import Base3DDetector
+
+
+@MODELS.register_module()
+class SingleStage3DDetector(Base3DDetector):
+    """SingleStage3DDetector.
+
+    This class serves as a base class for single-stage 3D detectors which
+    directly and densely predict 3D bounding boxes on the output features
+    of the backbone+neck.
+
+
+    Args:
+        backbone (dict): Config dict of detector's backbone.
+        neck (dict, optional): Config dict of neck. Defaults to None.
+        bbox_head (dict, optional): Config dict of box head. Defaults to None.
+        train_cfg (dict, optional): Config dict of training hyper-parameters.
+            Defaults to None.
+        test_cfg (dict, optional): Config dict of test hyper-parameters.
+            Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`BaseDataPreprocessor`.  it usually includes,
+                ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+        init_cfg (dict or ConfigDict, optional): the config to control the
+            initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+        bbox_head.update(train_cfg=train_cfg)
+        bbox_head.update(test_cfg=test_cfg)
+        self.bbox_head = MODELS.build(bbox_head)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+             **kwargs) -> Union[dict, list]:
+        """Calculate losses from a batch of inputs dict and data samples.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'img' keys.
+
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        x = self.extract_feat(batch_inputs_dict)
+        losses = self.bbox_head.loss(x, batch_data_samples, **kwargs)
+        return losses
+
+    def predict(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+                **kwargs) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'img' keys.
+
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+            rescale (bool): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input samples. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes_3d (Tensor): Contains a tensor with shape
+                    (num_instances, C) where C >=7.
+        """
+        x = self.extract_feat(batch_inputs_dict)
+        results_list = self.bbox_head.predict(x, batch_data_samples, **kwargs)
+        predictions = self.add_pred_to_datasample(batch_data_samples,
+                                                  results_list)
+        return predictions
+
+    def _forward(self,
+                 batch_inputs_dict: dict,
+                 data_samples: OptSampleList = None,
+                 **kwargs) -> Tuple[List[torch.Tensor]]:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+         Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'img' keys.
+
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            tuple[list]: A tuple of features from ``bbox_head`` forward.
+        """
+        x = self.extract_feat(batch_inputs_dict)
+        results = self.bbox_head.forward(x)
+        return results
+
+    def extract_feat(
+        self, batch_inputs_dict: Dict[str, Tensor]
+    ) -> Union[Tuple[torch.Tensor], Dict[str, Tensor]]:
+        """Directly extract features from the backbone+neck.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'img' keys.
+
+                    - points (list[torch.Tensor]): Point cloud of each sample.
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+        Returns:
+            tuple[Tensor] | dict:  For outside 3D object detection, we
+                typically obtain a tuple of features from the backbone + neck,
+                and for inside 3D object detection, usually a dict containing
+                features will be obtained.
+        """
+        points = batch_inputs_dict['points']
+        stack_points = torch.stack(points)
+        x = self.backbone(stack_points)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
diff --git a/mmde/mmdet3d/models/detectors/single_stage_mono3d.py b/mmde/mmdet3d/models/detectors/single_stage_mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5865db3602427124dd2b0af5857e428f5a3440bd
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/single_stage_mono3d.py
@@ -0,0 +1,99 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+from mmdet.models.detectors.single_stage import SingleStageDetector
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import OptInstanceList
+
+
+@MODELS.register_module()
+class SingleStageMono3DDetector(SingleStageDetector):
+    """Base class for monocular 3D single-stage detectors.
+
+    Monocular 3D single-stage detectors directly and densely predict bounding
+    boxes on the output features of the backbone+neck.
+    """
+
+    def add_pred_to_datasample(
+        self,
+        data_samples: SampleList,
+        data_instances_3d: OptInstanceList = None,
+        data_instances_2d: OptInstanceList = None,
+    ) -> SampleList:
+        """Convert results list to `Det3DDataSample`.
+
+        Args:
+            data_samples (list[:obj:`Det3DDataSample`]): The input data.
+            data_instances_3d (list[:obj:`InstanceData`], optional): 3D
+                Detection results of each image. Defaults to None.
+            data_instances_2d (list[:obj:`InstanceData`], optional): 2D
+                Detection results of each image. Defaults to None.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input. Each Det3DDataSample usually contains
+            'pred_instances_3d'. And the ``pred_instances_3d`` normally
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels_3d (Tensor): Labels of 3D bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+              (num_instances, C) where C >=7.
+
+            When there are 2D prediction in some models, it should
+            contains  `pred_instances`, And the ``pred_instances`` normally
+            contains following keys.
+
+            - scores (Tensor): Classification scores of image, has a shape
+              (num_instance, )
+            - labels (Tensor): Predict Labels of 2D bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Contains a tensor with shape
+              (num_instances, 4).
+        """
+
+        assert (data_instances_2d is not None) or \
+               (data_instances_3d is not None),\
+               'please pass at least one type of data_samples'
+
+        if data_instances_2d is None:
+            data_instances_2d = [
+                InstanceData() for _ in range(len(data_instances_3d))
+            ]
+        if data_instances_3d is None:
+            data_instances_3d = [
+                InstanceData() for _ in range(len(data_instances_2d))
+            ]
+
+        for i, data_sample in enumerate(data_samples):
+            data_sample.pred_instances_3d = data_instances_3d[i]
+            data_sample.pred_instances = data_instances_2d[i]
+        return data_samples
+
+    def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
+        """Extract features.
+
+        Args:
+            batch_inputs_dict (dict): Contains 'img' key
+                with image tensor with shape (N, C, H ,W).
+
+        Returns:
+            tuple[Tensor]: Multi-level features that may have
+            different resolutions.
+        """
+        batch_imgs = batch_inputs_dict['imgs']
+        x = self.backbone(batch_imgs)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    # TODO: Support test time augmentation
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Test function with test time augmentation."""
+        pass
diff --git a/mmde/mmdet3d/models/detectors/smoke_mono3d.py b/mmde/mmdet3d/models/detectors/smoke_mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1b04d472400488a827eb1a729b0a289f79c552a
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/smoke_mono3d.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage_mono3d import SingleStageMono3DDetector
+
+
+@MODELS.register_module()
+class SMOKEMono3D(SingleStageMono3DDetector):
+    r"""SMOKE <https://arxiv.org/abs/2002.10111>`_ for monocular 3D object
+        detection.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of FCOS. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of FCOS. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
+            :class:`DetDataPreprocessor` to process the input data.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/mmde/mmdet3d/models/detectors/ssd3dnet.py b/mmde/mmdet3d/models/detectors/ssd3dnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bf85e937186dd4b8c2304153b1f23848cc52c69
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/ssd3dnet.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet3d.registry import MODELS
+from .votenet import VoteNet
+
+
+@MODELS.register_module()
+class SSD3DNet(VoteNet):
+    """3DSSDNet model.
+
+    https://arxiv.org/abs/2002.10187.pdf
+    """
+
+    def __init__(self,
+                 backbone,
+                 bbox_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None,
+                 **kwargs):
+        super(SSD3DNet, self).__init__(
+            backbone=backbone,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            **kwargs)
diff --git a/mmde/mmdet3d/models/detectors/two_stage.py b/mmde/mmdet3d/models/detectors/two_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e003e74db2973be6715fa8a653f867346b460ae
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/two_stage.py
@@ -0,0 +1,208 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Union
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from ...structures.det3d_data_sample import SampleList
+from .base import Base3DDetector
+
+
+@MODELS.register_module()
+class TwoStage3DDetector(Base3DDetector):
+    """Base class of two-stage 3D detector.
+
+    It inherits original ``:class:Base3DDetector``. This class could serve as a
+    base class for all two-stage 3D detectors.
+    """
+
+    def __init__(
+        self,
+        backbone: ConfigType,
+        neck: OptConfigType = None,
+        rpn_head: OptConfigType = None,
+        roi_head: OptConfigType = None,
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        init_cfg: OptMultiConfig = None,
+        data_preprocessor: OptConfigType = None,
+    ) -> None:
+        super(TwoStage3DDetector, self).__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+
+        if rpn_head is not None:
+            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
+            rpn_head_ = rpn_head.copy()
+            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
+            rpn_head_num_classes = rpn_head_.get('num_classes', None)
+            if rpn_head_num_classes is None:
+                rpn_head_.update(num_classes=1)
+            self.rpn_head = MODELS.build(rpn_head_)
+
+        if roi_head is not None:
+            # update train and test cfg here for now
+            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
+            roi_head.update(train_cfg=rcnn_train_cfg)
+            roi_head.update(test_cfg=test_cfg.rcnn)
+            self.roi_head = MODELS.build(roi_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    @property
+    def with_rpn(self) -> bool:
+        """bool: whether the detector has RPN"""
+        return hasattr(self, 'rpn_head') and self.rpn_head is not None
+
+    @property
+    def with_roi_head(self) -> bool:
+        """bool: whether the detector has a RoI head"""
+        return hasattr(self, 'roi_head') and self.roi_head is not None
+
+    def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+             **kwargs) -> Union[dict, list]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'imgs' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        feats_dict = self.extract_feat(batch_inputs_dict)
+
+        losses = dict()
+
+        # RPN forward and loss
+        if self.with_rpn:
+            proposal_cfg = self.train_cfg.get('rpn_proposal',
+                                              self.test_cfg.rpn)
+            rpn_data_samples = copy.deepcopy(batch_data_samples)
+
+            rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict(
+                feats_dict,
+                rpn_data_samples,
+                proposal_cfg=proposal_cfg,
+                **kwargs)
+            # avoid get same name with roi_head loss
+            keys = rpn_losses.keys()
+            for key in keys:
+                if 'loss' in key and 'rpn' not in key:
+                    losses[f'rpn_{key}'] = rpn_losses[key]
+                else:
+                    losses[key] = rpn_losses[key]
+        else:
+            # TODO: Not support currently, should have a check at Fast R-CNN
+            assert batch_data_samples[0].get('proposals', None) is not None
+            # use pre-defined proposals in InstanceData for the second stage
+            # to extract ROI features.
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        roi_losses = self.roi_head.loss(feats_dict, rpn_results_list,
+                                        batch_data_samples, **kwargs)
+        losses.update(roi_losses)
+
+        return losses
+
+    def predict(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+                **kwargs) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'imgs' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input samples. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instance, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bboxes_3d (Tensor): Contains a tensor with shape
+                (num_instances, C) where C >=7.
+        """
+        feats_dict = self.extract_feat(batch_inputs_dict)
+
+        if self.with_rpn:
+            rpn_results_list = self.rpn_head.predict(feats_dict,
+                                                     batch_data_samples)
+
+        else:
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        results_list = self.roi_head.predict(feats_dict, rpn_results_list,
+                                             batch_data_samples)
+
+        # connvert to Det3DDataSample
+        results_list = self.add_pred_to_datasample(batch_data_samples,
+                                                   results_list)
+
+        return results_list
+
+    def _forward(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+                 **kwargs) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'img' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            tuple: A tuple of features from ``rpn_head`` and ``roi_head``
+            forward.
+        """
+        feats_dict = self.extract_feat(batch_inputs_dict)
+        rpn_outs = self.rpn_head.forward(feats_dict['neck_feats'])
+
+        # If there are no pre-defined proposals, use RPN to get proposals
+        if batch_data_samples[0].get('proposals', None) is None:
+            batch_input_metas = [
+                data_samples.metainfo for data_samples in batch_data_samples
+            ]
+            rpn_results_list = self.rpn_head.predict_by_feat(
+                *rpn_outs, batch_input_metas=batch_input_metas)
+        else:
+            # TODO: Not checked currently.
+            rpn_results_list = [
+                data_sample.proposals for data_sample in batch_data_samples
+            ]
+
+        # roi_head
+        roi_outs = self.roi_head._forward(feats_dict, rpn_results_list)
+        return rpn_outs + roi_outs
diff --git a/mmde/mmdet3d/models/detectors/votenet.py b/mmde/mmdet3d/models/detectors/votenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae7088974d3f68cc8ba4695f8e423f79159467f9
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/votenet.py
@@ -0,0 +1,148 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Union
+
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from ..test_time_augs import merge_aug_bboxes_3d
+from .single_stage import SingleStage3DDetector
+
+
+@MODELS.register_module()
+class VoteNet(SingleStage3DDetector):
+    r"""`VoteNet <https://arxiv.org/pdf/1904.09664.pdf>`_ for 3D detection.
+
+    Args:
+        backbone (dict): Config dict of detector's backbone.
+        bbox_head (dict, optional): Config dict of box head. Defaults to None.
+        train_cfg (dict, optional): Config dict of training hyper-parameters.
+            Defaults to None.
+        test_cfg (dict, optional): Config dict of test hyper-parameters.
+            Defaults to None.
+        init_cfg (dict, optional): the config to control the
+           initialization. Default to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`BaseDataPreprocessor`.  it usually includes,
+            ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``.
+    """
+
+    def __init__(self,
+                 backbone: dict,
+                 bbox_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 data_preprocessor: Optional[dict] = None,
+                 **kwargs):
+        super(VoteNet, self).__init__(
+            backbone=backbone,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor,
+            **kwargs)
+
+    def loss(self, batch_inputs_dict: Dict[str, Union[List, Tensor]],
+             batch_data_samples: List[Det3DDataSample],
+             **kwargs) -> List[Det3DDataSample]:
+        """
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        feat_dict = self.extract_feat(batch_inputs_dict)
+        points = batch_inputs_dict['points']
+        losses = self.bbox_head.loss(points, feat_dict, batch_data_samples,
+                                     **kwargs)
+        return losses
+
+    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[Det3DDataSample]:
+        """Forward of testing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input sample. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                    (num_instances, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
+                    contains a tensor with shape (num_instances, 7).
+        """
+        feats_dict = self.extract_feat(batch_inputs_dict)
+        points = batch_inputs_dict['points']
+        results_list = self.bbox_head.predict(points, feats_dict,
+                                              batch_data_samples, **kwargs)
+        data_3d_samples = self.add_pred_to_datasample(batch_data_samples,
+                                                      results_list)
+        return data_3d_samples
+
+    def aug_test(self, aug_inputs_list: List[dict],
+                 aug_data_samples: List[List[dict]], **kwargs):
+        """Test with augmentation.
+
+        Batch size always is 1 when do the augtest.
+
+        Args:
+            aug_inputs_list (List[dict]): The list indicate same data
+                under differecnt augmentation.
+            aug_data_samples (List[List[dict]]): The outer list
+                indicate different augmentation, and the inter
+                list indicate the batch size.
+        """
+        num_augs = len(aug_inputs_list)
+        if num_augs == 1:
+            return self.predict(aug_inputs_list[0], aug_data_samples[0])
+
+        batch_size = len(aug_data_samples[0])
+        assert batch_size == 1
+        multi_aug_results = []
+        for aug_id in range(num_augs):
+            batch_inputs_dict = aug_inputs_list[aug_id]
+            batch_data_samples = aug_data_samples[aug_id]
+            feats_dict = self.extract_feat(batch_inputs_dict)
+            points = batch_inputs_dict['points']
+            results_list = self.bbox_head.predict(points, feats_dict,
+                                                  batch_data_samples, **kwargs)
+            multi_aug_results.append(results_list[0])
+        aug_input_metas_list = []
+        for aug_index in range(num_augs):
+            metainfo = aug_data_samples[aug_id][0].metainfo
+            aug_input_metas_list.append(metainfo)
+
+        aug_results_list = [item.to_dict() for item in multi_aug_results]
+        # after merging, bboxes will be rescaled to the original image size
+        merged_results_dict = merge_aug_bboxes_3d(aug_results_list,
+                                                  aug_input_metas_list,
+                                                  self.bbox_head.test_cfg)
+
+        merged_results = InstanceData(**merged_results_dict)
+        data_3d_samples = self.add_pred_to_datasample(batch_data_samples,
+                                                      [merged_results])
+        return data_3d_samples
diff --git a/mmde/mmdet3d/models/detectors/voxelnet.py b/mmde/mmdet3d/models/detectors/voxelnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f5592633b4609d8174959c16a0fc78fc29fc08f
--- /dev/null
+++ b/mmde/mmdet3d/models/detectors/voxelnet.py
@@ -0,0 +1,48 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStage3DDetector
+
+
+@MODELS.register_module()
+class VoxelNet(SingleStage3DDetector):
+    r"""`VoxelNet <https://arxiv.org/abs/1711.06396>`_ for 3D detection."""
+
+    def __init__(self,
+                 voxel_encoder: ConfigType,
+                 middle_encoder: ConfigType,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 bbox_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+        self.voxel_encoder = MODELS.build(voxel_encoder)
+        self.middle_encoder = MODELS.build(middle_encoder)
+
+    def extract_feat(self, batch_inputs_dict: dict) -> Tuple[Tensor]:
+        """Extract features from points."""
+        voxel_dict = batch_inputs_dict['voxels']
+        voxel_features = self.voxel_encoder(voxel_dict['voxels'],
+                                            voxel_dict['num_points'],
+                                            voxel_dict['coors'])
+        batch_size = voxel_dict['coors'][-1, 0].item() + 1
+        x = self.middle_encoder(voxel_features, voxel_dict['coors'],
+                                batch_size)
+        x = self.backbone(x)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
diff --git a/mmde/mmdet3d/models/layers/__init__.py b/mmde/mmdet3d/models/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dc2fca8b5f1a3a2ae60b49c8fbd032d15b60cb6
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/__init__.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .box3d_nms import (aligned_3d_nms, box3d_multiclass_nms, circle_nms,
+                        nms_bev, nms_normal_bev)
+from .dgcnn_modules import DGCNNFAModule, DGCNNFPModule, DGCNNGFModule
+from .edge_fusion_module import EdgeFusionModule
+from .fusion_layers import (PointFusion, VoteFusion, apply_3d_transformation,
+                            bbox_2d_transform, coord_2d_transform)
+from .minkowski_engine_block import (MinkowskiBasicBlock, MinkowskiBottleneck,
+                                     MinkowskiConvModule)
+from .mlp import MLP
+from .norm import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d
+from .paconv import PAConv, PAConvCUDA
+from .pointnet_modules import (PAConvCUDASAModule, PAConvCUDASAModuleMSG,
+                               PAConvSAModule, PAConvSAModuleMSG,
+                               PointFPModule, PointSAModule, PointSAModuleMSG,
+                               build_sa_module)
+from .sparse_block import (SparseBasicBlock, SparseBottleneck,
+                           make_sparse_convmodule)
+from .torchsparse_block import (TorchSparseBasicBlock, TorchSparseBottleneck,
+                                TorchSparseConvModule)
+from .transformer import GroupFree3DMHA
+from .vote_module import VoteModule
+
+__all__ = [
+    'VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule', 'DGCNNFAModule',
+    'DGCNNFPModule', 'DGCNNGFModule', 'NaiveSyncBatchNorm1d',
+    'NaiveSyncBatchNorm2d', 'PAConv', 'PAConvCUDA', 'SparseBasicBlock',
+    'SparseBottleneck', 'make_sparse_convmodule', 'PointFusion', 'VoteFusion',
+    'apply_3d_transformation', 'bbox_2d_transform', 'coord_2d_transform',
+    'MLP', 'box3d_multiclass_nms', 'aligned_3d_nms', 'circle_nms', 'nms_bev',
+    'nms_normal_bev', 'build_sa_module', 'PointSAModuleMSG', 'PointSAModule',
+    'PointFPModule', 'PAConvSAModule', 'PAConvSAModuleMSG',
+    'PAConvCUDASAModule', 'PAConvCUDASAModuleMSG', 'TorchSparseConvModule',
+    'TorchSparseBasicBlock', 'TorchSparseBottleneck', 'MinkowskiConvModule',
+    'MinkowskiBasicBlock', 'MinkowskiBottleneck'
+]
diff --git a/mmde/mmdet3d/models/layers/box3d_nms.py b/mmde/mmdet3d/models/layers/box3d_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3d2f784f6ce69871642fe1c97332b9bad1c0448
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/box3d_nms.py
@@ -0,0 +1,295 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+import numba
+import numpy as np
+import torch
+from mmcv.ops import nms, nms_rotated
+from torch import Tensor
+
+
+def box3d_multiclass_nms(
+        mlvl_bboxes: Tensor,
+        mlvl_bboxes_for_nms: Tensor,
+        mlvl_scores: Tensor,
+        score_thr: float,
+        max_num: int,
+        cfg: dict,
+        mlvl_dir_scores: Optional[Tensor] = None,
+        mlvl_attr_scores: Optional[Tensor] = None,
+        mlvl_bboxes2d: Optional[Tensor] = None) -> Tuple[Tensor]:
+    """Multi-class NMS for 3D boxes. The IoU used for NMS is defined as the 2D
+    IoU between BEV boxes.
+
+    Args:
+        mlvl_bboxes (Tensor): Multi-level boxes with shape (N, M).
+            M is the dimensions of boxes.
+        mlvl_bboxes_for_nms (Tensor): Multi-level boxes with shape (N, 5)
+            ([x1, y1, x2, y2, ry]). N is the number of boxes.
+            The coordinate system of the BEV boxes is counterclockwise.
+        mlvl_scores (Tensor): Multi-level boxes with shape (N, C + 1).
+            N is the number of boxes. C is the number of classes.
+        score_thr (float): Score threshold to filter boxes with low confidence.
+        max_num (int): Maximum number of boxes will be kept.
+        cfg (dict): Configuration dict of NMS.
+        mlvl_dir_scores (Tensor, optional): Multi-level scores of direction
+            classifier. Defaults to None.
+        mlvl_attr_scores (Tensor, optional): Multi-level scores of attribute
+            classifier. Defaults to None.
+        mlvl_bboxes2d (Tensor, optional): Multi-level 2D bounding boxes.
+            Defaults to None.
+
+    Returns:
+        Tuple[Tensor]: Return results after nms, including 3D bounding boxes,
+        scores, labels, direction scores, attribute scores (optional) and
+        2D bounding boxes (optional).
+    """
+    # do multi class nms
+    # the fg class id range: [0, num_classes-1]
+    num_classes = mlvl_scores.shape[1] - 1
+    bboxes = []
+    scores = []
+    labels = []
+    dir_scores = []
+    attr_scores = []
+    bboxes2d = []
+    for i in range(0, num_classes):
+        # get bboxes and scores of this class
+        cls_inds = mlvl_scores[:, i] > score_thr
+        if not cls_inds.any():
+            continue
+
+        _scores = mlvl_scores[cls_inds, i]
+        _bboxes_for_nms = mlvl_bboxes_for_nms[cls_inds, :]
+
+        if cfg.use_rotate_nms:
+            nms_func = nms_bev
+        else:
+            nms_func = nms_normal_bev
+
+        selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr)
+        _mlvl_bboxes = mlvl_bboxes[cls_inds, :]
+        bboxes.append(_mlvl_bboxes[selected])
+        scores.append(_scores[selected])
+        cls_label = mlvl_bboxes.new_full((len(selected), ),
+                                         i,
+                                         dtype=torch.long)
+        labels.append(cls_label)
+
+        if mlvl_dir_scores is not None:
+            _mlvl_dir_scores = mlvl_dir_scores[cls_inds]
+            dir_scores.append(_mlvl_dir_scores[selected])
+        if mlvl_attr_scores is not None:
+            _mlvl_attr_scores = mlvl_attr_scores[cls_inds]
+            attr_scores.append(_mlvl_attr_scores[selected])
+        if mlvl_bboxes2d is not None:
+            _mlvl_bboxes2d = mlvl_bboxes2d[cls_inds]
+            bboxes2d.append(_mlvl_bboxes2d[selected])
+
+    if bboxes:
+        bboxes = torch.cat(bboxes, dim=0)
+        scores = torch.cat(scores, dim=0)
+        labels = torch.cat(labels, dim=0)
+        if mlvl_dir_scores is not None:
+            dir_scores = torch.cat(dir_scores, dim=0)
+        if mlvl_attr_scores is not None:
+            attr_scores = torch.cat(attr_scores, dim=0)
+        if mlvl_bboxes2d is not None:
+            bboxes2d = torch.cat(bboxes2d, dim=0)
+        if bboxes.shape[0] > max_num:
+            _, inds = scores.sort(descending=True)
+            inds = inds[:max_num]
+            bboxes = bboxes[inds, :]
+            labels = labels[inds]
+            scores = scores[inds]
+            if mlvl_dir_scores is not None:
+                dir_scores = dir_scores[inds]
+            if mlvl_attr_scores is not None:
+                attr_scores = attr_scores[inds]
+            if mlvl_bboxes2d is not None:
+                bboxes2d = bboxes2d[inds]
+    else:
+        bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1)))
+        scores = mlvl_scores.new_zeros((0, ))
+        labels = mlvl_scores.new_zeros((0, ), dtype=torch.long)
+        if mlvl_dir_scores is not None:
+            dir_scores = mlvl_scores.new_zeros((0, ))
+        if mlvl_attr_scores is not None:
+            attr_scores = mlvl_scores.new_zeros((0, ))
+        if mlvl_bboxes2d is not None:
+            bboxes2d = mlvl_scores.new_zeros((0, 4))
+
+    results = (bboxes, scores, labels)
+
+    if mlvl_dir_scores is not None:
+        results = results + (dir_scores, )
+    if mlvl_attr_scores is not None:
+        results = results + (attr_scores, )
+    if mlvl_bboxes2d is not None:
+        results = results + (bboxes2d, )
+
+    return results
+
+
+def aligned_3d_nms(boxes: Tensor, scores: Tensor, classes: Tensor,
+                   thresh: float) -> Tensor:
+    """3D NMS for aligned boxes.
+
+    Args:
+        boxes (Tensor): Aligned box with shape [N, 6].
+        scores (Tensor): Scores of each box.
+        classes (Tensor): Class of each box.
+        thresh (float): IoU threshold for nms.
+
+    Returns:
+        Tensor: Indices of selected boxes.
+    """
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    z1 = boxes[:, 2]
+    x2 = boxes[:, 3]
+    y2 = boxes[:, 4]
+    z2 = boxes[:, 5]
+    area = (x2 - x1) * (y2 - y1) * (z2 - z1)
+    zero = boxes.new_zeros(1, )
+
+    score_sorted = torch.argsort(scores)
+    pick = []
+    while (score_sorted.shape[0] != 0):
+        last = score_sorted.shape[0]
+        i = score_sorted[-1]
+        pick.append(i)
+
+        xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]])
+        yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]])
+        zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]])
+        xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]])
+        yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]])
+        zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]])
+        classes1 = classes[i]
+        classes2 = classes[score_sorted[:last - 1]]
+        inter_l = torch.max(zero, xx2 - xx1)
+        inter_w = torch.max(zero, yy2 - yy1)
+        inter_h = torch.max(zero, zz2 - zz1)
+
+        inter = inter_l * inter_w * inter_h
+        iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter)
+        iou = iou * (classes1 == classes2).float()
+        score_sorted = score_sorted[torch.nonzero(
+            iou <= thresh, as_tuple=False).flatten()]
+
+    indices = boxes.new_tensor(pick, dtype=torch.long)
+    return indices
+
+
+@numba.jit(nopython=True)
+def circle_nms(dets: Tensor, thresh: float, post_max_size: int = 83) -> Tensor:
+    """Circular NMS.
+
+    An object is only counted as positive if no other center with a higher
+    confidence exists within a radius r using a bird-eye view distance metric.
+
+    Args:
+        dets (Tensor): Detection results with the shape of [N, 3].
+        thresh (float): Value of threshold.
+        post_max_size (int): Max number of prediction to be kept.
+            Defaults to 83.
+
+    Returns:
+        Tensor: Indexes of the detections to be kept.
+    """
+    x1 = dets[:, 0]
+    y1 = dets[:, 1]
+    scores = dets[:, 2]
+    order = scores.argsort()[::-1].astype(np.int32)  # highest->lowest
+    ndets = dets.shape[0]
+    suppressed = np.zeros((ndets), dtype=np.int32)
+    keep = []
+    for _i in range(ndets):
+        i = order[_i]  # start with highest score box
+        if suppressed[
+                i] == 1:  # if any box have enough iou with this, remove it
+            continue
+        keep.append(i)
+        for _j in range(_i + 1, ndets):
+            j = order[_j]
+            if suppressed[j] == 1:
+                continue
+            # calculate center distance between i and j box
+            dist = (x1[i] - x1[j])**2 + (y1[i] - y1[j])**2
+
+            # ovr = inter / areas[j]
+            if dist <= thresh:
+                suppressed[j] = 1
+
+    if post_max_size < len(keep):
+        return keep[:post_max_size]
+
+    return keep
+
+
+# This function duplicates functionality of mmcv.ops.iou_3d.nms_bev
+# from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms_rotated.
+# Nms api will be unified in mmdetection3d one day.
+def nms_bev(boxes: Tensor,
+            scores: Tensor,
+            thresh: float,
+            pre_max_size: Optional[int] = None,
+            post_max_size: Optional[int] = None) -> Tensor:
+    """NMS function GPU implementation (for BEV boxes). The overlap of two
+    boxes for IoU calculation is defined as the exact overlapping area of the
+    two boxes. In this function, one can also set ``pre_max_size`` and
+    ``post_max_size``.
+
+    Args:
+        boxes (Tensor): Input boxes with the shape of [N, 5]
+            ([x1, y1, x2, y2, ry]).
+        scores (Tensor): Scores of boxes with the shape of [N].
+        thresh (float): Overlap threshold of NMS.
+        pre_max_size (int, optional): Max size of boxes before NMS.
+            Defaults to None.
+        post_max_size (int, optional): Max size of boxes after NMS.
+            Defaults to None.
+
+    Returns:
+        Tensor: Indexes after NMS.
+    """
+    assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]'
+    order = scores.sort(0, descending=True)[1]
+    if pre_max_size is not None:
+        order = order[:pre_max_size]
+    boxes = boxes[order].contiguous()
+    scores = scores[order]
+
+    # xyxyr -> back to xywhr
+    # note: better skip this step before nms_bev call in the future
+    boxes = torch.stack(
+        ((boxes[:, 0] + boxes[:, 2]) / 2, (boxes[:, 1] + boxes[:, 3]) / 2,
+         boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1], boxes[:, 4]),
+        dim=-1)
+
+    keep = nms_rotated(boxes, scores, thresh)[1]
+    keep = order[keep]
+    if post_max_size is not None:
+        keep = keep[:post_max_size]
+    return keep
+
+
+# This function duplicates functionality of mmcv.ops.iou_3d.nms_normal_bev
+# from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms.
+# Nms api will be unified in mmdetection3d one day.
+def nms_normal_bev(boxes: Tensor, scores: Tensor, thresh: float) -> Tensor:
+    """Normal NMS function GPU implementation (for BEV boxes). The overlap of
+    two boxes for IoU calculation is defined as the exact overlapping area of
+    the two boxes WITH their yaw angle set to 0.
+
+    Args:
+        boxes (Tensor): Input boxes with shape (N, 5).
+        scores (Tensor): Scores of predicted boxes with shape (N).
+        thresh (float): Overlap threshold of NMS.
+
+    Returns:
+        Tensor: Remaining indices with scores in descending order.
+    """
+    assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]'
+    return nms(boxes[:, :-1], scores, thresh)[1]
diff --git a/mmde/mmdet3d/models/layers/dgcnn_modules/__init__.py b/mmde/mmdet3d/models/layers/dgcnn_modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..67beb0907fba6463a7fdd14f652054ecf780aec6
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/dgcnn_modules/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .dgcnn_fa_module import DGCNNFAModule
+from .dgcnn_fp_module import DGCNNFPModule
+from .dgcnn_gf_module import DGCNNGFModule
+
+__all__ = ['DGCNNFAModule', 'DGCNNFPModule', 'DGCNNGFModule']
diff --git a/mmde/mmdet3d/models/layers/dgcnn_modules/dgcnn_fa_module.py b/mmde/mmdet3d/models/layers/dgcnn_modules/dgcnn_fa_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..81420b183627d305009f7f41a74e5becf351f6db
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/dgcnn_modules/dgcnn_fa_module.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.utils import ConfigType, OptMultiConfig
+
+
+class DGCNNFAModule(BaseModule):
+    """Point feature aggregation module used in DGCNN.
+
+    Aggregate all the features of points.
+
+    Args:
+        mlp_channels (List[int]): List of mlp channels.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN1d').
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='ReLU').
+        init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
+            optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 mlp_channels: List[int],
+                 norm_cfg: ConfigType = dict(type='BN1d'),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(DGCNNFAModule, self).__init__(init_cfg=init_cfg)
+        self.mlps = nn.Sequential()
+        for i in range(len(mlp_channels) - 1):
+            self.mlps.add_module(
+                f'layer{i}',
+                ConvModule(
+                    mlp_channels[i],
+                    mlp_channels[i + 1],
+                    kernel_size=(1, ),
+                    stride=(1, ),
+                    conv_cfg=dict(type='Conv1d'),
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+
+    def forward(self, points: List[Tensor]) -> Tensor:
+        """forward.
+
+        Args:
+            points (List[Tensor]): Tensor of the features to be aggregated.
+
+        Returns:
+            Tensor: (B, N, M) M = mlp[-1]. Tensor of the output points.
+        """
+
+        if len(points) > 1:
+            new_points = torch.cat(points[1:], dim=-1)
+            new_points = new_points.transpose(1, 2).contiguous()  # (B, C, N)
+            new_points_copy = new_points
+
+            new_points = self.mlps(new_points)
+
+            new_fa_points = new_points.max(dim=-1, keepdim=True)[0]
+            new_fa_points = new_fa_points.repeat(1, 1, new_points.shape[-1])
+
+            new_points = torch.cat([new_fa_points, new_points_copy], dim=1)
+            new_points = new_points.transpose(1, 2).contiguous()
+        else:
+            new_points = points
+
+        return new_points
diff --git a/mmde/mmdet3d/models/layers/dgcnn_modules/dgcnn_fp_module.py b/mmde/mmdet3d/models/layers/dgcnn_modules/dgcnn_fp_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e25a5da9c47fb57175f4f55dc3a4ceb6e372f6e
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/dgcnn_modules/dgcnn_fp_module.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.utils import ConfigType, OptMultiConfig
+
+
+class DGCNNFPModule(BaseModule):
+    """Point feature propagation module used in DGCNN.
+
+    Propagate the features from one set to another.
+
+    Args:
+        mlp_channels (List[int]): List of mlp channels.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN1d').
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='ReLU').
+        init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
+            optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 mlp_channels: List[int],
+                 norm_cfg: ConfigType = dict(type='BN1d'),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(DGCNNFPModule, self).__init__(init_cfg=init_cfg)
+        self.mlps = nn.Sequential()
+        for i in range(len(mlp_channels) - 1):
+            self.mlps.add_module(
+                f'layer{i}',
+                ConvModule(
+                    mlp_channels[i],
+                    mlp_channels[i + 1],
+                    kernel_size=(1, ),
+                    stride=(1, ),
+                    conv_cfg=dict(type='Conv1d'),
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+
+    def forward(self, points: Tensor) -> Tensor:
+        """Forward.
+
+        Args:
+            points (Tensor): (B, N, C) Tensor of the input points.
+
+        Returns:
+            Tensor: (B, N, M) M = mlp[-1]. Tensor of the new points.
+        """
+
+        if points is not None:
+            new_points = points.transpose(1, 2).contiguous()  # (B, C, N)
+            new_points = self.mlps(new_points)
+            new_points = new_points.transpose(1, 2).contiguous()
+        else:
+            new_points = points
+
+        return new_points
diff --git a/mmde/mmdet3d/models/layers/dgcnn_modules/dgcnn_gf_module.py b/mmde/mmdet3d/models/layers/dgcnn_modules/dgcnn_gf_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc9266947776c347b17187adc331ef7adf8141f7
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/dgcnn_modules/dgcnn_gf_module.py
@@ -0,0 +1,222 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Union
+
+import torch
+from mmcv.cnn import ConvModule
+from mmcv.ops.group_points import GroupAll, QueryAndGroup, grouping_operation
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.utils import ConfigType
+
+
+class BaseDGCNNGFModule(nn.Module):
+    """Base module for point graph feature module used in DGCNN.
+
+    Args:
+        radii (List[float]): List of radius in each knn or ball query.
+        sample_nums (List[int]): Number of samples in each knn or ball query.
+        mlp_channels (List[List[int]]): Specify of the dgcnn before the global
+            pooling for each graph feature module.
+        knn_modes (List[str]): Type of KNN method, valid mode
+            ['F-KNN', 'D-KNN']. Defaults to ['F-KNN'].
+        dilated_group (bool): Whether to use dilated ball query.
+            Defaults to False.
+        use_xyz (bool): Whether to use xyz as point features.
+            Defaults to True.
+        pool_mode (str): Type of pooling method. Defaults to 'max'.
+        normalize_xyz (bool): If ball query, whether to normalize local XYZ
+            with radius. Defaults to False.
+        grouper_return_grouped_xyz (bool): Whether to return grouped xyz in
+            `QueryAndGroup`. Defaults to False.
+        grouper_return_grouped_idx (bool): Whether to return grouped idx in
+            `QueryAndGroup`. Defaults to False.
+    """
+
+    def __init__(self,
+                 radii: List[float],
+                 sample_nums: List[int],
+                 mlp_channels: List[List[int]],
+                 knn_modes: List[str] = ['F-KNN'],
+                 dilated_group: bool = False,
+                 use_xyz: bool = True,
+                 pool_mode: str = 'max',
+                 normalize_xyz: bool = False,
+                 grouper_return_grouped_xyz: bool = False,
+                 grouper_return_grouped_idx: bool = False) -> None:
+        super(BaseDGCNNGFModule, self).__init__()
+
+        assert len(sample_nums) == len(
+            mlp_channels
+        ), 'Num_samples and mlp_channels should have the same length.'
+        assert pool_mode in ['max', 'avg'
+                             ], "Pool_mode should be one of ['max', 'avg']."
+        assert isinstance(knn_modes, list) or isinstance(
+            knn_modes, tuple), 'The type of knn_modes should be list or tuple.'
+
+        if isinstance(mlp_channels, tuple):
+            mlp_channels = list(map(list, mlp_channels))
+        self.mlp_channels = mlp_channels
+
+        self.pool_mode = pool_mode
+        self.groupers = nn.ModuleList()
+        self.mlps = nn.ModuleList()
+        self.knn_modes = knn_modes
+
+        for i in range(len(sample_nums)):
+            sample_num = sample_nums[i]
+            if sample_num is not None:
+                if self.knn_modes[i] == 'D-KNN':
+                    grouper = QueryAndGroup(
+                        radii[i],
+                        sample_num,
+                        use_xyz=use_xyz,
+                        normalize_xyz=normalize_xyz,
+                        return_grouped_xyz=grouper_return_grouped_xyz,
+                        return_grouped_idx=True)
+                else:
+                    grouper = QueryAndGroup(
+                        radii[i],
+                        sample_num,
+                        use_xyz=use_xyz,
+                        normalize_xyz=normalize_xyz,
+                        return_grouped_xyz=grouper_return_grouped_xyz,
+                        return_grouped_idx=grouper_return_grouped_idx)
+            else:
+                grouper = GroupAll(use_xyz)
+            self.groupers.append(grouper)
+
+    def _pool_features(self, features: Tensor) -> Tensor:
+        """Perform feature aggregation using pooling operation.
+
+        Args:
+            features (Tensor): (B, C, N, K) Features of locally grouped
+                points before pooling.
+
+        Returns:
+            Tensor: (B, C, N) Pooled features aggregating local information.
+        """
+        if self.pool_mode == 'max':
+            # (B, C, N, 1)
+            new_features = F.max_pool2d(
+                features, kernel_size=[1, features.size(3)])
+        elif self.pool_mode == 'avg':
+            # (B, C, N, 1)
+            new_features = F.avg_pool2d(
+                features, kernel_size=[1, features.size(3)])
+        else:
+            raise NotImplementedError
+
+        return new_features.squeeze(-1).contiguous()
+
+    def forward(self, points: Tensor) -> Tensor:
+        """forward.
+
+        Args:
+            points (Tensor): (B, N, C) Input points.
+
+        Returns:
+            Tensor: (B, N, C1) New points generated from each graph
+            feature module.
+        """
+        new_points_list = [points]
+
+        for i in range(len(self.groupers)):
+
+            new_points = new_points_list[i]
+            new_points_trans = new_points.transpose(
+                1, 2).contiguous()  # (B, C, N)
+
+            if self.knn_modes[i] == 'D-KNN':
+                # (B, N, C) -> (B, N, K)
+                idx = self.groupers[i](new_points[..., -3:].contiguous(),
+                                       new_points[..., -3:].contiguous())[-1]
+
+                grouped_results = grouping_operation(
+                    new_points_trans, idx)  # (B, C, N) -> (B, C, N, K)
+                grouped_results -= new_points_trans.unsqueeze(-1)
+            else:
+                grouped_results = self.groupers[i](
+                    new_points, new_points)  # (B, N, C) -> (B, C, N, K)
+
+            new_points = new_points_trans.unsqueeze(-1).repeat(
+                1, 1, 1, grouped_results.shape[-1])
+            new_points = torch.cat([grouped_results, new_points], dim=1)
+
+            # (B, mlp[-1], N, K)
+            new_points = self.mlps[i](new_points)
+
+            # (B, mlp[-1], N)
+            new_points = self._pool_features(new_points)
+            new_points = new_points.transpose(1, 2).contiguous()
+            new_points_list.append(new_points)
+
+        return new_points
+
+
+class DGCNNGFModule(BaseDGCNNGFModule):
+    """Point graph feature module used in DGCNN.
+
+    Args:
+        mlp_channels (List[int]): Specify of the dgcnn before the global
+            pooling for each graph feature module.
+        num_sample (int, optional): Number of samples in each knn or ball
+            query. Defaults to None.
+        knn_mode (str): Type of KNN method, valid mode ['F-KNN', 'D-KNN'].
+            Defaults to 'F-KNN'.
+        radius (float, optional): Radius to group with. Defaults to None.
+        dilated_group (bool): Whether to use dilated ball query.
+            Defaults to False.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN2d').
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='ReLU').
+        use_xyz (bool): Whether to use xyz as point features. Defaults to True.
+        pool_mode (str): Type of pooling method. Defaults to 'max'.
+        normalize_xyz (bool): If ball query, whether to normalize local XYZ
+            with radius. Defaults to False.
+        bias (bool or str): If specified as `auto`, it will be decided by
+            `norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
+            otherwise False. Defaults to 'auto'.
+    """
+
+    def __init__(self,
+                 mlp_channels: List[int],
+                 num_sample: Optional[int] = None,
+                 knn_mode: str = 'F-KNN',
+                 radius: Optional[float] = None,
+                 dilated_group: bool = False,
+                 norm_cfg: ConfigType = dict(type='BN2d'),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 use_xyz: bool = True,
+                 pool_mode: str = 'max',
+                 normalize_xyz: bool = False,
+                 bias: Union[bool, str] = 'auto') -> None:
+        super(DGCNNGFModule, self).__init__(
+            mlp_channels=[mlp_channels],
+            sample_nums=[num_sample],
+            knn_modes=[knn_mode],
+            radii=[radius],
+            use_xyz=use_xyz,
+            pool_mode=pool_mode,
+            normalize_xyz=normalize_xyz,
+            dilated_group=dilated_group)
+
+        for i in range(len(self.mlp_channels)):
+            mlp_channel = self.mlp_channels[i]
+
+            mlp = nn.Sequential()
+            for i in range(len(mlp_channel) - 1):
+                mlp.add_module(
+                    f'layer{i}',
+                    ConvModule(
+                        mlp_channel[i],
+                        mlp_channel[i + 1],
+                        kernel_size=(1, 1),
+                        stride=(1, 1),
+                        conv_cfg=dict(type='Conv2d'),
+                        norm_cfg=norm_cfg,
+                        act_cfg=act_cfg,
+                        bias=bias))
+            self.mlps.append(mlp)
diff --git a/mmde/mmdet3d/models/layers/edge_fusion_module.py b/mmde/mmdet3d/models/layers/edge_fusion_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdac05e3cf60488d7dbde9316d21a8b0990f9841
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/edge_fusion_module.py
@@ -0,0 +1,84 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.utils import ConfigType
+
+
+class EdgeFusionModule(BaseModule):
+    """Edge Fusion Module for feature map.
+
+    Args:
+        out_channels (int): The number of output channels.
+        feat_channels (int): The number of channels in feature map
+            during edge feature fusion.
+        kernel_size (int): Kernel size of convolution. Defaults to 3.
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='ReLU').
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN1d').
+    """
+
+    def __init__(
+        self,
+        out_channels: int,
+        feat_channels: int,
+        kernel_size: int = 3,
+        act_cfg: ConfigType = dict(type='ReLU'),
+        norm_cfg: ConfigType = dict(type='BN1d')
+    ) -> None:
+        super(EdgeFusionModule, self).__init__()
+        self.edge_convs = nn.Sequential(
+            ConvModule(
+                feat_channels,
+                feat_channels,
+                kernel_size=kernel_size,
+                padding=kernel_size // 2,
+                conv_cfg=dict(type='Conv1d'),
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg),
+            nn.Conv1d(feat_channels, out_channels, kernel_size=1))
+        self.feat_channels = feat_channels
+
+    def forward(self, features: Tensor, fused_features: Tensor,
+                edge_indices: Tensor, edge_lens: List[int], output_h: int,
+                output_w: int) -> Tensor:
+        """Forward pass.
+
+        Args:
+            features (Tensor): Different representative features for fusion.
+            fused_features (Tensor): Different representative features
+                to be fused.
+            edge_indices (Tensor): Batch image edge indices.
+            edge_lens (List[int]): List of edge length of each image.
+            output_h (int): Height of output feature map.
+            output_w (int): Width of output feature map.
+
+        Returns:
+            Tensor: Fused feature maps.
+        """
+        batch_size = features.shape[0]
+        # normalize
+        grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float()
+        grid_edge_indices[..., 0] = \
+            grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1
+        grid_edge_indices[..., 1] = \
+            grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1
+
+        # apply edge fusion
+        edge_features = F.grid_sample(
+            features, grid_edge_indices, align_corners=True).squeeze(-1)
+        edge_output = self.edge_convs(edge_features)
+
+        for k in range(batch_size):
+            edge_indice_k = edge_indices[k, :edge_lens[k]]
+            fused_features[k, :, edge_indice_k[:, 1],
+                           edge_indice_k[:, 0]] += edge_output[
+                               k, :, :edge_lens[k]]
+
+        return fused_features
diff --git a/mmde/mmdet3d/models/layers/fusion_layers/__init__.py b/mmde/mmdet3d/models/layers/fusion_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6df4741d782cd341ed91403697eaa9fc581655ff
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/fusion_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .coord_transform import (apply_3d_transformation, bbox_2d_transform,
+                              coord_2d_transform)
+from .point_fusion import PointFusion
+from .vote_fusion import VoteFusion
+
+__all__ = [
+    'PointFusion', 'VoteFusion', 'apply_3d_transformation',
+    'bbox_2d_transform', 'coord_2d_transform'
+]
diff --git a/mmde/mmdet3d/models/layers/fusion_layers/coord_transform.py b/mmde/mmdet3d/models/layers/fusion_layers/coord_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bcb6cba74e2f2fdebcd06586ac10f5dc64fd3ca
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/fusion_layers/coord_transform.py
@@ -0,0 +1,224 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from functools import partial
+from typing import Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet3d.structures.points import get_points_type
+
+
+def apply_3d_transformation(pcd: Tensor,
+                            coord_type: str,
+                            img_meta: dict,
+                            reverse: bool = False) -> Tensor:
+    """Apply transformation to input point cloud.
+
+    Args:
+        pcd (Tensor): The point cloud to be transformed.
+        coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
+        img_meta(dict): Meta info regarding data transformation.
+        reverse (bool): Reversed transformation or not. Defaults to False.
+
+    Note:
+        The elements in img_meta['transformation_3d_flow']:
+
+            - "T" stands for translation;
+            - "S" stands for scale;
+            - "R" stands for rotation;
+            - "HF" stands for horizontal flip;
+            - "VF" stands for vertical flip.
+
+    Returns:
+        Tensor: The transformed point cloud.
+    """
+
+    dtype = pcd.dtype
+    device = pcd.device
+
+    pcd_rotate_mat = (
+        torch.tensor(img_meta['pcd_rotation'], dtype=dtype, device=device)
+        if 'pcd_rotation' in img_meta else torch.eye(
+            3, dtype=dtype, device=device))
+
+    pcd_scale_factor = (
+        img_meta['pcd_scale_factor'] if 'pcd_scale_factor' in img_meta else 1.)
+
+    pcd_trans_factor = (
+        torch.tensor(img_meta['pcd_trans'], dtype=dtype, device=device)
+        if 'pcd_trans' in img_meta else torch.zeros(
+            (3), dtype=dtype, device=device))
+
+    pcd_horizontal_flip = img_meta[
+        'pcd_horizontal_flip'] if 'pcd_horizontal_flip' in \
+        img_meta else False
+
+    pcd_vertical_flip = img_meta[
+        'pcd_vertical_flip'] if 'pcd_vertical_flip' in \
+        img_meta else False
+
+    flow = img_meta['transformation_3d_flow'] \
+        if 'transformation_3d_flow' in img_meta else []
+
+    pcd = pcd.clone()  # prevent inplace modification
+    pcd = get_points_type(coord_type)(pcd)
+
+    horizontal_flip_func = partial(pcd.flip, bev_direction='horizontal') \
+        if pcd_horizontal_flip else lambda: None
+    vertical_flip_func = partial(pcd.flip, bev_direction='vertical') \
+        if pcd_vertical_flip else lambda: None
+    if reverse:
+        scale_func = partial(pcd.scale, scale_factor=1.0 / pcd_scale_factor)
+        translate_func = partial(pcd.translate, trans_vector=-pcd_trans_factor)
+        # pcd_rotate_mat @ pcd_rotate_mat.inverse() is not
+        # exactly an identity matrix
+        # use angle to create the inverse rot matrix neither.
+        rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat.inverse())
+
+        # reverse the pipeline
+        flow = flow[::-1]
+    else:
+        scale_func = partial(pcd.scale, scale_factor=pcd_scale_factor)
+        translate_func = partial(pcd.translate, trans_vector=pcd_trans_factor)
+        rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat)
+
+    flow_mapping = {
+        'T': translate_func,
+        'S': scale_func,
+        'R': rotate_func,
+        'HF': horizontal_flip_func,
+        'VF': vertical_flip_func
+    }
+    for op in flow:
+        assert op in flow_mapping, f'This 3D data '\
+            f'transformation op ({op}) is not supported'
+        func = flow_mapping[op]
+        func()
+
+    return pcd.coord
+
+
+def extract_2d_info(
+        img_meta: dict,
+        tensor: Tensor) -> Tuple[int, int, int, int, Tensor, bool, Tensor]:
+    """Extract image augmentation information from img_meta.
+
+    Args:
+        img_meta (dict): Meta info regarding data transformation.
+        tensor (Tensor): Input tensor used to create new ones.
+
+    Returns:
+        Tuple[int, int, int, int, torch.Tensor, bool, torch.Tensor]:
+        The extracted information.
+    """
+    img_shape = img_meta['img_shape']
+    ori_shape = img_meta['ori_shape']
+    img_h, img_w = img_shape
+    ori_h, ori_w = ori_shape
+
+    img_scale_factor = (
+        tensor.new_tensor(img_meta['scale_factor'][:2])
+        if 'scale_factor' in img_meta else tensor.new_tensor([1.0, 1.0]))
+    img_flip = img_meta['flip'] if 'flip' in img_meta else False
+    img_crop_offset = (
+        tensor.new_tensor(img_meta['img_crop_offset'])
+        if 'img_crop_offset' in img_meta else tensor.new_tensor([0.0, 0.0]))
+
+    return (img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip,
+            img_crop_offset)
+
+
+def bbox_2d_transform(img_meta: dict, bbox_2d: Tensor,
+                      ori2new: bool) -> Tensor:
+    """Transform 2d bbox according to img_meta.
+
+    Args:
+        img_meta (dict): Meta info regarding data transformation.
+        bbox_2d (Tensor): Shape (..., >4) The input 2d bboxes to transform.
+        ori2new (bool): Origin img coord system to new or not.
+
+    Returns:
+        Tensor: The transformed 2d bboxes.
+    """
+
+    img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \
+        img_crop_offset = extract_2d_info(img_meta, bbox_2d)
+
+    bbox_2d_new = bbox_2d.clone()
+
+    if ori2new:
+        bbox_2d_new[:, 0] = bbox_2d_new[:, 0] * img_scale_factor[0]
+        bbox_2d_new[:, 2] = bbox_2d_new[:, 2] * img_scale_factor[0]
+        bbox_2d_new[:, 1] = bbox_2d_new[:, 1] * img_scale_factor[1]
+        bbox_2d_new[:, 3] = bbox_2d_new[:, 3] * img_scale_factor[1]
+
+        bbox_2d_new[:, 0] = bbox_2d_new[:, 0] + img_crop_offset[0]
+        bbox_2d_new[:, 2] = bbox_2d_new[:, 2] + img_crop_offset[0]
+        bbox_2d_new[:, 1] = bbox_2d_new[:, 1] + img_crop_offset[1]
+        bbox_2d_new[:, 3] = bbox_2d_new[:, 3] + img_crop_offset[1]
+
+        if img_flip:
+            bbox_2d_r = img_w - bbox_2d_new[:, 0]
+            bbox_2d_l = img_w - bbox_2d_new[:, 2]
+            bbox_2d_new[:, 0] = bbox_2d_l
+            bbox_2d_new[:, 2] = bbox_2d_r
+    else:
+        if img_flip:
+            bbox_2d_r = img_w - bbox_2d_new[:, 0]
+            bbox_2d_l = img_w - bbox_2d_new[:, 2]
+            bbox_2d_new[:, 0] = bbox_2d_l
+            bbox_2d_new[:, 2] = bbox_2d_r
+
+        bbox_2d_new[:, 0] = bbox_2d_new[:, 0] - img_crop_offset[0]
+        bbox_2d_new[:, 2] = bbox_2d_new[:, 2] - img_crop_offset[0]
+        bbox_2d_new[:, 1] = bbox_2d_new[:, 1] - img_crop_offset[1]
+        bbox_2d_new[:, 3] = bbox_2d_new[:, 3] - img_crop_offset[1]
+
+        bbox_2d_new[:, 0] = bbox_2d_new[:, 0] / img_scale_factor[0]
+        bbox_2d_new[:, 2] = bbox_2d_new[:, 2] / img_scale_factor[0]
+        bbox_2d_new[:, 1] = bbox_2d_new[:, 1] / img_scale_factor[1]
+        bbox_2d_new[:, 3] = bbox_2d_new[:, 3] / img_scale_factor[1]
+
+    return bbox_2d_new
+
+
+def coord_2d_transform(img_meta: dict, coord_2d: Tensor,
+                       ori2new: bool) -> Tensor:
+    """Transform 2d pixel coordinates according to img_meta.
+
+    Args:
+        img_meta (dict): Meta info regarding data transformation.
+        coord_2d (Tensor): Shape (..., 2) The input 2d coords to transform.
+        ori2new (bool): Origin img coord system to new or not.
+
+    Returns:
+        Tensor: The transformed 2d coordinates.
+    """
+
+    img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \
+        img_crop_offset = extract_2d_info(img_meta, coord_2d)
+
+    coord_2d_new = coord_2d.clone()
+
+    if ori2new:
+        # TODO here we assume this order of transformation
+        coord_2d_new[..., 0] = coord_2d_new[..., 0] * img_scale_factor[0]
+        coord_2d_new[..., 1] = coord_2d_new[..., 1] * img_scale_factor[1]
+
+        coord_2d_new[..., 0] += img_crop_offset[0]
+        coord_2d_new[..., 1] += img_crop_offset[1]
+
+        # flip uv coordinates and bbox
+        if img_flip:
+            coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0]
+    else:
+        if img_flip:
+            coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0]
+
+        coord_2d_new[..., 0] -= img_crop_offset[0]
+        coord_2d_new[..., 1] -= img_crop_offset[1]
+
+        coord_2d_new[..., 0] = coord_2d_new[..., 0] / img_scale_factor[0]
+        coord_2d_new[..., 1] = coord_2d_new[..., 1] / img_scale_factor[1]
+
+    return coord_2d_new
diff --git a/mmde/mmdet3d/models/layers/fusion_layers/point_fusion.py b/mmde/mmdet3d/models/layers/fusion_layers/point_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..170f2ae20a91488546da7594982f6d4047ce3b52
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/fusion_layers/point_fusion.py
@@ -0,0 +1,418 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import torch
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.bbox_3d import (get_proj_mat_by_coord_type,
+                                        points_cam2img, points_img2cam)
+from mmdet3d.utils import OptConfigType, OptMultiConfig
+from . import apply_3d_transformation
+
+
+def point_sample(img_meta: dict,
+                 img_features: Tensor,
+                 points: Tensor,
+                 proj_mat: Tensor,
+                 coord_type: str,
+                 img_scale_factor: Tensor,
+                 img_crop_offset: Tensor,
+                 img_flip: bool,
+                 img_pad_shape: Tuple[int],
+                 img_shape: Tuple[int],
+                 aligned: bool = True,
+                 padding_mode: str = 'zeros',
+                 align_corners: bool = True,
+                 valid_flag: bool = False) -> Tensor:
+    """Obtain image features using points.
+
+    Args:
+        img_meta (dict): Meta info.
+        img_features (Tensor): 1 x C x H x W image features.
+        points (Tensor): Nx3 point cloud in LiDAR coordinates.
+        proj_mat (Tensor): 4x4 transformation matrix.
+        coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
+        img_scale_factor (Tensor): Scale factor with shape of
+            (w_scale, h_scale).
+        img_crop_offset (Tensor): Crop offset used to crop image during
+            data augmentation with shape of (w_offset, h_offset).
+        img_flip (bool): Whether the image is flipped.
+        img_pad_shape (Tuple[int]): Int tuple indicates the h & w after
+            padding. This is necessary to obtain features in feature map.
+        img_shape (Tuple[int]): Int tuple indicates the h & w before padding
+            after scaling. This is necessary for flipping coordinates.
+        aligned (bool): Whether to use bilinear interpolation when
+            sampling image features for each point. Defaults to True.
+        padding_mode (str): Padding mode when padding values for
+            features of out-of-image points. Defaults to 'zeros'.
+        align_corners (bool): Whether to align corners when
+            sampling image features for each point. Defaults to True.
+        valid_flag (bool): Whether to filter out the points that outside
+            the image and with depth smaller than 0. Defaults to False.
+
+    Returns:
+        Tensor: NxC image features sampled by point coordinates.
+    """
+
+    # apply transformation based on info in img_meta
+    points = apply_3d_transformation(
+        points, coord_type, img_meta, reverse=True)
+
+    # project points to image coordinate
+    if valid_flag:
+        proj_pts = points_cam2img(points, proj_mat, with_depth=True)
+        pts_2d = proj_pts[..., :2]
+        depths = proj_pts[..., 2]
+    else:
+        pts_2d = points_cam2img(points, proj_mat)
+
+    # img transformation: scale -> crop -> flip
+    # the image is resized by img_scale_factor
+    img_coors = pts_2d[:, 0:2] * img_scale_factor  # Nx2
+    img_coors -= img_crop_offset
+
+    # grid sample, the valid grid range should be in [-1,1]
+    coor_x, coor_y = torch.split(img_coors, 1, dim=1)  # each is Nx1
+
+    if img_flip:
+        # by default we take it as horizontal flip
+        # use img_shape before padding for flip
+        ori_h, ori_w = img_shape
+        coor_x = ori_w - coor_x
+
+    h, w = img_pad_shape
+    norm_coor_y = coor_y / h * 2 - 1
+    norm_coor_x = coor_x / w * 2 - 1
+    grid = torch.cat([norm_coor_x, norm_coor_y],
+                     dim=1).unsqueeze(0).unsqueeze(0)  # Nx2 -> 1x1xNx2
+
+    # align_corner=True provides higher performance
+    mode = 'bilinear' if aligned else 'nearest'
+    point_features = F.grid_sample(
+        img_features,
+        grid,
+        mode=mode,
+        padding_mode=padding_mode,
+        align_corners=align_corners)  # 1xCx1xN feats
+
+    if valid_flag:
+        # (N, )
+        valid = (coor_x.squeeze() < w) & (coor_x.squeeze() > 0) & (
+            coor_y.squeeze() < h) & (coor_y.squeeze() > 0) & (
+                depths > 0)
+        valid_features = point_features.squeeze().t()
+        valid_features[~valid] = 0
+        return valid_features, valid  # (N, C), (N,)
+
+    return point_features.squeeze().t()
+
+
+@MODELS.register_module()
+class PointFusion(BaseModule):
+    """Fuse image features from multi-scale features.
+
+    Args:
+        img_channels (List[int] or int): Channels of image features.
+            It could be a list if the input is multi-scale image features.
+        pts_channels (int): Channels of point features
+        mid_channels (int): Channels of middle layers
+        out_channels (int): Channels of output fused features
+        img_levels (List[int] or int): Number of image levels. Defaults to 3.
+        coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Defaults to 'LIDAR'.
+        conv_cfg (:obj:`ConfigDict` or dict): Config dict for convolution
+            layers of middle layers. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layers of middle layers. Defaults to None.
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
+            optional): Initialization config dict. Defaults to None.
+        activate_out (bool): Whether to apply relu activation to output
+            features. Defaults to True.
+        fuse_out (bool): Whether to apply conv layer to the fused features.
+            Defaults to False.
+        dropout_ratio (int or float): Dropout ratio of image features to
+            prevent overfitting. Defaults to 0.
+        aligned (bool): Whether to apply aligned feature fusion.
+            Defaults to True.
+        align_corners (bool): Whether to align corner when sampling features
+            according to points. Defaults to True.
+        padding_mode (str): Mode used to pad the features of points that do not
+            have corresponding image features. Defaults to 'zeros'.
+        lateral_conv (bool): Whether to apply lateral convs to image features.
+            Defaults to True.
+    """
+
+    def __init__(self,
+                 img_channels: Union[List[int], int],
+                 pts_channels: int,
+                 mid_channels: int,
+                 out_channels: int,
+                 img_levels: Union[List[int], int] = 3,
+                 coord_type: str = 'LIDAR',
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None,
+                 act_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 activate_out: bool = True,
+                 fuse_out: bool = False,
+                 dropout_ratio: Union[int, float] = 0,
+                 aligned: bool = True,
+                 align_corners: bool = True,
+                 padding_mode: str = 'zeros',
+                 lateral_conv: bool = True) -> None:
+        super(PointFusion, self).__init__(init_cfg=init_cfg)
+        if isinstance(img_levels, int):
+            img_levels = [img_levels]
+        if isinstance(img_channels, int):
+            img_channels = [img_channels] * len(img_levels)
+        assert isinstance(img_levels, list)
+        assert isinstance(img_channels, list)
+        assert len(img_channels) == len(img_levels)
+
+        self.img_levels = img_levels
+        self.coord_type = coord_type
+        self.act_cfg = act_cfg
+        self.activate_out = activate_out
+        self.fuse_out = fuse_out
+        self.dropout_ratio = dropout_ratio
+        self.img_channels = img_channels
+        self.aligned = aligned
+        self.align_corners = align_corners
+        self.padding_mode = padding_mode
+
+        self.lateral_convs = None
+        if lateral_conv:
+            self.lateral_convs = nn.ModuleList()
+            for i in range(len(img_channels)):
+                l_conv = ConvModule(
+                    img_channels[i],
+                    mid_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=self.act_cfg,
+                    inplace=False)
+                self.lateral_convs.append(l_conv)
+            self.img_transform = nn.Sequential(
+                nn.Linear(mid_channels * len(img_channels), out_channels),
+                nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
+            )
+        else:
+            self.img_transform = nn.Sequential(
+                nn.Linear(sum(img_channels), out_channels),
+                nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
+            )
+        self.pts_transform = nn.Sequential(
+            nn.Linear(pts_channels, out_channels),
+            nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
+        )
+
+        if self.fuse_out:
+            self.fuse_conv = nn.Sequential(
+                nn.Linear(mid_channels, out_channels),
+                # For pts the BN is initialized differently by default
+                # TODO: check whether this is necessary
+                nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
+                nn.ReLU(inplace=False))
+
+        if init_cfg is None:
+            self.init_cfg = [
+                dict(type='Xavier', layer='Conv2d', distribution='uniform'),
+                dict(type='Xavier', layer='Linear', distribution='uniform')
+            ]
+
+    def forward(self, img_feats: List[Tensor], pts: List[Tensor],
+                pts_feats: Tensor, img_metas: List[dict]) -> Tensor:
+        """Forward function.
+
+        Args:
+            img_feats (List[Tensor]): Image features.
+            pts: (List[Tensor]): A batch of points with shape N x 3.
+            pts_feats (Tensor): A tensor consist of point features of the
+                total batch.
+            img_metas (List[dict]): Meta information of images.
+
+        Returns:
+            Tensor: Fused features of each point.
+        """
+        img_pts = self.obtain_mlvl_feats(img_feats, pts, img_metas)
+        img_pre_fuse = self.img_transform(img_pts)
+        if self.training and self.dropout_ratio > 0:
+            img_pre_fuse = F.dropout(img_pre_fuse, self.dropout_ratio)
+        pts_pre_fuse = self.pts_transform(pts_feats)
+
+        fuse_out = img_pre_fuse + pts_pre_fuse
+        if self.activate_out:
+            fuse_out = F.relu(fuse_out)
+        if self.fuse_out:
+            fuse_out = self.fuse_conv(fuse_out)
+
+        return fuse_out
+
+    def obtain_mlvl_feats(self, img_feats: List[Tensor], pts: List[Tensor],
+                          img_metas: List[dict]) -> Tensor:
+        """Obtain multi-level features for each point.
+
+        Args:
+            img_feats (List[Tensor]): Multi-scale image features produced
+                by image backbone in shape (N, C, H, W).
+            pts (List[Tensor]): Points of each sample.
+            img_metas (List[dict]): Meta information for each sample.
+
+        Returns:
+            Tensor: Corresponding image features of each point.
+        """
+        if self.lateral_convs is not None:
+            img_ins = [
+                lateral_conv(img_feats[i])
+                for i, lateral_conv in zip(self.img_levels, self.lateral_convs)
+            ]
+        else:
+            img_ins = img_feats
+        img_feats_per_point = []
+        # Sample multi-level features
+        for i in range(len(img_metas)):
+            mlvl_img_feats = []
+            for level in range(len(self.img_levels)):
+                mlvl_img_feats.append(
+                    self.sample_single(img_ins[level][i:i + 1], pts[i][:, :3],
+                                       img_metas[i]))
+            mlvl_img_feats = torch.cat(mlvl_img_feats, dim=-1)
+            img_feats_per_point.append(mlvl_img_feats)
+
+        img_pts = torch.cat(img_feats_per_point, dim=0)
+        return img_pts
+
+    def sample_single(self, img_feats: Tensor, pts: Tensor,
+                      img_meta: dict) -> Tensor:
+        """Sample features from single level image feature map.
+
+        Args:
+            img_feats (Tensor): Image feature map in shape (1, C, H, W).
+            pts (Tensor): Points of a single sample.
+            img_meta (dict): Meta information of the single sample.
+
+        Returns:
+            Tensor: Single level image features of each point.
+        """
+        # TODO: image transformation also extracted
+        img_scale_factor = (
+            pts.new_tensor(img_meta['scale_factor'][:2])
+            if 'scale_factor' in img_meta.keys() else 1)
+        img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False
+        img_crop_offset = (
+            pts.new_tensor(img_meta['img_crop_offset'])
+            if 'img_crop_offset' in img_meta.keys() else 0)
+        proj_mat = get_proj_mat_by_coord_type(img_meta, self.coord_type)
+        img_pts = point_sample(
+            img_meta=img_meta,
+            img_features=img_feats,
+            points=pts,
+            proj_mat=pts.new_tensor(proj_mat),
+            coord_type=self.coord_type,
+            img_scale_factor=img_scale_factor,
+            img_crop_offset=img_crop_offset,
+            img_flip=img_flip,
+            img_pad_shape=img_meta['input_shape'][:2],
+            img_shape=img_meta['img_shape'][:2],
+            aligned=self.aligned,
+            padding_mode=self.padding_mode,
+            align_corners=self.align_corners,
+        )
+        return img_pts
+
+
+def voxel_sample(voxel_features: Tensor,
+                 voxel_range: List[float],
+                 voxel_size: List[float],
+                 depth_samples: Tensor,
+                 proj_mat: Tensor,
+                 downsample_factor: int,
+                 img_scale_factor: Tensor,
+                 img_crop_offset: Tensor,
+                 img_flip: bool,
+                 img_pad_shape: Tuple[int],
+                 img_shape: Tuple[int],
+                 aligned: bool = True,
+                 padding_mode: str = 'zeros',
+                 align_corners: bool = True) -> Tensor:
+    """Obtain image features using points.
+
+    Args:
+        voxel_features (Tensor): 1 x C x Nx x Ny x Nz voxel features.
+        voxel_range (List[float]): The range of voxel features.
+        voxel_size (List[float]): The voxel size of voxel features.
+        depth_samples (Tensor): N depth samples in LiDAR coordinates.
+        proj_mat (Tensor): ORIGINAL LiDAR2img projection matrix for N views.
+        downsample_factor (int): The downsample factor in rescaling.
+        img_scale_factor (Tensor): Scale factor with shape of
+            (w_scale, h_scale).
+        img_crop_offset (Tensor): Crop offset used to crop image during
+            data augmentation with shape of (w_offset, h_offset).
+        img_flip (bool): Whether the image is flipped.
+        img_pad_shape (Tuple[int]): Int tuple indicates the h & w after
+            padding. This is necessary to obtain features in feature map.
+        img_shape (Tuple[int]): Int tuple indicates the h & w before padding
+            after scaling. This is necessary for flipping coordinates.
+        aligned (bool): Whether to use bilinear interpolation when
+            sampling image features for each point. Defaults to True.
+        padding_mode (str): Padding mode when padding values for
+            features of out-of-image points. Defaults to 'zeros'.
+        align_corners (bool): Whether to align corners when
+            sampling image features for each point. Defaults to True.
+
+    Returns:
+        Tensor: 1xCxDxHxW frustum features sampled from voxel features.
+    """
+    # construct frustum grid
+    device = voxel_features.device
+    h, w = img_pad_shape
+    h_out = round(h / downsample_factor)
+    w_out = round(w / downsample_factor)
+    ws = (torch.linspace(0, w_out - 1, w_out) * downsample_factor).to(device)
+    hs = (torch.linspace(0, h_out - 1, h_out) * downsample_factor).to(device)
+    depths = depth_samples[::downsample_factor]
+    num_depths = len(depths)
+    ds_3d, ys_3d, xs_3d = torch.meshgrid(depths, hs, ws)
+    # grid: (D, H_out, W_out, 3) -> (D*H_out*W_out, 3)
+    grid = torch.stack([xs_3d, ys_3d, ds_3d], dim=-1).view(-1, 3)
+    # recover the coordinates in the canonical space
+    # reverse order of augmentations: flip -> crop -> scale
+    if img_flip:
+        # by default we take it as horizontal flip
+        # use img_shape before padding for flip
+        ori_h, ori_w = img_shape
+        grid[:, 0] = ori_w - grid[:, 0]
+    grid[:, :2] += img_crop_offset
+    grid[:, :2] /= img_scale_factor
+    # grid3d: (D*H_out*W_out, 3) in LiDAR coordinate system
+    grid3d = points_img2cam(grid, proj_mat)
+    # convert the 3D point coordinates to voxel coordinates
+    voxel_range = torch.tensor(voxel_range).to(device).view(1, 6)
+    voxel_size = torch.tensor(voxel_size).to(device).view(1, 3)
+    # suppose the voxel grid is generated with AlignedAnchorGenerator
+    # -0.5 given each grid is located at the center of the grid
+    # TODO: study whether here needs -0.5
+    grid3d = (grid3d - voxel_range[:, :3]) / voxel_size - 0.5
+    grid_size = (voxel_range[:, 3:] - voxel_range[:, :3]) / voxel_size
+    # normalize grid3d to (-1, 1)
+    grid3d = grid3d / grid_size * 2 - 1
+    # (x, y, z) -> (z, y, x) for grid_sampling
+    grid3d = grid3d.view(1, num_depths, h_out, w_out, 3)[..., [2, 1, 0]]
+    # align_corner=True provides higher performance
+    mode = 'bilinear' if aligned else 'nearest'
+    frustum_features = F.grid_sample(
+        voxel_features,
+        grid3d,
+        mode=mode,
+        padding_mode=padding_mode,
+        align_corners=align_corners)  # 1xCxDxHxW feats
+
+    return frustum_features
diff --git a/mmde/mmdet3d/models/layers/fusion_layers/vote_fusion.py b/mmde/mmdet3d/models/layers/fusion_layers/vote_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9b408401e9093b995fecc31a992d730101c479e
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/fusion_layers/vote_fusion.py
@@ -0,0 +1,207 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import points_cam2img
+from . import apply_3d_transformation, bbox_2d_transform, coord_2d_transform
+
+EPS = 1e-6
+
+
+@MODELS.register_module()
+class VoteFusion(nn.Module):
+    """Fuse 2d features from 3d seeds.
+
+    Args:
+        num_classes (int): Number of classes.
+        max_imvote_per_pixel (int): Max number of imvotes.
+    """
+
+    def __init__(self,
+                 num_classes: int = 10,
+                 max_imvote_per_pixel: int = 3) -> None:
+        super(VoteFusion, self).__init__()
+        self.num_classes = num_classes
+        self.max_imvote_per_pixel = max_imvote_per_pixel
+
+    def forward(self, imgs: List[Tensor], bboxes_2d_rescaled: List[Tensor],
+                seeds_3d_depth: List[Tensor],
+                img_metas: List[dict]) -> Tuple[Tensor]:
+        """Forward function.
+
+        Args:
+            imgs (List[Tensor]): Image features.
+            bboxes_2d_rescaled (List[Tensor]): 2D bboxes.
+            seeds_3d_depth (List[Tensor]): 3D seeds.
+            img_metas (List[dict]): Meta information of images.
+
+        Returns:
+            Tuple[Tensor]:
+
+                - img_features: Concatenated cues of each point.
+                - masks: Validity mask of each feature.
+        """
+        img_features = []
+        masks = []
+        for i, data in enumerate(
+                zip(imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas)):
+            img, bbox_2d_rescaled, seed_3d_depth, img_meta = data
+            bbox_num = bbox_2d_rescaled.shape[0]
+            seed_num = seed_3d_depth.shape[0]
+
+            img_shape = img_meta['img_shape']
+            # first reverse the data transformations
+            xyz_depth = apply_3d_transformation(
+                seed_3d_depth, 'DEPTH', img_meta, reverse=True)
+
+            # project points from depth to image
+            depth2img = xyz_depth.new_tensor(img_meta['depth2img'])
+            uvz_origin = points_cam2img(xyz_depth, depth2img, True)
+            z_cam = uvz_origin[..., 2]
+            uv_origin = (uvz_origin[..., :2] - 1).round()
+
+            # rescale 2d coordinates and bboxes
+            uv_rescaled = coord_2d_transform(img_meta, uv_origin, True)
+            bbox_2d_origin = bbox_2d_transform(img_meta, bbox_2d_rescaled,
+                                               False)
+
+            if bbox_num == 0:
+                imvote_num = seed_num * self.max_imvote_per_pixel
+
+                # use zero features
+                two_cues = torch.zeros((15, imvote_num),
+                                       device=seed_3d_depth.device)
+                mask_zero = torch.zeros(
+                    imvote_num - seed_num, device=seed_3d_depth.device).bool()
+                mask_one = torch.ones(
+                    seed_num, device=seed_3d_depth.device).bool()
+                mask = torch.cat([mask_one, mask_zero], dim=0)
+            else:
+                # expand bboxes and seeds
+                bbox_expanded = bbox_2d_origin.view(1, bbox_num, -1).expand(
+                    seed_num, -1, -1)
+                seed_2d_expanded = uv_origin.view(seed_num, 1,
+                                                  -1).expand(-1, bbox_num, -1)
+                seed_2d_expanded_x, seed_2d_expanded_y = \
+                    seed_2d_expanded.split(1, dim=-1)
+
+                bbox_expanded_l, bbox_expanded_t, bbox_expanded_r, \
+                    bbox_expanded_b, bbox_expanded_conf, bbox_expanded_cls = \
+                    bbox_expanded.split(1, dim=-1)
+                bbox_expanded_midx = (bbox_expanded_l + bbox_expanded_r) / 2
+                bbox_expanded_midy = (bbox_expanded_t + bbox_expanded_b) / 2
+
+                seed_2d_in_bbox_x = (seed_2d_expanded_x > bbox_expanded_l) * \
+                    (seed_2d_expanded_x < bbox_expanded_r)
+                seed_2d_in_bbox_y = (seed_2d_expanded_y > bbox_expanded_t) * \
+                    (seed_2d_expanded_y < bbox_expanded_b)
+                seed_2d_in_bbox = seed_2d_in_bbox_x * seed_2d_in_bbox_y
+
+                # semantic cues, dim=class_num
+                sem_cue = torch.zeros_like(bbox_expanded_conf).expand(
+                    -1, -1, self.num_classes)
+                sem_cue = sem_cue.scatter(-1, bbox_expanded_cls.long(),
+                                          bbox_expanded_conf)
+
+                # bbox center - uv
+                delta_u = bbox_expanded_midx - seed_2d_expanded_x
+                delta_v = bbox_expanded_midy - seed_2d_expanded_y
+
+                seed_3d_expanded = seed_3d_depth.view(seed_num, 1, -1).expand(
+                    -1, bbox_num, -1)
+
+                z_cam = z_cam.view(seed_num, 1, 1).expand(-1, bbox_num, -1)
+                imvote = torch.cat(
+                    [delta_u, delta_v,
+                     torch.zeros_like(delta_v)], dim=-1).view(-1, 3)
+                imvote = imvote * z_cam.reshape(-1, 1)
+                imvote = imvote @ torch.inverse(depth2img.t())
+
+                # apply transformation to lifted imvotes
+                imvote = apply_3d_transformation(
+                    imvote, 'DEPTH', img_meta, reverse=False)
+
+                seed_3d_expanded = seed_3d_expanded.reshape(imvote.shape)
+
+                # ray angle
+                ray_angle = seed_3d_expanded + imvote
+                ray_angle /= torch.sqrt(torch.sum(ray_angle**2, -1) +
+                                        EPS).unsqueeze(-1)
+
+                # imvote lifted to 3d
+                xz = ray_angle[:, [0, 2]] / (ray_angle[:, [1]] + EPS) \
+                    * seed_3d_expanded[:, [1]] - seed_3d_expanded[:, [0, 2]]
+
+                # geometric cues, dim=5
+                geo_cue = torch.cat([xz, ray_angle],
+                                    dim=-1).view(seed_num, -1, 5)
+
+                two_cues = torch.cat([geo_cue, sem_cue], dim=-1)
+                # mask to 0 if seed not in bbox
+                two_cues = two_cues * seed_2d_in_bbox.float()
+
+                feature_size = two_cues.shape[-1]
+                # if bbox number is too small, append zeros
+                if bbox_num < self.max_imvote_per_pixel:
+                    append_num = self.max_imvote_per_pixel - bbox_num
+                    append_zeros = torch.zeros(
+                        (seed_num, append_num, 1),
+                        device=seed_2d_in_bbox.device).bool()
+                    seed_2d_in_bbox = torch.cat(
+                        [seed_2d_in_bbox, append_zeros], dim=1)
+                    append_zeros = torch.zeros(
+                        (seed_num, append_num, feature_size),
+                        device=two_cues.device)
+                    two_cues = torch.cat([two_cues, append_zeros], dim=1)
+                    append_zeros = torch.zeros((seed_num, append_num, 1),
+                                               device=two_cues.device)
+                    bbox_expanded_conf = torch.cat(
+                        [bbox_expanded_conf, append_zeros], dim=1)
+
+                # sort the valid seed-bbox pair according to confidence
+                pair_score = seed_2d_in_bbox.float() + bbox_expanded_conf
+                # and find the largests
+                mask, indices = pair_score.topk(
+                    self.max_imvote_per_pixel,
+                    dim=1,
+                    largest=True,
+                    sorted=True)
+
+                indices_img = indices.expand(-1, -1, feature_size)
+                two_cues = two_cues.gather(dim=1, index=indices_img)
+                two_cues = two_cues.transpose(1, 0)
+                two_cues = two_cues.reshape(-1, feature_size).transpose(
+                    1, 0).contiguous()
+
+                # since conf is ~ (0, 1), floor gives us validity
+                mask = mask.floor().int()
+                mask = mask.transpose(1, 0).reshape(-1).bool()
+
+            # clear the padding
+            img = img[:, :img_shape[0], :img_shape[1]]
+            img_flatten = img.reshape(3, -1).float()
+            img_flatten /= 255.
+
+            # take the normalized pixel value as texture cue
+            uv_rescaled[:, 0] = torch.clamp(uv_rescaled[:, 0].round(), 0,
+                                            img_shape[1] - 1)
+            uv_rescaled[:, 1] = torch.clamp(uv_rescaled[:, 1].round(), 0,
+                                            img_shape[0] - 1)
+            uv_flatten = uv_rescaled[:, 1].round() * \
+                img_shape[1] + uv_rescaled[:, 0].round()
+            uv_expanded = uv_flatten.unsqueeze(0).expand(3, -1).long()
+            txt_cue = torch.gather(img_flatten, dim=-1, index=uv_expanded)
+            txt_cue = txt_cue.unsqueeze(1).expand(-1,
+                                                  self.max_imvote_per_pixel,
+                                                  -1).reshape(3, -1)
+
+            # append texture cue
+            img_feature = torch.cat([two_cues, txt_cue], dim=0)
+            img_features.append(img_feature)
+            masks.append(mask)
+
+        return torch.stack(img_features, 0), torch.stack(masks, 0)
diff --git a/mmde/mmdet3d/models/layers/minkowski_engine_block.py b/mmde/mmdet3d/models/layers/minkowski_engine_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a54ee2de53816ea1a6bcc09940fb66a7faae1d8
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/minkowski_engine_block.py
@@ -0,0 +1,166 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+from mmcv.cnn import build_activation_layer, build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule
+from mmengine.registry import MODELS
+from torch import nn
+
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+
+try:
+    from MinkowskiEngine import (MinkowskiBatchNorm, MinkowskiConvolution,
+                                 MinkowskiConvolutionTranspose, MinkowskiReLU,
+                                 MinkowskiSyncBatchNorm, SparseTensor)
+    from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck
+except ImportError:
+    SparseTensor = None
+    from mmcv.cnn.resnet import BasicBlock, Bottleneck
+    IS_MINKOWSKI_ENGINE_AVAILABLE = False
+else:
+    MODELS._register_module(MinkowskiConvolution, 'MinkowskiConvNd')
+    MODELS._register_module(MinkowskiConvolutionTranspose,
+                            'MinkowskiConvNdTranspose')
+    MODELS._register_module(MinkowskiBatchNorm, 'MinkowskiBN')
+    MODELS._register_module(MinkowskiSyncBatchNorm, 'MinkowskiSyncBN')
+    MODELS._register_module(MinkowskiReLU, 'MinkowskiReLU')
+    IS_MINKOWSKI_ENGINE_AVAILABLE = True
+
+
+class MinkowskiConvModule(BaseModule):
+    """A minkowski engine conv block that bundles conv/norm/activation layers.
+
+    Args:
+        in_channels (int): In channels of block.
+        out_channels (int): Out channels of block.
+        kernel_size (int or Tuple[int]): Kernel_size of block.
+        stride (int or Tuple[int]): Stride of the first block. Defaults to 1.
+        dilation (int): Dilation of block. Defaults to 1.
+        bias (bool): Whether to use bias in conv. Defaults to False.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config of conv layer.
+            Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): The config of normalization.
+            Defaults to dict(type='MinkowskiBN').
+        act_cfg (:obj:`ConfigDict` or dict): The config of activation.
+            Defaults to dict(type='MinkowskiReLU', inplace=True).
+        init_cfg (:obj:`ConfigDict` or dict, optional): Initialization config.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Tuple[int, int, int]],
+                 stride: Union[int, Tuple[int, int, int]] = 1,
+                 dilation: int = 1,
+                 bias: bool = False,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='MinkowskiBN'),
+                 act_cfg: ConfigType = dict(
+                     type='MinkowskiReLU', inplace=True),
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        super().__init__(init_cfg)
+        layers = []
+        if conv_cfg is None:
+            conv_cfg = dict(type='MinkowskiConvNd')
+        conv = build_conv_layer(
+            conv_cfg,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            dilation,
+            bias,
+            dimension=3)
+        layers.append(conv)
+
+        if norm_cfg is not None:
+            _, norm = build_norm_layer(norm_cfg, out_channels)
+            layers.append(norm)
+        if act_cfg is not None:
+            activation = build_activation_layer(act_cfg)
+            layers.append(activation)
+        self.net = nn.Sequential(*layers)
+
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        out = self.net(x)
+        return out
+
+
+class MinkowskiBasicBlock(BasicBlock, BaseModule):
+    """A wrapper of minkowski engine basic block. It inherits from mmengine's
+    `BaseModule` and allows additional keyword arguments.
+
+    Args:
+        inplanes (int): In channels of block.
+        planes (int): Out channels of block.
+        stride (int or Tuple[int]): Stride of the first conv. Defaults to 1.
+        dilation (int): Dilation of block. Defaults to 1.
+        downsample (nn.Module, optional): Residual branch conv module if
+            necessary. Defaults to None.
+        bn_momentum (float): Momentum of batch norm layer. Defaults to 0.1.
+        dimension (int): Dimension of minkowski convolution. Defaults to 3.
+        init_cfg (:obj:`ConfigDict` or dict, optional): Initialization config.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 inplanes: int,
+                 planes: int,
+                 stride: int = 1,
+                 dilation: int = 1,
+                 downsample: Optional[nn.Module] = None,
+                 bn_momentum: float = 0.1,
+                 dimension: int = 3,
+                 init_cfg: OptConfigType = None,
+                 **kwargs):
+        BaseModule.__init__(self, init_cfg)
+        BasicBlock.__init__(
+            self,
+            inplanes,
+            planes,
+            stride=stride,
+            dilation=dilation,
+            downsample=downsample,
+            bn_momentum=bn_momentum,
+            dimension=dimension)
+
+
+class MinkowskiBottleneck(Bottleneck, BaseModule):
+    """A wrapper of minkowski engine bottleneck block. It inherits from
+    mmengine's `BaseModule` and allows additional keyword arguments.
+
+    Args:
+        inplanes (int): In channels of block.
+        planes (int): Out channels of block.
+        stride (int or Tuple[int]): Stride of the second conv. Defaults to 1.
+        dilation (int): Dilation of block. Defaults to 1.
+        downsample (nn.Module, optional): Residual branch conv module if
+            necessary. Defaults to None.
+        bn_momentum (float): Momentum of batch norm layer. Defaults to 0.1.
+        dimension (int): Dimension of minkowski convolution. Defaults to 3.
+        init_cfg (:obj:`ConfigDict` or dict, optional): Initialization config.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 inplanes: int,
+                 planes: int,
+                 stride: int = 1,
+                 dilation: int = 1,
+                 downsample: Optional[nn.Module] = None,
+                 bn_momentum: float = 0.1,
+                 dimension: int = 3,
+                 init_cfg: OptConfigType = None,
+                 **kwargs):
+        BaseModule.__init__(self, init_cfg)
+        Bottleneck.__init__(
+            self,
+            inplanes,
+            planes,
+            stride=stride,
+            dilation=dilation,
+            downsample=downsample,
+            bn_momentum=bn_momentum,
+            dimension=dimension)
diff --git a/mmde/mmdet3d/models/layers/mlp.py b/mmde/mmdet3d/models/layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..837d1f1a71540568acd8b9dc62d1822cf93b54de
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/mlp.py
@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.utils import ConfigType, OptMultiConfig
+
+
+class MLP(BaseModule):
+    """A simple MLP module.
+
+    Pass features (B, C, N) through an MLP.
+
+    Args:
+        in_channels (int): Number of channels of input features.
+            Defaults to 18.
+        conv_channels (Tuple[int]): Out channels of the convolution.
+            Defaults to (256, 256).
+        conv_cfg (:obj:`ConfigDict` or dict): Config dict for convolution
+            layer. Defaults to dict(type='Conv1d').
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN1d').
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='ReLU').
+        init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
+            optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channel: int = 18,
+                 conv_channels: Tuple[int] = (256, 256),
+                 conv_cfg: ConfigType = dict(type='Conv1d'),
+                 norm_cfg: ConfigType = dict(type='BN1d'),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(MLP, self).__init__(init_cfg=init_cfg)
+        self.mlp = nn.Sequential()
+        prev_channels = in_channel
+        for i, conv_channel in enumerate(conv_channels):
+            self.mlp.add_module(
+                f'layer{i}',
+                ConvModule(
+                    prev_channels,
+                    conv_channels[i],
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    bias=True,
+                    inplace=True))
+            prev_channels = conv_channels[i]
+
+    def forward(self, img_features: Tensor) -> Tensor:
+        return self.mlp(img_features)
diff --git a/mmde/mmdet3d/models/layers/norm.py b/mmde/mmdet3d/models/layers/norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a8527872336fc4bf2c1da29bce922a553fc75c3
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/norm.py
@@ -0,0 +1,150 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmengine.registry import MODELS
+from torch import Tensor
+from torch import distributed as dist
+from torch import nn as nn
+from torch.autograd.function import Function
+
+
+class AllReduce(Function):
+
+    @staticmethod
+    def forward(ctx, input: Tensor) -> Tensor:
+        input_list = [
+            torch.zeros_like(input) for k in range(dist.get_world_size())
+        ]
+        # Use allgather instead of allreduce in-place operations is unreliable
+        dist.all_gather(input_list, input, async_op=False)
+        inputs = torch.stack(input_list, dim=0)
+        return torch.sum(inputs, dim=0)
+
+    @staticmethod
+    def backward(ctx, grad_output: Tensor) -> Tensor:
+        dist.all_reduce(grad_output, async_op=False)
+        return grad_output
+
+
+@MODELS.register_module('naiveSyncBN1d')
+class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
+    """Synchronized Batch Normalization for 3D Tensors.
+
+    Note:
+        This implementation is modified from
+        https://github.com/facebookresearch/detectron2/
+
+        `torch.nn.SyncBatchNorm` has known unknown bugs.
+        It produces significantly worse AP (and sometimes goes NaN)
+        when the batch size on each worker is quite different
+        (e.g., when scale augmentation is used).
+        In 3D detection, different workers has points of different shapes,
+        which also cause instability.
+
+        Use this implementation before `nn.SyncBatchNorm` is fixed.
+        It is slower than `nn.SyncBatchNorm`.
+    """
+
+    def __init__(self, *args: list, **kwargs: dict) -> None:
+        super(NaiveSyncBatchNorm1d, self).__init__(*args, **kwargs)
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Args:
+            input (Tensor): Has shape (N, C) or (N, C, L), where N is
+                the batch size, C is the number of features or
+                channels, and L is the sequence length
+
+        Returns:
+            Tensor: Has shape (N, C) or (N, C, L), same shape as input.
+        """
+        using_dist = dist.is_available() and dist.is_initialized()
+        if (not using_dist) or dist.get_world_size() == 1 \
+                or not self.training:
+            return super().forward(input)
+        assert input.shape[0] > 0, 'SyncBN does not support empty inputs'
+        is_two_dim = input.dim() == 2
+        if is_two_dim:
+            input = input.unsqueeze(2)
+
+        C = input.shape[1]
+        mean = torch.mean(input, dim=[0, 2])
+        meansqr = torch.mean(input * input, dim=[0, 2])
+
+        vec = torch.cat([mean, meansqr], dim=0)
+        vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
+
+        mean, meansqr = torch.split(vec, C)
+        var = meansqr - mean * mean
+        self.running_mean += self.momentum * (
+            mean.detach() - self.running_mean)
+        self.running_var += self.momentum * (var.detach() - self.running_var)
+
+        invstd = torch.rsqrt(var + self.eps)
+        scale = self.weight * invstd
+        bias = self.bias - mean * scale
+        scale = scale.reshape(1, -1, 1)
+        bias = bias.reshape(1, -1, 1)
+        output = input * scale + bias
+        if is_two_dim:
+            output = output.squeeze(2)
+        return output
+
+
+@MODELS.register_module('naiveSyncBN2d')
+class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
+    """Synchronized Batch Normalization for 4D Tensors.
+
+    Note:
+        This implementation is modified from
+        https://github.com/facebookresearch/detectron2/
+
+        `torch.nn.SyncBatchNorm` has known unknown bugs.
+        It produces significantly worse AP (and sometimes goes NaN)
+        when the batch size on each worker is quite different
+        (e.g., when scale augmentation is used).
+        This phenomenon also occurs when the multi-modality feature fusion
+        modules of multi-modality detectors use SyncBN.
+
+        Use this implementation before `nn.SyncBatchNorm` is fixed.
+        It is slower than `nn.SyncBatchNorm`.
+    """
+
+    def __init__(self, *args: list, **kwargs: dict) -> None:
+        super(NaiveSyncBatchNorm2d, self).__init__(*args, **kwargs)
+
+    def forward(self, input: Tensor) -> Tensor:
+        """
+        Args:
+            Input (Tensor): Feature has shape (N, C, H, W).
+
+        Returns:
+            Tensor: Has shape (N, C, H, W), same shape as input.
+        """
+        assert input.dtype == torch.float32, \
+            f'input should be in float32 type, got {input.dtype}'
+        using_dist = dist.is_available() and dist.is_initialized()
+        if (not using_dist) or \
+                dist.get_world_size() == 1 or \
+                not self.training:
+            return super().forward(input)
+
+        assert input.shape[0] > 0, 'SyncBN does not support empty inputs'
+        C = input.shape[1]
+        mean = torch.mean(input, dim=[0, 2, 3])
+        meansqr = torch.mean(input * input, dim=[0, 2, 3])
+
+        vec = torch.cat([mean, meansqr], dim=0)
+        vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
+
+        mean, meansqr = torch.split(vec, C)
+        var = meansqr - mean * mean
+        self.running_mean += self.momentum * (
+            mean.detach() - self.running_mean)
+        self.running_var += self.momentum * (var.detach() - self.running_var)
+
+        invstd = torch.rsqrt(var + self.eps)
+        scale = self.weight * invstd
+        bias = self.bias - mean * scale
+        scale = scale.reshape(1, -1, 1, 1)
+        bias = bias.reshape(1, -1, 1, 1)
+        return input * scale + bias
diff --git a/mmde/mmdet3d/models/layers/paconv/__init__.py b/mmde/mmdet3d/models/layers/paconv/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d71c7660fba930deb46b0d95ba52628f1321c86a
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/paconv/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .paconv import PAConv, PAConvCUDA
+
+__all__ = ['PAConv', 'PAConvCUDA']
diff --git a/mmde/mmdet3d/models/layers/paconv/paconv.py b/mmde/mmdet3d/models/layers/paconv/paconv.py
new file mode 100644
index 0000000000000000000000000000000000000000..04aaa2a37870a8850f44e68a22374cb4d4538930
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/paconv/paconv.py
@@ -0,0 +1,402 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import List, Tuple, Union
+
+import torch
+from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer
+from mmcv.ops import assign_score_withk as assign_score_cuda
+from mmengine.model import constant_init
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.utils import ConfigType
+from .utils import assign_kernel_withoutk, assign_score, calc_euclidian_dist
+
+
+class ScoreNet(nn.Module):
+    r"""ScoreNet that outputs coefficient scores to assemble kernel weights in
+    the weight bank according to the relative position of point pairs.
+
+    Args:
+        mlp_channels (List[int]): Hidden unit sizes of SharedMLP layers.
+        last_bn (bool): Whether to use BN on the last output of mlps.
+            Defaults to False.
+        score_norm (str): Normalization function of output scores.
+            Can be 'softmax', 'sigmoid' or 'identity'. Defaults to 'softmax'.
+        temp_factor (float): Temperature factor to scale the output
+            scores before softmax. Defaults to 1.0.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN2d').
+        bias (bool or str): If specified as `auto`, it will be decided by
+            `norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
+            otherwise False. Defaults to 'auto'.
+
+    Note:
+        The official code applies xavier_init to all Conv layers in ScoreNet,
+        see `PAConv <https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg
+        /model/pointnet2/paconv.py#L105>`_. However in our experiments, we
+        did not find much difference in applying such xavier initialization
+        or not. So we neglect this initialization in our implementation.
+    """
+
+    def __init__(self,
+                 mlp_channels: List[int],
+                 last_bn: bool = False,
+                 score_norm: str = 'softmax',
+                 temp_factor: float = 1.0,
+                 norm_cfg: ConfigType = dict(type='BN2d'),
+                 bias: Union[bool, str] = 'auto') -> None:
+        super(ScoreNet, self).__init__()
+
+        assert score_norm in ['softmax', 'sigmoid', 'identity'], \
+            f'unsupported score_norm function {score_norm}'
+
+        self.score_norm = score_norm
+        self.temp_factor = temp_factor
+
+        self.mlps = nn.Sequential()
+        for i in range(len(mlp_channels) - 2):
+            self.mlps.add_module(
+                f'layer{i}',
+                ConvModule(
+                    mlp_channels[i],
+                    mlp_channels[i + 1],
+                    kernel_size=(1, 1),
+                    stride=(1, 1),
+                    conv_cfg=dict(type='Conv2d'),
+                    norm_cfg=norm_cfg,
+                    bias=bias))
+
+        # for the last mlp that outputs scores, no relu and possibly no bn
+        i = len(mlp_channels) - 2
+        self.mlps.add_module(
+            f'layer{i}',
+            ConvModule(
+                mlp_channels[i],
+                mlp_channels[i + 1],
+                kernel_size=(1, 1),
+                stride=(1, 1),
+                conv_cfg=dict(type='Conv2d'),
+                norm_cfg=norm_cfg if last_bn else None,
+                act_cfg=None,
+                bias=bias))
+
+    def forward(self, xyz_features: Tensor) -> Tensor:
+        """Forward.
+
+        Args:
+            xyz_features (Tensor): (B, C, N, K) Features constructed from xyz
+                coordinates of point pairs. May contain relative positions,
+                Euclidean distance, etc.
+
+        Returns:
+            Tensor: (B, N, K, M) Predicted scores for `M` kernels.
+        """
+        scores = self.mlps(xyz_features)  # (B, M, N, K)
+
+        # perform score normalization
+        if self.score_norm == 'softmax':
+            scores = F.softmax(scores / self.temp_factor, dim=1)
+        elif self.score_norm == 'sigmoid':
+            scores = torch.sigmoid(scores / self.temp_factor)
+        else:  # 'identity'
+            scores = scores
+
+        scores = scores.permute(0, 2, 3, 1)  # (B, N, K, M)
+
+        return scores
+
+
+class PAConv(nn.Module):
+    """Non-CUDA version of PAConv.
+
+    PAConv stores a trainable weight bank containing several kernel weights.
+    Given input points and features, it computes coefficient scores to assemble
+    those kernels to form conv kernels, and then runs convolution on the input.
+
+    Args:
+        in_channels (int): Input channels of point features.
+        out_channels (int): Output channels of point features.
+        num_kernels (int): Number of kernel weights in the weight bank.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN2d', momentum=0.1).
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='ReLU', inplace=True).
+        scorenet_input (str): Type of input to ScoreNet.
+            Can be 'identity', 'w_neighbor' or 'w_neighbor_dist'.
+            Defaults to 'w_neighbor_dist'.
+        weight_bank_init (str): Init method of weight bank kernels.
+            Can be 'kaiming' or 'xavier'. Defaults to 'kaiming'.
+        kernel_input (str): Input features to be multiplied with kernel
+            weights. Can be 'identity' or 'w_neighbor'.
+            Defaults to 'w_neighbor'.
+        scorenet_cfg (dict): Config of the ScoreNet module, which may contain
+            the following keys and values:
+
+            - mlp_channels (List[int]): Hidden units of MLPs.
+            - score_norm (str): Normalization function of output scores.
+              Can be 'softmax', 'sigmoid' or 'identity'.
+            - temp_factor (float): Temperature factor to scale the output
+              scores before softmax.
+            - last_bn (bool): Whether to use BN on the last output of mlps.
+            Defaults to dict(mlp_channels=[16, 16, 16],
+                             score_norm='softmax',
+                             temp_factor=1.0,
+                             last_bn=False).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_kernels: int,
+        norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
+        act_cfg: ConfigType = dict(type='ReLU', inplace=True),
+        scorenet_input: str = 'w_neighbor_dist',
+        weight_bank_init: str = 'kaiming',
+        kernel_input: str = 'w_neighbor',
+        scorenet_cfg: dict = dict(
+            mlp_channels=[16, 16, 16],
+            score_norm='softmax',
+            temp_factor=1.0,
+            last_bn=False)
+    ) -> None:
+        super(PAConv, self).__init__()
+
+        # determine weight kernel size according to used features
+        if kernel_input == 'identity':
+            # only use grouped_features
+            kernel_mul = 1
+        elif kernel_input == 'w_neighbor':
+            # concat of (grouped_features - center_features, grouped_features)
+            kernel_mul = 2
+        else:
+            raise NotImplementedError(
+                f'unsupported kernel_input {kernel_input}')
+        self.kernel_input = kernel_input
+        in_channels = kernel_mul * in_channels
+
+        # determine mlp channels in ScoreNet according to used xyz features
+        if scorenet_input == 'identity':
+            # only use relative position (grouped_xyz - center_xyz)
+            self.scorenet_in_channels = 3
+        elif scorenet_input == 'w_neighbor':
+            # (grouped_xyz - center_xyz, grouped_xyz)
+            self.scorenet_in_channels = 6
+        elif scorenet_input == 'w_neighbor_dist':
+            # (center_xyz, grouped_xyz - center_xyz, Euclidean distance)
+            self.scorenet_in_channels = 7
+        else:
+            raise NotImplementedError(
+                f'unsupported scorenet_input {scorenet_input}')
+        self.scorenet_input = scorenet_input
+
+        # construct kernel weights in weight bank
+        # self.weight_bank is of shape [C, num_kernels * out_c]
+        # where C can be in_c or (2 * in_c)
+        if weight_bank_init == 'kaiming':
+            weight_init = nn.init.kaiming_normal_
+        elif weight_bank_init == 'xavier':
+            weight_init = nn.init.xavier_normal_
+        else:
+            raise NotImplementedError(
+                f'unsupported weight bank init method {weight_bank_init}')
+
+        self.num_kernels = num_kernels  # the parameter `m` in the paper
+        weight_bank = weight_init(
+            torch.empty(self.num_kernels, in_channels, out_channels))
+        weight_bank = weight_bank.permute(1, 0, 2).reshape(
+            in_channels, self.num_kernels * out_channels).contiguous()
+        self.weight_bank = nn.Parameter(weight_bank, requires_grad=True)
+
+        # construct ScoreNet
+        scorenet_cfg_ = copy.deepcopy(scorenet_cfg)
+        scorenet_cfg_['mlp_channels'].insert(0, self.scorenet_in_channels)
+        scorenet_cfg_['mlp_channels'].append(self.num_kernels)
+        self.scorenet = ScoreNet(**scorenet_cfg_)
+
+        self.bn = build_norm_layer(norm_cfg, out_channels)[1] if \
+            norm_cfg is not None else None
+        self.activate = build_activation_layer(act_cfg) if \
+            act_cfg is not None else None
+
+        # set some basic attributes of Conv layers
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.init_weights()
+
+    def init_weights(self) -> None:
+        """Initialize weights of shared MLP layers and BN layers."""
+        if self.bn is not None:
+            constant_init(self.bn, val=1, bias=0)
+
+    def _prepare_scorenet_input(self, points_xyz: Tensor) -> Tensor:
+        """Prepare input point pairs features for self.ScoreNet.
+
+        Args:
+            points_xyz (Tensor): (B, 3, npoint, K) Coordinates of the
+                grouped points.
+
+        Returns:
+            Tensor: (B, C, npoint, K) The generated features per point pair.
+        """
+        B, _, npoint, K = points_xyz.size()
+        center_xyz = points_xyz[..., :1].repeat(1, 1, 1, K)
+        xyz_diff = points_xyz - center_xyz  # [B, 3, npoint, K]
+        if self.scorenet_input == 'identity':
+            xyz_features = xyz_diff
+        elif self.scorenet_input == 'w_neighbor':
+            xyz_features = torch.cat((xyz_diff, points_xyz), dim=1)
+        else:  # w_neighbor_dist
+            euclidian_dist = calc_euclidian_dist(
+                center_xyz.permute(0, 2, 3, 1).reshape(B * npoint * K, 3),
+                points_xyz.permute(0, 2, 3, 1).reshape(B * npoint * K, 3)).\
+                    reshape(B, 1, npoint, K)
+            xyz_features = torch.cat((center_xyz, xyz_diff, euclidian_dist),
+                                     dim=1)
+        return xyz_features
+
+    def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]:
+        """Forward.
+
+        Args:
+            inputs (Tuple[Tensor]):
+
+                - features (Tensor): (B, in_c, npoint, K)
+                  Features of the queried points.
+                - points_xyz (Tensor): (B, 3, npoint, K)
+                  Coordinates of the grouped points.
+
+        Returns:
+            Tuple[Tensor]:
+
+                - new_features: (B, out_c, npoint, K) Features after PAConv.
+                - points_xyz: Same as input.
+        """
+        features, points_xyz = inputs
+        B, _, npoint, K = features.size()
+
+        if self.kernel_input == 'w_neighbor':
+            center_features = features[..., :1].repeat(1, 1, 1, K)
+            features_diff = features - center_features
+            # to (B, 2 * in_c, npoint, K)
+            features = torch.cat((features_diff, features), dim=1)
+
+        # prepare features for between each point and its grouping center
+        xyz_features = self._prepare_scorenet_input(points_xyz)
+
+        # scores to assemble kernel weights
+        scores = self.scorenet(xyz_features)  # [B, npoint, K, m]
+
+        # first compute out features over all kernels
+        # features is [B, C, npoint, K], weight_bank is [C, m * out_c]
+        new_features = torch.matmul(
+            features.permute(0, 2, 3, 1),
+            self.weight_bank).view(B, npoint, K, self.num_kernels,
+                                   -1)  # [B, npoint, K, m, out_c]
+
+        # then aggregate using scores
+        new_features = assign_score(scores, new_features)
+        # to [B, out_c, npoint, K]
+        new_features = new_features.permute(0, 3, 1, 2).contiguous()
+
+        if self.bn is not None:
+            new_features = self.bn(new_features)
+        if self.activate is not None:
+            new_features = self.activate(new_features)
+
+        # in order to keep input output consistency
+        # so that we can wrap PAConv in Sequential
+        return (new_features, points_xyz)
+
+
+class PAConvCUDA(PAConv):
+    """CUDA version of PAConv that implements a cuda op to efficiently perform
+    kernel assembling.
+
+    Different from vanilla PAConv, the input features of this function is not
+    grouped by centers. Instead, they will be queried on-the-fly by the
+    additional input `points_idx`. This avoids the large intermediate matrix.
+    See the `paper <https://arxiv.org/pdf/2103.14635.pdf>`_ appendix Sec. D for
+    more detailed descriptions.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_kernels: int,
+        norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
+        act_cfg: ConfigType = dict(type='ReLU', inplace=True),
+        scorenet_input: str = 'w_neighbor_dist',
+        weight_bank_init: str = 'kaiming',
+        kernel_input: str = 'w_neighbor',
+        scorenet_cfg: dict = dict(
+            mlp_channels=[8, 16, 16],
+            score_norm='softmax',
+            temp_factor=1.0,
+            last_bn=False)
+    ) -> None:
+        super(PAConvCUDA, self).__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            num_kernels=num_kernels,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            scorenet_input=scorenet_input,
+            weight_bank_init=weight_bank_init,
+            kernel_input=kernel_input,
+            scorenet_cfg=scorenet_cfg)
+
+        assert self.kernel_input == 'w_neighbor', \
+            'CUDA implemented PAConv only supports w_neighbor kernel_input'
+
+    def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]:
+        """Forward.
+
+        Args:
+            inputs (Tuple[Tensor]):
+
+                - features (Tensor): (B, in_c, N)
+                  Features of all points in the current point cloud.
+                  Different from non-CUDA version PAConv, here the features
+                  are not grouped by each center to form a K dim.
+                - points_xyz (Tensor): (B, 3, npoint, K)
+                  Coordinates of the grouped points.
+                - points_idx (Tensor): (B, npoint, K)
+                  Index of the grouped points.
+
+        Returns:
+            Tuple[Tensor]:
+
+                - new_features: (B, out_c, npoint, K) Features after PAConv.
+                - points_xyz: Same as input.
+                - points_idx: Same as input.
+        """
+        features, points_xyz, points_idx = inputs
+
+        # prepare features for between each point and its grouping center
+        xyz_features = self._prepare_scorenet_input(points_xyz)
+
+        # scores to assemble kernel weights
+        scores = self.scorenet(xyz_features)  # [B, npoint, K, m]
+
+        # pre-compute features for points and centers separately
+        # features is [B, in_c, N], weight_bank is [C, m * out_dim]
+        point_feat, center_feat = assign_kernel_withoutk(
+            features, self.weight_bank, self.num_kernels)
+
+        # aggregate features using custom cuda op
+        new_features = assign_score_cuda(
+            scores, point_feat, center_feat, points_idx,
+            'sum').contiguous()  # [B, out_c, npoint, K]
+
+        if self.bn is not None:
+            new_features = self.bn(new_features)
+        if self.activate is not None:
+            new_features = self.activate(new_features)
+
+        # in order to keep input output consistency
+        return (new_features, points_xyz, points_idx)
diff --git a/mmde/mmdet3d/models/layers/paconv/utils.py b/mmde/mmdet3d/models/layers/paconv/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e126b1a862ff43e8a3f2a22be94bd9b8a9ed7973
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/paconv/utils.py
@@ -0,0 +1,91 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+from torch import Tensor
+
+
+def calc_euclidian_dist(xyz1: Tensor, xyz2: Tensor) -> Tensor:
+    """Calculate the Euclidean distance between two sets of points.
+
+    Args:
+        xyz1 (Tensor): (N, 3) The first set of points.
+        xyz2 (Tensor): (N, 3) The second set of points.
+
+    Returns:
+        Tensor: (N, ) The Euclidean distance between each point pair.
+    """
+    assert xyz1.shape[0] == xyz2.shape[0], 'number of points are not the same'
+    assert xyz1.shape[1] == xyz2.shape[1] == 3, \
+        'points coordinates dimension is not 3'
+    return torch.norm(xyz1 - xyz2, dim=-1)
+
+
+def assign_score(scores: Tensor, point_features: Tensor) -> Tensor:
+    """Perform weighted sum to aggregate output features according to scores.
+    This function is used in non-CUDA version of PAConv.
+
+    Compared to the cuda op assigh_score_withk, this pytorch implementation
+    pre-computes output features for the neighbors of all centers, and then
+    performs aggregation. It consumes more GPU memories.
+
+    Args:
+        scores (Tensor): (B, npoint, K, M) Predicted scores to
+            aggregate weight matrices in the weight bank.
+            `npoint` is the number of sampled centers.
+            `K` is the number of queried neighbors.
+            `M` is the number of weight matrices in the weight bank.
+        point_features (Tensor): (B, npoint, K, M, out_dim)
+            Pre-computed point features to be aggregated.
+
+    Returns:
+        Tensor: (B, npoint, K, out_dim) The aggregated features.
+    """
+    B, npoint, K, M = scores.size()
+    scores = scores.view(B, npoint, K, 1, M)
+    output = torch.matmul(scores, point_features).view(B, npoint, K, -1)
+    return output
+
+
+def assign_kernel_withoutk(features: Tensor, kernels: Tensor,
+                           M: int) -> Tuple[Tensor]:
+    """Pre-compute features with weight matrices in weight bank. This function
+    is used before cuda op assign_score_withk in CUDA version PAConv.
+
+    Args:
+        features (Tensor): (B, in_dim, N) Input features of all points.
+            `N` is the number of points in current point cloud.
+        kernels (Tensor): (2 * in_dim, M * out_dim) Weight matrices in
+            the weight bank, transformed from (M, 2 * in_dim, out_dim).
+            `2 * in_dim` is because the input features are concatenation of
+            (point_features - center_features, point_features).
+        M (int): Number of weight matrices in the weight bank.
+
+    Returns:
+        Tuple[Tensor]: Both of shape (B, N, M, out_dim).
+
+            - point_features: Pre-computed features for points.
+            - center_features: Pre-computed features for centers.
+    """
+    B, in_dim, N = features.size()
+    feat_trans = features.permute(0, 2, 1)  # [B, N, in_dim]
+    out_feat_half1 = torch.matmul(feat_trans, kernels[:in_dim]).view(
+        B, N, M, -1)  # [B, N, M, out_dim]
+    out_feat_half2 = torch.matmul(feat_trans, kernels[in_dim:]).view(
+        B, N, M, -1)  # [B, N, M, out_dim]
+
+    # TODO: why this hard-coded if condition?
+    # when the network input is only xyz without additional features
+    # xyz will be used as features, so that features.size(1) == 3 % 2 != 0
+    # we need to compensate center_features because otherwise
+    # `point_features - center_features` will result in all zeros?
+    if features.size(1) % 2 != 0:
+        out_feat_half_coord = torch.matmul(
+            feat_trans[:, :, :3],  # [B, N, 3]
+            kernels[in_dim:in_dim + 3]).view(B, N, M, -1)  # [B, N, M, out_dim]
+    else:
+        out_feat_half_coord = torch.zeros_like(out_feat_half2)
+
+    point_features = out_feat_half1 + out_feat_half2
+    center_features = out_feat_half1 + out_feat_half_coord
+    return point_features, center_features
diff --git a/mmde/mmdet3d/models/layers/pointnet_modules/__init__.py b/mmde/mmdet3d/models/layers/pointnet_modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..13d6e1d81aa752402f2087467d1002d1f5679c79
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/pointnet_modules/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import build_sa_module
+from .paconv_sa_module import (PAConvCUDASAModule, PAConvCUDASAModuleMSG,
+                               PAConvSAModule, PAConvSAModuleMSG)
+from .point_fp_module import PointFPModule
+from .point_sa_module import PointSAModule, PointSAModuleMSG
+from .stack_point_sa_module import StackedSAModuleMSG
+
+__all__ = [
+    'build_sa_module', 'PointSAModuleMSG', 'PointSAModule', 'PointFPModule',
+    'PAConvSAModule', 'PAConvSAModuleMSG', 'PAConvCUDASAModule',
+    'PAConvCUDASAModuleMSG', 'StackedSAModuleMSG'
+]
diff --git a/mmde/mmdet3d/models/layers/pointnet_modules/builder.py b/mmde/mmdet3d/models/layers/pointnet_modules/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2274f9c6ca2928949c5f238e225e506e0717bb18
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/pointnet_modules/builder.py
@@ -0,0 +1,45 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+from mmengine.registry import Registry
+from torch import nn as nn
+
+SA_MODULES = Registry(
+    name='point_sa_module',
+    locations=['mmdet3d.models.layers.pointnet_modules'])
+
+
+def build_sa_module(cfg: Union[dict, None], *args, **kwargs) -> nn.Module:
+    """Build PointNet2 set abstraction (SA) module.
+
+    Args:
+        cfg (dict or None): The SA module config, which should contain:
+
+            - type (str): Module type.
+            - module args: Args needed to instantiate an SA module.
+        args (argument list): Arguments passed to the `__init__`
+            method of the corresponding module.
+        kwargs (keyword arguments): Keyword arguments passed to the `__init__`
+            method of the corresponding SA module .
+
+    Returns:
+        nn.Module: Created SA module.
+    """
+    if cfg is None:
+        cfg_ = dict(type='PointSAModule')
+    else:
+        if not isinstance(cfg, dict):
+            raise TypeError('cfg must be a dict')
+        if 'type' not in cfg:
+            raise KeyError('the cfg dict must contain the key "type"')
+        cfg_ = cfg.copy()
+
+    module_type = cfg_.pop('type')
+    if module_type not in SA_MODULES:
+        raise KeyError(f'Unrecognized module type {module_type}')
+    else:
+        sa_module = SA_MODULES.get(module_type)
+
+    module = sa_module(*args, **kwargs, **cfg_)
+
+    return module
diff --git a/mmde/mmdet3d/models/layers/pointnet_modules/paconv_sa_module.py b/mmde/mmdet3d/models/layers/pointnet_modules/paconv_sa_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6e55d0dca91c4129cbb1a6fd73b9098cd4d5605
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/pointnet_modules/paconv_sa_module.py
@@ -0,0 +1,383 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models.layers.paconv import PAConv, PAConvCUDA
+from mmdet3d.utils import ConfigType
+from .builder import SA_MODULES
+from .point_sa_module import BasePointSAModule
+
+
+@SA_MODULES.register_module()
+class PAConvSAModuleMSG(BasePointSAModule):
+    r"""Point set abstraction module with multi-scale grouping (MSG) used in
+    PAConv networks.
+
+    Replace the MLPs in `PointSAModuleMSG` with PAConv layers.
+    See the `paper <https://arxiv.org/abs/2103.14635>`_ for more details.
+
+    Args:
+        num_point (int): Number of points.
+        radii (List[float]): List of radius in each ball query.
+        sample_nums (List[int]): Number of samples in each ball query.
+        mlp_channels (List[List[int]]): Specify of the pointnet before
+            the global pooling for each scale.
+        paconv_num_kernels (List[List[int]]): Number of kernel weights in the
+            weight banks of each layer's PAConv.
+        fps_mod (List[str]): Type of FPS method, valid mod
+            ['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
+
+            - F-FPS: Using feature distances for FPS.
+            - D-FPS: Using Euclidean distances of points for FPS.
+            - FS: Using F-FPS and D-FPS simultaneously.
+        fps_sample_range_list (List[int]): Range of points to apply FPS.
+            Defaults to [-1].
+        dilated_group (bool): Whether to use dilated ball query.
+            Defaults to False.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN2d', momentum=0.1).
+        use_xyz (bool): Whether to use xyz. Defaults to True.
+        pool_mod (str): Type of pooling method. Defaults to 'max'.
+        normalize_xyz (bool): Whether to normalize local XYZ with radius.
+            Defaults to False.
+        bias (bool or str): If specified as `auto`, it will be decided by
+            `norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
+            otherwise False. Defaults to 'auto'.
+        paconv_kernel_input (str): Input features to be multiplied
+            with kernel weights. Can be 'identity' or 'w_neighbor'.
+            Defaults to 'w_neighbor'.
+        scorenet_input (str): Type of the input to ScoreNet.
+            Defaults to 'w_neighbor_dist'. Can be the following values:
+
+            - 'identity': Use xyz coordinates as input.
+            - 'w_neighbor': Use xyz coordinates and the difference with center
+              points as input.
+            - 'w_neighbor_dist': Use xyz coordinates, the difference with
+              center points and the Euclidean distance as input.
+        scorenet_cfg (dict): Config of the ScoreNet module, which
+            may contain the following keys and values:
+
+            - mlp_channels (List[int]): Hidden units of MLPs.
+            - score_norm (str): Normalization function of output scores.
+              Can be 'softmax', 'sigmoid' or 'identity'.
+            - temp_factor (float): Temperature factor to scale the output
+              scores before softmax.
+            - last_bn (bool): Whether to use BN on the last output of mlps.
+            Defaults to dict(mlp_channels=[16, 16, 16],
+                             score_norm='softmax',
+                             temp_factor=1.0,
+                             last_bn=False).
+    """
+
+    def __init__(
+        self,
+        num_point: int,
+        radii: List[float],
+        sample_nums: List[int],
+        mlp_channels: List[List[int]],
+        paconv_num_kernels: List[List[int]],
+        fps_mod: List[str] = ['D-FPS'],
+        fps_sample_range_list: List[int] = [-1],
+        dilated_group: bool = False,
+        norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
+        use_xyz: bool = True,
+        pool_mod: str = 'max',
+        normalize_xyz: bool = False,
+        bias: Union[bool, str] = 'auto',
+        paconv_kernel_input: str = 'w_neighbor',
+        scorenet_input: str = 'w_neighbor_dist',
+        scorenet_cfg: dict = dict(
+            mlp_channels=[16, 16, 16],
+            score_norm='softmax',
+            temp_factor=1.0,
+            last_bn=False)
+    ) -> None:
+        super(PAConvSAModuleMSG, self).__init__(
+            num_point=num_point,
+            radii=radii,
+            sample_nums=sample_nums,
+            mlp_channels=mlp_channels,
+            fps_mod=fps_mod,
+            fps_sample_range_list=fps_sample_range_list,
+            dilated_group=dilated_group,
+            use_xyz=use_xyz,
+            pool_mod=pool_mod,
+            normalize_xyz=normalize_xyz,
+            grouper_return_grouped_xyz=True)
+
+        assert len(paconv_num_kernels) == len(mlp_channels)
+        for i in range(len(mlp_channels)):
+            assert len(paconv_num_kernels[i]) == len(mlp_channels[i]) - 1, \
+                'PAConv number of kernel weights wrong'
+
+        # in PAConv, bias only exists in ScoreNet
+        scorenet_cfg['bias'] = bias
+
+        for i in range(len(self.mlp_channels)):
+            mlp_channel = self.mlp_channels[i]
+            if use_xyz:
+                mlp_channel[0] += 3
+
+            num_kernels = paconv_num_kernels[i]
+
+            mlp = nn.Sequential()
+            for i in range(len(mlp_channel) - 1):
+                mlp.add_module(
+                    f'layer{i}',
+                    PAConv(
+                        mlp_channel[i],
+                        mlp_channel[i + 1],
+                        num_kernels[i],
+                        norm_cfg=norm_cfg,
+                        kernel_input=paconv_kernel_input,
+                        scorenet_input=scorenet_input,
+                        scorenet_cfg=scorenet_cfg))
+            self.mlps.append(mlp)
+
+
+@SA_MODULES.register_module()
+class PAConvSAModule(PAConvSAModuleMSG):
+    r"""Point set abstraction module with single-scale grouping (SSG) used in
+    PAConv networks.
+
+    Replace the MLPs in `PointSAModule` with PAConv layers. See the `paper
+    <https://arxiv.org/abs/2103.14635>`_ for more details.
+    """
+
+    def __init__(
+        self,
+        mlp_channels: List[int],
+        paconv_num_kernels: List[int],
+        num_point: Optional[int] = None,
+        radius: Optional[float] = None,
+        num_sample: Optional[int] = None,
+        norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
+        use_xyz: bool = True,
+        pool_mod: str = 'max',
+        fps_mod: List[str] = ['D-FPS'],
+        fps_sample_range_list: List[int] = [-1],
+        normalize_xyz: bool = False,
+        paconv_kernel_input: str = 'w_neighbor',
+        scorenet_input: str = 'w_neighbor_dist',
+        scorenet_cfg: dict = dict(
+            mlp_channels=[16, 16, 16],
+            score_norm='softmax',
+            temp_factor=1.0,
+            last_bn=False)
+    ) -> None:
+        super(PAConvSAModule, self).__init__(
+            mlp_channels=[mlp_channels],
+            paconv_num_kernels=[paconv_num_kernels],
+            num_point=num_point,
+            radii=[radius],
+            sample_nums=[num_sample],
+            norm_cfg=norm_cfg,
+            use_xyz=use_xyz,
+            pool_mod=pool_mod,
+            fps_mod=fps_mod,
+            fps_sample_range_list=fps_sample_range_list,
+            normalize_xyz=normalize_xyz,
+            paconv_kernel_input=paconv_kernel_input,
+            scorenet_input=scorenet_input,
+            scorenet_cfg=scorenet_cfg)
+
+
+@SA_MODULES.register_module()
+class PAConvCUDASAModuleMSG(BasePointSAModule):
+    r"""Point set abstraction module with multi-scale grouping (MSG) used in
+    PAConv networks.
+
+    Replace the non CUDA version PAConv with CUDA implemented PAConv for
+    efficient computation. See the `paper <https://arxiv.org/abs/2103.14635>`_
+    for more details.
+    """
+
+    def __init__(
+        self,
+        num_point: int,
+        radii: List[float],
+        sample_nums: List[int],
+        mlp_channels: List[List[int]],
+        paconv_num_kernels: List[List[int]],
+        fps_mod: List[str] = ['D-FPS'],
+        fps_sample_range_list: List[int] = [-1],
+        dilated_group: bool = False,
+        norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
+        use_xyz: bool = True,
+        pool_mod: str = 'max',
+        normalize_xyz: bool = False,
+        bias: Union[bool, str] = 'auto',
+        paconv_kernel_input: str = 'w_neighbor',
+        scorenet_input: str = 'w_neighbor_dist',
+        scorenet_cfg: dict = dict(
+            mlp_channels=[8, 16, 16],
+            score_norm='softmax',
+            temp_factor=1.0,
+            last_bn=False)
+    ) -> None:
+        super(PAConvCUDASAModuleMSG, self).__init__(
+            num_point=num_point,
+            radii=radii,
+            sample_nums=sample_nums,
+            mlp_channels=mlp_channels,
+            fps_mod=fps_mod,
+            fps_sample_range_list=fps_sample_range_list,
+            dilated_group=dilated_group,
+            use_xyz=use_xyz,
+            pool_mod=pool_mod,
+            normalize_xyz=normalize_xyz,
+            grouper_return_grouped_xyz=True,
+            grouper_return_grouped_idx=True)
+
+        assert len(paconv_num_kernels) == len(mlp_channels)
+        for i in range(len(mlp_channels)):
+            assert len(paconv_num_kernels[i]) == len(mlp_channels[i]) - 1, \
+                'PAConv number of kernel weights wrong'
+
+        # in PAConv, bias only exists in ScoreNet
+        scorenet_cfg['bias'] = bias
+
+        # we need to manually concat xyz for CUDA implemented PAConv
+        self.use_xyz = use_xyz
+
+        for i in range(len(self.mlp_channels)):
+            mlp_channel = self.mlp_channels[i]
+            if use_xyz:
+                mlp_channel[0] += 3
+
+            num_kernels = paconv_num_kernels[i]
+
+            # can't use `nn.Sequential` for PAConvCUDA because its input and
+            # output have different shapes
+            mlp = nn.ModuleList()
+            for i in range(len(mlp_channel) - 1):
+                mlp.append(
+                    PAConvCUDA(
+                        mlp_channel[i],
+                        mlp_channel[i + 1],
+                        num_kernels[i],
+                        norm_cfg=norm_cfg,
+                        kernel_input=paconv_kernel_input,
+                        scorenet_input=scorenet_input,
+                        scorenet_cfg=scorenet_cfg))
+            self.mlps.append(mlp)
+
+    def forward(
+        self,
+        points_xyz: Tensor,
+        features: Optional[Tensor] = None,
+        indices: Optional[Tensor] = None,
+        target_xyz: Optional[Tensor] = None,
+    ) -> Tuple[Tensor]:
+        """Forward.
+
+        Args:
+            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
+            features (Tensor, optional): (B, C, N) features of each point.
+                Defaults to None.
+            indices (Tensor, optional): (B, num_point) Index of the features.
+                Defaults to None.
+            target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs.
+                Defaults to None.
+
+        Returns:
+            Tuple[Tensor]:
+
+                - new_xyz: (B, M, 3) where M is the number of points.
+                  New features xyz.
+                - new_features: (B, M, sum_k(mlps[k][-1])) where M is the
+                  number of points. New feature descriptors.
+                - indices: (B, M) where M is the number of points.
+                  Index of the features.
+        """
+        new_features_list = []
+
+        # sample points, (B, num_point, 3), (B, num_point)
+        new_xyz, indices = self._sample_points(points_xyz, features, indices,
+                                               target_xyz)
+
+        for i in range(len(self.groupers)):
+            xyz = points_xyz
+            new_features = features
+            for j in range(len(self.mlps[i])):
+                # we don't use grouped_features here to avoid large GPU memory
+                # _, (B, 3, num_point, nsample), (B, num_point, nsample)
+                _, grouped_xyz, grouped_idx = self.groupers[i](xyz, new_xyz,
+                                                               new_features)
+
+                # concat xyz as additional features
+                if self.use_xyz and j == 0:
+                    # (B, C+3, N)
+                    new_features = torch.cat(
+                        (points_xyz.permute(0, 2, 1), new_features), dim=1)
+
+                # (B, out_c, num_point, nsample)
+                grouped_new_features = self.mlps[i][j](
+                    (new_features, grouped_xyz, grouped_idx.long()))[0]
+
+                # different from PointNet++ and non CUDA version of PAConv
+                # CUDA version of PAConv needs to aggregate local features
+                # every time after it passes through a Conv layer
+                # in order to transform to valid input shape
+                # (B, out_c, num_point)
+                new_features = self._pool_features(grouped_new_features)
+
+                # constrain the points to be grouped for next PAConv layer
+                # because new_features only contains sampled centers now
+                # (B, num_point, 3)
+                xyz = new_xyz
+
+            new_features_list.append(new_features)
+
+        return new_xyz, torch.cat(new_features_list, dim=1), indices
+
+
+@SA_MODULES.register_module()
+class PAConvCUDASAModule(PAConvCUDASAModuleMSG):
+    r"""Point set abstraction module with single-scale grouping (SSG) used in
+    PAConv networks.
+
+    Replace the non CUDA version PAConv with CUDA implemented PAConv for
+    efficient computation. See the `paper <https://arxiv.org/abs/2103.14635>`_
+    for more details.
+    """
+
+    def __init__(
+        self,
+        mlp_channels: List[int],
+        paconv_num_kernels: List[int],
+        num_point: Optional[int] = None,
+        radius: Optional[float] = None,
+        num_sample: Optional[int] = None,
+        norm_cfg: ConfigType = dict(type='BN2d', momentum=0.1),
+        use_xyz: bool = True,
+        pool_mod: str = 'max',
+        fps_mod: List[str] = ['D-FPS'],
+        fps_sample_range_list: List[int] = [-1],
+        normalize_xyz: bool = False,
+        paconv_kernel_input: str = 'w_neighbor',
+        scorenet_input: str = 'w_neighbor_dist',
+        scorenet_cfg: dict = dict(
+            mlp_channels=[8, 16, 16],
+            score_norm='softmax',
+            temp_factor=1.0,
+            last_bn=False)
+    ) -> None:
+        super(PAConvCUDASAModule, self).__init__(
+            mlp_channels=[mlp_channels],
+            paconv_num_kernels=[paconv_num_kernels],
+            num_point=num_point,
+            radii=[radius],
+            sample_nums=[num_sample],
+            norm_cfg=norm_cfg,
+            use_xyz=use_xyz,
+            pool_mod=pool_mod,
+            fps_mod=fps_mod,
+            fps_sample_range_list=fps_sample_range_list,
+            normalize_xyz=normalize_xyz,
+            paconv_kernel_input=paconv_kernel_input,
+            scorenet_input=scorenet_input,
+            scorenet_cfg=scorenet_cfg)
diff --git a/mmde/mmdet3d/models/layers/pointnet_modules/point_fp_module.py b/mmde/mmdet3d/models/layers/pointnet_modules/point_fp_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..3635490c211b9602dda3af535e52aaedd3feed07
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/pointnet_modules/point_fp_module.py
@@ -0,0 +1,81 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+from mmcv.cnn import ConvModule
+from mmcv.ops import three_interpolate, three_nn
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.utils import ConfigType, OptMultiConfig
+
+
+class PointFPModule(BaseModule):
+    """Point feature propagation module used in PointNets.
+
+    Propagate the features from one set to another.
+
+    Args:
+        mlp_channels (list[int]): List of mlp channels.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN2d').
+        init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
+            optional): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 mlp_channels: List[int],
+                 norm_cfg: ConfigType = dict(type='BN2d'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(PointFPModule, self).__init__(init_cfg=init_cfg)
+        self.mlps = nn.Sequential()
+        for i in range(len(mlp_channels) - 1):
+            self.mlps.add_module(
+                f'layer{i}',
+                ConvModule(
+                    mlp_channels[i],
+                    mlp_channels[i + 1],
+                    kernel_size=(1, 1),
+                    stride=(1, 1),
+                    conv_cfg=dict(type='Conv2d'),
+                    norm_cfg=norm_cfg))
+
+    def forward(self, target: Tensor, source: Tensor, target_feats: Tensor,
+                source_feats: Tensor) -> Tensor:
+        """Forward.
+
+        Args:
+            target (Tensor): (B, n, 3) Tensor of the xyz positions of
+                the target features.
+            source (Tensor): (B, m, 3) Tensor of the xyz positions of
+                the source features.
+            target_feats (Tensor): (B, C1, n) Tensor of the features to be
+                propagated to.
+            source_feats (Tensor): (B, C2, m) Tensor of features
+                to be propagated.
+
+        Return:
+            Tensor: (B, M, N) M = mlp[-1], Tensor of the target features.
+        """
+        if source is not None:
+            dist, idx = three_nn(target, source)
+            dist_reciprocal = 1.0 / (dist + 1e-8)
+            norm = torch.sum(dist_reciprocal, dim=2, keepdim=True)
+            weight = dist_reciprocal / norm
+
+            interpolated_feats = three_interpolate(source_feats, idx, weight)
+        else:
+            interpolated_feats = source_feats.expand(*source_feats.size()[0:2],
+                                                     target.size(1))
+
+        if target_feats is not None:
+            new_features = torch.cat([interpolated_feats, target_feats],
+                                     dim=1)  # (B, C2 + C1, n)
+        else:
+            new_features = interpolated_feats
+
+        new_features = new_features.unsqueeze(-1)
+        new_features = self.mlps(new_features)
+
+        return new_features.squeeze(-1)
diff --git a/mmde/mmdet3d/models/layers/pointnet_modules/point_sa_module.py b/mmde/mmdet3d/models/layers/pointnet_modules/point_sa_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..61661afa9d5a4f5e22df762f37eb6e37a47f541f
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/pointnet_modules/point_sa_module.py
@@ -0,0 +1,354 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple, Union
+
+import torch
+from mmcv.cnn import ConvModule
+from mmcv.ops import GroupAll
+from mmcv.ops import PointsSampler as Points_Sampler
+from mmcv.ops import QueryAndGroup, gather_points
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.models.layers import PAConv
+from mmdet3d.utils import ConfigType
+from .builder import SA_MODULES
+
+
+class BasePointSAModule(nn.Module):
+    """Base module for point set abstraction module used in PointNets.
+
+    Args:
+        num_point (int): Number of points.
+        radii (List[float]): List of radius in each ball query.
+        sample_nums (List[int]): Number of samples in each ball query.
+        mlp_channels (List[List[int]]): Specify of the pointnet before
+            the global pooling for each scale.
+        fps_mod (List[str]): Type of FPS method, valid mod
+            ['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
+
+            - F-FPS: using feature distances for FPS.
+            - D-FPS: using Euclidean distances of points for FPS.
+            - FS: using F-FPS and D-FPS simultaneously.
+        fps_sample_range_list (List[int]): Range of points to apply FPS.
+            Defaults to [-1].
+        dilated_group (bool): Whether to use dilated ball query.
+            Defaults to False.
+        use_xyz (bool): Whether to use xyz. Defaults to True.
+        pool_mod (str): Type of pooling method. Defaults to 'max'.
+        normalize_xyz (bool): Whether to normalize local XYZ with radius.
+            Defaults to False.
+        grouper_return_grouped_xyz (bool): Whether to return grouped xyz
+            in `QueryAndGroup`. Defaults to False.
+        grouper_return_grouped_idx (bool): Whether to return grouped idx
+            in `QueryAndGroup`. Defaults to False.
+    """
+
+    def __init__(self,
+                 num_point: int,
+                 radii: List[float],
+                 sample_nums: List[int],
+                 mlp_channels: List[List[int]],
+                 fps_mod: List[str] = ['D-FPS'],
+                 fps_sample_range_list: List[int] = [-1],
+                 dilated_group: bool = False,
+                 use_xyz: bool = True,
+                 pool_mod: str = 'max',
+                 normalize_xyz: bool = False,
+                 grouper_return_grouped_xyz: bool = False,
+                 grouper_return_grouped_idx: bool = False) -> None:
+        super(BasePointSAModule, self).__init__()
+
+        assert len(radii) == len(sample_nums) == len(mlp_channels)
+        assert pool_mod in ['max', 'avg']
+        assert isinstance(fps_mod, list) or isinstance(fps_mod, tuple)
+        assert isinstance(fps_sample_range_list, list) or isinstance(
+            fps_sample_range_list, tuple)
+        assert len(fps_mod) == len(fps_sample_range_list)
+
+        if isinstance(mlp_channels, tuple):
+            mlp_channels = list(map(list, mlp_channels))
+        self.mlp_channels = mlp_channels
+
+        if isinstance(num_point, int):
+            self.num_point = [num_point]
+        elif isinstance(num_point, list) or isinstance(num_point, tuple):
+            self.num_point = num_point
+        elif num_point is None:
+            self.num_point = None
+        else:
+            raise NotImplementedError('Error type of num_point!')
+
+        self.pool_mod = pool_mod
+        self.groupers = nn.ModuleList()
+        self.mlps = nn.ModuleList()
+        self.fps_mod_list = fps_mod
+        self.fps_sample_range_list = fps_sample_range_list
+
+        if self.num_point is not None:
+            self.points_sampler = Points_Sampler(self.num_point,
+                                                 self.fps_mod_list,
+                                                 self.fps_sample_range_list)
+        else:
+            self.points_sampler = None
+
+        for i in range(len(radii)):
+            radius = radii[i]
+            sample_num = sample_nums[i]
+            if num_point is not None:
+                if dilated_group and i != 0:
+                    min_radius = radii[i - 1]
+                else:
+                    min_radius = 0
+                grouper = QueryAndGroup(
+                    radius,
+                    sample_num,
+                    min_radius=min_radius,
+                    use_xyz=use_xyz,
+                    normalize_xyz=normalize_xyz,
+                    return_grouped_xyz=grouper_return_grouped_xyz,
+                    return_grouped_idx=grouper_return_grouped_idx)
+            else:
+                grouper = GroupAll(use_xyz)
+            self.groupers.append(grouper)
+
+    def _sample_points(self, points_xyz: Tensor, features: Tensor,
+                       indices: Tensor, target_xyz: Tensor) -> Tuple[Tensor]:
+        """Perform point sampling based on inputs.
+
+        If `indices` is specified, directly sample corresponding points.
+        Else if `target_xyz` is specified, use is as sampled points.
+        Otherwise sample points using `self.points_sampler`.
+
+        Args:
+            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
+            features (Tensor): (B, C, N) Features of each point.
+            indices (Tensor): (B, num_point) Index of the features.
+            target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs.
+
+        Returns:
+            Tuple[Tensor]:
+
+            - new_xyz: (B, num_point, 3) Sampled xyz coordinates of points.
+            - indices: (B, num_point) Sampled points' index.
+        """
+        xyz_flipped = points_xyz.transpose(1, 2).contiguous()
+        if indices is not None:
+            assert (indices.shape[1] == self.num_point[0])
+            new_xyz = gather_points(xyz_flipped, indices).transpose(
+                1, 2).contiguous() if self.num_point is not None else None
+        elif target_xyz is not None:
+            new_xyz = target_xyz.contiguous()
+        else:
+            if self.num_point is not None:
+                indices = self.points_sampler(points_xyz, features)
+                new_xyz = gather_points(xyz_flipped,
+                                        indices).transpose(1, 2).contiguous()
+            else:
+                new_xyz = None
+
+        return new_xyz, indices
+
+    def _pool_features(self, features: Tensor) -> Tensor:
+        """Perform feature aggregation using pooling operation.
+
+        Args:
+            features (Tensor): (B, C, N, K) Features of locally grouped
+                points before pooling.
+
+        Returns:
+            Tensor: (B, C, N) Pooled features aggregating local information.
+        """
+        if self.pool_mod == 'max':
+            # (B, C, N, 1)
+            new_features = F.max_pool2d(
+                features, kernel_size=[1, features.size(3)])
+        elif self.pool_mod == 'avg':
+            # (B, C, N, 1)
+            new_features = F.avg_pool2d(
+                features, kernel_size=[1, features.size(3)])
+        else:
+            raise NotImplementedError
+
+        return new_features.squeeze(-1).contiguous()
+
+    def forward(
+        self,
+        points_xyz: Tensor,
+        features: Optional[Tensor] = None,
+        indices: Optional[Tensor] = None,
+        target_xyz: Optional[Tensor] = None,
+    ) -> Tuple[Tensor]:
+        """Forward.
+
+        Args:
+            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
+            features (Tensor, optional): (B, C, N) Features of each point.
+                Defaults to None.
+            indices (Tensor, optional): (B, num_point) Index of the features.
+                Defaults to None.
+            target_xyz (Tensor, optional): (B, M, 3) New coords of the outputs.
+                Defaults to None.
+
+        Returns:
+            Tuple[Tensor]:
+
+                - new_xyz: (B, M, 3) Where M is the number of points.
+                  New features xyz.
+                - new_features: (B, M, sum_k(mlps[k][-1])) Where M is the
+                  number of points. New feature descriptors.
+                - indices: (B, M) Where M is the number of points.
+                  Index of the features.
+        """
+        new_features_list = []
+
+        # sample points, (B, num_point, 3), (B, num_point)
+        new_xyz, indices = self._sample_points(points_xyz, features, indices,
+                                               target_xyz)
+
+        for i in range(len(self.groupers)):
+            # grouped_results may contain:
+            # - grouped_features: (B, C, num_point, nsample)
+            # - grouped_xyz: (B, 3, num_point, nsample)
+            # - grouped_idx: (B, num_point, nsample)
+            grouped_results = self.groupers[i](points_xyz, new_xyz, features)
+
+            # (B, mlp[-1], num_point, nsample)
+            new_features = self.mlps[i](grouped_results)
+
+            # this is a bit hack because PAConv outputs two values
+            # we take the first one as feature
+            if isinstance(self.mlps[i][0], PAConv):
+                assert isinstance(new_features, tuple)
+                new_features = new_features[0]
+
+            # (B, mlp[-1], num_point)
+            new_features = self._pool_features(new_features)
+            new_features_list.append(new_features)
+
+        return new_xyz, torch.cat(new_features_list, dim=1), indices
+
+
+@SA_MODULES.register_module()
+class PointSAModuleMSG(BasePointSAModule):
+    """Point set abstraction module with multi-scale grouping (MSG) used in
+    PointNets.
+
+    Args:
+        num_point (int): Number of points.
+        radii (List[float]): List of radius in each ball query.
+        sample_nums (List[int]): Number of samples in each ball query.
+        mlp_channels (List[List[int]]): Specify of the pointnet before
+            the global pooling for each scale.
+        fps_mod (List[str]): Type of FPS method, valid mod
+            ['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
+
+            - F-FPS: using feature distances for FPS.
+            - D-FPS: using Euclidean distances of points for FPS.
+            - FS: using F-FPS and D-FPS simultaneously.
+        fps_sample_range_list (List[int]): Range of points to apply FPS.
+            Defaults to [-1].
+        dilated_group (bool): Whether to use dilated ball query.
+            Defaults to False.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN2d').
+        use_xyz (bool): Whether to use xyz. Defaults to True.
+        pool_mod (str): Type of pooling method. Defaults to 'max'.
+        normalize_xyz (bool): Whether to normalize local XYZ with radius.
+            Defaults to False.
+        bias (bool or str): If specified as `auto`, it will be decided by
+            `norm_cfg`. `bias` will be set as True if `norm_cfg` is None,
+            otherwise False. Defaults to 'auto'.
+    """
+
+    def __init__(self,
+                 num_point: int,
+                 radii: List[float],
+                 sample_nums: List[int],
+                 mlp_channels: List[List[int]],
+                 fps_mod: List[str] = ['D-FPS'],
+                 fps_sample_range_list: List[int] = [-1],
+                 dilated_group: bool = False,
+                 norm_cfg: ConfigType = dict(type='BN2d'),
+                 use_xyz: bool = True,
+                 pool_mod: str = 'max',
+                 normalize_xyz: bool = False,
+                 bias: Union[bool, str] = 'auto') -> None:
+        super(PointSAModuleMSG, self).__init__(
+            num_point=num_point,
+            radii=radii,
+            sample_nums=sample_nums,
+            mlp_channels=mlp_channels,
+            fps_mod=fps_mod,
+            fps_sample_range_list=fps_sample_range_list,
+            dilated_group=dilated_group,
+            use_xyz=use_xyz,
+            pool_mod=pool_mod,
+            normalize_xyz=normalize_xyz)
+
+        for i in range(len(self.mlp_channels)):
+            mlp_channel = self.mlp_channels[i]
+            if use_xyz:
+                mlp_channel[0] += 3
+
+            mlp = nn.Sequential()
+            for i in range(len(mlp_channel) - 1):
+                mlp.add_module(
+                    f'layer{i}',
+                    ConvModule(
+                        mlp_channel[i],
+                        mlp_channel[i + 1],
+                        kernel_size=(1, 1),
+                        stride=(1, 1),
+                        conv_cfg=dict(type='Conv2d'),
+                        norm_cfg=norm_cfg,
+                        bias=bias))
+            self.mlps.append(mlp)
+
+
+@SA_MODULES.register_module()
+class PointSAModule(PointSAModuleMSG):
+    """Point set abstraction module with single-scale grouping (SSG) used in
+    PointNets.
+
+    Args:
+        mlp_channels (List[int]): Specify of the pointnet before
+            the global pooling for each scale.
+        num_point (int, optional): Number of points. Defaults to None.
+        radius (float, optional): Radius to group with. Defaults to None.
+        num_sample (int, optional): Number of samples in each ball query.
+            Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Default to dict(type='BN2d').
+        use_xyz (bool): Whether to use xyz. Defaults to True.
+        pool_mod (str): Type of pooling method. Defaults to 'max'.
+        fps_mod (List[str]): Type of FPS method, valid mod
+            ['F-FPS', 'D-FPS', 'FS']. Defaults to ['D-FPS'].
+        fps_sample_range_list (List[int]): Range of points to apply FPS.
+            Defaults to [-1].
+        normalize_xyz (bool): Whether to normalize local XYZ with radius.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 mlp_channels: List[int],
+                 num_point: Optional[int] = None,
+                 radius: Optional[float] = None,
+                 num_sample: Optional[int] = None,
+                 norm_cfg: ConfigType = dict(type='BN2d'),
+                 use_xyz: bool = True,
+                 pool_mod: str = 'max',
+                 fps_mod: List[str] = ['D-FPS'],
+                 fps_sample_range_list: List[int] = [-1],
+                 normalize_xyz: bool = False) -> None:
+        super(PointSAModule, self).__init__(
+            mlp_channels=[mlp_channels],
+            num_point=num_point,
+            radii=[radius],
+            sample_nums=[num_sample],
+            norm_cfg=norm_cfg,
+            use_xyz=use_xyz,
+            pool_mod=pool_mod,
+            fps_mod=fps_mod,
+            fps_sample_range_list=fps_sample_range_list,
+            normalize_xyz=normalize_xyz)
diff --git a/mmde/mmdet3d/models/layers/pointnet_modules/stack_point_sa_module.py b/mmde/mmdet3d/models/layers/pointnet_modules/stack_point_sa_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..62839327819041d0894c8fa87373e02a2c01d1d9
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/pointnet_modules/stack_point_sa_module.py
@@ -0,0 +1,199 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import ball_query, grouping_operation
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+
+
+class StackQueryAndGroup(BaseModule):
+    """Find nearby points in spherical space.
+
+    Args:
+        radius (float): List of radius in each ball query.
+        sample_nums (int): Number of samples in each ball query.
+        use_xyz (bool): Whether to use xyz. Default: True.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+    """
+
+    def __init__(self,
+                 radius: float,
+                 sample_nums: int,
+                 use_xyz: bool = True,
+                 init_cfg: dict = None):
+        super().__init__(init_cfg=init_cfg)
+        self.radius, self.sample_nums, self.use_xyz = \
+            radius, sample_nums, use_xyz
+
+    def forward(self,
+                xyz: torch.Tensor,
+                xyz_batch_cnt: torch.Tensor,
+                new_xyz: torch.Tensor,
+                new_xyz_batch_cnt: torch.Tensor,
+                features: torch.Tensor = None) -> Tuple[Tensor, Tensor]:
+        """Forward.
+
+        Args:
+            xyz (Tensor): Tensor of the xyz coordinates
+                of the features shape with (N1 + N2 ..., 3).
+            xyz_batch_cnt: (Tensor): Stacked input xyz coordinates nums in
+                each batch, just like (N1, N2, ...).
+            new_xyz (Tensor): New coords of the outputs shape with
+                (M1 + M2 ..., 3).
+            new_xyz_batch_cnt: (Tensor): Stacked new xyz coordinates nums
+                in each batch, just like (M1, M2, ...).
+            features (Tensor, optional): Features of each point with shape
+                (N1 + N2 ..., C). C is features channel number. Default: None.
+        """
+        assert xyz.shape[0] == xyz_batch_cnt.sum(
+        ), f'xyz: {str(xyz.shape)}, xyz_batch_cnt: str(new_xyz_batch_cnt)'
+        assert new_xyz.shape[0] == new_xyz_batch_cnt.sum(), \
+            'new_xyz: str(new_xyz.shape), new_xyz_batch_cnt: ' \
+            'str(new_xyz_batch_cnt)'
+
+        # idx: (M1 + M2 ..., nsample)
+        idx = ball_query(0, self.radius, self.sample_nums, xyz, new_xyz,
+                         xyz_batch_cnt, new_xyz_batch_cnt)
+        empty_ball_mask = (idx[:, 0] == -1)
+        idx[empty_ball_mask] = 0
+        grouped_xyz = grouping_operation(
+            xyz, idx, xyz_batch_cnt,
+            new_xyz_batch_cnt)  # (M1 + M2, 3, nsample)
+        grouped_xyz -= new_xyz.unsqueeze(-1)
+
+        grouped_xyz[empty_ball_mask] = 0
+        if features is not None:
+            grouped_features = grouping_operation(
+                features, idx, xyz_batch_cnt,
+                new_xyz_batch_cnt)  # (M1 + M2, C, nsample)
+            grouped_features[empty_ball_mask] = 0
+            if self.use_xyz:
+                new_features = torch.cat(
+                    [grouped_xyz, grouped_features],
+                    dim=1)  # (M1 + M2 ..., C + 3, nsample)
+            else:
+                new_features = grouped_features
+        else:
+            assert self.use_xyz, 'Cannot have not features and not' \
+                                 ' use xyz as a feature!'
+            new_features = grouped_xyz
+        return new_features, idx
+
+
+@MODELS.register_module()
+class StackedSAModuleMSG(BaseModule):
+    """Stack point set abstraction module.
+
+    Args:
+        in_channels (int): Input channels.
+        radius (list[float]): List of radius in each ball query.
+        sample_nums (list[int]): Number of samples in each ball query.
+        mlp_channels (list[list[int]]): Specify mlp channels of the
+            pointnet before the global pooling for each scale to encode
+            point features.
+        use_xyz (bool): Whether to use xyz. Default: True.
+        pool_mod (str): Type of pooling method.
+            Default: 'max_pool'.
+        norm_cfg (dict): Type of normalization method. Defaults to
+            dict(type='BN2d', eps=1e-5, momentum=0.01).
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 radius: List[float],
+                 sample_nums: List[int],
+                 mlp_channels: List[List[int]],
+                 use_xyz: bool = True,
+                 pool_mod='max',
+                 norm_cfg: dict = dict(type='BN2d', eps=1e-5, momentum=0.01),
+                 init_cfg: dict = None,
+                 **kwargs) -> None:
+        super(StackedSAModuleMSG, self).__init__(init_cfg=init_cfg)
+        assert len(radius) == len(sample_nums) == len(mlp_channels)
+
+        self.groupers = nn.ModuleList()
+        self.mlps = nn.ModuleList()
+        for i in range(len(radius)):
+            cin = in_channels
+            if use_xyz:
+                cin += 3
+            cur_radius = radius[i]
+            nsample = sample_nums[i]
+            mlp_spec = mlp_channels[i]
+
+            self.groupers.append(
+                StackQueryAndGroup(cur_radius, nsample, use_xyz=use_xyz))
+
+            mlp = nn.Sequential()
+            for i in range(len(mlp_spec)):
+                cout = mlp_spec[i]
+                mlp.add_module(
+                    f'layer{i}',
+                    ConvModule(
+                        cin,
+                        cout,
+                        kernel_size=(1, 1),
+                        stride=(1, 1),
+                        conv_cfg=dict(type='Conv2d'),
+                        norm_cfg=norm_cfg,
+                        bias=False))
+                cin = cout
+            self.mlps.append(mlp)
+        self.pool_mod = pool_mod
+
+    def forward(self,
+                xyz: Tensor,
+                xyz_batch_cnt: Tensor,
+                new_xyz: Tensor,
+                new_xyz_batch_cnt: Tensor,
+                features: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
+        """Forward.
+
+        Args:
+            xyz (Tensor): Tensor of the xyz coordinates
+                of the features shape with (N1 + N2 ..., 3).
+            xyz_batch_cnt: (Tensor): Stacked input xyz coordinates nums in
+                each batch, just like (N1, N2, ...).
+            new_xyz (Tensor): New coords of the outputs shape with
+                (M1 + M2 ..., 3).
+            new_xyz_batch_cnt: (Tensor): Stacked new xyz coordinates nums
+                in each batch, just like (M1, M2, ...).
+            features (Tensor, optional): Features of each point with shape
+                (N1 + N2 ..., C). C is features channel number. Default: None.
+
+        Returns:
+            Return new points coordinates and features:
+                - new_xyz  (Tensor): Target points coordinates with shape
+                    (N1 + N2 ..., 3).
+                - new_features (Tensor): Target points features with shape
+                    (M1 + M2 ..., sum_k(mlps[k][-1])).
+        """
+        new_features_list = []
+        for k in range(len(self.groupers)):
+            grouped_features, ball_idxs = self.groupers[k](
+                xyz, xyz_batch_cnt, new_xyz, new_xyz_batch_cnt,
+                features)  # (M1 + M2, Cin, nsample)
+            grouped_features = grouped_features.permute(1, 0,
+                                                        2).unsqueeze(dim=0)
+            new_features = self.mlps[k](grouped_features)
+            # (M1 + M2 ..., Cout, nsample)
+            if self.pool_mod == 'max':
+                new_features = new_features.max(-1).values
+            elif self.pool_mod == 'avg':
+                new_features = new_features.mean(-1)
+            else:
+                raise NotImplementedError
+            new_features = new_features.squeeze(dim=0).permute(1, 0)
+            new_features_list.append(new_features)
+
+        new_features = torch.cat(new_features_list, dim=1)
+
+        return new_xyz, new_features
diff --git a/mmde/mmdet3d/models/layers/sparse_block.py b/mmde/mmdet3d/models/layers/sparse_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ed7c8f48b0eaf1dabb55d972b6f78148312d148
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/sparse_block.py
@@ -0,0 +1,224 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
+from torch import nn
+
+from mmdet3d.utils import OptConfigType
+from .spconv import IS_SPCONV2_AVAILABLE
+
+if IS_SPCONV2_AVAILABLE:
+    from spconv.pytorch import SparseConvTensor, SparseModule, SparseSequential
+else:
+    from mmcv.ops import SparseConvTensor, SparseModule, SparseSequential
+
+
+def replace_feature(out: SparseConvTensor,
+                    new_features: SparseConvTensor) -> SparseConvTensor:
+    if 'replace_feature' in out.__dir__():
+        # spconv 2.x behaviour
+        return out.replace_feature(new_features)
+    else:
+        out.features = new_features
+        return out
+
+
+class SparseBottleneck(Bottleneck, SparseModule):
+    """Sparse bottleneck block for PartA^2.
+
+    Bottleneck block implemented with submanifold sparse convolution.
+
+    Args:
+        inplanes (int): Inplanes of block.
+        planes (int): Planes of block.
+        stride (int or Tuple[int]): Stride of the first block. Defaults to 1.
+        downsample (Module, optional): Down sample module for block.
+            Defaults to None.
+        indice_key (str): Indice key for spconv. Default to None.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            normalization layer. Defaults to None.
+    """
+
+    expansion = 4
+
+    def __init__(self,
+                 inplanes: int,
+                 planes: int,
+                 stride: Union[int, Tuple[int]] = 1,
+                 downsample: nn.Module = None,
+                 indice_key=None,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None) -> None:
+
+        SparseModule.__init__(self)
+        if conv_cfg is None:
+            conv_cfg = dict(type='SubMConv3d')
+        conv_cfg.setdefault('indice_key', indice_key)
+        if norm_cfg is None:
+            norm_cfg = dict(type='BN1d')
+        Bottleneck.__init__(
+            self,
+            inplanes,
+            planes,
+            stride=stride,
+            downsample=downsample,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg)
+
+    def forward(self, x: SparseConvTensor) -> SparseConvTensor:
+        identity = x.features
+
+        out = self.conv1(x)
+        out = replace_feature(out, self.bn1(out.features))
+        out = replace_feature(out, self.relu(out.features))
+
+        out = self.conv2(out)
+        out = replace_feature(out, self.bn2(out.features))
+        out = replace_feature(out, self.relu(out.features))
+
+        out = self.conv3(out)
+        out = replace_feature(out, self.bn3(out.features))
+
+        if self.downsample is not None:
+            identity = self.downsample(x).features
+
+        out = replace_feature(out, out.features + identity)
+        out = replace_feature(out, self.relu(out.features))
+
+        return out
+
+
+class SparseBasicBlock(BasicBlock, SparseModule):
+    """Sparse basic block for PartA^2.
+
+    Sparse basic block implemented with submanifold sparse convolution.
+
+    Args:
+        inplanes (int): Inplanes of block.
+        planes (int): Planes of block.
+        stride (int or Tuple[int]): Stride of the first block. Defaults to 1.
+        downsample (Module, optional): Down sample module for block.
+            Defaults to None.
+        indice_key (str): Indice key for spconv. Default to None.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            normalization layer. Defaults to None.
+    """
+
+    expansion = 1
+
+    def __init__(self,
+                 inplanes: int,
+                 planes: int,
+                 stride: Union[int, Tuple[int]] = 1,
+                 downsample: nn.Module = None,
+                 indice_key: Optional[str] = None,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None) -> None:
+        SparseModule.__init__(self)
+        if conv_cfg is None:
+            conv_cfg = dict(type='SubMConv3d')
+        conv_cfg.setdefault('indice_key', indice_key)
+        if norm_cfg is None:
+            norm_cfg = dict(type='BN1d')
+        BasicBlock.__init__(
+            self,
+            inplanes,
+            planes,
+            stride=stride,
+            downsample=downsample,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg)
+
+    def forward(self, x: SparseConvTensor) -> SparseConvTensor:
+        identity = x.features
+
+        assert x.features.dim() == 2, f'x.features.dim()={x.features.dim()}'
+        out = self.conv1(x)
+        out = replace_feature(out, self.norm1(out.features))
+        out = replace_feature(out, self.relu(out.features))
+
+        out = self.conv2(out)
+        out = replace_feature(out, self.norm2(out.features))
+
+        if self.downsample is not None:
+            identity = self.downsample(x).features
+
+        out = replace_feature(out, out.features + identity)
+        out = replace_feature(out, self.relu(out.features))
+
+        return out
+
+
+def make_sparse_convmodule(in_channels: int,
+                           out_channels: int,
+                           kernel_size: Union[int, Tuple[int]],
+                           indice_key: Optional[str] = None,
+                           stride: Union[int, Tuple[int]] = 1,
+                           padding: Union[int, Tuple[int]] = 0,
+                           conv_type: str = 'SubMConv3d',
+                           norm_cfg: OptConfigType = None,
+                           order: Tuple[str] = ('conv', 'norm', 'act'),
+                           **kwargs) -> SparseSequential:
+    """Make sparse convolution module.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of out channels.
+        kernel_size (int | Tuple[int]): Kernel size of convolution.
+        indice_key (str): The indice key used for sparse tensor.
+        stride (int or tuple[int]): The stride of convolution.
+        padding (int or tuple[int]): The padding number of input.
+        conv_type (str): Sparse conv type in spconv. Defaults to 'SubMConv3d'.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            normalization layer. Defaults to None.
+        order (Tuple[str]): The order of conv/norm/activation layers. It is a
+            sequence of "conv", "norm" and "act". Common examples are
+            ("conv", "norm", "act") and ("act", "conv", "norm").
+            Defaults to ('conv', 'norm', 'act').
+
+    Returns:
+        spconv.SparseSequential: sparse convolution module.
+    """
+    assert isinstance(order, tuple) and len(order) <= 3
+    assert set(order) | {'conv', 'norm', 'act'} == {'conv', 'norm', 'act'}
+
+    conv_cfg = dict(type=conv_type, indice_key=indice_key)
+    if norm_cfg is None:
+        norm_cfg = dict(type='BN1d')
+
+    layers = list()
+    for layer in order:
+        if layer == 'conv':
+            if conv_type not in [
+                    'SparseInverseConv3d', 'SparseInverseConv2d',
+                    'SparseInverseConv1d'
+            ]:
+                layers.append(
+                    build_conv_layer(
+                        conv_cfg,
+                        in_channels,
+                        out_channels,
+                        kernel_size,
+                        stride=stride,
+                        padding=padding,
+                        bias=False))
+            else:
+                layers.append(
+                    build_conv_layer(
+                        conv_cfg,
+                        in_channels,
+                        out_channels,
+                        kernel_size,
+                        bias=False))
+        elif layer == 'norm':
+            layers.append(build_norm_layer(norm_cfg, out_channels)[1])
+        elif layer == 'act':
+            layers.append(nn.ReLU(inplace=True))
+
+    layers = SparseSequential(*layers)
+    return layers
diff --git a/mmde/mmdet3d/models/layers/spconv/__init__.py b/mmde/mmdet3d/models/layers/spconv/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..37b533e0c6738ff7a1fc4a4740a394bed3270c2f
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/spconv/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .overwrite_spconv import register_spconv2
+
+try:
+    import spconv
+except ImportError:
+    IS_SPCONV2_AVAILABLE = False
+else:
+    if hasattr(spconv, '__version__') and spconv.__version__ >= '2.0.0':
+        IS_SPCONV2_AVAILABLE = register_spconv2()
+    else:
+        IS_SPCONV2_AVAILABLE = False
+
+__all__ = ['IS_SPCONV2_AVAILABLE']
diff --git a/mmde/mmdet3d/models/layers/spconv/overwrite_spconv/__init__.py b/mmde/mmdet3d/models/layers/spconv/overwrite_spconv/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e93d9cabbc730904fdd2d40929c4981491e4640
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/spconv/overwrite_spconv/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .write_spconv2 import register_spconv2
+
+__all__ = ['register_spconv2']
diff --git a/mmde/mmdet3d/models/layers/spconv/overwrite_spconv/write_spconv2.py b/mmde/mmdet3d/models/layers/spconv/overwrite_spconv/write_spconv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa2ae51592e6c1b84288a4d1ecb8d133c3aecc89
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/spconv/overwrite_spconv/write_spconv2.py
@@ -0,0 +1,104 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+from typing import List, OrderedDict
+
+from mmengine.registry import MODELS
+from torch.nn.parameter import Parameter
+
+
+def register_spconv2() -> bool:
+    """This func registers spconv2.0 spconv ops to overwrite the default mmcv
+    spconv ops."""
+    try:
+        from spconv.pytorch import (SparseConv2d, SparseConv3d, SparseConv4d,
+                                    SparseConvTranspose2d,
+                                    SparseConvTranspose3d, SparseInverseConv2d,
+                                    SparseInverseConv3d, SparseModule,
+                                    SubMConv2d, SubMConv3d, SubMConv4d)
+    except ImportError:
+        return False
+    else:
+        MODELS._register_module(SparseConv2d, 'SparseConv2d', force=True)
+        MODELS._register_module(SparseConv3d, 'SparseConv3d', force=True)
+        MODELS._register_module(SparseConv4d, 'SparseConv4d', force=True)
+
+        MODELS._register_module(
+            SparseConvTranspose2d, 'SparseConvTranspose2d', force=True)
+        MODELS._register_module(
+            SparseConvTranspose3d, 'SparseConvTranspose3d', force=True)
+
+        MODELS._register_module(
+            SparseInverseConv2d, 'SparseInverseConv2d', force=True)
+        MODELS._register_module(
+            SparseInverseConv3d, 'SparseInverseConv3d', force=True)
+
+        MODELS._register_module(SubMConv2d, 'SubMConv2d', force=True)
+        MODELS._register_module(SubMConv3d, 'SubMConv3d', force=True)
+        MODELS._register_module(SubMConv4d, 'SubMConv4d', force=True)
+        SparseModule._version = 2
+        SparseModule._load_from_state_dict = _load_from_state_dict
+        return True
+
+
+def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str,
+                          local_metadata: dict, strict: bool,
+                          missing_keys: List[str], unexpected_keys: List[str],
+                          error_msgs: List[str]) -> None:
+    """Rewrite this func to compat the convolutional kernel weights between
+    spconv 1.x in MMCV and 2.x in spconv2.x.
+
+    Kernel weights in MMCV spconv has shape in (D,H,W,in_channel,out_channel) ,
+    while those in spcon2.x is in (out_channel,D,H,W,in_channel).
+    """
+    version = local_metadata.get('version', None)
+    for hook in self._load_state_dict_pre_hooks.values():
+        hook(state_dict, prefix, local_metadata, strict, missing_keys,
+             unexpected_keys, error_msgs)
+
+    local_name_params = itertools.chain(self._parameters.items(),
+                                        self._buffers.items())
+    local_state = {k: v.data for k, v in local_name_params if v is not None}
+
+    for name, param in local_state.items():
+        key = prefix + name
+        if key in state_dict:
+            input_param = state_dict[key]
+
+            # Backward compatibility: loading 1-dim tensor from
+            # 0.3.* to version 0.4+
+            if len(param.shape) == 0 and len(input_param.shape) == 1:
+                input_param = input_param[0]
+            if version != 2:
+                dims = [len(input_param.shape) - 1] + list(
+                    range(len(input_param.shape) - 1))
+                input_param = input_param.permute(*dims)
+            if input_param.shape != param.shape:
+                # local shape should match the one in checkpoint
+                error_msgs.append(
+                    f'size mismatch for {key}: copying a param with '
+                    f'shape {key, input_param.shape} from checkpoint,'
+                    f'the shape in current model is {param.shape}.')
+                continue
+
+            if isinstance(input_param, Parameter):
+                # backwards compatibility for serialized parameters
+                input_param = input_param.data
+            try:
+                param.copy_(input_param)
+            except Exception:
+                error_msgs.append(
+                    f'While copying the parameter named "{key}", whose '
+                    f'dimensions in the model are {param.size()} and whose '
+                    f'dimensions in the checkpoint are {input_param.size()}.')
+        elif strict:
+            missing_keys.append(key)
+
+    if strict:
+        for key, input_param in state_dict.items():
+            if key.startswith(prefix):
+                input_name = key[len(prefix):]
+                input_name = input_name.split(
+                    '.', 1)[0]  # get the name of param/buffer/child
+                if input_name not in self._modules \
+                        and input_name not in local_state:
+                    unexpected_keys.append(key)
diff --git a/mmde/mmdet3d/models/layers/torchsparse/__init__.py b/mmde/mmdet3d/models/layers/torchsparse/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1232c73c788bf622c0ad18ee5addd8924b7622a2
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/torchsparse/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .torchsparse_wrapper import register_torchsparse
+
+try:
+    import torchsparse  # noqa
+except ImportError:
+    IS_TORCHSPARSE_AVAILABLE = False
+else:
+    IS_TORCHSPARSE_AVAILABLE = register_torchsparse()
+
+__all__ = ['IS_TORCHSPARSE_AVAILABLE']
diff --git a/mmde/mmdet3d/models/layers/torchsparse/torchsparse_wrapper.py b/mmde/mmdet3d/models/layers/torchsparse/torchsparse_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcb2490d876d13257cc43c9a4e9c67324dfcf81a
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/torchsparse/torchsparse_wrapper.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmengine.registry import MODELS
+
+
+def register_torchsparse() -> bool:
+    """This func registers torchsparse modules."""
+    try:
+        from torchsparse.nn import (BatchNorm, Conv3d, GroupNorm, LeakyReLU,
+                                    ReLU)
+        from torchsparse.nn.utils import fapply
+        from torchsparse.tensor import SparseTensor
+    except ImportError:
+        return False
+    else:
+
+        class SyncBatchNorm(nn.SyncBatchNorm):
+
+            def forward(self, input: SparseTensor) -> SparseTensor:
+                return fapply(input, super().forward)
+
+        MODELS._register_module(Conv3d, 'TorchSparseConv3d')
+        MODELS._register_module(BatchNorm, 'TorchSparseBN')
+        MODELS._register_module(SyncBatchNorm, 'TorchSparseSyncBN')
+        MODELS._register_module(GroupNorm, 'TorchSparseGN')
+        MODELS._register_module(ReLU, 'TorchSparseReLU')
+        MODELS._register_module(LeakyReLU, 'TorchSparseLeakyReLU')
+        return True
diff --git a/mmde/mmdet3d/models/layers/torchsparse_block.py b/mmde/mmdet3d/models/layers/torchsparse_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..727db5f3da43f458043ed3d92ad5dadee6f91098
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/torchsparse_block.py
@@ -0,0 +1,196 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence, Union
+
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmengine.model import BaseModule
+from torch import nn
+
+from mmdet3d.utils import ConfigType, OptConfigType
+from .torchsparse import IS_TORCHSPARSE_AVAILABLE
+
+if IS_TORCHSPARSE_AVAILABLE:
+    import torchsparse.nn as spnn
+    from torchsparse.tensor import SparseTensor
+else:
+    SparseTensor = None
+
+
+class TorchSparseConvModule(BaseModule):
+    """A torchsparse conv block that bundles conv/norm/activation layers.
+
+    Args:
+        in_channels (int): In channels of block.
+        out_channels (int): Out channels of block.
+        kernel_size (int or Tuple[int]): Kernel_size of block.
+        stride (int or Tuple[int]): Stride of the first block. Defaults to 1.
+        dilation (int): Dilation of block. Defaults to 1.
+        bias (bool): Whether use bias in conv. Defaults to False.
+        transposed (bool): Whether use transposed convolution operator.
+            Defaults to False.
+        norm_cfg (:obj:`ConfigDict` or dict): The config of normalization.
+        init_cfg (:obj:`ConfigDict` or dict, optional): Initialization config.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Sequence[int]],
+                 stride: Union[int, Sequence[int]] = 1,
+                 dilation: int = 1,
+                 bias: bool = False,
+                 transposed: bool = False,
+                 norm_cfg: ConfigType = dict(type='TorchSparseBN'),
+                 act_cfg: ConfigType = dict(
+                     type='TorchSparseReLU', inplace=True),
+                 init_cfg: OptConfigType = None,
+                 **kwargs) -> None:
+        super().__init__(init_cfg)
+        layers = [
+            spnn.Conv3d(in_channels, out_channels, kernel_size, stride,
+                        dilation, bias, transposed)
+        ]
+        if norm_cfg is not None:
+            _, norm = build_norm_layer(norm_cfg, out_channels)
+            layers.append(norm)
+        if act_cfg is not None:
+            activation = build_activation_layer(act_cfg)
+            layers.append(activation)
+        self.net = nn.Sequential(*layers)
+
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        out = self.net(x)
+        return out
+
+
+class TorchSparseBasicBlock(BaseModule):
+    """Torchsparse residual basic block for MinkUNet.
+
+    Args:
+        in_channels (int): In channels of block.
+        out_channels (int): Out channels of block.
+        kernel_size (int or Tuple[int]): Kernel_size of block.
+        stride (int or Tuple[int]): Stride of the first block. Defaults to 1.
+        dilation (int): Dilation of block. Defaults to 1.
+        bias (bool): Whether use bias in conv. Defaults to False.
+        norm_cfg (:obj:`ConfigDict` or dict): The config of normalization.
+        init_cfg (:obj:`ConfigDict` or dict, optional): Initialization config.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Sequence[int]] = 3,
+                 stride: Union[int, Sequence[int]] = 1,
+                 dilation: int = 1,
+                 bias: bool = False,
+                 norm_cfg: ConfigType = dict(type='TorchSparseBN'),
+                 init_cfg: OptConfigType = None,
+                 **kwargs) -> None:
+        super().__init__(init_cfg)
+        _, norm1 = build_norm_layer(norm_cfg, out_channels)
+        _, norm2 = build_norm_layer(norm_cfg, out_channels)
+
+        self.net = nn.Sequential(
+            spnn.Conv3d(in_channels, out_channels, kernel_size, stride,
+                        dilation, bias), norm1, spnn.ReLU(inplace=True),
+            spnn.Conv3d(
+                out_channels,
+                out_channels,
+                kernel_size,
+                stride=1,
+                dilation=dilation,
+                bias=bias), norm2)
+
+        if in_channels == out_channels and stride == 1:
+            self.downsample = nn.Identity()
+        else:
+            _, norm3 = build_norm_layer(norm_cfg, out_channels)
+            self.downsample = nn.Sequential(
+                spnn.Conv3d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=stride,
+                    dilation=dilation,
+                    bias=bias), norm3)
+
+        self.relu = spnn.ReLU(inplace=True)
+
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        out = self.relu(self.net(x) + self.downsample(x))
+        return out
+
+
+class TorchSparseBottleneck(BaseModule):
+    """Torchsparse residual basic block for MinkUNet.
+
+    Args:
+        in_channels (int): In channels of block.
+        out_channels (int): Out channels of block.
+        kernel_size (int or Tuple[int]): Kernel_size of block.
+        stride (int or Tuple[int]): Stride of the second block. Defaults to 1.
+        dilation (int): Dilation of block. Defaults to 1.
+        bias (bool): Whether use bias in conv. Defaults to False.
+        norm_cfg (:obj:`ConfigDict` or dict): The config of normalization.
+        init_cfg (:obj:`ConfigDict` or dict, optional): Initialization config.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Sequence[int]] = 3,
+                 stride: Union[int, Sequence[int]] = 1,
+                 dilation: int = 1,
+                 bias: bool = False,
+                 norm_cfg: ConfigType = dict(type='TorchSparseBN'),
+                 init_cfg: OptConfigType = None,
+                 **kwargs) -> None:
+        super().__init__(init_cfg)
+        _, norm1 = build_norm_layer(norm_cfg, out_channels)
+        _, norm2 = build_norm_layer(norm_cfg, out_channels)
+        _, norm3 = build_norm_layer(norm_cfg, out_channels)
+
+        self.net = nn.Sequential(
+            spnn.Conv3d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                dilation=dilation,
+                bias=bias), norm1, spnn.ReLU(inplace=True),
+            spnn.Conv3d(
+                out_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                dilation=dilation,
+                bias=bias), norm2, spnn.ReLU(inplace=True),
+            spnn.Conv3d(
+                out_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                dilation=dilation,
+                bias=bias), norm3)
+
+        if in_channels == out_channels and stride == 1:
+            self.downsample = nn.Identity()
+        else:
+            _, norm4 = build_norm_layer(norm_cfg, out_channels)
+            self.downsample = nn.Sequential(
+                spnn.Conv3d(
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    stride=stride,
+                    dilation=dilation,
+                    bias=bias), norm4)
+
+        self.relu = spnn.ReLU(inplace=True)
+
+    def forward(self, x: SparseTensor) -> SparseTensor:
+        out = self.relu(self.net(x) + self.downsample(x))
+        return out
diff --git a/mmde/mmdet3d/models/layers/transformer.py b/mmde/mmdet3d/models/layers/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2c9663d2feee8f5c8bd006f1c28e6425610ae04
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/transformer.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+from mmcv.cnn.bricks.transformer import MultiheadAttention
+from mmengine.registry import MODELS
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.utils import ConfigType, OptMultiConfig
+
+
+@MODELS.register_module()
+class GroupFree3DMHA(MultiheadAttention):
+    """A wrapper for torch.nn.MultiheadAttention for GroupFree3D.
+
+    This module implements MultiheadAttention with identity connection,
+    and positional encoding used in DETR is also passed as input.
+
+    Args:
+        embed_dims (int): The embedding dimension.
+        num_heads (int): Parallel attention heads. Same as
+            `nn.MultiheadAttention`.
+        attn_drop (float): A Dropout layer on attn_output_weights.
+            Defaults to 0.0.
+        proj_drop (float): A Dropout layer. Defaults to 0.0.
+        dropout_layer (ConfigType): The dropout_layer used when adding
+            the shortcut. Defaults to dict(type='DropOut', drop_prob=0.).
+        init_cfg (:obj:`ConfigDict` or dict or List[:obj:`Contigdict` or dict],
+            optional): Initialization config dict. Defaults to None.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim) or (n, batch, embed_dim).
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 embed_dims: int,
+                 num_heads: int,
+                 attn_drop: float = 0.,
+                 proj_drop: float = 0.,
+                 dropout_layer: ConfigType = dict(
+                     type='DropOut', drop_prob=0.),
+                 init_cfg: OptMultiConfig = None,
+                 batch_first: bool = False,
+                 **kwargs) -> None:
+        super(GroupFree3DMHA,
+              self).__init__(embed_dims, num_heads, attn_drop, proj_drop,
+                             dropout_layer, init_cfg, batch_first, **kwargs)
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor,
+                value: Tensor,
+                identity: Tensor,
+                query_pos: Optional[Tensor] = None,
+                key_pos: Optional[Tensor] = None,
+                attn_mask: Optional[Tensor] = None,
+                key_padding_mask: Optional[Tensor] = None,
+                **kwargs) -> Tensor:
+        """Forward function for `GroupFree3DMHA`.
+
+        **kwargs allow passing a more general data flow when combining
+        with other operations in `transformerlayer`.
+
+        Args:
+            query (Tensor): The input query with shape [num_queries, bs,
+                embed_dims]. Same in `nn.MultiheadAttention.forward`.
+            key (Tensor): The key tensor with shape [num_keys, bs,
+                embed_dims]. Same in `nn.MultiheadAttention.forward`.
+                If None, the ``query`` will be used.
+            value (Tensor): The value tensor with same shape as `key`.
+                Same in `nn.MultiheadAttention.forward`.
+                If None, the `key` will be used.
+            identity (Tensor): This tensor, with the same shape as x,
+                will be used for the identity link. If None, `x` will be used.
+            query_pos (Tensor, optional): The positional encoding for query,
+                with the same shape as `x`. Defaults to None.
+                If not None, it will be added to `x` before forward function.
+            key_pos (Tensor, optional): The positional encoding for `key`,
+                with the same shape as `key`. Defaults to None. If not None,
+                it will be added to `key` before forward function. If None,
+                and `query_pos` has the same shape as `key`, then `query_pos`
+                will be used for `key_pos`. Defaults to None.
+            attn_mask (Tensor, optional): ByteTensor mask with shape
+                [num_queries, num_keys].
+                Same in `nn.MultiheadAttention.forward`. Defaults to None.
+            key_padding_mask (Tensor, optional): ByteTensor with shape
+                [bs, num_keys]. Same in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+
+        Returns:
+            Tensor: Forwarded results with shape [num_queries, bs, embed_dims].
+        """
+
+        if hasattr(self, 'operation_name'):
+            if self.operation_name == 'self_attn':
+                value = value + query_pos
+            elif self.operation_name == 'cross_attn':
+                value = value + key_pos
+            else:
+                raise NotImplementedError(
+                    f'{self.__class__.name} '
+                    f"can't be used as {self.operation_name}")
+        else:
+            value = value + query_pos
+
+        return super(GroupFree3DMHA, self).forward(
+            query=query,
+            key=key,
+            value=value,
+            identity=identity,
+            query_pos=query_pos,
+            key_pos=key_pos,
+            attn_mask=attn_mask,
+            key_padding_mask=key_padding_mask,
+            **kwargs)
+
+
+@MODELS.register_module()
+class ConvBNPositionalEncoding(nn.Module):
+    """Absolute position embedding with Conv learning.
+
+    Args:
+        input_channel (int): Input features dim.
+        num_pos_feats (int): Output position features dim.
+            Defaults to 288 to be consistent with seed features dim.
+    """
+
+    def __init__(self, input_channel: int, num_pos_feats: int = 288) -> None:
+        super(ConvBNPositionalEncoding, self).__init__()
+        self.position_embedding_head = nn.Sequential(
+            nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
+            nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True),
+            nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
+
+    def forward(self, xyz: Tensor) -> Tensor:
+        """Forward pass.
+
+        Args:
+            xyz (Tensor): (B, N, 3) The coordinates to embed.
+
+        Returns:
+            Tensor: (B, num_pos_feats, N) The embedded position features.
+        """
+        xyz = xyz.permute(0, 2, 1)
+        position_embedding = self.position_embedding_head(xyz)
+        return position_embedding
diff --git a/mmde/mmdet3d/models/layers/vote_module.py b/mmde/mmdet3d/models/layers/vote_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..8759aec495e996fc6350ab49ee8821cd8bef2e0f
--- /dev/null
+++ b/mmde/mmdet3d/models/layers/vote_module.py
@@ -0,0 +1,190 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from mmcv.cnn import ConvModule
+from mmengine import is_tuple_of
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType
+
+
+class VoteModule(nn.Module):
+    """Vote module.
+
+    Generate votes from seed point features.
+
+    Args:
+        in_channels (int): Number of channels of seed point features.
+        vote_per_seed (int): Number of votes generated from each seed point.
+            Defaults to 1.
+        gt_per_seed (int): Number of ground truth votes generated from each
+            seed point. Defaults to 3.
+        num_points (int): Number of points to be used for voting.
+            Defaults to 1.
+        conv_channels (tuple[int]): Out channels of vote generating
+            convolution. Defaults to (16, 16).
+        conv_cfg (:obj:`ConfigDict` or dict): Config dict for convolution
+            layer. Defaults to dict(type='Conv1d').
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN1d').
+        norm_feats (bool): Whether to normalize features. Default to True.
+        with_res_feat (bool): Whether to predict residual features.
+            Defaults to True.
+        vote_xyz_range (List[float], optional): The range of points
+            translation. Defaults to None.
+        vote_loss (:obj:`ConfigDict` or dict, optional): Config of vote loss.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 vote_per_seed: int = 1,
+                 gt_per_seed: int = 3,
+                 num_points: int = -1,
+                 conv_channels: Tuple[int] = (16, 16),
+                 conv_cfg: ConfigType = dict(type='Conv1d'),
+                 norm_cfg: ConfigType = dict(type='BN1d'),
+                 act_cfg: ConfigType = dict(type='ReLU'),
+                 norm_feats: bool = True,
+                 with_res_feat: bool = True,
+                 vote_xyz_range: List[float] = None,
+                 vote_loss: OptConfigType = None) -> None:
+        super(VoteModule, self).__init__()
+        self.in_channels = in_channels
+        self.vote_per_seed = vote_per_seed
+        self.gt_per_seed = gt_per_seed
+        self.num_points = num_points
+        self.norm_feats = norm_feats
+        self.with_res_feat = with_res_feat
+
+        assert vote_xyz_range is None or is_tuple_of(vote_xyz_range, float)
+        self.vote_xyz_range = vote_xyz_range
+
+        if vote_loss is not None:
+            self.vote_loss = MODELS.build(vote_loss)
+
+        prev_channels = in_channels
+        vote_conv_list = list()
+        for k in range(len(conv_channels)):
+            vote_conv_list.append(
+                ConvModule(
+                    prev_channels,
+                    conv_channels[k],
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    bias=True,
+                    inplace=True))
+            prev_channels = conv_channels[k]
+        self.vote_conv = nn.Sequential(*vote_conv_list)
+
+        # conv_out predicts coordinate and residual features
+        if with_res_feat:
+            out_channel = (3 + in_channels) * self.vote_per_seed
+        else:
+            out_channel = 3 * self.vote_per_seed
+        self.conv_out = nn.Conv1d(prev_channels, out_channel, 1)
+
+    def forward(self, seed_points: Tensor,
+                seed_feats: Tensor) -> Tuple[Tensor]:
+        """Forward.
+
+        Args:
+            seed_points (Tensor): Coordinate of the seed points in shape
+                (B, N, 3).
+            seed_feats (Tensor): Features of the seed points in shape
+                (B, C, N).
+
+        Returns:
+            Tuple[torch.Tensor]:
+
+                - vote_points: Voted xyz based on the seed points
+                  with shape (B, M, 3), ``M=num_seed*vote_per_seed``.
+                - vote_features: Voted features based on the seed points with
+                  shape (B, C, M) where ``M=num_seed*vote_per_seed``,
+                  ``C=vote_feature_dim``.
+        """
+        if self.num_points != -1:
+            assert self.num_points < seed_points.shape[1], \
+                f'Number of vote points ({self.num_points}) should be '\
+                f'smaller than seed points size ({seed_points.shape[1]})'
+            seed_points = seed_points[:, :self.num_points]
+            seed_feats = seed_feats[..., :self.num_points]
+
+        batch_size, feat_channels, num_seed = seed_feats.shape
+        num_vote = num_seed * self.vote_per_seed
+        x = self.vote_conv(seed_feats)
+        # (batch_size, (3+out_dim)*vote_per_seed, num_seed)
+        votes = self.conv_out(x)
+
+        votes = votes.transpose(2, 1).view(batch_size, num_seed,
+                                           self.vote_per_seed, -1)
+
+        offset = votes[:, :, :, 0:3]
+        if self.vote_xyz_range is not None:
+            limited_offset_list = []
+            for axis in range(len(self.vote_xyz_range)):
+                limited_offset_list.append(offset[..., axis].clamp(
+                    min=-self.vote_xyz_range[axis],
+                    max=self.vote_xyz_range[axis]))
+            limited_offset = torch.stack(limited_offset_list, -1)
+            vote_points = (seed_points.unsqueeze(2) +
+                           limited_offset).contiguous()
+        else:
+            vote_points = (seed_points.unsqueeze(2) + offset).contiguous()
+        vote_points = vote_points.view(batch_size, num_vote, 3)
+        offset = offset.reshape(batch_size, num_vote, 3).transpose(2, 1)
+
+        if self.with_res_feat:
+            res_feats = votes[:, :, :, 3:]
+            vote_feats = (seed_feats.transpose(2, 1).unsqueeze(2) +
+                          res_feats).contiguous()
+            vote_feats = vote_feats.view(batch_size,
+                                         num_vote, feat_channels).transpose(
+                                             2, 1).contiguous()
+
+            if self.norm_feats:
+                features_norm = torch.norm(vote_feats, p=2, dim=1)
+                vote_feats = vote_feats.div(features_norm.unsqueeze(1))
+        else:
+            vote_feats = seed_feats
+        return vote_points, vote_feats, offset
+
+    def get_loss(self, seed_points: Tensor, vote_points: Tensor,
+                 seed_indices: Tensor, vote_targets_mask: Tensor,
+                 vote_targets: Tensor) -> Tensor:
+        """Calculate loss of voting module.
+
+        Args:
+            seed_points (Tensor): Coordinate of the seed points.
+            vote_points (Tensor): Coordinate of the vote points.
+            seed_indices (Tensor): Indices of seed points in raw points.
+            vote_targets_mask (Tensor): Mask of valid vote targets.
+            vote_targets (Tensor): Targets of votes.
+
+        Returns:
+            Tensor: Weighted vote loss.
+        """
+        batch_size, num_seed = seed_points.shape[:2]
+
+        seed_gt_votes_mask = torch.gather(vote_targets_mask, 1,
+                                          seed_indices).float()
+
+        seed_indices_expand = seed_indices.unsqueeze(-1).repeat(
+            1, 1, 3 * self.gt_per_seed)
+        seed_gt_votes = torch.gather(vote_targets, 1, seed_indices_expand)
+        seed_gt_votes += seed_points.repeat(1, 1, self.gt_per_seed)
+
+        weight = seed_gt_votes_mask / (torch.sum(seed_gt_votes_mask) + 1e-6)
+        distance = self.vote_loss(
+            vote_points.view(batch_size * num_seed, -1, 3),
+            seed_gt_votes.view(batch_size * num_seed, -1, 3),
+            dst_weight=weight.view(batch_size * num_seed, 1))[1]
+        vote_loss = torch.sum(torch.min(distance, dim=1)[0])
+
+        return vote_loss
diff --git a/mmde/mmdet3d/models/losses/__init__.py b/mmde/mmdet3d/models/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6956c7219d4d7d2906f501f0f0c791a29e07a411
--- /dev/null
+++ b/mmde/mmdet3d/models/losses/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.losses import FocalLoss, SmoothL1Loss, binary_cross_entropy
+
+from .axis_aligned_iou_loss import AxisAlignedIoULoss, axis_aligned_iou_loss
+from .chamfer_distance import ChamferDistance, chamfer_distance
+from .lovasz_loss import LovaszLoss
+from .multibin_loss import MultiBinLoss
+from .paconv_regularization_loss import PAConvRegularizationLoss
+from .rotated_iou_loss import RotatedIoU3DLoss, rotated_iou_3d_loss
+from .uncertain_smooth_l1_loss import UncertainL1Loss, UncertainSmoothL1Loss
+
+__all__ = [
+    'FocalLoss', 'SmoothL1Loss', 'binary_cross_entropy', 'ChamferDistance',
+    'chamfer_distance', 'axis_aligned_iou_loss', 'AxisAlignedIoULoss',
+    'PAConvRegularizationLoss', 'UncertainL1Loss', 'UncertainSmoothL1Loss',
+    'MultiBinLoss', 'RotatedIoU3DLoss', 'rotated_iou_3d_loss', 'LovaszLoss'
+]
diff --git a/mmde/mmdet3d/models/losses/axis_aligned_iou_loss.py b/mmde/mmdet3d/models/losses/axis_aligned_iou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..45e25c919d6999be29b20a25b5e2b5692ba51d14
--- /dev/null
+++ b/mmde/mmdet3d/models/losses/axis_aligned_iou_loss.py
@@ -0,0 +1,85 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmdet.models.losses.utils import weighted_loss
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import AxisAlignedBboxOverlaps3D
+
+
+@weighted_loss
+def axis_aligned_iou_loss(pred: Tensor, target: Tensor) -> Tensor:
+    """Calculate the IoU loss (1-IoU) of two set of axis aligned bounding
+    boxes. Note that predictions and targets are one-to-one corresponded.
+
+    Args:
+        pred (Tensor): Bbox predictions with shape [..., 3].
+        target (Tensor): Bbox targets (gt) with shape [..., 3].
+
+    Returns:
+        Tensor: IoU loss between predictions and targets.
+    """
+
+    axis_aligned_iou = AxisAlignedBboxOverlaps3D()(
+        pred, target, is_aligned=True)
+    iou_loss = 1 - axis_aligned_iou
+    return iou_loss
+
+
+@MODELS.register_module()
+class AxisAlignedIoULoss(nn.Module):
+    """Calculate the IoU loss (1-IoU) of axis aligned bounding boxes.
+
+    Args:
+        reduction (str): Method to reduce losses.
+            The valid reduction method are 'none', 'sum' or 'mean'.
+            Defaults to 'mean'.
+        loss_weight (float): Weight of loss. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super(AxisAlignedIoULoss, self).__init__()
+        assert reduction in ['none', 'sum', 'mean']
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[float] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function of loss calculation.
+
+        Args:
+            pred (Tensor): Bbox predictions with shape [..., 3].
+            target (Tensor): Bbox targets (gt) with shape [..., 3].
+            weight (Tensor, optional): Weight of loss.
+                Defaults to None.
+            avg_factor (float, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): Method to reduce losses.
+                The valid reduction method are 'none', 'sum' or 'mean'.
+                Defaults to None.
+
+        Returns:
+            Tensor: IoU loss between predictions and targets.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            return (pred * weight).sum()
+        return axis_aligned_iou_loss(
+            pred,
+            target,
+            weight=weight,
+            avg_factor=avg_factor,
+            reduction=reduction) * self.loss_weight
diff --git a/mmde/mmdet3d/models/losses/chamfer_distance.py b/mmde/mmdet3d/models/losses/chamfer_distance.py
new file mode 100644
index 0000000000000000000000000000000000000000..10982135c7d32b21a074a78bbdbb3449f0a0d140
--- /dev/null
+++ b/mmde/mmdet3d/models/losses/chamfer_distance.py
@@ -0,0 +1,156 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+from torch import nn as nn
+from torch.nn.functional import l1_loss, mse_loss, smooth_l1_loss
+
+from mmdet3d.registry import MODELS
+
+
+def chamfer_distance(
+        src: Tensor,
+        dst: Tensor,
+        src_weight: Union[Tensor, float] = 1.0,
+        dst_weight: Union[Tensor, float] = 1.0,
+        criterion_mode: str = 'l2',
+        reduction: str = 'mean') -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+    """Calculate Chamfer Distance of two sets.
+
+    Args:
+        src (Tensor): Source set with shape [B, N, C] to
+            calculate Chamfer Distance.
+        dst (Tensor): Destination set with shape [B, M, C] to
+            calculate Chamfer Distance.
+        src_weight (Tensor or float): Weight of source loss. Defaults to 1.0.
+        dst_weight (Tensor or float): Weight of destination loss.
+            Defaults to 1.0.
+        criterion_mode (str): Criterion mode to calculate distance.
+            The valid modes are 'smooth_l1', 'l1' or 'l2'. Defaults to 'l2'.
+        reduction (str): Method to reduce losses.
+            The valid reduction method are 'none', 'sum' or 'mean'.
+            Defaults to 'mean'.
+
+    Returns:
+        tuple: Source and Destination loss with the corresponding indices.
+
+            - loss_src (Tensor): The min distance
+              from source to destination.
+            - loss_dst (Tensor): The min distance
+              from destination to source.
+            - indices1 (Tensor): Index the min distance point
+              for each point in source to destination.
+            - indices2 (Tensor): Index the min distance point
+              for each point in destination to source.
+    """
+
+    if criterion_mode == 'smooth_l1':
+        criterion = smooth_l1_loss
+    elif criterion_mode == 'l1':
+        criterion = l1_loss
+    elif criterion_mode == 'l2':
+        criterion = mse_loss
+    else:
+        raise NotImplementedError
+
+    src_expand = src.unsqueeze(2).repeat(1, 1, dst.shape[1], 1)
+    dst_expand = dst.unsqueeze(1).repeat(1, src.shape[1], 1, 1)
+
+    distance = criterion(src_expand, dst_expand, reduction='none').sum(-1)
+    src2dst_distance, indices1 = torch.min(distance, dim=2)  # (B,N)
+    dst2src_distance, indices2 = torch.min(distance, dim=1)  # (B,M)
+
+    loss_src = (src2dst_distance * src_weight)
+    loss_dst = (dst2src_distance * dst_weight)
+
+    if reduction == 'sum':
+        loss_src = torch.sum(loss_src)
+        loss_dst = torch.sum(loss_dst)
+    elif reduction == 'mean':
+        loss_src = torch.mean(loss_src)
+        loss_dst = torch.mean(loss_dst)
+    elif reduction == 'none':
+        pass
+    else:
+        raise NotImplementedError
+
+    return loss_src, loss_dst, indices1, indices2
+
+
+@MODELS.register_module()
+class ChamferDistance(nn.Module):
+    """Calculate Chamfer Distance of two sets.
+
+    Args:
+        mode (str): Criterion mode to calculate distance.
+            The valid modes are 'smooth_l1', 'l1' or 'l2'. Defaults to 'l2'.
+        reduction (str): Method to reduce losses.
+            The valid reduction method are 'none', 'sum' or 'mean'.
+            Defaults to 'mean'.
+        loss_src_weight (float): Weight of loss_source. Defaults to l.0.
+        loss_dst_weight (float): Weight of loss_target. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 mode: str = 'l2',
+                 reduction: str = 'mean',
+                 loss_src_weight: float = 1.0,
+                 loss_dst_weight: float = 1.0) -> None:
+        super(ChamferDistance, self).__init__()
+
+        assert mode in ['smooth_l1', 'l1', 'l2']
+        assert reduction in ['none', 'sum', 'mean']
+        self.mode = mode
+        self.reduction = reduction
+        self.loss_src_weight = loss_src_weight
+        self.loss_dst_weight = loss_dst_weight
+
+    def forward(
+        self,
+        source: Tensor,
+        target: Tensor,
+        src_weight: Union[Tensor, float] = 1.0,
+        dst_weight: Union[Tensor, float] = 1.0,
+        reduction_override: Optional[str] = None,
+        return_indices: bool = False,
+        **kwargs
+    ) -> Union[Tuple[Tensor, Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]:
+        """Forward function of loss calculation.
+
+        Args:
+            source (Tensor): Source set with shape [B, N, C] to
+                calculate Chamfer Distance.
+            target (Tensor): Destination set with shape [B, M, C] to
+                calculate Chamfer Distance.
+            src_weight (Tensor | float):
+                Weight of source loss. Defaults to 1.0.
+            dst_weight (Tensor | float):
+                Weight of destination loss. Defaults to 1.0.
+            reduction_override (str, optional): Method to reduce losses.
+                The valid reduction method are 'none', 'sum' or 'mean'.
+                Defaults to None.
+            return_indices (bool): Whether to return indices.
+                Defaults to False.
+
+        Returns:
+            tuple[Tensor]: If ``return_indices=True``, return losses of
+                source and target with their corresponding indices in the
+                order of ``(loss_source, loss_target, indices1, indices2)``.
+                If ``return_indices=False``, return
+                ``(loss_source, loss_target)``.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+
+        loss_source, loss_target, indices1, indices2 = chamfer_distance(
+            source, target, src_weight, dst_weight, self.mode, reduction)
+
+        loss_source *= self.loss_src_weight
+        loss_target *= self.loss_dst_weight
+
+        if return_indices:
+            return loss_source, loss_target, indices1, indices2
+        else:
+            return loss_source, loss_target
diff --git a/mmde/mmdet3d/models/losses/lovasz_loss.py b/mmde/mmdet3d/models/losses/lovasz_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9bcc270bd29b5e12971400d6bd3d47b7cdbb03a
--- /dev/null
+++ b/mmde/mmdet3d/models/losses/lovasz_loss.py
@@ -0,0 +1,356 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Directly borrowed from mmsegmentation.
+
+Modified from https://github.com/bermanmaxim/LovaszSoftmax/blob/master/pytor
+ch/lovasz_losses.py Lovasz-Softmax and Jaccard hinge loss in PyTorch Maxim
+Berman 2018 ESAT-PSI KU Leuven (MIT License)
+"""
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmdet.models import weight_reduce_loss
+from mmengine.utils import is_list_of
+
+from mmdet3d.registry import MODELS
+
+
+def lovasz_grad(gt_sorted: torch.Tensor) -> torch.Tensor:
+    """Computes gradient of the Lovasz extension w.r.t sorted errors.
+
+    See Alg. 1 in paper.
+    `The Lovasz-Softmax loss. <https://arxiv.org/abs/1705.08790>`_.
+
+    Args:
+        gt_sorted (torch.Tensor): Sorted ground truth.
+
+    Return:
+        torch.Tensor: Gradient of the Lovasz extension.
+    """
+    p = len(gt_sorted)
+    gts = gt_sorted.sum()
+    intersection = gts - gt_sorted.float().cumsum(0)
+    union = gts + (1 - gt_sorted).float().cumsum(0)
+    jaccard = 1. - intersection / union
+    if p > 1:  # cover 1-pixel case
+        jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
+    return jaccard
+
+
+def flatten_binary_logits(
+        logits: torch.Tensor,
+        labels: torch.Tensor,
+        ignore_index: Optional[int] = None
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Flatten predictions and labels in the batch (binary case). Remove
+    tensors whose labels equal to 'ignore_index'.
+
+    Args:
+        probs (torch.Tensor): Predictions to be modified.
+        labels (torch.Tensor): Labels to be modified.
+        ignore_index (int, optional): The label index to be ignored.
+            Defaults to None.
+
+    Return:
+        tuple(torch.Tensor, torch.Tensor): Modified predictions and labels.
+    """
+    logits = logits.view(-1)
+    labels = labels.view(-1)
+    if ignore_index is None:
+        return logits, labels
+    valid = (labels != ignore_index)
+    vlogits = logits[valid]
+    vlabels = labels[valid]
+    return vlogits, vlabels
+
+
+def flatten_probs(
+        probs: torch.Tensor,
+        labels: torch.Tensor,
+        ignore_index: Optional[int] = None
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Flatten predictions and labels in the batch. Remove tensors whose labels
+    equal to 'ignore_index'.
+
+    Args:
+        probs (torch.Tensor): Predictions to be modified.
+        labels (torch.Tensor): Labels to be modified.
+        ignore_index (int, optional): The label index to be ignored.
+            Defaults to None.
+
+    Return:
+        tuple(torch.Tensor, torch.Tensor): Modified predictions and labels.
+    """
+    if probs.dim() != 2:  # for input with P*C
+        if probs.dim() == 3:
+            # assumes output of a sigmoid layer
+            B, H, W = probs.size()
+            probs = probs.view(B, 1, H, W)
+        B, C, H, W = probs.size()
+        probs = probs.permute(0, 2, 3, 1).contiguous().view(-1,
+                                                            C)  # B*H*W, C=P,C
+        labels = labels.view(-1)
+    if ignore_index is None:
+        return probs, labels
+    valid = (labels != ignore_index)
+    vprobs = probs[valid.nonzero().squeeze()]
+    vlabels = labels[valid]
+    return vprobs, vlabels
+
+
+def lovasz_hinge_flat(logits: torch.Tensor,
+                      labels: torch.Tensor) -> torch.Tensor:
+    """Binary Lovasz hinge loss.
+
+    Args:
+        logits (torch.Tensor): Logits at each prediction
+            (between -infty and +infty) with shape [P].
+        labels (torch.Tensor): Binary ground truth labels (0 or 1)
+            with shape [P].
+
+    Returns:
+        torch.Tensor: The calculated loss.
+    """
+    if len(labels) == 0:
+        # only void pixels, the gradients should be 0
+        return logits.sum() * 0.
+    signs = 2. * labels.float() - 1.
+    errors = (1. - logits * signs)
+    errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
+    perm = perm.data
+    gt_sorted = labels[perm]
+    grad = lovasz_grad(gt_sorted)
+    loss = torch.dot(F.relu(errors_sorted), grad)
+    return loss
+
+
+def lovasz_hinge(logits: torch.Tensor,
+                 labels: torch.Tensor,
+                 classes: Optional[Union[str, List[int]]] = None,
+                 per_sample: bool = False,
+                 class_weight: Optional[List[float]] = None,
+                 reduction: str = 'mean',
+                 avg_factor: Optional[int] = None,
+                 ignore_index: int = 255) -> torch.Tensor:
+    """Binary Lovasz hinge loss.
+
+    Args:
+        logits (torch.Tensor): Logits at each pixel
+            (between -infty and +infty) with shape [B, H, W].
+        labels (torch.Tensor): Binary ground truth masks (0 or 1)
+            with shape [B, H, W].
+        classes (Union[str, list[int]], optional): Placeholder, to be
+            consistent with other loss. Defaults to None.
+        per_sample (bool): If per_sample is True, compute the loss per
+            sample instead of per batch. Defaults to False.
+        class_weight (list[float], optional): Placeholder, to be consistent
+            with other loss. Defaults to None.
+        reduction (str): The method used to reduce the loss. Options
+            are "none", "mean" and "sum". This parameter only works when
+            per_sample is True. Defaults to 'mean'.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. This parameter only works when per_sample is True.
+            Defaults to None.
+        ignore_index (Union[int, None]): The label index to be ignored.
+            Defaults to 255.
+
+    Returns:
+        torch.Tensor: The calculated loss.
+    """
+    if per_sample:
+        loss = [
+            lovasz_hinge_flat(*flatten_binary_logits(
+                logit.unsqueeze(0), label.unsqueeze(0), ignore_index))
+            for logit, label in zip(logits, labels)
+        ]
+        loss = weight_reduce_loss(
+            torch.stack(loss), None, reduction, avg_factor)
+    else:
+        loss = lovasz_hinge_flat(
+            *flatten_binary_logits(logits, labels, ignore_index))
+    return loss
+
+
+def lovasz_softmax_flat(
+        probs: torch.Tensor,
+        labels: torch.Tensor,
+        classes: Union[str, List[int]] = 'present',
+        class_weight: Optional[List[float]] = None) -> torch.Tensor:
+    """Multi-class Lovasz-Softmax loss.
+
+    Args:
+        probs (torch.Tensor): Class probabilities at each prediction
+            (between 0 and 1) with shape [P, C]
+        labels (torch.Tensor): Ground truth labels (between 0 and C - 1)
+            with shape [P].
+        classes (Union[str, list[int]]): Classes chosen to calculate loss.
+            'all' for all classes, 'present' for classes present in labels, or
+            a list of classes to average. Defaults to 'present'.
+        class_weight (list[float], optional): The weight for each class.
+            Defaults to None.
+
+    Returns:
+        torch.Tensor: The calculated loss.
+    """
+    if probs.numel() == 0:
+        # only void pixels, the gradients should be 0
+        return probs * 0.
+    C = probs.size(1)
+    losses = []
+    class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes
+    for c in class_to_sum:
+        fg = (labels == c).float()  # foreground for class c
+        if (classes == 'present' and fg.sum() == 0):
+            continue
+        if C == 1:
+            if len(classes) > 1:
+                raise ValueError('Sigmoid output possible only with 1 class')
+            class_pred = probs[:, 0]
+        else:
+            class_pred = probs[:, c]
+        errors = (fg - class_pred).abs()
+        errors_sorted, perm = torch.sort(errors, 0, descending=True)
+        perm = perm.data
+        fg_sorted = fg[perm]
+        loss = torch.dot(errors_sorted, lovasz_grad(fg_sorted))
+        if class_weight is not None:
+            loss *= class_weight[c]
+        losses.append(loss)
+    return torch.stack(losses).mean()
+
+
+def lovasz_softmax(probs: torch.Tensor,
+                   labels: torch.Tensor,
+                   classes: Union[str, List[int]] = 'present',
+                   per_sample: bool = False,
+                   class_weight: List[float] = None,
+                   reduction: str = 'mean',
+                   avg_factor: Optional[int] = None,
+                   ignore_index: int = 255) -> torch.Tensor:
+    """Multi-class Lovasz-Softmax loss.
+
+    Args:
+        probs (torch.Tensor): Class probabilities at each
+            prediction (between 0 and 1) with shape [B, C, H, W].
+        labels (torch.Tensor): Ground truth labels (between 0 and
+            C - 1) with shape [B, H, W].
+        classes (Union[str, list[int]]): Classes chosen to calculate loss.
+            'all' for all classes, 'present' for classes present in labels, or
+            a list of classes to average. Defaults to 'present'.
+        per_sample (bool): If per_sample is True, compute the loss per
+            sample instead of per batch. Defaults to False.
+        class_weight (list[float], optional): The weight for each class.
+            Defaults to None.
+        reduction (str): The method used to reduce the loss. Options
+            are "none", "mean" and "sum". This parameter only works when
+            per_sample is True. Defaults to 'mean'.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. This parameter only works when per_sample is True.
+            Defaults to None.
+        ignore_index (Union[int, None]): The label index to be ignored.
+            Defaults to 255.
+
+    Returns:
+        torch.Tensor: The calculated loss.
+    """
+
+    if per_sample:
+        loss = [
+            lovasz_softmax_flat(
+                *flatten_probs(
+                    prob.unsqueeze(0), label.unsqueeze(0), ignore_index),
+                classes=classes,
+                class_weight=class_weight)
+            for prob, label in zip(probs, labels)
+        ]
+        loss = weight_reduce_loss(
+            torch.stack(loss), None, reduction, avg_factor)
+    else:
+        loss = lovasz_softmax_flat(
+            *flatten_probs(probs, labels, ignore_index),
+            classes=classes,
+            class_weight=class_weight)
+    return loss
+
+
+@MODELS.register_module()
+class LovaszLoss(nn.Module):
+    """LovaszLoss.
+
+    This loss is proposed in `The Lovasz-Softmax loss: A tractable surrogate
+    for the optimization of the intersection-over-union measure in neural
+    networks <https://arxiv.org/abs/1705.08790>`_.
+
+    Args:
+        loss_type (str): Binary or multi-class loss.
+            Defaults to 'multi_class'. Options are "binary" and "multi_class".
+        classes (Union[str, list[int]]): Classes chosen to calculate loss.
+            'all' for all classes, 'present' for classes present in labels, or
+            a list of classes to average. Defaults to 'present'.
+        per_sample (bool): If per_sample is True, compute the loss per
+            sample instead of per batch. Defaults to False.
+        reduction (str): The method used to reduce the loss. Options
+            are "none", "mean" and "sum". This parameter only works when
+            per_sample is True. Defaults to 'mean'.
+        class_weight ([list[float], optional): Weight of each class.
+            Defaults to None.
+        loss_weight (float): Weight of the loss. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 loss_type: str = 'multi_class',
+                 classes: Union[str, List[int]] = 'present',
+                 per_sample: bool = False,
+                 reduction: str = 'mean',
+                 class_weight: Optional[List[float]] = None,
+                 loss_weight: float = 1.0):
+        super().__init__()
+        assert loss_type in ('binary', 'multi_class'), "loss_type should be \
+                                                    'binary' or 'multi_class'."
+
+        if loss_type == 'binary':
+            self.cls_criterion = lovasz_hinge
+        else:
+            self.cls_criterion = lovasz_softmax
+        assert classes in ('all', 'present') or is_list_of(classes, int)
+        if not per_sample:
+            assert reduction == 'none', "reduction should be 'none' when \
+                                                        per_sample is False."
+
+        self.classes = classes
+        self.per_sample = per_sample
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.class_weight = class_weight
+
+    def forward(self,
+                cls_score: torch.Tensor,
+                label: torch.Tensor,
+                avg_factor: int = None,
+                reduction_override: str = None,
+                **kwargs) -> torch.Tensor:
+        """Forward function."""
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if self.class_weight is not None:
+            class_weight = cls_score.new_tensor(self.class_weight)
+        else:
+            class_weight = None
+
+        # if multi-class loss, transform logits to probs
+        if self.cls_criterion == lovasz_softmax:
+            cls_score = F.softmax(cls_score, dim=1)
+
+        loss_cls = self.loss_weight * self.cls_criterion(
+            cls_score,
+            label,
+            self.classes,
+            self.per_sample,
+            class_weight=class_weight,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss_cls
diff --git a/mmde/mmdet3d/models/losses/multibin_loss.py b/mmde/mmdet3d/models/losses/multibin_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..91a1271a454d5200848a5496b39b831fa721261d
--- /dev/null
+++ b/mmde/mmdet3d/models/losses/multibin_loss.py
@@ -0,0 +1,107 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmdet.models.losses.utils import weighted_loss
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+
+
+@weighted_loss
+def multibin_loss(pred_orientations: Tensor,
+                  gt_orientations: Tensor,
+                  num_dir_bins: int = 4) -> Tensor:
+    """Multi-Bin Loss.
+
+    Args:
+        pred_orientations(Tensor): Predicted local vector
+            orientation in [axis_cls, head_cls, sin, cos] format.
+            shape (N, num_dir_bins * 4)
+        gt_orientations(Tensor): Corresponding gt bboxes,
+            shape (N, num_dir_bins * 2).
+        num_dir_bins(int): Number of bins to encode
+            direction angle.
+            Defaults to 4.
+
+    Returns:
+        Tensor: Loss tensor.
+    """
+    cls_losses = 0
+    reg_losses = 0
+    reg_cnt = 0
+    for i in range(num_dir_bins):
+        # bin cls loss
+        cls_ce_loss = F.cross_entropy(
+            pred_orientations[:, (i * 2):(i * 2 + 2)],
+            gt_orientations[:, i].long(),
+            reduction='mean')
+        # regression loss
+        valid_mask_i = (gt_orientations[:, i] == 1)
+        cls_losses += cls_ce_loss
+        if valid_mask_i.sum() > 0:
+            start = num_dir_bins * 2 + i * 2
+            end = start + 2
+            pred_offset = F.normalize(pred_orientations[valid_mask_i,
+                                                        start:end])
+            gt_offset_sin = torch.sin(gt_orientations[valid_mask_i,
+                                                      num_dir_bins + i])
+            gt_offset_cos = torch.cos(gt_orientations[valid_mask_i,
+                                                      num_dir_bins + i])
+            reg_loss = \
+                F.l1_loss(pred_offset[:, 0], gt_offset_sin,
+                          reduction='none') + \
+                F.l1_loss(pred_offset[:, 1], gt_offset_cos,
+                          reduction='none')
+
+            reg_losses += reg_loss.sum()
+            reg_cnt += valid_mask_i.sum()
+
+        return cls_losses / num_dir_bins + reg_losses / reg_cnt
+
+
+@MODELS.register_module()
+class MultiBinLoss(nn.Module):
+    """Multi-Bin Loss for orientation.
+
+    Args:
+        reduction (str): The method to reduce the loss.
+            Options are 'none', 'mean' and 'sum'. Defaults to 'none'.
+        loss_weight (float): The weight of loss. Defaults
+            to 1.0.
+    """
+
+    def __init__(self,
+                 reduction: str = 'none',
+                 loss_weight: float = 1.0) -> None:
+        super(MultiBinLoss, self).__init__()
+        assert reduction in ['none', 'sum', 'mean']
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                num_dir_bins: int,
+                reduction_override: Optional[str] = None) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): The prediction.
+            target (Tensor): The learning target of the prediction.
+            num_dir_bins (int): Number of bins to encode direction angle.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss = self.loss_weight * multibin_loss(
+            pred, target, num_dir_bins=num_dir_bins, reduction=reduction)
+        return loss
diff --git a/mmde/mmdet3d/models/losses/paconv_regularization_loss.py b/mmde/mmdet3d/models/losses/paconv_regularization_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d88761a6296dc071b3dcda68149861914bfe0d8
--- /dev/null
+++ b/mmde/mmdet3d/models/losses/paconv_regularization_loss.py
@@ -0,0 +1,118 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+import torch
+from mmdet.models.losses.utils import weight_reduce_loss
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from ..layers import PAConv, PAConvCUDA
+
+
+def weight_correlation(conv: nn.Module) -> Tensor:
+    """Calculate correlations between kernel weights in Conv's weight bank as
+    regularization loss. The cosine similarity is used as metrics.
+
+    Args:
+        conv (nn.Module): A Conv modules to be regularized.
+            Currently we only support `PAConv` and `PAConvCUDA`.
+
+    Returns:
+        Tensor: Correlations between each kernel weights in weight bank.
+    """
+    assert isinstance(conv, (PAConv, PAConvCUDA)), \
+        f'unsupported module type {type(conv)}'
+    kernels = conv.weight_bank  # [C_in, num_kernels * C_out]
+    in_channels = conv.in_channels
+    out_channels = conv.out_channels
+    num_kernels = conv.num_kernels
+
+    # [num_kernels, Cin * Cout]
+    flatten_kernels = kernels.view(in_channels, num_kernels, out_channels).\
+        permute(1, 0, 2).reshape(num_kernels, -1)
+    # [num_kernels, num_kernels]
+    inner_product = torch.matmul(flatten_kernels, flatten_kernels.T)
+    # [num_kernels, 1]
+    kernel_norms = torch.sum(flatten_kernels**2, dim=-1, keepdim=True)**0.5
+    # [num_kernels, num_kernels]
+    kernel_norms = torch.matmul(kernel_norms, kernel_norms.T)
+    cosine_sims = inner_product / kernel_norms
+    # take upper triangular part excluding diagonal since we only compute
+    # correlation between different kernels once
+    # the square is to ensure positive loss, refer to:
+    # https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/tool/train.py#L208
+    corr = torch.sum(torch.triu(cosine_sims, diagonal=1)**2)
+
+    return corr
+
+
+def paconv_regularization_loss(modules: List[nn.Module],
+                               reduction: str) -> Tensor:
+    """Computes correlation loss of PAConv weight kernels as regularization.
+
+    Args:
+        modules (List[nn.Module] | :obj:`generator`):
+            A list or a python generator of torch.nn.Modules.
+        reduction (str): Method to reduce losses among PAConv modules.
+            The valid reduction method are 'none', 'sum' or 'mean'.
+
+    Returns:
+        Tensor: Correlation loss of kernel weights.
+    """
+    corr_loss = []
+    for module in modules:
+        if isinstance(module, (PAConv, PAConvCUDA)):
+            corr_loss.append(weight_correlation(module))
+    corr_loss = torch.stack(corr_loss)
+
+    # perform reduction
+    corr_loss = weight_reduce_loss(corr_loss, reduction=reduction)
+
+    return corr_loss
+
+
+@MODELS.register_module()
+class PAConvRegularizationLoss(nn.Module):
+    """Calculate correlation loss of kernel weights in PAConv's weight bank.
+
+    This is used as a regularization term in PAConv model training.
+
+    Args:
+        reduction (str): Method to reduce losses. The reduction is performed
+            among all PAConv modules instead of prediction tensors.
+            The valid reduction method are 'none', 'sum' or 'mean'.
+            Defaults to 'mean'.
+        loss_weight (float): Weight of loss. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super(PAConvRegularizationLoss, self).__init__()
+        assert reduction in ['none', 'sum', 'mean']
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                modules: List[nn.Module],
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function of loss calculation.
+
+        Args:
+            modules (List[nn.Module] | :obj:`generator`):
+                A list or a python generator of torch.nn.Modules.
+            reduction_override (str, optional): Method to reduce losses.
+                The valid reduction method are 'none', 'sum' or 'mean'.
+                Defaults to None.
+
+        Returns:
+            Tensor: Correlation loss of kernel weights.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+
+        return self.loss_weight * paconv_regularization_loss(
+            modules, reduction=reduction)
diff --git a/mmde/mmdet3d/models/losses/rotated_iou_loss.py b/mmde/mmdet3d/models/losses/rotated_iou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a737bd341bf78a286c1eaa0c0feae5769221cf8
--- /dev/null
+++ b/mmde/mmdet3d/models/losses/rotated_iou_loss.py
@@ -0,0 +1,91 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmcv.ops import diff_iou_rotated_3d
+from mmdet.models.losses.utils import weighted_loss
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+
+
+@weighted_loss
+def rotated_iou_3d_loss(pred: Tensor, target: Tensor) -> Tensor:
+    """Calculate the IoU loss (1-IoU) of two sets of rotated bounding boxes.
+
+    Note that predictions and targets are one-to-one corresponded.
+
+    Args:
+        pred (Tensor): Bbox predictions with shape [N, 7]
+            (x, y, z, w, l, h, alpha).
+        target (Tensor): Bbox targets (gt) with shape [N, 7]
+            (x, y, z, w, l, h, alpha).
+
+    Returns:
+        Tensor: IoU loss between predictions and targets.
+    """
+    iou_loss = 1 - diff_iou_rotated_3d(pred.unsqueeze(0),
+                                       target.unsqueeze(0))[0]
+    return iou_loss
+
+
+@MODELS.register_module()
+class RotatedIoU3DLoss(nn.Module):
+    """Calculate the IoU loss (1-IoU) of rotated bounding boxes.
+
+    Args:
+        reduction (str): Method to reduce losses.
+            The valid reduction method are 'none', 'sum' or 'mean'.
+            Defaults to 'mean'.
+        loss_weight (float): Weight of loss. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super().__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[float] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function of loss calculation.
+
+        Args:
+            pred (Tensor): Bbox predictions with shape [..., 7]
+                (x, y, z, w, l, h, alpha).
+            target (Tensor): Bbox targets (gt) with shape [..., 7]
+                (x, y, z, w, l, h, alpha).
+            weight (Tensor, optional): Weight of loss.
+                Defaults to None.
+            avg_factor (float, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): Method to reduce losses.
+                The valid reduction method are 'none', 'sum' or 'mean'.
+                Defaults to None.
+
+        Returns:
+            Tensor: IoU loss between predictions and targets.
+        """
+        if weight is not None and not torch.any(weight > 0):
+            return pred.sum() * weight.sum()  # 0
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if weight is not None and weight.dim() > 1:
+            weight = weight.mean(-1)
+        loss = self.loss_weight * rotated_iou_3d_loss(
+            pred,
+            target,
+            weight,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+
+        return loss
diff --git a/mmde/mmdet3d/models/losses/uncertain_smooth_l1_loss.py b/mmde/mmdet3d/models/losses/uncertain_smooth_l1_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cd90f386806657d25473dd53c51b27f90ff0c81
--- /dev/null
+++ b/mmde/mmdet3d/models/losses/uncertain_smooth_l1_loss.py
@@ -0,0 +1,199 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmdet.models.losses.utils import weighted_loss
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+
+
+@weighted_loss
+def uncertain_smooth_l1_loss(pred: Tensor,
+                             target: Tensor,
+                             sigma: Tensor,
+                             alpha: float = 1.0,
+                             beta: float = 1.0) -> Tensor:
+    """Smooth L1 loss with uncertainty.
+
+    Args:
+        pred (Tensor): The prediction.
+        target (Tensor): The learning target of the prediction.
+        sigma (Tensor): The sigma for uncertainty.
+        alpha (float): The coefficient of log(sigma).
+            Defaults to 1.0.
+        beta (float): The threshold in the piecewise function.
+            Defaults to 1.0.
+
+    Returns:
+        Tensor: Calculated loss
+    """
+    assert beta > 0
+    assert target.numel() > 0
+    assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \
+        f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \
+        'are inconsistent.'
+    diff = torch.abs(pred - target)
+    loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
+                       diff - 0.5 * beta)
+    loss = torch.exp(-sigma) * loss + alpha * sigma
+
+    return loss
+
+
+@weighted_loss
+def uncertain_l1_loss(pred: Tensor,
+                      target: Tensor,
+                      sigma: Tensor,
+                      alpha: float = 1.0) -> Tensor:
+    """L1 loss with uncertainty.
+
+    Args:
+        pred (Tensor): The prediction.
+        target (Tensor): The learning target of the prediction.
+        sigma (Tensor): The sigma for uncertainty.
+        alpha (float): The coefficient of log(sigma).
+            Defaults to 1.0.
+
+    Returns:
+        Tensor: Calculated loss
+    """
+    assert target.numel() > 0
+    assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \
+        f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \
+        'are inconsistent.'
+    loss = torch.abs(pred - target)
+    loss = torch.exp(-sigma) * loss + alpha * sigma
+    return loss
+
+
+@MODELS.register_module()
+class UncertainSmoothL1Loss(nn.Module):
+    r"""Smooth L1 loss with uncertainty.
+
+    Please refer to `PGD <https://arxiv.org/abs/2107.14160>`_ and
+    `Multi-Task Learning Using Uncertainty to Weigh Losses for Scene Geometry
+    and Semantics <https://arxiv.org/abs/1705.07115>`_ for more details.
+
+    Args:
+        alpha (float): The coefficient of log(sigma).
+            Defaults to 1.0.
+        beta (float): The threshold in the piecewise function.
+            Defaults to 1.0.
+        reduction (str): The method to reduce the loss.
+            Options are 'none', 'mean' and 'sum'. Defaults to 'mean'.
+        loss_weight (float): The weight of loss. Defaults to 1.0
+    """
+
+    def __init__(self,
+                 alpha: float = 1.0,
+                 beta: float = 1.0,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super(UncertainSmoothL1Loss, self).__init__()
+        assert reduction in ['none', 'sum', 'mean']
+        self.alpha = alpha
+        self.beta = beta
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                sigma: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[float] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): The prediction.
+            target (Tensor): The learning target of the prediction.
+            sigma (Tensor): The sigma for uncertainty.
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (float, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            Tensor: Calculated loss
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * uncertain_smooth_l1_loss(
+            pred,
+            target,
+            weight,
+            sigma=sigma,
+            alpha=self.alpha,
+            beta=self.beta,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss_bbox
+
+
+@MODELS.register_module()
+class UncertainL1Loss(nn.Module):
+    """L1 loss with uncertainty.
+
+    Args:
+        alpha (float): The coefficient of log(sigma).
+            Defaults to 1.0.
+        reduction (str): The method to reduce the loss.
+            Options are 'none', 'mean' and 'sum'. Defaults to 'mean'.
+        loss_weight (float): The weight of loss. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 alpha: float = 1.0,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super(UncertainL1Loss, self).__init__()
+        assert reduction in ['none', 'sum', 'mean']
+        self.alpha = alpha
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                sigma: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[float] = None,
+                reduction_override: Optional[str] = None) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): The prediction.
+            target (Tensor): The learning target of the prediction.
+            sigma (Tensor): The sigma for uncertainty.
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (float, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+
+        Returns:
+            Tensor: Calculated loss
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        loss_bbox = self.loss_weight * uncertain_l1_loss(
+            pred,
+            target,
+            weight,
+            sigma=sigma,
+            alpha=self.alpha,
+            reduction=reduction,
+            avg_factor=avg_factor)
+        return loss_bbox
diff --git a/mmde/mmdet3d/models/middle_encoders/__init__.py b/mmde/mmdet3d/models/middle_encoders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..96f5d2019dc3c6ee6d7df39c30e612c683280803
--- /dev/null
+++ b/mmde/mmdet3d/models/middle_encoders/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .pillar_scatter import PointPillarsScatter
+from .sparse_encoder import SparseEncoder, SparseEncoderSASSD
+from .sparse_unet import SparseUNet
+from .voxel_set_abstraction import VoxelSetAbstraction
+
+__all__ = [
+    'PointPillarsScatter', 'SparseEncoder', 'SparseEncoderSASSD', 'SparseUNet',
+    'VoxelSetAbstraction'
+]
diff --git a/mmde/mmdet3d/models/middle_encoders/pillar_scatter.py b/mmde/mmdet3d/models/middle_encoders/pillar_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..11287587dfdc745ce679f7ef844278a57a601edd
--- /dev/null
+++ b/mmde/mmdet3d/models/middle_encoders/pillar_scatter.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+from torch import Tensor, nn
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class PointPillarsScatter(nn.Module):
+    """Point Pillar's Scatter.
+
+    Converts learned features from dense tensor to sparse pseudo image.
+
+    Args:
+        in_channels (int): Channels of input features.
+        output_shape (list[int]): Required output shape of features.
+    """
+
+    def __init__(self, in_channels: int, output_shape: List[int]):
+        super().__init__()
+        self.output_shape = output_shape
+        self.ny = output_shape[0]
+        self.nx = output_shape[1]
+        self.in_channels = in_channels
+
+    def forward(self,
+                voxel_features: Tensor,
+                coors: Tensor,
+                batch_size: int = None) -> Tensor:
+        """Foraward function to scatter features."""
+        # TODO: rewrite the function in a batch manner
+        # no need to deal with different batch cases
+        if batch_size is not None:
+            return self.forward_batch(voxel_features, coors, batch_size)
+        else:
+            return self.forward_single(voxel_features, coors)
+
+    def forward_single(self, voxel_features: Tensor, coors: Tensor) -> Tensor:
+        """Scatter features of single sample.
+
+        Args:
+            voxel_features (torch.Tensor): Voxel features in shape (N, M, C).
+            coors (torch.Tensor): Coordinates of each voxel.
+                The first column indicates the sample ID.
+        """
+        # Create the canvas for this sample
+        canvas = torch.zeros(
+            self.in_channels,
+            self.nx * self.ny,
+            dtype=voxel_features.dtype,
+            device=voxel_features.device)
+
+        indices = coors[:, 2] * self.nx + coors[:, 3]
+        indices = indices.long()
+        voxels = voxel_features.t()
+        # Now scatter the blob back to the canvas.
+        canvas[:, indices] = voxels
+        # Undo the column stacking to final 4-dim tensor
+        canvas = canvas.view(1, self.in_channels, self.ny, self.nx)
+        return canvas
+
+    def forward_batch(self, voxel_features: Tensor, coors: Tensor,
+                      batch_size: int) -> Tensor:
+        """Scatter features of single sample.
+
+        Args:
+            voxel_features (torch.Tensor): Voxel features in shape (N, M, C).
+            coors (torch.Tensor): Coordinates of each voxel in shape (N, 4).
+                The first column indicates the sample ID.
+            batch_size (int): Number of samples in the current batch.
+        """
+        # batch_canvas will be the final output.
+        batch_canvas = []
+        for batch_itt in range(batch_size):
+            # Create the canvas for this sample
+            canvas = torch.zeros(
+                self.in_channels,
+                self.nx * self.ny,
+                dtype=voxel_features.dtype,
+                device=voxel_features.device)
+
+            # Only include non-empty pillars
+            batch_mask = coors[:, 0] == batch_itt
+            this_coors = coors[batch_mask, :]
+            indices = this_coors[:, 2] * self.nx + this_coors[:, 3]
+            indices = indices.type(torch.long)
+            voxels = voxel_features[batch_mask, :]
+            voxels = voxels.t()
+
+            # Now scatter the blob back to the canvas.
+            canvas[:, indices] = voxels
+
+            # Append to a list for later stacking.
+            batch_canvas.append(canvas)
+
+        # Stack to 3-dim tensor (batch-size, in_channels, nrows*ncols)
+        batch_canvas = torch.stack(batch_canvas, 0)
+
+        # Undo the column stacking to final 4-dim tensor
+        batch_canvas = batch_canvas.view(batch_size, self.in_channels, self.ny,
+                                         self.nx)
+
+        return batch_canvas
diff --git a/mmde/mmdet3d/models/middle_encoders/sparse_encoder.py b/mmde/mmdet3d/models/middle_encoders/sparse_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef14151429fe7f639a4876272ba6654abfc58084
--- /dev/null
+++ b/mmde/mmdet3d/models/middle_encoders/sparse_encoder.py
@@ -0,0 +1,528 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from mmcv.ops import points_in_boxes_all, three_interpolate, three_nn
+from mmdet.models.losses import sigmoid_focal_loss, smooth_l1_loss
+from mmengine.runner import amp
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models.layers import SparseBasicBlock, make_sparse_convmodule
+from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import BaseInstance3DBoxes
+
+if IS_SPCONV2_AVAILABLE:
+    from spconv.pytorch import SparseConvTensor, SparseSequential
+else:
+    from mmcv.ops import SparseConvTensor, SparseSequential
+
+TwoTupleIntType = Tuple[Tuple[int]]
+
+
+@MODELS.register_module()
+class SparseEncoder(nn.Module):
+    r"""Sparse encoder for SECOND and Part-A2.
+
+    Args:
+        in_channels (int): The number of input channels.
+        sparse_shape (list[int]): The sparse shape of input tensor.
+        order (tuple[str], optional): Order of conv module.
+            Defaults to ('conv', 'norm', 'act').
+        norm_cfg (dict, optional): Config of normalization layer. Defaults to
+            dict(type='BN1d', eps=1e-3, momentum=0.01).
+        base_channels (int, optional): Out channels for conv_input layer.
+            Defaults to 16.
+        output_channels (int, optional): Out channels for conv_out layer.
+            Defaults to 128.
+        encoder_channels (tuple[tuple[int]], optional):
+            Convolutional channels of each encode block.
+            Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)).
+        encoder_paddings (tuple[tuple[int]], optional):
+            Paddings of each encode block.
+            Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)).
+        block_type (str, optional): Type of the block to use.
+            Defaults to 'conv_module'.
+        return_middle_feats (bool): Whether output middle features.
+            Default to False.
+    """
+
+    def __init__(
+            self,
+            in_channels: int,
+            sparse_shape: List[int],
+            order: Optional[Tuple[str]] = ('conv', 'norm', 'act'),
+            norm_cfg: Optional[dict] = dict(
+                type='BN1d', eps=1e-3, momentum=0.01),
+            base_channels: Optional[int] = 16,
+            output_channels: Optional[int] = 128,
+            encoder_channels: Optional[TwoTupleIntType] = ((16, ), (32, 32,
+                                                                    32),
+                                                           (64, 64,
+                                                            64), (64, 64, 64)),
+            encoder_paddings: Optional[TwoTupleIntType] = ((1, ), (1, 1, 1),
+                                                           (1, 1, 1),
+                                                           ((0, 1, 1), 1, 1)),
+            block_type: Optional[str] = 'conv_module',
+            return_middle_feats: Optional[bool] = False):
+        super().__init__()
+        assert block_type in ['conv_module', 'basicblock']
+        self.sparse_shape = sparse_shape
+        self.in_channels = in_channels
+        self.order = order
+        self.base_channels = base_channels
+        self.output_channels = output_channels
+        self.encoder_channels = encoder_channels
+        self.encoder_paddings = encoder_paddings
+        self.stage_num = len(self.encoder_channels)
+        self.return_middle_feats = return_middle_feats
+        # Spconv init all weight on its own
+
+        assert isinstance(order, tuple) and len(order) == 3
+        assert set(order) == {'conv', 'norm', 'act'}
+
+        if self.order[0] != 'conv':  # pre activate
+            self.conv_input = make_sparse_convmodule(
+                in_channels,
+                self.base_channels,
+                3,
+                norm_cfg=norm_cfg,
+                padding=1,
+                indice_key='subm1',
+                conv_type='SubMConv3d',
+                order=('conv', ))
+        else:  # post activate
+            self.conv_input = make_sparse_convmodule(
+                in_channels,
+                self.base_channels,
+                3,
+                norm_cfg=norm_cfg,
+                padding=1,
+                indice_key='subm1',
+                conv_type='SubMConv3d')
+
+        encoder_out_channels = self.make_encoder_layers(
+            make_sparse_convmodule,
+            norm_cfg,
+            self.base_channels,
+            block_type=block_type)
+
+        self.conv_out = make_sparse_convmodule(
+            encoder_out_channels,
+            self.output_channels,
+            kernel_size=(3, 1, 1),
+            stride=(2, 1, 1),
+            norm_cfg=norm_cfg,
+            padding=0,
+            indice_key='spconv_down2',
+            conv_type='SparseConv3d')
+
+    @amp.autocast(enabled=False)
+    def forward(self, voxel_features: Tensor, coors: Tensor,
+                batch_size: int) -> Union[Tensor, Tuple[Tensor, list]]:
+        """Forward of SparseEncoder.
+
+        Args:
+            voxel_features (torch.Tensor): Voxel features in shape (N, C).
+            coors (torch.Tensor): Coordinates in shape (N, 4),
+                the columns in the order of (batch_idx, z_idx, y_idx, x_idx).
+            batch_size (int): Batch size.
+
+        Returns:
+            torch.Tensor | tuple[torch.Tensor, list]: Return spatial features
+                include:
+
+            - spatial_features (torch.Tensor): Spatial features are out from
+                the last layer.
+            - encode_features (List[SparseConvTensor], optional): Middle layer
+                output features. When self.return_middle_feats is True, the
+                module returns middle features.
+        """
+        coors = coors.int()
+        input_sp_tensor = SparseConvTensor(voxel_features, coors,
+                                           self.sparse_shape, batch_size)
+        x = self.conv_input(input_sp_tensor)
+
+        encode_features = []
+        for encoder_layer in self.encoder_layers:
+            x = encoder_layer(x)
+            encode_features.append(x)
+
+        # for detection head
+        # [200, 176, 5] -> [200, 176, 2]
+        out = self.conv_out(encode_features[-1])
+        spatial_features = out.dense()
+
+        N, C, D, H, W = spatial_features.shape
+        spatial_features = spatial_features.view(N, C * D, H, W)
+
+        if self.return_middle_feats:
+            return spatial_features, encode_features
+        else:
+            return spatial_features
+
+    def make_encoder_layers(
+        self,
+        make_block: nn.Module,
+        norm_cfg: Dict,
+        in_channels: int,
+        block_type: Optional[str] = 'conv_module',
+        conv_cfg: Optional[dict] = dict(type='SubMConv3d')
+    ) -> int:
+        """make encoder layers using sparse convs.
+
+        Args:
+            make_block (method): A bounded function to build blocks.
+            norm_cfg (dict[str]): Config of normalization layer.
+            in_channels (int): The number of encoder input channels.
+            block_type (str, optional): Type of the block to use.
+                Defaults to 'conv_module'.
+            conv_cfg (dict, optional): Config of conv layer. Defaults to
+                dict(type='SubMConv3d').
+
+        Returns:
+            int: The number of encoder output channels.
+        """
+        assert block_type in ['conv_module', 'basicblock']
+        self.encoder_layers = SparseSequential()
+
+        for i, blocks in enumerate(self.encoder_channels):
+            blocks_list = []
+            for j, out_channels in enumerate(tuple(blocks)):
+                padding = tuple(self.encoder_paddings[i])[j]
+                # each stage started with a spconv layer
+                # except the first stage
+                if i != 0 and j == 0 and block_type == 'conv_module':
+                    blocks_list.append(
+                        make_block(
+                            in_channels,
+                            out_channels,
+                            3,
+                            norm_cfg=norm_cfg,
+                            stride=2,
+                            padding=padding,
+                            indice_key=f'spconv{i + 1}',
+                            conv_type='SparseConv3d'))
+                elif block_type == 'basicblock':
+                    if j == len(blocks) - 1 and i != len(
+                            self.encoder_channels) - 1:
+                        blocks_list.append(
+                            make_block(
+                                in_channels,
+                                out_channels,
+                                3,
+                                norm_cfg=norm_cfg,
+                                stride=2,
+                                padding=padding,
+                                indice_key=f'spconv{i + 1}',
+                                conv_type='SparseConv3d'))
+                    else:
+                        blocks_list.append(
+                            SparseBasicBlock(
+                                out_channels,
+                                out_channels,
+                                norm_cfg=norm_cfg,
+                                conv_cfg=conv_cfg))
+                else:
+                    blocks_list.append(
+                        make_block(
+                            in_channels,
+                            out_channels,
+                            3,
+                            norm_cfg=norm_cfg,
+                            padding=padding,
+                            indice_key=f'subm{i + 1}',
+                            conv_type='SubMConv3d'))
+                in_channels = out_channels
+            stage_name = f'encoder_layer{i + 1}'
+            stage_layers = SparseSequential(*blocks_list)
+            self.encoder_layers.add_module(stage_name, stage_layers)
+        return out_channels
+
+
+@MODELS.register_module()
+class SparseEncoderSASSD(SparseEncoder):
+    r"""Sparse encoder for `SASSD <https://github.com/skyhehe123/SA-SSD>`_
+
+    Args:
+        in_channels (int): The number of input channels.
+        sparse_shape (list[int]): The sparse shape of input tensor.
+        order (list[str], optional): Order of conv module.
+            Defaults to ('conv', 'norm', 'act').
+        norm_cfg (dict, optional): Config of normalization layer. Defaults to
+            dict(type='BN1d', eps=1e-3, momentum=0.01).
+        base_channels (int, optional): Out channels for conv_input layer.
+            Defaults to 16.
+        output_channels (int, optional): Out channels for conv_out layer.
+            Defaults to 128.
+        encoder_channels (tuple[tuple[int]], optional):
+            Convolutional channels of each encode block.
+            Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)).
+        encoder_paddings (tuple[tuple[int]], optional):
+            Paddings of each encode block.
+            Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)).
+        block_type (str, optional): Type of the block to use.
+            Defaults to 'conv_module'.
+    """
+
+    def __init__(
+            self,
+            in_channels: int,
+            sparse_shape: List[int],
+            order: Tuple[str] = ('conv', 'norm', 'act'),
+            norm_cfg: dict = dict(type='BN1d', eps=1e-3, momentum=0.01),
+            base_channels: int = 16,
+            output_channels: int = 128,
+            encoder_channels: Optional[TwoTupleIntType] = ((16, ), (32, 32,
+                                                                    32),
+                                                           (64, 64,
+                                                            64), (64, 64, 64)),
+            encoder_paddings: Optional[TwoTupleIntType] = ((1, ), (1, 1, 1),
+                                                           (1, 1, 1),
+                                                           ((0, 1, 1), 1, 1)),
+            block_type: str = 'conv_module'):
+        super(SparseEncoderSASSD, self).__init__(
+            in_channels=in_channels,
+            sparse_shape=sparse_shape,
+            order=order,
+            norm_cfg=norm_cfg,
+            base_channels=base_channels,
+            output_channels=output_channels,
+            encoder_channels=encoder_channels,
+            encoder_paddings=encoder_paddings,
+            block_type=block_type)
+
+        self.point_fc = nn.Linear(112, 64, bias=False)
+        self.point_cls = nn.Linear(64, 1, bias=False)
+        self.point_reg = nn.Linear(64, 3, bias=False)
+
+    def forward(self,
+                voxel_features: Tensor,
+                coors: Tensor,
+                batch_size: Tensor,
+                test_mode: bool = False) -> Tuple[Tensor, tuple]:
+        """Forward of SparseEncoder.
+
+        Args:
+            voxel_features (torch.Tensor): Voxel features in shape (N, C).
+            coors (torch.Tensor): Coordinates in shape (N, 4),
+                the columns in the order of (batch_idx, z_idx, y_idx, x_idx).
+            batch_size (int): Batch size.
+            test_mode (bool, optional): Whether in test mode.
+                Defaults to False.
+
+        Returns:
+            Tensor: Backbone features.
+            tuple[torch.Tensor]: Mean feature value of the points,
+                Classification result of the points,
+                Regression offsets of the points.
+        """
+        coors = coors.int()
+        input_sp_tensor = SparseConvTensor(voxel_features, coors,
+                                           self.sparse_shape, batch_size)
+        x = self.conv_input(input_sp_tensor)
+
+        encode_features = []
+        for encoder_layer in self.encoder_layers:
+            x = encoder_layer(x)
+            encode_features.append(x)
+
+        # for detection head
+        # [200, 176, 5] -> [200, 176, 2]
+        out = self.conv_out(encode_features[-1])
+        spatial_features = out.dense()
+
+        N, C, D, H, W = spatial_features.shape
+        spatial_features = spatial_features.view(N, C * D, H, W)
+
+        if test_mode:
+            return spatial_features, None
+
+        points_mean = torch.zeros_like(voxel_features)
+        points_mean[:, 0] = coors[:, 0]
+        points_mean[:, 1:] = voxel_features[:, :3]
+
+        # auxiliary network
+        p0 = self.make_auxiliary_points(
+            encode_features[0],
+            points_mean,
+            offset=(0, -40., -3.),
+            voxel_size=(.1, .1, .2))
+
+        p1 = self.make_auxiliary_points(
+            encode_features[1],
+            points_mean,
+            offset=(0, -40., -3.),
+            voxel_size=(.2, .2, .4))
+
+        p2 = self.make_auxiliary_points(
+            encode_features[2],
+            points_mean,
+            offset=(0, -40., -3.),
+            voxel_size=(.4, .4, .8))
+
+        pointwise = torch.cat([p0, p1, p2], dim=-1)
+        pointwise = self.point_fc(pointwise)
+        point_cls = self.point_cls(pointwise)
+        point_reg = self.point_reg(pointwise)
+        point_misc = (points_mean, point_cls, point_reg)
+
+        return spatial_features, point_misc
+
+    def get_auxiliary_targets(self,
+                              points_feats: Tensor,
+                              gt_bboxes_3d: List[BaseInstance3DBoxes],
+                              enlarge: float = 1.0) -> Tuple[Tensor, Tensor]:
+        """Get auxiliary target.
+
+        Args:
+            points_feats (torch.Tensor): Mean features of the points.
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]):  Ground truth
+                boxes for each sample.
+            enlarge (float, optional): Enlaged scale. Defaults to 1.0.
+
+        Returns:
+            tuple[torch.Tensor]: Label of the points and
+                center offsets of the points.
+        """
+        center_offsets = list()
+        pts_labels = list()
+        for i in range(len(gt_bboxes_3d)):
+            boxes3d = gt_bboxes_3d[i].tensor.detach().clone()
+            idx = torch.nonzero(points_feats[:, 0] == i).view(-1)
+            point_xyz = points_feats[idx, 1:].detach().clone()
+
+            boxes3d[:, 3:6] *= enlarge
+
+            pts_in_flag, center_offset = self.calculate_pts_offsets(
+                point_xyz, boxes3d)
+            pts_label = pts_in_flag.max(0)[0].byte()
+            pts_labels.append(pts_label)
+            center_offsets.append(center_offset)
+
+        center_offsets = torch.cat(center_offsets)
+        pts_labels = torch.cat(pts_labels).to(center_offsets.device)
+
+        return pts_labels, center_offsets
+
+    def calculate_pts_offsets(self, points: Tensor,
+                              bboxes_3d: Tensor) -> Tuple[Tensor, Tensor]:
+        """Find all boxes in which each point is, as well as the offsets from
+        the box centers.
+
+        Args:
+            points (torch.Tensor): [M, 3], [x, y, z] in LiDAR coordinate
+            bboxes_3d (torch.Tensor): [T, 7],
+                num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
+                (x, y, z) is the bottom center.
+
+        Returns:
+            tuple[torch.Tensor]: Point indices of boxes with the shape of
+                (T, M). Default background = 0.
+                And offsets from the box centers of points,
+                if it belows to the box, with the shape of (M, 3).
+                Default background = 0.
+        """
+        boxes_num = len(bboxes_3d)
+        pts_num = len(points)
+
+        box_indices = points_in_boxes_all(points[None, ...], bboxes_3d[None,
+                                                                       ...])
+        pts_indices = box_indices.squeeze(0).transpose(0, 1)
+        center_offsets = torch.zeros_like(points).to(points.device)
+
+        for i in range(boxes_num):
+            for j in range(pts_num):
+                if pts_indices[i][j] == 1:
+                    center_offsets[j][0] = points[j][0] - bboxes_3d[i][0]
+                    center_offsets[j][1] = points[j][1] - bboxes_3d[i][1]
+                    center_offsets[j][2] = (
+                        points[j][2] -
+                        (bboxes_3d[i][2] + bboxes_3d[i][2] / 2.0))
+        return pts_indices, center_offsets
+
+    def aux_loss(self, points: Tensor, point_cls: Tensor, point_reg: Tensor,
+                 gt_bboxes_3d: Tensor) -> dict:
+        """Calculate auxiliary loss.
+
+        Args:
+            points (torch.Tensor): Mean feature value of the points.
+            point_cls (torch.Tensor): Classification result of the points.
+            point_reg (torch.Tensor): Regression offsets of the points.
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
+                boxes for each sample.
+
+        Returns:
+            dict: Auxiliary loss.
+        """
+        num_boxes = len(gt_bboxes_3d)
+        pts_labels, center_targets = self.get_auxiliary_targets(
+            points, gt_bboxes_3d)
+
+        rpn_cls_target = pts_labels.long()
+        pos = (pts_labels > 0).float()
+        neg = (pts_labels == 0).float()
+
+        pos_normalizer = pos.sum().clamp(min=1.0)
+
+        cls_weights = pos + neg
+        reg_weights = pos
+        reg_weights = reg_weights / pos_normalizer
+
+        aux_loss_cls = sigmoid_focal_loss(
+            point_cls,
+            rpn_cls_target,
+            weight=cls_weights,
+            avg_factor=pos_normalizer)
+
+        aux_loss_cls /= num_boxes
+
+        weight = reg_weights[..., None]
+        aux_loss_reg = smooth_l1_loss(point_reg, center_targets, beta=1 / 9.)
+        aux_loss_reg = torch.sum(aux_loss_reg * weight)[None]
+        aux_loss_reg /= num_boxes
+
+        aux_loss_cls, aux_loss_reg = [aux_loss_cls], [aux_loss_reg]
+
+        return dict(aux_loss_cls=aux_loss_cls, aux_loss_reg=aux_loss_reg)
+
+    def make_auxiliary_points(
+        self,
+        source_tensor: Tensor,
+        target: Tensor,
+        offset: Tuple = (0., -40., -3.),
+        voxel_size: Tuple = (.05, .05, .1)
+    ) -> Tensor:
+        """Make auxiliary points for loss computation.
+
+        Args:
+            source_tensor (torch.Tensor): (M, C) features to be propigated.
+            target (torch.Tensor): (N, 4) bxyz positions of the
+                target features.
+            offset (tuple[float], optional): Voxelization offset.
+                Defaults to (0., -40., -3.)
+            voxel_size (tuple[float], optional): Voxelization size.
+                Defaults to (.05, .05, .1)
+
+        Returns:
+            torch.Tensor: (N, C) tensor of the features of the target features.
+        """
+        # Tansfer tensor to points
+        source = source_tensor.indices.float()
+        offset = torch.Tensor(offset).to(source.device)
+        voxel_size = torch.Tensor(voxel_size).to(source.device)
+        source[:, 1:] = (
+            source[:, [3, 2, 1]] * voxel_size + offset + .5 * voxel_size)
+
+        source_feats = source_tensor.features[None, ...].transpose(1, 2)
+
+        # Interplate auxiliary points
+        dist, idx = three_nn(target[None, ...], source[None, ...])
+        dist_recip = 1.0 / (dist + 1e-8)
+        norm = torch.sum(dist_recip, dim=2, keepdim=True)
+        weight = dist_recip / norm
+        new_features = three_interpolate(source_feats.contiguous(), idx,
+                                         weight)
+
+        return new_features.squeeze(0).transpose(0, 1)
diff --git a/mmde/mmdet3d/models/middle_encoders/sparse_unet.py b/mmde/mmdet3d/models/middle_encoders/sparse_unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8e68aeadc871dcd469dee637f6b8660bc4285b6
--- /dev/null
+++ b/mmde/mmdet3d/models/middle_encoders/sparse_unet.py
@@ -0,0 +1,316 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import Tensor, nn
+
+from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+
+if IS_SPCONV2_AVAILABLE:
+    from spconv.pytorch import SparseConvTensor, SparseSequential
+else:
+    from mmcv.ops import SparseConvTensor, SparseSequential
+
+from mmengine.model import BaseModule
+
+from mmdet3d.models.layers import SparseBasicBlock, make_sparse_convmodule
+from mmdet3d.models.layers.sparse_block import replace_feature
+from mmdet3d.registry import MODELS
+
+TwoTupleIntType = Tuple[Tuple[int]]
+
+
+@MODELS.register_module()
+class SparseUNet(BaseModule):
+    r"""SparseUNet for PartA^2.
+
+    See the `paper <https://arxiv.org/abs/1907.03670>`_ for more details.
+
+    Args:
+        in_channels (int): The number of input channels.
+        sparse_shape (list[int]): The sparse shape of input tensor.
+        norm_cfg (dict): Config of normalization layer.
+        base_channels (int): Out channels for conv_input layer.
+        output_channels (int): Out channels for conv_out layer.
+        encoder_channels (tuple[tuple[int]]):
+            Convolutional channels of each encode block.
+        encoder_paddings (tuple[tuple[int]]): Paddings of each encode block.
+        decoder_channels (tuple[tuple[int]]):
+            Convolutional channels of each decode block.
+        decoder_paddings (tuple[tuple[int]]): Paddings of each decode block.
+    """
+
+    def __init__(
+            self,
+            in_channels: int,
+            sparse_shape: List[int],
+            order: Tuple[str] = ('conv', 'norm', 'act'),
+            norm_cfg: dict = dict(type='BN1d', eps=1e-3, momentum=0.01),
+            base_channels: int = 16,
+            output_channels: int = 128,
+            encoder_channels: Optional[TwoTupleIntType] = ((16, ), (32, 32,
+                                                                    32),
+                                                           (64, 64,
+                                                            64), (64, 64, 64)),
+            encoder_paddings: Optional[TwoTupleIntType] = ((1, ), (1, 1, 1),
+                                                           (1, 1, 1),
+                                                           ((0, 1, 1), 1, 1)),
+            decoder_channels: Optional[TwoTupleIntType] = ((64, 64,
+                                                            64), (64, 64, 32),
+                                                           (32, 32,
+                                                            16), (16, 16, 16)),
+            decoder_paddings: Optional[TwoTupleIntType] = ((1, 0), (1, 0),
+                                                           (0, 0), (0, 1)),
+            init_cfg: bool = None):
+        super().__init__(init_cfg=init_cfg)
+        self.sparse_shape = sparse_shape
+        self.in_channels = in_channels
+        self.order = order
+        self.base_channels = base_channels
+        self.output_channels = output_channels
+        self.encoder_channels = encoder_channels
+        self.encoder_paddings = encoder_paddings
+        self.decoder_channels = decoder_channels
+        self.decoder_paddings = decoder_paddings
+        self.stage_num = len(self.encoder_channels)
+        # Spconv init all weight on its own
+
+        assert isinstance(order, tuple) and len(order) == 3
+        assert set(order) == {'conv', 'norm', 'act'}
+
+        if self.order[0] != 'conv':  # pre activate
+            self.conv_input = make_sparse_convmodule(
+                in_channels,
+                self.base_channels,
+                3,
+                norm_cfg=norm_cfg,
+                padding=1,
+                indice_key='subm1',
+                conv_type='SubMConv3d',
+                order=('conv', ))
+        else:  # post activate
+            self.conv_input = make_sparse_convmodule(
+                in_channels,
+                self.base_channels,
+                3,
+                norm_cfg=norm_cfg,
+                padding=1,
+                indice_key='subm1',
+                conv_type='SubMConv3d')
+
+        encoder_out_channels = self.make_encoder_layers(
+            make_sparse_convmodule, norm_cfg, self.base_channels)
+        self.make_decoder_layers(make_sparse_convmodule, norm_cfg,
+                                 encoder_out_channels)
+
+        self.conv_out = make_sparse_convmodule(
+            encoder_out_channels,
+            self.output_channels,
+            kernel_size=(3, 1, 1),
+            stride=(2, 1, 1),
+            norm_cfg=norm_cfg,
+            padding=0,
+            indice_key='spconv_down2',
+            conv_type='SparseConv3d')
+
+    def forward(self, voxel_features: Tensor, coors: Tensor,
+                batch_size: int) -> Dict[str, Tensor]:
+        """Forward of SparseUNet.
+
+        Args:
+            voxel_features (torch.float32): Voxel features in shape [N, C].
+            coors (torch.int32): Coordinates in shape [N, 4],
+                the columns in the order of (batch_idx, z_idx, y_idx, x_idx).
+            batch_size (int): Batch size.
+
+        Returns:
+            dict[str, torch.Tensor]: Backbone features.
+        """
+        coors = coors.int()
+        input_sp_tensor = SparseConvTensor(voxel_features, coors,
+                                           self.sparse_shape, batch_size)
+        x = self.conv_input(input_sp_tensor)
+
+        encode_features = []
+        for encoder_layer in self.encoder_layers:
+            x = encoder_layer(x)
+            encode_features.append(x)
+
+        # for detection head
+        # [200, 176, 5] -> [200, 176, 2]
+        out = self.conv_out(encode_features[-1])
+        spatial_features = out.dense()
+
+        N, C, D, H, W = spatial_features.shape
+        spatial_features = spatial_features.view(N, C * D, H, W)
+
+        # for segmentation head, with output shape:
+        # [400, 352, 11] <- [200, 176, 5]
+        # [800, 704, 21] <- [400, 352, 11]
+        # [1600, 1408, 41] <- [800, 704, 21]
+        # [1600, 1408, 41] <- [1600, 1408, 41]
+        decode_features = []
+        x = encode_features[-1]
+        for i in range(self.stage_num, 0, -1):
+            x = self.decoder_layer_forward(encode_features[i - 1], x,
+                                           getattr(self, f'lateral_layer{i}'),
+                                           getattr(self, f'merge_layer{i}'),
+                                           getattr(self, f'upsample_layer{i}'))
+            decode_features.append(x)
+
+        seg_features = decode_features[-1].features
+
+        ret = dict(
+            spatial_features=spatial_features, seg_features=seg_features)
+
+        return ret
+
+    def decoder_layer_forward(
+            self, x_lateral: SparseConvTensor, x_bottom: SparseConvTensor,
+            lateral_layer: SparseBasicBlock, merge_layer: SparseSequential,
+            upsample_layer: SparseSequential) -> SparseConvTensor:
+        """Forward of upsample and residual block.
+
+        Args:
+            x_lateral (:obj:`SparseConvTensor`): Lateral tensor.
+            x_bottom (:obj:`SparseConvTensor`): Feature from bottom layer.
+            lateral_layer (SparseBasicBlock): Convolution for lateral tensor.
+            merge_layer (SparseSequential): Convolution for merging features.
+            upsample_layer (SparseSequential): Convolution for upsampling.
+
+        Returns:
+            :obj:`SparseConvTensor`: Upsampled feature.
+        """
+        x = lateral_layer(x_lateral)
+        x = replace_feature(x, torch.cat((x_bottom.features, x.features),
+                                         dim=1))
+        x_merge = merge_layer(x)
+        x = self.reduce_channel(x, x_merge.features.shape[1])
+        x = replace_feature(x, x_merge.features + x.features)
+        x = upsample_layer(x)
+        return x
+
+    @staticmethod
+    def reduce_channel(x: SparseConvTensor,
+                       out_channels: int) -> SparseConvTensor:
+        """reduce channel for element-wise addition.
+
+        Args:
+            x (:obj:`SparseConvTensor`): Sparse tensor, ``x.features``
+                are in shape (N, C1).
+            out_channels (int): The number of channel after reduction.
+
+        Returns:
+            :obj:`SparseConvTensor`: Channel reduced feature.
+        """
+        features = x.features
+        n, in_channels = features.shape
+        assert (in_channels % out_channels
+                == 0) and (in_channels >= out_channels)
+        x = replace_feature(x, features.view(n, out_channels, -1).sum(dim=2))
+        return x
+
+    def make_encoder_layers(self, make_block: nn.Module, norm_cfg: dict,
+                            in_channels: int) -> int:
+        """make encoder layers using sparse convs.
+
+        Args:
+            make_block (method): A bounded function to build blocks.
+            norm_cfg (dict[str]): Config of normalization layer.
+            in_channels (int): The number of encoder input channels.
+
+        Returns:
+            int: The number of encoder output channels.
+        """
+        self.encoder_layers = SparseSequential()
+
+        for i, blocks in enumerate(self.encoder_channels):
+            blocks_list = []
+            for j, out_channels in enumerate(tuple(blocks)):
+                padding = tuple(self.encoder_paddings[i])[j]
+                # each stage started with a spconv layer
+                # except the first stage
+                if i != 0 and j == 0:
+                    blocks_list.append(
+                        make_block(
+                            in_channels,
+                            out_channels,
+                            3,
+                            norm_cfg=norm_cfg,
+                            stride=2,
+                            padding=padding,
+                            indice_key=f'spconv{i + 1}',
+                            conv_type='SparseConv3d'))
+                else:
+                    blocks_list.append(
+                        make_block(
+                            in_channels,
+                            out_channels,
+                            3,
+                            norm_cfg=norm_cfg,
+                            padding=padding,
+                            indice_key=f'subm{i + 1}',
+                            conv_type='SubMConv3d'))
+                in_channels = out_channels
+            stage_name = f'encoder_layer{i + 1}'
+            stage_layers = SparseSequential(*blocks_list)
+            self.encoder_layers.add_module(stage_name, stage_layers)
+        return out_channels
+
+    def make_decoder_layers(self, make_block: nn.Module, norm_cfg: dict,
+                            in_channels: int) -> int:
+        """make decoder layers using sparse convs.
+
+        Args:
+            make_block (method): A bounded function to build blocks.
+            norm_cfg (dict[str]): Config of normalization layer.
+            in_channels (int): The number of encoder input channels.
+
+        Returns:
+            int: The number of encoder output channels.
+        """
+        block_num = len(self.decoder_channels)
+        for i, block_channels in enumerate(self.decoder_channels):
+            paddings = self.decoder_paddings[i]
+            setattr(
+                self, f'lateral_layer{block_num - i}',
+                SparseBasicBlock(
+                    in_channels,
+                    block_channels[0],
+                    conv_cfg=dict(
+                        type='SubMConv3d', indice_key=f'subm{block_num - i}'),
+                    norm_cfg=norm_cfg))
+            setattr(
+                self, f'merge_layer{block_num - i}',
+                make_block(
+                    in_channels * 2,
+                    block_channels[1],
+                    3,
+                    norm_cfg=norm_cfg,
+                    padding=paddings[0],
+                    indice_key=f'subm{block_num - i}',
+                    conv_type='SubMConv3d'))
+            if block_num - i != 1:
+                setattr(
+                    self, f'upsample_layer{block_num - i}',
+                    make_block(
+                        in_channels,
+                        block_channels[2],
+                        3,
+                        norm_cfg=norm_cfg,
+                        indice_key=f'spconv{block_num - i}',
+                        conv_type='SparseInverseConv3d'))
+            else:
+                # use submanifold conv instead of inverse conv
+                # in the last block
+                setattr(
+                    self, f'upsample_layer{block_num - i}',
+                    make_block(
+                        in_channels,
+                        block_channels[2],
+                        3,
+                        norm_cfg=norm_cfg,
+                        padding=paddings[1],
+                        indice_key='subm1',
+                        conv_type='SubMConv3d'))
+            in_channels = block_channels[2]
diff --git a/mmde/mmdet3d/models/middle_encoders/voxel_set_abstraction.py b/mmde/mmdet3d/models/middle_encoders/voxel_set_abstraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..366ee4de8a29575b9375349258f8b4424f2a3d7a
--- /dev/null
+++ b/mmde/mmdet3d/models/middle_encoders/voxel_set_abstraction.py
@@ -0,0 +1,335 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+import mmengine
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops.furthest_point_sample import furthest_point_sample
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import InstanceList
+
+
+def bilinear_interpolate_torch(inputs: Tensor, x: Tensor, y: Tensor) -> Tensor:
+    """Bilinear interpolate for inputs."""
+    x0 = torch.floor(x).long()
+    x1 = x0 + 1
+
+    y0 = torch.floor(y).long()
+    y1 = y0 + 1
+
+    x0 = torch.clamp(x0, 0, inputs.shape[1] - 1)
+    x1 = torch.clamp(x1, 0, inputs.shape[1] - 1)
+    y0 = torch.clamp(y0, 0, inputs.shape[0] - 1)
+    y1 = torch.clamp(y1, 0, inputs.shape[0] - 1)
+
+    Ia = inputs[y0, x0]
+    Ib = inputs[y1, x0]
+    Ic = inputs[y0, x1]
+    Id = inputs[y1, x1]
+
+    wa = (x1.type_as(x) - x) * (y1.type_as(y) - y)
+    wb = (x1.type_as(x) - x) * (y - y0.type_as(y))
+    wc = (x - x0.type_as(x)) * (y1.type_as(y) - y)
+    wd = (x - x0.type_as(x)) * (y - y0.type_as(y))
+    ans = torch.t((torch.t(Ia) * wa)) + torch.t(torch.t(Ib) * wb) + torch.t(
+        torch.t(Ic) * wc) + torch.t(torch.t(Id) * wd)
+    return ans
+
+
+@MODELS.register_module()
+class VoxelSetAbstraction(BaseModule):
+    """Voxel set abstraction module for PVRCNN and PVRCNN++.
+
+    Args:
+        num_keypoints (int): The number of key points sampled from
+            raw points cloud.
+        fused_out_channel (int): Key points feature output channels
+            num after fused. Default to 128.
+        voxel_size (list[float]): Size of voxels. Defaults to
+            [0.05, 0.05, 0.1].
+        point_cloud_range (list[float]): Point cloud range. Defaults to
+            [0, -40, -3, 70.4, 40, 1].
+        voxel_sa_cfgs_list (List[dict or ConfigDict], optional): List of SA
+            module cfg. Used to gather key points features from multi-wise
+            voxel features. Default to None.
+        rawpoints_sa_cfgs (dict or ConfigDict, optional): SA module cfg.
+            Used to gather key points features from raw points. Default to
+            None.
+        bev_feat_channel (int): Bev features channels num.
+            Default to 256.
+        bev_scale_factor (int): Bev features scale factor. Default to 8.
+        voxel_center_as_source (bool): Whether used voxel centers as points
+            cloud key points. Defaults to False.
+        norm_cfg (dict[str]): Config of normalization layer. Default
+            used dict(type='BN1d', eps=1e-5, momentum=0.1).
+        bias (bool | str, optional): If specified as `auto`, it will be
+            decided by `norm_cfg`. `bias` will be set as True if
+            `norm_cfg` is None, otherwise False. Default: 'auto'.
+    """
+
+    def __init__(self,
+                 num_keypoints: int,
+                 fused_out_channel: int = 128,
+                 voxel_size: list = [0.05, 0.05, 0.1],
+                 point_cloud_range: list = [0, -40, -3, 70.4, 40, 1],
+                 voxel_sa_cfgs_list: Optional[list] = None,
+                 rawpoints_sa_cfgs: Optional[dict] = None,
+                 bev_feat_channel: int = 256,
+                 bev_scale_factor: int = 8,
+                 voxel_center_as_source: bool = False,
+                 norm_cfg: dict = dict(type='BN2d', eps=1e-5, momentum=0.1),
+                 bias: str = 'auto') -> None:
+        super().__init__()
+        self.num_keypoints = num_keypoints
+        self.fused_out_channel = fused_out_channel
+        self.voxel_size = voxel_size
+        self.point_cloud_range = point_cloud_range
+        self.voxel_center_as_source = voxel_center_as_source
+
+        gathered_channel = 0
+
+        if rawpoints_sa_cfgs is not None:
+            self.rawpoints_sa_layer = MODELS.build(rawpoints_sa_cfgs)
+            gathered_channel += sum(
+                [x[-1] for x in rawpoints_sa_cfgs.mlp_channels])
+        else:
+            self.rawpoints_sa_layer = None
+
+        if voxel_sa_cfgs_list is not None:
+            self.voxel_sa_configs_list = voxel_sa_cfgs_list
+            self.voxel_sa_layers = nn.ModuleList()
+            for voxel_sa_config in voxel_sa_cfgs_list:
+                cur_layer = MODELS.build(voxel_sa_config)
+                self.voxel_sa_layers.append(cur_layer)
+                gathered_channel += sum(
+                    [x[-1] for x in voxel_sa_config.mlp_channels])
+        else:
+            self.voxel_sa_layers = None
+
+        if bev_feat_channel is not None and bev_scale_factor is not None:
+            self.bev_cfg = mmengine.Config(
+                dict(
+                    bev_feat_channels=bev_feat_channel,
+                    bev_scale_factor=bev_scale_factor))
+            gathered_channel += bev_feat_channel
+        else:
+            self.bev_cfg = None
+        self.point_feature_fusion_layer = nn.Sequential(
+            ConvModule(
+                gathered_channel,
+                fused_out_channel,
+                kernel_size=(1, 1),
+                stride=(1, 1),
+                conv_cfg=dict(type='Conv2d'),
+                norm_cfg=norm_cfg,
+                bias=bias))
+
+    def interpolate_from_bev_features(self, keypoints: torch.Tensor,
+                                      bev_features: torch.Tensor,
+                                      batch_size: int,
+                                      bev_scale_factor: int) -> torch.Tensor:
+        """Gather key points features from bev feature map by interpolate.
+
+        Args:
+            keypoints (torch.Tensor): Sampled key points with shape
+                (N1 + N2 + ..., NDim).
+            bev_features (torch.Tensor): Bev feature map from the first
+                stage with shape (B, C, H, W).
+            batch_size (int): Input batch size.
+            bev_scale_factor (int): Bev feature map scale factor.
+
+        Returns:
+            torch.Tensor: Key points features gather from bev feature
+                map with shape (N1 + N2 + ..., C)
+        """
+        x_idxs = (keypoints[..., 0] -
+                  self.point_cloud_range[0]) / self.voxel_size[0]
+        y_idxs = (keypoints[..., 1] -
+                  self.point_cloud_range[1]) / self.voxel_size[1]
+
+        x_idxs = x_idxs / bev_scale_factor
+        y_idxs = y_idxs / bev_scale_factor
+
+        point_bev_features_list = []
+        for k in range(batch_size):
+            cur_x_idxs = x_idxs[k, ...]
+            cur_y_idxs = y_idxs[k, ...]
+            cur_bev_features = bev_features[k].permute(1, 2, 0)  # (H, W, C)
+            point_bev_features = bilinear_interpolate_torch(
+                cur_bev_features, cur_x_idxs, cur_y_idxs)
+            point_bev_features_list.append(point_bev_features)
+
+        point_bev_features = torch.cat(
+            point_bev_features_list, dim=0)  # (N1 + N2 + ..., C)
+        return point_bev_features.view(batch_size, keypoints.shape[1], -1)
+
+    def get_voxel_centers(self, coors: torch.Tensor,
+                          scale_factor: float) -> torch.Tensor:
+        """Get voxel centers coordinate.
+
+        Args:
+            coors (torch.Tensor): Coordinates of voxels shape is Nx(1+NDim),
+                where 1 represents the batch index.
+            scale_factor (float): Scale factor.
+
+        Returns:
+            torch.Tensor: Voxel centers coordinate with shape (N, 3).
+        """
+        assert coors.shape[1] == 4
+        voxel_centers = coors[:, [3, 2, 1]].float()  # (xyz)
+        voxel_size = torch.tensor(
+            self.voxel_size,
+            device=voxel_centers.device).float() * scale_factor
+        pc_range = torch.tensor(
+            self.point_cloud_range[0:3], device=voxel_centers.device).float()
+        voxel_centers = (voxel_centers + 0.5) * voxel_size + pc_range
+        return voxel_centers
+
+    def sample_key_points(self, points: List[torch.Tensor],
+                          coors: torch.Tensor) -> torch.Tensor:
+        """Sample key points from raw points cloud.
+
+        Args:
+            points (List[torch.Tensor]): Point cloud of each sample.
+            coors (torch.Tensor): Coordinates of voxels shape is Nx(1+NDim),
+                where 1 represents the batch index.
+
+        Returns:
+            torch.Tensor: (B, M, 3) Key points of each sample.
+                M is num_keypoints.
+        """
+        assert points is not None or coors is not None
+        if self.voxel_center_as_source:
+            _src_points = self.get_voxel_centers(coors=coors, scale_factor=1)
+            batch_size = coors[-1, 0].item() + 1
+            src_points = [
+                _src_points[coors[:, 0] == b] for b in range(batch_size)
+            ]
+        else:
+            src_points = [p[..., :3] for p in points]
+
+        keypoints_list = []
+        for points_to_sample in src_points:
+            num_points = points_to_sample.shape[0]
+            cur_pt_idxs = furthest_point_sample(
+                points_to_sample.unsqueeze(dim=0).contiguous(),
+                self.num_keypoints).long()[0]
+
+            if num_points < self.num_keypoints:
+                times = int(self.num_keypoints / num_points) + 1
+                non_empty = cur_pt_idxs[:num_points]
+                cur_pt_idxs = non_empty.repeat(times)[:self.num_keypoints]
+
+            keypoints = points_to_sample[cur_pt_idxs]
+
+            keypoints_list.append(keypoints)
+        keypoints = torch.stack(keypoints_list, dim=0)  # (B, M, 3)
+        return keypoints
+
+    def forward(self, batch_inputs_dict: dict, feats_dict: dict,
+                rpn_results_list: InstanceList) -> dict:
+        """Extract point-wise features from multi-input.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points', 'voxels' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - voxels (dict[torch.Tensor]): Voxels of the batch sample.
+            feats_dict (dict): Contains features from the first
+                stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+
+        Returns:
+            dict: Contain Point-wise features, include:
+                - keypoints (torch.Tensor): Sampled key points.
+                - keypoint_features (torch.Tensor): Gathered key points
+                    features from multi input.
+                - fusion_keypoint_features (torch.Tensor): Fusion
+                    keypoint_features by point_feature_fusion_layer.
+        """
+        points = batch_inputs_dict['points']
+        voxel_encode_features = feats_dict['multi_scale_3d_feats']
+        bev_encode_features = feats_dict['spatial_feats']
+        if self.voxel_center_as_source:
+            voxels_coors = batch_inputs_dict['voxels']['coors']
+        else:
+            voxels_coors = None
+        keypoints = self.sample_key_points(points, voxels_coors)
+
+        point_features_list = []
+        batch_size = len(points)
+
+        if self.bev_cfg is not None:
+            point_bev_features = self.interpolate_from_bev_features(
+                keypoints, bev_encode_features, batch_size,
+                self.bev_cfg.bev_scale_factor)
+            point_features_list.append(point_bev_features.contiguous())
+
+        batch_size, num_keypoints, _ = keypoints.shape
+        key_xyz = keypoints.view(-1, 3)
+        key_xyz_batch_cnt = key_xyz.new_zeros(batch_size).int().fill_(
+            num_keypoints)
+
+        if self.rawpoints_sa_layer is not None:
+            batch_points = torch.cat(points, dim=0)
+            batch_cnt = [len(p) for p in points]
+            xyz = batch_points[:, :3].contiguous()
+            features = None
+            if batch_points.size(1) > 0:
+                features = batch_points[:, 3:].contiguous()
+            xyz_batch_cnt = xyz.new_tensor(batch_cnt, dtype=torch.int32)
+
+            pooled_points, pooled_features = self.rawpoints_sa_layer(
+                xyz=xyz.contiguous(),
+                xyz_batch_cnt=xyz_batch_cnt,
+                new_xyz=key_xyz.contiguous(),
+                new_xyz_batch_cnt=key_xyz_batch_cnt,
+                features=features.contiguous(),
+            )
+
+            point_features_list.append(pooled_features.contiguous().view(
+                batch_size, num_keypoints, -1))
+        if self.voxel_sa_layers is not None:
+            for k, voxel_sa_layer in enumerate(self.voxel_sa_layers):
+                cur_coords = voxel_encode_features[k].indices
+                xyz = self.get_voxel_centers(
+                    coors=cur_coords,
+                    scale_factor=self.voxel_sa_configs_list[k].scale_factor
+                ).contiguous()
+                xyz_batch_cnt = xyz.new_zeros(batch_size).int()
+                for bs_idx in range(batch_size):
+                    xyz_batch_cnt[bs_idx] = (cur_coords[:, 0] == bs_idx).sum()
+
+                pooled_points, pooled_features = voxel_sa_layer(
+                    xyz=xyz.contiguous(),
+                    xyz_batch_cnt=xyz_batch_cnt,
+                    new_xyz=key_xyz.contiguous(),
+                    new_xyz_batch_cnt=key_xyz_batch_cnt,
+                    features=voxel_encode_features[k].features.contiguous(),
+                )
+                point_features_list.append(pooled_features.contiguous().view(
+                    batch_size, num_keypoints, -1))
+
+        point_features = torch.cat(
+            point_features_list, dim=-1).view(batch_size * num_keypoints, -1,
+                                              1)
+
+        fusion_point_features = self.point_feature_fusion_layer(
+            point_features.unsqueeze(dim=-1)).squeeze(dim=-1)
+
+        batch_idxs = torch.arange(
+            batch_size * num_keypoints, device=keypoints.device
+        ) // num_keypoints  # batch indexes of each key points
+        batch_keypoints_xyz = torch.cat(
+            (batch_idxs.to(key_xyz.dtype).unsqueeze(dim=-1), key_xyz), dim=-1)
+
+        return dict(
+            keypoint_features=point_features.squeeze(dim=-1),
+            fusion_keypoint_features=fusion_point_features.squeeze(dim=-1),
+            keypoints=batch_keypoints_xyz)
diff --git a/mmde/mmdet3d/models/necks/__init__.py b/mmde/mmdet3d/models/necks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..53b885cb163ce590f1b6e8ef5ae9dd2542c07e3c
--- /dev/null
+++ b/mmde/mmdet3d/models/necks/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.necks.fpn import FPN
+
+from .dla_neck import DLANeck
+from .imvoxel_neck import IndoorImVoxelNeck, OutdoorImVoxelNeck
+from .pointnet2_fp_neck import PointNetFPNeck
+from .second_fpn import SECONDFPN
+
+__all__ = [
+    'FPN', 'SECONDFPN', 'OutdoorImVoxelNeck', 'PointNetFPNeck', 'DLANeck',
+    'IndoorImVoxelNeck'
+]
diff --git a/mmde/mmdet3d/models/necks/dla_neck.py b/mmde/mmdet3d/models/necks/dla_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ff194b15fa2f48669b2da5bccfff9f6b14f2dc4
--- /dev/null
+++ b/mmde/mmdet3d/models/necks/dla_neck.py
@@ -0,0 +1,233 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import numpy as np
+from mmcv.cnn import ConvModule, build_conv_layer
+from mmengine.model import BaseModule
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+
+
+def fill_up_weights(up):
+    """Simulated bilinear upsampling kernel.
+
+    Args:
+        up (nn.Module): ConvTranspose2d module.
+    """
+    w = up.weight.data
+    f = math.ceil(w.size(2) / 2)
+    c = (2 * f - 1 - f % 2) / (2. * f)
+    for i in range(w.size(2)):
+        for j in range(w.size(3)):
+            w[0, 0, i, j] = \
+                (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
+    for c in range(1, w.size(0)):
+        w[c, 0, :, :] = w[0, 0, :, :]
+
+
+class IDAUpsample(BaseModule):
+    """Iterative Deep Aggregation (IDA) Upsampling module to upsample features
+    of different scales to a similar scale.
+
+    Args:
+        out_channels (int): Number of output channels for DeformConv.
+        in_channels (List[int]): List of input channels of multi-scale
+            feature maps.
+        kernel_sizes (List[int]): List of size of the convolving
+            kernel of different scales.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: None.
+        use_dcn (bool, optional): If True, use DCNv2. Default: True.
+    """
+
+    def __init__(
+        self,
+        out_channels,
+        in_channels,
+        kernel_sizes,
+        norm_cfg=None,
+        use_dcn=True,
+        init_cfg=None,
+    ):
+        super(IDAUpsample, self).__init__(init_cfg)
+        self.use_dcn = use_dcn
+        self.projs = nn.ModuleList()
+        self.ups = nn.ModuleList()
+        self.nodes = nn.ModuleList()
+
+        for i in range(1, len(in_channels)):
+            in_channel = in_channels[i]
+            up_kernel_size = int(kernel_sizes[i])
+            proj = ConvModule(
+                in_channel,
+                out_channels,
+                3,
+                padding=1,
+                bias=True,
+                conv_cfg=dict(type='DCNv2') if self.use_dcn else None,
+                norm_cfg=norm_cfg)
+            node = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                bias=True,
+                conv_cfg=dict(type='DCNv2') if self.use_dcn else None,
+                norm_cfg=norm_cfg)
+            up = build_conv_layer(
+                dict(type='deconv'),
+                out_channels,
+                out_channels,
+                up_kernel_size * 2,
+                stride=up_kernel_size,
+                padding=up_kernel_size // 2,
+                output_padding=0,
+                groups=out_channels,
+                bias=False)
+
+            self.projs.append(proj)
+            self.ups.append(up)
+            self.nodes.append(node)
+
+    def forward(self, mlvl_features, start_level, end_level):
+        """Forward function.
+
+        Args:
+            mlvl_features (list[torch.Tensor]): Features from multiple layers.
+            start_level (int): Start layer for feature upsampling.
+            end_level (int): End layer for feature upsampling.
+        """
+        for i in range(start_level, end_level - 1):
+            upsample = self.ups[i - start_level]
+            project = self.projs[i - start_level]
+            mlvl_features[i + 1] = upsample(project(mlvl_features[i + 1]))
+            node = self.nodes[i - start_level]
+            mlvl_features[i + 1] = node(mlvl_features[i + 1] +
+                                        mlvl_features[i])
+
+
+class DLAUpsample(BaseModule):
+    """Deep Layer Aggregation (DLA) Upsampling module for different scales
+    feature extraction, upsampling and fusion, It consists of groups of
+    IDAupsample modules.
+
+    Args:
+        start_level (int): The start layer.
+        channels (List[int]): List of input channels of multi-scale
+            feature maps.
+        scales(List[int]): List of scale of different layers' feature.
+        in_channels (NoneType, optional): List of input channels of
+            different scales. Default: None.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: None.
+        use_dcn (bool, optional): Whether to use dcn in IDAup module.
+            Default: True.
+    """
+
+    def __init__(self,
+                 start_level,
+                 channels,
+                 scales,
+                 in_channels=None,
+                 norm_cfg=None,
+                 use_dcn=True,
+                 init_cfg=None):
+        super(DLAUpsample, self).__init__(init_cfg)
+        self.start_level = start_level
+        if in_channels is None:
+            in_channels = channels
+        self.channels = channels
+        channels = list(channels)
+        scales = np.array(scales, dtype=int)
+        for i in range(len(channels) - 1):
+            j = -i - 2
+            setattr(
+                self, 'ida_{}'.format(i),
+                IDAUpsample(channels[j], in_channels[j:],
+                            scales[j:] // scales[j], norm_cfg, use_dcn))
+            scales[j + 1:] = scales[j]
+            in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]
+
+    def forward(self, mlvl_features):
+        """Forward function.
+
+        Args:
+            mlvl_features(list[torch.Tensor]): Features from multi-scale
+                layers.
+
+        Returns:
+            tuple[torch.Tensor]: Up-sampled features of different layers.
+        """
+        outs = [mlvl_features[-1]]
+        for i in range(len(mlvl_features) - self.start_level - 1):
+            ida = getattr(self, 'ida_{}'.format(i))
+            ida(mlvl_features, len(mlvl_features) - i - 2, len(mlvl_features))
+            outs.insert(0, mlvl_features[-1])
+        return outs
+
+
+@MODELS.register_module()
+class DLANeck(BaseModule):
+    """DLA Neck.
+
+    Args:
+        in_channels (list[int], optional): List of input channels
+            of multi-scale feature map.
+        start_level (int, optional): The scale level where upsampling
+            starts. Default: 2.
+        end_level (int, optional): The scale level where upsampling
+            ends. Default: 5.
+        norm_cfg (dict, optional): Config dict for normalization
+            layer. Default: None.
+        use_dcn (bool, optional): Whether to use dcn in IDAup module.
+            Default: True.
+    """
+
+    def __init__(self,
+                 in_channels=[16, 32, 64, 128, 256, 512],
+                 start_level=2,
+                 end_level=5,
+                 norm_cfg=None,
+                 use_dcn=True,
+                 init_cfg=None):
+        super(DLANeck, self).__init__(init_cfg)
+        self.start_level = start_level
+        self.end_level = end_level
+        scales = [2**i for i in range(len(in_channels[self.start_level:]))]
+        self.dla_up = DLAUpsample(
+            start_level=self.start_level,
+            channels=in_channels[self.start_level:],
+            scales=scales,
+            norm_cfg=norm_cfg,
+            use_dcn=use_dcn)
+        self.ida_up = IDAUpsample(
+            in_channels[self.start_level],
+            in_channels[self.start_level:self.end_level],
+            [2**i for i in range(self.end_level - self.start_level)], norm_cfg,
+            use_dcn)
+
+    def forward(self, x):
+        mlvl_features = [x[i] for i in range(len(x))]
+        mlvl_features = self.dla_up(mlvl_features)
+        outs = []
+        for i in range(self.end_level - self.start_level):
+            outs.append(mlvl_features[i].clone())
+        self.ida_up(outs, 0, len(outs))
+        return [outs[-1]]
+
+    def init_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.ConvTranspose2d):
+                # In order to be consistent with the source code,
+                # reset the ConvTranspose2d initialization parameters
+                m.reset_parameters()
+                # Simulated bilinear upsampling kernel
+                fill_up_weights(m)
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Conv2d):
+                # In order to be consistent with the source code,
+                # reset the Conv2d initialization parameters
+                m.reset_parameters()
diff --git a/mmde/mmdet3d/models/necks/imvoxel_neck.py b/mmde/mmdet3d/models/necks/imvoxel_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..94facbf29afaf0709a85813602c8aca77fb1737e
--- /dev/null
+++ b/mmde/mmdet3d/models/necks/imvoxel_neck.py
@@ -0,0 +1,230 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import nn
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class OutdoorImVoxelNeck(BaseModule):
+    """Neck for ImVoxelNet outdoor scenario.
+
+    Args:
+        in_channels (int): Number of channels in an input tensor.
+        out_channels (int): Number of channels in all output tensors.
+    """
+
+    def __init__(self, in_channels, out_channels):
+        super(OutdoorImVoxelNeck, self).__init__()
+        self.model = nn.Sequential(
+            ResModule(in_channels, in_channels),
+            ConvModule(
+                in_channels=in_channels,
+                out_channels=in_channels * 2,
+                kernel_size=3,
+                stride=(1, 1, 2),
+                padding=1,
+                conv_cfg=dict(type='Conv3d'),
+                norm_cfg=dict(type='BN3d'),
+                act_cfg=dict(type='ReLU', inplace=True)),
+            ResModule(in_channels * 2, in_channels * 2),
+            ConvModule(
+                in_channels=in_channels * 2,
+                out_channels=in_channels * 4,
+                kernel_size=3,
+                stride=(1, 1, 2),
+                padding=1,
+                conv_cfg=dict(type='Conv3d'),
+                norm_cfg=dict(type='BN3d'),
+                act_cfg=dict(type='ReLU', inplace=True)),
+            ResModule(in_channels * 4, in_channels * 4),
+            ConvModule(
+                in_channels=in_channels * 4,
+                out_channels=out_channels,
+                kernel_size=3,
+                padding=(1, 1, 0),
+                conv_cfg=dict(type='Conv3d'),
+                norm_cfg=dict(type='BN3d'),
+                act_cfg=dict(type='ReLU', inplace=True)))
+
+    def forward(self, x):
+        """Forward function.
+
+        Args:
+            x (torch.Tensor): of shape (N, C_in, N_x, N_y, N_z).
+
+        Returns:
+            list[torch.Tensor]: of shape (N, C_out, N_y, N_x).
+        """
+        x = self.model.forward(x)
+        assert x.shape[-1] == 1
+        # Anchor3DHead axis order is (y, x).
+        return [x[..., 0].transpose(-1, -2)]
+
+    def init_weights(self):
+        """Initialize weights of neck."""
+        pass
+
+
+@MODELS.register_module()
+class IndoorImVoxelNeck(BaseModule):
+    """Neck for ImVoxelNet outdoor scenario.
+
+    Args:
+        in_channels (int): Number of channels in an input tensor.
+        out_channels (int): Number of channels in all output tensors.
+        n_blocks (list[int]): Number of blocks for each feature level.
+    """
+
+    def __init__(self, in_channels, out_channels, n_blocks):
+        super(IndoorImVoxelNeck, self).__init__()
+        self.n_scales = len(n_blocks)
+        n_channels = in_channels
+        for i in range(len(n_blocks)):
+            stride = 1 if i == 0 else 2
+            self.__setattr__(f'down_layer_{i}',
+                             self._make_layer(stride, n_channels, n_blocks[i]))
+            n_channels = n_channels * stride
+            if i > 0:
+                self.__setattr__(
+                    f'up_block_{i}',
+                    self._make_up_block(n_channels, n_channels // 2))
+            self.__setattr__(f'out_block_{i}',
+                             self._make_block(n_channels, out_channels))
+
+    def forward(self, x):
+        """Forward function.
+
+        Args:
+            x (torch.Tensor): of shape (N, C_in, N_x, N_y, N_z).
+
+        Returns:
+            list[torch.Tensor]: of shape (N, C_out, N_xi, N_yi, N_zi).
+        """
+        down_outs = []
+        for i in range(self.n_scales):
+            x = self.__getattr__(f'down_layer_{i}')(x)
+            down_outs.append(x)
+        outs = []
+        for i in range(self.n_scales - 1, -1, -1):
+            if i < self.n_scales - 1:
+                x = self.__getattr__(f'up_block_{i + 1}')(x)
+                x = down_outs[i] + x
+            out = self.__getattr__(f'out_block_{i}')(x)
+            outs.append(out)
+        return outs[::-1]
+
+    @staticmethod
+    def _make_layer(stride, n_channels, n_blocks):
+        """Make a layer from several residual blocks.
+
+        Args:
+            stride (int): Stride of the first residual block.
+            n_channels (int): Number of channels of the first residual block.
+            n_blocks (int): Number of residual blocks.
+
+        Returns:
+            torch.nn.Module: With several residual blocks.
+        """
+        blocks = []
+        for i in range(n_blocks):
+            if i == 0 and stride != 1:
+                blocks.append(ResModule(n_channels, n_channels * 2, stride))
+                n_channels = n_channels * 2
+            else:
+                blocks.append(ResModule(n_channels, n_channels))
+        return nn.Sequential(*blocks)
+
+    @staticmethod
+    def _make_block(in_channels, out_channels):
+        """Make a convolutional block.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+
+        Returns:
+            torch.nn.Module: Convolutional block.
+        """
+        return nn.Sequential(
+            nn.Conv3d(in_channels, out_channels, 3, 1, 1, bias=False),
+            nn.BatchNorm3d(out_channels), nn.ReLU(inplace=True))
+
+    @staticmethod
+    def _make_up_block(in_channels, out_channels):
+        """Make upsampling convolutional block.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+
+        Returns:
+            torch.nn.Module: Upsampling convolutional block.
+        """
+
+        return nn.Sequential(
+            nn.ConvTranspose3d(in_channels, out_channels, 2, 2, bias=False),
+            nn.BatchNorm3d(out_channels), nn.ReLU(inplace=True),
+            nn.Conv3d(out_channels, out_channels, 3, 1, 1, bias=False),
+            nn.BatchNorm3d(out_channels), nn.ReLU(inplace=True))
+
+
+class ResModule(nn.Module):
+    """3d residual block for ImVoxelNeck.
+
+    Args:
+        in_channels (int): Number of channels in input tensor.
+        out_channels (int): Number of channels in output tensor.
+        stride (int, optional): Stride of the block. Defaults to 1.
+    """
+
+    def __init__(self, in_channels, out_channels, stride=1):
+        super().__init__()
+        self.conv0 = ConvModule(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            conv_cfg=dict(type='Conv3d'),
+            norm_cfg=dict(type='BN3d'),
+            act_cfg=dict(type='ReLU', inplace=True))
+        self.conv1 = ConvModule(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            padding=1,
+            conv_cfg=dict(type='Conv3d'),
+            norm_cfg=dict(type='BN3d'),
+            act_cfg=None)
+        if stride != 1:
+            self.downsample = ConvModule(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                stride=stride,
+                padding=0,
+                conv_cfg=dict(type='Conv3d'),
+                norm_cfg=dict(type='BN3d'),
+                act_cfg=None)
+        self.stride = stride
+        self.activation = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """Forward function.
+
+        Args:
+            x (torch.Tensor): of shape (N, C, N_x, N_y, N_z).
+
+        Returns:
+            torch.Tensor: 5d feature map.
+        """
+        identity = x
+        x = self.conv0(x)
+        x = self.conv1(x)
+        if self.stride != 1:
+            identity = self.downsample(identity)
+        x = x + identity
+        x = self.activation(x)
+        return x
diff --git a/mmde/mmdet3d/models/necks/pointnet2_fp_neck.py b/mmde/mmdet3d/models/necks/pointnet2_fp_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..535b0beb27d03a871bfffbbba80663a8aed11a13
--- /dev/null
+++ b/mmde/mmdet3d/models/necks/pointnet2_fp_neck.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.model import BaseModule
+from torch import nn as nn
+
+from mmdet3d.models.layers.pointnet_modules import PointFPModule
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class PointNetFPNeck(BaseModule):
+    r"""PointNet FP Module used in PointRCNN.
+
+    Refer to the `official code <https://github.com/charlesq34/pointnet2>`_.
+
+    .. code-block:: none
+
+        sa_n ----------------------------------------
+                                                     |
+        ... ---------------------------------        |
+                                             |       |
+        sa_1 -------------                   |       |
+                          |                  |       |
+        sa_0 -> fp_0 -> fp_module ->fp_1 -> ... -> fp_module -> fp_n
+
+    sa_n including sa_xyz (torch.Tensor) and sa_features (torch.Tensor)
+    fp_n including fp_xyz (torch.Tensor) and fp_features (torch.Tensor)
+
+    Args:
+        fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self, fp_channels, init_cfg=None):
+        super(PointNetFPNeck, self).__init__(init_cfg=init_cfg)
+
+        self.num_fp = len(fp_channels)
+        self.FP_modules = nn.ModuleList()
+        for cur_fp_mlps in fp_channels:
+            self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps))
+
+    def _extract_input(self, feat_dict):
+        """Extract inputs from features dictionary.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone, which may contain
+                the following keys and values:
+
+                - sa_xyz (list[torch.Tensor]): Points of each sa module
+                    in shape (N, 3).
+                - sa_features (list[torch.Tensor]): Output features of
+                    each sa module in shape (N, M).
+
+        Returns:
+            list[torch.Tensor]: Coordinates of multiple levels of points.
+            list[torch.Tensor]: Features of multiple levels of points.
+        """
+        sa_xyz = feat_dict['sa_xyz']
+        sa_features = feat_dict['sa_features']
+        assert len(sa_xyz) == len(sa_features)
+
+        return sa_xyz, sa_features
+
+    def forward(self, feat_dict):
+        """Forward pass.
+
+        Args:
+            feat_dict (dict): Feature dict from backbone.
+
+        Returns:
+            dict[str, torch.Tensor]: Outputs of the Neck.
+
+                - fp_xyz (torch.Tensor): The coordinates of fp features.
+                - fp_features (torch.Tensor): The features from the last
+                    feature propagation layers.
+        """
+        sa_xyz, sa_features = self._extract_input(feat_dict)
+
+        fp_feature = sa_features[-1]
+        fp_xyz = sa_xyz[-1]
+
+        for i in range(self.num_fp):
+            # consume the points in a bottom-up manner
+            fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)],
+                                            sa_features[-(i + 2)], fp_feature)
+            fp_xyz = sa_xyz[-(i + 2)]
+
+        ret = dict(fp_xyz=fp_xyz, fp_features=fp_feature)
+        return ret
diff --git a/mmde/mmdet3d/models/necks/second_fpn.py b/mmde/mmdet3d/models/necks/second_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4dc590c1529c3962e8b20b09b0fc4c965415a4c
--- /dev/null
+++ b/mmde/mmdet3d/models/necks/second_fpn.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer
+from mmengine.model import BaseModule
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class SECONDFPN(BaseModule):
+    """FPN used in SECOND/PointPillars/PartA2/MVXNet.
+
+    Args:
+        in_channels (list[int]): Input channels of multi-scale feature maps.
+        out_channels (list[int]): Output channels of feature maps.
+        upsample_strides (list[int]): Strides used to upsample the
+            feature maps.
+        norm_cfg (dict): Config dict of normalization layers.
+        upsample_cfg (dict): Config dict of upsample layers.
+        conv_cfg (dict): Config dict of conv layers.
+        use_conv_for_no_stride (bool): Whether to use conv when stride is 1.
+        init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`],
+            optional): Initialization config dict. Defaults to
+            [dict(type='Kaiming', layer='ConvTranspose2d'),
+             dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0)].
+    """
+
+    def __init__(self,
+                 in_channels=[128, 128, 256],
+                 out_channels=[256, 256, 256],
+                 upsample_strides=[1, 2, 4],
+                 norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+                 upsample_cfg=dict(type='deconv', bias=False),
+                 conv_cfg=dict(type='Conv2d', bias=False),
+                 use_conv_for_no_stride=False,
+                 init_cfg=[
+                     dict(type='Kaiming', layer='ConvTranspose2d'),
+                     dict(
+                         type='Constant',
+                         layer='NaiveSyncBatchNorm2d',
+                         val=1.0)
+                 ]):
+        # if for GroupNorm,
+        # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True)
+        super(SECONDFPN, self).__init__(init_cfg=init_cfg)
+        assert len(out_channels) == len(upsample_strides) == len(in_channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        deblocks = []
+        for i, out_channel in enumerate(out_channels):
+            stride = upsample_strides[i]
+            if stride > 1 or (stride == 1 and not use_conv_for_no_stride):
+                upsample_layer = build_upsample_layer(
+                    upsample_cfg,
+                    in_channels=in_channels[i],
+                    out_channels=out_channel,
+                    kernel_size=upsample_strides[i],
+                    stride=upsample_strides[i])
+            else:
+                stride = np.round(1 / stride).astype(np.int64)
+                upsample_layer = build_conv_layer(
+                    conv_cfg,
+                    in_channels=in_channels[i],
+                    out_channels=out_channel,
+                    kernel_size=stride,
+                    stride=stride)
+
+            deblock = nn.Sequential(upsample_layer,
+                                    build_norm_layer(norm_cfg, out_channel)[1],
+                                    nn.ReLU(inplace=True))
+            deblocks.append(deblock)
+        self.deblocks = nn.ModuleList(deblocks)
+
+    def forward(self, x):
+        """Forward function.
+
+        Args:
+            x (List[torch.Tensor]): Multi-level features with 4D Tensor in
+                (N, C, H, W) shape.
+
+        Returns:
+            list[torch.Tensor]: Multi-level feature maps.
+        """
+        assert len(x) == len(self.in_channels)
+        ups = [deblock(x[i]) for i, deblock in enumerate(self.deblocks)]
+
+        if len(ups) > 1:
+            out = torch.cat(ups, dim=1)
+        else:
+            out = ups[0]
+        return [out]
diff --git a/mmde/mmdet3d/models/roi_heads/__init__.py b/mmde/mmdet3d/models/roi_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e90b1a755dc4ac45677269d9cbeef434199ab38
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_3droi_head import Base3DRoIHead
+from .bbox_heads import PartA2BboxHead
+from .h3d_roi_head import H3DRoIHead
+from .mask_heads import PointwiseSemanticHead, PrimitiveHead
+from .part_aggregation_roi_head import PartAggregationROIHead
+from .point_rcnn_roi_head import PointRCNNRoIHead
+from .pv_rcnn_roi_head import PVRCNNRoiHead
+from .roi_extractors import Single3DRoIAwareExtractor, SingleRoIExtractor
+
+__all__ = [
+    'Base3DRoIHead', 'PartAggregationROIHead', 'PointwiseSemanticHead',
+    'Single3DRoIAwareExtractor', 'PartA2BboxHead', 'SingleRoIExtractor',
+    'H3DRoIHead', 'PrimitiveHead', 'PointRCNNRoIHead', 'PVRCNNRoiHead'
+]
diff --git a/mmde/mmdet3d/models/roi_heads/base_3droi_head.py b/mmde/mmdet3d/models/roi_heads/base_3droi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a6fb6bfd8715e548779b80b8969fbd50e832dfe
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/base_3droi_head.py
@@ -0,0 +1,55 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.roi_heads import BaseRoIHead
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+
+
+class Base3DRoIHead(BaseRoIHead):
+    """Base class for 3d RoIHeads."""
+
+    def __init__(self,
+                 bbox_head=None,
+                 bbox_roi_extractor=None,
+                 mask_head=None,
+                 mask_roi_extractor=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None):
+        super(Base3DRoIHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            mask_head=mask_head,
+            mask_roi_extractor=mask_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+
+    def init_bbox_head(self, bbox_roi_extractor: dict,
+                       bbox_head: dict) -> None:
+        """Initialize box head and box roi extractor.
+
+        Args:
+            bbox_roi_extractor (dict or ConfigDict): Config of box
+                roi extractor.
+            bbox_head (dict or ConfigDict): Config of box in box head.
+        """
+        self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor)
+        self.bbox_head = MODELS.build(bbox_head)
+
+    def init_assigner_sampler(self):
+        """Initialize assigner and sampler."""
+        self.bbox_assigner = None
+        self.bbox_sampler = None
+        if self.train_cfg:
+            if isinstance(self.train_cfg.assigner, dict):
+                self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
+            elif isinstance(self.train_cfg.assigner, list):
+                self.bbox_assigner = [
+                    TASK_UTILS.build(res) for res in self.train_cfg.assigner
+                ]
+            self.bbox_sampler = TASK_UTILS.build(self.train_cfg.sampler)
+
+    def init_mask_head(self):
+        """Initialize mask head, skip since ``PartAggregationROIHead`` does not
+        have one."""
+        pass
diff --git a/mmde/mmdet3d/models/roi_heads/bbox_heads/__init__.py b/mmde/mmdet3d/models/roi_heads/bbox_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..994465ed8db0f1a5b6b5e8c536824ada757ac72a
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/bbox_heads/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.roi_heads.bbox_heads import (BBoxHead, ConvFCBBoxHead,
+                                               DoubleConvFCBBoxHead,
+                                               Shared2FCBBoxHead,
+                                               Shared4Conv1FCBBoxHead)
+
+from .h3d_bbox_head import H3DBboxHead
+from .parta2_bbox_head import PartA2BboxHead
+from .point_rcnn_bbox_head import PointRCNNBboxHead
+from .pv_rcnn_bbox_head import PVRCNNBBoxHead
+
+__all__ = [
+    'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead',
+    'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'PartA2BboxHead',
+    'H3DBboxHead', 'PointRCNNBboxHead', 'PVRCNNBBoxHead'
+]
diff --git a/mmde/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py b/mmde/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8168a5ef8675c3c06e286e775a4f4f5025d33723
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py
@@ -0,0 +1,990 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from mmcv.cnn import ConvModule
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.models import aligned_3d_nms
+from mmdet3d.models.layers.pointnet_modules import build_sa_module
+from mmdet3d.models.losses import chamfer_distance
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures import (BaseInstance3DBoxes, DepthInstance3DBoxes,
+                                Det3DDataSample)
+
+
+@MODELS.register_module()
+class H3DBboxHead(BaseModule):
+    r"""Bbox head of `H3DNet <https://arxiv.org/abs/2006.05682>`_.
+
+    Args:
+        num_classes (int): The number of classes.
+        surface_matching_cfg (dict): Config for surface primitive matching.
+        line_matching_cfg (dict): Config for line primitive matching.
+        bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
+            decoding boxes.
+        train_cfg (dict): Config for training. Defaults to None.
+        test_cfg (dict): Config for testing. Defaults to None.
+        gt_per_seed (int): Number of ground truth votes generated
+            from each seed point. Defaults to 1.
+        num_proposal (int): Number of proposal votes generated.
+            Defaults to 256.
+        primitive_feat_refine_streams (int): The number of mlps to
+            refine primitive feature. Defaults to 2.
+        primitive_refine_channels (tuple[int]): Convolution channels of
+            prediction layer. Defaults to [128, 128, 128].
+        upper_thresh (float): Threshold for line matching. Defaults to 100.
+        surface_thresh (float): Threshold for surface matching.
+            Defaults to 0.5.
+        line_thresh (float): Threshold for line matching.  Defaults to 0.5.
+        conv_cfg (dict): Config of convolution in prediction layer.
+            Defaults to None.
+        norm_cfg (dict): Config of BN in prediction layer.  Defaults to None.
+        objectness_loss (dict): Config of objectness loss.  Defaults to None.
+        center_loss (dict): Config of center loss.  Defaults to None.
+        dir_class_loss (dict): Config of direction classification loss.
+            Defaults to None.
+        dir_res_loss (dict): Config of direction residual regression loss.
+            Defaults to None.
+        size_class_loss (dict): Config of size classification loss.
+            Defaults to None.
+        size_res_loss (dict): Config of size residual regression loss.
+            Defaults to None.
+        semantic_loss (dict): Config of point-wise semantic segmentation loss.
+             Defaults to None.
+        cues_objectness_loss (dict): Config of cues objectness loss.
+             Defaults to None.
+        cues_semantic_loss (dict): Config of cues semantic loss.
+             Defaults to None.
+        proposal_objectness_loss (dict): Config of proposal objectness
+            loss.  Defaults to None.
+        primitive_center_loss (dict): Config of primitive center regression
+            loss.  Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 suface_matching_cfg: dict,
+                 line_matching_cfg: dict,
+                 bbox_coder: dict,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 gt_per_seed: int = 1,
+                 num_proposal: int = 256,
+                 primitive_feat_refine_streams: int = 2,
+                 primitive_refine_channels: List[int] = [128, 128, 128],
+                 upper_thresh: float = 100.0,
+                 surface_thresh: float = 0.5,
+                 line_thresh: float = 0.5,
+                 conv_cfg: dict = dict(type='Conv1d'),
+                 norm_cfg: dict = dict(type='BN1d'),
+                 objectness_loss: Optional[dict] = None,
+                 center_loss: Optional[dict] = None,
+                 dir_class_loss: Optional[dict] = None,
+                 dir_res_loss: Optional[dict] = None,
+                 size_class_loss: Optional[dict] = None,
+                 size_res_loss: Optional[dict] = None,
+                 semantic_loss: Optional[dict] = None,
+                 cues_objectness_loss: Optional[dict] = None,
+                 cues_semantic_loss: Optional[dict] = None,
+                 proposal_objectness_loss: Optional[dict] = None,
+                 primitive_center_loss: Optional[dict] = None,
+                 init_cfg: dict = None):
+        super(H3DBboxHead, self).__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.gt_per_seed = gt_per_seed
+        self.num_proposal = num_proposal
+        self.with_angle = bbox_coder['with_rot']
+        self.upper_thresh = upper_thresh
+        self.surface_thresh = surface_thresh
+        self.line_thresh = line_thresh
+
+        self.loss_objectness = MODELS.build(objectness_loss)
+        self.loss_center = MODELS.build(center_loss)
+        self.loss_dir_class = MODELS.build(dir_class_loss)
+        self.loss_dir_res = MODELS.build(dir_res_loss)
+        self.loss_size_class = MODELS.build(size_class_loss)
+        self.loss_size_res = MODELS.build(size_res_loss)
+        self.loss_semantic = MODELS.build(semantic_loss)
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.num_sizes = self.bbox_coder.num_sizes
+        self.num_dir_bins = self.bbox_coder.num_dir_bins
+
+        self.loss_cues_objectness = MODELS.build(cues_objectness_loss)
+        self.loss_cues_semantic = MODELS.build(cues_semantic_loss)
+        self.loss_proposal_objectness = MODELS.build(proposal_objectness_loss)
+        self.loss_primitive_center = MODELS.build(primitive_center_loss)
+
+        assert suface_matching_cfg['mlp_channels'][-1] == \
+            line_matching_cfg['mlp_channels'][-1]
+
+        # surface center matching
+        self.surface_center_matcher = build_sa_module(suface_matching_cfg)
+        # line center matching
+        self.line_center_matcher = build_sa_module(line_matching_cfg)
+
+        # Compute the matching scores
+        matching_feat_dims = suface_matching_cfg['mlp_channels'][-1]
+        self.matching_conv = ConvModule(
+            matching_feat_dims,
+            matching_feat_dims,
+            1,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            bias=True,
+            inplace=True)
+        self.matching_pred = nn.Conv1d(matching_feat_dims, 2, 1)
+
+        # Compute the semantic matching scores
+        self.semantic_matching_conv = ConvModule(
+            matching_feat_dims,
+            matching_feat_dims,
+            1,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            bias=True,
+            inplace=True)
+        self.semantic_matching_pred = nn.Conv1d(matching_feat_dims, 2, 1)
+
+        # Surface feature aggregation
+        self.surface_feats_aggregation = list()
+        for k in range(primitive_feat_refine_streams):
+            self.surface_feats_aggregation.append(
+                ConvModule(
+                    matching_feat_dims,
+                    matching_feat_dims,
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    bias=True,
+                    inplace=True))
+        self.surface_feats_aggregation = nn.Sequential(
+            *self.surface_feats_aggregation)
+
+        # Line feature aggregation
+        self.line_feats_aggregation = list()
+        for k in range(primitive_feat_refine_streams):
+            self.line_feats_aggregation.append(
+                ConvModule(
+                    matching_feat_dims,
+                    matching_feat_dims,
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    bias=True,
+                    inplace=True))
+        self.line_feats_aggregation = nn.Sequential(
+            *self.line_feats_aggregation)
+
+        # surface center(6) + line center(12)
+        prev_channel = 18 * matching_feat_dims
+        self.bbox_pred = nn.ModuleList()
+        for k in range(len(primitive_refine_channels)):
+            self.bbox_pred.append(
+                ConvModule(
+                    prev_channel,
+                    primitive_refine_channels[k],
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    bias=True,
+                    inplace=False))
+            prev_channel = primitive_refine_channels[k]
+
+        # Final object detection
+        # Objectness scores (2), center residual (3),
+        # heading class+residual (num_heading_bin*2), size class +
+        # residual(num_size_cluster*4)
+        conv_out_channel = (2 + 3 + bbox_coder['num_dir_bins'] * 2 +
+                            bbox_coder['num_sizes'] * 4 + self.num_classes)
+        self.bbox_pred.append(nn.Conv1d(prev_channel, conv_out_channel, 1))
+
+    def forward(self, feats_dict: dict):
+        """Forward pass.
+
+        Args:
+            feats_dict (dict): Feature dict from backbone.
+
+        Returns:
+            dict: Predictions of head.
+        """
+        ret_dict = {}
+        aggregated_points = feats_dict['aggregated_points']
+        original_feature = feats_dict['aggregated_features']
+        batch_size = original_feature.shape[0]
+        object_proposal = original_feature.shape[2]
+
+        # Extract surface center, features and semantic predictions
+        z_center = feats_dict['pred_z_center']
+        xy_center = feats_dict['pred_xy_center']
+        z_semantic = feats_dict['sem_cls_scores_z']
+        xy_semantic = feats_dict['sem_cls_scores_xy']
+        z_feature = feats_dict['aggregated_features_z']
+        xy_feature = feats_dict['aggregated_features_xy']
+        # Extract line points and features
+        line_center = feats_dict['pred_line_center']
+        line_feature = feats_dict['aggregated_features_line']
+
+        surface_center_pred = torch.cat((z_center, xy_center), dim=1)
+        ret_dict['surface_center_pred'] = surface_center_pred
+        ret_dict['surface_sem_pred'] = torch.cat((z_semantic, xy_semantic),
+                                                 dim=1)
+
+        # Extract the surface and line centers of rpn proposals
+        rpn_proposals = feats_dict['rpn_proposals']
+        rpn_proposals_bbox = DepthInstance3DBoxes(
+            rpn_proposals.reshape(-1, 7).clone(),
+            box_dim=rpn_proposals.shape[-1],
+            with_yaw=self.with_angle,
+            origin=(0.5, 0.5, 0.5))
+
+        obj_surface_center, obj_line_center = \
+            rpn_proposals_bbox.get_surface_line_center()
+        obj_surface_center = obj_surface_center.reshape(
+            batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3)
+        obj_line_center = obj_line_center.reshape(batch_size, -1, 12,
+                                                  3).transpose(1, 2).reshape(
+                                                      batch_size, -1, 3)
+        ret_dict['surface_center_object'] = obj_surface_center
+        ret_dict['line_center_object'] = obj_line_center
+
+        # aggregate primitive z and xy features to rpn proposals
+        surface_center_feature_pred = torch.cat((z_feature, xy_feature), dim=2)
+        surface_center_feature_pred = torch.cat(
+            (surface_center_feature_pred.new_zeros(
+                (batch_size, 6, surface_center_feature_pred.shape[2])),
+             surface_center_feature_pred),
+            dim=1)
+
+        surface_xyz, surface_features, _ = self.surface_center_matcher(
+            surface_center_pred,
+            surface_center_feature_pred,
+            target_xyz=obj_surface_center)
+
+        # aggregate primitive line features to rpn proposals
+        line_feature = torch.cat((line_feature.new_zeros(
+            (batch_size, 12, line_feature.shape[2])), line_feature),
+                                 dim=1)
+        line_xyz, line_features, _ = self.line_center_matcher(
+            line_center, line_feature, target_xyz=obj_line_center)
+
+        # combine the surface and line features
+        combine_features = torch.cat((surface_features, line_features), dim=2)
+
+        matching_features = self.matching_conv(combine_features)
+        matching_score = self.matching_pred(matching_features)
+        ret_dict['matching_score'] = matching_score.transpose(2, 1)
+
+        semantic_matching_features = self.semantic_matching_conv(
+            combine_features)
+        semantic_matching_score = self.semantic_matching_pred(
+            semantic_matching_features)
+        ret_dict['semantic_matching_score'] = \
+            semantic_matching_score.transpose(2, 1)
+
+        surface_features = self.surface_feats_aggregation(surface_features)
+        line_features = self.line_feats_aggregation(line_features)
+
+        # Combine all surface and line features
+        surface_features = surface_features.view(batch_size, -1,
+                                                 object_proposal)
+        line_features = line_features.view(batch_size, -1, object_proposal)
+
+        combine_feature = torch.cat((surface_features, line_features), dim=1)
+
+        # Final bbox predictions
+        bbox_predictions = self.bbox_pred[0](combine_feature)
+        bbox_predictions += original_feature
+        for conv_module in self.bbox_pred[1:]:
+            bbox_predictions = conv_module(bbox_predictions)
+
+        refine_decode_res = self.bbox_coder.split_pred(
+            bbox_predictions[:, :self.num_classes + 2],
+            bbox_predictions[:, self.num_classes + 2:], aggregated_points)
+        for key in refine_decode_res.keys():
+            ret_dict[key + '_optimized'] = refine_decode_res[key]
+        return ret_dict
+
+    def loss(
+        self,
+        points: List[Tensor],
+        feats_dict: dict,
+        rpn_targets: Tuple = None,
+        batch_data_samples: List[Det3DDataSample] = None,
+    ):
+        """
+        Args:
+            points (list[tensor]): Points cloud of multiple samples.
+            feats_dict (dict): Predictions from backbone or FPN.
+            rpn_targets (Tuple, Optional): The target of sample from RPN.
+                Defaults to None.
+            batch_data_samples (list[:obj:`Det3DDataSample`], Optional):
+                Each item contains the meta information of each sample
+                and corresponding annotations. Defaults to None.
+
+        Returns:
+            dict:  A dictionary of loss components.
+        """
+        preds = self(feats_dict)
+        feats_dict.update(preds)
+
+        (vote_targets, vote_target_masks, size_class_targets, size_res_targets,
+         dir_class_targets, dir_res_targets, center_targets, _, mask_targets,
+         valid_gt_masks, objectness_targets, objectness_weights,
+         box_loss_weights, valid_gt_weights) = rpn_targets
+
+        losses = {}
+
+        # calculate refined proposal loss
+        refined_proposal_loss = self.get_proposal_stage_loss(
+            feats_dict,
+            size_class_targets,
+            size_res_targets,
+            dir_class_targets,
+            dir_res_targets,
+            center_targets,
+            mask_targets,
+            objectness_targets,
+            objectness_weights,
+            box_loss_weights,
+            valid_gt_weights,
+            suffix='_optimized')
+        for key in refined_proposal_loss.keys():
+            losses[key + '_optimized'] = refined_proposal_loss[key]
+
+        batch_gt_instance_3d = []
+        batch_input_metas = []
+
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instance_3d.append(data_sample.gt_instances_3d)
+
+        temp_loss = self.loss_by_feat(points, feats_dict, batch_gt_instance_3d)
+        losses.update(temp_loss)
+        return losses
+
+    def loss_by_feat(self, points: List[torch.Tensor], feats_dict: dict,
+                     batch_gt_instances_3d: List[InstanceData],
+                     **kwargs) -> dict:
+        """Compute loss.
+
+        Args:
+            points (list[torch.Tensor]): Input points.
+            feats_dict (dict): Predictions from forward of vote head.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            dict: Losses of H3DNet.
+        """
+        bbox3d_optimized = self.bbox_coder.decode(
+            feats_dict, suffix='_optimized')
+
+        targets = self.get_targets(points, feats_dict, batch_gt_instances_3d)
+
+        (cues_objectness_label, cues_sem_label, proposal_objectness_label,
+         cues_mask, cues_match_mask, proposal_objectness_mask,
+         cues_matching_label, obj_surface_line_center) = targets
+
+        # match scores for each geometric primitive
+        objectness_scores = feats_dict['matching_score']
+        # match scores for the semantics of primitives
+        objectness_scores_sem = feats_dict['semantic_matching_score']
+
+        primitive_objectness_loss = self.loss_cues_objectness(
+            objectness_scores.transpose(2, 1),
+            cues_objectness_label,
+            weight=cues_mask,
+            avg_factor=cues_mask.sum() + 1e-6)
+
+        primitive_sem_loss = self.loss_cues_semantic(
+            objectness_scores_sem.transpose(2, 1),
+            cues_sem_label,
+            weight=cues_mask,
+            avg_factor=cues_mask.sum() + 1e-6)
+
+        objectness_scores = feats_dict['obj_scores_optimized']
+        objectness_loss_refine = self.loss_proposal_objectness(
+            objectness_scores.transpose(2, 1), proposal_objectness_label)
+        primitive_matching_loss = (objectness_loss_refine *
+                                   cues_match_mask).sum() / (
+                                       cues_match_mask.sum() + 1e-6) * 0.5
+        primitive_sem_matching_loss = (
+            objectness_loss_refine * proposal_objectness_mask).sum() / (
+                proposal_objectness_mask.sum() + 1e-6) * 0.5
+
+        # Get the object surface center here
+        batch_size, object_proposal = bbox3d_optimized.shape[:2]
+        refined_bbox = DepthInstance3DBoxes(
+            bbox3d_optimized.reshape(-1, 7).clone(),
+            box_dim=bbox3d_optimized.shape[-1],
+            with_yaw=self.with_angle,
+            origin=(0.5, 0.5, 0.5))
+
+        pred_obj_surface_center, pred_obj_line_center = \
+            refined_bbox.get_surface_line_center()
+        pred_obj_surface_center = pred_obj_surface_center.reshape(
+            batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3)
+        pred_obj_line_center = pred_obj_line_center.reshape(
+            batch_size, -1, 12, 3).transpose(1, 2).reshape(batch_size, -1, 3)
+        pred_surface_line_center = torch.cat(
+            (pred_obj_surface_center, pred_obj_line_center), 1)
+
+        square_dist = self.loss_primitive_center(pred_surface_line_center,
+                                                 obj_surface_line_center)
+
+        match_dist = torch.sqrt(square_dist.sum(dim=-1) + 1e-6)
+        primitive_centroid_reg_loss = torch.sum(
+            match_dist * cues_matching_label) / (
+                cues_matching_label.sum() + 1e-6)
+
+        refined_loss = dict(
+            primitive_objectness_loss=primitive_objectness_loss,
+            primitive_sem_loss=primitive_sem_loss,
+            primitive_matching_loss=primitive_matching_loss,
+            primitive_sem_matching_loss=primitive_sem_matching_loss,
+            primitive_centroid_reg_loss=primitive_centroid_reg_loss)
+
+        return refined_loss
+
+    def predict(self,
+                points: List[torch.Tensor],
+                feats_dict: Dict[str, torch.Tensor],
+                batch_data_samples: List[Det3DDataSample],
+                suffix='_optimized',
+                **kwargs) -> List[InstanceData]:
+        """
+        Args:
+            points (list[tensor]): Point clouds of multiple samples.
+            feats_dict (dict): Features from FPN or backbone..
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes meta information of data.
+            suffix (str): suffix for tensor in feats_dict.
+                Defaults to '_optimized'.
+
+        Returns:
+            list[:obj:`InstanceData`]: List of processed predictions. Each
+            InstanceData contains 3d Bounding boxes and corresponding
+            scores and labels.
+        """
+        preds_dict = self(feats_dict)
+        # `preds_dict` can be used in H3DNET
+        feats_dict.update(preds_dict)
+
+        batch_size = len(batch_data_samples)
+        batch_input_metas = []
+        for batch_index in range(batch_size):
+            metainfo = batch_data_samples[batch_index].metainfo
+            batch_input_metas.append(metainfo)
+
+        results_list = self.predict_by_feat(
+            points, feats_dict, batch_input_metas, suffix=suffix, **kwargs)
+        return results_list
+
+    def predict_by_feat(self,
+                        points: List[torch.Tensor],
+                        feats_dict: dict,
+                        batch_input_metas: List[dict],
+                        suffix='_optimized',
+                        **kwargs) -> List[InstanceData]:
+        """Generate bboxes from vote head predictions.
+
+        Args:
+            points (List[torch.Tensor]): Input points of multiple samples.
+            feats_dict (dict): Predictions from previous components.
+            batch_input_metas (list[dict]): Each item
+                contains the meta information of each sample.
+            suffix (str): suffix for tensor in feats_dict.
+                Defaults to '_optimized'.
+
+        Returns:
+            list[:obj:`InstanceData`]: Return list of processed
+            predictions. Each InstanceData cantains
+            3d Bounding boxes and corresponding scores and labels.
+        """
+
+        # decode boxes
+        obj_scores = F.softmax(
+            feats_dict['obj_scores' + suffix], dim=-1)[..., -1]
+
+        sem_scores = F.softmax(feats_dict['sem_scores'], dim=-1)
+
+        prediction_collection = {}
+        prediction_collection['center'] = feats_dict['center' + suffix]
+        prediction_collection['dir_class'] = feats_dict['dir_class']
+        prediction_collection['dir_res'] = feats_dict['dir_res' + suffix]
+        prediction_collection['size_class'] = feats_dict['size_class']
+        prediction_collection['size_res'] = feats_dict['size_res' + suffix]
+
+        bbox3d = self.bbox_coder.decode(prediction_collection)
+
+        batch_size = bbox3d.shape[0]
+        results_list = list()
+        points = torch.stack(points)
+        for b in range(batch_size):
+            temp_results = InstanceData()
+            bbox_selected, score_selected, labels = self.multiclass_nms_single(
+                obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3],
+                batch_input_metas[b])
+            bbox = batch_input_metas[b]['box_type_3d'](
+                bbox_selected,
+                box_dim=bbox_selected.shape[-1],
+                with_yaw=self.bbox_coder.with_rot)
+
+            temp_results.bboxes_3d = bbox
+            temp_results.scores_3d = score_selected
+            temp_results.labels_3d = labels
+            results_list.append(temp_results)
+
+        return results_list
+
+    def multiclass_nms_single(self, obj_scores: Tensor, sem_scores: Tensor,
+                              bbox: Tensor, points: Tensor,
+                              input_meta: dict) -> Tuple:
+        """Multi-class nms in single batch.
+
+        Args:
+            obj_scores (torch.Tensor): Objectness score of bounding boxes.
+            sem_scores (torch.Tensor): semantic class score of bounding boxes.
+            bbox (torch.Tensor): Predicted bounding boxes.
+            points (torch.Tensor): Input points.
+            input_meta (dict): Point cloud and image's meta info.
+
+        Returns:
+            tuple[torch.Tensor]: Bounding boxes, scores and labels.
+        """
+        bbox = input_meta['box_type_3d'](
+            bbox,
+            box_dim=bbox.shape[-1],
+            with_yaw=self.bbox_coder.with_rot,
+            origin=(0.5, 0.5, 0.5))
+        box_indices = bbox.points_in_boxes_all(points)
+
+        corner3d = bbox.corners
+        minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
+        minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
+        minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
+
+        nonempty_box_mask = box_indices.T.sum(1) > 5
+
+        bbox_classes = torch.argmax(sem_scores, -1)
+        nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask],
+                                      obj_scores[nonempty_box_mask],
+                                      bbox_classes[nonempty_box_mask],
+                                      self.test_cfg.nms_thr)
+
+        # filter empty boxes and boxes with low score
+        scores_mask = (obj_scores > self.test_cfg.score_thr)
+        nonempty_box_inds = torch.nonzero(
+            nonempty_box_mask, as_tuple=False).flatten()
+        nonempty_mask = torch.zeros_like(bbox_classes).scatter(
+            0, nonempty_box_inds[nms_selected], 1)
+        selected = (nonempty_mask.bool() & scores_mask.bool())
+
+        if self.test_cfg.per_class_proposal:
+            bbox_selected, score_selected, labels = [], [], []
+            for k in range(sem_scores.shape[-1]):
+                bbox_selected.append(bbox[selected].tensor)
+                score_selected.append(obj_scores[selected] *
+                                      sem_scores[selected][:, k])
+                labels.append(
+                    torch.zeros_like(bbox_classes[selected]).fill_(k))
+            bbox_selected = torch.cat(bbox_selected, 0)
+            score_selected = torch.cat(score_selected, 0)
+            labels = torch.cat(labels, 0)
+        else:
+            bbox_selected = bbox[selected].tensor
+            score_selected = obj_scores[selected]
+            labels = bbox_classes[selected]
+
+        return bbox_selected, score_selected, labels
+
+    def get_proposal_stage_loss(self,
+                                bbox_preds,
+                                size_class_targets,
+                                size_res_targets,
+                                dir_class_targets,
+                                dir_res_targets,
+                                center_targets,
+                                mask_targets,
+                                objectness_targets,
+                                objectness_weights,
+                                box_loss_weights,
+                                valid_gt_weights,
+                                suffix=''):
+        """Compute loss for the aggregation module.
+
+        Args:
+            bbox_preds (dict): Predictions from forward of vote head.
+            size_class_targets (torch.Tensor): Ground truth
+                size class of each prediction bounding box.
+            size_res_targets (torch.Tensor): Ground truth
+                size residual of each prediction bounding box.
+            dir_class_targets (torch.Tensor): Ground truth
+                direction class of each prediction bounding box.
+            dir_res_targets (torch.Tensor): Ground truth
+                direction residual of each prediction bounding box.
+            center_targets (torch.Tensor): Ground truth center
+                of each prediction bounding box.
+            mask_targets (torch.Tensor): Validation of each
+                prediction bounding box.
+            objectness_targets (torch.Tensor): Ground truth
+                objectness label of each prediction bounding box.
+            objectness_weights (torch.Tensor): Weights of objectness
+                loss for each prediction bounding box.
+            box_loss_weights (torch.Tensor): Weights of regression
+                loss for each prediction bounding box.
+            valid_gt_weights (torch.Tensor): Validation of each
+                ground truth bounding box.
+
+        Returns:
+            dict: Losses of aggregation module.
+        """
+        # calculate objectness loss
+        objectness_loss = self.loss_objectness(
+            bbox_preds['obj_scores' + suffix].transpose(2, 1),
+            objectness_targets,
+            weight=objectness_weights)
+
+        # calculate center loss
+        source2target_loss, target2source_loss = self.loss_center(
+            bbox_preds['center' + suffix],
+            center_targets,
+            src_weight=box_loss_weights,
+            dst_weight=valid_gt_weights)
+        center_loss = source2target_loss + target2source_loss
+
+        # calculate direction class loss
+        dir_class_loss = self.loss_dir_class(
+            bbox_preds['dir_class' + suffix].transpose(2, 1),
+            dir_class_targets,
+            weight=box_loss_weights)
+
+        # calculate direction residual loss
+        batch_size, proposal_num = size_class_targets.shape[:2]
+        heading_label_one_hot = dir_class_targets.new_zeros(
+            (batch_size, proposal_num, self.num_dir_bins))
+        heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1)
+        dir_res_norm = (bbox_preds['dir_res_norm' + suffix] *
+                        heading_label_one_hot).sum(dim=-1)
+        dir_res_loss = self.loss_dir_res(
+            dir_res_norm, dir_res_targets, weight=box_loss_weights)
+
+        # calculate size class loss
+        size_class_loss = self.loss_size_class(
+            bbox_preds['size_class' + suffix].transpose(2, 1),
+            size_class_targets,
+            weight=box_loss_weights)
+
+        # calculate size residual loss
+        one_hot_size_targets = box_loss_weights.new_zeros(
+            (batch_size, proposal_num, self.num_sizes))
+        one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1)
+        one_hot_size_targets_expand = one_hot_size_targets.unsqueeze(
+            -1).repeat(1, 1, 1, 3)
+        size_residual_norm = (bbox_preds['size_res_norm' + suffix] *
+                              one_hot_size_targets_expand).sum(dim=2)
+        box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat(
+            1, 1, 3)
+        size_res_loss = self.loss_size_res(
+            size_residual_norm,
+            size_res_targets,
+            weight=box_loss_weights_expand)
+
+        # calculate semantic loss
+        semantic_loss = self.loss_semantic(
+            bbox_preds['sem_scores' + suffix].transpose(2, 1),
+            mask_targets,
+            weight=box_loss_weights)
+
+        losses = dict(
+            objectness_loss=objectness_loss,
+            semantic_loss=semantic_loss,
+            center_loss=center_loss,
+            dir_class_loss=dir_class_loss,
+            dir_res_loss=dir_res_loss,
+            size_class_loss=size_class_loss,
+            size_res_loss=size_res_loss)
+
+        return losses
+
+    def get_targets(
+        self,
+        points,
+        feats_dict: Optional[dict] = None,
+        batch_gt_instances_3d: Optional[List[InstanceData]] = None,
+    ):
+        """Generate targets of vote head.
+
+        Args:
+            points (list[torch.Tensor]): Points of each batch.
+            feats_dict (dict, optional): Predictions of previous
+                components. Defaults to None.
+            batch_gt_instances_3d (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances. It usually includes
+                ``bboxes_3d`` and ``labels_3d`` attributes.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of vote head.
+        """
+        # find empty example
+        valid_gt_masks = list()
+        gt_num = list()
+        batch_gt_labels_3d = [
+            gt_instances_3d.labels_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        batch_gt_bboxes_3d = [
+            gt_instances_3d.bboxes_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        for index in range(len(batch_gt_labels_3d)):
+            if len(batch_gt_labels_3d[index]) == 0:
+                fake_box = batch_gt_bboxes_3d[index].tensor.new_zeros(
+                    1, batch_gt_bboxes_3d[index].tensor.shape[-1])
+                batch_gt_bboxes_3d[index] = batch_gt_bboxes_3d[index].new_box(
+                    fake_box)
+                batch_gt_labels_3d[index] = batch_gt_labels_3d[
+                    index].new_zeros(1)
+                valid_gt_masks.append(batch_gt_labels_3d[index].new_zeros(1))
+                gt_num.append(1)
+            else:
+                valid_gt_masks.append(batch_gt_labels_3d[index].new_ones(
+                    batch_gt_labels_3d[index].shape))
+                gt_num.append(batch_gt_labels_3d[index].shape[0])
+
+        aggregated_points = [
+            feats_dict['aggregated_points'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        surface_center_pred = [
+            feats_dict['surface_center_pred'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        line_center_pred = [
+            feats_dict['pred_line_center'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        surface_center_object = [
+            feats_dict['surface_center_object'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        line_center_object = [
+            feats_dict['line_center_object'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        surface_sem_pred = [
+            feats_dict['surface_sem_pred'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        line_sem_pred = [
+            feats_dict['sem_cls_scores_line'][i]
+            for i in range(len(batch_gt_labels_3d))
+        ]
+
+        (cues_objectness_label, cues_sem_label, proposal_objectness_label,
+         cues_mask, cues_match_mask, proposal_objectness_mask,
+         cues_matching_label, obj_surface_line_center) = multi_apply(
+             self._get_targets_single, points, batch_gt_bboxes_3d,
+             batch_gt_labels_3d, aggregated_points, surface_center_pred,
+             line_center_pred, surface_center_object, line_center_object,
+             surface_sem_pred, line_sem_pred)
+
+        cues_objectness_label = torch.stack(cues_objectness_label)
+        cues_sem_label = torch.stack(cues_sem_label)
+        proposal_objectness_label = torch.stack(proposal_objectness_label)
+        cues_mask = torch.stack(cues_mask)
+        cues_match_mask = torch.stack(cues_match_mask)
+        proposal_objectness_mask = torch.stack(proposal_objectness_mask)
+        cues_matching_label = torch.stack(cues_matching_label)
+        obj_surface_line_center = torch.stack(obj_surface_line_center)
+
+        return (cues_objectness_label, cues_sem_label,
+                proposal_objectness_label, cues_mask, cues_match_mask,
+                proposal_objectness_mask, cues_matching_label,
+                obj_surface_line_center)
+
+    def _get_targets_single(self,
+                            points: Tensor,
+                            gt_bboxes_3d: BaseInstance3DBoxes,
+                            gt_labels_3d: Tensor,
+                            aggregated_points: Optional[Tensor] = None,
+                            pred_surface_center: Optional[Tensor] = None,
+                            pred_line_center: Optional[Tensor] = None,
+                            pred_obj_surface_center: Optional[Tensor] = None,
+                            pred_obj_line_center: Optional[Tensor] = None,
+                            pred_surface_sem: Optional[Tensor] = None,
+                            pred_line_sem: Optional[Tensor] = None):
+        """Generate targets for primitive cues for single batch.
+
+        Args:
+            points (torch.Tensor): Points of each batch.
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
+                boxes of each batch.
+            gt_labels_3d (torch.Tensor): Labels of each batch.
+            aggregated_points (torch.Tensor): Aggregated points from
+                vote aggregation layer.
+            pred_surface_center (torch.Tensor): Prediction of surface center.
+            pred_line_center (torch.Tensor): Prediction of line center.
+            pred_obj_surface_center (torch.Tensor): Objectness prediction
+                of surface center.
+            pred_obj_line_center (torch.Tensor): Objectness prediction of
+                line center.
+            pred_surface_sem (torch.Tensor): Semantic prediction of
+                surface center.
+            pred_line_sem (torch.Tensor): Semantic prediction of line center.
+        Returns:
+            tuple[torch.Tensor]: Targets for primitive cues.
+        """
+        device = points.device
+        gt_bboxes_3d = gt_bboxes_3d.to(device)
+        num_proposals = aggregated_points.shape[0]
+        gt_center = gt_bboxes_3d.gravity_center
+
+        dist1, dist2, ind1, _ = chamfer_distance(
+            aggregated_points.unsqueeze(0),
+            gt_center.unsqueeze(0),
+            reduction='none')
+        # Set assignment
+        object_assignment = ind1.squeeze(0)
+
+        # Generate objectness label and mask
+        # objectness_label: 1 if pred object center is within
+        # self.train_cfg['near_threshold'] of any GT object
+        # objectness_mask: 0 if pred object center is in gray
+        # zone (DONOTCARE), 1 otherwise
+        euclidean_dist1 = torch.sqrt(dist1.squeeze(0) + 1e-6)
+        proposal_objectness_label = euclidean_dist1.new_zeros(
+            num_proposals, dtype=torch.long)
+        proposal_objectness_mask = euclidean_dist1.new_zeros(num_proposals)
+
+        gt_sem = gt_labels_3d[object_assignment]
+
+        obj_surface_center, obj_line_center = \
+            gt_bboxes_3d.get_surface_line_center()
+        obj_surface_center = obj_surface_center.reshape(-1, 6,
+                                                        3).transpose(0, 1)
+        obj_line_center = obj_line_center.reshape(-1, 12, 3).transpose(0, 1)
+        obj_surface_center = obj_surface_center[:, object_assignment].reshape(
+            1, -1, 3)
+        obj_line_center = obj_line_center[:,
+                                          object_assignment].reshape(1, -1, 3)
+
+        surface_sem = torch.argmax(pred_surface_sem, dim=1).float()
+        line_sem = torch.argmax(pred_line_sem, dim=1).float()
+
+        dist_surface, _, surface_ind, _ = chamfer_distance(
+            obj_surface_center,
+            pred_surface_center.unsqueeze(0),
+            reduction='none')
+        dist_line, _, line_ind, _ = chamfer_distance(
+            obj_line_center, pred_line_center.unsqueeze(0), reduction='none')
+
+        surface_sel = pred_surface_center[surface_ind.squeeze(0)]
+        line_sel = pred_line_center[line_ind.squeeze(0)]
+        surface_sel_sem = surface_sem[surface_ind.squeeze(0)]
+        line_sel_sem = line_sem[line_ind.squeeze(0)]
+
+        surface_sel_sem_gt = gt_sem.repeat(6).float()
+        line_sel_sem_gt = gt_sem.repeat(12).float()
+
+        euclidean_dist_surface = torch.sqrt(dist_surface.squeeze(0) + 1e-6)
+        euclidean_dist_line = torch.sqrt(dist_line.squeeze(0) + 1e-6)
+        objectness_label_surface = euclidean_dist_line.new_zeros(
+            num_proposals * 6, dtype=torch.long)
+
+        objectness_label_line = euclidean_dist_line.new_zeros(
+            num_proposals * 12, dtype=torch.long)
+
+        objectness_label_surface_sem = euclidean_dist_line.new_zeros(
+            num_proposals * 6, dtype=torch.long)
+        objectness_label_line_sem = euclidean_dist_line.new_zeros(
+            num_proposals * 12, dtype=torch.long)
+
+        euclidean_dist_obj_surface = torch.sqrt((
+            (pred_obj_surface_center - surface_sel)**2).sum(dim=-1) + 1e-6)
+        euclidean_dist_obj_line = torch.sqrt(
+            torch.sum((pred_obj_line_center - line_sel)**2, dim=-1) + 1e-6)
+
+        # Objectness score just with centers
+        proposal_objectness_label[
+            euclidean_dist1 < self.train_cfg['near_threshold']] = 1
+        proposal_objectness_mask[
+            euclidean_dist1 < self.train_cfg['near_threshold']] = 1
+        proposal_objectness_mask[
+            euclidean_dist1 > self.train_cfg['far_threshold']] = 1
+
+        objectness_label_surface[
+            (euclidean_dist_obj_surface <
+             self.train_cfg['label_surface_threshold']) *
+            (euclidean_dist_surface <
+             self.train_cfg['mask_surface_threshold'])] = 1
+        objectness_label_surface_sem[
+            (euclidean_dist_obj_surface <
+             self.train_cfg['label_surface_threshold']) *
+            (euclidean_dist_surface < self.train_cfg['mask_surface_threshold'])
+            * (surface_sel_sem == surface_sel_sem_gt)] = 1
+
+        objectness_label_line[
+            (euclidean_dist_obj_line < self.train_cfg['label_line_threshold'])
+            *
+            (euclidean_dist_line < self.train_cfg['mask_line_threshold'])] = 1
+        objectness_label_line_sem[
+            (euclidean_dist_obj_line < self.train_cfg['label_line_threshold'])
+            * (euclidean_dist_line < self.train_cfg['mask_line_threshold']) *
+            (line_sel_sem == line_sel_sem_gt)] = 1
+
+        objectness_label_surface_obj = proposal_objectness_label.repeat(6)
+        objectness_mask_surface_obj = proposal_objectness_mask.repeat(6)
+        objectness_label_line_obj = proposal_objectness_label.repeat(12)
+        objectness_mask_line_obj = proposal_objectness_mask.repeat(12)
+
+        objectness_mask_surface = objectness_mask_surface_obj
+        objectness_mask_line = objectness_mask_line_obj
+
+        cues_objectness_label = torch.cat(
+            (objectness_label_surface, objectness_label_line), 0)
+        cues_sem_label = torch.cat(
+            (objectness_label_surface_sem, objectness_label_line_sem), 0)
+        cues_mask = torch.cat((objectness_mask_surface, objectness_mask_line),
+                              0)
+
+        objectness_label_surface *= objectness_label_surface_obj
+        objectness_label_line *= objectness_label_line_obj
+        cues_matching_label = torch.cat(
+            (objectness_label_surface, objectness_label_line), 0)
+
+        objectness_label_surface_sem *= objectness_label_surface_obj
+        objectness_label_line_sem *= objectness_label_line_obj
+
+        cues_match_mask = (torch.sum(
+            cues_objectness_label.view(18, num_proposals), dim=0) >=
+                           1).float()
+
+        obj_surface_line_center = torch.cat(
+            (obj_surface_center, obj_line_center), 1).squeeze(0)
+
+        return (cues_objectness_label, cues_sem_label,
+                proposal_objectness_label, cues_mask, cues_match_mask,
+                proposal_objectness_mask, cues_matching_label,
+                obj_surface_line_center)
diff --git a/mmde/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py b/mmde/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a21e401aa88a2524727f7da32315536660fb0c0
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py
@@ -0,0 +1,658 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+from mmcv.cnn import ConvModule
+from mmdet.models.utils import multi_apply
+from mmengine.model import normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.models import make_sparse_convmodule
+from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+from mmdet3d.utils.typing_utils import InstanceList
+
+if IS_SPCONV2_AVAILABLE:
+    from spconv.pytorch import (SparseConvTensor, SparseMaxPool3d,
+                                SparseSequential)
+else:
+    from mmcv.ops import SparseConvTensor, SparseMaxPool3d, SparseSequential
+
+from mmengine.model import BaseModule
+from torch import nn as nn
+
+from mmdet3d.models.layers import nms_bev, nms_normal_bev
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures.bbox_3d import (LiDARInstance3DBoxes,
+                                        rotation_3d_in_axis, xywhr2xyxyr)
+from mmdet3d.utils.typing_utils import SamplingResultList
+
+
+@MODELS.register_module()
+class PartA2BboxHead(BaseModule):
+    """PartA2 RoI head.
+
+    Args:
+        num_classes (int): The number of classes to prediction.
+        seg_in_channels (int): Input channels of segmentation
+            convolution layer.
+        part_in_channels (int): Input channels of part convolution layer.
+        seg_conv_channels (list(int)): Out channels of each
+            segmentation convolution layer.
+        part_conv_channels (list(int)): Out channels of each
+            part convolution layer.
+        merge_conv_channels (list(int)): Out channels of each
+            feature merged convolution layer.
+        down_conv_channels (list(int)): Out channels of each
+            downsampled convolution layer.
+        shared_fc_channels (list(int)): Out channels of each shared fc layer.
+        cls_channels (list(int)): Out channels of each classification layer.
+        reg_channels (list(int)): Out channels of each regression layer.
+        dropout_ratio (float): Dropout ratio of classification and
+            regression layers.
+        roi_feat_size (int): The size of pooled roi features.
+        with_corner_loss (bool): Whether to use corner loss or not.
+        bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for box head.
+        conv_cfg (dict): Config dict of convolutional layers
+        norm_cfg (dict): Config dict of normalization layers
+        loss_bbox (dict): Config dict of box regression loss.
+        loss_cls (dict, optional): Config dict of classifacation loss.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 seg_in_channels: int,
+                 part_in_channels: int,
+                 seg_conv_channels: List[int] = None,
+                 part_conv_channels: List[int] = None,
+                 merge_conv_channels: List[int] = None,
+                 down_conv_channels: List[int] = None,
+                 shared_fc_channels: List[int] = None,
+                 cls_channels: List[int] = None,
+                 reg_channels: List[int] = None,
+                 dropout_ratio: float = 0.1,
+                 roi_feat_size: int = 14,
+                 with_corner_loss: bool = True,
+                 bbox_coder: dict = dict(type='DeltaXYZWLHRBBoxCoder'),
+                 conv_cfg: dict = dict(type='Conv1d'),
+                 norm_cfg: dict = dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 loss_bbox: dict = dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+                 loss_cls: dict = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     reduction='none',
+                     loss_weight=1.0),
+                 init_cfg: dict = None) -> None:
+        super(PartA2BboxHead, self).__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.with_corner_loss = with_corner_loss
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+
+        assert down_conv_channels[-1] == shared_fc_channels[0]
+
+        # init layers
+        part_channel_last = part_in_channels
+        part_conv = []
+        for i, channel in enumerate(part_conv_channels):
+            part_conv.append(
+                make_sparse_convmodule(
+                    part_channel_last,
+                    channel,
+                    3,
+                    padding=1,
+                    norm_cfg=norm_cfg,
+                    indice_key=f'rcnn_part{i}',
+                    conv_type='SubMConv3d'))
+            part_channel_last = channel
+        self.part_conv = SparseSequential(*part_conv)
+
+        seg_channel_last = seg_in_channels
+        seg_conv = []
+        for i, channel in enumerate(seg_conv_channels):
+            seg_conv.append(
+                make_sparse_convmodule(
+                    seg_channel_last,
+                    channel,
+                    3,
+                    padding=1,
+                    norm_cfg=norm_cfg,
+                    indice_key=f'rcnn_seg{i}',
+                    conv_type='SubMConv3d'))
+            seg_channel_last = channel
+        self.seg_conv = SparseSequential(*seg_conv)
+
+        self.conv_down = SparseSequential()
+
+        merge_conv_channel_last = part_channel_last + seg_channel_last
+        merge_conv = []
+        for i, channel in enumerate(merge_conv_channels):
+            merge_conv.append(
+                make_sparse_convmodule(
+                    merge_conv_channel_last,
+                    channel,
+                    3,
+                    padding=1,
+                    norm_cfg=norm_cfg,
+                    indice_key='rcnn_down0'))
+            merge_conv_channel_last = channel
+
+        down_conv_channel_last = merge_conv_channel_last
+        conv_down = []
+        for i, channel in enumerate(down_conv_channels):
+            conv_down.append(
+                make_sparse_convmodule(
+                    down_conv_channel_last,
+                    channel,
+                    3,
+                    padding=1,
+                    norm_cfg=norm_cfg,
+                    indice_key='rcnn_down1'))
+            down_conv_channel_last = channel
+
+        self.conv_down.add_module('merge_conv', SparseSequential(*merge_conv))
+        self.conv_down.add_module('max_pool3d',
+                                  SparseMaxPool3d(kernel_size=2, stride=2))
+        self.conv_down.add_module('down_conv', SparseSequential(*conv_down))
+
+        shared_fc_list = []
+        pool_size = roi_feat_size // 2
+        pre_channel = shared_fc_channels[0] * pool_size**3
+        for k in range(1, len(shared_fc_channels)):
+            shared_fc_list.append(
+                ConvModule(
+                    pre_channel,
+                    shared_fc_channels[k],
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    inplace=True))
+            pre_channel = shared_fc_channels[k]
+
+            if k != len(shared_fc_channels) - 1 and dropout_ratio > 0:
+                shared_fc_list.append(nn.Dropout(dropout_ratio))
+
+        self.shared_fc = nn.Sequential(*shared_fc_list)
+
+        # Classification layer
+        channel_in = shared_fc_channels[-1]
+        cls_channel = 1
+        cls_layers = []
+        pre_channel = channel_in
+        for k in range(0, len(cls_channels)):
+            cls_layers.append(
+                ConvModule(
+                    pre_channel,
+                    cls_channels[k],
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    inplace=True))
+            pre_channel = cls_channels[k]
+        cls_layers.append(
+            ConvModule(
+                pre_channel,
+                cls_channel,
+                1,
+                padding=0,
+                conv_cfg=conv_cfg,
+                act_cfg=None))
+        if dropout_ratio >= 0:
+            cls_layers.insert(1, nn.Dropout(dropout_ratio))
+
+        self.conv_cls = nn.Sequential(*cls_layers)
+
+        # Regression layer
+        reg_layers = []
+        pre_channel = channel_in
+        for k in range(0, len(reg_channels)):
+            reg_layers.append(
+                ConvModule(
+                    pre_channel,
+                    reg_channels[k],
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    inplace=True))
+            pre_channel = reg_channels[k]
+        reg_layers.append(
+            ConvModule(
+                pre_channel,
+                self.bbox_coder.code_size,
+                1,
+                padding=0,
+                conv_cfg=conv_cfg,
+                act_cfg=None))
+        if dropout_ratio >= 0:
+            reg_layers.insert(1, nn.Dropout(dropout_ratio))
+
+        self.conv_reg = nn.Sequential(*reg_layers)
+
+        if init_cfg is None:
+            self.init_cfg = dict(
+                type='Xavier',
+                layer=['Conv2d', 'Conv1d'],
+                distribution='uniform')
+
+    def init_weights(self):
+        super().init_weights()
+        normal_init(self.conv_reg[-1].conv, mean=0, std=0.001)
+
+    def forward(self, seg_feats: Tensor, part_feats: Tensor) -> Tuple[Tensor]:
+        """Forward pass.
+
+        Args:
+            seg_feats (torch.Tensor): Point-wise semantic features.
+            part_feats (torch.Tensor): Point-wise part prediction features.
+
+        Returns:
+            tuple[torch.Tensor]: Score of class and bbox predictions.
+        """
+        # (B * N, out_x, out_y, out_z, 4)
+        rcnn_batch_size = part_feats.shape[0]
+
+        # transform to sparse tensors
+        sparse_shape = part_feats.shape[1:4]
+        # (non_empty_num, 4) ==> [bs_idx, x_idx, y_idx, z_idx]
+        sparse_idx = part_feats.sum(dim=-1).nonzero(as_tuple=False)
+
+        part_features = part_feats[sparse_idx[:, 0], sparse_idx[:, 1],
+                                   sparse_idx[:, 2], sparse_idx[:, 3]]
+        seg_features = seg_feats[sparse_idx[:, 0], sparse_idx[:, 1],
+                                 sparse_idx[:, 2], sparse_idx[:, 3]]
+        coords = sparse_idx.int().contiguous()
+        part_features = SparseConvTensor(part_features, coords, sparse_shape,
+                                         rcnn_batch_size)
+        seg_features = SparseConvTensor(seg_features, coords, sparse_shape,
+                                        rcnn_batch_size)
+
+        # forward rcnn network
+        x_part = self.part_conv(part_features)
+        x_rpn = self.seg_conv(seg_features)
+
+        merged_feature = torch.cat((x_rpn.features, x_part.features),
+                                   dim=1)  # (N, C)
+        shared_feature = SparseConvTensor(merged_feature, coords, sparse_shape,
+                                          rcnn_batch_size)
+
+        x = self.conv_down(shared_feature)
+
+        shared_feature = x.dense().view(rcnn_batch_size, -1, 1)
+
+        shared_feature = self.shared_fc(shared_feature)
+
+        cls_score = self.conv_cls(shared_feature).transpose(
+            1, 2).contiguous().squeeze(dim=1)  # (B, 1)
+        bbox_pred = self.conv_reg(shared_feature).transpose(
+            1, 2).contiguous().squeeze(dim=1)  # (B, C)
+
+        return cls_score, bbox_pred
+
+    def loss(self, cls_score: Tensor, bbox_pred: Tensor, rois: Tensor,
+             labels: Tensor, bbox_targets: Tensor, pos_gt_bboxes: Tensor,
+             reg_mask: Tensor, label_weights: Tensor,
+             bbox_weights: Tensor) -> Dict:
+        """Computing losses.
+
+        Args:
+            cls_score (torch.Tensor): Scores of each roi.
+            bbox_pred (torch.Tensor): Predictions of bboxes.
+            rois (torch.Tensor): Roi bboxes.
+            labels (torch.Tensor): Labels of class.
+            bbox_targets (torch.Tensor): Target of positive bboxes.
+            pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes.
+            reg_mask (torch.Tensor): Mask for positive bboxes.
+            label_weights (torch.Tensor): Weights of class loss.
+            bbox_weights (torch.Tensor): Weights of bbox loss.
+
+        Returns:
+            dict: Computed losses.
+
+            - loss_cls (torch.Tensor): Loss of classes.
+            - loss_bbox (torch.Tensor): Loss of bboxes.
+            - loss_corner (torch.Tensor): Loss of corners.
+        """
+        losses = dict()
+        rcnn_batch_size = cls_score.shape[0]
+
+        # calculate class loss
+        cls_flat = cls_score.view(-1)
+        loss_cls = self.loss_cls(cls_flat, labels, label_weights)
+        losses['loss_cls'] = loss_cls
+
+        # calculate regression loss
+        code_size = self.bbox_coder.code_size
+        pos_inds = (reg_mask > 0)
+        if pos_inds.any() == 0:
+            # fake a part loss
+            losses['loss_bbox'] = loss_cls.new_tensor(0) * loss_cls.sum()
+            if self.with_corner_loss:
+                losses['loss_corner'] = loss_cls.new_tensor(0) * loss_cls.sum()
+        else:
+            pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds]
+            bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat(
+                1, pos_bbox_pred.shape[-1])
+            loss_bbox = self.loss_bbox(
+                pos_bbox_pred.unsqueeze(dim=0), bbox_targets.unsqueeze(dim=0),
+                bbox_weights_flat.unsqueeze(dim=0))
+            losses['loss_bbox'] = loss_bbox
+
+            if self.with_corner_loss:
+                pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds]
+                pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size)
+                batch_anchors = pos_roi_boxes3d.clone().detach()
+                pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1)
+                roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3)
+                batch_anchors[..., 0:3] = 0
+                # decode boxes
+                pred_boxes3d = self.bbox_coder.decode(
+                    batch_anchors,
+                    pos_bbox_pred.view(-1, code_size)).view(-1, code_size)
+
+                pred_boxes3d[..., 0:3] = rotation_3d_in_axis(
+                    pred_boxes3d[..., 0:3].unsqueeze(1),
+                    pos_rois_rotation,
+                    axis=2).squeeze(1)
+
+                pred_boxes3d[:, 0:3] += roi_xyz
+
+                # calculate corner loss
+                loss_corner = self.get_corner_loss_lidar(
+                    pred_boxes3d, pos_gt_bboxes)
+                losses['loss_corner'] = loss_corner
+
+        return losses
+
+    def get_targets(self,
+                    sampling_results: SamplingResultList,
+                    rcnn_train_cfg: dict,
+                    concat: bool = True) -> Tuple[Tensor]:
+        """Generate targets.
+
+        Args:
+            sampling_results (list[:obj:`SamplingResult`]):
+                Sampled results from rois.
+            rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn.
+            concat (bool): Whether to concatenate targets between batches.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of boxes and class prediction.
+        """
+        pos_bboxes_list = [res.pos_bboxes for res in sampling_results]
+        pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
+        iou_list = [res.iou for res in sampling_results]
+        targets = multi_apply(
+            self._get_target_single,
+            pos_bboxes_list,
+            pos_gt_bboxes_list,
+            iou_list,
+            cfg=rcnn_train_cfg)
+
+        (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+         bbox_weights) = targets
+
+        if concat:
+            label = torch.cat(label, 0)
+            bbox_targets = torch.cat(bbox_targets, 0)
+            pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0)
+            reg_mask = torch.cat(reg_mask, 0)
+
+            label_weights = torch.cat(label_weights, 0)
+            label_weights /= torch.clamp(label_weights.sum(), min=1.0)
+
+            bbox_weights = torch.cat(bbox_weights, 0)
+            bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0)
+
+        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+                bbox_weights)
+
+    def _get_target_single(self, pos_bboxes: Tensor, pos_gt_bboxes: Tensor,
+                           ious: Tensor, cfg: dict) -> Tuple[Tensor]:
+        """Generate training targets for a single sample.
+
+        Args:
+            pos_bboxes (torch.Tensor): Positive boxes with shape
+                (N, 7).
+            pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape
+                (M, 7).
+            ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes`
+                in shape (N, M).
+            cfg (dict): Training configs.
+
+        Returns:
+            tuple[torch.Tensor]: Target for positive boxes.
+                (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+                bbox_weights)
+        """
+        cls_pos_mask = ious > cfg.cls_pos_thr
+        cls_neg_mask = ious < cfg.cls_neg_thr
+        interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0)
+
+        # iou regression target
+        label = (cls_pos_mask > 0).float()
+        label[interval_mask] = ious[interval_mask] * 2 - 0.5
+        # label weights
+        label_weights = (label >= 0).float()
+
+        # box regression target
+        reg_mask = pos_bboxes.new_zeros(ious.size(0)).long()
+        reg_mask[0:pos_gt_bboxes.size(0)] = 1
+        bbox_weights = (reg_mask > 0).float()
+        if reg_mask.bool().any():
+            pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach()
+            roi_center = pos_bboxes[..., 0:3]
+            roi_ry = pos_bboxes[..., 6] % (2 * np.pi)
+
+            # canonical transformation
+            pos_gt_bboxes_ct[..., 0:3] -= roi_center
+            pos_gt_bboxes_ct[..., 6] -= roi_ry
+            pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis(
+                pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -roi_ry,
+                axis=2).squeeze(1)
+
+            # flip orientation if rois have opposite orientation
+            ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi)  # 0 ~ 2pi
+            opposite_flag = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5)
+            ry_label[opposite_flag] = (ry_label[opposite_flag] + np.pi) % (
+                2 * np.pi)  # (0 ~ pi/2, 3pi/2 ~ 2pi)
+            flag = ry_label > np.pi
+            ry_label[flag] = ry_label[flag] - np.pi * 2  # (-pi/2, pi/2)
+            ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2)
+            pos_gt_bboxes_ct[..., 6] = ry_label
+
+            rois_anchor = pos_bboxes.clone().detach()
+            rois_anchor[:, 0:3] = 0
+            rois_anchor[:, 6] = 0
+            bbox_targets = self.bbox_coder.encode(rois_anchor,
+                                                  pos_gt_bboxes_ct)
+        else:
+            # no fg bbox
+            bbox_targets = pos_gt_bboxes.new_empty((0, 7))
+
+        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+                bbox_weights)
+
+    def get_corner_loss_lidar(self,
+                              pred_bbox3d: Tensor,
+                              gt_bbox3d: Tensor,
+                              delta: float = 1.0) -> Tensor:
+        """Calculate corner loss of given boxes.
+
+        Args:
+            pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7).
+            gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7).
+            delta (float, optional): huber loss threshold. Defaults to 1.0
+
+        Returns:
+            torch.FloatTensor: Calculated corner loss in shape (N).
+        """
+        assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0]
+
+        # This is a little bit hack here because we assume the box for
+        # Part-A2 is in LiDAR coordinates
+        gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d)
+        pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners
+        gt_box_corners = gt_boxes_structure.corners
+
+        # This flip only changes the heading direction of GT boxes
+        gt_bbox3d_flip = gt_boxes_structure.clone()
+        gt_bbox3d_flip.tensor[:, 6] += np.pi
+        gt_box_corners_flip = gt_bbox3d_flip.corners
+
+        corner_dist = torch.min(
+            torch.norm(pred_box_corners - gt_box_corners, dim=2),
+            torch.norm(pred_box_corners - gt_box_corners_flip,
+                       dim=2))  # (N, 8)
+        # huber loss
+        abs_error = corner_dist.abs()
+        quadratic = abs_error.clamp(max=delta)
+        linear = (abs_error - quadratic)
+        corner_loss = 0.5 * quadratic**2 + delta * linear
+
+        return corner_loss.mean(dim=1)
+
+    def get_results(self,
+                    rois: Tensor,
+                    cls_score: Tensor,
+                    bbox_pred: Tensor,
+                    class_labels: Tensor,
+                    class_pred: Tensor,
+                    input_metas: List[dict],
+                    cfg: dict = None) -> InstanceList:
+        """Generate bboxes from bbox head predictions.
+
+        Args:
+            rois (torch.Tensor): Roi bounding boxes.
+            cls_score (torch.Tensor): Scores of bounding boxes.
+            bbox_pred (torch.Tensor): Bounding boxes predictions
+            class_labels (torch.Tensor): Label of classes
+            class_pred (torch.Tensor): Score for nms.
+            input_metas (list[dict]): Point cloud and image's meta info.
+            cfg (:obj:`ConfigDict`): Testing config.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        roi_batch_id = rois[..., 0]
+        roi_boxes = rois[..., 1:]  # boxes without batch id
+        batch_size = int(roi_batch_id.max().item() + 1)
+
+        # decode boxes
+        roi_ry = roi_boxes[..., 6].view(-1)
+        roi_xyz = roi_boxes[..., 0:3].view(-1, 3)
+        local_roi_boxes = roi_boxes.clone().detach()
+        local_roi_boxes[..., 0:3] = 0
+        rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred)
+        rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis(
+            rcnn_boxes3d[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1)
+        rcnn_boxes3d[:, 0:3] += roi_xyz
+
+        # post processing
+        result_list = []
+        for batch_id in range(batch_size):
+            cur_class_labels = class_labels[batch_id]
+            cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1)
+
+            cur_box_prob = class_pred[batch_id]
+            cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id]
+            keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d,
+                                        cfg.score_thr, cfg.nms_thr,
+                                        input_metas[batch_id],
+                                        cfg.use_rotate_nms)
+            selected_bboxes = cur_rcnn_boxes3d[keep]
+            selected_label_preds = cur_class_labels[keep]
+            selected_scores = cur_cls_score[keep]
+
+            results = InstanceData()
+            results.bboxes_3d = input_metas[batch_id]['box_type_3d'](
+                selected_bboxes, self.bbox_coder.code_size)
+            results.scores_3d = selected_scores
+            results.labels_3d = selected_label_preds
+
+            result_list.append(results)
+        return result_list
+
+    def multi_class_nms(self,
+                        box_probs: Tensor,
+                        box_preds: Tensor,
+                        score_thr: float,
+                        nms_thr: float,
+                        input_meta: dict,
+                        use_rotate_nms: bool = True) -> Tensor:
+        """Multi-class NMS for box head.
+
+        Note:
+            This function has large overlap with the `box3d_multiclass_nms`
+            implemented in `mmdet3d.core.post_processing`. We are considering
+            merging these two functions in the future.
+
+        Args:
+            box_probs (torch.Tensor): Predicted boxes probabitilies in
+                shape (N,).
+            box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C).
+            score_thr (float): Threshold of scores.
+            nms_thr (float): Threshold for NMS.
+            input_meta (dict): Meta information of the current sample.
+            use_rotate_nms (bool, optional): Whether to use rotated nms.
+                Defaults to True.
+
+        Returns:
+            torch.Tensor: Selected indices.
+        """
+        if use_rotate_nms:
+            nms_func = nms_bev
+        else:
+            nms_func = nms_normal_bev
+
+        assert box_probs.shape[
+            1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}'
+        selected_list = []
+        selected_labels = []
+        boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
+            box_preds, self.bbox_coder.code_size).bev)
+
+        score_thresh = score_thr if isinstance(
+            score_thr, list) else [score_thr for x in range(self.num_classes)]
+        nms_thresh = nms_thr if isinstance(
+            nms_thr, list) else [nms_thr for x in range(self.num_classes)]
+        for k in range(0, self.num_classes):
+            class_scores_keep = box_probs[:, k] >= score_thresh[k]
+
+            if class_scores_keep.int().sum() > 0:
+                original_idxs = class_scores_keep.nonzero(
+                    as_tuple=False).view(-1)
+                cur_boxes_for_nms = boxes_for_nms[class_scores_keep]
+                cur_rank_scores = box_probs[class_scores_keep, k]
+
+                cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores,
+                                        nms_thresh[k])
+
+                if cur_selected.shape[0] == 0:
+                    continue
+                selected_list.append(original_idxs[cur_selected])
+                selected_labels.append(
+                    torch.full([cur_selected.shape[0]],
+                               k + 1,
+                               dtype=torch.int64,
+                               device=box_preds.device))
+
+        keep = torch.cat(
+            selected_list, dim=0) if len(selected_list) > 0 else []
+        return keep
diff --git a/mmde/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py b/mmde/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef80e17ddd19055ba5788d58d51739237800260d
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py
@@ -0,0 +1,604 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.cnn.bricks import build_conv_layer
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.models.layers import nms_bev, nms_normal_bev
+from mmdet3d.models.layers.pointnet_modules import build_sa_module
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures.bbox_3d import (LiDARInstance3DBoxes,
+                                        rotation_3d_in_axis, xywhr2xyxyr)
+from mmdet3d.utils.typing_utils import InstanceList, SamplingResultList
+
+
+@MODELS.register_module()
+class PointRCNNBboxHead(BaseModule):
+    """PointRCNN RoI Bbox head.
+
+    Args:
+        num_classes (int): The number of classes to prediction.
+        in_channels (int)： Input channels of point features.
+        mlp_channels (list[int]): the number of mlp channels
+        pred_layer_cfg (dict, optional): Config of classfication and
+            regression prediction layers. Defaults to None.
+        num_points (tuple): The number of points which each SA
+            module samples. Defaults to (128, 32, -1).
+        radius (tuple): Sampling radius of each SA module.
+            Defaults to (0.2, 0.4, 100).
+        num_samples (tuple): The number of samples for ball query
+            in each SA module. Defaults to (64, 64, 64).
+        sa_channels (tuple): Out channels of each mlp in SA module.
+            Defaults to ((128, 128, 128), (128, 128, 256), (256, 256, 512)).
+        bbox_coder (dict): Config dict of box coders.
+            Defaults to dict(type='DeltaXYZWLHRBBoxCoder').
+        sa_cfg (dict): Config of set abstraction module, which may
+            contain the following keys and values:
+
+            - pool_mod (str): Pool method ('max' or 'avg') for SA modules.
+            - use_xyz (bool): Whether to use xyz as a part of features.
+            - normalize_xyz (bool): Whether to normalize xyz with radii in
+              each SA module.
+            Defaults to dict(type='PointSAModule', pool_mod='max',
+                use_xyz=True).
+        conv_cfg (dict): Config dict of convolutional layers.
+             Defaults to dict(type='Conv1d').
+        norm_cfg (dict): Config dict of normalization layers.
+             Defaults to dict(type='BN1d').
+        act_cfg (dict): Config dict of activation layers.
+            Defaults to dict(type='ReLU').
+        bias (str): Type of bias. Defaults to 'auto'.
+        loss_bbox (dict): Config of regression loss function.
+            Defaults to dict(type='SmoothL1Loss', beta=1.0 / 9.0,
+                reduction='sum', loss_weight=1.0).
+        loss_cls (dict): Config of classification loss function.
+             Defaults to dict(type='CrossEntropyLoss', use_sigmoid=True,
+                reduction='sum', loss_weight=1.0).
+        with_corner_loss (bool): Whether using corner loss.
+            Defaults to True.
+        init_cfg (dict, optional): Config of initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: dict,
+                 in_channels: dict,
+                 mlp_channels: dict,
+                 pred_layer_cfg: Optional[dict] = None,
+                 num_points: dict = (128, 32, -1),
+                 radius: dict = (0.2, 0.4, 100),
+                 num_samples: dict = (64, 64, 64),
+                 sa_channels: dict = ((128, 128, 128), (128, 128, 256),
+                                      (256, 256, 512)),
+                 bbox_coder: dict = dict(type='DeltaXYZWLHRBBoxCoder'),
+                 sa_cfg: dict = dict(
+                     type='PointSAModule', pool_mod='max', use_xyz=True),
+                 conv_cfg: dict = dict(type='Conv1d'),
+                 norm_cfg: dict = dict(type='BN1d'),
+                 act_cfg: dict = dict(type='ReLU'),
+                 bias: str = 'auto',
+                 loss_bbox: dict = dict(
+                     type='SmoothL1Loss',
+                     beta=1.0 / 9.0,
+                     reduction='sum',
+                     loss_weight=1.0),
+                 loss_cls: dict = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     reduction='sum',
+                     loss_weight=1.0),
+                 with_corner_loss: bool = True,
+                 init_cfg: Optional[dict] = None) -> None:
+        super(PointRCNNBboxHead, self).__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.num_sa = len(sa_channels)
+        self.with_corner_loss = with_corner_loss
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.bias = bias
+
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+
+        self.in_channels = in_channels
+        mlp_channels = [self.in_channels] + mlp_channels
+        shared_mlps = nn.Sequential()
+        for i in range(len(mlp_channels) - 1):
+            shared_mlps.add_module(
+                f'layer{i}',
+                ConvModule(
+                    mlp_channels[i],
+                    mlp_channels[i + 1],
+                    kernel_size=(1, 1),
+                    stride=(1, 1),
+                    inplace=False,
+                    conv_cfg=dict(type='Conv2d')))
+        self.xyz_up_layer = nn.Sequential(*shared_mlps)
+
+        c_out = mlp_channels[-1]
+        self.merge_down_layer = ConvModule(
+            c_out * 2,
+            c_out,
+            kernel_size=(1, 1),
+            stride=(1, 1),
+            inplace=False,
+            conv_cfg=dict(type='Conv2d'))
+
+        pre_channels = c_out
+
+        self.SA_modules = nn.ModuleList()
+        sa_in_channel = pre_channels
+
+        for sa_index in range(self.num_sa):
+            cur_sa_mlps = list(sa_channels[sa_index])
+            cur_sa_mlps = [sa_in_channel] + cur_sa_mlps
+            sa_out_channel = cur_sa_mlps[-1]
+
+            cur_num_points = num_points[sa_index]
+            if cur_num_points <= 0:
+                cur_num_points = None
+            self.SA_modules.append(
+                build_sa_module(
+                    num_point=cur_num_points,
+                    radius=radius[sa_index],
+                    num_sample=num_samples[sa_index],
+                    mlp_channels=cur_sa_mlps,
+                    cfg=sa_cfg))
+            sa_in_channel = sa_out_channel
+        self.cls_convs = self._add_conv_branch(
+            pred_layer_cfg.in_channels, pred_layer_cfg.cls_conv_channels)
+        self.reg_convs = self._add_conv_branch(
+            pred_layer_cfg.in_channels, pred_layer_cfg.reg_conv_channels)
+
+        prev_channel = pred_layer_cfg.cls_conv_channels[-1]
+        self.conv_cls = build_conv_layer(
+            self.conv_cfg,
+            in_channels=prev_channel,
+            out_channels=self.num_classes,
+            kernel_size=1)
+        prev_channel = pred_layer_cfg.reg_conv_channels[-1]
+        self.conv_reg = build_conv_layer(
+            self.conv_cfg,
+            in_channels=prev_channel,
+            out_channels=self.bbox_coder.code_size * self.num_classes,
+            kernel_size=1)
+
+        if init_cfg is None:
+            self.init_cfg = dict(type='Xavier', layer=['Conv2d', 'Conv1d'])
+
+    def _add_conv_branch(self, in_channels: int,
+                         conv_channels: tuple) -> nn.Sequential:
+        """Add shared or separable branch.
+
+        Args:
+            in_channels (int): Input feature channel.
+            conv_channels (tuple): Middle feature channels.
+        """
+        conv_spec = [in_channels] + list(conv_channels)
+        # add branch specific conv layers
+        conv_layers = nn.Sequential()
+        for i in range(len(conv_spec) - 1):
+            conv_layers.add_module(
+                f'layer{i}',
+                ConvModule(
+                    conv_spec[i],
+                    conv_spec[i + 1],
+                    kernel_size=1,
+                    padding=0,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    bias=self.bias,
+                    inplace=True))
+        return conv_layers
+
+    def init_weights(self):
+        """Initialize weights of the head."""
+        super().init_weights()
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d):
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+        normal_init(self.conv_reg.weight, mean=0, std=0.001)
+
+    def forward(self, feats: Tensor) -> Tuple[Tensor]:
+        """Forward pass.
+
+        Args:
+            feats (torch.Torch): Features from RCNN modules.
+
+        Returns:
+            tuple[torch.Tensor]: Score of class and bbox predictions.
+        """
+        input_data = feats.clone().detach()
+        xyz_input = input_data[..., 0:self.in_channels].transpose(
+            1, 2).unsqueeze(dim=3).contiguous().clone().detach()
+        xyz_features = self.xyz_up_layer(xyz_input)
+        rpn_features = input_data[..., self.in_channels:].transpose(
+            1, 2).unsqueeze(dim=3)
+        merged_features = torch.cat((xyz_features, rpn_features), dim=1)
+        merged_features = self.merge_down_layer(merged_features)
+        l_xyz, l_features = [input_data[..., 0:3].contiguous()], \
+                            [merged_features.squeeze(dim=3)]
+        for i in range(len(self.SA_modules)):
+            li_xyz, li_features, cur_indices = \
+                self.SA_modules[i](l_xyz[i], l_features[i])
+            l_xyz.append(li_xyz)
+            l_features.append(li_features)
+
+        shared_features = l_features[-1]
+        x_cls = shared_features
+        x_reg = shared_features
+        x_cls = self.cls_convs(x_cls)
+        rcnn_cls = self.conv_cls(x_cls)
+        x_reg = self.reg_convs(x_reg)
+        rcnn_reg = self.conv_reg(x_reg)
+        rcnn_cls = rcnn_cls.transpose(1, 2).contiguous().squeeze(dim=1)
+        rcnn_reg = rcnn_reg.transpose(1, 2).contiguous().squeeze(dim=1)
+        return rcnn_cls, rcnn_reg
+
+    def loss(self, cls_score: Tensor, bbox_pred: Tensor, rois: Tensor,
+             labels: Tensor, bbox_targets: Tensor, pos_gt_bboxes: Tensor,
+             reg_mask: Tensor, label_weights: Tensor,
+             bbox_weights: Tensor) -> Dict:
+        """Computing losses.
+
+        Args:
+            cls_score (torch.Tensor): Scores of each RoI.
+            bbox_pred (torch.Tensor): Predictions of bboxes.
+            rois (torch.Tensor): RoI bboxes.
+            labels (torch.Tensor): Labels of class.
+            bbox_targets (torch.Tensor): Target of positive bboxes.
+            pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes.
+            reg_mask (torch.Tensor): Mask for positive bboxes.
+            label_weights (torch.Tensor): Weights of class loss.
+            bbox_weights (torch.Tensor): Weights of bbox loss.
+
+        Returns:
+            dict: Computed losses.
+
+                - loss_cls (torch.Tensor): Loss of classes.
+                - loss_bbox (torch.Tensor): Loss of bboxes.
+                - loss_corner (torch.Tensor): Loss of corners.
+        """
+        losses = dict()
+        rcnn_batch_size = cls_score.shape[0]
+        # calculate class loss
+        cls_flat = cls_score.view(-1)
+        loss_cls = self.loss_cls(cls_flat, labels, label_weights)
+        losses['loss_cls'] = loss_cls
+
+        # calculate regression loss
+        code_size = self.bbox_coder.code_size
+        pos_inds = (reg_mask > 0)
+
+        pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds].clone()
+        bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat(
+            1, pos_bbox_pred.shape[-1])
+        loss_bbox = self.loss_bbox(
+            pos_bbox_pred.unsqueeze(dim=0),
+            bbox_targets.unsqueeze(dim=0).detach(),
+            bbox_weights_flat.unsqueeze(dim=0))
+        losses['loss_bbox'] = loss_bbox
+
+        if pos_inds.any() != 0 and self.with_corner_loss:
+            rois = rois.detach()
+            pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds]
+            pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size)
+            batch_anchors = pos_roi_boxes3d.clone().detach()
+            pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1)
+            roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3)
+            batch_anchors[..., 0:3] = 0
+            # decode boxes
+            pred_boxes3d = self.bbox_coder.decode(
+                batch_anchors,
+                pos_bbox_pred.view(-1, code_size)).view(-1, code_size)
+
+            pred_boxes3d[..., 0:3] = rotation_3d_in_axis(
+                pred_boxes3d[..., 0:3].unsqueeze(1), (pos_rois_rotation),
+                axis=2).squeeze(1)
+
+            pred_boxes3d[:, 0:3] += roi_xyz
+
+            # calculate corner loss
+            loss_corner = self.get_corner_loss_lidar(pred_boxes3d,
+                                                     pos_gt_bboxes).mean()
+
+            losses['loss_corner'] = loss_corner
+        else:
+            losses['loss_corner'] = loss_cls.new_tensor(0) * loss_cls.sum()
+        return losses
+
+    def get_corner_loss_lidar(self,
+                              pred_bbox3d: Tensor,
+                              gt_bbox3d: Tensor,
+                              delta: float = 1.0) -> Tensor:
+        """Calculate corner loss of given boxes.
+
+        Args:
+            pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7).
+            gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7).
+            delta (float, optional): huber loss threshold. Defaults to 1.0
+
+        Returns:
+            torch.FloatTensor: Calculated corner loss in shape (N).
+        """
+        assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0]
+
+        # This is a little bit hack here because we assume the box for
+        # PointRCNN is in LiDAR coordinates
+
+        gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d)
+        pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners
+        gt_box_corners = gt_boxes_structure.corners
+
+        # This flip only changes the heading direction of GT boxes
+        gt_bbox3d_flip = gt_boxes_structure.clone()
+        gt_bbox3d_flip.tensor[:, 6] += np.pi
+        gt_box_corners_flip = gt_bbox3d_flip.corners
+
+        corner_dist = torch.min(
+            torch.norm(pred_box_corners - gt_box_corners, dim=2),
+            torch.norm(pred_box_corners - gt_box_corners_flip, dim=2))
+        # huber loss
+        abs_error = corner_dist.abs()
+        # quadratic = abs_error.clamp(max=delta)
+        # linear = (abs_error - quadratic)
+        # corner_loss = 0.5 * quadratic**2 + delta * linear
+        loss = torch.where(abs_error < delta, 0.5 * abs_error**2 / delta,
+                           abs_error - 0.5 * delta)
+        return loss.mean(dim=1)
+
+    def get_targets(self,
+                    sampling_results: SamplingResultList,
+                    rcnn_train_cfg: dict,
+                    concat: bool = True) -> Tuple[Tensor]:
+        """Generate targets.
+
+        Args:
+            sampling_results (list[:obj:`SamplingResult`]):
+                Sampled results from rois.
+            rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn.
+            concat (bool): Whether to concatenate targets between
+                batches. Defaults to True.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of boxes and class prediction.
+        """
+        pos_bboxes_list = [res.pos_bboxes for res in sampling_results]
+        pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
+        iou_list = [res.iou for res in sampling_results]
+        targets = multi_apply(
+            self._get_target_single,
+            pos_bboxes_list,
+            pos_gt_bboxes_list,
+            iou_list,
+            cfg=rcnn_train_cfg)
+        (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+         bbox_weights) = targets
+
+        if concat:
+            label = torch.cat(label, 0)
+            bbox_targets = torch.cat(bbox_targets, 0)
+            pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0)
+            reg_mask = torch.cat(reg_mask, 0)
+
+            label_weights = torch.cat(label_weights, 0)
+            label_weights /= torch.clamp(label_weights.sum(), min=1.0)
+
+            bbox_weights = torch.cat(bbox_weights, 0)
+            bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0)
+
+        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+                bbox_weights)
+
+    def _get_target_single(self, pos_bboxes: Tensor, pos_gt_bboxes: Tensor,
+                           ious: Tensor, cfg: dict) -> Tuple[Tensor]:
+        """Generate training targets for a single sample.
+
+        Args:
+            pos_bboxes (torch.Tensor): Positive boxes with shape
+                (N, 7).
+            pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape
+                (M, 7).
+            ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes`
+                in shape (N, M).
+            cfg (dict): Training configs.
+
+        Returns:
+            tuple[torch.Tensor]: Target for positive boxes.
+                (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+                bbox_weights)
+        """
+        cls_pos_mask = ious > cfg.cls_pos_thr
+        cls_neg_mask = ious < cfg.cls_neg_thr
+        interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0)
+        # iou regression target
+        label = (cls_pos_mask > 0).float()
+        label[interval_mask] = (ious[interval_mask] - cfg.cls_neg_thr) / \
+            (cfg.cls_pos_thr - cfg.cls_neg_thr)
+        # label weights
+        label_weights = (label >= 0).float()
+        # box regression target
+        reg_mask = pos_bboxes.new_zeros(ious.size(0)).long()
+        reg_mask[0:pos_gt_bboxes.size(0)] = 1
+        bbox_weights = (reg_mask > 0).float()
+        if reg_mask.bool().any():
+            pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach()
+            roi_center = pos_bboxes[..., 0:3]
+            roi_ry = pos_bboxes[..., 6] % (2 * np.pi)
+
+            # canonical transformation
+            pos_gt_bboxes_ct[..., 0:3] -= roi_center
+            pos_gt_bboxes_ct[..., 6] -= roi_ry
+            pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis(
+                pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -(roi_ry),
+                axis=2).squeeze(1)
+
+            # flip orientation if gt have opposite orientation
+            ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi)  # 0 ~ 2pi
+            is_opposite = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5)
+            ry_label[is_opposite] = (ry_label[is_opposite] + np.pi) % (
+                2 * np.pi)  # (0 ~ pi/2, 3pi/2 ~ 2pi)
+            flag = ry_label > np.pi
+            ry_label[flag] = ry_label[flag] - np.pi * 2  # (-pi/2, pi/2)
+            ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2)
+            pos_gt_bboxes_ct[..., 6] = ry_label
+
+            rois_anchor = pos_bboxes.clone().detach()
+            rois_anchor[:, 0:3] = 0
+            rois_anchor[:, 6] = 0
+            bbox_targets = self.bbox_coder.encode(rois_anchor,
+                                                  pos_gt_bboxes_ct)
+        else:
+            # no fg bbox
+            bbox_targets = pos_gt_bboxes.new_empty((0, 7))
+
+        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+                bbox_weights)
+
+    def get_results(self,
+                    rois: Tensor,
+                    cls_score: Tensor,
+                    bbox_pred: Tensor,
+                    class_labels: Tensor,
+                    input_metas: List[dict],
+                    cfg: dict = None) -> InstanceList:
+        """Generate bboxes from bbox head predictions.
+
+        Args:
+            rois (torch.Tensor): RoI bounding boxes.
+            cls_score (torch.Tensor): Scores of bounding boxes.
+            bbox_pred (torch.Tensor): Bounding boxes predictions
+            class_labels (torch.Tensor): Label of classes
+            input_metas (list[dict]): Point cloud and image's meta info.
+            cfg (:obj:`ConfigDict`, optional): Testing config.
+                Defaults to None.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        roi_batch_id = rois[..., 0]
+        roi_boxes = rois[..., 1:]  # boxes without batch id
+        batch_size = int(roi_batch_id.max().item() + 1)
+
+        # decode boxes
+        roi_ry = roi_boxes[..., 6].view(-1)
+        roi_xyz = roi_boxes[..., 0:3].view(-1, 3)
+        local_roi_boxes = roi_boxes.clone().detach()
+        local_roi_boxes[..., 0:3] = 0
+        rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred)
+        rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis(
+            rcnn_boxes3d[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1)
+        rcnn_boxes3d[:, 0:3] += roi_xyz
+
+        # post processing
+        result_list = []
+        for batch_id in range(batch_size):
+            cur_class_labels = class_labels[batch_id]
+            cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1)
+
+            cur_box_prob = cur_cls_score.unsqueeze(1)
+            cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id]
+            keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d,
+                                        cfg.score_thr, cfg.nms_thr,
+                                        input_metas[batch_id],
+                                        cfg.use_rotate_nms)
+            selected_bboxes = cur_rcnn_boxes3d[keep]
+            selected_label_preds = cur_class_labels[keep]
+            selected_scores = cur_cls_score[keep]
+            results = InstanceData()
+            results.bboxes_3d = input_metas[batch_id]['box_type_3d'](
+                selected_bboxes, selected_bboxes.shape[-1])
+            results.scores_3d = selected_scores
+            results.labels_3d = selected_label_preds
+
+            result_list.append(results)
+        return result_list
+
+    def multi_class_nms(self,
+                        box_probs: Tensor,
+                        box_preds: Tensor,
+                        score_thr: float,
+                        nms_thr: float,
+                        input_meta: dict,
+                        use_rotate_nms: bool = True) -> Tensor:
+        """Multi-class NMS for box head.
+
+        Note:
+            This function has large overlap with the `box3d_multiclass_nms`
+            implemented in `mmdet3d.core.post_processing`. We are considering
+            merging these two functions in the future.
+
+        Args:
+            box_probs (torch.Tensor): Predicted boxes probabilities in
+                shape (N,).
+            box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C).
+            score_thr (float): Threshold of scores.
+            nms_thr (float): Threshold for NMS.
+            input_meta (dict): Meta information of the current sample.
+            use_rotate_nms (bool): Whether to use rotated nms.
+                Defaults to True.
+
+        Returns:
+            torch.Tensor: Selected indices.
+        """
+        if use_rotate_nms:
+            nms_func = nms_bev
+        else:
+            nms_func = nms_normal_bev
+
+        assert box_probs.shape[
+            1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}'
+        selected_list = []
+        selected_labels = []
+        boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
+            box_preds, self.bbox_coder.code_size).bev)
+
+        score_thresh = score_thr if isinstance(
+            score_thr, list) else [score_thr for x in range(self.num_classes)]
+        nms_thresh = nms_thr if isinstance(
+            nms_thr, list) else [nms_thr for x in range(self.num_classes)]
+        for k in range(0, self.num_classes):
+            class_scores_keep = box_probs[:, k] >= score_thresh[k]
+
+            if class_scores_keep.int().sum() > 0:
+                original_idxs = class_scores_keep.nonzero(
+                    as_tuple=False).view(-1)
+                cur_boxes_for_nms = boxes_for_nms[class_scores_keep]
+                cur_rank_scores = box_probs[class_scores_keep, k]
+
+                cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores,
+                                        nms_thresh[k])
+
+                if cur_selected.shape[0] == 0:
+                    continue
+                selected_list.append(original_idxs[cur_selected])
+                selected_labels.append(
+                    torch.full([cur_selected.shape[0]],
+                               k + 1,
+                               dtype=torch.int64,
+                               device=box_preds.device))
+
+        keep = torch.cat(
+            selected_list, dim=0) if len(selected_list) > 0 else []
+        return keep
diff --git a/mmde/mmdet3d/models/roi_heads/bbox_heads/pv_rcnn_bbox_head.py b/mmde/mmdet3d/models/roi_heads/bbox_heads/pv_rcnn_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..abdaf79a3c7348920a37d06ddc61cac46702255d
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/bbox_heads/pv_rcnn_bbox_head.py
@@ -0,0 +1,509 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmcv.cnn import ConvModule
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import nn as nn
+
+from mmdet3d.models.layers import nms_bev, nms_normal_bev
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures.bbox_3d import (LiDARInstance3DBoxes,
+                                        rotation_3d_in_axis, xywhr2xyxyr)
+from mmdet3d.utils import InstanceList
+
+
+@MODELS.register_module()
+class PVRCNNBBoxHead(BaseModule):
+    """PVRCNN BBox head.
+
+    Args:
+        in_channels (int): The number of input channel.
+        grid_size (int): The number of grid points in roi bbox.
+        num_classes (int): The number of classes.
+        class_agnostic (bool): Whether generate class agnostic prediction.
+            Defaults to True.
+        shared_fc_channels (tuple(int)): Out channels of each shared fc layer.
+            Defaults to (256, 256).
+        cls_channels (tuple(int)): Out channels of each classification layer.
+            Defaults to (256, 256).
+        reg_channels (tuple(int)): Out channels of each regression layer.
+            Defaults to (256, 256).
+        dropout_ratio (float): Ratio of dropout layer. Defaults to 0.5.
+        with_corner_loss (bool): Whether to use corner loss or not.
+            Defaults to True.
+        bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for box head.
+            Defaults to dict(type='DeltaXYZWLHRBBoxCoder').
+        norm_cfg (dict): Type of normalization method.
+            Defaults to dict(type='BN1d', eps=1e-5, momentum=0.1)
+        loss_bbox (dict): Config dict of box regression loss.
+        loss_cls (dict): Config dict of classifacation loss.
+        init_cfg (dict, optional): Initialize config of
+            model.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        grid_size: int,
+        num_classes: int,
+        class_agnostic: bool = True,
+        shared_fc_channels: Tuple[int] = (256, 256),
+        cls_channels: Tuple[int] = (256, 256),
+        reg_channels: Tuple[int] = (256, 256),
+        dropout_ratio: float = 0.3,
+        with_corner_loss: bool = True,
+        bbox_coder: dict = dict(type='DeltaXYZWLHRBBoxCoder'),
+        norm_cfg: dict = dict(type='BN2d', eps=1e-5, momentum=0.1),
+        loss_bbox: dict = dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_cls: dict = dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='none',
+            loss_weight=1.0),
+        init_cfg: Optional[dict] = dict(
+            type='Xavier', layer=['Conv2d', 'Conv1d'], distribution='uniform')
+    ) -> None:
+        super(PVRCNNBBoxHead, self).__init__(init_cfg=init_cfg)
+        self.init_cfg = init_cfg
+        self.num_classes = num_classes
+        self.with_corner_loss = with_corner_loss
+        self.class_agnostic = class_agnostic
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+
+        cls_out_channels = 1 if class_agnostic else num_classes
+        self.reg_out_channels = self.bbox_coder.code_size * cls_out_channels
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = cls_out_channels
+        else:
+            self.cls_out_channels = cls_out_channels + 1
+
+        self.dropout_ratio = dropout_ratio
+        self.grid_size = grid_size
+
+        # PVRCNNBBoxHead model in_channels is num of grid points in roi box.
+        in_channels *= (self.grid_size**3)
+
+        self.in_channels = in_channels
+
+        self.shared_fc_layer = self._make_fc_layers(
+            in_channels, shared_fc_channels,
+            range(len(shared_fc_channels) - 1), norm_cfg)
+        self.cls_layer = self._make_fc_layers(
+            shared_fc_channels[-1],
+            cls_channels,
+            range(1),
+            norm_cfg,
+            out_channels=self.cls_out_channels)
+        self.reg_layer = self._make_fc_layers(
+            shared_fc_channels[-1],
+            reg_channels,
+            range(1),
+            norm_cfg,
+            out_channels=self.reg_out_channels)
+
+    def _make_fc_layers(self,
+                        in_channels: int,
+                        fc_channels: list,
+                        dropout_indices: list,
+                        norm_cfg: dict,
+                        out_channels: Optional[int] = None) -> torch.nn.Module:
+        """Initial a full connection layer.
+
+        Args:
+            in_channels (int): Module in channels.
+            fc_channels (list): Full connection layer channels.
+            dropout_indices (list): Dropout indices.
+            norm_cfg (dict): Type of normalization method.
+            out_channels (int, optional): Module out channels.
+        """
+        fc_layers = []
+        pre_channel = in_channels
+        for k in range(len(fc_channels)):
+            fc_layers.append(
+                ConvModule(
+                    pre_channel,
+                    fc_channels[k],
+                    kernel_size=(1, 1),
+                    stride=(1, 1),
+                    norm_cfg=norm_cfg,
+                    conv_cfg=dict(type='Conv2d'),
+                    bias=False,
+                    inplace=True))
+            pre_channel = fc_channels[k]
+            if self.dropout_ratio >= 0 and k in dropout_indices:
+                fc_layers.append(nn.Dropout(self.dropout_ratio))
+        if out_channels is not None:
+            fc_layers.append(
+                nn.Conv2d(fc_channels[-1], out_channels, 1, bias=True))
+        fc_layers = nn.Sequential(*fc_layers)
+        return fc_layers
+
+    def forward(self, feats: torch.Tensor) -> Tuple[torch.Tensor]:
+        """Forward pvrcnn bbox head.
+
+        Args:
+            feats (torch.Tensor): Batch point-wise features.
+
+        Returns:
+            tuple[torch.Tensor]: Score of class and bbox predictions.
+        """
+        # (B * N, 6, 6, 6, C)
+        rcnn_batch_size = feats.shape[0]
+        feats = feats.permute(0, 4, 1, 2,
+                              3).contiguous().view(rcnn_batch_size, -1, 1, 1)
+        # (BxN, C*6*6*6)
+        shared_feats = self.shared_fc_layer(feats)
+        cls_score = self.cls_layer(shared_feats).transpose(
+            1, 2).contiguous().view(-1, self.cls_out_channels)  # (B, 1)
+        bbox_pred = self.reg_layer(shared_feats).transpose(
+            1, 2).contiguous().view(-1, self.reg_out_channels)  # (B, C)
+        return cls_score, bbox_pred
+
+    def loss(self, cls_score: torch.Tensor, bbox_pred: torch.Tensor,
+             rois: torch.Tensor, labels: torch.Tensor,
+             bbox_targets: torch.Tensor, pos_gt_bboxes: torch.Tensor,
+             reg_mask: torch.Tensor, label_weights: torch.Tensor,
+             bbox_weights: torch.Tensor) -> Dict:
+        """Coumputing losses.
+
+        Args:
+            cls_score (torch.Tensor): Scores of each roi.
+            bbox_pred (torch.Tensor): Predictions of bboxes.
+            rois (torch.Tensor): Roi bboxes.
+            labels (torch.Tensor): Labels of class.
+            bbox_targets (torch.Tensor): Target of positive bboxes.
+            pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes.
+            reg_mask (torch.Tensor): Mask for positive bboxes.
+            label_weights (torch.Tensor): Weights of class loss.
+            bbox_weights (torch.Tensor): Weights of bbox loss.
+
+        Returns:
+             dict: Computed losses.
+
+             - loss_cls (torch.Tensor): Loss of classes.
+             - loss_bbox (torch.Tensor): Loss of bboxes.
+             - loss_corner (torch.Tensor): Loss of corners.
+        """
+        losses = dict()
+        rcnn_batch_size = cls_score.shape[0]
+
+        # calculate class loss
+        cls_flat = cls_score.view(-1)
+        loss_cls = self.loss_cls(cls_flat, labels, label_weights)
+        losses['loss_cls'] = loss_cls
+
+        # calculate regression loss
+        code_size = self.bbox_coder.code_size
+        pos_inds = (reg_mask > 0)
+        if pos_inds.any() == 0:
+            # fake a part loss
+            losses['loss_bbox'] = 0 * bbox_pred.sum()
+            if self.with_corner_loss:
+                losses['loss_corner'] = 0 * bbox_pred.sum()
+        else:
+            pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds]
+            bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat(
+                1, pos_bbox_pred.shape[-1])
+            loss_bbox = self.loss_bbox(
+                pos_bbox_pred.unsqueeze(dim=0), bbox_targets.unsqueeze(dim=0),
+                bbox_weights_flat.unsqueeze(dim=0))
+            losses['loss_bbox'] = loss_bbox
+
+            if self.with_corner_loss:
+                pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds]
+                pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size)
+                batch_anchors = pos_roi_boxes3d.clone().detach()
+                pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1)
+                roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3)
+                batch_anchors[..., 0:3] = 0
+                # decode boxes
+                pred_boxes3d = self.bbox_coder.decode(
+                    batch_anchors,
+                    pos_bbox_pred.view(-1, code_size)).view(-1, code_size)
+
+                pred_boxes3d[..., 0:3] = rotation_3d_in_axis(
+                    pred_boxes3d[..., 0:3].unsqueeze(1),
+                    pos_rois_rotation,
+                    axis=2).squeeze(1)
+
+                pred_boxes3d[:, 0:3] += roi_xyz
+
+                # calculate corner loss
+                loss_corner = self.get_corner_loss_lidar(
+                    pred_boxes3d, pos_gt_bboxes)
+                losses['loss_corner'] = loss_corner.mean()
+
+        return losses
+
+    def get_targets(self,
+                    sampling_results: SamplingResult,
+                    rcnn_train_cfg: dict,
+                    concat: bool = True) -> Tuple[torch.Tensor]:
+        """Generate targets.
+
+        Args:
+            sampling_results (list[:obj:`SamplingResult`]):
+                Sampled results from rois.
+            rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn.
+            concat (bool): Whether to concatenate targets between batches.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of boxes and class prediction.
+        """
+        pos_bboxes_list = [res.pos_bboxes for res in sampling_results]
+        pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
+        iou_list = [res.iou for res in sampling_results]
+        targets = multi_apply(
+            self._get_target_single,
+            pos_bboxes_list,
+            pos_gt_bboxes_list,
+            iou_list,
+            cfg=rcnn_train_cfg)
+
+        (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+         bbox_weights) = targets
+
+        if concat:
+            label = torch.cat(label, 0)
+            bbox_targets = torch.cat(bbox_targets, 0)
+            pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0)
+            reg_mask = torch.cat(reg_mask, 0)
+
+            label_weights = torch.cat(label_weights, 0)
+            label_weights /= torch.clamp(label_weights.sum(), min=1.0)
+
+            bbox_weights = torch.cat(bbox_weights, 0)
+            bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0)
+
+        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+                bbox_weights)
+
+    def _get_target_single(self, pos_bboxes: torch.Tensor,
+                           pos_gt_bboxes: torch.Tensor, ious: torch.Tensor,
+                           cfg: dict) -> Tuple[torch.Tensor]:
+        """Generate training targets for a single sample.
+
+        Args:
+            pos_bboxes (torch.Tensor): Positive boxes with shape
+                (N, 7).
+            pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape
+                (M, 7).
+            ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes`
+                in shape (N, M).
+            cfg (dict): Training configs.
+
+        Returns:
+            tuple[torch.Tensor]: Target for positive boxes.
+                (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+                bbox_weights)
+        """
+        cls_pos_mask = ious > cfg.cls_pos_thr
+        cls_neg_mask = ious < cfg.cls_neg_thr
+        interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0)
+
+        # iou regression target
+        label = (cls_pos_mask > 0).float()
+        label[interval_mask] = ious[interval_mask] * 2 - 0.5
+        # label weights
+        label_weights = (label >= 0).float()
+
+        # box regression target
+        reg_mask = pos_bboxes.new_zeros(ious.size(0)).long()
+        reg_mask[0:pos_gt_bboxes.size(0)] = 1
+        bbox_weights = (reg_mask > 0).float()
+        if reg_mask.bool().any():
+            pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach()
+            roi_center = pos_bboxes[..., 0:3]
+            roi_ry = pos_bboxes[..., 6] % (2 * np.pi)
+
+            # canonical transformation
+            pos_gt_bboxes_ct[..., 0:3] -= roi_center
+            pos_gt_bboxes_ct[..., 6] -= roi_ry
+            pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis(
+                pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -roi_ry,
+                axis=2).squeeze(1)
+
+            # flip orientation if rois have opposite orientation
+            ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi)  # 0 ~ 2pi
+            opposite_flag = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5)
+            ry_label[opposite_flag] = (ry_label[opposite_flag] + np.pi) % (
+                2 * np.pi)  # (0 ~ pi/2, 3pi/2 ~ 2pi)
+            flag = ry_label > np.pi
+            ry_label[flag] = ry_label[flag] - np.pi * 2  # (-pi/2, pi/2)
+            ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2)
+            pos_gt_bboxes_ct[..., 6] = ry_label
+
+            rois_anchor = pos_bboxes.clone().detach()
+            rois_anchor[:, 0:3] = 0
+            rois_anchor[:, 6] = 0
+            bbox_targets = self.bbox_coder.encode(rois_anchor,
+                                                  pos_gt_bboxes_ct)
+        else:
+            # no fg bbox
+            bbox_targets = pos_gt_bboxes.new_empty((0, 7))
+
+        return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
+                bbox_weights)
+
+    def get_corner_loss_lidar(self,
+                              pred_bbox3d: torch.Tensor,
+                              gt_bbox3d: torch.Tensor,
+                              delta: float = 1.0) -> torch.Tensor:
+        """Calculate corner loss of given boxes.
+
+        Args:
+            pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7).
+            gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7).
+            delta (float, optional): huber loss threshold. Defaults to 1.0
+
+        Returns:
+            torch.FloatTensor: Calculated corner loss in shape (N).
+        """
+        assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0]
+
+        # This is a little bit hack here because we assume the box for
+        # Part-A2 is in LiDAR coordinates
+        gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d)
+        pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners
+        gt_box_corners = gt_boxes_structure.corners
+
+        # This flip only changes the heading direction of GT boxes
+        gt_bbox3d_flip = gt_boxes_structure.clone()
+        gt_bbox3d_flip.tensor[:, 6] += np.pi
+        gt_box_corners_flip = gt_bbox3d_flip.corners
+
+        corner_dist = torch.min(
+            torch.norm(pred_box_corners - gt_box_corners, dim=2),
+            torch.norm(pred_box_corners - gt_box_corners_flip,
+                       dim=2))  # (N, 8)
+        # huber loss
+        abs_error = torch.abs(corner_dist)
+        corner_loss = torch.where(abs_error < delta,
+                                  0.5 * abs_error**2 / delta,
+                                  abs_error - 0.5 * delta)
+        return corner_loss.mean(dim=1)
+
+    def get_results(self,
+                    rois: torch.Tensor,
+                    cls_preds: torch.Tensor,
+                    bbox_reg: torch.Tensor,
+                    class_labels: torch.Tensor,
+                    input_metas: List[dict],
+                    test_cfg: dict = None) -> InstanceList:
+        """Generate bboxes from bbox head predictions.
+
+        Args:
+            rois (torch.Tensor): Roi bounding boxes.
+            cls_preds (torch.Tensor): Scores of bounding boxes.
+            bbox_reg (torch.Tensor): Bounding boxes predictions
+            class_labels (torch.Tensor): Label of classes
+            input_metas (list[dict]): Point cloud meta info.
+            test_cfg (:obj:`ConfigDict`): Testing config.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        roi_batch_id = rois[..., 0]
+        roi_boxes = rois[..., 1:]  # boxes without batch id
+        batch_size = int(roi_batch_id.max().item() + 1)
+
+        # decode boxes
+        roi_ry = roi_boxes[..., 6].view(-1)
+        roi_xyz = roi_boxes[..., 0:3].view(-1, 3)
+        local_roi_boxes = roi_boxes.clone().detach()
+        local_roi_boxes[..., 0:3] = 0
+        batch_box_preds = self.bbox_coder.decode(local_roi_boxes, bbox_reg)
+        batch_box_preds[..., 0:3] = rotation_3d_in_axis(
+            batch_box_preds[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1)
+        batch_box_preds[:, 0:3] += roi_xyz
+
+        # post processing
+        result_list = []
+        for batch_id in range(batch_size):
+            cur_cls_preds = cls_preds[roi_batch_id == batch_id]
+            box_preds = batch_box_preds[roi_batch_id == batch_id]
+            label_preds = class_labels[batch_id]
+
+            cur_cls_preds = cur_cls_preds.sigmoid()
+            cur_cls_preds, _ = torch.max(cur_cls_preds, dim=-1)
+            selected = self.class_agnostic_nms(
+                scores=cur_cls_preds,
+                bbox_preds=box_preds,
+                input_meta=input_metas[batch_id],
+                nms_cfg=test_cfg)
+
+            selected_bboxes = box_preds[selected]
+            selected_label_preds = label_preds[selected]
+            selected_scores = cur_cls_preds[selected]
+
+            results = InstanceData()
+            results.bboxes_3d = input_metas[batch_id]['box_type_3d'](
+                selected_bboxes, self.bbox_coder.code_size)
+            results.scores_3d = selected_scores
+            results.labels_3d = selected_label_preds
+
+            result_list.append(results)
+        return result_list
+
+    def class_agnostic_nms(self, scores: torch.Tensor,
+                           bbox_preds: torch.Tensor, nms_cfg: dict,
+                           input_meta: dict) -> Tuple[torch.Tensor]:
+        """Class agnostic NMS for box head.
+
+        Args:
+            scores (torch.Tensor): Object score of bounding boxes.
+            bbox_preds (torch.Tensor): Predicted bounding boxes.
+            nms_cfg (dict): NMS config dict.
+            input_meta (dict): Contain pcd and img's meta info.
+
+        Returns:
+            tuple[torch.Tensor]: Bounding boxes, scores and labels.
+        """
+        obj_scores = scores.clone()
+        if nms_cfg.use_rotate_nms:
+            nms_func = nms_bev
+        else:
+            nms_func = nms_normal_bev
+
+        bbox = input_meta['box_type_3d'](
+            bbox_preds.clone(),
+            box_dim=bbox_preds.shape[-1],
+            with_yaw=True,
+            origin=(0.5, 0.5, 0.5))
+
+        if nms_cfg.score_thr is not None:
+            scores_mask = (obj_scores >= nms_cfg.score_thr)
+            obj_scores = obj_scores[scores_mask]
+            bbox = bbox[scores_mask]
+        selected = []
+        if obj_scores.shape[0] > 0:
+            box_scores_nms, indices = torch.topk(
+                obj_scores, k=min(4096, obj_scores.shape[0]))
+            bbox_bev = bbox.bev[indices]
+            bbox_for_nms = xywhr2xyxyr(bbox_bev)
+
+            keep = nms_func(bbox_for_nms, box_scores_nms, nms_cfg.nms_thr)
+            selected = indices[keep]
+        if nms_cfg.score_thr is not None:
+            original_idxs = scores_mask.nonzero().view(-1)
+            selected = original_idxs[selected]
+        return selected
diff --git a/mmde/mmdet3d/models/roi_heads/h3d_roi_head.py b/mmde/mmdet3d/models/roi_heads/h3d_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..521ce1345a85ad6778443cf4d64062bf6052f9cd
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/h3d_roi_head.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List
+
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from .base_3droi_head import Base3DRoIHead
+
+
+@MODELS.register_module()
+class H3DRoIHead(Base3DRoIHead):
+    """H3D roi head for H3DNet.
+
+    Args:
+        primitive_list (List): Configs of primitive heads.
+        bbox_head (ConfigDict): Config of bbox_head.
+        train_cfg (ConfigDict): Training config.
+        test_cfg (ConfigDict): Testing config.
+    """
+
+    def __init__(self,
+                 primitive_list: List[dict],
+                 bbox_head: dict = None,
+                 train_cfg: dict = None,
+                 test_cfg: dict = None,
+                 init_cfg: dict = None):
+        super(H3DRoIHead, self).__init__(
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        # Primitive module
+        assert len(primitive_list) == 3
+        self.primitive_z = MODELS.build(primitive_list[0])
+        self.primitive_xy = MODELS.build(primitive_list[1])
+        self.primitive_line = MODELS.build(primitive_list[2])
+
+    def init_mask_head(self):
+        """Initialize mask head, skip since ``H3DROIHead`` does not have
+        one."""
+        pass
+
+    def init_bbox_head(self, dummy_args, bbox_head):
+        """Initialize box head.
+
+        Args:
+            dummy_args (optional): Just to compatible with
+                the interface in base class
+            bbox_head (dict): Config for bbox head.
+        """
+        bbox_head['train_cfg'] = self.train_cfg
+        bbox_head['test_cfg'] = self.test_cfg
+        self.bbox_head = MODELS.build(bbox_head)
+
+    def init_assigner_sampler(self):
+        """Initialize assigner and sampler."""
+        pass
+
+    def loss(self, points: List[Tensor], feats_dict: dict,
+             batch_data_samples: List[Det3DDataSample], **kwargs):
+        """Training forward function of PartAggregationROIHead.
+
+        Args:
+            points (list[torch.Tensor]): Point cloud of each sample.
+            feats_dict (dict): Dict of feature.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+
+        Returns:
+            dict: losses from each head.
+        """
+        losses = dict()
+
+        primitive_loss_inputs = (points, feats_dict, batch_data_samples)
+        # note the feats_dict would be added new key and value in each head.
+        loss_z = self.primitive_z.loss(*primitive_loss_inputs)
+        loss_xy = self.primitive_xy.loss(*primitive_loss_inputs)
+        loss_line = self.primitive_line.loss(*primitive_loss_inputs)
+
+        losses.update(loss_z)
+        losses.update(loss_xy)
+        losses.update(loss_line)
+
+        targets = feats_dict.pop('targets')
+
+        bbox_loss = self.bbox_head.loss(
+            points,
+            feats_dict,
+            rpn_targets=targets,
+            batch_data_samples=batch_data_samples)
+        losses.update(bbox_loss)
+        return losses
+
+    def predict(self,
+                points: List[Tensor],
+                feats_dict: Dict[str, Tensor],
+                batch_data_samples: List[Det3DDataSample],
+                suffix='_optimized',
+                **kwargs) -> List[InstanceData]:
+        """
+        Args:
+            points (list[tensor]): Point clouds of multiple samples.
+            feats_dict (dict): Features from FPN or backbone..
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes meta information of data.
+
+        Returns:
+            list[:obj:`InstanceData`]: List of processed predictions. Each
+            InstanceData contains 3d Bounding boxes and corresponding
+            scores and labels.
+        """
+
+        result_z = self.primitive_z(feats_dict)
+        feats_dict.update(result_z)
+
+        result_xy = self.primitive_xy(feats_dict)
+        feats_dict.update(result_xy)
+
+        result_line = self.primitive_line(feats_dict)
+        feats_dict.update(result_line)
+
+        bbox_preds = self.bbox_head(feats_dict)
+        feats_dict.update(bbox_preds)
+        results_list = self.bbox_head.predict(
+            points, feats_dict, batch_data_samples, suffix=suffix)
+
+        return results_list
diff --git a/mmde/mmdet3d/models/roi_heads/mask_heads/__init__.py b/mmde/mmdet3d/models/roi_heads/mask_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..68e754b4f05d243ea2b7f329379d0c7ebdecc517
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/mask_heads/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .foreground_segmentation_head import ForegroundSegmentationHead
+from .pointwise_semantic_head import PointwiseSemanticHead
+from .primitive_head import PrimitiveHead
+
+__all__ = [
+    'PointwiseSemanticHead', 'PrimitiveHead', 'ForegroundSegmentationHead'
+]
diff --git a/mmde/mmdet3d/models/roi_heads/mask_heads/foreground_segmentation_head.py b/mmde/mmdet3d/models/roi_heads/mask_heads/foreground_segmentation_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..6505fefdb7913dc9151167500227229f1039f897
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/mask_heads/foreground_segmentation_head.py
@@ -0,0 +1,174 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Tuple
+
+import torch
+from mmcv.cnn.bricks import build_norm_layer
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import InstanceList
+
+
+@MODELS.register_module()
+class ForegroundSegmentationHead(BaseModule):
+    """Foreground segmentation head.
+
+    Args:
+        in_channels (int): The number of input channel.
+        mlp_channels (tuple[int]): Specify of mlp channels. Defaults
+            to (256, 256).
+        extra_width (float): Boxes enlarge width. Default used 0.1.
+        norm_cfg (dict): Type of normalization method. Defaults to
+            dict(type='BN1d', eps=1e-5, momentum=0.1).
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+        loss_seg (dict): Config of segmentation loss. Defaults to
+            dict(type='mmdet.FocalLoss')
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        mlp_channels: Tuple[int] = (256, 256),
+        extra_width: float = 0.1,
+        norm_cfg: dict = dict(type='BN1d', eps=1e-5, momentum=0.1),
+        init_cfg: Optional[dict] = None,
+        loss_seg: dict = dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            gamma=2.0,
+            alpha=0.25,
+            activated=True,
+            loss_weight=1.0)
+    ) -> None:
+        super(ForegroundSegmentationHead, self).__init__(init_cfg=init_cfg)
+        self.extra_width = extra_width
+        self.num_classes = 1
+
+        self.in_channels = in_channels
+        self.use_sigmoid_cls = loss_seg.get('use_sigmoid', False)
+
+        out_channels = 1
+        if self.use_sigmoid_cls:
+            self.out_channels = out_channels
+        else:
+            self.out_channels = out_channels + 1
+
+        mlps_layers = []
+        cin = in_channels
+        for mlp in mlp_channels:
+            mlps_layers.extend([
+                nn.Linear(cin, mlp, bias=False),
+                build_norm_layer(norm_cfg, mlp)[1],
+                nn.ReLU()
+            ])
+            cin = mlp
+        mlps_layers.append(nn.Linear(cin, self.out_channels, bias=True))
+
+        self.seg_cls_layer = nn.Sequential(*mlps_layers)
+
+        self.loss_seg = MODELS.build(loss_seg)
+
+    def forward(self, feats: torch.Tensor) -> dict:
+        """Forward head.
+
+        Args:
+            feats (torch.Tensor): Point-wise features.
+
+        Returns:
+            dict: Segment predictions.
+        """
+        seg_preds = self.seg_cls_layer(feats)
+        return dict(seg_preds=seg_preds)
+
+    def _get_targets_single(self, point_xyz: torch.Tensor,
+                            gt_bboxes_3d: InstanceData,
+                            gt_labels_3d: torch.Tensor) -> torch.Tensor:
+        """generate segmentation targets for a single sample.
+
+        Args:
+            point_xyz (torch.Tensor): Coordinate of points.
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in
+                shape (box_num, 7).
+            gt_labels_3d (torch.Tensor): Class labels of ground truths in
+                shape (box_num).
+
+        Returns:
+            torch.Tensor: Points class labels.
+        """
+        point_cls_labels_single = point_xyz.new_zeros(
+            point_xyz.shape[0]).long()
+        enlarged_gt_boxes = gt_bboxes_3d.enlarged_box(self.extra_width)
+
+        box_idxs_of_pts = gt_bboxes_3d.points_in_boxes_part(point_xyz).long()
+        extend_box_idxs_of_pts = enlarged_gt_boxes.points_in_boxes_part(
+            point_xyz).long()
+        box_fg_flag = box_idxs_of_pts >= 0
+        fg_flag = box_fg_flag.clone()
+        ignore_flag = fg_flag ^ (extend_box_idxs_of_pts >= 0)
+        point_cls_labels_single[ignore_flag] = -1
+        gt_box_of_fg_points = gt_labels_3d[box_idxs_of_pts[fg_flag]]
+        point_cls_labels_single[
+            fg_flag] = 1 if self.num_classes == 1 else\
+            gt_box_of_fg_points.long()
+        return point_cls_labels_single,
+
+    def get_targets(self, points_bxyz: torch.Tensor,
+                    batch_gt_instances_3d: InstanceList) -> dict:
+        """Generate segmentation targets.
+
+        Args:
+            points_bxyz (torch.Tensor): The coordinates of point in shape
+                (B, num_points, 3).
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+
+        Returns:
+            dict: Prediction targets
+                - seg_targets (torch.Tensor): Segmentation targets.
+        """
+        batch_size = len(batch_gt_instances_3d)
+        points_xyz_list = []
+        gt_bboxes_3d = []
+        gt_labels_3d = []
+        for idx in range(batch_size):
+            coords_idx = points_bxyz[:, 0] == idx
+            points_xyz_list.append(points_bxyz[coords_idx][..., 1:])
+            gt_bboxes_3d.append(batch_gt_instances_3d[idx].bboxes_3d)
+            gt_labels_3d.append(batch_gt_instances_3d[idx].labels_3d)
+        seg_targets, = multi_apply(self._get_targets_single, points_xyz_list,
+                                   gt_bboxes_3d, gt_labels_3d)
+        seg_targets = torch.cat(seg_targets, dim=0)
+        return dict(seg_targets=seg_targets)
+
+    def loss(self, semantic_results: dict,
+             semantic_targets: dict) -> Dict[str, torch.Tensor]:
+        """Calculate point-wise segmentation losses.
+
+        Args:
+            semantic_results (dict): Results from semantic head.
+            semantic_targets (dict): Targets of semantic results.
+
+        Returns:
+            dict: Loss of segmentation.
+
+            - loss_semantic (torch.Tensor): Segmentation prediction loss.
+        """
+        seg_preds = semantic_results['seg_preds']
+        seg_targets = semantic_targets['seg_targets']
+
+        positives = (seg_targets > 0)
+
+        negative_cls_weights = (seg_targets == 0).float()
+        seg_weights = (negative_cls_weights + 1.0 * positives).float()
+        pos_normalizer = positives.sum(dim=0).float()
+        seg_weights /= torch.clamp(pos_normalizer, min=1.0)
+
+        seg_preds = torch.sigmoid(seg_preds)
+        loss_seg = self.loss_seg(seg_preds, (~positives).long(), seg_weights)
+        return dict(loss_semantic=loss_seg)
diff --git a/mmde/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py b/mmde/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..340c6bf86966bcf56e482b73a5845fd66b703b80
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py
@@ -0,0 +1,211 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Tuple
+
+import torch
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes, rotation_3d_in_axis
+from mmdet3d.utils import InstanceList
+
+
+@MODELS.register_module()
+class PointwiseSemanticHead(BaseModule):
+    """Semantic segmentation head for point-wise segmentation.
+
+    Predict point-wise segmentation and part regression results for PartA2.
+    See `paper <https://arxiv.org/abs/1907.03670>`_ for more details.
+
+    Args:
+        in_channels (int): The number of input channel.
+        num_classes (int): The number of class.
+        extra_width (float): Boxes enlarge width.
+        loss_seg (dict): Config of segmentation loss.
+        loss_part (dict): Config of part prediction loss.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        num_classes: int = 3,
+        extra_width: float = 0.2,
+        seg_score_thr: float = 0.3,
+        init_cfg: Optional[dict] = None,
+        loss_seg: dict = dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_part: dict = dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)
+    ) -> None:
+        super(PointwiseSemanticHead, self).__init__(init_cfg=init_cfg)
+        self.extra_width = extra_width
+        self.num_classes = num_classes
+        self.seg_score_thr = seg_score_thr
+        self.seg_cls_layer = nn.Linear(in_channels, 1, bias=True)
+        self.seg_reg_layer = nn.Linear(in_channels, 3, bias=True)
+
+        self.loss_seg = MODELS.build(loss_seg)
+        self.loss_part = MODELS.build(loss_part)
+
+    def forward(self, x: Tensor) -> Dict[str, Tensor]:
+        """Forward pass.
+
+        Args:
+            x (torch.Tensor): Features from the first stage.
+
+        Returns:
+            dict: Part features, segmentation and part predictions.
+
+            - seg_preds (torch.Tensor): Segment predictions.
+            - part_preds (torch.Tensor): Part predictions.
+            - part_feats (torch.Tensor): Feature predictions.
+        """
+        seg_preds = self.seg_cls_layer(x)  # (N, 1)
+        part_preds = self.seg_reg_layer(x)  # (N, 3)
+
+        seg_scores = torch.sigmoid(seg_preds).detach()
+        seg_mask = (seg_scores > self.seg_score_thr)
+
+        part_offsets = torch.sigmoid(part_preds).clone().detach()
+        part_offsets[seg_mask.view(-1) == 0] = 0
+        part_feats = torch.cat((part_offsets, seg_scores),
+                               dim=-1)  # shape (npoints, 4)
+        return dict(
+            seg_preds=seg_preds, part_preds=part_preds, part_feats=part_feats)
+
+    def get_targets_single(self, voxel_centers: Tensor,
+                           gt_bboxes_3d: BaseInstance3DBoxes,
+                           gt_labels_3d: Tensor) -> Tuple[Tensor]:
+        """generate segmentation and part prediction targets for a single
+        sample.
+
+        Args:
+            voxel_centers (torch.Tensor): The center of voxels in shape
+                (voxel_num, 3).
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in
+                shape (box_num, 7).
+            gt_labels_3d (torch.Tensor): Class labels of ground truths in
+                shape (box_num).
+
+        Returns:
+            tuple[torch.Tensor]: Segmentation targets with shape [voxel_num]
+                part prediction targets with shape [voxel_num, 3]
+        """
+        gt_bboxes_3d = gt_bboxes_3d.to(voxel_centers.device)
+        enlarged_gt_boxes = gt_bboxes_3d.enlarged_box(self.extra_width)
+
+        part_targets = voxel_centers.new_zeros((voxel_centers.shape[0], 3),
+                                               dtype=torch.float32)
+        box_idx = gt_bboxes_3d.points_in_boxes_part(voxel_centers)
+        enlarge_box_idx = enlarged_gt_boxes.points_in_boxes_part(
+            voxel_centers).long()
+
+        gt_labels_pad = F.pad(
+            gt_labels_3d, (1, 0), mode='constant', value=self.num_classes)
+        seg_targets = gt_labels_pad[(box_idx.long() + 1)]
+        fg_pt_flag = box_idx > -1
+        ignore_flag = fg_pt_flag ^ (enlarge_box_idx > -1)
+        seg_targets[ignore_flag] = -1
+
+        for k in range(len(gt_bboxes_3d)):
+            k_box_flag = box_idx == k
+            # no point in current box (caused by velodyne reduce)
+            if not k_box_flag.any():
+                continue
+            fg_voxels = voxel_centers[k_box_flag]
+            transformed_voxels = fg_voxels - gt_bboxes_3d.bottom_center[k]
+            transformed_voxels = rotation_3d_in_axis(
+                transformed_voxels.unsqueeze(0),
+                -gt_bboxes_3d.yaw[k].view(1),
+                axis=2)
+            part_targets[k_box_flag] = transformed_voxels / gt_bboxes_3d.dims[
+                k] + voxel_centers.new_tensor([0.5, 0.5, 0])
+
+        part_targets = torch.clamp(part_targets, min=0)
+        return seg_targets, part_targets
+
+    def get_targets(self, voxel_dict: dict,
+                    batch_gt_instances_3d: InstanceList) -> dict:
+        """generate segmentation and part prediction targets.
+
+        Args:
+            voxel_dict (dict): Contains information of voxels.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+
+        Returns:
+            dict: Prediction targets
+
+            - seg_targets (torch.Tensor): Segmentation targets
+                with shape [voxel_num].
+            - part_targets (torch.Tensor): Part prediction targets
+                with shape [voxel_num, 3].
+        """
+        batch_size = len(batch_gt_instances_3d)
+        voxel_center_list = []
+        gt_bboxes_3d = []
+        gt_labels_3d = []
+        for idx in range(batch_size):
+            coords_idx = voxel_dict['coors'][:, 0] == idx
+            voxel_center_list.append(voxel_dict['voxel_centers'][coords_idx])
+            gt_bboxes_3d.append(batch_gt_instances_3d[idx].bboxes_3d)
+            gt_labels_3d.append(batch_gt_instances_3d[idx].labels_3d)
+        seg_targets, part_targets = multi_apply(self.get_targets_single,
+                                                voxel_center_list,
+                                                gt_bboxes_3d, gt_labels_3d)
+        seg_targets = torch.cat(seg_targets, dim=0)
+        part_targets = torch.cat(part_targets, dim=0)
+        return dict(seg_targets=seg_targets, part_targets=part_targets)
+
+    def loss(self, semantic_results: dict,
+             semantic_targets: dict) -> Dict[str, Tensor]:
+        """Calculate point-wise segmentation and part prediction losses.
+
+        Args:
+            semantic_results (dict): Results from semantic head.
+
+                - seg_preds: Segmentation predictions.
+                - part_preds: Part predictions.
+
+            semantic_targets (dict): Targets of semantic results.
+
+                - seg_preds: Segmentation targets.
+                - part_preds: Part targets.
+
+        Returns:
+            dict: Loss of segmentation and part prediction.
+
+            - loss_seg (torch.Tensor): Segmentation prediction loss.
+            - loss_part (torch.Tensor): Part prediction loss.
+        """
+        seg_preds = semantic_results['seg_preds']
+        part_preds = semantic_results['part_preds']
+        seg_targets = semantic_targets['seg_targets']
+        part_targets = semantic_targets['part_targets']
+
+        pos_mask = (seg_targets > -1) & (seg_targets < self.num_classes)
+        binary_seg_target = pos_mask.long()
+        pos = pos_mask.float()
+        neg = (seg_targets == self.num_classes).float()
+        seg_weights = pos + neg
+        pos_normalizer = pos.sum()
+        seg_weights = seg_weights / torch.clamp(pos_normalizer, min=1.0)
+        loss_seg = self.loss_seg(seg_preds, binary_seg_target, seg_weights)
+
+        if pos_normalizer > 0:
+            loss_part = self.loss_part(part_preds[pos_mask],
+                                       part_targets[pos_mask])
+        else:
+            # fake a part loss
+            loss_part = loss_seg.new_tensor(0)
+
+        return dict(loss_seg=loss_seg, loss_part=loss_part)
diff --git a/mmde/mmdet3d/models/roi_heads/mask_heads/primitive_head.py b/mmde/mmdet3d/models/roi_heads/mask_heads/primitive_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6b5e1b7af8e7efd5db288a8991a664b58c6b15e
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/mask_heads/primitive_head.py
@@ -0,0 +1,1053 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from mmcv.cnn import ConvModule
+from mmcv.ops import furthest_point_sample
+from mmdet.models.utils import multi_apply
+from mmengine.model import BaseModule
+from mmengine.structures import InstanceData
+from torch import nn as nn
+from torch.nn import functional as F
+
+from mmdet3d.models.layers import VoteModule, build_sa_module
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes
+
+
+@MODELS.register_module()
+class PrimitiveHead(BaseModule):
+    r"""Primitive head of `H3DNet <https://arxiv.org/abs/2006.05682>`_.
+
+    Args:
+        num_dims (int): The dimension of primitive semantic information.
+        num_classes (int): The number of class.
+        primitive_mode (str): The mode of primitive module,
+            available mode ['z', 'xy', 'line'].
+        bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
+            decoding boxes.
+        train_cfg (dict, optional): Config for training.
+        test_cfg (dict, optional): Config for testing.
+        vote_module_cfg (dict, optional): Config of VoteModule for point-wise
+            votes.
+        vote_aggregation_cfg (dict, optional): Config of vote aggregation
+            layer.
+        feat_channels (tuple[int]): Convolution channels of
+            prediction layer.
+        upper_thresh (float): Threshold for line matching.
+        surface_thresh (float): Threshold for surface matching.
+        conv_cfg (dict, optional): Config of convolution in prediction layer.
+        norm_cfg (dict, optional): Config of BN in prediction layer.
+        objectness_loss (dict, optional): Config of objectness loss.
+        center_loss (dict, optional): Config of center loss.
+        semantic_loss (dict, optional): Config of point-wise semantic
+            segmentation loss.
+    """
+
+    def __init__(self,
+                 num_dims: int,
+                 num_classes: int,
+                 primitive_mode: str,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 vote_module_cfg: Optional[dict] = None,
+                 vote_aggregation_cfg: Optional[dict] = None,
+                 feat_channels: tuple = (128, 128),
+                 upper_thresh: float = 100.0,
+                 surface_thresh: float = 0.5,
+                 conv_cfg: dict = dict(type='Conv1d'),
+                 norm_cfg: dict = dict(type='BN1d'),
+                 objectness_loss: Optional[dict] = None,
+                 center_loss: Optional[dict] = None,
+                 semantic_reg_loss: Optional[dict] = None,
+                 semantic_cls_loss: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None):
+        super(PrimitiveHead, self).__init__(init_cfg=init_cfg)
+        # bounding boxes centers,  face centers and edge centers
+        assert primitive_mode in ['z', 'xy', 'line']
+        # The dimension of primitive semantic information.
+        self.num_dims = num_dims
+        self.num_classes = num_classes
+        self.primitive_mode = primitive_mode
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.gt_per_seed = vote_module_cfg['gt_per_seed']
+        self.num_proposal = vote_aggregation_cfg['num_point']
+        self.upper_thresh = upper_thresh
+        self.surface_thresh = surface_thresh
+
+        self.loss_objectness = MODELS.build(objectness_loss)
+        self.loss_center = MODELS.build(center_loss)
+        self.loss_semantic_reg = MODELS.build(semantic_reg_loss)
+        self.loss_semantic_cls = MODELS.build(semantic_cls_loss)
+
+        assert vote_aggregation_cfg['mlp_channels'][0] == vote_module_cfg[
+            'in_channels']
+
+        # Primitive existence flag prediction
+        self.flag_conv = ConvModule(
+            vote_module_cfg['conv_channels'][-1],
+            vote_module_cfg['conv_channels'][-1] // 2,
+            1,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            bias=True,
+            inplace=True)
+        self.flag_pred = torch.nn.Conv1d(
+            vote_module_cfg['conv_channels'][-1] // 2, 2, 1)
+
+        self.vote_module = VoteModule(**vote_module_cfg)
+        self.vote_aggregation = build_sa_module(vote_aggregation_cfg)
+
+        prev_channel = vote_aggregation_cfg['mlp_channels'][-1]
+        conv_pred_list = list()
+        for k in range(len(feat_channels)):
+            conv_pred_list.append(
+                ConvModule(
+                    prev_channel,
+                    feat_channels[k],
+                    1,
+                    padding=0,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    bias=True,
+                    inplace=True))
+            prev_channel = feat_channels[k]
+        self.conv_pred = nn.Sequential(*conv_pred_list)
+
+        conv_out_channel = 3 + num_dims + num_classes
+        self.conv_pred.add_module('conv_out',
+                                  nn.Conv1d(prev_channel, conv_out_channel, 1))
+
+    @property
+    def sample_mode(self):
+        if self.training:
+            sample_mode = self.train_cfg.sample_mode
+        else:
+            sample_mode = self.test_cfg.sample_mode
+        assert sample_mode in ['vote', 'seed', 'random']
+        return sample_mode
+
+    def forward(self, feats_dict: dict) -> dict:
+        """Forward pass.
+
+        Args:
+            feats_dict (dict): Feature dict from backbone.
+
+
+        Returns:
+            dict: Predictions of primitive head.
+        """
+        sample_mode = self.sample_mode
+
+        seed_points = feats_dict['fp_xyz_net0'][-1]
+        seed_features = feats_dict['hd_feature']
+        results = {}
+
+        primitive_flag = self.flag_conv(seed_features)
+        primitive_flag = self.flag_pred(primitive_flag)
+
+        results['pred_flag_' + self.primitive_mode] = primitive_flag
+
+        # 1. generate vote_points from seed_points
+        vote_points, vote_features, _ = self.vote_module(
+            seed_points, seed_features)
+        results['vote_' + self.primitive_mode] = vote_points
+        results['vote_features_' + self.primitive_mode] = vote_features
+
+        # 2. aggregate vote_points
+        if sample_mode == 'vote':
+            # use fps in vote_aggregation
+            sample_indices = None
+        elif sample_mode == 'seed':
+            # FPS on seed and choose the votes corresponding to the seeds
+            sample_indices = furthest_point_sample(seed_points,
+                                                   self.num_proposal)
+        elif sample_mode == 'random':
+            # Random sampling from the votes
+            batch_size, num_seed = seed_points.shape[:2]
+            sample_indices = torch.randint(
+                0,
+                num_seed, (batch_size, self.num_proposal),
+                dtype=torch.int32,
+                device=seed_points.device)
+        else:
+            raise NotImplementedError('Unsupported sample mod!')
+
+        vote_aggregation_ret = self.vote_aggregation(vote_points,
+                                                     vote_features,
+                                                     sample_indices)
+        aggregated_points, features, aggregated_indices = vote_aggregation_ret
+        results['aggregated_points_' + self.primitive_mode] = aggregated_points
+        results['aggregated_features_' + self.primitive_mode] = features
+        results['aggregated_indices_' +
+                self.primitive_mode] = aggregated_indices
+
+        # 3. predict primitive offsets and semantic information
+        predictions = self.conv_pred(features)
+
+        # 4. decode predictions
+        decode_ret = self.primitive_decode_scores(predictions,
+                                                  aggregated_points)
+        results.update(decode_ret)
+
+        center, pred_ind = self.get_primitive_center(
+            primitive_flag, decode_ret['center_' + self.primitive_mode])
+
+        results['pred_' + self.primitive_mode + '_ind'] = pred_ind
+        results['pred_' + self.primitive_mode + '_center'] = center
+        return results
+
+    def loss(self, points: List[torch.Tensor], feats_dict: Dict[str,
+                                                                torch.Tensor],
+             batch_data_samples: List[Det3DDataSample], **kwargs) -> dict:
+        """
+        Args:
+            points (list[tensor]): Points cloud of multiple samples.
+            feats_dict (dict): Predictions from backbone or FPN.
+            batch_data_samples (list[:obj:`Det3DDataSample`]): Each item
+                contains the meta information of each sample and
+                corresponding annotations.
+
+        Returns:
+            dict:  A dictionary of loss components.
+        """
+        preds = self(feats_dict)
+        feats_dict.update(preds)
+
+        batch_gt_instance_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        batch_pts_semantic_mask = []
+        batch_pts_instance_mask = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instance_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+            batch_pts_semantic_mask.append(
+                data_sample.gt_pts_seg.get('pts_semantic_mask', None))
+            batch_pts_instance_mask.append(
+                data_sample.gt_pts_seg.get('pts_instance_mask', None))
+
+        loss_inputs = (points, feats_dict, batch_gt_instance_3d)
+        losses = self.loss_by_feat(
+            *loss_inputs,
+            batch_pts_semantic_mask=batch_pts_semantic_mask,
+            batch_pts_instance_mask=batch_pts_instance_mask,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+        )
+        return losses
+
+    def loss_by_feat(
+            self,
+            points: List[torch.Tensor],
+            feats_dict: dict,
+            batch_gt_instances_3d: List[InstanceData],
+            batch_pts_semantic_mask: Optional[List[torch.Tensor]] = None,
+            batch_pts_instance_mask: Optional[List[torch.Tensor]] = None,
+            **kwargs):
+        """Compute loss.
+
+        Args:
+            points (list[torch.Tensor]): Input points.
+            feats_dict (dict): Predictions of previous modules.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_pts_semantic_mask (list[tensor]): Semantic mask
+                of points cloud. Defaults to None.
+            batch_pts_instance_mask (list[tensor]): Instance mask
+                of points cloud. Defaults to None.
+
+        Returns:
+            dict: Losses of Primitive Head.
+        """
+
+        targets = self.get_targets(points, feats_dict, batch_gt_instances_3d,
+                                   batch_pts_semantic_mask,
+                                   batch_pts_instance_mask)
+
+        (point_mask, point_offset, gt_primitive_center, gt_primitive_semantic,
+         gt_sem_cls_label, gt_primitive_mask) = targets
+
+        losses = {}
+        # Compute the loss of primitive existence flag
+        pred_flag = feats_dict['pred_flag_' + self.primitive_mode]
+        flag_loss = self.loss_objectness(pred_flag, gt_primitive_mask.long())
+        losses['flag_loss_' + self.primitive_mode] = flag_loss
+
+        # calculate vote loss
+        vote_loss = self.vote_module.get_loss(
+            feats_dict['seed_points'],
+            feats_dict['vote_' + self.primitive_mode],
+            feats_dict['seed_indices'], point_mask, point_offset)
+        losses['vote_loss_' + self.primitive_mode] = vote_loss
+
+        num_proposal = feats_dict['aggregated_points_' +
+                                  self.primitive_mode].shape[1]
+        primitive_center = feats_dict['center_' + self.primitive_mode]
+        if self.primitive_mode != 'line':
+            primitive_semantic = feats_dict['size_residuals_' +
+                                            self.primitive_mode].contiguous()
+        else:
+            primitive_semantic = None
+        semancitc_scores = feats_dict['sem_cls_scores_' +
+                                      self.primitive_mode].transpose(2, 1)
+
+        gt_primitive_mask = gt_primitive_mask / \
+            (gt_primitive_mask.sum() + 1e-6)
+        center_loss, size_loss, sem_cls_loss = self.compute_primitive_loss(
+            primitive_center, primitive_semantic, semancitc_scores,
+            num_proposal, gt_primitive_center, gt_primitive_semantic,
+            gt_sem_cls_label, gt_primitive_mask)
+        losses['center_loss_' + self.primitive_mode] = center_loss
+        losses['size_loss_' + self.primitive_mode] = size_loss
+        losses['sem_loss_' + self.primitive_mode] = sem_cls_loss
+
+        return losses
+
+    def get_targets(
+        self,
+        points,
+        bbox_preds: Optional[dict] = None,
+        batch_gt_instances_3d: List[InstanceData] = None,
+        batch_pts_semantic_mask: List[torch.Tensor] = None,
+        batch_pts_instance_mask: List[torch.Tensor] = None,
+    ):
+        """Generate targets of primitive head.
+
+        Args:
+            points (list[torch.Tensor]): Points of each batch.
+            bbox_preds (torch.Tensor): Bounding box predictions of
+                primitive head.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_pts_semantic_mask (list[tensor]): Semantic gt mask for
+                multiple images.
+            batch_pts_instance_mask (list[tensor]): Instance gt mask for
+                multiple images.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of primitive head.
+        """
+        batch_gt_labels_3d = [
+            gt_instances_3d.labels_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        batch_gt_bboxes_3d = [
+            gt_instances_3d.bboxes_3d
+            for gt_instances_3d in batch_gt_instances_3d
+        ]
+        for index in range(len(batch_gt_labels_3d)):
+            if len(batch_gt_labels_3d[index]) == 0:
+                fake_box = batch_gt_bboxes_3d[index].tensor.new_zeros(
+                    1, batch_gt_bboxes_3d[index].tensor.shape[-1])
+                batch_gt_bboxes_3d[index] = batch_gt_bboxes_3d[index].new_box(
+                    fake_box)
+                batch_gt_labels_3d[index] = batch_gt_labels_3d[
+                    index].new_zeros(1)
+
+        if batch_pts_semantic_mask is None:
+            batch_pts_semantic_mask = [
+                None for _ in range(len(batch_gt_labels_3d))
+            ]
+            batch_pts_instance_mask = [
+                None for _ in range(len(batch_gt_labels_3d))
+            ]
+
+        (point_mask, point_sem,
+         point_offset) = multi_apply(self.get_targets_single, points,
+                                     batch_gt_bboxes_3d, batch_gt_labels_3d,
+                                     batch_pts_semantic_mask,
+                                     batch_pts_instance_mask)
+
+        point_mask = torch.stack(point_mask)
+        point_sem = torch.stack(point_sem)
+        point_offset = torch.stack(point_offset)
+
+        batch_size = point_mask.shape[0]
+        num_proposal = bbox_preds['aggregated_points_' +
+                                  self.primitive_mode].shape[1]
+        num_seed = bbox_preds['seed_points'].shape[1]
+        seed_inds = bbox_preds['seed_indices'].long()
+        seed_inds_expand = seed_inds.view(batch_size, num_seed,
+                                          1).repeat(1, 1, 3)
+        seed_gt_votes = torch.gather(point_offset, 1, seed_inds_expand)
+        seed_gt_votes += bbox_preds['seed_points']
+        gt_primitive_center = seed_gt_votes.view(batch_size * num_proposal, 1,
+                                                 3)
+
+        seed_inds_expand_sem = seed_inds.view(batch_size, num_seed, 1).repeat(
+            1, 1, 4 + self.num_dims)
+        seed_gt_sem = torch.gather(point_sem, 1, seed_inds_expand_sem)
+        gt_primitive_semantic = seed_gt_sem[:, :, 3:3 + self.num_dims].view(
+            batch_size * num_proposal, 1, self.num_dims).contiguous()
+
+        gt_sem_cls_label = seed_gt_sem[:, :, -1].long()
+
+        gt_votes_mask = torch.gather(point_mask, 1, seed_inds)
+
+        return (point_mask, point_offset, gt_primitive_center,
+                gt_primitive_semantic, gt_sem_cls_label, gt_votes_mask)
+
+    def get_targets_single(
+            self,
+            points: torch.Tensor,
+            gt_bboxes_3d: BaseInstance3DBoxes,
+            gt_labels_3d: torch.Tensor,
+            pts_semantic_mask: torch.Tensor = None,
+            pts_instance_mask: torch.Tensor = None) -> Tuple[torch.Tensor]:
+        """Generate targets of primitive head for single batch.
+
+        Args:
+            points (torch.Tensor): Points of each batch.
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth
+                boxes of each batch.
+            gt_labels_3d (torch.Tensor): Labels of each batch.
+            pts_semantic_mask (torch.Tensor): Point-wise semantic
+                label of each batch.
+            pts_instance_mask (torch.Tensor): Point-wise instance
+                label of each batch.
+
+        Returns:
+            tuple[torch.Tensor]: Targets of primitive head.
+        """
+        gt_bboxes_3d = gt_bboxes_3d.to(points.device)
+        num_points = points.shape[0]
+
+        point_mask = points.new_zeros(num_points)
+        # Offset to the primitive center
+        point_offset = points.new_zeros([num_points, 3])
+        # Semantic information of primitive center
+        point_sem = points.new_zeros([num_points, 3 + self.num_dims + 1])
+
+        # Generate pts_semantic_mask and pts_instance_mask when they are None
+        if pts_semantic_mask is None or pts_instance_mask is None:
+            points2box_mask = gt_bboxes_3d.points_in_boxes_all(points)
+            assignment = points2box_mask.argmax(1)
+            background_mask = points2box_mask.max(1)[0] == 0
+
+            if pts_semantic_mask is None:
+                pts_semantic_mask = gt_labels_3d[assignment]
+                pts_semantic_mask[background_mask] = self.num_classes
+
+            if pts_instance_mask is None:
+                pts_instance_mask = assignment
+                pts_instance_mask[background_mask] = gt_labels_3d.shape[0]
+
+        instance_flag = torch.nonzero(
+            pts_semantic_mask != self.num_classes, as_tuple=False).squeeze(1)
+        instance_labels = pts_instance_mask[instance_flag].unique()
+
+        with_yaw = gt_bboxes_3d.with_yaw
+        for i, i_instance in enumerate(instance_labels):
+            indices = instance_flag[pts_instance_mask[instance_flag] ==
+                                    i_instance]
+            coords = points[indices, :3]
+            cur_cls_label = pts_semantic_mask[indices][0]
+
+            # Bbox Corners
+            cur_corners = gt_bboxes_3d.corners[i]
+
+            plane_lower_temp = points.new_tensor(
+                [0, 0, 1, -cur_corners[7, -1]])
+            upper_points = cur_corners[[1, 2, 5, 6]]
+            refined_distance = (upper_points * plane_lower_temp[:3]).sum(dim=1)
+
+            if self.check_horizon(upper_points) and \
+                    plane_lower_temp[0] + plane_lower_temp[1] < \
+                    self.train_cfg['lower_thresh']:
+                plane_lower = points.new_tensor(
+                    [0, 0, 1, plane_lower_temp[-1]])
+                plane_upper = points.new_tensor(
+                    [0, 0, 1, -torch.mean(refined_distance)])
+            else:
+                raise NotImplementedError('Only horizontal plane is support!')
+
+            if self.check_dist(plane_upper, upper_points) is False:
+                raise NotImplementedError(
+                    'Mean distance to plane should be lower than thresh!')
+
+            # Get the boundary points here
+            point2plane_dist, selected = self.match_point2plane(
+                plane_lower, coords)
+
+            # Get bottom four lines
+            if self.primitive_mode == 'line':
+                point2line_matching = self.match_point2line(
+                    coords[selected], cur_corners, with_yaw, mode='bottom')
+
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_line_targets(point_mask,
+                                                        point_offset,
+                                                        point_sem,
+                                                        coords[selected],
+                                                        indices[selected],
+                                                        cur_cls_label,
+                                                        point2line_matching,
+                                                        cur_corners,
+                                                        [1, 1, 0, 0],
+                                                        with_yaw,
+                                                        mode='bottom')
+
+            # Set the surface labels here
+            if self.primitive_mode == 'z' and \
+                    selected.sum() > self.train_cfg['num_point'] and \
+                    point2plane_dist[selected].var() < \
+                    self.train_cfg['var_thresh']:
+
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_surface_targets(point_mask,
+                                                           point_offset,
+                                                           point_sem,
+                                                           coords[selected],
+                                                           indices[selected],
+                                                           cur_cls_label,
+                                                           cur_corners,
+                                                           with_yaw,
+                                                           mode='bottom')
+
+            # Get the boundary points here
+            point2plane_dist, selected = self.match_point2plane(
+                plane_upper, coords)
+
+            # Get top four lines
+            if self.primitive_mode == 'line':
+                point2line_matching = self.match_point2line(
+                    coords[selected], cur_corners, with_yaw, mode='top')
+
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_line_targets(point_mask,
+                                                        point_offset,
+                                                        point_sem,
+                                                        coords[selected],
+                                                        indices[selected],
+                                                        cur_cls_label,
+                                                        point2line_matching,
+                                                        cur_corners,
+                                                        [1, 1, 0, 0],
+                                                        with_yaw,
+                                                        mode='top')
+
+            if self.primitive_mode == 'z' and \
+                    selected.sum() > self.train_cfg['num_point'] and \
+                    point2plane_dist[selected].var() < \
+                    self.train_cfg['var_thresh']:
+
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_surface_targets(point_mask,
+                                                           point_offset,
+                                                           point_sem,
+                                                           coords[selected],
+                                                           indices[selected],
+                                                           cur_cls_label,
+                                                           cur_corners,
+                                                           with_yaw,
+                                                           mode='top')
+
+            # Get left two lines
+            plane_left_temp = self._get_plane_fomulation(
+                cur_corners[2] - cur_corners[3],
+                cur_corners[3] - cur_corners[0], cur_corners[0])
+
+            right_points = cur_corners[[4, 5, 7, 6]]
+            plane_left_temp /= torch.norm(plane_left_temp[:3])
+            refined_distance = (right_points * plane_left_temp[:3]).sum(dim=1)
+
+            if plane_left_temp[2] < self.train_cfg['lower_thresh']:
+                plane_left = plane_left_temp
+                plane_right = points.new_tensor([
+                    plane_left_temp[0], plane_left_temp[1], plane_left_temp[2],
+                    -refined_distance.mean()
+                ])
+            else:
+                raise NotImplementedError(
+                    'Normal vector of the plane should be horizontal!')
+
+            # Get the boundary points here
+            point2plane_dist, selected = self.match_point2plane(
+                plane_left, coords)
+
+            # Get left four lines
+            if self.primitive_mode == 'line':
+                point2line_matching = self.match_point2line(
+                    coords[selected], cur_corners, with_yaw, mode='left')
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_line_targets(
+                        point_mask, point_offset, point_sem,
+                        coords[selected], indices[selected], cur_cls_label,
+                        point2line_matching[2:], cur_corners, [2, 2],
+                        with_yaw, mode='left')
+
+            if self.primitive_mode == 'xy' and \
+                    selected.sum() > self.train_cfg['num_point'] and \
+                    point2plane_dist[selected].var() < \
+                    self.train_cfg['var_thresh']:
+
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_surface_targets(
+                        point_mask, point_offset, point_sem,
+                        coords[selected], indices[selected], cur_cls_label,
+                        cur_corners, with_yaw, mode='left')
+
+            # Get the boundary points here
+            point2plane_dist, selected = self.match_point2plane(
+                plane_right, coords)
+
+            # Get right four lines
+            if self.primitive_mode == 'line':
+                point2line_matching = self.match_point2line(
+                    coords[selected], cur_corners, with_yaw, mode='right')
+
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_line_targets(
+                        point_mask, point_offset, point_sem,
+                        coords[selected], indices[selected], cur_cls_label,
+                        point2line_matching[2:], cur_corners, [2, 2],
+                        with_yaw, mode='right')
+
+            if self.primitive_mode == 'xy' and \
+                    selected.sum() > self.train_cfg['num_point'] and \
+                    point2plane_dist[selected].var() < \
+                    self.train_cfg['var_thresh']:
+
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_surface_targets(
+                        point_mask, point_offset, point_sem,
+                        coords[selected], indices[selected], cur_cls_label,
+                        cur_corners, with_yaw, mode='right')
+
+            plane_front_temp = self._get_plane_fomulation(
+                cur_corners[0] - cur_corners[4],
+                cur_corners[4] - cur_corners[5], cur_corners[5])
+
+            back_points = cur_corners[[3, 2, 7, 6]]
+            plane_front_temp /= torch.norm(plane_front_temp[:3])
+            refined_distance = (back_points * plane_front_temp[:3]).sum(dim=1)
+
+            if plane_front_temp[2] < self.train_cfg['lower_thresh']:
+                plane_front = plane_front_temp
+                plane_back = points.new_tensor([
+                    plane_front_temp[0], plane_front_temp[1],
+                    plane_front_temp[2], -torch.mean(refined_distance)
+                ])
+            else:
+                raise NotImplementedError(
+                    'Normal vector of the plane should be horizontal!')
+
+            # Get the boundary points here
+            point2plane_dist, selected = self.match_point2plane(
+                plane_front, coords)
+
+            if self.primitive_mode == 'xy' and \
+                    selected.sum() > self.train_cfg['num_point'] and \
+                    (point2plane_dist[selected]).var() < \
+                    self.train_cfg['var_thresh']:
+
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_surface_targets(
+                        point_mask, point_offset, point_sem,
+                        coords[selected], indices[selected], cur_cls_label,
+                        cur_corners, with_yaw, mode='front')
+
+            # Get the boundary points here
+            point2plane_dist, selected = self.match_point2plane(
+                plane_back, coords)
+
+            if self.primitive_mode == 'xy' and \
+                    selected.sum() > self.train_cfg['num_point'] and \
+                    point2plane_dist[selected].var() < \
+                    self.train_cfg['var_thresh']:
+
+                point_mask, point_offset, point_sem = \
+                    self._assign_primitive_surface_targets(
+                        point_mask, point_offset, point_sem,
+                        coords[selected], indices[selected], cur_cls_label,
+                        cur_corners, with_yaw, mode='back')
+
+        return (point_mask, point_sem, point_offset)
+
+    def primitive_decode_scores(self, predictions: torch.Tensor,
+                                aggregated_points: torch.Tensor) -> dict:
+        """Decode predicted parts to primitive head.
+
+        Args:
+            predictions (torch.Tensor): primitive pridictions of each batch.
+            aggregated_points (torch.Tensor): The aggregated points
+                of vote stage.
+
+        Returns:
+            Dict: Predictions of primitive head, including center,
+                semantic size and semantic scores.
+        """
+
+        ret_dict = {}
+        pred_transposed = predictions.transpose(2, 1)
+
+        center = aggregated_points + pred_transposed[:, :, 0:3]
+        ret_dict['center_' + self.primitive_mode] = center
+
+        if self.primitive_mode in ['z', 'xy']:
+            ret_dict['size_residuals_' + self.primitive_mode] = \
+                pred_transposed[:, :, 3:3 + self.num_dims]
+
+        ret_dict['sem_cls_scores_' + self.primitive_mode] = \
+            pred_transposed[:, :, 3 + self.num_dims:]
+
+        return ret_dict
+
+    def check_horizon(self, points: torch.Tensor) -> bool:
+        """Check whether is a horizontal plane.
+
+        Args:
+            points (torch.Tensor): Points of input.
+
+        Returns:
+            Bool: Flag of result.
+        """
+        return (points[0][-1] == points[1][-1]) and \
+               (points[1][-1] == points[2][-1]) and \
+               (points[2][-1] == points[3][-1])
+
+    def check_dist(self, plane_equ: torch.Tensor,
+                   points: torch.Tensor) -> tuple:
+        """Whether the mean of points to plane distance is lower than thresh.
+
+        Args:
+            plane_equ (torch.Tensor): Plane to be checked.
+            points (torch.Tensor): Points to be checked.
+
+        Returns:
+            Tuple: Flag of result.
+        """
+        return (points[:, 2] +
+                plane_equ[-1]).sum() / 4.0 < self.train_cfg['lower_thresh']
+
+    def point2line_dist(self, points: torch.Tensor, pts_a: torch.Tensor,
+                        pts_b: torch.Tensor) -> torch.Tensor:
+        """Calculate the distance from point to line.
+
+        Args:
+            points (torch.Tensor): Points of input.
+            pts_a (torch.Tensor): Point on the specific line.
+            pts_b (torch.Tensor): Point on the specific line.
+
+        Returns:
+            torch.Tensor: Distance between each point to line.
+        """
+        line_a2b = pts_b - pts_a
+        line_a2pts = points - pts_a
+        length = (line_a2pts * line_a2b.view(1, 3)).sum(1) / \
+            line_a2b.norm()
+        dist = (line_a2pts.norm(dim=1)**2 - length**2).sqrt()
+
+        return dist
+
+    def match_point2line(self,
+                         points: torch.Tensor,
+                         corners: torch.Tensor,
+                         with_yaw: bool,
+                         mode: str = 'bottom') -> tuple:
+        """Match points to corresponding line.
+
+        Args:
+            points (torch.Tensor): Points of input.
+            corners (torch.Tensor): Eight corners of a bounding box.
+            with_yaw (Bool): Whether the boundind box is with rotation.
+            mode (str, optional): Specify which line should be matched,
+                available mode are ('bottom', 'top', 'left', 'right').
+                Defaults to 'bottom'.
+
+        Returns:
+            Tuple: Flag of matching correspondence.
+        """
+        if with_yaw:
+            corners_pair = {
+                'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]],
+                'top': [[1, 2], [5, 6], [1, 5], [2, 6]],
+                'left': [[0, 1], [3, 2], [0, 1], [3, 2]],
+                'right': [[4, 5], [7, 6], [4, 5], [7, 6]]
+            }
+            selected_list = []
+            for pair_index in corners_pair[mode]:
+                selected = self.point2line_dist(
+                    points, corners[pair_index[0]], corners[pair_index[1]]) \
+                    < self.train_cfg['line_thresh']
+                selected_list.append(selected)
+        else:
+            xmin, ymin, _ = corners.min(0)[0]
+            xmax, ymax, _ = corners.max(0)[0]
+            sel1 = torch.abs(points[:, 0] -
+                             xmin) < self.train_cfg['line_thresh']
+            sel2 = torch.abs(points[:, 0] -
+                             xmax) < self.train_cfg['line_thresh']
+            sel3 = torch.abs(points[:, 1] -
+                             ymin) < self.train_cfg['line_thresh']
+            sel4 = torch.abs(points[:, 1] -
+                             ymax) < self.train_cfg['line_thresh']
+            selected_list = [sel1, sel2, sel3, sel4]
+        return selected_list
+
+    def match_point2plane(self, plane: torch.Tensor,
+                          points: torch.Tensor) -> tuple:
+        """Match points to plane.
+
+        Args:
+            plane (torch.Tensor): Equation of the plane.
+            points (torch.Tensor): Points of input.
+
+        Returns:
+            Tuple: Distance of each point to the plane and
+                flag of matching correspondence.
+        """
+        point2plane_dist = torch.abs((points * plane[:3]).sum(dim=1) +
+                                     plane[-1])
+        min_dist = point2plane_dist.min()
+        selected = torch.abs(point2plane_dist -
+                             min_dist) < self.train_cfg['dist_thresh']
+        return point2plane_dist, selected
+
+    def compute_primitive_loss(self, primitive_center: torch.Tensor,
+                               primitive_semantic: torch.Tensor,
+                               semantic_scores: torch.Tensor,
+                               num_proposal: torch.Tensor,
+                               gt_primitive_center: torch.Tensor,
+                               gt_primitive_semantic: torch.Tensor,
+                               gt_sem_cls_label: torch.Tensor,
+                               gt_primitive_mask: torch.Tensor) -> Tuple:
+        """Compute loss of primitive module.
+
+        Args:
+            primitive_center (torch.Tensor): Pridictions of primitive center.
+            primitive_semantic (torch.Tensor): Pridictions of primitive
+                semantic.
+            semantic_scores (torch.Tensor): Pridictions of primitive
+                semantic scores.
+            num_proposal (int): The number of primitive proposal.
+            gt_primitive_center (torch.Tensor): Ground truth of
+                primitive center.
+            gt_votes_sem (torch.Tensor): Ground truth of primitive semantic.
+            gt_sem_cls_label (torch.Tensor): Ground truth of primitive
+                semantic class.
+            gt_primitive_mask (torch.Tensor): Ground truth of primitive mask.
+
+        Returns:
+            Tuple: Loss of primitive module.
+        """
+        batch_size = primitive_center.shape[0]
+        vote_xyz_reshape = primitive_center.view(batch_size * num_proposal, -1,
+                                                 3)
+
+        center_loss = self.loss_center(
+            vote_xyz_reshape,
+            gt_primitive_center,
+            dst_weight=gt_primitive_mask.view(batch_size * num_proposal, 1))[1]
+
+        if self.primitive_mode != 'line':
+            size_xyz_reshape = primitive_semantic.view(
+                batch_size * num_proposal, -1, self.num_dims).contiguous()
+            size_loss = self.loss_semantic_reg(
+                size_xyz_reshape,
+                gt_primitive_semantic,
+                dst_weight=gt_primitive_mask.view(batch_size * num_proposal,
+                                                  1))[1]
+        else:
+            size_loss = center_loss.new_tensor(0.0)
+
+        # Semantic cls loss
+        sem_cls_loss = self.loss_semantic_cls(
+            semantic_scores, gt_sem_cls_label, weight=gt_primitive_mask)
+
+        return center_loss, size_loss, sem_cls_loss
+
+    def get_primitive_center(self, pred_flag: torch.Tensor,
+                             center: torch.Tensor) -> Tuple:
+        """Generate primitive center from predictions.
+
+        Args:
+            pred_flag (torch.Tensor): Scores of primitive center.
+            center (torch.Tensor): Pridictions of primitive center.
+
+        Returns:
+            Tuple: Primitive center and the prediction indices.
+        """
+        ind_normal = F.softmax(pred_flag, dim=1)
+        pred_indices = (ind_normal[:, 1, :] >
+                        self.surface_thresh).detach().float()
+        selected = (ind_normal[:, 1, :] <=
+                    self.surface_thresh).detach().float()
+        offset = torch.ones_like(center) * self.upper_thresh
+        center = center + offset * selected.unsqueeze(-1)
+        return center, pred_indices
+
+    def _assign_primitive_line_targets(self,
+                                       point_mask: torch.Tensor,
+                                       point_offset: torch.Tensor,
+                                       point_sem: torch.Tensor,
+                                       coords: torch.Tensor,
+                                       indices: torch.Tensor,
+                                       cls_label: int,
+                                       point2line_matching: torch.Tensor,
+                                       corners: torch.Tensor,
+                                       center_axises: torch.Tensor,
+                                       with_yaw: bool,
+                                       mode: str = 'bottom') -> Tuple:
+        """Generate targets of line primitive.
+
+        Args:
+            point_mask (torch.Tensor): Tensor to store the ground
+                truth of mask.
+            point_offset (torch.Tensor): Tensor to store the ground
+                truth of offset.
+            point_sem (torch.Tensor): Tensor to store the ground
+                truth of semantic.
+            coords (torch.Tensor): The selected points.
+            indices (torch.Tensor): Indices of the selected points.
+            cls_label (int): Class label of the ground truth bounding box.
+            point2line_matching (torch.Tensor): Flag indicate that
+                matching line of each point.
+            corners (torch.Tensor): Corners of the ground truth bounding box.
+            center_axises (list[int]): Indicate in which axis the line center
+                should be refined.
+            with_yaw (Bool): Whether the boundind box is with rotation.
+            mode (str, optional): Specify which line should be matched,
+                available mode are ('bottom', 'top', 'left', 'right').
+                Defaults to 'bottom'.
+
+        Returns:
+            Tuple: Targets of the line primitive.
+        """
+        corners_pair = {
+            'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]],
+            'top': [[1, 2], [5, 6], [1, 5], [2, 6]],
+            'left': [[0, 1], [3, 2]],
+            'right': [[4, 5], [7, 6]]
+        }
+        corners_pair = corners_pair[mode]
+        assert len(corners_pair) == len(point2line_matching) == len(
+            center_axises)
+        for line_select, center_axis, pair_index in zip(
+                point2line_matching, center_axises, corners_pair):
+            if line_select.sum() > self.train_cfg['num_point_line']:
+                point_mask[indices[line_select]] = 1.0
+
+                if with_yaw:
+                    line_center = (corners[pair_index[0]] +
+                                   corners[pair_index[1]]) / 2
+                else:
+                    line_center = coords[line_select].mean(dim=0)
+                    line_center[center_axis] = corners[:, center_axis].mean()
+
+                point_offset[indices[line_select]] = \
+                    line_center - coords[line_select]
+                point_sem[indices[line_select]] = \
+                    point_sem.new_tensor([line_center[0], line_center[1],
+                                          line_center[2], cls_label])
+        return point_mask, point_offset, point_sem
+
+    def _assign_primitive_surface_targets(self,
+                                          point_mask: torch.Tensor,
+                                          point_offset: torch.Tensor,
+                                          point_sem: torch.Tensor,
+                                          coords: torch.Tensor,
+                                          indices: torch.Tensor,
+                                          cls_label: int,
+                                          corners: torch.Tensor,
+                                          with_yaw: bool,
+                                          mode: str = 'bottom') -> Tuple:
+        """Generate targets for primitive z and primitive xy.
+
+        Args:
+            point_mask (torch.Tensor): Tensor to store the ground
+                truth of mask.
+            point_offset (torch.Tensor): Tensor to store the ground
+                truth of offset.
+            point_sem (torch.Tensor): Tensor to store the ground
+                truth of semantic.
+            coords (torch.Tensor): The selected points.
+            indices (torch.Tensor): Indices of the selected points.
+            cls_label (int): Class label of the ground truth bounding box.
+            corners (torch.Tensor): Corners of the ground truth bounding box.
+            with_yaw (Bool): Whether the boundind box is with rotation.
+            mode (str, optional): Specify which line should be matched,
+                available mode are ('bottom', 'top', 'left', 'right',
+                'front', 'back').
+                Defaults to 'bottom'.
+
+        Returns:
+            Tuple: Targets of the center primitive.
+        """
+        point_mask[indices] = 1.0
+        corners_pair = {
+            'bottom': [0, 7],
+            'top': [1, 6],
+            'left': [0, 1],
+            'right': [4, 5],
+            'front': [0, 1],
+            'back': [3, 2]
+        }
+        pair_index = corners_pair[mode]
+        if self.primitive_mode == 'z':
+            if with_yaw:
+                center = (corners[pair_index[0]] +
+                          corners[pair_index[1]]) / 2.0
+                center[2] = coords[:, 2].mean()
+                point_sem[indices] = point_sem.new_tensor([
+                    center[0], center[1],
+                    center[2], (corners[4] - corners[0]).norm(),
+                    (corners[3] - corners[0]).norm(), cls_label
+                ])
+            else:
+                center = point_mask.new_tensor([
+                    corners[:, 0].mean(), corners[:, 1].mean(),
+                    coords[:, 2].mean()
+                ])
+                point_sem[indices] = point_sem.new_tensor([
+                    center[0], center[1], center[2],
+                    corners[:, 0].max() - corners[:, 0].min(),
+                    corners[:, 1].max() - corners[:, 1].min(), cls_label
+                ])
+        elif self.primitive_mode == 'xy':
+            if with_yaw:
+                center = coords.mean(0)
+                center[2] = (corners[pair_index[0], 2] +
+                             corners[pair_index[1], 2]) / 2.0
+                point_sem[indices] = point_sem.new_tensor([
+                    center[0], center[1], center[2],
+                    corners[pair_index[1], 2] - corners[pair_index[0], 2],
+                    cls_label
+                ])
+            else:
+                center = point_mask.new_tensor([
+                    coords[:, 0].mean(), coords[:, 1].mean(),
+                    corners[:, 2].mean()
+                ])
+                point_sem[indices] = point_sem.new_tensor([
+                    center[0], center[1], center[2],
+                    corners[:, 2].max() - corners[:, 2].min(), cls_label
+                ])
+        point_offset[indices] = center - coords
+        return point_mask, point_offset, point_sem
+
+    def _get_plane_fomulation(self, vector1: torch.Tensor,
+                              vector2: torch.Tensor,
+                              point: torch.Tensor) -> torch.Tensor:
+        """Compute the equation of the plane.
+
+        Args:
+            vector1 (torch.Tensor): Parallel vector of the plane.
+            vector2 (torch.Tensor): Parallel vector of the plane.
+            point (torch.Tensor): Point on the plane.
+
+        Returns:
+            torch.Tensor: Equation of the plane.
+        """
+        surface_norm = torch.cross(vector1, vector2)
+        surface_dis = -torch.dot(surface_norm, point)
+        plane = point.new_tensor(
+            [surface_norm[0], surface_norm[1], surface_norm[2], surface_dis])
+        return plane
diff --git a/mmde/mmdet3d/models/roi_heads/part_aggregation_roi_head.py b/mmde/mmdet3d/models/roi_heads/part_aggregation_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..82816b3442eb927b4c9acbfdb4c6bf95b5c3d2f9
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/part_aggregation_roi_head.py
@@ -0,0 +1,379 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+from mmdet.models.task_modules import AssignResult, SamplingResult
+from mmengine import ConfigDict
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import bbox3d2roi
+from mmdet3d.utils import InstanceList
+from ...structures.det3d_data_sample import SampleList
+from .base_3droi_head import Base3DRoIHead
+
+
+@MODELS.register_module()
+class PartAggregationROIHead(Base3DRoIHead):
+    """Part aggregation roi head for PartA2.
+
+    Args:
+        semantic_head (ConfigDict): Config of semantic head.
+        num_classes (int): The number of classes.
+        seg_roi_extractor (ConfigDict): Config of seg_roi_extractor.
+        bbox_roi_extractor (ConfigDict): Config of part_roi_extractor.
+        bbox_head (ConfigDict): Config of bbox_head.
+        train_cfg (ConfigDict): Training config.
+        test_cfg (ConfigDict): Testing config.
+    """
+
+    def __init__(self,
+                 semantic_head: dict,
+                 num_classes: int = 3,
+                 seg_roi_extractor: dict = None,
+                 bbox_head: dict = None,
+                 bbox_roi_extractor: dict = None,
+                 train_cfg: dict = None,
+                 test_cfg: dict = None,
+                 init_cfg: dict = None) -> None:
+        super(PartAggregationROIHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        self.num_classes = num_classes
+        assert semantic_head is not None
+        self.init_seg_head(seg_roi_extractor, semantic_head)
+
+    def init_seg_head(self, seg_roi_extractor: dict,
+                      semantic_head: dict) -> None:
+        """Initialize semantic head and seg roi extractor.
+
+        Args:
+            seg_roi_extractor (dict): Config of seg
+                roi extractor.
+            semantic_head (dict): Config of semantic head.
+        """
+        self.semantic_head = MODELS.build(semantic_head)
+        self.seg_roi_extractor = MODELS.build(seg_roi_extractor)
+
+    @property
+    def with_semantic(self):
+        """bool: whether the head has semantic branch"""
+        return hasattr(self,
+                       'semantic_head') and self.semantic_head is not None
+
+    def _bbox_forward_train(self, feats_dict: Dict, voxels_dict: Dict,
+                            sampling_results: List[SamplingResult]) -> Dict:
+        """Forward training function of roi_extractor and bbox_head.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            voxels_dict (dict): Contains information of voxels.
+            sampling_results (:obj:`SamplingResult`): Sampled results used
+                for training.
+
+        Returns:
+            dict: Forward results including losses and predictions.
+        """
+        rois = bbox3d2roi([res.bboxes for res in sampling_results])
+        bbox_results = self._bbox_forward(feats_dict, voxels_dict, rois)
+
+        bbox_targets = self.bbox_head.get_targets(sampling_results,
+                                                  self.train_cfg)
+        loss_bbox = self.bbox_head.loss(bbox_results['cls_score'],
+                                        bbox_results['bbox_pred'], rois,
+                                        *bbox_targets)
+
+        bbox_results.update(loss_bbox=loss_bbox)
+        return bbox_results
+
+    def _assign_and_sample(
+            self, rpn_results_list: InstanceList,
+            batch_gt_instances_3d: InstanceList,
+            batch_gt_instances_ignore: InstanceList) -> List[SamplingResult]:
+        """Assign and sample proposals for training.
+
+        Args:
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_gt_instances_ignore (list): Ignore instances of gt bboxes.
+
+        Returns:
+            list[:obj:`SamplingResult`]: Sampled results of each training
+                sample.
+        """
+        sampling_results = []
+        # bbox assign
+        for batch_idx in range(len(rpn_results_list)):
+            cur_proposal_list = rpn_results_list[batch_idx]
+            cur_boxes = cur_proposal_list['bboxes_3d']
+            cur_labels_3d = cur_proposal_list['labels_3d']
+            cur_gt_instances_3d = batch_gt_instances_3d[batch_idx]
+            cur_gt_instances_ignore = batch_gt_instances_ignore[batch_idx]
+            cur_gt_instances_3d.bboxes_3d = cur_gt_instances_3d.\
+                bboxes_3d.tensor
+            cur_gt_bboxes = cur_gt_instances_3d.bboxes_3d.to(cur_boxes.device)
+            cur_gt_labels = cur_gt_instances_3d.labels_3d
+
+            batch_num_gts = 0
+            # 0 is bg
+            batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0)
+            batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes))
+            # -1 is bg
+            batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1)
+
+            # each class may have its own assigner
+            if isinstance(self.bbox_assigner, list):
+                for i, assigner in enumerate(self.bbox_assigner):
+                    gt_per_cls = (cur_gt_labels == i)
+                    pred_per_cls = (cur_labels_3d == i)
+                    cur_assign_res = assigner.assign(
+                        cur_proposal_list[pred_per_cls],
+                        cur_gt_instances_3d[gt_per_cls],
+                        cur_gt_instances_ignore)
+                    # gather assign_results in different class into one result
+                    batch_num_gts += cur_assign_res.num_gts
+                    # gt inds (1-based)
+                    gt_inds_arange_pad = gt_per_cls.nonzero(
+                        as_tuple=False).view(-1) + 1
+                    # pad 0 for indice unassigned
+                    gt_inds_arange_pad = F.pad(
+                        gt_inds_arange_pad, (1, 0), mode='constant', value=0)
+                    # pad -1 for indice ignore
+                    gt_inds_arange_pad = F.pad(
+                        gt_inds_arange_pad, (1, 0), mode='constant', value=-1)
+                    # convert to 0~gt_num+2 for indices
+                    gt_inds_arange_pad += 1
+                    # now 0 is bg, >1 is fg in batch_gt_indis
+                    batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[
+                        cur_assign_res.gt_inds + 1] - 1
+                    batch_max_overlaps[
+                        pred_per_cls] = cur_assign_res.max_overlaps
+                    batch_gt_labels[pred_per_cls] = cur_assign_res.labels
+
+                assign_result = AssignResult(batch_num_gts, batch_gt_indis,
+                                             batch_max_overlaps,
+                                             batch_gt_labels)
+            else:  # for single class
+                assign_result = self.bbox_assigner.assign(
+                    cur_proposal_list, cur_gt_instances_3d,
+                    cur_gt_instances_ignore)
+            # sample boxes
+            sampling_result = self.bbox_sampler.sample(assign_result,
+                                                       cur_boxes.tensor,
+                                                       cur_gt_bboxes,
+                                                       cur_gt_labels)
+            sampling_results.append(sampling_result)
+        return sampling_results
+
+    def _semantic_forward_train(self, feats_dict: dict, voxel_dict: dict,
+                                batch_gt_instances_3d: InstanceList) -> Dict:
+        """Train semantic head.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            voxel_dict (dict): Contains information of voxels.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+
+        Returns:
+            dict: Segmentation results including losses
+        """
+        semantic_results = self.semantic_head(feats_dict['seg_features'])
+        semantic_targets = self.semantic_head.get_targets(
+            voxel_dict, batch_gt_instances_3d)
+        loss_semantic = self.semantic_head.loss(semantic_results,
+                                                semantic_targets)
+        semantic_results.update(loss_semantic=loss_semantic)
+        return semantic_results
+
+    def predict(self,
+                feats_dict: Dict,
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList,
+                rescale: bool = False,
+                **kwargs) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented in PartA2.'
+        assert self.with_semantic, 'Semantic head must be implemented' \
+                                   ' in PartA2.'
+
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        voxels_dict = feats_dict.pop('voxels_dict')
+        # TODO: Split predict semantic and bbox
+        results_list = self.predict_bbox(feats_dict, voxels_dict,
+                                         batch_input_metas, rpn_results_list,
+                                         self.test_cfg)
+        return results_list
+
+    def predict_bbox(self, feats_dict: Dict, voxel_dict: Dict,
+                     batch_input_metas: List[dict],
+                     rpn_results_list: InstanceList,
+                     test_cfg: ConfigDict) -> InstanceList:
+        """Perform forward propagation of the bbox head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            voxel_dict (dict): Contains information of voxels.
+            batch_input_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            test_cfg (Config): Test config.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        semantic_results = self.semantic_head(feats_dict['seg_features'])
+        feats_dict.update(semantic_results)
+        rois = bbox3d2roi(
+            [res['bboxes_3d'].tensor for res in rpn_results_list])
+        labels_3d = [res['labels_3d'] for res in rpn_results_list]
+        cls_preds = [res['cls_preds'] for res in rpn_results_list]
+        bbox_results = self._bbox_forward(feats_dict, voxel_dict, rois)
+
+        bbox_list = self.bbox_head.get_results(rois, bbox_results['cls_score'],
+                                               bbox_results['bbox_pred'],
+                                               labels_3d, cls_preds,
+                                               batch_input_metas, test_cfg)
+        return bbox_list
+
+    def _bbox_forward(self, feats_dict: Dict, voxel_dict: Dict,
+                      rois: Tensor) -> Dict:
+        """Forward function of roi_extractor and bbox_head used in both
+        training and testing.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            voxel_dict (dict): Contains information of voxels.
+            rois (Tensor): Roi boxes.
+
+        Returns:
+            dict: Contains predictions of bbox_head and
+                features of roi_extractor.
+        """
+        pooled_seg_feats = self.seg_roi_extractor(feats_dict['seg_features'],
+                                                  voxel_dict['voxel_centers'],
+                                                  voxel_dict['coors'][...,
+                                                                      0], rois)
+        pooled_part_feats = self.bbox_roi_extractor(
+            feats_dict['part_feats'], voxel_dict['voxel_centers'],
+            voxel_dict['coors'][..., 0], rois)
+        cls_score, bbox_pred = self.bbox_head(pooled_seg_feats,
+                                              pooled_part_feats)
+
+        bbox_results = dict(
+            cls_score=cls_score,
+            bbox_pred=bbox_pred,
+            pooled_seg_feats=pooled_seg_feats,
+            pooled_part_feats=pooled_part_feats)
+        return bbox_results
+
+    def loss(self, feats_dict: Dict, rpn_results_list: InstanceList,
+             batch_data_samples: SampleList, **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        assert len(rpn_results_list) == len(batch_data_samples)
+        losses = dict()
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        voxels_dict = feats_dict.pop('voxels_dict')
+        for data_sample in batch_data_samples:
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            if 'ignored_instances' in data_sample:
+                batch_gt_instances_ignore.append(data_sample.ignored_instances)
+            else:
+                batch_gt_instances_ignore.append(None)
+        if self.with_semantic:
+            semantic_results = self._semantic_forward_train(
+                feats_dict, voxels_dict, batch_gt_instances_3d)
+            losses.update(semantic_results.pop('loss_semantic'))
+
+        sample_results = self._assign_and_sample(rpn_results_list,
+                                                 batch_gt_instances_3d,
+                                                 batch_gt_instances_ignore)
+        if self.with_bbox:
+            feats_dict.update(semantic_results)
+            bbox_results = self._bbox_forward_train(feats_dict, voxels_dict,
+                                                    sample_results)
+            losses.update(bbox_results['loss_bbox'])
+
+        return losses
+
+    def _forward(self, feats_dict: dict,
+                 rpn_results_list: InstanceList) -> Tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+
+        Returns:
+            tuple: A tuple of results from roi head.
+        """
+        voxel_dict = feats_dict.pop('voxel_dict')
+        semantic_results = self.semantic_head(feats_dict['seg_features'])
+        feats_dict.update(semantic_results)
+        rois = bbox3d2roi([res['bbox_3d'].tensor for res in rpn_results_list])
+        bbox_results = self._bbox_forward(feats_dict, voxel_dict, rois)
+        cls_score = bbox_results['cls_score']
+        bbox_pred = bbox_results['bbox_pred']
+        return cls_score, bbox_pred
diff --git a/mmde/mmdet3d/models/roi_heads/point_rcnn_roi_head.py b/mmde/mmdet3d/models/roi_heads/point_rcnn_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..454e2f30cf3eb43acad961c408c0be41e7a8d227
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/point_rcnn_roi_head.py
@@ -0,0 +1,309 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional
+
+import torch
+from mmdet.models.task_modules import AssignResult
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures import bbox3d2roi
+from mmdet3d.utils.typing_utils import InstanceList, SampleList
+from .base_3droi_head import Base3DRoIHead
+
+
+@MODELS.register_module()
+class PointRCNNRoIHead(Base3DRoIHead):
+    """RoI head for PointRCNN.
+
+    Args:
+        bbox_head (dict): Config of bbox_head.
+        bbox_roi_extractor (dict): Config of RoI extractor.
+        train_cfg (dict): Train configs.
+        test_cfg (dict): Test configs.
+        depth_normalizer (float): Normalize depth feature.
+            Defaults to 70.0.
+        init_cfg (dict, optional): Config of initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 bbox_head: dict,
+                 bbox_roi_extractor: dict,
+                 train_cfg: dict,
+                 test_cfg: dict,
+                 depth_normalizer: dict = 70.0,
+                 init_cfg: Optional[dict] = None) -> None:
+        super(PointRCNNRoIHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        self.depth_normalizer = depth_normalizer
+
+        self.init_assigner_sampler()
+
+    def init_mask_head(self):
+        """Initialize maek head."""
+        pass
+
+    def init_assigner_sampler(self):
+        """Initialize assigner and sampler."""
+        self.bbox_assigner = None
+        self.bbox_sampler = None
+        if self.train_cfg:
+            if isinstance(self.train_cfg.assigner, dict):
+                self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner)
+            elif isinstance(self.train_cfg.assigner, list):
+                self.bbox_assigner = [
+                    TASK_UTILS.build(res) for res in self.train_cfg.assigner
+                ]
+            self.bbox_sampler = TASK_UTILS.build(self.train_cfg.sampler)
+
+    def loss(self, feats_dict: Dict, rpn_results_list: InstanceList,
+             batch_data_samples: SampleList, **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        roi on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components
+        """
+        features = feats_dict['fp_features']
+        fp_points = feats_dict['fp_points']
+        point_cls_preds = feats_dict['points_cls_preds']
+        sem_scores = point_cls_preds.sigmoid()
+        point_scores = sem_scores.max(-1)[0]
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        for data_sample in batch_data_samples:
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            if 'ignored_instances' in data_sample:
+                batch_gt_instances_ignore.append(data_sample.ignored_instances)
+            else:
+                batch_gt_instances_ignore.append(None)
+        sample_results = self._assign_and_sample(rpn_results_list,
+                                                 batch_gt_instances_3d,
+                                                 batch_gt_instances_ignore)
+
+        # concat the depth, semantic features and backbone features
+        features = features.transpose(1, 2).contiguous()
+        point_depths = fp_points.norm(dim=2) / self.depth_normalizer - 0.5
+        features_list = [
+            point_scores.unsqueeze(2),
+            point_depths.unsqueeze(2), features
+        ]
+        features = torch.cat(features_list, dim=2)
+
+        bbox_results = self._bbox_forward_train(features, fp_points,
+                                                sample_results)
+        losses = dict()
+        losses.update(bbox_results['loss_bbox'])
+
+        return losses
+
+    def predict(self,
+                feats_dict: Dict,
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList,
+                rescale: bool = False,
+                **kwargs) -> InstanceList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains features from the first stage.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        rois = bbox3d2roi(
+            [res['bboxes_3d'].tensor for res in rpn_results_list])
+        labels_3d = [res['labels_3d'] for res in rpn_results_list]
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        fp_features = feats_dict['fp_features']
+        fp_points = feats_dict['fp_points']
+        point_cls_preds = feats_dict['points_cls_preds']
+        sem_scores = point_cls_preds.sigmoid()
+        point_scores = sem_scores.max(-1)[0]
+
+        features = fp_features.transpose(1, 2).contiguous()
+        point_depths = fp_points.norm(dim=2) / self.depth_normalizer - 0.5
+        features_list = [
+            point_scores.unsqueeze(2),
+            point_depths.unsqueeze(2), features
+        ]
+
+        features = torch.cat(features_list, dim=2)
+        batch_size = features.shape[0]
+        bbox_results = self._bbox_forward(features, fp_points, batch_size,
+                                          rois)
+        object_score = bbox_results['cls_score'].sigmoid()
+        bbox_list = self.bbox_head.get_results(
+            rois,
+            object_score,
+            bbox_results['bbox_pred'],
+            labels_3d,
+            batch_input_metas,
+            cfg=self.test_cfg)
+
+        return bbox_list
+
+    def _bbox_forward_train(self, features: Tensor, points: Tensor,
+                            sampling_results: SampleList) -> dict:
+        """Forward training function of roi_extractor and bbox_head.
+
+        Args:
+            features (torch.Tensor): Backbone features with depth and \
+                semantic features.
+            points (torch.Tensor): Point cloud.
+            sampling_results (:obj:`SamplingResult`): Sampled results used
+                for training.
+
+        Returns:
+            dict: Forward results including losses and predictions.
+        """
+        rois = bbox3d2roi([res.bboxes for res in sampling_results])
+        batch_size = features.shape[0]
+        bbox_results = self._bbox_forward(features, points, batch_size, rois)
+        bbox_targets = self.bbox_head.get_targets(sampling_results,
+                                                  self.train_cfg)
+
+        loss_bbox = self.bbox_head.loss(bbox_results['cls_score'],
+                                        bbox_results['bbox_pred'], rois,
+                                        *bbox_targets)
+
+        bbox_results.update(loss_bbox=loss_bbox)
+        return bbox_results
+
+    def _bbox_forward(self, features: Tensor, points: Tensor, batch_size: int,
+                      rois: Tensor) -> dict:
+        """Forward function of roi_extractor and bbox_head used in both
+        training and testing.
+
+        Args:
+            features (torch.Tensor): Backbone features with depth and
+                semantic features.
+            points (torch.Tensor): Point cloud.
+            batch_size (int): Batch size.
+            rois (torch.Tensor): RoI boxes.
+
+        Returns:
+            dict: Contains predictions of bbox_head and
+                features of roi_extractor.
+        """
+        pooled_point_feats = self.bbox_roi_extractor(features, points,
+                                                     batch_size, rois)
+
+        cls_score, bbox_pred = self.bbox_head(pooled_point_feats)
+        bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred)
+        return bbox_results
+
+    def _assign_and_sample(
+            self, rpn_results_list: InstanceList,
+            batch_gt_instances_3d: InstanceList,
+            batch_gt_instances_ignore: InstanceList) -> SampleList:
+        """Assign and sample proposals for training.
+
+        Args:
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`]): Ignore
+                instances of gt bboxes.
+
+        Returns:
+            list[:obj:`SamplingResult`]: Sampled results of each training
+                sample.
+        """
+        sampling_results = []
+        # bbox assign
+        for batch_idx in range(len(rpn_results_list)):
+            cur_proposal_list = rpn_results_list[batch_idx]
+            cur_boxes = cur_proposal_list['bboxes_3d']
+            cur_labels_3d = cur_proposal_list['labels_3d']
+            cur_gt_instances_3d = batch_gt_instances_3d[batch_idx]
+            cur_gt_instances_3d.bboxes_3d = cur_gt_instances_3d.\
+                bboxes_3d.tensor
+            cur_gt_instances_ignore = batch_gt_instances_ignore[batch_idx]
+            cur_gt_bboxes = cur_gt_instances_3d.bboxes_3d.to(cur_boxes.device)
+            cur_gt_labels = cur_gt_instances_3d.labels_3d
+            batch_num_gts = 0
+            # 0 is bg
+            batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0)
+            batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes))
+            # -1 is bg
+            batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1)
+
+            # each class may have its own assigner
+            if isinstance(self.bbox_assigner, list):
+                for i, assigner in enumerate(self.bbox_assigner):
+                    gt_per_cls = (cur_gt_labels == i)
+                    pred_per_cls = (cur_labels_3d == i)
+                    cur_assign_res = assigner.assign(
+                        cur_proposal_list[pred_per_cls],
+                        cur_gt_instances_3d[gt_per_cls],
+                        cur_gt_instances_ignore)
+                    # gather assign_results in different class into one result
+                    batch_num_gts += cur_assign_res.num_gts
+                    # gt inds (1-based)
+                    gt_inds_arange_pad = gt_per_cls.nonzero(
+                        as_tuple=False).view(-1) + 1
+                    # pad 0 for indice unassigned
+                    gt_inds_arange_pad = F.pad(
+                        gt_inds_arange_pad, (1, 0), mode='constant', value=0)
+                    # pad -1 for indice ignore
+                    gt_inds_arange_pad = F.pad(
+                        gt_inds_arange_pad, (1, 0), mode='constant', value=-1)
+                    # convert to 0~gt_num+2 for indices
+                    gt_inds_arange_pad += 1
+                    # now 0 is bg, >1 is fg in batch_gt_indis
+                    batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[
+                        cur_assign_res.gt_inds + 1] - 1
+                    batch_max_overlaps[
+                        pred_per_cls] = cur_assign_res.max_overlaps
+                    batch_gt_labels[pred_per_cls] = cur_assign_res.labels
+
+                assign_result = AssignResult(batch_num_gts, batch_gt_indis,
+                                             batch_max_overlaps,
+                                             batch_gt_labels)
+            else:  # for single class
+                assign_result = self.bbox_assigner.assign(
+                    cur_proposal_list, cur_gt_instances_3d,
+                    cur_gt_instances_ignore)
+
+            # sample boxes
+            sampling_result = self.bbox_sampler.sample(assign_result,
+                                                       cur_boxes.tensor,
+                                                       cur_gt_bboxes,
+                                                       cur_gt_labels)
+            sampling_results.append(sampling_result)
+        return sampling_results
diff --git a/mmde/mmdet3d/models/roi_heads/pv_rcnn_roi_head.py b/mmde/mmdet3d/models/roi_heads/pv_rcnn_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c6011195c2d38c7a4eb0b9bfe7e5aeb1212263b
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/pv_rcnn_roi_head.py
@@ -0,0 +1,312 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+import torch
+from mmdet.models.task_modules import AssignResult
+from mmdet.models.task_modules.samplers import SamplingResult
+from torch.nn import functional as F
+
+from mmdet3d.models.roi_heads.base_3droi_head import Base3DRoIHead
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import bbox3d2roi
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import InstanceList
+
+
+@MODELS.register_module()
+class PVRCNNRoiHead(Base3DRoIHead):
+    """RoI head for PV-RCNN.
+
+    Args:
+        num_classes (int): The number of classes. Defaults to 3.
+        semantic_head (dict, optional): Config of semantic head.
+            Defaults to None.
+        bbox_roi_extractor (dict, optional): Config of roi_extractor.
+            Defaults to None.
+        bbox_head (dict, optional): Config of bbox_head. Defaults to None.
+        train_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        test_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: int = 3,
+                 semantic_head: Optional[dict] = None,
+                 bbox_roi_extractor: Optional[dict] = None,
+                 bbox_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None):
+        super(PVRCNNRoiHead, self).__init__(
+            bbox_head=bbox_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.semantic_head = MODELS.build(semantic_head)
+
+        self.init_assigner_sampler()
+
+    @property
+    def with_semantic(self):
+        """bool: whether the head has semantic branch"""
+        return hasattr(self,
+                       'semantic_head') and self.semantic_head is not None
+
+    def loss(self, feats_dict: dict, rpn_results_list: InstanceList,
+             batch_data_samples: SampleList, **kwargs) -> dict:
+        """Training forward function of PVRCNNROIHead.
+
+        Args:
+            feats_dict (dict): Contains point-wise features.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            dict: losses from each head.
+
+            - loss_semantic (torch.Tensor): loss of semantic head.
+            - loss_bbox (torch.Tensor): loss of bboxes.
+            - loss_cls (torch.Tensor): loss of object classification.
+            - loss_corner (torch.Tensor): loss of bboxes corners.
+        """
+        losses = dict()
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        for data_sample in batch_data_samples:
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            if 'ignored_instances' in data_sample:
+                batch_gt_instances_ignore.append(data_sample.ignored_instances)
+            else:
+                batch_gt_instances_ignore.append(None)
+        if self.with_semantic:
+            semantic_results = self._semantic_forward_train(
+                feats_dict['keypoint_features'], feats_dict['keypoints'],
+                batch_gt_instances_3d)
+            losses['loss_semantic'] = semantic_results['loss_semantic']
+
+        sample_results = self._assign_and_sample(rpn_results_list,
+                                                 batch_gt_instances_3d)
+        if self.with_bbox:
+            bbox_results = self._bbox_forward_train(
+                semantic_results['seg_preds'],
+                feats_dict['fusion_keypoint_features'],
+                feats_dict['keypoints'], sample_results)
+            losses.update(bbox_results['loss_bbox'])
+
+        return losses
+
+    def predict(self, feats_dict: dict, rpn_results_list: InstanceList,
+                batch_data_samples: SampleList, **kwargs) -> SampleList:
+        """Perform forward propagation of the roi head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            feats_dict (dict): Contains point-wise features.
+            rpn_results_list (List[:obj:`InstanceData`]): Detection results
+                of rpn head.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 7.
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        assert self.with_semantic, 'Semantic head must be implemented.'
+
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        semantic_results = self.semantic_head(feats_dict['keypoint_features'])
+        point_features = feats_dict[
+            'fusion_keypoint_features'] * semantic_results[
+                'seg_preds'].sigmoid().max(
+                    dim=-1, keepdim=True).values
+        rois = bbox3d2roi(
+            [res['bboxes_3d'].tensor for res in rpn_results_list])
+        labels_3d = [res['labels_3d'] for res in rpn_results_list]
+        bbox_results = self._bbox_forward(point_features,
+                                          feats_dict['keypoints'], rois)
+
+        results_list = self.bbox_head.get_results(rois,
+                                                  bbox_results['bbox_scores'],
+                                                  bbox_results['bbox_reg'],
+                                                  labels_3d, batch_input_metas,
+                                                  self.test_cfg)
+        return results_list
+
+    def _bbox_forward_train(self, seg_preds: torch.Tensor,
+                            keypoint_features: torch.Tensor,
+                            keypoints: torch.Tensor,
+                            sampling_results: SamplingResult) -> dict:
+        """Forward training function of roi_extractor and bbox_head.
+
+        Args:
+            seg_preds (torch.Tensor): Point-wise semantic features.
+            keypoint_features (torch.Tensor): key points features
+                from points encoder.
+            keypoints (torch.Tensor): Coordinate of key points.
+            sampling_results (:obj:`SamplingResult`): Sampled results used
+                for training.
+
+        Returns:
+            dict: Forward results including losses and predictions.
+        """
+        rois = bbox3d2roi([res.bboxes for res in sampling_results])
+        keypoint_features = keypoint_features * seg_preds.sigmoid().max(
+            dim=-1, keepdim=True).values
+        bbox_results = self._bbox_forward(keypoint_features, keypoints, rois)
+
+        bbox_targets = self.bbox_head.get_targets(sampling_results,
+                                                  self.train_cfg)
+        loss_bbox = self.bbox_head.loss(bbox_results['bbox_scores'],
+                                        bbox_results['bbox_reg'], rois,
+                                        *bbox_targets)
+
+        bbox_results.update(loss_bbox=loss_bbox)
+        return bbox_results
+
+    def _bbox_forward(self, keypoint_features: torch.Tensor,
+                      keypoints: torch.Tensor, rois: torch.Tensor) -> dict:
+        """Forward function of roi_extractor and bbox_head used in both
+        training and testing.
+
+        Args:
+            rois (Tensor): Roi boxes.
+            keypoint_features (torch.Tensor): key points features
+                from points encoder.
+            keypoints (torch.Tensor): Coordinate of key points.
+            rois (Tensor): Roi boxes.
+
+        Returns:
+            dict: Contains predictions of bbox_head and
+                features of roi_extractor.
+        """
+        pooled_keypoint_features = self.bbox_roi_extractor(
+            keypoint_features, keypoints[..., 1:], keypoints[..., 0].int(),
+            rois)
+        bbox_score, bbox_reg = self.bbox_head(pooled_keypoint_features)
+
+        bbox_results = dict(bbox_scores=bbox_score, bbox_reg=bbox_reg)
+        return bbox_results
+
+    def _assign_and_sample(
+            self, proposal_list: InstanceList,
+            batch_gt_instances_3d: InstanceList) -> List[SamplingResult]:
+        """Assign and sample proposals for training.
+
+        Args:
+            proposal_list (list[:obj:`InstancesData`]): Proposals produced by
+                rpn head.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+
+        Returns:
+            list[:obj:`SamplingResult`]: Sampled results of each training
+                sample.
+        """
+        sampling_results = []
+        # bbox assign
+        for batch_idx in range(len(proposal_list)):
+            cur_proposal_list = proposal_list[batch_idx]
+            cur_boxes = cur_proposal_list['bboxes_3d']
+            cur_labels_3d = cur_proposal_list['labels_3d']
+            cur_gt_instances_3d = batch_gt_instances_3d[batch_idx]
+            cur_gt_instances_3d.bboxes_3d = cur_gt_instances_3d.\
+                bboxes_3d.tensor
+            cur_gt_bboxes = batch_gt_instances_3d[batch_idx].bboxes_3d.to(
+                cur_boxes.device)
+            cur_gt_labels = batch_gt_instances_3d[batch_idx].labels_3d
+
+            batch_num_gts = 0
+            # 0 is bg
+            batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0)
+            batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes))
+            # -1 is bg
+            batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1)
+
+            # each class may have its own assigner
+            if isinstance(self.bbox_assigner, list):
+                for i, assigner in enumerate(self.bbox_assigner):
+                    gt_per_cls = (cur_gt_labels == i)
+                    pred_per_cls = (cur_labels_3d == i)
+                    cur_assign_res = assigner.assign(
+                        cur_proposal_list[pred_per_cls],
+                        cur_gt_instances_3d[gt_per_cls])
+                    # gather assign_results in different class into one result
+                    batch_num_gts += cur_assign_res.num_gts
+                    # gt inds (1-based)
+                    gt_inds_arange_pad = gt_per_cls.nonzero(
+                        as_tuple=False).view(-1) + 1
+                    # pad 0 for indice unassigned
+                    gt_inds_arange_pad = F.pad(
+                        gt_inds_arange_pad, (1, 0), mode='constant', value=0)
+                    # pad -1 for indice ignore
+                    gt_inds_arange_pad = F.pad(
+                        gt_inds_arange_pad, (1, 0), mode='constant', value=-1)
+                    # convert to 0~gt_num+2 for indices
+                    gt_inds_arange_pad += 1
+                    # now 0 is bg, >1 is fg in batch_gt_indis
+                    batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[
+                        cur_assign_res.gt_inds + 1] - 1
+                    batch_max_overlaps[
+                        pred_per_cls] = cur_assign_res.max_overlaps
+                    batch_gt_labels[pred_per_cls] = cur_assign_res.labels
+
+                assign_result = AssignResult(batch_num_gts, batch_gt_indis,
+                                             batch_max_overlaps,
+                                             batch_gt_labels)
+            else:  # for single class
+                assign_result = self.bbox_assigner.assign(
+                    cur_proposal_list, cur_gt_instances_3d)
+            # sample boxes
+            sampling_result = self.bbox_sampler.sample(assign_result,
+                                                       cur_boxes.tensor,
+                                                       cur_gt_bboxes,
+                                                       cur_gt_labels)
+            sampling_results.append(sampling_result)
+        return sampling_results
+
+    def _semantic_forward_train(self, keypoint_features: torch.Tensor,
+                                keypoints: torch.Tensor,
+                                batch_gt_instances_3d: InstanceList) -> dict:
+        """Train semantic head.
+
+        Args:
+            keypoint_features (torch.Tensor): key points features
+                from points encoder.
+            keypoints (torch.Tensor): Coordinate of key points.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+
+        Returns:
+            dict: Segmentation results including losses
+        """
+        semantic_results = self.semantic_head(keypoint_features)
+        semantic_targets = self.semantic_head.get_targets(
+            keypoints, batch_gt_instances_3d)
+        loss_semantic = self.semantic_head.loss(semantic_results,
+                                                semantic_targets)
+        semantic_results.update(loss_semantic)
+        return semantic_results
diff --git a/mmde/mmdet3d/models/roi_heads/roi_extractors/__init__.py b/mmde/mmdet3d/models/roi_heads/roi_extractors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f10e7179c72c92c3536bf4750f64b138238e80da
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/roi_extractors/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.roi_heads.roi_extractors import SingleRoIExtractor
+
+from .batch_roigridpoint_extractor import Batch3DRoIGridExtractor
+from .single_roiaware_extractor import Single3DRoIAwareExtractor
+from .single_roipoint_extractor import Single3DRoIPointExtractor
+
+__all__ = [
+    'SingleRoIExtractor', 'Single3DRoIAwareExtractor',
+    'Single3DRoIPointExtractor', 'Batch3DRoIGridExtractor'
+]
diff --git a/mmde/mmdet3d/models/roi_heads/roi_extractors/batch_roigridpoint_extractor.py b/mmde/mmdet3d/models/roi_heads/roi_extractors/batch_roigridpoint_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d4825f31fdb667e5fc0dbc9b31376a13c5e33b0
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/roi_extractors/batch_roigridpoint_extractor.py
@@ -0,0 +1,97 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.bbox_3d import rotation_3d_in_axis
+
+
+@MODELS.register_module()
+class Batch3DRoIGridExtractor(BaseModule):
+    """Grid point wise roi-aware Extractor.
+
+    Args:
+        grid_size (int): The number of grid points in a roi bbox.
+            Defaults to 6.
+        roi_layer (dict, optional): Config of sa module to get
+            grid points features. Defaults to None.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+    """
+
+    def __init__(self,
+                 grid_size: int = 6,
+                 roi_layer: dict = None,
+                 init_cfg: dict = None) -> None:
+        super(Batch3DRoIGridExtractor, self).__init__(init_cfg=init_cfg)
+        self.roi_grid_pool_layer = MODELS.build(roi_layer)
+        self.grid_size = grid_size
+
+    def forward(self, feats: torch.Tensor, coordinate: torch.Tensor,
+                batch_inds: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
+        """Forward roi extractor to extract grid points feature.
+
+        Args:
+            feats (torch.Tensor): Key points features.
+            coordinate (torch.Tensor): Key points coordinates.
+            batch_inds (torch.Tensor): Input batch indexes.
+            rois (torch.Tensor): Detection results of rpn head.
+
+        Returns:
+            torch.Tensor: Grid points features.
+        """
+        batch_size = int(batch_inds.max()) + 1
+
+        xyz = coordinate
+        xyz_batch_cnt = xyz.new_zeros(batch_size).int()
+        for k in range(batch_size):
+            xyz_batch_cnt[k] = (batch_inds == k).sum()
+
+        rois_batch_inds = rois[:, 0].int()
+        # (N1+N2+..., 6x6x6, 3)
+        roi_grid = self.get_dense_grid_points(rois[:, 1:])
+
+        new_xyz = roi_grid.view(-1, 3)
+        new_xyz_batch_cnt = new_xyz.new_zeros(batch_size).int()
+        for k in range(batch_size):
+            new_xyz_batch_cnt[k] = ((rois_batch_inds == k).sum() *
+                                    roi_grid.size(1))
+        pooled_points, pooled_features = self.roi_grid_pool_layer(
+            xyz=xyz.contiguous(),
+            xyz_batch_cnt=xyz_batch_cnt,
+            new_xyz=new_xyz.contiguous(),
+            new_xyz_batch_cnt=new_xyz_batch_cnt,
+            features=feats.contiguous())  # (M1 + M2 ..., C)
+
+        pooled_features = pooled_features.view(-1, self.grid_size,
+                                               self.grid_size, self.grid_size,
+                                               pooled_features.shape[-1])
+        # (BxN, 6, 6, 6, C)
+        return pooled_features
+
+    def get_dense_grid_points(self, rois: torch.Tensor) -> torch.Tensor:
+        """Get dense grid points from rois.
+
+        Args:
+            rois (torch.Tensor): Detection results of rpn head.
+
+        Returns:
+            torch.Tensor: Grid points coordinates.
+        """
+        rois_bbox = rois.clone()
+        rois_bbox[:, 2] += rois_bbox[:, 5] / 2
+        faked_features = rois_bbox.new_ones(
+            (self.grid_size, self.grid_size, self.grid_size))
+        dense_idx = faked_features.nonzero()
+        dense_idx = dense_idx.repeat(rois_bbox.size(0), 1, 1).float()
+        dense_idx = ((dense_idx + 0.5) / self.grid_size)
+        dense_idx[..., :3] -= 0.5
+
+        roi_ctr = rois_bbox[:, :3]
+        roi_dim = rois_bbox[:, 3:6]
+        roi_grid_points = dense_idx * roi_dim.view(-1, 1, 3)
+        roi_grid_points = rotation_3d_in_axis(
+            roi_grid_points, rois_bbox[:, 6], axis=2)
+        roi_grid_points += roi_ctr.view(-1, 1, 3)
+
+        return roi_grid_points
diff --git a/mmde/mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py b/mmde/mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..00756cc3db1a5fcc1f09e4ef2d7088b86a4a10b0
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py
@@ -0,0 +1,61 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from mmcv import ops
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class Single3DRoIAwareExtractor(BaseModule):
+    """Point-wise roi-aware Extractor.
+
+    Extract Point-wise roi features.
+
+    Args:
+        roi_layer (dict, optional): The config of roi layer.
+    """
+
+    def __init__(self,
+                 roi_layer: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None) -> None:
+        super(Single3DRoIAwareExtractor, self).__init__(init_cfg=init_cfg)
+        self.roi_layer = self.build_roi_layers(roi_layer)
+
+    def build_roi_layers(self, layer_cfg: dict) -> nn.Module:
+        """Build roi layers using `layer_cfg`"""
+        cfg = layer_cfg.copy()
+        layer_type = cfg.pop('type')
+        assert hasattr(ops, layer_type)
+        layer_cls = getattr(ops, layer_type)
+        roi_layers = layer_cls(**cfg)
+        return roi_layers
+
+    def forward(self, feats: Tensor, coordinate: Tensor, batch_inds: Tensor,
+                rois: Tensor) -> Tensor:
+        """Extract point-wise roi features.
+
+        Args:
+            feats (torch.FloatTensor): Point-wise features with
+                shape (batch, npoints, channels) for pooling.
+            coordinate (torch.FloatTensor): Coordinate of each point.
+            batch_inds (torch.LongTensor): Indicate the batch of each point.
+            rois (torch.FloatTensor): Roi boxes with batch indices.
+
+        Returns:
+            torch.FloatTensor: Pooled features
+        """
+        pooled_roi_feats = []
+        for batch_idx in range(int(batch_inds.max()) + 1):
+            roi_inds = (rois[..., 0].int() == batch_idx)
+            coors_inds = (batch_inds.int() == batch_idx)
+            pooled_roi_feat = self.roi_layer(rois[..., 1:][roi_inds],
+                                             coordinate[coors_inds],
+                                             feats[coors_inds])
+            pooled_roi_feats.append(pooled_roi_feat)
+        pooled_roi_feats = torch.cat(pooled_roi_feats, 0)
+        return pooled_roi_feats
diff --git a/mmde/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py b/mmde/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2697d25e5e5f235a1d234abb3fdede0c96ae7220
--- /dev/null
+++ b/mmde/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from mmcv import ops
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.bbox_3d import rotation_3d_in_axis
+
+
+@MODELS.register_module()
+class Single3DRoIPointExtractor(nn.Module):
+    """Point-wise roi-aware Extractor.
+
+    Extract Point-wise roi features.
+
+    Args:
+        roi_layer (dict, optional): The config of roi layer.
+    """
+
+    def __init__(self, roi_layer: Optional[dict] = None) -> None:
+        super(Single3DRoIPointExtractor, self).__init__()
+        self.roi_layer = self.build_roi_layers(roi_layer)
+
+    def build_roi_layers(self, layer_cfg: dict) -> nn.Module:
+        """Build roi layers using `layer_cfg`"""
+        cfg = layer_cfg.copy()
+        layer_type = cfg.pop('type')
+        assert hasattr(ops, layer_type)
+        layer_cls = getattr(ops, layer_type)
+        roi_layers = layer_cls(**cfg)
+        return roi_layers
+
+    def forward(self, feats: Tensor, coordinate: Tensor, batch_inds: Tensor,
+                rois: Tensor) -> Tensor:
+        """Extract point-wise roi features.
+
+        Args:
+            feats (torch.FloatTensor): Point-wise features with
+                shape (batch, npoints, channels) for pooling.
+            coordinate (torch.FloatTensor): Coordinate of each point.
+            batch_inds (torch.LongTensor): Indicate the batch of each point.
+            rois (torch.FloatTensor): Roi boxes with batch indices.
+
+        Returns:
+            torch.FloatTensor: Pooled features
+        """
+        rois = rois[..., 1:]
+        rois = rois.view(batch_inds, -1, rois.shape[-1])
+        with torch.no_grad():
+            pooled_roi_feat, pooled_empty_flag = self.roi_layer(
+                coordinate, feats, rois)
+
+            # canonical transformation
+            roi_center = rois[:, :, 0:3]
+            pooled_roi_feat[:, :, :, 0:3] -= roi_center.unsqueeze(dim=2)
+            pooled_roi_feat = pooled_roi_feat.view(-1,
+                                                   pooled_roi_feat.shape[-2],
+                                                   pooled_roi_feat.shape[-1])
+            pooled_roi_feat[:, :, 0:3] = rotation_3d_in_axis(
+                pooled_roi_feat[:, :, 0:3],
+                -(rois.view(-1, rois.shape[-1])[:, 6]),
+                axis=2)
+            pooled_roi_feat[pooled_empty_flag.view(-1) > 0] = 0
+
+        return pooled_roi_feat
diff --git a/mmde/mmdet3d/models/segmentors/__init__.py b/mmde/mmdet3d/models/segmentors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce0a555c355fb29c19196e980089f47a243211cf
--- /dev/null
+++ b/mmde/mmdet3d/models/segmentors/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import Base3DSegmentor
+from .cylinder3d import Cylinder3D
+from .encoder_decoder import EncoderDecoder3D
+from .minkunet import MinkUNet
+from .seg3d_tta import Seg3DTTAModel
+
+__all__ = [
+    'Base3DSegmentor', 'EncoderDecoder3D', 'Cylinder3D', 'MinkUNet',
+    'Seg3DTTAModel'
+]
diff --git a/mmde/mmdet3d/models/segmentors/base.py b/mmde/mmdet3d/models/segmentors/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..f68c2a3799b811e0a4a1cff77adcf05b5fcf19df
--- /dev/null
+++ b/mmde/mmdet3d/models/segmentors/base.py
@@ -0,0 +1,165 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Union
+
+from mmengine.model import BaseModel
+from torch import Tensor
+
+from mmdet3d.structures import PointData
+from mmdet3d.structures.det3d_data_sample import (ForwardResults,
+                                                  OptSampleList, SampleList)
+from mmdet3d.utils import OptConfigType, OptMultiConfig
+
+
+class Base3DSegmentor(BaseModel, metaclass=ABCMeta):
+    """Base class for 3D segmentors.
+
+    Args:
+        data_preprocessor (dict or ConfigDict, optional): Model preprocessing
+            config for processing the input data. it usually includes
+            ``to_rgb``, ``pad_size_divisor``, ``pad_val``, ``mean`` and
+            ``std``. Defaults to None.
+       init_cfg (dict or ConfigDict, optional): The config to control the
+           initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None):
+        super(Base3DSegmentor, self).__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+
+    @property
+    def with_neck(self) -> bool:
+        """bool: Whether the segmentor has neck."""
+        return hasattr(self, 'neck') and self.neck is not None
+
+    @property
+    def with_auxiliary_head(self) -> bool:
+        """bool: Whether the segmentor has auxiliary head."""
+        return hasattr(self,
+                       'auxiliary_head') and self.auxiliary_head is not None
+
+    @property
+    def with_decode_head(self) -> bool:
+        """bool: Whether the segmentor has decode head."""
+        return hasattr(self, 'decode_head') and self.decode_head is not None
+
+    @property
+    def with_regularization_loss(self) -> bool:
+        """bool: Whether the segmentor has regularization loss for weight."""
+        return hasattr(self, 'loss_regularization') and \
+            self.loss_regularization is not None
+
+    @abstractmethod
+    def extract_feat(self, batch_inputs: Tensor) -> dict:
+        """Placeholder for extract features from images."""
+        pass
+
+    @abstractmethod
+    def encode_decode(self, batch_inputs: Tensor,
+                      batch_data_samples: SampleList) -> Tensor:
+        """Placeholder for encode images with backbone and decode into a
+        semantic segmentation map of the same size as input."""
+        pass
+
+    def forward(self,
+                inputs: Union[dict, List[dict]],
+                data_samples: OptSampleList = None,
+                mode: str = 'tensor') -> ForwardResults:
+        """The unified entry for a forward process in both training and test.
+
+        The method should accept three modes: "tensor", "predict" and "loss":
+
+        - "tensor": Forward the whole network and return tensor or tuple of
+          tensor without any post-processing, same as a common nn.Module.
+        - "predict": Forward and return the predictions, which are fully
+          processed to a list of :obj:`SegDataSample`.
+        - "loss": Forward and return a dict of losses according to the given
+          inputs and data samples.
+
+        Note that this method doesn't handle neither back propagation nor
+        optimizer updating, which are done in the :meth:`train_step`.
+
+        Args:
+            inputs (dict or List[dict]): Input sample dict which includes
+                'points' and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor): Image tensor has shape (B, C, H, W).
+            data_samples (List[:obj:`Det3DDataSample`], optional):
+                The annotation data of every samples. Defaults to None.
+            mode (str): Return what kind of value. Defaults to 'tensor'.
+
+        Returns:
+            The return type depends on ``mode``.
+
+            - If ``mode="tensor"``, return a tensor or a tuple of tensor.
+            - If ``mode="predict"``, return a list of :obj:`Det3DDataSample`.
+            - If ``mode="loss"``, return a dict of tensor.
+        """
+        if mode == 'loss':
+            return self.loss(inputs, data_samples)
+        elif mode == 'predict':
+            return self.predict(inputs, data_samples)
+        elif mode == 'tensor':
+            return self._forward(inputs, data_samples)
+        else:
+            raise RuntimeError(f'Invalid mode "{mode}". '
+                               'Only supports loss, predict and tensor mode')
+
+    @abstractmethod
+    def loss(self, batch_inputs: dict,
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """Calculate losses from a batch of inputs and data samples."""
+        pass
+
+    @abstractmethod
+    def predict(self, batch_inputs: dict,
+                batch_data_samples: SampleList) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing."""
+        pass
+
+    @abstractmethod
+    def _forward(self,
+                 batch_inputs: dict,
+                 batch_data_samples: OptSampleList = None) -> Tensor:
+        """Network forward process.
+
+        Usually includes backbone, neck and head forward without any post-
+        processing.
+        """
+        pass
+
+    def postprocess_result(self, seg_logits_list: List[Tensor],
+                           batch_data_samples: SampleList) -> SampleList:
+        """Convert results list to `Det3DDataSample`.
+
+        Args:
+            seg_logits_list (List[Tensor]): List of segmentation results,
+                seg_logits from model of each input point clouds sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
+            points. Each Det3DDataSample usually contains:
+
+            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
+              segmentation.
+            - ``pts_seg_logits`` (PointData): Predicted logits of 3D semantic
+              segmentation before normalization.
+        """
+
+        for i in range(len(seg_logits_list)):
+            seg_logits = seg_logits_list[i]
+            seg_pred = seg_logits.argmax(dim=0)
+            batch_data_samples[i].set_data({
+                'pts_seg_logits':
+                PointData(**{'pts_seg_logits': seg_logits}),
+                'pred_pts_seg':
+                PointData(**{'pts_semantic_mask': seg_pred})
+            })
+        return batch_data_samples
diff --git a/mmde/mmdet3d/models/segmentors/cylinder3d.py b/mmde/mmdet3d/models/segmentors/cylinder3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4177dd60fa284c917df9b448bc1632926fa440e
--- /dev/null
+++ b/mmde/mmdet3d/models/segmentors/cylinder3d.py
@@ -0,0 +1,144 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from ...structures.det3d_data_sample import SampleList
+from .encoder_decoder import EncoderDecoder3D
+
+
+@MODELS.register_module()
+class Cylinder3D(EncoderDecoder3D):
+    """`Cylindrical and Asymmetrical 3D Convolution Networks for LiDAR
+    Segmentation.
+
+        <https://arxiv.org/abs/2011.10033>`_.
+
+    Args:
+        voxel_encoder (dict or :obj:`ConfigDict`): The config for the
+            points2voxel encoder of segmentor.
+        backbone (dict or :obj:`ConfigDict`): The config for the backnone of
+            segmentor.
+        decode_head (dict or :obj:`ConfigDict`): The config for the decode
+            head of segmentor.
+        neck (dict or :obj:`ConfigDict`, optional): The config for the neck of
+            segmentor. Defaults to None.
+        auxiliary_head (dict or :obj:`ConfigDict` or List[dict or
+            :obj:`ConfigDict`], optional): The config for the auxiliary head of
+            segmentor. Defaults to None.
+        loss_regularization (dict or :obj:`ConfigDict` or List[dict or
+            :obj:`ConfigDict`], optional): The config for the regularization
+            loass. Defaults to None.
+        train_cfg (dict or :obj:`ConfigDict`, optional): The config for
+            training. Defaults to None.
+        test_cfg (dict or :obj:`ConfigDict`, optional): The config for testing.
+            Defaults to None.
+        data_preprocessor (dict or :obj:`ConfigDict`, optional): The
+            pre-process config of :class:`BaseDataPreprocessor`.
+            Defaults to None.
+        init_cfg (dict or :obj:`ConfigDict` or List[dict or :obj:`ConfigDict`],
+            optional): The weight initialized config for :class:`BaseModule`.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 voxel_encoder: ConfigType,
+                 backbone: ConfigType,
+                 decode_head: ConfigType,
+                 neck: OptConfigType = None,
+                 auxiliary_head: OptConfigType = None,
+                 loss_regularization: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(Cylinder3D, self).__init__(
+            backbone=backbone,
+            decode_head=decode_head,
+            neck=neck,
+            auxiliary_head=auxiliary_head,
+            loss_regularization=loss_regularization,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
+
+        self.voxel_encoder = MODELS.build(voxel_encoder)
+
+    def extract_feat(self, batch_inputs: dict) -> Tensor:
+        """Extract features from points."""
+        encoded_feats = self.voxel_encoder(batch_inputs['voxels']['voxels'],
+                                           batch_inputs['voxels']['coors'])
+        batch_inputs['voxels']['voxel_coors'] = encoded_feats[1]
+        x = self.backbone(encoded_feats[0], encoded_feats[1],
+                          len(batch_inputs['points']))
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def loss(self, batch_inputs_dict: dict,
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which
+                includes 'points' and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        # extract features using backbone
+        x = self.extract_feat(batch_inputs_dict)
+        losses = dict()
+        loss_decode = self._decode_head_forward_train(x, batch_data_samples)
+        losses.update(loss_decode)
+
+        return losses
+
+    def predict(self,
+                batch_inputs_dict: dict,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Simple test with single scene.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
+                and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+            rescale (bool): Whether transform to original number of points.
+                Will be used for voxelization based segmentors.
+                Defaults to True.
+
+        Returns:
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
+            points. Each Det3DDataSample usually contains:
+
+            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
+              segmentation.
+            - ``pts_seg_logits`` (PointData): Predicted logits of 3D semantic
+              segmentation before normalization.
+        """
+        # 3D segmentation requires per-point prediction, so it's impossible
+        # to use down-sampling to get a batch of scenes with same num_points
+        # therefore, we only support testing one scene every time
+        x = self.extract_feat(batch_inputs_dict)
+        seg_logits_list = self.decode_head.predict(x, batch_inputs_dict,
+                                                   batch_data_samples)
+        for i in range(len(seg_logits_list)):
+            seg_logits_list[i] = seg_logits_list[i].transpose(0, 1)
+
+        return self.postprocess_result(seg_logits_list, batch_data_samples)
diff --git a/mmde/mmdet3d/models/segmentors/encoder_decoder.py b/mmde/mmdet3d/models/segmentors/encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..168d0e65a5baba84dcb6b15629dc33f262864fd1
--- /dev/null
+++ b/mmde/mmdet3d/models/segmentors/encoder_decoder.py
@@ -0,0 +1,545 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+from ...structures.det3d_data_sample import OptSampleList, SampleList
+from ..utils import add_prefix
+from .base import Base3DSegmentor
+
+
+@MODELS.register_module()
+class EncoderDecoder3D(Base3DSegmentor):
+    """3D Encoder Decoder segmentors.
+
+    EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+    Note that auxiliary_head is only used for deep supervision during training,
+    which could be dumped during inference.
+
+    1. The ``loss`` method is used to calculate the loss of model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2) Call the decode head loss function to forward decode head model and
+    calculate losses.
+
+    .. code:: text
+
+    loss(): extract_feat() -> _decode_head_forward_train() -> _auxiliary_head_forward_train (optional)
+    _decode_head_forward_train(): decode_head.loss()
+    _auxiliary_head_forward_train(): auxiliary_head.loss (optional)
+
+    2. The ``predict`` method is used to predict segmentation results,
+    which includes two steps: (1) Run inference function to obtain the list of
+    seg_logits (2) Call post-processing function to obtain list of
+    ``Det3DDataSample`` including ``pred_pts_seg``.
+
+    .. code:: text
+
+    predict(): inference() -> postprocess_result()
+    inference(): whole_inference()/slide_inference()
+    whole_inference()/slide_inference(): encoder_decoder()
+    encoder_decoder(): extract_feat() -> decode_head.predict()
+
+    4 The ``_forward`` method is used to output the tensor by running the model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2) Call the decode head forward function to forward decode head model.
+
+    .. code:: text
+
+    _forward(): extract_feat() -> _decode_head.forward()
+
+    Args:
+        backbone (dict or :obj:`ConfigDict`): The config for the backnone of
+            segmentor.
+        decode_head (dict or :obj:`ConfigDict`): The config for the decode
+            head of segmentor.
+        neck (dict or :obj:`ConfigDict`, optional): The config for the neck of
+            segmentor. Defaults to None.
+        auxiliary_head (dict or :obj:`ConfigDict` or List[dict or
+            :obj:`ConfigDict`], optional): The config for the auxiliary head of
+            segmentor. Defaults to None.
+        loss_regularization (dict or :obj:`ConfigDict` or List[dict or
+            :obj:`ConfigDict`], optional): The config for the regularization
+            loass. Defaults to None.
+        train_cfg (dict or :obj:`ConfigDict`, optional): The config for
+            training. Defaults to None.
+        test_cfg (dict or :obj:`ConfigDict`, optional): The config for testing.
+            Defaults to None.
+        data_preprocessor (dict or :obj:`ConfigDict`, optional): The
+            pre-process config of :class:`BaseDataPreprocessor`.
+            Defaults to None.
+        init_cfg (dict or :obj:`ConfigDict` or List[dict or :obj:`ConfigDict`],
+            optional): The weight initialized config for :class:`BaseModule`.
+            Defaults to None.
+    """  # noqa: E501
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 decode_head: ConfigType,
+                 neck: OptConfigType = None,
+                 auxiliary_head: OptMultiConfig = None,
+                 loss_regularization: OptMultiConfig = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(EncoderDecoder3D, self).__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+        self._init_decode_head(decode_head)
+        self._init_auxiliary_head(auxiliary_head)
+        self._init_loss_regularization(loss_regularization)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        assert self.with_decode_head, \
+            '3D EncoderDecoder Segmentor should have a decode_head'
+
+    def _init_decode_head(self, decode_head: ConfigType) -> None:
+        """Initialize ``decode_head``."""
+        self.decode_head = MODELS.build(decode_head)
+        self.num_classes = self.decode_head.num_classes
+
+    def _init_auxiliary_head(self,
+                             auxiliary_head: OptMultiConfig = None) -> None:
+        """Initialize ``auxiliary_head``."""
+        if auxiliary_head is not None:
+            if isinstance(auxiliary_head, list):
+                self.auxiliary_head = nn.ModuleList()
+                for head_cfg in auxiliary_head:
+                    self.auxiliary_head.append(MODELS.build(head_cfg))
+            else:
+                self.auxiliary_head = MODELS.build(auxiliary_head)
+
+    def _init_loss_regularization(self,
+                                  loss_regularization: OptMultiConfig = None
+                                  ) -> None:
+        """Initialize ``loss_regularization``."""
+        if loss_regularization is not None:
+            if isinstance(loss_regularization, list):
+                self.loss_regularization = nn.ModuleList()
+                for loss_cfg in loss_regularization:
+                    self.loss_regularization.append(MODELS.build(loss_cfg))
+            else:
+                self.loss_regularization = MODELS.build(loss_regularization)
+
+    def extract_feat(self, batch_inputs: Tensor) -> dict:
+        """Extract features from points."""
+        x = self.backbone(batch_inputs)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def encode_decode(self, batch_inputs: Tensor,
+                      batch_input_metas: List[dict]) -> Tensor:
+        """Encode points with backbone and decode into a semantic segmentation
+        map of the same size as input.
+
+        Args:
+            batch_input (Tensor): Input point cloud sample
+            batch_input_metas (List[dict]): Meta information of a batch of
+                samples.
+
+        Returns:
+            Tensor: Segmentation logits of shape [B, num_classes, N].
+        """
+        x = self.extract_feat(batch_inputs)
+        seg_logits = self.decode_head.predict(x, batch_input_metas,
+                                              self.test_cfg)
+        return seg_logits
+
+    def _decode_head_forward_train(
+            self, batch_inputs_dict: dict,
+            batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """Run forward function and calculate loss for decode head in training.
+
+        Args:
+            batch_input (Tensor): Input point cloud sample
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components for decode head.
+        """
+        losses = dict()
+        loss_decode = self.decode_head.loss(batch_inputs_dict,
+                                            batch_data_samples, self.train_cfg)
+
+        losses.update(add_prefix(loss_decode, 'decode'))
+        return losses
+
+    def _auxiliary_head_forward_train(
+        self,
+        batch_inputs_dict: dict,
+        batch_data_samples: SampleList,
+    ) -> Dict[str, Tensor]:
+        """Run forward function and calculate loss for auxiliary head in
+        training.
+
+        Args:
+            batch_input (Tensor): Input point cloud sample
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components for auxiliary
+            head.
+        """
+        losses = dict()
+        if isinstance(self.auxiliary_head, nn.ModuleList):
+            for idx, aux_head in enumerate(self.auxiliary_head):
+                loss_aux = aux_head.loss(batch_inputs_dict, batch_data_samples,
+                                         self.train_cfg)
+                losses.update(add_prefix(loss_aux, f'aux_{idx}'))
+        else:
+            loss_aux = self.auxiliary_head.loss(batch_inputs_dict,
+                                                batch_data_samples,
+                                                self.train_cfg)
+            losses.update(add_prefix(loss_aux, 'aux'))
+
+        return losses
+
+    def _loss_regularization_forward_train(self) -> Dict[str, Tensor]:
+        """Calculate regularization loss for model weight in training."""
+        losses = dict()
+        if isinstance(self.loss_regularization, nn.ModuleList):
+            for idx, regularize_loss in enumerate(self.loss_regularization):
+                loss_regularize = dict(
+                    loss_regularize=regularize_loss(self.modules()))
+                losses.update(add_prefix(loss_regularize, f'regularize_{idx}'))
+        else:
+            loss_regularize = dict(
+                loss_regularize=self.loss_regularization(self.modules()))
+            losses.update(add_prefix(loss_regularize, 'regularize'))
+
+        return losses
+
+    def loss(self, batch_inputs_dict: dict,
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which
+                includes 'points' and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        # extract features using backbone
+        points = torch.stack(batch_inputs_dict['points'])
+        x = self.extract_feat(points)
+
+        losses = dict()
+
+        loss_decode = self._decode_head_forward_train(x, batch_data_samples)
+        losses.update(loss_decode)
+
+        if self.with_auxiliary_head:
+            loss_aux = self._auxiliary_head_forward_train(
+                x, batch_data_samples)
+            losses.update(loss_aux)
+
+        if self.with_regularization_loss:
+            loss_regularize = self._loss_regularization_forward_train()
+            losses.update(loss_regularize)
+
+        return losses
+
+    @staticmethod
+    def _input_generation(coords,
+                          patch_center: Tensor,
+                          coord_max: Tensor,
+                          feats: Tensor,
+                          use_normalized_coord: bool = False) -> Tensor:
+        """Generating model input.
+
+        Generate input by subtracting patch center and adding additional
+        features. Currently support colors and normalized xyz as features.
+
+        Args:
+            coords (Tensor): Sampled 3D point coordinate of shape [S, 3].
+            patch_center (Tensor): Center coordinate of the patch.
+            coord_max (Tensor): Max coordinate of all 3D points.
+            feats (Tensor): Features of sampled points of shape [S, C].
+            use_normalized_coord (bool): Whether to use normalized xyz as
+                additional features. Defaults to False.
+
+        Returns:
+            Tensor: The generated input data of shape [S, 3+C'].
+        """
+        # subtract patch center, the z dimension is not centered
+        centered_coords = coords.clone()
+        centered_coords[:, 0] -= patch_center[0]
+        centered_coords[:, 1] -= patch_center[1]
+
+        # normalized coordinates as extra features
+        if use_normalized_coord:
+            normalized_coord = coords / coord_max
+            feats = torch.cat([feats, normalized_coord], dim=1)
+
+        points = torch.cat([centered_coords, feats], dim=1)
+
+        return points
+
+    def _sliding_patch_generation(self,
+                                  points: Tensor,
+                                  num_points: int,
+                                  block_size: float,
+                                  sample_rate: float = 0.5,
+                                  use_normalized_coord: bool = False,
+                                  eps: float = 1e-3) -> Tuple[Tensor, Tensor]:
+        """Sampling points in a sliding window fashion.
+
+        First sample patches to cover all the input points.
+        Then sample points in each patch to batch points of a certain number.
+
+        Args:
+            points (Tensor): Input points of shape [N, 3+C].
+            num_points (int): Number of points to be sampled in each patch.
+            block_size (float): Size of a patch to sample.
+            sample_rate (float): Stride used in sliding patch. Defaults to 0.5.
+            use_normalized_coord (bool): Whether to use normalized xyz as
+                additional features. Defaults to False.
+            eps (float): A value added to patch boundary to guarantee points
+                coverage. Defaults to 1e-3.
+
+        Returns:
+            Tuple[Tensor, Tensor]:
+
+            - patch_points (Tensor): Points of different patches of shape
+              [K, N, 3+C].
+            - patch_idxs (Tensor): Index of each point in `patch_points` of
+              shape [K, N].
+        """
+        device = points.device
+        # we assume the first three dims are points' 3D coordinates
+        # and the rest dims are their per-point features
+        coords = points[:, :3]
+        feats = points[:, 3:]
+
+        coord_max = coords.max(0)[0]
+        coord_min = coords.min(0)[0]
+        stride = block_size * sample_rate
+        num_grid_x = int(
+            torch.ceil((coord_max[0] - coord_min[0] - block_size) /
+                       stride).item() + 1)
+        num_grid_y = int(
+            torch.ceil((coord_max[1] - coord_min[1] - block_size) /
+                       stride).item() + 1)
+
+        patch_points, patch_idxs = [], []
+        for idx_y in range(num_grid_y):
+            s_y = coord_min[1] + idx_y * stride
+            e_y = torch.min(s_y + block_size, coord_max[1])
+            s_y = e_y - block_size
+            for idx_x in range(num_grid_x):
+                s_x = coord_min[0] + idx_x * stride
+                e_x = torch.min(s_x + block_size, coord_max[0])
+                s_x = e_x - block_size
+
+                # extract points within this patch
+                cur_min = torch.tensor([s_x, s_y, coord_min[2]]).to(device)
+                cur_max = torch.tensor([e_x, e_y, coord_max[2]]).to(device)
+                cur_choice = ((coords >= cur_min - eps) &
+                              (coords <= cur_max + eps)).all(dim=1)
+
+                if not cur_choice.any():  # no points in this patch
+                    continue
+
+                # sample points in this patch to multiple batches
+                cur_center = cur_min + block_size / 2.0
+                point_idxs = torch.nonzero(cur_choice, as_tuple=True)[0]
+                num_batch = int(np.ceil(point_idxs.shape[0] / num_points))
+                point_size = int(num_batch * num_points)
+                replace = point_size > 2 * point_idxs.shape[0]
+                num_repeat = point_size - point_idxs.shape[0]
+                if replace:  # duplicate
+                    point_idxs_repeat = point_idxs[torch.randint(
+                        0, point_idxs.shape[0],
+                        size=(num_repeat, )).to(device)]
+                else:
+                    point_idxs_repeat = point_idxs[torch.randperm(
+                        point_idxs.shape[0])[:num_repeat]]
+
+                choices = torch.cat([point_idxs, point_idxs_repeat], dim=0)
+                choices = choices[torch.randperm(choices.shape[0])]
+
+                # construct model input
+                point_batches = self._input_generation(
+                    coords[choices],
+                    cur_center,
+                    coord_max,
+                    feats[choices],
+                    use_normalized_coord=use_normalized_coord)
+
+                patch_points.append(point_batches)
+                patch_idxs.append(choices)
+
+        patch_points = torch.cat(patch_points, dim=0)
+        patch_idxs = torch.cat(patch_idxs, dim=0)
+
+        # make sure all points are sampled at least once
+        assert torch.unique(patch_idxs).shape[0] == points.shape[0], \
+            'some points are not sampled in sliding inference'
+
+        return patch_points, patch_idxs
+
+    def slide_inference(self, point: Tensor, input_meta: dict,
+                        rescale: bool) -> Tensor:
+        """Inference by sliding-window with overlap.
+
+        Args:
+            point (Tensor): Input points of shape [N, 3+C].
+            input_meta (dict): Meta information of input sample.
+            rescale (bool): Whether transform to original number of points.
+                Will be used for voxelization based segmentors.
+
+        Returns:
+            Tensor: The output segmentation map of shape [num_classes, N].
+        """
+        num_points = self.test_cfg.num_points
+        block_size = self.test_cfg.block_size
+        sample_rate = self.test_cfg.sample_rate
+        use_normalized_coord = self.test_cfg.use_normalized_coord
+        batch_size = self.test_cfg.batch_size * num_points
+
+        # patch_points is of shape [K*N, 3+C], patch_idxs is of shape [K*N]
+        patch_points, patch_idxs = self._sliding_patch_generation(
+            point, num_points, block_size, sample_rate, use_normalized_coord)
+        feats_dim = patch_points.shape[1]
+        seg_logits = []  # save patch predictions
+
+        for batch_idx in range(0, patch_points.shape[0], batch_size):
+            batch_points = patch_points[batch_idx:batch_idx + batch_size]
+            batch_points = batch_points.view(-1, num_points, feats_dim)
+            # batch_seg_logit is of shape [B, num_classes, N]
+            batch_seg_logit = self.encode_decode(batch_points,
+                                                 [input_meta] * batch_size)
+            batch_seg_logit = batch_seg_logit.transpose(1, 2).contiguous()
+            seg_logits.append(batch_seg_logit.view(-1, self.num_classes))
+
+        # aggregate per-point logits by indexing sum and dividing count
+        seg_logits = torch.cat(seg_logits, dim=0)  # [K*N, num_classes]
+        expand_patch_idxs = patch_idxs.unsqueeze(1).repeat(1, self.num_classes)
+        preds = point.new_zeros((point.shape[0], self.num_classes)).\
+            scatter_add_(dim=0, index=expand_patch_idxs, src=seg_logits)
+        count_mat = torch.bincount(patch_idxs)
+        preds = preds / count_mat[:, None]
+
+        # TODO: if rescale and voxelization segmentor
+
+        return preds.transpose(0, 1)  # to [num_classes, K*N]
+
+    def whole_inference(self, points: Tensor, batch_input_metas: List[dict],
+                        rescale: bool) -> Tensor:
+        """Inference with full scene (one forward pass without sliding)."""
+        seg_logit = self.encode_decode(points, batch_input_metas)
+        # TODO: if rescale and voxelization segmentor
+        return seg_logit
+
+    def inference(self, points: Tensor, batch_input_metas: List[dict],
+                  rescale: bool) -> Tensor:
+        """Inference with slide/whole style.
+
+        Args:
+            points (Tensor): Input points of shape [B, N, 3+C].
+            batch_input_metas (List[dict]): Meta information of a batch of
+                samples.
+            rescale (bool): Whether transform to original number of points.
+                Will be used for voxelization based segmentors.
+
+        Returns:
+            Tensor: The output segmentation map.
+        """
+        assert self.test_cfg.mode in ['slide', 'whole']
+        if self.test_cfg.mode == 'slide':
+            seg_logit = torch.stack([
+                self.slide_inference(point, input_meta, rescale)
+                for point, input_meta in zip(points, batch_input_metas)
+            ], 0)
+        else:
+            seg_logit = self.whole_inference(points, batch_input_metas,
+                                             rescale)
+        return seg_logit
+
+    def predict(self,
+                batch_inputs_dict: dict,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Simple test with single scene.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
+                and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+            rescale (bool): Whether transform to original number of points.
+                Will be used for voxelization based segmentors.
+                Defaults to True.
+
+        Returns:
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
+            points. Each Det3DDataSample usually contains:
+
+            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
+              segmentation.
+            - ``pts_seg_logits`` (PointData): Predicted logits of 3D semantic
+              segmentation before normalization.
+        """
+        # 3D segmentation requires per-point prediction, so it's impossible
+        # to use down-sampling to get a batch of scenes with same num_points
+        # therefore, we only support testing one scene every time
+        seg_logits_list = []
+        batch_input_metas = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+
+        points = batch_inputs_dict['points']
+        for point, input_meta in zip(points, batch_input_metas):
+            seg_logits = self.inference(
+                point.unsqueeze(0), [input_meta], rescale)[0]
+            seg_logits_list.append(seg_logits)
+
+        return self.postprocess_result(seg_logits_list, batch_data_samples)
+
+    def _forward(self,
+                 batch_inputs_dict: dict,
+                 batch_data_samples: OptSampleList = None) -> Tensor:
+        """Network forward process.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
+                and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Tensor: Forward output of model without any post-processes.
+        """
+        points = torch.stack(batch_inputs_dict['points'])
+        x = self.extract_feat(points)
+        return self.decode_head.forward(x)
diff --git a/mmde/mmdet3d/models/segmentors/minkunet.py b/mmde/mmdet3d/models/segmentors/minkunet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e80caa0292bea748c0faa340e5aba6f0d10aaa
--- /dev/null
+++ b/mmde/mmdet3d/models/segmentors/minkunet.py
@@ -0,0 +1,112 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import OptSampleList, SampleList
+from .encoder_decoder import EncoderDecoder3D
+
+
+@MODELS.register_module()
+class MinkUNet(EncoderDecoder3D):
+    r"""MinkUNet is the implementation of `4D Spatio-Temporal ConvNets.
+    <https://arxiv.org/abs/1904.08755>`_ with TorchSparse backend.
+
+    Refer to `implementation code <https://github.com/mit-han-lab/spvnas>`_.
+
+    Args:
+        kwargs (dict): Arguments are the same as those in
+            :class:`EncoderDecoder3D`.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+
+    def loss(self, inputs: dict, data_samples: SampleList):
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which
+                includes 'points' and 'voxels' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - voxels (dict): Voxel feature and coords after voxelization.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
+        """
+        x = self.extract_feat(inputs)
+        losses = self.decode_head.loss(x, data_samples, self.train_cfg)
+        return losses
+
+    def predict(self, inputs: dict,
+                batch_data_samples: SampleList) -> SampleList:
+        """Simple test with single scene.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which
+                includes 'points' and 'voxels' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - voxels (dict): Voxel feature and coords after voxelization.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
+            points. Each Det3DDataSample usually contains:
+
+            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
+              segmentation.
+            - ``pts_seg_logits`` (PointData): Predicted logits of 3D semantic
+              segmentation before normalization.
+        """
+        x = self.extract_feat(inputs)
+        seg_logits_list = self.decode_head.predict(x, batch_data_samples)
+        for i in range(len(seg_logits_list)):
+            seg_logits_list[i] = seg_logits_list[i].transpose(0, 1)
+
+        return self.postprocess_result(seg_logits_list, batch_data_samples)
+
+    def _forward(self,
+                 batch_inputs_dict: dict,
+                 batch_data_samples: OptSampleList = None) -> Tensor:
+        """Network forward process.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which
+                includes 'points' and 'voxels' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - voxels (dict): Voxel feature and coords after voxelization.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`. Defaults to None.
+
+        Returns:
+            Tensor: Forward output of model without any post-processes.
+        """
+        x = self.extract_feat(batch_inputs_dict)
+        return self.decode_head.forward(x)
+
+    def extract_feat(self, batch_inputs_dict: dict) -> Tensor:
+        """Extract features from voxels.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which
+                includes 'points' and 'voxels' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - voxels (dict): Voxel feature and coords after voxelization.
+
+        Returns:
+            SparseTensor: voxels with features.
+        """
+        voxel_dict = batch_inputs_dict['voxels']
+        x = self.backbone(voxel_dict['voxels'], voxel_dict['coors'])
+        if self.with_neck:
+            x = self.neck(x)
+        return x
diff --git a/mmde/mmdet3d/models/segmentors/seg3d_tta.py b/mmde/mmdet3d/models/segmentors/seg3d_tta.py
new file mode 100644
index 0000000000000000000000000000000000000000..be93562ff76e4cdafb225ba46bf380c0c53b41dc
--- /dev/null
+++ b/mmde/mmdet3d/models/segmentors/seg3d_tta.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+from mmengine.model import BaseTTAModel
+
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+
+
+@MODELS.register_module()
+class Seg3DTTAModel(BaseTTAModel):
+
+    def merge_preds(self, data_samples_list: List[SampleList]) -> SampleList:
+        """Merge predictions of enhanced data to one prediction.
+
+        Args:
+            data_samples_list (List[List[:obj:`Det3DDataSample`]]): List of
+                predictions of all enhanced data.
+
+        Returns:
+            List[:obj:`Det3DDataSample`]: Merged prediction.
+        """
+        predictions = []
+        for data_samples in data_samples_list:
+            seg_logits = data_samples[0].pts_seg_logits.pts_seg_logits
+            logits = torch.zeros(seg_logits.shape).to(seg_logits)
+            for data_sample in data_samples:
+                seg_logit = data_sample.pts_seg_logits.pts_seg_logits
+                logits += seg_logit.softmax(dim=0)
+            logits /= len(data_samples)
+            seg_pred = logits.argmax(dim=0)
+            data_samples[0].pred_pts_seg.pts_semantic_mask = seg_pred
+            predictions.append(data_samples[0])
+        return predictions
diff --git a/mmde/mmdet3d/models/task_modules/__init__.py b/mmde/mmdet3d/models/task_modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a0c818d284cf7cc9ebc19de855add14635d9ddb
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.task_modules import AssignResult, BaseAssigner
+
+from .anchor import (ANCHOR_GENERATORS, PRIOR_GENERATORS,
+                     AlignedAnchor3DRangeGenerator,
+                     AlignedAnchor3DRangeGeneratorPerCls,
+                     Anchor3DRangeGenerator, build_anchor_generator,
+                     build_prior_generator)
+from .assigners import Max3DIoUAssigner
+from .coders import (AnchorFreeBBoxCoder, CenterPointBBoxCoder,
+                     DeltaXYZWLHRBBoxCoder, FCOS3DBBoxCoder,
+                     GroupFree3DBBoxCoder, MonoFlexCoder,
+                     PartialBinBasedBBoxCoder, PGDBBoxCoder,
+                     PointXYZWHLRBBoxCoder, SMOKECoder)
+from .samplers import (BaseSampler, CombinedSampler,
+                       InstanceBalancedPosSampler, IoUBalancedNegSampler,
+                       IoUNegPiecewiseSampler, OHEMSampler, PseudoSampler,
+                       RandomSampler, SamplingResult)
+from .voxel import VoxelGenerator
+
+__all__ = [
+    'BaseAssigner', 'Max3DIoUAssigner', 'AssignResult', 'BaseSampler',
+    'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler',
+    'IoUBalancedNegSampler', 'CombinedSampler', 'OHEMSampler',
+    'SamplingResult', 'IoUNegPiecewiseSampler', 'DeltaXYZWLHRBBoxCoder',
+    'PartialBinBasedBBoxCoder', 'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder',
+    'GroupFree3DBBoxCoder', 'PointXYZWHLRBBoxCoder', 'FCOS3DBBoxCoder',
+    'PGDBBoxCoder', 'SMOKECoder', 'MonoFlexCoder', 'VoxelGenerator',
+    'AlignedAnchor3DRangeGenerator', 'Anchor3DRangeGenerator',
+    'build_prior_generator', 'AlignedAnchor3DRangeGeneratorPerCls',
+    'build_anchor_generator', 'ANCHOR_GENERATORS', 'PRIOR_GENERATORS'
+]
diff --git a/mmde/mmdet3d/models/task_modules/anchor/__init__.py b/mmde/mmdet3d/models/task_modules/anchor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc14ac11cef9c5ce96b9067b5ccd5b952d93af61
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/anchor/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .anchor_3d_generator import (AlignedAnchor3DRangeGenerator,
+                                  AlignedAnchor3DRangeGeneratorPerCls,
+                                  Anchor3DRangeGenerator)
+from .builder import (ANCHOR_GENERATORS, PRIOR_GENERATORS,
+                      build_anchor_generator, build_prior_generator)
+
+__all__ = [
+    'AlignedAnchor3DRangeGenerator', 'Anchor3DRangeGenerator',
+    'build_prior_generator', 'AlignedAnchor3DRangeGeneratorPerCls',
+    'build_anchor_generator', 'ANCHOR_GENERATORS', 'PRIOR_GENERATORS'
+]
diff --git a/mmde/mmdet3d/models/task_modules/anchor/anchor_3d_generator.py b/mmde/mmdet3d/models/task_modules/anchor/anchor_3d_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f765f742208a9587b66f4a2c99d2de3423f25ce
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/anchor/anchor_3d_generator.py
@@ -0,0 +1,438 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import mmengine
+import torch
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class Anchor3DRangeGenerator(object):
+    """3D Anchor Generator by range.
+
+    This anchor generator generates anchors by the given range in different
+    feature levels.
+    Due the convention in 3D detection, different anchor sizes are related to
+    different ranges for different categories. However we find this setting
+    does not effect the performance much in some datasets, e.g., nuScenes.
+
+    Args:
+        ranges (list[list[float]]): Ranges of different anchors.
+            The ranges are the same across different feature levels. But may
+            vary for different anchor sizes if size_per_range is True.
+        sizes (list[list[float]], optional): 3D sizes of anchors.
+            Defaults to [[3.9, 1.6, 1.56]].
+        scales (list[int], optional): Scales of anchors in different feature
+            levels. Defaults to [1].
+        rotations (list[float], optional): Rotations of anchors in a feature
+            grid. Defaults to [0, 1.5707963].
+        custom_values (tuple[float], optional): Customized values of that
+            anchor. For example, in nuScenes the anchors have velocities.
+            Defaults to ().
+        reshape_out (bool, optional): Whether to reshape the output into
+            (N x 4). Defaults to True.
+        size_per_range (bool, optional): Whether to use separate ranges for
+            different sizes. If size_per_range is True, the ranges should have
+            the same length as the sizes, if not, it will be duplicated.
+            Defaults to True.
+    """
+
+    def __init__(self,
+                 ranges: List[List[float]],
+                 sizes: List[List[float]] = [[3.9, 1.6, 1.56]],
+                 scales: List[int] = [1],
+                 rotations: List[float] = [0, 1.5707963],
+                 custom_values: Tuple[float] = (),
+                 reshape_out: bool = True,
+                 size_per_range: bool = True) -> None:
+        assert mmengine.is_list_of(ranges, list)
+        if size_per_range:
+            if len(sizes) != len(ranges):
+                assert len(ranges) == 1
+                ranges = ranges * len(sizes)
+            assert len(ranges) == len(sizes)
+        else:
+            assert len(ranges) == 1
+        assert mmengine.is_list_of(sizes, list)
+        assert isinstance(scales, list)
+
+        self.sizes = sizes
+        self.scales = scales
+        self.ranges = ranges
+        self.rotations = rotations
+        self.custom_values = custom_values
+        self.cached_anchors = None
+        self.reshape_out = reshape_out
+        self.size_per_range = size_per_range
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + '('
+        s += f'anchor_range={self.ranges},\n'
+        s += f'scales={self.scales},\n'
+        s += f'sizes={self.sizes},\n'
+        s += f'rotations={self.rotations},\n'
+        s += f'reshape_out={self.reshape_out},\n'
+        s += f'size_per_range={self.size_per_range})'
+        return s
+
+    @property
+    def num_base_anchors(self) -> int:
+        """int: Total number of base anchors in a feature grid."""
+        num_rot = len(self.rotations)
+        num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0)
+        return num_rot * num_size
+
+    @property
+    def num_levels(self) -> int:
+        """int: Number of feature levels that the generator is applied to."""
+        return len(self.scales)
+
+    def grid_anchors(
+            self,
+            featmap_sizes: List[Tuple[int]],
+            device: Union[str, torch.device] = 'cuda') -> List[Tensor]:
+        """Generate grid anchors in multiple feature levels.
+
+        Args:
+            featmap_sizes (list[tuple]): List of feature map sizes in
+                multiple feature levels.
+            device (str, optional): Device where the anchors will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            list[torch.Tensor]: Anchors in multiple feature levels.
+                The sizes of each tensor should be [N, 4], where
+                N = width * height * num_base_anchors, width and height
+                are the sizes of the corresponding feature level,
+                num_base_anchors is the number of anchors for that level.
+        """
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_anchors = []
+        for i in range(self.num_levels):
+            anchors = self.single_level_grid_anchors(
+                featmap_sizes[i], self.scales[i], device=device)
+            if self.reshape_out:
+                anchors = anchors.reshape(-1, anchors.size(-1))
+            multi_level_anchors.append(anchors)
+        return multi_level_anchors
+
+    def single_level_grid_anchors(
+            self,
+            featmap_size: Tuple[int],
+            scale: int,
+            device: Union[str, torch.device] = 'cuda') -> Tensor:
+        """Generate grid anchors of a single level feature map.
+
+        This function is usually called by method ``self.grid_anchors``.
+
+        Args:
+            featmap_size (tuple[int]): Size of the feature map.
+            scale (float): Scale factor of the anchors in the current level.
+            device (str, optional): Device the tensor will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: Anchors in the overall feature map.
+        """
+        # We reimplement the anchor generator using torch in cuda
+        # torch: 0.6975 s for 1000 times
+        # numpy: 4.3345 s for 1000 times
+        # which is ~5 times faster than the numpy implementation
+        if not self.size_per_range:
+            return self.anchors_single_range(
+                featmap_size,
+                self.ranges[0],
+                scale,
+                self.sizes,
+                self.rotations,
+                device=device)
+
+        mr_anchors = []
+        for anchor_range, anchor_size in zip(self.ranges, self.sizes):
+            mr_anchors.append(
+                self.anchors_single_range(
+                    featmap_size,
+                    anchor_range,
+                    scale,
+                    anchor_size,
+                    self.rotations,
+                    device=device))
+        mr_anchors = torch.cat(mr_anchors, dim=-3)
+        return mr_anchors
+
+    def anchors_single_range(
+            self,
+            feature_size: Tuple[int],
+            anchor_range: Union[Tensor, List[float]],
+            scale: int = 1,
+            sizes: Union[List[List[float]], List[float]] = [[3.9, 1.6, 1.56]],
+            rotations: List[float] = [0, 1.5707963],
+            device: Union[str, torch.device] = 'cuda') -> Tensor:
+        """Generate anchors in a single range.
+
+        Args:
+            feature_size (list[float] | tuple[float]): Feature map size. It is
+                either a list of a tuple of [D, H, W](in order of z, y, and x).
+            anchor_range (torch.Tensor | list[float]): Range of anchors with
+                shape [6]. The order is consistent with that of anchors, i.e.,
+                (x_min, y_min, z_min, x_max, y_max, z_max).
+            scale (float | int, optional): The scale factor of anchors.
+                Defaults to 1.
+            sizes (list[list] | np.ndarray | torch.Tensor, optional):
+                Anchor size with shape [N, 3], in order of x, y, z.
+                Defaults to [[3.9, 1.6, 1.56]].
+            rotations (list[float] | np.ndarray | torch.Tensor, optional):
+                Rotations of anchors in a single feature grid.
+                Defaults to [0, 1.5707963].
+            device (str): Devices that the anchors will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: Anchors with shape
+                [*feature_size, num_sizes, num_rots, 7].
+        """
+        if len(feature_size) == 2:
+            feature_size = [1, feature_size[0], feature_size[1]]
+        anchor_range = torch.tensor(anchor_range, device=device)
+        z_centers = torch.linspace(
+            anchor_range[2], anchor_range[5], feature_size[0], device=device)
+        y_centers = torch.linspace(
+            anchor_range[1], anchor_range[4], feature_size[1], device=device)
+        x_centers = torch.linspace(
+            anchor_range[0], anchor_range[3], feature_size[2], device=device)
+        sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale
+        rotations = torch.tensor(rotations, device=device)
+
+        # torch.meshgrid default behavior is 'id', np's default is 'xy'
+        rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations)
+        # torch.meshgrid returns a tuple rather than list
+        rets = list(rets)
+        tile_shape = [1] * 5
+        tile_shape[-2] = int(sizes.shape[0])
+        for i in range(len(rets)):
+            rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1)
+
+        sizes = sizes.reshape([1, 1, 1, -1, 1, 3])
+        tile_size_shape = list(rets[0].shape)
+        tile_size_shape[3] = 1
+        sizes = sizes.repeat(tile_size_shape)
+        rets.insert(3, sizes)
+
+        ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5])
+        # [1, 200, 176, N, 2, 7] for kitti after permute
+
+        if len(self.custom_values) > 0:
+            custom_ndim = len(self.custom_values)
+            custom = ret.new_zeros([*ret.shape[:-1], custom_ndim])
+            # custom[:] = self.custom_values
+            ret = torch.cat([ret, custom], dim=-1)
+            # [1, 200, 176, N, 2, 9] for nus dataset after permute
+        return ret
+
+
+@TASK_UTILS.register_module()
+class AlignedAnchor3DRangeGenerator(Anchor3DRangeGenerator):
+    """Aligned 3D Anchor Generator by range.
+
+    This anchor generator uses a different manner to generate the positions
+    of anchors' centers from :class:`Anchor3DRangeGenerator`.
+
+    Note:
+        The `align` means that the anchor's center is aligned with the voxel
+        grid, which is also the feature grid. The previous implementation of
+        :class:`Anchor3DRangeGenerator` does not generate the anchors' center
+        according to the voxel grid. Rather, it generates the center by
+        uniformly distributing the anchors inside the minimum and maximum
+        anchor ranges according to the feature map sizes.
+        However, this makes the anchors center does not match the feature grid.
+        The :class:`AlignedAnchor3DRangeGenerator` add + 1 when using the
+        feature map sizes to obtain the corners of the voxel grid. Then it
+        shifts the coordinates to the center of voxel grid and use the left
+        up corner to distribute anchors.
+
+    Args:
+        anchor_corner (bool, optional): Whether to align with the corner of the
+            voxel grid. By default it is False and the anchor's center will be
+            the same as the corresponding voxel's center, which is also the
+            center of the corresponding greature grid. Defaults to False.
+    """
+
+    def __init__(self, align_corner: bool = False, **kwargs) -> None:
+        super(AlignedAnchor3DRangeGenerator, self).__init__(**kwargs)
+        self.align_corner = align_corner
+
+    def anchors_single_range(
+            self,
+            feature_size: List[int],
+            anchor_range: List[float],
+            scale: int,
+            sizes: Union[List[List[float]], List[float]] = [[3.9, 1.6, 1.56]],
+            rotations: List[float] = [0, 1.5707963],
+            device: Union[str, torch.device] = 'cuda') -> Tensor:
+        """Generate anchors in a single range.
+
+        Args:
+            feature_size (list[float] | tuple[float]): Feature map size. It is
+                either a list of a tuple of [D, H, W](in order of z, y, and x).
+            anchor_range (torch.Tensor | list[float]): Range of anchors with
+                shape [6]. The order is consistent with that of anchors, i.e.,
+                (x_min, y_min, z_min, x_max, y_max, z_max).
+            scale (float | int): The scale factor of anchors.
+            sizes (list[list] | np.ndarray | torch.Tensor, optional):
+                Anchor size with shape [N, 3], in order of x, y, z.
+                Defaults to [[3.9, 1.6, 1.56]].
+            rotations (list[float] | np.ndarray | torch.Tensor, optional):
+                Rotations of anchors in a single feature grid.
+                Defaults to [0, 1.5707963].
+            device (str, optional): Devices that the anchors will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: Anchors with shape
+                [*feature_size, num_sizes, num_rots, 7].
+        """
+        if len(feature_size) == 2:
+            feature_size = [1, feature_size[0], feature_size[1]]
+        anchor_range = torch.tensor(anchor_range, device=device)
+        z_centers = torch.linspace(
+            anchor_range[2],
+            anchor_range[5],
+            feature_size[0] + 1,
+            device=device)
+        y_centers = torch.linspace(
+            anchor_range[1],
+            anchor_range[4],
+            feature_size[1] + 1,
+            device=device)
+        x_centers = torch.linspace(
+            anchor_range[0],
+            anchor_range[3],
+            feature_size[2] + 1,
+            device=device)
+        sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale
+        rotations = torch.tensor(rotations, device=device)
+
+        # shift the anchor center
+        if not self.align_corner:
+            z_shift = (z_centers[1] - z_centers[0]) / 2
+            y_shift = (y_centers[1] - y_centers[0]) / 2
+            x_shift = (x_centers[1] - x_centers[0]) / 2
+            z_centers += z_shift
+            y_centers += y_shift
+            x_centers += x_shift
+
+        # torch.meshgrid default behavior is 'id', np's default is 'xy'
+        rets = torch.meshgrid(x_centers[:feature_size[2]],
+                              y_centers[:feature_size[1]],
+                              z_centers[:feature_size[0]], rotations)
+
+        # torch.meshgrid returns a tuple rather than list
+        rets = list(rets)
+        tile_shape = [1] * 5
+        tile_shape[-2] = int(sizes.shape[0])
+        for i in range(len(rets)):
+            rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1)
+
+        sizes = sizes.reshape([1, 1, 1, -1, 1, 3])
+        tile_size_shape = list(rets[0].shape)
+        tile_size_shape[3] = 1
+        sizes = sizes.repeat(tile_size_shape)
+        rets.insert(3, sizes)
+
+        ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5])
+
+        if len(self.custom_values) > 0:
+            custom_ndim = len(self.custom_values)
+            custom = ret.new_zeros([*ret.shape[:-1], custom_ndim])
+            # TODO: check the support of custom values
+            # custom[:] = self.custom_values
+            ret = torch.cat([ret, custom], dim=-1)
+        return ret
+
+
+@TASK_UTILS.register_module()
+class AlignedAnchor3DRangeGeneratorPerCls(AlignedAnchor3DRangeGenerator):
+    """3D Anchor Generator by range for per class.
+
+    This anchor generator generates anchors by the given range for per class.
+    Note that feature maps of different classes may be different.
+
+    Args:
+        kwargs (dict): Arguments are the same as those in
+            :class:`AlignedAnchor3DRangeGenerator`.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super(AlignedAnchor3DRangeGeneratorPerCls, self).__init__(**kwargs)
+        assert len(self.scales) == 1, 'Multi-scale feature map levels are' + \
+            ' not supported currently in this kind of anchor generator.'
+
+    def grid_anchors(
+            self,
+            featmap_sizes: List[Tuple[int]],
+            device: Union[str, torch.device] = 'cuda') -> List[List[Tensor]]:
+        """Generate grid anchors in multiple feature levels.
+
+        Args:
+            featmap_sizes (list[tuple]): List of feature map sizes for
+                different classes in a single feature level.
+            device (str, optional): Device where the anchors will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            list[list[torch.Tensor]]: Anchors in multiple feature levels.
+                Note that in this anchor generator, we currently only
+                support single feature level. The sizes of each tensor
+                should be [num_sizes/ranges*num_rots*featmap_size,
+                box_code_size].
+        """
+        multi_level_anchors = []
+        anchors = self.multi_cls_grid_anchors(
+            featmap_sizes, self.scales[0], device=device)
+        multi_level_anchors.append(anchors)
+        return multi_level_anchors
+
+    def multi_cls_grid_anchors(
+            self,
+            featmap_sizes: List[Tuple[int]],
+            scale: int,
+            device: Union[str, torch.device] = 'cuda') -> List[Tensor]:
+        """Generate grid anchors of a single level feature map for multi-class
+        with different feature map sizes.
+
+        This function is usually called by method ``self.grid_anchors``.
+
+        Args:
+            featmap_sizes (list[tuple]): List of feature map sizes for
+                different classes in a single feature level.
+            scale (float): Scale factor of the anchors in the current level.
+            device (str, optional): Device the tensor will be put on.
+                Defaults to 'cuda'.
+
+        Returns:
+            torch.Tensor: Anchors in the overall feature map.
+        """
+        assert len(featmap_sizes) == len(self.sizes) == len(self.ranges), \
+            'The number of different feature map sizes anchor sizes and ' + \
+            'ranges should be the same.'
+
+        multi_cls_anchors = []
+        for i in range(len(featmap_sizes)):
+            anchors = self.anchors_single_range(
+                featmap_sizes[i],
+                self.ranges[i],
+                scale,
+                self.sizes[i],
+                self.rotations,
+                device=device)
+            # [*featmap_size, num_sizes/ranges, num_rots, box_code_size]
+            ndim = len(featmap_sizes[i])
+            anchors = anchors.view(*featmap_sizes[i], -1, anchors.size(-1))
+            # [*featmap_size, num_sizes/ranges*num_rots, box_code_size]
+            anchors = anchors.permute(ndim, *range(0, ndim), ndim + 1)
+            # [num_sizes/ranges*num_rots, *featmap_size, box_code_size]
+            multi_cls_anchors.append(anchors.reshape(-1, anchors.size(-1)))
+            # [num_sizes/ranges*num_rots*featmap_size, box_code_size]
+        return multi_cls_anchors
diff --git a/mmde/mmdet3d/models/task_modules/anchor/builder.py b/mmde/mmdet3d/models/task_modules/anchor/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..af7018431158d0a8b9d23f95910c23e85b0dedde
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/anchor/builder.py
@@ -0,0 +1,24 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Any
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.utils import ConfigType
+
+PRIOR_GENERATORS = TASK_UTILS
+
+ANCHOR_GENERATORS = TASK_UTILS
+
+
+def build_prior_generator(cfg: ConfigType, default_args=None) -> Any:
+    warnings.warn(
+        '``build_prior_generator`` would be deprecated soon, please use '
+        '``mmdet3d.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_anchor_generator(cfg: ConfigType, default_args=None) -> Any:
+    warnings.warn(
+        '``build_anchor_generator`` would be deprecated soon, please use '
+        '``mmdet3d.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
diff --git a/mmde/mmdet3d/models/task_modules/assigners/__init__.py b/mmde/mmdet3d/models/task_modules/assigners/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c2b0a90437930e488c69feb1c31480beafb6e3d
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/assigners/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .max_3d_iou_assigner import Max3DIoUAssigner
+
+__all__ = ['Max3DIoUAssigner']
diff --git a/mmde/mmdet3d/models/task_modules/assigners/max_3d_iou_assigner.py b/mmde/mmdet3d/models/task_modules/assigners/max_3d_iou_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..26788643459a58a6dee31a169313ae589fa4cac4
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/assigners/max_3d_iou_assigner.py
@@ -0,0 +1,160 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Union
+
+from mmdet.models.task_modules import AssignResult, MaxIoUAssigner
+from mmengine.structures import InstanceData
+
+from mmdet3d.registry import TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class Max3DIoUAssigner(MaxIoUAssigner):
+    # TODO: This is a temporary box assigner.
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposals will be assigned with `-1`, or a semi-positive integer
+    indicating the ground truth index.
+
+    - -1: negative sample, no assigned gt
+    - semi-positive integer: positive sample, index (0-based) of assigned gt
+
+    Args:
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum iou for a bbox to be considered as a
+            positive bbox. Positive samples can have smaller IoU than
+            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
+            `min_pos_iou` is set to avoid assigning bboxes that have extremely
+            small iou with GT as positive samples.
+        gt_max_assign_all (bool): Whether to assign all bboxes with the same
+            highest overlap with some gt to that gt.
+        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
+            `gt_bboxes_ignore` is specified). Negative values mean not
+            ignoring any bboxes.
+        ignore_wrt_candidates (bool): Whether to compute the iof between
+            `bboxes` and `gt_bboxes_ignore`, or the contrary.
+        match_low_quality (bool): Whether to allow low quality matches. This is
+            usually allowed for RPN and single stage detectors, but not allowed
+            in the second stage. Details are demonstrated in Step 4.
+        gpu_assign_thr (int): The upper bound of the number of GT for GPU
+            assign. When the number of gt is above this threshold, will assign
+            on CPU device. Negative values mean not assign on CPU.
+        iou_calculator (dict): Config of overlaps Calculator.
+    """
+
+    def __init__(
+        self,
+        pos_iou_thr: float,
+        neg_iou_thr: Union[float, tuple],
+        min_pos_iou: float = .0,
+        gt_max_assign_all: bool = True,
+        ignore_iof_thr: float = -1,
+        ignore_wrt_candidates: bool = True,
+        match_low_quality: bool = True,
+        gpu_assign_thr: float = -1,
+        iou_calculator: dict = dict(type='BboxOverlaps2D')
+    ) -> None:
+        self.pos_iou_thr = pos_iou_thr
+        self.neg_iou_thr = neg_iou_thr
+        self.min_pos_iou = min_pos_iou
+        self.gt_max_assign_all = gt_max_assign_all
+        self.ignore_iof_thr = ignore_iof_thr
+        self.ignore_wrt_candidates = ignore_wrt_candidates
+        self.gpu_assign_thr = gpu_assign_thr
+        self.match_low_quality = match_low_quality
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs) -> AssignResult:
+        """Assign gt to bboxes.
+
+        This method assign a gt bbox to every bbox (proposal/anchor), each bbox
+        will be assigned with -1, or a semi-positive number. -1 means negative
+        sample, semi-positive number is the index (0-based) of assigned gt.
+        The assignment is done in following steps, the order matters.
+
+        1. assign every bbox to the background
+        2. assign proposals whose iou with all gts < neg_iou_thr to 0
+        3. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
+           assign it to that bbox
+        4. for each gt bbox, assign its nearest proposals (may be more than
+           one) to itself
+
+        Args:
+            pred_instances (:obj:`InstanceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors or points, or the bboxes predicted by the
+                previous stage, has shape (n, 4). The bboxes predicted by
+                the current model or stage will be named ``bboxes``,
+                ``labels``, and ``scores``, the same as the ``InstanceData``
+                in other places.
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes``, with shape (k, 4),
+                and ``labels``, with shape (k, ).
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+
+        Example:
+            >>> from mmengine.structures import InstanceData
+            >>> self = MaxIoUAssigner(0.5, 0.5)
+            >>> pred_instances = InstanceData()
+            >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10],
+            ...                                      [10, 10, 20, 20]])
+            >>> gt_instances = InstanceData()
+            >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 9]])
+            >>> gt_instances.labels = torch.Tensor([0])
+            >>> assign_result = self.assign(pred_instances, gt_instances)
+            >>> expected_gt_inds = torch.LongTensor([1, 0])
+            >>> assert torch.all(assign_result.gt_inds == expected_gt_inds)
+        """
+        gt_bboxes = gt_instances.bboxes_3d
+        if 'priors' in pred_instances:
+            priors = pred_instances.priors
+        else:
+            priors = pred_instances.bboxes_3d.tensor
+        gt_labels = gt_instances.labels_3d
+        if gt_instances_ignore is not None:
+            gt_bboxes_ignore = gt_instances_ignore.bboxes_3d
+        else:
+            gt_bboxes_ignore = None
+
+        assign_on_cpu = True if (self.gpu_assign_thr > 0) and (
+            gt_bboxes.shape[0] > self.gpu_assign_thr) else False
+        # compute overlap and assign gt on CPU when number of GT is large
+        if assign_on_cpu:
+            device = priors.device
+            priors = priors.cpu()
+            gt_bboxes = gt_bboxes.cpu()
+            gt_labels = gt_labels.cpu()
+            if gt_bboxes_ignore is not None:
+                gt_bboxes_ignore = gt_bboxes_ignore.cpu()
+
+        overlaps = self.iou_calculator(gt_bboxes, priors)
+
+        if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
+                and gt_bboxes_ignore.numel() > 0 and priors.numel() > 0):
+            if self.ignore_wrt_candidates:
+                ignore_overlaps = self.iou_calculator(
+                    priors, gt_bboxes_ignore, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
+            else:
+                ignore_overlaps = self.iou_calculator(
+                    gt_bboxes_ignore, priors, mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=0)
+            overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1
+
+        assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
+        if assign_on_cpu:
+            assign_result.gt_inds = assign_result.gt_inds.to(device)
+            assign_result.max_overlaps = assign_result.max_overlaps.to(device)
+            if assign_result.labels is not None:
+                assign_result.labels = assign_result.labels.to(device)
+        return assign_result
diff --git a/mmde/mmdet3d/models/task_modules/builder.py b/mmde/mmdet3d/models/task_modules/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7794fb71d8ab8fc57d3afd03378ae51df0438c5f
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/builder.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Any
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.utils.typing_utils import ConfigType
+
+BBOX_ASSIGNERS = TASK_UTILS
+BBOX_SAMPLERS = TASK_UTILS
+BBOX_CODERS = TASK_UTILS
+
+
+def build_assigner(cfg: ConfigType, **default_args) -> Any:
+    """Builder of box assigner."""
+    warnings.warn('``build_assigner`` would be deprecated soon, please use '
+                  '``mmdet3d.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_sampler(cfg: ConfigType, **default_args) -> Any:
+    """Builder of box sampler."""
+    warnings.warn('``build_sampler`` would be deprecated soon, please use '
+                  '``mmdet3d.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_bbox_coder(cfg: ConfigType, **default_args) -> Any:
+    """Builder of box coder."""
+    warnings.warn('``build_bbox_coder`` would be deprecated soon, please use '
+                  '``mmdet3d.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
diff --git a/mmde/mmdet3d/models/task_modules/coders/__init__.py b/mmde/mmdet3d/models/task_modules/coders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b22e725be7b37335d7d0b71115f47a0a132af662
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .anchor_free_bbox_coder import AnchorFreeBBoxCoder
+from .centerpoint_bbox_coders import CenterPointBBoxCoder
+from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder
+from .fcos3d_bbox_coder import FCOS3DBBoxCoder
+from .groupfree3d_bbox_coder import GroupFree3DBBoxCoder
+from .monoflex_bbox_coder import MonoFlexCoder
+from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
+from .pgd_bbox_coder import PGDBBoxCoder
+from .point_xyzwhlr_bbox_coder import PointXYZWHLRBBoxCoder
+from .smoke_bbox_coder import SMOKECoder
+
+__all__ = [
+    'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder',
+    'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'GroupFree3DBBoxCoder',
+    'PointXYZWHLRBBoxCoder', 'FCOS3DBBoxCoder', 'PGDBBoxCoder', 'SMOKECoder',
+    'MonoFlexCoder'
+]
diff --git a/mmde/mmdet3d/models/task_modules/coders/anchor_free_bbox_coder.py b/mmde/mmdet3d/models/task_modules/coders/anchor_free_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9d884b396302db1a6c26c104532133ae806737d
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/anchor_free_bbox_coder.py
@@ -0,0 +1,136 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures import BaseInstance3DBoxes
+from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class AnchorFreeBBoxCoder(PartialBinBasedBBoxCoder):
+    """Anchor free bbox coder for 3D boxes.
+
+    Args:
+        num_dir_bins (int): Number of bins to encode direction angle.
+        with_rot (bool): Whether the bbox is with rotation.
+    """
+
+    def __init__(self, num_dir_bins: int, with_rot: bool = True) -> None:
+        super(AnchorFreeBBoxCoder, self).__init__(
+            num_dir_bins, 0, [], with_rot=with_rot)
+        self.num_dir_bins = num_dir_bins
+        self.with_rot = with_rot
+
+    def encode(self, gt_bboxes_3d: BaseInstance3DBoxes,
+               gt_labels_3d: Tensor) -> tuple:
+        """Encode ground truth to prediction targets.
+
+        Args:
+            gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes
+                with shape (n, 7).
+            gt_labels_3d (torch.Tensor): Ground truth classes.
+
+        Returns:
+            tuple: Targets of center, size and direction.
+        """
+        # generate center target
+        center_target = gt_bboxes_3d.gravity_center
+
+        # generate bbox size target
+        size_res_target = gt_bboxes_3d.dims / 2
+
+        # generate dir target
+        box_num = gt_labels_3d.shape[0]
+        if self.with_rot:
+            (dir_class_target,
+             dir_res_target) = self.angle2class(gt_bboxes_3d.yaw)
+            dir_res_target /= (2 * np.pi / self.num_dir_bins)
+        else:
+            dir_class_target = gt_labels_3d.new_zeros(box_num)
+            dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num)
+
+        return (center_target, size_res_target, dir_class_target,
+                dir_res_target)
+
+    def decode(self, bbox_out: dict) -> Tensor:
+        """Decode predicted parts to bbox3d.
+
+        Args:
+            bbox_out (dict): Predictions from model, should contain keys below.
+
+                - center: predicted bottom center of bboxes.
+                - dir_class: predicted bbox direction class.
+                - dir_res: predicted bbox direction residual.
+                - size: predicted bbox size.
+
+        Returns:
+            torch.Tensor: Decoded bbox3d with shape (batch, n, 7).
+        """
+        center = bbox_out['center']
+        batch_size, num_proposal = center.shape[:2]
+
+        # decode heading angle
+        if self.with_rot:
+            dir_class = torch.argmax(bbox_out['dir_class'], -1)
+            dir_res = torch.gather(bbox_out['dir_res'], 2,
+                                   dir_class.unsqueeze(-1))
+            dir_res.squeeze_(2)
+            dir_angle = self.class2angle(dir_class, dir_res).reshape(
+                batch_size, num_proposal, 1)
+        else:
+            dir_angle = center.new_zeros(batch_size, num_proposal, 1)
+
+        # decode bbox size
+        bbox_size = torch.clamp(bbox_out['size'] * 2, min=0.1)
+
+        bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
+        return bbox3d
+
+    def split_pred(self, cls_preds: Tensor, reg_preds: Tensor,
+                   base_xyz: Tensor) -> Dict[str, Tensor]:
+        """Split predicted features to specific parts.
+
+        Args:
+            cls_preds (torch.Tensor): Class predicted features to split.
+            reg_preds (torch.Tensor): Regression predicted features to split.
+            base_xyz (torch.Tensor): Coordinates of points.
+
+        Returns:
+            dict[str, torch.Tensor]: Split results.
+        """
+        results = {}
+        results['obj_scores'] = cls_preds
+
+        start, end = 0, 0
+        reg_preds_trans = reg_preds.transpose(2, 1)
+
+        # decode center
+        end += 3
+        # (batch_size, num_proposal, 3)
+        results['center_offset'] = reg_preds_trans[..., start:end]
+        results['center'] = base_xyz.detach() + reg_preds_trans[..., start:end]
+        start = end
+
+        # decode center
+        end += 3
+        # (batch_size, num_proposal, 3)
+        results['size'] = reg_preds_trans[..., start:end]
+        start = end
+
+        # decode direction
+        end += self.num_dir_bins
+        results['dir_class'] = reg_preds_trans[..., start:end]
+        start = end
+
+        end += self.num_dir_bins
+        dir_res_norm = reg_preds_trans[..., start:end]
+        start = end
+
+        results['dir_res_norm'] = dir_res_norm
+        results['dir_res'] = dir_res_norm * (2 * np.pi / self.num_dir_bins)
+
+        return results
diff --git a/mmde/mmdet3d/models/task_modules/coders/centerpoint_bbox_coders.py b/mmde/mmdet3d/models/task_modules/coders/centerpoint_bbox_coders.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3359d6bc138eb3eace4bbadefa41d9322b1a32f
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/centerpoint_bbox_coders.py
@@ -0,0 +1,235 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from mmdet.models.task_modules import BaseBBoxCoder
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class CenterPointBBoxCoder(BaseBBoxCoder):
+    """Bbox coder for CenterPoint.
+
+    Args:
+        pc_range (list[float]): Range of point cloud.
+        out_size_factor (int): Downsample factor of the model.
+        voxel_size (list[float]): Size of voxel.
+        post_center_range (list[float], optional): Limit of the center.
+            Default: None.
+        max_num (int, optional): Max number to be kept. Default: 100.
+        score_threshold (float, optional): Threshold to filter boxes
+            based on score. Default: None.
+        code_size (int, optional): Code size of bboxes. Default: 9
+    """
+
+    def __init__(self,
+                 pc_range: List[float],
+                 out_size_factor: int,
+                 voxel_size: List[float],
+                 post_center_range: Optional[List[float]] = None,
+                 max_num: int = 100,
+                 score_threshold: Optional[float] = None,
+                 code_size: int = 9) -> None:
+
+        self.pc_range = pc_range
+        self.out_size_factor = out_size_factor
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.max_num = max_num
+        self.score_threshold = score_threshold
+        self.code_size = code_size
+
+    def _gather_feat(self,
+                     feats: Tensor,
+                     inds: Tensor,
+                     feat_masks: Optional[Tensor] = None) -> Tensor:
+        """Given feats and indexes, returns the gathered feats.
+
+        Args:
+            feats (torch.Tensor): Features to be transposed and gathered
+                with the shape of [B, 2, W, H].
+            inds (torch.Tensor): Indexes with the shape of [B, N].
+            feat_masks (torch.Tensor, optional): Mask of the feats.
+                Default: None.
+
+        Returns:
+            torch.Tensor: Gathered feats.
+        """
+        dim = feats.size(2)
+        inds = inds.unsqueeze(2).expand(inds.size(0), inds.size(1), dim)
+        feats = feats.gather(1, inds)
+        if feat_masks is not None:
+            feat_masks = feat_masks.unsqueeze(2).expand_as(feats)
+            feats = feats[feat_masks]
+            feats = feats.view(-1, dim)
+        return feats
+
+    def _topk(self, scores: Tensor, K: int = 80) -> Tuple[Tensor]:
+        """Get indexes based on scores.
+
+        Args:
+            scores (torch.Tensor): scores with the shape of [B, N, W, H].
+            K (int, optional): Number to be kept. Defaults to 80.
+
+        Returns:
+            tuple[torch.Tensor]
+                torch.Tensor: Selected scores with the shape of [B, K].
+                torch.Tensor: Selected indexes with the shape of [B, K].
+                torch.Tensor: Selected classes with the shape of [B, K].
+                torch.Tensor: Selected y coord with the shape of [B, K].
+                torch.Tensor: Selected x coord with the shape of [B, K].
+        """
+        batch, cat, height, width = scores.size()
+
+        topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)
+
+        topk_inds = topk_inds % (height * width)
+        topk_ys = (topk_inds.float() /
+                   torch.tensor(width, dtype=torch.float)).int().float()
+        topk_xs = (topk_inds % width).int().float()
+
+        topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K)
+        topk_clses = (topk_ind / torch.tensor(K, dtype=torch.float)).int()
+        topk_inds = self._gather_feat(topk_inds.view(batch, -1, 1),
+                                      topk_ind).view(batch, K)
+        topk_ys = self._gather_feat(topk_ys.view(batch, -1, 1),
+                                    topk_ind).view(batch, K)
+        topk_xs = self._gather_feat(topk_xs.view(batch, -1, 1),
+                                    topk_ind).view(batch, K)
+
+        return topk_score, topk_inds, topk_clses, topk_ys, topk_xs
+
+    def _transpose_and_gather_feat(self, feat: Tensor, ind: Tensor) -> Tensor:
+        """Given feats and indexes, returns the transposed and gathered feats.
+
+        Args:
+            feat (torch.Tensor): Features to be transposed and gathered
+                with the shape of [B, 2, W, H].
+            ind (torch.Tensor): Indexes with the shape of [B, N].
+
+        Returns:
+            torch.Tensor: Transposed and gathered feats.
+        """
+        feat = feat.permute(0, 2, 3, 1).contiguous()
+        feat = feat.view(feat.size(0), -1, feat.size(3))
+        feat = self._gather_feat(feat, ind)
+        return feat
+
+    def encode(self):
+        pass
+
+    def decode(self,
+               heat: Tensor,
+               rot_sine: Tensor,
+               rot_cosine: Tensor,
+               hei: Tensor,
+               dim: Tensor,
+               vel: Tensor,
+               reg: Optional[Tensor] = None,
+               task_id: int = -1) -> List[Dict[str, Tensor]]:
+        """Decode bboxes.
+
+        Args:
+            heat (torch.Tensor): Heatmap with the shape of [B, N, W, H].
+            rot_sine (torch.Tensor): Sine of rotation with the shape of
+                [B, 1, W, H].
+            rot_cosine (torch.Tensor): Cosine of rotation with the shape of
+                [B, 1, W, H].
+            hei (torch.Tensor): Height of the boxes with the shape
+                of [B, 1, W, H].
+            dim (torch.Tensor): Dim of the boxes with the shape of
+                [B, 1, W, H].
+            vel (torch.Tensor): Velocity with the shape of [B, 1, W, H].
+            reg (torch.Tensor, optional): Regression value of the boxes in
+                2D with the shape of [B, 2, W, H]. Default: None.
+            task_id (int, optional): Index of task. Default: -1.
+
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        batch, cat, _, _ = heat.size()
+
+        scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num)
+
+        if reg is not None:
+            reg = self._transpose_and_gather_feat(reg, inds)
+            reg = reg.view(batch, self.max_num, 2)
+            xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1]
+            ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2]
+        else:
+            xs = xs.view(batch, self.max_num, 1) + 0.5
+            ys = ys.view(batch, self.max_num, 1) + 0.5
+
+        # rotation value and direction label
+        rot_sine = self._transpose_and_gather_feat(rot_sine, inds)
+        rot_sine = rot_sine.view(batch, self.max_num, 1)
+
+        rot_cosine = self._transpose_and_gather_feat(rot_cosine, inds)
+        rot_cosine = rot_cosine.view(batch, self.max_num, 1)
+        rot = torch.atan2(rot_sine, rot_cosine)
+
+        # height in the bev
+        hei = self._transpose_and_gather_feat(hei, inds)
+        hei = hei.view(batch, self.max_num, 1)
+
+        # dim of the box
+        dim = self._transpose_and_gather_feat(dim, inds)
+        dim = dim.view(batch, self.max_num, 3)
+
+        # class label
+        clses = clses.view(batch, self.max_num).float()
+        scores = scores.view(batch, self.max_num)
+
+        xs = xs.view(
+            batch, self.max_num,
+            1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0]
+        ys = ys.view(
+            batch, self.max_num,
+            1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1]
+
+        if vel is None:  # KITTI FORMAT
+            final_box_preds = torch.cat([xs, ys, hei, dim, rot], dim=2)
+        else:  # exist velocity, nuscene format
+            vel = self._transpose_and_gather_feat(vel, inds)
+            vel = vel.view(batch, self.max_num, 2)
+            final_box_preds = torch.cat([xs, ys, hei, dim, rot, vel], dim=2)
+
+        final_scores = scores
+        final_preds = clses
+
+        # use score threshold
+        if self.score_threshold is not None:
+            thresh_mask = final_scores > self.score_threshold
+
+        if self.post_center_range is not None:
+            self.post_center_range = torch.tensor(
+                self.post_center_range, device=heat.device)
+            mask = (final_box_preds[..., :3] >=
+                    self.post_center_range[:3]).all(2)
+            mask &= (final_box_preds[..., :3] <=
+                     self.post_center_range[3:]).all(2)
+
+            predictions_dicts = []
+            for i in range(batch):
+                cmask = mask[i, :]
+                if self.score_threshold:
+                    cmask &= thresh_mask[i]
+
+                boxes3d = final_box_preds[i, cmask]
+                scores = final_scores[i, cmask]
+                labels = final_preds[i, cmask]
+                predictions_dict = {
+                    'bboxes': boxes3d,
+                    'scores': scores,
+                    'labels': labels
+                }
+
+                predictions_dicts.append(predictions_dict)
+        else:
+            raise NotImplementedError(
+                'Need to reorganize output as a batch, only '
+                'support post_center_range is not None for now!')
+
+        return predictions_dicts
diff --git a/mmde/mmdet3d/models/task_modules/coders/delta_xyzwhlr_bbox_coder.py b/mmde/mmdet3d/models/task_modules/coders/delta_xyzwhlr_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..45c5e0bcba8d8227569c0eacdf68707e264d46dc
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/delta_xyzwhlr_bbox_coder.py
@@ -0,0 +1,92 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmdet.models.task_modules import BaseBBoxCoder
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class DeltaXYZWLHRBBoxCoder(BaseBBoxCoder):
+    """Bbox Coder for 3D boxes.
+
+    Args:
+        code_size (int): The dimension of boxes to be encoded.
+    """
+
+    def __init__(self, code_size: int = 7) -> None:
+        super(DeltaXYZWLHRBBoxCoder, self).__init__()
+        self.code_size = code_size
+
+    @staticmethod
+    def encode(src_boxes: Tensor, dst_boxes: Tensor) -> Tensor:
+        """Get box regression transformation deltas (dx, dy, dz, dx_size,
+        dy_size, dz_size, dr, dv*) that can be used to transform the
+        `src_boxes` into the `target_boxes`.
+
+        Args:
+            src_boxes (torch.Tensor): source boxes, e.g., object proposals.
+            dst_boxes (torch.Tensor): target of the transformation, e.g.,
+                ground-truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas.
+        """
+        box_ndim = src_boxes.shape[-1]
+        cas, cgs, cts = [], [], []
+        if box_ndim > 7:
+            xa, ya, za, wa, la, ha, ra, *cas = torch.split(
+                src_boxes, 1, dim=-1)
+            xg, yg, zg, wg, lg, hg, rg, *cgs = torch.split(
+                dst_boxes, 1, dim=-1)
+            cts = [g - a for g, a in zip(cgs, cas)]
+        else:
+            xa, ya, za, wa, la, ha, ra = torch.split(src_boxes, 1, dim=-1)
+            xg, yg, zg, wg, lg, hg, rg = torch.split(dst_boxes, 1, dim=-1)
+        za = za + ha / 2
+        zg = zg + hg / 2
+        diagonal = torch.sqrt(la**2 + wa**2)
+        xt = (xg - xa) / diagonal
+        yt = (yg - ya) / diagonal
+        zt = (zg - za) / ha
+        lt = torch.log(lg / la)
+        wt = torch.log(wg / wa)
+        ht = torch.log(hg / ha)
+        rt = rg - ra
+        return torch.cat([xt, yt, zt, wt, lt, ht, rt, *cts], dim=-1)
+
+    @staticmethod
+    def decode(anchors: Tensor, deltas: Tensor) -> Tensor:
+        """Apply transformation `deltas` (dx, dy, dz, dx_size, dy_size,
+        dz_size, dr, dv*) to `boxes`.
+
+        Args:
+            anchors (torch.Tensor): Parameters of anchors with shape (N, 7).
+            deltas (torch.Tensor): Encoded boxes with shape
+                (N, 7+n) [x, y, z, x_size, y_size, z_size, r, velo*].
+
+        Returns:
+            torch.Tensor: Decoded boxes.
+        """
+        cas, cts = [], []
+        box_ndim = anchors.shape[-1]
+        if box_ndim > 7:
+            xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1)
+            xt, yt, zt, wt, lt, ht, rt, *cts = torch.split(deltas, 1, dim=-1)
+        else:
+            xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1)
+            xt, yt, zt, wt, lt, ht, rt = torch.split(deltas, 1, dim=-1)
+
+        za = za + ha / 2
+        diagonal = torch.sqrt(la**2 + wa**2)
+        xg = xt * diagonal + xa
+        yg = yt * diagonal + ya
+        zg = zt * ha + za
+
+        lg = torch.exp(lt) * la
+        wg = torch.exp(wt) * wa
+        hg = torch.exp(ht) * ha
+        rg = rt + ra
+        zg = zg - hg / 2
+        cgs = [t + a for t, a in zip(cts, cas)]
+        return torch.cat([xg, yg, zg, wg, lg, hg, rg, *cgs], dim=-1)
diff --git a/mmde/mmdet3d/models/task_modules/coders/fcos3d_bbox_coder.py b/mmde/mmdet3d/models/task_modules/coders/fcos3d_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a8b286e5f2797f52151946f7bc89631e7e24ea1
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/fcos3d_bbox_coder.py
@@ -0,0 +1,136 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+from mmdet.models.task_modules import BaseBBoxCoder
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures.bbox_3d import limit_period
+
+
+@TASK_UTILS.register_module()
+class FCOS3DBBoxCoder(BaseBBoxCoder):
+    """Bounding box coder for FCOS3D.
+
+    Args:
+        base_depths (tuple[tuple[float]]): Depth references for decode box
+            depth. Defaults to None.
+        base_dims (tuple[tuple[float]]): Dimension references for decode box
+            dimension. Defaults to None.
+        code_size (int): The dimension of boxes to be encoded. Defaults to 7.
+        norm_on_bbox (bool): Whether to apply normalization on the bounding
+            box 2D attributes. Defaults to True.
+    """
+
+    def __init__(self,
+                 base_depths: Optional[Tuple[Tuple[float]]] = None,
+                 base_dims: Optional[Tuple[Tuple[float]]] = None,
+                 code_size: int = 7,
+                 norm_on_bbox: bool = True) -> None:
+        super(FCOS3DBBoxCoder, self).__init__()
+        self.base_depths = base_depths
+        self.base_dims = base_dims
+        self.bbox_code_size = code_size
+        self.norm_on_bbox = norm_on_bbox
+
+    def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels):
+        # TODO: refactor the encoder in the FCOS3D and PGD head
+        pass
+
+    def decode(self,
+               bbox: Tensor,
+               scale: tuple,
+               stride: int,
+               training: bool,
+               cls_score: Optional[Tensor] = None) -> Tensor:
+        """Decode regressed results into 3D predictions.
+
+        Note that offsets are not transformed to the projected 3D centers.
+
+        Args:
+            bbox (torch.Tensor): Raw bounding box predictions in shape
+                [N, C, H, W].
+            scale (tuple[`Scale`]): Learnable scale parameters.
+            stride (int): Stride for a specific feature level.
+            training (bool): Whether the decoding is in the training
+                procedure.
+            cls_score (torch.Tensor): Classification score map for deciding
+                which base depth or dim is used. Defaults to None.
+
+        Returns:
+            torch.Tensor: Decoded boxes.
+        """
+        # scale the bbox of different level
+        # only apply to offset, depth and size prediction
+        scale_offset, scale_depth, scale_size = scale[0:3]
+
+        clone_bbox = bbox.clone()
+        bbox[:, :2] = scale_offset(clone_bbox[:, :2]).float()
+        bbox[:, 2] = scale_depth(clone_bbox[:, 2]).float()
+        bbox[:, 3:6] = scale_size(clone_bbox[:, 3:6]).float()
+
+        if self.base_depths is None:
+            bbox[:, 2] = bbox[:, 2].exp()
+        elif len(self.base_depths) == 1:  # only single prior
+            mean = self.base_depths[0][0]
+            std = self.base_depths[0][1]
+            bbox[:, 2] = mean + bbox.clone()[:, 2] * std
+        else:  # multi-class priors
+            assert len(self.base_depths) == cls_score.shape[1], \
+                'The number of multi-class depth priors should be equal to ' \
+                'the number of categories.'
+            indices = cls_score.max(dim=1)[1]
+            depth_priors = cls_score.new_tensor(
+                self.base_depths)[indices, :].permute(0, 3, 1, 2)
+            mean = depth_priors[:, 0]
+            std = depth_priors[:, 1]
+            bbox[:, 2] = mean + bbox.clone()[:, 2] * std
+
+        bbox[:, 3:6] = bbox[:, 3:6].exp()
+        if self.base_dims is not None:
+            assert len(self.base_dims) == cls_score.shape[1], \
+                'The number of anchor sizes should be equal to the number ' \
+                'of categories.'
+            indices = cls_score.max(dim=1)[1]
+            size_priors = cls_score.new_tensor(
+                self.base_dims)[indices, :].permute(0, 3, 1, 2)
+            bbox[:, 3:6] = size_priors * bbox.clone()[:, 3:6]
+
+        assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\
+            'has not been thoroughly tested for FCOS3D.'
+        if self.norm_on_bbox:
+            if not training:
+                # Note that this line is conducted only when testing
+                bbox[:, :2] *= stride
+
+        return bbox
+
+    @staticmethod
+    def decode_yaw(bbox: Tensor, centers2d: Tensor, dir_cls: Tensor,
+                   dir_offset: float, cam2img: Tensor) -> Tensor:
+        """Decode yaw angle and change it from local to global.i.
+
+        Args:
+            bbox (torch.Tensor): Bounding box predictions in shape
+                [N, C] with yaws to be decoded.
+            centers2d (torch.Tensor): Projected 3D-center on the image planes
+                corresponding to the box predictions.
+            dir_cls (torch.Tensor): Predicted direction classes.
+            dir_offset (float): Direction offset before dividing all the
+                directions into several classes.
+            cam2img (torch.Tensor): Camera intrinsic matrix in shape [4, 4].
+
+        Returns:
+            torch.Tensor: Bounding boxes with decoded yaws.
+        """
+        if bbox.shape[0] > 0:
+            dir_rot = limit_period(bbox[..., 6] - dir_offset, 0, np.pi)
+            bbox[..., 6] = \
+                dir_rot + dir_offset + np.pi * dir_cls.to(bbox.dtype)
+
+        bbox[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2],
+                                 cam2img[0, 0]) + bbox[:, 6]
+
+        return bbox
diff --git a/mmde/mmdet3d/models/task_modules/coders/groupfree3d_bbox_coder.py b/mmde/mmdet3d/models/task_modules/coders/groupfree3d_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..3660c1d96d3423d2d63309518ae4267ee4019193
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/groupfree3d_bbox_coder.py
@@ -0,0 +1,200 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes
+from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class GroupFree3DBBoxCoder(PartialBinBasedBBoxCoder):
+    """Modified partial bin based bbox coder for GroupFree3D.
+
+    Args:
+        num_dir_bins (int): Number of bins to encode direction angle.
+        num_sizes (int): Number of size clusters.
+        mean_sizes (list[list[int]]): Mean size of bboxes in each class.
+        with_rot (bool, optional): Whether the bbox is with rotation.
+            Defaults to True.
+        size_cls_agnostic (bool, optional): Whether the predicted size is
+            class-agnostic. Defaults to True.
+    """
+
+    def __init__(self,
+                 num_dir_bins: int,
+                 num_sizes: int,
+                 mean_sizes: List[List[int]],
+                 with_rot: bool = True,
+                 size_cls_agnostic: bool = True) -> None:
+        super(GroupFree3DBBoxCoder, self).__init__(
+            num_dir_bins=num_dir_bins,
+            num_sizes=num_sizes,
+            mean_sizes=mean_sizes,
+            with_rot=with_rot)
+        self.size_cls_agnostic = size_cls_agnostic
+
+    def encode(self, gt_bboxes_3d: BaseInstance3DBoxes,
+               gt_labels_3d: Tensor) -> tuple:
+        """Encode ground truth to prediction targets.
+
+        Args:
+            gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes
+                with shape (n, 7).
+            gt_labels_3d (torch.Tensor): Ground truth classes.
+
+        Returns:
+            tuple: Targets of center, size and direction.
+        """
+        # generate center target
+        center_target = gt_bboxes_3d.gravity_center
+
+        # generate bbox size target
+        size_target = gt_bboxes_3d.dims
+        size_class_target = gt_labels_3d
+        size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor(
+            self.mean_sizes)[size_class_target]
+
+        # generate dir target
+        box_num = gt_labels_3d.shape[0]
+        if self.with_rot:
+            (dir_class_target,
+             dir_res_target) = self.angle2class(gt_bboxes_3d.yaw)
+        else:
+            dir_class_target = gt_labels_3d.new_zeros(box_num)
+            dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num)
+
+        return (center_target, size_target, size_class_target, size_res_target,
+                dir_class_target, dir_res_target)
+
+    def decode(self, bbox_out: dict, prefix: str = '') -> Tensor:
+        """Decode predicted parts to bbox3d.
+
+        Args:
+            bbox_out (dict): Predictions from model, should contain keys below.
+
+                - center: predicted bottom center of bboxes.
+                - dir_class: predicted bbox direction class.
+                - dir_res: predicted bbox direction residual.
+                - size_class: predicted bbox size class.
+                - size_res: predicted bbox size residual.
+                - size: predicted class-agnostic bbox size
+            prefix (str, optional): Decode predictions with specific prefix.
+                Defaults to ''.
+
+        Returns:
+            torch.Tensor: Decoded bbox3d with shape (batch, n, 7).
+        """
+        center = bbox_out[f'{prefix}center']
+        batch_size, num_proposal = center.shape[:2]
+
+        # decode heading angle
+        if self.with_rot:
+            dir_class = torch.argmax(bbox_out[f'{prefix}dir_class'], -1)
+            dir_res = torch.gather(bbox_out[f'{prefix}dir_res'], 2,
+                                   dir_class.unsqueeze(-1))
+            dir_res.squeeze_(2)
+            dir_angle = self.class2angle(dir_class, dir_res).reshape(
+                batch_size, num_proposal, 1)
+        else:
+            dir_angle = center.new_zeros(batch_size, num_proposal, 1)
+
+        # decode bbox size
+        if self.size_cls_agnostic:
+            bbox_size = bbox_out[f'{prefix}size'].reshape(
+                batch_size, num_proposal, 3)
+        else:
+            size_class = torch.argmax(
+                bbox_out[f'{prefix}size_class'], -1, keepdim=True)
+            size_res = torch.gather(
+                bbox_out[f'{prefix}size_res'], 2,
+                size_class.unsqueeze(-1).repeat(1, 1, 1, 3))
+            mean_sizes = center.new_tensor(self.mean_sizes)
+            size_base = torch.index_select(mean_sizes, 0,
+                                           size_class.reshape(-1))
+            bbox_size = size_base.reshape(batch_size, num_proposal,
+                                          -1) + size_res.squeeze(2)
+
+        bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
+        return bbox3d
+
+    def split_pred(self,
+                   cls_preds: Tensor,
+                   reg_preds: Tensor,
+                   base_xyz: Tensor,
+                   prefix: str = '') -> Dict[str, Tensor]:
+        """Split predicted features to specific parts.
+
+        Args:
+            cls_preds (torch.Tensor): Class predicted features to split.
+            reg_preds (torch.Tensor): Regression predicted features to split.
+            base_xyz (torch.Tensor): Coordinates of points.
+            prefix (str, optional): Decode predictions with specific prefix.
+                Defaults to ''.
+
+        Returns:
+            dict[str, torch.Tensor]: Split results.
+        """
+        results = {}
+        start, end = 0, 0
+
+        cls_preds_trans = cls_preds.transpose(2, 1)
+        reg_preds_trans = reg_preds.transpose(2, 1)
+
+        # decode center
+        end += 3
+        # (batch_size, num_proposal, 3)
+        results[f'{prefix}center_residual'] = \
+            reg_preds_trans[..., start:end].contiguous()
+        results[f'{prefix}center'] = base_xyz + \
+            reg_preds_trans[..., start:end].contiguous()
+        start = end
+
+        # decode direction
+        end += self.num_dir_bins
+        results[f'{prefix}dir_class'] = \
+            reg_preds_trans[..., start:end].contiguous()
+        start = end
+
+        end += self.num_dir_bins
+        dir_res_norm = reg_preds_trans[..., start:end].contiguous()
+        start = end
+
+        results[f'{prefix}dir_res_norm'] = dir_res_norm
+        results[f'{prefix}dir_res'] = dir_res_norm * (
+            np.pi / self.num_dir_bins)
+
+        # decode size
+        if self.size_cls_agnostic:
+            end += 3
+            results[f'{prefix}size'] = \
+                reg_preds_trans[..., start:end].contiguous()
+        else:
+            end += self.num_sizes
+            results[f'{prefix}size_class'] = reg_preds_trans[
+                ..., start:end].contiguous()
+            start = end
+
+            end += self.num_sizes * 3
+            size_res_norm = reg_preds_trans[..., start:end]
+            batch_size, num_proposal = reg_preds_trans.shape[:2]
+            size_res_norm = size_res_norm.view(
+                [batch_size, num_proposal, self.num_sizes, 3])
+            start = end
+
+            results[f'{prefix}size_res_norm'] = size_res_norm.contiguous()
+            mean_sizes = reg_preds.new_tensor(self.mean_sizes)
+            results[f'{prefix}size_res'] = (
+                size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0))
+
+        # decode objectness score
+        # Group-Free-3D objectness output shape (batch, proposal, 1)
+        results[f'{prefix}obj_scores'] = cls_preds_trans[..., :1].contiguous()
+
+        # decode semantic score
+        results[f'{prefix}sem_scores'] = cls_preds_trans[..., 1:].contiguous()
+
+        return results
diff --git a/mmde/mmdet3d/models/task_modules/coders/monoflex_bbox_coder.py b/mmde/mmdet3d/models/task_modules/coders/monoflex_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..57199255d07ab96fa3dc562a7e9074448ddebbba
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/monoflex_bbox_coder.py
@@ -0,0 +1,525 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+from mmdet.models.task_modules import BaseBBoxCoder
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes
+
+
+@TASK_UTILS.register_module()
+class MonoFlexCoder(BaseBBoxCoder):
+    """Bbox Coder for MonoFlex.
+
+    Args:
+        depth_mode (str): The mode for depth calculation.
+            Available options are "linear", "inv_sigmoid", and "exp".
+        base_depth (tuple[float]): References for decoding box depth.
+        depth_range (list): Depth range of predicted depth.
+        combine_depth (bool): Whether to use combined depth (direct depth
+            and depth from keypoints) or use direct depth only.
+        uncertainty_range (list): Uncertainty range of predicted depth.
+        base_dims (tuple[tuple[float]]): Dimensions mean and std of decode bbox
+            dimensions [l, h, w] for each category.
+        dims_mode (str): The mode for dimension calculation.
+            Available options are "linear" and "exp".
+        multibin (bool): Whether to use multibin representation.
+        num_dir_bins (int): Number of Number of bins to encode
+            direction angle.
+        bin_centers (list[float]): Local yaw centers while using multibin
+            representations.
+        bin_margin (float): Margin of multibin representations.
+        code_size (int): The dimension of boxes to be encoded.
+        eps (float, optional): A value added to the denominator for numerical
+            stability. Default 1e-3.
+    """
+
+    def __init__(self,
+                 depth_mode: str,
+                 base_depth: Tuple[float],
+                 depth_range: list,
+                 combine_depth: bool,
+                 uncertainty_range: list,
+                 base_dims: Tuple[Tuple[float]],
+                 dims_mode: str,
+                 multibin: bool,
+                 num_dir_bins: int,
+                 bin_centers: List[float],
+                 bin_margin: float,
+                 code_size: int,
+                 eps: float = 1e-3) -> None:
+        super(MonoFlexCoder, self).__init__()
+
+        # depth related
+        self.depth_mode = depth_mode
+        self.base_depth = base_depth
+        self.depth_range = depth_range
+        self.combine_depth = combine_depth
+        self.uncertainty_range = uncertainty_range
+
+        # dimensions related
+        self.base_dims = base_dims
+        self.dims_mode = dims_mode
+
+        # orientation related
+        self.multibin = multibin
+        self.num_dir_bins = num_dir_bins
+        self.bin_centers = bin_centers
+        self.bin_margin = bin_margin
+
+        # output related
+        self.bbox_code_size = code_size
+        self.eps = eps
+
+    def encode(self, gt_bboxes_3d: BaseInstance3DBoxes) -> Tensor:
+        """Encode ground truth to prediction targets.
+
+        Args:
+            gt_bboxes_3d (`BaseInstance3DBoxes`): Ground truth 3D bboxes.
+                shape: (N, 7).
+
+        Returns:
+            torch.Tensor: Targets of orientations.
+        """
+        local_yaw = gt_bboxes_3d.local_yaw
+        # encode local yaw (-pi ~ pi) to multibin format
+        encode_local_yaw = local_yaw.new_zeros(
+            [local_yaw.shape[0], self.num_dir_bins * 2])
+        bin_size = 2 * np.pi / self.num_dir_bins
+        margin_size = bin_size * self.bin_margin
+
+        bin_centers = local_yaw.new_tensor(self.bin_centers)
+        range_size = bin_size / 2 + margin_size
+
+        offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0)
+        offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi
+        offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi
+
+        for i in range(self.num_dir_bins):
+            offset = offsets[:, i]
+            inds = abs(offset) < range_size
+            encode_local_yaw[inds, i] = 1
+            encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds]
+
+        orientation_target = encode_local_yaw
+
+        return orientation_target
+
+    def decode(self, bbox: Tensor, base_centers2d: Tensor, labels: Tensor,
+               downsample_ratio: int, cam2imgs: Tensor) -> Dict[str, Tensor]:
+        """Decode bounding box regression into 3D predictions.
+
+        Args:
+            bbox (Tensor): Raw bounding box predictions for each
+                predict center2d point.
+                shape: (N, C)
+            base_centers2d (torch.Tensor): Base centers2d for 3D bboxes.
+                shape: (N, 2).
+            labels (Tensor): Batch predict class label for each predict
+                center2d point.
+                shape: (N, )
+            downsample_ratio (int): The stride of feature map.
+            cam2imgs (Tensor): Batch images' camera intrinsic matrix.
+                shape: kitti (N, 4, 4)  nuscenes (N, 3, 3)
+
+        Return:
+            dict: The 3D prediction dict decoded from regression map.
+            the dict has components below:
+                - bboxes2d (torch.Tensor): Decoded [x1, y1, x2, y2] format
+                    2D bboxes.
+                - dimensions (torch.Tensor): Decoded dimensions for each
+                    object.
+                - offsets2d (torch.Tenosr): Offsets between base centers2d
+                    and real centers2d.
+                - direct_depth (torch.Tensor): Decoded directly regressed
+                    depth.
+                - keypoints2d (torch.Tensor): Keypoints of each projected
+                    3D box on image.
+                - keypoints_depth (torch.Tensor): Decoded depth from keypoints.
+                - combined_depth (torch.Tensor): Combined depth using direct
+                    depth and keypoints depth with depth uncertainty.
+                - orientations (torch.Tensor): Multibin format orientations
+                    (local yaw) for each objects.
+        """
+
+        # 4 dimensions for FCOS style regression
+        pred_bboxes2d = bbox[:, 0:4]
+
+        # change FCOS style to [x1, y1, x2, y2] format for IOU Loss
+        pred_bboxes2d = self.decode_bboxes2d(pred_bboxes2d, base_centers2d)
+
+        # 2 dimensions for projected centers2d offsets
+        pred_offsets2d = bbox[:, 4:6]
+
+        # 3 dimensions for 3D bbox dimensions offsets
+        pred_dimensions_offsets3d = bbox[:, 29:32]
+
+        # the first 8 dimensions are for orientation bin classification
+        # and the second 8 dimensions are for orientation offsets.
+        pred_orientations = torch.cat((bbox[:, 32:40], bbox[:, 40:48]), dim=1)
+
+        # 3 dimensions for the uncertainties of the solved depths from
+        # groups of keypoints
+        pred_keypoints_depth_uncertainty = bbox[:, 26:29]
+
+        # 1 dimension for the uncertainty of directly regressed depth
+        pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1)
+
+        # 2 dimension of offsets x keypoints (8 corners + top/bottom center)
+        pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2)
+
+        # 1 dimension for depth offsets
+        pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1)
+
+        # decode the pred residual dimensions to real dimensions
+        pred_dimensions = self.decode_dims(labels, pred_dimensions_offsets3d)
+        pred_direct_depth = self.decode_direct_depth(pred_direct_depth_offsets)
+        pred_keypoints_depth = self.keypoints2depth(pred_keypoints2d,
+                                                    pred_dimensions, cam2imgs,
+                                                    downsample_ratio)
+
+        pred_direct_depth_uncertainty = torch.clamp(
+            pred_direct_depth_uncertainty, self.uncertainty_range[0],
+            self.uncertainty_range[1])
+        pred_keypoints_depth_uncertainty = torch.clamp(
+            pred_keypoints_depth_uncertainty, self.uncertainty_range[0],
+            self.uncertainty_range[1])
+
+        if self.combine_depth:
+            pred_depth_uncertainty = torch.cat(
+                (pred_direct_depth_uncertainty.unsqueeze(-1),
+                 pred_keypoints_depth_uncertainty),
+                dim=1).exp()
+            pred_depth = torch.cat(
+                (pred_direct_depth.unsqueeze(-1), pred_keypoints_depth), dim=1)
+            pred_combined_depth = \
+                self.combine_depths(pred_depth, pred_depth_uncertainty)
+        else:
+            pred_combined_depth = None
+
+        preds = dict(
+            bboxes2d=pred_bboxes2d,
+            dimensions=pred_dimensions,
+            offsets2d=pred_offsets2d,
+            keypoints2d=pred_keypoints2d,
+            orientations=pred_orientations,
+            direct_depth=pred_direct_depth,
+            keypoints_depth=pred_keypoints_depth,
+            combined_depth=pred_combined_depth,
+            direct_depth_uncertainty=pred_direct_depth_uncertainty,
+            keypoints_depth_uncertainty=pred_keypoints_depth_uncertainty,
+        )
+
+        return preds
+
+    def decode_direct_depth(self, depth_offsets: Tensor) -> Tensor:
+        """Transform depth offset to directly regressed depth.
+
+        Args:
+            depth_offsets (torch.Tensor): Predicted depth offsets.
+                shape: (N, )
+
+        Return:
+            torch.Tensor: Directly regressed depth.
+                shape: (N, )
+        """
+        if self.depth_mode == 'exp':
+            direct_depth = depth_offsets.exp()
+        elif self.depth_mode == 'linear':
+            base_depth = depth_offsets.new_tensor(self.base_depth)
+            direct_depth = depth_offsets * base_depth[1] + base_depth[0]
+        elif self.depth_mode == 'inv_sigmoid':
+            direct_depth = 1 / torch.sigmoid(depth_offsets) - 1
+        else:
+            raise ValueError
+
+        if self.depth_range is not None:
+            direct_depth = torch.clamp(
+                direct_depth, min=self.depth_range[0], max=self.depth_range[1])
+
+        return direct_depth
+
+    def decode_location(self,
+                        base_centers2d: Tensor,
+                        offsets2d: Tensor,
+                        depths: Tensor,
+                        cam2imgs: Tensor,
+                        downsample_ratio: Tensor,
+                        pad_mode: str = 'default') -> Tuple[Tensor]:
+        """Retrieve object location.
+
+        Args:
+            base_centers2d (torch.Tensor): predicted base centers2d.
+                shape: (N, 2)
+            offsets2d (torch.Tensor): The offsets between real centers2d
+                and base centers2d.
+                shape: (N , 2)
+            depths (torch.Tensor): Depths of objects.
+                shape: (N, )
+            cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix.
+                shape: kitti (N, 4, 4)  nuscenes (N, 3, 3)
+            downsample_ratio (int): The stride of feature map.
+            pad_mode (str, optional): Padding mode used in
+                training data augmentation.
+
+        Return:
+            tuple(torch.Tensor): Centers of 3D boxes.
+                shape: (N, 3)
+        """
+        N = cam2imgs.shape[0]
+        # (N, 4, 4)
+        cam2imgs_inv = cam2imgs.inverse()
+        if pad_mode == 'default':
+            centers2d_img = (base_centers2d + offsets2d) * downsample_ratio
+        else:
+            raise NotImplementedError
+        # (N, 3)
+        centers2d_img = \
+            torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1)
+        # (N, 4, 1)
+        centers2d_extend = \
+            torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)),
+                      dim=1).unsqueeze(-1)
+        locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1)
+
+        return locations[:, :3]
+
+    def keypoints2depth(
+            self,
+            keypoints2d: Tensor,
+            dimensions: Tensor,
+            cam2imgs: Tensor,
+            downsample_ratio: int = 4,
+            group0_index: List[Tuple[int]] = [(7, 3), (0, 4)],
+            group1_index: List[Tuple[int]] = [(2, 6),
+                                              (1, 5)]) -> Tuple[Tensor]:
+        """Decode depth form three groups of keypoints and geometry projection
+        model. 2D keypoints inlucding 8 coreners and top/bottom centers will be
+        divided into three groups which will be used to calculate three depths
+        of object.
+
+        .. code-block:: none
+
+                Group center keypoints:
+
+                             + --------------- +
+                            /|   top center   /|
+                           / |      .        / |
+                          /  |      |       /  |
+                         + ---------|----- +   +
+                         |  /       |      |  /
+                         | /        .      | /
+                         |/ bottom center  |/
+                         + --------------- +
+
+                Group 0 keypoints:
+
+                             0
+                             + -------------- +
+                            /|               /|
+                           / |              / |
+                          /  |            5/  |
+                         + -------------- +   +
+                         |  /3            |  /
+                         | /              | /
+                         |/               |/
+                         + -------------- + 6
+
+                Group 1 keypoints:
+
+                                               4
+                             + -------------- +
+                            /|               /|
+                           / |              / |
+                          /  |             /  |
+                       1 + -------------- +   + 7
+                         |  /             |  /
+                         | /              | /
+                         |/               |/
+                       2 + -------------- +
+
+
+        Args:
+            keypoints2d (torch.Tensor): Keypoints of objects.
+                8 vertices + top/bottom center.
+                shape: (N, 10, 2)
+            dimensions (torch.Tensor): Dimensions of objetcts.
+                shape: (N, 3)
+            cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix.
+                shape: kitti (N, 4, 4)  nuscenes (N, 3, 3)
+            downsample_ratio (int, opitonal): The stride of feature map.
+                Defaults: 4.
+            group0_index(list[tuple[int]], optional): Keypoints group 0
+                of index to calculate the depth.
+                Defaults: [0, 3, 4, 7].
+            group1_index(list[tuple[int]], optional): Keypoints group 1
+                of index to calculate the depth.
+                Defaults: [1, 2, 5, 6]
+
+        Return:
+            tuple(torch.Tensor): Depth computed from three groups of
+                keypoints (top/bottom, group0, group1)
+                shape: (N, 3)
+        """
+
+        pred_height_3d = dimensions[:, 1].clone()
+        f_u = cam2imgs[:, 0, 0]
+        center_height = keypoints2d[:, -2, 1] - keypoints2d[:, -1, 1]
+        corner_group0_height = keypoints2d[:, group0_index[0], 1] \
+            - keypoints2d[:, group0_index[1], 1]
+        corner_group1_height = keypoints2d[:, group1_index[0], 1] \
+            - keypoints2d[:, group1_index[1], 1]
+        center_depth = f_u * pred_height_3d / (
+            F.relu(center_height) * downsample_ratio + self.eps)
+        corner_group0_depth = (f_u * pred_height_3d).unsqueeze(-1) / (
+            F.relu(corner_group0_height) * downsample_ratio + self.eps)
+        corner_group1_depth = (f_u * pred_height_3d).unsqueeze(-1) / (
+            F.relu(corner_group1_height) * downsample_ratio + self.eps)
+
+        corner_group0_depth = corner_group0_depth.mean(dim=1)
+        corner_group1_depth = corner_group1_depth.mean(dim=1)
+
+        keypoints_depth = torch.stack(
+            (center_depth, corner_group0_depth, corner_group1_depth), dim=1)
+        keypoints_depth = torch.clamp(
+            keypoints_depth, min=self.depth_range[0], max=self.depth_range[1])
+
+        return keypoints_depth
+
+    def decode_dims(self, labels: Tensor, dims_offset: Tensor) -> Tensor:
+        """Retrieve object dimensions.
+
+        Args:
+            labels (torch.Tensor): Each points' category id.
+                shape: (N, K)
+            dims_offset (torch.Tensor): Dimension offsets.
+                shape: (N, 3)
+
+        Returns:
+            torch.Tensor: Shape (N, 3)
+        """
+
+        if self.dims_mode == 'exp':
+            dims_offset = dims_offset.exp()
+        elif self.dims_mode == 'linear':
+            labels = labels.long()
+            base_dims = dims_offset.new_tensor(self.base_dims)
+            dims_mean = base_dims[:, :3]
+            dims_std = base_dims[:, 3:6]
+            cls_dimension_mean = dims_mean[labels, :]
+            cls_dimension_std = dims_std[labels, :]
+            dimensions = dims_offset * cls_dimension_mean + cls_dimension_std
+        else:
+            raise ValueError
+
+        return dimensions
+
+    def decode_orientation(self, ori_vector: Tensor,
+                           locations: Tensor) -> Tuple[Tensor]:
+        """Retrieve object orientation.
+
+        Args:
+            ori_vector (torch.Tensor): Local orientation vector
+                in [axis_cls, head_cls, sin, cos] format.
+                shape: (N, num_dir_bins * 4)
+            locations (torch.Tensor): Object location.
+                shape: (N, 3)
+
+        Returns:
+            tuple[torch.Tensor]: yaws and local yaws of 3d bboxes.
+        """
+        if self.multibin:
+            pred_bin_cls = ori_vector[:, :self.num_dir_bins * 2].view(
+                -1, self.num_dir_bins, 2)
+            pred_bin_cls = pred_bin_cls.softmax(dim=2)[..., 1]
+            orientations = ori_vector.new_zeros(ori_vector.shape[0])
+            for i in range(self.num_dir_bins):
+                mask_i = (pred_bin_cls.argmax(dim=1) == i)
+                start_bin = self.num_dir_bins * 2 + i * 2
+                end_bin = start_bin + 2
+                pred_bin_offset = ori_vector[mask_i, start_bin:end_bin]
+                orientations[mask_i] = pred_bin_offset[:, 0].atan2(
+                    pred_bin_offset[:, 1]) + self.bin_centers[i]
+        else:
+            axis_cls = ori_vector[:, :2].softmax(dim=1)
+            axis_cls = axis_cls[:, 0] < axis_cls[:, 1]
+            head_cls = ori_vector[:, 2:4].softmax(dim=1)
+            head_cls = head_cls[:, 0] < head_cls[:, 1]
+            # cls axis
+            orientations = self.bin_centers[axis_cls + head_cls * 2]
+            sin_cos_offset = F.normalize(ori_vector[:, 4:])
+            orientations += sin_cos_offset[:, 0].atan(sin_cos_offset[:, 1])
+
+        locations = locations.view(-1, 3)
+        rays = locations[:, 0].atan2(locations[:, 2])
+        local_yaws = orientations
+        yaws = local_yaws + rays
+
+        larger_idx = (yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (yaws < -np.pi).nonzero(as_tuple=False)
+        if len(larger_idx) != 0:
+            yaws[larger_idx] -= 2 * np.pi
+        if len(small_idx) != 0:
+            yaws[small_idx] += 2 * np.pi
+
+        larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False)
+        small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False)
+        if len(larger_idx) != 0:
+            local_yaws[larger_idx] -= 2 * np.pi
+        if len(small_idx) != 0:
+            local_yaws[small_idx] += 2 * np.pi
+
+        return yaws, local_yaws
+
+    def decode_bboxes2d(self, reg_bboxes2d: Tensor,
+                        base_centers2d: Tensor) -> Tensor:
+        """Retrieve [x1, y1, x2, y2] format 2D bboxes.
+
+        Args:
+            reg_bboxes2d (torch.Tensor): Predicted FCOS style
+                2D bboxes.
+                shape: (N, 4)
+            base_centers2d (torch.Tensor): predicted base centers2d.
+                shape: (N, 2)
+
+        Returns:
+            torch.Tenosr: [x1, y1, x2, y2] format 2D bboxes.
+        """
+        centers_x = base_centers2d[:, 0]
+        centers_y = base_centers2d[:, 1]
+
+        xs_min = centers_x - reg_bboxes2d[..., 0]
+        ys_min = centers_y - reg_bboxes2d[..., 1]
+        xs_max = centers_x + reg_bboxes2d[..., 2]
+        ys_max = centers_y + reg_bboxes2d[..., 3]
+
+        bboxes2d = torch.stack([xs_min, ys_min, xs_max, ys_max], dim=-1)
+
+        return bboxes2d
+
+    def combine_depths(self, depth: Tensor,
+                       depth_uncertainty: Tensor) -> Tensor:
+        """Combine all the prediced depths with depth uncertainty.
+
+        Args:
+            depth (torch.Tensor): Predicted depths of each object.
+                2D bboxes.
+                shape: (N, 4)
+            depth_uncertainty (torch.Tensor): Depth uncertainty for
+                each depth of each object.
+                shape: (N, 4)
+
+        Returns:
+            torch.Tenosr: combined depth.
+        """
+        uncertainty_weights = 1 / depth_uncertainty
+        uncertainty_weights = \
+            uncertainty_weights / \
+            uncertainty_weights.sum(dim=1, keepdim=True)
+        combined_depth = torch.sum(depth * uncertainty_weights, dim=1)
+
+        return combined_depth
diff --git a/mmde/mmdet3d/models/task_modules/coders/partial_bin_based_bbox_coder.py b/mmde/mmdet3d/models/task_modules/coders/partial_bin_based_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d1184c50a259cff475e221f8425269c5e7c1d6b
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/partial_bin_based_bbox_coder.py
@@ -0,0 +1,255 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List
+
+import numpy as np
+import torch
+from mmdet.models.task_modules import BaseBBoxCoder
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes
+
+
+@TASK_UTILS.register_module()
+class PartialBinBasedBBoxCoder(BaseBBoxCoder):
+    """Partial bin based bbox coder.
+
+    Args:
+        num_dir_bins (int): Number of bins to encode direction angle.
+        num_sizes (int): Number of size clusters.
+        mean_sizes (list[list[int]]): Mean size of bboxes in each class.
+        with_rot (bool): Whether the bbox is with rotation.
+    """
+
+    def __init__(self,
+                 num_dir_bins: int,
+                 num_sizes: int,
+                 mean_sizes: List[List[int]],
+                 with_rot: bool = True):
+        super(PartialBinBasedBBoxCoder, self).__init__()
+        assert len(mean_sizes) == num_sizes
+        self.num_dir_bins = num_dir_bins
+        self.num_sizes = num_sizes
+        self.mean_sizes = mean_sizes
+        self.with_rot = with_rot
+
+    def encode(self, gt_bboxes_3d: BaseInstance3DBoxes,
+               gt_labels_3d: Tensor) -> tuple:
+        """Encode ground truth to prediction targets.
+
+        Args:
+            gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes
+                with shape (n, 7).
+            gt_labels_3d (torch.Tensor): Ground truth classes.
+
+        Returns:
+            tuple: Targets of center, size and direction.
+        """
+        # generate center target
+        center_target = gt_bboxes_3d.gravity_center
+
+        # generate bbox size target
+        size_class_target = gt_labels_3d
+        size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor(
+            self.mean_sizes)[size_class_target]
+
+        # generate dir target
+        box_num = gt_labels_3d.shape[0]
+        if self.with_rot:
+            (dir_class_target,
+             dir_res_target) = self.angle2class(gt_bboxes_3d.yaw)
+        else:
+            dir_class_target = gt_labels_3d.new_zeros(box_num)
+            dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num)
+
+        return (center_target, size_class_target, size_res_target,
+                dir_class_target, dir_res_target)
+
+    def decode(self, bbox_out: dict, suffix: str = '') -> Tensor:
+        """Decode predicted parts to bbox3d.
+
+        Args:
+            bbox_out (dict): Predictions from model, should contain keys below.
+
+                - center: predicted bottom center of bboxes.
+                - dir_class: predicted bbox direction class.
+                - dir_res: predicted bbox direction residual.
+                - size_class: predicted bbox size class.
+                - size_res: predicted bbox size residual.
+            suffix (str): Decode predictions with specific suffix.
+
+        Returns:
+            torch.Tensor: Decoded bbox3d with shape (batch, n, 7).
+        """
+        center = bbox_out['center' + suffix]
+        batch_size, num_proposal = center.shape[:2]
+
+        # decode heading angle
+        if self.with_rot:
+            dir_class = torch.argmax(bbox_out['dir_class' + suffix], -1)
+            dir_res = torch.gather(bbox_out['dir_res' + suffix], 2,
+                                   dir_class.unsqueeze(-1))
+            dir_res.squeeze_(2)
+            dir_angle = self.class2angle(dir_class, dir_res).reshape(
+                batch_size, num_proposal, 1)
+        else:
+            dir_angle = center.new_zeros(batch_size, num_proposal, 1)
+
+        # decode bbox size
+        size_class = torch.argmax(
+            bbox_out['size_class' + suffix], -1, keepdim=True)
+        size_res = torch.gather(bbox_out['size_res' + suffix], 2,
+                                size_class.unsqueeze(-1).repeat(1, 1, 1, 3))
+        mean_sizes = center.new_tensor(self.mean_sizes)
+        size_base = torch.index_select(mean_sizes, 0, size_class.reshape(-1))
+        bbox_size = size_base.reshape(batch_size, num_proposal,
+                                      -1) + size_res.squeeze(2)
+
+        bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
+        return bbox3d
+
+    def decode_corners(self, center: Tensor, size_res: Tensor,
+                       size_class: Tensor) -> Tensor:
+        """Decode center, size residuals and class to corners. Only useful for
+        axis-aligned bounding boxes, so angle isn't considered.
+
+        Args:
+            center (torch.Tensor): Shape [B, N, 3]
+            size_res (torch.Tensor): Shape [B, N, 3] or [B, N, C, 3]
+            size_class (torch.Tensor): Shape: [B, N] or [B, N, 1]
+            or [B, N, C, 3]
+
+        Returns:
+            torch.Tensor: Corners with shape [B, N, 6]
+        """
+        if len(size_class.shape) == 2 or size_class.shape[-1] == 1:
+            batch_size, proposal_num = size_class.shape[:2]
+            one_hot_size_class = size_res.new_zeros(
+                (batch_size, proposal_num, self.num_sizes))
+            if len(size_class.shape) == 2:
+                size_class = size_class.unsqueeze(-1)
+            one_hot_size_class.scatter_(2, size_class, 1)
+            one_hot_size_class_expand = one_hot_size_class.unsqueeze(
+                -1).repeat(1, 1, 1, 3).contiguous()
+        else:
+            one_hot_size_class_expand = size_class
+
+        if len(size_res.shape) == 4:
+            size_res = torch.sum(size_res * one_hot_size_class_expand, 2)
+
+        mean_sizes = size_res.new_tensor(self.mean_sizes)
+        mean_sizes = torch.sum(mean_sizes * one_hot_size_class_expand, 2)
+        size_full = (size_res + 1) * mean_sizes
+        size_full = torch.clamp(size_full, 0)
+        half_size_full = size_full / 2
+        corner1 = center - half_size_full
+        corner2 = center + half_size_full
+        corners = torch.cat([corner1, corner2], dim=-1)
+        return corners
+
+    def split_pred(self, cls_preds: Tensor, reg_preds: Tensor,
+                   base_xyz: Tensor) -> Dict[str, Tensor]:
+        """Split predicted features to specific parts.
+
+        Args:
+            cls_preds (torch.Tensor): Class predicted features to split.
+            reg_preds (torch.Tensor): Regression predicted features to split.
+            base_xyz (torch.Tensor): Coordinates of points.
+
+        Returns:
+            dict[str, torch.Tensor]: Split results.
+        """
+        results = {}
+        start, end = 0, 0
+
+        cls_preds_trans = cls_preds.transpose(2, 1)
+        reg_preds_trans = reg_preds.transpose(2, 1)
+
+        # decode center
+        end += 3
+        # (batch_size, num_proposal, 3)
+        results['center'] = base_xyz + \
+            reg_preds_trans[..., start:end].contiguous()
+        start = end
+
+        # decode direction
+        end += self.num_dir_bins
+        results['dir_class'] = reg_preds_trans[..., start:end].contiguous()
+        start = end
+
+        end += self.num_dir_bins
+        dir_res_norm = reg_preds_trans[..., start:end].contiguous()
+        start = end
+
+        results['dir_res_norm'] = dir_res_norm
+        results['dir_res'] = dir_res_norm * (np.pi / self.num_dir_bins)
+
+        # decode size
+        end += self.num_sizes
+        results['size_class'] = reg_preds_trans[..., start:end].contiguous()
+        start = end
+
+        end += self.num_sizes * 3
+        size_res_norm = reg_preds_trans[..., start:end]
+        batch_size, num_proposal = reg_preds_trans.shape[:2]
+        size_res_norm = size_res_norm.view(
+            [batch_size, num_proposal, self.num_sizes, 3])
+        start = end
+
+        results['size_res_norm'] = size_res_norm.contiguous()
+        mean_sizes = reg_preds.new_tensor(self.mean_sizes)
+        results['size_res'] = (
+            size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0))
+
+        # decode objectness score
+        start = 0
+        end = 2
+        results['obj_scores'] = cls_preds_trans[..., start:end].contiguous()
+        start = end
+
+        # decode semantic score
+        results['sem_scores'] = cls_preds_trans[..., start:].contiguous()
+
+        return results
+
+    def angle2class(self, angle: Tensor) -> tuple:
+        """Convert continuous angle to a discrete class and a residual.
+
+        Convert continuous angle to a discrete class and a small
+        regression number from class center angle to current angle.
+
+        Args:
+            angle (torch.Tensor): Angle is from 0-2pi (or -pi~pi),
+                class center at 0, 1*(2pi/N), 2*(2pi/N) ...  (N-1)*(2pi/N).
+
+        Returns:
+            tuple: Encoded discrete class and residual.
+        """
+        angle = angle % (2 * np.pi)
+        angle_per_class = 2 * np.pi / float(self.num_dir_bins)
+        shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi)
+        angle_cls = shifted_angle // angle_per_class
+        angle_res = shifted_angle - (
+            angle_cls * angle_per_class + angle_per_class / 2)
+        return angle_cls.long(), angle_res
+
+    def class2angle(self,
+                    angle_cls: Tensor,
+                    angle_res: Tensor,
+                    limit_period: bool = True) -> Tensor:
+        """Inverse function to angle2class.
+
+        Args:
+            angle_cls (torch.Tensor): Angle class to decode.
+            angle_res (torch.Tensor): Angle residual to decode.
+            limit_period (bool): Whether to limit angle to [-pi, pi].
+
+        Returns:
+            torch.Tensor: Angle decoded from angle_cls and angle_res.
+        """
+        angle_per_class = 2 * np.pi / float(self.num_dir_bins)
+        angle_center = angle_cls.float() * angle_per_class
+        angle = angle_center + angle_res
+        if limit_period:
+            angle[angle > np.pi] -= 2 * np.pi
+        return angle
diff --git a/mmde/mmdet3d/models/task_modules/coders/pgd_bbox_coder.py b/mmde/mmdet3d/models/task_modules/coders/pgd_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..d689490aa7fc7344ccd7c2f83b03d5e4d2d69659
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/pgd_bbox_coder.py
@@ -0,0 +1,132 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import numpy as np
+import torch
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.registry import TASK_UTILS
+from .fcos3d_bbox_coder import FCOS3DBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class PGDBBoxCoder(FCOS3DBBoxCoder):
+    """Bounding box coder for PGD."""
+
+    def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels):
+        # TODO: refactor the encoder codes in the FCOS3D and PGD head
+        pass
+
+    def decode_2d(self,
+                  bbox: Tensor,
+                  scale: tuple,
+                  stride: int,
+                  max_regress_range: int,
+                  training: bool,
+                  pred_keypoints: bool = False,
+                  pred_bbox2d: bool = True) -> Tensor:
+        """Decode regressed 2D attributes.
+
+        Args:
+            bbox (torch.Tensor): Raw bounding box predictions in shape
+                [N, C, H, W].
+            scale (tuple[`Scale`]): Learnable scale parameters.
+            stride (int): Stride for a specific feature level.
+            max_regress_range (int): Maximum regression range for a specific
+                feature level.
+            training (bool): Whether the decoding is in the training
+                procedure.
+            pred_keypoints (bool, optional): Whether to predict keypoints.
+                Defaults to False.
+            pred_bbox2d (bool, optional): Whether to predict 2D bounding
+                boxes. Defaults to False.
+
+        Returns:
+            torch.Tensor: Decoded boxes.
+        """
+        clone_bbox = bbox.clone()
+        if pred_keypoints:
+            scale_kpts = scale[3]
+            # 2 dimension of offsets x 8 corners of a 3D bbox
+            bbox[:, self.bbox_code_size:self.bbox_code_size + 16] = \
+                torch.tanh(scale_kpts(clone_bbox[
+                    :, self.bbox_code_size:self.bbox_code_size + 16]).float())
+
+        if pred_bbox2d:
+            scale_bbox2d = scale[-1]
+            # The last four dimensions are offsets to four sides of a 2D bbox
+            bbox[:, -4:] = scale_bbox2d(clone_bbox[:, -4:]).float()
+
+        if self.norm_on_bbox:
+            if pred_bbox2d:
+                bbox[:, -4:] = F.relu(bbox.clone()[:, -4:])
+            if not training:
+                if pred_keypoints:
+                    bbox[
+                        :, self.bbox_code_size:self.bbox_code_size + 16] *= \
+                           max_regress_range
+                if pred_bbox2d:
+                    bbox[:, -4:] *= stride
+        else:
+            if pred_bbox2d:
+                bbox[:, -4:] = bbox.clone()[:, -4:].exp()
+        return bbox
+
+    def decode_prob_depth(self, depth_cls_preds: Tensor,
+                          depth_range: Tuple[float], depth_unit: int,
+                          division: str, num_depth_cls: int) -> Tensor:
+        """Decode probabilistic depth map.
+
+        Args:
+            depth_cls_preds (torch.Tensor): Depth probabilistic map in shape
+                [..., self.num_depth_cls] (raw output before softmax).
+            depth_range (tuple[float]): Range of depth estimation.
+            depth_unit (int): Unit of depth range division.
+            division (str): Depth division method. Options include 'uniform',
+                'linear', 'log', 'loguniform'.
+            num_depth_cls (int): Number of depth classes.
+
+        Returns:
+            torch.Tensor: Decoded probabilistic depth estimation.
+        """
+        if division == 'uniform':
+            depth_multiplier = depth_unit * \
+                depth_cls_preds.new_tensor(
+                    list(range(num_depth_cls))).reshape([1, -1])
+            prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) *
+                                depth_multiplier).sum(dim=-1)
+            return prob_depth_preds
+        elif division == 'linear':
+            split_pts = depth_cls_preds.new_tensor(list(
+                range(num_depth_cls))).reshape([1, -1])
+            depth_multiplier = depth_range[0] + (
+                depth_range[1] - depth_range[0]) / \
+                (num_depth_cls * (num_depth_cls - 1)) * \
+                (split_pts * (split_pts+1))
+            prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) *
+                                depth_multiplier).sum(dim=-1)
+            return prob_depth_preds
+        elif division == 'log':
+            split_pts = depth_cls_preds.new_tensor(list(
+                range(num_depth_cls))).reshape([1, -1])
+            start = max(depth_range[0], 1)
+            end = depth_range[1]
+            depth_multiplier = (np.log(start) +
+                                split_pts * np.log(end / start) /
+                                (num_depth_cls - 1)).exp()
+            prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) *
+                                depth_multiplier).sum(dim=-1)
+            return prob_depth_preds
+        elif division == 'loguniform':
+            split_pts = depth_cls_preds.new_tensor(list(
+                range(num_depth_cls))).reshape([1, -1])
+            start = max(depth_range[0], 1)
+            end = depth_range[1]
+            log_multiplier = np.log(start) + \
+                split_pts * np.log(end / start) / (num_depth_cls - 1)
+            prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) *
+                                log_multiplier).sum(dim=-1).exp()
+            return prob_depth_preds
+        else:
+            raise NotImplementedError
diff --git a/mmde/mmdet3d/models/task_modules/coders/point_xyzwhlr_bbox_coder.py b/mmde/mmdet3d/models/task_modules/coders/point_xyzwhlr_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..09c20b7421b629a62d55eb44953362b480bf5627
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/point_xyzwhlr_bbox_coder.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+import numpy as np
+import torch
+from mmdet.models.task_modules import BaseBBoxCoder
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures import BaseInstance3DBoxes
+
+
+@TASK_UTILS.register_module()
+class PointXYZWHLRBBoxCoder(BaseBBoxCoder):
+    """Point based bbox coder for 3D boxes.
+
+    Args:
+        code_size (int): The dimension of boxes to be encoded.
+        use_mean_size (bool, optional): Whether using anchors based on class.
+            Defaults to True.
+        mean_size (list[list[float]], optional): Mean size of bboxes in
+            each class. Defaults to None.
+    """
+
+    def __init__(self,
+                 code_size: int = 7,
+                 use_mean_size: bool = True,
+                 mean_size: List[List[float]] = None):
+        super(PointXYZWHLRBBoxCoder, self).__init__()
+        self.code_size = code_size
+        self.use_mean_size = use_mean_size
+        if self.use_mean_size:
+            self.mean_size = torch.from_numpy(np.array(mean_size)).float()
+            assert self.mean_size.min() > 0, \
+                f'The min of mean_size should > 0, however currently it is '\
+                f'{self.mean_size.min()}, please check it in your config.'
+
+    def encode(self,
+               gt_bboxes_3d: BaseInstance3DBoxes,
+               points: Tensor,
+               gt_labels_3d: Optional[Tensor] = None) -> Tensor:
+        """Encode ground truth to prediction targets.
+
+        Args:
+            gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth bboxes
+                with shape (N, 7 + C).
+            points (torch.Tensor): Point cloud with shape (N, 3).
+            gt_labels_3d (torch.Tensor, optional): Ground truth classes.
+                Defaults to None.
+
+        Returns:
+            torch.Tensor: Encoded boxes with shape (N, 8 + C).
+        """
+        gt_bboxes_3d[:, 3:6] = torch.clamp_min(gt_bboxes_3d[:, 3:6], min=1e-5)
+
+        xg, yg, zg, dxg, dyg, dzg, rg, *cgs = torch.split(
+            gt_bboxes_3d, 1, dim=-1)
+        xa, ya, za = torch.split(points, 1, dim=-1)
+
+        if self.use_mean_size:
+            assert gt_labels_3d.max() <= self.mean_size.shape[0] - 1, \
+                f'the max gt label {gt_labels_3d.max()} is bigger than' \
+                f'anchor types {self.mean_size.shape[0] - 1}.'
+            self.mean_size = self.mean_size.to(gt_labels_3d.device)
+            point_anchor_size = self.mean_size[gt_labels_3d]
+            dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1)
+            diagonal = torch.sqrt(dxa**2 + dya**2)
+            xt = (xg - xa) / diagonal
+            yt = (yg - ya) / diagonal
+            zt = (zg - za) / dza
+            dxt = torch.log(dxg / dxa)
+            dyt = torch.log(dyg / dya)
+            dzt = torch.log(dzg / dza)
+        else:
+            xt = (xg - xa)
+            yt = (yg - ya)
+            zt = (zg - za)
+            dxt = torch.log(dxg)
+            dyt = torch.log(dyg)
+            dzt = torch.log(dzg)
+
+        return torch.cat(
+            [xt, yt, zt, dxt, dyt, dzt,
+             torch.cos(rg),
+             torch.sin(rg), *cgs],
+            dim=-1)
+
+    def decode(self,
+               box_encodings: Tensor,
+               points: Tensor,
+               pred_labels_3d: Optional[Tensor] = None) -> Tensor:
+        """Decode predicted parts and points to bbox3d.
+
+        Args:
+            box_encodings (torch.Tensor): Encoded boxes with shape (N, 8 + C).
+            points (torch.Tensor): Point cloud with shape (N, 3).
+            pred_labels_3d (torch.Tensor): Bbox predicted labels (N, M).
+
+        Returns:
+            torch.Tensor: Decoded boxes with shape (N, 7 + C)
+        """
+        xt, yt, zt, dxt, dyt, dzt, cost, sint, *cts = torch.split(
+            box_encodings, 1, dim=-1)
+        xa, ya, za = torch.split(points, 1, dim=-1)
+
+        if self.use_mean_size:
+            assert pred_labels_3d.max() <= self.mean_size.shape[0] - 1, \
+                f'The max pred label {pred_labels_3d.max()} is bigger than' \
+                f'anchor types {self.mean_size.shape[0] - 1}.'
+            self.mean_size = self.mean_size.to(pred_labels_3d.device)
+            point_anchor_size = self.mean_size[pred_labels_3d]
+            dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1)
+            diagonal = torch.sqrt(dxa**2 + dya**2)
+            xg = xt * diagonal + xa
+            yg = yt * diagonal + ya
+            zg = zt * dza + za
+
+            dxg = torch.exp(dxt) * dxa
+            dyg = torch.exp(dyt) * dya
+            dzg = torch.exp(dzt) * dza
+        else:
+            xg = xt + xa
+            yg = yt + ya
+            zg = zt + za
+            dxg, dyg, dzg = torch.split(
+                torch.exp(box_encodings[..., 3:6]), 1, dim=-1)
+
+        rg = torch.atan2(sint, cost)
+
+        return torch.cat([xg, yg, zg, dxg, dyg, dzg, rg, *cts], dim=-1)
diff --git a/mmde/mmdet3d/models/task_modules/coders/smoke_bbox_coder.py b/mmde/mmdet3d/models/task_modules/coders/smoke_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d98f309f8ba3d58d19a88091bdba8b88396b821
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/coders/smoke_bbox_coder.py
@@ -0,0 +1,217 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmdet.models.task_modules import BaseBBoxCoder
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures import CameraInstance3DBoxes
+
+
+@TASK_UTILS.register_module()
+class SMOKECoder(BaseBBoxCoder):
+    """Bbox Coder for SMOKE.
+
+    Args:
+        base_depth (tuple[float]): Depth references for decode box depth.
+        base_dims (tuple[tuple[float]]): Dimension references [l, h, w]
+            for decode box dimension for each category.
+        code_size (int): The dimension of boxes to be encoded.
+    """
+
+    def __init__(self, base_depth: Tuple[float], base_dims: Tuple[float],
+                 code_size: int):
+        super(SMOKECoder, self).__init__()
+        self.base_depth = base_depth
+        self.base_dims = base_dims
+        self.bbox_code_size = code_size
+
+    def encode(self, locations: Optional[Tensor], dimensions: Tensor,
+               orientations: Tensor,
+               input_metas: List[dict]) -> CameraInstance3DBoxes:
+        """Encode CameraInstance3DBoxes by locations, dimensions, orientations.
+
+        Args:
+            locations (Tensor): Center location for 3D boxes.
+                (N, 3)
+            dimensions (Tensor): Dimensions for 3D boxes.
+                shape (N, 3)
+            orientations (Tensor): Orientations for 3D boxes.
+                shape (N, 1)
+            input_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+
+        Return:
+            :obj:`CameraInstance3DBoxes`: 3D bboxes of batch images,
+                shape (N, bbox_code_size).
+        """
+
+        bboxes = torch.cat((locations, dimensions, orientations), dim=1)
+        assert bboxes.shape[1] == self.bbox_code_size, 'bboxes shape dose not'\
+            'match the bbox_code_size.'
+        batch_bboxes = input_metas[0]['box_type_3d'](
+            bboxes, box_dim=self.bbox_code_size)
+
+        return batch_bboxes
+
+    def decode(self,
+               reg: Tensor,
+               points: Tensor,
+               labels: Tensor,
+               cam2imgs: Tensor,
+               trans_mats: Tensor,
+               locations: Optional[Tensor] = None) -> Tuple[Tensor]:
+        """Decode regression into locations, dimensions, orientations.
+
+        Args:
+            reg (Tensor): Batch regression for each predict center2d point.
+                shape: (batch * K (max_objs), C)
+            points(Tensor): Batch projected bbox centers on image plane.
+                shape: (batch * K (max_objs) , 2)
+            labels (Tensor): Batch predict class label for each predict
+                center2d point.
+                shape: (batch, K (max_objs))
+            cam2imgs (Tensor): Batch images' camera intrinsic matrix.
+                shape: kitti (batch, 4, 4)  nuscenes (batch, 3, 3)
+            trans_mats (Tensor): transformation matrix from original image
+                to feature map.
+                shape: (batch, 3, 3)
+            locations (None | Tensor): if locations is None, this function
+                is used to decode while inference, otherwise, it's used while
+                training using the ground truth 3d bbox locations.
+                shape: (batch * K (max_objs), 3)
+
+        Return:
+            tuple(Tensor): The tuple has components below:
+                - locations (Tensor): Centers of 3D boxes.
+                    shape: (batch * K (max_objs), 3)
+                - dimensions (Tensor): Dimensions of 3D boxes.
+                    shape: (batch * K (max_objs), 3)
+                - orientations (Tensor): Orientations of 3D
+                    boxes.
+                    shape: (batch * K (max_objs), 1)
+        """
+        depth_offsets = reg[:, 0]
+        centers2d_offsets = reg[:, 1:3]
+        dimensions_offsets = reg[:, 3:6]
+        orientations = reg[:, 6:8]
+        depths = self._decode_depth(depth_offsets)
+        # get the 3D Bounding box's center location.
+        pred_locations = self._decode_location(points, centers2d_offsets,
+                                               depths, cam2imgs, trans_mats)
+        pred_dimensions = self._decode_dimension(labels, dimensions_offsets)
+        if locations is None:
+            pred_orientations = self._decode_orientation(
+                orientations, pred_locations)
+        else:
+            pred_orientations = self._decode_orientation(
+                orientations, locations)
+
+        return pred_locations, pred_dimensions, pred_orientations
+
+    def _decode_depth(self, depth_offsets: Tensor) -> Tensor:
+        """Transform depth offset to depth."""
+        base_depth = depth_offsets.new_tensor(self.base_depth)
+        depths = depth_offsets * base_depth[1] + base_depth[0]
+
+        return depths
+
+    def _decode_location(self, points: Tensor, centers2d_offsets: Tensor,
+                         depths: Tensor, cam2imgs: Tensor,
+                         trans_mats: Tensor) -> Tensor:
+        """Retrieve objects location in camera coordinate based on projected
+        points.
+
+        Args:
+            points (Tensor): Projected points on feature map in (x, y)
+                shape: (batch * K, 2)
+            centers2d_offset (Tensor): Project points offset in
+                (delta_x, delta_y). shape: (batch * K, 2)
+            depths (Tensor): Object depth z.
+                shape: (batch * K)
+            cam2imgs (Tensor): Batch camera intrinsics matrix.
+                shape: kitti (batch, 4, 4)  nuscenes (batch, 3, 3)
+            trans_mats (Tensor): transformation matrix from original image
+                to feature map.
+                shape: (batch, 3, 3)
+        """
+        # number of points
+        N = centers2d_offsets.shape[0]
+        # batch_size
+        N_batch = cam2imgs.shape[0]
+        batch_id = torch.arange(N_batch).unsqueeze(1)
+        obj_id = batch_id.repeat(1, N // N_batch).flatten()
+        trans_mats_inv = trans_mats.inverse()[obj_id]
+        cam2imgs_inv = cam2imgs.inverse()[obj_id]
+        centers2d = points + centers2d_offsets
+        centers2d_extend = torch.cat((centers2d, centers2d.new_ones(N, 1)),
+                                     dim=1)
+        # expand project points as [N, 3, 1]
+        centers2d_extend = centers2d_extend.unsqueeze(-1)
+        # transform project points back on original image
+        centers2d_img = torch.matmul(trans_mats_inv, centers2d_extend)
+        centers2d_img = centers2d_img * depths.view(N, -1, 1)
+        if cam2imgs.shape[1] == 4:
+            centers2d_img = torch.cat(
+                (centers2d_img, centers2d.new_ones(N, 1, 1)), dim=1)
+        locations = torch.matmul(cam2imgs_inv, centers2d_img).squeeze(2)
+
+        return locations[:, :3]
+
+    def _decode_dimension(self, labels: Tensor, dims_offset: Tensor) -> Tensor:
+        """Transform dimension offsets to dimension according to its category.
+
+        Args:
+            labels (Tensor): Each points' category id.
+                shape: (N, K)
+            dims_offset (Tensor): Dimension offsets.
+                shape: (N, 3)
+        """
+        labels = labels.flatten().long()
+        base_dims = dims_offset.new_tensor(self.base_dims)
+        dims_select = base_dims[labels, :]
+        dimensions = dims_offset.exp() * dims_select
+
+        return dimensions
+
+    def _decode_orientation(self, ori_vector: Tensor,
+                            locations: Optional[Tensor]) -> Tensor:
+        """Retrieve object orientation.
+
+        Args:
+            ori_vector (Tensor): Local orientation in [sin, cos] format.
+                shape: (N, 2)
+            locations (Tensor): Object location.
+                shape: (N, 3)
+
+        Return:
+            Tensor: yaw(Orientation). Notice that the yaw's
+                range is [-np.pi, np.pi].
+                shape：(N, 1）
+        """
+        assert len(ori_vector) == len(locations)
+        locations = locations.view(-1, 3)
+        rays = torch.atan(locations[:, 0] / (locations[:, 2] + 1e-7))
+        alphas = torch.atan(ori_vector[:, 0] / (ori_vector[:, 1] + 1e-7))
+
+        # get cosine value positive and negative index.
+        cos_pos_inds = (ori_vector[:, 1] >= 0).nonzero(as_tuple=False)
+        cos_neg_inds = (ori_vector[:, 1] < 0).nonzero(as_tuple=False)
+
+        alphas[cos_pos_inds] -= np.pi / 2
+        alphas[cos_neg_inds] += np.pi / 2
+        # retrieve object rotation y angle.
+        yaws = alphas + rays
+
+        larger_inds = (yaws > np.pi).nonzero(as_tuple=False)
+        small_inds = (yaws < -np.pi).nonzero(as_tuple=False)
+
+        if len(larger_inds) != 0:
+            yaws[larger_inds] -= 2 * np.pi
+        if len(small_inds) != 0:
+            yaws[small_inds] += 2 * np.pi
+
+        yaws = yaws.unsqueeze(-1)
+        return yaws
diff --git a/mmde/mmdet3d/models/task_modules/samplers/__init__.py b/mmde/mmdet3d/models/task_modules/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e7be46a36f9730507c4c310d9e827f10e6ede8e
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/samplers/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models.task_modules.samplers import (BaseSampler, CombinedSampler,
+                                                InstanceBalancedPosSampler,
+                                                IoUBalancedNegSampler,
+                                                OHEMSampler, RandomSampler,
+                                                SamplingResult)
+
+from .iou_neg_piecewise_sampler import IoUNegPiecewiseSampler
+from .pseudosample import PseudoSampler
+
+__all__ = [
+    'BaseSampler', 'PseudoSampler', 'RandomSampler',
+    'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
+    'OHEMSampler', 'SamplingResult', 'IoUNegPiecewiseSampler'
+]
diff --git a/mmde/mmdet3d/models/task_modules/samplers/iou_neg_piecewise_sampler.py b/mmde/mmdet3d/models/task_modules/samplers/iou_neg_piecewise_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa98f81086d19938b8f7049adc52657f0993743b
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/samplers/iou_neg_piecewise_sampler.py
@@ -0,0 +1,193 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Optional, Union
+
+import torch
+from mmdet.models.task_modules import AssignResult
+from numpy import ndarray
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+from . import RandomSampler, SamplingResult
+
+
+@TASK_UTILS.register_module()
+class IoUNegPiecewiseSampler(RandomSampler):
+    """IoU Piece-wise Sampling.
+
+    Sampling negative proposals according to a list of IoU thresholds.
+    The negative proposals are divided into several pieces according
+    to `neg_iou_piece_thrs`. And the ratio of each piece is indicated
+    by `neg_piece_fractions`.
+
+    Args:
+        num (int): Number of proposals.
+        pos_fraction (float): The fraction of positive proposals.
+        neg_piece_fractions (list): A list contains fractions that indicates
+            the ratio of each piece of total negative samplers.
+        neg_iou_piece_thrs (list): A list contains IoU thresholds that
+            indicate the upper bound of this piece.
+        neg_pos_ub (float): The total ratio to limit the upper bound
+            number of negative samples.
+        add_gt_as_proposals (bool): Whether to add gt as proposals.
+    """
+
+    def __init__(self,
+                 num: int,
+                 pos_fraction: Optional[float] = None,
+                 neg_piece_fractions: Optional[list] = None,
+                 neg_iou_piece_thrs: Optional[list] = None,
+                 neg_pos_ub: float = -1,
+                 add_gt_as_proposals: bool = False,
+                 return_iou: bool = False) -> None:
+        super(IoUNegPiecewiseSampler,
+              self).__init__(num, pos_fraction, neg_pos_ub,
+                             add_gt_as_proposals)
+        assert isinstance(neg_piece_fractions, list)
+        assert len(neg_piece_fractions) == len(neg_iou_piece_thrs)
+        self.neg_piece_fractions = neg_piece_fractions
+        self.neg_iou_thr = neg_iou_piece_thrs
+        self.return_iou = return_iou
+        self.neg_piece_num = len(self.neg_piece_fractions)
+
+    def _sample_pos(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs) -> Union[Tensor, ndarray]:
+        """Randomly sample some positive samples."""
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            return self.random_choice(pos_inds, num_expected)
+
+    def _sample_neg(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs) -> Tensor:
+        """Randomly sample some negative samples."""
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
+        if neg_inds.numel() != 0:
+            neg_inds = neg_inds.squeeze(1)
+        if len(neg_inds) <= 0:
+            return neg_inds.squeeze(1)
+        else:
+            neg_inds_choice = neg_inds.new_zeros([0])
+            extend_num = 0
+            max_overlaps = assign_result.max_overlaps[neg_inds]
+
+            for piece_inds in range(self.neg_piece_num):
+                if piece_inds == self.neg_piece_num - 1:  # for the last piece
+                    piece_expected_num = num_expected - len(neg_inds_choice)
+                    min_iou_thr = 0
+                else:
+                    # if the numbers of negative samplers in previous
+                    # pieces are less than the expected number, extend
+                    # the same number in the current piece.
+                    piece_expected_num = min(
+                        num_expected,
+                        math.ceil(num_expected *
+                                  self.neg_piece_fractions[piece_inds]) +
+                        extend_num)
+                    min_iou_thr = self.neg_iou_thr[piece_inds + 1]
+                max_iou_thr = self.neg_iou_thr[piece_inds]
+                piece_neg_inds = torch.nonzero(
+                    (max_overlaps >= min_iou_thr)
+                    & (max_overlaps < max_iou_thr),
+                    as_tuple=False).view(-1)
+
+                if len(piece_neg_inds) < piece_expected_num:
+                    neg_inds_choice = torch.cat(
+                        [neg_inds_choice, neg_inds[piece_neg_inds]], dim=0)
+                    extend_num += piece_expected_num - len(piece_neg_inds)
+
+                    # for the last piece
+                    if piece_inds == self.neg_piece_num - 1:
+                        extend_neg_num = num_expected - len(neg_inds_choice)
+                        # if the numbers of nagetive samples > 0, we will
+                        # randomly select num_expected samples in last piece
+                        if piece_neg_inds.numel() > 0:
+                            rand_idx = torch.randint(
+                                low=0,
+                                high=piece_neg_inds.numel(),
+                                size=(extend_neg_num, )).long()
+                            neg_inds_choice = torch.cat(
+                                [neg_inds_choice, piece_neg_inds[rand_idx]],
+                                dim=0)
+                        # if the numbers of nagetive samples == 0, we will
+                        # randomly select num_expected samples in all
+                        # previous pieces
+                        else:
+                            rand_idx = torch.randint(
+                                low=0,
+                                high=neg_inds_choice.numel(),
+                                size=(extend_neg_num, )).long()
+                            neg_inds_choice = torch.cat(
+                                [neg_inds_choice, neg_inds_choice[rand_idx]],
+                                dim=0)
+                else:
+                    piece_choice = self.random_choice(piece_neg_inds,
+                                                      piece_expected_num)
+                    neg_inds_choice = torch.cat(
+                        [neg_inds_choice, neg_inds[piece_choice]], dim=0)
+                    extend_num = 0
+            assert len(neg_inds_choice) == num_expected
+            return neg_inds_choice
+
+    def sample(self,
+               assign_result: AssignResult,
+               bboxes: Tensor,
+               gt_bboxes: Tensor,
+               gt_labels: Optional[Tensor] = None,
+               **kwargs) -> SamplingResult:
+        """Sample positive and negative bboxes.
+
+        This is a simple implementation of bbox sampling given candidates,
+        assigning results and ground truth bboxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            bboxes (torch.Tensor): Boxes to be sampled from.
+            gt_bboxes (torch.Tensor): Ground truth bboxes.
+            gt_labels (torch.Tensor, optional): Class labels of ground truth
+                bboxes.
+
+        Returns:
+            :obj:`SamplingResult`: Sampling result.
+        """
+        if len(bboxes.shape) < 2:
+            bboxes = bboxes[None, :]
+
+        gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.bool)
+        if self.add_gt_as_proposals and len(gt_bboxes) > 0:
+            if gt_labels is None:
+                raise ValueError(
+                    'gt_labels must be given when add_gt_as_proposals is True')
+            bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
+            assign_result.add_gt_(gt_labels)
+            gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.bool)
+            gt_flags = torch.cat([gt_ones, gt_flags])
+
+        num_expected_pos = int(self.num * self.pos_fraction)
+        pos_inds = self.pos_sampler._sample_pos(
+            assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
+        # We found that sampled indices have duplicated items occasionally.
+        # (may be a bug of PyTorch)
+        pos_inds = pos_inds.unique()
+        num_sampled_pos = pos_inds.numel()
+        num_expected_neg = self.num - num_sampled_pos
+        if self.neg_pos_ub >= 0:
+            _pos = max(1, num_sampled_pos)
+            neg_upper_bound = int(self.neg_pos_ub * _pos)
+            if num_expected_neg > neg_upper_bound:
+                num_expected_neg = neg_upper_bound
+        neg_inds = self.neg_sampler._sample_neg(
+            assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
+
+        sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+                                         assign_result, gt_flags)
+        if self.return_iou:
+            # PartA2 needs iou score to regression.
+            sampling_result.iou = assign_result.max_overlaps[torch.cat(
+                [pos_inds, neg_inds])]
+            sampling_result.iou.detach_()
+
+        return sampling_result
diff --git a/mmde/mmdet3d/models/task_modules/samplers/pseudosample.py b/mmde/mmdet3d/models/task_modules/samplers/pseudosample.py
new file mode 100644
index 0000000000000000000000000000000000000000..4da0ff2a4da28c0dc47504d74d30c42aabd958f4
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/samplers/pseudosample.py
@@ -0,0 +1,61 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmdet.models.task_modules import AssignResult
+from mmengine.structures import InstanceData
+
+from mmdet3d.registry import TASK_UTILS
+from ..samplers import BaseSampler, SamplingResult
+
+
+@TASK_UTILS.register_module()
+class PseudoSampler(BaseSampler):
+    """A pseudo sampler that does not do sampling actually."""
+
+    # TODO: This is a temporary pseudo sampler.
+
+    def __init__(self, **kwargs):
+        pass
+
+    def _sample_pos(self, **kwargs):
+        """Sample positive samples."""
+        raise NotImplementedError
+
+    def _sample_neg(self, **kwargs):
+        """Sample negative samples."""
+        raise NotImplementedError
+
+    def sample(self, assign_result: AssignResult, pred_instances: InstanceData,
+               gt_instances: InstanceData, *args, **kwargs) -> SamplingResult:
+        """Directly returns the positive and negative indices  of samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            pred_instances (:obj:`InstaceData`): Instances of model
+                predictions. It includes ``priors``, and the priors can
+                be anchors, points, or bboxes predicted by the model,
+                shape(n, 4).
+            gt_instances (:obj:`InstaceData`): Ground truth of instance
+                annotations. It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            :obj:`SamplingResult`: sampler results
+        """
+        gt_bboxes = gt_instances.bboxes_3d
+        priors = pred_instances.priors
+
+        pos_inds = torch.nonzero(
+            assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+        neg_inds = torch.nonzero(
+            assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+
+        gt_flags = priors.new_zeros(priors.shape[0], dtype=torch.uint8)
+        sampling_result = SamplingResult(
+            pos_inds=pos_inds,
+            neg_inds=neg_inds,
+            priors=priors,
+            gt_bboxes=gt_bboxes,
+            assign_result=assign_result,
+            gt_flags=gt_flags,
+            avg_factor_with_neg=False)
+        return sampling_result
diff --git a/mmde/mmdet3d/models/task_modules/voxel/__init__.py b/mmde/mmdet3d/models/task_modules/voxel/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..273dc5b9ed45c0a8d72e7994d468c5b9c70be928
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/voxel/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .voxel_generator import VoxelGenerator
+
+__all__ = ['VoxelGenerator']
diff --git a/mmde/mmdet3d/models/task_modules/voxel/voxel_generator.py b/mmde/mmdet3d/models/task_modules/voxel/voxel_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..92b9f07dd8c8fc34fb6aa8d95624f84dbddf8864
--- /dev/null
+++ b/mmde/mmdet3d/models/task_modules/voxel/voxel_generator.py
@@ -0,0 +1,289 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import numba
+import numpy as np
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class VoxelGenerator(object):
+    """Voxel generator in numpy implementation.
+
+    Args:
+        voxel_size (list[float]): Size of a single voxel
+        point_cloud_range (list[float]): Range of points
+        max_num_points (int): Maximum number of points in a single voxel
+        max_voxels (int, optional): Maximum number of voxels.
+            Defaults to 20000.
+    """
+
+    def __init__(self,
+                 voxel_size: List[float],
+                 point_cloud_range: List[float],
+                 max_num_points: int,
+                 max_voxels: int = 20000):
+
+        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
+        # [0, -40, -3, 70.4, 40, 1]
+        voxel_size = np.array(voxel_size, dtype=np.float32)
+        grid_size = (point_cloud_range[3:] -
+                     point_cloud_range[:3]) / voxel_size
+        grid_size = np.round(grid_size).astype(np.int64)
+
+        self._voxel_size = voxel_size
+        self._point_cloud_range = point_cloud_range
+        self._max_num_points = max_num_points
+        self._max_voxels = max_voxels
+        self._grid_size = grid_size
+
+    def generate(self, points: np.ndarray) -> Tuple[np.ndarray]:
+        """Generate voxels given points."""
+        return points_to_voxel(points, self._voxel_size,
+                               self._point_cloud_range, self._max_num_points,
+                               True, self._max_voxels)
+
+    @property
+    def voxel_size(self) -> List[float]:
+        """list[float]: Size of a single voxel."""
+        return self._voxel_size
+
+    @property
+    def max_num_points_per_voxel(self) -> int:
+        """int: Maximum number of points per voxel."""
+        return self._max_num_points
+
+    @property
+    def point_cloud_range(self) -> List[float]:
+        """list[float]: Range of point cloud."""
+        return self._point_cloud_range
+
+    @property
+    def grid_size(self) -> np.ndarray:
+        """np.ndarray: The size of grids."""
+        return self._grid_size
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        indent = ' ' * (len(repr_str) + 1)
+        repr_str += f'(voxel_size={self._voxel_size},\n'
+        repr_str += indent + 'point_cloud_range='
+        repr_str += f'{self._point_cloud_range.tolist()},\n'
+        repr_str += indent + f'max_num_points={self._max_num_points},\n'
+        repr_str += indent + f'max_voxels={self._max_voxels},\n'
+        repr_str += indent + f'grid_size={self._grid_size.tolist()}'
+        repr_str += ')'
+        return repr_str
+
+
+def points_to_voxel(points: np.ndarray,
+                    voxel_size: Union[list, tuple, np.ndarray],
+                    coors_range: Union[List[float], List[Tuple[float]],
+                                       List[np.ndarray]],
+                    max_points: int = 35,
+                    reverse_index: bool = True,
+                    max_voxels: int = 20000) -> Tuple[np.ndarray]:
+    """convert kitti points(N, >=3) to voxels.
+
+    Args:
+        points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and
+            points[:, 3:] contain other information such as reflectivity.
+        voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size
+        coors_range (list[float | tuple[float] | ndarray]): Voxel range.
+            format: xyzxyz, minmax
+        max_points (int): Indicate maximum points contained in a voxel.
+        reverse_index (bool): Whether return reversed coordinates.
+            if points has xyz format and reverse_index is True, output
+            coordinates will be zyx format, but points in features always
+            xyz format.
+        max_voxels (int): Maximum number of voxels this function creates.
+            For second, 20000 is a good choice. Points should be shuffled for
+            randomness before this function because max_voxels drops points.
+
+    Returns:
+        tuple[np.ndarray]:
+            voxels: [M, max_points, ndim] float tensor. only contain points.
+            coordinates: [M, 3] int32 tensor.
+            num_points_per_voxel: [M] int32 tensor.
+    """
+    if not isinstance(voxel_size, np.ndarray):
+        voxel_size = np.array(voxel_size, dtype=points.dtype)
+    if not isinstance(coors_range, np.ndarray):
+        coors_range = np.array(coors_range, dtype=points.dtype)
+    voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size
+    voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist())
+    if reverse_index:
+        voxelmap_shape = voxelmap_shape[::-1]
+    # don't create large array in jit(nopython=True) code.
+    num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
+    coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32)
+    voxels = np.zeros(
+        shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype)
+    coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
+    if reverse_index:
+        voxel_num = _points_to_voxel_reverse_kernel(
+            points, voxel_size, coors_range, num_points_per_voxel,
+            coor_to_voxelidx, voxels, coors, max_points, max_voxels)
+
+    else:
+        voxel_num = _points_to_voxel_kernel(points, voxel_size, coors_range,
+                                            num_points_per_voxel,
+                                            coor_to_voxelidx, voxels, coors,
+                                            max_points, max_voxels)
+
+    coors = coors[:voxel_num]
+    voxels = voxels[:voxel_num]
+    num_points_per_voxel = num_points_per_voxel[:voxel_num]
+
+    return voxels, coors, num_points_per_voxel
+
+
+@numba.jit(nopython=True)
+def _points_to_voxel_reverse_kernel(points: np.ndarray,
+                                    voxel_size: Union[list, tuple, np.ndarray],
+                                    coors_range: Union[List[float],
+                                                       List[Tuple[float]],
+                                                       List[np.ndarray]],
+                                    num_points_per_voxel: int,
+                                    coor_to_voxelidx: np.ndarray,
+                                    voxels: np.ndarray,
+                                    coors: np.ndarray,
+                                    max_points: int = 35,
+                                    max_voxels: int = 20000):
+    """convert kitti points(N, >=3) to voxels.
+
+    Args:
+        points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and
+            points[:, 3:] contain other information such as reflectivity.
+        voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size
+        coors_range (list[float | tuple[float] | ndarray]): Range of voxels.
+            format: xyzxyz, minmax
+        num_points_per_voxel (int): Number of points per voxel.
+        coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W),
+            which has the same shape as the complete voxel map. It indicates
+            the index of each corresponding voxel.
+        voxels (np.ndarray): Created empty voxels.
+        coors (np.ndarray): Created coordinates of each voxel.
+        max_points (int): Indicate maximum points contained in a voxel.
+        max_voxels (int): Maximum number of voxels this function create.
+            for second, 20000 is a good choice. Points should be shuffled for
+            randomness before this function because max_voxels drops points.
+
+    Returns:
+        tuple[np.ndarray]:
+            voxels: Shape [M, max_points, ndim], only contain points.
+            coordinates: Shape [M, 3].
+            num_points_per_voxel: Shape [M].
+    """
+    # put all computations to one loop.
+    # we shouldn't create large array in main jit code, otherwise
+    # reduce performance
+    N = points.shape[0]
+    # ndim = points.shape[1] - 1
+    ndim = 3
+    ndim_minus_1 = ndim - 1
+    grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size
+    # np.round(grid_size)
+    # grid_size = np.round(grid_size).astype(np.int64)(np.int32)
+    grid_size = np.round(grid_size, 0, grid_size).astype(np.int32)
+    coor = np.zeros(shape=(3, ), dtype=np.int32)
+    voxel_num = 0
+    failed = False
+    for i in range(N):
+        failed = False
+        for j in range(ndim):
+            c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j])
+            if c < 0 or c >= grid_size[j]:
+                failed = True
+                break
+            coor[ndim_minus_1 - j] = c
+        if failed:
+            continue
+        voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]]
+        if voxelidx == -1:
+            voxelidx = voxel_num
+            if voxel_num >= max_voxels:
+                continue
+            voxel_num += 1
+            coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx
+            coors[voxelidx] = coor
+        num = num_points_per_voxel[voxelidx]
+        if num < max_points:
+            voxels[voxelidx, num] = points[i]
+            num_points_per_voxel[voxelidx] += 1
+    return voxel_num
+
+
+@numba.jit(nopython=True)
+def _points_to_voxel_kernel(points: np.ndarray,
+                            voxel_size: Union[list, tuple, np.ndarray],
+                            coors_range: Union[List[float], List[Tuple[float]],
+                                               List[np.ndarray]],
+                            num_points_per_voxel: int,
+                            coor_to_voxelidx: np.ndarray,
+                            voxels: np.ndarray,
+                            coors: np.ndarray,
+                            max_points: int = 35,
+                            max_voxels: int = 200000):
+    """convert kitti points(N, >=3) to voxels.
+
+    Args:
+        points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and
+            points[:, 3:] contain other information such as reflectivity.
+        voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size.
+        coors_range (list[float | tuple[float] | ndarray]): Range of voxels.
+            format: xyzxyz, minmax
+        num_points_per_voxel (int): Number of points per voxel.
+        coor_to_voxelidx (np.ndarray): A voxel grid of shape (D, H, W),
+            which has the same shape as the complete voxel map. It indicates
+            the index of each corresponding voxel.
+        voxels (np.ndarray): Created empty voxels.
+        coors (np.ndarray): Created coordinates of each voxel.
+        max_points (int): Indicate maximum points contained in a voxel.
+        max_voxels (int): Maximum number of voxels this function create.
+            for second, 20000 is a good choice. Points should be shuffled for
+            randomness before this function because max_voxels drops points.
+
+    Returns:
+        tuple[np.ndarray]:
+            voxels: Shape [M, max_points, ndim], only contain points.
+            coordinates: Shape [M, 3].
+            num_points_per_voxel: Shape [M].
+    """
+    N = points.shape[0]
+    # ndim = points.shape[1] - 1
+    ndim = 3
+    grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size
+    # grid_size = np.round(grid_size).astype(np.int64)(np.int32)
+    grid_size = np.round(grid_size, 0, grid_size).astype(np.int32)
+
+    # lower_bound = coors_range[:3]
+    # upper_bound = coors_range[3:]
+    coor = np.zeros(shape=(3, ), dtype=np.int32)
+    voxel_num = 0
+    failed = False
+    for i in range(N):
+        failed = False
+        for j in range(ndim):
+            c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j])
+            if c < 0 or c >= grid_size[j]:
+                failed = True
+                break
+            coor[j] = c
+        if failed:
+            continue
+        voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]]
+        if voxelidx == -1:
+            voxelidx = voxel_num
+            if voxel_num >= max_voxels:
+                continue
+            voxel_num += 1
+            coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx
+            coors[voxelidx] = coor
+        num = num_points_per_voxel[voxelidx]
+        if num < max_points:
+            voxels[voxelidx, num] = points[i]
+            num_points_per_voxel[voxelidx] += 1
+    return voxel_num
diff --git a/mmde/mmdet3d/models/test_time_augs/__init__.py b/mmde/mmdet3d/models/test_time_augs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..288f4d36152c5585b92f201af06055c386af4902
--- /dev/null
+++ b/mmde/mmdet3d/models/test_time_augs/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .merge_augs import merge_aug_bboxes_3d
+
+__all__ = ['merge_aug_bboxes_3d']
diff --git a/mmde/mmdet3d/models/test_time_augs/merge_augs.py b/mmde/mmdet3d/models/test_time_augs/merge_augs.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78f32688026571286b3349f9fd9d68535aad436
--- /dev/null
+++ b/mmde/mmdet3d/models/test_time_augs/merge_augs.py
@@ -0,0 +1,98 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+
+from mmdet3d.structures import bbox3d2result, bbox3d_mapping_back, xywhr2xyxyr
+from mmdet3d.utils import ConfigType
+from ..layers import nms_bev, nms_normal_bev
+
+
+def merge_aug_bboxes_3d(aug_results: List[dict],
+                        aug_batch_input_metas: List[dict],
+                        test_cfg: ConfigType) -> dict:
+    """Merge augmented detection 3D bboxes and scores.
+
+    Args:
+        aug_results (List[dict]): The dict of detection results.
+            The dict contains the following keys
+
+            - bbox_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+            - scores_3d (Tensor): Detection scores.
+            - labels_3d (Tensor): Predicted box labels.
+        aug_batch_input_metas (List[dict]): Meta information of each sample.
+        test_cfg (dict or :obj:`ConfigDict`): Test config.
+
+    Returns:
+        dict: Bounding boxes results in cpu mode, containing merged results.
+
+            - bbox_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox.
+            - scores_3d (torch.Tensor): Merged detection scores.
+            - labels_3d (torch.Tensor): Merged predicted box labels.
+    """
+
+    assert len(aug_results) == len(aug_batch_input_metas), \
+        '"aug_results" should have the same length as ' \
+        f'"aug_batch_input_metas", got len(aug_results)={len(aug_results)} ' \
+        f'and len(aug_batch_input_metas)={len(aug_batch_input_metas)}'
+
+    recovered_bboxes = []
+    recovered_scores = []
+    recovered_labels = []
+
+    for bboxes, input_info in zip(aug_results, aug_batch_input_metas):
+        scale_factor = input_info['pcd_scale_factor']
+        pcd_horizontal_flip = input_info['pcd_horizontal_flip']
+        pcd_vertical_flip = input_info['pcd_vertical_flip']
+        recovered_scores.append(bboxes['scores_3d'])
+        recovered_labels.append(bboxes['labels_3d'])
+        bboxes = bbox3d_mapping_back(bboxes['bbox_3d'], scale_factor,
+                                     pcd_horizontal_flip, pcd_vertical_flip)
+        recovered_bboxes.append(bboxes)
+
+    aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes)
+    aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev)
+    aug_scores = torch.cat(recovered_scores, dim=0)
+    aug_labels = torch.cat(recovered_labels, dim=0)
+
+    # TODO: use a more elegent way to deal with nms
+    if test_cfg.get('use_rotate_nms', False):
+        nms_func = nms_bev
+    else:
+        nms_func = nms_normal_bev
+
+    merged_bboxes = []
+    merged_scores = []
+    merged_labels = []
+
+    # Apply multi-class nms when merge bboxes
+    if len(aug_labels) == 0:
+        return bbox3d2result(aug_bboxes, aug_scores, aug_labels)
+
+    for class_id in range(torch.max(aug_labels).item() + 1):
+        class_inds = (aug_labels == class_id)
+        bboxes_i = aug_bboxes[class_inds]
+        bboxes_nms_i = aug_bboxes_for_nms[class_inds, :]
+        scores_i = aug_scores[class_inds]
+        labels_i = aug_labels[class_inds]
+        if len(bboxes_nms_i) == 0:
+            continue
+        selected = nms_func(bboxes_nms_i, scores_i, test_cfg.nms_thr)
+
+        merged_bboxes.append(bboxes_i[selected, :])
+        merged_scores.append(scores_i[selected])
+        merged_labels.append(labels_i[selected])
+
+    merged_bboxes = merged_bboxes[0].cat(merged_bboxes)
+    merged_scores = torch.cat(merged_scores, dim=0)
+    merged_labels = torch.cat(merged_labels, dim=0)
+
+    _, order = merged_scores.sort(0, descending=True)
+    num = min(test_cfg.get('max_num', 500), len(aug_bboxes))
+    order = order[:num]
+
+    merged_bboxes = merged_bboxes[order]
+    merged_scores = merged_scores[order]
+    merged_labels = merged_labels[order]
+
+    return bbox3d2result(merged_bboxes, merged_scores, merged_labels)
diff --git a/mmde/mmdet3d/models/utils/__init__.py b/mmde/mmdet3d/models/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..98e0fbda4c66fa1367b7c0b466b69d605d88521e
--- /dev/null
+++ b/mmde/mmdet3d/models/utils/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .add_prefix import add_prefix
+from .clip_sigmoid import clip_sigmoid
+from .edge_indices import get_edge_indices
+from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d,
+                       gaussian_radius, get_ellip_gaussian_2D)
+from .gen_keypoints import get_keypoints
+from .handle_objs import filter_outside_objs, handle_proj_objs
+
+__all__ = [
+    'clip_sigmoid', 'get_edge_indices', 'filter_outside_objs',
+    'handle_proj_objs', 'get_keypoints', 'gaussian_2d',
+    'draw_heatmap_gaussian', 'gaussian_radius', 'get_ellip_gaussian_2D',
+    'ellip_gaussian2D', 'add_prefix'
+]
diff --git a/mmde/mmdet3d/models/utils/add_prefix.py b/mmde/mmdet3d/models/utils/add_prefix.py
new file mode 100644
index 0000000000000000000000000000000000000000..46a7b601429653918d64502290157bf2f66e82aa
--- /dev/null
+++ b/mmde/mmdet3d/models/utils/add_prefix.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+def add_prefix(inputs: dict, prefix: str) -> dict:
+    """Add prefix for dict.
+
+    Args:
+        inputs (dict): The input dict with str keys.
+        prefix (str): The prefix to add.
+
+    Returns:
+
+        dict: The dict with keys updated with ``prefix``.
+    """
+
+    outputs = dict()
+    for name, value in inputs.items():
+        outputs[f'{prefix}.{name}'] = value
+
+    return outputs
diff --git a/mmde/mmdet3d/models/utils/clip_sigmoid.py b/mmde/mmdet3d/models/utils/clip_sigmoid.py
new file mode 100644
index 0000000000000000000000000000000000000000..7be8301bda1c5ce110e71fa9e81cae680ba5a4db
--- /dev/null
+++ b/mmde/mmdet3d/models/utils/clip_sigmoid.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from torch import Tensor
+
+
+def clip_sigmoid(x: Tensor, eps: float = 1e-4) -> Tensor:
+    """Sigmoid function for input feature.
+
+    Args:
+        x (Tensor): Input feature map with the shape of [B, N, H, W].
+        eps (float): Lower bound of the range to be clamped to.
+            Defaults to 1e-4.
+
+    Returns:
+        Tensor: Feature map after sigmoid.
+    """
+    y = torch.clamp(x.sigmoid_(), min=eps, max=1 - eps)
+    return y
diff --git a/mmde/mmdet3d/models/utils/edge_indices.py b/mmde/mmdet3d/models/utils/edge_indices.py
new file mode 100644
index 0000000000000000000000000000000000000000..33190cddd4d9edcb50999b3e44e0e822e956b715
--- /dev/null
+++ b/mmde/mmdet3d/models/utils/edge_indices.py
@@ -0,0 +1,91 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import numpy as np
+import torch
+from torch import Tensor
+
+
+def get_edge_indices(img_metas: List[dict],
+                     downsample_ratio: int,
+                     step: int = 1,
+                     pad_mode: str = 'default',
+                     dtype: type = np.float32,
+                     device: str = 'cpu') -> List[Tensor]:
+    """Function to filter the objects label outside the image.
+    The edge_indices are generated using numpy on cpu rather
+    than on CUDA due to the latency issue. When batch size = 8,
+    this function with numpy array is ~8 times faster than that
+    with CUDA tensor (0.09s and 0.72s in 100 runs).
+
+    Args:
+        img_metas (List[dict]): Meta information of each image, e.g.,
+            image size, scaling factor, etc.
+        downsample_ratio (int): Downsample ratio of output feature,
+        step (int): Step size used for generateing
+            edge indices. Defaults to 1.
+        pad_mode (str): Padding mode during data pipeline.
+            Defaults to 'default'.
+        dtype (type): Dtype of edge indices tensor.
+            Defaults to np.float32.
+        device (str): Device of edge indices tensor.
+            Defaults to 'cpu'.
+
+    Returns:
+        List[Tensor]: Edge indices for each image in batch data.
+    """
+    edge_indices_list = []
+    for i in range(len(img_metas)):
+        img_shape = img_metas[i]['img_shape']
+        pad_shape = img_metas[i]['pad_shape']
+        h, w = img_shape[:2]
+        pad_h, pad_w = pad_shape
+        edge_indices = []
+
+        if pad_mode == 'default':
+            x_min = 0
+            y_min = 0
+            x_max = (w - 1) // downsample_ratio
+            y_max = (h - 1) // downsample_ratio
+        elif pad_mode == 'center':
+            x_min = np.ceil((pad_w - w) / 2 * downsample_ratio)
+            y_min = np.ceil((pad_h - h) / 2 * downsample_ratio)
+            x_max = x_min + w // downsample_ratio
+            y_max = y_min + h // downsample_ratio
+        else:
+            raise NotImplementedError
+
+        # left
+        y = np.arange(y_min, y_max, step, dtype=dtype)
+        x = np.ones(len(y)) * x_min
+
+        edge_indices_edge = np.stack((x, y), axis=1)
+        edge_indices.append(edge_indices_edge)
+
+        # bottom
+        x = np.arange(x_min, x_max, step, dtype=dtype)
+        y = np.ones(len(x)) * y_max
+
+        edge_indices_edge = np.stack((x, y), axis=1)
+        edge_indices.append(edge_indices_edge)
+
+        # right
+        y = np.arange(y_max, y_min, -step, dtype=dtype)
+        x = np.ones(len(y)) * x_max
+
+        edge_indices_edge = np.stack((x, y), axis=1)
+        edge_indices.append(edge_indices_edge)
+
+        # top
+        x = np.arange(x_max, x_min, -step, dtype=dtype)
+        y = np.ones(len(x)) * y_min
+
+        edge_indices_edge = np.stack((x, y), axis=1)
+        edge_indices.append(edge_indices_edge)
+
+        edge_indices = \
+            np.concatenate([index for index in edge_indices], axis=0)
+        edge_indices = torch.from_numpy(edge_indices).to(device).long()
+        edge_indices_list.append(edge_indices)
+
+    return edge_indices_list
diff --git a/mmde/mmdet3d/models/utils/gaussian.py b/mmde/mmdet3d/models/utils/gaussian.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d094dcc97e225d429e782a5bf62c197e34fa7b4
--- /dev/null
+++ b/mmde/mmdet3d/models/utils/gaussian.py
@@ -0,0 +1,169 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import numpy as np
+import torch
+from torch import Tensor
+
+
+def gaussian_2d(shape: Tuple[int, int], sigma: float = 1) -> np.ndarray:
+    """Generate gaussian map.
+
+    Args:
+        shape (Tuple[int]): Shape of the map.
+        sigma (float): Sigma to generate gaussian map.
+            Defaults to 1.
+
+    Returns:
+        np.ndarray: Generated gaussian map.
+    """
+    m, n = [(ss - 1.) / 2. for ss in shape]
+    y, x = np.ogrid[-m:m + 1, -n:n + 1]
+
+    h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
+    h[h < np.finfo(h.dtype).eps * h.max()] = 0
+    return h
+
+
+def draw_heatmap_gaussian(heatmap: Tensor,
+                          center: Tensor,
+                          radius: int,
+                          k: int = 1) -> Tensor:
+    """Get gaussian masked heatmap.
+
+    Args:
+        heatmap (Tensor): Heatmap to be masked.
+        center (Tensor): Center coord of the heatmap.
+        radius (int): Radius of gaussian.
+        k (int): Multiple of masked_gaussian. Defaults to 1.
+
+    Returns:
+        Tensor: Masked heatmap.
+    """
+    diameter = 2 * radius + 1
+    gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6)
+
+    x, y = int(center[0]), int(center[1])
+
+    height, width = heatmap.shape[0:2]
+
+    left, right = min(x, radius), min(width - x, radius + 1)
+    top, bottom = min(y, radius), min(height - y, radius + 1)
+
+    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+    masked_gaussian = torch.from_numpy(
+        gaussian[radius - top:radius + bottom,
+                 radius - left:radius + right]).to(heatmap.device,
+                                                   torch.float32)
+    if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
+        torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
+    return heatmap
+
+
+def gaussian_radius(det_size: Tuple[Tensor, Tensor],
+                    min_overlap: float = 0.5) -> Tensor:
+    """Get radius of gaussian.
+
+    Args:
+        det_size (Tuple[Tensor]): Size of the detection result.
+        min_overlap (float): Gaussian_overlap. Defaults to 0.5.
+
+    Returns:
+        Tensor: Computed radius.
+    """
+    height, width = det_size
+
+    a1 = 1
+    b1 = (height + width)
+    c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
+    sq1 = torch.sqrt(b1**2 - 4 * a1 * c1)
+    r1 = (b1 + sq1) / 2
+
+    a2 = 4
+    b2 = 2 * (height + width)
+    c2 = (1 - min_overlap) * width * height
+    sq2 = torch.sqrt(b2**2 - 4 * a2 * c2)
+    r2 = (b2 + sq2) / 2
+
+    a3 = 4 * min_overlap
+    b3 = -2 * min_overlap * (height + width)
+    c3 = (min_overlap - 1) * width * height
+    sq3 = torch.sqrt(b3**2 - 4 * a3 * c3)
+    r3 = (b3 + sq3) / 2
+    return min(r1, r2, r3)
+
+
+def get_ellip_gaussian_2D(heatmap: Tensor,
+                          center: List[int],
+                          radius_x: int,
+                          radius_y: int,
+                          k: int = 1) -> Tensor:
+    """Generate 2D ellipse gaussian heatmap.
+
+    Args:
+        heatmap (Tensor): Input heatmap, the gaussian kernel will cover on
+            it and maintain the max value.
+        center (List[int]): Coord of gaussian kernel's center.
+        radius_x (int): X-axis radius of gaussian kernel.
+        radius_y (int): Y-axis radius of gaussian kernel.
+        k (int): Coefficient of gaussian kernel. Defaults to 1.
+
+    Returns:
+        out_heatmap (Tensor): Updated heatmap covered by gaussian kernel.
+    """
+    diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1
+    gaussian_kernel = ellip_gaussian2D((radius_x, radius_y),
+                                       sigma_x=diameter_x // 6,
+                                       sigma_y=diameter_y // 6,
+                                       dtype=heatmap.dtype,
+                                       device=heatmap.device)
+
+    x, y = int(center[0]), int(center[1])
+    height, width = heatmap.shape[0:2]
+
+    left, right = min(x, radius_x), min(width - x, radius_x + 1)
+    top, bottom = min(y, radius_y), min(height - y, radius_y + 1)
+
+    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+    masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom,
+                                      radius_x - left:radius_x + right]
+    out_heatmap = heatmap
+    torch.max(
+        masked_heatmap,
+        masked_gaussian * k,
+        out=out_heatmap[y - top:y + bottom, x - left:x + right])
+
+    return out_heatmap
+
+
+def ellip_gaussian2D(radius: Tuple[int, int],
+                     sigma_x: int,
+                     sigma_y: int,
+                     dtype: torch.dtype = torch.float32,
+                     device: str = 'cpu') -> Tensor:
+    """Generate 2D ellipse gaussian kernel.
+
+    Args:
+        radius (Tuple[int]): Ellipse radius (radius_x, radius_y) of gaussian
+            kernel.
+        sigma_x (int): X-axis sigma of gaussian function.
+        sigma_y (int): Y-axis sigma of gaussian function.
+        dtype (torch.dtype): Dtype of gaussian tensor.
+            Defaults to torch.float32.
+        device (str): Device of gaussian tensor.
+            Defaults to 'cpu'.
+
+    Returns:
+        h (Tensor): Gaussian kernel with a
+            ``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape.
+    """
+    x = torch.arange(
+        -radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1)
+    y = torch.arange(
+        -radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1)
+
+    h = (-(x * x) / (2 * sigma_x * sigma_x) - (y * y) /
+         (2 * sigma_y * sigma_y)).exp()
+    h[h < torch.finfo(h.dtype).eps * h.max()] = 0
+
+    return h
diff --git a/mmde/mmdet3d/models/utils/gen_keypoints.py b/mmde/mmdet3d/models/utils/gen_keypoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..848b0f028ba2275c25d911d2c7a01752793f8c2c
--- /dev/null
+++ b/mmde/mmdet3d/models/utils/gen_keypoints.py
@@ -0,0 +1,84 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet3d.structures import CameraInstance3DBoxes, points_cam2img
+
+
+def get_keypoints(
+        gt_bboxes_3d_list: List[CameraInstance3DBoxes],
+        centers2d_list: List[Tensor],
+        img_metas: List[dict],
+        use_local_coords: bool = True) -> Tuple[List[Tensor], List[Tensor]]:
+    """Function to filter the objects label outside the image.
+
+    Args:
+        gt_bboxes_3d_list (List[:obj:`CameraInstance3DBoxes`]): Ground truth
+            bboxes of each image.
+        centers2d_list (List[Tensor]): Projected 3D centers onto 2D image,
+            shape (num_gt, 2).
+        img_metas (List[dict]): Meta information of each image, e.g.,
+            image size, scaling factor, etc.
+        use_local_coords (bool): Whether to use local coordinates
+            for keypoints. Defaults to True.
+
+    Returns:
+        Tuple[List[Tensor], List[Tensor]]: It contains two elements,
+        the first is the keypoints for each projected 2D bbox in batch data.
+        The second is the visible mask of depth calculated by keypoints.
+    """
+
+    assert len(gt_bboxes_3d_list) == len(centers2d_list)
+    bs = len(gt_bboxes_3d_list)
+    keypoints2d_list = []
+    keypoints_depth_mask_list = []
+
+    for i in range(bs):
+        gt_bboxes_3d = gt_bboxes_3d_list[i]
+        centers2d = centers2d_list[i]
+        img_shape = img_metas[i]['img_shape']
+        cam2img = img_metas[i]['cam2img']
+        h, w = img_shape[:2]
+        # (N, 8, 3)
+        corners3d = gt_bboxes_3d.corners
+        top_centers3d = torch.mean(corners3d[:, [0, 1, 4, 5], :], dim=1)
+        bot_centers3d = torch.mean(corners3d[:, [2, 3, 6, 7], :], dim=1)
+        # (N, 2, 3)
+        top_bot_centers3d = torch.stack((top_centers3d, bot_centers3d), dim=1)
+        keypoints3d = torch.cat((corners3d, top_bot_centers3d), dim=1)
+        # (N, 10, 2)
+        keypoints2d = points_cam2img(keypoints3d, cam2img)
+
+        # keypoints mask: keypoints must be inside
+        # the image and in front of the camera
+        keypoints_x_visible = (keypoints2d[..., 0] >= 0) & (
+            keypoints2d[..., 0] <= w - 1)
+        keypoints_y_visible = (keypoints2d[..., 1] >= 0) & (
+            keypoints2d[..., 1] <= h - 1)
+        keypoints_z_visible = (keypoints3d[..., -1] > 0)
+
+        # (N, 1O)
+        keypoints_visible = \
+            keypoints_x_visible & keypoints_y_visible & keypoints_z_visible
+        # center, diag-02, diag-13
+        keypoints_depth_valid = torch.stack(
+            (keypoints_visible[:, [8, 9]].all(dim=1),
+             keypoints_visible[:, [0, 3, 5, 6]].all(dim=1),
+             keypoints_visible[:, [1, 2, 4, 7]].all(dim=1)),
+            dim=1)
+        keypoints_visible = keypoints_visible.float()
+
+        if use_local_coords:
+            keypoints2d = torch.cat((keypoints2d - centers2d.unsqueeze(1),
+                                     keypoints_visible.unsqueeze(-1)),
+                                    dim=2)
+        else:
+            keypoints2d = torch.cat(
+                (keypoints2d, keypoints_visible.unsqueeze(-1)), dim=2)
+
+        keypoints2d_list.append(keypoints2d)
+        keypoints_depth_mask_list.append(keypoints_depth_valid)
+
+    return (keypoints2d_list, keypoints_depth_mask_list)
diff --git a/mmde/mmdet3d/models/utils/handle_objs.py b/mmde/mmdet3d/models/utils/handle_objs.py
new file mode 100644
index 0000000000000000000000000000000000000000..d05afb16bb4e418b45a5eeb6693a7a5784ceea66
--- /dev/null
+++ b/mmde/mmdet3d/models/utils/handle_objs.py
@@ -0,0 +1,149 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet3d.structures import CameraInstance3DBoxes
+
+
+def filter_outside_objs(gt_bboxes_list: List[Tensor],
+                        gt_labels_list: List[Tensor],
+                        gt_bboxes_3d_list: List[CameraInstance3DBoxes],
+                        gt_labels_3d_list: List[Tensor],
+                        centers2d_list: List[Tensor],
+                        img_metas: List[dict]) -> None:
+    """Function to filter the objects label outside the image.
+
+    Args:
+        gt_bboxes_list (List[Tensor]): Ground truth bboxes of each image,
+            each has shape (num_gt, 4).
+        gt_labels_list (List[Tensor]): Ground truth labels of each box,
+            each has shape (num_gt,).
+        gt_bboxes_3d_list (List[:obj:`CameraInstance3DBoxes`]): 3D Ground
+            truth bboxes of each image, each has shape
+            (num_gt, bbox_code_size).
+        gt_labels_3d_list (List[Tensor]): 3D Ground truth labels of each
+            box, each has shape (num_gt,).
+        centers2d_list (List[Tensor]): Projected 3D centers onto 2D image,
+            each has shape (num_gt, 2).
+        img_metas (list[dict]): Meta information of each image, e.g.,
+            image size, scaling factor, etc.
+    """
+    bs = len(centers2d_list)
+
+    for i in range(bs):
+        centers2d = centers2d_list[i].clone()
+        img_shape = img_metas[i]['img_shape']
+        keep_inds = (centers2d[:, 0] > 0) & \
+                    (centers2d[:, 0] < img_shape[1]) & \
+                    (centers2d[:, 1] > 0) & \
+                    (centers2d[:, 1] < img_shape[0])
+        centers2d_list[i] = centers2d[keep_inds]
+        gt_labels_list[i] = gt_labels_list[i][keep_inds]
+        gt_bboxes_list[i] = gt_bboxes_list[i][keep_inds]
+        gt_bboxes_3d_list[i].tensor = gt_bboxes_3d_list[i].tensor[keep_inds]
+        gt_labels_3d_list[i] = gt_labels_3d_list[i][keep_inds]
+
+
+def get_centers2d_target(centers2d: Tensor, centers: Tensor,
+                         img_shape: tuple) -> Tensor:
+    """Function to get target centers2d.
+
+    Args:
+        centers2d (Tensor): Projected 3D centers onto 2D images.
+        centers (Tensor): Centers of 2d gt bboxes.
+        img_shape (tuple): Resized image shape.
+
+    Returns:
+        torch.Tensor: Projected 3D centers (centers2D) target.
+    """
+    N = centers2d.shape[0]
+    h, w = img_shape[:2]
+    valid_intersects = centers2d.new_zeros((N, 2))
+    a = (centers[:, 1] - centers2d[:, 1]) / (centers[:, 0] - centers2d[:, 0])
+    b = centers[:, 1] - a * centers[:, 0]
+    left_y = b
+    right_y = (w - 1) * a + b
+    top_x = -b / a
+    bottom_x = (h - 1 - b) / a
+
+    left_coors = torch.stack((left_y.new_zeros(N, ), left_y), dim=1)
+    right_coors = torch.stack((right_y.new_full((N, ), w - 1), right_y), dim=1)
+    top_coors = torch.stack((top_x, top_x.new_zeros(N, )), dim=1)
+    bottom_coors = torch.stack((bottom_x, bottom_x.new_full((N, ), h - 1)),
+                               dim=1)
+
+    intersects = torch.stack(
+        [left_coors, right_coors, top_coors, bottom_coors], dim=1)
+    intersects_x = intersects[:, :, 0]
+    intersects_y = intersects[:, :, 1]
+    inds = (intersects_x >= 0) & (intersects_x <=
+                                  w - 1) & (intersects_y >= 0) & (
+                                      intersects_y <= h - 1)
+    valid_intersects = intersects[inds].reshape(N, 2, 2)
+    dist = torch.norm(valid_intersects - centers2d.unsqueeze(1), dim=2)
+    min_idx = torch.argmin(dist, dim=1)
+
+    min_idx = min_idx.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 2)
+    centers2d_target = valid_intersects.gather(dim=1, index=min_idx).squeeze(1)
+
+    return centers2d_target
+
+
+def handle_proj_objs(
+        centers2d_list: List[Tensor], gt_bboxes_list: List[Tensor],
+        img_metas: List[dict]
+) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]:
+    """Function to handle projected object centers2d, generate target
+    centers2d.
+
+    Args:
+        gt_bboxes_list (List[Tensor]): Ground truth bboxes of each image,
+            shape (num_gt, 4).
+        centers2d_list (List[Tensor]): Projected 3D centers onto 2D image,
+            shape (num_gt, 2).
+        img_metas (List[dict]): Meta information of each image, e.g.,
+            image size, scaling factor, etc.
+
+    Returns:
+        Tuple[List[Tensor], List[Tensor], List[Tensor]]: It contains three
+        elements. The first is the target centers2d after handling the
+        truncated objects. The second is the offsets between target centers2d
+        and round int dtype centers2d,and the last is the truncation mask
+        for each object in batch data.
+    """
+    bs = len(centers2d_list)
+    centers2d_target_list = []
+    trunc_mask_list = []
+    offsets2d_list = []
+    # for now, only pad mode that img is padded by right and
+    # bottom side is supported.
+    for i in range(bs):
+        centers2d = centers2d_list[i]
+        gt_bbox = gt_bboxes_list[i]
+        img_shape = img_metas[i]['img_shape']
+        centers2d_target = centers2d.clone()
+        inside_inds = (centers2d[:, 0] > 0) & \
+                      (centers2d[:, 0] < img_shape[1]) & \
+                      (centers2d[:, 1] > 0) & \
+                      (centers2d[:, 1] < img_shape[0])
+        outside_inds = ~inside_inds
+
+        # if there are outside objects
+        if outside_inds.any():
+            centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2
+            outside_centers2d = centers2d[outside_inds]
+            match_centers = centers[outside_inds]
+            target_outside_centers2d = get_centers2d_target(
+                outside_centers2d, match_centers, img_shape)
+            centers2d_target[outside_inds] = target_outside_centers2d
+
+        offsets2d = centers2d - centers2d_target.round().int()
+        trunc_mask = outside_inds
+
+        centers2d_target_list.append(centers2d_target)
+        trunc_mask_list.append(trunc_mask)
+        offsets2d_list.append(offsets2d)
+
+    return (centers2d_target_list, offsets2d_list, trunc_mask_list)
diff --git a/mmde/mmdet3d/models/voxel_encoders/__init__.py b/mmde/mmdet3d/models/voxel_encoders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e14e87e0f7e42736f3b7953eab05398488102ce
--- /dev/null
+++ b/mmde/mmdet3d/models/voxel_encoders/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .pillar_encoder import DynamicPillarFeatureNet, PillarFeatureNet
+from .voxel_encoder import (DynamicSimpleVFE, DynamicVFE, HardSimpleVFE,
+                            HardVFE, SegVFE)
+
+__all__ = [
+    'PillarFeatureNet', 'DynamicPillarFeatureNet', 'HardVFE', 'DynamicVFE',
+    'HardSimpleVFE', 'DynamicSimpleVFE', 'SegVFE'
+]
diff --git a/mmde/mmdet3d/models/voxel_encoders/pillar_encoder.py b/mmde/mmdet3d/models/voxel_encoders/pillar_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf0189cfb267f86f25da80228ea11c4a533a9f3c
--- /dev/null
+++ b/mmde/mmdet3d/models/voxel_encoders/pillar_encoder.py
@@ -0,0 +1,326 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple
+
+import torch
+from mmcv.cnn import build_norm_layer
+from mmcv.ops import DynamicScatter
+from torch import Tensor, nn
+
+from mmdet3d.registry import MODELS
+from .utils import PFNLayer, get_paddings_indicator
+
+
+@MODELS.register_module()
+class PillarFeatureNet(nn.Module):
+    """Pillar Feature Net.
+
+    The network prepares the pillar features and performs forward pass
+    through PFNLayers.
+
+    Args:
+        in_channels (int, optional): Number of input features,
+            either x, y, z or x, y, z, r. Defaults to 4.
+        feat_channels (tuple, optional): Number of features in each of the
+            N PFNLayers. Defaults to (64, ).
+        with_distance (bool, optional): Whether to include Euclidean distance
+            to points. Defaults to False.
+        with_cluster_center (bool, optional): [description]. Defaults to True.
+        with_voxel_center (bool, optional): [description]. Defaults to True.
+        voxel_size (tuple[float], optional): Size of voxels, only utilize x
+            and y size. Defaults to (0.2, 0.2, 4).
+        point_cloud_range (tuple[float], optional): Point cloud range, only
+            utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1).
+        norm_cfg ([type], optional): [description].
+            Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01).
+        mode (str, optional): The mode to gather point features. Options are
+            'max' or 'avg'. Defaults to 'max'.
+        legacy (bool, optional): Whether to use the new behavior or
+            the original behavior. Defaults to True.
+    """
+
+    def __init__(self,
+                 in_channels: Optional[int] = 4,
+                 feat_channels: Optional[tuple] = (64, ),
+                 with_distance: Optional[bool] = False,
+                 with_cluster_center: Optional[bool] = True,
+                 with_voxel_center: Optional[bool] = True,
+                 voxel_size: Optional[Tuple[float]] = (0.2, 0.2, 4),
+                 point_cloud_range: Optional[Tuple[float]] = (0, -40, -3, 70.4,
+                                                              40, 1),
+                 norm_cfg: Optional[dict] = dict(
+                     type='BN1d', eps=1e-3, momentum=0.01),
+                 mode: Optional[str] = 'max',
+                 legacy: Optional[bool] = True):
+        super(PillarFeatureNet, self).__init__()
+        assert len(feat_channels) > 0
+        self.legacy = legacy
+        if with_cluster_center:
+            in_channels += 3
+        if with_voxel_center:
+            in_channels += 3
+        if with_distance:
+            in_channels += 1
+        self._with_distance = with_distance
+        self._with_cluster_center = with_cluster_center
+        self._with_voxel_center = with_voxel_center
+        # Create PillarFeatureNet layers
+        self.in_channels = in_channels
+        feat_channels = [in_channels] + list(feat_channels)
+        pfn_layers = []
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
+            if i < len(feat_channels) - 2:
+                last_layer = False
+            else:
+                last_layer = True
+            pfn_layers.append(
+                PFNLayer(
+                    in_filters,
+                    out_filters,
+                    norm_cfg=norm_cfg,
+                    last_layer=last_layer,
+                    mode=mode))
+        self.pfn_layers = nn.ModuleList(pfn_layers)
+
+        # Need pillar (voxel) size and x/y offset in order to calculate offset
+        self.vx = voxel_size[0]
+        self.vy = voxel_size[1]
+        self.vz = voxel_size[2]
+        self.x_offset = self.vx / 2 + point_cloud_range[0]
+        self.y_offset = self.vy / 2 + point_cloud_range[1]
+        self.z_offset = self.vz / 2 + point_cloud_range[2]
+        self.point_cloud_range = point_cloud_range
+
+    def forward(self, features: Tensor, num_points: Tensor, coors: Tensor,
+                *args, **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            features (torch.Tensor): Point features or raw points in shape
+                (N, M, C).
+            num_points (torch.Tensor): Number of points in each pillar.
+            coors (torch.Tensor): Coordinates of each voxel.
+
+        Returns:
+            torch.Tensor: Features of pillars.
+        """
+        features_ls = [features]
+        # Find distance of x, y, and z from cluster center
+        if self._with_cluster_center:
+            points_mean = features[:, :, :3].sum(
+                dim=1, keepdim=True) / num_points.type_as(features).view(
+                    -1, 1, 1)
+            f_cluster = features[:, :, :3] - points_mean
+            features_ls.append(f_cluster)
+
+        # Find distance of x, y, and z from pillar center
+        dtype = features.dtype
+        if self._with_voxel_center:
+            if not self.legacy:
+                f_center = torch.zeros_like(features[:, :, :3])
+                f_center[:, :, 0] = features[:, :, 0] - (
+                    coors[:, 3].to(dtype).unsqueeze(1) * self.vx +
+                    self.x_offset)
+                f_center[:, :, 1] = features[:, :, 1] - (
+                    coors[:, 2].to(dtype).unsqueeze(1) * self.vy +
+                    self.y_offset)
+                f_center[:, :, 2] = features[:, :, 2] - (
+                    coors[:, 1].to(dtype).unsqueeze(1) * self.vz +
+                    self.z_offset)
+            else:
+                f_center = features[:, :, :3]
+                f_center[:, :, 0] = f_center[:, :, 0] - (
+                    coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
+                    self.x_offset)
+                f_center[:, :, 1] = f_center[:, :, 1] - (
+                    coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
+                    self.y_offset)
+                f_center[:, :, 2] = f_center[:, :, 2] - (
+                    coors[:, 1].type_as(features).unsqueeze(1) * self.vz +
+                    self.z_offset)
+            features_ls.append(f_center)
+
+        if self._with_distance:
+            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
+            features_ls.append(points_dist)
+
+        # Combine together feature decorations
+        features = torch.cat(features_ls, dim=-1)
+        # The feature decorations were calculated without regard to whether
+        # pillar was empty. Need to ensure that
+        # empty pillars remain set to zeros.
+        voxel_count = features.shape[1]
+        mask = get_paddings_indicator(num_points, voxel_count, axis=0)
+        mask = torch.unsqueeze(mask, -1).type_as(features)
+        features *= mask
+
+        for pfn in self.pfn_layers:
+            features = pfn(features, num_points)
+
+        return features.squeeze(1)
+
+
+@MODELS.register_module()
+class DynamicPillarFeatureNet(PillarFeatureNet):
+    """Pillar Feature Net using dynamic voxelization.
+
+    The network prepares the pillar features and performs forward pass
+    through PFNLayers. The main difference is that it is used for
+    dynamic voxels, which contains different number of points inside a voxel
+    without limits.
+
+    Args:
+        in_channels (int, optional): Number of input features,
+            either x, y, z or x, y, z, r. Defaults to 4.
+        feat_channels (tuple, optional): Number of features in each of the
+            N PFNLayers. Defaults to (64, ).
+        with_distance (bool, optional): Whether to include Euclidean distance
+            to points. Defaults to False.
+        with_cluster_center (bool, optional): [description]. Defaults to True.
+        with_voxel_center (bool, optional): [description]. Defaults to True.
+        voxel_size (tuple[float], optional): Size of voxels, only utilize x
+            and y size. Defaults to (0.2, 0.2, 4).
+        point_cloud_range (tuple[float], optional): Point cloud range, only
+            utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1).
+        norm_cfg ([type], optional): [description].
+            Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01).
+        mode (str, optional): The mode to gather point features. Options are
+            'max' or 'avg'. Defaults to 'max'.
+        legacy (bool, optional): Whether to use the new behavior or
+            the original behavior. Defaults to True.
+    """
+
+    def __init__(self,
+                 in_channels: Optional[int] = 4,
+                 feat_channels: Optional[tuple] = (64, ),
+                 with_distance: Optional[bool] = False,
+                 with_cluster_center: Optional[bool] = True,
+                 with_voxel_center: Optional[bool] = True,
+                 voxel_size: Optional[Tuple[float]] = (0.2, 0.2, 4),
+                 point_cloud_range: Optional[Tuple[float]] = (0, -40, -3, 70.4,
+                                                              40, 1),
+                 norm_cfg: Optional[dict] = dict(
+                     type='BN1d', eps=1e-3, momentum=0.01),
+                 mode: Optional[str] = 'max',
+                 legacy: Optional[bool] = True):
+        super(DynamicPillarFeatureNet, self).__init__(
+            in_channels,
+            feat_channels,
+            with_distance,
+            with_cluster_center=with_cluster_center,
+            with_voxel_center=with_voxel_center,
+            voxel_size=voxel_size,
+            point_cloud_range=point_cloud_range,
+            norm_cfg=norm_cfg,
+            mode=mode,
+            legacy=legacy)
+        feat_channels = [self.in_channels] + list(feat_channels)
+        pfn_layers = []
+        # TODO: currently only support one PFNLayer
+
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
+            if i > 0:
+                in_filters *= 2
+            norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
+            pfn_layers.append(
+                nn.Sequential(
+                    nn.Linear(in_filters, out_filters, bias=False), norm_layer,
+                    nn.ReLU(inplace=True)))
+        self.num_pfn = len(pfn_layers)
+        self.pfn_layers = nn.ModuleList(pfn_layers)
+        self.pfn_scatter = DynamicScatter(voxel_size, point_cloud_range,
+                                          (mode != 'max'))
+        self.cluster_scatter = DynamicScatter(
+            voxel_size, point_cloud_range, average_points=True)
+
+    def map_voxel_center_to_point(self, pts_coors: Tensor, voxel_mean: Tensor,
+                                  voxel_coors: Tensor) -> Tensor:
+        """Map the centers of voxels to its corresponding points.
+
+        Args:
+            pts_coors (torch.Tensor): The coordinates of each points, shape
+                (M, 3), where M is the number of points.
+            voxel_mean (torch.Tensor): The mean or aggregated features of a
+                voxel, shape (N, C), where N is the number of voxels.
+            voxel_coors (torch.Tensor): The coordinates of each voxel.
+
+        Returns:
+            torch.Tensor: Corresponding voxel centers of each points, shape
+                (M, C), where M is the number of points.
+        """
+        # Step 1: scatter voxel into canvas
+        # Calculate necessary things for canvas creation
+        canvas_y = int(
+            (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy)
+        canvas_x = int(
+            (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx)
+        canvas_channel = voxel_mean.size(1)
+        batch_size = pts_coors[-1, 0] + 1
+        canvas_len = canvas_y * canvas_x * batch_size
+        # Create the canvas for this sample
+        canvas = voxel_mean.new_zeros(canvas_channel, canvas_len)
+        # Only include non-empty pillars
+        indices = (
+            voxel_coors[:, 0] * canvas_y * canvas_x +
+            voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3])
+        # Scatter the blob back to the canvas
+        canvas[:, indices.long()] = voxel_mean.t()
+
+        # Step 2: get voxel mean for each point
+        voxel_index = (
+            pts_coors[:, 0] * canvas_y * canvas_x +
+            pts_coors[:, 2] * canvas_x + pts_coors[:, 3])
+        center_per_point = canvas[:, voxel_index.long()].t()
+        return center_per_point
+
+    def forward(self, features: Tensor, coors: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            features (torch.Tensor): Point features or raw points in shape
+                (N, M, C).
+            coors (torch.Tensor): Coordinates of each voxel
+
+        Returns:
+            torch.Tensor: Features of pillars.
+        """
+        features_ls = [features]
+        # Find distance of x, y, and z from cluster center
+        if self._with_cluster_center:
+            voxel_mean, mean_coors = self.cluster_scatter(features, coors)
+            points_mean = self.map_voxel_center_to_point(
+                coors, voxel_mean, mean_coors)
+            # TODO: maybe also do cluster for reflectivity
+            f_cluster = features[:, :3] - points_mean[:, :3]
+            features_ls.append(f_cluster)
+
+        # Find distance of x, y, and z from pillar center
+        if self._with_voxel_center:
+            f_center = features.new_zeros(size=(features.size(0), 3))
+            f_center[:, 0] = features[:, 0] - (
+                coors[:, 3].type_as(features) * self.vx + self.x_offset)
+            f_center[:, 1] = features[:, 1] - (
+                coors[:, 2].type_as(features) * self.vy + self.y_offset)
+            f_center[:, 2] = features[:, 2] - (
+                coors[:, 1].type_as(features) * self.vz + self.z_offset)
+            features_ls.append(f_center)
+
+        if self._with_distance:
+            points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True)
+            features_ls.append(points_dist)
+
+        # Combine together feature decorations
+        features = torch.cat(features_ls, dim=-1)
+        for i, pfn in enumerate(self.pfn_layers):
+            point_feats = pfn(features)
+            voxel_feats, voxel_coors = self.pfn_scatter(point_feats, coors)
+            if i != len(self.pfn_layers) - 1:
+                # need to concat voxel feats if it is not the last pfn
+                feat_per_point = self.map_voxel_center_to_point(
+                    coors, voxel_feats, voxel_coors)
+                features = torch.cat([point_feats, feat_per_point], dim=1)
+
+        return voxel_feats, voxel_coors
diff --git a/mmde/mmdet3d/models/voxel_encoders/utils.py b/mmde/mmdet3d/models/voxel_encoders/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..131ed3fe644343155eb417244ef4f5beeafbf6b2
--- /dev/null
+++ b/mmde/mmdet3d/models/voxel_encoders/utils.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmcv.cnn import build_norm_layer
+from torch import Tensor, nn
+from torch.nn import functional as F
+
+
+def get_paddings_indicator(actual_num: Tensor,
+                           max_num: Tensor,
+                           axis: int = 0) -> Tensor:
+    """Create boolean mask by actually number of a padded tensor.
+
+    Args:
+        actual_num (torch.Tensor): Actual number of points in each voxel.
+        max_num (int): Max number of points in each voxel
+
+    Returns:
+        torch.Tensor: Mask indicates which points are valid inside a voxel.
+    """
+    actual_num = torch.unsqueeze(actual_num, axis + 1)
+    # tiled_actual_num: [N, M, 1]
+    max_num_shape = [1] * len(actual_num.shape)
+    max_num_shape[axis + 1] = -1
+    max_num = torch.arange(
+        max_num, dtype=torch.int, device=actual_num.device).view(max_num_shape)
+    # tiled_actual_num: [[3,3,3,3,3], [4,4,4,4,4], [2,2,2,2,2]]
+    # tiled_max_num: [[0,1,2,3,4], [0,1,2,3,4], [0,1,2,3,4]]
+    paddings_indicator = actual_num.int() > max_num
+    # paddings_indicator shape: [batch_size, max_num]
+    return paddings_indicator
+
+
+class VFELayer(nn.Module):
+    """Voxel Feature Encoder layer.
+
+    The voxel encoder is composed of a series of these layers.
+    This module do not support average pooling and only support to use
+    max pooling to gather features inside a VFE.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        norm_cfg (dict): Config dict of normalization layers
+        max_out (bool): Whether aggregate the features of points inside
+            each voxel and only return voxel features.
+        cat_max (bool): Whether concatenate the aggregated features
+            and pointwise features.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: Optional[dict] = dict(
+                     type='BN1d', eps=1e-3, momentum=0.01),
+                 max_out: Optional[bool] = True,
+                 cat_max: Optional[bool] = True):
+        super(VFELayer, self).__init__()
+        self.cat_max = cat_max
+        self.max_out = max_out
+        # self.units = int(out_channels / 2)
+
+        self.norm = build_norm_layer(norm_cfg, out_channels)[1]
+        self.linear = nn.Linear(in_channels, out_channels, bias=False)
+
+    def forward(self, inputs: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            inputs (torch.Tensor): Voxels features of shape (N, M, C).
+                N is the number of voxels, M is the number of points in
+                voxels, C is the number of channels of point features.
+
+        Returns:
+            torch.Tensor: Voxel features. There are three mode under which the
+                features have different meaning.
+                - `max_out=False`: Return point-wise features in
+                    shape (N, M, C).
+                - `max_out=True` and `cat_max=False`: Return aggregated
+                    voxel features in shape (N, C)
+                - `max_out=True` and `cat_max=True`: Return concatenated
+                    point-wise features in shape (N, M, C).
+        """
+        # [K, T, 7] tensordot [7, units] = [K, T, units]
+        voxel_count = inputs.shape[1]
+
+        x = self.linear(inputs)
+        x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
+                                                               1).contiguous()
+        pointwise = F.relu(x)
+        # [K, T, units]
+        if self.max_out:
+            aggregated = torch.max(pointwise, dim=1, keepdim=True)[0]
+        else:
+            # this is for fusion layer
+            return pointwise
+
+        if not self.cat_max:
+            return aggregated.squeeze(1)
+        else:
+            # [K, 1, units]
+            repeated = aggregated.repeat(1, voxel_count, 1)
+            concatenated = torch.cat([pointwise, repeated], dim=2)
+            # [K, T, 2 * units]
+            return concatenated
+
+
+class PFNLayer(nn.Module):
+    """Pillar Feature Net Layer.
+
+    The Pillar Feature Net is composed of a series of these layers, but the
+    PointPillars paper results only used a single PFNLayer.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        norm_cfg (dict, optional): Config dict of normalization layers.
+            Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01).
+        last_layer (bool, optional): If last_layer, there is no
+            concatenation of features. Defaults to False.
+        mode (str, optional): Pooling model to gather features inside voxels.
+            Defaults to 'max'.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: Optional[dict] = dict(
+                     type='BN1d', eps=1e-3, momentum=0.01),
+                 last_layer: Optional[bool] = False,
+                 mode: Optional[str] = 'max'):
+
+        super().__init__()
+        self.name = 'PFNLayer'
+        self.last_vfe = last_layer
+        if not self.last_vfe:
+            out_channels = out_channels // 2
+        self.units = out_channels
+
+        self.norm = build_norm_layer(norm_cfg, self.units)[1]
+        self.linear = nn.Linear(in_channels, self.units, bias=False)
+
+        assert mode in ['max', 'avg']
+        self.mode = mode
+
+    def forward(self,
+                inputs: Tensor,
+                num_voxels: Optional[Tensor] = None,
+                aligned_distance: Optional[Tensor] = None) -> Tensor:
+        """Forward function.
+
+        Args:
+            inputs (torch.Tensor): Pillar/Voxel inputs with shape (N, M, C).
+                N is the number of voxels, M is the number of points in
+                voxels, C is the number of channels of point features.
+            num_voxels (torch.Tensor, optional): Number of points in each
+                voxel. Defaults to None.
+            aligned_distance (torch.Tensor, optional): The distance of
+                each points to the voxel center. Defaults to None.
+
+        Returns:
+            torch.Tensor: Features of Pillars.
+        """
+        x = self.linear(inputs)
+        x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
+                                                               1).contiguous()
+        x = F.relu(x)
+
+        if self.mode == 'max':
+            if aligned_distance is not None:
+                x = x.mul(aligned_distance.unsqueeze(-1))
+            x_max = torch.max(x, dim=1, keepdim=True)[0]
+        elif self.mode == 'avg':
+            if aligned_distance is not None:
+                x = x.mul(aligned_distance.unsqueeze(-1))
+            x_max = x.sum(
+                dim=1, keepdim=True) / num_voxels.type_as(inputs).view(
+                    -1, 1, 1)
+
+        if self.last_vfe:
+            return x_max
+        else:
+            x_repeat = x_max.repeat(1, inputs.shape[1], 1)
+            x_concatenated = torch.cat([x, x_repeat], dim=2)
+            return x_concatenated
diff --git a/mmde/mmdet3d/models/voxel_encoders/voxel_encoder.py b/mmde/mmdet3d/models/voxel_encoders/voxel_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a01895b29ef6578079ca120f5e4ba2aeea5101f
--- /dev/null
+++ b/mmde/mmdet3d/models/voxel_encoders/voxel_encoder.py
@@ -0,0 +1,640 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Tuple
+
+import torch
+from mmcv.cnn import build_norm_layer
+from mmcv.ops import DynamicScatter
+from torch import Tensor, nn
+
+from mmdet3d.registry import MODELS
+from .utils import VFELayer, get_paddings_indicator
+
+
+@MODELS.register_module()
+class HardSimpleVFE(nn.Module):
+    """Simple voxel feature encoder used in SECOND.
+
+    It simply averages the values of points in a voxel.
+
+    Args:
+        num_features (int, optional): Number of features to use. Default: 4.
+    """
+
+    def __init__(self, num_features: int = 4) -> None:
+        super(HardSimpleVFE, self).__init__()
+        self.num_features = num_features
+
+    def forward(self, features: Tensor, num_points: Tensor, coors: Tensor,
+                *args, **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            features (torch.Tensor): Point features in shape
+                (N, M, 3(4)). N is the number of voxels and M is the maximum
+                number of points inside a single voxel.
+            num_points (torch.Tensor): Number of points in each voxel,
+                 shape (N, ).
+            coors (torch.Tensor): Coordinates of voxels.
+
+        Returns:
+            torch.Tensor: Mean of points inside each voxel in shape (N, 3(4))
+        """
+        points_mean = features[:, :, :self.num_features].sum(
+            dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1)
+        return points_mean.contiguous()
+
+
+@MODELS.register_module()
+class DynamicSimpleVFE(nn.Module):
+    """Simple dynamic voxel feature encoder used in DV-SECOND.
+
+    It simply averages the values of points in a voxel.
+    But the number of points in a voxel is dynamic and varies.
+
+    Args:
+        voxel_size (tupe[float]): Size of a single voxel
+        point_cloud_range (tuple[float]): Range of the point cloud and voxels
+    """
+
+    def __init__(self,
+                 voxel_size: Tuple[float] = (0.2, 0.2, 4),
+                 point_cloud_range: Tuple[float] = (0, -40, -3, 70.4, 40, 1)):
+        super(DynamicSimpleVFE, self).__init__()
+        self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
+
+    @torch.no_grad()
+    def forward(self, features: Tensor, coors: Tensor, *args,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            features (torch.Tensor): Point features in shape
+                (N, 3(4)). N is the number of points.
+            coors (torch.Tensor): Coordinates of voxels.
+
+        Returns:
+            torch.Tensor: Mean of points inside each voxel in shape (M, 3(4)).
+                M is the number of voxels.
+        """
+        # This function is used from the start of the voxelnet
+        # num_points: [concated_num_points]
+        features, features_coors = self.scatter(features, coors)
+        return features, features_coors
+
+
+@MODELS.register_module()
+class DynamicVFE(nn.Module):
+    """Dynamic Voxel feature encoder used in DV-SECOND.
+
+    It encodes features of voxels and their points. It could also fuse
+    image feature into voxel features in a point-wise manner.
+    The number of points inside the voxel varies.
+
+    Args:
+        in_channels (int, optional): Input channels of VFE. Defaults to 4.
+        feat_channels (list(int), optional): Channels of features in VFE.
+        with_distance (bool, optional): Whether to use the L2 distance of
+            points to the origin point. Defaults to False.
+        with_cluster_center (bool, optional): Whether to use the distance
+            to cluster center of points inside a voxel. Defaults to False.
+        with_voxel_center (bool, optional): Whether to use the distance
+            to center of voxel for each points inside a voxel.
+            Defaults to False.
+        voxel_size (tuple[float], optional): Size of a single voxel.
+            Defaults to (0.2, 0.2, 4).
+        point_cloud_range (tuple[float], optional): The range of points
+            or voxels. Defaults to (0, -40, -3, 70.4, 40, 1).
+        norm_cfg (dict, optional): Config dict of normalization layers.
+        mode (str, optional): The mode when pooling features of points
+            inside a voxel. Available options include 'max' and 'avg'.
+            Defaults to 'max'.
+        fusion_layer (dict, optional): The config dict of fusion
+            layer used in multi-modal detectors. Defaults to None.
+        return_point_feats (bool, optional): Whether to return the features
+            of each points. Defaults to False.
+    """
+
+    def __init__(self,
+                 in_channels: int = 4,
+                 feat_channels: list = [],
+                 with_distance: bool = False,
+                 with_cluster_center: bool = False,
+                 with_voxel_center: bool = False,
+                 voxel_size: Tuple[float] = (0.2, 0.2, 4),
+                 point_cloud_range: Tuple[float] = (0, -40, -3, 70.4, 40, 1),
+                 norm_cfg: dict = dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 mode: str = 'max',
+                 fusion_layer: dict = None,
+                 return_point_feats: bool = False):
+        super(DynamicVFE, self).__init__()
+        assert mode in ['avg', 'max']
+        assert len(feat_channels) > 0
+        if with_cluster_center:
+            in_channels += 3
+        if with_voxel_center:
+            in_channels += 3
+        if with_distance:
+            in_channels += 1
+        self.in_channels = in_channels
+        self._with_distance = with_distance
+        self._with_cluster_center = with_cluster_center
+        self._with_voxel_center = with_voxel_center
+        self.return_point_feats = return_point_feats
+
+        # Need pillar (voxel) size and x/y offset in order to calculate offset
+        self.vx = voxel_size[0]
+        self.vy = voxel_size[1]
+        self.vz = voxel_size[2]
+        self.x_offset = self.vx / 2 + point_cloud_range[0]
+        self.y_offset = self.vy / 2 + point_cloud_range[1]
+        self.z_offset = self.vz / 2 + point_cloud_range[2]
+        self.point_cloud_range = point_cloud_range
+
+        feat_channels = [self.in_channels] + list(feat_channels)
+        vfe_layers = []
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
+            if i > 0:
+                in_filters *= 2
+            norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
+            vfe_layers.append(
+                nn.Sequential(
+                    nn.Linear(in_filters, out_filters, bias=False), norm_layer,
+                    nn.ReLU(inplace=True)))
+        self.vfe_layers = nn.ModuleList(vfe_layers)
+        self.num_vfe = len(vfe_layers)
+        self.vfe_scatter = DynamicScatter(voxel_size, point_cloud_range,
+                                          (mode != 'max'))
+        self.cluster_scatter = DynamicScatter(
+            voxel_size, point_cloud_range, average_points=True)
+        self.fusion_layer = None
+        if fusion_layer is not None:
+            self.fusion_layer = MODELS.build(fusion_layer)
+
+    def map_voxel_center_to_point(self, pts_coors: Tensor, voxel_mean: Tensor,
+                                  voxel_coors: Tensor) -> Tensor:
+        """Map voxel features to its corresponding points.
+
+        Args:
+            pts_coors (torch.Tensor): Voxel coordinate of each point.
+            voxel_mean (torch.Tensor): Voxel features to be mapped.
+            voxel_coors (torch.Tensor): Coordinates of valid voxels
+
+        Returns:
+            torch.Tensor: Features or centers of each point.
+        """
+        # Step 1: scatter voxel into canvas
+        # Calculate necessary things for canvas creation
+        canvas_z = int(
+            (self.point_cloud_range[5] - self.point_cloud_range[2]) / self.vz)
+        canvas_y = int(
+            (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy)
+        canvas_x = int(
+            (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx)
+        # canvas_channel = voxel_mean.size(1)
+        batch_size = pts_coors[-1, 0] + 1
+        canvas_len = canvas_z * canvas_y * canvas_x * batch_size
+        # Create the canvas for this sample
+        canvas = voxel_mean.new_zeros(canvas_len, dtype=torch.long)
+        # Only include non-empty pillars
+        indices = (
+            voxel_coors[:, 0] * canvas_z * canvas_y * canvas_x +
+            voxel_coors[:, 1] * canvas_y * canvas_x +
+            voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3])
+        # Scatter the blob back to the canvas
+        canvas[indices.long()] = torch.arange(
+            start=0, end=voxel_mean.size(0), device=voxel_mean.device)
+
+        # Step 2: get voxel mean for each point
+        voxel_index = (
+            pts_coors[:, 0] * canvas_z * canvas_y * canvas_x +
+            pts_coors[:, 1] * canvas_y * canvas_x +
+            pts_coors[:, 2] * canvas_x + pts_coors[:, 3])
+        voxel_inds = canvas[voxel_index.long()]
+        center_per_point = voxel_mean[voxel_inds, ...]
+        return center_per_point
+
+    def forward(self,
+                features: Tensor,
+                coors: Tensor,
+                points: Optional[Sequence[Tensor]] = None,
+                img_feats: Optional[Sequence[Tensor]] = None,
+                img_metas: Optional[dict] = None,
+                *args,
+                **kwargs) -> tuple:
+        """Forward functions.
+
+        Args:
+            features (torch.Tensor): Features of voxels, shape is NxC.
+            coors (torch.Tensor): Coordinates of voxels, shape is  Nx(1+NDim).
+            points (list[torch.Tensor], optional): Raw points used to guide the
+                multi-modality fusion. Defaults to None.
+            img_feats (list[torch.Tensor], optional): Image features used for
+                multi-modality fusion. Defaults to None.
+            img_metas (dict, optional): [description]. Defaults to None.
+
+        Returns:
+            tuple: If `return_point_feats` is False, returns voxel features and
+                its coordinates. If `return_point_feats` is True, returns
+                feature of each points inside voxels.
+        """
+        features_ls = [features]
+        # Find distance of x, y, and z from cluster center
+        if self._with_cluster_center:
+            voxel_mean, mean_coors = self.cluster_scatter(features, coors)
+            points_mean = self.map_voxel_center_to_point(
+                coors, voxel_mean, mean_coors)
+            # TODO: maybe also do cluster for reflectivity
+            f_cluster = features[:, :3] - points_mean[:, :3]
+            features_ls.append(f_cluster)
+
+        # Find distance of x, y, and z from pillar center
+        if self._with_voxel_center:
+            f_center = features.new_zeros(size=(features.size(0), 3))
+            f_center[:, 0] = features[:, 0] - (
+                coors[:, 3].type_as(features) * self.vx + self.x_offset)
+            f_center[:, 1] = features[:, 1] - (
+                coors[:, 2].type_as(features) * self.vy + self.y_offset)
+            f_center[:, 2] = features[:, 2] - (
+                coors[:, 1].type_as(features) * self.vz + self.z_offset)
+            features_ls.append(f_center)
+
+        if self._with_distance:
+            points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True)
+            features_ls.append(points_dist)
+
+        # Combine together feature decorations
+        features = torch.cat(features_ls, dim=-1)
+        for i, vfe in enumerate(self.vfe_layers):
+            point_feats = vfe(features)
+            if (i == len(self.vfe_layers) - 1 and self.fusion_layer is not None
+                    and img_feats is not None):
+                point_feats = self.fusion_layer(img_feats, points, point_feats,
+                                                img_metas)
+            voxel_feats, voxel_coors = self.vfe_scatter(point_feats, coors)
+            if i != len(self.vfe_layers) - 1:
+                # need to concat voxel feats if it is not the last vfe
+                feat_per_point = self.map_voxel_center_to_point(
+                    coors, voxel_feats, voxel_coors)
+                features = torch.cat([point_feats, feat_per_point], dim=1)
+
+        if self.return_point_feats:
+            return point_feats
+        return voxel_feats, voxel_coors
+
+
+@MODELS.register_module()
+class HardVFE(nn.Module):
+    """Voxel feature encoder used in DV-SECOND.
+
+    It encodes features of voxels and their points. It could also fuse
+    image feature into voxel features in a point-wise manner.
+
+    Args:
+        in_channels (int, optional): Input channels of VFE. Defaults to 4.
+        feat_channels (list(int), optional): Channels of features in VFE.
+        with_distance (bool, optional): Whether to use the L2 distance
+            of points to the origin point. Defaults to False.
+        with_cluster_center (bool, optional): Whether to use the distance
+            to cluster center of points inside a voxel. Defaults to False.
+        with_voxel_center (bool, optional): Whether to use the distance to
+            center of voxel for each points inside a voxel. Defaults to False.
+        voxel_size (tuple[float], optional): Size of a single voxel.
+            Defaults to (0.2, 0.2, 4).
+        point_cloud_range (tuple[float], optional): The range of points
+            or voxels. Defaults to (0, -40, -3, 70.4, 40, 1).
+        norm_cfg (dict, optional): Config dict of normalization layers.
+        mode (str, optional): The mode when pooling features of points inside a
+            voxel. Available options include 'max' and 'avg'.
+            Defaults to 'max'.
+        fusion_layer (dict, optional): The config dict of fusion layer
+            used in multi-modal detectors. Defaults to None.
+        return_point_feats (bool, optional): Whether to return the
+            features of each points. Defaults to False.
+    """
+
+    def __init__(self,
+                 in_channels: int = 4,
+                 feat_channels: list = [],
+                 with_distance: bool = False,
+                 with_cluster_center: bool = False,
+                 with_voxel_center: bool = False,
+                 voxel_size: Tuple[float] = (0.2, 0.2, 4),
+                 point_cloud_range: Tuple[float] = (0, -40, -3, 70.4, 40, 1),
+                 norm_cfg: dict = dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 mode: str = 'max',
+                 fusion_layer: dict = None,
+                 return_point_feats: bool = False):
+        super(HardVFE, self).__init__()
+        assert len(feat_channels) > 0
+        if with_cluster_center:
+            in_channels += 3
+        if with_voxel_center:
+            in_channels += 3
+        if with_distance:
+            in_channels += 1
+        self.in_channels = in_channels
+        self._with_distance = with_distance
+        self._with_cluster_center = with_cluster_center
+        self._with_voxel_center = with_voxel_center
+        self.return_point_feats = return_point_feats
+
+        # Need pillar (voxel) size and x/y offset to calculate pillar offset
+        self.vx = voxel_size[0]
+        self.vy = voxel_size[1]
+        self.vz = voxel_size[2]
+        self.x_offset = self.vx / 2 + point_cloud_range[0]
+        self.y_offset = self.vy / 2 + point_cloud_range[1]
+        self.z_offset = self.vz / 2 + point_cloud_range[2]
+        self.point_cloud_range = point_cloud_range
+
+        feat_channels = [self.in_channels] + list(feat_channels)
+        vfe_layers = []
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
+            if i > 0:
+                in_filters *= 2
+            # TODO: pass norm_cfg to VFE
+            # norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
+            if i == (len(feat_channels) - 2):
+                cat_max = False
+                max_out = True
+                if fusion_layer:
+                    max_out = False
+            else:
+                max_out = True
+                cat_max = True
+            vfe_layers.append(
+                VFELayer(
+                    in_filters,
+                    out_filters,
+                    norm_cfg=norm_cfg,
+                    max_out=max_out,
+                    cat_max=cat_max))
+            self.vfe_layers = nn.ModuleList(vfe_layers)
+        self.num_vfe = len(vfe_layers)
+
+        self.fusion_layer = None
+        if fusion_layer is not None:
+            self.fusion_layer = MODELS.build(fusion_layer)
+
+    def forward(self,
+                features: Tensor,
+                num_points: Tensor,
+                coors: Tensor,
+                img_feats: Optional[Sequence[Tensor]] = None,
+                img_metas: Optional[dict] = None,
+                *args,
+                **kwargs) -> tuple:
+        """Forward functions.
+
+        Args:
+            features (torch.Tensor): Features of voxels, shape is MxNxC.
+            num_points (torch.Tensor): Number of points in each voxel.
+            coors (torch.Tensor): Coordinates of voxels, shape is Mx(1+NDim).
+            img_feats (list[torch.Tensor], optional): Image features used for
+                multi-modality fusion. Defaults to None.
+            img_metas (dict, optional): [description]. Defaults to None.
+
+        Returns:
+            tuple: If `return_point_feats` is False, returns voxel features and
+                its coordinates. If `return_point_feats` is True, returns
+                feature of each points inside voxels.
+        """
+        features_ls = [features]
+        # Find distance of x, y, and z from cluster center
+        if self._with_cluster_center:
+            points_mean = (
+                features[:, :, :3].sum(dim=1, keepdim=True) /
+                num_points.type_as(features).view(-1, 1, 1))
+            # TODO: maybe also do cluster for reflectivity
+            f_cluster = features[:, :, :3] - points_mean
+            features_ls.append(f_cluster)
+
+        # Find distance of x, y, and z from pillar center
+        if self._with_voxel_center:
+            f_center = features.new_zeros(
+                size=(features.size(0), features.size(1), 3))
+            f_center[:, :, 0] = features[:, :, 0] - (
+                coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
+                self.x_offset)
+            f_center[:, :, 1] = features[:, :, 1] - (
+                coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
+                self.y_offset)
+            f_center[:, :, 2] = features[:, :, 2] - (
+                coors[:, 1].type_as(features).unsqueeze(1) * self.vz +
+                self.z_offset)
+            features_ls.append(f_center)
+
+        if self._with_distance:
+            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
+            features_ls.append(points_dist)
+
+        # Combine together feature decorations
+        voxel_feats = torch.cat(features_ls, dim=-1)
+        # The feature decorations were calculated without regard to whether
+        # pillar was empty.
+        # Need to ensure that empty voxels remain set to zeros.
+        voxel_count = voxel_feats.shape[1]
+        mask = get_paddings_indicator(num_points, voxel_count, axis=0)
+        voxel_feats *= mask.unsqueeze(-1).type_as(voxel_feats)
+
+        for i, vfe in enumerate(self.vfe_layers):
+            voxel_feats = vfe(voxel_feats)
+
+        if (self.fusion_layer is not None and img_feats is not None):
+            voxel_feats = self.fusion_with_mask(features, mask, voxel_feats,
+                                                coors, img_feats, img_metas)
+
+        return voxel_feats
+
+    def fusion_with_mask(self, features: Tensor, mask: Tensor,
+                         voxel_feats: Tensor, coors: Tensor,
+                         img_feats: Sequence[Tensor],
+                         img_metas: Sequence[dict]) -> Tensor:
+        """Fuse image and point features with mask.
+
+        Args:
+            features (torch.Tensor): Features of voxel, usually it is the
+                values of points in voxels.
+            mask (torch.Tensor): Mask indicates valid features in each voxel.
+            voxel_feats (torch.Tensor): Features of voxels.
+            coors (torch.Tensor): Coordinates of each single voxel.
+            img_feats (list[torch.Tensor]): Multi-scale feature maps of image.
+            img_metas (list(dict)): Meta information of image and points.
+
+        Returns:
+            torch.Tensor: Fused features of each voxel.
+        """
+        # the features is consist of a batch of points
+        batch_size = coors[-1, 0] + 1
+        points = []
+        for i in range(batch_size):
+            single_mask = (coors[:, 0] == i)
+            points.append(features[single_mask][mask[single_mask]])
+
+        point_feats = voxel_feats[mask]
+        point_feats = self.fusion_layer(img_feats, points, point_feats,
+                                        img_metas)
+
+        voxel_canvas = voxel_feats.new_zeros(
+            size=(voxel_feats.size(0), voxel_feats.size(1),
+                  point_feats.size(-1)))
+        voxel_canvas[mask] = point_feats
+        out = torch.max(voxel_canvas, dim=1)[0]
+
+        return out
+
+
+@MODELS.register_module()
+class SegVFE(nn.Module):
+    """Voxel feature encoder used in segmentation task.
+
+    It encodes features of voxels and their points. It could also fuse
+    image feature into voxel features in a point-wise manner.
+    The number of points inside the voxel varies.
+
+    Args:
+        in_channels (int): Input channels of VFE. Defaults to 6.
+        feat_channels (list(int)): Channels of features in VFE.
+        with_voxel_center (bool): Whether to use the distance
+            to center of voxel for each points inside a voxel.
+            Defaults to False.
+        voxel_size (tuple[float]): Size of a single voxel (rho, phi, z).
+            Defaults to None.
+        grid_shape (tuple[float]): The grid shape of voxelization.
+            Defaults to (480, 360, 32).
+        point_cloud_range (tuple[float]): The range of points or voxels.
+            Defaults to (0, -3.14159265359, -4, 50, 3.14159265359, 2).
+        norm_cfg (dict): Config dict of normalization layers.
+        mode (str): The mode when pooling features of points
+            inside a voxel. Available options include 'max' and 'avg'.
+            Defaults to 'max'.
+        with_pre_norm (bool): Whether to use the norm layer before
+            input vfe layer.
+        feat_compression (int, optional): The voxel feature compression
+            channels, Defaults to None
+        return_point_feats (bool): Whether to return the features
+            of each points. Defaults to False.
+    """
+
+    def __init__(self,
+                 in_channels: int = 6,
+                 feat_channels: Sequence[int] = [],
+                 with_voxel_center: bool = False,
+                 voxel_size: Optional[Sequence[float]] = None,
+                 grid_shape: Sequence[float] = (480, 360, 32),
+                 point_cloud_range: Sequence[float] = (0, -3.14159265359, -4,
+                                                       50, 3.14159265359, 2),
+                 norm_cfg: dict = dict(type='BN1d', eps=1e-5, momentum=0.1),
+                 mode: bool = 'max',
+                 with_pre_norm: bool = True,
+                 feat_compression: Optional[int] = None,
+                 return_point_feats: bool = False) -> None:
+        super(SegVFE, self).__init__()
+        assert mode in ['avg', 'max']
+        assert len(feat_channels) > 0
+        assert not (voxel_size and grid_shape), \
+            'voxel_size and grid_shape cannot be setting at the same time'
+        if with_voxel_center:
+            in_channels += 3
+        self.in_channels = in_channels
+        self._with_voxel_center = with_voxel_center
+        self.return_point_feats = return_point_feats
+
+        self.point_cloud_range = point_cloud_range
+        point_cloud_range = torch.tensor(
+            point_cloud_range, dtype=torch.float32)
+        if voxel_size:
+            self.voxel_size = voxel_size
+            voxel_size = torch.tensor(voxel_size, dtype=torch.float32)
+            grid_shape = (point_cloud_range[3:] -
+                          point_cloud_range[:3]) / voxel_size
+            grid_shape = torch.round(grid_shape).long().tolist()
+            self.grid_shape = grid_shape
+        elif grid_shape:
+            grid_shape = torch.tensor(grid_shape, dtype=torch.float32)
+            voxel_size = (point_cloud_range[3:] - point_cloud_range[:3]) / (
+                grid_shape - 1)
+            voxel_size = voxel_size.tolist()
+            self.voxel_size = voxel_size
+        else:
+            raise ValueError('must assign a value to voxel_size or grid_shape')
+
+        # Need pillar (voxel) size and x/y offset in order to calculate offset
+        self.vx = self.voxel_size[0]
+        self.vy = self.voxel_size[1]
+        self.vz = self.voxel_size[2]
+        self.x_offset = self.vx / 2 + point_cloud_range[0]
+        self.y_offset = self.vy / 2 + point_cloud_range[1]
+        self.z_offset = self.vz / 2 + point_cloud_range[2]
+
+        feat_channels = [self.in_channels] + list(feat_channels)
+        if with_pre_norm:
+            self.pre_norm = build_norm_layer(norm_cfg, self.in_channels)[1]
+        vfe_layers = []
+        for i in range(len(feat_channels) - 1):
+            in_filters = feat_channels[i]
+            out_filters = feat_channels[i + 1]
+            norm_layer = build_norm_layer(norm_cfg, out_filters)[1]
+            if i == len(feat_channels) - 2:
+                vfe_layers.append(nn.Linear(in_filters, out_filters))
+            else:
+                vfe_layers.append(
+                    nn.Sequential(
+                        nn.Linear(in_filters, out_filters), norm_layer,
+                        nn.ReLU(inplace=True)))
+        self.vfe_layers = nn.ModuleList(vfe_layers)
+        self.vfe_scatter = DynamicScatter(self.voxel_size,
+                                          self.point_cloud_range,
+                                          (mode != 'max'))
+        self.compression_layers = None
+        if feat_compression is not None:
+            self.compression_layers = nn.Sequential(
+                nn.Linear(feat_channels[-1], feat_compression), nn.ReLU())
+
+    def forward(self, features: Tensor, coors: Tensor, *args,
+                **kwargs) -> Tuple[Tensor]:
+        """Forward functions.
+
+        Args:
+            features (Tensor): Features of voxels, shape is NxC.
+            coors (Tensor): Coordinates of voxels, shape is  Nx(1+NDim).
+
+        Returns:
+            tuple: If `return_point_feats` is False, returns voxel features and
+                its coordinates. If `return_point_feats` is True, returns
+                feature of each points inside voxels additionally.
+        """
+        features_ls = [features]
+
+        # Find distance of x, y, and z from voxel center
+        if self._with_voxel_center:
+            f_center = features.new_zeros(size=(features.size(0), 3))
+            f_center[:, 0] = features[:, 0] - (
+                coors[:, 1].type_as(features) * self.vx + self.x_offset)
+            f_center[:, 1] = features[:, 1] - (
+                coors[:, 2].type_as(features) * self.vy + self.y_offset)
+            f_center[:, 2] = features[:, 2] - (
+                coors[:, 3].type_as(features) * self.vz + self.z_offset)
+            features_ls.append(f_center)
+
+        # Combine together feature decorations
+        features = torch.cat(features_ls[::-1], dim=-1)
+        if self.pre_norm is not None:
+            features = self.pre_norm(features)
+
+        point_feats = []
+        for vfe in self.vfe_layers:
+            features = vfe(features)
+            point_feats.append(features)
+        voxel_feats, voxel_coors = self.vfe_scatter(features, coors)
+
+        if self.compression_layers is not None:
+            voxel_feats = self.compression_layers(voxel_feats)
+
+        if self.return_point_feats:
+            return voxel_feats, voxel_coors, point_feats
+        return voxel_feats, voxel_coors
diff --git a/mmde/mmdet3d/registry.py b/mmde/mmdet3d/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..0278a765e150d86b51a1f576f617ddd9b91793f4
--- /dev/null
+++ b/mmde/mmdet3d/registry.py
@@ -0,0 +1,141 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""MMDetection3D provides 17 registry nodes to support using modules across
+projects. Each node is a child of the root registry in MMEngine.
+
+More details can be found at
+https://mmengine.readthedocs.io/en/latest/tutorials/registry.html.
+"""
+
+from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS
+from mmengine.registry import DATASETS as MMENGINE_DATASETS
+from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR
+from mmengine.registry import HOOKS as MMENGINE_HOOKS
+from mmengine.registry import INFERENCERS as MMENGINE_INFERENCERS
+from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS
+from mmengine.registry import LOOPS as MMENGINE_LOOPS
+from mmengine.registry import METRICS as MMENGINE_METRICS
+from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS
+from mmengine.registry import MODELS as MMENGINE_MODELS
+from mmengine.registry import \
+    OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS
+from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS
+from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS
+from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS
+from mmengine.registry import \
+    RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS
+from mmengine.registry import RUNNERS as MMENGINE_RUNNERS
+from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS
+from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS
+from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS
+from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS
+from mmengine.registry import \
+    WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS
+from mmengine.registry import Registry
+
+# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner`
+RUNNERS = Registry(
+    # TODO: update the location when mmdet3d has its own runner
+    'runner',
+    parent=MMENGINE_RUNNERS,
+    locations=['mmdet3d.engine'])
+# manage runner constructors that define how to initialize runners
+RUNNER_CONSTRUCTORS = Registry(
+    'runner constructor',
+    parent=MMENGINE_RUNNER_CONSTRUCTORS,
+    # TODO: update the location when mmdet3d has its own runner
+    locations=['mmdet3d.engine'])
+# manage all kinds of loops like `EpochBasedTrainLoop`
+LOOPS = Registry(
+    # TODO: update the location when mmdet3d has its own loop
+    'loop',
+    parent=MMENGINE_LOOPS,
+    locations=['mmdet3d.engine'])
+# manage all kinds of hooks like `CheckpointHook`
+HOOKS = Registry(
+    'hook', parent=MMENGINE_HOOKS, locations=['mmdet3d.engine.hooks'])
+
+# manage data-related modules
+DATASETS = Registry(
+    'dataset', parent=MMENGINE_DATASETS, locations=['mmdet3d.datasets'])
+DATA_SAMPLERS = Registry(
+    'data sampler',
+    parent=MMENGINE_DATA_SAMPLERS,
+    # TODO: update the location when mmdet3d has its own data sampler
+    locations=['mmdet3d.datasets'])
+TRANSFORMS = Registry(
+    'transform',
+    parent=MMENGINE_TRANSFORMS,
+    locations=['mmdet3d.datasets.transforms'])
+
+# mangage all kinds of modules inheriting `nn.Module`
+MODELS = Registry(
+    'model', parent=MMENGINE_MODELS, locations=['mmdet3d.models'])
+# mangage all kinds of model wrappers like 'MMDistributedDataParallel'
+MODEL_WRAPPERS = Registry(
+    'model_wrapper',
+    parent=MMENGINE_MODEL_WRAPPERS,
+    locations=['mmdet3d.models'])
+# mangage all kinds of weight initialization modules like `Uniform`
+WEIGHT_INITIALIZERS = Registry(
+    'weight initializer',
+    parent=MMENGINE_WEIGHT_INITIALIZERS,
+    locations=['mmdet3d.models'])
+
+# mangage all kinds of optimizers like `SGD` and `Adam`
+OPTIMIZERS = Registry(
+    'optimizer',
+    parent=MMENGINE_OPTIMIZERS,
+    # TODO: update the location when mmdet3d has its own optimizer
+    locations=['mmdet3d.engine'])
+# manage optimizer wrapper
+OPTIM_WRAPPERS = Registry(
+    'optim wrapper',
+    parent=MMENGINE_OPTIM_WRAPPERS,
+    # TODO: update the location when mmdet3d has its own optimizer
+    locations=['mmdet3d.engine'])
+# manage constructors that customize the optimization hyperparameters.
+OPTIM_WRAPPER_CONSTRUCTORS = Registry(
+    'optimizer wrapper constructor',
+    parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS,
+    # TODO: update the location when mmdet3d has its own optimizer
+    locations=['mmdet3d.engine'])
+# mangage all kinds of parameter schedulers like `MultiStepLR`
+PARAM_SCHEDULERS = Registry(
+    'parameter scheduler',
+    parent=MMENGINE_PARAM_SCHEDULERS,
+    # TODO: update the location when mmdet3d has its own scheduler
+    locations=['mmdet3d.engine'])
+# manage all kinds of metrics
+METRICS = Registry(
+    'metric', parent=MMENGINE_METRICS, locations=['mmdet3d.evaluation'])
+# manage evaluator
+EVALUATOR = Registry(
+    'evaluator', parent=MMENGINE_EVALUATOR, locations=['mmdet3d.evaluation'])
+
+# manage task-specific modules like anchor generators and box coders
+TASK_UTILS = Registry(
+    'task util', parent=MMENGINE_TASK_UTILS, locations=['mmdet3d.models'])
+
+# manage visualizer
+VISUALIZERS = Registry(
+    'visualizer',
+    parent=MMENGINE_VISUALIZERS,
+    locations=['mmdet3d.visualization'])
+# manage visualizer backend
+VISBACKENDS = Registry(
+    'vis_backend',
+    parent=MMENGINE_VISBACKENDS,
+    locations=['mmdet3d.visualization'])
+
+# manage logprocessor
+LOG_PROCESSORS = Registry(
+    'log_processor',
+    parent=MMENGINE_LOG_PROCESSORS,
+    # TODO: update the location when mmdet3d has its own log processor
+    locations=['mmdet3d.engine'])
+
+# manage inferencer
+INFERENCERS = Registry(
+    'inferencer',
+    parent=MMENGINE_INFERENCERS,
+    locations=['mmdet3d.api.inferencers'])
diff --git a/mmde/mmdet3d/structures/__init__.py b/mmde/mmdet3d/structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bb1924f5ecfc26fbf3c824d04d2774db8097b30
--- /dev/null
+++ b/mmde/mmdet3d/structures/__init__.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .bbox_3d import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes,
+                      Coord3DMode, DepthInstance3DBoxes, LiDARInstance3DBoxes,
+                      get_box_type, get_proj_mat_by_coord_type, limit_period,
+                      mono_cam_box2vis, points_cam2img, points_img2cam,
+                      rotation_3d_in_axis, xywhr2xyxyr)
+from .det3d_data_sample import Det3DDataSample
+# yapf: disable
+from .ops import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D,
+                  BboxOverlapsNearest3D, axis_aligned_bbox_overlaps_3d,
+                  bbox3d2result, bbox3d2roi, bbox3d_mapping_back,
+                  bbox_overlaps_3d, bbox_overlaps_nearest_3d,
+                  box2d_to_corner_jit, box3d_to_bbox, box_camera_to_lidar,
+                  boxes3d_to_corners3d_lidar, camera_to_lidar,
+                  center_to_corner_box2d, center_to_corner_box3d,
+                  center_to_minmax_2d, corner_to_standup_nd_jit,
+                  corner_to_surfaces_3d, corner_to_surfaces_3d_jit, corners_nd,
+                  create_anchors_3d_range, depth_to_lidar_points,
+                  depth_to_points, get_frustum, iou_jit, minmax_to_corner_2d,
+                  points_in_convex_polygon_3d_jit,
+                  points_in_convex_polygon_jit, points_in_rbbox,
+                  projection_matrix_to_CRT_kitti, rbbox2d_to_near_bbox,
+                  remove_outside_points, rotation_points_single_angle,
+                  surface_equ_3d)
+# yapf: enable
+from .point_data import PointData
+from .points import BasePoints, CameraPoints, DepthPoints, LiDARPoints
+
+__all__ = [
+    'BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints',
+    'Det3DDataSample', 'PointData', 'Box3DMode', 'BaseInstance3DBoxes',
+    'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'DepthInstance3DBoxes',
+    'xywhr2xyxyr', 'get_box_type', 'rotation_3d_in_axis', 'limit_period',
+    'points_cam2img', 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis',
+    'get_proj_mat_by_coord_type', 'box2d_to_corner_jit', 'box3d_to_bbox',
+    'box_camera_to_lidar', 'boxes3d_to_corners3d_lidar', 'camera_to_lidar',
+    'center_to_corner_box2d', 'center_to_corner_box3d', 'center_to_minmax_2d',
+    'corner_to_standup_nd_jit', 'corner_to_surfaces_3d',
+    'corner_to_surfaces_3d_jit', 'corners_nd', 'create_anchors_3d_range',
+    'depth_to_lidar_points', 'depth_to_points', 'get_frustum', 'iou_jit',
+    'minmax_to_corner_2d', 'points_in_convex_polygon_3d_jit',
+    'points_in_convex_polygon_jit', 'points_in_rbbox',
+    'projection_matrix_to_CRT_kitti', 'rbbox2d_to_near_bbox',
+    'remove_outside_points', 'rotation_points_single_angle', 'surface_equ_3d',
+    'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
+    'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D',
+    'axis_aligned_bbox_overlaps_3d', 'bbox3d_mapping_back', 'bbox3d2roi',
+    'bbox3d2result'
+]
diff --git a/mmde/mmdet3d/structures/bbox_3d/__init__.py b/mmde/mmdet3d/structures/bbox_3d/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..460035a533fbf25b56dd4852b6914f6c80a0e488
--- /dev/null
+++ b/mmde/mmdet3d/structures/bbox_3d/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_box3d import BaseInstance3DBoxes
+from .box_3d_mode import Box3DMode
+from .cam_box3d import CameraInstance3DBoxes
+from .coord_3d_mode import Coord3DMode
+from .depth_box3d import DepthInstance3DBoxes
+from .lidar_box3d import LiDARInstance3DBoxes
+from .utils import (get_box_type, get_proj_mat_by_coord_type, limit_period,
+                    mono_cam_box2vis, points_cam2img, points_img2cam,
+                    rotation_3d_in_axis, xywhr2xyxyr)
+
+__all__ = [
+    'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes',
+    'CameraInstance3DBoxes', 'DepthInstance3DBoxes', 'xywhr2xyxyr',
+    'get_box_type', 'rotation_3d_in_axis', 'limit_period', 'points_cam2img',
+    'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis',
+    'get_proj_mat_by_coord_type'
+]
diff --git a/mmde/mmdet3d/structures/bbox_3d/base_box3d.py b/mmde/mmdet3d/structures/bbox_3d/base_box3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fb703c7317cf0ea476e4d612a8327455bc0971e
--- /dev/null
+++ b/mmde/mmdet3d/structures/bbox_3d/base_box3d.py
@@ -0,0 +1,698 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from abc import abstractmethod
+from typing import Iterator, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+from mmcv.ops import box_iou_rotated, points_in_boxes_all, points_in_boxes_part
+from torch import Tensor
+
+from mmdet3d.structures.points import BasePoints
+from .utils import limit_period
+
+
+class BaseInstance3DBoxes:
+    """Base class for 3D Boxes.
+
+    Note:
+        The box is bottom centered, i.e. the relative position of origin in the
+        box is (0.5, 0.5, 0).
+
+    Args:
+        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The boxes
+            data with shape (N, box_dim).
+        box_dim (int): Number of the dimension of a box. Each row is
+            (x, y, z, x_size, y_size, z_size, yaw). Defaults to 7.
+        with_yaw (bool): Whether the box is with yaw rotation. If False, the
+            value of yaw will be set to 0 as minmax boxes. Defaults to True.
+        origin (Tuple[float]): Relative position of the box origin.
+            Defaults to (0.5, 0.5, 0). This will guide the box be converted to
+            (0.5, 0.5, 0) mode.
+
+    Attributes:
+        tensor (Tensor): Float matrix with shape (N, box_dim).
+        box_dim (int): Integer indicating the dimension of a box. Each row is
+            (x, y, z, x_size, y_size, z_size, yaw, ...).
+        with_yaw (bool): If True, the value of yaw will be set to 0 as minmax
+            boxes.
+    """
+
+    YAW_AXIS: int = 0
+
+    def __init__(
+        self,
+        tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
+        box_dim: int = 7,
+        with_yaw: bool = True,
+        origin: Tuple[float, float, float] = (0.5, 0.5, 0)
+    ) -> None:
+        if isinstance(tensor, Tensor):
+            device = tensor.device
+        else:
+            device = torch.device('cpu')
+        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+        if tensor.numel() == 0:
+            # Use reshape, so we don't end up creating a new tensor that does
+            # not depend on the inputs (and consequently confuses jit)
+            tensor = tensor.reshape((-1, box_dim))
+        assert tensor.dim() == 2 and tensor.size(-1) == box_dim, \
+            ('The box dimension must be 2 and the length of the last '
+             f'dimension must be {box_dim}, but got boxes with shape '
+             f'{tensor.shape}.')
+
+        if tensor.shape[-1] == 6:
+            # If the dimension of boxes is 6, we expand box_dim by padding 0 as
+            # a fake yaw and set with_yaw to False
+            assert box_dim == 6
+            fake_rot = tensor.new_zeros(tensor.shape[0], 1)
+            tensor = torch.cat((tensor, fake_rot), dim=-1)
+            self.box_dim = box_dim + 1
+            self.with_yaw = False
+        else:
+            self.box_dim = box_dim
+            self.with_yaw = with_yaw
+        self.tensor = tensor.clone()
+
+        if origin != (0.5, 0.5, 0):
+            dst = self.tensor.new_tensor((0.5, 0.5, 0))
+            src = self.tensor.new_tensor(origin)
+            self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
+
+    @property
+    def shape(self) -> torch.Size:
+        """torch.Size: Shape of boxes."""
+        return self.tensor.shape
+
+    @property
+    def volume(self) -> Tensor:
+        """Tensor: A vector with volume of each box in shape (N, )."""
+        return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5]
+
+    @property
+    def dims(self) -> Tensor:
+        """Tensor: Size dimensions of each box in shape (N, 3)."""
+        return self.tensor[:, 3:6]
+
+    @property
+    def yaw(self) -> Tensor:
+        """Tensor: A vector with yaw of each box in shape (N, )."""
+        return self.tensor[:, 6]
+
+    @property
+    def height(self) -> Tensor:
+        """Tensor: A vector with height of each box in shape (N, )."""
+        return self.tensor[:, 5]
+
+    @property
+    def top_height(self) -> Tensor:
+        """Tensor: A vector with top height of each box in shape (N, )."""
+        return self.bottom_height + self.height
+
+    @property
+    def bottom_height(self) -> Tensor:
+        """Tensor: A vector with bottom height of each box in shape (N, )."""
+        return self.tensor[:, 2]
+
+    @property
+    def center(self) -> Tensor:
+        """Calculate the center of all the boxes.
+
+        Note:
+            In MMDetection3D's convention, the bottom center is usually taken
+            as the default center.
+
+            The relative position of the centers in different kinds of boxes
+            are different, e.g., the relative center of a boxes is
+            (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. It is
+            recommended to use ``bottom_center`` or ``gravity_center`` for
+            clearer usage.
+
+        Returns:
+            Tensor: A tensor with center of each box in shape (N, 3).
+        """
+        return self.bottom_center
+
+    @property
+    def bottom_center(self) -> Tensor:
+        """Tensor: A tensor with center of each box in shape (N, 3)."""
+        return self.tensor[:, :3]
+
+    @property
+    def gravity_center(self) -> Tensor:
+        """Tensor: A tensor with center of each box in shape (N, 3)."""
+        bottom_center = self.bottom_center
+        gravity_center = torch.zeros_like(bottom_center)
+        gravity_center[:, :2] = bottom_center[:, :2]
+        gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5
+        return gravity_center
+
+    @property
+    def corners(self) -> Tensor:
+        """Tensor: A tensor with 8 corners of each box in shape (N, 8, 3)."""
+        pass
+
+    @property
+    def bev(self) -> Tensor:
+        """Tensor: 2D BEV box of each box with rotation in XYWHR format, in
+        shape (N, 5)."""
+        return self.tensor[:, [0, 1, 3, 4, 6]]
+
+    @property
+    def nearest_bev(self) -> Tensor:
+        """Tensor: A tensor of 2D BEV box of each box without rotation."""
+        # Obtain BEV boxes with rotation in XYWHR format
+        bev_rotated_boxes = self.bev
+        # convert the rotation to a valid range
+        rotations = bev_rotated_boxes[:, -1]
+        normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))
+
+        # find the center of boxes
+        conditions = (normed_rotations > np.pi / 4)[..., None]
+        bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
+                                                                [0, 1, 3, 2]],
+                                  bev_rotated_boxes[:, :4])
+
+        centers = bboxes_xywh[:, :2]
+        dims = bboxes_xywh[:, 2:]
+        bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
+        return bev_boxes
+
+    def in_range_bev(
+            self, box_range: Union[Tensor, np.ndarray,
+                                   Sequence[float]]) -> Tensor:
+        """Check whether the boxes are in the given range.
+
+        Args:
+            box_range (Tensor or np.ndarray or Sequence[float]): The range of
+                box in order of (x_min, y_min, x_max, y_max).
+
+        Note:
+            The original implementation of SECOND checks whether boxes in a
+            range by checking whether the points are in a convex polygon, we
+            reduce the burden for simpler cases.
+
+        Returns:
+            Tensor: A binary vector indicating whether each box is inside the
+            reference range.
+        """
+        in_range_flags = ((self.bev[:, 0] > box_range[0])
+                          & (self.bev[:, 1] > box_range[1])
+                          & (self.bev[:, 0] < box_range[2])
+                          & (self.bev[:, 1] < box_range[3]))
+        return in_range_flags
+
+    @abstractmethod
+    def rotate(
+        self,
+        angle: Union[Tensor, np.ndarray, float],
+        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
+    ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[
+            BasePoints, Tensor], None]:
+        """Rotate boxes with points (optional) with the given angle or rotation
+        matrix.
+
+        Args:
+            angle (Tensor or np.ndarray or float): Rotation angle or rotation
+                matrix.
+            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
+                Points to rotate. Defaults to None.
+
+        Returns:
+            tuple or None: When ``points`` is None, the function returns None,
+            otherwise it returns the rotated points and the rotation matrix
+            ``rot_mat_T``.
+        """
+        pass
+
+    @abstractmethod
+    def flip(
+        self,
+        bev_direction: str = 'horizontal',
+        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
+    ) -> Union[Tensor, np.ndarray, BasePoints, None]:
+        """Flip the boxes in BEV along given BEV direction.
+
+        Args:
+            bev_direction (str): Direction by which to flip. Can be chosen from
+                'horizontal' and 'vertical'. Defaults to 'horizontal'.
+            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
+                Points to flip. Defaults to None.
+
+        Returns:
+            Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points``
+            is None, the function returns None, otherwise it returns the
+            flipped points.
+        """
+        pass
+
+    def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None:
+        """Translate boxes with the given translation vector.
+
+        Args:
+            trans_vector (Tensor or np.ndarray): Translation vector of size
+                1x3.
+        """
+        if not isinstance(trans_vector, Tensor):
+            trans_vector = self.tensor.new_tensor(trans_vector)
+        self.tensor[:, :3] += trans_vector
+
+    def in_range_3d(
+            self, box_range: Union[Tensor, np.ndarray,
+                                   Sequence[float]]) -> Tensor:
+        """Check whether the boxes are in the given range.
+
+        Args:
+            box_range (Tensor or np.ndarray or Sequence[float]): The range of
+                box (x_min, y_min, z_min, x_max, y_max, z_max).
+
+        Note:
+            In the original implementation of SECOND, checking whether a box in
+            the range checks whether the points are in a convex polygon, we try
+            to reduce the burden for simpler cases.
+
+        Returns:
+            Tensor: A binary vector indicating whether each point is inside the
+            reference range.
+        """
+        gravity_center = self.gravity_center
+        in_range_flags = ((gravity_center[:, 0] > box_range[0])
+                          & (gravity_center[:, 1] > box_range[1])
+                          & (gravity_center[:, 2] > box_range[2])
+                          & (gravity_center[:, 0] < box_range[3])
+                          & (gravity_center[:, 1] < box_range[4])
+                          & (gravity_center[:, 2] < box_range[5]))
+        return in_range_flags
+
+    @abstractmethod
+    def convert_to(self,
+                   dst: int,
+                   rt_mat: Optional[Union[Tensor, np.ndarray]] = None,
+                   correct_yaw: bool = False) -> 'BaseInstance3DBoxes':
+        """Convert self to ``dst`` mode.
+
+        Args:
+            dst (int): The target Box mode.
+            rt_mat (Tensor or np.ndarray, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+            correct_yaw (bool): Whether to convert the yaw angle to the target
+                coordinate. Defaults to False.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: The converted box of the same type in
+            the ``dst`` mode.
+        """
+        pass
+
+    def scale(self, scale_factor: float) -> None:
+        """Scale the box with horizontal and vertical scaling factors.
+
+        Args:
+            scale_factors (float): Scale factors to scale the boxes.
+        """
+        self.tensor[:, :6] *= scale_factor
+        self.tensor[:, 7:] *= scale_factor  # velocity
+
+    def limit_yaw(self, offset: float = 0.5, period: float = np.pi) -> None:
+        """Limit the yaw to a given period and offset.
+
+        Args:
+            offset (float): The offset of the yaw. Defaults to 0.5.
+            period (float): The expected period. Defaults to np.pi.
+        """
+        self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period)
+
+    def nonempty(self, threshold: float = 0.0) -> Tensor:
+        """Find boxes that are non-empty.
+
+        A box is considered empty if either of its side is no larger than
+        threshold.
+
+        Args:
+            threshold (float): The threshold of minimal sizes. Defaults to 0.0.
+
+        Returns:
+            Tensor: A binary vector which represents whether each box is empty
+            (False) or non-empty (True).
+        """
+        box = self.tensor
+        size_x = box[..., 3]
+        size_y = box[..., 4]
+        size_z = box[..., 5]
+        keep = ((size_x > threshold)
+                & (size_y > threshold) & (size_z > threshold))
+        return keep
+
+    def __getitem__(
+            self, item: Union[int, slice, np.ndarray,
+                              Tensor]) -> 'BaseInstance3DBoxes':
+        """
+        Args:
+            item (int or slice or np.ndarray or Tensor): Index of boxes.
+
+        Note:
+            The following usage are allowed:
+
+            1. `new_boxes = boxes[3]`: Return a `Boxes` that contains only one
+               box.
+            2. `new_boxes = boxes[2:10]`: Return a slice of boxes.
+            3. `new_boxes = boxes[vector]`: Where vector is a
+               torch.BoolTensor with `length = len(boxes)`. Nonzero elements in
+               the vector will be selected.
+
+            Note that the returned Boxes might share storage with this Boxes,
+            subject to PyTorch's indexing semantics.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: A new object of
+            :class:`BaseInstance3DBoxes` after indexing.
+        """
+        original_type = type(self)
+        if isinstance(item, int):
+            return original_type(
+                self.tensor[item].view(1, -1),
+                box_dim=self.box_dim,
+                with_yaw=self.with_yaw)
+        b = self.tensor[item]
+        assert b.dim() == 2, \
+            f'Indexing on Boxes with {item} failed to return a matrix!'
+        return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw)
+
+    def __len__(self) -> int:
+        """int: Number of boxes in the current object."""
+        return self.tensor.shape[0]
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the object."""
+        return self.__class__.__name__ + '(\n    ' + str(self.tensor) + ')'
+
+    @classmethod
+    def cat(cls, boxes_list: Sequence['BaseInstance3DBoxes']
+            ) -> 'BaseInstance3DBoxes':
+        """Concatenate a list of Boxes into a single Boxes.
+
+        Args:
+            boxes_list (Sequence[:obj:`BaseInstance3DBoxes`]): List of boxes.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: The concatenated boxes.
+        """
+        assert isinstance(boxes_list, (list, tuple))
+        if len(boxes_list) == 0:
+            return cls(torch.empty(0))
+        assert all(isinstance(box, cls) for box in boxes_list)
+
+        # use torch.cat (v.s. layers.cat)
+        # so the returned boxes never share storage with input
+        cat_boxes = cls(
+            torch.cat([b.tensor for b in boxes_list], dim=0),
+            box_dim=boxes_list[0].box_dim,
+            with_yaw=boxes_list[0].with_yaw)
+        return cat_boxes
+
+    def numpy(self) -> np.ndarray:
+        """Reload ``numpy`` from self.tensor."""
+        return self.tensor.numpy()
+
+    def to(self, device: Union[str, torch.device], *args,
+           **kwargs) -> 'BaseInstance3DBoxes':
+        """Convert current boxes to a specific device.
+
+        Args:
+            device (str or :obj:`torch.device`): The name of the device.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: A new boxes object on the specific
+            device.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.to(device, *args, **kwargs),
+            box_dim=self.box_dim,
+            with_yaw=self.with_yaw)
+
+    def cpu(self) -> 'BaseInstance3DBoxes':
+        """Convert current boxes to cpu device.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: A new boxes object on the cpu device.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.cpu(), box_dim=self.box_dim, with_yaw=self.with_yaw)
+
+    def cuda(self, *args, **kwargs) -> 'BaseInstance3DBoxes':
+        """Convert current boxes to cuda device.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: A new boxes object on the cuda device.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.cuda(*args, **kwargs),
+            box_dim=self.box_dim,
+            with_yaw=self.with_yaw)
+
+    def clone(self) -> 'BaseInstance3DBoxes':
+        """Clone the boxes.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: Box object with the same properties as
+            self.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw)
+
+    def detach(self) -> 'BaseInstance3DBoxes':
+        """Detach the boxes.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: Box object with the same properties as
+            self.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.detach(), box_dim=self.box_dim, with_yaw=self.with_yaw)
+
+    @property
+    def device(self) -> torch.device:
+        """torch.device: The device of the boxes are on."""
+        return self.tensor.device
+
+    def __iter__(self) -> Iterator[Tensor]:
+        """Yield a box as a Tensor at a time.
+
+        Returns:
+            Iterator[Tensor]: A box of shape (box_dim, ).
+        """
+        yield from self.tensor
+
+    @classmethod
+    def height_overlaps(cls, boxes1: 'BaseInstance3DBoxes',
+                        boxes2: 'BaseInstance3DBoxes') -> Tensor:
+        """Calculate height overlaps of two boxes.
+
+        Note:
+            This function calculates the height overlaps between ``boxes1`` and
+            ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type.
+
+        Args:
+            boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes.
+            boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes.
+
+        Returns:
+            Tensor: Calculated height overlap of the boxes.
+        """
+        assert isinstance(boxes1, BaseInstance3DBoxes)
+        assert isinstance(boxes2, BaseInstance3DBoxes)
+        assert type(boxes1) == type(boxes2), \
+            '"boxes1" and "boxes2" should be in the same type, ' \
+            f'but got {type(boxes1)} and {type(boxes2)}.'
+
+        boxes1_top_height = boxes1.top_height.view(-1, 1)
+        boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
+        boxes2_top_height = boxes2.top_height.view(1, -1)
+        boxes2_bottom_height = boxes2.bottom_height.view(1, -1)
+
+        heighest_of_bottom = torch.max(boxes1_bottom_height,
+                                       boxes2_bottom_height)
+        lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height)
+        overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0)
+        return overlaps_h
+
+    @classmethod
+    def overlaps(cls,
+                 boxes1: 'BaseInstance3DBoxes',
+                 boxes2: 'BaseInstance3DBoxes',
+                 mode: str = 'iou') -> Tensor:
+        """Calculate 3D overlaps of two boxes.
+
+        Note:
+            This function calculates the overlaps between ``boxes1`` and
+            ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type.
+
+        Args:
+            boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes.
+            boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes.
+            mode (str): Mode of iou calculation. Defaults to 'iou'.
+
+        Returns:
+            Tensor: Calculated 3D overlap of the boxes.
+        """
+        assert isinstance(boxes1, BaseInstance3DBoxes)
+        assert isinstance(boxes2, BaseInstance3DBoxes)
+        assert type(boxes1) == type(boxes2), \
+            '"boxes1" and "boxes2" should be in the same type, ' \
+            f'but got {type(boxes1)} and {type(boxes2)}.'
+
+        assert mode in ['iou', 'iof']
+
+        rows = len(boxes1)
+        cols = len(boxes2)
+        if rows * cols == 0:
+            return boxes1.tensor.new(rows, cols)
+
+        # height overlap
+        overlaps_h = cls.height_overlaps(boxes1, boxes2)
+
+        # Restrict the min values of W and H to avoid memory overflow in
+        # ``box_iou_rotated``.
+        boxes1_bev, boxes2_bev = boxes1.bev, boxes2.bev
+        boxes1_bev[:, 2:4] = boxes1_bev[:, 2:4].clamp(min=1e-4)
+        boxes2_bev[:, 2:4] = boxes2_bev[:, 2:4].clamp(min=1e-4)
+
+        # bev overlap
+        iou2d = box_iou_rotated(boxes1_bev, boxes2_bev)
+        areas1 = (boxes1_bev[:, 2] * boxes1_bev[:, 3]).unsqueeze(1).expand(
+            rows, cols)
+        areas2 = (boxes2_bev[:, 2] * boxes2_bev[:, 3]).unsqueeze(0).expand(
+            rows, cols)
+        overlaps_bev = iou2d * (areas1 + areas2) / (1 + iou2d)
+
+        # 3d overlaps
+        overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h
+
+        volume1 = boxes1.volume.view(-1, 1)
+        volume2 = boxes2.volume.view(1, -1)
+
+        if mode == 'iou':
+            # the clamp func is used to avoid division of 0
+            iou3d = overlaps_3d / torch.clamp(
+                volume1 + volume2 - overlaps_3d, min=1e-8)
+        else:
+            iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8)
+
+        return iou3d
+
+    def new_box(
+        self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]]
+    ) -> 'BaseInstance3DBoxes':
+        """Create a new box object with data.
+
+        The new box and its tensor has the similar properties as self and
+        self.tensor, respectively.
+
+        Args:
+            data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to
+                be copied.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, the
+            object's other properties are similar to ``self``.
+        """
+        new_tensor = self.tensor.new_tensor(data) \
+            if not isinstance(data, Tensor) else data.to(self.device)
+        original_type = type(self)
+        return original_type(
+            new_tensor, box_dim=self.box_dim, with_yaw=self.with_yaw)
+
+    def points_in_boxes_part(
+            self,
+            points: Tensor,
+            boxes_override: Optional[Tensor] = None) -> Tensor:
+        """Find the box in which each point is.
+
+        Args:
+            points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions
+                are (x, y, z) in LiDAR or depth coordinate.
+            boxes_override (Tensor, optional): Boxes to override `self.tensor`.
+                Defaults to None.
+
+        Note:
+            If a point is enclosed by multiple boxes, the index of the first
+            box will be returned.
+
+        Returns:
+            Tensor: The index of the first box that each point is in with shape
+            (M, ). Default value is -1 (if the point is not enclosed by any
+            box).
+        """
+        if boxes_override is not None:
+            boxes = boxes_override
+        else:
+            boxes = self.tensor
+
+        points_clone = points.clone()[..., :3]
+        if points_clone.dim() == 2:
+            points_clone = points_clone.unsqueeze(0)
+        else:
+            assert points_clone.dim() == 3 and points_clone.shape[0] == 1
+
+        boxes = boxes.to(points_clone.device).unsqueeze(0)
+        box_idx = points_in_boxes_part(points_clone, boxes)
+
+        return box_idx.squeeze(0)
+
+    def points_in_boxes_all(self,
+                            points: Tensor,
+                            boxes_override: Optional[Tensor] = None) -> Tensor:
+        """Find all boxes in which each point is.
+
+        Args:
+            points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions
+                are (x, y, z) in LiDAR or depth coordinate.
+            boxes_override (Tensor, optional): Boxes to override `self.tensor`.
+                Defaults to None.
+
+        Returns:
+            Tensor: A tensor indicating whether a point is in a box with shape
+            (M, T). T is the number of boxes. Denote this tensor as A, it the
+            m^th point is in the t^th box, then `A[m, t] == 1`, otherwise
+            `A[m, t] == 0`.
+        """
+        if boxes_override is not None:
+            boxes = boxes_override
+        else:
+            boxes = self.tensor
+
+        points_clone = points.clone()[..., :3]
+        if points_clone.dim() == 2:
+            points_clone = points_clone.unsqueeze(0)
+        else:
+            assert points_clone.dim() == 3 and points_clone.shape[0] == 1
+
+        boxes = boxes.to(points_clone.device).unsqueeze(0)
+        box_idxs_of_pts = points_in_boxes_all(points_clone, boxes)
+
+        return box_idxs_of_pts.squeeze(0)
+
+    def points_in_boxes(self,
+                        points: Tensor,
+                        boxes_override: Optional[Tensor] = None) -> Tensor:
+        warnings.warn('DeprecationWarning: points_in_boxes is a deprecated '
+                      'method, please consider using points_in_boxes_part.')
+        return self.points_in_boxes_part(points, boxes_override)
+
+    def points_in_boxes_batch(
+            self,
+            points: Tensor,
+            boxes_override: Optional[Tensor] = None) -> Tensor:
+        warnings.warn('DeprecationWarning: points_in_boxes_batch is a '
+                      'deprecated method, please consider using '
+                      'points_in_boxes_all.')
+        return self.points_in_boxes_all(points, boxes_override)
diff --git a/mmde/mmdet3d/structures/bbox_3d/box_3d_mode.py b/mmde/mmdet3d/structures/bbox_3d/box_3d_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd0977ee7af59d9bd03a451a4b5460b91411dc92
--- /dev/null
+++ b/mmde/mmdet3d/structures/bbox_3d/box_3d_mode.py
@@ -0,0 +1,267 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from enum import IntEnum, unique
+from typing import Optional, Sequence, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from .base_box3d import BaseInstance3DBoxes
+from .cam_box3d import CameraInstance3DBoxes
+from .depth_box3d import DepthInstance3DBoxes
+from .lidar_box3d import LiDARInstance3DBoxes
+from .utils import limit_period
+
+
+@unique
+class Box3DMode(IntEnum):
+    """Enum of different ways to represent a box.
+
+    Coordinates in LiDAR:
+
+    .. code-block:: none
+
+                    up z
+                       ^   x front
+                       |  /
+                       | /
+        left y <------ 0
+
+    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
+    and the yaw is around the z axis, thus the rotation axis=2.
+
+    Coordinates in Camera:
+
+    .. code-block:: none
+
+                z front
+               /
+              /
+             0 ------> x right
+             |
+             |
+             v
+        down y
+
+    The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
+    and the yaw is around the y axis, thus the rotation axis=1.
+
+    Coordinates in Depth:
+
+    .. code-block:: none
+
+        up z
+           ^   y front
+           |  /
+           | /
+           0 ------> x right
+
+    The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
+    and the yaw is around the z axis, thus the rotation axis=2.
+    """
+
+    LIDAR = 0
+    CAM = 1
+    DEPTH = 2
+
+    @staticmethod
+    def convert(
+        box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes],
+        src: 'Box3DMode',
+        dst: 'Box3DMode',
+        rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
+        with_yaw: bool = True,
+        correct_yaw: bool = False
+    ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]:
+        """Convert boxes from ``src`` mode to ``dst`` mode.
+
+        Args:
+            box (Sequence[float] or np.ndarray or Tensor or
+                :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk
+                array/tensor.
+            src (:obj:`Box3DMode`): The source box mode.
+            dst (:obj:`Box3DMode`): The target box mode.
+            rt_mat (np.ndarray or Tensor, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+            with_yaw (bool): If ``box`` is an instance of
+                :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
+                Defaults to True.
+            correct_yaw (bool): If the yaw is rotated by rt_mat.
+                Defaults to False.
+
+        Returns:
+            Sequence[float] or np.ndarray or Tensor or
+            :obj:`BaseInstance3DBoxes`: The converted box of the same type.
+        """
+        if src == dst:
+            return box
+
+        is_numpy = isinstance(box, np.ndarray)
+        is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes)
+        single_box = isinstance(box, (list, tuple))
+        if single_box:
+            assert len(box) >= 7, (
+                'Box3DMode.convert takes either a k-tuple/list or '
+                'an Nxk array/tensor, where k >= 7')
+            arr = torch.tensor(box)[None, :]
+        else:
+            # avoid modifying the input box
+            if is_numpy:
+                arr = torch.from_numpy(np.asarray(box)).clone()
+            elif is_Instance3DBoxes:
+                arr = box.tensor.clone()
+            else:
+                arr = box.clone()
+
+        if is_Instance3DBoxes:
+            with_yaw = box.with_yaw
+
+        # convert box from `src` mode to `dst` mode.
+        x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6]
+        if with_yaw:
+            yaw = arr[..., 6:7]
+        if src == Box3DMode.LIDAR and dst == Box3DMode.CAM:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
+            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
+            if with_yaw:
+                if correct_yaw:
+                    yaw_vector = torch.cat([
+                        torch.cos(yaw),
+                        torch.sin(yaw),
+                        torch.zeros_like(yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = -yaw - np.pi / 2
+                    yaw = limit_period(yaw, period=np.pi * 2)
+        elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
+            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
+            if with_yaw:
+                if correct_yaw:
+                    yaw_vector = torch.cat([
+                        torch.cos(-yaw),
+                        torch.zeros_like(yaw),
+                        torch.sin(-yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = -yaw - np.pi / 2
+                    yaw = limit_period(yaw, period=np.pi * 2)
+        elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
+            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
+            if with_yaw:
+                if correct_yaw:
+                    yaw_vector = torch.cat([
+                        torch.cos(yaw),
+                        torch.sin(yaw),
+                        torch.zeros_like(yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = -yaw
+        elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
+            xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
+            if with_yaw:
+                if correct_yaw:
+                    yaw_vector = torch.cat([
+                        torch.cos(-yaw),
+                        torch.zeros_like(yaw),
+                        torch.sin(-yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = -yaw
+        elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
+            xyz_size = torch.cat([x_size, y_size, z_size], dim=-1)
+            if with_yaw:
+                if correct_yaw:
+                    yaw_vector = torch.cat([
+                        torch.cos(yaw),
+                        torch.sin(yaw),
+                        torch.zeros_like(yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = yaw + np.pi / 2
+                    yaw = limit_period(yaw, period=np.pi * 2)
+        elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
+            xyz_size = torch.cat([x_size, y_size, z_size], dim=-1)
+            if with_yaw:
+                if correct_yaw:
+                    yaw_vector = torch.cat([
+                        torch.cos(yaw),
+                        torch.sin(yaw),
+                        torch.zeros_like(yaw)
+                    ],
+                                           dim=1)
+                else:
+                    yaw = yaw - np.pi / 2
+                    yaw = limit_period(yaw, period=np.pi * 2)
+        else:
+            raise NotImplementedError(
+                f'Conversion from Box3DMode {src} to {dst} '
+                'is not supported yet')
+
+        if not isinstance(rt_mat, Tensor):
+            rt_mat = arr.new_tensor(rt_mat)
+        if rt_mat.size(1) == 4:
+            extended_xyz = torch.cat(
+                [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1)
+            xyz = extended_xyz @ rt_mat.t()
+        else:
+            xyz = arr[..., :3] @ rt_mat.t()
+
+        # Note: we only use rotation in rt_mat
+        # so don't need to extend yaw_vector
+        if with_yaw and correct_yaw:
+            rot_yaw_vector = yaw_vector @ rt_mat[:3, :3].t()
+            if dst == Box3DMode.CAM:
+                yaw = torch.atan2(-rot_yaw_vector[:, [2]], rot_yaw_vector[:,
+                                                                          [0]])
+            elif dst in [Box3DMode.LIDAR, Box3DMode.DEPTH]:
+                yaw = torch.atan2(rot_yaw_vector[:, [1]], rot_yaw_vector[:,
+                                                                         [0]])
+            yaw = limit_period(yaw, period=np.pi * 2)
+
+        if with_yaw:
+            remains = arr[..., 7:]
+            arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1)
+        else:
+            remains = arr[..., 6:]
+            arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1)
+
+        # convert arr to the original type
+        original_type = type(box)
+        if single_box:
+            return original_type(arr.flatten().tolist())
+        if is_numpy:
+            return arr.numpy()
+        elif is_Instance3DBoxes:
+            if dst == Box3DMode.CAM:
+                target_type = CameraInstance3DBoxes
+            elif dst == Box3DMode.LIDAR:
+                target_type = LiDARInstance3DBoxes
+            elif dst == Box3DMode.DEPTH:
+                target_type = DepthInstance3DBoxes
+            else:
+                raise NotImplementedError(
+                    f'Conversion to {dst} through {original_type} '
+                    'is not supported yet')
+            return target_type(arr, box_dim=arr.size(-1), with_yaw=with_yaw)
+        else:
+            return arr
diff --git a/mmde/mmdet3d/structures/bbox_3d/cam_box3d.py b/mmde/mmdet3d/structures/bbox_3d/cam_box3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9d6d5c1ab38fee8354983dd6c1bb2361093fa45
--- /dev/null
+++ b/mmde/mmdet3d/structures/bbox_3d/cam_box3d.py
@@ -0,0 +1,403 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet3d.structures.points import BasePoints
+from .base_box3d import BaseInstance3DBoxes
+from .utils import rotation_3d_in_axis, yaw2local
+
+
+class CameraInstance3DBoxes(BaseInstance3DBoxes):
+    """3D boxes of instances in CAM coordinates.
+
+    Coordinates in Camera:
+
+    .. code-block:: none
+
+                z front (yaw=-0.5*pi)
+               /
+              /
+             0 ------> x right (yaw=0)
+             |
+             |
+             v
+        down y
+
+    The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
+    and the yaw is around the y axis, thus the rotation axis=1. The yaw is 0 at
+    the positive direction of x axis, and decreases from the positive direction
+    of x to the positive direction of z.
+
+    Args:
+        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The boxes
+            data with shape (N, box_dim).
+        box_dim (int): Number of the dimension of a box. Each row is
+            (x, y, z, x_size, y_size, z_size, yaw). Defaults to 7.
+        with_yaw (bool): Whether the box is with yaw rotation. If False, the
+            value of yaw will be set to 0 as minmax boxes. Defaults to True.
+        origin (Tuple[float]): Relative position of the box origin.
+            Defaults to (0.5, 1.0, 0.5). This will guide the box be converted
+            to (0.5, 1.0, 0.5) mode.
+
+    Attributes:
+        tensor (Tensor): Float matrix with shape (N, box_dim).
+        box_dim (int): Integer indicating the dimension of a box. Each row is
+            (x, y, z, x_size, y_size, z_size, yaw, ...).
+        with_yaw (bool): If True, the value of yaw will be set to 0 as minmax
+            boxes.
+    """
+    YAW_AXIS = 1
+
+    def __init__(
+        self,
+        tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
+        box_dim: int = 7,
+        with_yaw: bool = True,
+        origin: Tuple[float, float, float] = (0.5, 1.0, 0.5)
+    ) -> None:
+        if isinstance(tensor, Tensor):
+            device = tensor.device
+        else:
+            device = torch.device('cpu')
+        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+        if tensor.numel() == 0:
+            # Use reshape, so we don't end up creating a new tensor that does
+            # not depend on the inputs (and consequently confuses jit)
+            tensor = tensor.reshape((-1, box_dim))
+        assert tensor.dim() == 2 and tensor.size(-1) == box_dim, \
+            ('The box dimension must be 2 and the length of the last '
+             f'dimension must be {box_dim}, but got boxes with shape '
+             f'{tensor.shape}.')
+
+        if tensor.shape[-1] == 6:
+            # If the dimension of boxes is 6, we expand box_dim by padding 0 as
+            # a fake yaw and set with_yaw to False
+            assert box_dim == 6
+            fake_rot = tensor.new_zeros(tensor.shape[0], 1)
+            tensor = torch.cat((tensor, fake_rot), dim=-1)
+            self.box_dim = box_dim + 1
+            self.with_yaw = False
+        else:
+            self.box_dim = box_dim
+            self.with_yaw = with_yaw
+        self.tensor = tensor.clone()
+
+        if origin != (0.5, 1.0, 0.5):
+            dst = self.tensor.new_tensor((0.5, 1.0, 0.5))
+            src = self.tensor.new_tensor(origin)
+            self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
+
+    @property
+    def height(self) -> Tensor:
+        """Tensor: A vector with height of each box in shape (N, )."""
+        return self.tensor[:, 4]
+
+    @property
+    def top_height(self) -> Tensor:
+        """Tensor: A vector with top height of each box in shape (N, )."""
+        # the positive direction is down rather than up
+        return self.bottom_height - self.height
+
+    @property
+    def bottom_height(self) -> Tensor:
+        """Tensor: A vector with bottom height of each box in shape (N, )."""
+        return self.tensor[:, 1]
+
+    @property
+    def local_yaw(self) -> Tensor:
+        """Tensor: A vector with local yaw of each box in shape (N, ).
+        local_yaw equals to alpha in kitti, which is commonly used in monocular
+        3D object detection task, so only :obj:`CameraInstance3DBoxes` has the
+        property."""
+        yaw = self.yaw
+        loc = self.gravity_center
+        local_yaw = yaw2local(yaw, loc)
+
+        return local_yaw
+
+    @property
+    def gravity_center(self) -> Tensor:
+        """Tensor: A tensor with center of each box in shape (N, 3)."""
+        bottom_center = self.bottom_center
+        gravity_center = torch.zeros_like(bottom_center)
+        gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]]
+        gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5
+        return gravity_center
+
+    @property
+    def corners(self) -> Tensor:
+        """Convert boxes to corners in clockwise order, in the form of (x0y0z0,
+        x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0).
+
+        .. code-block:: none
+
+                         front z
+                              /
+                             /
+               (x0, y0, z1) + -----------  + (x1, y0, z1)
+                           /|            / |
+                          / |           /  |
+            (x0, y0, z0) + ----------- +   + (x1, y1, z1)
+                         |  /      .   |  /
+                         | / origin    | /
+            (x0, y1, z0) + ----------- + -------> right x
+                         |             (x1, y1, z0)
+                         |
+                         v
+                    down y
+
+        Returns:
+            Tensor: A tensor with 8 corners of each box in shape (N, 8, 3).
+        """
+        if self.tensor.numel() == 0:
+            return torch.empty([0, 8, 3], device=self.tensor.device)
+
+        dims = self.dims
+        corners_norm = torch.from_numpy(
+            np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
+                device=dims.device, dtype=dims.dtype)
+
+        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
+        # use relative origin (0.5, 1, 0.5)
+        corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5])
+        corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
+
+        corners = rotation_3d_in_axis(
+            corners, self.tensor[:, 6], axis=self.YAW_AXIS)
+        corners += self.tensor[:, :3].view(-1, 1, 3)
+        return corners
+
+    @property
+    def bev(self) -> Tensor:
+        """Tensor: 2D BEV box of each box with rotation in XYWHR format, in
+        shape (N, 5)."""
+        bev = self.tensor[:, [0, 2, 3, 5, 6]].clone()
+        # positive direction of the gravity axis
+        # in cam coord system points to the earth
+        # so the bev yaw angle needs to be reversed
+        bev[:, -1] = -bev[:, -1]
+        return bev
+
+    def rotate(
+        self,
+        angle: Union[Tensor, np.ndarray, float],
+        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
+    ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[
+            BasePoints, Tensor], None]:
+        """Rotate boxes with points (optional) with the given angle or rotation
+        matrix.
+
+        Args:
+            angle (Tensor or np.ndarray or float): Rotation angle or rotation
+                matrix.
+            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
+                Points to rotate. Defaults to None.
+
+        Returns:
+            tuple or None: When ``points`` is None, the function returns None,
+            otherwise it returns the rotated points and the rotation matrix
+            ``rot_mat_T``.
+        """
+        if not isinstance(angle, Tensor):
+            angle = self.tensor.new_tensor(angle)
+
+        assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \
+            f'invalid rotation angle shape {angle.shape}'
+
+        if angle.numel() == 1:
+            self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis(
+                self.tensor[:, 0:3],
+                angle,
+                axis=self.YAW_AXIS,
+                return_mat=True)
+        else:
+            rot_mat_T = angle
+            rot_sin = rot_mat_T[2, 0]
+            rot_cos = rot_mat_T[0, 0]
+            angle = np.arctan2(rot_sin, rot_cos)
+            self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T
+
+        self.tensor[:, 6] += angle
+
+        if points is not None:
+            if isinstance(points, Tensor):
+                points[:, :3] = points[:, :3] @ rot_mat_T
+            elif isinstance(points, np.ndarray):
+                rot_mat_T = rot_mat_T.cpu().numpy()
+                points[:, :3] = np.dot(points[:, :3], rot_mat_T)
+            elif isinstance(points, BasePoints):
+                points.rotate(rot_mat_T)
+            else:
+                raise ValueError
+            return points, rot_mat_T
+
+    def flip(
+        self,
+        bev_direction: str = 'horizontal',
+        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
+    ) -> Union[Tensor, np.ndarray, BasePoints, None]:
+        """Flip the boxes in BEV along given BEV direction.
+
+        In CAM coordinates, it flips the x (horizontal) or z (vertical) axis.
+
+        Args:
+            bev_direction (str): Direction by which to flip. Can be chosen from
+                'horizontal' and 'vertical'. Defaults to 'horizontal'.
+            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
+                Points to flip. Defaults to None.
+
+        Returns:
+            Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points``
+            is None, the function returns None, otherwise it returns the
+            flipped points.
+        """
+        assert bev_direction in ('horizontal', 'vertical')
+        if bev_direction == 'horizontal':
+            self.tensor[:, 0::7] = -self.tensor[:, 0::7]
+            if self.with_yaw:
+                self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
+        elif bev_direction == 'vertical':
+            self.tensor[:, 2::7] = -self.tensor[:, 2::7]
+            if self.with_yaw:
+                self.tensor[:, 6] = -self.tensor[:, 6]
+
+        if points is not None:
+            assert isinstance(points, (Tensor, np.ndarray, BasePoints))
+            if isinstance(points, (Tensor, np.ndarray)):
+                if bev_direction == 'horizontal':
+                    points[:, 0] = -points[:, 0]
+                elif bev_direction == 'vertical':
+                    points[:, 2] = -points[:, 2]
+            elif isinstance(points, BasePoints):
+                points.flip(bev_direction)
+            return points
+
+    @classmethod
+    def height_overlaps(cls, boxes1: 'CameraInstance3DBoxes',
+                        boxes2: 'CameraInstance3DBoxes') -> Tensor:
+        """Calculate height overlaps of two boxes.
+
+        Note:
+            This function calculates the height overlaps between ``boxes1`` and
+            ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type.
+
+        Args:
+            boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes.
+            boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes.
+
+        Returns:
+            Tensor: Calculated height overlap of the boxes.
+        """
+        assert isinstance(boxes1, CameraInstance3DBoxes)
+        assert isinstance(boxes2, CameraInstance3DBoxes)
+
+        boxes1_top_height = boxes1.top_height.view(-1, 1)
+        boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
+        boxes2_top_height = boxes2.top_height.view(1, -1)
+        boxes2_bottom_height = boxes2.bottom_height.view(1, -1)
+
+        # positive direction of the gravity axis
+        # in cam coord system points to the earth
+        heighest_of_bottom = torch.min(boxes1_bottom_height,
+                                       boxes2_bottom_height)
+        lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height)
+        overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0)
+        return overlaps_h
+
+    def convert_to(self,
+                   dst: int,
+                   rt_mat: Optional[Union[Tensor, np.ndarray]] = None,
+                   correct_yaw: bool = False) -> 'BaseInstance3DBoxes':
+        """Convert self to ``dst`` mode.
+
+        Args:
+            dst (int): The target Box mode.
+            rt_mat (Tensor or np.ndarray, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+            correct_yaw (bool): Whether to convert the yaw angle to the target
+                coordinate. Defaults to False.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: The converted box of the same type in
+            the ``dst`` mode.
+        """
+        from .box_3d_mode import Box3DMode
+
+        # TODO: always set correct_yaw=True
+        return Box3DMode.convert(
+            box=self,
+            src=Box3DMode.CAM,
+            dst=dst,
+            rt_mat=rt_mat,
+            correct_yaw=correct_yaw)
+
+    def points_in_boxes_part(
+            self,
+            points: Tensor,
+            boxes_override: Optional[Tensor] = None) -> Tensor:
+        """Find the box in which each point is.
+
+        Args:
+            points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions
+                are (x, y, z) in LiDAR or depth coordinate.
+            boxes_override (Tensor, optional): Boxes to override `self.tensor`.
+                Defaults to None.
+
+        Returns:
+            Tensor: The index of the first box that each point is in with shape
+            (M, ). Default value is -1 (if the point is not enclosed by any
+            box).
+        """
+        from .coord_3d_mode import Coord3DMode
+
+        points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM,
+                                           Coord3DMode.LIDAR)
+        if boxes_override is not None:
+            boxes_lidar = boxes_override
+        else:
+            boxes_lidar = Coord3DMode.convert(
+                self.tensor,
+                Coord3DMode.CAM,
+                Coord3DMode.LIDAR,
+                is_point=False)
+
+        box_idx = super().points_in_boxes_part(points_lidar, boxes_lidar)
+        return box_idx
+
+    def points_in_boxes_all(self,
+                            points: Tensor,
+                            boxes_override: Optional[Tensor] = None) -> Tensor:
+        """Find all boxes in which each point is.
+
+        Args:
+            points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions
+                are (x, y, z) in LiDAR or depth coordinate.
+            boxes_override (Tensor, optional): Boxes to override `self.tensor`.
+                Defaults to None.
+
+        Returns:
+            Tensor: The index of all boxes in which each point is with shape
+            (M, T).
+        """
+        from .coord_3d_mode import Coord3DMode
+
+        points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM,
+                                           Coord3DMode.LIDAR)
+        if boxes_override is not None:
+            boxes_lidar = boxes_override
+        else:
+            boxes_lidar = Coord3DMode.convert(
+                self.tensor,
+                Coord3DMode.CAM,
+                Coord3DMode.LIDAR,
+                is_point=False)
+
+        box_idx = super().points_in_boxes_all(points_lidar, boxes_lidar)
+        return box_idx
diff --git a/mmde/mmdet3d/structures/bbox_3d/coord_3d_mode.py b/mmde/mmdet3d/structures/bbox_3d/coord_3d_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..699d930035c42e73c7a676809c9343e84d5cfd5f
--- /dev/null
+++ b/mmde/mmdet3d/structures/bbox_3d/coord_3d_mode.py
@@ -0,0 +1,273 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from enum import IntEnum, unique
+from typing import Optional, Sequence, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet3d.structures.points import (BasePoints, CameraPoints, DepthPoints,
+                                       LiDARPoints)
+from .base_box3d import BaseInstance3DBoxes
+from .box_3d_mode import Box3DMode
+
+
+@unique
+class Coord3DMode(IntEnum):
+    """Enum of different ways to represent a box and point cloud.
+
+    Coordinates in LiDAR:
+
+    .. code-block:: none
+
+                    up z
+                       ^   x front
+                       |  /
+                       | /
+        left y <------ 0
+
+    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
+    and the yaw is around the z axis, thus the rotation axis=2.
+
+    Coordinates in Camera:
+
+    .. code-block:: none
+
+                z front
+               /
+              /
+             0 ------> x right
+             |
+             |
+             v
+        down y
+
+    The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
+    and the yaw is around the y axis, thus the rotation axis=1.
+
+    Coordinates in Depth:
+
+    .. code-block:: none
+
+        up z
+           ^   y front
+           |  /
+           | /
+           0 ------> x right
+
+    The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
+    and the yaw is around the z axis, thus the rotation axis=2.
+    """
+
+    LIDAR = 0
+    CAM = 1
+    DEPTH = 2
+
+    @staticmethod
+    def convert(input: Union[Sequence[float], np.ndarray, Tensor,
+                             BaseInstance3DBoxes, BasePoints],
+                src: Union[Box3DMode, 'Coord3DMode'],
+                dst: Union[Box3DMode, 'Coord3DMode'],
+                rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
+                with_yaw: bool = True,
+                correct_yaw: bool = False,
+                is_point: bool = True):
+        """Convert boxes or points from ``src`` mode to ``dst`` mode.
+
+        Args:
+            input (Sequence[float] or np.ndarray or Tensor or
+                :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`): Can be a
+                k-tuple, k-list or an Nxk array/tensor.
+            src (:obj:`Box3DMode` or :obj:`Coord3DMode`): The source mode.
+            dst (:obj:`Box3DMode` or :obj:`Coord3DMode`): The target mode.
+            rt_mat (np.ndarray or Tensor, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+            with_yaw (bool): If ``box`` is an instance of
+                :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
+                Defaults to True.
+            correct_yaw (bool): If the yaw is rotated by rt_mat.
+                Defaults to False.
+            is_point (bool): If ``input`` is neither an instance of
+                :obj:`BaseInstance3DBoxes` nor an instance of
+                :obj:`BasePoints`, whether or not it is point data.
+                Defaults to True.
+
+        Returns:
+            Sequence[float] or np.ndarray or Tensor or
+            :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`: The converted box
+            or points of the same type.
+        """
+        if isinstance(input, BaseInstance3DBoxes):
+            return Coord3DMode.convert_box(
+                input,
+                src,
+                dst,
+                rt_mat=rt_mat,
+                with_yaw=with_yaw,
+                correct_yaw=correct_yaw)
+        elif isinstance(input, BasePoints):
+            return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat)
+        elif isinstance(input, (tuple, list, np.ndarray, Tensor)):
+            if is_point:
+                return Coord3DMode.convert_point(
+                    input, src, dst, rt_mat=rt_mat)
+            else:
+                return Coord3DMode.convert_box(
+                    input,
+                    src,
+                    dst,
+                    rt_mat=rt_mat,
+                    with_yaw=with_yaw,
+                    correct_yaw=correct_yaw)
+        else:
+            raise NotImplementedError
+
+    @staticmethod
+    def convert_box(
+        box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes],
+        src: Box3DMode,
+        dst: Box3DMode,
+        rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
+        with_yaw: bool = True,
+        correct_yaw: bool = False
+    ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]:
+        """Convert boxes from ``src`` mode to ``dst`` mode.
+
+        Args:
+            box (Sequence[float] or np.ndarray or Tensor or
+                :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk
+                array/tensor.
+            src (:obj:`Box3DMode`): The source box mode.
+            dst (:obj:`Box3DMode`): The target box mode.
+            rt_mat (np.ndarray or Tensor, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+            with_yaw (bool): If ``box`` is an instance of
+                :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle.
+                Defaults to True.
+            correct_yaw (bool): If the yaw is rotated by rt_mat.
+                Defaults to False.
+
+        Returns:
+            Sequence[float] or np.ndarray or Tensor or
+            :obj:`BaseInstance3DBoxes`: The converted box of the same type.
+        """
+        return Box3DMode.convert(
+            box,
+            src,
+            dst,
+            rt_mat=rt_mat,
+            with_yaw=with_yaw,
+            correct_yaw=correct_yaw)
+
+    @staticmethod
+    def convert_point(
+        point: Union[Sequence[float], np.ndarray, Tensor, BasePoints],
+        src: 'Coord3DMode',
+        dst: 'Coord3DMode',
+        rt_mat: Optional[Union[np.ndarray, Tensor]] = None,
+    ) -> Union[Sequence[float], np.ndarray, Tensor, BasePoints]:
+        """Convert points from ``src`` mode to ``dst`` mode.
+
+        Args:
+            box (Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`):
+                Can be a k-tuple, k-list or an Nxk array/tensor.
+            src (:obj:`Coord3DMode`): The source point mode.
+            dst (:obj:`Coord3DMode`): The target point mode.
+            rt_mat (np.ndarray or Tensor, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+
+        Returns:
+            Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`: The
+            converted point of the same type.
+        """
+        if src == dst:
+            return point
+
+        is_numpy = isinstance(point, np.ndarray)
+        is_InstancePoints = isinstance(point, BasePoints)
+        single_point = isinstance(point, (list, tuple))
+        if single_point:
+            assert len(point) >= 3, (
+                'Coord3DMode.convert takes either a k-tuple/list or '
+                'an Nxk array/tensor, where k >= 3')
+            arr = torch.tensor(point)[None, :]
+        else:
+            # avoid modifying the input point
+            if is_numpy:
+                arr = torch.from_numpy(np.asarray(point)).clone()
+            elif is_InstancePoints:
+                arr = point.tensor.clone()
+            else:
+                arr = point.clone()
+
+        # convert point from `src` mode to `dst` mode.
+        if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
+        elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
+        elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
+        elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
+        elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
+        elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
+            if rt_mat is None:
+                rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
+        else:
+            raise NotImplementedError(
+                f'Conversion from Coord3DMode {src} to {dst} '
+                'is not supported yet')
+
+        if not isinstance(rt_mat, Tensor):
+            rt_mat = arr.new_tensor(rt_mat)
+        if rt_mat.size(1) == 4:
+            extended_xyz = torch.cat(
+                [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1)
+            xyz = extended_xyz @ rt_mat.t()
+        else:
+            xyz = arr[..., :3] @ rt_mat.t()
+
+        remains = arr[..., 3:]
+        arr = torch.cat([xyz[..., :3], remains], dim=-1)
+
+        # convert arr to the original type
+        original_type = type(point)
+        if single_point:
+            return original_type(arr.flatten().tolist())
+        if is_numpy:
+            return arr.numpy()
+        elif is_InstancePoints:
+            if dst == Coord3DMode.CAM:
+                target_type = CameraPoints
+            elif dst == Coord3DMode.LIDAR:
+                target_type = LiDARPoints
+            elif dst == Coord3DMode.DEPTH:
+                target_type = DepthPoints
+            else:
+                raise NotImplementedError(
+                    f'Conversion to {dst} through {original_type} '
+                    'is not supported yet')
+            return target_type(
+                arr,
+                points_dim=arr.size(-1),
+                attribute_dims=point.attribute_dims)
+        else:
+            return arr
diff --git a/mmde/mmdet3d/structures/bbox_3d/depth_box3d.py b/mmde/mmdet3d/structures/bbox_3d/depth_box3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5800569e2654265f8bac97e8f1dadc2836db82e
--- /dev/null
+++ b/mmde/mmdet3d/structures/bbox_3d/depth_box3d.py
@@ -0,0 +1,280 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet3d.structures.points import BasePoints
+from .base_box3d import BaseInstance3DBoxes
+from .utils import rotation_3d_in_axis
+
+
+class DepthInstance3DBoxes(BaseInstance3DBoxes):
+    """3D boxes of instances in DEPTH coordinates.
+
+    Coordinates in Depth:
+
+    .. code-block:: none
+
+        up z    y front (yaw=0.5*pi)
+           ^   ^
+           |  /
+           | /
+           0 ------> x right (yaw=0)
+
+    The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0),
+    and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at
+    the positive direction of x axis, and increases from the positive direction
+    of x to the positive direction of y.
+
+    Attributes:
+        tensor (Tensor): Float matrix with shape (N, box_dim).
+        box_dim (int): Integer indicating the dimension of a box. Each row is
+            (x, y, z, x_size, y_size, z_size, yaw, ...).
+        with_yaw (bool): If True, the value of yaw will be set to 0 as minmax
+            boxes.
+    """
+    YAW_AXIS = 2
+
+    @property
+    def corners(self) -> Tensor:
+        """Convert boxes to corners in clockwise order, in the form of (x0y0z0,
+        x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0).
+
+        .. code-block:: none
+
+                                        up z
+                         front y           ^
+                              /            |
+                             /             |
+               (x0, y1, z1) + -----------  + (x1, y1, z1)
+                           /|            / |
+                          / |           /  |
+            (x0, y0, z1) + ----------- +   + (x1, y1, z0)
+                         |  /      .   |  /
+                         | / origin    | /
+            (x0, y0, z0) + ----------- + --------> right x
+                                       (x1, y0, z0)
+
+        Returns:
+            Tensor: A tensor with 8 corners of each box in shape (N, 8, 3).
+        """
+        if self.tensor.numel() == 0:
+            return torch.empty([0, 8, 3], device=self.tensor.device)
+
+        dims = self.dims
+        corners_norm = torch.from_numpy(
+            np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
+                device=dims.device, dtype=dims.dtype)
+
+        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
+        # use relative origin (0.5, 0.5, 0)
+        corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0])
+        corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
+
+        # rotate around z axis
+        corners = rotation_3d_in_axis(
+            corners, self.tensor[:, 6], axis=self.YAW_AXIS)
+        corners += self.tensor[:, :3].view(-1, 1, 3)
+        return corners
+
+    def rotate(
+        self,
+        angle: Union[Tensor, np.ndarray, float],
+        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
+    ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[
+            BasePoints, Tensor], None]:
+        """Rotate boxes with points (optional) with the given angle or rotation
+        matrix.
+
+        Args:
+            angle (Tensor or np.ndarray or float): Rotation angle or rotation
+                matrix.
+            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
+                Points to rotate. Defaults to None.
+
+        Returns:
+            tuple or None: When ``points`` is None, the function returns None,
+            otherwise it returns the rotated points and the rotation matrix
+            ``rot_mat_T``.
+        """
+        if not isinstance(angle, Tensor):
+            angle = self.tensor.new_tensor(angle)
+
+        assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \
+            f'invalid rotation angle shape {angle.shape}'
+
+        if angle.numel() == 1:
+            self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis(
+                self.tensor[:, 0:3],
+                angle,
+                axis=self.YAW_AXIS,
+                return_mat=True)
+        else:
+            rot_mat_T = angle
+            rot_sin = rot_mat_T[0, 1]
+            rot_cos = rot_mat_T[0, 0]
+            angle = np.arctan2(rot_sin, rot_cos)
+            self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T
+
+        if self.with_yaw:
+            self.tensor[:, 6] += angle
+        else:
+            # for axis-aligned boxes, we take the new
+            # enclosing axis-aligned boxes after rotation
+            corners_rot = self.corners @ rot_mat_T
+            new_x_size = corners_rot[..., 0].max(
+                dim=1, keepdim=True)[0] - corners_rot[..., 0].min(
+                    dim=1, keepdim=True)[0]
+            new_y_size = corners_rot[..., 1].max(
+                dim=1, keepdim=True)[0] - corners_rot[..., 1].min(
+                    dim=1, keepdim=True)[0]
+            self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1)
+
+        if points is not None:
+            if isinstance(points, Tensor):
+                points[:, :3] = points[:, :3] @ rot_mat_T
+            elif isinstance(points, np.ndarray):
+                rot_mat_T = rot_mat_T.cpu().numpy()
+                points[:, :3] = np.dot(points[:, :3], rot_mat_T)
+            elif isinstance(points, BasePoints):
+                points.rotate(rot_mat_T)
+            else:
+                raise ValueError
+            return points, rot_mat_T
+
+    def flip(
+        self,
+        bev_direction: str = 'horizontal',
+        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
+    ) -> Union[Tensor, np.ndarray, BasePoints, None]:
+        """Flip the boxes in BEV along given BEV direction.
+
+        In Depth coordinates, it flips the x (horizontal) or y (vertical) axis.
+
+        Args:
+            bev_direction (str): Direction by which to flip. Can be chosen from
+                'horizontal' and 'vertical'. Defaults to 'horizontal'.
+            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
+                Points to flip. Defaults to None.
+
+        Returns:
+            Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points``
+            is None, the function returns None, otherwise it returns the
+            flipped points.
+        """
+        assert bev_direction in ('horizontal', 'vertical')
+        if bev_direction == 'horizontal':
+            self.tensor[:, 0::7] = -self.tensor[:, 0::7]
+            if self.with_yaw:
+                self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
+        elif bev_direction == 'vertical':
+            self.tensor[:, 1::7] = -self.tensor[:, 1::7]
+            if self.with_yaw:
+                self.tensor[:, 6] = -self.tensor[:, 6]
+
+        if points is not None:
+            assert isinstance(points, (Tensor, np.ndarray, BasePoints))
+            if isinstance(points, (Tensor, np.ndarray)):
+                if bev_direction == 'horizontal':
+                    points[:, 0] = -points[:, 0]
+                elif bev_direction == 'vertical':
+                    points[:, 1] = -points[:, 1]
+            elif isinstance(points, BasePoints):
+                points.flip(bev_direction)
+            return points
+
+    def convert_to(self,
+                   dst: int,
+                   rt_mat: Optional[Union[Tensor, np.ndarray]] = None,
+                   correct_yaw: bool = False) -> 'BaseInstance3DBoxes':
+        """Convert self to ``dst`` mode.
+
+        Args:
+            dst (int): The target Box mode.
+            rt_mat (Tensor or np.ndarray, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+            correct_yaw (bool): Whether to convert the yaw angle to the target
+                coordinate. Defaults to False.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: The converted box of the same type in
+            the ``dst`` mode.
+        """
+        from .box_3d_mode import Box3DMode
+        return Box3DMode.convert(
+            box=self,
+            src=Box3DMode.DEPTH,
+            dst=dst,
+            rt_mat=rt_mat,
+            correct_yaw=correct_yaw)
+
+    def enlarged_box(
+            self, extra_width: Union[float, Tensor]) -> 'DepthInstance3DBoxes':
+        """Enlarge the length, width and height of boxes.
+
+        Args:
+            extra_width (float or Tensor): Extra width to enlarge the box.
+
+        Returns:
+            :obj:`DepthInstance3DBoxes`: Enlarged boxes.
+        """
+        enlarged_boxes = self.tensor.clone()
+        enlarged_boxes[:, 3:6] += extra_width * 2
+        # bottom center z minus extra_width
+        enlarged_boxes[:, 2] -= extra_width
+        return self.new_box(enlarged_boxes)
+
+    def get_surface_line_center(self) -> Tuple[Tensor, Tensor]:
+        """Compute surface and line center of bounding boxes.
+
+        Returns:
+            Tuple[Tensor, Tensor]: Surface and line center of bounding boxes.
+        """
+        obj_size = self.dims
+        center = self.gravity_center.view(-1, 1, 3)
+        batch_size = center.shape[0]
+
+        rot_sin = torch.sin(-self.yaw)
+        rot_cos = torch.cos(-self.yaw)
+        rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3]))
+        rot_mat_T[..., 0, 0] = rot_cos
+        rot_mat_T[..., 0, 1] = -rot_sin
+        rot_mat_T[..., 1, 0] = rot_sin
+        rot_mat_T[..., 1, 1] = rot_cos
+        rot_mat_T[..., 2, 2] = 1
+
+        # Get the object surface center
+        offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0],
+                                      [0, -1, 0], [1, 0, 0], [-1, 0, 0]])
+        offset = offset.view(1, 6, 3) / 2
+        surface_3d = (offset *
+                      obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape(
+                          -1, 3)
+
+        # Get the object line center
+        offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1],
+                                      [0, -1, 1], [1, 0, -1], [-1, 0, -1],
+                                      [0, 1, -1], [0, -1, -1], [1, 1, 0],
+                                      [1, -1, 0], [-1, 1, 0], [-1, -1, 0]])
+        offset = offset.view(1, 12, 3) / 2
+
+        line_3d = (offset *
+                   obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape(
+                       -1, 3)
+
+        surface_rot = rot_mat_T.repeat(6, 1, 1)
+        surface_3d = torch.matmul(surface_3d.unsqueeze(-2),
+                                  surface_rot).squeeze(-2)
+        surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d
+
+        line_rot = rot_mat_T.repeat(12, 1, 1)
+        line_3d = torch.matmul(line_3d.unsqueeze(-2), line_rot).squeeze(-2)
+        line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d
+
+        return surface_center, line_center
diff --git a/mmde/mmdet3d/structures/bbox_3d/lidar_box3d.py b/mmde/mmdet3d/structures/bbox_3d/lidar_box3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..172e20123716b95a721b6434251fcb91196c9cac
--- /dev/null
+++ b/mmde/mmdet3d/structures/bbox_3d/lidar_box3d.py
@@ -0,0 +1,223 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet3d.structures.points import BasePoints
+from .base_box3d import BaseInstance3DBoxes
+from .utils import rotation_3d_in_axis
+
+
+class LiDARInstance3DBoxes(BaseInstance3DBoxes):
+    """3D boxes of instances in LIDAR coordinates.
+
+    Coordinates in LiDAR:
+
+    .. code-block:: none
+
+                                 up z    x front (yaw=0)
+                                    ^   ^
+                                    |  /
+                                    | /
+        (yaw=0.5*pi) left y <------ 0
+
+    The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
+    and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at
+    the positive direction of x axis, and increases from the positive direction
+    of x to the positive direction of y.
+
+    Attributes:
+        tensor (Tensor): Float matrix with shape (N, box_dim).
+        box_dim (int): Integer indicating the dimension of a box. Each row is
+            (x, y, z, x_size, y_size, z_size, yaw, ...).
+        with_yaw (bool): If True, the value of yaw will be set to 0 as minmax
+            boxes.
+    """
+    YAW_AXIS = 2
+
+    @property
+    def corners(self) -> Tensor:
+        """Convert boxes to corners in clockwise order, in the form of (x0y0z0,
+        x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0).
+
+        .. code-block:: none
+
+                                           up z
+                            front x           ^
+                                 /            |
+                                /             |
+                  (x1, y0, z1) + -----------  + (x1, y1, z1)
+                              /|            / |
+                             / |           /  |
+               (x0, y0, z1) + ----------- +   + (x1, y1, z0)
+                            |  /      .   |  /
+                            | / origin    | /
+            left y <------- + ----------- + (x0, y1, z0)
+                (x0, y0, z0)
+
+        Returns:
+            Tensor: A tensor with 8 corners of each box in shape (N, 8, 3).
+        """
+        if self.tensor.numel() == 0:
+            return torch.empty([0, 8, 3], device=self.tensor.device)
+
+        dims = self.dims
+        corners_norm = torch.from_numpy(
+            np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
+                device=dims.device, dtype=dims.dtype)
+
+        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
+        # use relative origin (0.5, 0.5, 0)
+        corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0])
+        corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
+
+        # rotate around z axis
+        corners = rotation_3d_in_axis(
+            corners, self.tensor[:, 6], axis=self.YAW_AXIS)
+        corners += self.tensor[:, :3].view(-1, 1, 3)
+        return corners
+
+    def rotate(
+        self,
+        angle: Union[Tensor, np.ndarray, float],
+        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
+    ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[
+            BasePoints, Tensor], None]:
+        """Rotate boxes with points (optional) with the given angle or rotation
+        matrix.
+
+        Args:
+            angle (Tensor or np.ndarray or float): Rotation angle or rotation
+                matrix.
+            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
+                Points to rotate. Defaults to None.
+
+        Returns:
+            tuple or None: When ``points`` is None, the function returns None,
+            otherwise it returns the rotated points and the rotation matrix
+            ``rot_mat_T``.
+        """
+        if not isinstance(angle, Tensor):
+            angle = self.tensor.new_tensor(angle)
+
+        assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \
+            f'invalid rotation angle shape {angle.shape}'
+
+        if angle.numel() == 1:
+            self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis(
+                self.tensor[:, 0:3],
+                angle,
+                axis=self.YAW_AXIS,
+                return_mat=True)
+        else:
+            rot_mat_T = angle
+            rot_sin = rot_mat_T[0, 1]
+            rot_cos = rot_mat_T[0, 0]
+            angle = np.arctan2(rot_sin, rot_cos)
+            self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T
+
+        self.tensor[:, 6] += angle
+
+        if self.tensor.shape[1] == 9:
+            # rotate velo vector
+            self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2]
+
+        if points is not None:
+            if isinstance(points, Tensor):
+                points[:, :3] = points[:, :3] @ rot_mat_T
+            elif isinstance(points, np.ndarray):
+                rot_mat_T = rot_mat_T.cpu().numpy()
+                points[:, :3] = np.dot(points[:, :3], rot_mat_T)
+            elif isinstance(points, BasePoints):
+                points.rotate(rot_mat_T)
+            else:
+                raise ValueError
+            return points, rot_mat_T
+
+    def flip(
+        self,
+        bev_direction: str = 'horizontal',
+        points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None
+    ) -> Union[Tensor, np.ndarray, BasePoints, None]:
+        """Flip the boxes in BEV along given BEV direction.
+
+        In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis.
+
+        Args:
+            bev_direction (str): Direction by which to flip. Can be chosen from
+                'horizontal' and 'vertical'. Defaults to 'horizontal'.
+            points (Tensor or np.ndarray or :obj:`BasePoints`, optional):
+                Points to flip. Defaults to None.
+
+        Returns:
+            Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points``
+            is None, the function returns None, otherwise it returns the
+            flipped points.
+        """
+        assert bev_direction in ('horizontal', 'vertical')
+        if bev_direction == 'horizontal':
+            self.tensor[:, 1::7] = -self.tensor[:, 1::7]
+            if self.with_yaw:
+                self.tensor[:, 6] = -self.tensor[:, 6]
+        elif bev_direction == 'vertical':
+            self.tensor[:, 0::7] = -self.tensor[:, 0::7]
+            if self.with_yaw:
+                self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
+
+        if points is not None:
+            assert isinstance(points, (Tensor, np.ndarray, BasePoints))
+            if isinstance(points, (Tensor, np.ndarray)):
+                if bev_direction == 'horizontal':
+                    points[:, 1] = -points[:, 1]
+                elif bev_direction == 'vertical':
+                    points[:, 0] = -points[:, 0]
+            elif isinstance(points, BasePoints):
+                points.flip(bev_direction)
+            return points
+
+    def convert_to(self,
+                   dst: int,
+                   rt_mat: Optional[Union[Tensor, np.ndarray]] = None,
+                   correct_yaw: bool = False) -> 'BaseInstance3DBoxes':
+        """Convert self to ``dst`` mode.
+
+        Args:
+            dst (int): The target Box mode.
+            rt_mat (Tensor or np.ndarray, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+            correct_yaw (bool): Whether to convert the yaw angle to the target
+                coordinate. Defaults to False.
+
+        Returns:
+            :obj:`BaseInstance3DBoxes`: The converted box of the same type in
+            the ``dst`` mode.
+        """
+        from .box_3d_mode import Box3DMode
+        return Box3DMode.convert(
+            box=self,
+            src=Box3DMode.LIDAR,
+            dst=dst,
+            rt_mat=rt_mat,
+            correct_yaw=correct_yaw)
+
+    def enlarged_box(
+            self, extra_width: Union[float, Tensor]) -> 'LiDARInstance3DBoxes':
+        """Enlarge the length, width and height of boxes.
+
+        Args:
+            extra_width (float or Tensor): Extra width to enlarge the box.
+
+        Returns:
+            :obj:`LiDARInstance3DBoxes`: Enlarged boxes.
+        """
+        enlarged_boxes = self.tensor.clone()
+        enlarged_boxes[:, 3:6] += extra_width * 2
+        # bottom center z minus extra_width
+        enlarged_boxes[:, 2] -= extra_width
+        return self.new_box(enlarged_boxes)
diff --git a/mmde/mmdet3d/structures/bbox_3d/utils.py b/mmde/mmdet3d/structures/bbox_3d/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..900ea105b5831f3d1f05665b6792ba82b5b3810f
--- /dev/null
+++ b/mmde/mmdet3d/structures/bbox_3d/utils.py
@@ -0,0 +1,367 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from logging import warning
+from typing import Tuple, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet3d.utils import array_converter
+
+
+@array_converter(apply_to=('val', ))
+def limit_period(val: Union[np.ndarray, Tensor],
+                 offset: float = 0.5,
+                 period: float = np.pi) -> Union[np.ndarray, Tensor]:
+    """Limit the value into a period for periodic function.
+
+    Args:
+        val (np.ndarray or Tensor): The value to be converted.
+        offset (float): Offset to set the value range. Defaults to 0.5.
+        period (float): Period of the value. Defaults to np.pi.
+
+    Returns:
+        np.ndarray or Tensor: Value in the range of
+        [-offset * period, (1-offset) * period].
+    """
+    limited_val = val - torch.floor(val / period + offset) * period
+    return limited_val
+
+
+@array_converter(apply_to=('points', 'angles'))
+def rotation_3d_in_axis(
+    points: Union[np.ndarray, Tensor],
+    angles: Union[np.ndarray, Tensor, float],
+    axis: int = 0,
+    return_mat: bool = False,
+    clockwise: bool = False
+) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[Tensor, Tensor], np.ndarray,
+           Tensor]:
+    """Rotate points by angles according to axis.
+
+    Args:
+        points (np.ndarray or Tensor): Points with shape (N, M, 3).
+        angles (np.ndarray or Tensor or float): Vector of angles with shape
+            (N, ).
+        axis (int): The axis to be rotated. Defaults to 0.
+        return_mat (bool): Whether or not to return the rotation matrix
+            (transposed). Defaults to False.
+        clockwise (bool): Whether the rotation is clockwise. Defaults to False.
+
+    Raises:
+        ValueError: When the axis is not in range [-3, -2, -1, 0, 1, 2], it
+            will raise ValueError.
+
+    Returns:
+        Tuple[np.ndarray, np.ndarray] or Tuple[Tensor, Tensor] or np.ndarray or
+        Tensor: Rotated points with shape (N, M, 3) and rotation matrix with
+        shape (N, 3, 3).
+    """
+    batch_free = len(points.shape) == 2
+    if batch_free:
+        points = points[None]
+
+    if isinstance(angles, float) or len(angles.shape) == 0:
+        angles = torch.full(points.shape[:1], angles)
+
+    assert len(points.shape) == 3 and len(angles.shape) == 1 and \
+        points.shape[0] == angles.shape[0], 'Incorrect shape of points ' \
+        f'angles: {points.shape}, {angles.shape}'
+
+    assert points.shape[-1] in [2, 3], \
+        f'Points size should be 2 or 3 instead of {points.shape[-1]}'
+
+    rot_sin = torch.sin(angles)
+    rot_cos = torch.cos(angles)
+    ones = torch.ones_like(rot_cos)
+    zeros = torch.zeros_like(rot_cos)
+
+    if points.shape[-1] == 3:
+        if axis == 1 or axis == -2:
+            rot_mat_T = torch.stack([
+                torch.stack([rot_cos, zeros, -rot_sin]),
+                torch.stack([zeros, ones, zeros]),
+                torch.stack([rot_sin, zeros, rot_cos])
+            ])
+        elif axis == 2 or axis == -1:
+            rot_mat_T = torch.stack([
+                torch.stack([rot_cos, rot_sin, zeros]),
+                torch.stack([-rot_sin, rot_cos, zeros]),
+                torch.stack([zeros, zeros, ones])
+            ])
+        elif axis == 0 or axis == -3:
+            rot_mat_T = torch.stack([
+                torch.stack([ones, zeros, zeros]),
+                torch.stack([zeros, rot_cos, rot_sin]),
+                torch.stack([zeros, -rot_sin, rot_cos])
+            ])
+        else:
+            raise ValueError(
+                f'axis should in range [-3, -2, -1, 0, 1, 2], got {axis}')
+    else:
+        rot_mat_T = torch.stack([
+            torch.stack([rot_cos, rot_sin]),
+            torch.stack([-rot_sin, rot_cos])
+        ])
+
+    if clockwise:
+        rot_mat_T = rot_mat_T.transpose(0, 1)
+
+    if points.shape[0] == 0:
+        points_new = points
+    else:
+        points_new = torch.einsum('aij,jka->aik', points, rot_mat_T)
+
+    if batch_free:
+        points_new = points_new.squeeze(0)
+
+    if return_mat:
+        rot_mat_T = torch.einsum('jka->ajk', rot_mat_T)
+        if batch_free:
+            rot_mat_T = rot_mat_T.squeeze(0)
+        return points_new, rot_mat_T
+    else:
+        return points_new
+
+
+@array_converter(apply_to=('boxes_xywhr', ))
+def xywhr2xyxyr(
+        boxes_xywhr: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
+    """Convert a rotated boxes in XYWHR format to XYXYR format.
+
+    Args:
+        boxes_xywhr (Tensor or np.ndarray): Rotated boxes in XYWHR format.
+
+    Returns:
+        Tensor or np.ndarray: Converted boxes in XYXYR format.
+    """
+    boxes = torch.zeros_like(boxes_xywhr)
+    half_w = boxes_xywhr[..., 2] / 2
+    half_h = boxes_xywhr[..., 3] / 2
+
+    boxes[..., 0] = boxes_xywhr[..., 0] - half_w
+    boxes[..., 1] = boxes_xywhr[..., 1] - half_h
+    boxes[..., 2] = boxes_xywhr[..., 0] + half_w
+    boxes[..., 3] = boxes_xywhr[..., 1] + half_h
+    boxes[..., 4] = boxes_xywhr[..., 4]
+    return boxes
+
+
+def get_box_type(box_type: str) -> Tuple[type, int]:
+    """Get the type and mode of box structure.
+
+    Args:
+        box_type (str): The type of box structure. The valid value are "LiDAR",
+            "Camera" and "Depth".
+
+    Raises:
+        ValueError: A ValueError is raised when ``box_type`` does not belong to
+            the three valid types.
+
+    Returns:
+        tuple: Box type and box mode.
+    """
+    from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes,
+                              DepthInstance3DBoxes, LiDARInstance3DBoxes)
+    box_type_lower = box_type.lower()
+    if box_type_lower == 'lidar':
+        box_type_3d = LiDARInstance3DBoxes
+        box_mode_3d = Box3DMode.LIDAR
+    elif box_type_lower == 'camera':
+        box_type_3d = CameraInstance3DBoxes
+        box_mode_3d = Box3DMode.CAM
+    elif box_type_lower == 'depth':
+        box_type_3d = DepthInstance3DBoxes
+        box_mode_3d = Box3DMode.DEPTH
+    else:
+        raise ValueError('Only "box_type" of "camera", "lidar", "depth" are '
+                         f'supported, got {box_type}')
+
+    return box_type_3d, box_mode_3d
+
+
+@array_converter(apply_to=('points_3d', 'proj_mat'))
+def points_cam2img(points_3d: Union[Tensor, np.ndarray],
+                   proj_mat: Union[Tensor, np.ndarray],
+                   with_depth: bool = False) -> Union[Tensor, np.ndarray]:
+    """Project points in camera coordinates to image coordinates.
+
+    Args:
+        points_3d (Tensor or np.ndarray): Points in shape (N, 3).
+        proj_mat (Tensor or np.ndarray): Transformation matrix between
+            coordinates.
+        with_depth (bool): Whether to keep depth in the output.
+            Defaults to False.
+
+    Returns:
+        Tensor or np.ndarray: Points in image coordinates with shape [N, 2] if
+        ``with_depth=False``, else [N, 3].
+    """
+    points_shape = list(points_3d.shape)
+    points_shape[-1] = 1
+
+    assert len(proj_mat.shape) == 2, \
+        'The dimension of the projection matrix should be 2 ' \
+        f'instead of {len(proj_mat.shape)}.'
+    d1, d2 = proj_mat.shape[:2]
+    assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or \
+        (d1 == 4 and d2 == 4), 'The shape of the projection matrix ' \
+        f'({d1}*{d2}) is not supported.'
+    if d1 == 3:
+        proj_mat_expanded = torch.eye(
+            4, device=proj_mat.device, dtype=proj_mat.dtype)
+        proj_mat_expanded[:d1, :d2] = proj_mat
+        proj_mat = proj_mat_expanded
+
+    # previous implementation use new_zeros, new_one yields better results
+    points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1)
+
+    point_2d = points_4 @ proj_mat.T
+    point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
+
+    if with_depth:
+        point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1)
+
+    return point_2d_res
+
+
+@array_converter(apply_to=('points', 'cam2img'))
+def points_img2cam(
+        points: Union[Tensor, np.ndarray],
+        cam2img: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]:
+    """Project points in image coordinates to camera coordinates.
+
+    Args:
+        points (Tensor or np.ndarray): 2.5D points in 2D images with shape
+            [N, 3], 3 corresponds with x, y in the image and depth.
+        cam2img (Tensor or np.ndarray): Camera intrinsic matrix. The shape can
+            be [3, 3], [3, 4] or [4, 4].
+
+    Returns:
+        Tensor or np.ndarray: Points in 3D space with shape [N, 3], 3
+        corresponds with x, y, z in 3D space.
+    """
+    assert cam2img.shape[0] <= 4
+    assert cam2img.shape[1] <= 4
+    assert points.shape[1] == 3
+
+    xys = points[:, :2]
+    depths = points[:, 2].view(-1, 1)
+    unnormed_xys = torch.cat([xys * depths, depths], dim=1)
+
+    pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device)
+    pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img
+    inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1)
+
+    # Do operation in homogeneous coordinates.
+    num_points = unnormed_xys.shape[0]
+    homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1)
+    points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3]
+
+    return points3D
+
+
+def mono_cam_box2vis(cam_box):
+    """This is a post-processing function on the bboxes from Mono-3D task. If
+    we want to perform projection visualization, we need to:
+
+        1. rotate the box along x-axis for np.pi / 2 (roll)
+        2. change orientation from local yaw to global yaw
+        3. convert yaw by (np.pi / 2 - yaw)
+
+    After applying this function, we can project and draw it on 2D images.
+
+    Args:
+        cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate
+            system before conversion. Could be gt bbox loaded from dataset or
+            network prediction output.
+
+    Returns:
+        :obj:`CameraInstance3DBoxes`: Box after conversion.
+    """
+    warning.warn('DeprecationWarning: The hack of yaw and dimension in the '
+                 'monocular 3D detection on nuScenes has been removed. The '
+                 'function mono_cam_box2vis will be deprecated.')
+    from .cam_box3d import CameraInstance3DBoxes
+    assert isinstance(cam_box, CameraInstance3DBoxes), \
+        'input bbox should be CameraInstance3DBoxes!'
+    loc = cam_box.gravity_center
+    dim = cam_box.dims
+    yaw = cam_box.yaw
+    feats = cam_box.tensor[:, 7:]
+    # rotate along x-axis for np.pi / 2
+    # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557  # noqa
+    dim[:, [1, 2]] = dim[:, [2, 1]]
+    # change local yaw to global yaw for visualization
+    # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166  # noqa
+    yaw += torch.atan2(loc[:, 0], loc[:, 2])
+    # convert yaw by (-yaw - np.pi / 2)
+    # this is because mono 3D box class such as `NuScenesBox` has different
+    # definition of rotation with our `CameraInstance3DBoxes`
+    yaw = -yaw - np.pi / 2
+    cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1)
+    cam_box = CameraInstance3DBoxes(
+        cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5))
+
+    return cam_box
+
+
+def get_proj_mat_by_coord_type(img_meta: dict, coord_type: str) -> Tensor:
+    """Obtain image features using points.
+
+    Args:
+        img_meta (dict): Meta information.
+        coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Can be case-
+            insensitive.
+
+    Returns:
+        Tensor: Transformation matrix.
+    """
+    coord_type = coord_type.upper()
+    mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'}
+    assert coord_type in mapping.keys()
+    return img_meta[mapping[coord_type]]
+
+
+def yaw2local(yaw: Tensor, loc: Tensor) -> Tensor:
+    """Transform global yaw to local yaw (alpha in kitti) in camera
+    coordinates, ranges from -pi to pi.
+
+    Args:
+        yaw (Tensor): A vector with local yaw of each box in shape (N, ).
+        loc (Tensor): Gravity center of each box in shape (N, 3).
+
+    Returns:
+        Tensor: Local yaw (alpha in kitti).
+    """
+    local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2])
+    larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False)
+    small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False)
+    if len(larger_idx) != 0:
+        local_yaw[larger_idx] -= 2 * np.pi
+    if len(small_idx) != 0:
+        local_yaw[small_idx] += 2 * np.pi
+
+    return local_yaw
+
+
+def get_lidar2img(cam2img: Tensor, lidar2cam: Tensor) -> Tensor:
+    """Get the projection matrix of lidar2img.
+
+    Args:
+        cam2img (torch.Tensor): A 3x3 or 4x4 projection matrix.
+        lidar2cam (torch.Tensor): A 3x3 or 4x4 projection matrix.
+
+    Returns:
+        Tensor: Transformation matrix with shape 4x4.
+    """
+    if cam2img.shape == (3, 3):
+        temp = cam2img.new_zeros(4, 4)
+        temp[:3, :3] = cam2img
+        cam2img = temp
+
+    if lidar2cam.shape == (3, 3):
+        temp = lidar2cam.new_zeros(4, 4)
+        temp[:3, :3] = lidar2cam
+        lidar2cam = temp
+    return torch.matmul(cam2img, lidar2cam)
diff --git a/mmde/mmdet3d/structures/det3d_data_sample.py b/mmde/mmdet3d/structures/det3d_data_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..c14c0abe86e0933a01c2f387be960d34afdd79e4
--- /dev/null
+++ b/mmde/mmdet3d/structures/det3d_data_sample.py
@@ -0,0 +1,213 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from mmdet.structures import DetDataSample
+from mmengine.structures import InstanceData
+
+from .point_data import PointData
+
+
+class Det3DDataSample(DetDataSample):
+    """A data structure interface of MMDetection3D. They are used as interfaces
+    between different components.
+
+    The attributes in ``Det3DDataSample`` are divided into several parts:
+
+        - ``proposals`` (InstanceData): Region proposals used in two-stage
+          detectors.
+        - ``ignored_instances`` (InstanceData): Instances to be ignored during
+          training/testing.
+        - ``gt_instances_3d`` (InstanceData): Ground truth of 3D instance
+          annotations.
+        - ``gt_instances`` (InstanceData): Ground truth of 2D instance
+          annotations.
+        - ``pred_instances_3d`` (InstanceData): 3D instances of model
+          predictions.
+          - For point-cloud 3D object detection task whose input modality is
+            `use_lidar=True, use_camera=False`, the 3D predictions results are
+            saved in `pred_instances_3d`.
+          - For vision-only (monocular/multi-view) 3D object detection task
+            whose input modality is `use_lidar=False, use_camera=True`, the 3D
+            predictions are saved in `pred_instances_3d`.
+        - ``pred_instances`` (InstanceData): 2D instances of model predictions.
+          - For multi-modality 3D detection task whose input modality is
+            `use_lidar=True, use_camera=True`, the 2D predictions are saved in
+            `pred_instances`.
+        - ``pts_pred_instances_3d`` (InstanceData): 3D instances of model
+          predictions based on point cloud.
+          - For multi-modality 3D detection task whose input modality is
+            `use_lidar=True, use_camera=True`, the 3D predictions based on
+            point cloud are saved in `pts_pred_instances_3d` to distinguish
+            with `img_pred_instances_3d` which based on image.
+        - ``img_pred_instances_3d`` (InstanceData): 3D instances of model
+          predictions based on image.
+          - For multi-modality 3D detection task whose input modality is
+            `use_lidar=True, use_camera=True`, the 3D predictions based on
+            image are saved in `img_pred_instances_3d` to distinguish with
+            `pts_pred_instances_3d` which based on point cloud.
+        - ``gt_pts_seg`` (PointData): Ground truth of point cloud segmentation.
+        - ``pred_pts_seg`` (PointData): Prediction of point cloud segmentation.
+        - ``eval_ann_info`` (dict or None): Raw annotation, which will be
+          passed to evaluator and do the online evaluation.
+
+    Examples:
+        >>> import torch
+        >>> from mmengine.structures import InstanceData
+
+        >>> from mmdet3d.structures import Det3DDataSample
+        >>> from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes
+
+        >>> data_sample = Det3DDataSample()
+        >>> meta_info = dict(
+        ...     img_shape=(800, 1196, 3),
+        ...     pad_shape=(800, 1216, 3))
+        >>> gt_instances_3d = InstanceData(metainfo=meta_info)
+        >>> gt_instances_3d.bboxes_3d = BaseInstance3DBoxes(torch.rand((5, 7)))
+        >>> gt_instances_3d.labels_3d = torch.randint(0, 3, (5,))
+        >>> data_sample.gt_instances_3d = gt_instances_3d
+        >>> assert 'img_shape' in data_sample.gt_instances_3d.metainfo_keys()
+        >>> len(data_sample.gt_instances_3d)
+        5
+        >>> print(data_sample)
+        <Det3DDataSample(
+            META INFORMATION
+            DATA FIELDS
+            gt_instances_3d: <InstanceData(
+                    META INFORMATION
+                    img_shape: (800, 1196, 3)
+                    pad_shape: (800, 1216, 3)
+                    DATA FIELDS
+                    labels_3d: tensor([1, 0, 2, 0, 1])
+                    bboxes_3d: BaseInstance3DBoxes(
+                            tensor([[1.9115e-01, 3.6061e-01, 6.7707e-01, 5.2902e-01, 8.0736e-01, 8.2759e-01,
+                                2.4328e-01],
+                                [5.6272e-01, 2.7508e-01, 5.7966e-01, 9.2410e-01, 3.0456e-01, 1.8912e-01,
+                                3.3176e-01],
+                                [8.1069e-01, 2.8684e-01, 7.7689e-01, 9.2397e-02, 5.5849e-01, 3.8007e-01,
+                                4.6719e-01],
+                                [6.6346e-01, 4.8005e-01, 5.2318e-02, 4.4137e-01, 4.1163e-01, 8.9339e-01,
+                                7.2847e-01],
+                                [2.4800e-01, 7.1944e-01, 3.4766e-01, 7.8583e-01, 8.5507e-01, 6.3729e-02,
+                                7.5161e-05]]))
+                ) at 0x7f7e29de3a00>
+        ) at 0x7f7e2a0e8640>
+        >>> pred_instances = InstanceData(metainfo=meta_info)
+        >>> pred_instances.bboxes = torch.rand((5, 4))
+        >>> pred_instances.scores = torch.rand((5, ))
+        >>> data_sample = Det3DDataSample(pred_instances=pred_instances)
+        >>> assert 'pred_instances' in data_sample
+
+        >>> pred_instances_3d = InstanceData(metainfo=meta_info)
+        >>> pred_instances_3d.bboxes_3d = BaseInstance3DBoxes(
+        ...     torch.rand((5, 7)))
+        >>> pred_instances_3d.scores_3d = torch.rand((5, ))
+        >>> pred_instances_3d.labels_3d = torch.rand((5, ))
+        >>> data_sample = Det3DDataSample(pred_instances_3d=pred_instances_3d)
+        >>> assert 'pred_instances_3d' in data_sample
+
+        >>> data_sample = Det3DDataSample()
+        >>> gt_instances_3d_data = dict(
+        ...     bboxes_3d=BaseInstance3DBoxes(torch.rand((2, 7))),
+        ...     labels_3d=torch.rand(2))
+        >>> gt_instances_3d = InstanceData(**gt_instances_3d_data)
+        >>> data_sample.gt_instances_3d = gt_instances_3d
+        >>> assert 'gt_instances_3d' in data_sample
+        >>> assert 'bboxes_3d' in data_sample.gt_instances_3d
+
+        >>> from mmdet3d.structures import PointData
+        >>> data_sample = Det3DDataSample()
+        >>> gt_pts_seg_data = dict(
+        ...     pts_instance_mask=torch.rand(2),
+        ...     pts_semantic_mask=torch.rand(2))
+        >>> data_sample.gt_pts_seg = PointData(**gt_pts_seg_data)
+        >>> print(data_sample)
+        <Det3DDataSample(
+            META INFORMATION
+            DATA FIELDS
+            gt_pts_seg: <PointData(
+                    META INFORMATION
+                    DATA FIELDS
+                    pts_semantic_mask: tensor([0.7199, 0.4006])
+                    pts_instance_mask: tensor([0.7363, 0.8096])
+                ) at 0x7f7e2962cc40>
+        ) at 0x7f7e29ff0d60>
+    """  # noqa: E501
+
+    @property
+    def gt_instances_3d(self) -> InstanceData:
+        return self._gt_instances_3d
+
+    @gt_instances_3d.setter
+    def gt_instances_3d(self, value: InstanceData) -> None:
+        self.set_field(value, '_gt_instances_3d', dtype=InstanceData)
+
+    @gt_instances_3d.deleter
+    def gt_instances_3d(self) -> None:
+        del self._gt_instances_3d
+
+    @property
+    def pred_instances_3d(self) -> InstanceData:
+        return self._pred_instances_3d
+
+    @pred_instances_3d.setter
+    def pred_instances_3d(self, value: InstanceData) -> None:
+        self.set_field(value, '_pred_instances_3d', dtype=InstanceData)
+
+    @pred_instances_3d.deleter
+    def pred_instances_3d(self) -> None:
+        del self._pred_instances_3d
+
+    @property
+    def pts_pred_instances_3d(self) -> InstanceData:
+        return self._pts_pred_instances_3d
+
+    @pts_pred_instances_3d.setter
+    def pts_pred_instances_3d(self, value: InstanceData) -> None:
+        self.set_field(value, '_pts_pred_instances_3d', dtype=InstanceData)
+
+    @pts_pred_instances_3d.deleter
+    def pts_pred_instances_3d(self) -> None:
+        del self._pts_pred_instances_3d
+
+    @property
+    def img_pred_instances_3d(self) -> InstanceData:
+        return self._img_pred_instances_3d
+
+    @img_pred_instances_3d.setter
+    def img_pred_instances_3d(self, value: InstanceData) -> None:
+        self.set_field(value, '_img_pred_instances_3d', dtype=InstanceData)
+
+    @img_pred_instances_3d.deleter
+    def img_pred_instances_3d(self) -> None:
+        del self._img_pred_instances_3d
+
+    @property
+    def gt_pts_seg(self) -> PointData:
+        return self._gt_pts_seg
+
+    @gt_pts_seg.setter
+    def gt_pts_seg(self, value: PointData) -> None:
+        self.set_field(value, '_gt_pts_seg', dtype=PointData)
+
+    @gt_pts_seg.deleter
+    def gt_pts_seg(self) -> None:
+        del self._gt_pts_seg
+
+    @property
+    def pred_pts_seg(self) -> PointData:
+        return self._pred_pts_seg
+
+    @pred_pts_seg.setter
+    def pred_pts_seg(self, value: PointData) -> None:
+        self.set_field(value, '_pred_pts_seg', dtype=PointData)
+
+    @pred_pts_seg.deleter
+    def pred_pts_seg(self) -> None:
+        del self._pred_pts_seg
+
+
+SampleList = List[Det3DDataSample]
+OptSampleList = Optional[SampleList]
+ForwardResults = Union[Dict[str, torch.Tensor], List[Det3DDataSample],
+                       Tuple[torch.Tensor], torch.Tensor]
diff --git a/mmde/mmdet3d/structures/ops/__init__.py b/mmde/mmdet3d/structures/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d71ec3015c47b31a09295dd7eeeede913223d0a6
--- /dev/null
+++ b/mmde/mmdet3d/structures/ops/__init__.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# yapf:disable
+from .box_np_ops import (box2d_to_corner_jit, box3d_to_bbox,
+                         box_camera_to_lidar, boxes3d_to_corners3d_lidar,
+                         camera_to_lidar, center_to_corner_box2d,
+                         center_to_corner_box3d, center_to_minmax_2d,
+                         corner_to_standup_nd_jit, corner_to_surfaces_3d,
+                         corner_to_surfaces_3d_jit, corners_nd,
+                         create_anchors_3d_range, depth_to_lidar_points,
+                         depth_to_points, get_frustum, iou_jit,
+                         minmax_to_corner_2d, points_in_convex_polygon_3d_jit,
+                         points_in_convex_polygon_jit, points_in_rbbox,
+                         projection_matrix_to_CRT_kitti, rbbox2d_to_near_bbox,
+                         remove_outside_points, rotation_points_single_angle,
+                         surface_equ_3d)
+# yapf:enable
+from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D,
+                               BboxOverlapsNearest3D,
+                               axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d,
+                               bbox_overlaps_nearest_3d)
+from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back
+
+__all__ = [
+    'box2d_to_corner_jit', 'box3d_to_bbox', 'box_camera_to_lidar',
+    'boxes3d_to_corners3d_lidar', 'camera_to_lidar', 'center_to_corner_box2d',
+    'center_to_corner_box3d', 'center_to_minmax_2d',
+    'corner_to_standup_nd_jit', 'corner_to_surfaces_3d',
+    'corner_to_surfaces_3d_jit', 'corners_nd', 'create_anchors_3d_range',
+    'depth_to_lidar_points', 'depth_to_points', 'get_frustum', 'iou_jit',
+    'minmax_to_corner_2d', 'points_in_convex_polygon_3d_jit',
+    'points_in_convex_polygon_jit', 'points_in_rbbox',
+    'projection_matrix_to_CRT_kitti', 'rbbox2d_to_near_bbox',
+    'remove_outside_points', 'rotation_points_single_angle', 'surface_equ_3d',
+    'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
+    'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D',
+    'axis_aligned_bbox_overlaps_3d', 'bbox3d_mapping_back', 'bbox3d2roi',
+    'bbox3d2result'
+]
diff --git a/mmde/mmdet3d/structures/ops/box_np_ops.py b/mmde/mmdet3d/structures/ops/box_np_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..24eaae77e0832c37a93d5fe2075892ee01340e64
--- /dev/null
+++ b/mmde/mmdet3d/structures/ops/box_np_ops.py
@@ -0,0 +1,828 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# TODO: clean the functions in this file and move the APIs into box bbox_3d
+# in the future
+# NOTICE: All functions in this file are valid for LiDAR or depth boxes only
+# if we use default parameters.
+
+import numba
+import numpy as np
+
+from mmdet3d.structures.bbox_3d import (limit_period, points_cam2img,
+                                        rotation_3d_in_axis)
+
+
+def camera_to_lidar(points, r_rect, velo2cam):
+    """Convert points in camera coordinate to lidar coordinate.
+
+    Note:
+        This function is for KITTI only.
+
+    Args:
+        points (np.ndarray, shape=[N, 3]): Points in camera coordinate.
+        r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
+            specific camera coordinate (e.g. CAM2) to CAM0.
+        velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
+            camera coordinate to lidar coordinate.
+
+    Returns:
+        np.ndarray, shape=[N, 3]: Points in lidar coordinate.
+    """
+    points_shape = list(points.shape[0:-1])
+    if points.shape[-1] == 3:
+        points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1)
+    lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T)
+    return lidar_points[..., :3]
+
+
+def box_camera_to_lidar(data, r_rect, velo2cam):
+    """Convert boxes in camera coordinate to lidar coordinate.
+
+    Note:
+        This function is for KITTI only.
+
+    Args:
+        data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate.
+        r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
+            specific camera coordinate (e.g. CAM2) to CAM0.
+        velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
+            camera coordinate to lidar coordinate.
+
+    Returns:
+        np.ndarray, shape=[N, 3]: Boxes in lidar coordinate.
+    """
+    xyz = data[:, 0:3]
+    x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6]
+    r = data[:, 6:7]
+    xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam)
+    # yaw and dims also needs to be converted
+    r_new = -r - np.pi / 2
+    r_new = limit_period(r_new, period=np.pi * 2)
+    return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1)
+
+
+def corners_nd(dims, origin=0.5):
+    """Generate relative box corners based on length per dim and origin point.
+
+    Args:
+        dims (np.ndarray, shape=[N, ndim]): Array of length per dim
+        origin (list or array or float, optional): origin point relate to
+            smallest point. Defaults to 0.5
+
+    Returns:
+        np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners.
+        point layout example: (2d) x0y0, x0y1, x1y0, x1y1;
+            (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
+            where x0 < x1, y0 < y1, z0 < z1.
+    """
+    ndim = int(dims.shape[1])
+    corners_norm = np.stack(
+        np.unravel_index(np.arange(2**ndim), [2] * ndim),
+        axis=1).astype(dims.dtype)
+    # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1
+    # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
+    # so need to convert to a format which is convenient to do other computing.
+    # for 2d boxes, format is clockwise start with minimum point
+    # for 3d boxes, please draw lines by your hand.
+    if ndim == 2:
+        # generate clockwise box corners
+        corners_norm = corners_norm[[0, 1, 3, 2]]
+    elif ndim == 3:
+        corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
+    corners_norm = corners_norm - np.array(origin, dtype=dims.dtype)
+    corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape(
+        [1, 2**ndim, ndim])
+    return corners
+
+
+def center_to_corner_box2d(centers, dims, angles=None, origin=0.5):
+    """Convert kitti locations, dimensions and angles to corners.
+    format: center(xy), dims(xy), angles(counterclockwise when positive)
+
+    Args:
+        centers (np.ndarray): Locations in kitti label file with shape (N, 2).
+        dims (np.ndarray): Dimensions in kitti label file with shape (N, 2).
+        angles (np.ndarray, optional): Rotation_y in kitti label file with
+            shape (N). Defaults to None.
+        origin (list or array or float, optional): origin point relate to
+            smallest point. Defaults to 0.5.
+
+    Returns:
+        np.ndarray: Corners with the shape of (N, 4, 2).
+    """
+    # 'length' in kitti format is in x axis.
+    # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar)
+    # center in kitti format is [0.5, 1.0, 0.5] in xyz.
+    corners = corners_nd(dims, origin=origin)
+    # corners: [N, 4, 2]
+    if angles is not None:
+        corners = rotation_3d_in_axis(corners, angles)
+    corners += centers.reshape([-1, 1, 2])
+    return corners
+
+
+@numba.jit(nopython=True)
+def depth_to_points(depth, trunc_pixel):
+    """Convert depth map to points.
+
+    Args:
+        depth (np.array, shape=[H, W]): Depth map which
+            the row of [0~`trunc_pixel`] are truncated.
+        trunc_pixel (int): The number of truncated row.
+
+    Returns:
+        np.ndarray: Points in camera coordinates.
+    """
+    num_pts = np.sum(depth[trunc_pixel:, ] > 0.1)
+    points = np.zeros((num_pts, 3), dtype=depth.dtype)
+    x = np.array([0, 0, 1], dtype=depth.dtype)
+    k = 0
+    for i in range(trunc_pixel, depth.shape[0]):
+        for j in range(depth.shape[1]):
+            if depth[i, j] > 0.1:
+                x = np.array([j, i, 1], dtype=depth.dtype)
+                points[k] = x * depth[i, j]
+                k += 1
+    return points
+
+
+def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam):
+    """Convert depth map to points in lidar coordinate.
+
+    Args:
+        depth (np.array, shape=[H, W]): Depth map which
+            the row of [0~`trunc_pixel`] are truncated.
+        trunc_pixel (int): The number of truncated row.
+        P2 (p.array, shape=[4, 4]): Intrinsics of Camera2.
+        r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
+            specific camera coordinate (e.g. CAM2) to CAM0.
+        velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
+            camera coordinate to lidar coordinate.
+
+    Returns:
+        np.ndarray: Points in lidar coordinates.
+    """
+    pts = depth_to_points(depth, trunc_pixel)
+    points_shape = list(pts.shape[0:-1])
+    points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1)
+    points = points @ np.linalg.inv(P2.T)
+    lidar_points = camera_to_lidar(points, r_rect, velo2cam)
+    return lidar_points
+
+
+def center_to_corner_box3d(centers,
+                           dims,
+                           angles=None,
+                           origin=(0.5, 1.0, 0.5),
+                           axis=1):
+    """Convert kitti locations, dimensions and angles to corners.
+
+    Args:
+        centers (np.ndarray): Locations in kitti label file with shape (N, 3).
+        dims (np.ndarray): Dimensions in kitti label file with shape (N, 3).
+        angles (np.ndarray, optional): Rotation_y in kitti label file with
+            shape (N). Defaults to None.
+        origin (list or array or float, optional): Origin point relate to
+            smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0)
+            in lidar. Defaults to (0.5, 1.0, 0.5).
+        axis (int, optional): Rotation axis. 1 for camera and 2 for lidar.
+            Defaults to 1.
+
+    Returns:
+        np.ndarray: Corners with the shape of (N, 8, 3).
+    """
+    # 'length' in kitti format is in x axis.
+    # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar)
+    # center in kitti format is [0.5, 1.0, 0.5] in xyz.
+    corners = corners_nd(dims, origin=origin)
+    # corners: [N, 8, 3]
+    if angles is not None:
+        corners = rotation_3d_in_axis(corners, angles, axis=axis)
+    corners += centers.reshape([-1, 1, 3])
+    return corners
+
+
+@numba.jit(nopython=True)
+def box2d_to_corner_jit(boxes):
+    """Convert box2d to corner.
+
+    Args:
+        boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation.
+
+    Returns:
+        box_corners (np.ndarray, shape=[N, 4, 2]): Box corners.
+    """
+    num_box = boxes.shape[0]
+    corners_norm = np.zeros((4, 2), dtype=boxes.dtype)
+    corners_norm[1, 1] = 1.0
+    corners_norm[2] = 1.0
+    corners_norm[3, 0] = 1.0
+    corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype)
+    corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape(
+        1, 4, 2)
+    rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
+    box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype)
+    for i in range(num_box):
+        rot_sin = np.sin(boxes[i, -1])
+        rot_cos = np.cos(boxes[i, -1])
+        rot_mat_T[0, 0] = rot_cos
+        rot_mat_T[0, 1] = rot_sin
+        rot_mat_T[1, 0] = -rot_sin
+        rot_mat_T[1, 1] = rot_cos
+        box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2]
+    return box_corners
+
+
+@numba.njit
+def corner_to_standup_nd_jit(boxes_corner):
+    """Convert boxes_corner to aligned (min-max) boxes.
+
+    Args:
+        boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners.
+
+    Returns:
+        np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes.
+    """
+    num_boxes = boxes_corner.shape[0]
+    ndim = boxes_corner.shape[-1]
+    result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype)
+    for i in range(num_boxes):
+        for j in range(ndim):
+            result[i, j] = np.min(boxes_corner[i, :, j])
+        for j in range(ndim):
+            result[i, j + ndim] = np.max(boxes_corner[i, :, j])
+    return result
+
+
+@numba.jit(nopython=True)
+def corner_to_surfaces_3d_jit(corners):
+    """Convert 3d box corners from corner function above to surfaces that
+    normal vectors all direct to internal.
+
+    Args:
+        corners (np.ndarray): 3d box corners with the shape of (N, 8, 3).
+
+    Returns:
+        np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
+    """
+    # box_corners: [N, 8, 3], must from corner functions in this module
+    num_boxes = corners.shape[0]
+    surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype)
+    corner_idxes = np.array([
+        0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7
+    ]).reshape(6, 4)
+    for i in range(num_boxes):
+        for j in range(6):
+            for k in range(4):
+                surfaces[i, j, k] = corners[i, corner_idxes[j, k]]
+    return surfaces
+
+
+def rotation_points_single_angle(points, angle, axis=0):
+    """Rotate points with a single angle.
+
+    Args:
+        points (np.ndarray, shape=[N, 3]]):
+        angle (np.ndarray, shape=[1]]):
+        axis (int, optional): Axis to rotate at. Defaults to 0.
+
+    Returns:
+        np.ndarray: Rotated points.
+    """
+    # points: [N, 3]
+    rot_sin = np.sin(angle)
+    rot_cos = np.cos(angle)
+    if axis == 1:
+        rot_mat_T = np.array(
+            [[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]],
+            dtype=points.dtype)
+    elif axis == 2 or axis == -1:
+        rot_mat_T = np.array(
+            [[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]],
+            dtype=points.dtype)
+    elif axis == 0:
+        rot_mat_T = np.array(
+            [[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]],
+            dtype=points.dtype)
+    else:
+        raise ValueError('axis should in range')
+
+    return points @ rot_mat_T, rot_mat_T
+
+
+def box3d_to_bbox(box3d, P2):
+    """Convert box3d in camera coordinates to bbox in image coordinates.
+
+    Args:
+        box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate.
+        P2 (np.array, shape=[4, 4]): Intrinsics of Camera2.
+
+    Returns:
+        np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates.
+    """
+    box_corners = center_to_corner_box3d(
+        box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1)
+    box_corners_in_image = points_cam2img(box_corners, P2)
+    # box_corners_in_image: [N, 8, 2]
+    minxy = np.min(box_corners_in_image, axis=1)
+    maxxy = np.max(box_corners_in_image, axis=1)
+    bbox = np.concatenate([minxy, maxxy], axis=1)
+    return bbox
+
+
+def corner_to_surfaces_3d(corners):
+    """convert 3d box corners from corner function above to surfaces that
+    normal vectors all direct to internal.
+
+    Args:
+        corners (np.ndarray): 3D box corners with shape of (N, 8, 3).
+
+    Returns:
+        np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
+    """
+    # box_corners: [N, 8, 3], must from corner functions in this module
+    surfaces = np.array([
+        [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]],
+        [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]],
+        [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]],
+        [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]],
+        [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]],
+        [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]],
+    ]).transpose([2, 0, 1, 3])
+    return surfaces
+
+
+def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)):
+    """Check points in rotated bbox and return indices.
+
+    Note:
+        This function is for counterclockwise boxes.
+
+    Args:
+        points (np.ndarray, shape=[N, 3+dim]): Points to query.
+        rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation.
+        z_axis (int, optional): Indicate which axis is height.
+            Defaults to 2.
+        origin (tuple[int], optional): Indicate the position of
+            box center. Defaults to (0.5, 0.5, 0).
+
+    Returns:
+        np.ndarray, shape=[N, M]: Indices of points in each box.
+    """
+    # TODO: this function is different from PointCloud3D, be careful
+    # when start to use nuscene, check the input
+    rbbox_corners = center_to_corner_box3d(
+        rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis)
+    surfaces = corner_to_surfaces_3d(rbbox_corners)
+    indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces)
+    return indices
+
+
+def minmax_to_corner_2d(minmax_box):
+    """Convert minmax box to corners2d.
+
+    Args:
+        minmax_box (np.ndarray, shape=[N, dims]): minmax boxes.
+
+    Returns:
+        np.ndarray: 2d corners of boxes
+    """
+    ndim = minmax_box.shape[-1] // 2
+    center = minmax_box[..., :ndim]
+    dims = minmax_box[..., ndim:] - center
+    return center_to_corner_box2d(center, dims, origin=0.0)
+
+
+def create_anchors_3d_range(feature_size,
+                            anchor_range,
+                            sizes=((3.9, 1.6, 1.56), ),
+                            rotations=(0, np.pi / 2),
+                            dtype=np.float32):
+    """Create anchors 3d by range.
+
+    Args:
+        feature_size (list[float] | tuple[float]): Feature map size. It is
+            either a list of a tuple of [D, H, W](in order of z, y, and x).
+        anchor_range (torch.Tensor | list[float]): Range of anchors with
+            shape [6]. The order is consistent with that of anchors, i.e.,
+            (x_min, y_min, z_min, x_max, y_max, z_max).
+        sizes (list[list] | np.ndarray | torch.Tensor, optional):
+            Anchor size with shape [N, 3], in order of x, y, z.
+            Defaults to ((3.9, 1.6, 1.56), ).
+        rotations (list[float] | np.ndarray | torch.Tensor, optional):
+            Rotations of anchors in a single feature grid.
+            Defaults to (0, np.pi / 2).
+        dtype (type, optional): Data type. Defaults to np.float32.
+
+    Returns:
+        np.ndarray: Range based anchors with shape of
+            (*feature_size, num_sizes, num_rots, 7).
+    """
+    anchor_range = np.array(anchor_range, dtype)
+    z_centers = np.linspace(
+        anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype)
+    y_centers = np.linspace(
+        anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype)
+    x_centers = np.linspace(
+        anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype)
+    sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3])
+    rotations = np.array(rotations, dtype=dtype)
+    rets = np.meshgrid(
+        x_centers, y_centers, z_centers, rotations, indexing='ij')
+    tile_shape = [1] * 5
+    tile_shape[-2] = int(sizes.shape[0])
+    for i in range(len(rets)):
+        rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape)
+        rets[i] = rets[i][..., np.newaxis]  # for concat
+    sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3])
+    tile_size_shape = list(rets[0].shape)
+    tile_size_shape[3] = 1
+    sizes = np.tile(sizes, tile_size_shape)
+    rets.insert(3, sizes)
+    ret = np.concatenate(rets, axis=-1)
+    return np.transpose(ret, [2, 1, 0, 3, 4, 5])
+
+
+def center_to_minmax_2d(centers, dims, origin=0.5):
+    """Center to minmax.
+
+    Args:
+        centers (np.ndarray): Center points.
+        dims (np.ndarray): Dimensions.
+        origin (list or array or float, optional): Origin point relate
+            to smallest point. Defaults to 0.5.
+
+    Returns:
+        np.ndarray: Minmax points.
+    """
+    if origin == 0.5:
+        return np.concatenate([centers - dims / 2, centers + dims / 2],
+                              axis=-1)
+    corners = center_to_corner_box2d(centers, dims, origin=origin)
+    return corners[:, [0, 2]].reshape([-1, 4])
+
+
+def rbbox2d_to_near_bbox(rbboxes):
+    """convert rotated bbox to nearest 'standing' or 'lying' bbox.
+
+    Args:
+        rbboxes (np.ndarray): Rotated bboxes with shape of
+            (N, 5(x, y, xdim, ydim, rad)).
+
+    Returns:
+        np.ndarray: Bounding boxes with the shape of
+            (N, 4(xmin, ymin, xmax, ymax)).
+    """
+    rots = rbboxes[..., -1]
+    rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi))
+    cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis]
+    bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4])
+    bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:])
+    return bboxes
+
+
+@numba.jit(nopython=True)
+def iou_jit(boxes, query_boxes, mode='iou', eps=0.0):
+    """Calculate box iou. Note that jit version runs ~10x faster than the
+    box_overlaps function in mmdet3d.core.evaluation.
+
+    Note:
+        This function is for counterclockwise boxes.
+
+    Args:
+        boxes (np.ndarray): Input bounding boxes with shape of (N, 4).
+        query_boxes (np.ndarray): Query boxes with shape of (K, 4).
+        mode (str, optional): IoU mode. Defaults to 'iou'.
+        eps (float, optional): Value added to denominator. Defaults to 0.
+
+    Returns:
+        np.ndarray: Overlap between boxes and query_boxes
+            with the shape of [N, K].
+    """
+    N = boxes.shape[0]
+    K = query_boxes.shape[0]
+    overlaps = np.zeros((N, K), dtype=boxes.dtype)
+    for k in range(K):
+        box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) *
+                    (query_boxes[k, 3] - query_boxes[k, 1] + eps))
+        for n in range(N):
+            iw = (
+                min(boxes[n, 2], query_boxes[k, 2]) -
+                max(boxes[n, 0], query_boxes[k, 0]) + eps)
+            if iw > 0:
+                ih = (
+                    min(boxes[n, 3], query_boxes[k, 3]) -
+                    max(boxes[n, 1], query_boxes[k, 1]) + eps)
+                if ih > 0:
+                    if mode == 'iou':
+                        ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
+                              (boxes[n, 3] - boxes[n, 1] + eps) + box_area -
+                              iw * ih)
+                    else:
+                        ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
+                              (boxes[n, 3] - boxes[n, 1] + eps))
+                    overlaps[n, k] = iw * ih / ua
+    return overlaps
+
+
+def projection_matrix_to_CRT_kitti(proj):
+    """Split projection matrix of KITTI.
+
+    Note:
+        This function is for KITTI only.
+
+    P = C @ [R|T]
+    C is upper triangular matrix, so we need to inverse CR and use QR
+    stable for all kitti camera projection matrix.
+
+    Args:
+        proj (p.array, shape=[4, 4]): Intrinsics of camera.
+
+    Returns:
+        tuple[np.ndarray]: Splited matrix of C, R and T.
+    """
+
+    CR = proj[0:3, 0:3]
+    CT = proj[0:3, 3]
+    RinvCinv = np.linalg.inv(CR)
+    Rinv, Cinv = np.linalg.qr(RinvCinv)
+    C = np.linalg.inv(Cinv)
+    R = np.linalg.inv(Rinv)
+    T = Cinv @ CT
+    return C, R, T
+
+
+def remove_outside_points(points, rect, Trv2c, P2, image_shape):
+    """Remove points which are outside of image.
+
+    Note:
+        This function is for KITTI only.
+
+    Args:
+        points (np.ndarray, shape=[N, 3+dims]): Total points.
+        rect (np.ndarray, shape=[4, 4]): Matrix to project points in
+            specific camera coordinate (e.g. CAM2) to CAM0.
+        Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in
+            camera coordinate to lidar coordinate.
+        P2 (p.array, shape=[4, 4]): Intrinsics of Camera2.
+        image_shape (list[int]): Shape of image.
+
+    Returns:
+        np.ndarray, shape=[N, 3+dims]: Filtered points.
+    """
+    # 5x faster than remove_outside_points_v1(2ms vs 10ms)
+    C, R, T = projection_matrix_to_CRT_kitti(P2)
+    image_bbox = [0, 0, image_shape[1], image_shape[0]]
+    frustum = get_frustum(image_bbox, C)
+    frustum -= T
+    frustum = np.linalg.inv(R) @ frustum.T
+    frustum = camera_to_lidar(frustum.T, rect, Trv2c)
+    frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...])
+    indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces)
+    points = points[indices.reshape([-1])]
+    return points
+
+
+def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
+    """Get frustum corners in camera coordinates.
+
+    Args:
+        bbox_image (list[int]): box in image coordinates.
+        C (np.ndarray): Intrinsics.
+        near_clip (float, optional): Nearest distance of frustum.
+            Defaults to 0.001.
+        far_clip (float, optional): Farthest distance of frustum.
+            Defaults to 100.
+
+    Returns:
+        np.ndarray, shape=[8, 3]: coordinates of frustum corners.
+    """
+    fku = C[0, 0]
+    fkv = -C[1, 1]
+    u0v0 = C[0:2, 2]
+    z_points = np.array(
+        [near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis]
+    b = bbox_image
+    box_corners = np.array(
+        [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]],
+        dtype=C.dtype)
+    near_box_corners = (box_corners - u0v0) / np.array(
+        [fku / near_clip, -fkv / near_clip], dtype=C.dtype)
+    far_box_corners = (box_corners - u0v0) / np.array(
+        [fku / far_clip, -fkv / far_clip], dtype=C.dtype)
+    ret_xy = np.concatenate([near_box_corners, far_box_corners],
+                            axis=0)  # [8, 2]
+    ret_xyz = np.concatenate([ret_xy, z_points], axis=1)
+    return ret_xyz
+
+
+def surface_equ_3d(polygon_surfaces):
+    """
+
+    Args:
+        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+            [num_polygon, max_num_surfaces, max_num_points_of_surface, 3].
+            All surfaces' normal vector must direct to internal.
+            Max_num_points_of_surface must at least 3.
+
+    Returns:
+        tuple: normal vector and its direction.
+    """
+    # return [a, b, c], d in ax+by+cz+d=0
+    # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
+    surface_vec = polygon_surfaces[:, :, :2, :] - \
+        polygon_surfaces[:, :, 1:3, :]
+    # normal_vec: [..., 3]
+    normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :])
+    # print(normal_vec.shape, points[..., 0, :].shape)
+    # d = -np.inner(normal_vec, points[..., 0, :])
+    d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :])
+    return normal_vec, -d
+
+
+@numba.njit
+def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d,
+                                     num_surfaces):
+    """
+    Args:
+        points (np.ndarray): Input points with shape of (num_points, 3).
+        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+            (num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
+            All surfaces' normal vector must direct to internal.
+            Max_num_points_of_surface must at least 3.
+        normal_vec (np.ndarray): Normal vector of polygon_surfaces.
+        d (int): Directions of normal vector.
+        num_surfaces (np.ndarray): Number of surfaces a polygon contains
+            shape of (num_polygon).
+
+    Returns:
+        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
+    """
+    max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
+    num_points = points.shape[0]
+    num_polygons = polygon_surfaces.shape[0]
+    ret = np.ones((num_points, num_polygons), dtype=np.bool_)
+    sign = 0.0
+    for i in range(num_points):
+        for j in range(num_polygons):
+            for k in range(max_num_surfaces):
+                if k > num_surfaces[j]:
+                    break
+                sign = (
+                    points[i, 0] * normal_vec[j, k, 0] +
+                    points[i, 1] * normal_vec[j, k, 1] +
+                    points[i, 2] * normal_vec[j, k, 2] + d[j, k])
+                if sign >= 0:
+                    ret[i, j] = False
+                    break
+    return ret
+
+
+def points_in_convex_polygon_3d_jit(points,
+                                    polygon_surfaces,
+                                    num_surfaces=None):
+    """Check points is in 3d convex polygons.
+
+    Args:
+        points (np.ndarray): Input points with shape of (num_points, 3).
+        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+            (num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
+            All surfaces' normal vector must direct to internal.
+            Max_num_points_of_surface must at least 3.
+        num_surfaces (np.ndarray, optional): Number of surfaces a polygon
+            contains shape of (num_polygon). Defaults to None.
+
+    Returns:
+        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
+    """
+    max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
+    # num_points = points.shape[0]
+    num_polygons = polygon_surfaces.shape[0]
+    if num_surfaces is None:
+        num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64)
+    normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :])
+    # normal_vec: [num_polygon, max_num_surfaces, 3]
+    # d: [num_polygon, max_num_surfaces]
+    return _points_in_convex_polygon_3d_jit(points, polygon_surfaces,
+                                            normal_vec, d, num_surfaces)
+
+
+@numba.njit
+def points_in_convex_polygon_jit(points, polygon, clockwise=False):
+    """Check points is in 2d convex polygons. True when point in polygon.
+
+    Args:
+        points (np.ndarray): Input points with the shape of [num_points, 2].
+        polygon (np.ndarray): Input polygon with the shape of
+            [num_polygon, num_points_of_polygon, 2].
+        clockwise (bool, optional): Indicate polygon is clockwise. Defaults
+            to True.
+
+    Returns:
+        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
+    """
+    # first convert polygon to directed lines
+    num_points_of_polygon = polygon.shape[1]
+    num_points = points.shape[0]
+    num_polygons = polygon.shape[0]
+    # vec for all the polygons
+    if clockwise:
+        vec1 = polygon - polygon[:,
+                                 np.array([num_points_of_polygon - 1] + list(
+                                     range(num_points_of_polygon - 1))), :]
+    else:
+        vec1 = polygon[:,
+                       np.array([num_points_of_polygon - 1] +
+                                list(range(num_points_of_polygon -
+                                           1))), :] - polygon
+    ret = np.zeros((num_points, num_polygons), dtype=np.bool_)
+    success = True
+    cross = 0.0
+    for i in range(num_points):
+        for j in range(num_polygons):
+            success = True
+            for k in range(num_points_of_polygon):
+                vec = vec1[j, k]
+                cross = vec[1] * (polygon[j, k, 0] - points[i, 0])
+                cross -= vec[0] * (polygon[j, k, 1] - points[i, 1])
+                if cross >= 0:
+                    success = False
+                    break
+            ret[i, j] = success
+    return ret
+
+
+def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True):
+    """Convert kitti center boxes to corners.
+
+        7 -------- 4
+       /|         /|
+      6 -------- 5 .
+      | |        | |
+      . 3 -------- 0
+      |/         |/
+      2 -------- 1
+
+    Note:
+        This function is for LiDAR boxes only.
+
+    Args:
+        boxes3d (np.ndarray): Boxes with shape of (N, 7)
+            [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords,
+            see the definition of ry in KITTI dataset.
+        bottom_center (bool, optional): Whether z is on the bottom center
+            of object. Defaults to True.
+
+    Returns:
+        np.ndarray: Box corners with the shape of [N, 8, 3].
+    """
+    boxes_num = boxes3d.shape[0]
+    x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5]
+    x_corners = np.array([
+        x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2.,
+        -x_size / 2., -x_size / 2., x_size / 2.
+    ],
+                         dtype=np.float32).T
+    y_corners = np.array([
+        -y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2.,
+        -y_size / 2., y_size / 2., y_size / 2.
+    ],
+                         dtype=np.float32).T
+    if bottom_center:
+        z_corners = np.zeros((boxes_num, 8), dtype=np.float32)
+        z_corners[:, 4:8] = z_size.reshape(boxes_num, 1).repeat(
+            4, axis=1)  # (N, 8)
+    else:
+        z_corners = np.array([
+            -z_size / 2., -z_size / 2., -z_size / 2., -z_size / 2.,
+            z_size / 2., z_size / 2., z_size / 2., z_size / 2.
+        ],
+                             dtype=np.float32).T
+
+    ry = boxes3d[:, 6]
+    zeros, ones = np.zeros(
+        ry.size, dtype=np.float32), np.ones(
+            ry.size, dtype=np.float32)
+    rot_list = np.array([[np.cos(ry), np.sin(ry), zeros],
+                         [-np.sin(ry), np.cos(ry), zeros],
+                         [zeros, zeros, ones]])  # (3, 3, N)
+    R_list = np.transpose(rot_list, (2, 0, 1))  # (N, 3, 3)
+
+    temp_corners = np.concatenate((x_corners.reshape(
+        -1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)),
+                                  axis=2)  # (N, 8, 3)
+    rotated_corners = np.matmul(temp_corners, R_list)  # (N, 8, 3)
+    x_corners = rotated_corners[:, :, 0]
+    y_corners = rotated_corners[:, :, 1]
+    z_corners = rotated_corners[:, :, 2]
+
+    x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2]
+
+    x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8)
+    y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8)
+    z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8)
+
+    corners = np.concatenate(
+        (x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)),
+        axis=2)
+
+    return corners.astype(np.float32)
diff --git a/mmde/mmdet3d/structures/ops/iou3d_calculator.py b/mmde/mmdet3d/structures/ops/iou3d_calculator.py
new file mode 100644
index 0000000000000000000000000000000000000000..baec1cbe451af6244b896c8e4b75f6acaddffd59
--- /dev/null
+++ b/mmde/mmdet3d/structures/ops/iou3d_calculator.py
@@ -0,0 +1,329 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmdet.structures.bbox import bbox_overlaps
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures.bbox_3d import get_box_type
+
+
+@TASK_UTILS.register_module()
+class BboxOverlapsNearest3D(object):
+    """Nearest 3D IoU Calculator.
+
+    Note:
+        This IoU calculator first finds the nearest 2D boxes in bird eye view
+        (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
+
+    Args:
+        coordinate (str): 'camera', 'lidar', or 'depth' coordinate system.
+    """
+
+    def __init__(self, coordinate='lidar'):
+        assert coordinate in ['camera', 'lidar', 'depth']
+        self.coordinate = coordinate
+
+    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
+        """Calculate nearest 3D IoU.
+
+        Note:
+            If ``is_aligned`` is ``False``, then it calculates the ious between
+            each bbox of bboxes1 and bboxes2, otherwise it calculates the ious
+            between each aligned pair of bboxes1 and bboxes2.
+
+        Args:
+            bboxes1 (torch.Tensor): shape (N, 7+N)
+                [x, y, z, x_size, y_size, z_size, ry, v].
+            bboxes2 (torch.Tensor): shape (M, 7+N)
+                [x, y, z, x_size, y_size, z_size, ry, v].
+            mode (str): "iou" (intersection over union) or iof
+                (intersection over foreground).
+            is_aligned (bool): Whether the calculation is aligned.
+
+        Return:
+            torch.Tensor: If ``is_aligned`` is ``True``, return ious between
+                bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is
+                ``False``, return shape is M.
+        """
+        return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned,
+                                        self.coordinate)
+
+    def __repr__(self):
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(coordinate={self.coordinate}'
+        return repr_str
+
+
+@TASK_UTILS.register_module()
+class BboxOverlaps3D(object):
+    """3D IoU Calculator.
+
+    Args:
+        coordinate (str): The coordinate system, valid options are
+            'camera', 'lidar', and 'depth'.
+    """
+
+    def __init__(self, coordinate):
+        assert coordinate in ['camera', 'lidar', 'depth']
+        self.coordinate = coordinate
+
+    def __call__(self, bboxes1, bboxes2, mode='iou'):
+        """Calculate 3D IoU using cuda implementation.
+
+        Note:
+            This function calculate the IoU of 3D boxes based on their volumes.
+            IoU calculator ``:class:BboxOverlaps3D`` uses this function to
+            calculate the actual 3D IoUs of boxes.
+
+        Args:
+            bboxes1 (torch.Tensor): with shape (N, 7+C),
+                (x, y, z, x_size, y_size, z_size, ry, v*).
+            bboxes2 (torch.Tensor): with shape (M, 7+C),
+                (x, y, z, x_size, y_size, z_size, ry, v*).
+            mode (str): "iou" (intersection over union) or
+                iof (intersection over foreground).
+
+        Return:
+            torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2
+                with shape (M, N) (aligned mode is not supported currently).
+        """
+        return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate)
+
+    def __repr__(self):
+        """str: return a string that describes the module"""
+        repr_str = self.__class__.__name__
+        repr_str += f'(coordinate={self.coordinate}'
+        return repr_str
+
+
+def bbox_overlaps_nearest_3d(bboxes1,
+                             bboxes2,
+                             mode='iou',
+                             is_aligned=False,
+                             coordinate='lidar'):
+    """Calculate nearest 3D IoU.
+
+    Note:
+        This function first finds the nearest 2D boxes in bird eye view
+        (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
+        This IoU calculator :class:`BboxOverlapsNearest3D` uses this
+        function to calculate IoUs of boxes.
+
+        If ``is_aligned`` is ``False``, then it calculates the ious between
+        each bbox of bboxes1 and bboxes2, otherwise the ious between each
+        aligned pair of bboxes1 and bboxes2.
+
+    Args:
+        bboxes1 (torch.Tensor): with shape (N, 7+C),
+            (x, y, z, x_size, y_size, z_size, ry, v*).
+        bboxes2 (torch.Tensor): with shape (M, 7+C),
+            (x, y, z, x_size, y_size, z_size, ry, v*).
+        mode (str): "iou" (intersection over union) or iof
+            (intersection over foreground).
+        is_aligned (bool): Whether the calculation is aligned
+
+    Return:
+        torch.Tensor: If ``is_aligned`` is ``True``, return ious between
+            bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is
+            ``False``, return shape is M.
+    """
+    assert bboxes1.size(-1) == bboxes2.size(-1) >= 7
+
+    box_type, _ = get_box_type(coordinate)
+
+    bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
+    bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])
+
+    # Change the bboxes to bev
+    # box conversion and iou calculation in torch version on CUDA
+    # is 10x faster than that in numpy version
+    bboxes1_bev = bboxes1.nearest_bev
+    bboxes2_bev = bboxes2.nearest_bev
+
+    ret = bbox_overlaps(
+        bboxes1_bev, bboxes2_bev, mode=mode, is_aligned=is_aligned)
+    return ret
+
+
+def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
+    """Calculate 3D IoU using cuda implementation.
+
+    Note:
+        This function calculates the IoU of 3D boxes based on their volumes.
+        IoU calculator :class:`BboxOverlaps3D` uses this function to
+        calculate the actual IoUs of boxes.
+
+    Args:
+        bboxes1 (torch.Tensor): with shape (N, 7+C),
+            (x, y, z, x_size, y_size, z_size, ry, v*).
+        bboxes2 (torch.Tensor): with shape (M, 7+C),
+            (x, y, z, x_size, y_size, z_size, ry, v*).
+        mode (str): "iou" (intersection over union) or
+            iof (intersection over foreground).
+        coordinate (str): 'camera' or 'lidar' coordinate system.
+
+    Return:
+        torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2
+            with shape (M, N) (aligned mode is not supported currently).
+    """
+    assert bboxes1.size(-1) == bboxes2.size(-1) >= 7
+
+    box_type, _ = get_box_type(coordinate)
+
+    bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
+    bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])
+
+    return bboxes1.overlaps(bboxes1, bboxes2, mode=mode)
+
+
+@TASK_UTILS.register_module()
+class AxisAlignedBboxOverlaps3D(object):
+    """Axis-aligned 3D Overlaps (IoU) Calculator."""
+
+    def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
+        """Calculate IoU between 2D bboxes.
+
+        Args:
+            bboxes1 (Tensor): shape (B, m, 6) in <x1, y1, z1, x2, y2, z2>
+                format or empty.
+            bboxes2 (Tensor): shape (B, n, 6) in <x1, y1, z1, x2, y2, z2>
+                format or empty.
+                B indicates the batch dim, in shape (B1, B2, ..., Bn).
+                If ``is_aligned`` is ``True``, then m and n must be equal.
+            mode (str): "iou" (intersection over union) or "giou" (generalized
+                intersection over union).
+            is_aligned (bool, optional): If True, then m and n must be equal.
+                Defaults to False.
+        Returns:
+            Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+        """
+        assert bboxes1.size(-1) == bboxes2.size(-1) == 6
+        return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode,
+                                             is_aligned)
+
+    def __repr__(self):
+        """str: a string describing the module"""
+        repr_str = self.__class__.__name__ + '()'
+        return repr_str
+
+
+def axis_aligned_bbox_overlaps_3d(bboxes1,
+                                  bboxes2,
+                                  mode='iou',
+                                  is_aligned=False,
+                                  eps=1e-6):
+    """Calculate overlap between two set of axis aligned 3D bboxes. If
+    ``is_aligned`` is ``False``, then calculate the overlaps between each bbox
+    of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of
+    bboxes1 and bboxes2.
+
+    Args:
+        bboxes1 (Tensor): shape (B, m, 6) in <x1, y1, z1, x2, y2, z2>
+            format or empty.
+        bboxes2 (Tensor): shape (B, n, 6) in <x1, y1, z1, x2, y2, z2>
+            format or empty.
+            B indicates the batch dim, in shape (B1, B2, ..., Bn).
+            If ``is_aligned`` is ``True``, then m and n must be equal.
+        mode (str): "iou" (intersection over union) or "giou" (generalized
+            intersection over union).
+        is_aligned (bool, optional): If True, then m and n must be equal.
+            Defaults to False.
+        eps (float, optional): A value added to the denominator for numerical
+            stability. Defaults to 1e-6.
+
+    Returns:
+        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+
+    Example:
+        >>> bboxes1 = torch.FloatTensor([
+        >>>     [0, 0, 0, 10, 10, 10],
+        >>>     [10, 10, 10, 20, 20, 20],
+        >>>     [32, 32, 32, 38, 40, 42],
+        >>> ])
+        >>> bboxes2 = torch.FloatTensor([
+        >>>     [0, 0, 0, 10, 20, 20],
+        >>>     [0, 10, 10, 10, 19, 20],
+        >>>     [10, 10, 10, 20, 20, 20],
+        >>> ])
+        >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2)
+        >>> assert overlaps.shape == (3, 3)
+        >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
+        >>> assert overlaps.shape == (3, )
+    Example:
+        >>> empty = torch.empty(0, 6)
+        >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]])
+        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
+        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
+        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
+    """
+
+    assert mode in ['iou', 'giou'], f'Unsupported mode {mode}'
+    # Either the boxes are empty or the length of boxes's last dimension is 6
+    assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0)
+    assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0)
+
+    # Batch dim must be the same
+    # Batch dim: (B1, B2, ... Bn)
+    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
+    batch_shape = bboxes1.shape[:-2]
+
+    rows = bboxes1.size(-2)
+    cols = bboxes2.size(-2)
+    if is_aligned:
+        assert rows == cols
+
+    if rows * cols == 0:
+        if is_aligned:
+            return bboxes1.new(batch_shape + (rows, ))
+        else:
+            return bboxes1.new(batch_shape + (rows, cols))
+
+    area1 = (bboxes1[..., 3] -
+             bboxes1[..., 0]) * (bboxes1[..., 4] - bboxes1[..., 1]) * (
+                 bboxes1[..., 5] - bboxes1[..., 2])
+    area2 = (bboxes2[..., 3] -
+             bboxes2[..., 0]) * (bboxes2[..., 4] - bboxes2[..., 1]) * (
+                 bboxes2[..., 5] - bboxes2[..., 2])
+
+    if is_aligned:
+        lt = torch.max(bboxes1[..., :3], bboxes2[..., :3])  # [B, rows, 3]
+        rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:])  # [B, rows, 3]
+
+        wh = (rb - lt).clamp(min=0)  # [B, rows, 2]
+        overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]
+
+        if mode in ['iou', 'giou']:
+            union = area1 + area2 - overlap
+        else:
+            union = area1
+        if mode == 'giou':
+            enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3])
+            enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:])
+    else:
+        lt = torch.max(bboxes1[..., :, None, :3],
+                       bboxes2[..., None, :, :3])  # [B, rows, cols, 3]
+        rb = torch.min(bboxes1[..., :, None, 3:],
+                       bboxes2[..., None, :, 3:])  # [B, rows, cols, 3]
+
+        wh = (rb - lt).clamp(min=0)  # [B, rows, cols, 3]
+        overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]
+
+        if mode in ['iou', 'giou']:
+            union = area1[..., None] + area2[..., None, :] - overlap
+        if mode == 'giou':
+            enclosed_lt = torch.min(bboxes1[..., :, None, :3],
+                                    bboxes2[..., None, :, :3])
+            enclosed_rb = torch.max(bboxes1[..., :, None, 3:],
+                                    bboxes2[..., None, :, 3:])
+
+    eps = union.new_tensor([eps])
+    union = torch.max(union, eps)
+    ious = overlap / union
+    if mode in ['iou']:
+        return ious
+    # calculate gious
+    enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0)
+    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2]
+    enclose_area = torch.max(enclose_area, eps)
+    gious = ious - (enclose_area - union) / enclose_area
+    return gious
diff --git a/mmde/mmdet3d/structures/ops/transforms.py b/mmde/mmdet3d/structures/ops/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e9f7006acb38b3c6ba6d52609d9d7be03047b05
--- /dev/null
+++ b/mmde/mmdet3d/structures/ops/transforms.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical):
+    """Map bboxes from testing scale to original image scale.
+
+    Args:
+        bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back.
+        scale_factor (float): Scale factor.
+        flip_horizontal (bool): Whether to flip horizontally.
+        flip_vertical (bool): Whether to flip vertically.
+
+    Returns:
+        :obj:`BaseInstance3DBoxes`: Boxes mapped back.
+    """
+    new_bboxes = bboxes.clone()
+    if flip_horizontal:
+        new_bboxes.flip('horizontal')
+    if flip_vertical:
+        new_bboxes.flip('vertical')
+    new_bboxes.scale(1 / scale_factor)
+
+    return new_bboxes
+
+
+def bbox3d2roi(bbox_list):
+    """Convert a list of bounding boxes to roi format.
+
+    Args:
+        bbox_list (list[torch.Tensor]): A list of bounding boxes
+            corresponding to a batch of images.
+
+    Returns:
+        torch.Tensor: Region of interests in shape (n, c), where
+            the channels are in order of [batch_ind, x, y ...].
+    """
+    rois_list = []
+    for img_id, bboxes in enumerate(bbox_list):
+        if bboxes.size(0) > 0:
+            img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
+            rois = torch.cat([img_inds, bboxes], dim=-1)
+        else:
+            rois = torch.zeros_like(bboxes)
+        rois_list.append(rois)
+    rois = torch.cat(rois_list, 0)
+    return rois
+
+
+# TODO delete this
+def bbox3d2result(bboxes, scores, labels, attrs=None):
+    """Convert detection results to a list of numpy arrays.
+
+    Args:
+        bboxes (torch.Tensor): Bounding boxes with shape (N, 5).
+        labels (torch.Tensor): Labels with shape (N, ).
+        scores (torch.Tensor): Scores with shape (N, ).
+        attrs (torch.Tensor, optional): Attributes with shape (N, ).
+            Defaults to None.
+
+    Returns:
+        dict[str, torch.Tensor]: Bounding box results in cpu mode.
+
+            - boxes_3d (torch.Tensor): 3D boxes.
+            - scores (torch.Tensor): Prediction scores.
+            - labels_3d (torch.Tensor): Box labels.
+            - attrs_3d (torch.Tensor, optional): Box attributes.
+    """
+    result_dict = dict(
+        bboxes_3d=bboxes.to('cpu'),
+        scores_3d=scores.cpu(),
+        labels_3d=labels.cpu())
+
+    if attrs is not None:
+        result_dict['attr_labels'] = attrs.cpu()
+
+    return result_dict
diff --git a/mmde/mmdet3d/structures/point_data.py b/mmde/mmdet3d/structures/point_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..f12d4c8692e968970ec8e70c0f49d7b81e035dc5
--- /dev/null
+++ b/mmde/mmdet3d/structures/point_data.py
@@ -0,0 +1,161 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections.abc import Sized
+from typing import Union
+
+import numpy as np
+import torch
+from mmengine.structures import BaseDataElement
+
+IndexType = Union[str, slice, int, list, torch.LongTensor,
+                  torch.cuda.LongTensor, torch.BoolTensor,
+                  torch.cuda.BoolTensor, np.ndarray]
+
+
+class PointData(BaseDataElement):
+    """Data structure for point-level annotations or predictions.
+
+    All data items in ``data_fields`` of ``PointData`` meet the following
+    requirements:
+
+    - They are all one dimension.
+    - They should have the same length.
+
+    `PointData` is used to save point-level semantic and instance mask,
+    it also can save `instances_labels` and `instances_scores` temporarily.
+    In the future, we would consider to move the instance-level info into
+    `gt_instances_3d` and `pred_instances_3d`.
+
+    Examples:
+        >>> metainfo = dict(
+        ...     sample_idx=random.randint(0, 100))
+        >>> points = np.random.randint(0, 255, (100, 3))
+        >>> point_data = PointData(metainfo=metainfo,
+        ...                        points=points)
+        >>> print(len(point_data))
+        100
+
+        >>> # slice
+        >>> slice_data = point_data[10:60]
+        >>> assert len(slice_data) == 50
+
+        >>> # set
+        >>> point_data.pts_semantic_mask = torch.randint(0, 255, (100,))
+        >>> point_data.pts_instance_mask = torch.randint(0, 255, (100,))
+        >>> assert tuple(point_data.pts_semantic_mask.shape) == (100,)
+        >>> assert tuple(point_data.pts_instance_mask.shape) == (100,)
+    """
+
+    def __setattr__(self, name: str, value: Sized) -> None:
+        """setattr is only used to set data.
+
+        The value must have the attribute of `__len__` and have the same length
+        of `PointData`.
+        """
+        if name in ('_metainfo_fields', '_data_fields'):
+            if not hasattr(self, name):
+                super().__setattr__(name, value)
+            else:
+                raise AttributeError(f'{name} has been used as a '
+                                     'private attribute, which is immutable.')
+
+        else:
+            assert isinstance(value,
+                              Sized), 'value must contain `__len__` attribute'
+            # TODO: make sure the input value share the same length
+            super().__setattr__(name, value)
+
+    __setitem__ = __setattr__
+
+    def __getitem__(self, item: IndexType) -> 'PointData':
+        """
+        Args:
+            item (str, int, list, :obj:`slice`, :obj:`numpy.ndarray`,
+                :obj:`torch.LongTensor`, :obj:`torch.BoolTensor`):
+                Get the corresponding values according to item.
+
+        Returns:
+            :obj:`PointData`: Corresponding values.
+        """
+        if isinstance(item, list):
+            item = np.array(item)
+        if isinstance(item, np.ndarray):
+            # The default int type of numpy is platform dependent, int32 for
+            # windows and int64 for linux. `torch.Tensor` requires the index
+            # should be int64, therefore we simply convert it to int64 here.
+            # Mode details in https://github.com/numpy/numpy/issues/9464
+            item = item.astype(np.int64) if item.dtype == np.int32 else item
+            item = torch.from_numpy(item)
+        assert isinstance(
+            item, (str, slice, int, torch.LongTensor, torch.cuda.LongTensor,
+                   torch.BoolTensor, torch.cuda.BoolTensor))
+
+        if isinstance(item, str):
+            return getattr(self, item)
+
+        if isinstance(item, int):
+            if item >= len(self) or item < -len(self):  # type: ignore
+                raise IndexError(f'Index {item} out of range!')
+            else:
+                # keep the dimension
+                item = slice(item, None, len(self))
+
+        new_data = self.__class__(metainfo=self.metainfo)
+        if isinstance(item, torch.Tensor):
+            assert item.dim() == 1, 'Only support to get the' \
+                                    ' values along the first dimension.'
+            if isinstance(item, (torch.BoolTensor, torch.cuda.BoolTensor)):
+                assert len(item) == len(self), 'The shape of the ' \
+                                               'input(BoolTensor) ' \
+                                               f'{len(item)} ' \
+                                               'does not match the shape ' \
+                                               'of the indexed tensor ' \
+                                               'in results_field ' \
+                                               f'{len(self)} at ' \
+                                               'first dimension.'
+
+            for k, v in self.items():
+                if isinstance(v, torch.Tensor):
+                    new_data[k] = v[item]
+                elif isinstance(v, np.ndarray):
+                    new_data[k] = v[item.cpu().numpy()]
+                elif isinstance(
+                        v, (str, list, tuple)) or (hasattr(v, '__getitem__')
+                                                   and hasattr(v, 'cat')):
+                    # convert to indexes from BoolTensor
+                    if isinstance(item,
+                                  (torch.BoolTensor, torch.cuda.BoolTensor)):
+                        indexes = torch.nonzero(item).view(
+                            -1).cpu().numpy().tolist()
+                    else:
+                        indexes = item.cpu().numpy().tolist()
+                    slice_list = []
+                    if indexes:
+                        for index in indexes:
+                            slice_list.append(slice(index, None, len(v)))
+                    else:
+                        slice_list.append(slice(None, 0, None))
+                    r_list = [v[s] for s in slice_list]
+                    if isinstance(v, (str, list, tuple)):
+                        new_value = r_list[0]
+                        for r in r_list[1:]:
+                            new_value = new_value + r
+                    else:
+                        new_value = v.cat(r_list)
+                    new_data[k] = new_value
+                else:
+                    raise ValueError(
+                        f'The type of `{k}` is `{type(v)}`, which has no '
+                        'attribute of `cat`, so it does not '
+                        'support slice with `bool`')
+        else:
+            # item is a slice
+            for k, v in self.items():
+                new_data[k] = v[item]
+        return new_data  # type: ignore
+
+    def __len__(self) -> int:
+        """int: The length of `PointData`."""
+        if len(self._data_fields) > 0:
+            return len(self.values()[0])
+        else:
+            return 0
diff --git a/mmde/mmdet3d/structures/points/__init__.py b/mmde/mmdet3d/structures/points/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eedae146d0b41df94ae3bb86f3992a7a8e7d27c4
--- /dev/null
+++ b/mmde/mmdet3d/structures/points/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_points import BasePoints
+from .cam_points import CameraPoints
+from .depth_points import DepthPoints
+from .lidar_points import LiDARPoints
+
+__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints']
+
+
+def get_points_type(points_type: str) -> type:
+    """Get the class of points according to coordinate type.
+
+    Args:
+        points_type (str): The type of points coordinate. The valid value are
+            "CAMERA", "LIDAR" and "DEPTH".
+
+    Returns:
+        type: Points type.
+    """
+    points_type_upper = points_type.upper()
+    if points_type_upper == 'CAMERA':
+        points_cls = CameraPoints
+    elif points_type_upper == 'LIDAR':
+        points_cls = LiDARPoints
+    elif points_type_upper == 'DEPTH':
+        points_cls = DepthPoints
+    else:
+        raise ValueError('Only "points_type" of "CAMERA", "LIDAR" and "DEPTH" '
+                         f'are supported, got {points_type}')
+
+    return points_cls
diff --git a/mmde/mmdet3d/structures/points/base_points.py b/mmde/mmdet3d/structures/points/base_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cb54ce895155beb71181fd83e897a5e9b83afb0
--- /dev/null
+++ b/mmde/mmdet3d/structures/points/base_points.py
@@ -0,0 +1,523 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from abc import abstractmethod
+from typing import Iterator, Optional, Sequence, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet3d.structures.bbox_3d.utils import rotation_3d_in_axis
+
+
+class BasePoints:
+    """Base class for Points.
+
+    Args:
+        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
+            data with shape (N, points_dim).
+        points_dim (int): Integer indicating the dimension of a point. Each row
+            is (x, y, z, ...). Defaults to 3.
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Defaults to None.
+
+    Attributes:
+        tensor (Tensor): Float matrix with shape (N, points_dim).
+        points_dim (int): Integer indicating the dimension of a point. Each row
+            is (x, y, z, ...).
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Defaults to None.
+        rotation_axis (int): Default rotation axis for points rotation.
+    """
+
+    def __init__(self,
+                 tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
+                 points_dim: int = 3,
+                 attribute_dims: Optional[dict] = None) -> None:
+        if isinstance(tensor, Tensor):
+            device = tensor.device
+        else:
+            device = torch.device('cpu')
+        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+        if tensor.numel() == 0:
+            # Use reshape, so we don't end up creating a new tensor that does
+            # not depend on the inputs (and consequently confuses jit)
+            tensor = tensor.reshape((-1, points_dim))
+        assert tensor.dim() == 2 and tensor.size(-1) == points_dim, \
+            ('The points dimension must be 2 and the length of the last '
+             f'dimension must be {points_dim}, but got points with shape '
+             f'{tensor.shape}.')
+
+        self.tensor = tensor.clone()
+        self.points_dim = points_dim
+        self.attribute_dims = attribute_dims
+        self.rotation_axis = 0
+
+    @property
+    def coord(self) -> Tensor:
+        """Tensor: Coordinates of each point in shape (N, 3)."""
+        return self.tensor[:, :3]
+
+    @coord.setter
+    def coord(self, tensor: Union[Tensor, np.ndarray]) -> None:
+        """Set the coordinates of each point.
+
+        Args:
+            tensor (Tensor or np.ndarray): Coordinates of each point with shape
+                (N, 3).
+        """
+        try:
+            tensor = tensor.reshape(self.shape[0], 3)
+        except (RuntimeError, ValueError):  # for torch.Tensor and np.ndarray
+            raise ValueError(f'got unexpected shape {tensor.shape}')
+        if not isinstance(tensor, Tensor):
+            tensor = self.tensor.new_tensor(tensor)
+        self.tensor[:, :3] = tensor
+
+    @property
+    def height(self) -> Union[Tensor, None]:
+        """Tensor or None: Returns a vector with height of each point in shape
+        (N, )."""
+        if self.attribute_dims is not None and \
+                'height' in self.attribute_dims.keys():
+            return self.tensor[:, self.attribute_dims['height']]
+        else:
+            return None
+
+    @height.setter
+    def height(self, tensor: Union[Tensor, np.ndarray]) -> None:
+        """Set the height of each point.
+
+        Args:
+            tensor (Tensor or np.ndarray): Height of each point with shape
+                (N, ).
+        """
+        try:
+            tensor = tensor.reshape(self.shape[0])
+        except (RuntimeError, ValueError):  # for torch.Tensor and np.ndarray
+            raise ValueError(f'got unexpected shape {tensor.shape}')
+        if not isinstance(tensor, Tensor):
+            tensor = self.tensor.new_tensor(tensor)
+        if self.attribute_dims is not None and \
+                'height' in self.attribute_dims.keys():
+            self.tensor[:, self.attribute_dims['height']] = tensor
+        else:
+            # add height attribute
+            if self.attribute_dims is None:
+                self.attribute_dims = dict()
+            attr_dim = self.shape[1]
+            self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1)
+            self.attribute_dims.update(dict(height=attr_dim))
+            self.points_dim += 1
+
+    @property
+    def color(self) -> Union[Tensor, None]:
+        """Tensor or None: Returns a vector with color of each point in shape
+        (N, 3)."""
+        if self.attribute_dims is not None and \
+                'color' in self.attribute_dims.keys():
+            return self.tensor[:, self.attribute_dims['color']]
+        else:
+            return None
+
+    @color.setter
+    def color(self, tensor: Union[Tensor, np.ndarray]) -> None:
+        """Set the color of each point.
+
+        Args:
+            tensor (Tensor or np.ndarray): Color of each point with shape
+                (N, 3).
+        """
+        try:
+            tensor = tensor.reshape(self.shape[0], 3)
+        except (RuntimeError, ValueError):  # for torch.Tensor and np.ndarray
+            raise ValueError(f'got unexpected shape {tensor.shape}')
+        if tensor.max() >= 256 or tensor.min() < 0:
+            warnings.warn('point got color value beyond [0, 255]')
+        if not isinstance(tensor, Tensor):
+            tensor = self.tensor.new_tensor(tensor)
+        if self.attribute_dims is not None and \
+                'color' in self.attribute_dims.keys():
+            self.tensor[:, self.attribute_dims['color']] = tensor
+        else:
+            # add color attribute
+            if self.attribute_dims is None:
+                self.attribute_dims = dict()
+            attr_dim = self.shape[1]
+            self.tensor = torch.cat([self.tensor, tensor], dim=1)
+            self.attribute_dims.update(
+                dict(color=[attr_dim, attr_dim + 1, attr_dim + 2]))
+            self.points_dim += 3
+
+    @property
+    def shape(self) -> torch.Size:
+        """torch.Size: Shape of points."""
+        return self.tensor.shape
+
+    def shuffle(self) -> Tensor:
+        """Shuffle the points.
+
+        Returns:
+            Tensor: The shuffled index.
+        """
+        idx = torch.randperm(self.__len__(), device=self.tensor.device)
+        self.tensor = self.tensor[idx]
+        return idx
+
+    def rotate(self,
+               rotation: Union[Tensor, np.ndarray, float],
+               axis: Optional[int] = None) -> Tensor:
+        """Rotate points with the given rotation matrix or angle.
+
+        Args:
+            rotation (Tensor or np.ndarray or float): Rotation matrix or angle.
+            axis (int, optional): Axis to rotate at. Defaults to None.
+
+        Returns:
+            Tensor: Rotation matrix.
+        """
+        if not isinstance(rotation, Tensor):
+            rotation = self.tensor.new_tensor(rotation)
+        assert rotation.shape == torch.Size([3, 3]) or rotation.numel() == 1, \
+            f'invalid rotation shape {rotation.shape}'
+
+        if axis is None:
+            axis = self.rotation_axis
+
+        if rotation.numel() == 1:
+            rotated_points, rot_mat_T = rotation_3d_in_axis(
+                self.tensor[:, :3][None], rotation, axis=axis, return_mat=True)
+            self.tensor[:, :3] = rotated_points.squeeze(0)
+            rot_mat_T = rot_mat_T.squeeze(0)
+        else:
+            # rotation.numel() == 9
+            self.tensor[:, :3] = self.tensor[:, :3] @ rotation
+            rot_mat_T = rotation
+
+        return rot_mat_T
+
+    @abstractmethod
+    def flip(self, bev_direction: str = 'horizontal') -> None:
+        """Flip the points along given BEV direction.
+
+        Args:
+            bev_direction (str): Flip direction (horizontal or vertical).
+                Defaults to 'horizontal'.
+        """
+        pass
+
+    def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None:
+        """Translate points with the given translation vector.
+
+        Args:
+            trans_vector (Tensor or np.ndarray): Translation vector of size 3
+                or nx3.
+        """
+        if not isinstance(trans_vector, Tensor):
+            trans_vector = self.tensor.new_tensor(trans_vector)
+        trans_vector = trans_vector.squeeze(0)
+        if trans_vector.dim() == 1:
+            assert trans_vector.shape[0] == 3
+        elif trans_vector.dim() == 2:
+            assert trans_vector.shape[0] == self.tensor.shape[0] and \
+                trans_vector.shape[1] == 3
+        else:
+            raise NotImplementedError(
+                f'Unsupported translation vector of shape {trans_vector.shape}'
+            )
+        self.tensor[:, :3] += trans_vector
+
+    def in_range_3d(
+            self, point_range: Union[Tensor, np.ndarray,
+                                     Sequence[float]]) -> Tensor:
+        """Check whether the points are in the given range.
+
+        Args:
+            point_range (Tensor or np.ndarray or Sequence[float]): The range of
+                point (x_min, y_min, z_min, x_max, y_max, z_max).
+
+        Note:
+            In the original implementation of SECOND, checking whether a box in
+            the range checks whether the points are in a convex polygon, we try
+            to reduce the burden for simpler cases.
+
+        Returns:
+            Tensor: A binary vector indicating whether each point is inside the
+            reference range.
+        """
+        in_range_flags = ((self.tensor[:, 0] > point_range[0])
+                          & (self.tensor[:, 1] > point_range[1])
+                          & (self.tensor[:, 2] > point_range[2])
+                          & (self.tensor[:, 0] < point_range[3])
+                          & (self.tensor[:, 1] < point_range[4])
+                          & (self.tensor[:, 2] < point_range[5]))
+        return in_range_flags
+
+    @property
+    def bev(self) -> Tensor:
+        """Tensor: BEV of the points in shape (N, 2)."""
+        return self.tensor[:, [0, 1]]
+
+    def in_range_bev(
+            self, point_range: Union[Tensor, np.ndarray,
+                                     Sequence[float]]) -> Tensor:
+        """Check whether the points are in the given range.
+
+        Args:
+            point_range (Tensor or np.ndarray or Sequence[float]): The range of
+                point in order of (x_min, y_min, x_max, y_max).
+
+        Returns:
+            Tensor: A binary vector indicating whether each point is inside the
+            reference range.
+        """
+        in_range_flags = ((self.bev[:, 0] > point_range[0])
+                          & (self.bev[:, 1] > point_range[1])
+                          & (self.bev[:, 0] < point_range[2])
+                          & (self.bev[:, 1] < point_range[3]))
+        return in_range_flags
+
+    @abstractmethod
+    def convert_to(self,
+                   dst: int,
+                   rt_mat: Optional[Union[Tensor,
+                                          np.ndarray]] = None) -> 'BasePoints':
+        """Convert self to ``dst`` mode.
+
+        Args:
+            dst (int): The target Point mode.
+            rt_mat (Tensor or np.ndarray, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+
+        Returns:
+            :obj:`BasePoints`: The converted point of the same type in the
+            ``dst`` mode.
+        """
+        pass
+
+    def scale(self, scale_factor: float) -> None:
+        """Scale the points with horizontal and vertical scaling factors.
+
+        Args:
+            scale_factors (float): Scale factors to scale the points.
+        """
+        self.tensor[:, :3] *= scale_factor
+
+    def __getitem__(
+            self, item: Union[int, tuple, slice, np.ndarray,
+                              Tensor]) -> 'BasePoints':
+        """
+        Args:
+            item (int or tuple or slice or np.ndarray or Tensor): Index of
+                points.
+
+        Note:
+            The following usage are allowed:
+
+            1. `new_points = points[3]`: Return a `Points` that contains only
+               one point.
+            2. `new_points = points[2:10]`: Return a slice of points.
+            3. `new_points = points[vector]`: Whether vector is a
+               torch.BoolTensor with `length = len(points)`. Nonzero elements
+               in the vector will be selected.
+            4. `new_points = points[3:11, vector]`: Return a slice of points
+               and attribute dims.
+            5. `new_points = points[4:12, 2]`: Return a slice of points with
+               single attribute.
+
+            Note that the returned Points might share storage with this Points,
+            subject to PyTorch's indexing semantics.
+
+        Returns:
+            :obj:`BasePoints`: A new object of :class:`BasePoints` after
+            indexing.
+        """
+        original_type = type(self)
+        if isinstance(item, int):
+            return original_type(
+                self.tensor[item].view(1, -1),
+                points_dim=self.points_dim,
+                attribute_dims=self.attribute_dims)
+        elif isinstance(item, tuple) and len(item) == 2:
+            if isinstance(item[1], slice):
+                start = 0 if item[1].start is None else item[1].start
+                stop = self.tensor.shape[1] \
+                    if item[1].stop is None else item[1].stop
+                step = 1 if item[1].step is None else item[1].step
+                item = list(item)
+                item[1] = list(range(start, stop, step))
+                item = tuple(item)
+            elif isinstance(item[1], int):
+                item = list(item)
+                item[1] = [item[1]]
+                item = tuple(item)
+            p = self.tensor[item[0], item[1]]
+
+            keep_dims = list(
+                set(item[1]).intersection(set(range(3, self.tensor.shape[1]))))
+            if self.attribute_dims is not None:
+                attribute_dims = self.attribute_dims.copy()
+                for key in self.attribute_dims.keys():
+                    cur_attribute_dims = attribute_dims[key]
+                    if isinstance(cur_attribute_dims, int):
+                        cur_attribute_dims = [cur_attribute_dims]
+                    intersect_attr = list(
+                        set(cur_attribute_dims).intersection(set(keep_dims)))
+                    if len(intersect_attr) == 1:
+                        attribute_dims[key] = intersect_attr[0]
+                    elif len(intersect_attr) > 1:
+                        attribute_dims[key] = intersect_attr
+                    else:
+                        attribute_dims.pop(key)
+            else:
+                attribute_dims = None
+        elif isinstance(item, (slice, np.ndarray, Tensor)):
+            p = self.tensor[item]
+            attribute_dims = self.attribute_dims
+        else:
+            raise NotImplementedError(f'Invalid slice {item}!')
+
+        assert p.dim() == 2, \
+            f'Indexing on Points with {item} failed to return a matrix!'
+        return original_type(
+            p, points_dim=p.shape[1], attribute_dims=attribute_dims)
+
+    def __len__(self) -> int:
+        """int: Number of points in the current object."""
+        return self.tensor.shape[0]
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the object."""
+        return self.__class__.__name__ + '(\n    ' + str(self.tensor) + ')'
+
+    @classmethod
+    def cat(cls, points_list: Sequence['BasePoints']) -> 'BasePoints':
+        """Concatenate a list of Points into a single Points.
+
+        Args:
+            points_list (Sequence[:obj:`BasePoints`]): List of points.
+
+        Returns:
+            :obj:`BasePoints`: The concatenated points.
+        """
+        assert isinstance(points_list, (list, tuple))
+        if len(points_list) == 0:
+            return cls(torch.empty(0))
+        assert all(isinstance(points, cls) for points in points_list)
+
+        # use torch.cat (v.s. layers.cat)
+        # so the returned points never share storage with input
+        cat_points = cls(
+            torch.cat([p.tensor for p in points_list], dim=0),
+            points_dim=points_list[0].points_dim,
+            attribute_dims=points_list[0].attribute_dims)
+        return cat_points
+
+    def numpy(self) -> np.ndarray:
+        """Reload ``numpy`` from self.tensor."""
+        return self.tensor.numpy()
+
+    def to(self, device: Union[str, torch.device], *args,
+           **kwargs) -> 'BasePoints':
+        """Convert current points to a specific device.
+
+        Args:
+            device (str or :obj:`torch.device`): The name of the device.
+
+        Returns:
+            :obj:`BasePoints`: A new points object on the specific device.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.to(device, *args, **kwargs),
+            points_dim=self.points_dim,
+            attribute_dims=self.attribute_dims)
+
+    def cpu(self) -> 'BasePoints':
+        """Convert current points to cpu device.
+
+        Returns:
+            :obj:`BasePoints`: A new points object on the cpu device.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.cpu(),
+            points_dim=self.points_dim,
+            attribute_dims=self.attribute_dims)
+
+    def cuda(self, *args, **kwargs) -> 'BasePoints':
+        """Convert current points to cuda device.
+
+        Returns:
+            :obj:`BasePoints`: A new points object on the cuda device.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.cuda(*args, **kwargs),
+            points_dim=self.points_dim,
+            attribute_dims=self.attribute_dims)
+
+    def clone(self) -> 'BasePoints':
+        """Clone the points.
+
+        Returns:
+            :obj:`BasePoints`: Point object with the same properties as self.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.clone(),
+            points_dim=self.points_dim,
+            attribute_dims=self.attribute_dims)
+
+    def detach(self) -> 'BasePoints':
+        """Detach the points.
+
+        Returns:
+            :obj:`BasePoints`: Point object with the same properties as self.
+        """
+        original_type = type(self)
+        return original_type(
+            self.tensor.detach(),
+            points_dim=self.points_dim,
+            attribute_dims=self.attribute_dims)
+
+    @property
+    def device(self) -> torch.device:
+        """torch.device: The device of the points are on."""
+        return self.tensor.device
+
+    def __iter__(self) -> Iterator[Tensor]:
+        """Yield a point as a Tensor at a time.
+
+        Returns:
+            Iterator[Tensor]: A point of shape (points_dim, ).
+        """
+        yield from self.tensor
+
+    def new_point(
+        self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]]
+    ) -> 'BasePoints':
+        """Create a new point object with data.
+
+        The new point and its tensor has the similar properties as self and
+        self.tensor, respectively.
+
+        Args:
+            data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to
+                be copied.
+
+        Returns:
+            :obj:`BasePoints`: A new point object with ``data``, the object's
+            other properties are similar to ``self``.
+        """
+        new_tensor = self.tensor.new_tensor(data) \
+            if not isinstance(data, Tensor) else data.to(self.device)
+        original_type = type(self)
+        return original_type(
+            new_tensor,
+            points_dim=self.points_dim,
+            attribute_dims=self.attribute_dims)
diff --git a/mmde/mmdet3d/structures/points/cam_points.py b/mmde/mmdet3d/structures/points/cam_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcf290d348bc30175a626006923fb920eb9eaa48
--- /dev/null
+++ b/mmde/mmdet3d/structures/points/cam_points.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+import numpy as np
+from torch import Tensor
+
+from .base_points import BasePoints
+
+
+class CameraPoints(BasePoints):
+    """Points of instances in CAM coordinates.
+
+    Args:
+        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
+            data with shape (N, points_dim).
+        points_dim (int): Integer indicating the dimension of a point. Each row
+            is (x, y, z, ...). Defaults to 3.
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Defaults to None.
+
+    Attributes:
+        tensor (Tensor): Float matrix with shape (N, points_dim).
+        points_dim (int): Integer indicating the dimension of a point. Each row
+            is (x, y, z, ...).
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Defaults to None.
+        rotation_axis (int): Default rotation axis for points rotation.
+    """
+
+    def __init__(self,
+                 tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
+                 points_dim: int = 3,
+                 attribute_dims: Optional[dict] = None) -> None:
+        super(CameraPoints, self).__init__(
+            tensor, points_dim=points_dim, attribute_dims=attribute_dims)
+        self.rotation_axis = 1
+
+    def flip(self, bev_direction: str = 'horizontal') -> None:
+        """Flip the points along given BEV direction.
+
+        Args:
+            bev_direction (str): Flip direction (horizontal or vertical).
+                Defaults to 'horizontal'.
+        """
+        assert bev_direction in ('horizontal', 'vertical')
+        if bev_direction == 'horizontal':
+            self.tensor[:, 0] = -self.tensor[:, 0]
+        elif bev_direction == 'vertical':
+            self.tensor[:, 2] = -self.tensor[:, 2]
+
+    @property
+    def bev(self) -> Tensor:
+        """Tensor: BEV of the points in shape (N, 2)."""
+        return self.tensor[:, [0, 2]]
+
+    def convert_to(self,
+                   dst: int,
+                   rt_mat: Optional[Union[Tensor,
+                                          np.ndarray]] = None) -> 'BasePoints':
+        """Convert self to ``dst`` mode.
+
+        Args:
+            dst (int): The target Point mode.
+            rt_mat (Tensor or np.ndarray, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+
+        Returns:
+            :obj:`BasePoints`: The converted point of the same type in the
+            ``dst`` mode.
+        """
+        from mmdet3d.structures.bbox_3d import Coord3DMode
+        return Coord3DMode.convert_point(
+            point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat)
diff --git a/mmde/mmdet3d/structures/points/depth_points.py b/mmde/mmdet3d/structures/points/depth_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..40f264370b907de434924254f8f702b9069af0c8
--- /dev/null
+++ b/mmde/mmdet3d/structures/points/depth_points.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+import numpy as np
+from torch import Tensor
+
+from .base_points import BasePoints
+
+
+class DepthPoints(BasePoints):
+    """Points of instances in DEPTH coordinates.
+
+    Args:
+        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
+            data with shape (N, points_dim).
+        points_dim (int): Integer indicating the dimension of a point. Each row
+            is (x, y, z, ...). Defaults to 3.
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Defaults to None.
+
+    Attributes:
+        tensor (Tensor): Float matrix with shape (N, points_dim).
+        points_dim (int): Integer indicating the dimension of a point. Each row
+            is (x, y, z, ...).
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Defaults to None.
+        rotation_axis (int): Default rotation axis for points rotation.
+    """
+
+    def __init__(self,
+                 tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
+                 points_dim: int = 3,
+                 attribute_dims: Optional[dict] = None) -> None:
+        super(DepthPoints, self).__init__(
+            tensor, points_dim=points_dim, attribute_dims=attribute_dims)
+        self.rotation_axis = 2
+
+    def flip(self, bev_direction: str = 'horizontal') -> None:
+        """Flip the points along given BEV direction.
+
+        Args:
+            bev_direction (str): Flip direction (horizontal or vertical).
+                Defaults to 'horizontal'.
+        """
+        assert bev_direction in ('horizontal', 'vertical')
+        if bev_direction == 'horizontal':
+            self.tensor[:, 0] = -self.tensor[:, 0]
+        elif bev_direction == 'vertical':
+            self.tensor[:, 1] = -self.tensor[:, 1]
+
+    def convert_to(self,
+                   dst: int,
+                   rt_mat: Optional[Union[Tensor,
+                                          np.ndarray]] = None) -> 'BasePoints':
+        """Convert self to ``dst`` mode.
+
+        Args:
+            dst (int): The target Point mode.
+            rt_mat (Tensor or np.ndarray, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+
+        Returns:
+            :obj:`BasePoints`: The converted point of the same type in the
+            ``dst`` mode.
+        """
+        from mmdet3d.structures.bbox_3d import Coord3DMode
+        return Coord3DMode.convert_point(
+            point=self, src=Coord3DMode.DEPTH, dst=dst, rt_mat=rt_mat)
diff --git a/mmde/mmdet3d/structures/points/lidar_points.py b/mmde/mmdet3d/structures/points/lidar_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..2035290e0ebc22943fe2e016abe09f41b2f01f14
--- /dev/null
+++ b/mmde/mmdet3d/structures/points/lidar_points.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+import numpy as np
+from torch import Tensor
+
+from .base_points import BasePoints
+
+
+class LiDARPoints(BasePoints):
+    """Points of instances in LIDAR coordinates.
+
+    Args:
+        tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points
+            data with shape (N, points_dim).
+        points_dim (int): Integer indicating the dimension of a point. Each row
+            is (x, y, z, ...). Defaults to 3.
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Defaults to None.
+
+    Attributes:
+        tensor (Tensor): Float matrix with shape (N, points_dim).
+        points_dim (int): Integer indicating the dimension of a point. Each row
+            is (x, y, z, ...).
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Defaults to None.
+        rotation_axis (int): Default rotation axis for points rotation.
+    """
+
+    def __init__(self,
+                 tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]],
+                 points_dim: int = 3,
+                 attribute_dims: Optional[dict] = None) -> None:
+        super(LiDARPoints, self).__init__(
+            tensor, points_dim=points_dim, attribute_dims=attribute_dims)
+        self.rotation_axis = 2
+
+    def flip(self, bev_direction: str = 'horizontal') -> None:
+        """Flip the points along given BEV direction.
+
+        Args:
+            bev_direction (str): Flip direction (horizontal or vertical).
+                Defaults to 'horizontal'.
+        """
+        assert bev_direction in ('horizontal', 'vertical')
+        if bev_direction == 'horizontal':
+            self.tensor[:, 1] = -self.tensor[:, 1]
+        elif bev_direction == 'vertical':
+            self.tensor[:, 0] = -self.tensor[:, 0]
+
+    def convert_to(self,
+                   dst: int,
+                   rt_mat: Optional[Union[Tensor,
+                                          np.ndarray]] = None) -> 'BasePoints':
+        """Convert self to ``dst`` mode.
+
+        Args:
+            dst (int): The target Point mode.
+            rt_mat (Tensor or np.ndarray, optional): The rotation and
+                translation matrix between different coordinates.
+                Defaults to None. The conversion from ``src`` coordinates to
+                ``dst`` coordinates usually comes along the change of sensors,
+                e.g., from camera to LiDAR. This requires a transformation
+                matrix.
+
+        Returns:
+            :obj:`BasePoints`: The converted point of the same type in the
+            ``dst`` mode.
+        """
+        from mmdet3d.structures.bbox_3d import Coord3DMode
+        return Coord3DMode.convert_point(
+            point=self, src=Coord3DMode.LIDAR, dst=dst, rt_mat=rt_mat)
diff --git a/mmde/mmdet3d/testing/__init__.py b/mmde/mmdet3d/testing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0674a9422b0fc3b812b015deecdfb129dd23f349
--- /dev/null
+++ b/mmde/mmdet3d/testing/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .data_utils import (create_data_info_after_loading,
+                         create_dummy_data_info,
+                         create_mono3d_data_info_after_loading)
+from .model_utils import (create_detector_inputs, get_detector_cfg,
+                          get_model_cfg, setup_seed)
+
+__all__ = [
+    'create_dummy_data_info', 'create_data_info_after_loading',
+    'create_mono3d_data_info_after_loading', 'create_detector_inputs',
+    'get_detector_cfg', 'get_model_cfg', 'setup_seed'
+]
diff --git a/mmde/mmdet3d/testing/data_utils.py b/mmde/mmdet3d/testing/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..952a6cf2be4884418ec4ec9e2199fb99606b6e51
--- /dev/null
+++ b/mmde/mmdet3d/testing/data_utils.py
@@ -0,0 +1,196 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+# create a dummy `results` to test the pipeline
+from mmdet3d.datasets import LoadAnnotations3D, LoadPointsFromFile
+from mmdet3d.datasets.transforms.loading import LoadImageFromFileMono3D
+from mmdet3d.structures import LiDARInstance3DBoxes
+
+
+def create_dummy_data_info(with_ann=True):
+
+    ann_info = {
+        'gt_bboxes':
+        np.array([[712.4, 143., 810.73, 307.92]]),
+        'gt_labels':
+        np.array([1]),
+        'gt_bboxes_3d':
+        LiDARInstance3DBoxes(
+            np.array(
+                [[8.7314, -1.8559, -1.5997, 1.2000, 0.4800, 1.8900,
+                  -1.5808]])),
+        'gt_labels_3d':
+        np.array([1]),
+        'centers_2d':
+        np.array([[765.04, 214.56]]),
+        'depths':
+        np.array([8.410]),
+        'num_lidar_pts':
+        np.array([377]),
+        'difficulty':
+        np.array([0]),
+        'truncated':
+        np.array([0]),
+        'occluded':
+        np.array([0]),
+        'alpha':
+        np.array([-0.2]),
+        'score':
+        np.array([0.]),
+        'index':
+        np.array([0]),
+        'group_id':
+        np.array([0])
+    }
+    data_info = {
+        'sample_id':
+        0,
+        'images': {
+            'CAM0': {
+                'cam2img': [[707.0493, 0.0, 604.0814, 0.0],
+                            [0.0, 707.0493, 180.5066, 0.0],
+                            [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+            },
+            'CAM1': {
+                'cam2img': [[707.0493, 0.0, 604.0814, -379.7842],
+                            [0.0, 707.0493, 180.5066, 0.0],
+                            [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+            },
+            'CAM2': {
+                'img_path':
+                'tests/data/kitti/training/image_2/000000.png',
+                'height':
+                370,
+                'width':
+                1224,
+                'cam2img': [[707.0493, 0.0, 604.0814, 45.75831],
+                            [0.0, 707.0493, 180.5066, -0.3454157],
+                            [0.0, 0.0, 1.0, 0.004981016], [0.0, 0.0, 0.0, 1.0]]
+            },
+            'CAM3': {
+                'cam2img': [[707.0493, 0.0, 604.0814, -334.1081],
+                            [0.0, 707.0493, 180.5066, 2.33066],
+                            [0.0, 0.0, 1.0, 0.003201153], [0.0, 0.0, 0.0, 1.0]]
+            },
+            'R0_rect': [[
+                0.9999127984046936, 0.010092630051076412,
+                -0.008511931635439396, 0.0
+            ],
+                        [
+                            -0.010127290152013302, 0.9999405741691589,
+                            -0.004037670791149139, 0.0
+                        ],
+                        [
+                            0.008470674976706505, 0.0041235219687223434,
+                            0.9999555945396423, 0.0
+                        ], [0.0, 0.0, 0.0, 1.0]]
+        },
+        'lidar_points': {
+            'num_pts_feats':
+            4,
+            'lidar_path':
+            'tests/data/kitti/training/velodyne_reduced/000000.bin',
+            'lidar2cam': [[
+                -0.0015960992313921452, -0.9999162554740906,
+                -0.012840436771512032, -0.022366708144545555
+            ],
+                          [
+                              -0.00527064548805356, 0.012848696671426296,
+                              -0.9999035596847534, -0.05967890843749046
+                          ],
+                          [
+                              0.9999848008155823, -0.0015282672829926014,
+                              -0.005290712229907513, -0.33254900574684143
+                          ], [0.0, 0.0, 0.0, 1.0]],
+            'Tr_velo_to_cam': [[
+                0.006927963811904192, -0.9999722242355347, -0.0027578289154917,
+                -0.024577289819717407
+            ],
+                               [
+                                   -0.0011629819637164474,
+                                   0.0027498360723257065, -0.9999955296516418,
+                                   -0.06127237156033516
+                               ],
+                               [
+                                   0.999975323677063, 0.006931141018867493,
+                                   -0.0011438990477472544, -0.33210289478302
+                               ], [0.0, 0.0, 0.0, 1.0]],
+            'Tr_imu_to_velo': [[
+                0.999997615814209, 0.0007553070900030434,
+                -0.002035825978964567, -0.8086758852005005
+            ],
+                               [
+                                   -0.0007854027207940817, 0.9998897910118103,
+                                   -0.014822980388998985, 0.3195559084415436
+                               ],
+                               [
+                                   0.002024406101554632, 0.014824540354311466,
+                                   0.9998881220817566, -0.7997230887413025
+                               ], [0.0, 0.0, 0.0, 1.0]]
+        },
+        'instances': [{
+            'bbox': [712.4, 143.0, 810.73, 307.92],
+            'bbox_label':
+            -1,
+            'bbox_3d': [
+                1.840000033378601, 1.4700000286102295, 8.40999984741211,
+                1.2000000476837158, 1.8899999856948853, 0.47999998927116394,
+                0.009999999776482582
+            ],
+            'bbox_label_3d':
+            -1,
+            'center_2d': [765.04, 214.56],
+            'depth':
+            8.410,
+            'num_lidar_pts':
+            377,
+            'difficulty':
+            0,
+            'truncated':
+            0,
+            'occluded':
+            0,
+            'alpha':
+            -0.2,
+            'score':
+            0.0,
+            'index':
+            0,
+            'group_id':
+            0
+        }],
+        'plane':
+        None,
+        'pts_semantic_mask_path':
+        'tests/data/semantickitti/sequences/00/labels/000000.label',
+        'pts_panoptic_mask_path':
+        'tests/data/semantickitti/sequences/00/labels/000000.label',
+    }
+    if with_ann:
+        data_info['ann_info'] = ann_info
+    return data_info
+
+
+def create_data_info_after_loading():
+    load_anns_transform = LoadAnnotations3D(
+        with_bbox_3d=True, with_label_3d=True)
+    load_points_transform = LoadPointsFromFile(
+        coord_type='LIDAR', load_dim=4, use_dim=3)
+    data_info = create_dummy_data_info()
+    data_info = load_points_transform(data_info)
+    data_info_after_loading = load_anns_transform(data_info)
+    return data_info_after_loading
+
+
+def create_mono3d_data_info_after_loading():
+    load_anns_transform = LoadAnnotations3D(
+        with_bbox=True,
+        with_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True)
+    load_img_transform = LoadImageFromFileMono3D()
+    data_info = create_dummy_data_info()
+    data_info = load_img_transform(data_info)
+    data_info_after_loading = load_anns_transform(data_info)
+    return data_info_after_loading
diff --git a/mmde/mmdet3d/testing/model_utils.py b/mmde/mmdet3d/testing/model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..da449398d655635d51f40a964f67b59c9b891d2b
--- /dev/null
+++ b/mmde/mmdet3d/testing/model_utils.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import random
+from os.path import dirname, exists, join
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.structures import (CameraInstance3DBoxes, DepthInstance3DBoxes,
+                                Det3DDataSample, LiDARInstance3DBoxes,
+                                PointData)
+
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+
+
+def _get_config_directory():
+    """Find the predefined detector config directory."""
+    try:
+        # Assume we are running in the source mmdetection3d repo
+        repo_dpath = dirname(dirname(dirname(__file__)))
+    except NameError:
+        # For IPython development when this __file__ is not defined
+        import mmdet3d
+        repo_dpath = dirname(dirname(mmdet3d.__file__))
+    config_dpath = join(repo_dpath, 'configs')
+    if not exists(config_dpath):
+        raise Exception('Cannot find config path')
+    return config_dpath
+
+
+def _get_config_module(fname):
+    """Load a configuration as a python module."""
+    from mmengine import Config
+    config_dpath = _get_config_directory()
+    config_fpath = join(config_dpath, fname)
+    config_mod = Config.fromfile(config_fpath)
+    return config_mod
+
+
+def get_model_cfg(fname):
+    """Grab configs necessary to create a model.
+
+    These are deep copied to allow for safe modification of parameters without
+    influencing other tests.
+    """
+    config = _get_config_module(fname)
+    model = copy.deepcopy(config.model)
+
+    return model
+
+
+def get_detector_cfg(fname):
+    """Grab configs necessary to create a detector.
+
+    These are deep copied to allow for safe modification of parameters without
+    influencing other tests.
+    """
+    import mmengine
+    config = _get_config_module(fname)
+    model = copy.deepcopy(config.model)
+    train_cfg = mmengine.Config(copy.deepcopy(config.model.train_cfg))
+    test_cfg = mmengine.Config(copy.deepcopy(config.model.test_cfg))
+
+    model.update(train_cfg=train_cfg)
+    model.update(test_cfg=test_cfg)
+    return model
+
+
+def create_detector_inputs(seed=0,
+                           with_points=True,
+                           with_img=False,
+                           img_size=10,
+                           num_gt_instance=20,
+                           num_points=10,
+                           points_feat_dim=4,
+                           num_classes=3,
+                           gt_bboxes_dim=7,
+                           with_pts_semantic_mask=False,
+                           with_pts_instance_mask=False,
+                           with_eval_ann_info=False,
+                           bboxes_3d_type='lidar'):
+    setup_seed(seed)
+    assert bboxes_3d_type in ('lidar', 'depth', 'cam')
+    bbox_3d_class = {
+        'lidar': LiDARInstance3DBoxes,
+        'depth': DepthInstance3DBoxes,
+        'cam': CameraInstance3DBoxes
+    }
+    meta_info = dict()
+    meta_info['depth2img'] = np.array(
+        [[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
+         [1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
+         [1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
+    meta_info['lidar2img'] = np.array(
+        [[5.23289349e+02, 3.68831943e+02, 6.10469439e+01],
+         [1.09560138e+02, 1.97404735e+02, -5.47377738e+02],
+         [1.25930002e-02, 9.92229998e-01, -1.23769999e-01]])
+
+    inputs_dict = dict()
+
+    if with_points:
+        points = torch.rand([num_points, points_feat_dim])
+        inputs_dict['points'] = [points]
+
+    if with_img:
+        if isinstance(img_size, tuple):
+            img = torch.rand(3, img_size[0], img_size[1])
+            meta_info['img_shape'] = img_size
+            meta_info['ori_shape'] = img_size
+        else:
+            img = torch.rand(3, img_size, img_size)
+            meta_info['img_shape'] = (img_size, img_size)
+            meta_info['ori_shape'] = (img_size, img_size)
+        meta_info['scale_factor'] = np.array([1., 1.])
+        inputs_dict['img'] = [img]
+
+    gt_instance_3d = InstanceData()
+
+    gt_instance_3d.bboxes_3d = bbox_3d_class[bboxes_3d_type](
+        torch.rand([num_gt_instance, gt_bboxes_dim]), box_dim=gt_bboxes_dim)
+    gt_instance_3d.labels_3d = torch.randint(0, num_classes, [num_gt_instance])
+    data_sample = Det3DDataSample(
+        metainfo=dict(box_type_3d=bbox_3d_class[bboxes_3d_type]))
+    data_sample.set_metainfo(meta_info)
+    data_sample.gt_instances_3d = gt_instance_3d
+
+    gt_instance = InstanceData()
+    gt_instance.labels = torch.randint(0, num_classes, [num_gt_instance])
+    gt_instance.bboxes = torch.rand(num_gt_instance, 4)
+    gt_instance.bboxes[:,
+                       2:] = gt_instance.bboxes[:, :2] + gt_instance.bboxes[:,
+                                                                            2:]
+
+    data_sample.gt_instances = gt_instance
+    data_sample.gt_pts_seg = PointData()
+    if with_pts_instance_mask:
+        pts_instance_mask = torch.randint(0, num_gt_instance, [num_points])
+        data_sample.gt_pts_seg['pts_instance_mask'] = pts_instance_mask
+    if with_pts_semantic_mask:
+        pts_semantic_mask = torch.randint(0, num_classes, [num_points])
+        data_sample.gt_pts_seg['pts_semantic_mask'] = pts_semantic_mask
+    if with_eval_ann_info:
+        data_sample.eval_ann_info = dict()
+    else:
+        data_sample.eval_ann_info = None
+
+    return dict(inputs=inputs_dict, data_samples=[data_sample])
diff --git a/mmde/mmdet3d/utils/__init__.py b/mmde/mmdet3d/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5ed45629f51128279ea44248860e6418fce4dbd
--- /dev/null
+++ b/mmde/mmdet3d/utils/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .array_converter import ArrayConverter, array_converter
+from .collect_env import collect_env
+from .compat_cfg import compat_cfg
+from .misc import replace_ceph_backend
+from .setup_env import register_all_modules, setup_multi_processes
+from .typing_utils import (ConfigType, InstanceList, MultiConfig,
+                           OptConfigType, OptInstanceList, OptMultiConfig,
+                           OptSampleList, OptSamplingResultList)
+
+__all__ = [
+    'collect_env', 'setup_multi_processes', 'compat_cfg',
+    'register_all_modules', 'array_converter', 'ArrayConverter', 'ConfigType',
+    'OptConfigType', 'MultiConfig', 'OptMultiConfig', 'InstanceList',
+    'OptInstanceList', 'OptSamplingResultList', 'replace_ceph_backend',
+    'OptSampleList'
+]
diff --git a/mmde/mmdet3d/utils/array_converter.py b/mmde/mmdet3d/utils/array_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcd02f41c503f6d3088acd9c6239b6c4e49dbfbb
--- /dev/null
+++ b/mmde/mmdet3d/utils/array_converter.py
@@ -0,0 +1,348 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import functools
+from inspect import getfullargspec
+from typing import Callable, Optional, Tuple, Type, Union
+
+import numpy as np
+import torch
+
+TemplateArrayType = Union[np.ndarray, torch.Tensor, list, tuple, int, float]
+
+
+def array_converter(to_torch: bool = True,
+                    apply_to: Tuple[str, ...] = tuple(),
+                    template_arg_name_: Optional[str] = None,
+                    recover: bool = True) -> Callable:
+    """Wrapper function for data-type agnostic processing.
+
+    First converts input arrays to PyTorch tensors or NumPy arrays for middle
+    calculation, then convert output to original data-type if `recover=True`.
+
+    Args:
+        to_torch (bool): Whether to convert to PyTorch tensors for middle
+            calculation. Defaults to True.
+        apply_to (Tuple[str]): The arguments to which we apply data-type
+            conversion. Defaults to an empty tuple.
+        template_arg_name_ (str, optional): Argument serving as the template
+            (return arrays should have the same dtype and device as the
+            template). Defaults to None. If None, we will use the first
+            argument in `apply_to` as the template argument.
+        recover (bool): Whether or not to recover the wrapped function outputs
+            to the `template_arg_name_` type. Defaults to True.
+
+    Raises:
+        ValueError: When template_arg_name_ is not among all args, or when
+            apply_to contains an arg which is not among all args, a ValueError
+            will be raised. When the template argument or an argument to
+            convert is a list or tuple, and cannot be converted to a NumPy
+            array, a ValueError will be raised.
+        TypeError: When the type of the template argument or an argument to
+            convert does not belong to the above range, or the contents of such
+            an list-or-tuple-type argument do not share the same data type, a
+            TypeError will be raised.
+
+    Returns:
+        Callable: Wrapped function.
+
+    Examples:
+        >>> import torch
+        >>> import numpy as np
+        >>>
+        >>> # Use torch addition for a + b,
+        >>> # and convert return values to the type of a
+        >>> @array_converter(apply_to=('a', 'b'))
+        >>> def simple_add(a, b):
+        >>>     return a + b
+        >>>
+        >>> a = np.array([1.1])
+        >>> b = np.array([2.2])
+        >>> simple_add(a, b)
+        >>>
+        >>> # Use numpy addition for a + b,
+        >>> # and convert return values to the type of b
+        >>> @array_converter(to_torch=False, apply_to=('a', 'b'),
+        >>>                  template_arg_name_='b')
+        >>> def simple_add(a, b):
+        >>>     return a + b
+        >>>
+        >>> simple_add(a, b)
+        >>>
+        >>> # Use torch funcs for floor(a) if flag=True else ceil(a),
+        >>> # and return the torch tensor
+        >>> @array_converter(apply_to=('a',), recover=False)
+        >>> def floor_or_ceil(a, flag=True):
+        >>>     return torch.floor(a) if flag else torch.ceil(a)
+        >>>
+        >>> floor_or_ceil(a, flag=False)
+    """
+
+    def array_converter_wrapper(func):
+        """Outer wrapper for the function."""
+
+        @functools.wraps(func)
+        def new_func(*args, **kwargs):
+            """Inner wrapper for the arguments."""
+            if len(apply_to) == 0:
+                return func(*args, **kwargs)
+
+            func_name = func.__name__
+
+            arg_spec = getfullargspec(func)
+
+            arg_names = arg_spec.args
+            arg_num = len(arg_names)
+            default_arg_values = arg_spec.defaults
+            if default_arg_values is None:
+                default_arg_values = []
+            no_default_arg_num = len(arg_names) - len(default_arg_values)
+
+            kwonly_arg_names = arg_spec.kwonlyargs
+            kwonly_default_arg_values = arg_spec.kwonlydefaults
+            if kwonly_default_arg_values is None:
+                kwonly_default_arg_values = {}
+
+            all_arg_names = arg_names + kwonly_arg_names
+
+            # in case there are args in the form of *args
+            if len(args) > arg_num:
+                named_args = args[:arg_num]
+                nameless_args = args[arg_num:]
+            else:
+                named_args = args
+                nameless_args = []
+
+            # template argument data type is used for all array-like arguments
+            if template_arg_name_ is None:
+                template_arg_name = apply_to[0]
+            else:
+                template_arg_name = template_arg_name_
+
+            if template_arg_name not in all_arg_names:
+                raise ValueError(f'{template_arg_name} is not among the '
+                                 f'argument list of function {func_name}')
+
+            # inspect apply_to
+            for arg_to_apply in apply_to:
+                if arg_to_apply not in all_arg_names:
+                    raise ValueError(
+                        f'{arg_to_apply} is not an argument of {func_name}')
+
+            new_args = []
+            new_kwargs = {}
+
+            converter = ArrayConverter()
+            target_type = torch.Tensor if to_torch else np.ndarray
+
+            # non-keyword arguments
+            for i, arg_value in enumerate(named_args):
+                if arg_names[i] in apply_to:
+                    new_args.append(
+                        converter.convert(
+                            input_array=arg_value, target_type=target_type))
+                else:
+                    new_args.append(arg_value)
+
+                if arg_names[i] == template_arg_name:
+                    template_arg_value = arg_value
+
+            kwonly_default_arg_values.update(kwargs)
+            kwargs = kwonly_default_arg_values
+
+            # keyword arguments and non-keyword arguments using default value
+            for i in range(len(named_args), len(all_arg_names)):
+                arg_name = all_arg_names[i]
+                if arg_name in kwargs:
+                    if arg_name in apply_to:
+                        new_kwargs[arg_name] = converter.convert(
+                            input_array=kwargs[arg_name],
+                            target_type=target_type)
+                    else:
+                        new_kwargs[arg_name] = kwargs[arg_name]
+                else:
+                    default_value = default_arg_values[i - no_default_arg_num]
+                    if arg_name in apply_to:
+                        new_kwargs[arg_name] = converter.convert(
+                            input_array=default_value, target_type=target_type)
+                    else:
+                        new_kwargs[arg_name] = default_value
+                if arg_name == template_arg_name:
+                    template_arg_value = kwargs[arg_name]
+
+            # add nameless args provided by *args (if exists)
+            new_args += nameless_args
+
+            return_values = func(*new_args, **new_kwargs)
+            converter.set_template(template_arg_value)
+
+            def recursive_recover(input_data):
+                if isinstance(input_data, (tuple, list)):
+                    new_data = []
+                    for item in input_data:
+                        new_data.append(recursive_recover(item))
+                    return tuple(new_data) if isinstance(input_data,
+                                                         tuple) else new_data
+                elif isinstance(input_data, dict):
+                    new_data = {}
+                    for k, v in input_data.items():
+                        new_data[k] = recursive_recover(v)
+                    return new_data
+                elif isinstance(input_data, (torch.Tensor, np.ndarray)):
+                    return converter.recover(input_data)
+                else:
+                    return input_data
+
+            if recover:
+                return recursive_recover(return_values)
+            else:
+                return return_values
+
+        return new_func
+
+    return array_converter_wrapper
+
+
+class ArrayConverter:
+    """Utility class for data-type agnostic processing.
+
+    Args:
+        template_array (np.ndarray or torch.Tensor or list or tuple or int or
+            float, optional): Template array. Defaults to None.
+    """
+    SUPPORTED_NON_ARRAY_TYPES = (int, float, np.int8, np.int16, np.int32,
+                                 np.int64, np.uint8, np.uint16, np.uint32,
+                                 np.uint64, np.float16, np.float32, np.float64)
+
+    def __init__(self,
+                 template_array: Optional[TemplateArrayType] = None) -> None:
+        if template_array is not None:
+            self.set_template(template_array)
+
+    def set_template(self, array: TemplateArrayType) -> None:
+        """Set template array.
+
+        Args:
+            array (np.ndarray or torch.Tensor or list or tuple or int or
+                float): Template array.
+
+        Raises:
+            ValueError: If input is list or tuple and cannot be converted to a
+                NumPy array, a ValueError is raised.
+            TypeError: If input type does not belong to the above range, or the
+                contents of a list or tuple do not share the same data type, a
+                TypeError is raised.
+        """
+        self.array_type = type(array)
+        self.is_num = False
+        self.device = 'cpu'
+
+        if isinstance(array, np.ndarray):
+            self.dtype = array.dtype
+        elif isinstance(array, torch.Tensor):
+            self.dtype = array.dtype
+            self.device = array.device
+        elif isinstance(array, (list, tuple)):
+            try:
+                array = np.array(array)
+                if array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES:
+                    raise TypeError
+                self.dtype = array.dtype
+            except (ValueError, TypeError):
+                print('The following list cannot be converted to a numpy '
+                      f'array of supported dtype:\n{array}')
+                raise
+        elif isinstance(array, (int, float)):
+            self.array_type = np.ndarray
+            self.is_num = True
+            self.dtype = np.dtype(type(array))
+        else:
+            raise TypeError(
+                f'Template type {self.array_type} is not supported.')
+
+    def convert(
+        self,
+        input_array: TemplateArrayType,
+        target_type: Optional[Type] = None,
+        target_array: Optional[Union[np.ndarray, torch.Tensor]] = None
+    ) -> Union[np.ndarray, torch.Tensor]:
+        """Convert input array to target data type.
+
+        Args:
+            input_array (np.ndarray or torch.Tensor or list or tuple or int or
+                float): Input array.
+            target_type (Type, optional): Type to which input array is
+                converted. It should be `np.ndarray` or `torch.Tensor`.
+                Defaults to None.
+            target_array (np.ndarray or torch.Tensor, optional): Template array
+                to which input array is converted. Defaults to None.
+
+        Raises:
+            ValueError: If input is list or tuple and cannot be converted to a
+                NumPy array, a ValueError is raised.
+            TypeError: If input type does not belong to the above range, or the
+                contents of a list or tuple do not share the same data type, a
+                TypeError is raised.
+
+        Returns:
+            np.ndarray or torch.Tensor: The converted array.
+        """
+        if isinstance(input_array, (list, tuple)):
+            try:
+                input_array = np.array(input_array)
+                if input_array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES:
+                    raise TypeError
+            except (ValueError, TypeError):
+                print('The input cannot be converted to a single-type numpy '
+                      f'array:\n{input_array}')
+                raise
+        elif isinstance(input_array, self.SUPPORTED_NON_ARRAY_TYPES):
+            input_array = np.array(input_array)
+        array_type = type(input_array)
+        assert target_type is not None or target_array is not None, \
+            'must specify a target'
+        if target_type is not None:
+            assert target_type in (np.ndarray, torch.Tensor), \
+                'invalid target type'
+            if target_type == array_type:
+                return input_array
+            elif target_type == np.ndarray:
+                # default dtype is float32
+                converted_array = input_array.cpu().numpy().astype(np.float32)
+            else:
+                # default dtype is float32, device is 'cpu'
+                converted_array = torch.tensor(
+                    input_array, dtype=torch.float32)
+        else:
+            assert isinstance(target_array, (np.ndarray, torch.Tensor)), \
+                'invalid target array type'
+            if isinstance(target_array, array_type):
+                return input_array
+            elif isinstance(target_array, np.ndarray):
+                converted_array = input_array.cpu().numpy().astype(
+                    target_array.dtype)
+            else:
+                converted_array = target_array.new_tensor(input_array)
+        return converted_array
+
+    def recover(
+        self, input_array: Union[np.ndarray, torch.Tensor]
+    ) -> Union[np.ndarray, torch.Tensor, int, float]:
+        """Recover input type to original array type.
+
+        Args:
+            input_array (np.ndarray or torch.Tensor): Input array.
+
+        Returns:
+            np.ndarray or torch.Tensor or int or float: Converted array.
+        """
+        assert isinstance(input_array, (np.ndarray, torch.Tensor)), \
+            'invalid input array type'
+        if isinstance(input_array, self.array_type):
+            return input_array
+        elif isinstance(input_array, torch.Tensor):
+            converted_array = input_array.cpu().numpy().astype(self.dtype)
+        else:
+            converted_array = torch.tensor(
+                input_array, dtype=self.dtype, device=self.device)
+        if self.is_num:
+            converted_array = converted_array.item()
+        return converted_array
diff --git a/mmde/mmdet3d/utils/collect_env.py b/mmde/mmdet3d/utils/collect_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4de74fab1f7e6b37ee2b6c7cd1e3792d5716447
--- /dev/null
+++ b/mmde/mmdet3d/utils/collect_env.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmdet
+from mmengine.utils import get_git_hash
+from mmengine.utils.dl_utils import collect_env as collect_base_env
+
+import mmdet3d
+
+
+def collect_env():
+    """Collect the information of the running environments."""
+    env_info = collect_base_env()
+    env_info['MMDetection'] = mmdet.__version__
+    env_info['MMDetection3D'] = mmdet3d.__version__ + '+' + get_git_hash()[:7]
+    from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+    env_info['spconv2.0'] = IS_SPCONV2_AVAILABLE
+
+    return env_info
+
+
+if __name__ == '__main__':
+    for name, val in collect_env().items():
+        print(f'{name}: {val}')
diff --git a/mmde/mmdet3d/utils/compat_cfg.py b/mmde/mmdet3d/utils/compat_cfg.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d1a5f63f01496a895d245a5ed235a5a0f345f86
--- /dev/null
+++ b/mmde/mmdet3d/utils/compat_cfg.py
@@ -0,0 +1,139 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+
+from mmengine import ConfigDict
+
+
+def compat_cfg(cfg):
+    """This function would modify some filed to keep the compatibility of
+    config.
+
+    For example, it will move some args which will be deprecated to the correct
+    fields.
+    """
+    cfg = copy.deepcopy(cfg)
+    cfg = compat_imgs_per_gpu(cfg)
+    cfg = compat_loader_args(cfg)
+    cfg = compat_runner_args(cfg)
+    return cfg
+
+
+def compat_runner_args(cfg):
+    if 'runner' not in cfg:
+        cfg.runner = ConfigDict({
+            'type': 'EpochBasedRunner',
+            'max_epochs': cfg.total_epochs
+        })
+        warnings.warn(
+            'config is now expected to have a `runner` section, '
+            'please set `runner` in your config.', UserWarning)
+    else:
+        if 'total_epochs' in cfg:
+            assert cfg.total_epochs == cfg.runner.max_epochs
+    return cfg
+
+
+def compat_imgs_per_gpu(cfg):
+    cfg = copy.deepcopy(cfg)
+    if 'imgs_per_gpu' in cfg.data:
+        warnings.warn('"imgs_per_gpu" is deprecated in MMDet V2.0. '
+                      'Please use "samples_per_gpu" instead')
+        if 'samples_per_gpu' in cfg.data:
+            warnings.warn(
+                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
+                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
+                f'={cfg.data.imgs_per_gpu} is used in this experiments')
+        else:
+            warnings.warn('Automatically set "samples_per_gpu"="imgs_per_gpu"='
+                          f'{cfg.data.imgs_per_gpu} in this experiments')
+        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
+    return cfg
+
+
+def compat_loader_args(cfg):
+    """Deprecated sample_per_gpu in cfg.data."""
+
+    cfg = copy.deepcopy(cfg)
+    if 'train_dataloader' not in cfg.data:
+        cfg.data['train_dataloader'] = ConfigDict()
+    if 'val_dataloader' not in cfg.data:
+        cfg.data['val_dataloader'] = ConfigDict()
+    if 'test_dataloader' not in cfg.data:
+        cfg.data['test_dataloader'] = ConfigDict()
+
+    # special process for train_dataloader
+    if 'samples_per_gpu' in cfg.data:
+
+        samples_per_gpu = cfg.data.pop('samples_per_gpu')
+        assert 'samples_per_gpu' not in \
+               cfg.data.train_dataloader, ('`samples_per_gpu` are set '
+                                           'in `data` field and ` '
+                                           'data.train_dataloader` '
+                                           'at the same time. '
+                                           'Please only set it in '
+                                           '`data.train_dataloader`. ')
+        cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu
+
+    if 'persistent_workers' in cfg.data:
+
+        persistent_workers = cfg.data.pop('persistent_workers')
+        assert 'persistent_workers' not in \
+               cfg.data.train_dataloader, ('`persistent_workers` are set '
+                                           'in `data` field and ` '
+                                           'data.train_dataloader` '
+                                           'at the same time. '
+                                           'Please only set it in '
+                                           '`data.train_dataloader`. ')
+        cfg.data.train_dataloader['persistent_workers'] = persistent_workers
+
+    if 'workers_per_gpu' in cfg.data:
+
+        workers_per_gpu = cfg.data.pop('workers_per_gpu')
+        cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu
+        cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu
+        cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu
+
+    # special process for val_dataloader
+    if 'samples_per_gpu' in cfg.data.val:
+        # keep default value of `sample_per_gpu` is 1
+        assert 'samples_per_gpu' not in \
+               cfg.data.val_dataloader, ('`samples_per_gpu` are set '
+                                         'in `data.val` field and ` '
+                                         'data.val_dataloader` at '
+                                         'the same time. '
+                                         'Please only set it in '
+                                         '`data.val_dataloader`. ')
+        cfg.data.val_dataloader['samples_per_gpu'] = \
+            cfg.data.val.pop('samples_per_gpu')
+    # special process for val_dataloader
+
+    # in case the test dataset is concatenated
+    if isinstance(cfg.data.test, dict):
+        if 'samples_per_gpu' in cfg.data.test:
+            assert 'samples_per_gpu' not in \
+                   cfg.data.test_dataloader, ('`samples_per_gpu` are set '
+                                              'in `data.test` field and ` '
+                                              'data.test_dataloader` '
+                                              'at the same time. '
+                                              'Please only set it in '
+                                              '`data.test_dataloader`. ')
+
+            cfg.data.test_dataloader['samples_per_gpu'] = \
+                cfg.data.test.pop('samples_per_gpu')
+
+    elif isinstance(cfg.data.test, list):
+        for ds_cfg in cfg.data.test:
+            if 'samples_per_gpu' in ds_cfg:
+                assert 'samples_per_gpu' not in \
+                       cfg.data.test_dataloader, ('`samples_per_gpu` are set '
+                                                  'in `data.test` field and ` '
+                                                  'data.test_dataloader` at'
+                                                  ' the same time. '
+                                                  'Please only set it in '
+                                                  '`data.test_dataloader`. ')
+        samples_per_gpu = max(
+            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+        cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu
+
+    return cfg
diff --git a/mmde/mmdet3d/utils/misc.py b/mmde/mmdet3d/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5f4b47d33734fd4403745c7631432f02c729178
--- /dev/null
+++ b/mmde/mmdet3d/utils/misc.py
@@ -0,0 +1,106 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+
+def replace_ceph_backend(cfg):
+    cfg_pretty_text = cfg.pretty_text
+
+    replace_strs = \
+        r'''backend_args = dict(
+            backend='petrel',
+            path_mapping=dict({
+                './data/DATA/': 's3://openmmlab/datasets/detection3d/CEPH/',
+                'data/DATA/': 's3://openmmlab/datasets/detection3d/CEPH/'
+            }))
+        '''
+
+    if 'nuscenes' in cfg_pretty_text:
+        replace_strs = replace_strs.replace('DATA', 'nuscenes')
+        replace_strs = replace_strs.replace('CEPH', 'nuscenes')
+    elif 'lyft' in cfg_pretty_text:
+        replace_strs = replace_strs.replace('DATA', 'lyft')
+        replace_strs = replace_strs.replace('CEPH', 'lyft')
+    elif 'waymo' in cfg_pretty_text:
+        replace_strs = replace_strs.replace('DATA', 'waymo')
+        replace_strs = replace_strs.replace('CEPH', 'waymo')
+    elif 'kitti' in cfg_pretty_text:
+        replace_strs = replace_strs.replace('DATA', 'kitti')
+        replace_strs = replace_strs.replace('CEPH', 'kitti')
+    elif 'scannet' in cfg_pretty_text:
+        replace_strs = replace_strs.replace('DATA', 'scannet')
+        replace_strs = replace_strs.replace('CEPH', 'scannet_processed')
+    elif 's3dis' in cfg_pretty_text:
+        replace_strs = replace_strs.replace('DATA', 's3dis')
+        replace_strs = replace_strs.replace('CEPH', 's3dis_processed')
+    elif 'sunrgbd' in cfg_pretty_text:
+        replace_strs = replace_strs.replace('DATA', 'sunrgbd')
+        replace_strs = replace_strs.replace('CEPH', 'sunrgbd_processed')
+    elif 'semantickitti' in cfg_pretty_text:
+        replace_strs = replace_strs.replace('DATA', 'semantickitti')
+        replace_strs = replace_strs.replace('CEPH', 'semantickitti')
+    elif 'nuimages' in cfg_pretty_text:
+        replace_strs = replace_strs.replace('DATA', 'nuimages')
+        replace_strs = replace_strs.replace('CEPH', 'nuimages')
+    else:
+        NotImplemented('Does not support global replacement')
+
+    replace_strs = replace_strs.replace(' ', '').replace('\n', '')
+
+    # use data info file from ceph
+    # cfg_pretty_text = cfg_pretty_text.replace(
+    #   'ann_file', replace_strs + ', ann_file')
+
+    cfg_pretty_text = cfg_pretty_text.replace('backend_args=None', '')
+
+    # replace LoadImageFromFile
+    cfg_pretty_text = cfg_pretty_text.replace(
+        'LoadImageFromFile\'', 'LoadImageFromFile\',' + replace_strs)
+
+    # replace LoadImageFromFileMono3D
+    cfg_pretty_text = cfg_pretty_text.replace(
+        'LoadImageFromFileMono3D\'',
+        'LoadImageFromFileMono3D\',' + replace_strs)
+
+    # replace LoadMultiViewImageFromFiles
+    cfg_pretty_text = cfg_pretty_text.replace(
+        'LoadMultiViewImageFromFiles\'',
+        'LoadMultiViewImageFromFiles\',' + replace_strs)
+
+    # replace LoadPointsFromFile
+    cfg_pretty_text = cfg_pretty_text.replace(
+        'LoadPointsFromFile\'', 'LoadPointsFromFile\',' + replace_strs)
+
+    # replace LoadPointsFromMultiSweeps
+    cfg_pretty_text = cfg_pretty_text.replace(
+        'LoadPointsFromMultiSweeps\'',
+        'LoadPointsFromMultiSweeps\',' + replace_strs)
+
+    # replace LoadAnnotations
+    cfg_pretty_text = cfg_pretty_text.replace(
+        'LoadAnnotations\'', 'LoadAnnotations\',' + replace_strs)
+
+    # replace LoadAnnotations3D
+    cfg_pretty_text = cfg_pretty_text.replace(
+        'LoadAnnotations3D\'', 'LoadAnnotations3D\',' + replace_strs)
+
+    # replace KittiMetric
+    cfg_pretty_text = cfg_pretty_text.replace('KittiMetric\'',
+                                              'KittiMetric\',' + replace_strs)
+
+    # replace LyftMetric
+    cfg_pretty_text = cfg_pretty_text.replace('LyftMetric\'',
+                                              'LyftMetric\',' + replace_strs)
+
+    # replace NuScenesMetric
+    cfg_pretty_text = cfg_pretty_text.replace(
+        'NuScenesMetric\'', 'NuScenesMetric\',' + replace_strs)
+
+    # replace WaymoMetric
+    cfg_pretty_text = cfg_pretty_text.replace('WaymoMetric\'',
+                                              'WaymoMetric\',' + replace_strs)
+
+    # replace dbsampler
+    cfg_pretty_text = cfg_pretty_text.replace('info_path',
+                                              replace_strs + ', info_path')
+
+    cfg = cfg.fromstring(cfg_pretty_text, file_format='.py')
+    return cfg
diff --git a/mmde/mmdet3d/utils/setup_env.py b/mmde/mmdet3d/utils/setup_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ebee46494499c110bd1e402f389b0254bb89280
--- /dev/null
+++ b/mmde/mmdet3d/utils/setup_env.py
@@ -0,0 +1,91 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import datetime
+import os
+import platform
+import warnings
+
+import cv2
+from mmengine import DefaultScope
+from torch import multiprocessing as mp
+
+
+def setup_multi_processes(cfg):
+    """Setup multi-processing environment variables."""
+    # set multi-process start method as `fork` to speed up the training
+    if platform.system() != 'Windows':
+        mp_start_method = cfg.get('mp_start_method', 'fork')
+        current_method = mp.get_start_method(allow_none=True)
+        if current_method is not None and current_method != mp_start_method:
+            warnings.warn(
+                f'Multi-processing start method `{mp_start_method}` is '
+                f'different from the previous setting `{current_method}`.'
+                f'It will be force set to `{mp_start_method}`. You can change '
+                f'this behavior by changing `mp_start_method` in your config.')
+        mp.set_start_method(mp_start_method, force=True)
+
+    # disable opencv multithreading to avoid system being overloaded
+    opencv_num_threads = cfg.get('opencv_num_threads', 0)
+    cv2.setNumThreads(opencv_num_threads)
+
+    # setup OMP threads
+    # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py  # noqa
+    workers_per_gpu = cfg.data.get('workers_per_gpu', 1)
+    if 'train_dataloader' in cfg.data:
+        workers_per_gpu = \
+            max(cfg.data.train_dataloader.get('workers_per_gpu', 1),
+                workers_per_gpu)
+
+    if 'OMP_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
+        omp_num_threads = 1
+        warnings.warn(
+            f'Setting OMP_NUM_THREADS environment variable for each process '
+            f'to be {omp_num_threads} in default, to avoid your system being '
+            f'overloaded, please further tune the variable for optimal '
+            f'performance in your application as needed.')
+        os.environ['OMP_NUM_THREADS'] = str(omp_num_threads)
+
+    # setup MKL threads
+    if 'MKL_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
+        mkl_num_threads = 1
+        warnings.warn(
+            f'Setting MKL_NUM_THREADS environment variable for each process '
+            f'to be {mkl_num_threads} in default, to avoid your system being '
+            f'overloaded, please further tune the variable for optimal '
+            f'performance in your application as needed.')
+        os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads)
+
+
+def register_all_modules(init_default_scope: bool = True) -> None:
+    """Register all modules in mmdet3d into the registries.
+
+    Args:
+        init_default_scope (bool): Whether initialize the mmdet default scope.
+            When `init_default_scope=True`, the global default scope will be
+            set to `mmdet3d`, and all registries will build modules from mmdet3d's
+            registry node. To understand more about the registry, please refer
+            to https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/registry.md
+            Defaults to True.
+    """  # noqa
+    import mmdet3d.datasets  # noqa: F401,F403
+    import mmdet3d.engine  # noqa: F401,F403
+    import mmdet3d.evaluation.metrics  # noqa: F401,F403
+    import mmdet3d.models  # noqa: F401,F403
+    import mmdet3d.structures  # noqa: F401,F403
+    import mmdet3d.visualization  # noqa: F401,F403
+    if init_default_scope:
+        never_created = DefaultScope.get_current_instance() is None \
+                        or not DefaultScope.check_instance_created('mmdet3d')
+        if never_created:
+            DefaultScope.get_instance('mmdet3d', scope_name='mmdet3d')
+            return
+        current_scope = DefaultScope.get_current_instance()
+        if current_scope.scope_name != 'mmdet3d':
+            warnings.warn('The current default scope '
+                          f'"{current_scope.scope_name}" is not "mmdet3d", '
+                          '`register_all_modules` will force the current'
+                          'default scope to be "mmdet3d". If this is not '
+                          'expected, please set `init_default_scope=False`.')
+            # avoid name conflict
+            new_instance_name = f'mmdet3d-{datetime.datetime.now()}'
+            DefaultScope.get_instance(new_instance_name, scope_name='mmdet3d')
diff --git a/mmde/mmdet3d/utils/typing_utils.py b/mmde/mmdet3d/utils/typing_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e6436c30c9c2b78cd8032243fd8fbd4ef64f44b
--- /dev/null
+++ b/mmde/mmdet3d/utils/typing_utils.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Collecting some commonly used type hint in MMDetection3D."""
+from typing import List, Optional, Union
+
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+
+from mmdet3d.structures.det3d_data_sample import Det3DDataSample
+
+# Type hint of config data
+ConfigType = Union[ConfigDict, dict]
+OptConfigType = Optional[ConfigType]
+
+# Type hint of one or more config data
+MultiConfig = Union[ConfigType, List[ConfigType]]
+OptMultiConfig = Optional[MultiConfig]
+
+InstanceList = List[InstanceData]
+OptInstanceList = Optional[InstanceList]
+
+SamplingResultList = List[SamplingResult]
+
+OptSamplingResultList = Optional[SamplingResultList]
+SampleList = List[Det3DDataSample]
+OptSampleList = Optional[SampleList]
diff --git a/mmde/mmdet3d/version.py b/mmde/mmdet3d/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7163ba86ef18c4d150882edc5908c6fe559e635
--- /dev/null
+++ b/mmde/mmdet3d/version.py
@@ -0,0 +1,28 @@
+# Copyright (c) Open-MMLab. All rights reserved.
+
+__version__ = '1.4.0'
+short_version = __version__
+
+
+def parse_version_info(version_str: str) -> tuple:
+    """Parse a version string into a tuple.
+
+    Args:
+        version_str (str): The version string.
+
+    Returns:
+        tuple: The version info, e.g., "1.3.0" is parsed into (1, 3, 0), and
+        "2.0.0rc4" is parsed into (2, 0, 0, 'rc4').
+    """
+    version_info = []
+    for x in version_str.split('.'):
+        if x.isdigit():
+            version_info.append(int(x))
+        elif x.find('rc') != -1:
+            patch_version = x.split('rc')
+            version_info.append(int(patch_version[0]))
+            version_info.append(f'rc{patch_version[1]}')
+    return tuple(version_info)
+
+
+version_info = parse_version_info(__version__)
diff --git a/mmde/mmdet3d/visualization/__init__.py b/mmde/mmdet3d/visualization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1423601789066dbb8b1aa9e1cfaa595b82a0dc99
--- /dev/null
+++ b/mmde/mmdet3d/visualization/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .local_visualizer import Det3DLocalVisualizer
+from .vis_utils import (proj_camera_bbox3d_to_img, proj_depth_bbox3d_to_img,
+                        proj_lidar_bbox3d_to_img, to_depth_mode, write_obj,
+                        write_oriented_bbox)
+
+__all__ = [
+    'Det3DLocalVisualizer', 'write_obj', 'write_oriented_bbox',
+    'to_depth_mode', 'proj_lidar_bbox3d_to_img', 'proj_depth_bbox3d_to_img',
+    'proj_camera_bbox3d_to_img'
+]
diff --git a/mmde/mmdet3d/visualization/local_visualizer.py b/mmde/mmdet3d/visualization/local_visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1eff4af97f1b362853343df495cfe54d2456328
--- /dev/null
+++ b/mmde/mmdet3d/visualization/local_visualizer.py
@@ -0,0 +1,1100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import math
+import os
+import sys
+import time
+from typing import List, Optional, Sequence, Tuple, Union
+
+import matplotlib.pyplot as plt
+import mmcv
+import numpy as np
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import PathPatch
+from matplotlib.path import Path
+from mmdet.visualization import DetLocalVisualizer, get_palette
+from mmengine.dist import master_only
+from mmengine.logging import print_log
+from mmengine.structures import InstanceData
+from mmengine.visualization import Visualizer as MMENGINE_Visualizer
+from mmengine.visualization.utils import (check_type, color_val_matplotlib,
+                                          tensor2ndarray)
+from torch import Tensor
+
+from mmdet3d.registry import VISUALIZERS
+from mmdet3d.structures import (BaseInstance3DBoxes, Box3DMode,
+                                CameraInstance3DBoxes, Coord3DMode,
+                                DepthInstance3DBoxes, DepthPoints,
+                                Det3DDataSample, LiDARInstance3DBoxes,
+                                PointData, points_cam2img)
+from .vis_utils import (proj_camera_bbox3d_to_img, proj_depth_bbox3d_to_img,
+                        proj_lidar_bbox3d_to_img, to_depth_mode)
+
+try:
+    import open3d as o3d
+    from open3d import geometry
+    from open3d.visualization import Visualizer
+except ImportError:
+    o3d = geometry = Visualizer = None
+
+
+@VISUALIZERS.register_module()
+class Det3DLocalVisualizer(DetLocalVisualizer):
+    """MMDetection3D Local Visualizer.
+
+    - 3D detection and segmentation drawing methods
+
+      - draw_bboxes_3d: draw 3D bounding boxes on point clouds
+      - draw_proj_bboxes_3d: draw projected 3D bounding boxes on image
+      - draw_seg_mask: draw segmentation mask via per-point colorization
+
+    Args:
+        name (str): Name of the instance. Defaults to 'visualizer'.
+        points (np.ndarray, optional): Points to visualize with shape (N, 3+C).
+            Defaults to None.
+        image (np.ndarray, optional): The origin image to draw. The format
+            should be RGB. Defaults to None.
+        pcd_mode (int): The point cloud mode (coordinates): 0 represents LiDAR,
+            1 represents CAMERA, 2 represents Depth. Defaults to 0.
+        vis_backends (List[dict], optional): Visual backend config list.
+            Defaults to None.
+        save_dir (str, optional): Save file dir for all storage backends.
+            If it is None, the backend storage will not save any data.
+            Defaults to None.
+        bbox_color (str or Tuple[int], optional): Color of bbox lines.
+            The tuple of color should be in BGR order. Defaults to None.
+        text_color (str or Tuple[int]): Color of texts. The tuple of color
+            should be in BGR order. Defaults to (200, 200, 200).
+        mask_color (str or Tuple[int], optional): Color of masks. The tuple of
+            color should be in BGR order. Defaults to None.
+        line_width (int or float): The linewidth of lines. Defaults to 3.
+        frame_cfg (dict): The coordinate frame config while Open3D
+            visualization initialization.
+            Defaults to dict(size=1, origin=[0, 0, 0]).
+        alpha (int or float): The transparency of bboxes or mask.
+            Defaults to 0.8.
+        multi_imgs_col (int): The number of columns in arrangement when showing
+            multi-view images.
+
+    Examples:
+        >>> import numpy as np
+        >>> import torch
+        >>> from mmengine.structures import InstanceData
+        >>> from mmdet3d.structures import (DepthInstance3DBoxes
+        ...                                 Det3DDataSample)
+        >>> from mmdet3d.visualization import Det3DLocalVisualizer
+
+        >>> det3d_local_visualizer = Det3DLocalVisualizer()
+        >>> image = np.random.randint(0, 256, size=(10, 12, 3)).astype('uint8')
+        >>> points = np.random.rand(1000, 3)
+        >>> gt_instances_3d = InstanceData()
+        >>> gt_instances_3d.bboxes_3d = DepthInstance3DBoxes(
+        ...     torch.rand((5, 7)))
+        >>> gt_instances_3d.labels_3d = torch.randint(0, 2, (5,))
+        >>> gt_det3d_data_sample = Det3DDataSample()
+        >>> gt_det3d_data_sample.gt_instances_3d = gt_instances_3d
+        >>> data_input = dict(img=image, points=points)
+        >>> det3d_local_visualizer.add_datasample('3D Scene', data_input,
+        ...                                       gt_det3d_data_sample)
+
+        >>> from mmdet3d.structures import PointData
+        >>> det3d_local_visualizer = Det3DLocalVisualizer()
+        >>> points = np.random.rand(1000, 3)
+        >>> gt_pts_seg = PointData()
+        >>> gt_pts_seg.pts_semantic_mask = torch.randint(0, 10, (1000, ))
+        >>> gt_det3d_data_sample = Det3DDataSample()
+        >>> gt_det3d_data_sample.gt_pts_seg = gt_pts_seg
+        >>> data_input = dict(points=points)
+        >>> det3d_local_visualizer.add_datasample('3D Scene', data_input,
+        ...                                       gt_det3d_data_sample,
+        ...                                       vis_task='lidar_seg')
+    """
+
+    def __init__(
+        self,
+        name: str = 'visualizer',
+        points: Optional[np.ndarray] = None,
+        image: Optional[np.ndarray] = None,
+        pcd_mode: int = 0,
+        vis_backends: Optional[List[dict]] = None,
+        save_dir: Optional[str] = None,
+        bbox_color: Optional[Union[str, Tuple[int]]] = None,
+        text_color: Union[str, Tuple[int]] = (200, 200, 200),
+        mask_color: Optional[Union[str, Tuple[int]]] = None,
+        line_width: Union[int, float] = 3,
+        frame_cfg: dict = dict(size=1, origin=[0, 0, 0]),
+        alpha: Union[int, float] = 0.8,
+        multi_imgs_col: int = 3,
+        fig_show_cfg: dict = dict(figsize=(18, 12))
+    ) -> None:
+        super().__init__(
+            name=name,
+            image=image,
+            vis_backends=vis_backends,
+            save_dir=save_dir,
+            bbox_color=bbox_color,
+            text_color=text_color,
+            mask_color=mask_color,
+            line_width=line_width,
+            alpha=alpha)
+        if points is not None:
+            self.set_points(points, pcd_mode=pcd_mode, frame_cfg=frame_cfg)
+        self.multi_imgs_col = multi_imgs_col
+        self.fig_show_cfg.update(fig_show_cfg)
+
+        self.flag_pause = False
+        self.flag_next = False
+        self.flag_exit = False
+
+    def _clear_o3d_vis(self) -> None:
+        """Clear open3d vis."""
+
+        if hasattr(self, 'o3d_vis'):
+            del self.o3d_vis
+            del self.points_colors
+            del self.view_control
+            if hasattr(self, 'pcd'):
+                del self.pcd
+
+    def _initialize_o3d_vis(self, show=True) -> Visualizer:
+        """Initialize open3d vis according to frame_cfg.
+
+        Args:
+            frame_cfg (dict): The config to create coordinate frame in open3d
+                vis.
+
+        Returns:
+            :obj:`o3d.visualization.Visualizer`: Created open3d vis.
+        """
+        if o3d is None or geometry is None:
+            raise ImportError(
+                'Please run "pip install open3d" to install open3d first.')
+        glfw_key_escape = 256  # Esc
+        glfw_key_space = 32  # Space
+        glfw_key_right = 262  # Right
+        o3d_vis = o3d.visualization.VisualizerWithKeyCallback()
+        o3d_vis.register_key_callback(glfw_key_escape, self.escape_callback)
+        o3d_vis.register_key_action_callback(glfw_key_space,
+                                             self.space_action_callback)
+        o3d_vis.register_key_callback(glfw_key_right, self.right_callback)
+        if os.environ.get('DISPLAY', None) is not None and show:
+            o3d_vis.create_window()
+            self.view_control = o3d_vis.get_view_control()
+        return o3d_vis
+
+    @master_only
+    def set_points(self,
+                   points: np.ndarray,
+                   pcd_mode: int = 0,
+                   vis_mode: str = 'replace',
+                   frame_cfg: dict = dict(size=1, origin=[0, 0, 0]),
+                   points_color: Tuple[float] = (0.8, 0.8, 0.8),
+                   points_size: int = 2,
+                   mode: str = 'xyz') -> None:
+        """Set the point cloud to draw.
+
+        Args:
+            points (np.ndarray): Points to visualize with shape (N, 3+C).
+            pcd_mode (int): The point cloud mode (coordinates): 0 represents
+                LiDAR, 1 represents CAMERA, 2 represents Depth. Defaults to 0.
+            vis_mode (str): The visualization mode in Open3D:
+
+                - 'replace': Replace the existing point cloud with input point
+                  cloud.
+                - 'add': Add input point cloud into existing point cloud.
+
+                Defaults to 'replace'.
+            frame_cfg (dict): The coordinate frame config for Open3D
+                visualization initialization.
+                Defaults to dict(size=1, origin=[0, 0, 0]).
+            points_color (Tuple[float]): The color of points.
+                Defaults to (1, 1, 1).
+            points_size (int): The size of points to show on visualizer.
+                Defaults to 2.
+            mode (str): Indicate type of the input points, available mode
+                ['xyz', 'xyzrgb']. Defaults to 'xyz'.
+        """
+        assert points is not None
+        assert vis_mode in ('replace', 'add')
+        check_type('points', points, np.ndarray)
+
+        if not hasattr(self, 'o3d_vis'):
+            self.o3d_vis = self._initialize_o3d_vis()
+
+        # for now we convert points into depth mode for visualization
+        if pcd_mode != Coord3DMode.DEPTH:
+            points = Coord3DMode.convert(points, pcd_mode, Coord3DMode.DEPTH)
+
+        if hasattr(self, 'pcd') and vis_mode != 'add':
+            self.o3d_vis.remove_geometry(self.pcd)
+
+        # set points size in Open3D
+        render_option = self.o3d_vis.get_render_option()
+        if render_option is not None:
+            render_option.point_size = points_size
+            render_option.background_color = np.asarray([0, 0, 0])
+
+        points = points.copy()
+        pcd = geometry.PointCloud()
+        if mode == 'xyz':
+            pcd.points = o3d.utility.Vector3dVector(points[:, :3])
+            points_colors = np.tile(
+                np.array(points_color), (points.shape[0], 1))
+        elif mode == 'xyzrgb':
+            pcd.points = o3d.utility.Vector3dVector(points[:, :3])
+            points_colors = points[:, 3:6]
+            # normalize to [0, 1] for Open3D drawing
+            if not ((points_colors >= 0.0) & (points_colors <= 1.0)).all():
+                points_colors /= 255.0
+        else:
+            raise NotImplementedError
+
+        # create coordinate frame
+        mesh_frame = geometry.TriangleMesh.create_coordinate_frame(**frame_cfg)
+        self.o3d_vis.add_geometry(mesh_frame)
+
+        pcd.colors = o3d.utility.Vector3dVector(points_colors)
+        self.o3d_vis.add_geometry(pcd)
+        self.pcd = pcd
+        self.points_colors = points_colors
+
+    # TODO: assign 3D Box color according to pred / GT labels
+    # We draw GT / pred bboxes on the same point cloud scenes
+    # for better detection performance comparison
+    def draw_bboxes_3d(self,
+                       bboxes_3d: BaseInstance3DBoxes,
+                       bbox_color: Tuple[float] = (0, 1, 0),
+                       points_in_box_color: Tuple[float] = (1, 0, 0),
+                       rot_axis: int = 2,
+                       center_mode: str = 'lidar_bottom',
+                       mode: str = 'xyz') -> None:
+        """Draw bbox on visualizer and change the color of points inside
+        bbox3d.
+
+        Args:
+            bboxes_3d (:obj:`BaseInstance3DBoxes`): 3D bbox
+                (x, y, z, x_size, y_size, z_size, yaw) to visualize.
+            bbox_color (Tuple[float]): The color of 3D bboxes.
+                Defaults to (0, 1, 0).
+            points_in_box_color (Tuple[float]): The color of points inside 3D
+                bboxes. Defaults to (1, 0, 0).
+            rot_axis (int): Rotation axis of 3D bboxes. Defaults to 2.
+            center_mode (str): Indicates the center of bbox is bottom center or
+                gravity center. Available mode
+                ['lidar_bottom', 'camera_bottom']. Defaults to 'lidar_bottom'.
+            mode (str): Indicates the type of input points, available mode
+                ['xyz', 'xyzrgb']. Defaults to 'xyz'.
+        """
+        # Before visualizing the 3D Boxes in point cloud scene
+        # we need to convert the boxes to Depth mode
+        check_type('bboxes', bboxes_3d, BaseInstance3DBoxes)
+
+        if not isinstance(bboxes_3d, DepthInstance3DBoxes):
+            bboxes_3d = bboxes_3d.convert_to(Box3DMode.DEPTH)
+
+        # convert bboxes to numpy dtype
+        bboxes_3d = tensor2ndarray(bboxes_3d.tensor)
+
+        # in_box_color = np.array(points_in_box_color)
+
+        for i in range(len(bboxes_3d)):
+            center = bboxes_3d[i, 0:3]
+            dim = bboxes_3d[i, 3:6]
+            yaw = np.zeros(3)
+            yaw[rot_axis] = bboxes_3d[i, 6]
+            rot_mat = geometry.get_rotation_matrix_from_xyz(yaw)
+
+            if center_mode == 'lidar_bottom':
+                # bottom center to gravity center
+                center[rot_axis] += dim[rot_axis] / 2
+            elif center_mode == 'camera_bottom':
+                # bottom center to gravity center
+                center[rot_axis] -= dim[rot_axis] / 2
+            box3d = geometry.OrientedBoundingBox(center, rot_mat, dim)
+
+            line_set = geometry.LineSet.create_from_oriented_bounding_box(
+                box3d)
+            line_set.paint_uniform_color(np.array(bbox_color[i]) / 255.)
+            # draw bboxes on visualizer
+            self.o3d_vis.add_geometry(line_set)
+
+            # change the color of points which are in box
+            if self.pcd is not None and mode == 'xyz':
+                indices = box3d.get_point_indices_within_bounding_box(
+                    self.pcd.points)
+                self.points_colors[indices] = np.array(bbox_color[i]) / 255.
+
+        # update points colors
+        if self.pcd is not None:
+            self.pcd.colors = o3d.utility.Vector3dVector(self.points_colors)
+            self.o3d_vis.update_geometry(self.pcd)
+
+    def set_bev_image(self,
+                      bev_image: Optional[np.ndarray] = None,
+                      bev_shape: int = 900) -> None:
+        """Set the bev image to draw.
+
+        Args:
+            bev_image (np.ndarray, optional): The bev image to draw.
+                Defaults to None.
+            bev_shape (int): The bev image shape. Defaults to 900.
+        """
+        if bev_image is None:
+            bev_image = np.zeros((bev_shape, bev_shape, 3), np.uint8)
+
+        self._image = bev_image
+        self.width, self.height = bev_image.shape[1], bev_image.shape[0]
+        self._default_font_size = max(
+            np.sqrt(self.height * self.width) // 90, 10)
+        self.ax_save.cla()
+        self.ax_save.axis(False)
+        self.ax_save.imshow(bev_image, origin='lower')
+        # plot camera view range
+        x1 = np.linspace(0, self.width / 2)
+        x2 = np.linspace(self.width / 2, self.width)
+        self.ax_save.plot(
+            x1,
+            self.width / 2 - x1,
+            ls='--',
+            color='grey',
+            linewidth=1,
+            alpha=0.5)
+        self.ax_save.plot(
+            x2,
+            x2 - self.width / 2,
+            ls='--',
+            color='grey',
+            linewidth=1,
+            alpha=0.5)
+        self.ax_save.plot(
+            self.width / 2,
+            0,
+            marker='+',
+            markersize=16,
+            markeredgecolor='red')
+
+    # TODO: Support bev point cloud visualization
+    @master_only
+    def draw_bev_bboxes(self,
+                        bboxes_3d: BaseInstance3DBoxes,
+                        scale: int = 15,
+                        edge_colors: Union[str, Tuple[int],
+                                           List[Union[str, Tuple[int]]]] = 'o',
+                        line_styles: Union[str, List[str]] = '-',
+                        line_widths: Union[int, float, List[Union[int,
+                                                                  float]]] = 1,
+                        face_colors: Union[str, Tuple[int],
+                                           List[Union[str,
+                                                      Tuple[int]]]] = 'none',
+                        alpha: Union[int, float] = 1) -> MMENGINE_Visualizer:
+        """Draw projected 3D boxes on the image.
+
+        Args:
+            bboxes_3d (:obj:`BaseInstance3DBoxes`): 3D bbox
+                (x, y, z, x_size, y_size, z_size, yaw) to visualize.
+            scale (dict): Value to scale the bev bboxes for better
+                visualization. Defaults to 15.
+            edge_colors (str or Tuple[int] or List[str or Tuple[int]]):
+                The colors of bboxes. ``colors`` can have the same length with
+                lines or just single value. If ``colors`` is single value, all
+                the lines will have the same colors. Refer to `matplotlib.
+                colors` for full list of formats that are accepted.
+                Defaults to 'o'.
+            line_styles (str or List[str]): The linestyle of lines.
+                ``line_styles`` can have the same length with texts or just
+                single value. If ``line_styles`` is single value, all the lines
+                will have the same linestyle. Reference to
+                https://matplotlib.org/stable/api/collections_api.html?highlight=collection#matplotlib.collections.AsteriskPolygonCollection.set_linestyle
+                for more details. Defaults to '-'.
+            line_widths (int or float or List[int or float]): The linewidth of
+                lines. ``line_widths`` can have the same length with lines or
+                just single value. If ``line_widths`` is single value, all the
+                lines will have the same linewidth. Defaults to 2.
+            face_colors (str or Tuple[int] or List[str or Tuple[int]]):
+                The face colors. Defaults to 'none'.
+            alpha (int or float): The transparency of bboxes. Defaults to 1.
+        """
+
+        check_type('bboxes', bboxes_3d, BaseInstance3DBoxes)
+        bev_bboxes = tensor2ndarray(bboxes_3d.bev)
+        # scale the bev bboxes for better visualization
+        bev_bboxes[:, :4] *= scale
+        ctr, w, h, theta = np.split(bev_bboxes, [2, 3, 4], axis=-1)
+        cos_value, sin_value = np.cos(theta), np.sin(theta)
+        vec1 = np.concatenate([w / 2 * cos_value, w / 2 * sin_value], axis=-1)
+        vec2 = np.concatenate([-h / 2 * sin_value, h / 2 * cos_value], axis=-1)
+        pt1 = ctr + vec1 + vec2
+        pt2 = ctr + vec1 - vec2
+        pt3 = ctr - vec1 - vec2
+        pt4 = ctr - vec1 + vec2
+        poly = np.stack([pt1, pt2, pt3, pt4], axis=-2)
+        # move the object along x-axis
+        poly[:, :, 0] += self.width / 2
+        poly = [p for p in poly]
+        return self.draw_polygons(
+            poly,
+            alpha=alpha,
+            edge_colors=edge_colors,
+            line_styles=line_styles,
+            line_widths=line_widths,
+            face_colors=face_colors)
+
+    @master_only
+    def draw_points_on_image(self,
+                             points: Union[np.ndarray, Tensor],
+                             pts2img: np.ndarray,
+                             sizes: Union[np.ndarray, int] = 3,
+                             max_depth: Optional[float] = None) -> None:
+        """Draw projected points on the image.
+
+        Args:
+            points (np.ndarray or Tensor): Points to draw.
+            pts2img (np.ndarray): The transformation matrix from the coordinate
+                of point cloud to image plane.
+            sizes (np.ndarray or int): The marker size. Defaults to 10.
+            max_depth (float): The max depth in the color map. Defaults to
+                None.
+        """
+        check_type('points', points, (np.ndarray, Tensor))
+        points = tensor2ndarray(points)
+        assert self._image is not None, 'Please set image using `set_image`'
+        projected_points = points_cam2img(points, pts2img, with_depth=True)
+        depths = projected_points[:, 2]
+        # Show depth adaptively consideing different scenes
+        if max_depth is None:
+            max_depth = depths.max()
+        colors = (depths % max_depth) / max_depth
+        # use colormap to obtain the render color
+        color_map = plt.get_cmap('jet')
+        self.ax_save.scatter(
+            projected_points[:, 0],
+            projected_points[:, 1],
+            c=colors,
+            cmap=color_map,
+            s=sizes,
+            alpha=0.7,
+            edgecolors='none')
+
+    # TODO: set bbox color according to palette
+    @master_only
+    def draw_proj_bboxes_3d(
+            self,
+            bboxes_3d: BaseInstance3DBoxes,
+            input_meta: dict,
+            edge_colors: Union[str, Tuple[int],
+                               List[Union[str, Tuple[int]]]] = 'royalblue',
+            line_styles: Union[str, List[str]] = '-',
+            line_widths: Union[int, float, List[Union[int, float]]] = 2,
+            face_colors: Union[str, Tuple[int],
+                               List[Union[str, Tuple[int]]]] = 'royalblue',
+            alpha: Union[int, float] = 0.4,
+            img_size: Optional[Tuple] = None):
+        """Draw projected 3D boxes on the image.
+
+        Args:
+            bboxes_3d (:obj:`BaseInstance3DBoxes`): 3D bbox
+                (x, y, z, x_size, y_size, z_size, yaw) to visualize.
+            input_meta (dict): Input meta information.
+            edge_colors (str or Tuple[int] or List[str or Tuple[int]]):
+                The colors of bboxes. ``colors`` can have the same length with
+                lines or just single value. If ``colors`` is single value, all
+                the lines will have the same colors. Refer to `matplotlib.
+                colors` for full list of formats that are accepted.
+                Defaults to 'royalblue'.
+            line_styles (str or List[str]): The linestyle of lines.
+                ``line_styles`` can have the same length with texts or just
+                single value. If ``line_styles`` is single value, all the lines
+                will have the same linestyle. Reference to
+                https://matplotlib.org/stable/api/collections_api.html?highlight=collection#matplotlib.collections.AsteriskPolygonCollection.set_linestyle
+                for more details. Defaults to '-'.
+            line_widths (int or float or List[int or float]): The linewidth of
+                lines. ``line_widths`` can have the same length with lines or
+                just single value. If ``line_widths`` is single value, all the
+                lines will have the same linewidth. Defaults to 2.
+            face_colors (str or Tuple[int] or List[str or Tuple[int]]):
+                The face colors. Defaults to 'royalblue'.
+            alpha (int or float): The transparency of bboxes. Defaults to 0.4.
+            img_size (tuple, optional): The size (w, h) of the image.
+        """
+
+        check_type('bboxes', bboxes_3d, BaseInstance3DBoxes)
+
+        if isinstance(bboxes_3d, DepthInstance3DBoxes):
+            proj_bbox3d_to_img = proj_depth_bbox3d_to_img
+        elif isinstance(bboxes_3d, LiDARInstance3DBoxes):
+            proj_bbox3d_to_img = proj_lidar_bbox3d_to_img
+        elif isinstance(bboxes_3d, CameraInstance3DBoxes):
+            proj_bbox3d_to_img = proj_camera_bbox3d_to_img
+        else:
+            raise NotImplementedError('unsupported box type!')
+
+        edge_colors_norm = color_val_matplotlib(edge_colors)
+
+        corners_2d = proj_bbox3d_to_img(bboxes_3d, input_meta)
+        if img_size is not None:
+            # Filter out the bbox where half of stuff is outside the image.
+            # This is for the visualization of multi-view image.
+            valid_point_idx = (corners_2d[..., 0] >= 0) & \
+                        (corners_2d[..., 0] <= img_size[0]) & \
+                        (corners_2d[..., 1] >= 0) & (corners_2d[..., 1] <= img_size[1])  # noqa: E501
+            valid_bbox_idx = valid_point_idx.sum(axis=-1) >= 4
+            corners_2d = corners_2d[valid_bbox_idx]
+            filter_edge_colors = []
+            filter_edge_colors_norm = []
+            for i, color in enumerate(edge_colors):
+                if valid_bbox_idx[i]:
+                    filter_edge_colors.append(color)
+                    filter_edge_colors_norm.append(edge_colors_norm[i])
+            edge_colors = filter_edge_colors
+            edge_colors_norm = filter_edge_colors_norm
+
+        lines_verts_idx = [0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 5, 1, 2, 6]
+        lines_verts = corners_2d[:, lines_verts_idx, :]
+        front_polys = corners_2d[:, 4:, :]
+        codes = [Path.LINETO] * lines_verts.shape[1]
+        codes[0] = Path.MOVETO
+        pathpatches = []
+        for i in range(len(corners_2d)):
+            verts = lines_verts[i]
+            pth = Path(verts, codes)
+            pathpatches.append(PathPatch(pth))
+
+        p = PatchCollection(
+            pathpatches,
+            facecolors='none',
+            edgecolors=edge_colors_norm,
+            linewidths=line_widths,
+            linestyles=line_styles)
+
+        self.ax_save.add_collection(p)
+
+        # draw a mask on the front of project bboxes
+        front_polys = [front_poly for front_poly in front_polys]
+        return self.draw_polygons(
+            front_polys,
+            alpha=alpha,
+            edge_colors=edge_colors,
+            line_styles=line_styles,
+            line_widths=line_widths,
+            face_colors=edge_colors)
+
+    @master_only
+    def draw_seg_mask(self, seg_mask_colors: np.ndarray) -> None:
+        """Add segmentation mask to visualizer via per-point colorization.
+
+        Args:
+            seg_mask_colors (np.ndarray): The segmentation mask with shape
+                (N, 6), whose first 3 dims are point coordinates and last 3
+                dims are converted colors.
+        """
+        # we can't draw the colors on existing points
+        # in case gt and pred mask would overlap
+        # instead we set a large offset along x-axis for each seg mask
+        if hasattr(self, 'pcd'):
+            offset = (np.array(self.pcd.points).max(0) -
+                      np.array(self.pcd.points).min(0))[0] * 1.2
+            mesh_frame = geometry.TriangleMesh.create_coordinate_frame(
+                size=1, origin=[offset, 0,
+                                0])  # create coordinate frame for seg
+            self.o3d_vis.add_geometry(mesh_frame)
+        else:
+            offset = 0
+        seg_points = copy.deepcopy(seg_mask_colors)
+        seg_points[:, 0] += offset
+        self.set_points(seg_points, pcd_mode=2, vis_mode='add', mode='xyzrgb')
+
+    def _draw_instances_3d(self,
+                           data_input: dict,
+                           instances: InstanceData,
+                           input_meta: dict,
+                           vis_task: str,
+                           show_pcd_rgb: bool = False,
+                           palette: Optional[List[tuple]] = None) -> dict:
+        """Draw 3D instances of GT or prediction.
+
+        Args:
+            data_input (dict): The input dict to draw.
+            instances (:obj:`InstanceData`): Data structure for instance-level
+                annotations or predictions.
+            input_meta (dict): Meta information.
+            vis_task (str): Visualization task, it includes: 'lidar_det',
+                'multi-modality_det', 'mono_det'.
+            show_pcd_rgb (bool): Whether to show RGB point cloud.
+            palette (List[tuple], optional): Palette information corresponding
+                to the category. Defaults to None.
+
+        Returns:
+            dict: The drawn point cloud and image whose channel is RGB.
+        """
+
+        # Only visualize when there is at least one instance
+        if not len(instances) > 0:
+            return None
+
+        bboxes_3d = instances.bboxes_3d  # BaseInstance3DBoxes
+        labels_3d = instances.labels_3d
+
+        data_3d = dict()
+
+        if vis_task in ['lidar_det', 'multi-modality_det']:
+            assert 'points' in data_input
+            points = data_input['points']
+            check_type('points', points, (np.ndarray, Tensor))
+            points = tensor2ndarray(points)
+
+            if not isinstance(bboxes_3d, DepthInstance3DBoxes):
+                points, bboxes_3d_depth = to_depth_mode(points, bboxes_3d)
+            else:
+                bboxes_3d_depth = bboxes_3d.clone()
+
+            if 'axis_align_matrix' in input_meta:
+                points = DepthPoints(points, points_dim=points.shape[1])
+                rot_mat = input_meta['axis_align_matrix'][:3, :3]
+                trans_vec = input_meta['axis_align_matrix'][:3, -1]
+                points.rotate(rot_mat.T)
+                points.translate(trans_vec)
+                points = tensor2ndarray(points.tensor)
+
+            max_label = int(max(labels_3d) if len(labels_3d) > 0 else 0)
+            bbox_color = palette if self.bbox_color is None \
+                else self.bbox_color
+            bbox_palette = get_palette(bbox_color, max_label + 1)
+            colors = [bbox_palette[label] for label in labels_3d]
+
+            self.set_points(
+                points, pcd_mode=2, mode='xyzrgb' if show_pcd_rgb else 'xyz')
+            self.draw_bboxes_3d(bboxes_3d_depth, bbox_color=colors)
+
+            data_3d['bboxes_3d'] = tensor2ndarray(bboxes_3d_depth.tensor)
+            data_3d['points'] = points
+
+        if vis_task in ['mono_det', 'multi-modality_det']:
+            assert 'img' in data_input
+            img = data_input['img']
+            if isinstance(img, list) or (isinstance(img, (np.ndarray, Tensor))
+                                         and len(img.shape) == 4):
+                # show multi-view images
+                img_size = img[0].shape[:2] if isinstance(
+                    img, list) else img.shape[-2:]  # noqa: E501
+                img_col = self.multi_imgs_col
+                img_row = math.ceil(len(img) / img_col)
+                composed_img = np.zeros(
+                    (img_size[0] * img_row, img_size[1] * img_col, 3),
+                    dtype=np.uint8)
+                for i, single_img in enumerate(img):
+                    # Note that we should keep the same order of elements both
+                    # in `img` and `input_meta`
+                    if isinstance(single_img, Tensor):
+                        single_img = single_img.permute(1, 2, 0).numpy()
+                        single_img = single_img[..., [2, 1, 0]]  # bgr to rgb
+                    self.set_image(single_img)
+                    single_img_meta = dict()
+                    for key, meta in input_meta.items():
+                        if isinstance(meta,
+                                      (Sequence, np.ndarray,
+                                       Tensor)) and len(meta) == len(img):
+                            single_img_meta[key] = meta[i]
+                        else:
+                            single_img_meta[key] = meta
+
+                    max_label = int(
+                        max(labels_3d) if len(labels_3d) > 0 else 0)
+                    bbox_color = palette if self.bbox_color is None \
+                        else self.bbox_color
+                    bbox_palette = get_palette(bbox_color, max_label + 1)
+                    colors = [bbox_palette[label] for label in labels_3d]
+
+                    self.draw_proj_bboxes_3d(
+                        bboxes_3d,
+                        single_img_meta,
+                        img_size=single_img.shape[:2][::-1],
+                        edge_colors=colors)
+                    if vis_task == 'mono_det' and hasattr(
+                            instances, 'centers_2d'):
+                        centers_2d = instances.centers_2d
+                        self.draw_points(centers_2d)
+                    composed_img[(i // img_col) *
+                                 img_size[0]:(i // img_col + 1) * img_size[0],
+                                 (i % img_col) *
+                                 img_size[1]:(i % img_col + 1) *
+                                 img_size[1]] = self.get_image()
+                data_3d['img'] = composed_img
+            else:
+                # show single-view image
+                # TODO: Solve the problem: some line segments of 3d bboxes are
+                # out of image by a large margin
+                if isinstance(data_input['img'], Tensor):
+                    img = img.permute(1, 2, 0).numpy()
+                    img = img[..., [2, 1, 0]]  # bgr to rgb
+                self.set_image(img)
+
+                max_label = int(max(labels_3d) if len(labels_3d) > 0 else 0)
+                bbox_color = palette if self.bbox_color is None \
+                    else self.bbox_color
+                bbox_palette = get_palette(bbox_color, max_label + 1)
+                colors = [bbox_palette[label] for label in labels_3d]
+
+                self.draw_proj_bboxes_3d(
+                    bboxes_3d, input_meta, edge_colors=colors)
+                if vis_task == 'mono_det' and hasattr(instances, 'centers_2d'):
+                    centers_2d = instances.centers_2d
+                    self.draw_points(centers_2d)
+                drawn_img = self.get_image()
+                data_3d['img'] = drawn_img
+
+        return data_3d
+
+    def _draw_pts_sem_seg(self,
+                          points: Union[Tensor, np.ndarray],
+                          pts_seg: PointData,
+                          palette: Optional[List[tuple]] = None,
+                          keep_index: Optional[int] = None) -> None:
+        """Draw 3D semantic mask of GT or prediction.
+
+        Args:
+            points (Tensor or np.ndarray): The input point cloud to draw.
+            pts_seg (:obj:`PointData`): Data structure for pixel-level
+                annotations or predictions.
+            palette (List[tuple], optional): Palette information corresponding
+                to the category. Defaults to None.
+            ignore_index (int, optional): Ignore category. Defaults to None.
+        """
+        check_type('points', points, (np.ndarray, Tensor))
+
+        points = tensor2ndarray(points)
+        pts_sem_seg = tensor2ndarray(pts_seg.pts_semantic_mask)
+        palette = np.array(palette)
+
+        if keep_index is not None:
+            keep_index = tensor2ndarray(keep_index)
+            points = points[keep_index]
+            pts_sem_seg = pts_sem_seg[keep_index]
+
+        pts_color = palette[pts_sem_seg]
+        seg_color = np.concatenate([points[:, :3], pts_color], axis=1)
+
+        self.draw_seg_mask(seg_color)
+
+    @master_only
+    def show(self,
+             save_path: Optional[str] = None,
+             drawn_img_3d: Optional[np.ndarray] = None,
+             drawn_img: Optional[np.ndarray] = None,
+             win_name: str = 'image',
+             wait_time: int = -1,
+             continue_key: str = 'right',
+             vis_task: str = 'lidar_det') -> None:
+        """Show the drawn point cloud/image.
+
+        Args:
+            save_path (str, optional): Path to save open3d visualized results.
+                Defaults to None.
+            drawn_img_3d (np.ndarray, optional): The image to show. If
+                drawn_img_3d is not None, it will show the image got by
+                Visualizer. Defaults to None.
+            drawn_img (np.ndarray, optional): The image to show. If drawn_img
+                is not None, it will show the image got by Visualizer.
+                Defaults to None.
+            win_name (str): The image title. Defaults to 'image'.
+            wait_time (int): Delay in milliseconds. 0 is the special value that
+                means "forever". Defaults to 0.
+            continue_key (str): The key for users to continue. Defaults to ' '.
+        """
+
+        # In order to show multi-modal results at the same time, we show image
+        # firstly and then show point cloud since the running of
+        # Open3D will block the process
+        if hasattr(self, '_image'):
+            if drawn_img is None and drawn_img_3d is None:
+                # use the image got by Visualizer.get_image()
+                if vis_task == 'multi-modality_det':
+                    import matplotlib.pyplot as plt
+                    is_inline = 'inline' in plt.get_backend()
+                    img = self.get_image() if drawn_img is None else drawn_img
+                    self._init_manager(win_name)
+                    fig = self.manager.canvas.figure
+                    # remove white edges by set subplot margin
+                    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
+                    fig.clear()
+                    ax = fig.add_subplot()
+                    ax.axis(False)
+                    ax.imshow(img)
+                    self.manager.canvas.draw()
+                    if is_inline:
+                        return fig
+                    else:
+                        fig.show()
+                    self.manager.canvas.flush_events()
+                else:
+                    super().show(drawn_img_3d, win_name, wait_time,
+                                 continue_key)
+            else:
+                if vis_task == 'multi-modality_det':
+                    import matplotlib.pyplot as plt
+                    is_inline = 'inline' in plt.get_backend()
+                    img = drawn_img if drawn_img_3d is None else drawn_img_3d
+                    self._init_manager(win_name)
+                    fig = self.manager.canvas.figure
+                    # remove white edges by set subplot margin
+                    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
+                    fig.clear()
+                    ax = fig.add_subplot()
+                    ax.axis(False)
+                    ax.imshow(img)
+                    self.manager.canvas.draw()
+                    if is_inline:
+                        return fig
+                    else:
+                        fig.show()
+                    self.manager.canvas.flush_events()
+                else:
+                    if drawn_img_3d is not None:
+                        super().show(drawn_img_3d, win_name, wait_time,
+                                     continue_key)
+                    if drawn_img is not None:
+                        super().show(drawn_img, win_name, wait_time,
+                                     continue_key)
+
+        if hasattr(self, 'o3d_vis'):
+            if hasattr(self, 'view_port'):
+                self.view_control.convert_from_pinhole_camera_parameters(
+                    self.view_port)
+            self.flag_exit = not self.o3d_vis.poll_events()
+            self.o3d_vis.update_renderer()
+            # if not hasattr(self, 'view_control'):
+            #     self.o3d_vis.create_window()
+            #     self.view_control = self.o3d_vis.get_view_control()
+            self.view_port = \
+                self.view_control.convert_to_pinhole_camera_parameters()  # noqa: E501
+            if wait_time != -1:
+                self.last_time = time.time()
+                while time.time(
+                ) - self.last_time < wait_time and self.o3d_vis.poll_events():
+                    self.o3d_vis.update_renderer()
+                    self.view_port = \
+                        self.view_control.convert_to_pinhole_camera_parameters()  # noqa: E501
+                while self.flag_pause and self.o3d_vis.poll_events():
+                    self.o3d_vis.update_renderer()
+                    self.view_port = \
+                        self.view_control.convert_to_pinhole_camera_parameters()  # noqa: E501
+
+            else:
+                while not self.flag_next and self.o3d_vis.poll_events():
+                    self.o3d_vis.update_renderer()
+                    self.view_port = \
+                        self.view_control.convert_to_pinhole_camera_parameters()  # noqa: E501
+                self.flag_next = False
+            self.o3d_vis.clear_geometries()
+            try:
+                del self.pcd
+            except (KeyError, AttributeError):
+                pass
+            if save_path is not None:
+                if not (save_path.endswith('.png')
+                        or save_path.endswith('.jpg')):
+                    save_path += '.png'
+                self.o3d_vis.capture_screen_image(save_path)
+            if self.flag_exit:
+                self.o3d_vis.destroy_window()
+                self.o3d_vis.close()
+                self._clear_o3d_vis()
+                sys.exit(0)
+
+    def escape_callback(self, vis):
+        self.o3d_vis.clear_geometries()
+        self.o3d_vis.destroy_window()
+        self.o3d_vis.close()
+        self._clear_o3d_vis()
+        sys.exit(0)
+
+    def space_action_callback(self, vis, action, mods):
+        if action == 1:
+            if self.flag_pause:
+                print_log(
+                    'Playback continued, press [SPACE] to pause.',
+                    logger='current')
+            else:
+                print_log(
+                    'Playback paused, press [SPACE] to continue.',
+                    logger='current')
+            self.flag_pause = not self.flag_pause
+        return True
+
+    def right_callback(self, vis):
+        self.flag_next = True
+        return False
+
+    # TODO: Support Visualize the 3D results from image and point cloud
+    # respectively
+    @master_only
+    def add_datasample(self,
+                       name: str,
+                       data_input: dict,
+                       data_sample: Optional[Det3DDataSample] = None,
+                       draw_gt: bool = True,
+                       draw_pred: bool = True,
+                       show: bool = False,
+                       wait_time: float = 0,
+                       out_file: Optional[str] = None,
+                       o3d_save_path: Optional[str] = None,
+                       vis_task: str = 'mono_det',
+                       pred_score_thr: float = 0.3,
+                       step: int = 0,
+                       show_pcd_rgb: bool = False) -> None:
+        """Draw datasample and save to all backends.
+
+        - If GT and prediction are plotted at the same time, they are displayed
+          in a stitched image where the left image is the ground truth and the
+          right image is the prediction.
+        - If ``show`` is True, all storage backends are ignored, and the images
+          will be displayed in a local window.
+        - If ``out_file`` is specified, the drawn image will be saved to
+          ``out_file``. It is usually used when the display is not available.
+
+        Args:
+            name (str): The image identifier.
+            data_input (dict): It should include the point clouds or image
+                to draw.
+            data_sample (:obj:`Det3DDataSample`, optional): Prediction
+                Det3DDataSample. Defaults to None.
+            draw_gt (bool): Whether to draw GT Det3DDataSample.
+                Defaults to True.
+            draw_pred (bool): Whether to draw Prediction Det3DDataSample.
+                Defaults to True.
+            show (bool): Whether to display the drawn point clouds and image.
+                Defaults to False.
+            wait_time (float): The interval of show (s). Defaults to 0.
+            out_file (str, optional): Path to output file. Defaults to None.
+            o3d_save_path (str, optional): Path to save open3d visualized
+                results. Defaults to None.
+            vis_task (str): Visualization task. Defaults to 'mono_det'.
+            pred_score_thr (float): The threshold to visualize the bboxes
+                and masks. Defaults to 0.3.
+            step (int): Global step value to record. Defaults to 0.
+            show_pcd_rgb (bool): Whether to show RGB point cloud. Defaults to
+                False.
+        """
+        assert vis_task in (
+            'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg',
+            'multi-modality_det'), f'got unexpected vis_task {vis_task}.'
+        classes = self.dataset_meta.get('classes', None)
+        # For object detection datasets, no palette is saved
+        palette = self.dataset_meta.get('palette', None)
+        ignore_index = self.dataset_meta.get('ignore_index', None)
+        if vis_task == 'lidar_seg' and ignore_index is not None and 'pts_semantic_mask' in data_sample.gt_pts_seg:  # noqa: E501
+            keep_index = data_sample.gt_pts_seg.pts_semantic_mask != ignore_index  # noqa: E501
+        else:
+            keep_index = None
+
+        gt_data_3d = None
+        pred_data_3d = None
+        gt_img_data = None
+        pred_img_data = None
+
+        if not hasattr(self, 'o3d_vis') and vis_task in [
+                'multi-view_det', 'lidar_det', 'lidar_seg',
+                'multi-modality_det'
+        ]:
+            self.o3d_vis = self._initialize_o3d_vis(show=show)
+
+        if draw_gt and data_sample is not None:
+            if 'gt_instances_3d' in data_sample:
+                gt_data_3d = self._draw_instances_3d(
+                    data_input, data_sample.gt_instances_3d,
+                    data_sample.metainfo, vis_task, show_pcd_rgb, palette)
+            if 'gt_instances' in data_sample:
+                if len(data_sample.gt_instances) > 0:
+                    assert 'img' in data_input
+                    img = data_input['img']
+                    if isinstance(data_input['img'], Tensor):
+                        img = data_input['img'].permute(1, 2, 0).numpy()
+                        img = img[..., [2, 1, 0]]  # bgr to rgb
+                    gt_img_data = self._draw_instances(
+                        img, data_sample.gt_instances, classes, palette)
+            if 'gt_pts_seg' in data_sample and vis_task == 'lidar_seg':
+                assert classes is not None, 'class information is ' \
+                                            'not provided when ' \
+                                            'visualizing semantic ' \
+                                            'segmentation results.'
+                assert 'points' in data_input
+                self._draw_pts_sem_seg(data_input['points'],
+                                       data_sample.gt_pts_seg, palette,
+                                       keep_index)
+
+        if draw_pred and data_sample is not None:
+            if 'pred_instances_3d' in data_sample:
+                pred_instances_3d = data_sample.pred_instances_3d
+                # .cpu can not be used for BaseInstance3DBoxes
+                # so we need to use .to('cpu')
+                pred_instances_3d = pred_instances_3d[
+                    pred_instances_3d.scores_3d > pred_score_thr].to('cpu')
+                pred_data_3d = self._draw_instances_3d(data_input,
+                                                       pred_instances_3d,
+                                                       data_sample.metainfo,
+                                                       vis_task, show_pcd_rgb,
+                                                       palette)
+            if 'pred_instances' in data_sample:
+                if 'img' in data_input and len(data_sample.pred_instances) > 0:
+                    pred_instances = data_sample.pred_instances
+                    pred_instances = pred_instances[
+                        pred_instances.scores > pred_score_thr].cpu()
+                    img = data_input['img']
+                    if isinstance(data_input['img'], Tensor):
+                        img = data_input['img'].permute(1, 2, 0).numpy()
+                        img = img[..., [2, 1, 0]]  # bgr to rgb
+                    pred_img_data = self._draw_instances(
+                        img, pred_instances, classes, palette)
+            if 'pred_pts_seg' in data_sample and vis_task == 'lidar_seg':
+                assert classes is not None, 'class information is ' \
+                                            'not provided when ' \
+                                            'visualizing semantic ' \
+                                            'segmentation results.'
+                assert 'points' in data_input
+                self._draw_pts_sem_seg(data_input['points'],
+                                       data_sample.pred_pts_seg, palette,
+                                       keep_index)
+
+        # monocular 3d object detection image
+        if vis_task in ['mono_det', 'multi-modality_det']:
+            if gt_data_3d is not None and pred_data_3d is not None:
+                drawn_img_3d = np.concatenate(
+                    (gt_data_3d['img'], pred_data_3d['img']), axis=1)
+            elif gt_data_3d is not None:
+                drawn_img_3d = gt_data_3d['img']
+            elif pred_data_3d is not None:
+                drawn_img_3d = pred_data_3d['img']
+            else:  # both instances of gt and pred are empty
+                drawn_img_3d = None
+        else:
+            drawn_img_3d = None
+
+        # 2d object detection image
+        if gt_img_data is not None and pred_img_data is not None:
+            drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1)
+        elif gt_img_data is not None:
+            drawn_img = gt_img_data
+        elif pred_img_data is not None:
+            drawn_img = pred_img_data
+        else:
+            drawn_img = None
+
+        if show:
+            self.show(
+                o3d_save_path,
+                drawn_img_3d,
+                drawn_img,
+                win_name=name,
+                wait_time=wait_time,
+                vis_task=vis_task)
+
+        if out_file is not None:
+            # check the suffix of the name of image file
+            if not (out_file.endswith('.png') or out_file.endswith('.jpg')):
+                out_file = f'{out_file}.png'
+            if drawn_img_3d is not None:
+                mmcv.imwrite(drawn_img_3d[..., ::-1], out_file)
+            if drawn_img is not None:
+                mmcv.imwrite(drawn_img[..., ::-1],
+                             out_file[:-4] + '_2d' + out_file[-4:])
+        else:
+            self.add_image(name, drawn_img_3d, step)
diff --git a/mmde/mmdet3d/visualization/vis_utils.py b/mmde/mmdet3d/visualization/vis_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..340f28f71695ab42348bff65ee4d1134879be43d
--- /dev/null
+++ b/mmde/mmdet3d/visualization/vis_utils.py
@@ -0,0 +1,177 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Tuple
+
+import numpy as np
+import torch
+import trimesh
+
+from mmdet3d.structures import (BaseInstance3DBoxes, Box3DMode,
+                                CameraInstance3DBoxes, Coord3DMode,
+                                DepthInstance3DBoxes, LiDARInstance3DBoxes)
+
+
+def write_obj(points: np.ndarray, out_filename: str) -> None:
+    """Write points into ``obj`` format for meshlab visualization.
+
+    Args:
+        points (np.ndarray): Points in shape (N, dim).
+        out_filename (str): Filename to be saved.
+    """
+    N = points.shape[0]
+    fout = open(out_filename, 'w')
+    for i in range(N):
+        if points.shape[1] == 6:
+            c = points[i, 3:].astype(int)
+            fout.write(
+                'v %f %f %f %d %d %d\n' %
+                (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2]))
+
+        else:
+            fout.write('v %f %f %f\n' %
+                       (points[i, 0], points[i, 1], points[i, 2]))
+    fout.close()
+
+
+def write_oriented_bbox(scene_bbox: np.ndarray, out_filename: str) -> None:
+    """Export oriented (around Z axis) scene bbox to meshes.
+
+    Args:
+        scene_bbox (np.ndarray): xyz pos of center and 3 lengths
+            (x_size, y_size, z_size) and heading angle around Z axis.
+            Y forward, X right, Z upward, heading angle of positive X is 0,
+            heading angle of positive Y is 90 degrees.
+        out_filename (str): Filename.
+    """
+
+    def heading2rotmat(heading_angle: float) -> np.ndarray:
+        rotmat = np.zeros((3, 3))
+        rotmat[2, 2] = 1
+        cosval = np.cos(heading_angle)
+        sinval = np.sin(heading_angle)
+        rotmat[0:2, 0:2] = np.array([[cosval, -sinval], [sinval, cosval]])
+        return rotmat
+
+    def convert_oriented_box_to_trimesh_fmt(
+            box: np.ndarray) -> trimesh.base.Trimesh:
+        ctr = box[:3]
+        lengths = box[3:6]
+        trns = np.eye(4)
+        trns[0:3, 3] = ctr
+        trns[3, 3] = 1.0
+        trns[0:3, 0:3] = heading2rotmat(box[6])
+        box_trimesh_fmt = trimesh.creation.box(lengths, trns)
+        return box_trimesh_fmt
+
+    if len(scene_bbox) == 0:
+        scene_bbox = np.zeros((1, 7))
+    scene = trimesh.scene.Scene()
+    for box in scene_bbox:
+        scene.add_geometry(convert_oriented_box_to_trimesh_fmt(box))
+
+    mesh_list = trimesh.util.concatenate(scene.dump())
+    # save to obj file
+    trimesh.io.export.export_mesh(mesh_list, out_filename, file_type='obj')
+
+
+def to_depth_mode(
+        points: np.ndarray,
+        bboxes: BaseInstance3DBoxes) -> Tuple[np.ndarray, BaseInstance3DBoxes]:
+    """Convert points and bboxes to Depth Coord and Depth Box mode."""
+    if points is not None:
+        points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR,
+                                           Coord3DMode.DEPTH)
+    if bboxes is not None:
+        bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR,
+                                   Box3DMode.DEPTH)
+    return points, bboxes
+
+
+# TODO: refactor lidar2img to img_meta
+def proj_lidar_bbox3d_to_img(bboxes_3d: LiDARInstance3DBoxes,
+                             input_meta: dict) -> np.ndarray:
+    """Project the 3D bbox on 2D plane.
+
+    Args:
+        bboxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bbox in lidar coordinate
+            system to visualize.
+        input_meta (dict): Meta information.
+    """
+    corners_3d = bboxes_3d.corners.cpu().numpy()
+    num_bbox = corners_3d.shape[0]
+    pts_4d = np.concatenate(
+        [corners_3d.reshape(-1, 3),
+         np.ones((num_bbox * 8, 1))], axis=-1)
+    lidar2img = copy.deepcopy(input_meta['lidar2img']).reshape(4, 4)
+    if isinstance(lidar2img, torch.Tensor):
+        lidar2img = lidar2img.cpu().numpy()
+    pts_2d = pts_4d @ lidar2img.T
+
+    pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5)
+    pts_2d[:, 0] /= pts_2d[:, 2]
+    pts_2d[:, 1] /= pts_2d[:, 2]
+    imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2)
+
+    return imgfov_pts_2d
+
+
+# TODO: remove third parameter in all functions here in favour of img_metas
+def proj_depth_bbox3d_to_img(bboxes_3d: DepthInstance3DBoxes,
+                             input_meta: dict) -> np.ndarray:
+    """Project the 3D bbox on 2D plane and draw on input image.
+
+    Args:
+        bboxes_3d (:obj:`DepthInstance3DBoxes`): 3D bbox in depth coordinate
+            system to visualize.
+        input_meta (dict): Meta information.
+    """
+    from mmdet3d.models import apply_3d_transformation
+    from mmdet3d.structures import points_cam2img
+
+    input_meta = copy.deepcopy(input_meta)
+    corners_3d = bboxes_3d.corners
+    num_bbox = corners_3d.shape[0]
+    points_3d = corners_3d.reshape(-1, 3)
+
+    # first reverse the data transformations
+    xyz_depth = apply_3d_transformation(
+        points_3d, 'DEPTH', input_meta, reverse=True)
+
+    # project to 2d to get image coords (uv)
+    uv_origin = points_cam2img(xyz_depth,
+                               xyz_depth.new_tensor(input_meta['depth2img']))
+    uv_origin = (uv_origin - 1).round()
+    imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy()
+
+    return imgfov_pts_2d
+
+
+# project the camera bboxes 3d to image
+def proj_camera_bbox3d_to_img(bboxes_3d: CameraInstance3DBoxes,
+                              input_meta: dict) -> np.ndarray:
+    """Project the 3D bbox on 2D plane and draw on input image.
+
+    Args:
+        bboxes_3d (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate
+            system to visualize.
+        input_meta (dict): Meta information.
+    """
+    from mmdet3d.structures import points_cam2img
+
+    cam2img = copy.deepcopy(input_meta['cam2img'])
+    corners_3d = bboxes_3d.corners
+    num_bbox = corners_3d.shape[0]
+    points_3d = corners_3d.reshape(-1, 3)
+    if not isinstance(cam2img, torch.Tensor):
+        cam2img = torch.from_numpy(np.array(cam2img))
+
+    assert (cam2img.shape == torch.Size([3, 3])
+            or cam2img.shape == torch.Size([4, 4]))
+    cam2img = cam2img.float().cpu()
+
+    # project to 2d to get image coords (uv)
+    uv_origin = points_cam2img(points_3d, cam2img)
+    uv_origin = (uv_origin - 1).round()
+    imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy()
+
+    return imgfov_pts_2d
diff --git a/mmde/model-index.yml b/mmde/model-index.yml
new file mode 100644
index 0000000000000000000000000000000000000000..672e665a76c9e3e2ee3f27784f363e377a7c23b1
--- /dev/null
+++ b/mmde/model-index.yml
@@ -0,0 +1,30 @@
+Import:
+  - configs/3dssd/metafile.yml
+  - configs/centerpoint/metafile.yml
+  - configs/dgcnn/metafile.yml
+  - configs/dynamic_voxelization/metafile.yml
+  - configs/fcos3d/metafile.yml
+  - configs/free_anchor/metafile.yml
+  - configs/groupfree3d/metafile.yml
+  - configs/h3dnet/metafile.yml
+  - configs/imvotenet/metafile.yml
+  - configs/imvoxelnet/metafile.yml
+  - configs/monoflex/metafile.yml
+  - configs/mvxnet/metafile.yml
+  - configs/nuimages/metafile.yml
+  - configs/paconv/metafile.yml
+  - configs/parta2/metafile.yml
+  - configs/pgd/metafile.yml
+  - configs/point_rcnn/metafile.yml
+  - configs/pointnet2/metafile.yml
+  - configs/pointpillars/metafile.yml
+  - configs/regnet/metafile.yml
+  - configs/second/metafile.yml
+  - configs/smoke/metafile.yml
+  - configs/ssn/metafile.yml
+  - configs/votenet/metafile.yml
+  - configs/minkunet/metafile.yml
+  - configs/cylinder3d/metafile.yml
+  - configs/pv_rcnn/metafile.yml
+  - configs/fcaf3d/metafile.yml
+  - configs/spvcnn/metafile.yml
diff --git "a/mmde/pip_\350\277\207\347\250\213.txt" "b/mmde/pip_\350\277\207\347\250\213.txt"
new file mode 100644
index 0000000000000000000000000000000000000000..a19447e586bf6a6592f819d0821aa366ef13db14
--- /dev/null
+++ "b/mmde/pip_\350\277\207\347\250\213.txt"
@@ -0,0 +1,17 @@
+pip  uninstall  megatron-core
+pip uninstall  vllm
+pip uninstall  mmcv-full
+pip install OpenMPI 
+pip install mpi4py   Pillow  mmengine
+pip install mmdet
+pip install nuscenes-devkit
+
+pip install opencv-python==4.9.0.80
+pip install mmcv-2.1.0+das.opt1.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl
+
+pip install -v -e . --no-build-isolation
+pip install -e . --no-build-isolation
+
+  Created wheel for mmdet3d: filename=mmdet3d-0.0.0-0.editable-cp310-cp310-linux_x86_64.whl size=6984 sha256=97be2853385856a1d76c007755805580a7380dbf324a1d24acb5b81e453c8732
+
+  python3 setup.py build_ext -v --inplace
\ No newline at end of file
diff --git a/mmde/projects/BEVFusion/README.md b/mmde/projects/BEVFusion/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9d5ebd4c5272833ba477f910cc721b715c4c13fd
--- /dev/null
+++ b/mmde/projects/BEVFusion/README.md
@@ -0,0 +1,137 @@
+# BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation
+
+> [BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation](https://arxiv.org/abs/2205.13542)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Multi-sensor fusion is essential for an accurate and reliable autonomous driving system. Recent approaches are based on point-level fusion: augmenting the LiDAR point cloud with camera features. However, the camera-to-LiDAR projection throws away the semantic density of camera features, hindering the effectiveness of such methods, especially for semantic-oriented tasks (such as 3D scene segmentation). In this paper, we break this deeply-rooted convention with BEVFusion, an efficient and generic multi-task multi-sensor fusion framework. It unifies multi-modal features in the shared bird's-eye view (BEV) representation space, which nicely preserves both geometric and semantic information. To achieve this, we diagnose and lift key efficiency bottlenecks in the view transformation with optimized BEV pooling, reducing latency by more than 40x. BEVFusion is fundamentally task-agnostic and seamlessly supports different 3D perception tasks with almost no architectural changes. It establishes the new state of the art on nuScenes, achieving 1.3% higher mAP and NDS on 3D object detection and 13.6% higher mIoU on BEV map segmentation, with 1.9x lower computation cost. Code to reproduce our
+results is available at https://github.com/mit-han-lab/bevfusion.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/34888372/215313913-4b43f8a1-e2e2-49ba-b631-992155351922.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement BEVFusion and support training and testing on NuScenes dataset.
+
+## Usage
+
+<!-- For a typical model, this section should contain the commands for training and testing. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. -->
+
+### Compiling operations on CUDA
+
+**Note** that the voxelization OP in the original implementation of `BEVFusion` is different from the implementation in MMCV. If you want to use the original pretrained model [here](https://github.com/mit-han-lab/bevfusion/blob/main/README.md), you need to use the original implementation of voxelization OP.
+
+```python
+python projects/BEVFusion/setup.py develop
+```
+
+### Demo
+
+Run a demo on NuScenes data using [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link):
+
+```shell
+python projects/BEVFusion/demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
+```
+
+### Training commands
+
+1. You should train the lidar-only detector first:
+
+```bash
+bash tools/dist_train.py projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py 8
+```
+
+2. Download the [Swin pre-trained model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/swint-nuimages-pretrained.pth). Given the image pre-trained backbone and the lidar-only pre-trained detector, you could train the lidar-camera fusion model:
+
+```bash
+bash tools/dist_train.sh projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py 8 --cfg-options load_from=${LIDAR_PRETRAINED_CHECKPOINT} model.img_backbone.init_cfg.checkpoint=${IMAGE_PRETRAINED_BACKBONE}
+```
+
+**Note** that if you want to reduce CUDA memory usage and computational overhead, you could directly add `--amp` on the tail of the above commands. The model under this setting will be trained in fp16 mode.
+
+### Testing commands
+
+In MMDetection3D's root directory, run the following command to test the model:
+
+```bash
+bash tools/dist_test.sh projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} 8
+```
+
+## Results and models
+
+### NuScenes
+
+|                                           Modality                                           | Voxel type (voxel size) | NMS | Mem (GB) | Inf time (fps) | NDS  | mAP  |                                                                                                                                                             Download                                                                                                                                                              |
+| :------------------------------------------------------------------------------------------: | :---------------------: | :-: | :------: | :------------: | :--: | :--: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|     [lidar](./configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py)     |      voxel (0.075)      |  ×  |    -     |       -        | 69.6 | 64.9 |     [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933.pth) [logs](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d_20230322_053447.log)     |
+| [lidar-cam](./configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) |      voxel (0.075)      |  ×  |    -     |       -        | 71.4 | 68.6 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-5239b1af.pth) [logs](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d_20230524_001539.log) |
+
+## Citation
+
+```latex
+@inproceedings{liu2022bevfusion,
+  title={BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation},
+  author={Liu, Zhijian and Tang, Haotian and Amini, Alexander and Yang, Xingyu and Mao, Huizi and Rus, Daniela and Han, Song},
+  booktitle={IEEE International Conference on Robotics and Automation (ICRA)},
+  year={2023}
+}
+```
+
+## Checklist
+
+<!-- Here is a checklist illustrating a usual development workflow of a successful project, and also serves as an overview of this project's progress. The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR.
+OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project suffices the minimum requirement of being merged into 'projects/'. But this project is only eligible to become a part of the core package upon attaining the last milestone.
+Note that keeping this section up-to-date is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed.
+A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR. -->
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+    <!-- The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `mmdet3d.registry.MODELS` and configurable via a config file. -->
+
+  - [x] Basic docstrings & proper citation
+
+    <!-- Each major object should contain a docstring, describing its functionality and arguments. If you have adapted the code from other open-source projects, don't forget to cite the source project in docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under GPL license. [A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) -->
+
+  - [x] Test-time correctness
+
+    <!-- If you are reproducing the result from a paper, make sure your model's inference-time performance matches that in the original paper. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. -->
+
+  - [x] A full README
+
+    <!-- As this template does. -->
+
+- [x] Milestone 2: Indicates a successful model implementation.
+
+  - [x] Training-time correctness
+
+    <!-- If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result matches the report within a minor error range. -->
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+
+    <!-- Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/detectors/fcos_mono3d.py) -->
+
+  - [ ] Unit tests
+
+    <!-- Unit tests for each module are required. [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tests/test_models/test_dense_heads/test_fcos_mono3d_head.py) -->
+
+  - [ ] Code polishing
+
+    <!-- Refactor your code according to reviewer's comment. -->
+
+  - [ ] Metafile.yml
+
+    <!-- It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/fcos3d/metafile.yml) -->
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
+
+  <!-- In particular, you may have to refactor this README into a standard one. [Example](/configs/textdet/dbnet/README.md) -->
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
diff --git a/mmde/projects/BEVFusion/bevfusion/__init__.py b/mmde/projects/BEVFusion/bevfusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..db06d3afa42cc50ff2f4efccf5a81b790d37d2c9
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/__init__.py
@@ -0,0 +1,20 @@
+from .bevfusion import BEVFusion
+from .bevfusion_necks import GeneralizedLSSFPN
+from .depth_lss import DepthLSSTransform, LSSTransform
+from .loading import BEVLoadMultiViewImageFromFiles
+from .sparse_encoder import BEVFusionSparseEncoder
+from .transformer import TransformerDecoderLayer
+from .transforms_3d import (BEVFusionGlobalRotScaleTrans,
+                            BEVFusionRandomFlip3D, GridMask, ImageAug3D)
+from .transfusion_head import ConvFuser, TransFusionHead
+from .utils import (BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D,
+                    IoU3DCost)
+
+__all__ = [
+    'BEVFusion', 'TransFusionHead', 'ConvFuser', 'ImageAug3D', 'GridMask',
+    'GeneralizedLSSFPN', 'HungarianAssigner3D', 'BBoxBEVL1Cost', 'IoU3DCost',
+    'HeuristicAssigner3D', 'DepthLSSTransform', 'LSSTransform',
+    'BEVLoadMultiViewImageFromFiles', 'BEVFusionSparseEncoder',
+    'TransformerDecoderLayer', 'BEVFusionRandomFlip3D',
+    'BEVFusionGlobalRotScaleTrans'
+]
diff --git a/mmde/projects/BEVFusion/bevfusion/bevfusion.py b/mmde/projects/BEVFusion/bevfusion/bevfusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f56934e663603150c2eeab7cd0291d0fa0f31aa
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/bevfusion.py
@@ -0,0 +1,298 @@
+from collections import OrderedDict
+from copy import deepcopy
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmengine.utils import is_list_of
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.models import Base3DDetector
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from mmdet3d.utils import OptConfigType, OptMultiConfig, OptSampleList
+from .ops import Voxelization
+
+
+@MODELS.register_module()
+class BEVFusion(Base3DDetector):
+
+    def __init__(
+        self,
+        data_preprocessor: OptConfigType = None,
+        pts_voxel_encoder: Optional[dict] = None,
+        pts_middle_encoder: Optional[dict] = None,
+        fusion_layer: Optional[dict] = None,
+        img_backbone: Optional[dict] = None,
+        pts_backbone: Optional[dict] = None,
+        view_transform: Optional[dict] = None,
+        img_neck: Optional[dict] = None,
+        pts_neck: Optional[dict] = None,
+        bbox_head: Optional[dict] = None,
+        init_cfg: OptMultiConfig = None,
+        seg_head: Optional[dict] = None,
+        **kwargs,
+    ) -> None:
+        voxelize_cfg = data_preprocessor.pop('voxelize_cfg')
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+
+        self.voxelize_reduce = voxelize_cfg.pop('voxelize_reduce')
+        self.pts_voxel_layer = Voxelization(**voxelize_cfg)
+
+        self.pts_voxel_encoder = MODELS.build(pts_voxel_encoder)
+
+        self.img_backbone = MODELS.build(
+            img_backbone) if img_backbone is not None else None
+        self.img_neck = MODELS.build(
+            img_neck) if img_neck is not None else None
+        self.view_transform = MODELS.build(
+            view_transform) if view_transform is not None else None
+        self.pts_middle_encoder = MODELS.build(pts_middle_encoder)
+
+        self.fusion_layer = MODELS.build(
+            fusion_layer) if fusion_layer is not None else None
+
+        self.pts_backbone = MODELS.build(pts_backbone)
+        self.pts_neck = MODELS.build(pts_neck)
+
+        self.bbox_head = MODELS.build(bbox_head)
+
+        self.init_weights()
+
+    def _forward(self,
+                 batch_inputs: Tensor,
+                 batch_data_samples: OptSampleList = None):
+        """Network forward process.
+
+        Usually includes backbone, neck and head forward without any post-
+        processing.
+        """
+        pass
+
+    def parse_losses(
+        self, losses: Dict[str, torch.Tensor]
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """Parses the raw outputs (losses) of the network.
+
+        Args:
+            losses (dict): Raw output of the network, which usually contain
+                losses and other necessary information.
+
+        Returns:
+            tuple[Tensor, dict]: There are two elements. The first is the
+            loss tensor passed to optim_wrapper which may be a weighted sum
+            of all losses, and the second is log_vars which will be sent to
+            the logger.
+        """
+        log_vars = []
+        for loss_name, loss_value in losses.items():
+            if isinstance(loss_value, torch.Tensor):
+                log_vars.append([loss_name, loss_value.mean()])
+            elif is_list_of(loss_value, torch.Tensor):
+                log_vars.append(
+                    [loss_name,
+                     sum(_loss.mean() for _loss in loss_value)])
+            else:
+                raise TypeError(
+                    f'{loss_name} is not a tensor or list of tensors')
+
+        loss = sum(value for key, value in log_vars if 'loss' in key)
+        log_vars.insert(0, ['loss', loss])
+        log_vars = OrderedDict(log_vars)  # type: ignore
+
+        for loss_name, loss_value in log_vars.items():
+            # reduce loss when distributed training
+            if dist.is_available() and dist.is_initialized():
+                loss_value = loss_value.data.clone()
+                dist.all_reduce(loss_value.div_(dist.get_world_size()))
+            log_vars[loss_name] = loss_value.item()
+
+        return loss, log_vars  # type: ignore
+
+    def init_weights(self) -> None:
+        if self.img_backbone is not None:
+            self.img_backbone.init_weights()
+
+    @property
+    def with_bbox_head(self):
+        """bool: Whether the detector has a box head."""
+        return hasattr(self, 'bbox_head') and self.bbox_head is not None
+
+    @property
+    def with_seg_head(self):
+        """bool: Whether the detector has a segmentation head.
+        """
+        return hasattr(self, 'seg_head') and self.seg_head is not None
+
+    def extract_img_feat(
+        self,
+        x,
+        points,
+        lidar2image,
+        camera_intrinsics,
+        camera2lidar,
+        img_aug_matrix,
+        lidar_aug_matrix,
+        img_metas,
+    ) -> torch.Tensor:
+        B, N, C, H, W = x.size()
+        x = x.view(B * N, C, H, W).contiguous()
+
+        x = self.img_backbone(x)
+        x = self.img_neck(x)
+
+        if not isinstance(x, torch.Tensor):
+            x = x[0]
+
+        BN, C, H, W = x.size()
+        x = x.view(B, int(BN / B), C, H, W)
+
+        with torch.autocast(device_type='cuda', dtype=torch.float32):
+            x = self.view_transform(
+                x,
+                points,
+                lidar2image,
+                camera_intrinsics,
+                camera2lidar,
+                img_aug_matrix,
+                lidar_aug_matrix,
+                img_metas,
+            )
+        return x
+
+    def extract_pts_feat(self, batch_inputs_dict) -> torch.Tensor:
+        points = batch_inputs_dict['points']
+        with torch.autocast('cuda', enabled=False):
+            points = [point.float() for point in points]
+            feats, coords, sizes = self.voxelize(points)
+            batch_size = coords[-1, 0] + 1
+        x = self.pts_middle_encoder(feats, coords, batch_size)
+        return x
+
+    @torch.no_grad()
+    def voxelize(self, points):
+        feats, coords, sizes = [], [], []
+        for k, res in enumerate(points):
+            ret = self.pts_voxel_layer(res)
+            if len(ret) == 3:
+                # hard voxelize
+                f, c, n = ret
+            else:
+                assert len(ret) == 2
+                f, c = ret
+                n = None
+            feats.append(f)
+            coords.append(F.pad(c, (1, 0), mode='constant', value=k))
+            if n is not None:
+                sizes.append(n)
+
+        feats = torch.cat(feats, dim=0)
+        coords = torch.cat(coords, dim=0)
+        if len(sizes) > 0:
+            sizes = torch.cat(sizes, dim=0)
+            if self.voxelize_reduce:
+                feats = feats.sum(
+                    dim=1, keepdim=False) / sizes.type_as(feats).view(-1, 1)
+                feats = feats.contiguous()
+
+        return feats, coords, sizes
+
+    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[Det3DDataSample]:
+        """Forward of testing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' keys.
+
+                - points (list[torch.Tensor]): Point cloud of each sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input sample. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
+                contains a tensor with shape (num_instances, 7).
+        """
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
+
+        if self.with_bbox_head:
+            outputs = self.bbox_head.predict(feats, batch_input_metas)
+
+        res = self.add_pred_to_datasample(batch_data_samples, outputs)
+
+        return res
+
+    def extract_feat(
+        self,
+        batch_inputs_dict,
+        batch_input_metas,
+        **kwargs,
+    ):
+        imgs = batch_inputs_dict.get('imgs', None)
+        points = batch_inputs_dict.get('points', None)
+        features = []
+        if imgs is not None:
+            imgs = imgs.contiguous()
+            lidar2image, camera_intrinsics, camera2lidar = [], [], []
+            img_aug_matrix, lidar_aug_matrix = [], []
+            for i, meta in enumerate(batch_input_metas):
+                lidar2image.append(meta['lidar2img'])
+                camera_intrinsics.append(meta['cam2img'])
+                camera2lidar.append(meta['cam2lidar'])
+                img_aug_matrix.append(meta.get('img_aug_matrix', np.eye(4)))
+                lidar_aug_matrix.append(
+                    meta.get('lidar_aug_matrix', np.eye(4)))
+
+            lidar2image = imgs.new_tensor(np.asarray(lidar2image))
+            camera_intrinsics = imgs.new_tensor(np.array(camera_intrinsics))
+            camera2lidar = imgs.new_tensor(np.asarray(camera2lidar))
+            img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix))
+            lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix))
+            img_feature = self.extract_img_feat(imgs, deepcopy(points),
+                                                lidar2image, camera_intrinsics,
+                                                camera2lidar, img_aug_matrix,
+                                                lidar_aug_matrix,
+                                                batch_input_metas)
+            features.append(img_feature)
+        pts_feature = self.extract_pts_feat(batch_inputs_dict)
+        features.append(pts_feature)
+
+        if self.fusion_layer is not None:
+            x = self.fusion_layer(features)
+        else:
+            assert len(features) == 1, features
+            x = features[0]
+
+        x = self.pts_backbone(x)
+        x = self.pts_neck(x)
+
+        return x
+
+    def loss(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+             batch_data_samples: List[Det3DDataSample],
+             **kwargs) -> List[Det3DDataSample]:
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
+
+        losses = dict()
+        if self.with_bbox_head:
+            bbox_loss = self.bbox_head.loss(feats, batch_data_samples)
+
+        losses.update(bbox_loss)
+
+        return losses
diff --git a/mmde/projects/BEVFusion/bevfusion/bevfusion_necks.py b/mmde/projects/BEVFusion/bevfusion/bevfusion_necks.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fc79c3c46e1d7e5d48518910245c58a9c796d39
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/bevfusion_necks.py
@@ -0,0 +1,99 @@
+# modify from https://github.com/mit-han-lab/bevfusion
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class GeneralizedLSSFPN(BaseModule):
+
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            num_outs,
+            start_level=0,
+            end_level=-1,
+            no_norm_on_lateral=False,
+            conv_cfg=None,
+            norm_cfg=dict(type='BN2d'),
+            act_cfg=dict(type='ReLU'),
+            upsample_cfg=dict(mode='bilinear', align_corners=True),
+    ) -> None:
+        super().__init__()
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.no_norm_on_lateral = no_norm_on_lateral
+        self.fp16_enabled = False
+        self.upsample_cfg = upsample_cfg.copy()
+
+        if end_level == -1:
+            self.backbone_end_level = self.num_ins - 1
+            # assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level < inputs, no extra level is allowed
+            self.backbone_end_level = end_level
+            assert end_level <= len(in_channels)
+            assert num_outs == end_level - start_level
+        self.start_level = start_level
+        self.end_level = end_level
+
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_convs = nn.ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i] +
+                (in_channels[i + 1] if i == self.backbone_end_level -
+                 1 else out_channels),
+                out_channels,
+                1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
+                act_cfg=act_cfg,
+                inplace=False,
+            )
+            fpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+                inplace=False,
+            )
+
+            self.lateral_convs.append(l_conv)
+            self.fpn_convs.append(fpn_conv)
+
+    def forward(self, inputs):
+        """Forward function."""
+        # upsample -> cat -> conv1x1 -> conv3x3
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [inputs[i + self.start_level] for i in range(len(inputs))]
+
+        # build top-down path
+        used_backbone_levels = len(laterals) - 1
+        for i in range(used_backbone_levels - 1, -1, -1):
+            x = F.interpolate(
+                laterals[i + 1],
+                size=laterals[i].shape[2:],
+                **self.upsample_cfg,
+            )
+            laterals[i] = torch.cat([laterals[i], x], dim=1)
+            laterals[i] = self.lateral_convs[i](laterals[i])
+            laterals[i] = self.fpn_convs[i](laterals[i])
+
+        # build outputs
+        outs = [laterals[i] for i in range(used_backbone_levels)]
+        return tuple(outs)
diff --git a/mmde/projects/BEVFusion/bevfusion/depth_lss.py b/mmde/projects/BEVFusion/bevfusion/depth_lss.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cc0cc1606fbe9d8971d22dbd68f9099bee0828e
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/depth_lss.py
@@ -0,0 +1,426 @@
+# modify from https://github.com/mit-han-lab/bevfusion
+from typing import Tuple
+
+import torch
+from torch import nn
+
+from mmdet3d.registry import MODELS
+from .ops import bev_pool
+
+
+def gen_dx_bx(xbound, ybound, zbound):
+    dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]])
+    bx = torch.Tensor(
+        [row[0] + row[2] / 2.0 for row in [xbound, ybound, zbound]])
+    nx = torch.LongTensor([(row[1] - row[0]) / row[2]
+                           for row in [xbound, ybound, zbound]])
+    return dx, bx, nx
+
+
+class BaseViewTransform(nn.Module):
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        image_size: Tuple[int, int],
+        feature_size: Tuple[int, int],
+        xbound: Tuple[float, float, float],
+        ybound: Tuple[float, float, float],
+        zbound: Tuple[float, float, float],
+        dbound: Tuple[float, float, float],
+    ) -> None:
+        super().__init__()
+        self.in_channels = in_channels
+        self.image_size = image_size
+        self.feature_size = feature_size
+        self.xbound = xbound
+        self.ybound = ybound
+        self.zbound = zbound
+        self.dbound = dbound
+
+        dx, bx, nx = gen_dx_bx(self.xbound, self.ybound, self.zbound)
+        self.dx = nn.Parameter(dx, requires_grad=False)
+        self.bx = nn.Parameter(bx, requires_grad=False)
+        self.nx = nn.Parameter(nx, requires_grad=False)
+
+        self.C = out_channels
+        self.frustum = self.create_frustum()
+        self.D = self.frustum.shape[0]
+        self.fp16_enabled = False
+
+    def create_frustum(self):
+        iH, iW = self.image_size
+        fH, fW = self.feature_size
+
+        ds = (
+            torch.arange(*self.dbound,
+                         dtype=torch.float).view(-1, 1, 1).expand(-1, fH, fW))
+        D, _, _ = ds.shape
+
+        xs = (
+            torch.linspace(0, iW - 1, fW,
+                           dtype=torch.float).view(1, 1, fW).expand(D, fH, fW))
+        ys = (
+            torch.linspace(0, iH - 1, fH,
+                           dtype=torch.float).view(1, fH, 1).expand(D, fH, fW))
+
+        frustum = torch.stack((xs, ys, ds), -1)
+        return nn.Parameter(frustum, requires_grad=False)
+
+    def get_geometry(
+        self,
+        camera2lidar_rots,
+        camera2lidar_trans,
+        intrins,
+        post_rots,
+        post_trans,
+        **kwargs,
+    ):
+        B, N, _ = camera2lidar_trans.shape
+
+        # undo post-transformation
+        # B x N x D x H x W x 3
+        points = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
+        points = (
+            torch.inverse(post_rots).view(B, N, 1, 1, 1, 3,
+                                          3).matmul(points.unsqueeze(-1)))
+        # cam_to_lidar
+        points = torch.cat(
+            (
+                points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
+                points[:, :, :, :, :, 2:3],
+            ),
+            5,
+        )
+        combine = camera2lidar_rots.matmul(torch.inverse(intrins))
+        points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
+        points += camera2lidar_trans.view(B, N, 1, 1, 1, 3)
+
+        if 'extra_rots' in kwargs:
+            extra_rots = kwargs['extra_rots']
+            points = (
+                extra_rots.view(B, 1, 1, 1, 1, 3,
+                                3).repeat(1, N, 1, 1, 1, 1, 1).matmul(
+                                    points.unsqueeze(-1)).squeeze(-1))
+        if 'extra_trans' in kwargs:
+            extra_trans = kwargs['extra_trans']
+            points += extra_trans.view(B, 1, 1, 1, 1,
+                                       3).repeat(1, N, 1, 1, 1, 1)
+
+        return points
+
+    def get_cam_feats(self, x):
+        raise NotImplementedError
+
+    def bev_pool(self, geom_feats, x):
+        B, N, D, H, W, C = x.shape
+        Nprime = B * N * D * H * W
+
+        # flatten x
+        x = x.reshape(Nprime, C)
+
+        # flatten indices
+        geom_feats = ((geom_feats - (self.bx - self.dx / 2.0)) /
+                      self.dx).long()
+        geom_feats = geom_feats.view(Nprime, 3)
+        batch_ix = torch.cat([
+            torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long)
+            for ix in range(B)
+        ])
+        geom_feats = torch.cat((geom_feats, batch_ix), 1)
+
+        # filter out points that are outside box
+        kept = ((geom_feats[:, 0] >= 0)
+                & (geom_feats[:, 0] < self.nx[0])
+                & (geom_feats[:, 1] >= 0)
+                & (geom_feats[:, 1] < self.nx[1])
+                & (geom_feats[:, 2] >= 0)
+                & (geom_feats[:, 2] < self.nx[2]))
+        x = x[kept]
+        geom_feats = geom_feats[kept]
+
+        x = bev_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1])
+
+        # collapse Z
+        final = torch.cat(x.unbind(dim=2), 1)
+
+        return final
+
+    def forward(
+        self,
+        img,
+        points,
+        lidar2image,
+        camera_intrinsics,
+        camera2lidar,
+        img_aug_matrix,
+        lidar_aug_matrix,
+        metas,
+        **kwargs,
+    ):
+        intrins = camera_intrinsics[..., :3, :3]
+        post_rots = img_aug_matrix[..., :3, :3]
+        post_trans = img_aug_matrix[..., :3, 3]
+        camera2lidar_rots = camera2lidar[..., :3, :3]
+        camera2lidar_trans = camera2lidar[..., :3, 3]
+
+        extra_rots = lidar_aug_matrix[..., :3, :3]
+        extra_trans = lidar_aug_matrix[..., :3, 3]
+
+        geom = self.get_geometry(
+            camera2lidar_rots,
+            camera2lidar_trans,
+            intrins,
+            post_rots,
+            post_trans,
+            extra_rots=extra_rots,
+            extra_trans=extra_trans,
+        )
+
+        x = self.get_cam_feats(img)
+        x = self.bev_pool(geom, x)
+        return x
+
+
+@MODELS.register_module()
+class LSSTransform(BaseViewTransform):
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        image_size: Tuple[int, int],
+        feature_size: Tuple[int, int],
+        xbound: Tuple[float, float, float],
+        ybound: Tuple[float, float, float],
+        zbound: Tuple[float, float, float],
+        dbound: Tuple[float, float, float],
+        downsample: int = 1,
+    ) -> None:
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            image_size=image_size,
+            feature_size=feature_size,
+            xbound=xbound,
+            ybound=ybound,
+            zbound=zbound,
+            dbound=dbound,
+        )
+        self.depthnet = nn.Conv2d(in_channels, self.D + self.C, 1)
+        if downsample > 1:
+            assert downsample == 2, downsample
+            self.downsample = nn.Sequential(
+                nn.Conv2d(
+                    out_channels, out_channels, 3, padding=1, bias=False),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(True),
+                nn.Conv2d(
+                    out_channels,
+                    out_channels,
+                    3,
+                    stride=downsample,
+                    padding=1,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(True),
+                nn.Conv2d(
+                    out_channels, out_channels, 3, padding=1, bias=False),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(True),
+            )
+        else:
+            self.downsample = nn.Identity()
+
+    def get_cam_feats(self, x):
+        B, N, C, fH, fW = x.shape
+
+        x = x.view(B * N, C, fH, fW)
+
+        x = self.depthnet(x)
+        depth = x[:, :self.D].softmax(dim=1)
+        x = depth.unsqueeze(1) * x[:, self.D:(self.D + self.C)].unsqueeze(2)
+
+        x = x.view(B, N, self.C, self.D, fH, fW)
+        x = x.permute(0, 1, 3, 4, 5, 2)
+        return x
+
+    def forward(self, *args, **kwargs):
+        x = super().forward(*args, **kwargs)
+        x = self.downsample(x)
+        return x
+
+
+class BaseDepthTransform(BaseViewTransform):
+
+    def forward(
+        self,
+        img,
+        points,
+        lidar2image,
+        cam_intrinsic,
+        camera2lidar,
+        img_aug_matrix,
+        lidar_aug_matrix,
+        metas,
+        **kwargs,
+    ):
+        intrins = cam_intrinsic[..., :3, :3]
+        post_rots = img_aug_matrix[..., :3, :3]
+        post_trans = img_aug_matrix[..., :3, 3]
+        camera2lidar_rots = camera2lidar[..., :3, :3]
+        camera2lidar_trans = camera2lidar[..., :3, 3]
+
+        batch_size = len(points)
+        depth = torch.zeros(batch_size, img.shape[1], 1,
+                            *self.image_size).to(points[0].device)
+
+        for b in range(batch_size):
+            cur_coords = points[b][:, :3]
+            cur_img_aug_matrix = img_aug_matrix[b]
+            cur_lidar_aug_matrix = lidar_aug_matrix[b]
+            cur_lidar2image = lidar2image[b]
+
+            # inverse aug
+            cur_coords -= cur_lidar_aug_matrix[:3, 3]
+            cur_coords = torch.inverse(cur_lidar_aug_matrix[:3, :3]).matmul(
+                cur_coords.transpose(1, 0))
+            # lidar2image
+            cur_coords = cur_lidar2image[:, :3, :3].matmul(cur_coords)
+            cur_coords += cur_lidar2image[:, :3, 3].reshape(-1, 3, 1)
+            # get 2d coords
+            dist = cur_coords[:, 2, :]
+            cur_coords[:, 2, :] = torch.clamp(cur_coords[:, 2, :], 1e-5, 1e5)
+            cur_coords[:, :2, :] /= cur_coords[:, 2:3, :]
+
+            # imgaug
+            cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords)
+            cur_coords += cur_img_aug_matrix[:, :3, 3].reshape(-1, 3, 1)
+            cur_coords = cur_coords[:, :2, :].transpose(1, 2)
+
+            # normalize coords for grid sample
+            cur_coords = cur_coords[..., [1, 0]]
+
+            on_img = ((cur_coords[..., 0] < self.image_size[0])
+                      & (cur_coords[..., 0] >= 0)
+                      & (cur_coords[..., 1] < self.image_size[1])
+                      & (cur_coords[..., 1] >= 0))
+            for c in range(on_img.shape[0]):
+                masked_coords = cur_coords[c, on_img[c]].long()
+                masked_dist = dist[c, on_img[c]]
+                depth = depth.to(masked_dist.dtype)
+                depth[b, c, 0, masked_coords[:, 0],
+                      masked_coords[:, 1]] = masked_dist
+
+        extra_rots = lidar_aug_matrix[..., :3, :3]
+        extra_trans = lidar_aug_matrix[..., :3, 3]
+        geom = self.get_geometry(
+            camera2lidar_rots,
+            camera2lidar_trans,
+            intrins,
+            post_rots,
+            post_trans,
+            extra_rots=extra_rots,
+            extra_trans=extra_trans,
+        )
+
+        x = self.get_cam_feats(img, depth)
+        x = self.bev_pool(geom, x)
+        return x
+
+
+@MODELS.register_module()
+class DepthLSSTransform(BaseDepthTransform):
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        image_size: Tuple[int, int],
+        feature_size: Tuple[int, int],
+        xbound: Tuple[float, float, float],
+        ybound: Tuple[float, float, float],
+        zbound: Tuple[float, float, float],
+        dbound: Tuple[float, float, float],
+        downsample: int = 1,
+    ) -> None:
+        """Compared with `LSSTransform`, `DepthLSSTransform` adds sparse depth
+        information from lidar points into the inputs of the `depthnet`."""
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            image_size=image_size,
+            feature_size=feature_size,
+            xbound=xbound,
+            ybound=ybound,
+            zbound=zbound,
+            dbound=dbound,
+        )
+        self.dtransform = nn.Sequential(
+            nn.Conv2d(1, 8, 1),
+            nn.BatchNorm2d(8),
+            nn.ReLU(True),
+            nn.Conv2d(8, 32, 5, stride=4, padding=2),
+            nn.BatchNorm2d(32),
+            nn.ReLU(True),
+            nn.Conv2d(32, 64, 5, stride=2, padding=2),
+            nn.BatchNorm2d(64),
+            nn.ReLU(True),
+        )
+        self.depthnet = nn.Sequential(
+            nn.Conv2d(in_channels + 64, in_channels, 3, padding=1),
+            nn.BatchNorm2d(in_channels),
+            nn.ReLU(True),
+            nn.Conv2d(in_channels, in_channels, 3, padding=1),
+            nn.BatchNorm2d(in_channels),
+            nn.ReLU(True),
+            nn.Conv2d(in_channels, self.D + self.C, 1),
+        )
+        if downsample > 1:
+            assert downsample == 2, downsample
+            self.downsample = nn.Sequential(
+                nn.Conv2d(
+                    out_channels, out_channels, 3, padding=1, bias=False),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(True),
+                nn.Conv2d(
+                    out_channels,
+                    out_channels,
+                    3,
+                    stride=downsample,
+                    padding=1,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(True),
+                nn.Conv2d(
+                    out_channels, out_channels, 3, padding=1, bias=False),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(True),
+            )
+        else:
+            self.downsample = nn.Identity()
+
+    def get_cam_feats(self, x, d):
+        B, N, C, fH, fW = x.shape
+
+        d = d.view(B * N, *d.shape[2:])
+        x = x.view(B * N, C, fH, fW)
+
+        d = self.dtransform(d)
+        x = torch.cat([d, x], dim=1)
+        x = self.depthnet(x)
+
+        depth = x[:, :self.D].softmax(dim=1)
+        x = depth.unsqueeze(1) * x[:, self.D:(self.D + self.C)].unsqueeze(2)
+
+        x = x.view(B, N, self.C, self.D, fH, fW)
+        x = x.permute(0, 1, 3, 4, 5, 2)
+        return x
+
+    def forward(self, *args, **kwargs):
+        x = super().forward(*args, **kwargs)
+        x = self.downsample(x)
+        return x
diff --git a/mmde/projects/BEVFusion/bevfusion/loading.py b/mmde/projects/BEVFusion/bevfusion/loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..8615be7e3f4f826b6e55a21adc36e0302a04b4f7
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/loading.py
@@ -0,0 +1,208 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Optional
+
+import mmcv
+import numpy as np
+from mmengine.fileio import get
+
+from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles
+from mmdet3d.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class BEVLoadMultiViewImageFromFiles(LoadMultiViewImageFromFiles):
+    """Load multi channel images from a list of separate channel files.
+
+    ``BEVLoadMultiViewImageFromFiles`` adds the following keys for the
+    convenience of view transforms in the forward:
+        - 'cam2lidar'
+        - 'lidar2img'
+
+    Args:
+        to_float32 (bool): Whether to convert the img to float32.
+            Defaults to False.
+        color_type (str): Color type of the file. Defaults to 'unchanged'.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        num_views (int): Number of view in a frame. Defaults to 5.
+        num_ref_frames (int): Number of frame in loading. Defaults to -1.
+        test_mode (bool): Whether is test mode in loading. Defaults to False.
+        set_default_scale (bool): Whether to set default scale.
+            Defaults to True.
+    """
+
+    def transform(self, results: dict) -> Optional[dict]:
+        """Call function to load multi-view image from files.
+
+        Args:
+            results (dict): Result dict containing multi-view image filenames.
+
+        Returns:
+            dict: The result dict containing the multi-view image data.
+            Added keys and values are described below.
+
+                - filename (str): Multi-view image filenames.
+                - img (np.ndarray): Multi-view image arrays.
+                - img_shape (tuple[int]): Shape of multi-view image arrays.
+                - ori_shape (tuple[int]): Shape of original image arrays.
+                - pad_shape (tuple[int]): Shape of padded image arrays.
+                - scale_factor (float): Scale factor.
+                - img_norm_cfg (dict): Normalization configuration of images.
+        """
+        # TODO: consider split the multi-sweep part out of this pipeline
+        # Derive the mask and transform for loading of multi-sweep data
+        if self.num_ref_frames > 0:
+            # init choice with the current frame
+            init_choice = np.array([0], dtype=np.int64)
+            num_frames = len(results['img_filename']) // self.num_views - 1
+            if num_frames == 0:  # no previous frame, then copy cur frames
+                choices = np.random.choice(
+                    1, self.num_ref_frames, replace=True)
+            elif num_frames >= self.num_ref_frames:
+                # NOTE: suppose the info is saved following the order
+                # from latest to earlier frames
+                if self.test_mode:
+                    choices = np.arange(num_frames - self.num_ref_frames,
+                                        num_frames) + 1
+                # NOTE: +1 is for selecting previous frames
+                else:
+                    choices = np.random.choice(
+                        num_frames, self.num_ref_frames, replace=False) + 1
+            elif num_frames > 0 and num_frames < self.num_ref_frames:
+                if self.test_mode:
+                    base_choices = np.arange(num_frames) + 1
+                    random_choices = np.random.choice(
+                        num_frames,
+                        self.num_ref_frames - num_frames,
+                        replace=True) + 1
+                    choices = np.concatenate([base_choices, random_choices])
+                else:
+                    choices = np.random.choice(
+                        num_frames, self.num_ref_frames, replace=True) + 1
+            else:
+                raise NotImplementedError
+            choices = np.concatenate([init_choice, choices])
+            select_filename = []
+            for choice in choices:
+                select_filename += results['img_filename'][choice *
+                                                           self.num_views:
+                                                           (choice + 1) *
+                                                           self.num_views]
+            results['img_filename'] = select_filename
+            for key in ['cam2img', 'lidar2cam']:
+                if key in results:
+                    select_results = []
+                    for choice in choices:
+                        select_results += results[key][choice *
+                                                       self.num_views:(choice +
+                                                                       1) *
+                                                       self.num_views]
+                    results[key] = select_results
+            for key in ['ego2global']:
+                if key in results:
+                    select_results = []
+                    for choice in choices:
+                        select_results += [results[key][choice]]
+                    results[key] = select_results
+            # Transform lidar2cam to
+            # [cur_lidar]2[prev_img] and [cur_lidar]2[prev_cam]
+            for key in ['lidar2cam']:
+                if key in results:
+                    # only change matrices of previous frames
+                    for choice_idx in range(1, len(choices)):
+                        pad_prev_ego2global = np.eye(4)
+                        prev_ego2global = results['ego2global'][choice_idx]
+                        pad_prev_ego2global[:prev_ego2global.
+                                            shape[0], :prev_ego2global.
+                                            shape[1]] = prev_ego2global
+                        pad_cur_ego2global = np.eye(4)
+                        cur_ego2global = results['ego2global'][0]
+                        pad_cur_ego2global[:cur_ego2global.
+                                           shape[0], :cur_ego2global.
+                                           shape[1]] = cur_ego2global
+                        cur2prev = np.linalg.inv(pad_prev_ego2global).dot(
+                            pad_cur_ego2global)
+                        for result_idx in range(choice_idx * self.num_views,
+                                                (choice_idx + 1) *
+                                                self.num_views):
+                            results[key][result_idx] = \
+                                results[key][result_idx].dot(cur2prev)
+        # Support multi-view images with different shapes
+        # TODO: record the origin shape and padded shape
+        filename, cam2img, lidar2cam, cam2lidar, lidar2img = [], [], [], [], []
+        for _, cam_item in results['images'].items():
+            filename.append(cam_item['img_path'])
+            lidar2cam.append(cam_item['lidar2cam'])
+
+            lidar2cam_array = np.array(cam_item['lidar2cam']).astype(
+                np.float32)
+            lidar2cam_rot = lidar2cam_array[:3, :3]
+            lidar2cam_trans = lidar2cam_array[:3, 3:4]
+            camera2lidar = np.eye(4)
+            camera2lidar[:3, :3] = lidar2cam_rot.T
+            camera2lidar[:3, 3:4] = -1 * np.matmul(
+                lidar2cam_rot.T, lidar2cam_trans.reshape(3, 1))
+            cam2lidar.append(camera2lidar)
+
+            cam2img_array = np.eye(4).astype(np.float32)
+            cam2img_array[:3, :3] = np.array(cam_item['cam2img']).astype(
+                np.float32)
+            cam2img.append(cam2img_array)
+            lidar2img.append(cam2img_array @ lidar2cam_array)
+
+        results['img_path'] = filename
+        results['cam2img'] = np.stack(cam2img, axis=0)
+        results['lidar2cam'] = np.stack(lidar2cam, axis=0)
+        results['cam2lidar'] = np.stack(cam2lidar, axis=0)
+        results['lidar2img'] = np.stack(lidar2img, axis=0)
+
+        results['ori_cam2img'] = copy.deepcopy(results['cam2img'])
+
+        # img is of shape (h, w, c, num_views)
+        # h and w can be different for different views
+        img_bytes = [
+            get(name, backend_args=self.backend_args) for name in filename
+        ]
+        imgs = [
+            mmcv.imfrombytes(
+                img_byte,
+                flag=self.color_type,
+                backend='pillow',
+                channel_order='rgb') for img_byte in img_bytes
+        ]
+        # handle the image with different shape
+        img_shapes = np.stack([img.shape for img in imgs], axis=0)
+        img_shape_max = np.max(img_shapes, axis=0)
+        img_shape_min = np.min(img_shapes, axis=0)
+        assert img_shape_min[-1] == img_shape_max[-1]
+        if not np.all(img_shape_max == img_shape_min):
+            pad_shape = img_shape_max[:2]
+        else:
+            pad_shape = None
+        if pad_shape is not None:
+            imgs = [
+                mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs
+            ]
+        img = np.stack(imgs, axis=-1)
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        results['filename'] = filename
+        # unravel to list, see `DefaultFormatBundle` in formating.py
+        # which will transpose each image separately and then stack into array
+        results['img'] = [img[..., i] for i in range(img.shape[-1])]
+        results['img_shape'] = img.shape[:2]
+        results['ori_shape'] = img.shape[:2]
+        # Set initial values for default meta_keys
+        results['pad_shape'] = img.shape[:2]
+        if self.set_default_scale:
+            results['scale_factor'] = 1.0
+        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+        results['img_norm_cfg'] = dict(
+            mean=np.zeros(num_channels, dtype=np.float32),
+            std=np.ones(num_channels, dtype=np.float32),
+            to_rgb=False)
+        results['num_views'] = self.num_views
+        results['num_ref_frames'] = self.num_ref_frames
+        return results
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/__init__.py b/mmde/projects/BEVFusion/bevfusion/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03830c328f641cca1d7f69d62a4facdc5c83db50
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/__init__.py
@@ -0,0 +1,7 @@
+from .bev_pool import bev_pool
+from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization
+
+__all__ = [
+    'bev_pool', 'Voxelization', 'voxelization', 'dynamic_scatter',
+    'DynamicScatter'
+]
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/__init__.py b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..842b03ccca562084bc04da6edf8746a78bc5276e
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/__init__.py
@@ -0,0 +1,3 @@
+from .bev_pool import bev_pool
+
+__all__ = ['bev_pool']
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/bev_pool.py b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/bev_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..46cf532b433b8754a593c4d7179bc71dc35131ca
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/bev_pool.py
@@ -0,0 +1,94 @@
+import torch
+
+from . import bev_pool_ext
+
+
+class QuickCumsum(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, x, geom_feats, ranks):
+        x = x.cumsum(0)
+        kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool)
+        kept[:-1] = ranks[1:] != ranks[:-1]
+
+        x, geom_feats = x[kept], geom_feats[kept]
+        x = torch.cat((x[:1], x[1:] - x[:-1]))
+
+        # save kept for backward
+        ctx.save_for_backward(kept)
+
+        # no gradient for geom_feats
+        ctx.mark_non_differentiable(geom_feats)
+
+        return x, geom_feats
+
+    @staticmethod
+    def backward(ctx, gradx, gradgeom):
+        (kept, ) = ctx.saved_tensors
+        back = torch.cumsum(kept, 0)
+        back[kept] -= 1
+
+        val = gradx[back]
+
+        return val, None, None
+
+
+class QuickCumsumCuda(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, x, geom_feats, ranks, B, D, H, W):
+        kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool)
+        kept[1:] = ranks[1:] != ranks[:-1]
+        interval_starts = torch.where(kept)[0].int()
+        interval_lengths = torch.zeros_like(interval_starts)
+        interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
+        interval_lengths[-1] = x.shape[0] - interval_starts[-1]
+        geom_feats = geom_feats.int()
+
+        out = bev_pool_ext.bev_pool_forward(
+            x,
+            geom_feats,
+            interval_lengths,
+            interval_starts,
+            B,
+            D,
+            H,
+            W,
+        )
+
+        ctx.save_for_backward(interval_starts, interval_lengths, geom_feats)
+        ctx.saved_shapes = B, D, H, W
+        return out
+
+    @staticmethod
+    def backward(ctx, out_grad):
+        interval_starts, interval_lengths, geom_feats = ctx.saved_tensors
+        B, D, H, W = ctx.saved_shapes
+
+        out_grad = out_grad.contiguous()
+        x_grad = bev_pool_ext.bev_pool_backward(
+            out_grad,
+            geom_feats,
+            interval_lengths,
+            interval_starts,
+            B,
+            D,
+            H,
+            W,
+        )
+
+        return x_grad, None, None, None, None, None, None
+
+
+def bev_pool(feats, coords, B, D, H, W):
+    assert feats.shape[0] == coords.shape[0]
+
+    ranks = (
+        coords[:, 0] * (W * D * B) + coords[:, 1] * (D * B) +
+        coords[:, 2] * B + coords[:, 3])
+    indices = ranks.argsort()
+    feats, coords, ranks = feats[indices], coords[indices], ranks[indices]
+
+    x = QuickCumsumCuda.apply(feats, coords, ranks, B, D, H, W)
+    x = x.permute(0, 4, 1, 2, 3).contiguous()
+    return x
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool.cpp b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b6a04e0aab1d7ed46f3ee1ff2f46d3f4b01f7461
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool.cpp
@@ -0,0 +1,94 @@
+#include <torch/torch.h>
+#include <c10/cuda/CUDAGuard.h>
+
+// CUDA function declarations
+void bev_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x,
+    const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out);
+
+void bev_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
+  const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad);
+
+
+/*
+  Function: pillar pooling (forward, cuda)
+  Args:
+    x                : input features, FloatTensor[n, c]
+    geom_feats       : input coordinates, IntTensor[n, 4]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+  Return:
+    out              : output features, FloatTensor[b, d, h, w, c]
+*/
+at::Tensor bev_pool_forward(
+  const at::Tensor _x,
+  const at::Tensor _geom_feats,
+  const at::Tensor _interval_lengths,
+  const at::Tensor _interval_starts,
+  int b, int d, int h, int w
+) {
+  int n = _x.size(0);
+  int c = _x.size(1);
+  int n_intervals = _interval_lengths.size(0);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(_x));
+  const float* x = _x.data_ptr<float>();
+  const int* geom_feats = _geom_feats.data_ptr<int>();
+  const int* interval_lengths = _interval_lengths.data_ptr<int>();
+  const int* interval_starts = _interval_starts.data_ptr<int>();
+
+  auto options =
+      torch::TensorOptions().dtype(_x.dtype()).device(_x.device());
+  at::Tensor _out = torch::zeros({b, d, h, w, c}, options);
+  float* out = _out.data_ptr<float>();
+  bev_pool(
+    b, d, h, w, n, c, n_intervals, x,
+    geom_feats, interval_starts, interval_lengths, out
+  );
+  return _out;
+}
+
+
+/*
+  Function: pillar pooling (backward, cuda)
+  Args:
+    out_grad         : input features, FloatTensor[b, d, h, w, c]
+    geom_feats       : input coordinates, IntTensor[n, 4]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+  Return:
+    x_grad           : output features, FloatTensor[n, 4]
+*/
+at::Tensor bev_pool_backward(
+  const at::Tensor _out_grad,
+  const at::Tensor _geom_feats,
+  const at::Tensor _interval_lengths,
+  const at::Tensor _interval_starts,
+  int b, int d, int h, int w
+) {
+  int n = _geom_feats.size(0);
+  int c = _out_grad.size(4);
+  int n_intervals = _interval_lengths.size(0);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad));
+  const float* out_grad = _out_grad.data_ptr<float>();
+  const int* geom_feats = _geom_feats.data_ptr<int>();
+  const int* interval_lengths = _interval_lengths.data_ptr<int>();
+  const int* interval_starts = _interval_starts.data_ptr<int>();
+
+  auto options =
+      torch::TensorOptions().dtype(_out_grad.dtype()).device(_out_grad.device());
+  at::Tensor _x_grad = torch::zeros({n, c}, options);
+  float* x_grad = _x_grad.data_ptr<float>();
+
+  bev_pool_grad(
+    b, d, h, w, n, c, n_intervals, out_grad,
+    geom_feats, interval_starts, interval_lengths, x_grad
+  );
+
+  return _x_grad;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("bev_pool_forward", &bev_pool_forward,
+        "bev_pool_forward");
+  m.def("bev_pool_backward", &bev_pool_backward,
+        "bev_pool_backward");
+}
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_cuda.cu b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ea4e40781347c343699048e3c11b3124c2426d97
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_cuda.cu
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+  Function: pillar pooling
+  Args:
+    b                : batch size
+    d                : depth of the feature map
+    h                : height of pooled feature map
+    w                : width of pooled feature map
+    n                : number of input points
+    c                : number of channels
+    n_intervals      : number of unique points
+    x                : input features, FloatTensor[n, c]
+    geom_feats       : input coordinates, IntTensor[n, 4]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+    out              : output features, FloatTensor[b, d, h, w, c]
+*/
+__global__ void bev_pool_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
+                                  const float *__restrict__ x,
+                                  const int *__restrict__ geom_feats,
+                                  const int *__restrict__ interval_starts,
+                                  const int *__restrict__ interval_lengths,
+                                  float* __restrict__ out) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int index = idx / c;
+  int cur_c = idx % c;
+  if (index >= n_intervals) return;
+  int interval_start = interval_starts[index];
+  int interval_length = interval_lengths[index];
+  const int* cur_geom_feats = geom_feats + interval_start * 4;
+  const float* cur_x = x + interval_start * c + cur_c;
+  float* cur_out = out + cur_geom_feats[3] * d * h * w * c +
+    cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c +
+    cur_geom_feats[1] * c + cur_c;
+  float psum = 0;
+  for(int i = 0; i < interval_length; i++){
+    psum += cur_x[i * c];
+  }
+  *cur_out = psum;
+}
+
+
+/*
+  Function: pillar pooling backward
+  Args:
+    b                : batch size
+    d                : depth of the feature map
+    h                : height of pooled feature map
+    w                : width of pooled feature map
+    n                : number of input points
+    c                : number of channels
+    n_intervals      : number of unique points
+    out_grad         : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c]
+    geom_feats       : input coordinates, IntTensor[n, 4]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+    x_grad           : gradient of the image fmap, FloatTensor
+*/
+__global__ void bev_pool_grad_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
+                                  const float *__restrict__ out_grad,
+                                  const int *__restrict__ geom_feats,
+                                  const int *__restrict__ interval_starts,
+                                  const int *__restrict__ interval_lengths,
+                                  float* __restrict__ x_grad) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int index = idx / c;
+  int cur_c = idx % c;
+  if (index >= n_intervals) return;
+  int interval_start = interval_starts[index];
+  int interval_length = interval_lengths[index];
+
+  const int* cur_geom_feats = geom_feats + interval_start * 4;
+  float* cur_x_grad = x_grad + interval_start * c + cur_c;
+
+  const float* cur_out_grad = out_grad + cur_geom_feats[3] * d * h * w * c +
+    cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c +
+    cur_geom_feats[1] * c + cur_c;
+  for(int i = 0; i < interval_length; i++){
+    cur_x_grad[i * c] = *cur_out_grad;
+  }
+
+}
+
+void bev_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x,
+  const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out) {
+  bev_pool_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
+    b, d, h, w, n, c, n_intervals, x, geom_feats, interval_starts, interval_lengths, out
+  );
+}
+
+void bev_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
+  const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad) {
+  bev_pool_grad_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
+    b, d, h, w, n, c, n_intervals, out_grad, geom_feats, interval_starts, interval_lengths, x_grad
+  );
+}
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_cuda.hip b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_cuda.hip
new file mode 100644
index 0000000000000000000000000000000000000000..141ffa200e84fcaf7cc336b3d6b396c87cfc9620
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_cuda.hip
@@ -0,0 +1,101 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+  Function: pillar pooling
+  Args:
+    b                : batch size
+    d                : depth of the feature map
+    h                : height of pooled feature map
+    w                : width of pooled feature map
+    n                : number of input points
+    c                : number of channels
+    n_intervals      : number of unique points
+    x                : input features, FloatTensor[n, c]
+    geom_feats       : input coordinates, IntTensor[n, 4]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+    out              : output features, FloatTensor[b, d, h, w, c]
+*/
+__global__ void bev_pool_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
+                                  const float *__restrict__ x,
+                                  const int *__restrict__ geom_feats,
+                                  const int *__restrict__ interval_starts,
+                                  const int *__restrict__ interval_lengths,
+                                  float* __restrict__ out) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int index = idx / c;
+  int cur_c = idx % c;
+  if (index >= n_intervals) return;
+  int interval_start = interval_starts[index];
+  int interval_length = interval_lengths[index];
+  const int* cur_geom_feats = geom_feats + interval_start * 4;
+  const float* cur_x = x + interval_start * c + cur_c;
+  float* cur_out = out + cur_geom_feats[3] * d * h * w * c +
+    cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c +
+    cur_geom_feats[1] * c + cur_c;
+  float psum = 0;
+  for(int i = 0; i < interval_length; i++){
+    psum += cur_x[i * c];
+  }
+  *cur_out = psum;
+}
+
+
+/*
+  Function: pillar pooling backward
+  Args:
+    b                : batch size
+    d                : depth of the feature map
+    h                : height of pooled feature map
+    w                : width of pooled feature map
+    n                : number of input points
+    c                : number of channels
+    n_intervals      : number of unique points
+    out_grad         : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c]
+    geom_feats       : input coordinates, IntTensor[n, 4]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+    x_grad           : gradient of the image fmap, FloatTensor
+*/
+__global__ void bev_pool_grad_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
+                                  const float *__restrict__ out_grad,
+                                  const int *__restrict__ geom_feats,
+                                  const int *__restrict__ interval_starts,
+                                  const int *__restrict__ interval_lengths,
+                                  float* __restrict__ x_grad) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int index = idx / c;
+  int cur_c = idx % c;
+  if (index >= n_intervals) return;
+  int interval_start = interval_starts[index];
+  int interval_length = interval_lengths[index];
+
+  const int* cur_geom_feats = geom_feats + interval_start * 4;
+  float* cur_x_grad = x_grad + interval_start * c + cur_c;
+
+  const float* cur_out_grad = out_grad + cur_geom_feats[3] * d * h * w * c +
+    cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c +
+    cur_geom_feats[1] * c + cur_c;
+  for(int i = 0; i < interval_length; i++){
+    cur_x_grad[i * c] = *cur_out_grad;
+  }
+
+}
+
+void bev_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x,
+  const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out) {
+ hipLaunchKernelGGL(( bev_pool_kernel), dim3((int)ceil(((double)n_intervals * c / 256))), dim3(256), 0, 0, 
+    b, d, h, w, n, c, n_intervals, x, geom_feats, interval_starts, interval_lengths, out
+  );
+}
+
+void bev_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
+  const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad) {
+ hipLaunchKernelGGL(( bev_pool_grad_kernel), dim3((int)ceil(((double)n_intervals * c / 256))), dim3(256), 0, 0, 
+    b, d, h, w, n, c, n_intervals, out_grad, geom_feats, interval_starts, interval_lengths, x_grad
+  );
+}
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_hip.cpp b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_hip.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bcf179c34a1e840471fea812ac0a68ab87484a94
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_hip.cpp
@@ -0,0 +1,96 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include <torch/torch.h>
+#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
+
+// CUDA function declarations
+void bev_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x,
+    const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out);
+
+void bev_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
+  const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad);
+
+
+/*
+  Function: pillar pooling (forward, cuda)
+  Args:
+    x                : input features, FloatTensor[n, c]
+    geom_feats       : input coordinates, IntTensor[n, 4]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+  Return:
+    out              : output features, FloatTensor[b, d, h, w, c]
+*/
+at::Tensor bev_pool_forward(
+  const at::Tensor _x,
+  const at::Tensor _geom_feats,
+  const at::Tensor _interval_lengths,
+  const at::Tensor _interval_starts,
+  int b, int d, int h, int w
+) {
+  int n = _x.size(0);
+  int c = _x.size(1);
+  int n_intervals = _interval_lengths.size(0);
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_x));
+  const float* x = _x.data_ptr<float>();
+  const int* geom_feats = _geom_feats.data_ptr<int>();
+  const int* interval_lengths = _interval_lengths.data_ptr<int>();
+  const int* interval_starts = _interval_starts.data_ptr<int>();
+
+  auto options =
+      torch::TensorOptions().dtype(_x.dtype()).device(_x.device());
+  at::Tensor _out = torch::zeros({b, d, h, w, c}, options);
+  float* out = _out.data_ptr<float>();
+  bev_pool(
+    b, d, h, w, n, c, n_intervals, x,
+    geom_feats, interval_starts, interval_lengths, out
+  );
+  return _out;
+}
+
+
+/*
+  Function: pillar pooling (backward, cuda)
+  Args:
+    out_grad         : input features, FloatTensor[b, d, h, w, c]
+    geom_feats       : input coordinates, IntTensor[n, 4]
+    interval_lengths : starting position for pooled point, IntTensor[n_intervals]
+    interval_starts  : how many points in each pooled point, IntTensor[n_intervals]
+  Return:
+    x_grad           : output features, FloatTensor[n, 4]
+*/
+at::Tensor bev_pool_backward(
+  const at::Tensor _out_grad,
+  const at::Tensor _geom_feats,
+  const at::Tensor _interval_lengths,
+  const at::Tensor _interval_starts,
+  int b, int d, int h, int w
+) {
+  int n = _geom_feats.size(0);
+  int c = _out_grad.size(4);
+  int n_intervals = _interval_lengths.size(0);
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_out_grad));
+  const float* out_grad = _out_grad.data_ptr<float>();
+  const int* geom_feats = _geom_feats.data_ptr<int>();
+  const int* interval_lengths = _interval_lengths.data_ptr<int>();
+  const int* interval_starts = _interval_starts.data_ptr<int>();
+
+  auto options =
+      torch::TensorOptions().dtype(_out_grad.dtype()).device(_out_grad.device());
+  at::Tensor _x_grad = torch::zeros({n, c}, options);
+  float* x_grad = _x_grad.data_ptr<float>();
+
+  bev_pool_grad(
+    b, d, h, w, n, c, n_intervals, out_grad,
+    geom_feats, interval_starts, interval_lengths, x_grad
+  );
+
+  return _x_grad;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("bev_pool_forward", &bev_pool_forward,
+        "bev_pool_forward");
+  m.def("bev_pool_backward", &bev_pool_backward,
+        "bev_pool_backward");
+}
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/__init__.py b/mmde/projects/BEVFusion/bevfusion/ops/voxel/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a74fb63f9fb06bb114846a90d96f38fdccff36b7
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/__init__.py
@@ -0,0 +1,4 @@
+from .scatter_points import DynamicScatter, dynamic_scatter
+from .voxelize import Voxelization, voxelization
+
+__all__ = ['Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter']
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/scatter_points.py b/mmde/projects/BEVFusion/bevfusion/ops/voxel/scatter_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..1862abd1d52497371493f5248894dfcaff044b47
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/scatter_points.py
@@ -0,0 +1,112 @@
+import torch
+from torch import nn
+from torch.autograd import Function
+
+from .voxel_layer import (dynamic_point_to_voxel_backward,
+                          dynamic_point_to_voxel_forward)
+
+
+class _dynamic_scatter(Function):
+
+    @staticmethod
+    def forward(ctx, feats, coors, reduce_type='max'):
+        """convert kitti points(N, >=3) to voxels.
+
+        Args:
+            feats: [N, C] float tensor. points features to be reduced
+                into voxels.
+            coors: [N, ndim] int tensor. corresponding voxel coordinates
+                (specifically multi-dim voxel index) of each points.
+            reduce_type: str. reduce op. support 'max', 'sum' and 'mean'
+        Returns:
+            tuple
+            voxel_feats: [M, C] float tensor. reduced features. input features
+                that shares the same voxel coordinates are reduced to one row
+            coordinates: [M, ndim] int tensor, voxel coordinates.
+        """
+        results = dynamic_point_to_voxel_forward(feats, coors, reduce_type)
+        (voxel_feats, voxel_coors, point2voxel_map,
+         voxel_points_count) = results
+        ctx.reduce_type = reduce_type
+        ctx.save_for_backward(feats, voxel_feats, point2voxel_map,
+                              voxel_points_count)
+        ctx.mark_non_differentiable(voxel_coors)
+        return voxel_feats, voxel_coors
+
+    @staticmethod
+    def backward(ctx, grad_voxel_feats, grad_voxel_coors=None):
+        (feats, voxel_feats, point2voxel_map,
+         voxel_points_count) = ctx.saved_tensors
+        grad_feats = torch.zeros_like(feats)
+        # TODO: whether to use index put or use cuda_backward
+        # To use index put, need point to voxel index
+        dynamic_point_to_voxel_backward(
+            grad_feats,
+            grad_voxel_feats.contiguous(),
+            feats,
+            voxel_feats,
+            point2voxel_map,
+            voxel_points_count,
+            ctx.reduce_type,
+        )
+        return grad_feats, None, None
+
+
+dynamic_scatter = _dynamic_scatter.apply
+
+
+class DynamicScatter(nn.Module):
+
+    def __init__(self, voxel_size, point_cloud_range, average_points: bool):
+        super(DynamicScatter, self).__init__()
+        """Scatters points into voxels, used in the voxel encoder with
+           dynamic voxelization
+
+        **Note**: The CPU and GPU implementation get the same output, but
+        have numerical difference after summation and division (e.g., 5e-7).
+
+        Args:
+            average_points (bool): whether to use avg pooling to scatter
+                points into voxel voxel_size (list): list [x, y, z] size
+                of three dimension
+            point_cloud_range (list):
+                [x_min, y_min, z_min, x_max, y_max, z_max]
+        """
+        self.voxel_size = voxel_size
+        self.point_cloud_range = point_cloud_range
+        self.average_points = average_points
+
+    def forward_single(self, points, coors):
+        reduce = 'mean' if self.average_points else 'max'
+        return dynamic_scatter(points.contiguous(), coors.contiguous(), reduce)
+
+    def forward(self, points, coors):
+        """
+        Args:
+            input: NC points
+        """
+        if coors.size(-1) == 3:
+            return self.forward_single(points, coors)
+        else:
+            batch_size = coors[-1, 0] + 1
+            voxels, voxel_coors = [], []
+            for i in range(batch_size):
+                inds = torch.where(coors[:, 0] == i)
+                voxel, voxel_coor = self.forward_single(
+                    points[inds], coors[inds][:, 1:])
+                coor_pad = nn.functional.pad(
+                    voxel_coor, (1, 0), mode='constant', value=i)
+                voxel_coors.append(coor_pad)
+                voxels.append(voxel)
+            features = torch.cat(voxels, dim=0)
+            feature_coors = torch.cat(voxel_coors, dim=0)
+
+            return features, feature_coors
+
+    def __repr__(self):
+        tmpstr = self.__class__.__name__ + '('
+        tmpstr += 'voxel_size=' + str(self.voxel_size)
+        tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
+        tmpstr += ', average_points=' + str(self.average_points)
+        tmpstr += ')'
+        return tmpstr
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/scatter_points_cpu.cpp b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/scatter_points_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c22b8aeda6353887c91207e66e88d2729eae9d53
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/scatter_points_cpu.cpp
@@ -0,0 +1,122 @@
+#include <ATen/TensorUtils.h>
+#include <torch/extension.h>
+// #include "voxelization.h"
+
+namespace {
+
+template <typename T_int>
+void determin_max_points_kernel(
+    torch::TensorAccessor<T_int, 2> coor,
+    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
+    torch::TensorAccessor<T_int, 1> num_points_per_voxel,
+    torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int& voxel_num,
+    int& max_points, const int num_points) {
+  int voxelidx, num;
+  for (int i = 0; i < num_points; ++i) {
+    if (coor[i][0] == -1) continue;
+    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
+
+    // record voxel
+    if (voxelidx == -1) {
+      voxelidx = voxel_num;
+      voxel_num += 1;
+      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
+    }
+
+    // put points into voxel
+    num = num_points_per_voxel[voxelidx];
+    point_to_voxelidx[i] = num;
+    num_points_per_voxel[voxelidx] += 1;
+
+    // update max points per voxel
+    max_points = std::max(max_points, num + 1);
+  }
+
+  return;
+}
+
+template <typename T, typename T_int>
+void scatter_point_to_voxel_kernel(
+    const torch::TensorAccessor<T, 2> points,
+    torch::TensorAccessor<T_int, 2> coor,
+    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
+    torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
+    torch::TensorAccessor<T, 3> voxels,
+    torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
+    const int num_points, const int NDim) {
+  for (int i = 0; i < num_points; ++i) {
+    int num = point_to_voxelidx[i];
+    int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
+    for (int k = 0; k < num_features; ++k) {
+      voxels[voxelidx][num][k] = points[i][k];
+    }
+    for (int k = 0; k < NDim; ++k) {
+      voxel_coors[voxelidx][k] = coor[i][k];
+    }
+  }
+}
+
+}  // namespace
+
+namespace voxelization {
+
+std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
+    const at::Tensor& points, const at::Tensor& voxel_mapping,
+    const std::vector<float> voxel_size, const std::vector<float> coors_range) {
+  // current version tooks about 0.02s_0.03s for one frame on cpu
+  // check device
+  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
+
+  const int NDim = voxel_mapping.size(1);
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  std::vector<int> grid_size(NDim);
+  for (int i = 0; i < NDim; ++i) {
+    grid_size[i] =
+        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
+  }
+
+  at::Tensor num_points_per_voxel = at::zeros(
+      {
+          num_points,
+      },
+      voxel_mapping.options());
+  at::Tensor coor_to_voxelidx = -at::ones(
+      {grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
+  at::Tensor point_to_voxelidx = -at::ones(
+      {
+          num_points,
+      },
+      voxel_mapping.options());
+
+  int voxel_num = 0;
+  int max_points = 0;
+  AT_DISPATCH_ALL_TYPES(voxel_mapping.scalar_type(), "determin_max_point", [&] {
+    determin_max_points_kernel<scalar_t>(
+        voxel_mapping.accessor<scalar_t, 2>(),
+        point_to_voxelidx.accessor<scalar_t, 1>(),
+        num_points_per_voxel.accessor<scalar_t, 1>(),
+        coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
+        num_points);
+  });
+
+  at::Tensor voxels =
+      at::zeros({voxel_num, max_points, num_features}, points.options());
+  at::Tensor voxel_coors =
+      at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
+
+  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
+    scatter_point_to_voxel_kernel<scalar_t, int>(
+        points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
+        point_to_voxelidx.accessor<int, 1>(),
+        coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
+        voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
+  });
+
+  at::Tensor num_points_per_voxel_out =
+      num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
+  return {voxels, voxel_coors, num_points_per_voxel_out};
+}
+
+}  // namespace voxelization
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/scatter_points_cuda.cu b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/scatter_points_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2ed18690972aade48649ec09a08234db1220d37b
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/scatter_points_cuda.cu
@@ -0,0 +1,310 @@
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/types.h>
+
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
+
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+namespace {
+int const threadsPerBlock = 512;
+int const maxGridDim = 50000;
+}  // namespace
+
+__device__ __forceinline__ static void reduceMax(float *address, float val) {
+  int *address_as_i = reinterpret_cast<int *>(address);
+  int old = *address_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_i, assumed,
+                    __float_as_int(fmaxf(val, __int_as_float(assumed))));
+  } while (assumed != old || __int_as_float(old) < val);
+}
+
+__device__ __forceinline__ static void reduceMax(double *address, double val) {
+  unsigned long long *address_as_ull =
+      reinterpret_cast<unsigned long long *>(address);
+  unsigned long long old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(
+        address_as_ull, assumed,
+        __double_as_longlong(fmax(val, __longlong_as_double(assumed))));
+  } while (assumed != old || __longlong_as_double(old) < val);
+}
+
+// get rid of meaningless warnings when compiling host code
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ static void reduceAdd(float *address, float val) {
+#if (__CUDA_ARCH__ < 200)
+#warning \
+    "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32"
+  int *address_as_i = reinterpret_cast<int *>(address);
+  int old = *address_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_i, assumed,
+                    __float_as_int(val + __int_as_float(assumed)));
+  } while (assumed != old);
+#else
+  atomicAdd(address, val);
+#endif
+}
+
+__device__ __forceinline__ static void reduceAdd(double *address, double val) {
+#if (__CUDA_ARCH__ < 600)
+#warning \
+    "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64"
+  unsigned long long *address_as_ull =
+      reinterpret_cast<unsigned long long *>(address);
+  unsigned long long old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+  } while (assumed != old);
+#else
+  atomicAdd(address, val);
+#endif
+}
+#endif
+
+template <typename T>
+__global__ void
+feats_reduce_kernel(const T *feats, const int32_t *coors_map,
+                    T *reduced_feats, // shall be 0 at initialization
+                    const int num_input, const int num_feats,
+                    const reduce_t reduce_type) {
+  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
+       x += gridDim.x * blockDim.x) {
+    int32_t reduce_to = coors_map[x];
+    if (reduce_to == -1) continue;
+
+    const T *feats_offset = feats + x * num_feats;
+    T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;
+    if (reduce_type == reduce_t::MAX) {
+      for (int i = 0; i < num_feats; i++) {
+        reduceMax(&reduced_feats_offset[i], feats_offset[i]);
+      }
+    } else {
+      for (int i = 0; i < num_feats; i++) {
+        reduceAdd(&reduced_feats_offset[i], feats_offset[i]);
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void add_reduce_traceback_grad_kernel(
+    T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
+    const int32_t *reduce_count, const int num_input, const int num_feats,
+    const reduce_t reduce_type) {
+  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
+       x += gridDim.x * blockDim.x) {
+    int32_t reduce_to = coors_map[x];
+    if (reduce_to == -1) {
+      continue;
+    }
+
+    const int input_offset = x * num_feats;
+    T *grad_feats_offset = grad_feats + input_offset;
+    const int reduced_offset = reduce_to * num_feats;
+    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
+
+    if (reduce_type == reduce_t::SUM) {
+      for (int i = 0; i < num_feats; i++) {
+        grad_feats_offset[i] = grad_reduced_feats_offset[i];
+      }
+    } else if (reduce_type == reduce_t::MEAN) {
+      for (int i = 0; i < num_feats; i++) {
+        grad_feats_offset[i] = grad_reduced_feats_offset[i] /
+                               static_cast<T>(reduce_count[reduce_to]);
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void max_reduce_traceback_scatter_idx_kernel(
+    const T *feats, const T *reduced_feats, int32_t *reduce_from,
+    const int32_t *coors_map, const int num_input, const int num_feats) {
+  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
+       x += gridDim.x * blockDim.x) {
+    int32_t reduce_to = coors_map[x];
+
+    const int input_offset = x * num_feats;
+    const T *feats_offset = feats + input_offset;
+
+    if (reduce_to == -1) {
+      continue;
+    }
+
+    const int reduced_offset = reduce_to * num_feats;
+    const T *reduced_feats_offset = reduced_feats + reduced_offset;
+    int32_t *reduce_from_offset = reduce_from + reduced_offset;
+
+    for (int i = 0; i < num_feats; i++) {
+      if (feats_offset[i] == reduced_feats_offset[i]) {
+        atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
+                                               const T *grad_reduced_feats,
+                                               const int32_t *reduce_from,
+                                               const int num_reduced,
+                                               const int num_feats) {
+  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_reduced;
+       x += gridDim.x * blockDim.x) {
+    const int reduced_offset = x * num_feats;
+    const int32_t *scatter_to_offset = reduce_from + reduced_offset;
+    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
+
+    for (int i = 0; i < num_feats; i++) {
+      grad_feats[scatter_to_offset[i] * num_feats + i] =
+          grad_reduced_feats_offset[i];
+    }
+  }
+}
+
+namespace voxelization {
+
+std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
+    const at::Tensor &feats, const at::Tensor &coors,
+    const reduce_t reduce_type) {
+  CHECK_INPUT(feats);
+  CHECK_INPUT(coors);
+
+  const int num_input = feats.size(0);
+  const int num_feats = feats.size(1);
+
+  if (num_input == 0)
+    return {feats.clone().detach(),
+            coors.clone().detach(),
+            coors.new_empty({0}, torch::kInt32),
+            coors.new_empty({0}, torch::kInt32)};
+
+  at::Tensor out_coors;
+  at::Tensor coors_map;
+  at::Tensor reduce_count;
+
+  auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);
+
+  std::tie(out_coors, coors_map, reduce_count) =
+      at::unique_dim(coors_clean, 0, true, true, true);
+
+  if (out_coors.index({0, 0}).lt(0).item<bool>()) {
+    // the first element of out_coors (-1,-1,-1) and should be removed
+    out_coors = out_coors.slice(0, 1);
+    reduce_count = reduce_count.slice(0, 1);
+    coors_map = coors_map - 1;
+  }
+
+  coors_map = coors_map.to(torch::kInt32);
+  reduce_count = reduce_count.to(torch::kInt32);
+
+  auto reduced_feats =
+      at::empty({out_coors.size(0), num_feats}, feats.options());
+
+  AT_DISPATCH_FLOATING_TYPES(
+      feats.scalar_type(), "feats_reduce_kernel", ([&] {
+    if (reduce_type == reduce_t::MAX)
+      reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
+    else
+      reduced_feats.fill_(static_cast<scalar_t>(0));
+
+    dim3 blocks(std::min(at::cuda::ATenCeilDiv(num_input, threadsPerBlock),
+                         maxGridDim));
+    dim3 threads(threadsPerBlock);
+    feats_reduce_kernel<<<blocks, threads>>>(
+        feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
+        reduced_feats.data_ptr<scalar_t>(), num_input, num_feats, reduce_type);
+    if (reduce_type == reduce_t::MEAN)
+      reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
+  }));
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  return {reduced_feats, out_coors, coors_map, reduce_count};
+}
+
+void dynamic_point_to_voxel_backward_gpu(at::Tensor &grad_feats,
+                                         const at::Tensor &grad_reduced_feats,
+                                         const at::Tensor &feats,
+                                         const at::Tensor &reduced_feats,
+                                         const at::Tensor &coors_map,
+                                         const at::Tensor &reduce_count,
+                                         const reduce_t reduce_type) {
+  CHECK_INPUT(grad_feats);
+  CHECK_INPUT(grad_reduced_feats);
+  CHECK_INPUT(feats);
+  CHECK_INPUT(reduced_feats);
+  CHECK_INPUT(coors_map);
+  CHECK_INPUT(reduce_count);
+
+  const int num_input = feats.size(0);
+  const int num_reduced = reduced_feats.size(0);
+  const int num_feats = feats.size(1);
+
+  grad_feats.fill_(0);
+  // copy voxel grad to points
+
+  if (num_input == 0 || num_reduced == 0) return;
+
+  if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {
+    AT_DISPATCH_FLOATING_TYPES(
+        grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
+        ([&] {
+          dim3 blocks(std::min(
+              at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
+          dim3 threads(threadsPerBlock);
+          add_reduce_traceback_grad_kernel<<<blocks, threads>>>(
+              grad_feats.data_ptr<scalar_t>(),
+              grad_reduced_feats.data_ptr<scalar_t>(),
+              coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),
+              num_input, num_feats, reduce_type);
+        }));
+    AT_CUDA_CHECK(cudaGetLastError());
+  } else {
+    auto reduce_from = at::full({num_reduced, num_feats}, num_input,
+                                coors_map.options().dtype(torch::kInt32));
+    AT_DISPATCH_FLOATING_TYPES(
+        grad_reduced_feats.scalar_type(),
+        "max_reduce_traceback_scatter_idx_kernel", ([&] {
+          dim3 blocks(std::min(
+              at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
+          dim3 threads(threadsPerBlock);
+          max_reduce_traceback_scatter_idx_kernel<<<blocks, threads>>>(
+              feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
+              reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),
+              num_input, num_feats);
+        }));
+    AT_CUDA_CHECK(cudaGetLastError());
+
+    AT_DISPATCH_FLOATING_TYPES(
+        grad_reduced_feats.scalar_type(),
+        "max_reduce_traceback_scatter_idx_kernel", ([&] {
+          dim3 blocks(std::min(
+              at::cuda::ATenCeilDiv(num_reduced, threadsPerBlock), maxGridDim));
+          dim3 threads(threadsPerBlock);
+          max_reduce_scatter_grad_kernel<<<blocks, threads>>>(
+              grad_feats.data_ptr<scalar_t>(),
+              grad_reduced_feats.data_ptr<scalar_t>(),
+              reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);
+        }));
+    AT_CUDA_CHECK(cudaGetLastError());
+  }
+  return;
+}
+
+}  // namespace voxelization
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/scatter_points_cuda.hip b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/scatter_points_cuda.hip
new file mode 100644
index 0000000000000000000000000000000000000000..558cb95c294491277c2b86036d7190e26f0f9a3f
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/scatter_points_cuda.hip
@@ -0,0 +1,313 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+#include <ATen/ATen.h>
+#include <ATen/hip/HIPContext.h>
+#include <torch/types.h>
+
+#include <ATen/hip/HIPApplyUtils.cuh>
+
+typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
+
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+namespace {
+int const threadsPerBlock = 512;
+int const maxGridDim = 50000;
+}  // namespace
+
+__device__ __forceinline__ static void reduceMax(float *address, float val) {
+  int *address_as_i = reinterpret_cast<int *>(address);
+  int old = *address_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_i, assumed,
+                    __float_as_int(fmaxf(val, __int_as_float(assumed))));
+  } while (assumed != old || __int_as_float(old) < val);
+}
+
+__device__ __forceinline__ static void reduceMax(double *address, double val) {
+  unsigned long long *address_as_ull =
+      reinterpret_cast<unsigned long long *>(address);
+  unsigned long long old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(
+        address_as_ull, assumed,
+        __double_as_longlong(fmax(val, __longlong_as_double(assumed))));
+  } while (assumed != old || __longlong_as_double(old) < val);
+}
+
+// get rid of meaningless warnings when compiling host code
+#ifdef __DTK_ARCH__
+__device__ __forceinline__ static void reduceAdd(float *address, float val) {
+#if (__DTK_ARCH__ < 200)
+#warning \
+    "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32"
+  int *address_as_i = reinterpret_cast<int *>(address);
+  int old = *address_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_i, assumed,
+                    __float_as_int(val + __int_as_float(assumed)));
+  } while (assumed != old);
+#else
+  atomicAdd(address, val);
+#endif
+}
+
+__device__ __forceinline__ static void reduceAdd(double *address, double val) {
+#if (__DTK_ARCH__ < 600)
+#warning \
+    "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64"
+  unsigned long long *address_as_ull =
+      reinterpret_cast<unsigned long long *>(address);
+  unsigned long long old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+  } while (assumed != old);
+#else
+  atomicAdd(address, val);
+#endif
+}
+#endif
+
+template <typename T>
+__global__ void
+feats_reduce_kernel(const T *feats, const int32_t *coors_map,
+                    T *reduced_feats, // shall be 0 at initialization
+                    const int num_input, const int num_feats,
+                    const reduce_t reduce_type) {
+  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
+       x += gridDim.x * blockDim.x) {
+    int32_t reduce_to = coors_map[x];
+    if (reduce_to == -1) continue;
+
+    const T *feats_offset = feats + x * num_feats;
+    T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;
+    if (reduce_type == reduce_t::MAX) {
+      for (int i = 0; i < num_feats; i++) {
+        reduceMax(&reduced_feats_offset[i], feats_offset[i]);
+      }
+    } else {
+      for (int i = 0; i < num_feats; i++) {
+        reduceAdd(&reduced_feats_offset[i], feats_offset[i]);
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void add_reduce_traceback_grad_kernel(
+    T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
+    const int32_t *reduce_count, const int num_input, const int num_feats,
+    const reduce_t reduce_type) {
+  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
+       x += gridDim.x * blockDim.x) {
+    int32_t reduce_to = coors_map[x];
+    if (reduce_to == -1) {
+      continue;
+    }
+
+    const int input_offset = x * num_feats;
+    T *grad_feats_offset = grad_feats + input_offset;
+    const int reduced_offset = reduce_to * num_feats;
+    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
+
+    if (reduce_type == reduce_t::SUM) {
+      for (int i = 0; i < num_feats; i++) {
+        grad_feats_offset[i] = grad_reduced_feats_offset[i];
+      }
+    } else if (reduce_type == reduce_t::MEAN) {
+      for (int i = 0; i < num_feats; i++) {
+        grad_feats_offset[i] = grad_reduced_feats_offset[i] /
+                               static_cast<T>(reduce_count[reduce_to]);
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void max_reduce_traceback_scatter_idx_kernel(
+    const T *feats, const T *reduced_feats, int32_t *reduce_from,
+    const int32_t *coors_map, const int num_input, const int num_feats) {
+  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
+       x += gridDim.x * blockDim.x) {
+    int32_t reduce_to = coors_map[x];
+
+    const int input_offset = x * num_feats;
+    const T *feats_offset = feats + input_offset;
+
+    if (reduce_to == -1) {
+      continue;
+    }
+
+    const int reduced_offset = reduce_to * num_feats;
+    const T *reduced_feats_offset = reduced_feats + reduced_offset;
+    int32_t *reduce_from_offset = reduce_from + reduced_offset;
+
+    for (int i = 0; i < num_feats; i++) {
+      if (feats_offset[i] == reduced_feats_offset[i]) {
+        atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
+                                               const T *grad_reduced_feats,
+                                               const int32_t *reduce_from,
+                                               const int num_reduced,
+                                               const int num_feats) {
+  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_reduced;
+       x += gridDim.x * blockDim.x) {
+    const int reduced_offset = x * num_feats;
+    const int32_t *scatter_to_offset = reduce_from + reduced_offset;
+    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
+
+    for (int i = 0; i < num_feats; i++) {
+      grad_feats[scatter_to_offset[i] * num_feats + i] =
+          grad_reduced_feats_offset[i];
+    }
+  }
+}
+
+namespace voxelization {
+
+std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
+    const at::Tensor &feats, const at::Tensor &coors,
+    const reduce_t reduce_type) {
+  CHECK_INPUT(feats);
+  CHECK_INPUT(coors);
+
+  const int num_input = feats.size(0);
+  const int num_feats = feats.size(1);
+
+  if (num_input == 0)
+    return {feats.clone().detach(),
+            coors.clone().detach(),
+            coors.new_empty({0}, torch::kInt32),
+            coors.new_empty({0}, torch::kInt32)};
+
+  at::Tensor out_coors;
+  at::Tensor coors_map;
+  at::Tensor reduce_count;
+
+  auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);
+
+  std::tie(out_coors, coors_map, reduce_count) =
+      at::unique_dim(coors_clean, 0, true, true, true);
+
+  if (out_coors.index({0, 0}).lt(0).item<bool>()) {
+    // the first element of out_coors (-1,-1,-1) and should be removed
+    out_coors = out_coors.slice(0, 1);
+    reduce_count = reduce_count.slice(0, 1);
+    coors_map = coors_map - 1;
+  }
+
+  coors_map = coors_map.to(torch::kInt32);
+  reduce_count = reduce_count.to(torch::kInt32);
+
+  auto reduced_feats =
+      at::empty({out_coors.size(0), num_feats}, feats.options());
+
+  AT_DISPATCH_FLOATING_TYPES(
+      feats.scalar_type(), "feats_reduce_kernel", ([&] {
+    if (reduce_type == reduce_t::MAX)
+      reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
+    else
+      reduced_feats.fill_(static_cast<scalar_t>(0));
+
+    dim3 blocks(::min(at::cuda::ATenCeilDiv(num_input, threadsPerBlock),
+                         maxGridDim));
+    dim3 threads(threadsPerBlock);
+   hipLaunchKernelGGL(( feats_reduce_kernel), dim3(blocks), dim3(threads), 0, 0, 
+        feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
+        reduced_feats.data_ptr<scalar_t>(), num_input, num_feats, reduce_type);
+    if (reduce_type == reduce_t::MEAN)
+      reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
+  }));
+  AT_CUDA_CHECK(hipGetLastError());
+
+  return {reduced_feats, out_coors, coors_map, reduce_count};
+}
+
+void dynamic_point_to_voxel_backward_gpu(at::Tensor &grad_feats,
+                                         const at::Tensor &grad_reduced_feats,
+                                         const at::Tensor &feats,
+                                         const at::Tensor &reduced_feats,
+                                         const at::Tensor &coors_map,
+                                         const at::Tensor &reduce_count,
+                                         const reduce_t reduce_type) {
+  CHECK_INPUT(grad_feats);
+  CHECK_INPUT(grad_reduced_feats);
+  CHECK_INPUT(feats);
+  CHECK_INPUT(reduced_feats);
+  CHECK_INPUT(coors_map);
+  CHECK_INPUT(reduce_count);
+
+  const int num_input = feats.size(0);
+  const int num_reduced = reduced_feats.size(0);
+  const int num_feats = feats.size(1);
+
+  grad_feats.fill_(0);
+  // copy voxel grad to points
+
+  if (num_input == 0 || num_reduced == 0) return;
+
+  if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {
+    AT_DISPATCH_FLOATING_TYPES(
+        grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
+        ([&] {
+          dim3 blocks(::min(
+              at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
+          dim3 threads(threadsPerBlock);
+         hipLaunchKernelGGL(( add_reduce_traceback_grad_kernel), dim3(blocks), dim3(threads), 0, 0, 
+              grad_feats.data_ptr<scalar_t>(),
+              grad_reduced_feats.data_ptr<scalar_t>(),
+              coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),
+              num_input, num_feats, reduce_type);
+        }));
+    AT_CUDA_CHECK(hipGetLastError());
+  } else {
+    auto reduce_from = at::full({num_reduced, num_feats}, num_input,
+                                coors_map.options().dtype(torch::kInt32));
+    AT_DISPATCH_FLOATING_TYPES(
+        grad_reduced_feats.scalar_type(),
+        "max_reduce_traceback_scatter_idx_kernel", ([&] {
+          dim3 blocks(::min(
+              at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
+          dim3 threads(threadsPerBlock);
+         hipLaunchKernelGGL(( max_reduce_traceback_scatter_idx_kernel), dim3(blocks), dim3(threads), 0, 0, 
+              feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
+              reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),
+              num_input, num_feats);
+        }));
+    AT_CUDA_CHECK(hipGetLastError());
+
+    AT_DISPATCH_FLOATING_TYPES(
+        grad_reduced_feats.scalar_type(),
+        "max_reduce_traceback_scatter_idx_kernel", ([&] {
+          dim3 blocks(::min(
+              at::cuda::ATenCeilDiv(num_reduced, threadsPerBlock), maxGridDim));
+          dim3 threads(threadsPerBlock);
+         hipLaunchKernelGGL(( max_reduce_scatter_grad_kernel), dim3(blocks), dim3(threads), 0, 0, 
+              grad_feats.data_ptr<scalar_t>(),
+              grad_reduced_feats.data_ptr<scalar_t>(),
+              reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);
+        }));
+    AT_CUDA_CHECK(hipGetLastError());
+  }
+  return;
+}
+
+}  // namespace voxelization
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization.cpp b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f83348e31a4b7e89fcf4187385eb3145e1a13bbb
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization.cpp
@@ -0,0 +1,13 @@
+#include <torch/extension.h>
+#include "voxelization.h"
+
+namespace voxelization {
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("hard_voxelize", &hard_voxelize, "hard voxelize");
+  m.def("dynamic_voxelize", &dynamic_voxelize, "dynamic voxelization");
+  m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward, "dynamic point to voxel forward");
+  m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward, "dynamic point to voxel backward");
+}
+
+} // namespace voxelization
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization.h b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization.h
new file mode 100644
index 0000000000000000000000000000000000000000..765b30a5006d991075e9e8751ed0adfea336aa30
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization.h
@@ -0,0 +1,142 @@
+#pragma once
+#include <torch/extension.h>
+
+typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
+
+namespace voxelization {
+
+int hard_voxelize_cpu(const at::Tensor &points, at::Tensor &voxels,
+                      at::Tensor &coors, at::Tensor &num_points_per_voxel,
+                      const std::vector<float> voxel_size,
+                      const std::vector<float> coors_range,
+                      const int max_points, const int max_voxels,
+                      const int NDim = 3);
+
+void dynamic_voxelize_cpu(const at::Tensor &points, at::Tensor &coors,
+                          const std::vector<float> voxel_size,
+                          const std::vector<float> coors_range,
+                          const int NDim = 3);
+
+std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
+    const at::Tensor &points, const at::Tensor &voxel_mapping,
+    const std::vector<float> voxel_size, const std::vector<float> coors_range);
+
+#ifdef WITH_CUDA
+int hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels,
+                      at::Tensor &coors, at::Tensor &num_points_per_voxel,
+                      const std::vector<float> voxel_size,
+                      const std::vector<float> coors_range,
+                      const int max_points, const int max_voxels,
+                      const int NDim = 3);
+
+int nondisterministic_hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels,
+                                        at::Tensor &coors, at::Tensor &num_points_per_voxel,
+                                        const std::vector<float> voxel_size,
+                                        const std::vector<float> coors_range,
+                                        const int max_points, const int max_voxels,
+                                        const int NDim = 3);
+
+void dynamic_voxelize_gpu(const at::Tensor &points, at::Tensor &coors,
+                          const std::vector<float> voxel_size,
+                          const std::vector<float> coors_range,
+                          const int NDim = 3);
+
+std::vector<torch::Tensor> dynamic_point_to_voxel_forward_gpu(const torch::Tensor &feats,
+                                                              const torch::Tensor &coors,
+                                                              const reduce_t reduce_type);
+
+void dynamic_point_to_voxel_backward_gpu(torch::Tensor &grad_feats,
+                                         const torch::Tensor &grad_reduced_feats,
+                                         const torch::Tensor &feats,
+                                         const torch::Tensor &reduced_feats,
+                                         const torch::Tensor &coors_idx,
+                                         const torch::Tensor &reduce_count,
+                                         const reduce_t reduce_type);
+#endif
+
+// Interface for Python
+inline int hard_voxelize(const at::Tensor &points, at::Tensor &voxels,
+                         at::Tensor &coors, at::Tensor &num_points_per_voxel,
+                         const std::vector<float> voxel_size,
+                         const std::vector<float> coors_range,
+                         const int max_points, const int max_voxels,
+                         const int NDim = 3, const bool deterministic = true) {
+  if (points.device().is_cuda()) {
+#ifdef WITH_CUDA
+    if (deterministic) {
+      return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
+                               voxel_size, coors_range, max_points, max_voxels,
+                               NDim);
+    }
+    return nondisterministic_hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
+                                               voxel_size, coors_range, max_points, max_voxels,
+                                               NDim);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return hard_voxelize_cpu(points, voxels, coors, num_points_per_voxel,
+                           voxel_size, coors_range, max_points, max_voxels,
+                           NDim);
+}
+
+inline void dynamic_voxelize(const at::Tensor &points, at::Tensor &coors,
+                             const std::vector<float> voxel_size,
+                             const std::vector<float> coors_range,
+                             const int NDim = 3) {
+  if (points.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return dynamic_voxelize_cpu(points, coors, voxel_size, coors_range, NDim);
+}
+
+inline reduce_t convert_reduce_type(const std::string &reduce_type) {
+  if (reduce_type == "max")
+    return reduce_t::MAX;
+  else if (reduce_type == "sum")
+    return reduce_t::SUM;
+  else if (reduce_type == "mean")
+    return reduce_t::MEAN;
+  else TORCH_CHECK(false, "do not support reduce type " + reduce_type)
+  return reduce_t::SUM;
+}
+
+inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(const torch::Tensor &feats,
+                                                                 const torch::Tensor &coors,
+                                                                 const std::string &reduce_type) {
+  if (feats.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return dynamic_point_to_voxel_forward_gpu(feats, coors, convert_reduce_type(reduce_type));
+#else
+    TORCH_CHECK(false, "Not compiled with GPU support");
+#endif
+  }
+  TORCH_CHECK(false, "do not support cpu yet");
+  return std::vector<torch::Tensor>();
+}
+
+inline void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
+                                            const torch::Tensor &grad_reduced_feats,
+                                            const torch::Tensor &feats,
+                                            const torch::Tensor &reduced_feats,
+                                            const torch::Tensor &coors_idx,
+                                            const torch::Tensor &reduce_count,
+                                            const std::string &reduce_type) {
+  if (grad_feats.device().is_cuda()) {
+#ifdef WITH_CUDA
+    dynamic_point_to_voxel_backward_gpu(
+        grad_feats, grad_reduced_feats, feats, reduced_feats, coors_idx, reduce_count,
+        convert_reduce_type(reduce_type));
+    return;
+#else
+    TORCH_CHECK(false, "Not compiled with GPU support");
+#endif
+  }
+  TORCH_CHECK(false, "do not support cpu yet");
+}
+
+}  // namespace voxelization
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cpu.cpp b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f87e26b3aeb436125a938801594544bb76f7da6
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cpu.cpp
@@ -0,0 +1,173 @@
+#include <ATen/TensorUtils.h>
+#include <torch/extension.h>
+// #include "voxelization.h"
+
+namespace {
+
+template <typename T, typename T_int>
+void dynamic_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
+                             torch::TensorAccessor<T_int, 2> coors,
+                             const std::vector<float> voxel_size,
+                             const std::vector<float> coors_range,
+                             const std::vector<int> grid_size,
+                             const int num_points, const int num_features,
+                             const int NDim) {
+  const int ndim_minus_1 = NDim - 1;
+  bool failed = false;
+  // int coor[NDim];
+  int* coor = new int[NDim]();
+  int c;
+
+  for (int i = 0; i < num_points; ++i) {
+    failed = false;
+    for (int j = 0; j < NDim; ++j) {
+      c = floor((points[i][j] - coors_range[j]) / voxel_size[j]);
+      // necessary to rm points out of range
+      if ((c < 0 || c >= grid_size[j])) {
+        failed = true;
+        break;
+      }
+      coor[j] = c;
+    }
+
+    for (int k = 0; k < NDim; ++k) {
+      if (failed)
+        coors[i][k] = -1;
+      else
+        coors[i][k] = coor[k];
+    }
+  }
+
+  delete[] coor;
+  return;
+}
+
+template <typename T, typename T_int>
+void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
+                          torch::TensorAccessor<T, 3> voxels,
+                          torch::TensorAccessor<T_int, 2> coors,
+                          torch::TensorAccessor<T_int, 1> num_points_per_voxel,
+                          torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
+                          int& voxel_num, const std::vector<float> voxel_size,
+                          const std::vector<float> coors_range,
+                          const std::vector<int> grid_size,
+                          const int max_points, const int max_voxels,
+                          const int num_points, const int num_features,
+                          const int NDim) {
+  // declare a temp coors
+  at::Tensor temp_coors = at::zeros(
+      {num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));
+
+  // First use dynamic voxelization to get coors,
+  // then check max points/voxels constraints
+  dynamic_voxelize_kernel<T, int>(points, temp_coors.accessor<int, 2>(),
+                                  voxel_size, coors_range, grid_size,
+                                  num_points, num_features, NDim);
+
+  int voxelidx, num;
+  auto coor = temp_coors.accessor<int, 2>();
+
+  for (int i = 0; i < num_points; ++i) {
+    // T_int* coor = temp_coors.data_ptr<int>() + i * NDim;
+
+    if (coor[i][0] == -1) continue;
+
+    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
+
+    // record voxel
+    if (voxelidx == -1) {
+      voxelidx = voxel_num;
+      if (max_voxels != -1 && voxel_num >= max_voxels) continue;
+      voxel_num += 1;
+
+      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
+
+      for (int k = 0; k < NDim; ++k) {
+        coors[voxelidx][k] = coor[i][k];
+      }
+    }
+
+    // put points into voxel
+    num = num_points_per_voxel[voxelidx];
+    if (max_points == -1 || num < max_points) {
+      for (int k = 0; k < num_features; ++k) {
+        voxels[voxelidx][num][k] = points[i][k];
+      }
+      num_points_per_voxel[voxelidx] += 1;
+    }
+  }
+
+  return;
+}
+
+}  // namespace
+
+namespace voxelization {
+
+int hard_voxelize_cpu(const at::Tensor& points, at::Tensor& voxels,
+                      at::Tensor& coors, at::Tensor& num_points_per_voxel,
+                      const std::vector<float> voxel_size,
+                      const std::vector<float> coors_range,
+                      const int max_points, const int max_voxels,
+                      const int NDim = 3) {
+  // current version tooks about 0.02s_0.03s for one frame on cpu
+  // check device
+  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
+
+  std::vector<int> grid_size(NDim);
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  for (int i = 0; i < NDim; ++i) {
+    grid_size[i] =
+        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
+  }
+
+  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
+  // printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2],
+  // grid_size[1], grid_size[0]);
+  at::Tensor coor_to_voxelidx =
+      -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options());
+
+  int voxel_num = 0;
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      points.scalar_type(), "hard_voxelize_forward", [&] {
+        hard_voxelize_kernel<scalar_t, int>(
+            points.accessor<scalar_t, 2>(), voxels.accessor<scalar_t, 3>(),
+            coors.accessor<int, 2>(), num_points_per_voxel.accessor<int, 1>(),
+            coor_to_voxelidx.accessor<int, 3>(), voxel_num, voxel_size,
+            coors_range, grid_size, max_points, max_voxels, num_points,
+            num_features, NDim);
+      });
+
+  return voxel_num;
+}
+
+void dynamic_voxelize_cpu(const at::Tensor& points, at::Tensor& coors,
+                          const std::vector<float> voxel_size,
+                          const std::vector<float> coors_range,
+                          const int NDim = 3) {
+  // check device
+  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");
+
+  std::vector<int> grid_size(NDim);
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  for (int i = 0; i < NDim; ++i) {
+    grid_size[i] =
+        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
+  }
+
+  // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      points.scalar_type(), "hard_voxelize_forward", [&] {
+        dynamic_voxelize_kernel<scalar_t, int>(
+            points.accessor<scalar_t, 2>(), coors.accessor<int, 2>(),
+            voxel_size, coors_range, grid_size, num_points, num_features, NDim);
+      });
+
+  return;
+}
+
+}  // namespace voxelization
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cuda.cu b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f2c4f5a385df830755047c155e0ea2cde0c6c1d5
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cuda.cu
@@ -0,0 +1,530 @@
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/types.h>
+
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+namespace {
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+}
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+template <typename T, typename T_int>
+__global__ void dynamic_voxelize_kernel(
+    const T* points, T_int* coors, const float voxel_x, const float voxel_y,
+    const float voxel_z, const float coors_x_min, const float coors_y_min,
+    const float coors_z_min, const float coors_x_max, const float coors_y_max,
+    const float coors_z_max, const int grid_x, const int grid_y,
+    const int grid_z, const int num_points, const int num_features,
+    const int NDim) {
+  //   const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(index, num_points) {
+    // To save some computation
+    auto points_offset = points + index * num_features;
+    auto coors_offset = coors + index * NDim;
+    int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
+    if (c_x < 0 || c_x >= grid_x) {
+      coors_offset[0] = -1;
+      return;
+    }
+
+    int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
+    if (c_y < 0 || c_y >= grid_y) {
+      coors_offset[0] = -1;
+      coors_offset[1] = -1;
+      return;
+    }
+
+    int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
+    if (c_z < 0 || c_z >= grid_z) {
+      coors_offset[0] = -1;
+      coors_offset[1] = -1;
+      coors_offset[2] = -1;
+    } else {
+      coors_offset[0] = c_x;
+      coors_offset[1] = c_y;
+      coors_offset[2] = c_z;
+    }
+  }
+}
+
+template <typename T, typename T_int>
+__global__ void assign_point_to_voxel(const int nthreads, const T* points,
+                                      T_int* point_to_voxelidx,
+                                      T_int* coor_to_voxelidx, T* voxels,
+                                      const int max_points,
+                                      const int num_features,
+                                      const int num_points, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
+    int index = thread_idx / num_features;
+
+    int num = point_to_voxelidx[index];
+    int voxelidx = coor_to_voxelidx[index];
+    if (num > -1 && voxelidx > -1) {
+      auto voxels_offset =
+          voxels + voxelidx * max_points * num_features + num * num_features;
+
+      int k = thread_idx % num_features;
+      voxels_offset[k] = points[thread_idx];
+    }
+  }
+}
+
+template <typename T, typename T_int>
+__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
+                                   T_int* point_to_voxelidx,
+                                   T_int* coor_to_voxelidx, T_int* voxel_coors,
+                                   const int num_points, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
+    // if (index >= num_points) return;
+    int index = thread_idx / NDim;
+    int num = point_to_voxelidx[index];
+    int voxelidx = coor_to_voxelidx[index];
+    if (num == 0 && voxelidx > -1) {
+      auto coors_offset = voxel_coors + voxelidx * NDim;
+      int k = thread_idx % NDim;
+      coors_offset[k] = coor[thread_idx];
+    }
+  }
+}
+
+template <typename T_int>
+__global__ void point_to_voxelidx_kernel(const T_int* coor,
+                                         T_int* point_to_voxelidx,
+                                         T_int* point_to_pointidx,
+                                         const int max_points,
+                                         const int max_voxels,
+                                         const int num_points, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(index, num_points) {
+    auto coor_offset = coor + index * NDim;
+    // skip invalid points
+    if ((index >= num_points) || (coor_offset[0] == -1)) return;
+
+    int num = 0;
+    int coor_x = coor_offset[0];
+    int coor_y = coor_offset[1];
+    int coor_z = coor_offset[2];
+    // only calculate the coors before this coor[index]
+    for (int i = 0; i < index; ++i) {
+      auto prev_coor = coor + i * NDim;
+      if (prev_coor[0] == -1) continue;
+
+      // Find all previous points that have the same coors
+      // if find the same coor, record it
+      if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
+          (prev_coor[2] == coor_z)) {
+        num++;
+        if (num == 1) {
+          // point to the same coor that first show up
+          point_to_pointidx[index] = i;
+        } else if (num >= max_points) {
+          // out of boundary
+          return;
+        }
+      }
+    }
+    if (num == 0) {
+      point_to_pointidx[index] = index;
+    }
+    if (num < max_points) {
+      point_to_voxelidx[index] = num;
+    }
+  }
+}
+
+template <typename T_int>
+__global__ void determin_voxel_num(
+    // const T_int* coor,
+    T_int* num_points_per_voxel, T_int* point_to_voxelidx,
+    T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
+    const int max_points, const int max_voxels, const int num_points) {
+  // only calculate the coors before this coor[index]
+  for (int i = 0; i < num_points; ++i) {
+    // if (coor[i][0] == -1)
+    //    continue;
+    int point_pos_in_voxel = point_to_voxelidx[i];
+    // record voxel
+    if (point_pos_in_voxel == -1) {
+      // out of max_points or invalid point
+      continue;
+    } else if (point_pos_in_voxel == 0) {
+      // record new voxel
+      int voxelidx = voxel_num[0];
+      if (voxel_num[0] >= max_voxels) continue;
+      voxel_num[0] += 1;
+      coor_to_voxelidx[i] = voxelidx;
+      num_points_per_voxel[voxelidx] = 1;
+    } else {
+      int point_idx = point_to_pointidx[i];
+      int voxelidx = coor_to_voxelidx[point_idx];
+      if (voxelidx != -1) {
+        coor_to_voxelidx[i] = voxelidx;
+        num_points_per_voxel[voxelidx] += 1;
+      }
+    }
+  }
+}
+
+__global__ void nondisterministic_get_assign_pos(
+    const int nthreads, const int32_t *coors_map, int32_t *pts_id,
+    int32_t *coors_count, int32_t *reduce_count, int32_t *coors_order) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    int coors_idx = coors_map[thread_idx];
+    if (coors_idx > -1) {
+      int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
+      pts_id[thread_idx] = coors_pts_pos;
+      if (coors_pts_pos == 0) {
+        coors_order[coors_idx] = atomicAdd(coors_count, 1);
+      }
+    }
+  }
+}
+
+template<typename T>
+__global__ void nondisterministic_assign_point_voxel(
+    const int nthreads, const T *points, const int32_t *coors_map,
+    const int32_t *pts_id, const int32_t *coors_in,
+    const int32_t *reduce_count, const int32_t *coors_order,
+    T *voxels, int32_t *coors, int32_t *pts_count, const int max_voxels,
+    const int max_points, const int num_features, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    int coors_idx = coors_map[thread_idx];
+    int coors_pts_pos = pts_id[thread_idx];
+    if (coors_idx > -1) {
+      int coors_pos = coors_order[coors_idx];
+      if (coors_pos < max_voxels && coors_pts_pos < max_points) {
+        auto voxels_offset =
+            voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
+        auto points_offset = points + thread_idx * num_features;
+        for (int k = 0; k < num_features; k++) {
+          voxels_offset[k] = points_offset[k];
+        }
+        if (coors_pts_pos == 0) {
+          pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
+          auto coors_offset = coors + coors_pos * NDim;
+          auto coors_in_offset = coors_in + coors_idx * NDim;
+          for (int k = 0; k < NDim; k++) {
+            coors_offset[k] = coors_in_offset[k];
+          }
+        }
+      }
+    }
+  }
+}
+
+namespace voxelization {
+
+int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
+                      at::Tensor& coors, at::Tensor& num_points_per_voxel,
+                      const std::vector<float> voxel_size,
+                      const std::vector<float> coors_range,
+                      const int max_points, const int max_voxels,
+                      const int NDim = 3) {
+  // current version tooks about 0.04s for one frame on cpu
+  // check device
+  CHECK_INPUT(points);
+
+  at::cuda::CUDAGuard device_guard(points.device());
+
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  const float voxel_x = voxel_size[0];
+  const float voxel_y = voxel_size[1];
+  const float voxel_z = voxel_size[2];
+  const float coors_x_min = coors_range[0];
+  const float coors_y_min = coors_range[1];
+  const float coors_z_min = coors_range[2];
+  const float coors_x_max = coors_range[3];
+  const float coors_y_max = coors_range[4];
+  const float coors_z_max = coors_range[5];
+
+  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
+  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
+  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
+
+  // map points to voxel coors
+  at::Tensor temp_coors =
+      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
+
+  dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
+  dim3 block(512);
+
+  // 1. link point to corresponding voxel coors
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "hard_voxelize_kernel", ([&] {
+        dynamic_voxelize_kernel<scalar_t, int>
+            <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
+                points.contiguous().data_ptr<scalar_t>(),
+                temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
+                voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
+                coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
+                num_features, NDim);
+      }));
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  // 2. map point to the idx of the corresponding voxel, find duplicate coor
+  // create some temporary variables
+  auto point_to_pointidx = -at::ones(
+      {
+          num_points,
+      },
+      points.options().dtype(at::kInt));
+  auto point_to_voxelidx = -at::ones(
+      {
+          num_points,
+      },
+      points.options().dtype(at::kInt));
+
+  dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
+  dim3 map_block(512);
+  AT_DISPATCH_ALL_TYPES(
+      temp_coors.scalar_type(), "determin_duplicate", ([&] {
+        point_to_voxelidx_kernel<int>
+            <<<map_grid, map_block, 0, at::cuda::getCurrentCUDAStream()>>>(
+                temp_coors.contiguous().data_ptr<int>(),
+                point_to_voxelidx.contiguous().data_ptr<int>(),
+                point_to_pointidx.contiguous().data_ptr<int>(), max_points,
+                max_voxels, num_points, NDim);
+      }));
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  // 3. determined voxel num and voxel's coor index
+  // make the logic in the CUDA device could accelerate about 10 times
+  auto coor_to_voxelidx = -at::ones(
+      {
+          num_points,
+      },
+      points.options().dtype(at::kInt));
+  auto voxel_num = at::zeros(
+      {
+          1,
+      },
+      points.options().dtype(at::kInt));  // must be zero from the beginning
+
+  AT_DISPATCH_ALL_TYPES(
+      temp_coors.scalar_type(), "determin_duplicate", ([&] {
+        determin_voxel_num<int><<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
+            num_points_per_voxel.contiguous().data_ptr<int>(),
+            point_to_voxelidx.contiguous().data_ptr<int>(),
+            point_to_pointidx.contiguous().data_ptr<int>(),
+            coor_to_voxelidx.contiguous().data_ptr<int>(),
+            voxel_num.contiguous().data_ptr<int>(), max_points, max_voxels,
+            num_points);
+      }));
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  // 4. copy point features to voxels
+  // Step 4 & 5 could be parallel
+  auto pts_output_size = num_points * num_features;
+  dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
+  dim3 cp_block(512);
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
+        assign_point_to_voxel<float, int>
+            <<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
+                pts_output_size, points.contiguous().data_ptr<float>(),
+                point_to_voxelidx.contiguous().data_ptr<int>(),
+                coor_to_voxelidx.contiguous().data_ptr<int>(),
+                voxels.contiguous().data_ptr<float>(), max_points, num_features,
+                num_points, NDim);
+      }));
+  //   cudaDeviceSynchronize();
+  //   AT_CUDA_CHECK(cudaGetLastError());
+
+  // 5. copy coors of each voxels
+  auto coors_output_size = num_points * NDim;
+  dim3 coors_cp_grid(
+      std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
+  dim3 coors_cp_block(512);
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
+        assign_voxel_coors<float, int><<<coors_cp_grid, coors_cp_block, 0,
+                                         at::cuda::getCurrentCUDAStream()>>>(
+            coors_output_size, temp_coors.contiguous().data_ptr<int>(),
+            point_to_voxelidx.contiguous().data_ptr<int>(),
+            coor_to_voxelidx.contiguous().data_ptr<int>(),
+            coors.contiguous().data_ptr<int>(), num_points, NDim);
+      }));
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  auto voxel_num_cpu = voxel_num.to(at::kCPU);
+  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
+
+  return voxel_num_int;
+}
+
+int nondisterministic_hard_voxelize_gpu(
+    const at::Tensor &points, at::Tensor &voxels,
+    at::Tensor &coors, at::Tensor &num_points_per_voxel,
+    const std::vector<float> voxel_size,
+    const std::vector<float> coors_range,
+    const int max_points, const int max_voxels,
+    const int NDim = 3) {
+
+  CHECK_INPUT(points);
+
+  at::cuda::CUDAGuard device_guard(points.device());
+
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  if (num_points == 0)
+    return 0;
+
+  const float voxel_x = voxel_size[0];
+  const float voxel_y = voxel_size[1];
+  const float voxel_z = voxel_size[2];
+  const float coors_x_min = coors_range[0];
+  const float coors_y_min = coors_range[1];
+  const float coors_z_min = coors_range[2];
+  const float coors_x_max = coors_range[3];
+  const float coors_y_max = coors_range[4];
+  const float coors_z_max = coors_range[5];
+
+  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
+  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
+  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
+
+  // map points to voxel coors
+  at::Tensor temp_coors =
+      at::zeros({num_points, NDim}, points.options().dtype(torch::kInt32));
+
+  dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
+  dim3 block(512);
+
+  // 1. link point to corresponding voxel coors
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "hard_voxelize_kernel", ([&] {
+    dynamic_voxelize_kernel<scalar_t, int>
+    <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
+        points.contiguous().data_ptr<scalar_t>(),
+        temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
+        voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
+        coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
+        num_features, NDim);
+  }));
+
+  at::Tensor coors_map;
+  at::Tensor coors_count;
+  at::Tensor coors_order;
+  at::Tensor reduce_count;
+  at::Tensor pts_id;
+
+  auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);
+
+  std::tie(temp_coors, coors_map, reduce_count) =
+      at::unique_dim(coors_clean, 0, true, true, false);
+
+  if (temp_coors.index({0, 0}).lt(0).item<bool>()) {
+    // the first element of temp_coors is (-1,-1,-1) and should be removed
+    temp_coors = temp_coors.slice(0, 1);
+    coors_map = coors_map - 1;
+  }
+
+  int num_coors = temp_coors.size(0);
+  temp_coors = temp_coors.to(torch::kInt32);
+  coors_map = coors_map.to(torch::kInt32);
+
+  coors_count = coors_map.new_zeros(1);
+  coors_order = coors_map.new_empty(num_coors);
+  reduce_count = coors_map.new_zeros(num_coors);
+  pts_id = coors_map.new_zeros(num_points);
+
+  dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
+  dim3 cp_block(512);
+  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "get_assign_pos", ([&] {
+    nondisterministic_get_assign_pos<<<cp_grid, cp_block, 0,
+    at::cuda::getCurrentCUDAStream()>>>(
+        num_points,
+        coors_map.contiguous().data_ptr<int32_t>(),
+        pts_id.contiguous().data_ptr<int32_t>(),
+        coors_count.contiguous().data_ptr<int32_t>(),
+        reduce_count.contiguous().data_ptr<int32_t>(),
+        coors_order.contiguous().data_ptr<int32_t>());
+  }));
+
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
+    nondisterministic_assign_point_voxel<scalar_t>
+    <<<cp_grid, cp_block, 0, at::cuda::getCurrentCUDAStream()>>>(
+        num_points, points.contiguous().data_ptr<scalar_t>(),
+        coors_map.contiguous().data_ptr<int32_t>(),
+        pts_id.contiguous().data_ptr<int32_t>(),
+        temp_coors.contiguous().data_ptr<int32_t>(),
+        reduce_count.contiguous().data_ptr<int32_t>(),
+        coors_order.contiguous().data_ptr<int32_t>(),
+        voxels.contiguous().data_ptr<scalar_t>(),
+        coors.contiguous().data_ptr<int32_t>(),
+        num_points_per_voxel.contiguous().data_ptr<int32_t>(),
+        max_voxels, max_points,
+        num_features, NDim);
+  }));
+  AT_CUDA_CHECK(cudaGetLastError());
+  return max_voxels < num_coors ? max_voxels : num_coors;
+}
+
+void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
+                          const std::vector<float> voxel_size,
+                          const std::vector<float> coors_range,
+                          const int NDim = 3) {
+  // current version tooks about 0.04s for one frame on cpu
+  // check device
+  CHECK_INPUT(points);
+
+  at::cuda::CUDAGuard device_guard(points.device());
+
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  const float voxel_x = voxel_size[0];
+  const float voxel_y = voxel_size[1];
+  const float voxel_z = voxel_size[2];
+  const float coors_x_min = coors_range[0];
+  const float coors_y_min = coors_range[1];
+  const float coors_z_min = coors_range[2];
+  const float coors_x_max = coors_range[3];
+  const float coors_y_max = coors_range[4];
+  const float coors_z_max = coors_range[5];
+
+  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
+  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
+  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
+
+  const int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
+  dim3 blocks(col_blocks);
+  dim3 threads(threadsPerBlock);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
+    dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
+        points.contiguous().data_ptr<scalar_t>(),
+        coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
+        coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
+        coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
+  });
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  return;
+}
+
+}  // namespace voxelization
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cuda.hip b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cuda.hip
new file mode 100644
index 0000000000000000000000000000000000000000..b7ae0dbafa272f0de165894c3ebf95204f1165f7
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cuda.hip
@@ -0,0 +1,533 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+#include <ATen/ATen.h>
+#include <ATen/hip/HIPContext.h>
+#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
+#include <torch/types.h>
+
+#include <ATen/hip/HIPApplyUtils.cuh>
+
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+namespace {
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+}
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+template <typename T, typename T_int>
+__global__ void dynamic_voxelize_kernel(
+    const T* points, T_int* coors, const float voxel_x, const float voxel_y,
+    const float voxel_z, const float coors_x_min, const float coors_y_min,
+    const float coors_z_min, const float coors_x_max, const float coors_y_max,
+    const float coors_z_max, const int grid_x, const int grid_y,
+    const int grid_z, const int num_points, const int num_features,
+    const int NDim) {
+  //   const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(index, num_points) {
+    // To save some computation
+    auto points_offset = points + index * num_features;
+    auto coors_offset = coors + index * NDim;
+    int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
+    if (c_x < 0 || c_x >= grid_x) {
+      coors_offset[0] = -1;
+      return;
+    }
+
+    int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
+    if (c_y < 0 || c_y >= grid_y) {
+      coors_offset[0] = -1;
+      coors_offset[1] = -1;
+      return;
+    }
+
+    int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
+    if (c_z < 0 || c_z >= grid_z) {
+      coors_offset[0] = -1;
+      coors_offset[1] = -1;
+      coors_offset[2] = -1;
+    } else {
+      coors_offset[0] = c_x;
+      coors_offset[1] = c_y;
+      coors_offset[2] = c_z;
+    }
+  }
+}
+
+template <typename T, typename T_int>
+__global__ void assign_point_to_voxel(const int nthreads, const T* points,
+                                      T_int* point_to_voxelidx,
+                                      T_int* coor_to_voxelidx, T* voxels,
+                                      const int max_points,
+                                      const int num_features,
+                                      const int num_points, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
+    int index = thread_idx / num_features;
+
+    int num = point_to_voxelidx[index];
+    int voxelidx = coor_to_voxelidx[index];
+    if (num > -1 && voxelidx > -1) {
+      auto voxels_offset =
+          voxels + voxelidx * max_points * num_features + num * num_features;
+
+      int k = thread_idx % num_features;
+      voxels_offset[k] = points[thread_idx];
+    }
+  }
+}
+
+template <typename T, typename T_int>
+__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
+                                   T_int* point_to_voxelidx,
+                                   T_int* coor_to_voxelidx, T_int* voxel_coors,
+                                   const int num_points, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
+    // if (index >= num_points) return;
+    int index = thread_idx / NDim;
+    int num = point_to_voxelidx[index];
+    int voxelidx = coor_to_voxelidx[index];
+    if (num == 0 && voxelidx > -1) {
+      auto coors_offset = voxel_coors + voxelidx * NDim;
+      int k = thread_idx % NDim;
+      coors_offset[k] = coor[thread_idx];
+    }
+  }
+}
+
+template <typename T_int>
+__global__ void point_to_voxelidx_kernel(const T_int* coor,
+                                         T_int* point_to_voxelidx,
+                                         T_int* point_to_pointidx,
+                                         const int max_points,
+                                         const int max_voxels,
+                                         const int num_points, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(index, num_points) {
+    auto coor_offset = coor + index * NDim;
+    // skip invalid points
+    if ((index >= num_points) || (coor_offset[0] == -1)) return;
+
+    int num = 0;
+    int coor_x = coor_offset[0];
+    int coor_y = coor_offset[1];
+    int coor_z = coor_offset[2];
+    // only calculate the coors before this coor[index]
+    for (int i = 0; i < index; ++i) {
+      auto prev_coor = coor + i * NDim;
+      if (prev_coor[0] == -1) continue;
+
+      // Find all previous points that have the same coors
+      // if find the same coor, record it
+      if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
+          (prev_coor[2] == coor_z)) {
+        num++;
+        if (num == 1) {
+          // point to the same coor that first show up
+          point_to_pointidx[index] = i;
+        } else if (num >= max_points) {
+          // out of boundary
+          return;
+        }
+      }
+    }
+    if (num == 0) {
+      point_to_pointidx[index] = index;
+    }
+    if (num < max_points) {
+      point_to_voxelidx[index] = num;
+    }
+  }
+}
+
+template <typename T_int>
+__global__ void determin_voxel_num(
+    // const T_int* coor,
+    T_int* num_points_per_voxel, T_int* point_to_voxelidx,
+    T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
+    const int max_points, const int max_voxels, const int num_points) {
+  // only calculate the coors before this coor[index]
+  for (int i = 0; i < num_points; ++i) {
+    // if (coor[i][0] == -1)
+    //    continue;
+    int point_pos_in_voxel = point_to_voxelidx[i];
+    // record voxel
+    if (point_pos_in_voxel == -1) {
+      // out of max_points or invalid point
+      continue;
+    } else if (point_pos_in_voxel == 0) {
+      // record new voxel
+      int voxelidx = voxel_num[0];
+      if (voxel_num[0] >= max_voxels) continue;
+      voxel_num[0] += 1;
+      coor_to_voxelidx[i] = voxelidx;
+      num_points_per_voxel[voxelidx] = 1;
+    } else {
+      int point_idx = point_to_pointidx[i];
+      int voxelidx = coor_to_voxelidx[point_idx];
+      if (voxelidx != -1) {
+        coor_to_voxelidx[i] = voxelidx;
+        num_points_per_voxel[voxelidx] += 1;
+      }
+    }
+  }
+}
+
+__global__ void nondisterministic_get_assign_pos(
+    const int nthreads, const int32_t *coors_map, int32_t *pts_id,
+    int32_t *coors_count, int32_t *reduce_count, int32_t *coors_order) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    int coors_idx = coors_map[thread_idx];
+    if (coors_idx > -1) {
+      int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
+      pts_id[thread_idx] = coors_pts_pos;
+      if (coors_pts_pos == 0) {
+        coors_order[coors_idx] = atomicAdd(coors_count, 1);
+      }
+    }
+  }
+}
+
+template<typename T>
+__global__ void nondisterministic_assign_point_voxel(
+    const int nthreads, const T *points, const int32_t *coors_map,
+    const int32_t *pts_id, const int32_t *coors_in,
+    const int32_t *reduce_count, const int32_t *coors_order,
+    T *voxels, int32_t *coors, int32_t *pts_count, const int max_voxels,
+    const int max_points, const int num_features, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    int coors_idx = coors_map[thread_idx];
+    int coors_pts_pos = pts_id[thread_idx];
+    if (coors_idx > -1) {
+      int coors_pos = coors_order[coors_idx];
+      if (coors_pos < max_voxels && coors_pts_pos < max_points) {
+        auto voxels_offset =
+            voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
+        auto points_offset = points + thread_idx * num_features;
+        for (int k = 0; k < num_features; k++) {
+          voxels_offset[k] = points_offset[k];
+        }
+        if (coors_pts_pos == 0) {
+          pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
+          auto coors_offset = coors + coors_pos * NDim;
+          auto coors_in_offset = coors_in + coors_idx * NDim;
+          for (int k = 0; k < NDim; k++) {
+            coors_offset[k] = coors_in_offset[k];
+          }
+        }
+      }
+    }
+  }
+}
+
+namespace voxelization {
+
+int hard_voxelize_gpu(const at::Tensor& points, at::Tensor& voxels,
+                      at::Tensor& coors, at::Tensor& num_points_per_voxel,
+                      const std::vector<float> voxel_size,
+                      const std::vector<float> coors_range,
+                      const int max_points, const int max_voxels,
+                      const int NDim = 3) {
+  // current version tooks about 0.04s for one frame on cpu
+  // check device
+  CHECK_INPUT(points);
+
+  at::hip::HIPGuardMasqueradingAsCUDA device_guard(points.device());
+
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  const float voxel_x = voxel_size[0];
+  const float voxel_y = voxel_size[1];
+  const float voxel_z = voxel_size[2];
+  const float coors_x_min = coors_range[0];
+  const float coors_y_min = coors_range[1];
+  const float coors_z_min = coors_range[2];
+  const float coors_x_max = coors_range[3];
+  const float coors_y_max = coors_range[4];
+  const float coors_z_max = coors_range[5];
+
+  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
+  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
+  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
+
+  // map points to voxel coors
+  at::Tensor temp_coors =
+      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
+
+  dim3 grid(::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
+  dim3 block(512);
+
+  // 1. link point to corresponding voxel coors
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "hard_voxelize_kernel", ([&] {
+       hipLaunchKernelGGL(( dynamic_voxelize_kernel<scalar_t, int>)
+            , dim3(grid), dim3(block), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), 
+                points.contiguous().data_ptr<scalar_t>(),
+                temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
+                voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
+                coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
+                num_features, NDim);
+      }));
+  hipDeviceSynchronize();
+  AT_CUDA_CHECK(hipGetLastError());
+
+  // 2. map point to the idx of the corresponding voxel, find duplicate coor
+  // create some temporary variables
+  auto point_to_pointidx = -at::ones(
+      {
+          num_points,
+      },
+      points.options().dtype(at::kInt));
+  auto point_to_voxelidx = -at::ones(
+      {
+          num_points,
+      },
+      points.options().dtype(at::kInt));
+
+  dim3 map_grid(::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
+  dim3 map_block(512);
+  AT_DISPATCH_ALL_TYPES(
+      temp_coors.scalar_type(), "determin_duplicate", ([&] {
+       hipLaunchKernelGGL(( point_to_voxelidx_kernel<int>)
+            , dim3(map_grid), dim3(map_block), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), 
+                temp_coors.contiguous().data_ptr<int>(),
+                point_to_voxelidx.contiguous().data_ptr<int>(),
+                point_to_pointidx.contiguous().data_ptr<int>(), max_points,
+                max_voxels, num_points, NDim);
+      }));
+  hipDeviceSynchronize();
+  AT_CUDA_CHECK(hipGetLastError());
+
+  // 3. determined voxel num and voxel's coor index
+  // make the logic in the CUDA device could accelerate about 10 times
+  auto coor_to_voxelidx = -at::ones(
+      {
+          num_points,
+      },
+      points.options().dtype(at::kInt));
+  auto voxel_num = at::zeros(
+      {
+          1,
+      },
+      points.options().dtype(at::kInt));  // must be zero from the beginning
+
+  AT_DISPATCH_ALL_TYPES(
+      temp_coors.scalar_type(), "determin_duplicate", ([&] {
+       hipLaunchKernelGGL(( determin_voxel_num<int>), dim3(1), dim3(1), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), 
+            num_points_per_voxel.contiguous().data_ptr<int>(),
+            point_to_voxelidx.contiguous().data_ptr<int>(),
+            point_to_pointidx.contiguous().data_ptr<int>(),
+            coor_to_voxelidx.contiguous().data_ptr<int>(),
+            voxel_num.contiguous().data_ptr<int>(), max_points, max_voxels,
+            num_points);
+      }));
+  hipDeviceSynchronize();
+  AT_CUDA_CHECK(hipGetLastError());
+
+  // 4. copy point features to voxels
+  // Step 4 & 5 could be parallel
+  auto pts_output_size = num_points * num_features;
+  dim3 cp_grid(::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096));
+  dim3 cp_block(512);
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
+       hipLaunchKernelGGL(( assign_point_to_voxel<float, int>)
+            , dim3(cp_grid), dim3(cp_block), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), 
+                pts_output_size, points.contiguous().data_ptr<float>(),
+                point_to_voxelidx.contiguous().data_ptr<int>(),
+                coor_to_voxelidx.contiguous().data_ptr<int>(),
+                voxels.contiguous().data_ptr<float>(), max_points, num_features,
+                num_points, NDim);
+      }));
+  //   hipDeviceSynchronize();
+  //   AT_CUDA_CHECK(hipGetLastError());
+
+  // 5. copy coors of each voxels
+  auto coors_output_size = num_points * NDim;
+  dim3 coors_cp_grid(
+      ::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096));
+  dim3 coors_cp_block(512);
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
+       hipLaunchKernelGGL(( assign_voxel_coors<float, int>), dim3(coors_cp_grid), dim3(coors_cp_block), 0,
+                                         at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), 
+            coors_output_size, temp_coors.contiguous().data_ptr<int>(),
+            point_to_voxelidx.contiguous().data_ptr<int>(),
+            coor_to_voxelidx.contiguous().data_ptr<int>(),
+            coors.contiguous().data_ptr<int>(), num_points, NDim);
+      }));
+  hipDeviceSynchronize();
+  AT_CUDA_CHECK(hipGetLastError());
+
+  auto voxel_num_cpu = voxel_num.to(at::kCPU);
+  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
+
+  return voxel_num_int;
+}
+
+int nondisterministic_hard_voxelize_gpu(
+    const at::Tensor &points, at::Tensor &voxels,
+    at::Tensor &coors, at::Tensor &num_points_per_voxel,
+    const std::vector<float> voxel_size,
+    const std::vector<float> coors_range,
+    const int max_points, const int max_voxels,
+    const int NDim = 3) {
+
+  CHECK_INPUT(points);
+
+  at::hip::HIPGuardMasqueradingAsCUDA device_guard(points.device());
+
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  if (num_points == 0)
+    return 0;
+
+  const float voxel_x = voxel_size[0];
+  const float voxel_y = voxel_size[1];
+  const float voxel_z = voxel_size[2];
+  const float coors_x_min = coors_range[0];
+  const float coors_y_min = coors_range[1];
+  const float coors_z_min = coors_range[2];
+  const float coors_x_max = coors_range[3];
+  const float coors_y_max = coors_range[4];
+  const float coors_z_max = coors_range[5];
+
+  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
+  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
+  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
+
+  // map points to voxel coors
+  at::Tensor temp_coors =
+      at::zeros({num_points, NDim}, points.options().dtype(torch::kInt32));
+
+  dim3 grid(::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
+  dim3 block(512);
+
+  // 1. link point to corresponding voxel coors
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "hard_voxelize_kernel", ([&] {
+   hipLaunchKernelGGL(( dynamic_voxelize_kernel<scalar_t, int>)
+    , dim3(grid), dim3(block), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), 
+        points.contiguous().data_ptr<scalar_t>(),
+        temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y,
+        voxel_z, coors_x_min, coors_y_min, coors_z_min, coors_x_max,
+        coors_y_max, coors_z_max, grid_x, grid_y, grid_z, num_points,
+        num_features, NDim);
+  }));
+
+  at::Tensor coors_map;
+  at::Tensor coors_count;
+  at::Tensor coors_order;
+  at::Tensor reduce_count;
+  at::Tensor pts_id;
+
+  auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);
+
+  std::tie(temp_coors, coors_map, reduce_count) =
+      at::unique_dim(coors_clean, 0, true, true, false);
+
+  if (temp_coors.index({0, 0}).lt(0).item<bool>()) {
+    // the first element of temp_coors is (-1,-1,-1) and should be removed
+    temp_coors = temp_coors.slice(0, 1);
+    coors_map = coors_map - 1;
+  }
+
+  int num_coors = temp_coors.size(0);
+  temp_coors = temp_coors.to(torch::kInt32);
+  coors_map = coors_map.to(torch::kInt32);
+
+  coors_count = coors_map.new_zeros(1);
+  coors_order = coors_map.new_empty(num_coors);
+  reduce_count = coors_map.new_zeros(num_coors);
+  pts_id = coors_map.new_zeros(num_points);
+
+  dim3 cp_grid(::min(at::cuda::ATenCeilDiv(num_points, 512), 4096));
+  dim3 cp_block(512);
+  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "get_assign_pos", ([&] {
+   hipLaunchKernelGGL(( nondisterministic_get_assign_pos), dim3(cp_grid), dim3(cp_block), 0,
+    at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), 
+        num_points,
+        coors_map.contiguous().data_ptr<int32_t>(),
+        pts_id.contiguous().data_ptr<int32_t>(),
+        coors_count.contiguous().data_ptr<int32_t>(),
+        reduce_count.contiguous().data_ptr<int32_t>(),
+        coors_order.contiguous().data_ptr<int32_t>());
+  }));
+
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
+   hipLaunchKernelGGL(( nondisterministic_assign_point_voxel<scalar_t>)
+    , dim3(cp_grid), dim3(cp_block), 0, at::hip::getCurrentHIPStreamMasqueradingAsCUDA(), 
+        num_points, points.contiguous().data_ptr<scalar_t>(),
+        coors_map.contiguous().data_ptr<int32_t>(),
+        pts_id.contiguous().data_ptr<int32_t>(),
+        temp_coors.contiguous().data_ptr<int32_t>(),
+        reduce_count.contiguous().data_ptr<int32_t>(),
+        coors_order.contiguous().data_ptr<int32_t>(),
+        voxels.contiguous().data_ptr<scalar_t>(),
+        coors.contiguous().data_ptr<int32_t>(),
+        num_points_per_voxel.contiguous().data_ptr<int32_t>(),
+        max_voxels, max_points,
+        num_features, NDim);
+  }));
+  AT_CUDA_CHECK(hipGetLastError());
+  return max_voxels < num_coors ? max_voxels : num_coors;
+}
+
+void dynamic_voxelize_gpu(const at::Tensor& points, at::Tensor& coors,
+                          const std::vector<float> voxel_size,
+                          const std::vector<float> coors_range,
+                          const int NDim = 3) {
+  // current version tooks about 0.04s for one frame on cpu
+  // check device
+  CHECK_INPUT(points);
+
+  at::hip::HIPGuardMasqueradingAsCUDA device_guard(points.device());
+
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  const float voxel_x = voxel_size[0];
+  const float voxel_y = voxel_size[1];
+  const float voxel_z = voxel_size[2];
+  const float coors_x_min = coors_range[0];
+  const float coors_y_min = coors_range[1];
+  const float coors_z_min = coors_range[2];
+  const float coors_x_max = coors_range[3];
+  const float coors_y_max = coors_range[4];
+  const float coors_z_max = coors_range[5];
+
+  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
+  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
+  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
+
+  const int col_blocks = at::cuda::ATenCeilDiv(num_points, threadsPerBlock);
+  dim3 blocks(col_blocks);
+  dim3 threads(threadsPerBlock);
+  hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+
+  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] {
+   hipLaunchKernelGGL(( dynamic_voxelize_kernel<scalar_t, int>), dim3(blocks), dim3(threads), 0, stream, 
+        points.contiguous().data_ptr<scalar_t>(),
+        coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
+        coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
+        coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim);
+  });
+  hipDeviceSynchronize();
+  AT_CUDA_CHECK(hipGetLastError());
+
+  return;
+}
+
+}  // namespace voxelization
diff --git a/mmde/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py b/mmde/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
new file mode 100644
index 0000000000000000000000000000000000000000..00a937414eaf4a040b6ab6d8d03e19f8f431b263
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/ops/voxel/voxelize.py
@@ -0,0 +1,161 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+from torch import nn
+from torch.autograd import Function
+from torch.nn.modules.utils import _pair
+
+from .voxel_layer import dynamic_voxelize, hard_voxelize
+
+
+class _Voxelization(Function):
+
+    @staticmethod
+    def forward(ctx,
+                points,
+                voxel_size,
+                coors_range,
+                max_points=35,
+                max_voxels=20000,
+                deterministic=True):
+        """convert kitti points(N, >=3) to voxels.
+
+        Args:
+            points: [N, ndim] float tensor. points[:, :3] contain xyz points
+                and points[:, 3:] contain other information like reflectivity
+            voxel_size: [3] list/tuple or array, float. xyz, indicate voxel
+                size
+            coors_range: [6] list/tuple or array, float. indicate voxel
+                range. format: xyzxyz, minmax
+            max_points: int. indicate maximum points contained in a voxel. if
+                max_points=-1, it means using dynamic_voxelize
+            max_voxels: int. indicate maximum voxels this function create.
+                for second, 20000 is a good choice. Users should shuffle points
+                before call this function because max_voxels may drop points.
+            deterministic: bool. whether to invoke the non-deterministic
+                version of hard-voxelization implementations. non-deterministic
+                version is considerablly fast but is not deterministic. only
+                affects hard voxelization. default True. for more information
+                of this argument and the implementation insights, please refer
+                to the following links:
+                https://github.com/open-mmlab/mmdetection3d/issues/894
+                https://github.com/open-mmlab/mmdetection3d/pull/904
+                it is an experimental feature and we will appreciate it if
+                you could share with us the failing cases.
+
+        Returns:
+            voxels: [M, max_points, ndim] float tensor. only contain points
+                    and returned when max_points != -1.
+            coordinates: [M, 3] int32 tensor, always returned.
+            num_points_per_voxel: [M] int32 tensor. Only returned when
+                max_points != -1.
+        """
+        if max_points == -1 or max_voxels == -1:
+            coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int)
+            dynamic_voxelize(points, coors, voxel_size, coors_range, 3)
+            return coors
+        else:
+            voxels = points.new_zeros(
+                size=(max_voxels, max_points, points.size(1)))
+            coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int)
+            num_points_per_voxel = points.new_zeros(
+                size=(max_voxels, ), dtype=torch.int)
+            voxel_num = hard_voxelize(
+                points,
+                voxels,
+                coors,
+                num_points_per_voxel,
+                voxel_size,
+                coors_range,
+                max_points,
+                max_voxels,
+                3,
+                deterministic,
+            )
+            # select the valid voxels
+            voxels_out = voxels[:voxel_num]
+            coors_out = coors[:voxel_num]
+            num_points_per_voxel_out = num_points_per_voxel[:voxel_num]
+            return voxels_out, coors_out, num_points_per_voxel_out
+
+
+voxelization = _Voxelization.apply
+
+
+class Voxelization(nn.Module):
+
+    def __init__(self,
+                 voxel_size,
+                 point_cloud_range,
+                 max_num_points,
+                 max_voxels=20000,
+                 deterministic=True):
+        super(Voxelization, self).__init__()
+        """
+        Args:
+            voxel_size (list): list [x, y, z] size of three dimension
+            point_cloud_range (list):
+                [x_min, y_min, z_min, x_max, y_max, z_max]
+            max_num_points (int): max number of points per voxel
+            max_voxels (tuple or int): max number of voxels in
+                (training, testing) time
+            deterministic: bool. whether to invoke the non-deterministic
+                version of hard-voxelization implementations. non-deterministic
+                version is considerablly fast but is not deterministic. only
+                affects hard voxelization. default True. for more information
+                of this argument and the implementation insights, please refer
+                to the following links:
+                https://github.com/open-mmlab/mmdetection3d/issues/894
+                https://github.com/open-mmlab/mmdetection3d/pull/904
+                it is an experimental feature and we will appreciate it if
+                you could share with us the failing cases.
+        """
+        self.voxel_size = voxel_size
+        self.point_cloud_range = point_cloud_range
+        self.max_num_points = max_num_points
+        if isinstance(max_voxels, tuple):
+            self.max_voxels = max_voxels
+        else:
+            self.max_voxels = _pair(max_voxels)
+        self.deterministic = deterministic
+
+        point_cloud_range = torch.tensor(
+            point_cloud_range, dtype=torch.float32)
+        # [0, -40, -3, 70.4, 40, 1]
+        voxel_size = torch.tensor(voxel_size, dtype=torch.float32)
+        grid_size = (point_cloud_range[3:] -
+                     point_cloud_range[:3]) / voxel_size
+        grid_size = torch.round(grid_size).long()
+        input_feat_shape = grid_size[:2]
+        self.grid_size = grid_size
+        # the origin shape is as [x-len, y-len, z-len]
+        # [w, h, d] -> [d, h, w] removed
+        self.pcd_shape = [*input_feat_shape, 1]  # [::-1]
+
+    def forward(self, input):
+        """
+        Args:
+            input: NC points
+        """
+        if self.training:
+            max_voxels = self.max_voxels[0]
+        else:
+            max_voxels = self.max_voxels[1]
+
+        return voxelization(
+            input,
+            self.voxel_size,
+            self.point_cloud_range,
+            self.max_num_points,
+            max_voxels,
+            self.deterministic,
+        )
+
+    def __repr__(self):
+        tmpstr = self.__class__.__name__ + '('
+        tmpstr += 'voxel_size=' + str(self.voxel_size)
+        tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
+        tmpstr += ', max_num_points=' + str(self.max_num_points)
+        tmpstr += ', max_voxels=' + str(self.max_voxels)
+        tmpstr += ', deterministic=' + str(self.deterministic)
+        tmpstr += ')'
+        return tmpstr
diff --git a/mmde/projects/BEVFusion/bevfusion/sparse_encoder.py b/mmde/projects/BEVFusion/bevfusion/sparse_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..68bf2bceba17ba7aa76ded01a9baaaf750d5799c
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/sparse_encoder.py
@@ -0,0 +1,151 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet3d.models.layers import make_sparse_convmodule
+from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+from mmdet3d.models.middle_encoders import SparseEncoder
+from mmdet3d.registry import MODELS
+
+if IS_SPCONV2_AVAILABLE:
+    from spconv.pytorch import SparseConvTensor
+else:
+    from mmcv.ops import SparseConvTensor
+
+
+@MODELS.register_module()
+class BEVFusionSparseEncoder(SparseEncoder):
+    r"""Sparse encoder for BEVFusion. The difference between this
+    implementation and that of ``SparseEncoder`` is that the shape order of 3D
+    conv is (H, W, D) in ``BEVFusionSparseEncoder`` rather than (D, H, W) in
+    ``SparseEncoder``. This difference comes from the implementation of
+    ``voxelization``.
+
+    Args:
+        in_channels (int): The number of input channels.
+        sparse_shape (list[int]): The sparse shape of input tensor.
+        order (list[str], optional): Order of conv module.
+            Defaults to ('conv', 'norm', 'act').
+        norm_cfg (dict, optional): Config of normalization layer. Defaults to
+            dict(type='BN1d', eps=1e-3, momentum=0.01).
+        base_channels (int, optional): Out channels for conv_input layer.
+            Defaults to 16.
+        output_channels (int, optional): Out channels for conv_out layer.
+            Defaults to 128.
+        encoder_channels (tuple[tuple[int]], optional):
+            Convolutional channels of each encode block.
+            Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)).
+        encoder_paddings (tuple[tuple[int]], optional):
+            Paddings of each encode block.
+            Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)).
+        block_type (str, optional): Type of the block to use.
+            Defaults to 'conv_module'.
+        return_middle_feats (bool): Whether output middle features.
+            Default to False.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 sparse_shape,
+                 order=('conv', 'norm', 'act'),
+                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 base_channels=16,
+                 output_channels=128,
+                 encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64,
+                                                                        64)),
+                 encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1,
+                                                                 1)),
+                 block_type='conv_module',
+                 return_middle_feats=False):
+        super(SparseEncoder, self).__init__()
+        assert block_type in ['conv_module', 'basicblock']
+        self.sparse_shape = sparse_shape
+        self.in_channels = in_channels
+        self.order = order
+        self.base_channels = base_channels
+        self.output_channels = output_channels
+        self.encoder_channels = encoder_channels
+        self.encoder_paddings = encoder_paddings
+        self.stage_num = len(self.encoder_channels)
+        self.fp16_enabled = False
+        self.return_middle_feats = return_middle_feats
+        # Spconv init all weight on its own
+
+        assert isinstance(order, tuple) and len(order) == 3
+        assert set(order) == {'conv', 'norm', 'act'}
+
+        if self.order[0] != 'conv':  # pre activate
+            self.conv_input = make_sparse_convmodule(
+                in_channels,
+                self.base_channels,
+                3,
+                norm_cfg=norm_cfg,
+                padding=1,
+                indice_key='subm1',
+                conv_type='SubMConv3d',
+                order=('conv', ))
+        else:  # post activate
+            self.conv_input = make_sparse_convmodule(
+                in_channels,
+                self.base_channels,
+                3,
+                norm_cfg=norm_cfg,
+                padding=1,
+                indice_key='subm1',
+                conv_type='SubMConv3d')
+
+        encoder_out_channels = self.make_encoder_layers(
+            make_sparse_convmodule,
+            norm_cfg,
+            self.base_channels,
+            block_type=block_type)
+
+        self.conv_out = make_sparse_convmodule(
+            encoder_out_channels,
+            self.output_channels,
+            kernel_size=(1, 1, 3),
+            stride=(1, 1, 2),
+            norm_cfg=norm_cfg,
+            padding=0,
+            indice_key='spconv_down2',
+            conv_type='SparseConv3d')
+
+    def forward(self, voxel_features, coors, batch_size):
+        """Forward of SparseEncoder.
+
+        Args:
+            voxel_features (torch.Tensor): Voxel features in shape (N, C).
+            coors (torch.Tensor): Coordinates in shape (N, 4),
+                the columns in the order of (batch_idx, z_idx, y_idx, x_idx).
+            batch_size (int): Batch size.
+
+        Returns:
+            torch.Tensor | tuple[torch.Tensor, list]: Return spatial features
+                include:
+
+            - spatial_features (torch.Tensor): Spatial features are out from
+                the last layer.
+            - encode_features (List[SparseConvTensor], optional): Middle layer
+                output features. When self.return_middle_feats is True, the
+                module returns middle features.
+        """
+        coors = coors.int()
+        input_sp_tensor = SparseConvTensor(voxel_features, coors,
+                                           self.sparse_shape, batch_size)
+        x = self.conv_input(input_sp_tensor)
+
+        encode_features = []
+        for encoder_layer in self.encoder_layers:
+            x = encoder_layer(x)
+            encode_features.append(x)
+
+        # for detection head
+        # [200, 176, 5] -> [200, 176, 2]
+        out = self.conv_out(encode_features[-1])
+        spatial_features = out.dense()
+
+        N, C, H, W, D = spatial_features.shape
+        spatial_features = spatial_features.permute(0, 1, 4, 2, 3).contiguous()
+        spatial_features = spatial_features.view(N, C * D, H, W)
+
+        if self.return_middle_feats:
+            return spatial_features, encode_features
+        else:
+            return spatial_features
diff --git a/mmde/projects/BEVFusion/bevfusion/transformer.py b/mmde/projects/BEVFusion/bevfusion/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b69d2c6a918cbd7f916d162e6a2a88e997ad67df
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/transformer.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.models import DetrTransformerDecoderLayer
+from torch import Tensor, nn
+
+from mmdet3d.registry import MODELS
+
+
+class PositionEncodingLearned(nn.Module):
+    """Absolute pos embedding, learned."""
+
+    def __init__(self, input_channel, num_pos_feats=288):
+        super().__init__()
+        self.position_embedding_head = nn.Sequential(
+            nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
+            nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True),
+            nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
+
+    def forward(self, xyz):
+        xyz = xyz.transpose(1, 2).contiguous()
+        position_embedding = self.position_embedding_head(xyz)
+        return position_embedding
+
+
+@MODELS.register_module()
+class TransformerDecoderLayer(DetrTransformerDecoderLayer):
+
+    def __init__(self,
+                 pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128),
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.self_posembed = PositionEncodingLearned(**pos_encoding_cfg)
+        self.cross_posembed = PositionEncodingLearned(**pos_encoding_cfg)
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor = None,
+                value: Tensor = None,
+                query_pos: Tensor = None,
+                key_pos: Tensor = None,
+                self_attn_mask: Tensor = None,
+                cross_attn_mask: Tensor = None,
+                key_padding_mask: Tensor = None,
+                **kwargs) -> Tensor:
+        """
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            key (Tensor, optional): The input key, has shape (bs, num_keys,
+                dim). If `None`, the `query` will be used. Defaults to `None`.
+            value (Tensor, optional): The input value, has the same shape as
+                `key`, as in `nn.MultiheadAttention.forward`. If `None`, the
+                `key` will be used. Defaults to `None`.
+            query_pos (Tensor, optional): The positional encoding for `query`,
+                has the same shape as `query`. If not `None`, it will be added
+                to `query` before forward function. Defaults to `None`.
+            key_pos (Tensor, optional): The positional encoding for `key`, has
+                the same shape as `key`. If not `None`, it will be added to
+                `key` before forward function. If None, and `query_pos` has the
+                same shape as `key`, then `query_pos` will be used for
+                `key_pos`. Defaults to None.
+            self_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            cross_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor, optional): The `key_padding_mask` of
+                `self_attn` input. ByteTensor, has shape (bs, num_value).
+                Defaults to None.
+
+        Returns:
+            Tensor: forwarded results, has shape (bs, num_queries, dim).
+        """
+        if self.self_posembed is not None and query_pos is not None:
+            query_pos = self.self_posembed(query_pos).transpose(1, 2)
+        else:
+            query_pos = None
+        if self.cross_posembed is not None and key_pos is not None:
+            key_pos = self.cross_posembed(key_pos).transpose(1, 2)
+        else:
+            key_pos = None
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        # Note that the `value` (equal to `query`) is encoded with `query_pos`.
+        # This is different from the standard DETR Decoder Layer.
+        query = self.self_attn(
+            query=query,
+            key=query,
+            value=query + query_pos,
+            query_pos=query_pos,
+            key_pos=query_pos,
+            attn_mask=self_attn_mask,
+            **kwargs)
+        query = self.norms[0](query)
+        # Note that the `value` (equal to `key`) is encoded with `key_pos`.
+        # This is different from the standard DETR Decoder Layer.
+        query = self.cross_attn(
+            query=query,
+            key=key,
+            value=key + key_pos,
+            query_pos=query_pos,
+            key_pos=key_pos,
+            attn_mask=cross_attn_mask,
+            key_padding_mask=key_padding_mask,
+            **kwargs)
+        query = self.norms[1](query)
+        query = self.ffn(query)
+        query = self.norms[2](query)
+
+        query = query.transpose(1, 2)
+        return query
diff --git a/mmde/projects/BEVFusion/bevfusion/transforms_3d.py b/mmde/projects/BEVFusion/bevfusion/transforms_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d2929512db357d5ab6f17cd4257086407cb3c6f
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/transforms_3d.py
@@ -0,0 +1,274 @@
+# modify from https://github.com/mit-han-lab/bevfusion
+from typing import Any, Dict
+
+import numpy as np
+import torch
+from mmcv.transforms import BaseTransform
+from PIL import Image
+
+from mmdet3d.datasets import GlobalRotScaleTrans
+from mmdet3d.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class ImageAug3D(BaseTransform):
+
+    def __init__(self, final_dim, resize_lim, bot_pct_lim, rot_lim, rand_flip,
+                 is_train):
+        self.final_dim = final_dim
+        self.resize_lim = resize_lim
+        self.bot_pct_lim = bot_pct_lim
+        self.rand_flip = rand_flip
+        self.rot_lim = rot_lim
+        self.is_train = is_train
+
+    def sample_augmentation(self, results):
+        H, W = results['ori_shape']
+        fH, fW = self.final_dim
+        if self.is_train:
+            resize = np.random.uniform(*self.resize_lim)
+            resize_dims = (int(W * resize), int(H * resize))
+            newW, newH = resize_dims
+            crop_h = int(
+                (1 - np.random.uniform(*self.bot_pct_lim)) * newH) - fH
+            crop_w = int(np.random.uniform(0, max(0, newW - fW)))
+            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
+            flip = False
+            if self.rand_flip and np.random.choice([0, 1]):
+                flip = True
+            rotate = np.random.uniform(*self.rot_lim)
+        else:
+            resize = np.mean(self.resize_lim)
+            resize_dims = (int(W * resize), int(H * resize))
+            newW, newH = resize_dims
+            crop_h = int((1 - np.mean(self.bot_pct_lim)) * newH) - fH
+            crop_w = int(max(0, newW - fW) / 2)
+            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
+            flip = False
+            rotate = 0
+        return resize, resize_dims, crop, flip, rotate
+
+    def img_transform(self, img, rotation, translation, resize, resize_dims,
+                      crop, flip, rotate):
+        # adjust image
+        img = Image.fromarray(img.astype('uint8'), mode='RGB')
+        img = img.resize(resize_dims)
+        img = img.crop(crop)
+        if flip:
+            img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
+        img = img.rotate(rotate)
+
+        # post-homography transformation
+        rotation *= resize
+        translation -= torch.Tensor(crop[:2])
+        if flip:
+            A = torch.Tensor([[-1, 0], [0, 1]])
+            b = torch.Tensor([crop[2] - crop[0], 0])
+            rotation = A.matmul(rotation)
+            translation = A.matmul(translation) + b
+        theta = rotate / 180 * np.pi
+        A = torch.Tensor([
+            [np.cos(theta), np.sin(theta)],
+            [-np.sin(theta), np.cos(theta)],
+        ])
+        b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2
+        b = A.matmul(-b) + b
+        rotation = A.matmul(rotation)
+        translation = A.matmul(translation) + b
+
+        return img, rotation, translation
+
+    def transform(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        imgs = data['img']
+        new_imgs = []
+        transforms = []
+        for img in imgs:
+            resize, resize_dims, crop, flip, rotate = self.sample_augmentation(
+                data)
+            post_rot = torch.eye(2)
+            post_tran = torch.zeros(2)
+            new_img, rotation, translation = self.img_transform(
+                img,
+                post_rot,
+                post_tran,
+                resize=resize,
+                resize_dims=resize_dims,
+                crop=crop,
+                flip=flip,
+                rotate=rotate,
+            )
+            transform = torch.eye(4)
+            transform[:2, :2] = rotation
+            transform[:2, 3] = translation
+            new_imgs.append(np.array(new_img).astype(np.float32))
+            transforms.append(transform.numpy())
+        data['img'] = new_imgs
+        # update the calibration matrices
+        data['img_aug_matrix'] = transforms
+        return data
+
+
+@TRANSFORMS.register_module()
+class BEVFusionRandomFlip3D:
+    """Compared with `RandomFlip3D`, this class directly records the lidar
+    augmentation matrix in the `data`."""
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        flip_horizontal = np.random.choice([0, 1])
+        flip_vertical = np.random.choice([0, 1])
+
+        rotation = np.eye(3)
+        if flip_horizontal:
+            rotation = np.array([[1, 0, 0], [0, -1, 0], [0, 0, 1]]) @ rotation
+            if 'points' in data:
+                data['points'].flip('horizontal')
+            if 'gt_bboxes_3d' in data:
+                data['gt_bboxes_3d'].flip('horizontal')
+            if 'gt_masks_bev' in data:
+                data['gt_masks_bev'] = data['gt_masks_bev'][:, :, ::-1].copy()
+
+        if flip_vertical:
+            rotation = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) @ rotation
+            if 'points' in data:
+                data['points'].flip('vertical')
+            if 'gt_bboxes_3d' in data:
+                data['gt_bboxes_3d'].flip('vertical')
+            if 'gt_masks_bev' in data:
+                data['gt_masks_bev'] = data['gt_masks_bev'][:, ::-1, :].copy()
+
+        if 'lidar_aug_matrix' not in data:
+            data['lidar_aug_matrix'] = np.eye(4)
+        data['lidar_aug_matrix'][:3, :] = rotation @ data[
+            'lidar_aug_matrix'][:3, :]
+        return data
+
+
+@TRANSFORMS.register_module()
+class BEVFusionGlobalRotScaleTrans(GlobalRotScaleTrans):
+    """Compared with `GlobalRotScaleTrans`, the augmentation order in this
+    class is rotation, translation and scaling (RTS)."""
+
+    def transform(self, input_dict: dict) -> dict:
+        """Private function to rotate, scale and translate bounding boxes and
+        points.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after scaling, 'points', 'pcd_rotation',
+            'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` are updated
+            in the result dict.
+        """
+        if 'transformation_3d_flow' not in input_dict:
+            input_dict['transformation_3d_flow'] = []
+
+        self._rot_bbox_points(input_dict)
+
+        if 'pcd_scale_factor' not in input_dict:
+            self._random_scale(input_dict)
+        self._trans_bbox_points(input_dict)
+        self._scale_bbox_points(input_dict)
+
+        input_dict['transformation_3d_flow'].extend(['R', 'T', 'S'])
+
+        lidar_augs = np.eye(4)
+        lidar_augs[:3, :3] = input_dict['pcd_rotation'].T * input_dict[
+            'pcd_scale_factor']
+        lidar_augs[:3, 3] = input_dict['pcd_trans'] * \
+            input_dict['pcd_scale_factor']
+
+        if 'lidar_aug_matrix' not in input_dict:
+            input_dict['lidar_aug_matrix'] = np.eye(4)
+        input_dict[
+            'lidar_aug_matrix'] = lidar_augs @ input_dict['lidar_aug_matrix']
+
+        return input_dict
+
+
+@TRANSFORMS.register_module()
+class GridMask(BaseTransform):
+
+    def __init__(
+        self,
+        use_h,
+        use_w,
+        max_epoch,
+        rotate=1,
+        offset=False,
+        ratio=0.5,
+        mode=0,
+        prob=1.0,
+        fixed_prob=False,
+    ):
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode = mode
+        self.st_prob = prob
+        self.prob = prob
+        self.epoch = None
+        self.max_epoch = max_epoch
+        self.fixed_prob = fixed_prob
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+        if not self.fixed_prob:
+            self.set_prob(self.epoch, self.max_epoch)
+
+    def set_prob(self, epoch, max_epoch):
+        self.prob = self.st_prob * self.epoch / self.max_epoch
+
+    def transform(self, results):
+        if np.random.rand() > self.prob:
+            return results
+        imgs = results['img']
+        h = imgs[0].shape[0]
+        w = imgs[0].shape[1]
+        self.d1 = 2
+        self.d2 = min(h, w)
+        hh = int(1.5 * h)
+        ww = int(1.5 * w)
+        d = np.random.randint(self.d1, self.d2)
+        if self.ratio == 1:
+            self.length = np.random.randint(1, d)
+        else:
+            self.length = min(max(int(d * self.ratio + 0.5), 1), d - 1)
+        mask = np.ones((hh, ww), np.float32)
+        st_h = np.random.randint(d)
+        st_w = np.random.randint(d)
+        if self.use_h:
+            for i in range(hh // d):
+                s = d * i + st_h
+                t = min(s + self.length, hh)
+                mask[s:t, :] *= 0
+        if self.use_w:
+            for i in range(ww // d):
+                s = d * i + st_w
+                t = min(s + self.length, ww)
+                mask[:, s:t] *= 0
+
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+        mask = mask.rotate(r)
+        mask = np.asarray(mask)
+        mask = mask[(hh - h) // 2:(hh - h) // 2 + h,
+                    (ww - w) // 2:(ww - w) // 2 + w]
+
+        mask = mask.astype(np.float32)
+        mask = mask[:, :, None]
+        if self.mode == 1:
+            mask = 1 - mask
+
+        # mask = mask.expand_as(imgs[0])
+        if self.offset:
+            offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float()
+            offset = (1 - mask) * offset
+            imgs = [x * mask + offset for x in imgs]
+        else:
+            imgs = [x * mask for x in imgs]
+
+        results.update(img=imgs)
+        return results
diff --git a/mmde/projects/BEVFusion/bevfusion/transfusion_head.py b/mmde/projects/BEVFusion/bevfusion/transfusion_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a3e1750db90d1f05ffaa8974829d44967803024
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/transfusion_head.py
@@ -0,0 +1,872 @@
+# modify from https://github.com/mit-han-lab/bevfusion
+import copy
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, build_conv_layer
+from mmdet.models.task_modules import (AssignResult, PseudoSampler,
+                                       build_assigner, build_bbox_coder,
+                                       build_sampler)
+from mmdet.models.utils import multi_apply
+from mmengine.structures import InstanceData
+from torch import nn
+
+from mmdet3d.models import circle_nms, draw_heatmap_gaussian, gaussian_radius
+from mmdet3d.models.dense_heads.centerpoint_head import SeparateHead
+from mmdet3d.models.layers import nms_bev
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import xywhr2xyxyr
+
+
+def clip_sigmoid(x, eps=1e-4):
+    y = torch.clamp(x.sigmoid_(), min=eps, max=1 - eps)
+    return y
+
+
+@MODELS.register_module()
+class ConvFuser(nn.Sequential):
+
+    def __init__(self, in_channels: int, out_channels: int) -> None:
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        super().__init__(
+            nn.Conv2d(
+                sum(in_channels), out_channels, 3, padding=1, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(True),
+        )
+
+    def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor:
+        return super().forward(torch.cat(inputs, dim=1))
+
+
+@MODELS.register_module()
+class TransFusionHead(nn.Module):
+
+    def __init__(
+        self,
+        num_proposals=128,
+        auxiliary=True,
+        in_channels=128 * 3,
+        hidden_channel=128,
+        num_classes=4,
+        # config for Transformer
+        num_decoder_layers=3,
+        decoder_layer=dict(),
+        num_heads=8,
+        nms_kernel_size=1,
+        bn_momentum=0.1,
+        # config for FFN
+        common_heads=dict(),
+        num_heatmap_convs=2,
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        bias='auto',
+        # loss
+        loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(type='mmdet.L1Loss', reduction='mean'),
+        loss_heatmap=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
+        # others
+        train_cfg=None,
+        test_cfg=None,
+        bbox_coder=None,
+    ):
+        super(TransFusionHead, self).__init__()
+
+        self.num_classes = num_classes
+        self.num_proposals = num_proposals
+        self.auxiliary = auxiliary
+        self.in_channels = in_channels
+        self.num_heads = num_heads
+        self.num_decoder_layers = num_decoder_layers
+        self.bn_momentum = bn_momentum
+        self.nms_kernel_size = nms_kernel_size
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if not self.use_sigmoid_cls:
+            self.num_classes += 1
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.loss_heatmap = MODELS.build(loss_heatmap)
+
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.sampling = False
+
+        # a shared convolution
+        self.shared_conv = build_conv_layer(
+            dict(type='Conv2d'),
+            in_channels,
+            hidden_channel,
+            kernel_size=3,
+            padding=1,
+            bias=bias,
+        )
+
+        layers = []
+        layers.append(
+            ConvModule(
+                hidden_channel,
+                hidden_channel,
+                kernel_size=3,
+                padding=1,
+                bias=bias,
+                conv_cfg=dict(type='Conv2d'),
+                norm_cfg=dict(type='BN2d'),
+            ))
+        layers.append(
+            build_conv_layer(
+                dict(type='Conv2d'),
+                hidden_channel,
+                num_classes,
+                kernel_size=3,
+                padding=1,
+                bias=bias,
+            ))
+        self.heatmap_head = nn.Sequential(*layers)
+        self.class_encoding = nn.Conv1d(num_classes, hidden_channel, 1)
+
+        # transformer decoder layers for object query with LiDAR feature
+        self.decoder = nn.ModuleList()
+        for i in range(self.num_decoder_layers):
+            self.decoder.append(MODELS.build(decoder_layer))
+
+        # Prediction Head
+        self.prediction_heads = nn.ModuleList()
+        for i in range(self.num_decoder_layers):
+            heads = copy.deepcopy(common_heads)
+            heads.update(dict(heatmap=(self.num_classes, num_heatmap_convs)))
+            self.prediction_heads.append(
+                SeparateHead(
+                    hidden_channel,
+                    heads,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    bias=bias,
+                ))
+
+        self.init_weights()
+        self._init_assigner_sampler()
+
+        # Position Embedding for Cross-Attention, which is re-used during training # noqa: E501
+        x_size = self.test_cfg['grid_size'][0] // self.test_cfg[
+            'out_size_factor']
+        y_size = self.test_cfg['grid_size'][1] // self.test_cfg[
+            'out_size_factor']
+        self.bev_pos = self.create_2D_grid(x_size, y_size)
+
+        self.img_feat_pos = None
+        self.img_feat_collapsed_pos = None
+
+    def create_2D_grid(self, x_size, y_size):
+        meshgrid = [[0, x_size - 1, x_size], [0, y_size - 1, y_size]]
+        # NOTE: modified
+        batch_x, batch_y = torch.meshgrid(
+            *[torch.linspace(it[0], it[1], it[2]) for it in meshgrid])
+        batch_x = batch_x + 0.5
+        batch_y = batch_y + 0.5
+        coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0)[None]
+        coord_base = coord_base.view(1, 2, -1).permute(0, 2, 1)
+        return coord_base
+
+    def init_weights(self):
+        # initialize transformer
+        for m in self.decoder.parameters():
+            if m.dim() > 1:
+                nn.init.xavier_uniform_(m)
+        if hasattr(self, 'query'):
+            nn.init.xavier_normal_(self.query)
+        self.init_bn_momentum()
+
+    def init_bn_momentum(self):
+        for m in self.modules():
+            if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
+                m.momentum = self.bn_momentum
+
+    def _init_assigner_sampler(self):
+        """Initialize the target assigner and sampler of the head."""
+        if self.train_cfg is None:
+            return
+
+        if self.sampling:
+            self.bbox_sampler = build_sampler(self.train_cfg.sampler)
+        else:
+            self.bbox_sampler = PseudoSampler()
+        if isinstance(self.train_cfg.assigner, dict):
+            self.bbox_assigner = build_assigner(self.train_cfg.assigner)
+        elif isinstance(self.train_cfg.assigner, list):
+            self.bbox_assigner = [
+                build_assigner(res) for res in self.train_cfg.assigner
+            ]
+
+    def forward_single(self, inputs, metas):
+        """Forward function for CenterPoint.
+        Args:
+            inputs (torch.Tensor): Input feature map with the shape of
+                [B, 512, 128(H), 128(W)]. (consistent with L748)
+        Returns:
+            list[dict]: Output results for tasks.
+        """
+        batch_size = inputs.shape[0]
+        fusion_feat = self.shared_conv(inputs)
+
+        #################################
+        # image to BEV
+        #################################
+        fusion_feat_flatten = fusion_feat.view(batch_size,
+                                               fusion_feat.shape[1],
+                                               -1)  # [BS, C, H*W]
+        bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(fusion_feat.device)
+
+        #################################
+        # query initialization
+        #################################
+        with torch.autocast('cuda', enabled=False):
+            dense_heatmap = self.heatmap_head(fusion_feat.float())
+        heatmap = dense_heatmap.detach().sigmoid()
+        padding = self.nms_kernel_size // 2
+        local_max = torch.zeros_like(heatmap)
+        # equals to nms radius = voxel_size * out_size_factor * kenel_size
+        local_max_inner = F.max_pool2d(
+            heatmap, kernel_size=self.nms_kernel_size, stride=1, padding=0)
+        local_max[:, :, padding:(-padding),
+                  padding:(-padding)] = local_max_inner
+        # for Pedestrian & Traffic_cone in nuScenes
+        if self.test_cfg['dataset'] == 'nuScenes':
+            local_max[:, 8, ] = F.max_pool2d(
+                heatmap[:, 8], kernel_size=1, stride=1, padding=0)
+            local_max[:, 9, ] = F.max_pool2d(
+                heatmap[:, 9], kernel_size=1, stride=1, padding=0)
+        elif self.test_cfg[
+                'dataset'] == 'Waymo':  # for Pedestrian & Cyclist in Waymo
+            local_max[:, 1, ] = F.max_pool2d(
+                heatmap[:, 1], kernel_size=1, stride=1, padding=0)
+            local_max[:, 2, ] = F.max_pool2d(
+                heatmap[:, 2], kernel_size=1, stride=1, padding=0)
+        heatmap = heatmap * (heatmap == local_max)
+        heatmap = heatmap.view(batch_size, heatmap.shape[1], -1)
+
+        # top num_proposals among all classes
+        top_proposals = heatmap.view(batch_size, -1).argsort(
+            dim=-1, descending=True)[..., :self.num_proposals]
+        top_proposals_class = top_proposals // heatmap.shape[-1]
+        top_proposals_index = top_proposals % heatmap.shape[-1]
+        query_feat = fusion_feat_flatten.gather(
+            index=top_proposals_index[:, None, :].expand(
+                -1, fusion_feat_flatten.shape[1], -1),
+            dim=-1,
+        )
+        self.query_labels = top_proposals_class
+
+        # add category embedding
+        one_hot = F.one_hot(
+            top_proposals_class,
+            num_classes=self.num_classes).permute(0, 2, 1)
+        query_cat_encoding = self.class_encoding(one_hot.float())
+        query_feat += query_cat_encoding
+
+        query_pos = bev_pos.gather(
+            index=top_proposals_index[:, None, :].permute(0, 2, 1).expand(
+                -1, -1, bev_pos.shape[-1]),
+            dim=1,
+        )
+        #################################
+        # transformer decoder layer (Fusion feature as K,V)
+        #################################
+        ret_dicts = []
+        for i in range(self.num_decoder_layers):
+            # Transformer Decoder Layer
+            # :param query: B C Pq    :param query_pos: B Pq 3/6
+            query_feat = self.decoder[i](
+                query_feat,
+                key=fusion_feat_flatten,
+                query_pos=query_pos,
+                key_pos=bev_pos)
+
+            # Prediction
+            res_layer = self.prediction_heads[i](query_feat)
+            res_layer['center'] = res_layer['center'] + query_pos.permute(
+                0, 2, 1)
+            ret_dicts.append(res_layer)
+
+            # for next level positional embedding
+            query_pos = res_layer['center'].detach().clone().permute(0, 2, 1)
+
+        ret_dicts[0]['query_heatmap_score'] = heatmap.gather(
+            index=top_proposals_index[:,
+                                      None, :].expand(-1, self.num_classes,
+                                                      -1),
+            dim=-1,
+        )  # [bs, num_classes, num_proposals]
+        ret_dicts[0]['dense_heatmap'] = dense_heatmap
+
+        if self.auxiliary is False:
+            # only return the results of last decoder layer
+            return [ret_dicts[-1]]
+
+        # return all the layer's results for auxiliary superivison
+        new_res = {}
+        for key in ret_dicts[0].keys():
+            if key not in [
+                    'dense_heatmap', 'dense_heatmap_old', 'query_heatmap_score'
+            ]:
+                new_res[key] = torch.cat(
+                    [ret_dict[key] for ret_dict in ret_dicts], dim=-1)
+            else:
+                new_res[key] = ret_dicts[0][key]
+        return [new_res]
+
+    def forward(self, feats, metas):
+        """Forward pass.
+
+        Args:
+            feats (list[torch.Tensor]): Multi-level features, e.g.,
+                features produced by FPN.
+        Returns:
+            tuple(list[dict]): Output results. first index by level, second
+            index by layer
+        """
+        if isinstance(feats, torch.Tensor):
+            feats = [feats]
+        res = multi_apply(self.forward_single, feats, [metas])
+        assert len(res) == 1, 'only support one level features.'
+        return res
+
+    def predict(self, batch_feats, batch_input_metas):
+        preds_dicts = self(batch_feats, batch_input_metas)
+        res = self.predict_by_feat(preds_dicts, batch_input_metas)
+        return res
+
+    def predict_by_feat(self,
+                        preds_dicts,
+                        metas,
+                        img=None,
+                        rescale=False,
+                        for_roi=False):
+        """Generate bboxes from bbox head predictions.
+
+        Args:
+            preds_dicts (tuple[list[dict]]): Prediction results.
+        Returns:
+            list[list[dict]]: Decoded bbox, scores and labels for each layer
+            & each batch.
+        """
+        rets = []
+        for layer_id, preds_dict in enumerate(preds_dicts):
+            batch_size = preds_dict[0]['heatmap'].shape[0]
+            batch_score = preds_dict[0]['heatmap'][
+                ..., -self.num_proposals:].sigmoid()
+            # if self.loss_iou.loss_weight != 0:
+            #    batch_score = torch.sqrt(batch_score * preds_dict[0]['iou'][..., -self.num_proposals:].sigmoid()) # noqa: E501
+            one_hot = F.one_hot(
+                self.query_labels,
+                num_classes=self.num_classes).permute(0, 2, 1)
+            batch_score = batch_score * preds_dict[0][
+                'query_heatmap_score'] * one_hot
+
+            batch_center = preds_dict[0]['center'][..., -self.num_proposals:]
+            batch_height = preds_dict[0]['height'][..., -self.num_proposals:]
+            batch_dim = preds_dict[0]['dim'][..., -self.num_proposals:]
+            batch_rot = preds_dict[0]['rot'][..., -self.num_proposals:]
+            batch_vel = None
+            if 'vel' in preds_dict[0]:
+                batch_vel = preds_dict[0]['vel'][..., -self.num_proposals:]
+
+            temp = self.bbox_coder.decode(
+                batch_score,
+                batch_rot,
+                batch_dim,
+                batch_center,
+                batch_height,
+                batch_vel,
+                filter=True,
+            )
+
+            if self.test_cfg['dataset'] == 'nuScenes':
+                self.tasks = [
+                    dict(
+                        num_class=8,
+                        class_names=[],
+                        indices=[0, 1, 2, 3, 4, 5, 6, 7],
+                        radius=-1,
+                    ),
+                    dict(
+                        num_class=1,
+                        class_names=['pedestrian'],
+                        indices=[8],
+                        radius=0.175,
+                    ),
+                    dict(
+                        num_class=1,
+                        class_names=['traffic_cone'],
+                        indices=[9],
+                        radius=0.175,
+                    ),
+                ]
+            elif self.test_cfg['dataset'] == 'Waymo':
+                self.tasks = [
+                    dict(
+                        num_class=1,
+                        class_names=['Car'],
+                        indices=[0],
+                        radius=0.7),
+                    dict(
+                        num_class=1,
+                        class_names=['Pedestrian'],
+                        indices=[1],
+                        radius=0.7),
+                    dict(
+                        num_class=1,
+                        class_names=['Cyclist'],
+                        indices=[2],
+                        radius=0.7),
+                ]
+
+            ret_layer = []
+            for i in range(batch_size):
+                boxes3d = temp[i]['bboxes']
+                scores = temp[i]['scores']
+                labels = temp[i]['labels']
+                # adopt circle nms for different categories
+                if self.test_cfg['nms_type'] is not None:
+                    keep_mask = torch.zeros_like(scores)
+                    for task in self.tasks:
+                        task_mask = torch.zeros_like(scores)
+                        for cls_idx in task['indices']:
+                            task_mask += labels == cls_idx
+                        task_mask = task_mask.bool()
+                        if task['radius'] > 0:
+                            if self.test_cfg['nms_type'] == 'circle':
+                                boxes_for_nms = torch.cat(
+                                    [
+                                        boxes3d[task_mask][:, :2],
+                                        scores[:, None][task_mask],
+                                    ],
+                                    dim=1,
+                                )
+                                task_keep_indices = torch.tensor(
+                                    circle_nms(
+                                        boxes_for_nms.detach().cpu().numpy(),
+                                        task['radius'],
+                                    ))
+                            else:
+                                boxes_for_nms = xywhr2xyxyr(
+                                    metas[i]['box_type_3d'](
+                                        boxes3d[task_mask][:, :7], 7).bev)
+                                top_scores = scores[task_mask]
+                                task_keep_indices = nms_bev(
+                                    boxes_for_nms,
+                                    top_scores,
+                                    thresh=task['radius'],
+                                    pre_maxsize=self.test_cfg['pre_maxsize'],
+                                    post_max_size=self.
+                                    test_cfg['post_maxsize'],
+                                )
+                        else:
+                            task_keep_indices = torch.arange(task_mask.sum())
+                        if task_keep_indices.shape[0] != 0:
+                            keep_indices = torch.where(
+                                task_mask != 0)[0][task_keep_indices]
+                            keep_mask[keep_indices] = 1
+                    keep_mask = keep_mask.bool()
+                    ret = dict(
+                        bboxes=boxes3d[keep_mask],
+                        scores=scores[keep_mask],
+                        labels=labels[keep_mask],
+                    )
+                else:  # no nms
+                    ret = dict(bboxes=boxes3d, scores=scores, labels=labels)
+
+                temp_instances = InstanceData()
+                temp_instances.bboxes_3d = metas[0]['box_type_3d'](
+                    ret['bboxes'], box_dim=ret['bboxes'].shape[-1])
+                temp_instances.scores_3d = ret['scores']
+                temp_instances.labels_3d = ret['labels'].int()
+
+                ret_layer.append(temp_instances)
+
+            rets.append(ret_layer)
+        assert len(
+            rets
+        ) == 1, f'only support one layer now, but get {len(rets)} layers'
+
+        return rets[0]
+
+    def get_targets(self, batch_gt_instances_3d: List[InstanceData],
+                    preds_dict: List[dict]):
+        """Generate training targets.
+        Args:
+            batch_gt_instances_3d (List[InstanceData]):
+            preds_dict (list[dict]): The prediction results. The index of the
+                list is the index of layers. The inner dict contains
+                predictions of one mini-batch:
+                - center: (bs, 2, num_proposals)
+                - height: (bs, 1, num_proposals)
+                - dim: (bs, 3, num_proposals)
+                - rot: (bs, 2, num_proposals)
+                - vel: (bs, 2, num_proposals)
+                - cls_logit: (bs, num_classes, num_proposals)
+                - query_score: (bs, num_classes, num_proposals)
+                - heatmap: The original heatmap before fed into transformer
+                    decoder, with shape (bs, 10, h, w)
+        Returns:
+            tuple[torch.Tensor]: Tuple of target including \
+                the following results in order.
+                - torch.Tensor: classification target.  [BS, num_proposals]
+                - torch.Tensor: classification weights (mask)
+                    [BS, num_proposals]
+                - torch.Tensor: regression target. [BS, num_proposals, 8]
+                - torch.Tensor: regression weights. [BS, num_proposals, 8]
+        """
+        # change preds_dict into list of dict (index by batch_id)
+        # preds_dict[0]['center'].shape [bs, 3, num_proposal]
+        list_of_pred_dict = []
+        for batch_idx in range(len(batch_gt_instances_3d)):
+            pred_dict = {}
+            for key in preds_dict[0].keys():
+                preds = []
+                for i in range(self.num_decoder_layers):
+                    pred_one_layer = preds_dict[i][key][batch_idx:batch_idx +
+                                                        1]
+                    preds.append(pred_one_layer)
+                pred_dict[key] = torch.cat(preds)
+            list_of_pred_dict.append(pred_dict)
+
+        assert len(batch_gt_instances_3d) == len(list_of_pred_dict)
+        res_tuple = multi_apply(
+            self.get_targets_single,
+            batch_gt_instances_3d,
+            list_of_pred_dict,
+            np.arange(len(batch_gt_instances_3d)),
+        )
+        labels = torch.cat(res_tuple[0], dim=0)
+        label_weights = torch.cat(res_tuple[1], dim=0)
+        bbox_targets = torch.cat(res_tuple[2], dim=0)
+        bbox_weights = torch.cat(res_tuple[3], dim=0)
+        ious = torch.cat(res_tuple[4], dim=0)
+        num_pos = np.sum(res_tuple[5])
+        matched_ious = np.mean(res_tuple[6])
+        heatmap = torch.cat(res_tuple[7], dim=0)
+        return (
+            labels,
+            label_weights,
+            bbox_targets,
+            bbox_weights,
+            ious,
+            num_pos,
+            matched_ious,
+            heatmap,
+        )
+
+    def get_targets_single(self, gt_instances_3d, preds_dict, batch_idx):
+        """Generate training targets for a single sample.
+        Args:
+            gt_instances_3d (:obj:`InstanceData`): ground truth of instances.
+            preds_dict (dict): dict of prediction result for a single sample.
+        Returns:
+            tuple[torch.Tensor]: Tuple of target including \
+                the following results in order.
+                - torch.Tensor: classification target.  [1, num_proposals]
+                - torch.Tensor: classification weights (mask) [1,
+                    num_proposals] # noqa: E501
+                - torch.Tensor: regression target. [1, num_proposals, 8]
+                - torch.Tensor: regression weights. [1, num_proposals, 8]
+                - torch.Tensor: iou target. [1, num_proposals]
+                - int: number of positive proposals
+                - torch.Tensor: heatmap targets.
+        """
+        # 1. Assignment
+        gt_bboxes_3d = gt_instances_3d.bboxes_3d
+        gt_labels_3d = gt_instances_3d.labels_3d
+        num_proposals = preds_dict['center'].shape[-1]
+
+        # get pred boxes, carefully ! don't change the network outputs
+        score = copy.deepcopy(preds_dict['heatmap'].detach())
+        center = copy.deepcopy(preds_dict['center'].detach())
+        height = copy.deepcopy(preds_dict['height'].detach())
+        dim = copy.deepcopy(preds_dict['dim'].detach())
+        rot = copy.deepcopy(preds_dict['rot'].detach())
+        if 'vel' in preds_dict.keys():
+            vel = copy.deepcopy(preds_dict['vel'].detach())
+        else:
+            vel = None
+
+        boxes_dict = self.bbox_coder.decode(
+            score, rot, dim, center, height,
+            vel)  # decode the prediction to real world metric bbox
+        bboxes_tensor = boxes_dict[0]['bboxes']
+        gt_bboxes_tensor = gt_bboxes_3d.tensor.to(score.device)
+        # each layer should do label assign separately.
+        if self.auxiliary:
+            num_layer = self.num_decoder_layers
+        else:
+            num_layer = 1
+
+        assign_result_list = []
+        for idx_layer in range(num_layer):
+            bboxes_tensor_layer = bboxes_tensor[self.num_proposals *
+                                                idx_layer:self.num_proposals *
+                                                (idx_layer + 1), :]
+            score_layer = score[..., self.num_proposals *
+                                idx_layer:self.num_proposals *
+                                (idx_layer + 1), ]
+
+            if self.train_cfg.assigner.type == 'HungarianAssigner3D':
+                assign_result = self.bbox_assigner.assign(
+                    bboxes_tensor_layer,
+                    gt_bboxes_tensor,
+                    gt_labels_3d,
+                    score_layer,
+                    self.train_cfg,
+                )
+            elif self.train_cfg.assigner.type == 'HeuristicAssigner':
+                assign_result = self.bbox_assigner.assign(
+                    bboxes_tensor_layer,
+                    gt_bboxes_tensor,
+                    None,
+                    gt_labels_3d,
+                    self.query_labels[batch_idx],
+                )
+            else:
+                raise NotImplementedError
+            assign_result_list.append(assign_result)
+
+        # combine assign result of each layer
+        assign_result_ensemble = AssignResult(
+            num_gts=sum([res.num_gts for res in assign_result_list]),
+            gt_inds=torch.cat([res.gt_inds for res in assign_result_list]),
+            max_overlaps=torch.cat(
+                [res.max_overlaps for res in assign_result_list]),
+            labels=torch.cat([res.labels for res in assign_result_list]),
+        )
+
+        # 2. Sampling. Compatible with the interface of `PseudoSampler` in
+        # mmdet.
+        gt_instances, pred_instances = InstanceData(
+            bboxes=gt_bboxes_tensor), InstanceData(priors=bboxes_tensor)
+        sampling_result = self.bbox_sampler.sample(assign_result_ensemble,
+                                                   pred_instances,
+                                                   gt_instances)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        assert len(pos_inds) + len(neg_inds) == num_proposals
+
+        # 3. Create target for loss computation
+        bbox_targets = torch.zeros([num_proposals, self.bbox_coder.code_size
+                                    ]).to(center.device)
+        bbox_weights = torch.zeros([num_proposals, self.bbox_coder.code_size
+                                    ]).to(center.device)
+        ious = assign_result_ensemble.max_overlaps
+        ious = torch.clamp(ious, min=0.0, max=1.0)
+        labels = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long)
+        label_weights = bboxes_tensor.new_zeros(
+            num_proposals, dtype=torch.long)
+
+        if gt_labels_3d is not None:  # default label is -1
+            labels += self.num_classes
+
+        # both pos and neg have classification loss, only pos has regression
+        # and iou loss
+        if len(pos_inds) > 0:
+            pos_bbox_targets = self.bbox_coder.encode(
+                sampling_result.pos_gt_bboxes)
+
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+            bbox_weights[pos_inds, :] = 1.0
+
+            if gt_labels_3d is None:
+                labels[pos_inds] = 1
+            else:
+                labels[pos_inds] = gt_labels_3d[
+                    sampling_result.pos_assigned_gt_inds]
+            if self.train_cfg.pos_weight <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg.pos_weight
+
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # # compute dense heatmap targets
+        device = labels.device
+        gt_bboxes_3d = torch.cat(
+            [gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]],
+            dim=1).to(device)
+        grid_size = torch.tensor(self.train_cfg['grid_size'])
+        pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
+        voxel_size = torch.tensor(self.train_cfg['voxel_size'])
+        feature_map_size = (grid_size[:2] // self.train_cfg['out_size_factor']
+                            )  # [x_len, y_len]
+        heatmap = gt_bboxes_3d.new_zeros(self.num_classes, feature_map_size[1],
+                                         feature_map_size[0])
+        for idx in range(len(gt_bboxes_3d)):
+            width = gt_bboxes_3d[idx][3]
+            length = gt_bboxes_3d[idx][4]
+            width = width / voxel_size[0] / self.train_cfg['out_size_factor']
+            length = length / voxel_size[1] / self.train_cfg['out_size_factor']
+            if width > 0 and length > 0:
+                radius = gaussian_radius(
+                    (length, width),
+                    min_overlap=self.train_cfg['gaussian_overlap'])
+                radius = max(self.train_cfg['min_radius'], int(radius))
+                x, y = gt_bboxes_3d[idx][0], gt_bboxes_3d[idx][1]
+
+                coor_x = ((x - pc_range[0]) / voxel_size[0] /
+                          self.train_cfg['out_size_factor'])
+                coor_y = ((y - pc_range[1]) / voxel_size[1] /
+                          self.train_cfg['out_size_factor'])
+
+                center = torch.tensor([coor_x, coor_y],
+                                      dtype=torch.float32,
+                                      device=device)
+                center_int = center.to(torch.int32)
+
+                # original
+                # draw_heatmap_gaussian(heatmap[gt_labels_3d[idx]], center_int, radius) # noqa: E501
+                # NOTE: fix
+                draw_heatmap_gaussian(heatmap[gt_labels_3d[idx]],
+                                      center_int[[1, 0]], radius)
+
+        mean_iou = ious[pos_inds].sum() / max(len(pos_inds), 1)
+        return (
+            labels[None],
+            label_weights[None],
+            bbox_targets[None],
+            bbox_weights[None],
+            ious[None],
+            int(pos_inds.shape[0]),
+            float(mean_iou),
+            heatmap[None],
+        )
+
+    def loss(self, batch_feats, batch_data_samples):
+        """Loss function for CenterHead.
+
+        Args:
+            batch_feats (): Features in a batch.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+        Returns:
+            dict[str:torch.Tensor]: Loss of heatmap and bbox of each task.
+        """
+        batch_input_metas, batch_gt_instances_3d = [], []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+        preds_dicts = self(batch_feats, batch_input_metas)
+        loss = self.loss_by_feat(preds_dicts, batch_gt_instances_3d)
+
+        return loss
+
+    def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
+                     batch_gt_instances_3d: List[InstanceData], *args,
+                     **kwargs):
+        (
+            labels,
+            label_weights,
+            bbox_targets,
+            bbox_weights,
+            ious,
+            num_pos,
+            matched_ious,
+            heatmap,
+        ) = self.get_targets(batch_gt_instances_3d, preds_dicts[0])
+        if hasattr(self, 'on_the_image_mask'):
+            label_weights = label_weights * self.on_the_image_mask
+            bbox_weights = bbox_weights * self.on_the_image_mask[:, :, None]
+            num_pos = bbox_weights.max(-1).values.sum()
+        preds_dict = preds_dicts[0][0]
+        loss_dict = dict()
+
+        # compute heatmap loss
+        loss_heatmap = self.loss_heatmap(
+            clip_sigmoid(preds_dict['dense_heatmap']).float(),
+            heatmap.float(),
+            avg_factor=max(heatmap.eq(1).float().sum().item(), 1),
+        )
+        loss_dict['loss_heatmap'] = loss_heatmap
+
+        # compute loss for each layer
+        for idx_layer in range(
+                self.num_decoder_layers if self.auxiliary else 1):
+            if idx_layer == self.num_decoder_layers - 1 or (
+                    idx_layer == 0 and self.auxiliary is False):
+                prefix = 'layer_-1'
+            else:
+                prefix = f'layer_{idx_layer}'
+
+            layer_labels = labels[..., idx_layer *
+                                  self.num_proposals:(idx_layer + 1) *
+                                  self.num_proposals, ].reshape(-1)
+            layer_label_weights = label_weights[
+                ..., idx_layer * self.num_proposals:(idx_layer + 1) *
+                self.num_proposals, ].reshape(-1)
+            layer_score = preds_dict['heatmap'][..., idx_layer *
+                                                self.num_proposals:(idx_layer +
+                                                                    1) *
+                                                self.num_proposals, ]
+            layer_cls_score = layer_score.permute(0, 2, 1).reshape(
+                -1, self.num_classes)
+            layer_loss_cls = self.loss_cls(
+                layer_cls_score.float(),
+                layer_labels,
+                layer_label_weights,
+                avg_factor=max(num_pos, 1),
+            )
+
+            layer_center = preds_dict['center'][..., idx_layer *
+                                                self.num_proposals:(idx_layer +
+                                                                    1) *
+                                                self.num_proposals, ]
+            layer_height = preds_dict['height'][..., idx_layer *
+                                                self.num_proposals:(idx_layer +
+                                                                    1) *
+                                                self.num_proposals, ]
+            layer_rot = preds_dict['rot'][..., idx_layer *
+                                          self.num_proposals:(idx_layer + 1) *
+                                          self.num_proposals, ]
+            layer_dim = preds_dict['dim'][..., idx_layer *
+                                          self.num_proposals:(idx_layer + 1) *
+                                          self.num_proposals, ]
+            preds = torch.cat(
+                [layer_center, layer_height, layer_dim, layer_rot],
+                dim=1).permute(0, 2, 1)  # [BS, num_proposals, code_size]
+            if 'vel' in preds_dict.keys():
+                layer_vel = preds_dict['vel'][..., idx_layer *
+                                              self.num_proposals:(idx_layer +
+                                                                  1) *
+                                              self.num_proposals, ]
+                preds = torch.cat([
+                    layer_center, layer_height, layer_dim, layer_rot, layer_vel
+                ],
+                                  dim=1).permute(
+                                      0, 2,
+                                      1)  # [BS, num_proposals, code_size]
+            code_weights = self.train_cfg.get('code_weights', None)
+            layer_bbox_weights = bbox_weights[:, idx_layer *
+                                              self.num_proposals:(idx_layer +
+                                                                  1) *
+                                              self.num_proposals, :, ]
+            layer_reg_weights = layer_bbox_weights * layer_bbox_weights.new_tensor(  # noqa: E501
+                code_weights)
+            layer_bbox_targets = bbox_targets[:, idx_layer *
+                                              self.num_proposals:(idx_layer +
+                                                                  1) *
+                                              self.num_proposals, :, ]
+            layer_loss_bbox = self.loss_bbox(
+                preds,
+                layer_bbox_targets,
+                layer_reg_weights,
+                avg_factor=max(num_pos, 1))
+
+            loss_dict[f'{prefix}_loss_cls'] = layer_loss_cls
+            loss_dict[f'{prefix}_loss_bbox'] = layer_loss_bbox
+            # loss_dict[f'{prefix}_loss_iou'] = layer_loss_iou
+
+        loss_dict['matched_ious'] = layer_loss_cls.new_tensor(matched_ious)
+
+        return loss_dict
diff --git a/mmde/projects/BEVFusion/bevfusion/utils.py b/mmde/projects/BEVFusion/bevfusion/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..40f7412bfd8383ebc191775f84ce6bfc728a496a
--- /dev/null
+++ b/mmde/projects/BEVFusion/bevfusion/utils.py
@@ -0,0 +1,311 @@
+# modify from https://github.com/mit-han-lab/bevfusion
+import torch
+from mmdet.models.task_modules import AssignResult, BaseAssigner, BaseBBoxCoder
+
+try:
+    from scipy.optimize import linear_sum_assignment
+except ImportError:
+    linear_sum_assignment = None
+
+from mmengine.structures import InstanceData
+
+from mmdet3d.registry import TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class TransFusionBBoxCoder(BaseBBoxCoder):
+
+    def __init__(
+        self,
+        pc_range,
+        out_size_factor,
+        voxel_size,
+        post_center_range=None,
+        score_threshold=None,
+        code_size=8,
+    ):
+        self.pc_range = pc_range
+        self.out_size_factor = out_size_factor
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.score_threshold = score_threshold
+        self.code_size = code_size
+
+    def encode(self, dst_boxes):
+        targets = torch.zeros([dst_boxes.shape[0],
+                               self.code_size]).to(dst_boxes.device)
+        targets[:, 0] = (dst_boxes[:, 0] - self.pc_range[0]) / (
+            self.out_size_factor * self.voxel_size[0])
+        targets[:, 1] = (dst_boxes[:, 1] - self.pc_range[1]) / (
+            self.out_size_factor * self.voxel_size[1])
+        targets[:, 3] = dst_boxes[:, 3].log()
+        targets[:, 4] = dst_boxes[:, 4].log()
+        targets[:, 5] = dst_boxes[:, 5].log()
+        # bottom center to gravity center
+        targets[:, 2] = dst_boxes[:, 2] + dst_boxes[:, 5] * 0.5
+        targets[:, 6] = torch.sin(dst_boxes[:, 6])
+        targets[:, 7] = torch.cos(dst_boxes[:, 6])
+        if self.code_size == 10:
+            targets[:, 8:10] = dst_boxes[:, 7:]
+        return targets
+
+    def decode(self, heatmap, rot, dim, center, height, vel, filter=False):
+        """Decode bboxes.
+        Args:
+            heat (torch.Tensor): Heatmap with the shape of
+                [B, num_cls, num_proposals].
+            rot (torch.Tensor): Rotation with the shape of
+                [B, 1, num_proposals].
+            dim (torch.Tensor): Dim of the boxes with the shape of
+                [B, 3, num_proposals].
+            center (torch.Tensor): bev center of the boxes with the shape of
+                [B, 2, num_proposals]. (in feature map metric)
+            height (torch.Tensor): height of the boxes with the shape of
+                [B, 2, num_proposals]. (in real world metric)
+            vel (torch.Tensor): Velocity with the shape of
+                [B, 2, num_proposals].
+            filter: if False, return all box without checking score and
+                center_range
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        # class label
+        final_preds = heatmap.max(1, keepdims=False).indices
+        final_scores = heatmap.max(1, keepdims=False).values
+
+        # change size to real world metric
+        center[:,
+               0, :] = center[:,
+                              0, :] * self.out_size_factor * self.voxel_size[
+                                  0] + self.pc_range[0]
+        center[:,
+               1, :] = center[:,
+                              1, :] * self.out_size_factor * self.voxel_size[
+                                  1] + self.pc_range[1]
+        dim[:, 0, :] = dim[:, 0, :].exp()
+        dim[:, 1, :] = dim[:, 1, :].exp()
+        dim[:, 2, :] = dim[:, 2, :].exp()
+        height = height - dim[:,
+                              2:3, :] * 0.5  # gravity center to bottom center
+        rots, rotc = rot[:, 0:1, :], rot[:, 1:2, :]
+        rot = torch.atan2(rots, rotc)
+
+        if vel is None:
+            final_box_preds = torch.cat([center, height, dim, rot],
+                                        dim=1).permute(0, 2, 1)
+        else:
+            final_box_preds = torch.cat([center, height, dim, rot, vel],
+                                        dim=1).permute(0, 2, 1)
+
+        predictions_dicts = []
+        for i in range(heatmap.shape[0]):
+            boxes3d = final_box_preds[i]
+            scores = final_scores[i]
+            labels = final_preds[i]
+            predictions_dict = {
+                'bboxes': boxes3d,
+                'scores': scores,
+                'labels': labels
+            }
+            predictions_dicts.append(predictions_dict)
+
+        if filter is False:
+            return predictions_dicts
+
+        # use score threshold
+        if self.score_threshold is not None:
+            thresh_mask = final_scores > self.score_threshold
+
+        if self.post_center_range is not None:
+            self.post_center_range = torch.tensor(
+                self.post_center_range, device=heatmap.device)
+            mask = (final_box_preds[..., :3] >=
+                    self.post_center_range[:3]).all(2)
+            mask &= (final_box_preds[..., :3] <=
+                     self.post_center_range[3:]).all(2)
+
+            predictions_dicts = []
+            for i in range(heatmap.shape[0]):
+                cmask = mask[i, :]
+                if self.score_threshold:
+                    cmask &= thresh_mask[i]
+
+                boxes3d = final_box_preds[i, cmask]
+                scores = final_scores[i, cmask]
+                labels = final_preds[i, cmask]
+                predictions_dict = {
+                    'bboxes': boxes3d,
+                    'scores': scores,
+                    'labels': labels
+                }
+
+                predictions_dicts.append(predictions_dict)
+        else:
+            raise NotImplementedError(
+                'Need to reorganize output as a batch, only '
+                'support post_center_range is not None for now!')
+
+        return predictions_dicts
+
+
+@TASK_UTILS.register_module()
+class BBoxBEVL1Cost(object):
+
+    def __init__(self, weight):
+        self.weight = weight
+
+    def __call__(self, bboxes, gt_bboxes, train_cfg):
+        pc_start = bboxes.new(train_cfg['point_cloud_range'][0:2])
+        pc_range = bboxes.new(
+            train_cfg['point_cloud_range'][3:5]) - bboxes.new(
+                train_cfg['point_cloud_range'][0:2])
+        # normalize the box center to [0, 1]
+        normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range
+        normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range
+        reg_cost = torch.cdist(
+            normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1)
+        return reg_cost * self.weight
+
+
+@TASK_UTILS.register_module()
+class IoU3DCost(object):
+
+    def __init__(self, weight):
+        self.weight = weight
+
+    def __call__(self, iou):
+        iou_cost = -iou
+        return iou_cost * self.weight
+
+
+@TASK_UTILS.register_module()
+class HeuristicAssigner3D(BaseAssigner):
+
+    def __init__(self,
+                 dist_thre=100,
+                 iou_calculator=dict(type='BboxOverlaps3D')):
+        self.dist_thre = dist_thre  # distance in meter
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def assign(self,
+               bboxes,
+               gt_bboxes,
+               gt_bboxes_ignore=None,
+               gt_labels=None,
+               query_labels=None):
+        dist_thre = self.dist_thre
+        num_gts, num_bboxes = len(gt_bboxes), len(bboxes)
+
+        bev_dist = torch.norm(
+            bboxes[:, 0:2][None, :, :] - gt_bboxes[:, 0:2][:, None, :],
+            dim=-1)  # [num_gts, num_bboxes]
+        if query_labels is not None:
+            # only match the gt box and query with same category
+            not_same_class = (query_labels[None] != gt_labels[:, None])
+            bev_dist += not_same_class * dist_thre
+
+        # for each gt box, assign it to the nearest pred box
+        nearest_values, nearest_indices = bev_dist.min(1)  # [num_gts]
+        assigned_gt_inds = torch.ones([
+            num_bboxes,
+        ]).to(bboxes) * 0
+        assigned_gt_vals = torch.ones([
+            num_bboxes,
+        ]).to(bboxes) * 10000
+        assigned_gt_labels = torch.ones([
+            num_bboxes,
+        ]).to(bboxes) * -1
+        for idx_gts in range(num_gts):
+            # for idx_pred in torch.where(bev_dist[idx_gts] < dist_thre)[0]:
+            # # each gt match to all the pred box within some radius
+            idx_pred = nearest_indices[
+                idx_gts]  # each gt only match to the nearest pred box
+            if bev_dist[idx_gts, idx_pred] <= dist_thre:
+                # if this pred box is assigned, then compare
+                if bev_dist[idx_gts, idx_pred] < assigned_gt_vals[idx_pred]:
+                    assigned_gt_vals[idx_pred] = bev_dist[idx_gts, idx_pred]
+                    # for AssignResult, 0 is negative, -1 is ignore, 1-based
+                    # indices are positive
+                    assigned_gt_inds[idx_pred] = idx_gts + 1
+                    assigned_gt_labels[idx_pred] = gt_labels[idx_gts]
+
+        max_overlaps = torch.zeros([
+            num_bboxes,
+        ]).to(bboxes)
+        matched_indices = torch.where(assigned_gt_inds > 0)
+        matched_iou = self.iou_calculator(
+            gt_bboxes[assigned_gt_inds[matched_indices].long() - 1],
+            bboxes[matched_indices]).diag()
+        max_overlaps[matched_indices] = matched_iou
+
+        return AssignResult(
+            num_gts,
+            assigned_gt_inds.long(),
+            max_overlaps,
+            labels=assigned_gt_labels)
+
+
+@TASK_UTILS.register_module()
+class HungarianAssigner3D(BaseAssigner):
+
+    def __init__(self,
+                 cls_cost=dict(type='ClassificationCost', weight=1.),
+                 reg_cost=dict(type='BBoxBEVL1Cost', weight=1.0),
+                 iou_cost=dict(type='IoU3DCost', weight=1.0),
+                 iou_calculator=dict(type='BboxOverlaps3D')):
+        self.cls_cost = TASK_UTILS.build(cls_cost)
+        self.reg_cost = TASK_UTILS.build(reg_cost)
+        self.iou_cost = TASK_UTILS.build(iou_cost)
+        self.iou_calculator = TASK_UTILS.build(iou_calculator)
+
+    def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, train_cfg):
+        num_gts, num_bboxes = gt_bboxes.size(0), bboxes.size(0)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = bboxes.new_full((num_bboxes, ),
+                                           -1,
+                                           dtype=torch.long)
+        assigned_labels = bboxes.new_full((num_bboxes, ), -1, dtype=torch.long)
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+
+        # 2. compute the weighted costs
+        # Hard code here to be compatible with the interface of
+        # `ClassificationCost` in mmdet.
+        gt_instances, pred_instances = InstanceData(
+            labels=gt_labels), InstanceData(scores=cls_pred[0].T)
+        cls_cost = self.cls_cost(pred_instances, gt_instances)
+        reg_cost = self.reg_cost(bboxes, gt_bboxes, train_cfg)
+        iou = self.iou_calculator(bboxes, gt_bboxes)
+        iou_cost = self.iou_cost(iou)
+
+        # weighted sum of above three costs
+        cost = cls_cost + reg_cost + iou_cost
+
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = torch.from_numpy(matched_row_inds).to(bboxes.device)
+        matched_col_inds = torch.from_numpy(matched_col_inds).to(bboxes.device)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+
+        max_overlaps = torch.zeros_like(iou.max(1).values)
+        max_overlaps[matched_row_inds] = iou[matched_row_inds,
+                                             matched_col_inds]
+        # max_overlaps = iou.max(1).values
+        return AssignResult(
+            num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
diff --git a/mmde/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/mmde/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a08bb66ad1862c3b65ef414d6387944c58c03d06
--- /dev/null
+++ b/mmde/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,235 @@
+_base_ = [
+    './bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py'
+]
+point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
+input_modality = dict(use_lidar=True, use_camera=True)
+backend_args = None
+
+model = dict(
+    type='BEVFusion',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=False),
+    img_backbone=dict(
+        type='mmdet.SwinTransformer',
+        embed_dims=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.2,
+        patch_norm=True,
+        out_indices=[1, 2, 3],
+        with_cp=False,
+        convert_weights=True,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=  # noqa: E251
+            'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa: E501
+        )),
+    img_neck=dict(
+        type='GeneralizedLSSFPN',
+        in_channels=[192, 384, 768],
+        out_channels=256,
+        start_level=0,
+        num_outs=3,
+        norm_cfg=dict(type='BN2d', requires_grad=True),
+        act_cfg=dict(type='ReLU', inplace=True),
+        upsample_cfg=dict(mode='bilinear', align_corners=False)),
+    view_transform=dict(
+        type='DepthLSSTransform',
+        in_channels=256,
+        out_channels=80,
+        image_size=[256, 704],
+        feature_size=[32, 88],
+        xbound=[-54.0, 54.0, 0.3],
+        ybound=[-54.0, 54.0, 0.3],
+        zbound=[-10.0, 10.0, 20.0],
+        dbound=[1.0, 60.0, 0.5],
+        downsample=2),
+    fusion_layer=dict(
+        type='ConvFuser', in_channels=[80, 256], out_channels=256))
+
+train_pipeline = [
+    dict(
+        type='BEVLoadMultiViewImageFromFiles',
+        to_float32=True,
+        color_type='color',
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        load_dim=5,
+        use_dim=5,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_attr_label=False),
+    dict(
+        type='ImageAug3D',
+        final_dim=[256, 704],
+        resize_lim=[0.38, 0.55],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[-5.4, 5.4],
+        rand_flip=True,
+        is_train=True),
+    dict(
+        type='BEVFusionGlobalRotScaleTrans',
+        scale_ratio_range=[0.9, 1.1],
+        rot_range=[-0.78539816, 0.78539816],
+        translation_std=0.5),
+    dict(type='BEVFusionRandomFlip3D'),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(
+        type='ObjectNameFilter',
+        classes=[
+            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
+            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+        ]),
+    # Actually, 'GridMask' is not used here
+    dict(
+        type='GridMask',
+        use_h=True,
+        use_w=True,
+        max_epoch=6,
+        rotate=1,
+        offset=False,
+        ratio=0.5,
+        mode=1,
+        prob=0.0,
+        fixed_prob=True),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
+            'gt_labels'
+        ],
+        meta_keys=[
+            'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
+            'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
+            'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation',
+            'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix',
+            'lidar_aug_matrix', 'num_pts_feats'
+        ])
+]
+
+test_pipeline = [
+    dict(
+        type='BEVLoadMultiViewImageFromFiles',
+        to_float32=True,
+        color_type='color',
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        load_dim=5,
+        use_dim=5,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='ImageAug3D',
+        final_dim=[256, 704],
+        resize_lim=[0.48, 0.48],
+        bot_pct_lim=[0.0, 0.0],
+        rot_lim=[0.0, 0.0],
+        rand_flip=False,
+        is_train=False),
+    dict(
+        type='PointsRangeFilter',
+        point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'],
+        meta_keys=[
+            'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
+            'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
+            'lidar_path', 'img_path', 'num_pts_feats'
+        ])
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        dataset=dict(pipeline=train_pipeline, modality=input_modality)))
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, modality=input_modality))
+test_dataloader = val_dataloader
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.33333333,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='CosineAnnealingLR',
+        begin=0,
+        T_max=6,
+        end=6,
+        by_epoch=True,
+        eta_min_ratio=1e-4,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 8 epochs, momentum increases from 1 to 0.85 / 0.95
+    # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type='CosineAnnealingMomentum',
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=2.4,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        eta_min=1,
+        begin=2.4,
+        end=6,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
+
+default_hooks = dict(
+    logger=dict(type='LoggerHook', interval=50),
+    checkpoint=dict(type='CheckpointHook', interval=1))
+del _base_.custom_hooks
diff --git a/mmde/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/mmde/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..39000effdfea0c2254065035c77fd6cefe7fdb78
--- /dev/null
+++ b/mmde/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
@@ -0,0 +1,384 @@
+_base_ = ['../../../configs/_base_/default_runtime.py']
+custom_imports = dict(
+    imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False)
+
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.075, 0.075, 0.2]
+point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+metainfo = dict(classes=class_names)
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+data_prefix = dict(
+    pts='samples/LIDAR_TOP',
+    CAM_FRONT='samples/CAM_FRONT',
+    CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+    CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+    CAM_BACK='samples/CAM_BACK',
+    CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+    CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
+    sweeps='sweeps/LIDAR_TOP')
+input_modality = dict(use_lidar=True, use_camera=False)
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/nuscenes/':
+#         's3://openmmlab/datasets/detection3d/nuscenes/',
+#         'data/nuscenes/':
+#         's3://openmmlab/datasets/detection3d/nuscenes/',
+#         './data/nuscenes_mini/':
+#         's3://openmmlab/datasets/detection3d/nuscenes/',
+#         'data/nuscenes_mini/':
+#         's3://openmmlab/datasets/detection3d/nuscenes/'
+#     }))
+backend_args = None
+
+model = dict(
+    type='BEVFusion',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        pad_size_divisor=32,
+        voxelize_cfg=dict(
+            max_num_points=10,
+            point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
+            voxel_size=[0.075, 0.075, 0.2],
+            max_voxels=[120000, 160000],
+            voxelize_reduce=True)),
+    pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
+    pts_middle_encoder=dict(
+        type='BEVFusionSparseEncoder',
+        in_channels=5,
+        sparse_shape=[1440, 1440, 41],
+        order=('conv', 'norm', 'act'),
+        norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01),
+        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+                                                                      128)),
+        encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)),
+        block_type='basicblock'),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        out_channels=[128, 256],
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
+        conv_cfg=dict(type='Conv2d', bias=False)),
+    pts_neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        out_channels=[256, 256],
+        upsample_strides=[1, 2],
+        norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
+        upsample_cfg=dict(type='deconv', bias=False),
+        use_conv_for_no_stride=True),
+    bbox_head=dict(
+        type='TransFusionHead',
+        num_proposals=200,
+        auxiliary=True,
+        in_channels=512,
+        hidden_channel=128,
+        num_classes=10,
+        nms_kernel_size=3,
+        bn_momentum=0.1,
+        num_decoder_layers=1,
+        decoder_layer=dict(
+            type='TransformerDecoderLayer',
+            self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1),
+            cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1),
+            ffn_cfg=dict(
+                embed_dims=128,
+                feedforward_channels=256,
+                num_fcs=2,
+                ffn_drop=0.1,
+                act_cfg=dict(type='ReLU', inplace=True),
+            ),
+            norm_cfg=dict(type='LN'),
+            pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)),
+        train_cfg=dict(
+            dataset='nuScenes',
+            point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
+            grid_size=[1440, 1440, 41],
+            voxel_size=[0.075, 0.075, 0.2],
+            out_size_factor=8,
+            gaussian_overlap=0.1,
+            min_radius=2,
+            pos_weight=-1,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+            assigner=dict(
+                type='HungarianAssigner3D',
+                iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
+                cls_cost=dict(
+                    type='mmdet.FocalLossCost',
+                    gamma=2.0,
+                    alpha=0.25,
+                    weight=0.15),
+                reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
+                iou_cost=dict(type='IoU3DCost', weight=0.25))),
+        test_cfg=dict(
+            dataset='nuScenes',
+            grid_size=[1440, 1440, 41],
+            out_size_factor=8,
+            voxel_size=[0.075, 0.075],
+            pc_range=[-54.0, -54.0],
+            nms_type=None),
+        common_heads=dict(
+            center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]),
+        bbox_coder=dict(
+            type='TransFusionBBoxCoder',
+            pc_range=[-54.0, -54.0],
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            score_threshold=0.0,
+            out_size_factor=8,
+            voxel_size=[0.075, 0.075],
+            code_size=10),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            reduction='mean',
+            loss_weight=1.0),
+        loss_heatmap=dict(
+            type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)))
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(
+            car=5,
+            truck=5,
+            bus=5,
+            trailer=5,
+            construction_vehicle=5,
+            traffic_cone=5,
+            barrier=5,
+            motorcycle=5,
+            bicycle=5,
+            pedestrian=5)),
+    classes=class_names,
+    sample_groups=dict(
+        car=2,
+        truck=3,
+        construction_vehicle=7,
+        bus=4,
+        trailer=6,
+        barrier=2,
+        motorcycle=6,
+        bicycle=6,
+        pedestrian=2,
+        traffic_cone=2),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        load_dim=5,
+        use_dim=5,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_attr_label=False),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='GlobalRotScaleTrans',
+        scale_ratio_range=[0.9, 1.1],
+        rot_range=[-0.78539816, 0.78539816],
+        translation_std=0.5),
+    dict(type='BEVFusionRandomFlip3D'),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(
+        type='ObjectNameFilter',
+        classes=[
+            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
+            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+        ]),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
+            'gt_labels'
+        ],
+        meta_keys=[
+            'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
+            'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
+            'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation',
+            'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix',
+            'lidar_aug_matrix'
+        ])
+]
+
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=9,
+        load_dim=5,
+        use_dim=5,
+        pad_empty_sweeps=True,
+        remove_close=True,
+        backend_args=backend_args),
+    dict(
+        type='PointsRangeFilter',
+        point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'],
+        meta_keys=[
+            'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
+            'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
+            'lidar_path', 'img_path', 'num_pts_feats', 'num_views'
+        ])
+]
+
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='CBGSDataset',
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='nuscenes_infos_train.pkl',
+            pipeline=train_pipeline,
+            metainfo=metainfo,
+            modality=input_modality,
+            test_mode=False,
+            data_prefix=data_prefix,
+            use_valid_flag=True,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR')))
+val_dataloader = dict(
+    batch_size=4,
+    num_workers=16,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='nuscenes_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        data_prefix=data_prefix,
+        test_mode=True,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='NuScenesMetric',
+    data_root=data_root,
+    ann_file=data_root + 'nuscenes_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# learning rate
+lr = 0.0001
+param_scheduler = [
+    # learning rate scheduler
+    # During the first 8 epochs, learning rate increases from 0 to lr * 10
+    # during the next 12 epochs, learning rate decreases from lr * 10 to
+    # lr * 1e-4
+    dict(
+        type='CosineAnnealingLR',
+        T_max=8,
+        eta_min=lr * 10,
+        begin=0,
+        end=8,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=12,
+        eta_min=lr * 1e-4,
+        begin=8,
+        end=20,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=8,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=8,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=12,
+        eta_min=1,
+        begin=8,
+        end=20,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=5)
+val_cfg = dict()
+test_cfg = dict()
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=32)
+log_processor = dict(window_size=50)
+
+default_hooks = dict(
+    logger=dict(type='LoggerHook', interval=50),
+    checkpoint=dict(type='CheckpointHook', interval=5))
+custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)]
diff --git "a/mmde/projects/BEVFusion/configs/\346\200\247\350\203\275" "b/mmde/projects/BEVFusion/configs/\346\200\247\350\203\275"
new file mode 100644
index 0000000000000000000000000000000000000000..d08a7c9f5db7b935ae4e0cb9198973f84917058f
--- /dev/null
+++ "b/mmde/projects/BEVFusion/configs/\346\200\247\350\203\275"
@@ -0,0 +1,41 @@
+python3 tools/analysis_tools/benchmark_patched.py \
+    projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py \
+    pth/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933_fixed.pth \
+    --fuse-conv-bn \
+    --cfg-options \
+    test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl \
+    test_dataloader.batch_size=1
+    性能：2.7
+python3 tools/analysis_tools/benchmark_patched.py \
+    projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py \
+    pth/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933_fixed.pth \
+    --fuse-conv-bn \
+    --cfg-options \
+    test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl \
+    test_dataloader.batch_size=4
+    性能：3.1
+
+python3 tools/analysis_tools/benchmark_patched.py     projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py     pth/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933_fixed.pth     --fuse-conv-bn     --cfg-options     test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl     test_dataloader.batch_size=8
+    性能3.6 frames / s
+
+
+############
+python3 tools/analysis_tools/benchmark_patched.py \
+    projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py \
+    pth/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-5239b1af_fixed.pth \
+    --fuse-conv-bn \
+    --cfg-options \
+    test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl \
+    test_dataloader.batch_size=1
+    性能：2.1
+python3 tools/analysis_tools/benchmark_patched.py \
+    projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py \
+    pth/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-5239b1af_fixed.pth \
+    --fuse-conv-bn \
+    --cfg-options \
+    test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl \
+    test_dataloader.batch_size=4
+    性能：2.5
+
+python3 tools/analysis_tools/benchmark_patched.py     projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py     pth/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-5239b1af_fixed.pth     --fuse-conv-bn     --cfg-options     test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl     test_dataloader.batch_size=8
+    性能2.8 frames / s
\ No newline at end of file
diff --git a/mmde/projects/BEVFusion/demo/multi_modality_demo.py b/mmde/projects/BEVFusion/demo/multi_modality_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f82867d40403abc5622335bb00fe99c2427155f
--- /dev/null
+++ b/mmde/projects/BEVFusion/demo/multi_modality_demo.py
@@ -0,0 +1,78 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from argparse import ArgumentParser
+
+import mmcv
+
+from mmdet3d.apis import inference_multi_modality_detector, init_model
+from mmdet3d.registry import VISUALIZERS
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('pcd', help='Point cloud file')
+    parser.add_argument('img', help='image file')
+    parser.add_argument('ann', help='ann file')
+    parser.add_argument('config', help='Config file')
+    parser.add_argument('checkpoint', help='Checkpoint file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--cam-type',
+        type=str,
+        default='CAM_FRONT',
+        help='choose camera type to inference')
+    parser.add_argument(
+        '--score-thr', type=float, default=0.0, help='bbox score threshold')
+    parser.add_argument(
+        '--out-dir', type=str, default='demo', help='dir to save results')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='show online visualization results')
+    parser.add_argument(
+        '--snapshot',
+        action='store_true',
+        help='whether to save online visualization results')
+    args = parser.parse_args()
+    return args
+
+
+def main(args):
+    # build the model from a config file and a checkpoint file
+    model = init_model(args.config, args.checkpoint, device=args.device)
+
+    # init visualizer
+    visualizer = VISUALIZERS.build(model.cfg.visualizer)
+    visualizer.dataset_meta = model.dataset_meta
+
+    # test a single image and point cloud sample
+    result, data = inference_multi_modality_detector(model, args.pcd, args.img,
+                                                     args.ann, args.cam_type)
+    points = data['inputs']['points']
+    if isinstance(result.img_path, list):
+        img = []
+        for img_path in result.img_path:
+            single_img = mmcv.imread(img_path)
+            single_img = mmcv.imconvert(single_img, 'bgr', 'rgb')
+            img.append(single_img)
+    else:
+        img = mmcv.imread(result.img_path)
+        img = mmcv.imconvert(img, 'bgr', 'rgb')
+    data_input = dict(points=points, img=img)
+
+    # show the results
+    visualizer.add_datasample(
+        'result',
+        data_input,
+        data_sample=result,
+        draw_gt=False,
+        show=args.show,
+        wait_time=-1,
+        out_file=args.out_dir,
+        pred_score_thr=args.score_thr,
+        vis_task='multi-modality_det')
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
diff --git a/mmde/projects/BEVFusion/setup.py b/mmde/projects/BEVFusion/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..04e884e48608c5849ce3ffa0714e4f2af76afaad
--- /dev/null
+++ b/mmde/projects/BEVFusion/setup.py
@@ -0,0 +1,67 @@
+import os
+from setuptools import setup
+
+import torch
+from torch.utils.cpp_extension import (BuildExtension, CppExtension,
+                                       CUDAExtension)
+
+
+def make_cuda_ext(name,
+                  module,
+                  sources,
+                  sources_cuda=[],
+                  extra_args=[],
+                  extra_include_path=[]):
+
+    define_macros = []
+    extra_compile_args = {'cxx': [] + extra_args}
+
+    if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
+        define_macros += [('WITH_CUDA', None)]
+        extension = CUDAExtension
+        extra_compile_args['nvcc'] = extra_args + [
+            '-D__CUDA_NO_HALF_OPERATORS__',
+            '-D__CUDA_NO_HALF_CONVERSIONS__',
+            '-D__CUDA_NO_HALF2_OPERATORS__',
+            ]
+        sources += sources_cuda
+    else:
+        print('Compiling {} without CUDA'.format(name))
+        extension = CppExtension
+
+    return extension(
+        name='{}.{}'.format(module, name),
+        sources=[os.path.join(*module.split('.'), p) for p in sources],
+        include_dirs=extra_include_path,
+        define_macros=define_macros,
+        extra_compile_args=extra_compile_args,
+    )
+
+
+if __name__ == '__main__':
+    setup(
+        name='bev_pool',
+        ext_modules=[
+            make_cuda_ext(
+                name='bev_pool_ext',
+                module='bevfusion.ops.bev_pool',
+                sources=[
+                    'src/bev_pool.cpp',
+                    'src/bev_pool_cuda.cu',
+                ],
+            ),
+            make_cuda_ext(
+                name='voxel_layer',
+                module='bevfusion.ops.voxel',
+                sources=[
+                    'src/voxelization.cpp',
+                    'src/scatter_points_cpu.cpp',
+                    'src/scatter_points_cuda.cu',
+                    'src/voxelization_cpu.cpp',
+                    'src/voxelization_cuda.cu',
+                ],
+            ),
+        ],
+        cmdclass={'build_ext': BuildExtension},
+        zip_safe=False,
+    )
diff --git a/mmde/projects/CENet/README.md b/mmde/projects/CENet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..53b7b7c83c01e786a468a269eddda25a80a37e2c
--- /dev/null
+++ b/mmde/projects/CENet/README.md
@@ -0,0 +1,127 @@
+# CENet: Toward Concise and Efficient LiDAR Semantic Segmentation for Autonomous Driving
+
+> [CENet: Toward Concise and Efficient LiDAR Semantic Segmentation for Autonomous Driving](https://arxiv.org/abs/2207.12691)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Accurate and fast scene understanding is one of the challenging task for autonomous driving, which requires to take full advantage of LiDAR point clouds for semantic segmentation. In this paper, we present a concise and efficient image-based semantic segmentation network, named CENet. In order to improve the descriptive power of learned features and reduce the computational as well as time complexity, our CENet integrates the convolution with larger kernel size instead of MLP, carefully-selected activation functions, and multiple auxiliary segmentation heads with corresponding loss functions into architecture. Quantitative and qualitative experiments conducted on publicly available benchmarks, SemanticKITTI and SemanticPOSS, demonstrate that our pipeline achieves much better mIoU and inference performance compared with state-of-the-art models. The code will be available at https://github.com/huixiancheng/CENet.
+
+<div align=center>
+<img src="https://github.com/open-mmlab/mmdetection3d/assets/55445986/2c268392-0e0c-4e93-bb9d-dc3417c56dad" width="800"/>
+</div>
+
+## Introduction
+
+We implement CENet and provide the results and pretrained checkpoints on SemanticKITTI dataset.
+
+## Usage
+
+<!-- For a typical model, this section should contain the commands for training and testing. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. -->
+
+### Training commands
+
+In MMDetection3D's root directory, run the following command to train the model:
+
+```bash
+python tools/train.py projects/CENet/configs/cenet-64x512_4xb4_semantickitti.py
+```
+
+For multi-gpu training, run:
+
+```bash
+python -m torch.distributed.launch --nnodes=1 --node_rank=0 --nproc_per_node=${NUM_GPUS} --master_port=29506 --master_addr="127.0.0.1" tools/train.py projects/CENet/configs/cenet-64x512_4xb4_semantickitti.py
+```
+
+### Testing commands
+
+In MMDetection3D's root directory, run the following command to test the model:
+
+```bash
+python tools/test.py projects/CENet/configs/cenet-64x512_4xb4_semantickitti.py ${CHECKPOINT_PATH}
+```
+
+## Results and models
+
+### NuScenes
+
+|                        Backbone                        | Input resolution | Mem (GB) | Inf time (fps) | mIoU  |         Download         |
+| :----------------------------------------------------: | :--------------: | :------: | :------------: | :---: | :----------------------: |
+| [CENet](./configs/cenet-64x512_4xb4_semantickitti.py)  |     64\*512      |          |      41.7      | 61.10 | [model](<>) \| [log](<>) |
+| [CENet](./configs/cenet-64x1024_4xb4_semantickitti.py) |     64\*1024     |          |      26.8      | 62.20 | [model](<>) \| [log](<>) |
+| [CENet](./configs/cenet-64x2048_4xb4_semantickitti.py) |     64\*2048     |          |      14.1      | 62.64 | [model](<>) \| [log](<>) |
+
+**Note**
+
+- We report point-based mIoU instead of range-view based mIoU
+- The mIoU is the best results during inference after each epoch training, which is consistent with official code
+- If your setting is different with our settings, we strongly suggest to enable `auto_scale_lr` to achieve comparable results.
+
+## Citation
+
+```latex
+@inproceedings{cheng2022cenet,
+  title={Cenet: Toward Concise and Efficient Lidar Semantic Segmentation for Autonomous Driving},
+  author={Cheng, Hui--Xian and Han, Xian--Feng and Xiao, Guo--Qiang},
+  booktitle={2022 IEEE International Conference on Multimedia and Expo (ICME)},
+  pages={01--06},
+  year={2022},
+  organization={IEEE}
+}
+```
+
+## Checklist
+
+<!-- Here is a checklist illustrating a usual development workflow of a successful project, and also serves as an overview of this project's progress. The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR.
+OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project suffices the minimum requirement of being merged into 'projects/'. But this project is only eligible to become a part of the core package upon attaining the last milestone.
+Note that keeping this section up-to-date is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed.
+A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR. -->
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+    <!-- The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `mmdet3d.registry.MODELS` and configurable via a config file. -->
+
+  - [x] Basic docstrings & proper citation
+
+    <!-- Each major object should contain a docstring, describing its functionality and arguments. If you have adapted the code from other open-source projects, don't forget to cite the source project in docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under GPL license. [A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) -->
+
+  - [x] Test-time correctness
+
+    <!-- If you are reproducing the result from a paper, make sure your model's inference-time performance matches that in the original paper. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. -->
+
+  - [x] A full README
+
+    <!-- As this template does. -->
+
+- [x] Milestone 2: Indicates a successful model implementation.
+
+  - [x] Training-time correctness
+
+    <!-- If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result matches the report within a minor error range. -->
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+
+    <!-- Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/detectors/fcos_mono3d.py) -->
+
+  - [ ] Unit tests
+
+    <!-- Unit tests for each module are required. [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tests/test_models/test_dense_heads/test_fcos_mono3d_head.py) -->
+
+  - [ ] Code polishing
+
+    <!-- Refactor your code according to reviewer's comment. -->
+
+  - [ ] Metafile.yml
+
+    <!-- It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/fcos3d/metafile.yml) -->
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
+
+  <!-- In particular, you may have to refactor this README into a standard one. [Example](/configs/textdet/dbnet/README.md) -->
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
diff --git a/mmde/projects/CENet/cenet/__init__.py b/mmde/projects/CENet/cenet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..41a10261f06dcd994be897464dc82fa5eaa65a91
--- /dev/null
+++ b/mmde/projects/CENet/cenet/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .boundary_loss import BoundaryLoss
+from .cenet_backbone import CENet
+from .range_image_head import RangeImageHead
+from .range_image_segmentor import RangeImageSegmentor
+from .transforms_3d import SemkittiRangeView
+
+__all__ = [
+    'CENet', 'RangeImageHead', 'RangeImageSegmentor', 'SemkittiRangeView',
+    'BoundaryLoss'
+]
diff --git a/mmde/projects/CENet/cenet/boundary_loss.py b/mmde/projects/CENet/cenet/boundary_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..3028200cb925eff07adc946216594fe5d7fd1d01
--- /dev/null
+++ b/mmde/projects/CENet/cenet/boundary_loss.py
@@ -0,0 +1,75 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+
+
+def one_hot(label: Tensor,
+            n_classes: int,
+            requires_grad: bool = True) -> Tensor:
+    """Return One Hot Label."""
+    device = label.device
+    one_hot_label = torch.eye(
+        n_classes, device=device, requires_grad=requires_grad)[label]
+    one_hot_label = one_hot_label.transpose(1, 3).transpose(2, 3)
+
+    return one_hot_label
+
+
+@MODELS.register_module()
+class BoundaryLoss(nn.Module):
+    """Boundary loss."""
+
+    def __init__(self, theta0=3, theta=5, loss_weight: float = 1.0) -> None:
+        super(BoundaryLoss, self).__init__()
+        self.theta0 = theta0
+        self.theta = theta
+        self.loss_weight = loss_weight
+
+    def forward(self, pred: Tensor, gt: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): The output from model.
+            gt (Tensor): Ground truth map.
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        pred = F.softmax(pred, dim=1)
+        n, c, _, _ = pred.shape
+
+        # one-hot vector of ground truth
+        one_hot_gt = one_hot(gt, c)
+
+        # boundary map
+        gt_b = F.max_pool2d(
+            1 - one_hot_gt,
+            kernel_size=self.theta0,
+            stride=1,
+            padding=(self.theta0 - 1) // 2)
+        gt_b -= 1 - one_hot_gt
+
+        pred_b = F.max_pool2d(
+            1 - pred,
+            kernel_size=self.theta0,
+            stride=1,
+            padding=(self.theta0 - 1) // 2)
+        pred_b -= 1 - pred
+
+        gt_b = gt_b.view(n, c, -1)
+        pred_b = pred_b.view(n, c, -1)
+
+        # Precision, Recall
+        P = torch.sum(pred_b * gt_b, dim=2) / (torch.sum(pred_b, dim=2) + 1e-7)
+        R = torch.sum(pred_b * gt_b, dim=2) / (torch.sum(gt_b, dim=2) + 1e-7)
+
+        # Boundary F1 Score
+        BF1 = 2 * P * R / (P + R + 1e-7)
+
+        # summing BF1 Score for each class and average over mini-batch
+        loss = torch.mean(1 - BF1)
+
+        return self.loss_weight * loss
diff --git a/mmde/projects/CENet/cenet/cenet_backbone.py b/mmde/projects/CENet/cenet/cenet_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..835bd86d8348f9a2e527995abbfb76358523825e
--- /dev/null
+++ b/mmde/projects/CENet/cenet/cenet_backbone.py
@@ -0,0 +1,239 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Tuple
+
+import torch
+from mmcv.cnn import (ConvModule, build_activation_layer, build_conv_layer,
+                      build_norm_layer)
+from mmengine.model import BaseModule
+from torch import Tensor, nn
+from torch.nn import functional as F
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
+
+
+class BasicBlock(BaseModule):
+
+    def __init__(self,
+                 inplanes: int,
+                 planes: int,
+                 stride: int = 1,
+                 dilation: int = 1,
+                 downsample: Optional[nn.Module] = None,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 act_cfg: ConfigType = dict(type='LeakyReLU'),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(BasicBlock, self).__init__(init_cfg)
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            3,
+            stride=stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        self.conv2 = build_conv_layer(
+            conv_cfg, planes, planes, 3, padding=1, bias=False)
+        self.add_module(self.norm2_name, norm2)
+        self.relu = build_activation_layer(act_cfg)
+        self.downsample = downsample
+
+    @property
+    def norm1(self) -> nn.Module:
+        """nn.Module: normalization layer after the first convolution layer."""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self) -> nn.Module:
+        """nn.Module: normalization layer after the second convolution layer.
+        """
+        return getattr(self, self.norm2_name)
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+
+        out = self.conv1(x)
+        out = self.norm1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.norm2(out)
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+        return out
+
+
+@MODELS.register_module()
+class CENet(BaseModule):
+
+    def __init__(self,
+                 in_channels: int = 5,
+                 stem_channels: int = 128,
+                 num_stages: int = 4,
+                 stage_blocks: Sequence[int] = (3, 4, 6, 3),
+                 out_channels: Sequence[int] = (128, 128, 128, 128),
+                 strides: Sequence[int] = (1, 2, 2, 2),
+                 dilations: Sequence[int] = (1, 1, 1, 1),
+                 fuse_channels: Sequence[int] = (256, 128),
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: ConfigType = dict(type='BN'),
+                 act_cfg: ConfigType = dict(type='LeakyReLU'),
+                 init_cfg=None) -> None:
+        super(CENet, self).__init__(init_cfg)
+
+        assert len(stage_blocks) == len(out_channels) == len(strides) == len(
+            dilations) == num_stages, \
+            'The length of stage_blocks, out_channels, strides and ' \
+            'dilations should be equal to num_stages'
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self._make_stem_layer(in_channels, stem_channels)
+
+        inplanes = stem_channels
+        self.res_layers = []
+        for i, num_blocks in enumerate(stage_blocks):
+            stride = strides[i]
+            dilation = dilations[i]
+            planes = out_channels[i]
+            res_layer = self.make_res_layer(
+                inplanes=inplanes,
+                planes=planes,
+                num_blocks=num_blocks,
+                stride=stride,
+                dilation=dilation,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            inplanes = planes
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        in_channels = stem_channels + sum(out_channels)
+        self.fuse_layers = []
+        for i, fuse_channel in enumerate(fuse_channels):
+            fuse_layer = ConvModule(
+                in_channels,
+                fuse_channel,
+                kernel_size=3,
+                padding=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            in_channels = fuse_channel
+            layer_name = f'fuse_layer{i + 1}'
+            self.add_module(layer_name, fuse_layer)
+            self.fuse_layers.append(layer_name)
+
+    def _make_stem_layer(self, in_channels: int, out_channels: int) -> None:
+        self.stem = nn.Sequential(
+            build_conv_layer(
+                self.conv_cfg,
+                in_channels,
+                out_channels // 2,
+                kernel_size=3,
+                padding=1,
+                bias=False),
+            build_norm_layer(self.norm_cfg, out_channels // 2)[1],
+            build_activation_layer(self.act_cfg),
+            build_conv_layer(
+                self.conv_cfg,
+                out_channels // 2,
+                out_channels,
+                kernel_size=3,
+                padding=1,
+                bias=False),
+            build_norm_layer(self.norm_cfg, out_channels)[1],
+            build_activation_layer(self.act_cfg),
+            build_conv_layer(
+                self.conv_cfg,
+                out_channels,
+                out_channels,
+                kernel_size=3,
+                padding=1,
+                bias=False),
+            build_norm_layer(self.norm_cfg, out_channels)[1],
+            build_activation_layer(self.act_cfg))
+
+    def make_res_layer(
+        self,
+        inplanes: int,
+        planes: int,
+        num_blocks: int,
+        stride: int,
+        dilation: int,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: ConfigType = dict(type='BN'),
+        act_cfg: ConfigType = dict(type='LeakyReLU')
+    ) -> nn.Sequential:
+        downsample = None
+        if stride != 1 or inplanes != planes:
+            downsample = nn.Sequential(
+                build_conv_layer(
+                    conv_cfg,
+                    inplanes,
+                    planes,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False),
+                build_norm_layer(norm_cfg, planes)[1])
+
+        layers = []
+        layers.append(
+            BasicBlock(
+                inplanes=inplanes,
+                planes=planes,
+                stride=stride,
+                dilation=dilation,
+                downsample=downsample,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg))
+        inplanes = planes
+        for _ in range(1, num_blocks):
+            layers.append(
+                BasicBlock(
+                    inplanes=inplanes,
+                    planes=planes,
+                    stride=1,
+                    dilation=dilation,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+        return nn.Sequential(*layers)
+
+    def forward(self, x: Tensor) -> Tuple[Tensor]:
+        x = self.stem(x)
+        outs = [x]
+        for layer_name in self.res_layers:
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            outs.append(x)
+
+        # TODO: move the following operation into neck.
+        for i in range(len(outs)):
+            if outs[i].shape != outs[0].shape:
+                outs[i] = F.interpolate(
+                    outs[i],
+                    size=outs[0].size()[2:],
+                    mode='bilinear',
+                    align_corners=True)
+
+        outs[0] = torch.cat(outs, dim=1)
+
+        for layer_name in self.fuse_layers:
+            fuse_layer = getattr(self, layer_name)
+            outs[0] = fuse_layer(outs[0])
+        return tuple(outs)
diff --git a/mmde/projects/CENet/cenet/range_image_head.py b/mmde/projects/CENet/cenet/range_image_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..758a19767473c59555b547ea2f9f75152bf546d8
--- /dev/null
+++ b/mmde/projects/CENet/cenet/range_image_head.py
@@ -0,0 +1,142 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from torch import Tensor, nn
+
+from mmdet3d.models import Base3DDecodeHead
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import ConfigType, OptConfigType
+
+
+@MODELS.register_module()
+class RangeImageHead(Base3DDecodeHead):
+    """RangeImage decoder head.
+
+    Args:
+        loss_ce (dict or :obj:`ConfigDict`): Config of CrossEntropy loss.
+            Defaults to dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=False,
+                     class_weight=None,
+                     loss_weight=1.0).
+        loss_lovasz (dict or :obj:`ConfigDict`, optional): Config of Lovasz
+            loss. Defaults to None.
+        lpss_boundary (dict or :obj:`ConfigDict`, optional): Config of boundary
+            loss. Defaults to None.
+        indices (int): The indice of features to use. Defaults to 0.
+    """
+
+    def __init__(self,
+                 loss_ce: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss',
+                     use_sigmoid=False,
+                     class_weight=None,
+                     loss_weight=1.0),
+                 loss_lovasz: OptConfigType = None,
+                 loss_boundary: OptConfigType = None,
+                 indices: int = 0,
+                 **kwargs) -> None:
+        super(RangeImageHead, self).__init__(**kwargs)
+
+        self.loss_ce = MODELS.build(loss_ce)
+        if loss_lovasz is not None:
+            self.loss_lovasz = MODELS.build(loss_lovasz)
+        else:
+            self.loss_lovasz = None
+        if loss_boundary is not None:
+            self.loss_boundary = MODELS.build(loss_boundary)
+        else:
+            self.loss_boundary = None
+
+        self.indices = indices
+
+    def build_conv_seg(self, channels: int, num_classes: int,
+                       kernel_size: int) -> nn.Module:
+        return nn.Conv2d(channels, num_classes, kernel_size=kernel_size)
+
+    def forward(self, feats: Tuple[Tensor]) -> Tensor:
+        """Forward function."""
+        seg_logit = self.cls_seg(feats[self.indices])
+        return seg_logit
+
+    def _stack_batch_gt(self, batch_data_samples: SampleList) -> Tensor:
+        gt_semantic_segs = [
+            data_sample.gt_pts_seg.semantic_seg
+            for data_sample in batch_data_samples
+        ]
+        return torch.stack(gt_semantic_segs, dim=0)
+
+    def loss_by_feat(self, seg_logit: Tensor,
+                     batch_data_samples: SampleList) -> dict:
+        """Compute semantic segmentation loss.
+
+        Args:
+            seg_logit (Tensor): Predicted  logits.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The seg
+                data samples. It usually includes information such
+                as `metainfo` and `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
+        """
+        seg_label = self._stack_batch_gt(batch_data_samples)
+        seg_label = seg_label.squeeze(dim=1)
+        loss = dict()
+        loss['loss_ce'] = self.loss_ce(
+            seg_logit, seg_label, ignore_index=self.ignore_index)
+        if self.loss_lovasz:
+            loss['loss_lovasz'] = self.loss_lovasz(
+                seg_logit, seg_label, ignore_index=self.ignore_index)
+        if self.loss_boundary:
+            loss['loss_boundary'] = self.loss_boundary(seg_logit, seg_label)
+        return loss
+
+    def predict(self, inputs: Tuple[Tensor], batch_input_metas: List[dict],
+                test_cfg: ConfigType) -> torch.Tensor:
+        """Forward function for testing.
+
+        Args:
+            inputs (Tuple[Tensor]): Features from backbone.
+            batch_input_metas (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`. We use `point2voxel_map` in this function.
+            test_cfg (dict or :obj:`ConfigDict`): The testing config.
+
+        Returns:
+            List[Tensor]: List of point-wise segmentation labels.
+        """
+        seg_logits = self.forward(inputs)
+        seg_labels = seg_logits.argmax(dim=1)
+        device = seg_logits.device
+        use_knn = test_cfg.get('use_knn', False)
+        if use_knn:
+            from .utils import KNN
+            post_module = KNN(
+                test_cfg=test_cfg,
+                num_classes=self.num_classes,
+                ignore_index=self.ignore_index)
+
+        seg_label_list = []
+        for i in range(len(batch_input_metas)):
+            input_metas = batch_input_metas[i]
+            proj_x = torch.tensor(
+                input_metas['proj_x'], dtype=torch.int64, device=device)
+            proj_y = torch.tensor(
+                input_metas['proj_y'], dtype=torch.int64, device=device)
+            proj_range = torch.tensor(
+                input_metas['proj_range'], dtype=torch.float32, device=device)
+            unproj_range = torch.tensor(
+                input_metas['unproj_range'],
+                dtype=torch.float32,
+                device=device)
+
+            if use_knn:
+                seg_label_list.append(
+                    post_module(proj_range, unproj_range, seg_labels[i],
+                                proj_x, proj_y))
+            else:
+                seg_label_list.append(seg_labels[i, proj_y, proj_x])
+
+        return seg_label_list
diff --git a/mmde/projects/CENet/cenet/range_image_segmentor.py b/mmde/projects/CENet/cenet/range_image_segmentor.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb0ad460e86c0e022bbc23a7cc99e42bb5185f15
--- /dev/null
+++ b/mmde/projects/CENet/cenet/range_image_segmentor.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List
+
+from torch import Tensor
+
+from mmdet3d.models import EncoderDecoder3D
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import PointData
+from mmdet3d.structures.det3d_data_sample import OptSampleList, SampleList
+
+
+@MODELS.register_module()
+class RangeImageSegmentor(EncoderDecoder3D):
+
+    def loss(self, batch_inputs_dict: dict,
+             batch_data_samples: SampleList) -> Dict[str, Tensor]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which
+                includes 'points' and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Dict[str, Tensor]: A dictionary of loss components.
+        """
+        # extract features using backbone
+        imgs = batch_inputs_dict['imgs']
+        x = self.extract_feat(imgs)
+
+        losses = dict()
+
+        loss_decode = self._decode_head_forward_train(x, batch_data_samples)
+        losses.update(loss_decode)
+
+        if self.with_auxiliary_head:
+            loss_aux = self._auxiliary_head_forward_train(
+                x, batch_data_samples)
+            losses.update(loss_aux)
+        return losses
+
+    def predict(self,
+                batch_inputs_dict: dict,
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> SampleList:
+        """Simple test with single scene.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
+                and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+            rescale (bool): Whether transform to original number of points.
+                Will be used for voxelization based segmentors.
+                Defaults to True.
+
+        Returns:
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
+            points. Each Det3DDataSample usually contains:
+
+            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
+              segmentation.
+            - ``pts_seg_logits`` (PointData): Predicted logits of 3D semantic
+              segmentation before normalization.
+        """
+        # 3D segmentation requires per-point prediction, so it's impossible
+        # to use down-sampling to get a batch of scenes with same num_points
+        # therefore, we only support testing one scene every time
+        batch_input_metas = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+
+        imgs = batch_inputs_dict['imgs']
+        x = self.extract_feat(imgs)
+        seg_labels_list = self.decode_head.predict(x, batch_input_metas,
+                                                   self.test_cfg)
+
+        return self.postprocess_result(seg_labels_list, batch_data_samples)
+
+    def _forward(self,
+                 batch_inputs_dict: dict,
+                 batch_data_samples: OptSampleList = None) -> Tensor:
+        """Network forward process.
+
+        Args:
+            batch_inputs_dict (dict): Input sample dict which includes 'points'
+                and 'imgs' keys.
+
+                - points (List[Tensor]): Point cloud of each sample.
+                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            Tensor: Forward output of model without any post-processes.
+        """
+        imgs = batch_inputs_dict['imgs']
+        x = self.extract_feat(imgs)
+        return self.decode_head.forward(x)
+
+    def postprocess_result(self, seg_labels_list: List[Tensor],
+                           batch_data_samples: SampleList) -> SampleList:
+        """Convert results list to `Det3DDataSample`.
+
+        Args:
+            seg_labels_list (List[Tensor]): List of segmentation results,
+                seg_logits from model of each input point clouds sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
+                samples. It usually includes information such as `metainfo` and
+                `gt_pts_seg`.
+
+        Returns:
+            List[:obj:`Det3DDataSample`]: Segmentation results of the input
+            points. Each Det3DDataSample usually contains:
+
+            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
+              segmentation.
+            - ``pts_seg_logits`` (PointData): Predicted logits of 3D semantic
+              segmentation before normalization.
+        """
+
+        for i, seg_pred in enumerate(seg_labels_list):
+            batch_data_samples[i].set_data(
+                {'pred_pts_seg': PointData(**{'pts_semantic_mask': seg_pred})})
+        return batch_data_samples
diff --git a/mmde/projects/CENet/cenet/transforms_3d.py b/mmde/projects/CENet/cenet/transforms_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..73ec339613b29bb2d284d0b5643b0b359f381822
--- /dev/null
+++ b/mmde/projects/CENet/cenet/transforms_3d.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+import numpy as np
+from mmcv.transforms import BaseTransform
+
+from mmdet3d.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class SemkittiRangeView(BaseTransform):
+    """Convert Semantickitti point cloud dataset to range image."""
+
+    def __init__(self,
+                 H: int = 64,
+                 W: int = 2048,
+                 fov_up: float = 3.0,
+                 fov_down: float = -25.0,
+                 means: Sequence[float] = (11.71279, -0.1023471, 0.4952,
+                                           -1.0545, 0.2877),
+                 stds: Sequence[float] = (10.24, 12.295865, 9.4287, 0.8643,
+                                          0.1450),
+                 ignore_index: int = 19) -> None:
+        self.H = H
+        self.W = W
+        self.fov_up = fov_up / 180.0 * np.pi
+        self.fov_down = fov_down / 180.0 * np.pi
+        self.fov = abs(self.fov_down) + abs(self.fov_up)
+        self.means = np.array(means, dtype=np.float32)
+        self.stds = np.array(stds, dtype=np.float32)
+        self.ignore_index = ignore_index
+
+    def transform(self, results: dict) -> dict:
+        points_numpy = results['points'].numpy()
+
+        proj_image = np.full((self.H, self.W, 5), -1, dtype=np.float32)
+        proj_idx = np.full((self.H, self.W), -1, dtype=np.int64)
+
+        # get depth of all points
+        depth = np.linalg.norm(points_numpy[:, :3], 2, axis=1)
+
+        # get angles of all points
+        yaw = -np.arctan2(points_numpy[:, 1], points_numpy[:, 0])
+        pitch = np.arcsin(points_numpy[:, 2] / depth)
+
+        # get projection in image coords
+        proj_x = 0.5 * (yaw / np.pi + 1.0)
+        proj_y = 1.0 - (pitch + abs(self.fov_down)) / self.fov
+
+        # scale to image size using angular resolution
+        proj_x *= self.W
+        proj_y *= self.H
+
+        # round and clamp for use as index
+        proj_x = np.floor(proj_x)
+        proj_x = np.minimum(self.W - 1, proj_x)
+        proj_x = np.maximum(0, proj_x).astype(np.int64)
+
+        proj_y = np.floor(proj_y)
+        proj_y = np.minimum(self.H - 1, proj_y)
+        proj_y = np.maximum(0, proj_y).astype(np.int64)
+
+        results['proj_x'] = proj_x
+        results['proj_y'] = proj_y
+        results['unproj_range'] = depth
+
+        # order in decreasing depth
+        indices = np.arange(depth.shape[0])
+        order = np.argsort(depth)[::-1]
+        proj_idx[proj_y[order], proj_x[order]] = indices[order]
+        proj_image[proj_y[order], proj_x[order], 0] = depth[order]
+        proj_image[proj_y[order], proj_x[order], 1:] = points_numpy[order]
+        proj_mask = (proj_idx > 0).astype(np.int32)
+        results['proj_range'] = proj_image[..., 0]
+
+        proj_image = (proj_image -
+                      self.means[None, None, :]) / self.stds[None, None, :]
+        proj_image = proj_image * proj_mask[..., None].astype(np.float32)
+        results['img'] = proj_image
+
+        if 'pts_semantic_mask' in results:
+            proj_sem_label = np.full((self.H, self.W),
+                                     self.ignore_index,
+                                     dtype=np.int64)
+            proj_sem_label[proj_y[order],
+                           proj_x[order]] = results['pts_semantic_mask'][order]
+            results['gt_semantic_seg'] = proj_sem_label
+        return results
diff --git a/mmde/projects/CENet/cenet/utils.py b/mmde/projects/CENet/cenet/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd524c044687614b569f2b33acf9b0776b0eea7c
--- /dev/null
+++ b/mmde/projects/CENet/cenet/utils.py
@@ -0,0 +1,140 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+
+from mmdet3d.utils import ConfigType
+
+
+def get_gaussian_kernel(kernel_size: int = 3, sigma: int = 2) -> Tensor:
+    # Create a x, y coordinate grid of shape (kernel_size, kernel_size, 2)
+    x_coord = torch.arange(kernel_size)
+    x_grid = x_coord.repeat(kernel_size).view(kernel_size, kernel_size)
+    y_grid = x_grid.t()
+    xy_grid = torch.stack([x_grid, y_grid], dim=-1).float()
+
+    mean = (kernel_size - 1) / 2.
+    variance = sigma**2.
+
+    # Calculate the 2-dimensional gaussian kernel which is
+    # the product of two gaussian distributions for two different
+    # variables (in this case called x and y)
+    gaussian_kernel = (1. / (2. * math.pi * variance)) * torch.exp(-torch.sum(
+        (xy_grid - mean)**2., dim=-1) / (2 * variance))
+
+    # Make sure sum of values in gaussian kernel equals 1.
+    gaussian_kernel = gaussian_kernel / torch.sum(gaussian_kernel)
+
+    # Reshape to 2d depthwise convolutional weight
+    gaussian_kernel = gaussian_kernel.view(kernel_size, kernel_size)
+
+    return gaussian_kernel
+
+
+class KNN(nn.Module):
+
+    def __init__(self, test_cfg: ConfigType, num_classes: int,
+                 ignore_index: int) -> None:
+        super(KNN, self).__init__()
+        self.knn = test_cfg.knn
+        self.search = test_cfg.search
+        self.sigma = test_cfg.sigma
+        self.cutoff = test_cfg.cutoff
+        self.num_classes = num_classes
+        self.ignore_index = ignore_index
+
+    def forward(self, proj_range: Tensor, unproj_range: Tensor,
+                proj_argmax: Tensor, px: Tensor, py: Tensor) -> Tensor:
+
+        # sizes of projection scan
+        H, W = proj_range.shape
+
+        # number of points
+        P = unproj_range.shape
+
+        # check if size of kernel is odd and complain
+        if self.search % 2 == 0:
+            raise ValueError('Nearest neighbor kernel must be odd number')
+
+        # calculate padding
+        pad = int((self.search - 1) / 2)
+
+        # unfold neighborhood to get nearest neighbors for each pixel
+        # (range image)
+        proj_unfold_k_rang = F.unfold(
+            proj_range[None, None, ...],
+            kernel_size=(self.search, self.search),
+            padding=(pad, pad))
+
+        # index with px, py to get ALL the pcld points
+        idx_list = py * W + px
+        unproj_unfold_k_rang = proj_unfold_k_rang[:, :, idx_list]
+
+        # WARNING, THIS IS A HACK
+        # Make non valid (<0) range points extremely big so that there is no
+        # screwing up the nn self.search
+        unproj_unfold_k_rang[unproj_unfold_k_rang < 0] = float('inf')
+
+        # now the matrix is unfolded TOTALLY, replace the middle points with
+        # the actual range points
+        center = int(((self.search * self.search) - 1) / 2)
+        unproj_unfold_k_rang[:, center, :] = unproj_range
+
+        # now compare range
+        k2_distances = torch.abs(unproj_unfold_k_rang - unproj_range)
+
+        # make a kernel to weigh the ranges according to distance in (x,y)
+        # I make this 1 - kernel because I want distances that are close
+        # in (x,y) to matter more
+        inv_gauss_k = (1 - get_gaussian_kernel(self.search, self.sigma)).view(
+            1, -1, 1)
+        inv_gauss_k = inv_gauss_k.to(proj_range.device).type(proj_range.type())
+
+        # apply weighing
+        k2_distances = k2_distances * inv_gauss_k
+
+        # find nearest neighbors
+        _, knn_idx = k2_distances.topk(
+            self.knn, dim=1, largest=False, sorted=False)
+
+        # do the same unfolding with the argmax
+        proj_unfold_1_argmax = F.unfold(
+            proj_argmax[None, None, ...].float(),
+            kernel_size=(self.search, self.search),
+            padding=(pad, pad)).long()
+        unproj_unfold_1_argmax = proj_unfold_1_argmax[:, :, idx_list]
+
+        # get the top k logits from the knn at each pixel
+        knn_argmax = torch.gather(
+            input=unproj_unfold_1_argmax, dim=1, index=knn_idx)
+
+        # fake an invalid argmax of classes + 1 for all cutoff items
+        if self.cutoff > 0:
+            knn_distances = torch.gather(
+                input=k2_distances, dim=1, index=knn_idx)
+            knn_invalid_idx = knn_distances > self.cutoff
+            knn_argmax[knn_invalid_idx] = self.num_classes
+
+        # now vote
+        # argmax onehot has an extra class for objects after cutoff
+        knn_argmax_onehot = torch.zeros(
+            (1, self.num_classes + 1, P[0]),
+            device=proj_range.device).type(proj_range.type())
+        ones = torch.ones_like(knn_argmax).type(proj_range.type())
+        knn_argmax_onehot = knn_argmax_onehot.scatter_add_(1, knn_argmax, ones)
+
+        # now vote (as a sum over the onehot shit)
+        # (don't let it choose unlabeled OR invalid)
+        if self.ignore_index == self.num_classes - 1:
+            knn_argmax_out = knn_argmax_onehot[:, :-2].argmax(dim=1)
+        elif self.ignore_index == 0:
+            knn_argmax_out = knn_argmax_onehot[:, 1:-1].argmax(dim=1) + 1
+        else:
+            knn_argmax_out = knn_argmax_onehot[:, :-1].argmax(dim=1)
+
+        # reshape again
+        knn_argmax_out = knn_argmax_out.view(P)
+
+        return knn_argmax_out
diff --git a/mmde/projects/CENet/configs/cenet-64x1024_4xb4_semantickitti.py b/mmde/projects/CENet/configs/cenet-64x1024_4xb4_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..172b51227f7325220d285e0027e6b996b43678b5
--- /dev/null
+++ b/mmde/projects/CENet/configs/cenet-64x1024_4xb4_semantickitti.py
@@ -0,0 +1,78 @@
+_base_ = ['./cenet-64x512_4xb4_semantickitti.py']
+
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=0.9),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-3.1415929, 3.1415929],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.1, 0.1, 0.1],
+    ),
+    dict(
+        type='SemkittiRangeView',
+        H=64,
+        W=1024,
+        fov_up=3.0,
+        fov_down=-25.0,
+        means=(11.71279, -0.1023471, 0.4952, -1.0545, 0.2877),
+        stds=(10.24, 12.295865, 9.4287, 0.8643, 0.1450),
+        ignore_index=19),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='SemkittiRangeView',
+        H=64,
+        W=1024,
+        fov_up=3.0,
+        fov_down=-25.0,
+        means=(11.71279, -0.1023471, 0.4952, -1.0545, 0.2877),
+        stds=(10.24, 12.295865, 9.4287, 0.8643, 0.1450),
+        ignore_index=19),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img'],
+        meta_keys=('proj_x', 'proj_y', 'proj_range', 'unproj_range'))
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/projects/CENet/configs/cenet-64x2048_4xb4_semantickitti.py b/mmde/projects/CENet/configs/cenet-64x2048_4xb4_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a979d83eb82b9fdc411533d4c0fda025d78cf72
--- /dev/null
+++ b/mmde/projects/CENet/configs/cenet-64x2048_4xb4_semantickitti.py
@@ -0,0 +1,78 @@
+_base_ = ['./cenet-64x512_4xb4_semantickitti.py']
+
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=0.9),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-3.1415929, 3.1415929],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.1, 0.1, 0.1],
+    ),
+    dict(
+        type='SemkittiRangeView',
+        H=64,
+        W=2048,
+        fov_up=3.0,
+        fov_down=-25.0,
+        means=(11.71279, -0.1023471, 0.4952, -1.0545, 0.2877),
+        stds=(10.24, 12.295865, 9.4287, 0.8643, 0.1450),
+        ignore_index=19),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='SemkittiRangeView',
+        H=64,
+        W=2048,
+        fov_up=3.0,
+        fov_down=-25.0,
+        means=(11.71279, -0.1023471, 0.4952, -1.0545, 0.2877),
+        stds=(10.24, 12.295865, 9.4287, 0.8643, 0.1450),
+        ignore_index=19),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img'],
+        meta_keys=('proj_x', 'proj_y', 'proj_range', 'unproj_range'))
+]
+
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/projects/CENet/configs/cenet-64x512_4xb4_semantickitti.py b/mmde/projects/CENet/configs/cenet-64x512_4xb4_semantickitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..468261692ac7d6a4df1d228ce5f4e75e13b7212f
--- /dev/null
+++ b/mmde/projects/CENet/configs/cenet-64x512_4xb4_semantickitti.py
@@ -0,0 +1,300 @@
+_base_ = ['../../../configs/_base_/default_runtime.py']
+custom_imports = dict(
+    imports=['projects.CENet.cenet'], allow_failed_imports=False)
+
+# For SemanticKitti we usually do 19-class segmentation.
+# For labels_map we follow the uniform format of MMDetection & MMSegmentation
+# i.e. we consider the unlabeled class as the last one, which is different
+# from the original implementation of some methods e.g. Cylinder3D.
+dataset_type = 'SemanticKittiDataset'
+data_root = 'data/semantickitti/'
+class_names = [
+    'car', 'bicycle', 'motorcycle', 'truck', 'bus', 'person', 'bicyclist',
+    'motorcyclist', 'road', 'parking', 'sidewalk', 'other-ground', 'building',
+    'fence', 'vegetation', 'trunck', 'terrian', 'pole', 'traffic-sign'
+]
+labels_map = {
+    0: 19,  # "unlabeled"
+    1: 19,  # "outlier" mapped to "unlabeled" --------------mapped
+    10: 0,  # "car"
+    11: 1,  # "bicycle"
+    13: 4,  # "bus" mapped to "other-vehicle" --------------mapped
+    15: 2,  # "motorcycle"
+    16: 4,  # "on-rails" mapped to "other-vehicle" ---------mapped
+    18: 3,  # "truck"
+    20: 4,  # "other-vehicle"
+    30: 5,  # "person"
+    31: 6,  # "bicyclist"
+    32: 7,  # "motorcyclist"
+    40: 8,  # "road"
+    44: 9,  # "parking"
+    48: 10,  # "sidewalk"
+    49: 11,  # "other-ground"
+    50: 12,  # "building"
+    51: 13,  # "fence"
+    52: 19,  # "other-structure" mapped to "unlabeled" ------mapped
+    60: 8,  # "lane-marking" to "road" ---------------------mapped
+    70: 14,  # "vegetation"
+    71: 15,  # "trunk"
+    72: 16,  # "terrain"
+    80: 17,  # "pole"
+    81: 18,  # "traffic-sign"
+    99: 19,  # "other-object" to "unlabeled" ----------------mapped
+    252: 0,  # "moving-car" to "car" ------------------------mapped
+    253: 6,  # "moving-bicyclist" to "bicyclist" ------------mapped
+    254: 5,  # "moving-person" to "person" ------------------mapped
+    255: 7,  # "moving-motorcyclist" to "motorcyclist" ------mapped
+    256: 4,  # "moving-on-rails" mapped to "other-vehic------mapped
+    257: 4,  # "moving-bus" mapped to "other-vehicle" -------mapped
+    258: 3,  # "moving-truck" to "truck" --------------------mapped
+    259: 4  # "moving-other"-vehicle to "other-vehicle"-----mapped
+}
+
+metainfo = dict(
+    classes=class_names, seg_label_mapping=labels_map, max_label=259)
+
+input_modality = dict(use_lidar=True, use_camera=False)
+
+# Example to use different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (not support LMDB and Memcache yet)
+
+# data_root = 's3://openmmlab/datasets/detection3d/semantickitti/'
+
+# Method 2: Use backend_args, file_client_args in versions before 1.1.0
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection3d/',
+#          'data/': 's3://openmmlab/datasets/detection3d/'
+#      }))
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=0.9),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-3.1415929, 3.1415929],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.1, 0.1, 0.1],
+    ),
+    dict(
+        type='SemkittiRangeView',
+        H=64,
+        W=512,
+        fov_up=3.0,
+        fov_down=-25.0,
+        means=(11.71279, -0.1023471, 0.4952, -1.0545, 0.2877),
+        stds=(10.24, 12.295865, 9.4287, 0.8643, 0.1450),
+        ignore_index=19),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        seg_3d_dtype='np.int32',
+        seg_offset=2**16,
+        dataset_type='semantickitti',
+        backend_args=backend_args),
+    dict(type='PointSegClassMapping'),
+    dict(
+        type='SemkittiRangeView',
+        H=64,
+        W=512,
+        fov_up=3.0,
+        fov_down=-25.0,
+        means=(11.71279, -0.1023471, 0.4952, -1.0545, 0.2877),
+        stds=(10.24, 12.295865, 9.4287, 0.8643, 0.1450),
+        ignore_index=19),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img'],
+        meta_keys=('proj_x', 'proj_y', 'proj_range', 'unproj_range'))
+]
+
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='semantickitti_infos_train.pkl',
+        pipeline=train_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        ignore_index=19,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='semantickitti_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        ignore_index=19,
+        test_mode=True,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='SegMetric')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+model = dict(
+    type='RangeImageSegmentor',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='CENet',
+        in_channels=5,
+        stem_channels=128,
+        num_stages=4,
+        stage_blocks=(3, 4, 6, 3),
+        out_channels=(128, 128, 128, 128),
+        fuse_channels=(256, 128),
+        strides=(1, 2, 2, 2),
+        dilations=(1, 1, 1, 1),
+        act_cfg=dict(type='HSwish', inplace=True)),
+    decode_head=dict(
+        type='RangeImageHead',
+        channels=128,
+        num_classes=20,
+        dropout_ratio=0,
+        loss_ce=dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=False,
+            class_weight=None,
+            loss_weight=1.0),
+        loss_lovasz=dict(type='LovaszLoss', loss_weight=1.5, reduction='none'),
+        loss_boundary=dict(type='BoundaryLoss', loss_weight=1.0),
+        conv_seg_kernel_size=1,
+        ignore_index=19),
+    auxiliary_head=[
+        dict(
+            type='RangeImageHead',
+            channels=128,
+            num_classes=20,
+            dropout_ratio=0,
+            loss_ce=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                class_weight=None,
+                loss_weight=1.0),
+            loss_lovasz=dict(
+                type='LovaszLoss', loss_weight=1.5, reduction='none'),
+            loss_boundary=dict(type='BoundaryLoss', loss_weight=1.0),
+            conv_seg_kernel_size=1,
+            ignore_index=19,
+            indices=2),
+        dict(
+            type='RangeImageHead',
+            channels=128,
+            num_classes=20,
+            dropout_ratio=0,
+            loss_ce=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                class_weight=None,
+                loss_weight=1.0),
+            loss_lovasz=dict(
+                type='LovaszLoss', loss_weight=1.5, reduction='none'),
+            loss_boundary=dict(type='BoundaryLoss', loss_weight=1.0),
+            conv_seg_kernel_size=1,
+            ignore_index=19,
+            indices=3),
+        dict(
+            type='RangeImageHead',
+            channels=128,
+            num_classes=20,
+            dropout_ratio=0,
+            loss_ce=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                class_weight=None,
+                loss_weight=1.0),
+            loss_lovasz=dict(
+                type='LovaszLoss', loss_weight=1.5, reduction='none'),
+            loss_boundary=dict(type='BoundaryLoss', loss_weight=1.0),
+            conv_seg_kernel_size=1,
+            ignore_index=19,
+            indices=4)
+    ],
+    train_cfg=None,
+    test_cfg=dict(use_knn=True, knn=7, search=7, sigma=1.0, cutoff=2.0))
+
+# optimizer
+# This schedule is mainly used on Semantickitti dataset in segmentation task
+optim_wrapper = dict(
+    type='AmpOptimWrapper',
+    loss_scale='dynamic',
+    optimizer=dict(
+        type='AdamW',
+        lr=0.04,
+        betas=(0.9, 0.999),
+        weight_decay=(0.01),
+        eps=0.000005))
+
+param_scheduler = [
+    dict(
+        type='OneCycleLR',
+        total_steps=50,
+        by_epoch=True,
+        eta_max=0.0025,
+        pct_start=0.2,
+        div_factor=25.0,
+        final_div_factor=100.0,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=50, val_interval=1)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (4 GPUs) x (4 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
diff --git a/mmde/projects/CenterFormer/README.md b/mmde/projects/CenterFormer/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f84556b69551fbf86f2f799bd26c9366e89892ee
--- /dev/null
+++ b/mmde/projects/CenterFormer/README.md
@@ -0,0 +1,82 @@
+# CenterFormer: Center-based Transformer for 3D Object Detection
+
+> [CenterFormer: Center-based Transformer for 3D Object Detection](https://arxiv.org/abs/2209.05588)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Query-based transformer has shown great potential in con-
+structing long-range attention in many image-domain tasks, but has
+rarely been considered in LiDAR-based 3D object detection due to the
+overwhelming size of the point cloud data. In this paper, we propose
+CenterFormer, a center-based transformer network for 3D object de-
+tection. CenterFormer first uses a center heatmap to select center candi-
+dates on top of a standard voxel-based point cloud encoder. It then uses
+the feature of the center candidate as the query embedding in the trans-
+former. To further aggregate features from multiple frames, we design
+an approach to fuse features through cross-attention. Lastly, regression
+heads are added to predict the bounding box on the output center feature
+representation. Our design reduces the convergence difficulty and compu-
+tational complexity of the transformer structure. The results show signif-
+icant improvements over the strong baseline of anchor-free object detec-
+tion networks. CenterFormer achieves state-of-the-art performance for a
+single model on the Waymo Open Dataset, with 73.7% mAPH on the val-
+idation set and 75.6% mAPH on the test set, significantly outperforming
+all previously published CNN and transformer-based methods. Our code
+is publicly available at https://github.com/TuSimple/centerformer
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/34888372/209500088-b707d7cd-d4d5-4f20-8fdf-a2c7ad15df34.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement CenterFormer and provide the results and checkpoints on Waymo dataset.
+
+## Usage
+
+<!-- For a typical model, this section should contain the commands for training and testing. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. -->
+
+### Training commands
+
+In MMDetection3D's root directory, run the following command to train the model:
+
+```bash
+python tools/train.py projects/CenterFormer/configs/centerformer_voxel01_second-atten_secfpn-atten_4xb4-cyclic-20e_waymoD5-3d-3class.py
+```
+
+For multi-gpu training, run:
+
+```bash
+python -m torch.distributed.launch --nnodes=1 --node_rank=0 --nproc_per_node=${NUM_GPUS} --master_port=29506 --master_addr="127.0.0.1" tools/train.py projects/CenterFormer/configs/centerformer_voxel01_second-atten_secfpn-atten_4xb4-cyclic-20e_waymoD5-3d-3class.py
+```
+
+### Testing commands
+
+In MMDetection3D's root directory, run the following command to test the model:
+
+```bash
+python tools/test.py projects/CenterFormer/configs/centerformer_voxel01_second-atten_secfpn-atten_4xb4-cyclic-20e_waymoD5-3d-3class.py ${CHECKPOINT_PATH}
+```
+
+## Results and models
+
+### Waymo
+
+|                                                      Backbone                                                       | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** |                                                                                                                                 Download                                                                                                                                  |
+| :-----------------------------------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :------: | :------------: | :----: | :-----: | :----: | :---------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [SECFPN_WithAttention](./configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py) |       5       |       voxel (0.1)       |        ✓        |      ×       |   14.8   |                |  72.2  |  69.5   |  65.9  |    63.3     | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/centerformer/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class_20221227_205613-70c9ad37.log) |
+
+**Note** that `SECFPN_WithAttention` denotes both SECOND and SECONDFPN with ChannelAttention and SpatialAttention.
+
+## Citation
+
+```latex
+@InProceedings{Zhou_centerformer,
+title = {CenterFormer: Center-based Transformer for 3D Object Detection},
+author = {Zhou, Zixiang and Zhao, Xiangchen and Wang, Yu and Wang, Panqu and Foroosh, Hassan},
+booktitle = {ECCV},
+year = {2022}
+}
+```
diff --git a/mmde/projects/CenterFormer/centerformer/__init__.py b/mmde/projects/CenterFormer/centerformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bd38cd670cc87c8fbd1db20a326886ddbb822e1
--- /dev/null
+++ b/mmde/projects/CenterFormer/centerformer/__init__.py
@@ -0,0 +1,11 @@
+from .bbox_ops import nms_iou3d
+from .centerformer import CenterFormer
+from .centerformer_backbone import (DeformableDecoderRPN,
+                                    MultiFrameDeformableDecoderRPN)
+from .centerformer_head import CenterFormerBboxHead
+from .losses import FastFocalLoss
+
+__all__ = [
+    'CenterFormer', 'DeformableDecoderRPN', 'CenterFormerBboxHead',
+    'FastFocalLoss', 'nms_iou3d', 'MultiFrameDeformableDecoderRPN'
+]
diff --git a/mmde/projects/CenterFormer/centerformer/bbox_ops.py b/mmde/projects/CenterFormer/centerformer/bbox_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..dca5d767e604ddddfcc6c7f599960c6c45d74618
--- /dev/null
+++ b/mmde/projects/CenterFormer/centerformer/bbox_ops.py
@@ -0,0 +1,41 @@
+import torch
+from mmcv.utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['iou3d_nms3d_forward'])
+
+
+def nms_iou3d(boxes, scores, thresh, pre_maxsize=None, post_max_size=None):
+    """NMS function GPU implementation (using IoU3D). The difference between
+    this implementation and nms3d in MMCV is that we add `pre_maxsize` and
+    `post_max_size` before and after NMS respectively.
+
+     Args:
+        boxes (Tensor): Input boxes with the shape of [N, 7]
+            ([cx, cy, cz, l, w, h, theta]).
+        scores (Tensor): Scores of boxes with the shape of [N].
+        thresh (float): Overlap threshold of NMS.
+        pre_max_size (int, optional): Max size of boxes before NMS.
+            Defaults to None.
+        post_max_size (int, optional): Max size of boxes after NMS.
+            Defaults to None.
+
+    Returns:
+        Tensor: Indexes after NMS.
+    """
+    # TODO: directly refactor ``nms3d`` in MMCV
+    assert boxes.size(1) == 7, 'Input boxes shape should be (N, 7)'
+    order = scores.sort(0, descending=True)[1]
+    if pre_maxsize is not None:
+        order = order[:pre_maxsize]
+    boxes = boxes[order].contiguous()
+
+    keep = boxes.new_zeros(boxes.size(0), dtype=torch.long)
+    num_out = boxes.new_zeros(size=(), dtype=torch.long)
+    ext_module.iou3d_nms3d_forward(
+        boxes, keep, num_out, nms_overlap_thresh=thresh)
+    keep = order[keep[:num_out].to(boxes.device)].contiguous()
+
+    if post_max_size is not None:
+        keep = keep[:post_max_size]
+
+    return keep
diff --git a/mmde/projects/CenterFormer/centerformer/centerformer.py b/mmde/projects/CenterFormer/centerformer/centerformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b8b64dcd936393bf92b3a8dd003076c445da963
--- /dev/null
+++ b/mmde/projects/CenterFormer/centerformer/centerformer.py
@@ -0,0 +1,180 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional
+
+import torch
+from torch import Tensor
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet3d.models.detectors import Base3DDetector
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+
+
+@MODELS.register_module()
+class CenterFormer(Base3DDetector):
+    """Base class of center-based 3D detector.
+
+    Args:
+        voxel_encoder (dict, optional): Point voxelization
+            encoder layer. Defaults to None.
+        middle_encoder (dict, optional): Middle encoder layer
+            of points cloud modality. Defaults to None.
+        pts_fusion_layer (dict, optional): Fusion layer.
+            Defaults to None.
+        backbone (dict, optional): Backbone of extracting
+            points features. Defaults to None.
+        neck (dict, optional): Neck of extracting
+            points features. Defaults to None.
+        bbox_head (dict, optional): Bboxes head of
+            point cloud modality. Defaults to None.
+        train_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        test_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`Det3DDataPreprocessor`. Defaults to None.
+    """
+
+    def __init__(self,
+                 voxel_encoder: Optional[dict] = None,
+                 middle_encoder: Optional[dict] = None,
+                 backbone: Optional[dict] = None,
+                 neck: Optional[dict] = None,
+                 bbox_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 data_preprocessor: Optional[dict] = None,
+                 **kwargs):
+        super(CenterFormer, self).__init__(
+            init_cfg=init_cfg, data_preprocessor=data_preprocessor, **kwargs)
+
+        if voxel_encoder:
+            self.voxel_encoder = MODELS.build(voxel_encoder)
+        if middle_encoder:
+            self.middle_encoder = MODELS.build(middle_encoder)
+        if backbone:
+            backbone.update(train_cfg=train_cfg, test_cfg=test_cfg)
+            self.backbone = MODELS.build(backbone)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+        if bbox_head:
+            bbox_head.update(train_cfg=train_cfg, test_cfg=test_cfg)
+            self.bbox_head = MODELS.build(bbox_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def init_weights(self):
+        for m in self.modules():
+            if isinstance(m, _BatchNorm):
+                torch.nn.init.uniform_(m.weight)
+
+    @property
+    def with_bbox(self):
+        """bool: Whether the detector has a 3D box head."""
+        return hasattr(self, 'bbox_head') and self.bbox_head is not None
+
+    @property
+    def with_backbone(self):
+        """bool: Whether the detector has a 3D backbone."""
+        return hasattr(self, 'backbone') and self.backbone is not None
+
+    @property
+    def with_voxel_encoder(self):
+        """bool: Whether the detector has a voxel encoder."""
+        return hasattr(self,
+                       'voxel_encoder') and self.voxel_encoder is not None
+
+    @property
+    def with_middle_encoder(self):
+        """bool: Whether the detector has a middle encoder."""
+        return hasattr(self,
+                       'middle_encoder') and self.middle_encoder is not None
+
+    def _forward(self):
+        pass
+
+    def extract_feat(self, batch_inputs_dict: dict,
+                     batch_input_metas: List[dict]) -> tuple:
+        """Extract features from images and points.
+        Args:
+            batch_inputs_dict (dict): Dict of batch inputs. It
+                contains
+                - points (List[tensor]):  Point cloud of multiple inputs.
+                - imgs (tensor): Image tensor with shape (B, C, H, W).
+            batch_input_metas (list[dict]): Meta information of multiple inputs
+                in a batch.
+        Returns:
+             tuple: Two elements in tuple arrange as
+             image features and point cloud features.
+        """
+        voxel_dict = batch_inputs_dict.get('voxels', None)
+        voxel_features, feature_coors = self.voxel_encoder(
+            voxel_dict['voxels'], voxel_dict['coors'])
+        batch_size = voxel_dict['coors'][-1, 0].item() + 1
+        x = self.middle_encoder(voxel_features, feature_coors, batch_size)
+
+        return x
+
+    def loss(self, batch_inputs_dict: Dict[List, torch.Tensor],
+             batch_data_samples: List[Det3DDataSample],
+             **kwargs) -> List[Det3DDataSample]:
+        """
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' and `imgs` keys.
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor): Tensor of batch images, has shape
+                  (B, C, H ,W)
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, .
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        pts_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
+        preds, batch_tatgets = self.backbone(pts_feats, batch_data_samples)
+        preds = self.bbox_head(preds)
+        losses = dict()
+        losses.update(self.bbox_head.loss(preds, batch_tatgets))
+        return losses
+        # return self.bbox_head.predict(preds, batch_tatgets)
+
+    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[Det3DDataSample]:
+        """Forward of testing.
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' keys.
+                - points (list[torch.Tensor]): Point cloud of each sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input sample. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
+                contains a tensor with shape (num_instances, 7).
+        """
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        pts_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
+        preds, _ = self.backbone(pts_feats, batch_data_samples)
+
+        preds = self.bbox_head(preds)
+        results_list_3d = self.bbox_head.predict(preds, batch_input_metas)
+
+        detsamples = self.add_pred_to_datasample(batch_data_samples,
+                                                 results_list_3d)
+        return detsamples
diff --git a/mmde/projects/CenterFormer/centerformer/centerformer_backbone.py b/mmde/projects/CenterFormer/centerformer/centerformer_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c624716243f75274fe150837eeb22ab6569661f
--- /dev/null
+++ b/mmde/projects/CenterFormer/centerformer/centerformer_backbone.py
@@ -0,0 +1,980 @@
+# modify from https://github.com/TuSimple/centerformer/blob/master/det3d/models/necks/rpn_transformer.py # noqa
+
+from typing import List, Tuple
+
+import numpy as np
+import torch
+from mmcv.cnn import build_norm_layer
+from mmdet.models.utils import multi_apply
+from mmengine.logging import print_log
+from mmengine.structures import InstanceData
+from torch import Tensor, nn
+
+from mmdet3d.models.utils import draw_heatmap_gaussian, gaussian_radius
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import center_to_corner_box2d
+from .transformer import DeformableTransformerDecoder
+
+
+class ChannelAttention(nn.Module):
+
+    def __init__(self, in_planes, ratio=16):
+        super(ChannelAttention, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.max_pool = nn.AdaptiveMaxPool2d(1)
+
+        self.fc = nn.Sequential(
+            nn.Conv2d(in_planes, in_planes // 16, 1, bias=False),
+            nn.ReLU(),
+            nn.Conv2d(in_planes // 16, in_planes, 1, bias=False),
+        )
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        avg_out = self.fc(self.avg_pool(x))
+        max_out = self.fc(self.max_pool(x))
+        out = avg_out + max_out
+        return self.sigmoid(out) * x
+
+
+class SpatialAttention(nn.Module):
+
+    def __init__(self, kernel_size=7):
+        super(SpatialAttention, self).__init__()
+
+        self.conv1 = nn.Conv2d(
+            2, 1, kernel_size, padding=kernel_size // 2, bias=False)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        avg_out = torch.mean(x, dim=1, keepdim=True)
+        max_out, _ = torch.max(x, dim=1, keepdim=True)
+        y = torch.cat([avg_out, max_out], dim=1)
+        y = self.conv1(y)
+        return self.sigmoid(y) * x
+
+
+class MultiFrameSpatialAttention(nn.Module):
+
+    def __init__(self, kernel_size=7):
+        super(MultiFrameSpatialAttention, self).__init__()
+
+        self.conv1 = nn.Conv2d(
+            2, 1, kernel_size, padding=kernel_size // 2, bias=False)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, curr, prev):
+        avg_out = torch.mean(curr, dim=1, keepdim=True)
+        max_out, _ = torch.max(curr, dim=1, keepdim=True)
+        y = torch.cat([avg_out, max_out], dim=1)
+        y = self.conv1(y)
+        return self.sigmoid(y) * prev
+
+
+class BaseDecoderRPN(nn.Module):
+
+    def __init__(
+            self,
+            layer_nums,  # [2,2,2]
+            ds_num_filters,  # [128,256,64]
+            num_input_features,  # 256
+            transformer_config=None,
+            hm_head_layer=2,
+            corner_head_layer=2,
+            corner=False,
+            assign_label_window_size=1,
+            classes=3,
+            use_gt_training=False,
+            norm_cfg=None,
+            logger=None,
+            init_bias=-2.19,
+            score_threshold=0.1,
+            obj_num=500,
+            **kwargs):
+        super(BaseDecoderRPN, self).__init__()
+        self._layer_strides = [1, 2, -4]
+        self._num_filters = ds_num_filters
+        self._layer_nums = layer_nums
+        self._num_input_features = num_input_features
+        self.score_threshold = score_threshold
+        self.transformer_config = transformer_config
+        self.corner = corner
+        self.obj_num = obj_num
+        self.use_gt_training = use_gt_training
+        self.window_size = assign_label_window_size**2
+        self.cross_attention_kernel_size = [3, 3, 3]
+        self.batch_id = None
+
+        if norm_cfg is None:
+            norm_cfg = dict(type='BN', eps=1e-3, momentum=0.01)
+        self._norm_cfg = norm_cfg
+
+        assert len(self._layer_strides) == len(self._layer_nums)
+        assert len(self._num_filters) == len(self._layer_nums)
+        assert self.transformer_config is not None
+
+        in_filters = [
+            self._num_input_features,
+            self._num_filters[0],
+            self._num_filters[1],
+        ]
+        blocks = []
+
+        for i, layer_num in enumerate(self._layer_nums):
+            block, num_out_filters = self._make_layer(
+                in_filters[i],
+                self._num_filters[i],
+                layer_num,
+                stride=self._layer_strides[i],
+            )
+            blocks.append(block)
+        self.blocks = nn.ModuleList(blocks)
+        self.up = nn.Sequential(
+            nn.ConvTranspose2d(
+                self._num_filters[0],
+                self._num_filters[2],
+                2,
+                stride=2,
+                bias=False),
+            build_norm_layer(self._norm_cfg, self._num_filters[2])[1],
+            nn.ReLU())
+        # heatmap prediction
+        hm_head = []
+        for i in range(hm_head_layer - 1):
+            hm_head.append(
+                nn.Conv2d(
+                    self._num_filters[-1] * 2,
+                    64,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=True,
+                ))
+            hm_head.append(build_norm_layer(self._norm_cfg, 64)[1])
+            hm_head.append(nn.ReLU())
+
+        hm_head.append(
+            nn.Conv2d(
+                64, classes, kernel_size=3, stride=1, padding=1, bias=True))
+        hm_head[-1].bias.data.fill_(init_bias)
+        self.hm_head = nn.Sequential(*hm_head)
+
+        if self.corner:
+            self.corner_head = []
+            for i in range(corner_head_layer - 1):
+                self.corner_head.append(
+                    nn.Conv2d(
+                        self._num_filters[-1] * 2,
+                        64,
+                        kernel_size=3,
+                        stride=1,
+                        padding=1,
+                        bias=True,
+                    ))
+                self.corner_head.append(
+                    build_norm_layer(self._norm_cfg, 64)[1])
+                self.corner_head.append(nn.ReLU())
+
+            self.corner_head.append(
+                nn.Conv2d(
+                    64, 1, kernel_size=3, stride=1, padding=1, bias=True))
+            self.corner_head[-1].bias.data.fill_(init_bias)
+            self.corner_head = nn.Sequential(*self.corner_head)
+
+    def _make_layer(self, inplanes, planes, num_blocks, stride=1):
+
+        if stride > 0:
+            block = [
+                nn.ZeroPad2d(1),
+                nn.Conv2d(inplanes, planes, 3, stride=stride, bias=False),
+                build_norm_layer(self._norm_cfg, planes)[1],
+                nn.ReLU(),
+            ]
+        else:
+            block = [
+                nn.ConvTranspose2d(
+                    inplanes, planes, -stride, stride=-stride, bias=False),
+                build_norm_layer(self._norm_cfg, planes)[1],
+                nn.ReLU(),
+            ]
+
+        for j in range(num_blocks):
+            block.append(nn.Conv2d(planes, planes, 3, padding=1, bias=False))
+            block.append(build_norm_layer(self._norm_cfg, planes)[1], )
+            block.append(nn.ReLU())
+
+        block.append(ChannelAttention(planes))
+        block.append(SpatialAttention())
+        block = nn.Sequential(*block)
+
+        return block, planes
+
+    def forward(self, x, example=None):
+        pass
+
+    def get_multi_scale_feature(self, center_pos, feats):
+        """
+        Args:
+            center_pos: center coor at the lowest scale feature map [B 500 2]
+            feats: multi scale BEV feature 3*[B C H W]
+        Returns:
+            neighbor_feat: [B 500 K C]
+            neighbor_pos: [B 500 K 2]
+        """
+        kernel_size = self.cross_attention_kernel_size
+        batch, num_cls, H, W = feats[0].size()
+
+        center_num = center_pos.shape[1]
+
+        relative_pos_list = []
+        neighbor_feat_list = []
+        for i, k in enumerate(kernel_size):
+            neighbor_coords = torch.arange(-(k // 2), (k // 2) + 1)
+            neighbor_coords = torch.flatten(
+                torch.stack(
+                    torch.meshgrid([neighbor_coords, neighbor_coords]), dim=0),
+                1,
+            )  # [2, k]
+            neighbor_coords = (neighbor_coords.permute(
+                1,
+                0).contiguous().to(center_pos))  # relative coordinate [k, 2]
+            neighbor_coords = (center_pos[:, :, None, :] // (2**i) +
+                               neighbor_coords[None, None, :, :]
+                               )  # coordinates [B, 500, k, 2]
+            neighbor_coords = torch.clamp(
+                neighbor_coords, min=0,
+                max=H // (2**i) - 1)  # prevent out of bound
+            feat_id = (neighbor_coords[:, :, :, 1] * (W // (2**i)) +
+                       neighbor_coords[:, :, :, 0])  # pixel id [B, 500, k]
+            feat_id = feat_id.reshape(batch, -1)  # pixel id [B, 500*k]
+            selected_feat = (
+                feats[i].reshape(batch, num_cls, (H * W) // (4**i)).permute(
+                    0, 2, 1).contiguous()[self.batch_id.repeat(1, k**2),
+                                          feat_id])  # B, 500*k, C
+            neighbor_feat_list.append(
+                selected_feat.reshape(batch, center_num, -1,
+                                      num_cls))  # B, 500, k, C
+            relative_pos_list.append(neighbor_coords * (2**i))  # B, 500, k, 2
+
+        neighbor_pos = torch.cat(relative_pos_list, dim=2)  # B, 500, K, 2/3
+        neighbor_feats = torch.cat(neighbor_feat_list, dim=2)  # B, 500, K, C
+        return neighbor_feats, neighbor_pos
+
+    def get_multi_scale_feature_multiframe(self, center_pos, feats, timeframe):
+        """
+        Args:
+            center_pos: center coor at the lowest scale feature map [B 500 2]
+            feats: multi scale BEV feature (3+k)*[B C H W]
+            timeframe: timeframe [B,k]
+        Returns:
+            neighbor_feat: [B 500 K C]
+            neighbor_pos: [B 500 K 2]
+            neighbor_time: [B 500 K 1]
+        """
+        kernel_size = self.cross_attention_kernel_size
+        batch, num_cls, H, W = feats[0].size()
+
+        center_num = center_pos.shape[1]
+
+        relative_pos_list = []
+        neighbor_feat_list = []
+        timeframe_list = []
+        for i, k in enumerate(kernel_size):
+            neighbor_coords = torch.arange(-(k // 2), (k // 2) + 1)
+            neighbor_coords = torch.flatten(
+                torch.stack(
+                    torch.meshgrid([neighbor_coords, neighbor_coords]), dim=0),
+                1,
+            )  # [2, k]
+            neighbor_coords = (neighbor_coords.permute(
+                1,
+                0).contiguous().to(center_pos))  # relative coordinate [k, 2]
+            neighbor_coords = (center_pos[:, :, None, :] // (2**i) +
+                               neighbor_coords[None, None, :, :]
+                               )  # coordinates [B, 500, k, 2]
+            neighbor_coords = torch.clamp(
+                neighbor_coords, min=0,
+                max=H // (2**i) - 1)  # prevent out of bound
+            feat_id = (neighbor_coords[:, :, :, 1] * (W // (2**i)) +
+                       neighbor_coords[:, :, :, 0])  # pixel id [B, 500, k]
+            feat_id = feat_id.reshape(batch, -1)  # pixel id [B, 500*k]
+            selected_feat = (
+                feats[i].reshape(batch, num_cls, (H * W) // (4**i)).permute(
+                    0, 2, 1).contiguous()[self.batch_id.repeat(1, k**2),
+                                          feat_id])  # B, 500*k, C
+            neighbor_feat_list.append(
+                selected_feat.reshape(batch, center_num, -1,
+                                      num_cls))  # B, 500, k, C
+            relative_pos_list.append(neighbor_coords * (2**i))  # B, 500, k, 2
+            timeframe_list.append(
+                torch.full_like(neighbor_coords[:, :, :, 0:1], 0))  # B, 500, k
+            if i == 0:
+                # add previous frame feature
+                for frame_num in range(feats[-1].shape[1]):
+                    selected_feat = (feats[-1][:, frame_num, :, :, :].reshape(
+                        batch, num_cls, (H * W) // (4**i)).permute(
+                            0, 2,
+                            1).contiguous()[self.batch_id.repeat(1, k**2),
+                                            feat_id])  # B, 500*k, C
+                    neighbor_feat_list.append(
+                        selected_feat.reshape(batch, center_num, -1, num_cls))
+                    relative_pos_list.append(neighbor_coords * (2**i))
+                    time = timeframe[:, frame_num + 1].to(selected_feat)  # B
+                    timeframe_list.append(
+                        time[:, None, None, None] * torch.full_like(
+                            neighbor_coords[:, :, :, 0:1], 1))  # B, 500, k
+
+        neighbor_pos = torch.cat(relative_pos_list, dim=2)  # B, 500, K, 2/3
+        neighbor_feats = torch.cat(neighbor_feat_list, dim=2)  # B, 500, K, C
+        neighbor_time = torch.cat(timeframe_list, dim=2)  # B, 500, K, 1
+
+        return neighbor_feats, neighbor_pos, neighbor_time
+
+
+@MODELS.register_module()
+class DeformableDecoderRPN(BaseDecoderRPN):
+    """The original implement of CenterFormer modules.
+
+    It fuse the backbone, neck and heatmap head into one module. The backbone
+    is `SECOND` with attention and the neck is `SECONDFPN` with attention.
+
+    TODO: split this module into backbone、neck and head.
+    """
+
+    def __init__(self,
+                 layer_nums,
+                 ds_num_filters,
+                 num_input_features,
+                 tasks=dict(),
+                 transformer_config=None,
+                 hm_head_layer=2,
+                 corner_head_layer=2,
+                 corner=False,
+                 parametric_embedding=False,
+                 assign_label_window_size=1,
+                 classes=3,
+                 use_gt_training=False,
+                 norm_cfg=None,
+                 logger=None,
+                 init_bias=-2.19,
+                 score_threshold=0.1,
+                 obj_num=500,
+                 train_cfg=None,
+                 test_cfg=None,
+                 **kwargs):
+        super(DeformableDecoderRPN, self).__init__(
+            layer_nums,
+            ds_num_filters,
+            num_input_features,
+            transformer_config,
+            hm_head_layer,
+            corner_head_layer,
+            corner,
+            assign_label_window_size,
+            classes,
+            use_gt_training,
+            norm_cfg,
+            logger,
+            init_bias,
+            score_threshold,
+            obj_num,
+        )
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.tasks = tasks
+        self.class_names = [t['class_names'] for t in tasks]
+
+        self.transformer_decoder = DeformableTransformerDecoder(
+            self._num_filters[-1] * 2,
+            depth=transformer_config.depth,
+            n_heads=transformer_config.n_heads,
+            dim_single_head=transformer_config.dim_single_head,
+            dim_ffn=transformer_config.dim_ffn,
+            dropout=transformer_config.dropout,
+            out_attention=transformer_config.out_attn,
+            n_points=transformer_config.get('n_points', 9),
+        )
+        self.pos_embedding_type = transformer_config.get(
+            'pos_embedding_type', 'linear')
+        if self.pos_embedding_type == 'linear':
+            self.pos_embedding = nn.Linear(2, self._num_filters[-1] * 2)
+        else:
+            raise NotImplementedError()
+        self.parametric_embedding = parametric_embedding
+        if self.parametric_embedding:
+            self.query_embed = nn.Embedding(self.obj_num,
+                                            self._num_filters[-1] * 2)
+            nn.init.uniform_(self.query_embed.weight, -1.0, 1.0)
+
+        print_log('Finish RPN_transformer_deformable Initialization',
+                  'current')
+
+    def _sigmoid(self, x):
+        y = torch.clamp(x.sigmoid_(), min=1e-4, max=1 - 1e-4)
+        return y
+
+    def forward(self, x, batch_data_samples):
+
+        batch_gt_instance_3d = []
+        for data_sample in batch_data_samples:
+            batch_gt_instance_3d.append(data_sample.gt_instances_3d)
+
+        # FPN
+        x = self.blocks[0](x)
+        x_down = self.blocks[1](x)
+        x_up = torch.cat([self.blocks[2](x_down), self.up(x)], dim=1)
+
+        # heatmap head
+        hm = self.hm_head(x_up)
+
+        if self.corner and self.corner_head.training:
+            corner_hm = self.corner_head(x_up)
+            corner_hm = self._sigmoid(corner_hm)
+
+        # find top K center location
+        hm = self._sigmoid(hm)
+        batch, num_cls, H, W = hm.size()
+
+        scores, labels = torch.max(
+            hm.reshape(batch, num_cls, H * W), dim=1)  # b,H*W
+        self.batch_id = torch.from_numpy(np.indices(
+            (batch, self.obj_num))[0]).to(labels)
+
+        if self.training:
+            heatmaps, anno_boxes, gt_inds, gt_masks, corner_heatmaps, cat_labels = self.get_targets(  # noqa: E501
+                batch_gt_instance_3d)
+            batch_targets = dict(
+                ind=gt_inds,
+                mask=gt_masks,
+                hm=heatmaps,
+                anno_box=anno_boxes,
+                corners=corner_heatmaps,
+                cat=cat_labels)
+            inds = gt_inds[0][:, (self.window_size // 2)::self.window_size]
+            masks = gt_masks[0][:, (self.window_size // 2)::self.window_size]
+            batch_id_gt = torch.from_numpy(
+                np.indices((batch, inds.shape[1]))[0]).to(labels)
+            scores[batch_id_gt, inds] = scores[batch_id_gt, inds] + masks
+            order = scores.sort(1, descending=True)[1]
+            order = order[:, :self.obj_num]
+            scores[batch_id_gt, inds] = scores[batch_id_gt, inds] - masks
+        else:
+            order = scores.sort(1, descending=True)[1]
+            order = order[:, :self.obj_num]
+            batch_targets = None
+
+        scores = torch.gather(scores, 1, order)
+        labels = torch.gather(labels, 1, order)
+        mask = scores > self.score_threshold
+
+        ct_feat = x_up.reshape(batch, -1, H * W).transpose(2, 1).contiguous()
+        ct_feat = ct_feat[self.batch_id, order]  # B, 500, C
+
+        # create position embedding for each center
+        y_coor = order // W
+        x_coor = order - y_coor * W
+        y_coor, x_coor = y_coor.to(ct_feat), x_coor.to(ct_feat)
+        y_coor, x_coor = y_coor / H, x_coor / W
+        pos_features = torch.stack([x_coor, y_coor], dim=2)
+
+        if self.parametric_embedding:
+            ct_feat = self.query_embed.weight
+            ct_feat = ct_feat.unsqueeze(0).expand(batch, -1, -1)
+
+        # run transformer
+        src = torch.cat(
+            (
+                x_up.reshape(batch, -1, H * W).transpose(2, 1).contiguous(),
+                x.reshape(batch, -1,
+                          (H * W) // 4).transpose(2, 1).contiguous(),
+                x_down.reshape(batch, -1,
+                               (H * W) // 16).transpose(2, 1).contiguous(),
+            ),
+            dim=1,
+        )  # B ,sum(H*W), C
+        spatial_shapes = torch.as_tensor(
+            [(H, W), (H // 2, W // 2), (H // 4, W // 4)],
+            dtype=torch.long,
+            device=ct_feat.device,
+        )
+        level_start_index = torch.cat((
+            spatial_shapes.new_zeros((1, )),
+            spatial_shapes.prod(1).cumsum(0)[:-1],
+        ))
+
+        transformer_out = self.transformer_decoder(
+            ct_feat,
+            self.pos_embedding,
+            src,
+            spatial_shapes,
+            level_start_index,
+            center_pos=pos_features,
+        )  # (B,N,C)
+
+        ct_feat = (transformer_out['ct_feat'].transpose(2, 1).contiguous()
+                   )  # B, C, 500
+
+        out_dict = {
+            'hm': hm,
+            'scores': scores,
+            'labels': labels,
+            'order': order,
+            'ct_feat': ct_feat,
+            'mask': mask,
+        }
+        if 'out_attention' in transformer_out:
+            out_dict.update(
+                {'out_attention': transformer_out['out_attention']})
+        if self.corner and self.corner_head.training:
+            out_dict.update({'corner_hm': corner_hm})
+
+        return out_dict, batch_targets
+
+    def get_targets(
+        self,
+        batch_gt_instances_3d: List[InstanceData],
+    ) -> Tuple[List[Tensor]]:
+        """Generate targets. How each output is transformed: Each nested list
+        is transposed so that all same-index elements in each sub-list (1, ...,
+        N) become the new sub-lists.
+
+                [ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ]
+                ==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ]
+            The new transposed nested list is converted into a list of N
+            tensors generated by concatenating tensors in the new sub-lists.
+                [ tensor0, tensor1, tensor2, ... ]
+        Args:
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+        Returns:
+            Returns:
+                tuple[list[torch.Tensor]]: Tuple of target including
+                    the following results in order.
+                - list[torch.Tensor]: Heatmap scores.
+                - list[torch.Tensor]: Ground truth boxes.
+                - list[torch.Tensor]: Indexes indicating the
+                    position of the valid boxes.
+                - list[torch.Tensor]: Masks indicating which
+                    boxes are valid.
+                - list[torch.Tensor]: catagrate labels.
+        """
+        heatmaps, anno_boxes, inds, masks, corner_heatmaps, cat_labels = multi_apply(  # noqa: E501
+            self.get_targets_single, batch_gt_instances_3d)
+        # Transpose heatmaps
+        heatmaps = list(map(list, zip(*heatmaps)))
+        heatmaps = [torch.stack(hms_) for hms_ in heatmaps]
+        # Transpose heatmaps
+        corner_heatmaps = list(map(list, zip(*corner_heatmaps)))
+        corner_heatmaps = [torch.stack(hms_) for hms_ in corner_heatmaps]
+        # Transpose anno_boxes
+        anno_boxes = list(map(list, zip(*anno_boxes)))
+        anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes]
+        # Transpose inds
+        inds = list(map(list, zip(*inds)))
+        inds = [torch.stack(inds_) for inds_ in inds]
+        # Transpose inds
+        masks = list(map(list, zip(*masks)))
+        masks = [torch.stack(masks_) for masks_ in masks]
+        # Transpose cat_labels
+        cat_labels = list(map(list, zip(*cat_labels)))
+        cat_labels = [torch.stack(labels_) for labels_ in cat_labels]
+        return heatmaps, anno_boxes, inds, masks, corner_heatmaps, cat_labels
+
+    def get_targets_single(self,
+                           gt_instances_3d: InstanceData) -> Tuple[Tensor]:
+        """Generate training targets for a single sample.
+        Args:
+            gt_instances_3d (:obj:`InstanceData`): Gt_instances of
+                single data sample. It usually includes
+                ``bboxes_3d`` and ``labels_3d`` attributes.
+        Returns:
+            tuple[list[torch.Tensor]]: Tuple of target including
+                the following results in order.
+                - list[torch.Tensor]: Heatmap scores.
+                - list[torch.Tensor]: Ground truth boxes.
+                - list[torch.Tensor]: Indexes indicating the position
+                    of the valid boxes.
+                - list[torch.Tensor]: Masks indicating which boxes
+                    are valid.
+                - list[torch.Tensor]: catagrate labels.
+        """
+        gt_labels_3d = gt_instances_3d.labels_3d
+        gt_bboxes_3d = gt_instances_3d.bboxes_3d
+        device = gt_labels_3d.device
+        gt_bboxes_3d = torch.cat(
+            (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]),
+            dim=1).to(device)
+        max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg']
+        grid_size = torch.tensor(self.train_cfg['grid_size'])
+        pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
+        voxel_size = torch.tensor(self.train_cfg['voxel_size'])
+
+        feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']
+
+        # reorganize the gt_dict by tasks
+        task_masks = []
+        flag = 0
+        for class_name in self.class_names:
+            task_masks.append([
+                torch.where(gt_labels_3d == class_name.index(i) + flag)
+                for i in class_name
+            ])
+            flag += len(class_name)
+
+        task_boxes = []
+        task_classes = []
+        flag2 = 0
+        for idx, mask in enumerate(task_masks):
+            task_box = []
+            task_class = []
+            for m in mask:
+                task_box.append(gt_bboxes_3d[m])
+                # 0 is background for each task, so we need to add 1 here.
+                task_class.append(gt_labels_3d[m] + 1 - flag2)
+            task_boxes.append(torch.cat(task_box, axis=0).to(device))
+            task_classes.append(torch.cat(task_class).long().to(device))
+            flag2 += len(mask)
+        draw_gaussian = draw_heatmap_gaussian
+        heatmaps, anno_boxes, inds, masks, corner_heatmaps, cat_labels = [], [], [], [], [], []  # noqa: E501
+
+        for idx in range(len(self.tasks)):
+            heatmap = gt_bboxes_3d.new_zeros(
+                (len(self.class_names[idx]), feature_map_size[1],
+                 feature_map_size[0]))
+            corner_heatmap = torch.zeros(
+                (1, feature_map_size[1], feature_map_size[0]),
+                dtype=torch.float32,
+                device=device)
+
+            anno_box = gt_bboxes_3d.new_zeros((max_objs, 8),
+                                              dtype=torch.float32)
+
+            ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64)
+            mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8)
+            cat_label = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.int64)
+
+            num_objs = min(task_boxes[idx].shape[0], max_objs)
+
+            for k in range(num_objs):
+                cls_id = task_classes[idx][k] - 1
+
+                # gt boxes [xyzlwhr]
+                length = task_boxes[idx][k][3]
+                width = task_boxes[idx][k][4]
+                length = length / voxel_size[0] / self.train_cfg[
+                    'out_size_factor']
+                width = width / voxel_size[1] / self.train_cfg[
+                    'out_size_factor']
+
+                if width > 0 and length > 0:
+                    radius = gaussian_radius(
+                        (width, length),
+                        min_overlap=self.train_cfg['gaussian_overlap'])
+                    radius = max(self.train_cfg['min_radius'], int(radius))
+
+                    # be really careful for the coordinate system of
+                    # your box annotation.
+                    x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][
+                        1], task_boxes[idx][k][2]
+
+                    coor_x = (
+                        x - pc_range[0]
+                    ) / voxel_size[0] / self.train_cfg['out_size_factor']
+                    coor_y = (
+                        y - pc_range[1]
+                    ) / voxel_size[1] / self.train_cfg['out_size_factor']
+
+                    center = torch.tensor([coor_x, coor_y],
+                                          dtype=torch.float32,
+                                          device=device)
+                    center_int = center.to(torch.int32)
+
+                    # throw out not in range objects to avoid out of array
+                    # area when creating the heatmap
+                    if not (0 <= center_int[0] < feature_map_size[0]
+                            and 0 <= center_int[1] < feature_map_size[1]):
+                        continue
+
+                    draw_gaussian(heatmap[cls_id], center_int, radius)
+
+                    radius = radius // 2
+                    # # draw four corner and center TODO: use torch
+                    rot = task_boxes[idx][k][6]
+                    corner_keypoints = center_to_corner_box2d(
+                        center.unsqueeze(0).cpu().numpy(),
+                        torch.tensor([[length, width]],
+                                     dtype=torch.float32).numpy(),
+                        angles=rot,
+                        origin=0.5)
+                    corner_keypoints = torch.from_numpy(corner_keypoints).to(
+                        center)
+
+                    draw_gaussian(corner_heatmap[0], center_int, radius)
+                    draw_gaussian(
+                        corner_heatmap[0],
+                        (corner_keypoints[0, 0] + corner_keypoints[0, 1]) / 2,
+                        radius)
+                    draw_gaussian(
+                        corner_heatmap[0],
+                        (corner_keypoints[0, 2] + corner_keypoints[0, 3]) / 2,
+                        radius)
+                    draw_gaussian(
+                        corner_heatmap[0],
+                        (corner_keypoints[0, 0] + corner_keypoints[0, 3]) / 2,
+                        radius)
+                    draw_gaussian(
+                        corner_heatmap[0],
+                        (corner_keypoints[0, 1] + corner_keypoints[0, 2]) / 2,
+                        radius)
+
+                    new_idx = k
+                    x, y = center_int[0], center_int[1]
+
+                    assert (y * feature_map_size[0] + x <
+                            feature_map_size[0] * feature_map_size[1])
+
+                    ind[new_idx] = y * feature_map_size[0] + x
+                    mask[new_idx] = 1
+                    cat_label[new_idx] = cls_id
+                    # TODO: support other outdoor dataset
+                    # vx, vy = task_boxes[idx][k][7:]
+                    rot = task_boxes[idx][k][6]
+                    box_dim = task_boxes[idx][k][3:6]
+                    box_dim = box_dim.log()
+                    anno_box[new_idx] = torch.cat([
+                        center - torch.tensor([x, y], device=device),
+                        z.unsqueeze(0), box_dim,
+                        torch.sin(rot).unsqueeze(0),
+                        torch.cos(rot).unsqueeze(0)
+                    ])
+
+            heatmaps.append(heatmap)
+            corner_heatmaps.append(corner_heatmap)
+            anno_boxes.append(anno_box)
+            masks.append(mask)
+            inds.append(ind)
+            cat_labels.append(cat_label)
+        return heatmaps, anno_boxes, inds, masks, corner_heatmaps, cat_labels
+
+
+@MODELS.register_module()
+class MultiFrameDeformableDecoderRPN(BaseDecoderRPN):
+    """The original implementation of CenterFormer modules.
+
+    The difference between this module and
+    `DeformableDecoderRPN` is that this module uses information from multi
+    frames.
+
+    TODO: split this module into backbone、neck and head.
+    """
+
+    def __init__(
+            self,
+            layer_nums,  # [2,2,2]
+            ds_num_filters,  # [128,256,64]
+            num_input_features,  # 256
+            transformer_config=None,
+            hm_head_layer=2,
+            corner_head_layer=2,
+            corner=False,
+            parametric_embedding=False,
+            assign_label_window_size=1,
+            classes=3,
+            use_gt_training=False,
+            norm_cfg=None,
+            logger=None,
+            init_bias=-2.19,
+            score_threshold=0.1,
+            obj_num=500,
+            frame=1,
+            **kwargs):
+        super(MultiFrameDeformableDecoderRPN, self).__init__(
+            layer_nums,
+            ds_num_filters,
+            num_input_features,
+            transformer_config,
+            hm_head_layer,
+            corner_head_layer,
+            corner,
+            assign_label_window_size,
+            classes,
+            use_gt_training,
+            norm_cfg,
+            logger,
+            init_bias,
+            score_threshold,
+            obj_num,
+        )
+        self.frame = frame
+
+        self.out = nn.Sequential(
+            nn.Conv2d(
+                self._num_filters[0] * frame,
+                self._num_filters[0],
+                3,
+                padding=1,
+                bias=False,
+            ),
+            build_norm_layer(self._norm_cfg, self._num_filters[0])[1],
+            nn.ReLU(),
+        )
+        self.mtf_attention = MultiFrameSpatialAttention()
+        self.time_embedding = nn.Linear(1, self._num_filters[0])
+
+        self.transformer_decoder = DeformableTransformerDecoder(
+            self._num_filters[-1] * 2,
+            depth=transformer_config.depth,
+            n_heads=transformer_config.n_heads,
+            n_levels=2 + self.frame,
+            dim_single_head=transformer_config.dim_single_head,
+            dim_ffn=transformer_config.dim_ffn,
+            dropout=transformer_config.dropout,
+            out_attention=transformer_config.out_attn,
+            n_points=transformer_config.get('n_points', 9),
+        )
+        self.pos_embedding_type = transformer_config.get(
+            'pos_embedding_type', 'linear')
+        if self.pos_embedding_type == 'linear':
+            self.pos_embedding = nn.Linear(2, self._num_filters[-1] * 2)
+        else:
+            raise NotImplementedError()
+        self.parametric_embedding = parametric_embedding
+        if self.parametric_embedding:
+            self.query_embed = nn.Embedding(self.obj_num,
+                                            self._num_filters[-1] * 2)
+            nn.init.uniform_(self.query_embed.weight, -1.0, 1.0)
+
+        print_log('Finish RPN_transformer_deformable Initialization',
+                  'current')
+
+    def forward(self, x, example=None):
+
+        # FPN
+        x = self.blocks[0](x)
+        x_down = self.blocks[1](x)
+        x_up = torch.cat([self.blocks[2](x_down), self.up(x)], dim=1)
+
+        # take out the BEV feature on current frame
+        x = torch.split(x, self.frame)
+        x_up = torch.split(x_up, self.frame)
+        x_down = torch.split(x_down, self.frame)
+        x_prev = torch.stack([t[1:] for t in x_up], dim=0)  # B,K,C,H,W
+        x = torch.stack([t[0] for t in x], dim=0)
+        x_down = torch.stack([t[0] for t in x_down], dim=0)
+
+        x_up = torch.stack([t[0] for t in x_up], dim=0)  # B,C,H,W
+        # use spatial attention in current frame on previous feature
+        x_prev_cat = self.mtf_attention(
+            x_up,
+            x_prev.reshape(x_up.shape[0], -1, x_up.shape[2],
+                           x_up.shape[3]))  # B,K*C,H,W
+        # time embedding
+        x_up_fuse = torch.cat((x_up, x_prev_cat), dim=1) + self.time_embedding(
+            example['times'][:, :, None].to(x_up)).reshape(
+                x_up.shape[0], -1, 1, 1)
+        # fuse mtf feature
+        x_up_fuse = self.out(x_up_fuse)
+
+        # heatmap head
+        hm = self.hm_head(x_up_fuse)
+
+        if self.corner and self.corner_head.training:
+            corner_hm = self.corner_head(x_up_fuse)
+            corner_hm = torch.sigmoid(corner_hm)
+
+        # find top K center location
+        hm = torch.sigmoid(hm)
+        batch, num_cls, H, W = hm.size()
+
+        scores, labels = torch.max(
+            hm.reshape(batch, num_cls, H * W), dim=1)  # b,H*W
+        self.batch_id = torch.from_numpy(np.indices(
+            (batch, self.obj_num))[0]).to(labels)
+
+        if self.use_gt_training and self.hm_head.training:
+            gt_inds = example['ind'][0][:, (self.window_size //
+                                            2)::self.window_size]
+            gt_masks = example['mask'][0][:, (self.window_size //
+                                              2)::self.window_size]
+            batch_id_gt = torch.from_numpy(
+                np.indices((batch, gt_inds.shape[1]))[0]).to(labels)
+            scores[batch_id_gt,
+                   gt_inds] = scores[batch_id_gt, gt_inds] + gt_masks
+            order = scores.sort(1, descending=True)[1]
+            order = order[:, :self.obj_num]
+            scores[batch_id_gt,
+                   gt_inds] = scores[batch_id_gt, gt_inds] - gt_masks
+        else:
+            order = scores.sort(1, descending=True)[1]
+            order = order[:, :self.obj_num]
+
+        scores = torch.gather(scores, 1, order)
+        labels = torch.gather(labels, 1, order)
+        mask = scores > self.score_threshold
+
+        ct_feat = (x_up.reshape(batch, -1,
+                                H * W).transpose(2,
+                                                 1).contiguous()[self.batch_id,
+                                                                 order]
+                   )  # B, 500, C
+
+        # create position embedding for each center
+        y_coor = order // W
+        x_coor = order - y_coor * W
+        y_coor, x_coor = y_coor.to(ct_feat), x_coor.to(ct_feat)
+        y_coor, x_coor = y_coor / H, x_coor / W
+        pos_features = torch.stack([x_coor, y_coor], dim=2)
+
+        if self.parametric_embedding:
+            ct_feat = self.query_embed.weight
+            ct_feat = ct_feat.unsqueeze(0).expand(batch, -1, -1)
+
+        # run transformer
+        src_list = [
+            x_up.reshape(batch, -1, H * W).transpose(2, 1).contiguous(),
+            x.reshape(batch, -1, (H * W) // 4).transpose(2, 1).contiguous(),
+            x_down.reshape(batch, -1, (H * W) // 16).transpose(2,
+                                                               1).contiguous(),
+        ]
+        for frame in range(x_prev.shape[1]):
+            src_list.append(x_prev[:, frame].reshape(batch,
+                                                     -1, (H * W)).transpose(
+                                                         2, 1).contiguous())
+        src = torch.cat(src_list, dim=1)  # B ,sum(H*W), C
+        spatial_list = [(H, W), (H // 2, W // 2), (H // 4, W // 4)]
+        spatial_list += [(H, W) for frame in range(x_prev.shape[1])]
+        spatial_shapes = torch.as_tensor(
+            spatial_list, dtype=torch.long, device=ct_feat.device)
+        level_start_index = torch.cat((
+            spatial_shapes.new_zeros((1, )),
+            spatial_shapes.prod(1).cumsum(0)[:-1],
+        ))
+
+        transformer_out = self.transformer_decoder(
+            ct_feat,
+            self.pos_embedding,
+            src,
+            spatial_shapes,
+            level_start_index,
+            center_pos=pos_features,
+        )  # (B,N,C)
+
+        ct_feat = (transformer_out['ct_feat'].transpose(2, 1).contiguous()
+                   )  # B, C, 500
+
+        out_dict = {
+            'hm': hm,
+            'scores': scores,
+            'labels': labels,
+            'order': order,
+            'ct_feat': ct_feat,
+            'mask': mask,
+        }
+        if 'out_attention' in transformer_out:
+            out_dict.update(
+                {'out_attention': transformer_out['out_attention']})
+        if self.corner and self.corner_head.training:
+            out_dict.update({'corner_hm': corner_hm})
+
+        return out_dict
diff --git a/mmde/projects/CenterFormer/centerformer/centerformer_head.py b/mmde/projects/CenterFormer/centerformer/centerformer_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1e5cbe937d847487e1a8d7698f812f71dcd015c
--- /dev/null
+++ b/mmde/projects/CenterFormer/centerformer/centerformer_head.py
@@ -0,0 +1,582 @@
+# ------------------------------------------------------------------------------
+# Portions of this code are from
+# det3d (https://github.com/poodarchu/Det3D/tree/56402d4761a5b73acd23080f537599b0888cce07) # noqa
+# Copyright (c) 2019 朱本金
+# Licensed under the MIT License
+# ------------------------------------------------------------------------------
+
+import copy
+import logging
+
+import numpy as np
+import torch
+from mmcv.cnn import build_norm_layer
+from mmcv.ops import boxes_iou3d
+from mmengine.logging import print_log
+from mmengine.model import kaiming_init
+from mmengine.structures import InstanceData
+from torch import nn
+
+from mmdet3d.models.layers import circle_nms, nms_bev
+from mmdet3d.registry import MODELS
+from .bbox_ops import nms_iou3d
+from .losses import FastFocalLoss
+
+
+class SepHead(nn.Module):
+    """TODO: This module is the original implementation in CenterFormer and it
+    has few differences with ``SeperateHead`` in `mmdet3d` but refactor this
+    module will lower the performance a little.
+    """
+
+    def __init__(
+            self,
+            in_channels,
+            heads,
+            head_conv=64,
+            final_kernel=1,
+            bn=False,
+            init_bias=-2.19,
+            norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+            **kwargs,
+    ):
+        super(SepHead, self).__init__(**kwargs)
+
+        self.heads = heads
+        for head in self.heads:
+            classes, num_conv = self.heads[head]
+
+            fc = []
+            for i in range(num_conv - 1):
+                fc.append(
+                    nn.Conv1d(
+                        in_channels,
+                        head_conv,
+                        kernel_size=final_kernel,
+                        stride=1,
+                        padding=final_kernel // 2,
+                        bias=True,
+                    ))
+                if bn:
+                    fc.append(build_norm_layer(norm_cfg, head_conv)[1])
+                fc.append(nn.ReLU())
+
+            fc.append(
+                nn.Conv1d(
+                    head_conv,
+                    classes,
+                    kernel_size=final_kernel,
+                    stride=1,
+                    padding=final_kernel // 2,
+                    bias=True,
+                ))
+
+            if 'hm' in head:
+                fc[-1].bias.data.fill_(init_bias)
+            else:
+                for m in fc:
+                    if isinstance(m, nn.Conv1d):
+                        kaiming_init(m)
+
+            fc = nn.Sequential(*fc)
+            self.__setattr__(head, fc)
+
+    def forward(self, x, y):
+        for head in self.heads:
+            x[head] = self.__getattr__(head)(y)
+
+        return x
+
+
+@MODELS.register_module()
+class CenterFormerBboxHead(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 tasks,
+                 weight=0.25,
+                 iou_weight=1,
+                 corner_weight=1,
+                 code_weights=[],
+                 common_heads=dict(),
+                 logger=None,
+                 init_bias=-2.19,
+                 share_conv_channel=64,
+                 assign_label_window_size=1,
+                 iou_loss=False,
+                 corner_loss=False,
+                 iou_factor=[1, 1, 4],
+                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 bbox_code_size=7,
+                 test_cfg=None,
+                 **kawrgs):
+        super(CenterFormerBboxHead, self).__init__()
+
+        num_classes = [len(t['class_names']) for t in tasks]
+        self.class_names = [t['class_names'] for t in tasks]
+        self.code_weights = code_weights
+        self.bbox_code_size = 7
+        self.weight = weight  # weight between hm loss and loc loss
+        self.iou_weight = iou_weight
+        self.corner_weight = corner_weight
+        self.iou_factor = iou_factor
+
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.test_cfg = test_cfg
+
+        self.crit = FastFocalLoss(assign_label_window_size)
+        self.crit_reg = torch.nn.L1Loss(reduction='none')
+        self.use_iou_loss = iou_loss
+        if self.use_iou_loss:
+            self.crit_iou = torch.nn.SmoothL1Loss(reduction='none')
+        self.corner_loss = corner_loss
+        if self.corner_loss:
+            self.corner_crit = torch.nn.MSELoss(reduction='none')
+
+        self.box_n_dim = 9 if 'vel' in common_heads else 7
+        self.use_direction_classifier = False
+
+        if not logger:
+            logger = logging.getLogger('CenterFormerBboxHead')
+        self.logger = logger
+
+        logger.info(f'num_classes: {num_classes}')
+
+        # a shared convolution
+        self.shared_conv = nn.Sequential(
+            nn.Conv1d(
+                in_channels, share_conv_channel, kernel_size=1, bias=True),
+            build_norm_layer(norm_cfg, share_conv_channel)[1],
+            nn.ReLU(inplace=True),
+        )
+
+        self.tasks = nn.ModuleList()
+        print_log(f'Use HM Bias: {init_bias}', 'current')
+
+        for num_cls in num_classes:
+            heads = copy.deepcopy(common_heads)
+            self.tasks.append(
+                SepHead(
+                    share_conv_channel,
+                    heads,
+                    bn=True,
+                    init_bias=init_bias,
+                    final_kernel=1,
+                    norm_cfg=norm_cfg))
+
+        logger.info('Finish CenterHeadIoU Initialization')
+
+    def forward(self, x, *kwargs):
+        ret_dicts = []
+
+        y = self.shared_conv(x['ct_feat'].float())
+
+        for task in self.tasks:
+            ret_dicts.append(task(x, y))
+
+        return ret_dicts
+
+    def _sigmoid(self, x):
+        y = torch.clamp(x.sigmoid_(), min=1e-4, max=1 - 1e-4)
+        return y
+
+    def loss(self, preds_dicts, example, **kwargs):
+        losses = {}
+        for task_id, preds_dict in enumerate(preds_dicts):
+            # heatmap focal loss
+            hm_loss = self.crit(
+                preds_dict['hm'],
+                example['hm'][task_id],
+                example['ind'][task_id],
+                example['mask'][task_id],
+                example['cat'][task_id],
+            )
+
+            target_box = example['anno_box'][task_id]
+
+            if self.corner_loss:
+                corner_loss = self.corner_crit(preds_dict['corner_hm'],
+                                               example['corners'][task_id])
+                corner_mask = (example['corners'][task_id] > 0).to(corner_loss)
+                corner_loss = (corner_loss * corner_mask).sum() / (
+                    corner_mask.sum() + 1e-4)
+                losses.update({
+                    f'{task_id}_corner_loss':
+                    corner_loss * self.corner_weight
+                })
+
+            # reconstruct the anno_box from multiple reg heads
+            if 'vel' in preds_dict:
+                preds_dict['anno_box'] = torch.cat(
+                    (
+                        preds_dict['reg'],
+                        preds_dict['height'],
+                        preds_dict['dim'],
+                        preds_dict['vel'],
+                        preds_dict['rot'],
+                    ),
+                    dim=1,
+                )
+            else:
+                preds_dict['anno_box'] = torch.cat(
+                    (
+                        preds_dict['reg'],
+                        preds_dict['height'],
+                        preds_dict['dim'],
+                        preds_dict['rot'],
+                    ),
+                    dim=1,
+                )
+                target_box = target_box[..., [0, 1, 2, 3, 4, 5, -2,
+                                              -1]]  # remove vel target
+
+            # Regression loss for dimension, offset, height, rotation
+            # get corresponding gt box # B, 500
+            target_box, selected_mask, selected_cls = get_corresponding_box(
+                preds_dict['order'],
+                example['ind'][task_id],
+                example['mask'][task_id],
+                example['cat'][task_id],
+                target_box,
+            )
+            mask = selected_mask.float().unsqueeze(2)
+
+            weights = self.code_weights
+
+            box_loss = self.crit_reg(
+                preds_dict['anno_box'].transpose(1, 2) * mask,
+                target_box * mask)
+            box_loss = box_loss / (mask.sum() + 1e-4)
+            box_loss = box_loss.transpose(2, 0).sum(dim=2).sum(dim=1)
+
+            loc_loss = (box_loss * box_loss.new_tensor(weights)).sum()
+
+            if self.use_iou_loss:
+                with torch.no_grad():
+                    preds_box = get_box(
+                        preds_dict['anno_box'],
+                        preds_dict['order'],
+                        self.test_cfg,
+                        preds_dict['hm'].shape[2],
+                        preds_dict['hm'].shape[3],
+                    )
+                    cur_gt = get_box_gt(
+                        target_box,
+                        preds_dict['order'],
+                        self.test_cfg,
+                        preds_dict['hm'].shape[2],
+                        preds_dict['hm'].shape[3],
+                    )
+
+                    iou_targets = boxes_iou3d(
+                        preds_box.reshape(-1, 7), cur_gt.reshape(
+                            -1, 7))[range(preds_box.reshape(-1, 7).shape[0]),
+                                    range(cur_gt.reshape(-1, 7).shape[0])]
+                    iou_targets[torch.isnan(iou_targets)] = 0
+                    iou_targets = 2 * iou_targets - 1
+                iou_loss = self.crit_iou(preds_dict['iou'].reshape(-1),
+                                         iou_targets) * mask.reshape(-1)
+                iou_loss = iou_loss.sum() / (mask.sum() + 1e-4)
+
+                losses.update(
+                    {f'{task_id}_iou_loss': iou_loss * self.iou_weight})
+
+            losses.update({
+                f'{task_id}_hm_loss': hm_loss,
+                f'{task_id}_loc_loss': loc_loss * self.weight
+            })
+
+        return losses
+
+    def predict(self, preds_dicts, batch_input_metas, **kwargs):
+        """decode, nms, then return the detection result.
+
+        Additionally support double flip testing
+        """
+        rets = []
+
+        post_center_range = self.test_cfg.post_center_limit_range
+        if len(post_center_range) > 0:
+            post_center_range = torch.tensor(
+                post_center_range,
+                dtype=preds_dicts[0]['scores'].dtype,
+                device=preds_dicts[0]['scores'].device,
+            )
+
+        for task_id, preds_dict in enumerate(preds_dicts):
+            # convert B C N to B N C
+            for key, val in preds_dict.items():
+                if torch.is_tensor(preds_dict[key]):
+                    if len(preds_dict[key].shape) == 3:
+                        preds_dict[key] = val.permute(0, 2, 1).contiguous()
+
+            batch_score = preds_dict['scores']
+            batch_label = preds_dict['labels']
+            batch_mask = preds_dict['mask']
+            if self.use_iou_loss:
+                batch_iou = preds_dict['iou'].squeeze(2)
+            else:
+                batch_iou = None
+
+            batch_dim = torch.exp(preds_dict['dim'])
+
+            batch_rots = preds_dict['rot'][..., 0:1]
+            batch_rotc = preds_dict['rot'][..., 1:2]
+
+            batch_reg = preds_dict['reg']
+            batch_hei = preds_dict['height']
+            batch_rot = torch.atan2(batch_rots, batch_rotc)
+            if self.use_iou_loss:
+                batch_iou = (batch_iou + 1) * 0.5
+                batch_iou = torch.clamp(batch_iou, min=0.0, max=1.0)
+
+            batch, _, H, W = preds_dict['hm'].size()
+
+            ys, xs = torch.meshgrid([torch.arange(0, H), torch.arange(0, W)])
+            ys = ys.view(1, H, W).repeat(batch, 1, 1).to(batch_score)
+            xs = xs.view(1, H, W).repeat(batch, 1, 1).to(batch_score)
+
+            obj_num = preds_dict['order'].shape[1]
+            batch_id = np.indices((batch, obj_num))[0]
+            batch_id = torch.from_numpy(batch_id).to(preds_dict['order'])
+
+            xs = (
+                xs.view(batch, -1, 1)[batch_id, preds_dict['order']] +
+                batch_reg[:, :, 0:1])
+            ys = (
+                ys.view(batch, -1, 1)[batch_id, preds_dict['order']] +
+                batch_reg[:, :, 1:2])
+
+            xs = (
+                xs * self.test_cfg.out_size_factor *
+                self.test_cfg.voxel_size[0] + self.test_cfg.pc_range[0])
+            ys = (
+                ys * self.test_cfg.out_size_factor *
+                self.test_cfg.voxel_size[1] + self.test_cfg.pc_range[1])
+
+            if 'vel' in preds_dict:
+                batch_vel = preds_dict['vel']
+                batch_box_preds = torch.cat(
+                    [xs, ys, batch_hei, batch_dim, batch_vel, batch_rot],
+                    dim=2)
+            else:
+                batch_box_preds = torch.cat(
+                    [xs, ys, batch_hei, batch_dim, batch_rot], dim=2)
+
+            if self.test_cfg.get('per_class_nms', False):
+                pass
+            else:
+                rets.append(
+                    self.post_processing(
+                        batch_input_metas,
+                        batch_box_preds,
+                        batch_score,
+                        batch_label,
+                        self.test_cfg,
+                        post_center_range,
+                        task_id,
+                        batch_mask,
+                        batch_iou,
+                    ))
+
+        # Merge branches results
+        ret_list = []
+        num_samples = len(rets[0])
+
+        ret_list = []
+        for i in range(num_samples):
+            temp_instances = InstanceData()
+            for k in rets[0][i].keys():
+                if k == 'bboxes':
+                    bboxes = torch.cat([ret[i][k] for ret in rets])
+                    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+                    bboxes = batch_input_metas[i]['box_type_3d'](
+                        bboxes, self.bbox_code_size)
+                elif k == 'labels':
+                    flag = 0
+                    for j, num_class in enumerate(self.num_classes):
+                        rets[j][i][k] += flag
+                        flag += num_class
+                    labels = torch.cat([ret[i][k] for ret in rets])
+                elif k == 'scores':
+                    scores = torch.cat([ret[i][k] for ret in rets])
+
+            temp_instances.bboxes_3d = bboxes
+            temp_instances.scores_3d = scores
+            temp_instances.labels_3d = labels
+            ret_list.append(temp_instances)
+
+        return ret_list
+
+    def post_processing(
+        self,
+        img_metas,
+        batch_box_preds,
+        batch_score,
+        batch_label,
+        test_cfg,
+        post_center_range,
+        task_id,
+        batch_mask,
+        batch_iou,
+    ):
+        batch_size = len(batch_score)
+
+        prediction_dicts = []
+        for i in range(batch_size):
+            box_preds = batch_box_preds[i]
+            scores = batch_score[i]
+            labels = batch_label[i]
+            mask = batch_mask[i]
+
+            distance_mask = (box_preds[..., :3] >= post_center_range[:3]).all(
+                1) & (box_preds[..., :3] <= post_center_range[3:]).all(1)
+
+            mask = mask & distance_mask
+
+            box_preds = box_preds[mask]
+            scores = scores[mask]
+            labels = labels[mask]
+
+            if self.use_iou_loss:
+                iou_factor = torch.LongTensor(self.iou_factor).to(labels)
+                ious = batch_iou[i][mask]
+                ious = torch.pow(ious, iou_factor[labels])
+                scores = scores * ious
+
+            boxes_for_nms = box_preds[:, [0, 1, 2, 3, 4, 5, -1]]
+
+            if test_cfg.get('circular_nms', False):
+                centers = boxes_for_nms[:, [0, 1]]
+                boxes = torch.cat([centers, scores.view(-1, 1)], dim=1)
+                selected = _circle_nms(
+                    boxes,
+                    min_radius=test_cfg.min_radius[task_id],
+                    post_max_size=test_cfg.nms.nms_post_max_size,
+                )
+            elif test_cfg.nms.get('use_multi_class_nms', False):
+                # multi class nms
+                selected = []
+                for c in range(3):
+                    class_mask = labels == c
+                    if class_mask.sum() > 0:
+                        class_idx = class_mask.nonzero()
+                        select = nms_iou3d(
+                            boxes_for_nms[class_mask].float(),
+                            scores[class_mask].float(),
+                            thresh=test_cfg.nms.nms_iou_threshold[c],
+                            pre_maxsize=test_cfg.nms.nms_pre_max_size[c],
+                            post_max_size=test_cfg.nms.nms_post_max_size[c],
+                        )
+                        selected.append(class_idx[select, 0])
+                if len(selected) > 0:
+                    selected = torch.cat(selected, dim=0)
+            else:
+                selected = nms_bev(
+                    boxes_for_nms.float(),
+                    scores.float(),
+                    thresh=test_cfg.nms.nms_iou_threshold,
+                    pre_max_size=test_cfg.nms.nms_pre_max_size,
+                    post_max_size=test_cfg.nms.nms_post_max_size,
+                )
+
+            selected_boxes = box_preds[selected]
+            selected_scores = scores[selected]
+            selected_labels = labels[selected]
+
+            prediction_dict = {
+                'bboxes': selected_boxes,
+                'scores': selected_scores,
+                'labels': selected_labels,
+            }
+
+            prediction_dicts.append(prediction_dict)
+
+        return prediction_dicts
+
+
+def _circle_nms(boxes, min_radius, post_max_size=83):
+    """NMS according to center distance."""
+    keep = np.array(circle_nms(boxes.cpu().numpy(),
+                               thresh=min_radius))[:post_max_size]
+
+    keep = torch.from_numpy(keep).long().to(boxes.device)
+
+    return keep
+
+
+def get_box(pred_boxs, order, test_cfg, H, W):
+    batch = pred_boxs.shape[0]
+    obj_num = order.shape[1]
+    ys, xs = torch.meshgrid([torch.arange(0, H), torch.arange(0, W)])
+    ys = ys.view(1, H, W).repeat(batch, 1, 1).to(pred_boxs)
+    xs = xs.view(1, H, W).repeat(batch, 1, 1).to(pred_boxs)
+
+    batch_id = np.indices((batch, obj_num))[0]
+    batch_id = torch.from_numpy(batch_id).to(order)
+    xs = xs.view(batch, H * W)[batch_id, order].unsqueeze(1) + pred_boxs[:,
+                                                                         0:1]
+    ys = ys.view(batch, H * W)[batch_id, order].unsqueeze(1) + pred_boxs[:,
+                                                                         1:2]
+
+    xs = xs * test_cfg.out_size_factor * test_cfg.voxel_size[
+        0] + test_cfg.pc_range[0]
+    ys = ys * test_cfg.out_size_factor * test_cfg.voxel_size[
+        1] + test_cfg.pc_range[1]
+
+    rot = torch.atan2(pred_boxs[:, 6:7], pred_boxs[:, 7:8])
+    pred = torch.cat(
+        [xs, ys, pred_boxs[:, 2:3],
+         torch.exp(pred_boxs[:, 3:6]), rot], dim=1)
+
+    return torch.transpose(pred, 1, 2).contiguous()  # B M 7
+
+
+def get_box_gt(gt_boxs, order, test_cfg, H, W):
+    batch = gt_boxs.shape[0]
+    obj_num = order.shape[1]
+    ys, xs = torch.meshgrid([torch.arange(0, H), torch.arange(0, W)])
+    ys = ys.view(1, H, W).repeat(batch, 1, 1).to(gt_boxs)
+    xs = xs.view(1, H, W).repeat(batch, 1, 1).to(gt_boxs)
+
+    batch_id = np.indices((batch, obj_num))[0]
+    batch_id = torch.from_numpy(batch_id).to(order)
+
+    batch_gt_dim = torch.exp(gt_boxs[..., 3:6])
+    batch_gt_hei = gt_boxs[..., 2:3]
+    batch_gt_rot = torch.atan2(gt_boxs[..., -2:-1], gt_boxs[..., -1:])
+    xs = xs.view(batch, H * W)[batch_id, order].unsqueeze(2) + gt_boxs[...,
+                                                                       0:1]
+    ys = ys.view(batch, H * W)[batch_id, order].unsqueeze(2) + gt_boxs[...,
+                                                                       1:2]
+
+    xs = xs * test_cfg.out_size_factor * test_cfg.voxel_size[
+        0] + test_cfg.pc_range[0]
+    ys = ys * test_cfg.out_size_factor * test_cfg.voxel_size[
+        1] + test_cfg.pc_range[1]
+
+    batch_box_targets = torch.cat(
+        [xs, ys, batch_gt_hei, batch_gt_dim, batch_gt_rot], dim=-1)
+
+    return batch_box_targets  # B M 7
+
+
+def get_corresponding_box(x_ind, y_ind, y_mask, y_cls, target_box):
+    # find the id in y which has the same ind in x
+    select_target = torch.zeros(x_ind.shape[0], x_ind.shape[1],
+                                target_box.shape[2]).to(target_box)
+    select_mask = torch.zeros_like(x_ind).to(y_mask)
+    select_cls = torch.zeros_like(x_ind).to(y_cls)
+
+    for i in range(x_ind.shape[0]):
+        idx = torch.arange(y_ind[i].shape[-1]).to(x_ind)
+        idx = idx[y_mask[i]]
+        box_cls = y_cls[i][y_mask[i]]
+        valid_y_ind = y_ind[i][y_mask[i]]
+        match = (x_ind[i].unsqueeze(1) == valid_y_ind.unsqueeze(0)).nonzero()
+        select_target[i, match[:, 0]] = target_box[i, idx[match[:, 1]]]
+        select_mask[i, match[:, 0]] = 1
+        select_cls[i, match[:, 0]] = box_cls[match[:, 1]]
+
+    return select_target, select_mask, select_cls
diff --git a/mmde/projects/CenterFormer/centerformer/losses.py b/mmde/projects/CenterFormer/centerformer/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..e59dc8f984bfb9116ce661aedad7516c2c3b605b
--- /dev/null
+++ b/mmde/projects/CenterFormer/centerformer/losses.py
@@ -0,0 +1,58 @@
+# modify from https://github.com/TuSimple/centerformer/blob/master/det3d/models/losses/centernet_loss.py # noqa
+
+import torch
+from torch import nn
+
+from mmdet3d.registry import MODELS
+
+
+def _gather_feat(feat, ind, mask=None):
+    dim = feat.size(2)
+    ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
+    feat = feat.gather(1, ind)
+    if mask is not None:
+        mask = mask.unsqueeze(2).expand_as(feat)
+        feat = feat[mask]
+        feat = feat.view(-1, dim)
+    return feat
+
+
+def _transpose_and_gather_feat(feat, ind):
+    feat = feat.permute(0, 2, 3, 1).contiguous()
+    feat = feat.view(feat.size(0), -1, feat.size(3))
+    feat = _gather_feat(feat, ind)
+    return feat
+
+
+@MODELS.register_module()
+class FastFocalLoss(nn.Module):
+    """Reimplemented focal loss, exactly the same as the CornerNet version.
+
+    Faster and costs much less memory.
+    """
+
+    def __init__(self, focal_factor=2):
+        super(FastFocalLoss, self).__init__()
+        self.focal_factor = focal_factor
+
+    def forward(self, out, target, ind, mask, cat):
+        '''
+        Args:
+            out, target: B x C x H x W
+            ind, mask: B x M
+            cat (category id for peaks): B x M
+        '''
+        mask = mask.float()
+        gt = torch.pow(1 - target, 4)
+        neg_loss = torch.log(1 - out) * torch.pow(out, self.focal_factor) * gt
+        neg_loss = neg_loss.sum()
+
+        pos_pred_pix = _transpose_and_gather_feat(out, ind)  # B x M x C
+        pos_pred = pos_pred_pix.gather(2, cat.unsqueeze(2))  # B x M
+        num_pos = mask.sum()
+        pos_loss = torch.log(pos_pred) * torch.pow(
+            1 - pos_pred, self.focal_factor) * mask.unsqueeze(2)
+        pos_loss = pos_loss.sum()
+        if num_pos == 0:
+            return -neg_loss
+        return -(pos_loss + neg_loss) / num_pos
diff --git a/mmde/projects/CenterFormer/centerformer/multi_scale_deform_attn.py b/mmde/projects/CenterFormer/centerformer/multi_scale_deform_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c39af9cd8f959b6535aae6863041bbc65cf2e62
--- /dev/null
+++ b/mmde/projects/CenterFormer/centerformer/multi_scale_deform_attn.py
@@ -0,0 +1,229 @@
+# modify from https://github.com/TuSimple/centerformer/blob/master/det3d/models/ops/modules/ms_deform_attn.py # noqa
+
+import math
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from mmcv.utils import ext_loader
+from torch import Tensor, nn
+from torch.autograd.function import Function, once_differentiable
+from torch.nn.init import constant_, xavier_uniform_
+
+ext_module = ext_loader.load_ext(
+    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
+
+
+class MultiScaleDeformableAttnFunction(Function):
+
+    @staticmethod
+    def forward(ctx, value: torch.Tensor, value_spatial_shapes: torch.Tensor,
+                value_level_start_index: torch.Tensor,
+                sampling_locations: torch.Tensor,
+                attention_weights: torch.Tensor,
+                im2col_step: torch.Tensor) -> torch.Tensor:
+        """GPU/MLU version of multi-scale deformable attention.
+
+        Args:
+            value (torch.Tensor): The value has shape
+                (bs, num_keys, mum_heads, embed_dims//num_heads)
+            value_spatial_shapes (torch.Tensor): Spatial shape of
+                each feature map, has shape (num_levels, 2),
+                last dimension 2 represent (h, w)
+            sampling_locations (torch.Tensor): The location of sampling points,
+                has shape
+                (bs ,num_queries, num_heads, num_levels, num_points, 2),
+                the last dimension 2 represent (x, y).
+            attention_weights (torch.Tensor): The weight of sampling points
+                used when calculate the attention, has shape
+                (bs ,num_queries, num_heads, num_levels, num_points),
+            im2col_step (torch.Tensor): The step used in image to column.
+        Returns:
+            torch.Tensor: has shape (bs, num_queries, embed_dims)
+        """
+
+        ctx.im2col_step = im2col_step
+        output = ext_module.ms_deform_attn_forward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            im2col_step=ctx.im2col_step)
+        ctx.save_for_backward(value, value_spatial_shapes,
+                              value_level_start_index, sampling_locations,
+                              attention_weights)
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output: torch.Tensor) -> tuple:
+        """GPU/MLU version of backward function.
+
+        Args:
+            grad_output (torch.Tensor): Gradient of output tensor of forward.
+        Returns:
+            tuple[Tensor]: Gradient of input tensors in forward.
+        """
+        value, value_spatial_shapes, value_level_start_index,\
+            sampling_locations, attention_weights = ctx.saved_tensors
+        grad_value = torch.zeros_like(value)
+        grad_sampling_loc = torch.zeros_like(sampling_locations)
+        grad_attn_weight = torch.zeros_like(attention_weights)
+
+        ext_module.ms_deform_attn_backward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            grad_output.contiguous(),
+            grad_value,
+            grad_sampling_loc,
+            grad_attn_weight,
+            im2col_step=ctx.im2col_step)
+
+        return grad_value, None, None, \
+            grad_sampling_loc, grad_attn_weight, None
+
+
+class MSDeformAttn(nn.Module):
+    """Multi-Scale Deformable Attention Module. Note that the difference
+    between this implementation and the implementation in MMCV is that the
+    dimension of input and hidden embedding in the multi-attention-head can be
+    specified respectively.
+
+    Args:
+        dim_model (int, optional): The input and output dimension in the model.
+            Defaults to 256.
+        dim_single_head (int, optional): hidden dimension in the single head.
+            Defaults to 64.
+        n_levels (int, optional): number of feature levels. Defaults to 4.
+        n_heads (int, optional): number of attention heads. Defaults to 8.
+        n_points (int, optional): number of sampling points per attention head
+            per feature level. Defaults to 4.
+        out_sample_loc (bool, optional): Whether to return the sampling
+            location. Defaults to False.
+    """
+
+    def __init__(self,
+                 dim_model=256,
+                 dim_single_head=64,
+                 n_levels=4,
+                 n_heads=8,
+                 n_points=4,
+                 out_sample_loc=False):
+        super().__init__()
+
+        self.im2col_step = 64
+
+        self.dim_model = dim_model
+        self.dim_single_head = dim_single_head
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+
+        self.out_sample_loc = out_sample_loc
+
+        self.sampling_offsets = nn.Linear(dim_model,
+                                          n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(dim_model,
+                                           n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(dim_model, dim_single_head * n_heads)
+        self.output_proj = nn.Linear(dim_single_head * n_heads, dim_model)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_(self.sampling_offsets.weight.data, 0.)
+        thetas = torch.arange(
+            self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init /
+                     grid_init.abs().max(-1, keepdim=True)[0]).view(
+                         self.n_heads, 1, 1, 2).repeat(1, self.n_levels,
+                                                       self.n_points, 1)
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+
+    def forward(self,
+                query: Tensor,
+                reference_points: Tensor,
+                input_flatten: Tensor,
+                input_spatial_shapes: Tensor,
+                input_level_start_index: Tensor,
+                input_padding_mask: Optional[Tensor] = None):
+        """Forward Function of MultiScaleDeformAttention.
+
+        Args:
+            query (Tensor): (N, num_query, C)
+            reference_points (Tensor): (N, num_query, n_levels, 2). The
+                normalized reference points with shape
+                (bs, num_query, num_levels, 2),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            input_flatten (Tensor): _description_
+            input_spatial_shapes (Tensor): Spatial shape of features in
+                different levels. With shape (num_levels, 2),
+                last dimension represents (h, w).
+            input_level_start_index (Tensor): The start index of each level.
+                A tensor has shape ``(num_levels, )`` and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            input_padding_mask (Optional[Tensor], optional): The padding mask
+                for value. Defaults to None.
+
+        Returns:
+            Tuple[Tensor, Tensor]: forwarded results.
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] *
+                input_spatial_shapes[:, 1]).sum() == Len_in
+
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.dim_single_head)
+        sampling_offsets = self.sampling_offsets(query).view(
+            N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        attention_weights = self.attention_weights(query).view(
+            N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights,
+                                      -1).view(N, Len_q, self.n_heads,
+                                               self.n_levels, self.n_points)
+        # N, Len_q, n_heads, n_levels, n_points, 2
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack(
+                [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]],
+                -1).to(sampling_offsets)
+
+            sampling_locations = reference_points[:, :, None, :, None, :] + \
+                sampling_offsets / offset_normalizer[None, None, None, :, None, :]  # noqa: E501
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5   # noqa: E501
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'  # noqa: E501
+                .format(reference_points.shape[-1]))
+        output = MultiScaleDeformableAttnFunction.apply(
+            value, input_spatial_shapes, input_level_start_index,
+            sampling_locations, attention_weights, self.im2col_step)
+        output = self.output_proj(output)
+        if self.out_sample_loc:
+            return output, torch.cat(
+                (sampling_locations, attention_weights[:, :, :, :, :, None]),
+                dim=-1)
+        else:
+            return output, None
diff --git a/mmde/projects/CenterFormer/centerformer/transformer.py b/mmde/projects/CenterFormer/centerformer/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..88b8ff2a559cc54d7934b79b536bf3b2dc8cd209
--- /dev/null
+++ b/mmde/projects/CenterFormer/centerformer/transformer.py
@@ -0,0 +1,261 @@
+# modify from https://github.com/TuSimple/centerformer/blob/master/det3d/models/utils/transformer.py # noqa
+
+import torch
+from einops import rearrange
+from mmcv.cnn.bricks.activation import GELU
+from torch import einsum, nn
+
+from .multi_scale_deform_attn import MSDeformAttn
+
+
+class PreNorm(nn.Module):
+
+    def __init__(self, dim, fn):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim)
+        self.fn = fn
+
+    def forward(self, x, y=None, **kwargs):
+        if y is not None:
+            return self.fn(self.norm(x), self.norm(y), **kwargs)
+        else:
+            return self.fn(self.norm(x), **kwargs)
+
+
+class FFN(nn.Module):
+
+    def __init__(self, dim, hidden_dim, dropout=0.0):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(dim, hidden_dim),
+            GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, dim),
+            nn.Dropout(dropout),
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class SelfAttention(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 n_heads=8,
+                 dim_single_head=64,
+                 dropout=0.0,
+                 out_attention=False):
+        super().__init__()
+        inner_dim = dim_single_head * n_heads
+        project_out = not (n_heads == 1 and dim_single_head == dim)
+
+        self.n_heads = n_heads
+        self.scale = dim_single_head**-0.5
+        self.out_attention = out_attention
+
+        self.attend = nn.Softmax(dim=-1)
+        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
+
+        self.to_out = (
+            nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
+            if project_out else nn.Identity())
+
+    def forward(self, x):
+        _, _, _, h = *x.shape, self.n_heads
+        qkv = self.to_qkv(x).chunk(3, dim=-1)
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), qkv)
+
+        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
+
+        attn = self.attend(dots)
+
+        out = einsum('b h i j, b h j d -> b h i d', attn, v)
+        out = rearrange(out, 'b h n d -> b n (h d)')
+
+        if self.out_attention:
+            return self.to_out(out), attn
+        else:
+            return self.to_out(out)
+
+
+class DeformableCrossAttention(nn.Module):
+
+    def __init__(
+        self,
+        dim_model=256,
+        dim_single_head=64,
+        dropout=0.3,
+        n_levels=3,
+        n_heads=6,
+        n_points=9,
+        out_sample_loc=False,
+    ):
+        super().__init__()
+
+        # cross attention
+        self.cross_attn = MSDeformAttn(
+            dim_model,
+            dim_single_head,
+            n_levels,
+            n_heads,
+            n_points,
+            out_sample_loc=out_sample_loc)
+        self.dropout = nn.Dropout(dropout)
+        self.out_sample_loc = out_sample_loc
+
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward(
+        self,
+        tgt,
+        src,
+        query_pos=None,
+        reference_points=None,
+        src_spatial_shapes=None,
+        level_start_index=None,
+        src_padding_mask=None,
+    ):
+        # cross attention
+        tgt2, sampling_locations = self.cross_attn(
+            self.with_pos_embed(tgt, query_pos),
+            reference_points,
+            src,
+            src_spatial_shapes,
+            level_start_index,
+            src_padding_mask,
+        )
+        tgt = self.dropout(tgt2)
+
+        if self.out_sample_loc:
+            return tgt, sampling_locations
+        else:
+            return tgt
+
+
+class DeformableTransformerDecoder(nn.Module):
+    """Deformable transformer decoder.
+
+    Note that the ``DeformableDetrTransformerDecoder`` in MMDet has different
+    interfaces in multi-head-attention which is customized here. For example,
+    'embed_dims' is not a position argument in our customized multi-head-self-
+    attention, but is required in MMDet. Thus, we can not directly use the
+    ``DeformableDetrTransformerDecoder`` in MMDET.
+    """
+
+    def __init__(
+        self,
+        dim,
+        n_levels=3,
+        depth=2,
+        n_heads=4,
+        dim_single_head=32,
+        dim_ffn=256,
+        dropout=0.0,
+        out_attention=False,
+        n_points=9,
+    ):
+        super().__init__()
+        self.out_attention = out_attention
+        self.layers = nn.ModuleList([])
+        self.depth = depth
+        self.n_levels = n_levels
+        self.n_points = n_points
+
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList([
+                    PreNorm(
+                        dim,
+                        SelfAttention(
+                            dim,
+                            n_heads=n_heads,
+                            dim_single_head=dim_single_head,
+                            dropout=dropout,
+                            out_attention=self.out_attention,
+                        ),
+                    ),
+                    PreNorm(
+                        dim,
+                        DeformableCrossAttention(
+                            dim,
+                            dim_single_head,
+                            n_levels=n_levels,
+                            n_heads=n_heads,
+                            dropout=dropout,
+                            n_points=n_points,
+                            out_sample_loc=self.out_attention,
+                        ),
+                    ),
+                    PreNorm(dim, FFN(dim, dim_ffn, dropout=dropout)),
+                ]))
+
+    def forward(self, x, pos_embedding, src, src_spatial_shapes,
+                level_start_index, center_pos):
+        if self.out_attention:
+            out_cross_attention_list = []
+        if pos_embedding is not None:
+            center_pos_embedding = pos_embedding(center_pos)
+        reference_points = center_pos[:, :,
+                                      None, :].repeat(1, 1, self.n_levels, 1)
+        for i, (self_attn, cross_attn, ff) in enumerate(self.layers):
+            if self.out_attention:
+                if center_pos_embedding is not None:
+                    x_att, self_att = self_attn(x + center_pos_embedding)
+                    x = x_att + x
+                    x_att, cross_att = cross_attn(
+                        x,
+                        src,
+                        query_pos=center_pos_embedding,
+                        reference_points=reference_points,
+                        src_spatial_shapes=src_spatial_shapes,
+                        level_start_index=level_start_index,
+                    )
+                else:
+                    x_att, self_att = self_attn(x)
+                    x = x_att + x
+                    x_att, cross_att = cross_attn(
+                        x,
+                        src,
+                        query_pos=None,
+                        reference_points=reference_points,
+                        src_spatial_shapes=src_spatial_shapes,
+                        level_start_index=level_start_index,
+                    )
+                out_cross_attention_list.append(cross_att)
+            else:
+                if center_pos_embedding is not None:
+                    x_att = self_attn(x + center_pos_embedding)
+                    x = x_att + x
+                    x_att = cross_attn(
+                        x,
+                        src,
+                        query_pos=center_pos_embedding,
+                        reference_points=reference_points,
+                        src_spatial_shapes=src_spatial_shapes,
+                        level_start_index=level_start_index,
+                    )
+                else:
+                    x_att = self_attn(x)
+                    x = x_att + x
+                    x_att = cross_attn(
+                        x,
+                        src,
+                        query_pos=None,
+                        reference_points=reference_points,
+                        src_spatial_shapes=src_spatial_shapes,
+                        level_start_index=level_start_index,
+                    )
+
+            x = x_att + x
+            x = ff(x) + x
+
+        out_dict = {'ct_feat': x}
+        if self.out_attention:
+            out_dict.update({
+                'out_attention':
+                torch.stack(out_cross_attention_list, dim=2)
+            })
+        return out_dict
diff --git a/mmde/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py b/mmde/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b207c7992f93ce9d5206ec07d7048b9f98b0f13
--- /dev/null
+++ b/mmde/projects/CenterFormer/configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
@@ -0,0 +1,305 @@
+_base_ = ['../../../configs/_base_/default_runtime.py']
+custom_imports = dict(
+    imports=['projects.CenterFormer.centerformer'], allow_failed_imports=False)
+
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.1, 0.1, 0.15]
+point_cloud_range = [-75.2, -75.2, -2, 75.2, 75.2, 4]
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+tasks = [dict(num_class=3, class_names=['car', 'pedestrian', 'cyclist'])]
+metainfo = dict(classes=class_names)
+input_modality = dict(use_lidar=True, use_camera=False)
+backend_args = None
+
+model = dict(
+    type='CenterFormer',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        voxel=True,
+        voxel_type='dynamic',
+        voxel_layer=dict(
+            max_num_points=-1,
+            point_cloud_range=point_cloud_range,
+            voxel_size=voxel_size,
+            max_voxels=(-1, -1))),
+    voxel_encoder=dict(
+        type='DynamicSimpleVFE',
+        point_cloud_range=point_cloud_range,
+        voxel_size=voxel_size),
+    middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=5,
+        sparse_shape=[41, 1504, 1504],
+        order=('conv', 'norm', 'act'),
+        norm_cfg=dict(type='naiveSyncBN1d', eps=0.001, momentum=0.01),
+        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+                                                                      128)),
+        encoder_paddings=((1, 1, 1), (1, 1, 1), (1, 1, [0, 1, 1]), (1, 1)),
+        block_type='basicblock'),
+    backbone=dict(
+        type='DeformableDecoderRPN',
+        layer_nums=[5, 5, 1],
+        ds_num_filters=[256, 256, 128],
+        num_input_features=256,
+        tasks=tasks,
+        use_gt_training=True,
+        corner=True,
+        assign_label_window_size=1,
+        obj_num=500,
+        norm_cfg=dict(type='SyncBN', eps=1e-3, momentum=0.01),
+        transformer_config=dict(
+            depth=2,
+            n_heads=6,
+            dim_single_head=64,
+            dim_ffn=256,
+            dropout=0.3,
+            out_attn=False,
+            n_points=15,
+        ),
+    ),
+    bbox_head=dict(
+        type='CenterFormerBboxHead',
+        in_channels=256,
+        tasks=tasks,
+        dataset='waymo',
+        weight=2,
+        corner_loss=True,
+        iou_loss=True,
+        assign_label_window_size=1,
+        norm_cfg=dict(type='SyncBN', eps=1e-3, momentum=0.01),
+        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        common_heads={
+            'reg': (2, 2),
+            'height': (1, 2),
+            'dim': (3, 2),
+            'rot': (2, 2),
+            'iou': (1, 2)
+        },  # (output_channel, num_conv)
+    ),
+    train_cfg=dict(
+        grid_size=[1504, 1504, 40],
+        voxel_size=voxel_size,
+        out_size_factor=4,
+        dense_reg=1,
+        gaussian_overlap=0.1,
+        point_cloud_range=point_cloud_range,
+        max_objs=500,
+        min_radius=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]),
+    test_cfg=dict(
+        post_center_limit_range=[-80, -80, -10.0, 80, 80, 10.0],
+        nms=dict(
+            use_rotate_nms=False,
+            use_multi_class_nms=True,
+            nms_pre_max_size=[1600, 1600, 800],
+            nms_post_max_size=[200, 200, 100],
+            nms_iou_threshold=[0.8, 0.55, 0.55],
+        ),
+        score_threshold=0.1,
+        pc_range=[-75.2, -75.2],
+        out_size_factor=4,
+        voxel_size=[0.1, 0.1],
+        obj_num=1000,
+    ))
+
+data_root = 'data/waymo/kitti_format/'
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        norm_intensity=True,
+        backend_args=backend_args),
+    # Add this if using `MultiFrameDeformableDecoderRPN`
+    # dict(
+    #     type='LoadPointsFromMultiSweeps',
+    #     sweeps_num=9,
+    #     load_dim=6,
+    #     use_dim=[0, 1, 2, 3, 4],
+    #     pad_empty_sweeps=True,
+    #     remove_close=True),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.5, 0.5, 0]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        norm_intensity=True,
+        backend_args=backend_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
+        ]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
+]
+
+dataset_type = 'WaymoDataset'
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR',
+        # load one frame every five frames
+        load_interval=5,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# For waymo dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 20. Please change the interval accordingly if you do not
+# use a default schedule.
+# optimizer
+lr = 3e-4
+# This schedule is mainly used by models on nuScenes dataset
+# max_norm=10 is better for SECOND
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01, betas=(0.9, 0.99)),
+    clip_grad=dict(max_norm=35, norm_type=2))
+# learning rate
+param_scheduler = [
+    # learning rate scheduler
+    # During the first 8 epochs, learning rate increases from 0 to lr * 10
+    # during the next 12 epochs, learning rate decreases from lr * 10 to
+    # lr * 1e-4
+    dict(
+        type='CosineAnnealingLR',
+        T_max=8,
+        eta_min=lr * 10,
+        begin=0,
+        end=8,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=12,
+        eta_min=lr * 1e-4,
+        begin=8,
+        end=20,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    # During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95
+    # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=8,
+        eta_min=0.85 / 0.95,
+        begin=0,
+        end=8,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=12,
+        eta_min=1,
+        begin=8,
+        end=20,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=20)
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (4 GPUs) x (4 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
+
+default_hooks = dict(
+    logger=dict(type='LoggerHook', interval=50),
+    checkpoint=dict(type='CheckpointHook', interval=5))
+custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)]
diff --git a/mmde/projects/DETR3D/README.md b/mmde/projects/DETR3D/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7ea35e68c58dedec63140e2007343e2ce8348374
--- /dev/null
+++ b/mmde/projects/DETR3D/README.md
@@ -0,0 +1,147 @@
+# DETR3D: 3D Object Detection from Multi-view Images via 3D-to-2D Queries
+
+> [DETR3D: 3D Object Detection from Multi-view Images via 3D-to-2D Queries](https://arxiv.org/abs/2110.06922)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We introduce a framework for multi-camera 3D object detection. In
+contrast to existing works, which estimate 3D bounding boxes directly from
+monocular images or use depth prediction networks to generate input for 3D object
+detection from 2D information, our method manipulates predictions directly
+in 3D space. Our architecture extracts 2D features from multiple camera images
+and then uses a sparse set of 3D object queries to index into these 2D features,
+linking 3D positions to multi-view images using camera transformation matrices.
+Finally, our model makes a bounding box prediction per object query, using a
+set-to-set loss to measure the discrepancy between the ground-truth and the prediction.
+This top-down approach outperforms its bottom-up counterpart in which
+object bounding box prediction follows per-pixel depth estimation, since it does
+not suffer from the compounding error introduced by a depth prediction model.
+Moreover, our method does not require post-processing such as non-maximum
+suppression, dramatically improving inference speed. We achieve state-of-the-art
+performance on the nuScenes autonomous driving benchmark.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/67246790/209751755-3d0f0ad5-6a39-4d14-a1c7-346b5c228a1b.png" width="800"/>
+</div>
+
+## Introduction
+
+This directory contains the implementations of DETR3D (https://arxiv.org/abs/2110.06922). Our implementations are built on top of MMdetection3D.
+We have updated DETR3D to be compatible with latest mmdet3d-dev1.x. The codebase and config files have all changed to adapt to the new mmdet3d version. All previous pretrained models are verified with the result listed below. However, newly trained models are yet to be uploaded.
+
+## Environment Setup
+
+We require the version of mmdet \<= V3.0.0rc5. The mmdet later than V3.0.0rc5 has refactored DETR-series and its config file, but our configs and code are yet to be updated.
+
+## Train
+
+1. Downloads the [pretrained backbone weights](https://drive.google.com/drive/folders/1h5bDg7Oh9hKvkFL-dRhu5-ahrEp2lRNN?usp=sharing) to pretrained/
+
+2. For example, to train DETR3D on 8 GPUs, please use
+
+```bash
+bash tools/dist_train.sh projects/DETR3D/configs/detr3d_res101_gridmask.py 8 --cfg-options load_from=pretrained/fcos3d.pth
+```
+
+## Evaluation using pretrained models
+
+1. Download the newly trained weights accordingly.
+
+   |                                                Backbone                                                 | mAP  | NDS  |                                                                                                                 Download                                                                                                                 |
+   | :-----------------------------------------------------------------------------------------------------: | :--: | :--: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+   |          [DETR3D, ResNet101 w/ DCN, evaluation on val set](./configs/detr3d_r101_gridmask.py)           | 35.5 | 42.8 |                 [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/detr3d/detr3d_r101_gridmask.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/detr3d/detr3d_r101_gridmask.log)                 |
+   |             [above, + CBGS, evaluation on val set](./configs/detr3d_r101_gridmask_cbgs.py)              | 35.2 | 42.7 |            [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/detr3d/detr3d_r101_gridmask_cbgs.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/detr3d/detr3d_r101_gridmask_cbgs.log)            |
+   | [DETR3D, VoVNet on trainval, evaluation on test set](./configs/detr3d_vovnet_gridmask_trainval_cbgs.py) | 41.4 | 48.1 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/detr3d/detr3d_vovnet_gridmask_trainval_cbgs.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/detr3d/detr3d_vovnet_gridmask_trainval_cbgs.log) |
+
+2. Testing
+
+   To test, use:
+
+   ```bash
+    bash tools/dist_test.sh projects/DETR3D/configs/detr3d_res101_gridmask.py ${CHECKPOINT_PATH} 8
+   ```
+
+## Converting old models (Optional)
+
+For old models please refer to [Object DGCNN & DETR3D](https://github.com/WangYueFt/detr3d)
+
+From v0.17.3 to v1.0.0, mmdet3d has changed its bbox representation. Given that Box(x,y,z,θ), we have x_new = y_old, y_new = x_old, θ_new = -θ_old - π/2.
+
+Old models are trained on v0.17.3. Our regression branch outputs (cx,cy,w,l,cz,h,sin(θ),cos(θ),vx,vy). For a previous model which outputs y=\[y0,y1,y2,y3,y4,y5,y6,y7,y8,y9\], we get y_new = \[...,y3,y2,...,-y7,-y6, ...\]. So we should change the final Linear layer's weight accordingly.
+
+To convert the old weights, please use
+
+```bash
+python projects/DETR3D/detr3d/old_detr3d_converter.py ${CHECKPOINT_DIR}/detr3d_resnet101.pth ${CHECKPOINT_DIR}/detr3d_r101_v1.0.0.pth --code_size 10
+```
+
+## Citation
+
+If you find this repo useful for your research, please consider citing the papers
+
+```
+@inproceedings{
+   detr3d,
+   title={DETR3D: 3D Object Detection from Multi-view Images via 3D-to-2D Queries},
+   author={Wang, Yue and Guizilini, Vitor and Zhang, Tianyuan and Wang, Yilun and Zhao, Hang and and Solomon, Justin M.},
+   booktitle={The Conference on Robot Learning ({CoRL})},
+   year={2021}
+}
+```
+
+## Checklist
+
+<!-- Here is a checklist illustrating a usual development workflow of a successful project, and also serves as an overview of this project's progress. The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR.
+OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project suffices the minimum requirement of being merged into 'projects/'. But this project is only eligible to become a part of the core package upon attaining the last milestone.
+Note that keeping this section up-to-date is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed.
+A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR. -->
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+    <!-- The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `mmdet3d.registry.MODELS` and configurable via a config file. -->
+
+  - [x] Basic docstrings & proper citation
+
+    <!-- Each major object should contain a docstring, describing its functionality and arguments. If you have adapted the code from other open-source projects, don't forget to cite the source project in docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under GPL license. [A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) -->
+
+  - [x] Test-time correctness
+
+    <!-- If you are reproducing the result from a paper, make sure your model's inference-time performance matches that in the original paper. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. -->
+
+  - [x] A full README
+
+    <!-- As this template does. -->
+
+- [x] Milestone 2: Indicates a successful model implementation.
+
+  - [x] Training-time correctness
+
+    <!-- If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result matches the report within a minor error range. -->
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+
+    <!-- Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/detectors/fcos_mono3d.py) -->
+
+  - [ ] Unit tests
+
+    <!-- Unit tests for each module are required. [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tests/test_models/test_dense_heads/test_fcos_mono3d_head.py) -->
+
+  - [ ] Code polishing
+
+    <!-- Refactor your code according to reviewer's comment. -->
+
+  - [ ] Metafile.yml
+
+    <!-- It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/fcos3d/metafile.yml) -->
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
+
+  <!-- In particular, you may have to refactor this README into a standard one. [Example](/configs/textdet/dbnet/README.md) -->
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
diff --git a/mmde/projects/DETR3D/configs/detr3d_r101_gridmask.py b/mmde/projects/DETR3D/configs/detr3d_r101_gridmask.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef9a86383d46373d8b793d39a7d390c3b92e72a9
--- /dev/null
+++ b/mmde/projects/DETR3D/configs/detr3d_r101_gridmask.py
@@ -0,0 +1,258 @@
+_base_ = [
+    # 'mmdet3d::_base_/datasets/nus-3d.py',
+    '../../../configs/_base_/default_runtime.py'
+]
+
+custom_imports = dict(imports=['projects.DETR3D.detr3d'])
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], bgr_to_rgb=False)
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+# this means type='DETR3D' will be processed as 'mmdet3d.DETR3D'
+default_scope = 'mmdet3d'
+model = dict(
+    type='DETR3D',
+    use_grid_mask=True,
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor', **img_norm_cfg, pad_size_divisor=32),
+    img_backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN2d', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+        stage_with_dcn=(False, False, True, True)),
+    img_neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=4,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='DETR3DHead',
+        num_query=900,
+        num_classes=10,
+        in_channels=256,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        transformer=dict(
+            type='Detr3DTransformer',
+            decoder=dict(
+                type='Detr3DTransformerDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='mmdet.DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',  # mmcv.
+                            embed_dims=256,
+                            num_heads=8,
+                            dropout=0.1),
+                        dict(
+                            type='Detr3DCrossAtten',
+                            pc_range=point_cloud_range,
+                            num_points=1,
+                            embed_dims=256)
+                    ],
+                    feedforward_channels=512,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='NMSFreeCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            pc_range=point_cloud_range,
+            max_num=300,
+            voxel_size=voxel_size,
+            num_classes=10),
+        positional_encoding=dict(
+            type='mmdet.SinePositionalEncoding',
+            num_feats=128,
+            normalize=True,
+            offset=-0.5),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25),
+        loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            grid_size=[512, 512, 1],
+            voxel_size=voxel_size,
+            point_cloud_range=point_cloud_range,
+            out_size_factor=4,
+            assigner=dict(
+                type='HungarianAssigner3D',
+                cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0),
+                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+                # ↓ Fake cost. This is just to get compatible with DETR head
+                iou_cost=dict(type='mmdet.IoUCost', weight=0.0),
+                pc_range=point_cloud_range))))
+
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+
+test_transforms = [
+    dict(
+        type='RandomResize3D',
+        scale=(1600, 900),
+        ratio_range=(1., 1.),
+        keep_ratio=True)
+]
+train_transforms = [dict(type='PhotoMetricDistortion3D')] + test_transforms
+
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadMultiViewImageFromFiles',
+        to_float32=True,
+        num_views=6,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_attr_label=False),
+    dict(type='MultiViewWrapper', transforms=train_transforms),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+
+test_pipeline = [
+    dict(
+        type='LoadMultiViewImageFromFiles',
+        to_float32=True,
+        num_views=6,
+        backend_args=backend_args),
+    dict(type='MultiViewWrapper', transforms=test_transforms),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+
+metainfo = dict(classes=class_names)
+data_prefix = dict(
+    pts='',
+    CAM_FRONT='samples/CAM_FRONT',
+    CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+    CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+    CAM_BACK='samples/CAM_BACK',
+    CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+    CAM_BACK_LEFT='samples/CAM_BACK_LEFT')
+
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        load_type='frame_based',
+        metainfo=metainfo,
+        modality=input_modality,
+        test_mode=False,
+        data_prefix=data_prefix,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='nuscenes_infos_val.pkl',
+        load_type='frame_based',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        modality=input_modality,
+        test_mode=True,
+        data_prefix=data_prefix,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='NuScenesMetric',
+    data_root=data_root,
+    ann_file=data_root + 'nuscenes_infos_val.pkl',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01),
+    paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}),
+    clip_grad=dict(max_norm=35, norm_type=2),
+)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        by_epoch=False,
+        begin=0,
+        end=500),
+    dict(
+        type='CosineAnnealingLR',
+        by_epoch=True,
+        begin=0,
+        end=24,
+        T_max=24,
+        eta_min_ratio=1e-3)
+]
+
+total_epochs = 24
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=total_epochs, val_interval=2)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+default_hooks = dict(
+    checkpoint=dict(
+        type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True))
+load_from = 'ckpts/fcos3d.pth'
+
+# setuptools 65 downgrades to 58.
+# In mmlab-node we use setuptools 61 but occurs NO errors
+vis_backends = [dict(type='TensorboardVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
diff --git a/mmde/projects/DETR3D/configs/detr3d_r101_gridmask_cbgs.py b/mmde/projects/DETR3D/configs/detr3d_r101_gridmask_cbgs.py
new file mode 100644
index 0000000000000000000000000000000000000000..06618ee966d684b7bb8deb988a61e2afcd529cad
--- /dev/null
+++ b/mmde/projects/DETR3D/configs/detr3d_r101_gridmask_cbgs.py
@@ -0,0 +1,80 @@
+_base_ = ['./detr3d_r101_gridmask.py']
+
+custom_imports = dict(imports=['projects.DETR3D.detr3d'])
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], bgr_to_rgb=False)
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+
+test_transforms = [
+    dict(
+        type='RandomResize3D',
+        scale=(1600, 900),
+        ratio_range=(1., 1.),
+        keep_ratio=True)
+]
+train_transforms = [dict(type='PhotoMetricDistortion3D')] + test_transforms
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_attr_label=False),
+    dict(type='MultiViewWrapper', transforms=train_transforms),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+
+metainfo = dict(classes=class_names)
+data_prefix = dict(
+    pts='',
+    CAM_FRONT='samples/CAM_FRONT',
+    CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+    CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+    CAM_BACK='samples/CAM_BACK',
+    CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+    CAM_BACK_LEFT='samples/CAM_BACK_LEFT')
+
+train_dataloader = dict(
+    _delete_=True,
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='CBGSDataset',
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='nuscenes_infos_train.pkl',
+            pipeline=train_pipeline,
+            load_type='frame_based',
+            metainfo=metainfo,
+            modality=input_modality,
+            test_mode=False,
+            data_prefix=data_prefix,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR')))
diff --git a/mmde/projects/DETR3D/configs/detr3d_vovnet_gridmask_trainval_cbgs.py b/mmde/projects/DETR3D/configs/detr3d_vovnet_gridmask_trainval_cbgs.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fb8cb8e41740ea29d5380970e5474f876db2211
--- /dev/null
+++ b/mmde/projects/DETR3D/configs/detr3d_vovnet_gridmask_trainval_cbgs.py
@@ -0,0 +1,52 @@
+_base_ = ['./detr3d_r101_gridmask_cbgs.py']
+
+custom_imports = dict(imports=['projects.DETR3D.detr3d'])
+
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675],
+    std=[57.375, 57.120, 58.395],
+    bgr_to_rgb=False)
+
+# this means type='DETR3D' will be processed as 'mmdet3d.DETR3D'
+default_scope = 'mmdet3d'
+model = dict(
+    type='DETR3D',
+    use_grid_mask=True,
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor', **img_norm_cfg, pad_size_divisor=32),
+    img_backbone=dict(
+        _delete_=True,
+        type='VoVNet',
+        spec_name='V-99-eSE',
+        norm_eval=True,
+        frozen_stages=1,
+        input_ch=3,
+        out_features=['stage2', 'stage3', 'stage4', 'stage5']),
+    img_neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 768, 1024],
+        out_channels=256,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=4,
+        relu_before_extra_convs=True))
+
+train_dataloader = dict(
+    dataset=dict(
+        type='CBGSDataset',
+        dataset=dict(ann_file='nuscenes_infos_trainval.pkl')))
+
+test_dataloader = dict(
+    dataset=dict(
+        data_root='data/nuscenes-test', ann_file='nuscenes_infos_test.pkl'))
+
+test_evaluator = dict(
+    type='NuScenesMetric',
+    data_root='data/nuscenes-test',
+    ann_file='data/nuscenes-test/nuscenes_infos_test.pkl',
+    jsonfile_prefix='work_dirs/detr3d_vovnet_results_test',
+    format_only=True,
+    metric=[])
+
+load_from = 'ckpts/dd3d_det_final.pth'
+find_unused_parameters = True
diff --git a/mmde/projects/DETR3D/detr3d/__init__.py b/mmde/projects/DETR3D/detr3d/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..677cc2a6646cfd836b5379087f9839e8d4eed753
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/__init__.py
@@ -0,0 +1,14 @@
+from .detr3d import DETR3D
+from .detr3d_head import DETR3DHead
+from .detr3d_transformer import (Detr3DCrossAtten, Detr3DTransformer,
+                                 Detr3DTransformerDecoder)
+from .hungarian_assigner_3d import HungarianAssigner3D
+from .match_cost import BBox3DL1Cost
+from .nms_free_coder import NMSFreeCoder
+from .vovnet import VoVNet
+
+__all__ = [
+    'VoVNet', 'DETR3D', 'DETR3DHead', 'Detr3DTransformer',
+    'Detr3DTransformerDecoder', 'Detr3DCrossAtten', 'HungarianAssigner3D',
+    'BBox3DL1Cost', 'NMSFreeCoder'
+]
diff --git a/mmde/projects/DETR3D/detr3d/detr3d.py b/mmde/projects/DETR3D/detr3d/detr3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..78f0037f69a1e1556865ab0a837909acc20310e6
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/detr3d.py
@@ -0,0 +1,201 @@
+from typing import Dict, List, Optional
+
+import torch
+from torch import Tensor
+
+from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from mmdet3d.structures.bbox_3d.utils import get_lidar2img
+from .grid_mask import GridMask
+
+
+@MODELS.register_module()
+class DETR3D(MVXTwoStageDetector):
+    """DETR3D: 3D Object Detection from Multi-view Images via 3D-to-2D Queries
+
+    Args:
+        data_preprocessor (dict or ConfigDict, optional): The pre-process
+            config of :class:`Det3DDataPreprocessor`. Defaults to None.
+        use_grid_mask (bool) : Data augmentation. Whether to mask out some
+            grids during extract_img_feat. Defaults to False.
+        img_backbone (dict, optional): Backbone of extracting
+            images feature. Defaults to None.
+        img_neck (dict, optional): Neck of extracting
+            image features. Defaults to None.
+        pts_bbox_head (dict, optional): Bboxes head of
+            detr3d. Defaults to None.
+        train_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        test_cfg (dict, optional): Train config of model.
+            Defaults to None.
+        init_cfg (dict, optional): Initialize config of
+            model. Defaults to None.
+    """
+
+    def __init__(self,
+                 data_preprocessor=None,
+                 use_grid_mask=False,
+                 img_backbone=None,
+                 img_neck=None,
+                 pts_bbox_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None):
+        super(DETR3D, self).__init__(
+            img_backbone=img_backbone,
+            img_neck=img_neck,
+            pts_bbox_head=pts_bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor)
+        self.grid_mask = GridMask(
+            True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
+        self.use_grid_mask = use_grid_mask
+
+    def extract_img_feat(self, img: Tensor,
+                         batch_input_metas: List[dict]) -> List[Tensor]:
+        """Extract features from images.
+
+        Args:
+            img (tensor): Batched multi-view image tensor with
+                shape (B, N, C, H, W).
+            batch_input_metas (list[dict]): Meta information of multiple inputs
+                in a batch.
+
+        Returns:
+             list[tensor]: multi-level image features.
+        """
+
+        B = img.size(0)
+        if img is not None:
+            input_shape = img.shape[-2:]  # bs nchw
+            # update real input shape of each single img
+            for img_meta in batch_input_metas:
+                img_meta.update(input_shape=input_shape)
+
+            if img.dim() == 5 and img.size(0) == 1:
+                img.squeeze_()
+            elif img.dim() == 5 and img.size(0) > 1:
+                B, N, C, H, W = img.size()
+                img = img.view(B * N, C, H, W)
+            if self.use_grid_mask:
+                img = self.grid_mask(img)  # mask out some grids
+            img_feats = self.img_backbone(img)
+            if isinstance(img_feats, dict):
+                img_feats = list(img_feats.values())
+        else:
+            return None
+        if self.with_img_neck:
+            img_feats = self.img_neck(img_feats)
+
+        img_feats_reshaped = []
+        for img_feat in img_feats:
+            BN, C, H, W = img_feat.size()
+            img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
+        return img_feats_reshaped
+
+    def extract_feat(self, batch_inputs_dict: Dict,
+                     batch_input_metas: List[dict]) -> List[Tensor]:
+        """Extract features from images.
+
+        Refer to self.extract_img_feat()
+        """
+        imgs = batch_inputs_dict.get('imgs', None)
+        img_feats = self.extract_img_feat(imgs, batch_input_metas)
+        return img_feats
+
+    def _forward(self):
+        raise NotImplementedError('tensor mode is yet to add')
+
+    # original forward_train
+    def loss(self, batch_inputs_dict: Dict[List, Tensor],
+             batch_data_samples: List[Det3DDataSample],
+             **kwargs) -> List[Det3DDataSample]:
+        """
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                `imgs` keys.
+                - imgs (torch.Tensor): Tensor of batched multi-view  images.
+                    It has shape (B, N, C, H ,W)
+            batch_data_samples (List[obj:`Det3DDataSample`]): The Data Samples
+                It usually includes information such as `gt_instance_3d`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+
+        """
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        batch_input_metas = self.add_lidar2img(batch_input_metas)
+        img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
+        outs = self.pts_bbox_head(img_feats, batch_input_metas, **kwargs)
+
+        batch_gt_instances_3d = [
+            item.gt_instances_3d for item in batch_data_samples
+        ]
+        loss_inputs = [batch_gt_instances_3d, outs]
+        losses_pts = self.pts_bbox_head.loss_by_feat(*loss_inputs)
+
+        return losses_pts
+
+    # original simple_test
+    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[Det3DDataSample]:
+        """Forward of testing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                `imgs` keys.
+
+                - imgs (torch.Tensor): Tensor of batched multi-view images.
+                    It has shape (B, N, C, H ,W)
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input sample. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
+                contains a tensor with shape (num_instances, 9).
+        """
+        batch_input_metas = [item.metainfo for item in batch_data_samples]
+        batch_input_metas = self.add_lidar2img(batch_input_metas)
+        img_feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
+        outs = self.pts_bbox_head(img_feats, batch_input_metas)
+
+        results_list_3d = self.pts_bbox_head.predict_by_feat(
+            outs, batch_input_metas, **kwargs)
+
+        # change the bboxes' format
+        detsamples = self.add_pred_to_datasample(batch_data_samples,
+                                                 results_list_3d)
+        return detsamples
+
+    # may need speed-up
+    def add_lidar2img(self, batch_input_metas: List[Dict]) -> List[Dict]:
+        """add 'lidar2img' transformation matrix into batch_input_metas.
+
+        Args:
+            batch_input_metas (list[dict]): Meta information of multiple inputs
+                in a batch.
+
+        Returns:
+            batch_input_metas (list[dict]): Meta info with lidar2img added
+        """
+        for meta in batch_input_metas:
+            l2i = list()
+            for i in range(len(meta['cam2img'])):
+                c2i = torch.tensor(meta['cam2img'][i]).double()
+                l2c = torch.tensor(meta['lidar2cam'][i]).double()
+                l2i.append(get_lidar2img(c2i, l2c).float().numpy())
+            meta['lidar2img'] = l2i
+        return batch_input_metas
diff --git a/mmde/projects/DETR3D/detr3d/detr3d_head.py b/mmde/projects/DETR3D/detr3d/detr3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4143ad5820aba122aab4ea6f978e4a09e77ab6b
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/detr3d_head.py
@@ -0,0 +1,447 @@
+import copy
+from typing import Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import Linear
+from mmdet.models.dense_heads import DETRHead
+from mmdet.models.layers import inverse_sigmoid
+from mmdet.models.utils import multi_apply
+from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
+from mmengine.model import bias_init_with_prob
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+from .util import normalize_bbox
+
+
+@MODELS.register_module()
+class DETR3DHead(DETRHead):
+    """Head of DETR3D.
+
+    Args:
+        with_box_refine (bool): Whether to refine the reference points
+            in the decoder. Defaults to False.
+        as_two_stage (bool) : Whether to generate the proposal from
+            the outputs of encoder.
+        transformer (obj:`ConfigDict`): ConfigDict is used for building
+            the Encoder and Decoder.
+        bbox_coder (obj:`ConfigDict`): Configs to build the bbox coder
+        num_cls_fcs (int) : the number of layers in cls and reg branch
+        code_weights (List[double]) : loss weights of
+            (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y)
+        code_size (int) : size of code_weights
+    """
+
+    def __init__(
+            self,
+            *args,
+            with_box_refine=False,
+            as_two_stage=False,
+            transformer=None,
+            bbox_coder=None,
+            num_cls_fcs=2,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+            code_size=10,
+            **kwargs):
+        self.with_box_refine = with_box_refine
+        self.as_two_stage = as_two_stage
+        if self.as_two_stage:
+            transformer['as_two_stage'] = self.as_two_stage
+        self.code_size = code_size
+        self.code_weights = code_weights
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.pc_range = self.bbox_coder.pc_range
+        self.num_cls_fcs = num_cls_fcs - 1
+        super(DETR3DHead, self).__init__(
+            *args, transformer=transformer, **kwargs)
+        # DETR sampling=False, so use PseudoSampler, format the result
+        sampler_cfg = dict(type='PseudoSampler')
+        self.sampler = TASK_UTILS.build(sampler_cfg)
+
+        self.code_weights = nn.Parameter(
+            torch.tensor(self.code_weights, requires_grad=False),
+            requires_grad=False)
+
+    # forward_train -> loss
+    def _init_layers(self):
+        """Initialize classification branch and regression branch of head."""
+        cls_branch = []
+        for _ in range(self.num_reg_fcs):
+            cls_branch.append(Linear(self.embed_dims, self.embed_dims))
+            cls_branch.append(nn.LayerNorm(self.embed_dims))
+            cls_branch.append(nn.ReLU(inplace=True))
+        cls_branch.append(Linear(self.embed_dims, self.cls_out_channels))
+        fc_cls = nn.Sequential(*cls_branch)
+
+        reg_branch = []
+        for _ in range(self.num_reg_fcs):
+            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
+            reg_branch.append(nn.ReLU())
+        reg_branch.append(Linear(self.embed_dims, self.code_size))
+        reg_branch = nn.Sequential(*reg_branch)
+
+        def _get_clones(module, N):
+            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+        # last reg_branch is used to generate proposal from
+        # encode feature map when as_two_stage is True.
+        num_pred = (self.transformer.decoder.num_layers + 1) if \
+            self.as_two_stage else self.transformer.decoder.num_layers
+
+        if self.with_box_refine:
+            self.cls_branches = _get_clones(fc_cls, num_pred)
+            self.reg_branches = _get_clones(reg_branch, num_pred)
+        else:
+            self.cls_branches = nn.ModuleList(
+                [fc_cls for _ in range(num_pred)])
+            self.reg_branches = nn.ModuleList(
+                [reg_branch for _ in range(num_pred)])
+
+        if not self.as_two_stage:
+            self.query_embedding = nn.Embedding(self.num_query,
+                                                self.embed_dims * 2)
+
+    def init_weights(self):
+        """Initialize weights of the DeformDETR head."""
+        self.transformer.init_weights()
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            for m in self.cls_branches:
+                nn.init.constant_(m[-1].bias, bias_init)
+
+    def forward(self, mlvl_feats: List[Tensor], img_metas: List[Dict],
+                **kwargs) -> Dict[str, Tensor]:
+        """Forward function.
+
+        Args:
+            mlvl_feats (List[Tensor]): Features from the upstream
+                network, each is a 5D-tensor with shape
+                (B, N, C, H, W).
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head,
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression
+                head with normalized coordinate format
+                (cx, cy, l, w, cz, h, sin(φ), cos(φ), vx, vy).
+                Shape [nb_dec, bs, num_query, 10].
+        """
+        query_embeds = self.query_embedding.weight
+        hs, init_reference, inter_references = self.transformer(
+            mlvl_feats,
+            query_embeds,
+            reg_branches=self.reg_branches if self.with_box_refine else None,
+            img_metas=img_metas,
+            **kwargs)
+        hs = hs.permute(0, 2, 1, 3)
+        outputs_classes = []
+        outputs_coords = []
+
+        for lvl in range(hs.shape[0]):
+            if lvl == 0:
+                reference = init_reference
+            else:
+                reference = inter_references[lvl - 1]
+            reference = inverse_sigmoid(reference)
+            outputs_class = self.cls_branches[lvl](hs[lvl])
+            tmp = self.reg_branches[lvl](hs[lvl])  # shape: ([B, num_q, 10])
+            # TODO: check the shape of reference
+            assert reference.shape[-1] == 3
+            tmp[..., 0:2] += reference[..., 0:2]
+            tmp[..., 0:2] = tmp[..., 0:2].sigmoid()
+            tmp[..., 4:5] += reference[..., 2:3]
+            tmp[..., 4:5] = tmp[..., 4:5].sigmoid()
+
+            tmp[..., 0:1] = \
+                tmp[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) \
+                + self.pc_range[0]
+            tmp[..., 1:2] = \
+                tmp[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) \
+                + self.pc_range[1]
+            tmp[..., 4:5] = \
+                tmp[..., 4:5] * (self.pc_range[5] - self.pc_range[2]) \
+                + self.pc_range[2]
+
+            # TODO: check if using sigmoid
+            outputs_coord = tmp
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+
+        outputs_classes = torch.stack(outputs_classes)
+        outputs_coords = torch.stack(outputs_coords)
+        outs = {
+            'all_cls_scores': outputs_classes,
+            'all_bbox_preds': outputs_coords,
+            'enc_cls_scores': None,
+            'enc_bbox_preds': None,
+        }
+        return outs
+
+    def _get_target_single(
+            self,
+            cls_score: Tensor,  # [query, num_cls]
+            bbox_pred: Tensor,  # [query, 10]
+            gt_instances_3d: InstanceList) -> Tuple[Tensor, ...]:
+        """Compute regression and classification targets for a single image."""
+        # turn bottm center into gravity center
+        gt_bboxes = gt_instances_3d.bboxes_3d  # [num_gt, 9]
+        gt_bboxes = torch.cat(
+            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1)
+
+        gt_labels = gt_instances_3d.labels_3d  # [num_gt, num_cls]
+        # assigner and sampler: PseudoSampler
+        assign_result = self.assigner.assign(
+            bbox_pred, cls_score, gt_bboxes, gt_labels, gt_bboxes_ignore=None)
+        sampling_result = self.sampler.sample(
+            assign_result, InstanceData(priors=bbox_pred),
+            InstanceData(bboxes_3d=gt_bboxes))
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label targets
+        num_bboxes = bbox_pred.size(0)
+        labels = gt_bboxes.new_full((num_bboxes, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_bboxes)
+
+        # bbox targets
+        # theta in gt_bbox here is still a single scalar
+        bbox_targets = torch.zeros_like(bbox_pred)[..., :self.code_size - 1]
+        bbox_weights = torch.zeros_like(bbox_pred)
+        # only matched query will learn from bbox coord
+        bbox_weights[pos_inds] = 1.0
+
+        # fix empty gt bug in multi gpu training
+        if sampling_result.pos_gt_bboxes.shape[0] == 0:
+            sampling_result.pos_gt_bboxes = \
+                sampling_result.pos_gt_bboxes.reshape(0, self.code_size - 1)
+
+        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
+
+    def get_targets(
+            self,
+            batch_cls_scores: List[Tensor],  # bs[num_q,num_cls]
+            batch_bbox_preds: List[Tensor],  # bs[num_q,10]
+            batch_gt_instances_3d: InstanceList) -> tuple():
+        """"Compute regression and classification targets for a batch image for
+        a single decoder layer.
+
+        Args:
+            batch_cls_scores (list[Tensor]): Box score logits from a single
+                decoder layer for each image with shape [num_query,
+                cls_out_channels].
+            batch_bbox_preds (list[Tensor]): Sigmoid outputs from a single
+                decoder layer for each image, with normalized coordinate
+                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y) and
+                shape [num_query, 10]
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance.  It usually includes ``bboxes_3d``、``labels_3d``.
+        Returns:
+            tuple: a tuple containing the following targets.
+                - labels_list (list[Tensor]): Labels for all images.
+                - label_weights_list (list[Tensor]): Label weights for all \
+                    images.
+                - bbox_targets_list (list[Tensor]): BBox targets for all \
+                    images.
+                - bbox_weights_list (list[Tensor]): BBox weights for all \
+                    images.
+                - num_total_pos (int): Number of positive samples in all \
+                    images.
+                - num_total_neg (int): Number of negative samples in all \
+                    images.
+        """
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         pos_inds_list, neg_inds_list) = multi_apply(self._get_target_single,
+                                                     batch_cls_scores,
+                                                     batch_bbox_preds,
+                                                     batch_gt_instances_3d)
+
+        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, num_total_pos, num_total_neg)
+
+    def loss_by_feat_single(
+        self,
+        batch_cls_scores: Tensor,  # bs,num_q,num_cls
+        batch_bbox_preds: Tensor,  # bs,num_q,10
+        batch_gt_instances_3d: InstanceList
+    ) -> Tuple[Tensor, Tensor]:
+        """"Loss function for outputs from a single decoder layer of a single
+        feature level.
+
+        Args:
+           batch_cls_scores (Tensor): Box score logits from a single
+                decoder layer for batched images with shape [num_query,
+                cls_out_channels].
+            batch_bbox_preds (Tensor): Sigmoid outputs from a single
+                decoder layer for batched images, with normalized coordinate
+                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y) and
+                shape [num_query, 10]
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d. It usually has ``bboxes_3d``,``labels_3d``.
+        Returns:
+            tulple(Tensor, Tensor): cls and reg loss for outputs from
+                a single decoder layer.
+        """
+        batch_size = batch_cls_scores.size(0)  # batch size
+        cls_scores_list = [batch_cls_scores[i] for i in range(batch_size)]
+        bbox_preds_list = [batch_bbox_preds[i] for i in range(batch_size)]
+        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
+                                           batch_gt_instances_3d)
+
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # classification loss
+        batch_cls_scores = batch_cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                batch_cls_scores.new_tensor([cls_avg_factor]))
+
+        cls_avg_factor = max(cls_avg_factor, 1)
+        loss_cls = self.loss_cls(
+            batch_cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes across all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # regression L1 loss
+        batch_bbox_preds = batch_bbox_preds.reshape(-1,
+                                                    batch_bbox_preds.size(-1))
+        normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
+        # neg_query is all 0, log(0) is NaN
+        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
+        bbox_weights = bbox_weights * self.code_weights
+
+        loss_bbox = self.loss_bbox(
+            batch_bbox_preds[isnotnan, :self.code_size],
+            normalized_bbox_targets[isnotnan, :self.code_size],
+            bbox_weights[isnotnan, :self.code_size],
+            avg_factor=num_total_pos)
+
+        loss_cls = torch.nan_to_num(loss_cls)
+        loss_bbox = torch.nan_to_num(loss_bbox)
+        return loss_cls, loss_bbox
+
+    # original loss()
+    def loss_by_feat(
+            self,
+            batch_gt_instances_3d: InstanceList,
+            preds_dicts: Dict[str, Tensor],
+            batch_gt_instances_3d_ignore: OptInstanceList = None) -> Dict:
+        """Compute loss of the head.
+
+        Args:
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、`
+                `labels_3d``、``depths``、``centers_2d`` and attributes.
+                gt_instance.  It usually includes ``bboxes``、``labels``.
+            batch_gt_instances_3d_ignore (list[:obj:`InstanceData`], Optional):
+                NOT supported.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert batch_gt_instances_3d_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            f'for batch_gt_instances_3d_ignore setting to None.'
+        all_cls_scores = preds_dicts[
+            'all_cls_scores']  # num_dec,bs,num_q,num_cls
+        all_bbox_preds = preds_dicts['all_bbox_preds']  # num_dec,bs,num_q,10
+        enc_cls_scores = preds_dicts['enc_cls_scores']
+        enc_bbox_preds = preds_dicts['enc_bbox_preds']
+
+        # calculate loss for each decoder layer
+        num_dec_layers = len(all_cls_scores)
+        batch_gt_instances_3d_list = [
+            batch_gt_instances_3d for _ in range(num_dec_layers)
+        ]
+        losses_cls, losses_bbox = multi_apply(self.loss_by_feat_single,
+                                              all_cls_scores, all_bbox_preds,
+                                              batch_gt_instances_3d_list)
+
+        loss_dict = dict()
+        # loss of proposal generated from encode feature map.
+        if enc_cls_scores is not None:
+            enc_loss_cls, enc_losses_bbox = self.loss_by_feat_single(
+                enc_cls_scores, enc_bbox_preds, batch_gt_instances_3d_list)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_bbox'] = losses_bbox[-1]
+
+        # loss from other decoder layers
+        num_dec_layer = 0
+        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+            num_dec_layer += 1
+        return loss_dict
+
+    def predict_by_feat(self,
+                        preds_dicts,
+                        img_metas,
+                        rescale=False) -> InstanceList:
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            preds_dicts (Dict[str, Tensor]):
+                -all_cls_scores (Tensor): Outputs from the classification head,
+                    shape [nb_dec, bs, num_query, cls_out_channels]. Note
+                    cls_out_channels should includes background.
+                -all_bbox_preds (Tensor): Sigmoid outputs from the regression
+                    head with normalized coordinate format
+                    (cx, cy, l, w, cz, h, rot_sine, rot_cosine, v_x, v_y).
+                    Shape [nb_dec, bs, num_query, 10].
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes_3d (Tensor): Contains a tensor with shape
+                  (num_instances, C), where C >= 7.
+        """
+        # sinθ & cosθ ---> θ
+        preds_dicts = self.bbox_coder.decode(preds_dicts)
+        num_samples = len(preds_dicts)  # batch size
+        ret_list = []
+        for i in range(num_samples):
+            results = InstanceData()
+            preds = preds_dicts[i]
+            bboxes = preds['bboxes']
+            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+            bboxes = img_metas[i]['box_type_3d'](bboxes, self.code_size - 1)
+
+            results.bboxes_3d = bboxes
+            results.scores_3d = preds['scores']
+            results.labels_3d = preds['labels']
+            ret_list.append(results)
+        return ret_list
diff --git a/mmde/projects/DETR3D/detr3d/detr3d_transformer.py b/mmde/projects/DETR3D/detr3d/detr3d_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfe0765039285a16a7df79b238cb6367259c8bab
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/detr3d_transformer.py
@@ -0,0 +1,447 @@
+import warnings
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn.bricks.transformer import (TransformerLayerSequence,
+                                         build_transformer_layer_sequence)
+from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention
+from mmengine.model import BaseModule, constant_init, xavier_init
+
+from mmdet3d.registry import MODELS
+
+
+def inverse_sigmoid(x, eps=1e-5):
+    """Inverse function of sigmoid.
+
+    Args:
+        x (Tensor): The tensor to do the
+            inverse.
+        eps (float): EPS avoid numerical
+            overflow. Defaults 1e-5.
+    Returns:
+        Tensor: The x has passed the inverse
+            function of sigmoid, has same
+            shape with input.
+    """
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)
+    x2 = (1 - x).clamp(min=eps)
+    return torch.log(x1 / x2)
+
+
+@MODELS.register_module()
+class Detr3DTransformer(BaseModule):
+    """Implements the DETR3D transformer.
+
+    Args:
+        as_two_stage (bool): Generate query from encoder features.
+            Default: False.
+        num_feature_levels (int): Number of feature maps from FPN:
+            Default: 4.
+        num_cams (int): Number of cameras in the dataset.
+            Default: 6 in NuScenes Det.
+        two_stage_num_proposals (int): Number of proposals when set
+            `as_two_stage` as True. Default: 300.
+    """
+
+    def __init__(self,
+                 num_feature_levels=4,
+                 num_cams=6,
+                 two_stage_num_proposals=300,
+                 decoder=None,
+                 **kwargs):
+        super(Detr3DTransformer, self).__init__(**kwargs)
+        self.decoder = build_transformer_layer_sequence(decoder)
+        self.embed_dims = self.decoder.embed_dims
+        self.num_feature_levels = num_feature_levels
+        self.num_cams = num_cams
+        self.two_stage_num_proposals = two_stage_num_proposals
+        self.init_layers()
+
+    def init_layers(self):
+        """Initialize layers of the Detr3DTransformer."""
+        self.reference_points = nn.Linear(self.embed_dims, 3)
+
+    def init_weights(self):
+        """Initialize the transformer weights."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MultiScaleDeformableAttention) or isinstance(
+                    m, Detr3DCrossAtten):
+                m.init_weight()
+        xavier_init(self.reference_points, distribution='uniform', bias=0.)
+
+    def forward(self, mlvl_feats, query_embed, reg_branches=None, **kwargs):
+        """Forward function for `Detr3DTransformer`.
+        Args:
+            mlvl_feats (list(Tensor)): Input queries from
+                different level. Each element has shape
+                (B, N, C, H_lvl, W_lvl).
+            query_embed (Tensor): The query positional and semantic embedding
+                for decoder, with shape [num_query, c+c].
+            mlvl_pos_embeds (list(Tensor)): The positional encoding
+                of feats from different level, has the shape
+                [bs, N, embed_dims, h, w]. It is unused here.
+            reg_branches (obj:`nn.ModuleList`): Regression heads for
+                feature maps from each decoder layer. Only would
+                be passed when `with_box_refine` is True. Default to None.
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+                - inter_states: Outputs from decoder. If
+                    return_intermediate_dec is True output has shape
+                      (num_dec_layers, bs, num_query, embed_dims), else has
+                      shape (1, bs, num_query, embed_dims).
+                - init_reference_out: The initial value of reference
+                    points, has shape (bs, num_queries, 4).
+                - inter_references_out: The internal value of reference
+                    points in decoder, has shape
+                    (num_dec_layers, bs, num_query, embed_dims)
+        """
+        assert query_embed is not None
+        bs = mlvl_feats[0].size(0)
+        query_pos, query = torch.split(query_embed, self.embed_dims, dim=1)
+        query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)  # [bs,num_q,c]
+        query = query.unsqueeze(0).expand(bs, -1, -1)  # [bs,num_q,c]
+        reference_points = self.reference_points(query_pos)
+        reference_points = reference_points.sigmoid()
+        init_reference_out = reference_points
+
+        # decoder
+        query = query.permute(1, 0, 2)
+        query_pos = query_pos.permute(1, 0, 2)
+        inter_states, inter_references = self.decoder(
+            query=query,
+            key=None,
+            value=mlvl_feats,
+            query_pos=query_pos,
+            reference_points=reference_points,
+            reg_branches=reg_branches,
+            **kwargs)
+
+        inter_references_out = inter_references
+        return inter_states, init_reference_out, inter_references_out
+
+
+@MODELS.register_module()
+class Detr3DTransformerDecoder(TransformerLayerSequence):
+    """Implements the decoder in DETR3D transformer.
+
+    Args:
+        return_intermediate (bool): Whether to return intermediate outputs.
+        coder_norm_cfg (dict): Config of last normalization layer. Default:
+            `LN`.
+    """
+
+    def __init__(self, *args, return_intermediate=False, **kwargs):
+        super(Detr3DTransformerDecoder, self).__init__(*args, **kwargs)
+        self.return_intermediate = return_intermediate
+
+    def forward(self,
+                query,
+                *args,
+                reference_points=None,
+                reg_branches=None,
+                **kwargs):
+        """Forward function for `Detr3DTransformerDecoder`.
+        Args:
+            query (Tensor): Input query with shape
+                `(num_query, bs, embed_dims)`.
+            reference_points (Tensor): The reference
+                points of offset. has shape
+                (bs, num_query, 4) when as_two_stage,
+                otherwise has shape self.reference_points =
+                                        nn.Linear(self.embed_dims, 3)
+            reg_branch: (obj:`nn.ModuleList`): Used for
+                refining the regression results. Only would
+                be passed when with_box_refine is True,
+                otherwise would be passed a `None`.
+        Returns:
+            Tensor: Results with shape [1, num_query, bs, embed_dims] when
+                return_intermediate is `False`, otherwise it has shape
+                [num_layers, num_query, bs, embed_dims].
+        """
+        output = query
+        intermediate = []
+        intermediate_reference_points = []
+        for lid, layer in enumerate(self.layers):  # iterative refinement
+            reference_points_input = reference_points
+            output = layer(
+                output,
+                *args,
+                reference_points=reference_points_input,
+                **kwargs)
+            output = output.permute(1, 0, 2)
+            if reg_branches is not None:
+                tmp = reg_branches[lid](output)
+
+                assert reference_points.shape[-1] == 3
+
+                new_reference_points = torch.zeros_like(reference_points)
+                new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(
+                    reference_points[..., :2])
+                new_reference_points[...,
+                                     2:3] = tmp[..., 4:5] + inverse_sigmoid(
+                                         reference_points[..., 2:3])
+                new_reference_points = new_reference_points.sigmoid()
+
+                reference_points = new_reference_points.detach()
+
+            output = output.permute(1, 0, 2)
+            if self.return_intermediate:
+                intermediate.append(output)
+                intermediate_reference_points.append(reference_points)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(
+                intermediate_reference_points)
+
+        return output, reference_points
+
+
+@MODELS.register_module()
+class Detr3DCrossAtten(BaseModule):
+    """An attention module used in Detr3d.
+
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_heads (int): Parallel attention heads. Default: 64.
+        num_levels (int): The number of feature map used in
+            Attention. Default: 4.
+        num_points (int): The number of sampling points for
+            each query in each head. Default: 4.
+        im2col_step (int): The step used in image_to_column.
+            Default: 64.
+        dropout (float): A Dropout layer on `inp_residual`.
+            Default: 0..
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(
+        self,
+        embed_dims=256,
+        num_heads=8,
+        num_levels=4,
+        num_points=5,
+        num_cams=6,
+        im2col_step=64,
+        pc_range=None,
+        dropout=0.1,
+        norm_cfg=None,
+        init_cfg=None,
+        batch_first=False,
+    ):
+        super(Detr3DCrossAtten, self).__init__(init_cfg)
+        if embed_dims % num_heads != 0:
+            raise ValueError(f'embed_dims must be divisible by num_heads, '
+                             f'but got {embed_dims} and {num_heads}')
+        dim_per_head = embed_dims // num_heads
+        self.norm_cfg = norm_cfg
+        self.init_cfg = init_cfg
+        self.dropout = nn.Dropout(dropout)
+        self.pc_range = pc_range
+
+        # you'd better set dim_per_head to a power of 2
+        # which is more efficient in the CUDA implementation
+        def _is_power_of_2(n):
+            if (not isinstance(n, int)) or (n < 0):
+                raise ValueError(
+                    'invalid input for _is_power_of_2: {} (type: {})'.format(
+                        n, type(n)))
+            return (n & (n - 1) == 0) and n != 0
+
+        if not _is_power_of_2(dim_per_head):
+            warnings.warn(
+                "You'd better set embed_dims in "
+                'MultiScaleDeformAttention to make '
+                'the dimension of each attention head a power of 2 '
+                'which is more efficient in our CUDA implementation.')
+
+        self.im2col_step = im2col_step
+        self.embed_dims = embed_dims
+        self.num_levels = num_levels
+        self.num_heads = num_heads
+        self.num_points = num_points
+        self.num_cams = num_cams
+        self.attention_weights = nn.Linear(embed_dims,
+                                           num_cams * num_levels * num_points)
+
+        self.output_proj = nn.Linear(embed_dims, embed_dims)
+
+        self.position_encoder = nn.Sequential(
+            nn.Linear(3, self.embed_dims),
+            nn.LayerNorm(self.embed_dims),
+            nn.ReLU(inplace=True),
+            nn.Linear(self.embed_dims, self.embed_dims),
+            nn.LayerNorm(self.embed_dims),
+            nn.ReLU(inplace=True),
+        )
+        self.batch_first = batch_first
+        self.init_weight()
+
+    def init_weight(self):
+        """Default initialization for Parameters of Module."""
+        constant_init(self.attention_weights, val=0., bias=0.)
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+
+    def forward(self,
+                query,
+                key,
+                value,
+                residual=None,
+                query_pos=None,
+                reference_points=None,
+                **kwargs):
+        """Forward Function of Detr3DCrossAtten.
+
+        Args:
+            query (Tensor): Query of Transformer with shape
+                (num_query, bs, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(num_key, bs, embed_dims)`.
+            value (List[Tensor]): Image features from
+                different level. Each element has shape
+                (B, N, C, H_lvl, W_lvl).
+            residual (Tensor): The tensor used for addition, with the
+                same shape as `x`. Default None. If None, `x` will be used.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            reference_points (Tensor): The normalized 3D reference
+                points with shape (bs, num_query, 3)
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+        if key is None:
+            key = query
+        if value is None:
+            value = key
+
+        if residual is None:
+            inp_residual = query
+        if query_pos is not None:
+            query = query + query_pos
+
+        query = query.permute(1, 0, 2)
+
+        bs, num_query, _ = query.size()
+
+        attention_weights = self.attention_weights(query).view(
+            bs, 1, num_query, self.num_cams, self.num_points, self.num_levels)
+        reference_points_3d, output, mask = feature_sampling(
+            value, reference_points, self.pc_range, kwargs['img_metas'])
+        output = torch.nan_to_num(output)
+        mask = torch.nan_to_num(mask)
+        attention_weights = attention_weights.sigmoid() * mask
+        output = output * attention_weights
+        output = output.sum(-1).sum(-1).sum(-1)
+        output = output.permute(2, 0, 1)
+        # (num_query, bs, embed_dims)
+        output = self.output_proj(output)
+        pos_feat = self.position_encoder(
+            inverse_sigmoid(reference_points_3d)).permute(1, 0, 2)
+        return self.dropout(output) + inp_residual + pos_feat
+
+
+def feature_sampling(mlvl_feats,
+                     ref_pt,
+                     pc_range,
+                     img_metas,
+                     no_sampling=False):
+    """ sample multi-level features by projecting 3D reference points
+            to 2D image
+        Args:
+            mlvl_feats (List[Tensor]): Image features from
+                different level. Each element has shape
+                (B, N, C, H_lvl, W_lvl).
+            ref_pt (Tensor): The normalized 3D reference
+                points with shape (bs, num_query, 3)
+            pc_range: perception range of the detector
+            img_metas (list[dict]): Meta information of multiple inputs
+                in a batch, containing `lidar2img`.
+            no_sampling (bool): If set 'True', the function will return
+                2D projected points and mask only.
+        Returns:
+            ref_pt_3d (Tensor): A copy of original ref_pt
+            sampled_feats (Tensor): sampled features with shape \
+                (B C num_q N 1 fpn_lvl)
+            mask (Tensor): Determine whether the reference point \
+                has projected outsied of images, with shape \
+                (B 1 num_q N 1 1)
+    """
+    lidar2img = [meta['lidar2img'] for meta in img_metas]
+    lidar2img = np.asarray(lidar2img)
+    lidar2img = ref_pt.new_tensor(lidar2img)
+    ref_pt = ref_pt.clone()
+    ref_pt_3d = ref_pt.clone()
+
+    B, num_query = ref_pt.size()[:2]
+    num_cam = lidar2img.size(1)
+    eps = 1e-5
+
+    ref_pt[..., 0:1] = \
+        ref_pt[..., 0:1] * (pc_range[3] - pc_range[0]) + pc_range[0]  # x
+    ref_pt[..., 1:2] = \
+        ref_pt[..., 1:2] * (pc_range[4] - pc_range[1]) + pc_range[1]  # y
+    ref_pt[..., 2:3] = \
+        ref_pt[..., 2:3] * (pc_range[5] - pc_range[2]) + pc_range[2]  # z
+
+    # (B num_q 3) -> (B num_q 4) -> (B 1 num_q 4) -> (B num_cam num_q 4 1)
+    ref_pt = torch.cat((ref_pt, torch.ones_like(ref_pt[..., :1])), -1)
+    ref_pt = ref_pt.view(B, 1, num_query, 4)
+    ref_pt = ref_pt.repeat(1, num_cam, 1, 1).unsqueeze(-1)
+    # (B num_cam 4 4) -> (B num_cam num_q 4 4)
+    lidar2img = lidar2img.view(B, num_cam, 1, 4, 4)\
+                         .repeat(1, 1, num_query, 1, 1)
+    # (... 4 4) * (... 4 1) -> (B num_cam num_q 4)
+    pt_cam = torch.matmul(lidar2img, ref_pt).squeeze(-1)
+
+    # (B num_cam num_q)
+    z = pt_cam[..., 2:3]
+    eps = eps * torch.ones_like(z)
+    mask = (z > eps)
+    pt_cam = pt_cam[..., 0:2] / torch.maximum(z, eps)  # prevent zero-division
+    # padded nuscene image: 928*1600
+    (h, w) = img_metas[0]['pad_shape']
+    pt_cam[..., 0] /= w
+    pt_cam[..., 1] /= h
+    # else:
+    # (h,w,_) = img_metas[0]['ori_shape'][0]          # waymo image
+    # pt_cam[..., 0] /= w # cam0~2: 1280*1920
+    # pt_cam[..., 1] /= h # cam3~4: 886 *1920 padded to 1280*1920
+    # mask[:, 3:5, :] &= (pt_cam[:, 3:5, :, 1:2] < 0.7) # filter pt_cam_y > 886
+
+    mask = (
+        mask & (pt_cam[..., 0:1] > 0.0)
+        & (pt_cam[..., 0:1] < 1.0)
+        & (pt_cam[..., 1:2] > 0.0)
+        & (pt_cam[..., 1:2] < 1.0))
+
+    if no_sampling:
+        return pt_cam, mask
+
+    # (B num_cam num_q) -> (B 1 num_q num_cam 1 1)
+    mask = mask.view(B, num_cam, 1, num_query, 1, 1).permute(0, 2, 3, 1, 4, 5)
+    mask = torch.nan_to_num(mask)
+
+    pt_cam = (pt_cam - 0.5) * 2  # [0,1] to [-1,1] to do grid_sample
+    sampled_feats = []
+    for lvl, feat in enumerate(mlvl_feats):
+        B, N, C, H, W = feat.size()
+        feat = feat.view(B * N, C, H, W)
+        pt_cam_lvl = pt_cam.view(B * N, num_query, 1, 2)
+        sampled_feat = F.grid_sample(feat, pt_cam_lvl)
+        # (B num_cam C num_query 1) -> List of (B C num_q num_cam 1)
+        sampled_feat = sampled_feat.view(B, N, C, num_query, 1)
+        sampled_feat = sampled_feat.permute(0, 2, 3, 1, 4)
+        sampled_feats.append(sampled_feat)
+
+    sampled_feats = torch.stack(sampled_feats, -1)
+    # (B C num_q num_cam fpn_lvl)
+    sampled_feats = \
+        sampled_feats.view(B, C, num_query, num_cam, 1, len(mlvl_feats))
+    return ref_pt_3d, sampled_feats, mask
diff --git a/mmde/projects/DETR3D/detr3d/grid_mask.py b/mmde/projects/DETR3D/detr3d/grid_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..33e2ccebb3628476100a9e9b22051352d450a8c2
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/grid_mask.py
@@ -0,0 +1,142 @@
+import numpy as np
+import torch
+import torch.nn as nn
+from PIL import Image
+
+
+class Grid(object):
+
+    def __init__(self,
+                 use_h,
+                 use_w,
+                 rotate=1,
+                 offset=False,
+                 ratio=0.5,
+                 mode=0,
+                 prob=1.):
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode = mode
+        self.st_prob = prob
+        self.prob = prob
+
+    def set_prob(self, epoch, max_epoch):
+        self.prob = self.st_prob * epoch / max_epoch
+
+    def __call__(self, img, label):
+        if np.random.rand() > self.prob:
+            return img, label
+        h = img.size(1)
+        w = img.size(2)
+        self.d1 = 2
+        self.d2 = min(h, w)
+        hh = int(1.5 * h)
+        ww = int(1.5 * w)
+        d = np.random.randint(self.d1, self.d2)
+        if self.ratio == 1:
+            self.L = np.random.randint(1, d)
+        else:
+            self.L = min(max(int(d * self.ratio + 0.5), 1), d - 1)
+        mask = np.ones((hh, ww), np.float32)
+        st_h = np.random.randint(d)
+        st_w = np.random.randint(d)
+        if self.use_h:
+            for i in range(hh // d):
+                s = d * i + st_h
+                t = min(s + self.L, hh)
+                mask[s:t, :] *= 0
+        if self.use_w:
+            for i in range(ww // d):
+                s = d * i + st_w
+                t = min(s + self.L, ww)
+                mask[:, s:t] *= 0
+
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+        mask = mask.rotate(r)
+        mask = np.asarray(mask)
+        mask = mask[(hh - h) // 2:(hh - h) // 2 + h,
+                    (ww - w) // 2:(ww - w) // 2 + w]
+
+        mask = torch.from_numpy(mask).float()
+        if self.mode == 1:
+            mask = 1 - mask
+
+        mask = mask.expand_as(img)
+        if self.offset:
+            offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float()
+            offset = (1 - mask) * offset
+            img = img * mask + offset
+        else:
+            img = img * mask
+
+        return img, label
+
+
+class GridMask(nn.Module):
+
+    def __init__(self,
+                 use_h,
+                 use_w,
+                 rotate=1,
+                 offset=False,
+                 ratio=0.5,
+                 mode=0,
+                 prob=1.):
+        super(GridMask, self).__init__()
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode = mode
+        self.st_prob = prob
+        self.prob = prob
+
+    def set_prob(self, epoch, max_epoch):
+        self.prob = self.st_prob * epoch / max_epoch  # + 1.# 0.5
+
+    def forward(self, x):
+        if np.random.rand() > self.prob or not self.training:
+            return x
+        n, c, h, w = x.size()
+        x = x.view(-1, h, w)
+        hh = int(1.5 * h)
+        ww = int(1.5 * w)
+        d = np.random.randint(2, h)
+        self.L = min(max(int(d * self.ratio + 0.5), 1), d - 1)
+        mask = np.ones((hh, ww), np.float32)
+        st_h = np.random.randint(d)
+        st_w = np.random.randint(d)
+        if self.use_h:
+            for i in range(hh // d):
+                s = d * i + st_h
+                t = min(s + self.L, hh)
+                mask[s:t, :] *= 0
+        if self.use_w:
+            for i in range(ww // d):
+                s = d * i + st_w
+                t = min(s + self.L, ww)
+                mask[:, s:t] *= 0
+
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+        mask = mask.rotate(r)
+        mask = np.asarray(mask)
+        mask = mask[(hh - h) // 2:(hh - h) // 2 + h,
+                    (ww - w) // 2:(ww - w) // 2 + w]
+
+        mask = torch.from_numpy(mask).to(x)
+        if self.mode == 1:
+            mask = 1 - mask
+        mask = mask.expand_as(x)
+        if self.offset:
+            offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).to(x)
+            x = x * mask + offset * (1 - mask)
+        else:
+            x = x * mask
+
+        return x.view(n, c, h, w)
diff --git a/mmde/projects/DETR3D/detr3d/hungarian_assigner_3d.py b/mmde/projects/DETR3D/detr3d/hungarian_assigner_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab9c47d08ee8722b74ec36ca0a60b2dfe077961f
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/hungarian_assigner_3d.py
@@ -0,0 +1,135 @@
+from typing import List
+
+import torch
+from mmdet.models.task_modules.assigners import AssignResult  # check
+from mmdet.models.task_modules.assigners import BaseAssigner
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+from .util import normalize_bbox
+
+try:
+    from scipy.optimize import linear_sum_assignment
+except ImportError:
+    linear_sum_assignment = None
+
+
+@TASK_UTILS.register_module()
+class HungarianAssigner3D(BaseAssigner):
+    """Computes one-to-one matching between predictions and ground truth.
+
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of some components.
+    For DETR3D the costs are weighted sum of classification cost, regression L1
+    cost and regression iou cost. The targets don't include the no_object, so
+    generally there are more predictions than targets. After the one-to-one
+    matching, the un-matched are treated as backgrounds. Thus each query
+    prediction will be assigned with `0` or a positive integer indicating the
+    ground truth index:
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        cls_cost (obj:`ConfigDict`) : Match cost configs.
+        reg_cost.
+        iou_cost.
+        pc_range: perception range of the detector
+    """
+
+    def __init__(self,
+                 cls_cost=dict(type='ClassificationCost', weight=1.),
+                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+                 iou_cost=dict(type='IoUCost', weight=0.0),
+                 pc_range: List = None):
+        self.cls_cost = TASK_UTILS.build(cls_cost)
+        self.reg_cost = TASK_UTILS.build(reg_cost)
+        self.iou_cost = TASK_UTILS.build(iou_cost)
+        self.pc_range = pc_range
+
+    def assign(self,
+               bbox_pred: Tensor,
+               cls_pred: Tensor,
+               gt_bboxes: Tensor,
+               gt_labels: Tensor,
+               gt_bboxes_ignore=None,
+               eps=1e-7) -> AssignResult:
+        """Computes one-to-one matching based on the weighted costs.
+        This method assign each query prediction to a ground truth or
+        background. The `assigned_gt_inds` with -1 means don't care,
+        0 means negative sample, and positive number is the index (1-based)
+        of assigned gt.
+        The assignment is done in the following steps, the order matters.
+        1. assign every prediction to -1
+        2. compute the weighted costs
+        3. do Hungarian matching on CPU based on the costs
+        4. assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y) which are all in
+                range [0, 1] and shape [num_query, 10].
+            cls_pred (Tensor): Predicted classification logits, shape
+                [num_query, num_class].
+            gt_bboxes (Tensor): Ground truth boxes with unnormalized
+                coordinates (cx,cy,cz,l,w,h,φ,v_x,v_y). Shape [num_gt, 9].
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`. Default None.
+            eps (int | float, optional): unused parameter
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        assert gt_bboxes_ignore is None, \
+            'Only case when gt_bboxes_ignore is None is supported.'
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)  # 9, 900
+
+        # 1. assign -1 by default
+        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+                                              -1,
+                                              dtype=torch.long)
+        assigned_labels = bbox_pred.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+
+        # 2. compute the weighted costs
+        # classification and bboxcost.
+        # # dev1.x interface alignment
+        pred_instances = InstanceData(scores=cls_pred)
+        gt_instances = InstanceData(labels=gt_labels)
+        cls_cost = self.cls_cost(pred_instances, gt_instances)
+        # regression L1 cost
+        normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range)
+        reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8])
+
+        # weighted sum of above two costs
+        cost = cls_cost + reg_cost
+
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = torch.from_numpy(matched_row_inds).to(
+            bbox_pred.device)
+        matched_col_inds = torch.from_numpy(matched_col_inds).to(
+            bbox_pred.device)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+        return AssignResult(
+            num_gts, assigned_gt_inds, None, labels=assigned_labels)
diff --git a/mmde/projects/DETR3D/detr3d/match_cost.py b/mmde/projects/DETR3D/detr3d/match_cost.py
new file mode 100644
index 0000000000000000000000000000000000000000..420ff2f7998d63cd21970892ed59d5944726f606
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/match_cost.py
@@ -0,0 +1,34 @@
+from typing import Union
+
+import torch
+from torch import Tensor
+
+from mmdet3d.registry import TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class BBox3DL1Cost(object):
+    """BBox3DL1Cost.
+
+    Args:
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self, weight: Union[float, int] = 1.):
+        self.weight = weight
+
+    def __call__(self, bbox_pred: Tensor, gt_bboxes: Tensor) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y)
+                which are all in range [0, 1] and shape [num_query, 10].
+            gt_bboxes (Tensor): Ground truth boxes with `normalized`
+                coordinates (cx,cy,l,w,cz,h,sin(φ),cos(φ),v_x,v_y).
+                Shape [num_gt, 10].
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+        return bbox_cost * self.weight
diff --git a/mmde/projects/DETR3D/detr3d/nms_free_coder.py b/mmde/projects/DETR3D/detr3d/nms_free_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdc36cd0e8ac5fd90f93b367bd8b33e34fcbc45b
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/nms_free_coder.py
@@ -0,0 +1,118 @@
+import torch
+from mmdet.models.task_modules import BaseBBoxCoder
+
+from mmdet3d.registry import TASK_UTILS
+from .util import denormalize_bbox
+
+
+@TASK_UTILS.register_module()
+class NMSFreeCoder(BaseBBoxCoder):
+    """Bbox coder for NMS-free detector.
+
+    Args:
+        pc_range (list[float]): Range of point cloud.
+        post_center_range (list[float]): Limit of the center.
+            Default: None.
+        max_num (int): Max number to be kept. Default: 100.
+        score_threshold (float): Threshold to filter boxes based on score.
+            Default: None.
+    """
+
+    def __init__(self,
+                 pc_range=None,
+                 voxel_size=None,
+                 post_center_range=None,
+                 max_num=100,
+                 score_threshold=None,
+                 num_classes=10):
+
+        self.pc_range = pc_range
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.max_num = max_num
+        self.score_threshold = score_threshold
+        self.num_classes = num_classes
+
+    def encode(self):
+        pass
+
+    def decode_single(self, cls_scores, bbox_preds):
+        """Decode bboxes.
+
+        Args:
+            cls_scores (Tensor): Outputs from the classification head,
+                shape [num_query, cls_out_channels]. Note that
+                cls_out_channels should includes background.
+            bbox_preds (Tensor): Outputs from the regression
+                head with normalized coordinate
+                (cx, cy, l, w, cz, h, rot_sine, rot_cosine, vx, vy).
+                Shape [num_query, 10].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        max_num = self.max_num
+
+        cls_scores = cls_scores.sigmoid()
+        scores, indexes = cls_scores.view(-1).topk(max_num)
+        labels = indexes % self.num_classes
+        bbox_index = indexes // self.num_classes
+        bbox_preds = bbox_preds[bbox_index]
+
+        # [[cx, cy, cz, l, w, h, rot, vx, vy]]
+        final_box_preds = denormalize_bbox(bbox_preds, None)
+        final_scores = scores
+        final_preds = labels
+
+        # use score threshold
+        if self.score_threshold is not None:
+            thresh_mask = final_scores > self.score_threshold
+        if self.post_center_range is not None:
+            self.post_center_range = torch.tensor(
+                self.post_center_range, device=scores.device)
+            mask = (final_box_preds[..., :3] >=
+                    self.post_center_range[:3]).all(1)
+            mask &= (final_box_preds[..., :3] <=
+                     self.post_center_range[3:]).all(1)
+
+            if self.score_threshold:
+                mask &= thresh_mask
+
+            boxes3d = final_box_preds[mask]
+            scores = final_scores[mask]
+            labels = final_preds[mask]
+            predictions_dict = {
+                'bboxes': boxes3d,
+                'scores': scores,
+                'labels': labels
+            }
+
+        else:
+            raise NotImplementedError(
+                'Need to reorganize output as a batch, only '
+                'support post_center_range is not None for now!')
+        return predictions_dict
+
+    def decode(self, preds_dicts):
+        """Decode bboxes.
+
+        Args:
+            all_cls_scores (Tensor): Outputs from the classification head,
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression
+                head with normalized coordinate format
+                (cx, cy, l, w, cz, h, rot_sine, rot_cosine, vx, vy).
+                Shape [nb_dec, bs, num_query, 10].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        # cls & reg target of last decoder layer
+        all_cls_scores = preds_dicts['all_cls_scores'][-1]
+        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
+
+        batch_size = all_cls_scores.size()[0]
+        predictions_list = []
+        for i in range(batch_size):
+            predictions_list.append(
+                self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
+        return predictions_list
diff --git a/mmde/projects/DETR3D/detr3d/util.py b/mmde/projects/DETR3D/detr3d/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f22d8d8696ac2009d51af5ed965eaaab6e1241
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/util.py
@@ -0,0 +1,76 @@
+from typing import List
+
+import torch
+from torch import Tensor
+
+
+def normalize_bbox(bboxes: Tensor, pc_range: List) -> Tensor:
+    """ normalize bboxes
+        Args:
+            bboxes (Tensor): boxes with unnormalized
+                coordinates (cx,cy,cz,L,W,H,φ,v_x,v_y). Shape [num_gt, 9].
+            pc_range (List): Perception range of the detector
+        Returns:
+            normalized_bboxes (Tensor): boxes with normalized coordinate
+                (cx,cy,L,W,cz,H,sin(φ),cos(φ),v_x,v_y).
+                All in range [0, 1] and shape [num_query, 10].
+    """
+
+    cx = bboxes[..., 0:1]
+    cy = bboxes[..., 1:2]
+    cz = bboxes[..., 2:3]
+    L = bboxes[..., 3:4].log()
+    W = bboxes[..., 4:5].log()
+    H = bboxes[..., 5:6].log()
+
+    rot = bboxes[..., 6:7]
+    if bboxes.size(-1) > 7:
+        vx = bboxes[..., 7:8]
+        vy = bboxes[..., 8:9]
+        normalized_bboxes = torch.cat(
+            (cx, cy, L, W, cz, H, rot.sin(), rot.cos(), vx, vy), dim=-1)
+    else:
+        normalized_bboxes = torch.cat(
+            (cx, cy, L, W, cz, H, rot.sin(), rot.cos()), dim=-1)
+    return normalized_bboxes
+
+
+def denormalize_bbox(normalized_bboxes, pc_range):
+    """ denormalize bboxes
+        Args:
+            normalized_bboxes (Tensor): boxes with normalized coordinate
+                (cx,cy,L,W,cz,H,sin(φ),cos(φ),v_x,v_y).
+                All in range [0, 1] and shape [num_query, 10].
+            pc_range (List): Perception range of the detector
+        Returns:
+            denormalized_bboxes (Tensor): boxes with unnormalized
+                coordinates (cx,cy,cz,L,W,H,φ,v_x,v_y). Shape [num_gt, 9].
+    """
+    # rotation
+    rot_sine = normalized_bboxes[..., 6:7]
+
+    rot_cosine = normalized_bboxes[..., 7:8]
+    rot = torch.atan2(rot_sine, rot_cosine)
+
+    # center in the bev
+    cx = normalized_bboxes[..., 0:1]
+    cy = normalized_bboxes[..., 1:2]
+    cz = normalized_bboxes[..., 4:5]
+
+    # size, the meaning of L,W may alter in different version of mmdet3d
+    L = normalized_bboxes[..., 2:3]
+    W = normalized_bboxes[..., 3:4]
+    H = normalized_bboxes[..., 5:6]
+
+    L = L.exp()
+    W = W.exp()
+    H = H.exp()
+    if normalized_bboxes.size(-1) > 8:
+        # velocity
+        vx = normalized_bboxes[:, 8:9]
+        vy = normalized_bboxes[:, 9:10]
+        denormalized_bboxes = torch.cat([cx, cy, cz, L, W, H, rot, vx, vy],
+                                        dim=-1)
+    else:
+        denormalized_bboxes = torch.cat([cx, cy, cz, L, W, H, rot], dim=-1)
+    return denormalized_bboxes
diff --git a/mmde/projects/DETR3D/detr3d/vovnet.py b/mmde/projects/DETR3D/detr3d/vovnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..63b5773bba0c3ba0af097d71889e65ed27e3f955
--- /dev/null
+++ b/mmde/projects/DETR3D/detr3d/vovnet.py
@@ -0,0 +1,442 @@
+import warnings
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet3d.registry import MODELS
+
+VoVNet19_slim_dw_eSE = {
+    'stem': [64, 64, 64],
+    'stage_conv_ch': [64, 80, 96, 112],
+    'stage_out_ch': [112, 256, 384, 512],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': True
+}
+
+VoVNet19_dw_eSE = {
+    'stem': [64, 64, 64],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': True
+}
+
+VoVNet19_slim_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [64, 80, 96, 112],
+    'stage_out_ch': [112, 256, 384, 512],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet19_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet39_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 5,
+    'block_per_stage': [1, 1, 2, 2],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet57_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 5,
+    'block_per_stage': [1, 1, 4, 3],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet99_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 5,
+    'block_per_stage': [1, 3, 9, 3],
+    'eSE': True,
+    'dw': False
+}
+
+_STAGE_SPECS = {
+    'V-19-slim-dw-eSE': VoVNet19_slim_dw_eSE,
+    'V-19-dw-eSE': VoVNet19_dw_eSE,
+    'V-19-slim-eSE': VoVNet19_slim_eSE,
+    'V-19-eSE': VoVNet19_eSE,
+    'V-39-eSE': VoVNet39_eSE,
+    'V-57-eSE': VoVNet57_eSE,
+    'V-99-eSE': VoVNet99_eSE,
+}
+
+
+def dw_conv3x3(in_channels,
+               out_channels,
+               module_name,
+               postfix,
+               stride=1,
+               kernel_size=3,
+               padding=1):
+    """3x3 convolution with padding."""
+    return [
+        ('{}_{}/dw_conv3x3'.format(module_name, postfix),
+         nn.Conv2d(
+             in_channels,
+             out_channels,
+             kernel_size=kernel_size,
+             stride=stride,
+             padding=padding,
+             groups=out_channels,
+             bias=False)),
+        ('{}_{}/pw_conv1x1'.format(module_name, postfix),
+         nn.Conv2d(
+             in_channels,
+             out_channels,
+             kernel_size=1,
+             stride=1,
+             padding=0,
+             groups=1,
+             bias=False)),
+        ('{}_{}/pw_norm'.format(module_name,
+                                postfix), nn.BatchNorm2d(out_channels)),
+        ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)),
+    ]
+
+
+def conv3x3(in_channels,
+            out_channels,
+            module_name,
+            postfix,
+            stride=1,
+            groups=1,
+            kernel_size=3,
+            padding=1):
+    """3x3 convolution with padding."""
+    return [
+        (
+            f'{module_name}_{postfix}/conv',
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)),
+        (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)),
+    ]
+
+
+def conv1x1(in_channels,
+            out_channels,
+            module_name,
+            postfix,
+            stride=1,
+            groups=1,
+            kernel_size=1,
+            padding=0):
+    """1x1 convolution with padding."""
+    return [
+        (
+            f'{module_name}_{postfix}/conv',
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)),
+        (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)),
+    ]
+
+
+class Hsigmoid(nn.Module):
+
+    def __init__(self, inplace=True):
+        super(Hsigmoid, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return F.relu6(x + 3.0, inplace=self.inplace) / 6.0
+
+
+class eSEModule(nn.Module):
+
+    def __init__(self, channel, reduction=4):
+        super(eSEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
+        self.hsigmoid = Hsigmoid()
+
+    def forward(self, x):
+        input = x
+        x = self.avg_pool(x)
+        x = self.fc(x)
+        x = self.hsigmoid(x)
+        return input * x
+
+
+class _OSA_module(nn.Module):
+
+    def __init__(self,
+                 in_ch,
+                 stage_ch,
+                 concat_ch,
+                 layer_per_block,
+                 module_name,
+                 SE=False,
+                 identity=False,
+                 depthwise=False):
+
+        super(_OSA_module, self).__init__()
+
+        self.identity = identity
+        self.depthwise = depthwise
+        self.isReduced = False
+        self.layers = nn.ModuleList()
+        in_channel = in_ch
+        if self.depthwise and in_channel != stage_ch:
+            self.isReduced = True
+            self.conv_reduction = nn.Sequential(
+                OrderedDict(
+                    conv1x1(in_channel, stage_ch,
+                            '{}_reduction'.format(module_name), '0')))
+        for i in range(layer_per_block):
+            if self.depthwise:
+                self.layers.append(
+                    nn.Sequential(
+                        OrderedDict(
+                            dw_conv3x3(stage_ch, stage_ch, module_name, i))))
+            else:
+                self.layers.append(
+                    nn.Sequential(
+                        OrderedDict(
+                            conv3x3(in_channel, stage_ch, module_name, i))))
+            in_channel = stage_ch
+
+        # feature aggregation
+        in_channel = in_ch + layer_per_block * stage_ch
+        self.concat = nn.Sequential(
+            OrderedDict(conv1x1(in_channel, concat_ch, module_name, 'concat')))
+
+        self.ese = eSEModule(concat_ch)
+
+    def forward(self, x):
+
+        identity_feat = x
+
+        output = []
+        output.append(x)
+        if self.depthwise and self.isReduced:
+            x = self.conv_reduction(x)
+        for layer in self.layers:
+            x = layer(x)
+            output.append(x)
+
+        x = torch.cat(output, dim=1)
+        xt = self.concat(x)
+
+        xt = self.ese(xt)
+
+        if self.identity:
+            xt = xt + identity_feat
+
+        return xt
+
+
+class _OSA_stage(nn.Sequential):
+
+    def __init__(self,
+                 in_ch,
+                 stage_ch,
+                 concat_ch,
+                 block_per_stage,
+                 layer_per_block,
+                 stage_num,
+                 SE=False,
+                 depthwise=False):
+
+        super(_OSA_stage, self).__init__()
+
+        if not stage_num == 2:
+            self.add_module(
+                'Pooling',
+                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True))
+
+        if block_per_stage != 1:
+            SE = False
+        module_name = f'OSA{stage_num}_1'
+        self.add_module(
+            module_name,
+            _OSA_module(
+                in_ch,
+                stage_ch,
+                concat_ch,
+                layer_per_block,
+                module_name,
+                SE,
+                depthwise=depthwise))
+        for i in range(block_per_stage - 1):
+            if i != block_per_stage - 2:  # last block
+                SE = False
+            module_name = f'OSA{stage_num}_{i + 2}'
+            self.add_module(
+                module_name,
+                _OSA_module(
+                    concat_ch,
+                    stage_ch,
+                    concat_ch,
+                    layer_per_block,
+                    module_name,
+                    SE,
+                    identity=True,
+                    depthwise=depthwise),
+            )
+
+
+@MODELS.register_module()
+class VoVNet(BaseModule):
+
+    def __init__(self,
+                 spec_name,
+                 input_ch=3,
+                 out_features=None,
+                 frozen_stages=-1,
+                 norm_eval=True,
+                 pretrained=None,
+                 init_cfg=None):
+        """
+        Args:
+            input_ch(int) : the number of input channel
+            out_features (list[str]): name of the layers whose outputs should
+                be returned in forward. Can be anything in "stem", "stage2" ...
+        """
+        super(VoVNet, self).__init__(init_cfg)
+        self.frozen_stages = frozen_stages
+        self.norm_eval = norm_eval
+
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        stage_specs = _STAGE_SPECS[spec_name]
+
+        stem_ch = stage_specs['stem']
+        config_stage_ch = stage_specs['stage_conv_ch']
+        config_concat_ch = stage_specs['stage_out_ch']
+        block_per_stage = stage_specs['block_per_stage']
+        layer_per_block = stage_specs['layer_per_block']
+        SE = stage_specs['eSE']
+        depthwise = stage_specs['dw']
+
+        self._out_features = out_features
+
+        # Stem module
+        conv_type = dw_conv3x3 if depthwise else conv3x3
+        stem = conv3x3(input_ch, stem_ch[0], 'stem', '1', 2)
+        stem += conv_type(stem_ch[0], stem_ch[1], 'stem', '2', 1)
+        stem += conv_type(stem_ch[1], stem_ch[2], 'stem', '3', 2)
+        self.add_module('stem', nn.Sequential((OrderedDict(stem))))
+        current_stirde = 4
+        self._out_feature_strides = {
+            'stem': current_stirde,
+            'stage2': current_stirde
+        }
+        self._out_feature_channels = {'stem': stem_ch[2]}
+
+        stem_out_ch = [stem_ch[2]]
+        in_ch_list = stem_out_ch + config_concat_ch[:-1]
+        # OSA stages
+        self.stage_names = []
+        for i in range(4):  # num_stages
+            name = 'stage%d' % (i + 2)  # stage 2 ... stage 5
+            self.stage_names.append(name)
+            self.add_module(
+                name,
+                _OSA_stage(
+                    in_ch_list[i],
+                    config_stage_ch[i],
+                    config_concat_ch[i],
+                    block_per_stage[i],
+                    layer_per_block,
+                    i + 2,
+                    SE,
+                    depthwise,
+                ),
+            )
+
+            self._out_feature_channels[name] = config_concat_ch[i]
+            if not i == 0:
+                self._out_feature_strides[name] = current_stirde = int(
+                    current_stirde * 2)
+
+        # initialize weights
+        # self._initialize_weights()
+
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight)
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        if 'stem' in self._out_features:
+            outputs['stem'] = x
+        for name in self.stage_names:
+            x = getattr(self, name)(x)
+            if name in self._out_features:
+                outputs[name] = x
+
+        return outputs
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            m = getattr(self, 'stem')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'stage{i+1}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep normalization layer
+        freezed."""
+        super(VoVNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/mmde/projects/DETR3D/old_detr3d_converter.py b/mmde/projects/DETR3D/old_detr3d_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..9913ab62bbd646a2a86f5a12860de83f1b25a470
--- /dev/null
+++ b/mmde/projects/DETR3D/old_detr3d_converter.py
@@ -0,0 +1,25 @@
+from argparse import ArgumentParser
+
+import torch
+
+parser = ArgumentParser()
+parser.add_argument('src', default='old.pth')
+parser.add_argument('dst', default='new.pth')  # ('training','validation')
+parser.add_argument('--code_size', type=int, default='10')
+args = parser.parse_args()
+model = torch.load(args.src)
+code_size = args.code_size
+if model['meta'].get('detr3d_convert_tag') is not None:
+    print('this model has already converted!')
+else:
+    print('converting...')
+    # (cx, cy, w, l, cz, h, sin(φ), cos(φ), vx, vy)
+    for key in model['state_dict']:
+        tsr = model['state_dict'][key]
+        if 'reg_branches' in key and tsr.shape[0] == code_size:
+            print(key, ' with ', tsr.shape, 'has changed')
+            tsr[[2, 3], ...] = tsr[[3, 2], ...]
+            tsr[[6, 7], ...] = -tsr[[7, 6], ...]
+    model['meta']['detr3d_convert_tag'] = True
+    torch.save(model, args.dst)
+    print('done...')
diff --git a/mmde/projects/DSVT/README.md b/mmde/projects/DSVT/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e8c1262ba4fdc6404a363e429d422a070ae7dcd4
--- /dev/null
+++ b/mmde/projects/DSVT/README.md
@@ -0,0 +1,89 @@
+# DSVT: Dynamic Sparse Voxel Transformer with Rotated Sets
+
+> [DSVT: Dynamic Sparse Voxel Transformer with Rotated Sets](https://arxiv.org/abs/2301.06051)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Designing an efficient yet deployment-friendly 3D backbone to handle sparse point clouds is a fundamental problem
+in 3D perception. Compared with the customized sparse
+convolution, the attention mechanism in Transformers is
+more appropriate for flexibly modeling long-range relationships and is easier to be deployed in real-world applications.
+However, due to the sparse characteristics of point clouds,
+it is non-trivial to apply a standard transformer on sparse
+points. In this paper, we present Dynamic Sparse Voxel
+Transformer (DSVT), a single-stride window-based voxel
+Transformer backbone for outdoor 3D perception. In order
+to efficiently process sparse points in parallel, we propose
+Dynamic Sparse Window Attention, which partitions a series
+of local regions in each window according to its sparsity
+and then computes the features of all regions in a fully parallel manner. To allow the cross-set connection, we design
+a rotated set partitioning strategy that alternates between
+two partitioning configurations in consecutive self-attention
+layers. To support effective downsampling and better encode geometric information, we also propose an attentionstyle 3D pooling module on sparse points, which is powerful
+and deployment-friendly without utilizing any customized
+CUDA operations. Our model achieves state-of-the-art performance with a broad range of 3D perception tasks. More
+importantly, DSVT can be easily deployed by TensorRT with
+real-time inference speed (27Hz). Code will be available at
+https://github.com/Haiyang-W/DSVT.
+
+<div align=center>
+<img src="https://github-production-user-asset-6210df.s3.amazonaws.com/34888372/245692705-e61be20c-2a7d-4ab9-85e3-b36f662c1bdf.png" width="800"/>
+</div>
+
+## Introduction
+
+We implement DSVT and provide the results on Waymo dataset.
+
+## Usage
+
+<!-- For a typical model, this section should contain the commands for training and testing. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. -->
+
+### Installation
+
+```shell
+pip install torch_scatter==2.0.9
+python projects/DSVT/setup.py develop # compile `ingroup_inds_op` cuda operation
+```
+
+### Testing commands
+
+In MMDetection3D's root directory, run the following command to test the model:
+
+```bash
+python tools/test.py projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py ${CHECKPOINT_PATH}
+```
+
+### Training commands
+
+In MMDetection3D's root directory, run the following command to test the model:
+
+```bash
+tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py 8 --sync_bn torch
+```
+
+## Results and models
+
+### Waymo
+
+|                                     Middle Encoder                                     |                                          Backbone                                           | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** |                                                                           Download                                                                           |
+| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :----: | :-----: | :----: | :---------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) |       5       |      voxel (0.32)       |        ✓        |      ×       |  75.5  |  72.4   |  69.2  |    66.3     | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/dsvt/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class_20230917_102130.log) |
+
+**Note**:
+
+- `ResSECOND` denotes the base block in SECOND has residual layers.
+
+- Regrettably, we are unable to provide the pre-trained model weights due to [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
+
+## Citation
+
+```latex
+@inproceedings{wang2023dsvt,
+    title={DSVT: Dynamic Sparse Voxel Transformer with Rotated Sets},
+    author={Haiyang Wang, Chen Shi, Shaoshuai Shi, Meng Lei, Sen Wang, Di He, Bernt Schiele and Liwei Wang},
+    booktitle={CVPR},
+    year={2023}
+}
+```
diff --git a/mmde/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py b/mmde/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d2faf854c4fbcb527fc12008d6c4574ad506366
--- /dev/null
+++ b/mmde/projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py
@@ -0,0 +1,299 @@
+_base_ = ['../../../configs/_base_/default_runtime.py']
+custom_imports = dict(
+    imports=['projects.DSVT.dsvt'], allow_failed_imports=False)
+
+voxel_size = [0.32, 0.32, 6]
+grid_size = [468, 468, 1]
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4.0]
+data_root = 'data/waymo/kitti_format/'
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+metainfo = dict(classes=class_names)
+input_modality = dict(use_lidar=True, use_camera=False)
+backend_args = None
+
+model = dict(
+    type='DSVT',
+    data_preprocessor=dict(type='Det3DDataPreprocessor', voxel=False),
+    voxel_encoder=dict(
+        type='DynamicPillarVFE3D',
+        with_distance=False,
+        use_absolute_xyz=True,
+        use_norm=True,
+        num_filters=[192, 192],
+        num_point_features=5,
+        voxel_size=voxel_size,
+        grid_size=grid_size,
+        point_cloud_range=point_cloud_range),
+    middle_encoder=dict(
+        type='DSVTMiddleEncoder',
+        input_layer=dict(
+            sparse_shape=grid_size,
+            downsample_stride=[],
+            dim_model=[192],
+            set_info=[[36, 4]],
+            window_shape=[[12, 12, 1]],
+            hybrid_factor=[2, 2, 1],  # x, y, z
+            shift_list=[[[0, 0, 0], [6, 6, 0]]],
+            normalize_pos=False),
+        set_info=[[36, 4]],
+        dim_model=[192],
+        dim_feedforward=[384],
+        stage_num=1,
+        nhead=[8],
+        conv_out_channel=192,
+        output_shape=[468, 468],
+        dropout=0.,
+        activation='gelu'),
+    map2bev=dict(
+        type='PointPillarsScatter3D',
+        output_shape=grid_size,
+        num_bev_feats=192),
+    backbone=dict(
+        type='ResSECOND',
+        in_channels=192,
+        out_channels=[128, 128, 256],
+        blocks_nums=[1, 2, 2],
+        layer_strides=[1, 2, 2]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 128, 256],
+        out_channels=[128, 128, 128],
+        upsample_strides=[1, 2, 4],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        upsample_cfg=dict(type='deconv', bias=False),
+        use_conv_for_no_stride=False),
+    bbox_head=dict(
+        type='DSVTCenterHead',
+        in_channels=sum([128, 128, 128]),
+        tasks=[dict(num_class=3, class_names=class_names)],
+        common_heads=dict(
+            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), iou=(1, 2)),
+        share_conv_channel=64,
+        conv_cfg=dict(type='Conv2d'),
+        norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.01),
+        bbox_coder=dict(
+            type='DSVTBBoxCoder',
+            pc_range=point_cloud_range,
+            max_num=500,
+            post_center_range=[-80, -80, -10.0, 80, 80, 10.0],
+            score_threshold=0.1,
+            out_size_factor=1,
+            voxel_size=voxel_size[:2],
+            code_size=7),
+        separate_head=dict(
+            type='SeparateHead',
+            init_bias=-2.19,
+            final_kernel=3,
+            norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.01)),
+        loss_cls=dict(
+            type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0),
+        loss_bbox=dict(type='mmdet.L1Loss', reduction='mean', loss_weight=2.0),
+        loss_iou=dict(type='mmdet.L1Loss', reduction='sum', loss_weight=1.0),
+        loss_reg_iou=dict(
+            type='mmdet3d.DIoU3DLoss', reduction='mean', loss_weight=2.0),
+        norm_bbox=True),
+    # model training and testing settings
+    train_cfg=dict(
+        grid_size=grid_size,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=1,
+        dense_reg=1,
+        gaussian_overlap=0.1,
+        max_objs=500,
+        min_radius=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]),
+    test_cfg=dict(
+        max_per_img=500,
+        max_pool_nms=False,
+        min_radius=[4, 12, 10, 1, 0.85, 0.175],
+        iou_rectifier=[[0.68, 0.71, 0.65]],
+        pc_range=[-80, -80],
+        out_size_factor=1,
+        voxel_size=voxel_size[:2],
+        nms_type='rotate',
+        multi_class_nms=True,
+        pre_max_size=[[4096, 4096, 4096]],
+        post_max_size=[[500, 500, 500]],
+        nms_thr=[[0.7, 0.6, 0.55]]))
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4],
+        norm_intensity=True,
+        norm_elongation=True,
+        backend_args=backend_args),
+    backend_args=backend_args)
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        norm_intensity=True,
+        norm_elongation=True,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.5, 0.5, 0.5]),
+    dict(type='PointsRangeFilter3D', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter3D', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        norm_intensity=True,
+        norm_elongation=True,
+        backend_args=backend_args),
+    dict(type='PointsRangeFilter3D', point_cloud_range=point_cloud_range),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
+]
+
+dataset_type = 'WaymoDataset'
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR',
+        # load one frame every five frames
+        load_interval=5,
+        backend_args=backend_args))
+val_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        ann_file='waymo_infos_val.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        metainfo=metainfo,
+        box_type_3d='LiDAR',
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='WaymoMetric',
+    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
+    result_prefix='./dsvt_pred')
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+# schedules
+lr = 1e-5
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.05, betas=(0.9, 0.99)),
+    clip_grad=dict(max_norm=10, norm_type=2))
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=1.2,
+        eta_min=lr * 100,
+        begin=0,
+        end=1.2,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=10.8,
+        eta_min=lr * 1e-4,
+        begin=1.2,
+        end=12,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=1.2,
+        eta_min=0.85,
+        begin=0,
+        end=1.2,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=10.8,
+        eta_min=0.95,
+        begin=1.2,
+        end=12,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=12, val_interval=1)
+
+# runtime settings
+val_cfg = dict()
+test_cfg = dict()
+
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (1 samples per GPU).
+# auto_scale_lr = dict(enable=False, base_batch_size=8)
+
+default_hooks = dict(
+    logger=dict(type='LoggerHook', interval=50),
+    checkpoint=dict(type='CheckpointHook', interval=1))
+custom_hooks = [
+    dict(
+        type='DisableAugHook',
+        disable_after_epoch=11,
+        disable_aug_list=[
+            'GlobalRotScaleTrans', 'RandomFlip3D', 'ObjectSample'
+        ])
+]
diff --git a/mmde/projects/DSVT/dsvt/__init__.py b/mmde/projects/DSVT/dsvt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3666f0535a8125feb985f20dc2f1bd54f619d7d
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/__init__.py
@@ -0,0 +1,15 @@
+from .disable_aug_hook import DisableAugHook
+from .dsvt import DSVT
+from .dsvt_head import DSVTCenterHead
+from .dsvt_transformer import DSVTMiddleEncoder
+from .dynamic_pillar_vfe import DynamicPillarVFE3D
+from .map2bev import PointPillarsScatter3D
+from .res_second import ResSECOND
+from .transforms_3d import ObjectRangeFilter3D, PointsRangeFilter3D
+from .utils import DSVTBBoxCoder
+
+__all__ = [
+    'DSVTCenterHead', 'DSVT', 'DSVTMiddleEncoder', 'DynamicPillarVFE3D',
+    'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder',
+    'ObjectRangeFilter3D', 'PointsRangeFilter3D', 'DisableAugHook'
+]
diff --git a/mmde/projects/DSVT/dsvt/disable_aug_hook.py b/mmde/projects/DSVT/dsvt/disable_aug_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a4dff5fb2f035a43e8c4f8a3090c30b7eee3925
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/disable_aug_hook.py
@@ -0,0 +1,69 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+from mmengine.dataset import BaseDataset
+from mmengine.hooks import Hook
+from mmengine.model import is_model_wrapper
+from mmengine.runner import Runner
+
+from mmdet3d.registry import HOOKS
+
+
+@HOOKS.register_module()
+class DisableAugHook(Hook):
+    """The hook of disabling augmentations during training.
+
+    Args:
+        disable_after_epoch (int): The number of epochs after which
+            the data augmentation will be closed in the training.
+            Defaults to 15.
+        disable_aug_list (list): the list of data augmentation will
+            be closed in the training. Defaults to [].
+    """
+
+    def __init__(self,
+                 disable_after_epoch: int = 15,
+                 disable_aug_list: List = []):
+        self.disable_after_epoch = disable_after_epoch
+        self.disable_aug_list = disable_aug_list
+        self._restart_dataloader = False
+
+    def before_train_epoch(self, runner: Runner):
+        """Close augmentation.
+
+        Args:
+            runner (Runner): The runner.
+        """
+        epoch = runner.epoch
+        train_loader = runner.train_dataloader
+        model = runner.model
+        # TODO: refactor after mmengine using model wrapper
+        if is_model_wrapper(model):
+            model = model.module
+        if epoch == self.disable_after_epoch:
+
+            dataset = runner.train_dataloader.dataset
+            # handle dataset wrapper
+            if not isinstance(dataset, BaseDataset):
+                dataset = dataset.dataset
+            new_transforms = []
+            for transform in dataset.pipeline.transforms:  # noqa: E501
+                if transform.__class__.__name__ not in self.disable_aug_list:
+                    new_transforms.append(transform)
+                else:
+                    runner.logger.info(
+                        f'Disable {transform.__class__.__name__}')
+            dataset.pipeline.transforms = new_transforms
+            # The dataset pipeline cannot be updated when persistent_workers
+            # is True, so we need to force the dataloader's multi-process
+            # restart. This is a very hacky approach.
+            if hasattr(train_loader, 'persistent_workers'
+                       ) and train_loader.persistent_workers is True:
+                train_loader._DataLoader__initialized = False
+                train_loader._iterator = None
+                self._restart_dataloader = True
+        else:
+            # Once the restart is complete, we need to restore
+            # the initialization flag.
+            if self._restart_dataloader:
+                train_loader._DataLoader__initialized = True
diff --git a/mmde/projects/DSVT/dsvt/dsvt.py b/mmde/projects/DSVT/dsvt/dsvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6f6ceffbef4e1fad6a5444780a3aecd187c796b
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/dsvt.py
@@ -0,0 +1,140 @@
+from typing import Dict, List, Optional
+
+import torch
+from torch import Tensor
+
+from mmdet3d.models import Base3DDetector
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+
+
+@MODELS.register_module()
+class DSVT(Base3DDetector):
+    """DSVT detector."""
+
+    def __init__(self,
+                 voxel_encoder: Optional[dict] = None,
+                 middle_encoder: Optional[dict] = None,
+                 backbone: Optional[dict] = None,
+                 neck: Optional[dict] = None,
+                 map2bev: Optional[dict] = None,
+                 bbox_head: Optional[dict] = None,
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None,
+                 data_preprocessor: Optional[dict] = None,
+                 **kwargs):
+        super(DSVT, self).__init__(
+            init_cfg=init_cfg, data_preprocessor=data_preprocessor, **kwargs)
+
+        if voxel_encoder:
+            self.voxel_encoder = MODELS.build(voxel_encoder)
+        if middle_encoder:
+            self.middle_encoder = MODELS.build(middle_encoder)
+        if backbone:
+            self.backbone = MODELS.build(backbone)
+        self.map2bev = MODELS.build(map2bev)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+        if bbox_head:
+            bbox_head.update(train_cfg=train_cfg, test_cfg=test_cfg)
+            self.bbox_head = MODELS.build(bbox_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    @property
+    def with_bbox(self):
+        """bool: Whether the detector has a 3D box head."""
+        return hasattr(self, 'bbox_head') and self.bbox_head is not None
+
+    @property
+    def with_backbone(self):
+        """bool: Whether the detector has a 3D backbone."""
+        return hasattr(self, 'backbone') and self.backbone is not None
+
+    @property
+    def with_voxel_encoder(self):
+        """bool: Whether the detector has a voxel encoder."""
+        return hasattr(self,
+                       'voxel_encoder') and self.voxel_encoder is not None
+
+    @property
+    def with_middle_encoder(self):
+        """bool: Whether the detector has a middle encoder."""
+        return hasattr(self,
+                       'middle_encoder') and self.middle_encoder is not None
+
+    def _forward(self):
+        pass
+
+    def extract_feat(self, batch_inputs_dict: dict) -> tuple:
+        """Extract features from images and points.
+        Args:
+            batch_inputs_dict (dict): Dict of batch inputs. It
+                contains
+                - points (List[tensor]):  Point cloud of multiple inputs.
+                - imgs (tensor): Image tensor with shape (B, C, H, W).
+        Returns:
+             tuple: Two elements in tuple arrange as
+             image features and point cloud features.
+        """
+        batch_out_dict = self.voxel_encoder(batch_inputs_dict)
+        batch_out_dict = self.middle_encoder(batch_out_dict)
+        batch_out_dict = self.map2bev(batch_out_dict)
+        multi_feats = self.backbone(batch_out_dict['spatial_features'])
+        feats = self.neck(multi_feats)
+
+        return feats
+
+    def loss(self, batch_inputs_dict: Dict[List, torch.Tensor],
+             batch_data_samples: List[Det3DDataSample],
+             **kwargs) -> List[Det3DDataSample]:
+        """
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' and `imgs` keys.
+                - points (list[torch.Tensor]): Point cloud of each sample.
+                - imgs (torch.Tensor): Tensor of batch images, has shape
+                  (B, C, H ,W)
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, .
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        pts_feats = self.extract_feat(batch_inputs_dict)
+        losses = dict()
+        loss = self.bbox_head.loss(pts_feats, batch_data_samples)
+        losses.update(loss)
+        return losses
+
+    def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
+                batch_data_samples: List[Det3DDataSample],
+                **kwargs) -> List[Det3DDataSample]:
+        """Forward of testing.
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                'points' keys.
+                - points (list[torch.Tensor]): Point cloud of each sample.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
+        Returns:
+            list[:obj:`Det3DDataSample`]: Detection results of the
+            input sample. Each Det3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+            - scores_3d (Tensor): Classification scores, has a shape
+                (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+                (num_instances, ).
+            - bbox_3d (:obj:`BaseInstance3DBoxes`): Prediction of bboxes,
+                contains a tensor with shape (num_instances, 7).
+        """
+        pts_feats = self.extract_feat(batch_inputs_dict)
+        results_list_3d = self.bbox_head.predict(pts_feats, batch_data_samples)
+
+        detsamples = self.add_pred_to_datasample(batch_data_samples,
+                                                 results_list_3d)
+        return detsamples
diff --git a/mmde/projects/DSVT/dsvt/dsvt_head.py b/mmde/projects/DSVT/dsvt/dsvt_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffc766923417cbeae10f581e41479a4cf5a2e1dc
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/dsvt_head.py
@@ -0,0 +1,735 @@
+import math
+from typing import Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.ops import boxes_iou3d
+from mmdet.models.utils import multi_apply
+from mmengine.model import kaiming_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch.nn.init import constant_
+
+from mmdet3d.models import CenterHead
+from mmdet3d.models.layers import circle_nms, nms_bev
+from mmdet3d.models.utils import (clip_sigmoid, draw_heatmap_gaussian,
+                                  gaussian_radius)
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr
+
+
+@MODELS.register_module()
+class DSVTCenterHead(CenterHead):
+    """CenterHead for DSVT.
+
+    This head adds IoU prediction branch based on the original CenterHead.
+    """
+
+    def __init__(self,
+                 loss_iou=dict(
+                     type='mmdet.L1Loss', reduction='mean', loss_weight=1),
+                 loss_reg_iou=None,
+                 *args,
+                 **kwargs):
+        super(DSVTCenterHead, self).__init__(*args, **kwargs)
+        self.loss_iou = MODELS.build(loss_iou)
+        self.loss_iou_reg = MODELS.build(
+            loss_reg_iou) if loss_reg_iou is not None else None
+
+    def init_weights(self):
+        kaiming_init(
+            self.shared_conv.conv,
+            a=math.sqrt(5),
+            mode='fan_in',
+            nonlinearity='leaky_relu',
+            distribution='uniform')
+        for head in self.task_heads[0].heads:
+            if head == 'heatmap':
+                constant_(self.task_heads[0].__getattr__(head)[-1].bias,
+                          self.task_heads[0].init_bias)
+            else:
+                for m in self.task_heads[0].__getattr__(head).modules():
+                    if isinstance(m, nn.Conv2d):
+                        kaiming_init(
+                            m, mode='fan_in', nonlinearity='leaky_relu')
+
+    def forward_single(self, x: Tensor) -> dict:
+        """Forward function for CenterPoint.
+
+        Args:
+            x (torch.Tensor): Input feature map with the shape of
+                [B, 512, 128, 128].
+
+        Returns:
+            list[dict]: Output results for tasks.
+        """
+        ret_dicts = []
+
+        x = self.shared_conv(x)
+
+        for task in self.task_heads:
+            ret_dicts.append(task(x))
+
+        return ret_dicts
+
+    def forward(self, feats: List[Tensor]) -> Tuple[List[Tensor]]:
+        """Forward pass.
+
+        Args:
+            feats (list[torch.Tensor]): Multi-level features, e.g.,
+                features produced by FPN.
+
+        Returns:
+            tuple(list[dict]): Output results for tasks.
+        """
+        return multi_apply(self.forward_single, feats)
+
+    def loss(self, pts_feats: List[Tensor],
+             batch_data_samples: List[Det3DDataSample], *args,
+             **kwargs) -> Dict[str, Tensor]:
+        """Forward function of training.
+
+        Args:
+            pts_feats (list[torch.Tensor]): Features of point cloud branch
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, .
+
+        Returns:
+            dict: Losses of each branch.
+        """
+        outs = self(pts_feats)
+        batch_gt_instance_3d = []
+        for data_sample in batch_data_samples:
+            batch_gt_instance_3d.append(data_sample.gt_instances_3d)
+        losses = self.loss_by_feat(outs, batch_gt_instance_3d)
+        return losses
+
+    def _decode_all_preds(self,
+                          pred_dict,
+                          point_cloud_range=None,
+                          voxel_size=None):
+        batch_size, _, H, W = pred_dict['reg'].shape
+
+        batch_center = pred_dict['reg'].permute(0, 2, 3, 1).contiguous().view(
+            batch_size, H * W, 2)  # (B, H, W, 2)
+        batch_center_z = pred_dict['height'].permute(
+            0, 2, 3, 1).contiguous().view(batch_size, H * W, 1)  # (B, H, W, 1)
+        batch_dim = pred_dict['dim'].exp().permute(
+            0, 2, 3, 1).contiguous().view(batch_size, H * W, 3)  # (B, H, W, 3)
+        batch_rot_cos = pred_dict['rot'][:, 0].unsqueeze(dim=1).permute(
+            0, 2, 3, 1).contiguous().view(batch_size, H * W, 1)  # (B, H, W, 1)
+        batch_rot_sin = pred_dict['rot'][:, 1].unsqueeze(dim=1).permute(
+            0, 2, 3, 1).contiguous().view(batch_size, H * W, 1)  # (B, H, W, 1)
+        batch_vel = pred_dict['vel'].permute(0, 2, 3, 1).contiguous().view(
+            batch_size, H * W, 2) if 'vel' in pred_dict.keys() else None
+
+        angle = torch.atan2(batch_rot_sin, batch_rot_cos)  # (B, H*W, 1)
+
+        ys, xs = torch.meshgrid([
+            torch.arange(
+                0, H, device=batch_center.device, dtype=batch_center.dtype),
+            torch.arange(
+                0, W, device=batch_center.device, dtype=batch_center.dtype)
+        ])
+        ys = ys.view(1, H, W).repeat(batch_size, 1, 1)
+        xs = xs.view(1, H, W).repeat(batch_size, 1, 1)
+        xs = xs.view(batch_size, -1, 1) + batch_center[:, :, 0:1]
+        ys = ys.view(batch_size, -1, 1) + batch_center[:, :, 1:2]
+
+        xs = xs * voxel_size[0] + point_cloud_range[0]
+        ys = ys * voxel_size[1] + point_cloud_range[1]
+
+        box_part_list = [xs, ys, batch_center_z, batch_dim, angle]
+        if batch_vel is not None:
+            box_part_list.append(batch_vel)
+
+        box_preds = torch.cat((box_part_list),
+                              dim=-1).view(batch_size, H, W, -1)
+
+        return box_preds
+
+    def _transpose_and_gather_feat(self, feat, ind):
+        feat = feat.permute(0, 2, 3, 1).contiguous()
+        feat = feat.view(feat.size(0), -1, feat.size(3))
+        feat = self._gather_feat(feat, ind)
+        return feat
+
+    def calc_iou_loss(self, iou_preds, batch_box_preds, mask, ind, gt_boxes):
+        """
+        Args:
+            iou_preds: (batch x 1 x h x w)
+            batch_box_preds: (batch x (7 or 9) x h x w)
+            mask: (batch x max_objects)
+            ind: (batch x max_objects)
+            gt_boxes: List of batch groundtruth boxes.
+
+        Returns:
+            Tensor: IoU Loss.
+        """
+        if mask.sum() == 0:
+            return iou_preds.new_zeros((1))
+
+        mask = mask.bool()
+        selected_iou_preds = self._transpose_and_gather_feat(iou_preds,
+                                                             ind)[mask]
+
+        selected_box_preds = self._transpose_and_gather_feat(
+            batch_box_preds, ind)[mask]
+        gt_boxes = torch.cat(gt_boxes)
+        assert gt_boxes.size(0) == selected_box_preds.size(0)
+        iou_target = boxes_iou3d(selected_box_preds[:, 0:7], gt_boxes[:, 0:7])
+        iou_target = torch.diag(iou_target).view(-1)
+        iou_target = iou_target * 2 - 1  # [0, 1] ==> [-1, 1]
+
+        loss = self.loss_iou(selected_iou_preds.view(-1), iou_target)
+        loss = loss / torch.clamp(mask.sum(), min=1e-4)
+        return loss
+
+    def calc_iou_reg_loss(self, batch_box_preds, mask, ind, gt_boxes):
+        if mask.sum() == 0:
+            return batch_box_preds.new_zeros((1))
+
+        mask = mask.bool()
+
+        selected_box_preds = self._transpose_and_gather_feat(
+            batch_box_preds, ind)[mask]
+        gt_boxes = torch.cat(gt_boxes)
+        assert gt_boxes.size(0) == selected_box_preds.size(0)
+        loss = self.loss_iou_reg(selected_box_preds[:, 0:7], gt_boxes[:, 0:7])
+
+        return loss
+
+    def get_targets(
+        self,
+        batch_gt_instances_3d: List[InstanceData],
+    ) -> Tuple[List[Tensor]]:
+        """Generate targets.
+
+        How each output is transformed:
+
+            Each nested list is transposed so that all same-index elements in
+            each sub-list (1, ..., N) become the new sub-lists.
+                [ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ]
+                ==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ]
+
+            The new transposed nested list is converted into a list of N
+            tensors generated by concatenating tensors in the new sub-lists.
+                [ tensor0, tensor1, tensor2, ... ]
+
+        Args:
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances. It usually includes ``bboxes_3d`` and\
+                ``labels_3d`` attributes.
+
+        Returns:
+            Returns:
+                tuple[list[torch.Tensor]]: Tuple of target including
+                    the following results in order.
+
+                - list[torch.Tensor]: Heatmap scores.
+                - list[torch.Tensor]: Ground truth boxes.
+                - list[torch.Tensor]: Indexes indicating the
+                    position of the valid boxes.
+                - list[torch.Tensor]: Masks indicating which
+                    boxes are valid.
+        """
+        heatmaps, anno_boxes, inds, masks, task_gt_bboxes = multi_apply(
+            self.get_targets_single, batch_gt_instances_3d)
+        # Transpose heatmaps
+        heatmaps = list(map(list, zip(*heatmaps)))
+        heatmaps = [torch.stack(hms_) for hms_ in heatmaps]
+        # Transpose anno_boxes
+        anno_boxes = list(map(list, zip(*anno_boxes)))
+        anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes]
+        # Transpose inds
+        inds = list(map(list, zip(*inds)))
+        inds = [torch.stack(inds_) for inds_ in inds]
+        # Transpose masks
+        masks = list(map(list, zip(*masks)))
+        masks = [torch.stack(masks_) for masks_ in masks]
+        # Transpose task_gt_bboxes
+        task_gt_bboxes = list(map(list, zip(*task_gt_bboxes)))
+        return heatmaps, anno_boxes, inds, masks, task_gt_bboxes
+
+    def get_targets_single(self,
+                           gt_instances_3d: InstanceData) -> Tuple[Tensor]:
+        """Generate training targets for a single sample.
+
+        Args:
+            gt_instances_3d (:obj:`InstanceData`): Gt_instances_3d of
+                single data sample. It usually includes
+                ``bboxes_3d`` and ``labels_3d`` attributes.
+
+        Returns:
+            tuple[list[torch.Tensor]]: Tuple of target including
+                the following results in order.
+
+                - list[torch.Tensor]: Heatmap scores.
+                - list[torch.Tensor]: Ground truth boxes.
+                - list[torch.Tensor]: Indexes indicating the position
+                    of the valid boxes.
+                - list[torch.Tensor]: Masks indicating which boxes
+                    are valid.
+        """
+        gt_labels_3d = gt_instances_3d.labels_3d
+        gt_bboxes_3d = gt_instances_3d.bboxes_3d
+        device = gt_labels_3d.device
+        gt_bboxes_3d = torch.cat(
+            (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]),
+            dim=1).to(device)
+        max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg']
+        grid_size = torch.tensor(self.train_cfg['grid_size']).to(device)
+        pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
+        voxel_size = torch.tensor(self.train_cfg['voxel_size'])
+
+        feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']
+
+        # reorganize the gt_dict by tasks
+        task_masks = []
+        flag = 0
+        for class_name in self.class_names:
+            task_masks.append([
+                torch.where(gt_labels_3d == class_name.index(i) + flag)
+                for i in class_name
+            ])
+            flag += len(class_name)
+
+        task_boxes = []
+        task_classes = []
+        flag2 = 0
+        for idx, mask in enumerate(task_masks):
+            task_box = []
+            task_class = []
+            for m in mask:
+                task_box.append(gt_bboxes_3d[m])
+                # 0 is background for each task, so we need to add 1 here.
+                task_class.append(gt_labels_3d[m] + 1 - flag2)
+            task_boxes.append(torch.cat(task_box, axis=0).to(device))
+            task_classes.append(torch.cat(task_class).long().to(device))
+            flag2 += len(mask)
+        draw_gaussian = draw_heatmap_gaussian
+        heatmaps, anno_boxes, inds, masks = [], [], [], []
+
+        for idx, task_head in enumerate(self.task_heads):
+            heatmap = gt_bboxes_3d.new_zeros(
+                (len(self.class_names[idx]), feature_map_size[1],
+                 feature_map_size[0]))
+
+            anno_box = gt_bboxes_3d.new_zeros((max_objs, 8),
+                                              dtype=torch.float32)
+
+            ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64)
+            mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8)
+
+            num_objs = min(task_boxes[idx].shape[0], max_objs)
+
+            for k in range(num_objs):
+                cls_id = task_classes[idx][k] - 1
+
+                length = task_boxes[idx][k][3]
+                width = task_boxes[idx][k][4]
+                length = length / voxel_size[0] / self.train_cfg[
+                    'out_size_factor']
+                width = width / voxel_size[1] / self.train_cfg[
+                    'out_size_factor']
+
+                if width > 0 and length > 0:
+                    radius = gaussian_radius(
+                        (width, length),
+                        min_overlap=self.train_cfg['gaussian_overlap'])
+                    radius = max(self.train_cfg['min_radius'], int(radius))
+
+                    # be really careful for the coordinate system of
+                    # your box annotation.
+                    x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][
+                        1], task_boxes[idx][k][2]
+
+                    coor_x = (
+                        x - pc_range[0]
+                    ) / voxel_size[0] / self.train_cfg['out_size_factor']
+                    coor_y = (
+                        y - pc_range[1]
+                    ) / voxel_size[1] / self.train_cfg['out_size_factor']
+
+                    center = torch.tensor([coor_x, coor_y],
+                                          dtype=torch.float32,
+                                          device=device)
+                    center_int = center.to(torch.int32)
+
+                    # throw out not in range objects to avoid out of array
+                    # area when creating the heatmap
+                    if not (0 <= center_int[0] < feature_map_size[0]
+                            and 0 <= center_int[1] < feature_map_size[1]):
+                        continue
+
+                    draw_gaussian(heatmap[cls_id], center_int, radius)
+
+                    new_idx = k
+                    x, y = center_int[0], center_int[1]
+
+                    assert (y * feature_map_size[0] + x <
+                            feature_map_size[0] * feature_map_size[1])
+
+                    ind[new_idx] = y * feature_map_size[0] + x
+                    mask[new_idx] = 1
+                    # TODO: support other outdoor dataset
+                    rot = task_boxes[idx][k][6]
+                    box_dim = task_boxes[idx][k][3:6]
+                    if self.norm_bbox:
+                        box_dim = box_dim.log()
+                    anno_box[new_idx] = torch.cat([
+                        center - torch.tensor([x, y], device=device),
+                        z.unsqueeze(0), box_dim,
+                        torch.cos(rot).unsqueeze(0),
+                        torch.sin(rot).unsqueeze(0)
+                    ])
+
+            heatmaps.append(heatmap)
+            anno_boxes.append(anno_box)
+            masks.append(mask)
+            inds.append(ind)
+        return heatmaps, anno_boxes, inds, masks, task_boxes
+
+    def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
+                     batch_gt_instances_3d: List[InstanceData], *args,
+                     **kwargs):
+        """Loss function for CenterHead.
+
+        Args:
+            preds_dicts (tuple[list[dict]]): Prediction results of
+                multiple tasks. The outer tuple indicate  different
+                tasks head, and the internal list indicate different
+                FPN level.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instances_3d. It usually includes ``bboxes_3d`` and
+                ``labels_3d`` attributes.
+
+        Returns:
+            dict[str,torch.Tensor]: Loss of heatmap and bbox of each task.
+        """
+        heatmaps, anno_boxes, inds, masks, task_gt_bboxes = self.get_targets(
+            batch_gt_instances_3d)
+        loss_dict = dict()
+        for task_id, preds_dict in enumerate(preds_dicts):
+            # heatmap focal loss
+            preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
+            num_pos = heatmaps[task_id].eq(1).float().sum().item()
+            loss_heatmap = self.loss_cls(
+                preds_dict[0]['heatmap'],
+                heatmaps[task_id],
+                avg_factor=max(num_pos, 1))
+            target_box = anno_boxes[task_id]
+            # reconstruct the anno_box from multiple reg heads
+            preds_dict[0]['anno_box'] = torch.cat(
+                (preds_dict[0]['reg'], preds_dict[0]['height'],
+                 preds_dict[0]['dim'], preds_dict[0]['rot']),
+                dim=1)
+
+            # Regression loss for dimension, offset, height, rotation
+            ind = inds[task_id]
+            num = masks[task_id].float().sum()
+            pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
+            pred = pred.view(pred.size(0), -1, pred.size(3))
+            pred = self._gather_feat(pred, ind)
+            mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
+            isnotnan = (~torch.isnan(target_box)).float()
+            mask *= isnotnan
+
+            code_weights = self.train_cfg.get('code_weights', None)
+            bbox_weights = mask * mask.new_tensor(code_weights)
+            loss_bbox = self.loss_bbox(
+                pred, target_box, bbox_weights, avg_factor=(num + 1e-4))
+            loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap
+            loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox
+
+            if 'iou' in preds_dict[0]:
+                batch_box_preds = self._decode_all_preds(
+                    pred_dict=preds_dict[0],
+                    point_cloud_range=self.train_cfg['point_cloud_range'],
+                    voxel_size=self.train_cfg['voxel_size']
+                )  # (B, H, W, 7 or 9)
+
+                batch_box_preds_for_iou = batch_box_preds.permute(
+                    0, 3, 1, 2)  # (B, 7 or 9, H, W)
+                loss_dict[f'task{task_id}.loss_iou'] = self.calc_iou_loss(
+                    iou_preds=preds_dict[0]['iou'],
+                    batch_box_preds=batch_box_preds_for_iou.clone().detach(),
+                    mask=masks[task_id],
+                    ind=ind,
+                    gt_boxes=task_gt_bboxes[task_id])
+
+                if self.loss_iou_reg is not None:
+                    loss_dict[f'task{task_id}.loss_reg_iou'] = \
+                        self.calc_iou_reg_loss(
+                            batch_box_preds=batch_box_preds_for_iou,
+                            mask=masks[task_id],
+                            ind=ind,
+                            gt_boxes=task_gt_bboxes[task_id])
+
+        return loss_dict
+
+    def predict(self,
+                pts_feats: Tuple[torch.Tensor],
+                batch_data_samples: List[Det3DDataSample],
+                rescale=True,
+                **kwargs) -> List[InstanceData]:
+        """
+        Args:
+            pts_feats (Tuple[torch.Tensor]): Point features..
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes meta information of data.
+            rescale (bool): Whether rescale the resutls to
+                the original scale.
+
+        Returns:
+            list[:obj:`InstanceData`]: List of processed predictions. Each
+            InstanceData contains 3d Bounding boxes and corresponding
+            scores and labels.
+        """
+        preds_dict = self(pts_feats)
+        batch_size = len(batch_data_samples)
+        batch_input_metas = []
+        for batch_index in range(batch_size):
+            metainfo = batch_data_samples[batch_index].metainfo
+            batch_input_metas.append(metainfo)
+
+        results_list = self.predict_by_feat(
+            preds_dict, batch_input_metas, rescale=rescale, **kwargs)
+        return results_list
+
+    def predict_by_feat(self, preds_dicts: Tuple[List[dict]],
+                        batch_input_metas: List[dict], *args,
+                        **kwargs) -> List[InstanceData]:
+        """Generate bboxes from bbox head predictions.
+
+        Args:
+            preds_dicts (tuple[list[dict]]): Prediction results of
+                multiple tasks. The outer tuple indicate  different
+                tasks head, and the internal list indicate different
+                FPN level.
+            batch_input_metas (list[dict]): Meta info of multiple
+                inputs.
+
+        Returns:
+            list[:obj:`InstanceData`]: Instance prediction
+            results of each sample after the post process.
+            Each item usually contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                  (num_instance, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                  (num_instances, ).
+                - bboxes_3d (:obj:`LiDARInstance3DBoxes`): Prediction
+                  of bboxes, contains a tensor with shape
+                  (num_instances, 7) or (num_instances, 9), and
+                  the last 2 dimensions of 9 is
+                  velocity.
+        """
+        rets = []
+        for task_id, preds_dict in enumerate(preds_dicts):
+            num_class_with_bg = self.num_classes[task_id]
+            batch_size = preds_dict[0]['heatmap'].shape[0]
+            batch_heatmap = preds_dict[0]['heatmap'].sigmoid()
+
+            batch_reg = preds_dict[0]['reg']
+            batch_hei = preds_dict[0]['height']
+
+            if self.norm_bbox:
+                batch_dim = torch.exp(preds_dict[0]['dim'])
+            else:
+                batch_dim = preds_dict[0]['dim']
+
+            # It's different from CenterHead
+            batch_rotc = preds_dict[0]['rot'][:, 0].unsqueeze(1)
+            batch_rots = preds_dict[0]['rot'][:, 1].unsqueeze(1)
+            batch_iou = (preds_dict[0]['iou'] +
+                         1) * 0.5 if 'iou' in preds_dict[0] else None
+
+            if 'vel' in preds_dict[0]:
+                batch_vel = preds_dict[0]['vel']
+            else:
+                batch_vel = None
+            temp = self.bbox_coder.decode(
+                batch_heatmap,
+                batch_rots,
+                batch_rotc,
+                batch_hei,
+                batch_dim,
+                batch_vel,
+                reg=batch_reg,
+                iou=batch_iou)
+            assert self.test_cfg['nms_type'] in ['circle', 'rotate']
+            batch_reg_preds, batch_cls_preds, batch_cls_labels, batch_iou_preds = [], [], [], []  # noqa: E501
+            for box in temp:
+                batch_reg_preds.append(box['bboxes'])
+                batch_cls_preds.append(box['scores'])
+                batch_cls_labels.append(box['labels'].long())
+                batch_iou_preds.append(box['iou'])
+            if self.test_cfg['nms_type'] == 'circle':
+                ret_task = []
+                for i in range(batch_size):
+                    boxes3d = temp[i]['bboxes']
+                    scores = temp[i]['scores']
+                    labels = temp[i]['labels']
+                    centers = boxes3d[:, [0, 1]]
+                    boxes = torch.cat([centers, scores.view(-1, 1)], dim=1)
+                    keep = torch.tensor(
+                        circle_nms(
+                            boxes.detach().cpu().numpy(),
+                            self.test_cfg['min_radius'][task_id],
+                            post_max_size=self.test_cfg['post_max_size']),
+                        dtype=torch.long,
+                        device=boxes.device)
+
+                    boxes3d = boxes3d[keep]
+                    scores = scores[keep]
+                    labels = labels[keep]
+                    ret = dict(bboxes=boxes3d, scores=scores, labels=labels)
+                    ret_task.append(ret)
+                rets.append(ret_task)
+            else:
+                rets.append(
+                    self.get_task_detections(task_id, num_class_with_bg,
+                                             batch_cls_preds, batch_reg_preds,
+                                             batch_iou_preds, batch_cls_labels,
+                                             batch_input_metas))
+
+        # Merge branches results
+        num_samples = len(rets[0])
+
+        ret_list = []
+        for i in range(num_samples):
+            temp_instances = InstanceData()
+            for k in rets[0][i].keys():
+                if k == 'bboxes':
+                    bboxes = torch.cat([ret[i][k] for ret in rets])
+                    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+                    bboxes = batch_input_metas[i]['box_type_3d'](
+                        bboxes, self.bbox_coder.code_size)
+                elif k == 'scores':
+                    scores = torch.cat([ret[i][k] for ret in rets])
+                elif k == 'labels':
+                    flag = 0
+                    for j, num_class in enumerate(self.num_classes):
+                        rets[j][i][k] += flag
+                        flag += num_class
+                    labels = torch.cat([ret[i][k].int() for ret in rets])
+            temp_instances.bboxes_3d = bboxes
+            temp_instances.scores_3d = scores
+            temp_instances.labels_3d = labels
+            ret_list.append(temp_instances)
+        return ret_list
+
+    def get_task_detections(self, task_id, num_class_with_bg, batch_cls_preds,
+                            batch_reg_preds, batch_iou_preds, batch_cls_labels,
+                            img_metas):
+        """Rotate nms for each task.
+
+        Args:
+            num_class_with_bg (int): Number of classes for the current task.
+            batch_cls_preds (list[torch.Tensor]): Prediction score with the
+                shape of [N].
+            batch_reg_preds (list[torch.Tensor]): Prediction bbox with the
+                shape of [N, 9].
+            batch_iou_preds (list[torch.Tensor]): Prediction IoU with the
+                shape of [N].
+            batch_cls_labels (list[torch.Tensor]): Prediction label with the
+                shape of [N].
+            img_metas (list[dict]): Meta information of each sample.
+
+        Returns:
+            list[dict[str: torch.Tensor]]: contains the following keys:
+
+                -bboxes (torch.Tensor): Prediction bboxes after nms with the
+                    shape of [N, 9].
+                -scores (torch.Tensor): Prediction scores after nms with the
+                    shape of [N].
+                -labels (torch.Tensor): Prediction labels after nms with the
+                    shape of [N].
+        """
+        predictions_dicts = []
+        for i, (box_preds, cls_preds, iou_preds, cls_labels) in enumerate(
+                zip(batch_reg_preds, batch_cls_preds, batch_iou_preds,
+                    batch_cls_labels)):
+            pred_iou = torch.clamp(iou_preds, min=0, max=1.0)
+            iou_rectifier = pred_iou.new_tensor(
+                self.test_cfg['iou_rectifier'][task_id])
+            cls_preds = torch.pow(cls_preds,
+                                  1 - iou_rectifier[cls_labels]) * torch.pow(
+                                      pred_iou, iou_rectifier[cls_labels])
+
+            # Apply NMS in bird eye view
+            # get the highest score per prediction, then apply nms
+            # to remove overlapped box.
+            if num_class_with_bg == 1:
+                top_scores = cls_preds
+                top_labels = torch.zeros(
+                    cls_preds.shape[0],
+                    device=cls_preds.device,
+                    dtype=torch.long)
+
+            else:
+                top_labels = cls_labels.long()
+                top_scores = cls_preds
+
+            if top_scores.shape[0] != 0:
+                boxes_for_nms = xywhr2xyxyr(img_metas[i]['box_type_3d'](
+                    box_preds[:, :], self.bbox_coder.code_size).bev)
+
+                pre_max_size = self.test_cfg['pre_max_size'][task_id]
+                post_max_size = self.test_cfg['post_max_size'][task_id]
+                # cls_label_per_task = self.cls_id_mapping_per_task[task_id]
+                all_selected_mask = torch.zeros_like(top_labels, dtype=bool)
+                all_indices = torch.arange(top_labels.size(0)).to(
+                    top_labels.device)
+                # Mind this when training on the new coordinate
+                # Transform to old mmdet3d coordinate
+                boxes_for_nms[:, 4] = (-boxes_for_nms[:, 4] + torch.pi / 2 * 1)
+                boxes_for_nms[:, 4] = (boxes_for_nms[:, 4] +
+                                       torch.pi) % (2 * torch.pi) - torch.pi
+
+                for i, nms_thr in enumerate(self.test_cfg['nms_thr'][task_id]):
+                    label_mask = top_labels == i
+                    selected = nms_bev(
+                        boxes_for_nms[label_mask],
+                        top_scores[label_mask],
+                        thresh=nms_thr,
+                        pre_max_size=pre_max_size[i],
+                        post_max_size=post_max_size[i])
+                    indices = all_indices[label_mask][selected]
+                    all_selected_mask.scatter_(0, indices, True)
+            else:
+                all_selected_mask = []
+
+            # if selected is not None:
+            selected_boxes = box_preds[all_selected_mask]
+            selected_labels = top_labels[all_selected_mask]
+            selected_scores = top_scores[all_selected_mask]
+
+            # finally generate predictions.
+            if selected_boxes.shape[0] != 0:
+                box_preds = selected_boxes
+                scores = selected_scores
+                label_preds = selected_labels
+                final_box_preds = box_preds
+                final_scores = scores
+                final_labels = label_preds
+                predictions_dict = dict(
+                    bboxes=final_box_preds,
+                    scores=final_scores,
+                    labels=final_labels)
+            else:
+                dtype = batch_reg_preds[0].dtype
+                device = batch_reg_preds[0].device
+                predictions_dict = dict(
+                    bboxes=torch.zeros([0, self.bbox_coder.code_size],
+                                       dtype=dtype,
+                                       device=device),
+                    scores=torch.zeros([0], dtype=dtype, device=device),
+                    labels=torch.zeros([0],
+                                       dtype=top_labels.dtype,
+                                       device=device))
+
+            predictions_dicts.append(predictions_dict)
+        return predictions_dicts
diff --git a/mmde/projects/DSVT/dsvt/dsvt_input_layer.py b/mmde/projects/DSVT/dsvt/dsvt_input_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7aca02da263602120887eb676bf07d4ceaac301
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/dsvt_input_layer.py
@@ -0,0 +1,405 @@
+# modified from https://github.com/Haiyang-W/DSVT
+from math import ceil
+
+import torch
+from torch import nn
+
+from .utils import (PositionEmbeddingLearned, get_continous_inds,
+                    get_inner_win_inds_cuda, get_pooling_index,
+                    get_window_coors)
+
+
+class DSVTInputLayer(nn.Module):
+    '''
+    This class converts the output of vfe to dsvt input.
+    We do in this class:
+    1. Window partition: partition voxels to non-overlapping windows.
+    2. Set partition: generate non-overlapped and size-equivalent local sets
+        within each window.
+    3. Pre-compute the downsample information between two consecutive stages.
+    4. Pre-compute the position embedding vectors.
+
+    Args:
+        sparse_shape (tuple[int, int, int]): Shape of input space
+            (xdim, ydim, zdim).
+        window_shape (list[list[int, int, int]]): Window shapes
+            (winx, winy, winz) in different stages. Length: stage_num.
+        downsample_stride (list[list[int, int, int]]): Downsample
+            strides between two consecutive stages.
+            Element i is [ds_x, ds_y, ds_z], which is used between stage_i and
+            stage_{i+1}. Length: stage_num - 1.
+        dim_model (list[int]): Number of input channels for each stage. Length:
+            stage_num.
+        set_info (list[list[int, int]]): A list of set config for each stage.
+            Eelement i contains
+            [set_size, block_num], where set_size is the number of voxel in a
+            set and block_num is the
+            number of blocks for stage i. Length: stage_num.
+        hybrid_factor (list[int, int, int]): Control the window shape in
+            different blocks.
+            e.g. for block_{0} and block_{1} in stage_0, window shapes are
+            [win_x, win_y, win_z] and
+            [win_x * h[0], win_y * h[1], win_z * h[2]] respectively.
+        shift_list (list): Shift window. Length: stage_num.
+        normalize_pos (bool): Whether to normalize coordinates in position
+            embedding.
+    '''
+
+    def __init__(self, sparse_shape, window_shape, downsample_stride,
+                 dim_model, set_info, hybrid_factor, shift_list,
+                 normalize_pos):
+        super().__init__()
+
+        self.sparse_shape = sparse_shape
+        self.window_shape = window_shape
+        self.downsample_stride = downsample_stride
+        self.dim_model = dim_model
+        self.set_info = set_info
+        self.stage_num = len(self.dim_model)
+
+        self.hybrid_factor = hybrid_factor
+        self.window_shape = [[
+            self.window_shape[s_id],
+            [
+                self.window_shape[s_id][coord_id] *
+                self.hybrid_factor[coord_id] for coord_id in range(3)
+            ]
+        ] for s_id in range(self.stage_num)]
+        self.shift_list = shift_list
+        self.normalize_pos = normalize_pos
+
+        self.num_shifts = [
+            2,
+        ] * len(self.window_shape)
+
+        self.sparse_shape_list = [self.sparse_shape]
+        # compute sparse shapes for each stage
+        for ds_stride in self.downsample_stride:
+            last_sparse_shape = self.sparse_shape_list[-1]
+            self.sparse_shape_list.append(
+                (ceil(last_sparse_shape[0] / ds_stride[0]),
+                 ceil(last_sparse_shape[1] / ds_stride[1]),
+                 ceil(last_sparse_shape[2] / ds_stride[2])))
+
+        # position embedding layers
+        self.posembed_layers = nn.ModuleList()
+        for i in range(len(self.set_info)):
+            input_dim = 3 if self.sparse_shape_list[i][-1] > 1 else 2
+            stage_posembed_layers = nn.ModuleList()
+            for j in range(self.set_info[i][1]):
+                block_posembed_layers = nn.ModuleList()
+                for s in range(self.num_shifts[i]):
+                    block_posembed_layers.append(
+                        PositionEmbeddingLearned(input_dim, self.dim_model[i]))
+                stage_posembed_layers.append(block_posembed_layers)
+            self.posembed_layers.append(stage_posembed_layers)
+
+    def forward(self, batch_dict):
+        '''
+        Args:
+            bacth_dict (dict):
+                The dict contains the following keys
+                - voxel_features (Tensor[float]): Voxel features after VFE
+                    with shape (N, dim_model[0]),
+                    where N is the number of input voxels.
+                - voxel_coords (Tensor[int]): Shape of (N, 4), corresponding
+                    voxel coordinates of each voxels.
+                    Each row is (batch_id, z, y, x).
+                - ...
+
+        Returns:
+            voxel_info (dict):
+                The dict contains the following keys
+                - voxel_coors_stage{i} (Tensor[int]): Shape of (N_i, 4). N is
+                    the number of voxels in stage_i.
+                    Each row is (batch_id, z, y, x).
+                - set_voxel_inds_stage{i}_shift{j} (Tensor[int]): Set partition
+                    index with shape (2, set_num, set_info[i][0]).
+                    2 indicates x-axis partition and y-axis partition.
+                - set_voxel_mask_stage{i}_shift{i} (Tensor[bool]): Key mask
+                    used in set attention with shape
+                    (2, set_num, set_info[i][0]).
+                - pos_embed_stage{i}_block{i}_shift{i} (Tensor[float]):
+                    Position embedding vectors with shape (N_i, dim_model[i]).
+                    N_i is the number of remain voxels in stage_i;
+                - pooling_mapping_index_stage{i} (Tensor[int]): Pooling region
+                    index used in pooling operation between stage_{i-1}
+                    and stage_{i} with shape (N_{i-1}).
+                - pooling_index_in_pool_stage{i} (Tensor[int]): Index inner
+                    region with shape (N_{i-1}). Combined with
+                    pooling_mapping_index_stage{i}, we can map each voxel in
+                    satge_{i-1} to pooling_preholder_feats_stage{i}, which
+                    are input of downsample operation.
+                - pooling_preholder_feats_stage{i} (Tensor[int]): Preholder
+                    features initial with value 0.
+                    Shape of (N_{i}, downsample_stride[i-1].prob(),
+                    d_moel[i-1]), where prob() returns the product of
+                    all elements.
+                - ...
+        '''
+        voxel_feats = batch_dict['voxel_features']
+        voxel_coors = batch_dict['voxel_coords'].long()
+
+        voxel_info = {}
+        voxel_info['voxel_feats_stage0'] = voxel_feats.clone()
+        voxel_info['voxel_coors_stage0'] = voxel_coors.clone()
+
+        for stage_id in range(self.stage_num):
+            # window partition of corresponding stage-map
+            voxel_info = self.window_partition(voxel_info, stage_id)
+            # generate set id of corresponding stage-map
+            voxel_info = self.get_set(voxel_info, stage_id)
+            for block_id in range(self.set_info[stage_id][1]):
+                for shift_id in range(self.num_shifts[stage_id]):
+                    layer_name = f'pos_embed_stage{stage_id}_block{block_id}_shift{shift_id}'  # noqa: E501
+                    pos_name = f'coors_in_win_stage{stage_id}_shift{shift_id}'
+                    voxel_info[layer_name] = self.get_pos_embed(
+                        voxel_info[pos_name], stage_id, block_id, shift_id)
+
+            # compute pooling information
+            if stage_id < self.stage_num - 1:
+                voxel_info = self.subm_pooling(voxel_info, stage_id)
+
+        return voxel_info
+
+    @torch.no_grad()
+    def subm_pooling(self, voxel_info, stage_id):
+        # x,y,z stride
+        cur_stage_downsample = self.downsample_stride[stage_id]
+        # batch_win_coords is from 1 of x, y
+        batch_win_inds, _, index_in_win, batch_win_coors = get_pooling_index(
+            voxel_info[f'voxel_coors_stage{stage_id}'],
+            self.sparse_shape_list[stage_id], cur_stage_downsample)
+        # compute pooling mapping index
+        unique_batch_win_inds, contiguous_batch_win_inds = torch.unique(
+            batch_win_inds, return_inverse=True)
+        voxel_info[
+            f'pooling_mapping_index_stage{stage_id+1}'] = \
+            contiguous_batch_win_inds
+
+        # generate empty placeholder features
+        placeholder_prepool_feats = voxel_info['voxel_feats_stage0'].new_zeros(
+            (len(unique_batch_win_inds),
+             torch.prod(torch.IntTensor(cur_stage_downsample)).item(),
+             self.dim_model[stage_id]))
+        voxel_info[f'pooling_index_in_pool_stage{stage_id+1}'] = index_in_win
+        voxel_info[
+            f'pooling_preholder_feats_stage{stage_id+1}'] = \
+            placeholder_prepool_feats
+
+        # compute pooling coordinates
+        unique, inverse = unique_batch_win_inds.clone(
+        ), contiguous_batch_win_inds.clone()
+        perm = torch.arange(
+            inverse.size(0), dtype=inverse.dtype, device=inverse.device)
+        inverse, perm = inverse.flip([0]), perm.flip([0])
+        perm = inverse.new_empty(unique.size(0)).scatter_(0, inverse, perm)
+        pool_coors = batch_win_coors[perm]
+
+        voxel_info[f'voxel_coors_stage{stage_id+1}'] = pool_coors
+
+        return voxel_info
+
+    def get_set(self, voxel_info, stage_id):
+        '''
+        This is one of the core operation of DSVT.
+        Given voxels' window ids and relative-coords inner window, we partition
+        them into window-bounded and size-equivalent local sets. To make it
+        clear and easy to follow, we do not use loop to process two shifts.
+        Args:
+            voxel_info (dict):
+                The dict contains the following keys
+                - batch_win_inds_s{i} (Tensor[float]): Windows indices of each
+                    voxel with shape (N), computed by 'window_partition'.
+                - coors_in_win_shift{i} (Tensor[int]): Relative-coords inner
+                    window of each voxel with shape (N, 3), computed by
+                    'window_partition'. Each row is (z, y, x).
+                - ...
+
+        Returns:
+            See from 'forward' function.
+        '''
+        batch_win_inds_shift0 = voxel_info[
+            f'batch_win_inds_stage{stage_id}_shift0']
+        coors_in_win_shift0 = voxel_info[
+            f'coors_in_win_stage{stage_id}_shift0']
+        set_voxel_inds_shift0 = self.get_set_single_shift(
+            batch_win_inds_shift0,
+            stage_id,
+            shift_id=0,
+            coors_in_win=coors_in_win_shift0)
+        voxel_info[
+            f'set_voxel_inds_stage{stage_id}_shift0'] = set_voxel_inds_shift0
+        # compute key masks, voxel duplication must happen continuously
+        prefix_set_voxel_inds_s0 = torch.roll(
+            set_voxel_inds_shift0.clone(), shifts=1, dims=-1)
+        prefix_set_voxel_inds_s0[:, :, 0] = -1
+        set_voxel_mask_s0 = (set_voxel_inds_shift0 == prefix_set_voxel_inds_s0)
+        voxel_info[
+            f'set_voxel_mask_stage{stage_id}_shift0'] = set_voxel_mask_s0
+
+        batch_win_inds_shift1 = voxel_info[
+            f'batch_win_inds_stage{stage_id}_shift1']
+        coors_in_win_shift1 = voxel_info[
+            f'coors_in_win_stage{stage_id}_shift1']
+        set_voxel_inds_shift1 = self.get_set_single_shift(
+            batch_win_inds_shift1,
+            stage_id,
+            shift_id=1,
+            coors_in_win=coors_in_win_shift1)
+        voxel_info[
+            f'set_voxel_inds_stage{stage_id}_shift1'] = set_voxel_inds_shift1
+        # compute key masks, voxel duplication must happen continuously
+        prefix_set_voxel_inds_s1 = torch.roll(
+            set_voxel_inds_shift1.clone(), shifts=1, dims=-1)
+        prefix_set_voxel_inds_s1[:, :, 0] = -1
+        set_voxel_mask_s1 = (set_voxel_inds_shift1 == prefix_set_voxel_inds_s1)
+        voxel_info[
+            f'set_voxel_mask_stage{stage_id}_shift1'] = set_voxel_mask_s1
+
+        return voxel_info
+
+    def get_set_single_shift(self,
+                             batch_win_inds,
+                             stage_id,
+                             shift_id=None,
+                             coors_in_win=None):
+        device = batch_win_inds.device
+        # the number of voxels assigned to a set
+        voxel_num_set = self.set_info[stage_id][0]
+        # max number of voxels in a window
+        max_voxel = self.window_shape[stage_id][shift_id][
+            0] * self.window_shape[stage_id][shift_id][1] * self.window_shape[
+                stage_id][shift_id][2]
+        # get unique set indices
+        contiguous_win_inds = torch.unique(
+            batch_win_inds, return_inverse=True)[1]
+        voxelnum_per_win = torch.bincount(contiguous_win_inds)
+        win_num = voxelnum_per_win.shape[0]
+        setnum_per_win_float = voxelnum_per_win / voxel_num_set
+        setnum_per_win = torch.ceil(setnum_per_win_float).long()
+        set_win_inds, set_inds_in_win = get_continous_inds(setnum_per_win)
+
+        # compution of Eq.3 in 'DSVT: Dynamic Sparse Voxel Transformer with
+        # Rotated Sets' - https://arxiv.org/abs/2301.06051,
+        # for each window, we can get voxel indices belong to different sets.
+        offset_idx = set_inds_in_win[:, None].repeat(
+            1, voxel_num_set) * voxel_num_set
+        base_idx = torch.arange(0, voxel_num_set, 1, device=device)
+        base_select_idx = offset_idx + base_idx
+        base_select_idx = base_select_idx * voxelnum_per_win[
+            set_win_inds][:, None]
+        base_select_idx = base_select_idx.double() / (
+            setnum_per_win[set_win_inds] * voxel_num_set)[:, None].double()
+        base_select_idx = torch.floor(base_select_idx)
+        # obtain unique indices in whole space
+        select_idx = base_select_idx
+        select_idx = select_idx + set_win_inds.view(-1, 1) * max_voxel
+
+        # this function will return unordered inner window indices of
+        # each voxel
+        inner_voxel_inds = get_inner_win_inds_cuda(contiguous_win_inds)
+        global_voxel_inds = contiguous_win_inds * max_voxel + inner_voxel_inds
+        _, order1 = torch.sort(global_voxel_inds)
+
+        # get y-axis partition results
+        global_voxel_inds_sorty = contiguous_win_inds * max_voxel + \
+            coors_in_win[:, 1] * self.window_shape[stage_id][shift_id][0] * \
+            self.window_shape[stage_id][shift_id][2] + coors_in_win[:, 2] * \
+            self.window_shape[stage_id][shift_id][2] + \
+            coors_in_win[:, 0]
+        _, order2 = torch.sort(global_voxel_inds_sorty)
+        inner_voxel_inds_sorty = -torch.ones_like(inner_voxel_inds)
+        inner_voxel_inds_sorty.scatter_(
+            dim=0, index=order2, src=inner_voxel_inds[order1]
+        )  # get y-axis ordered inner window indices of each voxel
+        voxel_inds_in_batch_sorty = inner_voxel_inds_sorty + max_voxel * \
+            contiguous_win_inds
+        voxel_inds_padding_sorty = -1 * torch.ones(
+            (win_num * max_voxel), dtype=torch.long, device=device)
+        voxel_inds_padding_sorty[voxel_inds_in_batch_sorty] = torch.arange(
+            0,
+            voxel_inds_in_batch_sorty.shape[0],
+            dtype=torch.long,
+            device=device)
+        set_voxel_inds_sorty = voxel_inds_padding_sorty[select_idx.long()]
+
+        # get x-axis partition results
+        global_voxel_inds_sortx = contiguous_win_inds * max_voxel + \
+            coors_in_win[:, 2] * self.window_shape[stage_id][shift_id][1] * \
+            self.window_shape[stage_id][shift_id][2] + \
+            coors_in_win[:, 1] * self.window_shape[stage_id][shift_id][2] + \
+            coors_in_win[:, 0]
+        _, order2 = torch.sort(global_voxel_inds_sortx)
+        inner_voxel_inds_sortx = -torch.ones_like(inner_voxel_inds)
+        inner_voxel_inds_sortx.scatter_(
+            dim=0, index=order2, src=inner_voxel_inds[order1]
+        )  # get x-axis ordered inner window indices of each voxel
+        voxel_inds_in_batch_sortx = inner_voxel_inds_sortx + max_voxel * \
+            contiguous_win_inds
+        voxel_inds_padding_sortx = -1 * torch.ones(
+            (win_num * max_voxel), dtype=torch.long, device=device)
+        voxel_inds_padding_sortx[voxel_inds_in_batch_sortx] = torch.arange(
+            0,
+            voxel_inds_in_batch_sortx.shape[0],
+            dtype=torch.long,
+            device=device)
+        set_voxel_inds_sortx = voxel_inds_padding_sortx[select_idx.long()]
+
+        all_set_voxel_inds = torch.stack(
+            (set_voxel_inds_sorty, set_voxel_inds_sortx), dim=0)
+        return all_set_voxel_inds
+
+    @torch.no_grad()
+    def window_partition(self, voxel_info, stage_id):
+        for i in range(2):
+            batch_win_inds, coors_in_win = get_window_coors(
+                voxel_info[f'voxel_coors_stage{stage_id}'],
+                self.sparse_shape_list[stage_id],
+                self.window_shape[stage_id][i], i == 1,
+                self.shift_list[stage_id][i])
+
+            voxel_info[
+                f'batch_win_inds_stage{stage_id}_shift{i}'] = batch_win_inds
+            voxel_info[f'coors_in_win_stage{stage_id}_shift{i}'] = coors_in_win
+
+        return voxel_info
+
+    def get_pos_embed(self, coors_in_win, stage_id, block_id, shift_id):
+        '''
+        Args:
+            coors_in_win: shape=[N, 3], order: z, y, x
+        '''
+        # [N,]
+        window_shape = self.window_shape[stage_id][shift_id]
+
+        embed_layer = self.posembed_layers[stage_id][block_id][shift_id]
+        if len(window_shape) == 2:
+            ndim = 2
+            win_x, win_y = window_shape
+            win_z = 0
+        elif window_shape[-1] == 1:
+            ndim = 2
+            win_x, win_y = window_shape[:2]
+            win_z = 0
+        else:
+            win_x, win_y, win_z = window_shape
+            ndim = 3
+
+        assert coors_in_win.size(1) == 3
+        z, y, x = coors_in_win[:, 0] - win_z / 2,\
+            coors_in_win[:, 1] - win_y / 2,\
+            coors_in_win[:, 2] - win_x / 2
+
+        if self.normalize_pos:
+            x = x / win_x * 2 * 3.1415  # [-pi, pi]
+            y = y / win_y * 2 * 3.1415  # [-pi, pi]
+            z = z / win_z * 2 * 3.1415  # [-pi, pi]
+
+        if ndim == 2:
+            location = torch.stack((x, y), dim=-1)
+        else:
+            location = torch.stack((x, y, z), dim=-1)
+        pos_embed = embed_layer(location)
+
+        return pos_embed
diff --git a/mmde/projects/DSVT/dsvt/dsvt_transformer.py b/mmde/projects/DSVT/dsvt/dsvt_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9290db71f743ef06df03170c26003f30106ecc31
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/dsvt_transformer.py
@@ -0,0 +1,413 @@
+# modified from https://github.com/Haiyang-W/DSVT
+import torch
+import torch.nn as nn
+
+from mmdet3d.registry import MODELS
+from .dsvt_input_layer import DSVTInputLayer
+
+
+@MODELS.register_module()
+class DSVTMiddleEncoder(nn.Module):
+    '''Dynamic Sparse Voxel Transformer Backbone.
+    Args:
+        INPUT_LAYER: Config of input layer, which converts the output of vfe
+            to dsvt input.
+        block_name (list[string]): Name of blocks for each stage. Length:
+            stage_num.
+        set_info (list[list[int, int]]): A list of set config for each stage.
+            Eelement i contains
+            [set_size, block_num], where set_size is the number of voxel in a
+            set and block_num is the number of blocks for stage i. Length:
+            stage_num.
+        dim_model (list[int]): Number of input channels for each stage.
+            Length: stage_num.
+        nhead (list[int]): Number of attention heads for each stage.
+            Length: stage_num.
+        dim_feedforward (list[int]): Dimensions of the feedforward network in
+            set attention for each stage. Length: stage num.
+        dropout (float): Drop rate of set attention.
+        activation (string): Name of activation layer in set attention.
+        reduction_type (string): Pooling method between stages.
+            One of: "attention", "maxpool", "linear".
+        output_shape (tuple[int, int]): Shape of output bev feature.
+        conv_out_channel (int): Number of output channels.
+
+    '''
+
+    def __init__(
+            self,
+            input_layer=dict(
+                sparse_shape=[468, 468, 1],
+                downsample_stride=[],
+                dim_model=[192],
+                set_info=[[36, 4]],
+                window_shape=[[12, 12, 1]],
+                hybrid_factor=[2, 2, 1],  # x, y, z
+                shifts_list=[[[0, 0, 0], [6, 6, 0]]],
+                normalize_pos=False),
+            stage_num=1,
+            output_shape=[468, 468],
+            reduction_type='attention',
+            downsample_stride=[],
+            set_info=[[36, 4]],
+            dim_model=[192],
+            dim_feedforward=[384],
+            nhead=[8],
+            conv_out_channel=192,
+            dropout=0.,
+            activation='gelu'):
+        super().__init__()
+        self.input_layer = DSVTInputLayer(**input_layer)
+        self.reduction_type = reduction_type
+
+        # Sparse Regional Attention Blocks
+        for stage_id in range(stage_num):
+            num_blocks_this_stage = set_info[stage_id][-1]
+            dmodel_this_stage = dim_model[stage_id]
+            dfeed_this_stage = dim_feedforward[stage_id]
+            num_head_this_stage = nhead[stage_id]
+            block_list = []
+            norm_list = []
+            for i in range(num_blocks_this_stage):
+                block_list.append(
+                    DSVTBlock(
+                        dmodel_this_stage,
+                        num_head_this_stage,
+                        dfeed_this_stage,
+                        dropout,
+                        activation,
+                        batch_first=True))
+                norm_list.append(nn.LayerNorm(dmodel_this_stage))
+            self.__setattr__(f'stage_{stage_id}', nn.ModuleList(block_list))
+            self.__setattr__(f'residual_norm_stage_{stage_id}',
+                             nn.ModuleList(norm_list))
+
+            # apply pooling except the last stage
+            if stage_id < stage_num - 1:
+                downsample_window = downsample_stride[stage_id]
+                dmodel_next_stage = dim_model[stage_id + 1]
+                pool_volume = torch.IntTensor(downsample_window).prod().item()
+                if self.reduction_type == 'linear':
+                    cat_feat_dim = dmodel_this_stage * torch.IntTensor(
+                        downsample_window).prod().item()
+                    self.__setattr__(
+                        f'stage_{stage_id}_reduction',
+                        StageReductionBlock(cat_feat_dim, dmodel_next_stage))
+                elif self.reduction_type == 'maxpool':
+                    self.__setattr__(f'stage_{stage_id}_reduction',
+                                     torch.nn.MaxPool1d(pool_volume))
+                elif self.reduction_type == 'attention':
+                    self.__setattr__(
+                        f'stage_{stage_id}_reduction',
+                        StageReductionAttBlock(dmodel_this_stage, pool_volume))
+                else:
+                    raise NotImplementedError
+
+        self.num_shifts = [2] * stage_num
+        self.output_shape = output_shape
+        self.stage_num = stage_num
+        self.set_info = set_info
+        self.num_point_features = conv_out_channel
+
+        self._reset_parameters()
+
+    def forward(self, batch_dict):
+        '''
+        Args:
+            bacth_dict (dict):
+                The dict contains the following keys
+                - voxel_features (Tensor[float]): Voxel features after VFE.
+                    Shape of (N, dim_model[0]),
+                    where N is the number of input voxels.
+                - voxel_coords (Tensor[int]): Shape of (N, 4), corresponding
+                    voxel coordinates of each voxels.
+                    Each row is (batch_id, z, y, x).
+                - ...
+
+        Returns:
+            bacth_dict (dict):
+                The dict contains the following keys
+                - pillar_features (Tensor[float]):
+                - voxel_coords (Tensor[int]):
+                - ...
+        '''
+        voxel_info = self.input_layer(batch_dict)
+
+        voxel_feat = voxel_info['voxel_feats_stage0']
+        set_voxel_inds_list = [[
+            voxel_info[f'set_voxel_inds_stage{s}_shift{i}']
+            for i in range(self.num_shifts[s])
+        ] for s in range(self.stage_num)]
+        set_voxel_masks_list = [[
+            voxel_info[f'set_voxel_mask_stage{s}_shift{i}']
+            for i in range(self.num_shifts[s])
+        ] for s in range(self.stage_num)]
+        pos_embed_list = [[[
+            voxel_info[f'pos_embed_stage{s}_block{b}_shift{i}']
+            for i in range(self.num_shifts[s])
+        ] for b in range(self.set_info[s][1])] for s in range(self.stage_num)]
+        pooling_mapping_index = [
+            voxel_info[f'pooling_mapping_index_stage{s+1}']
+            for s in range(self.stage_num - 1)
+        ]
+        pooling_index_in_pool = [
+            voxel_info[f'pooling_index_in_pool_stage{s+1}']
+            for s in range(self.stage_num - 1)
+        ]
+        pooling_preholder_feats = [
+            voxel_info[f'pooling_preholder_feats_stage{s+1}']
+            for s in range(self.stage_num - 1)
+        ]
+
+        output = voxel_feat
+        block_id = 0
+        for stage_id in range(self.stage_num):
+            block_layers = self.__getattr__(f'stage_{stage_id}')
+            residual_norm_layers = self.__getattr__(
+                f'residual_norm_stage_{stage_id}')
+            for i in range(len(block_layers)):
+                block = block_layers[i]
+                residual = output.clone()
+                output = block(
+                    output,
+                    set_voxel_inds_list[stage_id],
+                    set_voxel_masks_list[stage_id],
+                    pos_embed_list[stage_id][i],
+                    block_id=block_id)
+                output = residual_norm_layers[i](output + residual)
+                block_id += 1
+            if stage_id < self.stage_num - 1:
+                # pooling
+                prepool_features = pooling_preholder_feats[stage_id].type_as(
+                    output)
+                pooled_voxel_num = prepool_features.shape[0]
+                pool_volume = prepool_features.shape[1]
+                prepool_features[pooling_mapping_index[stage_id],
+                                 pooling_index_in_pool[stage_id]] = output
+                prepool_features = prepool_features.view(
+                    prepool_features.shape[0], -1)
+
+                if self.reduction_type == 'linear':
+                    output = self.__getattr__(f'stage_{stage_id}_reduction')(
+                        prepool_features)
+                elif self.reduction_type == 'maxpool':
+                    prepool_features = prepool_features.view(
+                        pooled_voxel_num, pool_volume, -1).permute(0, 2, 1)
+                    output = self.__getattr__(f'stage_{stage_id}_reduction')(
+                        prepool_features).squeeze(-1)
+                elif self.reduction_type == 'attention':
+                    prepool_features = prepool_features.view(
+                        pooled_voxel_num, pool_volume, -1).permute(0, 2, 1)
+                    key_padding_mask = torch.zeros(
+                        (pooled_voxel_num,
+                         pool_volume)).to(prepool_features.device).int()
+                    output = self.__getattr__(f'stage_{stage_id}_reduction')(
+                        prepool_features, key_padding_mask)
+                else:
+                    raise NotImplementedError
+
+        batch_dict['pillar_features'] = batch_dict['voxel_features'] = output
+        batch_dict['voxel_coords'] = voxel_info[
+            f'voxel_coors_stage{self.stage_num - 1}']
+        return batch_dict
+
+    def _reset_parameters(self):
+        for name, p in self.named_parameters():
+            if p.dim() > 1 and 'scaler' not in name:
+                nn.init.xavier_uniform_(p)
+
+
+class DSVTBlock(nn.Module):
+    """Consist of two encoder layer, shift and shift back."""
+
+    def __init__(self,
+                 dim_model,
+                 nhead,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation='relu',
+                 batch_first=True):
+        super().__init__()
+
+        encoder_1 = DSVTEncoderLayer(dim_model, nhead, dim_feedforward,
+                                     dropout, activation, batch_first)
+        encoder_2 = DSVTEncoderLayer(dim_model, nhead, dim_feedforward,
+                                     dropout, activation, batch_first)
+        self.encoder_list = nn.ModuleList([encoder_1, encoder_2])
+
+    def forward(
+        self,
+        src,
+        set_voxel_inds_list,
+        set_voxel_masks_list,
+        pos_embed_list,
+        block_id,
+    ):
+        num_shifts = 2
+        output = src
+        for i in range(num_shifts):
+            set_id = i
+            shift_id = block_id % 2
+            pos_embed_id = i
+            set_voxel_inds = set_voxel_inds_list[shift_id][set_id]
+            set_voxel_masks = set_voxel_masks_list[shift_id][set_id]
+            pos_embed = pos_embed_list[pos_embed_id]
+            layer = self.encoder_list[i]
+            output = layer(output, set_voxel_inds, set_voxel_masks, pos_embed)
+
+        return output
+
+
+class DSVTEncoderLayer(nn.Module):
+
+    def __init__(self,
+                 dim_model,
+                 nhead,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation='relu',
+                 batch_first=True,
+                 mlp_dropout=0):
+        super().__init__()
+        self.win_attn = SetAttention(dim_model, nhead, dropout,
+                                     dim_feedforward, activation, batch_first,
+                                     mlp_dropout)
+        self.norm = nn.LayerNorm(dim_model)
+        self.dim_model = dim_model
+
+    def forward(self, src, set_voxel_inds, set_voxel_masks, pos=None):
+        identity = src
+        src = self.win_attn(src, pos, set_voxel_masks, set_voxel_inds)
+        src = src + identity
+        src = self.norm(src)
+
+        return src
+
+
+class SetAttention(nn.Module):
+
+    def __init__(self,
+                 dim_model,
+                 nhead,
+                 dropout,
+                 dim_feedforward=2048,
+                 activation='relu',
+                 batch_first=True,
+                 mlp_dropout=0):
+        super().__init__()
+        self.nhead = nhead
+        if batch_first:
+            self.self_attn = nn.MultiheadAttention(
+                dim_model, nhead, dropout=dropout, batch_first=batch_first)
+        else:
+            self.self_attn = nn.MultiheadAttention(
+                dim_model, nhead, dropout=dropout)
+
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(dim_model, dim_feedforward)
+        self.dropout = nn.Dropout(mlp_dropout)
+        self.linear2 = nn.Linear(dim_feedforward, dim_model)
+        self.dim_model = dim_model
+        self.norm1 = nn.LayerNorm(dim_model)
+        self.norm2 = nn.LayerNorm(dim_model)
+        self.dropout1 = nn.Identity()
+        self.dropout2 = nn.Identity()
+
+        self.activation = _get_activation_fn(activation)
+
+    def forward(self, src, pos=None, key_padding_mask=None, voxel_inds=None):
+        '''
+        Args:
+            src (Tensor[float]): Voxel features with shape (N, C), where N is
+                the number of voxels.
+            pos (Tensor[float]): Position embedding vectors with shape (N, C).
+            key_padding_mask (Tensor[bool]): Mask for redundant voxels
+                within set. Shape of (set_num, set_size).
+            voxel_inds (Tensor[int]): Voxel indices for each set.
+                Shape of (set_num, set_size).
+        Returns:
+            src (Tensor[float]): Voxel features.
+        '''
+        set_features = src[voxel_inds]
+        if pos is not None:
+            set_pos = pos[voxel_inds]
+        else:
+            set_pos = None
+        if pos is not None:
+            query = set_features + set_pos
+            key = set_features + set_pos
+            value = set_features
+
+        if key_padding_mask is not None:
+            src2 = self.self_attn(query, key, value, key_padding_mask)[0]
+        else:
+            src2 = self.self_attn(query, key, value)[0]
+
+        # map voxel features from set space to voxel space:
+        # (set_num, set_size, C) --> (N, C)
+        flatten_inds = voxel_inds.reshape(-1)
+        unique_flatten_inds, inverse = torch.unique(
+            flatten_inds, return_inverse=True)
+        perm = torch.arange(
+            inverse.size(0), dtype=inverse.dtype, device=inverse.device)
+        inverse, perm = inverse.flip([0]), perm.flip([0])
+        perm = inverse.new_empty(unique_flatten_inds.size(0)).scatter_(
+            0, inverse, perm)
+        src2 = src2.reshape(-1, self.dim_model)[perm]
+
+        # FFN layer
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+
+        return src
+
+
+class StageReductionBlock(nn.Module):
+
+    def __init__(self, input_channel, output_channel):
+        super().__init__()
+        self.linear1 = nn.Linear(input_channel, output_channel, bias=False)
+        self.norm = nn.LayerNorm(output_channel)
+
+    def forward(self, x):
+        src = x
+        src = self.norm(self.linear1(x))
+        return src
+
+
+class StageReductionAttBlock(nn.Module):
+
+    def __init__(self, input_channel, pool_volume):
+        super().__init__()
+        self.pool_volume = pool_volume
+        self.query_func = torch.nn.MaxPool1d(pool_volume)
+        self.norm = nn.LayerNorm(input_channel)
+        self.self_attn = nn.MultiheadAttention(
+            input_channel, 8, batch_first=True)
+        self.pos_embedding = nn.Parameter(
+            torch.randn(pool_volume, input_channel))
+        nn.init.normal_(self.pos_embedding, std=.01)
+
+    def forward(self, x, key_padding_mask):
+        # x: [voxel_num, c_dim, pool_volume]
+        src = self.query_func(x).permute(0, 2, 1)  # voxel_num, 1, c_dim
+        key = value = x.permute(0, 2, 1)
+        key = key + self.pos_embedding.unsqueeze(0).repeat(src.shape[0], 1, 1)
+        query = src.clone()
+        output = self.self_attn(query, key, value, key_padding_mask)[0]
+        src = self.norm(output + src).squeeze(1)
+        return src
+
+
+def _get_activation_fn(activation):
+    """Return an activation function given a string."""
+    if activation == 'relu':
+        return torch.nn.functional.relu
+    if activation == 'gelu':
+        return torch.nn.functional.gelu
+    if activation == 'glu':
+        return torch.nn.functional.glu
+    raise RuntimeError(F'activation should be relu/gelu, not {activation}.')
diff --git a/mmde/projects/DSVT/dsvt/dynamic_pillar_vfe.py b/mmde/projects/DSVT/dsvt/dynamic_pillar_vfe.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fc5266c569b6fcd729bdaeccaf8176040eaa245
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/dynamic_pillar_vfe.py
@@ -0,0 +1,178 @@
+# modified from https://github.com/Haiyang-W/DSVT
+import numpy as np
+import torch
+import torch.nn as nn
+import torch_scatter
+
+from mmdet3d.registry import MODELS
+
+
+class PFNLayerV2(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 use_norm=True,
+                 last_layer=False):
+        super().__init__()
+
+        self.last_vfe = last_layer
+        self.use_norm = use_norm
+        if not self.last_vfe:
+            out_channels = out_channels // 2
+
+        if self.use_norm:
+            self.linear = nn.Linear(in_channels, out_channels, bias=False)
+            self.norm = nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01)
+        else:
+            self.linear = nn.Linear(in_channels, out_channels, bias=True)
+
+        self.relu = nn.ReLU()
+
+    def forward(self, inputs, unq_inv):
+
+        x = self.linear(inputs)
+        x = self.norm(x) if self.use_norm else x
+        x = self.relu(x)
+        x_max = torch_scatter.scatter_max(x, unq_inv, dim=0)[0]
+
+        if self.last_vfe:
+            return x_max
+        else:
+            x_concatenated = torch.cat([x, x_max[unq_inv, :]], dim=1)
+            return x_concatenated
+
+
+@MODELS.register_module()
+class DynamicPillarVFE3D(nn.Module):
+    """The difference between `DynamicPillarVFE3D` and `DynamicPillarVFE` is
+    that the voxel in this module is along 3 dims: (x, y, z)."""
+
+    def __init__(self, with_distance, use_absolute_xyz, use_norm, num_filters,
+                 num_point_features, voxel_size, grid_size, point_cloud_range):
+        super().__init__()
+        self.use_norm = use_norm
+        self.with_distance = with_distance
+        self.use_absolute_xyz = use_absolute_xyz
+        num_point_features += 6 if self.use_absolute_xyz else 3
+        if self.with_distance:
+            num_point_features += 1
+
+        self.num_filters = num_filters
+        assert len(self.num_filters) > 0
+        num_filters = [num_point_features] + list(self.num_filters)
+
+        pfn_layers = []
+        for i in range(len(num_filters) - 1):
+            in_filters = num_filters[i]
+            out_filters = num_filters[i + 1]
+            pfn_layers.append(
+                PFNLayerV2(
+                    in_filters,
+                    out_filters,
+                    self.use_norm,
+                    last_layer=(i >= len(num_filters) - 2)))
+        self.pfn_layers = nn.ModuleList(pfn_layers)
+
+        self.voxel_x = voxel_size[0]
+        self.voxel_y = voxel_size[1]
+        self.voxel_z = voxel_size[2]
+        point_cloud_range = np.array(point_cloud_range).astype(np.float32)
+        self.x_offset = self.voxel_x / 2 + point_cloud_range[0]
+        self.y_offset = self.voxel_y / 2 + point_cloud_range[1]
+        self.z_offset = self.voxel_z / 2 + point_cloud_range[2]
+
+        self.scale_xyz = grid_size[0] * grid_size[1] * grid_size[2]
+        self.scale_yz = grid_size[1] * grid_size[2]
+        self.scale_z = grid_size[2]
+
+        self.grid_size = torch.tensor(grid_size).cuda()
+        self.voxel_size = torch.tensor(voxel_size).cuda()
+        self.point_cloud_range = torch.tensor(point_cloud_range).cuda()
+
+    def get_output_feature_dim(self):
+        return self.num_filters[-1]
+
+    def forward(self, batch_dict, **kwargs):
+        """Forward function.
+
+        Args:
+            batch_dict (dict[list]): Batch input data:
+                - points [list[Tensor]]: list of batch input points.
+
+        Returns:
+            dict: Voxelization outputs:
+                - points:
+                - pillar_features/voxel_features:
+                - voxel_coords
+        """
+        batch_prefix_points = []
+        for batch_idx, points in enumerate(batch_dict['points']):
+            prefix_batch_idx = torch.Tensor([batch_idx
+                                             ]).tile(points.size(0),
+                                                     1).to(points)
+            prefix_points = torch.cat((prefix_batch_idx, points),
+                                      dim=1)  # (batch_idx, x, y, z, i, e)
+            batch_prefix_points.append(prefix_points)
+
+        points = torch.cat(batch_prefix_points, dim=0)
+        del prefix_points, batch_prefix_points
+
+        points_coords = torch.floor(
+            (points[:, [1, 2, 3]] - self.point_cloud_range[[0, 1, 2]]) /
+            self.voxel_size[[0, 1, 2]]).int()
+        mask = ((points_coords >= 0) &
+                (points_coords < self.grid_size[[0, 1, 2]])).all(dim=1)
+        points = points[mask]
+        points_coords = points_coords[mask]
+        points_xyz = points[:, [1, 2, 3]].contiguous()
+
+        merge_coords = points[:, 0].int() * self.scale_xyz + \
+            points_coords[:, 0] * self.scale_yz + \
+            points_coords[:, 1] * self.scale_z + points_coords[:, 2]
+
+        unq_coords, unq_inv, unq_cnt = torch.unique(
+            merge_coords, return_inverse=True, return_counts=True, dim=0)
+
+        points_mean = torch_scatter.scatter_mean(points_xyz, unq_inv, dim=0)
+        f_cluster = points_xyz - points_mean[unq_inv, :]
+
+        f_center = torch.zeros_like(points_xyz)
+        f_center[:, 0] = points_xyz[:, 0] - (
+            points_coords[:, 0].to(points_xyz.dtype) * self.voxel_x +
+            self.x_offset)
+        f_center[:, 1] = points_xyz[:, 1] - (
+            points_coords[:, 1].to(points_xyz.dtype) * self.voxel_y +
+            self.y_offset)
+        # f_center[:, 2] = points_xyz[:, 2] - self.z_offset
+        f_center[:, 2] = points_xyz[:, 2] - (
+            points_coords[:, 2].to(points_xyz.dtype) * self.voxel_z +
+            self.z_offset)
+
+        if self.use_absolute_xyz:
+            features = [points[:, 1:], f_cluster, f_center]
+        else:
+            features = [points[:, 4:], f_cluster, f_center]
+
+        if self.with_distance:
+            points_dist = torch.norm(points[:, 1:4], 2, dim=1, keepdim=True)
+            features.append(points_dist)
+        features = torch.cat(features, dim=-1)
+
+        for pfn in self.pfn_layers:
+            features = pfn(features, unq_inv)
+
+        # generate voxel coordinates
+        unq_coords = unq_coords.int()
+        voxel_coords = torch.stack(
+            (unq_coords // self.scale_xyz,
+             (unq_coords % self.scale_xyz) // self.scale_yz,
+             (unq_coords % self.scale_yz) // self.scale_z,
+             unq_coords % self.scale_z),
+            dim=1)
+        voxel_coords = voxel_coords[:, [0, 3, 2, 1]]
+
+        batch_dict['pillar_features'] = batch_dict['voxel_features'] = features
+        batch_dict['voxel_coords'] = voxel_coords
+
+        return batch_dict
diff --git a/mmde/projects/DSVT/dsvt/map2bev.py b/mmde/projects/DSVT/dsvt/map2bev.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f4dc21b525a3b9db8ed521133a5f30d9d7a1151
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/map2bev.py
@@ -0,0 +1,46 @@
+# modified from https://github.com/Haiyang-W/DSVT
+import torch
+import torch.nn as nn
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class PointPillarsScatter3D(nn.Module):
+    """The difference between `PointPillarsScatter3D` and `PointPillarsScatter`
+    is that the voxel in this module is along 3 dims: (x, y, z)."""
+
+    def __init__(self, output_shape, num_bev_feats, **kwargs):
+        super().__init__()
+        self.nx, self.ny, self.nz = output_shape
+        self.num_bev_feats = num_bev_feats
+        self.num_bev_feats_ori = num_bev_feats // self.nz
+
+    def forward(self, batch_dict, **kwargs):
+        pillar_features, coords = batch_dict['pillar_features'], batch_dict[
+            'voxel_coords']
+
+        batch_spatial_features = []
+        batch_size = coords[:, 0].max().int().item() + 1
+        for batch_idx in range(batch_size):
+            spatial_feature = torch.zeros(
+                self.num_bev_feats_ori,
+                self.nz * self.nx * self.ny,
+                dtype=pillar_features.dtype,
+                device=pillar_features.device)
+
+            batch_mask = coords[:, 0] == batch_idx
+            this_coords = coords[batch_mask, :]
+            indices = this_coords[:, 1] * self.ny * self.nx + \
+                this_coords[:, 2] * self.nx + this_coords[:,  3]
+            indices = indices.type(torch.long)
+            pillars = pillar_features[batch_mask, :]
+            pillars = pillars.t()
+            spatial_feature[:, indices] = pillars
+            batch_spatial_features.append(spatial_feature)
+
+        batch_spatial_features = torch.stack(batch_spatial_features, 0)
+        batch_spatial_features = batch_spatial_features.view(
+            batch_size, self.num_bev_feats_ori * self.nz, self.ny, self.nx)
+        batch_dict['spatial_features'] = batch_spatial_features
+        return batch_dict
diff --git a/mmde/projects/DSVT/dsvt/ops/ingroup_inds/ingroup_inds_op.py b/mmde/projects/DSVT/dsvt/ops/ingroup_inds/ingroup_inds_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7594083875407199238e14370ad105b1cfe4e301
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/ops/ingroup_inds/ingroup_inds_op.py
@@ -0,0 +1,34 @@
+import torch
+from torch.autograd import Function
+
+try:
+    from . import ingroup_inds_cuda
+
+    # import ingroup_indices
+except ImportError:
+    ingroup_indices = None
+    print('Can not import ingroup indices')
+
+ingroup_indices = ingroup_inds_cuda
+
+
+class IngroupIndicesFunction(Function):
+
+    @staticmethod
+    def forward(ctx, group_inds):
+
+        out_inds = torch.zeros_like(group_inds) - 1
+
+        ingroup_indices.forward(group_inds, out_inds)
+
+        ctx.mark_non_differentiable(out_inds)
+
+        return out_inds
+
+    @staticmethod
+    def backward(ctx, g):
+
+        return None
+
+
+ingroup_inds = IngroupIndicesFunction.apply
diff --git a/mmde/projects/DSVT/dsvt/ops/ingroup_inds/src/error.cuh b/mmde/projects/DSVT/dsvt/ops/ingroup_inds/src/error.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7057ae5037957ea5ad6e7a631e7443403e37afa5
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/ops/ingroup_inds/src/error.cuh
@@ -0,0 +1,18 @@
+#pragma once
+#include <stdio.h>
+
+#define CHECK_CALL(call)                                   \
+do                                                    \
+{                                                     \
+    const cudaError_t error_code = call;              \
+    if (error_code != cudaSuccess)                    \
+    {                                                 \
+        printf("CUDA Error:\n");                      \
+        printf("    File:       %s\n", __FILE__);     \
+        printf("    Line:       %d\n", __LINE__);     \
+        printf("    Error code: %d\n", error_code);   \
+        printf("    Error text: %s\n",                \
+            cudaGetErrorString(error_code));          \
+        exit(1);                                      \
+    }                                                 \
+} while (0)
diff --git a/mmde/projects/DSVT/dsvt/ops/ingroup_inds/src/ingroup_inds.cpp b/mmde/projects/DSVT/dsvt/ops/ingroup_inds/src/ingroup_inds.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae1f329ee133beff7b4521f8d60b1ab92fc77284
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/ops/ingroup_inds/src/ingroup_inds.cpp
@@ -0,0 +1,54 @@
+#include <assert.h>
+#include <torch/extension.h>
+#include <torch/serialize/tensor.h>
+#include <vector>
+
+#define CHECK_CUDA(x) \
+  TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+
+void ingroup_inds_launcher(
+    const long *group_inds_data,
+    long *out_inds_data,
+    int N,
+    int max_group_id
+);
+
+
+void ingroup_inds_gpu(
+  at::Tensor group_inds,
+  at::Tensor out_inds
+);
+
+void ingroup_inds_gpu(
+  at::Tensor group_inds,
+  at::Tensor out_inds
+) {
+
+  CHECK_INPUT(group_inds);
+  CHECK_INPUT(out_inds);
+  int N = group_inds.size(0);
+  int max_group_id = group_inds.max().item().toLong();
+
+
+  long *group_inds_data = group_inds.data_ptr<long>();
+  long *out_inds_data = out_inds.data_ptr<long>();
+
+  ingroup_inds_launcher(
+      group_inds_data,
+      out_inds_data,
+      N,
+      max_group_id
+  );
+
+}
+
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &ingroup_inds_gpu, "cuda version of get_inner_win_inds of SST");
+}
diff --git a/mmde/projects/DSVT/dsvt/ops/ingroup_inds/src/ingroup_inds_kernel.cu b/mmde/projects/DSVT/dsvt/ops/ingroup_inds/src/ingroup_inds_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b1aeeb3769bfdb5d62dce93ef99dfa9fdc784a29
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/ops/ingroup_inds/src/ingroup_inds_kernel.cu
@@ -0,0 +1,79 @@
+#include <assert.h>
+#include <vector>
+#include <math.h>
+#include <stdio.h>
+#include <torch/serialize/tensor.h>
+#include <torch/extension.h>
+#include <torch/types.h>
+#include "cuda_fp16.h"
+// #include "error.cuh"
+
+#define CHECK_CALL(call)                                   \
+do                                                    \
+{                                                     \
+    const cudaError_t error_code = call;              \
+    if (error_code != cudaSuccess)                    \
+    {                                                 \
+        printf("CUDA Error:\n");                      \
+        printf("    File:       %s\n", __FILE__);     \
+        printf("    Line:       %d\n", __LINE__);     \
+        printf("    Error code: %d\n", error_code);   \
+        printf("    Error text: %s\n",                \
+            cudaGetErrorString(error_code));          \
+        exit(1);                                      \
+    }                                                 \
+} while (0)
+
+#define THREADS_PER_BLOCK 256
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+// #define DEBUG
+// #define ASSERTION
+
+__global__ void ingroup_inds_kernel(
+    const long *group_inds,
+    long *out_inds,
+    int *ingroup_counter,
+    int N
+) {
+
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= N) return;
+  long this_group_id = group_inds[idx];
+
+  int cnt = atomicAdd(&ingroup_counter[this_group_id], 1);
+  out_inds[idx] = cnt;
+}
+
+
+ void ingroup_inds_launcher(
+  const long *group_inds,
+  long *out_inds,
+  int N,
+  int max_group_id
+  ) {
+
+  int *ingroup_counter = NULL;
+  CHECK_CALL(cudaMalloc(&ingroup_counter,   (max_group_id + 1) * sizeof(int)));
+  CHECK_CALL(cudaMemset(ingroup_counter, 0, (max_group_id + 1) * sizeof(int)));
+
+  dim3 blocks(DIVUP(N, THREADS_PER_BLOCK));
+  dim3 threads(THREADS_PER_BLOCK);
+
+  ingroup_inds_kernel<<<blocks, threads>>>(
+      group_inds,
+      out_inds,
+      ingroup_counter,
+      N
+  );
+
+  cudaFree(ingroup_counter);
+
+  #ifdef DEBUG
+  CHECK_CALL(cudaGetLastError());
+  CHECK_CALL(cudaDeviceSynchronize());
+  #endif
+
+  return;
+
+}
diff --git a/mmde/projects/DSVT/dsvt/res_second.py b/mmde/projects/DSVT/dsvt/res_second.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8775e34e8009ba2c296582acb5dea3afe6e4519
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/res_second.py
@@ -0,0 +1,122 @@
+# modified from https://github.com/Haiyang-W/DSVT
+from typing import Sequence, Tuple
+
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.registry import MODELS
+from mmdet3d.utils import OptMultiConfig
+
+
+class BasicResBlock(nn.Module):
+    expansion: int = 1
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        padding: int = 1,
+        downsample: bool = False,
+    ) -> None:
+        super().__init__()
+        self.conv1 = nn.Conv2d(
+            inplanes,
+            planes,
+            kernel_size=3,
+            stride=stride,
+            padding=padding,
+            bias=False)
+        self.bn1 = nn.BatchNorm2d(planes, eps=1e-3, momentum=0.01)
+        self.relu1 = nn.ReLU()
+        self.conv2 = nn.Conv2d(
+            planes, planes, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes, eps=1e-3, momentum=0.01)
+        self.relu2 = nn.ReLU()
+        self.downsample = downsample
+        if self.downsample:
+            self.downsample_layer = nn.Sequential(
+                nn.Conv2d(
+                    inplanes,
+                    planes,
+                    kernel_size=1,
+                    stride=stride,
+                    padding=0,
+                    bias=False),
+                nn.BatchNorm2d(planes, eps=1e-3, momentum=0.01))
+        self.stride = stride
+
+    def forward(self, x: Tensor) -> Tensor:
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu1(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample:
+            identity = self.downsample_layer(x)
+
+        out += identity
+        out = self.relu2(out)
+
+        return out
+
+
+@MODELS.register_module()
+class ResSECOND(BaseModule):
+    """Backbone network for DSVT. The difference between `ResSECOND` and
+    `SECOND` is that the basic block in this module contains residual layers.
+
+    Args:
+        in_channels (int): Input channels.
+        out_channels (list[int]): Output channels for multi-scale feature maps.
+        blocks_nums (list[int]): Number of blocks in each stage.
+        layer_strides (list[int]): Strides of each stage.
+        init_cfg (dict, optional): Config for weight initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int = 128,
+                 out_channels: Sequence[int] = [128, 128, 256],
+                 blocks_nums: Sequence[int] = [1, 2, 2],
+                 layer_strides: Sequence[int] = [2, 2, 2],
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(ResSECOND, self).__init__(init_cfg=init_cfg)
+        assert len(layer_strides) == len(blocks_nums)
+        assert len(out_channels) == len(blocks_nums)
+
+        in_filters = [in_channels, *out_channels[:-1]]
+        blocks = []
+        for i, block_num in enumerate(blocks_nums):
+            cur_layers = [
+                BasicResBlock(
+                    in_filters[i],
+                    out_channels[i],
+                    stride=layer_strides[i],
+                    downsample=True)
+            ]
+            for _ in range(block_num):
+                cur_layers.append(
+                    BasicResBlock(out_channels[i], out_channels[i]))
+            blocks.append(nn.Sequential(*cur_layers))
+        self.blocks = nn.Sequential(*blocks)
+
+    def forward(self, x: Tensor) -> Tuple[Tensor, ...]:
+        """Forward function.
+
+        Args:
+            x (torch.Tensor): Input with shape (N, C, H, W).
+
+        Returns:
+            tuple[torch.Tensor]: Multi-scale features.
+        """
+        outs = []
+        for i in range(len(self.blocks)):
+            x = self.blocks[i](x)
+            outs.append(x)
+        return tuple(outs)
diff --git a/mmde/projects/DSVT/dsvt/transforms_3d.py b/mmde/projects/DSVT/dsvt/transforms_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff0c9a2314b4a3d1573bd83ca20581c9087c99ec
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/transforms_3d.py
@@ -0,0 +1,116 @@
+from typing import List
+
+import numpy as np
+from mmcv import BaseTransform
+
+from mmdet3d.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class ObjectRangeFilter3D(BaseTransform):
+    """Filter objects by the range. It differs from `ObjectRangeFilter` by
+    using `in_range_3d` instead of `in_range_bev`.
+
+    Required Keys:
+
+    - gt_bboxes_3d
+
+    Modified Keys:
+
+    - gt_bboxes_3d
+
+    Args:
+        point_cloud_range (list[float]): Point cloud range.
+    """
+
+    def __init__(self, point_cloud_range: List[float]) -> None:
+        self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to filter objects by the range.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d'
+            keys are updated in the result dict.
+        """
+        gt_bboxes_3d = input_dict['gt_bboxes_3d']
+        gt_labels_3d = input_dict['gt_labels_3d']
+        mask = gt_bboxes_3d.in_range_3d(self.pcd_range)
+        gt_bboxes_3d = gt_bboxes_3d[mask]
+        # mask is a torch tensor but gt_labels_3d is still numpy array
+        # using mask to index gt_labels_3d will cause bug when
+        # len(gt_labels_3d) == 1, where mask=1 will be interpreted
+        # as gt_labels_3d[1] and cause out of index error
+        gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)]
+
+        # limit rad to [-pi, pi]
+        gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
+        input_dict['gt_bboxes_3d'] = gt_bboxes_3d
+        input_dict['gt_labels_3d'] = gt_labels_3d
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class PointsRangeFilter3D(BaseTransform):
+    """Filter points by the range. It differs from `PointRangeFilter` by using
+    `in_range_bev` instead of `in_range_3d`.
+
+    Required Keys:
+
+    - points
+    - pts_instance_mask (optional)
+
+    Modified Keys:
+
+    - points
+    - pts_instance_mask (optional)
+
+    Args:
+        point_cloud_range (list[float]): Point cloud range.
+    """
+
+    def __init__(self, point_cloud_range: List[float]) -> None:
+        self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+    def transform(self, input_dict: dict) -> dict:
+        """Transform function to filter points by the range.
+
+        Args:
+            input_dict (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after filtering, 'points', 'pts_instance_mask'
+            and 'pts_semantic_mask' keys are updated in the result dict.
+        """
+        points = input_dict['points']
+        points_mask = points.in_range_bev(self.pcd_range[[0, 1, 3, 4]])
+        clean_points = points[points_mask]
+        input_dict['points'] = clean_points
+        points_mask = points_mask.numpy()
+
+        pts_instance_mask = input_dict.get('pts_instance_mask', None)
+        pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
+
+        if pts_instance_mask is not None:
+            input_dict['pts_instance_mask'] = pts_instance_mask[points_mask]
+
+        if pts_semantic_mask is not None:
+            input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask]
+
+        return input_dict
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
+        return repr_str
diff --git a/mmde/projects/DSVT/dsvt/utils.py b/mmde/projects/DSVT/dsvt/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..706ee04280dfa72cb41847dbcd531b6efc521df0
--- /dev/null
+++ b/mmde/projects/DSVT/dsvt/utils.py
@@ -0,0 +1,440 @@
+from typing import Dict, List, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmdet.models.losses.utils import weighted_loss
+from torch import Tensor
+
+from mmdet3d.models.task_modules import CenterPointBBoxCoder
+from mmdet3d.registry import MODELS, TASK_UTILS
+from .ops.ingroup_inds.ingroup_inds_op import ingroup_inds
+
+get_inner_win_inds_cuda = ingroup_inds
+
+
+class PositionEmbeddingLearned(nn.Module):
+    """Absolute pos embedding, learned."""
+
+    def __init__(self, input_channel, num_pos_feats):
+        super().__init__()
+        self.position_embedding_head = nn.Sequential(
+            nn.Linear(input_channel, num_pos_feats),
+            nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True),
+            nn.Linear(num_pos_feats, num_pos_feats))
+
+    def forward(self, xyz):
+        position_embedding = self.position_embedding_head(xyz)
+        return position_embedding
+
+
+@torch.no_grad()
+def get_window_coors(coors,
+                     sparse_shape,
+                     window_shape,
+                     do_shift,
+                     shift_list=None,
+                     return_win_coors=False):
+
+    if len(window_shape) == 2:
+        win_shape_x, win_shape_y = window_shape
+        win_shape_z = sparse_shape[-1]
+    else:
+        win_shape_x, win_shape_y, win_shape_z = window_shape
+
+    sparse_shape_x, sparse_shape_y, sparse_shape_z = sparse_shape
+    assert sparse_shape_z < sparse_shape_x, 'Usually holds... in case of wrong order'  # noqa: E501
+
+    max_num_win_x = int(np.ceil((sparse_shape_x / win_shape_x)) +
+                        1)  # plus one here to meet the needs of shift.
+    max_num_win_y = int(np.ceil((sparse_shape_y / win_shape_y)) +
+                        1)  # plus one here to meet the needs of shift.
+    max_num_win_z = int(np.ceil((sparse_shape_z / win_shape_z)) +
+                        1)  # plus one here to meet the needs of shift.
+    max_num_win_per_sample = max_num_win_x * max_num_win_y * max_num_win_z
+
+    if do_shift:
+        if shift_list is not None:
+            shift_x, shift_y, shift_z = shift_list[0], shift_list[
+                1], shift_list[2]
+        else:
+            shift_x, shift_y, shift_z = win_shape_x // 2, win_shape_y // 2, win_shape_z // 2  # noqa: E501
+    else:
+        if shift_list is not None:
+            shift_x, shift_y, shift_z = shift_list[0], shift_list[
+                1], shift_list[2]
+        else:
+            shift_x, shift_y, shift_z = win_shape_x, win_shape_y, win_shape_z
+
+    # compatibility between 2D window and 3D window
+    if sparse_shape_z == win_shape_z:
+        shift_z = 0
+
+    shifted_coors_x = coors[:, 3] + shift_x
+    shifted_coors_y = coors[:, 2] + shift_y
+    shifted_coors_z = coors[:, 1] + shift_z
+
+    win_coors_x = shifted_coors_x // win_shape_x
+    win_coors_y = shifted_coors_y // win_shape_y
+    win_coors_z = shifted_coors_z // win_shape_z
+
+    if len(window_shape) == 2:
+        assert (win_coors_z == 0).all()
+
+    batch_win_inds = coors[:, 0] * max_num_win_per_sample + \
+        win_coors_x * max_num_win_y * max_num_win_z + \
+        win_coors_y * max_num_win_z + win_coors_z
+
+    coors_in_win_x = shifted_coors_x % win_shape_x
+    coors_in_win_y = shifted_coors_y % win_shape_y
+    coors_in_win_z = shifted_coors_z % win_shape_z
+    coors_in_win = torch.stack(
+        [coors_in_win_z, coors_in_win_y, coors_in_win_x], dim=-1)
+    # coors_in_win = torch.stack([coors_in_win_x, coors_in_win_y], dim=-1)
+    if return_win_coors:
+        batch_win_coords = torch.stack([win_coors_z, win_coors_y, win_coors_x],
+                                       dim=-1)
+        return batch_win_inds, coors_in_win, batch_win_coords
+
+    return batch_win_inds, coors_in_win
+
+
+def get_pooling_index(coors, sparse_shape, window_shape):
+    win_shape_x, win_shape_y, win_shape_z = window_shape
+    sparse_shape_x, sparse_shape_y, sparse_shape_z = sparse_shape
+
+    max_num_win_x = int(np.ceil((sparse_shape_x / win_shape_x)))
+    max_num_win_y = int(np.ceil((sparse_shape_y / win_shape_y)))
+    max_num_win_z = int(np.ceil((sparse_shape_z / win_shape_z)))
+    max_num_win_per_sample = max_num_win_x * max_num_win_y * max_num_win_z
+
+    coors_x = coors[:, 3]
+    coors_y = coors[:, 2]
+    coors_z = coors[:, 1]
+
+    win_coors_x = coors_x // win_shape_x
+    win_coors_y = coors_y // win_shape_y
+    win_coors_z = coors_z // win_shape_z
+
+    batch_win_inds = coors[:, 0] * max_num_win_per_sample + \
+        win_coors_x * max_num_win_y * max_num_win_z + \
+        win_coors_y * max_num_win_z + win_coors_z
+
+    coors_in_win_x = coors_x % win_shape_x
+    coors_in_win_y = coors_y % win_shape_y
+    coors_in_win_z = coors_z % win_shape_z
+    coors_in_win = torch.stack(
+        [coors_in_win_z, coors_in_win_y, coors_in_win_x], dim=-1)
+
+    index_in_win = coors_in_win_x * win_shape_y * win_shape_z + \
+        coors_in_win_y * win_shape_z + coors_in_win_z
+
+    batch_win_coords = torch.stack(
+        [coors[:, 0], win_coors_z, win_coors_y, win_coors_x], dim=-1)
+    return batch_win_inds, coors_in_win, index_in_win, batch_win_coords
+
+
+def get_continous_inds(setnum_per_win):
+    '''
+    Args:
+        setnum_per_win (Tensor[int]): Number of sets assigned to each window
+            with shape (win_num).
+    Returns:
+        set_win_inds (Tensor[int]): Window indices of each set with shape
+            (set_num).
+        set_inds_in_win (Tensor[int]): Set indices inner window with shape
+            (set_num).
+
+    Examples:
+        setnum_per_win = torch.tensor([1, 2, 1, 3])
+        set_inds_in_win = get_continous_inds(setnum_per_win)
+        # we can get: set_inds_in_win = tensor([0, 0, 1, 0, 0, 1, 2])
+    '''
+    set_num = setnum_per_win.sum().item()  # set_num = 7
+    setnum_per_win_cumsum = torch.cumsum(
+        setnum_per_win, dim=0)[:-1]  # [1, 3, 4]
+    set_win_inds = torch.full((set_num, ), 0, device=setnum_per_win.device)
+    set_win_inds[setnum_per_win_cumsum] = 1  # [0, 1, 0, 1, 1, 0, 0]
+    set_win_inds = torch.cumsum(set_win_inds, dim=0)  # [0, 1, 1, 2, 3, 3, 3]
+
+    roll_set_win_inds_left = torch.roll(set_win_inds,
+                                        -1)  # [1, 1, 2, 3, 3, 3, 0]
+    diff = set_win_inds - roll_set_win_inds_left  # [-1, 0, -1, -1, 0, 0, 3]
+    end_pos_mask = diff != 0
+    template = torch.ones_like(set_win_inds)
+    template[end_pos_mask] = (setnum_per_win -
+                              1) * -1  # [ 0, 1, -1, 0, 1, 1, -2]
+    set_inds_in_win = torch.cumsum(template, dim=0)  # [0, 1, 0, 0, 1, 2, 0]
+    set_inds_in_win[end_pos_mask] = setnum_per_win  # [1, 1, 2, 1, 1, 2, 3]
+    set_inds_in_win = set_inds_in_win - 1  # [0, 0, 1, 0, 0, 1, 2]
+
+    return set_win_inds, set_inds_in_win
+
+
+@TASK_UTILS.register_module()
+class DSVTBBoxCoder(CenterPointBBoxCoder):
+    """Bbox coder for DSVT.
+
+    Compared with `CenterPointBBoxCoder`, this coder contains IoU predictions
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super(DSVTBBoxCoder, self).__init__(*args, **kwargs)
+
+    def decode(self,
+               heat: Tensor,
+               rot_sine: Tensor,
+               rot_cosine: Tensor,
+               hei: Tensor,
+               dim: Tensor,
+               vel: Tensor,
+               reg: Optional[Tensor] = None,
+               iou: Optional[Tensor] = None) -> List[Dict[str, Tensor]]:
+        """
+
+        Args:
+            heat (torch.Tensor): Heatmap with the shape of [B, N, W, H].
+            rot_sine (torch.Tensor): Sine of rotation with the shape of
+                [B, 1, W, H].
+            rot_cosine (torch.Tensor): Cosine of rotation with the shape of
+                [B, 1, W, H].
+            hei (torch.Tensor): Height of the boxes with the shape
+                of [B, 1, W, H].
+            dim (torch.Tensor): Dim of the boxes with the shape of
+                [B, 1, W, H].
+            vel (torch.Tensor): Velocity with the shape of [B, 1, W, H].
+            reg (torch.Tensor, optional): Regression value of the boxes in
+                2D with the shape of [B, 2, W, H]. Default: None.
+
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        batch, cat, _, _ = heat.size()
+
+        scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num)
+
+        if reg is not None:
+            reg = self._transpose_and_gather_feat(reg, inds)
+            reg = reg.view(batch, self.max_num, 2)
+            xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1]
+            ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2]
+        else:
+            xs = xs.view(batch, self.max_num, 1) + 0.5
+            ys = ys.view(batch, self.max_num, 1) + 0.5
+
+        # rotation value and direction label
+        rot_sine = self._transpose_and_gather_feat(rot_sine, inds)
+        rot_sine = rot_sine.view(batch, self.max_num, 1)
+
+        rot_cosine = self._transpose_and_gather_feat(rot_cosine, inds)
+        rot_cosine = rot_cosine.view(batch, self.max_num, 1)
+        rot = torch.atan2(rot_sine, rot_cosine)
+
+        # height in the bev
+        hei = self._transpose_and_gather_feat(hei, inds)
+        hei = hei.view(batch, self.max_num, 1)
+
+        # dim of the box
+        dim = self._transpose_and_gather_feat(dim, inds)
+        dim = dim.view(batch, self.max_num, 3)
+
+        # class label
+        clses = clses.view(batch, self.max_num).float()
+        scores = scores.view(batch, self.max_num)
+
+        xs = xs.view(
+            batch, self.max_num,
+            1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0]
+        ys = ys.view(
+            batch, self.max_num,
+            1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1]
+
+        if vel is None:  # KITTI FORMAT
+            final_box_preds = torch.cat([xs, ys, hei, dim, rot], dim=2)
+        else:  # exist velocity, nuscene format
+            vel = self._transpose_and_gather_feat(vel, inds)
+            vel = vel.view(batch, self.max_num, 2)
+            final_box_preds = torch.cat([xs, ys, hei, dim, rot, vel], dim=2)
+        if iou is not None:
+            iou = self._transpose_and_gather_feat(iou, inds).view(
+                batch, self.max_num)
+
+        final_scores = scores
+        final_preds = clses
+
+        # use score threshold
+        if self.score_threshold is not None:
+            thresh_mask = final_scores > self.score_threshold
+
+        if self.post_center_range is not None:
+            self.post_center_range = torch.as_tensor(
+                self.post_center_range, device=heat.device)
+            mask = (final_box_preds[..., :3] >=
+                    self.post_center_range[:3]).all(2)
+            mask &= (final_box_preds[..., :3] <=
+                     self.post_center_range[3:]).all(2)
+
+            predictions_dicts = []
+            for i in range(batch):
+                cmask = mask[i, :]
+                if self.score_threshold:
+                    cmask &= thresh_mask[i]
+
+                boxes3d = final_box_preds[i, cmask]
+                scores = final_scores[i, cmask]
+                labels = final_preds[i, cmask]
+                predictions_dict = {
+                    'bboxes': boxes3d,
+                    'scores': scores,
+                    'labels': labels,
+                }
+                if iou is not None:
+                    pred_iou = iou[i, cmask]
+                    predictions_dict['iou'] = pred_iou
+
+                predictions_dicts.append(predictions_dict)
+        else:
+            raise NotImplementedError(
+                'Need to reorganize output as a batch, only '
+                'support post_center_range is not None for now!')
+
+        return predictions_dicts
+
+
+def center_to_corner2d(center, dim):
+    corners_norm = torch.tensor(
+        [[-0.5, -0.5], [-0.5, 0.5], [0.5, 0.5], [0.5, -0.5]],
+        device=dim.device).type_as(center)  # (4, 2)
+    corners = dim.view([-1, 1, 2]) * corners_norm.view([1, 4, 2])  # (N, 4, 2)
+    corners = corners + center.view(-1, 1, 2)
+    return corners
+
+
+@weighted_loss
+def diou3d_loss(pred_boxes, gt_boxes, eps: float = 1e-7):
+    """
+    modified from https://github.com/agent-sgs/PillarNet/blob/master/det3d/core/utils/center_utils.py # noqa
+    Args:
+        pred_boxes (N, 7):
+        gt_boxes (N, 7):
+
+    Returns:
+        Tensor: Distance-IoU Loss.
+    """
+    assert pred_boxes.shape[0] == gt_boxes.shape[0]
+
+    qcorners = center_to_corner2d(pred_boxes[:, :2],
+                                  pred_boxes[:, 3:5])  # (N, 4, 2)
+    gcorners = center_to_corner2d(gt_boxes[:, :2], gt_boxes[:,
+                                                            3:5])  # (N, 4, 2)
+
+    inter_max_xy = torch.minimum(qcorners[:, 2], gcorners[:, 2])
+    inter_min_xy = torch.maximum(qcorners[:, 0], gcorners[:, 0])
+    out_max_xy = torch.maximum(qcorners[:, 2], gcorners[:, 2])
+    out_min_xy = torch.minimum(qcorners[:, 0], gcorners[:, 0])
+
+    # calculate area
+    volume_pred_boxes = pred_boxes[:, 3] * pred_boxes[:, 4] * pred_boxes[:, 5]
+    volume_gt_boxes = gt_boxes[:, 3] * gt_boxes[:, 4] * gt_boxes[:, 5]
+
+    inter_h = torch.minimum(
+        pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5],
+        gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5]) - torch.maximum(
+            pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5],
+            gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5])
+    inter_h = torch.clamp(inter_h, min=0)
+
+    inter = torch.clamp((inter_max_xy - inter_min_xy), min=0)
+    volume_inter = inter[:, 0] * inter[:, 1] * inter_h
+    volume_union = volume_gt_boxes + volume_pred_boxes - volume_inter + eps
+
+    # boxes_iou3d_gpu(pred_boxes, gt_boxes)
+    inter_diag = torch.pow(gt_boxes[:, 0:3] - pred_boxes[:, 0:3], 2).sum(-1)
+
+    outer_h = torch.maximum(
+        gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5],
+        pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5]) - torch.minimum(
+            gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5],
+            pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5])
+    outer_h = torch.clamp(outer_h, min=0)
+    outer = torch.clamp((out_max_xy - out_min_xy), min=0)
+    outer_diag = outer[:, 0]**2 + outer[:, 1]**2 + outer_h**2 + eps
+
+    dious = volume_inter / volume_union - inter_diag / outer_diag
+    dious = torch.clamp(dious, min=-1.0, max=1.0)
+
+    loss = 1 - dious
+
+    return loss
+
+
+@MODELS.register_module()
+class DIoU3DLoss(nn.Module):
+    r"""3D bboxes Implementation of `Distance-IoU Loss: Faster and Better
+    Learning for Bounding Box Regression <https://arxiv.org/abs/1911.08287>`_.
+
+    Code is modified from https://github.com/Zzh-tju/DIoU.
+
+    Args:
+        eps (float): Epsilon to avoid log(0). Defaults to 1e-6.
+        reduction (str): Options are "none", "mean" and "sum".
+            Defaults to "mean".
+        loss_weight (float): Weight of loss. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 eps: float = 1e-6,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super().__init__()
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[int] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+                shape (n, 4).
+            target (Tensor): The learning target of the prediction,
+                shape (n, 4).
+            weight (Optional[Tensor], optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (Optional[int], optional): Average factor that is used
+                to average the loss. Defaults to None.
+            reduction_override (Optional[str], optional): The reduction method
+                used to override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        if weight is not None and not torch.any(weight > 0):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # giou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * diou3d_loss(
+            pred,
+            target,
+            weight,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+        return loss
diff --git a/mmde/projects/DSVT/setup.py b/mmde/projects/DSVT/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..7640f9a96b9e8affd7fba069052c0aa3f3a5250a
--- /dev/null
+++ b/mmde/projects/DSVT/setup.py
@@ -0,0 +1,59 @@
+import os
+from setuptools import setup
+
+import torch
+from torch.utils.cpp_extension import (BuildExtension, CppExtension,
+                                       CUDAExtension)
+
+
+def make_cuda_ext(name,
+                  module,
+                  sources,
+                  sources_cuda=[],
+                  extra_args=[],
+                  extra_include_path=[]):
+
+    define_macros = []
+    extra_compile_args = {'cxx': [] + extra_args}
+
+    if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
+        define_macros += [('WITH_CUDA', None)]
+        extension = CUDAExtension
+        extra_compile_args['nvcc'] = extra_args + [
+            '-D__CUDA_NO_HALF_OPERATORS__',
+            '-D__CUDA_NO_HALF_CONVERSIONS__',
+            '-D__CUDA_NO_HALF2_OPERATORS__',
+            '-gencode=arch=compute_70,code=sm_70',
+            '-gencode=arch=compute_75,code=sm_75',
+            '-gencode=arch=compute_80,code=sm_80',
+            '-gencode=arch=compute_86,code=sm_86',
+        ]
+        sources += sources_cuda
+    else:
+        print('Compiling {} without CUDA'.format(name))
+        extension = CppExtension
+
+    return extension(
+        name='{}.{}'.format(module, name),
+        sources=[os.path.join(*module.split('.'), p) for p in sources],
+        include_dirs=extra_include_path,
+        define_macros=define_macros,
+        extra_compile_args=extra_compile_args,
+    )
+
+
+if __name__ == '__main__':
+    setup(
+        name='dsvt',
+        ext_modules=[
+            make_cuda_ext(
+                name='ingroup_inds_cuda',
+                module='projects.DSVT.dsvt.ops.ingroup_inds',
+                sources=[
+                    'src/ingroup_inds.cpp',
+                    'src/ingroup_inds_kernel.cu',
+                ]),
+        ],
+        cmdclass={'build_ext': BuildExtension},
+        zip_safe=False,
+    )
diff --git a/mmde/projects/NeRF-Det/README.md b/mmde/projects/NeRF-Det/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..93119895e92b869c43f689981d4183ef15f8ddfa
--- /dev/null
+++ b/mmde/projects/NeRF-Det/README.md
@@ -0,0 +1,115 @@
+# NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection
+
+> [NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection](https://arxiv.org/abs/2307.14620)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+NeRF-Det is a novel method for indoor 3D detection with posed RGB images as input. Unlike existing indoor 3D detection methods that struggle to model scene geometry, NeRF-Det makes novel use of NeRF in an end-to-end manner to explicitly estimate 3D geometry, thereby improving 3D detection performance. Specifically, to avoid the significant extra latency associated with per-scene optimization of NeRF, NeRF-Det introduce sufficient geometry priors to enhance the generalizability of NeRF-MLP. Furthermore, it subtly connect the detection and NeRF branches through a shared MLP, enabling an efficient adaptation of NeRF to detection and yielding geometry-aware volumetric representations for 3D detection. NeRF-Det outperforms state-of-the-arts by 3.9 mAP and 3.1 mAP on the ScanNet and ARKITScenes benchmarks, respectively. The author provide extensive analysis to shed light on how NeRF-Det works. As a result of joint-training design,  NeRF-Det is able to generalize well to unseen scenes for object detection, view synthesis, and depth estimation tasks without requiring per-scene optimization. Code will be available at https://github.com/facebookresearch/NeRF-Det
+
+<div align=center>
+<img src="https://chenfengxu714.github.io/nerfdet/static/images/method-cropped_1.png" width="800"/>
+</div>
+
+## Introduction
+
+This directory contains the implementations of NeRF-Det (https://arxiv.org/abs/2307.14620). Our implementations are built on top of MMdetection3D. We have updated NeRF-Det to be compatible with latest mmdet3d version. The codebase and config files have all changed to adapt to the new mmdet3d version. All previous pretrained models are verified with the result listed below. However, newly trained models are yet to be uploaded.
+
+<!-- Share any information you would like others to know. For example:
+Author: @xxx.
+This is an implementation of \[XXX\]. -->
+
+## Dataset
+
+The format of the scannet dataset in the latest version of mmdet3d only supports the lidar tasks. For NeRF-Det, we need to create the new format of ScanNet Dataset.
+
+Please following the files in mmdet3d to prepare the raw data of ScanNet. After that, please use this command to generate the pkls used in nerfdet.
+
+```bash
+python projects/NeRF-Det/prepare_infos.py --root-path ./data/scannet --out-dir ./data/scannet
+```
+
+The new format of the pkl is organized as below:
+
+- scannet_infos_train.pkl: The train data infos, the detailed info of each scan is as follows:
+  - info\['instances'\]:A list of dict contains all annotations, each dict contains all annotation information of single instance.For the i-th instance:
+    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 6 numbers representing the axis_aligned in depth coordinate system, in (x,y,z,l,w,h) order.
+    - info\['instances'\]\[i\]\['bbox_label_3d'\]: The label of each 3d bounding boxes.
+  - info\['cam2img'\]: The intrinsic matrix.Every scene has one matrix.
+  - info\['lidar2cam'\]: The extrinsic matrixes.Every scene has 300 matrixes.
+  - info\['img_paths'\]: The paths of the 300 rgb pictures.
+  - info\['axis_align_matrix'\]: The align matrix.Every scene has one matrix.
+
+After preparing your scannet dataset pkls,please change the paths in configs to fit your project.
+
+## Train
+
+In MMDet3D's root directory, run the following command to train the model:
+
+```bash
+python tools/train.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py ${WORK_DIR}
+```
+
+## Results and Models
+
+### NeRF-Det
+
+|                            Backbone                             | mAP@25 | mAP@50 |    Log    |
+| :-------------------------------------------------------------: | :----: | :----: | :-------: |
+|      [NeRF-Det-R50](./configs/nerfdet_res50_2x_low_res.py)      |  53.0  |  26.8  | [log](<>) |
+|  [NeRF-Det-R50\*](./configs/nerfdet_res50_2x_low_res_depth.py)  |  52.2  |  28.5  | [log](<>) |
+| [NeRF-Det-R101\*](./configs/nerfdet_res101_2x_low_res_depth.py) |  52.3  |  28.5  | [log](<>) |
+
+(Here NeRF-Det-R50\* means this model uses depth information in the training step)
+
+### Notes
+
+- The values showed in the chart all represents the best mAP in the training.
+
+- Since there is a lot of randomness in the behavior of the model, we conducted three experiments on each config and took the average. The mAP showed on the above chart are all average values.
+
+- We also conducted the same experiments in the original code, the results are showed below.
+
+  |    Backbone     | mAP@25 | mAP@50 |
+  | :-------------: | :----: | :----: |
+  |  NeRF-Det-R50   |  52.8  |  26.8  |
+  | NeRF-Det-R50\*  |  52.4  |  27.5  |
+  | NeRF-Det-R101\* |  52.8  |  28.6  |
+
+- Attention: Because of the randomness in the construction of the ScanNet dataset itself and the behavior of the model, the training results will fluctuate considerably. According to experimental results and experience, the experimental results will fluctuate by plus or minus 1.5 points.
+
+## Evaluation using pretrained models
+
+1. Download the pretrained checkpoints through the linkings in the above chart.
+
+2. Testing
+
+   To test, use:
+
+   ```bash
+   python tools/test.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py ${CHECKPOINT_PATH}
+   ```
+
+## Citation
+
+<!-- You may remove this section if not applicable. -->
+
+```latex
+@inproceedings{
+  xu2023nerfdet,
+  title={NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection},
+  author={Xu, Chenfeng and Wu, Bichen and Hou, Ji and Tsai, Sam and Li, Ruilong and Wang, Jialiang and Zhan, Wei and He, Zijian and Vajda, Peter and Keutzer, Kurt and Tomizuka, Masayoshi},
+  booktitle={ICCV},
+  year={2023},
+}
+
+@inproceedings{
+park2023time,
+title={Time Will Tell: New Outlooks and A Baseline for Temporal Multi-View 3D Object Detection},
+author={Jinhyung Park and Chenfeng Xu and Shijia Yang and Kurt Keutzer and Kris M. Kitani and Masayoshi Tomizuka and Wei Zhan},
+booktitle={The Eleventh International Conference on Learning Representations },
+year={2023},
+url={https://openreview.net/forum?id=H3HcEJA2Um}
+}
+```
diff --git a/mmde/projects/NeRF-Det/configs/nerfdet_res101_2x_low_res_depth.py b/mmde/projects/NeRF-Det/configs/nerfdet_res101_2x_low_res_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3c639f19eef990d05b4074d51e7cce1f0e3b301
--- /dev/null
+++ b/mmde/projects/NeRF-Det/configs/nerfdet_res101_2x_low_res_depth.py
@@ -0,0 +1,198 @@
+_base_ = ['../../../configs/_base_/default_runtime.py']
+
+custom_imports = dict(imports=['projects.NeRF-Det.nerfdet'])
+prior_generator = dict(
+    type='AlignedAnchor3DRangeGenerator',
+    ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]],
+    rotations=[.0])
+
+model = dict(
+    type='NerfDet',
+    data_preprocessor=dict(
+        type='NeRFDetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=10),
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
+        style='pytorch'),
+    neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=4),
+    neck_3d=dict(
+        type='IndoorImVoxelNeck',
+        in_channels=256,
+        out_channels=128,
+        n_blocks=[1, 1, 1]),
+    bbox_head=dict(
+        type='NerfDetHead',
+        bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0),
+        n_classes=18,
+        n_levels=3,
+        n_channels=128,
+        n_reg_outs=6,
+        pts_assign_threshold=27,
+        pts_center_threshold=18,
+        prior_generator=prior_generator),
+    prior_generator=prior_generator,
+    voxel_size=[.16, .16, .2],
+    n_voxels=[40, 40, 16],
+    aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]),
+    near_far_range=[0.2, 8.0],
+    N_samples=64,
+    N_rand=2048,
+    nerf_mode='image',
+    depth_supervise=True,
+    use_nerf_mask=True,
+    nerf_sample_view=20,
+    squeeze_scale=4,
+    nerf_density=True,
+    train_cfg=dict(),
+    test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
+
+dataset_type = 'MultiViewScanNetDataset'
+data_root = 'data/scannet/'
+class_names = [
+    'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
+    'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
+    'toilet', 'sink', 'bathtub', 'garbagebin'
+]
+metainfo = dict(CLASSES=class_names)
+file_client_args = dict(backend='disk')
+
+input_modality = dict(
+    use_camera=True,
+    use_depth=True,
+    use_lidar=False,
+    use_neuralrecon_depth=False,
+    use_ray=True)
+backend_args = None
+
+train_collect_keys = [
+    'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes',
+    'raydirs', 'gt_images', 'gt_depths', 'denorm_images'
+]
+
+test_collect_keys = [
+    'img',
+    'depth',
+    'lightpos',
+    'nerf_sizes',
+    'raydirs',
+    'gt_images',
+    'gt_depths',
+    'denorm_images',
+]
+
+train_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='MultiViewPipeline',
+        n_images=48,
+        transforms=[
+            dict(type='LoadImageFromFile', file_client_args=file_client_args),
+            dict(type='Resize', scale=(320, 240), keep_ratio=True),
+        ],
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        margin=10,
+        depth_range=[0.5, 5.5],
+        loading='random',
+        nerf_target_views=10),
+    dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
+    dict(type='PackNeRFDetInputs', keys=train_collect_keys)
+]
+
+test_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='MultiViewPipeline',
+        n_images=101,
+        transforms=[
+            dict(type='LoadImageFromFile', file_client_args=file_client_args),
+            dict(type='Resize', scale=(320, 240), keep_ratio=True),
+        ],
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        margin=10,
+        depth_range=[0.5, 5.5],
+        loading='random',
+        nerf_target_views=1),
+    dict(type='PackNeRFDetInputs', keys=test_collect_keys)
+]
+
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=6,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train_new.pkl',
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            filter_empty_gt=True,
+            box_type_3d='Depth',
+            metainfo=metainfo)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=5,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val_new.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        filter_empty_gt=True,
+        box_type_3d='Depth',
+        metainfo=metainfo))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+test_cfg = dict()
+val_cfg = dict()
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
+    clip_grad=dict(max_norm=35., norm_type=2))
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# hooks
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12))
+
+# runtime
+find_unused_parameters = True  # only 1 of 4 FPN outputs is used
diff --git a/mmde/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py b/mmde/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py
new file mode 100644
index 0000000000000000000000000000000000000000..0321d54bbabd23d5f8e299e9b372301e473446b1
--- /dev/null
+++ b/mmde/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py
@@ -0,0 +1,104 @@
+_base_ = ['./nerfdet_res50_2x_low_res_depth.py']
+
+model = dict(depth_supervise=False)
+
+dataset_type = 'MultiViewScanNetDataset'
+data_root = 'data/scannet/'
+class_names = [
+    'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
+    'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
+    'toilet', 'sink', 'bathtub', 'garbagebin'
+]
+metainfo = dict(CLASSES=class_names)
+file_client_args = dict(backend='disk')
+
+input_modality = dict(use_depth=False)
+backend_args = None
+
+train_collect_keys = [
+    'img', 'gt_bboxes_3d', 'gt_labels_3d', 'lightpos', 'nerf_sizes', 'raydirs',
+    'gt_images', 'gt_depths', 'denorm_images'
+]
+
+test_collect_keys = [
+    'img',
+    'lightpos',
+    'nerf_sizes',
+    'raydirs',
+    'gt_images',
+    'gt_depths',
+    'denorm_images',
+]
+
+train_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='MultiViewPipeline',
+        n_images=50,
+        transforms=[
+            dict(type='LoadImageFromFile', file_client_args=file_client_args),
+            dict(type='Resize', scale=(320, 240), keep_ratio=True),
+        ],
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        margin=10,
+        depth_range=[0.5, 5.5],
+        loading='random',
+        nerf_target_views=10),
+    dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
+    dict(type='PackNeRFDetInputs', keys=train_collect_keys)
+]
+
+test_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='MultiViewPipeline',
+        n_images=101,
+        transforms=[
+            dict(type='LoadImageFromFile', file_client_args=file_client_args),
+            dict(type='Resize', scale=(320, 240), keep_ratio=True),
+        ],
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        margin=10,
+        depth_range=[0.5, 5.5],
+        loading='random',
+        nerf_target_views=1),
+    dict(type='PackNeRFDetInputs', keys=test_collect_keys)
+]
+
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=6,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train_new.pkl',
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            filter_empty_gt=True,
+            box_type_3d='Depth',
+            metainfo=metainfo)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val_new.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        filter_empty_gt=True,
+        box_type_3d='Depth',
+        metainfo=metainfo))
+test_dataloader = val_dataloader
diff --git a/mmde/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res_depth.py b/mmde/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..0143a8084ab54948a8b8590ab8fe649e24e7f0ea
--- /dev/null
+++ b/mmde/projects/NeRF-Det/configs/nerfdet_res50_2x_low_res_depth.py
@@ -0,0 +1,198 @@
+_base_ = ['../../../configs/_base_/default_runtime.py']
+
+custom_imports = dict(imports=['projects.NeRF-Det.nerfdet'])
+prior_generator = dict(
+    type='AlignedAnchor3DRangeGenerator',
+    ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]],
+    rotations=[.0])
+
+model = dict(
+    type='NerfDet',
+    data_preprocessor=dict(
+        type='NeRFDetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=10),
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
+        style='pytorch'),
+    neck=dict(
+        type='mmdet.FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=4),
+    neck_3d=dict(
+        type='IndoorImVoxelNeck',
+        in_channels=256,
+        out_channels=128,
+        n_blocks=[1, 1, 1]),
+    bbox_head=dict(
+        type='NerfDetHead',
+        bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0),
+        n_classes=18,
+        n_levels=3,
+        n_channels=128,
+        n_reg_outs=6,
+        pts_assign_threshold=27,
+        pts_center_threshold=18,
+        prior_generator=prior_generator),
+    prior_generator=prior_generator,
+    voxel_size=[.16, .16, .2],
+    n_voxels=[40, 40, 16],
+    aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]),
+    near_far_range=[0.2, 8.0],
+    N_samples=64,
+    N_rand=2048,
+    nerf_mode='image',
+    depth_supervise=True,
+    use_nerf_mask=True,
+    nerf_sample_view=20,
+    squeeze_scale=4,
+    nerf_density=True,
+    train_cfg=dict(),
+    test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
+
+dataset_type = 'MultiViewScanNetDataset'
+data_root = 'data/scannet/'
+class_names = [
+    'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
+    'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
+    'toilet', 'sink', 'bathtub', 'garbagebin'
+]
+metainfo = dict(CLASSES=class_names)
+file_client_args = dict(backend='disk')
+
+input_modality = dict(
+    use_camera=True,
+    use_depth=True,
+    use_lidar=False,
+    use_neuralrecon_depth=False,
+    use_ray=True)
+backend_args = None
+
+train_collect_keys = [
+    'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes',
+    'raydirs', 'gt_images', 'gt_depths', 'denorm_images'
+]
+
+test_collect_keys = [
+    'img',
+    'depth',
+    'lightpos',
+    'nerf_sizes',
+    'raydirs',
+    'gt_images',
+    'gt_depths',
+    'denorm_images',
+]
+
+train_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='MultiViewPipeline',
+        n_images=50,
+        transforms=[
+            dict(type='LoadImageFromFile', file_client_args=file_client_args),
+            dict(type='Resize', scale=(320, 240), keep_ratio=True),
+        ],
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        margin=10,
+        depth_range=[0.5, 5.5],
+        loading='random',
+        nerf_target_views=10),
+    dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
+    dict(type='PackNeRFDetInputs', keys=train_collect_keys)
+]
+
+test_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='MultiViewPipeline',
+        n_images=101,
+        transforms=[
+            dict(type='LoadImageFromFile', file_client_args=file_client_args),
+            dict(type='Resize', scale=(320, 240), keep_ratio=True),
+        ],
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        margin=10,
+        depth_range=[0.5, 5.5],
+        loading='random',
+        nerf_target_views=1),
+    dict(type='PackNeRFDetInputs', keys=test_collect_keys)
+]
+
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=6,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train_new.pkl',
+            pipeline=train_pipeline,
+            modality=input_modality,
+            test_mode=False,
+            filter_empty_gt=True,
+            box_type_3d='Depth',
+            metainfo=metainfo)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=5,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val_new.pkl',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        filter_empty_gt=True,
+        box_type_3d='Depth',
+        metainfo=metainfo))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+test_cfg = dict()
+val_cfg = dict()
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
+    clip_grad=dict(max_norm=35., norm_type=2))
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# hooks
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12))
+
+# runtime
+find_unused_parameters = True  # only 1 of 4 FPN outputs is used
diff --git a/mmde/projects/NeRF-Det/nerfdet/__init__.py b/mmde/projects/NeRF-Det/nerfdet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ddef2f7be40792f00e56349a7c5076e8c644757
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/__init__.py
@@ -0,0 +1,11 @@
+from .data_preprocessor import NeRFDetDataPreprocessor
+from .formating import PackNeRFDetInputs
+from .multiview_pipeline import MultiViewPipeline, RandomShiftOrigin
+from .nerfdet import NerfDet
+from .nerfdet_head import NerfDetHead
+from .scannet_multiview_dataset import MultiViewScanNetDataset
+
+__all__ = [
+    'MultiViewScanNetDataset', 'MultiViewPipeline', 'RandomShiftOrigin',
+    'PackNeRFDetInputs', 'NeRFDetDataPreprocessor', 'NerfDetHead', 'NerfDet'
+]
diff --git a/mmde/projects/NeRF-Det/nerfdet/data_preprocessor.py b/mmde/projects/NeRF-Det/nerfdet/data_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..582a09f63c07084ae7a1ffedd7fe12e13eca43ad
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/data_preprocessor.py
@@ -0,0 +1,583 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from numbers import Number
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+from mmdet.models import DetDataPreprocessor
+from mmdet.models.utils.misc import samplelist_boxtype2tensor
+from mmengine.model import stack_batch
+from mmengine.utils import is_seq_of
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.models.data_preprocessors.utils import multiview_img_stack_batch
+from mmdet3d.models.data_preprocessors.voxelize import (
+    VoxelizationByGridShape, dynamic_scatter_3d)
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import OptConfigType
+
+
+@MODELS.register_module()
+class NeRFDetDataPreprocessor(DetDataPreprocessor):
+    """In NeRF-Det, some extra information is needed in NeRF branch. We put the
+    datapreprocessor operations of these new information such as stack and pack
+    operations in this class. You can find the stack operations in subfuction
+    'collate_data' and the pack operations in 'simple_process'. Other codes are
+    the same as the default class 'DetDataPreprocessor'.
+
+    Points / Image pre-processor for point clouds / vision-only / multi-
+    modality 3D detection tasks.
+
+    It provides the data pre-processing as follows
+
+    - Collate and move image and point cloud data to the target device.
+
+    - 1) For image data:
+
+      - Pad images in inputs to the maximum size of current batch with defined
+        ``pad_value``. The padding size can be divisible by a defined
+        ``pad_size_divisor``.
+      - Stack images in inputs to batch_imgs.
+      - Convert images in inputs from bgr to rgb if the shape of input is
+        (3, H, W).
+      - Normalize images in inputs with defined std and mean.
+      - Do batch augmentations during training.
+
+    - 2) For point cloud data:
+
+      - If no voxelization, directly return list of point cloud data.
+      - If voxelization is applied, voxelize point cloud according to
+        ``voxel_type`` and obtain ``voxels``.
+
+    Args:
+        voxel (bool): Whether to apply voxelization to point cloud.
+            Defaults to False.
+        voxel_type (str): Voxelization type. Two voxelization types are
+            provided: 'hard' and 'dynamic', respectively for hard voxelization
+            and dynamic voxelization. Defaults to 'hard'.
+        voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization layer
+            config. Defaults to None.
+        batch_first (bool): Whether to put the batch dimension to the first
+            dimension when getting voxel coordinates. Defaults to True.
+        max_voxels (int, optional): Maximum number of voxels in each voxel
+            grid. Defaults to None.
+        mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
+            Defaults to None.
+        std (Sequence[Number], optional): The pixel standard deviation of
+            R, G, B channels. Defaults to None.
+        pad_size_divisor (int): The size of padded image should be divisible by
+            ``pad_size_divisor``. Defaults to 1.
+        pad_value (float or int): The padded pixel value. Defaults to 0.
+        pad_mask (bool): Whether to pad instance masks. Defaults to False.
+        mask_pad_value (int): The padded pixel value for instance masks.
+            Defaults to 0.
+        pad_seg (bool): Whether to pad semantic segmentation maps.
+            Defaults to False.
+        seg_pad_value (int): The padded pixel value for semantic segmentation
+            maps. Defaults to 255.
+        bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
+            Defaults to False.
+        rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
+            Defaults to False.
+        boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
+            bboxes data to ``Tensor`` type. Defaults to True.
+        non_blocking (bool): Whether to block current process when transferring
+            data to device. Defaults to False.
+        batch_augments (List[dict], optional): Batch-level augmentations.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 voxel: bool = False,
+                 voxel_type: str = 'hard',
+                 voxel_layer: OptConfigType = None,
+                 batch_first: bool = True,
+                 max_voxels: Optional[int] = None,
+                 mean: Sequence[Number] = None,
+                 std: Sequence[Number] = None,
+                 pad_size_divisor: int = 1,
+                 pad_value: Union[float, int] = 0,
+                 pad_mask: bool = False,
+                 mask_pad_value: int = 0,
+                 pad_seg: bool = False,
+                 seg_pad_value: int = 255,
+                 bgr_to_rgb: bool = False,
+                 rgb_to_bgr: bool = False,
+                 boxtype2tensor: bool = True,
+                 non_blocking: bool = False,
+                 batch_augments: Optional[List[dict]] = None) -> None:
+        super(NeRFDetDataPreprocessor, self).__init__(
+            mean=mean,
+            std=std,
+            pad_size_divisor=pad_size_divisor,
+            pad_value=pad_value,
+            pad_mask=pad_mask,
+            mask_pad_value=mask_pad_value,
+            pad_seg=pad_seg,
+            seg_pad_value=seg_pad_value,
+            bgr_to_rgb=bgr_to_rgb,
+            rgb_to_bgr=rgb_to_bgr,
+            boxtype2tensor=boxtype2tensor,
+            non_blocking=non_blocking,
+            batch_augments=batch_augments)
+        self.voxel = voxel
+        self.voxel_type = voxel_type
+        self.batch_first = batch_first
+        self.max_voxels = max_voxels
+        if voxel:
+            self.voxel_layer = VoxelizationByGridShape(**voxel_layer)
+
+    def forward(self,
+                data: Union[dict, List[dict]],
+                training: bool = False) -> Union[dict, List[dict]]:
+        """Perform normalization, padding and bgr2rgb conversion based on
+        ``BaseDataPreprocessor``.
+
+        Args:
+            data (dict or List[dict]): Data from dataloader. The dict contains
+                the whole batch data, when it is a list[dict], the list
+                indicates test time augmentation.
+            training (bool): Whether to enable training time augmentation.
+                Defaults to False.
+
+        Returns:
+            dict or List[dict]: Data in the same format as the model input.
+        """
+        if isinstance(data, list):
+            num_augs = len(data)
+            aug_batch_data = []
+            for aug_id in range(num_augs):
+                single_aug_batch_data = self.simple_process(
+                    data[aug_id], training)
+                aug_batch_data.append(single_aug_batch_data)
+            return aug_batch_data
+
+        else:
+            return self.simple_process(data, training)
+
+    def simple_process(self, data: dict, training: bool = False) -> dict:
+        """Perform normalization, padding and bgr2rgb conversion for img data
+        based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel`
+        is set to be True.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+                Defaults to False.
+
+        Returns:
+            dict: Data in the same format as the model input.
+        """
+        if 'img' in data['inputs']:
+            batch_pad_shape = self._get_pad_shape(data)
+
+        data = self.collate_data(data)
+        inputs, data_samples = data['inputs'], data['data_samples']
+        batch_inputs = dict()
+
+        if 'points' in inputs:
+            batch_inputs['points'] = inputs['points']
+
+            if self.voxel:
+                voxel_dict = self.voxelize(inputs['points'], data_samples)
+                batch_inputs['voxels'] = voxel_dict
+
+        if 'imgs' in inputs:
+            imgs = inputs['imgs']
+
+            if data_samples is not None:
+                # NOTE the batched image size information may be useful, e.g.
+                # in DETR, this is needed for the construction of masks, which
+                # is then used for the transformer_head.
+                batch_input_shape = tuple(imgs[0].size()[-2:])
+                for data_sample, pad_shape in zip(data_samples,
+                                                  batch_pad_shape):
+                    data_sample.set_metainfo({
+                        'batch_input_shape': batch_input_shape,
+                        'pad_shape': pad_shape
+                    })
+
+                if self.boxtype2tensor:
+                    samplelist_boxtype2tensor(data_samples)
+                if self.pad_mask:
+                    self.pad_gt_masks(data_samples)
+                if self.pad_seg:
+                    self.pad_gt_sem_seg(data_samples)
+
+            if training and self.batch_augments is not None:
+                for batch_aug in self.batch_augments:
+                    imgs, data_samples = batch_aug(imgs, data_samples)
+            batch_inputs['imgs'] = imgs
+        # Hard code here, will be changed later.
+        # if len(inputs['depth']) != 0:
+        if 'depth' in inputs.keys():
+            batch_inputs['depth'] = inputs['depth']
+        batch_inputs['lightpos'] = inputs['lightpos']
+        batch_inputs['nerf_sizes'] = inputs['nerf_sizes']
+        batch_inputs['denorm_images'] = inputs['denorm_images']
+        batch_inputs['raydirs'] = inputs['raydirs']
+
+        return {'inputs': batch_inputs, 'data_samples': data_samples}
+
+    def preprocess_img(self, _batch_img: Tensor) -> Tensor:
+        # channel transform
+        if self._channel_conversion:
+            _batch_img = _batch_img[[2, 1, 0], ...]
+        # Convert to float after channel conversion to ensure
+        # efficiency
+        _batch_img = _batch_img.float()
+        # Normalization.
+        if self._enable_normalize:
+            if self.mean.shape[0] == 3:
+                assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
+                    'If the mean has 3 values, the input tensor '
+                    'should in shape of (3, H, W), but got the '
+                    f'tensor with shape {_batch_img.shape}')
+            _batch_img = (_batch_img - self.mean) / self.std
+        return _batch_img
+
+    def collate_data(self, data: dict) -> dict:
+        """Copy data to the target device and perform normalization, padding
+        and bgr2rgb conversion and stack based on ``BaseDataPreprocessor``.
+
+        Collates the data sampled from dataloader into a list of dict and list
+        of labels, and then copies tensor to the target device.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+
+        Returns:
+            dict: Data in the same format as the model input.
+        """
+        data = self.cast_data(data)  # type: ignore
+
+        if 'img' in data['inputs']:
+            _batch_imgs = data['inputs']['img']
+            # Process data with `pseudo_collate`.
+            if is_seq_of(_batch_imgs, torch.Tensor):
+                batch_imgs = []
+                img_dim = _batch_imgs[0].dim()
+                for _batch_img in _batch_imgs:
+                    if img_dim == 3:  # standard img
+                        _batch_img = self.preprocess_img(_batch_img)
+                    elif img_dim == 4:
+                        _batch_img = [
+                            self.preprocess_img(_img) for _img in _batch_img
+                        ]
+
+                        _batch_img = torch.stack(_batch_img, dim=0)
+
+                    batch_imgs.append(_batch_img)
+
+                # Pad and stack Tensor.
+                if img_dim == 3:
+                    batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
+                                             self.pad_value)
+                elif img_dim == 4:
+                    batch_imgs = multiview_img_stack_batch(
+                        batch_imgs, self.pad_size_divisor, self.pad_value)
+
+            # Process data with `default_collate`.
+            elif isinstance(_batch_imgs, torch.Tensor):
+                assert _batch_imgs.dim() == 4, (
+                    'The input of `ImgDataPreprocessor` should be a NCHW '
+                    'tensor or a list of tensor, but got a tensor with '
+                    f'shape: {_batch_imgs.shape}')
+                if self._channel_conversion:
+                    _batch_imgs = _batch_imgs[:, [2, 1, 0], ...]
+                # Convert to float after channel conversion to ensure
+                # efficiency
+                _batch_imgs = _batch_imgs.float()
+                if self._enable_normalize:
+                    _batch_imgs = (_batch_imgs - self.mean) / self.std
+                h, w = _batch_imgs.shape[2:]
+                target_h = math.ceil(
+                    h / self.pad_size_divisor) * self.pad_size_divisor
+                target_w = math.ceil(
+                    w / self.pad_size_divisor) * self.pad_size_divisor
+                pad_h = target_h - h
+                pad_w = target_w - w
+                batch_imgs = F.pad(_batch_imgs, (0, pad_w, 0, pad_h),
+                                   'constant', self.pad_value)
+            else:
+                raise TypeError(
+                    'Output of `cast_data` should be a list of dict '
+                    'or a tuple with inputs and data_samples, but got '
+                    f'{type(data)}: {data}')
+
+            data['inputs']['imgs'] = batch_imgs
+        if 'raydirs' in data['inputs']:
+            _batch_dirs = data['inputs']['raydirs']
+            batch_dirs = stack_batch(_batch_dirs)
+            data['inputs']['raydirs'] = batch_dirs
+
+        if 'lightpos' in data['inputs']:
+            _batch_poses = data['inputs']['lightpos']
+            batch_poses = stack_batch(_batch_poses)
+            data['inputs']['lightpos'] = batch_poses
+
+        if 'denorm_images' in data['inputs']:
+            _batch_denorm_imgs = data['inputs']['denorm_images']
+            # Process data with `pseudo_collate`.
+            if is_seq_of(_batch_denorm_imgs, torch.Tensor):
+                denorm_img_dim = _batch_denorm_imgs[0].dim()
+                # Pad and stack Tensor.
+                if denorm_img_dim == 3:
+                    batch_denorm_imgs = stack_batch(_batch_denorm_imgs,
+                                                    self.pad_size_divisor,
+                                                    self.pad_value)
+                elif denorm_img_dim == 4:
+                    batch_denorm_imgs = multiview_img_stack_batch(
+                        _batch_denorm_imgs, self.pad_size_divisor,
+                        self.pad_value)
+            data['inputs']['denorm_images'] = batch_denorm_imgs
+
+        data.setdefault('data_samples', None)
+
+        return data
+
+    def _get_pad_shape(self, data: dict) -> List[Tuple[int, int]]:
+        """Get the pad_shape of each image based on data and
+        pad_size_divisor."""
+        # rewrite `_get_pad_shape` for obtaining image inputs.
+        _batch_inputs = data['inputs']['img']
+        # Process data with `pseudo_collate`.
+        if is_seq_of(_batch_inputs, torch.Tensor):
+            batch_pad_shape = []
+            for ori_input in _batch_inputs:
+                if ori_input.dim() == 4:
+                    # mean multiview input, select one of the
+                    # image to calculate the pad shape
+                    ori_input = ori_input[0]
+                pad_h = int(
+                    np.ceil(ori_input.shape[1] /
+                            self.pad_size_divisor)) * self.pad_size_divisor
+                pad_w = int(
+                    np.ceil(ori_input.shape[2] /
+                            self.pad_size_divisor)) * self.pad_size_divisor
+                batch_pad_shape.append((pad_h, pad_w))
+        # Process data with `default_collate`.
+        elif isinstance(_batch_inputs, torch.Tensor):
+            assert _batch_inputs.dim() == 4, (
+                'The input of `ImgDataPreprocessor` should be a NCHW tensor '
+                'or a list of tensor, but got a tensor with shape: '
+                f'{_batch_inputs.shape}')
+            pad_h = int(
+                np.ceil(_batch_inputs.shape[1] /
+                        self.pad_size_divisor)) * self.pad_size_divisor
+            pad_w = int(
+                np.ceil(_batch_inputs.shape[2] /
+                        self.pad_size_divisor)) * self.pad_size_divisor
+            batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0]
+        else:
+            raise TypeError('Output of `cast_data` should be a list of dict '
+                            'or a tuple with inputs and data_samples, but got '
+                            f'{type(data)}: {data}')
+        return batch_pad_shape
+
+    @torch.no_grad()
+    def voxelize(self, points: List[Tensor],
+                 data_samples: SampleList) -> Dict[str, Tensor]:
+        """Apply voxelization to point cloud.
+
+        Args:
+            points (List[Tensor]): Point cloud in one data batch.
+            data_samples: (list[:obj:`NeRFDet3DDataSample`]): The annotation
+                data of every samples. Add voxel-wise annotation for
+                segmentation.
+
+        Returns:
+            Dict[str, Tensor]: Voxelization information.
+
+            - voxels (Tensor): Features of voxels, shape is MxNxC for hard
+              voxelization, NxC for dynamic voxelization.
+            - coors (Tensor): Coordinates of voxels, shape is Nx(1+NDim),
+              where 1 represents the batch index.
+            - num_points (Tensor, optional): Number of points in each voxel.
+            - voxel_centers (Tensor, optional): Centers of voxels.
+        """
+
+        voxel_dict = dict()
+
+        if self.voxel_type == 'hard':
+            voxels, coors, num_points, voxel_centers = [], [], [], []
+            for i, res in enumerate(points):
+                res_voxels, res_coors, res_num_points = self.voxel_layer(res)
+                res_voxel_centers = (
+                    res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor(
+                        self.voxel_layer.voxel_size) + res_voxels.new_tensor(
+                            self.voxel_layer.point_cloud_range[0:3])
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
+                voxels.append(res_voxels)
+                coors.append(res_coors)
+                num_points.append(res_num_points)
+                voxel_centers.append(res_voxel_centers)
+
+            voxels = torch.cat(voxels, dim=0)
+            coors = torch.cat(coors, dim=0)
+            num_points = torch.cat(num_points, dim=0)
+            voxel_centers = torch.cat(voxel_centers, dim=0)
+
+            voxel_dict['num_points'] = num_points
+            voxel_dict['voxel_centers'] = voxel_centers
+        elif self.voxel_type == 'dynamic':
+            coors = []
+            # dynamic voxelization only provide a coors mapping
+            for i, res in enumerate(points):
+                res_coors = self.voxel_layer(res)
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
+                coors.append(res_coors)
+            voxels = torch.cat(points, dim=0)
+            coors = torch.cat(coors, dim=0)
+        elif self.voxel_type == 'cylindrical':
+            voxels, coors = [], []
+            for i, (res, data_sample) in enumerate(zip(points, data_samples)):
+                rho = torch.sqrt(res[:, 0]**2 + res[:, 1]**2)
+                phi = torch.atan2(res[:, 1], res[:, 0])
+                polar_res = torch.stack((rho, phi, res[:, 2]), dim=-1)
+                min_bound = polar_res.new_tensor(
+                    self.voxel_layer.point_cloud_range[:3])
+                max_bound = polar_res.new_tensor(
+                    self.voxel_layer.point_cloud_range[3:])
+                try:  # only support PyTorch >= 1.9.0
+                    polar_res_clamp = torch.clamp(polar_res, min_bound,
+                                                  max_bound)
+                except TypeError:
+                    polar_res_clamp = polar_res.clone()
+                    for coor_idx in range(3):
+                        polar_res_clamp[:, coor_idx][
+                            polar_res[:, coor_idx] >
+                            max_bound[coor_idx]] = max_bound[coor_idx]
+                        polar_res_clamp[:, coor_idx][
+                            polar_res[:, coor_idx] <
+                            min_bound[coor_idx]] = min_bound[coor_idx]
+                res_coors = torch.floor(
+                    (polar_res_clamp - min_bound) / polar_res_clamp.new_tensor(
+                        self.voxel_layer.voxel_size)).int()
+                self.get_voxel_seg(res_coors, data_sample)
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
+                res_voxels = torch.cat((polar_res, res[:, :2], res[:, 3:]),
+                                       dim=-1)
+                voxels.append(res_voxels)
+                coors.append(res_coors)
+            voxels = torch.cat(voxels, dim=0)
+            coors = torch.cat(coors, dim=0)
+        elif self.voxel_type == 'minkunet':
+            voxels, coors = [], []
+            voxel_size = points[0].new_tensor(self.voxel_layer.voxel_size)
+            for i, (res, data_sample) in enumerate(zip(points, data_samples)):
+                res_coors = torch.round(res[:, :3] / voxel_size).int()
+                res_coors -= res_coors.min(0)[0]
+
+                res_coors_numpy = res_coors.cpu().numpy()
+                inds, point2voxel_map = self.sparse_quantize(
+                    res_coors_numpy, return_index=True, return_inverse=True)
+                point2voxel_map = torch.from_numpy(point2voxel_map).cuda()
+                if self.training and self.max_voxels is not None:
+                    if len(inds) > self.max_voxels:
+                        inds = np.random.choice(
+                            inds, self.max_voxels, replace=False)
+                inds = torch.from_numpy(inds).cuda()
+                if hasattr(data_sample.gt_pts_seg, 'pts_semantic_mask'):
+                    data_sample.gt_pts_seg.voxel_semantic_mask \
+                        = data_sample.gt_pts_seg.pts_semantic_mask[inds]
+                res_voxel_coors = res_coors[inds]
+                res_voxels = res[inds]
+                if self.batch_first:
+                    res_voxel_coors = F.pad(
+                        res_voxel_coors, (1, 0), mode='constant', value=i)
+                    data_sample.batch_idx = res_voxel_coors[:, 0]
+                else:
+                    res_voxel_coors = F.pad(
+                        res_voxel_coors, (0, 1), mode='constant', value=i)
+                    data_sample.batch_idx = res_voxel_coors[:, -1]
+                data_sample.point2voxel_map = point2voxel_map.long()
+                voxels.append(res_voxels)
+                coors.append(res_voxel_coors)
+            voxels = torch.cat(voxels, dim=0)
+            coors = torch.cat(coors, dim=0)
+
+        else:
+            raise ValueError(f'Invalid voxelization type {self.voxel_type}')
+
+        voxel_dict['voxels'] = voxels
+        voxel_dict['coors'] = coors
+
+        return voxel_dict
+
+    def get_voxel_seg(self, res_coors: Tensor,
+                      data_sample: SampleList) -> None:
+        """Get voxel-wise segmentation label and point2voxel map.
+
+        Args:
+            res_coors (Tensor): The voxel coordinates of points, Nx3.
+            data_sample: (:obj:`NeRFDet3DDataSample`): The annotation data of
+                every samples. Add voxel-wise annotation forsegmentation.
+        """
+
+        if self.training:
+            pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask
+            voxel_semantic_mask, _, point2voxel_map = dynamic_scatter_3d(
+                F.one_hot(pts_semantic_mask.long()).float(), res_coors, 'mean',
+                True)
+            voxel_semantic_mask = torch.argmax(voxel_semantic_mask, dim=-1)
+            data_sample.gt_pts_seg.voxel_semantic_mask = voxel_semantic_mask
+            data_sample.point2voxel_map = point2voxel_map
+        else:
+            pseudo_tensor = res_coors.new_ones([res_coors.shape[0], 1]).float()
+            _, _, point2voxel_map = dynamic_scatter_3d(pseudo_tensor,
+                                                       res_coors, 'mean', True)
+            data_sample.point2voxel_map = point2voxel_map
+
+    def ravel_hash(self, x: np.ndarray) -> np.ndarray:
+        """Get voxel coordinates hash for np.unique.
+
+        Args:
+            x (np.ndarray): The voxel coordinates of points, Nx3.
+
+        Returns:
+            np.ndarray: Voxels coordinates hash.
+        """
+        assert x.ndim == 2, x.shape
+
+        x = x - np.min(x, axis=0)
+        x = x.astype(np.uint64, copy=False)
+        xmax = np.max(x, axis=0).astype(np.uint64) + 1
+
+        h = np.zeros(x.shape[0], dtype=np.uint64)
+        for k in range(x.shape[1] - 1):
+            h += x[:, k]
+            h *= xmax[k + 1]
+        h += x[:, -1]
+        return h
+
+    def sparse_quantize(self,
+                        coords: np.ndarray,
+                        return_index: bool = False,
+                        return_inverse: bool = False) -> List[np.ndarray]:
+        """Sparse Quantization for voxel coordinates used in Minkunet.
+
+        Args:
+            coords (np.ndarray): The voxel coordinates of points, Nx3.
+            return_index (bool): Whether to return the indices of the unique
+                coords, shape (M,).
+            return_inverse (bool): Whether to return the indices of the
+                original coords, shape (N,).
+
+        Returns:
+            List[np.ndarray]: Return index and inverse map if return_index and
+            return_inverse is True.
+        """
+        _, indices, inverse_indices = np.unique(
+            self.ravel_hash(coords), return_index=True, return_inverse=True)
+        coords = coords[indices]
+
+        outputs = []
+        if return_index:
+            outputs += [indices]
+        if return_inverse:
+            outputs += [inverse_indices]
+        return outputs
diff --git a/mmde/projects/NeRF-Det/nerfdet/formating.py b/mmde/projects/NeRF-Det/nerfdet/formating.py
new file mode 100644
index 0000000000000000000000000000000000000000..6063d634cf0cc4952483b48fc3d3337670f78302
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/formating.py
@@ -0,0 +1,350 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Sequence, Union
+
+import mmengine
+import numpy as np
+import torch
+from mmcv import BaseTransform
+from mmengine.structures import InstanceData
+from numpy import dtype
+
+from mmdet3d.registry import TRANSFORMS
+from mmdet3d.structures import BaseInstance3DBoxes, PointData
+from mmdet3d.structures.points import BasePoints
+# from .det3d_data_sample import Det3DDataSample
+from .nerf_det3d_data_sample import NeRFDet3DDataSample
+
+
+def to_tensor(
+    data: Union[torch.Tensor, np.ndarray, Sequence, int,
+                float]) -> torch.Tensor:
+    """Convert objects of various python types to :obj:`torch.Tensor`.
+
+    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+    :class:`Sequence`, :class:`int` and :class:`float`.
+
+    Args:
+        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
+            be converted.
+
+    Returns:
+        torch.Tensor: the converted data.
+    """
+
+    if isinstance(data, torch.Tensor):
+        return data
+    elif isinstance(data, np.ndarray):
+        if data.dtype is dtype('float64'):
+            data = data.astype(np.float32)
+        return torch.from_numpy(data)
+    elif isinstance(data, Sequence) and not mmengine.is_str(data):
+        return torch.tensor(data)
+    elif isinstance(data, int):
+        return torch.LongTensor([data])
+    elif isinstance(data, float):
+        return torch.FloatTensor([data])
+    else:
+        raise TypeError(f'type {type(data)} cannot be converted to tensor.')
+
+
+@TRANSFORMS.register_module()
+class PackNeRFDetInputs(BaseTransform):
+    INPUTS_KEYS = ['points', 'img']
+    NERF_INPUT_KEYS = [
+        'img', 'denorm_images', 'depth', 'lightpos', 'nerf_sizes', 'raydirs'
+    ]
+
+    INSTANCEDATA_3D_KEYS = [
+        'gt_bboxes_3d', 'gt_labels_3d', 'attr_labels', 'depths', 'centers_2d'
+    ]
+    INSTANCEDATA_2D_KEYS = [
+        'gt_bboxes',
+        'gt_bboxes_labels',
+    ]
+    NERF_3D_KEYS = ['gt_images', 'gt_depths']
+
+    SEG_KEYS = [
+        'gt_seg_map', 'pts_instance_mask', 'pts_semantic_mask',
+        'gt_semantic_seg'
+    ]
+
+    def __init__(
+        self,
+        keys: tuple,
+        meta_keys: tuple = ('img_path', 'ori_shape', 'img_shape', 'lidar2img',
+                            'depth2img', 'cam2img', 'pad_shape',
+                            'scale_factor', 'flip', 'pcd_horizontal_flip',
+                            'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
+                            'img_norm_cfg', 'num_pts_feats', 'pcd_trans',
+                            'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
+                            'pcd_rotation_angle', 'lidar_path',
+                            'transformation_3d_flow', 'trans_mat',
+                            'affine_aug', 'sweep_img_metas', 'ori_cam2img',
+                            'cam2global', 'crop_offset', 'img_crop_offset',
+                            'resize_img_shape', 'lidar2cam', 'ori_lidar2img',
+                            'num_ref_frames', 'num_views', 'ego2global',
+                            'axis_align_matrix')
+    ) -> None:
+        self.keys = keys
+        self.meta_keys = meta_keys
+
+    def _remove_prefix(self, key: str) -> str:
+        if key.startswith('gt_'):
+            key = key[3:]
+        return key
+
+    def transform(self, results: Union[dict,
+                                       List[dict]]) -> Union[dict, List[dict]]:
+        """Method to pack the input data. when the value in this dict is a
+        list, it usually is in Augmentations Testing.
+
+        Args:
+            results (dict | list[dict]): Result dict from the data pipeline.
+
+        Returns:
+            dict | List[dict]:
+
+            - 'inputs' (dict): The forward data of models. It usually contains
+              following keys:
+
+                - points
+                - img
+
+            - 'data_samples' (:obj:`NeRFDet3DDataSample`): The annotation info
+              of the sample.
+        """
+        # augtest
+        if isinstance(results, list):
+            if len(results) == 1:
+                # simple test
+                return self.pack_single_results(results[0])
+            pack_results = []
+            for single_result in results:
+                pack_results.append(self.pack_single_results(single_result))
+            return pack_results
+        # norm training and simple testing
+        elif isinstance(results, dict):
+            return self.pack_single_results(results)
+        else:
+            raise NotImplementedError
+
+    def pack_single_results(self, results: dict) -> dict:
+        """Method to pack the single input data. when the value in this dict is
+        a list, it usually is in Augmentations Testing.
+
+        Args:
+            results (dict): Result dict from the data pipeline.
+
+        Returns:
+            dict: A dict contains
+
+            - 'inputs' (dict): The forward data of models. It usually contains
+              following keys:
+
+                - points
+                - img
+
+            - 'data_samples' (:obj:`NeRFDet3DDataSample`): The annotation info
+              of the sample.
+        """
+        # Format 3D data
+        if 'points' in results:
+            if isinstance(results['points'], BasePoints):
+                results['points'] = results['points'].tensor
+
+        if 'img' in results:
+            if isinstance(results['img'], list):
+                # process multiple imgs in single frame
+                imgs = np.stack(results['img'], axis=0)
+                if imgs.flags.c_contiguous:
+                    imgs = to_tensor(imgs).permute(0, 3, 1, 2).contiguous()
+                else:
+                    imgs = to_tensor(
+                        np.ascontiguousarray(imgs.transpose(0, 3, 1, 2)))
+                results['img'] = imgs
+            else:
+                img = results['img']
+                if len(img.shape) < 3:
+                    img = np.expand_dims(img, -1)
+                # To improve the computational speed by by 3-5 times, apply:
+                # `torch.permute()` rather than `np.transpose()`.
+                # Refer to https://github.com/open-mmlab/mmdetection/pull/9533
+                # for more details
+                if img.flags.c_contiguous:
+                    img = to_tensor(img).permute(2, 0, 1).contiguous()
+                else:
+                    img = to_tensor(
+                        np.ascontiguousarray(img.transpose(2, 0, 1)))
+                results['img'] = img
+
+        if 'depth' in results:
+            if isinstance(results['depth'], list):
+                # process multiple depth imgs in single frame
+                depth_imgs = np.stack(results['depth'], axis=0)
+                if depth_imgs.flags.c_contiguous:
+                    depth_imgs = to_tensor(depth_imgs).contiguous()
+                else:
+                    depth_imgs = to_tensor(np.ascontiguousarray(depth_imgs))
+                results['depth'] = depth_imgs
+            else:
+                depth_img = results['depth']
+                if len(depth_img.shape) < 3:
+                    depth_img = np.expand_dims(depth_img, -1)
+                if depth_img.flags.c_contiguous:
+                    depth_img = to_tensor(depth_img).contiguous()
+                else:
+                    depth_img = to_tensor(np.ascontiguousarray(depth_img))
+                results['depth'] = depth_img
+
+        if 'ray_info' in results:
+            if isinstance(results['raydirs'], list):
+                raydirs = np.stack(results['raydirs'], axis=0)
+                if raydirs.flags.c_contiguous:
+                    raydirs = to_tensor(raydirs).contiguous()
+                else:
+                    raydirs = to_tensor(np.ascontiguousarray(raydirs))
+                results['raydirs'] = raydirs
+
+            if isinstance(results['lightpos'], list):
+                lightposes = np.stack(results['lightpos'], axis=0)
+                if lightposes.flags.c_contiguous:
+                    lightposes = to_tensor(lightposes).contiguous()
+                else:
+                    lightposes = to_tensor(np.ascontiguousarray(lightposes))
+                lightposes = lightposes.unsqueeze(1).repeat(
+                    1, raydirs.shape[1], 1)
+                results['lightpos'] = lightposes
+
+            if isinstance(results['gt_images'], list):
+                gt_images = np.stack(results['gt_images'], axis=0)
+                if gt_images.flags.c_contiguous:
+                    gt_images = to_tensor(gt_images).contiguous()
+                else:
+                    gt_images = to_tensor(np.ascontiguousarray(gt_images))
+                results['gt_images'] = gt_images
+
+            if isinstance(results['gt_depths'],
+                          list) and len(results['gt_depths']) != 0:
+                gt_depths = np.stack(results['gt_depths'], axis=0)
+                if gt_depths.flags.c_contiguous:
+                    gt_depths = to_tensor(gt_depths).contiguous()
+                else:
+                    gt_depths = to_tensor(np.ascontiguousarray(gt_depths))
+                results['gt_depths'] = gt_depths
+
+            if isinstance(results['denorm_images'], list):
+                denorm_imgs = np.stack(results['denorm_images'], axis=0)
+                if denorm_imgs.flags.c_contiguous:
+                    denorm_imgs = to_tensor(denorm_imgs).permute(
+                        0, 3, 1, 2).contiguous()
+                else:
+                    denorm_imgs = to_tensor(
+                        np.ascontiguousarray(
+                            denorm_imgs.transpose(0, 3, 1, 2)))
+                results['denorm_images'] = denorm_imgs
+
+        for key in [
+                'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
+                'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
+                'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
+        ]:
+            if key not in results:
+                continue
+            if isinstance(results[key], list):
+                results[key] = [to_tensor(res) for res in results[key]]
+            else:
+                results[key] = to_tensor(results[key])
+        if 'gt_bboxes_3d' in results:
+            if not isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
+                results['gt_bboxes_3d'] = to_tensor(results['gt_bboxes_3d'])
+
+        if 'gt_semantic_seg' in results:
+            results['gt_semantic_seg'] = to_tensor(
+                results['gt_semantic_seg'][None])
+        if 'gt_seg_map' in results:
+            results['gt_seg_map'] = results['gt_seg_map'][None, ...]
+
+        if 'gt_images' in results:
+            results['gt_images'] = to_tensor(results['gt_images'])
+        if 'gt_depths' in results:
+            results['gt_depths'] = to_tensor(results['gt_depths'])
+
+        data_sample = NeRFDet3DDataSample()
+        gt_instances_3d = InstanceData()
+        gt_instances = InstanceData()
+        gt_pts_seg = PointData()
+        gt_nerf_images = InstanceData()
+        gt_nerf_depths = InstanceData()
+
+        data_metas = {}
+        for key in self.meta_keys:
+            if key in results:
+                data_metas[key] = results[key]
+            elif 'images' in results:
+                if len(results['images'].keys()) == 1:
+                    cam_type = list(results['images'].keys())[0]
+                    # single-view image
+                    if key in results['images'][cam_type]:
+                        data_metas[key] = results['images'][cam_type][key]
+                else:
+                    # multi-view image
+                    img_metas = []
+                    cam_types = list(results['images'].keys())
+                    for cam_type in cam_types:
+                        if key in results['images'][cam_type]:
+                            img_metas.append(results['images'][cam_type][key])
+                    if len(img_metas) > 0:
+                        data_metas[key] = img_metas
+            elif 'lidar_points' in results:
+                if key in results['lidar_points']:
+                    data_metas[key] = results['lidar_points'][key]
+        data_sample.set_metainfo(data_metas)
+
+        inputs = {}
+        for key in self.keys:
+            if key in results:
+                # if key in self.INPUTS_KEYS:
+                if key in self.NERF_INPUT_KEYS:
+                    inputs[key] = results[key]
+                elif key in self.NERF_3D_KEYS:
+                    if key == 'gt_images':
+                        gt_nerf_images[self._remove_prefix(key)] = results[key]
+                    else:
+                        gt_nerf_depths[self._remove_prefix(key)] = results[key]
+                elif key in self.INSTANCEDATA_3D_KEYS:
+                    gt_instances_3d[self._remove_prefix(key)] = results[key]
+                elif key in self.INSTANCEDATA_2D_KEYS:
+                    if key == 'gt_bboxes_labels':
+                        gt_instances['labels'] = results[key]
+                    else:
+                        gt_instances[self._remove_prefix(key)] = results[key]
+                elif key in self.SEG_KEYS:
+                    gt_pts_seg[self._remove_prefix(key)] = results[key]
+                else:
+                    raise NotImplementedError(f'Please modified '
+                                              f'`Pack3DDetInputs` '
+                                              f'to put {key} to '
+                                              f'corresponding field')
+
+        data_sample.gt_instances_3d = gt_instances_3d
+        data_sample.gt_instances = gt_instances
+        data_sample.gt_pts_seg = gt_pts_seg
+        data_sample.gt_nerf_images = gt_nerf_images
+        data_sample.gt_nerf_depths = gt_nerf_depths
+        if 'eval_ann_info' in results:
+            data_sample.eval_ann_info = results['eval_ann_info']
+        else:
+            data_sample.eval_ann_info = None
+
+        packed_results = dict()
+        packed_results['data_samples'] = data_sample
+        packed_results['inputs'] = inputs
+
+        return packed_results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(keys={self.keys})'
+        repr_str += f'(meta_keys={self.meta_keys})'
+        return repr_str
diff --git a/mmde/projects/NeRF-Det/nerfdet/multiview_pipeline.py b/mmde/projects/NeRF-Det/nerfdet/multiview_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..23e84ed71f7bd4940d5679dcd5d511a7293c5627
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/multiview_pipeline.py
@@ -0,0 +1,297 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+from mmcv.transforms import BaseTransform, Compose
+from PIL import Image
+
+from mmdet3d.registry import TRANSFORMS
+
+
+def get_dtu_raydir(pixelcoords, intrinsic, rot, dir_norm=None):
+    # rot is c2w
+    # pixelcoords: H x W x 2
+    x = (pixelcoords[..., 0] + 0.5 - intrinsic[0, 2]) / intrinsic[0, 0]
+    y = (pixelcoords[..., 1] + 0.5 - intrinsic[1, 2]) / intrinsic[1, 1]
+    z = np.ones_like(x)
+    dirs = np.stack([x, y, z], axis=-1)
+    # dirs = np.sum(dirs[...,None,:] * rot[:,:], axis=-1) # h*w*1*3   x   3*3
+    dirs = dirs @ rot[:, :].T  #
+    if dir_norm:
+        dirs = dirs / (np.linalg.norm(dirs, axis=-1, keepdims=True) + 1e-5)
+
+    return dirs
+
+
+@TRANSFORMS.register_module()
+class MultiViewPipeline(BaseTransform):
+    """MultiViewPipeline used in nerfdet.
+
+    Required Keys:
+
+    - depth_info
+    - img_prefix
+    - img_info
+    - lidar2img
+    - c2w
+    - cammrotc2w
+    - lightpos
+    - ray_info
+
+    Modified Keys:
+
+    - lidar2img
+
+    Added Keys:
+
+    - img
+    - denorm_images
+    - depth
+    - c2w
+    - camrotc2w
+    - lightpos
+    - pixels
+    - raydirs
+    - gt_images
+    - gt_depths
+    - nerf_sizes
+    - depth_range
+
+    Args:
+        transforms (list[dict]): The transform pipeline
+            used to process the imgs.
+        n_images (int): The number of sampled views.
+        mean (array): The mean values used in normalization.
+        std (array): The variance values used in normalization.
+        margin (int): The margin value. Defaults to 10.
+        depth_range (array): The range of the depth.
+            Defaults to [0.5, 5.5].
+        loading (str): The mode of loading. Defaults to 'random'.
+        nerf_target_views (int): The number of novel views.
+        sample_freq (int): The frequency of sampling.
+    """
+
+    def __init__(self,
+                 transforms: dict,
+                 n_images: int,
+                 mean: tuple = [123.675, 116.28, 103.53],
+                 std: tuple = [58.395, 57.12, 57.375],
+                 margin: int = 10,
+                 depth_range: tuple = [0.5, 5.5],
+                 loading: str = 'random',
+                 nerf_target_views: int = 0,
+                 sample_freq: int = 3):
+        self.transforms = Compose(transforms)
+        self.depth_transforms = Compose(transforms[1])
+        self.n_images = n_images
+        self.mean = np.array(mean, dtype=np.float32)
+        self.std = np.array(std, dtype=np.float32)
+        self.margin = margin
+        self.depth_range = depth_range
+        self.loading = loading
+        self.sample_freq = sample_freq
+        self.nerf_target_views = nerf_target_views
+
+    def transform(self, results: dict) -> dict:
+        """Nerfdet transform function.
+
+        Args:
+            results (dict): Result dict from loading pipeline
+
+        Returns:
+            dict: The result dict containing the processed results.
+            Updated key and value are described below.
+
+                - img (list): The loaded origin image.
+                - denorm_images (list): The denormalized image.
+                - depth (list): The origin depth image.
+                - c2w (list): The c2w matrixes.
+                - camrotc2w (list): The rotation matrixes.
+                - lightpos (list): The transform parameters of the camera.
+                - pixels (list): Some pixel information.
+                - raydirs (list): The ray-directions.
+                - gt_images (list): The groundtruth images.
+                - gt_depths (list): The groundtruth depth images.
+                - nerf_sizes (array): The size of the groundtruth images.
+                - depth_range (array): The range of the depth.
+
+        Here we give a detailed explanation of some keys mentioned above.
+        Let P_c be the coordinate of camera, P_w be the coordinate of world.
+        There is such a conversion relationship: P_c = R @ P_w + T.
+        The 'camrotc2w' mentioned above corresponds to the R matrix here.
+        The 'lightpos' corresponds to the T matrix here. And if you put
+        R and T together, you can get the camera extrinsics matrix. It
+        corresponds to the 'c2w' mentioned above.
+        """
+        imgs = []
+        depths = []
+        extrinsics = []
+        c2ws = []
+        camrotc2ws = []
+        lightposes = []
+        pixels = []
+        raydirs = []
+        gt_images = []
+        gt_depths = []
+        denorm_imgs_list = []
+        nerf_sizes = []
+
+        if self.loading == 'random':
+            ids = np.arange(len(results['img_info']))
+            replace = True if self.n_images > len(ids) else False
+            ids = np.random.choice(ids, self.n_images, replace=replace)
+            if self.nerf_target_views != 0:
+                target_id = np.random.choice(
+                    ids, self.nerf_target_views, replace=False)
+                ids = np.setdiff1d(ids, target_id)
+                ids = ids.tolist()
+                target_id = target_id.tolist()
+
+        else:
+            ids = np.arange(len(results['img_info']))
+            begin_id = 0
+            ids = np.arange(begin_id,
+                            begin_id + self.n_images * self.sample_freq,
+                            self.sample_freq)
+            if self.nerf_target_views != 0:
+                target_id = ids
+
+        ratio = 0
+        size = (240, 320)
+        for i in ids:
+            _results = dict()
+            _results['img_path'] = results['img_info'][i]['filename']
+            _results = self.transforms(_results)
+            imgs.append(_results['img'])
+            # normalize
+            for key in _results.get('img_fields', ['img']):
+                _results[key] = mmcv.imnormalize(_results[key], self.mean,
+                                                 self.std, True)
+            _results['img_norm_cfg'] = dict(
+                mean=self.mean, std=self.std, to_rgb=True)
+            # pad
+            for key in _results.get('img_fields', ['img']):
+                padded_img = mmcv.impad(_results[key], shape=size, pad_val=0)
+                _results[key] = padded_img
+            _results['pad_shape'] = padded_img.shape
+            _results['pad_fixed_size'] = size
+            ori_shape = _results['ori_shape']
+            aft_shape = _results['img_shape']
+            ratio = ori_shape[0] / aft_shape[0]
+            # prepare the depth information
+            if 'depth_info' in results.keys():
+                if '.npy' in results['depth_info'][i]['filename']:
+                    _results['depth'] = np.load(
+                        results['depth_info'][i]['filename'])
+                else:
+                    _results['depth'] = np.asarray((Image.open(
+                        results['depth_info'][i]['filename']))) / 1000
+                    _results['depth'] = mmcv.imresize(
+                        _results['depth'], (aft_shape[1], aft_shape[0]))
+                depths.append(_results['depth'])
+
+            denorm_img = mmcv.imdenormalize(
+                _results['img'], self.mean, self.std, to_bgr=True).astype(
+                    np.uint8) / 255.0
+            denorm_imgs_list.append(denorm_img)
+            height, width = padded_img.shape[:2]
+            extrinsics.append(results['lidar2img']['extrinsic'][i])
+
+        # prepare the nerf information
+        if 'ray_info' in results.keys():
+            intrinsics_nerf = results['lidar2img']['intrinsic'].copy()
+            intrinsics_nerf[:2] = intrinsics_nerf[:2] / ratio
+            assert self.nerf_target_views > 0
+            for i in target_id:
+                c2ws.append(results['c2w'][i])
+                camrotc2ws.append(results['camrotc2w'][i])
+                lightposes.append(results['lightpos'][i])
+                px, py = np.meshgrid(
+                    np.arange(self.margin,
+                              width - self.margin).astype(np.float32),
+                    np.arange(self.margin,
+                              height - self.margin).astype(np.float32))
+                pixelcoords = np.stack((px, py),
+                                       axis=-1).astype(np.float32)  # H x W x 2
+                pixels.append(pixelcoords)
+                raydir = get_dtu_raydir(pixelcoords, intrinsics_nerf,
+                                        results['camrotc2w'][i])
+                raydirs.append(np.reshape(raydir.astype(np.float32), (-1, 3)))
+                # read target images
+                temp_results = dict()
+                temp_results['img_path'] = results['img_info'][i]['filename']
+
+                temp_results_ = self.transforms(temp_results)
+                # normalize
+                for key in temp_results.get('img_fields', ['img']):
+                    temp_results[key] = mmcv.imnormalize(
+                        temp_results[key], self.mean, self.std, True)
+                temp_results['img_norm_cfg'] = dict(
+                    mean=self.mean, std=self.std, to_rgb=True)
+                # pad
+                for key in temp_results.get('img_fields', ['img']):
+                    padded_img = mmcv.impad(
+                        temp_results[key], shape=size, pad_val=0)
+                    temp_results[key] = padded_img
+                temp_results['pad_shape'] = padded_img.shape
+                temp_results['pad_fixed_size'] = size
+                # denormalize target_images.
+                denorm_imgs = mmcv.imdenormalize(
+                    temp_results_['img'], self.mean, self.std,
+                    to_bgr=True).astype(np.uint8)
+                gt_rgb_shape = denorm_imgs.shape
+
+                gt_image = denorm_imgs[py.astype(np.int32),
+                                       px.astype(np.int32), :]
+                nerf_sizes.append(np.array(gt_image.shape))
+                gt_image = np.reshape(gt_image, (-1, 3))
+                gt_images.append(gt_image / 255.0)
+                if 'depth_info' in results.keys():
+                    if '.npy' in results['depth_info'][i]['filename']:
+                        _results['depth'] = np.load(
+                            results['depth_info'][i]['filename'])
+                    else:
+                        depth_image = Image.open(
+                            results['depth_info'][i]['filename'])
+                        _results['depth'] = np.asarray(depth_image) / 1000
+                        _results['depth'] = mmcv.imresize(
+                            _results['depth'],
+                            (gt_rgb_shape[1], gt_rgb_shape[0]))
+
+                    _results['depth'] = _results['depth']
+                    gt_depth = _results['depth'][py.astype(np.int32),
+                                                 px.astype(np.int32)]
+                    gt_depths.append(gt_depth)
+
+        for key in _results.keys():
+            if key not in ['img', 'img_info']:
+                results[key] = _results[key]
+        results['img'] = imgs
+
+        if 'ray_info' in results.keys():
+            results['c2w'] = c2ws
+            results['camrotc2w'] = camrotc2ws
+            results['lightpos'] = lightposes
+            results['pixels'] = pixels
+            results['raydirs'] = raydirs
+            results['gt_images'] = gt_images
+            results['gt_depths'] = gt_depths
+            results['nerf_sizes'] = nerf_sizes
+            results['denorm_images'] = denorm_imgs_list
+            results['depth_range'] = np.array([self.depth_range])
+
+        if len(depths) != 0:
+            results['depth'] = depths
+        results['lidar2img']['extrinsic'] = extrinsics
+        return results
+
+
+@TRANSFORMS.register_module()
+class RandomShiftOrigin(BaseTransform):
+
+    def __init__(self, std):
+        self.std = std
+
+    def transform(self, results):
+        shift = np.random.normal(.0, self.std, 3)
+        results['lidar2img']['origin'] += shift
+        return results
diff --git a/mmde/projects/NeRF-Det/nerfdet/nerf_det3d_data_sample.py b/mmde/projects/NeRF-Det/nerfdet/nerf_det3d_data_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..439e9a69ba08c0ba728dae03ae846e172eed6785
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/nerf_det3d_data_sample.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.structures import Det3DDataSample
+
+
+class NeRFDet3DDataSample(Det3DDataSample):
+    """A data structure interface inheirted from Det3DDataSample. Some new
+    attributes are added to match the NeRF-Det project.
+
+    The attributes added in ``NeRFDet3DDataSample`` are divided into two parts:
+
+        - ``gt_nerf_images`` (InstanceData): Ground truth of the images which
+          will be used in the NeRF branch.
+        - ``gt_nerf_depths`` (InstanceData): Ground truth of the depth images
+          which will be used in the NeRF branch if needed.
+
+    For more details and examples, please refer to the 'Det3DDataSample' file.
+    """
+
+    @property
+    def gt_nerf_images(self) -> InstanceData:
+        return self._gt_nerf_images
+
+    @gt_nerf_images.setter
+    def gt_nerf_images(self, value: InstanceData) -> None:
+        self.set_field(value, '_gt_nerf_images', dtype=InstanceData)
+
+    @gt_nerf_images.deleter
+    def gt_nerf_images(self) -> None:
+        del self._gt_nerf_images
+
+    @property
+    def gt_nerf_depths(self) -> InstanceData:
+        return self._gt_nerf_depths
+
+    @gt_nerf_depths.setter
+    def gt_nerf_depths(self, value: InstanceData) -> None:
+        self.set_field(value, '_gt_nerf_depths', dtype=InstanceData)
+
+    @gt_nerf_depths.deleter
+    def gt_nerf_depths(self) -> None:
+        del self._gt_nerf_depths
+
+
+SampleList = List[NeRFDet3DDataSample]
+OptSampleList = Optional[SampleList]
+ForwardResults = Union[Dict[str, torch.Tensor], List[NeRFDet3DDataSample],
+                       Tuple[torch.Tensor], torch.Tensor]
diff --git a/mmde/projects/NeRF-Det/nerfdet/nerf_utils/nerf_mlp.py b/mmde/projects/NeRF-Det/nerfdet/nerf_utils/nerf_mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc579ea23ba5a0b6e353085067dc912698ee16b3
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/nerf_utils/nerf_mlp.py
@@ -0,0 +1,277 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class MLP(nn.Module):
+    """The MLP module used in NerfDet.
+
+    Args:
+        input_dim (int): The number of input tensor channels.
+        output_dim (int): The number of output tensor channels.
+        net_depth (int): The depth of the MLP. Defaults to 8.
+        net_width (int): The width of the MLP. Defaults to 256.
+        skip_layer (int): The layer to add skip layers to. Defaults to 4.
+
+        hidden_init (Callable): The initialize method of the hidden layers.
+        hidden_activation (Callable): The activation function of hidden
+            layers, defaults to ReLU.
+        output_enabled (bool): If true, the output layers will be used.
+            Defaults to True.
+        output_init (Optional): The initialize method of the output layer.
+        output_activation(Optional): The activation function of output layers.
+        bias_enabled (Bool): If true, the bias will be used.
+        bias_init (Callable): The initialize method of the bias.
+            Defaults to True.
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        output_dim: int = None,
+        net_depth: int = 8,
+        net_width: int = 256,
+        skip_layer: int = 4,
+        hidden_init: Callable = nn.init.xavier_uniform_,
+        hidden_activation: Callable = nn.ReLU(),
+        output_enabled: bool = True,
+        output_init: Optional[Callable] = nn.init.xavier_uniform_,
+        output_activation: Optional[Callable] = nn.Identity(),
+        bias_enabled: bool = True,
+        bias_init: Callable = nn.init.zeros_,
+    ):
+        super().__init__()
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.net_depth = net_depth
+        self.net_width = net_width
+        self.skip_layer = skip_layer
+        self.hidden_init = hidden_init
+        self.hidden_activation = hidden_activation
+        self.output_enabled = output_enabled
+        self.output_init = output_init
+        self.output_activation = output_activation
+        self.bias_enabled = bias_enabled
+        self.bias_init = bias_init
+
+        self.hidden_layers = nn.ModuleList()
+        in_features = self.input_dim
+        for i in range(self.net_depth):
+            self.hidden_layers.append(
+                nn.Linear(in_features, self.net_width, bias=bias_enabled))
+            if (self.skip_layer is not None) and (i % self.skip_layer
+                                                  == 0) and (i > 0):
+                in_features = self.net_width + self.input_dim
+            else:
+                in_features = self.net_width
+        if self.output_enabled:
+            self.output_layer = nn.Linear(
+                in_features, self.output_dim, bias=bias_enabled)
+        else:
+            self.output_dim = in_features
+
+        self.initialize()
+
+    def initialize(self):
+
+        def init_func_hidden(m):
+            if isinstance(m, nn.Linear):
+                if self.hidden_init is not None:
+                    self.hidden_init(m.weight)
+                if self.bias_enabled and self.bias_init is not None:
+                    self.bias_init(m.bias)
+
+        self.hidden_layers.apply(init_func_hidden)
+        if self.output_enabled:
+
+            def init_func_output(m):
+                if isinstance(m, nn.Linear):
+                    if self.output_init is not None:
+                        self.output_init(m.weight)
+                    if self.bias_enabled and self.bias_init is not None:
+                        self.bias_init(m.bias)
+
+            self.output_layer.apply(init_func_output)
+
+    def forward(self, x):
+        inputs = x
+        for i in range(self.net_depth):
+            x = self.hidden_layers[i](x)
+            x = self.hidden_activation(x)
+            if (self.skip_layer is not None) and (i % self.skip_layer
+                                                  == 0) and (i > 0):
+                x = torch.cat([x, inputs], dim=-1)
+        if self.output_enabled:
+            x = self.output_layer(x)
+            x = self.output_activation(x)
+        return x
+
+
+class DenseLayer(MLP):
+
+    def __init__(self, input_dim, output_dim, **kwargs):
+        super().__init__(
+            input_dim=input_dim,
+            output_dim=output_dim,
+            net_depth=0,  # no hidden layers
+            **kwargs,
+        )
+
+
+class NerfMLP(nn.Module):
+    """The Nerf-MLP Module.
+
+    Args:
+        input_dim (int): The number of input tensor channels.
+        condition_dim (int): The number of condition tensor channels.
+        feature_dim (int): The number of feature channels. Defaults to 0.
+        net_depth (int): The depth of the MLP. Defaults to 8.
+        net_width (int): The width of the MLP. Defaults to 256.
+        skip_layer (int): The layer to add skip layers to. Defaults to 4.
+        net_depth_condition (int): The depth of the second part of MLP.
+            Defaults to 1.
+        net_width_condition (int): The width of the second part of MLP.
+            Defaults to 128.
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        condition_dim: int,
+        feature_dim: int = 0,
+        net_depth: int = 8,
+        net_width: int = 256,
+        skip_layer: int = 4,
+        net_depth_condition: int = 1,
+        net_width_condition: int = 128,
+    ):
+        super().__init__()
+        self.base = MLP(
+            input_dim=input_dim + feature_dim,
+            net_depth=net_depth,
+            net_width=net_width,
+            skip_layer=skip_layer,
+            output_enabled=False,
+        )
+        hidden_features = self.base.output_dim
+        self.sigma_layer = DenseLayer(hidden_features, 1)
+
+        if condition_dim > 0:
+            self.bottleneck_layer = DenseLayer(hidden_features, net_width)
+            self.rgb_layer = MLP(
+                input_dim=net_width + condition_dim,
+                output_dim=3,
+                net_depth=net_depth_condition,
+                net_width=net_width_condition,
+                skip_layer=None,
+            )
+        else:
+            self.rgb_layer = DenseLayer(hidden_features, 3)
+
+    def query_density(self, x, features=None):
+        """Calculate the raw sigma."""
+        if features is not None:
+            x = self.base(torch.cat([x, features], dim=-1))
+        else:
+            x = self.base(x)
+        raw_sigma = self.sigma_layer(x)
+        return raw_sigma
+
+    def forward(self, x, condition=None, features=None):
+        if features is not None:
+            x = self.base(torch.cat([x, features], dim=-1))
+        else:
+            x = self.base(x)
+        raw_sigma = self.sigma_layer(x)
+        if condition is not None:
+            if condition.shape[:-1] != x.shape[:-1]:
+                num_rays, n_dim = condition.shape
+                condition = condition.view(
+                    [num_rays] + [1] * (x.dim() - condition.dim()) +
+                    [n_dim]).expand(list(x.shape[:-1]) + [n_dim])
+            bottleneck = self.bottleneck_layer(x)
+            x = torch.cat([bottleneck, condition], dim=-1)
+        raw_rgb = self.rgb_layer(x)
+        return raw_rgb, raw_sigma
+
+
+class SinusoidalEncoder(nn.Module):
+    """Sinusodial Positional Encoder used in NeRF."""
+
+    def __init__(self, x_dim, min_deg, max_deg, use_identity: bool = True):
+        super().__init__()
+        self.x_dim = x_dim
+        self.min_deg = min_deg
+        self.max_deg = max_deg
+        self.use_identity = use_identity
+        self.register_buffer(
+            'scales', torch.tensor([2**i for i in range(min_deg, max_deg)]))
+
+    @property
+    def latent_dim(self) -> int:
+        return (int(self.use_identity) +
+                (self.max_deg - self.min_deg) * 2) * self.x_dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.max_deg == self.min_deg:
+            return x
+        xb = torch.reshape(
+            (x[Ellipsis, None, :] * self.scales[:, None]),
+            list(x.shape[:-1]) + [(self.max_deg - self.min_deg) * self.x_dim],
+        )
+        latent = torch.sin(torch.cat([xb, xb + 0.5 * math.pi], dim=-1))
+        if self.use_identity:
+            latent = torch.cat([x] + [latent], dim=-1)
+        return latent
+
+
+class VanillaNeRF(nn.Module):
+    """The Nerf-MLP with the positional encoder.
+
+    Args:
+        net_depth (int): The depth of the MLP. Defaults to 8.
+        net_width (int): The width of the MLP. Defaults to 256.
+        skip_layer (int): The layer to add skip layers to. Defaults to 4.
+        feature_dim (int): The number of feature channels. Defaults to 0.
+        net_depth_condition (int): The depth of the second part of MLP.
+            Defaults to 1.
+        net_width_condition (int): The width of the second part of MLP.
+            Defaults to 128.
+    """
+
+    def __init__(self,
+                 net_depth: int = 8,
+                 net_width: int = 256,
+                 skip_layer: int = 4,
+                 feature_dim: int = 0,
+                 net_depth_condition: int = 1,
+                 net_width_condition: int = 128):
+        super().__init__()
+        self.posi_encoder = SinusoidalEncoder(3, 0, 10, True)
+        self.view_encoder = SinusoidalEncoder(3, 0, 4, True)
+        self.mlp = NerfMLP(
+            input_dim=self.posi_encoder.latent_dim,
+            condition_dim=self.view_encoder.latent_dim,
+            feature_dim=feature_dim,
+            net_depth=net_depth,
+            net_width=net_width,
+            skip_layer=skip_layer,
+            net_depth_condition=net_depth_condition,
+            net_width_condition=net_width_condition,
+        )
+
+    def query_density(self, x, features=None):
+        x = self.posi_encoder(x)
+        sigma = self.mlp.query_density(x, features)
+        return F.relu(sigma)
+
+    def forward(self, x, condition=None, features=None):
+        x = self.posi_encoder(x)
+        if condition is not None:
+            condition = self.view_encoder(condition)
+        rgb, sigma = self.mlp(x, condition=condition, features=features)
+        return torch.sigmoid(rgb), F.relu(sigma)
diff --git a/mmde/projects/NeRF-Det/nerfdet/nerf_utils/projection.py b/mmde/projects/NeRF-Det/nerfdet/nerf_utils/projection.py
new file mode 100644
index 0000000000000000000000000000000000000000..d88e28142087eed15e7638c10859144036849eda
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/nerf_utils/projection.py
@@ -0,0 +1,140 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Attention: This file is mainly modified based on the file with the same
+# name in the original project. For more details, please refer to the
+# origin project.
+import torch
+import torch.nn.functional as F
+
+
+class Projector():
+
+    def __init__(self, device='cuda'):
+        self.device = device
+
+    def inbound(self, pixel_locations, h, w):
+        """check if the pixel locations are in valid range."""
+        return (pixel_locations[..., 0] <= w - 1.) & \
+               (pixel_locations[..., 0] >= 0) & \
+               (pixel_locations[..., 1] <= h - 1.) &\
+               (pixel_locations[..., 1] >= 0)
+
+    def normalize(self, pixel_locations, h, w):
+        resize_factor = torch.tensor([w - 1., h - 1.
+                                      ]).to(pixel_locations.device)[None,
+                                                                    None, :]
+        normalized_pixel_locations = 2 * pixel_locations / resize_factor - 1.
+        return normalized_pixel_locations
+
+    def compute_projections(self, xyz, train_cameras):
+        """project 3D points into cameras."""
+
+        original_shape = xyz.shape[:2]
+        xyz = xyz.reshape(-1, 3)
+        num_views = len(train_cameras)
+        train_intrinsics = train_cameras[:, 2:18].reshape(-1, 4, 4)
+        train_poses = train_cameras[:, -16:].reshape(-1, 4, 4)
+        xyz_h = torch.cat([xyz, torch.ones_like(xyz[..., :1])], dim=-1)
+        # projections = train_intrinsics.bmm(torch.inverse(train_poses))
+        # we have inverse the pose in dataloader so
+        # do not need to inverse here.
+        projections = train_intrinsics.bmm(train_poses) \
+            .bmm(xyz_h.t()[None, ...].repeat(num_views, 1, 1))
+        projections = projections.permute(0, 2, 1)
+        pixel_locations = projections[..., :2] / torch.clamp(
+            projections[..., 2:3], min=1e-8)
+        pixel_locations = torch.clamp(pixel_locations, min=-1e6, max=1e6)
+        mask = projections[..., 2] > 0
+        return pixel_locations.reshape((num_views, ) + original_shape + (2, )), \
+               mask.reshape((num_views, ) + original_shape) # noqa
+
+    def compute_angle(self, xyz, query_camera, train_cameras):
+
+        original_shape = xyz.shape[:2]
+        xyz = xyz.reshape(-1, 3)
+        train_poses = train_cameras[:, -16:].reshape(-1, 4, 4)
+        num_views = len(train_poses)
+        query_pose = query_camera[-16:].reshape(-1, 4,
+                                                4).repeat(num_views, 1, 1)
+        ray2tar_pose = (query_pose[:, :3, 3].unsqueeze(1) - xyz.unsqueeze(0))
+        ray2tar_pose /= (torch.norm(ray2tar_pose, dim=-1, keepdim=True) + 1e-6)
+        ray2train_pose = (
+            train_poses[:, :3, 3].unsqueeze(1) - xyz.unsqueeze(0))
+        ray2train_pose /= (
+            torch.norm(ray2train_pose, dim=-1, keepdim=True) + 1e-6)
+        ray_diff = ray2tar_pose - ray2train_pose
+        ray_diff_norm = torch.norm(ray_diff, dim=-1, keepdim=True)
+        ray_diff_dot = torch.sum(
+            ray2tar_pose * ray2train_pose, dim=-1, keepdim=True)
+        ray_diff_direction = ray_diff / torch.clamp(ray_diff_norm, min=1e-6)
+        ray_diff = torch.cat([ray_diff_direction, ray_diff_dot], dim=-1)
+        ray_diff = ray_diff.reshape((num_views, ) + original_shape + (4, ))
+        return ray_diff
+
+    def compute(self,
+                xyz,
+                train_imgs,
+                train_cameras,
+                featmaps=None,
+                grid_sample=True):
+
+        assert (train_imgs.shape[0] == 1) \
+               and (train_cameras.shape[0] == 1)
+        # only support batch_size=1 for now
+
+        train_imgs = train_imgs.squeeze(0)
+        train_cameras = train_cameras.squeeze(0)
+
+        train_imgs = train_imgs.permute(0, 3, 1, 2)
+        h, w = train_cameras[0][:2]
+
+        # compute the projection of the query points to each reference image
+        pixel_locations, mask_in_front = self.compute_projections(
+            xyz, train_cameras)
+        normalized_pixel_locations = self.normalize(pixel_locations, h, w)
+        # rgb sampling
+        rgbs_sampled = F.grid_sample(
+            train_imgs, normalized_pixel_locations, align_corners=True)
+        rgb_sampled = rgbs_sampled.permute(2, 3, 0, 1)
+
+        # deep feature sampling
+        if featmaps is not None:
+            if grid_sample:
+                feat_sampled = F.grid_sample(
+                    featmaps, normalized_pixel_locations, align_corners=True)
+                feat_sampled = feat_sampled.permute(
+                    2, 3, 0, 1)  # [n_rays, n_samples, n_views, d]
+                rgb_feat_sampled = torch.cat(
+                    [rgb_sampled, feat_sampled],
+                    dim=-1)  # [n_rays, n_samples, n_views, d+3]
+                # rgb_feat_sampled = feat_sampled
+            else:
+                n_images, n_channels, f_h, f_w = featmaps.shape
+                resize_factor = torch.tensor([f_w / w - 1., f_h / h - 1.]).to(
+                    pixel_locations.device)[None, None, :]
+                sample_location = (pixel_locations *
+                                   resize_factor).round().long()
+                n_images, n_ray, n_sample, _ = sample_location.shape
+                sample_x = sample_location[..., 0].view(n_images, -1)
+                sample_y = sample_location[..., 1].view(n_images, -1)
+                valid = (sample_x >= 0) & (sample_y >=
+                                           0) & (sample_x < f_w) & (
+                                               sample_y < f_h)
+                valid = valid * mask_in_front.view(n_images, -1)
+                feat_sampled = torch.zeros(
+                    (n_images, n_channels, sample_x.shape[-1]),
+                    device=featmaps.device)
+                for i in range(n_images):
+                    feat_sampled[i, :,
+                                 valid[i]] = featmaps[i, :, sample_y[i,
+                                                                     valid[i]],
+                                                      sample_y[i, valid[i]]]
+                feat_sampled = feat_sampled.view(n_images, n_channels, n_ray,
+                                                 n_sample)
+                rgb_feat_sampled = feat_sampled.permute(2, 3, 0, 1)
+
+        else:
+            rgb_feat_sampled = None
+        inbound = self.inbound(pixel_locations, h, w)
+        mask = (inbound * mask_in_front).float().permute(
+            1, 2, 0)[..., None]  # [n_rays, n_samples, n_views, 1]
+        return rgb_feat_sampled, mask
diff --git a/mmde/projects/NeRF-Det/nerfdet/nerf_utils/render_ray.py b/mmde/projects/NeRF-Det/nerfdet/nerf_utils/render_ray.py
new file mode 100644
index 0000000000000000000000000000000000000000..76582c57736a8382cce48b0b065c037442a522ac
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/nerf_utils/render_ray.py
@@ -0,0 +1,431 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Attention: This file is mainly modified based on the file with the same
+# name in the original project. For more details, please refer to the
+# origin project.
+from collections import OrderedDict
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+rng = np.random.RandomState(234)
+
+
+# helper functions for nerf ray rendering
+def volume_sampling(sample_pts, features, aabb):
+    B, C, D, W, H = features.shape
+    assert B == 1
+    aabb = torch.Tensor(aabb).to(sample_pts.device)
+    N_rays, N_samples, coords = sample_pts.shape
+    sample_pts = sample_pts.view(1, N_rays * N_samples, 1, 1,
+                                 3).repeat(B, 1, 1, 1, 1)
+    aabbSize = aabb[1] - aabb[0]
+    invgridSize = 1.0 / aabbSize * 2
+    norm_pts = (sample_pts - aabb[0]) * invgridSize - 1
+    sample_features = F.grid_sample(
+        features, norm_pts, align_corners=True, padding_mode='border')
+    masks = ((norm_pts < 1) & (norm_pts > -1)).float().sum(dim=-1)
+    masks = (masks.view(N_rays, N_samples) == 3)
+    return sample_features.view(C, N_rays,
+                                N_samples).permute(1, 2, 0).contiguous(), masks
+
+
+def _compute_projection(img_meta):
+    views = len(img_meta['lidar2img']['extrinsic'])
+    intrinsic = torch.tensor(img_meta['lidar2img']['intrinsic'][:4, :4])
+    ratio = img_meta['ori_shape'][0] / img_meta['img_shape'][0]
+    intrinsic[:2] /= ratio
+    intrinsic = intrinsic.unsqueeze(0).view(1, 16).repeat(views, 1)
+    img_size = torch.Tensor(img_meta['img_shape'][:2]).to(intrinsic.device)
+    img_size = img_size.unsqueeze(0).repeat(views, 1)
+    extrinsics = []
+    for v in range(views):
+        extrinsics.append(
+            torch.Tensor(img_meta['lidar2img']['extrinsic'][v]).to(
+                intrinsic.device))
+    extrinsic = torch.stack(extrinsics).view(views, 16)
+    train_cameras = torch.cat([img_size, intrinsic, extrinsic], dim=-1)
+    return train_cameras.unsqueeze(0)
+
+
+def compute_mask_points(feature, mask):
+    weight = mask / (torch.sum(mask, dim=2, keepdim=True) + 1e-8)
+    mean = torch.sum(feature * weight, dim=2, keepdim=True)
+    var = torch.sum((feature - mean)**2, dim=2, keepdim=True)
+    var = var / (torch.sum(mask, dim=2, keepdim=True) + 1e-8)
+    var = torch.exp(-var)
+    return mean, var
+
+
+def sample_pdf(bins, weights, N_samples, det=False):
+    """Helper function used for sampling.
+
+    Args:
+        bins (tensor):Tensor of shape [N_rays, M+1], M is the number of bins
+        weights (tensor):Tensor of shape [N_rays, M+1], M is the number of bins
+        N_samples (int):Number of samples along each ray
+        det (bool):If True, will perform deterministic sampling
+
+    Returns:
+        samples (tuple): [N_rays, N_samples]
+    """
+
+    M = weights.shape[1]
+    weights += 1e-5
+    # Get pdf
+    pdf = weights / torch.sum(weights, dim=-1, keepdim=True)
+    cdf = torch.cumsum(pdf, dim=-1)
+    cdf = torch.cat([torch.zeros_like(cdf[:, 0:1]), cdf], dim=-1)
+
+    # Take uniform samples
+    if det:
+        u = torch.linspace(0., 1., N_samples, device=bins.device)
+        u = u.unsqueeze(0).repeat(bins.shape[0], 1)
+    else:
+        u = torch.rand(bins.shape[0], N_samples, device=bins.device)
+
+    # Invert CDF
+    above_inds = torch.zeros_like(u, dtype=torch.long)
+    for i in range(M):
+        above_inds += (u >= cdf[:, i:i + 1]).long()
+
+    # random sample inside each bin
+    below_inds = torch.clamp(above_inds - 1, min=0)
+    inds_g = torch.stack((below_inds, above_inds), dim=2)
+
+    cdf = cdf.unsqueeze(1).repeat(1, N_samples, 1)
+    cdf_g = torch.gather(input=cdf, dim=-1, index=inds_g)
+
+    bins = bins.unsqueeze(1).repeat(1, N_samples, 1)
+    bins_g = torch.gather(input=bins, dim=-1, index=inds_g)
+
+    denom = cdf_g[:, :, 1] - cdf_g[:, :, 0]
+    denom = torch.where(denom < 1e-5, torch.ones_like(denom), denom)
+    t = (u - cdf_g[:, :, 0]) / denom
+
+    samples = bins_g[:, :, 0] + t * (bins_g[:, :, 1] - bins_g[:, :, 0])
+
+    return samples
+
+
+def sample_along_camera_ray(ray_o,
+                            ray_d,
+                            depth_range,
+                            N_samples,
+                            inv_uniform=False,
+                            det=False):
+    """Sampling along the camera ray.
+
+    Args:
+        ray_o (tensor): Origin of the ray in scene coordinate system;
+            tensor of shape [N_rays, 3]
+        ray_d (tensor): Homogeneous ray direction vectors in
+            scene coordinate system; tensor of shape [N_rays, 3]
+        depth_range (tuple): [near_depth, far_depth]
+        inv_uniform (bool): If True,uniformly sampling inverse depth.
+        det (bool): If True, will perform deterministic sampling.
+    Returns:
+        pts (tensor): Tensor of shape [N_rays, N_samples, 3]
+        z_vals (tensor): Tensor of shape [N_rays, N_samples]
+    """
+    # will sample inside [near_depth, far_depth]
+    # assume the nearest possible depth is at least (min_ratio * depth)
+    near_depth_value = depth_range[0]
+    far_depth_value = depth_range[1]
+    assert near_depth_value > 0 and far_depth_value > 0 \
+        and far_depth_value > near_depth_value
+
+    near_depth = near_depth_value * torch.ones_like(ray_d[..., 0])
+
+    far_depth = far_depth_value * torch.ones_like(ray_d[..., 0])
+
+    if inv_uniform:
+        start = 1. / near_depth
+        step = (1. / far_depth - start) / (N_samples - 1)
+        inv_z_vals = torch.stack([start + i * step for i in range(N_samples)],
+                                 dim=1)
+        z_vals = 1. / inv_z_vals
+    else:
+        start = near_depth
+        step = (far_depth - near_depth) / (N_samples - 1)
+        z_vals = torch.stack([start + i * step for i in range(N_samples)],
+                             dim=1)
+
+    if not det:
+        # get intervals between samples
+        mids = .5 * (z_vals[:, 1:] + z_vals[:, :-1])
+        upper = torch.cat([mids, z_vals[:, -1:]], dim=-1)
+        lower = torch.cat([z_vals[:, 0:1], mids], dim=-1)
+        # uniform samples in those intervals
+        t_rand = torch.rand_like(z_vals)
+        z_vals = lower + (upper - lower) * t_rand
+
+    ray_d = ray_d.unsqueeze(1).repeat(1, N_samples, 1)
+    ray_o = ray_o.unsqueeze(1).repeat(1, N_samples, 1)
+    pts = z_vals.unsqueeze(2) * ray_d + ray_o  # [N_rays, N_samples, 3]
+    return pts, z_vals
+
+
+# ray rendering of nerf
+def raw2outputs(raw, z_vals, mask, white_bkgd=False):
+    """Transform raw data to outputs:
+
+    Args:
+        raw(tensor):Raw network output.Tensor of shape [N_rays, N_samples, 4]
+        z_vals(tensor):Depth of point samples along rays.
+            Tensor of shape [N_rays, N_samples]
+        ray_d(tensor):[N_rays, 3]
+
+    Returns:
+        ret(dict):
+            -rgb(tensor):[N_rays, 3]
+            -depth(tensor):[N_rays,]
+            -weights(tensor):[N_rays,]
+            -depth_std(tensor):[N_rays,]
+    """
+    rgb = raw[:, :, :3]  # [N_rays, N_samples, 3]
+    sigma = raw[:, :, 3]  # [N_rays, N_samples]
+
+    # note: we did not use the intervals here,
+    # because in practice different scenes from COLMAP can have
+    # very different scales, and using interval can affect
+    # the model's generalization ability.
+    # Therefore we don't use the intervals for both training and evaluation.
+    sigma2alpha = lambda sigma, dists: 1. - torch.exp(-sigma)  # noqa
+
+    # point samples are ordered with increasing depth
+    # interval between samples
+    dists = z_vals[:, 1:] - z_vals[:, :-1]
+    dists = torch.cat((dists, dists[:, -1:]), dim=-1)
+
+    alpha = sigma2alpha(sigma, dists)
+
+    T = torch.cumprod(1. - alpha + 1e-10, dim=-1)[:, :-1]
+    T = torch.cat((torch.ones_like(T[:, 0:1]), T), dim=-1)
+
+    # maths show weights, and summation of weights along a ray,
+    # are always inside [0, 1]
+    weights = alpha * T
+    rgb_map = torch.sum(weights.unsqueeze(2) * rgb, dim=1)
+
+    if white_bkgd:
+        rgb_map = rgb_map + (1. - torch.sum(weights, dim=-1, keepdim=True))
+
+    if mask is not None:
+        mask = mask.float().sum(dim=1) > 8
+
+    depth_map = torch.sum(
+        weights * z_vals, dim=-1) / (
+            torch.sum(weights, dim=-1) + 1e-8)
+    depth_map = torch.clamp(depth_map, z_vals.min(), z_vals.max())
+
+    ret = OrderedDict([('rgb', rgb_map), ('depth', depth_map),
+                       ('weights', weights), ('mask', mask), ('alpha', alpha),
+                       ('z_vals', z_vals), ('transparency', T)])
+
+    return ret
+
+
+def render_rays_func(
+        ray_o,
+        ray_d,
+        mean_volume,
+        cov_volume,
+        features_2D,
+        img,
+        aabb,
+        near_far_range,
+        N_samples,
+        N_rand=4096,
+        nerf_mlp=None,
+        img_meta=None,
+        projector=None,
+        mode='volume',  # volume and image
+        nerf_sample_view=3,
+        inv_uniform=False,
+        N_importance=0,
+        det=False,
+        is_train=True,
+        white_bkgd=False,
+        gt_rgb=None,
+        gt_depth=None):
+
+    ret = {
+        'outputs_coarse': None,
+        'outputs_fine': None,
+        'gt_rgb': gt_rgb,
+        'gt_depth': gt_depth
+    }
+
+    # pts: [N_rays, N_samples, 3]
+    # z_vals: [N_rays, N_samples]
+    pts, z_vals = sample_along_camera_ray(
+        ray_o=ray_o,
+        ray_d=ray_d,
+        depth_range=near_far_range,
+        N_samples=N_samples,
+        inv_uniform=inv_uniform,
+        det=det)
+    N_rays, N_samples = pts.shape[:2]
+
+    if mode == 'image':
+        img = img.permute(0, 2, 3, 1).unsqueeze(0)
+        train_camera = _compute_projection(img_meta).to(img.device)
+        rgb_feat, mask = projector.compute(
+            pts, img, train_camera, features_2D, grid_sample=True)
+        pixel_mask = mask[..., 0].sum(dim=2) > 1
+        mean, var = compute_mask_points(rgb_feat, mask)
+        globalfeat = torch.cat([mean, var], dim=-1).squeeze(2)
+        rgb_pts, density_pts = nerf_mlp(pts, ray_d, globalfeat)
+        raw_coarse = torch.cat([rgb_pts, density_pts], dim=-1)
+        ret['sigma'] = density_pts
+
+    elif mode == 'volume':
+        mean_pts, inbound_masks = volume_sampling(pts, mean_volume, aabb)
+        cov_pts, inbound_masks = volume_sampling(pts, cov_volume, aabb)
+        # This masks is for indicating which points outside of aabb
+        img = img.permute(0, 2, 3, 1).unsqueeze(0)
+        train_camera = _compute_projection(img_meta).to(img.device)
+        _, view_mask = projector.compute(pts, img, train_camera, None)
+        pixel_mask = view_mask[..., 0].sum(dim=2) > 1
+        # plot_3D_vis(pts, aabb, img, train_camera)
+        # [N_rays, N_samples], should at least have 2 observations
+        # This mask is for indicating which points do not have projected point
+        globalpts = torch.cat([mean_pts, cov_pts], dim=-1)
+        rgb_pts, density_pts = nerf_mlp(pts, ray_d, globalpts)
+        density_pts = density_pts * inbound_masks.unsqueeze(dim=-1)
+
+        raw_coarse = torch.cat([rgb_pts, density_pts], dim=-1)
+
+    outputs_coarse = raw2outputs(
+        raw_coarse, z_vals, pixel_mask, white_bkgd=white_bkgd)
+    ret['outputs_coarse'] = outputs_coarse
+
+    return ret
+
+
+def render_rays(
+        ray_batch,
+        mean_volume,
+        cov_volume,
+        features_2D,
+        img,
+        aabb,
+        near_far_range,
+        N_samples,
+        N_rand=4096,
+        nerf_mlp=None,
+        img_meta=None,
+        projector=None,
+        mode='volume',  # volume and image
+        nerf_sample_view=3,
+        inv_uniform=False,
+        N_importance=0,
+        det=False,
+        is_train=True,
+        white_bkgd=False,
+        render_testing=False):
+    """The function of the nerf rendering."""
+
+    ray_o = ray_batch['ray_o']
+    ray_d = ray_batch['ray_d']
+    gt_rgb = ray_batch['gt_rgb']
+    gt_depth = ray_batch['gt_depth']
+    nerf_sizes = ray_batch['nerf_sizes']
+    if is_train:
+        ray_o = ray_o.view(-1, 3)
+        ray_d = ray_d.view(-1, 3)
+        gt_rgb = gt_rgb.view(-1, 3)
+        if gt_depth.shape[1] != 0:
+            gt_depth = gt_depth.view(-1, 1)
+            non_zero_depth = (gt_depth > 0).squeeze(-1)
+            ray_o = ray_o[non_zero_depth]
+            ray_d = ray_d[non_zero_depth]
+            gt_rgb = gt_rgb[non_zero_depth]
+            gt_depth = gt_depth[non_zero_depth]
+        else:
+            gt_depth = None
+        total_rays = ray_d.shape[0]
+        select_inds = rng.choice(total_rays, size=(N_rand, ), replace=False)
+        ray_o = ray_o[select_inds]
+        ray_d = ray_d[select_inds]
+        gt_rgb = gt_rgb[select_inds]
+        if gt_depth is not None:
+            gt_depth = gt_depth[select_inds]
+
+        rets = render_rays_func(
+            ray_o,
+            ray_d,
+            mean_volume,
+            cov_volume,
+            features_2D,
+            img,
+            aabb,
+            near_far_range,
+            N_samples,
+            N_rand,
+            nerf_mlp,
+            img_meta,
+            projector,
+            mode,  # volume and image
+            nerf_sample_view,
+            inv_uniform,
+            N_importance,
+            det,
+            is_train,
+            white_bkgd,
+            gt_rgb,
+            gt_depth)
+
+    elif render_testing:
+        nerf_size = nerf_sizes[0]
+        view_num = ray_o.shape[1]
+        H = nerf_size[0][0]
+        W = nerf_size[0][1]
+        ray_o = ray_o.view(-1, 3)
+        ray_d = ray_d.view(-1, 3)
+        gt_rgb = gt_rgb.view(-1, 3)
+        print(gt_rgb.shape)
+        if len(gt_depth) != 0:
+            gt_depth = gt_depth.view(-1, 1)
+        else:
+            gt_depth = None
+        assert view_num * H * W == ray_o.shape[0]
+        num_rays = ray_o.shape[0]
+        results = []
+        rgbs = []
+        for i in range(0, num_rays, N_rand):
+            ray_o_chunck = ray_o[i:i + N_rand, :]
+            ray_d_chunck = ray_d[i:i + N_rand, :]
+
+            ret = render_rays_func(ray_o_chunck, ray_d_chunck, mean_volume,
+                                   cov_volume, features_2D, img, aabb,
+                                   near_far_range, N_samples, N_rand, nerf_mlp,
+                                   img_meta, projector, mode, nerf_sample_view,
+                                   inv_uniform, N_importance, True, is_train,
+                                   white_bkgd, gt_rgb, gt_depth)
+            results.append(ret)
+
+        rgbs = []
+        depths = []
+
+        if results[0]['outputs_coarse'] is not None:
+            for i in range(len(results)):
+                rgb = results[i]['outputs_coarse']['rgb']
+                rgbs.append(rgb)
+                depth = results[i]['outputs_coarse']['depth']
+                depths.append(depth)
+
+        rets = {
+            'outputs_coarse': {
+                'rgb': torch.cat(rgbs, dim=0).view(view_num, H, W, 3),
+                'depth': torch.cat(depths, dim=0).view(view_num, H, W, 1),
+            },
+            'gt_rgb':
+            gt_rgb.view(view_num, H, W, 3),
+            'gt_depth':
+            gt_depth.view(view_num, H, W, 1) if gt_depth is not None else None,
+        }
+    else:
+        rets = None
+    return rets
diff --git a/mmde/projects/NeRF-Det/nerfdet/nerf_utils/save_rendered_img.py b/mmde/projects/NeRF-Det/nerfdet/nerf_utils/save_rendered_img.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9de3e3107c7569d6ca6e0e34bfa452b9fe101ad
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/nerf_utils/save_rendered_img.py
@@ -0,0 +1,79 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+
+import cv2
+import numpy as np
+import torch
+from skimage.metrics import structural_similarity
+
+
+def compute_psnr_from_mse(mse):
+    return -10.0 * torch.log(mse) / np.log(10.0)
+
+
+def compute_psnr(pred, target, mask=None):
+    """Compute psnr value (we assume the maximum pixel value is 1)."""
+    if mask is not None:
+        pred, target = pred[mask], target[mask]
+    mse = ((pred - target)**2).mean()
+    return compute_psnr_from_mse(mse).cpu().numpy()
+
+
+def compute_ssim(pred, target, mask=None):
+    """Computes Masked SSIM following the neuralbody paper."""
+    assert pred.shape == target.shape and pred.shape[-1] == 3
+    if mask is not None:
+        x, y, w, h = cv2.boundingRect(mask.cpu().numpy().astype(np.uint8))
+        pred = pred[y:y + h, x:x + w]
+        target = target[y:y + h, x:x + w]
+    try:
+        ssim = structural_similarity(
+            pred.cpu().numpy(), target.cpu().numpy(), channel_axis=-1)
+    except ValueError:
+        ssim = structural_similarity(
+            pred.cpu().numpy(), target.cpu().numpy(), multichannel=True)
+    return ssim
+
+
+def save_rendered_img(img_meta, rendered_results):
+    filename = img_meta[0]['filename']
+    scenes = filename.split('/')[-2]
+
+    for ret in rendered_results:
+        depth = ret['outputs_coarse']['depth']
+        rgb = ret['outputs_coarse']['rgb']
+        gt = ret['gt_rgb']
+        gt_depth = ret['gt_depth']
+
+    # save images
+    psnr_total = 0
+    ssim_total = 0
+    rsme = 0
+    for v in range(gt.shape[0]):
+        rsme += ((depth[v] - gt_depth[v])**2).cpu().numpy()
+        depth_ = ((depth[v] - depth[v].min()) /
+                  (depth[v].max() - depth[v].min() + 1e-8)).repeat(1, 1, 3)
+        img_to_save = torch.cat([rgb[v], gt[v], depth_], dim=1)
+        image_path = os.path.join('nerf_vs_rebuttal', scenes)
+        if not os.path.exists(image_path):
+            os.makedirs(image_path)
+        save_dir = os.path.join(image_path, 'view_' + str(v) + '.png')
+
+        font = cv2.FONT_HERSHEY_SIMPLEX
+        org = (50, 50)
+        fontScale = 1
+        color = (255, 0, 0)
+        thickness = 2
+        image = np.uint8(img_to_save.cpu().numpy() * 255.0)
+        psnr = compute_psnr(rgb[v], gt[v], mask=None)
+        psnr_total += psnr
+        ssim = compute_ssim(rgb[v], gt[v], mask=None)
+        ssim_total += ssim
+        image = cv2.putText(
+            image, 'PSNR: ' + '%.2f' % compute_psnr(rgb[v], gt[v], mask=None),
+            org, font, fontScale, color, thickness, cv2.LINE_AA)
+
+        cv2.imwrite(save_dir, image)
+
+    return psnr_total / gt.shape[0], ssim_total / gt.shape[0], rsme / gt.shape[
+        0]
diff --git a/mmde/projects/NeRF-Det/nerfdet/nerfdet.py b/mmde/projects/NeRF-Det/nerfdet/nerfdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee66387cb5dbd67aedab39e7317de1a2efbce95d
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/nerfdet.py
@@ -0,0 +1,632 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet3d.models.detectors import Base3DDetector
+from mmdet3d.registry import MODELS, TASK_UTILS
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils import ConfigType, OptConfigType
+from .nerf_utils.nerf_mlp import VanillaNeRF
+from .nerf_utils.projection import Projector
+from .nerf_utils.render_ray import render_rays
+
+# from ..utils.nerf_utils.save_rendered_img import save_rendered_img
+
+
+@MODELS.register_module()
+class NerfDet(Base3DDetector):
+    r"""`ImVoxelNet <https://arxiv.org/abs/2307.14620>`_.
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone config.
+        neck (:obj:`ConfigDict` or dict): The neck config.
+        neck_3d(:obj:`ConfigDict` or dict): The 3D neck config.
+        bbox_head(:obj:`ConfigDict` or dict): The bbox head config.
+        prior_generator (:obj:`ConfigDict` or dict): The prior generator
+            config.
+        n_voxels (list): Number of voxels along x, y, z axis.
+        voxel_size (list): The size of voxels.Each voxel represents
+            a cube of `voxel_size[0]` meters, `voxel_size[1]` meters,
+            ``
+        train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
+            training hyper-parameters. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
+            hyper-parameters. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
+            config. Defaults to None.
+        render_testing (bool): If you want to render novel view, please set
+            "render_testing = True" in config
+        The other args are the parameters of NeRF, you can just use the
+            default values.
+    """
+
+    def __init__(
+            self,
+            backbone: ConfigType,
+            neck: ConfigType,
+            neck_3d: ConfigType,
+            bbox_head: ConfigType,
+            prior_generator: ConfigType,
+            n_voxels: List,
+            voxel_size: List,
+            head_2d: ConfigType = None,
+            train_cfg: OptConfigType = None,
+            test_cfg: OptConfigType = None,
+            data_preprocessor: OptConfigType = None,
+            init_cfg: OptConfigType = None,
+            #  pretrained,
+            aabb: Tuple = None,
+            near_far_range: List = None,
+            N_samples: int = 64,
+            N_rand: int = 2048,
+            depth_supervise: bool = False,
+            use_nerf_mask: bool = True,
+            nerf_sample_view: int = 3,
+            nerf_mode: str = 'volume',
+            squeeze_scale: int = 4,
+            rgb_supervision: bool = True,
+            nerf_density: bool = False,
+            render_testing: bool = False):
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+        self.neck = MODELS.build(neck)
+        self.neck_3d = MODELS.build(neck_3d)
+        bbox_head.update(train_cfg=train_cfg)
+        bbox_head.update(test_cfg=test_cfg)
+        self.bbox_head = MODELS.build(bbox_head)
+        self.head_2d = MODELS.build(head_2d) if head_2d is not None else None
+        self.n_voxels = n_voxels
+        self.prior_generator = TASK_UTILS.build(prior_generator)
+        self.voxel_size = voxel_size
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.aabb = aabb
+        self.near_far_range = near_far_range
+        self.N_samples = N_samples
+        self.N_rand = N_rand
+        self.depth_supervise = depth_supervise
+        self.projector = Projector()
+        self.squeeze_scale = squeeze_scale
+        self.use_nerf_mask = use_nerf_mask
+        self.rgb_supervision = rgb_supervision
+        nerf_feature_dim = neck['out_channels'] // squeeze_scale
+        self.nerf_mlp = VanillaNeRF(
+            net_depth=4,  # The depth of the MLP
+            net_width=256,  # The width of the MLP
+            skip_layer=3,  # The layer to add skip layers to.
+            feature_dim=nerf_feature_dim + 6,  # + RGB original imgs
+            net_depth_condition=1,  # The depth of the second part of MLP
+            net_width_condition=128)
+        self.nerf_mode = nerf_mode
+        self.nerf_density = nerf_density
+        self.nerf_sample_view = nerf_sample_view
+        self.render_testing = render_testing
+
+        # hard code here, will deal with batch issue later.
+        self.cov = nn.Sequential(
+            nn.Conv3d(
+                neck['out_channels'],
+                neck['out_channels'],
+                kernel_size=3,
+                padding=1), nn.ReLU(inplace=True),
+            nn.Conv3d(
+                neck['out_channels'],
+                neck['out_channels'],
+                kernel_size=3,
+                padding=1), nn.ReLU(inplace=True),
+            nn.Conv3d(neck['out_channels'], 1, kernel_size=1))
+
+        self.mean_mapping = nn.Sequential(
+            nn.Conv3d(
+                neck['out_channels'], nerf_feature_dim // 2, kernel_size=1))
+
+        self.cov_mapping = nn.Sequential(
+            nn.Conv3d(
+                neck['out_channels'], nerf_feature_dim // 2, kernel_size=1))
+
+        self.mapping = nn.Sequential(
+            nn.Linear(neck['out_channels'], nerf_feature_dim // 2))
+
+        self.mapping_2d = nn.Sequential(
+            nn.Conv2d(
+                neck['out_channels'], nerf_feature_dim // 2, kernel_size=1))
+        # self.overfit_nerfmlp = overfit_nerfmlp
+        # if self.overfit_nerfmlp:
+        #     self. _finetuning_NeRF_MLP()
+        self.render_testing = render_testing
+
+    def extract_feat(self,
+                     batch_inputs_dict: dict,
+                     batch_data_samples: SampleList,
+                     mode,
+                     depth=None,
+                     ray_batch=None):
+        """Extract 3d features from the backbone -> fpn -> 3d projection.
+
+        -> 3d neck -> bbox_head.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+
+                    - imgs (torch.Tensor, optional): Image of each sample.
+            batch_data_samples (list[:obj:`DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instances` of `gt_panoptic_seg` or `gt_sem_seg`
+
+        Returns:
+            Tuple:
+            - torch.Tensor: Features of shape (N, C_out, N_x, N_y, N_z).
+            - torch.Tensor: Valid mask of shape (N, 1, N_x, N_y, N_z).
+            - torch.Tensor: 2D features if needed.
+            - dict: The nerf rendered information including the
+                'output_coarse', 'gt_rgb' and 'gt_depth' keys.
+        """
+        img = batch_inputs_dict['imgs']
+        img = img.float()
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        batch_size = img.shape[0]
+
+        if len(img.shape) > 4:
+            img = img.reshape([-1] + list(img.shape)[2:])
+            x = self.backbone(img)
+            x = self.neck(x)[0]
+            x = x.reshape([batch_size, -1] + list(x.shape[1:]))
+        else:
+            x = self.backbone(img)
+            x = self.neck(x)[0]
+
+        if depth is not None:
+            depth_bs = depth.shape[0]
+            assert depth_bs == batch_size
+            depth = batch_inputs_dict['depth']
+            depth = depth.reshape([-1] + list(depth.shape)[2:])
+
+        features_2d = self.head_2d.forward(x[-1], batch_img_metas) \
+            if self.head_2d is not None else None
+
+        stride = img.shape[-1] / x.shape[-1]
+        assert stride == 4
+        stride = int(stride)
+
+        volumes, valids = [], []
+        rgb_preds = []
+
+        for feature, img_meta in zip(x, batch_img_metas):
+            angles = features_2d[
+                0] if features_2d is not None and mode == 'test' else None
+            projection = self._compute_projection(img_meta, stride,
+                                                  angles).to(x.device)
+            points = get_points(
+                n_voxels=torch.tensor(self.n_voxels),
+                voxel_size=torch.tensor(self.voxel_size),
+                origin=torch.tensor(img_meta['lidar2img']['origin'])).to(
+                    x.device)
+
+            height = img_meta['img_shape'][0] // stride
+            width = img_meta['img_shape'][1] // stride
+            # Construct the volume space
+            # volume together with valid is the constructed scene
+            # volume represents V_i and valid represents M_p
+            volume, valid = backproject(feature[:, :, :height, :width], points,
+                                        projection, depth, self.voxel_size)
+            density = None
+            volume_sum = volume.sum(dim=0)
+            # cov_valid = valid.clone().detach()
+            valid = valid.sum(dim=0)
+            volume_mean = volume_sum / (valid + 1e-8)
+            volume_mean[:, valid[0] == 0] = .0
+            # volume_cov = (volume - volume_mean.unsqueeze(0)) ** 2 * cov_valid
+            # volume_cov = torch.sum(volume_cov, dim=0) / (valid + 1e-8)
+            volume_cov = torch.sum(
+                (volume - volume_mean.unsqueeze(0))**2, dim=0) / (
+                    valid + 1e-8)
+            volume_cov[:, valid[0] == 0] = 1e6
+            volume_cov = torch.exp(-volume_cov)  # default setting
+            # be careful here, the smaller the cov, the larger the weight.
+            n_channels, n_x_voxels, n_y_voxels, n_z_voxels = volume_mean.shape
+            if ray_batch is not None:
+                if self.nerf_mode == 'volume':
+                    mean_volume = self.mean_mapping(volume_mean.unsqueeze(0))
+                    cov_volume = self.cov_mapping(volume_cov.unsqueeze(0))
+                    feature_2d = feature[:, :, :height, :width]
+
+                elif self.nerf_mode == 'image':
+                    mean_volume = None
+                    cov_volume = None
+                    feature_2d = feature[:, :, :height, :width]
+                    n_v, C, height, width = feature_2d.shape
+                    feature_2d = feature_2d.view(n_v, C,
+                                                 -1).permute(0, 2,
+                                                             1).contiguous()
+                    feature_2d = self.mapping(feature_2d).permute(
+                        0, 2, 1).contiguous().view(n_v, -1, height, width)
+
+                denorm_images = ray_batch['denorm_images']
+                denorm_images = denorm_images.reshape(
+                    [-1] + list(denorm_images.shape)[2:])
+                rgb_projection = self._compute_projection(
+                    img_meta, stride=1, angles=None).to(x.device)
+
+                rgb_volume, _ = backproject(
+                    denorm_images[:, :, :img_meta['img_shape'][0], :
+                                  img_meta['img_shape'][1]], points,
+                    rgb_projection, depth, self.voxel_size)
+
+                ret = render_rays(
+                    ray_batch,
+                    mean_volume,
+                    cov_volume,
+                    feature_2d,
+                    denorm_images,
+                    self.aabb,
+                    self.near_far_range,
+                    self.N_samples,
+                    self.N_rand,
+                    self.nerf_mlp,
+                    img_meta,
+                    self.projector,
+                    self.nerf_mode,
+                    self.nerf_sample_view,
+                    is_train=True if mode == 'train' else False,
+                    render_testing=self.render_testing)
+                rgb_preds.append(ret)
+
+                if self.nerf_density:
+                    # would have 0 bias issue for mean_mapping.
+                    n_v, C, n_x_voxels, n_y_voxels, n_z_voxels = volume.shape
+                    volume = volume.view(n_v, C, -1).permute(0, 2,
+                                                             1).contiguous()
+                    mapping_volume = self.mapping(volume).permute(
+                        0, 2, 1).contiguous().view(n_v, -1, n_x_voxels,
+                                                   n_y_voxels, n_z_voxels)
+
+                    mapping_volume = torch.cat([rgb_volume, mapping_volume],
+                                               dim=1)
+                    mapping_volume_sum = mapping_volume.sum(dim=0)
+                    mapping_volume_mean = mapping_volume_sum / (valid + 1e-8)
+                    # mapping_volume_cov = (
+                    #         mapping_volume - mapping_volume_mean.unsqueeze(0)
+                    #     ) ** 2 * cov_valid
+                    mapping_volume_cov = (mapping_volume -
+                                          mapping_volume_mean.unsqueeze(0))**2
+                    mapping_volume_cov = torch.sum(
+                        mapping_volume_cov, dim=0) / (
+                            valid + 1e-8)
+                    mapping_volume_cov[:, valid[0] == 0] = 1e6
+                    mapping_volume_cov = torch.exp(
+                        -mapping_volume_cov)  # default setting
+                    global_volume = torch.cat(
+                        [mapping_volume_mean, mapping_volume_cov], dim=1)
+                    global_volume = global_volume.view(
+                        -1, n_x_voxels * n_y_voxels * n_z_voxels).permute(
+                            1, 0).contiguous()
+                    points = points.view(3, -1).permute(1, 0).contiguous()
+                    density = self.nerf_mlp.query_density(
+                        points, global_volume)
+                    alpha = 1 - torch.exp(-density)
+                    # density -> alpha
+                    # (1, n_x_voxels, n_y_voxels, n_z_voxels)
+                volume = alpha.view(1, n_x_voxels, n_y_voxels,
+                                    n_z_voxels) * volume_mean
+                volume[:, valid[0] == 0] = .0
+
+            volumes.append(volume)
+            valids.append(valid)
+        x = torch.stack(volumes)
+        x = self.neck_3d(x)
+
+        return x, torch.stack(valids).float(), features_2d, rgb_preds
+
+    def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+             **kwargs) -> Union[dict, list]:
+        """Calculate losses from a batch of inputs and data samples.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+
+                    - imgs (torch.Tensor, optional): Image of each sample.
+            batch_data_samples (list[:obj: `DetDataSample`]): The batch
+                data samples. It usually includes information such
+                as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        ray_batchs = {}
+        batch_images = []
+        batch_depths = []
+        if 'images' in batch_data_samples[0].gt_nerf_images:
+            for data_samples in batch_data_samples:
+                image = data_samples.gt_nerf_images['images']
+                batch_images.append(image)
+        batch_images = torch.stack(batch_images)
+
+        if 'depths' in batch_data_samples[0].gt_nerf_depths:
+            for data_samples in batch_data_samples:
+                depth = data_samples.gt_nerf_depths['depths']
+                batch_depths.append(depth)
+        batch_depths = torch.stack(batch_depths)
+
+        if 'raydirs' in batch_inputs_dict.keys():
+            ray_batchs['ray_o'] = batch_inputs_dict['lightpos']
+            ray_batchs['ray_d'] = batch_inputs_dict['raydirs']
+            ray_batchs['gt_rgb'] = batch_images
+            ray_batchs['gt_depth'] = batch_depths
+            ray_batchs['nerf_sizes'] = batch_inputs_dict['nerf_sizes']
+            ray_batchs['denorm_images'] = batch_inputs_dict['denorm_images']
+            x, valids, features_2d, rgb_preds = self.extract_feat(
+                batch_inputs_dict,
+                batch_data_samples,
+                'train',
+                depth=None,
+                ray_batch=ray_batchs)
+        else:
+            x, valids, features_2d, rgb_preds = self.extract_feat(
+                batch_inputs_dict, batch_data_samples, 'train')
+        x += (valids, )
+        losses = self.bbox_head.loss(x, batch_data_samples, **kwargs)
+
+        # if self.head_2d is not None:
+        #     losses.update(
+        #         self.head_2d.loss(*features_2d, batch_data_samples)
+        #     )
+        if len(ray_batchs) != 0 and self.rgb_supervision:
+            losses.update(self.nvs_loss_func(rgb_preds))
+        if self.depth_supervise:
+            losses.update(self.depth_loss_func(rgb_preds))
+        return losses
+
+    def nvs_loss_func(self, rgb_pred):
+        loss = 0
+        for ret in rgb_pred:
+            rgb = ret['outputs_coarse']['rgb']
+            gt = ret['gt_rgb']
+            masks = ret['outputs_coarse']['mask']
+            if self.use_nerf_mask:
+                loss += torch.sum(masks.unsqueeze(-1) * (rgb - gt)**2) / (
+                    masks.sum() + 1e-6)
+            else:
+                loss += torch.mean((rgb - gt)**2)
+        return dict(loss_nvs=loss)
+
+    def depth_loss_func(self, rgb_pred):
+        loss = 0
+        for ret in rgb_pred:
+            depth = ret['outputs_coarse']['depth']
+            gt = ret['gt_depth'].squeeze(-1)
+            masks = ret['outputs_coarse']['mask']
+            if self.use_nerf_mask:
+                loss += torch.sum(masks * torch.abs(depth - gt)) / (
+                    masks.sum() + 1e-6)
+            else:
+                loss += torch.mean(torch.abs(depth - gt))
+
+        return dict(loss_depth=loss)
+
+    def predict(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+                **kwargs) -> SampleList:
+        """Predict results from a batch of inputs and data samples with post-
+        processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`NeRFDet3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
+
+        Returns:
+            list[:obj:`NeRFDet3DDataSample`]: Detection results of the
+            input images. Each NeRFDet3DDataSample usually contain
+            'pred_instances_3d'. And the ``pred_instances_3d`` usually
+            contains following keys.
+
+                - scores_3d (Tensor): Classification scores, has a shape
+                    (num_instance, )
+                - labels_3d (Tensor): Labels of bboxes, has a shape
+                    (num_instances, ).
+                - bboxes_3d (Tensor): Contains a tensor with shape
+                    (num_instances, C) where C = 6.
+        """
+        ray_batchs = {}
+        batch_images = []
+        batch_depths = []
+        if 'images' in batch_data_samples[0].gt_nerf_images:
+            for data_samples in batch_data_samples:
+                image = data_samples.gt_nerf_images['images']
+                batch_images.append(image)
+        batch_images = torch.stack(batch_images)
+
+        if 'depths' in batch_data_samples[0].gt_nerf_depths:
+            for data_samples in batch_data_samples:
+                depth = data_samples.gt_nerf_depths['depths']
+                batch_depths.append(depth)
+        batch_depths = torch.stack(batch_depths)
+
+        if 'raydirs' in batch_inputs_dict.keys():
+            ray_batchs['ray_o'] = batch_inputs_dict['lightpos']
+            ray_batchs['ray_d'] = batch_inputs_dict['raydirs']
+            ray_batchs['gt_rgb'] = batch_images
+            ray_batchs['gt_depth'] = batch_depths
+            ray_batchs['nerf_sizes'] = batch_inputs_dict['nerf_sizes']
+            ray_batchs['denorm_images'] = batch_inputs_dict['denorm_images']
+            x, valids, features_2d, rgb_preds = self.extract_feat(
+                batch_inputs_dict,
+                batch_data_samples,
+                'test',
+                depth=None,
+                ray_batch=ray_batchs)
+        else:
+            x, valids, features_2d, rgb_preds = self.extract_feat(
+                batch_inputs_dict, batch_data_samples, 'test')
+        x += (valids, )
+        results_list = self.bbox_head.predict(x, batch_data_samples, **kwargs)
+        predictions = self.add_pred_to_datasample(batch_data_samples,
+                                                  results_list)
+        return predictions
+
+    def _forward(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
+                 *args, **kwargs) -> Tuple[List[torch.Tensor]]:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            batch_inputs_dict (dict): The model input dict which include
+                the 'imgs' key.
+
+                    - imgs (torch.Tensor, optional): Image of each sample.
+
+            batch_data_samples (List[:obj:`NeRFDet3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`
+
+        Returns:
+            tuple[list]: A tuple of features from ``bbox_head`` forward
+        """
+        ray_batchs = {}
+        batch_images = []
+        batch_depths = []
+        if 'images' in batch_data_samples[0].gt_nerf_images:
+            for data_samples in batch_data_samples:
+                image = data_samples.gt_nerf_images['images']
+                batch_images.append(image)
+        batch_images = torch.stack(batch_images)
+
+        if 'depths' in batch_data_samples[0].gt_nerf_depths:
+            for data_samples in batch_data_samples:
+                depth = data_samples.gt_nerf_depths['depths']
+                batch_depths.append(depth)
+        batch_depths = torch.stack(batch_depths)
+        if 'raydirs' in batch_inputs_dict.keys():
+            ray_batchs['ray_o'] = batch_inputs_dict['lightpos']
+            ray_batchs['ray_d'] = batch_inputs_dict['raydirs']
+            ray_batchs['gt_rgb'] = batch_images
+            ray_batchs['gt_depth'] = batch_depths
+            ray_batchs['nerf_sizes'] = batch_inputs_dict['nerf_sizes']
+            ray_batchs['denorm_images'] = batch_inputs_dict['denorm_images']
+            x, valids, features_2d, rgb_preds = self.extract_feat(
+                batch_inputs_dict,
+                batch_data_samples,
+                'train',
+                depth=None,
+                ray_batch=ray_batchs)
+        else:
+            x, valids, features_2d, rgb_preds = self.extract_feat(
+                batch_inputs_dict, batch_data_samples, 'train')
+        x += (valids, )
+        results = self.bbox_head.forward(x)
+        return results
+
+    def aug_test(self, batch_inputs_dict, batch_data_samples):
+        pass
+
+    def show_results(self, *args, **kwargs):
+        pass
+
+    @staticmethod
+    def _compute_projection(img_meta, stride, angles):
+        projection = []
+        intrinsic = torch.tensor(img_meta['lidar2img']['intrinsic'][:3, :3])
+        ratio = img_meta['ori_shape'][0] / (img_meta['img_shape'][0] / stride)
+        intrinsic[:2] /= ratio
+        # use predict pitch and roll for SUNRGBDTotal test
+        if angles is not None:
+            extrinsics = []
+            for angle in angles:
+                extrinsics.append(get_extrinsics(angle).to(intrinsic.device))
+        else:
+            extrinsics = map(torch.tensor, img_meta['lidar2img']['extrinsic'])
+        for extrinsic in extrinsics:
+            projection.append(intrinsic @ extrinsic[:3])
+        return torch.stack(projection)
+
+
+@torch.no_grad()
+def get_points(n_voxels, voxel_size, origin):
+    # origin: point-cloud center.
+    points = torch.stack(
+        torch.meshgrid([
+            torch.arange(n_voxels[0]),  # 40 W width, x
+            torch.arange(n_voxels[1]),  # 40 D depth, y
+            torch.arange(n_voxels[2])  # 16 H Height, z
+        ]))
+    new_origin = origin - n_voxels / 2. * voxel_size
+    points = points * voxel_size.view(3, 1, 1, 1) + new_origin.view(3, 1, 1, 1)
+    return points
+
+
+# modify from https://github.com/magicleap/Atlas/blob/master/atlas/model.py
+def backproject(features, points, projection, depth, voxel_size):
+    n_images, n_channels, height, width = features.shape
+    n_x_voxels, n_y_voxels, n_z_voxels = points.shape[-3:]
+    points = points.view(1, 3, -1).expand(n_images, 3, -1)
+    points = torch.cat((points, torch.ones_like(points[:, :1])), dim=1)
+    points_2d_3 = torch.bmm(projection, points)
+
+    x = (points_2d_3[:, 0] / points_2d_3[:, 2]).round().long()
+    y = (points_2d_3[:, 1] / points_2d_3[:, 2]).round().long()
+    z = points_2d_3[:, 2]
+    valid = (x >= 0) & (y >= 0) & (x < width) & (y < height) & (z > 0)
+    # below is using depth to sample feature
+    if depth is not None:
+        depth = F.interpolate(
+            depth.unsqueeze(1), size=(height, width),
+            mode='bilinear').squeeze(1)
+        for i in range(n_images):
+            z_mask = z.clone() > 0
+            z_mask[i, valid[i]] = \
+                (z[i, valid[i]] > depth[i, y[i, valid[i]], x[i, valid[i]]] - voxel_size[-1]) & \
+                (z[i, valid[i]] < depth[i, y[i, valid[i]], x[i, valid[i]]] + voxel_size[-1]) # noqa
+            valid = valid & z_mask
+
+    volume = torch.zeros((n_images, n_channels, points.shape[-1]),
+                         device=features.device)
+    for i in range(n_images):
+        volume[i, :, valid[i]] = features[i, :, y[i, valid[i]], x[i, valid[i]]]
+    volume = volume.view(n_images, n_channels, n_x_voxels, n_y_voxels,
+                         n_z_voxels)
+    valid = valid.view(n_images, 1, n_x_voxels, n_y_voxels, n_z_voxels)
+
+    return volume, valid
+
+
+# for SUNRGBDTotal test
+def get_extrinsics(angles):
+    yaw = angles.new_zeros(())
+    pitch, roll = angles
+    r = angles.new_zeros((3, 3))
+    r[0, 0] = torch.cos(yaw) * torch.cos(pitch)
+    r[0, 1] = torch.sin(yaw) * torch.sin(roll) - torch.cos(yaw) * torch.cos(
+        roll) * torch.sin(pitch)
+    r[0, 2] = torch.cos(roll) * torch.sin(yaw) + torch.cos(yaw) * torch.sin(
+        pitch) * torch.sin(roll)
+    r[1, 0] = torch.sin(pitch)
+    r[1, 1] = torch.cos(pitch) * torch.cos(roll)
+    r[1, 2] = -torch.cos(pitch) * torch.sin(roll)
+    r[2, 0] = -torch.cos(pitch) * torch.sin(yaw)
+    r[2, 1] = torch.cos(yaw) * torch.sin(roll) + torch.cos(roll) * torch.sin(
+        yaw) * torch.sin(pitch)
+    r[2, 2] = torch.cos(yaw) * torch.cos(roll) - torch.sin(yaw) * torch.sin(
+        pitch) * torch.sin(roll)
+
+    # follow Total3DUnderstanding
+    t = angles.new_tensor([[0., 0., 1.], [0., -1., 0.], [-1., 0., 0.]])
+    r = t @ r.T
+    # follow DepthInstance3DBoxes
+    r = r[:, [2, 0, 1]]
+    r[2] *= -1
+    extrinsic = angles.new_zeros((4, 4))
+    extrinsic[:3, :3] = r
+    extrinsic[3, 3] = 1.
+    return extrinsic
diff --git a/mmde/projects/NeRF-Det/nerfdet/nerfdet_head.py b/mmde/projects/NeRF-Det/nerfdet/nerfdet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5faa0adc115339ccc8aab912f281b9c8e47dbf1
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/nerfdet_head.py
@@ -0,0 +1,629 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from mmcv.cnn import Scale
+# from mmcv.ops import nms3d, nms3d_normal
+from mmdet.models.utils import multi_apply
+from mmdet.utils import reduce_mean
+# from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, bias_init_with_prob, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor, nn
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+# from mmdet3d.structures.bbox_3d.utils import rotation_3d_in_axis
+from mmdet3d.structures.det3d_data_sample import SampleList
+from mmdet3d.utils.typing_utils import (ConfigType, InstanceList,
+                                        OptConfigType, OptInstanceList)
+
+
+@torch.no_grad()
+def get_points(n_voxels, voxel_size, origin):
+    # origin: point-cloud center.
+    points = torch.stack(
+        torch.meshgrid([
+            torch.arange(n_voxels[0]),  # 40 W width, x
+            torch.arange(n_voxels[1]),  # 40 D depth, y
+            torch.arange(n_voxels[2])  # 16 H Height, z
+        ]))
+    new_origin = origin - n_voxels / 2. * voxel_size
+    points = points * voxel_size.view(3, 1, 1, 1) + new_origin.view(3, 1, 1, 1)
+    return points
+
+
+@MODELS.register_module()
+class NerfDetHead(BaseModule):
+    r"""`ImVoxelNet<https://arxiv.org/abs/2106.01178>`_ head for indoor
+    datasets.
+
+    Args:
+        n_classes (int): Number of classes.
+        n_levels (int): Number of feature levels.
+        n_channels (int): Number of channels in input tensors.
+        n_reg_outs (int): Number of regression layer channels.
+        pts_assign_threshold (int): Min number of location per box to
+            be assigned with.
+        pts_center_threshold (int): Max number of locations per box to
+            be assigned with.
+        center_loss (dict, optional): Config of centerness loss.
+            Default: dict(type='CrossEntropyLoss', use_sigmoid=True).
+        bbox_loss (dict, optional): Config of bbox loss.
+            Default: dict(type='RotatedIoU3DLoss').
+        cls_loss (dict, optional): Config of classification loss.
+            Default: dict(type='FocalLoss').
+        train_cfg (dict, optional): Config for train stage. Defaults to None.
+        test_cfg (dict, optional): Config for test stage. Defaults to None.
+        init_cfg (dict, optional): Config for weight initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 n_classes: int,
+                 n_levels: int,
+                 n_channels: int,
+                 n_reg_outs: int,
+                 pts_assign_threshold: int,
+                 pts_center_threshold: int,
+                 prior_generator: ConfigType,
+                 center_loss: ConfigType = dict(
+                     type='mmdet.CrossEntropyLoss', use_sigmoid=True),
+                 bbox_loss: ConfigType = dict(type='RotatedIoU3DLoss'),
+                 cls_loss: ConfigType = dict(type='mmdet.FocalLoss'),
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptConfigType = None):
+        super(NerfDetHead, self).__init__(init_cfg)
+        self.n_classes = n_classes
+        self.n_levels = n_levels
+        self.n_reg_outs = n_reg_outs
+        self.pts_assign_threshold = pts_assign_threshold
+        self.pts_center_threshold = pts_center_threshold
+        self.prior_generator = TASK_UTILS.build(prior_generator)
+        self.center_loss = MODELS.build(center_loss)
+        self.bbox_loss = MODELS.build(bbox_loss)
+        self.cls_loss = MODELS.build(cls_loss)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self._init_layers(n_channels, n_reg_outs, n_classes, n_levels)
+
+    def _init_layers(self, n_channels, n_reg_outs, n_classes, n_levels):
+        """Initialize neural network layers of the head."""
+        self.conv_center = nn.Conv3d(n_channels, 1, 3, padding=1, bias=False)
+        self.conv_reg = nn.Conv3d(
+            n_channels, n_reg_outs, 3, padding=1, bias=False)
+        self.conv_cls = nn.Conv3d(n_channels, n_classes, 3, padding=1)
+        self.scales = nn.ModuleList([Scale(1.) for _ in range(n_levels)])
+
+    def init_weights(self):
+        """Initialize all layer weights."""
+        normal_init(self.conv_center, std=.01)
+        normal_init(self.conv_reg, std=.01)
+        normal_init(self.conv_cls, std=.01, bias=bias_init_with_prob(.01))
+
+    def _forward_single(self, x: Tensor, scale: Scale):
+        """Forward pass per level.
+
+        Args:
+            x (Tensor): Per level 3d neck output tensor.
+            scale (mmcv.cnn.Scale): Per level multiplication weight.
+
+        Returns:
+            tuple[Tensor]: Centerness, bbox and classification predictions.
+        """
+        return (self.conv_center(x), torch.exp(scale(self.conv_reg(x))),
+                self.conv_cls(x))
+
+    def forward(self, x):
+        return multi_apply(self._forward_single, x, self.scales)
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
+             **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            batch_data_samples (List[:obj:`NeRFDet3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        valid_pred = x[-1]
+        outs = self(x[:-1])
+
+        batch_gt_instances_3d = []
+        batch_gt_instances_ignore = []
+        batch_input_metas = []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+            batch_gt_instances_ignore.append(
+                data_sample.get('ignored_instances', None))
+
+        loss_inputs = outs + (valid_pred, batch_gt_instances_3d,
+                              batch_input_metas, batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    def loss_by_feat(self,
+                     center_preds: List[List[Tensor]],
+                     bbox_preds: List[List[Tensor]],
+                     cls_preds: List[List[Tensor]],
+                     valid_pred: Tensor,
+                     batch_gt_instances_3d: InstanceList,
+                     batch_input_metas: List[dict],
+                     batch_gt_instances_ignore: OptInstanceList = None,
+                     **kwargs) -> dict:
+        """Per scene loss function.
+
+        Args:
+            center_preds (list[list[Tensor]]): Centerness predictions for
+                all scenes. The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
+                The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            cls_preds (list[list[Tensor]]): Classification predictions for all
+                scenes. The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            valid_pred (Tensor): Valid mask prediction for all scenes.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、`
+                `labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_input_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: Centerness, bbox, and classification loss values.
+        """
+        valid_preds = self._upsample_valid_preds(valid_pred, center_preds)
+        center_losses, bbox_losses, cls_losses = [], [], []
+        for i in range(len(batch_input_metas)):
+            center_loss, bbox_loss, cls_loss = self._loss_by_feat_single(
+                center_preds=[x[i] for x in center_preds],
+                bbox_preds=[x[i] for x in bbox_preds],
+                cls_preds=[x[i] for x in cls_preds],
+                valid_preds=[x[i] for x in valid_preds],
+                input_meta=batch_input_metas[i],
+                gt_bboxes=batch_gt_instances_3d[i].bboxes_3d,
+                gt_labels=batch_gt_instances_3d[i].labels_3d)
+            center_losses.append(center_loss)
+            bbox_losses.append(bbox_loss)
+            cls_losses.append(cls_loss)
+        return dict(
+            center_loss=torch.mean(torch.stack(center_losses)),
+            bbox_loss=torch.mean(torch.stack(bbox_losses)),
+            cls_loss=torch.mean(torch.stack(cls_losses)))
+
+    def _loss_by_feat_single(self, center_preds, bbox_preds, cls_preds,
+                             valid_preds, input_meta, gt_bboxes, gt_labels):
+        featmap_sizes = [featmap.size()[-3:] for featmap in center_preds]
+        points = self._get_points(
+            featmap_sizes=featmap_sizes,
+            origin=input_meta['lidar2img']['origin'],
+            device=gt_bboxes.device)
+        center_targets, bbox_targets, cls_targets = self._get_targets(
+            points, gt_bboxes, gt_labels)
+
+        center_preds = torch.cat(
+            [x.permute(1, 2, 3, 0).reshape(-1) for x in center_preds])
+        bbox_preds = torch.cat([
+            x.permute(1, 2, 3, 0).reshape(-1, x.shape[0]) for x in bbox_preds
+        ])
+        cls_preds = torch.cat(
+            [x.permute(1, 2, 3, 0).reshape(-1, x.shape[0]) for x in cls_preds])
+        valid_preds = torch.cat(
+            [x.permute(1, 2, 3, 0).reshape(-1) for x in valid_preds])
+        points = torch.cat(points)
+
+        # cls loss
+        pos_inds = torch.nonzero(
+            torch.logical_and(cls_targets >= 0, valid_preds)).squeeze(1)
+        n_pos = points.new_tensor(len(pos_inds))
+        n_pos = max(reduce_mean(n_pos), 1.)
+        if torch.any(valid_preds):
+            cls_loss = self.cls_loss(
+                cls_preds[valid_preds],
+                cls_targets[valid_preds],
+                avg_factor=n_pos)
+        else:
+            cls_loss = cls_preds[valid_preds].sum()
+
+        # bbox and centerness losses
+        pos_center_preds = center_preds[pos_inds]
+        pos_bbox_preds = bbox_preds[pos_inds]
+        if len(pos_inds) > 0:
+            pos_center_targets = center_targets[pos_inds]
+            pos_bbox_targets = bbox_targets[pos_inds]
+            pos_points = points[pos_inds]
+            center_loss = self.center_loss(
+                pos_center_preds, pos_center_targets, avg_factor=n_pos)
+            bbox_loss = self.bbox_loss(
+                self._bbox_pred_to_bbox(pos_points, pos_bbox_preds),
+                pos_bbox_targets,
+                weight=pos_center_targets,
+                avg_factor=pos_center_targets.sum())
+        else:
+            center_loss = pos_center_preds.sum()
+            bbox_loss = pos_bbox_preds.sum()
+        return center_loss, bbox_loss, cls_loss
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the 3D detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`NeRFDet3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`, `gt_pts_panoptic_seg` and
+                `gt_pts_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each sample
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores_3d (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels_3d (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
+              contains a tensor with shape (num_instances, C), where
+              C >= 6.
+        """
+        batch_input_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        valid_pred = x[-1]
+        outs = self(x[:-1])
+        predictions = self.predict_by_feat(
+            *outs,
+            valid_pred=valid_pred,
+            batch_input_metas=batch_input_metas,
+            rescale=rescale)
+        return predictions
+
+    def predict_by_feat(self, center_preds: List[List[Tensor]],
+                        bbox_preds: List[List[Tensor]],
+                        cls_preds: List[List[Tensor]], valid_pred: Tensor,
+                        batch_input_metas: List[dict],
+                        **kwargs) -> List[InstanceData]:
+        """Generate boxes for all scenes.
+
+        Args:
+            center_preds (list[list[Tensor]]): Centerness predictions for
+                all scenes.
+            bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
+            cls_preds (list[list[Tensor]]): Classification predictions for all
+                scenes.
+            valid_pred (Tensor): Valid mask prediction for all scenes.
+            batch_input_metas (list[dict]): Meta infos for all scenes.
+
+        Returns:
+            list[tuple[Tensor]]: Predicted bboxes, scores, and labels for
+                all scenes.
+        """
+        valid_preds = self._upsample_valid_preds(valid_pred, center_preds)
+        results = []
+        for i in range(len(batch_input_metas)):
+            results.append(
+                self._predict_by_feat_single(
+                    center_preds=[x[i] for x in center_preds],
+                    bbox_preds=[x[i] for x in bbox_preds],
+                    cls_preds=[x[i] for x in cls_preds],
+                    valid_preds=[x[i] for x in valid_preds],
+                    input_meta=batch_input_metas[i]))
+        return results
+
+    def _predict_by_feat_single(self, center_preds: List[Tensor],
+                                bbox_preds: List[Tensor],
+                                cls_preds: List[Tensor],
+                                valid_preds: List[Tensor],
+                                input_meta: dict) -> InstanceData:
+        """Generate boxes for single sample.
+
+        Args:
+            center_preds (list[Tensor]): Centerness predictions for all levels.
+            bbox_preds (list[Tensor]): Bbox predictions for all levels.
+            cls_preds (list[Tensor]): Classification predictions for all
+                levels.
+            valid_preds (tuple[Tensor]): Upsampled valid masks for all feature
+                levels.
+            input_meta (dict): Scene meta info.
+
+        Returns:
+            tuple[Tensor]: Predicted bounding boxes, scores and labels.
+        """
+        featmap_sizes = [featmap.size()[-3:] for featmap in center_preds]
+        points = self._get_points(
+            featmap_sizes=featmap_sizes,
+            origin=input_meta['lidar2img']['origin'],
+            device=center_preds[0].device)
+        mlvl_bboxes, mlvl_scores = [], []
+        for center_pred, bbox_pred, cls_pred, valid_pred, point in zip(
+                center_preds, bbox_preds, cls_preds, valid_preds, points):
+            center_pred = center_pred.permute(1, 2, 3, 0).reshape(-1, 1)
+            bbox_pred = bbox_pred.permute(1, 2, 3,
+                                          0).reshape(-1, bbox_pred.shape[0])
+            cls_pred = cls_pred.permute(1, 2, 3,
+                                        0).reshape(-1, cls_pred.shape[0])
+            valid_pred = valid_pred.permute(1, 2, 3, 0).reshape(-1, 1)
+            scores = cls_pred.sigmoid() * center_pred.sigmoid() * valid_pred
+            max_scores, _ = scores.max(dim=1)
+
+            if len(scores) > self.test_cfg.nms_pre > 0:
+                _, ids = max_scores.topk(self.test_cfg.nms_pre)
+                bbox_pred = bbox_pred[ids]
+                scores = scores[ids]
+                point = point[ids]
+
+            bboxes = self._bbox_pred_to_bbox(point, bbox_pred)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+
+        bboxes = torch.cat(mlvl_bboxes)
+        scores = torch.cat(mlvl_scores)
+        bboxes, scores, labels = self._nms(bboxes, scores, input_meta)
+
+        bboxes = input_meta['box_type_3d'](
+            bboxes, box_dim=6, with_yaw=False, origin=(.5, .5, .5))
+
+        results = InstanceData()
+        results.bboxes_3d = bboxes
+        results.scores_3d = scores
+        results.labels_3d = labels
+        return results
+
+    @staticmethod
+    def _upsample_valid_preds(valid_pred, features):
+        """Upsample valid mask predictions.
+
+        Args:
+            valid_pred (Tensor): Valid mask prediction.
+            features (Tensor): Feature tensor.
+
+        Returns:
+            tuple[Tensor]: Upsampled valid masks for all feature levels.
+        """
+        return [
+            nn.Upsample(size=x.shape[-3:],
+                        mode='trilinear')(valid_pred).round().bool()
+            for x in features
+        ]
+
+    @torch.no_grad()
+    def _get_points(self, featmap_sizes, origin, device):
+        mlvl_points = []
+        tmp_voxel_size = [.16, .16, .2]
+        for i, featmap_size in enumerate(featmap_sizes):
+            mlvl_points.append(
+                get_points(
+                    n_voxels=torch.tensor(featmap_size),
+                    voxel_size=torch.tensor(tmp_voxel_size) * (2**i),
+                    origin=torch.tensor(origin)).reshape(3, -1).transpose(
+                        0, 1).to(device))
+        return mlvl_points
+
+    def _bbox_pred_to_bbox(self, points, bbox_pred):
+        return torch.stack([
+            points[:, 0] - bbox_pred[:, 0], points[:, 1] - bbox_pred[:, 2],
+            points[:, 2] - bbox_pred[:, 4], points[:, 0] + bbox_pred[:, 1],
+            points[:, 1] + bbox_pred[:, 3], points[:, 2] + bbox_pred[:, 5]
+        ], -1)
+
+    def _bbox_pred_to_loss(self, points, bbox_preds):
+        return self._bbox_pred_to_bbox(points, bbox_preds)
+
+    # The function is directly copied from FCAF3DHead.
+    @staticmethod
+    def _get_face_distances(points, boxes):
+        """Calculate distances from point to box faces.
+
+        Args:
+            points (Tensor): Final locations of shape (N_points, N_boxes, 3).
+            boxes (Tensor): 3D boxes of shape (N_points, N_boxes, 7)
+
+        Returns:
+            Tensor: Face distances of shape (N_points, N_boxes, 6),
+                (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max).
+        """
+        dx_min = points[..., 0] - boxes[..., 0] + boxes[..., 3] / 2
+        dx_max = boxes[..., 0] + boxes[..., 3] / 2 - points[..., 0]
+        dy_min = points[..., 1] - boxes[..., 1] + boxes[..., 4] / 2
+        dy_max = boxes[..., 1] + boxes[..., 4] / 2 - points[..., 1]
+        dz_min = points[..., 2] - boxes[..., 2] + boxes[..., 5] / 2
+        dz_max = boxes[..., 2] + boxes[..., 5] / 2 - points[..., 2]
+        return torch.stack((dx_min, dx_max, dy_min, dy_max, dz_min, dz_max),
+                           dim=-1)
+
+    @staticmethod
+    def _get_centerness(face_distances):
+        """Compute point centerness w.r.t containing box.
+
+        Args:
+            face_distances (Tensor): Face distances of shape (B, N, 6),
+                (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max).
+
+        Returns:
+            Tensor: Centerness of shape (B, N).
+        """
+        x_dims = face_distances[..., [0, 1]]
+        y_dims = face_distances[..., [2, 3]]
+        z_dims = face_distances[..., [4, 5]]
+        centerness_targets = x_dims.min(dim=-1)[0] / x_dims.max(dim=-1)[0] * \
+            y_dims.min(dim=-1)[0] / y_dims.max(dim=-1)[0] * \
+            z_dims.min(dim=-1)[0] / z_dims.max(dim=-1)[0]
+        return torch.sqrt(centerness_targets)
+
+    @torch.no_grad()
+    def _get_targets(self, points, gt_bboxes, gt_labels):
+        """Compute targets for final locations for a single scene.
+
+        Args:
+            points (list[Tensor]): Final locations for all levels.
+            gt_bboxes (BaseInstance3DBoxes): Ground truth boxes.
+            gt_labels (Tensor): Ground truth labels.
+
+        Returns:
+            tuple[Tensor]: Centerness, bbox and classification
+                targets for all locations.
+        """
+        float_max = 1e8
+        expanded_scales = [
+            points[i].new_tensor(i).expand(len(points[i])).to(gt_labels.device)
+            for i in range(len(points))
+        ]
+        points = torch.cat(points, dim=0).to(gt_labels.device)
+        scales = torch.cat(expanded_scales, dim=0)
+
+        # below is based on FCOSHead._get_target_single
+        n_points = len(points)
+        n_boxes = len(gt_bboxes)
+        volumes = gt_bboxes.volume.to(points.device)
+        volumes = volumes.expand(n_points, n_boxes).contiguous()
+        gt_bboxes = torch.cat(
+            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:6]), dim=1)
+        gt_bboxes = gt_bboxes.to(points.device).expand(n_points, n_boxes, 6)
+        expanded_points = points.unsqueeze(1).expand(n_points, n_boxes, 3)
+        bbox_targets = self._get_face_distances(expanded_points, gt_bboxes)
+
+        # condition1: inside a gt bbox
+        inside_gt_bbox_mask = bbox_targets[..., :6].min(
+            -1)[0] > 0  # skip angle
+
+        # condition2: positive points per scale >= limit
+        # calculate positive points per scale
+        n_pos_points_per_scale = []
+        for i in range(self.n_levels):
+            n_pos_points_per_scale.append(
+                torch.sum(inside_gt_bbox_mask[scales == i], dim=0))
+        # find best scale
+        n_pos_points_per_scale = torch.stack(n_pos_points_per_scale, dim=0)
+        lower_limit_mask = n_pos_points_per_scale < self.pts_assign_threshold
+        # fix nondeterministic argmax for torch<1.7
+        extra = torch.arange(self.n_levels, 0, -1).unsqueeze(1).expand(
+            self.n_levels, n_boxes).to(lower_limit_mask.device)
+        lower_index = torch.argmax(lower_limit_mask.int() * extra, dim=0) - 1
+        lower_index = torch.where(lower_index < 0,
+                                  torch.zeros_like(lower_index), lower_index)
+        all_upper_limit_mask = torch.all(
+            torch.logical_not(lower_limit_mask), dim=0)
+        best_scale = torch.where(
+            all_upper_limit_mask,
+            torch.ones_like(all_upper_limit_mask) * self.n_levels - 1,
+            lower_index)
+        # keep only points with best scale
+        best_scale = torch.unsqueeze(best_scale, 0).expand(n_points, n_boxes)
+        scales = torch.unsqueeze(scales, 1).expand(n_points, n_boxes)
+        inside_best_scale_mask = best_scale == scales
+
+        # condition3: limit topk locations per box by centerness
+        centerness = self._get_centerness(bbox_targets)
+        centerness = torch.where(inside_gt_bbox_mask, centerness,
+                                 torch.ones_like(centerness) * -1)
+        centerness = torch.where(inside_best_scale_mask, centerness,
+                                 torch.ones_like(centerness) * -1)
+        top_centerness = torch.topk(
+            centerness, self.pts_center_threshold + 1, dim=0).values[-1]
+        inside_top_centerness_mask = centerness > top_centerness.unsqueeze(0)
+
+        # if there are still more than one objects for a location,
+        # we choose the one with minimal area
+        volumes = torch.where(inside_gt_bbox_mask, volumes,
+                              torch.ones_like(volumes) * float_max)
+        volumes = torch.where(inside_best_scale_mask, volumes,
+                              torch.ones_like(volumes) * float_max)
+        volumes = torch.where(inside_top_centerness_mask, volumes,
+                              torch.ones_like(volumes) * float_max)
+        min_area, min_area_inds = volumes.min(dim=1)
+
+        labels = gt_labels[min_area_inds]
+        labels = torch.where(min_area == float_max,
+                             torch.ones_like(labels) * -1, labels)
+        bbox_targets = bbox_targets[range(n_points), min_area_inds]
+        centerness_targets = self._get_centerness(bbox_targets)
+
+        return centerness_targets, self._bbox_pred_to_bbox(
+            points, bbox_targets), labels
+
+    def _nms(self, bboxes, scores, img_meta):
+        scores, labels = scores.max(dim=1)
+        ids = scores > self.test_cfg.score_thr
+        bboxes = bboxes[ids]
+        scores = scores[ids]
+        labels = labels[ids]
+        ids = self.aligned_3d_nms(bboxes, scores, labels,
+                                  self.test_cfg.iou_thr)
+        bboxes = bboxes[ids]
+        bboxes = torch.stack(
+            ((bboxes[:, 0] + bboxes[:, 3]) / 2.,
+             (bboxes[:, 1] + bboxes[:, 4]) / 2.,
+             (bboxes[:, 2] + bboxes[:, 5]) / 2., bboxes[:, 3] - bboxes[:, 0],
+             bboxes[:, 4] - bboxes[:, 1], bboxes[:, 5] - bboxes[:, 2]),
+            dim=1)
+        return bboxes, scores[ids], labels[ids]
+
+    @staticmethod
+    def aligned_3d_nms(boxes, scores, classes, thresh):
+        """3d nms for aligned boxes.
+
+        Args:
+            boxes (torch.Tensor): Aligned box with shape [n, 6].
+            scores (torch.Tensor): Scores of each box.
+            classes (torch.Tensor): Class of each box.
+            thresh (float): Iou threshold for nms.
+
+        Returns:
+            torch.Tensor: Indices of selected boxes.
+        """
+        x1 = boxes[:, 0]
+        y1 = boxes[:, 1]
+        z1 = boxes[:, 2]
+        x2 = boxes[:, 3]
+        y2 = boxes[:, 4]
+        z2 = boxes[:, 5]
+        area = (x2 - x1) * (y2 - y1) * (z2 - z1)
+        zero = boxes.new_zeros(1, )
+
+        score_sorted = torch.argsort(scores)
+        pick = []
+        while (score_sorted.shape[0] != 0):
+            last = score_sorted.shape[0]
+            i = score_sorted[-1]
+            pick.append(i)
+
+            xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]])
+            yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]])
+            zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]])
+            xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]])
+            yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]])
+            zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]])
+            classes1 = classes[i]
+            classes2 = classes[score_sorted[:last - 1]]
+            inter_l = torch.max(zero, xx2 - xx1)
+            inter_w = torch.max(zero, yy2 - yy1)
+            inter_h = torch.max(zero, zz2 - zz1)
+
+            inter = inter_l * inter_w * inter_h
+            iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter)
+            iou = iou * (classes1 == classes2).float()
+            score_sorted = score_sorted[torch.nonzero(
+                iou <= thresh, as_tuple=False).flatten()]
+
+        indices = boxes.new_tensor(pick, dtype=torch.long)
+        return indices
diff --git a/mmde/projects/NeRF-Det/nerfdet/scannet_multiview_dataset.py b/mmde/projects/NeRF-Det/nerfdet/scannet_multiview_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..a20bc3eec0f15cf524b18622dcd874b0d6ff7d6b
--- /dev/null
+++ b/mmde/projects/NeRF-Det/nerfdet/scannet_multiview_dataset.py
@@ -0,0 +1,202 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from os import path as osp
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+
+from mmdet3d.datasets import Det3DDataset
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures import DepthInstance3DBoxes
+
+
+@DATASETS.register_module()
+class MultiViewScanNetDataset(Det3DDataset):
+    r"""Multi-View ScanNet Dataset for NeRF-detection Task
+
+    This class serves as the API for experiments on the ScanNet Dataset.
+
+    Please refer to the `github repo <https://github.com/ScanNet/ScanNet>`_
+    for data downloading.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        pipeline (List[dict]): Pipeline used for data processing.
+            Defaults to [].
+        modality (dict): Modality to specify the sensor data used as input.
+            Defaults to dict(use_camera=True, use_lidar=False).
+        box_type_3d (str): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'Depth' in this dataset. Available options includes:
+
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        filter_empty_gt (bool): Whether to filter the data with empty GT.
+            If it's set to be True, the example with empty annotations after
+            data pipeline will be dropped and a random example will be chosen
+            in `__getitem__`. Defaults to True.
+        test_mode (bool): Whether the dataset is in test mode.
+            Defaults to False.
+    """
+    METAINFO = {
+        'classes':
+        ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+         'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator',
+         'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin')
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 metainfo: Optional[dict] = None,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 modality: dict = dict(use_camera=True, use_lidar=False),
+                 box_type_3d: str = 'Depth',
+                 filter_empty_gt: bool = True,
+                 remove_dontcare: bool = False,
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+
+        self.remove_dontcare = remove_dontcare
+
+        super().__init__(
+            data_root=data_root,
+            ann_file=ann_file,
+            metainfo=metainfo,
+            pipeline=pipeline,
+            modality=modality,
+            box_type_3d=box_type_3d,
+            filter_empty_gt=filter_empty_gt,
+            test_mode=test_mode,
+            **kwargs)
+
+        assert 'use_camera' in self.modality and \
+               'use_lidar' in self.modality
+        assert self.modality['use_camera'] or self.modality['use_lidar']
+
+    @staticmethod
+    def _get_axis_align_matrix(info: dict) -> np.ndarray:
+        """Get axis_align_matrix from info. If not exist, return identity mat.
+
+        Args:
+            info (dict): Info of a single sample data.
+
+        Returns:
+            np.ndarray: 4x4 transformation matrix.
+        """
+        if 'axis_align_matrix' in info:
+            return np.array(info['axis_align_matrix'])
+        else:
+            warnings.warn(
+                'axis_align_matrix is not found in ScanNet data info, please '
+                'use new pre-process scripts to re-generate ScanNet data')
+            return np.eye(4).astype(np.float32)
+
+    def parse_data_info(self, info: dict) -> dict:
+        """Process the raw data info.
+
+        Convert all relative path of needed modality data file to
+        the absolute path.
+
+        Args:
+            info (dict): Raw info dict.
+
+        Returns:
+            dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
+        """
+        if self.modality['use_depth']:
+            info['depth_info'] = []
+        if self.modality['use_neuralrecon_depth']:
+            info['depth_info'] = []
+
+        if self.modality['use_lidar']:
+            # implement lidar processing in the future
+            raise NotImplementedError(
+                'Please modified '
+                '`MultiViewPipeline` to support lidar processing')
+
+        info['axis_align_matrix'] = self._get_axis_align_matrix(info)
+        info['img_info'] = []
+        info['lidar2img'] = []
+        info['c2w'] = []
+        info['camrotc2w'] = []
+        info['lightpos'] = []
+        # load img and depth_img
+        for i in range(len(info['img_paths'])):
+            img_filename = osp.join(self.data_root, info['img_paths'][i])
+
+            info['img_info'].append(dict(filename=img_filename))
+            if 'depth_info' in info.keys():
+                if self.modality['use_neuralrecon_depth']:
+                    info['depth_info'].append(
+                        dict(filename=img_filename[:-4] + '.npy'))
+                else:
+                    info['depth_info'].append(
+                        dict(filename=img_filename[:-4] + '.png'))
+            # implement lidar_info in input.keys() in the future.
+            extrinsic = np.linalg.inv(
+                info['axis_align_matrix'] @ info['lidar2cam'][i])
+            info['lidar2img'].append(extrinsic.astype(np.float32))
+            if self.modality['use_ray']:
+                c2w = (
+                    info['axis_align_matrix'] @ info['lidar2cam'][i]).astype(
+                        np.float32)  # noqa
+                info['c2w'].append(c2w)
+                info['camrotc2w'].append(c2w[0:3, 0:3])
+                info['lightpos'].append(c2w[0:3, 3])
+        origin = np.array([.0, .0, .5])
+        info['lidar2img'] = dict(
+            extrinsic=info['lidar2img'],
+            intrinsic=info['cam2img'].astype(np.float32),
+            origin=origin.astype(np.float32))
+
+        if self.modality['use_ray']:
+            info['ray_info'] = []
+
+        if not self.test_mode:
+            info['ann_info'] = self.parse_ann_info(info)
+        if self.test_mode and self.load_eval_anns:
+            info['ann_info'] = self.parse_ann_info(info)
+            info['eval_ann_info'] = self._remove_dontcare(info['ann_info'])
+
+        return info
+
+    def parse_ann_info(self, info: dict) -> dict:
+        """Process the `instances` in data info to `ann_info`.
+
+        Args:
+            info (dict): Info dict.
+
+        Returns:
+            dict: Processed `ann_info`.
+        """
+        ann_info = super().parse_ann_info(info)
+
+        if self.remove_dontcare:
+            ann_info = self._remove_dontcare(ann_info)
+
+        # empty gt
+        if ann_info is None:
+            ann_info = dict()
+            ann_info['gt_bboxes_3d'] = np.zeros((0, 6), dtype=np.float32)
+            ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64)
+
+        ann_info['gt_bboxes_3d'] = DepthInstance3DBoxes(
+            ann_info['gt_bboxes_3d'],
+            box_dim=ann_info['gt_bboxes_3d'].shape[-1],
+            with_yaw=False,
+            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+        # count the numbers
+        for label in ann_info['gt_labels_3d']:
+            if label != -1:
+                cat_name = self.metainfo['classes'][label]
+                self.num_ins_per_cat[cat_name] += 1
+
+        return ann_info
diff --git a/mmde/projects/NeRF-Det/prepare_infos.py b/mmde/projects/NeRF-Det/prepare_infos.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e1a13516f971c2560f7725f22d4c4d40fbe5ad9
--- /dev/null
+++ b/mmde/projects/NeRF-Det/prepare_infos.py
@@ -0,0 +1,151 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Prepare the dataset for NeRF-Det.
+
+Example:
+    python projects/NeRF-Det/prepare_infos.py
+        --root-path ./data/scannet
+        --out-dir ./data/scannet
+"""
+import argparse
+import time
+from os import path as osp
+from pathlib import Path
+
+import mmengine
+
+from ...tools.dataset_converters import indoor_converter as indoor
+from ...tools.dataset_converters.update_infos_to_v2 import (
+    clear_data_info_unused_keys, clear_instance_unused_keys,
+    get_empty_instance, get_empty_standard_data_info)
+
+
+def update_scannet_infos_nerfdet(pkl_path, out_dir):
+    """Update the origin pkl to the new format which will be used in nerf-det.
+
+    Args:
+        pkl_path (str): Path of the origin pkl.
+        out_dir (str): Output directory of the generated info file.
+
+    Returns:
+        The pkl will be overwritTen.
+        The new pkl is a dict containing two keys:
+        metainfo: Some base information of the pkl
+        data_list (list): A list containing all the information of the scenes.
+    """
+    print('The new refactored process is running.')
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print(f'Warning, you may overwriting '
+              f'the original data {pkl_path}.')
+        time.sleep(5)
+    METAINFO = {
+        'classes':
+        ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+         'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator',
+         'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin')
+    }
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    print('Start updating:')
+    converted_list = []
+    for ori_info_dict in mmengine.track_iter_progress(data_list):
+        temp_data_info = get_empty_standard_data_info()
+
+        # intrinsics, extrinsics and imgs
+        temp_data_info['cam2img'] = ori_info_dict['intrinsics']
+        temp_data_info['lidar2cam'] = ori_info_dict['extrinsics']
+        temp_data_info['img_paths'] = ori_info_dict['img_paths']
+
+        # annotation information
+        anns = ori_info_dict.get('annos', None)
+        ignore_class_name = set()
+        if anns is not None:
+            temp_data_info['axis_align_matrix'] = anns[
+                'axis_align_matrix'].tolist()
+            if anns['gt_num'] == 0:
+                instance_list = []
+            else:
+                num_instances = len(anns['name'])
+                instance_list = []
+                for instance_id in range(num_instances):
+                    empty_instance = get_empty_instance()
+                    empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
+                        instance_id].tolist()
+
+                    if anns['name'][instance_id] in METAINFO['classes']:
+                        empty_instance['bbox_label_3d'] = METAINFO[
+                            'classes'].index(anns['name'][instance_id])
+                    else:
+                        ignore_class_name.add(anns['name'][instance_id])
+                        empty_instance['bbox_label_3d'] = -1
+
+                    empty_instance = clear_instance_unused_keys(empty_instance)
+                    instance_list.append(empty_instance)
+            temp_data_info['instances'] = instance_list
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
+    print(f'ignore classes: {ignore_class_name}')
+
+    # dataset metainfo
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 'scannet'
+    metainfo['info_version'] = '1.1'
+
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def scannet_data_prep(root_path, info_prefix, out_dir, workers):
+    """Prepare the info file for scannet dataset.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+        workers (int): Number of threads to be used.
+        version (str): Only used to generate the dataset of nerfdet now.
+    """
+    indoor.create_indoor_info_file(
+        root_path, info_prefix, out_dir, workers=workers)
+    info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl')
+    info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl')
+    info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl')
+    update_scannet_infos_nerfdet(out_dir=out_dir, pkl_path=info_train_path)
+    update_scannet_infos_nerfdet(out_dir=out_dir, pkl_path=info_val_path)
+    update_scannet_infos_nerfdet(out_dir=out_dir, pkl_path=info_test_path)
+
+
+parser = argparse.ArgumentParser(description='Data converter arg parser')
+parser.add_argument(
+    '--root-path',
+    type=str,
+    default='./data/scannet',
+    help='specify the root path of dataset')
+parser.add_argument(
+    '--out-dir',
+    type=str,
+    default='./data/scannet',
+    required=False,
+    help='name of info pkl')
+parser.add_argument('--extra-tag', type=str, default='scannet')
+parser.add_argument(
+    '--workers', type=int, default=4, help='number of threads to be used')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    from mmdet3d.utils import register_all_modules
+    register_all_modules()
+
+    scannet_data_prep(
+        root_path=args.root_path,
+        info_prefix=args.extra_tag,
+        out_dir=args.out_dir,
+        workers=args.workers)
diff --git a/mmde/projects/PETR/README.md b/mmde/projects/PETR/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dcf7182ed2e849b6bb660fed2c74729177697fdc
--- /dev/null
+++ b/mmde/projects/PETR/README.md
@@ -0,0 +1,39 @@
+# PETR
+
+This is an README for `PETR`.
+
+## Description
+
+Author: @SekiroRong.
+This is an implementation of *PETR*.
+
+## Usage
+
+<!-- For a typical model, this section should contain the commands for training and testing. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. -->
+
+### Training commands
+
+In MMDet3D's root directory, run the following command to train the model:
+
+```bash
+python tools/train.py projects/PETR/configs/petr_vovnet_gridmask_p4_800x320.py
+```
+
+### Testing commands
+
+In MMDet3D's root directory, run the following command to test the model:
+
+```bash
+python tools/test.py projects/PETR/configs/petr_vovnet_gridmask_p4_800x320.py ${CHECKPOINT_PATH}
+```
+
+## Results
+
+<!-- List the results as usually done in other model's README. [Example](https://github.com/open-mmlab/mmdetection3d/edit/dev-1.x/configs/fcos3d/README.md)
+ You should claim whether this is based on the pre-trained weights, which are converted from the official release; or it's a reproduced result obtained from retraining the model in this project. -->
+
+This Result is trained by petr_vovnet_gridmask_p4_800x320.py and use [weights](https://drive.google.com/file/d/1ABI5BoQCkCkP4B0pO5KBJ3Ni0tei0gZi/view?usp=sharing) as pretrain weight.
+
+|                                   Backbone                                    | Lr schd | Mem (GB) | Inf time (fps) | mAP  | NDS  |                                                                                                      Download                                                                                                       |
+| :---------------------------------------------------------------------------: | :-----: | :------: | :------------: | :--: | :--: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [petr_vovnet_gridmask_p4_800x320](configs/petr_vovnet_gridmask_p4_800x320.py) |   1x    |   7.62   |      18.7      | 38.3 | 43.5 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/petr/petr_vovnet_gridmask_p4_800x320-e2191752.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/petr/20221222_232156.log) |
diff --git a/mmde/projects/PETR/configs/petr_vovnet_gridmask_p4_800x320.py b/mmde/projects/PETR/configs/petr_vovnet_gridmask_p4_800x320.py
new file mode 100644
index 0000000000000000000000000000000000000000..c61b36218ca9a5d540d260483c885677a8a00a53
--- /dev/null
+++ b/mmde/projects/PETR/configs/petr_vovnet_gridmask_p4_800x320.py
@@ -0,0 +1,369 @@
+_base_ = [
+    '../../../configs/_base_/datasets/nus-3d.py',
+    '../../../configs/_base_/default_runtime.py',
+    '../../../configs/_base_/schedules/cyclic-20e.py'
+]
+backbone_norm_cfg = dict(type='LN', requires_grad=True)
+custom_imports = dict(imports=['projects.PETR.petr'])
+
+randomness = dict(seed=1, deterministic=False, diff_rank_seed=False)
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675],
+    std=[57.375, 57.120, 58.395],
+    to_rgb=False)
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+metainfo = dict(classes=class_names)
+
+input_modality = dict(use_camera=True)
+model = dict(
+    type='PETR',
+    data_preprocessor=dict(
+        type='Det3DDataPreprocessor',
+        mean=[103.530, 116.280, 123.675],
+        std=[57.375, 57.120, 58.395],
+        bgr_to_rgb=False,
+        pad_size_divisor=32),
+    use_grid_mask=True,
+    img_backbone=dict(
+        type='VoVNetCP',
+        spec_name='V-99-eSE',
+        norm_eval=True,
+        frozen_stages=-1,
+        input_ch=3,
+        out_features=(
+            'stage4',
+            'stage5',
+        )),
+    img_neck=dict(
+        type='CPFPN', in_channels=[768, 1024], out_channels=256, num_outs=2),
+    pts_bbox_head=dict(
+        type='PETRHead',
+        num_classes=10,
+        in_channels=256,
+        num_query=900,
+        LID=True,
+        with_position=True,
+        with_multiview=True,
+        position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        normedlinear=False,
+        transformer=dict(
+            type='PETRTransformer',
+            decoder=dict(
+                type='PETRTransformerDecoder',
+                return_intermediate=True,
+                num_layers=6,
+                transformerlayers=dict(
+                    type='PETRTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=256,
+                            num_heads=8,
+                            attn_drop=0.1,
+                            dropout_layer=dict(type='Dropout', drop_prob=0.1)),
+                        dict(
+                            type='PETRMultiheadAttention',
+                            embed_dims=256,
+                            num_heads=8,
+                            attn_drop=0.1,
+                            dropout_layer=dict(type='Dropout', drop_prob=0.1)),
+                    ],
+                    feedforward_channels=2048,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')),
+            )),
+        bbox_coder=dict(
+            type='NMSFreeCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            pc_range=point_cloud_range,
+            max_num=300,
+            voxel_size=voxel_size,
+            num_classes=10),
+        positional_encoding=dict(
+            type='SinePositionalEncoding3D', num_feats=128, normalize=True),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25),
+        loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            grid_size=[512, 512, 1],
+            voxel_size=voxel_size,
+            point_cloud_range=point_cloud_range,
+            out_size_factor=4,
+            assigner=dict(
+                type='HungarianAssigner3D',
+                cls_cost=dict(type='FocalLossCost', weight=2.0),
+                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+                iou_cost=dict(
+                    type='IoUCost', weight=0.0
+                ),  # Fake cost. Just to be compatible with DETR head.
+                pc_range=point_cloud_range))))
+
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+backend_args = None
+
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(
+            car=5,
+            truck=5,
+            bus=5,
+            trailer=5,
+            construction_vehicle=5,
+            traffic_cone=5,
+            barrier=5,
+            motorcycle=5,
+            bicycle=5,
+            pedestrian=5)),
+    classes=class_names,
+    sample_groups=dict(
+        car=2,
+        truck=3,
+        construction_vehicle=7,
+        bus=4,
+        trailer=6,
+        barrier=2,
+        motorcycle=6,
+        bicycle=6,
+        pedestrian=2,
+        traffic_cone=2),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        backend_args=backend_args),
+    backend_args=backend_args)
+ida_aug_conf = {
+    'resize_lim': (0.47, 0.625),
+    'final_dim': (320, 800),
+    'bot_pct_lim': (0.0, 0.0),
+    'rot_lim': (0.0, 0.0),
+    'H': 900,
+    'W': 1600,
+    'rand_flip': True,
+}
+
+train_pipeline = [
+    dict(
+        type='LoadMultiViewImageFromFiles',
+        to_float32=True,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(
+        type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf, training=True),
+    dict(
+        type='GlobalRotScaleTransImage',
+        rot_range=[-0.3925, 0.3925],
+        translation_std=[0, 0, 0],
+        scale_ratio_range=[0.95, 1.05],
+        reverse_angle=False,
+        training=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'attr_labels',
+            'gt_bboxes_3d', 'gt_labels_3d', 'centers_2d', 'depths'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadMultiViewImageFromFiles',
+        to_float32=True,
+        backend_args=backend_args),
+    dict(
+        type='ResizeCropFlipImage', data_aug_conf=ida_aug_conf,
+        training=False),
+    dict(type='Pack3DDetInputs', keys=['img'])
+]
+
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    dataset=dict(
+        type=dataset_type,
+        data_prefix=dict(
+            pts='samples/LIDAR_TOP',
+            CAM_FRONT='samples/CAM_FRONT',
+            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+            CAM_BACK='samples/CAM_BACK',
+            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
+        pipeline=train_pipeline,
+        box_type_3d='LiDAR',
+        metainfo=metainfo,
+        test_mode=False,
+        modality=input_modality,
+        use_valid_flag=True,
+        backend_args=backend_args))
+test_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        data_prefix=dict(
+            pts='samples/LIDAR_TOP',
+            CAM_FRONT='samples/CAM_FRONT',
+            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+            CAM_BACK='samples/CAM_BACK',
+            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
+        pipeline=test_pipeline,
+        box_type_3d='LiDAR',
+        metainfo=metainfo,
+        test_mode=True,
+        modality=input_modality,
+        use_valid_flag=True,
+        backend_args=backend_args))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        data_prefix=dict(
+            pts='samples/LIDAR_TOP',
+            CAM_FRONT='samples/CAM_FRONT',
+            CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+            CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+            CAM_BACK='samples/CAM_BACK',
+            CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+            CAM_BACK_LEFT='samples/CAM_BACK_LEFT'),
+        pipeline=test_pipeline,
+        box_type_3d='LiDAR',
+        metainfo=metainfo,
+        test_mode=True,
+        modality=input_modality,
+        use_valid_flag=True,
+        backend_args=backend_args))
+
+# Different from original PETR:
+# We don't use special lr for image_backbone
+# This seems won't affect model performance
+optim_wrapper = dict(
+    # TODO Add Amp
+    # type='AmpOptimWrapper',
+    # loss_scale='dynamic',
+    optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01),
+    paramwise_cfg=dict(custom_keys={
+        'img_backbone': dict(lr_mult=0.1),
+    }),
+    clip_grad=dict(max_norm=35, norm_type=2))
+
+num_epochs = 24
+
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 3,
+        begin=0,
+        end=500,
+        by_epoch=False),
+    dict(
+        type='CosineAnnealingLR',
+        # TODO Figure out what T_max
+        T_max=num_epochs,
+        by_epoch=True,
+    )
+]
+
+train_cfg = dict(max_epochs=num_epochs, val_interval=num_epochs)
+
+find_unused_parameters = False
+
+# pretrain_path can be found here:
+# https://drive.google.com/file/d/1ABI5BoQCkCkP4B0pO5KBJ3Ni0tei0gZi/view
+load_from = '/mnt/d/fcos3d_vovnet_imgbackbone-remapped.pth'
+resume = False
+
+# --------------Original---------------
+# mAP: 0.3778
+# mATE: 0.7463
+# mASE: 0.2718
+# mAOE: 0.4883
+# mAVE: 0.9062
+# mAAE: 0.2123
+# NDS: 0.4264
+# Eval time: 242.1s
+
+# Per-class results:
+# Object Class    AP      ATE     ASE     AOE     AVE     AAE
+# car     0.556   0.555   0.153   0.091   0.917   0.216
+# truck   0.330   0.805   0.218   0.119   0.859   0.250
+# bus     0.412   0.789   0.205   0.162   2.067   0.337
+# trailer 0.221   0.976   0.233   0.663   0.797   0.146
+# construction_vehicle    0.094   1.096   0.493   1.145   0.190   0.349
+# pedestrian      0.453   0.688   0.289   0.636   0.549   0.235
+# motorcycle      0.368   0.690   0.256   0.622   1.417   0.149
+# bicycle 0.341   0.609   0.270   0.812   0.455   0.017
+# traffic_cone    0.531   0.582   0.320   nan     nan     nan
+# barrier 0.472   0.673   0.281   0.145   nan     nan
+
+# --------------Refactored in mmdet3d v1.0---------------
+# mAP: 0.3827
+# mATE: 0.7375
+# mASE: 0.2703
+# mAOE: 0.4799
+# mAVE: 0.8699
+# mAAE: 0.2038
+# NDS: 0.4352
+# Eval time: 124.8s
+
+# Per-class results:
+# Object Class	  AP	  ATE	  ASE	  AOE	  AVE	  AAE
+# car	  0.574	  0.519	  0.150	  0.087	  0.865	  0.206
+# truck	  0.349	  0.773	  0.213	  0.117	  0.855	  0.220
+# bus	  0.423	  0.781	  0.204	  0.122	  1.902	  0.319
+# trailer 0.219	  1.034	  0.231	  0.608	  0.830	  0.149
+# construction_vehicle	  0.084	  1.062	  0.486	  1.245	  0.172	  0.360
+# pedestrian	  0.452	  0.681	  0.293	  0.646	  0.529	  0.231
+# motorcycle	  0.378	  0.670	  0.250	  0.567	  1.334	  0.130
+# bicycle	      0.347	  0.639	  0.264	  0.788	  0.472	  0.016
+# traffic_cone	  0.538	  0.553	  0.325	  nan	  nan	  nan
+# barrier	      0.464	  0.662	 0.287	  0.137	  nan	  nan
+
+# --------------Refactored in mmdet3d v1.1---------------
+# mAP: 0.3830
+# mATE: 0.7547
+# mASE: 0.2683
+# mAOE: 0.4948
+# mAVE: 0.8331
+# mAAE: 0.2056
+# NDS: 0.4358
+# Eval time: 118.7s
+
+# Per-class results:
+# Object Class	  AP	  ATE	  ASE	  AOE	  AVE	  AAE
+# car	  0.567	  0.538	  0.151	  0.086	  0.873	  0.212
+# truck	  0.341	  0.785	  0.213	  0.113	  0.821	  0.234
+# bus	  0.426	  0.766	  0.201	  0.128	  1.813	  0.343
+# trailer 0.216	  1.116	  0.227	  0.649	  0.640	  0.122
+# construction_vehicle	  0.093	  1.118	  0.483	  1.292	  0.217	  0.330
+# pedestrian	  0.453	  0.685	  0.293	  0.644	  0.535	  0.238
+# motorcycle	  0.374	  0.700	  0.253	  0.624	  1.291	  0.154
+# bicycle	      0.345	  0.622	  0.262	  0.775	  0.475	  0.011
+# traffic_cone	  0.539	  0.557	  0.319	  nan	  nan	  nan
+# barrier	      0.476	  0.661	  0.279	  0.142	  nan	  nan
diff --git a/mmde/projects/PETR/petr/__init__.py b/mmde/projects/PETR/petr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ed2ecc90816f517ba2e4d833b2027e64c213449
--- /dev/null
+++ b/mmde/projects/PETR/petr/__init__.py
@@ -0,0 +1,24 @@
+from .cp_fpn import CPFPN
+from .hungarian_assigner_3d import HungarianAssigner3D
+from .match_cost import BBox3DL1Cost
+from .nms_free_coder import NMSFreeCoder
+from .petr import PETR
+from .petr_head import PETRHead
+from .petr_transformer import (PETRDNTransformer, PETRMultiheadAttention,
+                               PETRTransformer, PETRTransformerDecoder,
+                               PETRTransformerDecoderLayer,
+                               PETRTransformerEncoder)
+from .positional_encoding import (LearnedPositionalEncoding3D,
+                                  SinePositionalEncoding3D)
+from .transforms_3d import GlobalRotScaleTransImage, ResizeCropFlipImage
+from .utils import denormalize_bbox, normalize_bbox
+from .vovnetcp import VoVNetCP
+
+__all__ = [
+    'GlobalRotScaleTransImage', 'ResizeCropFlipImage', 'VoVNetCP', 'PETRHead',
+    'CPFPN', 'HungarianAssigner3D', 'NMSFreeCoder', 'BBox3DL1Cost',
+    'LearnedPositionalEncoding3D', 'PETRDNTransformer',
+    'PETRMultiheadAttention', 'PETRTransformer', 'PETRTransformerDecoder',
+    'PETRTransformerDecoderLayer', 'PETRTransformerEncoder', 'PETR',
+    'SinePositionalEncoding3D', 'denormalize_bbox', 'normalize_bbox'
+]
diff --git a/mmde/projects/PETR/petr/cp_fpn.py b/mmde/projects/PETR/petr/cp_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..02c902485bb7d6bbef6cdd2396c20097509cc4e7
--- /dev/null
+++ b/mmde/projects/PETR/petr/cp_fpn.py
@@ -0,0 +1,211 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from mmdetection (https://github.com/open-mmlab/mmdetection)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+# This FPN remove unused parameters which can used with checkpoint
+# (with_cp = True)
+@MODELS.register_module()
+class CPFPN(BaseModule):
+    r"""Feature Pyramid Network.
+
+    This is an implementation of paper `Feature Pyramid Networks for Object
+    Detection <https://arxiv.org/abs/1612.03144>`_.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, it is equivalent to `add_extra_convs='on_input'`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed
+
+            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+            - 'on_lateral':  Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        relu_before_extra_convs (bool): Whether to apply relu before the extra
+            conv. Default: False.
+        no_norm_on_lateral (bool): Whether to apply norm on lateral.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        act_cfg (str): Config dict for activation layer in ConvModule.
+            Default: None.
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: `dict(mode='nearest')`
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+
+    Example:
+        >>> import torch
+        >>> in_channels = [2, 3, 5, 7]
+        >>> scales = [340, 170, 84, 43]
+        >>> inputs = [torch.rand(1, c, s, s)
+        ...           for c, s in zip(in_channels, scales)]
+        >>> self = FPN(in_channels, 11, len(in_channels)).eval()
+        >>> outputs = self.forward(inputs)
+        >>> for i in range(len(outputs)):
+        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+        outputs[0].shape = torch.Size([1, 11, 340, 340])
+        outputs[1].shape = torch.Size([1, 11, 170, 170])
+        outputs[2].shape = torch.Size([1, 11, 84, 84])
+        outputs[3].shape = torch.Size([1, 11, 43, 43])
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 relu_before_extra_convs=False,
+                 no_norm_on_lateral=False,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 act_cfg=None,
+                 upsample_cfg=dict(mode='nearest'),
+                 init_cfg=dict(
+                     type='Xavier', layer='Conv2d', distribution='uniform')):
+        super(CPFPN, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.relu_before_extra_convs = relu_before_extra_convs
+        self.no_norm_on_lateral = no_norm_on_lateral
+        self.fp16_enabled = False
+        self.upsample_cfg = upsample_cfg.copy()
+
+        if end_level == -1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level < inputs, no extra level is allowed
+            self.backbone_end_level = end_level
+            assert end_level <= len(in_channels)
+            assert num_outs == end_level - start_level
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+        assert isinstance(add_extra_convs, (str, bool))
+        if isinstance(add_extra_convs, str):
+            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
+            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
+        elif add_extra_convs:  # True
+            self.add_extra_convs = 'on_input'
+
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_convs = nn.ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
+                act_cfg=act_cfg,
+                inplace=False)
+            self.lateral_convs.append(l_conv)
+            if i == 0:
+                fpn_conv = ConvModule(
+                    out_channels,
+                    out_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+                self.fpn_convs.append(fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        if self.add_extra_convs and extra_levels >= 1:
+            for i in range(extra_levels):
+                if i == 0 and self.add_extra_convs == 'on_input':
+                    in_channels = self.in_channels[self.backbone_end_level - 1]
+                else:
+                    in_channels = out_channels
+                extra_fpn_conv = ConvModule(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+                self.fpn_convs.append(extra_fpn_conv)
+
+    # @auto_fp16()
+    def forward(self, inputs):
+        """Forward function."""
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
+            #  it cannot co-exist with `size` in `F.interpolate`.
+            if 'scale_factor' in self.upsample_cfg:
+                laterals[i - 1] += F.interpolate(laterals[i],
+                                                 **self.upsample_cfg)
+            else:
+                prev_shape = laterals[i - 1].shape[2:]
+                laterals[i - 1] += F.interpolate(
+                    laterals[i], size=prev_shape, **self.upsample_cfg)
+
+        # build outputs
+        # part 1: from original levels
+        outs = [
+            self.fpn_convs[i](laterals[i]) if i == 0 else laterals[i]
+            for i in range(used_backbone_levels)
+        ]
+        # part 2: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                if self.add_extra_convs == 'on_input':
+                    extra_source = inputs[self.backbone_end_level - 1]
+                elif self.add_extra_convs == 'on_lateral':
+                    extra_source = laterals[-1]
+                elif self.add_extra_convs == 'on_output':
+                    extra_source = outs[-1]
+                else:
+                    raise NotImplementedError
+                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/mmde/projects/PETR/petr/grid_mask.py b/mmde/projects/PETR/petr/grid_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..279d6b2b177e0b3d4f0f676d21fac7d2b2da25bb
--- /dev/null
+++ b/mmde/projects/PETR/petr/grid_mask.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+from PIL import Image
+
+
+class Grid(object):
+
+    def __init__(self,
+                 use_h,
+                 use_w,
+                 rotate=1,
+                 offset=False,
+                 ratio=0.5,
+                 mode=0,
+                 prob=1.,
+                 length=1):
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode = mode
+        self.st_prob = prob
+        self.prob = prob
+        self.length = length
+
+    def set_prob(self, epoch, max_epoch):
+        self.prob = self.st_prob * epoch / max_epoch
+
+    def __call__(self, img, label):
+        if np.random.rand() > self.prob:
+            return img, label
+        h = img.size(1)
+        w = img.size(2)
+        self.d1 = 2
+        self.d2 = min(h, w)
+        hh = int(1.5 * h)
+        ww = int(1.5 * w)
+        d = np.random.randint(self.d1, self.d2)
+        if self.ratio == 1:
+            self.length = np.random.randint(1, d)
+        else:
+            self.length = min(max(int(d * self.ratio + 0.5), 1), d - 1)
+        mask = np.ones((hh, ww), np.float32)
+        st_h = np.random.randint(d)
+        st_w = np.random.randint(d)
+        if self.use_h:
+            for i in range(hh // d):
+                s = d * i + st_h
+                t = min(s + self.length, hh)
+                mask[s:t, :] *= 0
+        if self.use_w:
+            for i in range(ww // d):
+                s = d * i + st_w
+                t = min(s + self.length, ww)
+                mask[:, s:t] *= 0
+
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+        mask = mask.rotate(r)
+        mask = np.asarray(mask)
+        mask = mask[(hh - h) // 2:(hh - h) // 2 + h,
+                    (ww - w) // 2:(ww - w) // 2 + w]
+
+        mask = torch.from_numpy(mask).float()
+        if self.mode == 1:
+            mask = 1 - mask
+
+        mask = mask.expand_as(img)
+        if self.offset:
+            offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float()
+            offset = (1 - mask) * offset
+            img = img * mask + offset
+        else:
+            img = img * mask
+
+        return img, label
+
+
+class GridMask(nn.Module):
+
+    def __init__(self,
+                 use_h,
+                 use_w,
+                 rotate=1,
+                 offset=False,
+                 ratio=0.5,
+                 mode=0,
+                 prob=1.):
+        super(GridMask, self).__init__()
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode = mode
+        self.st_prob = prob
+        self.prob = prob
+
+    def set_prob(self, epoch, max_epoch):
+        self.prob = self.st_prob * epoch / max_epoch  # + 1.#0.5
+
+    def forward(self, x):
+        if np.random.rand() > self.prob or not self.training:
+            return x
+        n, c, h, w = x.size()
+        x = x.view(-1, h, w)
+        hh = int(1.5 * h)
+        ww = int(1.5 * w)
+        d = np.random.randint(2, h)
+        self.length = min(max(int(d * self.ratio + 0.5), 1), d - 1)
+        mask = np.ones((hh, ww), np.float32)
+        st_h = np.random.randint(d)
+        st_w = np.random.randint(d)
+        if self.use_h:
+            for i in range(hh // d):
+                s = d * i + st_h
+                t = min(s + self.length, hh)
+                mask[s:t, :] *= 0
+        if self.use_w:
+            for i in range(ww // d):
+                s = d * i + st_w
+                t = min(s + self.length, ww)
+                mask[:, s:t] *= 0
+
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+        mask = mask.rotate(r)
+        mask = np.asarray(mask)
+        mask = mask[(hh - h) // 2:(hh - h) // 2 + h,
+                    (ww - w) // 2:(ww - w) // 2 + w]
+
+        mask = torch.from_numpy(mask).float().cuda()
+        if self.mode == 1:
+            mask = 1 - mask
+        mask = mask.expand_as(x)
+        if self.offset:
+            offset = torch.from_numpy(
+                2 * (np.random.rand(h, w) - 0.5)).float().cuda()
+            x = x * mask + offset * (1 - mask)
+        else:
+            x = x * mask
+
+        return x.view(n, c, h, w)
diff --git a/mmde/projects/PETR/petr/hungarian_assigner_3d.py b/mmde/projects/PETR/petr/hungarian_assigner_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..860032324ffbbc8dd3b64e7b9e3db71ea2b8d534
--- /dev/null
+++ b/mmde/projects/PETR/petr/hungarian_assigner_3d.py
@@ -0,0 +1,142 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2021 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Modified from mmdetection (https://github.com/open-mmlab/mmdetection)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+import torch
+from mmdet.models.task_modules import AssignResult, BaseAssigner
+
+from mmdet3d.registry import TASK_UTILS
+from projects.PETR.petr.utils import normalize_bbox
+
+try:
+    from scipy.optimize import linear_sum_assignment
+except ImportError:
+    linear_sum_assignment = None
+
+
+@TASK_UTILS.register_module()
+class HungarianAssigner3D(BaseAssigner):
+    """Computes one-to-one matching between predictions and ground truth. This
+    class computes an assignment between the targets and the predictions based
+    on the costs. The costs are weighted sum of three components:
+    classification cost, regression L1 cost and regression iou cost. The
+    targets don't include the no_object, so generally there are more
+    predictions than targets. After the one-to-one matching, the un-matched are
+    treated as backgrounds. Thus each query prediction will be assigned with
+    `0` or a positive integer indicating the ground truth index:
+
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+    Args:
+        cls_weight (int | float, optional): The scale factor for classification
+            cost. Default 1.0.
+        bbox_weight (int | float, optional): The scale factor for regression
+            L1 cost. Default 1.0.
+        iou_weight (int | float, optional): The scale factor for regression
+            iou cost. Default 1.0.
+        iou_calculator (dict | optional): The config for the iou calculation.
+            Default type `BboxOverlaps2D`.
+        iou_mode (str | optional): "iou" (intersection over union), "iof"
+                (intersection over foreground), or "giou" (generalized
+                intersection over union). Default "giou".
+    """
+
+    def __init__(self,
+                 cls_cost=dict(type='ClassificationCost', weight=1.),
+                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+                 iou_cost=dict(type='IoUCost', weight=0.0),
+                 pc_range=None):
+        self.cls_cost = TASK_UTILS.build(cls_cost)
+        self.reg_cost = TASK_UTILS.build(reg_cost)
+        self.iou_cost = TASK_UTILS.build(iou_cost)
+        self.pc_range = pc_range
+
+    def assign(self,
+               bbox_pred,
+               cls_pred,
+               gt_bboxes,
+               gt_labels,
+               gt_bboxes_ignore=None,
+               eps=1e-7):
+        """Computes one-to-one matching based on the weighted costs.
+        This method assign each query prediction to a ground truth or
+        background. The `assigned_gt_inds` with -1 means don't care,
+        0 means negative sample, and positive number is the index (1-based)
+        of assigned gt.
+        The assignment is done in the following steps, the order matters.
+        1. assign every prediction to -1
+        2. compute the weighted costs
+        3. do Hungarian matching on CPU based on the costs
+        4. assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx, cy, w, h), which are all in range [0, 1]. Shape
+                [num_query, 4].
+            cls_pred (Tensor): Predicted classification logits, shape
+                [num_query, num_class].
+            gt_bboxes (Tensor): Ground truth boxes with unnormalized
+                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`. Default None.
+            eps (int | float, optional): A value added to the denominator for
+                numerical stability. Default 1e-7.
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        assert gt_bboxes_ignore is None, \
+            'Only case when gt_bboxes_ignore is None is supported.'
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+                                              -1,
+                                              dtype=torch.long)
+        assigned_labels = bbox_pred.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+
+        # 2. compute the weighted costs
+        # classification and bboxcost.
+        cls_cost = self.cls_cost(cls_pred, gt_labels)
+        # regression L1 cost
+        normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range)
+        reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8])
+
+        # weighted sum of above two costs
+        cost = cls_cost + reg_cost
+
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+        cost = torch.nan_to_num(cost, nan=100.0, posinf=100.0, neginf=-100.0)
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = torch.from_numpy(matched_row_inds).to(
+            bbox_pred.device)
+        matched_col_inds = torch.from_numpy(matched_col_inds).to(
+            bbox_pred.device)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+        return AssignResult(
+            num_gts, assigned_gt_inds, None, labels=assigned_labels)
diff --git a/mmde/projects/PETR/petr/match_cost.py b/mmde/projects/PETR/petr/match_cost.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee48d4ba4b3ad23670a1f7f6f87461c62e31a3d4
--- /dev/null
+++ b/mmde/projects/PETR/petr/match_cost.py
@@ -0,0 +1,338 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet3d.registry import TASK_UTILS
+
+
+def fp16_clamp(x, min=None, max=None):
+    if not x.is_cuda and x.dtype == torch.float16:
+        # clamp for cpu float16, tensor fp16 has no clamp implementation
+        return x.float().clamp(min, max).half()
+
+    return x.clamp(min, max)
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
+    """Calculate overlap between two set of bboxes.
+    FP16 Contributed by https://github.com/open-mmlab/mmdetection/pull/4889
+    Note:
+        Assume bboxes1 is M x 4, bboxes2 is N x 4, when mode is 'iou',
+        there are some new generated variable when calculating IOU
+        using bbox_overlaps function:
+        1) is_aligned is False
+            area1: M x 1
+            area2: N x 1
+            lt: M x N x 2
+            rb: M x N x 2
+            wh: M x N x 2
+            overlap: M x N x 1
+            union: M x N x 1
+            ious: M x N x 1
+            Total memory:
+                S = (9 x N x M + N + M) * 4 Byte,
+            When using FP16, we can reduce:
+                R = (9 x N x M + N + M) * 4 / 2 Byte
+                R large than (N + M) * 4 * 2 is always true when N and M >= 1.
+                Obviously, N + M <= N * M < 3 * N * M, when N >=2 and M >=2,
+                           N + 1 < 3 * N, when N or M is 1.
+            Given M = 40 (ground truth), N = 400000 (three anchor boxes
+            in per grid, FPN, R-CNNs),
+                R = 275 MB (one times)
+            A special case (dense detection), M = 512 (ground truth),
+                R = 3516 MB = 3.43 GB
+            When the batch size is B, reduce:
+                B x R
+            Therefore, CUDA memory runs out frequently.
+            Experiments on GeForce RTX 2080Ti (11019 MiB):
+            |   dtype   |   M   |   N   |   Use    |   Real   |   Ideal   |
+            |:----:|:----:|:----:|:----:|:----:|:----:|
+            |   FP32   |   512 | 400000 | 8020 MiB |   --   |   --   |
+            |   FP16   |   512 | 400000 |   4504 MiB | 3516 MiB | 3516 MiB |
+            |   FP32   |   40 | 400000 |   1540 MiB |   --   |   --   |
+            |   FP16   |   40 | 400000 |   1264 MiB |   276MiB   | 275 MiB |
+        2) is_aligned is True
+            area1: N x 1
+            area2: N x 1
+            lt: N x 2
+            rb: N x 2
+            wh: N x 2
+            overlap: N x 1
+            union: N x 1
+            ious: N x 1
+            Total memory:
+                S = 11 x N * 4 Byte
+            When using FP16, we can reduce:
+                R = 11 x N * 4 / 2 Byte
+        So do the 'giou' (large than 'iou').
+        Time-wise, FP16 is generally faster than FP32.
+        When gpu_assign_thr is not -1, it takes more time on cpu
+        but not reduce memory.
+        There, we can reduce half the memory and keep the speed.
+    If ``is_aligned`` is ``False``, then calculate the overlaps between each
+    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
+    pair of bboxes1 and bboxes2.
+    Args:
+        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
+        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
+            B indicates the batch dim, in shape (B1, B2, ..., Bn).
+            If ``is_aligned`` is ``True``, then m and n must be equal.
+        mode (str): "iou" (intersection over union), "iof" (intersection over
+            foreground) or "giou" (generalized intersection over union).
+            Default "iou".
+        is_aligned (bool, optional): If True, then m and n must be equal.
+            Default False.
+        eps (float, optional): A value added to the denominator for numerical
+            stability. Default 1e-6.
+    Returns:
+        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+    Example:
+        >>> bboxes1 = torch.FloatTensor([
+        >>>     [0, 0, 10, 10],
+        >>>     [10, 10, 20, 20],
+        >>>     [32, 32, 38, 42],
+        >>> ])
+        >>> bboxes2 = torch.FloatTensor([
+        >>>     [0, 0, 10, 20],
+        >>>     [0, 10, 10, 19],
+        >>>     [10, 10, 20, 20],
+        >>> ])
+        >>> overlaps = bbox_overlaps(bboxes1, bboxes2)
+        >>> assert overlaps.shape == (3, 3)
+        >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
+        >>> assert overlaps.shape == (3, )
+    Example:
+        >>> empty = torch.empty(0, 4)
+        >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
+        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
+        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
+        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
+    """
+
+    assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
+    # Either the boxes are empty or the length of boxes' last dimension is 4
+    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
+    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
+
+    # Batch dim must be the same
+    # Batch dim: (B1, B2, ... Bn)
+    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
+    batch_shape = bboxes1.shape[:-2]
+
+    rows = bboxes1.size(-2)
+    cols = bboxes2.size(-2)
+    if is_aligned:
+        assert rows == cols
+
+    if rows * cols == 0:
+        if is_aligned:
+            return bboxes1.new(batch_shape + (rows, ))
+        else:
+            return bboxes1.new(batch_shape + (rows, cols))
+
+    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
+        bboxes1[..., 3] - bboxes1[..., 1])
+    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
+        bboxes2[..., 3] - bboxes2[..., 1])
+
+    if is_aligned:
+        lt = torch.max(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
+        rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]
+
+        wh = fp16_clamp(rb - lt, min=0)
+        overlap = wh[..., 0] * wh[..., 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1 + area2 - overlap
+        else:
+            union = area1
+        if mode == 'giou':
+            enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
+            enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
+    else:
+        lt = torch.max(bboxes1[..., :, None, :2],
+                       bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
+        rb = torch.min(bboxes1[..., :, None, 2:],
+                       bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]
+
+        wh = fp16_clamp(rb - lt, min=0)
+        overlap = wh[..., 0] * wh[..., 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1[..., None] + area2[..., None, :] - overlap
+        else:
+            union = area1[..., None]
+        if mode == 'giou':
+            enclosed_lt = torch.min(bboxes1[..., :, None, :2],
+                                    bboxes2[..., None, :, :2])
+            enclosed_rb = torch.max(bboxes1[..., :, None, 2:],
+                                    bboxes2[..., None, :, 2:])
+
+    eps = union.new_tensor([eps])
+    union = torch.max(union, eps)
+    ious = overlap / union
+    if mode in ['iou', 'iof']:
+        return ious
+    # calculate gious
+    enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0)
+    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+    enclose_area = torch.max(enclose_area, eps)
+    gious = ious - (enclose_area - union) / enclose_area
+    return gious
+
+
+@TASK_UTILS.register_module()
+class BBox3DL1Cost(object):
+    """BBox3DL1Cost.
+
+    Args:
+        weight (int | float, optional): loss_weight
+    """
+
+    def __init__(self, weight=1.):
+        self.weight = weight
+
+    def __call__(self, bbox_pred, gt_bboxes):
+        """
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx, cy, w, h), which are all in range [0, 1]. Shape
+                [num_query, 4].
+            gt_bboxes (Tensor): Ground truth boxes with normalized
+                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+        Returns:
+            torch.Tensor: bbox_cost value with weight
+        """
+        bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+        return bbox_cost * self.weight
+
+
+@TASK_UTILS.register_module()
+class FocalLossCost:
+    """FocalLossCost.
+     Args:
+         weight (int | float, optional): loss_weight
+         alpha (int | float, optional): focal_loss alpha
+         gamma (int | float, optional): focal_loss gamma
+         eps (float, optional): default 1e-12
+         binary_input (bool, optional): Whether the input is binary,
+            default False.
+     Examples:
+         >>> from mmdet.core.bbox.match_costs.match_cost import FocalLossCost
+         >>> import torch
+         >>> self = FocalLossCost()
+         >>> cls_pred = torch.rand(4, 3)
+         >>> gt_labels = torch.tensor([0, 1, 2])
+         >>> factor = torch.tensor([10, 8, 10, 8])
+         >>> self(cls_pred, gt_labels)
+         tensor([[-0.3236, -0.3364, -0.2699],
+                [-0.3439, -0.3209, -0.4807],
+                [-0.4099, -0.3795, -0.2929],
+                [-0.1950, -0.1207, -0.2626]])
+    """
+
+    def __init__(self,
+                 weight=1.,
+                 alpha=0.25,
+                 gamma=2,
+                 eps=1e-12,
+                 binary_input=False):
+        self.weight = weight
+        self.alpha = alpha
+        self.gamma = gamma
+        self.eps = eps
+        self.binary_input = binary_input
+
+    def _focal_loss_cost(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classification logits, shape
+                (num_query, num_class).
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+        Returns:
+            torch.Tensor: cls_cost value with weight
+        """
+        cls_pred = cls_pred.sigmoid()
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+
+        cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels]
+        return cls_cost * self.weight
+
+    def _mask_focal_loss_cost(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classfication logits
+                in shape (num_query, d1, ..., dn), dtype=torch.float32.
+            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
+                dtype=torch.long. Labels should be binary.
+        Returns:
+            Tensor: Focal cost matrix with weight in shape\
+                (num_query, num_gt).
+        """
+        cls_pred = cls_pred.flatten(1)
+        gt_labels = gt_labels.flatten(1).float()
+        n = cls_pred.shape[1]
+        cls_pred = cls_pred.sigmoid()
+        neg_cost = -(1 - cls_pred + self.eps).log() * (
+            1 - self.alpha) * cls_pred.pow(self.gamma)
+        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+            1 - cls_pred).pow(self.gamma)
+
+        cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \
+            torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
+        return cls_cost / n * self.weight
+
+    def __call__(self, cls_pred, gt_labels):
+        """
+        Args:
+            cls_pred (Tensor): Predicted classfication logits.
+            gt_labels (Tensor)): Labels.
+        Returns:
+            Tensor: Focal cost matrix with weight in shape\
+                (num_query, num_gt).
+        """
+        if self.binary_input:
+            return self._mask_focal_loss_cost(cls_pred, gt_labels)
+        else:
+            return self._focal_loss_cost(cls_pred, gt_labels)
+
+
+@TASK_UTILS.register_module()
+class IoUCost:
+    """IoUCost.
+     Args:
+         iou_mode (str, optional): iou mode such as 'iou' | 'giou'
+         weight (int | float, optional): loss weight
+     Examples:
+         >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost
+         >>> import torch
+         >>> self = IoUCost()
+         >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]])
+         >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+         >>> self(bboxes, gt_bboxes)
+         tensor([[-0.1250,  0.1667],
+                [ 0.1667, -0.5000]])
+    """
+
+    def __init__(self, iou_mode='giou', weight=1.):
+        self.weight = weight
+        self.iou_mode = iou_mode
+
+    def __call__(self, bboxes, gt_bboxes):
+        """
+        Args:
+            bboxes (Tensor): Predicted boxes with unnormalized coordinates
+                (x1, y1, x2, y2). Shape (num_query, 4).
+            gt_bboxes (Tensor): Ground truth boxes with unnormalized
+                coordinates (x1, y1, x2, y2). Shape (num_gt, 4).
+        Returns:
+            torch.Tensor: iou_cost value with weight
+        """
+        # overlaps: [num_bboxes, num_gt]
+        overlaps = bbox_overlaps(
+            bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False)
+        # The 1 is a constant that doesn't change the matching, so omitted.
+        iou_cost = -overlaps
+        return iou_cost * self.weight
diff --git a/mmde/projects/PETR/petr/nms_free_coder.py b/mmde/projects/PETR/petr/nms_free_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1415d4c0e191250794c3011344377cbe6454be9
--- /dev/null
+++ b/mmde/projects/PETR/petr/nms_free_coder.py
@@ -0,0 +1,246 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2021 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+import torch
+import torch.nn.functional as F
+from mmdet.models.task_modules import BaseBBoxCoder
+
+from mmdet3d.registry import TASK_UTILS
+from projects.PETR.petr.utils import denormalize_bbox
+
+
+@TASK_UTILS.register_module()
+class NMSFreeCoder(BaseBBoxCoder):
+    """Bbox coder for NMS-free detector.
+
+    Args:
+        pc_range (list[float]): Range of point cloud.
+        post_center_range (list[float]): Limit of the center.
+            Default: None.
+        max_num (int): Max number to be kept. Default: 100.
+        score_threshold (float): Threshold to filter boxes based on score.
+            Default: None.
+        code_size (int): Code size of bboxes. Default: 9
+    """
+
+    def __init__(self,
+                 pc_range,
+                 voxel_size=None,
+                 post_center_range=None,
+                 max_num=100,
+                 score_threshold=None,
+                 num_classes=10):
+
+        self.pc_range = pc_range
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.max_num = max_num
+        self.score_threshold = score_threshold
+        self.num_classes = num_classes
+
+    def encode(self):
+        pass
+
+    def decode_single(self, cls_scores, bbox_preds):
+        """Decode bboxes.
+
+        Args:
+            cls_scores (Tensor): Outputs from the classification head, \
+                shape [num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            bbox_preds (Tensor): Outputs from the regression \
+                head with normalized coordinate format \
+                (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [num_query, 9].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        max_num = self.max_num
+
+        cls_scores = cls_scores.sigmoid()
+        scores, indexes = cls_scores.view(-1).topk(max_num)
+        labels = indexes % self.num_classes
+        bbox_index = indexes // self.num_classes
+        bbox_preds = bbox_preds[bbox_index]
+
+        final_box_preds = denormalize_bbox(bbox_preds, self.pc_range)
+        final_scores = scores
+        final_preds = labels
+
+        # use score threshold
+        if self.score_threshold is not None:
+            thresh_mask = final_scores > self.score_threshold
+        if self.post_center_range is not None:
+            self.post_center_range = torch.tensor(
+                self.post_center_range, device=scores.device)
+
+            mask = (final_box_preds[..., :3] >=
+                    self.post_center_range[:3]).all(1)
+            mask &= (final_box_preds[..., :3] <=
+                     self.post_center_range[3:]).all(1)
+
+            if self.score_threshold:
+                mask &= thresh_mask
+
+            boxes3d = final_box_preds[mask]
+            scores = final_scores[mask]
+            labels = final_preds[mask]
+            predictions_dict = {
+                'bboxes': boxes3d,
+                'scores': scores,
+                'labels': labels
+            }
+
+        else:
+            raise NotImplementedError(
+                'Need to reorganize output as a batch, only '
+                'support post_center_range is not None for now!')
+        return predictions_dict
+
+    def decode(self, preds_dicts):
+        """Decode bboxes.
+
+        Args:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format \
+                (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [nb_dec, bs, num_query, 9].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        all_cls_scores = preds_dicts['all_cls_scores'][-1]
+        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
+
+        batch_size = all_cls_scores.size()[0]
+        predictions_list = []
+        for i in range(batch_size):
+            predictions_list.append(
+                self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
+        return predictions_list
+
+
+@TASK_UTILS.register_module()
+class NMSFreeClsCoder(BaseBBoxCoder):
+    """Bbox coder for NMS-free detector.
+
+    Args:
+        pc_range (list[float]): Range of point cloud.
+        post_center_range (list[float]): Limit of the center.
+            Default: None.
+        max_num (int): Max number to be kept. Default: 100.
+        score_threshold (float): Threshold to filter boxes based on score.
+            Default: None.
+        code_size (int): Code size of bboxes. Default: 9
+    """
+
+    def __init__(self,
+                 pc_range,
+                 voxel_size=None,
+                 post_center_range=None,
+                 max_num=100,
+                 score_threshold=None,
+                 num_classes=10):
+
+        self.pc_range = pc_range
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.max_num = max_num
+        self.score_threshold = score_threshold
+        self.num_classes = num_classes
+
+    def encode(self):
+        pass
+
+    def decode_single(self, cls_scores, bbox_preds):
+        """Decode bboxes.
+
+        Args:
+            cls_scores (Tensor): Outputs from the classification head, \
+                shape [num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            bbox_preds (Tensor): Outputs from the regression \
+                head with normalized coordinate format \
+                (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [num_query, 9].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        max_num = self.max_num
+
+        # cls_scores = cls_scores.sigmoid()
+        # scores, indexes = cls_scores.view(-1).topk(max_num)
+        # labels = indexes % self.num_classes
+        # bbox_index = indexes // self.num_classes
+        # bbox_preds = bbox_preds[bbox_index]
+
+        cls_scores, labels = F.softmax(cls_scores, dim=-1)[..., :-1].max(-1)
+        scores, indexes = cls_scores.view(-1).topk(max_num)
+        labels = labels[indexes]
+        bbox_preds = bbox_preds[indexes]
+
+        final_box_preds = denormalize_bbox(bbox_preds, self.pc_range)
+        final_scores = scores
+        final_preds = labels
+
+        # use score threshold
+        if self.score_threshold is not None:
+            thresh_mask = final_scores > self.score_threshold
+        if self.post_center_range is not None:
+            self.post_center_range = torch.tensor(
+                self.post_center_range, device=scores.device)
+
+            mask = (final_box_preds[..., :3] >=
+                    self.post_center_range[:3]).all(1)
+            mask &= (final_box_preds[..., :3] <=
+                     self.post_center_range[3:]).all(1)
+
+            if self.score_threshold:
+                mask &= thresh_mask
+
+            boxes3d = final_box_preds[mask]
+            scores = final_scores[mask]
+            labels = final_preds[mask]
+            predictions_dict = {
+                'bboxes': boxes3d,
+                'scores': scores,
+                'labels': labels
+            }
+
+        else:
+            raise NotImplementedError(
+                'Need to reorganize output as a batch, only '
+                'support post_center_range is not None for now!')
+        return predictions_dict
+
+    def decode(self, preds_dicts):
+        """Decode bboxes.
+
+        Args:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format \
+                (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [nb_dec, bs, num_query, 9].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        all_cls_scores = preds_dicts['all_cls_scores'][-1]
+        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
+
+        batch_size = all_cls_scores.size()[0]
+        predictions_list = []
+        for i in range(batch_size):
+            predictions_list.append(
+                self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
+        return predictions_list
diff --git a/mmde/projects/PETR/petr/petr.py b/mmde/projects/PETR/petr/petr.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4acff661a95ee04d40f71d20af213d94ad08ee9
--- /dev/null
+++ b/mmde/projects/PETR/petr/petr.py
@@ -0,0 +1,282 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.ops import bbox3d2result
+from .grid_mask import GridMask
+
+
+@MODELS.register_module()
+class PETR(MVXTwoStageDetector):
+    """PETR."""
+
+    def __init__(self,
+                 use_grid_mask=False,
+                 pts_voxel_layer=None,
+                 pts_middle_encoder=None,
+                 pts_fusion_layer=None,
+                 img_backbone=None,
+                 pts_backbone=None,
+                 img_neck=None,
+                 pts_neck=None,
+                 pts_bbox_head=None,
+                 img_roi_head=None,
+                 img_rpn_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 init_cfg=None,
+                 data_preprocessor=None,
+                 **kwargs):
+        super(PETR,
+              self).__init__(pts_voxel_layer, pts_middle_encoder,
+                             pts_fusion_layer, img_backbone, pts_backbone,
+                             img_neck, pts_neck, pts_bbox_head, img_roi_head,
+                             img_rpn_head, train_cfg, test_cfg, init_cfg,
+                             data_preprocessor)
+        self.grid_mask = GridMask(
+            True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
+        self.use_grid_mask = use_grid_mask
+
+    def extract_img_feat(self, img, img_metas):
+        """Extract features of images."""
+        if isinstance(img, list):
+            img = torch.stack(img, dim=0)
+
+        B = img.size(0)
+        if img is not None:
+            input_shape = img.shape[-2:]
+            # update real input shape of each single img
+            for img_meta in img_metas:
+                img_meta.update(input_shape=input_shape)
+            if img.dim() == 5:
+                if img.size(0) == 1 and img.size(1) != 1:
+                    img.squeeze_()
+                else:
+                    B, N, C, H, W = img.size()
+                    img = img.view(B * N, C, H, W)
+            if self.use_grid_mask:
+                img = self.grid_mask(img)
+            img_feats = self.img_backbone(img)
+            if isinstance(img_feats, dict):
+                img_feats = list(img_feats.values())
+        else:
+            return None
+        if self.with_img_neck:
+            img_feats = self.img_neck(img_feats)
+        img_feats_reshaped = []
+        for img_feat in img_feats:
+            BN, C, H, W = img_feat.size()
+            img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
+        return img_feats_reshaped
+
+    # @auto_fp16(apply_to=('img'), out_fp32=True)
+    def extract_feat(self, img, img_metas):
+        """Extract features from images and points."""
+        img_feats = self.extract_img_feat(img, img_metas)
+        return img_feats
+
+    def forward_pts_train(self,
+                          pts_feats,
+                          gt_bboxes_3d,
+                          gt_labels_3d,
+                          img_metas,
+                          gt_bboxes_ignore=None):
+        """Forward function for point cloud branch.
+
+        Args:
+            pts_feats (list[torch.Tensor]): Features of point cloud branch
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
+                boxes for each sample.
+            gt_labels_3d (list[torch.Tensor]): Ground truth labels for
+                boxes of each sampole
+            img_metas (list[dict]): Meta information of samples.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                boxes to be ignored. Defaults to None.
+        Returns:
+            dict: Losses of each branch.
+        """
+        outs = self.pts_bbox_head(pts_feats, img_metas)
+        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
+        losses = self.pts_bbox_head.loss_by_feat(*loss_inputs)
+
+        return losses
+
+    def _forward(self, mode='loss', **kwargs):
+        """Calls either forward_train or forward_test depending on whether
+        return_loss=True.
+
+        Note this setting will change the expected inputs. When
+        `return_loss=True`, img and img_metas are single-nested (i.e.
+        torch.Tensor and list[dict]), and when `resturn_loss=False`, img and
+        img_metas should be double nested (i.e.  list[torch.Tensor],
+        list[list[dict]]), with the outer list indicating test time
+        augmentations.
+        """
+        raise NotImplementedError('tensor mode is yet to add')
+
+    def loss(self,
+             inputs=None,
+             data_samples=None,
+             mode=None,
+             points=None,
+             img_metas=None,
+             gt_bboxes_3d=None,
+             gt_labels_3d=None,
+             gt_labels=None,
+             gt_bboxes=None,
+             img=None,
+             proposals=None,
+             gt_bboxes_ignore=None,
+             img_depth=None,
+             img_mask=None):
+        """Forward training function.
+
+        Args:
+            points (list[torch.Tensor], optional): Points of each sample.
+                Defaults to None.
+            img_metas (list[dict], optional): Meta information of each sample.
+                Defaults to None.
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
+                Ground truth 3D boxes. Defaults to None.
+            gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
+                of 3D boxes. Defaults to None.
+            gt_labels (list[torch.Tensor], optional): Ground truth labels
+                of 2D boxes in images. Defaults to None.
+            gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
+                images. Defaults to None.
+            img (torch.Tensor optional): Images of each sample with shape
+                (N, C, H, W). Defaults to None.
+            proposals ([list[torch.Tensor], optional): Predicted proposals
+                used for training Fast RCNN. Defaults to None.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                2D boxes in images to be ignored. Defaults to None.
+        Returns:
+            dict: Losses of different branches.
+        """
+        img = inputs['imgs']
+        batch_img_metas = [ds.metainfo for ds in data_samples]
+        batch_gt_instances_3d = [ds.gt_instances_3d for ds in data_samples]
+        gt_bboxes_3d = [gt.bboxes_3d for gt in batch_gt_instances_3d]
+        gt_labels_3d = [gt.labels_3d for gt in batch_gt_instances_3d]
+        gt_bboxes_ignore = None
+
+        batch_img_metas = self.add_lidar2img(img, batch_img_metas)
+
+        img_feats = self.extract_feat(img=img, img_metas=batch_img_metas)
+
+        losses = dict()
+        losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
+                                            gt_labels_3d, batch_img_metas,
+                                            gt_bboxes_ignore)
+        losses.update(losses_pts)
+        return losses
+
+    def predict(self, inputs=None, data_samples=None, mode=None, **kwargs):
+        img = inputs['imgs']
+        batch_img_metas = [ds.metainfo for ds in data_samples]
+        for var, name in [(batch_img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError('{} must be a list, but got {}'.format(
+                    name, type(var)))
+        img = [img] if img is None else img
+
+        batch_img_metas = self.add_lidar2img(img, batch_img_metas)
+
+        results_list_3d = self.simple_test(batch_img_metas, img, **kwargs)
+
+        for i, data_sample in enumerate(data_samples):
+            results_list_3d_i = InstanceData(
+                metainfo=results_list_3d[i]['pts_bbox'])
+            data_sample.pred_instances_3d = results_list_3d_i
+            data_sample.pred_instances = InstanceData()
+
+        return data_samples
+
+    def simple_test_pts(self, x, img_metas, rescale=False):
+        """Test function of point cloud branch."""
+        outs = self.pts_bbox_head(x, img_metas)
+        bbox_list = self.pts_bbox_head.get_bboxes(
+            outs, img_metas, rescale=rescale)
+        bbox_results = [
+            bbox3d2result(bboxes, scores, labels)
+            for bboxes, scores, labels in bbox_list
+        ]
+        return bbox_results
+
+    def simple_test(self, img_metas, img=None, rescale=False):
+        """Test function without augmentaiton."""
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+
+        bbox_list = [dict() for i in range(len(img_metas))]
+        bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale)
+        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+            result_dict['pts_bbox'] = pts_bbox
+        return bbox_list
+
+    def aug_test_pts(self, feats, img_metas, rescale=False):
+        feats_list = []
+        for j in range(len(feats[0])):
+            feats_list_level = []
+            for i in range(len(feats)):
+                feats_list_level.append(feats[i][j])
+            feats_list.append(torch.stack(feats_list_level, -1).mean(-1))
+        outs = self.pts_bbox_head(feats_list, img_metas)
+        bbox_list = self.pts_bbox_head.get_bboxes(
+            outs, img_metas, rescale=rescale)
+        bbox_results = [
+            bbox3d2result(bboxes, scores, labels)
+            for bboxes, scores, labels in bbox_list
+        ]
+        return bbox_results
+
+    def aug_test(self, img_metas, imgs=None, rescale=False):
+        """Test function with augmentaiton."""
+        img_feats = self.extract_feats(img_metas, imgs)
+        img_metas = img_metas[0]
+        bbox_list = [dict() for i in range(len(img_metas))]
+        bbox_pts = self.aug_test_pts(img_feats, img_metas, rescale)
+        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+            result_dict['pts_bbox'] = pts_bbox
+        return bbox_list
+
+    # may need speed-up
+    def add_lidar2img(self, img, batch_input_metas):
+        """add 'lidar2img' transformation matrix into batch_input_metas.
+
+        Args:
+            batch_input_metas (list[dict]): Meta information of multiple inputs
+                in a batch.
+        Returns:
+            batch_input_metas (list[dict]): Meta info with lidar2img added
+        """
+        for meta in batch_input_metas:
+            lidar2img_rts = []
+            # obtain lidar to image transformation matrix
+            for i in range(len(meta['cam2img'])):
+                lidar2cam_rt = torch.tensor(meta['lidar2cam'][i]).double()
+                intrinsic = torch.tensor(meta['cam2img'][i]).double()
+                viewpad = torch.eye(4).double()
+                viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+                lidar2img_rt = (viewpad @ lidar2cam_rt)
+                # The extrinsics mean the transformation from lidar to camera.
+                # If anyone want to use the extrinsics as sensor to lidar,
+                # please use np.linalg.inv(lidar2cam_rt.T)
+                # and modify the ResizeCropFlipImage
+                # and LoadMultiViewImageFromMultiSweepsFiles.
+                lidar2img_rts.append(lidar2img_rt)
+            meta['lidar2img'] = lidar2img_rts
+            img_shape = meta['img_shape'][:3]
+            meta['img_shape'] = [img_shape] * len(img[0])
+
+        return batch_input_metas
diff --git a/mmde/projects/PETR/petr/petr_head.py b/mmde/projects/PETR/petr/petr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..acebd627f4e26acb8cb397ac49ae7f25ad968016
--- /dev/null
+++ b/mmde/projects/PETR/petr/petr_head.py
@@ -0,0 +1,825 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Conv2d, Linear
+from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead
+from mmdet.models.layers import NormedLinear
+from mmdet.models.layers.transformer import inverse_sigmoid
+from mmdet.models.utils import multi_apply
+from mmengine.model.weight_init import bias_init_with_prob
+from mmengine.structures import InstanceData
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+from projects.PETR.petr.utils import normalize_bbox
+
+
+def pos2posemb3d(pos, num_pos_feats=128, temperature=10000):
+    scale = 2 * math.pi
+    pos = pos * scale
+    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device)
+    dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats)
+    pos_x = pos[..., 0, None] / dim_t
+    pos_y = pos[..., 1, None] / dim_t
+    pos_z = pos[..., 2, None] / dim_t
+    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()),
+                        dim=-1).flatten(-2)
+    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()),
+                        dim=-1).flatten(-2)
+    pos_z = torch.stack((pos_z[..., 0::2].sin(), pos_z[..., 1::2].cos()),
+                        dim=-1).flatten(-2)
+    posemb = torch.cat((pos_y, pos_x, pos_z), dim=-1)
+    return posemb
+
+
+@MODELS.register_module()
+class PETRHead(AnchorFreeHead):
+    """Implements the DETR transformer head. See `paper: End-to-End Object
+    Detection with Transformers.
+
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+    Args:
+        num_classes (int): Number of categories excluding the background.
+        in_channels (int): Number of channels in the input feature map.
+        num_query (int): Number of query in Transformer.
+        num_reg_fcs (int, optional): Number of fully-connected layers used in
+            `FFN`, which is then used for the regression head. Default 2.
+        transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer.
+            Default: None.
+        sync_cls_avg_factor (bool): Whether to sync the avg_factor of
+            all ranks. Default to False.
+        positional_encoding (obj:`mmcv.ConfigDict`|dict):
+            Config for position encoding.
+        loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the
+            classification loss. Default `CrossEntropyLoss`.
+        loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the
+            regression loss. Default `L1Loss`.
+        loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the
+            regression iou loss. Default `GIoULoss`.
+        tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of
+            transformer head.
+        test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of
+            transformer head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+    _version = 2
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 num_query=100,
+                 num_reg_fcs=2,
+                 transformer=None,
+                 sync_cls_avg_factor=False,
+                 positional_encoding=dict(
+                     type='SinePositionalEncoding',
+                     num_feats=128,
+                     normalize=True),
+                 code_weights=None,
+                 bbox_coder=None,
+                 loss_cls=dict(
+                     type='CrossEntropyLoss',
+                     bg_cls_weight=0.1,
+                     use_sigmoid=False,
+                     loss_weight=1.0,
+                     class_weight=1.0),
+                 loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                 loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                 train_cfg=dict(
+                     assigner=dict(
+                         type='HungarianAssigner',
+                         cls_cost=dict(type='ClassificationCost', weight=1.),
+                         reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+                         iou_cost=dict(
+                             type='IoUCost', iou_mode='giou', weight=2.0))),
+                 test_cfg=dict(max_per_img=100),
+                 with_position=True,
+                 with_multiview=False,
+                 depth_step=0.8,
+                 depth_num=64,
+                 LID=False,
+                 depth_start=1,
+                 position_range=[-65, -65, -8.0, 65, 65, 8.0],
+                 init_cfg=None,
+                 normedlinear=False,
+                 **kwargs):
+        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
+        # since it brings inconvenience when the initialization of
+        # `AnchorFreeHead` is called.
+        if 'code_size' in kwargs:
+            self.code_size = kwargs['code_size']
+        else:
+            self.code_size = 10
+        if code_weights is not None:
+            self.code_weights = code_weights
+        else:
+            self.code_weights = [
+                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2
+            ]
+        self.code_weights = self.code_weights[:self.code_size]
+        self.bg_cls_weight = 0
+        self.sync_cls_avg_factor = sync_cls_avg_factor
+        class_weight = loss_cls.get('class_weight', None)
+        if class_weight is not None and (self.__class__ is PETRHead):
+            assert isinstance(class_weight, float), 'Expected ' \
+                'class_weight to have type float. Found ' \
+                f'{type(class_weight)}.'
+            # NOTE following the official DETR rep0, bg_cls_weight means
+            # relative classification weight of the no-object class.
+            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
+            assert isinstance(bg_cls_weight, float), 'Expected ' \
+                'bg_cls_weight to have type float. Found ' \
+                f'{type(bg_cls_weight)}.'
+            class_weight = torch.ones(num_classes + 1) * class_weight
+            # set background class as the last indice
+            class_weight[num_classes] = bg_cls_weight
+            loss_cls.update({'class_weight': class_weight})
+            if 'bg_cls_weight' in loss_cls:
+                loss_cls.pop('bg_cls_weight')
+            self.bg_cls_weight = bg_cls_weight
+
+        if train_cfg:
+            assert 'assigner' in train_cfg, 'assigner should be provided '\
+                'when train_cfg is set.'
+            assigner = train_cfg['assigner']
+            assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
+                'The classification weight for loss and matcher should be' \
+                'exactly the same.'
+            assert loss_bbox['loss_weight'] == assigner['reg_cost'][
+                'weight'], 'The regression L1 weight for loss and matcher ' \
+                'should be exactly the same.'
+            # assert loss_iou['loss_weight'] == assigner['iou_cost'][
+            #   'weight'], \
+            # 'The regression iou weight for loss and matcher should be' \
+            # 'exactly the same.'
+            self.assigner = TASK_UTILS.build(assigner)
+            # DETR sampling=False, so use PseudoSampler
+            sampler_cfg = dict(type='PseudoSampler')
+            self.sampler = TASK_UTILS.build(sampler_cfg)
+
+        self.num_query = num_query
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.num_reg_fcs = num_reg_fcs
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.fp16_enabled = False
+        self.embed_dims = 256
+        self.depth_step = depth_step
+        self.depth_num = depth_num
+        self.position_dim = 3 * self.depth_num
+        self.position_range = position_range
+        self.LID = LID
+        self.depth_start = depth_start
+        self.position_level = 0
+        self.with_position = with_position
+        self.with_multiview = with_multiview
+        assert 'num_feats' in positional_encoding
+        num_feats = positional_encoding['num_feats']
+        assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
+            f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
+            f' and {num_feats}.'
+        self.act_cfg = transformer.get('act_cfg',
+                                       dict(type='ReLU', inplace=True))
+        self.num_pred = 6
+        self.normedlinear = normedlinear
+        super(PETRHead, self).__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            bbox_coder=bbox_coder,
+            init_cfg=init_cfg)
+
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox = MODELS.build(loss_bbox)
+        self.loss_iou = MODELS.build(loss_iou)
+
+        if self.loss_cls.use_sigmoid:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1
+        # self.activate = build_activation_layer(self.act_cfg)
+        # if self.with_multiview or not self.with_position:
+        #     self.positional_encoding = build_positional_encoding(
+        #         positional_encoding)
+        self.positional_encoding = TASK_UTILS.build(positional_encoding)
+        self.transformer = MODELS.build(transformer)
+        self.code_weights = nn.Parameter(
+            torch.tensor(self.code_weights, requires_grad=False),
+            requires_grad=False)
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.pc_range = self.bbox_coder.pc_range
+        self._init_layers()
+
+    def _init_layers(self):
+        """Initialize layers of the transformer head."""
+        if self.with_position:
+            self.input_proj = Conv2d(
+                self.in_channels, self.embed_dims, kernel_size=1)
+        else:
+            self.input_proj = Conv2d(
+                self.in_channels, self.embed_dims, kernel_size=1)
+
+        cls_branch = []
+        for _ in range(self.num_reg_fcs):
+            cls_branch.append(Linear(self.embed_dims, self.embed_dims))
+            cls_branch.append(nn.LayerNorm(self.embed_dims))
+            cls_branch.append(nn.ReLU(inplace=True))
+        if self.normedlinear:
+            cls_branch.append(
+                NormedLinear(self.embed_dims, self.cls_out_channels))
+        else:
+            cls_branch.append(Linear(self.embed_dims, self.cls_out_channels))
+        fc_cls = nn.Sequential(*cls_branch)
+
+        reg_branch = []
+        for _ in range(self.num_reg_fcs):
+            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
+            reg_branch.append(nn.ReLU())
+        reg_branch.append(Linear(self.embed_dims, self.code_size))
+        reg_branch = nn.Sequential(*reg_branch)
+
+        self.cls_branches = nn.ModuleList(
+            [fc_cls for _ in range(self.num_pred)])
+        self.reg_branches = nn.ModuleList(
+            [reg_branch for _ in range(self.num_pred)])
+
+        if self.with_multiview:
+            self.adapt_pos3d = nn.Sequential(
+                nn.Conv2d(
+                    self.embed_dims * 3 // 2,
+                    self.embed_dims * 4,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+                nn.ReLU(),
+                nn.Conv2d(
+                    self.embed_dims * 4,
+                    self.embed_dims,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+            )
+        else:
+            self.adapt_pos3d = nn.Sequential(
+                nn.Conv2d(
+                    self.embed_dims,
+                    self.embed_dims,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+                nn.ReLU(),
+                nn.Conv2d(
+                    self.embed_dims,
+                    self.embed_dims,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+            )
+
+        if self.with_position:
+            self.position_encoder = nn.Sequential(
+                nn.Conv2d(
+                    self.position_dim,
+                    self.embed_dims * 4,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+                nn.ReLU(),
+                nn.Conv2d(
+                    self.embed_dims * 4,
+                    self.embed_dims,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0),
+            )
+
+        self.reference_points = nn.Embedding(self.num_query, 3)
+        self.query_embedding = nn.Sequential(
+            nn.Linear(self.embed_dims * 3 // 2, self.embed_dims),
+            nn.ReLU(),
+            nn.Linear(self.embed_dims, self.embed_dims),
+        )
+
+    def init_weights(self):
+        """Initialize weights of the transformer head."""
+        # The initialization for transformer is important
+        self.transformer.init_weights()
+        nn.init.uniform_(self.reference_points.weight.data, 0, 1)
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            for m in self.cls_branches:
+                nn.init.constant_(m[-1].bias, bias_init)
+
+    def position_embeding(self, img_feats, img_metas, masks=None):
+        eps = 1e-5
+        pad_h, pad_w = img_metas[0]['pad_shape']
+        B, N, C, H, W = img_feats[self.position_level].shape
+        coords_h = torch.arange(
+            H, device=img_feats[0].device).float() * pad_h / H
+        coords_w = torch.arange(
+            W, device=img_feats[0].device).float() * pad_w / W
+
+        if self.LID:
+            index = torch.arange(
+                start=0,
+                end=self.depth_num,
+                step=1,
+                device=img_feats[0].device).float()
+            index_1 = index + 1
+            bin_size = (self.position_range[3] - self.depth_start) / (
+                self.depth_num * (1 + self.depth_num))
+            coords_d = self.depth_start + bin_size * index * index_1
+        else:
+            index = torch.arange(
+                start=0,
+                end=self.depth_num,
+                step=1,
+                device=img_feats[0].device).float()
+            bin_size = (self.position_range[3] -
+                        self.depth_start) / self.depth_num
+            coords_d = self.depth_start + bin_size * index
+
+        D = coords_d.shape[0]
+        coords = torch.stack(torch.meshgrid([coords_w, coords_h, coords_d
+                                             ])).permute(1, 2, 3,
+                                                         0)  # W, H, D, 3
+        coords = torch.cat((coords, torch.ones_like(coords[..., :1])), -1)
+        coords[..., :2] = coords[..., :2] * torch.maximum(
+            coords[..., 2:3],
+            torch.ones_like(coords[..., 2:3]) * eps)
+
+        img2lidars = []
+        for img_meta in img_metas:
+            img2lidar = []
+            for i in range(len(img_meta['lidar2img'])):
+                img2lidar.append(np.linalg.inv(img_meta['lidar2img'][i]))
+            img2lidars.append(np.asarray(img2lidar))
+        img2lidars = np.asarray(img2lidars)
+        img2lidars = coords.new_tensor(img2lidars)  # (B, N, 4, 4)
+
+        coords = coords.view(1, 1, W, H, D, 4, 1).repeat(B, N, 1, 1, 1, 1, 1)
+        img2lidars = img2lidars.view(B, N, 1, 1, 1, 4,
+                                     4).repeat(1, 1, W, H, D, 1, 1)
+        coords3d = torch.matmul(img2lidars, coords).squeeze(-1)[..., :3]
+        coords3d[..., 0:1] = (coords3d[..., 0:1] - self.position_range[0]) / (
+            self.position_range[3] - self.position_range[0])
+        coords3d[..., 1:2] = (coords3d[..., 1:2] - self.position_range[1]) / (
+            self.position_range[4] - self.position_range[1])
+        coords3d[..., 2:3] = (coords3d[..., 2:3] - self.position_range[2]) / (
+            self.position_range[5] - self.position_range[2])
+
+        coords_mask = (coords3d > 1.0) | (coords3d < 0.0)
+        coords_mask = coords_mask.flatten(-2).sum(-1) > (D * 0.5)
+        coords_mask = masks | coords_mask.permute(0, 1, 3, 2)
+        coords3d = coords3d.permute(0, 1, 4, 5, 3,
+                                    2).contiguous().view(B * N, -1, H, W)
+        coords3d = inverse_sigmoid(coords3d)
+        coords_position_embeding = self.position_encoder(coords3d)
+
+        return coords_position_embeding.view(B, N, self.embed_dims, H,
+                                             W), coords_mask
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        """load checkpoints."""
+        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
+        # since `AnchorFreeHead._load_from_state_dict` should not be
+        # called here. Invoking the default `Module._load_from_state_dict`
+        # is enough.
+
+        # Names of some parameters in has been changed.
+        version = local_metadata.get('version', None)
+        if (version is None or version < 2) and self.__class__ is PETRHead:
+            convert_dict = {
+                '.self_attn.': '.attentions.0.',
+                # '.ffn.': '.ffns.0.',
+                '.multihead_attn.': '.attentions.1.',
+                '.decoder.norm.': '.decoder.post_norm.'
+            }
+            state_dict_keys = list(state_dict.keys())
+            for k in state_dict_keys:
+                for ori_key, convert_key in convert_dict.items():
+                    if ori_key in k:
+                        convert_key = k.replace(ori_key, convert_key)
+                        state_dict[convert_key] = state_dict[k]
+                        del state_dict[k]
+
+        super(AnchorFreeHead,
+              self)._load_from_state_dict(state_dict, prefix, local_metadata,
+                                          strict, missing_keys,
+                                          unexpected_keys, error_msgs)
+
+    def forward(self, mlvl_feats, img_metas):
+        """Forward function.
+
+        Args:
+            mlvl_feats (tuple[Tensor]): Features from the upstream
+                network, each is a 5D-tensor with shape
+                (B, N, C, H, W).
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format \
+                (cx, cy, w, l, cz, h, theta, vx, vy). \
+                Shape [nb_dec, bs, num_query, 9].
+        """
+
+        x = mlvl_feats[0]
+        batch_size, num_cams = x.size(0), x.size(1)
+        input_img_h, input_img_w = img_metas[0]['pad_shape']
+        masks = x.new_ones((batch_size, num_cams, input_img_h, input_img_w))
+        for img_id in range(batch_size):
+            for cam_id in range(num_cams):
+                img_h, img_w = img_metas[img_id]['img_shape'][cam_id]
+                masks[img_id, cam_id, :img_h, :img_w] = 0
+        x = self.input_proj(x.flatten(0, 1))
+        x = x.view(batch_size, num_cams, *x.shape[-3:])
+        # interpolate masks to have the same spatial shape with x
+        masks = F.interpolate(masks, size=x.shape[-2:]).to(torch.bool)
+
+        if self.with_position:
+            coords_position_embeding, _ = self.position_embeding(
+                mlvl_feats, img_metas, masks)
+            pos_embed = coords_position_embeding
+            if self.with_multiview:
+                sin_embed = self.positional_encoding(masks)
+                sin_embed = self.adapt_pos3d(sin_embed.flatten(0, 1)).view(
+                    x.size())
+                pos_embed = pos_embed + sin_embed
+            else:
+                pos_embeds = []
+                for i in range(num_cams):
+                    xy_embed = self.positional_encoding(masks[:, i, :, :])
+                    pos_embeds.append(xy_embed.unsqueeze(1))
+                sin_embed = torch.cat(pos_embeds, 1)
+                sin_embed = self.adapt_pos3d(sin_embed.flatten(0, 1)).view(
+                    x.size())
+                pos_embed = pos_embed + sin_embed
+        else:
+            if self.with_multiview:
+                pos_embed = self.positional_encoding(masks)
+                pos_embed = self.adapt_pos3d(pos_embed.flatten(0, 1)).view(
+                    x.size())
+            else:
+                pos_embeds = []
+                for i in range(num_cams):
+                    pos_embed = self.positional_encoding(masks[:, i, :, :])
+                    pos_embeds.append(pos_embed.unsqueeze(1))
+                pos_embed = torch.cat(pos_embeds, 1)
+
+        reference_points = self.reference_points.weight
+        query_embeds = self.query_embedding(pos2posemb3d(reference_points))
+        reference_points = reference_points.unsqueeze(0).repeat(
+            batch_size, 1, 1)  # .sigmoid()
+
+        outs_dec, _ = self.transformer(x, masks, query_embeds, pos_embed,
+                                       self.reg_branches)
+        outs_dec = torch.nan_to_num(outs_dec)
+        outputs_classes = []
+        outputs_coords = []
+        for lvl in range(outs_dec.shape[0]):
+            reference = inverse_sigmoid(reference_points.clone())
+            assert reference.shape[-1] == 3
+            outputs_class = self.cls_branches[lvl](outs_dec[lvl]).to(
+                torch.float32)
+            tmp = self.reg_branches[lvl](outs_dec[lvl]).to(torch.float32)
+
+            tmp[..., 0:2] += reference[..., 0:2]
+            tmp[..., 0:2] = tmp[..., 0:2].sigmoid()
+            tmp[..., 4:5] += reference[..., 2:3]
+            tmp[..., 4:5] = tmp[..., 4:5].sigmoid()
+
+            outputs_coord = tmp
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+
+        all_cls_scores = torch.stack(outputs_classes)
+        all_bbox_preds = torch.stack(outputs_coords)
+
+        all_bbox_preds[..., 0:1] = (
+            all_bbox_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) +
+            self.pc_range[0])
+        all_bbox_preds[..., 1:2] = (
+            all_bbox_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) +
+            self.pc_range[1])
+        all_bbox_preds[..., 4:5] = (
+            all_bbox_preds[..., 4:5] * (self.pc_range[5] - self.pc_range[2]) +
+            self.pc_range[2])
+
+        outs = {
+            'all_cls_scores': all_cls_scores,
+            'all_bbox_preds': all_bbox_preds,
+            'enc_cls_scores': None,
+            'enc_bbox_preds': None,
+        }
+        return outs
+
+    def _get_target_single(self,
+                           cls_score,
+                           bbox_pred,
+                           gt_labels,
+                           gt_bboxes,
+                           gt_bboxes_ignore=None):
+        """"Compute regression and classification targets for one image.
+        Outputs from a single decoder layer of a single feature level are used.
+        Args:
+            cls_score (Tensor): Box score logits from a single decoder layer
+                for one image. Shape [num_query, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
+                for one image, with normalized coordinate (cx, cy, w, h) and
+                shape [num_query, 4].
+            gt_bboxes (Tensor): Ground truth bboxes for one image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (Tensor): Ground truth class indices for one image
+                with shape (num_gts, ).
+            gt_bboxes_ignore (Tensor, optional): Bounding boxes
+                which can be ignored. Default None.
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+                - labels (Tensor): Labels of each image.
+                - label_weights (Tensor]): Label weights of each image.
+                - bbox_targets (Tensor): BBox targets of each image.
+                - bbox_weights (Tensor): BBox weights of each image.
+                - pos_inds (Tensor): Sampled positive indices for each image.
+                - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+
+        num_bboxes = bbox_pred.size(0)
+        # assigner and sampler
+        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
+                                             gt_labels, gt_bboxes_ignore)
+        pred_instance_3d = InstanceData(priors=bbox_pred)
+        gt_instances_3d = InstanceData(bboxes_3d=gt_bboxes)
+        sampling_result = self.sampler.sample(assign_result, pred_instance_3d,
+                                              gt_instances_3d)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label targets
+        labels = gt_bboxes.new_full((num_bboxes, ),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_bboxes)
+
+        # bbox targets
+        code_size = gt_bboxes.size(1)
+        bbox_targets = torch.zeros_like(bbox_pred)[..., :code_size]
+        bbox_weights = torch.zeros_like(bbox_pred)
+        bbox_weights[pos_inds] = 1.0
+        # DETR
+        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                neg_inds)
+
+    def get_targets(self,
+                    cls_scores_list,
+                    bbox_preds_list,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    gt_bboxes_ignore_list=None):
+        """"Compute regression and classification targets for a batch image.
+        Outputs from a single decoder layer of a single feature level are used.
+        Args:
+            cls_scores_list (list[Tensor]): Box score logits from a single
+                decoder layer for each image with shape [num_query,
+                cls_out_channels].
+            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
+                decoder layer for each image, with normalized coordinate
+                (cx, cy, w, h) and shape [num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            tuple: a tuple containing the following targets.
+                - labels_list (list[Tensor]): Labels for all images.
+                - label_weights_list (list[Tensor]): Label weights for all \
+                    images.
+                - bbox_targets_list (list[Tensor]): BBox targets for all \
+                    images.
+                - bbox_weights_list (list[Tensor]): BBox weights for all \
+                    images.
+                - num_total_pos (int): Number of positive samples in all \
+                    images.
+                - num_total_neg (int): Number of negative samples in all \
+                    images.
+        """
+        assert gt_bboxes_ignore_list is None, \
+            'Only supports for gt_bboxes_ignore setting to None.'
+        num_imgs = len(cls_scores_list)
+        gt_bboxes_ignore_list = [
+            gt_bboxes_ignore_list for _ in range(num_imgs)
+        ]
+        gt_labels_list = gt_labels_list[0]
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         pos_inds_list,
+         neg_inds_list) = multi_apply(self._get_target_single, cls_scores_list,
+                                      bbox_preds_list, gt_labels_list,
+                                      gt_bboxes_list, gt_bboxes_ignore_list)
+        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, num_total_pos, num_total_neg)
+
+    def loss_by_feat_single(self,
+                            cls_scores,
+                            bbox_preds,
+                            gt_bboxes_list,
+                            gt_labels_list,
+                            gt_bboxes_ignore_list=None):
+        """"Loss function for outputs from a single decoder layer of a single
+        feature level.
+
+        Args:
+            cls_scores (Tensor): Box score logits from a single decoder layer
+                for all images. Shape [bs, num_query, cls_out_channels].
+            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+                for all images, with normalized coordinate (cx, cy, w, h) and
+                shape [bs, num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in
+                [tl_x, tl_y, br_x,loss_by_feat_single br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components for outputs
+                from a single decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
+        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
+                                           gt_bboxes_list, gt_labels_list,
+                                           gt_bboxes_ignore_list)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # classification loss
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        # if self.sync_cls_avg_factor:
+        #     cls_avg_factor = reduce_mean(
+        #         cls_scores.new_tensor([cls_avg_factor]))
+
+        cls_avg_factor = max(cls_avg_factor, 1)
+        loss_cls = self.loss_cls(
+            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes across all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        # num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+        num_total_pos = torch.clamp(num_total_pos, min=1).item()
+
+        # regression L1 loss
+        bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1))
+        normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
+        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
+        bbox_weights = bbox_weights * self.code_weights
+
+        loss_bbox = self.loss_bbox(
+            bbox_preds[isnotnan, :10],
+            normalized_bbox_targets[isnotnan, :10],
+            bbox_weights[isnotnan, :10],
+            avg_factor=num_total_pos)
+
+        loss_cls = torch.nan_to_num(loss_cls)
+        loss_bbox = torch.nan_to_num(loss_bbox)
+        return loss_cls, loss_bbox
+
+    def loss_by_feat(self,
+                     gt_bboxes_list,
+                     gt_labels_list,
+                     preds_dicts,
+                     gt_bboxes_ignore=None):
+        """"Loss function.
+        Args:
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            preds_dicts:
+                all_cls_scores (Tensor): Classification score of all
+                    decoder layers, has shape
+                    [nb_dec, bs, num_query, cls_out_channels].
+                all_bbox_preds (Tensor): Sigmoid regression
+                    outputs of all decode layers. Each is a 4D-tensor with
+                    normalized coordinate format (cx, cy, w, h) and shape
+                    [nb_dec, bs, num_query, 4].
+                enc_cls_scores (Tensor): Classification scores of
+                    points on encode feature map , has shape
+                    (N, h*w, num_classes). Only be passed when as_two_stage is
+                    True, otherwise is None.
+                enc_bbox_preds (Tensor): Regression results of each points
+                    on the encode feature map, has shape (N, h*w, 4). Only be
+                    passed when as_two_stage is True, otherwise is None.
+            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
+                which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert gt_bboxes_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            f'for gt_bboxes_ignore setting to None.'
+
+        all_cls_scores = preds_dicts['all_cls_scores']
+        all_bbox_preds = preds_dicts['all_bbox_preds']
+        enc_cls_scores = preds_dicts['enc_cls_scores']
+        enc_bbox_preds = preds_dicts['enc_bbox_preds']
+
+        num_dec_layers = len(all_cls_scores)
+        device = gt_labels_list[0].device
+
+        gt_bboxes_list = [
+            torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
+                      dim=1).to(device) for gt_bboxes in gt_bboxes_list
+        ]
+
+        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
+        all_gt_labels_list = [[gt_labels_list] for _ in range(num_dec_layers)]
+        all_gt_bboxes_ignore_list = [
+            gt_bboxes_ignore for _ in range(num_dec_layers)
+        ]
+
+        losses_cls, losses_bbox = multi_apply(self.loss_by_feat_single,
+                                              all_cls_scores, all_bbox_preds,
+                                              all_gt_bboxes_list,
+                                              all_gt_labels_list,
+                                              all_gt_bboxes_ignore_list)
+
+        loss_dict = dict()
+        # loss of proposal generated from encode feature map.
+        if enc_cls_scores is not None:
+            binary_labels_list = [
+                torch.zeros_like(gt_labels_list[i])
+                for i in range(len(all_gt_labels_list))
+            ]
+            enc_loss_cls, enc_losses_bbox = \
+                self.loss_single(enc_cls_scores, enc_bbox_preds,
+                                 gt_bboxes_list, binary_labels_list,
+                                 gt_bboxes_ignore)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_bbox'] = losses_bbox[-1]
+
+        # loss from other decoder layers
+        num_dec_layer = 0
+        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+            num_dec_layer += 1
+        return loss_dict
+
+    def get_bboxes(self, preds_dicts, img_metas, rescale=False):
+        """Generate bboxes from bbox head predictions.
+
+        Args:
+            preds_dicts (tuple[list[dict]]): Prediction results.
+            img_metas (list[dict]): Point cloud and image's meta info.
+        Returns:
+            list[dict]: Decoded bbox, scores and labels after nms.
+        """
+        preds_dicts = self.bbox_coder.decode(preds_dicts)
+        num_samples = len(preds_dicts)
+
+        ret_list = []
+        for i in range(num_samples):
+            preds = preds_dicts[i]
+            bboxes = preds['bboxes']
+            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+            bboxes = img_metas[i]['box_type_3d'](bboxes, bboxes.size(-1))
+            scores = preds['scores']
+            labels = preds['labels']
+            ret_list.append([bboxes, scores, labels])
+        return ret_list
diff --git a/mmde/projects/PETR/petr/petr_transformer.py b/mmde/projects/PETR/petr/petr_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbb4cc332bc6e7aff1750c49ab843b1e0a929b27
--- /dev/null
+++ b/mmde/projects/PETR/petr/petr_transformer.py
@@ -0,0 +1,540 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
+                                         TransformerLayerSequence)
+from mmengine.model import BaseModule
+from mmengine.model.weight_init import xavier_init
+
+# from mmcv.utils import deprecated_api_warning
+from mmdet3d.registry import MODELS, TASK_UTILS
+
+
+@MODELS.register_module()
+class PETRTransformer(BaseModule):
+    """Implements the DETR transformer. Following the official DETR
+    implementation, this module copy-paste from torch.nn.Transformer with
+    modifications:
+
+        * positional encodings are passed in MultiheadAttention
+        * extra LN at the end of encoder is removed
+        * decoder returns a stack of activations from all decoding layers
+    See `paper: End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+    Args:
+        encoder (`mmcv.ConfigDict` | Dict): Config of
+            TransformerEncoder. Defaults to None.
+        decoder ((`mmcv.ConfigDict` | Dict)): Config of
+            TransformerDecoder. Defaults to None
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False):
+        super(PETRTransformer, self).__init__(init_cfg=init_cfg)
+        if encoder is not None:
+            self.encoder = MODELS.build(encoder)
+        else:
+            self.encoder = None
+        self.decoder = MODELS.build(decoder)
+        self.embed_dims = self.decoder.embed_dims
+        self.cross = cross
+
+    def init_weights(self):
+        # follow the official DETR to init parameters
+        for m in self.modules():
+            if hasattr(m, 'weight') and m.weight.dim() > 1:
+                xavier_init(m, distribution='uniform')
+        self._is_init = True
+
+    def forward(self, x, mask, query_embed, pos_embed, reg_branch=None):
+        """Forward function for `Transformer`.
+        Args:
+            x (Tensor): Input query with shape [bs, c, h, w] where
+                c = embed_dims.
+            mask (Tensor): The key_padding_mask used for encoder and decoder,
+                with shape [bs, h, w].
+            query_embed (Tensor): The query embedding for decoder, with shape
+                [num_query, c].
+            pos_embed (Tensor): The positional encoding for encoder and
+                decoder, with the same shape as `x`.
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+                - out_dec: Output from decoder. If return_intermediate_dec \
+                      is True output has shape [num_dec_layers, bs,
+                      num_query, embed_dims], else has shape [1, bs, \
+                      num_query, embed_dims].
+                - memory: Output results from encoder, with shape \
+                      [bs, embed_dims, h, w].
+        """
+        bs, n, c, h, w = x.shape
+        memory = x.permute(1, 3, 4, 0,
+                           2).reshape(-1, bs,
+                                      c)  # [bs, n, c, h, w] -> [n*h*w, bs, c]
+        pos_embed = pos_embed.permute(1, 3, 4, 0, 2).reshape(
+            -1, bs, c)  # [bs, n, c, h, w] -> [n*h*w, bs, c]
+        query_embed = query_embed.unsqueeze(1).repeat(
+            1, bs, 1)  # [num_query, dim] -> [num_query, bs, dim]
+        mask = mask.view(bs, -1)  # [bs, n, h, w] -> [bs, n*h*w]
+        target = torch.zeros_like(query_embed)
+
+        # out_dec: [num_layers, num_query, bs, dim]
+        out_dec = self.decoder(
+            query=target,
+            key=memory,
+            value=memory,
+            key_pos=pos_embed,
+            query_pos=query_embed,
+            key_padding_mask=mask,
+            reg_branch=reg_branch,
+        )
+        out_dec = out_dec.transpose(1, 2)
+        memory = memory.reshape(n, h, w, bs, c).permute(3, 0, 4, 1, 2)
+        return out_dec, memory
+
+
+@MODELS.register_module()
+class PETRDNTransformer(BaseModule):
+    """Implements the DETR transformer. Following the official DETR
+    implementation, this module copy-paste from torch.nn.Transformer with
+    modifications:
+
+        * positional encodings are passed in MultiheadAttention
+        * extra LN at the end of encoder is removed
+        * decoder returns a stack of activations from all decoding layers
+    See `paper: End-to-End Object Detection with Transformers
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+    Args:
+        encoder (`mmcv.ConfigDict` | Dict): Config of
+            TransformerEncoder. Defaults to None.
+        decoder ((`mmcv.ConfigDict` | Dict)): Config of
+            TransformerDecoder. Defaults to None
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False):
+        super(PETRDNTransformer, self).__init__(init_cfg=init_cfg)
+        if encoder is not None:
+            self.encoder = MODELS.build(encoder)
+        else:
+            self.encoder = None
+        self.decoder = MODELS.build(decoder)
+        self.embed_dims = self.decoder.embed_dims
+        self.cross = cross
+
+    def init_weights(self):
+        # follow the official DETR to init parameters
+        for m in self.modules():
+            if hasattr(m, 'weight') and m.weight.dim() > 1:
+                xavier_init(m, distribution='uniform')
+        self._is_init = True
+
+    def forward(self,
+                x,
+                mask,
+                query_embed,
+                pos_embed,
+                attn_masks=None,
+                reg_branch=None):
+        """Forward function for `Transformer`.
+        Args:
+            x (Tensor): Input query with shape [bs, c, h, w] where
+                c = embed_dims.
+            mask (Tensor): The key_padding_mask used for encoder and decoder,
+                with shape [bs, h, w].
+            query_embed (Tensor): The query embedding for decoder, with shape
+                [num_query, c].
+            pos_embed (Tensor): The positional encoding for encoder and
+                decoder, with the same shape as `x`.
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+                - out_dec: Output from decoder. If return_intermediate_dec \
+                      is True output has shape [num_dec_layers, bs,
+                      num_query, embed_dims], else has shape [1, bs, \
+                      num_query, embed_dims].
+                - memory: Output results from encoder, with shape \
+                      [bs, embed_dims, h, w].
+        """
+        bs, n, c, h, w = x.shape
+        memory = x.permute(1, 3, 4, 0,
+                           2).reshape(-1, bs,
+                                      c)  # [bs, n, c, h, w] -> [n*h*w, bs, c]
+        pos_embed = pos_embed.permute(1, 3, 4, 0, 2).reshape(
+            -1, bs, c)  # [bs, n, c, h, w] -> [n*h*w, bs, c]
+        query_embed = query_embed.transpose(
+            0, 1)  # [num_query, dim] -> [num_query, bs, dim]
+        mask = mask.view(bs, -1)  # [bs, n, h, w] -> [bs, n*h*w]
+        target = torch.zeros_like(query_embed)
+        # out_dec: [num_layers, num_query, bs, dim]
+        out_dec = self.decoder(
+            query=target,
+            key=memory,
+            value=memory,
+            key_pos=pos_embed,
+            query_pos=query_embed,
+            key_padding_mask=mask,
+            attn_masks=[attn_masks, None],
+            reg_branch=reg_branch,
+        )
+        out_dec = out_dec.transpose(1, 2)
+        memory = memory.reshape(n, h, w, bs, c).permute(3, 0, 4, 1, 2)
+        return out_dec, memory
+
+
+@MODELS.register_module()
+class PETRTransformerDecoderLayer(BaseTransformerLayer):
+    """Implements decoder layer in DETR transformer.
+
+    Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )):
+            Configs for self_attention or cross_attention, the order
+            should be consistent with it in `operation_order`. If it is
+            a dict, it would be expand to the number of attention in
+            `operation_order`.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        ffn_dropout (float): Probability of an element to be zeroed
+            in ffn. Default 0.0.
+        operation_order (tuple[str]): The execution order of operation
+            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
+            Default：None
+        act_cfg (dict): The activation config for FFNs. Default: `LN`
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: `LN`.
+        ffn_num_fcs (int): The number of fully-connected layers in FFNs.
+            Default：2.
+    """
+
+    def __init__(self,
+                 attn_cfgs,
+                 feedforward_channels,
+                 ffn_dropout=0.0,
+                 operation_order=None,
+                 act_cfg=dict(type='ReLU', inplace=True),
+                 norm_cfg=dict(type='LN'),
+                 ffn_num_fcs=2,
+                 with_cp=True,
+                 **kwargs):
+        super(PETRTransformerDecoderLayer, self).__init__(
+            attn_cfgs=attn_cfgs,
+            feedforward_channels=feedforward_channels,
+            ffn_dropout=ffn_dropout,
+            operation_order=operation_order,
+            act_cfg=act_cfg,
+            norm_cfg=norm_cfg,
+            ffn_num_fcs=ffn_num_fcs,
+            **kwargs)
+        assert len(operation_order) == 6
+        assert set(operation_order) == set(
+            ['self_attn', 'norm', 'cross_attn', 'ffn'])
+        self.use_checkpoint = with_cp
+
+    def _forward(
+        self,
+        query,
+        key=None,
+        value=None,
+        query_pos=None,
+        key_pos=None,
+        attn_masks=None,
+        query_key_padding_mask=None,
+        key_padding_mask=None,
+    ):
+        """Forward function for `TransformerCoder`.
+
+        Returns:
+            Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+        x = super(PETRTransformerDecoderLayer, self).forward(
+            query,
+            key=key,
+            value=value,
+            query_pos=query_pos,
+            key_pos=key_pos,
+            attn_masks=attn_masks,
+            query_key_padding_mask=query_key_padding_mask,
+            key_padding_mask=key_padding_mask,
+        )
+
+        return x
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                query_pos=None,
+                key_pos=None,
+                attn_masks=None,
+                query_key_padding_mask=None,
+                key_padding_mask=None,
+                **kwargs):
+        """Forward function for `TransformerCoder`.
+
+        Returns:
+            Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+
+        if self.use_checkpoint and self.training:
+            x = cp.checkpoint(
+                self._forward,
+                query,
+                key,
+                value,
+                query_pos,
+                key_pos,
+                attn_masks,
+                query_key_padding_mask,
+                key_padding_mask,
+            )
+        else:
+            x = self._forward(
+                query,
+                key=key,
+                value=value,
+                query_pos=query_pos,
+                key_pos=key_pos,
+                attn_masks=attn_masks,
+                query_key_padding_mask=query_key_padding_mask,
+                key_padding_mask=key_padding_mask)
+        return x
+
+
+@MODELS.register_module()
+class PETRMultiheadAttention(BaseModule):
+    """A wrapper for ``torch.nn.MultiheadAttention``.
+
+    This module implements MultiheadAttention with identity connection,
+    and positional encoding  is also passed as input.
+    Args:
+        embed_dims (int): The embedding dimension.
+        num_heads (int): Parallel attention heads.
+        attn_drop (float): A Dropout layer on attn_output_weights.
+            Default: 0.0.
+        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
+            Default: 0.0.
+        dropout_layer (obj:`ConfigDict`): The dropout_layer used
+            when adding the shortcut.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        batch_first (bool): When it is True,  Key, Query and Value are shape of
+            (batch, n, embed_dim), otherwise (n, batch, embed_dim).
+             Default to False.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 dropout_layer=dict(type='Dropout', drop_prob=0.),
+                 init_cfg=None,
+                 batch_first=False,
+                 **kwargs):
+        super(PETRMultiheadAttention, self).__init__(init_cfg)
+        if 'dropout' in kwargs:
+            warnings.warn(
+                'The arguments `dropout` in MultiheadAttention '
+                'has been deprecated, now you can separately '
+                'set `attn_drop`(float), proj_drop(float), '
+                'and `dropout_layer`(dict) ', DeprecationWarning)
+            attn_drop = kwargs['dropout']
+            dropout_layer['drop_prob'] = kwargs.pop('dropout')
+
+        self.embed_dims = embed_dims
+        self.num_heads = num_heads
+        self.batch_first = batch_first
+
+        self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
+                                          **kwargs)
+
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.dropout_layer = MODELS.build(
+            dropout_layer) if dropout_layer else nn.Identity()
+
+    # @deprecated_api_warning({'residual': 'identity'},
+    #                         cls_name='MultiheadAttention')
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                identity=None,
+                query_pos=None,
+                key_pos=None,
+                attn_mask=None,
+                key_padding_mask=None,
+                **kwargs):
+        """Forward function for `MultiheadAttention`.
+
+        **kwargs allow passing a more general data flow when combining
+        with other operations in `transformerlayer`.
+        Args:
+            query (Tensor): The input query with shape [num_queries, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_queries embed_dims].
+            key (Tensor): The key tensor with shape [num_keys, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_keys, embed_dims] .
+                If None, the ``query`` will be used. Defaults to None.
+            value (Tensor): The value tensor with same shape as `key`.
+                Same in `nn.MultiheadAttention.forward`. Defaults to None.
+                If None, the `key` will be used.
+            identity (Tensor): This tensor, with the same shape as x,
+                will be used for the identity link.
+                If None, `x` will be used. Defaults to None.
+            query_pos (Tensor): The positional encoding for query, with
+                the same shape as `x`. If not None, it will
+                be added to `x` before forward function. Defaults to None.
+            key_pos (Tensor): The positional encoding for `key`, with the
+                same shape as `key`. Defaults to None. If not None, it will
+                be added to `key` before forward function. If None, and
+                `query_pos` has the same shape as `key`, then `query_pos`
+                will be used for `key_pos`. Defaults to None.
+            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
+                num_keys]. Same in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+                Defaults to None.
+        Returns:
+            Tensor: forwarded results with shape
+            [num_queries, bs, embed_dims]
+            if self.batch_first is False, else
+            [bs, num_queries embed_dims].
+        """
+
+        if key is None:
+            key = query
+        if value is None:
+            value = key
+        if identity is None:
+            identity = query
+        if key_pos is None:
+            if query_pos is not None:
+                # use query_pos if key_pos is not available
+                if query_pos.shape == key.shape:
+                    key_pos = query_pos
+                else:
+                    warnings.warn(f'position encoding of key is'
+                                  f'missing in {self.__class__.__name__}.')
+        if query_pos is not None:
+            query = query + query_pos
+        if key_pos is not None:
+            key = key + key_pos
+
+        # Because the dataflow('key', 'query', 'value') of
+        # ``torch.nn.MultiheadAttention`` is (num_query, batch,
+        # embed_dims), We should adjust the shape of dataflow from
+        # batch_first (batch, num_query, embed_dims) to num_query_first
+        # (num_query ,batch, embed_dims), and recover ``attn_output``
+        # from num_query_first to batch_first.
+        if self.batch_first:
+            query = query.transpose(0, 1)
+            key = key.transpose(0, 1)
+            value = value.transpose(0, 1)
+
+        out = self.attn(
+            query=query,
+            key=key,
+            value=value,
+            attn_mask=attn_mask,
+            key_padding_mask=key_padding_mask)[0]
+
+        if self.batch_first:
+            out = out.transpose(0, 1)
+
+        return identity + self.dropout_layer(self.proj_drop(out))
+
+
+@MODELS.register_module()
+class PETRTransformerEncoder(TransformerLayerSequence):
+    """TransformerEncoder of DETR.
+
+    Args:
+        post_norm_cfg (dict): Config of last normalization layer. Default：
+            `LN`. Only used when `self.pre_norm` is `True`
+    """
+
+    def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs):
+        super(PETRTransformerEncoder, self).__init__(*args, **kwargs)
+        if post_norm_cfg is not None:
+            self.post_norm = TASK_UTILS.build(
+                post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None
+        else:
+            assert not self.pre_norm, f'Use prenorm in ' \
+                                      f'{self.__class__.__name__},' \
+                                      f'Please specify post_norm_cfg'
+            self.post_norm = None
+
+    def forward(self, *args, **kwargs):
+        """Forward function for `TransformerCoder`.
+
+        Returns:
+            Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+        x = super(PETRTransformerEncoder, self).forward(*args, **kwargs)
+        if self.post_norm is not None:
+            x = self.post_norm(x)
+        return x
+
+
+@MODELS.register_module()
+class PETRTransformerDecoder(TransformerLayerSequence):
+    """Implements the decoder in DETR transformer.
+
+    Args:
+        return_intermediate (bool): Whether to return intermediate outputs.
+        post_norm_cfg (dict): Config of last normalization layer. Default：
+            `LN`.
+    """
+
+    def __init__(self,
+                 *args,
+                 post_norm_cfg=dict(type='LN'),
+                 return_intermediate=False,
+                 **kwargs):
+
+        super(PETRTransformerDecoder, self).__init__(*args, **kwargs)
+        self.return_intermediate = return_intermediate
+        if post_norm_cfg is not None:
+            self.post_norm = build_norm_layer(post_norm_cfg,
+                                              self.embed_dims)[1]
+        else:
+            self.post_norm = None
+
+    def forward(self, query, *args, **kwargs):
+        """Forward function for `TransformerDecoder`.
+        Args:
+            query (Tensor): Input query with shape
+                `(num_query, bs, embed_dims)`.
+        Returns:
+            Tensor: Results with shape [1, num_query, bs, embed_dims] when
+                return_intermediate is `False`, otherwise it has shape
+                [num_layers, num_query, bs, embed_dims].
+        """
+        if not self.return_intermediate:
+            x = super().forward(query, *args, **kwargs)
+            if self.post_norm:
+                x = self.post_norm(x)[None]
+            return x
+
+        intermediate = []
+        for layer in self.layers:
+            query = layer(query, *args, **kwargs)
+            if self.return_intermediate:
+                if self.post_norm is not None:
+                    intermediate.append(self.post_norm(query))
+                else:
+                    intermediate.append(query)
+        return torch.stack(intermediate)
diff --git a/mmde/projects/PETR/petr/positional_encoding.py b/mmde/projects/PETR/petr/positional_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fb0a007aa1b57eb6dd4e2f5c780b24f59aa9db7
--- /dev/null
+++ b/mmde/projects/PETR/petr/positional_encoding.py
@@ -0,0 +1,171 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from mmdetection (https://github.com/open-mmlab/mmdetection)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+import math
+
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS, TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class SinePositionalEncoding3D(BaseModule):
+    """Position encoding with sine and cosine functions. See `End-to-End Object
+    Detection with Transformers.
+
+    <https://arxiv.org/pdf/2005.12872>`_ for details.
+    Args:
+        num_feats (int): The feature dimension for each position
+            along x-axis or y-axis. Note the final returned dimension
+            for each position is 2 times of this value.
+        temperature (int, optional): The temperature used for scaling
+            the position embedding. Defaults to 10000.
+        normalize (bool, optional): Whether to normalize the position
+            embedding. Defaults to False.
+        scale (float, optional): A scale factor that scales the position
+            embedding. The scale will be used only when `normalize` is True.
+            Defaults to 2*pi.
+        eps (float, optional): A value added to the denominator for
+            numerical stability. Defaults to 1e-6.
+        offset (float): offset add to embed when do the normalization.
+            Defaults to 0.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 num_feats,
+                 temperature=10000,
+                 normalize=False,
+                 scale=2 * math.pi,
+                 eps=1e-6,
+                 offset=0.,
+                 init_cfg=None):
+        super(SinePositionalEncoding3D, self).__init__(init_cfg)
+        if normalize:
+            assert isinstance(scale, (float, int)), 'when normalize is set,' \
+                'scale should be provided and in float or int type, ' \
+                f'found {type(scale)}'
+        self.num_feats = num_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        self.scale = scale
+        self.eps = eps
+        self.offset = offset
+
+    def forward(self, mask):
+        """Forward function for `SinePositionalEncoding`.
+        Args:
+            mask (Tensor): ByteTensor mask. Non-zero values representing
+                ignored positions, while zero values means valid positions
+                for this image. Shape [bs, h, w].
+        Returns:
+            pos (Tensor): Returned position embedding with shape
+                [bs, num_feats*2, h, w].
+        """
+        # For convenience of exporting to ONNX, it's required to convert
+        # `masks` from bool to int.
+        mask = mask.to(torch.int)
+        not_mask = 1 - mask  # logical_not
+        n_embed = not_mask.cumsum(1, dtype=torch.float32)
+        y_embed = not_mask.cumsum(2, dtype=torch.float32)
+        x_embed = not_mask.cumsum(3, dtype=torch.float32)
+        if self.normalize:
+            n_embed = (n_embed + self.offset) / \
+                      (n_embed[:, -1:, :, :] + self.eps) * self.scale
+            y_embed = (y_embed + self.offset) / \
+                      (y_embed[:, :, -1:, :] + self.eps) * self.scale
+            x_embed = (x_embed + self.offset) / \
+                      (x_embed[:, :, :, -1:] + self.eps) * self.scale
+        dim_t = torch.arange(
+            self.num_feats, dtype=torch.float32, device=mask.device)
+        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats)
+        pos_n = n_embed[:, :, :, :, None] / dim_t
+        pos_x = x_embed[:, :, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, :, None] / dim_t
+        # use `view` instead of `flatten` for dynamically exporting to ONNX
+        B, N, H, W = mask.size()
+        pos_n = torch.stack(
+            (pos_n[:, :, :, :, 0::2].sin(), pos_n[:, :, :, :, 1::2].cos()),
+            dim=4).view(B, N, H, W, -1)
+        pos_x = torch.stack(
+            (pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()),
+            dim=4).view(B, N, H, W, -1)
+        pos_y = torch.stack(
+            (pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()),
+            dim=4).view(B, N, H, W, -1)
+        pos = torch.cat((pos_n, pos_y, pos_x), dim=4).permute(0, 1, 4, 2, 3)
+        return pos
+
+    def __repr__(self):
+        """str: a string that describes the module"""
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_feats={self.num_feats}, '
+        repr_str += f'temperature={self.temperature}, '
+        repr_str += f'normalize={self.normalize}, '
+        repr_str += f'scale={self.scale}, '
+        repr_str += f'eps={self.eps})'
+        return repr_str
+
+
+@MODELS.register_module()
+class LearnedPositionalEncoding3D(BaseModule):
+    """Position embedding with learnable embedding weights.
+
+    Args:
+        num_feats (int): The feature dimension for each position
+            along x-axis or y-axis. The final returned dimension for
+            each position is 2 times of this value.
+        row_num_embed (int, optional): The dictionary size of row embeddings.
+            Default 50.
+        col_num_embed (int, optional): The dictionary size of col embeddings.
+            Default 50.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_feats,
+                 row_num_embed=50,
+                 col_num_embed=50,
+                 init_cfg=dict(type='Uniform', layer='Embedding')):
+        super(LearnedPositionalEncoding3D, self).__init__(init_cfg)
+        self.row_embed = nn.Embedding(row_num_embed, num_feats)
+        self.col_embed = nn.Embedding(col_num_embed, num_feats)
+        self.num_feats = num_feats
+        self.row_num_embed = row_num_embed
+        self.col_num_embed = col_num_embed
+
+    def forward(self, mask):
+        """Forward function for `LearnedPositionalEncoding`.
+        Args:
+            mask (Tensor): ByteTensor mask. Non-zero values representing
+                ignored positions, while zero values means valid positions
+                for this image. Shape [bs, h, w].
+        Returns:
+            pos (Tensor): Returned position embedding with shape
+                [bs, num_feats*2, h, w].
+        """
+        h, w = mask.shape[-2:]
+        x = torch.arange(w, device=mask.device)
+        y = torch.arange(h, device=mask.device)
+        x_embed = self.col_embed(x)
+        y_embed = self.row_embed(y)
+        pos = torch.cat(
+            (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat(
+                1, w, 1)),
+            dim=-1).permute(2, 0,
+                            1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1)
+        return pos
+
+    def __repr__(self):
+        """str: a string that describes the module"""
+        repr_str = self.__class__.__name__
+        repr_str += f'(num_feats={self.num_feats}, '
+        repr_str += f'row_num_embed={self.row_num_embed}, '
+        repr_str += f'col_num_embed={self.col_num_embed})'
+        return repr_str
diff --git a/mmde/projects/PETR/petr/transforms_3d.py b/mmde/projects/PETR/petr/transforms_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa43f4950bb4c7a914936872357bbac66bc3a9dc
--- /dev/null
+++ b/mmde/projects/PETR/petr/transforms_3d.py
@@ -0,0 +1,209 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmcv.transforms import BaseTransform
+from PIL import Image
+
+from mmdet3d.registry import TRANSFORMS
+from mmdet3d.structures.bbox_3d import LiDARInstance3DBoxes
+
+
+@TRANSFORMS.register_module()
+class ResizeCropFlipImage(BaseTransform):
+    """Random resize, Crop and flip the image
+    Args:
+        size (tuple, optional): Fixed padding size.
+    """
+
+    def __init__(self, data_aug_conf=None, training=True):
+        self.data_aug_conf = data_aug_conf
+        self.training = training
+
+    def transform(self, results):
+        """Call function to pad images, masks, semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Updated result dict.
+        """
+
+        imgs = results['img']
+        N = len(imgs)
+        new_imgs = []
+        resize, resize_dims, crop, flip, rotate = self._sample_augmentation()
+        results['lidar2cam'] = np.array(results['lidar2cam'])
+        for i in range(N):
+            intrinsic = np.array(results['cam2img'][i])
+            viewpad = np.eye(4)
+            viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+            results['cam2img'][i] = viewpad
+            img = Image.fromarray(np.uint8(imgs[i]))
+            # augmentation (resize, crop, horizontal flip, rotate)
+            # different view use different aug (BEV Det)
+            img, ida_mat = self._img_transform(
+                img,
+                resize=resize,
+                resize_dims=resize_dims,
+                crop=crop,
+                flip=flip,
+                rotate=rotate,
+            )
+            new_imgs.append(np.array(img).astype(np.float32))
+            results['cam2img'][
+                i][:3, :3] = ida_mat @ results['cam2img'][i][:3, :3]
+
+        results['img'] = new_imgs
+
+        return results
+
+    def _get_rot(self, h):
+
+        return torch.Tensor([
+            [np.cos(h), np.sin(h)],
+            [-np.sin(h), np.cos(h)],
+        ])
+
+    def _img_transform(self, img, resize, resize_dims, crop, flip, rotate):
+        ida_rot = torch.eye(2)
+        ida_tran = torch.zeros(2)
+        # adjust image
+        img = img.resize(resize_dims)
+        img = img.crop(crop)
+        if flip:
+            img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
+        img = img.rotate(rotate)
+
+        # post-homography transformation
+        ida_rot *= resize
+        ida_tran -= torch.Tensor(crop[:2])
+        if flip:
+            A = torch.Tensor([[-1, 0], [0, 1]])
+            b = torch.Tensor([crop[2] - crop[0], 0])
+            ida_rot = A.matmul(ida_rot)
+            ida_tran = A.matmul(ida_tran) + b
+        A = self._get_rot(rotate / 180 * np.pi)
+        b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2
+        b = A.matmul(-b) + b
+        ida_rot = A.matmul(ida_rot)
+        ida_tran = A.matmul(ida_tran) + b
+        ida_mat = torch.eye(3)
+        ida_mat[:2, :2] = ida_rot
+        ida_mat[:2, 2] = ida_tran
+        return img, ida_mat
+
+    def _sample_augmentation(self):
+        H, W = self.data_aug_conf['H'], self.data_aug_conf['W']
+        fH, fW = self.data_aug_conf['final_dim']
+        if self.training:
+            resize = np.random.uniform(*self.data_aug_conf['resize_lim'])
+            resize_dims = (int(W * resize), int(H * resize))
+            newW, newH = resize_dims
+            crop_h = int(
+                (1 - np.random.uniform(*self.data_aug_conf['bot_pct_lim'])) *
+                newH) - fH
+            crop_w = int(np.random.uniform(0, max(0, newW - fW)))
+            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
+            flip = False
+            if self.data_aug_conf['rand_flip'] and np.random.choice([0, 1]):
+                flip = True
+            rotate = np.random.uniform(*self.data_aug_conf['rot_lim'])
+        else:
+            resize = max(fH / H, fW / W)
+            resize_dims = (int(W * resize), int(H * resize))
+            newW, newH = resize_dims
+            crop_h = int(
+                (1 - np.mean(self.data_aug_conf['bot_pct_lim'])) * newH) - fH
+            crop_w = int(max(0, newW - fW) / 2)
+            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
+            flip = False
+            rotate = 0
+        return resize, resize_dims, crop, flip, rotate
+
+
+@TRANSFORMS.register_module()
+class GlobalRotScaleTransImage(BaseTransform):
+    """Random resize, Crop and flip the image
+    Args:
+        size (tuple, optional): Fixed padding size.
+    """
+
+    def __init__(
+        self,
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0],
+        reverse_angle=False,
+        training=True,
+    ):
+
+        self.rot_range = rot_range
+        self.scale_ratio_range = scale_ratio_range
+        self.translation_std = translation_std
+
+        self.reverse_angle = reverse_angle
+        self.training = training
+
+    def transform(self, results):
+        """Call function to pad images, masks, semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Updated result dict.
+        """
+        # random rotate
+        rot_angle = np.random.uniform(*self.rot_range)
+
+        self.rotate_bev_along_z(results, rot_angle)
+        if self.reverse_angle:
+            rot_angle *= -1
+        results['gt_bboxes_3d'].rotate(np.array(rot_angle))
+
+        # random scale
+        scale_ratio = np.random.uniform(*self.scale_ratio_range)
+        self.scale_xyz(results, scale_ratio)
+        results['gt_bboxes_3d'].scale(scale_ratio)
+
+        # TODO: support translation
+        if not self.reverse_angle:
+            gt_bboxes_3d = results['gt_bboxes_3d'].numpy()
+            gt_bboxes_3d[:, 6] -= 2 * rot_angle
+            results['gt_bboxes_3d'] = LiDARInstance3DBoxes(
+                gt_bboxes_3d, box_dim=9)
+
+        return results
+
+    def rotate_bev_along_z(self, results, angle):
+        rot_cos = torch.cos(torch.tensor(angle))
+        rot_sin = torch.sin(torch.tensor(angle))
+
+        rot_mat = torch.tensor([[rot_cos, -rot_sin, 0, 0],
+                                [rot_sin, rot_cos, 0, 0], [0, 0, 1, 0],
+                                [0, 0, 0, 1]])
+        rot_mat_inv = torch.inverse(rot_mat)
+        num_view = len(results['lidar2cam'])
+        for view in range(num_view):
+            results['lidar2cam'][view] = (
+                torch.tensor(np.array(results['lidar2cam'][view]).T).float()
+                @ rot_mat_inv).T.numpy()
+
+        return
+
+    def scale_xyz(self, results, scale_ratio):
+        rot_mat = torch.tensor([
+            [scale_ratio, 0, 0, 0],
+            [0, scale_ratio, 0, 0],
+            [0, 0, scale_ratio, 0],
+            [0, 0, 0, 1],
+        ])
+
+        rot_mat_inv = torch.inverse(rot_mat)
+
+        num_view = len(results['lidar2cam'])
+        for view in range(num_view):
+            results['lidar2cam'][view] = (torch.tensor(
+                rot_mat_inv.T
+                @ results['lidar2cam'][view].T).float()).T.numpy()
+
+        return
diff --git a/mmde/projects/PETR/petr/utils.py b/mmde/projects/PETR/petr/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..edf2b763c9550486e6a55dbc629620e38c0f247a
--- /dev/null
+++ b/mmde/projects/PETR/petr/utils.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet3d.structures.bbox_3d.utils import limit_period
+
+
+def normalize_bbox(bboxes, pc_range):
+
+    cx = bboxes[..., 0:1]
+    cy = bboxes[..., 1:2]
+    cz = bboxes[..., 2:3]
+    length = bboxes[..., 3:4].log()
+    width = bboxes[..., 4:5].log()
+    height = bboxes[..., 5:6].log()
+
+    rot = -bboxes[..., 6:7] - np.pi / 2
+    rot = limit_period(rot, period=np.pi * 2)
+    if bboxes.size(-1) > 7:
+        vx = bboxes[..., 7:8]
+        vy = bboxes[..., 8:9]
+        normalized_bboxes = torch.cat(
+            (cx, cy, length, width, cz, height, rot.sin(), rot.cos(), vx, vy),
+            dim=-1)
+    else:
+        normalized_bboxes = torch.cat(
+            (cx, cy, length, width, cz, height, rot.sin(), rot.cos()), dim=-1)
+    return normalized_bboxes
+
+
+def denormalize_bbox(normalized_bboxes, pc_range):
+    # rotation
+    rot_sine = normalized_bboxes[..., 6:7]
+
+    rot_cosine = normalized_bboxes[..., 7:8]
+    rot = torch.atan2(rot_sine, rot_cosine)
+    rot = -rot - np.pi / 2
+    rot = limit_period(rot, period=np.pi * 2)
+
+    # center in the bev
+    cx = normalized_bboxes[..., 0:1]
+    cy = normalized_bboxes[..., 1:2]
+    cz = normalized_bboxes[..., 4:5]
+
+    # size
+    length = normalized_bboxes[..., 2:3]
+    width = normalized_bboxes[..., 3:4]
+    height = normalized_bboxes[..., 5:6]
+
+    width = width.exp()
+    length = length.exp()
+    height = height.exp()
+    if normalized_bboxes.size(-1) > 8:
+        # velocity
+        vx = normalized_bboxes[:, 8:9]
+        vy = normalized_bboxes[:, 9:10]
+        denormalized_bboxes = torch.cat(
+            [cx, cy, cz, length, width, height, rot, vx, vy], dim=-1)
+    else:
+        denormalized_bboxes = torch.cat(
+            [cx, cy, cz, length, width, height, rot], dim=-1)
+
+    return denormalized_bboxes
diff --git a/mmde/projects/PETR/petr/vovnetcp.py b/mmde/projects/PETR/petr/vovnetcp.py
new file mode 100644
index 0000000000000000000000000000000000000000..62f0fdeafb504da44ab7af6469a9a8707adcb59a
--- /dev/null
+++ b/mmde/projects/PETR/petr/vovnetcp.py
@@ -0,0 +1,475 @@
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Copyright (c) Youngwan Lee (ETRI) All Rights Reserved.
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+# ------------------------------------------------------------------------
+import warnings
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmengine.model import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmdet3d.registry import MODELS
+
+VoVNet19_slim_dw_eSE = {
+    'stem': [64, 64, 64],
+    'stage_conv_ch': [64, 80, 96, 112],
+    'stage_out_ch': [112, 256, 384, 512],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': True
+}
+
+VoVNet19_dw_eSE = {
+    'stem': [64, 64, 64],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': True
+}
+
+VoVNet19_slim_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [64, 80, 96, 112],
+    'stage_out_ch': [112, 256, 384, 512],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet19_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet39_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 5,
+    'block_per_stage': [1, 1, 2, 2],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet57_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 5,
+    'block_per_stage': [1, 1, 4, 3],
+    'eSE': True,
+    'dw': False
+}
+
+VoVNet99_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [128, 160, 192, 224],
+    'stage_out_ch': [256, 512, 768, 1024],
+    'layer_per_block': 5,
+    'block_per_stage': [1, 3, 9, 3],
+    'eSE': True,
+    'dw': False
+}
+
+_STAGE_SPECS = {
+    'V-19-slim-dw-eSE': VoVNet19_slim_dw_eSE,
+    'V-19-dw-eSE': VoVNet19_dw_eSE,
+    'V-19-slim-eSE': VoVNet19_slim_eSE,
+    'V-19-eSE': VoVNet19_eSE,
+    'V-39-eSE': VoVNet39_eSE,
+    'V-57-eSE': VoVNet57_eSE,
+    'V-99-eSE': VoVNet99_eSE,
+}
+
+
+def dw_conv3x3(in_channels,
+               out_channels,
+               module_name,
+               postfix,
+               stride=1,
+               kernel_size=3,
+               padding=1):
+    """3x3 convolution with padding."""
+    return [
+        ('{}_{}/dw_conv3x3'.format(module_name, postfix),
+         nn.Conv2d(
+             in_channels,
+             out_channels,
+             kernel_size=kernel_size,
+             stride=stride,
+             padding=padding,
+             groups=out_channels,
+             bias=False)),
+        ('{}_{}/pw_conv1x1'.format(module_name, postfix),
+         nn.Conv2d(
+             in_channels,
+             out_channels,
+             kernel_size=1,
+             stride=1,
+             padding=0,
+             groups=1,
+             bias=False)),
+        ('{}_{}/pw_norm'.format(module_name,
+                                postfix), nn.BatchNorm2d(out_channels)),
+        ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)),
+    ]
+
+
+def conv3x3(in_channels,
+            out_channels,
+            module_name,
+            postfix,
+            stride=1,
+            groups=1,
+            kernel_size=3,
+            padding=1):
+    """3x3 convolution with padding."""
+    return [
+        (
+            f'{module_name}_{postfix}/conv',
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)),
+        (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)),
+    ]
+
+
+def conv1x1(in_channels,
+            out_channels,
+            module_name,
+            postfix,
+            stride=1,
+            groups=1,
+            kernel_size=1,
+            padding=0):
+    """1x1 convolution with padding."""
+    return [
+        (
+            f'{module_name}_{postfix}/conv',
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f'{module_name}_{postfix}/norm', nn.BatchNorm2d(out_channels)),
+        (f'{module_name}_{postfix}/relu', nn.ReLU(inplace=True)),
+    ]
+
+
+class Hsigmoid(nn.Module):
+
+    def __init__(self, inplace=True):
+        super(Hsigmoid, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return F.relu6(x + 3.0, inplace=self.inplace) / 6.0
+
+
+class eSEModule(nn.Module):
+
+    def __init__(self, channel, reduction=4):
+        super(eSEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
+        self.hsigmoid = Hsigmoid()
+
+    def forward(self, x):
+        input = x
+        x = self.avg_pool(x)
+        x = self.fc(x)
+        x = self.hsigmoid(x)
+        return input * x
+
+
+class _OSA_module(nn.Module):
+
+    def __init__(self,
+                 in_ch,
+                 stage_ch,
+                 concat_ch,
+                 layer_per_block,
+                 module_name,
+                 SE=False,
+                 identity=False,
+                 depthwise=False,
+                 with_cp=True):
+
+        super(_OSA_module, self).__init__()
+
+        self.identity = identity
+        self.depthwise = depthwise
+        self.isReduced = False
+        self.use_checkpoint = with_cp
+        self.layers = nn.ModuleList()
+        in_channel = in_ch
+        if self.depthwise and in_channel != stage_ch:
+            self.isReduced = True
+            self.conv_reduction = nn.Sequential(
+                OrderedDict(
+                    conv1x1(in_channel, stage_ch,
+                            '{}_reduction'.format(module_name), '0')))
+        for i in range(layer_per_block):
+            if self.depthwise:
+                self.layers.append(
+                    nn.Sequential(
+                        OrderedDict(
+                            dw_conv3x3(stage_ch, stage_ch, module_name, i))))
+            else:
+                self.layers.append(
+                    nn.Sequential(
+                        OrderedDict(
+                            conv3x3(in_channel, stage_ch, module_name, i))))
+            in_channel = stage_ch
+
+        # feature aggregation
+        in_channel = in_ch + layer_per_block * stage_ch
+        self.concat = nn.Sequential(
+            OrderedDict(conv1x1(in_channel, concat_ch, module_name, 'concat')))
+
+        self.ese = eSEModule(concat_ch)
+
+    def _forward(self, x):
+
+        identity_feat = x
+
+        output = []
+        output.append(x)
+        if self.depthwise and self.isReduced:
+            x = self.conv_reduction(x)
+        for layer in self.layers:
+            x = layer(x)
+            output.append(x)
+
+        x = torch.cat(output, dim=1)
+        xt = self.concat(x)
+
+        xt = self.ese(xt)
+
+        if self.identity:
+            xt = xt + identity_feat
+
+        return xt
+
+    def forward(self, x):
+
+        if self.use_checkpoint and self.training:
+            xt = cp.checkpoint(self._forward, x)
+        else:
+            xt = self._forward(x)
+
+        return xt
+
+
+class _OSA_stage(nn.Sequential):
+
+    def __init__(self,
+                 in_ch,
+                 stage_ch,
+                 concat_ch,
+                 block_per_stage,
+                 layer_per_block,
+                 stage_num,
+                 SE=False,
+                 depthwise=False):
+
+        super(_OSA_stage, self).__init__()
+
+        if not stage_num == 2:
+            self.add_module(
+                'Pooling',
+                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True))
+
+        if block_per_stage != 1:
+            SE = False
+        module_name = f'OSA{stage_num}_1'
+        self.add_module(
+            module_name,
+            _OSA_module(
+                in_ch,
+                stage_ch,
+                concat_ch,
+                layer_per_block,
+                module_name,
+                SE,
+                depthwise=depthwise))
+        for i in range(block_per_stage - 1):
+            if i != block_per_stage - 2:  # last block
+                SE = False
+            module_name = f'OSA{stage_num}_{i + 2}'
+            self.add_module(
+                module_name,
+                _OSA_module(
+                    concat_ch,
+                    stage_ch,
+                    concat_ch,
+                    layer_per_block,
+                    module_name,
+                    SE,
+                    identity=True,
+                    depthwise=depthwise),
+            )
+
+
+@MODELS.register_module()
+class VoVNetCP(BaseModule):
+
+    def __init__(self,
+                 spec_name,
+                 input_ch=3,
+                 out_features=None,
+                 frozen_stages=-1,
+                 norm_eval=True,
+                 pretrained=None,
+                 init_cfg=None):
+        """
+        Args:
+            input_ch(int) : the number of input channel
+            out_features (list[str]): name of the layers whose outputs should
+                be returned in forward. Can be anything in "stem", "stage2" ...
+        """
+        super(VoVNetCP, self).__init__(init_cfg)
+        self.frozen_stages = frozen_stages
+        self.norm_eval = norm_eval
+
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        stage_specs = _STAGE_SPECS[spec_name]
+
+        stem_ch = stage_specs['stem']
+        config_stage_ch = stage_specs['stage_conv_ch']
+        config_concat_ch = stage_specs['stage_out_ch']
+        block_per_stage = stage_specs['block_per_stage']
+        layer_per_block = stage_specs['layer_per_block']
+        SE = stage_specs['eSE']
+        depthwise = stage_specs['dw']
+
+        self._out_features = out_features
+
+        # Stem module
+        conv_type = dw_conv3x3 if depthwise else conv3x3
+        stem = conv3x3(input_ch, stem_ch[0], 'stem', '1', 2)
+        stem += conv_type(stem_ch[0], stem_ch[1], 'stem', '2', 1)
+        stem += conv_type(stem_ch[1], stem_ch[2], 'stem', '3', 2)
+        self.add_module('stem', nn.Sequential((OrderedDict(stem))))
+        current_stirde = 4
+        self._out_feature_strides = {
+            'stem': current_stirde,
+            'stage2': current_stirde
+        }
+        self._out_feature_channels = {'stem': stem_ch[2]}
+
+        stem_out_ch = [stem_ch[2]]
+        in_ch_list = stem_out_ch + config_concat_ch[:-1]
+        # OSA stages
+        self.stage_names = []
+        for i in range(4):  # num_stages
+            name = 'stage%d' % (i + 2)  # stage 2 ... stage 5
+            self.stage_names.append(name)
+            self.add_module(
+                name,
+                _OSA_stage(
+                    in_ch_list[i],
+                    config_stage_ch[i],
+                    config_concat_ch[i],
+                    block_per_stage[i],
+                    layer_per_block,
+                    i + 2,
+                    SE,
+                    depthwise,
+                ),
+            )
+
+            self._out_feature_channels[name] = config_concat_ch[i]
+            if not i == 0:
+                self._out_feature_strides[name] = current_stirde = int(
+                    current_stirde * 2)
+
+        # initialize weights
+        # self._initialize_weights()
+
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight)
+
+    # def forward(self, x):
+    #     outputs = {}
+    #     x = self.stem(x)
+    #     if "stem" in self._out_features:
+    #         outputs["stem"] = x
+    #     for name in self.stage_names:
+    #         x = getattr(self, name)(x)
+    #         if name in self._out_features:
+    #             outputs[name] = x
+
+    #     return outputs
+
+    def forward(self, x):
+        outputs = []
+        x = self.stem(x)
+        if 'stem' in self._out_features:
+            outputs.append(x)
+        for name in self.stage_names:
+            x = getattr(self, name)(x)
+            if name in self._out_features:
+                outputs.append(x)
+
+        return outputs
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            m = getattr(self, 'stem')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'stage{i+1}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep normalization layer
+        freezed."""
+        super(VoVNetCP, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/mmde/projects/TPVFormer/README.md b/mmde/projects/TPVFormer/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fcc06ec6797ffed39fa536b672fa5c648b275e4d
--- /dev/null
+++ b/mmde/projects/TPVFormer/README.md
@@ -0,0 +1,60 @@
+# Tri-Perspective View for Vision-Based 3D Semantic Occupancy Prediction
+
+> [Tri-Perspective View for Vision-Based 3D Semantic Occupancy Prediction](https://arxiv.org/abs/2302.07817)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Modern methods for vision-centric autonomous driving perception widely adopt the bird's-eye-view (BEV) representation to describe a 3D scene. Despite its better efficiency than voxel representation, it has difficulty describing the fine-grained 3D structure of a scene with a single plane. To address this, we propose a tri-perspective view (TPV) representation which accompanies BEV with two additional perpendicular planes. We model each point in the 3D space by summing its projected features on the three planes. To lift image features to the 3D TPV space, we further propose a transformer-based TPV encoder (TPVFormer) to obtain the TPV features effectively. We employ the attention mechanism to aggregate the image features corresponding to each query in each TPV plane. Experiments show that our model trained with sparse supervision effectively predicts the semantic occupancy for all voxels. We demonstrate for the first time that using only camera inputs can achieve comparable performance with LiDAR-based methods on the LiDAR segmentation task on nuScenes. Code: https://github.com/wzzheng/TPVFormer.
+
+<div align=center>
+<img src="https://github.com/traveller59/spconv/assets/72679458/8cc8caa6-b330-4f32-9599-3811dc5d7332" width="800"/>
+</div>
+
+## Introduction
+
+We implement TPVFormer and provide the results and checkpoints on nuScenes dataset.
+
+## Usage
+
+<!-- For a typical model, this section should contain the commands for training and testing. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. -->
+
+### Training commands
+
+In MMDetection3D's root directory, run the following command to train the model:
+
+1. Downloads the [pretrained backbone weights](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/tpvformer/tpvformer_8xb1-2x_nus-seg/tpvformer_pretrained_fcos3d_r101_dcn.pth) to checkpoints/
+
+2. For example, to train TPVFormer on 8 GPUs, please use
+
+```bash
+bash tools/dist_train.sh projects/TPVFormer/config/tpvformer_8xb1-2x_nus-seg.py 8
+```
+
+### Testing commands
+
+In MMDetection3D's root directory, run the following command to test the model on 8 GPUs:
+
+```bash
+bash tools/dist_test.sh projects/TPVFormer/config/tpvformer_8xb1-2x_nus-seg.py  ${CHECKPOINT_PATH} 8
+```
+
+## Results and models
+
+### nuScenes
+
+| Backbone                                                                                                                                         | Neck | Mem (GB) | Inf time (fps) | mIoU | Downloads                                                                                                                                                                                                                                                                                                             |
+| ------------------------------------------------------------------------------------------------------------------------------------------------ | ---- | -------- | -------------- | ---- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [ResNet101 w/ DCN](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py) | FPN  | 32.0     | -              | 68.9 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/tpvformer/tpvformer_8xb1-2x_nus-seg/tpvformer_8xb1-2x_nus-seg_20230411_150639-bd3844e2.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/tpvformer/tpvformer_8xb1-2x_nus-seg/tpvformer_8xb1-2x_nus-seg_20230411_150639.log) |
+
+## Citation
+
+```latex
+@article{huang2023tri,
+    title={Tri-Perspective View for Vision-Based 3D Semantic Occupancy Prediction},
+    author={Huang, Yuanhui and Zheng, Wenzhao and Zhang, Yunpeng and Zhou, Jie and Lu, Jiwen },
+    journal={arXiv preprint arXiv:2302.07817},
+    year={2023}
+}
+```
diff --git a/mmde/projects/TPVFormer/configs/tpvformer_8xb1-2x_nus-seg.py b/mmde/projects/TPVFormer/configs/tpvformer_8xb1-2x_nus-seg.py
new file mode 100644
index 0000000000000000000000000000000000000000..70021d8ff5226582673780914b5ebab13fb5edea
--- /dev/null
+++ b/mmde/projects/TPVFormer/configs/tpvformer_8xb1-2x_nus-seg.py
@@ -0,0 +1,317 @@
+_base_ = ['../../../configs/_base_/default_runtime.py']
+
+custom_imports = dict(
+    imports=['projects.TPVFormer.tpvformer'], allow_failed_imports=False)
+
+dataset_type = 'NuScenesSegDataset'
+data_root = 'data/nuscenes/'
+data_prefix = dict(
+    pts='samples/LIDAR_TOP',
+    pts_semantic_mask='lidarseg/v1.0-trainval',
+    CAM_FRONT='samples/CAM_FRONT',
+    CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+    CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+    CAM_BACK='samples/CAM_BACK',
+    CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+    CAM_BACK_LEFT='samples/CAM_BACK_LEFT')
+
+backend_args = None
+
+train_pipeline = [
+    dict(
+        type='BEVLoadMultiViewImageFromFiles',
+        to_float32=False,
+        color_type='unchanged',
+        num_views=6,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=3,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        with_attr_label=False,
+        seg_3d_dtype='np.uint8'),
+    dict(
+        type='MultiViewWrapper',
+        transforms=dict(type='PhotoMetricDistortion3D')),
+    dict(type='SegLabelMapping'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img', 'points', 'pts_semantic_mask'],
+        meta_keys=['lidar2img'])
+]
+
+val_pipeline = [
+    dict(
+        type='BEVLoadMultiViewImageFromFiles',
+        to_float32=False,
+        color_type='unchanged',
+        num_views=6,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=3,
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_seg_3d=True,
+        with_attr_label=False,
+        seg_3d_dtype='np.uint8'),
+    dict(type='SegLabelMapping'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['img', 'points', 'pts_semantic_mask'],
+        meta_keys=['lidar2img'])
+]
+
+test_pipeline = val_pipeline
+
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=data_prefix,
+        ann_file='nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        test_mode=False))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=data_prefix,
+        ann_file='nuscenes_infos_val.pkl',
+        pipeline=val_pipeline,
+        test_mode=True))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='SegMetric')
+
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01),
+    paramwise_cfg=dict(custom_keys={
+        'backbone': dict(lr_mult=0.1),
+    }),
+    clip_grad=dict(max_norm=35, norm_type=2),
+)
+
+param_scheduler = [
+    dict(type='LinearLR', start_factor=1e-5, by_epoch=False, begin=0, end=500),
+    dict(
+        type='CosineAnnealingLR',
+        begin=0,
+        T_max=24,
+        by_epoch=True,
+        eta_min=1e-6,
+        convert_to_iter_based=True)
+]
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
+
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+_dim_ = 128
+num_heads = 8
+_ffn_dim_ = _dim_ * 2
+
+tpv_h_ = 200
+tpv_w_ = 200
+tpv_z_ = 16
+scale_h = 1
+scale_w = 1
+scale_z = 1
+num_points_in_pillar = [4, 32, 32]
+num_points = [8, 64, 64]
+hybrid_attn_anchors = 16
+hybrid_attn_points = 32
+hybrid_attn_init = 0
+
+grid_shape = [tpv_h_ * scale_h, tpv_w_ * scale_w, tpv_z_ * scale_z]
+
+self_cross_layer = dict(
+    type='TPVFormerLayer',
+    attn_cfgs=[
+        dict(
+            type='TPVCrossViewHybridAttention',
+            tpv_h=tpv_h_,
+            tpv_w=tpv_w_,
+            tpv_z=tpv_z_,
+            num_anchors=hybrid_attn_anchors,
+            embed_dims=_dim_,
+            num_heads=num_heads,
+            num_points=hybrid_attn_points,
+            init_mode=hybrid_attn_init,
+            dropout=0.1),
+        dict(
+            type='TPVImageCrossAttention',
+            pc_range=point_cloud_range,
+            num_cams=6,
+            dropout=0.1,
+            deformable_attention=dict(
+                type='TPVMSDeformableAttention3D',
+                embed_dims=_dim_,
+                num_heads=num_heads,
+                num_points=num_points,
+                num_z_anchors=num_points_in_pillar,
+                num_levels=4,
+                floor_sampling_offset=False,
+                tpv_h=tpv_h_,
+                tpv_w=tpv_w_,
+                tpv_z=tpv_z_),
+            embed_dims=_dim_,
+            tpv_h=tpv_h_,
+            tpv_w=tpv_w_,
+            tpv_z=tpv_z_)
+    ],
+    feedforward_channels=_ffn_dim_,
+    ffn_dropout=0.1,
+    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'))
+
+self_layer = dict(
+    type='TPVFormerLayer',
+    attn_cfgs=[
+        dict(
+            type='TPVCrossViewHybridAttention',
+            tpv_h=tpv_h_,
+            tpv_w=tpv_w_,
+            tpv_z=tpv_z_,
+            num_anchors=hybrid_attn_anchors,
+            embed_dims=_dim_,
+            num_heads=num_heads,
+            num_points=hybrid_attn_points,
+            init_mode=hybrid_attn_init,
+            dropout=0.1)
+    ],
+    feedforward_channels=_ffn_dim_,
+    ffn_dropout=0.1,
+    operation_order=('self_attn', 'norm', 'ffn', 'norm'))
+
+model = dict(
+    type='TPVFormer',
+    data_preprocessor=dict(
+        type='TPVFormerDataPreprocessor',
+        pad_size_divisor=32,
+        mean=[103.530, 116.280, 123.675],
+        std=[1.0, 1.0, 1.0],
+        voxel=True,
+        voxel_type='cylindrical',
+        voxel_layer=dict(
+            grid_shape=grid_shape,
+            point_cloud_range=point_cloud_range,
+            max_num_points=-1,
+            max_voxels=-1,
+        ),
+        batch_augments=[
+            dict(
+                type='GridMask',
+                use_h=True,
+                use_w=True,
+                rotate=1,
+                offset=False,
+                ratio=0.5,
+                mode=1,
+                prob=0.7)
+        ]),
+    backbone=dict(
+        type='mmdet.ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN2d', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        dcn=dict(
+            type='DCNv2', deform_groups=1, fallback_on_stride=False
+        ),  # original DCNv2 will print log when perform load_state_dict
+        stage_with_dcn=(False, False, True, True),
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='checkpoints/tpvformer_pretrained_fcos3d_r101_dcn.pth',
+            prefix='backbone.')),
+    neck=dict(
+        type='mmdet.FPN',
+        in_channels=[512, 1024, 2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=4,
+        relu_before_extra_convs=True,
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='checkpoints/tpvformer_pretrained_fcos3d_r101_dcn.pth',
+            prefix='neck.')),
+    encoder=dict(
+        type='TPVFormerEncoder',
+        tpv_h=tpv_h_,
+        tpv_w=tpv_w_,
+        tpv_z=tpv_z_,
+        num_layers=5,
+        pc_range=point_cloud_range,
+        num_points_in_pillar=num_points_in_pillar,
+        num_points_in_pillar_cross_view=[16, 16, 16],
+        return_intermediate=False,
+        transformerlayers=[
+            self_cross_layer, self_cross_layer, self_cross_layer, self_layer,
+            self_layer
+        ],
+        embed_dims=_dim_,
+        positional_encoding=dict(
+            type='TPVFormerPositionalEncoding',
+            num_feats=[48, 48, 32],
+            h=tpv_h_,
+            w=tpv_w_,
+            z=tpv_z_)),
+    decode_head=dict(
+        type='TPVFormerDecoder',
+        tpv_h=tpv_h_,
+        tpv_w=tpv_w_,
+        tpv_z=tpv_z_,
+        num_classes=17,
+        in_dims=_dim_,
+        hidden_dims=2 * _dim_,
+        out_dims=_dim_,
+        scale_h=scale_h,
+        scale_w=scale_w,
+        scale_z=scale_z,
+        loss_ce=dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=False,
+            class_weight=None,
+            avg_non_ignore=True,
+            loss_weight=1.0),
+        loss_lovasz=dict(type='LovaszLoss', loss_weight=1.0, reduction='none'),
+        lovasz_input='points',
+        ce_input='voxel',
+        ignore_index=0))
diff --git a/mmde/projects/TPVFormer/tpvformer/__init__.py b/mmde/projects/TPVFormer/tpvformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6162558cfb1f2da3558b3f6c43801ee5f7b40e80
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/__init__.py
@@ -0,0 +1,17 @@
+from .cross_view_hybrid_attention import TPVCrossViewHybridAttention
+from .data_preprocessor import TPVFormerDataPreprocessor
+from .image_cross_attention import TPVImageCrossAttention
+from .loading import BEVLoadMultiViewImageFromFiles, SegLabelMapping
+from .nuscenes_dataset import NuScenesSegDataset
+from .positional_encoding import TPVFormerPositionalEncoding
+from .tpvformer import TPVFormer
+from .tpvformer_encoder import TPVFormerEncoder
+from .tpvformer_head import TPVFormerDecoder
+from .tpvformer_layer import TPVFormerLayer
+
+__all__ = [
+    'TPVCrossViewHybridAttention', 'TPVImageCrossAttention',
+    'TPVFormerPositionalEncoding', 'TPVFormer', 'TPVFormerEncoder',
+    'TPVFormerLayer', 'NuScenesSegDataset', 'BEVLoadMultiViewImageFromFiles',
+    'SegLabelMapping', 'TPVFormerDecoder', 'TPVFormerDataPreprocessor'
+]
diff --git a/mmde/projects/TPVFormer/tpvformer/cross_view_hybrid_attention.py b/mmde/projects/TPVFormer/tpvformer/cross_view_hybrid_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e881775dd41e46194771817de38b2c936d54ba0
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/cross_view_hybrid_attention.py
@@ -0,0 +1,209 @@
+import math
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.ops.multi_scale_deform_attn import (
+    MultiScaleDeformableAttnFunction, multi_scale_deformable_attn_pytorch)
+from mmengine.model import BaseModule, constant_init, xavier_init
+from torch import Tensor
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class TPVCrossViewHybridAttention(BaseModule):
+    """TPVFormer Cross-view Hybrid Attention Module."""
+
+    def __init__(self,
+                 tpv_h: int,
+                 tpv_w: int,
+                 tpv_z: int,
+                 embed_dims: int = 256,
+                 num_heads: int = 8,
+                 num_points: int = 4,
+                 num_anchors: int = 2,
+                 init_mode: int = 0,
+                 dropout: float = 0.1,
+                 **kwargs):
+        super().__init__()
+        self.embed_dims = embed_dims
+        self.num_heads = num_heads
+        self.num_levels = 3
+        self.num_points = num_points
+        self.num_anchors = num_anchors
+        self.init_mode = init_mode
+        self.dropout = nn.ModuleList([nn.Dropout(dropout) for _ in range(3)])
+        self.output_proj = nn.ModuleList(
+            [nn.Linear(embed_dims, embed_dims) for _ in range(3)])
+        self.sampling_offsets = nn.ModuleList([
+            nn.Linear(embed_dims, num_heads * 3 * num_points * 2)
+            for _ in range(3)
+        ])
+        self.attention_weights = nn.ModuleList([
+            nn.Linear(embed_dims, num_heads * 3 * (num_points + 1))
+            for _ in range(3)
+        ])
+        self.value_proj = nn.ModuleList(
+            [nn.Linear(embed_dims, embed_dims) for _ in range(3)])
+
+        self.tpv_h, self.tpv_w, self.tpv_z = tpv_h, tpv_w, tpv_z
+
+    def init_weights(self):
+        """Default initialization for Parameters of Module."""
+        device = next(self.parameters()).device
+        # self plane
+        theta_self = torch.arange(
+            self.num_heads, dtype=torch.float32,
+            device=device) * (2.0 * math.pi / self.num_heads)
+        grid_self = torch.stack(
+            [theta_self.cos(), theta_self.sin()], -1)  # H, 2
+        grid_self = grid_self.view(self.num_heads, 1,
+                                   2).repeat(1, self.num_points, 1)
+        for j in range(self.num_points):
+            grid_self[:, j, :] *= (j + 1) / 2
+
+        if self.init_mode == 0:
+            # num_phi = 4
+            phi = torch.arange(
+                4, dtype=torch.float32, device=device) * (2.0 * math.pi / 4)
+            assert self.num_heads % 4 == 0
+            num_theta = int(self.num_heads / 4)
+            theta = torch.arange(
+                num_theta, dtype=torch.float32, device=device) * (
+                    math.pi / num_theta) + (math.pi / num_theta / 2)  # 3
+            x = torch.matmul(theta.sin().unsqueeze(-1),
+                             phi.cos().unsqueeze(0)).flatten()
+            y = torch.matmul(theta.sin().unsqueeze(-1),
+                             phi.sin().unsqueeze(0)).flatten()
+            z = theta.cos().unsqueeze(-1).repeat(1, 4).flatten()
+            xyz = torch.stack([x, y, z], dim=-1)  # H, 3
+
+        elif self.init_mode == 1:
+
+            xyz = [[0, 0, 1], [0, 0, -1], [0, 1, 0], [0, -1, 0], [1, 0, 0],
+                   [-1, 0, 0]]
+            xyz = torch.tensor(xyz, dtype=torch.float32, device=device)
+
+        grid_hw = xyz[:, [0, 1]]  # H, 2
+        grid_zh = xyz[:, [2, 0]]
+        grid_wz = xyz[:, [1, 2]]
+
+        for i in range(3):
+            grid = torch.stack([grid_hw, grid_zh, grid_wz], dim=1)  # H, 3, 2
+            grid = grid.unsqueeze(2).repeat(1, 1, self.num_points, 1)
+
+            grid = grid.reshape(self.num_heads, self.num_levels,
+                                self.num_anchors, -1, 2)
+            for j in range(self.num_points // self.num_anchors):
+                grid[:, :, :, j, :] *= 2 * (j + 1)
+            grid = grid.flatten(2, 3)
+            grid[:, i, :, :] = grid_self
+
+            constant_init(self.sampling_offsets[i], 0.)
+            self.sampling_offsets[i].bias.data = grid.view(-1)
+
+            constant_init(self.attention_weights[i], val=0., bias=0.)
+            attn_bias = torch.zeros(
+                self.num_heads, 3, self.num_points + 1, device=device)
+            attn_bias[:, i, -1] = 10
+            self.attention_weights[i].bias.data = attn_bias.flatten()
+            xavier_init(self.value_proj[i], distribution='uniform', bias=0.)
+            xavier_init(self.output_proj[i], distribution='uniform', bias=0.)
+
+    def get_sampling_offsets_and_attention(
+            self, queries: List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
+        offsets = []
+        attns = []
+        for i, (query, fc, attn) in enumerate(
+                zip(queries, self.sampling_offsets, self.attention_weights)):
+            bs, l, d = query.shape
+
+            offset = fc(query).reshape(bs, l, self.num_heads, self.num_levels,
+                                       self.num_points, 2)
+            offsets.append(offset)
+
+            attention = attn(query).reshape(bs, l, self.num_heads, 3, -1)
+            level_attention = attention[:, :, :, :,
+                                        -1:].softmax(-2)  # bs, l, H, 3, 1
+            attention = attention[:, :, :, :, :-1]
+            attention = attention.softmax(-1)  # bs, l, H, 3, p
+            attention = attention * level_attention
+            attns.append(attention)
+
+        offsets = torch.cat(offsets, dim=1)
+        attns = torch.cat(attns, dim=1)
+        return offsets, attns
+
+    def reshape_output(self, output: Tensor, lens: List[int]) -> List[Tensor]:
+        outputs = torch.split(output, [lens[0], lens[1], lens[2]], dim=1)
+        return outputs
+
+    def forward(self,
+                query: List[Tensor],
+                identity: Optional[List[Tensor]] = None,
+                query_pos: Optional[List[Tensor]] = None,
+                reference_points=None,
+                spatial_shapes=None,
+                level_start_index=None):
+        identity = query if identity is None else identity
+        if query_pos is not None:
+            query = [q + p for q, p in zip(query, query_pos)]
+
+        # value proj
+        query_lens = [q.shape[1] for q in query]
+        value = [layer(q) for layer, q in zip(self.value_proj, query)]
+        value = torch.cat(value, dim=1)
+        bs, num_value, _ = value.shape
+        value = value.view(bs, num_value, self.num_heads, -1)
+
+        # sampling offsets and weights
+        sampling_offsets, attention_weights = \
+            self.get_sampling_offsets_and_attention(query)
+
+        if reference_points.shape[-1] == 2:
+            """For each tpv query, it owns `num_Z_anchors` in 3D space that
+            having different heights. After projecting, each tpv query has
+            `num_Z_anchors` reference points in each 2D image. For each
+            referent point, we sample `num_points` sampling points.
+
+            For `num_Z_anchors` reference points,
+            it has overall `num_points * num_Z_anchors` sampling points.
+            """
+            offset_normalizer = torch.stack(
+                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+
+            bs, num_query, _, num_Z_anchors, xy = reference_points.shape
+            reference_points = reference_points[:, :, None, :, :, None, :]
+            sampling_offsets = sampling_offsets / \
+                offset_normalizer[None, None, None, :, None, :]
+            bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape  # noqa
+            sampling_offsets = sampling_offsets.view(
+                bs, num_query, num_heads, num_levels, num_Z_anchors,
+                num_all_points // num_Z_anchors, xy)
+            sampling_locations = reference_points + sampling_offsets
+            bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape  # noqa
+
+            sampling_locations = sampling_locations.view(
+                bs, num_query, num_heads, num_levels, num_all_points, xy)
+        else:
+            raise ValueError(
+                f'Last dim of reference_points must be'
+                f' 2, but get {reference_points.shape[-1]} instead.')
+
+        if torch.cuda.is_available() and value.is_cuda:
+            output = MultiScaleDeformableAttnFunction.apply(
+                value, spatial_shapes, level_start_index, sampling_locations,
+                attention_weights, 64)
+        else:
+            output = multi_scale_deformable_attn_pytorch(
+                value, spatial_shapes, sampling_locations, attention_weights)
+
+        outputs = self.reshape_output(output, query_lens)
+
+        results = []
+        for out, layer, drop, residual in zip(outputs, self.output_proj,
+                                              self.dropout, identity):
+            results.append(residual + drop(layer(out)))
+
+        return results
diff --git a/mmde/projects/TPVFormer/tpvformer/data_preprocessor.py b/mmde/projects/TPVFormer/tpvformer/data_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..e340873a35f0a9c9ca6adac921cc9e8d2bb00b0f
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/data_preprocessor.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from PIL import Image
+from torch import Tensor
+from torch.nn import functional as F
+
+from mmdet3d.models import Det3DDataPreprocessor
+from mmdet3d.models.data_preprocessors.voxelize import dynamic_scatter_3d
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+
+
+@MODELS.register_module()
+class TPVFormerDataPreprocessor(Det3DDataPreprocessor):
+
+    @torch.no_grad()
+    def voxelize(self, points: List[Tensor],
+                 data_samples: SampleList) -> List[Tensor]:
+        """Apply voxelization to point cloud. In TPVFormer, it will get voxel-
+        wise segmentation label and voxel/point coordinates.
+
+        Args:
+            points (List[Tensor]): Point cloud in one data batch.
+            data_samples: (List[:obj:`Det3DDataSample`]): The annotation data
+                of every samples. Add voxel-wise annotation for segmentation.
+
+        Returns:
+            List[Tensor]: Coordinates of voxels, shape is Nx3,
+        """
+        for point, data_sample in zip(points, data_samples):
+            min_bound = point.new_tensor(
+                self.voxel_layer.point_cloud_range[:3])
+            max_bound = point.new_tensor(
+                self.voxel_layer.point_cloud_range[3:])
+            point_clamp = torch.clamp(point, min_bound, max_bound + 1e-6)
+            coors = torch.floor(
+                (point_clamp - min_bound) /
+                point_clamp.new_tensor(self.voxel_layer.voxel_size)).int()
+            self.get_voxel_seg(coors, data_sample)
+            data_sample.point_coors = coors
+
+    def get_voxel_seg(self, res_coors: Tensor, data_sample: SampleList):
+        """Get voxel-wise segmentation label and point2voxel map.
+
+        Args:
+            res_coors (Tensor): The voxel coordinates of points, Nx3.
+            data_sample: (:obj:`Det3DDataSample`): The annotation data of
+                every samples. Add voxel-wise annotation forsegmentation.
+        """
+
+        if self.training:
+            pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask
+            pts_semantic_mask = F.one_hot(pts_semantic_mask.long()).float()
+            voxel_semantic_mask, voxel_coors, point2voxel_map = \
+                dynamic_scatter_3d(pts_semantic_mask, res_coors, 'mean', True)
+            voxel_semantic_mask = torch.argmax(voxel_semantic_mask, dim=-1)
+            data_sample.gt_pts_seg.voxel_semantic_mask = voxel_semantic_mask
+            data_sample.point2voxel_map = point2voxel_map
+            data_sample.voxel_coors = voxel_coors
+        else:
+            pseudo_tensor = res_coors.new_ones([res_coors.shape[0], 1]).float()
+            _, _, point2voxel_map = dynamic_scatter_3d(pseudo_tensor,
+                                                       res_coors, 'mean', True)
+            data_sample.point2voxel_map = point2voxel_map
+
+
+@MODELS.register_module()
+class GridMask(nn.Module):
+    """GridMask data augmentation.
+
+        Modified from https://github.com/dvlab-research/GridMask.
+
+    Args:
+        use_h (bool): Whether to mask on height dimension. Defaults to True.
+        use_w (bool): Whether to mask on width dimension. Defaults to True.
+        rotate (int): Rotation degree. Defaults to 1.
+        offset (bool): Whether to mask offset. Defaults to False.
+        ratio (float): Mask ratio. Defaults to 0.5.
+        mode (int): Mask mode. if mode == 0, mask with square grid.
+            if mode == 1, mask the rest. Defaults to 0
+        prob (float): Probability of applying the augmentation.
+            Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 use_h: bool = True,
+                 use_w: bool = True,
+                 rotate: int = 1,
+                 offset: bool = False,
+                 ratio: float = 0.5,
+                 mode: int = 0,
+                 prob: float = 1.0):
+        super().__init__()
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode = mode
+        self.prob = prob
+
+    def forward(self, inputs: Tensor,
+                data_samples: SampleList) -> Tuple[Tensor, SampleList]:
+        if np.random.rand() > self.prob:
+            return inputs, data_samples
+        height, width = inputs.shape[-2:]
+        mask_height = int(1.5 * height)
+        mask_width = int(1.5 * width)
+        distance = np.random.randint(2, min(height, width))
+        length = min(max(int(distance * self.ratio + 0.5), 1), distance - 1)
+        mask = np.ones((mask_height, mask_width), np.float32)
+        stride_on_height = np.random.randint(distance)
+        stride_on_width = np.random.randint(distance)
+        if self.use_h:
+            for i in range(mask_height // distance):
+                start = distance * i + stride_on_height
+                end = min(start + length, mask_height)
+                mask[start:end, :] *= 0
+        if self.use_w:
+            for i in range(mask_width // distance):
+                start = distance * i + stride_on_width
+                end = min(start + length, mask_width)
+                mask[:, start:end] *= 0
+
+        # NOTE: r is the rotation radian, here is a random counterclockwise
+        # rotation of 1° or remain unchanged, which follows the implementation
+        # of the official detection version.
+        # https://github.com/dvlab-research/GridMask.
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+
+        mask = mask.rotate(r)
+        mask = np.array(mask)
+        mask = mask[int(0.25 * height):int(0.25 * height) + height,
+                    int(0.25 * width):int(0.25 * width) + width]
+
+        mask = inputs.new_tensor(mask)
+        if self.mode == 1:
+            mask = 1 - mask
+        mask = mask.expand_as(inputs)
+        if self.offset:
+            offset = inputs.new_tensor(2 *
+                                       (np.random.rand(height, width) - 0.5))
+            inputs = inputs * mask + offset * (1 - mask)
+        else:
+            inputs = inputs * mask
+
+        return inputs, data_samples
diff --git a/mmde/projects/TPVFormer/tpvformer/image_cross_attention.py b/mmde/projects/TPVFormer/tpvformer/image_cross_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..06ad6331a1b562225f5f1a9790156b66cd43732d
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/image_cross_attention.py
@@ -0,0 +1,465 @@
+import math
+import warnings
+
+import torch
+import torch.nn as nn
+from mmcv.ops.multi_scale_deform_attn import (
+    MultiScaleDeformableAttnFunction, multi_scale_deformable_attn_pytorch)
+from mmengine.model import BaseModule, constant_init, xavier_init
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class TPVImageCrossAttention(BaseModule):
+    """An attention module used in TPVFormer.
+
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_cams (int): The number of cameras
+        dropout (float): A Dropout layer on `inp_residual`.
+            Default: 0.1.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        batch_first (bool): Whether the first dimension of the input is batch.
+        deformable_attention: (dict): The config for the deformable
+            attention used in SCA.
+        tpv_h (int): The height of the TPV.
+        tpv_w (int): The width of the TPV.
+        tpv_z (int): The depth of the TPV.
+    """
+
+    def __init__(self,
+                 embed_dims=256,
+                 num_cams=6,
+                 pc_range=None,
+                 dropout=0.1,
+                 init_cfg=None,
+                 batch_first=True,
+                 deformable_attention=dict(
+                     type='MSDeformableAttention3D',
+                     embed_dims=256,
+                     num_levels=4),
+                 tpv_h=None,
+                 tpv_w=None,
+                 tpv_z=None):
+        super().__init__(init_cfg)
+
+        self.init_cfg = init_cfg
+        self.dropout = nn.Dropout(dropout)
+        self.pc_range = pc_range
+        self.fp16_enabled = False
+        self.deformable_attention = MODELS.build(deformable_attention)
+        self.embed_dims = embed_dims
+        self.num_cams = num_cams
+        self.output_proj = nn.Linear(embed_dims, embed_dims)
+        self.batch_first = batch_first
+        self.tpv_h, self.tpv_w, self.tpv_z = tpv_h, tpv_w, tpv_z
+        self.init_weight()
+
+    def init_weight(self):
+        """Default initialization for Parameters of Module."""
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+
+    def forward(self,
+                query,
+                key,
+                value,
+                residual=None,
+                spatial_shapes=None,
+                reference_points_cams=None,
+                tpv_masks=None,
+                level_start_index=None):
+        """Forward Function of Detr3DCrossAtten.
+
+        Args:
+            query (Tensor): Query of Transformer with shape
+                (bs, num_query, embed_dims).
+            key (Tensor): The key tensor with shape
+                (bs, num_key, embed_dims).
+            value (Tensor): The value tensor with shape
+                (bs, num_key, embed_dims).
+            residual (Tensor): The tensor used for addition, with the
+                same shape as `x`. Default None. If None, `x` will be used.
+            spatial_shapes (Tensor): Spatial shape of features in
+                different level. With shape  (num_levels, 2),
+                last dimension represent (h, w).
+            tpv_masks (List[Tensor]): The mask of each views.
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            reference_points_cams (List[Tensor]): The reference points in
+                each camera.
+            tpv_masks (List[Tensor]): The mask of each views.
+            level_start_index (List[int]): The start index of each level.
+
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+        if key is None:
+            key = query
+        if value is None:
+            value = key
+
+        if residual is None:
+            inp_residual = query
+        bs, _, _ = query.size()
+
+        queries = torch.split(
+            query, [
+                self.tpv_h * self.tpv_w, self.tpv_z * self.tpv_h,
+                self.tpv_w * self.tpv_z
+            ],
+            dim=1)
+        if residual is None:
+            slots = [torch.zeros_like(q) for q in queries]
+        indexeses = []
+        max_lens = []
+        queries_rebatches = []
+        reference_points_rebatches = []
+        for tpv_idx, tpv_mask in enumerate(tpv_masks):
+            indexes = []
+            for _, mask_per_img in enumerate(tpv_mask):
+                index_query_per_img = mask_per_img[0].sum(
+                    -1).nonzero().squeeze(-1)
+                indexes.append(index_query_per_img)
+            max_len = max([len(each) for each in indexes])
+            max_lens.append(max_len)
+            indexeses.append(indexes)
+
+            reference_points_cam = reference_points_cams[tpv_idx]
+            D = reference_points_cam.size(3)
+
+            queries_rebatch = queries[tpv_idx].new_zeros(
+                [bs * self.num_cams, max_len, self.embed_dims])
+            reference_points_rebatch = reference_points_cam.new_zeros(
+                [bs * self.num_cams, max_len, D, 2])
+
+            for i, reference_points_per_img in enumerate(reference_points_cam):
+                for j in range(bs):
+                    index_query_per_img = indexes[i]
+                    queries_rebatch[j * self.num_cams +
+                                    i, :len(index_query_per_img)] = queries[
+                                        tpv_idx][j, index_query_per_img]
+                    reference_points_rebatch[j * self.num_cams + i, :len(
+                        index_query_per_img)] = reference_points_per_img[
+                            j, index_query_per_img]
+
+            queries_rebatches.append(queries_rebatch)
+            reference_points_rebatches.append(reference_points_rebatch)
+
+        num_cams, l, bs, embed_dims = key.shape
+
+        key = key.permute(0, 2, 1, 3).view(self.num_cams * bs, l,
+                                           self.embed_dims)
+        value = value.permute(0, 2, 1, 3).view(self.num_cams * bs, l,
+                                               self.embed_dims)
+
+        queries = self.deformable_attention(
+            query=queries_rebatches,
+            key=key,
+            value=value,
+            reference_points=reference_points_rebatches,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+        )
+
+        for tpv_idx, indexes in enumerate(indexeses):
+            for i, index_query_per_img in enumerate(indexes):
+                for j in range(bs):
+                    slots[tpv_idx][j, index_query_per_img] += queries[tpv_idx][
+                        j * self.num_cams + i, :len(index_query_per_img)]
+
+            count = tpv_masks[tpv_idx].sum(-1) > 0
+            count = count.permute(1, 2, 0).sum(-1)
+            count = torch.clamp(count, min=1.0)
+            slots[tpv_idx] = slots[tpv_idx] / count[..., None]
+        slots = torch.cat(slots, dim=1)
+        slots = self.output_proj(slots)
+
+        return self.dropout(slots) + inp_residual
+
+
+@MODELS.register_module()
+class TPVMSDeformableAttention3D(BaseModule):
+    """An attention module used in tpvFormer based on Deformable-Detr.
+    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
+
+    <https://arxiv.org/pdf/2010.04159.pdf>`_.
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_heads (int): Parallel attention heads. Default: 64.
+        num_levels (int): The number of feature map used in
+            Attention. Default: 4.
+        num_points (int): The number of sampling points for
+            each query in each head. Default: 4.
+        im2col_step (int): The step used in image_to_column.
+            Default: 64.
+        dropout (float): A Dropout layer on `inp_identity`.
+            Default: 0.1.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to False.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: None.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(
+        self,
+        embed_dims=256,
+        num_heads=8,
+        num_levels=4,
+        num_points=[8, 64, 64],
+        num_z_anchors=[4, 32, 32],
+        pc_range=None,
+        im2col_step=64,
+        dropout=0.1,
+        batch_first=True,
+        norm_cfg=None,
+        init_cfg=None,
+        floor_sampling_offset=True,
+        tpv_h=None,
+        tpv_w=None,
+        tpv_z=None,
+    ):
+        super().__init__(init_cfg)
+        if embed_dims % num_heads != 0:
+            raise ValueError(f'embed_dims must be divisible by num_heads, '
+                             f'but got {embed_dims} and {num_heads}')
+        dim_per_head = embed_dims // num_heads
+        self.norm_cfg = norm_cfg
+        self.batch_first = batch_first
+        self.output_proj = None
+        self.fp16_enabled = False
+
+        # you'd better set dim_per_head to a power of 2
+        # which is more efficient in the CUDA implementation
+        def _is_power_of_2(n):
+            if (not isinstance(n, int)) or (n < 0):
+                raise ValueError(
+                    'invalid input for _is_power_of_2: {} (type: {})'.format(
+                        n, type(n)))
+            return (n & (n - 1) == 0) and n != 0
+
+        if not _is_power_of_2(dim_per_head):
+            warnings.warn(
+                "You'd better set embed_dims in "
+                'MultiScaleDeformAttention to make '
+                'the dimension of each attention head a power of 2 '
+                'which is more efficient in our CUDA implementation.')
+
+        self.im2col_step = im2col_step
+        self.embed_dims = embed_dims
+        self.num_levels = num_levels
+        self.num_heads = num_heads
+        self.num_points = num_points
+        self.num_z_anchors = num_z_anchors
+        self.base_num_points = num_points[0]
+        self.base_z_anchors = num_z_anchors[0]
+        self.points_multiplier = [
+            points // self.base_z_anchors for points in num_z_anchors
+        ]
+        self.pc_range = pc_range
+        self.tpv_h, self.tpv_w, self.tpv_z = tpv_h, tpv_w, tpv_z
+        self.sampling_offsets = nn.ModuleList([
+            nn.Linear(embed_dims, num_heads * num_levels * num_points[i] * 2)
+            for i in range(3)
+        ])
+        self.floor_sampling_offset = floor_sampling_offset
+        self.attention_weights = nn.ModuleList([
+            nn.Linear(embed_dims, num_heads * num_levels * num_points[i])
+            for i in range(3)
+        ])
+        self.value_proj = nn.Linear(embed_dims, embed_dims)
+
+    def init_weights(self):
+        """Default initialization for Parameters of Module."""
+        device = next(self.parameters()).device
+        for i in range(3):
+            constant_init(self.sampling_offsets[i], 0.)
+            thetas = torch.arange(
+                self.num_heads, dtype=torch.float32,
+                device=device) * (2.0 * math.pi / self.num_heads)
+            grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+            grid_init = (grid_init /
+                         grid_init.abs().max(-1, keepdim=True)[0]).view(
+                             self.num_heads, 1, 1,
+                             2).repeat(1, self.num_levels, self.num_points[i],
+                                       1)
+            grid_init = grid_init.reshape(self.num_heads, self.num_levels,
+                                          self.num_z_anchors[i], -1, 2)
+            for j in range(self.num_points[i] // self.num_z_anchors[i]):
+                grid_init[:, :, :, j, :] *= j + 1
+
+            self.sampling_offsets[i].bias.data = grid_init.view(-1)
+            constant_init(self.attention_weights[i], val=0., bias=0.)
+        xavier_init(self.value_proj, distribution='uniform', bias=0.)
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+        self._is_init = True
+
+    def get_sampling_offsets_and_attention(self, queries):
+        offsets = []
+        attns = []
+        for i, (query, fc, attn) in enumerate(
+                zip(queries, self.sampling_offsets, self.attention_weights)):
+            bs, l, d = query.shape
+
+            offset = fc(query).reshape(bs, l, self.num_heads, self.num_levels,
+                                       self.points_multiplier[i], -1, 2)
+            offset = offset.permute(0, 1, 4, 2, 3, 5, 6).flatten(1, 2)
+            offsets.append(offset)
+
+            attention = attn(query).reshape(bs, l, self.num_heads, -1)
+            attention = attention.softmax(-1)
+            attention = attention.view(bs, l, self.num_heads, self.num_levels,
+                                       self.points_multiplier[i], -1)
+            attention = attention.permute(0, 1, 4, 2, 3, 5).flatten(1, 2)
+            attns.append(attention)
+
+        offsets = torch.cat(offsets, dim=1)
+        attns = torch.cat(attns, dim=1)
+        return offsets, attns
+
+    def reshape_reference_points(self, reference_points):
+        reference_point_list = []
+        for i, reference_point in enumerate(reference_points):
+            bs, l, z_anchors, _ = reference_point.shape
+            reference_point = reference_point.reshape(
+                bs, l, self.points_multiplier[i], -1, 2)
+            reference_point = reference_point.flatten(1, 2)
+            reference_point_list.append(reference_point)
+        return torch.cat(reference_point_list, dim=1)
+
+    def reshape_output(self, output, lens):
+        bs, _, d = output.shape
+        outputs = torch.split(
+            output, [
+                lens[0] * self.points_multiplier[0], lens[1] *
+                self.points_multiplier[1], lens[2] * self.points_multiplier[2]
+            ],
+            dim=1)
+
+        outputs = [
+            o.reshape(bs, -1, self.points_multiplier[i], d).sum(dim=2)
+            for i, o in enumerate(outputs)
+        ]
+        return outputs
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                identity=None,
+                reference_points=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                **kwargs):
+        """Forward Function of MultiScaleDeformAttention.
+
+        Args:
+            query (Tensor): Query of Transformer with shape
+                ( bs, num_query, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(bs, num_key,  embed_dims)`.
+            value (Tensor): The value tensor with shape
+                `(bs, num_key,  embed_dims)`.
+            identity (Tensor): The tensor used for addition, with the
+                same shape as `query`. Default None. If None,
+                `query` will be used.
+            reference_points (Tensor):  The normalized reference
+                points with shape (bs, num_query, num_levels, 2),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            spatial_shapes (Tensor): Spatial shape of features in
+                different levels. With shape (num_levels, 2),
+                last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape ``(num_levels, )`` and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+        Returns:
+             Tensor: forwarded results with shape [bs, num_query, embed_dims].
+        """
+
+        if value is None:
+            value = query
+        if identity is None:
+            identity = query
+
+        if not self.batch_first:
+            # change to (bs, num_query ,embed_dims)
+            query = [q.permute(1, 0, 2) for q in query]
+            value = value.permute(1, 0, 2)
+
+        # bs, num_query, _ = query.shape
+        query_lens = [q.shape[1] for q in query]
+        bs, num_value, _ = value.shape
+        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
+
+        value = self.value_proj(value)
+        value = value.view(bs, num_value, self.num_heads, -1)
+
+        sampling_offsets, attention_weights = \
+            self.get_sampling_offsets_and_attention(query)
+
+        reference_points = self.reshape_reference_points(reference_points)
+
+        if reference_points.shape[-1] == 2:
+            """For each tpv query, it owns `num_Z_anchors` in 3D space that
+            having different heights. After projecting, each tpv query has
+            `num_Z_anchors` reference points in each 2D image. For each
+            referent point, we sample `num_points` sampling points.
+
+            For `num_Z_anchors` reference points,
+            it has overall `num_points * num_Z_anchors` sampling points.
+            """
+            offset_normalizer = torch.stack(
+                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+
+            bs, num_query, num_Z_anchors, xy = reference_points.shape
+            reference_points = reference_points[:, :, None, None, :, None, :]
+            sampling_offsets = sampling_offsets / \
+                offset_normalizer[None, None, None, :, None, :]
+            bs, num_query, num_heads, num_levels, num_all_points, xy = \
+                sampling_offsets.shape
+            sampling_offsets = sampling_offsets.view(
+                bs, num_query, num_heads, num_levels, num_Z_anchors,
+                num_all_points // num_Z_anchors, xy)
+            sampling_locations = reference_points + sampling_offsets
+            bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, \
+                xy = sampling_locations.shape
+            assert num_all_points == num_points * num_Z_anchors
+
+            sampling_locations = sampling_locations.view(
+                bs, num_query, num_heads, num_levels, num_all_points, xy)
+
+            if self.floor_sampling_offset:
+                sampling_locations = sampling_locations - torch.floor(
+                    sampling_locations)
+
+        elif reference_points.shape[-1] == 4:
+            assert False
+        else:
+            raise ValueError(
+                f'Last dim of reference_points must be'
+                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
+
+        if torch.cuda.is_available() and value.is_cuda:
+            output = MultiScaleDeformableAttnFunction.apply(
+                value, spatial_shapes, level_start_index, sampling_locations,
+                attention_weights, self.im2col_step)
+        else:
+            output = multi_scale_deformable_attn_pytorch(
+                value, spatial_shapes, sampling_locations, attention_weights)
+
+        output = self.reshape_output(output, query_lens)
+        if not self.batch_first:
+            output = [o.permute(1, 0, 2) for o in output]
+
+        return output
diff --git a/mmde/projects/TPVFormer/tpvformer/loading.py b/mmde/projects/TPVFormer/tpvformer/loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5c3e74fcef64da01088e31908b547ea71cbd7f5
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/loading.py
@@ -0,0 +1,172 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from typing import Optional, Union
+
+import mmcv
+import numpy as np
+from mmcv.transforms.base import BaseTransform
+from mmengine.fileio import get
+
+from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles
+from mmdet3d.registry import TRANSFORMS
+
+Number = Union[int, float]
+
+
+@TRANSFORMS.register_module()
+class BEVLoadMultiViewImageFromFiles(LoadMultiViewImageFromFiles):
+    """Load multi channel images from a list of separate channel files.
+
+    ``BEVLoadMultiViewImageFromFiles`` adds the following keys for the
+    convenience of view transforms in the forward:
+        - 'cam2lidar'
+        - 'lidar2img'
+
+    Args:
+        to_float32 (bool): Whether to convert the img to float32.
+            Defaults to False.
+        color_type (str): Color type of the file. Defaults to 'unchanged'.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        num_views (int): Number of view in a frame. Defaults to 5.
+        num_ref_frames (int): Number of frame in loading. Defaults to -1.
+        test_mode (bool): Whether is test mode in loading. Defaults to False.
+        set_default_scale (bool): Whether to set default scale.
+            Defaults to True.
+    """
+
+    def transform(self, results: dict) -> Optional[dict]:
+        """Call function to load multi-view image from files.
+
+        Args:
+            results (dict): Result dict containing multi-view image filenames.
+
+        Returns:
+            dict: The result dict containing the multi-view image data.
+            Added keys and values are described below.
+
+                - filename (str): Multi-view image filenames.
+                - img (np.ndarray): Multi-view image arrays.
+                - img_shape (tuple[int]): Shape of multi-view image arrays.
+                - ori_shape (tuple[int]): Shape of original image arrays.
+                - pad_shape (tuple[int]): Shape of padded image arrays.
+                - scale_factor (float): Scale factor.
+                - img_norm_cfg (dict): Normalization configuration of images.
+        """
+        filename, cam2img, lidar2cam, lidar2img = [], [], [], []
+        for _, cam_item in results['images'].items():
+            filename.append(cam_item['img_path'])
+            lidar2cam.append(cam_item['lidar2cam'])
+
+            lidar2cam_array = np.array(cam_item['lidar2cam'])
+            cam2img_array = np.eye(4).astype(np.float64)
+            cam2img_array[:3, :3] = np.array(cam_item['cam2img'])
+            cam2img.append(cam2img_array)
+            lidar2img.append(cam2img_array @ lidar2cam_array)
+
+        results['img_path'] = filename
+        results['cam2img'] = np.stack(cam2img, axis=0)
+        results['lidar2cam'] = np.stack(lidar2cam, axis=0)
+        results['lidar2img'] = np.stack(lidar2img, axis=0)
+
+        results['ori_cam2img'] = copy.deepcopy(results['cam2img'])
+
+        # img is of shape (h, w, c, num_views)
+        # h and w can be different for different views
+        img_bytes = [
+            get(name, backend_args=self.backend_args) for name in filename
+        ]
+        # gbr follow tpvformer
+        imgs = [
+            mmcv.imfrombytes(img_byte, flag=self.color_type)
+            for img_byte in img_bytes
+        ]
+        # handle the image with different shape
+        img_shapes = np.stack([img.shape for img in imgs], axis=0)
+        img_shape_max = np.max(img_shapes, axis=0)
+        img_shape_min = np.min(img_shapes, axis=0)
+        assert img_shape_min[-1] == img_shape_max[-1]
+        if not np.all(img_shape_max == img_shape_min):
+            pad_shape = img_shape_max[:2]
+        else:
+            pad_shape = None
+        if pad_shape is not None:
+            imgs = [
+                mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs
+            ]
+        img = np.stack(imgs, axis=-1)
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        results['filename'] = filename
+        # unravel to list, see `DefaultFormatBundle` in formating.py
+        # which will transpose each image separately and then stack into array
+        results['img'] = [img[..., i] for i in range(img.shape[-1])]
+        results['img_shape'] = img.shape[:2]
+        results['ori_shape'] = img.shape[:2]
+        # Set initial values for default meta_keys
+        results['pad_shape'] = img.shape[:2]
+        if self.set_default_scale:
+            results['scale_factor'] = 1.0
+        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+        results['img_norm_cfg'] = dict(
+            mean=np.zeros(num_channels, dtype=np.float32),
+            std=np.ones(num_channels, dtype=np.float32),
+            to_rgb=False)
+        results['num_views'] = self.num_views
+        results['num_ref_frames'] = self.num_ref_frames
+        return results
+
+
+@TRANSFORMS.register_module()
+class SegLabelMapping(BaseTransform):
+    """Map original semantic class to valid category ids.
+
+    Required Keys:
+
+    - seg_label_mapping (np.ndarray)
+    - pts_semantic_mask (np.ndarray)
+
+    Added Keys:
+
+    - points (np.float32)
+
+    Map valid classes as 0~len(valid_cat_ids)-1 and
+    others as len(valid_cat_ids).
+    """
+
+    def transform(self, results: dict) -> dict:
+        """Call function to map original semantic class to valid category ids.
+
+        Args:
+            results (dict): Result dict containing point semantic masks.
+
+        Returns:
+            dict: The result dict containing the mapped category ids.
+            Updated key and value are described below.
+
+                - pts_semantic_mask (np.ndarray): Mapped semantic masks.
+        """
+        assert 'pts_semantic_mask' in results
+        pts_semantic_mask = results['pts_semantic_mask']
+
+        assert 'seg_label_mapping' in results
+        label_mapping = results['seg_label_mapping']
+        converted_pts_sem_mask = np.vectorize(
+            label_mapping.__getitem__, otypes=[np.uint8])(
+                pts_semantic_mask)
+
+        results['pts_semantic_mask'] = converted_pts_sem_mask
+
+        # 'eval_ann_info' will be passed to evaluator
+        if 'eval_ann_info' in results:
+            assert 'pts_semantic_mask' in results['eval_ann_info']
+            results['eval_ann_info']['pts_semantic_mask'] = \
+                converted_pts_sem_mask
+
+        return results
+
+    def __repr__(self) -> str:
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        return repr_str
diff --git a/mmde/projects/TPVFormer/tpvformer/nuscenes_dataset.py b/mmde/projects/TPVFormer/tpvformer/nuscenes_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..763226bc101a27c7424424f7c7faaf1a5d088bd7
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/nuscenes_dataset.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Callable, List, Union
+
+from mmengine.dataset import BaseDataset
+
+from mmdet3d.registry import DATASETS
+
+
+@DATASETS.register_module()
+class NuScenesSegDataset(BaseDataset):
+    r"""NuScenes Dataset.
+
+    This class serves as the API for experiments on the NuScenes Dataset.
+
+    Please refer to `NuScenes Dataset <https://www.nuscenes.org/download>`_
+    for data downloading.
+
+    Args:
+        data_root (str): Path of dataset root.
+        ann_file (str): Path of annotation file.
+        pipeline (list[dict]): Pipeline used for data processing.
+            Defaults to [].
+        test_mode (bool): Store `True` when building test or val dataset.
+    """
+    METAINFO = {
+        'classes':
+        ('noise', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle',
+         'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck',
+         'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade',
+         'vegetation'),
+        'ignore_index':
+        0,
+        'label_mapping':
+        dict([(1, 0), (5, 0), (7, 0), (8, 0), (10, 0), (11, 0), (13, 0),
+              (19, 0), (20, 0), (0, 0), (29, 0), (31, 0), (9, 1), (14, 2),
+              (15, 3), (16, 3), (17, 4), (18, 5), (21, 6), (2, 7), (3, 7),
+              (4, 7), (6, 7), (12, 8), (22, 9), (23, 10), (24, 11), (25, 12),
+              (26, 13), (27, 14), (28, 15), (30, 16)]),
+        'palette': [
+            [0, 0, 0],  # noise
+            [255, 120, 50],  # barrier              orange
+            [255, 192, 203],  # bicycle              pink
+            [255, 255, 0],  # bus                  yellow
+            [0, 150, 245],  # car                  blue
+            [0, 255, 255],  # construction_vehicle cyan
+            [255, 127, 0],  # motorcycle           dark orange
+            [255, 0, 0],  # pedestrian           red
+            [255, 240, 150],  # traffic_cone         light yellow
+            [135, 60, 0],  # trailer              brown
+            [160, 32, 240],  # truck                purple
+            [255, 0, 255],  # driveable_surface    dark pink
+            [139, 137, 137],  # other_flat           dark red
+            [75, 0, 75],  # sidewalk             dard purple
+            [150, 240, 80],  # terrain              light green
+            [230, 230, 250],  # manmade              white
+            [0, 175, 0],  # vegetation           green
+        ]
+    }
+
+    def __init__(self,
+                 data_root: str,
+                 ann_file: str,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 test_mode: bool = False,
+                 **kwargs) -> None:
+        metainfo = dict(label2cat={
+            i: cat_name
+            for i, cat_name in enumerate(self.METAINFO['classes'])
+        })
+        super().__init__(
+            ann_file=ann_file,
+            data_root=data_root,
+            metainfo=metainfo,
+            pipeline=pipeline,
+            test_mode=test_mode,
+            **kwargs)
+
+    def parse_data_info(self, info: dict) -> Union[List[dict], dict]:
+        """Process the raw data info.
+
+        The only difference with it in `Det3DDataset`
+        is the specific process for `plane`.
+
+        Args:
+            info (dict): Raw info dict.
+
+        Returns:
+            List[dict] or dict: Has `ann_info` in training stage. And
+            all path has been converted to absolute path.
+        """
+
+        data_list = []
+        info['lidar_points']['lidar_path'] = \
+            osp.join(
+                self.data_prefix.get('pts', ''),
+                info['lidar_points']['lidar_path'])
+
+        for cam_id, img_info in info['images'].items():
+            if 'img_path' in img_info:
+                if cam_id in self.data_prefix:
+                    cam_prefix = self.data_prefix[cam_id]
+                else:
+                    cam_prefix = self.data_prefix.get('img', '')
+                img_info['img_path'] = osp.join(cam_prefix,
+                                                img_info['img_path'])
+
+        if 'pts_semantic_mask_path' in info:
+            info['pts_semantic_mask_path'] = \
+                osp.join(self.data_prefix.get('pts_semantic_mask', ''),
+                         info['pts_semantic_mask_path'])
+
+        # only be used in `PointSegClassMapping` in pipeline
+        # to map original semantic class to valid category ids.
+        info['seg_label_mapping'] = self.metainfo['label_mapping']
+
+        # 'eval_ann_info' will be updated in loading transforms
+        if self.test_mode:
+            info['eval_ann_info'] = dict()
+
+        data_list.append(info)
+        return data_list
diff --git a/mmde/projects/TPVFormer/tpvformer/positional_encoding.py b/mmde/projects/TPVFormer/tpvformer/positional_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c5aa89fec8c5ce9a5a8f86d27c9542384cbcf9b
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/positional_encoding.py
@@ -0,0 +1,54 @@
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class TPVFormerPositionalEncoding(BaseModule):
+
+    def __init__(self,
+                 num_feats,
+                 h,
+                 w,
+                 z,
+                 init_cfg=dict(type='Uniform', layer='Embedding')):
+        super().__init__(init_cfg)
+        if not isinstance(num_feats, list):
+            num_feats = [num_feats] * 3
+        self.h_embed = nn.Embedding(h, num_feats[0])
+        self.w_embed = nn.Embedding(w, num_feats[1])
+        self.z_embed = nn.Embedding(z, num_feats[2])
+        self.num_feats = num_feats
+        self.h, self.w, self.z = h, w, z
+
+    def forward(self, bs, device, ignore_axis='z'):
+        if ignore_axis == 'h':
+            h_embed = torch.zeros(
+                1, 1, self.num_feats[0],
+                device=device).repeat(self.w, self.z, 1)  # w, z, d
+            w_embed = self.w_embed(torch.arange(self.w, device=device))
+            w_embed = w_embed.reshape(self.w, 1, -1).repeat(1, self.z, 1)
+            z_embed = self.z_embed(torch.arange(self.z, device=device))
+            z_embed = z_embed.reshape(1, self.z, -1).repeat(self.w, 1, 1)
+        elif ignore_axis == 'w':
+            h_embed = self.h_embed(torch.arange(self.h, device=device))
+            h_embed = h_embed.reshape(1, self.h, -1).repeat(self.z, 1, 1)
+            w_embed = torch.zeros(
+                1, 1, self.num_feats[1],
+                device=device).repeat(self.z, self.h, 1)
+            z_embed = self.z_embed(torch.arange(self.z, device=device))
+            z_embed = z_embed.reshape(self.z, 1, -1).repeat(1, self.h, 1)
+        elif ignore_axis == 'z':
+            h_embed = self.h_embed(torch.arange(self.h, device=device))
+            h_embed = h_embed.reshape(self.h, 1, -1).repeat(1, self.w, 1)
+            w_embed = self.w_embed(torch.arange(self.w, device=device))
+            w_embed = w_embed.reshape(1, self.w, -1).repeat(self.h, 1, 1)
+            z_embed = torch.zeros(
+                1, 1, self.num_feats[2],
+                device=device).repeat(self.h, self.w, 1)
+
+        pos = torch.cat((h_embed, w_embed, z_embed),
+                        dim=-1).flatten(0, 1).unsqueeze(0).repeat(bs, 1, 1)
+        return pos
diff --git a/mmde/projects/TPVFormer/tpvformer/tpvformer.py b/mmde/projects/TPVFormer/tpvformer/tpvformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cd3de3548ffe65564916845a3a137a518789b97
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/tpvformer.py
@@ -0,0 +1,72 @@
+from typing import Optional, Union
+
+from torch import nn
+
+from mmdet3d.models import Base3DSegmentor
+from mmdet3d.registry import MODELS
+from mmdet3d.structures.det3d_data_sample import SampleList
+
+
+@MODELS.register_module()
+class TPVFormer(Base3DSegmentor):
+
+    def __init__(self,
+                 data_preprocessor: Optional[Union[dict, nn.Module]] = None,
+                 backbone=None,
+                 neck=None,
+                 encoder=None,
+                 decode_head=None):
+
+        super().__init__(data_preprocessor=data_preprocessor)
+
+        self.backbone = MODELS.build(backbone)
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+        self.encoder = MODELS.build(encoder)
+        self.decode_head = MODELS.build(decode_head)
+
+    def extract_feat(self, img):
+        """Extract features of images."""
+        B, N, C, H, W = img.size()
+        img = img.view(B * N, C, H, W)
+        img_feats = self.backbone(img)
+
+        if hasattr(self, 'neck'):
+            img_feats = self.neck(img_feats)
+
+        img_feats_reshaped = []
+        for img_feat in img_feats:
+            _, C, H, W = img_feat.size()
+            img_feats_reshaped.append(img_feat.view(B, N, C, H, W))
+        return img_feats_reshaped
+
+    def _forward(self, batch_inputs, batch_data_samples):
+        """Forward training function."""
+        img_feats = self.extract_feat(batch_inputs['imgs'])
+        outs = self.encoder(img_feats, batch_data_samples)
+        outs = self.decode_head(outs, batch_inputs['voxels']['coors'])
+        return outs
+
+    def loss(self, batch_inputs: dict,
+             batch_data_samples: SampleList) -> SampleList:
+        img_feats = self.extract_feat(batch_inputs['imgs'])
+        queries = self.encoder(img_feats, batch_data_samples)
+        losses = self.decode_head.loss(queries, batch_data_samples)
+        return losses
+
+    def predict(self, batch_inputs: dict,
+                batch_data_samples: SampleList) -> SampleList:
+        """Forward predict function."""
+        img_feats = self.extract_feat(batch_inputs['imgs'])
+        tpv_queries = self.encoder(img_feats, batch_data_samples)
+        seg_logits = self.decode_head.predict(tpv_queries, batch_data_samples)
+        seg_preds = [seg_logit.argmax(dim=1) for seg_logit in seg_logits]
+
+        return self.postprocess_result(seg_preds, batch_data_samples)
+
+    def aug_test(self, batch_inputs, batch_data_samples):
+        pass
+
+    def encode_decode(self, batch_inputs: dict,
+                      batch_data_samples: SampleList) -> SampleList:
+        pass
diff --git a/mmde/projects/TPVFormer/tpvformer/tpvformer_encoder.py b/mmde/projects/TPVFormer/tpvformer/tpvformer_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea75df0b12f8e4cd7008cfeccdd20a5f7ca1c648
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/tpvformer_encoder.py
@@ -0,0 +1,340 @@
+import numpy as np
+import torch
+from mmcv.cnn.bricks.transformer import TransformerLayerSequence
+from mmengine.registry import MODELS
+from torch import nn
+from torch.nn.init import normal_
+
+from .cross_view_hybrid_attention import TPVCrossViewHybridAttention
+from .image_cross_attention import TPVMSDeformableAttention3D
+
+
+@MODELS.register_module()
+class TPVFormerEncoder(TransformerLayerSequence):
+
+    def __init__(self,
+                 tpv_h=200,
+                 tpv_w=200,
+                 tpv_z=16,
+                 pc_range=[-51.2, -51.2, -5, 51.2, 51.2, 3],
+                 num_feature_levels=4,
+                 num_cams=6,
+                 embed_dims=256,
+                 num_points_in_pillar=[4, 32, 32],
+                 num_points_in_pillar_cross_view=[32, 32, 32],
+                 num_layers=5,
+                 transformerlayers=None,
+                 positional_encoding=None,
+                 return_intermediate=False):
+        super().__init__(transformerlayers, num_layers)
+
+        self.tpv_h = tpv_h
+        self.tpv_w = tpv_w
+        self.tpv_z = tpv_z
+        self.pc_range = pc_range
+        self.real_w = pc_range[3] - pc_range[0]
+        self.real_h = pc_range[4] - pc_range[1]
+        self.real_z = pc_range[5] - pc_range[2]
+
+        self.level_embeds = nn.Parameter(
+            torch.Tensor(num_feature_levels, embed_dims))
+        self.cams_embeds = nn.Parameter(torch.Tensor(num_cams, embed_dims))
+        self.tpv_embedding_hw = nn.Embedding(tpv_h * tpv_w, embed_dims)
+        self.tpv_embedding_zh = nn.Embedding(tpv_z * tpv_h, embed_dims)
+        self.tpv_embedding_wz = nn.Embedding(tpv_w * tpv_z, embed_dims)
+
+        ref_3d_hw = self.get_reference_points(tpv_h, tpv_w, self.real_z,
+                                              num_points_in_pillar[0])
+        ref_3d_zh = self.get_reference_points(tpv_z, tpv_h, self.real_w,
+                                              num_points_in_pillar[1])
+        ref_3d_zh = ref_3d_zh.permute(3, 0, 1, 2)[[2, 0, 1]]  # change to x,y,z
+        ref_3d_zh = ref_3d_zh.permute(1, 2, 3, 0)
+        ref_3d_wz = self.get_reference_points(tpv_w, tpv_z, self.real_h,
+                                              num_points_in_pillar[2])
+        ref_3d_wz = ref_3d_wz.permute(3, 0, 1, 2)[[1, 2, 0]]  # change to x,y,z
+        ref_3d_wz = ref_3d_wz.permute(1, 2, 3, 0)
+        self.register_buffer('ref_3d_hw', ref_3d_hw)
+        self.register_buffer('ref_3d_zh', ref_3d_zh)
+        self.register_buffer('ref_3d_wz', ref_3d_wz)
+
+        cross_view_ref_points = self.get_cross_view_ref_points(
+            tpv_h, tpv_w, tpv_z, num_points_in_pillar_cross_view)
+        self.register_buffer('cross_view_ref_points', cross_view_ref_points)
+
+        # positional encoding
+        self.positional_encoding = MODELS.build(positional_encoding)
+        self.return_intermediate = return_intermediate
+
+    def init_weights(self):
+        """Initialize the transformer weights."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, TPVMSDeformableAttention3D) or isinstance(
+                    m, TPVCrossViewHybridAttention):
+                m.init_weights()
+        normal_(self.level_embeds)
+        normal_(self.cams_embeds)
+
+    @staticmethod
+    def get_cross_view_ref_points(tpv_h, tpv_w, tpv_z, num_points_in_pillar):
+        # ref points generating target: (#query)hw+zh+wz, (#level)3, #p, 2
+        # generate points for hw and level 1
+        h_ranges = torch.linspace(0.5, tpv_h - 0.5, tpv_h) / tpv_h
+        w_ranges = torch.linspace(0.5, tpv_w - 0.5, tpv_w) / tpv_w
+        h_ranges = h_ranges.unsqueeze(-1).expand(-1, tpv_w).flatten()
+        w_ranges = w_ranges.unsqueeze(0).expand(tpv_h, -1).flatten()
+        hw_hw = torch.stack([w_ranges, h_ranges], dim=-1)  # hw, 2
+        hw_hw = hw_hw.unsqueeze(1).expand(-1, num_points_in_pillar[2],
+                                          -1)  # hw, #p, 2
+        # generate points for hw and level 2
+        z_ranges = torch.linspace(0.5, tpv_z - 0.5,
+                                  num_points_in_pillar[2]) / tpv_z  # #p
+        z_ranges = z_ranges.unsqueeze(0).expand(tpv_h * tpv_w, -1)  # hw, #p
+        h_ranges = torch.linspace(0.5, tpv_h - 0.5, tpv_h) / tpv_h
+        h_ranges = h_ranges.reshape(-1, 1, 1).expand(
+            -1, tpv_w, num_points_in_pillar[2]).flatten(0, 1)
+        hw_zh = torch.stack([h_ranges, z_ranges], dim=-1)  # hw, #p, 2
+        # generate points for hw and level 3
+        z_ranges = torch.linspace(0.5, tpv_z - 0.5,
+                                  num_points_in_pillar[2]) / tpv_z  # #p
+        z_ranges = z_ranges.unsqueeze(0).expand(tpv_h * tpv_w, -1)  # hw, #p
+        w_ranges = torch.linspace(0.5, tpv_w - 0.5, tpv_w) / tpv_w
+        w_ranges = w_ranges.reshape(1, -1, 1).expand(
+            tpv_h, -1, num_points_in_pillar[2]).flatten(0, 1)
+        hw_wz = torch.stack([z_ranges, w_ranges], dim=-1)  # hw, #p, 2
+
+        # generate points for zh and level 1
+        w_ranges = torch.linspace(0.5, tpv_w - 0.5,
+                                  num_points_in_pillar[1]) / tpv_w
+        w_ranges = w_ranges.unsqueeze(0).expand(tpv_z * tpv_h, -1)
+        h_ranges = torch.linspace(0.5, tpv_h - 0.5, tpv_h) / tpv_h
+        h_ranges = h_ranges.reshape(1, -1, 1).expand(
+            tpv_z, -1, num_points_in_pillar[1]).flatten(0, 1)
+        zh_hw = torch.stack([w_ranges, h_ranges], dim=-1)
+        # generate points for zh and level 2
+        z_ranges = torch.linspace(0.5, tpv_z - 0.5, tpv_z) / tpv_z
+        z_ranges = z_ranges.reshape(-1, 1, 1).expand(
+            -1, tpv_h, num_points_in_pillar[1]).flatten(0, 1)
+        h_ranges = torch.linspace(0.5, tpv_h - 0.5, tpv_h) / tpv_h
+        h_ranges = h_ranges.reshape(1, -1, 1).expand(
+            tpv_z, -1, num_points_in_pillar[1]).flatten(0, 1)
+        zh_zh = torch.stack([h_ranges, z_ranges], dim=-1)  # zh, #p, 2
+        # generate points for zh and level 3
+        w_ranges = torch.linspace(0.5, tpv_w - 0.5,
+                                  num_points_in_pillar[1]) / tpv_w
+        w_ranges = w_ranges.unsqueeze(0).expand(tpv_z * tpv_h, -1)
+        z_ranges = torch.linspace(0.5, tpv_z - 0.5, tpv_z) / tpv_z
+        z_ranges = z_ranges.reshape(-1, 1, 1).expand(
+            -1, tpv_h, num_points_in_pillar[1]).flatten(0, 1)
+        zh_wz = torch.stack([z_ranges, w_ranges], dim=-1)
+
+        # generate points for wz and level 1
+        h_ranges = torch.linspace(0.5, tpv_h - 0.5,
+                                  num_points_in_pillar[0]) / tpv_h
+        h_ranges = h_ranges.unsqueeze(0).expand(tpv_w * tpv_z, -1)
+        w_ranges = torch.linspace(0.5, tpv_w - 0.5, tpv_w) / tpv_w
+        w_ranges = w_ranges.reshape(-1, 1, 1).expand(
+            -1, tpv_z, num_points_in_pillar[0]).flatten(0, 1)
+        wz_hw = torch.stack([w_ranges, h_ranges], dim=-1)
+        # generate points for wz and level 2
+        h_ranges = torch.linspace(0.5, tpv_h - 0.5,
+                                  num_points_in_pillar[0]) / tpv_h
+        h_ranges = h_ranges.unsqueeze(0).expand(tpv_w * tpv_z, -1)
+        z_ranges = torch.linspace(0.5, tpv_z - 0.5, tpv_z) / tpv_z
+        z_ranges = z_ranges.reshape(1, -1, 1).expand(
+            tpv_w, -1, num_points_in_pillar[0]).flatten(0, 1)
+        wz_zh = torch.stack([h_ranges, z_ranges], dim=-1)
+        # generate points for wz and level 3
+        w_ranges = torch.linspace(0.5, tpv_w - 0.5, tpv_w) / tpv_w
+        w_ranges = w_ranges.reshape(-1, 1, 1).expand(
+            -1, tpv_z, num_points_in_pillar[0]).flatten(0, 1)
+        z_ranges = torch.linspace(0.5, tpv_z - 0.5, tpv_z) / tpv_z
+        z_ranges = z_ranges.reshape(1, -1, 1).expand(
+            tpv_w, -1, num_points_in_pillar[0]).flatten(0, 1)
+        wz_wz = torch.stack([z_ranges, w_ranges], dim=-1)
+
+        reference_points = torch.cat([
+            torch.stack([hw_hw, hw_zh, hw_wz], dim=1),
+            torch.stack([zh_hw, zh_zh, zh_wz], dim=1),
+            torch.stack([wz_hw, wz_zh, wz_wz], dim=1)
+        ],
+                                     dim=0)  # hw+zh+wz, 3, #p, 2
+
+        return reference_points
+
+    @staticmethod
+    def get_reference_points(H,
+                             W,
+                             Z=8,
+                             num_points_in_pillar=4,
+                             dim='3d',
+                             bs=1,
+                             device='cuda',
+                             dtype=torch.float):
+        """Get the reference points used in SCA and TSA.
+
+        Args:
+            H, W: spatial shape of tpv.
+            Z: height of pillar.
+            device (obj:`device`): The device where
+                reference_points should be.
+        Returns:
+            Tensor: reference points used in decoder, has \
+                shape (bs, num_keys, num_levels, 2).
+        """
+
+        # reference points in 3D space, used in spatial cross-attention (SCA)
+        zs = torch.linspace(
+            0.5, Z - 0.5, num_points_in_pillar,
+            dtype=dtype, device=device).view(-1, 1, 1).expand(
+                num_points_in_pillar, H, W) / Z
+        xs = torch.linspace(
+            0.5, W - 0.5, W, dtype=dtype, device=device).view(1, 1, -1).expand(
+                num_points_in_pillar, H, W) / W
+        ys = torch.linspace(
+            0.5, H - 0.5, H, dtype=dtype, device=device).view(1, -1, 1).expand(
+                num_points_in_pillar, H, W) / H
+        ref_3d = torch.stack((xs, ys, zs), -1)
+        ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1)
+        ref_3d = ref_3d[None].repeat(bs, 1, 1, 1)
+        return ref_3d
+
+    def point_sampling(self, reference_points, pc_range, batch_data_smaples):
+
+        lidar2img = []
+        for data_sample in batch_data_smaples:
+            lidar2img.append(data_sample.lidar2img)
+        lidar2img = np.asarray(lidar2img)
+        lidar2img = reference_points.new_tensor(lidar2img)  # (B, N, 4, 4)
+        reference_points = reference_points.clone()
+
+        reference_points[..., 0:1] = reference_points[..., 0:1] * \
+            (pc_range[3] - pc_range[0]) + pc_range[0]
+        reference_points[..., 1:2] = reference_points[..., 1:2] * \
+            (pc_range[4] - pc_range[1]) + pc_range[1]
+        reference_points[..., 2:3] = reference_points[..., 2:3] * \
+            (pc_range[5] - pc_range[2]) + pc_range[2]
+
+        reference_points = torch.cat(
+            (reference_points, torch.ones_like(reference_points[..., :1])), -1)
+
+        reference_points = reference_points.permute(1, 0, 2, 3)
+        D, B, num_query = reference_points.size()[:3]
+        num_cam = lidar2img.size(1)
+
+        reference_points = reference_points.view(D, B, 1, num_query, 4).repeat(
+            1, 1, num_cam, 1, 1).unsqueeze(-1)
+
+        lidar2img = lidar2img.view(1, B, num_cam, 1, 4,
+                                   4).repeat(D, 1, 1, num_query, 1, 1)
+
+        reference_points_cam = torch.matmul(
+            lidar2img.to(torch.float32),
+            reference_points.to(torch.float32)).squeeze(-1)
+        eps = 1e-5
+
+        tpv_mask = (reference_points_cam[..., 2:3] > eps)
+        reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum(
+            reference_points_cam[..., 2:3],
+            torch.ones_like(reference_points_cam[..., 2:3]) * eps)
+
+        reference_points_cam[..., 0] /= data_sample.batch_input_shape[1]
+        reference_points_cam[..., 1] /= data_sample.batch_input_shape[0]
+
+        tpv_mask = (
+            tpv_mask & (reference_points_cam[..., 1:2] > 0.0)
+            & (reference_points_cam[..., 1:2] < 1.0)
+            & (reference_points_cam[..., 0:1] < 1.0)
+            & (reference_points_cam[..., 0:1] > 0.0))
+
+        tpv_mask = torch.nan_to_num(tpv_mask)
+
+        reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4)
+        tpv_mask = tpv_mask.permute(2, 1, 3, 0, 4).squeeze(-1)
+
+        return reference_points_cam, tpv_mask
+
+    def forward(self, mlvl_feats, batch_data_samples):
+        """Forward function.
+
+        Args:
+            mlvl_feats (tuple[Tensor]): Features from the upstream
+                network, each is a 5D-tensor with shape
+                (B, N, C, H, W).
+        """
+        bs = mlvl_feats[0].shape[0]
+        dtype = mlvl_feats[0].dtype
+        device = mlvl_feats[0].device
+
+        # tpv queries and pos embeds
+        tpv_queries_hw = self.tpv_embedding_hw.weight.to(dtype)
+        tpv_queries_zh = self.tpv_embedding_zh.weight.to(dtype)
+        tpv_queries_wz = self.tpv_embedding_wz.weight.to(dtype)
+        tpv_queries_hw = tpv_queries_hw.unsqueeze(0).repeat(bs, 1, 1)
+        tpv_queries_zh = tpv_queries_zh.unsqueeze(0).repeat(bs, 1, 1)
+        tpv_queries_wz = tpv_queries_wz.unsqueeze(0).repeat(bs, 1, 1)
+        tpv_query = [tpv_queries_hw, tpv_queries_zh, tpv_queries_wz]
+
+        tpv_pos_hw = self.positional_encoding(bs, device, 'z')
+        tpv_pos_zh = self.positional_encoding(bs, device, 'w')
+        tpv_pos_wz = self.positional_encoding(bs, device, 'h')
+        tpv_pos = [tpv_pos_hw, tpv_pos_zh, tpv_pos_wz]
+
+        # flatten image features of different scales
+        feat_flatten = []
+        spatial_shapes = []
+        for lvl, feat in enumerate(mlvl_feats):
+            bs, num_cam, c, h, w = feat.shape
+            spatial_shape = (h, w)
+            feat = feat.flatten(3).permute(1, 0, 3, 2)  # num_cam, bs, hw, c
+            feat = feat + self.cams_embeds[:, None, None, :].to(dtype)
+            feat = feat + self.level_embeds[None, None,
+                                            lvl:lvl + 1, :].to(dtype)
+            spatial_shapes.append(spatial_shape)
+            feat_flatten.append(feat)
+
+        feat_flatten = torch.cat(feat_flatten, 2)  # num_cam, bs, hw++, c
+        spatial_shapes = torch.as_tensor(
+            spatial_shapes, dtype=torch.long, device=device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros(
+            (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        feat_flatten = feat_flatten.permute(
+            0, 2, 1, 3)  # (num_cam, H*W, bs, embed_dims)
+
+        reference_points_cams, tpv_masks = [], []
+        ref_3ds = [self.ref_3d_hw, self.ref_3d_zh, self.ref_3d_wz]
+        for ref_3d in ref_3ds:
+            reference_points_cam, tpv_mask = self.point_sampling(
+                ref_3d, self.pc_range,
+                batch_data_samples)  # num_cam, bs, hw++, #p, 2
+            reference_points_cams.append(reference_points_cam)
+            tpv_masks.append(tpv_mask)
+
+        ref_cross_view = self.cross_view_ref_points.clone().unsqueeze(
+            0).expand(bs, -1, -1, -1, -1)
+
+        intermediate = []
+        for layer in self.layers:
+            output = layer(
+                tpv_query,
+                feat_flatten,
+                feat_flatten,
+                tpv_pos=tpv_pos,
+                ref_2d=ref_cross_view,
+                tpv_h=self.tpv_h,
+                tpv_w=self.tpv_w,
+                tpv_z=self.tpv_z,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                reference_points_cams=reference_points_cams,
+                tpv_masks=tpv_masks)
+            tpv_query = output
+            if self.return_intermediate:
+                intermediate.append(output)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate)
+
+        return output
diff --git a/mmde/projects/TPVFormer/tpvformer/tpvformer_head.py b/mmde/projects/TPVFormer/tpvformer/tpvformer_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c477f16058b0ab9d626eaaa80e0f2bc17b9b08e
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/tpvformer_head.py
@@ -0,0 +1,298 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class TPVFormerDecoder(BaseModule):
+
+    def __init__(self,
+                 tpv_h,
+                 tpv_w,
+                 tpv_z,
+                 num_classes=20,
+                 in_dims=64,
+                 hidden_dims=128,
+                 out_dims=None,
+                 scale_h=2,
+                 scale_w=2,
+                 scale_z=2,
+                 ignore_index=0,
+                 loss_lovasz=None,
+                 loss_ce=None,
+                 lovasz_input='points',
+                 ce_input='voxel'):
+        super().__init__()
+        self.tpv_h = tpv_h
+        self.tpv_w = tpv_w
+        self.tpv_z = tpv_z
+        self.scale_h = scale_h
+        self.scale_w = scale_w
+        self.scale_z = scale_z
+
+        out_dims = in_dims if out_dims is None else out_dims
+        self.in_dims = in_dims
+        self.decoder = nn.Sequential(
+            nn.Linear(in_dims, hidden_dims), nn.Softplus(),
+            nn.Linear(hidden_dims, out_dims))
+
+        self.classifier = nn.Linear(out_dims, num_classes)
+        self.loss_lovasz = MODELS.build(loss_lovasz)
+        self.loss_ce = MODELS.build(loss_ce)
+        self.ignore_index = ignore_index
+        self.lovasz_input = lovasz_input
+        self.ce_input = ce_input
+
+    def forward(self, tpv_list, points=None):
+        """
+        tpv_list[0]: bs, h*w, c
+        tpv_list[1]: bs, z*h, c
+        tpv_list[2]: bs, w*z, c
+        """
+        tpv_hw, tpv_zh, tpv_wz = tpv_list[0], tpv_list[1], tpv_list[2]
+        bs, _, c = tpv_hw.shape
+        tpv_hw = tpv_hw.permute(0, 2, 1).reshape(bs, c, self.tpv_h, self.tpv_w)
+        tpv_zh = tpv_zh.permute(0, 2, 1).reshape(bs, c, self.tpv_z, self.tpv_h)
+        tpv_wz = tpv_wz.permute(0, 2, 1).reshape(bs, c, self.tpv_w, self.tpv_z)
+
+        if self.scale_h != 1 or self.scale_w != 1:
+            tpv_hw = F.interpolate(
+                tpv_hw,
+                size=(self.tpv_h * self.scale_h, self.tpv_w * self.scale_w),
+                mode='bilinear')
+        if self.scale_z != 1 or self.scale_h != 1:
+            tpv_zh = F.interpolate(
+                tpv_zh,
+                size=(self.tpv_z * self.scale_z, self.tpv_h * self.scale_h),
+                mode='bilinear')
+        if self.scale_w != 1 or self.scale_z != 1:
+            tpv_wz = F.interpolate(
+                tpv_wz,
+                size=(self.tpv_w * self.scale_w, self.tpv_z * self.scale_z),
+                mode='bilinear')
+
+        if points is not None:
+            # points: bs, n, 3
+            _, n, _ = points.shape
+            points = points.reshape(bs, 1, n, 3).float()
+            points[...,
+                   0] = points[..., 0] / (self.tpv_w * self.scale_w) * 2 - 1
+            points[...,
+                   1] = points[..., 1] / (self.tpv_h * self.scale_h) * 2 - 1
+            points[...,
+                   2] = points[..., 2] / (self.tpv_z * self.scale_z) * 2 - 1
+            sample_loc = points[:, :, :, [0, 1]]
+            tpv_hw_pts = F.grid_sample(tpv_hw,
+                                       sample_loc).squeeze(2)  # bs, c, n
+            sample_loc = points[:, :, :, [1, 2]]
+            tpv_zh_pts = F.grid_sample(tpv_zh, sample_loc).squeeze(2)
+            sample_loc = points[:, :, :, [2, 0]]
+            tpv_wz_pts = F.grid_sample(tpv_wz, sample_loc).squeeze(2)
+
+            tpv_hw_vox = tpv_hw.unsqueeze(-1).permute(0, 1, 3, 2, 4).expand(
+                -1, -1, -1, -1, self.scale_z * self.tpv_z)
+            tpv_zh_vox = tpv_zh.unsqueeze(-1).permute(0, 1, 4, 3, 2).expand(
+                -1, -1, self.scale_w * self.tpv_w, -1, -1)
+            tpv_wz_vox = tpv_wz.unsqueeze(-1).permute(0, 1, 2, 4, 3).expand(
+                -1, -1, -1, self.scale_h * self.tpv_h, -1)
+
+            fused_vox = (tpv_hw_vox + tpv_zh_vox + tpv_wz_vox).flatten(2)
+            fused_pts = tpv_hw_pts + tpv_zh_pts + tpv_wz_pts
+            fused = torch.cat([fused_vox, fused_pts], dim=-1)  # bs, c, whz+n
+
+            fused = fused.permute(0, 2, 1)
+            if self.use_checkpoint:
+                fused = torch.utils.checkpoint.checkpoint(self.decoder, fused)
+                logits = torch.utils.checkpoint.checkpoint(
+                    self.classifier, fused)
+            else:
+                fused = self.decoder(fused)
+                logits = self.classifier(fused)
+            logits = logits.permute(0, 2, 1)
+            logits_vox = logits[:, :, :(-n)].reshape(bs, self.classes,
+                                                     self.scale_w * self.tpv_w,
+                                                     self.scale_h * self.tpv_h,
+                                                     self.scale_z * self.tpv_z)
+            logits_pts = logits[:, :, (-n):].reshape(bs, self.classes, n, 1, 1)
+            return logits_vox, logits_pts
+
+        else:
+            tpv_hw = tpv_hw.unsqueeze(-1).permute(0, 1, 3, 2, 4).expand(
+                -1, -1, -1, -1, self.scale_z * self.tpv_z)
+            tpv_zh = tpv_zh.unsqueeze(-1).permute(0, 1, 4, 3, 2).expand(
+                -1, -1, self.scale_w * self.tpv_w, -1, -1)
+            tpv_wz = tpv_wz.unsqueeze(-1).permute(0, 1, 2, 4, 3).expand(
+                -1, -1, -1, self.scale_h * self.tpv_h, -1)
+
+            fused = tpv_hw + tpv_zh + tpv_wz
+            fused = fused.permute(0, 2, 3, 4, 1)
+            if self.use_checkpoint:
+                fused = torch.utils.checkpoint.checkpoint(self.decoder, fused)
+                logits = torch.utils.checkpoint.checkpoint(
+                    self.classifier, fused)
+            else:
+                fused = self.decoder(fused)
+                logits = self.classifier(fused)
+            logits = logits.permute(0, 4, 1, 2, 3)
+
+            return logits
+
+    def predict(self, tpv_list, batch_data_samples):
+        """
+        tpv_list[0]: bs, h*w, c
+        tpv_list[1]: bs, z*h, c
+        tpv_list[2]: bs, w*z, c
+        """
+        tpv_hw, tpv_zh, tpv_wz = tpv_list
+        bs, _, c = tpv_hw.shape
+        tpv_hw = tpv_hw.permute(0, 2, 1).reshape(bs, c, self.tpv_h, self.tpv_w)
+        tpv_zh = tpv_zh.permute(0, 2, 1).reshape(bs, c, self.tpv_z, self.tpv_h)
+        tpv_wz = tpv_wz.permute(0, 2, 1).reshape(bs, c, self.tpv_w, self.tpv_z)
+
+        if self.scale_h != 1 or self.scale_w != 1:
+            tpv_hw = F.interpolate(
+                tpv_hw,
+                size=(self.tpv_h * self.scale_h, self.tpv_w * self.scale_w),
+                mode='bilinear')
+        if self.scale_z != 1 or self.scale_h != 1:
+            tpv_zh = F.interpolate(
+                tpv_zh,
+                size=(self.tpv_z * self.scale_z, self.tpv_h * self.scale_h),
+                mode='bilinear')
+        if self.scale_w != 1 or self.scale_z != 1:
+            tpv_wz = F.interpolate(
+                tpv_wz,
+                size=(self.tpv_w * self.scale_w, self.tpv_z * self.scale_z),
+                mode='bilinear')
+
+        logits = []
+        for i, data_sample in enumerate(batch_data_samples):
+            point_coors = data_sample.point_coors.reshape(1, 1, -1, 3).float()
+            point_coors[
+                ...,
+                0] = point_coors[..., 0] / (self.tpv_w * self.scale_w) * 2 - 1
+            point_coors[
+                ...,
+                1] = point_coors[..., 1] / (self.tpv_h * self.scale_h) * 2 - 1
+            point_coors[
+                ...,
+                2] = point_coors[..., 2] / (self.tpv_z * self.scale_z) * 2 - 1
+            sample_loc = point_coors[..., [0, 1]]
+            tpv_hw_pts = F.grid_sample(
+                tpv_hw[i:i + 1], sample_loc, align_corners=False)
+            sample_loc = point_coors[..., [1, 2]]
+            tpv_zh_pts = F.grid_sample(
+                tpv_zh[i:i + 1], sample_loc, align_corners=False)
+            sample_loc = point_coors[..., [2, 0]]
+            tpv_wz_pts = F.grid_sample(
+                tpv_wz[i:i + 1], sample_loc, align_corners=False)
+
+            fused_pts = tpv_hw_pts + tpv_zh_pts + tpv_wz_pts
+
+            fused_pts = fused_pts.squeeze(0).squeeze(1).transpose(0, 1)
+            fused_pts = self.decoder(fused_pts)
+            logit = self.classifier(fused_pts)
+            logits.append(logit)
+
+        return logits
+
+    def loss(self, tpv_list, batch_data_samples):
+        tpv_hw, tpv_zh, tpv_wz = tpv_list
+        bs, _, c = tpv_hw.shape
+        tpv_hw = tpv_hw.permute(0, 2, 1).reshape(bs, c, self.tpv_h, self.tpv_w)
+        tpv_zh = tpv_zh.permute(0, 2, 1).reshape(bs, c, self.tpv_z, self.tpv_h)
+        tpv_wz = tpv_wz.permute(0, 2, 1).reshape(bs, c, self.tpv_w, self.tpv_z)
+
+        if self.scale_h != 1 or self.scale_w != 1:
+            tpv_hw = F.interpolate(
+                tpv_hw,
+                size=(self.tpv_h * self.scale_h, self.tpv_w * self.scale_w),
+                mode='bilinear')
+        if self.scale_z != 1 or self.scale_h != 1:
+            tpv_zh = F.interpolate(
+                tpv_zh,
+                size=(self.tpv_z * self.scale_z, self.tpv_h * self.scale_h),
+                mode='bilinear')
+        if self.scale_w != 1 or self.scale_z != 1:
+            tpv_wz = F.interpolate(
+                tpv_wz,
+                size=(self.tpv_w * self.scale_w, self.tpv_z * self.scale_z),
+                mode='bilinear')
+
+        batch_pts, batch_vox = [], []
+        for i, data_sample in enumerate(batch_data_samples):
+            point_coors = data_sample.point_coors.reshape(1, 1, -1, 3).float()
+            point_coors[
+                ...,
+                0] = point_coors[..., 0] / (self.tpv_w * self.scale_w) * 2 - 1
+            point_coors[
+                ...,
+                1] = point_coors[..., 1] / (self.tpv_h * self.scale_h) * 2 - 1
+            point_coors[
+                ...,
+                2] = point_coors[..., 2] / (self.tpv_z * self.scale_z) * 2 - 1
+            sample_loc = point_coors[..., [0, 1]]
+            tpv_hw_pts = F.grid_sample(
+                tpv_hw[i:i + 1], sample_loc, align_corners=False)
+            sample_loc = point_coors[..., [1, 2]]
+            tpv_zh_pts = F.grid_sample(
+                tpv_zh[i:i + 1], sample_loc, align_corners=False)
+            sample_loc = point_coors[..., [2, 0]]
+            tpv_wz_pts = F.grid_sample(
+                tpv_wz[i:i + 1], sample_loc, align_corners=False)
+            fused_pts = (tpv_hw_pts + tpv_zh_pts +
+                         tpv_wz_pts).squeeze(0).squeeze(1)
+            batch_pts.append(fused_pts)
+
+            tpv_hw_vox = tpv_hw.unsqueeze(-1).permute(0, 1, 3, 2, 4).expand(
+                -1, -1, -1, -1, self.scale_z * self.tpv_z)
+            tpv_zh_vox = tpv_zh.unsqueeze(-1).permute(0, 1, 4, 3, 2).expand(
+                -1, -1, self.scale_w * self.tpv_w, -1, -1)
+            tpv_wz_vox = tpv_wz.unsqueeze(-1).permute(0, 1, 2, 4, 3).expand(
+                -1, -1, -1, self.scale_h * self.tpv_h, -1)
+            fused_vox = tpv_hw_vox + tpv_zh_vox + tpv_wz_vox
+            voxel_coors = data_sample.voxel_coors.long()
+            fused_vox = fused_vox[:, :, voxel_coors[:, 0], voxel_coors[:, 1],
+                                  voxel_coors[:, 2]]
+            fused_vox = fused_vox.squeeze(0)
+            batch_vox.append(fused_vox)
+        batch_pts = torch.cat(batch_pts, dim=1)
+        batch_vox = torch.cat(batch_vox, dim=1)
+        num_points = batch_pts.shape[1]
+
+        logits = self.decoder(
+            torch.cat([batch_pts, batch_vox], dim=1).transpose(0, 1))
+        logits = self.classifier(logits)
+        pts_logits = logits[:num_points, :]
+        vox_logits = logits[num_points:, :]
+
+        pts_seg_label = torch.cat([
+            data_sample.gt_pts_seg.pts_semantic_mask
+            for data_sample in batch_data_samples
+        ])
+        voxel_seg_label = torch.cat([
+            data_sample.gt_pts_seg.voxel_semantic_mask
+            for data_sample in batch_data_samples
+        ])
+        if self.ce_input == 'voxel':
+            ce_input = vox_logits
+            ce_label = voxel_seg_label
+        else:
+            ce_input = pts_logits
+            ce_label = pts_seg_label
+        if self.lovasz_input == 'voxel':
+            lovasz_input = vox_logits
+            lovasz_label = voxel_seg_label
+        else:
+            lovasz_input = pts_logits
+            lovasz_label = pts_seg_label
+
+        loss = dict()
+        loss['loss_ce'] = self.loss_ce(
+            ce_input, ce_label, ignore_index=self.ignore_index)
+        loss['loss_lovasz'] = self.loss_lovasz(
+            lovasz_input, lovasz_label, ignore_index=self.ignore_index)
+        return loss
diff --git a/mmde/projects/TPVFormer/tpvformer/tpvformer_layer.py b/mmde/projects/TPVFormer/tpvformer/tpvformer_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..03569fdd12d8cb435a845badb7c1668992e10ee8
--- /dev/null
+++ b/mmde/projects/TPVFormer/tpvformer/tpvformer_layer.py
@@ -0,0 +1,223 @@
+import copy
+import warnings
+
+import torch
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import (build_attention,
+                                         build_feedforward_network)
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, ModuleList
+from mmengine.registry import MODELS
+
+
+@MODELS.register_module()
+class TPVFormerLayer(BaseModule):
+    """Base `TPVFormerLayer` for vision transformer.
+
+    It can be built from `mmcv.ConfigDict` and support more flexible
+    customization, for example, using any number of `FFN or LN ` and
+    use different kinds of `attention` by specifying a list of `ConfigDict`
+    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
+    when you specifying `norm` as the first element of `operation_order`.
+    More details about the `prenorm`: `On Layer Normalization in the
+    Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .
+    Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+            Configs for `self_attention` or `cross_attention` modules,
+            The order of the configs in the list should be consistent with
+            corresponding attentions in operation_order.
+            If it is a dict, all of the attention modules in operation_order
+            will be built with this config. Default: None.
+        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+            Configs for FFN, The order of the configs in the list should be
+            consistent with corresponding ffn in operation_order.
+            If it is a dict, all of the attention modules in operation_order
+            will be built with this config.
+        operation_order (tuple[str]): The execution order of operation
+            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
+            Support `prenorm` when you specifying first element as `norm`.
+            Default: None.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        batch_first (bool): Key, Query and Value are shape
+            of (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to False.
+    """
+
+    def __init__(self,
+                 attn_cfgs=None,
+                 ffn_cfgs=dict(
+                     type='FFN',
+                     feedforward_channels=1024,
+                     num_fcs=2,
+                     ffn_drop=0.,
+                     act_cfg=dict(type='ReLU', inplace=True),
+                 ),
+                 operation_order=None,
+                 norm_cfg=dict(type='LN'),
+                 init_cfg=None,
+                 batch_first=True,
+                 **kwargs):
+        deprecated_args = dict(
+            feedforward_channels='feedforward_channels',
+            ffn_dropout='ffn_drop',
+            ffn_num_fcs='num_fcs')
+        for ori_name, new_name in deprecated_args.items():
+            if ori_name in kwargs:
+                warnings.warn(
+                    f'The arguments `{ori_name}` in BaseTransformerLayer '
+                    f'has been deprecated, now you should set `{new_name}` '
+                    f'and other FFN related arguments '
+                    f'to a dict named `ffn_cfgs`. ')
+                ffn_cfgs[new_name] = kwargs[ori_name]
+
+        super().__init__(init_cfg)
+
+        self.batch_first = batch_first
+
+        num_attn = operation_order.count('self_attn') + operation_order.count(
+            'cross_attn')
+        if isinstance(attn_cfgs, dict):
+            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
+        else:
+            assert num_attn == len(attn_cfgs), f'The length ' \
+                f'of attn_cfg {num_attn} is ' \
+                f'not consistent with the number of attention' \
+                f'in operation_order {operation_order}.'
+
+        self.num_attn = num_attn
+        self.operation_order = operation_order
+        self.norm_cfg = norm_cfg
+        self.pre_norm = operation_order[0] == 'norm'
+        self.attentions = ModuleList()
+
+        index = 0
+        for operation_name in operation_order:
+            if operation_name in ['self_attn', 'cross_attn']:
+                if 'batch_first' in attn_cfgs[index]:
+                    assert self.batch_first == attn_cfgs[index]['batch_first']
+                else:
+                    attn_cfgs[index]['batch_first'] = self.batch_first
+                attention = build_attention(attn_cfgs[index])
+                # Some custom attentions used as `self_attn`
+                # or `cross_attn` can have different behavior.
+                attention.operation_name = operation_name
+                self.attentions.append(attention)
+                index += 1
+
+        self.embed_dims = self.attentions[0].embed_dims
+
+        self.ffns = ModuleList()
+        num_ffns = operation_order.count('ffn')
+        if isinstance(ffn_cfgs, dict):
+            ffn_cfgs = ConfigDict(ffn_cfgs)
+        if isinstance(ffn_cfgs, dict):
+            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
+        assert len(ffn_cfgs) == num_ffns
+        for ffn_index in range(num_ffns):
+            if 'embed_dims' not in ffn_cfgs[ffn_index]:
+                ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
+            else:
+                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
+
+            self.ffns.append(build_feedforward_network(ffn_cfgs[ffn_index]))
+
+        self.norms = ModuleList()
+        num_norms = operation_order.count('norm')
+        for _ in range(num_norms):
+            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                tpv_pos=None,
+                ref_2d=None,
+                tpv_h=None,
+                tpv_w=None,
+                tpv_z=None,
+                reference_points_cams=None,
+                tpv_masks=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                **kwargs):
+        """
+        **kwargs contains some specific arguments of attentions.
+
+        Args:
+            query (Tensor): The input query with shape
+                [num_queries, bs, embed_dims] if
+                self.batch_first is False, else
+                [bs, num_queries embed_dims].
+            key (Tensor): The key tensor with shape [num_keys, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_keys, embed_dims] .
+            value (Tensor): The value tensor with same shape as `key`.
+            tpv_pos (Tensor): The positional encoding for self attn.
+        Returns:
+            Tensor: forwarded results with shape
+                [[bs, num_queries, embed_dims] * 3] for 3 tpv planes.
+        """
+
+        norm_index = 0
+        attn_index = 0
+        ffn_index = 0
+        if self.operation_order[0] == 'cross_attn':
+            query = torch.cat(query, dim=1)
+        identity = query
+
+        for layer in self.operation_order:
+            # cross view hybrid-attention
+            if layer == 'self_attn':
+                ss = torch.tensor(
+                    [[tpv_h, tpv_w], [tpv_z, tpv_h], [tpv_w, tpv_z]],
+                    device=query[0].device)
+                lsi = torch.tensor(
+                    [0, tpv_h * tpv_w, tpv_h * tpv_w + tpv_z * tpv_h],
+                    device=query[0].device)
+
+                if not isinstance(query, (list, tuple)):
+                    query = torch.split(
+                        query, [tpv_h * tpv_w, tpv_z * tpv_h, tpv_w * tpv_z],
+                        dim=1)
+
+                query = self.attentions[attn_index](
+                    query,
+                    identity if self.pre_norm else None,
+                    query_pos=tpv_pos,
+                    reference_points=ref_2d,
+                    spatial_shapes=ss,
+                    level_start_index=lsi,
+                    **kwargs)
+                attn_index += 1
+                query = torch.cat(query, dim=1)
+                identity = query
+
+            elif layer == 'norm':
+                query = self.norms[norm_index](query)
+                norm_index += 1
+
+            # image cross attention
+            elif layer == 'cross_attn':
+                query = self.attentions[attn_index](
+                    query,
+                    key,
+                    value,
+                    identity if self.pre_norm else None,
+                    reference_points_cams=reference_points_cams,
+                    tpv_masks=tpv_masks,
+                    spatial_shapes=spatial_shapes,
+                    level_start_index=level_start_index,
+                    **kwargs)
+                attn_index += 1
+                identity = query
+
+            elif layer == 'ffn':
+                query = self.ffns[ffn_index](
+                    query, identity if self.pre_norm else None)
+                ffn_index += 1
+        query = torch.split(
+            query, [tpv_h * tpv_w, tpv_z * tpv_h, tpv_w * tpv_z], dim=1)
+        return query
diff --git a/mmde/projects/TR3D/README.md b/mmde/projects/TR3D/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2b8e20dd0052651008f64a9b67a2a9a022763763
--- /dev/null
+++ b/mmde/projects/TR3D/README.md
@@ -0,0 +1,97 @@
+# TR3D: Towards Real-Time Indoor 3D Object Detection
+
+> [TR3D: Towards Real-Time Indoor 3D Object Detection](https://arxiv.org/abs/2302.02858)
+
+## Abstract
+
+Recently, sparse 3D convolutions have changed 3D object detection. Performing on par with the voting-based approaches, 3D CNNs are memory-efficient and scale to large scenes better. However, there is still room for improvement. With a conscious, practice-oriented approach to problem-solving, we analyze the performance of such methods and localize the weaknesses. Applying modifications that resolve the found issues one by one, we end up with TR3D: a fast fully-convolutional 3D object detection model trained end-to-end, that achieves state-of-the-art results on the standard benchmarks, ScanNet v2, SUN RGB-D, and S3DIS. Moreover, to take advantage of both point cloud and RGB inputs, we introduce an early fusion of 2D and 3D features. We employ our fusion module to make conventional 3D object detection methods multimodal and demonstrate an impressive boost in performance. Our model with early feature fusion, which we refer to as TR3D+FF, outperforms existing 3D object detection approaches on the SUN RGB-D dataset. Overall, besides being accurate, both TR3D and TR3D+FF models are lightweight, memory-efficient, and fast, thereby marking another milestone on the way toward real-time 3D object detection.
+
+<div align="center">
+<img src="https://user-images.githubusercontent.com/6030962/219644780-646516ec-a6c1-4ec5-9b8c-63bbc9702d05.png" width="800"/>
+</div>
+
+## Usage
+
+Training and inference in this project were tested with `mmdet3d==1.1.0rc3`.
+
+### Training commands
+
+In MMDet3D's root directory, run the following command to train the model:
+
+```bash
+python tools/train.py projects/TR3D/configs/tr3d_1xb16_scannet-3d-18class.py
+```
+
+### Testing commands
+
+In MMDet3D's root directory, run the following command to test the model:
+
+```bash
+python tools/test.py projects/TR3D/configs/tr3d_1xb16_scannet-3d-18class.py ${CHECKPOINT_PATH}
+```
+
+## Results and models
+
+### ScanNet
+
+|                          Backbone                          | Mem (GB) | Inf time (fps) |   AP@0.25   |   AP@0.5    |                                                                                                                                        Download                                                                                                                                         |
+| :--------------------------------------------------------: | :------: | :------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [MinkResNet34](./configs/tr3d_1xb16_scannet-3d-18class.py) |   8.6    |      23.7      | 72.9 (72.0) | 59.3 (57.4) | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/tr3d/tr3d_1xb16_scannet-3d-18class/tr3d_1xb16_scannet-3d-18class.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/tr3d/tr3d_1xb16_scannet-3d-18class/tr3d_1xb16_scannet-3d-18class.log.json) |
+
+### SUN RGB-D
+
+|                          Backbone                          | Mem (GB) | Inf time (fps) |   AP@0.25   |   AP@0.5    |                                                                                                                                        Download                                                                                                                                         |
+| :--------------------------------------------------------: | :------: | :------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [MinkResNet34](./configs/tr3d_1xb16_sunrgbd-3d-10class.py) |   3.8    |      27.5      | 67.1 (66.3) | 50.4 (49.6) | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/tr3d/tr3d_1xb16_sunrgbd-3d-10class/tr3d_1xb16_sunrgbd-3d-10class.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/tr3d/tr3d_1xb16_sunrgbd-3d-10class/tr3d_1xb16_sunrgbd-3d-10class.log.json) |
+
+### S3DIS
+
+|                        Backbone                         | Mem (GB) | Inf time (fps) |   AP@0.25   |   AP@0.5    |                                                                                                                                  Download                                                                                                                                   |
+| :-----------------------------------------------------: | :------: | :------------: | :---------: | :---------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [MinkResNet34](./configs/tr3d_1xb16_s3dis-3d-5class.py) |   15.2   |      21.0      | 74.5 (72.1) | 51.7 (47.6) | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/tr3d/tr3d_1xb16_s3dis-3d-5class/tr3d_1xb16_s3dis-3d-5class.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/tr3d/tr3d_1xb16_s3dis-3d-5class/tr3d_1xb16_s3dis-3d-5class.log.json) |
+
+**Note**
+
+- We report the results across 5 train runs followed by 5 test runs. Median values are in round brackets.
+- Inference time is given for a single NVidia GeForce RTX 4090 GPU.
+
+## Citation
+
+```latex
+@article{rukhovich2023tr3d,
+  title={TR3D: Towards Real-Time Indoor 3D Object Detection},
+  author={Rukhovich, Danila and Vorontsova, Anna and Konushin, Anton},
+  journal={arXiv preprint arXiv:2302.02858},
+  year={2023}
+}
+```
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+  - [x] Basic docstrings & proper citation
+
+  - [x] Test-time correctness
+
+  - [x] A full README
+
+- [x] Milestone 2: Indicates a successful model implementation.
+
+  - [x] Training-time correctness
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [x] Type hints and docstrings
+
+  - [ ] Unit tests
+
+  - [ ] Code polishing
+
+  - [ ] Metafile.yml
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
diff --git a/mmde/projects/TR3D/configs/tr3d.py b/mmde/projects/TR3D/configs/tr3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..c203f0d88a488e7b9ee44a693b0b664d75f6b283
--- /dev/null
+++ b/mmde/projects/TR3D/configs/tr3d.py
@@ -0,0 +1,43 @@
+_base_ = ['../../../configs/_base_/default_runtime.py']
+custom_imports = dict(imports=['projects.TR3D.tr3d'])
+
+model = dict(
+    type='MinkSingleStage3DDetector',
+    data_preprocessor=dict(type='Det3DDataPreprocessor'),
+    backbone=dict(
+        type='TR3DMinkResNet',
+        in_channels=3,
+        depth=34,
+        norm='batch',
+        num_planes=(64, 128, 128, 128)),
+    neck=dict(
+        type='TR3DNeck', in_channels=(64, 128, 128, 128), out_channels=128),
+    bbox_head=dict(
+        type='TR3DHead',
+        in_channels=128,
+        voxel_size=0.01,
+        pts_center_threshold=6,
+        num_reg_outs=6),
+    train_cfg=dict(),
+    test_cfg=dict(nms_pre=1000, iou_thr=0.5, score_thr=0.01))
+
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=10, norm_type=2))
+
+# learning rate
+param_scheduler = dict(
+    type='MultiStepLR',
+    begin=0,
+    end=12,
+    by_epoch=True,
+    milestones=[8, 11],
+    gamma=0.1)
+
+custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
+
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
diff --git a/mmde/projects/TR3D/configs/tr3d_1xb16_s3dis-3d-5class.py b/mmde/projects/TR3D/configs/tr3d_1xb16_s3dis-3d-5class.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b7869f522e8ad1655e666c7b473e13bdb414871
--- /dev/null
+++ b/mmde/projects/TR3D/configs/tr3d_1xb16_s3dis-3d-5class.py
@@ -0,0 +1,51 @@
+_base_ = ['./tr3d.py', 'mmdet3d::_base_/datasets/s3dis-3d.py']
+custom_imports = dict(imports=['projects.TR3D.tr3d'])
+
+dataset_type = 'S3DISDataset'
+data_root = 'data/s3dis/'
+metainfo = dict(classes=('table', 'chair', 'sofa', 'bookcase', 'board'))
+train_area = [1, 2, 3, 4, 6]
+
+model = dict(bbox_head=dict(label2level=[1, 0, 1, 1, 0]))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(type='LoadAnnotations3D'),
+    dict(type='PointSample', num_points=100000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[0, 0],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.1, 0.1, 0.1],
+        shift_height=False),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        dataset=dict(datasets=[
+            dict(
+                type=dataset_type,
+                data_root=data_root,
+                ann_file=f's3dis_infos_Area_{i}.pkl',
+                pipeline=train_pipeline,
+                filter_empty_gt=False,
+                metainfo=metainfo,
+                box_type_3d='Depth') for i in train_area
+        ])))
diff --git a/mmde/projects/TR3D/configs/tr3d_1xb16_scannet-3d-18class.py b/mmde/projects/TR3D/configs/tr3d_1xb16_scannet-3d-18class.py
new file mode 100644
index 0000000000000000000000000000000000000000..e022e47f1113ba9c42c814dd0b38dd6fbaaa3ecd
--- /dev/null
+++ b/mmde/projects/TR3D/configs/tr3d_1xb16_scannet-3d-18class.py
@@ -0,0 +1,68 @@
+_base_ = ['./tr3d.py', 'mmdet3d::_base_/datasets/scannet-3d.py']
+custom_imports = dict(imports=['projects.TR3D.tr3d'])
+
+model = dict(
+    bbox_head=dict(
+        label2level=[0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0]))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(type='LoadAnnotations3D'),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    # We do not sample 100k points for ScanNet, as very few scenes have
+    # significantly more then 100k points. So we sample 33 to 100% of them.
+    dict(type='TR3DPointSample', num_points=0.33),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.02, 0.02],
+        scale_ratio_range=[0.9, 1.1],
+        translation_std=[0.1, 0.1, 0.1],
+        shift_height=False),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            # We do not sample 100k points for ScanNet, as very few scenes have
+            # significantly more then 100k points. So it doesn't affect
+            # inference time and we can accept all points.
+            # dict(type='PointSample', num_points=100000),
+            dict(type='NormalizePointsColor', color_mean=None),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='RepeatDataset',
+        times=15,
+        dataset=dict(pipeline=train_pipeline, filter_empty_gt=False)))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/projects/TR3D/configs/tr3d_1xb16_sunrgbd-3d-10class.py b/mmde/projects/TR3D/configs/tr3d_1xb16_sunrgbd-3d-10class.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cd557927dc5e9221a27b28a7c40892981ccd509
--- /dev/null
+++ b/mmde/projects/TR3D/configs/tr3d_1xb16_sunrgbd-3d-10class.py
@@ -0,0 +1,62 @@
+_base_ = ['./tr3d.py', 'mmdet3d::_base_/datasets/sunrgbd-3d.py']
+custom_imports = dict(imports=['projects.TR3D.tr3d'])
+
+model = dict(
+    bbox_head=dict(
+        num_reg_outs=8,
+        label2level=[1, 1, 1, 0, 0, 1, 0, 0, 1, 0],
+        bbox_loss=dict(
+            type='TR3DRotatedIoU3DLoss', mode='diou', reduction='none')))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(type='LoadAnnotations3D'),
+    dict(type='PointSample', num_points=100000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[.85, 1.15],
+        translation_std=[.1, .1, .1],
+        shift_height=False),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='PointSample', num_points=100000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(pipeline=train_pipeline, filter_empty_gt=False)))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = val_dataloader
diff --git a/mmde/projects/TR3D/tr3d/__init__.py b/mmde/projects/TR3D/tr3d/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..812b9e887c031e942f48fb2b813fca739c01d174
--- /dev/null
+++ b/mmde/projects/TR3D/tr3d/__init__.py
@@ -0,0 +1,11 @@
+from .axis_aligned_iou_loss import TR3DAxisAlignedIoULoss
+from .mink_resnet import TR3DMinkResNet
+from .rotated_iou_loss import TR3DRotatedIoU3DLoss
+from .tr3d_head import TR3DHead
+from .tr3d_neck import TR3DNeck
+from .transforms_3d import TR3DPointSample
+
+__all__ = [
+    'TR3DAxisAlignedIoULoss', 'TR3DMinkResNet', 'TR3DRotatedIoU3DLoss',
+    'TR3DHead', 'TR3DNeck', 'TR3DPointSample'
+]
diff --git a/mmde/projects/TR3D/tr3d/axis_aligned_iou_loss.py b/mmde/projects/TR3D/tr3d/axis_aligned_iou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..b56f8022a8cb67a1c08a53ca3ba0e13c2802301a
--- /dev/null
+++ b/mmde/projects/TR3D/tr3d/axis_aligned_iou_loss.py
@@ -0,0 +1,117 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmdet.models.losses.utils import weighted_loss
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models import axis_aligned_iou_loss
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import AxisAlignedBboxOverlaps3D
+
+
+@weighted_loss
+def axis_aligned_diou_loss(pred: Tensor, target: Tensor) -> Tensor:
+    """Calculate the DIoU loss (1-DIoU) of two sets of axis aligned bounding
+    boxes. Note that predictions and targets are one-to-one corresponded.
+
+    Args:
+        pred (torch.Tensor): Bbox predictions with shape [..., 6]
+            (x1, y1, z1, x2, y2, z2).
+        target (torch.Tensor): Bbox targets (gt) with shape [..., 6]
+            (x1, y1, z1, x2, y2, z2).
+
+    Returns:
+        torch.Tensor: DIoU loss between predictions and targets.
+    """
+    axis_aligned_iou = AxisAlignedBboxOverlaps3D()(
+        pred, target, is_aligned=True)
+    iou_loss = 1 - axis_aligned_iou
+
+    xp1, yp1, zp1, xp2, yp2, zp2 = pred.split(1, dim=-1)
+    xt1, yt1, zt1, xt2, yt2, zt2 = target.split(1, dim=-1)
+
+    xpc = (xp1 + xp2) / 2
+    ypc = (yp1 + yp2) / 2
+    zpc = (zp1 + zp2) / 2
+    xtc = (xt1 + xt2) / 2
+    ytc = (yt1 + yt2) / 2
+    ztc = (zt1 + zt2) / 2
+    r2 = (xpc - xtc)**2 + (ypc - ytc)**2 + (zpc - ztc)**2
+
+    x_min = torch.minimum(xp1, xt1)
+    x_max = torch.maximum(xp2, xt2)
+    y_min = torch.minimum(yp1, yt1)
+    y_max = torch.maximum(yp2, yt2)
+    z_min = torch.minimum(zp1, zt1)
+    z_max = torch.maximum(zp2, zt2)
+    c2 = (x_min - x_max)**2 + (y_min - y_max)**2 + (z_min - z_max)**2
+
+    diou_loss = iou_loss + (r2 / c2)[:, 0]
+
+    return diou_loss
+
+
+@MODELS.register_module()
+class TR3DAxisAlignedIoULoss(nn.Module):
+    """Calculate the IoU loss (1-IoU) of axis aligned bounding boxes. The only
+    difference with original AxisAlignedIoULoss is the addition of DIoU mode.
+    These classes should be merged in the future.
+
+    Args:
+        mode (str): 'iou' for intersection over union or 'diou' for
+            distance-iou loss. Defaults to 'iou'.
+        reduction (str): Method to reduce losses.
+            The valid reduction method are 'none', 'sum' or 'mean'.
+            Defaults to 'mean'.
+        loss_weight (float): Weight of loss. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 mode: str = 'iou',
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super(TR3DAxisAlignedIoULoss, self).__init__()
+        assert mode in ['iou', 'diou']
+        self.loss = axis_aligned_iou_loss if mode == 'iou' \
+            else axis_aligned_diou_loss
+        assert reduction in ['none', 'sum', 'mean']
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[float] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function of loss calculation.
+
+        Args:
+            pred (Tensor): Bbox predictions with shape [..., 3].
+            target (Tensor): Bbox targets (gt) with shape [..., 3].
+            weight (Tensor, optional): Weight of loss.
+                Defaults to None.
+            avg_factor (float, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): Method to reduce losses.
+                The valid reduction method are 'none', 'sum' or 'mean'.
+                Defaults to None.
+
+        Returns:
+            Tensor: IoU loss between predictions and targets.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            return (pred * weight).sum()
+        return self.loss(
+            pred,
+            target,
+            weight=weight,
+            avg_factor=avg_factor,
+            reduction=reduction) * self.loss_weight
diff --git a/mmde/projects/TR3D/tr3d/mink_resnet.py b/mmde/projects/TR3D/tr3d/mink_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..85e0543e97ac3f3aa404cabe84f0c9c27833daf3
--- /dev/null
+++ b/mmde/projects/TR3D/tr3d/mink_resnet.py
@@ -0,0 +1,54 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+try:
+    import MinkowskiEngine as ME
+except ImportError:
+    # Please follow getting_started.md to install MinkowskiEngine.
+    ME = SparseTensor = None
+    pass
+
+from mmdet3d.models.backbones import MinkResNet
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class TR3DMinkResNet(MinkResNet):
+    r"""Minkowski ResNet backbone. See `4D Spatio-Temporal ConvNets
+    <https://arxiv.org/abs/1904.08755>`_ for more details. The onle difference
+    with MinkResNet is the `norm` and `num_planes` parameters. These classes
+    should be merged in the future.
+
+    Args:
+        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+        in_channels (int): Number of input channels, 3 for RGB.
+        num_stages (int): Resnet stages. Defaults to 4.
+        pool (bool): Whether to add max pooling after first conv.
+            Defaults to True.
+        norm (str): Norm type ('instance' or 'batch') for stem layer.
+            Usually ResNet implies BatchNorm but for some reason
+            original MinkResNet implies InstanceNorm. Defaults to 'instance'.
+        num_planes (tuple[int]): Number of planes per block before
+            block.expansion. Defaults to (64, 128, 256, 512).
+    """
+
+    def __init__(self,
+                 depth: int,
+                 in_channels: int,
+                 num_stages: int = 4,
+                 pool: bool = True,
+                 norm: str = 'instance',
+                 num_planes: Tuple[int] = (64, 128, 256, 512)):
+        super(TR3DMinkResNet, self).__init__(depth, in_channels, num_stages,
+                                             pool)
+        block, stage_blocks = self.arch_settings[depth]
+        self.inplanes = 64
+        norm_layer = ME.MinkowskiInstanceNorm if norm == 'instance' else \
+            ME.MinkowskiBatchNorm
+        self.norm1 = norm_layer(self.inplanes)
+
+        for i in range(len(stage_blocks)):
+            setattr(
+                self, f'layer{i + 1}',
+                self._make_layer(
+                    block, num_planes[i], stage_blocks[i], stride=2))
diff --git a/mmde/projects/TR3D/tr3d/rotated_iou_loss.py b/mmde/projects/TR3D/tr3d/rotated_iou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..bef7b849ef0059f4d5cd68fdadd0c430160b130c
--- /dev/null
+++ b/mmde/projects/TR3D/tr3d/rotated_iou_loss.py
@@ -0,0 +1,149 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch
+from mmcv.ops.diff_iou_rotated import box2corners, oriented_box_intersection_2d
+from mmdet.models.losses.utils import weighted_loss
+from torch import Tensor
+from torch import nn as nn
+
+from mmdet3d.models import rotated_iou_3d_loss
+from mmdet3d.registry import MODELS
+
+
+def diff_diou_rotated_3d(box3d1: Tensor, box3d2: Tensor) -> Tensor:
+    """Calculate differentiable DIoU of rotated 3d boxes.
+
+    Args:
+        box3d1 (Tensor): (B, N, 3+3+1) First box (x,y,z,w,h,l,alpha).
+        box3d2 (Tensor): (B, N, 3+3+1) Second box (x,y,z,w,h,l,alpha).
+    Returns:
+        Tensor: (B, N) DIoU.
+    """
+    box1 = box3d1[..., [0, 1, 3, 4, 6]]
+    box2 = box3d2[..., [0, 1, 3, 4, 6]]
+    corners1 = box2corners(box1)
+    corners2 = box2corners(box2)
+    intersection, _ = oriented_box_intersection_2d(corners1, corners2)
+    zmax1 = box3d1[..., 2] + box3d1[..., 5] * 0.5
+    zmin1 = box3d1[..., 2] - box3d1[..., 5] * 0.5
+    zmax2 = box3d2[..., 2] + box3d2[..., 5] * 0.5
+    zmin2 = box3d2[..., 2] - box3d2[..., 5] * 0.5
+    z_overlap = (torch.min(zmax1, zmax2) -
+                 torch.max(zmin1, zmin2)).clamp_(min=0.)
+    intersection_3d = intersection * z_overlap
+    volume1 = box3d1[..., 3] * box3d1[..., 4] * box3d1[..., 5]
+    volume2 = box3d2[..., 3] * box3d2[..., 4] * box3d2[..., 5]
+    union_3d = volume1 + volume2 - intersection_3d
+
+    x1_max = torch.max(corners1[..., 0], dim=2)[0]
+    x1_min = torch.min(corners1[..., 0], dim=2)[0]
+    y1_max = torch.max(corners1[..., 1], dim=2)[0]
+    y1_min = torch.min(corners1[..., 1], dim=2)[0]
+
+    x2_max = torch.max(corners2[..., 0], dim=2)[0]
+    x2_min = torch.min(corners2[..., 0], dim=2)[0]
+    y2_max = torch.max(corners2[..., 1], dim=2)[0]
+    y2_min = torch.min(corners2[..., 1], dim=2)[0]
+
+    x_max = torch.max(x1_max, x2_max)
+    x_min = torch.min(x1_min, x2_min)
+    y_max = torch.max(y1_max, y2_max)
+    y_min = torch.min(y1_min, y2_min)
+
+    z_max = torch.max(zmax1, zmax2)
+    z_min = torch.min(zmin1, zmin2)
+
+    r2 = ((box1[..., :3] - box2[..., :3])**2).sum(dim=-1)
+    c2 = (x_min - x_max)**2 + (y_min - y_max)**2 + (z_min - z_max)**2
+
+    return intersection_3d / union_3d - r2 / c2
+
+
+@weighted_loss
+def rotated_diou_3d_loss(pred: Tensor, target: Tensor) -> Tensor:
+    """Calculate the DIoU loss (1-DIoU) of two sets of rotated bounding boxes.
+    Note that predictions and targets are one-to-one corresponded.
+
+    Args:
+        pred (torch.Tensor): Bbox predictions with shape [N, 7]
+            (x, y, z, w, l, h, alpha).
+        target (torch.Tensor): Bbox targets (gt) with shape [N, 7]
+            (x, y, z, w, l, h, alpha).
+
+    Returns:
+        torch.Tensor: IoU loss between predictions and targets.
+    """
+    diou_loss = 1 - diff_diou_rotated_3d(
+        pred.unsqueeze(0), target.unsqueeze(0))[0]
+    return diou_loss
+
+
+@MODELS.register_module()
+class TR3DRotatedIoU3DLoss(nn.Module):
+    """Calculate the IoU loss (1-IoU) of rotated bounding boxes. The only
+    difference with original RotatedIoU3DLoss is the addition of DIoU mode.
+    These classes should be merged in the future.
+
+    Args:
+        mode (str): 'iou' for intersection over union or 'diou' for
+            distance-iou loss. Defaults to 'iou'.
+        reduction (str): Method to reduce losses.
+            The valid reduction method are 'none', 'sum' or 'mean'.
+            Defaults to 'mean'.
+        loss_weight (float): Weight of loss. Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 mode: str = 'iou',
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0) -> None:
+        super(TR3DRotatedIoU3DLoss, self).__init__()
+        assert mode in ['iou', 'diou']
+        self.loss = rotated_iou_3d_loss if mode == 'iou' \
+            else rotated_diou_3d_loss
+        assert reduction in ['none', 'sum', 'mean']
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                weight: Optional[Tensor] = None,
+                avg_factor: Optional[float] = None,
+                reduction_override: Optional[str] = None,
+                **kwargs) -> Tensor:
+        """Forward function of loss calculation.
+
+        Args:
+            pred (Tensor): Bbox predictions with shape [..., 7]
+                (x, y, z, w, l, h, alpha).
+            target (Tensor): Bbox targets (gt) with shape [..., 7]
+                (x, y, z, w, l, h, alpha).
+            weight (Tensor, optional): Weight of loss.
+                Defaults to None.
+            avg_factor (float, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): Method to reduce losses.
+                The valid reduction method are 'none', 'sum' or 'mean'.
+                Defaults to None.
+
+        Returns:
+            Tensor: IoU loss between predictions and targets.
+        """
+        if weight is not None and not torch.any(weight > 0):
+            return pred.sum() * weight.sum()  # 0
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if weight is not None and weight.dim() > 1:
+            weight = weight.mean(-1)
+        loss = self.loss_weight * self.loss(
+            pred,
+            target,
+            weight,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            **kwargs)
+
+        return loss
diff --git a/mmde/projects/TR3D/tr3d/tr3d_head.py b/mmde/projects/TR3D/tr3d/tr3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..48d3b33a6192c81b69257625f4e5aa2dc1a1dd3b
--- /dev/null
+++ b/mmde/projects/TR3D/tr3d/tr3d_head.py
@@ -0,0 +1,472 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapted from https://github.com/SamsungLabs/tr3d/blob/master/mmdet3d/models/dense_heads/tr3d_head.py # noqa
+from typing import List, Optional, Tuple
+
+try:
+    import MinkowskiEngine as ME
+    from MinkowskiEngine import SparseTensor
+except ImportError:
+    # Please follow getting_started.md to install MinkowskiEngine.
+    ME = SparseTensor = None
+    pass
+
+import torch
+from mmcv.ops import nms3d, nms3d_normal
+from mmengine.model import bias_init_with_prob
+from mmengine.structures import InstanceData
+from torch import Tensor, nn
+
+from mmdet3d.models import Base3DDenseHead
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import BaseInstance3DBoxes
+from mmdet3d.utils import InstanceList, OptInstanceList
+
+
+@MODELS.register_module()
+class TR3DHead(Base3DDenseHead):
+    r"""Bbox head of `TR3D <https://arxiv.org/abs/2302.02858>`_.
+
+    Args:
+        in_channels (int): Number of channels in input tensors.
+        num_reg_outs (int): Number of regression layer channels.
+        voxel_size (float): Voxel size in meters.
+        pts_center_threshold (int): Box to location assigner parameter.
+            After feature level for the box is determined, assigner selects
+            pts_center_threshold locations closest to the box center.
+        bbox_loss (dict): Config of bbox loss. Defaults to
+            dict(type='AxisAlignedIoULoss', mode='diou', reduction=None).
+        cls_loss (dict): Config of classification loss. Defaults to
+            dict = dict(type='mmdet.FocalLoss', reduction=None).
+        train_cfg (dict, optional): Config for train stage. Defaults to None.
+        test_cfg (dict, optional): Config for test stage. Defaults to None.
+        init_cfg (dict, optional): Config for weight initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 num_reg_outs: int,
+                 voxel_size: int,
+                 pts_center_threshold: int,
+                 label2level: Tuple[int],
+                 bbox_loss: dict = dict(
+                     type='TR3DAxisAlignedIoULoss',
+                     mode='diou',
+                     reduction='none'),
+                 cls_loss: dict = dict(
+                     type='mmdet.FocalLoss', reduction='none'),
+                 train_cfg: Optional[dict] = None,
+                 test_cfg: Optional[dict] = None,
+                 init_cfg: Optional[dict] = None):
+        super(TR3DHead, self).__init__(init_cfg)
+        if ME is None:
+            raise ImportError(
+                'Please follow `getting_started.md` to install MinkowskiEngine.`'  # noqa: E501
+            )
+        self.voxel_size = voxel_size
+        self.pts_center_threshold = pts_center_threshold
+        self.label2level = label2level
+        self.bbox_loss = MODELS.build(bbox_loss)
+        self.cls_loss = MODELS.build(cls_loss)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self._init_layers(len(self.label2level), in_channels, num_reg_outs)
+
+    def _init_layers(self, num_classes: int, in_channels: int,
+                     num_reg_outs: int):
+        """Initialize layers.
+
+        Args:
+            in_channels (int): Number of channels in input tensors.
+            num_reg_outs (int): Number of regression layer channels.
+            num_classes (int): Number of classes.
+        """
+        self.conv_reg = ME.MinkowskiConvolution(
+            in_channels, num_reg_outs, kernel_size=1, bias=True, dimension=3)
+        self.conv_cls = ME.MinkowskiConvolution(
+            in_channels, num_classes, kernel_size=1, bias=True, dimension=3)
+
+    def init_weights(self):
+        """Initialize weights."""
+        nn.init.normal_(self.conv_reg.kernel, std=.01)
+        nn.init.normal_(self.conv_cls.kernel, std=.01)
+        nn.init.constant_(self.conv_cls.bias, bias_init_with_prob(.01))
+
+    def _forward_single(self, x: SparseTensor) -> Tuple[Tensor, ...]:
+        """Forward pass per level.
+
+        Args:
+            x (SparseTensor): Per level neck output tensor.
+
+        Returns:
+            tuple[Tensor]: Per level head predictions.
+        """
+        reg_final = self.conv_reg(x).features
+        reg_distance = torch.exp(reg_final[:, 3:6])
+        reg_angle = reg_final[:, 6:]
+        bbox_pred = torch.cat((reg_final[:, :3], reg_distance, reg_angle),
+                              dim=1)
+        cls_pred = self.conv_cls(x).features
+
+        bbox_preds, cls_preds, points = [], [], []
+        for permutation in x.decomposition_permutations:
+            bbox_preds.append(bbox_pred[permutation])
+            cls_preds.append(cls_pred[permutation])
+            points.append(x.coordinates[permutation][:, 1:] * self.voxel_size)
+
+        return bbox_preds, cls_preds, points
+
+    def forward(self, x: List[Tensor]) -> Tuple[List[Tensor], ...]:
+        """Forward pass.
+
+        Args:
+            x (list[Tensor]): Features from the backbone.
+
+        Returns:
+            Tuple[List[Tensor], ...]: Predictions of the head.
+        """
+        bbox_preds, cls_preds, points = [], [], []
+        for i in range(len(x)):
+            bbox_pred, cls_pred, point = self._forward_single(x[i])
+            bbox_preds.append(bbox_pred)
+            cls_preds.append(cls_pred)
+            points.append(point)
+        return bbox_preds, cls_preds, points
+
+    def _loss_by_feat_single(self, bbox_preds: List[Tensor],
+                             cls_preds: List[Tensor], points: List[Tensor],
+                             gt_bboxes: BaseInstance3DBoxes, gt_labels: Tensor,
+                             input_meta: dict) -> Tuple[Tensor, ...]:
+        """Loss function of single sample.
+
+        Args:
+            bbox_preds (list[Tensor]): Bbox predictions for all levels.
+            cls_preds (list[Tensor]): Classification predictions for all
+                levels.
+            points (list[Tensor]): Final location coordinates for all levels.
+            gt_bboxes (:obj:`BaseInstance3DBoxes`): Ground truth boxes.
+            gt_labels (Tensor): Ground truth labels.
+            input_meta (dict): Scene meta info.
+
+        Returns:
+            tuple[Tensor, ...]: Bbox and classification loss
+                values and a boolean mask of assigned points.
+        """
+        num_classes = cls_preds[0].shape[1]
+        bbox_targets, cls_targets = self.get_targets(points, gt_bboxes,
+                                                     gt_labels, num_classes)
+        bbox_preds = torch.cat(bbox_preds)
+        cls_preds = torch.cat(cls_preds)
+        points = torch.cat(points)
+
+        # cls loss
+        cls_loss = self.cls_loss(cls_preds, cls_targets)
+
+        # bbox loss
+        pos_mask = cls_targets < num_classes
+        pos_bbox_preds = bbox_preds[pos_mask]
+        if pos_mask.sum() > 0:
+            pos_points = points[pos_mask]
+            pos_bbox_preds = bbox_preds[pos_mask]
+            pos_bbox_targets = bbox_targets[pos_mask]
+            bbox_loss = self.bbox_loss(
+                self._bbox_to_loss(
+                    self._bbox_pred_to_bbox(pos_points, pos_bbox_preds)),
+                self._bbox_to_loss(pos_bbox_targets))
+        else:
+            bbox_loss = pos_bbox_preds
+        return bbox_loss, cls_loss, pos_mask
+
+    def loss_by_feat(self,
+                     bbox_preds: List[List[Tensor]],
+                     cls_preds: List[List[Tensor]],
+                     points: List[List[Tensor]],
+                     batch_gt_instances_3d: InstanceList,
+                     batch_input_metas: List[dict],
+                     batch_gt_instances_ignore: OptInstanceList = None,
+                     **kwargs) -> dict:
+        """Loss function about feature.
+
+        Args:
+            bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
+                The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            cls_preds (list[list[Tensor]]): Classification predictions for all
+                scenes. The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            points (list[list[Tensor]]): Final location coordinates for all
+                scenes. The first list contains predictions from different
+                levels. The second list contains predictions in a mini-batch.
+            batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
+                gt_instance_3d.  It usually includes ``bboxes_3d``、`
+                `labels_3d``、``depths``、``centers_2d`` and attributes.
+            batch_input_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+
+        Returns:
+            dict: Bbox, and classification losses.
+        """
+        bbox_losses, cls_losses, pos_masks = [], [], []
+        for i in range(len(batch_input_metas)):
+            bbox_loss, cls_loss, pos_mask = self._loss_by_feat_single(
+                bbox_preds=[x[i] for x in bbox_preds],
+                cls_preds=[x[i] for x in cls_preds],
+                points=[x[i] for x in points],
+                input_meta=batch_input_metas[i],
+                gt_bboxes=batch_gt_instances_3d[i].bboxes_3d,
+                gt_labels=batch_gt_instances_3d[i].labels_3d)
+            if len(bbox_loss) > 0:
+                bbox_losses.append(bbox_loss)
+            cls_losses.append(cls_loss)
+            pos_masks.append(pos_mask)
+        return dict(
+            bbox_loss=torch.mean(torch.cat(bbox_losses)),
+            cls_loss=torch.sum(torch.cat(cls_losses)) /
+            torch.sum(torch.cat(pos_masks)))
+
+    def _predict_by_feat_single(self, bbox_preds: List[Tensor],
+                                cls_preds: List[Tensor], points: List[Tensor],
+                                input_meta: dict) -> InstanceData:
+        """Generate boxes for single sample.
+
+        Args:
+            center_preds (list[Tensor]): Centerness predictions for all levels.
+            bbox_preds (list[Tensor]): Bbox predictions for all levels.
+            cls_preds (list[Tensor]): Classification predictions for all
+                levels.
+            points (list[Tensor]): Final location coordinates for all levels.
+            input_meta (dict): Scene meta info.
+
+        Returns:
+            InstanceData: Predicted bounding boxes, scores and labels.
+        """
+        scores = torch.cat(cls_preds).sigmoid()
+        bbox_preds = torch.cat(bbox_preds)
+        points = torch.cat(points)
+        max_scores, _ = scores.max(dim=1)
+
+        if len(scores) > self.test_cfg.nms_pre > 0:
+            _, ids = max_scores.topk(self.test_cfg.nms_pre)
+            bbox_preds = bbox_preds[ids]
+            scores = scores[ids]
+            points = points[ids]
+
+        bboxes = self._bbox_pred_to_bbox(points, bbox_preds)
+        bboxes, scores, labels = self._single_scene_multiclass_nms(
+            bboxes, scores, input_meta)
+
+        bboxes = input_meta['box_type_3d'](
+            bboxes,
+            box_dim=bboxes.shape[1],
+            with_yaw=bboxes.shape[1] == 7,
+            origin=(.5, .5, .5))
+
+        results = InstanceData()
+        results.bboxes_3d = bboxes
+        results.scores_3d = scores
+        results.labels_3d = labels
+        return results
+
+    def predict_by_feat(self, bbox_preds: List[List[Tensor]], cls_preds,
+                        points: List[List[Tensor]],
+                        batch_input_metas: List[dict],
+                        **kwargs) -> List[InstanceData]:
+        """Generate boxes for all scenes.
+
+        Args:
+            bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes.
+            cls_preds (list[list[Tensor]]): Classification predictions for all
+                scenes.
+            points (list[list[Tensor]]): Final location coordinates for all
+                scenes.
+            batch_input_metas (list[dict]): Meta infos for all scenes.
+
+        Returns:
+            list[InstanceData]: Predicted bboxes, scores, and labels for
+            all scenes.
+        """
+        results = []
+        for i in range(len(batch_input_metas)):
+            result = self._predict_by_feat_single(
+                bbox_preds=[x[i] for x in bbox_preds],
+                cls_preds=[x[i] for x in cls_preds],
+                points=[x[i] for x in points],
+                input_meta=batch_input_metas[i])
+            results.append(result)
+        return results
+
+    @staticmethod
+    def _bbox_to_loss(bbox):
+        """Transform box to the axis-aligned or rotated iou loss format.
+
+        Args:
+            bbox (Tensor): 3D box of shape (N, 6) or (N, 7).
+
+        Returns:
+            Tensor: Transformed 3D box of shape (N, 6) or (N, 7).
+        """
+        # rotated iou loss accepts (x, y, z, w, h, l, heading)
+        if bbox.shape[-1] != 6:
+            return bbox
+
+        # axis-aligned case: x, y, z, w, h, l -> x1, y1, z1, x2, y2, z2
+        return torch.stack(
+            (bbox[..., 0] - bbox[..., 3] / 2, bbox[..., 1] - bbox[..., 4] / 2,
+             bbox[..., 2] - bbox[..., 5] / 2, bbox[..., 0] + bbox[..., 3] / 2,
+             bbox[..., 1] + bbox[..., 4] / 2, bbox[..., 2] + bbox[..., 5] / 2),
+            dim=-1)
+
+    @staticmethod
+    def _bbox_pred_to_bbox(points, bbox_pred):
+        """Transform predicted bbox parameters to bbox.
+
+        Args:
+            points (Tensor): Final locations of shape (N, 3)
+            bbox_pred (Tensor): Predicted bbox parameters of shape (N, 6)
+                or (N, 8).
+        Returns:
+            Tensor: Transformed 3D box of shape (N, 6) or (N, 7).
+        """
+        if bbox_pred.shape[0] == 0:
+            return bbox_pred
+
+        x_center = points[:, 0] + bbox_pred[:, 0]
+        y_center = points[:, 1] + bbox_pred[:, 1]
+        z_center = points[:, 2] + bbox_pred[:, 2]
+        base_bbox = torch.stack([
+            x_center, y_center, z_center, bbox_pred[:, 3], bbox_pred[:, 4],
+            bbox_pred[:, 5]
+        ], -1)
+
+        # axis-aligned case
+        if bbox_pred.shape[1] == 6:
+            return base_bbox
+
+        # rotated case: ..., sin(2a)ln(q), cos(2a)ln(q)
+        scale = bbox_pred[:, 3] + bbox_pred[:, 4]
+        q = torch.exp(
+            torch.sqrt(
+                torch.pow(bbox_pred[:, 6], 2) + torch.pow(bbox_pred[:, 7], 2)))
+        alpha = 0.5 * torch.atan2(bbox_pred[:, 6], bbox_pred[:, 7])
+        return torch.stack(
+            (x_center, y_center, z_center, scale / (1 + q), scale /
+             (1 + q) * q, bbox_pred[:, 5] + bbox_pred[:, 4], alpha),
+            dim=-1)
+
+    @torch.no_grad()
+    def get_targets(self, points: Tensor, gt_bboxes: BaseInstance3DBoxes,
+                    gt_labels: Tensor, num_classes: int) -> Tuple[Tensor, ...]:
+        """Compute targets for final locations for a single scene.
+
+        Args:
+            points (list[Tensor]): Final locations for all levels.
+            gt_bboxes (BaseInstance3DBoxes): Ground truth boxes.
+            gt_labels (Tensor): Ground truth labels.
+            num_classes (int): Number of classes.
+
+        Returns:
+            tuple[Tensor, ...]: Bbox and classification targets for all
+                locations.
+        """
+        float_max = points[0].new_tensor(1e8)
+        levels = torch.cat([
+            points[i].new_tensor(i, dtype=torch.long).expand(len(points[i]))
+            for i in range(len(points))
+        ])
+        points = torch.cat(points)
+        n_points = len(points)
+        n_boxes = len(gt_bboxes)
+
+        if len(gt_labels) == 0:
+            return points.new_tensor([]), \
+                gt_labels.new_full((n_points,), num_classes)
+
+        boxes = torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
+                          dim=1)
+        boxes = boxes.to(points.device).expand(n_points, n_boxes, 7)
+        points = points.unsqueeze(1).expand(n_points, n_boxes, 3)
+
+        # condition 1: fix level for label
+        label2level = gt_labels.new_tensor(self.label2level)
+        label_levels = label2level[gt_labels].unsqueeze(0).expand(
+            n_points, n_boxes)
+        point_levels = torch.unsqueeze(levels, 1).expand(n_points, n_boxes)
+        level_condition = label_levels == point_levels
+
+        # condition 2: keep topk location per box by center distance
+        center = boxes[..., :3]
+        center_distances = torch.sum(torch.pow(center - points, 2), dim=-1)
+        center_distances = torch.where(level_condition, center_distances,
+                                       float_max)
+        topk_distances = torch.topk(
+            center_distances,
+            min(self.pts_center_threshold + 1, len(center_distances)),
+            largest=False,
+            dim=0).values[-1]
+        topk_condition = center_distances < topk_distances.unsqueeze(0)
+
+        # condition 3: min center distance to box per point
+        center_distances = torch.where(topk_condition, center_distances,
+                                       float_max)
+        min_values, min_ids = center_distances.min(dim=1)
+        min_inds = torch.where(min_values < float_max, min_ids, -1)
+
+        bbox_targets = boxes[0][min_inds]
+        if not gt_bboxes.with_yaw:
+            bbox_targets = bbox_targets[:, :-1]
+        cls_targets = torch.where(min_inds >= 0, gt_labels[min_inds],
+                                  num_classes)
+        return bbox_targets, cls_targets
+
+    def _single_scene_multiclass_nms(self, bboxes: Tensor, scores: Tensor,
+                                     input_meta: dict) -> Tuple[Tensor, ...]:
+        """Multi-class nms for a single scene.
+
+        Args:
+            bboxes (Tensor): Predicted boxes of shape (N_boxes, 6) or
+                (N_boxes, 7).
+            scores (Tensor): Predicted scores of shape (N_boxes, N_classes).
+            input_meta (dict): Scene meta data.
+
+        Returns:
+            tuple[Tensor, ...]: Predicted bboxes, scores and labels.
+        """
+        num_classes = scores.shape[1]
+        with_yaw = bboxes.shape[1] == 7
+        nms_bboxes, nms_scores, nms_labels = [], [], []
+        for i in range(num_classes):
+            ids = scores[:, i] > self.test_cfg.score_thr
+            if not ids.any():
+                continue
+
+            class_scores = scores[ids, i]
+            class_bboxes = bboxes[ids]
+            if with_yaw:
+                nms_function = nms3d
+            else:
+                class_bboxes = torch.cat(
+                    (class_bboxes, torch.zeros_like(class_bboxes[:, :1])),
+                    dim=1)
+                nms_function = nms3d_normal
+
+            nms_ids = nms_function(class_bboxes, class_scores,
+                                   self.test_cfg.iou_thr)
+            nms_bboxes.append(class_bboxes[nms_ids])
+            nms_scores.append(class_scores[nms_ids])
+            nms_labels.append(
+                bboxes.new_full(
+                    class_scores[nms_ids].shape, i, dtype=torch.long))
+
+        if len(nms_bboxes):
+            nms_bboxes = torch.cat(nms_bboxes, dim=0)
+            nms_scores = torch.cat(nms_scores, dim=0)
+            nms_labels = torch.cat(nms_labels, dim=0)
+        else:
+            nms_bboxes = bboxes.new_zeros((0, bboxes.shape[1]))
+            nms_scores = bboxes.new_zeros((0, ))
+            nms_labels = bboxes.new_zeros((0, ))
+
+        if not with_yaw:
+            nms_bboxes = nms_bboxes[:, :6]
+
+        return nms_bboxes, nms_scores, nms_labels
diff --git a/mmde/projects/TR3D/tr3d/tr3d_neck.py b/mmde/projects/TR3D/tr3d/tr3d_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..41e54b034b896f770f578cd3a5bd5254e28fa90a
--- /dev/null
+++ b/mmde/projects/TR3D/tr3d/tr3d_neck.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Adapted from https://github.com/SamsungLabs/tr3d/blob/master/mmdet3d/models/necks/tr3d_neck.py # noqa
+from typing import List, Tuple
+
+try:
+    import MinkowskiEngine as ME
+    from MinkowskiEngine import SparseTensor
+except ImportError:
+    # Please follow getting_started.md to install MinkowskiEngine.
+    ME = SparseTensor = None
+    pass
+
+from mmengine.model import BaseModule
+from torch import nn
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class TR3DNeck(BaseModule):
+    r"""Neck of `TR3D <https://arxiv.org/abs/2302.02858>`_.
+
+    Args:
+        in_channels (tuple[int]): Number of channels in input tensors.
+        out_channels (int): Number of channels in output tensors.
+    """
+
+    def __init__(self, in_channels: Tuple[int], out_channels: int):
+        super(TR3DNeck, self).__init__()
+        self._init_layers(in_channels[1:], out_channels)
+
+    def _init_layers(self, in_channels: Tuple[int], out_channels: int):
+        """Initialize layers.
+
+        Args:
+            in_channels (tuple[int]): Number of channels in input tensors.
+            out_channels (int): Number of channels in output tensors.
+        """
+        for i in range(len(in_channels)):
+            if i > 0:
+                self.add_module(
+                    f'up_block_{i}',
+                    self._make_block(in_channels[i], in_channels[i - 1], True,
+                                     2))
+            if i < len(in_channels) - 1:
+                self.add_module(
+                    f'lateral_block_{i}',
+                    self._make_block(in_channels[i], in_channels[i]))
+                self.add_module(f'out_block_{i}',
+                                self._make_block(in_channels[i], out_channels))
+
+    def init_weights(self):
+        """Initialize weights."""
+        for m in self.modules():
+            if isinstance(m, ME.MinkowskiConvolution):
+                ME.utils.kaiming_normal_(
+                    m.kernel, mode='fan_out', nonlinearity='relu')
+
+            if isinstance(m, ME.MinkowskiBatchNorm):
+                nn.init.constant_(m.bn.weight, 1)
+                nn.init.constant_(m.bn.bias, 0)
+
+    def forward(self, x: List[SparseTensor]) -> List[SparseTensor]:
+        """Forward pass.
+
+        Args:
+            x (list[SparseTensor]): Features from the backbone.
+
+        Returns:
+            List[Tensor]: Output features from the neck.
+        """
+        x = x[1:]
+        outs = []
+        inputs = x
+        x = inputs[-1]
+        for i in range(len(inputs) - 1, -1, -1):
+            if i < len(inputs) - 1:
+                x = self.__getattr__(f'up_block_{i + 1}')(x)
+                x = inputs[i] + x
+                x = self.__getattr__(f'lateral_block_{i}')(x)
+                out = self.__getattr__(f'out_block_{i}')(x)
+                outs.append(out)
+        return outs[::-1]
+
+    @staticmethod
+    def _make_block(in_channels: int,
+                    out_channels: int,
+                    generative: bool = False,
+                    stride: int = 1) -> nn.Module:
+        """Construct Conv-Norm-Act block.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            generative (bool): Use generative convolution if True.
+                Defaults to False.
+            stride (int): Stride of the convolution. Defaults to 1.
+
+        Returns:
+            torch.nn.Module: With corresponding layers.
+        """
+        conv = ME.MinkowskiGenerativeConvolutionTranspose if generative \
+            else ME.MinkowskiConvolution
+        return nn.Sequential(
+            conv(
+                in_channels,
+                out_channels,
+                kernel_size=3,
+                stride=stride,
+                dimension=3), ME.MinkowskiBatchNorm(out_channels),
+            ME.MinkowskiReLU(inplace=True))
diff --git a/mmde/projects/TR3D/tr3d/transforms_3d.py b/mmde/projects/TR3D/tr3d/transforms_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5f19241172910278d2a08757efc4b3f2c1fa68f
--- /dev/null
+++ b/mmde/projects/TR3D/tr3d/transforms_3d.py
@@ -0,0 +1,74 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import numpy as np
+
+from mmdet3d.datasets import PointSample
+from mmdet3d.registry import TRANSFORMS
+from mmdet3d.structures.points import BasePoints
+
+
+@TRANSFORMS.register_module()
+class TR3DPointSample(PointSample):
+    """The only difference with PointSample is the support of float num_points
+    parameter.
+
+    In this case we sample random fraction of points from num_points to 100%
+    points. These classes should be merged in the future.
+    """
+
+    def _points_random_sampling(
+        self,
+        points: BasePoints,
+        num_samples: Union[int, float],
+        sample_range: Optional[float] = None,
+        replace: bool = False,
+        return_choices: bool = False
+    ) -> Union[Tuple[BasePoints, np.ndarray], BasePoints]:
+        """Points random sampling.
+
+        Sample points to a certain number.
+
+        Args:
+            points (:obj:`BasePoints`): 3D Points.
+            num_samples (int): Number of samples to be sampled.
+            sample_range (float, optional): Indicating the range where the
+                points will be sampled. Defaults to None.
+            replace (bool): Sampling with or without replacement.
+                Defaults to False.
+            return_choices (bool): Whether return choice. Defaults to False.
+
+        Returns:
+            tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`:
+
+                - points (:obj:`BasePoints`): 3D Points.
+                - choices (np.ndarray, optional): The generated random samples.
+        """
+        if isinstance(num_samples, float):
+            assert num_samples < 1
+            num_samples = int(
+                np.random.uniform(self.num_points, 1.) * points.shape[0])
+
+        if not replace:
+            replace = (points.shape[0] < num_samples)
+        point_range = range(len(points))
+        if sample_range is not None and not replace:
+            # Only sampling the near points when len(points) >= num_samples
+            dist = np.linalg.norm(points.coord.numpy(), axis=1)
+            far_inds = np.where(dist >= sample_range)[0]
+            near_inds = np.where(dist < sample_range)[0]
+            # in case there are too many far points
+            if len(far_inds) > num_samples:
+                far_inds = np.random.choice(
+                    far_inds, num_samples, replace=False)
+            point_range = near_inds
+            num_samples -= len(far_inds)
+        choices = np.random.choice(point_range, num_samples, replace=replace)
+        if sample_range is not None and not replace:
+            choices = np.concatenate((far_inds, choices))
+            # Shuffle points after sampling
+            np.random.shuffle(choices)
+        if return_choices:
+            return points[choices], choices
+        else:
+            return points[choices]
diff --git a/mmde/projects/example_project/README.md b/mmde/projects/example_project/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d23c8e1b016cee37cfb8eaa024a11319e04220b7
--- /dev/null
+++ b/mmde/projects/example_project/README.md
@@ -0,0 +1,115 @@
+# Dummy ResNet Wrapper
+
+This is an example README for community `projects/`. We have provided detailed explanations for each field in the form of html comments, which are visible when you read the source of this README file. If you wish to submit your project to our main repository, then all the fields in this README are mandatory for others to understand what you have achieved in this implementation.
+
+## Description
+
+<!-- Share any information you would like others to know. For example:
+Author: @xxx.
+This is an implementation of \[XXX\]. -->
+
+This project implements a dummy ResNet wrapper, which literally does nothing new but prints "hello world" during initialization.
+
+## Usage
+
+<!-- For a typical model, this section should contain the commands for training and testing. You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. -->
+
+### Training commands
+
+In MMDet3D's root directory, run the following command to train the model:
+
+```bash
+python tools/train.py projects/example_project/configs/fcos3d_dummy-resnet-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py
+```
+
+### Testing commands
+
+In MMDet3D's root directory, run the following command to test the model:
+
+```bash
+python tools/test.py projects/example_project/configs/fcos3d_dummy-resnet-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py ${CHECKPOINT_PATH}
+```
+
+## Results
+
+<!-- List the results as usually done in other model's README. [Example](https://github.com/open-mmlab/mmdetection3d/edit/dev-1.x/configs/fcos3d/README.md)
+You should claim whether this is based on the pre-trained weights, which are converted from the official release; or it's a reproduced result obtained from retraining the model in this project. -->
+
+|                                                     Backbone                                                     | Lr schd | Mem (GB) | Inf time (fps) | mAP  | NDS  |         Download         |
+| :--------------------------------------------------------------------------------------------------------------: | :-----: | :------: | :------------: | :--: | :--: | :----------------------: |
+| [FCOS3D_dummy](projects/example_project/configs/fcos3d_dummy-resnet-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py) |   1x    |   8.69   |                | 29.8 | 37.7 | [model](<>) \| [log](<>) |
+
+## Citation
+
+<!-- You may remove this section if not applicable. -->
+
+```latex
+@inproceedings{wang2021fcos3d,
+	title={{FCOS3D: Fully} Convolutional One-Stage Monocular 3D Object Detection},
+	author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua},
+	booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
+	year={2021}
+}
+# For the original 2D version
+@inproceedings{tian2019fcos,
+  title     =  {{FCOS: Fully} Convolutional One-Stage Object Detection},
+  author    =  {Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong},
+  booktitle =  {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+  year      =  {2019}
+}
+```
+
+## Checklist
+
+<!-- Here is a checklist illustrating a usual development workflow of a successful project, and also serves as an overview of this project's progress. The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR.
+OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project suffices the minimum requirement of being merged into 'projects/'. But this project is only eligible to become a part of the core package upon attaining the last milestone.
+Note that keeping this section up-to-date is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed.
+A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR. -->
+
+- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [ ] Finish the code
+
+    <!-- The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `mmdet3d.registry.MODELS` and configurable via a config file. -->
+
+  - [ ] Basic docstrings & proper citation
+
+    <!-- Each major object should contain a docstring, describing its functionality and arguments. If you have adapted the code from other open-source projects, don't forget to cite the source project in docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under GPL license. [A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) -->
+
+  - [ ] Test-time correctness
+
+    <!-- If you are reproducing the result from a paper, make sure your model's inference-time performance matches that in the original paper. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. -->
+
+  - [ ] A full README
+
+    <!-- As this template does. -->
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+    <!-- If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result matches the report within a minor error range. -->
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+
+    <!-- Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/mmdet3d/models/detectors/fcos_mono3d.py) -->
+
+  - [ ] Unit tests
+
+    <!-- Unit tests for each module are required. [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/tests/test_models/test_dense_heads/test_fcos_mono3d_head.py) -->
+
+  - [ ] Code polishing
+
+    <!-- Refactor your code according to reviewer's comment. -->
+
+  - [ ] Metafile.yml
+
+    <!-- It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/fcos3d/metafile.yml) -->
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
+
+  <!-- In particular, you may have to refactor this README into a standard one. [Example](/configs/textdet/dbnet/README.md) -->
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
diff --git a/mmde/projects/example_project/configs/fcos3d_dummy-resnet-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py b/mmde/projects/example_project/configs/fcos3d_dummy-resnet-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f19c0cea4181b876862b762495c9a6cb0b2a8f0
--- /dev/null
+++ b/mmde/projects/example_project/configs/fcos3d_dummy-resnet-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py
@@ -0,0 +1,7 @@
+_base_ = [
+    '../../../configs/fcos3d/fcos3d_r101-caffe-dcn_fpn_head-gn_8xb2-1x_nus-mono3d.py'  # noqa
+]
+
+custom_imports = dict(imports=['projects.example_project.dummy'])
+
+_base_.model.backbone.type = 'DummyResNet'
diff --git a/mmde/projects/example_project/dummy/__init__.py b/mmde/projects/example_project/dummy/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..70df7896d6ddb28688204a6402a2270e09ec255a
--- /dev/null
+++ b/mmde/projects/example_project/dummy/__init__.py
@@ -0,0 +1,3 @@
+from .dummy_resnet import DummyResNet
+
+__all__ = ['DummyResNet']
diff --git a/mmde/projects/example_project/dummy/dummy_resnet.py b/mmde/projects/example_project/dummy/dummy_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..63b5fa107ea38cec196b1588f6d4e96698daf003
--- /dev/null
+++ b/mmde/projects/example_project/dummy/dummy_resnet.py
@@ -0,0 +1,15 @@
+from mmdet.models.backbones import ResNet
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class DummyResNet(ResNet):
+    """Implements a dummy ResNet wrapper for demonstration purpose.
+    Args:
+        **kwargs: All the arguments are passed to the parent class.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        print('Hello world!')
+        super().__init__(**kwargs)
diff --git a/mmde/requirements.txt b/mmde/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6981bd723391a980c0f22baeab39d0adbcb68679
--- /dev/null
+++ b/mmde/requirements.txt
@@ -0,0 +1,4 @@
+-r requirements/build.txt
+-r requirements/optional.txt
+-r requirements/runtime.txt
+-r requirements/tests.txt
diff --git a/mmde/requirements/build.txt b/mmde/requirements/build.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mmde/requirements/docs.txt b/mmde/requirements/docs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9dca39d57a7c90dc799c5fceac6d98a931471534
--- /dev/null
+++ b/mmde/requirements/docs.txt
@@ -0,0 +1,10 @@
+docutils==0.16.0
+markdown>=3.4.0
+myst-parser
+-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
+sphinx==4.0.2
+sphinx-tabs
+sphinx_copybutton
+sphinx_markdown_tables>=0.0.16
+tabulate
+urllib3<2.0.0
diff --git a/mmde/requirements/mminstall.txt b/mmde/requirements/mminstall.txt
new file mode 100644
index 0000000000000000000000000000000000000000..066a0ae47d0387a3fe51c2fcc0b659321c5cf88a
--- /dev/null
+++ b/mmde/requirements/mminstall.txt
@@ -0,0 +1,3 @@
+mmcv>=2.0.0rc4,<2.2.0
+mmdet>=3.0.0,<3.3.0
+mmengine>=0.7.1,<1.0.0
diff --git a/mmde/requirements/optional.txt b/mmde/requirements/optional.txt
new file mode 100644
index 0000000000000000000000000000000000000000..099ad8a201a6ecc6375509b79beef7028e076da1
--- /dev/null
+++ b/mmde/requirements/optional.txt
@@ -0,0 +1,3 @@
+black==20.8b1 # be compatible with typing-extensions 3.7.4
+typing-extensions # required by tensorflow<=2.6
+waymo-open-dataset-tf-2-6-0 # requires python>=3.7
diff --git a/mmde/requirements/readthedocs.txt b/mmde/requirements/readthedocs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4a334703ef7899376579b6f6ee0b52934f4f793b
--- /dev/null
+++ b/mmde/requirements/readthedocs.txt
@@ -0,0 +1,5 @@
+mmcv>=2.0.0rc4
+mmdet>=3.0.0
+mmengine>=0.7.1
+torch
+torchvision
diff --git a/mmde/requirements/runtime.txt b/mmde/requirements/runtime.txt
new file mode 100644
index 0000000000000000000000000000000000000000..705f7f49dbd649bfad3b99b067a098a5b7ba5c2e
--- /dev/null
+++ b/mmde/requirements/runtime.txt
@@ -0,0 +1,11 @@
+lyft_dataset_sdk
+networkx>=2.5
+numba # you should install numba==0.53.0 if your environment is cuda-9.0
+numpy
+nuscenes-devkit
+open3d
+plyfile
+scikit-image
+# by default we also use tensorboard to log results
+tensorboard
+trimesh
diff --git a/mmde/requirements/tests.txt b/mmde/requirements/tests.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e358587a4943c0dfd248be96d905bc370f5fa459
--- /dev/null
+++ b/mmde/requirements/tests.txt
@@ -0,0 +1,13 @@
+codecov
+flake8
+interrogate
+isort
+# Note: used for kwarray.group_items, this may be ported to mmcv in the future.
+kwarray
+parameterized
+pytest
+pytest-cov
+pytest-runner
+ubelt
+xdoctest >= 0.10.0
+yapf
diff --git a/mmde/resources/mmdet3d_outdoor_demo.gif b/mmde/resources/mmdet3d_outdoor_demo.gif
new file mode 100644
index 0000000000000000000000000000000000000000..1c7541a98612a38d3bc34cd2c61ad5ea4f7c6bb6
Binary files /dev/null and b/mmde/resources/mmdet3d_outdoor_demo.gif differ
diff --git a/mmde/resources/nuimages_demo.gif b/mmde/resources/nuimages_demo.gif
new file mode 100644
index 0000000000000000000000000000000000000000..7436fab01e153681c49e039d8ac2a2fe010b5596
Binary files /dev/null and b/mmde/resources/nuimages_demo.gif differ
diff --git a/mmde/resources/open3d_visual.gif b/mmde/resources/open3d_visual.gif
new file mode 100644
index 0000000000000000000000000000000000000000..02b1f869777023f0766ac64e442b0a1c70d44def
Binary files /dev/null and b/mmde/resources/open3d_visual.gif differ
diff --git a/mmde/setup.cfg b/mmde/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..1ad205d5e44f8ab724f97ed1966930d2a8fcd19f
--- /dev/null
+++ b/mmde/setup.cfg
@@ -0,0 +1,19 @@
+[yapf]
+BASED_ON_STYLE = pep8
+BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
+SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
+
+[isort]
+line_length = 79
+multi_line_output = 0
+extra_standard_library = setuptools
+known_first_party = mmdet3d
+known_third_party = cv2,imageio,indoor3d_util,load_scannet_data,lyft_dataset_sdk,m2r,matplotlib,mmcv,mmdet,mmengine,nuimages,numba,numpy,nuscenes,pandas,plyfile,pycocotools,pyquaternion,pytest,pytorch_sphinx_theme,recommonmark,requests,scannet_utils,scipy,seaborn,shapely,skimage,sphinx,tensorflow,terminaltables,torch,trimesh,ts,waymo_open_dataset
+no_lines_before = STDLIB,LOCALFOLDER
+default_section = THIRDPARTY
+
+[codespell]
+ignore-words-list = ans,refridgerator,crate,hist,formating,dout,wan,nd,fo,avod,AVOD,warmup
+
+[flake8]
+per-file-ignores = mmdet3d/configs/*:F401,F403,F405
diff --git a/mmde/setup.py b/mmde/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..86ad8a266d05288fda0e97cd3ed9c22580f0fba5
--- /dev/null
+++ b/mmde/setup.py
@@ -0,0 +1,227 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import platform
+import shutil
+import sys
+import warnings
+from os import path as osp
+from setuptools import find_packages, setup
+
+import torch
+from torch.utils.cpp_extension import (BuildExtension, CppExtension,
+                                       CUDAExtension)
+
+
+def readme():
+    with open('README.md', encoding='utf-8') as f:
+        content = f.read()
+    return content
+
+
+version_file = 'mmdet3d/version.py'
+
+
+def get_version():
+    with open(version_file, 'r') as f:
+        exec(compile(f.read(), version_file, 'exec'))
+    return locals()['__version__']
+
+
+def make_cuda_ext(name,
+                  module,
+                  sources,
+                  sources_cuda=[],
+                  extra_args=[],
+                  extra_include_path=[]):
+
+    define_macros = []
+    extra_compile_args = {'cxx': [] + extra_args}
+
+    if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
+        define_macros += [('WITH_CUDA', None)]
+        extension = CUDAExtension
+        extra_compile_args['nvcc'] = extra_args + [
+            '-D__CUDA_NO_HALF_OPERATORS__',
+            '-D__CUDA_NO_HALF_CONVERSIONS__',
+            '-D__CUDA_NO_HALF2_OPERATORS__',
+        ]
+        sources += sources_cuda
+    else:
+        print('Compiling {} without CUDA'.format(name))
+        extension = CppExtension
+        # raise EnvironmentError('CUDA is required to compile MMDetection!')
+
+    return extension(
+        name='{}.{}'.format(module, name),
+        sources=[os.path.join(*module.split('.'), p) for p in sources],
+        include_dirs=extra_include_path,
+        define_macros=define_macros,
+        extra_compile_args=extra_compile_args)
+
+
+def parse_requirements(fname='requirements.txt', with_version=True):
+    """Parse the package dependencies listed in a requirements file but strips
+    specific versioning information.
+
+    Args:
+        fname (str): path to requirements file
+        with_version (bool, default=False): if True include version specs
+
+    Returns:
+        list[str]: list of requirements items
+
+    CommandLine:
+        python -c "import setup; print(setup.parse_requirements())"
+    """
+    import re
+    import sys
+    from os.path import exists
+    require_fpath = fname
+
+    def parse_line(line):
+        """Parse information from a line in a requirements text file."""
+        if line.startswith('-r '):
+            # Allow specifying requirements in other files
+            target = line.split(' ')[1]
+            for info in parse_require_file(target):
+                yield info
+        else:
+            info = {'line': line}
+            if line.startswith('-e '):
+                info['package'] = line.split('#egg=')[1]
+            else:
+                # Remove versioning from the package
+                pat = '(' + '|'.join(['>=', '==', '>']) + ')'
+                parts = re.split(pat, line, maxsplit=1)
+                parts = [p.strip() for p in parts]
+
+                info['package'] = parts[0]
+                if len(parts) > 1:
+                    op, rest = parts[1:]
+                    if ';' in rest:
+                        # Handle platform specific dependencies
+                        # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies
+                        version, platform_deps = map(str.strip,
+                                                     rest.split(';'))
+                        info['platform_deps'] = platform_deps
+                    else:
+                        version = rest  # NOQA
+                    info['version'] = (op, version)
+            yield info
+
+    def parse_require_file(fpath):
+        with open(fpath, 'r') as f:
+            for line in f.readlines():
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    for info in parse_line(line):
+                        yield info
+
+    def gen_packages_items():
+        if exists(require_fpath):
+            for info in parse_require_file(require_fpath):
+                parts = [info['package']]
+                if with_version and 'version' in info:
+                    parts.extend(info['version'])
+                if not sys.version.startswith('3.4'):
+                    # apparently package_deps are broken in 3.4
+                    platform_deps = info.get('platform_deps')
+                    if platform_deps is not None:
+                        parts.append(';' + platform_deps)
+                item = ''.join(parts)
+                yield item
+
+    packages = list(gen_packages_items())
+    return packages
+
+
+def add_mim_extention():
+    """Add extra files that are required to support MIM into the package.
+
+    These files will be added by creating a symlink to the originals if the
+    package is installed in `editable` mode (e.g. pip install -e .), or by
+    copying from the originals otherwise.
+    """
+
+    # parse installment mode
+    if 'develop' in sys.argv:
+        # installed by `pip install -e .`
+        if platform.system() == 'Windows':
+            # set `copy` mode here since symlink fails on Windows.
+            mode = 'copy'
+        else:
+            mode = 'symlink'
+    elif 'sdist' in sys.argv or 'bdist_wheel' in sys.argv:
+        # installed by `pip install .`
+        # or create source distribution by `python setup.py sdist`
+        mode = 'copy'
+    else:
+        return
+
+    filenames = [
+        'tools', 'configs', 'demo', 'model-index.yml', 'dataset-index.yml'
+    ]
+    repo_path = osp.dirname(__file__)
+    mim_path = osp.join(repo_path, 'mmdet3d', '.mim')
+    os.makedirs(mim_path, exist_ok=True)
+
+    for filename in filenames:
+        if osp.exists(filename):
+            src_path = osp.join(repo_path, filename)
+            tar_path = osp.join(mim_path, filename)
+
+            if osp.isfile(tar_path) or osp.islink(tar_path):
+                os.remove(tar_path)
+            elif osp.isdir(tar_path):
+                shutil.rmtree(tar_path)
+
+            if mode == 'symlink':
+                src_relpath = osp.relpath(src_path, osp.dirname(tar_path))
+                os.symlink(src_relpath, tar_path)
+            elif mode == 'copy':
+                if osp.isfile(src_path):
+                    shutil.copyfile(src_path, tar_path)
+                elif osp.isdir(src_path):
+                    shutil.copytree(src_path, tar_path)
+                else:
+                    warnings.warn(f'Cannot copy file {src_path}.')
+            else:
+                raise ValueError(f'Invalid mode {mode}')
+
+
+if __name__ == '__main__':
+    add_mim_extention()
+    setup(
+        name='mmdet3d',
+        version=get_version(),
+        description=("OpenMMLab's next-generation platform"
+                     'for general 3D object detection.'),
+        long_description=readme(),
+        long_description_content_type='text/markdown',
+        author='MMDetection3D Contributors',
+        author_email='zwwdev@gmail.com',
+        keywords='computer vision, 3D object detection',
+        url='https://github.com/open-mmlab/mmdetection3d',
+        packages=find_packages(exclude=('configs', 'tools', 'demo')),
+        include_package_data=True,
+        classifiers=[
+            'Development Status :: 5 - Production/Stable',
+            'License :: OSI Approved :: Apache Software License',
+            'Operating System :: OS Independent',
+            'Programming Language :: Python :: 3',
+            'Programming Language :: Python :: 3.7',
+            'Programming Language :: Python :: 3.8',
+            'Programming Language :: Python :: 3.9',
+        ],
+        license='Apache License 2.0',
+        install_requires=parse_requirements('requirements/runtime.txt'),
+        extras_require={
+            'all': parse_requirements('requirements.txt'),
+            'tests': parse_requirements('requirements/tests.txt'),
+            'build': parse_requirements('requirements/build.txt'),
+            'optional': parse_requirements('requirements/optional.txt'),
+            'mim': parse_requirements('requirements/mminstall.txt'),
+        },
+        ext_modules=[],
+        cmdclass={'build_ext': BuildExtension},
+        zip_safe=False)
diff --git a/mmde/testmodel.py b/mmde/testmodel.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f297392942fad84ea62e1a960851c29d1a4367
--- /dev/null
+++ b/mmde/testmodel.py
@@ -0,0 +1,32 @@
+import torch
+from mmengine.config import Config
+from mmdet3d.registry import MODELS
+
+# 1. 唤醒 MMDetection3D 的全家桶注册表 (修复报错的这行极其关键！)
+from mmdet3d.utils import register_all_modules
+register_all_modules(init_default_scope=True)
+
+# 2. 显式导入 BEVFusion 项目，触发自定义算子和模块的注册！
+import projects.BEVFusion.bevfusion
+
+print("🔍 正在解析 BEVFusion 配置文件...")
+# 使用官方提供的默认配置文件
+config_file = 'projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py'
+cfg = Config.fromfile(config_file)
+
+print("🧱 正在海光 DCU 上构建 BEVFusion 模型架构...")
+try:
+    # 实例化模型
+    model = MODELS.build(cfg.model)
+    
+    # 推入海光 GPU (DCU) 显存
+    model.cuda()
+    
+    # 打印一下网络参数量，确认实体存在
+    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"\n✅ 帅！模型在显存中构建成功！")
+    print(f"📊 模型总可训练参数量: {num_params / 1e6:.2f} M (百万)")
+    print("🚀 恭喜！高层 API 与配置文件解析完美通关！")
+    
+except Exception as e:
+    print(f"\n❌ 模型构建失败，报错信息如下:\n{e}")
\ No newline at end of file
diff --git a/mmde/tests/test_apis/test_inferencers/test_lidar_det3d_inferencer.py b/mmde/tests/test_apis/test_inferencers/test_lidar_det3d_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4fc77160685c029a21c2514c9ff78a08de88e70
--- /dev/null
+++ b/mmde/tests/test_apis/test_inferencers/test_lidar_det3d_inferencer.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+from unittest import TestCase
+
+import mmengine
+import numpy as np
+import torch
+from mmengine.utils import is_list_of
+
+from mmdet3d.apis import LidarDet3DInferencer
+from mmdet3d.structures import Det3DDataSample
+
+
+class TestLidarDet3DInferencer(TestCase):
+
+    def setUp(self):
+        # init from alias
+        self.inferencer = LidarDet3DInferencer('pointpillars_kitti-3class')
+
+    def test_init(self):
+        # init from metafile
+        LidarDet3DInferencer('pointpillars_waymod5-3class')
+        # init from cfg
+        LidarDet3DInferencer(
+            'configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py',  # noqa
+            weights=  # noqa
+            'https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20220301_150306-37dc2420.pth'  # noqa
+        )
+
+    def assert_predictions_equal(self, preds1, preds2):
+        for pred1, pred2 in zip(preds1, preds2):
+            if 'bboxes_3d' in pred1:
+                self.assertTrue(
+                    np.allclose(pred1['bboxes_3d'], pred2['bboxes_3d'], 0.1))
+            if 'scores_3d' in pred1:
+                self.assertTrue(
+                    np.allclose(pred1['scores_3d'], pred2['scores_3d'], 0.1))
+            if 'labels_3d' in pred1:
+                self.assertTrue(
+                    np.allclose(pred1['labels_3d'], pred2['labels_3d']))
+
+    def test_call(self):
+        if not torch.cuda.is_available():
+            return
+        # single point cloud
+        inputs = dict(points='tests/data/kitti/training/velodyne/000000.bin')
+        res_path = self.inferencer(inputs, return_vis=True)
+        # ndarray
+        pts_bytes = mmengine.fileio.get(inputs['points'])
+        points = np.frombuffer(pts_bytes, dtype=np.float32)
+        points = points.reshape(-1, 4)
+        points = points[:, :4]
+        inputs = dict(points=points)
+        res_ndarray = self.inferencer(inputs, return_vis=True)
+        self.assert_predictions_equal(res_path['predictions'],
+                                      res_ndarray['predictions'])
+        self.assertIn('visualization', res_path)
+        self.assertIn('visualization', res_ndarray)
+
+        # multiple point clouds
+        inputs = [
+            dict(points='tests/data/kitti/training/velodyne/000000.bin'),
+            dict(points='tests/data/kitti/training/velodyne/000000.bin')
+        ]
+        res_path = self.inferencer(inputs, return_vis=True)
+        # list of ndarray
+        all_points = []
+        for p in inputs:
+            pts_bytes = mmengine.fileio.get(p['points'])
+            points = np.frombuffer(pts_bytes, dtype=np.float32)
+            points = points.reshape(-1, 4)
+            all_points.append(dict(points=points))
+        res_ndarray = self.inferencer(all_points, return_vis=True)
+        self.assert_predictions_equal(res_path['predictions'],
+                                      res_ndarray['predictions'])
+        self.assertIn('visualization', res_path)
+        self.assertIn('visualization', res_ndarray)
+
+        # point cloud dir, test different batch sizes
+        pc_dir = dict(points='tests/data/kitti/training/velodyne/')
+        res_bs2 = self.inferencer(pc_dir, batch_size=2, return_vis=True)
+        self.assertIn('visualization', res_bs2)
+        self.assertIn('predictions', res_bs2)
+
+    def test_visualize(self):
+        if not torch.cuda.is_available():
+            return
+        inputs = dict(points='tests/data/kitti/training/velodyne/000000.bin'),
+        # img_out_dir
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            self.inferencer(inputs, out_dir=tmp_dir)
+            # TODO: For LiDAR-based detection, the saved image only exists when
+            # show=True.
+            # self.assertTrue(osp.exists(osp.join(tmp_dir, '000000.png')))
+
+    def test_postprocess(self):
+        if not torch.cuda.is_available():
+            return
+        # return_datasample
+        inputs = dict(points='tests/data/kitti/training/velodyne/000000.bin')
+        res = self.inferencer(inputs, return_datasamples=True)
+        self.assertTrue(is_list_of(res['predictions'], Det3DDataSample))
+
+        # pred_out_dir
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            res = self.inferencer(inputs, print_result=True, out_dir=tmp_dir)
+            dumped_res = mmengine.load(
+                osp.join(tmp_dir, 'preds', '000000.json'))
+            self.assertEqual(res['predictions'][0], dumped_res)
diff --git a/mmde/tests/test_apis/test_inferencers/test_lidar_seg3d_inferencer.py b/mmde/tests/test_apis/test_inferencers/test_lidar_seg3d_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bbdb037a1bc1e9009cefc4b075421898b97f3cb
--- /dev/null
+++ b/mmde/tests/test_apis/test_inferencers/test_lidar_seg3d_inferencer.py
@@ -0,0 +1,109 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import tempfile
+from unittest import TestCase
+
+import mmengine
+import numpy as np
+import pytest
+import torch
+from mmengine.utils import is_list_of
+
+from mmdet3d.apis import LidarSeg3DInferencer
+from mmdet3d.structures import Det3DDataSample
+
+
+class TestLiDARSeg3DInferencer(TestCase):
+
+    def setUp(self):
+        # init from alias
+        self.inferencer = LidarSeg3DInferencer('pointnet2-ssg_s3dis-seg')
+
+    def test_init(self):
+        # init from metafile
+        LidarSeg3DInferencer('pointnet2-ssg_s3dis-seg')
+        # init from cfg
+        LidarSeg3DInferencer(
+            'configs/pointnet2/pointnet2_ssg_2xb16-cosine-50e_s3dis-seg.py',
+            'https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointnet2/pointnet2_ssg_16x2_cosine_50e_s3dis_seg-3d-13class/pointnet2_ssg_16x2_cosine_50e_s3dis_seg-3d-13class_20210514_144205-995d0119.pth'  # noqa
+        )
+
+    def assert_predictions_equal(self, preds1, preds2):
+        for pred1, pred2 in zip(preds1, preds2):
+            self.assertTrue(
+                np.allclose(pred1['pts_semantic_mask'],
+                            pred2['pts_semantic_mask']))
+
+    @pytest.mark.skipif(
+        not torch.cuda.is_available(), reason='requires CUDA support')
+    @pytest.mark.skipif(
+        'DISPLAY' not in os.environ, reason='requires DISPLAY device')
+    def test_call(self):
+        # single point cloud
+        inputs = dict(points='tests/data/s3dis/points/Area_1_office_2.bin')
+        torch.manual_seed(0)
+        res_path = self.inferencer(inputs, return_vis=True)
+        # ndarray
+        pts_bytes = mmengine.fileio.get(inputs['points'])
+        points = np.frombuffer(pts_bytes, dtype=np.float32)
+        points = points.reshape(-1, 6)
+        inputs = dict(points=points)
+        torch.manual_seed(0)
+        res_ndarray = self.inferencer(inputs, return_vis=True)
+        self.assert_predictions_equal(res_path['predictions'],
+                                      res_ndarray['predictions'])
+        self.assertIn('visualization', res_path)
+        self.assertIn('visualization', res_ndarray)
+
+        # multiple point clouds
+        inputs = [
+            dict(points='tests/data/s3dis/points/Area_1_office_2.bin'),
+            dict(points='tests/data/s3dis/points/Area_1_office_2.bin')
+        ]
+        torch.manual_seed(0)
+        res_path = self.inferencer(inputs, return_vis=True)
+        # list of ndarray
+        all_points = []
+        for p in inputs:
+            pts_bytes = mmengine.fileio.get(p['points'])
+            points = np.frombuffer(pts_bytes, dtype=np.float32)
+            points = points.reshape(-1, 6)
+            all_points.append(dict(points=points))
+        torch.manual_seed(0)
+        res_ndarray = self.inferencer(all_points, return_vis=True)
+        self.assert_predictions_equal(res_path['predictions'],
+                                      res_ndarray['predictions'])
+        self.assertIn('visualization', res_path)
+        self.assertIn('visualization', res_ndarray)
+
+        # point cloud dir, test different batch sizes
+        pc_dir = dict(points='tests/data/s3dis/points/')
+        res_bs2 = self.inferencer(pc_dir, batch_size=2, return_vis=True)
+        self.assertIn('visualization', res_bs2)
+        self.assertIn('predictions', res_bs2)
+
+    @pytest.mark.skipif(
+        not torch.cuda.is_available(), reason='requires CUDA support')
+    @pytest.mark.skipif(
+        'DISPLAY' not in os.environ, reason='requires DISPLAY device')
+    def test_visualizer(self):
+        inputs = dict(points='tests/data/s3dis/points/Area_1_office_2.bin')
+        # img_out_dir
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            self.inferencer(inputs, out_dir=tmp_dir)
+
+    def test_post_processor(self):
+        if not torch.cuda.is_available():
+            return
+        # return_datasample
+        inputs = dict(points='tests/data/s3dis/points/Area_1_office_2.bin')
+        res = self.inferencer(inputs, return_datasamples=True)
+        self.assertTrue(is_list_of(res['predictions'], Det3DDataSample))
+
+        # pred_out_dir
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            res = self.inferencer(inputs, print_result=True, out_dir=tmp_dir)
+            dumped_res = mmengine.load(
+                osp.join(tmp_dir, 'preds', 'Area_1_office_2.json'))
+            self.assertEqual(res['predictions'][0], dumped_res)
diff --git a/mmde/tests/test_apis/test_inferencers/test_mono_det3d_inferencer.py b/mmde/tests/test_apis/test_inferencers/test_mono_det3d_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d30f156485fa0ae5ba3c1014fdd87839f906fd08
--- /dev/null
+++ b/mmde/tests/test_apis/test_inferencers/test_mono_det3d_inferencer.py
@@ -0,0 +1,108 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+from unittest import TestCase
+
+import mmcv
+import mmengine
+import numpy as np
+from mmengine.utils import is_list_of
+from parameterized import parameterized
+
+from mmdet3d.apis import MonoDet3DInferencer
+from mmdet3d.structures import Det3DDataSample
+
+
+class TestMonoDet3DInferencer(TestCase):
+
+    def test_init(self):
+        # init from metafile
+        MonoDet3DInferencer('pgd_kitti')
+        # init from cfg
+        MonoDet3DInferencer(
+            'configs/pgd/pgd_r101-caffe_fpn_head-gn_4xb3-4x_kitti-mono3d.py',
+            'https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/'
+            'pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d/'
+            'pgd_r101_caffe_fpn_gn-head_3x4_4x_kitti-mono3d_'
+            '20211022_102608-8a97533b.pth')
+
+    def assert_predictions_equal(self, preds1, preds2):
+        for pred1, pred2 in zip(preds1, preds2):
+            if 'bboxes_3d' in pred1:
+                self.assertTrue(
+                    np.allclose(pred1['bboxes_3d'], pred2['bboxes_3d'], 0.1))
+            if 'scores_3d' in pred1:
+                self.assertTrue(
+                    np.allclose(pred1['scores_3d'], pred2['scores_3d'], 0.1))
+            if 'labels_3d' in pred1:
+                self.assertTrue(
+                    np.allclose(pred1['labels_3d'], pred2['labels_3d']))
+
+    @parameterized.expand(['pgd_kitti'])
+    def test_call(self, model):
+        # single img
+        img_path = 'demo/data/kitti/000008.png'
+        infos_path = 'demo/data/kitti/000008.pkl'
+        inferencer = MonoDet3DInferencer(model)
+        inputs = dict(img=img_path, infos=infos_path)
+        res_path = inferencer(inputs, return_vis=True)
+        # ndarray
+        img = mmcv.imread(img_path)
+        inputs = dict(img=img, infos=infos_path)
+        res_ndarray = inferencer(inputs, return_vis=True)
+        self.assert_predictions_equal(res_path['predictions'],
+                                      res_ndarray['predictions'])
+        self.assertIn('visualization', res_path)
+        self.assertIn('visualization', res_ndarray)
+
+        # multiple images
+        inputs = [
+            dict(
+                img='demo/data/kitti/000008.png',
+                infos='demo/data/kitti/000008.pkl'),
+            dict(
+                img='demo/data/kitti/000008.png',
+                infos='demo/data/kitti/000008.pkl')
+        ]
+        res_path = inferencer(inputs, return_vis=True)
+        # list of ndarray
+        imgs = [mmcv.imread(p['img']) for p in inputs]
+        inputs = [
+            dict(img=imgs[0], infos='demo/data/kitti/000008.pkl'),
+            dict(img=imgs[1], infos='demo/data/kitti/000008.pkl')
+        ]
+        res_ndarray = inferencer(inputs, return_vis=True)
+        self.assert_predictions_equal(res_path['predictions'],
+                                      res_ndarray['predictions'])
+        self.assertIn('visualization', res_path)
+        self.assertIn('visualization', res_ndarray)
+
+    @parameterized.expand(['pgd_kitti'])
+    def test_visualize(self, model):
+        inputs = dict(
+            img='demo/data/kitti/000008.png',
+            infos='demo/data/kitti/000008.pkl')
+        inferencer = MonoDet3DInferencer(model)
+        # img_out_dir
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            inferencer(inputs, out_dir=tmp_dir)
+            self.assertTrue(
+                osp.exists(osp.join(tmp_dir, 'vis_camera/CAM2/000008.png')))
+
+    @parameterized.expand(['pgd_kitti'])
+    def test_postprocess(self, model):
+        # return_datasample
+        img_path = 'demo/data/kitti/000008.png'
+        infos_path = 'demo/data/kitti/000008.pkl'
+        inputs = dict(img=img_path, infos=infos_path)
+        inferencer = MonoDet3DInferencer(model)
+        res = inferencer(inputs, return_datasamples=True)
+        self.assertTrue(is_list_of(res['predictions'], Det3DDataSample))
+
+        # pred_out_dir
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            inputs = dict(img=img_path, infos=infos_path)
+            res = inferencer(inputs, print_result=True, out_dir=tmp_dir)
+            dumped_res = mmengine.load(
+                osp.join(tmp_dir, 'preds', '000008.json'))
+            self.assertEqual(res['predictions'][0], dumped_res)
diff --git a/mmde/tests/test_apis/test_inferencers/test_multi_modality_det3d_inferencer.py b/mmde/tests/test_apis/test_inferencers/test_multi_modality_det3d_inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c51f7c764b3e1e2562b047ad3bebd9367294f5eb
--- /dev/null
+++ b/mmde/tests/test_apis/test_inferencers/test_multi_modality_det3d_inferencer.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+from unittest import TestCase
+
+import mmcv
+import mmengine
+import numpy as np
+import torch
+from mmengine.utils import is_list_of
+
+from mmdet3d.apis import MultiModalityDet3DInferencer
+from mmdet3d.structures import Det3DDataSample
+
+
+class TestMultiModalityDet3DInferencer(TestCase):
+
+    def setUp(self):
+        # init from alias
+        self.inferencer = MultiModalityDet3DInferencer('mvxnet_kitti-3class')
+
+    def test_init(self):
+        # init from metafile
+        MultiModalityDet3DInferencer('mvxnet_kitti-3class')
+        # init from cfg
+        MultiModalityDet3DInferencer(
+            'configs/mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py',  # noqa
+            weights=  # noqa
+            'https://download.openmmlab.com/mmdetection3d/v1.0.0_models/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20210831_060805-83442923.pth'  # noqa
+        )
+
+    def assert_predictions_equal(self, preds1, preds2):
+        for pred1, pred2 in zip(preds1, preds2):
+            if 'bboxes_3d' in pred1:
+                self.assertTrue(
+                    np.allclose(pred1['bboxes_3d'], pred2['bboxes_3d'], 0.1))
+            if 'scores_3d' in pred1:
+                self.assertTrue(
+                    np.allclose(pred1['scores_3d'], pred2['scores_3d'], 0.1))
+            if 'labels_3d' in pred1:
+                self.assertTrue(
+                    np.allclose(pred1['labels_3d'], pred2['labels_3d']))
+
+    def test_call(self):
+        if not torch.cuda.is_available():
+            return
+        infos_path = 'demo/data/kitti/000008.pkl'
+        points_path = 'demo/data/kitti/000008.bin'
+        img_path = 'demo/data/kitti/000008.png'
+        # single img & point cloud
+        inputs = dict(points=points_path, img=img_path, infos=infos_path)
+        res_path = self.inferencer(inputs, return_vis=True)
+
+        # ndarray
+        pts_bytes = mmengine.fileio.get(inputs['points'])
+        points = np.frombuffer(pts_bytes, dtype=np.float32)
+        points = points.reshape(-1, 4)
+        points = points[:, :4]
+        img = mmcv.imread(inputs['img'])
+        inputs = dict(points=points, img=img, infos=infos_path)
+        res_ndarray = self.inferencer(inputs, return_vis=True)
+        self.assert_predictions_equal(res_path['predictions'],
+                                      res_ndarray['predictions'])
+        self.assertIn('visualization', res_path)
+        self.assertIn('visualization', res_ndarray)
+
+        # multiple imgs & point clouds
+        inputs = [
+            dict(points=points_path, img=img_path, infos=infos_path),
+            dict(points=points_path, img=img_path, infos=infos_path)
+        ]
+        res_path = self.inferencer(inputs, return_vis=True)
+        # list of ndarray
+        all_inputs = []
+        for p in inputs:
+            pts_bytes = mmengine.fileio.get(p['points'])
+            points = np.frombuffer(pts_bytes, dtype=np.float32)
+            points = points.reshape(-1, 4)
+            img = mmcv.imread(p['img'])
+            all_inputs.append(dict(points=points, img=img, infos=infos_path))
+
+        res_ndarray = self.inferencer(all_inputs, return_vis=True)
+        self.assert_predictions_equal(res_path['predictions'],
+                                      res_ndarray['predictions'])
+        self.assertIn('visualization', res_path)
+        self.assertIn('visualization', res_ndarray)
+
+    def test_visualize(self):
+        if not torch.cuda.is_available():
+            return
+        inputs = dict(
+            points='demo/data/kitti/000008.bin',
+            img='demo/data/kitti/000008.png',
+            infos='demo/data/kitti/000008.pkl'),
+        # img_out_dir
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            self.inferencer(inputs, out_dir=tmp_dir)
+            # TODO: For results of LiDAR-based detection, the saved image only
+            # exists when show=True.
+            # self.assertTrue(osp.exists(osp.join(tmp_dir, '000000.png')))
+
+    def test_postprocess(self):
+        if not torch.cuda.is_available():
+            return
+        # return_datasample
+        infos_path = 'demo/data/kitti/000008.pkl'
+        points_path = 'demo/data/kitti/000008.bin'
+        img_path = 'demo/data/kitti/000008.png'
+        # single img & point cloud
+        inputs = dict(points=points_path, img=img_path, infos=infos_path)
+        res = self.inferencer(inputs, return_datasamples=True)
+        self.assertTrue(is_list_of(res['predictions'], Det3DDataSample))
+
+        # pred_out_dir
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            inputs = dict(points=points_path, img=img_path, infos=infos_path)
+            res = self.inferencer(inputs, print_result=True, out_dir=tmp_dir)
+            dumped_res = mmengine.load(
+                osp.join(tmp_dir, 'preds', '000008.json'))
+            self.assertEqual(res['predictions'][0], dumped_res)
diff --git a/mmde/tests/test_datasets/test_dataset_wrappers.py b/mmde/tests/test_datasets/test_dataset_wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fd8eeab4935d4b7ac4fe3945f86eb8629e3e897
--- /dev/null
+++ b/mmde/tests/test_datasets/test_dataset_wrappers.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+
+import numpy as np
+import pytest
+from mmcv.transforms.base import BaseTransform
+from mmengine.structures import InstanceData
+
+from mmdet3d.datasets import CBGSDataset, NuScenesDataset
+from mmdet3d.registry import DATASETS, TRANSFORMS
+from mmdet3d.structures import Det3DDataSample
+
+
+def is_equal(dict_a, dict_b):
+    for key in dict_a:
+        if key not in dict_b:
+            return False
+        if isinstance(dict_a[key], dict):
+            return is_equal(dict_a[key], dict_b[key])
+        elif isinstance(dict_a[key], np.ndarray):
+            if not (dict_a[key] == dict_b[key]).any():
+                return False
+        else:
+            if not (dict_a[key] == dict_b[key]):
+                return False
+    return True
+
+
+@TRANSFORMS.register_module()
+class Identity(BaseTransform):
+
+    def transform(self, info):
+        packed_input = dict(data_samples=Det3DDataSample())
+        if 'ann_info' in info:
+            packed_input['data_samples'].gt_instances_3d = InstanceData()
+            packed_input['data_samples'].gt_instances_3d.labels_3d = info[
+                'ann_info']['gt_labels_3d']
+        return packed_input
+
+
+@DATASETS.register_module()
+class CustomDataset(NuScenesDataset):
+    pass
+
+
+class TestCBGSDataset:
+
+    def setup(self):
+        dataset = NuScenesDataset
+        self.dataset = dataset(
+            data_root=osp.join(osp.dirname(__file__), '../data/nuscenes'),
+            ann_file='nus_info.pkl',
+            data_prefix=dict(
+                pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP'),
+            pipeline=[dict(type=Identity)])
+
+        self.sample_indices = [0, 0, 1, 1, 1]
+        # test init
+        self.cbgs_datasets = CBGSDataset(dataset=self.dataset)
+        self.cbgs_datasets.sample_indices = self.sample_indices
+
+    def test_init(self):
+        # Test build dataset from cfg
+        dataset_cfg = dict(
+            type=CustomDataset,
+            data_root=osp.join(osp.dirname(__file__), '../data/nuscenes'),
+            ann_file='nus_info.pkl',
+            data_prefix=dict(
+                pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP'),
+            pipeline=[dict(type=Identity)])
+        cbgs_datasets = CBGSDataset(dataset=dataset_cfg)
+        cbgs_datasets.sample_indices = self.sample_indices
+        cbgs_datasets.dataset.pipeline = self.dataset.pipeline
+        assert len(cbgs_datasets) == len(self.cbgs_datasets)
+        for i in range(len(cbgs_datasets)):
+            assert is_equal(
+                cbgs_datasets.get_data_info(i),
+                self.cbgs_datasets.get_data_info(i))
+            assert (cbgs_datasets[i]['data_samples'].gt_instances_3d.labels_3d
+                    == self.cbgs_datasets[i]
+                    ['data_samples'].gt_instances_3d.labels_3d).any()
+
+        with pytest.raises(TypeError):
+            CBGSDataset(dataset=[0])
+
+    def test_full_init(self):
+        self.cbgs_datasets.full_init()
+        self.cbgs_datasets.sample_indices = self.sample_indices
+        assert len(self.cbgs_datasets) == len(self.sample_indices)
+        # Reinit `sample_indices`
+        self.cbgs_datasets._fully_initialized = False
+        self.cbgs_datasets.sample_indices = self.sample_indices
+        assert len(self.cbgs_datasets) != len(self.sample_indices)
+
+        with pytest.raises(NotImplementedError):
+            self.cbgs_datasets.get_subset_(1)
+
+        with pytest.raises(NotImplementedError):
+            self.cbgs_datasets.get_subset(1)
+
+    def test_metainfo(self):
+        assert self.cbgs_datasets.metainfo == self.dataset.metainfo
+
+    def test_length(self):
+        assert len(self.cbgs_datasets) == len(self.sample_indices)
+
+    def test_getitem(self):
+        for i in range(len(self.sample_indices)):
+            assert (self.cbgs_datasets[i]['data_samples'].gt_instances_3d.
+                    labels_3d == self.dataset[self.sample_indices[i]]
+                    ['data_samples'].gt_instances_3d.labels_3d).any()
+
+    def test_get_data_info(self):
+        for i in range(len(self.sample_indices)):
+            assert is_equal(
+                self.cbgs_datasets.get_data_info(i),
+                self.dataset.get_data_info(self.sample_indices[i]))
+
+    def test_get_cat_ids(self):
+        for i in range(len(self.sample_indices)):
+            assert self.cbgs_datasets.get_cat_ids(
+                i) == self.dataset.get_cat_ids(self.sample_indices[i])
diff --git a/mmde/tests/test_datasets/test_kitti_dataset.py b/mmde/tests/test_datasets/test_kitti_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea7488d7d365bb6c11a8171c14fd633c4c4e1d90
--- /dev/null
+++ b/mmde/tests/test_datasets/test_kitti_dataset.py
@@ -0,0 +1,108 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import numpy as np
+import torch
+from mmcv.transforms.base import BaseTransform
+from mmengine.registry import TRANSFORMS
+from mmengine.structures import InstanceData
+
+from mmdet3d.datasets import KittiDataset
+from mmdet3d.structures import Det3DDataSample, LiDARInstance3DBoxes
+
+
+def _generate_kitti_dataset_config():
+    data_root = 'tests/data/kitti'
+    ann_file = 'kitti_infos_train.pkl'
+    classes = ['Pedestrian', 'Cyclist', 'Car']
+    # wait for pipline refactor
+
+    if 'Identity' not in TRANSFORMS:
+
+        @TRANSFORMS.register_module()
+        class Identity(BaseTransform):
+
+            def transform(self, info):
+                if 'ann_info' in info:
+                    info['gt_labels_3d'] = info['ann_info']['gt_labels_3d']
+                data_sample = Det3DDataSample()
+                gt_instances_3d = InstanceData()
+                gt_instances_3d.labels_3d = info['gt_labels_3d']
+                data_sample.gt_instances_3d = gt_instances_3d
+                info['data_samples'] = data_sample
+                return info
+
+    pipeline = [
+        dict(type='Identity'),
+    ]
+
+    modality = dict(use_lidar=True, use_camera=False)
+    data_prefix = dict(pts='training/velodyne_reduced', img='training/image_2')
+    return data_root, ann_file, classes, data_prefix, pipeline, modality
+
+
+def test_getitem():
+    np.random.seed(0)
+    data_root, ann_file, classes, data_prefix, \
+        pipeline, modality, = _generate_kitti_dataset_config()
+    modality['use_camera'] = True
+
+    kitti_dataset = KittiDataset(
+        data_root,
+        ann_file,
+        data_prefix=dict(
+            pts='training/velodyne_reduced',
+            img='training/image_2',
+        ),
+        pipeline=pipeline,
+        metainfo=dict(classes=classes),
+        modality=modality)
+
+    kitti_dataset.prepare_data(0)
+    input_dict = kitti_dataset.get_data_info(0)
+    kitti_dataset[0]
+    # assert the the path should contains data_prefix and data_root
+    assert data_prefix['pts'] in input_dict['lidar_points']['lidar_path']
+    assert data_root in input_dict['lidar_points']['lidar_path']
+    for cam_id, img_info in input_dict['images'].items():
+        if 'img_path' in img_info:
+            assert data_prefix['img'] in img_info['img_path']
+            assert data_root in img_info['img_path']
+
+    ann_info = kitti_dataset.parse_ann_info(input_dict)
+
+    # assert the keys in ann_info and the type
+    assert 'instances' in ann_info
+
+    # only one instance
+    assert 'gt_labels_3d' in ann_info
+    assert ann_info['gt_labels_3d'].dtype == np.int64
+
+    assert 'gt_bboxes_3d' in ann_info
+    assert isinstance(ann_info['gt_bboxes_3d'], LiDARInstance3DBoxes)
+    assert torch.allclose(ann_info['gt_bboxes_3d'].tensor.sum(),
+                          torch.tensor(7.2650))
+    assert 'centers_2d' in ann_info
+    assert ann_info['centers_2d'].dtype == np.float32
+    assert 'depths' in ann_info
+    assert ann_info['depths'].dtype == np.float32
+
+    car_kitti_dataset = KittiDataset(
+        data_root,
+        ann_file,
+        data_prefix=dict(
+            pts='training/velodyne_reduced',
+            img='training/image_2',
+        ),
+        pipeline=pipeline,
+        metainfo=dict(classes=['Car']),
+        modality=modality)
+
+    input_dict = car_kitti_dataset.get_data_info(0)
+    ann_info = car_kitti_dataset.parse_ann_info(input_dict)
+
+    # assert the keys in ann_info and the type
+    assert 'instances' in ann_info
+    assert ann_info['gt_labels_3d'].dtype == np.int64
+    # all instance have been filtered by classes
+    assert len(ann_info['gt_labels_3d']) == 0
+    assert len(car_kitti_dataset.metainfo['classes']) == 1
diff --git a/mmde/tests/test_datasets/test_lyft_dataset.py b/mmde/tests/test_datasets/test_lyft_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b99b71819a288a4b63ccc3412d9a1c951290823e
--- /dev/null
+++ b/mmde/tests/test_datasets/test_lyft_dataset.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmcv.transforms.base import BaseTransform
+from mmengine.registry import TRANSFORMS
+from mmengine.structures import InstanceData
+
+from mmdet3d.datasets import LyftDataset
+from mmdet3d.structures import Det3DDataSample, LiDARInstance3DBoxes
+
+
+def _generate_nus_dataset_config():
+    data_root = 'tests/data/lyft'
+    ann_file = 'lyft_infos.pkl'
+    classes = [
+        'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
+        'motorcycle', 'bicycle', 'pedestrian', 'animal'
+    ]
+    if 'Identity' not in TRANSFORMS:
+
+        @TRANSFORMS.register_module()
+        class Identity(BaseTransform):
+
+            def transform(self, info):
+                packed_input = dict(data_samples=Det3DDataSample())
+                if 'ann_info' in info:
+                    packed_input[
+                        'data_samples'].gt_instances_3d = InstanceData()
+                    packed_input[
+                        'data_samples'].gt_instances_3d.labels_3d = info[
+                            'ann_info']['gt_labels_3d']
+                return packed_input
+
+    pipeline = [
+        dict(type='Identity'),
+    ]
+    modality = dict(use_lidar=True, use_camera=False)
+    data_prefix = dict(pts='lidar', img='', sweeps='sweeps/LIDAR_TOP')
+    return data_root, ann_file, classes, data_prefix, pipeline, modality
+
+
+def test_getitem():
+    np.random.seed(0)
+    data_root, ann_file, classes, data_prefix, pipeline, modality = \
+        _generate_nus_dataset_config()
+
+    lyft_dataset = LyftDataset(
+        data_root,
+        ann_file,
+        data_prefix=data_prefix,
+        pipeline=pipeline,
+        metainfo=dict(classes=classes),
+        modality=modality)
+
+    lyft_dataset.prepare_data(0)
+    input_dict = lyft_dataset.get_data_info(0)
+    # assert the the path should contains data_prefix and data_root
+    assert data_prefix['pts'] in input_dict['lidar_points']['lidar_path']
+    assert data_root in input_dict['lidar_points']['lidar_path']
+
+    ann_info = lyft_dataset.parse_ann_info(input_dict)
+
+    # assert the keys in ann_info and the type
+    assert 'gt_labels_3d' in ann_info
+    assert ann_info['gt_labels_3d'].dtype == np.int64
+    assert len(ann_info['gt_labels_3d']) == 3
+
+    assert 'gt_bboxes_3d' in ann_info
+    assert isinstance(ann_info['gt_bboxes_3d'], LiDARInstance3DBoxes)
+
+    assert len(lyft_dataset.metainfo['classes']) == 9
diff --git a/mmde/tests/test_datasets/test_nuscenes_dataset.py b/mmde/tests/test_datasets/test_nuscenes_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b85f34f4efe6dddd559e8b40cb0eb4f2736ef14
--- /dev/null
+++ b/mmde/tests/test_datasets/test_nuscenes_dataset.py
@@ -0,0 +1,81 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmcv.transforms.base import BaseTransform
+from mmengine.registry import TRANSFORMS
+from mmengine.structures import InstanceData
+
+from mmdet3d.datasets import NuScenesDataset
+from mmdet3d.structures import Det3DDataSample, LiDARInstance3DBoxes
+
+
+def _generate_nus_dataset_config():
+    data_root = 'tests/data/nuscenes'
+    ann_file = 'nus_info.pkl'
+    classes = [
+        'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+        'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+    ]
+    if 'Identity' not in TRANSFORMS:
+
+        @TRANSFORMS.register_module()
+        class Identity(BaseTransform):
+
+            def transform(self, info):
+                packed_input = dict(data_samples=Det3DDataSample())
+                if 'ann_info' in info:
+                    packed_input[
+                        'data_samples'].gt_instances_3d = InstanceData()
+                    packed_input[
+                        'data_samples'].gt_instances_3d.labels_3d = info[
+                            'ann_info']['gt_labels_3d']
+                return packed_input
+
+    pipeline = [
+        dict(type='Identity'),
+    ]
+    modality = dict(use_lidar=True, use_camera=True)
+    data_prefix = dict(
+        pts='samples/LIDAR_TOP',
+        img='samples/CAM_BACK_LEFT',
+        sweeps='sweeps/LIDAR_TOP')
+    return data_root, ann_file, classes, data_prefix, pipeline, modality
+
+
+def test_getitem():
+    np.random.seed(0)
+    data_root, ann_file, classes, data_prefix, pipeline, modality = \
+        _generate_nus_dataset_config()
+
+    nus_dataset = NuScenesDataset(
+        data_root=data_root,
+        ann_file=ann_file,
+        data_prefix=data_prefix,
+        pipeline=pipeline,
+        metainfo=dict(classes=classes),
+        modality=modality)
+
+    nus_dataset.prepare_data(0)
+    input_dict = nus_dataset.get_data_info(0)
+    # assert the the path should contains data_prefix and data_root
+    assert data_prefix['pts'] in input_dict['lidar_points']['lidar_path']
+    assert data_root in input_dict['lidar_points']['lidar_path']
+
+    for cam_id, img_info in input_dict['images'].items():
+        if 'img_path' in img_info:
+            assert data_prefix['img'] in img_info['img_path']
+            assert data_root in img_info['img_path']
+
+    ann_info = nus_dataset.parse_ann_info(input_dict)
+
+    # assert the keys in ann_info and the type
+    assert 'gt_labels_3d' in ann_info
+    assert ann_info['gt_labels_3d'].dtype == np.int64
+    assert len(ann_info['gt_labels_3d']) == 37
+
+    assert 'gt_bboxes_3d' in ann_info
+    assert isinstance(ann_info['gt_bboxes_3d'], LiDARInstance3DBoxes)
+
+    assert len(nus_dataset.metainfo['classes']) == 10
+
+    assert input_dict['token'] == 'fd8420396768425eabec9bdddf7e64b6'
+    assert input_dict['timestamp'] == 1533201470.448696
diff --git a/mmde/tests/test_datasets/test_s3dis_dataset.py b/mmde/tests/test_datasets/test_s3dis_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..542953a7794b1a05c16a549c9da9bda052c3db10
--- /dev/null
+++ b/mmde/tests/test_datasets/test_s3dis_dataset.py
@@ -0,0 +1,206 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import numpy as np
+import torch
+from mmengine.testing import assert_allclose
+
+from mmdet3d.datasets import S3DISDataset, S3DISSegDataset
+from mmdet3d.structures import DepthInstance3DBoxes
+from mmdet3d.utils import register_all_modules
+
+
+def _generate_s3dis_seg_dataset_config():
+    data_root = './tests/data/s3dis/'
+    ann_file = 's3dis_infos.pkl'
+    classes = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+    palette = [[0, 255, 0], [0, 0, 255], [0, 255, 255], [255, 255, 0],
+               [255, 0, 255], [100, 100, 255], [200, 200, 100],
+               [170, 120, 200], [255, 0, 0], [200, 100, 100], [10, 200, 100],
+               [200, 200, 200], [50, 50, 50]]
+    scene_idxs = [0 for _ in range(20)]
+    modality = dict(use_lidar=True, use_camera=False)
+    pipeline = [
+        dict(
+            type='LoadPointsFromFile',
+            coord_type='DEPTH',
+            shift_height=False,
+            use_color=True,
+            load_dim=6,
+            use_dim=[0, 1, 2, 3, 4, 5]),
+        dict(
+            type='LoadAnnotations3D',
+            with_bbox_3d=False,
+            with_label_3d=False,
+            with_mask_3d=False,
+            with_seg_3d=True),
+        dict(type='PointSegClassMapping'),
+        dict(
+            type='IndoorPatchPointSample',
+            num_points=5,
+            block_size=1.0,
+            ignore_index=len(classes),
+            use_normalized_coord=True,
+            enlarge_size=0.2,
+            min_unique_num=None),
+        dict(type='NormalizePointsColor', color_mean=None),
+        dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+    ]
+
+    data_prefix = dict(
+        pts='points',
+        pts_instance_mask='instance_mask',
+        pts_semantic_mask='semantic_mask')
+
+    return (data_root, ann_file, classes, palette, scene_idxs, data_prefix,
+            pipeline, modality)
+
+
+def _generate_s3dis_dataset_config():
+    data_root = 'tests/data/s3dis'
+    ann_file = 's3dis_infos.pkl'
+    classes = ('table', 'chair', 'sofa', 'bookcase', 'board')
+    modality = dict(use_lidar=True, use_camera=False)
+    pipeline = [
+        dict(
+            type='LoadPointsFromFile',
+            coord_type='DEPTH',
+            shift_height=False,
+            use_color=True,
+            load_dim=6,
+            use_dim=[0, 1, 2, 3, 4, 5]),
+        dict(
+            type='LoadAnnotations3D',
+            with_bbox_3d=True,
+            with_label_3d=True,
+            with_mask_3d=True,
+            with_seg_3d=True),
+        dict(type='PointSegClassMapping'),
+        dict(type='PointSample', num_points=5),
+        dict(
+            type='RandomFlip3D',
+            sync_2d=False,
+            flip_ratio_bev_horizontal=1.0,
+            flip_ratio_bev_vertical=1.0),
+        dict(
+            type='GlobalRotScaleTrans',
+            rot_range=[-0.087266, 0.087266],
+            scale_ratio_range=[1.0, 1.0]),
+        dict(type='NormalizePointsColor', color_mean=None),
+        dict(
+            type='Pack3DDetInputs',
+            keys=[
+                'points', 'pts_semantic_mask', 'gt_bboxes_3d', 'gt_labels_3d',
+                'pts_instance_mask'
+            ])
+    ]
+    data_prefix = dict(
+        pts='points',
+        pts_instance_mask='instance_mask',
+        pts_semantic_mask='semantic_mask')
+    return data_root, ann_file, classes, data_prefix, pipeline, modality
+
+
+class TestS3DISDataset(unittest.TestCase):
+
+    def test_s3dis(self):
+        np.random.seed(0)
+        data_root, ann_file, classes, data_prefix, \
+            pipeline, modality = _generate_s3dis_dataset_config()
+        register_all_modules()
+        s3dis_dataset = S3DISDataset(
+            data_root,
+            ann_file,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            metainfo=dict(classes=classes),
+            modality=modality)
+
+        s3dis_dataset.prepare_data(0)
+        input_dict = s3dis_dataset.get_data_info(0)
+        s3dis_dataset[0]
+        # assert the path should contains data_prefix and data_root
+        self.assertIn(data_prefix['pts'],
+                      input_dict['lidar_points']['lidar_path'])
+        self.assertIn(data_root, input_dict['lidar_points']['lidar_path'])
+
+        ann_info = s3dis_dataset.parse_ann_info(input_dict)
+
+        # assert the keys in ann_info and the type
+        except_label = np.array([1, 1, 3, 1, 2, 0, 0, 0, 3])
+
+        self.assertEqual(ann_info['gt_labels_3d'].dtype, np.int64)
+        assert_allclose(ann_info['gt_labels_3d'], except_label)
+        self.assertIsInstance(ann_info['gt_bboxes_3d'], DepthInstance3DBoxes)
+        assert len(ann_info['gt_bboxes_3d']) == 9
+        assert torch.allclose(ann_info['gt_bboxes_3d'].tensor.sum(),
+                              torch.tensor([63.0455]))
+
+        no_class_s3dis_dataset = S3DISDataset(
+            data_root, ann_file, metainfo=dict(classes=['table']))
+
+        input_dict = no_class_s3dis_dataset.get_data_info(0)
+        ann_info = no_class_s3dis_dataset.parse_ann_info(input_dict)
+
+        # assert the keys in ann_info and the type
+        self.assertIn('gt_labels_3d', ann_info)
+        # assert mapping to -1 or 1
+        assert (ann_info['gt_labels_3d'] <= 0).all()
+        self.assertEqual(ann_info['gt_labels_3d'].dtype, np.int64)
+        # all instance have been filtered by classes
+        self.assertEqual(len(ann_info['gt_labels_3d']), 9)
+        self.assertEqual(len(no_class_s3dis_dataset.metainfo['classes']), 1)
+
+    def test_s3dis_seg(self):
+        data_root, ann_file, classes, palette, scene_idxs, data_prefix, \
+            pipeline, modality, = _generate_s3dis_seg_dataset_config()
+
+        register_all_modules()
+        np.random.seed(0)
+
+        s3dis_seg_dataset = S3DISSegDataset(
+            data_root,
+            ann_file,
+            metainfo=dict(classes=classes, palette=palette),
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            modality=modality,
+            scene_idxs=scene_idxs)
+
+        input_dict = s3dis_seg_dataset.prepare_data(0)
+
+        points = input_dict['inputs']['points']
+        data_sample = input_dict['data_samples']
+        pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask
+
+        expected_points = torch.tensor([[
+            0.0000, 0.0000, 3.1720, 0.4706, 0.4431, 0.3725, 0.4624, 0.7502,
+            0.9543
+        ],
+                                        [
+                                            0.2880, -0.5900, 0.0650, 0.3451,
+                                            0.3373, 0.3490, 0.5119, 0.5518,
+                                            0.0196
+                                        ],
+                                        [
+                                            0.1570, 0.6000, 3.1700, 0.4941,
+                                            0.4667, 0.3569, 0.4893, 0.9519,
+                                            0.9537
+                                        ],
+                                        [
+                                            -0.1320, 0.3950, 0.2720, 0.3216,
+                                            0.2863, 0.2275, 0.4397, 0.8830,
+                                            0.0818
+                                        ],
+                                        [
+                                            -0.4860, -0.0640, 3.1710, 0.3843,
+                                            0.3725, 0.3059, 0.3789, 0.7286,
+                                            0.9540
+                                        ]])
+
+        expected_pts_semantic_mask = np.array([0, 1, 0, 8, 0])
+
+        assert torch.allclose(points, expected_points, 1e-2)
+        self.assertTrue(
+            (pts_semantic_mask.numpy() == expected_pts_semantic_mask).all())
diff --git a/mmde/tests/test_datasets/test_scannet_dataset.py b/mmde/tests/test_datasets/test_scannet_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdbf213c5f59d24168af88206020155fa1f42dcc
--- /dev/null
+++ b/mmde/tests/test_datasets/test_scannet_dataset.py
@@ -0,0 +1,229 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import numpy as np
+import torch
+from mmengine.testing import assert_allclose
+
+from mmdet3d.datasets import ScanNetDataset, ScanNetSegDataset
+from mmdet3d.structures import DepthInstance3DBoxes
+from mmdet3d.utils import register_all_modules
+
+
+def _generate_scannet_seg_dataset_config():
+    data_root = './tests/data/scannet/'
+    ann_file = 'scannet_infos.pkl'
+    classes = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
+               'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
+               'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink',
+               'bathtub', 'otherfurniture')
+    palette = [
+        [174, 199, 232],
+        [152, 223, 138],
+        [31, 119, 180],
+        [255, 187, 120],
+        [188, 189, 34],
+        [140, 86, 75],
+        [255, 152, 150],
+        [214, 39, 40],
+        [197, 176, 213],
+        [148, 103, 189],
+        [196, 156, 148],
+        [23, 190, 207],
+        [247, 182, 210],
+        [219, 219, 141],
+        [255, 127, 14],
+        [158, 218, 229],
+        [44, 160, 44],
+        [112, 128, 144],
+        [227, 119, 194],
+        [82, 84, 163],
+    ]
+    scene_idxs = [0]
+    modality = dict(use_lidar=True, use_camera=False)
+    pipeline = [
+        dict(
+            type='LoadPointsFromFile',
+            coord_type='DEPTH',
+            shift_height=False,
+            use_color=True,
+            load_dim=6,
+            use_dim=[0, 1, 2, 3, 4, 5]),
+        dict(
+            type='LoadAnnotations3D',
+            with_bbox_3d=False,
+            with_label_3d=False,
+            with_mask_3d=False,
+            with_seg_3d=True),
+        dict(type='PointSegClassMapping'),
+        dict(
+            type='IndoorPatchPointSample',
+            num_points=5,
+            block_size=1.5,
+            ignore_index=len(classes),
+            use_normalized_coord=True,
+            enlarge_size=0.2,
+            min_unique_num=None),
+        dict(type='NormalizePointsColor', color_mean=None),
+        dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+    ]
+
+    data_prefix = dict(
+        pts='points',
+        pts_instance_mask='instance_mask',
+        pts_semantic_mask='semantic_mask')
+    return (data_root, ann_file, classes, palette, scene_idxs, data_prefix,
+            pipeline, modality)
+
+
+def _generate_scannet_dataset_config():
+    data_root = 'tests/data/scannet'
+    ann_file = 'scannet_infos.pkl'
+    classes = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+    modality = dict(use_lidar=True, use_camera=False)
+    pipeline = [
+        dict(
+            type='LoadPointsFromFile',
+            coord_type='DEPTH',
+            shift_height=True,
+            load_dim=6,
+            use_dim=[0, 1, 2]),
+        dict(
+            type='LoadAnnotations3D',
+            with_bbox_3d=True,
+            with_label_3d=True,
+            with_mask_3d=True,
+            with_seg_3d=True),
+        dict(type='GlobalAlignment', rotation_axis=2),
+        dict(type='PointSegClassMapping'),
+        dict(type='PointSample', num_points=5),
+        dict(
+            type='RandomFlip3D',
+            sync_2d=False,
+            flip_ratio_bev_horizontal=1.0,
+            flip_ratio_bev_vertical=1.0),
+        dict(
+            type='GlobalRotScaleTrans',
+            rot_range=[-0.087266, 0.087266],
+            scale_ratio_range=[1.0, 1.0],
+            shift_height=True),
+        dict(
+            type='Pack3DDetInputs',
+            keys=[
+                'points', 'pts_semantic_mask', 'gt_bboxes_3d', 'gt_labels_3d',
+                'pts_instance_mask'
+            ])
+    ]
+    data_prefix = dict(
+        pts='points',
+        pts_instance_mask='instance_mask',
+        pts_semantic_mask='semantic_mask')
+    return data_root, ann_file, classes, data_prefix, pipeline, modality
+
+
+class TestScanNetDataset(unittest.TestCase):
+
+    def test_scannet(self):
+        np.random.seed(0)
+        data_root, ann_file, classes, data_prefix, \
+            pipeline, modality, = _generate_scannet_dataset_config()
+        register_all_modules()
+        scannet_dataset = ScanNetDataset(
+            data_root,
+            ann_file,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            metainfo=dict(classes=classes),
+            modality=modality)
+
+        scannet_dataset.prepare_data(0)
+        input_dict = scannet_dataset.get_data_info(0)
+        scannet_dataset[0]
+        # assert the the path should contains data_prefix and data_root
+        self.assertIn(data_prefix['pts'],
+                      input_dict['lidar_points']['lidar_path'])
+        self.assertIn(data_root, input_dict['lidar_points']['lidar_path'])
+
+        ann_info = scannet_dataset.parse_ann_info(input_dict)
+
+        # assert the keys in ann_info and the type
+        except_label = np.array([
+            6, 6, 4, 9, 11, 11, 10, 0, 15, 17, 17, 17, 3, 12, 4, 4, 14, 1, 0,
+            0, 0, 0, 0, 0, 5, 5, 5
+        ])
+
+        self.assertEqual(ann_info['gt_labels_3d'].dtype, np.int64)
+        assert_allclose(ann_info['gt_labels_3d'], except_label)
+        self.assertIsInstance(ann_info['gt_bboxes_3d'], DepthInstance3DBoxes)
+        assert len(ann_info['gt_bboxes_3d']) == 27
+        assert torch.allclose(ann_info['gt_bboxes_3d'].tensor.sum(),
+                              torch.tensor([107.7353]))
+
+        no_class_scannet_dataset = ScanNetDataset(
+            data_root, ann_file, metainfo=dict(classes=['cabinet']))
+
+        input_dict = no_class_scannet_dataset.get_data_info(0)
+        ann_info = no_class_scannet_dataset.parse_ann_info(input_dict)
+
+        # assert the keys in ann_info and the type
+        self.assertIn('gt_labels_3d', ann_info)
+        # assert mapping to -1 or 1
+        assert (ann_info['gt_labels_3d'] <= 0).all()
+        self.assertEqual(ann_info['gt_labels_3d'].dtype, np.int64)
+        # all instance have been filtered by classes
+        self.assertEqual(len(ann_info['gt_labels_3d']), 27)
+        self.assertEqual(len(no_class_scannet_dataset.metainfo['classes']), 1)
+
+    def test_scannet_seg(self):
+        data_root, ann_file, classes, palette, scene_idxs, data_prefix, \
+            pipeline, modality, = _generate_scannet_seg_dataset_config()
+
+        register_all_modules()
+        np.random.seed(0)
+        scannet_seg_dataset = ScanNetSegDataset(
+            data_root,
+            ann_file,
+            metainfo=dict(classes=classes, palette=palette),
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            modality=modality,
+            scene_idxs=scene_idxs)
+
+        input_dict = scannet_seg_dataset.prepare_data(0)
+
+        points = input_dict['inputs']['points']
+        data_sample = input_dict['data_samples']
+        pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask
+
+        expected_points = torch.tensor([[
+            0.0000, 0.0000, 1.2427, 0.6118, 0.5529, 0.4471, -0.6462, -1.0046,
+            0.4280
+        ],
+                                        [
+                                            0.1553, -0.0074, 1.6077, 0.5882,
+                                            0.6157, 0.5569, -0.6001, -1.0068,
+                                            0.5537
+                                        ],
+                                        [
+                                            0.1518, 0.6016, 0.6548, 0.1490,
+                                            0.1059, 0.0431, -0.6012, -0.8309,
+                                            0.2255
+                                        ],
+                                        [
+                                            -0.7494, 0.1033, 0.6756, 0.5216,
+                                            0.4353, 0.3333, -0.8687, -0.9748,
+                                            0.2327
+                                        ],
+                                        [
+                                            -0.6836, -0.0203, 0.5884, 0.5765,
+                                            0.5020, 0.4510, -0.8491, -1.0105,
+                                            0.2027
+                                        ]])
+        expected_pts_semantic_mask = np.array([13, 13, 12, 2, 0])
+
+        assert torch.allclose(points, expected_points, 1e-2)
+        self.assertTrue(
+            (pts_semantic_mask.numpy() == expected_pts_semantic_mask).all())
diff --git a/mmde/tests/test_datasets/test_semantickitti_dataset.py b/mmde/tests/test_datasets/test_semantickitti_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d334870da5245e49be9c41194934ece8ea1b7aee
--- /dev/null
+++ b/mmde/tests/test_datasets/test_semantickitti_dataset.py
@@ -0,0 +1,115 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import numpy as np
+
+from mmdet3d.datasets import SemanticKittiDataset
+from mmdet3d.utils import register_all_modules
+
+
+def _generate_semantickitti_dataset_config():
+    data_root = './tests/data/semantickitti/'
+    ann_file = 'semantickitti_infos.pkl'
+    classes = ('car', 'bicycle', 'motorcycle', 'truck', 'bus', 'person',
+               'bicyclist', 'motorcyclist', 'road', 'parking', 'sidewalk',
+               'other-ground', 'building', 'fence', 'vegetation', 'trunck',
+               'terrian', 'pole', 'traffic-sign')
+
+    seg_label_mapping = {
+        0: 19,  # "unlabeled"
+        1: 19,  # "outlier" mapped to "unlabeled" --------------mapped
+        10: 0,  # "car"
+        11: 1,  # "bicycle"
+        13: 4,  # "bus" mapped to "other-vehicle" --------------mapped
+        15: 2,  # "motorcycle"
+        16: 4,  # "on-rails" mapped to "other-vehicle" ---------mapped
+        18: 3,  # "truck"
+        20: 4,  # "other-vehicle"
+        30: 5,  # "person"
+        31: 6,  # "bicyclist"
+        32: 7,  # "motorcyclist"
+        40: 8,  # "road"
+        44: 9,  # "parking"
+        48: 10,  # "sidewalk"
+        49: 11,  # "other-ground"
+        50: 12,  # "building"
+        51: 13,  # "fence"
+        52: 19,  # "other-structure" mapped to "unlabeled" ------mapped
+        60: 8,  # "lane-marking" to "road" ---------------------mapped
+        70: 14,  # "vegetation"
+        71: 15,  # "trunk"
+        72: 16,  # "terrain"
+        80: 17,  # "pole"
+        81: 18,  # "traffic-sign"
+        99: 19,  # "other-object" to "unlabeled" ----------------mapped
+        252: 0,  # "moving-car" to "car" ------------------------mapped
+        253: 6,  # "moving-bicyclist" to "bicyclist" ------------mapped
+        254: 5,  # "moving-person" to "person" ------------------mapped
+        255: 7,  # "moving-motorcyclist" to "motorcyclist" ------mapped
+        256: 4,  # "moving-on-rails" mapped to "other-vehic------mapped
+        257: 4,  # "moving-bus" mapped to "other-vehicle" -------mapped
+        258: 3,  # "moving-truck" to "truck" --------------------mapped
+        259: 4  # "moving-other"-vehicle to "other-vehicle"-----mapped
+    }
+    max_label = 259
+    modality = dict(use_lidar=True, use_camera=False)
+    pipeline = [
+        dict(
+            type='LoadPointsFromFile',
+            coord_type='LIDAR',
+            shift_height=True,
+            load_dim=4,
+            use_dim=[0, 1, 2]),
+        dict(
+            type='LoadAnnotations3D',
+            with_bbox_3d=False,
+            with_label_3d=False,
+            with_mask_3d=False,
+            with_seg_3d=True,
+            seg_3d_dtype='np.int32'),
+        dict(type='PointSegClassMapping'),
+        dict(type='Pack3DDetInputs', keys=['points', 'pts_semantic_mask'])
+    ]
+
+    data_prefix = dict(
+        pts='sequences/00/velodyne', pts_semantic_mask='sequences/00/labels')
+
+    return (data_root, ann_file, classes, data_prefix, pipeline, modality,
+            seg_label_mapping, max_label)
+
+
+class TestSemanticKittiDataset(unittest.TestCase):
+
+    def test_semantickitti(self):
+        (data_root, ann_file, classes, data_prefix, pipeline, modality,
+         seg_label_mapping,
+         max_label) = _generate_semantickitti_dataset_config()
+
+        register_all_modules()
+        np.random.seed(0)
+        semantickitti_dataset = SemanticKittiDataset(
+            data_root,
+            ann_file,
+            metainfo=dict(
+                classes=classes,
+                seg_label_mapping=seg_label_mapping,
+                max_label=max_label),
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            modality=modality)
+
+        input_dict = semantickitti_dataset.prepare_data(0)
+
+        points = input_dict['inputs']['points']
+        data_sample = input_dict['data_samples']
+        pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask
+        self.assertEqual(points.shape[0], pts_semantic_mask.shape[0])
+
+        expected_pts_semantic_mask = np.array([
+            12, 12, 12, 14, 14, 12, 19, 12, 14, 12, 12, 14, 15, 19, 14, 12, 12,
+            12, 12, 19, 12, 12, 12, 12, 12, 14, 12, 15, 12, 14, 14, 17, 12, 14,
+            14, 14, 15, 14, 12, 12, 14, 12, 17, 14, 12, 14, 12, 14, 14, 12
+        ])
+
+        self.assertTrue(
+            (pts_semantic_mask.numpy() == expected_pts_semantic_mask).all())
diff --git a/mmde/tests/test_datasets/test_sunrgbd_dataset.py b/mmde/tests/test_datasets/test_sunrgbd_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8519ea7f0484f6ee889e64ac6261f13f814b7ba
--- /dev/null
+++ b/mmde/tests/test_datasets/test_sunrgbd_dataset.py
@@ -0,0 +1,97 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import numpy as np
+import torch
+from mmengine.testing import assert_allclose
+
+from mmdet3d.datasets import SUNRGBDDataset
+from mmdet3d.structures import DepthInstance3DBoxes
+
+
+def _generate_scannet_dataset_config():
+    data_root = 'tests/data/sunrgbd'
+    ann_file = 'sunrgbd_infos.pkl'
+
+    classes = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+               'night_stand', 'bookshelf', 'bathtub')
+
+    from mmcv.transforms.base import BaseTransform
+    from mmengine.registry import TRANSFORMS
+
+    if 'Identity' not in TRANSFORMS:
+
+        @TRANSFORMS.register_module()
+        class Identity(BaseTransform):
+
+            def transform(self, info):
+                if 'ann_info' in info:
+                    info['gt_labels_3d'] = info['ann_info']['gt_labels_3d']
+                return info
+
+    modality = dict(use_camera=True, use_lidar=True)
+    pipeline = [
+        dict(type='Identity'),
+    ]
+    data_prefix = dict(pts='points', img='sunrgbd_trainval')
+    return data_root, ann_file, classes, data_prefix, pipeline, modality
+
+
+class TestScanNetDataset(unittest.TestCase):
+
+    def test_sunrgbd_ataset(self):
+        np.random.seed(0)
+        data_root, ann_file, classes, data_prefix, \
+            pipeline, modality, = _generate_scannet_dataset_config()
+        scannet_dataset = SUNRGBDDataset(
+            data_root,
+            ann_file,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            metainfo=dict(classes=classes),
+            modality=modality)
+
+        scannet_dataset.prepare_data(0)
+        input_dict = scannet_dataset.get_data_info(0)
+        scannet_dataset[0]
+        # assert the the path should contains data_prefix and data_root
+        assert data_prefix['pts'] in input_dict['lidar_points']['lidar_path']
+        assert data_root in input_dict['lidar_points']['lidar_path']
+        for cam_id, img_info in input_dict['images'].items():
+            if 'img_path' in img_info:
+                assert data_prefix['img'] in img_info['img_path']
+                assert data_root in img_info['img_path']
+
+        ann_info = scannet_dataset.parse_ann_info(input_dict)
+
+        # assert the keys in ann_info and the type
+        except_label = np.array([0, 7, 6])
+
+        self.assertEqual(ann_info['gt_labels_3d'].dtype, np.int64)
+        assert_allclose(ann_info['gt_labels_3d'], except_label)
+        self.assertIsInstance(ann_info['gt_bboxes_3d'], DepthInstance3DBoxes)
+
+        self.assertEqual(len(ann_info['gt_bboxes_3d']), 3)
+        assert_allclose(ann_info['gt_bboxes_3d'].tensor.sum(),
+                        torch.tensor(19.2575))
+
+        classes = ['bed']
+        bed_scannet_dataset = SUNRGBDDataset(
+            data_root,
+            ann_file,
+            data_prefix=data_prefix,
+            pipeline=pipeline,
+            metainfo=dict(classes=classes),
+            modality=modality)
+
+        input_dict = bed_scannet_dataset.get_data_info(0)
+        ann_info = bed_scannet_dataset.parse_ann_info(input_dict)
+
+        # assert the keys in ann_info and the type
+        self.assertIn('gt_labels_3d', ann_info)
+        # assert mapping to -1 or 1
+        assert (ann_info['gt_labels_3d'] <= 0).all()
+        assert ann_info['gt_labels_3d'].dtype == np.int64
+        # all instance have been filtered by classes
+        self.assertEqual(len(ann_info['gt_labels_3d']), 3)
+        self.assertEqual(len(bed_scannet_dataset.metainfo['classes']), 1)
diff --git a/mmde/tests/test_datasets/test_transforms/test_formating.py b/mmde/tests/test_datasets/test_transforms/test_formating.py
new file mode 100644
index 0000000000000000000000000000000000000000..d306fa479b7c79cf9e306cbcfc68349c7d682243
--- /dev/null
+++ b/mmde/tests/test_datasets/test_transforms/test_formating.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import torch
+from mmengine.testing import assert_allclose
+
+from mmdet3d.datasets.transforms.formating import Pack3DDetInputs
+from mmdet3d.structures import LiDARInstance3DBoxes
+from mmdet3d.testing import create_data_info_after_loading
+
+
+class TestPack3DDetInputs(unittest.TestCase):
+
+    def test_packinputs(self):
+        ori_data_info = create_data_info_after_loading()
+        pack_input = Pack3DDetInputs(
+            keys=['points', 'gt_labels_3d', 'gt_bboxes_3d'])
+        packed_results = pack_input(ori_data_info)
+        inputs = packed_results['inputs']
+
+        # annotations
+        gt_instances = packed_results['data_samples'].gt_instances_3d
+        self.assertIn('points', inputs)
+        self.assertIsInstance(inputs['points'], torch.Tensor)
+        assert_allclose(inputs['points'].sum(), torch.tensor(13062.6436))
+        # assert to_tensor
+        self.assertIsInstance(inputs['points'], torch.Tensor)
+        self.assertIn('labels_3d', gt_instances)
+        assert_allclose(gt_instances.labels_3d, torch.tensor([1]))
+        # assert to_tensor
+        self.assertIsInstance(gt_instances.labels_3d, torch.Tensor)
+
+        self.assertIn('bboxes_3d', gt_instances)
+        self.assertIsInstance(gt_instances.bboxes_3d, LiDARInstance3DBoxes)
+        assert_allclose(gt_instances.bboxes_3d.tensor.sum(),
+                        torch.tensor(7.2650))
diff --git a/mmde/tests/test_datasets/test_transforms/test_loading.py b/mmde/tests/test_datasets/test_transforms/test_loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..d07122358080b75c6b6a22f5c5283392ba467bb0
--- /dev/null
+++ b/mmde/tests/test_datasets/test_transforms/test_loading.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import numpy as np
+import torch
+from mmengine.testing import assert_allclose
+
+from mmdet3d.datasets.transforms import PointSegClassMapping
+from mmdet3d.datasets.transforms.loading import (LoadAnnotations3D,
+                                                 LoadPointsFromFile)
+from mmdet3d.structures import DepthPoints, LiDARPoints
+from mmdet3d.testing import create_dummy_data_info
+
+
+class TestLoadPointsFromFile(unittest.TestCase):
+
+    def test_load_points_from_file(self):
+        use_dim = 3
+        backend_args = None
+        load_points_transform = LoadPointsFromFile(
+            coord_type='LIDAR',
+            load_dim=4,
+            use_dim=use_dim,
+            backend_args=backend_args)
+        data_info = create_dummy_data_info()
+        info = load_points_transform(data_info)
+        self.assertIn('points', info)
+        self.assertIsInstance(info['points'], LiDARPoints)
+        load_points_transform = LoadPointsFromFile(
+            coord_type='DEPTH',
+            load_dim=4,
+            use_dim=use_dim,
+            backend_args=backend_args)
+        info = load_points_transform(data_info)
+        self.assertIsInstance(info['points'], DepthPoints)
+        self.assertEqual(info['points'].shape[-1], use_dim)
+        load_points_transform = LoadPointsFromFile(
+            coord_type='DEPTH',
+            load_dim=4,
+            use_dim=use_dim,
+            shift_height=True,
+            backend_args=backend_args)
+        info = load_points_transform(data_info)
+        # extra height dim
+        self.assertEqual(info['points'].shape[-1], use_dim + 1)
+
+        repr_str = repr(load_points_transform)
+        self.assertIn('shift_height=True', repr_str)
+        self.assertIn('use_color=False', repr_str)
+        self.assertIn('load_dim=4', repr_str)
+
+
+class TestLoadAnnotations3D(unittest.TestCase):
+
+    def test_load_points_from_file(self):
+        backend_args = None
+
+        load_anns_transform = LoadAnnotations3D(
+            with_bbox_3d=True,
+            with_label_3d=True,
+            with_panoptic_3d=True,
+            seg_offset=2**16,
+            dataset_type='semantickitti',
+            seg_3d_dtype='np.uint32',
+            backend_args=backend_args)
+        self.assertIs(load_anns_transform.with_seg, False)
+        self.assertIs(load_anns_transform.with_bbox_3d, True)
+        self.assertIs(load_anns_transform.with_label_3d, True)
+        data_info = create_dummy_data_info()
+        info = load_anns_transform(data_info)
+        self.assertIn('gt_bboxes_3d', info)
+        assert_allclose(info['gt_bboxes_3d'].tensor.sum(),
+                        torch.tensor(7.2650))
+        self.assertIn('gt_labels_3d', info)
+        assert_allclose(info['gt_labels_3d'], torch.tensor([1]))
+        self.assertIn('pts_semantic_mask', info)
+        self.assertIn('pts_instance_mask', info)
+        assert_allclose(
+            info['pts_semantic_mask'],
+            np.array([
+                50, 50, 50, 70, 70, 50, 0, 50, 70, 50, 50, 70, 71, 52, 70, 50,
+                50, 50, 50, 0, 50, 50, 50, 50, 50, 70, 50, 71, 50, 70, 70, 80,
+                50, 70, 70, 70, 71, 70, 50, 50, 70, 50, 80, 70, 50, 70, 50, 70,
+                70, 50
+            ]))
+        assert_allclose(
+            info['pts_instance_mask'],
+            np.array([
+                50, 50, 50, 70, 70, 50, 0, 50, 70, 50, 50, 70, 71, 52, 70, 50,
+                50, 50, 50, 0, 50, 50, 50, 50, 50, 70, 50, 71, 50, 70, 70, 80,
+                50, 70, 70, 70, 71, 70, 50, 50, 70, 50, 80, 70, 50, 70, 50, 70,
+                70, 50
+            ]))
+        repr_str = repr(load_anns_transform)
+        self.assertIn('with_bbox_3d=True', repr_str)
+        self.assertIn('with_label_3d=True', repr_str)
+        self.assertIn('with_bbox_depth=False', repr_str)
+        self.assertIn('with_panoptic_3d=True', repr_str)
+
+
+class TestPointSegClassMapping(unittest.TestCase):
+
+    def test_point_seg_class_mapping(self):
+        results = dict()
+        results['pts_semantic_mask'] = np.array([1, 2, 3, 4, 5])
+        results['seg_label_mapping'] = np.array([3, 0, 1, 2, 3, 3])
+        point_seg_mapping_transform = PointSegClassMapping()
+        results = point_seg_mapping_transform(results)
+        assert_allclose(results['pts_semantic_mask'], np.array([0, 1, 2, 3,
+                                                                3]))
diff --git a/mmde/tests/test_datasets/test_transforms/test_transforms_3d.py b/mmde/tests/test_datasets/test_transforms/test_transforms_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..94d2e0c55ee075710b79bba0b8dfb7bd179f4ac8
--- /dev/null
+++ b/mmde/tests/test_datasets/test_transforms/test_transforms_3d.py
@@ -0,0 +1,303 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import unittest
+
+import numpy as np
+import torch
+from mmengine.testing import assert_allclose
+
+from mmdet3d.datasets import (GlobalAlignment, RandomFlip3D,
+                              SemanticKittiDataset)
+from mmdet3d.datasets.transforms import GlobalRotScaleTrans, LaserMix, PolarMix
+from mmdet3d.structures import LiDARPoints
+from mmdet3d.testing import create_data_info_after_loading
+from mmdet3d.utils import register_all_modules
+
+register_all_modules()
+
+
+class TestGlobalRotScaleTrans(unittest.TestCase):
+
+    def test_globle_rotation_scale_trans(self):
+        rot_trans = GlobalRotScaleTrans(
+            rot_range=[-0.78, 0.78], scale_ratio_range=[1, 1])
+        scale_trans = GlobalRotScaleTrans(
+            rot_range=[0, 0], scale_ratio_range=[0.95, 1.05])
+
+        ori_data_info = create_data_info_after_loading()
+
+        data_info = copy.deepcopy(ori_data_info)
+        rot_data_info = rot_trans(data_info)
+        self.assertIn('pcd_rotation', rot_data_info)
+        self.assertIn('pcd_rotation_angle', rot_data_info)
+        self.assertIn('pcd_scale_factor', rot_data_info)
+        self.assertEqual(rot_data_info['pcd_scale_factor'], 1)
+        self.assertIs(-0.79 < rot_data_info['pcd_rotation_angle'] < 0.79, True)
+
+        # assert the rot angle should in rot_range
+        before_rot_gt_bbox_3d = ori_data_info['gt_bboxes_3d']
+        after_rot_gt_bbox_3d = rot_data_info['gt_bboxes_3d']
+        assert (after_rot_gt_bbox_3d.tensor[:, -1] -
+                before_rot_gt_bbox_3d.tensor[:, -1]).abs().max() < 0.79
+
+        data_info = copy.deepcopy(ori_data_info)
+        scale_data_info = scale_trans(data_info)
+        # assert the rot angle should in rot_range
+        before_scale_gt_bbox_3d = ori_data_info['gt_bboxes_3d'].tensor
+        after_scale_gt_bbox_3d = scale_data_info['gt_bboxes_3d'].tensor
+        before_scale_points = ori_data_info['points'].tensor
+        after_scale_points = scale_data_info['points'].tensor
+        self.assertEqual(scale_data_info['pcd_rotation_angle'], 0)
+        # assert  scale_factor range
+        assert (0.94 < (after_scale_points / before_scale_points)).all()
+        assert (1.06 >
+                (after_scale_gt_bbox_3d / before_scale_gt_bbox_3d)).all()
+
+
+class TestRandomFlip3D(unittest.TestCase):
+
+    def test_random_flip3d(self):
+        ori_data_info = create_data_info_after_loading()
+        no_flip_transform = RandomFlip3D(flip_ratio_bev_horizontal=0.)
+        always_flip_transform = RandomFlip3D(flip_ratio_bev_horizontal=1.)
+        data_info = copy.deepcopy(ori_data_info)
+        data_info = no_flip_transform(data_info)
+        self.assertIn('pcd_horizontal_flip', data_info)
+        assert_allclose(data_info['points'].tensor,
+                        ori_data_info['points'].tensor)
+
+        torch.allclose(data_info['gt_bboxes_3d'].tensor,
+                       ori_data_info['gt_bboxes_3d'].tensor)
+        data_info = copy.deepcopy(ori_data_info)
+        data_info = always_flip_transform(data_info)
+        assert_allclose(data_info['points'].tensor[:, 0],
+                        ori_data_info['points'].tensor[:, 0])
+        assert_allclose(data_info['points'].tensor[:, 1],
+                        -ori_data_info['points'].tensor[:, 1])
+        assert_allclose(data_info['points'].tensor[:, 2],
+                        ori_data_info['points'].tensor[:, 2])
+
+        assert_allclose(data_info['gt_bboxes_3d'].tensor[:, 0],
+                        ori_data_info['gt_bboxes_3d'].tensor[:, 0])
+        assert_allclose(data_info['gt_bboxes_3d'].tensor[:, 1],
+                        -ori_data_info['gt_bboxes_3d'].tensor[:, 1])
+        assert_allclose(data_info['gt_bboxes_3d'].tensor[:, 2],
+                        ori_data_info['gt_bboxes_3d'].tensor[:, 2])
+
+
+class TestGlobalAlignment(unittest.TestCase):
+
+    def test_global_alignment(self):
+        data_info = create_data_info_after_loading()
+        global_align_transform = GlobalAlignment(rotation_axis=2)
+        data_info['axis_align_matrix'] = np.array(
+            [[0.945519, 0.325568, 0., -5.38439],
+             [-0.325568, 0.945519, 0., -2.87178], [0., 0., 1., -0.06435],
+             [0., 0., 0., 1.]],
+            dtype=np.float32)
+        global_align_transform(data_info)
+
+        data_info['axis_align_matrix'] = np.array(
+            [[0.945519, 0.325568, 0., -5.38439], [0, 2, 0., -2.87178],
+             [0., 0., 1., -0.06435], [0., 0., 0., 1.]],
+            dtype=np.float32)
+        # assert the rot metric
+        with self.assertRaises(AssertionError):
+            global_align_transform(data_info)
+
+
+class TestPolarMix(unittest.TestCase):
+
+    def setUp(self):
+        self.pre_transform = [
+            dict(
+                type='LoadPointsFromFile',
+                coord_type='LIDAR',
+                load_dim=4,
+                use_dim=4),
+            dict(
+                type='LoadAnnotations3D',
+                with_bbox_3d=False,
+                with_label_3d=False,
+                with_mask_3d=False,
+                with_seg_3d=True,
+                seg_3d_dtype='np.int32'),
+            dict(type='PointSegClassMapping'),
+        ]
+        classes = ('car', 'bicycle', 'motorcycle', 'truck', 'bus', 'person',
+                   'bicyclist', 'motorcyclist', 'road', 'parking', 'sidewalk',
+                   'other-ground', 'building', 'fence', 'vegetation', 'trunck',
+                   'terrian', 'pole', 'traffic-sign')
+        seg_label_mapping = {
+            0: 0,  # "unlabeled"
+            1: 0,  # "outlier" mapped to "unlabeled" --------------mapped
+            10: 1,  # "car"
+            11: 2,  # "bicycle"
+            13: 5,  # "bus" mapped to "other-vehicle" --------------mapped
+            15: 3,  # "motorcycle"
+            16: 5,  # "on-rails" mapped to "other-vehicle" ---------mapped
+            18: 4,  # "truck"
+            20: 5,  # "other-vehicle"
+            30: 6,  # "person"
+            31: 7,  # "bicyclist"
+            32: 8,  # "motorcyclist"
+            40: 9,  # "road"
+            44: 10,  # "parking"
+            48: 11,  # "sidewalk"
+            49: 12,  # "other-ground"
+            50: 13,  # "building"
+            51: 14,  # "fence"
+            52: 0,  # "other-structure" mapped to "unlabeled" ------mapped
+            60: 9,  # "lane-marking" to "road" ---------------------mapped
+            70: 15,  # "vegetation"
+            71: 16,  # "trunk"
+            72: 17,  # "terrain"
+            80: 18,  # "pole"
+            81: 19,  # "traffic-sign"
+            99: 0,  # "other-object" to "unlabeled" ----------------mapped
+            252: 1,  # "moving-car" to "car" ------------------------mapped
+            253: 7,  # "moving-bicyclist" to "bicyclist" ------------mapped
+            254: 6,  # "moving-person" to "person" ------------------mapped
+            255: 8,  # "moving-motorcyclist" to "motorcyclist" ------mapped
+            256: 5,  # "moving-on-rails" mapped to "other-vehic------mapped
+            257: 5,  # "moving-bus" mapped to "other-vehicle" -------mapped
+            258: 4,  # "moving-truck" to "truck" --------------------mapped
+            259: 5  # "moving-other"-vehicle to "other-vehicle"-----mapped
+        }
+        max_label = 259
+        self.dataset = SemanticKittiDataset(
+            './tests/data/semantickitti/',
+            'semantickitti_infos.pkl',
+            metainfo=dict(
+                classes=classes,
+                seg_label_mapping=seg_label_mapping,
+                max_label=max_label),
+            data_prefix=dict(
+                pts='sequences/00/velodyne',
+                pts_semantic_mask='sequences/00/labels'),
+            pipeline=[],
+            modality=dict(use_lidar=True, use_camera=False))
+        points = np.random.random((100, 4))
+        self.results = {
+            'points': LiDARPoints(points, points_dim=4),
+            'pts_semantic_mask': np.random.randint(0, 20, (100, )),
+            'dataset': self.dataset
+        }
+
+    def test_transform(self):
+        # test assertion for invalid instance_classes
+        with self.assertRaises(AssertionError):
+            transform = PolarMix(instance_classes=1)
+
+        with self.assertRaises(AssertionError):
+            transform = PolarMix(instance_classes=[1.0, 2.0])
+
+        transform = PolarMix(
+            instance_classes=[15, 16, 17],
+            swap_ratio=1.0,
+            pre_transform=self.pre_transform)
+        results = transform.transform(copy.deepcopy(self.results))
+        self.assertTrue(results['points'].shape[0] ==
+                        results['pts_semantic_mask'].shape[0])
+
+
+class TestLaserMix(unittest.TestCase):
+
+    def setUp(self):
+        self.pre_transform = [
+            dict(
+                type='LoadPointsFromFile',
+                coord_type='LIDAR',
+                load_dim=4,
+                use_dim=4),
+            dict(
+                type='LoadAnnotations3D',
+                with_bbox_3d=False,
+                with_label_3d=False,
+                with_mask_3d=False,
+                with_seg_3d=True,
+                seg_3d_dtype='np.int32'),
+            dict(type='PointSegClassMapping'),
+        ]
+        classes = ('car', 'bicycle', 'motorcycle', 'truck', 'bus', 'person',
+                   'bicyclist', 'motorcyclist', 'road', 'parking', 'sidewalk',
+                   'other-ground', 'building', 'fence', 'vegetation', 'trunck',
+                   'terrian', 'pole', 'traffic-sign')
+        seg_label_mapping = {
+            0: 0,  # "unlabeled"
+            1: 0,  # "outlier" mapped to "unlabeled" --------------mapped
+            10: 1,  # "car"
+            11: 2,  # "bicycle"
+            13: 5,  # "bus" mapped to "other-vehicle" --------------mapped
+            15: 3,  # "motorcycle"
+            16: 5,  # "on-rails" mapped to "other-vehicle" ---------mapped
+            18: 4,  # "truck"
+            20: 5,  # "other-vehicle"
+            30: 6,  # "person"
+            31: 7,  # "bicyclist"
+            32: 8,  # "motorcyclist"
+            40: 9,  # "road"
+            44: 10,  # "parking"
+            48: 11,  # "sidewalk"
+            49: 12,  # "other-ground"
+            50: 13,  # "building"
+            51: 14,  # "fence"
+            52: 0,  # "other-structure" mapped to "unlabeled" ------mapped
+            60: 9,  # "lane-marking" to "road" ---------------------mapped
+            70: 15,  # "vegetation"
+            71: 16,  # "trunk"
+            72: 17,  # "terrain"
+            80: 18,  # "pole"
+            81: 19,  # "traffic-sign"
+            99: 0,  # "other-object" to "unlabeled" ----------------mapped
+            252: 1,  # "moving-car" to "car" ------------------------mapped
+            253: 7,  # "moving-bicyclist" to "bicyclist" ------------mapped
+            254: 6,  # "moving-person" to "person" ------------------mapped
+            255: 8,  # "moving-motorcyclist" to "motorcyclist" ------mapped
+            256: 5,  # "moving-on-rails" mapped to "other-vehic------mapped
+            257: 5,  # "moving-bus" mapped to "other-vehicle" -------mapped
+            258: 4,  # "moving-truck" to "truck" --------------------mapped
+            259: 5  # "moving-other"-vehicle to "other-vehicle"-----mapped
+        }
+        max_label = 259
+        self.dataset = SemanticKittiDataset(
+            './tests/data/semantickitti/',
+            'semantickitti_infos.pkl',
+            metainfo=dict(
+                classes=classes,
+                seg_label_mapping=seg_label_mapping,
+                max_label=max_label),
+            data_prefix=dict(
+                pts='sequences/00/velodyne',
+                pts_semantic_mask='sequences/00/labels'),
+            pipeline=[],
+            modality=dict(use_lidar=True, use_camera=False))
+        points = np.random.random((100, 4))
+        self.results = {
+            'points': LiDARPoints(points, points_dim=4),
+            'pts_semantic_mask': np.random.randint(0, 20, (100, )),
+            'dataset': self.dataset
+        }
+
+    def test_transform(self):
+        # test assertion for invalid num_areas
+        with self.assertRaises(AssertionError):
+            transform = LaserMix(num_areas=3, pitch_angles=[-20, 0])
+
+        with self.assertRaises(AssertionError):
+            transform = LaserMix(num_areas=[3.0, 4.0], pitch_angles=[-20, 0])
+
+        # test assertion for invalid pitch_angles
+        with self.assertRaises(AssertionError):
+            transform = LaserMix(num_areas=[3, 4], pitch_angles=[-20])
+
+        with self.assertRaises(AssertionError):
+            transform = LaserMix(num_areas=[3, 4], pitch_angles=[0, -20])
+
+        transform = LaserMix(
+            num_areas=[3, 4, 5, 6],
+            pitch_angles=[-20, 0],
+            pre_transform=self.pre_transform)
+        results = transform.transform(copy.deepcopy(self.results))
+        self.assertTrue(results['points'].shape[0] ==
+                        results['pts_semantic_mask'].shape[0])
diff --git a/mmde/tests/test_datasets/test_transforms/utils.py b/mmde/tests/test_datasets/test_transforms/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a54c64d4562b12ae1c00548ce7e3812603d6e3bc
--- /dev/null
+++ b/mmde/tests/test_datasets/test_transforms/utils.py
@@ -0,0 +1,192 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+# create a dummy `results` to test the pipeline
+from mmdet3d.datasets import LoadAnnotations3D, LoadPointsFromFile
+from mmdet3d.datasets.transforms.loading import LoadImageFromFileMono3D
+from mmdet3d.structures import LiDARInstance3DBoxes
+
+
+def create_dummy_data_info(with_ann=True):
+
+    ann_info = {
+        'gt_bboxes':
+        np.array([[712.4, 143., 810.73, 307.92]]),
+        'gt_labels':
+        np.array([1]),
+        'gt_bboxes_3d':
+        LiDARInstance3DBoxes(
+            np.array(
+                [[8.7314, -1.8559, -1.5997, 1.2000, 0.4800, 1.8900,
+                  -1.5808]])),
+        'gt_labels_3d':
+        np.array([1]),
+        'centers_2d':
+        np.array([[765.04, 214.56]]),
+        'depths':
+        np.array([8.410]),
+        'num_lidar_pts':
+        np.array([377]),
+        'difficulty':
+        np.array([0]),
+        'truncated':
+        np.array([0]),
+        'occluded':
+        np.array([0]),
+        'alpha':
+        np.array([-0.2]),
+        'score':
+        np.array([0.]),
+        'index':
+        np.array([0]),
+        'group_id':
+        np.array([0])
+    }
+    data_info = {
+        'sample_id':
+        0,
+        'images': {
+            'CAM0': {
+                'cam2img': [[707.0493, 0.0, 604.0814, 0.0],
+                            [0.0, 707.0493, 180.5066, 0.0],
+                            [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+            },
+            'CAM1': {
+                'cam2img': [[707.0493, 0.0, 604.0814, -379.7842],
+                            [0.0, 707.0493, 180.5066, 0.0],
+                            [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+            },
+            'CAM2': {
+                'img_path':
+                'tests/data/kitti/training/image_2/000000.png',
+                'height':
+                370,
+                'width':
+                1224,
+                'cam2img': [[707.0493, 0.0, 604.0814, 45.75831],
+                            [0.0, 707.0493, 180.5066, -0.3454157],
+                            [0.0, 0.0, 1.0, 0.004981016], [0.0, 0.0, 0.0, 1.0]]
+            },
+            'CAM3': {
+                'cam2img': [[707.0493, 0.0, 604.0814, -334.1081],
+                            [0.0, 707.0493, 180.5066, 2.33066],
+                            [0.0, 0.0, 1.0, 0.003201153], [0.0, 0.0, 0.0, 1.0]]
+            },
+            'R0_rect': [[
+                0.9999127984046936, 0.010092630051076412,
+                -0.008511931635439396, 0.0
+            ],
+                        [
+                            -0.010127290152013302, 0.9999405741691589,
+                            -0.004037670791149139, 0.0
+                        ],
+                        [
+                            0.008470674976706505, 0.0041235219687223434,
+                            0.9999555945396423, 0.0
+                        ], [0.0, 0.0, 0.0, 1.0]]
+        },
+        'lidar_points': {
+            'num_pts_feats':
+            4,
+            'lidar_path':
+            'tests/data/kitti/training/velodyne_reduced/000000.bin',
+            'lidar2cam': [[
+                -0.0015960992313921452, -0.9999162554740906,
+                -0.012840436771512032, -0.022366708144545555
+            ],
+                          [
+                              -0.00527064548805356, 0.012848696671426296,
+                              -0.9999035596847534, -0.05967890843749046
+                          ],
+                          [
+                              0.9999848008155823, -0.0015282672829926014,
+                              -0.005290712229907513, -0.33254900574684143
+                          ], [0.0, 0.0, 0.0, 1.0]],
+            'Tr_velo_to_cam': [[
+                0.006927963811904192, -0.9999722242355347, -0.0027578289154917,
+                -0.024577289819717407
+            ],
+                               [
+                                   -0.0011629819637164474,
+                                   0.0027498360723257065, -0.9999955296516418,
+                                   -0.06127237156033516
+                               ],
+                               [
+                                   0.999975323677063, 0.006931141018867493,
+                                   -0.0011438990477472544, -0.33210289478302
+                               ], [0.0, 0.0, 0.0, 1.0]],
+            'Tr_imu_to_velo': [[
+                0.999997615814209, 0.0007553070900030434,
+                -0.002035825978964567, -0.8086758852005005
+            ],
+                               [
+                                   -0.0007854027207940817, 0.9998897910118103,
+                                   -0.014822980388998985, 0.3195559084415436
+                               ],
+                               [
+                                   0.002024406101554632, 0.014824540354311466,
+                                   0.9998881220817566, -0.7997230887413025
+                               ], [0.0, 0.0, 0.0, 1.0]]
+        },
+        'instances': [{
+            'bbox': [712.4, 143.0, 810.73, 307.92],
+            'bbox_label':
+            -1,
+            'bbox_3d': [
+                1.840000033378601, 1.4700000286102295, 8.40999984741211,
+                1.2000000476837158, 1.8899999856948853, 0.47999998927116394,
+                0.009999999776482582
+            ],
+            'bbox_label_3d':
+            -1,
+            'center_2d': [765.04, 214.56],
+            'depth':
+            8.410,
+            'num_lidar_pts':
+            377,
+            'difficulty':
+            0,
+            'truncated':
+            0,
+            'occluded':
+            0,
+            'alpha':
+            -0.2,
+            'score':
+            0.0,
+            'index':
+            0,
+            'group_id':
+            0
+        }],
+        'plane':
+        None
+    }
+    if with_ann:
+        data_info['ann_info'] = ann_info
+    return data_info
+
+
+def create_data_info_after_loading():
+    load_anns_transform = LoadAnnotations3D(
+        with_bbox_3d=True, with_label_3d=True)
+    load_points_transform = LoadPointsFromFile(
+        coord_type='LIDAR', load_dim=4, use_dim=3)
+    data_info = create_dummy_data_info()
+    data_info = load_points_transform(data_info)
+    data_info_after_loading = load_anns_transform(data_info)
+    return data_info_after_loading
+
+
+def create_mono3d_data_info_after_loading():
+    load_anns_transform = LoadAnnotations3D(
+        with_bbox=True,
+        with_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True)
+    load_img_transform = LoadImageFromFileMono3D()
+    data_info = create_dummy_data_info()
+    data_info = load_img_transform(data_info)
+    data_info_after_loading = load_anns_transform(data_info)
+    return data_info_after_loading
diff --git a/mmde/tests/test_datasets/test_tta.py b/mmde/tests/test_datasets/test_tta.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa191cba967130479d043f87cb15afffa168dcce
--- /dev/null
+++ b/mmde/tests/test_datasets/test_tta.py
@@ -0,0 +1,210 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import numpy as np
+import pytest
+from mmengine import DefaultScope
+
+from mmdet3d.datasets.transforms import *  # noqa
+from mmdet3d.registry import TRANSFORMS
+from mmdet3d.structures.points import LiDARPoints
+
+DefaultScope.get_instance('test_multi_scale_flip_aug_3d', scope_name='mmdet3d')
+
+
+class TestMuitiScaleFlipAug3D(TestCase):
+
+    def test_exception(self):
+        with pytest.raises(TypeError):
+            tta_transform = dict(
+                type='TestTimeAug',
+                transforms=[
+                    dict(
+                        type='RandomFlip3D',
+                        flip_ratio_bev_horizontal=0.0,
+                        flip_ratio_bev_vertical=0.0)
+                ])
+            TRANSFORMS.build(tta_transform)
+
+    def test_multi_scale_flip_aug(self):
+        tta_transform = dict(
+            type='TestTimeAug',
+            transforms=[[
+                dict(
+                    type='RandomFlip3D',
+                    flip_ratio_bev_horizontal=0.0,
+                    flip_ratio_bev_vertical=0.0),
+                dict(
+                    type='RandomFlip3D',
+                    flip_ratio_bev_horizontal=0.0,
+                    flip_ratio_bev_vertical=1.0),
+                dict(
+                    type='RandomFlip3D',
+                    flip_ratio_bev_horizontal=1.0,
+                    flip_ratio_bev_vertical=0.0),
+                dict(
+                    type='RandomFlip3D',
+                    flip_ratio_bev_horizontal=1.0,
+                    flip_ratio_bev_vertical=1.0)
+            ], [dict(type='Pack3DDetInputs', keys=['points'])]])
+        tta_module = TRANSFORMS.build(tta_transform)
+
+        results = dict()
+        points = LiDARPoints(np.random.random((100, 4)), 4)
+        results['points'] = points
+
+        tta_results = tta_module(results.copy())
+        assert [
+            data_sample.metainfo['pcd_horizontal_flip']
+            for data_sample in tta_results['data_samples']
+        ] == [False, False, True, True]
+        assert [
+            data_sample.metainfo['pcd_vertical_flip']
+            for data_sample in tta_results['data_samples']
+        ] == [False, True, False, True]
+
+        tta_transform = dict(
+            type='TestTimeAug',
+            transforms=[[
+                dict(
+                    type='GlobalRotScaleTrans',
+                    rot_range=[-0.78539816, -0.78539816],
+                    scale_ratio_range=[1.0, 1.0],
+                    translation_std=[0, 0, 0]),
+                dict(
+                    type='GlobalRotScaleTrans',
+                    rot_range=[0, 0],
+                    scale_ratio_range=[1.0, 1.0],
+                    translation_std=[0, 0, 0]),
+                dict(
+                    type='GlobalRotScaleTrans',
+                    rot_range=[0.78539816, 0.78539816],
+                    scale_ratio_range=[1.0, 1.0],
+                    translation_std=[0, 0, 0])
+            ], [dict(type='Pack3DDetInputs', keys=['points'])]])
+        tta_module = TRANSFORMS.build(tta_transform)
+
+        results = dict()
+        points = LiDARPoints(np.random.random((100, 4)), 4)
+        results['points'] = points
+
+        tta_results = tta_module(results.copy())
+        assert [
+            data_sample.metainfo['pcd_rotation_angle']
+            for data_sample in tta_results['data_samples']
+        ] == [-0.78539816, 0, 0.78539816]
+        assert [
+            data_sample.metainfo['pcd_scale_factor']
+            for data_sample in tta_results['data_samples']
+        ] == [1.0, 1.0, 1.0]
+
+        tta_transform = dict(
+            type='TestTimeAug',
+            transforms=[[
+                dict(
+                    type='GlobalRotScaleTrans',
+                    rot_range=[0, 0],
+                    scale_ratio_range=[0.95, 0.95],
+                    translation_std=[0, 0, 0]),
+                dict(
+                    type='GlobalRotScaleTrans',
+                    rot_range=[0, 0],
+                    scale_ratio_range=[1.0, 1.0],
+                    translation_std=[0, 0, 0]),
+                dict(
+                    type='GlobalRotScaleTrans',
+                    rot_range=[0, 0],
+                    scale_ratio_range=[1.05, 1.05],
+                    translation_std=[0, 0, 0])
+            ], [dict(type='Pack3DDetInputs', keys=['points'])]])
+        tta_module = TRANSFORMS.build(tta_transform)
+
+        results = dict()
+        points = LiDARPoints(np.random.random((100, 4)), 4)
+        results['points'] = points
+
+        tta_results = tta_module(results.copy())
+        assert [
+            data_sample.metainfo['pcd_rotation_angle']
+            for data_sample in tta_results['data_samples']
+        ] == [0, 0, 0]
+        assert [
+            data_sample.metainfo['pcd_scale_factor']
+            for data_sample in tta_results['data_samples']
+        ] == [0.95, 1, 1.05]
+
+        tta_transform = dict(
+            type='TestTimeAug',
+            transforms=[
+                [
+                    dict(
+                        type='RandomFlip3D',
+                        flip_ratio_bev_horizontal=0.0,
+                        flip_ratio_bev_vertical=0.0),
+                    dict(
+                        type='RandomFlip3D',
+                        flip_ratio_bev_horizontal=0.0,
+                        flip_ratio_bev_vertical=1.0),
+                    dict(
+                        type='RandomFlip3D',
+                        flip_ratio_bev_horizontal=1.0,
+                        flip_ratio_bev_vertical=0.0),
+                    dict(
+                        type='RandomFlip3D',
+                        flip_ratio_bev_horizontal=1.0,
+                        flip_ratio_bev_vertical=1.0)
+                ],
+                [
+                    dict(
+                        type='GlobalRotScaleTrans',
+                        rot_range=[pcd_rotate_range, pcd_rotate_range],
+                        scale_ratio_range=[pcd_scale_factor, pcd_scale_factor],
+                        translation_std=[0, 0, 0])
+                    for pcd_rotate_range in [-0.78539816, 0.0, 0.78539816]
+                    for pcd_scale_factor in [0.95, 1.0, 1.05]
+                ], [dict(type='Pack3DDetInputs', keys=['points'])]
+            ])
+        tta_module = TRANSFORMS.build(tta_transform)
+
+        results = dict()
+        points = LiDARPoints(np.random.random((100, 4)), 4)
+        results['points'] = points
+
+        tta_results = tta_module(results.copy())
+        assert [
+            data_sample.metainfo['pcd_horizontal_flip']
+            for data_sample in tta_results['data_samples']
+        ] == [
+            False, False, False, False, False, False, False, False, False,
+            False, False, False, False, False, False, False, False, False,
+            True, True, True, True, True, True, True, True, True, True, True,
+            True, True, True, True, True, True, True
+        ]
+        assert [
+            data_sample.metainfo['pcd_vertical_flip']
+            for data_sample in tta_results['data_samples']
+        ] == [
+            False, False, False, False, False, False, False, False, False,
+            True, True, True, True, True, True, True, True, True, False, False,
+            False, False, False, False, False, False, False, True, True, True,
+            True, True, True, True, True, True
+        ]
+        assert [
+            data_sample.metainfo['pcd_rotation_angle']
+            for data_sample in tta_results['data_samples']
+        ] == [
+            -0.78539816, -0.78539816, -0.78539816, 0.0, 0.0, 0.0, 0.78539816,
+            0.78539816, 0.78539816, -0.78539816, -0.78539816, -0.78539816, 0.0,
+            0.0, 0.0, 0.78539816, 0.78539816, 0.78539816, -0.78539816,
+            -0.78539816, -0.78539816, 0.0, 0.0, 0.0, 0.78539816, 0.78539816,
+            0.78539816, -0.78539816, -0.78539816, -0.78539816, 0.0, 0.0, 0.0,
+            0.78539816, 0.78539816, 0.78539816
+        ]
+        assert [
+            data_sample.metainfo['pcd_scale_factor']
+            for data_sample in tta_results['data_samples']
+        ] == [
+            0.95, 1.0, 1.05, 0.95, 1.0, 1.05, 0.95, 1.0, 1.05, 0.95, 1.0, 1.05,
+            0.95, 1.0, 1.05, 0.95, 1.0, 1.05, 0.95, 1.0, 1.05, 0.95, 1.0, 1.05,
+            0.95, 1.0, 1.05, 0.95, 1.0, 1.05, 0.95, 1.0, 1.05, 0.95, 1.0, 1.05
+        ]
diff --git a/mmde/tests/test_datasets/test_waymo_dataset.py b/mmde/tests/test_datasets/test_waymo_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..20ec1fc173340a470ae4275ab67ec6938646d217
--- /dev/null
+++ b/mmde/tests/test_datasets/test_waymo_dataset.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import numpy as np
+import torch
+from mmcv.transforms.base import BaseTransform
+from mmengine.registry import TRANSFORMS
+from mmengine.structures import InstanceData
+
+from mmdet3d.datasets import WaymoDataset
+from mmdet3d.structures import Det3DDataSample, LiDARInstance3DBoxes
+
+
+def _generate_waymo_dataset_config():
+    data_root = 'tests/data/waymo/kitti_format'
+    ann_file = 'waymo_infos_train.pkl'
+    classes = ['Car', 'Pedestrian', 'Cyclist']
+    # wait for pipline refactor
+
+    if 'Identity' not in TRANSFORMS:
+
+        @TRANSFORMS.register_module()
+        class Identity(BaseTransform):
+
+            def transform(self, info):
+                if 'ann_info' in info:
+                    info['gt_labels_3d'] = info['ann_info']['gt_labels_3d']
+                data_sample = Det3DDataSample()
+                gt_instances_3d = InstanceData()
+                gt_instances_3d.labels_3d = info['gt_labels_3d']
+                data_sample.gt_instances_3d = gt_instances_3d
+                info['data_samples'] = data_sample
+                return info
+
+    pipeline = [
+        dict(type='Identity'),
+    ]
+
+    modality = dict(use_lidar=True, use_camera=True)
+    data_prefix = data_prefix = dict(
+        pts='training/velodyne', CAM_FRONT='training/image_0')
+    return data_root, ann_file, classes, data_prefix, pipeline, modality
+
+
+def test_getitem():
+    data_root, ann_file, classes, data_prefix, \
+        pipeline, modality, = _generate_waymo_dataset_config()
+
+    waymo_dataset = WaymoDataset(
+        data_root,
+        ann_file,
+        data_prefix=data_prefix,
+        pipeline=pipeline,
+        metainfo=dict(classes=classes),
+        modality=modality)
+
+    waymo_dataset.prepare_data(0)
+    input_dict = waymo_dataset.get_data_info(0)
+    waymo_dataset[0]
+    # assert the the path should contains data_prefix and data_root
+    assert data_prefix['pts'] in input_dict['lidar_points']['lidar_path']
+    assert data_root in input_dict['lidar_points']['lidar_path']
+    for cam_id, img_info in input_dict['images'].items():
+        if 'img_path' in img_info:
+            assert data_prefix['CAM_FRONT'] in img_info['img_path']
+            assert data_root in img_info['img_path']
+
+    ann_info = waymo_dataset.parse_ann_info(input_dict)
+
+    # only one instance
+    assert 'gt_labels_3d' in ann_info
+    assert ann_info['gt_labels_3d'].dtype == np.int64
+
+    assert 'gt_bboxes_3d' in ann_info
+    assert isinstance(ann_info['gt_bboxes_3d'], LiDARInstance3DBoxes)
+    assert torch.allclose(ann_info['gt_bboxes_3d'].tensor.sum(),
+                          torch.tensor(43.3103))
+    assert 'centers_2d' in ann_info
+    assert ann_info['centers_2d'].dtype == np.float32
+    assert 'depths' in ann_info
+    assert ann_info['depths'].dtype == np.float32
diff --git a/mmde/tests/test_engine/test_hooks/test_disable_object_sample_hook.py b/mmde/tests/test_engine/test_hooks/test_disable_object_sample_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbe7cd65d5943975db47c847fd6c95591c98097d
--- /dev/null
+++ b/mmde/tests/test_engine/test_hooks/test_disable_object_sample_hook.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+from unittest.mock import Mock
+
+from mmengine.dataset import BaseDataset
+
+from mmdet3d.datasets.transforms import ObjectSample
+from mmdet3d.engine.hooks import DisableObjectSampleHook
+
+
+class TestDisableObjectSampleHook(TestCase):
+
+    runner = Mock()
+    runner.train_dataloader = Mock()
+    runner.train_dataloader.dataset = Mock(spec=BaseDataset)
+    runner.train_dataloader.dataset.pipeline = Mock()
+    runner.train_dataloader._DataLoader__initialized = True
+    runner.train_dataloader.dataset.pipeline.transforms = [
+        ObjectSample(
+            db_sampler=dict(
+                data_root='tests/data/waymo/kitti_format',
+                info_path=  # noqa
+                'tests/data/waymo/kitti_format/waymo_dbinfos_train.pkl',
+                rate=1.0,
+                prepare=dict(
+                    filter_by_difficulty=[-1],
+                    filter_by_min_points=dict(Car=5)),
+                classes=['Car'],
+                sample_groups=dict(Car=15),
+            ))
+    ]
+
+    def test_is_model_wrapper_and_persistent_workers_on(self):
+        self.runner.train_dataloader.dataset.pipeline.transforms[
+            0].disabled = False
+        self.runner.train_dataloader.persistent_workers = True
+        hook = DisableObjectSampleHook(disable_after_epoch=15)
+        self.runner.epoch = 14
+        hook.before_train_epoch(self.runner)
+        self.assertFalse(self.runner.train_dataloader.dataset.pipeline.
+                         transforms[0].disabled)  # noqa: E501
+
+        self.runner.epoch = 15
+        hook.before_train_epoch(self.runner)
+        self.assertTrue(self.runner.train_dataloader.dataset.pipeline.
+                        transforms[0].disabled)  # noqa: E501
+        self.assertTrue(hook._restart_dataloader)
+        self.assertFalse(self.runner.train_dataloader._DataLoader__initialized)
+
+        self.runner.epoch = 16
+        hook.before_train_epoch(self.runner)
+        self.assertTrue(self.runner.train_dataloader._DataLoader__initialized)
+        self.assertTrue(self.runner.train_dataloader.dataset.pipeline.
+                        transforms[0].disabled)  # noqa: E501
+
+    def test_not_model_wrapper_and_persistent_workers_off(self):
+        self.runner.train_dataloader.dataset.pipeline.transforms[
+            0].disabled = False
+        self.runner.train_dataloader.persistent_workers = False
+        hook = DisableObjectSampleHook(disable_after_epoch=15)
+        self.runner.epoch = 14
+        hook.before_train_epoch(self.runner)
+        self.assertFalse(self.runner.train_dataloader.dataset.pipeline.
+                         transforms[0].disabled)  # noqa: E501
+
+        self.runner.epoch = 15
+        hook.before_train_epoch(self.runner)
+        self.assertTrue(self.runner.train_dataloader.dataset.pipeline.
+                        transforms[0].disabled)  # noqa: E501
+        self.assertFalse(hook._restart_dataloader)
+        self.assertTrue(self.runner.train_dataloader._DataLoader__initialized)
+
+        self.runner.epoch = 16
+        hook.before_train_epoch(self.runner)
+        self.assertTrue(self.runner.train_dataloader._DataLoader__initialized)
+        self.assertTrue(self.runner.train_dataloader.dataset.pipeline.
+                        transforms[0].disabled)  # noqa: E501
diff --git a/mmde/tests/test_engine/test_hooks/test_visualization_hook.py b/mmde/tests/test_engine/test_hooks/test_visualization_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bd1da5294c0a74f1aafdd0a2c79d9639a7fb4b1
--- /dev/null
+++ b/mmde/tests/test_engine/test_hooks/test_visualization_hook.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import shutil
+import time
+from unittest import TestCase
+from unittest.mock import Mock
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.engine.hooks import Det3DVisualizationHook
+from mmdet3d.structures import Det3DDataSample, LiDARInstance3DBoxes
+from mmdet3d.visualization import Det3DLocalVisualizer
+
+
+class TestVisualizationHook(TestCase):
+
+    def setUp(self) -> None:
+        Det3DLocalVisualizer.get_instance('visualizer')
+
+        pred_instances_3d = InstanceData()
+        pred_instances_3d.bboxes_3d = LiDARInstance3DBoxes(
+            torch.tensor(
+                [[8.7314, -1.8559, -1.5997, 1.2000, 0.4800, 1.8900, -1.5808]]))
+        pred_instances_3d.labels_3d = torch.tensor([0])
+        pred_instances_3d.scores_3d = torch.tensor([0.8])
+
+        pred_det3d_data_sample = Det3DDataSample()
+        pred_det3d_data_sample.set_metainfo({
+            'num_pts_feats':
+            4,
+            'lidar2img':
+            np.array([[
+                6.02943734e+02, -7.07913286e+02, -1.22748427e+01,
+                -1.70942724e+02
+            ],
+                      [
+                          1.76777261e+02, 8.80879902e+00, -7.07936120e+02,
+                          -1.02568636e+02
+                      ],
+                      [
+                          9.99984860e-01, -1.52826717e-03, -5.29071223e-03,
+                          -3.27567990e-01
+                      ],
+                      [
+                          0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+                          1.00000000e+00
+                      ]]),
+            'img_path':
+            osp.join(
+                osp.dirname(__file__),
+                '../../data/kitti/training/image_2/000000.png'),
+            'lidar_path':
+            osp.join(
+                osp.dirname(__file__),
+                '../../data/kitti/training/velodyne_reduced/000000.bin')
+        })
+        pred_det3d_data_sample.pred_instances_3d = pred_instances_3d
+        self.outputs = [pred_det3d_data_sample] * 2
+
+    def test_after_val_iter(self):
+        runner = Mock()
+        runner.iter = 1
+        hook = Det3DVisualizationHook()
+        hook.after_val_iter(runner, 1, {}, self.outputs)
+
+    def test_after_test_iter(self):
+        runner = Mock()
+        runner.iter = 1
+        hook = Det3DVisualizationHook(draw=True)
+        hook.after_test_iter(runner, 1, {}, self.outputs)
+        self.assertEqual(hook._test_index, 2)
+
+        # test
+        timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+        test_out_dir = timestamp + '1'
+        runner.work_dir = timestamp
+        runner.timestamp = '1'
+        hook = Det3DVisualizationHook(draw=False, test_out_dir=test_out_dir)
+        hook.after_test_iter(runner, 1, {}, self.outputs)
+        self.assertTrue(not osp.exists(f'{timestamp}/1/{test_out_dir}'))
+
+        hook = Det3DVisualizationHook(draw=True, test_out_dir=test_out_dir)
+        hook.after_test_iter(runner, 1, {}, self.outputs)
+        self.assertTrue(osp.exists(f'{timestamp}/1/{test_out_dir}'))
+        shutil.rmtree(f'{timestamp}')
diff --git a/mmde/tests/test_evaluation/test_functional/test_instance_seg_eval.py b/mmde/tests/test_evaluation/test_functional/test_instance_seg_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..89f93dca375ff56578b35bf97b873276724fbfc2
--- /dev/null
+++ b/mmde/tests/test_evaluation/test_functional/test_instance_seg_eval.py
@@ -0,0 +1,75 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet3d.evaluation import instance_seg_eval
+
+
+def test_instance_seg_eval():
+    valid_class_ids = (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
+                       36, 39)
+    class_labels = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door',
+                    'window', 'bookshelf', 'picture', 'counter', 'desk',
+                    'curtain', 'refrigerator', 'showercurtrain', 'toilet',
+                    'sink', 'bathtub', 'garbagebin')
+    n_points_list = [3300, 3000]
+    gt_labels_list = [[0, 0, 0, 0, 0, 0, 14, 14, 2, 1],
+                      [13, 13, 2, 1, 3, 3, 0, 0, 0]]
+    gt_instance_masks = []
+    gt_semantic_masks = []
+    pred_instance_masks = []
+    pred_instance_labels = []
+    pred_instance_scores = []
+    for n_points, gt_labels in zip(n_points_list, gt_labels_list):
+        gt_instance_mask = np.ones(n_points, dtype=np.int64) * -1
+        gt_semantic_mask = np.ones(n_points, dtype=np.int64) * -1
+        pred_instance_mask = np.ones(n_points, dtype=np.int64) * -1
+        labels = []
+        scores = []
+        for i, gt_label in enumerate(gt_labels):
+            begin = i * 300
+            end = begin + 300
+            gt_instance_mask[begin:end] = i
+            gt_semantic_mask[begin:end] = gt_label
+            pred_instance_mask[begin:end] = i
+            labels.append(gt_label)
+            scores.append(.99)
+        gt_instance_masks.append(torch.tensor(gt_instance_mask))
+        gt_semantic_masks.append(torch.tensor(gt_semantic_mask))
+        pred_instance_masks.append(torch.tensor(pred_instance_mask))
+        pred_instance_labels.append(torch.tensor(labels))
+        pred_instance_scores.append(torch.tensor(scores))
+
+    ret_value = instance_seg_eval(
+        gt_semantic_masks=gt_semantic_masks,
+        gt_instance_masks=gt_instance_masks,
+        pred_instance_masks=pred_instance_masks,
+        pred_instance_labels=pred_instance_labels,
+        pred_instance_scores=pred_instance_scores,
+        valid_class_ids=valid_class_ids,
+        class_labels=class_labels)
+    for label in [
+            'cabinet', 'bed', 'chair', 'sofa', 'showercurtrain', 'toilet'
+    ]:
+        metrics = ret_value['classes'][label]
+        assert metrics['ap'] == 1.0
+        assert metrics['ap50%'] == 1.0
+        assert metrics['ap25%'] == 1.0
+
+    pred_instance_masks[1][2240:2700] = -1
+    pred_instance_masks[0][2700:3000] = 8
+    pred_instance_labels[0][9] = 2
+    ret_value = instance_seg_eval(
+        gt_semantic_masks=gt_semantic_masks,
+        gt_instance_masks=gt_instance_masks,
+        pred_instance_masks=pred_instance_masks,
+        pred_instance_labels=pred_instance_labels,
+        pred_instance_scores=pred_instance_scores,
+        valid_class_ids=valid_class_ids,
+        class_labels=class_labels)
+    assert abs(ret_value['classes']['cabinet']['ap50%'] - 0.72916) < 0.01
+    assert abs(ret_value['classes']['cabinet']['ap25%'] - 0.88888) < 0.01
+    assert abs(ret_value['classes']['bed']['ap50%'] - 0.5) < 0.01
+    assert abs(ret_value['classes']['bed']['ap25%'] - 0.5) < 0.01
+    assert abs(ret_value['classes']['chair']['ap50%'] - 0.375) < 0.01
+    assert abs(ret_value['classes']['chair']['ap25%'] - 1.0) < 0.01
diff --git a/mmde/tests/test_evaluation/test_functional/test_kitti_eval.py b/mmde/tests/test_evaluation/test_functional/test_kitti_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8608af794f790cd6e0917462d94dac136df69ba
--- /dev/null
+++ b/mmde/tests/test_evaluation/test_functional/test_kitti_eval.py
@@ -0,0 +1,266 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.evaluation import do_eval, eval_class, kitti_eval
+
+
+def test_do_eval():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and CUDA')
+    gt_name = np.array(
+        ['Pedestrian', 'Cyclist', 'Car', 'Car', 'Car', 'DontCare', 'DontCare'])
+    gt_truncated = np.array([0., 0., 0., -1., -1., -1., -1.])
+    gt_occluded = np.array([0, 0, 3, -1, -1, -1, -1])
+    gt_alpha = np.array([-1.57, 1.85, -1.65, -10., -10., -10., -10.])
+    gt_bbox = np.array([[674.9179, 165.48549, 693.23694, 193.42134],
+                        [676.21954, 165.70988, 691.63745, 193.83748],
+                        [389.4093, 182.48041, 421.49072, 202.13422],
+                        [232.0577, 186.16724, 301.94623, 217.4024],
+                        [758.6537, 172.98509, 816.32434, 212.76743],
+                        [532.37, 176.35, 542.68, 185.27],
+                        [559.62, 175.83, 575.4, 183.15]])
+    gt_dimensions = np.array([[12.34, 2.85, 2.63], [3.69, 1.67, 1.87],
+                              [2.02, 1.86, 0.6], [-1., -1., -1.],
+                              [-1., -1., -1.], [-1., -1., -1.],
+                              [-1., -1., -1.]])
+    gt_location = np.array([[4.700e-01, 1.490e+00, 6.944e+01],
+                            [-1.653e+01, 2.390e+00, 5.849e+01],
+                            [4.590e+00, 1.320e+00, 4.584e+01],
+                            [-1.000e+03, -1.000e+03, -1.000e+03],
+                            [-1.000e+03, -1.000e+03, -1.000e+03],
+                            [-1.000e+03, -1.000e+03, -1.000e+03],
+                            [-1.000e+03, -1.000e+03, -1.000e+03]])
+    gt_rotation_y = [-1.56, 1.57, -1.55, -10., -10., -10., -10.]
+    gt_anno = dict(
+        name=gt_name,
+        truncated=gt_truncated,
+        occluded=gt_occluded,
+        alpha=gt_alpha,
+        bbox=gt_bbox,
+        dimensions=gt_dimensions,
+        location=gt_location,
+        rotation_y=gt_rotation_y)
+
+    dt_name = np.array(['Pedestrian', 'Cyclist', 'Car', 'Car', 'Car'])
+    dt_truncated = np.array([0., 0., 0., 0., 0.])
+    dt_occluded = np.array([0, 0, 0, 0, 0])
+    dt_alpha = np.array([1.0744612, 1.2775835, 1.82563, 2.1145396, -1.7676563])
+    dt_dimensions = np.array([[1.4441837, 1.7450154, 0.53160036],
+                              [1.6501029, 1.7540325, 0.5162356],
+                              [3.9313498, 1.4899347, 1.5655756],
+                              [4.0111866, 1.5350999, 1.585221],
+                              [3.7337692, 1.5117968, 1.5515774]])
+    dt_location = np.array([[4.6671643, 1.285098, 45.836895],
+                            [4.658241, 1.3088846, 45.85148],
+                            [-16.598526, 2.298814, 58.618088],
+                            [-18.629122, 2.2990575, 39.305355],
+                            [7.0964046, 1.5178275, 29.32426]])
+    dt_rotation_y = np.array(
+        [1.174933, 1.3778262, 1.550529, 1.6742425, -1.5330327])
+    dt_bbox = np.array([[674.9179, 165.48549, 693.23694, 193.42134],
+                        [676.21954, 165.70988, 691.63745, 193.83748],
+                        [389.4093, 182.48041, 421.49072, 202.13422],
+                        [232.0577, 186.16724, 301.94623, 217.4024],
+                        [758.6537, 172.98509, 816.32434, 212.76743]])
+    dt_score = np.array(
+        [0.18151495, 0.57920843, 0.27795696, 0.23100418, 0.21541929])
+    dt_anno = dict(
+        name=dt_name,
+        truncated=dt_truncated,
+        occluded=dt_occluded,
+        alpha=dt_alpha,
+        bbox=dt_bbox,
+        dimensions=dt_dimensions,
+        location=dt_location,
+        rotation_y=dt_rotation_y,
+        score=dt_score)
+    current_classes = [1, 2, 0]
+    min_overlaps = np.array([[[0.5, 0.5, 0.7], [0.5, 0.5, 0.7],
+                              [0.5, 0.5, 0.7]],
+                             [[0.5, 0.5, 0.7], [0.25, 0.25, 0.5],
+                              [0.25, 0.25, 0.5]]])
+    eval_types = ['bbox', 'bev', '3d', 'aos']
+    mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox,\
+        mAP40_bev, mAP40_3d, mAP40_aos = do_eval([gt_anno], [dt_anno],
+                                                 current_classes, min_overlaps,
+                                                 eval_types)
+    expected_mAP11_bbox = np.array([[[0., 0.], [9.09090909, 9.09090909],
+                                     [9.09090909, 9.09090909]],
+                                    [[0., 0.], [9.09090909, 9.09090909],
+                                     [9.09090909, 9.09090909]],
+                                    [[0., 0.], [9.09090909, 9.09090909],
+                                     [9.09090909, 9.09090909]]])
+    expected_mAP40_bbox = np.array([[[0., 0.], [0., 0.], [0., 0.]],
+                                    [[0., 0.], [0., 0.], [0., 0.]],
+                                    [[0., 0.], [2.5, 2.5], [2.5, 2.5]]])
+    expected_mAP11_bev = np.array([[[0., 0.], [0., 0.], [0., 0.]],
+                                   [[0., 0.], [0., 0.], [0., 0.]],
+                                   [[0., 0.], [0., 0.], [0., 0.]]])
+    expected_mAP40_bev = np.array([[[0., 0.], [0., 0.], [0., 0.]],
+                                   [[0., 0.], [0., 0.], [0., 0.]],
+                                   [[0., 0.], [0., 0.], [0., 0.]]])
+    expected_mAP11_3d = np.array([[[0., 0.], [0., 0.], [0., 0.]],
+                                  [[0., 0.], [0., 0.], [0., 0.]],
+                                  [[0., 0.], [0., 0.], [0., 0.]]])
+    expected_mAP40_3d = np.array([[[0., 0.], [0., 0.], [0., 0.]],
+                                  [[0., 0.], [0., 0.], [0., 0.]],
+                                  [[0., 0.], [0., 0.], [0., 0.]]])
+    expected_mAP11_aos = np.array([[[0., 0.], [0.55020816, 0.55020816],
+                                    [0.55020816, 0.55020816]],
+                                   [[0., 0.], [8.36633862, 8.36633862],
+                                    [8.36633862, 8.36633862]],
+                                   [[0., 0.], [8.63476893, 8.63476893],
+                                    [8.63476893, 8.63476893]]])
+    expected_mAP40_aos = np.array([[[0., 0.], [0., 0.], [0., 0.]],
+                                   [[0., 0.], [0., 0.], [0., 0.]],
+                                   [[0., 0.], [1.58140643, 1.58140643],
+                                    [1.58140643, 1.58140643]]])
+    assert np.allclose(mAP11_bbox, expected_mAP11_bbox)
+    assert np.allclose(mAP11_bev, expected_mAP11_bev)
+    assert np.allclose(mAP11_3d, expected_mAP11_3d)
+    assert np.allclose(mAP11_aos, expected_mAP11_aos)
+    assert np.allclose(mAP40_bbox, expected_mAP40_bbox)
+    assert np.allclose(mAP40_bev, expected_mAP40_bev)
+    assert np.allclose(mAP40_3d, expected_mAP40_3d)
+    assert np.allclose(mAP40_aos, expected_mAP40_aos)
+
+
+def test_kitti_eval():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and CUDA')
+    gt_name = np.array(
+        ['Pedestrian', 'Cyclist', 'Car', 'Car', 'Car', 'DontCare', 'DontCare'])
+    gt_truncated = np.array([0., 0., 0., -1., -1., -1., -1.])
+    gt_occluded = np.array([0, 0, 3, -1, -1, -1, -1])
+    gt_alpha = np.array([-1.57, 1.85, -1.65, -10., -10., -10., -10.])
+    gt_bbox = np.array([[674.9179, 165.48549, 693.23694, 193.42134],
+                        [676.21954, 165.70988, 691.63745, 193.83748],
+                        [389.4093, 182.48041, 421.49072, 202.13422],
+                        [232.0577, 186.16724, 301.94623, 217.4024],
+                        [758.6537, 172.98509, 816.32434, 212.76743],
+                        [532.37, 176.35, 542.68, 185.27],
+                        [559.62, 175.83, 575.4, 183.15]])
+    gt_dimensions = np.array([[12.34, 2.85, 2.63], [3.69, 1.67, 1.87],
+                              [2.02, 1.86, 0.6], [-1., -1., -1.],
+                              [-1., -1., -1.], [-1., -1., -1.],
+                              [-1., -1., -1.]])
+    gt_location = np.array([[4.700e-01, 1.490e+00, 6.944e+01],
+                            [-1.653e+01, 2.390e+00, 5.849e+01],
+                            [4.590e+00, 1.320e+00, 4.584e+01],
+                            [-1.000e+03, -1.000e+03, -1.000e+03],
+                            [-1.000e+03, -1.000e+03, -1.000e+03],
+                            [-1.000e+03, -1.000e+03, -1.000e+03],
+                            [-1.000e+03, -1.000e+03, -1.000e+03]])
+    gt_rotation_y = [-1.56, 1.57, -1.55, -10., -10., -10., -10.]
+    gt_anno = dict(
+        name=gt_name,
+        truncated=gt_truncated,
+        occluded=gt_occluded,
+        alpha=gt_alpha,
+        bbox=gt_bbox,
+        dimensions=gt_dimensions,
+        location=gt_location,
+        rotation_y=gt_rotation_y)
+
+    dt_name = np.array(['Pedestrian', 'Cyclist', 'Car', 'Car', 'Car'])
+    dt_truncated = np.array([0., 0., 0., 0., 0.])
+    dt_occluded = np.array([0, 0, 0, 0, 0])
+    dt_alpha = np.array([1.0744612, 1.2775835, 1.82563, 2.1145396, -1.7676563])
+    dt_dimensions = np.array([[1.4441837, 1.7450154, 0.53160036],
+                              [1.6501029, 1.7540325, 0.5162356],
+                              [3.9313498, 1.4899347, 1.5655756],
+                              [4.0111866, 1.5350999, 1.585221],
+                              [3.7337692, 1.5117968, 1.5515774]])
+    dt_location = np.array([[4.6671643, 1.285098, 45.836895],
+                            [4.658241, 1.3088846, 45.85148],
+                            [-16.598526, 2.298814, 58.618088],
+                            [-18.629122, 2.2990575, 39.305355],
+                            [7.0964046, 1.5178275, 29.32426]])
+    dt_rotation_y = np.array(
+        [1.174933, 1.3778262, 1.550529, 1.6742425, -1.5330327])
+    dt_bbox = np.array([[674.9179, 165.48549, 693.23694, 193.42134],
+                        [676.21954, 165.70988, 691.63745, 193.83748],
+                        [389.4093, 182.48041, 421.49072, 202.13422],
+                        [232.0577, 186.16724, 301.94623, 217.4024],
+                        [758.6537, 172.98509, 816.32434, 212.76743]])
+    dt_score = np.array(
+        [0.18151495, 0.57920843, 0.27795696, 0.23100418, 0.21541929])
+    dt_anno = dict(
+        name=dt_name,
+        truncated=dt_truncated,
+        occluded=dt_occluded,
+        alpha=dt_alpha,
+        bbox=dt_bbox,
+        dimensions=dt_dimensions,
+        location=dt_location,
+        rotation_y=dt_rotation_y,
+        score=dt_score)
+
+    current_classes = [1, 2, 0]
+    result, ret_dict = kitti_eval([gt_anno], [dt_anno], current_classes)
+    assert np.isclose(ret_dict['KITTI/Overall_2D_AP11_moderate'],
+                      9.090909090909092)
+    assert np.isclose(ret_dict['KITTI/Overall_2D_AP11_hard'],
+                      9.090909090909092)
+    assert np.isclose(ret_dict['KITTI/Overall_2D_AP40_moderate'],
+                      0.8333333333333334)
+    assert np.isclose(ret_dict['KITTI/Overall_2D_AP40_hard'],
+                      0.8333333333333334)
+
+
+def test_eval_class():
+    gt_name = np.array(
+        ['Pedestrian', 'Cyclist', 'Car', 'Car', 'Car', 'DontCare', 'DontCare'])
+    gt_truncated = np.array([0., 0., 0., -1., -1., -1., -1.])
+    gt_occluded = np.array([0, 0, 3, -1, -1, -1, -1])
+    gt_alpha = np.array([-1.57, 1.85, -1.65, -10., -10., -10., -10.])
+    gt_bbox = np.array([[674.9179, 165.48549, 693.23694, 193.42134],
+                        [676.21954, 165.70988, 691.63745, 193.83748],
+                        [389.4093, 182.48041, 421.49072, 202.13422],
+                        [232.0577, 186.16724, 301.94623, 217.4024],
+                        [758.6537, 172.98509, 816.32434, 212.76743],
+                        [532.37, 176.35, 542.68, 185.27],
+                        [559.62, 175.83, 575.4, 183.15]])
+    gt_anno = dict(
+        name=gt_name,
+        truncated=gt_truncated,
+        occluded=gt_occluded,
+        alpha=gt_alpha,
+        bbox=gt_bbox)
+
+    dt_name = np.array(['Pedestrian', 'Cyclist', 'Car', 'Car', 'Car'])
+    dt_truncated = np.array([0., 0., 0., 0., 0.])
+    dt_occluded = np.array([0, 0, 0, 0, 0])
+    dt_alpha = np.array([1.0744612, 1.2775835, 1.82563, 2.1145396, -1.7676563])
+    dt_bbox = np.array([[674.9179, 165.48549, 693.23694, 193.42134],
+                        [676.21954, 165.70988, 691.63745, 193.83748],
+                        [389.4093, 182.48041, 421.49072, 202.13422],
+                        [232.0577, 186.16724, 301.94623, 217.4024],
+                        [758.6537, 172.98509, 816.32434, 212.76743]])
+    dt_score = np.array(
+        [0.18151495, 0.57920843, 0.27795696, 0.23100418, 0.21541929])
+    dt_anno = dict(
+        name=dt_name,
+        truncated=dt_truncated,
+        occluded=dt_occluded,
+        alpha=dt_alpha,
+        bbox=dt_bbox,
+        score=dt_score)
+    current_classes = [1, 2, 0]
+    difficultys = [0, 1, 2]
+    metric = 0
+    min_overlaps = np.array([[[0.5, 0.5, 0.7], [0.5, 0.5, 0.7],
+                              [0.5, 0.5, 0.7]],
+                             [[0.5, 0.5, 0.7], [0.25, 0.25, 0.5],
+                              [0.25, 0.25, 0.5]]])
+
+    ret_dict = eval_class([gt_anno], [dt_anno], current_classes, difficultys,
+                          metric, min_overlaps, True, 1)
+    recall_sum = np.sum(ret_dict['recall'])
+    precision_sum = np.sum(ret_dict['precision'])
+    orientation_sum = np.sum(ret_dict['orientation'])
+    assert np.isclose(recall_sum, 16)
+    assert np.isclose(precision_sum, 16)
+    assert np.isclose(orientation_sum, 10.252829201850309)
diff --git a/mmde/tests/test_evaluation/test_functional/test_panoptic_seg_eval.py b/mmde/tests/test_evaluation/test_functional/test_panoptic_seg_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..d67abe6be70a6e16cbac2dc678ac7b55f5d637eb
--- /dev/null
+++ b/mmde/tests/test_evaluation/test_functional/test_panoptic_seg_eval.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.evaluation.functional.panoptic_seg_eval import panoptic_seg_eval
+
+
+def test_panoptic_seg_eval():
+    if not torch.cuda.is_available():
+        pytest.skip()
+
+    classes = ['unlabeled', 'person', 'dog', 'grass', 'sky']
+    label2cat = {
+        0: 'unlabeled',
+        1: 'person',
+        2: 'dog',
+        3: 'grass',
+        4: 'sky',
+    }
+
+    thing_classes = ['person', 'dog']
+    stuff_classes = ['grass', 'sky']
+    ignore_index = [0]  # only ignore ignore class
+    min_points = 1  # for this example we care about all points
+    offset = 2**16
+
+    # generate ground truth and prediction
+    semantic_preds = []
+    instance_preds = []
+    gt_semantic = []
+    gt_instance = []
+
+    # some ignore stuff
+    num_ignore = 50
+    semantic_preds.extend([0 for i in range(num_ignore)])
+    instance_preds.extend([0 for i in range(num_ignore)])
+    gt_semantic.extend([0 for i in range(num_ignore)])
+    gt_instance.extend([0 for i in range(num_ignore)])
+
+    # grass segment
+    num_grass = 50
+    num_grass_pred = 40  # rest is sky
+    semantic_preds.extend([1 for i in range(num_grass_pred)])  # grass
+    semantic_preds.extend([2
+                           for i in range(num_grass - num_grass_pred)])  # sky
+    instance_preds.extend([0 for i in range(num_grass)])
+    gt_semantic.extend([1 for i in range(num_grass)])  # grass
+    gt_instance.extend([0 for i in range(num_grass)])
+
+    # sky segment
+    num_sky = 50
+    num_sky_pred = 40  # rest is grass
+    semantic_preds.extend([2 for i in range(num_sky_pred)])  # sky
+    semantic_preds.extend([1 for i in range(num_sky - num_sky_pred)])  # grass
+    instance_preds.extend([0 for i in range(num_sky)])  # first instance
+    gt_semantic.extend([2 for i in range(num_sky)])  # sky
+    gt_instance.extend([0 for i in range(num_sky)])  # first instance
+
+    # wrong dog as person prediction
+    num_dog = 50
+    num_person = num_dog
+    semantic_preds.extend([3 for i in range(num_person)])
+    instance_preds.extend([35 for i in range(num_person)])
+    gt_semantic.extend([4 for i in range(num_dog)])
+    gt_instance.extend([22 for i in range(num_dog)])
+
+    # two persons in prediction, but three in gt
+    num_person = 50
+    semantic_preds.extend([3 for i in range(6 * num_person)])
+    instance_preds.extend([8 for i in range(4 * num_person)])
+    instance_preds.extend([95 for i in range(2 * num_person)])
+    gt_semantic.extend([3 for i in range(6 * num_person)])
+    gt_instance.extend([33 for i in range(3 * num_person)])
+    gt_instance.extend([42 for i in range(num_person)])
+    gt_instance.extend([11 for i in range(2 * num_person)])
+
+    # gt and pred to numpy
+    semantic_preds = np.array(semantic_preds, dtype=int).reshape(1, -1)
+    instance_preds = np.array(instance_preds, dtype=int).reshape(1, -1)
+    gt_semantic = np.array(gt_semantic, dtype=int).reshape(1, -1)
+    gt_instance = np.array(gt_instance, dtype=int).reshape(1, -1)
+
+    gt_labels = [{
+        'pts_semantic_mask': gt_semantic,
+        'pts_instance_mask': gt_instance
+    }]
+
+    seg_preds = [{
+        'pts_semantic_mask': semantic_preds,
+        'pts_instance_mask': instance_preds
+    }]
+
+    ret_value = panoptic_seg_eval(gt_labels, seg_preds, classes, thing_classes,
+                                  stuff_classes, min_points, offset, label2cat,
+                                  ignore_index)
+
+    assert np.isclose(ret_value['pq'], 0.47916666666666663)
+    assert np.isclose(ret_value['rq_mean'], 0.6666666666666666)
+    assert np.isclose(ret_value['sq_mean'], 0.5520833333333333)
+    assert np.isclose(ret_value['miou'], 0.5476190476190476)
diff --git a/mmde/tests/test_evaluation/test_functional/test_seg_eval.py b/mmde/tests/test_evaluation/test_functional/test_seg_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2b44854aa671c5bf85016b4c1dcbb5c2402209c
--- /dev/null
+++ b/mmde/tests/test_evaluation/test_functional/test_seg_eval.py
@@ -0,0 +1,39 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.evaluation.functional.seg_eval import seg_eval
+
+
+def test_indoor_eval():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    seg_preds = [
+        np.array([
+            0, 0, 1, 0, 0, 2, 1, 3, 1, 2, 1, 0, 2, 2, 2, 2, 1, 3, 0, 3, 3, 4, 0
+        ])
+    ]
+    gt_labels = [
+        np.array([
+            0, 0, 0, 4, 0, 0, 1, 1, 1, 4, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4
+        ])
+    ]
+
+    label2cat = {
+        0: 'car',
+        1: 'bicycle',
+        2: 'motorcycle',
+        3: 'truck',
+        4: 'unlabeled'
+    }
+    ret_value = seg_eval(gt_labels, seg_preds, label2cat, ignore_index=4)
+
+    assert np.isclose(ret_value['car'], 0.428571429)
+    assert np.isclose(ret_value['bicycle'], 0.428571429)
+    assert np.isclose(ret_value['motorcycle'], 0.6666667)
+    assert np.isclose(ret_value['truck'], 0.5)
+
+    assert np.isclose(ret_value['acc'], 0.65)
+    assert np.isclose(ret_value['acc_cls'], 0.65)
+    assert np.isclose(ret_value['miou'], 0.50595238)
diff --git a/mmde/tests/test_evaluation/test_metrics/test_indoor_metric.py b/mmde/tests/test_evaluation/test_metrics/test_indoor_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6c1fddff3e6e8f543fb116eff603e4ca507f697
--- /dev/null
+++ b/mmde/tests/test_evaluation/test_metrics/test_indoor_metric.py
@@ -0,0 +1,65 @@
+import unittest
+from io import StringIO
+from unittest.mock import patch
+
+import numpy as np
+import torch
+
+from mmdet3d.evaluation.metrics import IndoorMetric
+from mmdet3d.structures import DepthInstance3DBoxes
+
+
+class TestIndoorMetric(unittest.TestCase):
+
+    @patch('sys.stdout', new_callable=StringIO)
+    def test_process(self, stdout):
+        indoor_metric = IndoorMetric()
+        eval_ann_info = {
+            'gt_bboxes_3d':
+            DepthInstance3DBoxes(
+                torch.tensor([
+                    [2.3578, 1.7841, -0.0987, 0.5532, 0.4948, 0.6474, 0.0000],
+                    [-0.2773, -2.1403, 0.0615, 0.4786, 0.5170, 0.3842, 0.0000],
+                    [0.0259, -2.7954, -0.0157, 0.3869, 0.4361, 0.5229, 0.0000],
+                    [-2.3968, 1.1040, 0.0945, 2.5563, 1.5989, 0.9322, 0.0000],
+                    [
+                        -0.3173, -2.7770, -0.0134, 0.5473, 0.8569, 0.5577,
+                        0.0000
+                    ],
+                    [-2.4882, -1.4437, 0.0987, 1.2199, 0.4859, 0.6461, 0.0000],
+                    [-3.4702, -0.1315, 0.2463, 1.3137, 0.8022, 0.4765, 0.0000],
+                    [1.9786, 3.0196, -0.0934, 1.6129, 0.5834, 1.4662, 0.0000],
+                    [2.3835, 2.2691, -0.1376, 0.5197, 0.5099, 0.6896, 0.0000],
+                    [2.5986, -0.5313, 1.4269, 0.0696, 0.2933, 0.3104, 0.0000],
+                    [0.4555, -3.1278, -0.0637, 2.0247, 0.1292, 0.2419, 0.0000],
+                    [0.4655, -3.1941, 0.3769, 2.1132, 0.3536, 1.9803, 0.0000]
+                ])),
+            'gt_labels_3d':
+            np.array([2, 2, 2, 3, 4, 17, 4, 7, 2, 8, 17, 11])
+        }
+
+        pred_instances_3d = dict()
+        pred_instances_3d['scores_3d'] = torch.ones(
+            len(eval_ann_info['gt_bboxes_3d']))
+        pred_instances_3d['bboxes_3d'] = eval_ann_info['gt_bboxes_3d']
+        pred_instances_3d['labels_3d'] = torch.Tensor(
+            eval_ann_info['gt_labels_3d'])
+        pred_dict = dict()
+        pred_dict['pred_instances_3d'] = pred_instances_3d
+        pred_dict['eval_ann_info'] = eval_ann_info
+
+        indoor_metric.dataset_meta = {
+            'classes': ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door',
+                        'window', 'bookshelf', 'picture', 'counter', 'desk',
+                        'curtain', 'refrigerator', 'showercurtrain', 'toilet',
+                        'sink', 'bathtub', 'garbagebin'),
+            'box_type_3d':
+            'Depth',
+        }
+
+        indoor_metric.process({}, [pred_dict])
+
+        eval_results = indoor_metric.evaluate(1)
+        for v in eval_results.values():
+            # map == 1
+            self.assertEqual(1, v)
diff --git a/mmde/tests/test_evaluation/test_metrics/test_instance_seg_metric.py b/mmde/tests/test_evaluation/test_metrics/test_instance_seg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae1dcedbe2ce6e86587f5e86caacdecb57bdeff
--- /dev/null
+++ b/mmde/tests/test_evaluation/test_metrics/test_instance_seg_metric.py
@@ -0,0 +1,75 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import numpy as np
+import torch
+from mmengine.structures import BaseDataElement
+
+from mmdet3d.evaluation.metrics import InstanceSegMetric
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+class TestInstanceSegMetric(unittest.TestCase):
+
+    def _demo_mm_model_output(self):
+        """Create a superset of inputs needed to run test or train batches."""
+
+        n_points = 3300
+        gt_labels = [0, 0, 0, 0, 0, 0, 14, 14, 2, 1]
+        gt_instance_mask = np.ones(n_points, dtype=np.int64) * -1
+        gt_semantic_mask = np.ones(n_points, dtype=np.int64) * -1
+        for i, gt_label in enumerate(gt_labels):
+            begin = i * 300
+            end = begin + 300
+            gt_instance_mask[begin:end] = i
+            gt_semantic_mask[begin:end] = gt_label
+
+        ann_info_data = dict()
+        ann_info_data['pts_instance_mask'] = torch.tensor(gt_instance_mask)
+        ann_info_data['pts_semantic_mask'] = torch.tensor(gt_semantic_mask)
+
+        results_dict = dict()
+        n_points = 3300
+        gt_labels = [0, 0, 0, 0, 0, 0, 14, 14, 2, 1]
+        pred_instance_mask = np.ones(n_points, dtype=np.int64) * -1
+        labels = []
+        scores = []
+        for i, gt_label in enumerate(gt_labels):
+            begin = i * 300
+            end = begin + 300
+            pred_instance_mask[begin:end] = i
+            labels.append(gt_label)
+            scores.append(.99)
+
+        results_dict['pts_instance_mask'] = torch.tensor(pred_instance_mask)
+        results_dict['instance_labels'] = torch.tensor(labels)
+        results_dict['instance_scores'] = torch.tensor(scores)
+        data_sample = Det3DDataSample()
+        data_sample.pred_pts_seg = PointData(**results_dict)
+        data_sample.eval_ann_info = ann_info_data
+        batch_data_samples = [data_sample]
+
+        predictions = []
+        for pred in batch_data_samples:
+            if isinstance(pred, BaseDataElement):
+                pred = pred.to_dict()
+            predictions.append(pred)
+
+        return predictions
+
+    def test_evaluate(self):
+        data_batch = {}
+        predictions = self._demo_mm_model_output()
+        seg_valid_class_ids = (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28,
+                               33, 34, 36, 39)
+        class_labels = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door',
+                        'window', 'bookshelf', 'picture', 'counter', 'desk',
+                        'curtain', 'refrigerator', 'showercurtrain', 'toilet',
+                        'sink', 'bathtub', 'garbagebin')
+        dataset_meta = dict(
+            seg_valid_class_ids=seg_valid_class_ids, classes=class_labels)
+        instance_seg_metric = InstanceSegMetric()
+        instance_seg_metric.dataset_meta = dataset_meta
+        instance_seg_metric.process(data_batch, predictions)
+        res = instance_seg_metric.evaluate(1)
+        self.assertIsInstance(res, dict)
diff --git a/mmde/tests/test_evaluation/test_metrics/test_kitti_metric.py b/mmde/tests/test_evaluation/test_metrics/test_kitti_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..070314150bbe8d3ab4f3eb2e83c0ed7e117f0721
--- /dev/null
+++ b/mmde/tests/test_evaluation/test_metrics/test_kitti_metric.py
@@ -0,0 +1,89 @@
+import numpy as np
+import pytest
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.evaluation.metrics import KittiMetric
+from mmdet3d.structures import Det3DDataSample, LiDARInstance3DBoxes
+
+data_root = 'tests/data/kitti'
+
+
+def _init_evaluate_input():
+    metainfo = dict(sample_idx=0)
+    predictions = Det3DDataSample()
+    pred_instances_3d = InstanceData()
+    pred_instances_3d.bboxes_3d = LiDARInstance3DBoxes(
+        torch.tensor(
+            [[8.7314, -1.8559, -1.5997, 0.4800, 1.2000, 1.8900, 0.0100]]))
+    pred_instances_3d.scores_3d = torch.Tensor([0.9])
+    pred_instances_3d.labels_3d = torch.Tensor([0])
+
+    predictions.pred_instances_3d = pred_instances_3d
+    predictions.pred_instances = InstanceData()
+    predictions.set_metainfo(metainfo)
+    predictions = predictions.to_dict()
+    return {}, [predictions]
+
+
+def _init_multi_modal_evaluate_input():
+    metainfo = dict(sample_idx=0)
+    predictions = Det3DDataSample()
+    pred_instances_3d = InstanceData()
+    pred_instances = InstanceData()
+    pred_instances.bboxes = torch.tensor([[712.4, 143, 810.7, 307.92]])
+    pred_instances.scores = torch.Tensor([0.9])
+    pred_instances.labels = torch.Tensor([0])
+    pred_instances_3d.bboxes_3d = LiDARInstance3DBoxes(
+        torch.tensor(
+            [[8.7314, -1.8559, -1.5997, 0.4800, 1.2000, 1.8900, 0.0100]]))
+
+    pred_instances_3d.scores_3d = torch.Tensor([0.9])
+    pred_instances_3d.labels_3d = torch.Tensor([0])
+
+    predictions.pred_instances_3d = pred_instances_3d
+    predictions.pred_instances = pred_instances
+    predictions.set_metainfo(metainfo)
+    predictions = predictions.to_dict()
+    return {}, [predictions]
+
+
+def test_multi_modal_kitti_metric():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    kittimetric = KittiMetric(
+        data_root + '/kitti_infos_train.pkl', metric=['mAP'])
+    kittimetric.dataset_meta = dict(classes=['Pedestrian', 'Cyclist', 'Car'])
+    data_batch, predictions = _init_multi_modal_evaluate_input()
+    kittimetric.process(data_batch, predictions)
+    ap_dict = kittimetric.compute_metrics(kittimetric.results)
+    assert np.isclose(ap_dict['pred_instances_3d/KITTI/Overall_3D_AP11_easy'],
+                      3.0303030303030307)
+    assert np.isclose(ap_dict['pred_instances_3d/KITTI/Overall_BEV_AP11_easy'],
+                      3.0303030303030307)
+    assert np.isclose(ap_dict['pred_instances_3d/KITTI/Overall_2D_AP11_easy'],
+                      3.0303030303030307)
+    assert np.isclose(ap_dict['pred_instances/KITTI/Overall_2D_AP11_easy'],
+                      3.0303030303030307)
+    assert np.isclose(ap_dict['pred_instances/KITTI/Overall_2D_AP11_moderate'],
+                      3.0303030303030307)
+    assert np.isclose(ap_dict['pred_instances/KITTI/Overall_2D_AP11_hard'],
+                      3.0303030303030307)
+
+
+def test_kitti_metric_mAP():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    kittimetric = KittiMetric(
+        data_root + '/kitti_infos_train.pkl', metric=['mAP'])
+    kittimetric.dataset_meta = dict(classes=['Pedestrian', 'Cyclist', 'Car'])
+    data_batch, predictions = _init_evaluate_input()
+    kittimetric.process(data_batch, predictions)
+    ap_dict = kittimetric.compute_metrics(kittimetric.results)
+    assert np.isclose(ap_dict['pred_instances_3d/KITTI/Overall_3D_AP11_easy'],
+                      3.0303030303030307)
+    assert np.isclose(
+        ap_dict['pred_instances_3d/KITTI/Overall_3D_AP11_moderate'],
+        3.0303030303030307)
+    assert np.isclose(ap_dict['pred_instances_3d/KITTI/Overall_3D_AP11_hard'],
+                      3.0303030303030307)
diff --git a/mmde/tests/test_evaluation/test_metrics/test_panoptic_seg_metric.py b/mmde/tests/test_evaluation/test_metrics/test_panoptic_seg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbf8f6df917c729599fe5300bfc86f02b8671399
--- /dev/null
+++ b/mmde/tests/test_evaluation/test_metrics/test_panoptic_seg_metric.py
@@ -0,0 +1,123 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import numpy as np
+import torch
+from mmengine.structures import BaseDataElement
+
+from mmdet3d.evaluation.metrics import PanopticSegMetric
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+class TestPanopticSegMetric(unittest.TestCase):
+
+    def _demo_mm_model_output(self):
+        """Create a superset of inputs needed to run test or train batches."""
+        # generate ground truth and prediction
+        semantic_preds = []
+        instance_preds = []
+        gt_semantic = []
+        gt_instance = []
+
+        # some ignore stuff
+        num_ignore = 50
+        semantic_preds.extend([0 for i in range(num_ignore)])
+        instance_preds.extend([0 for i in range(num_ignore)])
+        gt_semantic.extend([0 for i in range(num_ignore)])
+        gt_instance.extend([0 for i in range(num_ignore)])
+
+        # grass segment
+        num_grass = 50
+        num_grass_pred = 40  # rest is sky
+        semantic_preds.extend([1 for i in range(num_grass_pred)])  # grass
+        semantic_preds.extend([2 for i in range(num_grass - num_grass_pred)
+                               ])  # sky
+        instance_preds.extend([0 for i in range(num_grass)])
+        gt_semantic.extend([1 for i in range(num_grass)])  # grass
+        gt_instance.extend([0 for i in range(num_grass)])
+
+        # sky segment
+        num_sky = 50
+        num_sky_pred = 40  # rest is grass
+        semantic_preds.extend([2 for i in range(num_sky_pred)])  # sky
+        semantic_preds.extend([1 for i in range(num_sky - num_sky_pred)
+                               ])  # grass
+        instance_preds.extend([0 for i in range(num_sky)])  # first instance
+        gt_semantic.extend([2 for i in range(num_sky)])  # sky
+        gt_instance.extend([0 for i in range(num_sky)])  # first instance
+
+        # wrong dog as person prediction
+        num_dog = 50
+        num_person = num_dog
+        semantic_preds.extend([3 for i in range(num_person)])
+        instance_preds.extend([35 for i in range(num_person)])
+        gt_semantic.extend([4 for i in range(num_dog)])
+        gt_instance.extend([22 for i in range(num_dog)])
+
+        # two persons in prediction, but three in gt
+        num_person = 50
+        semantic_preds.extend([3 for i in range(6 * num_person)])
+        instance_preds.extend([8 for i in range(4 * num_person)])
+        instance_preds.extend([95 for i in range(2 * num_person)])
+        gt_semantic.extend([3 for i in range(6 * num_person)])
+        gt_instance.extend([33 for i in range(3 * num_person)])
+        gt_instance.extend([42 for i in range(num_person)])
+        gt_instance.extend([11 for i in range(2 * num_person)])
+
+        # gt and pred to numpy
+        semantic_preds = np.array(semantic_preds, dtype=int).reshape(1, -1)
+        instance_preds = np.array(instance_preds, dtype=int).reshape(1, -1)
+        gt_semantic = np.array(gt_semantic, dtype=int).reshape(1, -1)
+        gt_instance = np.array(gt_instance, dtype=int).reshape(1, -1)
+
+        pred_pts_semantic_mask = torch.Tensor(semantic_preds)
+        pred_pts_instance_mask = torch.Tensor(instance_preds)
+        pred_pts_seg_data = dict(
+            pts_semantic_mask=pred_pts_semantic_mask,
+            pts_instance_mask=pred_pts_instance_mask)
+        data_sample = Det3DDataSample()
+        data_sample.pred_pts_seg = PointData(**pred_pts_seg_data)
+
+        ann_info_data = dict(
+            pts_semantic_mask=gt_semantic, pts_instance_mask=gt_instance)
+        data_sample.eval_ann_info = ann_info_data
+
+        batch_data_samples = [data_sample]
+
+        predictions = []
+        for pred in batch_data_samples:
+            if isinstance(pred, BaseDataElement):
+                pred = pred.to_dict()
+            predictions.append(pred)
+
+        return predictions
+
+    def test_evaluate(self):
+        data_batch = {}
+        predictions = self._demo_mm_model_output()
+
+        classes = ['unlabeled', 'person', 'dog', 'grass', 'sky']
+        label2cat = {
+            0: 'unlabeled',
+            1: 'person',
+            2: 'dog',
+            3: 'grass',
+            4: 'sky',
+        }
+
+        ignore_index = [0]  # only ignore ignore class
+        min_num_points = 1  # for this example we care about all points
+        id_offset = 2**16
+
+        dataset_meta = dict(
+            label2cat=label2cat, ignore_index=ignore_index, classes=classes)
+        panoptic_seg_metric = PanopticSegMetric(
+            thing_class_inds=[0, 1],
+            stuff_class_inds=[2, 3],
+            min_num_points=min_num_points,
+            id_offset=id_offset,
+        )
+        panoptic_seg_metric.dataset_meta = dataset_meta
+        panoptic_seg_metric.process(data_batch, predictions)
+        res = panoptic_seg_metric.evaluate(1)
+        self.assertIsInstance(res, dict)
diff --git a/mmde/tests/test_evaluation/test_metrics/test_seg_metric.py b/mmde/tests/test_evaluation/test_metrics/test_seg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f2507a04803984a44c2a37acd6ddcf5ad9e1238
--- /dev/null
+++ b/mmde/tests/test_evaluation/test_metrics/test_seg_metric.py
@@ -0,0 +1,54 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import numpy as np
+import torch
+from mmengine.structures import BaseDataElement
+
+from mmdet3d.evaluation.metrics import SegMetric
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+class TestSegMetric(unittest.TestCase):
+
+    def _demo_mm_model_output(self):
+        """Create a superset of inputs needed to run test or train batches."""
+        pred_pts_semantic_mask = torch.Tensor([
+            0, 0, 1, 0, 0, 2, 1, 3, 1, 2, 1, 0, 2, 2, 2, 2, 1, 3, 0, 3, 3, 3, 3
+        ])
+        pred_pts_seg_data = dict(pts_semantic_mask=pred_pts_semantic_mask)
+        data_sample = Det3DDataSample()
+        data_sample.pred_pts_seg = PointData(**pred_pts_seg_data)
+
+        gt_pts_semantic_mask = np.array([
+            0, 0, 0, 255, 0, 0, 1, 1, 1, 255, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3,
+            3, 255
+        ])
+        ann_info_data = dict(pts_semantic_mask=gt_pts_semantic_mask)
+        data_sample.eval_ann_info = ann_info_data
+
+        batch_data_samples = [data_sample]
+
+        predictions = []
+        for pred in batch_data_samples:
+            if isinstance(pred, BaseDataElement):
+                pred = pred.to_dict()
+            predictions.append(pred)
+
+        return predictions
+
+    def test_evaluate(self):
+        data_batch = {}
+        predictions = self._demo_mm_model_output()
+        label2cat = {
+            0: 'car',
+            1: 'bicycle',
+            2: 'motorcycle',
+            3: 'truck',
+        }
+        dataset_meta = dict(label2cat=label2cat, ignore_index=255)
+        seg_metric = SegMetric()
+        seg_metric.dataset_meta = dataset_meta
+        seg_metric.process(data_batch, predictions)
+        res = seg_metric.evaluate(1)
+        self.assertIsInstance(res, dict)
diff --git a/mmde/tests/test_models/test_backbones/test_cylinder3d_backbone.py b/mmde/tests/test_models/test_backbones/test_cylinder3d_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea6b3e7ba2045a13cb33ff790053d5b555fcfd6d
--- /dev/null
+++ b/mmde/tests/test_models/test_backbones/test_cylinder3d_backbone.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_cylinder3d():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    cfg = dict(
+        type='Asymm3DSpconv',
+        grid_size=[48, 32, 4],
+        input_channels=16,
+        base_channels=32,
+        norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.1))
+    self = MODELS.build(cfg)
+    self.cuda()
+
+    batch_size = 1
+    coorx = torch.randint(0, 48, (50, 1))
+    coory = torch.randint(0, 36, (50, 1))
+    coorz = torch.randint(0, 4, (50, 1))
+    coorbatch = torch.zeros(50, 1)
+    coors = torch.cat([coorbatch, coorx, coory, coorz], dim=1).cuda()
+    voxel_features = torch.rand(50, 16).cuda()
+
+    # test forward
+    feature = self(voxel_features, coors, batch_size)
+
+    assert feature.features.shape == (50, 128)
+    assert feature.indices.data.shape == (50, 4)
diff --git a/mmde/tests/test_models/test_backbones/test_dgcnn.py b/mmde/tests/test_models/test_backbones/test_dgcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..27d7cffb52238f1089c498018c2679a4357c30fc
--- /dev/null
+++ b/mmde/tests/test_models/test_backbones/test_dgcnn.py
@@ -0,0 +1,39 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_dgcnn_gf():
+    if not torch.cuda.is_available():
+        pytest.skip()
+
+    # DGCNNGF used in segmentation
+    cfg = dict(
+        type='DGCNNBackbone',
+        in_channels=6,
+        num_samples=(20, 20, 20),
+        knn_modes=['D-KNN', 'F-KNN', 'F-KNN'],
+        radius=(None, None, None),
+        gf_channels=((64, 64), (64, 64), (64, )),
+        fa_channels=(1024, ),
+        act_cfg=dict(type='ReLU'))
+
+    self = MODELS.build(cfg)
+    self.cuda()
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', dtype=np.float32)
+    xyz = torch.from_numpy(xyz).view(1, -1, 6).cuda()  # (B, N, 6)
+    # test forward
+    ret_dict = self(xyz)
+    gf_points = ret_dict['gf_points']
+    fa_points = ret_dict['fa_points']
+
+    assert len(gf_points) == 4
+    assert gf_points[0].shape == torch.Size([1, 100, 6])
+    assert gf_points[1].shape == torch.Size([1, 100, 64])
+    assert gf_points[2].shape == torch.Size([1, 100, 64])
+    assert gf_points[3].shape == torch.Size([1, 100, 64])
+    assert fa_points.shape == torch.Size([1, 100, 1216])
diff --git a/mmde/tests/test_models/test_backbones/test_dla.py b/mmde/tests/test_models/test_backbones/test_dla.py
new file mode 100644
index 0000000000000000000000000000000000000000..915c024c0e6c609b45891ab6391481498abac509
--- /dev/null
+++ b/mmde/tests/test_models/test_backbones/test_dla.py
@@ -0,0 +1,26 @@
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_dla_net():
+    # test DLANet used in SMOKE
+    # test list config
+    cfg = dict(
+        type='DLANet',
+        depth=34,
+        in_channels=3,
+        norm_cfg=dict(type='GN', num_groups=32))
+
+    img = torch.randn((4, 3, 32, 32))
+    self = MODELS.build(cfg)
+    self.init_weights()
+
+    results = self(img)
+    assert len(results) == 6
+    assert results[0].shape == torch.Size([4, 16, 32, 32])
+    assert results[1].shape == torch.Size([4, 32, 16, 16])
+    assert results[2].shape == torch.Size([4, 64, 8, 8])
+    assert results[3].shape == torch.Size([4, 128, 4, 4])
+    assert results[4].shape == torch.Size([4, 256, 2, 2])
+    assert results[5].shape == torch.Size([4, 512, 1, 1])
diff --git a/mmde/tests/test_models/test_backbones/test_mink_resnet.py b/mmde/tests/test_models/test_backbones/test_mink_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..04552b7272bc9635c43008574d96dc380ee75f9c
--- /dev/null
+++ b/mmde/tests/test_models/test_backbones/test_mink_resnet.py
@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_mink_resnet():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+
+    try:
+        import MinkowskiEngine as ME
+    except ImportError:
+        pytest.skip('test requires MinkowskiEngine installation')
+
+    coordinates, features = [], []
+    np.random.seed(42)
+    # batch of 2 point clouds
+    for i in range(2):
+        c = torch.from_numpy(np.random.rand(500, 3) * 100)
+        coordinates.append(c.float().cuda())
+        f = torch.from_numpy(np.random.rand(500, 3))
+        features.append(f.float().cuda())
+    tensor_coordinates, tensor_features = ME.utils.sparse_collate(
+        coordinates, features)
+    x = ME.SparseTensor(
+        features=tensor_features, coordinates=tensor_coordinates)
+
+    # MinkResNet34 with 4 outputs
+    cfg = dict(type='MinkResNet', depth=34, in_channels=3)
+    self = MODELS.build(cfg).cuda()
+    self.init_weights()
+
+    y = self(x)
+    assert len(y) == 4
+    assert y[0].F.shape == torch.Size([900, 64])
+    assert y[0].tensor_stride[0] == 8
+    assert y[1].F.shape == torch.Size([472, 128])
+    assert y[1].tensor_stride[0] == 16
+    assert y[2].F.shape == torch.Size([105, 256])
+    assert y[2].tensor_stride[0] == 32
+    assert y[3].F.shape == torch.Size([16, 512])
+    assert y[3].tensor_stride[0] == 64
+
+    # MinkResNet50 with 2 outputs
+    cfg = dict(
+        type='MinkResNet', depth=34, in_channels=3, num_stages=2, pool=False)
+    self = MODELS.build(cfg).cuda()
+    self.init_weights()
+
+    y = self(x)
+    assert len(y) == 2
+    assert y[0].F.shape == torch.Size([985, 64])
+    assert y[0].tensor_stride[0] == 4
+    assert y[1].F.shape == torch.Size([900, 128])
+    assert y[1].tensor_stride[0] == 8
diff --git a/mmde/tests/test_models/test_backbones/test_minkunet_backbone.py b/mmde/tests/test_models/test_backbones/test_minkunet_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..180bcbd23bfa8f3d1abb1516c78a3fbc031d9f67
--- /dev/null
+++ b/mmde/tests/test_models/test_backbones/test_minkunet_backbone.py
@@ -0,0 +1,33 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+import torch.nn.functional as F
+
+from mmdet3d.registry import MODELS
+
+
+def test_minkunet_backbone():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+
+    try:
+        import torchsparse  # noqa: F401
+    except ImportError:
+        pytest.skip('test requires Torchsparse installation')
+
+    coordinates, features = [], []
+    for i in range(2):
+        c = torch.randint(0, 16, (100, 3)).int()
+        c = F.pad(c, (0, 1), mode='constant', value=i)
+        coordinates.append(c)
+        f = torch.rand(100, 4)
+        features.append(f)
+    features = torch.cat(features, dim=0).cuda()
+    coordinates = torch.cat(coordinates, dim=0).cuda()
+
+    cfg = dict(type='MinkUNetBackbone')
+    self = MODELS.build(cfg).cuda()
+    self.init_weights()
+
+    y = self(features, coordinates)
+    assert y.shape == torch.Size([200, 96])
diff --git a/mmde/tests/test_models/test_backbones/test_multi_backbone.py b/mmde/tests/test_models/test_backbones/test_multi_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..02186fa2436a488fe456e9a5ea226713cfc5042a
--- /dev/null
+++ b/mmde/tests/test_models/test_backbones/test_multi_backbone.py
@@ -0,0 +1,117 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_multi_backbone():
+    if not torch.cuda.is_available():
+        pytest.skip()
+
+    # test list config
+    cfg_list = dict(
+        type='MultiBackbone',
+        num_streams=4,
+        suffixes=['net0', 'net1', 'net2', 'net3'],
+        backbones=[
+            dict(
+                type='PointNet2SASSG',
+                in_channels=4,
+                num_points=(256, 128, 64, 32),
+                radius=(0.2, 0.4, 0.8, 1.2),
+                num_samples=(64, 32, 16, 16),
+                sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                             (128, 128, 256)),
+                fp_channels=((256, 256), (256, 256)),
+                norm_cfg=dict(type='BN2d')),
+            dict(
+                type='PointNet2SASSG',
+                in_channels=4,
+                num_points=(256, 128, 64, 32),
+                radius=(0.2, 0.4, 0.8, 1.2),
+                num_samples=(64, 32, 16, 16),
+                sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                             (128, 128, 256)),
+                fp_channels=((256, 256), (256, 256)),
+                norm_cfg=dict(type='BN2d')),
+            dict(
+                type='PointNet2SASSG',
+                in_channels=4,
+                num_points=(256, 128, 64, 32),
+                radius=(0.2, 0.4, 0.8, 1.2),
+                num_samples=(64, 32, 16, 16),
+                sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                             (128, 128, 256)),
+                fp_channels=((256, 256), (256, 256)),
+                norm_cfg=dict(type='BN2d')),
+            dict(
+                type='PointNet2SASSG',
+                in_channels=4,
+                num_points=(256, 128, 64, 32),
+                radius=(0.2, 0.4, 0.8, 1.2),
+                num_samples=(64, 32, 16, 16),
+                sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                             (128, 128, 256)),
+                fp_channels=((256, 256), (256, 256)),
+                norm_cfg=dict(type='BN2d'))
+        ])
+
+    self = MODELS.build(cfg_list)
+    self.cuda()
+
+    assert len(self.backbone_list) == 4
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', dtype=np.float32)
+    xyz = torch.from_numpy(xyz).view(1, -1, 6).cuda()  # (B, N, 6)
+    # test forward
+    ret_dict = self(xyz[:, :, :4])
+
+    assert ret_dict['hd_feature'].shape == torch.Size([1, 256, 128])
+    assert ret_dict['fp_xyz_net0'][-1].shape == torch.Size([1, 128, 3])
+    assert ret_dict['fp_features_net0'][-1].shape == torch.Size([1, 256, 128])
+
+    # test dict config
+    cfg_dict = dict(
+        type='MultiBackbone',
+        num_streams=2,
+        suffixes=['net0', 'net1'],
+        aggregation_mlp_channels=[512, 128],
+        backbones=dict(
+            type='PointNet2SASSG',
+            in_channels=4,
+            num_points=(256, 128, 64, 32),
+            radius=(0.2, 0.4, 0.8, 1.2),
+            num_samples=(64, 32, 16, 16),
+            sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                         (128, 128, 256)),
+            fp_channels=((256, 256), (256, 256)),
+            norm_cfg=dict(type='BN2d')))
+
+    self = MODELS.build(cfg_dict)
+    self.cuda()
+
+    assert len(self.backbone_list) == 2
+
+    # test forward
+    ret_dict = self(xyz[:, :, :4])
+
+    assert ret_dict['hd_feature'].shape == torch.Size([1, 128, 128])
+    assert ret_dict['fp_xyz_net0'][-1].shape == torch.Size([1, 128, 3])
+    assert ret_dict['fp_features_net0'][-1].shape == torch.Size([1, 256, 128])
+
+    # Length of backbone configs list should be equal to num_streams
+    with pytest.raises(AssertionError):
+        cfg_list['num_streams'] = 3
+        MODELS.build(cfg_list)
+
+    # Length of suffixes list should be equal to num_streams
+    with pytest.raises(AssertionError):
+        cfg_dict['suffixes'] = ['net0', 'net1', 'net2']
+        MODELS.build(cfg_dict)
+
+    # Type of 'backbones' should be Dict or List[Dict].
+    with pytest.raises(AssertionError):
+        cfg_dict['backbones'] = 'PointNet2SASSG'
+        MODELS.build(cfg_dict)
diff --git a/mmde/tests/test_models/test_backbones/test_pointnet2_sa_msg.py b/mmde/tests/test_models/test_backbones/test_pointnet2_sa_msg.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8e87506e2b40715855c4f552f261925ad3b4c82
--- /dev/null
+++ b/mmde/tests/test_models/test_backbones/test_pointnet2_sa_msg.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_pointnet2_sa_msg():
+    if not torch.cuda.is_available():
+        pytest.skip()
+
+    # PN2MSG used in 3DSSD
+    cfg = dict(
+        type='PointNet2SAMSG',
+        in_channels=4,
+        num_points=(256, 64, (32, 32)),
+        radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)),
+        num_samples=((8, 8, 16), (8, 8, 16), (8, 8, 8)),
+        sa_channels=(((8, 8, 16), (8, 8, 16),
+                      (8, 8, 16)), ((16, 16, 32), (16, 16, 32), (16, 24, 32)),
+                     ((32, 32, 64), (32, 24, 64), (32, 64, 64))),
+        aggregation_channels=(16, 32, 64),
+        fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')),
+        fps_sample_range_lists=((-1), (-1), (64, -1)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModuleMSG',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False))
+
+    self = MODELS.build(cfg)
+    self.cuda()
+    assert self.SA_modules[0].mlps[0].layer0.conv.in_channels == 4
+    assert self.SA_modules[0].mlps[0].layer0.conv.out_channels == 8
+    assert self.SA_modules[0].mlps[1].layer1.conv.out_channels == 8
+    assert self.SA_modules[2].mlps[2].layer2.conv.out_channels == 64
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', dtype=np.float32)
+    xyz = torch.from_numpy(xyz).view(1, -1, 6).cuda()  # (B, N, 6)
+    # test forward
+    ret_dict = self(xyz[:, :, :4])
+    sa_xyz = ret_dict['sa_xyz'][-1]
+    sa_features = ret_dict['sa_features'][-1]
+    sa_indices = ret_dict['sa_indices'][-1]
+
+    assert sa_xyz.shape == torch.Size([1, 64, 3])
+    assert sa_features.shape == torch.Size([1, 64, 64])
+    assert sa_indices.shape == torch.Size([1, 64])
+
+    # out_indices should smaller than the length of SA Modules.
+    with pytest.raises(AssertionError):
+        MODELS.build(
+            dict(
+                type='PointNet2SAMSG',
+                in_channels=4,
+                num_points=(256, 64, (32, 32)),
+                radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)),
+                num_samples=((8, 8, 16), (8, 8, 16), (8, 8, 8)),
+                sa_channels=(((8, 8, 16), (8, 8, 16), (8, 8, 16)),
+                             ((16, 16, 32), (16, 16, 32), (16, 24, 32)),
+                             ((32, 32, 64), (32, 24, 64), (32, 64, 64))),
+                aggregation_channels=(16, 32, 64),
+                fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')),
+                fps_sample_range_lists=((-1), (-1), (64, -1)),
+                out_indices=(2, 3),
+                norm_cfg=dict(type='BN2d'),
+                sa_cfg=dict(
+                    type='PointSAModuleMSG',
+                    pool_mod='max',
+                    use_xyz=True,
+                    normalize_xyz=False)))
+
+    # PN2MSG used in segmentation
+    cfg = dict(
+        type='PointNet2SAMSG',
+        in_channels=6,  # [xyz, rgb]
+        num_points=(1024, 256, 64, 16),
+        radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)),
+        num_samples=((16, 32), (16, 32), (16, 32), (16, 32)),
+        sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96,
+                                                                    128)),
+                     ((128, 196, 256), (128, 196, 256)), ((256, 256, 512),
+                                                          (256, 384, 512))),
+        aggregation_channels=(None, None, None, None),
+        fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')),
+        fps_sample_range_lists=((-1), (-1), (-1), (-1)),
+        dilated_group=(False, False, False, False),
+        out_indices=(0, 1, 2, 3),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModuleMSG',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False))
+
+    self = MODELS.build(cfg)
+    self.cuda()
+    ret_dict = self(xyz)
+    sa_xyz = ret_dict['sa_xyz']
+    sa_features = ret_dict['sa_features']
+    sa_indices = ret_dict['sa_indices']
+
+    assert len(sa_xyz) == len(sa_features) == len(sa_indices) == 5
+    assert sa_xyz[0].shape == torch.Size([1, 100, 3])
+    assert sa_xyz[1].shape == torch.Size([1, 1024, 3])
+    assert sa_xyz[2].shape == torch.Size([1, 256, 3])
+    assert sa_xyz[3].shape == torch.Size([1, 64, 3])
+    assert sa_xyz[4].shape == torch.Size([1, 16, 3])
+    assert sa_features[0].shape == torch.Size([1, 3, 100])
+    assert sa_features[1].shape == torch.Size([1, 96, 1024])
+    assert sa_features[2].shape == torch.Size([1, 256, 256])
+    assert sa_features[3].shape == torch.Size([1, 512, 64])
+    assert sa_features[4].shape == torch.Size([1, 1024, 16])
+    assert sa_indices[0].shape == torch.Size([1, 100])
+    assert sa_indices[1].shape == torch.Size([1, 1024])
+    assert sa_indices[2].shape == torch.Size([1, 256])
+    assert sa_indices[3].shape == torch.Size([1, 64])
+    assert sa_indices[4].shape == torch.Size([1, 16])
diff --git a/mmde/tests/test_models/test_backbones/test_pointnet2_sa_ssg.py b/mmde/tests/test_models/test_backbones/test_pointnet2_sa_ssg.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd4d993566c1300670c8df932d4de2b1710acccb
--- /dev/null
+++ b/mmde/tests/test_models/test_backbones/test_pointnet2_sa_ssg.py
@@ -0,0 +1,74 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_pointnet2_sa_ssg():
+    if not torch.cuda.is_available():
+        pytest.skip()
+
+    cfg = dict(
+        type='PointNet2SASSG',
+        in_channels=6,
+        num_points=(32, 16),
+        radius=(0.8, 1.2),
+        num_samples=(16, 8),
+        sa_channels=((8, 16), (16, 16)),
+        fp_channels=((16, 16), (16, 16)))
+    self = MODELS.build(cfg)
+    self.cuda()
+    assert self.SA_modules[0].mlps[0].layer0.conv.in_channels == 6
+    assert self.SA_modules[0].mlps[0].layer0.conv.out_channels == 8
+    assert self.SA_modules[0].mlps[0].layer1.conv.out_channels == 16
+    assert self.SA_modules[1].mlps[0].layer1.conv.out_channels == 16
+    assert self.FP_modules[0].mlps.layer0.conv.in_channels == 32
+    assert self.FP_modules[0].mlps.layer0.conv.out_channels == 16
+    assert self.FP_modules[1].mlps.layer0.conv.in_channels == 19
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', dtype=np.float32)
+    xyz = torch.from_numpy(xyz).view(1, -1, 6).cuda()  # (B, N, 6)
+    # test forward
+    ret_dict = self(xyz)
+    fp_xyz = ret_dict['fp_xyz']
+    fp_features = ret_dict['fp_features']
+    fp_indices = ret_dict['fp_indices']
+    sa_xyz = ret_dict['sa_xyz']
+    sa_features = ret_dict['sa_features']
+    sa_indices = ret_dict['sa_indices']
+    assert len(fp_xyz) == len(fp_features) == len(fp_indices) == 3
+    assert len(sa_xyz) == len(sa_features) == len(sa_indices) == 3
+    assert fp_xyz[0].shape == torch.Size([1, 16, 3])
+    assert fp_xyz[1].shape == torch.Size([1, 32, 3])
+    assert fp_xyz[2].shape == torch.Size([1, 100, 3])
+    assert fp_features[0].shape == torch.Size([1, 16, 16])
+    assert fp_features[1].shape == torch.Size([1, 16, 32])
+    assert fp_features[2].shape == torch.Size([1, 16, 100])
+    assert fp_indices[0].shape == torch.Size([1, 16])
+    assert fp_indices[1].shape == torch.Size([1, 32])
+    assert fp_indices[2].shape == torch.Size([1, 100])
+    assert sa_xyz[0].shape == torch.Size([1, 100, 3])
+    assert sa_xyz[1].shape == torch.Size([1, 32, 3])
+    assert sa_xyz[2].shape == torch.Size([1, 16, 3])
+    assert sa_features[0].shape == torch.Size([1, 3, 100])
+    assert sa_features[1].shape == torch.Size([1, 16, 32])
+    assert sa_features[2].shape == torch.Size([1, 16, 16])
+    assert sa_indices[0].shape == torch.Size([1, 100])
+    assert sa_indices[1].shape == torch.Size([1, 32])
+    assert sa_indices[2].shape == torch.Size([1, 16])
+
+    # test only xyz input without features
+    cfg['in_channels'] = 3
+    self = MODELS.build(cfg)
+    self.cuda()
+    ret_dict = self(xyz[..., :3])
+    assert len(fp_xyz) == len(fp_features) == len(fp_indices) == 3
+    assert len(sa_xyz) == len(sa_features) == len(sa_indices) == 3
+    assert fp_features[0].shape == torch.Size([1, 16, 16])
+    assert fp_features[1].shape == torch.Size([1, 16, 32])
+    assert fp_features[2].shape == torch.Size([1, 16, 100])
+    assert sa_features[0].shape == torch.Size([1, 3, 100])
+    assert sa_features[1].shape == torch.Size([1, 16, 32])
+    assert sa_features[2].shape == torch.Size([1, 16, 16])
diff --git a/mmde/tests/test_models/test_backbones/test_spvcnn_backbone.py b/mmde/tests/test_models/test_backbones/test_spvcnn_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..504f2cc9feea8513ff0da2be9830e1cfa10d3797
--- /dev/null
+++ b/mmde/tests/test_models/test_backbones/test_spvcnn_backbone.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+import torch.nn.functional as F
+
+from mmdet3d.registry import MODELS
+
+
+def test_spvcnn_backbone():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+
+    try:
+        import torchsparse  # noqa: F401
+    except ImportError:
+        pytest.skip('test requires Torchsparse installation')
+
+    coordinates, features = [], []
+    for i in range(2):
+        c = torch.randint(0, 10, (100, 3)).int()
+        c = F.pad(c, (0, 1), mode='constant', value=i)
+        coordinates.append(c)
+        f = torch.rand(100, 4)
+        features.append(f)
+    features = torch.cat(features, dim=0).cuda()
+    coordinates = torch.cat(coordinates, dim=0).cuda()
+
+    cfg = dict(type='SPVCNNBackbone')
+    self = MODELS.build(cfg).cuda()
+    self.init_weights()
+
+    y = self(features, coordinates)
+    assert y.F.shape == torch.Size([200, 96])
+    assert y.C.shape == torch.Size([200, 4])
diff --git a/mmde/tests/test_models/test_data_preprocessors/test_data_preprocessor.py b/mmde/tests/test_models/test_data_preprocessors/test_data_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..3db374ca00c22ea4ae16f574a129f99a755a2cec
--- /dev/null
+++ b/mmde/tests/test_models/test_data_preprocessors/test_data_preprocessor.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import pytest
+import torch
+
+from mmdet3d.models.data_preprocessors import Det3DDataPreprocessor
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+class TestDet3DDataPreprocessor(TestCase):
+
+    def test_init(self):
+        # test mean is None
+        processor = Det3DDataPreprocessor()
+        self.assertTrue(not hasattr(processor, 'mean'))
+        self.assertTrue(processor._enable_normalize is False)
+
+        # test mean is not None
+        processor = Det3DDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1])
+        self.assertTrue(hasattr(processor, 'mean'))
+        self.assertTrue(hasattr(processor, 'std'))
+        self.assertTrue(processor._enable_normalize)
+
+        # please specify both mean and std
+        with self.assertRaises(AssertionError):
+            Det3DDataPreprocessor(mean=[0, 0, 0])
+
+        # bgr2rgb and rgb2bgr cannot be set to True at the same time
+        with self.assertRaises(AssertionError):
+            Det3DDataPreprocessor(bgr_to_rgb=True, rgb_to_bgr=True)
+
+    def test_forward(self):
+        processor = Det3DDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1])
+
+        points = torch.randn((5000, 3))
+        image = torch.randint(0, 256, (3, 11, 10)).float()
+        inputs_dict = dict(points=[points], img=[image])
+
+        data = {'inputs': inputs_dict, 'data_samples': [Det3DDataSample()]}
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+
+        self.assertEqual(batch_inputs['imgs'].shape, (1, 3, 11, 10))
+        self.assertEqual(len(batch_inputs['points']), 1)
+        self.assertEqual(len(batch_data_samples), 1)
+
+        # test image channel_conversion
+        processor = Det3DDataPreprocessor(
+            mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True)
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs['imgs'].shape, (1, 3, 11, 10))
+        self.assertEqual(len(batch_data_samples), 1)
+
+        # test image padding
+        data = {
+            'inputs': {
+                'points': [torch.randn((5000, 3)),
+                           torch.randn((5000, 3))],
+                'img': [
+                    torch.randint(0, 256, (3, 10, 11)),
+                    torch.randint(0, 256, (3, 9, 14))
+                ]
+            }
+        }
+        processor = Det3DDataPreprocessor(
+            mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True)
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs['imgs'].shape, (2, 3, 10, 14))
+        self.assertIsNone(batch_data_samples)
+
+        # test pad_size_divisor
+        data = {
+            'inputs': {
+                'points': [torch.randn((5000, 3)),
+                           torch.randn((5000, 3))],
+                'img': [
+                    torch.randint(0, 256, (3, 10, 11)),
+                    torch.randint(0, 256, (3, 9, 24))
+                ]
+            },
+            'data_samples': [Det3DDataSample()] * 2
+        }
+        processor = Det3DDataPreprocessor(
+            mean=[0., 0., 0.], std=[1., 1., 1.], pad_size_divisor=5)
+        out_data = processor(data)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs['imgs'].shape, (2, 3, 10, 25))
+        self.assertEqual(len(batch_data_samples), 2)
+        for data_sample, expected_shape in zip(batch_data_samples, [(10, 15),
+                                                                    (10, 25)]):
+            self.assertEqual(data_sample.pad_shape, expected_shape)
+
+        # test cylindrical voxelization
+        if not torch.cuda.is_available():
+            pytest.skip('test requires GPU and CUDA')
+        point_cloud_range = [0, -180, -4, 50, 180, 2]
+        grid_shape = [480, 360, 32]
+        voxel_layer = dict(
+            grid_shape=grid_shape,
+            point_cloud_range=point_cloud_range,
+            max_num_points=-1,
+            max_voxels=-1)
+        processor = Det3DDataPreprocessor(
+            voxel=True, voxel_type='cylindrical',
+            voxel_layer=voxel_layer).cuda()
+        num_points = 5000
+        xy = torch.rand(num_points, 2) * 140 - 70
+        z = torch.rand(num_points, 1) * 9 - 6
+        ref = torch.rand(num_points, 1)
+        points = [torch.cat([xy, z, ref], dim=-1)] * 2
+        data_sample = Det3DDataSample()
+        gt_pts_seg = PointData()
+        gt_pts_seg.pts_semantic_mask = torch.randint(0, 10, (num_points, ))
+        data_sample.gt_pts_seg = gt_pts_seg
+        data_samples = [data_sample] * 2
+        inputs = dict(inputs=dict(points=points), data_samples=data_samples)
+        out_data = processor(inputs)
+        batch_inputs, batch_data_samples = out_data['inputs'], out_data[
+            'data_samples']
+        self.assertEqual(batch_inputs['voxels']['voxels'].shape, (10000, 6))
+        self.assertEqual(batch_inputs['voxels']['coors'].shape, (10000, 4))
diff --git a/mmde/tests/test_models/test_decode_heads/test_cylinder3d_head.py b/mmde/tests/test_models/test_decode_heads/test_cylinder3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8fae827e888b081189daf3377498e9c390ac97b
--- /dev/null
+++ b/mmde/tests/test_models/test_decode_heads/test_cylinder3d_head.py
@@ -0,0 +1,66 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import pytest
+import torch
+from mmcv.ops import SparseConvTensor
+
+from mmdet3d.models.decode_heads import Cylinder3DHead
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+class TestCylinder3DHead(TestCase):
+
+    def test_cylinder3d_head_loss(self):
+        """Tests Cylinder3D head loss."""
+        if not torch.cuda.is_available():
+            pytest.skip('test requires GPU and torch+cuda')
+        cylinder3d_head = Cylinder3DHead(
+            channels=128,
+            num_classes=20,
+            loss_ce=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                class_weight=None,
+                loss_weight=1.0),
+            loss_lovasz=dict(
+                type='LovaszLoss', loss_weight=1.0, reduction='none'),
+        ).cuda()
+
+        voxel_feats = torch.rand(50, 128).cuda()
+        coorx = torch.randint(0, 480, (50, 1)).int().cuda()
+        coory = torch.randint(0, 360, (50, 1)).int().cuda()
+        coorz = torch.randint(0, 32, (50, 1)).int().cuda()
+        coorbatch0 = torch.zeros(50, 1).int().cuda()
+        coors = torch.cat([coorbatch0, coorx, coory, coorz], dim=1)
+        grid_size = [480, 360, 32]
+        batch_size = 1
+
+        sparse_voxels = SparseConvTensor(voxel_feats, coors, grid_size,
+                                         batch_size)
+        # Test forward
+        seg_logits = cylinder3d_head.forward(sparse_voxels)
+
+        self.assertEqual(seg_logits.features.shape, torch.Size([50, 20]))
+
+        # When truth is non-empty then losses
+        # should be nonzero for random inputs
+        voxel_semantic_mask = torch.randint(0, 20, (50, )).long().cuda()
+        gt_pts_seg = PointData(voxel_semantic_mask=voxel_semantic_mask)
+
+        datasample = Det3DDataSample()
+        datasample.gt_pts_seg = gt_pts_seg
+
+        losses = cylinder3d_head.loss_by_feat(seg_logits, [datasample])
+
+        loss_ce = losses['loss_ce'].item()
+        loss_lovasz = losses['loss_lovasz'].item()
+
+        self.assertGreater(loss_ce, 0, 'ce loss should be positive')
+        self.assertGreater(loss_lovasz, 0, 'lovasz loss should be positive')
+
+        batch_inputs_dict = dict(voxels=dict(voxel_coors=coors))
+        datasample.point2voxel_map = torch.randint(0, 50, (100, )).int().cuda()
+        point_logits = cylinder3d_head.predict(sparse_voxels,
+                                               batch_inputs_dict, [datasample])
+        assert point_logits[0].shape == torch.Size([100, 20])
diff --git a/mmde/tests/test_models/test_decode_heads/test_dgcnn_head.py b/mmde/tests/test_models/test_decode_heads/test_dgcnn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..727f82a78ef8f964219a120bb2503978d9edc58b
--- /dev/null
+++ b/mmde/tests/test_models/test_decode_heads/test_dgcnn_head.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+
+from mmdet3d.models.decode_heads import DGCNNHead
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+class TestDGCNNHead(TestCase):
+
+    def test_dgcnn_head_loss(self):
+        """Tests DGCNN head loss."""
+
+        dgcnn_head = DGCNNHead(
+            fp_channels=(1024, 512),
+            channels=256,
+            num_classes=13,
+            dropout_ratio=0.5,
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d'),
+            act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
+            loss_decode=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                class_weight=None,
+                loss_weight=1.0),
+            ignore_index=13)
+
+        # DGCNN head expects dict format features
+        fa_points = torch.rand(1, 4096, 1024).float()
+        feat_dict = dict(fa_points=fa_points)
+
+        # Test forward
+        seg_logits = dgcnn_head.forward(feat_dict)
+
+        self.assertEqual(seg_logits.shape, torch.Size([1, 13, 4096]))
+
+        # When truth is non-empty then losses
+        # should be nonzero for random inputs
+        pts_semantic_mask = torch.randint(0, 13, (4096, )).long()
+        gt_pts_seg = PointData(pts_semantic_mask=pts_semantic_mask)
+
+        datasample = Det3DDataSample()
+        datasample.gt_pts_seg = gt_pts_seg
+
+        gt_losses = dgcnn_head.loss_by_feat(seg_logits, [datasample])
+
+        gt_sem_seg_loss = gt_losses['loss_sem_seg'].item()
+
+        self.assertGreater(gt_sem_seg_loss, 0,
+                           'semantic seg loss should be positive')
diff --git a/mmde/tests/test_models/test_decode_heads/test_minkunet_head.py b/mmde/tests/test_models/test_decode_heads/test_minkunet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c684565dedc64aa6b19c21052102b51187de6e5e
--- /dev/null
+++ b/mmde/tests/test_models/test_decode_heads/test_minkunet_head.py
@@ -0,0 +1,54 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from mmdet3d.models.decode_heads import MinkUNetHead
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+class TestMinkUNetHead(TestCase):
+
+    def test_minkunet_head_loss(self):
+        """Tests PAConv head loss."""
+
+        try:
+            import torchsparse
+        except ImportError:
+            pytest.skip('test requires Torchsparse installation')
+        if torch.cuda.is_available():
+            minkunet_head = MinkUNetHead(channels=4, num_classes=19)
+
+            minkunet_head.cuda()
+            coordinates, features = [], []
+            for i in range(2):
+                c = torch.randint(0, 10, (100, 3)).int()
+                c = F.pad(c, (0, 1), mode='constant', value=i)
+                coordinates.append(c)
+                f = torch.rand(100, 4)
+                features.append(f)
+            features = torch.cat(features, dim=0).cuda()
+            coordinates = torch.cat(coordinates, dim=0).cuda()
+            x = torchsparse.SparseTensor(feats=features, coords=coordinates)
+
+            # Test forward
+            seg_logits = minkunet_head.forward(x)
+
+            self.assertEqual(seg_logits.shape, torch.Size([200, 19]))
+
+            # When truth is non-empty then losses
+            # should be nonzero for random inputs
+            voxel_semantic_mask = torch.randint(0, 19, (100, )).long().cuda()
+            gt_pts_seg = PointData(voxel_semantic_mask=voxel_semantic_mask)
+
+            datasample = Det3DDataSample()
+            datasample.gt_pts_seg = gt_pts_seg
+
+            gt_losses = minkunet_head.loss(x, [datasample, datasample], {})
+
+            gt_sem_seg_loss = gt_losses['loss_sem_seg'].item()
+
+            self.assertGreater(gt_sem_seg_loss, 0,
+                               'semantic seg loss should be positive')
diff --git a/mmde/tests/test_models/test_decode_heads/test_paconv_head.py b/mmde/tests/test_models/test_decode_heads/test_paconv_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..92286b671779c05541c94a153d68e49e821243e5
--- /dev/null
+++ b/mmde/tests/test_models/test_decode_heads/test_paconv_head.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+
+from mmdet3d.models.decode_heads import PAConvHead
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+class TestPAConvHead(TestCase):
+
+    def test_paconv_head_loss(self):
+        """Tests PAConv head loss."""
+
+        if torch.cuda.is_available():
+            paconv_head = PAConvHead(
+                fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+                             (128 + 6, 128, 128, 128)),
+                channels=128,
+                num_classes=20,
+                dropout_ratio=0.5,
+                conv_cfg=dict(type='Conv1d'),
+                norm_cfg=dict(type='BN1d'),
+                act_cfg=dict(type='ReLU'),
+                loss_decode=dict(
+                    type='mmdet.CrossEntropyLoss',
+                    use_sigmoid=False,
+                    class_weight=None,
+                    loss_weight=1.0),
+                ignore_index=20)
+
+            paconv_head.cuda()
+            # PAConv head expects dict format features
+            sa_xyz = [
+                torch.rand(1, 4096, 3).float().cuda(),
+                torch.rand(1, 1024, 3).float().cuda(),
+                torch.rand(1, 256, 3).float().cuda(),
+                torch.rand(1, 64, 3).float().cuda(),
+                torch.rand(1, 16, 3).float().cuda(),
+            ]
+            sa_features = [
+                torch.rand(1, 6, 4096).float().cuda(),
+                torch.rand(1, 64, 1024).float().cuda(),
+                torch.rand(1, 128, 256).float().cuda(),
+                torch.rand(1, 256, 64).float().cuda(),
+                torch.rand(1, 512, 16).float().cuda(),
+            ]
+            feat_dict = dict(sa_xyz=sa_xyz, sa_features=sa_features)
+
+            # Test forward
+            seg_logits = paconv_head.forward(feat_dict)
+
+            self.assertEqual(seg_logits.shape, torch.Size([1, 20, 4096]))
+
+            # When truth is non-empty then losses
+            # should be nonzero for random inputs
+            pts_semantic_mask = torch.randint(0, 20, (4096, )).long().cuda()
+            gt_pts_seg = PointData(pts_semantic_mask=pts_semantic_mask)
+
+            datasample = Det3DDataSample()
+            datasample.gt_pts_seg = gt_pts_seg
+
+            gt_losses = paconv_head.loss_by_feat(seg_logits, [datasample])
+
+            gt_sem_seg_loss = gt_losses['loss_sem_seg'].item()
+
+            self.assertGreater(gt_sem_seg_loss, 0,
+                               'semantic seg loss should be positive')
diff --git a/mmde/tests/test_models/test_decode_heads/test_pointnet2_head.py b/mmde/tests/test_models/test_decode_heads/test_pointnet2_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..c10ae1f97dba13c67d54e552f0433a7152d82ab9
--- /dev/null
+++ b/mmde/tests/test_models/test_decode_heads/test_pointnet2_head.py
@@ -0,0 +1,69 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+
+from mmdet3d.models.decode_heads import PointNet2Head
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+class TestPointNet2Head(TestCase):
+
+    def test_paconv_head_loss(self):
+        """Tests PAConv head loss."""
+
+        if torch.cuda.is_available():
+            pointnet2_head = PointNet2Head(
+                fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+                             (128, 128, 128, 128)),
+                channels=128,
+                num_classes=20,
+                dropout_ratio=0.5,
+                conv_cfg=dict(type='Conv1d'),
+                norm_cfg=dict(type='BN1d'),
+                act_cfg=dict(type='ReLU'),
+                loss_decode=dict(
+                    type='mmdet.CrossEntropyLoss',
+                    use_sigmoid=False,
+                    class_weight=None,
+                    loss_weight=1.0),
+                ignore_index=20)
+
+            pointnet2_head.cuda()
+
+            # DGCNN head expects dict format features
+            sa_xyz = [
+                torch.rand(1, 4096, 3).float().cuda(),
+                torch.rand(1, 1024, 3).float().cuda(),
+                torch.rand(1, 256, 3).float().cuda(),
+                torch.rand(1, 64, 3).float().cuda(),
+                torch.rand(1, 16, 3).float().cuda(),
+            ]
+            sa_features = [
+                torch.rand(1, 6, 4096).float().cuda(),
+                torch.rand(1, 64, 1024).float().cuda(),
+                torch.rand(1, 128, 256).float().cuda(),
+                torch.rand(1, 256, 64).float().cuda(),
+                torch.rand(1, 512, 16).float().cuda(),
+            ]
+            feat_dict = dict(sa_xyz=sa_xyz, sa_features=sa_features)
+
+            # Test forward
+            seg_logits = pointnet2_head.forward(feat_dict)
+
+            self.assertEqual(seg_logits.shape, torch.Size([1, 20, 4096]))
+
+            # When truth is non-empty then losses
+            # should be nonzero for random inputs
+            pts_semantic_mask = torch.randint(0, 20, (4096, )).long().cuda()
+            gt_pts_seg = PointData(pts_semantic_mask=pts_semantic_mask)
+
+            datasample = Det3DDataSample()
+            datasample.gt_pts_seg = gt_pts_seg
+
+            gt_losses = pointnet2_head.loss_by_feat(seg_logits, [datasample])
+
+            gt_sem_seg_loss = gt_losses['loss_sem_seg'].item()
+
+            self.assertGreater(gt_sem_seg_loss, 0,
+                               'semantic seg loss should be positive')
diff --git a/mmde/tests/test_models/test_dense_heads/test_anchor3d_head.py b/mmde/tests/test_models/test_dense_heads/test_anchor3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..82fda6261af16d9ce079bc0ee4b329b20d7a80f9
--- /dev/null
+++ b/mmde/tests/test_models/test_dense_heads/test_anchor3d_head.py
@@ -0,0 +1,196 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+from mmengine import Config
+from mmengine.structures import InstanceData
+
+from mmdet3d import *  # noqa
+from mmdet3d.models.dense_heads import Anchor3DHead
+from mmdet3d.structures import Box3DMode, LiDARInstance3DBoxes
+
+
+class TestAnchor3DHead(TestCase):
+
+    def test_anchor3d_head_loss(self):
+        """Test anchor head loss when truth is empty and non-empty."""
+
+        cfg = Config(
+            dict(
+                assigner=[
+                    dict(  # for Pedestrian
+                        type='Max3DIoUAssigner',
+                        iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                        pos_iou_thr=0.35,
+                        neg_iou_thr=0.2,
+                        min_pos_iou=0.2,
+                        ignore_iof_thr=-1),
+                    dict(  # for Cyclist
+                        type='Max3DIoUAssigner',
+                        iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                        pos_iou_thr=0.35,
+                        neg_iou_thr=0.2,
+                        min_pos_iou=0.2,
+                        ignore_iof_thr=-1),
+                    dict(  # for Car
+                        type='Max3DIoUAssigner',
+                        iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                        pos_iou_thr=0.6,
+                        neg_iou_thr=0.45,
+                        min_pos_iou=0.45,
+                        ignore_iof_thr=-1),
+                ],
+                allowed_border=0,
+                pos_weight=-1,
+                debug=False))
+
+        anchor3d_head = Anchor3DHead(
+            num_classes=3,
+            in_channels=512,
+            feat_channels=512,
+            use_direction_classifier=True,
+            anchor_generator=dict(
+                type='Anchor3DRangeGenerator',
+                ranges=[
+                    [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+                ],
+                sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+                rotations=[0, 1.57],
+                reshape_out=False),
+            diff_rad_by_sin=True,
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            loss_cls=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+            loss_dir=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=0.2),
+            train_cfg=cfg)
+
+        # Anchor head expects a multiple levels of features per image
+        feats = (torch.rand([1, 512, 200, 176], dtype=torch.float32), )
+        (cls_scores, bbox_preds, dir_cls_preds) = anchor3d_head.forward(feats)
+
+        self.assertEqual(cls_scores[0].shape, torch.Size([1, 18, 200, 176]))
+        self.assertEqual(bbox_preds[0].shape, torch.Size([1, 42, 200, 176]))
+        self.assertEqual(dir_cls_preds[0].shape, torch.Size([1, 12, 200, 176]))
+
+        # # Test that empty ground truth encourages the network to
+        # # predict background
+        gt_instances = InstanceData()
+        gt_bboxes_3d = LiDARInstance3DBoxes(torch.empty((0, 7)))
+        gt_labels_3d = torch.tensor([])
+        input_metas = dict(sample_idx=1234)
+        # fake input_metas
+        gt_instances.bboxes_3d = gt_bboxes_3d
+        gt_instances.labels_3d = gt_labels_3d
+
+        empty_gt_losses = anchor3d_head.loss_by_feat(cls_scores, bbox_preds,
+                                                     dir_cls_preds,
+                                                     [gt_instances],
+                                                     [input_metas])
+
+        # When there is no truth, the cls loss should be nonzero but
+        # there should be no box and dir loss.
+        self.assertGreater(empty_gt_losses['loss_cls'][0], 0,
+                           'cls loss should be non-zero')
+        self.assertEqual(
+            empty_gt_losses['loss_bbox'][0], 0,
+            'there should be no box loss when there are no true boxes')
+        self.assertEqual(
+            empty_gt_losses['loss_dir'][0], 0,
+            'there should be no dir loss when there are no true dirs')
+
+        # When truth is non-empty then both cls and box loss
+        # should be nonzero for random inputs
+        gt_instances = InstanceData()
+        gt_bboxes_3d = LiDARInstance3DBoxes(
+            torch.tensor(
+                [[6.4118, -3.4305, -1.7291, 1.7033, 3.4693, 1.6197, -0.9091]],
+                dtype=torch.float32))
+        gt_labels_3d = torch.tensor([1], dtype=torch.int64)
+        gt_instances.bboxes_3d = gt_bboxes_3d
+        gt_instances.labels_3d = gt_labels_3d
+
+        gt_losses = anchor3d_head.loss_by_feat(cls_scores, bbox_preds,
+                                               dir_cls_preds, [gt_instances],
+                                               [input_metas])
+
+        self.assertGreater(gt_losses['loss_cls'][0], 0,
+                           'cls loss should be non-zero')
+        self.assertGreater(gt_losses['loss_bbox'][0], 0,
+                           'box loss should be non-zero')
+        self.assertGreater(gt_losses['loss_dir'][0], 0,
+                           'dir loss should be none-zero')
+
+    def test_anchor3d_head_predict(self):
+
+        cfg = Config(
+            dict(
+                use_rotate_nms=True,
+                nms_across_levels=False,
+                nms_thr=0.01,
+                score_thr=0.1,
+                min_bbox_size=0,
+                nms_pre=100,
+                max_num=50))
+
+        anchor3d_head = Anchor3DHead(
+            num_classes=3,
+            in_channels=512,
+            feat_channels=512,
+            use_direction_classifier=True,
+            anchor_generator=dict(
+                type='Anchor3DRangeGenerator',
+                ranges=[
+                    [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+                ],
+                sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+                rotations=[0, 1.57],
+                reshape_out=False),
+            diff_rad_by_sin=True,
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            loss_cls=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+            loss_dir=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=0.2),
+            test_cfg=cfg)
+
+        feats = (torch.rand([2, 512, 200, 176], dtype=torch.float32), )
+        (cls_scores, bbox_preds, dir_cls_preds) = anchor3d_head.forward(feats)
+        # fake input_metas
+        input_metas = [{
+            'sample_idx': 1234,
+            'box_type_3d': LiDARInstance3DBoxes,
+            'box_mode_3d': Box3DMode.LIDAR
+        }, {
+            'sample_idx': 2345,
+            'box_type_3d': LiDARInstance3DBoxes,
+            'box_mode_3d': Box3DMode.LIDAR
+        }]
+        # test get_boxes
+        cls_scores[0] -= 1.5  # too many positive samples may cause cuda oom
+        results = anchor3d_head.predict_by_feat(cls_scores, bbox_preds,
+                                                dir_cls_preds, input_metas)
+        pred_instances = results[0]
+        scores_3d = pred_instances.scores_3d
+
+        assert (scores_3d > 0.3).all()
diff --git a/mmde/tests/test_models/test_dense_heads/test_fcaf3d_head.py b/mmde/tests/test_models/test_dense_heads/test_fcaf3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..eec83175984e8738addd11cdfc917de09263642a
--- /dev/null
+++ b/mmde/tests/test_models/test_dense_heads/test_fcaf3d_head.py
@@ -0,0 +1,84 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import pytest
+import torch
+
+from mmdet3d import *  # noqa
+from mmdet3d.models.dense_heads import FCAF3DHead
+from mmdet3d.testing import create_detector_inputs
+
+
+class TestFCAF3DHead(TestCase):
+
+    def test_fcaf3d_head_loss(self):
+        """Test fcaf3d head loss when truth is empty and non-empty."""
+        if not torch.cuda.is_available():
+            pytest.skip('test requires GPU and torch+cuda')
+
+        try:
+            import MinkowskiEngine as ME
+        except ImportError:
+            pytest.skip('test requires MinkowskiEngine installation')
+
+        # build head
+        fcaf3d_head = FCAF3DHead(
+            in_channels=(64, 128, 256, 512),
+            out_channels=128,
+            voxel_size=1.,
+            pts_prune_threshold=1000,
+            pts_assign_threshold=27,
+            pts_center_threshold=18,
+            num_classes=18,
+            num_reg_outs=6,
+            test_cfg=dict(nms_pre=1000, iou_thr=.5, score_thr=.01),
+            center_loss=dict(type='mmdet.CrossEntropyLoss', use_sigmoid=True),
+            bbox_loss=dict(type='AxisAlignedIoULoss'),
+            cls_loss=dict(type='mmdet.FocalLoss'),
+        )
+        fcaf3d_head = fcaf3d_head.cuda()
+
+        # fake input of head
+        coordinates, features = [torch.randn(500, 3).cuda() * 100
+                                 ], [torch.randn(500, 3).cuda()]
+        tensor_coordinates, tensor_features = ME.utils.sparse_collate(
+            coordinates, features)
+        x = ME.SparseTensor(
+            features=tensor_features, coordinates=tensor_coordinates)
+        # backbone
+        conv1 = ME.MinkowskiConvolution(
+            3, 64, kernel_size=3, stride=2, dimension=3).cuda()
+        conv2 = ME.MinkowskiConvolution(
+            64, 128, kernel_size=3, stride=2, dimension=3).cuda()
+        conv3 = ME.MinkowskiConvolution(
+            128, 256, kernel_size=3, stride=2, dimension=3).cuda()
+        conv4 = ME.MinkowskiConvolution(
+            256, 512, kernel_size=3, stride=2, dimension=3).cuda()
+
+        # backbone outputs of 4 levels
+        x1 = conv1(x)
+        x2 = conv2(x1)
+        x3 = conv3(x2)
+        x4 = conv4(x3)
+        x = (x1, x2, x3, x4)
+
+        # fake annotation
+        packed_inputs = create_detector_inputs(
+            with_points=False,
+            with_img=False,
+            num_gt_instance=3,
+            num_classes=1,
+            points_feat_dim=6,
+            gt_bboxes_dim=6)
+        data_samples = [
+            sample.cuda() for sample in packed_inputs['data_samples']
+        ]
+
+        gt_losses = fcaf3d_head.loss(x, data_samples)
+        print(gt_losses)
+        self.assertGreaterEqual(gt_losses['cls_loss'], 0,
+                                'cls loss should be non-zero')
+        self.assertGreaterEqual(gt_losses['bbox_loss'], 0,
+                                'box loss should be non-zero')
+        self.assertGreaterEqual(gt_losses['center_loss'], 0,
+                                'dir loss should be none-zero')
diff --git a/mmde/tests/test_models/test_dense_heads/test_fcos_mono3d_head.py b/mmde/tests/test_models/test_dense_heads/test_fcos_mono3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..47ad88c201dc1e3b2bca6d1adcc77dd9372a3ad6
--- /dev/null
+++ b/mmde/tests/test_models/test_dense_heads/test_fcos_mono3d_head.py
@@ -0,0 +1,185 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import mmengine
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.models.dense_heads import FCOSMono3DHead
+from mmdet3d.structures import CameraInstance3DBoxes
+
+
+class TestFCOSMono3DHead(TestCase):
+
+    def test_fcos_mono3d_head_loss(self):
+        """Tests FCOS3D head loss and inference."""
+
+        img_metas = [
+            dict(
+                cam2img=[[1260.8474446004698, 0.0, 807.968244525554],
+                         [0.0, 1260.8474446004698, 495.3344268742088],
+                         [0.0, 0.0, 1.0]],
+                scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32),
+                box_type_3d=CameraInstance3DBoxes)
+        ]
+
+        train_cfg = dict(
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05],
+            pos_weight=-1,
+            debug=False)
+
+        test_cfg = dict(
+            use_rotate_nms=True,
+            nms_across_levels=False,
+            nms_pre=1000,
+            nms_thr=0.8,
+            score_thr=0.05,
+            min_bbox_size=0,
+            max_per_img=200)
+
+        train_cfg = mmengine.Config(train_cfg)
+        test_cfg = mmengine.Config(test_cfg)
+
+        fcos_mono3d_head = FCOSMono3DHead(
+            num_classes=10,
+            in_channels=32,
+            stacked_convs=2,
+            feat_channels=32,
+            use_direction_classifier=True,
+            diff_rad_by_sin=True,
+            pred_attrs=True,
+            pred_velo=True,
+            dir_offset=0.7854,  # pi/4
+            dir_limit_offset=0,
+            strides=[8, 16, 32, 64, 128],
+            group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo
+            cls_branch=(32, ),
+            reg_branch=(
+                (32, ),  # offset
+                (32, ),  # depth
+                (32, ),  # size
+                (32, ),  # rot
+                ()  # velo
+            ),
+            dir_branch=(32, ),
+            attr_branch=(32, ),
+            loss_cls=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+            loss_dir=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            loss_attr=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            loss_centerness=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                loss_weight=1.0),
+            bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9),
+            norm_on_bbox=True,
+            centerness_on_reg=True,
+            center_sampling=True,
+            conv_bias=True,
+            dcn_on_last_conv=False,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg)
+
+        # FCOS3D head expects a multiple levels of features per image
+        feats = [
+            torch.rand([1, 32, 116, 200], dtype=torch.float32),
+            torch.rand([1, 32, 58, 100], dtype=torch.float32),
+            torch.rand([1, 32, 29, 50], dtype=torch.float32),
+            torch.rand([1, 32, 15, 25], dtype=torch.float32),
+            torch.rand([1, 32, 8, 13], dtype=torch.float32)
+        ]
+
+        # Test forward
+        ret_dict = fcos_mono3d_head.forward(feats)
+
+        self.assertEqual(
+            len(ret_dict), 5, 'the length of forward feature should be 5')
+        self.assertEqual(
+            len(ret_dict[0]), 5, 'each feature should have 5 levels')
+        self.assertEqual(
+            ret_dict[0][0].shape, torch.Size([1, 10, 116, 200]),
+            'the fist level feature shape should be [1, 10, 116, 200]')
+
+        # When truth is non-empty then all losses
+        # should be nonzero for random inputs
+        gt_instances_3d = InstanceData()
+        gt_instances = InstanceData()
+
+        gt_bboxes = torch.rand([3, 4], dtype=torch.float32)
+        gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([3, 9]), box_dim=9)
+        gt_labels = torch.randint(0, 10, [3])
+        gt_labels_3d = gt_labels
+        centers_2d = torch.rand([3, 2], dtype=torch.float32)
+        depths = torch.rand([3], dtype=torch.float32)
+
+        attr_labels = torch.randint(0, 9, [3])
+
+        gt_instances_3d.bboxes_3d = gt_bboxes_3d
+        gt_instances_3d.labels_3d = gt_labels_3d
+        gt_instances.bboxes = gt_bboxes
+        gt_instances.labels = gt_labels
+        gt_instances_3d.centers_2d = centers_2d
+        gt_instances_3d.depths = depths
+        gt_instances_3d.attr_labels = attr_labels
+
+        gt_losses = fcos_mono3d_head.loss_by_feat(*ret_dict, [gt_instances_3d],
+                                                  [gt_instances], img_metas)
+
+        gt_cls_loss = gt_losses['loss_cls'].item()
+        gt_siz_loss = gt_losses['loss_size'].item()
+        gt_ctr_loss = gt_losses['loss_centerness'].item()
+        gt_off_loss = gt_losses['loss_offset'].item()
+        gt_dep_loss = gt_losses['loss_depth'].item()
+        gt_rot_loss = gt_losses['loss_rotsin'].item()
+        gt_vel_loss = gt_losses['loss_velo'].item()
+        gt_dir_loss = gt_losses['loss_dir'].item()
+        gt_atr_loss = gt_losses['loss_attr'].item()
+
+        self.assertGreater(gt_cls_loss, 0, 'cls loss should be positive')
+        self.assertGreater(gt_siz_loss, 0, 'size loss should be positive')
+        self.assertGreater(gt_ctr_loss, 0,
+                           'centerness loss should be positive')
+        self.assertGreater(gt_off_loss, 0, 'offset loss should be positive')
+        self.assertGreater(gt_dep_loss, 0, 'depth loss should be positive')
+        self.assertGreater(gt_rot_loss, 0, 'rotsin loss should be positive')
+        self.assertGreater(gt_vel_loss, 0, 'velocity loss should be positive')
+        self.assertGreater(gt_dir_loss, 0, 'direction loss should be positive')
+        self.assertGreater(gt_atr_loss, 0, 'attribue loss should be positive')
+
+        # test get_results
+        results_list_3d, results_list_2d = fcos_mono3d_head.predict_by_feat(
+            *ret_dict, img_metas)
+        self.assertEqual(len(results_list_3d), 1, 'batch size should be 1')
+        self.assertEqual(results_list_2d, None,
+                         'there is no 2d result in fcos3d')
+        results = results_list_3d[0]
+        pred_bboxes_3d = results.bboxes_3d
+        pred_scores_3d = results.scores_3d
+        pred_labels_3d = results.labels_3d
+        pred_attr_labels = results.attr_labels
+        self.assertEqual(
+            pred_bboxes_3d.tensor.shape, torch.Size([200, 9]),
+            'the shape of predicted 3d bboxes should be [200, 9]')
+        self.assertEqual(
+            pred_scores_3d.shape, torch.Size([200]),
+            'the shape of predicted 3d bbox scores should be [200]')
+        self.assertEqual(
+            pred_labels_3d.shape, torch.Size([200]),
+            'the shape of predicted 3d bbox labels should be [200]')
+        self.assertEqual(
+            pred_attr_labels.shape, torch.Size([200]),
+            'the shape of predicted 3d bbox attribute labels should be [200]')
diff --git a/mmde/tests/test_models/test_dense_heads/test_freeanchors.py b/mmde/tests/test_models/test_dense_heads/test_freeanchors.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d9543481c8ff876601579279995e926759af2ca
--- /dev/null
+++ b/mmde/tests/test_models/test_dense_heads/test_freeanchors.py
@@ -0,0 +1,80 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestFreeAnchor(unittest.TestCase):
+
+    def test_freeanchor(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models.dense_heads, 'FreeAnchor3DHead')
+        DefaultScope.get_instance('test_freeanchor', scope_name='mmdet3d')
+        setup_seed(0)
+        freeanchor_cfg = get_detector_cfg(
+            'free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor'
+            '_sbn-all_8xb4-2x_nus-3d.py')
+        # decrease channels to reduce cuda memory.
+        freeanchor_cfg.pts_voxel_encoder.feat_channels = [1, 1]
+        freeanchor_cfg.pts_middle_encoder.in_channels = 1
+        freeanchor_cfg.pts_backbone.base_channels = 1
+        freeanchor_cfg.pts_backbone.stem_channels = 1
+        freeanchor_cfg.pts_neck.out_channels = 1
+        freeanchor_cfg.pts_bbox_head.feat_channels = 1
+        freeanchor_cfg.pts_bbox_head.in_channels = 1
+        model = MODELS.build(freeanchor_cfg)
+        num_gt_instance = 3
+        packed_inputs = create_detector_inputs(
+            num_gt_instance=num_gt_instance, gt_bboxes_dim=9)
+
+        # TODO: Support aug_test
+        # aug_data = [
+        #     create_detector_inputs(
+        #         num_gt_instance=num_gt_instance, gt_bboxes_dim=9),
+        #     create_detector_inputs(
+        #         num_gt_instance=num_gt_instance + 1, gt_bboxes_dim=9)
+        # ]
+        # # test_aug_test
+        # metainfo = {
+        #     'pcd_scale_factor': 1,
+        #     'pcd_horizontal_flip': 1,
+        #     'pcd_vertical_flip': 1,
+        #     'box_type_3d': LiDARInstance3DBoxes
+        # }
+        # for item in aug_data:
+        #     item['data_sample'].set_metainfo(metainfo)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # TODO: Support aug_test
+            # batch_inputs, data_samples = model.data_preprocessor(
+            #     aug_data, True)
+            # aug_results = model.forward(
+            #     batch_inputs, data_samples, mode='predict')
+            # self.assertEqual(len(results), len(data))
+            # self.assertIn('bboxes_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('scores_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('labels_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('bboxes_3d', aug_results[1].pred_instances_3d)
+            # self.assertIn('scores_3d', aug_results[1].pred_instances_3d)
+            # self.assertIn('labels_3d', aug_results[1].pred_instances_3d)
+
+            losses = model.forward(**data, mode='loss')
+
+            self.assertGreaterEqual(losses['positive_bag_loss'], 0)
+            self.assertGreaterEqual(losses['negative_bag_loss'], 0)
diff --git a/mmde/tests/test_models/test_dense_heads/test_imvoxel_head.py b/mmde/tests/test_models/test_dense_heads/test_imvoxel_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d55a78bf65ab17e8daa3f7274b56401c01de88c
--- /dev/null
+++ b/mmde/tests/test_models/test_dense_heads/test_imvoxel_head.py
@@ -0,0 +1,62 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import pytest
+import torch
+
+from mmdet3d import *  # noqa
+from mmdet3d.models.dense_heads import ImVoxelHead
+from mmdet3d.testing import create_detector_inputs
+
+
+class TestImVoxelHead(TestCase):
+
+    def test_imvoxel_head_loss(self):
+        """Test imvoxel head loss when truth is empty and non-empty."""
+        if not torch.cuda.is_available():
+            pytest.skip('test requires GPU and torch+cuda')
+
+        # build head
+        prior_generator = dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-3.2, -0.2, -2.28, 3.2, 6.2, 0.28]],
+            rotations=[.0])
+        imvoxel_head = ImVoxelHead(
+            n_classes=1,
+            n_levels=1,
+            n_channels=32,
+            n_reg_outs=7,
+            pts_assign_threshold=27,
+            pts_center_threshold=18,
+            prior_generator=prior_generator,
+            center_loss=dict(type='mmdet.CrossEntropyLoss', use_sigmoid=True),
+            bbox_loss=dict(type='RotatedIoU3DLoss'),
+            cls_loss=dict(type='mmdet.FocalLoss'),
+        )
+        imvoxel_head = imvoxel_head.cuda()
+
+        # fake input of head
+        # (x, valid_preds)
+        x = [
+            torch.randn(1, 32, 10, 10, 4).cuda(),
+            torch.ones(1, 1, 10, 10, 4).cuda()
+        ]
+
+        # fake annotation
+        num_gt_instance = 1
+        packed_inputs = create_detector_inputs(
+            with_points=False,
+            with_img=True,
+            img_size=(128, 128),
+            num_gt_instance=num_gt_instance,
+            with_pts_semantic_mask=False,
+            with_pts_instance_mask=False)
+        data_samples = [
+            sample.cuda() for sample in packed_inputs['data_samples']
+        ]
+
+        losses = imvoxel_head.loss(x, data_samples)
+        print(losses)
+        self.assertGreaterEqual(losses['center_loss'], 0)
+        self.assertGreaterEqual(losses['bbox_loss'], 0)
+        self.assertGreaterEqual(losses['cls_loss'], 0)
diff --git a/mmde/tests/test_models/test_dense_heads/test_monoflex_head.py b/mmde/tests/test_models/test_dense_heads/test_monoflex_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..e146cc7924f42be5da6b5f57dab78c6b1d2941da
--- /dev/null
+++ b/mmde/tests/test_models/test_dense_heads/test_monoflex_head.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import numpy as np
+import torch
+
+from mmdet3d.models.dense_heads import MonoFlexHead
+
+
+class TestMonoFlexHead(TestCase):
+
+    def test_monoflex_head_loss(self):
+        """Tests MonoFlex head loss and inference."""
+
+        input_metas = [dict(img_shape=(110, 110), pad_shape=(128, 128))]
+
+        monoflex_head = MonoFlexHead(
+            num_classes=3,
+            in_channels=64,
+            use_edge_fusion=True,
+            edge_fusion_inds=[(1, 0)],
+            edge_heatmap_ratio=1 / 8,
+            stacked_convs=0,
+            feat_channels=64,
+            use_direction_classifier=False,
+            diff_rad_by_sin=False,
+            pred_attrs=False,
+            pred_velo=False,
+            dir_offset=0,
+            strides=None,
+            group_reg_dims=((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ),
+                            (1, )),
+            cls_branch=(256, ),
+            reg_branch=((256, ), (256, ), (256, ), (256, ), (256, ), (256, ),
+                        (256, ), (256, )),
+            num_attrs=0,
+            bbox_code_size=7,
+            dir_branch=(),
+            attr_branch=(),
+            bbox_coder=dict(
+                type='MonoFlexCoder',
+                depth_mode='exp',
+                base_depth=(26.494627, 16.05988),
+                depth_range=[0.1, 100],
+                combine_depth=True,
+                uncertainty_range=[-10, 10],
+                base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367, 0.1022),
+                           (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
+                           (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
+                dims_mode='linear',
+                multibin=True,
+                num_dir_bins=4,
+                bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
+                bin_margin=np.pi / 6,
+                code_size=7),
+            conv_bias=True,
+            dcn_on_last_conv=False)
+
+        # Monoflex head expects a single level of features per image
+        feats = [torch.rand([1, 64, 32, 32], dtype=torch.float32)]
+
+        # Test forward
+        cls_score, out_reg = monoflex_head.forward(feats, input_metas)
+
+        self.assertEqual(cls_score[0].shape, torch.Size([1, 3, 32, 32]),
+                         'the shape of cls_score should be [1, 3, 32, 32]')
+        self.assertEqual(out_reg[0].shape, torch.Size([1, 50, 32, 32]),
+                         'the shape of out_reg should be [1, 50, 32, 32]')
diff --git a/mmde/tests/test_models/test_dense_heads/test_pgd_head.py b/mmde/tests/test_models/test_dense_heads/test_pgd_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c335356e302a58f37e1a5da29ba4118c2d9770b
--- /dev/null
+++ b/mmde/tests/test_models/test_dense_heads/test_pgd_head.py
@@ -0,0 +1,210 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import mmengine
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.models.dense_heads import PGDHead
+from mmdet3d.structures import CameraInstance3DBoxes
+
+
+class TestFGDHead(TestCase):
+
+    def test_pgd_head_loss(self):
+        """Tests PGD head loss and inference."""
+
+        img_metas = [
+            dict(
+                img_shape=[384, 1248],
+                cam2img=[[721.5377, 0.0, 609.5593, 44.85728],
+                         [0.0, 721.5377, 172.854, 0.2163791],
+                         [0.0, 0.0, 1.0, 0.002745884], [0.0, 0.0, 0.0, 1.0]],
+                scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32),
+                box_type_3d=CameraInstance3DBoxes)
+        ]
+
+        train_cfg = dict(code_weight=[
+            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
+            0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 1.0, 1.0,
+            1.0
+        ])
+
+        test_cfg = dict(
+            use_rotate_nms=True,
+            nms_across_levels=False,
+            nms_pre=100,
+            nms_thr=0.05,
+            score_thr=0.001,
+            min_bbox_size=0,
+            max_per_img=20)
+
+        train_cfg = mmengine.Config(train_cfg)
+        test_cfg = mmengine.Config(test_cfg)
+
+        pgd_head = PGDHead(
+            num_classes=3,
+            in_channels=256,
+            stacked_convs=2,
+            feat_channels=256,
+            use_direction_classifier=True,
+            bbox_code_size=7,
+            diff_rad_by_sin=True,
+            pred_attrs=False,
+            pred_velo=False,
+            pred_bbox2d=True,
+            pred_keypoints=True,
+            use_onlyreg_proj=True,
+            dir_offset=0.7854,  # pi/4
+            dir_limit_offset=0,
+            strides=(4, 8, 16, 32),
+            regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 1e8)),
+            group_reg_dims=(2, 1, 3, 1, 16,
+                            4),  # offset, depth, size, rot, kpts, bbox2d
+            cls_branch=(256, ),
+            reg_branch=(
+                (256, ),  # offset
+                (256, ),  # depth
+                (256, ),  # size
+                (256, ),  # rot
+                (256, ),  # kpts
+                (256, )  # bbox2d
+            ),
+            dir_branch=(256, ),
+            attr_branch=(256, ),
+            centerness_branch=(256, ),
+            loss_cls=dict(
+                type='mmdet.FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_bbox=dict(
+                type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+            loss_dir=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            loss_attr=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            loss_centerness=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=True,
+                loss_weight=1.0),
+            norm_on_bbox=True,
+            centerness_on_reg=True,
+            center_sampling=True,
+            conv_bias=True,
+            dcn_on_last_conv=False,
+            use_depth_classifier=True,
+            depth_branch=(256, ),
+            depth_range=(0, 70),
+            depth_unit=10,
+            division='uniform',
+            depth_bins=8,
+            weight_dim=1,
+            loss_depth=dict(
+                type='UncertainSmoothL1Loss',
+                alpha=1.0,
+                beta=3.0,
+                loss_weight=1.0),
+            bbox_coder=dict(
+                type='PGDBBoxCoder',
+                base_depths=((28.01, 16.32), ),
+                base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56,
+                                                                 1.6)),
+                code_size=7),
+            train_cfg=train_cfg,
+            test_cfg=test_cfg)
+
+        # PGD head expects a multiple levels of features per image
+        feats = [
+            torch.rand([1, 256, 96, 312], dtype=torch.float32),
+            torch.rand([1, 256, 48, 156], dtype=torch.float32),
+            torch.rand([1, 256, 24, 78], dtype=torch.float32),
+            torch.rand([1, 256, 12, 39], dtype=torch.float32),
+        ]
+
+        # Test forward
+        ret_dict = pgd_head.forward(feats)
+
+        self.assertEqual(
+            len(ret_dict), 7, 'the length of forward feature should be 7')
+        self.assertEqual(
+            len(ret_dict[0]), 4, 'each feature should have 4 levels')
+        self.assertEqual(
+            ret_dict[0][0].shape, torch.Size([1, 3, 96, 312]),
+            'the fist level feature shape should be [1, 3, 96, 312]')
+
+        # When truth is non-empty then all losses
+        # should be nonzero for random inputs
+        gt_instances_3d = InstanceData()
+        gt_instances = InstanceData()
+
+        gt_bboxes = torch.rand([3, 4], dtype=torch.float32)
+        gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([3, 7]), box_dim=7)
+        gt_labels = torch.randint(0, 3, [3])
+        gt_labels_3d = gt_labels
+        centers_2d = torch.rand([3, 2], dtype=torch.float32)
+        depths = torch.rand([3], dtype=torch.float32)
+
+        gt_instances_3d.bboxes_3d = gt_bboxes_3d
+        gt_instances_3d.labels_3d = gt_labels_3d
+        gt_instances.bboxes = gt_bboxes
+        gt_instances.labels = gt_labels
+        gt_instances_3d.centers_2d = centers_2d
+        gt_instances_3d.depths = depths
+
+        gt_losses = pgd_head.loss_by_feat(*ret_dict, [gt_instances_3d],
+                                          [gt_instances], img_metas)
+
+        gt_cls_loss = gt_losses['loss_cls'].item()
+        gt_siz_loss = gt_losses['loss_size'].item()
+        gt_ctr_loss = gt_losses['loss_centerness'].item()
+        gt_off_loss = gt_losses['loss_offset'].item()
+        gt_dep_loss = gt_losses['loss_depth'].item()
+        gt_rot_loss = gt_losses['loss_rotsin'].item()
+        gt_kpt_loss = gt_losses['loss_kpts'].item()
+        gt_dir_loss = gt_losses['loss_dir'].item()
+        gt_box_loss = gt_losses['loss_bbox2d'].item()
+        gt_cos_loss = gt_losses['loss_consistency'].item()
+
+        self.assertGreater(gt_cls_loss, 0, 'cls loss should be positive')
+        self.assertGreater(gt_siz_loss, 0, 'size loss should be positive')
+        self.assertGreater(gt_ctr_loss, 0,
+                           'centerness loss should be positive')
+        self.assertGreater(gt_off_loss, 0, 'offset loss should be positive')
+        self.assertGreater(gt_dep_loss, 0, 'depth loss should be positive')
+        self.assertGreater(gt_rot_loss, 0, 'rotsin loss should be positive')
+        self.assertGreater(gt_kpt_loss, 0, 'keypoints loss should be positive')
+        self.assertGreater(gt_dir_loss, 0, 'direction loss should be positive')
+        self.assertGreater(gt_box_loss, 0, '2d bbox loss should be positive')
+        self.assertGreater(gt_cos_loss, 0,
+                           'consistency loss should be positive')
+
+        # test get_results
+        results_list_3d, results_list_2d = pgd_head.predict_by_feat(
+            *ret_dict, img_metas)
+        self.assertEqual(len(results_list_3d), 1, 'batch size should be 1')
+        self.assertEqual(len(results_list_2d), 1, 'batch size should be 1')
+        results = results_list_3d[0]
+        results_2d = results_list_2d[0]
+        pred_bboxes_3d = results.bboxes_3d
+        pred_scores_3d = results.scores_3d
+        pred_labels_3d = results.labels_3d
+        pred_bboxes_2d = results_2d.bboxes
+        self.assertEqual(pred_bboxes_3d.tensor.shape, torch.Size([20, 7]),
+                         'the shape of predicted 3d bboxes should be [20, 7]')
+        self.assertEqual(
+            pred_scores_3d.shape, torch.Size([20]),
+            'the shape of predicted 3d bbox scores should be [20]')
+        self.assertEqual(
+            pred_labels_3d.shape, torch.Size([20]),
+            'the shape of predicted 3d bbox labels should be [20]')
+        self.assertEqual(
+            pred_bboxes_2d.shape, torch.Size([20, 4]),
+            'the shape of predicted 2d bbox attribute labels should be [20, 4]'
+        )
diff --git a/mmde/tests/test_models/test_dense_heads/test_smoke_mono3d_head.py b/mmde/tests/test_models/test_dense_heads/test_smoke_mono3d_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..70c0a856e15b0147624f7429cd2f3005d1bd970c
--- /dev/null
+++ b/mmde/tests/test_models/test_dense_heads/test_smoke_mono3d_head.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.models.dense_heads import SMOKEMono3DHead
+from mmdet3d.structures import CameraInstance3DBoxes
+
+
+class TestSMOKEMono3DHead(TestCase):
+
+    def test_smoke_mono3d_head_loss(self):
+        """Tests SMOKE head loss and inference."""
+
+        img_metas = [
+            dict(
+                cam2img=[[1260.8474446004698, 0.0, 807.968244525554, 40.1111],
+                         [0.0, 1260.8474446004698, 495.3344268742088, 2.34422],
+                         [0.0, 0.0, 1.0, 0.00333333], [0.0, 0.0, 0.0, 1.0]],
+                scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32),
+                pad_shape=[128, 128],
+                trans_mat=np.array(
+                    [[0.25, 0., 0.], [0., 0.25, 0], [0., 0., 1.]],
+                    dtype=np.float32),
+                affine_aug=False,
+                box_type_3d=CameraInstance3DBoxes)
+        ]
+
+        smoke_mono3d_head = SMOKEMono3DHead(
+            num_classes=3,
+            in_channels=64,
+            dim_channel=[3, 4, 5],
+            ori_channel=[6, 7],
+            stacked_convs=0,
+            feat_channels=64,
+            use_direction_classifier=False,
+            diff_rad_by_sin=False,
+            pred_attrs=False,
+            pred_velo=False,
+            dir_offset=0,
+            strides=None,
+            group_reg_dims=(8, ),
+            cls_branch=(256, ),
+            reg_branch=((256, ), ),
+            num_attrs=0,
+            bbox_code_size=7,
+            dir_branch=(),
+            attr_branch=(),
+            bbox_coder=dict(
+                type='SMOKECoder',
+                base_depth=(28.01, 16.32),
+                base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63,
+                                                                    1.53)),
+                code_size=7),
+            loss_cls=dict(type='mmdet.GaussianFocalLoss', loss_weight=1.0),
+            loss_bbox=dict(
+                type='mmdet.L1Loss', reduction='sum', loss_weight=1 / 300),
+            loss_dir=dict(
+                type='mmdet.CrossEntropyLoss',
+                use_sigmoid=False,
+                loss_weight=1.0),
+            loss_attr=None,
+            conv_bias=True,
+            dcn_on_last_conv=False)
+
+        # SMOKE head expects a single level of features per image
+        feats = [torch.rand([1, 64, 32, 32], dtype=torch.float32)]
+
+        # Test forward
+        ret_dict = smoke_mono3d_head.forward(feats)
+
+        self.assertEqual(
+            len(ret_dict), 2, 'the length of forward feature should be 2')
+        self.assertEqual(
+            len(ret_dict[0]), 1, 'each feature should have 1 level')
+        self.assertEqual(
+            ret_dict[0][0].shape, torch.Size([1, 3, 32, 32]),
+            'the fist level feature shape should be [1, 3, 32, 32]')
+
+        # When truth is non-empty then all losses
+        # should be nonzero for random inputs
+        gt_instances_3d = InstanceData()
+        gt_instances = InstanceData()
+
+        gt_bboxes = torch.Tensor([[1.0, 2.0, 20.0, 40.0],
+                                  [45.0, 50.0, 80.0, 70.1],
+                                  [34.0, 39.0, 65.0, 64.0]])
+        gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([3, 7]), box_dim=7)
+        gt_labels = torch.randint(0, 3, [3])
+        gt_labels_3d = gt_labels
+        centers_2d = torch.randint(0, 60, (3, 2))
+        depths = torch.rand([3], dtype=torch.float32)
+
+        gt_instances_3d.bboxes_3d = gt_bboxes_3d
+        gt_instances_3d.labels_3d = gt_labels_3d
+        gt_instances.bboxes = gt_bboxes
+        gt_instances.labels = gt_labels
+        gt_instances_3d.centers_2d = centers_2d
+        gt_instances_3d.depths = depths
+
+        gt_losses = smoke_mono3d_head.loss_by_feat(*ret_dict,
+                                                   [gt_instances_3d],
+                                                   [gt_instances], img_metas)
+
+        gt_cls_loss = gt_losses['loss_cls'].item()
+        gt_box_loss = gt_losses['loss_bbox'].item()
+
+        self.assertGreater(gt_cls_loss, 0, 'cls loss should be positive')
+        self.assertGreater(gt_box_loss, 0, 'bbox loss should be positive')
+
+        # test get_results
+        results_list = smoke_mono3d_head.predict_by_feat(*ret_dict, img_metas)
+        self.assertEqual(
+            len(results_list), 1, 'there should be one image results')
+        results = results_list[0]
+        pred_bboxes_3d = results.bboxes_3d
+        pred_scores_3d = results.scores_3d
+        pred_labels_3d = results.labels_3d
+
+        self.assertEqual(
+            pred_bboxes_3d.tensor.shape, torch.Size([100, 7]),
+            'the shape of predicted 3d bboxes should be [100, 7]')
+        self.assertEqual(
+            pred_scores_3d.shape, torch.Size([100]),
+            'the shape of predicted 3d bbox scores should be [100]')
+        self.assertEqual(
+            pred_labels_3d.shape, torch.Size([100]),
+            'the shape of predicted 3d bbox labels should be [100]')
diff --git a/mmde/tests/test_models/test_dense_heads/test_ssn.py b/mmde/tests/test_models/test_dense_heads/test_ssn.py
new file mode 100644
index 0000000000000000000000000000000000000000..80a440d23127055bb4612dc6b92eb98f258d784e
--- /dev/null
+++ b/mmde/tests/test_models/test_dense_heads/test_ssn.py
@@ -0,0 +1,79 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestSSN(unittest.TestCase):
+
+    def test_ssn(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models.dense_heads, 'ShapeAwareHead')
+        DefaultScope.get_instance('test_ssn', scope_name='mmdet3d')
+        setup_seed(0)
+        ssn_cfg = get_detector_cfg(
+            'ssn/ssn_hv_secfpn_sbn-all_16xb2-2x_nus-3d.py')
+        ssn_cfg.pts_voxel_encoder.feat_channels = [1, 1]
+        ssn_cfg.pts_middle_encoder.in_channels = 1
+        ssn_cfg.pts_backbone.in_channels = 1
+        ssn_cfg.pts_backbone.out_channels = [1, 1, 1]
+        ssn_cfg.pts_neck.in_channels = [1, 1, 1]
+        ssn_cfg.pts_neck.out_channels = [1, 1, 1]
+        ssn_cfg.pts_bbox_head.in_channels = 3
+        ssn_cfg.pts_bbox_head.feat_channels = 1
+        model = MODELS.build(ssn_cfg)
+        num_gt_instance = 50
+        packed_inputs = create_detector_inputs(
+            num_gt_instance=num_gt_instance, gt_bboxes_dim=9)
+
+        # TODO: Support aug_test
+        # aug_data = [
+        #     create_detector_inputs(
+        #         num_gt_instance=num_gt_instance, gt_bboxes_dim=9),
+        #     create_detector_inputs(
+        #         num_gt_instance=num_gt_instance + 1, gt_bboxes_dim=9)
+        # ]
+        # test_aug_test
+        # metainfo = {
+        #     'pcd_scale_factor': 1,
+        #     'pcd_horizontal_flip': 1,
+        #     'pcd_vertical_flip': 1,
+        #     'box_type_3d': LiDARInstance3DBoxes
+        # }
+        # for item in aug_data:
+        #     item['data_sample'].set_metainfo(metainfo)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # TODO: Support aug_test
+            # batch_inputs, data_samples = model.data_preprocessor(
+            #     aug_data, True)
+            # aug_results = model.forward(
+            #     batch_inputs, data_samples, mode='predict')
+            # self.assertEqual(len(results), len(data))
+            # self.assertIn('bboxes_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('scores_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('labels_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('bboxes_3d', aug_results[1].pred_instances_3d)
+            # self.assertIn('scores_3d', aug_results[1].pred_instances_3d)
+            # self.assertIn('labels_3d', aug_results[1].pred_instances_3d)
+
+            losses = model.forward(**data, mode='loss')
+
+            self.assertGreaterEqual(losses['loss_cls'][0], 0)
+            self.assertGreaterEqual(losses['loss_bbox'][0], 0)
+            self.assertGreaterEqual(losses['loss_dir'][0], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_3dssd.py b/mmde/tests/test_models/test_detectors/test_3dssd.py
new file mode 100644
index 0000000000000000000000000000000000000000..627994d91a2fa7ea00ab1b182b3a84b8e97952e3
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_3dssd.py
@@ -0,0 +1,39 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class Test3DSSD(unittest.TestCase):
+
+    def test_3dssd(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'SSD3DNet')
+        DefaultScope.get_instance('test_ssd3d', scope_name='mmdet3d')
+        setup_seed(0)
+        voxel_net_cfg = get_detector_cfg('3dssd/3dssd_4xb4_kitti-3d-car.py')
+        model = MODELS.build(voxel_net_cfg)
+        num_gt_instance = 3
+        packed_inputs = create_detector_inputs(
+            num_gt_instance=num_gt_instance, num_classes=1)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            losses = model.forward(**data, mode='loss')
+
+            self.assertGreater(losses['centerness_loss'], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_center_point.py b/mmde/tests/test_models/test_detectors/test_center_point.py
new file mode 100644
index 0000000000000000000000000000000000000000..cab2b799d768713712b6774995e0314fd3307b6f
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_center_point.py
@@ -0,0 +1,63 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestCenterPoint(unittest.TestCase):
+
+    def test_center_point(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'CenterPoint')
+
+        setup_seed(0)
+        DefaultScope.get_instance('test_center_point', scope_name='mmdet3d')
+        centerpoint_net_cfg = get_detector_cfg(
+            'centerpoint/centerpoint_voxel01_second_secfpn_8xb4-cyclic-20e_nus-3d.py'  # noqa
+        )
+        model = MODELS.build(centerpoint_net_cfg)
+        num_gt_instance = 50
+        packed_inputs = create_detector_inputs(
+            with_img=True, num_gt_instance=num_gt_instance, points_feat_dim=5)
+
+        for sample_id in range(len(packed_inputs['data_samples'])):
+            det_sample = packed_inputs['data_samples'][sample_id]
+            num_instances = len(det_sample.gt_instances_3d.bboxes_3d)
+            bbox_3d_class = det_sample.gt_instances_3d.bboxes_3d.__class__
+            det_sample.gt_instances_3d.bboxes_3d = bbox_3d_class(
+                torch.rand(num_instances, 9), box_dim=9)
+
+        if torch.cuda.is_available():
+
+            model = model.cuda()
+            # test simple_test
+
+            data = model.data_preprocessor(packed_inputs, True)
+            with torch.no_grad():
+                torch.cuda.empty_cache()
+                losses = model.forward(**data, mode='loss')
+            assert losses['task0.loss_heatmap'] >= 0
+            assert losses['task0.loss_bbox'] >= 0
+            assert losses['task1.loss_heatmap'] >= 0
+            assert losses['task1.loss_bbox'] >= 0
+            assert losses['task2.loss_heatmap'] >= 0
+            assert losses['task2.loss_bbox'] >= 0
+            assert losses['task3.loss_heatmap'] >= 0
+            assert losses['task3.loss_bbox'] >= 0
+            assert losses['task3.loss_bbox'] >= 0
+            assert losses['task4.loss_bbox'] >= 0
+            assert losses['task5.loss_heatmap'] >= 0
+            assert losses['task5.loss_bbox'] >= 0
+
+            with torch.no_grad():
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+        # TODO test_aug_test
diff --git a/mmde/tests/test_models/test_detectors/test_fcaf3d.py b/mmde/tests/test_models/test_detectors/test_fcaf3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce9851515028977abca0db5e32dee171123a370b
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_fcaf3d.py
@@ -0,0 +1,48 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestFCAF3d(unittest.TestCase):
+
+    def test_fcaf3d(self):
+        try:
+            import MinkowskiEngine  # noqa: F401
+        except ImportError:
+            return
+
+        import mmdet3d.models
+        assert hasattr(mmdet3d.models, 'MinkSingleStage3DDetector')
+        DefaultScope.get_instance('test_fcaf3d', scope_name='mmdet3d')
+        setup_seed(0)
+        fcaf3d_net_cfg = get_detector_cfg(
+            'fcaf3d/fcaf3d_2xb8_scannet-3d-18class.py')
+        model = MODELS.build(fcaf3d_net_cfg)
+        num_gt_instance = 3
+        packed_inputs = create_detector_inputs(
+            num_gt_instance=num_gt_instance,
+            num_classes=1,
+            points_feat_dim=6,
+            gt_bboxes_dim=6)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, False)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            losses = model.forward(**data, mode='loss')
+
+            self.assertGreater(losses['center_loss'], 0)
+            self.assertGreater(losses['bbox_loss'], 0)
+            self.assertGreater(losses['cls_loss'], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_groupfree3d.py b/mmde/tests/test_models/test_detectors/test_groupfree3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..784de7a408f4863db622b422fe15e7d8971a8b0a
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_groupfree3d.py
@@ -0,0 +1,49 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestGroupfree3d(unittest.TestCase):
+
+    def test_groupfree3d(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'GroupFree3DNet')
+        DefaultScope.get_instance('test_groupfree3d', scope_name='mmdet3d')
+        setup_seed(0)
+        voxel_net_cfg = get_detector_cfg(
+            'groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py')
+        model = MODELS.build(voxel_net_cfg)
+        num_gt_instance = 5
+        packed_inputs = create_detector_inputs(
+            num_gt_instance=num_gt_instance,
+            points_feat_dim=3,
+            with_pts_semantic_mask=True,
+            with_pts_instance_mask=True)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # save the memory
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+
+            self.assertGreater(losses['sampling_objectness_loss'], 0)
+            self.assertGreater(losses['proposal.objectness_loss'], 0)
+            self.assertGreater(losses['s0.objectness_loss'], 0)
+            self.assertGreater(losses['s1.size_res_loss'], 0)
+            self.assertGreater(losses['s4.size_class_loss'], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_h3dnet.py b/mmde/tests/test_models/test_detectors/test_h3dnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..09507c3df781cafb1429feb4a18a2f40497c4eac
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_h3dnet.py
@@ -0,0 +1,46 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestH3D(unittest.TestCase):
+
+    def test_h3dnet(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'H3DNet')
+        DefaultScope.get_instance('test_H3DNet', scope_name='mmdet3d')
+        setup_seed(0)
+        voxel_net_cfg = get_detector_cfg('h3dnet/h3dnet_8xb3_scannet-seg.py')
+        model = MODELS.build(voxel_net_cfg)
+        num_gt_instance = 5
+        packed_inputs = create_detector_inputs(
+            num_gt_instance=num_gt_instance,
+            points_feat_dim=4,
+            bboxes_3d_type='depth',
+            with_pts_semantic_mask=True,
+            with_pts_instance_mask=True)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # save the memory
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+
+            self.assertGreater(losses['vote_loss'], 0)
+            self.assertGreater(losses['objectness_loss'], 0)
+            self.assertGreater(losses['center_loss'], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_imvotenet.py b/mmde/tests/test_models/test_detectors/test_imvotenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac0fb9a5f304f0a140435f941c1cffd19df3d715
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_imvotenet.py
@@ -0,0 +1,80 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestImvoteNet(unittest.TestCase):
+
+    def test_imvotenet_only_img(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'ImVoteNet')
+        DefaultScope.get_instance('test_imvotenet_img', scope_name='mmdet3d')
+        setup_seed(0)
+        votenet_net_cfg = get_detector_cfg(
+            'imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py')
+        model = MODELS.build(votenet_net_cfg)
+
+        packed_inputs = create_detector_inputs(
+            with_points=False, with_img=True, img_size=128)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes', results[0].pred_instances)
+            self.assertIn('scores', results[0].pred_instances)
+            self.assertIn('labels', results[0].pred_instances)
+
+            # save the memory
+            with torch.no_grad():
+                torch.cuda.empty_cache()
+                losses = model.forward(**data, mode='loss')
+
+            self.assertGreater(sum(losses['loss_rpn_cls']), 0)
+
+            self.assertGreater(losses['loss_cls'], 0)
+            self.assertGreater(losses['loss_bbox'], 0)
+
+    def test_imvotenet(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'ImVoteNet')
+        DefaultScope.get_instance('test_imvotenet', scope_name='mmdet3d')
+        setup_seed(0)
+        votenet_net_cfg = get_detector_cfg(
+            'imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py')
+        model = MODELS.build(votenet_net_cfg)
+
+        packed_inputs = create_detector_inputs(
+            with_points=True,
+            with_img=True,
+            img_size=128,
+            bboxes_3d_type='depth')
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # save the memory
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+
+            self.assertGreater(losses['vote_loss'], 0)
+            self.assertGreater(losses['objectness_loss'], 0)
+            self.assertGreater(losses['semantic_loss'], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_imvoxelnet.py b/mmde/tests/test_models/test_detectors/test_imvoxelnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..41584060ee9397093aa24b47ceb5823fc4ebb0bc
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_imvoxelnet.py
@@ -0,0 +1,89 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestImVoxelNet(unittest.TestCase):
+
+    def test_imvoxelnet_kitti(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'ImVoxelNet')
+        DefaultScope.get_instance(
+            'test_imvoxelnet_kitti', scope_name='mmdet3d')
+        setup_seed(0)
+        imvoxel_net_cfg = get_detector_cfg(
+            'imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py')
+        model = MODELS.build(imvoxel_net_cfg)
+        num_gt_instance = 1
+        packed_inputs = create_detector_inputs(
+            with_points=False,
+            with_img=True,
+            img_size=(128, 128),
+            num_gt_instance=num_gt_instance,
+            with_pts_semantic_mask=False,
+            with_pts_instance_mask=False)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # save the memory
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+
+            self.assertGreaterEqual(losses['loss_cls'][0], 0)
+            self.assertGreaterEqual(losses['loss_bbox'][0], 0)
+            self.assertGreaterEqual(losses['loss_dir'][0], 0)
+
+    def test_imvoxelnet_sunrgbd(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'ImVoxelNet')
+        DefaultScope.get_instance(
+            'test_imvoxelnet_sunrgbd', scope_name='mmdet3d')
+        setup_seed(0)
+        imvoxel_net_cfg = get_detector_cfg(
+            'imvoxelnet/imvoxelnet_2xb4_sunrgbd-3d-10class.py')
+        model = MODELS.build(imvoxel_net_cfg)
+        num_gt_instance = 1
+        packed_inputs = create_detector_inputs(
+            with_points=False,
+            with_img=True,
+            img_size=(128, 128),
+            num_gt_instance=num_gt_instance,
+            with_pts_semantic_mask=False,
+            with_pts_instance_mask=False)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # save the memory
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+
+            self.assertGreaterEqual(losses['center_loss'], 0)
+            self.assertGreaterEqual(losses['bbox_loss'], 0)
+            self.assertGreaterEqual(losses['cls_loss'], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_mvxnet.py b/mmde/tests/test_models/test_detectors/test_mvxnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0463d32483d65997acec00884311222ff480036
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_mvxnet.py
@@ -0,0 +1,47 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestMVXNet(unittest.TestCase):
+
+    def test_mvxnet(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'DynamicMVXFasterRCNN')
+
+        setup_seed(0)
+        DefaultScope.get_instance('test_mvxnet', scope_name='mmdet3d')
+        mvx_net_cfg = get_detector_cfg(
+            'mvxnet/mvxnet_fpn_dv_second_secfpn_8xb2-80e_kitti-3d-3class.py'  # noqa
+        )
+        model = MODELS.build(mvx_net_cfg)
+        num_gt_instance = 1
+        packed_inputs = create_detector_inputs(
+            with_img=False, num_gt_instance=num_gt_instance, points_feat_dim=4)
+
+        if torch.cuda.is_available():
+
+            model = model.cuda()
+            # test simple_test
+            data = model.data_preprocessor(packed_inputs, True)
+            # save the memory when do the unitest
+            with torch.no_grad():
+                torch.cuda.empty_cache()
+                losses = model.forward(**data, mode='loss')
+            assert losses['loss_cls'][0] >= 0
+            assert losses['loss_bbox'][0] >= 0
+            assert losses['loss_dir'][0] >= 0
+
+            with torch.no_grad():
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+        # TODO test_aug_test
diff --git a/mmde/tests/test_models/test_detectors/test_parta2.py b/mmde/tests/test_models/test_detectors/test_parta2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0409e973bc2f2f1c177745e3ec9202d0d4dcf283
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_parta2.py
@@ -0,0 +1,61 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestPartA2(unittest.TestCase):
+
+    def test_parta2(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'PartA2')
+        DefaultScope.get_instance('test_parta2', scope_name='mmdet3d')
+        setup_seed(0)
+        parta2_cfg = get_detector_cfg(
+            'parta2/parta2_hv_secfpn_8xb2-cyclic-80e_kitti-3d-3class.py')
+        model = MODELS.build(parta2_cfg)
+        num_gt_instance = 2
+        packed_inputs = create_detector_inputs(num_gt_instance=num_gt_instance)
+
+        # TODO: Support aug data test
+        # aug_packed_inputs = [
+        #     create_detector_inputs(num_gt_instance=num_gt_instance),
+        #     create_detector_inputs(num_gt_instance=num_gt_instance + 1)
+        # ]
+        # test_aug_test
+        # metainfo = {
+        #     'pcd_scale_factor': 1,
+        #     'pcd_horizontal_flip': 1,
+        #     'pcd_vertical_flip': 1,
+        #     'box_type_3d': LiDARInstance3DBoxes
+        # }
+        # for item in aug_packed_inputs:
+        #     for batch_id in len(item['data_samples']):
+        #         item['data_samples'][batch_id].set_metainfo(metainfo)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # save the memory
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+                torch.cuda.empty_cache()
+            self.assertGreater(losses['loss_rpn_cls'][0], 0)
+            self.assertGreaterEqual(losses['loss_rpn_bbox'][0], 0)
+            self.assertGreater(losses['loss_seg'], 0)
+            self.assertGreater(losses['loss_part'], 0)
+            self.assertGreater(losses['loss_cls'], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_pointrcnn.py b/mmde/tests/test_models/test_detectors/test_pointrcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb03e2871ee880d637fddb4836dcb25fdfd8aacd
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_pointrcnn.py
@@ -0,0 +1,46 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestPointRCNN(unittest.TestCase):
+
+    def test_pointrcnn(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'PointRCNN')
+        DefaultScope.get_instance('test_pointrcnn', scope_name='mmdet3d')
+        setup_seed(0)
+        pointrcnn_cfg = get_detector_cfg(
+            'point_rcnn/point-rcnn_8xb2_kitti-3d-3class.py')
+        model = MODELS.build(pointrcnn_cfg)
+        num_gt_instance = 2
+        packed_inputs = create_detector_inputs(
+            num_points=10101, num_gt_instance=num_gt_instance)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # save the memory
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+                torch.cuda.empty_cache()
+            self.assertGreaterEqual(losses['rpn_bbox_loss'], 0)
+            self.assertGreaterEqual(losses['rpn_semantic_loss'], 0)
+            self.assertGreaterEqual(losses['loss_cls'], 0)
+            self.assertGreaterEqual(losses['loss_bbox'], 0)
+            self.assertGreaterEqual(losses['loss_corner'], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_pvrcnn.py b/mmde/tests/test_models/test_detectors/test_pvrcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..05d1801f4b9f52bc1d7181040b35e5141fa30b4d
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_pvrcnn.py
@@ -0,0 +1,63 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestPVRCNN(unittest.TestCase):
+
+    def test_pvrcnn(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'PointVoxelRCNN')
+        DefaultScope.get_instance('test_pvrcnn', scope_name='mmdet3d')
+        setup_seed(0)
+        pvrcnn_cfg = get_detector_cfg(
+            'pv_rcnn/pv_rcnn_8xb2-80e_kitti-3d-3class.py')
+        model = MODELS.build(pvrcnn_cfg)
+        num_gt_instance = 2
+        packed_inputs = create_detector_inputs(num_gt_instance=num_gt_instance)
+
+        # TODO: Support aug data test
+        # aug_packed_inputs = [
+        #     create_detector_inputs(num_gt_instance=num_gt_instance),
+        #     create_detector_inputs(num_gt_instance=num_gt_instance + 1)
+        # ]
+        # test_aug_test
+        # metainfo = {
+        #     'pcd_scale_factor': 1,
+        #     'pcd_horizontal_flip': 1,
+        #     'pcd_vertical_flip': 1,
+        #     'box_type_3d': LiDARInstance3DBoxes
+        # }
+        # for item in aug_packed_inputs:
+        #     for batch_id in len(item['data_samples']):
+        #         item['data_samples'][batch_id].set_metainfo(metainfo)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # save the memory
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+                torch.cuda.empty_cache()
+            self.assertGreater(losses['loss_rpn_cls'][0], 0)
+            self.assertGreaterEqual(losses['loss_rpn_bbox'][0], 0)
+            self.assertGreaterEqual(losses['loss_rpn_dir'][0], 0)
+            self.assertGreater(losses['loss_semantic'], 0)
+            self.assertGreaterEqual(losses['loss_bbox'], 0)
+            self.assertGreaterEqual(losses['loss_cls'], 0)
+            self.assertGreaterEqual(losses['loss_corner'], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_sassd.py b/mmde/tests/test_models/test_detectors/test_sassd.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9cc12f71dd06b5edb1dd41163e2d05c9860bbbb
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_sassd.py
@@ -0,0 +1,43 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestSDSSD(unittest.TestCase):
+
+    def test_3dssd(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'SASSD')
+        DefaultScope.get_instance('test_sassd', scope_name='mmdet3d')
+        setup_seed(0)
+        voxel_net_cfg = get_detector_cfg(
+            'sassd/sassd_8xb6-80e_kitti-3d-3class.py')
+        model = MODELS.build(voxel_net_cfg)
+        num_gt_instance = 3
+        packed_inputs = create_detector_inputs(
+            num_gt_instance=num_gt_instance, num_classes=1)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            losses = model.forward(**data, mode='loss')
+            self.assertGreaterEqual(losses['loss_dir'][0], 0)
+            self.assertGreaterEqual(losses['loss_bbox'][0], 0)
+            self.assertGreaterEqual(losses['loss_cls'][0], 0)
+            self.assertGreater(losses['aux_loss_cls'][0], 0)
+            self.assertGreater(losses['aux_loss_reg'][0], 0)
diff --git a/mmde/tests/test_models/test_detectors/test_votenet.py b/mmde/tests/test_models/test_detectors/test_votenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..456db1b80b574c6a32a67840a6deafacd9af4534
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_votenet.py
@@ -0,0 +1,72 @@
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestVotenet(unittest.TestCase):
+
+    def test_voxel_net(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'VoteNet')
+        DefaultScope.get_instance('test_vote_net', scope_name='mmdet3d')
+        setup_seed(0)
+        voxel_net_cfg = get_detector_cfg('votenet/votenet_8xb16_sunrgbd-3d.py')
+        model = MODELS.build(voxel_net_cfg)
+        num_gt_instance = 50
+        packed_inputs = create_detector_inputs(num_gt_instance=num_gt_instance)
+
+        # TODO: Support aug test
+        # aug_data = [
+        #     create_detector_inputs(num_gt_instance=num_gt_instance),
+        #     create_detector_inputs(num_gt_instance=num_gt_instance + 1)
+        # ]
+        # # test_aug_test
+        # metainfo = {
+        #     'pcd_scale_factor': 1,
+        #     'pcd_horizontal_flip': 1,
+        #     'pcd_vertical_flip': 1,
+        #     'box_type_3d': LiDARInstance3DBoxes
+        # # }
+        # for item in aug_data:
+        #     item['data_sample'].set_metainfo(metainfo)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # TODO: Support aug_test
+            # batch_inputs, data_samples = model.data_preprocessor(
+            #     aug_data, True)
+            # aug_results = model.forward(
+            #     batch_inputs, data_samples, mode='predict')
+
+            # self.assertIn('bboxes_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('scores_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('labels_3d', aug_results[0].pred_instances_3d)
+
+            # save the memory
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+
+            self.assertGreater(losses['vote_loss'], 0)
+            self.assertGreater(losses['objectness_loss'], 0)
+            self.assertGreater(losses['semantic_loss'], 0)
+            self.assertGreater(losses['dir_res_loss'], 0)
+            self.assertGreater(losses['size_class_loss'], 0)
+            self.assertGreater(losses['size_res_loss'], 0)
+            self.assertGreater(losses['size_res_loss'], 0)
+
+        # TODO test_aug_test
diff --git a/mmde/tests/test_models/test_detectors/test_voxelnet.py b/mmde/tests/test_models/test_detectors/test_voxelnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaccb225d4f08bc82543e80992b8cf12b1e8d614
--- /dev/null
+++ b/mmde/tests/test_models/test_detectors/test_voxelnet.py
@@ -0,0 +1,73 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestVoxelNet(unittest.TestCase):
+
+    def test_voxelnet(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'VoxelNet')
+        DefaultScope.get_instance('test_voxelnet', scope_name='mmdet3d')
+        setup_seed(0)
+        pointpillars_cfg = get_detector_cfg(
+            'pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py')
+        model = MODELS.build(pointpillars_cfg)
+        num_gt_instance = 2
+        packed_inputs = create_detector_inputs(num_gt_instance=num_gt_instance)
+
+        # TODO: Support aug_test
+        # aug_data = [
+        #     create_detector_inputs(num_gt_instance=num_gt_instance),
+        #     create_detector_inputs(num_gt_instance=num_gt_instance + 1)
+        # ]
+        # # test_aug_test
+        # metainfo = {
+        #     'pcd_scale_factor': 1,
+        #     'pcd_horizontal_flip': 1,
+        #     'pcd_vertical_flip': 1,
+        #     'box_type_3d': LiDARInstance3DBoxes
+        # }
+        # for item in aug_data:
+        #     item['data_sample'].set_metainfo(metainfo)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('bboxes_3d', results[0].pred_instances_3d)
+            self.assertIn('scores_3d', results[0].pred_instances_3d)
+            self.assertIn('labels_3d', results[0].pred_instances_3d)
+
+            # TODO: Support aug_test
+            # batch_inputs, data_samples = model.data_preprocessor(
+            #     aug_data, True)
+            # aug_results = model.forward(
+            #     batch_inputs, data_samples, mode='predict')
+            # self.assertEqual(len(results), len(data))
+            # self.assertIn('bboxes_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('scores_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('labels_3d', aug_results[0].pred_instances_3d)
+            # self.assertIn('bboxes_3d', aug_results[1].pred_instances_3d)
+            # self.assertIn('scores_3d', aug_results[1].pred_instances_3d)
+            # self.assertIn('labels_3d', aug_results[1].pred_instances_3d)
+
+            # save the memory
+
+            with torch.no_grad():
+                losses = model.forward(**data, mode='loss')
+                torch.cuda.empty_cache()
+            self.assertGreaterEqual(losses['loss_dir'][0], 0)
+            self.assertGreaterEqual(losses['loss_bbox'][0], 0)
+            self.assertGreaterEqual(losses['loss_cls'][0], 0)
diff --git a/mmde/tests/test_models/test_layers/test_box3d_nms.py b/mmde/tests/test_models/test_layers/test_box3d_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0b8752ce0dbd9afad0c5c15e4f7050be09ab90e
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_box3d_nms.py
@@ -0,0 +1,114 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+
+def test_aligned_3d_nms():
+    from mmdet3d.models.layers import aligned_3d_nms
+
+    boxes = torch.tensor([[1.2261, 0.6679, -1.2678, 2.6547, 1.0428, 0.1000],
+                          [5.0919, 0.6512, 0.7238, 5.4821, 1.2451, 2.1095],
+                          [6.8392, -1.2205, 0.8570, 7.6920, 0.3220, 3.2223],
+                          [3.6900, -0.4235, -1.0380, 4.4415, 0.2671, -0.1442],
+                          [4.8071, -1.4311, 0.7004, 5.5788, -0.6837, 1.2487],
+                          [2.1807, -1.5811, -1.1289, 3.0151, -0.1346, -0.5351],
+                          [4.4631, -4.2588, -1.1403, 5.3012, -3.4463, -0.3212],
+                          [4.7607, -3.3311, 0.5993, 5.2976, -2.7874, 1.2273],
+                          [3.1265, 0.7113, -0.0296, 3.8944, 1.3532, 0.9785],
+                          [5.5828, -3.5350, 1.0105, 8.2841, -0.0405, 3.3614],
+                          [3.0003, -2.1099, -1.0608, 5.3423, 0.0328, 0.6252],
+                          [2.7148, 0.6082, -1.1738, 3.6995, 1.2375, -0.0209],
+                          [4.9263, -0.2152, 0.2889, 5.6963, 0.3416, 1.3471],
+                          [5.0713, 1.3459, -0.2598, 5.6278, 1.9300, 1.2835],
+                          [4.5985, -2.3996, -0.3393, 5.2705, -1.7306, 0.5698],
+                          [4.1386, 0.5658, 0.0422, 4.8937, 1.1983, 0.9911],
+                          [2.7694, -1.9822, -1.0637, 4.0691, 0.3575, -0.1393],
+                          [4.6464, -3.0123, -1.0694, 5.1421, -2.4450, -0.3758],
+                          [3.4754, 0.4443, -1.1282, 4.6727, 1.3786, 0.2550],
+                          [2.5905, -0.3504, -1.1202, 3.1599, 0.1153, -0.3036],
+                          [4.1336, -3.4813, 1.1477, 6.2091, -0.8776, 2.6757],
+                          [3.9966, 0.2069, -1.1148, 5.0841, 1.0525, -0.0648],
+                          [4.3216, -1.8647, 0.4733, 6.2069, 0.6671, 3.3363],
+                          [4.7683, 0.4286, -0.0500, 5.5642, 1.2906, 0.8902],
+                          [1.7337, 0.7625, -1.0058, 3.0675, 1.3617, 0.3849],
+                          [4.7193, -3.3687, -0.9635, 5.1633, -2.7656, 1.1001],
+                          [4.4704, -2.7744, -1.1127, 5.0971, -2.0228, -0.3150],
+                          [2.7027, 0.6122, -0.9169, 3.3083, 1.2117, 0.6129],
+                          [4.8789, -2.0025, 0.8385, 5.5214, -1.3668, 1.3552],
+                          [3.7856, -1.7582, -0.1738, 5.3373, -0.6300, 0.5558]])
+
+    scores = torch.tensor([
+        3.6414e-03, 2.2901e-02, 2.7576e-04, 1.2238e-02, 5.9310e-04, 1.2659e-01,
+        2.4104e-02, 5.0742e-03, 2.3581e-03, 2.0946e-07, 8.8039e-01, 1.9127e-01,
+        5.0469e-05, 9.3638e-03, 3.0663e-03, 9.4350e-03, 5.3380e-02, 1.7895e-01,
+        2.0048e-01, 1.1294e-03, 3.0304e-08, 2.0237e-01, 1.0894e-08, 6.7972e-02,
+        6.7156e-01, 9.3986e-04, 7.9470e-01, 3.9736e-01, 1.8000e-04, 7.9151e-04
+    ])
+
+    cls = torch.tensor([
+        8, 8, 8, 3, 3, 1, 3, 3, 7, 8, 0, 6, 7, 8, 3, 7, 2, 7, 6, 3, 8, 6, 6, 7,
+        6, 8, 7, 6, 3, 1
+    ])
+
+    pick = aligned_3d_nms(boxes, scores, cls, 0.25)
+    expected_pick = torch.tensor([
+        10, 26, 24, 27, 21, 18, 17, 5, 23, 16, 6, 1, 3, 15, 13, 7, 0, 14, 8,
+        19, 25, 29, 4, 2, 28, 12, 9, 20, 22
+    ])
+
+    assert torch.all(pick == expected_pick)
+
+
+def test_circle_nms():
+    from mmdet3d.models.layers import circle_nms
+    boxes = torch.tensor([[-11.1100, 2.1300, 0.8823],
+                          [-11.2810, 2.2422, 0.8914],
+                          [-10.3966, -0.3198, 0.8643],
+                          [-10.2906, -13.3159,
+                           0.8401], [5.6518, 9.9791, 0.8271],
+                          [-11.2652, 13.3637, 0.8267],
+                          [4.7768, -13.0409, 0.7810], [5.6621, 9.0422, 0.7753],
+                          [-10.5561, 18.9627, 0.7518],
+                          [-10.5643, 13.2293, 0.7200]])
+    keep = circle_nms(boxes.numpy(), 0.175)
+    expected_keep = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    assert np.all(keep == expected_keep)
+
+
+# copied from tests/test_ops/test_iou3d.py from mmcv<=1.5
+@pytest.mark.skipif(
+    not torch.cuda.is_available(), reason='requires CUDA support')
+def test_nms_bev():
+    from mmdet3d.models.layers import nms_bev
+
+    np_boxes = np.array(
+        [[6.0, 3.0, 8.0, 7.0, 2.0], [3.0, 6.0, 9.0, 11.0, 1.0],
+         [3.0, 7.0, 10.0, 12.0, 1.0], [1.0, 4.0, 13.0, 7.0, 3.0]],
+        dtype=np.float32)
+    np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)
+    np_inds = np.array([1, 0, 3])
+    boxes = torch.from_numpy(np_boxes)
+    scores = torch.from_numpy(np_scores)
+    inds = nms_bev(boxes.cuda(), scores.cuda(), thresh=0.3)
+
+    assert np.allclose(inds.cpu().numpy(), np_inds)
+
+
+# copied from tests/test_ops/test_iou3d.py from mmcv<=1.5
+@pytest.mark.skipif(
+    not torch.cuda.is_available(), reason='requires CUDA support')
+def test_nms_normal_bev():
+    from mmdet3d.models.layers import nms_normal_bev
+
+    np_boxes = np.array(
+        [[6.0, 3.0, 8.0, 7.0, 2.0], [3.0, 6.0, 9.0, 11.0, 1.0],
+         [3.0, 7.0, 10.0, 12.0, 1.0], [1.0, 4.0, 13.0, 7.0, 3.0]],
+        dtype=np.float32)
+    np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)
+    np_inds = np.array([1, 0, 3])
+    boxes = torch.from_numpy(np_boxes)
+    scores = torch.from_numpy(np_scores)
+    inds = nms_normal_bev(boxes.cuda(), scores.cuda(), thresh=0.3)
+
+    assert np.allclose(inds.cpu().numpy(), np_inds)
diff --git a/mmde/tests/test_models/test_layers/test_dgcnn_modules/test_dgcnn_fa_module.py b/mmde/tests/test_models/test_layers/test_dgcnn_modules/test_dgcnn_fa_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb1f86c9226b73cf21725bfd432964c1ad0b5119
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_dgcnn_modules/test_dgcnn_fa_module.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+
+def test_dgcnn_fa_module():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import DGCNNFAModule
+
+    self = DGCNNFAModule(mlp_channels=[24, 16]).cuda()
+    assert self.mlps.layer0.conv.in_channels == 24
+    assert self.mlps.layer0.conv.out_channels == 16
+
+    points = [torch.rand(1, 200, 12).float().cuda() for _ in range(3)]
+
+    fa_points = self(points)
+    assert fa_points.shape == torch.Size([1, 200, 40])
diff --git a/mmde/tests/test_models/test_layers/test_dgcnn_modules/test_dgcnn_fp_module.py b/mmde/tests/test_models/test_layers/test_dgcnn_modules/test_dgcnn_fp_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec57db6c6ddbd12b1fe4555de490b99f17378ddb
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_dgcnn_modules/test_dgcnn_fp_module.py
@@ -0,0 +1,24 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+
+def test_dgcnn_fp_module():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import DGCNNFPModule
+
+    self = DGCNNFPModule(mlp_channels=[24, 16]).cuda()
+    assert self.mlps.layer0.conv.in_channels == 24
+    assert self.mlps.layer0.conv.out_channels == 16
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin',
+                      np.float32).reshape((-1, 6))
+
+    # (B, N, 3)
+    xyz = torch.from_numpy(xyz).view(1, -1, 3).cuda()
+    points = xyz.repeat([1, 1, 8]).cuda()
+
+    fp_points = self(points)
+    assert fp_points.shape == torch.Size([1, 200, 16])
diff --git a/mmde/tests/test_models/test_layers/test_dgcnn_modules/test_dgcnn_gf_module.py b/mmde/tests/test_models/test_layers/test_dgcnn_modules/test_dgcnn_gf_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddc14a4ee0fff662e3d9134b2fda292393ced5b6
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_dgcnn_modules/test_dgcnn_gf_module.py
@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+
+def test_dgcnn_gf_module():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import DGCNNGFModule
+
+    self = DGCNNGFModule(
+        mlp_channels=[18, 64, 64],
+        num_sample=20,
+        knn_mode='D-KNN',
+        radius=None,
+        norm_cfg=dict(type='BN2d'),
+        act_cfg=dict(type='ReLU'),
+        pool_mode='max').cuda()
+
+    assert self.mlps[0].layer0.conv.in_channels == 18
+    assert self.mlps[0].layer0.conv.out_channels == 64
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    # (B, N, C)
+    xyz = torch.from_numpy(xyz).view(1, -1, 3).cuda()
+    points = xyz.repeat([1, 1, 3])
+
+    # test forward
+    new_points = self(points)
+
+    assert new_points.shape == torch.Size([1, 200, 64])
+
+    # test F-KNN mod
+    self = DGCNNGFModule(
+        mlp_channels=[6, 64, 64],
+        num_sample=20,
+        knn_mode='F-KNN',
+        radius=None,
+        norm_cfg=dict(type='BN2d'),
+        act_cfg=dict(type='ReLU'),
+        pool_mode='max').cuda()
+
+    # test forward
+    new_points = self(xyz)
+    assert new_points.shape == torch.Size([1, 200, 64])
+
+    # test ball query
+    self = DGCNNGFModule(
+        mlp_channels=[6, 64, 64],
+        num_sample=20,
+        knn_mode='F-KNN',
+        radius=0.2,
+        norm_cfg=dict(type='BN2d'),
+        act_cfg=dict(type='ReLU'),
+        pool_mode='max').cuda()
diff --git a/mmde/tests/test_models/test_layers/test_fusion_layers/test_fusion_coord_trans.py b/mmde/tests/test_models/test_layers/test_fusion_layers/test_fusion_coord_trans.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fbd34cd8c888b26595a069030a669773a52cb2c
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_fusion_layers/test_fusion_coord_trans.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Tests coords transformation in fusion modules.
+
+CommandLine:
+    pytest tests/test_models/test_fusion/test_fusion_coord_trans.py
+"""
+
+import torch
+
+from mmdet3d.models.layers.fusion_layers import apply_3d_transformation
+
+
+def test_coords_transformation():
+    """Test the transformation of 3d coords."""
+
+    # H+R+S+T, not reverse, depth
+    img_meta = {
+        'pcd_scale_factor':
+        1.2311e+00,
+        'pcd_rotation': [[8.660254e-01, 0.5, 0], [-0.5, 8.660254e-01, 0],
+                         [0, 0, 1.0e+00]],
+        'pcd_trans': [1.111e-02, -8.88e-03, 0.0],
+        'pcd_horizontal_flip':
+        True,
+        'transformation_3d_flow': ['HF', 'R', 'S', 'T']
+    }
+
+    pcd = torch.tensor([[-5.2422e+00, -2.9757e-01, 4.0021e+01],
+                        [-9.1435e-01, 2.6675e+01, -5.5950e+00],
+                        [2.0089e-01, 5.8098e+00, -3.5409e+01],
+                        [-1.9461e-01, 3.1309e+01, -1.0901e+00]])
+
+    pcd_transformed = apply_3d_transformation(
+        pcd, 'DEPTH', img_meta, reverse=False)
+
+    expected_tensor = torch.tensor(
+        [[5.78332345e+00, 2.900697e+00, 4.92698531e+01],
+         [-1.5433839e+01, 2.8993850e+01, -6.8880045e+00],
+         [-3.77929405e+00, 6.061661e+00, -4.35920199e+01],
+         [-1.9053658e+01, 3.3491436e+01, -1.34202211e+00]])
+
+    assert torch.allclose(expected_tensor, pcd_transformed, 1e-4)
+
+    # H+R+S+T, reverse, depth
+    img_meta = {
+        'pcd_scale_factor':
+        7.07106781e-01,
+        'pcd_rotation': [[7.07106781e-01, 7.07106781e-01, 0.0],
+                         [-7.07106781e-01, 7.07106781e-01, 0.0],
+                         [0.0, 0.0, 1.0e+00]],
+        'pcd_trans': [0.0, 0.0, 0.0],
+        'pcd_horizontal_flip':
+        False,
+        'transformation_3d_flow': ['HF', 'R', 'S', 'T']
+    }
+
+    pcd = torch.tensor([[-5.2422e+00, -2.9757e-01, 4.0021e+01],
+                        [-9.1435e+01, 2.6675e+01, -5.5950e+00],
+                        [6.061661e+00, -0.0, -1.0e+02]])
+
+    pcd_transformed = apply_3d_transformation(
+        pcd, 'DEPTH', img_meta, reverse=True)
+
+    expected_tensor = torch.tensor(
+        [[-5.53977e+00, 4.94463e+00, 5.65982409e+01],
+         [-6.476e+01, 1.1811e+02, -7.91252488e+00],
+         [6.061661e+00, -6.061661e+00, -1.41421356e+02]])
+    assert torch.allclose(expected_tensor, pcd_transformed, 1e-4)
+
+    # H+R+S+T, not reverse, camera
+    img_meta = {
+        'pcd_scale_factor':
+        1.0 / 7.07106781e-01,
+        'pcd_rotation': [[7.07106781e-01, 0.0, 7.07106781e-01],
+                         [0.0, 1.0e+00, 0.0],
+                         [-7.07106781e-01, 0.0, 7.07106781e-01]],
+        'pcd_trans': [1.0e+00, -1.0e+00, 0.0],
+        'pcd_horizontal_flip':
+        True,
+        'transformation_3d_flow': ['HF', 'S', 'R', 'T']
+    }
+
+    pcd = torch.tensor([[-5.2422e+00, 4.0021e+01, -2.9757e-01],
+                        [-9.1435e+01, -5.5950e+00, 2.6675e+01],
+                        [6.061661e+00, -1.0e+02, -0.0]])
+
+    pcd_transformed = apply_3d_transformation(
+        pcd, 'CAMERA', img_meta, reverse=False)
+
+    expected_tensor = torch.tensor(
+        [[6.53977e+00, 5.55982409e+01, 4.94463e+00],
+         [6.576e+01, -8.91252488e+00, 1.1811e+02],
+         [-5.061661e+00, -1.42421356e+02, -6.061661e+00]])
+
+    assert torch.allclose(expected_tensor, pcd_transformed, 1e-4)
+
+    # V, reverse, camera
+    img_meta = {'pcd_vertical_flip': True, 'transformation_3d_flow': ['VF']}
+
+    pcd_transformed = apply_3d_transformation(
+        pcd, 'CAMERA', img_meta, reverse=True)
+
+    expected_tensor = torch.tensor([[-5.2422e+00, 4.0021e+01, 2.9757e-01],
+                                    [-9.1435e+01, -5.5950e+00, -2.6675e+01],
+                                    [6.061661e+00, -1.0e+02, 0.0]])
+
+    assert torch.allclose(expected_tensor, pcd_transformed, 1e-4)
+
+    # V+H, not reverse, depth
+    img_meta = {
+        'pcd_vertical_flip': True,
+        'pcd_horizontal_flip': True,
+        'transformation_3d_flow': ['VF', 'HF']
+    }
+
+    pcd_transformed = apply_3d_transformation(
+        pcd, 'DEPTH', img_meta, reverse=False)
+
+    expected_tensor = torch.tensor([[5.2422e+00, -4.0021e+01, -2.9757e-01],
+                                    [9.1435e+01, 5.5950e+00, 2.6675e+01],
+                                    [-6.061661e+00, 1.0e+02, 0.0]])
+    assert torch.allclose(expected_tensor, pcd_transformed, 1e-4)
+
+    # V+H, reverse, lidar
+    img_meta = {
+        'pcd_vertical_flip': True,
+        'pcd_horizontal_flip': True,
+        'transformation_3d_flow': ['VF', 'HF']
+    }
+
+    pcd_transformed = apply_3d_transformation(
+        pcd, 'LIDAR', img_meta, reverse=True)
+
+    expected_tensor = torch.tensor([[5.2422e+00, -4.0021e+01, -2.9757e-01],
+                                    [9.1435e+01, 5.5950e+00, 2.6675e+01],
+                                    [-6.061661e+00, 1.0e+02, 0.0]])
+    assert torch.allclose(expected_tensor, pcd_transformed, 1e-4)
diff --git a/mmde/tests/test_models/test_layers/test_fusion_layers/test_point_fusion.py b/mmde/tests/test_models/test_layers/test_fusion_layers/test_point_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..916c404c1302197cd30dc6e6e5c78fecf1acd3e3
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_fusion_layers/test_point_fusion.py
@@ -0,0 +1,61 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Tests the core function of point fusion.
+
+CommandLine:
+    pytest tests/test_models/test_fusion/test_point_fusion.py
+"""
+
+import torch
+
+from mmdet3d.models.layers.fusion_layers import PointFusion
+
+
+def test_sample_single():
+    # this function makes sure the rewriting of 3d coords transformation
+    # in point fusion does not change the original behaviour
+    lidar2img = torch.tensor(
+        [[6.0294e+02, -7.0791e+02, -1.2275e+01, -1.7094e+02],
+         [1.7678e+02, 8.8088e+00, -7.0794e+02, -1.0257e+02],
+         [9.9998e-01, -1.5283e-03, -5.2907e-03, -3.2757e-01],
+         [0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00]])
+
+    #  all use default
+    img_meta = {
+        'transformation_3d_flow': ['R', 'S', 'T', 'HF'],
+        'input_shape': [370, 1224],
+        'img_shape': [370, 1224],
+        'lidar2img': lidar2img,
+    }
+
+    #  dummy parameters
+    fuse = PointFusion(1, 1, 1, 1)
+    img_feat = torch.arange(370 * 1224)[None, ...].view(
+        370, 1224)[None, None, ...].float() / (370 * 1224)
+    pts = torch.tensor([[8.356, -4.312, -0.445], [11.777, -6.724, -0.564],
+                        [6.453, 2.53, -1.612], [6.227, -3.839, -0.563]])
+    out = fuse.sample_single(img_feat, pts, img_meta)
+
+    expected_tensor = torch.tensor(
+        [0.5560822, 0.5476625, 0.9687978, 0.6241757])
+    assert torch.allclose(expected_tensor, out, 1e-4)
+
+    pcd_rotation = torch.tensor([[8.660254e-01, 0.5, 0],
+                                 [-0.5, 8.660254e-01, 0], [0, 0, 1.0e+00]])
+    pcd_scale_factor = 1.111
+    pcd_trans = torch.tensor([1.0, -1.0, 0.5])
+    pts = pts @ pcd_rotation
+    pts *= pcd_scale_factor
+    pts += pcd_trans
+    pts[:, 1] = -pts[:, 1]
+
+    # not use default
+    img_meta.update({
+        'pcd_scale_factor': pcd_scale_factor,
+        'pcd_rotation': pcd_rotation,
+        'pcd_trans': pcd_trans,
+        'pcd_horizontal_flip': True
+    })
+    out = fuse.sample_single(img_feat, pts, img_meta)
+    expected_tensor = torch.tensor(
+        [0.5560822, 0.5476625, 0.9687978, 0.6241757])
+    assert torch.allclose(expected_tensor, out, 1e-4)
diff --git a/mmde/tests/test_models/test_layers/test_fusion_layers/test_vote_fusion.py b/mmde/tests/test_models/test_layers/test_fusion_layers/test_vote_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f8bed0c945d7b11dc25ced16ff6cfa28eeeadaa
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_fusion_layers/test_vote_fusion.py
@@ -0,0 +1,322 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Tests the core function of vote fusion.
+
+CommandLine:
+    pytest tests/test_models/test_fusion/test_vote_fusion.py
+"""
+
+import torch
+
+from mmdet3d.models.layers.fusion_layers import VoteFusion
+
+
+def test_vote_fusion():
+    img_meta = {
+        'ori_shape': (530, 730),
+        'img_shape': (600, 826),
+        'pad_shape': (608, 832),
+        'scale_factor':
+        torch.tensor([1.1315, 1.1321, 1.1315, 1.1321]),
+        'flip':
+        False,
+        'pcd_horizontal_flip':
+        False,
+        'pcd_vertical_flip':
+        False,
+        'pcd_trans':
+        torch.tensor([0., 0., 0.]),
+        'pcd_scale_factor':
+        1.0308290128214932,
+        'pcd_rotation':
+        torch.tensor([[0.9747, 0.2234, 0.0000], [-0.2234, 0.9747, 0.0000],
+                      [0.0000, 0.0000, 1.0000]]),
+        'transformation_3d_flow': ['HF', 'R', 'S', 'T']
+    }
+
+    rt_mat = torch.tensor([[0.979570, 0.047954, -0.195330],
+                           [0.047954, 0.887470, 0.458370],
+                           [0.195330, -0.458370, 0.867030]])
+    k_mat = torch.tensor([[529.5000, 0.0000, 365.0000],
+                          [0.0000, 529.5000, 265.0000],
+                          [0.0000, 0.0000, 1.0000]])
+    rt_mat = rt_mat.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]
+                                ]) @ rt_mat.transpose(1, 0)
+    depth2img = k_mat @ rt_mat
+    img_meta['depth2img'] = depth2img
+
+    bboxes = torch.tensor([[[
+        5.4286e+02, 9.8283e+01, 6.1700e+02, 1.6742e+02, 9.7922e-01, 3.0000e+00
+    ], [
+        4.2613e+02, 8.4646e+01, 4.9091e+02, 1.6237e+02, 9.7848e-01, 3.0000e+00
+    ], [
+        2.5606e+02, 7.3244e+01, 3.7883e+02, 1.8471e+02, 9.7317e-01, 3.0000e+00
+    ], [
+        6.0104e+02, 1.0648e+02, 6.6757e+02, 1.9216e+02, 8.4607e-01, 3.0000e+00
+    ], [
+        2.2923e+02, 1.4984e+02, 7.0163e+02, 4.6537e+02, 3.5719e-01, 0.0000e+00
+    ], [
+        2.5614e+02, 7.4965e+01, 3.3275e+02, 1.5908e+02, 2.8688e-01, 3.0000e+00
+    ], [
+        9.8718e+00, 1.4142e+02, 2.0213e+02, 3.3878e+02, 1.0935e-01, 3.0000e+00
+    ], [
+        6.1930e+02, 1.1768e+02, 6.8505e+02, 2.0318e+02, 1.0720e-01, 3.0000e+00
+    ]]])
+
+    seeds_3d = torch.tensor([[[0.044544, 1.675476, -1.531831],
+                              [2.500625, 7.238662, -0.737675],
+                              [-0.600003, 4.827733, -0.084022],
+                              [1.396212, 3.994484, -1.551180],
+                              [-2.054746, 2.012759, -0.357472],
+                              [-0.582477, 6.580470, -1.466052],
+                              [1.313331, 5.722039, 0.123904],
+                              [-1.107057, 3.450359, -1.043422],
+                              [1.759746, 5.655951, -1.519564],
+                              [-0.203003, 6.453243, 0.137703],
+                              [-0.910429, 0.904407, -0.512307],
+                              [0.434049, 3.032374, -0.763842],
+                              [1.438146, 2.289263, -1.546332],
+                              [0.575622, 5.041906, -0.891143],
+                              [-1.675931, 1.417597, -1.588347]]])
+
+    imgs = torch.linspace(
+        -1, 1, steps=608 * 832).reshape(1, 608, 832).repeat(3, 1, 1)[None]
+
+    expected_tensor1 = torch.tensor(
+        [[[
+            0.000000e+00, -0.000000e+00, 0.000000e+00, -0.000000e+00,
+            0.000000e+00, 1.193706e-01, -0.000000e+00, -2.879214e-01,
+            -0.000000e+00, 0.000000e+00, 1.422463e-01, -6.474612e-01,
+            -0.000000e+00, 1.490057e-02, 0.000000e+00
+        ],
+          [
+              0.000000e+00, -0.000000e+00, -0.000000e+00, 0.000000e+00,
+              0.000000e+00, -1.873745e+00, -0.000000e+00, 1.576240e-01,
+              0.000000e+00, -0.000000e+00, -3.646177e-02, -7.751858e-01,
+              0.000000e+00, 9.593642e-02, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, -6.263277e-02, 0.000000e+00, -3.646387e-01,
+              0.000000e+00, 0.000000e+00, -5.875812e-01, -6.263450e-02,
+              0.000000e+00, 1.149264e-01, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 8.899736e-01, 0.000000e+00, 9.019017e-01,
+              0.000000e+00, 0.000000e+00, 6.917775e-01, 8.899733e-01,
+              0.000000e+00, 9.812444e-01, 0.000000e+00
+          ],
+          [
+              -0.000000e+00, -0.000000e+00, -0.000000e+00, -0.000000e+00,
+              -0.000000e+00, -4.516903e-01, -0.000000e+00, -2.315422e-01,
+              -0.000000e+00, -0.000000e+00, -4.197519e-01, -4.516906e-01,
+              -0.000000e+00, -1.547615e-01, -0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 3.571937e-01, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 3.571937e-01,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 9.731653e-01,
+              0.000000e+00, 0.000000e+00, 1.093455e-01, 0.000000e+00,
+              0.000000e+00, 8.460656e-01, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              2.316288e-03, -1.948284e-03, -3.694394e-03, 2.176163e-04,
+              -3.882605e-03, -1.901490e-03, -3.355042e-03, -1.774631e-03,
+              -6.981542e-04, -3.886823e-03, -1.302233e-03, -1.189933e-03,
+              2.540967e-03, -1.834944e-03, 1.032048e-03
+          ],
+          [
+              2.316288e-03, -1.948284e-03, -3.694394e-03, 2.176163e-04,
+              -3.882605e-03, -1.901490e-03, -3.355042e-03, -1.774631e-03,
+              -6.981542e-04, -3.886823e-03, -1.302233e-03, -1.189933e-03,
+              2.540967e-03, -1.834944e-03, 1.032048e-03
+          ],
+          [
+              2.316288e-03, -1.948284e-03, -3.694394e-03, 2.176163e-04,
+              -3.882605e-03, -1.901490e-03, -3.355042e-03, -1.774631e-03,
+              -6.981542e-04, -3.886823e-03, -1.302233e-03, -1.189933e-03,
+              2.540967e-03, -1.834944e-03, 1.032048e-03
+          ]]])
+
+    expected_tensor2 = torch.tensor([[
+        False, False, False, False, False, True, False, True, False, False,
+        True, True, False, True, False, False, False, False, False, False,
+        False, False, True, False, False, False, False, False, True, False,
+        False, False, False, False, False, False, False, False, False, False,
+        False, False, False, True, False
+    ]])
+
+    expected_tensor3 = torch.tensor(
+        [[[
+            -0.000000e+00, -0.000000e+00, -0.000000e+00, -0.000000e+00,
+            0.000000e+00, -0.000000e+00, -0.000000e+00, 0.000000e+00,
+            -0.000000e+00, -0.000000e+00, 0.000000e+00, -0.000000e+00,
+            -0.000000e+00, 1.720988e-01, 0.000000e+00
+        ],
+          [
+              0.000000e+00, -0.000000e+00, -0.000000e+00, 0.000000e+00,
+              -0.000000e+00, 0.000000e+00, -0.000000e+00, 0.000000e+00,
+              0.000000e+00, -0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 4.824460e-02, 0.000000e+00
+          ],
+          [
+              -0.000000e+00, -0.000000e+00, -0.000000e+00, -0.000000e+00,
+              -0.000000e+00, -0.000000e+00, -0.000000e+00, 0.000000e+00,
+              -0.000000e+00, -0.000000e+00, -0.000000e+00, -0.000000e+00,
+              -0.000000e+00, 1.447314e-01, -0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 9.759269e-01, 0.000000e+00
+          ],
+          [
+              -0.000000e+00, -0.000000e+00, -0.000000e+00, -0.000000e+00,
+              -0.000000e+00, -0.000000e+00, -0.000000e+00, -0.000000e+00,
+              -0.000000e+00, -0.000000e+00, -0.000000e+00, -0.000000e+00,
+              -0.000000e+00, -1.631542e-01, -0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 1.072001e-01, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
+              0.000000e+00, 0.000000e+00, 0.000000e+00
+          ],
+          [
+              2.316288e-03, -1.948284e-03, -3.694394e-03, 2.176163e-04,
+              -3.882605e-03, -1.901490e-03, -3.355042e-03, -1.774631e-03,
+              -6.981542e-04, -3.886823e-03, -1.302233e-03, -1.189933e-03,
+              2.540967e-03, -1.834944e-03, 1.032048e-03
+          ],
+          [
+              2.316288e-03, -1.948284e-03, -3.694394e-03, 2.176163e-04,
+              -3.882605e-03, -1.901490e-03, -3.355042e-03, -1.774631e-03,
+              -6.981542e-04, -3.886823e-03, -1.302233e-03, -1.189933e-03,
+              2.540967e-03, -1.834944e-03, 1.032048e-03
+          ],
+          [
+              2.316288e-03, -1.948284e-03, -3.694394e-03, 2.176163e-04,
+              -3.882605e-03, -1.901490e-03, -3.355042e-03, -1.774631e-03,
+              -6.981542e-04, -3.886823e-03, -1.302233e-03, -1.189933e-03,
+              2.540967e-03, -1.834944e-03, 1.032048e-03
+          ]]])
+
+    fusion = VoteFusion()
+    out1, out2 = fusion(imgs, bboxes, seeds_3d, [img_meta])
+    assert torch.allclose(expected_tensor1, out1[:, :, :15], 1e-3)
+    assert torch.allclose(expected_tensor2.float(), out2.float(), 1e-3)
+    assert torch.allclose(expected_tensor3, out1[:, :, 30:45], 1e-3)
+
+    out1, out2 = fusion(imgs, bboxes[:, :2], seeds_3d, [img_meta])
+    out1 = out1[:, :15, 30:45]
+    out2 = out2[:, 30:45].float()
+    assert torch.allclose(torch.zeros_like(out1), out1, 1e-3)
+    assert torch.allclose(torch.zeros_like(out2), out2, 1e-3)
diff --git a/mmde/tests/test_models/test_layers/test_minkowski_engine/test_minkowski_engine_module.py b/mmde/tests/test_models/test_layers/test_minkowski_engine/test_minkowski_engine_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..5996fe206ff5c36fcbc0bb17a998f645e359f95b
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_minkowski_engine/test_minkowski_engine_module.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.models.layers.minkowski_engine_block import \
+    IS_MINKOWSKI_ENGINE_AVAILABLE
+
+if IS_MINKOWSKI_ENGINE_AVAILABLE:
+    from MinkowskiEngine import SparseTensor
+
+    from mmdet3d.models.layers.minkowski_engine_block import (
+        MinkowskiBasicBlock, MinkowskiBottleneck, MinkowskiConvModule)
+else:
+    pytest.skip('test requires Minkowski Engine.', allow_module_level=True)
+
+
+def test_MinkowskiConvModule():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    voxel_features = torch.tensor(
+        [[6.56126, 0.9648336, -1.7339306, 0.315],
+         [6.8162713, -2.480431, -1.3616394, 0.36],
+         [11.643568, -4.744306, -1.3580885, 0.16],
+         [23.482342, 6.5036807, 0.5806964, 0.35]],
+        dtype=torch.float32).cuda()  # n, point_features
+    coordinates = torch.tensor(
+        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
+         [1, 35, 930, 469]],
+        dtype=torch.int32).cuda()  # n, 4(batch, ind_x, ind_y, ind_z)
+
+    # test
+    input_sp_tensor = SparseTensor(voxel_features, coordinates)
+
+    self = MinkowskiConvModule(4, 4, kernel_size=2, stride=2).cuda()
+
+    out_features = self(input_sp_tensor)
+    assert out_features.F.shape == torch.Size([4, 4])
+
+
+def test_MinkowskiResidualBlock():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    voxel_features = torch.tensor(
+        [[6.56126, 0.9648336, -1.7339306, 0.315],
+         [6.8162713, -2.480431, -1.3616394, 0.36],
+         [11.643568, -4.744306, -1.3580885, 0.16],
+         [23.482342, 6.5036807, 0.5806964, 0.35]],
+        dtype=torch.float32).cuda()  # n, point_features
+    coordinates = torch.tensor(
+        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
+         [1, 35, 930, 469]],
+        dtype=torch.int32).cuda()  # n, 4(batch, ind_x, ind_y, ind_z)
+
+    # test
+    input_sp_tensor = SparseTensor(voxel_features, coordinates)
+
+    sparse_block0 = MinkowskiBasicBlock(4, 4, kernel_size=3).cuda()
+    sparse_block1 = MinkowskiBottleneck(
+        4,
+        4,
+        downsample=MinkowskiConvModule(4, 16, kernel_size=1, act_cfg=None),
+        kernel_size=3).cuda()
+
+    # test forward
+    out_features0 = sparse_block0(input_sp_tensor)
+    out_features1 = sparse_block1(input_sp_tensor)
+    assert out_features0.F.shape == torch.Size([4, 4])
+    assert out_features1.F.shape == torch.Size([4, 16])
diff --git a/mmde/tests/test_models/test_layers/test_paconv/test_paconv_modules.py b/mmde/tests/test_models/test_layers/test_paconv/test_paconv_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c278d479365f243b62b52a6cf26572fb9988eb6
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_paconv/test_paconv_modules.py
@@ -0,0 +1,300 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+
+def test_paconv_sa_module_msg():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import PAConvSAModuleMSG
+
+    # paconv_num_kernels should have same length as mlp_channels
+    with pytest.raises(AssertionError):
+        self = PAConvSAModuleMSG(
+            num_point=16,
+            radii=[0.2, 0.4],
+            sample_nums=[4, 8],
+            mlp_channels=[[12, 16], [12, 32]],
+            paconv_num_kernels=[[4]]).cuda()
+
+    # paconv_num_kernels inner num should match as mlp_channels
+    with pytest.raises(AssertionError):
+        self = PAConvSAModuleMSG(
+            num_point=16,
+            radii=[0.2, 0.4],
+            sample_nums=[4, 8],
+            mlp_channels=[[12, 16], [12, 32]],
+            paconv_num_kernels=[[4, 4], [8, 8]]).cuda()
+
+    self = PAConvSAModuleMSG(
+        num_point=16,
+        radii=[0.2, 0.4],
+        sample_nums=[4, 8],
+        mlp_channels=[[12, 16], [12, 32]],
+        paconv_num_kernels=[[4], [8]],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=False,
+        pool_mod='max',
+        paconv_kernel_input='w_neighbor').cuda()
+
+    assert self.mlps[0].layer0.in_channels == 12 * 2
+    assert self.mlps[0].layer0.out_channels == 16
+    assert self.mlps[1].layer0.in_channels == 12 * 2
+    assert self.mlps[1].layer0.out_channels == 32
+    assert self.mlps[0].layer0.bn.num_features == 16
+    assert self.mlps[1].layer0.bn.num_features == 32
+
+    assert self.mlps[0].layer0.scorenet.mlps.layer0.conv.in_channels == 7
+    assert self.mlps[0].layer0.scorenet.mlps.layer3.conv.out_channels == 4
+    assert self.mlps[1].layer0.scorenet.mlps.layer0.conv.in_channels == 7
+    assert self.mlps[1].layer0.scorenet.mlps.layer3.conv.out_channels == 8
+
+    # last conv in ScoreNet has neither bn nor relu
+    with pytest.raises(AttributeError):
+        _ = self.mlps[0].layer0.scorenet.mlps.layer3.bn
+    with pytest.raises(AttributeError):
+        _ = self.mlps[0].layer0.scorenet.mlps.layer3.activate
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    # (B, N, 3)
+    xyz = torch.from_numpy(xyz).view(1, -1, 3).cuda()
+    # (B, C, N)
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 48, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+    # test with identity kernel input
+    self = PAConvSAModuleMSG(
+        num_point=16,
+        radii=[0.2, 0.4],
+        sample_nums=[4, 8],
+        mlp_channels=[[12, 16], [12, 32]],
+        paconv_num_kernels=[[4], [8]],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=False,
+        pool_mod='max',
+        paconv_kernel_input='identity').cuda()
+
+    assert self.mlps[0].layer0.in_channels == 12 * 1
+    assert self.mlps[0].layer0.out_channels == 16
+    assert self.mlps[0].layer0.num_kernels == 4
+    assert self.mlps[1].layer0.in_channels == 12 * 1
+    assert self.mlps[1].layer0.out_channels == 32
+    assert self.mlps[1].layer0.num_kernels == 8
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    # (B, N, 3)
+    xyz = torch.from_numpy(xyz).view(1, -1, 3).cuda()
+    # (B, C, N)
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 48, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+
+def test_paconv_sa_module():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import build_sa_module
+    sa_cfg = dict(
+        type='PAConvSAModule',
+        num_point=16,
+        radius=0.2,
+        num_sample=8,
+        mlp_channels=[12, 32],
+        paconv_num_kernels=[8],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=True,
+        pool_mod='max',
+        paconv_kernel_input='w_neighbor')
+    self = build_sa_module(sa_cfg).cuda()
+
+    assert self.mlps[0].layer0.in_channels == 15 * 2
+    assert self.mlps[0].layer0.out_channels == 32
+    assert self.mlps[0].layer0.num_kernels == 8
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    # (B, N, 3)
+    xyz = torch.from_numpy(xyz[..., :3]).view(1, -1, 3).cuda()
+    # (B, C, N)
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 32, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+    # test kNN sampling when radius is None
+    sa_cfg = dict(
+        type='PAConvSAModule',
+        num_point=16,
+        radius=None,
+        num_sample=8,
+        mlp_channels=[12, 32],
+        paconv_num_kernels=[8],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=True,
+        pool_mod='max',
+        paconv_kernel_input='identity')
+    self = build_sa_module(sa_cfg).cuda()
+    assert self.mlps[0].layer0.in_channels == 15 * 1
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    xyz = torch.from_numpy(xyz[..., :3]).view(1, -1, 3).cuda()
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 32, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+
+def test_paconv_cuda_sa_module_msg():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import PAConvCUDASAModuleMSG
+
+    # paconv_num_kernels should have same length as mlp_channels
+    with pytest.raises(AssertionError):
+        self = PAConvCUDASAModuleMSG(
+            num_point=16,
+            radii=[0.2, 0.4],
+            sample_nums=[4, 8],
+            mlp_channels=[[12, 16], [12, 32]],
+            paconv_num_kernels=[[4]]).cuda()
+
+    # paconv_num_kernels inner num should match as mlp_channels
+    with pytest.raises(AssertionError):
+        self = PAConvCUDASAModuleMSG(
+            num_point=16,
+            radii=[0.2, 0.4],
+            sample_nums=[4, 8],
+            mlp_channels=[[12, 16], [12, 32]],
+            paconv_num_kernels=[[4, 4], [8, 8]]).cuda()
+
+    self = PAConvCUDASAModuleMSG(
+        num_point=16,
+        radii=[0.2, 0.4],
+        sample_nums=[4, 8],
+        mlp_channels=[[12, 16], [12, 32]],
+        paconv_num_kernels=[[4], [8]],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=False,
+        pool_mod='max',
+        paconv_kernel_input='w_neighbor').cuda()
+
+    assert self.mlps[0][0].in_channels == 12 * 2
+    assert self.mlps[0][0].out_channels == 16
+    assert self.mlps[0][0].num_kernels == 4
+    assert self.mlps[0][0].bn.num_features == 16
+    assert self.mlps[1][0].in_channels == 12 * 2
+    assert self.mlps[1][0].out_channels == 32
+    assert self.mlps[1][0].num_kernels == 8
+    assert self.mlps[1][0].bn.num_features == 32
+
+    assert self.mlps[0][0].scorenet.mlps.layer0.conv.in_channels == 7
+    assert self.mlps[0][0].scorenet.mlps.layer3.conv.out_channels == 4
+    assert self.mlps[1][0].scorenet.mlps.layer0.conv.in_channels == 7
+    assert self.mlps[1][0].scorenet.mlps.layer3.conv.out_channels == 8
+
+    # last conv in ScoreNet has neither bn nor relu
+    with pytest.raises(AttributeError):
+        _ = self.mlps[0][0].scorenet.mlps.layer3.bn
+    with pytest.raises(AttributeError):
+        _ = self.mlps[0][0].scorenet.mlps.layer3.activate
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    # (B, N, 3)
+    xyz = torch.from_numpy(xyz).view(1, -1, 3).cuda()
+    # (B, C, N)
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 48, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+    # CUDA PAConv only supports w_neighbor kernel_input
+    with pytest.raises(AssertionError):
+        self = PAConvCUDASAModuleMSG(
+            num_point=16,
+            radii=[0.2, 0.4],
+            sample_nums=[4, 8],
+            mlp_channels=[[12, 16], [12, 32]],
+            paconv_num_kernels=[[4], [8]],
+            norm_cfg=dict(type='BN2d'),
+            use_xyz=False,
+            pool_mod='max',
+            paconv_kernel_input='identity').cuda()
+
+
+def test_paconv_cuda_sa_module():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import build_sa_module
+    sa_cfg = dict(
+        type='PAConvCUDASAModule',
+        num_point=16,
+        radius=0.2,
+        num_sample=8,
+        mlp_channels=[12, 32],
+        paconv_num_kernels=[8],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=True,
+        pool_mod='max',
+        paconv_kernel_input='w_neighbor')
+    self = build_sa_module(sa_cfg).cuda()
+
+    assert self.mlps[0][0].in_channels == 15 * 2
+    assert self.mlps[0][0].out_channels == 32
+    assert self.mlps[0][0].num_kernels == 8
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    # (B, N, 3)
+    xyz = torch.from_numpy(xyz[..., :3]).view(1, -1, 3).cuda()
+    # (B, C, N)
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 32, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+    # test kNN sampling when radius is None
+    sa_cfg = dict(
+        type='PAConvCUDASAModule',
+        num_point=16,
+        radius=None,
+        num_sample=8,
+        mlp_channels=[12, 32],
+        paconv_num_kernels=[8],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=True,
+        pool_mod='max',
+        paconv_kernel_input='w_neighbor')
+    self = build_sa_module(sa_cfg).cuda()
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    xyz = torch.from_numpy(xyz[..., :3]).view(1, -1, 3).cuda()
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 32, 16])
+    assert inds.shape == torch.Size([1, 16])
diff --git a/mmde/tests/test_models/test_layers/test_paconv/test_paconv_ops.py b/mmde/tests/test_models/test_layers/test_paconv/test_paconv_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f3f9cdb304c6809916e6ea4b1e698d9a14a5c1e
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_paconv/test_paconv_ops.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.models.layers import PAConv, PAConvCUDA
+
+
+def test_paconv():
+    B = 2
+    in_channels = 6
+    out_channels = 12
+    npoint = 4
+    K = 3
+    num_kernels = 4
+    points_xyz = torch.randn(B, 3, npoint, K)
+    features = torch.randn(B, in_channels, npoint, K)
+
+    paconv = PAConv(in_channels, out_channels, num_kernels)
+    assert paconv.weight_bank.shape == torch.Size(
+        [in_channels * 2, out_channels * num_kernels])
+
+    with torch.no_grad():
+        new_features, _ = paconv((features, points_xyz))
+
+    assert new_features.shape == torch.Size([B, out_channels, npoint, K])
+
+
+def test_paconv_cuda():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    B = 2
+    in_channels = 6
+    out_channels = 12
+    N = 32
+    npoint = 4
+    K = 3
+    num_kernels = 4
+    points_xyz = torch.randn(B, 3, npoint, K).float().cuda()
+    features = torch.randn(B, in_channels, N).float().cuda()
+    points_idx = torch.randint(0, N, (B, npoint, K)).long().cuda()
+
+    paconv = PAConvCUDA(in_channels, out_channels, num_kernels).cuda()
+    assert paconv.weight_bank.shape == torch.Size(
+        [in_channels * 2, out_channels * num_kernels])
+
+    with torch.no_grad():
+        new_features, _, _ = paconv((features, points_xyz, points_idx))
+
+    assert new_features.shape == torch.Size([B, out_channels, npoint, K])
diff --git a/mmde/tests/test_models/test_layers/test_pointnet_modules/test_point_fp_module.py b/mmde/tests/test_models/test_layers/test_pointnet_modules/test_point_fp_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..c413f275fbed406e8fa22ff7b85c2bd2154ef9db
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_pointnet_modules/test_point_fp_module.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+
+def test_pointnet_fp_module():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import PointFPModule
+
+    self = PointFPModule(mlp_channels=[24, 16]).cuda()
+    assert self.mlps.layer0.conv.in_channels == 24
+    assert self.mlps.layer0.conv.out_channels == 16
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin',
+                      np.float32).reshape((-1, 6))
+
+    # (B, N, 3)
+    xyz1 = torch.from_numpy(xyz[0::2, :3]).view(1, -1, 3).cuda()
+    # (B, C1, N)
+    features1 = xyz1.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+
+    # (B, M, 3)
+    xyz2 = torch.from_numpy(xyz[1::3, :3]).view(1, -1, 3).cuda()
+    # (B, C2, N)
+    features2 = xyz2.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+
+    fp_features = self(xyz1, xyz2, features1, features2)
+    assert fp_features.shape == torch.Size([1, 16, 50])
diff --git a/mmde/tests/test_models/test_layers/test_pointnet_modules/test_point_sa_module.py b/mmde/tests/test_models/test_layers/test_pointnet_modules/test_point_sa_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b3e6783452693676fc4005f42252c61df54130b
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_pointnet_modules/test_point_sa_module.py
@@ -0,0 +1,208 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+
+def test_pointnet_sa_module_msg():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import PointSAModuleMSG
+
+    self = PointSAModuleMSG(
+        num_point=16,
+        radii=[0.2, 0.4],
+        sample_nums=[4, 8],
+        mlp_channels=[[12, 16], [12, 32]],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=False,
+        pool_mod='max').cuda()
+
+    assert self.mlps[0].layer0.conv.in_channels == 12
+    assert self.mlps[0].layer0.conv.out_channels == 16
+    assert self.mlps[1].layer0.conv.in_channels == 12
+    assert self.mlps[1].layer0.conv.out_channels == 32
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    # (B, N, 3)
+    xyz = torch.from_numpy(xyz).view(1, -1, 3).cuda()
+    # (B, C, N)
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 48, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+    # test D-FPS mod
+    self = PointSAModuleMSG(
+        num_point=16,
+        radii=[0.2, 0.4],
+        sample_nums=[4, 8],
+        mlp_channels=[[12, 16], [12, 32]],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=False,
+        pool_mod='max',
+        fps_mod=['D-FPS'],
+        fps_sample_range_list=[-1]).cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 48, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+    # test F-FPS mod
+    self = PointSAModuleMSG(
+        num_point=16,
+        radii=[0.2, 0.4],
+        sample_nums=[4, 8],
+        mlp_channels=[[12, 16], [12, 32]],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=False,
+        pool_mod='max',
+        fps_mod=['F-FPS'],
+        fps_sample_range_list=[-1]).cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 48, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+    # test FS mod
+    self = PointSAModuleMSG(
+        num_point=8,
+        radii=[0.2, 0.4],
+        sample_nums=[4, 8],
+        mlp_channels=[[12, 16], [12, 32]],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=False,
+        pool_mod='max',
+        fps_mod=['FS'],
+        fps_sample_range_list=[-1]).cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 48, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+    # test using F-FPS mod and D-FPS mod simultaneously
+    self = PointSAModuleMSG(
+        num_point=[8, 12],
+        radii=[0.2, 0.4],
+        sample_nums=[4, 8],
+        mlp_channels=[[12, 16], [12, 32]],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=False,
+        pool_mod='max',
+        fps_mod=['F-FPS', 'D-FPS'],
+        fps_sample_range_list=[64, -1]).cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 20, 3])
+    assert new_features.shape == torch.Size([1, 48, 20])
+    assert inds.shape == torch.Size([1, 20])
+
+    # test num_points = None
+    self = PointSAModuleMSG(
+        num_point=None,
+        radii=[0.2, 0.4],
+        sample_nums=[4, 8],
+        mlp_channels=[[12, 16], [12, 32]],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=False,
+        pool_mod='max').cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_features.shape == torch.Size([1, 48, 1])
+
+    # length of 'fps_mod' should be same as 'fps_sample_range_list'
+    with pytest.raises(AssertionError):
+        PointSAModuleMSG(
+            num_point=8,
+            radii=[0.2, 0.4],
+            sample_nums=[4, 8],
+            mlp_channels=[[12, 16], [12, 32]],
+            norm_cfg=dict(type='BN2d'),
+            use_xyz=False,
+            pool_mod='max',
+            fps_mod=['F-FPS', 'D-FPS'],
+            fps_sample_range_list=[-1]).cuda()
+
+    # length of 'num_point' should be same as 'fps_sample_range_list'
+    with pytest.raises(AssertionError):
+        PointSAModuleMSG(
+            num_point=[8, 8],
+            radii=[0.2, 0.4],
+            sample_nums=[4, 8],
+            mlp_channels=[[12, 16], [12, 32]],
+            norm_cfg=dict(type='BN2d'),
+            use_xyz=False,
+            pool_mod='max',
+            fps_mod=['F-FPS'],
+            fps_sample_range_list=[-1]).cuda()
+
+
+def test_pointnet_sa_module():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    from mmdet3d.models.layers import build_sa_module
+    sa_cfg = dict(
+        type='PointSAModule',
+        num_point=16,
+        radius=0.2,
+        num_sample=8,
+        mlp_channels=[12, 32],
+        norm_cfg=dict(type='BN2d'),
+        use_xyz=True,
+        pool_mod='max')
+    self = build_sa_module(sa_cfg).cuda()
+
+    assert self.mlps[0].layer0.conv.in_channels == 15
+    assert self.mlps[0].layer0.conv.out_channels == 32
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    # (B, N, 3)
+    xyz = torch.from_numpy(xyz[..., :3]).view(1, -1, 3).cuda()
+    # (B, C, N)
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+
+    # test forward
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 32, 16])
+    assert inds.shape == torch.Size([1, 16])
+
+    # can't set normalize_xyz when radius is None
+    with pytest.raises(AssertionError):
+        sa_cfg = dict(
+            type='PointSAModule',
+            num_point=16,
+            radius=None,
+            num_sample=8,
+            mlp_channels=[12, 32],
+            norm_cfg=dict(type='BN2d'),
+            use_xyz=True,
+            pool_mod='max',
+            normalize_xyz=True)
+        self = build_sa_module(sa_cfg)
+
+    # test kNN sampling when radius is None
+    sa_cfg['normalize_xyz'] = False
+    self = build_sa_module(sa_cfg).cuda()
+
+    xyz = np.fromfile('tests/data/sunrgbd/points/000001.bin', np.float32)
+
+    xyz = torch.from_numpy(xyz[..., :3]).view(1, -1, 3).cuda()
+    features = xyz.repeat([1, 1, 4]).transpose(1, 2).contiguous().cuda()
+    new_xyz, new_features, inds = self(xyz, features)
+    assert new_xyz.shape == torch.Size([1, 16, 3])
+    assert new_features.shape == torch.Size([1, 32, 16])
+    assert inds.shape == torch.Size([1, 16])
diff --git a/mmde/tests/test_models/test_layers/test_spconv/test_spconv_module.py b/mmde/tests/test_models/test_layers/test_spconv/test_spconv_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ae7691e85cad197db75a5cc3e47cf0309b496cf
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_spconv/test_spconv_module.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.models.layers import SparseBasicBlock
+from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+
+if IS_SPCONV2_AVAILABLE:
+    from spconv.pytorch import (SparseConvTensor, SparseInverseConv3d,
+                                SubMConv3d)
+else:
+    from mmcv.ops import SparseConvTensor, SparseInverseConv3d, SubMConv3d
+
+
+def test_SparseBasicBlock():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    voxel_features = torch.tensor(
+        [[6.56126, 0.9648336, -1.7339306, 0.315],
+         [6.8162713, -2.480431, -1.3616394, 0.36],
+         [11.643568, -4.744306, -1.3580885, 0.16],
+         [23.482342, 6.5036807, 0.5806964, 0.35]],
+        dtype=torch.float32).cuda()  # n, point_features
+    coordinates = torch.tensor(
+        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
+         [1, 35, 930, 469]],
+        dtype=torch.int32).cuda()  # n, 4(batch, ind_x, ind_y, ind_z)
+
+    # test
+    input_sp_tensor = SparseConvTensor(voxel_features, coordinates,
+                                       [41, 1600, 1408], 2)
+    self = SparseBasicBlock(
+        4,
+        4,
+        conv_cfg=dict(type='SubMConv3d', indice_key='subm1'),
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01)).cuda()
+    # test conv and bn layer
+    assert isinstance(self.conv1, SubMConv3d)
+    assert self.conv1.in_channels == 4
+    assert self.conv1.out_channels == 4
+    assert isinstance(self.conv2, SubMConv3d)
+    assert self.conv2.out_channels == 4
+    assert self.conv2.out_channels == 4
+    assert self.bn1.eps == 1e-3
+    assert self.bn1.momentum == 0.01
+
+    out_features = self(input_sp_tensor)
+    assert out_features.features.shape == torch.Size([4, 4])
+
+
+def test_make_sparse_convmodule():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    from mmdet3d.models.layers import make_sparse_convmodule
+
+    voxel_features = torch.tensor(
+        [[6.56126, 0.9648336, -1.7339306, 0.315],
+         [6.8162713, -2.480431, -1.3616394, 0.36],
+         [11.643568, -4.744306, -1.3580885, 0.16],
+         [23.482342, 6.5036807, 0.5806964, 0.35]],
+        dtype=torch.float32).cuda()  # n, point_features
+    coordinates = torch.tensor(
+        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
+         [1, 35, 930, 469]],
+        dtype=torch.int32).cuda()  # n, 4(batch, ind_x, ind_y, ind_z)
+
+    # test
+    input_sp_tensor = SparseConvTensor(voxel_features, coordinates,
+                                       [41, 1600, 1408], 2)
+
+    sparse_block0 = make_sparse_convmodule(
+        4,
+        16,
+        3,
+        'test0',
+        stride=1,
+        padding=0,
+        conv_type='SubMConv3d',
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+        order=('conv', 'norm', 'act')).cuda()
+    assert isinstance(sparse_block0[0], SubMConv3d)
+    assert sparse_block0[0].in_channels == 4
+    assert sparse_block0[0].out_channels == 16
+    assert isinstance(sparse_block0[1], torch.nn.BatchNorm1d)
+    assert sparse_block0[1].eps == 0.001
+    assert sparse_block0[1].momentum == 0.01
+    assert isinstance(sparse_block0[2], torch.nn.ReLU)
+
+    # test forward
+    out_features = sparse_block0(input_sp_tensor)
+    assert out_features.features.shape == torch.Size([4, 16])
+
+    sparse_block1 = make_sparse_convmodule(
+        4,
+        16,
+        3,
+        'test1',
+        stride=1,
+        padding=0,
+        conv_type='SparseInverseConv3d',
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+        order=('norm', 'act', 'conv'))
+    assert isinstance(sparse_block1[0], torch.nn.BatchNorm1d)
+    assert isinstance(sparse_block1[1], torch.nn.ReLU)
+    assert isinstance(sparse_block1[2], SparseInverseConv3d)
diff --git a/mmde/tests/test_models/test_layers/test_torchsparse/test_torchsparse_module.py b/mmde/tests/test_models/test_layers/test_torchsparse/test_torchsparse_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..10aa503a4dbfd519585bee6c50800fcc42b03915
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_torchsparse/test_torchsparse_module.py
@@ -0,0 +1,64 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.models.layers.torchsparse import IS_TORCHSPARSE_AVAILABLE
+
+if IS_TORCHSPARSE_AVAILABLE:
+    from torchsparse import SparseTensor
+
+    from mmdet3d.models.layers.torchsparse_block import (TorchSparseBasicBlock,
+                                                         TorchSparseBottleneck,
+                                                         TorchSparseConvModule)
+else:
+    pytest.skip('test requires Torchsparse', allow_module_level=True)
+
+
+def test_TorchsparseConvModule():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    voxel_features = torch.tensor(
+        [[6.56126, 0.9648336, -1.7339306, 0.315],
+         [6.8162713, -2.480431, -1.3616394, 0.36],
+         [11.643568, -4.744306, -1.3580885, 0.16],
+         [23.482342, 6.5036807, 0.5806964, 0.35]],
+        dtype=torch.float32).cuda()  # n, point_features
+    coordinates = torch.tensor(
+        [[12, 819, 131, 0], [16, 750, 136, 0], [16, 705, 232, 1],
+         [35, 930, 469, 1]],
+        dtype=torch.int32).cuda()  # n, 4(ind_x, ind_y, ind_z, batch)
+
+    # test
+    input_sp_tensor = SparseTensor(voxel_features, coordinates)
+
+    self = TorchSparseConvModule(4, 4, kernel_size=2, stride=2).cuda()
+
+    out_features = self(input_sp_tensor)
+    assert out_features.F.shape == torch.Size([4, 4])
+
+
+def test_TorchsparseResidualBlock():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    voxel_features = torch.tensor(
+        [[6.56126, 0.9648336, -1.7339306, 0.315],
+         [6.8162713, -2.480431, -1.3616394, 0.36],
+         [11.643568, -4.744306, -1.3580885, 0.16],
+         [23.482342, 6.5036807, 0.5806964, 0.35]],
+        dtype=torch.float32).cuda()  # n, point_features
+    coordinates = torch.tensor(
+        [[12, 819, 131, 0], [16, 750, 136, 0], [16, 705, 232, 1],
+         [35, 930, 469, 1]],
+        dtype=torch.int32).cuda()  # n, 4(ind_x, ind_y, ind_z, batch)
+
+    # test
+    input_sp_tensor = SparseTensor(voxel_features, coordinates)
+
+    sparse_block0 = TorchSparseBasicBlock(4, 16, kernel_size=3).cuda()
+    sparse_block1 = TorchSparseBottleneck(4, 16, kernel_size=3).cuda()
+
+    # test forward
+    out_features0 = sparse_block0(input_sp_tensor)
+    out_features1 = sparse_block1(input_sp_tensor)
+    assert out_features0.F.shape == torch.Size([4, 16])
+    assert out_features1.F.shape == torch.Size([4, 16])
diff --git a/mmde/tests/test_models/test_layers/test_vote_module.py b/mmde/tests/test_models/test_layers/test_vote_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..31a594403eafc961f5dcbc9317b2a389b203d06b
--- /dev/null
+++ b/mmde/tests/test_models/test_layers/test_vote_module.py
@@ -0,0 +1,39 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def test_vote_module():
+    from mmdet3d.models.layers import VoteModule
+
+    vote_loss = dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='none',
+        loss_dst_weight=10.0)
+    self = VoteModule(vote_per_seed=3, in_channels=8, vote_loss=vote_loss)
+
+    seed_xyz = torch.rand([2, 64, 3], dtype=torch.float32)  # (b, npoints, 3)
+    seed_features = torch.rand(
+        [2, 8, 64], dtype=torch.float32)  # (b, in_channels, npoints)
+
+    # test forward
+    vote_xyz, vote_features, vote_offset = self(seed_xyz, seed_features)
+    assert vote_xyz.shape == torch.Size([2, 192, 3])
+    assert vote_features.shape == torch.Size([2, 8, 192])
+    assert vote_offset.shape == torch.Size([2, 3, 192])
+
+    # test clip offset and without feature residual
+    self = VoteModule(
+        vote_per_seed=1,
+        in_channels=8,
+        num_points=32,
+        with_res_feat=False,
+        vote_xyz_range=(2.0, 2.0, 2.0))
+
+    vote_xyz, vote_features, vote_offset = self(seed_xyz, seed_features)
+    assert vote_xyz.shape == torch.Size([2, 32, 3])
+    assert vote_features.shape == torch.Size([2, 8, 32])
+    assert vote_offset.shape == torch.Size([2, 3, 32])
+    assert torch.allclose(seed_features[..., :32], vote_features)
+    assert vote_offset.max() <= 2.0
+    assert vote_offset.min() >= -2.0
diff --git a/mmde/tests/test_models/test_losses/test_chamfer_disrance.py b/mmde/tests/test_models/test_losses/test_chamfer_disrance.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aaff2f77fc1b4573c00e25b4e4e58f197f5ee6f
--- /dev/null
+++ b/mmde/tests/test_models/test_losses/test_chamfer_disrance.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+
+def test_chamfer_disrance():
+    from mmdet3d.models.losses import ChamferDistance, chamfer_distance
+
+    with pytest.raises(AssertionError):
+        # test invalid mode
+        ChamferDistance(mode='smoothl1')
+        # test invalid type of reduction
+        ChamferDistance(mode='l2', reduction=None)
+
+    self = ChamferDistance(
+        mode='l2', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0)
+    source = torch.tensor([[[-0.9888, 0.9683, -0.8494],
+                            [-6.4536, 4.5146,
+                             1.6861], [2.0482, 5.6936, -1.4701],
+                            [-0.5173, 5.6472, 2.1748],
+                            [-2.8010, 5.4423, -1.2158],
+                            [2.4018, 2.4389, -0.2403],
+                            [-2.8811, 3.8486, 1.4750],
+                            [-0.2031, 3.8969,
+                             -1.5245], [1.3827, 4.9295, 1.1537],
+                            [-2.6961, 2.2621, -1.0976]],
+                           [[0.3692, 1.8409,
+                             -1.4983], [1.9995, 6.3602, 0.1798],
+                            [-2.1317, 4.6011,
+                             -0.7028], [2.4158, 3.1482, 0.3169],
+                            [-0.5836, 3.6250, -1.2650],
+                            [-1.9862, 1.6182, -1.4901],
+                            [2.5992, 1.2847, -0.8471],
+                            [-0.3467, 5.3681, -1.4755],
+                            [-0.8576, 3.3400, -1.7399],
+                            [2.7447, 4.6349, 0.1994]]])
+
+    target = torch.tensor([[[-0.4758, 1.0094, -0.8645],
+                            [-0.3130, 0.8564, -0.9061],
+                            [-0.1560, 2.0394, -0.8936],
+                            [-0.3685, 1.6467, -0.8271],
+                            [-0.2740, 2.2212, -0.7980]],
+                           [[1.4856, 2.5299,
+                             -1.0047], [2.3262, 3.3065, -0.9475],
+                            [2.4593, 2.5870,
+                             -0.9423], [0.0000, 0.0000, 0.0000],
+                            [0.0000, 0.0000, 0.0000]]])
+
+    loss_source, loss_target, indices1, indices2 = self(
+        source, target, return_indices=True)
+
+    assert torch.allclose(loss_source, torch.tensor(219.5936))
+    assert torch.allclose(loss_target, torch.tensor(22.3705))
+
+    expected_inds1 = [[0, 4, 4, 4, 4, 2, 4, 4, 4, 3],
+                      [0, 1, 0, 1, 0, 4, 2, 0, 0, 1]]
+    expected_inds2 = [[0, 4, 4, 4, 4, 2, 4, 4, 4, 3],
+                      [0, 1, 0, 1, 0, 3, 2, 0, 0, 1]]
+    assert (torch.equal(indices1, indices1.new_tensor(expected_inds1))
+            or torch.equal(indices1, indices1.new_tensor(expected_inds2)))
+    assert torch.equal(indices2,
+                       indices2.new_tensor([[0, 0, 0, 0, 0], [0, 3, 6, 0, 0]]))
+
+    loss_source, loss_target, indices1, indices2 = chamfer_distance(
+        source, target, reduction='sum')
+
+    assert torch.allclose(loss_source, torch.tensor(219.5936))
+    assert torch.allclose(loss_target, torch.tensor(22.3705))
+    assert (torch.equal(indices1, indices1.new_tensor(expected_inds1))
+            or torch.equal(indices1, indices1.new_tensor(expected_inds2)))
+    assert (indices2 == indices2.new_tensor([[0, 0, 0, 0, 0], [0, 3, 6, 0,
+                                                               0]])).all()
diff --git a/mmde/tests/test_models/test_losses/test_multibin_loss.py b/mmde/tests/test_models/test_losses/test_multibin_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..28ed1337b8da876dfb843be4398e684a865a5b93
--- /dev/null
+++ b/mmde/tests/test_models/test_losses/test_multibin_loss.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_multibin_loss():
+    from mmdet3d.models.losses import MultiBinLoss
+
+    # reduction should be in ['none', 'mean', 'sum']
+    with pytest.raises(AssertionError):
+        multibin_loss = MultiBinLoss(reduction='l2')
+
+    pred = torch.tensor([[
+        0.81, 0.32, 0.78, 0.52, 0.24, 0.12, 0.32, 0.11, 1.20, 1.30, 0.20, 0.11,
+        0.12, 0.11, 0.23, 0.31
+    ],
+                         [
+                             0.02, 0.19, 0.78, 0.22, 0.31, 0.12, 0.22, 0.11,
+                             1.20, 1.30, 0.45, 0.51, 0.12, 0.11, 0.13, 0.61
+                         ]])
+    target = torch.tensor([[1, 1, 0, 0, 2.14, 3.12, 0.68, -2.15],
+                           [1, 1, 0, 0, 3.12, 3.12, 2.34, 1.23]])
+    multibin_loss_cfg = dict(
+        type='MultiBinLoss', reduction='none', loss_weight=1.0)
+    multibin_loss = MODELS.build(multibin_loss_cfg)
+    output_multibin_loss = multibin_loss(pred, target, num_dir_bins=4)
+    expected_multibin_loss = torch.tensor(2.1120)
+    assert torch.allclose(
+        output_multibin_loss, expected_multibin_loss, atol=1e-4)
diff --git a/mmde/tests/test_models/test_losses/test_paconv_regularization_loss.py b/mmde/tests/test_models/test_losses/test_paconv_regularization_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..754fa54a9af755edbe7504057016ead46d2f0d25
--- /dev/null
+++ b/mmde/tests/test_models/test_losses/test_paconv_regularization_loss.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+
+import numpy as np
+import pytest
+import torch
+from torch import nn as nn
+
+
+def set_random_seed(seed, deterministic=False):
+    """Set random seed.
+
+    Args:
+        seed (int): Seed to be used.
+        deterministic (bool): Whether to set the deterministic option for
+            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
+            to True and `torch.backends.cudnn.benchmark` to False.
+            Default: False.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    if deterministic:
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+
+def test_paconv_regularization_loss():
+    from mmdet3d.models.layers import PAConv, PAConvCUDA
+    from mmdet3d.models.losses import PAConvRegularizationLoss
+
+    class ToyModel(nn.Module):
+
+        def __init__(self):
+            super(ToyModel, self).__init__()
+
+            self.paconvs = nn.ModuleList()
+            self.paconvs.append(PAConv(8, 16, 8))
+            self.paconvs.append(PAConv(8, 16, 8, kernel_input='identity'))
+            self.paconvs.append(PAConvCUDA(8, 16, 8))
+
+            self.conv1 = nn.Conv1d(3, 8, 1)
+
+    set_random_seed(0, True)
+    model = ToyModel()
+
+    # reduction should be in ['none', 'mean', 'sum']
+    with pytest.raises(AssertionError):
+        paconv_corr_loss = PAConvRegularizationLoss(reduction='l2')
+
+    paconv_corr_loss = PAConvRegularizationLoss(reduction='mean')
+    mean_corr_loss = paconv_corr_loss(model.modules())
+    assert mean_corr_loss >= 0
+    assert mean_corr_loss.requires_grad
+
+    sum_corr_loss = paconv_corr_loss(model.modules(), reduction_override='sum')
+    assert torch.allclose(sum_corr_loss, mean_corr_loss * 3)
+
+    none_corr_loss = paconv_corr_loss(
+        model.modules(), reduction_override='none')
+    assert none_corr_loss.shape[0] == 3
+    assert torch.allclose(none_corr_loss.mean(), mean_corr_loss)
diff --git a/mmde/tests/test_models/test_losses/test_rotated_iou_loss.py b/mmde/tests/test_models/test_losses/test_rotated_iou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f293522eecc2a16e8f8345800fedd112e8fbb64
--- /dev/null
+++ b/mmde/tests/test_models/test_losses/test_rotated_iou_loss.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import numpy as np
+import torch
+
+from mmdet3d.models.losses import RotatedIoU3DLoss
+
+
+def test_rotated_iou_3d_loss():
+
+    if not torch.cuda.is_available():
+        return
+
+    boxes1 = torch.tensor([[.5, .5, .5, 1., 1., 1., .0],
+                           [.5, .5, .5, 1., 1., 1., .0],
+                           [.5, .5, .5, 1., 1., 1., .0],
+                           [.5, .5, .5, 1., 1., 1., .0],
+                           [.5, .5, .5, 1., 1., 1., .0]]).cuda()
+    boxes2 = torch.tensor([[.5, .5, .5, 1., 1., 1., .0],
+                           [.5, .5, .5, 1., 1., 2., np.pi / 2],
+                           [.5, .5, .5, 1., 1., 1., np.pi / 4],
+                           [1., 1., 1., 1., 1., 1., .0],
+                           [-1.5, -1.5, -1.5, 2.5, 2.5, 2.5, .0]]).cuda()
+
+    expect_ious = 1 - torch.tensor([[1., .5, .7071, 1 / 15, .0]]).cuda()
+    ious = RotatedIoU3DLoss(reduction='none')(boxes1, boxes2)
+    assert torch.allclose(ious, expect_ious, atol=1e-4)
diff --git a/mmde/tests/test_models/test_losses/test_uncertain_smooth_l1_loss.py b/mmde/tests/test_models/test_losses/test_uncertain_smooth_l1_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d6f2b4695e6eba2ca085ef96946fab452817f84
--- /dev/null
+++ b/mmde/tests/test_models/test_losses/test_uncertain_smooth_l1_loss.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_uncertain_smooth_l1_loss():
+    from mmdet3d.models.losses import UncertainL1Loss, UncertainSmoothL1Loss
+
+    # reduction should be in ['none', 'mean', 'sum']
+    with pytest.raises(AssertionError):
+        uncertain_l1_loss = UncertainL1Loss(reduction='l2')
+    with pytest.raises(AssertionError):
+        uncertain_smooth_l1_loss = UncertainSmoothL1Loss(reduction='l2')
+
+    pred = torch.tensor([1.5783, 0.5972, 1.4821, 0.9488])
+    target = torch.tensor([1.0813, -0.3466, -1.1404, -0.9665])
+    sigma = torch.tensor([-1.0053, 0.4710, -1.7784, -0.8603])
+
+    # test uncertain l1 loss
+    uncertain_l1_loss_cfg = dict(
+        type='UncertainL1Loss', alpha=1.0, reduction='mean', loss_weight=1.0)
+    uncertain_l1_loss = MODELS.build(uncertain_l1_loss_cfg)
+    mean_l1_loss = uncertain_l1_loss(pred, target, sigma)
+    expected_l1_loss = torch.tensor(4.7069)
+    assert torch.allclose(mean_l1_loss, expected_l1_loss, atol=1e-4)
+
+    # test uncertain smooth l1 loss
+    uncertain_smooth_l1_loss_cfg = dict(
+        type='UncertainSmoothL1Loss',
+        alpha=1.0,
+        beta=0.5,
+        reduction='mean',
+        loss_weight=1.0)
+    uncertain_smooth_l1_loss = MODELS.build(uncertain_smooth_l1_loss_cfg)
+    mean_smooth_l1_loss = uncertain_smooth_l1_loss(pred, target, sigma)
+    expected_smooth_l1_loss = torch.tensor(3.9795)
+    assert torch.allclose(
+        mean_smooth_l1_loss, expected_smooth_l1_loss, atol=1e-4)
diff --git a/mmde/tests/test_models/test_middle_encoders/test_sparse_encoders.py b/mmde/tests/test_models/test_middle_encoders/test_sparse_encoders.py
new file mode 100644
index 0000000000000000000000000000000000000000..698282321f3ab582557a628af3841049d49d2c9a
--- /dev/null
+++ b/mmde/tests/test_models/test_middle_encoders/test_sparse_encoders.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_sparse_encoder():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    sparse_encoder_cfg = dict(
+        type='SparseEncoder',
+        in_channels=5,
+        sparse_shape=[40, 1024, 1024],
+        order=('conv', 'norm', 'act'),
+        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+                                                                      128)),
+        encoder_paddings=((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1,
+                                                                       1)),
+        block_type='basicblock')
+
+    sparse_encoder = MODELS.build(sparse_encoder_cfg).cuda()
+    voxel_features = torch.rand([207842, 5]).cuda()
+    coors = torch.randint(0, 4, [207842, 4]).cuda()
+
+    ret = sparse_encoder(voxel_features, coors, 4)
+    assert ret.shape == torch.Size([4, 256, 128, 128])
+
+
+def test_sparse_encoder_for_ssd():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    sparse_encoder_for_ssd_cfg = dict(
+        type='SparseEncoderSASSD',
+        in_channels=5,
+        sparse_shape=[40, 1024, 1024],
+        order=('conv', 'norm', 'act'),
+        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+                                                                      128)),
+        encoder_paddings=((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1,
+                                                                       1)),
+        block_type='basicblock')
+
+    sparse_encoder = MODELS.build(sparse_encoder_for_ssd_cfg).cuda()
+    voxel_features = torch.rand([207842, 5]).cuda()
+    coors = torch.randint(0, 4, [207842, 4]).cuda()
+
+    ret, _ = sparse_encoder(voxel_features, coors, 4, True)
+    assert ret.shape == torch.Size([4, 256, 128, 128])
diff --git a/mmde/tests/test_models/test_middle_encoders/test_sparse_unet.py b/mmde/tests/test_models/test_middle_encoders/test_sparse_unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fe1e5b85f777bc784886f15c126e6d8ce1a3468
--- /dev/null
+++ b/mmde/tests/test_models/test_middle_encoders/test_sparse_unet.py
@@ -0,0 +1,56 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.models.layers import SparseBasicBlock
+from mmdet3d.models.layers.spconv import IS_SPCONV2_AVAILABLE
+
+if IS_SPCONV2_AVAILABLE:
+    from spconv.pytorch import SparseConv3d, SparseInverseConv3d, SubMConv3d
+else:
+    from mmcv.ops import SparseConv3d, SparseInverseConv3d, SubMConv3d
+
+
+def test_SparseUNet():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    from mmdet3d.models.middle_encoders.sparse_unet import SparseUNet
+    self = SparseUNet(in_channels=4, sparse_shape=[41, 1600, 1408]).cuda()
+
+    # test encoder layers
+    assert len(self.encoder_layers) == 4
+    assert self.encoder_layers.encoder_layer1[0][0].in_channels == 16
+    assert self.encoder_layers.encoder_layer1[0][0].out_channels == 16
+    assert isinstance(self.encoder_layers.encoder_layer1[0][0], SubMConv3d)
+    assert isinstance(self.encoder_layers.encoder_layer1[0][1],
+                      torch.nn.modules.batchnorm.BatchNorm1d)
+    assert isinstance(self.encoder_layers.encoder_layer1[0][2],
+                      torch.nn.modules.activation.ReLU)
+    assert self.encoder_layers.encoder_layer4[0][0].in_channels == 64
+    assert self.encoder_layers.encoder_layer4[0][0].out_channels == 64
+    assert isinstance(self.encoder_layers.encoder_layer4[0][0], SparseConv3d)
+    assert isinstance(self.encoder_layers.encoder_layer4[2][0], SubMConv3d)
+
+    # test decoder layers
+    assert isinstance(self.lateral_layer1, SparseBasicBlock)
+    assert isinstance(self.merge_layer1[0], SubMConv3d)
+    assert isinstance(self.upsample_layer1[0], SubMConv3d)
+    assert isinstance(self.upsample_layer2[0], SparseInverseConv3d)
+
+    voxel_features = torch.tensor(
+        [[6.56126, 0.9648336, -1.7339306, 0.315],
+         [6.8162713, -2.480431, -1.3616394, 0.36],
+         [11.643568, -4.744306, -1.3580885, 0.16],
+         [23.482342, 6.5036807, 0.5806964, 0.35]],
+        dtype=torch.float32).cuda()  # n, point_features
+    coordinates = torch.tensor(
+        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
+         [1, 35, 930, 469]],
+        dtype=torch.int32).cuda()  # n, 4(batch, ind_x, ind_y, ind_z)
+
+    unet_ret_dict = self.forward(voxel_features, coordinates, 2)
+    seg_features = unet_ret_dict['seg_features']
+    spatial_features = unet_ret_dict['spatial_features']
+
+    assert seg_features.shape == torch.Size([4, 16])
+    assert spatial_features.shape == torch.Size([2, 256, 200, 176])
diff --git a/mmde/tests/test_models/test_necks/test_dla_neck.py b/mmde/tests/test_models/test_necks/test_dla_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bde3e18700d5cf857f09764c36fc2dcad69dfba
--- /dev/null
+++ b/mmde/tests/test_models/test_necks/test_dla_neck.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_dla_neck():
+
+    s = 32
+    in_channels = [16, 32, 64, 128, 256, 512]
+    feat_sizes = [s // 2**i for i in range(6)]  # [32, 16, 8, 4, 2, 1]
+
+    if torch.cuda.is_available():
+        # Test DLA Neck with DCNv2 on GPU
+        neck_cfg = dict(
+            type='DLANeck',
+            in_channels=[16, 32, 64, 128, 256, 512],
+            start_level=2,
+            end_level=5,
+            norm_cfg=dict(type='GN', num_groups=32))
+        neck = MODELS.build(neck_cfg)
+        neck.init_weights()
+        neck.cuda()
+        feats = [
+            torch.rand(4, in_channels[i], feat_sizes[i], feat_sizes[i]).cuda()
+            for i in range(len(in_channels))
+        ]
+        outputs = neck(feats)
+        assert outputs[0].shape == (4, 64, 8, 8)
+    else:
+        # Test DLA Neck without DCNv2 on CPU
+        neck_cfg = dict(
+            type='DLANeck',
+            in_channels=[16, 32, 64, 128, 256, 512],
+            start_level=2,
+            end_level=5,
+            norm_cfg=dict(type='GN', num_groups=32),
+            use_dcn=False)
+        neck = MODELS.build(neck_cfg)
+        neck.init_weights()
+        feats = [
+            torch.rand(4, in_channels[i], feat_sizes[i], feat_sizes[i])
+            for i in range(len(in_channels))
+        ]
+        outputs = neck(feats)
+        assert outputs[0].shape == (4, 64, 8, 8)
diff --git a/mmde/tests/test_models/test_necks/test_imvoxel_neck.py b/mmde/tests/test_models/test_necks/test_imvoxel_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d3a071d78f1e300ad766c55f0da6e272875fb8c
--- /dev/null
+++ b/mmde/tests/test_models/test_necks/test_imvoxel_neck.py
@@ -0,0 +1,16 @@
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_imvoxel_neck():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+
+    neck_cfg = dict(
+        type='OutdoorImVoxelNeck', in_channels=64, out_channels=256)
+    neck = MODELS.build(neck_cfg).cuda()
+    inputs = torch.rand([1, 64, 216, 248, 12], device='cuda')
+    outputs = neck(inputs)
+    assert outputs[0].shape == (1, 256, 248, 216)
diff --git a/mmde/tests/test_models/test_necks/test_pointnet2_fp_neck.py b/mmde/tests/test_models/test_necks/test_pointnet2_fp_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf5df0b3e83c4b665c92bb3b801ab63c78b60748
--- /dev/null
+++ b/mmde/tests/test_models/test_necks/test_pointnet2_fp_neck.py
@@ -0,0 +1,37 @@
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_pointnet2_fp_neck():
+    if not torch.cuda.is_available():
+        pytest.skip()
+
+    xyzs = [16384, 4096, 1024, 256, 64]
+    feat_channels = [1, 96, 256, 512, 1024]
+    channel_num = 5
+
+    sa_xyz = [torch.rand(3, xyzs[i], 3) for i in range(channel_num)]
+    sa_features = [
+        torch.rand(3, feat_channels[i], xyzs[i]) for i in range(channel_num)
+    ]
+
+    neck_cfg = dict(
+        type='PointNetFPNeck',
+        fp_channels=((1536, 512, 512), (768, 512, 512), (608, 256, 256),
+                     (257, 128, 128)))
+
+    neck = MODELS.build(neck_cfg)
+    neck.init_weights()
+
+    if torch.cuda.is_available():
+        sa_xyz = [x.cuda() for x in sa_xyz]
+        sa_features = [x.cuda() for x in sa_features]
+        neck.cuda()
+
+    feats_sa = {'sa_xyz': sa_xyz, 'sa_features': sa_features}
+    outputs = neck(feats_sa)
+    assert outputs['fp_xyz'].cpu().numpy().shape == (3, 16384, 3)
+    assert outputs['fp_features'].detach().cpu().numpy().shape == (3, 128,
+                                                                   16384)
diff --git a/mmde/tests/test_models/test_necks/test_second_fpn.py b/mmde/tests/test_models/test_necks/test_second_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0edf7d7e36e0d3b8d9c16edac93d770d9e02bce
--- /dev/null
+++ b/mmde/tests/test_models/test_necks/test_second_fpn.py
@@ -0,0 +1,82 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_secfpn():
+    neck_cfg = dict(
+        type='SECONDFPN',
+        in_channels=[2, 3],
+        upsample_strides=[1, 2],
+        out_channels=[4, 6],
+    )
+    neck = MODELS.build(neck_cfg)
+    assert neck.deblocks[0][0].in_channels == 2
+    assert neck.deblocks[1][0].in_channels == 3
+    assert neck.deblocks[0][0].out_channels == 4
+    assert neck.deblocks[1][0].out_channels == 6
+    assert neck.deblocks[0][0].stride == (1, 1)
+    assert neck.deblocks[1][0].stride == (2, 2)
+    assert neck is not None
+
+    neck_cfg = dict(
+        type='SECONDFPN',
+        in_channels=[2, 2],
+        upsample_strides=[1, 2, 4],
+        out_channels=[2, 2],
+    )
+    with pytest.raises(AssertionError):
+        MODELS.build(neck_cfg)
+
+    neck_cfg = dict(
+        type='SECONDFPN',
+        in_channels=[2, 2, 4],
+        upsample_strides=[1, 2, 4],
+        out_channels=[2, 2],
+    )
+    with pytest.raises(AssertionError):
+        MODELS.build(neck_cfg)
+
+
+def test_centerpoint_fpn():
+
+    second_cfg = dict(
+        type='SECOND',
+        in_channels=2,
+        out_channels=[2, 2, 2],
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        conv_cfg=dict(type='Conv2d', bias=False))
+
+    second = MODELS.build(second_cfg)
+
+    # centerpoint usage of fpn
+    centerpoint_fpn_cfg = dict(
+        type='SECONDFPN',
+        in_channels=[2, 2, 2],
+        out_channels=[2, 2, 2],
+        upsample_strides=[0.5, 1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        upsample_cfg=dict(type='deconv', bias=False),
+        use_conv_for_no_stride=True)
+
+    # original usage of fpn
+    fpn_cfg = dict(
+        type='SECONDFPN',
+        in_channels=[2, 2, 2],
+        upsample_strides=[1, 2, 4],
+        out_channels=[2, 2, 2])
+
+    second_fpn = MODELS.build(fpn_cfg)
+
+    centerpoint_second_fpn = MODELS.build(centerpoint_fpn_cfg)
+
+    input = torch.rand([2, 2, 32, 32])
+    sec_output = second(input)
+    centerpoint_output = centerpoint_second_fpn(sec_output)
+    second_output = second_fpn(sec_output)
+    assert centerpoint_output[0].shape == torch.Size([2, 6, 8, 8])
+    assert second_output[0].shape == torch.Size([2, 6, 16, 16])
diff --git a/mmde/tests/test_models/test_segmentors/test_cylinder3d.py b/mmde/tests/test_models/test_segmentors/test_cylinder3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6e6a0b496ee6462b88bf19afb815320783ee9af
--- /dev/null
+++ b/mmde/tests/test_models/test_segmentors/test_cylinder3d.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestCylinder3D(unittest.TestCase):
+
+    def test_cylinder3d(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'Cylinder3D')
+        DefaultScope.get_instance('test_cylinder3d', scope_name='mmdet3d')
+        setup_seed(0)
+        cylinder3d_cfg = get_detector_cfg(
+            'cylinder3d/cylinder3d_4xb4-3x_semantickitti.py')
+        cylinder3d_cfg.decode_head['ignore_index'] = 1
+        model = MODELS.build(cylinder3d_cfg)
+        num_gt_instance = 3
+        packed_inputs = create_detector_inputs(
+            num_gt_instance=num_gt_instance,
+            num_classes=1,
+            with_pts_semantic_mask=True)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('pts_semantic_mask', results[0].pred_pts_seg)
+
+            losses = model.forward(**data, mode='loss')
+
+            self.assertGreater(losses['decode.loss_ce'], 0)
+            self.assertGreater(losses['decode.loss_lovasz'], 0)
diff --git a/mmde/tests/test_models/test_segmentors/test_minkunet.py b/mmde/tests/test_models/test_segmentors/test_minkunet.py
new file mode 100644
index 0000000000000000000000000000000000000000..16312c293e3cc1de1cbcfbbb7237144211f6e8b6
--- /dev/null
+++ b/mmde/tests/test_models/test_segmentors/test_minkunet.py
@@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import pytest
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import MODELS
+from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
+                             setup_seed)
+
+
+class TestMinkUNet(unittest.TestCase):
+
+    def test_minkunet(self):
+        try:
+            import torchsparse  # noqa
+        except ImportError:
+            pytest.skip('test requires Torchsparse installation')
+
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'MinkUNet')
+        DefaultScope.get_instance('test_minkunet', scope_name='mmdet3d')
+        setup_seed(0)
+        model_cfg = get_detector_cfg('_base_/models/minkunet.py')
+        model = MODELS.build(model_cfg)
+        num_gt_instance = 3
+        packed_inputs = create_detector_inputs(
+            num_gt_instance=num_gt_instance,
+            num_classes=19,
+            with_pts_semantic_mask=True)
+
+        if torch.cuda.is_available():
+            model = model.cuda()
+            # test simple_test
+            with torch.no_grad():
+                data = model.data_preprocessor(packed_inputs, True)
+                torch.cuda.empty_cache()
+                results = model.forward(**data, mode='predict')
+            self.assertEqual(len(results), 1)
+            self.assertIn('pts_semantic_mask', results[0].pred_pts_seg)
+
+            losses = model.forward(**data, mode='loss')
+
+            self.assertGreater(losses['loss_sem_seg'], 0)
diff --git a/mmde/tests/test_models/test_segmentors/test_seg3d_tta_model.py b/mmde/tests/test_models/test_segmentors/test_seg3d_tta_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..24bfa225f3930b4b4062e74f83f934fee3d7d983
--- /dev/null
+++ b/mmde/tests/test_models/test_segmentors/test_seg3d_tta_model.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import torch
+from mmengine import ConfigDict, DefaultScope
+
+from mmdet3d.models import Seg3DTTAModel
+from mmdet3d.registry import MODELS
+from mmdet3d.structures import Det3DDataSample
+from mmdet3d.testing import get_detector_cfg
+
+
+class TestSeg3DTTAModel(TestCase):
+
+    def test_seg3d_tta_model(self):
+        import mmdet3d.models
+
+        assert hasattr(mmdet3d.models, 'Cylinder3D')
+        DefaultScope.get_instance('test_cylinder3d', scope_name='mmdet3d')
+        segmentor3d_cfg = get_detector_cfg(
+            'cylinder3d/cylinder3d_4xb4-3x_semantickitti.py')
+        cfg = ConfigDict(type='Seg3DTTAModel', module=segmentor3d_cfg)
+
+        model: Seg3DTTAModel = MODELS.build(cfg)
+
+        points = []
+        data_samples = []
+        pcd_horizontal_flip_list = [False, False, True, True]
+        pcd_vertical_flip_list = [False, True, False, True]
+        for i in range(4):
+            points.append({'points': [torch.randn(200, 4)]})
+            data_samples.append([
+                Det3DDataSample(
+                    metainfo=dict(
+                        pcd_horizontal_flip=pcd_horizontal_flip_list[i],
+                        pcd_vertical_flip=pcd_vertical_flip_list[i]))
+            ])
+        if torch.cuda.is_available():
+            model.eval().cuda()
+            model.test_step(dict(inputs=points, data_samples=data_samples))
diff --git a/mmde/tests/test_models/test_task_modules/test_anchor/test_anchor_3d_generator.py b/mmde/tests/test_models/test_task_modules/test_anchor/test_anchor_3d_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..294aa93f13e49a0703e25b078c33bbcf834337b6
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_anchor/test_anchor_3d_generator.py
@@ -0,0 +1,263 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""
+CommandLine:
+    pytest tests/test_utils/test_anchor.py
+    xdoctest tests/test_utils/test_anchor.py zero
+
+"""
+import torch
+from mmengine import DefaultScope
+
+from mmdet3d.registry import TASK_UTILS
+
+
+def test_anchor_3d_range_generator():
+
+    import mmdet3d.models.task_modules
+
+    assert hasattr(mmdet3d.models.task_modules, 'Anchor3DRangeGenerator')
+    DefaultScope.get_instance(
+        'test_ancho3drange_generator', scope_name='mmdet3d')
+
+    if torch.cuda.is_available():
+        device = 'cuda'
+    else:
+        device = 'cpu'
+    anchor_generator_cfg = dict(
+        type='Anchor3DRangeGenerator',
+        ranges=[
+            [0, -39.68, -0.6, 70.4, 39.68, -0.6],
+            [0, -39.68, -0.6, 70.4, 39.68, -0.6],
+            [0, -39.68, -1.78, 70.4, 39.68, -1.78],
+        ],
+        sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],
+        rotations=[0, 1.57],
+        reshape_out=False)
+
+    anchor_generator = TASK_UTILS.build(anchor_generator_cfg)
+    repr_str = repr(anchor_generator)
+    expected_repr_str = 'Anchor3DRangeGenerator(anchor_range=' \
+                        '[[0, -39.68, -0.6, 70.4, 39.68, -0.6], ' \
+                        '[0, -39.68, -0.6, 70.4, 39.68, -0.6], ' \
+                        '[0, -39.68, -1.78, 70.4, 39.68, -1.78]],' \
+                        '\nscales=[1],\nsizes=[[0.8, 0.6, 1.73], ' \
+                        '[1.76, 0.6, 1.73], [3.9, 1.6, 1.56]],' \
+                        '\nrotations=[0, 1.57],\nreshape_out=False,' \
+                        '\nsize_per_range=True)'
+    assert repr_str == expected_repr_str
+    featmap_size = (8, 8)
+    mr_anchors = anchor_generator.single_level_grid_anchors(
+        featmap_size, 1.1, device=device)
+    assert mr_anchors.shape == torch.Size([1, 8, 8, 3, 2, 7])
+
+
+def test_aligned_anchor_generator():
+
+    import mmdet3d.models.task_modules
+
+    assert hasattr(mmdet3d.models.task_modules,
+                   'AlignedAnchor3DRangeGenerator')
+    DefaultScope.get_instance(
+        'test_aligned_ancho3drange_generator', scope_name='mmdet3d')
+
+    if torch.cuda.is_available():
+        device = 'cuda'
+    else:
+        device = 'cpu'
+
+    anchor_generator_cfg = dict(
+        type='AlignedAnchor3DRangeGenerator',
+        ranges=[[-51.2, -51.2, -1.80, 51.2, 51.2, -1.80]],
+        scales=[1, 2, 4],
+        sizes=[
+            [2.5981, 0.8660, 1.],  # 1.5/sqrt(3)
+            [1.7321, 0.5774, 1.],  # 1/sqrt(3)
+            [1., 1., 1.],
+            [0.4, 0.4, 1],
+        ],
+        custom_values=[0, 0],
+        rotations=[0, 1.57],
+        size_per_range=False,
+        reshape_out=True)
+
+    featmap_sizes = [(16, 16), (8, 8), (4, 4)]
+    anchor_generator = TASK_UTILS.build(anchor_generator_cfg)
+    assert anchor_generator.num_base_anchors == 8
+
+    # check base anchors
+    expected_grid_anchors = [
+        torch.tensor([[
+            -48.0000, -48.0000, -1.8000, 2.5981, 0.8660, 1.0000, 0.0000,
+            0.0000, 0.0000
+        ],
+                      [
+                          -48.0000, -48.0000, -1.8000, 0.4000, 0.4000, 1.0000,
+                          1.5700, 0.0000, 0.0000
+                      ],
+                      [
+                          -41.6000, -48.0000, -1.8000, 0.4000, 0.4000, 1.0000,
+                          0.0000, 0.0000, 0.0000
+                      ],
+                      [
+                          -35.2000, -48.0000, -1.8000, 1.0000, 1.0000, 1.0000,
+                          1.5700, 0.0000, 0.0000
+                      ],
+                      [
+                          -28.8000, -48.0000, -1.8000, 1.0000, 1.0000, 1.0000,
+                          0.0000, 0.0000, 0.0000
+                      ],
+                      [
+                          -22.4000, -48.0000, -1.8000, 1.7321, 0.5774, 1.0000,
+                          1.5700, 0.0000, 0.0000
+                      ],
+                      [
+                          -16.0000, -48.0000, -1.8000, 1.7321, 0.5774, 1.0000,
+                          0.0000, 0.0000, 0.0000
+                      ],
+                      [
+                          -9.6000, -48.0000, -1.8000, 2.5981, 0.8660, 1.0000,
+                          1.5700, 0.0000, 0.0000
+                      ]],
+                     device=device),
+        torch.tensor([[
+            -44.8000, -44.8000, -1.8000, 5.1962, 1.7320, 2.0000, 0.0000,
+            0.0000, 0.0000
+        ],
+                      [
+                          -44.8000, -44.8000, -1.8000, 0.8000, 0.8000, 2.0000,
+                          1.5700, 0.0000, 0.0000
+                      ],
+                      [
+                          -32.0000, -44.8000, -1.8000, 0.8000, 0.8000, 2.0000,
+                          0.0000, 0.0000, 0.0000
+                      ],
+                      [
+                          -19.2000, -44.8000, -1.8000, 2.0000, 2.0000, 2.0000,
+                          1.5700, 0.0000, 0.0000
+                      ],
+                      [
+                          -6.4000, -44.8000, -1.8000, 2.0000, 2.0000, 2.0000,
+                          0.0000, 0.0000, 0.0000
+                      ],
+                      [
+                          6.4000, -44.8000, -1.8000, 3.4642, 1.1548, 2.0000,
+                          1.5700, 0.0000, 0.0000
+                      ],
+                      [
+                          19.2000, -44.8000, -1.8000, 3.4642, 1.1548, 2.0000,
+                          0.0000, 0.0000, 0.0000
+                      ],
+                      [
+                          32.0000, -44.8000, -1.8000, 5.1962, 1.7320, 2.0000,
+                          1.5700, 0.0000, 0.0000
+                      ]],
+                     device=device),
+        torch.tensor([[
+            -38.4000, -38.4000, -1.8000, 10.3924, 3.4640, 4.0000, 0.0000,
+            0.0000, 0.0000
+        ],
+                      [
+                          -38.4000, -38.4000, -1.8000, 1.6000, 1.6000, 4.0000,
+                          1.5700, 0.0000, 0.0000
+                      ],
+                      [
+                          -12.8000, -38.4000, -1.8000, 1.6000, 1.6000, 4.0000,
+                          0.0000, 0.0000, 0.0000
+                      ],
+                      [
+                          12.8000, -38.4000, -1.8000, 4.0000, 4.0000, 4.0000,
+                          1.5700, 0.0000, 0.0000
+                      ],
+                      [
+                          38.4000, -38.4000, -1.8000, 4.0000, 4.0000, 4.0000,
+                          0.0000, 0.0000, 0.0000
+                      ],
+                      [
+                          -38.4000, -12.8000, -1.8000, 6.9284, 2.3096, 4.0000,
+                          1.5700, 0.0000, 0.0000
+                      ],
+                      [
+                          -12.8000, -12.8000, -1.8000, 6.9284, 2.3096, 4.0000,
+                          0.0000, 0.0000, 0.0000
+                      ],
+                      [
+                          12.8000, -12.8000, -1.8000, 10.3924, 3.4640, 4.0000,
+                          1.5700, 0.0000, 0.0000
+                      ]],
+                     device=device)
+    ]
+    multi_level_anchors = anchor_generator.grid_anchors(
+        featmap_sizes, device=device)
+    expected_multi_level_shapes = [
+        torch.Size([2048, 9]),
+        torch.Size([512, 9]),
+        torch.Size([128, 9])
+    ]
+    for i, single_level_anchor in enumerate(multi_level_anchors):
+        assert single_level_anchor.shape == expected_multi_level_shapes[i]
+        # set [:56:7] thus it could cover 8 (len(size) * len(rotations))
+        # anchors on 8 location
+        assert single_level_anchor[:56:7].allclose(expected_grid_anchors[i])
+
+
+def test_aligned_anchor_generator_per_cls():
+
+    import mmdet3d.models.task_modules
+
+    assert hasattr(mmdet3d.models.task_modules,
+                   'AlignedAnchor3DRangeGeneratorPerCls')
+    DefaultScope.get_instance(
+        'test_ancho3drange_generator_percls', scope_name='mmdet3d')
+
+    if torch.cuda.is_available():
+        device = 'cuda'
+    else:
+        device = 'cpu'
+
+    anchor_generator_cfg = dict(
+        type='AlignedAnchor3DRangeGeneratorPerCls',
+        ranges=[[-100, -100, -1.80, 100, 100, -1.80],
+                [-100, -100, -1.30, 100, 100, -1.30]],
+        sizes=[[1.76, 0.63, 1.44], [2.35, 0.96, 1.59]],
+        custom_values=[0, 0],
+        rotations=[0, 1.57],
+        reshape_out=False)
+
+    featmap_sizes = [(100, 100), (50, 50)]
+    anchor_generator = TASK_UTILS.build(anchor_generator_cfg)
+
+    # check base anchors
+    expected_grid_anchors = [[
+        torch.tensor([[
+            -99.0000, -99.0000, -1.8000, 1.7600, 0.6300, 1.4400, 0.0000,
+            0.0000, 0.0000
+        ],
+                      [
+                          -99.0000, -99.0000, -1.8000, 1.7600, 0.6300, 1.4400,
+                          1.5700, 0.0000, 0.0000
+                      ]],
+                     device=device),
+        torch.tensor([[
+            -98.0000, -98.0000, -1.3000, 2.3500, 0.9600, 1.5900, 0.0000,
+            0.0000, 0.0000
+        ],
+                      [
+                          -98.0000, -98.0000, -1.3000, 2.3500, 0.9600, 1.5900,
+                          1.5700, 0.0000, 0.0000
+                      ]],
+                     device=device)
+    ]]
+    multi_level_anchors = anchor_generator.grid_anchors(
+        featmap_sizes, device=device)
+    expected_multi_level_shapes = [[
+        torch.Size([20000, 9]), torch.Size([5000, 9])
+    ]]
+    for i, single_level_anchor in enumerate(multi_level_anchors):
+        assert len(single_level_anchor) == len(expected_multi_level_shapes[i])
+        # set [:2*interval:interval] thus it could cover
+        # 2 (len(size) * len(rotations)) anchors on 2 location
+        # Note that len(size) for each class is always 1 in this case
+        for j in range(len(single_level_anchor)):
+            interval = int(expected_multi_level_shapes[i][j][0] / 2)
+            assert single_level_anchor[j][:2 * interval:interval].allclose(
+                expected_grid_anchors[i][j])
diff --git a/mmde/tests/test_models/test_task_modules/test_coders/test_anchor_free_box_coder.py b/mmde/tests/test_models/test_task_modules/test_coders/test_anchor_free_box_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..30858fc9849e7f2a059a43965b2744bc4e173c54
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_coders/test_anchor_free_box_coder.py
@@ -0,0 +1,112 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures import LiDARInstance3DBoxes
+
+
+def test_anchor_free_box_coder():
+    box_coder_cfg = dict(
+        type='AnchorFreeBBoxCoder', num_dir_bins=12, with_rot=True)
+    box_coder = TASK_UTILS.build(box_coder_cfg)
+
+    # test encode
+    gt_bboxes = LiDARInstance3DBoxes([[
+        2.1227e+00, 5.7951e+00, -9.9900e-01, 1.6736e+00, 4.2419e+00,
+        1.5473e+00, -1.5501e+00
+    ],
+                                      [
+                                          1.1791e+01, 9.0276e+00, -8.5772e-01,
+                                          1.6210e+00, 3.5367e+00, 1.4841e+00,
+                                          -1.7369e+00
+                                      ],
+                                      [
+                                          2.3638e+01, 9.6997e+00, -5.6713e-01,
+                                          1.7578e+00, 4.6103e+00, 1.5999e+00,
+                                          -1.4556e+00
+                                      ]])
+    gt_labels = torch.tensor([0, 0, 0])
+
+    (center_targets, size_targets, dir_class_targets,
+     dir_res_targets) = box_coder.encode(gt_bboxes, gt_labels)
+
+    expected_center_target = torch.tensor([[2.1227, 5.7951, -0.2253],
+                                           [11.7908, 9.0276, -0.1156],
+                                           [23.6380, 9.6997, 0.2328]])
+    expected_size_targets = torch.tensor([[0.8368, 2.1210, 0.7736],
+                                          [0.8105, 1.7683, 0.7421],
+                                          [0.8789, 2.3052, 0.8000]])
+    expected_dir_class_target = torch.tensor([9, 9, 9])
+    expected_dir_res_target = torch.tensor([0.0394, -0.3172, 0.2199])
+    assert torch.allclose(center_targets, expected_center_target, atol=1e-4)
+    assert torch.allclose(size_targets, expected_size_targets, atol=1e-4)
+    assert torch.all(dir_class_targets == expected_dir_class_target)
+    assert torch.allclose(dir_res_targets, expected_dir_res_target, atol=1e-3)
+
+    # test decode
+    center = torch.tensor([[[14.5954, 6.3312, 0.7671],
+                            [67.5245, 22.4422, 1.5610],
+                            [47.7693, -6.7980, 1.4395]]])
+
+    size_res = torch.tensor([[[-1.0752, 1.8760, 0.7715],
+                              [-0.8016, 1.1754, 0.0102],
+                              [-1.2789, 0.5948, 0.4728]]])
+
+    dir_class = torch.tensor([[[
+        0.1512, 1.7914, -1.7658, 2.1572, -0.9215, 1.2139, 0.1749, 0.8606,
+        1.1743, -0.7679, -1.6005, 0.4623
+    ],
+                               [
+                                   -0.3957, 1.2026, -1.2677, 1.3863, -0.5754,
+                                   1.7083, 0.2601, 0.1129, 0.7146, -0.1367,
+                                   -1.2892, -0.0083
+                               ],
+                               [
+                                   -0.8862, 1.2050, -1.3881, 1.6604, -0.9087,
+                                   1.1907, -0.0280, 0.2027, 1.0644, -0.7205,
+                                   -1.0738, 0.4748
+                               ]]])
+
+    dir_res = torch.tensor([[[
+        1.1151, 0.5535, -0.2053, -0.6582, -0.1616, -0.1821, 0.4675, 0.6621,
+        0.8146, -0.0448, -0.7253, -0.7171
+    ],
+                             [
+                                 0.7888, 0.2478, -0.1962, -0.7267, 0.0573,
+                                 -0.2398, 0.6984, 0.5859, 0.7507, -0.1980,
+                                 -0.6538, -0.6602
+                             ],
+                             [
+                                 0.9039, 0.6109, 0.1960, -0.5016, 0.0551,
+                                 -0.4086, 0.3398, 0.2759, 0.7247, -0.0655,
+                                 -0.5052, -0.9026
+                             ]]])
+    bbox_out = dict(
+        center=center, size=size_res, dir_class=dir_class, dir_res=dir_res)
+
+    bbox3d = box_coder.decode(bbox_out)
+    expected_bbox3d = torch.tensor(
+        [[[14.5954, 6.3312, 0.7671, 0.1000, 3.7521, 1.5429, 0.9126],
+          [67.5245, 22.4422, 1.5610, 0.1000, 2.3508, 0.1000, 2.3782],
+          [47.7693, -6.7980, 1.4395, 0.1000, 1.1897, 0.9456, 1.0692]]])
+    assert torch.allclose(bbox3d, expected_bbox3d, atol=1e-4)
+
+    # test split_pred
+    cls_preds = torch.rand(2, 1, 256)
+    reg_preds = torch.rand(2, 30, 256)
+    base_xyz = torch.rand(2, 256, 3)
+    results = box_coder.split_pred(cls_preds, reg_preds, base_xyz)
+    obj_scores = results['obj_scores']
+    center = results['center']
+    center_offset = results['center_offset']
+    dir_class = results['dir_class']
+    dir_res_norm = results['dir_res_norm']
+    dir_res = results['dir_res']
+    size = results['size']
+    assert obj_scores.shape == torch.Size([2, 1, 256])
+    assert center.shape == torch.Size([2, 256, 3])
+    assert center_offset.shape == torch.Size([2, 256, 3])
+    assert dir_class.shape == torch.Size([2, 256, 12])
+    assert dir_res_norm.shape == torch.Size([2, 256, 12])
+    assert dir_res.shape == torch.Size([2, 256, 12])
+    assert size.shape == torch.Size([2, 256, 3])
diff --git a/mmde/tests/test_models/test_task_modules/test_coders/test_centerpoint_bbox_coder.py b/mmde/tests/test_models/test_task_modules/test_coders/test_centerpoint_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1764fd8eb0e329c96b43bf6d42f440a4cc5e021
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_coders/test_centerpoint_bbox_coder.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet3d.registry import TASK_UTILS
+
+
+def test_centerpoint_bbox_coder():
+    bbox_coder_cfg = dict(
+        type='CenterPointBBoxCoder',
+        post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        max_num=500,
+        score_threshold=0.1,
+        pc_range=[-51.2, -51.2],
+        out_size_factor=4,
+        voxel_size=[0.2, 0.2])
+
+    bbox_coder = TASK_UTILS.build(bbox_coder_cfg)
+
+    batch_dim = torch.rand([2, 3, 128, 128])
+    batch_hei = torch.rand([2, 1, 128, 128])
+    batch_hm = torch.rand([2, 2, 128, 128])
+    batch_reg = torch.rand([2, 2, 128, 128])
+    batch_rotc = torch.rand([2, 1, 128, 128])
+    batch_rots = torch.rand([2, 1, 128, 128])
+    batch_vel = torch.rand([2, 2, 128, 128])
+
+    temp = bbox_coder.decode(batch_hm, batch_rots, batch_rotc, batch_hei,
+                             batch_dim, batch_vel, batch_reg, 5)
+    for i in range(len(temp)):
+        assert temp[i]['bboxes'].shape == torch.Size([500, 9])
+        assert temp[i]['scores'].shape == torch.Size([500])
+        assert temp[i]['labels'].shape == torch.Size([500])
diff --git a/mmde/tests/test_models/test_task_modules/test_coders/test_fcos3d_bbox_coder.py b/mmde/tests/test_models/test_task_modules/test_coders/test_fcos3d_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1f53f31af0cbae858944da4514f6785abe25f80
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_coders/test_fcos3d_bbox_coder.py
@@ -0,0 +1,82 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.cnn import Scale
+from torch import nn as nn
+
+from mmdet3d.registry import TASK_UTILS
+
+
+def test_fcos3d_bbox_coder():
+    # test a config without priors
+    bbox_coder_cfg = dict(
+        type='FCOS3DBBoxCoder',
+        base_depths=None,
+        base_dims=None,
+        code_size=7,
+        norm_on_bbox=True)
+    bbox_coder = TASK_UTILS.build(bbox_coder_cfg)
+
+    # test decode
+    # [2, 7, 1, 1]
+    batch_bbox = torch.tensor([[[[0.3130]], [[0.7094]], [[0.8743]], [[0.0570]],
+                                [[0.5579]], [[0.1593]], [[0.4553]]],
+                               [[[0.7758]], [[0.2298]], [[0.3925]], [[0.6307]],
+                                [[0.4377]], [[0.3339]], [[0.1966]]]])
+    batch_scale = nn.ModuleList([Scale(1.0) for _ in range(3)])
+    stride = 2
+    training = False
+    cls_score = torch.randn([2, 2, 1, 1]).sigmoid()
+    decode_bbox = bbox_coder.decode(batch_bbox, batch_scale, stride, training,
+                                    cls_score)
+
+    expected_bbox = torch.tensor([[[[0.6261]], [[1.4188]], [[2.3971]],
+                                   [[1.0586]], [[1.7470]], [[1.1727]],
+                                   [[0.4553]]],
+                                  [[[1.5516]], [[0.4596]], [[1.4806]],
+                                   [[1.8790]], [[1.5492]], [[1.3965]],
+                                   [[0.1966]]]])
+    assert torch.allclose(decode_bbox, expected_bbox, atol=1e-3)
+
+    # test a config with priors
+    prior_bbox_coder_cfg = dict(
+        type='FCOS3DBBoxCoder',
+        base_depths=((28., 13.), (25., 12.)),
+        base_dims=((2., 3., 1.), (1., 2., 3.)),
+        code_size=7,
+        norm_on_bbox=True)
+    prior_bbox_coder = TASK_UTILS.build(prior_bbox_coder_cfg)
+
+    # test decode
+    batch_bbox = torch.tensor([[[[0.3130]], [[0.7094]], [[0.8743]], [[0.0570]],
+                                [[0.5579]], [[0.1593]], [[0.4553]]],
+                               [[[0.7758]], [[0.2298]], [[0.3925]], [[0.6307]],
+                                [[0.4377]], [[0.3339]], [[0.1966]]]])
+    batch_scale = nn.ModuleList([Scale(1.0) for _ in range(3)])
+    stride = 2
+    training = False
+    cls_score = torch.tensor([[[[0.5811]], [[0.6198]]], [[[0.4889]],
+                                                         [[0.8142]]]])
+    decode_bbox = prior_bbox_coder.decode(batch_bbox, batch_scale, stride,
+                                          training, cls_score)
+    expected_bbox = torch.tensor([[[[0.6260]], [[1.4188]], [[35.4916]],
+                                   [[1.0587]], [[3.4940]], [[3.5181]],
+                                   [[0.4553]]],
+                                  [[[1.5516]], [[0.4596]], [[29.7100]],
+                                   [[1.8789]], [[3.0983]], [[4.1892]],
+                                   [[0.1966]]]])
+    assert torch.allclose(decode_bbox, expected_bbox, atol=1e-3)
+
+    # test decode_yaw
+    decode_bbox = decode_bbox.permute(0, 2, 3, 1).view(-1, 7)
+    batch_centers2d = torch.tensor([[100., 150.], [200., 100.]])
+    batch_dir_cls = torch.tensor([0., 1.])
+    dir_offset = 0.7854
+    cam2img = torch.tensor([[700., 0., 450., 0.], [0., 700., 200., 0.],
+                            [0., 0., 1., 0.], [0., 0., 0., 1.]])
+    decode_bbox = prior_bbox_coder.decode_yaw(decode_bbox, batch_centers2d,
+                                              batch_dir_cls, dir_offset,
+                                              cam2img)
+    expected_bbox = torch.tensor(
+        [[0.6260, 1.4188, 35.4916, 1.0587, 3.4940, 3.5181, 3.1332],
+         [1.5516, 0.4596, 29.7100, 1.8789, 3.0983, 4.1892, 6.1368]])
+    assert torch.allclose(decode_bbox, expected_bbox, atol=1e-3)
diff --git a/mmde/tests/test_models/test_task_modules/test_coders/test_monoflex_bbox_coder.py b/mmde/tests/test_models/test_task_modules/test_coders/test_monoflex_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..d379ef42f410b633eb90f2372bee9d7943f988b9
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_coders/test_monoflex_bbox_coder.py
@@ -0,0 +1,72 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures import CameraInstance3DBoxes
+
+
+def test_monoflex_bbox_coder():
+    bbox_coder_cfg = dict(
+        type='MonoFlexCoder',
+        depth_mode='exp',
+        base_depth=(26.494627, 16.05988),
+        depth_range=[0.1, 100],
+        combine_depth=True,
+        uncertainty_range=[-10, 10],
+        base_dims=((3.8840, 1.5261, 1.6286, 0.4259, 0.1367,
+                    0.1022), (0.8423, 1.7607, 0.6602, 0.2349, 0.1133, 0.1427),
+                   (1.7635, 1.7372, 0.5968, 0.1766, 0.0948, 0.1242)),
+        dims_mode='linear',
+        multibin=True,
+        num_dir_bins=4,
+        bin_centers=[0, np.pi / 2, np.pi, -np.pi / 2],
+        bin_margin=np.pi / 6,
+        code_size=7)
+    bbox_coder = TASK_UTILS.build(bbox_coder_cfg)
+    gt_bboxes_3d = CameraInstance3DBoxes(torch.rand([6, 7]))
+    orientation_target = bbox_coder.encode(gt_bboxes_3d)
+    assert orientation_target.shape == torch.Size([6, 8])
+
+    regression = torch.rand([100, 50])
+    base_centers2d = torch.rand([100, 2])
+    labels = torch.ones([100])
+    downsample_ratio = 4
+    cam2imgs = torch.rand([100, 4, 4])
+
+    preds = bbox_coder.decode(regression, base_centers2d, labels,
+                              downsample_ratio, cam2imgs)
+
+    assert preds['bboxes2d'].shape == torch.Size([100, 4])
+    assert preds['dimensions'].shape == torch.Size([100, 3])
+    assert preds['offsets2d'].shape == torch.Size([100, 2])
+    assert preds['keypoints2d'].shape == torch.Size([100, 10, 2])
+    assert preds['orientations'].shape == torch.Size([100, 16])
+    assert preds['direct_depth'].shape == torch.Size([
+        100,
+    ])
+    assert preds['keypoints_depth'].shape == torch.Size([100, 3])
+    assert preds['combined_depth'].shape == torch.Size([
+        100,
+    ])
+    assert preds['direct_depth_uncertainty'].shape == torch.Size([
+        100,
+    ])
+    assert preds['keypoints_depth_uncertainty'].shape == torch.Size([100, 3])
+
+    offsets_2d = torch.randn([100, 2])
+    depths = torch.randn([
+        100,
+    ])
+    locations = bbox_coder.decode_location(base_centers2d, offsets_2d, depths,
+                                           cam2imgs, downsample_ratio)
+    assert locations.shape == torch.Size([100, 3])
+
+    orientations = torch.randn([100, 16])
+    yaws, local_yaws = bbox_coder.decode_orientation(orientations, locations)
+    assert yaws.shape == torch.Size([
+        100,
+    ])
+    assert local_yaws.shape == torch.Size([
+        100,
+    ])
diff --git a/mmde/tests/test_models/test_task_modules/test_coders/test_partial_bin_based_box_coder.py b/mmde/tests/test_models/test_task_modules/test_coders/test_partial_bin_based_box_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad62e623adde716d5e4d82ddb09ad2dbdd9137b4
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_coders/test_partial_bin_based_box_coder.py
@@ -0,0 +1,219 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures import DepthInstance3DBoxes
+
+
+def test_partial_bin_based_box_coder():
+    box_coder_cfg = dict(
+        type='PartialBinBasedBBoxCoder',
+        num_sizes=10,
+        num_dir_bins=12,
+        with_rot=True,
+        mean_sizes=[[2.114256, 1.620300, 0.927272],
+                    [0.791118, 1.279516, 0.718182],
+                    [0.923508, 1.867419, 0.845495],
+                    [0.591958, 0.552978, 0.827272],
+                    [0.699104, 0.454178, 0.75625],
+                    [0.69519, 1.346299, 0.736364],
+                    [0.528526, 1.002642, 1.172878],
+                    [0.500618, 0.632163, 0.683424],
+                    [0.404671, 1.071108, 1.688889],
+                    [0.76584, 1.398258, 0.472728]])
+    box_coder = TASK_UTILS.build(box_coder_cfg)
+
+    # test eocode
+    gt_bboxes = DepthInstance3DBoxes(
+        [[0.8308, 4.1168, -1.2035, 2.2493, 1.8444, 1.9245, 1.6486],
+         [2.3002, 4.8149, -1.2442, 0.5718, 0.8629, 0.9510, 1.6030],
+         [-1.1477, 1.8090, -1.1725, 0.6965, 1.5273, 2.0563, 0.0552]])
+
+    gt_labels = torch.tensor([0, 1, 2])
+    center_target, size_class_target, size_res_target, dir_class_target, \
+        dir_res_target = box_coder.encode(gt_bboxes, gt_labels)
+    expected_center_target = torch.tensor([[0.8308, 4.1168, -0.2413],
+                                           [2.3002, 4.8149, -0.7687],
+                                           [-1.1477, 1.8090, -0.1444]])
+    expected_size_class_target = torch.tensor([0, 1, 2])
+    expected_size_res_target = torch.tensor([[0.1350, 0.2241, 0.9972],
+                                             [-0.2193, -0.4166, 0.2328],
+                                             [-0.2270, -0.3401, 1.2108]])
+    expected_dir_class_target = torch.tensor([3, 3, 0])
+    expected_dir_res_target = torch.tensor([0.0778, 0.0322, 0.0552])
+    assert torch.allclose(center_target, expected_center_target, atol=1e-4)
+    assert torch.all(size_class_target == expected_size_class_target)
+    assert torch.allclose(size_res_target, expected_size_res_target, atol=1e-4)
+    assert torch.all(dir_class_target == expected_dir_class_target)
+    assert torch.allclose(dir_res_target, expected_dir_res_target, atol=1e-4)
+
+    # test decode
+    center = torch.tensor([[[0.8014, 3.4134,
+                             -0.6133], [2.6375, 8.4191, 2.0438],
+                            [4.2017, 5.2504,
+                             -0.7851], [-1.0088, 5.4107, 1.6293],
+                            [1.4837, 4.0268, 0.6222]]])
+
+    size_class = torch.tensor([[[
+        -1.0061, -2.2788, 1.1322, -4.4380, -11.0526, -2.8113, -2.0642, -7.5886,
+        -4.8627, -5.0437
+    ],
+                                [
+                                    -2.2058, -0.3527, -1.9976, 0.8815, -2.7980,
+                                    -1.9053, -0.5097, -2.0232, -1.4242, -4.1192
+                                ],
+                                [
+                                    -1.4783, -0.1009, -1.1537, 0.3052, -4.3147,
+                                    -2.6529, 0.2729, -0.3755, -2.6479, -3.7548
+                                ],
+                                [
+                                    -6.1809, -3.5024, -8.3273, 1.1252, -4.3315,
+                                    -7.8288, -4.6091, -5.8153, 0.7480, -10.1396
+                                ],
+                                [
+                                    -9.0424, -3.7883, -6.0788, -1.8855,
+                                    -10.2493, -9.7164, -1.0658, -4.1713,
+                                    1.1173, -10.6204
+                                ]]])
+
+    size_res = torch.tensor([[[[-9.8976e-02, -5.2152e-01, -7.6421e-02],
+                               [1.4593e-01, 5.6099e-01, 8.9421e-02],
+                               [5.1481e-02, 3.9280e-01, 1.2705e-01],
+                               [3.6869e-01, 7.0558e-01, 1.4647e-01],
+                               [4.7683e-01, 3.3644e-01, 2.3481e-01],
+                               [8.7346e-02, 8.4987e-01, 3.3265e-01],
+                               [2.1393e-01, 8.5585e-01, 9.8948e-02],
+                               [7.8530e-02, 5.9694e-02, -8.7211e-02],
+                               [1.8551e-01, 1.1308e+00, -5.1864e-01],
+                               [3.6485e-01, 7.3757e-01, 1.5264e-01]],
+                              [[-9.5593e-01, -5.0455e-01, 1.9554e-01],
+                               [-1.0870e-01, 1.8025e-01, 1.0228e-01],
+                               [-8.2882e-02, -4.3771e-01, 9.2135e-02],
+                               [-4.0840e-02, -5.9841e-02, 1.1982e-01],
+                               [7.3448e-02, 5.2045e-02, 1.7301e-01],
+                               [-4.0440e-02, 4.9532e-02, 1.1266e-01],
+                               [3.5857e-02, 1.3564e-02, 1.0212e-01],
+                               [-1.0407e-01, -5.9321e-02, 9.2622e-02],
+                               [7.4691e-03, 9.3080e-02, -4.4077e-01],
+                               [-6.0121e-02, -1.3381e-01, -6.8083e-02]],
+                              [[-9.3970e-01, -9.7823e-01, -5.1075e-02],
+                               [-1.2843e-01, -1.8381e-01, 7.1327e-02],
+                               [-1.2247e-01, -8.1115e-01, 3.6495e-02],
+                               [4.9154e-02, -4.5440e-02, 8.9520e-02],
+                               [1.5653e-01, 3.5990e-02, 1.6414e-01],
+                               [-5.9621e-02, 4.9357e-03, 1.4264e-01],
+                               [8.5235e-04, -1.0030e-01, -3.0712e-02],
+                               [-3.7255e-02, 2.8996e-02, 5.5545e-02],
+                               [3.9298e-02, -4.7420e-02, -4.9147e-01],
+                               [-1.1548e-01, -1.5895e-01, -3.9155e-02]],
+                              [[-1.8725e+00, -7.4102e-01, 1.0524e+00],
+                               [-3.3210e-01, 4.7828e-02, -3.2666e-02],
+                               [-2.7949e-01, 5.5541e-02, -1.0059e-01],
+                               [-8.5533e-02, 1.4870e-01, -1.6709e-01],
+                               [3.8283e-01, 2.6609e-01, 2.1361e-01],
+                               [-4.2156e-01, 3.2455e-01, 6.7309e-01],
+                               [-2.4336e-02, -8.3366e-02, 3.9913e-01],
+                               [8.2142e-03, 4.8323e-02, -1.5247e-01],
+                               [-4.8142e-02, -3.0074e-01, -1.6829e-01],
+                               [1.3274e-01, -2.3825e-01, -1.8127e-01]],
+                              [[-1.2576e+00, -6.1550e-01, 7.9430e-01],
+                               [-4.7222e-01, 1.5634e+00, -5.9460e-02],
+                               [-3.5367e-01, 1.3616e+00, -1.6421e-01],
+                               [-1.6611e-02, 2.4231e-01, -9.6188e-02],
+                               [5.4486e-01, 4.6833e-01, 5.1151e-01],
+                               [-6.1755e-01, 1.0292e+00, 1.2458e+00],
+                               [-6.8152e-02, 2.4786e-01, 9.5088e-01],
+                               [-4.8745e-02, 1.5134e-01, -9.9962e-02],
+                               [2.4485e-03, -7.5991e-02, 1.3545e-01],
+                               [4.1608e-01, -1.2093e-01, -3.1643e-01]]]])
+
+    dir_class = torch.tensor([[[
+        -1.0230, -5.1965, -5.2195, 2.4030, -2.7661, -7.3399, -1.1640, -4.0630,
+        -5.2940, 0.8245, -3.1869, -6.1743
+    ],
+                               [
+                                   -1.9503, -1.6940, -0.8716, -1.1494, -0.8196,
+                                   0.2862, -0.2921, -0.7894, -0.2481, -0.9916,
+                                   -1.4304, -1.2466
+                               ],
+                               [
+                                   -1.7435, -1.2043, -0.1265, 0.5083, -0.0717,
+                                   -0.9560, -1.6171, -2.6463, -2.3863, -2.1358,
+                                   -1.8812, -2.3117
+                               ],
+                               [
+                                   -1.9282, 0.3792, -1.8426, -1.4587, -0.8582,
+                                   -3.4639, -3.2133, -3.7867, -7.6781, -6.4459,
+                                   -6.2455, -5.4797
+                               ],
+                               [
+                                   -3.1869, 0.4456, -0.5824, 0.9994, -1.0554,
+                                   -8.4232, -7.7019, -7.1382, -10.2724,
+                                   -7.8229, -8.1860, -8.6194
+                               ]]])
+
+    dir_res = torch.tensor(
+        [[[
+            1.1022e-01, -2.3750e-01, 2.0381e-01, 1.2177e-01, -2.8501e-01,
+            1.5351e-01, 1.2218e-01, -2.0677e-01, 1.4468e-01, 1.1593e-01,
+            -2.6864e-01, 1.1290e-01
+        ],
+          [
+              -1.5788e-02, 4.1538e-02, -2.2857e-04, -1.4011e-02, 4.2560e-02,
+              -3.1186e-03, -5.0343e-02, 6.8110e-03, -2.6728e-02, -3.2781e-02,
+              3.6889e-02, -1.5609e-03
+          ],
+          [
+              1.9004e-02, 5.7105e-03, 6.0329e-02, 1.3074e-02, -2.5546e-02,
+              -1.1456e-02, -3.2484e-02, -3.3487e-02, 1.6609e-03, 1.7095e-02,
+              1.2647e-05, 2.4814e-02
+          ],
+          [
+              1.4482e-01, -6.3083e-02, 5.8307e-02, 9.1396e-02, -8.4571e-02,
+              4.5890e-02, 5.6243e-02, -1.2448e-01, -9.5244e-02, 4.5746e-02,
+              -1.7390e-02, 9.0267e-02
+          ],
+          [
+              1.8065e-01, -2.0078e-02, 8.5401e-02, 1.0784e-01, -1.2495e-01,
+              2.2796e-02, 1.1310e-01, -8.4364e-02, -1.1904e-01, 6.1180e-02,
+              -1.8109e-02, 1.1229e-01
+          ]]])
+    bbox_out = dict(
+        center=center,
+        size_class=size_class,
+        size_res=size_res,
+        dir_class=dir_class,
+        dir_res=dir_res)
+
+    bbox3d = box_coder.decode(bbox_out)
+    expected_bbox3d = torch.tensor(
+        [[[0.8014, 3.4134, -0.6133, 0.9750, 2.2602, 0.9725, 1.6926],
+          [2.6375, 8.4191, 2.0438, 0.5511, 0.4931, 0.9471, 2.6149],
+          [4.2017, 5.2504, -0.7851, 0.6411, 0.5075, 0.9168, 1.5839],
+          [-1.0088, 5.4107, 1.6293, 0.5064, 0.7017, 0.6602, 0.4605],
+          [1.4837, 4.0268, 0.6222, 0.4071, 0.9951, 1.8243, 1.6786]]])
+    assert torch.allclose(bbox3d, expected_bbox3d, atol=1e-4)
+
+    # test split_pred
+    cls_preds = torch.rand(2, 12, 256)
+    reg_preds = torch.rand(2, 67, 256)
+    base_xyz = torch.rand(2, 256, 3)
+    results = box_coder.split_pred(cls_preds, reg_preds, base_xyz)
+    obj_scores = results['obj_scores']
+    center = results['center']
+    dir_class = results['dir_class']
+    dir_res_norm = results['dir_res_norm']
+    dir_res = results['dir_res']
+    size_class = results['size_class']
+    size_res_norm = results['size_res_norm']
+    size_res = results['size_res']
+    sem_scores = results['sem_scores']
+    assert obj_scores.shape == torch.Size([2, 256, 2])
+    assert center.shape == torch.Size([2, 256, 3])
+    assert dir_class.shape == torch.Size([2, 256, 12])
+    assert dir_res_norm.shape == torch.Size([2, 256, 12])
+    assert dir_res.shape == torch.Size([2, 256, 12])
+    assert size_class.shape == torch.Size([2, 256, 10])
+    assert size_res_norm.shape == torch.Size([2, 256, 10, 3])
+    assert size_res.shape == torch.Size([2, 256, 10, 3])
+    assert sem_scores.shape == torch.Size([2, 256, 10])
diff --git a/mmde/tests/test_models/test_task_modules/test_coders/test_pgd_bbox_coder.py b/mmde/tests/test_models/test_task_modules/test_coders/test_pgd_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a3a998920c89e6359ec2b52c43212418bbc7364
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_coders/test_pgd_bbox_coder.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.cnn import Scale
+from torch import nn as nn
+
+from mmdet3d.registry import TASK_UTILS
+
+
+def test_pgd_bbox_coder():
+    # test a config without priors
+    bbox_coder_cfg = dict(
+        type='PGDBBoxCoder',
+        base_depths=None,
+        base_dims=None,
+        code_size=7,
+        norm_on_bbox=True)
+    bbox_coder = TASK_UTILS.build(bbox_coder_cfg)
+
+    # test decode_2d
+    # [2, 27, 1, 1]
+    batch_bbox = torch.tensor([[[[0.0103]], [[0.7394]], [[0.3296]], [[0.4708]],
+                                [[0.1439]], [[0.0778]], [[0.9399]], [[0.8366]],
+                                [[0.1264]], [[0.3030]], [[0.1898]], [[0.0714]],
+                                [[0.4144]], [[0.4341]], [[0.6442]], [[0.2951]],
+                                [[0.2890]], [[0.4486]], [[0.2848]], [[0.1071]],
+                                [[0.9530]], [[0.9460]], [[0.3822]], [[0.9320]],
+                                [[0.2611]], [[0.5580]], [[0.0397]]],
+                               [[[0.8612]], [[0.1680]], [[0.5167]], [[0.8502]],
+                                [[0.0377]], [[0.3615]], [[0.9550]], [[0.5219]],
+                                [[0.1402]], [[0.6843]], [[0.2121]], [[0.9468]],
+                                [[0.6238]], [[0.7918]], [[0.1646]], [[0.0500]],
+                                [[0.6290]], [[0.3956]], [[0.2901]], [[0.4612]],
+                                [[0.7333]], [[0.1194]], [[0.6999]], [[0.3980]],
+                                [[0.3262]], [[0.7185]], [[0.4474]]]])
+    batch_scale = nn.ModuleList([Scale(1.0) for _ in range(5)])
+    stride = 2
+    training = False
+    cls_score = torch.randn([2, 2, 1, 1]).sigmoid()
+    decode_bbox = bbox_coder.decode(batch_bbox, batch_scale, stride, training,
+                                    cls_score)
+    max_regress_range = 16
+    pred_keypoints = True
+    pred_bbox2d = True
+    decode_bbox_w2d = bbox_coder.decode_2d(decode_bbox, batch_scale, stride,
+                                           max_regress_range, training,
+                                           pred_keypoints, pred_bbox2d)
+    expected_decode_bbox_w2d = torch.tensor(
+        [[[[0.0206]], [[1.4788]],
+          [[1.3904]], [[1.6013]], [[1.1548]], [[1.0809]], [[0.9399]],
+          [[10.9441]], [[2.0117]], [[4.7049]], [[3.0009]], [[1.1405]],
+          [[6.2752]], [[6.5399]], [[9.0840]], [[4.5892]], [[4.4994]],
+          [[6.7320]], [[4.4375]], [[1.7071]], [[11.8582]], [[11.8075]],
+          [[5.8339]], [[1.8640]], [[0.5222]], [[1.1160]], [[0.0794]]],
+         [[[1.7224]], [[0.3360]], [[1.6765]], [[2.3401]], [[1.0384]],
+          [[1.4355]], [[0.9550]], [[7.6666]], [[2.2286]], [[9.5089]],
+          [[3.3436]], [[11.8133]], [[8.8603]], [[10.5508]], [[2.6101]],
+          [[0.7993]], [[8.9178]], [[6.0188]], [[4.5156]], [[6.8970]],
+          [[10.0013]], [[1.9014]], [[9.6689]], [[0.7960]], [[0.6524]],
+          [[1.4370]], [[0.8948]]]])
+    assert torch.allclose(expected_decode_bbox_w2d, decode_bbox_w2d, atol=1e-3)
+
+    # test decode_prob_depth
+    # [10, 8]
+    depth_cls_preds = torch.tensor([
+        [-0.4383, 0.7207, -0.4092, 0.4649, 0.8526, 0.6186, -1.4312, -0.7150],
+        [0.0621, 0.2369, 0.5170, 0.8484, -0.1099, 0.1829, -0.0072, 1.0618],
+        [-1.6114, -0.1057, 0.5721, -0.5986, -2.0471, 0.8140, -0.8385, -0.4822],
+        [0.0742, -0.3261, 0.4607, 1.8155, -0.3571, -0.0234, 0.3787, 2.3251],
+        [1.0492, -0.6881, -0.0136, -1.8291, 0.8460, -1.0171, 2.5691, -0.8114],
+        [0.0968, -0.5601, 1.0458, 0.2560, 1.3018, 0.1635, 0.0680, -1.0263],
+        [-0.0765, 0.1498, -2.7321, 1.0047, -0.2505, 0.0871, -0.4820, -0.3003],
+        [-0.4123, 0.2298, -0.1330, -0.6008, 0.6526, 0.7118, 0.9728, -0.7793],
+        [1.6940, 0.3355, 1.4661, 0.5477, 0.8667, 0.0527, -0.9975, -0.0689],
+        [0.4724, -0.3632, -0.0654, 0.4034, -0.3494, -0.7548, 0.7297, 1.2754]
+    ])
+    depth_range = (0, 70)
+    depth_unit = 10
+    num_depth_cls = 8
+    uniform_prob_depth_preds = bbox_coder.decode_prob_depth(
+        depth_cls_preds, depth_range, depth_unit, 'uniform', num_depth_cls)
+    expected_preds = torch.tensor([
+        32.0441, 38.4689, 36.1831, 48.2096, 46.1560, 32.7973, 33.2155, 39.9822,
+        21.9905, 43.0161
+    ])
+    assert torch.allclose(uniform_prob_depth_preds, expected_preds, atol=1e-3)
+
+    linear_prob_depth_preds = bbox_coder.decode_prob_depth(
+        depth_cls_preds, depth_range, depth_unit, 'linear', num_depth_cls)
+    expected_preds = torch.tensor([
+        21.1431, 30.2421, 25.8964, 41.6116, 38.6234, 21.4582, 23.2993, 30.1111,
+        13.9273, 36.8419
+    ])
+    assert torch.allclose(linear_prob_depth_preds, expected_preds, atol=1e-3)
+
+    log_prob_depth_preds = bbox_coder.decode_prob_depth(
+        depth_cls_preds, depth_range, depth_unit, 'log', num_depth_cls)
+    expected_preds = torch.tensor([
+        12.6458, 24.2487, 17.4015, 36.9375, 27.5982, 12.5510, 15.6635, 19.8408,
+        9.1605, 31.3765
+    ])
+    assert torch.allclose(log_prob_depth_preds, expected_preds, atol=1e-3)
+
+    loguniform_prob_depth_preds = bbox_coder.decode_prob_depth(
+        depth_cls_preds, depth_range, depth_unit, 'loguniform', num_depth_cls)
+    expected_preds = torch.tensor([
+        6.9925, 10.3273, 8.9895, 18.6524, 16.4667, 7.3196, 7.5078, 11.3207,
+        3.7987, 13.6095
+    ])
+    assert torch.allclose(
+        loguniform_prob_depth_preds, expected_preds, atol=1e-3)
diff --git a/mmde/tests/test_models/test_task_modules/test_coders/test_point_xyzwhlr_bbox_coder.py b/mmde/tests/test_models/test_task_modules/test_coders/test_point_xyzwhlr_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e29f7401d59db34222a8a9216b1d238f54b442e9
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_coders/test_point_xyzwhlr_bbox_coder.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+from mmdet3d.registry import TASK_UTILS
+
+
+def test_point_xyzwhlr_bbox_coder():
+    bbox_coder_cfg = dict(
+        type='PointXYZWHLRBBoxCoder',
+        use_mean_size=True,
+        mean_size=[[3.9, 1.6, 1.56], [0.8, 0.6, 1.73], [1.76, 0.6, 1.73]])
+    boxcoder = TASK_UTILS.build(bbox_coder_cfg)
+
+    # test encode
+    gt_bboxes_3d = torch.tensor(
+        [[13.3329, 2.3514, -0.7004, 1.7508, 0.4702, 1.7909, -3.0522],
+         [2.2068, -2.6994, -0.3277, 3.8703, 1.6602, 1.6913, -1.9057],
+         [5.5269, 2.5085, -1.0129, 1.1496, 0.8006, 1.8887, 2.1756]])
+
+    points = torch.tensor([[13.70, 2.40, 0.12], [3.20, -3.00, 0.2],
+                           [5.70, 2.20, -0.4]])
+
+    gt_labels_3d = torch.tensor([2, 0, 1])
+
+    bbox_target = boxcoder.encode(gt_bboxes_3d, points, gt_labels_3d)
+    expected_bbox_target = torch.tensor([[
+        -0.1974, -0.0261, -0.4742, -0.0052, -0.2438, 0.0346, -0.9960, -0.0893
+    ], [-0.2356, 0.0713, -0.3383, -0.0076, 0.0369, 0.0808, -0.3287, -0.9444
+        ], [-0.1731, 0.3085, -0.3543, 0.3626, 0.2884, 0.0878, -0.5686,
+            0.8226]])
+    assert torch.allclose(expected_bbox_target, bbox_target, atol=1e-4)
+    # test decode
+    bbox3d_out = boxcoder.decode(bbox_target, points, gt_labels_3d)
+    assert torch.allclose(bbox3d_out, gt_bboxes_3d, atol=1e-4)
diff --git a/mmde/tests/test_models/test_task_modules/test_coders/test_smoke_bbox_coder.py b/mmde/tests/test_models/test_task_modules/test_coders/test_smoke_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..a02796187069bf186c1d41c12d70c5d39c3ac484
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_coders/test_smoke_bbox_coder.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmdet3d.registry import TASK_UTILS
+from mmdet3d.structures import CameraInstance3DBoxes
+
+
+def test_smoke_bbox_coder():
+    bbox_coder_cfg = dict(
+        type='SMOKECoder',
+        base_depth=(28.01, 16.32),
+        base_dims=((3.88, 1.63, 1.53), (1.78, 1.70, 0.58), (0.88, 1.73, 0.67)),
+        code_size=7)
+
+    bbox_coder = TASK_UTILS.build(bbox_coder_cfg)
+    regression = torch.rand([200, 8])
+    points = torch.rand([200, 2])
+    labels = torch.ones([2, 100])
+    cam2imgs = torch.rand([2, 4, 4])
+    trans_mats = torch.rand([2, 3, 3])
+
+    img_metas = [dict(box_type_3d=CameraInstance3DBoxes) for i in range(2)]
+    locations, dimensions, orientations = bbox_coder.decode(
+        regression, points, labels, cam2imgs, trans_mats)
+    assert locations.shape == torch.Size([200, 3])
+    assert dimensions.shape == torch.Size([200, 3])
+    assert orientations.shape == torch.Size([200, 1])
+    bboxes = bbox_coder.encode(locations, dimensions, orientations, img_metas)
+    assert bboxes.tensor.shape == torch.Size([200, 7])
+
+    # specically designed to test orientation decode function's
+    # special cases.
+    ori_vector = torch.tensor([[-0.9, -0.01], [-0.9, 0.01]])
+    locations = torch.tensor([[15., 2., 1.], [15., 2., -1.]])
+    orientations = bbox_coder._decode_orientation(ori_vector, locations)
+    assert orientations.shape == torch.Size([2, 1])
diff --git a/mmde/tests/test_models/test_task_modules/test_samplers/test_iou_piecewise_sampler.py b/mmde/tests/test_models/test_task_modules/test_samplers/test_iou_piecewise_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..b996364908c9f3a6129e9f61364c27a9a11338dc
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_samplers/test_iou_piecewise_sampler.py
@@ -0,0 +1,51 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.models.task_modules import IoUNegPiecewiseSampler
+from mmdet3d.models.task_modules.assigners import Max3DIoUAssigner
+
+
+def test_iou_piecewise_sampler():
+    if not torch.cuda.is_available():
+        pytest.skip()
+    assigner = Max3DIoUAssigner(
+        pos_iou_thr=0.55,
+        neg_iou_thr=0.55,
+        min_pos_iou=0.55,
+        ignore_iof_thr=-1,
+        iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'))
+    bboxes = torch.tensor(
+        [[32, 32, 16, 8, 38, 42, -0.3], [32, 32, 16, 8, 38, 42, -0.3],
+         [32, 32, 16, 8, 38, 42, -0.3], [32, 32, 16, 8, 38, 42, -0.3],
+         [0, 0, 0, 10, 10, 10, 0.2], [10, 10, 10, 20, 20, 15, 0.6],
+         [5, 5, 5, 15, 15, 15, 0.7], [5, 5, 5, 15, 15, 15, 0.7],
+         [5, 5, 5, 15, 15, 15, 0.7], [32, 32, 16, 8, 38, 42, -0.3],
+         [32, 32, 16, 8, 38, 42, -0.3], [32, 32, 16, 8, 38, 42, -0.3]],
+        dtype=torch.float32).cuda()
+    gt_bboxes = torch.tensor(
+        [[0, 0, 0, 10, 10, 9, 0.2], [5, 10, 10, 20, 20, 15, 0.6]],
+        dtype=torch.float32).cuda()
+    gt_labels = torch.tensor([1, 1], dtype=torch.int64).cuda()
+    gt_instanses = InstanceData()
+    gt_instanses.bboxes_3d = gt_bboxes
+    gt_instanses.labels_3d = gt_labels
+    pred_instaces = InstanceData()
+    pred_instaces.priors = bboxes
+
+    assign_result = assigner.assign(pred_instaces, gt_instanses)
+
+    sampler = IoUNegPiecewiseSampler(
+        num=10,
+        pos_fraction=0.55,
+        neg_piece_fractions=[0.8, 0.2],
+        neg_iou_piece_thrs=[0.55, 0.1],
+        neg_pos_ub=-1,
+        add_gt_as_proposals=False)
+
+    sample_result = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+
+    assert sample_result.pos_inds == 4
+    assert len(sample_result.pos_bboxes) == len(sample_result.pos_inds)
+    assert len(sample_result.neg_bboxes) == len(sample_result.neg_inds)
diff --git a/mmde/tests/test_models/test_task_modules/test_voxel/test_voxel_generator.py b/mmde/tests/test_models/test_task_modules/test_voxel/test_voxel_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..806fc78dee21ae93b568afa595577d37600112ec
--- /dev/null
+++ b/mmde/tests/test_models/test_task_modules/test_voxel/test_voxel_generator.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+from mmdet3d.models.task_modules.voxel import VoxelGenerator
+
+
+def test_voxel_generator():
+    np.random.seed(0)
+    voxel_size = [5, 5, 1]
+    point_cloud_range = [0, 0, 0, 20, 40, 4]
+    max_num_points = 5
+    self = VoxelGenerator(voxel_size, point_cloud_range, max_num_points)
+    points = np.random.uniform(0, 4, (20, 3))
+    voxels = self.generate(points)
+    voxels, coors, num_points_per_voxel = voxels
+    expected_coors = np.array([[2, 0, 0], [3, 0, 0], [0, 0, 0], [1, 0, 0]])
+    expected_num_points_per_voxel = np.array([5, 5, 5, 3])
+    assert voxels.shape == (4, 5, 3)
+    assert np.all(coors == expected_coors)
+    assert np.all(num_points_per_voxel == expected_num_points_per_voxel)
diff --git a/mmde/tests/test_models/test_utils/test_utils.py b/mmde/tests/test_models/test_utils/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..db879b9053e4569322ed1f61f7b6eb7f17e2b84e
--- /dev/null
+++ b/mmde/tests/test_models/test_utils/test_utils.py
@@ -0,0 +1,289 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.models import draw_heatmap_gaussian
+from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices,
+                                  get_keypoints, handle_proj_objs)
+from mmdet3d.structures import CameraInstance3DBoxes, points_img2cam
+from mmdet3d.utils import array_converter
+
+
+def test_gaussian():
+    heatmap = torch.zeros((128, 128))
+    ct_int = torch.tensor([64, 64], dtype=torch.int32)
+    radius = 2
+    draw_heatmap_gaussian(heatmap, ct_int, radius)
+    assert torch.isclose(torch.sum(heatmap), torch.tensor(4.3505), atol=1e-3)
+
+
+def test_array_converter():
+    # to torch
+    @array_converter(to_torch=True, apply_to=('array_a', 'array_b'))
+    def test_func_1(array_a, array_b, container):
+        container.append(array_a)
+        container.append(array_b)
+        return array_a.clone(), array_b.clone()
+
+    np_array_a = np.array([0.0])
+    np_array_b = np.array([0.0])
+    container = []
+    new_array_a, new_array_b = test_func_1(np_array_a, np_array_b, container)
+
+    assert isinstance(new_array_a, np.ndarray)
+    assert isinstance(new_array_b, np.ndarray)
+    assert isinstance(container[0], torch.Tensor)
+    assert isinstance(container[1], torch.Tensor)
+
+    # one to torch and one not
+    @array_converter(to_torch=True, apply_to=('array_a', ))
+    def test_func_2(array_a, array_b):
+        return torch.cat([array_a, array_b])
+
+    with pytest.raises(TypeError):
+        _ = test_func_2(np_array_a, np_array_b)
+
+    # wrong template_arg_name_
+    @array_converter(
+        to_torch=True, apply_to=('array_a', ), template_arg_name_='array_c')
+    def test_func_3(array_a, array_b):
+        return torch.cat([array_a, array_b])
+
+    with pytest.raises(ValueError):
+        _ = test_func_3(np_array_a, np_array_b)
+
+    # wrong apply_to
+    @array_converter(to_torch=True, apply_to=('array_a', 'array_c'))
+    def test_func_4(array_a, array_b):
+        return torch.cat([array_a, array_b])
+
+    with pytest.raises(ValueError):
+        _ = test_func_4(np_array_a, np_array_b)
+
+    # to numpy
+    @array_converter(to_torch=False, apply_to=('array_a', 'array_b'))
+    def test_func_5(array_a, array_b, container):
+        container.append(array_a)
+        container.append(array_b)
+        return array_a.copy(), array_b.copy()
+
+    pt_array_a = torch.tensor([0.0])
+    pt_array_b = torch.tensor([0.0])
+    container = []
+    new_array_a, new_array_b = test_func_5(pt_array_a, pt_array_b, container)
+
+    assert isinstance(container[0], np.ndarray)
+    assert isinstance(container[1], np.ndarray)
+    assert isinstance(new_array_a, torch.Tensor)
+    assert isinstance(new_array_b, torch.Tensor)
+
+    # apply_to = None
+    @array_converter(to_torch=False)
+    def test_func_6(array_a, array_b, container):
+        container.append(array_a)
+        container.append(array_b)
+        return array_a.clone(), array_b.clone()
+
+    container = []
+    new_array_a, new_array_b = test_func_6(pt_array_a, pt_array_b, container)
+
+    assert isinstance(container[0], torch.Tensor)
+    assert isinstance(container[1], torch.Tensor)
+    assert isinstance(new_array_a, torch.Tensor)
+    assert isinstance(new_array_b, torch.Tensor)
+
+    # with default arg
+    @array_converter(to_torch=True, apply_to=('array_a', 'array_b'))
+    def test_func_7(array_a, container, array_b=np.array([2.])):
+        container.append(array_a)
+        container.append(array_b)
+        return array_a.clone(), array_b.clone()
+
+    container = []
+    new_array_a, new_array_b = test_func_7(np_array_a, container)
+
+    assert isinstance(container[0], torch.Tensor)
+    assert isinstance(container[1], torch.Tensor)
+    assert isinstance(new_array_a, np.ndarray)
+    assert isinstance(new_array_b, np.ndarray)
+    assert np.allclose(new_array_b, np.array([2.]), 1e-3)
+
+    # override default arg
+
+    container = []
+    new_array_a, new_array_b = test_func_7(np_array_a, container,
+                                           np.array([4.]))
+
+    assert isinstance(container[0], torch.Tensor)
+    assert isinstance(container[1], torch.Tensor)
+    assert isinstance(new_array_a, np.ndarray)
+    assert np.allclose(new_array_b, np.array([4.]), 1e-3)
+
+    # list arg
+    @array_converter(to_torch=True, apply_to=('array_a', 'array_b'))
+    def test_func_8(container, array_a, array_b=[2.]):
+        container.append(array_a)
+        container.append(array_b)
+        return array_a.clone(), array_b.clone()
+
+    container = []
+    new_array_a, new_array_b = test_func_8(container, [3.])
+
+    assert isinstance(container[0], torch.Tensor)
+    assert isinstance(container[1], torch.Tensor)
+    assert np.allclose(new_array_a, np.array([3.]), 1e-3)
+    assert np.allclose(new_array_b, np.array([2.]), 1e-3)
+
+    # number arg
+    @array_converter(to_torch=True, apply_to=('array_a', 'array_b'))
+    def test_func_9(container, array_a, array_b=1):
+        container.append(array_a)
+        container.append(array_b)
+        return array_a.clone(), array_b.clone()
+
+    container = []
+    new_array_a, new_array_b = test_func_9(container, np_array_a)
+
+    assert isinstance(container[0], torch.FloatTensor)
+    assert isinstance(container[1], torch.FloatTensor)
+    assert np.allclose(new_array_a, np_array_a, 1e-3)
+    assert np.allclose(new_array_b, np.array(1.0), 1e-3)
+
+    # feed kwargs
+    container = []
+    kwargs = {'array_a': [5.], 'array_b': [6.]}
+    new_array_a, new_array_b = test_func_8(container, **kwargs)
+
+    assert isinstance(container[0], torch.Tensor)
+    assert isinstance(container[1], torch.Tensor)
+    assert np.allclose(new_array_a, np.array([5.]), 1e-3)
+    assert np.allclose(new_array_b, np.array([6.]), 1e-3)
+
+    # feed args and kwargs
+    container = []
+    kwargs = {'array_b': [7.]}
+    args = (container, [8.])
+    new_array_a, new_array_b = test_func_8(*args, **kwargs)
+
+    assert isinstance(container[0], torch.Tensor)
+    assert isinstance(container[1], torch.Tensor)
+    assert np.allclose(new_array_a, np.array([8.]), 1e-3)
+    assert np.allclose(new_array_b, np.array([7.]), 1e-3)
+
+    # wrong template arg type
+    with pytest.raises(TypeError):
+        new_array_a, new_array_b = test_func_9(container, 3 + 4j)
+
+    with pytest.raises(TypeError):
+        new_array_a, new_array_b = test_func_9(container, {})
+
+    # invalid template arg list
+    with pytest.raises((TypeError, ValueError)):
+        new_array_a, new_array_b = test_func_9(container,
+                                               [True, np.array([3.0])])
+
+
+def test_points_img2cam():
+    points = torch.tensor([[0.5764, 0.9109, 0.7576], [0.6656, 0.5498, 0.9813]])
+    cam2img = torch.tensor([[700., 0., 450., 0.], [0., 700., 200., 0.],
+                            [0., 0., 1., 0.]])
+    xyzs = points_img2cam(points, cam2img)
+    expected_xyzs = torch.tensor([[-0.4864, -0.2155, 0.7576],
+                                  [-0.6299, -0.2796, 0.9813]])
+    assert torch.allclose(xyzs, expected_xyzs, atol=1e-3)
+
+
+def test_generate_edge_indices():
+
+    input_metas = [
+        dict(img_shape=(110, 110), pad_shape=(128, 128)),
+        dict(img_shape=(98, 110), pad_shape=(128, 128))
+    ]
+    downsample_ratio = 4
+    edge_indices_list = get_edge_indices(input_metas, downsample_ratio)
+
+    assert edge_indices_list[0].shape[0] == 108
+    assert edge_indices_list[1].shape[0] == 102
+
+
+def test_truncation_hanlde():
+
+    centers2d_list = [
+        torch.tensor([[-99.86, 199.45], [499.50, 399.20], [201.20, 99.86]])
+    ]
+
+    gt_bboxes_list = [
+        torch.tensor([[0.25, 99.8, 99.8, 199.6], [300.2, 250.1, 399.8, 299.6],
+                      [100.2, 20.1, 300.8, 180.7]])
+    ]
+    img_metas = [dict(img_shape=[300, 400])]
+    centers2d_target_list, offsets2d_list, trunc_mask_list = \
+        handle_proj_objs(centers2d_list, gt_bboxes_list, img_metas)
+
+    centers2d_target = torch.tensor([[0., 166.30435501], [379.03437877, 299.],
+                                     [201.2, 99.86]])
+
+    offsets2d = torch.tensor([[-99.86, 33.45], [120.5, 100.2], [0.2, -0.14]])
+    trunc_mask = torch.tensor([True, True, False])
+
+    assert torch.allclose(centers2d_target_list[0], centers2d_target)
+    assert torch.allclose(offsets2d_list[0], offsets2d, atol=1e-4)
+    assert torch.all(trunc_mask_list[0] == trunc_mask)
+    assert torch.allclose(
+        centers2d_target_list[0].round().int() + offsets2d_list[0],
+        centers2d_list[0])
+
+
+def test_filter_outside_objs():
+
+    centers2d_list = [
+        torch.tensor([[-99.86, 199.45], [499.50, 399.20], [201.20, 99.86]]),
+        torch.tensor([[-47.86, 199.45], [410.50, 399.20], [401.20, 349.86]])
+    ]
+    gt_bboxes_list = [
+        torch.rand([3, 4], dtype=torch.float32),
+        torch.rand([3, 4], dtype=torch.float32)
+    ]
+    gt_bboxes_3d_list = [
+        CameraInstance3DBoxes(torch.rand([3, 7]), box_dim=7),
+        CameraInstance3DBoxes(torch.rand([3, 7]), box_dim=7)
+    ]
+    gt_labels_list = [torch.tensor([0, 1, 2]), torch.tensor([2, 0, 0])]
+    gt_labels_3d_list = [torch.tensor([0, 1, 2]), torch.tensor([2, 0, 0])]
+    img_metas = [dict(img_shape=[300, 400]), dict(img_shape=[500, 450])]
+    filter_outside_objs(gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list,
+                        gt_labels_3d_list, centers2d_list, img_metas)
+
+    assert len(centers2d_list[0]) == len(gt_bboxes_3d_list[0]) == \
+        len(gt_bboxes_list[0]) == len(gt_labels_3d_list[0]) == \
+        len(gt_labels_list[0]) == 1
+
+    assert len(centers2d_list[1]) == len(gt_bboxes_3d_list[1]) == \
+        len(gt_bboxes_list[1]) == len(gt_labels_3d_list[1]) == \
+        len(gt_labels_list[1]) == 2
+
+
+def test_generate_keypoints():
+
+    centers2d_list = [
+        torch.tensor([[-99.86, 199.45], [499.50, 399.20], [201.20, 99.86]]),
+        torch.tensor([[-47.86, 199.45], [410.50, 399.20], [401.20, 349.86]])
+    ]
+    gt_bboxes_3d_list = [
+        CameraInstance3DBoxes(torch.rand([3, 7])),
+        CameraInstance3DBoxes(torch.rand([3, 7]))
+    ]
+    img_metas = [
+        dict(
+            cam2img=[[1260.8474446004698, 0.0, 807.968244525554, 40.1111],
+                     [0.0, 1260.8474446004698, 495.3344268742088, 2.34422],
+                     [0.0, 0.0, 1.0, 0.00333333], [0.0, 0.0, 0.0, 1.0]],
+            img_shape=(300, 400)) for i in range(2)
+    ]
+
+    keypoints2d_list, keypoints_depth_mask_list = \
+        get_keypoints(gt_bboxes_3d_list, centers2d_list, img_metas)
+
+    assert keypoints2d_list[0].shape == (3, 10, 3)
+    assert keypoints_depth_mask_list[0].shape == (3, 3)
diff --git a/mmde/tests/test_models/test_voxel_encoders/test_pillar_encoder.py b/mmde/tests/test_models/test_voxel_encoders/test_pillar_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f1f619f80b542052632b585b260ce9442ab2f91
--- /dev/null
+++ b/mmde/tests/test_models/test_voxel_encoders/test_pillar_encoder.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmdet3d.registry import MODELS
+
+
+def test_pillar_feature_net():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    pillar_feature_net_cfg = dict(
+        type='PillarFeatureNet',
+        in_channels=5,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=(0.2, 0.2, 8),
+        point_cloud_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0),
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01))
+    pillar_feature_net = MODELS.build(pillar_feature_net_cfg)
+
+    features = torch.rand([97297, 20, 5])
+    num_voxels = torch.randint(1, 100, [97297])
+    coors = torch.randint(0, 100, [97297, 4])
+
+    features = pillar_feature_net(features, num_voxels, coors)
+    assert features.shape == torch.Size([97297, 64])
diff --git a/mmde/tests/test_models/test_voxel_encoders/test_voxel_encoders.py b/mmde/tests/test_models/test_voxel_encoders/test_voxel_encoders.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5eb630ab15299874828f62184a6d53bc7adfd28
--- /dev/null
+++ b/mmde/tests/test_models/test_voxel_encoders/test_voxel_encoders.py
@@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+import torch.nn.functional as F
+
+from mmdet3d.registry import MODELS
+
+
+def test_hard_simple_VFE():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    hard_simple_VFE_cfg = dict(type='HardSimpleVFE', num_features=5)
+    hard_simple_VFE = MODELS.build(hard_simple_VFE_cfg)
+    features = torch.rand([240000, 10, 5])
+    num_voxels = torch.randint(1, 10, [240000])
+
+    outputs = hard_simple_VFE(features, num_voxels, None)
+    assert outputs.shape == torch.Size([240000, 5])
+
+
+def test_seg_VFE():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    seg_VFE_cfg = dict(
+        type='SegVFE',
+        feat_channels=[64, 128, 256, 256],
+        grid_shape=[480, 360, 32],
+        with_voxel_center=True,
+        feat_compression=16,
+        return_point_feats=True)
+    seg_VFE = MODELS.build(seg_VFE_cfg)
+    seg_VFE = seg_VFE.cuda()
+    features = torch.rand([240000, 6]).cuda()
+    coors = []
+    for i in range(4):
+        coor = torch.randint(0, 10, (60000, 3))
+        coor = F.pad(coor, (1, 0), mode='constant', value=i)
+        coors.append(coor)
+    coors = torch.cat(coors, dim=0).cuda()
+    out_features, out_coors, out_point_features = seg_VFE(features, coors)
+    assert out_features.shape[0] == out_coors.shape[0]
+    assert len(out_point_features) == 4
+    assert out_point_features[0].shape == torch.Size([240000, 64])
+    assert out_point_features[1].shape == torch.Size([240000, 128])
+    assert out_point_features[2].shape == torch.Size([240000, 256])
+    assert out_point_features[3].shape == torch.Size([240000, 256])
diff --git a/mmde/tests/test_samples/parta2_roihead_inputs.npz b/mmde/tests/test_samples/parta2_roihead_inputs.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8dc8e548d9c6c800df200282a78a7e0a41204bef
Binary files /dev/null and b/mmde/tests/test_samples/parta2_roihead_inputs.npz differ
diff --git a/mmde/tests/test_structures/test_bbox/test_box3d.py b/mmde/tests/test_structures/test_bbox/test_box3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fb1893116f4afa7e78b17300d09358876c4d2ee
--- /dev/null
+++ b/mmde/tests/test_structures/test_bbox/test_box3d.py
@@ -0,0 +1,1796 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.structures import (BaseInstance3DBoxes, Box3DMode,
+                                CameraInstance3DBoxes, Coord3DMode,
+                                DepthInstance3DBoxes, LiDARInstance3DBoxes,
+                                bbox3d2roi, bbox3d_mapping_back)
+from mmdet3d.structures.bbox_3d.utils import (get_box_type, limit_period,
+                                              points_cam2img,
+                                              rotation_3d_in_axis, xywhr2xyxyr)
+from mmdet3d.structures.points import CameraPoints, DepthPoints, LiDARPoints
+
+
+def test_bbox3d_mapping_back():
+    bboxes = BaseInstance3DBoxes(
+        [[
+            -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 2.06200000e+00,
+            4.40900000e+00, 1.54800000e+00, -1.48801203e+00
+        ],
+         [
+             -2.66751588e+01, 5.59499564e+00, -9.14345860e-01, 3.43000000e-01,
+             4.58000000e-01, 7.82000000e-01, -4.62759755e+00
+         ],
+         [
+             -5.80979675e+00, 3.54092357e+01, 2.00889888e-01, 2.39600000e+00,
+             3.96900000e+00, 1.73200000e+00, -4.65203216e+00
+         ],
+         [
+             -3.13086877e+01, 1.09007628e+00, -1.94612112e-01, 1.94400000e+00,
+             3.85700000e+00, 1.72300000e+00, -2.81427027e+00
+         ]])
+    new_bboxes = bbox3d_mapping_back(bboxes, 1.1, True, True)
+    expected_new_bboxes = torch.tensor(
+        [[-4.7657, 36.3827, 0.2705, 1.8745, 4.0082, 1.4073, -1.4880],
+         [-24.2501, 5.0864, -0.8312, 0.3118, 0.4164, 0.7109, -4.6276],
+         [-5.2816, 32.1902, 0.1826, 2.1782, 3.6082, 1.5745, -4.6520],
+         [-28.4624, 0.9910, -0.1769, 1.7673, 3.5064, 1.5664, -2.8143]])
+    assert torch.allclose(new_bboxes.tensor, expected_new_bboxes, atol=1e-4)
+
+
+def test_bbox3d2roi():
+    bbox_0 = torch.tensor(
+        [[-5.2422, 4.0020, 2.9757, 2.0620, 4.4090, 1.5480, -1.4880],
+         [-5.8097, 3.5409, 2.0088, 2.3960, 3.9690, 1.7320, -4.6520]])
+    bbox_1 = torch.tensor(
+        [[-2.6675, 5.5949, -9.1434, 3.4300, 4.5800, 7.8200, -4.6275],
+         [-3.1308, 1.0900, -1.9461, 1.9440, 3.8570, 1.7230, -2.8142]])
+    bbox_list = [bbox_0, bbox_1]
+    rois = bbox3d2roi(bbox_list)
+    expected_rois = torch.tensor(
+        [[0.0000, -5.2422, 4.0020, 2.9757, 2.0620, 4.4090, 1.5480, -1.4880],
+         [0.0000, -5.8097, 3.5409, 2.0088, 2.3960, 3.9690, 1.7320, -4.6520],
+         [1.0000, -2.6675, 5.5949, -9.1434, 3.4300, 4.5800, 7.8200, -4.6275],
+         [1.0000, -3.1308, 1.0900, -1.9461, 1.9440, 3.8570, 1.7230, -2.8142]])
+    assert torch.all(torch.eq(rois, expected_rois))
+
+
+def test_base_boxes3d():
+    # test empty initialization
+    empty_boxes = []
+    boxes = BaseInstance3DBoxes(empty_boxes)
+    assert boxes.tensor.shape[0] == 0
+    assert boxes.tensor.shape[1] == 7
+
+    # Test init with origin
+    gravity_center_box = np.array(
+        [[
+            -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 2.06200000e+00,
+            4.40900000e+00, 1.54800000e+00, -1.48801203e+00
+        ],
+         [
+             -2.66751588e+01, 5.59499564e+00, -9.14345860e-01, 3.43000000e-01,
+             4.58000000e-01, 7.82000000e-01, -4.62759755e+00
+         ],
+         [
+             -5.80979675e+00, 3.54092357e+01, 2.00889888e-01, 2.39600000e+00,
+             3.96900000e+00, 1.73200000e+00, -4.65203216e+00
+         ],
+         [
+             -3.13086877e+01, 1.09007628e+00, -1.94612112e-01, 1.94400000e+00,
+             3.85700000e+00, 1.72300000e+00, -2.81427027e+00
+         ]],
+        dtype=np.float32)
+
+    bottom_center_box = BaseInstance3DBoxes(
+        gravity_center_box, origin=(0.5, 0.5, 0.5))
+
+    assert bottom_center_box.yaw.shape[0] == 4
+
+
+def test_lidar_boxes3d():
+    # test empty initialization
+    empty_boxes = []
+    boxes = LiDARInstance3DBoxes(empty_boxes)
+    assert boxes.tensor.shape[0] == 0
+    assert boxes.tensor.shape[1] == 7
+
+    # Test init with origin
+    gravity_center_box = np.array(
+        [[
+            -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 2.06200000e+00,
+            4.40900000e+00, 1.54800000e+00, -1.48801203e+00
+        ],
+         [
+             -2.66751588e+01, 5.59499564e+00, -9.14345860e-01, 3.43000000e-01,
+             4.58000000e-01, 7.82000000e-01, -4.62759755e+00
+         ],
+         [
+             -5.80979675e+00, 3.54092357e+01, 2.00889888e-01, 2.39600000e+00,
+             3.96900000e+00, 1.73200000e+00, -4.65203216e+00
+         ],
+         [
+             -3.13086877e+01, 1.09007628e+00, -1.94612112e-01, 1.94400000e+00,
+             3.85700000e+00, 1.72300000e+00, -2.81427027e+00
+         ]],
+        dtype=np.float32)
+    bottom_center_box = LiDARInstance3DBoxes(
+        gravity_center_box, origin=(0.5, 0.5, 0.5))
+    expected_tensor = torch.tensor(
+        [[
+            -5.24223238e+00, 4.00209696e+01, -4.76429619e-01, 2.06200000e+00,
+            4.40900000e+00, 1.54800000e+00, -1.48801203e+00
+        ],
+         [
+             -2.66751588e+01, 5.59499564e+00, -1.30534586e+00, 3.43000000e-01,
+             4.58000000e-01, 7.82000000e-01, -4.62759755e+00
+         ],
+         [
+             -5.80979675e+00, 3.54092357e+01, -6.65110112e-01, 2.39600000e+00,
+             3.96900000e+00, 1.73200000e+00, -4.65203216e+00
+         ],
+         [
+             -3.13086877e+01, 1.09007628e+00, -1.05611211e+00, 1.94400000e+00,
+             3.85700000e+00, 1.72300000e+00, -2.81427027e+00
+         ]])
+    assert torch.allclose(expected_tensor, bottom_center_box.tensor)
+
+    # Test init with numpy array
+    np_boxes = np.array([[
+        1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65,
+        1.48 - 0.13603681398218053 * 4
+    ],
+                         [
+                             8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57,
+                             1.62 - 0.13603681398218053 * 4
+                         ]],
+                        dtype=np.float32)
+    boxes_1 = LiDARInstance3DBoxes(np_boxes)
+    assert torch.allclose(boxes_1.tensor, torch.from_numpy(np_boxes))
+
+    # test properties
+    assert boxes_1.volume.size(0) == 2
+    assert (boxes_1.center == boxes_1.bottom_center).all()
+    assert repr(boxes) == (
+        'LiDARInstance3DBoxes(\n    tensor([], size=(0, 7)))')
+
+    # test init with torch.Tensor
+    th_boxes = torch.tensor(
+        [[
+            28.29669987, -0.5557558, -1.30332506, 1.47000003, 2.23000002,
+            1.48000002, -1.57000005 - 0.13603681398218053 * 4
+        ],
+         [
+             26.66901946, 21.82302134, -1.73605708, 1.55999994, 3.48000002,
+             1.39999998, -1.69000006 - 0.13603681398218053 * 4
+         ],
+         [
+             31.31977974, 8.16214412, -1.62177875, 1.74000001, 3.76999998,
+             1.48000002, 2.78999996 - 0.13603681398218053 * 4
+         ]],
+        dtype=torch.float32)
+    boxes_2 = LiDARInstance3DBoxes(th_boxes)
+    assert torch.allclose(boxes_2.tensor, th_boxes)
+
+    # test clone/to/device
+    boxes_2 = boxes_2.clone()
+    boxes_1 = boxes_1.to(boxes_2.device)
+
+    # test box concatenation
+    expected_tensor = torch.tensor([[
+        1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65,
+        1.48 - 0.13603681398218053 * 4
+    ],
+                                    [
+                                        8.959413, 2.4567227, -1.6357126, 1.54,
+                                        4.01, 1.57,
+                                        1.62 - 0.13603681398218053 * 4
+                                    ],
+                                    [
+                                        28.2967, -0.5557558, -1.303325, 1.47,
+                                        2.23, 1.48,
+                                        -1.57 - 0.13603681398218053 * 4
+                                    ],
+                                    [
+                                        26.66902, 21.82302, -1.736057, 1.56,
+                                        3.48, 1.4,
+                                        -1.69 - 0.13603681398218053 * 4
+                                    ],
+                                    [
+                                        31.31978, 8.162144, -1.6217787, 1.74,
+                                        3.77, 1.48,
+                                        2.79 - 0.13603681398218053 * 4
+                                    ]])
+    boxes = LiDARInstance3DBoxes.cat([boxes_1, boxes_2])
+    assert torch.allclose(boxes.tensor, expected_tensor)
+    # concatenate empty list
+    empty_boxes = LiDARInstance3DBoxes.cat([])
+    assert empty_boxes.tensor.shape[0] == 0
+    assert empty_boxes.tensor.shape[-1] == 7
+
+    # test box flip
+    points = torch.tensor([[1.2559, -0.6762, -1.4658],
+                           [4.7814, -0.8784,
+                            -1.3857], [6.7053, 0.2517, -0.9697],
+                           [0.6533, -0.5520, -0.5265],
+                           [4.5870, 0.5358, -1.4741]])
+    expected_tensor = torch.tensor(
+        [[
+            1.7802081, -2.516249, -1.7501148, 1.75, 3.39, 1.65,
+            1.6615927 - np.pi + 0.13603681398218053 * 4
+        ],
+         [
+             8.959413, -2.4567227, -1.6357126, 1.54, 4.01, 1.57,
+             1.5215927 - np.pi + 0.13603681398218053 * 4
+         ],
+         [
+             28.2967, 0.5557558, -1.303325, 1.47, 2.23, 1.48,
+             4.7115927 - np.pi + 0.13603681398218053 * 4
+         ],
+         [
+             26.66902, -21.82302, -1.736057, 1.56, 3.48, 1.4,
+             4.8315926 - np.pi + 0.13603681398218053 * 4
+         ],
+         [
+             31.31978, -8.162144, -1.6217787, 1.74, 3.77, 1.48,
+             0.35159278 - np.pi + 0.13603681398218053 * 4
+         ]])
+    expected_points = torch.tensor([[1.2559, 0.6762, -1.4658],
+                                    [4.7814, 0.8784, -1.3857],
+                                    [6.7053, -0.2517, -0.9697],
+                                    [0.6533, 0.5520, -0.5265],
+                                    [4.5870, -0.5358, -1.4741]])
+    points = boxes.flip('horizontal', points)
+    assert torch.allclose(boxes.tensor, expected_tensor)
+    assert torch.allclose(points, expected_points, 1e-3)
+
+    expected_tensor = torch.tensor(
+        [[
+            -1.7802, -2.5162, -1.7501, 1.7500, 3.3900, 1.6500,
+            -1.6616 + np.pi * 2 - 0.13603681398218053 * 4
+        ],
+         [
+             -8.9594, -2.4567, -1.6357, 1.5400, 4.0100, 1.5700,
+             -1.5216 + np.pi * 2 - 0.13603681398218053 * 4
+         ],
+         [
+             -28.2967, 0.5558, -1.3033, 1.4700, 2.2300, 1.4800,
+             -4.7116 + np.pi * 2 - 0.13603681398218053 * 4
+         ],
+         [
+             -26.6690, -21.8230, -1.7361, 1.5600, 3.4800, 1.4000,
+             -4.8316 + np.pi * 2 - 0.13603681398218053 * 4
+         ],
+         [
+             -31.3198, -8.1621, -1.6218, 1.7400, 3.7700, 1.4800,
+             -0.3516 + np.pi * 2 - 0.13603681398218053 * 4
+         ]])
+    boxes_flip_vert = boxes.clone()
+    points = boxes_flip_vert.flip('vertical', points)
+    expected_points = torch.tensor([[-1.2559, 0.6762, -1.4658],
+                                    [-4.7814, 0.8784, -1.3857],
+                                    [-6.7053, -0.2517, -0.9697],
+                                    [-0.6533, 0.5520, -0.5265],
+                                    [-4.5870, -0.5358, -1.4741]])
+    assert torch.allclose(boxes_flip_vert.tensor, expected_tensor, 1e-4)
+    assert torch.allclose(points, expected_points)
+
+    # test box rotation
+    # with input torch.Tensor points and angle
+    expected_tensor = torch.tensor(
+        [[
+            1.4225, -2.7344, -1.7501, 1.7500, 3.3900, 1.6500,
+            1.7976 - np.pi + 0.13603681398218053 * 2
+        ],
+         [
+             8.5435, -3.6491, -1.6357, 1.5400, 4.0100, 1.5700,
+             1.6576 - np.pi + 0.13603681398218053 * 2
+         ],
+         [
+             28.1106, -3.2869, -1.3033, 1.4700, 2.2300, 1.4800,
+             4.8476 - np.pi + 0.13603681398218053 * 2
+         ],
+         [
+             23.4630, -25.2382, -1.7361, 1.5600, 3.4800, 1.4000,
+             4.9676 - np.pi + 0.13603681398218053 * 2
+         ],
+         [
+             29.9235, -12.3342, -1.6218, 1.7400, 3.7700, 1.4800,
+             0.4876 - np.pi + 0.13603681398218053 * 2
+         ]])
+    points, rot_mat_T = boxes.rotate(-0.13603681398218053, points)
+    expected_points = torch.tensor([[-1.1526, 0.8403, -1.4658],
+                                    [-4.6181, 1.5187, -1.3857],
+                                    [-6.6775, 0.6600, -0.9697],
+                                    [-0.5724, 0.6355, -0.5265],
+                                    [-4.6173, 0.0912, -1.4741]])
+    expected_rot_mat_T = torch.tensor([[0.9908, -0.1356, 0.0000],
+                                       [0.1356, 0.9908, 0.0000],
+                                       [0.0000, 0.0000, 1.0000]])
+    assert torch.allclose(boxes.tensor, expected_tensor, 1e-3)
+    assert torch.allclose(points, expected_points, 1e-3)
+    assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3)
+
+    # with input torch.Tensor points and rotation matrix
+    points, rot_mat_T = boxes.rotate(0.13603681398218053, points)  # back
+    rot_mat = np.array([[0.99076125, -0.13561762, 0.],
+                        [0.13561762, 0.99076125, 0.], [0., 0., 1.]])
+    points, rot_mat_T = boxes.rotate(rot_mat, points)
+    assert torch.allclose(boxes.tensor, expected_tensor, 1e-3)
+    assert torch.allclose(points, expected_points, 1e-3)
+    assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3)
+
+    # with input np.ndarray points and angle
+    points_np = np.array([[-1.0280, 0.9888,
+                           -1.4658], [-4.3695, 2.1310, -1.3857],
+                          [-6.5263, 1.5595,
+                           -0.9697], [-0.4809, 0.7073, -0.5265],
+                          [-4.5623, 0.7166, -1.4741]])
+    points_np, rot_mat_T_np = boxes.rotate(-0.13603681398218053, points_np)
+    expected_points_np = np.array([[-0.8844, 1.1191, -1.4658],
+                                   [-4.0401, 2.7039, -1.3857],
+                                   [-6.2545, 2.4302, -0.9697],
+                                   [-0.3805, 0.7660, -0.5265],
+                                   [-4.4230, 1.3287, -1.4741]])
+    expected_rot_mat_T_np = np.array([[0.9908, -0.1356, 0.0000],
+                                      [0.1356, 0.9908, 0.0000],
+                                      [0.0000, 0.0000, 1.0000]])
+
+    assert np.allclose(points_np, expected_points_np, 1e-3)
+    assert np.allclose(rot_mat_T_np, expected_rot_mat_T_np, 1e-3)
+
+    # with input LiDARPoints and rotation matrix
+    points_np, rot_mat_T_np = boxes.rotate(0.13603681398218053, points_np)
+    lidar_points = LiDARPoints(points_np)
+    lidar_points, rot_mat_T_np = boxes.rotate(rot_mat, lidar_points)
+    points_np = lidar_points.numpy()
+
+    assert np.allclose(points_np, expected_points_np, 1e-3)
+    assert np.allclose(rot_mat_T_np, expected_rot_mat_T_np, 1e-3)
+
+    # test box scaling
+    expected_tensor = torch.tensor([[
+        1.0443488, -2.9183323, -1.7599131, 1.7597977, 3.4089797, 1.6592377,
+        1.9336663 - np.pi
+    ],
+                                    [
+                                        8.014273, -4.8007393, -1.6448704,
+                                        1.5486219, 4.0324507, 1.57879,
+                                        1.7936664 - np.pi
+                                    ],
+                                    [
+                                        27.558605, -7.1084175, -1.310622,
+                                        1.4782301, 2.242485, 1.488286,
+                                        4.9836664 - np.pi
+                                    ],
+                                    [
+                                        19.934517, -28.344835, -1.7457767,
+                                        1.5687338, 3.4994833, 1.4078381,
+                                        5.1036663 - np.pi
+                                    ],
+                                    [
+                                        28.130915, -16.369587, -1.6308585,
+                                        1.7497417, 3.791107, 1.488286,
+                                        0.6236664 - np.pi
+                                    ]])
+    boxes.scale(1.00559866335275)
+    assert torch.allclose(boxes.tensor, expected_tensor)
+
+    # test box translation
+    expected_tensor = torch.tensor([[
+        1.1281544, -3.0507944, -1.9169292, 1.7597977, 3.4089797, 1.6592377,
+        1.9336663 - np.pi
+    ],
+                                    [
+                                        8.098079, -4.9332013, -1.8018866,
+                                        1.5486219, 4.0324507, 1.57879,
+                                        1.7936664 - np.pi
+                                    ],
+                                    [
+                                        27.64241, -7.2408795, -1.4676381,
+                                        1.4782301, 2.242485, 1.488286,
+                                        4.9836664 - np.pi
+                                    ],
+                                    [
+                                        20.018322, -28.477297, -1.9027928,
+                                        1.5687338, 3.4994833, 1.4078381,
+                                        5.1036663 - np.pi
+                                    ],
+                                    [
+                                        28.21472, -16.502048, -1.7878747,
+                                        1.7497417, 3.791107, 1.488286,
+                                        0.6236664 - np.pi
+                                    ]])
+    boxes.translate([0.0838056, -0.13246193, -0.15701613])
+    assert torch.allclose(boxes.tensor, expected_tensor)
+
+    # test bbox in_range_bev
+    expected_tensor = torch.tensor(
+        [[1.1282, -3.0508, 1.7598, 3.4090, -1.2079],
+         [8.0981, -4.9332, 1.5486, 4.0325, -1.3479],
+         [27.6424, -7.2409, 1.4782, 2.2425, 1.8421],
+         [20.0183, -28.4773, 1.5687, 3.4995, 1.9621],
+         [28.2147, -16.5020, 1.7497, 3.7911, -2.5179]])
+    assert torch.allclose(boxes.bev, expected_tensor, atol=1e-3)
+    expected_tensor = torch.tensor([1, 1, 1, 1, 1], dtype=torch.bool)
+    mask = boxes.in_range_bev([0., -40., 70.4, 40.])
+    assert (mask == expected_tensor).all()
+    mask = boxes.nonempty()
+    assert (mask == expected_tensor).all()
+
+    # test bbox in_range
+    expected_tensor = torch.tensor([1, 1, 0, 0, 0], dtype=torch.bool)
+    mask = boxes.in_range_3d([0, -20, -2, 22, 2, 5])
+    assert (mask == expected_tensor).all()
+
+    # test bbox indexing
+    index_boxes = boxes[2:5]
+    expected_tensor = torch.tensor([[
+        27.64241, -7.2408795, -1.4676381, 1.4782301, 2.242485, 1.488286,
+        4.9836664 - np.pi
+    ],
+                                    [
+                                        20.018322, -28.477297, -1.9027928,
+                                        1.5687338, 3.4994833, 1.4078381,
+                                        5.1036663 - np.pi
+                                    ],
+                                    [
+                                        28.21472, -16.502048, -1.7878747,
+                                        1.7497417, 3.791107, 1.488286,
+                                        0.6236664 - np.pi
+                                    ]])
+    assert len(index_boxes) == 3
+    assert torch.allclose(index_boxes.tensor, expected_tensor)
+
+    index_boxes = boxes[2]
+    expected_tensor = torch.tensor([[
+        27.64241, -7.2408795, -1.4676381, 1.4782301, 2.242485, 1.488286,
+        4.9836664 - np.pi
+    ]])
+    assert len(index_boxes) == 1
+    assert torch.allclose(index_boxes.tensor, expected_tensor)
+
+    index_boxes = boxes[[2, 4]]
+    expected_tensor = torch.tensor([[
+        27.64241, -7.2408795, -1.4676381, 1.4782301, 2.242485, 1.488286,
+        4.9836664 - np.pi
+    ],
+                                    [
+                                        28.21472, -16.502048, -1.7878747,
+                                        1.7497417, 3.791107, 1.488286,
+                                        0.6236664 - np.pi
+                                    ]])
+    assert len(index_boxes) == 2
+    assert torch.allclose(index_boxes.tensor, expected_tensor)
+
+    # test iteration
+    for i, box in enumerate(index_boxes):
+        torch.allclose(box, expected_tensor[i])
+
+    # test properties
+    assert torch.allclose(boxes.bottom_center, boxes.tensor[:, :3])
+    expected_tensor = (
+        boxes.tensor[:, :3] - boxes.tensor[:, 3:6] *
+        (torch.tensor([0.5, 0.5, 0]) - torch.tensor([0.5, 0.5, 0.5])))
+    assert torch.allclose(boxes.gravity_center, expected_tensor)
+
+    boxes.limit_yaw()
+    assert (boxes.tensor[:, 6] <= np.pi / 2).all()
+    assert (boxes.tensor[:, 6] >= -np.pi / 2).all()
+
+    Box3DMode.convert(boxes, Box3DMode.LIDAR, Box3DMode.LIDAR)
+    expected_tensor = boxes.tensor.clone()
+    assert torch.allclose(expected_tensor, boxes.tensor)
+
+    boxes.flip()
+    boxes.flip()
+    boxes.limit_yaw()
+    assert torch.allclose(expected_tensor, boxes.tensor)
+
+    # test nearest_bev
+    expected_tensor = torch.tensor([[-0.5763, -3.9307, 2.8326, -2.1709],
+                                    [6.0819, -5.7075, 10.1143, -4.1589],
+                                    [26.5212, -7.9800, 28.7637, -6.5018],
+                                    [18.2686, -29.2617, 21.7681, -27.6929],
+                                    [27.3398, -18.3976, 29.0896, -14.6065]])
+    assert torch.allclose(
+        boxes.nearest_bev, expected_tensor, rtol=1e-4, atol=1e-7)
+
+    expected_tensor = torch.tensor([[[-7.7767e-01, -2.8332e+00, -1.9169e+00],
+                                     [-7.7767e-01, -2.8332e+00, -2.5769e-01],
+                                     [2.4093e+00, -1.6232e+00, -2.5769e-01],
+                                     [2.4093e+00, -1.6232e+00, -1.9169e+00],
+                                     [-1.5301e-01, -4.4784e+00, -1.9169e+00],
+                                     [-1.5301e-01, -4.4784e+00, -2.5769e-01],
+                                     [3.0340e+00, -3.2684e+00, -2.5769e-01],
+                                     [3.0340e+00, -3.2684e+00, -1.9169e+00]],
+                                    [[5.9606e+00, -4.6237e+00, -1.8019e+00],
+                                     [5.9606e+00, -4.6237e+00, -2.2310e-01],
+                                     [9.8933e+00, -3.7324e+00, -2.2310e-01],
+                                     [9.8933e+00, -3.7324e+00, -1.8019e+00],
+                                     [6.3029e+00, -6.1340e+00, -1.8019e+00],
+                                     [6.3029e+00, -6.1340e+00, -2.2310e-01],
+                                     [1.0236e+01, -5.2427e+00, -2.2310e-01],
+                                     [1.0236e+01, -5.2427e+00, -1.8019e+00]],
+                                    [[2.6364e+01, -6.8292e+00, -1.4676e+00],
+                                     [2.6364e+01, -6.8292e+00, 2.0648e-02],
+                                     [2.8525e+01, -6.2283e+00, 2.0648e-02],
+                                     [2.8525e+01, -6.2283e+00, -1.4676e+00],
+                                     [2.6760e+01, -8.2534e+00, -1.4676e+00],
+                                     [2.6760e+01, -8.2534e+00, 2.0648e-02],
+                                     [2.8921e+01, -7.6525e+00, 2.0648e-02],
+                                     [2.8921e+01, -7.6525e+00, -1.4676e+00]],
+                                    [[1.8102e+01, -2.8420e+01, -1.9028e+00],
+                                     [1.8102e+01, -2.8420e+01, -4.9495e-01],
+                                     [2.1337e+01, -2.7085e+01, -4.9495e-01],
+                                     [2.1337e+01, -2.7085e+01, -1.9028e+00],
+                                     [1.8700e+01, -2.9870e+01, -1.9028e+00],
+                                     [1.8700e+01, -2.9870e+01, -4.9495e-01],
+                                     [2.1935e+01, -2.8535e+01, -4.9495e-01],
+                                     [2.1935e+01, -2.8535e+01, -1.9028e+00]],
+                                    [[2.8612e+01, -1.8552e+01, -1.7879e+00],
+                                     [2.8612e+01, -1.8552e+01, -2.9959e-01],
+                                     [2.6398e+01, -1.5474e+01, -2.9959e-01],
+                                     [2.6398e+01, -1.5474e+01, -1.7879e+00],
+                                     [3.0032e+01, -1.7530e+01, -1.7879e+00],
+                                     [3.0032e+01, -1.7530e+01, -2.9959e-01],
+                                     [2.7818e+01, -1.4452e+01, -2.9959e-01],
+                                     [2.7818e+01, -1.4452e+01, -1.7879e+00]]])
+
+    assert torch.allclose(boxes.corners, expected_tensor, rtol=1e-4, atol=1e-7)
+
+    # test new_box
+    new_box1 = boxes.new_box([[1, 2, 3, 4, 5, 6, 7]])
+    assert torch.allclose(
+        new_box1.tensor,
+        torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=boxes.tensor.dtype))
+    assert new_box1.device == boxes.device
+    assert new_box1.with_yaw == boxes.with_yaw
+    assert new_box1.box_dim == boxes.box_dim
+
+    new_box2 = boxes.new_box(np.array([[1, 2, 3, 4, 5, 6, 7]]))
+    assert torch.allclose(
+        new_box2.tensor,
+        torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=boxes.tensor.dtype))
+
+    new_box3 = boxes.new_box(torch.tensor([[1, 2, 3, 4, 5, 6, 7]]))
+    assert torch.allclose(
+        new_box3.tensor,
+        torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=boxes.tensor.dtype))
+
+
+def test_boxes_conversion():
+    """Test the conversion of boxes between different modes.
+
+    CommandLine:
+        xdoctest tests/test_box3d.py::test_boxes_conversion zero
+    """
+    lidar_boxes = LiDARInstance3DBoxes(
+        [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48],
+         [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62],
+         [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57],
+         [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69],
+         [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]])
+    cam_box_tensor = Box3DMode.convert(lidar_boxes.tensor, Box3DMode.LIDAR,
+                                       Box3DMode.CAM)
+    expected_box = lidar_boxes.convert_to(Box3DMode.CAM)
+    assert torch.equal(expected_box.tensor, cam_box_tensor)
+
+    # Some properties should be the same
+    cam_boxes = CameraInstance3DBoxes(cam_box_tensor)
+    assert torch.equal(cam_boxes.height, lidar_boxes.height)
+    assert torch.equal(cam_boxes.top_height, -lidar_boxes.top_height)
+    assert torch.equal(cam_boxes.bottom_height, -lidar_boxes.bottom_height)
+    assert torch.allclose(cam_boxes.volume, lidar_boxes.volume)
+
+    lidar_box_tensor = Box3DMode.convert(cam_box_tensor, Box3DMode.CAM,
+                                         Box3DMode.LIDAR)
+    expected_tensor = torch.tensor(
+        [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48],
+         [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62],
+         [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57],
+         [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69],
+         [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]])
+
+    assert torch.allclose(expected_tensor, lidar_box_tensor)
+    assert torch.allclose(lidar_boxes.tensor, lidar_box_tensor)
+
+    depth_box_tensor = Box3DMode.convert(cam_box_tensor, Box3DMode.CAM,
+                                         Box3DMode.DEPTH)
+    depth_to_cam_box_tensor = Box3DMode.convert(depth_box_tensor,
+                                                Box3DMode.DEPTH, Box3DMode.CAM)
+    assert torch.allclose(cam_box_tensor, depth_to_cam_box_tensor)
+
+    # test similar mode conversion
+    same_results = Box3DMode.convert(depth_box_tensor, Box3DMode.DEPTH,
+                                     Box3DMode.DEPTH)
+    assert torch.equal(same_results, depth_box_tensor)
+
+    # test conversion with a given rt_mat
+    camera_boxes = CameraInstance3DBoxes(
+        [[0.06, 1.77, 21.4, 3.2, 1.61, 1.66, -1.54],
+         [6.59, 1.53, 6.76, 12.78, 3.66, 2.28, 1.55],
+         [6.71, 1.59, 22.18, 14.73, 3.64, 2.32, 1.59],
+         [7.11, 1.58, 34.54, 10.04, 3.61, 2.32, 1.61],
+         [7.78, 1.65, 45.95, 12.83, 3.63, 2.34, 1.64]])
+
+    rect = torch.tensor(
+        [[0.9999239, 0.00983776, -0.00744505, 0.],
+         [-0.0098698, 0.9999421, -0.00427846, 0.],
+         [0.00740253, 0.00435161, 0.9999631, 0.], [0., 0., 0., 1.]],
+        dtype=torch.float32)
+
+    Trv2c = torch.tensor(
+        [[7.533745e-03, -9.999714e-01, -6.166020e-04, -4.069766e-03],
+         [1.480249e-02, 7.280733e-04, -9.998902e-01, -7.631618e-02],
+         [9.998621e-01, 7.523790e-03, 1.480755e-02, -2.717806e-01],
+         [0.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00]],
+        dtype=torch.float32)
+
+    # coord sys refactor (reverse sign of yaw)
+    expected_tensor = torch.tensor(
+        [[
+            2.16902434e+01, -4.06038554e-02, -1.61906639e+00, 3.20000005e+00,
+            1.65999997e+00, 1.61000001e+00, 1.53999996e+00 - np.pi / 2
+        ],
+         [
+             7.05006905e+00, -6.57459601e+00, -1.60107949e+00, 1.27799997e+01,
+             2.27999997e+00, 3.66000009e+00, -1.54999995e+00 - np.pi / 2
+         ],
+         [
+             2.24698818e+01, -6.69203759e+00, -1.50118145e+00, 1.47299995e+01,
+             2.31999993e+00, 3.64000010e+00, -1.59000003e+00 + 3 * np.pi / 2
+         ],
+         [
+             3.48291965e+01, -7.09058388e+00, -1.36622983e+00, 1.00400000e+01,
+             2.31999993e+00, 3.60999990e+00, -1.61000001e+00 + 3 * np.pi / 2
+         ],
+         [
+             4.62394617e+01, -7.75838800e+00, -1.32405020e+00, 1.28299999e+01,
+             2.33999991e+00, 3.63000011e+00, -1.63999999e+00 + 3 * np.pi / 2
+         ]],
+        dtype=torch.float32)
+
+    rt_mat = rect @ Trv2c
+    # test conversion with Box type
+    cam_to_lidar_box = Box3DMode.convert(camera_boxes, Box3DMode.CAM,
+                                         Box3DMode.LIDAR, rt_mat.inverse())
+    assert torch.allclose(cam_to_lidar_box.tensor, expected_tensor)
+
+    lidar_to_cam_box = Box3DMode.convert(cam_to_lidar_box.tensor,
+                                         Box3DMode.LIDAR, Box3DMode.CAM,
+                                         rt_mat)
+    assert torch.allclose(lidar_to_cam_box, camera_boxes.tensor)
+
+    # test numpy convert
+    cam_to_lidar_box = Box3DMode.convert(camera_boxes.numpy(), Box3DMode.CAM,
+                                         Box3DMode.LIDAR,
+                                         rt_mat.inverse().numpy())
+    assert np.allclose(cam_to_lidar_box, expected_tensor.numpy())
+
+    # test list convert
+    cam_to_lidar_box = Box3DMode.convert(
+        camera_boxes.tensor[0].numpy().tolist(), Box3DMode.CAM,
+        Box3DMode.LIDAR,
+        rt_mat.inverse().numpy())
+    assert np.allclose(np.array(cam_to_lidar_box), expected_tensor[0].numpy())
+
+    # test convert from depth to lidar
+    depth_boxes = torch.tensor(
+        [[2.4593, 2.5870, -0.4321, 0.8597, 0.6193, 1.0204, 3.0693],
+         [1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601]],
+        dtype=torch.float32)
+    depth_boxes = DepthInstance3DBoxes(depth_boxes)
+    depth_to_lidar_box = depth_boxes.convert_to(Box3DMode.LIDAR)
+    expected_box = depth_to_lidar_box.convert_to(Box3DMode.DEPTH)
+    assert torch.equal(depth_boxes.tensor, expected_box.tensor)
+
+    lidar_to_depth_box = Box3DMode.convert(depth_to_lidar_box, Box3DMode.LIDAR,
+                                           Box3DMode.DEPTH)
+    assert torch.allclose(depth_boxes.tensor, lidar_to_depth_box.tensor)
+    assert torch.allclose(depth_boxes.volume, lidar_to_depth_box.volume)
+
+    # test convert from depth to camera
+    depth_to_cam_box = Box3DMode.convert(depth_boxes, Box3DMode.DEPTH,
+                                         Box3DMode.CAM)
+    cam_to_depth_box = Box3DMode.convert(depth_to_cam_box, Box3DMode.CAM,
+                                         Box3DMode.DEPTH)
+    expected_tensor = depth_to_cam_box.convert_to(Box3DMode.DEPTH)
+    assert torch.equal(expected_tensor.tensor, cam_to_depth_box.tensor)
+    assert torch.allclose(depth_boxes.tensor, cam_to_depth_box.tensor)
+    assert torch.allclose(depth_boxes.volume, cam_to_depth_box.volume)
+
+    with pytest.raises(NotImplementedError):
+        # assert invalid convert mode
+        Box3DMode.convert(depth_boxes, Box3DMode.DEPTH, 3)
+
+
+def test_camera_boxes3d():
+    # Test init with numpy array
+    np_boxes = np.array([[
+        1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65,
+        1.48 - 0.13603681398218053 * 4 - 2 * np.pi
+    ],
+                         [
+                             8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57,
+                             1.62 - 0.13603681398218053 * 4 - 2 * np.pi
+                         ]],
+                        dtype=np.float32)
+
+    boxes_1 = Box3DMode.convert(
+        LiDARInstance3DBoxes(np_boxes), Box3DMode.LIDAR, Box3DMode.CAM)
+    assert isinstance(boxes_1, CameraInstance3DBoxes)
+
+    cam_np_boxes = Box3DMode.convert(np_boxes, Box3DMode.LIDAR, Box3DMode.CAM)
+    assert torch.allclose(boxes_1.tensor,
+                          boxes_1.tensor.new_tensor(cam_np_boxes))
+
+    # test init with torch.Tensor
+    th_boxes = torch.tensor(
+        [[
+            28.29669987, -0.5557558, -1.30332506, 1.47000003, 2.23000002,
+            1.48000002, -1.57000005 - 0.13603681398218053 * 4 - 2 * np.pi
+        ],
+         [
+             26.66901946, 21.82302134, -1.73605708, 1.55999994, 3.48000002,
+             1.39999998, -1.69000006 - 0.13603681398218053 * 4 - 2 * np.pi
+         ],
+         [
+             31.31977974, 8.16214412, -1.62177875, 1.74000001, 3.76999998,
+             1.48000002, 2.78999996 - 0.13603681398218053 * 4 - 2 * np.pi
+         ]],
+        dtype=torch.float32)
+    cam_th_boxes = Box3DMode.convert(th_boxes, Box3DMode.LIDAR, Box3DMode.CAM)
+    boxes_2 = CameraInstance3DBoxes(cam_th_boxes)
+    assert torch.allclose(boxes_2.tensor, cam_th_boxes)
+
+    # test clone/to/device
+    boxes_2 = boxes_2.clone()
+    boxes_1 = boxes_1.to(boxes_2.device)
+
+    # test box concatenation
+    expected_tensor = Box3DMode.convert(
+        torch.tensor([[
+            1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65,
+            1.48 - 0.13603681398218053 * 4 - 2 * np.pi
+        ],
+                      [
+                          8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57,
+                          1.62 - 0.13603681398218053 * 4 - 2 * np.pi
+                      ],
+                      [
+                          28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48,
+                          -1.57 - 0.13603681398218053 * 4 - 2 * np.pi
+                      ],
+                      [
+                          26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4,
+                          -1.69 - 0.13603681398218053 * 4 - 2 * np.pi
+                      ],
+                      [
+                          31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48,
+                          2.79 - 0.13603681398218053 * 4 - 2 * np.pi
+                      ]]), Box3DMode.LIDAR, Box3DMode.CAM)
+    boxes = CameraInstance3DBoxes.cat([boxes_1, boxes_2])
+    assert torch.allclose(boxes.tensor, expected_tensor)
+
+    # test box flip
+    points = torch.tensor([[0.6762, 1.4658, 1.2559], [0.8784, 1.3857, 4.7814],
+                           [-0.2517, 0.9697, 6.7053], [0.5520, 0.5265, 0.6533],
+                           [-0.5358, 1.4741, 4.5870]])
+    expected_tensor = Box3DMode.convert(
+        torch.tensor([[
+            1.7802081, -2.516249, -1.7501148, 1.75, 3.39, 1.65,
+            1.6615927 + 0.13603681398218053 * 4 - np.pi
+        ],
+                      [
+                          8.959413, -2.4567227, -1.6357126, 1.54, 4.01, 1.57,
+                          1.5215927 + 0.13603681398218053 * 4 - np.pi
+                      ],
+                      [
+                          28.2967, 0.5557558, -1.303325, 1.47, 2.23, 1.48,
+                          4.7115927 + 0.13603681398218053 * 4 - np.pi
+                      ],
+                      [
+                          26.66902, -21.82302, -1.736057, 1.56, 3.48, 1.4,
+                          4.8315926 + 0.13603681398218053 * 4 - np.pi
+                      ],
+                      [
+                          31.31978, -8.162144, -1.6217787, 1.74, 3.77, 1.48,
+                          0.35159278 + 0.13603681398218053 * 4 - np.pi
+                      ]]), Box3DMode.LIDAR, Box3DMode.CAM)
+    points = boxes.flip('horizontal', points)
+    expected_points = torch.tensor([[-0.6762, 1.4658, 1.2559],
+                                    [-0.8784, 1.3857, 4.7814],
+                                    [0.2517, 0.9697, 6.7053],
+                                    [-0.5520, 0.5265, 0.6533],
+                                    [0.5358, 1.4741, 4.5870]])
+
+    yaw_normalized_tensor = boxes.tensor.clone()
+    yaw_normalized_tensor[:, -1:] = limit_period(
+        yaw_normalized_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(yaw_normalized_tensor, expected_tensor, 1e-3)
+    assert torch.allclose(points, expected_points, 1e-3)
+
+    expected_tensor = torch.tensor(
+        [[
+            2.5162, 1.7501, -1.7802, 1.7500, 1.6500, 3.3900,
+            1.6616 + 0.13603681398218053 * 4 - np.pi / 2
+        ],
+         [
+             2.4567, 1.6357, -8.9594, 1.5400, 1.5700, 4.0100,
+             1.5216 + 0.13603681398218053 * 4 - np.pi / 2
+         ],
+         [
+             -0.5558, 1.3033, -28.2967, 1.4700, 1.4800, 2.2300,
+             4.7116 + 0.13603681398218053 * 4 - np.pi / 2
+         ],
+         [
+             21.8230, 1.7361, -26.6690, 1.5600, 1.4000, 3.4800,
+             4.8316 + 0.13603681398218053 * 4 - np.pi / 2
+         ],
+         [
+             8.1621, 1.6218, -31.3198, 1.7400, 1.4800, 3.7700,
+             0.3516 + 0.13603681398218053 * 4 - np.pi / 2
+         ]])
+    boxes_flip_vert = boxes.clone()
+    points = boxes_flip_vert.flip('vertical', points)
+    expected_points = torch.tensor([[-0.6762, 1.4658, -1.2559],
+                                    [-0.8784, 1.3857, -4.7814],
+                                    [0.2517, 0.9697, -6.7053],
+                                    [-0.5520, 0.5265, -0.6533],
+                                    [0.5358, 1.4741, -4.5870]])
+
+    yaw_normalized_tensor = boxes_flip_vert.tensor.clone()
+    yaw_normalized_tensor[:, -1:] = limit_period(
+        yaw_normalized_tensor[:, -1:], period=np.pi * 2)
+    expected_tensor[:, -1:] = limit_period(
+        expected_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(yaw_normalized_tensor, expected_tensor, 1e-4)
+    assert torch.allclose(points, expected_points)
+
+    # test box rotation
+    # with input torch.Tensor points and angle
+    expected_tensor = Box3DMode.convert(
+        torch.tensor([[
+            1.4225, -2.7344, -1.7501, 1.7500, 3.3900, 1.6500,
+            1.7976 + 0.13603681398218053 * 2 - np.pi
+        ],
+                      [
+                          8.5435, -3.6491, -1.6357, 1.5400, 4.0100, 1.5700,
+                          1.6576 + 0.13603681398218053 * 2 - np.pi
+                      ],
+                      [
+                          28.1106, -3.2869, -1.3033, 1.4700, 2.2300, 1.4800,
+                          4.8476 + 0.13603681398218053 * 2 - np.pi
+                      ],
+                      [
+                          23.4630, -25.2382, -1.7361, 1.5600, 3.4800, 1.4000,
+                          4.9676 + 0.13603681398218053 * 2 - np.pi
+                      ],
+                      [
+                          29.9235, -12.3342, -1.6218, 1.7400, 3.7700, 1.4800,
+                          0.4876 + 0.13603681398218053 * 2 - np.pi
+                      ]]), Box3DMode.LIDAR, Box3DMode.CAM)
+    points, rot_mat_T = boxes.rotate(torch.tensor(0.13603681398218053), points)
+    expected_points = torch.tensor([[-0.8403, 1.4658, -1.1526],
+                                    [-1.5187, 1.3857, -4.6181],
+                                    [-0.6600, 0.9697, -6.6775],
+                                    [-0.6355, 0.5265, -0.5724],
+                                    [-0.0912, 1.4741, -4.6173]])
+    expected_rot_mat_T = torch.tensor([[0.9908, 0.0000, -0.1356],
+                                       [0.0000, 1.0000, 0.0000],
+                                       [0.1356, 0.0000, 0.9908]])
+    yaw_normalized_tensor = boxes.tensor.clone()
+    yaw_normalized_tensor[:, -1:] = limit_period(
+        yaw_normalized_tensor[:, -1:], period=np.pi * 2)
+    expected_tensor[:, -1:] = limit_period(
+        expected_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(yaw_normalized_tensor, expected_tensor, 1e-3)
+    assert torch.allclose(points, expected_points, 1e-3)
+    assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3)
+
+    # with input torch.Tensor points and rotation matrix
+    points, rot_mat_T = boxes.rotate(
+        torch.tensor(-0.13603681398218053), points)  # back
+    rot_mat = np.array([[0.99076125, 0., -0.13561762], [0., 1., 0.],
+                        [0.13561762, 0., 0.99076125]])
+    points, rot_mat_T = boxes.rotate(rot_mat, points)
+    yaw_normalized_tensor = boxes.tensor.clone()
+    yaw_normalized_tensor[:, -1:] = limit_period(
+        yaw_normalized_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(yaw_normalized_tensor, expected_tensor, 1e-3)
+    assert torch.allclose(points, expected_points, 1e-3)
+    assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3)
+
+    # with input np.ndarray points and angle
+    points_np = np.array([[0.6762, 1.2559, -1.4658, 2.5359],
+                          [0.8784, 4.7814, -1.3857, 0.7167],
+                          [-0.2517, 6.7053, -0.9697, 0.5599],
+                          [0.5520, 0.6533, -0.5265, 1.0032],
+                          [-0.5358, 4.5870, -1.4741, 0.0556]])
+    points_np, rot_mat_T_np = boxes.rotate(
+        torch.tensor(0.13603681398218053), points_np)
+    expected_points_np = np.array([[0.4712, 1.2559, -1.5440, 2.5359],
+                                   [0.6824, 4.7814, -1.4920, 0.7167],
+                                   [-0.3809, 6.7053, -0.9266, 0.5599],
+                                   [0.4755, 0.6533, -0.5965, 1.0032],
+                                   [-0.7308, 4.5870, -1.3878, 0.0556]])
+    expected_rot_mat_T_np = np.array([[0.9908, 0.0000, -0.1356],
+                                      [0.0000, 1.0000, 0.0000],
+                                      [0.1356, 0.0000, 0.9908]])
+
+    assert np.allclose(points_np, expected_points_np, 1e-3)
+    assert np.allclose(rot_mat_T_np, expected_rot_mat_T_np, 1e-3)
+
+    # with input CameraPoints and rotation matrix
+    points_np, rot_mat_T_np = boxes.rotate(
+        torch.tensor(-0.13603681398218053), points_np)
+    camera_points = CameraPoints(points_np, points_dim=4)
+    camera_points, rot_mat_T_np = boxes.rotate(rot_mat, camera_points)
+    points_np = camera_points.numpy()
+    assert np.allclose(points_np, expected_points_np, 1e-3)
+    assert np.allclose(rot_mat_T_np, expected_rot_mat_T_np, 1e-3)
+
+    # test box scaling
+    expected_tensor = Box3DMode.convert(
+        torch.tensor([[
+            1.0443488, -2.9183323, -1.7599131, 1.7597977, 3.4089797, 1.6592377,
+            1.9336663 - np.pi
+        ],
+                      [
+                          8.014273, -4.8007393, -1.6448704, 1.5486219,
+                          4.0324507, 1.57879, 1.7936664 - np.pi
+                      ],
+                      [
+                          27.558605, -7.1084175, -1.310622, 1.4782301,
+                          2.242485, 1.488286, 4.9836664 - np.pi
+                      ],
+                      [
+                          19.934517, -28.344835, -1.7457767, 1.5687338,
+                          3.4994833, 1.4078381, 5.1036663 - np.pi
+                      ],
+                      [
+                          28.130915, -16.369587, -1.6308585, 1.7497417,
+                          3.791107, 1.488286, 0.6236664 - np.pi
+                      ]]), Box3DMode.LIDAR, Box3DMode.CAM)
+    boxes.scale(1.00559866335275)
+    yaw_normalized_tensor = boxes.tensor.clone()
+    yaw_normalized_tensor[:, -1:] = limit_period(
+        yaw_normalized_tensor[:, -1:], period=np.pi * 2)
+    expected_tensor[:, -1:] = limit_period(
+        expected_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(yaw_normalized_tensor, expected_tensor)
+
+    # test box translation
+    expected_tensor = Box3DMode.convert(
+        torch.tensor([[
+            1.1281544, -3.0507944, -1.9169292, 1.7597977, 3.4089797, 1.6592377,
+            1.9336663 - np.pi
+        ],
+                      [
+                          8.098079, -4.9332013, -1.8018866, 1.5486219,
+                          4.0324507, 1.57879, 1.7936664 - np.pi
+                      ],
+                      [
+                          27.64241, -7.2408795, -1.4676381, 1.4782301,
+                          2.242485, 1.488286, 4.9836664 - np.pi
+                      ],
+                      [
+                          20.018322, -28.477297, -1.9027928, 1.5687338,
+                          3.4994833, 1.4078381, 5.1036663 - np.pi
+                      ],
+                      [
+                          28.21472, -16.502048, -1.7878747, 1.7497417,
+                          3.791107, 1.488286, 0.6236664 - np.pi
+                      ]]), Box3DMode.LIDAR, Box3DMode.CAM)
+    boxes.translate(torch.tensor([0.13246193, 0.15701613, 0.0838056]))
+    yaw_normalized_tensor = boxes.tensor.clone()
+    yaw_normalized_tensor[:, -1:] = limit_period(
+        yaw_normalized_tensor[:, -1:], period=np.pi * 2)
+    expected_tensor[:, -1:] = limit_period(
+        expected_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(yaw_normalized_tensor, expected_tensor)
+
+    # test bbox in_range_bev
+    expected_tensor = torch.tensor([1, 1, 1, 1, 1], dtype=torch.bool)
+    mask = boxes.in_range_bev([0., -40., 70.4, 40.])
+    assert (mask == expected_tensor).all()
+    mask = boxes.nonempty()
+    assert (mask == expected_tensor).all()
+
+    # test bbox in_range
+    expected_tensor = torch.tensor([1, 1, 0, 0, 0], dtype=torch.bool)
+    mask = boxes.in_range_3d([-2, -5, 0, 20, 2, 22])
+    assert (mask == expected_tensor).all()
+
+    expected_tensor = torch.tensor(
+        [[3.0508, 1.1282, 1.7598, 3.4090, -5.9203],
+         [4.9332, 8.0981, 1.5486, 4.0325, -6.0603],
+         [7.2409, 27.6424, 1.4782, 2.2425, -2.8703],
+         [28.4773, 20.0183, 1.5687, 3.4995, -2.7503],
+         [16.5020, 28.2147, 1.7497, 3.7911, -0.9471]])
+    assert torch.allclose(boxes.bev, expected_tensor, atol=1e-3)
+
+    # test properties
+    assert torch.allclose(boxes.bottom_center, boxes.tensor[:, :3])
+    expected_tensor = (
+        boxes.tensor[:, :3] - boxes.tensor[:, 3:6] *
+        (torch.tensor([0.5, 1.0, 0.5]) - torch.tensor([0.5, 0.5, 0.5])))
+    assert torch.allclose(boxes.gravity_center, expected_tensor)
+
+    boxes.limit_yaw()
+    assert (boxes.tensor[:, 6] <= np.pi / 2).all()
+    assert (boxes.tensor[:, 6] >= -np.pi / 2).all()
+
+    Box3DMode.convert(boxes, Box3DMode.LIDAR, Box3DMode.LIDAR)
+    expected_tensor = boxes.tensor.clone()
+    assert torch.allclose(expected_tensor, boxes.tensor)
+
+    boxes.flip()
+    boxes.flip()
+    boxes.limit_yaw()
+    assert torch.allclose(expected_tensor, boxes.tensor)
+
+    # test nearest_bev
+    # BEV box in lidar coordinates (x, y)
+    lidar_expected_tensor = torch.tensor(
+        [[-0.5763, -3.9307, 2.8326, -2.1709],
+         [6.0819, -5.7075, 10.1143, -4.1589],
+         [26.5212, -7.9800, 28.7637, -6.5018],
+         [18.2686, -29.2617, 21.7681, -27.6929],
+         [27.3398, -18.3976, 29.0896, -14.6065]])
+    # BEV box in camera coordinate (-y, x)
+    expected_tensor = lidar_expected_tensor.clone()
+    expected_tensor[:, 0::2] = -lidar_expected_tensor[:, [3, 1]]
+    expected_tensor[:, 1::2] = lidar_expected_tensor[:, 0::2]
+    assert torch.allclose(
+        boxes.nearest_bev, expected_tensor, rtol=1e-4, atol=1e-7)
+
+    expected_tensor = torch.tensor([[[2.8332e+00, 2.5769e-01, -7.7767e-01],
+                                     [1.6232e+00, 2.5769e-01, 2.4093e+00],
+                                     [1.6232e+00, 1.9169e+00, 2.4093e+00],
+                                     [2.8332e+00, 1.9169e+00, -7.7767e-01],
+                                     [4.4784e+00, 2.5769e-01, -1.5302e-01],
+                                     [3.2684e+00, 2.5769e-01, 3.0340e+00],
+                                     [3.2684e+00, 1.9169e+00, 3.0340e+00],
+                                     [4.4784e+00, 1.9169e+00, -1.5302e-01]],
+                                    [[4.6237e+00, 2.2310e-01, 5.9606e+00],
+                                     [3.7324e+00, 2.2310e-01, 9.8933e+00],
+                                     [3.7324e+00, 1.8019e+00, 9.8933e+00],
+                                     [4.6237e+00, 1.8019e+00, 5.9606e+00],
+                                     [6.1340e+00, 2.2310e-01, 6.3029e+00],
+                                     [5.2427e+00, 2.2310e-01, 1.0236e+01],
+                                     [5.2427e+00, 1.8019e+00, 1.0236e+01],
+                                     [6.1340e+00, 1.8019e+00, 6.3029e+00]],
+                                    [[6.8292e+00, -2.0648e-02, 2.6364e+01],
+                                     [6.2283e+00, -2.0648e-02, 2.8525e+01],
+                                     [6.2283e+00, 1.4676e+00, 2.8525e+01],
+                                     [6.8292e+00, 1.4676e+00, 2.6364e+01],
+                                     [8.2534e+00, -2.0648e-02, 2.6760e+01],
+                                     [7.6525e+00, -2.0648e-02, 2.8921e+01],
+                                     [7.6525e+00, 1.4676e+00, 2.8921e+01],
+                                     [8.2534e+00, 1.4676e+00, 2.6760e+01]],
+                                    [[2.8420e+01, 4.9495e-01, 1.8102e+01],
+                                     [2.7085e+01, 4.9495e-01, 2.1337e+01],
+                                     [2.7085e+01, 1.9028e+00, 2.1337e+01],
+                                     [2.8420e+01, 1.9028e+00, 1.8102e+01],
+                                     [2.9870e+01, 4.9495e-01, 1.8700e+01],
+                                     [2.8535e+01, 4.9495e-01, 2.1935e+01],
+                                     [2.8535e+01, 1.9028e+00, 2.1935e+01],
+                                     [2.9870e+01, 1.9028e+00, 1.8700e+01]],
+                                    [[1.4452e+01, 2.9959e-01, 2.7818e+01],
+                                     [1.7530e+01, 2.9959e-01, 3.0032e+01],
+                                     [1.7530e+01, 1.7879e+00, 3.0032e+01],
+                                     [1.4452e+01, 1.7879e+00, 2.7818e+01],
+                                     [1.5474e+01, 2.9959e-01, 2.6398e+01],
+                                     [1.8552e+01, 2.9959e-01, 2.8612e+01],
+                                     [1.8552e+01, 1.7879e+00, 2.8612e+01],
+                                     [1.5474e+01, 1.7879e+00, 2.6398e+01]]])
+
+    assert torch.allclose(boxes.corners, expected_tensor, rtol=1e-3, atol=1e-4)
+
+    th_boxes = torch.tensor(
+        [[
+            28.29669987, -0.5557558, -1.30332506, 1.47000003, 2.23000002,
+            1.48000002, -1.57000005
+        ],
+         [
+             26.66901946, 21.82302134, -1.73605708, 1.55999994, 3.48000002,
+             1.39999998, -1.69000006
+         ],
+         [
+             31.31977974, 8.16214412, -1.62177875, 1.74000001, 3.76999998,
+             1.48000002, 2.78999996
+         ]],
+        dtype=torch.float32)
+
+    # test init with a given origin
+    boxes_origin_given = CameraInstance3DBoxes(
+        th_boxes.clone(), box_dim=7, origin=(0.5, 0.5, 0.5))
+    expected_tensor = th_boxes.clone()
+    expected_tensor[:, :3] = th_boxes[:, :3] + th_boxes[:, 3:6] * (
+        th_boxes.new_tensor((0.5, 1.0, 0.5)) - th_boxes.new_tensor(
+            (0.5, 0.5, 0.5)))
+    assert torch.allclose(boxes_origin_given.tensor, expected_tensor)
+
+
+def test_boxes3d_overlaps():
+    """Test the iou calculation of boxes in different modes.
+
+    CommandLine:
+        xdoctest tests/test_box3d.py::test_boxes3d_overlaps zero
+    """
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+
+    # Test LiDAR boxes 3D overlaps
+    boxes1_tensor = torch.tensor(
+        [[1.8, -2.5, -1.8, 1.75, 3.39, 1.65, -1.6615927],
+         [8.9, -2.5, -1.6, 1.54, 4.01, 1.57, -1.5215927],
+         [28.3, 0.5, -1.3, 1.47, 2.23, 1.48, -4.7115927],
+         [31.3, -8.2, -1.6, 1.74, 3.77, 1.48, -0.35]],
+        device='cuda')
+    boxes1 = LiDARInstance3DBoxes(boxes1_tensor)
+
+    boxes2_tensor = torch.tensor([[1.2, -3.0, -1.9, 1.8, 3.4, 1.7, -1.9],
+                                  [8.1, -2.9, -1.8, 1.5, 4.1, 1.6, -1.8],
+                                  [31.3, -8.2, -1.6, 1.74, 3.77, 1.48, -0.35],
+                                  [20.1, -28.5, -1.9, 1.6, 3.5, 1.4, -5.1]],
+                                 device='cuda')
+    boxes2 = LiDARInstance3DBoxes(boxes2_tensor)
+
+    expected_iou_tensor = torch.tensor(
+        [[0.3710, 0.0000, 0.0000, 0.0000], [0.0000, 0.3322, 0.0000, 0.0000],
+         [0.0000, 0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 1.0000, 0.0000]],
+        device='cuda')
+    overlaps_3d_iou = boxes1.overlaps(boxes1, boxes2)
+    assert torch.allclose(
+        expected_iou_tensor, overlaps_3d_iou, rtol=1e-4, atol=1e-7)
+
+    expected_iof_tensor = torch.tensor(
+        [[0.5582, 0.0000, 0.0000, 0.0000], [0.0000, 0.5025, 0.0000, 0.0000],
+         [0.0000, 0.0000, 0.0000, 0.0000], [0.0000, 0.0000, 1.0000, 0.0000]],
+        device='cuda')
+    overlaps_3d_iof = boxes1.overlaps(boxes1, boxes2, mode='iof')
+    assert torch.allclose(
+        expected_iof_tensor, overlaps_3d_iof, rtol=1e-4, atol=1e-7)
+
+    empty_boxes = []
+    boxes3 = LiDARInstance3DBoxes(empty_boxes)
+    overlaps_3d_empty = boxes1.overlaps(boxes3, boxes2)
+    assert overlaps_3d_empty.shape[0] == 0
+    assert overlaps_3d_empty.shape[1] == 4
+    # Test camera boxes 3D overlaps
+    cam_boxes1_tensor = Box3DMode.convert(boxes1_tensor, Box3DMode.LIDAR,
+                                          Box3DMode.CAM)
+    cam_boxes1 = CameraInstance3DBoxes(cam_boxes1_tensor)
+
+    cam_boxes2_tensor = Box3DMode.convert(boxes2_tensor, Box3DMode.LIDAR,
+                                          Box3DMode.CAM)
+    cam_boxes2 = CameraInstance3DBoxes(cam_boxes2_tensor)
+    cam_overlaps_3d = cam_boxes1.overlaps(cam_boxes1, cam_boxes2)
+
+    # same boxes under different coordinates should have the same iou
+    assert torch.allclose(
+        expected_iou_tensor, cam_overlaps_3d, rtol=1e-3, atol=1e-4)
+    assert torch.allclose(
+        cam_overlaps_3d, overlaps_3d_iou, rtol=1e-3, atol=1e-4)
+
+    with pytest.raises(AssertionError):
+        cam_boxes1.overlaps(cam_boxes1, boxes1)
+    with pytest.raises(AssertionError):
+        boxes1.overlaps(cam_boxes1, boxes1)
+
+
+def test_depth_boxes3d():
+    # test empty initialization
+    empty_boxes = []
+    boxes = DepthInstance3DBoxes(empty_boxes)
+    assert boxes.tensor.shape[0] == 0
+    assert boxes.tensor.shape[1] == 7
+
+    # Test init with numpy array
+    np_boxes = np.array(
+        [[1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601],
+         [2.3262, 3.3065, --0.44255, 0.8234, 0.5325, 1.0099, 2.9971]],
+        dtype=np.float32)
+    boxes_1 = DepthInstance3DBoxes(np_boxes)
+    assert torch.allclose(boxes_1.tensor, torch.from_numpy(np_boxes))
+
+    # test properties
+
+    assert boxes_1.volume.size(0) == 2
+    assert (boxes_1.center == boxes_1.bottom_center).all()
+    expected_tensor = torch.tensor([[1.4856, 2.5299, -0.1093],
+                                    [2.3262, 3.3065, 0.9475]])
+    assert torch.allclose(boxes_1.gravity_center, expected_tensor)
+    expected_tensor = torch.tensor([[1.4856, 2.5299, 0.9385, 2.1404, 3.0601],
+                                    [2.3262, 3.3065, 0.8234, 0.5325, 2.9971]])
+    assert torch.allclose(boxes_1.bev, expected_tensor)
+    expected_tensor = torch.tensor([[1.0164, 1.4597, 1.9548, 3.6001],
+                                    [1.9145, 3.0402, 2.7379, 3.5728]])
+    assert torch.allclose(boxes_1.nearest_bev, expected_tensor, 1e-4)
+    assert repr(boxes) == (
+        'DepthInstance3DBoxes(\n    tensor([], size=(0, 7)))')
+
+    # test init with torch.Tensor
+    th_boxes = torch.tensor(
+        [[2.4593, 2.5870, -0.4321, 0.8597, 0.6193, 1.0204, 3.0693],
+         [1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601]],
+        dtype=torch.float32)
+    boxes_2 = DepthInstance3DBoxes(th_boxes)
+    assert torch.allclose(boxes_2.tensor, th_boxes)
+
+    # test clone/to/device
+    boxes_2 = boxes_2.clone()
+    boxes_1 = boxes_1.to(boxes_2.device)
+
+    # test box concatenation
+    expected_tensor = torch.tensor(
+        [[1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601],
+         [2.3262, 3.3065, 0.44255, 0.8234, 0.5325, 1.0099, 2.9971],
+         [2.4593, 2.5870, -0.4321, 0.8597, 0.6193, 1.0204, 3.0693],
+         [1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 3.0601]])
+    boxes = DepthInstance3DBoxes.cat([boxes_1, boxes_2])
+    assert torch.allclose(boxes.tensor, expected_tensor)
+    # concatenate empty list
+    empty_boxes = DepthInstance3DBoxes.cat([])
+    assert empty_boxes.tensor.shape[0] == 0
+    assert empty_boxes.tensor.shape[-1] == 7
+
+    # test box flip
+    points = torch.tensor([[0.6762, 1.2559, -1.4658, 2.5359],
+                           [0.8784, 4.7814, -1.3857, 0.7167],
+                           [-0.2517, 6.7053, -0.9697, 0.5599],
+                           [0.5520, 0.6533, -0.5265, 1.0032],
+                           [-0.5358, 4.5870, -1.4741, 0.0556]])
+    expected_tensor = torch.tensor(
+        [[-1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 0.0815],
+         [-2.3262, 3.3065, 0.4426, 0.8234, 0.5325, 1.0099, 0.1445],
+         [-2.4593, 2.5870, -0.4321, 0.8597, 0.6193, 1.0204, 0.0723],
+         [-1.4856, 2.5299, -0.5570, 0.9385, 2.1404, 0.8954, 0.0815]])
+    points = boxes.flip(bev_direction='horizontal', points=points)
+    expected_points = torch.tensor([[-0.6762, 1.2559, -1.4658, 2.5359],
+                                    [-0.8784, 4.7814, -1.3857, 0.7167],
+                                    [0.2517, 6.7053, -0.9697, 0.5599],
+                                    [-0.5520, 0.6533, -0.5265, 1.0032],
+                                    [0.5358, 4.5870, -1.4741, 0.0556]])
+    assert torch.allclose(boxes.tensor, expected_tensor, 1e-3)
+    assert torch.allclose(points, expected_points)
+    expected_tensor = torch.tensor(
+        [[-1.4856, -2.5299, -0.5570, 0.9385, 2.1404, 0.8954, -0.0815],
+         [-2.3262, -3.3065, 0.4426, 0.8234, 0.5325, 1.0099, -0.1445],
+         [-2.4593, -2.5870, -0.4321, 0.8597, 0.6193, 1.0204, -0.0723],
+         [-1.4856, -2.5299, -0.5570, 0.9385, 2.1404, 0.8954, -0.0815]])
+    points = boxes.flip(bev_direction='vertical', points=points)
+    expected_points = torch.tensor([[-0.6762, -1.2559, -1.4658, 2.5359],
+                                    [-0.8784, -4.7814, -1.3857, 0.7167],
+                                    [0.2517, -6.7053, -0.9697, 0.5599],
+                                    [-0.5520, -0.6533, -0.5265, 1.0032],
+                                    [0.5358, -4.5870, -1.4741, 0.0556]])
+    assert torch.allclose(boxes.tensor, expected_tensor, 1e-3)
+    assert torch.allclose(points, expected_points)
+
+    # test box rotation
+    # with input torch.Tensor points and angle
+    boxes_rot = boxes.clone()
+    expected_tensor = torch.tensor(
+        [[-1.5434, -2.4951, -0.5570, 0.9385, 2.1404, 0.8954, -0.0585],
+         [-2.4016, -3.2521, 0.4426, 0.8234, 0.5325, 1.0099, -0.1215],
+         [-2.5181, -2.5298, -0.4321, 0.8597, 0.6193, 1.0204, -0.0493],
+         [-1.5434, -2.4951, -0.5570, 0.9385, 2.1404, 0.8954, -0.0585]])
+    expected_tensor[:, -1:] -= 0.022998953275003075 * 2
+    points, rot_mat_T = boxes_rot.rotate(-0.022998953275003075, points)
+    expected_points = torch.tensor([[-0.7049, -1.2400, -1.4658, 2.5359],
+                                    [-0.9881, -4.7599, -1.3857, 0.7167],
+                                    [0.0974, -6.7093, -0.9697, 0.5599],
+                                    [-0.5669, -0.6404, -0.5265, 1.0032],
+                                    [0.4302, -4.5981, -1.4741, 0.0556]])
+    expected_rot_mat_T = torch.tensor([[0.9997, -0.0230, 0.0000],
+                                       [0.0230, 0.9997, 0.0000],
+                                       [0.0000, 0.0000, 1.0000]])
+    assert torch.allclose(boxes_rot.tensor, expected_tensor, 1e-3)
+    assert torch.allclose(points, expected_points, 1e-3)
+    assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3)
+
+    # with input torch.Tensor points and rotation matrix
+    points, rot_mat_T = boxes.rotate(-0.022998953275003075, points)  # back
+    rot_mat = np.array([[0.99973554, 0.02299693, 0.],
+                        [-0.02299693, 0.99973554, 0.], [0., 0., 1.]])
+    points, rot_mat_T = boxes.rotate(rot_mat, points)
+    expected_rot_mat_T = torch.tensor([[0.99973554, 0.02299693, 0.0000],
+                                       [-0.02299693, 0.99973554, 0.0000],
+                                       [0.0000, 0.0000, 1.0000]])
+    assert torch.allclose(boxes_rot.tensor, expected_tensor, 1e-3)
+    assert torch.allclose(points, expected_points, 1e-3)
+    assert torch.allclose(rot_mat_T, expected_rot_mat_T, 1e-3)
+
+    # with input np.ndarray points and angle
+    points_np = np.array([[0.6762, 1.2559, -1.4658, 2.5359],
+                          [0.8784, 4.7814, -1.3857, 0.7167],
+                          [-0.2517, 6.7053, -0.9697, 0.5599],
+                          [0.5520, 0.6533, -0.5265, 1.0032],
+                          [-0.5358, 4.5870, -1.4741, 0.0556]])
+    points_np, rot_mat_T_np = boxes.rotate(-0.022998953275003075, points_np)
+    expected_points_np = np.array([[0.7049, 1.2400, -1.4658, 2.5359],
+                                   [0.9881, 4.7599, -1.3857, 0.7167],
+                                   [-0.0974, 6.7093, -0.9697, 0.5599],
+                                   [0.5669, 0.6404, -0.5265, 1.0032],
+                                   [-0.4302, 4.5981, -1.4741, 0.0556]])
+    expected_rot_mat_T_np = np.array([[0.99973554, -0.02299693, 0.0000],
+                                      [0.02299693, 0.99973554, 0.0000],
+                                      [0.0000, 0.0000, 1.0000]])
+    expected_tensor = torch.tensor(
+        [[-1.5434, -2.4951, -0.5570, 0.9385, 2.1404, 0.8954, -0.0585],
+         [-2.4016, -3.2521, 0.4426, 0.8234, 0.5325, 1.0099, -0.1215],
+         [-2.5181, -2.5298, -0.4321, 0.8597, 0.6193, 1.0204, -0.0493],
+         [-1.5434, -2.4951, -0.5570, 0.9385, 2.1404, 0.8954, -0.0585]])
+    expected_tensor[:, -1:] -= 0.022998953275003075 * 2
+    assert torch.allclose(boxes.tensor, expected_tensor, 1e-3)
+    assert np.allclose(points_np, expected_points_np, 1e-3)
+    assert np.allclose(rot_mat_T_np, expected_rot_mat_T_np, 1e-3)
+
+    # with input DepthPoints and rotation matrix
+    points_np, rot_mat_T_np = boxes.rotate(-0.022998953275003075, points_np)
+    depth_points = DepthPoints(points_np, points_dim=4)
+    depth_points, rot_mat_T_np = boxes.rotate(rot_mat, depth_points)
+    points_np = depth_points.numpy()
+    expected_rot_mat_T_np = expected_rot_mat_T_np.T
+    assert torch.allclose(boxes.tensor, expected_tensor, 1e-3)
+    assert np.allclose(points_np, expected_points_np, 1e-3)
+    assert np.allclose(rot_mat_T_np, expected_rot_mat_T_np, 1e-3)
+
+    expected_tensor = torch.tensor([[[-2.1217, -3.5105, -0.5570],
+                                     [-2.1217, -3.5105, 0.3384],
+                                     [-1.8985, -1.3818, 0.3384],
+                                     [-1.8985, -1.3818, -0.5570],
+                                     [-1.1883, -3.6084, -0.5570],
+                                     [-1.1883, -3.6084, 0.3384],
+                                     [-0.9651, -1.4796, 0.3384],
+                                     [-0.9651, -1.4796, -0.5570]],
+                                    [[-2.8519, -3.4460, 0.4426],
+                                     [-2.8519, -3.4460, 1.4525],
+                                     [-2.7632, -2.9210, 1.4525],
+                                     [-2.7632, -2.9210, 0.4426],
+                                     [-2.0401, -3.5833, 0.4426],
+                                     [-2.0401, -3.5833, 1.4525],
+                                     [-1.9513, -3.0582, 1.4525],
+                                     [-1.9513, -3.0582, 0.4426]],
+                                    [[-2.9755, -2.7971, -0.4321],
+                                     [-2.9755, -2.7971, 0.5883],
+                                     [-2.9166, -2.1806, 0.5883],
+                                     [-2.9166, -2.1806, -0.4321],
+                                     [-2.1197, -2.8789, -0.4321],
+                                     [-2.1197, -2.8789, 0.5883],
+                                     [-2.0608, -2.2624, 0.5883],
+                                     [-2.0608, -2.2624, -0.4321]],
+                                    [[-2.1217, -3.5105, -0.5570],
+                                     [-2.1217, -3.5105, 0.3384],
+                                     [-1.8985, -1.3818, 0.3384],
+                                     [-1.8985, -1.3818, -0.5570],
+                                     [-1.1883, -3.6084, -0.5570],
+                                     [-1.1883, -3.6084, 0.3384],
+                                     [-0.9651, -1.4796, 0.3384],
+                                     [-0.9651, -1.4796, -0.5570]]])
+
+    assert torch.allclose(boxes.corners, expected_tensor, 1e-3)
+
+    th_boxes = torch.tensor(
+        [[0.61211395, 0.8129094, 0.10563634, 1.497534, 0.16927195, 0.27956772],
+         [1.430009, 0.49797538, 0.9382923, 0.07694054, 0.9312509, 1.8919173]],
+        dtype=torch.float32)
+    boxes = DepthInstance3DBoxes(th_boxes, box_dim=6, with_yaw=False)
+    expected_tensor = torch.tensor([[
+        0.64884546, 0.78390356, 0.10563634, 1.50373348, 0.23795205, 0.27956772,
+        0
+    ],
+                                    [
+                                        1.45139421, 0.43169443, 0.93829232,
+                                        0.11967964, 0.93380373, 1.89191735, 0
+                                    ]])
+    boxes_3 = boxes.clone()
+    boxes_3.rotate(-0.04599790655000615)
+    assert torch.allclose(boxes_3.tensor, expected_tensor)
+    boxes.rotate(torch.tensor(-0.04599790655000615))
+    assert torch.allclose(boxes.tensor, expected_tensor)
+
+    # test bbox in_range_bev
+    expected_tensor = torch.tensor([1, 1], dtype=torch.bool)
+    mask = boxes.in_range_bev([0., -40., 70.4, 40.])
+    assert (mask == expected_tensor).all()
+    mask = boxes.nonempty()
+    assert (mask == expected_tensor).all()
+
+    # test bbox in_range
+    expected_tensor = torch.tensor([0, 1], dtype=torch.bool)
+    mask = boxes.in_range_3d([1, 0, -2, 2, 1, 5])
+    assert (mask == expected_tensor).all()
+
+    expected_tensor = torch.tensor([[[-0.1030, 0.6649, 0.1056],
+                                     [-0.1030, 0.6649, 0.3852],
+                                     [-0.1030, 0.9029, 0.3852],
+                                     [-0.1030, 0.9029, 0.1056],
+                                     [1.4007, 0.6649, 0.1056],
+                                     [1.4007, 0.6649, 0.3852],
+                                     [1.4007, 0.9029, 0.3852],
+                                     [1.4007, 0.9029, 0.1056]],
+                                    [[1.3916, -0.0352, 0.9383],
+                                     [1.3916, -0.0352, 2.8302],
+                                     [1.3916, 0.8986, 2.8302],
+                                     [1.3916, 0.8986, 0.9383],
+                                     [1.5112, -0.0352, 0.9383],
+                                     [1.5112, -0.0352, 2.8302],
+                                     [1.5112, 0.8986, 2.8302],
+                                     [1.5112, 0.8986, 0.9383]]])
+    assert torch.allclose(boxes.corners, expected_tensor, 1e-3)
+
+    # test points in boxes
+    if torch.cuda.is_available():
+        box_idxs_of_pts = boxes.points_in_boxes_all(points.cuda())
+        expected_idxs_of_pts = torch.tensor(
+            [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
+            device='cuda:0',
+            dtype=torch.int32)
+        assert torch.all(box_idxs_of_pts == expected_idxs_of_pts)
+
+    # test get_surface_line_center
+    boxes = torch.tensor(
+        [[0.3294, 1.0359, 0.1171, 1.0822, 1.1247, 1.3721, -0.4916],
+         [-2.4630, -2.6324, -0.1616, 0.9202, 1.7896, 0.1992, -0.3185]])
+    boxes = DepthInstance3DBoxes(
+        boxes, box_dim=boxes.shape[-1], with_yaw=True, origin=(0.5, 0.5, 0.5))
+    surface_center, line_center = boxes.get_surface_line_center()
+
+    expected_surface_center = torch.tensor([[0.3294, 1.0359, 0.8031],
+                                            [0.3294, 1.0359, -0.5689],
+                                            [0.5949, 1.5317, 0.1171],
+                                            [0.1533, 0.5018, 0.1171],
+                                            [0.8064, 0.7805, 0.1171],
+                                            [-0.1845, 1.2053, 0.1171],
+                                            [-2.4630, -2.6324, -0.0620],
+                                            [-2.4630, -2.6324, -0.2612],
+                                            [-2.0406, -1.8436, -0.1616],
+                                            [-2.7432, -3.4822, -0.1616],
+                                            [-2.0574, -2.8496, -0.1616],
+                                            [-2.9000, -2.4883, -0.1616]])
+
+    expected_line_center = torch.tensor([[0.8064, 0.7805, 0.8031],
+                                         [-0.1845, 1.2053, 0.8031],
+                                         [0.5949, 1.5317, 0.8031],
+                                         [0.1533, 0.5018, 0.8031],
+                                         [0.8064, 0.7805, -0.5689],
+                                         [-0.1845, 1.2053, -0.5689],
+                                         [0.5949, 1.5317, -0.5689],
+                                         [0.1533, 0.5018, -0.5689],
+                                         [1.0719, 1.2762, 0.1171],
+                                         [0.6672, 0.3324, 0.1171],
+                                         [0.1178, 1.7871, 0.1171],
+                                         [-0.3606, 0.6713, 0.1171],
+                                         [-2.0574, -2.8496, -0.0620],
+                                         [-2.9000, -2.4883, -0.0620],
+                                         [-2.0406, -1.8436, -0.0620],
+                                         [-2.7432, -3.4822, -0.0620],
+                                         [-2.0574, -2.8496, -0.2612],
+                                         [-2.9000, -2.4883, -0.2612],
+                                         [-2.0406, -1.8436, -0.2612],
+                                         [-2.7432, -3.4822, -0.2612],
+                                         [-1.6350, -2.0607, -0.1616],
+                                         [-2.3062, -3.6263, -0.1616],
+                                         [-2.4462, -1.6264, -0.1616],
+                                         [-3.1802, -3.3381, -0.1616]])
+
+    assert torch.allclose(surface_center, expected_surface_center, atol=1e-04)
+    assert torch.allclose(line_center, expected_line_center, atol=1e-04)
+
+
+def test_rotation_3d_in_axis():
+    # clockwise
+    points = torch.tensor([[[-0.4599, -0.0471, 0.0000],
+                            [-0.4599, -0.0471, 1.8433],
+                            [-0.4599, 0.0471, 1.8433]],
+                           [[-0.2555, -0.2683, 0.0000],
+                            [-0.2555, -0.2683, 0.9072],
+                            [-0.2555, 0.2683, 0.9072]]])
+    rotated = rotation_3d_in_axis(
+        points,
+        torch.tensor([-np.pi / 10, np.pi / 10]),
+        axis=0,
+        clockwise=True)
+    expected_rotated = torch.tensor(
+        [[[-0.4599, -0.0448, -0.0146], [-0.4599, -0.6144, 1.7385],
+          [-0.4599, -0.5248, 1.7676]],
+         [[-0.2555, -0.2552, 0.0829], [-0.2555, 0.0252, 0.9457],
+          [-0.2555, 0.5355, 0.7799]]],
+        dtype=torch.float32)
+    assert torch.allclose(rotated, expected_rotated, atol=1e-3)
+
+    # anti-clockwise with return rotation mat
+    points = torch.tensor([[[-0.4599, -0.0471, 0.0000],
+                            [-0.4599, -0.0471, 1.8433]]])
+    rotated = rotation_3d_in_axis(points, torch.tensor([np.pi / 2]), axis=0)
+    expected_rotated = torch.tensor([[[-0.4599, 0.0000, -0.0471],
+                                      [-0.4599, -1.8433, -0.0471]]])
+    assert torch.allclose(rotated, expected_rotated, 1e-3)
+
+    points = torch.tensor([[[-0.4599, -0.0471, 0.0000],
+                            [-0.4599, -0.0471, 1.8433]]])
+    rotated, mat = rotation_3d_in_axis(
+        points, torch.tensor([np.pi / 2]), axis=0, return_mat=True)
+    expected_rotated = torch.tensor([[[-0.4599, 0.0000, -0.0471],
+                                      [-0.4599, -1.8433, -0.0471]]])
+    expected_mat = torch.tensor([[[1, 0, 0], [0, 0, 1], [0, -1, 0]]]).float()
+    assert torch.allclose(rotated, expected_rotated, atol=1e-6)
+    assert torch.allclose(mat, expected_mat, atol=1e-6)
+
+    points = torch.tensor([[[-0.4599, -0.0471, 0.0000],
+                            [-0.4599, -0.0471, 1.8433]],
+                           [[-0.2555, -0.2683, 0.0000],
+                            [-0.2555, -0.2683, 0.9072]]])
+    rotated = rotation_3d_in_axis(points, np.pi / 2, axis=0)
+    expected_rotated = torch.tensor([[[-0.4599, 0.0000, -0.0471],
+                                      [-0.4599, -1.8433, -0.0471]],
+                                     [[-0.2555, 0.0000, -0.2683],
+                                      [-0.2555, -0.9072, -0.2683]]])
+    assert torch.allclose(rotated, expected_rotated, atol=1e-3)
+
+    points = np.array([[[-0.4599, -0.0471, 0.0000], [-0.4599, -0.0471,
+                                                     1.8433]],
+                       [[-0.2555, -0.2683, 0.0000],
+                        [-0.2555, -0.2683, 0.9072]]]).astype(np.float32)
+
+    rotated = rotation_3d_in_axis(points, np.pi / 2, axis=0)
+    expected_rotated = np.array([[[-0.4599, 0.0000, -0.0471],
+                                  [-0.4599, -1.8433, -0.0471]],
+                                 [[-0.2555, 0.0000, -0.2683],
+                                  [-0.2555, -0.9072, -0.2683]]])
+    assert np.allclose(rotated, expected_rotated, atol=1e-3)
+
+    points = torch.tensor([[[-0.4599, -0.0471, 0.0000],
+                            [-0.4599, -0.0471, 1.8433]],
+                           [[-0.2555, -0.2683, 0.0000],
+                            [-0.2555, -0.2683, 0.9072]]])
+    angles = [np.pi / 2, -np.pi / 2]
+    rotated = rotation_3d_in_axis(points, angles, axis=0).numpy()
+    expected_rotated = np.array([[[-0.4599, 0.0000, -0.0471],
+                                  [-0.4599, -1.8433, -0.0471]],
+                                 [[-0.2555, 0.0000, 0.2683],
+                                  [-0.2555, 0.9072, 0.2683]]])
+    assert np.allclose(rotated, expected_rotated, atol=1e-3)
+
+    points = torch.tensor([[[-0.4599, -0.0471, 0.0000],
+                            [-0.4599, -0.0471, 1.8433]],
+                           [[-0.2555, -0.2683, 0.0000],
+                            [-0.2555, -0.2683, 0.9072]]])
+    angles = [np.pi / 2, -np.pi / 2]
+    rotated = rotation_3d_in_axis(points, angles, axis=1).numpy()
+    expected_rotated = np.array([[[0.0000, -0.0471, 0.4599],
+                                  [1.8433, -0.0471, 0.4599]],
+                                 [[0.0000, -0.2683, -0.2555],
+                                  [-0.9072, -0.2683, -0.2555]]])
+    assert np.allclose(rotated, expected_rotated, atol=1e-3)
+
+    points = torch.tensor([[[-0.4599, -0.0471, 0.0000],
+                            [-0.4599, 0.0471, 1.8433]],
+                           [[-0.2555, -0.2683, 0.0000],
+                            [0.2555, -0.2683, 0.9072]]])
+    angles = [np.pi / 2, -np.pi / 2]
+    rotated = rotation_3d_in_axis(points, angles, axis=2).numpy()
+    expected_rotated = np.array([[[0.0471, -0.4599, 0.0000],
+                                  [-0.0471, -0.4599, 1.8433]],
+                                 [[-0.2683, 0.2555, 0.0000],
+                                  [-0.2683, -0.2555, 0.9072]]])
+    assert np.allclose(rotated, expected_rotated, atol=1e-3)
+
+    points = torch.tensor([[[-0.0471, 0.0000], [-0.0471, 1.8433]],
+                           [[-0.2683, 0.0000], [-0.2683, 0.9072]]])
+    angles = [np.pi / 2, -np.pi / 2]
+    rotated = rotation_3d_in_axis(points, angles)
+    expected_rotated = np.array([[[0.0000, -0.0471], [-1.8433, -0.0471]],
+                                 [[0.0000, 0.2683], [0.9072, 0.2683]]])
+    assert np.allclose(rotated, expected_rotated, atol=1e-3)
+
+
+def test_rotation_2d():
+    angles = np.array([3.14])
+    corners = np.array([[[-0.235, -0.49], [-0.235, 0.49], [0.235, 0.49],
+                         [0.235, -0.49]]])
+    corners_rotated = rotation_3d_in_axis(corners, angles)
+    expected_corners = np.array([[[0.2357801, 0.48962511],
+                                  [0.2342193, -0.49037365],
+                                  [-0.2357801, -0.48962511],
+                                  [-0.2342193, 0.49037365]]])
+    assert np.allclose(corners_rotated, expected_corners)
+
+
+def test_limit_period():
+    torch.manual_seed(0)
+    val = torch.rand([5, 1])
+    result = limit_period(val)
+    expected_result = torch.tensor([[0.4963], [0.7682], [0.0885], [0.1320],
+                                    [0.3074]])
+    assert torch.allclose(result, expected_result, 1e-3)
+
+    val = val.numpy()
+    result = limit_period(val)
+    expected_result = expected_result.numpy()
+    assert np.allclose(result, expected_result, 1e-3)
+
+
+def test_xywhr2xyxyr():
+    torch.manual_seed(0)
+    xywhr = torch.tensor([[1., 2., 3., 4., 5.], [0., 1., 2., 3., 4.]])
+    xyxyr = xywhr2xyxyr(xywhr)
+    expected_xyxyr = torch.tensor([[-0.5000, 0.0000, 2.5000, 4.0000, 5.0000],
+                                   [-1.0000, -0.5000, 1.0000, 2.5000, 4.0000]])
+
+    assert torch.allclose(xyxyr, expected_xyxyr)
+
+
+class test_get_box_type(unittest.TestCase):
+
+    def test_get_box_type(self):
+        box_type_3d, box_mode_3d = get_box_type('camera')
+        assert box_type_3d == CameraInstance3DBoxes
+        assert box_mode_3d == Box3DMode.CAM
+
+        box_type_3d, box_mode_3d = get_box_type('depth')
+        assert box_type_3d == DepthInstance3DBoxes
+        assert box_mode_3d == Box3DMode.DEPTH
+
+        box_type_3d, box_mode_3d = get_box_type('lidar')
+        assert box_type_3d == LiDARInstance3DBoxes
+        assert box_mode_3d == Box3DMode.LIDAR
+
+    def test_bad_box_type(self):
+        self.assertRaises(ValueError, get_box_type, 'test')
+
+
+def test_points_cam2img():
+    torch.manual_seed(0)
+    points = torch.rand([5, 3])
+    proj_mat = torch.rand([4, 4])
+    point_2d_res = points_cam2img(points, proj_mat)
+    expected_point_2d_res = torch.tensor([[0.5832, 0.6496], [0.6146, 0.7910],
+                                          [0.6994, 0.7782], [0.5623, 0.6303],
+                                          [0.4359, 0.6532]])
+    assert torch.allclose(point_2d_res, expected_point_2d_res, 1e-3)
+
+    points = points.numpy()
+    proj_mat = proj_mat.numpy()
+    point_2d_res = points_cam2img(points, proj_mat)
+    expected_point_2d_res = expected_point_2d_res.numpy()
+    assert np.allclose(point_2d_res, expected_point_2d_res, 1e-3)
+
+    points = torch.from_numpy(points)
+    point_2d_res = points_cam2img(points, proj_mat)
+    expected_point_2d_res = torch.from_numpy(expected_point_2d_res)
+    assert torch.allclose(point_2d_res, expected_point_2d_res, 1e-3)
+
+    point_2d_res = points_cam2img(points, proj_mat, with_depth=True)
+    expected_point_2d_res = torch.tensor([[0.5832, 0.6496, 1.7577],
+                                          [0.6146, 0.7910, 1.5477],
+                                          [0.6994, 0.7782, 2.0091],
+                                          [0.5623, 0.6303, 1.8739],
+                                          [0.4359, 0.6532, 1.2056]])
+    assert torch.allclose(point_2d_res, expected_point_2d_res, 1e-3)
+
+
+def test_points_in_boxes():
+    if not torch.cuda.is_available():
+        pytest.skip('test requires GPU and torch+cuda')
+    lidar_pts = torch.tensor([[1.0, 4.3, 0.1], [1.0, 4.4,
+                                                0.1], [1.1, 4.3, 0.1],
+                              [0.9, 4.3, 0.1], [1.0, -0.3, 0.1],
+                              [1.0, -0.4, 0.1], [2.9, 0.1, 6.0],
+                              [-0.9, 3.9, 6.0]]).cuda()
+    lidar_boxes = torch.tensor([[1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 6],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 2],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, 7 * np.pi / 6],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, -np.pi / 6]],
+                               dtype=torch.float32).cuda()
+    lidar_boxes = LiDARInstance3DBoxes(lidar_boxes)
+
+    point_indices = lidar_boxes.points_in_boxes_all(lidar_pts)
+    expected_point_indices = torch.tensor(
+        [[1, 0, 1, 1], [0, 0, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1], [1, 0, 1, 1],
+         [0, 0, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]],
+        dtype=torch.int32).cuda()
+    assert point_indices.shape == torch.Size([8, 4])
+    assert (point_indices == expected_point_indices).all()
+
+    lidar_pts = torch.tensor([[1.0, 4.3, 0.1], [1.0, 4.4,
+                                                0.1], [1.1, 4.3, 0.1],
+                              [0.9, 4.3, 0.1], [1.0, -0.3, 0.1],
+                              [1.0, -0.4, 0.1], [2.9, 0.1, 6.0],
+                              [-0.9, 3.9, 6.0]]).cuda()
+    lidar_boxes = torch.tensor([[1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 6],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 2],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, 7 * np.pi / 6],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, -np.pi / 6]],
+                               dtype=torch.float32).cuda()
+    lidar_boxes = LiDARInstance3DBoxes(lidar_boxes)
+
+    point_indices = lidar_boxes.points_in_boxes_part(lidar_pts)
+    expected_point_indices = torch.tensor([0, -1, 0, 3, 0, -1, 1, 1],
+                                          dtype=torch.int32).cuda()
+    assert point_indices.shape == torch.Size([8])
+    assert (point_indices == expected_point_indices).all()
+
+    depth_boxes = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
+                                [-10.0, 23.0, 16.0, 10, 20, 20, 0.5]],
+                               dtype=torch.float32).cuda()
+    depth_boxes = DepthInstance3DBoxes(depth_boxes)
+    depth_pts = torch.tensor(
+        [[[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
+          [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
+          [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [
+              -16, -18, 9
+          ], [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]]],
+        dtype=torch.float32).cuda()
+
+    point_indices = depth_boxes.points_in_boxes_all(depth_pts)
+    expected_point_indices = torch.tensor(
+        [[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 0], [0, 0],
+         [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
+        dtype=torch.int32).cuda()
+    assert point_indices.shape == torch.Size([15, 2])
+    assert (point_indices == expected_point_indices).all()
+
+    point_indices = depth_boxes.points_in_boxes_part(depth_pts)
+    expected_point_indices = torch.tensor(
+        [0, 0, 0, 0, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
+        dtype=torch.int32).cuda()
+    assert point_indices.shape == torch.Size([15])
+    assert (point_indices == expected_point_indices).all()
+
+    depth_boxes = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.3],
+                                [-10.0, 23.0, 16.0, 10, 20, 20, 0.5],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 6],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, np.pi / 2],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, 7 * np.pi / 6],
+                                [1.0, 2.0, 0.0, 4.0, 4.0, 6.0, -np.pi / 6]],
+                               dtype=torch.float32).cuda()
+    cam_boxes = DepthInstance3DBoxes(depth_boxes).convert_to(Box3DMode.CAM)
+    depth_pts = torch.tensor(
+        [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
+         [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
+         [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9],
+         [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4], [1.0, 4.3, 0.1],
+         [1.0, 4.4, 0.1], [1.1, 4.3, 0.1], [0.9, 4.3, 0.1], [1.0, -0.3, 0.1],
+         [1.0, -0.4, 0.1], [2.9, 0.1, 6.0], [-0.9, 3.9, 6.0]],
+        dtype=torch.float32).cuda()
+
+    cam_pts = DepthPoints(depth_pts).convert_to(Coord3DMode.CAM).tensor
+
+    point_indices = cam_boxes.points_in_boxes_all(cam_pts)
+    expected_point_indices = torch.tensor(
+        [[1, 0, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1],
+         [1, 0, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1], [0, 1, 0, 0, 0, 0],
+         [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0],
+         [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0],
+         [0, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0],
+         [0, 0, 1, 0, 1, 1], [0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 1, 0],
+         [0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 1, 1], [0, 0, 0, 0, 0, 0],
+         [1, 0, 0, 1, 0, 0], [1, 0, 0, 1, 0, 0]],
+        dtype=torch.int32).cuda()
+    assert point_indices.shape == torch.Size([23, 6])
+    assert (point_indices == expected_point_indices).all()
+
+    point_indices = cam_boxes.points_in_boxes_batch(cam_pts)
+    assert (point_indices == expected_point_indices).all()
+
+    point_indices = cam_boxes.points_in_boxes_part(cam_pts)
+    expected_point_indices = torch.tensor([
+        0, 0, 0, 0, 0, 1, -1, -1, -1, -1, -1, -1, 2, -1, -1, 2, -1, 2, 5, 2,
+        -1, 0, 0
+    ],
+                                          dtype=torch.int32).cuda()
+    assert point_indices.shape == torch.Size([23])
+    assert (point_indices == expected_point_indices).all()
+
+    point_indices = cam_boxes.points_in_boxes(cam_pts)
+    assert (point_indices == expected_point_indices).all()
diff --git a/mmde/tests/test_structures/test_bbox/test_coord_3d_mode.py b/mmde/tests/test_structures/test_bbox/test_coord_3d_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..bffbe398f165f3b8b3153ccc610e5db7bab57a65
--- /dev/null
+++ b/mmde/tests/test_structures/test_bbox/test_coord_3d_mode.py
@@ -0,0 +1,351 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet3d.structures import (CameraInstance3DBoxes, Coord3DMode,
+                                DepthInstance3DBoxes, LiDARInstance3DBoxes,
+                                limit_period)
+from mmdet3d.structures.points import CameraPoints, DepthPoints, LiDARPoints
+
+
+def test_points_conversion():
+    """Test the conversion of points between different modes."""
+    points_np = np.array([[
+        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
+        0.4974, 0.9409
+    ],
+                          [
+                              -2.66751588e+01, 5.59499564e+00, -9.14345860e-01,
+                              0.1502, 0.3707, 0.1086, 0.6297
+                          ],
+                          [
+                              -5.80979675e+00, 3.54092357e+01, 2.00889888e-01,
+                              0.6565, 0.6248, 0.6954, 0.2538
+                          ],
+                          [
+                              -3.13086877e+01, 1.09007628e+00, -1.94612112e-01,
+                              0.2803, 0.0258, 0.4896, 0.3269
+                          ]],
+                         dtype=np.float32)
+
+    # test CAM to LIDAR and DEPTH
+    cam_points = CameraPoints(
+        points_np,
+        points_dim=7,
+        attribute_dims=dict(color=[3, 4, 5], height=6))
+
+    convert_lidar_points = cam_points.convert_to(Coord3DMode.LIDAR)
+    expected_tensor = torch.tensor([[
+        2.9757e-01, 5.2422e+00, -4.0021e+01, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -9.1435e-01, 2.6675e+01, -5.5950e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        2.0089e-01, 5.8098e+00, -3.5409e+01,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -1.9461e-01, 3.1309e+01, -1.0901e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+
+    lidar_point_tensor = Coord3DMode.convert_point(cam_points.tensor,
+                                                   Coord3DMode.CAM,
+                                                   Coord3DMode.LIDAR)
+    assert torch.allclose(expected_tensor, convert_lidar_points.tensor, 1e-4)
+    assert torch.allclose(lidar_point_tensor, convert_lidar_points.tensor,
+                          1e-4)
+
+    convert_depth_points = cam_points.convert_to(Coord3DMode.DEPTH)
+    expected_tensor = torch.tensor([[
+        -5.2422e+00, 2.9757e-01, -4.0021e+01, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.6675e+01, -9.1435e-01, -5.5950e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        -5.8098e+00, 2.0089e-01, -3.5409e+01,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -3.1309e+01, -1.9461e-01, -1.0901e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+
+    depth_point_tensor = Coord3DMode.convert_point(cam_points.tensor,
+                                                   Coord3DMode.CAM,
+                                                   Coord3DMode.DEPTH)
+    assert torch.allclose(expected_tensor, convert_depth_points.tensor, 1e-4)
+    assert torch.allclose(depth_point_tensor, convert_depth_points.tensor,
+                          1e-4)
+
+    # test LIDAR to CAM and DEPTH
+    lidar_points = LiDARPoints(
+        points_np,
+        points_dim=7,
+        attribute_dims=dict(color=[3, 4, 5], height=6))
+
+    convert_cam_points = lidar_points.convert_to(Coord3DMode.CAM)
+    expected_tensor = torch.tensor([[
+        -4.0021e+01, -2.9757e-01, -5.2422e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -5.5950e+00, 9.1435e-01, -2.6675e+01,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        -3.5409e+01, -2.0089e-01, -5.8098e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -1.0901e+00, 1.9461e-01, -3.1309e+01,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+
+    cam_point_tensor = Coord3DMode.convert_point(lidar_points.tensor,
+                                                 Coord3DMode.LIDAR,
+                                                 Coord3DMode.CAM)
+    assert torch.allclose(expected_tensor, convert_cam_points.tensor, 1e-4)
+    assert torch.allclose(cam_point_tensor, convert_cam_points.tensor, 1e-4)
+
+    convert_depth_points = lidar_points.convert_to(Coord3DMode.DEPTH)
+    expected_tensor = torch.tensor([[
+        -4.0021e+01, -5.2422e+00, 2.9757e-01, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -5.5950e+00, -2.6675e+01, -9.1435e-01,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        -3.5409e+01, -5.8098e+00, 2.0089e-01,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -1.0901e+00, -3.1309e+01, -1.9461e-01,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+
+    depth_point_tensor = Coord3DMode.convert_point(lidar_points.tensor,
+                                                   Coord3DMode.LIDAR,
+                                                   Coord3DMode.DEPTH)
+    assert torch.allclose(expected_tensor, convert_depth_points.tensor, 1e-4)
+    assert torch.allclose(depth_point_tensor, convert_depth_points.tensor,
+                          1e-4)
+
+    # test DEPTH to CAM and LIDAR
+    depth_points = DepthPoints(
+        points_np,
+        points_dim=7,
+        attribute_dims=dict(color=[3, 4, 5], height=6))
+
+    convert_cam_points = depth_points.convert_to(Coord3DMode.CAM)
+    expected_tensor = torch.tensor([[
+        -5.2422e+00, -2.9757e-01, 4.0021e+01, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.6675e+01, 9.1435e-01, 5.5950e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        -5.8098e+00, -2.0089e-01, 3.5409e+01,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -3.1309e+01, 1.9461e-01, 1.0901e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+
+    cam_point_tensor = Coord3DMode.convert_point(depth_points.tensor,
+                                                 Coord3DMode.DEPTH,
+                                                 Coord3DMode.CAM)
+    assert torch.allclose(expected_tensor, convert_cam_points.tensor, 1e-4)
+    assert torch.allclose(cam_point_tensor, convert_cam_points.tensor, 1e-4)
+
+    rt_mat_provided = torch.tensor([[0.99789, -0.012698, -0.063678],
+                                    [-0.012698, 0.92359, -0.38316],
+                                    [0.063678, 0.38316, 0.92148]])
+
+    depth_points_new = torch.cat([
+        depth_points.tensor[:, :3] @ rt_mat_provided.t(),
+        depth_points.tensor[:, 3:]
+    ],
+                                 dim=1)
+    mat = rt_mat_provided.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
+    rt_mat_provided = mat @ rt_mat_provided.transpose(1, 0)
+    cam_point_tensor_new = Coord3DMode.convert_point(
+        depth_points_new,
+        Coord3DMode.DEPTH,
+        Coord3DMode.CAM,
+        rt_mat=rt_mat_provided)
+    assert torch.allclose(expected_tensor, cam_point_tensor_new, 1e-4)
+
+    convert_lidar_points = depth_points.convert_to(Coord3DMode.LIDAR)
+    expected_tensor = torch.tensor([[
+        4.0021e+01, 5.2422e+00, 2.9757e-01, 6.6660e-01, 1.9560e-01, 4.9740e-01,
+        9.4090e-01
+    ],
+                                    [
+                                        5.5950e+00, 2.6675e+01, -9.1435e-01,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        3.5409e+01, 5.8098e+00, 2.0089e-01,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        1.0901e+00, 3.1309e+01, -1.9461e-01,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+
+    lidar_point_tensor = Coord3DMode.convert_point(depth_points.tensor,
+                                                   Coord3DMode.DEPTH,
+                                                   Coord3DMode.LIDAR)
+    assert torch.allclose(lidar_point_tensor, convert_lidar_points.tensor,
+                          1e-4)
+    assert torch.allclose(lidar_point_tensor, convert_lidar_points.tensor,
+                          1e-4)
+
+
+def test_boxes_conversion():
+    # test CAM to LIDAR and DEPTH
+    cam_boxes = CameraInstance3DBoxes(
+        [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48],
+         [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62],
+         [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57],
+         [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69],
+         [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]])
+    convert_lidar_boxes = Coord3DMode.convert(cam_boxes, Coord3DMode.CAM,
+                                              Coord3DMode.LIDAR)
+
+    expected_tensor = torch.tensor([[
+        -1.7501, -1.7802, -2.5162, 1.7500, 1.6500, 3.3900, -1.4800 - np.pi / 2
+    ], [
+        -1.6357, -8.9594, -2.4567, 1.5400, 1.5700, 4.0100, -1.6200 - np.pi / 2
+    ], [-1.3033, -28.2967, 0.5558, 1.4700, 1.4800, 2.2300, 1.5700 - np.pi / 2],
+                                    [
+                                        -1.7361, -26.6690, -21.8230, 1.5600,
+                                        1.4000, 3.4800, 1.6900 - np.pi / 2
+                                    ],
+                                    [
+                                        -1.6218, -31.3198, -8.1621, 1.7400,
+                                        1.4800, 3.7700, -2.7900 - np.pi / 2
+                                    ]])
+    expected_tensor[:, -1:] = limit_period(
+        expected_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(expected_tensor, convert_lidar_boxes.tensor, 1e-3)
+
+    convert_depth_boxes = Coord3DMode.convert(cam_boxes, Coord3DMode.CAM,
+                                              Coord3DMode.DEPTH)
+    expected_tensor = torch.tensor(
+        [[1.7802, -1.7501, -2.5162, 1.7500, 1.6500, 3.3900, -1.4800],
+         [8.9594, -1.6357, -2.4567, 1.5400, 1.5700, 4.0100, -1.6200],
+         [28.2967, -1.3033, 0.5558, 1.4700, 1.4800, 2.2300, 1.5700],
+         [26.6690, -1.7361, -21.8230, 1.5600, 1.4000, 3.4800, 1.6900],
+         [31.3198, -1.6218, -8.1621, 1.7400, 1.4800, 3.7700, -2.7900]])
+    assert torch.allclose(expected_tensor, convert_depth_boxes.tensor, 1e-3)
+
+    # test LIDAR to CAM and DEPTH
+    lidar_boxes = LiDARInstance3DBoxes(
+        [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48],
+         [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62],
+         [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57],
+         [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69],
+         [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]])
+    convert_cam_boxes = Coord3DMode.convert(lidar_boxes, Coord3DMode.LIDAR,
+                                            Coord3DMode.CAM)
+    expected_tensor = torch.tensor([
+        [-2.5162, 1.7501, 1.7802, 1.7500, 1.6500, 3.3900, -1.4800 - np.pi / 2],
+        [-2.4567, 1.6357, 8.9594, 1.5400, 1.5700, 4.0100, -1.6200 - np.pi / 2],
+        [0.5558, 1.3033, 28.2967, 1.4700, 1.4800, 2.2300, 1.5700 - np.pi / 2],
+        [
+            -21.8230, 1.7361, 26.6690, 1.5600, 1.4000, 3.4800,
+            1.6900 - np.pi / 2
+        ],
+        [
+            -8.1621, 1.6218, 31.3198, 1.7400, 1.4800, 3.7700,
+            -2.7900 - np.pi / 2
+        ]
+    ])
+    expected_tensor[:, -1:] = limit_period(
+        expected_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(expected_tensor, convert_cam_boxes.tensor, 1e-3)
+
+    convert_depth_boxes = Coord3DMode.convert(lidar_boxes, Coord3DMode.LIDAR,
+                                              Coord3DMode.DEPTH)
+    expected_tensor = torch.tensor([[
+        -2.5162, 1.7802, -1.7501, 1.7500, 3.3900, 1.6500, 1.4800 + np.pi / 2
+    ], [-2.4567, 8.9594, -1.6357, 1.5400, 4.0100, 1.5700, 1.6200 + np.pi / 2],
+                                    [
+                                        0.5558, 28.2967, -1.3033, 1.4700,
+                                        2.2300, 1.4800, -1.5700 + np.pi / 2
+                                    ],
+                                    [
+                                        -21.8230, 26.6690, -1.7361, 1.5600,
+                                        3.4800, 1.4000, -1.6900 + np.pi / 2
+                                    ],
+                                    [
+                                        -8.1621, 31.3198, -1.6218, 1.7400,
+                                        3.7700, 1.4800, 2.7900 + np.pi / 2
+                                    ]])
+    expected_tensor[:, -1:] = limit_period(
+        expected_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(expected_tensor, convert_depth_boxes.tensor, 1e-3)
+
+    # test DEPTH to CAM and LIDAR
+    depth_boxes = DepthInstance3DBoxes(
+        [[1.7802081, 2.516249, -1.7501148, 1.75, 3.39, 1.65, 1.48],
+         [8.959413, 2.4567227, -1.6357126, 1.54, 4.01, 1.57, 1.62],
+         [28.2967, -0.5557558, -1.303325, 1.47, 2.23, 1.48, -1.57],
+         [26.66902, 21.82302, -1.736057, 1.56, 3.48, 1.4, -1.69],
+         [31.31978, 8.162144, -1.6217787, 1.74, 3.77, 1.48, 2.79]])
+    convert_cam_boxes = Coord3DMode.convert(depth_boxes, Coord3DMode.DEPTH,
+                                            Coord3DMode.CAM)
+    expected_tensor = torch.tensor(
+        [[1.7802, 1.7501, 2.5162, 1.7500, 1.6500, 3.3900, -1.4800],
+         [8.9594, 1.6357, 2.4567, 1.5400, 1.5700, 4.0100, -1.6200],
+         [28.2967, 1.3033, -0.5558, 1.4700, 1.4800, 2.2300, 1.5700],
+         [26.6690, 1.7361, 21.8230, 1.5600, 1.4000, 3.4800, 1.6900],
+         [31.3198, 1.6218, 8.1621, 1.7400, 1.4800, 3.7700, -2.7900]])
+    assert torch.allclose(expected_tensor, convert_cam_boxes.tensor, 1e-3)
+
+    convert_lidar_boxes = Coord3DMode.convert(depth_boxes, Coord3DMode.DEPTH,
+                                              Coord3DMode.LIDAR)
+    expected_tensor = torch.tensor([[
+        2.5162, -1.7802, -1.7501, 1.7500, 3.3900, 1.6500, 1.4800 - np.pi / 2
+    ], [
+        2.4567, -8.9594, -1.6357, 1.5400, 4.0100, 1.5700, 1.6200 - np.pi / 2
+    ], [
+        -0.5558, -28.2967, -1.3033, 1.4700, 2.2300, 1.4800, -1.5700 - np.pi / 2
+    ], [
+        21.8230, -26.6690, -1.7361, 1.5600, 3.4800, 1.4000, -1.6900 - np.pi / 2
+    ], [8.1621, -31.3198, -1.6218, 1.7400, 3.7700, 1.4800,
+        2.7900 - np.pi / 2]])
+    expected_tensor[:, -1:] = limit_period(
+        expected_tensor[:, -1:], period=np.pi * 2)
+    assert torch.allclose(expected_tensor, convert_lidar_boxes.tensor, 1e-3)
diff --git a/mmde/tests/test_structures/test_det3d_data_sample.py b/mmde/tests/test_structures/test_det3d_data_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..5532036426c5bce0470b364b9716fc5b46e6fb95
--- /dev/null
+++ b/mmde/tests/test_structures/test_det3d_data_sample.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import numpy as np
+import pytest
+import torch
+from mmengine.structures import InstanceData
+
+from mmdet3d.structures import Det3DDataSample, PointData
+
+
+def _equal(a, b):
+    if isinstance(a, (torch.Tensor, np.ndarray)):
+        return (a == b).all()
+    else:
+        return a == b
+
+
+class TestDet3DDataSample(TestCase):
+
+    def test_init(self):
+        meta_info = dict(
+            img_size=[256, 256],
+            scale_factor=np.array([1.5, 1.5]),
+            img_shape=torch.rand(4))
+
+        det3d_data_sample = Det3DDataSample(metainfo=meta_info)
+        assert 'img_size' in det3d_data_sample
+        assert det3d_data_sample.img_size == [256, 256]
+        assert det3d_data_sample.get('img_size') == [256, 256]
+
+    def test_setter(self):
+        det3d_data_sample = Det3DDataSample()
+        # test gt_instances_3d
+        gt_instances_3d_data = dict(
+            bboxes_3d=torch.rand(4, 7), labels_3d=torch.rand(4))
+        gt_instances_3d = InstanceData(**gt_instances_3d_data)
+        det3d_data_sample.gt_instances_3d = gt_instances_3d
+        assert 'gt_instances_3d' in det3d_data_sample
+        assert _equal(det3d_data_sample.gt_instances_3d.bboxes_3d,
+                      gt_instances_3d_data['bboxes_3d'])
+        assert _equal(det3d_data_sample.gt_instances_3d.labels_3d,
+                      gt_instances_3d_data['labels_3d'])
+
+        # test pred_instances_3d
+        pred_instances_3d_data = dict(
+            bboxes_3d=torch.rand(2, 7),
+            labels_3d=torch.rand(2),
+            scores_3d=torch.rand(2))
+        pred_instances_3d = InstanceData(**pred_instances_3d_data)
+        det3d_data_sample.pred_instances_3d = pred_instances_3d
+        assert 'pred_instances_3d' in det3d_data_sample
+        assert _equal(det3d_data_sample.pred_instances_3d.bboxes_3d,
+                      pred_instances_3d_data['bboxes_3d'])
+        assert _equal(det3d_data_sample.pred_instances_3d.labels_3d,
+                      pred_instances_3d_data['labels_3d'])
+        assert _equal(det3d_data_sample.pred_instances_3d.scores_3d,
+                      pred_instances_3d_data['scores_3d'])
+
+        # test pts_pred_instances_3d
+        pts_pred_instances_3d_data = dict(
+            bboxes_3d=torch.rand(2, 7),
+            labels_3d=torch.rand(2),
+            scores_3d=torch.rand(2))
+        pts_pred_instances_3d = InstanceData(**pts_pred_instances_3d_data)
+        det3d_data_sample.pts_pred_instances_3d = pts_pred_instances_3d
+        assert 'pts_pred_instances_3d' in det3d_data_sample
+        assert _equal(det3d_data_sample.pts_pred_instances_3d.bboxes_3d,
+                      pts_pred_instances_3d_data['bboxes_3d'])
+        assert _equal(det3d_data_sample.pts_pred_instances_3d.labels_3d,
+                      pts_pred_instances_3d_data['labels_3d'])
+        assert _equal(det3d_data_sample.pts_pred_instances_3d.scores_3d,
+                      pts_pred_instances_3d_data['scores_3d'])
+
+        # test img_pred_instances_3d
+        img_pred_instances_3d_data = dict(
+            bboxes_3d=torch.rand(2, 7),
+            labels_3d=torch.rand(2),
+            scores_3d=torch.rand(2))
+        img_pred_instances_3d = InstanceData(**img_pred_instances_3d_data)
+        det3d_data_sample.img_pred_instances_3d = img_pred_instances_3d
+        assert 'img_pred_instances_3d' in det3d_data_sample
+        assert _equal(det3d_data_sample.img_pred_instances_3d.bboxes_3d,
+                      img_pred_instances_3d_data['bboxes_3d'])
+        assert _equal(det3d_data_sample.img_pred_instances_3d.labels_3d,
+                      img_pred_instances_3d_data['labels_3d'])
+        assert _equal(det3d_data_sample.img_pred_instances_3d.scores_3d,
+                      img_pred_instances_3d_data['scores_3d'])
+
+        # test gt_pts_seg
+        gt_pts_seg_data = dict(
+            pts_instance_mask=torch.rand(20), pts_semantic_mask=torch.rand(20))
+        gt_pts_seg = PointData(**gt_pts_seg_data)
+        det3d_data_sample.gt_pts_seg = gt_pts_seg
+        assert 'gt_pts_seg' in det3d_data_sample
+        assert _equal(det3d_data_sample.gt_pts_seg.pts_instance_mask,
+                      gt_pts_seg_data['pts_instance_mask'])
+        assert _equal(det3d_data_sample.gt_pts_seg.pts_semantic_mask,
+                      gt_pts_seg_data['pts_semantic_mask'])
+
+        # test pred_pts_seg
+        pred_pts_seg_data = dict(
+            pts_instance_mask=torch.rand(20), pts_semantic_mask=torch.rand(20))
+        pred_pts_seg = PointData(**pred_pts_seg_data)
+        det3d_data_sample.pred_pts_seg = pred_pts_seg
+        assert 'pred_pts_seg' in det3d_data_sample
+        assert _equal(det3d_data_sample.pred_pts_seg.pts_instance_mask,
+                      pred_pts_seg_data['pts_instance_mask'])
+        assert _equal(det3d_data_sample.pred_pts_seg.pts_semantic_mask,
+                      pred_pts_seg_data['pts_semantic_mask'])
+
+        # test type error
+        with pytest.raises(AssertionError):
+            det3d_data_sample.pred_instances_3d = torch.rand(2, 4)
+
+        with pytest.raises(AssertionError):
+            det3d_data_sample.pred_pts_seg = torch.rand(20)
+
+    def test_deleter(self):
+        tmp_instances_3d_data = dict(
+            bboxes_3d=torch.rand(4, 4), labels_3d=torch.rand(4))
+
+        det3d_data_sample = Det3DDataSample()
+        gt_instances_3d = InstanceData(data=tmp_instances_3d_data)
+        det3d_data_sample.gt_instances_3d = gt_instances_3d
+        assert 'gt_instances_3d' in det3d_data_sample
+        del det3d_data_sample.gt_instances_3d
+        assert 'gt_instances_3d' not in det3d_data_sample
+
+        pred_instances_3d = InstanceData(data=tmp_instances_3d_data)
+        det3d_data_sample.pred_instances_3d = pred_instances_3d
+        assert 'pred_instances_3d' in det3d_data_sample
+        del det3d_data_sample.pred_instances_3d
+        assert 'pred_instances_3d' not in det3d_data_sample
+
+        pts_pred_instances_3d = InstanceData(data=tmp_instances_3d_data)
+        det3d_data_sample.pts_pred_instances_3d = pts_pred_instances_3d
+        assert 'pts_pred_instances_3d' in det3d_data_sample
+        del det3d_data_sample.pts_pred_instances_3d
+        assert 'pts_pred_instances_3d' not in det3d_data_sample
+
+        img_pred_instances_3d = InstanceData(data=tmp_instances_3d_data)
+        det3d_data_sample.img_pred_instances_3d = img_pred_instances_3d
+        assert 'img_pred_instances_3d' in det3d_data_sample
+        del det3d_data_sample.img_pred_instances_3d
+        assert 'img_pred_instances_3d' not in det3d_data_sample
+
+        pred_pts_seg_data = dict(
+            pts_instance_mask=torch.rand(20), pts_semantic_mask=torch.rand(20))
+        pred_pts_seg = PointData(**pred_pts_seg_data)
+        det3d_data_sample.pred_pts_seg = pred_pts_seg
+        assert 'pred_pts_seg' in det3d_data_sample
+        del det3d_data_sample.pred_pts_seg
+        assert 'pred_pts_seg' not in det3d_data_sample
diff --git a/mmde/tests/test_structures/test_ops/test_box_np_ops.py b/mmde/tests/test_structures/test_ops/test_box_np_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ce6cf6ceed1fa211b4cb8ea11c09c972f69de8d
--- /dev/null
+++ b/mmde/tests/test_structures/test_ops/test_box_np_ops.py
@@ -0,0 +1,83 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+
+def test_camera_to_lidar():
+    from mmdet3d.structures.ops.box_np_ops import camera_to_lidar
+    points = np.array([[1.84, 1.47, 8.41]])
+    rect = np.array([[0.9999128, 0.01009263, -0.00851193, 0.],
+                     [-0.01012729, 0.9999406, -0.00403767, 0.],
+                     [0.00847068, 0.00412352, 0.9999556, 0.], [0., 0., 0.,
+                                                               1.]])
+    Trv2c = np.array([[0.00692796, -0.9999722, -0.00275783, -0.02457729],
+                      [-0.00116298, 0.00274984, -0.9999955, -0.06127237],
+                      [0.9999753, 0.00693114, -0.0011439, -0.3321029],
+                      [0., 0., 0., 1.]])
+    points_lidar = camera_to_lidar(points, rect, Trv2c)
+    expected_points = np.array([[8.73138192, -1.85591746, -1.59969933]])
+    assert np.allclose(points_lidar, expected_points)
+
+
+def test_box_camera_to_lidar():
+    from mmdet3d.structures.ops.box_np_ops import box_camera_to_lidar
+    box = np.array([[1.84, 1.47, 8.41, 1.2, 1.89, 0.48, -0.01]])
+    rect = np.array([[0.9999128, 0.01009263, -0.00851193, 0.],
+                     [-0.01012729, 0.9999406, -0.00403767, 0.],
+                     [0.00847068, 0.00412352, 0.9999556, 0.], [0., 0., 0.,
+                                                               1.]])
+    Trv2c = np.array([[0.00692796, -0.9999722, -0.00275783, -0.02457729],
+                      [-0.00116298, 0.00274984, -0.9999955, -0.06127237],
+                      [0.9999753, 0.00693114, -0.0011439, -0.3321029],
+                      [0., 0., 0., 1.]])
+    box_lidar = box_camera_to_lidar(box, rect, Trv2c)
+    expected_box = np.array([[
+        8.73138192, -1.85591746, -1.59969933, 1.2, 0.48, 1.89, 0.01 - np.pi / 2
+    ]])
+    assert np.allclose(box_lidar, expected_box)
+
+
+def test_corners_nd():
+    from mmdet3d.structures.ops.box_np_ops import corners_nd
+    dims = np.array([[0.47, 0.98]])
+    corners = corners_nd(dims)
+    expected_corners = np.array([[[-0.235, -0.49], [-0.235, 0.49],
+                                  [0.235, 0.49], [0.235, -0.49]]])
+    assert np.allclose(corners, expected_corners)
+
+
+def test_center_to_corner_box2d():
+    from mmdet3d.structures.ops.box_np_ops import center_to_corner_box2d
+    center = np.array([[9.348705, -3.6271024]])
+    dims = np.array([[0.47, 0.98]])
+    angles = np.array([3.14])
+    corner = center_to_corner_box2d(center, dims, angles)
+    expected_corner = np.array([[[9.584485, -3.1374772], [9.582925, -4.117476],
+                                 [9.112926, -4.1167274],
+                                 [9.114486, -3.1367288]]])
+    assert np.allclose(corner, expected_corner)
+
+    center = np.array([[-0.0, 0.0]])
+    dims = np.array([[4.0, 8.0]])
+    angles = np.array([-0.785398])  # -45 degrees
+    corner = center_to_corner_box2d(center, dims, angles)
+    expected_corner = np.array([[[-4.24264, -1.41421], [1.41421, 4.24264],
+                                 [4.24264, 1.41421], [-1.41421, -4.24264]]])
+    assert np.allclose(corner, expected_corner)
+
+
+def test_points_in_convex_polygon_jit():
+    from mmdet3d.structures.ops.box_np_ops import points_in_convex_polygon_jit
+    points = np.array([[0.4, 0.4], [0.5, 0.5], [0.6, 0.6]])
+    polygons = np.array([[[1.0, 0.0], [0.0, 1.0], [0.0, 0.5], [0.0, 0.0]],
+                         [[1.0, 0.0], [1.0, 1.0], [0.5, 1.0], [0.0, 1.0]],
+                         [[1.0, 0.0], [0.0, 1.0], [-1.0, 0.0], [0.0, -1.0]]])
+    res = points_in_convex_polygon_jit(points, polygons)
+    expected_res = np.array([[1, 0, 1], [0, 0, 0], [0, 1, 0]]).astype(bool)
+    assert np.allclose(res, expected_res)
+
+    polygons = np.array([[[0.0, 0.0], [0.0, 1.0], [0.5, 0.5], [1.0, 0.0]],
+                         [[0.0, 1.0], [1.0, 1.0], [1.0, 0.5], [1.0, 0.0]],
+                         [[1.0, 0.0], [0.0, -1.0], [-1.0, 0.0], [0.0, 1.1]]])
+    res = points_in_convex_polygon_jit(points, polygons, clockwise=True)
+    expected_res = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 0]]).astype(bool)
+    assert np.allclose(res, expected_res)
diff --git a/mmde/tests/test_structures/test_point_data.py b/mmde/tests/test_structures/test_point_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..20a72a61e702107688557a79a4650059e45b12dd
--- /dev/null
+++ b/mmde/tests/test_structures/test_point_data.py
@@ -0,0 +1,95 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import random
+from unittest import TestCase
+
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.structures import PointData
+
+
+class TestPointData(TestCase):
+
+    def setup_data(self):
+        metainfo = dict(sample_idx=random.randint(0, 100))
+        points = torch.rand((5, 3))
+        point_data = PointData(metainfo=metainfo, points=points)
+        return point_data
+
+    def test_set_data(self):
+        point_data = self.setup_data()
+
+        # test set '_metainfo_fields' or '_data_fields'
+        with self.assertRaises(AttributeError):
+            point_data._metainfo_fields = 1
+        with self.assertRaises(AttributeError):
+            point_data._data_fields = 1
+
+        point_data.keypoints = torch.rand((5, 2))
+        assert 'keypoints' in point_data
+
+    def test_getitem(self):
+        point_data = PointData()
+        # length must be greater than 0
+        with self.assertRaises(IndexError):
+            point_data[1]
+
+        point_data = self.setup_data()
+        assert len(point_data) == 5
+        slice_point_data = point_data[:2]
+        assert len(slice_point_data) == 2
+        slice_point_data = point_data[1]
+        assert len(slice_point_data) == 1
+        # assert the index should in 0 ~ len(point_data) - 1
+        with pytest.raises(IndexError):
+            point_data[5]
+
+        # isinstance(str, slice, int, torch.LongTensor, torch.BoolTensor)
+        item = torch.Tensor([1, 2, 3, 4])  # float
+        with pytest.raises(AssertionError):
+            point_data[item]
+
+        # when input is a bool tensor, The shape of
+        # the input at index 0 should equal to
+        # the value length in instance_data_field
+        with pytest.raises(AssertionError):
+            point_data[item.bool()]
+
+        # test LongTensor
+        long_tensor = torch.randint(5, (2, ))
+        long_index_point_data = point_data[long_tensor]
+        assert len(long_index_point_data) == len(long_tensor)
+
+        # test BoolTensor
+        bool_tensor = torch.rand(5) > 0.5
+        bool_index_point_data = point_data[bool_tensor]
+        assert len(bool_index_point_data) == bool_tensor.sum()
+        bool_tensor = torch.rand(5) > 1
+        empty_point_data = point_data[bool_tensor]
+        assert len(empty_point_data) == bool_tensor.sum()
+
+        # test list index
+        list_index = [1, 2]
+        list_index_point_data = point_data[list_index]
+        assert len(list_index_point_data) == len(list_index)
+
+        # test list bool
+        list_bool = [True, False, True, False, False]
+        list_bool_point_data = point_data[list_bool]
+        assert len(list_bool_point_data) == 2
+
+        # test numpy
+        long_numpy = np.random.randint(5, size=2)
+        long_numpy_point_data = point_data[long_numpy]
+        assert len(long_numpy_point_data) == len(long_numpy)
+
+        bool_numpy = np.random.rand(5) > 0.5
+        bool_numpy_point_data = point_data[bool_numpy]
+        assert len(bool_numpy_point_data) == bool_numpy.sum()
+
+    def test_len(self):
+        point_data = self.setup_data()
+        assert len(point_data) == 5
+        point_data = PointData()
+        assert len(point_data) == 0
diff --git a/mmde/tests/test_structures/test_points/test_base_points.py b/mmde/tests/test_structures/test_points/test_base_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..7984fdbf9db2787f0b51ad23e3099788d7fb3d86
--- /dev/null
+++ b/mmde/tests/test_structures/test_points/test_base_points.py
@@ -0,0 +1,268 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pytest
+import torch
+
+from mmdet3d.structures.points import BasePoints
+
+
+def test_base_points():
+    # test empty initialization
+    empty_boxes = []
+    points = BasePoints(empty_boxes)
+    assert points.tensor.shape[0] == 0
+    assert points.tensor.shape[1] == 3
+
+    # Test init with origin
+    points_np = np.array([[-5.24223238e+00, 4.00209696e+01, 2.97570381e-01],
+                          [-2.66751588e+01, 5.59499564e+00, -9.14345860e-01],
+                          [-5.80979675e+00, 3.54092357e+01, 2.00889888e-01],
+                          [-3.13086877e+01, 1.09007628e+00, -1.94612112e-01]],
+                         dtype=np.float32)
+    base_points = BasePoints(points_np, points_dim=3)
+    assert base_points.tensor.shape[0] == 4
+
+    # Test init with color and height
+    points_np = np.array([[
+        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
+        0.4974, 0.9409
+    ],
+                          [
+                              -2.66751588e+01, 5.59499564e+00, -9.14345860e-01,
+                              0.1502, 0.3707, 0.1086, 0.6297
+                          ],
+                          [
+                              -5.80979675e+00, 3.54092357e+01, 2.00889888e-01,
+                              0.6565, 0.6248, 0.6954, 0.2538
+                          ],
+                          [
+                              -3.13086877e+01, 1.09007628e+00, -1.94612112e-01,
+                              0.2803, 0.0258, 0.4896, 0.3269
+                          ]],
+                         dtype=np.float32)
+    base_points = BasePoints(
+        points_np,
+        points_dim=7,
+        attribute_dims=dict(color=[3, 4, 5], height=6))
+    expected_tensor = torch.tensor([[
+        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
+        0.4974, 0.9409
+    ],
+                                    [
+                                        -2.66751588e+01, 5.59499564e+00,
+                                        -9.14345860e-01, 0.1502, 0.3707,
+                                        0.1086, 0.6297
+                                    ],
+                                    [
+                                        -5.80979675e+00, 3.54092357e+01,
+                                        2.00889888e-01, 0.6565, 0.6248, 0.6954,
+                                        0.2538
+                                    ],
+                                    [
+                                        -3.13086877e+01, 1.09007628e+00,
+                                        -1.94612112e-01, 0.2803, 0.0258,
+                                        0.4896, 0.3269
+                                    ]])
+
+    assert torch.allclose(expected_tensor, base_points.tensor)
+    assert torch.allclose(expected_tensor[:, :2], base_points.bev)
+    assert torch.allclose(expected_tensor[:, :3], base_points.coord)
+    assert torch.allclose(expected_tensor[:, 3:6], base_points.color)
+    assert torch.allclose(expected_tensor[:, 6], base_points.height)
+
+    # test points clone
+    new_base_points = base_points.clone()
+    assert torch.allclose(new_base_points.tensor, base_points.tensor)
+
+    # test points shuffle
+    new_base_points.shuffle()
+    assert new_base_points.tensor.shape == torch.Size([4, 7])
+
+    # test points rotation
+    rot_mat = torch.tensor([[0.93629336, -0.27509585, 0.21835066],
+                            [0.28962948, 0.95642509, -0.03695701],
+                            [-0.19866933, 0.0978434, 0.97517033]])
+
+    base_points.rotate(rot_mat)
+    expected_tensor = torch.tensor([[
+        6.6239e+00, 3.9748e+01, -2.3335e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.3174e+01, 1.2600e+01, -6.9230e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        4.7760e+00, 3.5484e+01, -2.3813e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.8960e+01, 9.6364e+00, -7.0663e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, base_points.tensor, 1e-3)
+
+    new_base_points = base_points.clone()
+    new_base_points.rotate(0.1, axis=2)
+    expected_tensor = torch.tensor([[
+        2.6226e+00, 4.0211e+01, -2.3335e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.4316e+01, 1.0224e+01, -6.9230e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        1.2096e+00, 3.5784e+01, -2.3813e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.9777e+01, 6.6971e+00, -7.0663e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, new_base_points.tensor, 1e-3)
+
+    # test points translation
+    translation_vector = torch.tensor([0.93629336, -0.27509585, 0.21835066])
+    base_points.translate(translation_vector)
+    expected_tensor = torch.tensor([[
+        7.5602e+00, 3.9473e+01, -2.1152e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.2237e+01, 1.2325e+01, -6.7046e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        5.7123e+00, 3.5209e+01, -2.1629e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.8023e+01, 9.3613e+00, -6.8480e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, base_points.tensor, 1e-4)
+
+    # test points filter
+    point_range = [-10, -40, -10, 10, 40, 10]
+    in_range_flags = base_points.in_range_3d(point_range)
+    expected_flags = torch.tensor([True, False, True, False])
+    assert torch.all(in_range_flags == expected_flags)
+
+    # test points scale
+    base_points.scale(1.2)
+    expected_tensor = torch.tensor([[
+        9.0722e+00, 4.7368e+01, -2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.6685e+01, 1.4790e+01, -8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        6.8547e+00, 4.2251e+01, -2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -3.3628e+01, 1.1234e+01, -8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, base_points.tensor, 1e-3)
+
+    # test get_item
+    expected_tensor = torch.tensor(
+        [[-26.6848, 14.7898, -8.0455, 0.1502, 0.3707, 0.1086, 0.6297]])
+    assert torch.allclose(expected_tensor, base_points[1].tensor, 1e-4)
+    expected_tensor = torch.tensor(
+        [[-26.6848, 14.7898, -8.0455, 0.1502, 0.3707, 0.1086, 0.6297],
+         [6.8547, 42.2509, -2.5955, 0.6565, 0.6248, 0.6954, 0.2538]])
+    assert torch.allclose(expected_tensor, base_points[1:3].tensor, 1e-4)
+    mask = torch.tensor([True, False, True, False])
+    expected_tensor = torch.tensor(
+        [[9.0722, 47.3678, -2.5382, 0.6666, 0.1956, 0.4974, 0.9409],
+         [6.8547, 42.2509, -2.5955, 0.6565, 0.6248, 0.6954, 0.2538]])
+    assert torch.allclose(expected_tensor, base_points[mask].tensor, 1e-4)
+    expected_tensor = torch.tensor([[0.6666], [0.1502], [0.6565], [0.2803]])
+    assert torch.allclose(expected_tensor, base_points[:, 3].tensor, 1e-4)
+
+    # test length
+    assert len(base_points) == 4
+
+    # test repr
+    expected_repr = 'BasePoints(\n    '\
+        'tensor([[ 9.0722e+00,  4.7368e+01, -2.5382e+00,  '\
+        '6.6660e-01,  1.9560e-01,\n          4.9740e-01,  '\
+        '9.4090e-01],\n        '\
+        '[-2.6685e+01,  1.4790e+01, -8.0455e+00,  1.5020e-01,  '\
+        '3.7070e-01,\n          '\
+        '1.0860e-01,  6.2970e-01],\n        '\
+        '[ 6.8547e+00,  4.2251e+01, -2.5955e+00,  6.5650e-01,  '\
+        '6.2480e-01,\n          '\
+        '6.9540e-01,  2.5380e-01],\n        '\
+        '[-3.3628e+01,  1.1234e+01, -8.2176e+00,  2.8030e-01,  '\
+        '2.5800e-02,\n          '\
+        '4.8960e-01,  3.2690e-01]]))'
+    assert expected_repr == str(base_points)
+
+    # test concatenate
+    base_points_clone = base_points.clone()
+    cat_points = BasePoints.cat([base_points, base_points_clone])
+    assert torch.allclose(cat_points.tensor[:len(base_points)],
+                          base_points.tensor)
+
+    # test iteration
+    for i, point in enumerate(base_points):
+        assert torch.allclose(point, base_points.tensor[i])
+
+    # test new_point
+    new_points = base_points.new_point([[1, 2, 3, 4, 5, 6, 7]])
+    assert torch.allclose(
+        new_points.tensor,
+        torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=base_points.tensor.dtype))
+
+    # test BasePoint indexing
+    base_points = BasePoints(
+        points_np,
+        points_dim=7,
+        attribute_dims=dict(height=3, color=[4, 5, 6]))
+    assert torch.all(base_points[:, 3:].tensor == torch.tensor(points_np[:,
+                                                                         3:]))
+
+    # test set and get function for BasePoint color and height
+    base_points = BasePoints(points_np[:, :3])
+    assert base_points.attribute_dims is None
+    base_points.height = points_np[:, 3]
+    assert base_points.attribute_dims == dict(height=3)
+    base_points.color = points_np[:, 4:]
+    assert base_points.attribute_dims == dict(height=3, color=[4, 5, 6])
+    assert torch.allclose(base_points.height,
+                          torch.tensor([0.6666, 0.1502, 0.6565, 0.2803]))
+    assert torch.allclose(
+        base_points.color,
+        torch.tensor([[0.1956, 0.4974, 0.9409], [0.3707, 0.1086, 0.6297],
+                      [0.6248, 0.6954, 0.2538], [0.0258, 0.4896, 0.3269]]))
+    # values to be set should have correct shape (e.g. number of points)
+    with pytest.raises(ValueError):
+        base_points.coord = np.random.rand(5, 3)
+    with pytest.raises(ValueError):
+        base_points.height = np.random.rand(3)
+    with pytest.raises(ValueError):
+        base_points.color = np.random.rand(4, 2)
+    base_points.coord = points_np[:, [1, 2, 3]]
+    base_points.height = points_np[:, 0]
+    base_points.color = points_np[:, [4, 5, 6]]
+    assert np.allclose(base_points.coord, points_np[:, 1:4])
+    assert np.allclose(base_points.height, points_np[:, 0])
+    assert np.allclose(base_points.color, points_np[:, 4:])
diff --git a/mmde/tests/test_structures/test_points/test_cam_points.py b/mmde/tests/test_structures/test_points/test_cam_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9ead56c95ec073fd619d1267d4d1e03b8c11b1a
--- /dev/null
+++ b/mmde/tests/test_structures/test_points/test_cam_points.py
@@ -0,0 +1,559 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet3d.structures.points import CameraPoints, LiDARPoints
+
+
+def test_cam_points():
+    # test empty initialization
+    empty_boxes = []
+    points = CameraPoints(empty_boxes)
+    assert points.tensor.shape[0] == 0
+    assert points.tensor.shape[1] == 3
+
+    # Test init with origin
+    points_np = np.array([[-5.24223238e+00, 4.00209696e+01, 2.97570381e-01],
+                          [-2.66751588e+01, 5.59499564e+00, -9.14345860e-01],
+                          [-5.80979675e+00, 3.54092357e+01, 2.00889888e-01],
+                          [-3.13086877e+01, 1.09007628e+00, -1.94612112e-01]],
+                         dtype=np.float32)
+    cam_points = CameraPoints(points_np, points_dim=3)
+    assert cam_points.tensor.shape[0] == 4
+
+    # Test init with color and height
+    points_np = np.array([[
+        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
+        0.4974, 0.9409
+    ],
+                          [
+                              -2.66751588e+01, 5.59499564e+00, -9.14345860e-01,
+                              0.1502, 0.3707, 0.1086, 0.6297
+                          ],
+                          [
+                              -5.80979675e+00, 3.54092357e+01, 2.00889888e-01,
+                              0.6565, 0.6248, 0.6954, 0.2538
+                          ],
+                          [
+                              -3.13086877e+01, 1.09007628e+00, -1.94612112e-01,
+                              0.2803, 0.0258, 0.4896, 0.3269
+                          ]],
+                         dtype=np.float32)
+    cam_points = CameraPoints(
+        points_np,
+        points_dim=7,
+        attribute_dims=dict(color=[3, 4, 5], height=6))
+    expected_tensor = torch.tensor([[
+        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
+        0.4974, 0.9409
+    ],
+                                    [
+                                        -2.66751588e+01, 5.59499564e+00,
+                                        -9.14345860e-01, 0.1502, 0.3707,
+                                        0.1086, 0.6297
+                                    ],
+                                    [
+                                        -5.80979675e+00, 3.54092357e+01,
+                                        2.00889888e-01, 0.6565, 0.6248, 0.6954,
+                                        0.2538
+                                    ],
+                                    [
+                                        -3.13086877e+01, 1.09007628e+00,
+                                        -1.94612112e-01, 0.2803, 0.0258,
+                                        0.4896, 0.3269
+                                    ]])
+
+    assert torch.allclose(expected_tensor, cam_points.tensor)
+    assert torch.allclose(expected_tensor[:, [0, 2]], cam_points.bev)
+    assert torch.allclose(expected_tensor[:, :3], cam_points.coord)
+    assert torch.allclose(expected_tensor[:, 3:6], cam_points.color)
+    assert torch.allclose(expected_tensor[:, 6], cam_points.height)
+
+    # test points clone
+    new_cam_points = cam_points.clone()
+    assert torch.allclose(new_cam_points.tensor, cam_points.tensor)
+
+    # test points shuffle
+    new_cam_points.shuffle()
+    assert new_cam_points.tensor.shape == torch.Size([4, 7])
+
+    # test points rotation
+    rot_mat = torch.tensor([[0.93629336, -0.27509585, 0.21835066],
+                            [0.28962948, 0.95642509, -0.03695701],
+                            [-0.19866933, 0.0978434, 0.97517033]])
+    cam_points.rotate(rot_mat)
+    expected_tensor = torch.tensor([[
+        6.6239e+00, 3.9748e+01, -2.3335e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.3174e+01, 1.2600e+01, -6.9230e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        4.7760e+00, 3.5484e+01, -2.3813e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.8960e+01, 9.6364e+00, -7.0663e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, cam_points.tensor, 1e-3)
+
+    new_cam_points = cam_points.clone()
+    new_cam_points.rotate(0.1, axis=2)
+    expected_tensor = torch.tensor([[
+        2.6226e+00, 4.0211e+01, -2.3335e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.4316e+01, 1.0224e+01, -6.9230e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        1.2096e+00, 3.5784e+01, -2.3813e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.9777e+01, 6.6971e+00, -7.0663e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, new_cam_points.tensor, 1e-3)
+
+    # test points translation
+    translation_vector = torch.tensor([0.93629336, -0.27509585, 0.21835066])
+    cam_points.translate(translation_vector)
+    expected_tensor = torch.tensor([[
+        7.5602e+00, 3.9473e+01, -2.1152e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.2237e+01, 1.2325e+01, -6.7046e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        5.7123e+00, 3.5209e+01, -2.1629e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.8023e+01, 9.3613e+00, -6.8480e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, cam_points.tensor, 1e-4)
+
+    # test points filter
+    point_range = [-10, -40, -10, 10, 40, 10]
+    in_range_flags = cam_points.in_range_3d(point_range)
+    expected_flags = torch.tensor([True, False, True, False])
+    assert torch.all(in_range_flags == expected_flags)
+
+    # test points scale
+    cam_points.scale(1.2)
+    expected_tensor = torch.tensor([[
+        9.0722e+00, 4.7368e+01, -2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.6685e+01, 1.4790e+01, -8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        6.8547e+00, 4.2251e+01, -2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -3.3628e+01, 1.1234e+01, -8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, cam_points.tensor, 1e-3)
+
+    # test get_item
+    expected_tensor = torch.tensor(
+        [[-26.6848, 14.7898, -8.0455, 0.1502, 0.3707, 0.1086, 0.6297]])
+    assert torch.allclose(expected_tensor, cam_points[1].tensor, 1e-4)
+    expected_tensor = torch.tensor(
+        [[-26.6848, 14.7898, -8.0455, 0.1502, 0.3707, 0.1086, 0.6297],
+         [6.8547, 42.2509, -2.5955, 0.6565, 0.6248, 0.6954, 0.2538]])
+    assert torch.allclose(expected_tensor, cam_points[1:3].tensor, 1e-4)
+    mask = torch.tensor([True, False, True, False])
+    expected_tensor = torch.tensor(
+        [[9.0722, 47.3678, -2.5382, 0.6666, 0.1956, 0.4974, 0.9409],
+         [6.8547, 42.2509, -2.5955, 0.6565, 0.6248, 0.6954, 0.2538]])
+    assert torch.allclose(expected_tensor, cam_points[mask].tensor, 1e-4)
+    expected_tensor = torch.tensor([[0.6666], [0.1502], [0.6565], [0.2803]])
+    assert torch.allclose(expected_tensor, cam_points[:, 3].tensor, 1e-4)
+
+    # test length
+    assert len(cam_points) == 4
+
+    # test repr
+    expected_repr = 'CameraPoints(\n    '\
+        'tensor([[ 9.0722e+00,  4.7368e+01, -2.5382e+00,  '\
+        '6.6660e-01,  1.9560e-01,\n          4.9740e-01,  '\
+        '9.4090e-01],\n        '\
+        '[-2.6685e+01,  1.4790e+01, -8.0455e+00,  1.5020e-01,  '\
+        '3.7070e-01,\n          '\
+        '1.0860e-01,  6.2970e-01],\n        '\
+        '[ 6.8547e+00,  4.2251e+01, -2.5955e+00,  6.5650e-01,  '\
+        '6.2480e-01,\n          '\
+        '6.9540e-01,  2.5380e-01],\n        '\
+        '[-3.3628e+01,  1.1234e+01, -8.2176e+00,  2.8030e-01,  '\
+        '2.5800e-02,\n          '\
+        '4.8960e-01,  3.2690e-01]]))'
+    assert expected_repr == str(cam_points)
+
+    # test concatenate
+    cam_points_clone = cam_points.clone()
+    cat_points = CameraPoints.cat([cam_points, cam_points_clone])
+    assert torch.allclose(cat_points.tensor[:len(cam_points)],
+                          cam_points.tensor)
+
+    # test iteration
+    for i, point in enumerate(cam_points):
+        assert torch.allclose(point, cam_points.tensor[i])
+
+    # test new_point
+    new_points = cam_points.new_point([[1, 2, 3, 4, 5, 6, 7]])
+    assert torch.allclose(
+        new_points.tensor,
+        torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=cam_points.tensor.dtype))
+
+    # test in_range_bev
+    point_bev_range = [-10, -10, 10, 10]
+    in_range_flags = cam_points.in_range_bev(point_bev_range)
+    expected_flags = torch.tensor([True, False, True, False])
+    assert torch.all(in_range_flags == expected_flags)
+
+    # test flip
+    cam_points.flip(bev_direction='horizontal')
+    expected_tensor = torch.tensor([[
+        -9.0722e+00, 4.7368e+01, -2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        2.6685e+01, 1.4790e+01, -8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        -6.8547e+00, 4.2251e+01, -2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        3.3628e+01, 1.1234e+01, -8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, cam_points.tensor, 1e-4)
+
+    cam_points.flip(bev_direction='vertical')
+    expected_tensor = torch.tensor([[
+        -9.0722e+00, 4.7368e+01, 2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        2.6685e+01, 1.4790e+01, 8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        -6.8547e+00, 4.2251e+01, 2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        3.3628e+01, 1.1234e+01, 8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, cam_points.tensor, 1e-4)
+
+
+def test_lidar_points():
+    # test empty initialization
+    empty_boxes = []
+    points = LiDARPoints(empty_boxes)
+    assert points.tensor.shape[0] == 0
+    assert points.tensor.shape[1] == 3
+
+    # Test init with origin
+    points_np = np.array([[-5.24223238e+00, 4.00209696e+01, 2.97570381e-01],
+                          [-2.66751588e+01, 5.59499564e+00, -9.14345860e-01],
+                          [-5.80979675e+00, 3.54092357e+01, 2.00889888e-01],
+                          [-3.13086877e+01, 1.09007628e+00, -1.94612112e-01]],
+                         dtype=np.float32)
+    lidar_points = LiDARPoints(points_np, points_dim=3)
+    assert lidar_points.tensor.shape[0] == 4
+
+    # Test init with color and height
+    points_np = np.array([[
+        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
+        0.4974, 0.9409
+    ],
+                          [
+                              -2.66751588e+01, 5.59499564e+00, -9.14345860e-01,
+                              0.1502, 0.3707, 0.1086, 0.6297
+                          ],
+                          [
+                              -5.80979675e+00, 3.54092357e+01, 2.00889888e-01,
+                              0.6565, 0.6248, 0.6954, 0.2538
+                          ],
+                          [
+                              -3.13086877e+01, 1.09007628e+00, -1.94612112e-01,
+                              0.2803, 0.0258, 0.4896, 0.3269
+                          ]],
+                         dtype=np.float32)
+    lidar_points = LiDARPoints(
+        points_np,
+        points_dim=7,
+        attribute_dims=dict(color=[3, 4, 5], height=6))
+    expected_tensor = torch.tensor([[
+        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
+        0.4974, 0.9409
+    ],
+                                    [
+                                        -2.66751588e+01, 5.59499564e+00,
+                                        -9.14345860e-01, 0.1502, 0.3707,
+                                        0.1086, 0.6297
+                                    ],
+                                    [
+                                        -5.80979675e+00, 3.54092357e+01,
+                                        2.00889888e-01, 0.6565, 0.6248, 0.6954,
+                                        0.2538
+                                    ],
+                                    [
+                                        -3.13086877e+01, 1.09007628e+00,
+                                        -1.94612112e-01, 0.2803, 0.0258,
+                                        0.4896, 0.3269
+                                    ]])
+
+    assert torch.allclose(expected_tensor, lidar_points.tensor)
+    assert torch.allclose(expected_tensor[:, :2], lidar_points.bev)
+    assert torch.allclose(expected_tensor[:, :3], lidar_points.coord)
+    assert torch.allclose(expected_tensor[:, 3:6], lidar_points.color)
+    assert torch.allclose(expected_tensor[:, 6], lidar_points.height)
+
+    # test points clone
+    new_lidar_points = lidar_points.clone()
+    assert torch.allclose(new_lidar_points.tensor, lidar_points.tensor)
+
+    # test points shuffle
+    new_lidar_points.shuffle()
+    assert new_lidar_points.tensor.shape == torch.Size([4, 7])
+
+    # test points rotation
+    rot_mat = torch.tensor([[0.93629336, -0.27509585, 0.21835066],
+                            [0.28962948, 0.95642509, -0.03695701],
+                            [-0.19866933, 0.0978434, 0.97517033]])
+    lidar_points.rotate(rot_mat)
+    expected_tensor = torch.tensor([[
+        6.6239e+00, 3.9748e+01, -2.3335e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.3174e+01, 1.2600e+01, -6.9230e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        4.7760e+00, 3.5484e+01, -2.3813e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.8960e+01, 9.6364e+00, -7.0663e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, lidar_points.tensor, 1e-3)
+
+    new_lidar_points = lidar_points.clone()
+    new_lidar_points.rotate(0.1, axis=2)
+    expected_tensor = torch.tensor([[
+        2.6226e+00, 4.0211e+01, -2.3335e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.4316e+01, 1.0224e+01, -6.9230e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        1.2096e+00, 3.5784e+01, -2.3813e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.9777e+01, 6.6971e+00, -7.0663e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, new_lidar_points.tensor, 1e-3)
+
+    # test points translation
+    translation_vector = torch.tensor([0.93629336, -0.27509585, 0.21835066])
+    lidar_points.translate(translation_vector)
+    expected_tensor = torch.tensor([[
+        7.5602e+00, 3.9473e+01, -2.1152e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.2237e+01, 1.2325e+01, -6.7046e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        5.7123e+00, 3.5209e+01, -2.1629e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.8023e+01, 9.3613e+00, -6.8480e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, lidar_points.tensor, 1e-4)
+
+    # test points filter
+    point_range = [-10, -40, -10, 10, 40, 10]
+    in_range_flags = lidar_points.in_range_3d(point_range)
+    expected_flags = torch.tensor([True, False, True, False])
+    assert torch.all(in_range_flags == expected_flags)
+
+    # test points scale
+    lidar_points.scale(1.2)
+    expected_tensor = torch.tensor([[
+        9.0722e+00, 4.7368e+01, -2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.6685e+01, 1.4790e+01, -8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        6.8547e+00, 4.2251e+01, -2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -3.3628e+01, 1.1234e+01, -8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, lidar_points.tensor, 1e-3)
+
+    # test get_item
+    expected_tensor = torch.tensor(
+        [[-26.6848, 14.7898, -8.0455, 0.1502, 0.3707, 0.1086, 0.6297]])
+    assert torch.allclose(expected_tensor, lidar_points[1].tensor, 1e-4)
+    expected_tensor = torch.tensor(
+        [[-26.6848, 14.7898, -8.0455, 0.1502, 0.3707, 0.1086, 0.6297],
+         [6.8547, 42.2509, -2.5955, 0.6565, 0.6248, 0.6954, 0.2538]])
+    assert torch.allclose(expected_tensor, lidar_points[1:3].tensor, 1e-4)
+    mask = torch.tensor([True, False, True, False])
+    expected_tensor = torch.tensor(
+        [[9.0722, 47.3678, -2.5382, 0.6666, 0.1956, 0.4974, 0.9409],
+         [6.8547, 42.2509, -2.5955, 0.6565, 0.6248, 0.6954, 0.2538]])
+    assert torch.allclose(expected_tensor, lidar_points[mask].tensor, 1e-4)
+    expected_tensor = torch.tensor([[0.6666], [0.1502], [0.6565], [0.2803]])
+    assert torch.allclose(expected_tensor, lidar_points[:, 3].tensor, 1e-4)
+
+    # test length
+    assert len(lidar_points) == 4
+
+    # test repr
+    expected_repr = 'LiDARPoints(\n    '\
+        'tensor([[ 9.0722e+00,  4.7368e+01, -2.5382e+00,  '\
+        '6.6660e-01,  1.9560e-01,\n          4.9740e-01,  '\
+        '9.4090e-01],\n        '\
+        '[-2.6685e+01,  1.4790e+01, -8.0455e+00,  1.5020e-01,  '\
+        '3.7070e-01,\n          '\
+        '1.0860e-01,  6.2970e-01],\n        '\
+        '[ 6.8547e+00,  4.2251e+01, -2.5955e+00,  6.5650e-01,  '\
+        '6.2480e-01,\n          '\
+        '6.9540e-01,  2.5380e-01],\n        '\
+        '[-3.3628e+01,  1.1234e+01, -8.2176e+00,  2.8030e-01,  '\
+        '2.5800e-02,\n          '\
+        '4.8960e-01,  3.2690e-01]]))'
+    assert expected_repr == str(lidar_points)
+
+    # test concatenate
+    lidar_points_clone = lidar_points.clone()
+    cat_points = LiDARPoints.cat([lidar_points, lidar_points_clone])
+    assert torch.allclose(cat_points.tensor[:len(lidar_points)],
+                          lidar_points.tensor)
+
+    # test iteration
+    for i, point in enumerate(lidar_points):
+        assert torch.allclose(point, lidar_points.tensor[i])
+
+    # test new_point
+    new_points = lidar_points.new_point([[1, 2, 3, 4, 5, 6, 7]])
+    assert torch.allclose(
+        new_points.tensor,
+        torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=lidar_points.tensor.dtype))
+
+    # test in_range_bev
+    point_bev_range = [-30, -40, 30, 40]
+    in_range_flags = lidar_points.in_range_bev(point_bev_range)
+    expected_flags = torch.tensor([False, True, False, False])
+    assert torch.all(in_range_flags == expected_flags)
+
+    # test flip
+    lidar_points.flip(bev_direction='horizontal')
+    expected_tensor = torch.tensor([[
+        9.0722e+00, -4.7368e+01, -2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.6685e+01, -1.4790e+01, -8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        6.8547e+00, -4.2251e+01, -2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -3.3628e+01, -1.1234e+01, -8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, lidar_points.tensor, 1e-4)
+
+    lidar_points.flip(bev_direction='vertical')
+    expected_tensor = torch.tensor([[
+        -9.0722e+00, -4.7368e+01, -2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        2.6685e+01, -1.4790e+01, -8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        -6.8547e+00, -4.2251e+01, -2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        3.3628e+01, -1.1234e+01, -8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, lidar_points.tensor, 1e-4)
diff --git a/mmde/tests/test_structures/test_points/test_depth_points.py b/mmde/tests/test_structures/test_points/test_depth_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc5f911216c0341cfc8772767f62f35083d1ee18
--- /dev/null
+++ b/mmde/tests/test_structures/test_points/test_depth_points.py
@@ -0,0 +1,282 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmdet3d.structures.points import DepthPoints
+
+
+def test_depth_points():
+    # test empty initialization
+    empty_boxes = []
+    points = DepthPoints(empty_boxes)
+    assert points.tensor.shape[0] == 0
+    assert points.tensor.shape[1] == 3
+
+    # Test init with origin
+    points_np = np.array([[-5.24223238e+00, 4.00209696e+01, 2.97570381e-01],
+                          [-2.66751588e+01, 5.59499564e+00, -9.14345860e-01],
+                          [-5.80979675e+00, 3.54092357e+01, 2.00889888e-01],
+                          [-3.13086877e+01, 1.09007628e+00, -1.94612112e-01]],
+                         dtype=np.float32)
+    depth_points = DepthPoints(points_np, points_dim=3)
+    assert depth_points.tensor.shape[0] == 4
+
+    # Test init with color and height
+    points_np = np.array([[
+        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
+        0.4974, 0.9409
+    ],
+                          [
+                              -2.66751588e+01, 5.59499564e+00, -9.14345860e-01,
+                              0.1502, 0.3707, 0.1086, 0.6297
+                          ],
+                          [
+                              -5.80979675e+00, 3.54092357e+01, 2.00889888e-01,
+                              0.6565, 0.6248, 0.6954, 0.2538
+                          ],
+                          [
+                              -3.13086877e+01, 1.09007628e+00, -1.94612112e-01,
+                              0.2803, 0.0258, 0.4896, 0.3269
+                          ]],
+                         dtype=np.float32)
+    depth_points = DepthPoints(
+        points_np,
+        points_dim=7,
+        attribute_dims=dict(color=[3, 4, 5], height=6))
+    expected_tensor = torch.tensor([[
+        -5.24223238e+00, 4.00209696e+01, 2.97570381e-01, 0.6666, 0.1956,
+        0.4974, 0.9409
+    ],
+                                    [
+                                        -2.66751588e+01, 5.59499564e+00,
+                                        -9.14345860e-01, 0.1502, 0.3707,
+                                        0.1086, 0.6297
+                                    ],
+                                    [
+                                        -5.80979675e+00, 3.54092357e+01,
+                                        2.00889888e-01, 0.6565, 0.6248, 0.6954,
+                                        0.2538
+                                    ],
+                                    [
+                                        -3.13086877e+01, 1.09007628e+00,
+                                        -1.94612112e-01, 0.2803, 0.0258,
+                                        0.4896, 0.3269
+                                    ]])
+
+    assert torch.allclose(expected_tensor, depth_points.tensor)
+    assert torch.allclose(expected_tensor[:, :2], depth_points.bev)
+    assert torch.allclose(expected_tensor[:, :3], depth_points.coord)
+    assert torch.allclose(expected_tensor[:, 3:6], depth_points.color)
+    assert torch.allclose(expected_tensor[:, 6], depth_points.height)
+
+    # test points clone
+    new_depth_points = depth_points.clone()
+    assert torch.allclose(new_depth_points.tensor, depth_points.tensor)
+
+    # test points shuffle
+    new_depth_points.shuffle()
+    assert new_depth_points.tensor.shape == torch.Size([4, 7])
+
+    # test points rotation
+    rot_mat = torch.tensor([[0.93629336, -0.27509585, 0.21835066],
+                            [0.28962948, 0.95642509, -0.03695701],
+                            [-0.19866933, 0.0978434, 0.97517033]])
+    depth_points.rotate(rot_mat)
+    expected_tensor = torch.tensor([[
+        6.6239e+00, 3.9748e+01, -2.3335e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.3174e+01, 1.2600e+01, -6.9230e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        4.7760e+00, 3.5484e+01, -2.3813e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.8960e+01, 9.6364e+00, -7.0663e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, depth_points.tensor, 1e-3)
+
+    new_depth_points = depth_points.clone()
+    new_depth_points.rotate(0.1, axis=2)
+    expected_tensor = torch.tensor([[
+        2.6226e+00, 4.0211e+01, -2.3335e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.4316e+01, 1.0224e+01, -6.9230e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        1.2096e+00, 3.5784e+01, -2.3813e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.9777e+01, 6.6971e+00, -7.0663e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, new_depth_points.tensor, 1e-3)
+
+    # test points translation
+    translation_vector = torch.tensor([0.93629336, -0.27509585, 0.21835066])
+    depth_points.translate(translation_vector)
+    expected_tensor = torch.tensor([[
+        7.5602e+00, 3.9473e+01, -2.1152e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.2237e+01, 1.2325e+01, -6.7046e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        5.7123e+00, 3.5209e+01, -2.1629e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -2.8023e+01, 9.3613e+00, -6.8480e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, depth_points.tensor, 1e-4)
+
+    # test points filter
+    point_range = [-10, -40, -10, 10, 40, 10]
+    in_range_flags = depth_points.in_range_3d(point_range)
+    expected_flags = torch.tensor([True, False, True, False])
+    assert torch.all(in_range_flags == expected_flags)
+
+    # test points scale
+    depth_points.scale(1.2)
+    expected_tensor = torch.tensor([[
+        9.0722e+00, 4.7368e+01, -2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        -2.6685e+01, 1.4790e+01, -8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        6.8547e+00, 4.2251e+01, -2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        -3.3628e+01, 1.1234e+01, -8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, depth_points.tensor, 1e-3)
+
+    # test get_item
+    expected_tensor = torch.tensor(
+        [[-26.6848, 14.7898, -8.0455, 0.1502, 0.3707, 0.1086, 0.6297]])
+    assert torch.allclose(expected_tensor, depth_points[1].tensor, 1e-4)
+    expected_tensor = torch.tensor(
+        [[-26.6848, 14.7898, -8.0455, 0.1502, 0.3707, 0.1086, 0.6297],
+         [6.8547, 42.2509, -2.5955, 0.6565, 0.6248, 0.6954, 0.2538]])
+    assert torch.allclose(expected_tensor, depth_points[1:3].tensor, 1e-4)
+    mask = torch.tensor([True, False, True, False])
+    expected_tensor = torch.tensor(
+        [[9.0722, 47.3678, -2.5382, 0.6666, 0.1956, 0.4974, 0.9409],
+         [6.8547, 42.2509, -2.5955, 0.6565, 0.6248, 0.6954, 0.2538]])
+    assert torch.allclose(expected_tensor, depth_points[mask].tensor, 1e-4)
+    expected_tensor = torch.tensor([[0.6666], [0.1502], [0.6565], [0.2803]])
+    assert torch.allclose(expected_tensor, depth_points[:, 3].tensor, 1e-4)
+
+    # test length
+    assert len(depth_points) == 4
+
+    # test repr
+    expected_repr = 'DepthPoints(\n    '\
+        'tensor([[ 9.0722e+00,  4.7368e+01, -2.5382e+00,  '\
+        '6.6660e-01,  1.9560e-01,\n          4.9740e-01,  '\
+        '9.4090e-01],\n        '\
+        '[-2.6685e+01,  1.4790e+01, -8.0455e+00,  1.5020e-01,  '\
+        '3.7070e-01,\n          '\
+        '1.0860e-01,  6.2970e-01],\n        '\
+        '[ 6.8547e+00,  4.2251e+01, -2.5955e+00,  6.5650e-01,  '\
+        '6.2480e-01,\n          '\
+        '6.9540e-01,  2.5380e-01],\n        '\
+        '[-3.3628e+01,  1.1234e+01, -8.2176e+00,  2.8030e-01,  '\
+        '2.5800e-02,\n          '\
+        '4.8960e-01,  3.2690e-01]]))'
+    assert expected_repr == str(depth_points)
+
+    # test concatenate
+    depth_points_clone = depth_points.clone()
+    cat_points = DepthPoints.cat([depth_points, depth_points_clone])
+    assert torch.allclose(cat_points.tensor[:len(depth_points)],
+                          depth_points.tensor)
+
+    # test iteration
+    for i, point in enumerate(depth_points):
+        assert torch.allclose(point, depth_points.tensor[i])
+
+    # test new_point
+    new_points = depth_points.new_point([[1, 2, 3, 4, 5, 6, 7]])
+    assert torch.allclose(
+        new_points.tensor,
+        torch.tensor([[1, 2, 3, 4, 5, 6, 7]], dtype=depth_points.tensor.dtype))
+
+    # test in_range_bev
+    point_bev_range = [-30, -40, 30, 40]
+    in_range_flags = depth_points.in_range_bev(point_bev_range)
+    expected_flags = torch.tensor([False, True, False, False])
+    assert torch.all(in_range_flags == expected_flags)
+
+    # test flip
+    depth_points.flip(bev_direction='horizontal')
+    expected_tensor = torch.tensor([[
+        -9.0722e+00, 4.7368e+01, -2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        2.6685e+01, 1.4790e+01, -8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        -6.8547e+00, 4.2251e+01, -2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        3.3628e+01, 1.1234e+01, -8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, depth_points.tensor, 1e-4)
+
+    depth_points.flip(bev_direction='vertical')
+    expected_tensor = torch.tensor([[
+        -9.0722e+00, -4.7368e+01, -2.5382e+00, 6.6660e-01, 1.9560e-01,
+        4.9740e-01, 9.4090e-01
+    ],
+                                    [
+                                        2.6685e+01, -1.4790e+01, -8.0455e+00,
+                                        1.5020e-01, 3.7070e-01, 1.0860e-01,
+                                        6.2970e-01
+                                    ],
+                                    [
+                                        -6.8547e+00, -4.2251e+01, -2.5955e+00,
+                                        6.5650e-01, 6.2480e-01, 6.9540e-01,
+                                        2.5380e-01
+                                    ],
+                                    [
+                                        3.3628e+01, -1.1234e+01, -8.2176e+00,
+                                        2.8030e-01, 2.5800e-02, 4.8960e-01,
+                                        3.2690e-01
+                                    ]])
+    assert torch.allclose(expected_tensor, depth_points.tensor, 1e-4)
diff --git a/mmde/tests/test_utils/test_compat_cfg.py b/mmde/tests/test_utils/test_compat_cfg.py
new file mode 100644
index 0000000000000000000000000000000000000000..4672f330b20988222ede3d6b4009dd669ec1f6b3
--- /dev/null
+++ b/mmde/tests/test_utils/test_compat_cfg.py
@@ -0,0 +1,113 @@
+import pytest
+from mmengine import ConfigDict
+
+from mmdet3d.utils.compat_cfg import (compat_imgs_per_gpu, compat_loader_args,
+                                      compat_runner_args)
+
+
+def test_compat_runner_args():
+    cfg = ConfigDict(dict(total_epochs=12))
+    with pytest.warns(None) as record:
+        cfg = compat_runner_args(cfg)
+    assert len(record) == 1
+    assert 'runner' in record.list[0].message.args[0]
+    assert 'runner' in cfg
+    assert cfg.runner.type == 'EpochBasedRunner'
+    assert cfg.runner.max_epochs == cfg.total_epochs
+
+
+def test_compat_loader_args():
+    cfg = ConfigDict(dict(data=dict(val=dict(), test=dict(), train=dict())))
+    cfg = compat_loader_args(cfg)
+    # auto fill loader args
+    assert 'val_dataloader' in cfg.data
+    assert 'train_dataloader' in cfg.data
+    assert 'test_dataloader' in cfg.data
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=dict(samples_per_gpu=2),
+                train=dict())))
+    cfg = compat_loader_args(cfg)
+
+    assert cfg.data.train_dataloader.workers_per_gpu == 1
+    assert cfg.data.train_dataloader.samples_per_gpu == 1
+    assert cfg.data.train_dataloader.persistent_workers
+    assert cfg.data.val_dataloader.workers_per_gpu == 1
+    assert cfg.data.val_dataloader.samples_per_gpu == 3
+    assert cfg.data.test_dataloader.workers_per_gpu == 1
+    assert cfg.data.test_dataloader.samples_per_gpu == 2
+
+    # test test is a list
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=[dict(samples_per_gpu=2),
+                      dict(samples_per_gpu=3)],
+                train=dict())))
+
+    cfg = compat_loader_args(cfg)
+
+    # assert can not set args at the same time
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=dict(samples_per_gpu=2),
+                train=dict(),
+                train_dataloader=dict(samples_per_gpu=2))))
+    # samples_per_gpu can not be set in `train_dataloader`
+    # and data field at the same time
+    with pytest.raises(AssertionError):
+        compat_loader_args(cfg)
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=dict(samples_per_gpu=2),
+                train=dict(),
+                val_dataloader=dict(samples_per_gpu=2))))
+    # samples_per_gpu can not be set in `val_dataloader`
+    # and data field at the same time
+    with pytest.raises(AssertionError):
+        compat_loader_args(cfg)
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                samples_per_gpu=1,
+                persistent_workers=True,
+                workers_per_gpu=1,
+                val=dict(samples_per_gpu=3),
+                test=dict(samples_per_gpu=2),
+                test_dataloader=dict(samples_per_gpu=2))))
+    # samples_per_gpu can not be set in `test_dataloader`
+    # and data field at the same time
+    with pytest.raises(AssertionError):
+        compat_loader_args(cfg)
+
+
+def test_compat_imgs_per_gpu():
+    cfg = ConfigDict(
+        dict(
+            data=dict(
+                imgs_per_gpu=1,
+                samples_per_gpu=2,
+                val=dict(),
+                test=dict(),
+                train=dict())))
+    cfg = compat_imgs_per_gpu(cfg)
+    assert cfg.data.samples_per_gpu == cfg.data.imgs_per_gpu
diff --git a/mmde/tests/test_utils/test_setup_env.py b/mmde/tests/test_utils/test_setup_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7374b720422633af7b9d1a23fb1e93aa36d4cb7
--- /dev/null
+++ b/mmde/tests/test_utils/test_setup_env.py
@@ -0,0 +1,81 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import multiprocessing as mp
+import os
+import platform
+import sys
+
+import cv2
+from mmengine import Config, DefaultScope
+
+from mmdet3d.utils import register_all_modules, setup_multi_processes
+
+
+def test_register_all_modules():
+    from mmdet3d.registry import DATASETS
+
+    sys.modules.pop('mmdet3d.datasets', None)
+    sys.modules.pop('mmdet3d.datasets.kitti_dataset', None)
+    DATASETS._module_dict.pop('KittiDataset', None)
+    assert 'KittiDataset' not in DATASETS.module_dict
+    register_all_modules(init_default_scope=True)
+    assert 'KittiDataset' in DATASETS.module_dict
+    assert DefaultScope.get_current_instance().scope_name == 'mmdet3d'
+
+
+def test_setup_multi_processes():
+    # temp save system setting
+    sys_start_mehod = mp.get_start_method(allow_none=True)
+    sys_cv_threads = cv2.getNumThreads()
+    # pop and temp save system env vars
+    sys_omp_threads = os.environ.pop('OMP_NUM_THREADS', default=None)
+    sys_mkl_threads = os.environ.pop('MKL_NUM_THREADS', default=None)
+
+    # test config without setting env
+    config = dict(data=dict(workers_per_gpu=2))
+    cfg = Config(config)
+    setup_multi_processes(cfg)
+    assert os.getenv('OMP_NUM_THREADS') == '1'
+    assert os.getenv('MKL_NUM_THREADS') == '1'
+    # when set to 0, the num threads will be 1
+    assert cv2.getNumThreads() == 1
+    if platform.system() != 'Windows':
+        assert mp.get_start_method() == 'fork'
+
+    # test num workers <= 1
+    os.environ.pop('OMP_NUM_THREADS')
+    os.environ.pop('MKL_NUM_THREADS')
+    config = dict(data=dict(workers_per_gpu=0))
+    cfg = Config(config)
+    setup_multi_processes(cfg)
+    assert 'OMP_NUM_THREADS' not in os.environ
+    assert 'MKL_NUM_THREADS' not in os.environ
+
+    # test manually set env var
+    os.environ['OMP_NUM_THREADS'] = '4'
+    config = dict(data=dict(workers_per_gpu=2))
+    cfg = Config(config)
+    setup_multi_processes(cfg)
+    assert os.getenv('OMP_NUM_THREADS') == '4'
+
+    # test manually set opencv threads and mp start method
+    config = dict(
+        data=dict(workers_per_gpu=2),
+        opencv_num_threads=4,
+        mp_start_method='spawn')
+    cfg = Config(config)
+    setup_multi_processes(cfg)
+    assert cv2.getNumThreads() == 4
+    assert mp.get_start_method() == 'spawn'
+
+    # revert setting to avoid affecting other programs
+    if sys_start_mehod:
+        mp.set_start_method(sys_start_mehod, force=True)
+    cv2.setNumThreads(sys_cv_threads)
+    if sys_omp_threads:
+        os.environ['OMP_NUM_THREADS'] = sys_omp_threads
+    else:
+        os.environ.pop('OMP_NUM_THREADS')
+    if sys_mkl_threads:
+        os.environ['MKL_NUM_THREADS'] = sys_mkl_threads
+    else:
+        os.environ.pop('MKL_NUM_THREADS')
diff --git a/mmde/tools/analysis_tools/analyze_logs.py b/mmde/tools/analysis_tools/analyze_logs.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb7429b81ceece78c3762a24117adeb174c19b53
--- /dev/null
+++ b/mmde/tools/analysis_tools/analyze_logs.py
@@ -0,0 +1,209 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+from collections import defaultdict
+
+import numpy as np
+import seaborn as sns
+from matplotlib import pyplot as plt
+
+
+def cal_train_time(log_dicts, args):
+    for i, log_dict in enumerate(log_dicts):
+        print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
+        all_times = []
+        for epoch in log_dict.keys():
+            if args.include_outliers:
+                all_times.append(log_dict[epoch]['time'])
+            else:
+                all_times.append(log_dict[epoch]['time'][1:])
+        if not all_times:
+            raise KeyError(
+                'Please reduce the log interval in the config so that '
+                'interval is less than iterations of one epoch.')
+        epoch_ave_time = np.array(list(map(lambda x: np.mean(x), all_times)))
+        slowest_epoch = epoch_ave_time.argmax()
+        fastest_epoch = epoch_ave_time.argmin()
+        std_over_epoch = epoch_ave_time.std()
+        print(f'slowest epoch {slowest_epoch + 1}, '
+              f'average time is {epoch_ave_time[slowest_epoch]:.4f} s/iter')
+        print(f'fastest epoch {fastest_epoch + 1}, '
+              f'average time is {epoch_ave_time[fastest_epoch]:.4f} s/iter')
+        print(f'time std over epochs is {std_over_epoch:.4f}')
+        print(f'average iter time: {np.mean(epoch_ave_time):.4f} s/iter\n')
+
+
+def plot_curve(log_dicts, args):
+    if args.backend is not None:
+        plt.switch_backend(args.backend)
+    sns.set_style(args.style)
+    # if legend is None, use {filename}_{key} as legend
+    legend = args.legend
+    if legend is None:
+        legend = []
+        for json_log in args.json_logs:
+            for metric in args.keys:
+                legend.append(f'{json_log}_{metric}')
+    assert len(legend) == (len(args.json_logs) * len(args.keys))
+    metrics = args.keys
+
+    num_metrics = len(metrics)
+    for i, log_dict in enumerate(log_dicts):
+        epochs = list(log_dict.keys())
+        for j, metric in enumerate(metrics):
+            print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
+            if metric not in log_dict[epochs[int(args.eval_interval) - 1]]:
+                if args.eval:
+                    raise KeyError(
+                        f'{args.json_logs[i]} does not contain metric '
+                        f'{metric}. Please check if "--no-validate" is '
+                        'specified when you trained the model. Or check '
+                        f'if the eval_interval {args.eval_interval} in args '
+                        'is equal to the `eval_interval` during training.')
+                raise KeyError(
+                    f'{args.json_logs[i]} does not contain metric {metric}. '
+                    'Please reduce the log interval in the config so that '
+                    'interval is less than iterations of one epoch.')
+
+            if args.eval:
+                xs = []
+                ys = []
+                for epoch in epochs:
+                    ys += log_dict[epoch][metric]
+                    if log_dict[epoch][metric]:
+                        xs += [epoch]
+                plt.xlabel('epoch')
+                plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
+            else:
+                xs = []
+                ys = []
+                for epoch in epochs:
+                    iters = log_dict[epoch]['step']
+                    xs.append(np.array(iters))
+                    ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
+                xs = np.concatenate(xs)
+                ys = np.concatenate(ys)
+                plt.xlabel('iter')
+                plt.plot(
+                    xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
+                plt.legend()
+        if args.title is not None:
+            plt.title(args.title)
+    if args.out is None:
+        plt.show()
+    else:
+        print(f'save curve to: {args.out}')
+        plt.savefig(args.out)
+        plt.cla()
+
+
+def add_plot_parser(subparsers):
+    parser_plt = subparsers.add_parser(
+        'plot_curve', help='parser for plotting curves')
+    parser_plt.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_plt.add_argument(
+        '--keys',
+        type=str,
+        nargs='+',
+        default=['mAP_0.25'],
+        help='the metric that you want to plot')
+    parser_plt.add_argument(
+        '--eval',
+        action='store_true',
+        help='whether to plot evaluation metric')
+    parser_plt.add_argument(
+        '--eval-interval',
+        type=str,
+        default='1',
+        help='the eval interval when training')
+    parser_plt.add_argument('--title', type=str, help='title of figure')
+    parser_plt.add_argument(
+        '--legend',
+        type=str,
+        nargs='+',
+        default=None,
+        help='legend of each plot')
+    parser_plt.add_argument(
+        '--backend', type=str, default=None, help='backend of plt')
+    parser_plt.add_argument(
+        '--style', type=str, default='dark', help='style of plt')
+    parser_plt.add_argument('--out', type=str, default=None)
+
+
+def add_time_parser(subparsers):
+    parser_time = subparsers.add_parser(
+        'cal_train_time',
+        help='parser for computing the average time per training iteration')
+    parser_time.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_time.add_argument(
+        '--include-outliers',
+        action='store_true',
+        help='include the first value of every epoch when computing '
+        'the average time')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Analyze Json Log')
+    # currently only support plot curve and calculate average train time
+    subparsers = parser.add_subparsers(dest='task', help='task parser')
+    add_plot_parser(subparsers)
+    add_time_parser(subparsers)
+    args = parser.parse_args()
+    return args
+
+
+def load_json_logs(json_logs):
+    # load and convert json_logs to log_dict, key is epoch, value is a sub dict
+    # keys of sub dict is different metrics, e.g. memory, bbox_mAP
+    # value of sub dict is a list of corresponding values of all iterations
+    log_dicts = [dict() for _ in json_logs]
+    for json_log, log_dict in zip(json_logs, log_dicts):
+        with open(json_log, 'r') as log_file:
+            epoch = 1
+            for i, line in enumerate(log_file):
+                log = json.loads(line.strip())
+                val_flag = False
+                # skip lines only contains one key
+                if not len(log) > 1:
+                    continue
+
+                if epoch not in log_dict:
+                    log_dict[epoch] = defaultdict(list)
+
+                for k, v in log.items():
+                    if '/' in k:
+                        log_dict[epoch][k.split('/')[-1]].append(v)
+                        val_flag = True
+                    elif val_flag:
+                        continue
+                    else:
+                        log_dict[epoch][k].append(v)
+
+                if 'epoch' in log.keys():
+                    epoch = log['epoch']
+
+    return log_dicts
+
+
+def main():
+    args = parse_args()
+
+    json_logs = args.json_logs
+    for json_log in json_logs:
+        assert json_log.endswith('.json')
+
+    log_dicts = load_json_logs(json_logs)
+
+    eval(args.task)(log_dicts, args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/analysis_tools/benchmark.py b/mmde/tools/analysis_tools/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6017ac367580c9340c2e3b25ccba85482129712f
--- /dev/null
+++ b/mmde/tools/analysis_tools/benchmark.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import time
+
+import torch
+from mmengine import Config
+from mmengine.device import get_device
+from mmengine.registry import init_default_scope
+from mmengine.runner import Runner, autocast, load_checkpoint
+
+from mmdet3d.registry import MODELS
+from tools.misc.fuse_conv_bn import fuse_module
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--samples', default=2000, help='samples to benchmark')
+    parser.add_argument(
+        '--log-interval', default=50, help='interval of logging')
+    parser.add_argument(
+        '--amp',
+        action='store_true',
+        help='Whether to use automatic mixed precision inference')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase'
+        'the inference speed')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    init_default_scope('mmdet3d')
+
+    # build config and set cudnn_benchmark
+    cfg = Config.fromfile(args.config)
+
+    if cfg.env_cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    # build dataloader
+    dataloader = Runner.build_dataloader(cfg.test_dataloader)
+
+    # build model and load checkpoint
+    model = MODELS.build(cfg.model)
+    load_checkpoint(model, args.checkpoint, map_location='cpu')
+    if args.fuse_conv_bn:
+        model = fuse_module(model)
+    model.to(get_device())
+    model.eval()
+
+    # the first several iterations may be very slow so skip them
+    num_warmup = 5
+    pure_inf_time = 0
+
+    # benchmark with several samples and take the average
+    for i, data in enumerate(dataloader):
+
+        torch.cuda.synchronize()
+        start_time = time.perf_counter()
+
+        with autocast(enabled=args.amp):
+            model.test_step(data)
+
+        torch.cuda.synchronize()
+        elapsed = time.perf_counter() - start_time
+
+        if i >= num_warmup:
+            pure_inf_time += elapsed
+            if (i + 1) % args.log_interval == 0:
+                fps = (i + 1 - num_warmup) / pure_inf_time
+                print(f'Done sample [{i + 1:<3}/ {args.samples}], '
+                      f'fps: {fps:.1f} sample / s')
+
+        if (i + 1) == args.samples:
+            pure_inf_time += elapsed
+            fps = (i + 1 - num_warmup) / pure_inf_time
+            print(f'Overall fps: {fps:.1f} sample / s')
+            break
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/analysis_tools/benchmark_patched.py b/mmde/tools/analysis_tools/benchmark_patched.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc68e70ccc7be7025f199060fabb2625674628ad
--- /dev/null
+++ b/mmde/tools/analysis_tools/benchmark_patched.py
@@ -0,0 +1,96 @@
+import argparse
+import time
+import torch
+from mmengine import Config
+from mmengine.config import DictAction
+from mmengine.device import get_device
+from mmengine.registry import init_default_scope
+from mmengine.runner import Runner, autocast, load_checkpoint
+from mmdet3d.registry import MODELS
+from tools.misc.fuse_conv_bn import fuse_module
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--samples', default=2000, type=int, help='samples to benchmark')
+    parser.add_argument('--log-interval', default=10, type=int, help='interval of logging')
+    parser.add_argument('--amp', action='store_true', help='Use AMP')
+    parser.add_argument('--fuse-conv-bn', action='store_true', help='Fuse conv and bn')
+    # 补丁 1：增加对 cfg-options 的支持
+    parser.add_argument('--cfg-options', nargs='+', action=DictAction, help='override config')
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = parse_args()
+    init_default_scope('mmdet3d')
+    cfg = Config.fromfile(args.config)
+    
+    # 补丁 2：将命令行参数合并到配置中
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    if cfg.env_cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    dataloader = Runner.build_dataloader(cfg.test_dataloader)
+    model = MODELS.build(cfg.model)
+    load_checkpoint(model, args.checkpoint, map_location='cpu')
+    if args.fuse_conv_bn:
+        model = fuse_module(model)
+    model.to(get_device())
+    model.eval()
+
+    num_warmup = 5
+    pure_inf_time = 0
+    total_batches = len(dataloader)
+
+    for i, data in enumerate(dataloader):
+        # ================= [新增的打印 Shape 代码] =================
+        if i == 0:
+            print("\n" + "🔥" * 25)
+            print("🔍 [Debug] 当前输入模型的 Data Shape:")
+            if 'inputs' in data:
+                inputs = data['inputs']
+                # 1. 打印雷达点云的 shape
+                if 'points' in inputs:
+                    print(f"   👉 雷达点云 (points): 包含 {len(inputs['points'])} 个样本 (Batch Size)")
+                    for b_idx, pts in enumerate(inputs['points']):
+                        print(f"      - 样本 {b_idx} shape: {pts.shape}")
+                
+                # 2. 打印相机图像的 shape (如果跑的是多模态)
+                if 'img' in inputs:
+                    print(f"   👉 相机图像 (img): {inputs['img'][0].shape}")
+            print("🔥" * 25 + "\n")
+        # =========================================================
+        torch.cuda.synchronize()
+        start_time = time.perf_counter()
+
+        with autocast(enabled=args.amp):
+            model.test_step(data)
+
+        torch.cuda.synchronize()
+        elapsed = time.perf_counter() - start_time
+
+        if i >= num_warmup:
+            pure_inf_time += elapsed
+            if (i + 1) % args.log_interval == 0:
+                fps = (i + 1 - num_warmup) / pure_inf_time
+                print(f'Done batch [{i + 1:<3}/ {total_batches}], speed: {fps:.1f} batch / s')
+
+        # 补丁 3：只要读完当前数据集，立刻退出并打印，防止哑巴
+        if (i + 1) == args.samples or (i + 1) == total_batches:
+            break
+
+    if (i + 1) > num_warmup:
+        fps = (i + 1 - num_warmup) / pure_inf_time
+        bs = cfg.test_dataloader.get('batch_size', 1)
+        print(f'\n✅ 测试完成！')
+        print(f'🚀 纯模型前向速度 (Batch/s): {fps:.1f}')
+        print(f'🚀 真实吞吐量 (等效 task/s): {fps * bs:.1f} frames / s')
+    else:
+        print('数据量太少，连 Warmup 都没跑完。')
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/analysis_tools/get_flops.py b/mmde/tools/analysis_tools/get_flops.py
new file mode 100644
index 0000000000000000000000000000000000000000..19b524df2c5c90fae9b5ac22de174dbe22ba1727
--- /dev/null
+++ b/mmde/tools/analysis_tools/get_flops.py
@@ -0,0 +1,83 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import torch
+from mmengine import Config, DictAction
+from mmengine.registry import init_default_scope
+
+from mmdet3d.registry import MODELS
+
+try:
+    from mmcv.cnn import get_model_complexity_info
+except ImportError:
+    raise ImportError('Please upgrade mmcv to >0.6.2')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a detector')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--shape',
+        type=int,
+        nargs='+',
+        default=[40000, 4],
+        help='input point cloud size')
+    parser.add_argument(
+        '--modality',
+        type=str,
+        default='point',
+        choices=['point', 'image', 'multi'],
+        help='input data modality')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    if args.modality == 'point':
+        assert len(args.shape) == 2, 'invalid input shape'
+        input_shape = tuple(args.shape)
+    elif args.modality == 'image':
+        if len(args.shape) == 1:
+            input_shape = (3, args.shape[0], args.shape[0])
+        elif len(args.shape) == 2:
+            input_shape = (3, ) + tuple(args.shape)
+        else:
+            raise ValueError('invalid input shape')
+    elif args.modality == 'multi':
+        raise NotImplementedError(
+            'FLOPs counter is currently not supported for models with '
+            'multi-modality input')
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    init_default_scope(cfg.get('default_scope', 'mmdet3d'))
+
+    model = MODELS.build(cfg.model)
+    if torch.cuda.is_available():
+        model.cuda()
+    model.eval()
+
+    flops, params = get_model_complexity_info(model, input_shape)
+    split_line = '=' * 30
+    print(f'{split_line}\nInput shape: {input_shape}\n'
+          f'Flops: {flops}\nParams: {params}\n{split_line}')
+    print('!!!Please be cautious if you use the results in papers. '
+          'You may need to check if all ops are supported and verify that the '
+          'flops computation is correct.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/create_data.py b/mmde/tools/create_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8c6495d6ac6054129b4681fb030fdb3cf7625e8
--- /dev/null
+++ b/mmde/tools/create_data.py
@@ -0,0 +1,420 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from os import path as osp
+
+from mmengine import print_log
+
+from tools.dataset_converters import indoor_converter as indoor
+from tools.dataset_converters import kitti_converter as kitti
+from tools.dataset_converters import lyft_converter as lyft_converter
+from tools.dataset_converters import nuscenes_converter as nuscenes_converter
+from tools.dataset_converters import semantickitti_converter
+from tools.dataset_converters.create_gt_database import (
+    GTDatabaseCreater, create_groundtruth_database)
+from tools.dataset_converters.update_infos_to_v2 import update_pkl_infos
+
+
+def kitti_data_prep(root_path,
+                    info_prefix,
+                    version,
+                    out_dir,
+                    with_plane=False):
+    """Prepare data related to Kitti dataset.
+
+    Related data consists of '.pkl' files recording basic infos,
+    2D annotations and groundtruth database.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        version (str): Dataset version.
+        out_dir (str): Output directory of the groundtruth database info.
+        with_plane (bool, optional): Whether to use plane information.
+            Default: False.
+    """
+    kitti.create_kitti_info_file(root_path, info_prefix, with_plane)
+    kitti.create_reduced_point_cloud(root_path, info_prefix)
+
+    info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl')
+    info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl')
+    info_trainval_path = osp.join(out_dir, f'{info_prefix}_infos_trainval.pkl')
+    info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl')
+    update_pkl_infos('kitti', out_dir=out_dir, pkl_path=info_train_path)
+    update_pkl_infos('kitti', out_dir=out_dir, pkl_path=info_val_path)
+    update_pkl_infos('kitti', out_dir=out_dir, pkl_path=info_trainval_path)
+    update_pkl_infos('kitti', out_dir=out_dir, pkl_path=info_test_path)
+    create_groundtruth_database(
+        'KittiDataset',
+        root_path,
+        info_prefix,
+        f'{info_prefix}_infos_train.pkl',
+        relative_path=False,
+        mask_anno_path='instances_train.json',
+        with_mask=(version == 'mask'))
+
+
+def nuscenes_data_prep(root_path,
+                       info_prefix,
+                       version,
+                       dataset_name,
+                       out_dir,
+                       max_sweeps=10):
+    """Prepare data related to nuScenes dataset.
+
+    Related data consists of '.pkl' files recording basic infos,
+    2D annotations and groundtruth database.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        version (str): Dataset version.
+        dataset_name (str): The dataset class name.
+        out_dir (str): Output directory of the groundtruth database info.
+        max_sweeps (int, optional): Number of input consecutive frames.
+            Default: 10
+    """
+    nuscenes_converter.create_nuscenes_infos(
+        root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+    if version == 'v1.0-test':
+        info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl')
+        update_pkl_infos('nuscenes', out_dir=out_dir, pkl_path=info_test_path)
+        return
+
+    info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl')
+    info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl')
+    update_pkl_infos('nuscenes', out_dir=out_dir, pkl_path=info_train_path)
+    update_pkl_infos('nuscenes', out_dir=out_dir, pkl_path=info_val_path)
+    create_groundtruth_database(dataset_name, root_path, info_prefix,
+                                f'{info_prefix}_infos_train.pkl')
+
+
+def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10):
+    """Prepare data related to Lyft dataset.
+
+    Related data consists of '.pkl' files recording basic infos.
+    Although the ground truth database and 2D annotations are not used in
+    Lyft, it can also be generated like nuScenes.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        version (str): Dataset version.
+        max_sweeps (int, optional): Number of input consecutive frames.
+            Defaults to 10.
+    """
+    lyft_converter.create_lyft_infos(
+        root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+    if version == 'v1.01-test':
+        info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
+        update_pkl_infos('lyft', out_dir=root_path, pkl_path=info_test_path)
+    elif version == 'v1.01-train':
+        info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
+        info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl')
+        update_pkl_infos('lyft', out_dir=root_path, pkl_path=info_train_path)
+        update_pkl_infos('lyft', out_dir=root_path, pkl_path=info_val_path)
+
+
+def scannet_data_prep(root_path, info_prefix, out_dir, workers):
+    """Prepare the info file for scannet dataset.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+        workers (int): Number of threads to be used.
+    """
+    indoor.create_indoor_info_file(
+        root_path, info_prefix, out_dir, workers=workers)
+    info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl')
+    info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl')
+    info_test_path = osp.join(out_dir, f'{info_prefix}_infos_test.pkl')
+    update_pkl_infos('scannet', out_dir=out_dir, pkl_path=info_train_path)
+    update_pkl_infos('scannet', out_dir=out_dir, pkl_path=info_val_path)
+    update_pkl_infos('scannet', out_dir=out_dir, pkl_path=info_test_path)
+
+
+def s3dis_data_prep(root_path, info_prefix, out_dir, workers):
+    """Prepare the info file for s3dis dataset.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+        workers (int): Number of threads to be used.
+    """
+    indoor.create_indoor_info_file(
+        root_path, info_prefix, out_dir, workers=workers)
+    splits = [f'Area_{i}' for i in [1, 2, 3, 4, 5, 6]]
+    for split in splits:
+        filename = osp.join(out_dir, f'{info_prefix}_infos_{split}.pkl')
+        update_pkl_infos('s3dis', out_dir=out_dir, pkl_path=filename)
+
+
+def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers):
+    """Prepare the info file for sunrgbd dataset.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+        workers (int): Number of threads to be used.
+    """
+    indoor.create_indoor_info_file(
+        root_path, info_prefix, out_dir, workers=workers)
+    info_train_path = osp.join(out_dir, f'{info_prefix}_infos_train.pkl')
+    info_val_path = osp.join(out_dir, f'{info_prefix}_infos_val.pkl')
+    update_pkl_infos('sunrgbd', out_dir=out_dir, pkl_path=info_train_path)
+    update_pkl_infos('sunrgbd', out_dir=out_dir, pkl_path=info_val_path)
+
+
+def waymo_data_prep(root_path,
+                    info_prefix,
+                    version,
+                    out_dir,
+                    workers,
+                    max_sweeps=10,
+                    only_gt_database=False,
+                    save_senor_data=False,
+                    skip_cam_instances_infos=False):
+    """Prepare waymo dataset. There are 3 steps as follows:
+
+    Step 1. Extract camera images and lidar point clouds from waymo raw
+        data in '*.tfreord' and save as kitti format.
+    Step 2. Generate waymo train/val/test infos and save as pickle file.
+    Step 3. Generate waymo ground truth database (point clouds within
+        each 3D bounding box) for data augmentation in training.
+    Steps 1 and 2 will be done in Waymo2KITTI, and step 3 will be done in
+    GTDatabaseCreater.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+        workers (int): Number of threads to be used.
+        max_sweeps (int, optional): Number of input consecutive frames.
+            Default to 10. Here we store ego2global information of these
+            frames for later use.
+        only_gt_database (bool, optional): Whether to only generate ground
+            truth database. Default to False.
+        save_senor_data (bool, optional): Whether to skip saving
+            image and lidar. Default to False.
+        skip_cam_instances_infos (bool, optional): Whether to skip
+            gathering cam_instances infos in Step 2. Default to False.
+    """
+    from tools.dataset_converters import waymo_converter as waymo
+
+    if version == 'v1.4':
+        splits = [
+            'training', 'validation', 'testing',
+            'testing_3d_camera_only_detection'
+        ]
+    elif version == 'v1.4-mini':
+        splits = ['training', 'validation']
+    else:
+        raise NotImplementedError(f'Unsupported Waymo version {version}!')
+    out_dir = osp.join(out_dir, 'kitti_format')
+
+    if not only_gt_database:
+        for i, split in enumerate(splits):
+            load_dir = osp.join(root_path, 'waymo_format', split)
+            if split == 'validation':
+                save_dir = osp.join(out_dir, 'training')
+            else:
+                save_dir = osp.join(out_dir, split)
+            converter = waymo.Waymo2KITTI(
+                load_dir,
+                save_dir,
+                prefix=str(i),
+                workers=workers,
+                test_mode=(split
+                           in ['testing', 'testing_3d_camera_only_detection']),
+                info_prefix=info_prefix,
+                max_sweeps=max_sweeps,
+                split=split,
+                save_senor_data=save_senor_data,
+                save_cam_instances=not skip_cam_instances_infos)
+            converter.convert()
+            if split == 'validation':
+                converter.merge_trainval_infos()
+
+        from tools.dataset_converters.waymo_converter import \
+            create_ImageSets_img_ids
+        create_ImageSets_img_ids(out_dir, splits)
+
+    GTDatabaseCreater(
+        'WaymoDataset',
+        out_dir,
+        info_prefix,
+        f'{info_prefix}_infos_train.pkl',
+        relative_path=False,
+        with_mask=False,
+        num_worker=workers).create()
+
+    print_log('Successfully preparing Waymo Open Dataset')
+
+
+def semantickitti_data_prep(info_prefix, out_dir):
+    """Prepare the info file for SemanticKITTI dataset.
+
+    Args:
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+    """
+    semantickitti_converter.create_semantickitti_info_file(
+        info_prefix, out_dir)
+
+
+parser = argparse.ArgumentParser(description='Data converter arg parser')
+parser.add_argument('dataset', metavar='kitti', help='name of the dataset')
+parser.add_argument(
+    '--root-path',
+    type=str,
+    default='./data/kitti',
+    help='specify the root path of dataset')
+parser.add_argument(
+    '--version',
+    type=str,
+    default='v1.0',
+    required=False,
+    help='specify the dataset version, no need for kitti')
+parser.add_argument(
+    '--max-sweeps',
+    type=int,
+    default=10,
+    required=False,
+    help='specify sweeps of lidar per example')
+parser.add_argument(
+    '--with-plane',
+    action='store_true',
+    help='Whether to use plane information for kitti.')
+parser.add_argument(
+    '--out-dir',
+    type=str,
+    default='./data/kitti',
+    required=False,
+    help='name of info pkl')
+parser.add_argument('--extra-tag', type=str, default='kitti')
+parser.add_argument(
+    '--workers', type=int, default=4, help='number of threads to be used')
+parser.add_argument(
+    '--only-gt-database',
+    action='store_true',
+    help='''Whether to only generate ground truth database.
+        Only used when dataset is NuScenes or Waymo!''')
+parser.add_argument(
+    '--skip-cam_instances-infos',
+    action='store_true',
+    help='''Whether to skip gathering cam_instances infos.
+        Only used when dataset is Waymo!''')
+parser.add_argument(
+    '--skip-saving-sensor-data',
+    action='store_true',
+    help='''Whether to skip saving image and lidar.
+        Only used when dataset is Waymo!''')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    from mmengine.registry import init_default_scope
+    init_default_scope('mmdet3d')
+
+    if args.dataset == 'kitti':
+        if args.only_gt_database:
+            create_groundtruth_database(
+                'KittiDataset',
+                args.root_path,
+                args.extra_tag,
+                f'{args.extra_tag}_infos_train.pkl',
+                relative_path=False,
+                mask_anno_path='instances_train.json',
+                with_mask=(args.version == 'mask'))
+        else:
+            kitti_data_prep(
+                root_path=args.root_path,
+                info_prefix=args.extra_tag,
+                version=args.version,
+                out_dir=args.out_dir,
+                with_plane=args.with_plane)
+    elif args.dataset == 'nuscenes' and args.version != 'v1.0-mini':
+        if args.only_gt_database:
+            create_groundtruth_database('NuScenesDataset', args.root_path,
+                                        args.extra_tag,
+                                        f'{args.extra_tag}_infos_train.pkl')
+        else:
+            train_version = f'{args.version}-trainval'
+            nuscenes_data_prep(
+                root_path=args.root_path,
+                info_prefix=args.extra_tag,
+                version=train_version,
+                dataset_name='NuScenesDataset',
+                out_dir=args.out_dir,
+                max_sweeps=args.max_sweeps)
+            test_version = f'{args.version}-test'
+            nuscenes_data_prep(
+                root_path=args.root_path,
+                info_prefix=args.extra_tag,
+                version=test_version,
+                dataset_name='NuScenesDataset',
+                out_dir=args.out_dir,
+                max_sweeps=args.max_sweeps)
+    elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini':
+        if args.only_gt_database:
+            create_groundtruth_database('NuScenesDataset', args.root_path,
+                                        args.extra_tag,
+                                        f'{args.extra_tag}_infos_train.pkl')
+        else:
+            train_version = f'{args.version}'
+            nuscenes_data_prep(
+                root_path=args.root_path,
+                info_prefix=args.extra_tag,
+                version=train_version,
+                dataset_name='NuScenesDataset',
+                out_dir=args.out_dir,
+                max_sweeps=args.max_sweeps)
+    elif args.dataset == 'waymo':
+        waymo_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            version=args.version,
+            out_dir=args.out_dir,
+            workers=args.workers,
+            max_sweeps=args.max_sweeps,
+            only_gt_database=args.only_gt_database,
+            save_senor_data=not args.skip_saving_sensor_data,
+            skip_cam_instances_infos=args.skip_cam_instances_infos)
+    elif args.dataset == 'lyft':
+        train_version = f'{args.version}-train'
+        lyft_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            version=train_version,
+            max_sweeps=args.max_sweeps)
+        test_version = f'{args.version}-test'
+        lyft_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            version=test_version,
+            max_sweeps=args.max_sweeps)
+    elif args.dataset == 'scannet':
+        scannet_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            out_dir=args.out_dir,
+            workers=args.workers)
+    elif args.dataset == 's3dis':
+        s3dis_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            out_dir=args.out_dir,
+            workers=args.workers)
+    elif args.dataset == 'sunrgbd':
+        sunrgbd_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            out_dir=args.out_dir,
+            workers=args.workers)
+    elif args.dataset == 'semantickitti':
+        semantickitti_data_prep(
+            info_prefix=args.extra_tag, out_dir=args.out_dir)
+    else:
+        raise NotImplementedError(f'Don\'t support {args.dataset} dataset.')
diff --git a/mmde/tools/create_data.sh b/mmde/tools/create_data.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0a1946585d2ad44abc9f37e0131e9a968b64e87a
--- /dev/null
+++ b/mmde/tools/create_data.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+set -x
+export PYTHONPATH=`pwd`:$PYTHONPATH
+
+PARTITION=$1
+JOB_NAME=$2
+DATASET=$3
+WORKERS=$4
+GPUS=${GPUS:-1}
+GPUS_PER_NODE=${GPUS_PER_NODE:-1}
+SRUN_ARGS=${SRUN_ARGS:-""}
+PY_ARGS=${@:5}
+
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/create_data.py ${DATASET} \
+            --root-path ./data/${DATASET} \
+            --out-dir ./data/${DATASET} \
+            --workers ${WORKERS} \
+            --extra-tag ${DATASET} \
+            ${PY_ARGS}
diff --git a/mmde/tools/dataset_converters/create_gt_database.py b/mmde/tools/dataset_converters/create_gt_database.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb84256fd84ebb96d2381c9efe1129446847d13b
--- /dev/null
+++ b/mmde/tools/dataset_converters/create_gt_database.py
@@ -0,0 +1,646 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pickle
+from os import path as osp
+
+import mmcv
+import mmengine
+import numpy as np
+from mmcv.ops import roi_align
+from mmdet.evaluation import bbox_overlaps
+from mmengine import print_log, track_iter_progress
+from pycocotools import mask as maskUtils
+from pycocotools.coco import COCO
+
+from mmdet3d.registry import DATASETS
+from mmdet3d.structures.ops import box_np_ops as box_np_ops
+
+
+def _poly2mask(mask_ann, img_h, img_w):
+    if isinstance(mask_ann, list):
+        # polygon -- a single object might consist of multiple parts
+        # we merge all parts into one mask rle code
+        rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+        rle = maskUtils.merge(rles)
+    elif isinstance(mask_ann['counts'], list):
+        # uncompressed RLE
+        rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+    else:
+        # rle
+        rle = mask_ann
+    mask = maskUtils.decode(rle)
+    return mask
+
+
+def _parse_coco_ann_info(ann_info):
+    gt_bboxes = []
+    gt_labels = []
+    gt_bboxes_ignore = []
+    gt_masks_ann = []
+
+    for i, ann in enumerate(ann_info):
+        if ann.get('ignore', False):
+            continue
+        x1, y1, w, h = ann['bbox']
+        if ann['area'] <= 0:
+            continue
+        bbox = [x1, y1, x1 + w, y1 + h]
+        if ann.get('iscrowd', False):
+            gt_bboxes_ignore.append(bbox)
+        else:
+            gt_bboxes.append(bbox)
+            gt_masks_ann.append(ann['segmentation'])
+
+    if gt_bboxes:
+        gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+        gt_labels = np.array(gt_labels, dtype=np.int64)
+    else:
+        gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+        gt_labels = np.array([], dtype=np.int64)
+
+    if gt_bboxes_ignore:
+        gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+    else:
+        gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+    ann = dict(
+        bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann)
+
+    return ann
+
+
+def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks):
+    import torch
+    from torch.nn.modules.utils import _pair
+    device = pos_proposals.device
+    num_pos = pos_proposals.size(0)
+    fake_inds = (
+        torch.arange(num_pos,
+                     device=device).to(dtype=pos_proposals.dtype)[:, None])
+    rois = torch.cat([fake_inds, pos_proposals], dim=1)  # Nx5
+    mask_size = _pair(28)
+    rois = rois.to(device=device)
+    gt_masks_th = (
+        torch.from_numpy(gt_masks).to(device).index_select(
+            0, pos_assigned_gt_inds).to(dtype=rois.dtype))
+    # Use RoIAlign could apparently accelerate the training (~0.1s/iter)
+    targets = (
+        roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1))
+    return targets
+
+
+def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, org_img):
+    num_pos = pos_proposals.shape[0]
+    masks = []
+    img_patches = []
+    for i in range(num_pos):
+        gt_mask = gt_masks[pos_assigned_gt_inds[i]]
+        bbox = pos_proposals[i, :].astype(np.int32)
+        x1, y1, x2, y2 = bbox
+        w = np.maximum(x2 - x1 + 1, 1)
+        h = np.maximum(y2 - y1 + 1, 1)
+
+        mask_patch = gt_mask[y1:y1 + h, x1:x1 + w]
+        masked_img = gt_mask[..., None] * org_img
+        img_patch = masked_img[y1:y1 + h, x1:x1 + w]
+
+        img_patches.append(img_patch)
+        masks.append(mask_patch)
+    return img_patches, masks
+
+
+def create_groundtruth_database(dataset_class_name,
+                                data_path,
+                                info_prefix,
+                                info_path=None,
+                                mask_anno_path=None,
+                                used_classes=None,
+                                database_save_path=None,
+                                db_info_save_path=None,
+                                relative_path=True,
+                                add_rgb=False,
+                                lidar_only=False,
+                                bev_only=False,
+                                coors_range=None,
+                                with_mask=False):
+    """Given the raw data, generate the ground truth database.
+
+    Args:
+        dataset_class_name (str): Name of the input dataset.
+        data_path (str): Path of the data.
+        info_prefix (str): Prefix of the info file.
+        info_path (str, optional): Path of the info file.
+            Default: None.
+        mask_anno_path (str, optional): Path of the mask_anno.
+            Default: None.
+        used_classes (list[str], optional): Classes have been used.
+            Default: None.
+        database_save_path (str, optional): Path to save database.
+            Default: None.
+        db_info_save_path (str, optional): Path to save db_info.
+            Default: None.
+        relative_path (bool, optional): Whether to use relative path.
+            Default: True.
+        with_mask (bool, optional): Whether to use mask.
+            Default: False.
+    """
+    print(f'Create GT Database of {dataset_class_name}')
+    dataset_cfg = dict(
+        type=dataset_class_name, data_root=data_path, ann_file=info_path)
+    if dataset_class_name == 'KittiDataset':
+        backend_args = None
+        dataset_cfg.update(
+            modality=dict(
+                use_lidar=True,
+                use_camera=with_mask,
+            ),
+            data_prefix=dict(
+                pts='training/velodyne_reduced', img='training/image_2'),
+            pipeline=[
+                dict(
+                    type='LoadPointsFromFile',
+                    coord_type='LIDAR',
+                    load_dim=4,
+                    use_dim=4,
+                    backend_args=backend_args),
+                dict(
+                    type='LoadAnnotations3D',
+                    with_bbox_3d=True,
+                    with_label_3d=True,
+                    backend_args=backend_args)
+            ])
+
+    elif dataset_class_name == 'NuScenesDataset':
+        dataset_cfg.update(
+            use_valid_flag=True,
+            data_prefix=dict(
+                pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP'),
+            pipeline=[
+                dict(
+                    type='LoadPointsFromFile',
+                    coord_type='LIDAR',
+                    load_dim=5,
+                    use_dim=5),
+                dict(
+                    type='LoadPointsFromMultiSweeps',
+                    sweeps_num=10,
+                    use_dim=[0, 1, 2, 3, 4],
+                    pad_empty_sweeps=True,
+                    remove_close=True),
+                dict(
+                    type='LoadAnnotations3D',
+                    with_bbox_3d=True,
+                    with_label_3d=True)
+            ])
+
+    elif dataset_class_name == 'WaymoDataset':
+        backend_args = None
+        dataset_cfg.update(
+            test_mode=False,
+            data_prefix=dict(
+                pts='training/velodyne', img='', sweeps='training/velodyne'),
+            modality=dict(
+                use_lidar=True,
+                use_depth=False,
+                use_lidar_intensity=True,
+                use_camera=False,
+            ),
+            pipeline=[
+                dict(
+                    type='LoadPointsFromFile',
+                    coord_type='LIDAR',
+                    load_dim=6,
+                    use_dim=6,
+                    backend_args=backend_args),
+                dict(
+                    type='LoadAnnotations3D',
+                    with_bbox_3d=True,
+                    with_label_3d=True,
+                    backend_args=backend_args)
+            ])
+
+    dataset = DATASETS.build(dataset_cfg)
+
+    if database_save_path is None:
+        database_save_path = osp.join(data_path, f'{info_prefix}_gt_database')
+    if db_info_save_path is None:
+        db_info_save_path = osp.join(data_path,
+                                     f'{info_prefix}_dbinfos_train.pkl')
+    mmengine.mkdir_or_exist(database_save_path)
+    all_db_infos = dict()
+    if with_mask:
+        coco = COCO(osp.join(data_path, mask_anno_path))
+        imgIds = coco.getImgIds()
+        file2id = dict()
+        for i in imgIds:
+            info = coco.loadImgs([i])[0]
+            file2id.update({info['file_name']: i})
+
+    group_counter = 0
+    for j in track_iter_progress(list(range(len(dataset)))):
+        data_info = dataset.get_data_info(j)
+        example = dataset.pipeline(data_info)
+        annos = example['ann_info']
+        image_idx = example['sample_idx']
+        points = example['points'].numpy()
+        gt_boxes_3d = annos['gt_bboxes_3d'].numpy()
+        names = [dataset.metainfo['classes'][i] for i in annos['gt_labels_3d']]
+        group_dict = dict()
+        if 'group_ids' in annos:
+            group_ids = annos['group_ids']
+        else:
+            group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64)
+        difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32)
+        if 'difficulty' in annos:
+            difficulty = annos['difficulty']
+
+        num_obj = gt_boxes_3d.shape[0]
+        point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d)
+
+        if with_mask:
+            # prepare masks
+            gt_boxes = annos['gt_bboxes']
+            img_path = osp.split(example['img_info']['filename'])[-1]
+            if img_path not in file2id.keys():
+                print(f'skip image {img_path} for empty mask')
+                continue
+            img_id = file2id[img_path]
+            kins_annIds = coco.getAnnIds(imgIds=img_id)
+            kins_raw_info = coco.loadAnns(kins_annIds)
+            kins_ann_info = _parse_coco_ann_info(kins_raw_info)
+            h, w = annos['img_shape'][:2]
+            gt_masks = [
+                _poly2mask(mask, h, w) for mask in kins_ann_info['masks']
+            ]
+            # get mask inds based on iou mapping
+            bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes)
+            mask_inds = bbox_iou.argmax(axis=0)
+            valid_inds = (bbox_iou.max(axis=0) > 0.5)
+
+            # mask the image
+            # use more precise crop when it is ready
+            # object_img_patches = np.ascontiguousarray(
+            #     np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2))
+            # crop image patches using roi_align
+            # object_img_patches = crop_image_patch_v2(
+            #     torch.Tensor(gt_boxes),
+            #     torch.Tensor(mask_inds).long(), object_img_patches)
+            object_img_patches, object_masks = crop_image_patch(
+                gt_boxes, gt_masks, mask_inds, annos['img'])
+
+        for i in range(num_obj):
+            filename = f'{image_idx}_{names[i]}_{i}.bin'
+            abs_filepath = osp.join(database_save_path, filename)
+            rel_filepath = osp.join(f'{info_prefix}_gt_database', filename)
+
+            # save point clouds and image patches for each object
+            gt_points = points[point_indices[:, i]]
+            gt_points[:, :3] -= gt_boxes_3d[i, :3]
+
+            if with_mask:
+                if object_masks[i].sum() == 0 or not valid_inds[i]:
+                    # Skip object for empty or invalid mask
+                    continue
+                img_patch_path = abs_filepath + '.png'
+                mask_patch_path = abs_filepath + '.mask.png'
+                mmcv.imwrite(object_img_patches[i], img_patch_path)
+                mmcv.imwrite(object_masks[i], mask_patch_path)
+
+            with open(abs_filepath, 'w') as f:
+                gt_points.tofile(f)
+
+            if (used_classes is None) or names[i] in used_classes:
+                db_info = {
+                    'name': names[i],
+                    'path': rel_filepath,
+                    'image_idx': image_idx,
+                    'gt_idx': i,
+                    'box3d_lidar': gt_boxes_3d[i],
+                    'num_points_in_gt': gt_points.shape[0],
+                    'difficulty': difficulty[i],
+                }
+                local_group_id = group_ids[i]
+                # if local_group_id >= 0:
+                if local_group_id not in group_dict:
+                    group_dict[local_group_id] = group_counter
+                    group_counter += 1
+                db_info['group_id'] = group_dict[local_group_id]
+                if 'score' in annos:
+                    db_info['score'] = annos['score'][i]
+                if with_mask:
+                    db_info.update({'box2d_camera': gt_boxes[i]})
+                if names[i] in all_db_infos:
+                    all_db_infos[names[i]].append(db_info)
+                else:
+                    all_db_infos[names[i]] = [db_info]
+
+    for k, v in all_db_infos.items():
+        print(f'load {len(v)} {k} database infos')
+
+    with open(db_info_save_path, 'wb') as f:
+        pickle.dump(all_db_infos, f)
+
+
+class GTDatabaseCreater:
+    """Given the raw data, generate the ground truth database. This is the
+    parallel version. For serialized version, please refer to
+    `create_groundtruth_database`
+
+    Args:
+        dataset_class_name (str): Name of the input dataset.
+        data_path (str): Path of the data.
+        info_prefix (str): Prefix of the info file.
+        info_path (str, optional): Path of the info file.
+            Default: None.
+        mask_anno_path (str, optional): Path of the mask_anno.
+            Default: None.
+        used_classes (list[str], optional): Classes have been used.
+            Default: None.
+        database_save_path (str, optional): Path to save database.
+            Default: None.
+        db_info_save_path (str, optional): Path to save db_info.
+            Default: None.
+        relative_path (bool, optional): Whether to use relative path.
+            Default: True.
+        with_mask (bool, optional): Whether to use mask.
+            Default: False.
+        num_worker (int, optional): the number of parallel workers to use.
+            Default: 8.
+    """
+
+    def __init__(self,
+                 dataset_class_name,
+                 data_path,
+                 info_prefix,
+                 info_path=None,
+                 mask_anno_path=None,
+                 used_classes=None,
+                 database_save_path=None,
+                 db_info_save_path=None,
+                 relative_path=True,
+                 add_rgb=False,
+                 lidar_only=False,
+                 bev_only=False,
+                 coors_range=None,
+                 with_mask=False,
+                 num_worker=8) -> None:
+        self.dataset_class_name = dataset_class_name
+        self.data_path = data_path
+        self.info_prefix = info_prefix
+        self.info_path = info_path
+        self.mask_anno_path = mask_anno_path
+        self.used_classes = used_classes
+        self.database_save_path = database_save_path
+        self.db_info_save_path = db_info_save_path
+        self.relative_path = relative_path
+        self.add_rgb = add_rgb
+        self.lidar_only = lidar_only
+        self.bev_only = bev_only
+        self.coors_range = coors_range
+        self.with_mask = with_mask
+        self.num_worker = num_worker
+        self.pipeline = None
+
+    def create_single(self, input_dict):
+        group_counter = 0
+        single_db_infos = dict()
+        example = self.pipeline(input_dict)
+        annos = example['ann_info']
+        image_idx = example['sample_idx']
+        points = example['points'].numpy()
+        gt_boxes_3d = annos['gt_bboxes_3d'].numpy()
+        names = [
+            self.dataset.metainfo['classes'][i] for i in annos['gt_labels_3d']
+        ]
+        group_dict = dict()
+        if 'group_ids' in annos:
+            group_ids = annos['group_ids']
+        else:
+            group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64)
+        difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32)
+        if 'difficulty' in annos:
+            difficulty = annos['difficulty']
+
+        num_obj = gt_boxes_3d.shape[0]
+        point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d)
+
+        if self.with_mask:
+            # prepare masks
+            gt_boxes = annos['gt_bboxes']
+            img_path = osp.split(example['img_info']['filename'])[-1]
+            if img_path not in self.file2id.keys():
+                print(f'skip image {img_path} for empty mask')
+                return single_db_infos
+            img_id = self.file2id[img_path]
+            kins_annIds = self.coco.getAnnIds(imgIds=img_id)
+            kins_raw_info = self.coco.loadAnns(kins_annIds)
+            kins_ann_info = _parse_coco_ann_info(kins_raw_info)
+            h, w = annos['img_shape'][:2]
+            gt_masks = [
+                _poly2mask(mask, h, w) for mask in kins_ann_info['masks']
+            ]
+            # get mask inds based on iou mapping
+            bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes)
+            mask_inds = bbox_iou.argmax(axis=0)
+            valid_inds = (bbox_iou.max(axis=0) > 0.5)
+
+            # mask the image
+            # use more precise crop when it is ready
+            # object_img_patches = np.ascontiguousarray(
+            #     np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2))
+            # crop image patches using roi_align
+            # object_img_patches = crop_image_patch_v2(
+            #     torch.Tensor(gt_boxes),
+            #     torch.Tensor(mask_inds).long(), object_img_patches)
+            object_img_patches, object_masks = crop_image_patch(
+                gt_boxes, gt_masks, mask_inds, annos['img'])
+
+        for i in range(num_obj):
+            filename = f'{image_idx}_{names[i]}_{i}.bin'
+            abs_filepath = osp.join(self.database_save_path, filename)
+            rel_filepath = osp.join(f'{self.info_prefix}_gt_database',
+                                    filename)
+
+            # save point clouds and image patches for each object
+            gt_points = points[point_indices[:, i]]
+            gt_points[:, :3] -= gt_boxes_3d[i, :3]
+
+            if self.with_mask:
+                if object_masks[i].sum() == 0 or not valid_inds[i]:
+                    # Skip object for empty or invalid mask
+                    continue
+                img_patch_path = abs_filepath + '.png'
+                mask_patch_path = abs_filepath + '.mask.png'
+                mmcv.imwrite(object_img_patches[i], img_patch_path)
+                mmcv.imwrite(object_masks[i], mask_patch_path)
+
+            with open(abs_filepath, 'w') as f:
+                gt_points.tofile(f)
+
+            if (self.used_classes is None) or names[i] in self.used_classes:
+                db_info = {
+                    'name': names[i],
+                    'path': rel_filepath,
+                    'image_idx': image_idx,
+                    'gt_idx': i,
+                    'box3d_lidar': gt_boxes_3d[i],
+                    'num_points_in_gt': gt_points.shape[0],
+                    'difficulty': difficulty[i],
+                }
+                local_group_id = group_ids[i]
+                # if local_group_id >= 0:
+                if local_group_id not in group_dict:
+                    group_dict[local_group_id] = group_counter
+                    group_counter += 1
+                db_info['group_id'] = group_dict[local_group_id]
+                if 'score' in annos:
+                    db_info['score'] = annos['score'][i]
+                if self.with_mask:
+                    db_info.update({'box2d_camera': gt_boxes[i]})
+                if names[i] in single_db_infos:
+                    single_db_infos[names[i]].append(db_info)
+                else:
+                    single_db_infos[names[i]] = [db_info]
+
+        return single_db_infos
+
+    def create(self):
+        print_log(
+            f'Create GT Database of {self.dataset_class_name}',
+            logger='current')
+        dataset_cfg = dict(
+            type=self.dataset_class_name,
+            data_root=self.data_path,
+            ann_file=self.info_path)
+        if self.dataset_class_name == 'KittiDataset':
+            backend_args = None
+            dataset_cfg.update(
+                test_mode=False,
+                data_prefix=dict(
+                    pts='training/velodyne_reduced', img='training/image_2'),
+                modality=dict(
+                    use_lidar=True,
+                    use_depth=False,
+                    use_lidar_intensity=True,
+                    use_camera=self.with_mask,
+                ),
+                pipeline=[
+                    dict(
+                        type='LoadPointsFromFile',
+                        coord_type='LIDAR',
+                        load_dim=4,
+                        use_dim=4,
+                        backend_args=backend_args),
+                    dict(
+                        type='LoadAnnotations3D',
+                        with_bbox_3d=True,
+                        with_label_3d=True,
+                        backend_args=backend_args)
+                ])
+
+        elif self.dataset_class_name == 'NuScenesDataset':
+            dataset_cfg.update(
+                use_valid_flag=True,
+                data_prefix=dict(
+                    pts='samples/LIDAR_TOP', img='',
+                    sweeps='sweeps/LIDAR_TOP'),
+                pipeline=[
+                    dict(
+                        type='LoadPointsFromFile',
+                        coord_type='LIDAR',
+                        load_dim=5,
+                        use_dim=5),
+                    dict(
+                        type='LoadPointsFromMultiSweeps',
+                        sweeps_num=10,
+                        use_dim=[0, 1, 2, 3, 4],
+                        pad_empty_sweeps=True,
+                        remove_close=True),
+                    dict(
+                        type='LoadAnnotations3D',
+                        with_bbox_3d=True,
+                        with_label_3d=True)
+                ])
+
+        elif self.dataset_class_name == 'WaymoDataset':
+            backend_args = None
+            dataset_cfg.update(
+                test_mode=False,
+                data_prefix=dict(
+                    pts='training/velodyne',
+                    img='',
+                    sweeps='training/velodyne'),
+                modality=dict(
+                    use_lidar=True,
+                    use_depth=False,
+                    use_lidar_intensity=True,
+                    use_camera=False,
+                ),
+                pipeline=[
+                    dict(
+                        type='LoadPointsFromFile',
+                        coord_type='LIDAR',
+                        load_dim=6,
+                        use_dim=6,
+                        backend_args=backend_args),
+                    dict(
+                        type='LoadAnnotations3D',
+                        with_bbox_3d=True,
+                        with_label_3d=True,
+                        backend_args=backend_args)
+                ])
+
+        self.dataset = DATASETS.build(dataset_cfg)
+        self.pipeline = self.dataset.pipeline
+        if self.database_save_path is None:
+            self.database_save_path = osp.join(
+                self.data_path, f'{self.info_prefix}_gt_database')
+        if self.db_info_save_path is None:
+            self.db_info_save_path = osp.join(
+                self.data_path, f'{self.info_prefix}_dbinfos_train.pkl')
+        mmengine.mkdir_or_exist(self.database_save_path)
+        if self.with_mask:
+            self.coco = COCO(osp.join(self.data_path, self.mask_anno_path))
+            imgIds = self.coco.getImgIds()
+            self.file2id = dict()
+            for i in imgIds:
+                info = self.coco.loadImgs([i])[0]
+                self.file2id.update({info['file_name']: i})
+
+        def loop_dataset(i):
+            input_dict = self.dataset.get_data_info(i)
+            input_dict['box_type_3d'] = self.dataset.box_type_3d
+            input_dict['box_mode_3d'] = self.dataset.box_mode_3d
+            return input_dict
+
+        if self.num_worker == 0:
+            multi_db_infos = mmengine.track_progress(
+                self.create_single,
+                ((loop_dataset(i)
+                  for i in range(len(self.dataset))), len(self.dataset)))
+        else:
+            multi_db_infos = mmengine.track_parallel_progress(
+                self.create_single,
+                ((loop_dataset(i)
+                  for i in range(len(self.dataset))), len(self.dataset)),
+                self.num_worker,
+                chunksize=1000)
+        print_log('Make global unique group id', logger='current')
+        group_counter_offset = 0
+        all_db_infos = dict()
+        for single_db_infos in track_iter_progress(multi_db_infos):
+            group_id = -1
+            for name, name_db_infos in single_db_infos.items():
+                for db_info in name_db_infos:
+                    group_id = max(group_id, db_info['group_id'])
+                    db_info['group_id'] += group_counter_offset
+                if name not in all_db_infos:
+                    all_db_infos[name] = []
+                all_db_infos[name].extend(name_db_infos)
+            group_counter_offset += (group_id + 1)
+
+        for k, v in all_db_infos.items():
+            print_log(f'load {len(v)} {k} database infos', logger='current')
+
+        print_log(f'Saving GT database infos into {self.db_info_save_path}')
+        with open(self.db_info_save_path, 'wb') as f:
+            pickle.dump(all_db_infos, f)
diff --git a/mmde/tools/dataset_converters/indoor_converter.py b/mmde/tools/dataset_converters/indoor_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..90922856c0911bf24c0f1ff70e1c26d4417bc647
--- /dev/null
+++ b/mmde/tools/dataset_converters/indoor_converter.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+
+import mmengine
+import numpy as np
+
+from tools.dataset_converters.s3dis_data_utils import S3DISData, S3DISSegData
+from tools.dataset_converters.scannet_data_utils import (ScanNetData,
+                                                         ScanNetSegData)
+from tools.dataset_converters.sunrgbd_data_utils import SUNRGBDData
+
+
+def create_indoor_info_file(data_path,
+                            pkl_prefix='sunrgbd',
+                            save_path=None,
+                            use_v1=False,
+                            workers=4):
+    """Create indoor information file.
+
+    Get information of the raw data and save it to the pkl file.
+
+    Args:
+        data_path (str): Path of the data.
+        pkl_prefix (str, optional): Prefix of the pkl to be saved.
+            Default: 'sunrgbd'.
+        save_path (str, optional): Path of the pkl to be saved. Default: None.
+        use_v1 (bool, optional): Whether to use v1. Default: False.
+        workers (int, optional): Number of threads to be used. Default: 4.
+    """
+    assert os.path.exists(data_path)
+    assert pkl_prefix in ['sunrgbd', 'scannet', 's3dis'], \
+        f'unsupported indoor dataset {pkl_prefix}'
+    save_path = data_path if save_path is None else save_path
+    assert os.path.exists(save_path)
+
+    # generate infos for both detection and segmentation task
+    if pkl_prefix in ['sunrgbd', 'scannet']:
+        train_filename = os.path.join(save_path,
+                                      f'{pkl_prefix}_infos_train.pkl')
+        val_filename = os.path.join(save_path, f'{pkl_prefix}_infos_val.pkl')
+        if pkl_prefix == 'sunrgbd':
+            # SUN RGB-D has a train-val split
+            train_dataset = SUNRGBDData(
+                root_path=data_path, split='train', use_v1=use_v1)
+            val_dataset = SUNRGBDData(
+                root_path=data_path, split='val', use_v1=use_v1)
+        else:
+            # ScanNet has a train-val-test split
+            train_dataset = ScanNetData(root_path=data_path, split='train')
+            val_dataset = ScanNetData(root_path=data_path, split='val')
+            test_dataset = ScanNetData(root_path=data_path, split='test')
+            test_filename = os.path.join(save_path,
+                                         f'{pkl_prefix}_infos_test.pkl')
+
+        infos_train = train_dataset.get_infos(
+            num_workers=workers, has_label=True)
+        mmengine.dump(infos_train, train_filename, 'pkl')
+        print(f'{pkl_prefix} info train file is saved to {train_filename}')
+
+        infos_val = val_dataset.get_infos(num_workers=workers, has_label=True)
+        mmengine.dump(infos_val, val_filename, 'pkl')
+        print(f'{pkl_prefix} info val file is saved to {val_filename}')
+
+    if pkl_prefix == 'scannet':
+        infos_test = test_dataset.get_infos(
+            num_workers=workers, has_label=False)
+        mmengine.dump(infos_test, test_filename, 'pkl')
+        print(f'{pkl_prefix} info test file is saved to {test_filename}')
+
+    # generate infos for the semantic segmentation task
+    # e.g. re-sampled scene indexes and label weights
+    # scene indexes are used to re-sample rooms with different number of points
+    # label weights are used to balance classes with different number of points
+    if pkl_prefix == 'scannet':
+        # label weight computation function is adopted from
+        # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
+        train_dataset = ScanNetSegData(
+            data_root=data_path,
+            ann_file=train_filename,
+            split='train',
+            num_points=8192,
+            label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
+        # TODO: do we need to generate on val set?
+        val_dataset = ScanNetSegData(
+            data_root=data_path,
+            ann_file=val_filename,
+            split='val',
+            num_points=8192,
+            label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
+        # no need to generate for test set
+        train_dataset.get_seg_infos()
+        val_dataset.get_seg_infos()
+    elif pkl_prefix == 's3dis':
+        # S3DIS doesn't have a fixed train-val split
+        # it has 6 areas instead, so we generate info file for each of them
+        # in training, we will use dataset to wrap different areas
+        splits = [f'Area_{i}' for i in [1, 2, 3, 4, 5, 6]]
+        for split in splits:
+            dataset = S3DISData(root_path=data_path, split=split)
+            info = dataset.get_infos(num_workers=workers, has_label=True)
+            filename = os.path.join(save_path,
+                                    f'{pkl_prefix}_infos_{split}.pkl')
+            mmengine.dump(info, filename, 'pkl')
+            print(f'{pkl_prefix} info {split} file is saved to {filename}')
+            seg_dataset = S3DISSegData(
+                data_root=data_path,
+                ann_file=filename,
+                split=split,
+                num_points=4096,
+                label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
+            seg_dataset.get_seg_infos()
diff --git a/mmde/tools/dataset_converters/kitti_converter.py b/mmde/tools/dataset_converters/kitti_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..367cfd7ba9ac91f63d4462409d0e1df2eccdf20e
--- /dev/null
+++ b/mmde/tools/dataset_converters/kitti_converter.py
@@ -0,0 +1,626 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+from pathlib import Path
+
+import mmcv
+import mmengine
+import numpy as np
+from nuscenes.utils.geometry_utils import view_points
+
+from mmdet3d.structures import points_cam2img
+from mmdet3d.structures.ops import box_np_ops
+from .kitti_data_utils import WaymoInfoGatherer, get_kitti_image_info
+from .nuscenes_converter import post_process_coords
+
+kitti_categories = ('Pedestrian', 'Cyclist', 'Car')
+
+
+def convert_to_kitti_info_version2(info):
+    """convert kitti info v1 to v2 if possible.
+
+    Args:
+        info (dict): Info of the input kitti data.
+            - image (dict): image info
+            - calib (dict): calibration info
+            - point_cloud (dict): point cloud info
+    """
+    if 'image' not in info or 'calib' not in info or 'point_cloud' not in info:
+        info['image'] = {
+            'image_shape': info['img_shape'],
+            'image_idx': info['image_idx'],
+            'image_path': info['img_path'],
+        }
+        info['calib'] = {
+            'R0_rect': info['calib/R0_rect'],
+            'Tr_velo_to_cam': info['calib/Tr_velo_to_cam'],
+            'P2': info['calib/P2'],
+        }
+        info['point_cloud'] = {
+            'velodyne_path': info['velodyne_path'],
+        }
+
+
+def _read_imageset_file(path):
+    with open(path, 'r') as f:
+        lines = f.readlines()
+    return [int(line) for line in lines]
+
+
+class _NumPointsInGTCalculater:
+    """Calculate the number of points inside the ground truth box. This is the
+    parallel version. For the serialized version, please refer to
+    `_calculate_num_points_in_gt`.
+
+    Args:
+        data_path (str): Path of the data.
+        relative_path (bool): Whether to use relative path.
+        remove_outside (bool, optional): Whether to remove points which are
+            outside of image. Default: True.
+        num_features (int, optional): Number of features per point.
+            Default: False.
+        num_worker (int, optional): the number of parallel workers to use.
+            Default: 8.
+    """
+
+    def __init__(self,
+                 data_path,
+                 relative_path,
+                 remove_outside=True,
+                 num_features=4,
+                 num_worker=8) -> None:
+        self.data_path = data_path
+        self.relative_path = relative_path
+        self.remove_outside = remove_outside
+        self.num_features = num_features
+        self.num_worker = num_worker
+
+    def calculate_single(self, info):
+        pc_info = info['point_cloud']
+        image_info = info['image']
+        calib = info['calib']
+        if self.relative_path:
+            v_path = str(Path(self.data_path) / pc_info['velodyne_path'])
+        else:
+            v_path = pc_info['velodyne_path']
+        points_v = np.fromfile(
+            v_path, dtype=np.float32,
+            count=-1).reshape([-1, self.num_features])
+        rect = calib['R0_rect']
+        Trv2c = calib['Tr_velo_to_cam']
+        P2 = calib['P2']
+        if self.remove_outside:
+            points_v = box_np_ops.remove_outside_points(
+                points_v, rect, Trv2c, P2, image_info['image_shape'])
+        annos = info['annos']
+        num_obj = len([n for n in annos['name'] if n != 'DontCare'])
+        dims = annos['dimensions'][:num_obj]
+        loc = annos['location'][:num_obj]
+        rots = annos['rotation_y'][:num_obj]
+        gt_boxes_camera = np.concatenate([loc, dims, rots[..., np.newaxis]],
+                                         axis=1)
+        gt_boxes_lidar = box_np_ops.box_camera_to_lidar(
+            gt_boxes_camera, rect, Trv2c)
+        indices = box_np_ops.points_in_rbbox(points_v[:, :3], gt_boxes_lidar)
+        num_points_in_gt = indices.sum(0)
+        num_ignored = len(annos['dimensions']) - num_obj
+        num_points_in_gt = np.concatenate(
+            [num_points_in_gt, -np.ones([num_ignored])])
+        annos['num_points_in_gt'] = num_points_in_gt.astype(np.int32)
+        return info
+
+    def calculate(self, infos):
+        ret_infos = mmengine.track_parallel_progress(self.calculate_single,
+                                                     infos, self.num_worker)
+        for i, ret_info in enumerate(ret_infos):
+            infos[i] = ret_info
+
+
+def _calculate_num_points_in_gt(data_path,
+                                infos,
+                                relative_path,
+                                remove_outside=True,
+                                num_features=4):
+    for info in mmengine.track_iter_progress(infos):
+        pc_info = info['point_cloud']
+        image_info = info['image']
+        calib = info['calib']
+        if relative_path:
+            v_path = str(Path(data_path) / pc_info['velodyne_path'])
+        else:
+            v_path = pc_info['velodyne_path']
+        points_v = np.fromfile(
+            v_path, dtype=np.float32, count=-1).reshape([-1, num_features])
+        rect = calib['R0_rect']
+        Trv2c = calib['Tr_velo_to_cam']
+        P2 = calib['P2']
+        if remove_outside:
+            points_v = box_np_ops.remove_outside_points(
+                points_v, rect, Trv2c, P2, image_info['image_shape'])
+
+        # points_v = points_v[points_v[:, 0] > 0]
+        annos = info['annos']
+        num_obj = len([n for n in annos['name'] if n != 'DontCare'])
+        # annos = kitti.filter_kitti_anno(annos, ['DontCare'])
+        dims = annos['dimensions'][:num_obj]
+        loc = annos['location'][:num_obj]
+        rots = annos['rotation_y'][:num_obj]
+        gt_boxes_camera = np.concatenate([loc, dims, rots[..., np.newaxis]],
+                                         axis=1)
+        gt_boxes_lidar = box_np_ops.box_camera_to_lidar(
+            gt_boxes_camera, rect, Trv2c)
+        indices = box_np_ops.points_in_rbbox(points_v[:, :3], gt_boxes_lidar)
+        num_points_in_gt = indices.sum(0)
+        num_ignored = len(annos['dimensions']) - num_obj
+        num_points_in_gt = np.concatenate(
+            [num_points_in_gt, -np.ones([num_ignored])])
+        annos['num_points_in_gt'] = num_points_in_gt.astype(np.int32)
+
+
+def create_kitti_info_file(data_path,
+                           pkl_prefix='kitti',
+                           with_plane=False,
+                           save_path=None,
+                           relative_path=True):
+    """Create info file of KITTI dataset.
+
+    Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        data_path (str): Path of the data root.
+        pkl_prefix (str, optional): Prefix of the info file to be generated.
+            Default: 'kitti'.
+        with_plane (bool, optional): Whether to use plane information.
+            Default: False.
+        save_path (str, optional): Path to save the info file.
+            Default: None.
+        relative_path (bool, optional): Whether to use relative path.
+            Default: True.
+    """
+    imageset_folder = Path(data_path) / 'ImageSets'
+    train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt'))
+    val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt'))
+    test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt'))
+
+    print('Generate info. this may take several minutes.')
+    if save_path is None:
+        save_path = Path(data_path)
+    else:
+        save_path = Path(save_path)
+    kitti_infos_train = get_kitti_image_info(
+        data_path,
+        training=True,
+        velodyne=True,
+        calib=True,
+        with_plane=with_plane,
+        image_ids=train_img_ids,
+        relative_path=relative_path)
+    _calculate_num_points_in_gt(data_path, kitti_infos_train, relative_path)
+    filename = save_path / f'{pkl_prefix}_infos_train.pkl'
+    print(f'Kitti info train file is saved to {filename}')
+    mmengine.dump(kitti_infos_train, filename)
+    kitti_infos_val = get_kitti_image_info(
+        data_path,
+        training=True,
+        velodyne=True,
+        calib=True,
+        with_plane=with_plane,
+        image_ids=val_img_ids,
+        relative_path=relative_path)
+    _calculate_num_points_in_gt(data_path, kitti_infos_val, relative_path)
+    filename = save_path / f'{pkl_prefix}_infos_val.pkl'
+    print(f'Kitti info val file is saved to {filename}')
+    mmengine.dump(kitti_infos_val, filename)
+    filename = save_path / f'{pkl_prefix}_infos_trainval.pkl'
+    print(f'Kitti info trainval file is saved to {filename}')
+    mmengine.dump(kitti_infos_train + kitti_infos_val, filename)
+
+    kitti_infos_test = get_kitti_image_info(
+        data_path,
+        training=False,
+        label_info=False,
+        velodyne=True,
+        calib=True,
+        with_plane=False,
+        image_ids=test_img_ids,
+        relative_path=relative_path)
+    filename = save_path / f'{pkl_prefix}_infos_test.pkl'
+    print(f'Kitti info test file is saved to {filename}')
+    mmengine.dump(kitti_infos_test, filename)
+
+
+def create_waymo_info_file(data_path,
+                           pkl_prefix='waymo',
+                           save_path=None,
+                           relative_path=True,
+                           max_sweeps=5,
+                           workers=8):
+    """Create info file of waymo dataset.
+
+    Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        data_path (str): Path of the data root.
+        pkl_prefix (str, optional): Prefix of the info file to be generated.
+            Default: 'waymo'.
+        save_path (str, optional): Path to save the info file.
+            Default: None.
+        relative_path (bool, optional): Whether to use relative path.
+            Default: True.
+        max_sweeps (int, optional): Max sweeps before the detection frame
+            to be used. Default: 5.
+    """
+    imageset_folder = Path(data_path) / 'ImageSets'
+    train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt'))
+    val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt'))
+    test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt'))
+
+    print('Generate info. this may take several minutes.')
+    if save_path is None:
+        save_path = Path(data_path)
+    else:
+        save_path = Path(save_path)
+    waymo_infos_gatherer_trainval = WaymoInfoGatherer(
+        data_path,
+        training=True,
+        velodyne=True,
+        calib=True,
+        pose=True,
+        relative_path=relative_path,
+        max_sweeps=max_sweeps,
+        num_worker=workers)
+    waymo_infos_gatherer_test = WaymoInfoGatherer(
+        data_path,
+        training=False,
+        label_info=False,
+        velodyne=True,
+        calib=True,
+        pose=True,
+        relative_path=relative_path,
+        max_sweeps=max_sweeps,
+        num_worker=workers)
+    num_points_in_gt_calculater = _NumPointsInGTCalculater(
+        data_path,
+        relative_path,
+        num_features=6,
+        remove_outside=False,
+        num_worker=workers)
+
+    waymo_infos_train = waymo_infos_gatherer_trainval.gather(train_img_ids)
+    num_points_in_gt_calculater.calculate(waymo_infos_train)
+    filename = save_path / f'{pkl_prefix}_infos_train.pkl'
+    print(f'Waymo info train file is saved to {filename}')
+    mmengine.dump(waymo_infos_train, filename)
+    waymo_infos_val = waymo_infos_gatherer_trainval.gather(val_img_ids)
+    num_points_in_gt_calculater.calculate(waymo_infos_val)
+    filename = save_path / f'{pkl_prefix}_infos_val.pkl'
+    print(f'Waymo info val file is saved to {filename}')
+    mmengine.dump(waymo_infos_val, filename)
+    filename = save_path / f'{pkl_prefix}_infos_trainval.pkl'
+    print(f'Waymo info trainval file is saved to {filename}')
+    mmengine.dump(waymo_infos_train + waymo_infos_val, filename)
+    waymo_infos_test = waymo_infos_gatherer_test.gather(test_img_ids)
+    filename = save_path / f'{pkl_prefix}_infos_test.pkl'
+    print(f'Waymo info test file is saved to {filename}')
+    mmengine.dump(waymo_infos_test, filename)
+
+
+def _create_reduced_point_cloud(data_path,
+                                info_path,
+                                save_path=None,
+                                back=False,
+                                num_features=4,
+                                front_camera_id=2):
+    """Create reduced point clouds for given info.
+
+    Args:
+        data_path (str): Path of original data.
+        info_path (str): Path of data info.
+        save_path (str, optional): Path to save reduced point cloud
+            data. Default: None.
+        back (bool, optional): Whether to flip the points to back.
+            Default: False.
+        num_features (int, optional): Number of point features. Default: 4.
+        front_camera_id (int, optional): The referenced/front camera ID.
+            Default: 2.
+    """
+    kitti_infos = mmengine.load(info_path)
+
+    for info in mmengine.track_iter_progress(kitti_infos):
+        pc_info = info['point_cloud']
+        image_info = info['image']
+        calib = info['calib']
+
+        v_path = pc_info['velodyne_path']
+        v_path = Path(data_path) / v_path
+        points_v = np.fromfile(
+            str(v_path), dtype=np.float32,
+            count=-1).reshape([-1, num_features])
+        rect = calib['R0_rect']
+        if front_camera_id == 2:
+            P2 = calib['P2']
+        else:
+            P2 = calib[f'P{str(front_camera_id)}']
+        Trv2c = calib['Tr_velo_to_cam']
+        # first remove z < 0 points
+        # keep = points_v[:, -1] > 0
+        # points_v = points_v[keep]
+        # then remove outside.
+        if back:
+            points_v[:, 0] = -points_v[:, 0]
+        points_v = box_np_ops.remove_outside_points(points_v, rect, Trv2c, P2,
+                                                    image_info['image_shape'])
+        if save_path is None:
+            save_dir = v_path.parent.parent / (v_path.parent.stem + '_reduced')
+            if not save_dir.exists():
+                save_dir.mkdir()
+            save_filename = save_dir / v_path.name
+            # save_filename = str(v_path) + '_reduced'
+            if back:
+                save_filename += '_back'
+        else:
+            save_filename = str(Path(save_path) / v_path.name)
+            if back:
+                save_filename += '_back'
+        with open(save_filename, 'w') as f:
+            points_v.tofile(f)
+
+
+def create_reduced_point_cloud(data_path,
+                               pkl_prefix,
+                               train_info_path=None,
+                               val_info_path=None,
+                               test_info_path=None,
+                               save_path=None,
+                               with_back=False):
+    """Create reduced point clouds for training/validation/testing.
+
+    Args:
+        data_path (str): Path of original data.
+        pkl_prefix (str): Prefix of info files.
+        train_info_path (str, optional): Path of training set info.
+            Default: None.
+        val_info_path (str, optional): Path of validation set info.
+            Default: None.
+        test_info_path (str, optional): Path of test set info.
+            Default: None.
+        save_path (str, optional): Path to save reduced point cloud data.
+            Default: None.
+        with_back (bool, optional): Whether to flip the points to back.
+            Default: False.
+    """
+    if train_info_path is None:
+        train_info_path = Path(data_path) / f'{pkl_prefix}_infos_train.pkl'
+    if val_info_path is None:
+        val_info_path = Path(data_path) / f'{pkl_prefix}_infos_val.pkl'
+    if test_info_path is None:
+        test_info_path = Path(data_path) / f'{pkl_prefix}_infos_test.pkl'
+
+    print('create reduced point cloud for training set')
+    _create_reduced_point_cloud(data_path, train_info_path, save_path)
+    print('create reduced point cloud for validation set')
+    _create_reduced_point_cloud(data_path, val_info_path, save_path)
+    print('create reduced point cloud for testing set')
+    _create_reduced_point_cloud(data_path, test_info_path, save_path)
+    if with_back:
+        _create_reduced_point_cloud(
+            data_path, train_info_path, save_path, back=True)
+        _create_reduced_point_cloud(
+            data_path, val_info_path, save_path, back=True)
+        _create_reduced_point_cloud(
+            data_path, test_info_path, save_path, back=True)
+
+
+def export_2d_annotation(root_path, info_path, mono3d=True):
+    """Export 2d annotation from the info file and raw data.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        info_path (str): Path of the info file.
+        mono3d (bool, optional): Whether to export mono3d annotation.
+            Default: True.
+    """
+    # get bbox annotations for camera
+    kitti_infos = mmengine.load(info_path)
+    cat2Ids = [
+        dict(id=kitti_categories.index(cat_name), name=cat_name)
+        for cat_name in kitti_categories
+    ]
+    coco_ann_id = 0
+    coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+    from os import path as osp
+    for info in mmengine.track_iter_progress(kitti_infos):
+        coco_infos = get_2d_boxes(info, occluded=[0, 1, 2, 3], mono3d=mono3d)
+        (height, width,
+         _) = mmcv.imread(osp.join(root_path,
+                                   info['image']['image_path'])).shape
+        coco_2d_dict['images'].append(
+            dict(
+                file_name=info['image']['image_path'],
+                id=info['image']['image_idx'],
+                Tri2v=info['calib']['Tr_imu_to_velo'],
+                Trv2c=info['calib']['Tr_velo_to_cam'],
+                rect=info['calib']['R0_rect'],
+                cam_intrinsic=info['calib']['P2'],
+                width=width,
+                height=height))
+        for coco_info in coco_infos:
+            if coco_info is None:
+                continue
+            # add an empty key for coco format
+            coco_info['segmentation'] = []
+            coco_info['id'] = coco_ann_id
+            coco_2d_dict['annotations'].append(coco_info)
+            coco_ann_id += 1
+    if mono3d:
+        json_prefix = f'{info_path[:-4]}_mono3d'
+    else:
+        json_prefix = f'{info_path[:-4]}'
+    mmengine.dump(coco_2d_dict, f'{json_prefix}.coco.json')
+
+
+def get_2d_boxes(info, occluded, mono3d=True):
+    """Get the 2D annotation records for a given info.
+
+    Args:
+        info: Information of the given sample data.
+        occluded: Integer (0, 1, 2, 3) indicating occlusion state:
+            0 = fully visible, 1 = partly occluded, 2 = largely occluded,
+            3 = unknown, -1 = DontCare
+        mono3d (bool): Whether to get boxes with mono3d annotation.
+
+    Return:
+        list[dict]: List of 2D annotation record that belongs to the input
+            `sample_data_token`.
+    """
+    # Get calibration information
+    P2 = info['calib']['P2']
+
+    repro_recs = []
+    # if no annotations in info (test dataset), then return
+    if 'annos' not in info:
+        return repro_recs
+
+    # Get all the annotation with the specified visibilties.
+    ann_dicts = info['annos']
+    mask = [(ocld in occluded) for ocld in ann_dicts['occluded']]
+    for k in ann_dicts.keys():
+        ann_dicts[k] = ann_dicts[k][mask]
+
+    # convert dict of list to list of dict
+    ann_recs = []
+    for i in range(len(ann_dicts['occluded'])):
+        ann_rec = {}
+        for k in ann_dicts.keys():
+            ann_rec[k] = ann_dicts[k][i]
+        ann_recs.append(ann_rec)
+
+    for ann_idx, ann_rec in enumerate(ann_recs):
+        # Augment sample_annotation with token information.
+        ann_rec['sample_annotation_token'] = \
+            f"{info['image']['image_idx']}.{ann_idx}"
+        ann_rec['sample_data_token'] = info['image']['image_idx']
+        sample_data_token = info['image']['image_idx']
+
+        loc = ann_rec['location'][np.newaxis, :]
+        dim = ann_rec['dimensions'][np.newaxis, :]
+        rot = ann_rec['rotation_y'][np.newaxis, np.newaxis]
+        # transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5]
+        dst = np.array([0.5, 0.5, 0.5])
+        src = np.array([0.5, 1.0, 0.5])
+        loc = loc + dim * (dst - src)
+        offset = (info['calib']['P2'][0, 3] - info['calib']['P0'][0, 3]) \
+            / info['calib']['P2'][0, 0]
+        loc_3d = np.copy(loc)
+        loc_3d[0, 0] += offset
+        gt_bbox_3d = np.concatenate([loc, dim, rot], axis=1).astype(np.float32)
+
+        # Filter out the corners that are not in front of the calibrated
+        # sensor.
+        corners_3d = box_np_ops.center_to_corner_box3d(
+            gt_bbox_3d[:, :3],
+            gt_bbox_3d[:, 3:6],
+            gt_bbox_3d[:, 6], [0.5, 0.5, 0.5],
+            axis=1)
+        corners_3d = corners_3d[0].T  # (1, 8, 3) -> (3, 8)
+        in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+        corners_3d = corners_3d[:, in_front]
+
+        # Project 3d box to 2d.
+        camera_intrinsic = P2
+        corner_coords = view_points(corners_3d, camera_intrinsic,
+                                    True).T[:, :2].tolist()
+
+        # Keep only corners that fall within the image.
+        final_coords = post_process_coords(corner_coords)
+
+        # Skip if the convex hull of the re-projected corners
+        # does not intersect the image canvas.
+        if final_coords is None:
+            continue
+        else:
+            min_x, min_y, max_x, max_y = final_coords
+
+        # Generate dictionary record to be included in the .json file.
+        repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+                                    sample_data_token,
+                                    info['image']['image_path'])
+
+        # If mono3d=True, add 3D annotations in camera coordinates
+        if mono3d and (repro_rec is not None):
+            repro_rec['bbox_cam3d'] = np.concatenate(
+                [loc_3d, dim, rot],
+                axis=1).astype(np.float32).squeeze().tolist()
+            repro_rec['velo_cam3d'] = -1  # no velocity in KITTI
+
+            center3d = np.array(loc).reshape([1, 3])
+            center2d = points_cam2img(
+                center3d, camera_intrinsic, with_depth=True)
+            repro_rec['center2d'] = center2d.squeeze().tolist()
+            # normalized center2D + depth
+            # samples with depth < 0 will be removed
+            if repro_rec['center2d'][2] <= 0:
+                continue
+
+            repro_rec['attribute_name'] = -1  # no attribute in KITTI
+            repro_rec['attribute_id'] = -1
+
+        repro_recs.append(repro_rec)
+
+    return repro_recs
+
+
+def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename):
+    """Generate one 2D annotation record given various information on top of
+    the 2D bounding box coordinates.
+
+    Args:
+        ann_rec (dict): Original 3d annotation record.
+        x1 (float): Minimum value of the x coordinate.
+        y1 (float): Minimum value of the y coordinate.
+        x2 (float): Maximum value of the x coordinate.
+        y2 (float): Maximum value of the y coordinate.
+        sample_data_token (str): Sample data token.
+        filename (str):The corresponding image file where the annotation
+            is present.
+
+    Returns:
+        dict: A sample 2D annotation record.
+            - file_name (str): file name
+            - image_id (str): sample data token
+            - area (float): 2d box area
+            - category_name (str): category name
+            - category_id (int): category id
+            - bbox (list[float]): left x, top y, x_size, y_size of 2d box
+            - iscrowd (int): whether the area is crowd
+    """
+    repro_rec = OrderedDict()
+    repro_rec['sample_data_token'] = sample_data_token
+    coco_rec = dict()
+
+    key_mapping = {
+        'name': 'category_name',
+        'num_points_in_gt': 'num_lidar_pts',
+        'sample_annotation_token': 'sample_annotation_token',
+        'sample_data_token': 'sample_data_token',
+    }
+
+    for key, value in ann_rec.items():
+        if key in key_mapping.keys():
+            repro_rec[key_mapping[key]] = value
+
+    repro_rec['bbox_corners'] = [x1, y1, x2, y2]
+    repro_rec['filename'] = filename
+
+    coco_rec['file_name'] = filename
+    coco_rec['image_id'] = sample_data_token
+    coco_rec['area'] = (y2 - y1) * (x2 - x1)
+
+    if repro_rec['category_name'] not in kitti_categories:
+        return None
+    cat_name = repro_rec['category_name']
+    coco_rec['category_name'] = cat_name
+    coco_rec['category_id'] = kitti_categories.index(cat_name)
+    coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
+    coco_rec['iscrowd'] = 0
+
+    return coco_rec
diff --git a/mmde/tools/dataset_converters/kitti_data_utils.py b/mmde/tools/dataset_converters/kitti_data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..64c3bc415b764ef97ba902fd9a68c909b804f281
--- /dev/null
+++ b/mmde/tools/dataset_converters/kitti_data_utils.py
@@ -0,0 +1,668 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+from concurrent import futures as futures
+from os import path as osp
+from pathlib import Path
+
+import mmengine
+import numpy as np
+from PIL import Image
+from skimage import io
+
+
+def get_image_index_str(img_idx, use_prefix_id=False):
+    if use_prefix_id:
+        return '{:07d}'.format(img_idx)
+    else:
+        return '{:06d}'.format(img_idx)
+
+
+def get_kitti_info_path(idx,
+                        prefix,
+                        info_type='image_2',
+                        file_tail='.png',
+                        training=True,
+                        relative_path=True,
+                        exist_check=True,
+                        use_prefix_id=False):
+    img_idx_str = get_image_index_str(idx, use_prefix_id)
+    img_idx_str += file_tail
+    prefix = Path(prefix)
+    if training:
+        file_path = Path('training') / info_type / img_idx_str
+    else:
+        file_path = Path('testing') / info_type / img_idx_str
+    if exist_check and not (prefix / file_path).exists():
+        raise ValueError('file not exist: {}'.format(file_path))
+    if relative_path:
+        return str(file_path)
+    else:
+        return str(prefix / file_path)
+
+
+def get_image_path(idx,
+                   prefix,
+                   training=True,
+                   relative_path=True,
+                   exist_check=True,
+                   info_type='image_2',
+                   file_tail='.png',
+                   use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, info_type, file_tail, training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_label_path(idx,
+                   prefix,
+                   training=True,
+                   relative_path=True,
+                   exist_check=True,
+                   info_type='label_2',
+                   use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, info_type, '.txt', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_plane_path(idx,
+                   prefix,
+                   training=True,
+                   relative_path=True,
+                   exist_check=True,
+                   info_type='planes',
+                   use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, info_type, '.txt', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_velodyne_path(idx,
+                      prefix,
+                      training=True,
+                      relative_path=True,
+                      exist_check=True,
+                      use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, 'velodyne', '.bin', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_calib_path(idx,
+                   prefix,
+                   training=True,
+                   relative_path=True,
+                   exist_check=True,
+                   use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, 'calib', '.txt', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_pose_path(idx,
+                  prefix,
+                  training=True,
+                  relative_path=True,
+                  exist_check=True,
+                  use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, 'pose', '.txt', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_timestamp_path(idx,
+                       prefix,
+                       training=True,
+                       relative_path=True,
+                       exist_check=True,
+                       use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, 'timestamp', '.txt', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_label_anno(label_path):
+    annotations = {}
+    annotations.update({
+        'name': [],
+        'truncated': [],
+        'occluded': [],
+        'alpha': [],
+        'bbox': [],
+        'dimensions': [],
+        'location': [],
+        'rotation_y': []
+    })
+    with open(label_path, 'r') as f:
+        lines = f.readlines()
+    # if len(lines) == 0 or len(lines[0]) < 15:
+    #     content = []
+    # else:
+    content = [line.strip().split(' ') for line in lines]
+    num_objects = len([x[0] for x in content if x[0] != 'DontCare'])
+    annotations['name'] = np.array([x[0] for x in content])
+    num_gt = len(annotations['name'])
+    annotations['truncated'] = np.array([float(x[1]) for x in content])
+    annotations['occluded'] = np.array([int(x[2]) for x in content])
+    annotations['alpha'] = np.array([float(x[3]) for x in content])
+    annotations['bbox'] = np.array([[float(info) for info in x[4:8]]
+                                    for x in content]).reshape(-1, 4)
+    # dimensions will convert hwl format to standard lhw(camera) format.
+    annotations['dimensions'] = np.array([[float(info) for info in x[8:11]]
+                                          for x in content
+                                          ]).reshape(-1, 3)[:, [2, 0, 1]]
+    annotations['location'] = np.array([[float(info) for info in x[11:14]]
+                                        for x in content]).reshape(-1, 3)
+    annotations['rotation_y'] = np.array([float(x[14])
+                                          for x in content]).reshape(-1)
+    if len(content) != 0 and len(content[0]) == 16:  # have score
+        annotations['score'] = np.array([float(x[15]) for x in content])
+    else:
+        annotations['score'] = np.zeros((annotations['bbox'].shape[0], ))
+    index = list(range(num_objects)) + [-1] * (num_gt - num_objects)
+    annotations['index'] = np.array(index, dtype=np.int32)
+    annotations['group_ids'] = np.arange(num_gt, dtype=np.int32)
+    return annotations
+
+
+def _extend_matrix(mat):
+    mat = np.concatenate([mat, np.array([[0., 0., 0., 1.]])], axis=0)
+    return mat
+
+
+def get_kitti_image_info(path,
+                         training=True,
+                         label_info=True,
+                         velodyne=False,
+                         calib=False,
+                         with_plane=False,
+                         image_ids=7481,
+                         extend_matrix=True,
+                         num_worker=8,
+                         relative_path=True,
+                         with_imageshape=True):
+    """
+    KITTI annotation format version 2:
+    {
+        [optional]points: [N, 3+] point cloud
+        [optional, for kitti]image: {
+            image_idx: ...
+            image_path: ...
+            image_shape: ...
+        }
+        point_cloud: {
+            num_features: 4
+            velodyne_path: ...
+        }
+        [optional, for kitti]calib: {
+            R0_rect: ...
+            Tr_velo_to_cam: ...
+            P2: ...
+        }
+        annos: {
+            location: [num_gt, 3] array
+            dimensions: [num_gt, 3] array
+            rotation_y: [num_gt] angle array
+            name: [num_gt] ground truth name array
+            [optional]difficulty: kitti difficulty
+            [optional]group_ids: used for multi-part object
+        }
+    }
+    """
+    root_path = Path(path)
+    if not isinstance(image_ids, list):
+        image_ids = list(range(image_ids))
+
+    def map_func(idx):
+        info = {}
+        pc_info = {'num_features': 4}
+        calib_info = {}
+
+        image_info = {'image_idx': idx}
+        annotations = None
+        if velodyne:
+            pc_info['velodyne_path'] = get_velodyne_path(
+                idx, path, training, relative_path)
+        image_info['image_path'] = get_image_path(idx, path, training,
+                                                  relative_path)
+        if with_imageshape:
+            img_path = image_info['image_path']
+            if relative_path:
+                img_path = str(root_path / img_path)
+            image_info['image_shape'] = np.array(
+                io.imread(img_path).shape[:2], dtype=np.int32)
+        if label_info:
+            label_path = get_label_path(idx, path, training, relative_path)
+            if relative_path:
+                label_path = str(root_path / label_path)
+            annotations = get_label_anno(label_path)
+        info['image'] = image_info
+        info['point_cloud'] = pc_info
+        if calib:
+            calib_path = get_calib_path(
+                idx, path, training, relative_path=False)
+            with open(calib_path, 'r') as f:
+                lines = f.readlines()
+            P0 = np.array([float(info) for info in lines[0].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P1 = np.array([float(info) for info in lines[1].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P2 = np.array([float(info) for info in lines[2].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P3 = np.array([float(info) for info in lines[3].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            if extend_matrix:
+                P0 = _extend_matrix(P0)
+                P1 = _extend_matrix(P1)
+                P2 = _extend_matrix(P2)
+                P3 = _extend_matrix(P3)
+            R0_rect = np.array([
+                float(info) for info in lines[4].split(' ')[1:10]
+            ]).reshape([3, 3])
+            if extend_matrix:
+                rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
+                rect_4x4[3, 3] = 1.
+                rect_4x4[:3, :3] = R0_rect
+            else:
+                rect_4x4 = R0_rect
+
+            Tr_velo_to_cam = np.array([
+                float(info) for info in lines[5].split(' ')[1:13]
+            ]).reshape([3, 4])
+            Tr_imu_to_velo = np.array([
+                float(info) for info in lines[6].split(' ')[1:13]
+            ]).reshape([3, 4])
+            if extend_matrix:
+                Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
+                Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo)
+            calib_info['P0'] = P0
+            calib_info['P1'] = P1
+            calib_info['P2'] = P2
+            calib_info['P3'] = P3
+            calib_info['R0_rect'] = rect_4x4
+            calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam
+            calib_info['Tr_imu_to_velo'] = Tr_imu_to_velo
+            info['calib'] = calib_info
+
+        if with_plane:
+            plane_path = get_plane_path(idx, path, training, relative_path)
+            if relative_path:
+                plane_path = str(root_path / plane_path)
+            lines = mmengine.list_from_file(plane_path)
+            info['plane'] = np.array([float(i) for i in lines[3].split()])
+
+        if annotations is not None:
+            info['annos'] = annotations
+            add_difficulty_to_annos(info)
+        return info
+
+    with futures.ThreadPoolExecutor(num_worker) as executor:
+        image_infos = executor.map(map_func, image_ids)
+
+    return list(image_infos)
+
+
+class WaymoInfoGatherer:
+    """
+    Parallel version of waymo dataset information gathering.
+    Waymo annotation format version like KITTI:
+    {
+        [optional]points: [N, 3+] point cloud
+        [optional, for kitti]image: {
+            image_idx: ...
+            image_path: ...
+            image_shape: ...
+        }
+        point_cloud: {
+            num_features: 6
+            velodyne_path: ...
+        }
+        [optional, for kitti]calib: {
+            R0_rect: ...
+            Tr_velo_to_cam0: ...
+            P0: ...
+        }
+        annos: {
+            location: [num_gt, 3] array
+            dimensions: [num_gt, 3] array
+            rotation_y: [num_gt] angle array
+            name: [num_gt] ground truth name array
+            [optional]difficulty: kitti difficulty
+            [optional]group_ids: used for multi-part object
+        }
+    }
+    """
+
+    def __init__(self,
+                 path,
+                 training=True,
+                 label_info=True,
+                 velodyne=False,
+                 calib=False,
+                 pose=False,
+                 extend_matrix=True,
+                 num_worker=8,
+                 relative_path=True,
+                 with_imageshape=True,
+                 max_sweeps=5) -> None:
+        self.path = path
+        self.training = training
+        self.label_info = label_info
+        self.velodyne = velodyne
+        self.calib = calib
+        self.pose = pose
+        self.extend_matrix = extend_matrix
+        self.num_worker = num_worker
+        self.relative_path = relative_path
+        self.with_imageshape = with_imageshape
+        self.max_sweeps = max_sweeps
+
+    def gather_single(self, idx):
+        root_path = Path(self.path)
+        info = {}
+        pc_info = {'num_features': 6}
+        calib_info = {}
+
+        image_info = {'image_idx': idx}
+        annotations = None
+        if self.velodyne:
+            pc_info['velodyne_path'] = get_velodyne_path(
+                idx,
+                self.path,
+                self.training,
+                self.relative_path,
+                use_prefix_id=True)
+        with open(
+                get_timestamp_path(
+                    idx,
+                    self.path,
+                    self.training,
+                    relative_path=False,
+                    use_prefix_id=True)) as f:
+            info['timestamp'] = np.int64(f.read())
+        image_info['image_path'] = get_image_path(
+            idx,
+            self.path,
+            self.training,
+            self.relative_path,
+            info_type='image_0',
+            file_tail='.jpg',
+            use_prefix_id=True)
+        if self.with_imageshape:
+            img_path = image_info['image_path']
+            if self.relative_path:
+                img_path = str(root_path / img_path)
+            # io using PIL is significantly faster than skimage
+            w, h = Image.open(img_path).size
+            image_info['image_shape'] = np.array((h, w), dtype=np.int32)
+        if self.label_info:
+            label_path = get_label_path(
+                idx,
+                self.path,
+                self.training,
+                self.relative_path,
+                info_type='label_all',
+                use_prefix_id=True)
+            cam_sync_label_path = get_label_path(
+                idx,
+                self.path,
+                self.training,
+                self.relative_path,
+                info_type='cam_sync_label_all',
+                use_prefix_id=True)
+            if self.relative_path:
+                label_path = str(root_path / label_path)
+                cam_sync_label_path = str(root_path / cam_sync_label_path)
+            annotations = get_label_anno(label_path)
+            cam_sync_annotations = get_label_anno(cam_sync_label_path)
+        info['image'] = image_info
+        info['point_cloud'] = pc_info
+        if self.calib:
+            calib_path = get_calib_path(
+                idx,
+                self.path,
+                self.training,
+                relative_path=False,
+                use_prefix_id=True)
+            with open(calib_path, 'r') as f:
+                lines = f.readlines()
+            P0 = np.array([float(info) for info in lines[0].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P1 = np.array([float(info) for info in lines[1].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P2 = np.array([float(info) for info in lines[2].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P3 = np.array([float(info) for info in lines[3].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P4 = np.array([float(info) for info in lines[4].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            if self.extend_matrix:
+                P0 = _extend_matrix(P0)
+                P1 = _extend_matrix(P1)
+                P2 = _extend_matrix(P2)
+                P3 = _extend_matrix(P3)
+                P4 = _extend_matrix(P4)
+            R0_rect = np.array([
+                float(info) for info in lines[5].split(' ')[1:10]
+            ]).reshape([3, 3])
+            if self.extend_matrix:
+                rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
+                rect_4x4[3, 3] = 1.
+                rect_4x4[:3, :3] = R0_rect
+            else:
+                rect_4x4 = R0_rect
+
+            # TODO: naming Tr_velo_to_cam or Tr_velo_to_cam0
+            Tr_velo_to_cam = np.array([
+                float(info) for info in lines[6].split(' ')[1:13]
+            ]).reshape([3, 4])
+            Tr_velo_to_cam1 = np.array([
+                float(info) for info in lines[7].split(' ')[1:13]
+            ]).reshape([3, 4])
+            Tr_velo_to_cam2 = np.array([
+                float(info) for info in lines[8].split(' ')[1:13]
+            ]).reshape([3, 4])
+            Tr_velo_to_cam3 = np.array([
+                float(info) for info in lines[9].split(' ')[1:13]
+            ]).reshape([3, 4])
+            Tr_velo_to_cam4 = np.array([
+                float(info) for info in lines[10].split(' ')[1:13]
+            ]).reshape([3, 4])
+            if self.extend_matrix:
+                Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
+                Tr_velo_to_cam1 = _extend_matrix(Tr_velo_to_cam1)
+                Tr_velo_to_cam2 = _extend_matrix(Tr_velo_to_cam2)
+                Tr_velo_to_cam3 = _extend_matrix(Tr_velo_to_cam3)
+                Tr_velo_to_cam4 = _extend_matrix(Tr_velo_to_cam4)
+            calib_info['P0'] = P0
+            calib_info['P1'] = P1
+            calib_info['P2'] = P2
+            calib_info['P3'] = P3
+            calib_info['P4'] = P4
+            calib_info['R0_rect'] = rect_4x4
+            calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam
+            calib_info['Tr_velo_to_cam1'] = Tr_velo_to_cam1
+            calib_info['Tr_velo_to_cam2'] = Tr_velo_to_cam2
+            calib_info['Tr_velo_to_cam3'] = Tr_velo_to_cam3
+            calib_info['Tr_velo_to_cam4'] = Tr_velo_to_cam4
+            info['calib'] = calib_info
+
+        if self.pose:
+            pose_path = get_pose_path(
+                idx,
+                self.path,
+                self.training,
+                relative_path=False,
+                use_prefix_id=True)
+            info['pose'] = np.loadtxt(pose_path)
+
+        if annotations is not None:
+            info['annos'] = annotations
+            info['annos']['camera_id'] = info['annos'].pop('score')
+            add_difficulty_to_annos(info)
+            info['cam_sync_annos'] = cam_sync_annotations
+            # NOTE: the 2D labels do not have strict correspondence with
+            # the projected 2D lidar labels
+            # e.g.: the projected 2D labels can be in camera 2
+            # while the most_visible_camera can have id 4
+            info['cam_sync_annos']['camera_id'] = info['cam_sync_annos'].pop(
+                'score')
+
+        sweeps = []
+        prev_idx = idx
+        while len(sweeps) < self.max_sweeps:
+            prev_info = {}
+            prev_idx -= 1
+            prev_info['velodyne_path'] = get_velodyne_path(
+                prev_idx,
+                self.path,
+                self.training,
+                self.relative_path,
+                exist_check=False,
+                use_prefix_id=True)
+            if_prev_exists = osp.exists(
+                Path(self.path) / prev_info['velodyne_path'])
+            if if_prev_exists:
+                with open(
+                        get_timestamp_path(
+                            prev_idx,
+                            self.path,
+                            self.training,
+                            relative_path=False,
+                            use_prefix_id=True)) as f:
+                    prev_info['timestamp'] = np.int64(f.read())
+                prev_info['image_path'] = get_image_path(
+                    prev_idx,
+                    self.path,
+                    self.training,
+                    self.relative_path,
+                    info_type='image_0',
+                    file_tail='.jpg',
+                    use_prefix_id=True)
+                prev_pose_path = get_pose_path(
+                    prev_idx,
+                    self.path,
+                    self.training,
+                    relative_path=False,
+                    use_prefix_id=True)
+                prev_info['pose'] = np.loadtxt(prev_pose_path)
+                sweeps.append(prev_info)
+            else:
+                break
+        info['sweeps'] = sweeps
+
+        return info
+
+    def gather(self, image_ids):
+        if not isinstance(image_ids, list):
+            image_ids = list(range(image_ids))
+        image_infos = mmengine.track_parallel_progress(self.gather_single,
+                                                       image_ids,
+                                                       self.num_worker)
+        return list(image_infos)
+
+
+def kitti_anno_to_label_file(annos, folder):
+    folder = Path(folder)
+    for anno in annos:
+        image_idx = anno['metadata']['image_idx']
+        label_lines = []
+        for j in range(anno['bbox'].shape[0]):
+            label_dict = {
+                'name': anno['name'][j],
+                'alpha': anno['alpha'][j],
+                'bbox': anno['bbox'][j],
+                'location': anno['location'][j],
+                'dimensions': anno['dimensions'][j],
+                'rotation_y': anno['rotation_y'][j],
+                'score': anno['score'][j],
+            }
+            label_line = kitti_result_line(label_dict)
+            label_lines.append(label_line)
+        label_file = folder / f'{get_image_index_str(image_idx)}.txt'
+        label_str = '\n'.join(label_lines)
+        with open(label_file, 'w') as f:
+            f.write(label_str)
+
+
+def add_difficulty_to_annos(info):
+    min_height = [40, 25,
+                  25]  # minimum height for evaluated groundtruth/detections
+    max_occlusion = [
+        0, 1, 2
+    ]  # maximum occlusion level of the groundtruth used for evaluation
+    max_trunc = [
+        0.15, 0.3, 0.5
+    ]  # maximum truncation level of the groundtruth used for evaluation
+    annos = info['annos']
+    dims = annos['dimensions']  # lhw format
+    bbox = annos['bbox']
+    height = bbox[:, 3] - bbox[:, 1]
+    occlusion = annos['occluded']
+    truncation = annos['truncated']
+    diff = []
+    easy_mask = np.ones((len(dims), ), dtype=bool)
+    moderate_mask = np.ones((len(dims), ), dtype=bool)
+    hard_mask = np.ones((len(dims), ), dtype=bool)
+    i = 0
+    for h, o, t in zip(height, occlusion, truncation):
+        if o > max_occlusion[0] or h <= min_height[0] or t > max_trunc[0]:
+            easy_mask[i] = False
+        if o > max_occlusion[1] or h <= min_height[1] or t > max_trunc[1]:
+            moderate_mask[i] = False
+        if o > max_occlusion[2] or h <= min_height[2] or t > max_trunc[2]:
+            hard_mask[i] = False
+        i += 1
+    is_easy = easy_mask
+    is_moderate = np.logical_xor(easy_mask, moderate_mask)
+    is_hard = np.logical_xor(hard_mask, moderate_mask)
+
+    for i in range(len(dims)):
+        if is_easy[i]:
+            diff.append(0)
+        elif is_moderate[i]:
+            diff.append(1)
+        elif is_hard[i]:
+            diff.append(2)
+        else:
+            diff.append(-1)
+    annos['difficulty'] = np.array(diff, np.int32)
+    return diff
+
+
+def kitti_result_line(result_dict, precision=4):
+    prec_float = '{' + ':.{}f'.format(precision) + '}'
+    res_line = []
+    all_field_default = OrderedDict([
+        ('name', None),
+        ('truncated', -1),
+        ('occluded', -1),
+        ('alpha', -10),
+        ('bbox', None),
+        ('dimensions', [-1, -1, -1]),
+        ('location', [-1000, -1000, -1000]),
+        ('rotation_y', -10),
+        ('score', 0.0),
+    ])
+    res_dict = [(key, None) for key, val in all_field_default.items()]
+    res_dict = OrderedDict(res_dict)
+    for key, val in result_dict.items():
+        if all_field_default[key] is None and val is None:
+            raise ValueError('you must specify a value for {}'.format(key))
+        res_dict[key] = val
+
+    for key, val in res_dict.items():
+        if key == 'name':
+            res_line.append(val)
+        elif key in ['truncated', 'alpha', 'rotation_y', 'score']:
+            if val is None:
+                res_line.append(str(all_field_default[key]))
+            else:
+                res_line.append(prec_float.format(val))
+        elif key == 'occluded':
+            if val is None:
+                res_line.append(str(all_field_default[key]))
+            else:
+                res_line.append('{}'.format(val))
+        elif key in ['bbox', 'dimensions', 'location']:
+            if val is None:
+                res_line += [str(v) for v in all_field_default[key]]
+            else:
+                res_line += [prec_float.format(v) for v in val]
+        else:
+            raise ValueError('unknown key. supported key:{}'.format(
+                res_dict.keys()))
+    return ' '.join(res_line)
diff --git a/mmde/tools/dataset_converters/kitti_unzip.sh b/mmde/tools/dataset_converters/kitti_unzip.sh
new file mode 100644
index 0000000000000000000000000000000000000000..834ddaf82e5efc1a9de941ddf37f5c696e8e3123
--- /dev/null
+++ b/mmde/tools/dataset_converters/kitti_unzip.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+DOWNLOAD_DIR=$1  # The directory where the downloaded data set is stored
+DATA_ROOT=$2  # The root directory of the converted dataset
+
+for zip_file in $DOWNLOAD_DIR/KITTI_Object/raw/*.zip; do
+    echo "Unzipping $zip_file to $DATA_ROOT ......"
+	unzip -oq $zip_file -d $DATA_ROOT
+    echo "[Done] Unzip $zip_file to $DATA_ROOT"
+    # delete the original files
+	rm -f $zip_file
+done
diff --git a/mmde/tools/dataset_converters/lyft_converter.py b/mmde/tools/dataset_converters/lyft_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9fd98af7940a5338293b819f3e62259b0085a3d
--- /dev/null
+++ b/mmde/tools/dataset_converters/lyft_converter.py
@@ -0,0 +1,273 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+from logging import warning
+from os import path as osp
+
+import mmcv
+import mmengine
+import numpy as np
+from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft
+from pyquaternion import Quaternion
+
+from mmdet3d.datasets.convert_utils import LyftNameMapping
+from .nuscenes_converter import (get_2d_boxes, get_available_scenes,
+                                 obtain_sensor2top)
+
+lyft_categories = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
+                   'motorcycle', 'bicycle', 'pedestrian', 'animal')
+
+
+def create_lyft_infos(root_path,
+                      info_prefix,
+                      version='v1.01-train',
+                      max_sweeps=10):
+    """Create info file of lyft dataset.
+
+    Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        root_path (str): Path of the data root.
+        info_prefix (str): Prefix of the info file to be generated.
+        version (str, optional): Version of the data.
+            Default: 'v1.01-train'.
+        max_sweeps (int, optional): Max number of sweeps.
+            Default: 10.
+    """
+    lyft = Lyft(
+        data_path=osp.join(root_path, version),
+        json_path=osp.join(root_path, version, version),
+        verbose=True)
+    available_vers = ['v1.01-train', 'v1.01-test']
+    assert version in available_vers
+    if version == 'v1.01-train':
+        train_scenes = mmengine.list_from_file('data/lyft/train.txt')
+        val_scenes = mmengine.list_from_file('data/lyft/val.txt')
+    elif version == 'v1.01-test':
+        train_scenes = mmengine.list_from_file('data/lyft/test.txt')
+        val_scenes = []
+    else:
+        raise ValueError('unknown')
+
+    # filter existing scenes.
+    available_scenes = get_available_scenes(lyft)
+    available_scene_names = [s['name'] for s in available_scenes]
+    train_scenes = list(
+        filter(lambda x: x in available_scene_names, train_scenes))
+    val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
+    train_scenes = set([
+        available_scenes[available_scene_names.index(s)]['token']
+        for s in train_scenes
+    ])
+    val_scenes = set([
+        available_scenes[available_scene_names.index(s)]['token']
+        for s in val_scenes
+    ])
+
+    test = 'test' in version
+    if test:
+        print(f'test scene: {len(train_scenes)}')
+    else:
+        print(f'train scene: {len(train_scenes)}, \
+                val scene: {len(val_scenes)}')
+    train_lyft_infos, val_lyft_infos = _fill_trainval_infos(
+        lyft, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
+
+    metadata = dict(version=version)
+    if test:
+        print(f'test sample: {len(train_lyft_infos)}')
+        data = dict(infos=train_lyft_infos, metadata=metadata)
+        info_name = f'{info_prefix}_infos_test'
+        info_path = osp.join(root_path, f'{info_name}.pkl')
+        mmengine.dump(data, info_path)
+    else:
+        print(f'train sample: {len(train_lyft_infos)}, \
+                val sample: {len(val_lyft_infos)}')
+        data = dict(infos=train_lyft_infos, metadata=metadata)
+        train_info_name = f'{info_prefix}_infos_train'
+        info_path = osp.join(root_path, f'{train_info_name}.pkl')
+        mmengine.dump(data, info_path)
+        data['infos'] = val_lyft_infos
+        val_info_name = f'{info_prefix}_infos_val'
+        info_val_path = osp.join(root_path, f'{val_info_name}.pkl')
+        mmengine.dump(data, info_val_path)
+
+
+def _fill_trainval_infos(lyft,
+                         train_scenes,
+                         val_scenes,
+                         test=False,
+                         max_sweeps=10):
+    """Generate the train/val infos from the raw data.
+
+    Args:
+        lyft (:obj:`LyftDataset`): Dataset class in the Lyft dataset.
+        train_scenes (list[str]): Basic information of training scenes.
+        val_scenes (list[str]): Basic information of validation scenes.
+        test (bool, optional): Whether use the test mode. In the test mode, no
+            annotations can be accessed. Default: False.
+        max_sweeps (int, optional): Max number of sweeps. Default: 10.
+
+    Returns:
+        tuple[list[dict]]: Information of training set and
+            validation set that will be saved to the info file.
+    """
+    train_lyft_infos = []
+    val_lyft_infos = []
+
+    for sample in mmengine.track_iter_progress(lyft.sample):
+        lidar_token = sample['data']['LIDAR_TOP']
+        sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP'])
+        cs_record = lyft.get('calibrated_sensor',
+                             sd_rec['calibrated_sensor_token'])
+        pose_record = lyft.get('ego_pose', sd_rec['ego_pose_token'])
+        abs_lidar_path, boxes, _ = lyft.get_sample_data(lidar_token)
+        # nuScenes devkit returns more convenient relative paths while
+        # lyft devkit returns absolute paths
+        abs_lidar_path = str(abs_lidar_path)  # absolute path
+        lidar_path = abs_lidar_path.split(f'{os.getcwd()}/')[-1]
+        # relative path
+
+        mmengine.check_file_exist(lidar_path)
+
+        info = {
+            'lidar_path': lidar_path,
+            'num_features': 5,
+            'token': sample['token'],
+            'sweeps': [],
+            'cams': dict(),
+            'lidar2ego_translation': cs_record['translation'],
+            'lidar2ego_rotation': cs_record['rotation'],
+            'ego2global_translation': pose_record['translation'],
+            'ego2global_rotation': pose_record['rotation'],
+            'timestamp': sample['timestamp'],
+        }
+
+        l2e_r = info['lidar2ego_rotation']
+        l2e_t = info['lidar2ego_translation']
+        e2g_r = info['ego2global_rotation']
+        e2g_t = info['ego2global_translation']
+        l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+        e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+        # obtain 6 image's information per frame
+        camera_types = [
+            'CAM_FRONT',
+            'CAM_FRONT_RIGHT',
+            'CAM_FRONT_LEFT',
+            'CAM_BACK',
+            'CAM_BACK_LEFT',
+            'CAM_BACK_RIGHT',
+        ]
+        for cam in camera_types:
+            cam_token = sample['data'][cam]
+            cam_path, _, cam_intrinsic = lyft.get_sample_data(cam_token)
+            cam_info = obtain_sensor2top(lyft, cam_token, l2e_t, l2e_r_mat,
+                                         e2g_t, e2g_r_mat, cam)
+            cam_info.update(cam_intrinsic=cam_intrinsic)
+            info['cams'].update({cam: cam_info})
+
+        # obtain sweeps for a single key-frame
+        sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP'])
+        sweeps = []
+        while len(sweeps) < max_sweeps:
+            if not sd_rec['prev'] == '':
+                sweep = obtain_sensor2top(lyft, sd_rec['prev'], l2e_t,
+                                          l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
+                sweeps.append(sweep)
+                sd_rec = lyft.get('sample_data', sd_rec['prev'])
+            else:
+                break
+        info['sweeps'] = sweeps
+        # obtain annotation
+        if not test:
+            annotations = [
+                lyft.get('sample_annotation', token)
+                for token in sample['anns']
+            ]
+            locs = np.array([b.center for b in boxes]).reshape(-1, 3)
+            dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
+            rots = np.array([b.orientation.yaw_pitch_roll[0]
+                             for b in boxes]).reshape(-1, 1)
+
+            names = [b.name for b in boxes]
+            for i in range(len(names)):
+                if names[i] in LyftNameMapping:
+                    names[i] = LyftNameMapping[names[i]]
+            names = np.array(names)
+
+            # we need to convert box size to
+            # the format of our lidar coordinate system
+            # which is x_size, y_size, z_size (corresponding to l, w, h)
+            gt_boxes = np.concatenate([locs, dims[:, [1, 0, 2]], rots], axis=1)
+            assert len(gt_boxes) == len(
+                annotations), f'{len(gt_boxes)}, {len(annotations)}'
+            info['gt_boxes'] = gt_boxes
+            info['gt_names'] = names
+            info['num_lidar_pts'] = np.array(
+                [a['num_lidar_pts'] for a in annotations])
+            info['num_radar_pts'] = np.array(
+                [a['num_radar_pts'] for a in annotations])
+
+        if sample['scene_token'] in train_scenes:
+            train_lyft_infos.append(info)
+        else:
+            val_lyft_infos.append(info)
+
+    return train_lyft_infos, val_lyft_infos
+
+
+def export_2d_annotation(root_path, info_path, version):
+    """Export 2d annotation from the info file and raw data.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        info_path (str): Path of the info file.
+        version (str): Dataset version.
+    """
+    warning.warn('DeprecationWarning: 2D annotations are not used on the '
+                 'Lyft dataset. The function export_2d_annotation will be '
+                 'deprecated.')
+    # get bbox annotations for camera
+    camera_types = [
+        'CAM_FRONT',
+        'CAM_FRONT_RIGHT',
+        'CAM_FRONT_LEFT',
+        'CAM_BACK',
+        'CAM_BACK_LEFT',
+        'CAM_BACK_RIGHT',
+    ]
+    lyft_infos = mmengine.load(info_path)['infos']
+    lyft = Lyft(
+        data_path=osp.join(root_path, version),
+        json_path=osp.join(root_path, version, version),
+        verbose=True)
+    # info_2d_list = []
+    cat2Ids = [
+        dict(id=lyft_categories.index(cat_name), name=cat_name)
+        for cat_name in lyft_categories
+    ]
+    coco_ann_id = 0
+    coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+    for info in mmengine.track_iter_progress(lyft_infos):
+        for cam in camera_types:
+            cam_info = info['cams'][cam]
+            coco_infos = get_2d_boxes(
+                lyft,
+                cam_info['sample_data_token'],
+                visibilities=['', '1', '2', '3', '4'])
+            (height, width, _) = mmcv.imread(cam_info['data_path']).shape
+            coco_2d_dict['images'].append(
+                dict(
+                    file_name=cam_info['data_path'],
+                    id=cam_info['sample_data_token'],
+                    width=width,
+                    height=height))
+            for coco_info in coco_infos:
+                if coco_info is None:
+                    continue
+                # add an empty key for coco format
+                coco_info['segmentation'] = []
+                coco_info['id'] = coco_ann_id
+                coco_2d_dict['annotations'].append(coco_info)
+                coco_ann_id += 1
+    mmengine.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json')
diff --git a/mmde/tools/dataset_converters/lyft_data_fixer.py b/mmde/tools/dataset_converters/lyft_data_fixer.py
new file mode 100644
index 0000000000000000000000000000000000000000..55103515a266c429f334e3c9f87fc0de9a57d746
--- /dev/null
+++ b/mmde/tools/dataset_converters/lyft_data_fixer.py
@@ -0,0 +1,39 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+
+import numpy as np
+
+
+def fix_lyft(root_folder='./data/lyft', version='v1.01'):
+    # refer to https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000  # noqa
+    lidar_path = 'lidar/host-a011_lidar1_1233090652702363606.bin'
+    root_folder = os.path.join(root_folder, f'{version}-train')
+    lidar_path = os.path.join(root_folder, lidar_path)
+    assert os.path.isfile(lidar_path), f'Please download the complete Lyft ' \
+        f'dataset and make sure {lidar_path} is present.'
+    points = np.fromfile(lidar_path, dtype=np.float32, count=-1)
+    try:
+        points.reshape([-1, 5])
+        print(f'This fix is not required for version {version}.')
+    except ValueError:
+        new_points = np.array(list(points) + [100.0, 1.0], dtype='float32')
+        new_points.tofile(lidar_path)
+        print(f'Appended 100.0 and 1.0 to the end of {lidar_path}.')
+
+
+parser = argparse.ArgumentParser(description='Lyft dataset fixer arg parser')
+parser.add_argument(
+    '--root-folder',
+    type=str,
+    default='./data/lyft',
+    help='specify the root path of Lyft dataset')
+parser.add_argument(
+    '--version',
+    type=str,
+    default='v1.01',
+    help='specify Lyft dataset version')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    fix_lyft(root_folder=args.root_folder, version=args.version)
diff --git a/mmde/tools/dataset_converters/nuimage_converter.py b/mmde/tools/dataset_converters/nuimage_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c1c60c7b98b5efaec47ff85aea3193717ac0151
--- /dev/null
+++ b/mmde/tools/dataset_converters/nuimage_converter.py
@@ -0,0 +1,227 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import base64
+from os import path as osp
+
+import mmcv
+import mmengine
+import numpy as np
+from nuimages import NuImages
+from nuimages.utils.utils import mask_decode, name_to_index_mapping
+
+nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+                  'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+                  'barrier')
+
+NAME_MAPPING = {
+    'movable_object.barrier': 'barrier',
+    'vehicle.bicycle': 'bicycle',
+    'vehicle.bus.bendy': 'bus',
+    'vehicle.bus.rigid': 'bus',
+    'vehicle.car': 'car',
+    'vehicle.construction': 'construction_vehicle',
+    'vehicle.motorcycle': 'motorcycle',
+    'human.pedestrian.adult': 'pedestrian',
+    'human.pedestrian.child': 'pedestrian',
+    'human.pedestrian.construction_worker': 'pedestrian',
+    'human.pedestrian.police_officer': 'pedestrian',
+    'movable_object.trafficcone': 'traffic_cone',
+    'vehicle.trailer': 'trailer',
+    'vehicle.truck': 'truck',
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Data converter arg parser')
+    parser.add_argument(
+        '--data-root',
+        type=str,
+        default='./data/nuimages',
+        help='specify the root path of dataset')
+    parser.add_argument(
+        '--version',
+        type=str,
+        nargs='+',
+        default=['v1.0-mini'],
+        required=False,
+        help='specify the dataset version')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='./data/nuimages/annotations/',
+        required=False,
+        help='path to save the exported json')
+    parser.add_argument(
+        '--nproc',
+        type=int,
+        default=4,
+        required=False,
+        help='workers to process semantic masks')
+    parser.add_argument('--extra-tag', type=str, default='nuimages')
+    args = parser.parse_args()
+    return args
+
+
+def get_img_annos(nuim, img_info, cat2id, out_dir, data_root, seg_root):
+    """Get semantic segmentation map for an image.
+
+    Args:
+        nuim (obj:`NuImages`): NuImages dataset object
+        img_info (dict): Meta information of img
+
+    Returns:
+        np.ndarray: Semantic segmentation map of the image
+    """
+    sd_token = img_info['token']
+    image_id = img_info['id']
+    name_to_index = name_to_index_mapping(nuim.category)
+
+    # Get image data.
+    width, height = img_info['width'], img_info['height']
+    semseg_mask = np.zeros((height, width)).astype('uint8')
+
+    # Load stuff / surface regions.
+    surface_anns = [
+        o for o in nuim.surface_ann if o['sample_data_token'] == sd_token
+    ]
+
+    # Draw stuff / surface regions.
+    for ann in surface_anns:
+        # Get color and mask.
+        category_token = ann['category_token']
+        category_name = nuim.get('category', category_token)['name']
+        if ann['mask'] is None:
+            continue
+        mask = mask_decode(ann['mask'])
+
+        # Draw mask for semantic segmentation.
+        semseg_mask[mask == 1] = name_to_index[category_name]
+
+    # Load object instances.
+    object_anns = [
+        o for o in nuim.object_ann if o['sample_data_token'] == sd_token
+    ]
+
+    # Sort by token to ensure that objects always appear in the
+    # instance mask in the same order.
+    object_anns = sorted(object_anns, key=lambda k: k['token'])
+
+    # Draw object instances.
+    # The 0 index is reserved for background; thus, the instances
+    # should start from index 1.
+    annotations = []
+    for i, ann in enumerate(object_anns, start=1):
+        # Get color, box, mask and name.
+        category_token = ann['category_token']
+        category_name = nuim.get('category', category_token)['name']
+        if ann['mask'] is None:
+            continue
+        mask = mask_decode(ann['mask'])
+
+        # Draw masks for semantic segmentation and instance segmentation.
+        semseg_mask[mask == 1] = name_to_index[category_name]
+
+        if category_name in NAME_MAPPING:
+            cat_name = NAME_MAPPING[category_name]
+            cat_id = cat2id[cat_name]
+
+            x_min, y_min, x_max, y_max = ann['bbox']
+            # encode calibrated instance mask
+            mask_anno = dict()
+            mask_anno['counts'] = base64.b64decode(
+                ann['mask']['counts']).decode()
+            mask_anno['size'] = ann['mask']['size']
+
+            data_anno = dict(
+                image_id=image_id,
+                category_id=cat_id,
+                bbox=[x_min, y_min, x_max - x_min, y_max - y_min],
+                area=(x_max - x_min) * (y_max - y_min),
+                segmentation=mask_anno,
+                iscrowd=0)
+            annotations.append(data_anno)
+
+    # after process, save semantic masks
+    img_filename = img_info['file_name']
+    seg_filename = img_filename.replace('jpg', 'png')
+    seg_filename = osp.join(seg_root, seg_filename)
+    mmcv.imwrite(semseg_mask, seg_filename)
+    return annotations, np.max(semseg_mask)
+
+
+def export_nuim_to_coco(nuim, data_root, out_dir, extra_tag, version, nproc):
+    print('Process category information')
+    categories = []
+    categories = [
+        dict(id=nus_categories.index(cat_name), name=cat_name)
+        for cat_name in nus_categories
+    ]
+    cat2id = {k_v['name']: k_v['id'] for k_v in categories}
+
+    images = []
+    print('Process image meta information...')
+    for sample_info in mmengine.track_iter_progress(nuim.sample_data):
+        if sample_info['is_key_frame']:
+            img_idx = len(images)
+            images.append(
+                dict(
+                    id=img_idx,
+                    token=sample_info['token'],
+                    file_name=sample_info['filename'],
+                    width=sample_info['width'],
+                    height=sample_info['height']))
+
+    seg_root = f'{out_dir}semantic_masks'
+    mmengine.mkdir_or_exist(seg_root)
+    mmengine.mkdir_or_exist(osp.join(data_root, 'calibrated'))
+
+    global process_img_anno
+
+    def process_img_anno(img_info):
+        single_img_annos, max_cls_id = get_img_annos(nuim, img_info, cat2id,
+                                                     out_dir, data_root,
+                                                     seg_root)
+        return single_img_annos, max_cls_id
+
+    print('Process img annotations...')
+    if nproc > 1:
+        outputs = mmengine.track_parallel_progress(
+            process_img_anno, images, nproc=nproc)
+    else:
+        outputs = []
+        for img_info in mmengine.track_iter_progress(images):
+            outputs.append(process_img_anno(img_info))
+
+    # Determine the index of object annotation
+    print('Process annotation information...')
+    annotations = []
+    max_cls_ids = []
+    for single_img_annos, max_cls_id in outputs:
+        max_cls_ids.append(max_cls_id)
+        for img_anno in single_img_annos:
+            img_anno.update(id=len(annotations))
+            annotations.append(img_anno)
+
+    max_cls_id = max(max_cls_ids)
+    print(f'Max ID of class in the semantic map: {max_cls_id}')
+
+    coco_format_json = dict(
+        images=images, annotations=annotations, categories=categories)
+
+    mmengine.mkdir_or_exist(out_dir)
+    out_file = osp.join(out_dir, f'{extra_tag}_{version}.json')
+    print(f'Annotation dumped to {out_file}')
+    mmengine.dump(coco_format_json, out_file)
+
+
+def main():
+    args = parse_args()
+    for version in args.version:
+        nuim = NuImages(
+            dataroot=args.data_root, version=version, verbose=True, lazy=True)
+        export_nuim_to_coco(nuim, args.data_root, args.out_dir, args.extra_tag,
+                            version, args.nproc)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/dataset_converters/nuscenes_converter.py b/mmde/tools/dataset_converters/nuscenes_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..c53a2871f375f00a0243b9db741a07caad20962d
--- /dev/null
+++ b/mmde/tools/dataset_converters/nuscenes_converter.py
@@ -0,0 +1,635 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+from collections import OrderedDict
+from os import path as osp
+from typing import List, Tuple, Union
+
+import mmcv
+import mmengine
+import numpy as np
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.geometry_utils import view_points
+from pyquaternion import Quaternion
+from shapely.geometry import MultiPoint, box
+
+from mmdet3d.datasets.convert_utils import NuScenesNameMapping
+from mmdet3d.structures import points_cam2img
+
+nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+                  'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+                  'barrier')
+
+nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
+                  'pedestrian.moving', 'pedestrian.standing',
+                  'pedestrian.sitting_lying_down', 'vehicle.moving',
+                  'vehicle.parked', 'vehicle.stopped', 'None')
+
+
+def create_nuscenes_infos(root_path,
+                          info_prefix,
+                          version='v1.0-trainval',
+                          max_sweeps=10):
+    """Create info file of nuscene dataset.
+
+    Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        root_path (str): Path of the data root.
+        info_prefix (str): Prefix of the info file to be generated.
+        version (str, optional): Version of the data.
+            Default: 'v1.0-trainval'.
+        max_sweeps (int, optional): Max number of sweeps.
+            Default: 10.
+    """
+    from nuscenes.nuscenes import NuScenes
+    nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+    from nuscenes.utils import splits
+    available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini']
+    assert version in available_vers
+    if version == 'v1.0-trainval':
+        train_scenes = splits.train
+        val_scenes = splits.val
+    elif version == 'v1.0-test':
+        train_scenes = splits.test
+        val_scenes = []
+    elif version == 'v1.0-mini':
+        train_scenes = splits.mini_train
+        val_scenes = splits.mini_val
+    else:
+        raise ValueError('unknown')
+
+    # filter existing scenes.
+    available_scenes = get_available_scenes(nusc)
+    available_scene_names = [s['name'] for s in available_scenes]
+    train_scenes = list(
+        filter(lambda x: x in available_scene_names, train_scenes))
+    val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
+    train_scenes = set([
+        available_scenes[available_scene_names.index(s)]['token']
+        for s in train_scenes
+    ])
+    val_scenes = set([
+        available_scenes[available_scene_names.index(s)]['token']
+        for s in val_scenes
+    ])
+
+    test = 'test' in version
+    if test:
+        print('test scene: {}'.format(len(train_scenes)))
+    else:
+        print('train scene: {}, val scene: {}'.format(
+            len(train_scenes), len(val_scenes)))
+    train_nusc_infos, val_nusc_infos = _fill_trainval_infos(
+        nusc, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
+
+    metadata = dict(version=version)
+    if test:
+        print('test sample: {}'.format(len(train_nusc_infos)))
+        data = dict(infos=train_nusc_infos, metadata=metadata)
+        info_path = osp.join(root_path,
+                             '{}_infos_test.pkl'.format(info_prefix))
+        mmengine.dump(data, info_path)
+    else:
+        print('train sample: {}, val sample: {}'.format(
+            len(train_nusc_infos), len(val_nusc_infos)))
+        data = dict(infos=train_nusc_infos, metadata=metadata)
+        info_path = osp.join(root_path,
+                             '{}_infos_train.pkl'.format(info_prefix))
+        mmengine.dump(data, info_path)
+        data['infos'] = val_nusc_infos
+        info_val_path = osp.join(root_path,
+                                 '{}_infos_val.pkl'.format(info_prefix))
+        mmengine.dump(data, info_val_path)
+
+
+def get_available_scenes(nusc):
+    """Get available scenes from the input nuscenes class.
+
+    Given the raw data, get the information of available scenes for
+    further info generation.
+
+    Args:
+        nusc (class): Dataset class in the nuScenes dataset.
+
+    Returns:
+        available_scenes (list[dict]): List of basic information for the
+            available scenes.
+    """
+    available_scenes = []
+    print('total scene num: {}'.format(len(nusc.scene)))
+    for scene in nusc.scene:
+        scene_token = scene['token']
+        scene_rec = nusc.get('scene', scene_token)
+        sample_rec = nusc.get('sample', scene_rec['first_sample_token'])
+        sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+        has_more_frames = True
+        scene_not_exist = False
+        while has_more_frames:
+            lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token'])
+            lidar_path = str(lidar_path)
+            if os.getcwd() in lidar_path:
+                # path from lyftdataset is absolute path
+                lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1]
+                # relative path
+            if not mmengine.is_filepath(lidar_path):
+                scene_not_exist = True
+                break
+            else:
+                break
+        if scene_not_exist:
+            continue
+        available_scenes.append(scene)
+    print('exist scene num: {}'.format(len(available_scenes)))
+    return available_scenes
+
+
+def _fill_trainval_infos(nusc,
+                         train_scenes,
+                         val_scenes,
+                         test=False,
+                         max_sweeps=10):
+    """Generate the train/val infos from the raw data.
+
+    Args:
+        nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset.
+        train_scenes (list[str]): Basic information of training scenes.
+        val_scenes (list[str]): Basic information of validation scenes.
+        test (bool, optional): Whether use the test mode. In test mode, no
+            annotations can be accessed. Default: False.
+        max_sweeps (int, optional): Max number of sweeps. Default: 10.
+
+    Returns:
+        tuple[list[dict]]: Information of training set and validation set
+            that will be saved to the info file.
+    """
+    train_nusc_infos = []
+    val_nusc_infos = []
+
+    for sample in mmengine.track_iter_progress(nusc.sample):
+        lidar_token = sample['data']['LIDAR_TOP']
+        sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+        cs_record = nusc.get('calibrated_sensor',
+                             sd_rec['calibrated_sensor_token'])
+        pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+        lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
+
+        mmengine.check_file_exist(lidar_path)
+
+        info = {
+            'lidar_path': lidar_path,
+            'num_features': 5,
+            'token': sample['token'],
+            'sweeps': [],
+            'cams': dict(),
+            'lidar2ego_translation': cs_record['translation'],
+            'lidar2ego_rotation': cs_record['rotation'],
+            'ego2global_translation': pose_record['translation'],
+            'ego2global_rotation': pose_record['rotation'],
+            'timestamp': sample['timestamp'],
+        }
+
+        l2e_r = info['lidar2ego_rotation']
+        l2e_t = info['lidar2ego_translation']
+        e2g_r = info['ego2global_rotation']
+        e2g_t = info['ego2global_translation']
+        l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+        e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+        # obtain 6 image's information per frame
+        camera_types = [
+            'CAM_FRONT',
+            'CAM_FRONT_RIGHT',
+            'CAM_FRONT_LEFT',
+            'CAM_BACK',
+            'CAM_BACK_LEFT',
+            'CAM_BACK_RIGHT',
+        ]
+        for cam in camera_types:
+            cam_token = sample['data'][cam]
+            cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
+            cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat,
+                                         e2g_t, e2g_r_mat, cam)
+            cam_info.update(cam_intrinsic=cam_intrinsic)
+            info['cams'].update({cam: cam_info})
+
+        # obtain sweeps for a single key-frame
+        sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+        sweeps = []
+        while len(sweeps) < max_sweeps:
+            if not sd_rec['prev'] == '':
+                sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t,
+                                          l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
+                sweeps.append(sweep)
+                sd_rec = nusc.get('sample_data', sd_rec['prev'])
+            else:
+                break
+        info['sweeps'] = sweeps
+        # obtain annotation
+        if not test:
+            annotations = [
+                nusc.get('sample_annotation', token)
+                for token in sample['anns']
+            ]
+            locs = np.array([b.center for b in boxes]).reshape(-1, 3)
+            dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
+            rots = np.array([b.orientation.yaw_pitch_roll[0]
+                             for b in boxes]).reshape(-1, 1)
+            velocity = np.array(
+                [nusc.box_velocity(token)[:2] for token in sample['anns']])
+            valid_flag = np.array(
+                [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0
+                 for anno in annotations],
+                dtype=bool).reshape(-1)
+            # convert velo from global to lidar
+            for i in range(len(boxes)):
+                velo = np.array([*velocity[i], 0.0])
+                velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(
+                    l2e_r_mat).T
+                velocity[i] = velo[:2]
+
+            names = [b.name for b in boxes]
+            for i in range(len(names)):
+                if names[i] in NuScenesNameMapping:
+                    names[i] = NuScenesNameMapping[names[i]]
+            names = np.array(names)
+            # we need to convert box size to
+            # the format of our lidar coordinate system
+            # which is x_size, y_size, z_size (corresponding to l, w, h)
+            gt_boxes = np.concatenate([locs, dims[:, [1, 0, 2]], rots], axis=1)
+            assert len(gt_boxes) == len(
+                annotations), f'{len(gt_boxes)}, {len(annotations)}'
+            info['gt_boxes'] = gt_boxes
+            info['gt_names'] = names
+            info['gt_velocity'] = velocity.reshape(-1, 2)
+            info['num_lidar_pts'] = np.array(
+                [a['num_lidar_pts'] for a in annotations])
+            info['num_radar_pts'] = np.array(
+                [a['num_radar_pts'] for a in annotations])
+            info['valid_flag'] = valid_flag
+
+            if 'lidarseg' in nusc.table_names:
+                info['pts_semantic_mask_path'] = osp.join(
+                    nusc.dataroot,
+                    nusc.get('lidarseg', lidar_token)['filename'])
+
+        if sample['scene_token'] in train_scenes:
+            train_nusc_infos.append(info)
+        else:
+            val_nusc_infos.append(info)
+
+    return train_nusc_infos, val_nusc_infos
+
+
+def obtain_sensor2top(nusc,
+                      sensor_token,
+                      l2e_t,
+                      l2e_r_mat,
+                      e2g_t,
+                      e2g_r_mat,
+                      sensor_type='lidar'):
+    """Obtain the info with RT matric from general sensor to Top LiDAR.
+
+    Args:
+        nusc (class): Dataset class in the nuScenes dataset.
+        sensor_token (str): Sample data token corresponding to the
+            specific sensor type.
+        l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
+        l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
+            in shape (3, 3).
+        e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
+        e2g_r_mat (np.ndarray): Rotation matrix from ego to global
+            in shape (3, 3).
+        sensor_type (str, optional): Sensor to calibrate. Default: 'lidar'.
+
+    Returns:
+        sweep (dict): Sweep information after transformation.
+    """
+    sd_rec = nusc.get('sample_data', sensor_token)
+    cs_record = nusc.get('calibrated_sensor',
+                         sd_rec['calibrated_sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+    data_path = str(nusc.get_sample_data_path(sd_rec['token']))
+    if os.getcwd() in data_path:  # path from lyftdataset is absolute path
+        data_path = data_path.split(f'{os.getcwd()}/')[-1]  # relative path
+    sweep = {
+        'data_path': data_path,
+        'type': sensor_type,
+        'sample_data_token': sd_rec['token'],
+        'sensor2ego_translation': cs_record['translation'],
+        'sensor2ego_rotation': cs_record['rotation'],
+        'ego2global_translation': pose_record['translation'],
+        'ego2global_rotation': pose_record['rotation'],
+        'timestamp': sd_rec['timestamp']
+    }
+    l2e_r_s = sweep['sensor2ego_rotation']
+    l2e_t_s = sweep['sensor2ego_translation']
+    e2g_r_s = sweep['ego2global_rotation']
+    e2g_t_s = sweep['ego2global_translation']
+
+    # obtain the RT from sensor to Top LiDAR
+    # sweep->ego->global->ego'->lidar
+    l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
+    e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
+    R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
+        np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+    T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
+        np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+    T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
+                  ) + l2e_t @ np.linalg.inv(l2e_r_mat).T
+    sweep['sensor2lidar_rotation'] = R.T  # points @ R.T + T
+    sweep['sensor2lidar_translation'] = T
+    return sweep
+
+
+def export_2d_annotation(root_path, info_path, version, mono3d=True):
+    """Export 2d annotation from the info file and raw data.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        info_path (str): Path of the info file.
+        version (str): Dataset version.
+        mono3d (bool, optional): Whether to export mono3d annotation.
+            Default: True.
+    """
+    # get bbox annotations for camera
+    camera_types = [
+        'CAM_FRONT',
+        'CAM_FRONT_RIGHT',
+        'CAM_FRONT_LEFT',
+        'CAM_BACK',
+        'CAM_BACK_LEFT',
+        'CAM_BACK_RIGHT',
+    ]
+    nusc_infos = mmengine.load(info_path)['infos']
+    nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+    # info_2d_list = []
+    cat2Ids = [
+        dict(id=nus_categories.index(cat_name), name=cat_name)
+        for cat_name in nus_categories
+    ]
+    coco_ann_id = 0
+    coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+    for info in mmengine.track_iter_progress(nusc_infos):
+        for cam in camera_types:
+            cam_info = info['cams'][cam]
+            coco_infos = get_2d_boxes(
+                nusc,
+                cam_info['sample_data_token'],
+                visibilities=['', '1', '2', '3', '4'],
+                mono3d=mono3d)
+            (height, width, _) = mmcv.imread(cam_info['data_path']).shape
+            coco_2d_dict['images'].append(
+                dict(
+                    file_name=cam_info['data_path'].split('data/nuscenes/')
+                    [-1],
+                    id=cam_info['sample_data_token'],
+                    token=info['token'],
+                    cam2ego_rotation=cam_info['sensor2ego_rotation'],
+                    cam2ego_translation=cam_info['sensor2ego_translation'],
+                    ego2global_rotation=info['ego2global_rotation'],
+                    ego2global_translation=info['ego2global_translation'],
+                    cam_intrinsic=cam_info['cam_intrinsic'],
+                    width=width,
+                    height=height))
+            for coco_info in coco_infos:
+                if coco_info is None:
+                    continue
+                # add an empty key for coco format
+                coco_info['segmentation'] = []
+                coco_info['id'] = coco_ann_id
+                coco_2d_dict['annotations'].append(coco_info)
+                coco_ann_id += 1
+    if mono3d:
+        json_prefix = f'{info_path[:-4]}_mono3d'
+    else:
+        json_prefix = f'{info_path[:-4]}'
+    mmengine.dump(coco_2d_dict, f'{json_prefix}.coco.json')
+
+
+def get_2d_boxes(nusc,
+                 sample_data_token: str,
+                 visibilities: List[str],
+                 mono3d=True):
+    """Get the 2D annotation records for a given `sample_data_token`.
+
+    Args:
+        sample_data_token (str): Sample data token belonging to a camera
+            keyframe.
+        visibilities (list[str]): Visibility filter.
+        mono3d (bool): Whether to get boxes with mono3d annotation.
+
+    Return:
+        list[dict]: List of 2D annotation record that belongs to the input
+            `sample_data_token`.
+    """
+
+    # Get the sample data and the sample corresponding to that sample data.
+    sd_rec = nusc.get('sample_data', sample_data_token)
+
+    assert sd_rec[
+        'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
+        ' for camera sample_data!'
+    if not sd_rec['is_key_frame']:
+        raise ValueError(
+            'The 2D re-projections are available only for keyframes.')
+
+    s_rec = nusc.get('sample', sd_rec['sample_token'])
+
+    # Get the calibrated sensor and ego pose
+    # record to get the transformation matrices.
+    cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
+    pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+    camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
+
+    # Get all the annotation with the specified visibilties.
+    ann_recs = [
+        nusc.get('sample_annotation', token) for token in s_rec['anns']
+    ]
+    ann_recs = [
+        ann_rec for ann_rec in ann_recs
+        if (ann_rec['visibility_token'] in visibilities)
+    ]
+
+    repro_recs = []
+
+    for ann_rec in ann_recs:
+        # Augment sample_annotation with token information.
+        ann_rec['sample_annotation_token'] = ann_rec['token']
+        ann_rec['sample_data_token'] = sample_data_token
+
+        # Get the box in global coordinates.
+        box = nusc.get_box(ann_rec['token'])
+
+        # Move them to the ego-pose frame.
+        box.translate(-np.array(pose_rec['translation']))
+        box.rotate(Quaternion(pose_rec['rotation']).inverse)
+
+        # Move them to the calibrated sensor frame.
+        box.translate(-np.array(cs_rec['translation']))
+        box.rotate(Quaternion(cs_rec['rotation']).inverse)
+
+        # Filter out the corners that are not in front of the calibrated
+        # sensor.
+        corners_3d = box.corners()
+        in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+        corners_3d = corners_3d[:, in_front]
+
+        # Project 3d box to 2d.
+        corner_coords = view_points(corners_3d, camera_intrinsic,
+                                    True).T[:, :2].tolist()
+
+        # Keep only corners that fall within the image.
+        final_coords = post_process_coords(corner_coords)
+
+        # Skip if the convex hull of the re-projected corners
+        # does not intersect the image canvas.
+        if final_coords is None:
+            continue
+        else:
+            min_x, min_y, max_x, max_y = final_coords
+
+        # Generate dictionary record to be included in the .json file.
+        repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+                                    sample_data_token, sd_rec['filename'])
+
+        # If mono3d=True, add 3D annotations in camera coordinates
+        if mono3d and (repro_rec is not None):
+            loc = box.center.tolist()
+
+            dim = box.wlh
+            dim[[0, 1, 2]] = dim[[1, 2, 0]]  # convert wlh to our lhw
+            dim = dim.tolist()
+
+            rot = box.orientation.yaw_pitch_roll[0]
+            rot = [-rot]  # convert the rot to our cam coordinate
+
+            global_velo2d = nusc.box_velocity(box.token)[:2]
+            global_velo3d = np.array([*global_velo2d, 0.0])
+            e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
+            c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
+            cam_velo3d = global_velo3d @ np.linalg.inv(
+                e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
+            velo = cam_velo3d[0::2].tolist()
+
+            repro_rec['bbox_cam3d'] = loc + dim + rot
+            repro_rec['velo_cam3d'] = velo
+
+            center3d = np.array(loc).reshape([1, 3])
+            center2d = points_cam2img(
+                center3d, camera_intrinsic, with_depth=True)
+            repro_rec['center2d'] = center2d.squeeze().tolist()
+            # normalized center2D + depth
+            # if samples with depth < 0 will be removed
+            if repro_rec['center2d'][2] <= 0:
+                continue
+
+            ann_token = nusc.get('sample_annotation',
+                                 box.token)['attribute_tokens']
+            if len(ann_token) == 0:
+                attr_name = 'None'
+            else:
+                attr_name = nusc.get('attribute', ann_token[0])['name']
+            attr_id = nus_attributes.index(attr_name)
+            repro_rec['attribute_name'] = attr_name
+            repro_rec['attribute_id'] = attr_id
+
+        repro_recs.append(repro_rec)
+
+    return repro_recs
+
+
+def post_process_coords(
+    corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
+) -> Union[Tuple[float, float, float, float], None]:
+    """Get the intersection of the convex hull of the reprojected bbox corners
+    and the image canvas, return None if no intersection.
+
+    Args:
+        corner_coords (list[int]): Corner coordinates of reprojected
+            bounding box.
+        imsize (tuple[int]): Size of the image canvas.
+
+    Return:
+        tuple [float]: Intersection of the convex hull of the 2D box
+            corners and the image canvas.
+    """
+    polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
+    img_canvas = box(0, 0, imsize[0], imsize[1])
+
+    if polygon_from_2d_box.intersects(img_canvas):
+        img_intersection = polygon_from_2d_box.intersection(img_canvas)
+        intersection_coords = np.array(
+            [coord for coord in img_intersection.exterior.coords])
+
+        min_x = min(intersection_coords[:, 0])
+        min_y = min(intersection_coords[:, 1])
+        max_x = max(intersection_coords[:, 0])
+        max_y = max(intersection_coords[:, 1])
+
+        return min_x, min_y, max_x, max_y
+    else:
+        return None
+
+
+def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
+                    sample_data_token: str, filename: str) -> OrderedDict:
+    """Generate one 2D annotation record given various information on top of
+    the 2D bounding box coordinates.
+
+    Args:
+        ann_rec (dict): Original 3d annotation record.
+        x1 (float): Minimum value of the x coordinate.
+        y1 (float): Minimum value of the y coordinate.
+        x2 (float): Maximum value of the x coordinate.
+        y2 (float): Maximum value of the y coordinate.
+        sample_data_token (str): Sample data token.
+        filename (str):The corresponding image file where the annotation
+            is present.
+
+    Returns:
+        dict: A sample 2D annotation record.
+            - file_name (str): file name
+            - image_id (str): sample data token
+            - area (float): 2d box area
+            - category_name (str): category name
+            - category_id (int): category id
+            - bbox (list[float]): left x, top y, dx, dy of 2d box
+            - iscrowd (int): whether the area is crowd
+    """
+    repro_rec = OrderedDict()
+    repro_rec['sample_data_token'] = sample_data_token
+    coco_rec = dict()
+
+    relevant_keys = [
+        'attribute_tokens',
+        'category_name',
+        'instance_token',
+        'next',
+        'num_lidar_pts',
+        'num_radar_pts',
+        'prev',
+        'sample_annotation_token',
+        'sample_data_token',
+        'visibility_token',
+    ]
+
+    for key, value in ann_rec.items():
+        if key in relevant_keys:
+            repro_rec[key] = value
+
+    repro_rec['bbox_corners'] = [x1, y1, x2, y2]
+    repro_rec['filename'] = filename
+
+    coco_rec['file_name'] = filename
+    coco_rec['image_id'] = sample_data_token
+    coco_rec['area'] = (y2 - y1) * (x2 - x1)
+
+    if repro_rec['category_name'] not in NuScenesNameMapping:
+        return None
+    cat_name = NuScenesNameMapping[repro_rec['category_name']]
+    coco_rec['category_name'] = cat_name
+    coco_rec['category_id'] = nus_categories.index(cat_name)
+    coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
+    coco_rec['iscrowd'] = 0
+
+    return coco_rec
diff --git a/mmde/tools/dataset_converters/nuscenes_unzip.sh b/mmde/tools/dataset_converters/nuscenes_unzip.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2ba1f5fa9f91d93b1818ac6c7182638f3ba0c39e
--- /dev/null
+++ b/mmde/tools/dataset_converters/nuscenes_unzip.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+DOWNLOAD_DIR=$1  # The directory where the downloaded data set is stored
+DATA_ROOT=$2  # The root directory of the converted dataset
+
+for split in $DOWNLOAD_DIR/nuScenes/raw/*; do
+    for tgz_file in $split/*; do
+        if [[ $tgz_file == *.tgz ]]
+        then
+            echo "Unzipping $tgz_file to $DATA_ROOT ......"
+            tar -zxvf $tgz_file -C $DATA_ROOT/
+            echo "[Done] Unzip $tgz_file to $DATA_ROOT"
+        fi
+        # delete the original files
+        rm -f $tgz_file
+    done
+done
diff --git a/mmde/tools/dataset_converters/s3dis_data_utils.py b/mmde/tools/dataset_converters/s3dis_data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7a76a975060cc012a49650b4706d7148acff97b
--- /dev/null
+++ b/mmde/tools/dataset_converters/s3dis_data_utils.py
@@ -0,0 +1,247 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+from concurrent import futures as futures
+from os import path as osp
+
+import mmengine
+import numpy as np
+
+
+class S3DISData(object):
+    """S3DIS data.
+
+    Generate s3dis infos for s3dis_converter.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        split (str, optional): Set split type of the data. Default: 'Area_1'.
+    """
+
+    def __init__(self, root_path, split='Area_1'):
+        self.root_dir = root_path
+        self.split = split
+        self.data_dir = osp.join(root_path,
+                                 'Stanford3dDataset_v1.2_Aligned_Version')
+
+        # Following `GSDN <https://arxiv.org/abs/2006.12356>`_, use 5 furniture
+        # classes for detection: table, chair, sofa, bookcase, board.
+        self.cat_ids = np.array([7, 8, 9, 10, 11])
+        self.cat_ids2class = {
+            cat_id: i
+            for i, cat_id in enumerate(list(self.cat_ids))
+        }
+
+        assert split in [
+            'Area_1', 'Area_2', 'Area_3', 'Area_4', 'Area_5', 'Area_6'
+        ]
+        self.sample_id_list = os.listdir(osp.join(self.data_dir,
+                                                  split))  # conferenceRoom_1
+        for sample_id in self.sample_id_list:
+            if os.path.isfile(osp.join(self.data_dir, split, sample_id)):
+                self.sample_id_list.remove(sample_id)
+
+    def __len__(self):
+        return len(self.sample_id_list)
+
+    def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
+        """Get data infos.
+
+        This method gets information from the raw data.
+
+        Args:
+            num_workers (int, optional): Number of threads to be used.
+                Default: 4.
+            has_label (bool, optional): Whether the data has label.
+                Default: True.
+            sample_id_list (list[int], optional): Index list of the sample.
+                Default: None.
+
+        Returns:
+            infos (list[dict]): Information of the raw data.
+        """
+
+        def process_single_scene(sample_idx):
+            print(f'{self.split} sample_idx: {sample_idx}')
+            info = dict()
+            pc_info = {
+                'num_features': 6,
+                'lidar_idx': f'{self.split}_{sample_idx}'
+            }
+            info['point_cloud'] = pc_info
+            pts_filename = osp.join(self.root_dir, 's3dis_data',
+                                    f'{self.split}_{sample_idx}_point.npy')
+            pts_instance_mask_path = osp.join(
+                self.root_dir, 's3dis_data',
+                f'{self.split}_{sample_idx}_ins_label.npy')
+            pts_semantic_mask_path = osp.join(
+                self.root_dir, 's3dis_data',
+                f'{self.split}_{sample_idx}_sem_label.npy')
+
+            points = np.load(pts_filename).astype(np.float32)
+            pts_instance_mask = np.load(pts_instance_mask_path).astype(
+                np.int64)
+            pts_semantic_mask = np.load(pts_semantic_mask_path).astype(
+                np.int64)
+
+            mmengine.mkdir_or_exist(osp.join(self.root_dir, 'points'))
+            mmengine.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask'))
+            mmengine.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask'))
+
+            points.tofile(
+                osp.join(self.root_dir, 'points',
+                         f'{self.split}_{sample_idx}.bin'))
+            pts_instance_mask.tofile(
+                osp.join(self.root_dir, 'instance_mask',
+                         f'{self.split}_{sample_idx}.bin'))
+            pts_semantic_mask.tofile(
+                osp.join(self.root_dir, 'semantic_mask',
+                         f'{self.split}_{sample_idx}.bin'))
+
+            info['pts_path'] = osp.join('points',
+                                        f'{self.split}_{sample_idx}.bin')
+            info['pts_instance_mask_path'] = osp.join(
+                'instance_mask', f'{self.split}_{sample_idx}.bin')
+            info['pts_semantic_mask_path'] = osp.join(
+                'semantic_mask', f'{self.split}_{sample_idx}.bin')
+            info['annos'] = self.get_bboxes(points, pts_instance_mask,
+                                            pts_semantic_mask)
+
+            return info
+
+        sample_id_list = sample_id_list if sample_id_list is not None \
+            else self.sample_id_list
+        with futures.ThreadPoolExecutor(num_workers) as executor:
+            infos = executor.map(process_single_scene, sample_id_list)
+        return list(infos)
+
+    def get_bboxes(self, points, pts_instance_mask, pts_semantic_mask):
+        """Convert instance masks to axis-aligned bounding boxes.
+
+        Args:
+            points (np.array): Scene points of shape (n, 6).
+            pts_instance_mask (np.ndarray): Instance labels of shape (n,).
+            pts_semantic_mask (np.ndarray): Semantic labels of shape (n,).
+
+        Returns:
+            dict: A dict containing detection infos with following keys:
+
+                - gt_boxes_upright_depth (np.ndarray): Bounding boxes
+                    of shape (n, 6)
+                - class (np.ndarray): Box labels of shape (n,)
+                - gt_num (int): Number of boxes.
+        """
+        bboxes, labels = [], []
+        for i in range(1, pts_instance_mask.max() + 1):
+            ids = pts_instance_mask == i
+            # check if all instance points have same semantic label
+            assert pts_semantic_mask[ids].min() == pts_semantic_mask[ids].max()
+            label = pts_semantic_mask[ids][0]
+            # keep only furniture objects
+            if label in self.cat_ids2class:
+                labels.append(self.cat_ids2class[pts_semantic_mask[ids][0]])
+                pts = points[:, :3][ids]
+                min_pts = pts.min(axis=0)
+                max_pts = pts.max(axis=0)
+                locations = (min_pts + max_pts) / 2
+                dimensions = max_pts - min_pts
+                bboxes.append(np.concatenate((locations, dimensions)))
+        annotation = dict()
+        # follow ScanNet and SUN RGB-D keys
+        annotation['gt_boxes_upright_depth'] = np.array(bboxes)
+        annotation['class'] = np.array(labels)
+        annotation['gt_num'] = len(labels)
+        return annotation
+
+
+class S3DISSegData(object):
+    """S3DIS dataset used to generate infos for semantic segmentation task.
+
+    Args:
+        data_root (str): Root path of the raw data.
+        ann_file (str): The generated scannet infos.
+        split (str, optional): Set split type of the data. Default: 'train'.
+        num_points (int, optional): Number of points in each data input.
+            Default: 8192.
+        label_weight_func (function, optional): Function to compute the
+            label weight. Default: None.
+    """
+
+    def __init__(self,
+                 data_root,
+                 ann_file,
+                 split='Area_1',
+                 num_points=4096,
+                 label_weight_func=None):
+        self.data_root = data_root
+        self.data_infos = mmengine.load(ann_file)
+        self.split = split
+        self.num_points = num_points
+
+        self.all_ids = np.arange(13)  # all possible ids
+        self.cat_ids = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+                                 12])  # used for seg task
+        self.ignore_index = len(self.cat_ids)
+
+        self.cat_id2class = np.ones(
+            (self.all_ids.shape[0], ), dtype=np.int64) * self.ignore_index
+        for i, cat_id in enumerate(self.cat_ids):
+            self.cat_id2class[cat_id] = i
+
+        # label weighting function is taken from
+        # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
+        self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \
+            label_weight_func is None else label_weight_func
+
+    def get_seg_infos(self):
+        scene_idxs, label_weight = self.get_scene_idxs_and_label_weight()
+        save_folder = osp.join(self.data_root, 'seg_info')
+        mmengine.mkdir_or_exist(save_folder)
+        np.save(
+            osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'),
+            scene_idxs)
+        np.save(
+            osp.join(save_folder, f'{self.split}_label_weight.npy'),
+            label_weight)
+        print(f'{self.split} resampled scene index and label weight saved')
+
+    def _convert_to_label(self, mask):
+        """Convert class_id in loaded segmentation mask to label."""
+        if isinstance(mask, str):
+            if mask.endswith('npy'):
+                mask = np.load(mask)
+            else:
+                mask = np.fromfile(mask, dtype=np.int64)
+        label = self.cat_id2class[mask]
+        return label
+
+    def get_scene_idxs_and_label_weight(self):
+        """Compute scene_idxs for data sampling and label weight for loss
+        calculation.
+
+        We sample more times for scenes with more points. Label_weight is
+        inversely proportional to number of class points.
+        """
+        num_classes = len(self.cat_ids)
+        num_point_all = []
+        label_weight = np.zeros((num_classes + 1, ))  # ignore_index
+        for data_info in self.data_infos:
+            label = self._convert_to_label(
+                osp.join(self.data_root, data_info['pts_semantic_mask_path']))
+            num_point_all.append(label.shape[0])
+            class_count, _ = np.histogram(label, range(num_classes + 2))
+            label_weight += class_count
+
+        # repeat scene_idx for num_scene_point // num_sample_point times
+        sample_prob = np.array(num_point_all) / float(np.sum(num_point_all))
+        num_iter = int(np.sum(num_point_all) / float(self.num_points))
+        scene_idxs = []
+        for idx in range(len(self.data_infos)):
+            scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter)))
+        scene_idxs = np.array(scene_idxs).astype(np.int32)
+
+        # calculate label weight, adopted from PointNet++
+        label_weight = label_weight[:-1].astype(np.float32)
+        label_weight = label_weight / label_weight.sum()
+        label_weight = self.label_weight_func(label_weight).astype(np.float32)
+
+        return scene_idxs, label_weight
diff --git a/mmde/tools/dataset_converters/scannet_data_utils.py b/mmde/tools/dataset_converters/scannet_data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d03c2208feab09f3883f81c2b91275e802cc7533
--- /dev/null
+++ b/mmde/tools/dataset_converters/scannet_data_utils.py
@@ -0,0 +1,299 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+from concurrent import futures as futures
+from os import path as osp
+
+import mmengine
+import numpy as np
+
+
+class ScanNetData(object):
+    """ScanNet data.
+
+    Generate scannet infos for scannet_converter.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        split (str, optional): Set split type of the data. Default: 'train'.
+    """
+
+    def __init__(self, root_path, split='train'):
+        self.root_dir = root_path
+        self.split = split
+        self.split_dir = osp.join(root_path)
+        self.classes = [
+            'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+            'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+            'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+            'garbagebin'
+        ]
+        self.cat2label = {cat: self.classes.index(cat) for cat in self.classes}
+        self.label2cat = {self.cat2label[t]: t for t in self.cat2label}
+        self.cat_ids = np.array(
+            [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39])
+        self.cat_ids2class = {
+            nyu40id: i
+            for i, nyu40id in enumerate(list(self.cat_ids))
+        }
+        assert split in ['train', 'val', 'test']
+        split_file = osp.join(self.root_dir, 'meta_data',
+                              f'scannetv2_{split}.txt')
+        mmengine.check_file_exist(split_file)
+        self.sample_id_list = mmengine.list_from_file(split_file)
+        self.test_mode = (split == 'test')
+
+    def __len__(self):
+        return len(self.sample_id_list)
+
+    def get_aligned_box_label(self, idx):
+        box_file = osp.join(self.root_dir, 'scannet_instance_data',
+                            f'{idx}_aligned_bbox.npy')
+        mmengine.check_file_exist(box_file)
+        return np.load(box_file)
+
+    def get_unaligned_box_label(self, idx):
+        box_file = osp.join(self.root_dir, 'scannet_instance_data',
+                            f'{idx}_unaligned_bbox.npy')
+        mmengine.check_file_exist(box_file)
+        return np.load(box_file)
+
+    def get_axis_align_matrix(self, idx):
+        matrix_file = osp.join(self.root_dir, 'scannet_instance_data',
+                               f'{idx}_axis_align_matrix.npy')
+        mmengine.check_file_exist(matrix_file)
+        return np.load(matrix_file)
+
+    def get_images(self, idx):
+        paths = []
+        path = osp.join(self.root_dir, 'posed_images', idx)
+        for file in sorted(os.listdir(path)):
+            if file.endswith('.jpg'):
+                paths.append(osp.join('posed_images', idx, file))
+        return paths
+
+    def get_extrinsics(self, idx):
+        extrinsics = []
+        path = osp.join(self.root_dir, 'posed_images', idx)
+        for file in sorted(os.listdir(path)):
+            if file.endswith('.txt') and not file == 'intrinsic.txt':
+                extrinsics.append(np.loadtxt(osp.join(path, file)))
+        return extrinsics
+
+    def get_intrinsics(self, idx):
+        matrix_file = osp.join(self.root_dir, 'posed_images', idx,
+                               'intrinsic.txt')
+        mmengine.check_file_exist(matrix_file)
+        return np.loadtxt(matrix_file)
+
+    def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
+        """Get data infos.
+
+        This method gets information from the raw data.
+
+        Args:
+            num_workers (int, optional): Number of threads to be used.
+                Default: 4.
+            has_label (bool, optional): Whether the data has label.
+                Default: True.
+            sample_id_list (list[int], optional): Index list of the sample.
+                Default: None.
+
+        Returns:
+            infos (list[dict]): Information of the raw data.
+        """
+
+        def process_single_scene(sample_idx):
+            print(f'{self.split} sample_idx: {sample_idx}')
+            info = dict()
+            pc_info = {'num_features': 6, 'lidar_idx': sample_idx}
+            info['point_cloud'] = pc_info
+            pts_filename = osp.join(self.root_dir, 'scannet_instance_data',
+                                    f'{sample_idx}_vert.npy')
+            points = np.load(pts_filename)
+            mmengine.mkdir_or_exist(osp.join(self.root_dir, 'points'))
+            points.tofile(
+                osp.join(self.root_dir, 'points', f'{sample_idx}.bin'))
+            info['pts_path'] = osp.join('points', f'{sample_idx}.bin')
+
+            # update with RGB image paths if exist
+            if os.path.exists(osp.join(self.root_dir, 'posed_images')):
+                info['intrinsics'] = self.get_intrinsics(sample_idx)
+                all_extrinsics = self.get_extrinsics(sample_idx)
+                all_img_paths = self.get_images(sample_idx)
+                # some poses in ScanNet are invalid
+                extrinsics, img_paths = [], []
+                for extrinsic, img_path in zip(all_extrinsics, all_img_paths):
+                    if np.all(np.isfinite(extrinsic)):
+                        img_paths.append(img_path)
+                        extrinsics.append(extrinsic)
+                info['extrinsics'] = extrinsics
+                info['img_paths'] = img_paths
+
+            if not self.test_mode:
+                pts_instance_mask_path = osp.join(
+                    self.root_dir, 'scannet_instance_data',
+                    f'{sample_idx}_ins_label.npy')
+                pts_semantic_mask_path = osp.join(
+                    self.root_dir, 'scannet_instance_data',
+                    f'{sample_idx}_sem_label.npy')
+
+                pts_instance_mask = np.load(pts_instance_mask_path).astype(
+                    np.int64)
+                pts_semantic_mask = np.load(pts_semantic_mask_path).astype(
+                    np.int64)
+
+                mmengine.mkdir_or_exist(
+                    osp.join(self.root_dir, 'instance_mask'))
+                mmengine.mkdir_or_exist(
+                    osp.join(self.root_dir, 'semantic_mask'))
+
+                pts_instance_mask.tofile(
+                    osp.join(self.root_dir, 'instance_mask',
+                             f'{sample_idx}.bin'))
+                pts_semantic_mask.tofile(
+                    osp.join(self.root_dir, 'semantic_mask',
+                             f'{sample_idx}.bin'))
+
+                info['pts_instance_mask_path'] = osp.join(
+                    'instance_mask', f'{sample_idx}.bin')
+                info['pts_semantic_mask_path'] = osp.join(
+                    'semantic_mask', f'{sample_idx}.bin')
+
+            if has_label:
+                annotations = {}
+                # box is of shape [k, 6 + class]
+                aligned_box_label = self.get_aligned_box_label(sample_idx)
+                unaligned_box_label = self.get_unaligned_box_label(sample_idx)
+                annotations['gt_num'] = aligned_box_label.shape[0]
+                if annotations['gt_num'] != 0:
+                    aligned_box = aligned_box_label[:, :-1]  # k, 6
+                    unaligned_box = unaligned_box_label[:, :-1]
+                    classes = aligned_box_label[:, -1]  # k
+                    annotations['name'] = np.array([
+                        self.label2cat[self.cat_ids2class[classes[i]]]
+                        for i in range(annotations['gt_num'])
+                    ])
+                    # default names are given to aligned bbox for compatibility
+                    # we also save unaligned bbox info with marked names
+                    annotations['location'] = aligned_box[:, :3]
+                    annotations['dimensions'] = aligned_box[:, 3:6]
+                    annotations['gt_boxes_upright_depth'] = aligned_box
+                    annotations['unaligned_location'] = unaligned_box[:, :3]
+                    annotations['unaligned_dimensions'] = unaligned_box[:, 3:6]
+                    annotations[
+                        'unaligned_gt_boxes_upright_depth'] = unaligned_box
+                    annotations['index'] = np.arange(
+                        annotations['gt_num'], dtype=np.int32)
+                    annotations['class'] = np.array([
+                        self.cat_ids2class[classes[i]]
+                        for i in range(annotations['gt_num'])
+                    ])
+                axis_align_matrix = self.get_axis_align_matrix(sample_idx)
+                annotations['axis_align_matrix'] = axis_align_matrix  # 4x4
+                info['annos'] = annotations
+            return info
+
+        sample_id_list = sample_id_list if sample_id_list is not None \
+            else self.sample_id_list
+        with futures.ThreadPoolExecutor(num_workers) as executor:
+            infos = executor.map(process_single_scene, sample_id_list)
+        return list(infos)
+
+
+class ScanNetSegData(object):
+    """ScanNet dataset used to generate infos for semantic segmentation task.
+
+    Args:
+        data_root (str): Root path of the raw data.
+        ann_file (str): The generated scannet infos.
+        split (str, optional): Set split type of the data. Default: 'train'.
+        num_points (int, optional): Number of points in each data input.
+            Default: 8192.
+        label_weight_func (function, optional): Function to compute the
+            label weight. Default: None.
+    """
+
+    def __init__(self,
+                 data_root,
+                 ann_file,
+                 split='train',
+                 num_points=8192,
+                 label_weight_func=None):
+        self.data_root = data_root
+        self.data_infos = mmengine.load(ann_file)
+        self.split = split
+        assert split in ['train', 'val', 'test']
+        self.num_points = num_points
+
+        self.all_ids = np.arange(41)  # all possible ids
+        self.cat_ids = np.array([
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36,
+            39
+        ])  # used for seg task
+        self.ignore_index = len(self.cat_ids)
+
+        self.cat_id2class = np.ones(
+            (self.all_ids.shape[0], ), dtype=np.int64) * self.ignore_index
+        for i, cat_id in enumerate(self.cat_ids):
+            self.cat_id2class[cat_id] = i
+
+        # label weighting function is taken from
+        # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
+        self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \
+            label_weight_func is None else label_weight_func
+
+    def get_seg_infos(self):
+        if self.split == 'test':
+            return
+        scene_idxs, label_weight = self.get_scene_idxs_and_label_weight()
+        save_folder = osp.join(self.data_root, 'seg_info')
+        mmengine.mkdir_or_exist(save_folder)
+        np.save(
+            osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'),
+            scene_idxs)
+        np.save(
+            osp.join(save_folder, f'{self.split}_label_weight.npy'),
+            label_weight)
+        print(f'{self.split} resampled scene index and label weight saved')
+
+    def _convert_to_label(self, mask):
+        """Convert class_id in loaded segmentation mask to label."""
+        if isinstance(mask, str):
+            if mask.endswith('npy'):
+                mask = np.load(mask)
+            else:
+                mask = np.fromfile(mask, dtype=np.int64)
+        label = self.cat_id2class[mask]
+        return label
+
+    def get_scene_idxs_and_label_weight(self):
+        """Compute scene_idxs for data sampling and label weight for loss
+        calculation.
+
+        We sample more times for scenes with more points. Label_weight is
+        inversely proportional to number of class points.
+        """
+        num_classes = len(self.cat_ids)
+        num_point_all = []
+        label_weight = np.zeros((num_classes + 1, ))  # ignore_index
+        for data_info in self.data_infos:
+            label = self._convert_to_label(
+                osp.join(self.data_root, data_info['pts_semantic_mask_path']))
+            num_point_all.append(label.shape[0])
+            class_count, _ = np.histogram(label, range(num_classes + 2))
+            label_weight += class_count
+
+        # repeat scene_idx for num_scene_point // num_sample_point times
+        sample_prob = np.array(num_point_all) / float(np.sum(num_point_all))
+        num_iter = int(np.sum(num_point_all) / float(self.num_points))
+        scene_idxs = []
+        for idx in range(len(self.data_infos)):
+            scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter)))
+        scene_idxs = np.array(scene_idxs).astype(np.int32)
+
+        # calculate label weight, adopted from PointNet++
+        label_weight = label_weight[:-1].astype(np.float32)
+        label_weight = label_weight / label_weight.sum()
+        label_weight = self.label_weight_func(label_weight).astype(np.float32)
+
+        return scene_idxs, label_weight
diff --git a/mmde/tools/dataset_converters/semantickitti_converter.py b/mmde/tools/dataset_converters/semantickitti_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..2454eea6f98d3d48cf1de95331fa90385248cf6d
--- /dev/null
+++ b/mmde/tools/dataset_converters/semantickitti_converter.py
@@ -0,0 +1,103 @@
+from os import path as osp
+from pathlib import Path
+
+import mmengine
+
+total_num = {
+    0: 4541,
+    1: 1101,
+    2: 4661,
+    3: 801,
+    4: 271,
+    5: 2761,
+    6: 1101,
+    7: 1101,
+    8: 4071,
+    9: 1591,
+    10: 1201,
+    11: 921,
+    12: 1061,
+    13: 3281,
+    14: 631,
+    15: 1901,
+    16: 1731,
+    17: 491,
+    18: 1801,
+    19: 4981,
+    20: 831,
+    21: 2721,
+}
+fold_split = {
+    'train': [0, 1, 2, 3, 4, 5, 6, 7, 9, 10],
+    'val': [8],
+    'test': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
+}
+split_list = ['train', 'valid', 'test']
+
+
+def get_semantickitti_info(split):
+    """Create info file in the form of
+        data_infos={
+            'metainfo': {'DATASET': 'SemanticKITTI'},
+            'data_list': {
+                00000: {
+                    'lidar_points':{
+                        'lidat_path':'sequences/00/velodyne/000000.bin'
+                    },
+                    'pts_semantic_mask_path':
+                        'sequences/000/labels/000000.labbel',
+                    'sample_id': '00'
+                },
+                ...
+            }
+        }
+    """
+    data_infos = dict()
+    data_infos['metainfo'] = dict(DATASET='SemanticKITTI')
+    data_list = []
+    for i_folder in fold_split[split]:
+        for j in range(0, total_num[i_folder]):
+            data_list.append({
+                'lidar_points': {
+                    'lidar_path':
+                    osp.join('sequences',
+                             str(i_folder).zfill(2), 'velodyne',
+                             str(j).zfill(6) + '.bin'),
+                    'num_pts_feats':
+                    4
+                },
+                'pts_semantic_mask_path':
+                osp.join('sequences',
+                         str(i_folder).zfill(2), 'labels',
+                         str(j).zfill(6) + '.label'),
+                'sample_id':
+                str(i_folder) + str(j)
+            })
+    data_infos.update(dict(data_list=data_list))
+    return data_infos
+
+
+def create_semantickitti_info_file(pkl_prefix, save_path):
+    """Create info file of SemanticKITTI dataset.
+
+    Directly generate info file without raw data.
+
+    Args:
+        pkl_prefix (str): Prefix of the info file to be generated.
+        save_path (str): Path to save the info file.
+    """
+    print('Generate info.')
+    save_path = Path(save_path)
+
+    semantickitti_infos_train = get_semantickitti_info(split='train')
+    filename = save_path / f'{pkl_prefix}_infos_train.pkl'
+    print(f'SemanticKITTI info train file is saved to {filename}')
+    mmengine.dump(semantickitti_infos_train, filename)
+    semantickitti_infos_val = get_semantickitti_info(split='val')
+    filename = save_path / f'{pkl_prefix}_infos_val.pkl'
+    print(f'SemanticKITTI info val file is saved to {filename}')
+    mmengine.dump(semantickitti_infos_val, filename)
+    semantickitti_infos_test = get_semantickitti_info(split='test')
+    filename = save_path / f'{pkl_prefix}_infos_test.pkl'
+    print(f'SemanticKITTI info test file is saved to {filename}')
+    mmengine.dump(semantickitti_infos_test, filename)
diff --git a/mmde/tools/dataset_converters/semantickitti_unzip.sh b/mmde/tools/dataset_converters/semantickitti_unzip.sh
new file mode 100644
index 0000000000000000000000000000000000000000..79892cc7149c0b545f69a22fa53724b29d0699e1
--- /dev/null
+++ b/mmde/tools/dataset_converters/semantickitti_unzip.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+DOWNLOAD_DIR=$1  # The directory where the downloaded data set is stored
+DATA_ROOT=$2  # The root directory of the converted dataset
+
+for zip_file in $DOWNLOAD_DIR/SemanticKITTI/raw/*.zip; do
+    echo "Unzipping $zip_file to $DATA_ROOT ......"
+	unzip -oq $zip_file -d $DATA_ROOT
+    echo "[Done] Unzip $zip_file to $DATA_ROOT"
+    # delete the original files
+	rm -f $zip_file
+done
diff --git a/mmde/tools/dataset_converters/sunrgbd_data_utils.py b/mmde/tools/dataset_converters/sunrgbd_data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6444c4b6ef1f00b1dfe851cd4f8b30ab824a8da8
--- /dev/null
+++ b/mmde/tools/dataset_converters/sunrgbd_data_utils.py
@@ -0,0 +1,227 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from concurrent import futures as futures
+from os import path as osp
+
+import mmcv
+import mmengine
+import numpy as np
+from scipy import io as sio
+
+
+def random_sampling(points, num_points, replace=None, return_choices=False):
+    """Random sampling.
+
+    Sampling point cloud to a certain number of points.
+
+    Args:
+        points (ndarray): Point cloud.
+        num_points (int): The number of samples.
+        replace (bool): Whether the sample is with or without replacement.
+        return_choices (bool): Whether to return choices.
+
+    Returns:
+        points (ndarray): Point cloud after sampling.
+    """
+
+    if replace is None:
+        replace = (points.shape[0] < num_points)
+    choices = np.random.choice(points.shape[0], num_points, replace=replace)
+    if return_choices:
+        return points[choices], choices
+    else:
+        return points[choices]
+
+
+class SUNRGBDInstance(object):
+
+    def __init__(self, line):
+        data = line.split(' ')
+        data[1:] = [float(x) for x in data[1:]]
+        self.classname = data[0]
+        self.xmin = data[1]
+        self.ymin = data[2]
+        self.xmax = data[1] + data[3]
+        self.ymax = data[2] + data[4]
+        self.box2d = np.array([self.xmin, self.ymin, self.xmax, self.ymax])
+        self.centroid = np.array([data[5], data[6], data[7]])
+        self.width = data[8]
+        self.length = data[9]
+        self.height = data[10]
+        # data[9] is x_size (length), data[8] is y_size (width), data[10] is
+        # z_size (height) in our depth coordinate system,
+        # l corresponds to the size along the x axis
+        self.size = np.array([data[9], data[8], data[10]]) * 2
+        self.orientation = np.zeros((3, ))
+        self.orientation[0] = data[11]
+        self.orientation[1] = data[12]
+        self.heading_angle = np.arctan2(self.orientation[1],
+                                        self.orientation[0])
+        self.box3d = np.concatenate(
+            [self.centroid, self.size, self.heading_angle[None]])
+
+
+class SUNRGBDData(object):
+    """SUNRGBD data.
+
+    Generate scannet infos for sunrgbd_converter.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        split (str, optional): Set split type of the data. Default: 'train'.
+        use_v1 (bool, optional): Whether to use v1. Default: False.
+    """
+
+    def __init__(self, root_path, split='train', use_v1=False):
+        self.root_dir = root_path
+        self.split = split
+        self.split_dir = osp.join(root_path, 'sunrgbd_trainval')
+        self.classes = [
+            'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+            'night_stand', 'bookshelf', 'bathtub'
+        ]
+        self.cat2label = {cat: self.classes.index(cat) for cat in self.classes}
+        self.label2cat = {
+            label: self.classes[label]
+            for label in range(len(self.classes))
+        }
+        assert split in ['train', 'val', 'test']
+        split_file = osp.join(self.split_dir, f'{split}_data_idx.txt')
+        mmengine.check_file_exist(split_file)
+        self.sample_id_list = map(int, mmengine.list_from_file(split_file))
+        self.image_dir = osp.join(self.split_dir, 'image')
+        self.calib_dir = osp.join(self.split_dir, 'calib')
+        self.depth_dir = osp.join(self.split_dir, 'depth')
+        if use_v1:
+            self.label_dir = osp.join(self.split_dir, 'label_v1')
+        else:
+            self.label_dir = osp.join(self.split_dir, 'label')
+
+    def __len__(self):
+        return len(self.sample_id_list)
+
+    def get_image(self, idx):
+        img_filename = osp.join(self.image_dir, f'{idx:06d}.jpg')
+        return mmcv.imread(img_filename)
+
+    def get_image_shape(self, idx):
+        image = self.get_image(idx)
+        return np.array(image.shape[:2], dtype=np.int32)
+
+    def get_depth(self, idx):
+        depth_filename = osp.join(self.depth_dir, f'{idx:06d}.mat')
+        depth = sio.loadmat(depth_filename)['instance']
+        return depth
+
+    def get_calibration(self, idx):
+        calib_filepath = osp.join(self.calib_dir, f'{idx:06d}.txt')
+        lines = [line.rstrip() for line in open(calib_filepath)]
+        Rt = np.array([float(x) for x in lines[0].split(' ')])
+        Rt = np.reshape(Rt, (3, 3), order='F').astype(np.float32)
+        K = np.array([float(x) for x in lines[1].split(' ')])
+        K = np.reshape(K, (3, 3), order='F').astype(np.float32)
+        return K, Rt
+
+    def get_label_objects(self, idx):
+        label_filename = osp.join(self.label_dir, f'{idx:06d}.txt')
+        lines = [line.rstrip() for line in open(label_filename)]
+        objects = [SUNRGBDInstance(line) for line in lines]
+        return objects
+
+    def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
+        """Get data infos.
+
+        This method gets information from the raw data.
+
+        Args:
+            num_workers (int, optional): Number of threads to be used.
+                Default: 4.
+            has_label (bool, optional): Whether the data has label.
+                Default: True.
+            sample_id_list (list[int], optional): Index list of the sample.
+                Default: None.
+
+        Returns:
+            infos (list[dict]): Information of the raw data.
+        """
+
+        def process_single_scene(sample_idx):
+            print(f'{self.split} sample_idx: {sample_idx}')
+            # convert depth to points
+            SAMPLE_NUM = 50000
+            # TODO: Check whether can move the point
+            #  sampling process during training.
+            pc_upright_depth = self.get_depth(sample_idx)
+            pc_upright_depth_subsampled = random_sampling(
+                pc_upright_depth, SAMPLE_NUM)
+
+            info = dict()
+            pc_info = {'num_features': 6, 'lidar_idx': sample_idx}
+            info['point_cloud'] = pc_info
+
+            mmengine.mkdir_or_exist(osp.join(self.root_dir, 'points'))
+            pc_upright_depth_subsampled.tofile(
+                osp.join(self.root_dir, 'points', f'{sample_idx:06d}.bin'))
+
+            info['pts_path'] = osp.join('points', f'{sample_idx:06d}.bin')
+            img_path = osp.join('image', f'{sample_idx:06d}.jpg')
+            image_info = {
+                'image_idx': sample_idx,
+                'image_shape': self.get_image_shape(sample_idx),
+                'image_path': img_path
+            }
+            info['image'] = image_info
+
+            K, Rt = self.get_calibration(sample_idx)
+            calib_info = {'K': K, 'Rt': Rt}
+            info['calib'] = calib_info
+
+            if has_label:
+                obj_list = self.get_label_objects(sample_idx)
+                annotations = {}
+                annotations['gt_num'] = len([
+                    obj.classname for obj in obj_list
+                    if obj.classname in self.cat2label.keys()
+                ])
+                if annotations['gt_num'] != 0:
+                    annotations['name'] = np.array([
+                        obj.classname for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ])
+                    annotations['bbox'] = np.concatenate([
+                        obj.box2d.reshape(1, 4) for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ],
+                                                         axis=0)
+                    annotations['location'] = np.concatenate([
+                        obj.centroid.reshape(1, 3) for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ],
+                                                             axis=0)
+                    annotations['dimensions'] = 2 * np.array([
+                        [obj.length, obj.width, obj.height] for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ])  # lwh (depth) format
+                    annotations['rotation_y'] = np.array([
+                        obj.heading_angle for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ])
+                    annotations['index'] = np.arange(
+                        len(obj_list), dtype=np.int32)
+                    annotations['class'] = np.array([
+                        self.cat2label[obj.classname] for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ])
+                    annotations['gt_boxes_upright_depth'] = np.stack(
+                        [
+                            obj.box3d for obj in obj_list
+                            if obj.classname in self.cat2label.keys()
+                        ],
+                        axis=0)  # (K,8)
+                info['annos'] = annotations
+            return info
+
+        sample_id_list = sample_id_list if \
+            sample_id_list is not None else self.sample_id_list
+        with futures.ThreadPoolExecutor(num_workers) as executor:
+            infos = executor.map(process_single_scene, sample_id_list)
+        return list(infos)
diff --git a/mmde/tools/dataset_converters/update_infos_to_v2.py b/mmde/tools/dataset_converters/update_infos_to_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2ddbd0688d0ace278b7b5012c0d67888d7aa5f9
--- /dev/null
+++ b/mmde/tools/dataset_converters/update_infos_to_v2.py
@@ -0,0 +1,1160 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Convert the annotation pkl to the standard format in OpenMMLab V2.0.
+
+Example:
+    python tools/dataset_converters/update_infos_to_v2.py
+        --dataset kitti
+        --pkl-path ./data/kitti/kitti_infos_train.pkl
+        --out-dir ./kitti_v2/
+"""
+
+import argparse
+import copy
+import time
+from os import path as osp
+from pathlib import Path
+
+import mmengine
+import numpy as np
+from nuscenes.nuscenes import NuScenes
+
+from mmdet3d.datasets.convert_utils import (convert_annos,
+                                            get_kitti_style_2d_boxes,
+                                            get_nuscenes_2d_boxes)
+from mmdet3d.datasets.utils import convert_quaternion_to_matrix
+from mmdet3d.structures import points_cam2img
+
+
+def get_empty_instance():
+    """Empty annotation for single instance."""
+    instance = dict(
+        # (list[float], required): list of 4 numbers representing
+        # the bounding box of the instance, in (x1, y1, x2, y2) order.
+        bbox=None,
+        # (int, required): an integer in the range
+        # [0, num_categories-1] representing the category label.
+        bbox_label=None,
+        #  (list[float], optional): list of 7 (or 9) numbers representing
+        #  the 3D bounding box of the instance,
+        #  in [x, y, z, w, h, l, yaw]
+        #  (or [x, y, z, w, h, l, yaw, vx, vy]) order.
+        bbox_3d=None,
+        # (bool, optional): Whether to use the
+        # 3D bounding box during training.
+        bbox_3d_isvalid=None,
+        # (int, optional): 3D category label
+        # (typically the same as label).
+        bbox_label_3d=None,
+        # (float, optional): Projected center depth of the
+        # 3D bounding box compared to the image plane.
+        depth=None,
+        #  (list[float], optional): Projected
+        #  2D center of the 3D bounding box.
+        center_2d=None,
+        # (int, optional): Attribute labels
+        # (fine-grained labels such as stopping, moving, ignore, crowd).
+        attr_label=None,
+        # (int, optional): The number of LiDAR
+        # points in the 3D bounding box.
+        num_lidar_pts=None,
+        # (int, optional): The number of Radar
+        # points in the 3D bounding box.
+        num_radar_pts=None,
+        # (int, optional): Difficulty level of
+        # detecting the 3D bounding box.
+        difficulty=None,
+        unaligned_bbox_3d=None)
+    return instance
+
+
+def get_empty_multicamera_instances(camera_types):
+
+    cam_instance = dict()
+    for cam_type in camera_types:
+        cam_instance[cam_type] = None
+    return cam_instance
+
+
+def get_empty_lidar_points():
+    lidar_points = dict(
+        # (int, optional) : Number of features for each point.
+        num_pts_feats=None,
+        # (str, optional): Path of LiDAR data file.
+        lidar_path=None,
+        # (list[list[float]], optional): Transformation matrix
+        # from lidar to ego-vehicle
+        # with shape [4, 4].
+        # (Referenced camera coordinate system is ego in KITTI.)
+        lidar2ego=None,
+    )
+    return lidar_points
+
+
+def get_empty_radar_points():
+    radar_points = dict(
+        # (int, optional) : Number of features for each point.
+        num_pts_feats=None,
+        # (str, optional): Path of RADAR data file.
+        radar_path=None,
+        # Transformation matrix from lidar to
+        # ego-vehicle with shape [4, 4].
+        # (Referenced camera coordinate system is ego in KITTI.)
+        radar2ego=None,
+    )
+    return radar_points
+
+
+def get_empty_img_info():
+    img_info = dict(
+        # (str, required): the path to the image file.
+        img_path=None,
+        # (int) The height of the image.
+        height=None,
+        # (int) The width of the image.
+        width=None,
+        # (str, optional): Path of the depth map file
+        depth_map=None,
+        # (list[list[float]], optional) : Transformation
+        # matrix from camera to image with
+        # shape [3, 3], [3, 4] or [4, 4].
+        cam2img=None,
+        # (list[list[float]]): Transformation matrix from lidar
+        # or depth to image with shape [4, 4].
+        lidar2img=None,
+        # (list[list[float]], optional) : Transformation
+        # matrix from camera to ego-vehicle
+        # with shape [4, 4].
+        cam2ego=None)
+    return img_info
+
+
+def get_single_image_sweep(camera_types):
+    single_image_sweep = dict(
+        # (float, optional) : Timestamp of the current frame.
+        timestamp=None,
+        # (list[list[float]], optional) : Transformation matrix
+        # from ego-vehicle to the global
+        ego2global=None)
+    # (dict): Information of images captured by multiple cameras
+    images = dict()
+    for cam_type in camera_types:
+        images[cam_type] = get_empty_img_info()
+    single_image_sweep['images'] = images
+    return single_image_sweep
+
+
+def get_single_lidar_sweep():
+    single_lidar_sweep = dict(
+        # (float, optional) : Timestamp of the current frame.
+        timestamp=None,
+        # (list[list[float]], optional) : Transformation matrix
+        # from ego-vehicle to the global
+        ego2global=None,
+        # (dict): Information of images captured by multiple cameras
+        lidar_points=get_empty_lidar_points())
+    return single_lidar_sweep
+
+
+def get_empty_standard_data_info(
+        camera_types=['CAM0', 'CAM1', 'CAM2', 'CAM3', 'CAM4']):
+
+    data_info = dict(
+        # (str): Sample id of the frame.
+        sample_idx=None,
+        # (str, optional): '000010'
+        token=None,
+        **get_single_image_sweep(camera_types),
+        # (dict, optional): dict contains information
+        # of LiDAR point cloud frame.
+        lidar_points=get_empty_lidar_points(),
+        # (dict, optional) Each dict contains
+        # information of Radar point cloud frame.
+        radar_points=get_empty_radar_points(),
+        # (list[dict], optional): Image sweeps data.
+        image_sweeps=[],
+        lidar_sweeps=[],
+        instances=[],
+        # (list[dict], optional): Required by object
+        # detection, instance  to be ignored during training.
+        instances_ignore=[],
+        # (str, optional): Path of semantic labels for each point.
+        pts_semantic_mask_path=None,
+        # (str, optional): Path of instance labels for each point.
+        pts_instance_mask_path=None)
+    return data_info
+
+
+def clear_instance_unused_keys(instance):
+    keys = list(instance.keys())
+    for k in keys:
+        if instance[k] is None:
+            del instance[k]
+    return instance
+
+
+def clear_data_info_unused_keys(data_info):
+    keys = list(data_info.keys())
+    empty_flag = True
+    for key in keys:
+        # we allow no annotations in datainfo
+        if key in ['instances', 'cam_sync_instances', 'cam_instances']:
+            empty_flag = False
+            continue
+        if isinstance(data_info[key], list):
+            if len(data_info[key]) == 0:
+                del data_info[key]
+            else:
+                empty_flag = False
+        elif data_info[key] is None:
+            del data_info[key]
+        elif isinstance(data_info[key], dict):
+            _, sub_empty_flag = clear_data_info_unused_keys(data_info[key])
+            if sub_empty_flag is False:
+                empty_flag = False
+            else:
+                # sub field is empty
+                del data_info[key]
+        else:
+            empty_flag = False
+
+    return data_info, empty_flag
+
+
+def generate_nuscenes_camera_instances(info, nusc):
+
+    # get bbox annotations for camera
+    camera_types = [
+        'CAM_FRONT',
+        'CAM_FRONT_RIGHT',
+        'CAM_FRONT_LEFT',
+        'CAM_BACK',
+        'CAM_BACK_LEFT',
+        'CAM_BACK_RIGHT',
+    ]
+
+    empty_multicamera_instance = get_empty_multicamera_instances(camera_types)
+
+    for cam in camera_types:
+        cam_info = info['cams'][cam]
+        # list[dict]
+        ann_infos = get_nuscenes_2d_boxes(
+            nusc,
+            cam_info['sample_data_token'],
+            visibilities=['', '1', '2', '3', '4'])
+        empty_multicamera_instance[cam] = ann_infos
+
+    return empty_multicamera_instance
+
+
+def update_nuscenes_infos(pkl_path, out_dir):
+    camera_types = [
+        'CAM_FRONT',
+        'CAM_FRONT_RIGHT',
+        'CAM_FRONT_LEFT',
+        'CAM_BACK',
+        'CAM_BACK_LEFT',
+        'CAM_BACK_RIGHT',
+    ]
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print(f'Warning, you may overwriting '
+              f'the original data {pkl_path}.')
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    METAINFO = {
+        'classes':
+        ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+         'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'),
+    }
+    nusc = NuScenes(
+        version=data_list['metadata']['version'],
+        dataroot='./data/nuscenes',
+        verbose=True)
+
+    print('Start updating:')
+    converted_list = []
+    for i, ori_info_dict in enumerate(
+            mmengine.track_iter_progress(data_list['infos'])):
+        temp_data_info = get_empty_standard_data_info(
+            camera_types=camera_types)
+        temp_data_info['sample_idx'] = i
+        temp_data_info['token'] = ori_info_dict['token']
+        temp_data_info['ego2global'] = convert_quaternion_to_matrix(
+            ori_info_dict['ego2global_rotation'],
+            ori_info_dict['ego2global_translation'])
+        temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict.get(
+            'num_features', 5)
+        temp_data_info['lidar_points']['lidar_path'] = Path(
+            ori_info_dict['lidar_path']).name
+        temp_data_info['lidar_points'][
+            'lidar2ego'] = convert_quaternion_to_matrix(
+                ori_info_dict['lidar2ego_rotation'],
+                ori_info_dict['lidar2ego_translation'])
+        # bc-breaking: Timestamp has divided 1e6 in pkl infos.
+        temp_data_info['timestamp'] = ori_info_dict['timestamp'] / 1e6
+        for ori_sweep in ori_info_dict['sweeps']:
+            temp_lidar_sweep = get_single_lidar_sweep()
+            temp_lidar_sweep['lidar_points'][
+                'lidar2ego'] = convert_quaternion_to_matrix(
+                    ori_sweep['sensor2ego_rotation'],
+                    ori_sweep['sensor2ego_translation'])
+            temp_lidar_sweep['ego2global'] = convert_quaternion_to_matrix(
+                ori_sweep['ego2global_rotation'],
+                ori_sweep['ego2global_translation'])
+            lidar2sensor = np.eye(4)
+            rot = ori_sweep['sensor2lidar_rotation']
+            trans = ori_sweep['sensor2lidar_translation']
+            lidar2sensor[:3, :3] = rot.T
+            lidar2sensor[:3, 3:4] = -1 * np.matmul(rot.T, trans.reshape(3, 1))
+            temp_lidar_sweep['lidar_points'][
+                'lidar2sensor'] = lidar2sensor.astype(np.float32).tolist()
+            temp_lidar_sweep['timestamp'] = ori_sweep['timestamp'] / 1e6
+            temp_lidar_sweep['lidar_points']['lidar_path'] = ori_sweep[
+                'data_path']
+            temp_lidar_sweep['sample_data_token'] = ori_sweep[
+                'sample_data_token']
+            temp_data_info['lidar_sweeps'].append(temp_lidar_sweep)
+        temp_data_info['images'] = {}
+        for cam in ori_info_dict['cams']:
+            empty_img_info = get_empty_img_info()
+            empty_img_info['img_path'] = Path(
+                ori_info_dict['cams'][cam]['data_path']).name
+            empty_img_info['cam2img'] = ori_info_dict['cams'][cam][
+                'cam_intrinsic'].tolist()
+            empty_img_info['sample_data_token'] = ori_info_dict['cams'][cam][
+                'sample_data_token']
+            # bc-breaking: Timestamp has divided 1e6 in pkl infos.
+            empty_img_info[
+                'timestamp'] = ori_info_dict['cams'][cam]['timestamp'] / 1e6
+            empty_img_info['cam2ego'] = convert_quaternion_to_matrix(
+                ori_info_dict['cams'][cam]['sensor2ego_rotation'],
+                ori_info_dict['cams'][cam]['sensor2ego_translation'])
+            lidar2sensor = np.eye(4)
+            rot = ori_info_dict['cams'][cam]['sensor2lidar_rotation']
+            trans = ori_info_dict['cams'][cam]['sensor2lidar_translation']
+            lidar2sensor[:3, :3] = rot.T
+            lidar2sensor[:3, 3:4] = -1 * np.matmul(rot.T, trans.reshape(3, 1))
+            empty_img_info['lidar2cam'] = lidar2sensor.astype(
+                np.float32).tolist()
+            temp_data_info['images'][cam] = empty_img_info
+        ignore_class_name = set()
+        if 'gt_boxes' in ori_info_dict:
+            num_instances = ori_info_dict['gt_boxes'].shape[0]
+            for i in range(num_instances):
+                empty_instance = get_empty_instance()
+                empty_instance['bbox_3d'] = ori_info_dict['gt_boxes'][
+                    i, :].tolist()
+                if ori_info_dict['gt_names'][i] in METAINFO['classes']:
+                    empty_instance['bbox_label'] = METAINFO['classes'].index(
+                        ori_info_dict['gt_names'][i])
+                else:
+                    ignore_class_name.add(ori_info_dict['gt_names'][i])
+                    empty_instance['bbox_label'] = -1
+                empty_instance['bbox_label_3d'] = copy.deepcopy(
+                    empty_instance['bbox_label'])
+                empty_instance['velocity'] = ori_info_dict['gt_velocity'][
+                    i, :].tolist()
+                empty_instance['num_lidar_pts'] = ori_info_dict[
+                    'num_lidar_pts'][i]
+                empty_instance['num_radar_pts'] = ori_info_dict[
+                    'num_radar_pts'][i]
+                empty_instance['bbox_3d_isvalid'] = ori_info_dict[
+                    'valid_flag'][i]
+                empty_instance = clear_instance_unused_keys(empty_instance)
+                temp_data_info['instances'].append(empty_instance)
+            temp_data_info[
+                'cam_instances'] = generate_nuscenes_camera_instances(
+                    ori_info_dict, nusc)
+        if 'pts_semantic_mask_path' in ori_info_dict:
+            temp_data_info['pts_semantic_mask_path'] = Path(
+                ori_info_dict['pts_semantic_mask_path']).name
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
+    print(f'ignore classes: {ignore_class_name}')
+
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 'nuscenes'
+    metainfo['version'] = data_list['metadata']['version']
+    metainfo['info_version'] = '1.1'
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def update_kitti_infos(pkl_path, out_dir):
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print(f'Warning, you may overwriting '
+              f'the original data {pkl_path}.')
+        time.sleep(5)
+    # TODO update to full label
+    # TODO discuss how to process 'Van', 'DontCare'
+    METAINFO = {
+        'classes': ('Pedestrian', 'Cyclist', 'Car', 'Van', 'Truck',
+                    'Person_sitting', 'Tram', 'Misc'),
+    }
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    print('Start updating:')
+    converted_list = []
+    for ori_info_dict in mmengine.track_iter_progress(data_list):
+        temp_data_info = get_empty_standard_data_info()
+
+        if 'plane' in ori_info_dict:
+            temp_data_info['plane'] = ori_info_dict['plane']
+
+        temp_data_info['sample_idx'] = ori_info_dict['image']['image_idx']
+
+        temp_data_info['images']['CAM0']['cam2img'] = ori_info_dict['calib'][
+            'P0'].tolist()
+        temp_data_info['images']['CAM1']['cam2img'] = ori_info_dict['calib'][
+            'P1'].tolist()
+        temp_data_info['images']['CAM2']['cam2img'] = ori_info_dict['calib'][
+            'P2'].tolist()
+        temp_data_info['images']['CAM3']['cam2img'] = ori_info_dict['calib'][
+            'P3'].tolist()
+
+        temp_data_info['images']['CAM2']['img_path'] = Path(
+            ori_info_dict['image']['image_path']).name
+        h, w = ori_info_dict['image']['image_shape']
+        temp_data_info['images']['CAM2']['height'] = h
+        temp_data_info['images']['CAM2']['width'] = w
+        temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict[
+            'point_cloud']['num_features']
+        temp_data_info['lidar_points']['lidar_path'] = Path(
+            ori_info_dict['point_cloud']['velodyne_path']).name
+
+        rect = ori_info_dict['calib']['R0_rect'].astype(np.float32)
+        Trv2c = ori_info_dict['calib']['Tr_velo_to_cam'].astype(np.float32)
+        lidar2cam = rect @ Trv2c
+        temp_data_info['images']['CAM2']['lidar2cam'] = lidar2cam.tolist()
+        temp_data_info['images']['CAM0']['lidar2img'] = (
+            ori_info_dict['calib']['P0'] @ lidar2cam).tolist()
+        temp_data_info['images']['CAM1']['lidar2img'] = (
+            ori_info_dict['calib']['P1'] @ lidar2cam).tolist()
+        temp_data_info['images']['CAM2']['lidar2img'] = (
+            ori_info_dict['calib']['P2'] @ lidar2cam).tolist()
+        temp_data_info['images']['CAM3']['lidar2img'] = (
+            ori_info_dict['calib']['P3'] @ lidar2cam).tolist()
+
+        temp_data_info['lidar_points']['Tr_velo_to_cam'] = Trv2c.tolist()
+
+        # for potential usage
+        temp_data_info['images']['R0_rect'] = ori_info_dict['calib'][
+            'R0_rect'].astype(np.float32).tolist()
+        temp_data_info['lidar_points']['Tr_imu_to_velo'] = ori_info_dict[
+            'calib']['Tr_imu_to_velo'].astype(np.float32).tolist()
+
+        cam2img = ori_info_dict['calib']['P2']
+
+        anns = ori_info_dict.get('annos', None)
+        ignore_class_name = set()
+        if anns is not None:
+            num_instances = len(anns['name'])
+            instance_list = []
+            for instance_id in range(num_instances):
+                empty_instance = get_empty_instance()
+                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+
+                if anns['name'][instance_id] in METAINFO['classes']:
+                    empty_instance['bbox_label'] = METAINFO['classes'].index(
+                        anns['name'][instance_id])
+                else:
+                    ignore_class_name.add(anns['name'][instance_id])
+                    empty_instance['bbox_label'] = -1
+
+                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+
+                loc = anns['location'][instance_id]
+                dims = anns['dimensions'][instance_id]
+                rots = anns['rotation_y'][:, None][instance_id]
+
+                dst = np.array([0.5, 0.5, 0.5])
+                src = np.array([0.5, 1.0, 0.5])
+
+                center_3d = loc + dims * (dst - src)
+                center_2d = points_cam2img(
+                    center_3d.reshape([1, 3]), cam2img, with_depth=True)
+                center_2d = center_2d.squeeze().tolist()
+                empty_instance['center_2d'] = center_2d[:2]
+                empty_instance['depth'] = center_2d[2]
+
+                gt_bboxes_3d = np.concatenate([loc, dims, rots]).tolist()
+                empty_instance['bbox_3d'] = gt_bboxes_3d
+                empty_instance['bbox_label_3d'] = copy.deepcopy(
+                    empty_instance['bbox_label'])
+                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+                empty_instance['truncated'] = anns['truncated'][
+                    instance_id].tolist()
+                empty_instance['occluded'] = anns['occluded'][
+                    instance_id].tolist()
+                empty_instance['alpha'] = anns['alpha'][instance_id].tolist()
+                empty_instance['score'] = anns['score'][instance_id].tolist()
+                empty_instance['index'] = anns['index'][instance_id].tolist()
+                empty_instance['group_id'] = anns['group_ids'][
+                    instance_id].tolist()
+                empty_instance['difficulty'] = anns['difficulty'][
+                    instance_id].tolist()
+                empty_instance['num_lidar_pts'] = anns['num_points_in_gt'][
+                    instance_id].tolist()
+                empty_instance = clear_instance_unused_keys(empty_instance)
+                instance_list.append(empty_instance)
+            temp_data_info['instances'] = instance_list
+            cam_instances = generate_kitti_camera_instances(ori_info_dict)
+            temp_data_info['cam_instances'] = cam_instances
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
+    print(f'ignore classes: {ignore_class_name}')
+
+    # dataset metainfo
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 'kitti'
+    metainfo['info_version'] = '1.1'
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def update_s3dis_infos(pkl_path, out_dir):
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print(f'Warning, you may overwriting '
+              f'the original data {pkl_path}.')
+        time.sleep(5)
+    METAINFO = {'classes': ('table', 'chair', 'sofa', 'bookcase', 'board')}
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    print('Start updating:')
+    converted_list = []
+    for i, ori_info_dict in enumerate(mmengine.track_iter_progress(data_list)):
+        temp_data_info = get_empty_standard_data_info()
+        temp_data_info['sample_idx'] = i
+        temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict[
+            'point_cloud']['num_features']
+        temp_data_info['lidar_points']['lidar_path'] = Path(
+            ori_info_dict['pts_path']).name
+        if 'pts_semantic_mask_path' in ori_info_dict:
+            temp_data_info['pts_semantic_mask_path'] = Path(
+                ori_info_dict['pts_semantic_mask_path']).name
+        if 'pts_instance_mask_path' in ori_info_dict:
+            temp_data_info['pts_instance_mask_path'] = Path(
+                ori_info_dict['pts_instance_mask_path']).name
+
+        # TODO support camera
+        # np.linalg.inv(info['axis_align_matrix'] @ extrinsic): depth2cam
+        anns = ori_info_dict.get('annos', None)
+        ignore_class_name = set()
+        if anns is not None:
+            if anns['gt_num'] == 0:
+                instance_list = []
+            else:
+                num_instances = len(anns['class'])
+                instance_list = []
+                for instance_id in range(num_instances):
+                    empty_instance = get_empty_instance()
+                    empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
+                        instance_id].tolist()
+
+                    if anns['class'][instance_id] < len(METAINFO['classes']):
+                        empty_instance['bbox_label_3d'] = anns['class'][
+                            instance_id]
+                    else:
+                        ignore_class_name.add(
+                            METAINFO['classes'][anns['class'][instance_id]])
+                        empty_instance['bbox_label_3d'] = -1
+
+                    empty_instance = clear_instance_unused_keys(empty_instance)
+                    instance_list.append(empty_instance)
+            temp_data_info['instances'] = instance_list
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
+    print(f'ignore classes: {ignore_class_name}')
+
+    # dataset metainfo
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 's3dis'
+    metainfo['info_version'] = '1.1'
+
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def update_scannet_infos(pkl_path, out_dir):
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print(f'Warning, you may overwriting '
+              f'the original data {pkl_path}.')
+        time.sleep(5)
+    METAINFO = {
+        'classes':
+        ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+         'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator',
+         'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin')
+    }
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    print('Start updating:')
+    converted_list = []
+    for ori_info_dict in mmengine.track_iter_progress(data_list):
+        temp_data_info = get_empty_standard_data_info()
+        temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict[
+            'point_cloud']['num_features']
+        temp_data_info['lidar_points']['lidar_path'] = Path(
+            ori_info_dict['pts_path']).name
+        if 'pts_semantic_mask_path' in ori_info_dict:
+            temp_data_info['pts_semantic_mask_path'] = Path(
+                ori_info_dict['pts_semantic_mask_path']).name
+        if 'pts_instance_mask_path' in ori_info_dict:
+            temp_data_info['pts_instance_mask_path'] = Path(
+                ori_info_dict['pts_instance_mask_path']).name
+
+        # TODO support camera
+        # np.linalg.inv(info['axis_align_matrix'] @ extrinsic): depth2cam
+        anns = ori_info_dict.get('annos', None)
+        ignore_class_name = set()
+        if anns is not None:
+            temp_data_info['axis_align_matrix'] = anns[
+                'axis_align_matrix'].tolist()
+            if anns['gt_num'] == 0:
+                instance_list = []
+            else:
+                num_instances = len(anns['name'])
+                instance_list = []
+                for instance_id in range(num_instances):
+                    empty_instance = get_empty_instance()
+                    empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
+                        instance_id].tolist()
+
+                    if anns['name'][instance_id] in METAINFO['classes']:
+                        empty_instance['bbox_label_3d'] = METAINFO[
+                            'classes'].index(anns['name'][instance_id])
+                    else:
+                        ignore_class_name.add(anns['name'][instance_id])
+                        empty_instance['bbox_label_3d'] = -1
+
+                    empty_instance = clear_instance_unused_keys(empty_instance)
+                    instance_list.append(empty_instance)
+            temp_data_info['instances'] = instance_list
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
+    print(f'ignore classes: {ignore_class_name}')
+
+    # dataset metainfo
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 'scannet'
+    metainfo['info_version'] = '1.1'
+
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def update_sunrgbd_infos(pkl_path, out_dir):
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print(f'Warning, you may overwriting '
+              f'the original data {pkl_path}.')
+        time.sleep(5)
+    METAINFO = {
+        'classes': ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk',
+                    'dresser', 'night_stand', 'bookshelf', 'bathtub')
+    }
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    print('Start updating:')
+    converted_list = []
+    for ori_info_dict in mmengine.track_iter_progress(data_list):
+        temp_data_info = get_empty_standard_data_info()
+        temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict[
+            'point_cloud']['num_features']
+        temp_data_info['lidar_points']['lidar_path'] = Path(
+            ori_info_dict['pts_path']).name
+        calib = ori_info_dict['calib']
+        rt_mat = calib['Rt']
+        # follow Coord3DMode.convert_point
+        rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]
+                           ]) @ rt_mat.transpose(1, 0)
+        depth2img = calib['K'] @ rt_mat
+        temp_data_info['images']['CAM0']['depth2img'] = depth2img.tolist()
+        temp_data_info['images']['CAM0']['img_path'] = Path(
+            ori_info_dict['image']['image_path']).name
+        h, w = ori_info_dict['image']['image_shape']
+        temp_data_info['images']['CAM0']['height'] = h
+        temp_data_info['images']['CAM0']['width'] = w
+
+        anns = ori_info_dict.get('annos', None)
+        if anns is not None:
+            if anns['gt_num'] == 0:
+                instance_list = []
+            else:
+                num_instances = len(anns['name'])
+                ignore_class_name = set()
+                instance_list = []
+                for instance_id in range(num_instances):
+                    empty_instance = get_empty_instance()
+                    empty_instance['bbox_3d'] = anns['gt_boxes_upright_depth'][
+                        instance_id].tolist()
+                    empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+                    if anns['name'][instance_id] in METAINFO['classes']:
+                        empty_instance['bbox_label_3d'] = METAINFO[
+                            'classes'].index(anns['name'][instance_id])
+                        empty_instance['bbox_label'] = empty_instance[
+                            'bbox_label_3d']
+                    else:
+                        ignore_class_name.add(anns['name'][instance_id])
+                        empty_instance['bbox_label_3d'] = -1
+                        empty_instance['bbox_label'] = -1
+                    empty_instance = clear_instance_unused_keys(empty_instance)
+                    instance_list.append(empty_instance)
+            temp_data_info['instances'] = instance_list
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
+    print(f'ignore classes: {ignore_class_name}')
+
+    # dataset metainfo
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 'sunrgbd'
+    metainfo['info_version'] = '1.1'
+
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def update_lyft_infos(pkl_path, out_dir):
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print(f'Warning, you may overwriting '
+              f'the original data {pkl_path}.')
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    METAINFO = {
+        'classes':
+        ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
+         'motorcycle', 'bicycle', 'pedestrian', 'animal'),
+    }
+    print('Start updating:')
+    converted_list = []
+    for i, ori_info_dict in enumerate(
+            mmengine.track_iter_progress(data_list['infos'])):
+        temp_data_info = get_empty_standard_data_info()
+        temp_data_info['sample_idx'] = i
+        temp_data_info['token'] = ori_info_dict['token']
+        temp_data_info['ego2global'] = convert_quaternion_to_matrix(
+            ori_info_dict['ego2global_rotation'],
+            ori_info_dict['ego2global_translation'])
+        temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict.get(
+            'num_features', 5)
+        temp_data_info['lidar_points']['lidar_path'] = Path(
+            ori_info_dict['lidar_path']).name
+        temp_data_info['lidar_points'][
+            'lidar2ego'] = convert_quaternion_to_matrix(
+                ori_info_dict['lidar2ego_rotation'],
+                ori_info_dict['lidar2ego_translation'])
+        # bc-breaking: Timestamp has divided 1e6 in pkl infos.
+        temp_data_info['timestamp'] = ori_info_dict['timestamp'] / 1e6
+        for ori_sweep in ori_info_dict['sweeps']:
+            temp_lidar_sweep = get_single_lidar_sweep()
+            temp_lidar_sweep['lidar_points'][
+                'lidar2ego'] = convert_quaternion_to_matrix(
+                    ori_sweep['sensor2ego_rotation'],
+                    ori_sweep['sensor2ego_translation'])
+            temp_lidar_sweep['ego2global'] = convert_quaternion_to_matrix(
+                ori_sweep['ego2global_rotation'],
+                ori_sweep['ego2global_translation'])
+            lidar2sensor = np.eye(4)
+            rot = ori_sweep['sensor2lidar_rotation']
+            trans = ori_sweep['sensor2lidar_translation']
+            lidar2sensor[:3, :3] = rot.T
+            lidar2sensor[:3, 3:4] = -1 * np.matmul(rot.T, trans.reshape(3, 1))
+            temp_lidar_sweep['lidar_points'][
+                'lidar2sensor'] = lidar2sensor.astype(np.float32).tolist()
+            # bc-breaking: Timestamp has divided 1e6 in pkl infos.
+            temp_lidar_sweep['timestamp'] = ori_sweep['timestamp'] / 1e6
+            temp_lidar_sweep['lidar_points']['lidar_path'] = ori_sweep[
+                'data_path']
+            temp_lidar_sweep['sample_data_token'] = ori_sweep[
+                'sample_data_token']
+            temp_data_info['lidar_sweeps'].append(temp_lidar_sweep)
+        temp_data_info['images'] = {}
+        for cam in ori_info_dict['cams']:
+            empty_img_info = get_empty_img_info()
+            empty_img_info['img_path'] = Path(
+                ori_info_dict['cams'][cam]['data_path']).name
+            empty_img_info['cam2img'] = ori_info_dict['cams'][cam][
+                'cam_intrinsic'].tolist()
+            empty_img_info['sample_data_token'] = ori_info_dict['cams'][cam][
+                'sample_data_token']
+            empty_img_info[
+                'timestamp'] = ori_info_dict['cams'][cam]['timestamp'] / 1e6
+            empty_img_info['cam2ego'] = convert_quaternion_to_matrix(
+                ori_info_dict['cams'][cam]['sensor2ego_rotation'],
+                ori_info_dict['cams'][cam]['sensor2ego_translation'])
+            lidar2sensor = np.eye(4)
+            rot = ori_info_dict['cams'][cam]['sensor2lidar_rotation']
+            trans = ori_info_dict['cams'][cam]['sensor2lidar_translation']
+            lidar2sensor[:3, :3] = rot.T
+            lidar2sensor[:3, 3:4] = -1 * np.matmul(rot.T, trans.reshape(3, 1))
+            empty_img_info['lidar2cam'] = lidar2sensor.astype(
+                np.float32).tolist()
+            temp_data_info['images'][cam] = empty_img_info
+        ignore_class_name = set()
+        if 'gt_boxes' in ori_info_dict:
+            num_instances = ori_info_dict['gt_boxes'].shape[0]
+            for i in range(num_instances):
+                empty_instance = get_empty_instance()
+                empty_instance['bbox_3d'] = ori_info_dict['gt_boxes'][
+                    i, :].tolist()
+                if ori_info_dict['gt_names'][i] in METAINFO['classes']:
+                    empty_instance['bbox_label'] = METAINFO['classes'].index(
+                        ori_info_dict['gt_names'][i])
+                else:
+                    ignore_class_name.add(ori_info_dict['gt_names'][i])
+                    empty_instance['bbox_label'] = -1
+                empty_instance['bbox_label_3d'] = copy.deepcopy(
+                    empty_instance['bbox_label'])
+                empty_instance = clear_instance_unused_keys(empty_instance)
+                temp_data_info['instances'].append(empty_instance)
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
+    print(f'ignore classes: {ignore_class_name}')
+
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 'lyft'
+    metainfo['version'] = data_list['metadata']['version']
+    metainfo['info_version'] = '1.1'
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def update_waymo_infos(pkl_path, out_dir):
+    # the input pkl is based on the
+    # pkl generated in the waymo cam only challenage.
+    camera_types = [
+        'CAM_FRONT',
+        'CAM_FRONT_LEFT',
+        'CAM_FRONT_RIGHT',
+        'CAM_SIDE_LEFT',
+        'CAM_SIDE_RIGHT',
+    ]
+    print(f'{pkl_path} will be modified.')
+    if out_dir in pkl_path:
+        print(f'Warning, you may overwriting '
+              f'the original data {pkl_path}.')
+        time.sleep(5)
+    # TODO update to full label
+    # TODO discuss how to process 'Van', 'DontCare'
+    METAINFO = {
+        'classes': ('Car', 'Pedestrian', 'Cyclist', 'Sign'),
+    }
+    print(f'Reading from input file: {pkl_path}.')
+    data_list = mmengine.load(pkl_path)
+    print('Start updating:')
+    converted_list = []
+    for ori_info_dict in mmengine.track_iter_progress(data_list):
+        temp_data_info = get_empty_standard_data_info(camera_types)
+
+        if 'plane' in ori_info_dict:
+            temp_data_info['plane'] = ori_info_dict['plane']
+        temp_data_info['sample_idx'] = ori_info_dict['image']['image_idx']
+
+        # calib matrix
+        for cam_idx, cam_key in enumerate(camera_types):
+            temp_data_info['images'][cam_key]['cam2img'] =\
+                 ori_info_dict['calib'][f'P{cam_idx}'].tolist()
+
+        for cam_idx, cam_key in enumerate(camera_types):
+            rect = ori_info_dict['calib']['R0_rect'].astype(np.float32)
+            velo_to_cam = 'Tr_velo_to_cam'
+            if cam_idx != 0:
+                velo_to_cam += str(cam_idx)
+            Trv2c = ori_info_dict['calib'][velo_to_cam].astype(np.float32)
+
+            lidar2cam = rect @ Trv2c
+            temp_data_info['images'][cam_key]['lidar2cam'] = lidar2cam.tolist()
+            temp_data_info['images'][cam_key]['lidar2img'] = (
+                ori_info_dict['calib'][f'P{cam_idx}'] @ lidar2cam).tolist()
+
+        # image path
+        base_img_path = Path(ori_info_dict['image']['image_path']).name
+
+        for cam_idx, cam_key in enumerate(camera_types):
+            temp_data_info['images'][cam_key]['timestamp'] = ori_info_dict[
+                'timestamp']
+            temp_data_info['images'][cam_key]['img_path'] = base_img_path
+
+        h, w = ori_info_dict['image']['image_shape']
+
+        # for potential usage
+        temp_data_info['images'][camera_types[0]]['height'] = h
+        temp_data_info['images'][camera_types[0]]['width'] = w
+        temp_data_info['lidar_points']['num_pts_feats'] = ori_info_dict[
+            'point_cloud']['num_features']
+        temp_data_info['lidar_points']['timestamp'] = ori_info_dict[
+            'timestamp']
+        velo_path = ori_info_dict['point_cloud'].get('velodyne_path')
+        if velo_path is not None:
+            temp_data_info['lidar_points']['lidar_path'] = Path(velo_path).name
+
+        # TODO discuss the usage of Tr_velo_to_cam in lidar
+        Trv2c = ori_info_dict['calib']['Tr_velo_to_cam'].astype(np.float32)
+
+        temp_data_info['lidar_points']['Tr_velo_to_cam'] = Trv2c.tolist()
+
+        # for potential usage
+        # temp_data_info['images']['R0_rect'] = ori_info_dict['calib'][
+        #     'R0_rect'].astype(np.float32).tolist()
+
+        # for the sweeps part:
+        temp_data_info['timestamp'] = ori_info_dict['timestamp']
+        temp_data_info['ego2global'] = ori_info_dict['pose']
+
+        for ori_sweep in ori_info_dict['sweeps']:
+            # lidar sweeps
+            lidar_sweep = get_single_lidar_sweep()
+            lidar_sweep['ego2global'] = ori_sweep['pose']
+            lidar_sweep['timestamp'] = ori_sweep['timestamp']
+            lidar_sweep['lidar_points']['lidar_path'] = Path(
+                ori_sweep['velodyne_path']).name
+            # image sweeps
+            image_sweep = get_single_image_sweep(camera_types)
+            image_sweep['ego2global'] = ori_sweep['pose']
+            image_sweep['timestamp'] = ori_sweep['timestamp']
+            img_path = Path(ori_sweep['image_path']).name
+            for cam_idx, cam_key in enumerate(camera_types):
+                image_sweep['images'][cam_key]['img_path'] = img_path
+
+            temp_data_info['lidar_sweeps'].append(lidar_sweep)
+            temp_data_info['image_sweeps'].append(image_sweep)
+
+        anns = ori_info_dict.get('annos', None)
+        ignore_class_name = set()
+        if anns is not None:
+            num_instances = len(anns['name'])
+
+            instance_list = []
+            for instance_id in range(num_instances):
+                empty_instance = get_empty_instance()
+                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+
+                if anns['name'][instance_id] in METAINFO['classes']:
+                    empty_instance['bbox_label'] = METAINFO['classes'].index(
+                        anns['name'][instance_id])
+                else:
+                    ignore_class_name.add(anns['name'][instance_id])
+                    empty_instance['bbox_label'] = -1
+
+                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+
+                loc = anns['location'][instance_id]
+                dims = anns['dimensions'][instance_id]
+                rots = anns['rotation_y'][:, None][instance_id]
+                gt_bboxes_3d = np.concatenate([loc, dims, rots
+                                               ]).astype(np.float32).tolist()
+                empty_instance['bbox_3d'] = gt_bboxes_3d
+                empty_instance['bbox_label_3d'] = copy.deepcopy(
+                    empty_instance['bbox_label'])
+                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+                empty_instance['truncated'] = int(
+                    anns['truncated'][instance_id].tolist())
+                empty_instance['occluded'] = anns['occluded'][
+                    instance_id].tolist()
+                empty_instance['alpha'] = anns['alpha'][instance_id].tolist()
+                empty_instance['index'] = anns['index'][instance_id].tolist()
+                empty_instance['group_id'] = anns['group_ids'][
+                    instance_id].tolist()
+                empty_instance['difficulty'] = anns['difficulty'][
+                    instance_id].tolist()
+                empty_instance['num_lidar_pts'] = anns['num_points_in_gt'][
+                    instance_id].tolist()
+                empty_instance['camera_id'] = anns['camera_id'][
+                    instance_id].tolist()
+                empty_instance = clear_instance_unused_keys(empty_instance)
+                instance_list.append(empty_instance)
+            temp_data_info['instances'] = instance_list
+
+        # waymo provide the labels that sync with cam
+        anns = ori_info_dict.get('cam_sync_annos', None)
+        ignore_class_name = set()
+        if anns is not None:
+            num_instances = len(anns['name'])
+            instance_list = []
+            for instance_id in range(num_instances):
+                empty_instance = get_empty_instance()
+                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+
+                if anns['name'][instance_id] in METAINFO['classes']:
+                    empty_instance['bbox_label'] = METAINFO['classes'].index(
+                        anns['name'][instance_id])
+                else:
+                    ignore_class_name.add(anns['name'][instance_id])
+                    empty_instance['bbox_label'] = -1
+
+                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+
+                loc = anns['location'][instance_id]
+                dims = anns['dimensions'][instance_id]
+                rots = anns['rotation_y'][:, None][instance_id]
+                gt_bboxes_3d = np.concatenate([loc, dims, rots
+                                               ]).astype(np.float32).tolist()
+                empty_instance['bbox_3d'] = gt_bboxes_3d
+                empty_instance['bbox_label_3d'] = copy.deepcopy(
+                    empty_instance['bbox_label'])
+                empty_instance['bbox'] = anns['bbox'][instance_id].tolist()
+                empty_instance['truncated'] = int(
+                    anns['truncated'][instance_id].tolist())
+                empty_instance['occluded'] = anns['occluded'][
+                    instance_id].tolist()
+                empty_instance['alpha'] = anns['alpha'][instance_id].tolist()
+                empty_instance['index'] = anns['index'][instance_id].tolist()
+                empty_instance['group_id'] = anns['group_ids'][
+                    instance_id].tolist()
+                empty_instance['camera_id'] = anns['camera_id'][
+                    instance_id].tolist()
+                empty_instance = clear_instance_unused_keys(empty_instance)
+                instance_list.append(empty_instance)
+            temp_data_info['cam_sync_instances'] = instance_list
+
+            cam_instances = generate_waymo_camera_instances(
+                ori_info_dict, camera_types)
+            temp_data_info['cam_instances'] = cam_instances
+
+        temp_data_info, _ = clear_data_info_unused_keys(temp_data_info)
+        converted_list.append(temp_data_info)
+    pkl_name = Path(pkl_path).name
+    out_path = osp.join(out_dir, pkl_name)
+    print(f'Writing to output file: {out_path}.')
+    print(f'ignore classes: {ignore_class_name}')
+
+    # dataset metainfo
+    metainfo = dict()
+    metainfo['categories'] = {k: i for i, k in enumerate(METAINFO['classes'])}
+    if ignore_class_name:
+        for ignore_class in ignore_class_name:
+            metainfo['categories'][ignore_class] = -1
+    metainfo['dataset'] = 'waymo'
+    metainfo['version'] = '1.4'
+    metainfo['info_version'] = '1.1'
+
+    converted_data_info = dict(metainfo=metainfo, data_list=converted_list)
+
+    mmengine.dump(converted_data_info, out_path, 'pkl')
+
+
+def generate_kitti_camera_instances(ori_info_dict):
+
+    cam_key = 'CAM2'
+    empty_camera_instances = get_empty_multicamera_instances([cam_key])
+    annos = copy.deepcopy(ori_info_dict['annos'])
+    ann_infos = get_kitti_style_2d_boxes(
+        ori_info_dict, occluded=[0, 1, 2, 3], annos=annos)
+    empty_camera_instances[cam_key] = ann_infos
+
+    return empty_camera_instances
+
+
+def generate_waymo_camera_instances(ori_info_dict, cam_keys):
+
+    empty_multicamera_instances = get_empty_multicamera_instances(cam_keys)
+
+    for cam_idx, cam_key in enumerate(cam_keys):
+        annos = copy.deepcopy(ori_info_dict['cam_sync_annos'])
+        if cam_idx != 0:
+            annos = convert_annos(ori_info_dict, cam_idx)
+
+        ann_infos = get_kitti_style_2d_boxes(
+            ori_info_dict, cam_idx, occluded=[0], annos=annos, dataset='waymo')
+
+        empty_multicamera_instances[cam_key] = ann_infos
+    return empty_multicamera_instances
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Arg parser for data coords '
+                                     'update due to coords sys refactor.')
+    parser.add_argument(
+        '--dataset', type=str, default='kitti', help='name of dataset')
+    parser.add_argument(
+        '--pkl-path',
+        type=str,
+        default='./data/kitti/kitti_infos_train.pkl ',
+        help='specify the root dir of dataset')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='converted_annotations',
+        required=False,
+        help='output direction of info pkl')
+    args = parser.parse_args()
+    return args
+
+
+def update_pkl_infos(dataset, out_dir, pkl_path):
+    if dataset.lower() == 'kitti':
+        update_kitti_infos(pkl_path=pkl_path, out_dir=out_dir)
+    elif dataset.lower() == 'waymo':
+        update_waymo_infos(pkl_path=pkl_path, out_dir=out_dir)
+    elif dataset.lower() == 'scannet':
+        update_scannet_infos(pkl_path=pkl_path, out_dir=out_dir)
+    elif dataset.lower() == 'sunrgbd':
+        update_sunrgbd_infos(pkl_path=pkl_path, out_dir=out_dir)
+    elif dataset.lower() == 'lyft':
+        update_lyft_infos(pkl_path=pkl_path, out_dir=out_dir)
+    elif dataset.lower() == 'nuscenes':
+        update_nuscenes_infos(pkl_path=pkl_path, out_dir=out_dir)
+    elif dataset.lower() == 's3dis':
+        update_s3dis_infos(pkl_path=pkl_path, out_dir=out_dir)
+    else:
+        raise NotImplementedError(f'Do not support convert {dataset} to v2.')
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    if args.out_dir is None:
+        args.out_dir = args.root_dir
+    update_pkl_infos(
+        dataset=args.dataset, out_dir=args.out_dir, pkl_path=args.pkl_path)
diff --git a/mmde/tools/dataset_converters/waymo_converter.py b/mmde/tools/dataset_converters/waymo_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..e383c238df8dbf7db83d293369157d6f9dd74bba
--- /dev/null
+++ b/mmde/tools/dataset_converters/waymo_converter.py
@@ -0,0 +1,723 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+r"""Adapted from `Waymo to KITTI converter
+    <https://github.com/caizhongang/waymo_kitti_converter>`_.
+"""
+
+try:
+    from waymo_open_dataset import dataset_pb2
+except ImportError:
+    raise ImportError('Please run "pip install waymo-open-dataset-tf-2-6-0" '
+                      '>1.4.5 to install the official devkit first.')
+
+import copy
+import os
+import os.path as osp
+from glob import glob
+from io import BytesIO
+from os.path import exists, join
+
+import mmengine
+import numpy as np
+import tensorflow as tf
+from mmengine import print_log
+from nuscenes.utils.geometry_utils import view_points
+from PIL import Image
+from waymo_open_dataset.utils import range_image_utils, transform_utils
+from waymo_open_dataset.utils.frame_utils import \
+    parse_range_image_and_camera_projection
+
+from mmdet3d.datasets.convert_utils import post_process_coords
+from mmdet3d.structures import Box3DMode, LiDARInstance3DBoxes, points_cam2img
+
+
+class Waymo2KITTI(object):
+    """Waymo to KITTI converter. There are 2 steps as follows:
+
+    Step 1. Extract camera images and lidar point clouds from waymo raw data in
+        '*.tfreord' and save as kitti format.
+    Step 2. Generate waymo train/val/test infos and save as pickle file.
+
+    Args:
+        load_dir (str): Directory to load waymo raw data.
+        save_dir (str): Directory to save data in KITTI format.
+        prefix (str): Prefix of filename. In general, 0 for training, 1 for
+            validation and 2 for testing.
+        workers (int, optional): Number of workers for the parallel process.
+            Defaults to 64.
+        test_mode (bool, optional): Whether in the test_mode.
+            Defaults to False.
+        save_senor_data (bool, optional): Whether to save image and lidar
+            data. Defaults to True.
+        save_cam_sync_instances (bool, optional): Whether to save cam sync
+            instances. Defaults to True.
+        save_cam_instances (bool, optional): Whether to save cam instances.
+            Defaults to False.
+        info_prefix (str, optional): Prefix of info filename.
+            Defaults to 'waymo'.
+        max_sweeps (int, optional): Max length of sweeps. Defaults to 10.
+        split (str, optional): Split of the data. Defaults to 'training'.
+    """
+
+    def __init__(self,
+                 load_dir,
+                 save_dir,
+                 prefix,
+                 workers=64,
+                 test_mode=False,
+                 save_senor_data=True,
+                 save_cam_sync_instances=True,
+                 save_cam_instances=True,
+                 info_prefix='waymo',
+                 max_sweeps=10,
+                 split='training'):
+        # turn on eager execution for older tensorflow versions
+        if int(tf.__version__.split('.')[0]) < 2:
+            tf.enable_eager_execution()
+
+        # keep the order defined by the official protocol
+        self.cam_list = [
+            '_FRONT',
+            '_FRONT_LEFT',
+            '_FRONT_RIGHT',
+            '_SIDE_LEFT',
+            '_SIDE_RIGHT',
+        ]
+        self.lidar_list = ['TOP', 'FRONT', 'SIDE_LEFT', 'SIDE_RIGHT', 'REAR']
+        self.type_list = [
+            'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST'
+        ]
+
+        # MMDetection3D unified camera keys & class names
+        self.camera_types = [
+            'CAM_FRONT',
+            'CAM_FRONT_LEFT',
+            'CAM_FRONT_RIGHT',
+            'CAM_SIDE_LEFT',
+            'CAM_SIDE_RIGHT',
+        ]
+        self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST']
+        self.info_map = {
+            'training': '_infos_train.pkl',
+            'validation': '_infos_val.pkl',
+            'testing': '_infos_test.pkl',
+            'testing_3d_camera_only_detection': '_infos_test_cam_only.pkl'
+        }
+
+        self.load_dir = load_dir
+        self.save_dir = save_dir
+        self.prefix = prefix
+        self.workers = int(workers)
+        self.test_mode = test_mode
+        self.save_senor_data = save_senor_data
+        self.save_cam_sync_instances = save_cam_sync_instances
+        self.save_cam_instances = save_cam_instances
+        self.info_prefix = info_prefix
+        self.max_sweeps = max_sweeps
+        self.split = split
+
+        # TODO: Discuss filter_empty_3dboxes and filter_no_label_zone_points
+        self.filter_empty_3dboxes = True
+        self.filter_no_label_zone_points = True
+        self.save_track_id = False
+
+        self.tfrecord_pathnames = sorted(
+            glob(join(self.load_dir, '*.tfrecord')))
+
+        self.image_save_dir = f'{self.save_dir}/image_'
+        self.point_cloud_save_dir = f'{self.save_dir}/velodyne'
+
+        # Create folder for saving KITTI format camera images and
+        # lidar point clouds.
+        if 'testing_3d_camera_only_detection' not in self.load_dir:
+            mmengine.mkdir_or_exist(self.point_cloud_save_dir)
+        for i in range(5):
+            mmengine.mkdir_or_exist(f'{self.image_save_dir}{str(i)}')
+
+    def convert(self):
+        """Convert action."""
+        print_log(f'Start converting {self.split} dataset', logger='current')
+        if self.workers == 0:
+            data_infos = mmengine.track_progress(self.convert_one,
+                                                 range(len(self)))
+        else:
+            data_infos = mmengine.track_parallel_progress(
+                self.convert_one, range(len(self)), self.workers)
+        data_list = []
+        for data_info in data_infos:
+            data_list.extend(data_info)
+        metainfo = dict()
+        metainfo['dataset'] = 'waymo'
+        metainfo['version'] = 'waymo_v1.4'
+        metainfo['info_version'] = 'mmdet3d_v1.4'
+        waymo_infos = dict(data_list=data_list, metainfo=metainfo)
+        filenames = osp.join(
+            osp.dirname(self.save_dir),
+            f'{self.info_prefix + self.info_map[self.split]}')
+        print_log(f'Saving {self.split} dataset infos into {filenames}')
+        mmengine.dump(waymo_infos, filenames)
+
+    def convert_one(self, file_idx):
+        """Convert one '*.tfrecord' file to kitti format. Each file stores all
+        the frames (about 200 frames) in current scene. We treat each frame as
+        a sample, save their images and point clouds in kitti format, and then
+        create info for all frames.
+
+        Args:
+            file_idx (int): Index of the file to be converted.
+
+        Returns:
+            List[dict]: Waymo infos for all frames in current file.
+        """
+        pathname = self.tfrecord_pathnames[file_idx]
+        dataset = tf.data.TFRecordDataset(pathname, compression_type='')
+
+        # NOTE: file_infos is not shared between processes, only stores frame
+        # infos within the current file.
+        file_infos = []
+        for frame_idx, data in enumerate(dataset):
+
+            frame = dataset_pb2.Frame()
+            frame.ParseFromString(bytearray(data.numpy()))
+
+            # Step 1. Extract camera images and lidar point clouds from waymo
+            # raw data in '*.tfreord' and save as kitti format.
+            if self.save_senor_data:
+                self.save_image(frame, file_idx, frame_idx)
+                self.save_lidar(frame, file_idx, frame_idx)
+
+            # Step 2. Generate waymo train/val/test infos and save as pkl file.
+            # TODO save the depth image for waymo challenge solution.
+            self.create_waymo_info_file(frame, file_idx, frame_idx, file_infos)
+        return file_infos
+
+    def __len__(self):
+        """Length of the filename list."""
+        return len(self.tfrecord_pathnames)
+
+    def save_image(self, frame, file_idx, frame_idx):
+        """Parse and save the images in jpg format.
+
+        Args:
+            frame (:obj:`Frame`): Open dataset frame proto.
+            file_idx (int): Current file index.
+            frame_idx (int): Current frame index.
+        """
+        for img in frame.images:
+            img_path = f'{self.image_save_dir}{str(img.name - 1)}/' + \
+                f'{self.prefix}{str(file_idx).zfill(3)}' + \
+                f'{str(frame_idx).zfill(3)}.jpg'
+            with open(img_path, 'wb') as fp:
+                fp.write(img.image)
+
+    def save_lidar(self, frame, file_idx, frame_idx):
+        """Parse and save the lidar data in psd format.
+
+        Args:
+            frame (:obj:`Frame`): Open dataset frame proto.
+            file_idx (int): Current file index.
+            frame_idx (int): Current frame index.
+        """
+        range_images, camera_projections, seg_labels, range_image_top_pose = \
+            parse_range_image_and_camera_projection(frame)
+
+        if range_image_top_pose is None:
+            # the camera only split doesn't contain lidar points.
+            return
+        # First return
+        points_0, cp_points_0, intensity_0, elongation_0, mask_indices_0 = \
+            self.convert_range_image_to_point_cloud(
+                frame,
+                range_images,
+                camera_projections,
+                range_image_top_pose,
+                ri_index=0
+            )
+        points_0 = np.concatenate(points_0, axis=0)
+        intensity_0 = np.concatenate(intensity_0, axis=0)
+        elongation_0 = np.concatenate(elongation_0, axis=0)
+        mask_indices_0 = np.concatenate(mask_indices_0, axis=0)
+
+        # Second return
+        points_1, cp_points_1, intensity_1, elongation_1, mask_indices_1 = \
+            self.convert_range_image_to_point_cloud(
+                frame,
+                range_images,
+                camera_projections,
+                range_image_top_pose,
+                ri_index=1
+            )
+        points_1 = np.concatenate(points_1, axis=0)
+        intensity_1 = np.concatenate(intensity_1, axis=0)
+        elongation_1 = np.concatenate(elongation_1, axis=0)
+        mask_indices_1 = np.concatenate(mask_indices_1, axis=0)
+
+        points = np.concatenate([points_0, points_1], axis=0)
+        intensity = np.concatenate([intensity_0, intensity_1], axis=0)
+        elongation = np.concatenate([elongation_0, elongation_1], axis=0)
+        mask_indices = np.concatenate([mask_indices_0, mask_indices_1], axis=0)
+
+        # timestamp = frame.timestamp_micros * np.ones_like(intensity)
+
+        # concatenate x,y,z, intensity, elongation, timestamp (6-dim)
+        point_cloud = np.column_stack(
+            (points, intensity, elongation, mask_indices))
+
+        pc_path = f'{self.point_cloud_save_dir}/{self.prefix}' + \
+            f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin'
+        point_cloud.astype(np.float32).tofile(pc_path)
+
+    def convert_range_image_to_point_cloud(self,
+                                           frame,
+                                           range_images,
+                                           camera_projections,
+                                           range_image_top_pose,
+                                           ri_index=0):
+        """Convert range images to point cloud.
+
+        Args:
+            frame (:obj:`Frame`): Open dataset frame.
+            range_images (dict): Mapping from laser_name to list of two
+                range images corresponding with two returns.
+            camera_projections (dict): Mapping from laser_name to list of two
+                camera projections corresponding with two returns.
+            range_image_top_pose (:obj:`Transform`): Range image pixel pose for
+                top lidar.
+            ri_index (int, optional): 0 for the first return,
+                1 for the second return. Default: 0.
+
+        Returns:
+            tuple[list[np.ndarray]]: (List of points with shape [N, 3],
+                camera projections of points with shape [N, 6], intensity
+                with shape [N, 1], elongation with shape [N, 1], points'
+                position in the depth map (element offset if points come from
+                the main lidar otherwise -1) with shape[N, 1]). All the
+                lists have the length of lidar numbers (5).
+        """
+        calibrations = sorted(
+            frame.context.laser_calibrations, key=lambda c: c.name)
+        points = []
+        cp_points = []
+        intensity = []
+        elongation = []
+        mask_indices = []
+
+        frame_pose = tf.convert_to_tensor(
+            value=np.reshape(np.array(frame.pose.transform), [4, 4]))
+        # [H, W, 6]
+        range_image_top_pose_tensor = tf.reshape(
+            tf.convert_to_tensor(value=range_image_top_pose.data),
+            range_image_top_pose.shape.dims)
+        # [H, W, 3, 3]
+        range_image_top_pose_tensor_rotation = \
+            transform_utils.get_rotation_matrix(
+                range_image_top_pose_tensor[..., 0],
+                range_image_top_pose_tensor[..., 1],
+                range_image_top_pose_tensor[..., 2])
+        range_image_top_pose_tensor_translation = \
+            range_image_top_pose_tensor[..., 3:]
+        range_image_top_pose_tensor = transform_utils.get_transform(
+            range_image_top_pose_tensor_rotation,
+            range_image_top_pose_tensor_translation)
+        for c in calibrations:
+            range_image = range_images[c.name][ri_index]
+            if len(c.beam_inclinations) == 0:
+                beam_inclinations = range_image_utils.compute_inclination(
+                    tf.constant(
+                        [c.beam_inclination_min, c.beam_inclination_max]),
+                    height=range_image.shape.dims[0])
+            else:
+                beam_inclinations = tf.constant(c.beam_inclinations)
+
+            beam_inclinations = tf.reverse(beam_inclinations, axis=[-1])
+            extrinsic = np.reshape(np.array(c.extrinsic.transform), [4, 4])
+
+            range_image_tensor = tf.reshape(
+                tf.convert_to_tensor(value=range_image.data),
+                range_image.shape.dims)
+            pixel_pose_local = None
+            frame_pose_local = None
+            if c.name == dataset_pb2.LaserName.TOP:
+                pixel_pose_local = range_image_top_pose_tensor
+                pixel_pose_local = tf.expand_dims(pixel_pose_local, axis=0)
+                frame_pose_local = tf.expand_dims(frame_pose, axis=0)
+            range_image_mask = range_image_tensor[..., 0] > 0
+
+            if self.filter_no_label_zone_points:
+                nlz_mask = range_image_tensor[..., 3] != 1.0  # 1.0: in NLZ
+                range_image_mask = range_image_mask & nlz_mask
+
+            range_image_cartesian = \
+                range_image_utils.extract_point_cloud_from_range_image(
+                    tf.expand_dims(range_image_tensor[..., 0], axis=0),
+                    tf.expand_dims(extrinsic, axis=0),
+                    tf.expand_dims(tf.convert_to_tensor(
+                        value=beam_inclinations), axis=0),
+                    pixel_pose=pixel_pose_local,
+                    frame_pose=frame_pose_local)
+
+            mask_index = tf.where(range_image_mask)
+
+            range_image_cartesian = tf.squeeze(range_image_cartesian, axis=0)
+            points_tensor = tf.gather_nd(range_image_cartesian, mask_index)
+
+            cp = camera_projections[c.name][ri_index]
+            cp_tensor = tf.reshape(
+                tf.convert_to_tensor(value=cp.data), cp.shape.dims)
+            cp_points_tensor = tf.gather_nd(cp_tensor, mask_index)
+            points.append(points_tensor.numpy())
+            cp_points.append(cp_points_tensor.numpy())
+
+            intensity_tensor = tf.gather_nd(range_image_tensor[..., 1],
+                                            mask_index)
+            intensity.append(intensity_tensor.numpy())
+
+            elongation_tensor = tf.gather_nd(range_image_tensor[..., 2],
+                                             mask_index)
+            elongation.append(elongation_tensor.numpy())
+            if c.name == 1:
+                mask_index = (ri_index * range_image_mask.shape[0] +
+                              mask_index[:, 0]
+                              ) * range_image_mask.shape[1] + mask_index[:, 1]
+                mask_index = mask_index.numpy().astype(elongation[-1].dtype)
+            else:
+                mask_index = np.full_like(elongation[-1], -1)
+
+            mask_indices.append(mask_index)
+
+        return points, cp_points, intensity, elongation, mask_indices
+
+    def cart_to_homo(self, mat):
+        """Convert transformation matrix in Cartesian coordinates to
+        homogeneous format.
+
+        Args:
+            mat (np.ndarray): Transformation matrix in Cartesian.
+                The input matrix shape is 3x3 or 3x4.
+
+        Returns:
+            np.ndarray: Transformation matrix in homogeneous format.
+                The matrix shape is 4x4.
+        """
+        ret = np.eye(4)
+        if mat.shape == (3, 3):
+            ret[:3, :3] = mat
+        elif mat.shape == (3, 4):
+            ret[:3, :] = mat
+        else:
+            raise ValueError(mat.shape)
+        return ret
+
+    def create_waymo_info_file(self, frame, file_idx, frame_idx, file_infos):
+        r"""Generate waymo train/val/test infos.
+
+        For more details about infos, please refer to:
+        https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html
+        """  # noqa: E501
+        frame_infos = dict()
+
+        # Gather frame infos
+        sample_idx = \
+            f'{self.prefix}{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}'
+        frame_infos['sample_idx'] = int(sample_idx)
+        frame_infos['timestamp'] = frame.timestamp_micros
+        frame_infos['ego2global'] = np.array(frame.pose.transform).reshape(
+            4, 4).astype(np.float32).tolist()
+        frame_infos['context_name'] = frame.context.name
+
+        # Gather camera infos
+        frame_infos['images'] = dict()
+        # waymo front camera to kitti reference camera
+        T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0],
+                                       [1.0, 0.0, 0.0]])
+        camera_calibs = []
+        Tr_velo_to_cams = []
+        for camera in frame.context.camera_calibrations:
+            # extrinsic parameters
+            T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape(
+                4, 4)
+            T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle)
+            Tr_velo_to_cam = \
+                self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam
+            Tr_velo_to_cams.append(Tr_velo_to_cam)
+
+            # intrinsic parameters
+            camera_calib = np.zeros((3, 4))
+            camera_calib[0, 0] = camera.intrinsic[0]
+            camera_calib[1, 1] = camera.intrinsic[1]
+            camera_calib[0, 2] = camera.intrinsic[2]
+            camera_calib[1, 2] = camera.intrinsic[3]
+            camera_calib[2, 2] = 1
+            camera_calibs.append(camera_calib)
+
+        for i, (cam_key, camera_calib, Tr_velo_to_cam) in enumerate(
+                zip(self.camera_types, camera_calibs, Tr_velo_to_cams)):
+            cam_infos = dict()
+            cam_infos['img_path'] = str(sample_idx) + '.jpg'
+            # NOTE: frames.images order is different
+            for img in frame.images:
+                if img.name == i + 1:
+                    width, height = Image.open(BytesIO(img.image)).size
+            cam_infos['height'] = height
+            cam_infos['width'] = width
+            cam_infos['lidar2cam'] = Tr_velo_to_cam.astype(np.float32).tolist()
+            cam_infos['cam2img'] = camera_calib.astype(np.float32).tolist()
+            cam_infos['lidar2img'] = (camera_calib @ Tr_velo_to_cam).astype(
+                np.float32).tolist()
+            frame_infos['images'][cam_key] = cam_infos
+
+        # Gather lidar infos
+        lidar_infos = dict()
+        lidar_infos['lidar_path'] = str(sample_idx) + '.bin'
+        lidar_infos['num_pts_feats'] = 6
+        frame_infos['lidar_points'] = lidar_infos
+
+        # Gather lidar sweeps and camera sweeps infos
+        # TODO: Add lidar2img in image sweeps infos when we need it.
+        # TODO: Consider merging lidar sweeps infos and image sweeps infos.
+        lidar_sweeps_infos, image_sweeps_infos = [], []
+        for prev_offset in range(-1, -self.max_sweeps - 1, -1):
+            prev_lidar_infos = dict()
+            prev_image_infos = dict()
+            if frame_idx + prev_offset >= 0:
+                prev_frame_infos = file_infos[prev_offset]
+                prev_lidar_infos['timestamp'] = prev_frame_infos['timestamp']
+                prev_lidar_infos['ego2global'] = prev_frame_infos['ego2global']
+                prev_lidar_infos['lidar_points'] = dict()
+                lidar_path = prev_frame_infos['lidar_points']['lidar_path']
+                prev_lidar_infos['lidar_points']['lidar_path'] = lidar_path
+                lidar_sweeps_infos.append(prev_lidar_infos)
+
+                prev_image_infos['timestamp'] = prev_frame_infos['timestamp']
+                prev_image_infos['ego2global'] = prev_frame_infos['ego2global']
+                prev_image_infos['images'] = dict()
+                for cam_key in self.camera_types:
+                    prev_image_infos['images'][cam_key] = dict()
+                    img_path = prev_frame_infos['images'][cam_key]['img_path']
+                    prev_image_infos['images'][cam_key]['img_path'] = img_path
+                image_sweeps_infos.append(prev_image_infos)
+        if lidar_sweeps_infos:
+            frame_infos['lidar_sweeps'] = lidar_sweeps_infos
+        if image_sweeps_infos:
+            frame_infos['image_sweeps'] = image_sweeps_infos
+
+        if not self.test_mode:
+            # Gather instances infos which is used for lidar-based 3D detection
+            frame_infos['instances'] = self.gather_instance_info(frame)
+            # Gather cam_sync_instances infos which is used for image-based
+            # (multi-view) 3D detection.
+            if self.save_cam_sync_instances:
+                frame_infos['cam_sync_instances'] = self.gather_instance_info(
+                    frame, cam_sync=True)
+            # Gather cam_instances infos which is used for image-based
+            # (monocular) 3D detection (optional).
+            # TODO: Should we use cam_sync_instances to generate cam_instances?
+            if self.save_cam_instances:
+                frame_infos['cam_instances'] = self.gather_cam_instance_info(
+                    copy.deepcopy(frame_infos['instances']),
+                    frame_infos['images'])
+        file_infos.append(frame_infos)
+
+    def gather_instance_info(self, frame, cam_sync=False):
+        """Generate instances and cam_sync_instances infos.
+
+        For more details about infos, please refer to:
+        https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html
+        """  # noqa: E501
+        id_to_bbox = dict()
+        id_to_name = dict()
+        for labels in frame.projected_lidar_labels:
+            name = labels.name
+            for label in labels.labels:
+                # TODO: need a workaround as bbox may not belong to front cam
+                bbox = [
+                    label.box.center_x - label.box.length / 2,
+                    label.box.center_y - label.box.width / 2,
+                    label.box.center_x + label.box.length / 2,
+                    label.box.center_y + label.box.width / 2
+                ]
+                id_to_bbox[label.id] = bbox
+                id_to_name[label.id] = name - 1
+
+        group_id = 0
+        instance_infos = []
+        for obj in frame.laser_labels:
+            instance_info = dict()
+            bounding_box = None
+            name = None
+            id = obj.id
+            for proj_cam in self.cam_list:
+                if id + proj_cam in id_to_bbox:
+                    bounding_box = id_to_bbox.get(id + proj_cam)
+                    name = id_to_name.get(id + proj_cam)
+                    break
+
+            # NOTE: the 2D labels do not have strict correspondence with
+            # the projected 2D lidar labels
+            # e.g.: the projected 2D labels can be in camera 2
+            # while the most_visible_camera can have id 4
+            if cam_sync:
+                if obj.most_visible_camera_name:
+                    name = self.cam_list.index(
+                        f'_{obj.most_visible_camera_name}')
+                    box3d = obj.camera_synced_box
+                else:
+                    continue
+            else:
+                box3d = obj.box
+
+            if bounding_box is None or name is None:
+                name = 0
+                bounding_box = [0.0, 0.0, 0.0, 0.0]
+
+            my_type = self.type_list[obj.type]
+
+            if my_type not in self.selected_waymo_classes:
+                continue
+            else:
+                label = self.selected_waymo_classes.index(my_type)
+
+            if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1:
+                continue
+
+            group_id += 1
+            instance_info['group_id'] = group_id
+            instance_info['camera_id'] = name
+            instance_info['bbox'] = bounding_box
+            instance_info['bbox_label'] = label
+
+            height = box3d.height
+            width = box3d.width
+            length = box3d.length
+
+            # NOTE: We save the bottom center of 3D bboxes.
+            x = box3d.center_x
+            y = box3d.center_y
+            z = box3d.center_z - height / 2
+
+            rotation_y = box3d.heading
+
+            instance_info['bbox_3d'] = np.array(
+                [x, y, z, length, width, height,
+                 rotation_y]).astype(np.float32).tolist()
+            instance_info['bbox_label_3d'] = label
+            instance_info['num_lidar_pts'] = obj.num_lidar_points_in_box
+
+            if self.save_track_id:
+                instance_info['track_id'] = obj.id
+            instance_infos.append(instance_info)
+        return instance_infos
+
+    def gather_cam_instance_info(self, instances: dict, images: dict):
+        """Generate cam_instances infos.
+
+        For more details about infos, please refer to:
+        https://mmdetection3d.readthedocs.io/en/latest/advanced_guides/datasets/waymo.html
+        """  # noqa: E501
+        cam_instances = dict()
+        for cam_type in self.camera_types:
+            lidar2cam = np.array(images[cam_type]['lidar2cam'])
+            cam2img = np.array(images[cam_type]['cam2img'])
+            cam_instances[cam_type] = []
+            for instance in instances:
+                cam_instance = dict()
+                gt_bboxes_3d = np.array(instance['bbox_3d'])
+                # Convert lidar coordinates to camera coordinates
+                gt_bboxes_3d = LiDARInstance3DBoxes(
+                    gt_bboxes_3d[None, :]).convert_to(
+                        Box3DMode.CAM, lidar2cam, correct_yaw=True)
+                corners_3d = gt_bboxes_3d.corners.numpy()
+                corners_3d = corners_3d[0].T  # (1, 8, 3) -> (3, 8)
+                in_camera = np.argwhere(corners_3d[2, :] > 0).flatten()
+                corners_3d = corners_3d[:, in_camera]
+                # Project 3d box to 2d.
+                corner_coords = view_points(corners_3d, cam2img,
+                                            True).T[:, :2].tolist()
+
+                # Keep only corners that fall within the image.
+                # TODO: imsize should be determined by the current image size
+                # CAM_FRONT: (1920, 1280)
+                # CAM_FRONT_LEFT: (1920, 1280)
+                # CAM_SIDE_LEFT: (1920, 886)
+                final_coords = post_process_coords(
+                    corner_coords,
+                    imsize=(images['CAM_FRONT']['width'],
+                            images['CAM_FRONT']['height']))
+
+                # Skip if the convex hull of the re-projected corners
+                # does not intersect the image canvas.
+                if final_coords is None:
+                    continue
+                else:
+                    min_x, min_y, max_x, max_y = final_coords
+
+                cam_instance['bbox'] = [min_x, min_y, max_x, max_y]
+                cam_instance['bbox_label'] = instance['bbox_label']
+                cam_instance['bbox_3d'] = gt_bboxes_3d.numpy().squeeze(
+                ).astype(np.float32).tolist()
+                cam_instance['bbox_label_3d'] = instance['bbox_label_3d']
+
+                center_3d = gt_bboxes_3d.gravity_center.numpy()
+                center_2d_with_depth = points_cam2img(
+                    center_3d, cam2img, with_depth=True)
+                center_2d_with_depth = center_2d_with_depth.squeeze().tolist()
+
+                # normalized center2D + depth
+                # if samples with depth < 0 will be removed
+                if center_2d_with_depth[2] <= 0:
+                    continue
+                cam_instance['center_2d'] = center_2d_with_depth[:2]
+                cam_instance['depth'] = center_2d_with_depth[2]
+
+                # TODO: Discuss whether following info is necessary
+                cam_instance['bbox_3d_isvalid'] = True
+                cam_instance['velocity'] = -1
+                cam_instances[cam_type].append(cam_instance)
+
+        return cam_instances
+
+    def merge_trainval_infos(self):
+        """Merge training and validation infos into a single file."""
+        train_infos_path = osp.join(
+            osp.dirname(self.save_dir), f'{self.info_prefix}_infos_train.pkl')
+        val_infos_path = osp.join(
+            osp.dirname(self.save_dir), f'{self.info_prefix}_infos_val.pkl')
+        train_infos = mmengine.load(train_infos_path)
+        val_infos = mmengine.load(val_infos_path)
+        trainval_infos = dict(
+            metainfo=train_infos['metainfo'],
+            data_list=train_infos['data_list'] + val_infos['data_list'])
+        mmengine.dump(
+            trainval_infos,
+            osp.join(
+                osp.dirname(self.save_dir),
+                f'{self.info_prefix}_infos_trainval.pkl'))
+
+
+def create_ImageSets_img_ids(root_dir, splits):
+    """Create txt files indicating what to collect in each split."""
+    save_dir = join(root_dir, 'ImageSets/')
+    if not exists(save_dir):
+        os.mkdir(save_dir)
+
+    idx_all = [[] for _ in splits]
+    for i, split in enumerate(splits):
+        path = join(root_dir, split, 'image_0')
+        if not exists(path):
+            RawNames = []
+        else:
+            RawNames = os.listdir(path)
+
+        for name in RawNames:
+            if name.endswith('.jpg'):
+                idx = name.replace('.jpg', '\n')
+                idx_all[int(idx[0])].append(idx)
+        idx_all[i].sort()
+
+    open(save_dir + 'train.txt', 'w').writelines(idx_all[0])
+    open(save_dir + 'val.txt', 'w').writelines(idx_all[1])
+    open(save_dir + 'trainval.txt', 'w').writelines(idx_all[0] + idx_all[1])
+    if len(idx_all) >= 3:
+        open(save_dir + 'test.txt', 'w').writelines(idx_all[2])
+    if len(idx_all) >= 4:
+        open(save_dir + 'test_cam_only.txt', 'w').writelines(idx_all[3])
+    print('created txt files indicating what to collect in ', splits)
diff --git a/mmde/tools/deployment/mmdet3d2torchserve.py b/mmde/tools/deployment/mmdet3d2torchserve.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f80b1f36e4ae788fe11fc1097ad1fee9a2a9562
--- /dev/null
+++ b/mmde/tools/deployment/mmdet3d2torchserve.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import mmengine
+
+try:
+    from model_archiver.model_packaging import package_model
+    from model_archiver.model_packaging_utils import ModelExportUtils
+except ImportError:
+    package_model = None
+
+
+def mmdet3d2torchserve(
+    config_file: str,
+    checkpoint_file: str,
+    output_folder: str,
+    model_name: str,
+    model_version: str = '1.0',
+    force: bool = False,
+):
+    """Converts MMDetection3D model (config + checkpoint) to TorchServe `.mar`.
+
+    Args:
+        config_file (str):
+            In MMDetection3D config format.
+            The contents vary for each task repository.
+        checkpoint_file (str):
+            In MMDetection3D checkpoint format.
+            The contents vary for each task repository.
+        output_folder (str):
+            Folder where `{model_name}.mar` will be created.
+            The file created will be in TorchServe archive format.
+        model_name (str):
+            If not None, used for naming the `{model_name}.mar` file
+            that will be created under `output_folder`.
+            If None, `{Path(checkpoint_file).stem}` will be used.
+        model_version (str, optional):
+            Model's version. Default: '1.0'.
+        force (bool, optional):
+            If True, if there is an existing `{model_name}.mar`
+            file under `output_folder` it will be overwritten.
+            Default: False.
+    """
+    mmengine.mkdir_or_exist(output_folder)
+
+    config = mmengine.Config.fromfile(config_file)
+
+    with TemporaryDirectory() as tmpdir:
+        config.dump(f'{tmpdir}/config.py')
+
+        args = Namespace(
+            **{
+                'model_file': f'{tmpdir}/config.py',
+                'serialized_file': checkpoint_file,
+                'handler': f'{Path(__file__).parent}/mmdet3d_handler.py',
+                'model_name': model_name or Path(checkpoint_file).stem,
+                'version': model_version,
+                'export_path': output_folder,
+                'force': force,
+                'requirements_file': None,
+                'extra_files': None,
+                'runtime': 'python',
+                'archive_format': 'default'
+            })
+        manifest = ModelExportUtils.generate_manifest_json(args)
+        package_model(args, manifest)
+
+
+def parse_args():
+    parser = ArgumentParser(
+        description='Convert MMDetection models to TorchServe `.mar` format.')
+    parser.add_argument('config', type=str, help='config file path')
+    parser.add_argument('checkpoint', type=str, help='checkpoint file path')
+    parser.add_argument(
+        '--output-folder',
+        type=str,
+        required=True,
+        help='Folder where `{model_name}.mar` will be created.')
+    parser.add_argument(
+        '--model-name',
+        type=str,
+        default=None,
+        help='If not None, used for naming the `{model_name}.mar`'
+        'file that will be created under `output_folder`.'
+        'If None, `{Path(checkpoint_file).stem}` will be used.')
+    parser.add_argument(
+        '--model-version',
+        type=str,
+        default='1.0',
+        help='Number used for versioning.')
+    parser.add_argument(
+        '-f',
+        '--force',
+        action='store_true',
+        help='overwrite the existing `{model_name}.mar`')
+    args = parser.parse_args()
+
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    if package_model is None:
+        raise ImportError('`torch-model-archiver` is required.'
+                          'Try: pip install torch-model-archiver')
+
+    mmdet3d2torchserve(args.config, args.checkpoint, args.output_folder,
+                       args.model_name, args.model_version, args.force)
diff --git a/mmde/tools/deployment/mmdet3d_handler.py b/mmde/tools/deployment/mmdet3d_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c121d5a434d3baeefe1a6def205fd0862c64e4e8
--- /dev/null
+++ b/mmde/tools/deployment/mmdet3d_handler.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import base64
+import os
+
+import numpy as np
+import torch
+from ts.torch_handler.base_handler import BaseHandler
+
+from mmdet3d.apis import inference_detector, init_model
+from mmdet3d.structures.points import get_points_type
+
+
+class MMdet3dHandler(BaseHandler):
+    """MMDetection3D Handler used in TorchServe.
+
+    Handler to load models in MMDetection3D, and it will process data to get
+    predicted results. For now, it only supports SECOND.
+    """
+    threshold = 0.5
+    load_dim = 4
+    use_dim = [0, 1, 2, 3]
+    coord_type = 'LIDAR'
+    attribute_dims = None
+
+    def initialize(self, context):
+        """Initialize function loads the model in MMDetection3D.
+
+        Args:
+            context (context): It is a JSON Object containing information
+                pertaining to the model artifacts parameters.
+        """
+        properties = context.system_properties
+        self.map_location = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = torch.device(self.map_location + ':' +
+                                   str(properties.get('gpu_id')) if torch.cuda.
+                                   is_available() else self.map_location)
+        self.manifest = context.manifest
+
+        model_dir = properties.get('model_dir')
+        serialized_file = self.manifest['model']['serializedFile']
+        checkpoint = os.path.join(model_dir, serialized_file)
+        self.config_file = os.path.join(model_dir, 'config.py')
+        self.model = init_model(self.config_file, checkpoint, self.device)
+        self.initialized = True
+
+    def preprocess(self, data):
+        """Preprocess function converts data into LiDARPoints class.
+
+        Args:
+            data (List): Input data from the request.
+
+        Returns:
+            `LiDARPoints` : The preprocess function returns the input
+                point cloud data as LiDARPoints class.
+        """
+        for row in data:
+            # Compat layer: normally the envelope should just return the data
+            # directly, but older versions of Torchserve didn't have envelope.
+            pts = row.get('data') or row.get('body')
+            if isinstance(pts, str):
+                pts = base64.b64decode(pts)
+
+            points = np.frombuffer(pts, dtype=np.float32)
+            points = points.reshape(-1, self.load_dim)
+            points = points[:, self.use_dim]
+            points_class = get_points_type(self.coord_type)
+            points = points_class(
+                points,
+                points_dim=points.shape[-1],
+                attribute_dims=self.attribute_dims)
+
+        return points
+
+    def inference(self, data):
+        """Inference Function.
+
+        This function is used to make a prediction call on the
+        given input request.
+
+        Args:
+            data (`LiDARPoints`): LiDARPoints class passed to make
+                the inference request.
+
+        Returns:
+            List(dict) : The predicted result is returned in this function.
+        """
+        results, _ = inference_detector(self.model, data)
+        return results
+
+    def postprocess(self, data):
+        """Postprocess function.
+
+        This function makes use of the output from the inference and
+        converts it into a torchserve supported response output.
+
+        Args:
+            data (List[dict]): The data received from the prediction
+                output of the model.
+
+        Returns:
+            List: The post process function returns a list of the predicted
+                output.
+        """
+        output = []
+        for pts_index, result in enumerate(data):
+            output.append([])
+            if 'pts_bbox' in result.keys():
+                pred_bboxes = result['pts_bbox']['boxes_3d'].numpy()
+                pred_scores = result['pts_bbox']['scores_3d'].numpy()
+            else:
+                pred_bboxes = result['boxes_3d'].numpy()
+                pred_scores = result['scores_3d'].numpy()
+
+            index = pred_scores > self.threshold
+            bbox_coords = pred_bboxes[index].tolist()
+            score = pred_scores[index].tolist()
+
+            output[pts_index].append({'3dbbox': bbox_coords, 'score': score})
+
+        return output
diff --git a/mmde/tools/deployment/test_torchserver.py b/mmde/tools/deployment/test_torchserver.py
new file mode 100644
index 0000000000000000000000000000000000000000..c66205a66684e37954a8d3b997d66ba5a6d9a295
--- /dev/null
+++ b/mmde/tools/deployment/test_torchserver.py
@@ -0,0 +1,56 @@
+from argparse import ArgumentParser
+
+import numpy as np
+import requests
+
+from mmdet3d.apis import inference_detector, init_model
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('pcd', help='Point cloud file')
+    parser.add_argument('config', help='Config file')
+    parser.add_argument('checkpoint', help='Checkpoint file')
+    parser.add_argument('model_name', help='The model name in the server')
+    parser.add_argument(
+        '--inference-addr',
+        default='127.0.0.1:8080',
+        help='Address and port of the inference server')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--score-thr', type=float, default=0.5, help='3d bbox score threshold')
+    args = parser.parse_args()
+    return args
+
+
+def parse_result(input):
+    bbox = input[0]['3dbbox']
+    result = np.array(bbox)
+    return result
+
+
+def main(args):
+    # build the model from a config file and a checkpoint file
+    model = init_model(args.config, args.checkpoint, device=args.device)
+    # test a single point cloud file
+    model_result, _ = inference_detector(model, args.pcd)
+    # filter the 3d bboxes whose scores > 0.5
+    if 'pts_bbox' in model_result[0].keys():
+        pred_bboxes = model_result[0]['pts_bbox']['boxes_3d'].numpy()
+        pred_scores = model_result[0]['pts_bbox']['scores_3d'].numpy()
+    else:
+        pred_bboxes = model_result[0]['boxes_3d'].numpy()
+        pred_scores = model_result[0]['scores_3d'].numpy()
+    model_result = pred_bboxes[pred_scores > 0.5]
+
+    url = 'http://' + args.inference_addr + '/predictions/' + args.model_name
+    with open(args.pcd, 'rb') as points:
+        response = requests.post(url, points)
+    server_result = parse_result(response.json())
+    assert np.allclose(model_result, server_result)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
diff --git a/mmde/tools/dist_test.sh b/mmde/tools/dist_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dea131b43ea8f1222661d20603d40c18ea7f28a1
--- /dev/null
+++ b/mmde/tools/dist_test.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/test.py \
+    $CONFIG \
+    $CHECKPOINT \
+    --launcher pytorch \
+    ${@:4}
diff --git a/mmde/tools/dist_train.sh b/mmde/tools/dist_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3fca7641dec4090930c85991a079c28409529d4e
--- /dev/null
+++ b/mmde/tools/dist_train.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch \
+    --nnodes=$NNODES \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --nproc_per_node=$GPUS \
+    --master_port=$PORT \
+    $(dirname "$0")/train.py \
+    $CONFIG \
+    --launcher pytorch ${@:3}
diff --git a/mmde/tools/misc/browse_dataset.py b/mmde/tools/misc/browse_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..164800952c0cf30d5d6db21fd765f597f91ce2e2
--- /dev/null
+++ b/mmde/tools/misc/browse_dataset.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from os import path as osp
+
+from mmengine.config import Config, DictAction
+from mmengine.registry import init_default_scope
+from mmengine.utils import ProgressBar, mkdir_or_exist
+
+from mmdet3d.registry import DATASETS, VISUALIZERS
+from mmdet3d.utils import replace_ceph_backend
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Browse a dataset')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--output-dir',
+        default=None,
+        type=str,
+        help='If there is no display interface, you can save it')
+    parser.add_argument('--not-show', default=False, action='store_true')
+    parser.add_argument(
+        '--show-interval',
+        type=float,
+        default=2,
+        help='the interval of show (s)')
+    parser.add_argument(
+        '--task',
+        type=str,
+        choices=[
+            'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg',
+            'multi-modality_det'
+        ],
+        help='Determine the visualization method depending on the task.')
+    parser.add_argument(
+        '--aug',
+        action='store_true',
+        help='Whether to visualize augmented datasets or original dataset.')
+    parser.add_argument(
+        '--ceph', action='store_true', help='Use ceph as data storage backend')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def build_data_cfg(config_path, aug, cfg_options):
+    """Build data config for loading visualization data."""
+
+    cfg = Config.fromfile(config_path)
+    if cfg_options is not None:
+        cfg.merge_from_dict(cfg_options)
+
+    # extract inner dataset of `RepeatDataset` as
+    # `cfg.train_dataloader.dataset` so we don't
+    # need to worry about it later
+    if cfg.train_dataloader.dataset['type'] == 'RepeatDataset':
+        cfg.train_dataloader.dataset = cfg.train_dataloader.dataset.dataset
+    # use only first dataset for `ConcatDataset`
+    if cfg.train_dataloader.dataset['type'] == 'ConcatDataset':
+        cfg.train_dataloader.dataset = cfg.train_dataloader.dataset.datasets[0]
+    if cfg.train_dataloader.dataset['type'] == 'CBGSDataset':
+        cfg.train_dataloader.dataset = cfg.train_dataloader.dataset.dataset
+
+    train_data_cfg = cfg.train_dataloader.dataset
+
+    if aug:
+        show_pipeline = cfg.train_pipeline
+    else:
+        show_pipeline = cfg.test_pipeline
+        for i in range(len(cfg.train_pipeline)):
+            if cfg.train_pipeline[i]['type'] == 'LoadAnnotations3D':
+                show_pipeline.insert(i, cfg.train_pipeline[i])
+            # Collect data as well as labels
+            if cfg.train_pipeline[i]['type'] == 'Pack3DDetInputs':
+                if show_pipeline[-1]['type'] == 'Pack3DDetInputs':
+                    show_pipeline[-1] = cfg.train_pipeline[i]
+                else:
+                    show_pipeline.append(cfg.train_pipeline[i])
+
+    train_data_cfg['pipeline'] = show_pipeline
+
+    return cfg
+
+
+def main():
+    args = parse_args()
+
+    if args.output_dir is not None:
+        mkdir_or_exist(args.output_dir)
+
+    cfg = build_data_cfg(args.config, args.aug, args.cfg_options)
+
+    # TODO: We will unify the ceph support approach with other OpenMMLab repos
+    if args.ceph:
+        cfg = replace_ceph_backend(cfg)
+
+    init_default_scope(cfg.get('default_scope', 'mmdet3d'))
+
+    try:
+        dataset = DATASETS.build(
+            cfg.train_dataloader.dataset,
+            default_args=dict(filter_empty_gt=False))
+    except TypeError:  # seg dataset doesn't have `filter_empty_gt` key
+        dataset = DATASETS.build(cfg.train_dataloader.dataset)
+
+    # configure visualization mode
+    vis_task = args.task
+
+    visualizer = VISUALIZERS.build(cfg.visualizer)
+    visualizer.dataset_meta = dataset.metainfo
+
+    progress_bar = ProgressBar(len(dataset))
+
+    for i, item in enumerate(dataset):
+        # the 3D Boxes in input could be in any of three coordinates
+        data_input = item['inputs']
+        data_sample = item['data_samples'].numpy()
+
+        out_file = osp.join(
+            args.output_dir,
+            f'{i}.jpg') if args.output_dir is not None else None
+
+        # o3d_save_path is valid when args.not_show is False
+        o3d_save_path = osp.join(args.output_dir, f'pc_{i}.png') if (
+            args.output_dir is not None
+            and vis_task in ['lidar_det', 'lidar_seg', 'multi-modality_det']
+            and not args.not_show) else None
+
+        visualizer.add_datasample(
+            '3d visualzier',
+            data_input,
+            data_sample=data_sample,
+            show=not args.not_show,
+            wait_time=args.show_interval,
+            out_file=out_file,
+            o3d_save_path=o3d_save_path,
+            vis_task=vis_task)
+
+        progress_bar.update()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/misc/fuse_conv_bn.py b/mmde/tools/misc/fuse_conv_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..90d30cebe0f5f695852f5cb11f4b3afdfbfe4f64
--- /dev/null
+++ b/mmde/tools/misc/fuse_conv_bn.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import torch
+from mmengine.runner import save_checkpoint
+from torch import nn as nn
+
+from mmdet3d.apis import init_model
+
+
+def fuse_conv_bn(conv, bn):
+    """During inference, the functionary of batch norm layers is turned off but
+    only the mean and var alone channels are used, which exposes the chance to
+    fuse it with the preceding conv layers to save computations and simplify
+    network bboxes_3d."""
+    conv_w = conv.weight
+    conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
+        bn.running_mean)
+
+    factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
+    conv.weight = nn.Parameter(conv_w *
+                               factor.reshape([conv.out_channels, 1, 1, 1]))
+    conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
+    return conv
+
+
+def fuse_module(m):
+    last_conv = None
+    last_conv_name = None
+
+    for name, child in m.named_children():
+        if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)):
+            if last_conv is None:  # only fuse BN that is after Conv
+                continue
+            fused_conv = fuse_conv_bn(last_conv, child)
+            m._modules[last_conv_name] = fused_conv
+            # To reduce changes, set BN as Identity instead of deleting it.
+            m._modules[name] = nn.Identity()
+            last_conv = None
+        elif isinstance(child, nn.Conv2d):
+            last_conv = child
+            last_conv_name = name
+        else:
+            fuse_module(child)
+    return m
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='fuse Conv and BN layers in a model')
+    parser.add_argument('config', help='config file path')
+    parser.add_argument('checkpoint', help='checkpoint file path')
+    parser.add_argument('out', help='output path of the converted model')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    # build the model from a config file and a checkpoint file
+    model = init_model(args.config, args.checkpoint)
+    # fuse conv and bn layers of the model
+    fused_model = fuse_module(model)
+    save_checkpoint(fused_model, args.out)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/misc/print_config.py b/mmde/tools/misc/print_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5e6e641e40ced032848593d2bbc0f008f27c91c
--- /dev/null
+++ b/mmde/tools/misc/print_config.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+from mmengine import Config, DictAction
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Print the whole config')
+    parser.add_argument('config', help='config file path')
+    parser.add_argument(
+        '--options', nargs='+', action=DictAction, help='arguments in dict')
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    if args.options is not None:
+        cfg.merge_from_dict(args.options)
+    print(f'Config:\n{cfg.pretty_text}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/misc/visualize_results.py b/mmde/tools/misc/visualize_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9d3452bc538368d09184652033c9cf9b827002b
--- /dev/null
+++ b/mmde/tools/misc/visualize_results.py
@@ -0,0 +1,50 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+
+import mmengine
+from mmengine import Config
+
+from mmdet3d.registry import DATASETS
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet3D visualize the results')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('--result', help='results file in pickle format')
+    parser.add_argument(
+        '--show-dir', help='directory where visualize results will be saved')
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    if args.result is not None and \
+            not args.result.endswith(('.pkl', '.pickle')):
+        raise ValueError('The results file must be a pkl file.')
+
+    cfg = Config.fromfile(args.config)
+    cfg.data.test.test_mode = True
+
+    # build the dataset
+    dataset = DATASETS.build(cfg.data.test)
+    results = mmengine.load(args.result)
+
+    if getattr(dataset, 'show', None) is not None:
+        # data loading pipeline for showing
+        eval_pipeline = cfg.get('eval_pipeline', {})
+        if eval_pipeline:
+            dataset.show(results, args.show_dir, pipeline=eval_pipeline)
+        else:
+            dataset.show(results, args.show_dir)  # use default pipeline
+    else:
+        raise NotImplementedError(
+            'Show is not implemented for dataset {}!'.format(
+                type(dataset).__name__))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/model_converters/convert_h3dnet_checkpoints.py b/mmde/tools/model_converters/convert_h3dnet_checkpoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..08c27da676793496a9e3d83ea561f43d6e482f1f
--- /dev/null
+++ b/mmde/tools/model_converters/convert_h3dnet_checkpoints.py
@@ -0,0 +1,177 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import tempfile
+
+import torch
+from mmcv import Config
+from mmengine.runner import load_state_dict
+
+from mmdet3d.registry import MODELS
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet3D upgrade model version(before v0.6.0) of H3DNet')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--out', help='path of the output checkpoint file')
+    args = parser.parse_args()
+    return args
+
+
+def parse_config(config_strings):
+    """Parse config from strings.
+
+    Args:
+        config_strings (string): strings of model config.
+
+    Returns:
+        Config: model config
+    """
+    temp_file = tempfile.NamedTemporaryFile()
+    config_path = f'{temp_file.name}.py'
+    with open(config_path, 'w') as f:
+        f.write(config_strings)
+
+    config = Config.fromfile(config_path)
+
+    # Update backbone config
+    if 'pool_mod' in config.model.backbone.backbones:
+        config.model.backbone.backbones.pop('pool_mod')
+
+    if 'sa_cfg' not in config.model.backbone:
+        config.model.backbone['sa_cfg'] = dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)
+
+    if 'type' not in config.model.rpn_head.vote_aggregation_cfg:
+        config.model.rpn_head.vote_aggregation_cfg['type'] = 'PointSAModule'
+
+    # Update rpn_head config
+    if 'pred_layer_cfg' not in config.model.rpn_head:
+        config.model.rpn_head['pred_layer_cfg'] = dict(
+            in_channels=128, shared_conv_channels=(128, 128), bias=True)
+
+    if 'feat_channels' in config.model.rpn_head:
+        config.model.rpn_head.pop('feat_channels')
+
+    if 'vote_moudule_cfg' in config.model.rpn_head:
+        config.model.rpn_head['vote_module_cfg'] = config.model.rpn_head.pop(
+            'vote_moudule_cfg')
+
+    if config.model.rpn_head.vote_aggregation_cfg.use_xyz:
+        config.model.rpn_head.vote_aggregation_cfg.mlp_channels[0] -= 3
+
+    for cfg in config.model.roi_head.primitive_list:
+        cfg['vote_module_cfg'] = cfg.pop('vote_moudule_cfg')
+        cfg.vote_aggregation_cfg.mlp_channels[0] -= 3
+        if 'type' not in cfg.vote_aggregation_cfg:
+            cfg.vote_aggregation_cfg['type'] = 'PointSAModule'
+
+    if 'type' not in config.model.roi_head.bbox_head.suface_matching_cfg:
+        config.model.roi_head.bbox_head.suface_matching_cfg[
+            'type'] = 'PointSAModule'
+
+    if config.model.roi_head.bbox_head.suface_matching_cfg.use_xyz:
+        config.model.roi_head.bbox_head.suface_matching_cfg.mlp_channels[
+            0] -= 3
+
+    if 'type' not in config.model.roi_head.bbox_head.line_matching_cfg:
+        config.model.roi_head.bbox_head.line_matching_cfg[
+            'type'] = 'PointSAModule'
+
+    if config.model.roi_head.bbox_head.line_matching_cfg.use_xyz:
+        config.model.roi_head.bbox_head.line_matching_cfg.mlp_channels[0] -= 3
+
+    if 'proposal_module_cfg' in config.model.roi_head.bbox_head:
+        config.model.roi_head.bbox_head.pop('proposal_module_cfg')
+
+    temp_file.close()
+
+    return config
+
+
+def main():
+    """Convert keys in checkpoints for VoteNet.
+
+    There can be some breaking changes during the development of mmdetection3d,
+    and this tool is used for upgrading checkpoints trained with old versions
+    (before v0.6.0) to the latest one.
+    """
+    args = parse_args()
+    checkpoint = torch.load(args.checkpoint)
+    cfg = parse_config(checkpoint['meta']['config'])
+    # Build the model and load checkpoint
+    model = MODELS.build(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    orig_ckpt = checkpoint['state_dict']
+    converted_ckpt = orig_ckpt.copy()
+
+    if cfg['dataset_type'] == 'ScanNetDataset':
+        NUM_CLASSES = 18
+    elif cfg['dataset_type'] == 'SUNRGBDDataset':
+        NUM_CLASSES = 10
+    else:
+        raise NotImplementedError
+
+    RENAME_PREFIX = {
+        'rpn_head.conv_pred.0': 'rpn_head.conv_pred.shared_convs.layer0',
+        'rpn_head.conv_pred.1': 'rpn_head.conv_pred.shared_convs.layer1'
+    }
+
+    DEL_KEYS = [
+        'rpn_head.conv_pred.0.bn.num_batches_tracked',
+        'rpn_head.conv_pred.1.bn.num_batches_tracked'
+    ]
+
+    EXTRACT_KEYS = {
+        'rpn_head.conv_pred.conv_cls.weight':
+        ('rpn_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]),
+        'rpn_head.conv_pred.conv_cls.bias':
+        ('rpn_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]),
+        'rpn_head.conv_pred.conv_reg.weight':
+        ('rpn_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]),
+        'rpn_head.conv_pred.conv_reg.bias':
+        ('rpn_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)])
+    }
+
+    # Delete some useless keys
+    for key in DEL_KEYS:
+        converted_ckpt.pop(key)
+
+    # Rename keys with specific prefix
+    RENAME_KEYS = dict()
+    for old_key in converted_ckpt.keys():
+        for rename_prefix in RENAME_PREFIX.keys():
+            if rename_prefix in old_key:
+                new_key = old_key.replace(rename_prefix,
+                                          RENAME_PREFIX[rename_prefix])
+                RENAME_KEYS[new_key] = old_key
+    for new_key, old_key in RENAME_KEYS.items():
+        converted_ckpt[new_key] = converted_ckpt.pop(old_key)
+
+    # Extract weights and rename the keys
+    for new_key, (old_key, indices) in EXTRACT_KEYS.items():
+        cur_layers = orig_ckpt[old_key]
+        converted_layers = []
+        for (start, end) in indices:
+            if end != -1:
+                converted_layers.append(cur_layers[start:end])
+            else:
+                converted_layers.append(cur_layers[start:])
+        converted_layers = torch.cat(converted_layers, 0)
+        converted_ckpt[new_key] = converted_layers
+        if old_key in converted_ckpt.keys():
+            converted_ckpt.pop(old_key)
+
+    # Check the converted checkpoint by loading to the model
+    load_state_dict(model, converted_ckpt, strict=True)
+    checkpoint['state_dict'] = converted_ckpt
+    torch.save(checkpoint, args.out)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/model_converters/convert_votenet_checkpoints.py b/mmde/tools/model_converters/convert_votenet_checkpoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a9a32f4426f6130d29bfd8ece8e05f98aa6789e
--- /dev/null
+++ b/mmde/tools/model_converters/convert_votenet_checkpoints.py
@@ -0,0 +1,153 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import tempfile
+
+import torch
+from mmengine import Config
+from mmengine.runner import load_state_dict
+
+from mmdet3d.registry import MODELS
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet3D upgrade model version(before v0.6.0) of VoteNet')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--out', help='path of the output checkpoint file')
+    args = parser.parse_args()
+    return args
+
+
+def parse_config(config_strings):
+    """Parse config from strings.
+
+    Args:
+        config_strings (string): strings of model config.
+
+    Returns:
+        Config: model config
+    """
+    temp_file = tempfile.NamedTemporaryFile()
+    config_path = f'{temp_file.name}.py'
+    with open(config_path, 'w') as f:
+        f.write(config_strings)
+
+    config = Config.fromfile(config_path)
+
+    # Update backbone config
+    if 'pool_mod' in config.model.backbone:
+        config.model.backbone.pop('pool_mod')
+
+    if 'sa_cfg' not in config.model.backbone:
+        config.model.backbone['sa_cfg'] = dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)
+
+    if 'type' not in config.model.bbox_head.vote_aggregation_cfg:
+        config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule'
+
+    # Update bbox_head config
+    if 'pred_layer_cfg' not in config.model.bbox_head:
+        config.model.bbox_head['pred_layer_cfg'] = dict(
+            in_channels=128, shared_conv_channels=(128, 128), bias=True)
+
+    if 'feat_channels' in config.model.bbox_head:
+        config.model.bbox_head.pop('feat_channels')
+
+    if 'vote_moudule_cfg' in config.model.bbox_head:
+        config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop(
+            'vote_moudule_cfg')
+
+    if config.model.bbox_head.vote_aggregation_cfg.use_xyz:
+        config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3
+
+    temp_file.close()
+
+    return config
+
+
+def main():
+    """Convert keys in checkpoints for VoteNet.
+
+    There can be some breaking changes during the development of mmdetection3d,
+    and this tool is used for upgrading checkpoints trained with old versions
+    (before v0.6.0) to the latest one.
+    """
+    args = parse_args()
+    checkpoint = torch.load(args.checkpoint)
+    cfg = parse_config(checkpoint['meta']['config'])
+    # Build the model and load checkpoint
+    model = MODELS.build(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    orig_ckpt = checkpoint['state_dict']
+    converted_ckpt = orig_ckpt.copy()
+
+    if cfg['dataset_type'] == 'ScanNetDataset':
+        NUM_CLASSES = 18
+    elif cfg['dataset_type'] == 'SUNRGBDDataset':
+        NUM_CLASSES = 10
+    else:
+        raise NotImplementedError
+
+    RENAME_PREFIX = {
+        'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0',
+        'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1'
+    }
+
+    DEL_KEYS = [
+        'bbox_head.conv_pred.0.bn.num_batches_tracked',
+        'bbox_head.conv_pred.1.bn.num_batches_tracked'
+    ]
+
+    EXTRACT_KEYS = {
+        'bbox_head.conv_pred.conv_cls.weight':
+        ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]),
+        'bbox_head.conv_pred.conv_cls.bias':
+        ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]),
+        'bbox_head.conv_pred.conv_reg.weight':
+        ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]),
+        'bbox_head.conv_pred.conv_reg.bias':
+        ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)])
+    }
+
+    # Delete some useless keys
+    for key in DEL_KEYS:
+        converted_ckpt.pop(key)
+
+    # Rename keys with specific prefix
+    RENAME_KEYS = dict()
+    for old_key in converted_ckpt.keys():
+        for rename_prefix in RENAME_PREFIX.keys():
+            if rename_prefix in old_key:
+                new_key = old_key.replace(rename_prefix,
+                                          RENAME_PREFIX[rename_prefix])
+                RENAME_KEYS[new_key] = old_key
+    for new_key, old_key in RENAME_KEYS.items():
+        converted_ckpt[new_key] = converted_ckpt.pop(old_key)
+
+    # Extract weights and rename the keys
+    for new_key, (old_key, indices) in EXTRACT_KEYS.items():
+        cur_layers = orig_ckpt[old_key]
+        converted_layers = []
+        for (start, end) in indices:
+            if end != -1:
+                converted_layers.append(cur_layers[start:end])
+            else:
+                converted_layers.append(cur_layers[start:])
+        converted_layers = torch.cat(converted_layers, 0)
+        converted_ckpt[new_key] = converted_layers
+        if old_key in converted_ckpt.keys():
+            converted_ckpt.pop(old_key)
+
+    # Check the converted checkpoint by loading to the model
+    load_state_dict(model, converted_ckpt, strict=True)
+    checkpoint['state_dict'] = converted_ckpt
+    torch.save(checkpoint, args.out)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/model_converters/publish_model.py b/mmde/tools/model_converters/publish_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2660578af83fe99954cff32729a0e0c2d75e005
--- /dev/null
+++ b/mmde/tools/model_converters/publish_model.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+
+import torch
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Process a checkpoint to be published')
+    parser.add_argument('in_file', help='input checkpoint filename')
+    parser.add_argument('out_file', help='output checkpoint filename')
+    args = parser.parse_args()
+    return args
+
+
+def process_checkpoint(in_file, out_file):
+    checkpoint = torch.load(in_file, map_location='cpu')
+    # remove optimizer for smaller file size
+    if 'optimizer' in checkpoint:
+        del checkpoint['optimizer']
+    # if it is necessary to remove some sensitive data in checkpoint['meta'],
+    # add the code here.
+    torch.save(checkpoint, out_file)
+    sha = subprocess.check_output(['sha256sum', out_file]).decode()
+    final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', out_file, final_file])
+
+
+def main():
+    args = parse_args()
+    process_checkpoint(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/model_converters/regnet2mmdet.py b/mmde/tools/model_converters/regnet2mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbf8c8f33a90839fef055aea0a775e76ff84afd3
--- /dev/null
+++ b/mmde/tools/model_converters/regnet2mmdet.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from collections import OrderedDict
+
+import torch
+
+
+def convert_stem(model_key, model_weight, state_dict, converted_names):
+    new_key = model_key.replace('stem.conv', 'conv1')
+    new_key = new_key.replace('stem.bn', 'bn1')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+    print(f'Convert {model_key} to {new_key}')
+
+
+def convert_head(model_key, model_weight, state_dict, converted_names):
+    new_key = model_key.replace('head.fc', 'fc')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+    print(f'Convert {model_key} to {new_key}')
+
+
+def convert_reslayer(model_key, model_weight, state_dict, converted_names):
+    split_keys = model_key.split('.')
+    layer, block, module = split_keys[:3]
+    block_id = int(block[1:])
+    layer_name = f'layer{int(layer[1:])}'
+    block_name = f'{block_id - 1}'
+
+    if block_id == 1 and module == 'bn':
+        new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}'
+    elif block_id == 1 and module == 'proj':
+        new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}'
+    elif module == 'f':
+        if split_keys[3] == 'a_bn':
+            module_name = 'bn1'
+        elif split_keys[3] == 'b_bn':
+            module_name = 'bn2'
+        elif split_keys[3] == 'c_bn':
+            module_name = 'bn3'
+        elif split_keys[3] == 'a':
+            module_name = 'conv1'
+        elif split_keys[3] == 'b':
+            module_name = 'conv2'
+        elif split_keys[3] == 'c':
+            module_name = 'conv3'
+        new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}'
+    else:
+        raise ValueError(f'Unsupported conversion of key {model_key}')
+    print(f'Convert {model_key} to {new_key}')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+
+
+def convert(src, dst):
+    """Convert keys in pycls pretrained RegNet models to mmdet style."""
+    # load caffe model
+    regnet_model = torch.load(src)
+    blobs = regnet_model['model_state']
+    # convert to pytorch style
+    state_dict = OrderedDict()
+    converted_names = set()
+    for key, weight in blobs.items():
+        if 'stem' in key:
+            convert_stem(key, weight, state_dict, converted_names)
+        elif 'head' in key:
+            convert_head(key, weight, state_dict, converted_names)
+        elif key.startswith('s'):
+            convert_reslayer(key, weight, state_dict, converted_names)
+
+    # check if all layers are converted
+    for key in blobs:
+        if key not in converted_names:
+            print(f'not converted: {key}')
+    # save checkpoint
+    checkpoint = dict()
+    checkpoint['state_dict'] = state_dict
+    torch.save(checkpoint, dst)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Convert model keys')
+    parser.add_argument('src', help='src detectron model path')
+    parser.add_argument('dst', help='save path')
+    args = parser.parse_args()
+    convert(args.src, args.dst)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/slurm_test.sh b/mmde/tools/slurm_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6dd67e57442b741fc30f26102eb5afe16139edb1
--- /dev/null
+++ b/mmde/tools/slurm_test.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+CHECKPOINT=$4
+GPUS=${GPUS:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+PY_ARGS=${@:5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
diff --git a/mmde/tools/slurm_train.sh b/mmde/tools/slurm_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b3feb3d9c7a6c33d82739cdf5ee10365673aaded
--- /dev/null
+++ b/mmde/tools/slurm_train.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+set -x
+
+PARTITION=$1
+JOB_NAME=$2
+CONFIG=$3
+WORK_DIR=$4
+GPUS=${GPUS:-8}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+CPUS_PER_TASK=${CPUS_PER_TASK:-5}
+SRUN_ARGS=${SRUN_ARGS:-""}
+PY_ARGS=${@:5}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --cpus-per-task=${CPUS_PER_TASK} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
diff --git a/mmde/tools/test.py b/mmde/tools/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4afc25597be5e0d66af4fbcefbdbbabe58ba20ee
--- /dev/null
+++ b/mmde/tools/test.py
@@ -0,0 +1,149 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os
+import os.path as osp
+
+from mmengine.config import Config, ConfigDict, DictAction
+from mmengine.registry import RUNNERS
+from mmengine.runner import Runner
+
+from mmdet3d.utils import replace_ceph_backend
+
+
+# TODO: support fuse_conv_bn and format_only
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet3D test (and eval) a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument(
+        '--work-dir',
+        help='the directory to save the file containing evaluation metrics')
+    parser.add_argument(
+        '--ceph', action='store_true', help='Use ceph as data storage backend')
+    parser.add_argument(
+        '--show', action='store_true', help='show prediction results')
+    parser.add_argument(
+        '--show-dir',
+        help='directory where painted images will be saved. '
+        'If specified, it will be automatically saved '
+        'to the work_dir/timestamp/show_dir')
+    parser.add_argument(
+        '--score-thr', type=float, default=0.1, help='bbox score threshold')
+    parser.add_argument(
+        '--task',
+        type=str,
+        choices=[
+            'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg',
+            'multi-modality_det'
+        ],
+        help='Determine the visualization method depending on the task.')
+    parser.add_argument(
+        '--wait-time', type=float, default=2, help='the interval of show (s)')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument(
+        '--tta', action='store_true', help='Test time augmentation')
+    # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
+    # will pass the `--local-rank` parameter to `tools/test.py` instead
+    # of `--local_rank`.
+    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def trigger_visualization_hook(cfg, args):
+    default_hooks = cfg.default_hooks
+    if 'visualization' in default_hooks:
+        visualization_hook = default_hooks['visualization']
+        # Turn on visualization
+        visualization_hook['draw'] = True
+        if args.show:
+            visualization_hook['show'] = True
+            visualization_hook['wait_time'] = args.wait_time
+        if args.show_dir:
+            visualization_hook['test_out_dir'] = args.show_dir
+        all_task_choices = [
+            'mono_det', 'multi-view_det', 'lidar_det', 'lidar_seg',
+            'multi-modality_det'
+        ]
+        assert args.task in all_task_choices, 'You must set '\
+            f"'--task' in {all_task_choices} in the command " \
+            'if you want to use visualization hook'
+        visualization_hook['vis_task'] = args.task
+        visualization_hook['score_thr'] = args.score_thr
+    else:
+        raise RuntimeError(
+            'VisualizationHook must be included in default_hooks.'
+            'refer to usage '
+            '"visualization=dict(type=\'VisualizationHook\')"')
+
+    return cfg
+
+
+def main():
+    args = parse_args()
+
+    # load config
+    cfg = Config.fromfile(args.config)
+
+    # TODO: We will unify the ceph support approach with other OpenMMLab repos
+    if args.ceph:
+        cfg = replace_ceph_backend(cfg)
+
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    cfg.load_from = args.checkpoint
+
+    if args.show or args.show_dir:
+        cfg = trigger_visualization_hook(cfg, args)
+
+    if args.tta:
+        # Currently, we only support tta for 3D segmentation
+        # TODO: Support tta for 3D detection
+        assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.'
+        assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` in config.'
+        cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline
+        cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model)
+
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    # start testing
+    runner.test()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/train.py b/mmde/tools/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b9c3b0842c0a6fe5f311d9979d4f207958af96d
--- /dev/null
+++ b/mmde/tools/train.py
@@ -0,0 +1,145 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import logging
+import os
+import os.path as osp
+
+from mmengine.config import Config, DictAction
+from mmengine.logging import print_log
+from mmengine.registry import RUNNERS
+from mmengine.runner import Runner
+
+from mmdet3d.utils import replace_ceph_backend
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a 3D detector')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument('--work-dir', help='the dir to save logs and models')
+    parser.add_argument(
+        '--amp',
+        action='store_true',
+        default=False,
+        help='enable automatic-mixed-precision training')
+    parser.add_argument(
+        '--sync_bn',
+        choices=['none', 'torch', 'mmcv'],
+        default='none',
+        help='convert all BatchNorm layers in the model to SyncBatchNorm '
+        '(SyncBN) or mmcv.ops.sync_bn.SyncBatchNorm (MMSyncBN) layers.')
+    parser.add_argument(
+        '--auto-scale-lr',
+        action='store_true',
+        help='enable automatically scaling LR.')
+    parser.add_argument(
+        '--resume',
+        nargs='?',
+        type=str,
+        const='auto',
+        help='If specify checkpoint path, resume from it, while if not '
+        'specify, try to auto resume from the latest checkpoint '
+        'in the work directory.')
+    parser.add_argument(
+        '--ceph', action='store_true', help='Use ceph as data storage backend')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
+    # will pass the `--local-rank` parameter to `tools/train.py` instead
+    # of `--local_rank`.
+    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+    return args
+
+
+def main():
+    args = parse_args()
+
+    # load config
+    cfg = Config.fromfile(args.config)
+
+    # TODO: We will unify the ceph support approach with other OpenMMLab repos
+    if args.ceph:
+        cfg = replace_ceph_backend(cfg)
+
+    cfg.launcher = args.launcher
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+
+    # enable automatic-mixed-precision training
+    if args.amp is True:
+        optim_wrapper = cfg.optim_wrapper.type
+        if optim_wrapper == 'AmpOptimWrapper':
+            print_log(
+                'AMP training is already enabled in your config.',
+                logger='current',
+                level=logging.WARNING)
+        else:
+            assert optim_wrapper == 'OptimWrapper', (
+                '`--amp` is only supported when the optimizer wrapper type is '
+                f'`OptimWrapper` but got {optim_wrapper}.')
+            cfg.optim_wrapper.type = 'AmpOptimWrapper'
+            cfg.optim_wrapper.loss_scale = 'dynamic'
+
+    # convert BatchNorm layers
+    if args.sync_bn != 'none':
+        cfg.sync_bn = args.sync_bn
+
+    # enable automatically scaling LR
+    if args.auto_scale_lr:
+        if 'auto_scale_lr' in cfg and \
+                'enable' in cfg.auto_scale_lr and \
+                'base_batch_size' in cfg.auto_scale_lr:
+            cfg.auto_scale_lr.enable = True
+        else:
+            raise RuntimeError('Can not find "auto_scale_lr" or '
+                               '"auto_scale_lr.enable" or '
+                               '"auto_scale_lr.base_batch_size" in your'
+                               ' configuration file.')
+
+    # resume is determined in this priority: resume from > auto_resume
+    if args.resume == 'auto':
+        cfg.resume = True
+        cfg.load_from = None
+    elif args.resume is not None:
+        cfg.resume = True
+        cfg.load_from = args.resume
+
+    # build the runner from config
+    if 'runner_type' not in cfg:
+        # build the default runner
+        runner = Runner.from_cfg(cfg)
+    else:
+        # build customized runner from the registry
+        # if 'runner_type' is set in the cfg
+        runner = RUNNERS.build(cfg)
+
+    # start training
+    runner.train()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mmde/tools/update_data_coords.py b/mmde/tools/update_data_coords.py
new file mode 100644
index 0000000000000000000000000000000000000000..280e78d16ea7f507214ad13a067008951273a21a
--- /dev/null
+++ b/mmde/tools/update_data_coords.py
@@ -0,0 +1,168 @@
+import argparse
+import time
+from os import path as osp
+
+import mmengine
+import numpy as np
+
+from mmdet3d.structures import limit_period
+
+
+def update_sunrgbd_infos(root_dir, out_dir, pkl_files):
+    print(f'{pkl_files} will be modified because '
+          f'of the refactor of the Depth coordinate system.')
+    if root_dir == out_dir:
+        print(f'Warning, you are overwriting '
+              f'the original data under {root_dir}.')
+        time.sleep(3)
+    for pkl_file in pkl_files:
+        in_path = osp.join(root_dir, pkl_file)
+        print(f'Reading from input file: {in_path}.')
+        a = mmengine.load(in_path)
+        print('Start updating:')
+        for item in mmengine.track_iter_progress(a):
+            if 'rotation_y' in item['annos']:
+                item['annos']['rotation_y'] = -item['annos']['rotation_y']
+                item['annos']['gt_boxes_upright_depth'][:, -1:] = \
+                    -item['annos']['gt_boxes_upright_depth'][:, -1:]
+
+        out_path = osp.join(out_dir, pkl_file)
+        print(f'Writing to output file: {out_path}.')
+        mmengine.dump(a, out_path, 'pkl')
+
+
+def update_outdoor_dbinfos(root_dir, out_dir, pkl_files):
+    print(f'{pkl_files} will be modified because '
+          f'of the refactor of the LIDAR coordinate system.')
+    if root_dir == out_dir:
+        print(f'Warning, you are overwriting '
+              f'the original data under {root_dir}.')
+        time.sleep(3)
+    for pkl_file in pkl_files:
+        in_path = osp.join(root_dir, pkl_file)
+        print(f'Reading from input file: {in_path}.')
+        a = mmengine.load(in_path)
+        print('Start updating:')
+        for k in a.keys():
+            print(f'Updating samples of class {k}:')
+            for item in mmengine.track_iter_progress(a[k]):
+                boxes = item['box3d_lidar'].copy()
+                # swap l, w (or dx, dy)
+                item['box3d_lidar'][3] = boxes[4]
+                item['box3d_lidar'][4] = boxes[3]
+                # change yaw
+                item['box3d_lidar'][6] = -boxes[6] - np.pi / 2
+                item['box3d_lidar'][6] = limit_period(
+                    item['box3d_lidar'][6], period=np.pi * 2)
+
+        out_path = osp.join(out_dir, pkl_file)
+        print(f'Writing to output file: {out_path}.')
+        mmengine.dump(a, out_path, 'pkl')
+
+
+def update_nuscenes_or_lyft_infos(root_dir, out_dir, pkl_files):
+
+    print(f'{pkl_files} will be modified because '
+          f'of the refactor of the LIDAR coordinate system.')
+    if root_dir == out_dir:
+        print(f'Warning, you are overwriting '
+              f'the original data under {root_dir}.')
+        time.sleep(3)
+    for pkl_file in pkl_files:
+        in_path = osp.join(root_dir, pkl_file)
+        print(f'Reading from input file: {in_path}.')
+        a = mmengine.load(in_path)
+        print('Start updating:')
+        for item in mmengine.track_iter_progress(a['infos']):
+            boxes = item['gt_boxes'].copy()
+            # swap l, w (or dx, dy)
+            item['gt_boxes'][:, 3] = boxes[:, 4]
+            item['gt_boxes'][:, 4] = boxes[:, 3]
+            # change yaw
+            item['gt_boxes'][:, 6] = -boxes[:, 6] - np.pi / 2
+            item['gt_boxes'][:, 6] = limit_period(
+                item['gt_boxes'][:, 6], period=np.pi * 2)
+
+        out_path = osp.join(out_dir, pkl_file)
+        print(f'Writing to output file: {out_path}.')
+        mmengine.dump(a, out_path, 'pkl')
+
+
+parser = argparse.ArgumentParser(description='Arg parser for data coords '
+                                 'update due to coords sys refactor.')
+parser.add_argument('dataset', metavar='kitti', help='name of the dataset')
+parser.add_argument(
+    '--root-dir',
+    type=str,
+    default='./data/kitti',
+    help='specify the root dir of dataset')
+parser.add_argument(
+    '--version',
+    type=str,
+    default='v1.0',
+    required=False,
+    help='specify the dataset version, no need for kitti')
+parser.add_argument(
+    '--out-dir',
+    type=str,
+    default=None,
+    required=False,
+    help='name of info pkl')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    if args.out_dir is None:
+        args.out_dir = args.root_dir
+    if args.dataset == 'kitti':
+        # KITTI infos is in CAM coord sys (unchanged)
+        # KITTI dbinfos is in LIDAR coord sys (changed)
+        # so we only update dbinfos
+        pkl_files = ['kitti_dbinfos_train.pkl']
+        update_outdoor_dbinfos(
+            root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files)
+    elif args.dataset == 'nuscenes':
+        # nuScenes infos is in LIDAR coord sys (changed)
+        # nuScenes dbinfos is in LIDAR coord sys (changed)
+        # so we update both infos and dbinfos
+        pkl_files = ['nuscenes_infos_val.pkl']
+        if args.version != 'v1.0-mini':
+            pkl_files.append('nuscenes_infos_train.pkl')
+        else:
+            pkl_files.append('nuscenes_infos_train_tiny.pkl')
+        update_nuscenes_or_lyft_infos(
+            root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files)
+        if args.version != 'v1.0-mini':
+            pkl_files = ['nuscenes_dbinfos_train.pkl']
+            update_outdoor_dbinfos(
+                root_dir=args.root_dir,
+                out_dir=args.out_dir,
+                pkl_files=pkl_files)
+    elif args.dataset == 'lyft':
+        # Lyft infos is in LIDAR coord sys (changed)
+        # Lyft has no dbinfos
+        # so we update infos
+        pkl_files = ['lyft_infos_train.pkl', 'lyft_infos_val.pkl']
+        update_nuscenes_or_lyft_infos(
+            root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files)
+    elif args.dataset == 'waymo':
+        # Waymo infos is in CAM coord sys (unchanged)
+        # Waymo dbinfos is in LIDAR coord sys (changed)
+        # so we only update dbinfos
+        pkl_files = ['waymo_dbinfos_train.pkl']
+        update_outdoor_dbinfos(
+            root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files)
+    elif args.dataset == 'scannet':
+        # ScanNet infos is in DEPTH coord sys (changed)
+        # but bbox is without yaw
+        # so ScanNet is unaffected
+        pass
+    elif args.dataset == 's3dis':
+        # Segmentation datasets are not affected
+        pass
+    elif args.dataset == 'sunrgbd':
+        # SUNRGBD infos is in DEPTH coord sys (changed)
+        # and bbox is with yaw
+        # so we update infos
+        pkl_files = ['sunrgbd_infos_train.pkl', 'sunrgbd_infos_val.pkl']
+        update_sunrgbd_infos(
+            root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files)
diff --git a/mmde/tools/update_data_coords.sh b/mmde/tools/update_data_coords.sh
new file mode 100644
index 0000000000000000000000000000000000000000..bd8db628383757c675aaf7e6d5c6e6f21616125a
--- /dev/null
+++ b/mmde/tools/update_data_coords.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+set -x
+export PYTHONPATH=`pwd`:$PYTHONPATH
+
+PARTITION=$1
+DATASET=$2
+GPUS=${GPUS:-1}
+GPUS_PER_NODE=${GPUS_PER_NODE:-1}
+SRUN_ARGS=${SRUN_ARGS:-""}
+JOB_NAME=update_data_coords
+
+srun -p ${PARTITION} \
+    --job-name=${JOB_NAME} \
+    --gres=gpu:${GPUS_PER_NODE} \
+    --ntasks=${GPUS} \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --kill-on-bad-exit=1 \
+    ${SRUN_ARGS} \
+    python -u tools/update_data_coords.py ${DATASET} \
+            --root-dir ./data/${DATASET} \
+            --out-dir ./data/${DATASET}
diff --git "a/mmde/\345\221\275\344\273\244" "b/mmde/\345\221\275\344\273\244"
new file mode 100644
index 0000000000000000000000000000000000000000..4c791d6de1d577bdf394e104f884248c5dd19123
--- /dev/null
+++ "b/mmde/\345\221\275\344\273\244"
@@ -0,0 +1,11 @@
+python3 tools/test.py \
+    projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py \
+    pth/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-5239b1af_fixed.pth \
+    --cfg-options \
+    test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl test_dataloader.batch_size=4
+
+python3 tools/test.py \
+    projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py \
+    pth/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933_fixed.pth \
+    --cfg-options \
+    test_dataloader.dataset.ann_file=nuscenes_infos_mini_val.pkl
\ No newline at end of file
diff --git a/mmdetection3d/projects/BEVFusion/setup.py b/mmdetection3d/projects/BEVFusion/setup.py
index 41aa96b11941954c92fd5bdf5240c369ce76b41f..04e884e48608c5849ce3ffa0714e4f2af76afaad 100644
--- a/mmdetection3d/projects/BEVFusion/setup.py
+++ b/mmdetection3d/projects/BEVFusion/setup.py
@@ -23,11 +23,7 @@ def make_cuda_ext(name,
             '-D__CUDA_NO_HALF_OPERATORS__',
             '-D__CUDA_NO_HALF_CONVERSIONS__',
             '-D__CUDA_NO_HALF2_OPERATORS__',
-            '-gencode=arch=compute_70,code=sm_70',
-            '-gencode=arch=compute_75,code=sm_75',
-            '-gencode=arch=compute_80,code=sm_80',
-            '-gencode=arch=compute_86,code=sm_86',
-        ]
+            ]
         sources += sources_cuda
     else:
         print('Compiling {} without CUDA'.format(name))
@@ -48,7 +44,7 @@ if __name__ == '__main__':
         ext_modules=[
             make_cuda_ext(
                 name='bev_pool_ext',
-                module='projects.BEVFusion.bevfusion.ops.bev_pool',
+                module='bevfusion.ops.bev_pool',
                 sources=[
                     'src/bev_pool.cpp',
                     'src/bev_pool_cuda.cu',
@@ -56,7 +52,7 @@ if __name__ == '__main__':
             ),
             make_cuda_ext(
                 name='voxel_layer',
-                module='projects.BEVFusion.bevfusion.ops.voxel',
+                module='bevfusion.ops.voxel',
                 sources=[
                     'src/voxelization.cpp',
                     'src/scatter_points_cpu.cpp',